(** * Programming Pearls Column 2 Exercise C * * Given a file of words, produce a list of the groups of anagrams. * * This is implemented as a sequence of processes that operate on streams and * arrays. Each word in the file is signed by sorting its characters, and then * the words are sorted by signature. The final process simply prints out the * words, adding a newline when the signatures change. * * Author: Michael DiBernardo *) (** * Signed words are represented as records. *) type signed_word = { signature: string; word: string } (** * Produce a { signature, word } pair from a word. Both the word and the * signature are lowercased in the returned pair. The string argument is not * modified. *) let sign_word word = let word = (String.lowercase word) in (* There isn't an iteri for strings, so we have to track i ourselves. *) let word_length = String.length word and i = ref 0 in (* Copy the characters from the string into an array. *) let word_chars = Array.make word_length ' ' in ( String.iter (fun c -> ( (Array.set word_chars !i c); incr i)) word; (* Produce the signature by sorting the array. *) (Array.sort Char.compare word_chars); (* Recopy the sorted characters into a string that will serve as the * signature. *) let signature = String.make word_length ' ' in Array.iteri (fun i c -> (String.set signature i c)) word_chars; { signature = signature; word = word } ) (** * Compare a signed word to another by comparing their signatures. *) let signed_word_compare x y = String.compare x.signature y.signature (** * Emulate 'for line in file' behaviour from python. Returns a stream that will * produce the next line in the file until it is empty. * * Thanks to pango from #ocaml for pointing me to the Stream.from_ functions. *) let make_linestream filename = let line_channel = open_in filename in Stream.from (fun stream_element_index -> try (Some (input_line line_channel)) with End_of_file -> None ) (** * Given a file that has one word per line, this will produce a signature-word * pair for each word. The signature-word pairs are placed in the array in the * same order that the corresponding words were presented in the file. *) let sign_words filename = (* First pass to count words in stream to determine necessary array length. *) let counting_stream = (make_linestream filename) and word_count = ref 0 in ( Stream.iter (fun _ -> incr word_count) counting_stream; (* Now sign all the words and place them into an array. There's no iteri for * streams, so we track i ourselves. Although, I suppose I could have used * the stream 'count'. *) let signed_words = (Array.make !word_count { signature = ""; word = "" }) and word_stream = (make_linestream filename) and i = ref 0 in (Stream.iter (fun word -> ((Array.set signed_words !i (sign_word word)); incr i)) word_stream); signed_words ) (** * Sorts the given array of signature-word pairs by signature. *) let sort_signatures unsorted_signatures = ( Array.sort signed_word_compare unsorted_signatures; unsorted_signatures ) (** * Given an array of signature-word pairs that is sorted by signature, this will * print all the words, where words that share a signature are placed on the * same line. *) let print_anagram_groups_from_sorted_signatures sorted_signatures = let print_word i signed_word = let next_index = (i + 1) in ( Printf.printf "%s " signed_word.word; if (next_index = (Array.length sorted_signatures)) or (signed_word.signature <> sorted_signatures.(next_index).signature) then Printf.printf "\n" ) in Array.iteri print_word sorted_signatures (** * Main sequence of processes. *) let print_anagram_groups_from_file filename = let unsorted_signatures = (sign_words filename) in let sorted_signatures = (sort_signatures unsorted_signatures) in print_anagram_groups_from_sorted_signatures sorted_signatures (* Main program. *) let _ = if (Array.length Sys.argv) = 2 then let input_file_name = (Array.get Sys.argv 1) in print_anagram_groups_from_file input_file_name else Printf.printf "USAGE: group_anagrams \n"