diff --git a/rust/anagram/src/lib.rs b/rust/anagram/src/lib.rs index b8b4124..3c92822 100644 --- a/rust/anagram/src/lib.rs +++ b/rust/anagram/src/lib.rs @@ -1,41 +1,45 @@ use icu::{ - collator::{Collator, CollatorOptions, Strength}, + collator::{AlternateHandling, Collator, CollatorOptions, MaxVariable, Strength}, segmenter::GraphemeClusterSegmenter, }; use itertools::Itertools; use std::collections::HashSet; +// We define an anagram as any permutation of the graphemes of a string, ignoring +// case, accents/diacritical marks, punctuation, spaces, etc. There are some +// language-specific instances where this fails, but it covers a lot of cases. pub fn anagrams_for<'a>(word: &str, possible_anagrams: &'a [&str]) -> HashSet<&'a str> { let mut options = CollatorOptions::new(); - // Ignore case sensitivity, accents, etc. + // Ignore case sensitivity, accents, punctuation, spaces, etc. options.strength = Some(Strength::Primary); + options.alternate_handling = Some(AlternateHandling::Shifted); + options.max_variable = Some(MaxVariable::Punctuation); // Strictly speaking an anagram only makes sense within a given locale, // but we don't know it, so leave it as undefined. let collator = Collator::try_new(Default::default(), options).unwrap(); - let word_anagram = first_anagram(&word, &collator); + let word_permutation = first_grapheme_permutation(&word, &collator); possible_anagrams .iter() .copied() .filter(|possibility| { - let possibility_anagram = first_anagram(&possibility, &collator); - let is_anagram = collator.compare(&word_anagram, &possibility_anagram).is_eq(); + let possibility_permutation = first_grapheme_permutation(&possibility, &collator); + // If two strings are anagrams of each other, they will have the same first + // permutation. + let is_anagram = collator + .compare(&word_permutation, &possibility_permutation) + .is_eq(); let is_same = collator.compare(&possibility, &word).is_eq(); is_anagram && !is_same }) .collect() } -// We define the first anagram of a string as the concatenation of -// sorting all of its grapheme clusters. To do this fully correctly, -// we would want to know what language we should use when considering -// whether two grapheme clusters are "equal". -fn first_anagram(word: &str, collator: &Collator) -> String { +fn first_grapheme_permutation(word: &str, collator: &Collator) -> String { let segmenter = GraphemeClusterSegmenter::new(); - let mut graphemes: Vec<&str> = segmenter + segmenter .segment_str(&word) .tuple_windows() .map(|(i, j)| &word[i..j]) - .collect(); - graphemes.sort_by(|a, b| collator.compare(a, b)); - graphemes.into_iter().collect() + .sorted_by(|a, b| collator.compare(a, b)) + .collect() } diff --git a/rust/anagram/tests/anagram.rs b/rust/anagram/tests/anagram.rs index 221824e..5b7b387 100644 --- a/rust/anagram/tests/anagram.rs +++ b/rust/anagram/tests/anagram.rs @@ -171,3 +171,47 @@ fn different_words_but_same_ascii_sum() { process_anagram_case(word, &inputs, &outputs); } + +#[test] +fn hebrew() { + let word = "גרנוט ציפה"; + + let inputs = ["צנטריפוגה"]; + + let outputs = vec!["צנטריפוגה"]; + + process_anagram_case(word, &inputs, &outputs); +} + +#[test] +fn accents() { + let word = "Siobhán Donaghy"; + + let inputs = ["Shanghai Nobody"]; + + let outputs = vec!["Shanghai Nobody"]; + + process_anagram_case(word, &inputs, &outputs); +} + +#[test] +fn galileo() { + let word = "Altissimum planetam tergeminum observavi"; + + let inputs = ["smaismrmilmepoetalevmibunenvgttauiras"]; + + let outputs = vec!["smaismrmilmepoetalevmibunenvgttauiras"]; + + process_anagram_case(word, &inputs, &outputs); +} + +#[test] +fn punctuation_spaces() { + let word = "Party in the U.S.A."; + + let inputs = ["paternity haus"]; + + let outputs = vec!["paternity haus"]; + + process_anagram_case(word, &inputs, &outputs); +} \ No newline at end of file