diff --git a/rust/anagram/Cargo.toml b/rust/anagram/Cargo.toml index 5e06e4e..f99e68e 100644 --- a/rust/anagram/Cargo.toml +++ b/rust/anagram/Cargo.toml @@ -2,3 +2,7 @@ edition = "2021" name = "anagram" version = "0.0.0" + +[dependencies] +icu = "1.3.2" +itertools = "0.11.0" diff --git a/rust/anagram/src/lib.rs b/rust/anagram/src/lib.rs index f029d04..b8b4124 100644 --- a/rust/anagram/src/lib.rs +++ b/rust/anagram/src/lib.rs @@ -1,5 +1,41 @@ +use icu::{ + collator::{Collator, CollatorOptions, Strength}, + segmenter::GraphemeClusterSegmenter, +}; +use itertools::Itertools; use std::collections::HashSet; -pub fn anagrams_for<'a>(word: &str, possible_anagrams: &[&str]) -> HashSet<&'a str> { - todo!("For the '{word}' word find anagrams among the following words: {possible_anagrams:?}"); +pub fn anagrams_for<'a>(word: &str, possible_anagrams: &'a [&str]) -> HashSet<&'a str> { + let mut options = CollatorOptions::new(); + // Ignore case sensitivity, accents, etc. + options.strength = Some(Strength::Primary); + // Strictly speaking an anagram only makes sense within a given locale, + // but we don't know it, so leave it as undefined. + let collator = Collator::try_new(Default::default(), options).unwrap(); + let word_anagram = first_anagram(&word, &collator); + possible_anagrams + .iter() + .copied() + .filter(|possibility| { + let possibility_anagram = first_anagram(&possibility, &collator); + let is_anagram = collator.compare(&word_anagram, &possibility_anagram).is_eq(); + let is_same = collator.compare(&possibility, &word).is_eq(); + is_anagram && !is_same + }) + .collect() +} + +// We define the first anagram of a string as the concatenation of +// sorting all of its grapheme clusters. To do this fully correctly, +// we would want to know what language we should use when considering +// whether two grapheme clusters are "equal". +fn first_anagram(word: &str, collator: &Collator) -> String { + let segmenter = GraphemeClusterSegmenter::new(); + let mut graphemes: Vec<&str> = segmenter + .segment_str(&word) + .tuple_windows() + .map(|(i, j)| &word[i..j]) + .collect(); + graphemes.sort_by(|a, b| collator.compare(a, b)); + graphemes.into_iter().collect() } diff --git a/rust/anagram/tests/anagram.rs b/rust/anagram/tests/anagram.rs index 4ac7a28..221824e 100644 --- a/rust/anagram/tests/anagram.rs +++ b/rust/anagram/tests/anagram.rs @@ -20,7 +20,6 @@ fn no_matches() { } #[test] -#[ignore] fn detect_simple_anagram() { let word = "ant"; @@ -32,7 +31,6 @@ fn detect_simple_anagram() { } #[test] -#[ignore] fn does_not_confuse_different_duplicates() { let word = "galea"; @@ -44,7 +42,6 @@ fn does_not_confuse_different_duplicates() { } #[test] -#[ignore] fn eliminate_anagram_subsets() { let word = "good"; @@ -56,7 +53,6 @@ fn eliminate_anagram_subsets() { } #[test] -#[ignore] fn detect_anagram() { let word = "listen"; @@ -68,7 +64,6 @@ fn detect_anagram() { } #[test] -#[ignore] fn multiple_anagrams() { let word = "allergy"; @@ -87,7 +82,6 @@ fn multiple_anagrams() { } #[test] -#[ignore] fn case_insensitive_anagrams() { let word = "Orchestra"; @@ -99,7 +93,6 @@ fn case_insensitive_anagrams() { } #[test] -#[ignore] fn unicode_anagrams() { let word = "ΑΒΓ"; @@ -112,7 +105,6 @@ fn unicode_anagrams() { } #[test] -#[ignore] fn misleading_unicode_anagrams() { // Despite what a human might think these words contain different letters, the input uses Greek // A and B while the list of potential anagrams uses Latin A and B. @@ -126,7 +118,6 @@ fn misleading_unicode_anagrams() { } #[test] -#[ignore] fn does_not_detect_a_word_as_its_own_anagram() { let word = "banana"; @@ -138,7 +129,6 @@ fn does_not_detect_a_word_as_its_own_anagram() { } #[test] -#[ignore] fn does_not_detect_a_differently_cased_word_as_its_own_anagram() { let word = "banana"; @@ -150,7 +140,6 @@ fn does_not_detect_a_differently_cased_word_as_its_own_anagram() { } #[test] -#[ignore] fn does_not_detect_a_differently_cased_unicode_word_as_its_own_anagram() { let word = "ΑΒΓ"; @@ -162,7 +151,6 @@ fn does_not_detect_a_differently_cased_unicode_word_as_its_own_anagram() { } #[test] -#[ignore] fn same_bytes_different_chars() { let word = "a⬂"; // 61 E2 AC 82 @@ -174,7 +162,6 @@ fn same_bytes_different_chars() { } #[test] -#[ignore] fn different_words_but_same_ascii_sum() { let word = "bc";