Solve rust/anagram

2023-10-11 16:02:27 -05:00 · 2023-10-11 16:02:27 -05:00 · a2adfa0e9d
commit a2adfa0e9d
parent a675ae7b81
3 changed files with 42 additions and 15 deletions
--- a/rust/anagram/Cargo.toml
+++ b/rust/anagram/Cargo.toml
@@ -2,3 +2,7 @@
 edition = "2021"
 name = "anagram"
 version = "0.0.0"
+
+[dependencies]
+icu = "1.3.2"
+itertools = "0.11.0"
--- a/rust/anagram/src/lib.rs
+++ b/rust/anagram/src/lib.rs
@@ -1,5 +1,41 @@
+use icu::{
+    collator::{Collator, CollatorOptions, Strength},
+    segmenter::GraphemeClusterSegmenter,
+};
+use itertools::Itertools;
 use std::collections::HashSet;

-pub fn anagrams_for<'a>(word: &str, possible_anagrams: &[&str]) -> HashSet<&'a str> {
-    todo!("For the '{word}' word find anagrams among the following words: {possible_anagrams:?}");
+pub fn anagrams_for<'a>(word: &str, possible_anagrams: &'a [&str]) -> HashSet<&'a str> {
+    let mut options = CollatorOptions::new();
+    // Primary strength: comparisons ignore case, accents, etc.
+    options.strength = Some(Strength::Primary);
+    // Strictly speaking, an anagram only makes sense within a given locale,
+    // but we don't know which one applies, so we pass the default (unspecified) one.
+    let collator = Collator::try_new(Default::default(), options).unwrap();
+    let word_anagram = first_anagram(&word, &collator);
+    possible_anagrams
+        .iter()
+        .copied()
+        .filter(|possibility| {
+            let possibility_anagram = first_anagram(&possibility, &collator);
+            let is_anagram = collator.compare(&word_anagram, &possibility_anagram).is_eq();
+            let is_same = collator.compare(&possibility, &word).is_eq();
+            is_anagram && !is_same
+        })
+        .collect()
+}
+
+// The "first anagram" of a string is the concatenation of its grapheme
+// clusters after sorting them with the given collator. To do this fully
+// correctly, we would need to know which language to use when deciding
+// whether two grapheme clusters are "equal".
+fn first_anagram(word: &str, collator: &Collator) -> String {
+    let segmenter = GraphemeClusterSegmenter::new();
+    let mut graphemes: Vec<&str> = segmenter
+        .segment_str(&word)
+        .tuple_windows()
+        .map(|(i, j)| &word[i..j])
+        .collect();
+    graphemes.sort_by(|a, b| collator.compare(a, b));
+    graphemes.into_iter().collect()
 }
--- a/rust/anagram/tests/anagram.rs
+++ b/rust/anagram/tests/anagram.rs
@@ -20,7 +20,6 @@ fn no_matches() {
 }

 #[test]
-#[ignore]
 fn detect_simple_anagram() {
    let word = "ant";

@@ -32,7 +31,6 @@ fn detect_simple_anagram() {
 }

 #[test]
-#[ignore]
 fn does_not_confuse_different_duplicates() {
    let word = "galea";

@@ -44,7 +42,6 @@ fn does_not_confuse_different_duplicates() {
 }

 #[test]
-#[ignore]
 fn eliminate_anagram_subsets() {
    let word = "good";

@@ -56,7 +53,6 @@ fn eliminate_anagram_subsets() {
 }

 #[test]
-#[ignore]
 fn detect_anagram() {
    let word = "listen";

@@ -68,7 +64,6 @@ fn detect_anagram() {
 }

 #[test]
-#[ignore]
 fn multiple_anagrams() {
    let word = "allergy";

@@ -87,7 +82,6 @@ fn multiple_anagrams() {
 }

 #[test]
-#[ignore]
 fn case_insensitive_anagrams() {
    let word = "Orchestra";

@@ -99,7 +93,6 @@ fn case_insensitive_anagrams() {
 }

 #[test]
-#[ignore]
 fn unicode_anagrams() {
    let word = "ΑΒΓ";

@@ -112,7 +105,6 @@ fn unicode_anagrams() {
 }

 #[test]
-#[ignore]
 fn misleading_unicode_anagrams() {
    // Despite what a human might think these words contain different letters, the input uses Greek
    // A and B while the list of potential anagrams uses Latin A and B.
@@ -126,7 +118,6 @@ fn misleading_unicode_anagrams() {
 }

 #[test]
-#[ignore]
 fn does_not_detect_a_word_as_its_own_anagram() {
    let word = "banana";

@@ -138,7 +129,6 @@ fn does_not_detect_a_word_as_its_own_anagram() {
 }

 #[test]
-#[ignore]
 fn does_not_detect_a_differently_cased_word_as_its_own_anagram() {
    let word = "banana";

@@ -150,7 +140,6 @@ fn does_not_detect_a_differently_cased_word_as_its_own_anagram() {
 }

 #[test]
-#[ignore]
 fn does_not_detect_a_differently_cased_unicode_word_as_its_own_anagram() {
    let word = "ΑΒΓ";

@@ -162,7 +151,6 @@ fn does_not_detect_a_differently_cased_unicode_word_as_its_own_anagram() {
 }

 #[test]
-#[ignore]
 fn same_bytes_different_chars() {
    let word = "a⬂"; // 61 E2 AC 82

@@ -174,7 +162,6 @@ fn same_bytes_different_chars() {
 }

 #[test]
-#[ignore]
 fn different_words_but_same_ascii_sum() {
    let word = "bc";