Add more functionality to rust/anagram
This commit is contained in:
parent
a2adfa0e9d
commit
3cb8a72670
2 changed files with 62 additions and 14 deletions
|
@ -1,41 +1,45 @@
|
|||
use icu::{
|
||||
collator::{Collator, CollatorOptions, Strength},
|
||||
collator::{AlternateHandling, Collator, CollatorOptions, MaxVariable, Strength},
|
||||
segmenter::GraphemeClusterSegmenter,
|
||||
};
|
||||
use itertools::Itertools;
|
||||
use std::collections::HashSet;
|
||||
|
||||
// We define an anagram as any permutation of the graphemes of a string, ignoring
|
||||
// case, accents/diacritical marks, punctuation, spaces, etc. There are some
|
||||
// language-specific instances where this fails, but it covers a lot of cases.
|
||||
pub fn anagrams_for<'a>(word: &str, possible_anagrams: &'a [&str]) -> HashSet<&'a str> {
|
||||
let mut options = CollatorOptions::new();
|
||||
// Ignore case sensitivity, accents, etc.
|
||||
// Ignore case sensitivity, accents, punctuation, spaces, etc.
|
||||
options.strength = Some(Strength::Primary);
|
||||
options.alternate_handling = Some(AlternateHandling::Shifted);
|
||||
options.max_variable = Some(MaxVariable::Punctuation);
|
||||
// Strictly speaking an anagram only makes sense within a given locale,
|
||||
// but we don't know it, so leave it as undefined.
|
||||
let collator = Collator::try_new(Default::default(), options).unwrap();
|
||||
let word_anagram = first_anagram(&word, &collator);
|
||||
let word_permutation = first_grapheme_permutation(&word, &collator);
|
||||
possible_anagrams
|
||||
.iter()
|
||||
.copied()
|
||||
.filter(|possibility| {
|
||||
let possibility_anagram = first_anagram(&possibility, &collator);
|
||||
let is_anagram = collator.compare(&word_anagram, &possibility_anagram).is_eq();
|
||||
let possibility_permutation = first_grapheme_permutation(&possibility, &collator);
|
||||
// If two strings are anagrams of each other, they will have the same first
|
||||
// permutation.
|
||||
let is_anagram = collator
|
||||
.compare(&word_permutation, &possibility_permutation)
|
||||
.is_eq();
|
||||
let is_same = collator.compare(&possibility, &word).is_eq();
|
||||
is_anagram && !is_same
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
// We define the first grapheme permutation of a string as the concatenation
|
||||
// of all of its grapheme clusters in sorted order. To do this fully correctly,
|
||||
// we would want to know what language we should use when considering
|
||||
// whether two grapheme clusters are "equal".
|
||||
fn first_anagram(word: &str, collator: &Collator) -> String {
|
||||
fn first_grapheme_permutation(word: &str, collator: &Collator) -> String {
|
||||
let segmenter = GraphemeClusterSegmenter::new();
|
||||
let mut graphemes: Vec<&str> = segmenter
|
||||
segmenter
|
||||
.segment_str(&word)
|
||||
.tuple_windows()
|
||||
.map(|(i, j)| &word[i..j])
|
||||
.collect();
|
||||
graphemes.sort_by(|a, b| collator.compare(a, b));
|
||||
graphemes.into_iter().collect()
|
||||
.sorted_by(|a, b| collator.compare(a, b))
|
||||
.collect()
|
||||
}
|
||||
|
|
|
@ -171,3 +171,47 @@ fn different_words_but_same_ascii_sum() {
|
|||
|
||||
process_anagram_case(word, &inputs, &outputs);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hebrew() {
|
||||
let word = "גרנוט ציפה";
|
||||
|
||||
let inputs = ["צנטריפוגה"];
|
||||
|
||||
let outputs = vec!["צנטריפוגה"];
|
||||
|
||||
process_anagram_case(word, &inputs, &outputs);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn accents() {
|
||||
let word = "Siobhán Donaghy";
|
||||
|
||||
let inputs = ["Shanghai Nobody"];
|
||||
|
||||
let outputs = vec!["Shanghai Nobody"];
|
||||
|
||||
process_anagram_case(word, &inputs, &outputs);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn galileo() {
|
||||
let word = "Altissimum planetam tergeminum observavi";
|
||||
|
||||
let inputs = ["smaismrmilmepoetalevmibunenvgttauiras"];
|
||||
|
||||
let outputs = vec!["smaismrmilmepoetalevmibunenvgttauiras"];
|
||||
|
||||
process_anagram_case(word, &inputs, &outputs);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn punctuation_spaces() {
|
||||
let word = "Party in the U.S.A.";
|
||||
|
||||
let inputs = ["paternity haus"];
|
||||
|
||||
let outputs = vec!["paternity haus"];
|
||||
|
||||
process_anagram_case(word, &inputs, &outputs);
|
||||
}
|
Loading…
Reference in a new issue