/// <summary>
/// Convenience overload that looks up the rules for one name type, one rule type
/// and exactly one language.
/// </summary>
/// <param name="nameType">The <see cref="NameType"/> to consider.</param>
/// <param name="rt">The <see cref="RuleType"/> to consider.</param>
/// <param name="lang">The language to consider.</param>
/// <returns>A list of <see cref="Rule"/>s that apply.</returns>
public static IList<Rule> GetInstance(NameType nameType, RuleType rt, string lang)
{
    // Wrap the single language in a one-element set so the LanguageSet-based overload can be reused.
    JCG.HashSet<string> singleLanguage = new JCG.HashSet<string>();
    singleLanguage.Add(lang);

    LanguageSet languageSet = LanguageSet.From(singleLanguage);
    return GetInstance(nameType, rt, languageSet);
}
/// <summary>
/// Applies the given phoneme expression to all phonemes in this phoneme builder.
/// <para/>
/// This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
/// incompatible.
/// </summary>
/// <param name="phonemeExpr">The expression to apply.</param>
/// <param name="maxPhonemes">The maximum number of distinct phonemes to build up.</param>
public void Apply(IPhonemeExpr phonemeExpr, int maxPhonemes)
{
    // LUCENENET NOTE: LinkedHashSet cares about insertion order - in .NET, we can just use List<T> for that
    IList<Phoneme> newPhonemes = new List<Phoneme>(maxPhonemes);

    foreach (Phoneme left in this.phonemes)
    {
        foreach (Phoneme right in phonemeExpr.Phonemes)
        {
            LanguageSet languages = left.Languages.RestrictTo(right.Languages);
            if (languages.IsEmpty)
            {
                // Incompatible language sets: this combination is dropped.
                continue;
            }

            Phoneme join = new Phoneme(left, right, languages);

            // LUCENENET: We converted from LinkedHashSet to List, so duplicates must be rejected
            // at insertion time. Doing it here (instead of filtering afterwards) keeps duplicates
            // from consuming slots of the maxPhonemes budget, which is how a set behaves.
            if (newPhonemes.Count < maxPhonemes && !newPhonemes.Any(existing => existing.Equals(join)))
            {
                newPhonemes.Add(join);
                if (newPhonemes.Count >= maxPhonemes)
                {
                    goto EXPR_break;
                }
            }
        }
    }
    EXPR_break: { }

    // newPhonemes is already duplicate-free, so it can replace the current contents directly.
    this.phonemes.Clear();
    this.phonemes.AddRange(newPhonemes);
}
/// <summary>
/// Combines every phoneme currently held by this builder with every phoneme of the
/// supplied expression.
/// <para/>
/// A combination is kept only when the two language sets, restricted to each other, are
/// not empty; the kept combinations replace the builder's current phonemes. At most
/// <paramref name="maxPhonemes"/> combinations are collected.
/// </summary>
/// <param name="phonemeExpr">The expression to apply.</param>
/// <param name="maxPhonemes">The maximum number of phonemes to build up.</param>
public void Apply(IPhonemeExpr phonemeExpr, int maxPhonemes)
{
    ISet<Phoneme> combined = new JCG.LinkedHashSet<Phoneme>(maxPhonemes);
    bool limitReached = false;

    foreach (Phoneme prefix in this.phonemes)
    {
        foreach (Phoneme suffix in phonemeExpr.Phonemes)
        {
            LanguageSet shared = prefix.Languages.RestrictTo(suffix.Languages);
            if (shared.IsEmpty)
            {
                continue;
            }

            Phoneme candidate = new Phoneme(prefix, suffix, shared);
            if (combined.Count >= maxPhonemes)
            {
                continue;
            }

            combined.Add(candidate);
            if (combined.Count >= maxPhonemes)
            {
                // Budget exhausted: stop both loops.
                limitReached = true;
                break;
            }
        }

        if (limitReached)
        {
            break;
        }
    }

    this.phonemes.Clear();
    this.phonemes.UnionWith(combined);
}
/// <summary>
/// Asserts that the language guesser includes <paramref name="language"/> among the
/// languages it guesses for <paramref name="name"/>.
/// </summary>
/// <param name="name">The name whose languages are guessed.</param>
/// <param name="language">The language that must be among the guesses.</param>
/// <param name="exactness">Not read by this method.</param>
public void TestLanguageGuessing(String name, String language, String exactness)
{
    LanguageSet guesses = this.lang.GuessLanguages(name);

    bool containsExpected = guesses.Contains(language);
    String failureMessage = "language predicted for name '" + name + "' is wrong: " + guesses + " should contain '" + language + "'";

    Assert.True(containsExpected, failureMessage);
}
/// <summary>
/// Gets rules for a combination of name type, rule type and languages, flattened into
/// a single list.
/// </summary>
/// <param name="nameType">The <see cref="NameType"/> to consider.</param>
/// <param name="rt">The <see cref="RuleType"/> to consider.</param>
/// <param name="langs">The set of languages to consider.</param>
/// <returns>A list of <see cref="Rule"/>s that apply.</returns>
public static IList<Rule> GetInstance(NameType nameType, RuleType rt, LanguageSet langs)
{
    IDictionary<string, IList<Rule>> rulesByKey = GetInstanceMap(nameType, rt, langs);

    // Concatenate every per-key rule list, in the order the map enumerates its values.
    IList<Rule> flattened = new List<Rule>();
    foreach (IList<Rule> group in rulesByKey.Values)
    {
        foreach (Rule rule in group)
        {
            flattened.Add(rule);
        }
    }

    return flattened;
}
/// <summary>
/// This code is similar in style to code found in Solr:
/// solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
/// <para/>
/// Making a JUnit test out of it to protect Solr from possible future
/// regressions in Commons-Codec.
/// </summary>
/// <param name="args">
/// Optional settings read by key: "nameType", "ruleType" and "languageSet". Missing keys fall back
/// to GENERIC, APPROX and automatic language guessing respectively.
/// </param>
/// <param name="concat">Passed to the <see cref="PhoneticEngine"/> constructor.</param>
/// <param name="input">The text to encode.</param>
/// <returns>The value returned by the engine's Encode method for <paramref name="input"/>.</returns>
private static String Encode(IDictionary<String, String> args, bool concat, String input)
{
    // PhoneticEngine = NameType + RuleType + concat
    // we use common-codec's defaults: GENERIC + APPROX + true
    String nameTypeArg;
    args.TryGetValue("nameType", out nameTypeArg);
    NameType nameType = (nameTypeArg == null)
        ? NameType.GENERIC
        : (NameType)Enum.Parse(typeof(NameType), nameTypeArg, true);

    String ruleTypeArg;
    args.TryGetValue("ruleType", out ruleTypeArg);
    RuleType ruleType = (ruleTypeArg == null)
        ? RuleType.APPROX
        : (RuleType)Enum.Parse(typeof(RuleType), ruleTypeArg, true);

    PhoneticEngine engine = new PhoneticEngine(nameType, ruleType, concat);

    // LanguageSet: defaults to automagic, otherwise a comma-separated list.
    String languageSetArg;
    args.TryGetValue("languageSet", out languageSetArg);

    LanguageSet languageSet;
    // Ordinal comparison mirrors Java's String.equals (the original call was the Java-style "equals").
    if (languageSetArg == null || languageSetArg.Equals("auto", StringComparison.Ordinal))
    {
        languageSet = null;
    }
    else
    {
        languageSet = LanguageSet.From(new HashSet<String>(languageSetArg.Split(',').TrimEnd()));
    }

    /*
     * org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:
     *
     * encoded = (languages == null)
     *     ? engine.encode(termAtt.toString())
     *     : engine.encode(termAtt.toString(), languages);
     *
     * Hence our approach, below:
     */
    if (languageSet == null)
    {
        return engine.Encode(input);
    }

    return engine.Encode(input, languageSet);
}
/// <summary>
/// Parses a phoneme expression of the form <c>text</c> or <c>text[languages]</c>.
/// <para/>
/// Without a '[' the whole string is the phoneme text and the phoneme is tagged with
/// <c>Languages.ANY_LANGUAGE</c>. Otherwise the text before the first '[' is the phoneme text
/// and the content between that '[' and the final ']' is split with <c>PLUS</c> into language names.
/// </summary>
/// <param name="ph">The phoneme expression.</param>
/// <returns>The parsed <see cref="Phoneme"/>.</returns>
/// <exception cref="ArgumentException">The expression contains a '[' but does not end in ']'.</exception>
private static Phoneme ParsePhoneme(string ph)
{
    int bracketStart = ph.IndexOf('[');
    if (bracketStart < 0)
    {
        return new Phoneme(ph, Languages.ANY_LANGUAGE);
    }

    if (!ph.EndsWith("]", StringComparison.Ordinal))
    {
        throw new ArgumentException("Phoneme expression contains a '[' but does not end in ']'");
    }

    string text = ph.Substring(0, bracketStart);

    // Everything strictly between the opening '[' and the trailing ']'.
    int listStart = bracketStart + 1;
    string languageList = ph.Substring(listStart, ph.Length - 1 - listStart);

    ISet<string> langs = new JCG.HashSet<string>(PLUS.Split(languageList).TrimEnd());
    return new Phoneme(text, LanguageSet.From(langs));
}
/// <summary>
/// Guesses the languages of a word.
/// <para/>
/// Starts from all known languages and lets every matching rule narrow the candidates: an
/// accept-on-match rule keeps only the rule's languages, any other matching rule removes the
/// rule's languages.
/// </summary>
/// <param name="input">The word.</param>
/// <returns>
/// The remaining candidate languages, or <c>Languages.ANY_LANGUAGE</c> when the remaining set
/// equals <c>Languages.NO_LANGUAGES</c>.
/// </returns>
public virtual LanguageSet GuessLanguages(string input)
{
    string lowered = input.ToLowerInvariant();
    ISet<string> candidates = new JCG.HashSet<string>(this.languages.GetLanguages());

    foreach (LangRule rule in this.rules)
    {
        if (!rule.Matches(lowered))
        {
            continue;
        }

        if (!rule.acceptOnMatch)
        {
            // Rejecting rule: remove the rule's languages from the candidates.
            foreach (var rejected in rule.languages)
            {
                candidates.Remove(rejected);
            }
            continue;
        }

        // Accepting rule: retain only the rule's languages. Removals are collected first
        // because the candidate set cannot be modified while it is being enumerated.
        IList<string> notAccepted = new JCG.List<string>();
        foreach (var candidate in candidates)
        {
            if (!rule.languages.Contains(candidate))
            {
                notAccepted.Add(candidate);
            }
        }
        foreach (var dropped in notAccepted)
        {
            candidates.Remove(dropped);
        }
    }

    LanguageSet result = LanguageSet.From(candidates);
    if (result.Equals(Languages.NO_LANGUAGES))
    {
        return Languages.ANY_LANGUAGE;
    }

    return result;
}
/// <summary>
/// Restricts this language set to the languages it has in common with <paramref name="other"/>.
/// </summary>
/// <param name="other">The language set to restrict to.</param>
/// <returns>
/// <paramref name="other"/> itself when it is <c>Languages.NO_LANGUAGES</c>; this instance when
/// <paramref name="other"/> is <c>Languages.ANY_LANGUAGE</c>; otherwise a set built from the
/// languages contained in both.
/// </returns>
public override LanguageSet RestrictTo(LanguageSet other)
{
    if (other == Languages.NO_LANGUAGES)
    {
        return other;
    }

    if (other == Languages.ANY_LANGUAGE)
    {
        return this;
    }

    SomeLanguages that = (SomeLanguages)other;

    // The intersection can never be larger than the smaller of the two sets.
    int capacity = Math.Min(languages.Count, that.languages.Count);
    ISet<string> shared = new JCG.HashSet<string>(capacity);
    foreach (string candidate in languages)
    {
        if (that.languages.Contains(candidate))
        {
            shared.Add(candidate);
        }
    }

    return From(shared);
}
/// <summary>
/// Creates a builder seeded with a single zero-length phoneme that is tagged with the given
/// languages. The builder can then be appended to. This should be the only way to create a
/// new phoneme from scratch.
/// </summary>
/// <param name="languages">The set of languages.</param>
/// <returns>A new, empty phoneme builder.</returns>
public static PhonemeBuilder Empty(LanguageSet languages)
{
    Phoneme emptyPhoneme = new Phoneme("", languages);
    return new PhonemeBuilder(emptyPhoneme);
}
/// <summary>
/// Encodes an input string into an output phonetic representation, given a set of possible origin languages.
/// </summary>
/// <param name="input">String to phoneticise; a string with dashes or spaces separating each word.</param>
/// <param name="languageSet">
/// The possible origin languages; used to select the main and language-specific final rules and to
/// seed the phoneme builder.
/// </param>
/// <returns>A phonetic representation of the input; a string containing '-'-separated phonetic representations of the input.</returns>
public virtual string Encode(string input, LanguageSet languageSet)
{
    IDictionary<string, IList<Rule>> rules = Rule.GetInstanceMap(this.nameType, RuleType.RULES, languageSet);
    // rules common across many (all) languages
    IDictionary<string, IList<Rule>> finalRules1 = Rule.GetInstanceMap(this.nameType, this.ruleType, "common");
    // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
    IDictionary<string, IList<Rule>> finalRules2 = Rule.GetInstanceMap(this.nameType, this.ruleType, languageSet);

    // tidy the input
    // lower case is a locale-dependent operation
    input = input.ToLowerInvariant().Replace('-', ' ').Trim();

    if (this.nameType == NameType.GENERIC)
    {
        if (input.StartsWith("d'", StringComparison.Ordinal))
        {
            // check for d'
            string remainder = input.Substring(2);
            string combined = "d" + remainder;
            return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
        }
        foreach (string l in NAME_PREFIXES[this.nameType])
        {
            // handle generic prefixes
            if (input.StartsWith(l + " ", StringComparison.Ordinal))
            {
                // check for any prefix in the words list
                string remainder = input.Substring(l.Length + 1); // input without the prefix
                string combined = l + remainder; // input with prefix without space
                return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
            }
        }
    }

    IList<string> words = WHITESPACE.Split(input).ToList();
    IList<string> words2 = new List<string>();

    // special-case handling of word prefixes based upon the name type
    switch (this.nameType)
    {
        case NameType.SEPHARDIC:
            foreach (string aWord in words)
            {
                string[] parts = aWord.Split(new char[] { '\'' }, StringSplitOptions.RemoveEmptyEntries);
                // RemoveEmptyEntries yields zero parts for an empty or apostrophe-only word
                // (e.g. when the whole input is empty); fall back to the empty string instead of
                // indexing into an empty array.
                string lastPart = parts.Length > 0 ? parts[parts.Length - 1] : string.Empty;
                words2.Add(lastPart);
            }
            words2.RemoveAll(NAME_PREFIXES[this.nameType]);
            break;
        case NameType.ASHKENAZI:
            words2.AddRange(words);
            words2.RemoveAll(NAME_PREFIXES[this.nameType]);
            break;
        case NameType.GENERIC:
            words2.AddRange(words);
            break;
        default:
            throw new InvalidOperationException("Unreachable case: " + this.nameType);
    }

    if (this.concat)
    {
        // concat mode enabled
        input = Join(words2, " ");
    }
    else if (words2.Count == 1)
    {
        // not a multi-word name
        //input = words.iterator().next();
        input = words.FirstOrDefault();
    }
    else
    {
        // encode each word in a multi-word name separately (normally used for approx matches)
        StringBuilder result = new StringBuilder();
        foreach (string word in words2)
        {
            result.Append("-").Append(Encode(word));
        }
        // return the result without the leading "-"
        // NOTE(review): if words2 is empty (every word was removed as a prefix) result is empty
        // and this ToString call throws ArgumentOutOfRangeException.
        return result.ToString(1, result.Length - 1);
    }

    PhonemeBuilder phonemeBuilder = PhonemeBuilder.Empty(languageSet);

    // loop over each char in the input - we will handle the increment manually
    for (int i = 0; i < input.Length;)
    {
        RulesApplication rulesApplication = new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).Invoke();
        i = rulesApplication.I;
        phonemeBuilder = rulesApplication.PhonemeBuilder;
    }

    // Apply the general rules
    phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules1);
    // Apply the language-specific rules
    phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules2);

    return phonemeBuilder.MakeString();
}
/// <summary>
/// Encodes a string to its phonetic representation, using the languages guessed for the
/// input as the set of possible origin languages.
/// </summary>
/// <param name="input">The string to encode.</param>
/// <returns>The encoding of the input.</returns>
public virtual string Encode(string input)
{
    return Encode(input, this.lang.GuessLanguages(input));
}
/// <summary>
/// Guesses the single language of a word.
/// </summary>
/// <param name="text">The word.</param>
/// <returns>
/// The language that the word originates from when the guessed language set is a singleton,
/// or <see cref="Languages.ANY"/> if there was no unique match.
/// </returns>
public virtual string GuessLanguage(string text)
{
    LanguageSet guessed = GuessLanguages(text);
    if (guessed.IsSingleton)
    {
        return guessed.GetAny();
    }

    return Languages.ANY;
}
/// <summary>
/// Produces the language set that results from restricting this set by <paramref name="other"/>.
/// The exact semantics are defined by each implementation.
/// </summary>
/// <param name="other">The language set to restrict by.</param>
/// <returns>The restricted language set.</returns>
public abstract LanguageSet RestrictTo(LanguageSet other);
/// <summary>
/// Restricting this language set by another simply yields the other set.
/// </summary>
/// <param name="other">The language set to restrict by.</param>
/// <returns><paramref name="other"/>, unchanged.</returns>
public override LanguageSet RestrictTo(LanguageSet other)
{
    return other;
}
/// <summary>
/// Creates a phoneme from a character sequence and the languages it belongs to.
/// The text is copied into a fresh <see cref="StringBuilder"/>.
/// </summary>
/// <param name="phonemeText">The phoneme text.</param>
/// <param name="languages">The languages associated with the phoneme.</param>
public Phoneme(ICharSequence phonemeText, LanguageSet languages)
{
    string text = phonemeText.ToString();
    this.phonemeText = new StringBuilder(text);
    this.languages = languages;
}
/// <summary>
/// Creates a phoneme from a string and the languages it belongs to.
/// The text is copied into a fresh <see cref="StringBuilder"/>.
/// </summary>
/// <param name="phonemeText">The phoneme text.</param>
/// <param name="languages">The languages associated with the phoneme.</param>
public Phoneme(string phonemeText, LanguageSet languages)
{
    StringBuilder buffer = new StringBuilder(phonemeText);
    this.languages = languages;
    this.phonemeText = buffer;
}
/// <summary>
/// Gets rules for a combination of name type, rule type and languages.
/// <para/>
/// A singleton language set is looked up under its only language; any other set is looked up
/// under <see cref="Languages.ANY"/>.
/// <para/>
/// since 1.9
/// </summary>
/// <param name="nameType">The <see cref="NameType"/> to consider.</param>
/// <param name="rt">The <see cref="RuleType"/> to consider.</param>
/// <param name="langs">The set of languages to consider.</param>
/// <returns>A map containing all <see cref="Rule"/>s that apply, grouped by the first character of the rule pattern.</returns>
public static IDictionary<string, IList<Rule>> GetInstanceMap(NameType nameType, RuleType rt, LanguageSet langs)
{
    string lookupLanguage;
    if (langs.IsSingleton)
    {
        lookupLanguage = langs.GetAny();
    }
    else
    {
        lookupLanguage = Languages.ANY;
    }

    return GetInstanceMap(nameType, rt, lookupLanguage);
}
/// <summary>
/// Creates a phoneme by joining two phonemes: construction is delegated to another constructor
/// with the left phoneme's text and the given languages, then the right phoneme's text is appended.
/// </summary>
/// <param name="phonemeLeft">The phoneme supplying the leading text.</param>
/// <param name="phonemeRight">The phoneme whose text is appended.</param>
/// <param name="languages">The languages associated with the joined phoneme.</param>
public Phoneme(Phoneme phonemeLeft, Phoneme phonemeRight, LanguageSet languages)
    : this(phonemeLeft.phonemeText, languages)
{
    phonemeText.Append(phonemeRight.phonemeText);
}