/// <summary>
/// Gets rules for a combination of name type, rule type and a single language.
/// </summary>
/// <param name="nameType">The <see cref="NameType"/> to consider.</param>
/// <param name="rt">The <see cref="RuleType"/> to consider.</param>
/// <param name="lang">The language to consider.</param>
/// <returns>A list of <see cref="Rule"/>s that apply.</returns>
public static IList<Rule> GetInstance(NameType nameType, RuleType rt, string lang)
{
    // Wrap the single language in a one-element set, convert it to a language set,
    // and delegate to the overload that accepts that language set.
    var singleLanguage = new JCG.HashSet<string>();
    singleLanguage.Add(lang);

    var languageSet = LanguageSet.From(singleLanguage);
    return GetInstance(nameType, rt, languageSet);
}
/// <summary>
/// Encodes <paramref name="input"/> with a <see cref="PhoneticEngine"/> that is configured
/// from a dictionary of string arguments.
/// <para/>
/// This code is similar in style to code found in Solr:
/// solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
/// <para/>
/// Making a unit test out of it to protect Solr from possible future
/// regressions in Commons-Codec.
/// </summary>
/// <param name="args">
/// Configuration values. The keys read are <c>"nameType"</c>, <c>"ruleType"</c> and
/// <c>"languageSet"</c>; each one is optional.
/// </param>
/// <param name="concat">Passed through to the <see cref="PhoneticEngine"/> constructor.</param>
/// <param name="input">The text to encode.</param>
/// <returns>The result of the engine's <c>Encode</c> call for <paramref name="input"/>.</returns>
/// <exception cref="ArgumentException">
/// Thrown by <see cref="Enum.Parse(Type, string, bool)"/> when <c>"nameType"</c> or
/// <c>"ruleType"</c> is present but does not name a member of the corresponding enum.
/// </exception>
private static String Encode(IDictionary<String, String> args, bool concat, String input)
{
    LanguageSet languageSet;
    PhoneticEngine engine;

    // PhoneticEngine = NameType + RuleType + concat
    // we use common-codec's defaults: GENERIC + APPROX + true
    // (TryGetValue leaves the out variable null when the key is absent.)
    String nameTypeArg;
    args.TryGetValue("nameType", out nameTypeArg);
    NameType nameType = (nameTypeArg == null)
        ? NameType.GENERIC
        : (NameType)Enum.Parse(typeof(NameType), nameTypeArg, true);

    String ruleTypeArg;
    args.TryGetValue("ruleType", out ruleTypeArg);
    RuleType ruleType = (ruleTypeArg == null)
        ? RuleType.APPROX
        : (RuleType)Enum.Parse(typeof(RuleType), ruleTypeArg, true);

    engine = new PhoneticEngine(nameType, ruleType, concat);

    // LanguageSet: defaults to automagic, otherwise a comma-separated list.
    String languageSetArg;
    args.TryGetValue("languageSet", out languageSetArg);
    // Ordinal comparison: "auto" is a fixed keyword, not culture-sensitive text.
    if (languageSetArg == null || languageSetArg.Equals("auto", StringComparison.Ordinal))
    {
        languageSet = null;
    }
    else
    {
        // The split result is already enumerable, so it is handed to the set constructor directly.
        languageSet = LanguageSet.From(new HashSet<String>(languageSetArg.Split(',').TrimEnd()));
    }

    /*
     * org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:
     *
     * encoded = (languages == null)
     *     ? engine.encode(termAtt.toString())
     *     : engine.encode(termAtt.toString(), languages);
     *
     * Hence our approach, below:
     */
    if (languageSet == null)
    {
        return engine.Encode(input);
    }
    else
    {
        return engine.Encode(input, languageSet);
    }
}
/// <summary>
/// Parses a single phoneme expression. An expression without a <c>'['</c> becomes a
/// <see cref="Phoneme"/> tied to <c>Languages.ANY_LANGUAGE</c>; otherwise the text before the
/// <c>'['</c> is the phoneme text and the text between <c>'['</c> and the final <c>']'</c>
/// is split with <c>PLUS</c> into the phoneme's language names.
/// </summary>
/// <param name="ph">The phoneme expression to parse.</param>
/// <returns>The parsed <see cref="Phoneme"/>.</returns>
/// <exception cref="ArgumentException">
/// <paramref name="ph"/> contains a <c>'['</c> but does not end with <c>']'</c>.
/// </exception>
private static Phoneme ParsePhoneme(string ph)
{
    int bracketStart = ph.IndexOf('[');
    if (bracketStart < 0)
    {
        // No language restriction on this phoneme.
        return new Phoneme(ph, Languages.ANY_LANGUAGE);
    }

    if (!ph.EndsWith("]", StringComparison.Ordinal))
    {
        throw new ArgumentException("Phoneme expression contains a '[' but does not end in ']'");
    }

    string phonemeText = ph.Substring(0, bracketStart);

    // The language list sits strictly between the '[' and the trailing ']'.
    int listStart = bracketStart + 1;
    int listLength = (ph.Length - 1) - listStart;
    string languageList = ph.Substring(listStart, listLength);

    ISet<string> languageNames = new JCG.HashSet<string>(PLUS.Split(languageList).TrimEnd());
    return new Phoneme(phonemeText, LanguageSet.From(languageNames));
}
/// <summary>
/// Guesses the languages of a word.
/// </summary>
/// <remarks>
/// Starts from every known language and narrows the candidates with each rule that matches
/// the lower-cased word: an accept-on-match rule keeps only the languages it lists, any other
/// matching rule removes the languages it lists. If nothing is left, any language is assumed.
/// </remarks>
/// <param name="input">The word.</param>
/// <returns>A Set of Strings of language names that are potential matches for the input word.</returns>
public virtual LanguageSet GuessLanguages(string input)
{
    string lowered = input.ToLowerInvariant();

    ISet<string> candidates = new JCG.HashSet<string>(this.languages.GetLanguages());
    foreach (LangRule rule in this.rules)
    {
        if (!rule.Matches(lowered))
        {
            continue;
        }

        if (rule.acceptOnMatch)
        {
            // Keep only the candidates that the rule also lists.
            candidates.IntersectWith(rule.languages);
        }
        else
        {
            // Drop every candidate that the rule lists.
            candidates.ExceptWith(rule.languages);
        }
    }

    LanguageSet guessed = LanguageSet.From(candidates);
    if (guessed.Equals(Languages.NO_LANGUAGES))
    {
        return Languages.ANY_LANGUAGE;
    }

    return guessed;
}