Ejemplo n.º 1
0
 /// <summary>
 /// Looks up the rules for a name type, a rule type and exactly one language.
 /// </summary>
 /// <param name="nameType">The <see cref="NameType"/> to consider.</param>
 /// <param name="rt">The <see cref="RuleType"/> to consider.</param>
 /// <param name="lang">The single language to consider.</param>
 /// <returns>A list of <see cref="Rule"/>s that apply.</returns>
 public static IList<Rule> GetInstance(NameType nameType, RuleType rt, string lang)
 {
     // Wrap the one language in a set so the LanguageSet-based overload does the work.
     JCG.HashSet<string> singleLanguage = new JCG.HashSet<string>();
     singleLanguage.Add(lang);

     LanguageSet languageSet = LanguageSet.From(singleLanguage);
     return GetInstance(nameType, rt, languageSet);
 }
Ejemplo n.º 2
0
            /// <summary>
            /// Applies the given phoneme expression to all phonemes in this phoneme builder.
            /// <para/>
            /// Every phoneme currently held is joined with every phoneme of <paramref name="phonemeExpr"/>.
            /// A join is kept only when the two language sets have a non-empty restriction; incompatible
            /// pairs are dropped. Duplicate joins (as decided by <c>Equals</c>) are ignored and do not
            /// count towards <paramref name="maxPhonemes"/>.
            /// </summary>
            /// <param name="phonemeExpr">The expression to apply.</param>
            /// <param name="maxPhonemes">The maximum number of distinct phonemes to build up.</param>
            public void Apply(IPhonemeExpr phonemeExpr, int maxPhonemes)
            {
                // LUCENENET NOTE: LinkedHashSet cares about insertion order - in .NET, we can just use List<T> for that.
                // Because a List does not reject duplicates on its own, uniqueness is enforced at insertion
                // time below; otherwise duplicates would use up slots of the maxPhonemes budget.
                IList<Phoneme> newPhonemes = new List<Phoneme>(maxPhonemes);

                foreach (Phoneme left in this.phonemes)
                {
                    foreach (Phoneme right in phonemeExpr.Phonemes)
                    {
                        LanguageSet languages = left.Languages.RestrictTo(right.Languages);
                        if (languages.IsEmpty)
                        {
                            // No language in common: this pair cannot be combined.
                            continue;
                        }

                        Phoneme join = new Phoneme(left, right, languages);
                        if (newPhonemes.Count >= maxPhonemes)
                        {
                            // Only reachable when maxPhonemes is zero; nothing may be added.
                            continue;
                        }

                        // Same equality call (existing.Equals(candidate)) that the former post-filter used.
                        if (newPhonemes.Any(existing => existing.Equals(join)))
                        {
                            continue;
                        }

                        newPhonemes.Add(join);
                        if (newPhonemes.Count >= maxPhonemes)
                        {
                            // Budget exhausted: leave both loops at once.
                            goto EXPR_break;
                        }
                    }
                }
                EXPR_break: { }

                // newPhonemes is already duplicate-free, so it can replace the old content directly.
                this.phonemes.Clear();
                this.phonemes.AddRange(newPhonemes);
            }
Ejemplo n.º 3
0
            /// <summary>
            /// Combines every phoneme held by this builder with every phoneme of the given expression.
            /// <para/>
            /// A pair is joined only when restricting the left language set to the right one is non-empty;
            /// all other pairs are dropped. Collection stops once <paramref name="maxPhonemes"/> entries
            /// have been gathered.
            /// </summary>
            /// <param name="phonemeExpr">The expression to apply.</param>
            /// <param name="maxPhonemes">The maximum number of phonemes to build up.</param>
            public void Apply(IPhonemeExpr phonemeExpr, int maxPhonemes)
            {
                ISet<Phoneme> combined = new JCG.LinkedHashSet<Phoneme>(maxPhonemes);
                bool limitReached = false;

                foreach (Phoneme left in this.phonemes)
                {
                    foreach (Phoneme right in phonemeExpr.Phonemes)
                    {
                        LanguageSet shared = left.Languages.RestrictTo(right.Languages);
                        if (shared.IsEmpty)
                        {
                            continue;
                        }

                        Phoneme joined = new Phoneme(left, right, shared);
                        if (combined.Count >= maxPhonemes)
                        {
                            continue;
                        }

                        combined.Add(joined);
                        if (combined.Count >= maxPhonemes)
                        {
                            // Budget used up: stop scanning both collections.
                            limitReached = true;
                            break;
                        }
                    }

                    if (limitReached)
                    {
                        break;
                    }
                }

                // Replace the builder's content with the freshly joined phonemes.
                this.phonemes.Clear();
                this.phonemes.UnionWith(combined);
            }
Ejemplo n.º 4
0
        /// <summary>
        /// Checks that the languages guessed for <paramref name="name"/> include <paramref name="language"/>.
        /// </summary>
        /// <param name="name">The name whose language is guessed.</param>
        /// <param name="language">The language that must be among the guesses.</param>
        /// <param name="exactness">Not read by this method.</param>
        public void TestLanguageGuessing(String name, String language, String exactness)
        {
            LanguageSet guesses = this.lang.GuessLanguages(name);

            bool containsExpected = guesses.Contains(language);
            String failureMessage =
                "language predicted for name '" + name + "' is wrong: " + guesses + " should contain '" + language + "'";

            Assert.True(containsExpected, failureMessage);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Collects, into one flat list, every rule registered for a name type, rule type and language set.
        /// </summary>
        /// <param name="nameType">The <see cref="NameType"/> to consider.</param>
        /// <param name="rt">The <see cref="RuleType"/> to consider.</param>
        /// <param name="langs">The set of languages to consider.</param>
        /// <returns>A list of <see cref="Rule"/>s that apply.</returns>
        public static IList<Rule> GetInstance(NameType nameType, RuleType rt,
                                              LanguageSet langs)
        {
            IDictionary<string, IList<Rule>> rulesByKey = GetInstanceMap(nameType, rt, langs);
            IList<Rule> flattened = new List<Rule>();

            // Append each group in the order the map enumerates its values.
            foreach (IList<Rule> group in rulesByKey.Values)
            {
                foreach (Rule rule in group)
                {
                    flattened.Add(rule);
                }
            }

            return flattened;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds a <see cref="PhoneticEngine"/> from string-valued arguments and encodes
        /// <paramref name="input"/> with it.
        /// <para/>
        /// This code is similar in style to code found in Solr:
        /// solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
        /// <para/>
        /// Making a unit test out of it to protect Solr from possible future
        /// regressions in Commons-Codec.
        /// </summary>
        /// <param name="args">
        /// Optional settings read by key: <c>"nameType"</c>, <c>"ruleType"</c> and <c>"languageSet"</c>.
        /// Missing keys fall back to GENERIC, APPROX and automatic language guessing respectively.
        /// </param>
        /// <param name="concat">Passed through to the <see cref="PhoneticEngine"/> constructor.</param>
        /// <param name="input">The text to encode.</param>
        /// <returns>The encoding produced by the engine.</returns>
        private static String Encode(IDictionary<String, String> args, bool concat, String input)
        {
            LanguageSet languageSet;
            PhoneticEngine engine;

            // PhoneticEngine = NameType + RuleType + concat
            // we use common-codec's defaults: GENERIC + APPROX + true
            String nameTypeArg;

            // TryGetValue leaves the out value null when the key is absent, which selects the default below.
            args.TryGetValue("nameType", out nameTypeArg);
            NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : (NameType)Enum.Parse(typeof(NameType), nameTypeArg, true);

            String ruleTypeArg;

            args.TryGetValue("ruleType", out ruleTypeArg);
            RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : (RuleType)Enum.Parse(typeof(RuleType), ruleTypeArg, true);

            engine = new PhoneticEngine(nameType, ruleType, concat);

            // LanguageSet: defaults to automagic, otherwise a comma-separated list.
            String languageSetArg;

            args.TryGetValue("languageSet", out languageSetArg);
            if (languageSetArg == null || languageSetArg.Equals("auto", StringComparison.Ordinal))
            {
                languageSet = null;
            }
            else
            {
                // The split result is already enumerable, so it can seed the set directly.
                languageSet = LanguageSet.From(new HashSet<String>(languageSetArg.Split(',').TrimEnd()));
            }

            /*
             *  org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:
             *
             *  encoded = (languages == null)
             *      ? engine.encode(termAtt.toString())
             *      : engine.encode(termAtt.toString(), languages);
             *
             *  Hence our approach, below:
             */
            if (languageSet == null)
            {
                return engine.Encode(input);
            }

            return engine.Encode(input, languageSet);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Parses a phoneme of the form <c>text</c> or <c>text[lang+lang...]</c>.
        /// </summary>
        /// <param name="ph">The phoneme expression to parse.</param>
        /// <returns>
        /// A <see cref="Phoneme"/> carrying the bracketed languages, or <see cref="Languages.ANY_LANGUAGE"/>
        /// when the expression contains no <c>'['</c>.
        /// </returns>
        /// <exception cref="ArgumentException">The expression contains a '[' but does not end in ']'.</exception>
        private static Phoneme ParsePhoneme(string ph)
        {
            int bracketStart = ph.IndexOf('[');

            // No language restriction given.
            if (bracketStart < 0)
            {
                return new Phoneme(ph, Languages.ANY_LANGUAGE);
            }

            if (!ph.EndsWith("]", StringComparison.Ordinal))
            {
                throw new ArgumentException("Phoneme expression contains a '[' but does not end in ']'");
            }

            // Text before '[' is the phoneme; text between '[' and the final ']' lists the languages.
            string phonemeText = ph.Substring(0, bracketStart);
            string languageList = ph.Substring(bracketStart + 1, ph.Length - bracketStart - 2);
            ISet<string> languageNames = new JCG.HashSet<string>(PLUS.Split(languageList).TrimEnd());

            return new Phoneme(phonemeText, LanguageSet.From(languageNames));
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Guesses the languages of a word.
        /// <para/>
        /// Starts from all known languages and narrows them with every matching rule: an accepting rule
        /// keeps only the rule's languages, any other matching rule removes the rule's languages.
        /// </summary>
        /// <param name="input">The word.</param>
        /// <returns>
        /// The remaining languages, or <see cref="Languages.ANY_LANGUAGE"/> when the result equals
        /// <see cref="Languages.NO_LANGUAGES"/>.
        /// </returns>
        public virtual LanguageSet GuessLanguages(string input)
        {
            string lowered = input.ToLowerInvariant();
            ISet<string> candidates = new JCG.HashSet<string>(this.languages.GetLanguages());

            foreach (LangRule rule in this.rules)
            {
                if (!rule.Matches(lowered))
                {
                    continue;
                }

                if (!rule.acceptOnMatch)
                {
                    // Rejecting rule: its languages are ruled out.
                    foreach (var rejected in rule.languages)
                    {
                        candidates.Remove(rejected);
                    }
                    continue;
                }

                // Accepting rule: collect first, remove afterwards, so the set is not
                // modified while it is being enumerated.
                IList<string> notAccepted = new JCG.List<string>();
                foreach (var candidate in candidates)
                {
                    if (!rule.languages.Contains(candidate))
                    {
                        notAccepted.Add(candidate);
                    }
                }
                foreach (var dropped in notAccepted)
                {
                    candidates.Remove(dropped);
                }
            }

            LanguageSet result = LanguageSet.From(candidates);
            if (result.Equals(Languages.NO_LANGUAGES))
            {
                return Languages.ANY_LANGUAGE;
            }

            return result;
        }
Ejemplo n.º 9
0
 /// <summary>
 /// Restricts this language set to the languages that are also in <paramref name="other"/>.
 /// </summary>
 /// <param name="other">The language set to restrict to.</param>
 /// <returns>
 /// <paramref name="other"/> when it is <see cref="Languages.NO_LANGUAGES"/>, this instance when it is
 /// <see cref="Languages.ANY_LANGUAGE"/>, otherwise a set built from the languages found in both.
 /// </returns>
 public override LanguageSet RestrictTo(LanguageSet other)
 {
     if (other == Languages.NO_LANGUAGES)
     {
         return other;
     }

     if (other == Languages.ANY_LANGUAGE)
     {
         return this;
     }

     SomeLanguages that = (SomeLanguages)other;

     // The intersection can never be larger than the smaller of the two sets.
     int capacity = Math.Min(languages.Count, that.languages.Count);
     ISet<string> common = new JCG.HashSet<string>(capacity);
     foreach (string lang in languages)
     {
         if (that.languages.Contains(lang))
         {
             common.Add(lang);
         }
     }

     return From(common);
 }
Ejemplo n.º 10
0
 /// <summary>
 /// Creates a builder whose only content is a single zero-length phoneme tied to the given languages.
 /// The builder can then be appended to. This should be the only way to create a new
 /// phoneme from scratch.
 /// </summary>
 /// <param name="languages">The set of languages.</param>
 /// <returns>A new, empty phoneme builder.</returns>
 public static PhonemeBuilder Empty(LanguageSet languages)
 {
     Phoneme seed = new Phoneme("", languages);
     return new PhonemeBuilder(seed);
 }
Ejemplo n.º 11
0
        /// <summary>
        /// Encodes an input string into an output phonetic representation, given a set of possible origin languages.
        /// </summary>
        /// <param name="input">String to phoneticise; a string with dashes or spaces separating each word.</param>
        /// <param name="languageSet">The set of possible origin languages used to select the rules.</param>
        /// <returns>A phonetic representation of the input; a string containing '-'-separated phonetic representations of the input.</returns>
        /// <exception cref="InvalidOperationException">The engine's name type is not one of the handled values.</exception>
        public virtual string Encode(string input, LanguageSet languageSet)
        {
            IDictionary<string, IList<Rule>> rules = Rule.GetInstanceMap(this.nameType, RuleType.RULES, languageSet);
            // rules common across many (all) languages
            IDictionary<string, IList<Rule>> finalRules1 = Rule.GetInstanceMap(this.nameType, this.ruleType, "common");
            // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
            IDictionary<string, IList<Rule>> finalRules2 = Rule.GetInstanceMap(this.nameType, this.ruleType, languageSet);

            // tidy the input
            // lower case is a locale-dependent operation, so the invariant culture is used
            input = input.ToLowerInvariant().Replace('-', ' ').Trim();

            if (this.nameType == NameType.GENERIC)
            {
                if (input.StartsWith("d'", StringComparison.Ordinal))
                { // check for d'
                    string remainder = input.Substring(2);
                    string combined = "d" + remainder;
                    // Encode both readings: without the prefix, and with it glued to the name.
                    return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
                }
                foreach (string l in NAME_PREFIXES[this.nameType])
                {
                    // handle generic prefixes
                    if (input.StartsWith(l + " ", StringComparison.Ordinal))
                    {
                        // check for any prefix in the words list
                        string remainder = input.Substring(l.Length + 1); // input without the prefix
                        string combined = l + remainder;                  // input with prefix without space
                        return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
                    }
                }
            }

            IList<string> words = WHITESPACE.Split(input).ToList();
            IList<string> words2 = new List<string>();

            // special-case handling of word prefixes based upon the name type
            switch (this.nameType)
            {
                case NameType.SEPHARDIC:
                    foreach (string aWord in words)
                    {
                        string[] parts = aWord.Split(new char[] { '\'' }, StringSplitOptions.RemoveEmptyEntries);
                        // RemoveEmptyEntries yields an empty array for an empty word or a word made only
                        // of apostrophes; fall back to an empty last part instead of indexing at -1.
                        string lastPart = parts.Length > 0 ? parts[parts.Length - 1] : string.Empty;
                        words2.Add(lastPart);
                    }
                    words2.RemoveAll(NAME_PREFIXES[this.nameType]);
                    break;

                case NameType.ASHKENAZI:
                    words2.AddRange(words);
                    words2.RemoveAll(NAME_PREFIXES[this.nameType]);
                    break;

                case NameType.GENERIC:
                    words2.AddRange(words);
                    break;

                default:
                    throw new InvalidOperationException("Unreachable case: " + this.nameType);
            }

            if (this.concat)
            {
                // concat mode enabled
                input = Join(words2, " ");
            }
            else if (words2.Count == 1)
            {
                // not a multi-word name
                //input = words.iterator().next();
                // NOTE(review): this reads from words, not words2 - presumably kept to match the Java original; confirm.
                input = words.FirstOrDefault();
            }
            else
            {
                // encode each word in a multi-word name separately (normally used for approx matches)
                StringBuilder result = new StringBuilder();
                foreach (string word in words2)
                {
                    result.Append("-").Append(Encode(word));
                }
                // return the result without the leading "-"
                return result.ToString(1, result.Length - 1);
            }

            PhonemeBuilder phonemeBuilder = PhonemeBuilder.Empty(languageSet);

            // loop over each char in the input - we will handle the increment manually
            for (int i = 0; i < input.Length;)
            {
                RulesApplication rulesApplication =
                    new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).Invoke();
                // The rules application reports the next index and the updated builder.
                i = rulesApplication.I;
                phonemeBuilder = rulesApplication.PhonemeBuilder;
            }

            // Apply the general rules
            phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules1);
            // Apply the language-specific rules
            phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules2);

            return phonemeBuilder.MakeString();
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Encodes a string to its phonetic representation, guessing the candidate languages from the input itself.
        /// </summary>
        /// <param name="input">The string to encode.</param>
        /// <returns>The encoding of the input.</returns>
        public virtual string Encode(string input)
        {
            // Guess first, then hand over to the overload that takes an explicit language set.
            return Encode(input, this.lang.GuessLanguages(input));
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Guesses the single language of a word.
        /// </summary>
        /// <param name="text">The word.</param>
        /// <returns>The language that the word originates from or <see cref="Languages.ANY"/> if there was no unique match.</returns>
        public virtual string GuessLanguage(string text)
        {
            LanguageSet guessed = GuessLanguages(text);

            // Only a set holding exactly one language is an unambiguous answer.
            if (guessed.IsSingleton)
            {
                return guessed.GetAny();
            }

            return Languages.ANY;
        }
Ejemplo n.º 14
0
 /// <summary>
 /// Produces a language set from this set and <paramref name="other"/>.
 /// NOTE(review): presumably the languages common to both sets - confirm against the concrete overrides.
 /// </summary>
 /// <param name="other">The language set to restrict to.</param>
 /// <returns>The resulting <see cref="LanguageSet"/>.</returns>
 public abstract LanguageSet RestrictTo(LanguageSet other);
Ejemplo n.º 15
0
 /// <summary>
 /// Returns <paramref name="other"/> unchanged.
 /// </summary>
 /// <param name="other">The language set to restrict to.</param>
 /// <returns>The same instance that was passed in.</returns>
 public override LanguageSet RestrictTo(LanguageSet other)
 {
     return other;
 }
Ejemplo n.º 16
0
 /// <summary>
 /// Creates a phoneme from a character sequence and the languages it belongs to.
 /// </summary>
 /// <param name="phonemeText">The phoneme text; its string form is copied into an internal buffer.</param>
 /// <param name="languages">The languages associated with this phoneme.</param>
 public Phoneme(ICharSequence phonemeText, LanguageSet languages)
 {
     // Take a private copy of the text so later appends do not touch the caller's sequence.
     string text = phonemeText.ToString();

     this.languages = languages;
     this.phonemeText = new StringBuilder(text);
 }
Ejemplo n.º 17
0
 /// <summary>
 /// Creates a phoneme from a string and the languages it belongs to.
 /// </summary>
 /// <param name="phonemeText">The phoneme text; it seeds an internal buffer.</param>
 /// <param name="languages">The languages associated with this phoneme.</param>
 public Phoneme(string phonemeText, LanguageSet languages)
 {
     this.languages = languages;
     this.phonemeText = new StringBuilder(phonemeText);
 }
Ejemplo n.º 18
0
 /// <summary>
 /// Gets rules for a combination of name type, rule type and languages.
 /// <para/>
 /// A language set holding exactly one language is looked up under that language; every other
 /// set is looked up under <see cref="Languages.ANY"/>.
 /// <para/>
 /// since 1.9
 /// </summary>
 /// <param name="nameType">The <see cref="NameType"/> to consider.</param>
 /// <param name="rt">The <see cref="RuleType"/> to consider.</param>
 /// <param name="langs">The set of languages to consider.</param>
 /// <returns>A map containing all <see cref="Rule"/>s that apply, grouped by the first character of the rule pattern.</returns>
 public static IDictionary<string, IList<Rule>> GetInstanceMap(NameType nameType, RuleType rt,
                                                               LanguageSet langs)
 {
     // Pick the lookup key first, then delegate once to the string-based overload.
     string languageKey;
     if (langs.IsSingleton)
     {
         languageKey = langs.GetAny();
     }
     else
     {
         languageKey = Languages.ANY;
     }

     return GetInstanceMap(nameType, rt, languageKey);
 }
Ejemplo n.º 19
0
 /// <summary>
 /// Creates a phoneme whose text is the left phoneme's text followed by the right phoneme's text.
 /// </summary>
 /// <param name="phonemeLeft">The phoneme supplying the leading text.</param>
 /// <param name="phonemeRight">The phoneme whose text is appended.</param>
 /// <param name="languages">The languages associated with the new phoneme.</param>
 public Phoneme(Phoneme phonemeLeft, Phoneme phonemeRight, LanguageSet languages)
     : this(phonemeLeft.phonemeText, languages)
 {
     // The chained constructor seeded the buffer with the left text; add the right text after it.
     phonemeText.Append(phonemeRight.phonemeText);
 }