예제 #1
0
        /// <summary>
        /// Applies the final rules to convert from a language-specific phonetic representation to a
        /// language-independent representation.
        /// </summary>
        /// <param name="phonemeBuilder">The current phonemes.</param>
        /// <param name="finalRules">The final rules to apply.</param>
        /// <returns>The resulting phonemes.</returns>
        private PhonemeBuilder ApplyFinalRules(PhonemeBuilder phonemeBuilder,
                                               IDictionary <string, IList <Rule> > finalRules)
        {
            if (finalRules == null)
            {
                throw new ArgumentNullException("finalRules can not be null");
            }
            if (finalRules.Count == 0)
            {
                return(phonemeBuilder);
            }

            ISet <Phoneme> phonemes = new SortedSet <Phoneme>(Phoneme.COMPARER);

            foreach (Phoneme phoneme in phonemeBuilder.Phonemes)
            {
                PhonemeBuilder subBuilder  = PhonemeBuilder.Empty(phoneme.Languages);
                string         phonemeText = phoneme.GetPhonemeText();

                for (int i = 0; i < phonemeText.Length;)
                {
                    RulesApplication rulesApplication =
                        new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).Invoke();
                    bool found = rulesApplication.IsFound;
                    subBuilder = rulesApplication.PhonemeBuilder;

                    if (!found)
                    {
                        // not found, appending as-is
                        subBuilder.Append(phonemeText.Substring(i, 1));
                    }

                    i = rulesApplication.I;
                }

                phonemes.UnionWith(subBuilder.Phonemes);
            }

            return(new PhonemeBuilder(phonemes.ToList()));
        }
예제 #2
0
        /// <summary>
        /// Encodes an input string into an output phonetic representation, given a set of possible origin languages.
        /// </summary>
        /// <param name="input">String to phoneticise; a string with dashes or spaces separating each word.</param>
        /// <param name="languageSet"></param>
        /// <returns>A phonetic representation of the input; a string containing '-'-separated phonetic representations of the input.</returns>
        public virtual string Encode(string input, LanguageSet languageSet)
        {
            IDictionary <string, IList <Rule> > rules = Rule.GetInstanceMap(this.nameType, RuleType.RULES, languageSet);
            // rules common across many (all) languages
            IDictionary <string, IList <Rule> > finalRules1 = Rule.GetInstanceMap(this.nameType, this.ruleType, "common");
            // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
            IDictionary <string, IList <Rule> > finalRules2 = Rule.GetInstanceMap(this.nameType, this.ruleType, languageSet);

            // tidy the input
            // lower case is a locale-dependent operation
            input = input.ToLowerInvariant().Replace('-', ' ').Trim();

            if (this.nameType == NameType.GENERIC)
            {
                if (input.Length >= 2 && input.Substring(0, 2 - 0).Equals("d'"))
                { // check for d'
                    string remainder = input.Substring(2);
                    string combined  = "d" + remainder;
                    return("(" + Encode(remainder) + ")-(" + Encode(combined) + ")");
                }
                foreach (string l in NAME_PREFIXES[this.nameType])
                {
                    // handle generic prefixes
                    if (input.StartsWith(l + " ", StringComparison.Ordinal))
                    {
                        // check for any prefix in the words list
                        string remainder = input.Substring(l.Length + 1); // input without the prefix
                        string combined  = l + remainder;                 // input with prefix without space
                        return("(" + Encode(remainder) + ")-(" + Encode(combined) + ")");
                    }
                }
            }

            IList <string> words  = WHITESPACE.Split(input).ToList();
            IList <string> words2 = new List <string>();

            // special-case handling of word prefixes based upon the name type
            switch (this.nameType)
            {
            case NameType.SEPHARDIC:
                foreach (string aWord in words)
                {
                    string[] parts    = aWord.Split(new char[] { '\'' }, StringSplitOptions.RemoveEmptyEntries);
                    string   lastPart = parts[parts.Length - 1];
                    words2.Add(lastPart);
                }
                words2.RemoveAll(NAME_PREFIXES[this.nameType]);
                break;

            case NameType.ASHKENAZI:
                words2.AddRange(words);
                words2.RemoveAll(NAME_PREFIXES[this.nameType]);
                break;

            case NameType.GENERIC:
                words2.AddRange(words);
                break;

            default:
                throw new InvalidOperationException("Unreachable case: " + this.nameType);
            }

            if (this.concat)
            {
                // concat mode enabled
                input = Join(words2, " ");
            }
            else if (words2.Count == 1)
            {
                // not a multi-word name
                //input = words.iterator().next();
                input = words.FirstOrDefault();
            }
            else
            {
                // encode each word in a multi-word name separately (normally used for approx matches)
                StringBuilder result = new StringBuilder();
                foreach (string word in words2)
                {
                    result.Append("-").Append(Encode(word));
                }
                // return the result without the leading "-"
                return(result.ToString(1, result.Length - 1));
            }

            PhonemeBuilder phonemeBuilder = PhonemeBuilder.Empty(languageSet);

            // loop over each char in the input - we will handle the increment manually
            for (int i = 0; i < input.Length;)
            {
                RulesApplication rulesApplication =
                    new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).Invoke();
                i = rulesApplication.I;
                phonemeBuilder = rulesApplication.PhonemeBuilder;
            }

            // Apply the general rules
            phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules1);
            // Apply the language-specific rules
            phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules2);

            return(phonemeBuilder.MakeString());
        }