示例#1
0
            /// <summary>
            /// Tests whether the words beginning at <paramref name="startIdx"/> match this entry's
            /// word list (<c>mWords</c>), allowing stop words between consecutive entry words.
            /// </summary>
            /// <param name="words">Input word sequence to match against.</param>
            /// <param name="startIdx">Index in <paramref name="words"/> where the first entry word must occur.</param>
            /// <param name="len">
            /// On success, the number of input words covered by the match (skipped stop words
            /// included); 0 when the match fails.
            /// </param>
            /// <param name="gazetteer">Supplies the ignore-case setting and the stop-word test.</param>
            /// <returns><c>true</c> if every entry word was found in order; otherwise <c>false</c>.</returns>
            public bool Match(string[] words, int startIdx, out int len, Gazetteer gazetteer)
            {
                len = 0;

                // The first entry word has to sit exactly at startIdx.
                string firstWord = words[startIdx];
                if (string.Compare(firstWord, mWords[0], gazetteer.mIgnoreCase) != 0)
                {
                    return false;
                }

                // *** only for the demo: the first word must have case type ABC, Abc or AbC.
                Utils.CaseType firstWordCase = Utils.GetCaseType(firstWord);
                bool isAcceptedCase = firstWordCase == Utils.CaseType.ABC
                    || firstWordCase == Utils.CaseType.Abc
                    || firstWordCase == Utils.CaseType.AbC;
                if (!isAcceptedCase)
                {
                    return false;
                }

                int cursor = startIdx + 1;
                for (int entryPos = 1; entryPos < mWords.Count; entryPos++)
                {
                    // Stop words in the input are skipped before each remaining entry word.
                    while (cursor < words.Length && gazetteer.IsStopWord(words[cursor].ToLower()))
                    {
                        cursor++;
                    }

                    // Input exhausted while entry words are still pending.
                    if (cursor == words.Length)
                    {
                        return false;
                    }

                    if (string.Compare(words[cursor], mWords[entryPos], gazetteer.mIgnoreCase) != 0)
                    {
                        return false;
                    }
                    cursor++;
                }

                len = cursor - startIdx;
                return true;
            }
示例#2
0
    /// <summary>
    /// Tries to derive a lemma for a word that <c>mAcronymRegex</c> splits into an acronym and a
    /// suffix, where the suffix (leading dashes removed) is listed in <c>mLemListSuffix</c>.
    /// </summary>
    /// <param name="word">Word to test against the acronym pattern.</param>
    /// <param name="lemma">Lemma proposed for <paramref name="word"/>.</param>
    /// <param name="acronymLemma">
    /// On success, the derived lemma: for case types Abc/AbC the given lemma with its first
    /// character upper-cased; for case type ABC the acronym itself, followed by the lemma's own
    /// suffix when the lemma also matches the acronym pattern. <c>null</c> on failure.
    /// </param>
    /// <returns><c>true</c> if an acronym lemma was produced; otherwise <c>false</c>.</returns>
    private static bool GetAcronymLemma(string word, string lemma, out string acronymLemma)
    {
        Match wordMatch = mAcronymRegex.Match(word);
        bool hasListedSuffix = wordMatch.Success
            && mLemListSuffix.Contains(wordMatch.Result("${suffix}").TrimStart('-', '–', '—'));
        if (!hasListedSuffix)
        {
            acronymLemma = null;
            return false;
        }

        string acronym = wordMatch.Result("${acronym}");
        Utils.CaseType acronymCase = Utils.GetCaseType(acronym);

        if (acronymCase == Utils.CaseType.ABC) // uppercase
        {
            // Keep the acronym as written; carry over the lemma's suffix if it has one.
            Match lemmaMatch = mAcronymRegex.Match(lemma);
            acronymLemma = lemmaMatch.Success
                ? acronym + lemmaMatch.Result("${suffix}")
                : acronym;
            return true;
        }

        bool isCapitalized = acronymCase == Utils.CaseType.Abc || acronymCase == Utils.CaseType.AbC;
        if (isCapitalized && lemma.Length >= 1)
        {
            acronymLemma = char.ToUpper(lemma[0]) + lemma.Substring(1);
            return true;
        }

        acronymLemma = null;
        return false;
    }
示例#3
0
 /// <summary>
 /// Adjusts a lemma according to the token's tag and the casing / acronym structure of the
 /// original word.
 /// </summary>
 /// <param name="lemma">Candidate lemma; returned unchanged when no rule applies.</param>
 /// <param name="word">Word form whose casing and acronym structure drive the rules.</param>
 /// <param name="tag">
 /// Tag of the token. Rules are selected by exact value ("N", "M", "O") or by prefix
 /// ("R", "Kr", "Pp", "Ps", "So", "Sl").
 /// </param>
 /// <returns>The lemma after applying the first matching rule.</returns>
 private static string ApplyLemmaRules(string lemma, string word, string tag)
 {
     // Tag codes are fixed identifiers, so prefixes are compared ordinally instead of
     // through the current culture's linguistic comparison rules.
     const System.StringComparison tagComparison = System.StringComparison.Ordinal;

     if (tag == "N" || tag == "M")
     {
         return(word.ToLower());
     }
     if (tag == "O")
     {
         // A two-character word ending in '.' is kept verbatim; anything else is lower-cased.
         if (word.Length == 2 && word[1] == '.')
         {
             return(word);
         }
         else
         {
             return(word.ToLower());
         }
     }
     if (word.Length >= 1)
     {
         Utils.CaseType caseType   = Utils.GetCaseType(word);
         bool           isFirstCap = caseType == Utils.CaseType.Abc || caseType == Utils.CaseType.AbC || caseType == Utils.CaseType.ABC;
         bool           isAllCaps  = caseType == Utils.CaseType.ABC;
         string         acronymLemma;
         if (tag.StartsWith("R", tagComparison))
         {
             if (GetAcronymLemma(word, lemma, out acronymLemma))
             {
                 return(acronymLemma);
             }
         }
         else if (tag.StartsWith("Kr", tagComparison))
         {
             if (isAllCaps)
             {
                 return(lemma.ToUpper());
             }
         }
         else if (tag.StartsWith("Pp", tagComparison))
         {
             if (GetAcronymLemma(word, lemma, out acronymLemma))
             {
                 return(acronymLemma);
             }
             // Length guard mirrors the "Ps" and "Sl" branches so an empty lemma is never indexed.
             else if (mLemListPpLemma.Contains(lemma) && lemma.Length >= 1)
             {
                 return(char.ToUpper(lemma[0]) + lemma.Substring(1));
             }
         }
         else if (tag.StartsWith("Ps", tagComparison))
         {
             if (GetAcronymLemma(word, lemma, out acronymLemma))
             {
                 return(acronymLemma);
             }
             else if (mLemListPsLemma.Contains(lemma))
             {
                 // Listed lemmas are kept exactly as given.
                 return(lemma);
             }
             else if (isFirstCap && lemma.Length >= 1)
             {
                 return(char.ToUpper(lemma[0]) + lemma.Substring(1));
             }
         }
         else if (tag.StartsWith("So", tagComparison))
         {
             if (word.Length == 1 || mLemListSoLemma.Contains(word))
             {
                 return(word);
             }
             Match wordMatch = mAcronymRegex.Match(word);
             if (wordMatch.Success)
             {
                 // Extract the acronym part once and reuse it below.
                 string acronym = wordMatch.Result("${acronym}");
                 if (mLemListSoLemma.Contains(acronym))
                 {
                     // Keep the acronym as written; append the lemma's suffix when it has one.
                     Match lemmaMatch = mAcronymRegex.Match(lemma);
                     if (lemmaMatch.Success)
                     {
                         return(acronym + lemmaMatch.Result("${suffix}"));
                     }
                     else
                     {
                         return(acronym);
                     }
                 }
             }
         }
         else if (tag.StartsWith("Sl", tagComparison))
         {
             if (GetAcronymLemma(word, lemma, out acronymLemma))
             {
                 return(acronymLemma);
             }
             else if (isAllCaps)
             {
                 return(lemma.ToUpper());
             }
             else if (isFirstCap && lemma.Length >= 1)
             {
                 return(char.ToUpper(lemma[0]) + lemma.Substring(1));
             }
         }
     }
     return(lemma);
 }