/// <summary>
/// Tests whether the word sequence stored in <c>mWords</c> occurs in
/// <paramref name="words"/> beginning at <paramref name="startIdx"/>.
/// Words that <paramref name="gazetteer"/> reports as stop words are skipped
/// between consecutive term words (never before the first one).
/// </summary>
/// <param name="words">Input word sequence to match against.</param>
/// <param name="startIdx">Index in <paramref name="words"/> where the first term word must be found.</param>
/// <param name="len">
/// On success, the number of positions of <paramref name="words"/> covered by the match,
/// including any skipped stop words; 0 on failure.
/// </param>
/// <param name="gazetteer">Supplies the ignore-case flag and the stop-word test.</param>
/// <returns><c>true</c> if every word of the term was matched; otherwise <c>false</c>.</returns>
public bool Match(string[] words, int startIdx, out int len, Gazetteer gazetteer)
{
    len = 0;
    int pos = startIdx;

    // first word must match
    string head = words[pos];
    if (string.Compare(head, mWords[0], gazetteer.mIgnoreCase) != 0)
    {
        return false;
    }

    // *** only for the demo: the first word has to be of case type ABC, Abc or AbC
    Utils.CaseType headCase = Utils.GetCaseType(head);
    if (headCase != Utils.CaseType.ABC && headCase != Utils.CaseType.Abc && headCase != Utils.CaseType.AbC)
    {
        return false;
    }
    pos++;

    for (int termIdx = 1; termIdx < mWords.Count; termIdx++)
    {
        // skip stop words
        for (; pos < words.Length; pos++)
        {
            if (!gazetteer.IsStopWord(words[pos].ToLower()))
            {
                break;
            }
        }

        // input exhausted before the whole term was seen
        if (pos == words.Length)
        {
            return false;
        }

        string candidate = words[pos];
        pos++;
        if (string.Compare(candidate, mWords[termIdx], gazetteer.mIgnoreCase) != 0)
        {
            return false;
        }
    }

    len = pos - startIdx;
    return true;
}
/// <summary>
/// Tries to derive a lemma for a word that <c>mAcronymRegex</c> splits into an
/// <c>acronym</c> part and a <c>suffix</c> part, where the suffix (with leading
/// hyphens/dashes removed) is contained in <c>mLemListSuffix</c>.
/// </summary>
/// <param name="word">The surface word form.</param>
/// <param name="lemma">The lemma proposed for <paramref name="word"/>.</param>
/// <param name="acronymLemma">
/// On success: for an acronym part of case type Abc/AbC, <paramref name="lemma"/> with its
/// first character upper-cased; for case type ABC, the acronym part followed by the suffix
/// that the regex extracts from <paramref name="lemma"/> (or the acronym part alone if the
/// regex does not match the lemma). <c>null</c> on failure.
/// </param>
/// <returns><c>true</c> if an acronym lemma was produced; otherwise <c>false</c>.</returns>
private static bool GetAcronymLemma(string word, string lemma, out string acronymLemma)
{
    Match wordMatch = mAcronymRegex.Match(word);
    bool hasKnownSuffix = wordMatch.Success
        && mLemListSuffix.Contains(wordMatch.Result("${suffix}").TrimStart('-', '–', '—'));
    if (!hasKnownSuffix)
    {
        acronymLemma = null;
        return false;
    }

    string acronym = wordMatch.Result("${acronym}");
    Utils.CaseType acronymCase = Utils.GetCaseType(acronym);

    bool isCapitalized = acronymCase == Utils.CaseType.Abc || acronymCase == Utils.CaseType.AbC;
    if (isCapitalized && lemma.Length >= 1)
    {
        // capitalized acronym part: keep the lemma, upper-case its first character
        acronymLemma = char.ToUpper(lemma[0]) + lemma.Substring(1);
        return true;
    }

    if (acronymCase == Utils.CaseType.ABC) // uppercase
    {
        // keep the acronym as written and append whatever suffix the lemma carries
        Match lemmaMatch = mAcronymRegex.Match(lemma);
        acronymLemma = lemmaMatch.Success
            ? acronym + lemmaMatch.Result("${suffix}")
            : acronym;
        return true;
    }

    acronymLemma = null;
    return false;
}
/// <summary>
/// Post-processes a proposed lemma according to the word's tag and letter case.
/// </summary>
/// <remarks>
/// Rules, in order:
/// tags "N" and "M" yield the lower-cased word; tag "O" yields the word itself when it is
/// two characters long and ends with '.', otherwise the lower-cased word. For non-empty
/// words, tags starting with "R", "Kr", "Pp", "Ps", "So" or "Sl" apply acronym, exception-list
/// and case-restoring rules. If no rule fires, <paramref name="lemma"/> is returned unchanged.
/// </remarks>
/// <param name="lemma">The lemma proposed for <paramref name="word"/>.</param>
/// <param name="word">The surface word form.</param>
/// <param name="tag">The tag assigned to <paramref name="word"/>; rules are selected by exact value or prefix.</param>
/// <returns>The adjusted lemma.</returns>
private static string ApplyLemmaRules(string lemma, string word, string tag)
{
    switch (tag)
    {
        case "N":
        case "M":
            return word.ToLower();
        case "O":
            if (word.Length == 2 && word[1] == '.')
            {
                return word;
            }
            return word.ToLower();
    }

    if (word.Length < 1)
    {
        return lemma;
    }

    Utils.CaseType wordCase = Utils.GetCaseType(word);
    bool isAllCaps = wordCase == Utils.CaseType.ABC;
    bool isFirstCap = wordCase == Utils.CaseType.Abc || wordCase == Utils.CaseType.AbC || isAllCaps;
    string acronymLemma;

    if (tag.StartsWith("R"))
    {
        return GetAcronymLemma(word, lemma, out acronymLemma) ? acronymLemma : lemma;
    }

    if (tag.StartsWith("Kr"))
    {
        return isAllCaps ? lemma.ToUpper() : lemma;
    }

    if (tag.StartsWith("Pp"))
    {
        if (GetAcronymLemma(word, lemma, out acronymLemma))
        {
            return acronymLemma;
        }
        if (mLemListPpLemma.Contains(lemma))
        {
            // listed lemmas are always capitalized
            return char.ToUpper(lemma[0]) + lemma.Substring(1);
        }
        return lemma;
    }

    if (tag.StartsWith("Ps"))
    {
        if (GetAcronymLemma(word, lemma, out acronymLemma))
        {
            return acronymLemma;
        }
        if (mLemListPsLemma.Contains(lemma))
        {
            // listed lemmas are kept exactly as proposed
            return lemma;
        }
        if (isFirstCap && lemma.Length >= 1)
        {
            return char.ToUpper(lemma[0]) + lemma.Substring(1);
        }
        return lemma;
    }

    if (tag.StartsWith("So"))
    {
        if (word.Length == 1 || mLemListSoLemma.Contains(word))
        {
            return word;
        }
        Match wordMatch = mAcronymRegex.Match(word);
        if (wordMatch.Success)
        {
            string acronym = wordMatch.Result("${acronym}");
            if (mLemListSoLemma.Contains(acronym))
            {
                // keep the listed acronym and append the suffix carried by the lemma, if any
                Match lemmaMatch = mAcronymRegex.Match(lemma);
                return lemmaMatch.Success
                    ? acronym + lemmaMatch.Result("${suffix}")
                    : acronym;
            }
        }
        return lemma;
    }

    if (tag.StartsWith("Sl"))
    {
        if (GetAcronymLemma(word, lemma, out acronymLemma))
        {
            return acronymLemma;
        }
        if (isAllCaps)
        {
            return lemma.ToUpper();
        }
        if (isFirstCap && lemma.Length >= 1)
        {
            return char.ToUpper(lemma[0]) + lemma.Substring(1);
        }
        return lemma;
    }

    return lemma;
}