/// <summary>
/// Populates this instance from parallel sequences of token strings and POS constraints.
/// </summary>
/// <remarks>
/// Each token is passed through <c>Normalize</c>; tokens whose lower-cased form is reported
/// as a stop word by <paramref name="gazetteer"/> are dropped. The remaining tokens are
/// appended to <c>mTokens</c>, and <c>PrepareTokens</c> is invoked if at least one was kept.
/// The two sequences are consumed in lock-step, so pairing stops as soon as the shorter
/// one is exhausted; surplus items in the longer sequence are ignored.
/// </remarks>
/// <param name="tokens">Token strings, in order.</param>
/// <param name="posConstraints">POS constraint for the token at the same position.</param>
/// <param name="lemmatize">When true, a lemma is computed for every kept token via <c>mLemmatizer</c>; otherwise the lemma is left null.</param>
/// <param name="caseMatchingType">Stored in <c>mCaseMatchingType</c> and forwarded to <c>PrepareTokens</c>.</param>
/// <param name="enabled">Stored in <c>mEnabled</c>.</param>
/// <param name="gazetteer">Supplies the stop-word test.</param>
private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
{
    mCaseMatchingType = caseMatchingType;
    mEnabled = enabled;
    // IEnumerator<T> is IDisposable. Manually obtained enumerators must be disposed
    // (foreach would do it implicitly), otherwise iterator blocks never run their
    // finally sections and resource-backed sequences leak.
    using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
    using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
    {
        while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
        {
            string tokenStr = Normalize(enumTokens.Current);
            string posConstraint = enumPosConstraints.Current;
            if (!gazetteer.IsStopWord(tokenStr.ToLower()))
            {
                string lemma = null;
                if (lemmatize)
                {
                    lemma = mLemmatizer.GetStem(tokenStr);
                    // An empty stem falls back to the normalized token itself.
                    if (lemma == "") { lemma = tokenStr; }
                }
                GazetteerToken token = new GazetteerToken(tokenStr, posConstraint, lemma);
                mTokens.Add(token);
            }
        }
    }
    if (mTokens.Count > 0)
    {
        PrepareTokens(caseMatchingType, lemmatize);
    }
}
/// <summary>
/// Tests whether the words of this term (<c>mWords</c>) occur in <paramref name="words"/>
/// beginning exactly at <paramref name="startIdx"/>.
/// </summary>
/// <remarks>
/// The first term word must match the word at <paramref name="startIdx"/> directly. Before
/// each subsequent term word, input words reported as stop words by
/// <paramref name="gazetteer"/> (tested in lower case) are skipped. Comparisons use
/// <c>gazetteer.mIgnoreCase</c>.
/// </remarks>
/// <param name="words">Input words to match against.</param>
/// <param name="startIdx">Index in <paramref name="words"/> where the match must start.</param>
/// <param name="len">On success, the number of input words consumed (including skipped stop words); 0 on failure.</param>
/// <param name="gazetteer">Supplies the case-sensitivity flag and the stop-word test.</param>
/// <returns>True when all term words were matched; otherwise false.</returns>
public bool Match(string[] words, int startIdx, out int len, Gazetteer gazetteer)
{
    len = 0;
    int pos = startIdx;
    string firstWord = words[pos];

    // The first word must match the first term word.
    bool firstWordMatches = string.Compare(firstWord, mWords[0], gazetteer.mIgnoreCase) == 0;
    if (!firstWordMatches)
    {
        return false;
    }

    // *** only for the demo: the first word's case type must be ABC, Abc or AbC.
    Utils.CaseType firstCase = Utils.GetCaseType(firstWord);
    if (firstCase != Utils.CaseType.ABC && firstCase != Utils.CaseType.Abc && firstCase != Utils.CaseType.AbC)
    {
        return false;
    }

    pos++;
    for (int termIdx = 1; termIdx < mWords.Count; termIdx++)
    {
        // Skip stop words in the input.
        for (; pos < words.Length; pos++)
        {
            if (!gazetteer.IsStopWord(words[pos].ToLower()))
            {
                break;
            }
        }
        if (pos == words.Length)
        {
            return false; // input exhausted before the term was complete
        }
        if (string.Compare(words[pos], mWords[termIdx], gazetteer.mIgnoreCase) != 0)
        {
            return false;
        }
        pos++;
    }

    len = pos - startIdx;
    return true;
}
/// <summary>
/// Finds every occurrence of the enabled terms of <paramref name="gazetteer"/> in this
/// instance's token sequence (<c>mTokens</c>).
/// </summary>
/// <remarks>
/// Every start position is tried for every enabled term. Between consecutive term tokens,
/// tokens of this instance that the gazetteer reports as stop words (tested in lower case)
/// are skipped. A hit is kept only if its inclusive span length
/// (<c>mSpanEnd</c> of the last token minus <c>mSpanStart</c> of the first, plus one) is at
/// least <c>term.mMinLen</c>.
/// </remarks>
/// <param name="gazetteer">Provides the terms to look for and the stop-word test.</param>
/// <param name="spans">Receives one pair per hit holding the indices into <c>mTokens</c> of the first and last matched token (token indices, not span offsets).</param>
public void Match(Gazetteer gazetteer, out ArrayList<Pair<int, int>> spans)
{
    spans = new ArrayList<Pair<int, int>>();
    foreach (GazetteerTerm term in gazetteer.mTerms)
    {
        if (!term.mEnabled)
        {
            continue;
        }
        int termLen = term.mTokens.Count;
        int lastStart = mTokens.Count - termLen;
        for (int first = 0; first <= lastStart; first++)
        {
            int last = first;
            bool matched = false;
            int k = 0;
            while (k < termLen && Match(term.mTokens[k], mTokens[last], term.mCaseMatchingType, /*firstToken=*/ k == 0))
            {
                if (k == termLen - 1)
                {
                    matched = true; // final term token matched
                    break;
                }
                // Advance to the next token that is not a stop word.
                last++;
                while (last < mTokens.Count && gazetteer.IsStopWord(mTokens[last].mTokenStr.ToLower()))
                {
                    last++;
                }
                if (last >= mTokens.Count)
                {
                    break; // ran out of tokens before the term was complete
                }
                k++;
            }
            if (!matched)
            {
                continue;
            }
            // Term found, starting at micro-token 'first' and ending at micro-token 'last'.
            // *** this counts all chars in the annotation (incl. spaces and non-token chars)
            int len = mTokens[last].mSpanEnd - mTokens[first].mSpanStart + 1;
            if (len >= term.mMinLen)
            {
                spans.Add(new Pair<int, int>(first, last));
            }
        }
    }
}
/// <summary>
/// Finds every occurrence of the enabled terms of <paramref name="gazetteer"/> in this
/// instance's token sequence (<c>mTokens</c>) and reports them as spans.
/// </summary>
/// <remarks>
/// Every start position is tried for every enabled term. Between consecutive term tokens,
/// tokens of this instance that the gazetteer reports as stop words (tested in lower case)
/// are skipped.
/// </remarks>
/// <param name="gazetteer">Provides the terms to look for and the stop-word test.</param>
/// <param name="spans">Receives one pair per hit: <c>mSpanStart</c> of the first matched token and <c>mSpanEnd</c> of the last matched token.</param>
public void Match(Gazetteer gazetteer, out ArrayList<Pair<int, int>> spans)
{
    spans = new ArrayList<Pair<int, int>>();
    foreach (GazetteerTerm term in gazetteer.mTerms)
    {
        if (term.mEnabled)
        {
            int termTokenCount = term.mTokens.Count;
            int finalStart = mTokens.Count - termTokenCount;
            for (int begin = 0; begin <= finalStart; begin++)
            {
                int cursor = begin;
                bool complete = false;
                for (int k = 0; k < termTokenCount; k++)
                {
                    bool isFirstToken = k == 0;
                    bool tokenMatches = Match(term.mTokens[k], mTokens[cursor], term.mCaseMatchingType, isFirstToken);
                    if (!tokenMatches)
                    {
                        break;
                    }
                    if (k == termTokenCount - 1)
                    {
                        complete = true; // last term token matched
                        break;
                    }
                    // Step to the next token, passing over stop words.
                    do
                    {
                        cursor++;
                    }
                    while (cursor < mTokens.Count && gazetteer.IsStopWord(mTokens[cursor].mTokenStr.ToLower()));
                    if (cursor >= mTokens.Count)
                    {
                        break; // no tokens left for the remaining term tokens
                    }
                }
                if (complete)
                {
                    // Term found, starting at micro-token 'begin' and ending at micro-token 'cursor'.
                    spans.Add(new Pair<int, int>(mTokens[begin].mSpanStart, mTokens[cursor].mSpanEnd));
                }
            }
        }
    }
}
/// <summary>
/// Fills <c>mTokens</c> from two parallel sequences: token strings and their POS constraints.
/// </summary>
/// <remarks>
/// Tokens are normalized with <c>Normalize</c>. A token whose lower-cased form is a stop word
/// according to <paramref name="gazetteer"/> is skipped. If any token was kept,
/// <c>PrepareTokens</c> is called afterwards. The sequences are read pairwise; reading ends
/// when either sequence runs out, so extra items in the longer one are not used.
/// </remarks>
/// <param name="tokens">Token strings, in order.</param>
/// <param name="posConstraints">POS constraints, positionally aligned with <paramref name="tokens"/>.</param>
/// <param name="lemmatize">If true, each kept token gets a lemma from <c>mLemmatizer.GetStem</c>; if false, the lemma stays null.</param>
/// <param name="caseMatchingType">Assigned to <c>mCaseMatchingType</c> and passed on to <c>PrepareTokens</c>.</param>
/// <param name="enabled">Assigned to <c>mEnabled</c>.</param>
/// <param name="gazetteer">Used for the stop-word check.</param>
private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
{
    mCaseMatchingType = caseMatchingType;
    mEnabled = enabled;
    // Enumerators are disposable and were previously never released; the using
    // statements guarantee disposal even if normalization or stemming throws.
    using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
    using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
    {
        while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
        {
            string tokenStr = Normalize(enumTokens.Current);
            string posConstraint = enumPosConstraints.Current;
            if (gazetteer.IsStopWord(tokenStr.ToLower()))
            {
                continue; // stop words never become part of the term
            }
            string lemma = null;
            if (lemmatize)
            {
                lemma = mLemmatizer.GetStem(tokenStr);
                // Use the token itself when the lemmatizer yields an empty stem.
                if (lemma == "") { lemma = tokenStr; }
            }
            GazetteerToken token = new GazetteerToken(tokenStr, posConstraint, lemma);
            mTokens.Add(token);
        }
    }
    if (mTokens.Count > 0)
    {
        PrepareTokens(caseMatchingType, lemmatize);
    }
}
/// <summary>
/// Checks whether this term's words (<c>mWords</c>) appear in <paramref name="words"/>
/// starting at <paramref name="startIdx"/>.
/// </summary>
/// <remarks>
/// The word at <paramref name="startIdx"/> is compared with the first term word without any
/// skipping. For the remaining term words, input words that <paramref name="gazetteer"/>
/// reports as stop words (tested in lower case) are passed over first. All comparisons
/// honor <c>gazetteer.mIgnoreCase</c>.
/// </remarks>
/// <param name="words">Input words.</param>
/// <param name="startIdx">Position in <paramref name="words"/> at which the term must begin.</param>
/// <param name="len">Set to the number of input words covered by the match (skipped stop words included), or 0 when there is no match.</param>
/// <param name="gazetteer">Source of the ignore-case flag and the stop-word test.</param>
/// <returns>True if the whole term matched; false otherwise.</returns>
public bool Match(string[] words, int startIdx, out int len, Gazetteer gazetteer)
{
    int cursor = startIdx;
    len = 0;

    // First word must match.
    if (string.Compare(words[cursor], mWords[0], gazetteer.mIgnoreCase) != 0)
    {
        return false;
    }

    // *** only for the demo: accept the first word only if its case type is ABC, Abc or AbC.
    Utils.CaseType caseOfFirst = Utils.GetCaseType(words[cursor]);
    bool acceptedCase = caseOfFirst == Utils.CaseType.ABC
        || caseOfFirst == Utils.CaseType.Abc
        || caseOfFirst == Utils.CaseType.AbC;
    if (!acceptedCase)
    {
        return false;
    }
    cursor++;

    int termWord = 1;
    while (termWord < mWords.Count)
    {
        // Skip stop words.
        while (cursor < words.Length && gazetteer.IsStopWord(words[cursor].ToLower()))
        {
            cursor++;
        }
        if (cursor == words.Length)
        {
            return false; // no input left for the remaining term words
        }
        string candidate = words[cursor];
        cursor++;
        if (string.Compare(candidate, mWords[termWord], gazetteer.mIgnoreCase) != 0)
        {
            return false;
        }
        termWord++;
    }

    len = cursor - startIdx;
    return true;
}