/// <summary>
/// Stores the case-matching mode and the enabled flag, then fills <c>mTokens</c> from two
/// parallel sequences: token strings and their POS constraints.
/// </summary>
/// <param name="tokens">Token strings; each one is passed through <c>Normalize</c> before use.</param>
/// <param name="posConstraints">POS constraints, paired position-by-position with <paramref name="tokens"/>.</param>
/// <param name="lemmatize">When true, a lemma is computed for every kept token via <c>mLemmatizer.GetStem</c>.</param>
/// <param name="caseMatchingType">Case-matching mode; stored and forwarded to <c>PrepareTokens</c>.</param>
/// <param name="enabled">Value stored in <c>mEnabled</c>.</param>
/// <param name="gazetteer">Supplies the stop-word test (<c>IsStopWord</c>).</param>
private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
{
    mCaseMatchingType = caseMatchingType;
    mEnabled = enabled;
    // IEnumerator<T> is IDisposable. Since the sequences are walked by hand instead of with
    // foreach, the enumerators must be disposed explicitly.
    using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
    using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
    {
        // Pairing stops as soon as either sequence runs out; surplus items in the longer
        // sequence are ignored.
        while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
        {
            string tokenStr = Normalize(enumTokens.Current);
            string posConstraint = enumPosConstraints.Current;
            // NOTE(review): ToLower() lowercases with the current culture; confirm that the
            // gazetteer's stop-word list was lowercased the same way.
            if (gazetteer.IsStopWord(tokenStr.ToLower()))
            {
                // Stop words are dropped together with their POS constraint.
                continue;
            }
            // A null lemma means "no lemma was computed for this token".
            string lemma = null;
            if (lemmatize)
            {
                lemma = mLemmatizer.GetStem(tokenStr);
                if (lemma == "")
                {
                    // Empty stem: fall back to the normalized token itself.
                    lemma = tokenStr;
                }
            }
            mTokens.Add(new GazetteerToken(tokenStr, posConstraint, lemma));
        }
    }
    if (mTokens.Count > 0)
    {
        PrepareTokens(caseMatchingType, lemmatize);
    }
}
/// <summary>
/// Tests whether a document token satisfies a gazetteer token: first the optional POS
/// constraint, then the word (or lemma) under the given case-matching mode.
/// </summary>
/// <param name="gazToken">Gazetteer token; its <c>mLemma</c> being non-null selects lemma comparison.</param>
/// <param name="docToken">Document token to test.</param>
/// <param name="caseMatchingType">Case-matching mode that selects the comparison rule.</param>
/// <param name="firstToken">True when this is the first token of the term; only <c>InitCapLoose</c> uses it.</param>
/// <returns>True if the document token matches; otherwise false.</returns>
/// <exception cref="ArgumentValueException">Thrown when <paramref name="caseMatchingType"/> is not a handled value.</exception>
private bool Match(GazetteerToken gazToken, Token docToken, CaseMatchingType caseMatchingType, bool firstToken)
{
    // check POS tag
    // POS tags are identifiers, so the prefix test is ordinal rather than culture-sensitive.
    if (gazToken.mPosConstraint != null && !docToken.mPosTag.StartsWith(gazToken.mPosConstraint, StringComparison.Ordinal))
    {
        return false;
    }
    // check word or lemma
    string gazTokenStr;
    string docTokenStr;
    if (gazToken.mLemma == null)
    {
        gazTokenStr = gazToken.mTokenStr;
        docTokenStr = docToken.mTokenStr;
    }
    else
    {
        gazTokenStr = gazToken.mLemma;
        docTokenStr = docToken.mLemma;
    }
    // Guarded so that an empty or null document string cannot fault on docTokenStr[0];
    // such a string has no initial capital.
    bool docStartsUpper = !string.IsNullOrEmpty(docTokenStr) && char.IsUpper(docTokenStr[0]);
    switch (caseMatchingType)
    {
        case CaseMatchingType.IgnoreCase:
            return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
        case CaseMatchingType.ExactMatch:
        case CaseMatchingType.AllLowercase:
        case CaseMatchingType.AllUppercase:
        case CaseMatchingType.AllCapsStrict:
        case CaseMatchingType.InitCapStrict:
            // All of these modes compare exactly.
            // NOTE(review): presumably the gazetteer strings were already case-adjusted
            // for the mode (e.g. in PrepareTokens) - confirm.
            return gazTokenStr == docTokenStr;
        case CaseMatchingType.InitCapLoose:
            // Only the first token of the term must start with an upper-case letter.
            if (firstToken && !docStartsUpper)
            {
                return false;
            }
            return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
        case CaseMatchingType.AllCapsLoose:
            // Every token must start with an upper-case letter.
            return docStartsUpper && string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
        default:
            throw new ArgumentValueException("caseMatchingType");
    }
}
/// <summary>
/// Tests whether a document token satisfies a gazetteer token: first the optional POS
/// constraint, then the word (or lemma) under the given case-matching mode.
/// </summary>
/// <param name="gazToken">Gazetteer token; its <c>mLemma</c> being non-null selects lemma comparison.</param>
/// <param name="docToken">Document token to test.</param>
/// <param name="caseMatchingType">Case-matching mode that selects the comparison rule.</param>
/// <param name="firstToken">True when this is the first token of the term; only <c>InitCapLoose</c> uses it.</param>
/// <returns>True if the document token matches; otherwise false.</returns>
/// <exception cref="ArgumentValueException">Thrown when <paramref name="caseMatchingType"/> is not a handled value.</exception>
private bool Match(GazetteerToken gazToken, Token docToken, CaseMatchingType caseMatchingType, bool firstToken)
{
    // check POS tag
    // POS tags are identifiers, so the prefix test is ordinal rather than culture-sensitive.
    if (gazToken.mPosConstraint != null && !docToken.mPosTag.StartsWith(gazToken.mPosConstraint, StringComparison.Ordinal))
    {
        return false;
    }
    // check word or lemma
    string gazTokenStr;
    string docTokenStr;
    if (gazToken.mLemma == null)
    {
        gazTokenStr = gazToken.mTokenStr;
        docTokenStr = docToken.mTokenStr;
    }
    else
    {
        gazTokenStr = gazToken.mLemma;
        docTokenStr = docToken.mLemma;
    }
    // Guarded so that an empty or null document string cannot fault on docTokenStr[0];
    // such a string has no initial capital.
    bool docStartsUpper = !string.IsNullOrEmpty(docTokenStr) && char.IsUpper(docTokenStr[0]);
    switch (caseMatchingType)
    {
        case CaseMatchingType.IgnoreCase:
            return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
        case CaseMatchingType.ExactMatch:
        case CaseMatchingType.AllLowercase:
        case CaseMatchingType.AllUppercase:
        case CaseMatchingType.AllCapsStrict:
        case CaseMatchingType.InitCapStrict:
            // All of these modes compare exactly.
            // NOTE(review): presumably the gazetteer strings were already case-adjusted
            // for the mode (e.g. in PrepareTokens) - confirm.
            return gazTokenStr == docTokenStr;
        case CaseMatchingType.InitCapLoose:
            // Only the first token of the term must start with an upper-case letter.
            if (firstToken && !docStartsUpper)
            {
                return false;
            }
            return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
        case CaseMatchingType.AllCapsLoose:
            // Every token must start with an upper-case letter.
            return docStartsUpper && string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
        default:
            throw new ArgumentValueException("caseMatchingType");
    }
}
/// <summary>
/// Stores the case-matching mode and the enabled flag, then fills <c>mTokens</c> from two
/// parallel sequences: token strings and their POS constraints.
/// </summary>
/// <param name="tokens">Token strings; each one is passed through <c>Normalize</c> before use.</param>
/// <param name="posConstraints">POS constraints, paired position-by-position with <paramref name="tokens"/>.</param>
/// <param name="lemmatize">When true, a lemma is computed for every kept token via <c>mLemmatizer.GetStem</c>.</param>
/// <param name="caseMatchingType">Case-matching mode; stored and forwarded to <c>PrepareTokens</c>.</param>
/// <param name="enabled">Value stored in <c>mEnabled</c>.</param>
/// <param name="gazetteer">Supplies the stop-word test (<c>IsStopWord</c>).</param>
private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
{
    mCaseMatchingType = caseMatchingType;
    mEnabled = enabled;
    // IEnumerator<T> is IDisposable. Since the sequences are walked by hand instead of with
    // foreach, the enumerators must be disposed explicitly.
    using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
    using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
    {
        // Pairing stops as soon as either sequence runs out; surplus items in the longer
        // sequence are ignored.
        while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
        {
            string tokenStr = Normalize(enumTokens.Current);
            string posConstraint = enumPosConstraints.Current;
            // NOTE(review): ToLower() lowercases with the current culture; confirm that the
            // gazetteer's stop-word list was lowercased the same way.
            if (gazetteer.IsStopWord(tokenStr.ToLower()))
            {
                // Stop words are dropped together with their POS constraint.
                continue;
            }
            // A null lemma means "no lemma was computed for this token".
            string lemma = null;
            if (lemmatize)
            {
                lemma = mLemmatizer.GetStem(tokenStr);
                if (lemma == "")
                {
                    // Empty stem: fall back to the normalized token itself.
                    lemma = tokenStr;
                }
            }
            mTokens.Add(new GazetteerToken(tokenStr, posConstraint, lemma));
        }
    }
    if (mTokens.Count > 0)
    {
        PrepareTokens(caseMatchingType, lemmatize);
    }
}