예제 #1
0
            /// <summary>
            /// Fills this instance from two parallel sequences: the token strings and their POS constraints.
            /// </summary>
            /// <remarks>
            /// The two sequences are consumed in lockstep and iteration stops as soon as either one is
            /// exhausted, so surplus items in the longer sequence are ignored. Each token is passed through
            /// <c>Normalize</c>; tokens that the gazetteer reports as stop words are skipped.
            /// </remarks>
            /// <param name="tokens">Token strings to add.</param>
            /// <param name="posConstraints">POS constraint for the token at the same position in <paramref name="tokens"/>.</param>
            /// <param name="lemmatize">When true, a lemma is computed for every kept token via <c>mLemmatizer.GetStem</c>; otherwise the lemma is left null.</param>
            /// <param name="caseMatchingType">Stored in <c>mCaseMatchingType</c> and forwarded to <c>PrepareTokens</c>.</param>
            /// <param name="enabled">Stored in <c>mEnabled</c>.</param>
            /// <param name="gazetteer">Gazetteer queried for stop words.</param>
            private void InitializeInstance(IEnumerable <string> tokens, IEnumerable <string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
            {
                mCaseMatchingType = caseMatchingType;
                mEnabled          = enabled;

                // IEnumerator<T> is IDisposable; the using statements guarantee that both enumerators are
                // released even if normalization, stemming or the stop-word lookup throws.
                using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
                using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
                {
                    while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
                    {
                        string tokenStr      = Normalize(enumTokens.Current);
                        string posConstraint = enumPosConstraints.Current;
                        // The stop-word lookup is done on the lowercased form (ToLower() uses the current culture).
                        if (gazetteer.IsStopWord(tokenStr.ToLower()))
                        {
                            continue;
                        }
                        string lemma = null;
                        if (lemmatize)
                        {
                            lemma = mLemmatizer.GetStem(tokenStr);
                            if (lemma == "")
                            {
                                // The stemmer produced nothing; fall back to the surface form.
                                lemma = tokenStr;
                            }
                        }
                        mTokens.Add(new GazetteerToken(tokenStr, posConstraint, lemma));
                    }
                }
                if (mTokens.Count > 0)
                {
                    PrepareTokens(caseMatchingType, lemmatize);
                }
            }
예제 #2
0
            /// <summary>
            /// Decides whether a single gazetteer token matches a single document token.
            /// </summary>
            /// <remarks>
            /// The POS constraint (if the gazetteer token has one) is checked first. After that either the
            /// surface strings or the lemmas are compared: lemmas are used when the gazetteer token carries
            /// a lemma, surface strings otherwise.
            /// </remarks>
            /// <param name="gazToken">Gazetteer token to match against.</param>
            /// <param name="docToken">Document token being tested.</param>
            /// <param name="caseMatchingType">Selects how letter case is treated in the comparison.</param>
            /// <param name="firstToken">True when this is the first token of the term; only consulted for <c>InitCapLoose</c>.</param>
            /// <returns>True if the tokens match under the given case-matching mode; otherwise false.</returns>
            /// <exception cref="ArgumentValueException">Thrown when <paramref name="caseMatchingType"/> is not a handled value.</exception>
            private bool Match(GazetteerToken gazToken, Token docToken, CaseMatchingType caseMatchingType, bool firstToken)
            {
                // check POS tag; POS tags are identifiers, so the prefix test is ordinal rather than culture-sensitive
                if (gazToken.mPosConstraint != null && !docToken.mPosTag.StartsWith(gazToken.mPosConstraint, StringComparison.Ordinal))
                {
                    return(false);
                }
                // check word or lemma
                string gazTokenStr;
                string docTokenStr;

                if (gazToken.mLemma == null)
                {
                    gazTokenStr = gazToken.mTokenStr;
                    docTokenStr = docToken.mTokenStr;
                }
                else
                {
                    gazTokenStr = gazToken.mLemma;
                    docTokenStr = docToken.mLemma;
                }
                // A null or empty document string has no initial character; treat it as "not capitalized"
                // instead of letting the [0] indexer throw.
                bool docStartsUpper = !string.IsNullOrEmpty(docTokenStr) && char.IsUpper(docTokenStr[0]);

                switch (caseMatchingType)
                {
                case CaseMatchingType.IgnoreCase:
                    return(string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase));

                // NOTE(review): these modes compare exactly, presumably because PrepareTokens has already
                // brought the gazetteer token into the required case -- confirm against PrepareTokens.
                case CaseMatchingType.ExactMatch:
                case CaseMatchingType.AllLowercase:
                case CaseMatchingType.AllUppercase:
                case CaseMatchingType.AllCapsStrict:
                case CaseMatchingType.InitCapStrict:
                    return(gazTokenStr == docTokenStr);

                case CaseMatchingType.InitCapLoose:
                    // Only the first token of the term must start with an uppercase character.
                    return((!firstToken || docStartsUpper) &&
                           string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase));

                case CaseMatchingType.AllCapsLoose:
                    // Every token must start with an uppercase character.
                    return(docStartsUpper && string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase));

                default:
                    throw new ArgumentValueException("caseMatchingType");
                }
            }
예제 #3
0
 /// <summary>
 /// Tests one document token against one gazetteer token.
 /// </summary>
 /// <remarks>
 /// A gazetteer token with a POS constraint only matches document tokens whose POS tag starts with
 /// that constraint. The textual comparison uses lemmas when the gazetteer token has a lemma and the
 /// surface strings otherwise.
 /// </remarks>
 /// <param name="gazToken">Gazetteer token supplying the expected string and the optional POS constraint.</param>
 /// <param name="docToken">Document token supplying the observed string and POS tag.</param>
 /// <param name="caseMatchingType">Case-handling mode for the string comparison.</param>
 /// <param name="firstToken">Whether this is the first token of the term; used by <c>InitCapLoose</c> only.</param>
 /// <returns>True when the document token satisfies the POS constraint and the string comparison.</returns>
 /// <exception cref="ArgumentValueException">Thrown for a <paramref name="caseMatchingType"/> value with no case label.</exception>
 private bool Match(GazetteerToken gazToken, Token docToken, CaseMatchingType caseMatchingType, bool firstToken)
 {
     // check POS tag (ordinal prefix test: tags are symbols, not natural-language text)
     if (gazToken.mPosConstraint != null && !docToken.mPosTag.StartsWith(gazToken.mPosConstraint, StringComparison.Ordinal)) { return false; }
     // check word or lemma
     bool useLemma = gazToken.mLemma != null;
     string gazTokenStr = useLemma ? gazToken.mLemma : gazToken.mTokenStr;
     string docTokenStr = useLemma ? docToken.mLemma : docToken.mTokenStr;
     // Guard the [0] access: a null or empty document string cannot start with an uppercase letter.
     bool docStartsUpper = !string.IsNullOrEmpty(docTokenStr) && char.IsUpper(docTokenStr[0]);
     switch (caseMatchingType)
     {
         case CaseMatchingType.IgnoreCase:
             return string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
         // NOTE(review): exact comparison for these modes presumably relies on the gazetteer token
         // having been case-normalized beforehand -- verify against PrepareTokens.
         case CaseMatchingType.ExactMatch:
         case CaseMatchingType.AllLowercase:
         case CaseMatchingType.AllUppercase:
         case CaseMatchingType.AllCapsStrict:
         case CaseMatchingType.InitCapStrict:
             return gazTokenStr == docTokenStr;
         case CaseMatchingType.InitCapLoose:
             // The capitalized-initial requirement applies to the first token only.
             return (!firstToken || docStartsUpper)
                 && string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
         case CaseMatchingType.AllCapsLoose:
             // The capitalized-initial requirement applies to every token.
             return docStartsUpper && string.Equals(gazTokenStr, docTokenStr, StringComparison.OrdinalIgnoreCase);
         default:
             throw new ArgumentValueException("caseMatchingType");
     }
 }
예제 #4
0
 /// <summary>
 /// Initializes the term's state and token list from parallel token and POS-constraint sequences.
 /// </summary>
 /// <remarks>
 /// Tokens and constraints are paired by position; pairing ends when the shorter sequence runs out.
 /// Every token is normalized with <c>Normalize</c>, and tokens for which
 /// <c>gazetteer.IsStopWord</c> returns true are not added.
 /// </remarks>
 /// <param name="tokens">Token strings of the term.</param>
 /// <param name="posConstraints">POS constraints, positionally aligned with <paramref name="tokens"/>.</param>
 /// <param name="lemmatize">If true, each kept token gets a lemma from <c>mLemmatizer.GetStem</c>; if false, the lemma stays null.</param>
 /// <param name="caseMatchingType">Assigned to <c>mCaseMatchingType</c> and passed on to <c>PrepareTokens</c>.</param>
 /// <param name="enabled">Assigned to <c>mEnabled</c>.</param>
 /// <param name="gazetteer">Source of the stop-word check.</param>
 private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
 {
     mCaseMatchingType = caseMatchingType;
     mEnabled = enabled;
     // Enumerators implement IDisposable; dispose them deterministically, including on exceptions.
     using (IEnumerator<string> tokenCursor = tokens.GetEnumerator())
     using (IEnumerator<string> posCursor = posConstraints.GetEnumerator())
     {
         while (tokenCursor.MoveNext() && posCursor.MoveNext())
         {
             string tokenStr = Normalize(tokenCursor.Current);
             string posConstraint = posCursor.Current;
             // Stop words are looked up in lowercase (ToLower() is current-culture based).
             if (gazetteer.IsStopWord(tokenStr.ToLower())) { continue; }
             string lemma = null;
             if (lemmatize)
             {
                 lemma = mLemmatizer.GetStem(tokenStr);
                 // An empty stem means the stemmer produced nothing; keep the surface form instead.
                 if (lemma == "") { lemma = tokenStr; }
             }
             mTokens.Add(new GazetteerToken(tokenStr, posConstraint, lemma));
         }
     }
     if (mTokens.Count > 0)
     {
         PrepareTokens(caseMatchingType, lemmatize);
     }
 }