Example #1
0
            /// <summary>
            /// Sets the matching options of this instance and fills mTokens from two parallel sequences:
            /// token strings and their part-of-speech constraints.
            /// </summary>
            /// <remarks>
            /// The two sequences are walked in lock-step; iteration stops as soon as either one is exhausted.
            /// Each token string is passed through Normalize. Tokens whose lower-cased form is reported as a
            /// stop word by the gazetteer are not added to mTokens.
            /// </remarks>
            /// <param name="tokens">Token strings of the term.</param>
            /// <param name="posConstraints">POS constraints, one per token string, in the same order.</param>
            /// <param name="lemmatize">If true, a lemma is obtained from mLemmatizer.GetStem for every kept token; otherwise the lemma is null.</param>
            /// <param name="caseMatchingType">Stored in mCaseMatchingType and forwarded to PrepareTokens.</param>
            /// <param name="enabled">Stored in mEnabled.</param>
            /// <param name="gazetteer">Gazetteer used for the stop-word test.</param>
            private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
            {
                mCaseMatchingType = caseMatchingType;
                mEnabled          = enabled;
                // The enumerators are advanced manually (foreach cannot walk two sequences in lock-step),
                // so they have to be disposed explicitly; the using statements guarantee that even if an
                // exception is thrown while processing a token.
                using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
                using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
                {
                    while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
                    {
                        string tokenStr      = Normalize(enumTokens.Current);
                        string posConstraint = enumPosConstraints.Current;
                        if (!gazetteer.IsStopWord(tokenStr.ToLower()))
                        {
                            string lemma = null;
                            if (lemmatize)
                            {
                                lemma = mLemmatizer.GetStem(tokenStr);
                                if (lemma == "") // the lemmatizer returned an empty string: fall back to the token itself
                                {
                                    lemma = tokenStr;
                                }
                            }
                            GazetteerToken token = new GazetteerToken(tokenStr, posConstraint, lemma);
                            mTokens.Add(token);
                        }
                    }
                }
                // PrepareTokens is only invoked when mTokens holds at least one token.
                if (mTokens.Count > 0)
                {
                    PrepareTokens(caseMatchingType, lemmatize);
                }
            }
Example #2
0
            /// <summary>
            /// Tries to match the word sequence held in mWords against <paramref name="words"/>,
            /// starting at <paramref name="startIdx"/>.
            /// </summary>
            /// <param name="words">Input words to match against.</param>
            /// <param name="startIdx">Index in <paramref name="words"/> at which the first entry of mWords must occur.</param>
            /// <param name="len">On success, the number of input words spanned by the match (skipped stop words included); 0 on failure.</param>
            /// <param name="gazetteer">Provides the mIgnoreCase setting passed to string.Compare and the stop-word test.</param>
            /// <returns>true if all entries of mWords were matched in order; otherwise false.</returns>
            public bool Match(string[] words, int startIdx, out int len, Gazetteer gazetteer)
            {
                len = 0;

                // The first word has to match exactly at startIdx; no stop-word skipping happens before it.
                string head = words[startIdx];
                bool headMatches = string.Compare(head, mWords[0], gazetteer.mIgnoreCase) == 0;
                if (!headMatches)
                {
                    return false;
                }

                // *** only for the demo: the first word must have case type ABC, Abc or AbC
                Utils.CaseType headCase = Utils.GetCaseType(head);
                if (headCase != Utils.CaseType.ABC && headCase != Utils.CaseType.Abc && headCase != Utils.CaseType.AbC)
                {
                    return false;
                }

                int cursor = startIdx + 1; // next input word to examine
                int wordNo = 1;            // next entry of mWords to match
                while (wordNo < mWords.Count)
                {
                    // Stop words in the input are skipped between the matched words.
                    while (cursor < words.Length && gazetteer.IsStopWord(words[cursor].ToLower()))
                    {
                        cursor++;
                    }
                    // Input exhausted before all of mWords was matched.
                    if (cursor == words.Length)
                    {
                        return false;
                    }
                    if (string.Compare(words[cursor], mWords[wordNo], gazetteer.mIgnoreCase) != 0)
                    {
                        return false;
                    }
                    cursor++;
                    wordNo++;
                }

                len = cursor - startIdx;
                return true;
            }
Example #3
0
 /// <summary>
 /// Scans mTokens for occurrences of every enabled term of <paramref name="gazetteer"/> and
 /// collects the matches that satisfy the term's minimum length.
 /// </summary>
 /// <param name="gazetteer">Source of the terms (mTerms) and of the stop-word test.</param>
 /// <param name="spans">
 /// Receives one pair per accepted match. The pair holds indices into mTokens
 /// (first matched token, last matched token), not character offsets.
 /// </param>
 public void Match(Gazetteer gazetteer, out ArrayList<Pair<int, int>> spans)
 {
     spans = new ArrayList<Pair<int, int>>();
     foreach (GazetteerTerm term in gazetteer.mTerms)
     {
         if (term.mEnabled)
         {
             // Last start position at which the term could still fit if no stop words intervene.
             int maxStart = mTokens.Count - term.mTokens.Count;
             for (int first = 0; first <= maxStart; first++)
             {
                 int  last    = first;
                 int  k       = 0;
                 bool matched = false;
                 while (k < term.mTokens.Count && Match(term.mTokens[k], mTokens[last], term.mCaseMatchingType, /*firstToken=*/ k == 0))
                 {
                     if (k == term.mTokens.Count - 1)
                     {
                         // The final term token matched: the whole term was found.
                         matched = true;
                         break;
                     }
                     // Move to the next token, stepping over any stop words in between.
                     do
                     {
                         last++;
                     }
                     while (last < mTokens.Count && gazetteer.IsStopWord(mTokens[last].mTokenStr.ToLower()));
                     if (last >= mTokens.Count)
                     {
                         break; // ran out of tokens before the term was complete
                     }
                     k++;
                 }
                 if (!matched)
                 {
                     continue;
                 }
                 // Term found, starting at micro-token 'first' and ending at micro-token 'last'.
                 // *** The length counts all chars in the annotation (incl. spaces and non-token chars).
                 int charLen = mTokens[last].mSpanEnd - mTokens[first].mSpanStart + 1;
                 if (charLen >= term.mMinLen)
                 {
                     spans.Add(new Pair<int, int>(first, last));
                 }
             }
         }
     }
 }
Example #4
0
 /// <summary>
 /// Scans mTokens for occurrences of every enabled term of <paramref name="gazetteer"/>.
 /// </summary>
 /// <param name="gazetteer">Source of the terms (mTerms) and of the stop-word test.</param>
 /// <param name="spans">
 /// Receives one pair per match: the mSpanStart of the first matched token and the
 /// mSpanEnd of the last matched token.
 /// </param>
 public void Match(Gazetteer gazetteer, out ArrayList<Pair<int, int>> spans)
 {
     spans = new ArrayList<Pair<int, int>>();
     foreach (GazetteerTerm term in gazetteer.mTerms)
     {
         if (term.mEnabled)
         {
             // Last start position at which the term could still fit if no stop words intervene.
             int maxStart = mTokens.Count - term.mTokens.Count;
             for (int first = 0; first <= maxStart; first++)
             {
                 int last = first;
                 int k = 0;
                 bool matched = false;
                 while (k < term.mTokens.Count && Match(term.mTokens[k], mTokens[last], term.mCaseMatchingType, /*firstToken=*/k == 0))
                 {
                     if (k == term.mTokens.Count - 1)
                     {
                         // The final term token matched: the whole term was found.
                         matched = true;
                         break;
                     }
                     // Move to the next token, stepping over any stop words in between.
                     do
                     {
                         last++;
                     }
                     while (last < mTokens.Count && gazetteer.IsStopWord(mTokens[last].mTokenStr.ToLower()));
                     if (last >= mTokens.Count)
                     {
                         break; // ran out of tokens before the term was complete
                     }
                     k++;
                 }
                 if (matched)
                 {
                     // Term found, starting at micro-token 'first' and ending at micro-token 'last'.
                     int spanStart = mTokens[first].mSpanStart;
                     int spanEnd = mTokens[last].mSpanEnd;
                     spans.Add(new Pair<int, int>(spanStart, spanEnd));
                 }
             }
         }
     }
 }
Example #5
0
 /// <summary>
 /// Sets the matching options of this instance and fills mTokens from two parallel sequences:
 /// token strings and their part-of-speech constraints.
 /// </summary>
 /// <remarks>
 /// The two sequences are walked in lock-step; iteration stops as soon as either one is exhausted.
 /// Each token string is passed through Normalize. Tokens whose lower-cased form is reported as a
 /// stop word by the gazetteer are not added to mTokens.
 /// </remarks>
 /// <param name="tokens">Token strings of the term.</param>
 /// <param name="posConstraints">POS constraints, one per token string, in the same order.</param>
 /// <param name="lemmatize">If true, a lemma is obtained from mLemmatizer.GetStem for every kept token; otherwise the lemma is null.</param>
 /// <param name="caseMatchingType">Stored in mCaseMatchingType and forwarded to PrepareTokens.</param>
 /// <param name="enabled">Stored in mEnabled.</param>
 /// <param name="gazetteer">Gazetteer used for the stop-word test.</param>
 private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
 {
     mCaseMatchingType = caseMatchingType;
     mEnabled = enabled;
     // The enumerators are advanced manually (foreach cannot walk two sequences in lock-step),
     // so they have to be disposed explicitly; the using statements guarantee that even if an
     // exception is thrown while processing a token.
     using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
     using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
     {
         while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
         {
             string tokenStr = Normalize(enumTokens.Current);
             string posConstraint = enumPosConstraints.Current;
             if (!gazetteer.IsStopWord(tokenStr.ToLower()))
             {
                 string lemma = null;
                 if (lemmatize)
                 {
                     lemma = mLemmatizer.GetStem(tokenStr);
                     if (lemma == "") { lemma = tokenStr; } // empty stem: fall back to the token itself
                 }
                 GazetteerToken token = new GazetteerToken(tokenStr, posConstraint, lemma);
                 mTokens.Add(token);
             }
         }
     }
     // PrepareTokens is only invoked when mTokens holds at least one token.
     if (mTokens.Count > 0)
     {
         PrepareTokens(caseMatchingType, lemmatize);
     }
 }
Example #6
0
 /// <summary>
 /// Tries to match the word sequence held in mWords against <paramref name="words"/>,
 /// starting at <paramref name="startIdx"/>.
 /// </summary>
 /// <param name="words">Input words to match against.</param>
 /// <param name="startIdx">Index in <paramref name="words"/> at which the first entry of mWords must occur.</param>
 /// <param name="len">On success, the number of input words spanned by the match (skipped stop words included); 0 on failure.</param>
 /// <param name="gazetteer">Provides the mIgnoreCase setting passed to string.Compare and the stop-word test.</param>
 /// <returns>true if all entries of mWords were matched in order; otherwise false.</returns>
 public bool Match(string[] words, int startIdx, out int len, Gazetteer gazetteer)
 {
     len = 0;
     string firstWord = words[startIdx];

     // first word must match
     if (string.Compare(firstWord, mWords[0], gazetteer.mIgnoreCase) != 0)
     {
         return false;
     }

     // *** only for the demo: restrict the case type of the first word
     Utils.CaseType firstCase = Utils.GetCaseType(firstWord);
     bool allowedCase = firstCase == Utils.CaseType.ABC
         || firstCase == Utils.CaseType.Abc
         || firstCase == Utils.CaseType.AbC;
     if (!allowedCase)
     {
         return false;
     }

     int pos = startIdx + 1;
     // 'w' walks mWords, 'pos' walks the input; both advance after each successful comparison.
     for (int w = 1; w < mWords.Count; w++, pos++)
     {
         // skip stop words
         while (pos < words.Length && gazetteer.IsStopWord(words[pos].ToLower()))
         {
             pos++;
         }
         // input exhausted before all of mWords was matched
         if (pos == words.Length)
         {
             return false;
         }
         bool sameWord = string.Compare(words[pos], mWords[w], gazetteer.mIgnoreCase) == 0;
         if (!sameWord)
         {
             return false;
         }
     }

     len = pos - startIdx;
     return true;
 }