Example #1
0
 /// <summary>
 /// Finds every occurrence of the enabled terms of <paramref name="gazetteer"/> in this
 /// instance's token sequence (mTokens) and reports the character span of each occurrence.
 /// </summary>
 /// <param name="gazetteer">Gazetteer whose mTerms are matched and whose stop-word list is consulted.</param>
 /// <param name="spans">Receives one (mSpanStart of first matched token, mSpanEnd of last matched token) pair per occurrence.</param>
 public void Match(Gazetteer gazetteer, out ArrayList<Pair<int, int>> spans)
 {
     spans = new ArrayList<Pair<int, int>>();
     foreach (GazetteerTerm term in gazetteer.mTerms)
     {
         if (term.mEnabled)
         {
             int finalTermIdx = term.mTokens.Count - 1;
             // last start position at which the term could still fit
             int maxStart = mTokens.Count - term.mTokens.Count;
             for (int start = 0; start <= maxStart; start++)
             {
                 int end = start;      // index of the micro-token currently compared
                 bool matched = false;
                 int termIdx = 0;
                 while (termIdx <= finalTermIdx
                     && Match(term.mTokens[termIdx], mTokens[end], term.mCaseMatchingType, /*firstToken=*/termIdx == 0))
                 {
                     if (termIdx == finalTermIdx)
                     {
                         // the last term token matched: the whole term has been found
                         matched = true;
                         break;
                     }
                     // advance to the next micro-token, skipping stop words in between
                     end++;
                     while (end < mTokens.Count && gazetteer.IsStopWord(mTokens[end].mTokenStr.ToLower())) { end++; }
                     if (end >= mTokens.Count) { break; } // ran out of text before the term was complete
                     termIdx++;
                 }
                 if (matched) // term spans micro-tokens start..end (inclusive)
                 {
                     spans.Add(new Pair<int, int>(mTokens[start].mSpanStart, mTokens[end].mSpanEnd));
                 }
             }
         }
     }
 }
Example #2
0
 //public GazetteerTerm(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
 //{
 //    InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
 //}
 /// <summary>
 /// Builds a term from its textual definition. Settings embedded in the definition (the
 /// parts matched by mConstraintRegex) are parsed and stripped; the remaining text is split
 /// into micro-tokens by mGazetteerMicroTokenRegex. A micro-token of the form "token/POS"
 /// (exactly one slash) contributes a token and a POS constraint; any other micro-token is
 /// taken verbatim with a null POS constraint.
 /// </summary>
 /// <param name="termDef">Raw term definition.</param>
 /// <param name="gazetteer">Owning gazetteer, forwarded to InitializeInstance.</param>
 /// <param name="defaultCaseMatchingType">Case matching type used unless the definition overrides it.</param>
 /// <param name="defaultLemmatizeFlag">Lemmatization flag used unless the definition overrides it.</param>
 /// <param name="defaultEnabledFlag">Enabled flag used unless the definition overrides it.</param>
 public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defaultCaseMatchingType, bool defaultLemmatizeFlag, bool defaultEnabledFlag)
 {
     // start from the defaults; the settings parser may overwrite any of them
     CaseMatchingType effectiveCaseMatching = defaultCaseMatchingType;
     bool effectiveLemmatize = defaultLemmatizeFlag;
     bool effectiveEnabled = defaultEnabledFlag;
     // each settings fragment is parsed and then removed from the definition
     MatchEvaluator stripSettings = delegate(Match settingsMatch)
     {
         ParseGazetteerSettings(settingsMatch.Value, ref effectiveCaseMatching, ref effectiveLemmatize, ref effectiveEnabled);
         return "";
     };
     string strippedDef = mConstraintRegex.Replace(termDef, stripSettings);
     // split what is left into micro-tokens with optional POS constraints
     ArrayList<string> termTokens = new ArrayList<string>();
     ArrayList<string> termPosConstraints = new ArrayList<string>();
     for (Match microToken = mGazetteerMicroTokenRegex.Match(strippedDef); microToken.Success; microToken = microToken.NextMatch())
     {
         string[] parts = microToken.Value.Split('/');
         bool hasPosConstraint = parts.Length == 2;
         termTokens.Add(hasPosConstraint ? parts[0] : microToken.Value);
         termPosConstraints.Add(hasPosConstraint ? parts[1] : null);
     }
     InitializeInstance(termTokens, termPosConstraints, effectiveLemmatize, effectiveCaseMatching, effectiveEnabled, gazetteer);
 }
Example #3
0
 /// <summary>
 /// Stores the term settings and fills mTokens from the given token / POS-constraint pairs.
 /// The two sequences are consumed in lock-step; iteration stops as soon as either one is
 /// exhausted. Tokens whose lower-cased normalized form is a stop word of
 /// <paramref name="gazetteer"/> are dropped.
 /// </summary>
 /// <param name="tokens">Token strings of the term (normalized here via Normalize).</param>
 /// <param name="posConstraints">POS constraint for each token; entries may be null.</param>
 /// <param name="lemmatize">When true, a lemma is computed for every kept token.</param>
 /// <param name="caseMatchingType">Case matching type stored on the term and passed to PrepareTokens.</param>
 /// <param name="enabled">Enabled flag stored on the term.</param>
 /// <param name="gazetteer">Gazetteer whose stop-word list is consulted.</param>
 private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
 {
     mCaseMatchingType = caseMatchingType;
     mEnabled = enabled;
     // IEnumerator<T> is IDisposable: dispose both enumerators deterministically,
     // including when one of the calls below throws.
     using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
     using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
     {
         while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
         {
             string tokenStr = Normalize(enumTokens.Current);
             string posConstraint = enumPosConstraints.Current;
             if (!gazetteer.IsStopWord(tokenStr.ToLower()))
             {
                 string lemma = null; // stays null when lemmatization is off
                 if (lemmatize)
                 {
                     lemma = mLemmatizer.GetStem(tokenStr);
                     if (lemma == "") { lemma = tokenStr; } // empty stem: fall back to the token itself
                 }
                 GazetteerToken token = new GazetteerToken(tokenStr, posConstraint, lemma);
                 mTokens.Add(token);
             }
         }
     }
     // nothing to prepare when every token was a stop word (or no tokens were given)
     if (mTokens.Count > 0)
     {
         PrepareTokens(caseMatchingType, lemmatize);
     }
 }
Example #4
0
 /// <summary>
 /// Loads all gazetteers found in the RDF store (subjects with P_TYPE = C_GAZETTEER) into
 /// mGazetteers. Loading runs in three passes over the same entity list: object creation
 /// and stop words first, then imports and conditions, then terms.
 /// </summary>
 public void LoadGazetteers()
 {
     mLogger.Info("LoadGazetteers", "Loading gazetteers ...");
     Entity[] gazetteerEntities = mRdfStore.SelectSubjects(P_TYPE, C_GAZETTEER);
     mLogger.Info("LoadGazetteers", "Found {0} gazetteers.", gazetteerEntities.Length);
     // pass 1: create and register one gazetteer object per entity, then read its stop words
     for (int i = 0; i < gazetteerEntities.Length; i++)
     {
         Entity entity = gazetteerEntities[i];
         Gazetteer created = new Gazetteer(entity.Uri);
         mGazetteers.Add(entity.Uri, created);
         created.ReadStopWords(mRdfStore);
     }
     // pass 2: resolve imports and read conditions (both receive the full gazetteer table)
     for (int i = 0; i < gazetteerEntities.Length; i++)
     {
         Entity entity = gazetteerEntities[i];
         mGazetteers[entity.Uri].ImportGazetteers(mRdfStore, mGazetteers);
         mGazetteers[entity.Uri].ReadConditions(mRdfStore, mGazetteers);
     }
     // pass 3: read the terms of every gazetteer
     for (int i = 0; i < gazetteerEntities.Length; i++)
     {
         Entity entity = gazetteerEntities[i];
         mGazetteers[entity.Uri].ReadTerms(mRdfStore);
     }
 }
Example #5
0
 /// <summary>
 /// Creates a condition that refers to the given gazetteer at the given level.
 /// </summary>
 /// <param name="gazetteer">Gazetteer stored in mGazetteer.</param>
 /// <param name="level">Level stored in mLevel.</param>
 public Condition(Gazetteer gazetteer, Level level)
 {
     this.mLevel = level;
     this.mGazetteer = gazetteer;
 }
Example #6
0
 /// <summary>
 /// Checks whether this term (mWords) occurs in <paramref name="words"/> starting exactly
 /// at <paramref name="startIdx"/>. Stop words of <paramref name="gazetteer"/> that appear
 /// between the term's words are skipped.
 /// </summary>
 /// <param name="words">Input words.</param>
 /// <param name="startIdx">Index of the word at which the term must begin.</param>
 /// <param name="len">On success, the number of input words covered (including skipped stop words); 0 on failure.</param>
 /// <param name="gazetteer">Supplies the ignore-case flag and the stop-word list.</param>
 /// <returns>True if the term matches at <paramref name="startIdx"/>.</returns>
 public bool Match(string[] words, int startIdx, out int len, Gazetteer gazetteer)
 {
     len = 0;
     string firstWord = words[startIdx];
     // the first word has to match directly (no stop-word skipping in front of it)
     if (string.Compare(firstWord, mWords[0], gazetteer.mIgnoreCase) != 0) { return false; }
     Utils.CaseType caseType = Utils.GetCaseType(firstWord);
     bool acceptedCase = caseType == Utils.CaseType.ABC || caseType == Utils.CaseType.Abc || caseType == Utils.CaseType.AbC;
     if (!acceptedCase) { return false; } // *** only for the demo
     int pos = startIdx + 1;
     int termIdx = 1;
     while (termIdx < mWords.Count)
     {
         // skip stop words
         for (; pos < words.Length && gazetteer.IsStopWord(words[pos].ToLower()); pos++) { }
         if (pos == words.Length) { return false; } // input exhausted before the term was complete
         if (string.Compare(words[pos], mWords[termIdx], gazetteer.mIgnoreCase) != 0) { return false; }
         pos++;
         termIdx++;
     }
     len = pos - startIdx;
     return true;
 }
Example #7
0
 /// <summary>
 /// Tries to match at <paramref name="startIdx"/> using this gazetteer's own terms first
 /// (IsMatch) and then, in order, each imported gazetteer. The first success wins.
 /// </summary>
 /// <param name="tokens">Input tokens.</param>
 /// <param name="startIdx">Index at which a match must begin.</param>
 /// <param name="len">Value written by the last matching attempt that was made.</param>
 /// <param name="gazetteer">Gazetteer passed through unchanged to every matching attempt.</param>
 /// <returns>True if this gazetteer or one of its imports matches.</returns>
 public bool Match(string[] tokens, int startIdx, out int len, Gazetteer gazetteer)
 {
     bool matched = IsMatch(tokens, startIdx, out len, gazetteer);
     if (!matched)
     {
         // own terms failed: fall back to the imported gazetteers
         foreach (Gazetteer imported in mImportedGazetteers)
         {
             matched = imported.Match(tokens, startIdx, out len, gazetteer);
             if (matched) { break; }
         }
     }
     return matched;
 }
Example #8
0
 /// <summary>
 /// Tests this gazetteer's own terms (mTerms), in order, against
 /// <paramref name="tokens"/> at <paramref name="startIdx"/> and stops at the first term
 /// that matches.
 /// </summary>
 /// <param name="tokens">Input tokens.</param>
 /// <param name="startIdx">Index at which a match must begin.</param>
 /// <param name="len">Value written by the last term tried; equals <paramref name="startIdx"/> when there are no terms.</param>
 /// <param name="gazetteer">Gazetteer passed through to each term's Match.</param>
 /// <returns>True if any term matches.</returns>
 public bool IsMatch(string[] tokens, int startIdx, out int len, Gazetteer gazetteer)
 {
     // NOTE(review): the initial value is a position, not a length - presumably callers
     // ignore len when false is returned; confirm before relying on it.
     len = startIdx;
     bool anyTermMatched = false;
     foreach (Term candidate in mTerms)
     {
         anyTermMatched = candidate.Match(tokens, startIdx, out len, gazetteer);
         if (anyTermMatched) { break; }
     }
     return anyTermMatched;
 }
Example #9
0
 /// <summary>
 /// Reads all gazetteers found in the RDF store (subjects with P_TYPE = C_GAZETTEER) into
 /// mGazetteers. Runs in three passes over the same entity list: object creation with stop
 /// words and settings, then imports, then terms.
 /// </summary>
 public void ReadGazetteers()
 {
     mLogger.Info("ReadGazetteers", "Reading gazetteers ...");
     Entity[] gazetteerEntities = mRdfStore.SelectSubjects(P_TYPE, C_GAZETTEER);
     mLogger.Info("ReadGazetteers", "Found {0} gazetteers.", gazetteerEntities.Length);
     // pass 1: create and register the gazetteer objects
     for (int i = 0; i < gazetteerEntities.Length; i++)
     {
         Entity entity = gazetteerEntities[i];
         Gazetteer created = new Gazetteer();
         created.mUri = entity.Uri;
         mGazetteers.Add(entity.Uri, created);
         created.ReadStopWords(mRdfStore); // stop words
         created.ReadSettings(mRdfStore); // settings
     }
     // pass 2: resolve imported gazetteers (needs every object from pass 1 to be registered)
     for (int i = 0; i < gazetteerEntities.Length; i++)
     {
         Entity entity = gazetteerEntities[i];
         mGazetteers[entity.Uri].ImportGazetteers(mRdfStore, mGazetteers);
     }
     // pass 3: read the terms
     for (int i = 0; i < gazetteerEntities.Length; i++)
     {
         Entity entity = gazetteerEntities[i];
         mGazetteers[entity.Uri].ReadTerms(mRdfStore);
     }
 }
 /// <summary>
 /// Creates a condition that refers to the given gazetteer with the given type.
 /// </summary>
 /// <param name="gazetteer">Gazetteer stored in mGazetteer.</param>
 /// <param name="type">Type stored in mType.</param>
 public Condition(Gazetteer gazetteer, Type type)
 {
     this.mType = type;
     this.mGazetteer = gazetteer;
 }