/// <summary>
/// Scans this object's token sequence (<c>mTokens</c>) for occurrences of every enabled
/// term of <paramref name="gazetteer"/> and reports the character span of each occurrence.
/// </summary>
/// <param name="gazetteer">Gazetteer whose terms are searched for and whose stop-word list is consulted.</param>
/// <param name="spans">
/// Receives one pair per occurrence: the <c>mSpanStart</c> of the first matched token and the
/// <c>mSpanEnd</c> of the last matched token.
/// </param>
public void Match(Gazetteer gazetteer, out ArrayList<Pair<int, int>> spans)
{
    spans = new ArrayList<Pair<int, int>>();
    foreach (GazetteerTerm term in gazetteer.mTerms)
    {
        if (term.mEnabled)
        {
            // last start position at which the term could still fit
            int maxStart = mTokens.Count - term.mTokens.Count;
            for (int start = 0; start <= maxStart; start++)
            {
                int pos = start;      // cursor in this object's tokens
                int termPos = 0;      // cursor in the term's tokens
                bool matched = false;
                while (termPos < term.mTokens.Count)
                {
                    bool isFirstToken = termPos == 0;
                    if (!Match(term.mTokens[termPos], mTokens[pos], term.mCaseMatchingType, isFirstToken))
                    {
                        break;
                    }
                    if (termPos == term.mTokens.Count - 1)
                    {
                        // the final term token matched at pos
                        matched = true;
                        break;
                    }
                    // advance to the next token that is not a stop word
                    pos++;
                    while (pos < mTokens.Count && gazetteer.IsStopWord(mTokens[pos].mTokenStr.ToLower()))
                    {
                        pos++;
                    }
                    if (pos >= mTokens.Count)
                    {
                        break;
                    }
                    termPos++;
                }
                if (matched)
                {
                    // term found: it starts at token 'start' and ends at token 'pos'
                    spans.Add(new Pair<int, int>(mTokens[start].mSpanStart, mTokens[pos].mSpanEnd));
                }
            }
        }
    }
}
//public GazetteerTerm(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
//{
//    InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
//}

/// <summary>
/// Builds a gazetteer term from its textual definition. Every substring matched by
/// <c>mConstraintRegex</c> is handed to <c>ParseGazetteerSettings</c> (which may override the
/// supplied defaults) and removed from the definition; the remainder is split into
/// micro-tokens with <c>mGazetteerMicroTokenRegex</c>.
/// </summary>
/// <param name="termDef">Term definition text.</param>
/// <param name="gazetteer">Gazetteer passed on to <c>InitializeInstance</c>.</param>
/// <param name="defaultCaseMatchingType">Case matching type used unless the definition overrides it.</param>
/// <param name="defaultLemmatizeFlag">Lemmatize flag used unless the definition overrides it.</param>
/// <param name="defaultEnabledFlag">Enabled flag used unless the definition overrides it.</param>
public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defaultCaseMatchingType, bool defaultLemmatizeFlag, bool defaultEnabledFlag)
{
    // start from the defaults; the settings embedded in the definition may change them
    bool enabled = defaultEnabledFlag;
    bool lemmatize = defaultLemmatizeFlag;
    CaseMatchingType caseMatchingType = defaultCaseMatchingType;

    // parse the embedded settings and strip them from the definition
    MatchEvaluator consumeSettings = new MatchEvaluator(delegate(Match m)
    {
        ParseGazetteerSettings(m.Value, ref caseMatchingType, ref lemmatize, ref enabled);
        return "";
    });
    termDef = mConstraintRegex.Replace(termDef, consumeSettings);

    // collect the micro-tokens together with their optional POS constraints
    ArrayList<string> tokens = new ArrayList<string>();
    ArrayList<string> posConstraints = new ArrayList<string>();
    for (Match match = mGazetteerMicroTokenRegex.Match(termDef); match.Success; match = match.NextMatch())
    {
        string raw = match.Value;
        string[] parts = raw.Split('/');
        // only an exact "token/constraint" pair is split; anything else is kept whole
        bool hasPosConstraint = parts.Length == 2;
        tokens.Add(hasPosConstraint ? parts[0] : raw);
        posConstraints.Add(hasPosConstraint ? parts[1] : null);
    }

    InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
}
/// <summary>
/// Stores the term settings and fills <c>mTokens</c> from the given token / POS-constraint
/// sequences, which are walked in lock-step until either one is exhausted.
/// </summary>
/// <param name="tokens">Token strings of the term.</param>
/// <param name="posConstraints">POS constraint for each token (an entry may be null).</param>
/// <param name="lemmatize">When true, a lemma is computed for every kept token.</param>
/// <param name="caseMatchingType">Case matching type stored in <c>mCaseMatchingType</c>.</param>
/// <param name="enabled">Value stored in <c>mEnabled</c>.</param>
/// <param name="gazetteer">Gazetteer whose stop-word list decides which tokens are dropped.</param>
private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
{
    mCaseMatchingType = caseMatchingType;
    mEnabled = enabled;
    // IEnumerator<T> is IDisposable; dispose both enumerators even if an exception is thrown
    using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
    using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
    {
        while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
        {
            string tokenStr = Normalize(enumTokens.Current);
            string posConstraint = enumPosConstraints.Current;
            // stop words are dropped together with their POS constraint
            if (!gazetteer.IsStopWord(tokenStr.ToLower()))
            {
                string lemma = null;
                if (lemmatize)
                {
                    lemma = mLemmatizer.GetStem(tokenStr);
                    // fall back to the token itself when the lemmatizer returns an empty stem
                    if (lemma == "") { lemma = tokenStr; }
                }
                GazetteerToken token = new GazetteerToken(tokenStr, posConstraint, lemma);
                mTokens.Add(token);
            }
        }
    }
    if (mTokens.Count > 0)
    {
        PrepareTokens(caseMatchingType, lemmatize);
    }
}
/// <summary>
/// Loads every subject of type <c>C_GAZETTEER</c> from <c>mRdfStore</c> into <c>mGazetteers</c>
/// (keyed by URI). Works in three passes: create the objects and read their stop words,
/// then resolve imports and conditions, and finally read the terms.
/// </summary>
public void LoadGazetteers()
{
    mLogger.Info("LoadGazetteers", "Loading gazetteers ...");
    Entity[] entities = mRdfStore.SelectSubjects(P_TYPE, C_GAZETTEER);
    mLogger.Info("LoadGazetteers", "Found {0} gazetteers.", entities.Length);

    // pass 1: create and register the gazetteer objects, reading stop words right away
    for (int i = 0; i < entities.Length; i++)
    {
        Gazetteer created = new Gazetteer(entities[i].Uri);
        mGazetteers.Add(entities[i].Uri, created);
        created.ReadStopWords(mRdfStore);
    }

    // pass 2: imports and conditions; both receive the full gazetteer table
    // NOTE(review): presumably a separate pass so that all gazetteers are already registered - confirm
    for (int i = 0; i < entities.Length; i++)
    {
        Gazetteer current = mGazetteers[entities[i].Uri];
        current.ImportGazetteers(mRdfStore, mGazetteers);
        current.ReadConditions(mRdfStore, mGazetteers);
    }

    // pass 3: terms
    for (int i = 0; i < entities.Length; i++)
    {
        Gazetteer current = mGazetteers[entities[i].Uri];
        current.ReadTerms(mRdfStore);
    }
}
/// <summary>
/// Creates a condition that stores the given gazetteer and level.
/// </summary>
/// <param name="gazetteer">Value stored in <c>mGazetteer</c>.</param>
/// <param name="level">Value stored in <c>mLevel</c>.</param>
public Condition(Gazetteer gazetteer, Level level)
{
    this.mLevel = level;
    this.mGazetteer = gazetteer;
}
/// <summary>
/// Tests whether this term's words (<c>mWords</c>) occur in <paramref name="words"/> starting
/// at <paramref name="startIdx"/>. Stop words found between the term's words are skipped.
/// </summary>
/// <param name="words">Word sequence to match against.</param>
/// <param name="startIdx">Index in <paramref name="words"/> at which the first term word must occur.</param>
/// <param name="len">
/// On success, the number of words consumed (skipped stop words included); 0 on failure.
/// </param>
/// <param name="gazetteer">Supplies the ignore-case flag and the stop-word list.</param>
/// <returns>True when the whole term matched; false otherwise.</returns>
public bool Match(string[] words, int startIdx, out int len, Gazetteer gazetteer)
{
    len = 0;
    int pos = startIdx;

    // the first word must match exactly at startIdx
    if (string.Compare(words[pos], mWords[0], gazetteer.mIgnoreCase) != 0)
    {
        return false;
    }

    // *** only for the demo: the first word must have one of these case types
    Utils.CaseType firstWordCase = Utils.GetCaseType(words[pos]);
    if (firstWordCase != Utils.CaseType.ABC
        && firstWordCase != Utils.CaseType.Abc
        && firstWordCase != Utils.CaseType.AbC)
    {
        return false;
    }
    pos++;

    for (int termPos = 1; termPos < mWords.Count; termPos++)
    {
        // skip stop words
        while (pos < words.Length && gazetteer.IsStopWord(words[pos].ToLower()))
        {
            pos++;
        }
        if (pos == words.Length)
        {
            // ran out of input before the term was complete
            return false;
        }
        if (string.Compare(words[pos], mWords[termPos], gazetteer.mIgnoreCase) != 0)
        {
            return false;
        }
        pos++;
    }

    len = pos - startIdx;
    return true;
}
/// <summary>
/// Tries to match a term at <paramref name="startIdx"/>, first against this gazetteer's own
/// terms (<c>IsMatch</c>) and then, in order, against each imported gazetteer.
/// </summary>
/// <param name="tokens">Token sequence to match against.</param>
/// <param name="startIdx">Index in <paramref name="tokens"/> at which the match must start.</param>
/// <param name="len">Length value reported by whichever matcher was called last.</param>
/// <param name="gazetteer">Gazetteer passed through unchanged to every matcher, including the imported ones.</param>
/// <returns>True as soon as any matcher succeeds; false when none does.</returns>
public bool Match(string[] tokens, int startIdx, out int len, Gazetteer gazetteer)
{
    bool matched = IsMatch(tokens, startIdx, out len, gazetteer);
    if (!matched)
    {
        foreach (Gazetteer imported in mImportedGazetteers)
        {
            if (imported.Match(tokens, startIdx, out len, gazetteer))
            {
                matched = true;
                break;
            }
        }
    }
    return matched;
}
/// <summary>
/// Tests this gazetteer's own terms (<c>mTerms</c>), in order, against
/// <paramref name="tokens"/> at <paramref name="startIdx"/>. Imported gazetteers are not consulted.
/// </summary>
/// <param name="tokens">Token sequence to match against.</param>
/// <param name="startIdx">Index in <paramref name="tokens"/> at which the match must start.</param>
/// <param name="len">
/// On success, the length reported by the first matching term; 0 when no term matches.
/// </param>
/// <param name="gazetteer">Gazetteer passed through to each term's <c>Match</c>.</param>
/// <returns>True when some term matches; false otherwise.</returns>
public bool IsMatch(string[] tokens, int startIdx, out int len, Gazetteer gazetteer)
{
    foreach (Term term in mTerms)
    {
        if (term.Match(tokens, startIdx, out len, gazetteer))
        {
            return true;
        }
    }
    // No term matched. Report a length of 0 in every failure case; previously an empty
    // term list left 'len' equal to startIdx, which is an index rather than a length.
    len = 0;
    return false;
}
/// <summary>
/// Reads every subject of type <c>C_GAZETTEER</c> from <c>mRdfStore</c> into <c>mGazetteers</c>
/// (keyed by URI). Works in three passes: create the objects and read their stop words and
/// settings, then resolve imported gazetteers, and finally read the terms.
/// </summary>
public void ReadGazetteers()
{
    mLogger.Info("ReadGazetteers", "Reading gazetteers ...");
    Entity[] entities = mRdfStore.SelectSubjects(P_TYPE, C_GAZETTEER);
    mLogger.Info("ReadGazetteers", "Found {0} gazetteers.", entities.Length);

    // pass 1: gazetteer objects with their stop words and settings
    for (int i = 0; i < entities.Length; i++)
    {
        Gazetteer created = new Gazetteer();
        created.mUri = entities[i].Uri;
        mGazetteers.Add(entities[i].Uri, created);
        created.ReadStopWords(mRdfStore);
        created.ReadSettings(mRdfStore);
    }

    // pass 2: imported gazetteers; each call receives the full gazetteer table
    for (int i = 0; i < entities.Length; i++)
    {
        Gazetteer current = mGazetteers[entities[i].Uri];
        current.ImportGazetteers(mRdfStore, mGazetteers);
    }

    // pass 3: terms
    for (int i = 0; i < entities.Length; i++)
    {
        Gazetteer current = mGazetteers[entities[i].Uri];
        current.ReadTerms(mRdfStore);
    }
}
/// <summary>
/// Creates a condition that stores the given gazetteer and condition type.
/// </summary>
/// <param name="gazetteer">Value stored in <c>mGazetteer</c>.</param>
/// <param name="type">Value stored in <c>mType</c>.</param>
public Condition(Gazetteer gazetteer, Type type)
{
    this.mType = type;
    this.mGazetteer = gazetteer;
}