Beispiel #1
0
            //public GazetteerTerm(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
            //{
            //    InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
            //}

            /// <summary>
            /// Builds a term from a textual definition. Every part of <paramref name="termDef"/> matched by
            /// mConstraintRegex is handed to ParseGazetteerSettings (which may override the defaults) and is
            /// removed from the definition before the remainder is split into micro-tokens.
            /// </summary>
            /// <param name="termDef">Term definition; a micro-token of the form "token/pos" carries a POS constraint.</param>
            /// <param name="gazetteer">Gazetteer forwarded to InitializeInstance.</param>
            /// <param name="defaultCaseMatchingType">Case matching type used unless the definition overrides it.</param>
            /// <param name="defaultLemmatizeFlag">Lemmatize flag used unless the definition overrides it.</param>
            /// <param name="defaultEnabledFlag">Enabled flag used unless the definition overrides it.</param>
            public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defaultCaseMatchingType, bool defaultLemmatizeFlag, bool defaultEnabledFlag)
            {
                // begin with the defaults; the evaluator below may overwrite them through the ref arguments
                CaseMatchingType effectiveCaseMatching = defaultCaseMatchingType;
                bool effectiveLemmatize = defaultLemmatizeFlag;
                bool effectiveEnabled = defaultEnabledFlag;

                // consume each embedded setting and strip it from the definition
                MatchEvaluator consumeSetting = delegate(Match settingMatch)
                {
                    ParseGazetteerSettings(settingMatch.Value, ref effectiveCaseMatching, ref effectiveLemmatize, ref effectiveEnabled);
                    return "";
                };
                termDef = mConstraintRegex.Replace(termDef, consumeSetting);

                ArrayList<string> tokenList = new ArrayList<string>();
                ArrayList<string> posConstraintList = new ArrayList<string>();
                for (Match microToken = mGazetteerMicroTokenRegex.Match(termDef); microToken.Success; microToken = microToken.NextMatch())
                {
                    // only an exact "token/pos" shape is split; anything else is kept whole with no constraint
                    string[] parts = microToken.Value.Split('/');
                    bool hasPosConstraint = parts.Length == 2;
                    tokenList.Add(hasPosConstraint ? parts[0] : microToken.Value);
                    posConstraintList.Add(hasPosConstraint ? parts[1] : null);
                }
                InitializeInstance(tokenList, posConstraintList, effectiveLemmatize, effectiveCaseMatching, effectiveEnabled, gazetteer);
            }
Beispiel #2
0
 /// <summary>
 /// Creates a Gazetteer for every subject returned by the RDF store for (P_TYPE, C_GAZETTEER), registers
 /// it in mGazetteers under its URI, and fills it in three passes: stop words, then imports and
 /// conditions, then terms.
 /// </summary>
 public void LoadGazetteers()
 {
     mLogger.Info("LoadGazetteers", "Loading gazetteers ...");
     Entity[] entities = mRdfStore.SelectSubjects(P_TYPE, C_GAZETTEER);
     mLogger.Info("LoadGazetteers", "Found {0} gazetteers.", entities.Length);

     // pass 1: instantiate and register each gazetteer, then read its stop words
     for (int i = 0; i < entities.Length; i++)
     {
         Gazetteer created = new Gazetteer(entities[i].Uri);
         mGazetteers.Add(entities[i].Uri, created);
         created.ReadStopWords(mRdfStore);
     }

     // pass 2: imports and conditions receive the complete gazetteer map, so they run after registration
     for (int i = 0; i < entities.Length; i++)
     {
         Gazetteer current = mGazetteers[entities[i].Uri];
         current.ImportGazetteers(mRdfStore, mGazetteers);
         current.ReadConditions(mRdfStore, mGazetteers);
     }

     // pass 3: terms
     for (int i = 0; i < entities.Length; i++)
     {
         mGazetteers[entities[i].Uri].ReadTerms(mRdfStore);
     }
 }
Beispiel #3
0
            /// <summary>
            /// Checks whether this term occurs in <paramref name="words"/> beginning exactly at
            /// <paramref name="startIdx"/>. Words that the gazetteer reports as stop words are skipped
            /// between the term's words.
            /// </summary>
            /// <param name="words">Words to match against.</param>
            /// <param name="startIdx">Index of the word at which the term must start.</param>
            /// <param name="len">On success, the number of words consumed (skipped stop words included); 0 otherwise.</param>
            /// <param name="gazetteer">Supplies the ignore-case flag (mIgnoreCase) and the stop-word test.</param>
            /// <returns>True if the term matches at <paramref name="startIdx"/>; otherwise false.</returns>
            public bool Match(string[] words, int startIdx, out int len, Gazetteer gazetteer)
            {
                len = 0;
                // an empty term or a start position outside the input cannot match; report that
                // instead of failing on the indexers below
                if (mWords.Count == 0 || startIdx < 0 || startIdx >= words.Length)
                {
                    return(false);
                }
                int idx = startIdx;
                if (string.Compare(words[idx], mWords[0], gazetteer.mIgnoreCase) != 0)
                {
                    return(false);
                }                                                                                        // first word must match
                Utils.CaseType caseType = Utils.GetCaseType(words[idx]);
                if (!(caseType == Utils.CaseType.ABC || caseType == Utils.CaseType.Abc || caseType == Utils.CaseType.AbC))
                {
                    return(false);
                }                                                                                                                            // *** only for the demo
                idx++;
                for (int i = 1; i < mWords.Count; i++)
                {
                    while (idx < words.Length && gazetteer.IsStopWord(words[idx].ToLower()))
                    {
                        idx++;
                    }                                                                                   // skip stop words
                    if (idx == words.Length)
                    {
                        return(false);                                                                  // input exhausted before the term was complete
                    }
                    if (string.Compare(words[idx++], mWords[i], gazetteer.mIgnoreCase) != 0)
                    {
                        return(false);
                    }
                }
                len = idx - startIdx;
                return(true);
            }
Beispiel #4
0
 /// <summary>
 /// Creates a Gazetteer for every subject returned by the RDF store for (P_TYPE, C_GAZETTEER), registers
 /// it in mGazetteers under its URI, and fills it in three passes: stop words and settings, then
 /// imported gazetteers, then terms.
 /// </summary>
 public void ReadGazetteers()
 {
     mLogger.Info("ReadGazetteers", "Reading gazetteers ...");
     Entity[] entities = mRdfStore.SelectSubjects(P_TYPE, C_GAZETTEER);
     mLogger.Info("ReadGazetteers", "Found {0} gazetteers.", entities.Length);

     // pass 1: create and register the objects, read stop words and settings
     for (int i = 0; i < entities.Length; i++)
     {
         Gazetteer created = new Gazetteer();
         created.mUri = entities[i].Uri;
         mGazetteers.Add(entities[i].Uri, created);
         created.ReadStopWords(mRdfStore);
         created.ReadSettings(mRdfStore);
     }

     // pass 2: imports receive the complete gazetteer map, so they run after registration
     for (int i = 0; i < entities.Length; i++)
     {
         mGazetteers[entities[i].Uri].ImportGazetteers(mRdfStore, mGazetteers);
     }

     // pass 3: terms
     for (int i = 0; i < entities.Length; i++)
     {
         mGazetteers[entities[i].Uri].ReadTerms(mRdfStore);
     }
 }
Beispiel #5
0
 /// <summary>
 /// Tries each term in mTerms at <paramref name="startIdx"/> and stops at the first one that matches.
 /// </summary>
 /// <param name="tokens">Tokens to match against.</param>
 /// <param name="startIdx">Index at which a term must start.</param>
 /// <param name="len">Length reported by the matching term; 0 when there are no terms.</param>
 /// <param name="gazetteer">Passed unchanged to each term's Match.</param>
 /// <returns>True if some term matches; otherwise false.</returns>
 public bool IsMatch(string[] tokens, int startIdx, out int len, Gazetteer gazetteer)
 {
     // len is a length, not a position: start from 0 so an empty term list does not
     // report startIdx as the match length
     len = 0;
     foreach (Term term in mTerms)
     {
         if (term.Match(tokens, startIdx, out len, gazetteer))
         {
             return(true);
         }
     }
     return(false);
 }
        /// <summary>
        /// Sets the NER layer data to the gazetteer's name when the gazetteer contains the word, or the
        /// part of the word before its first apostrophe, and the parent's data is named "NNP".
        /// </summary>
        /// <param name="gazetteer">Gazetteer to look the word up in.</param>
        /// <param name="word">Word to check.</param>
        public void CheckGazetteer(Gazetteer gazetteer, string word)
        {
            // whole-word lookup
            if (gazetteer.Contains(word))
            {
                if (GetParent().GetData().GetName().Equals("NNP"))
                {
                    GetLayerInfo().SetLayerData(ViewLayerType.NER, gazetteer.GetName());
                }
            }

            // lookup of the part preceding the apostrophe, if there is one
            if (!word.Contains("'"))
            {
                return;
            }
            string beforeApostrophe = word.Substring(0, word.IndexOf("'"));
            if (gazetteer.Contains(beforeApostrophe) && GetParent().GetData().GetName().Equals("NNP"))
            {
                GetLayerInfo().SetLayerData(ViewLayerType.NER, gazetteer.GetName());
            }
        }
        /// <summary>
        /// Loads the record whose GazetteerId equals <paramref name="id"/> into the Gazetteer property.
        /// </summary>
        /// <param name="id">Identifier of the record to load.</param>
        /// <returns>NotFound when no id is supplied or no record matches; otherwise the page.</returns>
        public async Task <IActionResult> OnGetAsync(long? id)
        {
            if (id.HasValue)
            {
                Gazetteer = await _context.Gazetteer.FirstOrDefaultAsync(entry => entry.GazetteerId == id);
                if (Gazetteer != null)
                {
                    return Page();
                }
            }
            return NotFound();
        }
Beispiel #8
0
 /// <summary>
 /// Looks for a match at <paramref name="startIdx"/> first via IsMatch and then, in order, in each
 /// imported gazetteer; the first hit wins.
 /// </summary>
 /// <param name="tokens">Tokens to match against.</param>
 /// <param name="startIdx">Index at which the match must start.</param>
 /// <param name="len">Length reported by the last matcher that was tried.</param>
 /// <param name="gazetteer">Passed unchanged to IsMatch and to every imported gazetteer's Match.</param>
 /// <returns>True if this gazetteer or one of its imports matches; otherwise false.</returns>
 public bool Match(string[] tokens, int startIdx, out int len, Gazetteer gazetteer)
 {
     bool matched = IsMatch(tokens, startIdx, out len, gazetteer);
     if (!matched)
     {
         foreach (Gazetteer imported in mImportedGazetteers)
         {
             if (imported.Match(tokens, startIdx, out len, gazetteer))
             {
                 matched = true;
                 break;
             }
         }
     }
     return matched;
 }
        /// <summary>
        /// Removes the record with the given key, if it exists, and saves the change.
        /// </summary>
        /// <param name="id">Key of the record to remove.</param>
        /// <returns>NotFound when no id is supplied; otherwise a redirect to "./Index".</returns>
        public async Task <IActionResult> OnPostAsync(long? id)
        {
            if (!id.HasValue)
            {
                return NotFound();
            }

            Gazetteer = await _context.Gazetteer.FindAsync(id);
            if (Gazetteer == null)
            {
                // nothing to remove; still send the user back to the index
                return RedirectToPage("./Index");
            }

            _context.Gazetteer.Remove(Gazetteer);
            await _context.SaveChangesAsync();
            return RedirectToPage("./Index");
        }
Beispiel #10
0
 /// <summary>
 /// Finds every occurrence of the gazetteer's enabled terms among this object's tokens (mTokens).
 /// Tokens the gazetteer reports as stop words are skipped between consecutive term tokens.
 /// </summary>
 /// <param name="gazetteer">Gazetteer whose terms (mTerms) and stop words are used.</param>
 /// <param name="spans">
 /// Receives one (first token index, last token index) pair per occurrence whose character extent
 /// is at least the term's mMinLen.
 /// </param>
 public void Match(Gazetteer gazetteer, out ArrayList <Pair <int, int> > spans)
 {
     spans = new ArrayList<Pair<int, int>>();
     foreach (GazetteerTerm term in gazetteer.mTerms)
     {
         if (!term.mEnabled)
         {
             continue;
         }
         int maxStart = mTokens.Count - term.mTokens.Count;
         for (int first = 0; first <= maxStart; first++)
         {
             int last = first;       // token currently compared against the term
             bool matched = false;
             int termIdx = 0;
             while (termIdx < term.mTokens.Count
                    && Match(term.mTokens[termIdx], mTokens[last], term.mCaseMatchingType, /*firstToken=*/ termIdx == 0))
             {
                 if (termIdx == term.mTokens.Count - 1)
                 {
                     matched = true; // all term tokens matched; 'last' is the final matched token
                     break;
                 }
                 // advance to the next token that is not a stop word
                 do
                 {
                     last++;
                 }
                 while (last < mTokens.Count && gazetteer.IsStopWord(mTokens[last].mTokenStr.ToLower()));
                 if (last >= mTokens.Count)
                 {
                     break;          // ran out of tokens before the term was complete
                 }
                 termIdx++;
             }
             if (!matched)
             {
                 continue;
             }
             // *** this counts all chars in the annotation (incl. spaces and non-token chars)
             int charLen = mTokens[last].mSpanEnd - mTokens[first].mSpanStart + 1;
             if (charLen >= term.mMinLen)
             {
                 spans.Add(new Pair<int, int>(first, last));
             }
         }
     }
 }
        /// <summary>
        /// Verifies that the location gazetteer contains each sample entry in both its lower-case
        /// and upper-case spelling.
        /// </summary>
        public void TestContains()
        {
            var gazetteer = new Gazetteer("location", "gazetteer-location.txt");

            // lower-case / upper-case pairs, checked in this order
            string[] expectedEntries =
            {
                "bağdat", "BAĞDAT",
                "belçika", "BELÇİKA",
                "körfez", "KÖRFEZ",
                "küba", "KÜBA",
                "varşova", "VARŞOVA",
                "krallık", "KRALLIK",
                "berlin", "BERLİN"
            };

            foreach (string entry in expectedEntries)
            {
                Assert.True(gazetteer.Contains(entry));
            }
        }
 /// <summary>Creates a condition that refers to the given gazetteer at the given level.</summary>
 /// <param name="gazetteer">Gazetteer stored in mGazetteer.</param>
 /// <param name="level">Level stored in mLevel.</param>
 public Condition(Gazetteer gazetteer, Level level)
 {
     this.mLevel = level;
     this.mGazetteer = gazetteer;
 }
Beispiel #13
0
            /// <summary>
            /// Stores the term settings and builds mTokens from the given tokens and their POS constraints.
            /// The two sequences are walked in lockstep; iteration stops when either one is exhausted.
            /// Tokens the gazetteer reports as stop words (after normalization and lower-casing) are dropped.
            /// </summary>
            /// <param name="tokens">Token strings of the term.</param>
            /// <param name="posConstraints">POS constraint for each token, in the same order; entries may be null.</param>
            /// <param name="lemmatize">When true, each token also gets a lemma from mLemmatizer (the token itself if the stem is empty).</param>
            /// <param name="caseMatchingType">Case matching type stored on the term and passed to PrepareTokens.</param>
            /// <param name="enabled">Enabled flag stored on the term.</param>
            /// <param name="gazetteer">Gazetteer used for the stop-word test.</param>
            private void InitializeInstance(IEnumerable <string> tokens, IEnumerable <string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
            {
                mCaseMatchingType = caseMatchingType;
                mEnabled          = enabled;

                // IEnumerator<T> is IDisposable: release both enumerators even if a callee throws
                using (IEnumerator <string> enumTokens = tokens.GetEnumerator())
                using (IEnumerator <string> enumPosConstraints = posConstraints.GetEnumerator())
                {
                    while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
                    {
                        string tokenStr      = Normalize(enumTokens.Current);
                        string posConstraint = enumPosConstraints.Current;
                        if (!gazetteer.IsStopWord(tokenStr.ToLower()))
                        {
                            string lemma = null; // stays null when lemmatization is off
                            if (lemmatize)
                            {
                                lemma = mLemmatizer.GetStem(tokenStr);
                                if (lemma == "")
                                {
                                    lemma = tokenStr; // no stem available: fall back to the token itself
                                }
                            }
                            GazetteerToken token = new GazetteerToken(tokenStr, posConstraint, lemma);
                            mTokens.Add(token);
                        }
                    }
                }
                if (mTokens.Count > 0)
                {
                    PrepareTokens(caseMatchingType, lemmatize);
                }
            }
Beispiel #14
0
            /// <summary>
            /// Matches every gazetteer of the engine against every sentence of this object's text blocks
            /// and returns the matches whose gazetteer conditions are all satisfied.
            /// </summary>
            /// <param name="e">Engine whose gazetteers (mGazetteers) are applied.</param>
            /// <param name="spans">Receives the accepted spans, in the same order as the returned URIs.</param>
            /// <returns>One gazetteer URI (mUri) per accepted span.</returns>
            public ArrayList <string> DiscoverEntities(EntityRecognitionEngine e, out ArrayList <Pair <int, int> > spans)
            {
                Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> hitsBySentence
                    = new Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>>();
                Dictionary<TextBlock, Set<Gazetteer>> gazetteersByBlock
                    = new Dictionary<TextBlock, Set<Gazetteer>>();
                Set<Gazetteer> gazetteersInDocument = new Set<Gazetteer>();

                // step 1: collect the spans of every gazetteer, sentence by sentence
                foreach (KeyValuePair<string, Gazetteer> registered in e.mGazetteers)
                {
                    foreach (TextBlock block in mTextBlocks)
                    {
                        foreach (Sentence sentence in block.mSentences)
                        {
                            ArrayList<Pair<int, int>> hits;
                            sentence.Match(registered.Value, out hits);
                            if (hits.Count == 0)
                            {
                                continue;
                            }
                            Dictionary<Gazetteer, ArrayList<Pair<int, int>>> perGazetteer;
                            if (!hitsBySentence.TryGetValue(sentence, out perGazetteer))
                            {
                                perGazetteer = new Dictionary<Gazetteer, ArrayList<Pair<int, int>>>();
                                hitsBySentence.Add(sentence, perGazetteer);
                            }
                            perGazetteer.Add(registered.Value, hits);
                        }
                    }
                }

                // step 2: record which gazetteers fired per text block and in the whole document
                foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceHits in hitsBySentence)
                {
                    TextBlock owner = sentenceHits.Key.mTextBlock;
                    foreach (Gazetteer fired in sentenceHits.Value.Keys)
                    {
                        gazetteersInDocument.Add(fired);
                        Set<Gazetteer> blockSet;
                        if (gazetteersByBlock.TryGetValue(owner, out blockSet))
                        {
                            blockSet.Add(fired);
                        }
                        else
                        {
                            gazetteersByBlock.Add(owner, new Set<Gazetteer>(new Gazetteer[] { fired }));
                        }
                    }
                }

                // step 3: keep only the matches whose conditions all hold
                spans = new ArrayList<Pair<int, int>>();
                ArrayList<string> discoveredEntities = new ArrayList<string>(); // gazetteer URIs

                foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceHits in hitsBySentence)
                {
                    Set<Gazetteer> blockGazetteers = gazetteersByBlock[sentenceHits.Key.mTextBlock];
                    foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> candidate in sentenceHits.Value)
                    {
                        Gazetteer gazetteer = candidate.Key;
                        bool valid = true;
                        foreach (Condition condition in gazetteer.mConditions)
                        {
                            // a condition is violated when its gazetteer did not fire at the condition's level
                            bool violated =
                                (condition.mLevel == Condition.Level.Document && !gazetteersInDocument.Contains(condition.mGazetteer))
                                || (condition.mLevel == Condition.Level.Block && !blockGazetteers.Contains(condition.mGazetteer))
                                || (condition.mLevel == Condition.Level.Sentence && !sentenceHits.Value.ContainsKey(condition.mGazetteer));
                            if (violated)
                            {
                                valid = false;
                                break;
                            }
                        }
                        if (!valid)
                        {
                            continue;
                        }
                        // one URI per span keeps the returned list aligned with 'spans'
                        for (int remaining = candidate.Value.Count; remaining > 0; remaining--)
                        {
                            discoveredEntities.Add(gazetteer.mUri);
                        }
                        spans.AddRange(candidate.Value);
                    }
                }
                return discoveredEntities;
            }
Beispiel #15
0
 /// <summary>Creates a condition that refers to the given gazetteer at the given level.</summary>
 /// <param name="gazetteer">Gazetteer stored in mGazetteer.</param>
 /// <param name="level">Level stored in mLevel.</param>
 public Condition(Gazetteer gazetteer, Level level)
 {
     this.mLevel = level;
     this.mGazetteer = gazetteer;
 }
 /// <summary>
 /// Finds every occurrence of the gazetteer's enabled terms among this object's tokens (mTokens).
 /// Tokens the gazetteer reports as stop words are skipped between consecutive term tokens.
 /// </summary>
 /// <param name="gazetteer">Gazetteer whose terms (mTerms) and stop words are used.</param>
 /// <param name="spans">
 /// Receives one pair per occurrence: mSpanStart of the first matched token and mSpanEnd of the last.
 /// </param>
 public void Match(Gazetteer gazetteer, out ArrayList<Pair<int, int>> spans)
 {
     spans = new ArrayList<Pair<int, int>>();
     foreach (GazetteerTerm term in gazetteer.mTerms)
     {
         if (!term.mEnabled)
         {
             continue;
         }
         int maxStart = mTokens.Count - term.mTokens.Count;
         for (int first = 0; first <= maxStart; first++)
         {
             int last = first;       // token currently compared against the term
             bool matched = false;
             int termIdx = 0;
             while (termIdx < term.mTokens.Count
                    && Match(term.mTokens[termIdx], mTokens[last], term.mCaseMatchingType, /*firstToken=*/termIdx == 0))
             {
                 if (termIdx == term.mTokens.Count - 1)
                 {
                     matched = true; // all term tokens matched; 'last' is the final matched token
                     break;
                 }
                 // advance to the next token that is not a stop word
                 do
                 {
                     last++;
                 }
                 while (last < mTokens.Count && gazetteer.IsStopWord(mTokens[last].mTokenStr.ToLower()));
                 if (last >= mTokens.Count)
                 {
                     break;          // ran out of tokens before the term was complete
                 }
                 termIdx++;
             }
             if (matched)
             {
                 spans.Add(new Pair<int, int>(mTokens[first].mSpanStart, mTokens[last].mSpanEnd));
             }
         }
     }
 }
Beispiel #17
0
            /// <summary>
            /// Matches every gazetteer of the engine against every sentence of this object's text blocks,
            /// filters the matches by the gazetteers' conditions, drops matches that lie inside another
            /// match of the same sentence, and returns what remains.
            /// </summary>
            /// <param name="e">Engine whose gazetteers (mGazetteers) are applied.</param>
            /// <param name="spans">
            /// Receives one pair per accepted match, built from mSpanStart of its first token and mSpanEnd
            /// of its last token; the order matches the returned URIs.
            /// </param>
            /// <returns>One gazetteer URI (mUri) per accepted match.</returns>
            public ArrayList <string> DiscoverEntities(EntityRecognitionEngine e, out ArrayList <Pair <int, int> > spans)
            {
                Dictionary <Sentence, Dictionary <Gazetteer, ArrayList <Pair <int, int> > > > sentenceEntityInfo
                    = new Dictionary <Sentence, Dictionary <Gazetteer, ArrayList <Pair <int, int> > > >();
                Dictionary <TextBlock, Set <Gazetteer> > textBlockEntityInfo
                    = new Dictionary <TextBlock, Set <Gazetteer> >();
                Set <Gazetteer> documentEntityInfo
                    = new Set <Gazetteer>();

                // look for gazetteer terms
                foreach (KeyValuePair <string, Gazetteer> gazetteer in e.mGazetteers)
                {
                    foreach (TextBlock textBlock in mTextBlocks)
                    {
                        foreach (Sentence sentence in textBlock.mSentences)
                        {
                            // Match assigns a fresh list on every call, so no list is shared between sentences
                            ArrayList <Pair <int, int> > sentenceSpans;
                            sentence.Match(gazetteer.Value, out sentenceSpans);
                            if (sentenceSpans.Count > 0)
                            {
                                Dictionary <Gazetteer, ArrayList <Pair <int, int> > > sentenceInfo;
                                if (sentenceEntityInfo.TryGetValue(sentence, out sentenceInfo))
                                {
                                    sentenceInfo.Add(gazetteer.Value, sentenceSpans);
                                }
                                else
                                {
                                    sentenceInfo = new Dictionary <Gazetteer, ArrayList <Pair <int, int> > >();
                                    sentenceInfo.Add(gazetteer.Value, sentenceSpans);
                                    sentenceEntityInfo.Add(sentence, sentenceInfo);
                                }
                            }
                        }
                    }
                }
                // propagate discovered entities (to the text-block and document level)
                foreach (KeyValuePair <Sentence, Dictionary <Gazetteer, ArrayList <Pair <int, int> > > > sentenceInfo in sentenceEntityInfo)
                {
                    foreach (KeyValuePair <Gazetteer, ArrayList <Pair <int, int> > > gazetteerInfo in sentenceInfo.Value)
                    {
                        documentEntityInfo.Add(gazetteerInfo.Key);
                        TextBlock       textBlock = sentenceInfo.Key.mTextBlock;
                        Set <Gazetteer> textBlockInfo;
                        if (textBlockEntityInfo.TryGetValue(textBlock, out textBlockInfo))
                        {
                            textBlockInfo.Add(gazetteerInfo.Key);
                        }
                        else
                        {
                            textBlockInfo = new Set <Gazetteer>(new Gazetteer[] { gazetteerInfo.Key });
                            textBlockEntityInfo.Add(textBlock, textBlockInfo);
                        }
                    }
                }
                // check conditions
                spans = new ArrayList <Pair <int, int> >();
                ArrayList <string> discoveredEntities = new ArrayList <string>(); // gazetteer URIs

                foreach (KeyValuePair <Sentence, Dictionary <Gazetteer, ArrayList <Pair <int, int> > > > sentenceInfo in sentenceEntityInfo)
                {
                    foreach (KeyValuePair <Gazetteer, ArrayList <Pair <int, int> > > gazetteerInfo in sentenceInfo.Value)
                    {
                        Gazetteer       gazetteer           = gazetteerInfo.Key;
                        Set <Gazetteer> textBlockGazetteers = textBlockEntityInfo[sentenceInfo.Key.mTextBlock];
                        bool            valid = true;
                        foreach (Condition condition in gazetteer.mConditions)
                        {
                            if (condition.mType == Condition.Type.Document)
                            {
                                if (!documentEntityInfo.Contains(condition.mGazetteer))
                                {
                                    valid = false; break;
                                }
                            }
                            else if (condition.mType == Condition.Type.Block)
                            {
                                if (!textBlockGazetteers.Contains(condition.mGazetteer))
                                {
                                    valid = false; break;
                                }
                            }
                            else if (condition.mType == Condition.Type.Sentence)
                            {
                                if (!sentenceInfo.Value.ContainsKey(condition.mGazetteer))
                                {
                                    valid = false; break;
                                }
                            }
                            else if (condition.mType == Condition.Type.FollowedBy)
                            {
                                // fast check
                                if (!sentenceInfo.Value.ContainsKey(condition.mGazetteer))
                                {
                                    valid = false; break;
                                }
                                // thorough check: keep a span only if a span of the condition gazetteer
                                // starts at the token right after it
                                ArrayList <Pair <int, int> > tmp       = new ArrayList <Pair <int, int> >();
                                ArrayList <Pair <int, int> > condSpans = sentenceInfo.Value[condition.mGazetteer];
                                foreach (Pair <int, int> span in gazetteerInfo.Value)
                                {
                                    foreach (Pair <int, int> condSpan in condSpans)
                                    {
                                        if (span.Second == condSpan.First - 1) // span is valid
                                        {
                                            tmp.Add(span);
                                            // several condition spans can start at the same token; one is
                                            // enough, and adding the span again would duplicate the entity
                                            break;
                                        }
                                    }
                                }
                                if (tmp.Count == 0)
                                {
                                    valid = false; break;
                                }
                                // replace this gazetteer's spans in place with the ones that passed
                                gazetteerInfo.Value.Clear();
                                gazetteerInfo.Value.AddRange(tmp);
                            }
                        }
                        if (valid)
                        {
                            for (int i = 0; i < gazetteerInfo.Value.Count; i++)
                            {
                                // check if inside another span (of any gazetteer) in the same sentence
                                bool            skip = false;
                                Pair <int, int> span = gazetteerInfo.Value[i];
                                foreach (KeyValuePair <Gazetteer, ArrayList <Pair <int, int> > > gazInfo in sentenceInfo.Value)
                                {
                                    foreach (Pair <int, int> otherSpan in gazInfo.Value)
                                    {
                                        if (span.First >= otherSpan.First && span.Second <= otherSpan.Second && span != otherSpan)
                                        {
                                            skip = true;
                                            break;
                                        }
                                    }
                                    if (skip)
                                    {
                                        break;
                                    }
                                }
                                if (!skip)
                                {
                                    discoveredEntities.Add(gazetteer.mUri);
                                    // convert the token-index span into the tokens' span start/end values
                                    spans.Add(new Pair <int, int>(sentenceInfo.Key.mTokens[span.First].mSpanStart, sentenceInfo.Key.mTokens[span.Second].mSpanEnd));
                                }
                            }
                        }
                    }
                }
                return(discoveredEntities);
            }
 /// <summary>
 /// Stores the term settings and builds mTokens from the given tokens and their POS constraints.
 /// The two sequences are walked in lockstep; iteration stops when either one is exhausted.
 /// Tokens the gazetteer reports as stop words (after normalization and lower-casing) are dropped.
 /// </summary>
 /// <param name="tokens">Token strings of the term.</param>
 /// <param name="posConstraints">POS constraint for each token, in the same order; entries may be null.</param>
 /// <param name="lemmatize">When true, each token also gets a lemma from mLemmatizer (the token itself if the stem is empty).</param>
 /// <param name="caseMatchingType">Case matching type stored on the term and passed to PrepareTokens.</param>
 /// <param name="enabled">Enabled flag stored on the term.</param>
 /// <param name="gazetteer">Gazetteer used for the stop-word test.</param>
 private void InitializeInstance(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
 {
     mCaseMatchingType = caseMatchingType;
     mEnabled = enabled;
     // IEnumerator<T> is IDisposable: release both enumerators even if a callee throws
     using (IEnumerator<string> enumTokens = tokens.GetEnumerator())
     using (IEnumerator<string> enumPosConstraints = posConstraints.GetEnumerator())
     {
         while (enumTokens.MoveNext() && enumPosConstraints.MoveNext())
         {
             string tokenStr = Normalize(enumTokens.Current);
             string posConstraint = enumPosConstraints.Current;
             if (!gazetteer.IsStopWord(tokenStr.ToLower()))
             {
                 string lemma = null; // stays null when lemmatization is off
                 if (lemmatize)
                 {
                     lemma = mLemmatizer.GetStem(tokenStr);
                     if (lemma == "") { lemma = tokenStr; } // no stem available: fall back to the token itself
                 }
                 GazetteerToken token = new GazetteerToken(tokenStr, posConstraint, lemma);
                 mTokens.Add(token);
             }
         }
     }
     if (mTokens.Count > 0)
     {
         PrepareTokens(caseMatchingType, lemmatize);
     }
 }
 /// <summary>
 /// Creates a Gazetteer for every subject returned by the RDF store for (P_TYPE, C_GAZETTEER), registers
 /// it in mGazetteers under its URI, and fills it in three passes: stop words, then imports and
 /// conditions, then terms.
 /// </summary>
 public void LoadGazetteers()
 {
     mLogger.Info("LoadGazetteers", "Loading gazetteers ...");
     Entity[] entities = mRdfStore.SelectSubjects(P_TYPE, C_GAZETTEER);
     mLogger.Info("LoadGazetteers", "Found {0} gazetteers.", entities.Length);

     // pass 1: instantiate and register each gazetteer, then read its stop words
     for (int i = 0; i < entities.Length; i++)
     {
         Gazetteer created = new Gazetteer(entities[i].Uri);
         mGazetteers.Add(entities[i].Uri, created);
         created.ReadStopWords(mRdfStore);
     }

     // pass 2: imports and conditions receive the complete gazetteer map, so they run after registration
     for (int i = 0; i < entities.Length; i++)
     {
         Gazetteer current = mGazetteers[entities[i].Uri];
         current.ImportGazetteers(mRdfStore, mGazetteers);
         current.ReadConditions(mRdfStore, mGazetteers);
     }

     // pass 3: terms
     for (int i = 0; i < entities.Length; i++)
     {
         mGazetteers[entities[i].Uri].ReadTerms(mRdfStore);
     }
 }
 /// <summary>
 /// Creates the three gazetteers used by automatic Named Entity Recognition: PERSON, ORGANIZATION
 /// and LOCATION, each constructed with its name and a gazetteer file name.
 /// </summary>
 public AutoNER()
 {
     // construction order is kept: person, organization, location
     personGazetteer = new Gazetteer(
         "PERSON",
         "gazetteer-person.txt");
     organizationGazetteer = new Gazetteer(
         "ORGANIZATION",
         "gazetteer-organization.txt");
     locationGazetteer = new Gazetteer(
         "LOCATION",
         "gazetteer-location.txt");
 }
 /// <summary>Creates a condition of the given type that refers to the given gazetteer.</summary>
 /// <param name="gazetteer">Gazetteer stored in mGazetteer.</param>
 /// <param name="type">Condition type stored in mType.</param>
 public Condition(Gazetteer gazetteer, Type type)
 {
     this.mType = type;
     this.mGazetteer = gazetteer;
 }
Beispiel #22
0
 /// <summary>Creates a condition of the given type that refers to the given gazetteer.</summary>
 /// <param name="gazetteer">Gazetteer stored in mGazetteer.</param>
 /// <param name="type">Condition type stored in mType.</param>
 public Condition(Gazetteer gazetteer, Type type)
 {
     this.mType = type;
     this.mGazetteer = gazetteer;
 }
Beispiel #23
0
 /// <summary>
 /// Looks for a match at <paramref name="startIdx"/> first via IsMatch and then, in order, in each
 /// imported gazetteer; the first hit wins.
 /// </summary>
 /// <param name="tokens">Tokens to match against.</param>
 /// <param name="startIdx">Index at which the match must start.</param>
 /// <param name="len">Length reported by the last matcher that was tried.</param>
 /// <param name="gazetteer">Passed unchanged to IsMatch and to every imported gazetteer's Match.</param>
 /// <returns>True if this gazetteer or one of its imports matches; otherwise false.</returns>
 public bool Match(string[] tokens, int startIdx, out int len, Gazetteer gazetteer)
 {
     bool matched = IsMatch(tokens, startIdx, out len, gazetteer);
     if (!matched)
     {
         foreach (Gazetteer imported in mImportedGazetteers)
         {
             if (imported.Match(tokens, startIdx, out len, gazetteer))
             {
                 matched = true;
                 break;
             }
         }
     }
     return matched;
 }
Beispiel #24
0
 /// <summary>
 /// Tries each term in mTerms at <paramref name="startIdx"/> and stops at the first one that matches.
 /// </summary>
 /// <param name="tokens">Tokens to match against.</param>
 /// <param name="startIdx">Index at which a term must start.</param>
 /// <param name="len">Length reported by the matching term; 0 when there are no terms.</param>
 /// <param name="gazetteer">Passed unchanged to each term's Match.</param>
 /// <returns>True if some term matches; otherwise false.</returns>
 public bool IsMatch(string[] tokens, int startIdx, out int len, Gazetteer gazetteer)
 {
     // len is a length, not a position: start from 0 so an empty term list does not
     // report startIdx as the match length
     len = 0;
     foreach (Term term in mTerms)
     {
         if (term.Match(tokens, startIdx, out len, gazetteer)) { return true; }
     }
     return false;
 }
Beispiel #25
0
 /// <summary>
 /// Creates a Gazetteer for every subject returned by the RDF store for (P_TYPE, C_GAZETTEER), registers
 /// it in mGazetteers under its URI, and fills it in three passes: stop words and settings, then
 /// imported gazetteers, then terms.
 /// </summary>
 public void ReadGazetteers()
 {
     mLogger.Info("ReadGazetteers", "Reading gazetteers ...");
     Entity[] entities = mRdfStore.SelectSubjects(P_TYPE, C_GAZETTEER);
     mLogger.Info("ReadGazetteers", "Found {0} gazetteers.", entities.Length);

     // pass 1: create and register the objects, read stop words and settings
     for (int i = 0; i < entities.Length; i++)
     {
         Gazetteer created = new Gazetteer();
         created.mUri = entities[i].Uri;
         mGazetteers.Add(entities[i].Uri, created);
         created.ReadStopWords(mRdfStore);
         created.ReadSettings(mRdfStore);
     }

     // pass 2: imports receive the complete gazetteer map, so they run after registration
     for (int i = 0; i < entities.Length; i++)
     {
         mGazetteers[entities[i].Uri].ImportGazetteers(mRdfStore, mGazetteers);
     }

     // pass 3: terms
     for (int i = 0; i < entities.Length; i++)
     {
         mGazetteers[entities[i].Uri].ReadTerms(mRdfStore);
     }
 }
Beispiel #26
0
 /// <summary>
 /// Checks whether this term occurs in <paramref name="words"/> beginning exactly at
 /// <paramref name="startIdx"/>. Words that the gazetteer reports as stop words are skipped
 /// between the term's words.
 /// </summary>
 /// <param name="words">Words to match against.</param>
 /// <param name="startIdx">Index of the word at which the term must start.</param>
 /// <param name="len">On success, the number of words consumed (skipped stop words included); 0 otherwise.</param>
 /// <param name="gazetteer">Supplies the ignore-case flag (mIgnoreCase) and the stop-word test.</param>
 /// <returns>True if the term matches at <paramref name="startIdx"/>; otherwise false.</returns>
 public bool Match(string[] words, int startIdx, out int len, Gazetteer gazetteer)
 {
     len = 0;
     // an empty term or a start position outside the input cannot match; report that
     // instead of failing on the indexers below
     if (mWords.Count == 0 || startIdx < 0 || startIdx >= words.Length) { return false; }
     int idx = startIdx;
     if (string.Compare(words[idx], mWords[0], gazetteer.mIgnoreCase) != 0) { return false; } // first word must match
     Utils.CaseType caseType = Utils.GetCaseType(words[idx]);
     if (!(caseType == Utils.CaseType.ABC || caseType == Utils.CaseType.Abc || caseType == Utils.CaseType.AbC)) { return false; } // *** only for the demo
     idx++;
     for (int i = 1; i < mWords.Count; i++)
     {
         while (idx < words.Length && gazetteer.IsStopWord(words[idx].ToLower())) { idx++; } // skip stop words
         if (idx == words.Length) { return false; } // input exhausted before the term was complete
         if (string.Compare(words[idx++], mWords[i], gazetteer.mIgnoreCase) != 0) { return false; }
     }
     len = idx - startIdx;
     return true;
 }
 //public GazetteerTerm(IEnumerable<string> tokens, IEnumerable<string> posConstraints, bool lemmatize, CaseMatchingType caseMatchingType, bool enabled, Gazetteer gazetteer)
 //{
 //    InitializeInstance(tokens, posConstraints, lemmatize, caseMatchingType, enabled, gazetteer);
 //}
 /// <summary>
 /// Builds a term from a textual definition. Every part of <paramref name="termDef"/> matched by
 /// mConstraintRegex is handed to ParseGazetteerSettings (which may override the defaults) and is
 /// removed from the definition before the remainder is split into micro-tokens.
 /// </summary>
 /// <param name="termDef">Term definition; a micro-token of the form "token/pos" carries a POS constraint.</param>
 /// <param name="gazetteer">Gazetteer forwarded to InitializeInstance.</param>
 /// <param name="defaultCaseMatchingType">Case matching type used unless the definition overrides it.</param>
 /// <param name="defaultLemmatizeFlag">Lemmatize flag used unless the definition overrides it.</param>
 /// <param name="defaultEnabledFlag">Enabled flag used unless the definition overrides it.</param>
 public GazetteerTerm(string termDef, Gazetteer gazetteer, CaseMatchingType defaultCaseMatchingType, bool defaultLemmatizeFlag, bool defaultEnabledFlag)
 {
     // begin with the defaults; the evaluator below may overwrite them through the ref arguments
     CaseMatchingType effectiveCaseMatching = defaultCaseMatchingType;
     bool effectiveLemmatize = defaultLemmatizeFlag;
     bool effectiveEnabled = defaultEnabledFlag;

     // consume each embedded setting and strip it from the definition
     MatchEvaluator consumeSetting = delegate(Match settingMatch)
     {
         ParseGazetteerSettings(settingMatch.Value, ref effectiveCaseMatching, ref effectiveLemmatize, ref effectiveEnabled);
         return "";
     };
     termDef = mConstraintRegex.Replace(termDef, consumeSetting);

     ArrayList<string> tokenList = new ArrayList<string>();
     ArrayList<string> posConstraintList = new ArrayList<string>();
     for (Match microToken = mGazetteerMicroTokenRegex.Match(termDef); microToken.Success; microToken = microToken.NextMatch())
     {
         // only an exact "token/pos" shape is split; anything else is kept whole with no constraint
         string[] parts = microToken.Value.Split('/');
         bool hasPosConstraint = parts.Length == 2;
         tokenList.Add(hasPosConstraint ? parts[0] : microToken.Value);
         posConstraintList.Add(hasPosConstraint ? parts[1] : null);
     }
     InitializeInstance(tokenList, posConstraintList, effectiveLemmatize, effectiveCaseMatching, effectiveEnabled, gazetteer);
 }