Beispiel #1
0
            /// <summary>
            /// Matches every gazetteer of <paramref name="e"/> against every sentence of this object's text blocks
            /// and reports the matches whose gazetteer conditions are all satisfied.
            /// </summary>
            /// <param name="e">Engine whose <c>mGazetteers</c> are matched against the sentences.</param>
            /// <param name="spans">
            /// Receives the spans of all accepted matches, exactly as produced by <c>Sentence.Match</c>.
            /// Entry <c>i</c> belongs to entry <c>i</c> of the returned list.
            /// </param>
            /// <returns>The <c>mUri</c> of the matching gazetteer, once per accepted span.</returns>
            public ArrayList <string> DiscoverEntities(EntityRecognitionEngine e, out ArrayList <Pair <int, int> > spans)
            {
                // Bookkeeping: matches per sentence, matched gazetteers per text block and per document.
                Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> hitsBySentence
                    = new Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>>();
                Dictionary<TextBlock, Set<Gazetteer>> gazetteersByBlock
                    = new Dictionary<TextBlock, Set<Gazetteer>>();
                Set<Gazetteer> gazetteersInDocument = new Set<Gazetteer>();
                ArrayList<Pair<int, int>> hits = new ArrayList<Pair<int, int>>();

                // Pass 1: collect the raw matches of each gazetteer in each sentence.
                foreach (KeyValuePair<string, Gazetteer> entry in e.mGazetteers)
                {
                    Gazetteer candidate = entry.Value;
                    foreach (TextBlock block in mTextBlocks)
                    {
                        foreach (Sentence sentence in block.mSentences)
                        {
                            sentence.Match(candidate, out hits);
                            if (hits.Count <= 0)
                            {
                                continue;
                            }
                            Dictionary<Gazetteer, ArrayList<Pair<int, int>>> inSentence;
                            if (!hitsBySentence.TryGetValue(sentence, out inSentence))
                            {
                                inSentence = new Dictionary<Gazetteer, ArrayList<Pair<int, int>>>();
                                hitsBySentence.Add(sentence, inSentence);
                            }
                            inSentence.Add(candidate, hits);
                        }
                    }
                }

                // Pass 2: lift the sentence-level findings to text-block and document level.
                foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> perSentence in hitsBySentence)
                {
                    TextBlock owner = perSentence.Key.mTextBlock;
                    foreach (Gazetteer found in perSentence.Value.Keys)
                    {
                        gazetteersInDocument.Add(found);
                        Set<Gazetteer> inBlock;
                        if (!gazetteersByBlock.TryGetValue(owner, out inBlock))
                        {
                            gazetteersByBlock.Add(owner, new Set<Gazetteer>(new Gazetteer[] { found }));
                        }
                        else
                        {
                            inBlock.Add(found);
                        }
                    }
                }

                // Pass 3: keep only the matches whose gazetteer has all of its conditions satisfied.
                spans = new ArrayList<Pair<int, int>>();
                ArrayList<string> uris = new ArrayList<string>(); // gazetteer URIs
                foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> perSentence in hitsBySentence)
                {
                    Dictionary<Gazetteer, ArrayList<Pair<int, int>>> inSentence = perSentence.Value;
                    Set<Gazetteer> inBlock = gazetteersByBlock[perSentence.Key.mTextBlock];
                    foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> hit in inSentence)
                    {
                        bool accepted = true;
                        foreach (Condition rule in hit.Key.mConditions)
                        {
                            // A condition requires its gazetteer to have matched at the given level.
                            if (rule.mLevel == Condition.Level.Document)
                            {
                                accepted = gazetteersInDocument.Contains(rule.mGazetteer);
                            }
                            else if (rule.mLevel == Condition.Level.Block)
                            {
                                accepted = inBlock.Contains(rule.mGazetteer);
                            }
                            else if (rule.mLevel == Condition.Level.Sentence)
                            {
                                accepted = inSentence.ContainsKey(rule.mGazetteer);
                            }
                            if (!accepted)
                            {
                                break;
                            }
                        }
                        if (!accepted)
                        {
                            continue;
                        }
                        // One URI per span keeps the two output lists aligned.
                        spans.AddRange(hit.Value);
                        for (int remaining = hit.Value.Count; remaining > 0; remaining--)
                        {
                            uris.Add(hit.Key.mUri);
                        }
                    }
                }
                return uris;
            }
Beispiel #2
0
            /// <summary>
            /// Matches every gazetteer of <paramref name="e"/> against every sentence of this object's text blocks
            /// and reports the matches that satisfy all conditions of their gazetteer and are not contained in
            /// a different matched span of the same sentence.
            /// </summary>
            /// <remarks>
            /// Condition types: <c>Document</c>, <c>Block</c> and <c>Sentence</c> require the condition gazetteer
            /// to have matched somewhere in the document, in the same text block, or in the same sentence.
            /// <c>FollowedBy</c> keeps only spans whose last token index is immediately followed by the first
            /// token index of a match of the condition gazetteer in the same sentence.
            /// </remarks>
            /// <param name="e">Engine whose <c>mGazetteers</c> are matched against the sentences.</param>
            /// <param name="spans">
            /// Receives one pair per reported entity: <c>mSpanStart</c> of the first matched token and
            /// <c>mSpanEnd</c> of the last matched token. Entry <c>i</c> belongs to entry <c>i</c> of the result.
            /// </param>
            /// <returns>The <c>mUri</c> of the matching gazetteer, once per reported span.</returns>
            public ArrayList <string> DiscoverEntities(EntityRecognitionEngine e, out ArrayList <Pair <int, int> > spans)
            {
                Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceEntityInfo
                    = new Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>>();
                Dictionary<TextBlock, Set<Gazetteer>> textBlockEntityInfo
                    = new Dictionary<TextBlock, Set<Gazetteer>>();
                Set<Gazetteer> documentEntityInfo = new Set<Gazetteer>();

                // look for gazetteer terms
                foreach (KeyValuePair<string, Gazetteer> gazetteer in e.mGazetteers)
                {
                    foreach (TextBlock textBlock in mTextBlocks)
                    {
                        foreach (Sentence sentence in textBlock.mSentences)
                        {
                            // Match assigns the out argument, so no list needs to be allocated up front.
                            ArrayList<Pair<int, int>> sentenceSpans;
                            sentence.Match(gazetteer.Value, out sentenceSpans);
                            if (sentenceSpans.Count > 0)
                            {
                                Dictionary<Gazetteer, ArrayList<Pair<int, int>>> sentenceInfo;
                                if (!sentenceEntityInfo.TryGetValue(sentence, out sentenceInfo))
                                {
                                    sentenceInfo = new Dictionary<Gazetteer, ArrayList<Pair<int, int>>>();
                                    sentenceEntityInfo.Add(sentence, sentenceInfo);
                                }
                                sentenceInfo.Add(gazetteer.Value, sentenceSpans);
                            }
                        }
                    }
                }

                // propagate discovered entities to text-block and document level
                foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceInfo in sentenceEntityInfo)
                {
                    foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazetteerInfo in sentenceInfo.Value)
                    {
                        documentEntityInfo.Add(gazetteerInfo.Key);
                        TextBlock textBlock = sentenceInfo.Key.mTextBlock;
                        Set<Gazetteer> textBlockInfo;
                        if (textBlockEntityInfo.TryGetValue(textBlock, out textBlockInfo))
                        {
                            textBlockInfo.Add(gazetteerInfo.Key);
                        }
                        else
                        {
                            textBlockInfo = new Set<Gazetteer>(new Gazetteer[] { gazetteerInfo.Key });
                            textBlockEntityInfo.Add(textBlock, textBlockInfo);
                        }
                    }
                }

                // check conditions
                spans = new ArrayList<Pair<int, int>>();
                ArrayList<string> discoveredEntities = new ArrayList<string>(); // gazetteer URIs

                foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceInfo in sentenceEntityInfo)
                {
                    foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazetteerInfo in sentenceInfo.Value)
                    {
                        Gazetteer gazetteer = gazetteerInfo.Key;
                        Set<Gazetteer> textBlockGazetteers = textBlockEntityInfo[sentenceInfo.Key.mTextBlock];
                        bool valid = true;
                        foreach (Condition condition in gazetteer.mConditions)
                        {
                            if (condition.mType == Condition.Type.Document)
                            {
                                if (!documentEntityInfo.Contains(condition.mGazetteer))
                                {
                                    valid = false; break;
                                }
                            }
                            else if (condition.mType == Condition.Type.Block)
                            {
                                if (!textBlockGazetteers.Contains(condition.mGazetteer))
                                {
                                    valid = false; break;
                                }
                            }
                            else if (condition.mType == Condition.Type.Sentence)
                            {
                                if (!sentenceInfo.Value.ContainsKey(condition.mGazetteer))
                                {
                                    valid = false; break;
                                }
                            }
                            else if (condition.mType == Condition.Type.FollowedBy)
                            {
                                // fast check: the condition gazetteer must occur in this sentence at all
                                ArrayList<Pair<int, int>> condSpans;
                                if (!sentenceInfo.Value.TryGetValue(condition.mGazetteer, out condSpans))
                                {
                                    valid = false; break;
                                }
                                // thorough check: keep the spans that are directly followed by a condition span
                                ArrayList<Pair<int, int>> followed = new ArrayList<Pair<int, int>>();
                                foreach (Pair<int, int> span in gazetteerInfo.Value)
                                {
                                    foreach (Pair<int, int> condSpan in condSpans)
                                    {
                                        if (span.Second == condSpan.First - 1)
                                        {
                                            followed.Add(span);
                                            // One follower is enough. Without this break a span with several
                                            // followers starting at the same token was added (and later
                                            // reported) once per follower.
                                            break;
                                        }
                                    }
                                }
                                if (followed.Count == 0)
                                {
                                    valid = false; break;
                                }
                                // NOTE(review): the stored list is narrowed in place, so checks made later for
                                // other gazetteers of this sentence see the narrowed list - confirm intended.
                                gazetteerInfo.Value.Clear();
                                gazetteerInfo.Value.AddRange(followed);
                            }
                        }
                        if (valid)
                        {
                            for (int i = 0; i < gazetteerInfo.Value.Count; i++)
                            {
                                // check if inside another (different) span matched in the same sentence
                                bool skip = false;
                                Pair<int, int> span = gazetteerInfo.Value[i];
                                foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazInfo in sentenceInfo.Value)
                                {
                                    foreach (Pair<int, int> otherSpan in gazInfo.Value)
                                    {
                                        if (span.First >= otherSpan.First && span.Second <= otherSpan.Second && span != otherSpan)
                                        {
                                            skip = true;
                                            break;
                                        }
                                    }
                                    if (skip)
                                    {
                                        break;
                                    }
                                }
                                if (!skip)
                                {
                                    discoveredEntities.Add(gazetteer.mUri);
                                    // Convert the token-index span into the span start/end stored on the tokens.
                                    spans.Add(new Pair<int, int>(sentenceInfo.Key.mTokens[span.First].mSpanStart, sentenceInfo.Key.mTokens[span.Second].mSpanEnd));
                                }
                            }
                        }
                    }
                }
                return discoveredEntities;
            }
 /// <summary>
 /// Collects the matches of all gazetteers of <paramref name="e"/> in the sentences of this object's
 /// text blocks and returns those whose gazetteer conditions all hold.
 /// </summary>
 /// <param name="e">Engine providing the gazetteers (<c>mGazetteers</c>) to match.</param>
 /// <param name="spans">
 /// Receives the accepted spans as returned by <c>Sentence.Match</c>; entry <c>i</c> corresponds to
 /// entry <c>i</c> of the returned list.
 /// </param>
 /// <returns>A list holding the gazetteer <c>mUri</c> once for each accepted span.</returns>
 public ArrayList<string> DiscoverEntities(EntityRecognitionEngine e, out ArrayList<Pair<int, int>> spans)
 {
     Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> matchesPerSentence
         = new Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>>();
     Dictionary<TextBlock, Set<Gazetteer>> matchedPerBlock = new Dictionary<TextBlock, Set<Gazetteer>>();
     Set<Gazetteer> matchedInDocument = new Set<Gazetteer>();
     ArrayList<Pair<int, int>> found = new ArrayList<Pair<int, int>>();

     // Step 1: run each gazetteer over each sentence and remember the non-empty results.
     foreach (KeyValuePair<string, Gazetteer> named in e.mGazetteers)
     {
         foreach (TextBlock block in mTextBlocks)
         {
             foreach (Sentence current in block.mSentences)
             {
                 current.Match(named.Value, out found);
                 if (found.Count > 0)
                 {
                     Dictionary<Gazetteer, ArrayList<Pair<int, int>>> bucket;
                     if (!matchesPerSentence.TryGetValue(current, out bucket))
                     {
                         bucket = new Dictionary<Gazetteer, ArrayList<Pair<int, int>>>();
                         matchesPerSentence.Add(current, bucket);
                     }
                     bucket.Add(named.Value, found);
                 }
             }
         }
     }

     // Step 2: record which gazetteers matched in each text block and in the document as a whole.
     foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> pair in matchesPerSentence)
     {
         foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> match in pair.Value)
         {
             Gazetteer matched = match.Key;
             matchedInDocument.Add(matched);
             TextBlock parent = pair.Key.mTextBlock;
             Set<Gazetteer> blockSet;
             bool known = matchedPerBlock.TryGetValue(parent, out blockSet);
             if (known)
             {
                 blockSet.Add(matched);
             }
             else
             {
                 blockSet = new Set<Gazetteer>(new Gazetteer[] { matched });
                 matchedPerBlock.Add(parent, blockSet);
             }
         }
     }

     // Step 3: evaluate the conditions of every matched gazetteer and emit the survivors.
     spans = new ArrayList<Pair<int, int>>();
     ArrayList<string> result = new ArrayList<string>(); // gazetteer URIs
     foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> pair in matchesPerSentence)
     {
         foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> match in pair.Value)
         {
             Gazetteer owner = match.Key;
             Set<Gazetteer> blockSet = matchedPerBlock[pair.Key.mTextBlock];
             bool rejected = false;
             foreach (Condition requirement in owner.mConditions)
             {
                 // A requirement fails when its gazetteer did not match at the requested level.
                 bool violated =
                     (requirement.mLevel == Condition.Level.Document && !matchedInDocument.Contains(requirement.mGazetteer))
                     || (requirement.mLevel == Condition.Level.Block && !blockSet.Contains(requirement.mGazetteer))
                     || (requirement.mLevel == Condition.Level.Sentence && !pair.Value.ContainsKey(requirement.mGazetteer));
                 if (violated)
                 {
                     rejected = true;
                     break;
                 }
             }
             if (rejected)
             {
                 continue;
             }
             // Emit the URI once per span so both output lists have matching lengths.
             int total = match.Value.Count;
             for (int k = 0; k < total; k++)
             {
                 result.Add(owner.mUri);
             }
             spans.AddRange(match.Value);
         }
     }
     return result;
 }
 /// <summary>
 /// Matches every gazetteer of <paramref name="e"/> against every sentence of this object's text blocks
 /// and reports the matches that satisfy all conditions of their gazetteer and are not contained in
 /// a different matched span of the same sentence.
 /// </summary>
 /// <remarks>
 /// Condition types: <c>Document</c>, <c>Block</c> and <c>Sentence</c> require the condition gazetteer
 /// to have matched somewhere in the document, in the same text block, or in the same sentence.
 /// <c>FollowedBy</c> keeps only spans whose last token index is immediately followed by the first
 /// token index of a match of the condition gazetteer in the same sentence.
 /// </remarks>
 /// <param name="e">Engine whose <c>mGazetteers</c> are matched against the sentences.</param>
 /// <param name="spans">
 /// Receives one pair per reported entity: <c>mSpanStart</c> of the first matched token and
 /// <c>mSpanEnd</c> of the last matched token. Entry <c>i</c> belongs to entry <c>i</c> of the result.
 /// </param>
 /// <returns>The <c>mUri</c> of the matching gazetteer, once per reported span.</returns>
 public ArrayList<string> DiscoverEntities(EntityRecognitionEngine e, out ArrayList<Pair<int, int>> spans)
 {
     Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceEntityInfo
         = new Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>>();
     Dictionary<TextBlock, Set<Gazetteer>> textBlockEntityInfo
         = new Dictionary<TextBlock, Set<Gazetteer>>();
     Set<Gazetteer> documentEntityInfo
         = new Set<Gazetteer>();
     // look for gazetteer terms
     foreach (KeyValuePair<string, Gazetteer> gazetteer in e.mGazetteers)
     {
         foreach (TextBlock textBlock in mTextBlocks)
         {
             foreach (Sentence sentence in textBlock.mSentences)
             {
                 // Match assigns the out argument, so no list needs to be allocated up front.
                 ArrayList<Pair<int, int>> sentenceSpans;
                 sentence.Match(gazetteer.Value, out sentenceSpans);
                 if (sentenceSpans.Count > 0)
                 {
                     Dictionary<Gazetteer, ArrayList<Pair<int, int>>> sentenceInfo;
                     if (!sentenceEntityInfo.TryGetValue(sentence, out sentenceInfo))
                     {
                         sentenceInfo = new Dictionary<Gazetteer, ArrayList<Pair<int, int>>>();
                         sentenceEntityInfo.Add(sentence, sentenceInfo);
                     }
                     sentenceInfo.Add(gazetteer.Value, sentenceSpans);
                 }
             }
         }
     }
     // propagate discovered entities to text-block and document level
     foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceInfo in sentenceEntityInfo)
     {
         foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazetteerInfo in sentenceInfo.Value)
         {
             documentEntityInfo.Add(gazetteerInfo.Key);
             TextBlock textBlock = sentenceInfo.Key.mTextBlock;
             Set<Gazetteer> textBlockInfo;
             if (textBlockEntityInfo.TryGetValue(textBlock, out textBlockInfo))
             {
                 textBlockInfo.Add(gazetteerInfo.Key);
             }
             else
             {
                 textBlockInfo = new Set<Gazetteer>(new Gazetteer[] { gazetteerInfo.Key });
                 textBlockEntityInfo.Add(textBlock, textBlockInfo);
             }
         }
     }
     // check conditions
     spans = new ArrayList<Pair<int, int>>();
     ArrayList<string> discoveredEntities = new ArrayList<string>(); // gazetteer URIs
     foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceInfo in sentenceEntityInfo)
     {
         foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazetteerInfo in sentenceInfo.Value)
         {
             Gazetteer gazetteer = gazetteerInfo.Key;
             Set<Gazetteer> textBlockGazetteers = textBlockEntityInfo[sentenceInfo.Key.mTextBlock];
             bool valid = true;
             foreach (Condition condition in gazetteer.mConditions)
             {
                 if (condition.mType == Condition.Type.Document)
                 {
                     if (!documentEntityInfo.Contains(condition.mGazetteer)) { valid = false; break; }
                 }
                 else if (condition.mType == Condition.Type.Block)
                 {
                     if (!textBlockGazetteers.Contains(condition.mGazetteer)) { valid = false; break; }
                 }
                 else if (condition.mType == Condition.Type.Sentence)
                 {
                     if (!sentenceInfo.Value.ContainsKey(condition.mGazetteer)) { valid = false; break; }
                 }
                 else if (condition.mType == Condition.Type.FollowedBy)
                 {
                     // fast check: the condition gazetteer must occur in this sentence at all
                     ArrayList<Pair<int, int>> condSpans;
                     if (!sentenceInfo.Value.TryGetValue(condition.mGazetteer, out condSpans)) { valid = false; break; }
                     // thorough check: keep the spans that are directly followed by a condition span
                     ArrayList<Pair<int, int>> followed = new ArrayList<Pair<int, int>>();
                     foreach (Pair<int, int> span in gazetteerInfo.Value)
                     {
                         foreach (Pair<int, int> condSpan in condSpans)
                         {
                             if (span.Second == condSpan.First - 1)
                             {
                                 followed.Add(span);
                                 // One follower is enough. Without this break a span with several
                                 // followers starting at the same token was added (and later
                                 // reported) once per follower.
                                 break;
                             }
                         }
                     }
                     if (followed.Count == 0) { valid = false; break; }
                     // NOTE(review): the stored list is narrowed in place, so checks made later for
                     // other gazetteers of this sentence see the narrowed list - confirm intended.
                     gazetteerInfo.Value.Clear();
                     gazetteerInfo.Value.AddRange(followed);
                 }
             }
             if (valid)
             {
                 for (int i = 0; i < gazetteerInfo.Value.Count; i++)
                 {
                     // check if inside another (different) span matched in the same sentence
                     bool skip = false;
                     Pair<int, int> span = gazetteerInfo.Value[i];
                     foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazInfo in sentenceInfo.Value)
                     {
                         foreach (Pair<int, int> otherSpan in gazInfo.Value)
                         {
                             if (span.First >= otherSpan.First && span.Second <= otherSpan.Second && span != otherSpan)
                             {
                                 skip = true;
                                 break;
                             }
                         }
                         if (skip) { break; }
                     }
                     if (!skip)
                     {
                         discoveredEntities.Add(gazetteer.mUri);
                         // Convert the token-index span into the span start/end stored on the tokens.
                         spans.Add(new Pair<int, int>(sentenceInfo.Key.mTokens[span.First].mSpanStart, sentenceInfo.Key.mTokens[span.Second].mSpanEnd));
                     }
                 }
             }
         }
     }
     return discoveredEntities;
 }