/// <summary>
/// Matches every gazetteer in <c>e.mGazetteers</c> against every sentence of
/// <c>mTextBlocks</c> and reports the matches whose gazetteer conditions hold.
/// </summary>
/// <param name="e">Engine whose <c>mGazetteers</c> are matched.</param>
/// <param name="spans">
/// Receives the accepted spans exactly as produced by <c>Sentence.Match</c>;
/// the i-th span belongs to the i-th returned URI.
/// </param>
/// <returns>One gazetteer <c>mUri</c> per accepted span.</returns>
public ArrayList<string> DiscoverEntities(EntityRecognitionEngine e, out ArrayList<Pair<int, int>> spans)
{
    Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> hitsBySentence
        = new Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>>();
    Dictionary<TextBlock, Set<Gazetteer>> gazetteersByBlock = new Dictionary<TextBlock, Set<Gazetteer>>();
    Set<Gazetteer> gazetteersInDocument = new Set<Gazetteer>();

    // Pass 1: collect the non-empty match lists, grouped by sentence and gazetteer.
    foreach (KeyValuePair<string, Gazetteer> entry in e.mGazetteers)
    {
        Gazetteer current = entry.Value;
        foreach (TextBlock block in mTextBlocks)
        {
            foreach (Sentence sentence in block.mSentences)
            {
                ArrayList<Pair<int, int>> found;
                sentence.Match(current, out found);
                if (found.Count <= 0)
                {
                    continue;
                }
                Dictionary<Gazetteer, ArrayList<Pair<int, int>>> perGazetteer;
                if (!hitsBySentence.TryGetValue(sentence, out perGazetteer))
                {
                    perGazetteer = new Dictionary<Gazetteer, ArrayList<Pair<int, int>>>();
                    hitsBySentence.Add(sentence, perGazetteer);
                }
                perGazetteer.Add(current, found);
            }
        }
    }

    // Pass 2: lift the sentence-level hits to text-block and document level.
    foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceHits in hitsBySentence)
    {
        TextBlock owner = sentenceHits.Key.mTextBlock;
        foreach (Gazetteer matched in sentenceHits.Value.Keys)
        {
            gazetteersInDocument.Add(matched);
            Set<Gazetteer> blockSet;
            if (!gazetteersByBlock.TryGetValue(owner, out blockSet))
            {
                gazetteersByBlock.Add(owner, new Set<Gazetteer>(new Gazetteer[] { matched }));
            }
            else
            {
                blockSet.Add(matched);
            }
        }
    }

    // Pass 3: keep only the matches of gazetteers whose conditions are all satisfied.
    spans = new ArrayList<Pair<int, int>>();
    ArrayList<string> uris = new ArrayList<string>(); // gazetteer URIs
    foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceHits in hitsBySentence)
    {
        // Every sentence recorded in pass 1 has at least one hit, so pass 2 registered its block.
        Set<Gazetteer> blockGazetteers = gazetteersByBlock[sentenceHits.Key.mTextBlock];
        foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> hit in sentenceHits.Value)
        {
            Gazetteer candidate = hit.Key;
            bool accepted = true;
            foreach (Condition condition in candidate.mConditions)
            {
                // A condition with a level other than the three below is treated as satisfied.
                bool satisfied = true;
                if (condition.mLevel == Condition.Level.Document)
                {
                    satisfied = gazetteersInDocument.Contains(condition.mGazetteer);
                }
                else if (condition.mLevel == Condition.Level.Block)
                {
                    satisfied = blockGazetteers.Contains(condition.mGazetteer);
                }
                else if (condition.mLevel == Condition.Level.Sentence)
                {
                    satisfied = sentenceHits.Value.ContainsKey(condition.mGazetteer);
                }
                if (!satisfied)
                {
                    accepted = false;
                    break;
                }
            }
            if (!accepted)
            {
                continue;
            }
            foreach (Pair<int, int> span in hit.Value)
            {
                uris.Add(candidate.mUri);
                spans.Add(span);
            }
        }
    }
    return uris;
}
/// <summary>
/// Matches every gazetteer in <c>e.mGazetteers</c> against every sentence of
/// <c>mTextBlocks</c>, filters the matches by each gazetteer's conditions and
/// reports the surviving matches that are not nested inside another match.
/// </summary>
/// <param name="e">Engine whose <c>mGazetteers</c> are matched.</param>
/// <param name="spans">
/// Receives one span per reported entity, built from <c>mSpanStart</c> of the
/// token at the match's first index and <c>mSpanEnd</c> of the token at its
/// second index. The i-th span belongs to the i-th returned URI.
/// </param>
/// <returns>One gazetteer <c>mUri</c> per reported span.</returns>
public ArrayList<string> DiscoverEntities(EntityRecognitionEngine e, out ArrayList<Pair<int, int>> spans)
{
    Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceEntityInfo
        = new Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>>();
    Dictionary<TextBlock, Set<Gazetteer>> textBlockEntityInfo = new Dictionary<TextBlock, Set<Gazetteer>>();
    Set<Gazetteer> documentEntityInfo = new Set<Gazetteer>();

    // look for gazetteer terms
    foreach (KeyValuePair<string, Gazetteer> gazetteer in e.mGazetteers)
    {
        foreach (TextBlock textBlock in mTextBlocks)
        {
            foreach (Sentence sentence in textBlock.mSentences)
            {
                ArrayList<Pair<int, int>> sentenceSpans;
                sentence.Match(gazetteer.Value, out sentenceSpans);
                if (sentenceSpans.Count > 0)
                {
                    Dictionary<Gazetteer, ArrayList<Pair<int, int>>> sentenceInfo;
                    if (!sentenceEntityInfo.TryGetValue(sentence, out sentenceInfo))
                    {
                        sentenceInfo = new Dictionary<Gazetteer, ArrayList<Pair<int, int>>>();
                        sentenceEntityInfo.Add(sentence, sentenceInfo);
                    }
                    sentenceInfo.Add(gazetteer.Value, sentenceSpans);
                }
            }
        }
    }

    // propagate discovered entities to text-block and document level
    foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceInfo in sentenceEntityInfo)
    {
        foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazetteerInfo in sentenceInfo.Value)
        {
            documentEntityInfo.Add(gazetteerInfo.Key);
            TextBlock textBlock = sentenceInfo.Key.mTextBlock;
            Set<Gazetteer> textBlockInfo;
            if (textBlockEntityInfo.TryGetValue(textBlock, out textBlockInfo))
            {
                textBlockInfo.Add(gazetteerInfo.Key);
            }
            else
            {
                textBlockInfo = new Set<Gazetteer>(new Gazetteer[] { gazetteerInfo.Key });
                textBlockEntityInfo.Add(textBlock, textBlockInfo);
            }
        }
    }

    // check conditions
    spans = new ArrayList<Pair<int, int>>();
    ArrayList<string> discoveredEntities = new ArrayList<string>(); // gazetteer URIs
    foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceInfo in sentenceEntityInfo)
    {
        foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazetteerInfo in sentenceInfo.Value)
        {
            Gazetteer gazetteer = gazetteerInfo.Key;
            Set<Gazetteer> textBlockGazetteers = textBlockEntityInfo[sentenceInfo.Key.mTextBlock];
            bool valid = true;
            foreach (Condition condition in gazetteer.mConditions)
            {
                if (condition.mType == Condition.Type.Document)
                {
                    if (!documentEntityInfo.Contains(condition.mGazetteer)) { valid = false; break; }
                }
                else if (condition.mType == Condition.Type.Block)
                {
                    if (!textBlockGazetteers.Contains(condition.mGazetteer)) { valid = false; break; }
                }
                else if (condition.mType == Condition.Type.Sentence)
                {
                    if (!sentenceInfo.Value.ContainsKey(condition.mGazetteer)) { valid = false; break; }
                }
                else if (condition.mType == Condition.Type.FollowedBy)
                {
                    // fast check: the condition gazetteer must have matched in this sentence at all
                    ArrayList<Pair<int, int>> condSpans;
                    if (!sentenceInfo.Value.TryGetValue(condition.mGazetteer, out condSpans)) { valid = false; break; }
                    // thorough check: keep only spans whose end index is immediately
                    // before the start index of some condition span
                    ArrayList<Pair<int, int>> followedSpans = new ArrayList<Pair<int, int>>();
                    foreach (Pair<int, int> candidateSpan in gazetteerInfo.Value)
                    {
                        foreach (Pair<int, int> condSpan in condSpans)
                        {
                            if (candidateSpan.Second == condSpan.First - 1) // span is valid
                            {
                                followedSpans.Add(candidateSpan);
                                // One follower is enough. Without this break, a span followed by
                                // several condition spans that start at the same index was kept
                                // (and later reported) once per follower.
                                break;
                            }
                        }
                    }
                    if (followedSpans.Count == 0) { valid = false; break; }
                    // The filtered list replaces the stored matches in place, so later
                    // conditions and the nesting check below see only the filtered spans.
                    gazetteerInfo.Value.Clear();
                    gazetteerInfo.Value.AddRange(followedSpans);
                }
            }
            if (valid)
            {
                for (int i = 0; i < gazetteerInfo.Value.Count; i++)
                {
                    // check if inside another span; every gazetteer matched in this sentence
                    // is considered, whether or not its own conditions hold
                    // (relies on Pair's != operator to tell a span apart from itself)
                    bool skip = false;
                    Pair<int, int> span = gazetteerInfo.Value[i];
                    foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazInfo in sentenceInfo.Value)
                    {
                        foreach (Pair<int, int> otherSpan in gazInfo.Value)
                        {
                            if (span.First >= otherSpan.First && span.Second <= otherSpan.Second && span != otherSpan)
                            {
                                skip = true;
                                break;
                            }
                        }
                        if (skip) { break; }
                    }
                    if (!skip)
                    {
                        discoveredEntities.Add(gazetteer.mUri);
                        // convert the pair of token indices into a span taken from the tokens themselves
                        spans.Add(new Pair<int, int>(sentenceInfo.Key.mTokens[span.First].mSpanStart, sentenceInfo.Key.mTokens[span.Second].mSpanEnd));
                    }
                }
            }
        }
    }
    return discoveredEntities;
}
/// <summary>
/// Runs all gazetteers of <paramref name="e"/> over the sentences of
/// <c>mTextBlocks</c> and returns the matches of those gazetteers whose
/// document-, block- and sentence-level conditions are all met.
/// </summary>
/// <param name="e">Engine providing the gazetteers (<c>mGazetteers</c>).</param>
/// <param name="spans">
/// Receives the accepted spans as returned by <c>Sentence.Match</c>, in the
/// same order as the returned URIs.
/// </param>
/// <returns>The <c>mUri</c> of the matching gazetteer, repeated once per accepted span.</returns>
public ArrayList<string> DiscoverEntities(EntityRecognitionEngine e, out ArrayList<Pair<int, int>> spans)
{
    Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> matchesPerSentence =
        new Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>>();
    Dictionary<TextBlock, Set<Gazetteer>> seenInBlock = new Dictionary<TextBlock, Set<Gazetteer>>();
    Set<Gazetteer> seenInDocument = new Set<Gazetteer>();
    ArrayList<Pair<int, int>> matchSpans;

    // Step 1: record every (sentence, gazetteer) pair that produced at least one span.
    foreach (KeyValuePair<string, Gazetteer> namedGazetteer in e.mGazetteers)
    {
        foreach (TextBlock block in mTextBlocks)
        {
            foreach (Sentence sentence in block.mSentences)
            {
                sentence.Match(namedGazetteer.Value, out matchSpans);
                if (matchSpans.Count > 0)
                {
                    Dictionary<Gazetteer, ArrayList<Pair<int, int>>> bucket;
                    bool known = matchesPerSentence.TryGetValue(sentence, out bucket);
                    if (!known)
                    {
                        bucket = new Dictionary<Gazetteer, ArrayList<Pair<int, int>>>();
                    }
                    bucket.Add(namedGazetteer.Value, matchSpans);
                    if (!known)
                    {
                        matchesPerSentence.Add(sentence, bucket);
                    }
                }
            }
        }
    }

    // Step 2: remember which gazetteers occurred in each text block and in the document.
    foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> perSentence in matchesPerSentence)
    {
        foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> perGazetteer in perSentence.Value)
        {
            seenInDocument.Add(perGazetteer.Key);
            TextBlock block = perSentence.Key.mTextBlock;
            Set<Gazetteer> blockSet;
            if (seenInBlock.TryGetValue(block, out blockSet))
            {
                blockSet.Add(perGazetteer.Key);
                continue;
            }
            blockSet = new Set<Gazetteer>(new Gazetteer[] { perGazetteer.Key });
            seenInBlock.Add(block, blockSet);
        }
    }

    // Step 3: emit the spans of each gazetteer that passes all of its conditions.
    spans = new ArrayList<Pair<int, int>>();
    ArrayList<string> result = new ArrayList<string>(); // gazetteer URIs
    foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> perSentence in matchesPerSentence)
    {
        foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> perGazetteer in perSentence.Value)
        {
            Gazetteer owner = perGazetteer.Key;
            Set<Gazetteer> blockSet = seenInBlock[perSentence.Key.mTextBlock];
            bool rejected = false;
            foreach (Condition condition in owner.mConditions)
            {
                // A condition is violated when its level is one of the three known
                // levels and the required gazetteer is absent at that level.
                bool violated =
                    (condition.mLevel == Condition.Level.Document && !seenInDocument.Contains(condition.mGazetteer))
                    || (condition.mLevel == Condition.Level.Block && !blockSet.Contains(condition.mGazetteer))
                    || (condition.mLevel == Condition.Level.Sentence && !perSentence.Value.ContainsKey(condition.mGazetteer));
                if (violated)
                {
                    rejected = true;
                    break;
                }
            }
            if (rejected)
            {
                continue;
            }
            // One URI per span keeps the two output lists aligned.
            int spanCount = perGazetteer.Value.Count;
            for (int added = 0; added < spanCount; added++)
            {
                result.Add(owner.mUri);
            }
            spans.AddRange(perGazetteer.Value);
        }
    }
    return result;
}
/// <summary>
/// Matches every gazetteer in <c>e.mGazetteers</c> against every sentence of
/// <c>mTextBlocks</c>, applies each gazetteer's conditions (Document, Block,
/// Sentence, FollowedBy) and reports the surviving matches that do not lie
/// inside another match of the same sentence.
/// </summary>
/// <param name="e">Engine whose <c>mGazetteers</c> are matched.</param>
/// <param name="spans">
/// Receives one span per reported entity: <c>mSpanStart</c> of the token at
/// the match's first index paired with <c>mSpanEnd</c> of the token at its
/// second index. Entries are aligned with the returned URIs.
/// </param>
/// <returns>One gazetteer <c>mUri</c> per reported span.</returns>
public ArrayList<string> DiscoverEntities(EntityRecognitionEngine e, out ArrayList<Pair<int, int>> spans)
{
    Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceEntityInfo =
        new Dictionary<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>>();
    Dictionary<TextBlock, Set<Gazetteer>> textBlockEntityInfo = new Dictionary<TextBlock, Set<Gazetteer>>();
    Set<Gazetteer> documentEntityInfo = new Set<Gazetteer>();

    // look for gazetteer terms
    foreach (KeyValuePair<string, Gazetteer> gazetteer in e.mGazetteers)
    {
        foreach (TextBlock textBlock in mTextBlocks)
        {
            foreach (Sentence sentence in textBlock.mSentences)
            {
                ArrayList<Pair<int, int>> sentenceSpans;
                sentence.Match(gazetteer.Value, out sentenceSpans);
                if (sentenceSpans.Count > 0)
                {
                    Dictionary<Gazetteer, ArrayList<Pair<int, int>>> sentenceInfo;
                    if (!sentenceEntityInfo.TryGetValue(sentence, out sentenceInfo))
                    {
                        sentenceInfo = new Dictionary<Gazetteer, ArrayList<Pair<int, int>>>();
                        sentenceEntityInfo.Add(sentence, sentenceInfo);
                    }
                    sentenceInfo.Add(gazetteer.Value, sentenceSpans);
                }
            }
        }
    }

    // propagate discovered entities to text-block and document level
    foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceInfo in sentenceEntityInfo)
    {
        foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazetteerInfo in sentenceInfo.Value)
        {
            documentEntityInfo.Add(gazetteerInfo.Key);
            TextBlock textBlock = sentenceInfo.Key.mTextBlock;
            Set<Gazetteer> textBlockInfo;
            if (textBlockEntityInfo.TryGetValue(textBlock, out textBlockInfo))
            {
                textBlockInfo.Add(gazetteerInfo.Key);
            }
            else
            {
                textBlockInfo = new Set<Gazetteer>(new Gazetteer[] { gazetteerInfo.Key });
                textBlockEntityInfo.Add(textBlock, textBlockInfo);
            }
        }
    }

    // check conditions
    spans = new ArrayList<Pair<int, int>>();
    ArrayList<string> discoveredEntities = new ArrayList<string>(); // gazetteer URIs
    foreach (KeyValuePair<Sentence, Dictionary<Gazetteer, ArrayList<Pair<int, int>>>> sentenceInfo in sentenceEntityInfo)
    {
        foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazetteerInfo in sentenceInfo.Value)
        {
            Gazetteer gazetteer = gazetteerInfo.Key;
            Set<Gazetteer> textBlockGazetteers = textBlockEntityInfo[sentenceInfo.Key.mTextBlock];
            bool valid = true;
            foreach (Condition condition in gazetteer.mConditions)
            {
                if (condition.mType == Condition.Type.Document)
                {
                    if (!documentEntityInfo.Contains(condition.mGazetteer)) { valid = false; break; }
                }
                else if (condition.mType == Condition.Type.Block)
                {
                    if (!textBlockGazetteers.Contains(condition.mGazetteer)) { valid = false; break; }
                }
                else if (condition.mType == Condition.Type.Sentence)
                {
                    if (!sentenceInfo.Value.ContainsKey(condition.mGazetteer)) { valid = false; break; }
                }
                else if (condition.mType == Condition.Type.FollowedBy)
                {
                    // fast check: the condition gazetteer must have matched in this sentence at all
                    ArrayList<Pair<int, int>> condSpans;
                    if (!sentenceInfo.Value.TryGetValue(condition.mGazetteer, out condSpans)) { valid = false; break; }
                    // thorough check: a span survives when some condition span starts
                    // at the index right after the span's end index
                    ArrayList<Pair<int, int>> survivingSpans = new ArrayList<Pair<int, int>>();
                    foreach (Pair<int, int> candidateSpan in gazetteerInfo.Value)
                    {
                        foreach (Pair<int, int> condSpan in condSpans)
                        {
                            if (candidateSpan.Second == condSpan.First - 1) // span is valid
                            {
                                survivingSpans.Add(candidateSpan);
                                // Stop at the first follower: previously the span was added once
                                // per matching condition span, so it ended up duplicated in the
                                // output when several condition spans started at the same index.
                                break;
                            }
                        }
                    }
                    if (survivingSpans.Count == 0) { valid = false; break; }
                    // Replace the stored matches in place so that later conditions and the
                    // nesting check below operate on the filtered spans only.
                    gazetteerInfo.Value.Clear();
                    gazetteerInfo.Value.AddRange(survivingSpans);
                }
            }
            if (valid)
            {
                for (int i = 0; i < gazetteerInfo.Value.Count; i++)
                {
                    // check if inside another span; spans of all gazetteers matched in this
                    // sentence are considered, including those whose conditions failed
                    // (relies on Pair's != operator to exclude the span itself)
                    bool skip = false;
                    Pair<int, int> span = gazetteerInfo.Value[i];
                    foreach (KeyValuePair<Gazetteer, ArrayList<Pair<int, int>>> gazInfo in sentenceInfo.Value)
                    {
                        foreach (Pair<int, int> otherSpan in gazInfo.Value)
                        {
                            if (span.First >= otherSpan.First && span.Second <= otherSpan.Second && span != otherSpan)
                            {
                                skip = true;
                                break;
                            }
                        }
                        if (skip) { break; }
                    }
                    if (!skip)
                    {
                        discoveredEntities.Add(gazetteer.mUri);
                        // translate the pair of token indices into a span taken from the tokens themselves
                        spans.Add(new Pair<int, int>(sentenceInfo.Key.mTokens[span.First].mSpanStart, sentenceInfo.Key.mTokens[span.Second].mSpanEnd));
                    }
                }
            }
        }
    }
    return discoveredEntities;
}