/// <summary>
/// Runs entity recognition over a document and attaches one annotation per
/// discovered entity that resolves to an identified instance.
/// </summary>
/// <remarks>
/// Documents whose "contentType" feature is not "Text" are left untouched.
/// Any exception raised during processing is logged through mLogger and not rethrown.
/// </remarks>
/// <param name="document">The document to read blocks from and add annotations to.</param>
public override void ProcessDocument(Document document)
{
    if (document.Features.GetFeatureValue("contentType") != "Text")
    {
        return;
    }
    try
    {
        document.CreateAnnotationIndex();
        EntityRecognitionEngine.Document engineDoc = new EntityRecognitionEngine.Document();

        // Feed the engine: one engine text block per selected block, one sentence per "Sentence" annotation.
        foreach (TextBlock block in document.GetAnnotatedBlocks(mBlockSelector))
        {
            engineDoc.BeginNewTextBlock();
            // *** sentence selector hardcoded
            foreach (TextBlock sentence in document.GetAnnotatedBlocks("Sentence", block.SpanStart, block.SpanEnd))
            {
                ArrayList<string> words = new ArrayList<string>();
                ArrayList<string> tags = new ArrayList<string>();
                ArrayList<int> offsets = new ArrayList<int>();
                // *** token selector hardcoded
                foreach (TextBlock word in document.GetAnnotatedBlocks("Token", sentence.SpanStart, sentence.SpanEnd))
                {
                    words.Add(word.Text);
                    // *** POS tag feature name hardcoded
                    tags.Add(word.Annotation.Features.GetFeatureValue("posTag"));
                    offsets.Add(word.SpanStart);
                }
                engineDoc.AddSentence(words, offsets, tags);
            }
        }

        ArrayList<Pair<int, int>> entitySpans;
        ArrayList<string> gazetteerUris = engineDoc.DiscoverEntities(mEntityRecognitionEngine, out entitySpans);

        // entitySpans is indexed in step with gazetteerUris, so the counter advances for every entity,
        // including the ones that are skipped below.
        int nextIdx = 0;
        foreach (string gazetteerUri in gazetteerUris)
        {
            int idx = nextIdx++;
            string instanceUri = mEntityRecognitionEngine.GetIdentifiedInstance(gazetteerUri);
            if (instanceUri == null)
            {
                continue;
            }
            string annotationName = GetAnnotationName(mEntityRecognitionEngine.GetInstanceClassPath(instanceUri));
            Annotation entityAnnotation = new Annotation(entitySpans[idx].First, entitySpans[idx].Second, annotationName);
            document.AddAnnotation(entityAnnotation);
            entityAnnotation.Features.SetFeatureValue("gazetteerUri", gazetteerUri);
            entityAnnotation.Features.SetFeatureValue("instanceUri", instanceUri);
            entityAnnotation.Features.SetFeatureValue("instanceClassUri", mEntityRecognitionEngine.GetInstanceClass(instanceUri));
            // TODO: instanceLabel, instanceClassLabel
        }
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}
/// <summary>
/// Collects the tokens of a text document sentence by sentence, passes them to the
/// entity recognition engine, and records each identified entity as an annotation.
/// </summary>
/// <remarks>
/// Only documents whose "contentType" feature equals "Text" are processed.
/// Exceptions are caught here and reported via mLogger.Error; none propagate to the caller.
/// </remarks>
/// <param name="document">The document whose annotated blocks are read and which receives the new annotations.</param>
public override void ProcessDocument(Document document)
{
    string contentType = document.Features.GetFeatureValue("contentType");
    if (!"Text".Equals(contentType))
    {
        return;
    }

    try
    {
        document.CreateAnnotationIndex();

        // Step 1: mirror the document's block / sentence / token structure in the engine's own document.
        EntityRecognitionEngine.Document recognizerInput = new EntityRecognitionEngine.Document();
        foreach (TextBlock textBlock in document.GetAnnotatedBlocks(mBlockSelector))
        {
            recognizerInput.BeginNewTextBlock();

            // *** sentence selector hardcoded
            foreach (TextBlock sentenceBlock in document.GetAnnotatedBlocks("Sentence", textBlock.SpanStart, textBlock.SpanEnd))
            {
                ArrayList<string> tokenTexts = new ArrayList<string>();
                ArrayList<string> tokenPosTags = new ArrayList<string>();
                ArrayList<int> tokenStarts = new ArrayList<int>();

                // *** token selector hardcoded
                foreach (TextBlock tokenBlock in document.GetAnnotatedBlocks("Token", sentenceBlock.SpanStart, sentenceBlock.SpanEnd))
                {
                    tokenTexts.Add(tokenBlock.Text);
                    // *** POS tag feature name hardcoded
                    tokenPosTags.Add(tokenBlock.Annotation.Features.GetFeatureValue("posTag"));
                    tokenStarts.Add(tokenBlock.SpanStart);
                }

                recognizerInput.AddSentence(tokenTexts, tokenStarts, tokenPosTags);
            }
        }

        // Step 2: run discovery; foundSpans[k] is used as the span of the k-th returned entity.
        ArrayList<Pair<int, int>> foundSpans;
        ArrayList<string> foundEntities = recognizerInput.DiscoverEntities(mEntityRecognitionEngine, out foundSpans);

        // Step 3: annotate every entity for which the engine returns a non-null instance URI.
        int position = -1;
        foreach (string gazetteerUri in foundEntities)
        {
            position++;
            string instanceUri = mEntityRecognitionEngine.GetIdentifiedInstance(gazetteerUri);
            if (instanceUri != null)
            {
                string classPath = mEntityRecognitionEngine.GetInstanceClassPath(instanceUri);
                string annotationName = GetAnnotationName(classPath);

                Annotation created = new Annotation(foundSpans[position].First, foundSpans[position].Second, annotationName);
                document.AddAnnotation(created);

                created.Features.SetFeatureValue("gazetteerUri", gazetteerUri);
                created.Features.SetFeatureValue("instanceUri", instanceUri);
                created.Features.SetFeatureValue("instanceClassUri", mEntityRecognitionEngine.GetInstanceClass(instanceUri));
                // TODO: instanceLabel, instanceClassLabel
            }
        }
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessDocument", exception);
    }
}