示例#1
0
        /// <summary>
        /// Runs the entity recognition engine over the tokenized sentences of
        /// <paramref name="document"/> and adds one annotation for every discovered
        /// entity that resolves to an identified instance.
        /// </summary>
        /// <remarks>
        /// Documents whose "contentType" feature is not "Text" are left untouched.
        /// Any exception raised while processing is logged through mLogger and not rethrown.
        /// </remarks>
        /// <param name="document">The document to annotate.</param>
        public override void ProcessDocument(Document document)
        {
            if (document.Features.GetFeatureValue("contentType") != "Text")
            {
                return;
            }

            try
            {
                document.CreateAnnotationIndex();

                // Feed the engine: one text block per selected block, one sentence per "Sentence" annotation.
                EntityRecognitionEngine.Document engineDoc = new EntityRecognitionEngine.Document();
                foreach (TextBlock block in document.GetAnnotatedBlocks(mBlockSelector))
                {
                    engineDoc.BeginNewTextBlock();
                    // NOTE: the sentence selector is hardcoded
                    foreach (TextBlock sentence in document.GetAnnotatedBlocks("Sentence", block.SpanStart, block.SpanEnd))
                    {
                        ArrayList<string> words       = new ArrayList<string>();
                        ArrayList<string> tags        = new ArrayList<string>();
                        ArrayList<int>    tokenStarts = new ArrayList<int>();
                        // NOTE: the token selector and the POS tag feature name are hardcoded
                        foreach (TextBlock tok in document.GetAnnotatedBlocks("Token", sentence.SpanStart, sentence.SpanEnd))
                        {
                            words.Add(tok.Text);
                            tags.Add(tok.Annotation.Features.GetFeatureValue("posTag"));
                            tokenStarts.Add(tok.SpanStart);
                        }
                        engineDoc.AddSentence(words, tokenStarts, tags);
                    }
                }

                // entitySpans is indexed in step with the enumeration order of gazetteerUris.
                ArrayList<Pair<int, int>> entitySpans;
                ArrayList<string> gazetteerUris = engineDoc.DiscoverEntities(mEntityRecognitionEngine, out entitySpans);
                int next = 0;
                foreach (string gazetteerUri in gazetteerUris)
                {
                    // Take this entity's position up front so skipped entities still advance the counter.
                    int idx = next++;

                    string instanceUri = mEntityRecognitionEngine.GetIdentifiedInstance(gazetteerUri);
                    if (instanceUri == null)
                    {
                        continue;
                    }

                    string annotationName = GetAnnotationName(mEntityRecognitionEngine.GetInstanceClassPath(instanceUri));
                    Annotation entityAnnotation = new Annotation(entitySpans[idx].First, entitySpans[idx].Second, annotationName);
                    document.AddAnnotation(entityAnnotation);
                    entityAnnotation.Features.SetFeatureValue("gazetteerUri", gazetteerUri);
                    entityAnnotation.Features.SetFeatureValue("instanceUri", instanceUri);
                    entityAnnotation.Features.SetFeatureValue("instanceClassUri", mEntityRecognitionEngine.GetInstanceClass(instanceUri));
                    // TODO: instanceLabel, instanceClassLabel
                }
            }
            catch (Exception exception)
            {
                mLogger.Error("ProcessDocument", exception);
            }
        }
 /// <summary>
 /// Annotates <paramref name="document"/> with the entities that the entity
 /// recognition engine discovers in its tokenized sentences.
 /// </summary>
 /// <remarks>
 /// Only documents whose "contentType" feature equals "Text" are processed.
 /// Exceptions thrown during processing are logged through mLogger and swallowed.
 /// </remarks>
 /// <param name="document">The document to annotate.</param>
 public override void ProcessDocument(Document document)
 {
     // These names are hardcoded here rather than configurable.
     const string sentenceSelector = "Sentence";
     const string tokenSelector = "Token";
     const string posTagFeature = "posTag";

     string contentType = document.Features.GetFeatureValue("contentType");
     if (!string.Equals(contentType, "Text"))
     {
         return;
     }

     try
     {
         document.CreateAnnotationIndex();

         // Build the engine-side document: a text block per selected block, a sentence per sentence annotation.
         EntityRecognitionEngine.Document recognizerInput = new EntityRecognitionEngine.Document();
         foreach (TextBlock textBlock in document.GetAnnotatedBlocks(mBlockSelector))
         {
             recognizerInput.BeginNewTextBlock();
             foreach (TextBlock sentenceBlock in document.GetAnnotatedBlocks(sentenceSelector, textBlock.SpanStart, textBlock.SpanEnd))
             {
                 ArrayList<string> tokenTexts = new ArrayList<string>();
                 ArrayList<string> tokenPosTags = new ArrayList<string>();
                 ArrayList<int> tokenOffsets = new ArrayList<int>();
                 foreach (TextBlock tokenBlock in document.GetAnnotatedBlocks(tokenSelector, sentenceBlock.SpanStart, sentenceBlock.SpanEnd))
                 {
                     tokenTexts.Add(tokenBlock.Text);
                     tokenPosTags.Add(tokenBlock.Annotation.Features.GetFeatureValue(posTagFeature));
                     tokenOffsets.Add(tokenBlock.SpanStart);
                 }
                 recognizerInput.AddSentence(tokenTexts, tokenOffsets, tokenPosTags);
             }
         }

         // The out-list of spans is read by position, in step with the returned entity list.
         ArrayList<Pair<int, int>> foundSpans;
         ArrayList<string> foundEntities = recognizerInput.DiscoverEntities(mEntityRecognitionEngine, out foundSpans);
         int position = 0;
         foreach (string gazetteerUri in foundEntities)
         {
             string instanceUri = mEntityRecognitionEngine.GetIdentifiedInstance(gazetteerUri);
             if (instanceUri != null)
             {
                 string annotationName = GetAnnotationName(mEntityRecognitionEngine.GetInstanceClassPath(instanceUri));
                 Annotation added = new Annotation(foundSpans[position].First, foundSpans[position].Second, annotationName);
                 document.AddAnnotation(added);
                 added.Features.SetFeatureValue("gazetteerUri", gazetteerUri);
                 added.Features.SetFeatureValue("instanceUri", instanceUri);
                 added.Features.SetFeatureValue("instanceClassUri", mEntityRecognitionEngine.GetInstanceClass(instanceUri));
                 // TODO: instanceLabel, instanceClassLabel
             }
             // Advance even when the entity was skipped so positions stay aligned.
             position += 1;
         }
     }
     catch (Exception exception)
     {
         mLogger.Error("ProcessDocument", exception);
     }
 }