/// <summary>
/// Reads one sentence block from <paramref name="lineIterator"/> and packages it
/// as an <c>Annotation</c> carrying its tokens, entity mentions and relation mentions.
/// </summary>
/// <remarks>
/// Each line is classified by the number of pieces <c>StringUtils.Split</c> returns:
/// 1 piece is treated as a separator line, 3 pieces as a relation line and 9 pieces
/// as a token line; lines with any other piece count are ignored. Reading stops after
/// the second separator line or when the iterator is exhausted. The method calls
/// <c>MoveNext()</c> before reading <c>Current</c>, so on entry the iterator must be
/// positioned before the first line of the sentence; on return it is positioned on
/// the last line that was consumed.
/// </remarks>
/// <param name="docId">document ID stored on the sentence as <c>DocIDAnnotation</c></param>
/// <param name="lineIterator">iterator over the input lines; advanced by this method</param>
/// <returns>the sentence annotation built from the consumed lines</returns>
private Annotation ReadSentence(string docId, IEnumerator <string> lineIterator)
        {
            Annotation sentence = new Annotation(string.Empty);

            sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
            sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), new List <EntityMention>());
            // the tokens and the text content are set on the sentence only after it
            // has been fully read

            // contains the full text that we've read so far
            StringBuilder textContent = new StringBuilder();
            // how many tokens we've seen so far
            int tokenCount = 0;
            IList <CoreLabel> tokens = new List <CoreLabel>();
            // when we've seen two blank lines in a row, this sentence is over (one
            // blank line separates the sentence and the relations)
            int    numBlankLinesSeen = 0;
            string sentenceID        = null;
            // keeps track of entities we've seen so far for use by relations
            IDictionary <string, EntityMention> indexToEntityMention = new Dictionary <string, EntityMention>();

            // The counter must be tested BEFORE MoveNext(): MoveNext() advances the
            // enumerator, so evaluating it first would consume (and silently drop) the
            // first line of the next sentence once this sentence is complete.
            while (numBlankLinesSeen < 2 && lineIterator.MoveNext())
            {
                string currentLine = lineIterator.Current;
                currentLine = currentLine.Replace("COMMA", ",");
                IList <string> pieces = StringUtils.Split(currentLine);
                string         identifier;
                switch (pieces.Count)
                {
                case 1:
                {
                    // blank line between sentences or relations
                    numBlankLinesSeen++;
                    break;
                }

                case 3:
                {
                    // relation: <index of arg 1> <index of arg 2> <relation type>
                    string type = pieces[2];
                    // the dictionary indexer throws KeyNotFoundException when a relation
                    // refers to an index that no entity token line registered
                    EntityMention entity1 = indexToEntityMention[pieces[0]];
                    EntityMention entity2 = indexToEntityMention[pieces[1]];
                    IList <ExtractionObject> args = new List <ExtractionObject>();
                    args.Add(entity1);
                    args.Add(entity2);
                    Span span = new Span(entity1.GetExtentTokenStart(), entity2.GetExtentTokenEnd());
                    identifier = RelationMention.MakeUniqueId();
                    RelationMention relationMention = new RelationMention(identifier, sentence, span, type, null, args);
                    AnnotationUtils.AddRelationMention(sentence, relationMention);
                    break;
                }

                case 9:
                {
                    // token

                    /*
                     * Roth token lines look like this:
                     *
                     * 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
                     */
                    // Entities may be multiple words joined by '/'; we split these up
                    IList <string> words = StringUtils.Split(pieces[5], "/");
                    string text = StringUtils.Join(words, " ");
                    identifier = "entity" + pieces[0] + '-' + pieces[2];
                    // entity type of the word/expression
                    string nerTag = GetNormalizedNERTag(pieces[1]);
                    if (sentenceID == null)
                    {
                        // the first column of the first token line becomes the sentence ID
                        sentenceID = pieces[0];
                    }
                    if (!nerTag.Equals("O"))
                    {
                        Span extentSpan = new Span(tokenCount, tokenCount + words.Count);
                        // Temporarily sets the head span to equal the extent span.
                        // This is so the entity has a head (in particular, getValue() works) even if preprocessSentences isn't called.
                        // The head span is later modified if preprocessSentences is called.
                        EntityMention entity = new EntityMention(identifier, sentence, extentSpan, extentSpan, nerTag, null, null);
                        AnnotationUtils.AddEntityMention(sentence, entity);
                        // we can get by using these indices as strings since we only use them
                        // as a hash key
                        string index = pieces[2];
                        indexToEntityMention[index] = entity;
                    }
                    foreach (string word in words)
                    {
                        CoreLabel label = new CoreLabel();
                        label.SetWord(word);
                        label.Set(typeof(CoreAnnotations.TextAnnotation), word);
                        label.Set(typeof(CoreAnnotations.ValueAnnotation), word);
                        // we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
                        // not keeping track of character offsets
                        tokens.Add(label);
                    }
                    textContent.Append(text);
                    textContent.Append(' ');
                    tokenCount += words.Count;
                    break;
                }
                }
            }
            string fullText = textContent.ToString();
            sentence.Set(typeof(CoreAnnotations.TextAnnotation), fullText);
            sentence.Set(typeof(CoreAnnotations.ValueAnnotation), fullText);
            sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            sentence.Set(typeof(CoreAnnotations.SentenceIDAnnotation), sentenceID);
            return(sentence);
        }
Beispiel #2
0
        /// <summary>
        /// Reads in a single ACE*.apf.xml file and converts it to a list of
        /// sentence annotations.
        /// </summary>
        /// <remarks>
        /// Each sentence carries its tokens plus the converted entity, relation and
        /// event mentions. Sentences consisting of a single SGML-tag token are skipped.
        /// However, you probably should call parse() instead.
        /// </remarks>
        /// <param name="prefix">
        /// prefix of ACE filename to read (e.g.
        /// "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01"
        /// ) (no ".apf.xml" extension)
        /// </param>
        /// <param name="corpus">not referenced by this method body</param>
        /// <returns>one <c>ICoreMap</c> sentence per retained sentence of the document, in document order</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Org.Xml.Sax.SAXException"/>
        /// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
        private IList <ICoreMap> ReadDocument(string prefix, Annotation corpus)
        {
            logger.Info("Reading document: " + prefix);
            IList <ICoreMap> results = new List <ICoreMap>();
            AceDocument      aceDocument;

            if (aceVersion.Equals("ACE2004"))
            {
                aceDocument = AceDocument.ParseDocument(prefix, false, aceVersion);
            }
            else
            {
                aceDocument = AceDocument.ParseDocument(prefix, false);
            }
            string docId = aceDocument.GetId();
            // map entity mention ID strings to their EntityMention counterparts;
            // shared across all sentences of the document
            IDictionary <string, EntityMention> entityMentionMap = Generics.NewHashMap();

            // number of tokens in all preceding sentences (skipped ones included)
            int tokenOffset = 0;

            for (int sentenceIndex = 0; sentenceIndex < aceDocument.GetSentenceCount(); sentenceIndex++)
            {
                IList <AceToken>  tokens      = aceDocument.GetSentence(sentenceIndex);
                IList <CoreLabel> words       = new List <CoreLabel>();
                StringBuilder     textContent = new StringBuilder();
                for (int i = 0; i < tokens.Count; i++)
                {
                    AceToken  token = tokens[i];
                    CoreLabel l     = new CoreLabel();
                    l.SetWord(token.GetLiteral());
                    l.Set(typeof(CoreAnnotations.ValueAnnotation), l.Word());
                    l.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), token.GetByteStart());
                    l.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), token.GetByteEnd());
                    words.Add(l);
                    // the sentence text is the token literals joined by single spaces
                    if (i > 0)
                    {
                        textContent.Append(" ");
                    }
                    textContent.Append(token.GetLiteral());
                }
                // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
                if (words.Count == 1)
                {
                    string word = words[0].Word();
                    // ordinal comparison: these are markup delimiters, so the check must
                    // not depend on the current culture's collation rules
                    if (word.StartsWith("<", System.StringComparison.Ordinal) && word.EndsWith(">", System.StringComparison.Ordinal))
                    {
                        // the skipped tokens still count towards the running offset
                        tokenOffset += tokens.Count;
                        continue;
                    }
                }
                ICoreMap sentence = new Annotation(textContent.ToString());
                sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
                sentence.Set(typeof(CoreAnnotations.TokensAnnotation), words);
                logger.Info("Reading sentence: \"" + textContent + "\"");
                IList <AceEntityMention>   entityMentions   = aceDocument.GetEntityMentions(sentenceIndex);
                IList <AceRelationMention> relationMentions = aceDocument.GetRelationMentions(sentenceIndex);
                IList <AceEventMention>    eventMentions    = aceDocument.GetEventMentions(sentenceIndex);
                // convert entity mentions
                foreach (AceEntityMention aceEntityMention in entityMentions)
                {
                    // coreference ID = ID of the first entity whose mentions contain this
                    // mention; stays empty when no entity contains it
                    string corefID = string.Empty;
                    foreach (string entityID in aceDocument.GetKeySetEntities())
                    {
                        AceEntity e = aceDocument.GetEntity(entityID);
                        if (e.GetMentions().Contains(aceEntityMention))
                        {
                            corefID = entityID;
                            break;
                        }
                    }
                    EntityMention convertedMention = ConvertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
                    entityCounts.IncrementCount(convertedMention.GetType());
                    logger.Info("CONVERTED MENTION HEAD SPAN: " + convertedMention.GetHead());
                    logger.Info("CONVERTED ENTITY MENTION: " + convertedMention);
                    AnnotationUtils.AddEntityMention(sentence, convertedMention);
                    entityMentionMap[aceEntityMention.GetId()] = convertedMention;
                }
                // TODO: make Entity objects as needed
                // convert relation mentions
                foreach (AceRelationMention aceRelationMention in relationMentions)
                {
                    RelationMention convertedMention = ConvertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
                    // a null conversion result is skipped without being counted
                    if (convertedMention != null)
                    {
                        relationCounts.IncrementCount(convertedMention.GetType());
                        logger.Info("CONVERTED RELATION MENTION: " + convertedMention);
                        AnnotationUtils.AddRelationMention(sentence, convertedMention);
                    }
                }
                // TODO: make Relation objects
                // convert EventMentions
                foreach (AceEventMention aceEventMention in eventMentions)
                {
                    EventMention convertedMention = ConvertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
                    // a null conversion result is skipped without being counted
                    if (convertedMention != null)
                    {
                        eventCounts.IncrementCount(convertedMention.GetType());
                        logger.Info("CONVERTED EVENT MENTION: " + convertedMention);
                        AnnotationUtils.AddEventMention(sentence, convertedMention);
                    }
                }
                // TODO: make Event objects
                results.Add(sentence);
                tokenOffset += tokens.Count;
            }
            return(results);
        }