Esempio n. 1
0
        /// <summary>
        /// Reads in a single ACE*.apf.xml file and convert it to RelationSentence
        /// objects.
        /// </summary>
        /// <remarks>
        /// Reads in a single ACE*.apf.xml file and convert it to RelationSentence
        /// objects. However, you probably should call parse() instead.
        /// </remarks>
        /// <param name="prefix">
        /// prefix of ACE filename to read (e.g.
        /// "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01"
        /// ) (no ".apf.xml" extension)
        /// </param>
        /// <returns>list of RelationSentence objects</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Org.Xml.Sax.SAXException"/>
        /// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
        private IList <ICoreMap> ReadDocument(string prefix, Annotation corpus)
        {
            logger.Info("Reading document: " + prefix);
            IList <ICoreMap> results = new List <ICoreMap>();
            AceDocument      aceDocument;

            if (aceVersion.Equals("ACE2004"))
            {
                aceDocument = AceDocument.ParseDocument(prefix, false, aceVersion);
            }
            else
            {
                aceDocument = AceDocument.ParseDocument(prefix, false);
            }
            string docId = aceDocument.GetId();
            // map entity mention ID strings to their EntityMention counterparts
            IDictionary <string, EntityMention> entityMentionMap = Generics.NewHashMap();

            /*
             * for (int sentenceIndex = 0; sentenceIndex < aceDocument.getSentenceCount(); sentenceIndex++) {
             * List<AceToken> tokens = aceDocument.getSentence(sentenceIndex);
             * StringBuffer b = new StringBuffer();
             * for(AceToken t: tokens) b.append(t.getLiteral() + " " );
             * logger.info("SENTENCE: " + b.toString());
             * }
             */
            int tokenOffset = 0;

            for (int sentenceIndex = 0; sentenceIndex < aceDocument.GetSentenceCount(); sentenceIndex++)
            {
                IList <AceToken>  tokens      = aceDocument.GetSentence(sentenceIndex);
                IList <CoreLabel> words       = new List <CoreLabel>();
                StringBuilder     textContent = new StringBuilder();
                for (int i = 0; i < tokens.Count; i++)
                {
                    CoreLabel l = new CoreLabel();
                    l.SetWord(tokens[i].GetLiteral());
                    l.Set(typeof(CoreAnnotations.ValueAnnotation), l.Word());
                    l.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), tokens[i].GetByteStart());
                    l.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), tokens[i].GetByteEnd());
                    words.Add(l);
                    if (i > 0)
                    {
                        textContent.Append(" ");
                    }
                    textContent.Append(tokens[i].GetLiteral());
                }
                // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
                if (words.Count == 1)
                {
                    string word = words[0].Word();
                    if (word.StartsWith("<") && word.EndsWith(">"))
                    {
                        tokenOffset += tokens.Count;
                        continue;
                    }
                }
                ICoreMap sentence = new Annotation(textContent.ToString());
                sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
                sentence.Set(typeof(CoreAnnotations.TokensAnnotation), words);
                logger.Info("Reading sentence: \"" + textContent + "\"");
                IList <AceEntityMention>   entityMentions   = aceDocument.GetEntityMentions(sentenceIndex);
                IList <AceRelationMention> relationMentions = aceDocument.GetRelationMentions(sentenceIndex);
                IList <AceEventMention>    eventMentions    = aceDocument.GetEventMentions(sentenceIndex);
                // convert entity mentions
                foreach (AceEntityMention aceEntityMention in entityMentions)
                {
                    string corefID = string.Empty;
                    foreach (string entityID in aceDocument.GetKeySetEntities())
                    {
                        AceEntity e = aceDocument.GetEntity(entityID);
                        if (e.GetMentions().Contains(aceEntityMention))
                        {
                            corefID = entityID;
                            break;
                        }
                    }
                    EntityMention convertedMention = ConvertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
                    //        EntityMention convertedMention = convertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset);
                    entityCounts.IncrementCount(convertedMention.GetType());
                    logger.Info("CONVERTED MENTION HEAD SPAN: " + convertedMention.GetHead());
                    logger.Info("CONVERTED ENTITY MENTION: " + convertedMention);
                    AnnotationUtils.AddEntityMention(sentence, convertedMention);
                    entityMentionMap[aceEntityMention.GetId()] = convertedMention;
                }
                // TODO: make Entity objects as needed
                // convert relation mentions
                foreach (AceRelationMention aceRelationMention in relationMentions)
                {
                    RelationMention convertedMention = ConvertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
                    if (convertedMention != null)
                    {
                        relationCounts.IncrementCount(convertedMention.GetType());
                        logger.Info("CONVERTED RELATION MENTION: " + convertedMention);
                        AnnotationUtils.AddRelationMention(sentence, convertedMention);
                    }
                }
                // TODO: make Relation objects
                // convert EventMentions
                foreach (AceEventMention aceEventMention in eventMentions)
                {
                    EventMention convertedMention = ConvertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
                    if (convertedMention != null)
                    {
                        eventCounts.IncrementCount(convertedMention.GetType());
                        logger.Info("CONVERTED EVENT MENTION: " + convertedMention);
                        AnnotationUtils.AddEventMention(sentence, convertedMention);
                    }
                }
                // TODO: make Event objects
                results.Add(sentence);
                tokenOffset += tokens.Count;
            }
            return(results);
        }