/// <summary>
/// Reads a single ACE*.apf.xml document and converts each of its sentences
/// into an <c>ICoreMap</c> carrying the sentence tokens and the converted
/// entity, relation and event mentions.
/// </summary>
/// <remarks>
/// However, you probably should call parse() instead.
/// <para>
/// A "sentence" made of exactly one token that starts with "&lt;" and ends
/// with "&gt;" is treated as an SGML tag and is not added to the result; its
/// token is still counted in the running token offset that is handed to the
/// mention converters.
/// </para>
/// </remarks>
/// <param name="prefix">
/// prefix of ACE filename to read (e.g.
/// "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01"
/// ) (no ".apf.xml" extension)
/// </param>
/// <param name="corpus">not referenced by this method</param>
/// <returns>
/// one <c>ICoreMap</c> per retained sentence, in document order, each
/// annotated with the document id, its tokens and its converted mentions
/// </returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Xml.Sax.SAXException"/>
/// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
private IList<ICoreMap> ReadDocument(string prefix, Annotation corpus)
{
    logger.Info("Reading document: " + prefix);
    IList<ICoreMap> results = new List<ICoreMap>();

    // ACE2004 documents go through the overload that also takes the version string.
    AceDocument aceDocument;
    if (aceVersion.Equals("ACE2004"))
    {
        aceDocument = AceDocument.ParseDocument(prefix, false, aceVersion);
    }
    else
    {
        aceDocument = AceDocument.ParseDocument(prefix, false);
    }
    string docId = aceDocument.GetId();

    // map entity mention ID strings to their EntityMention counterparts
    // (shared across all sentences so relations/events can resolve their arguments)
    IDictionary<string, EntityMention> entityMentionMap = Generics.NewHashMap();

    // Number of tokens in all sentences visited so far, skipped ones included.
    int tokenOffset = 0;
    for (int sentenceIndex = 0; sentenceIndex < aceDocument.GetSentenceCount(); sentenceIndex++)
    {
        IList<AceToken> tokens = aceDocument.GetSentence(sentenceIndex);
        IList<CoreLabel> words = new List<CoreLabel>();
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < tokens.Count; i++)
        {
            CoreLabel l = new CoreLabel();
            l.SetWord(tokens[i].GetLiteral());
            l.Set(typeof(CoreAnnotations.ValueAnnotation), l.Word());
            l.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), tokens[i].GetByteStart());
            l.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), tokens[i].GetByteEnd());
            words.Add(l);

            // The sentence text is the token literals joined by single spaces.
            if (i > 0)
            {
                textContent.Append(" ");
            }
            textContent.Append(tokens[i].GetLiteral());
        }

        // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
        if (words.Count == 1)
        {
            string word = words[0].Word();
            // Ordinal comparison: this is a markup test, not a linguistic one, so it
            // must not depend on the current culture's comparison rules.
            if (word.StartsWith("<", System.StringComparison.Ordinal)
                && word.EndsWith(">", System.StringComparison.Ordinal))
            {
                tokenOffset += tokens.Count;
                continue;
            }
        }

        ICoreMap sentence = new Annotation(textContent.ToString());
        sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
        sentence.Set(typeof(CoreAnnotations.TokensAnnotation), words);
        logger.Info("Reading sentence: \"" + textContent + "\"");

        IList<AceEntityMention> entityMentions = aceDocument.GetEntityMentions(sentenceIndex);
        IList<AceRelationMention> relationMentions = aceDocument.GetRelationMentions(sentenceIndex);
        IList<AceEventMention> eventMentions = aceDocument.GetEventMentions(sentenceIndex);

        // convert entity mentions
        foreach (AceEntityMention aceEntityMention in entityMentions)
        {
            // The coref id is the id of the first entity (in key-set iteration order)
            // whose mention list contains this mention; empty when none does.
            string corefID = string.Empty;
            foreach (string entityID in aceDocument.GetKeySetEntities())
            {
                AceEntity e = aceDocument.GetEntity(entityID);
                if (e.GetMentions().Contains(aceEntityMention))
                {
                    corefID = entityID;
                    break;
                }
            }

            EntityMention convertedMention = ConvertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
            entityCounts.IncrementCount(convertedMention.GetType());
            logger.Info("CONVERTED MENTION HEAD SPAN: " + convertedMention.GetHead());
            logger.Info("CONVERTED ENTITY MENTION: " + convertedMention);
            AnnotationUtils.AddEntityMention(sentence, convertedMention);
            entityMentionMap[aceEntityMention.GetId()] = convertedMention;
        }

        // TODO: make Entity objects as needed

        // convert relation mentions
        foreach (AceRelationMention aceRelationMention in relationMentions)
        {
            RelationMention convertedMention = ConvertAceRelationMention(aceRelationMention, docId, sentence, entityMentionMap);
            // A null conversion result is silently dropped.
            if (convertedMention != null)
            {
                relationCounts.IncrementCount(convertedMention.GetType());
                logger.Info("CONVERTED RELATION MENTION: " + convertedMention);
                AnnotationUtils.AddRelationMention(sentence, convertedMention);
            }
        }

        // TODO: make Relation objects

        // convert EventMentions
        foreach (AceEventMention aceEventMention in eventMentions)
        {
            EventMention convertedMention = ConvertAceEventMention(aceEventMention, docId, sentence, entityMentionMap, tokenOffset);
            // A null conversion result is silently dropped.
            if (convertedMention != null)
            {
                eventCounts.IncrementCount(convertedMention.GetType());
                logger.Info("CONVERTED EVENT MENTION: " + convertedMention);
                AnnotationUtils.AddEventMention(sentence, convertedMention);
            }
        }

        // TODO: make Event objects

        results.Add(sentence);
        tokenOffset += tokens.Count;
    }
    return results;
}