// TODO: never used?
/// <summary>
/// Populates mSentenceRelationMentions: assigns every relation mention in
/// mRelationMentions to the sentence of its first argument's head token.
/// </summary>
/// <remarks>
/// Within each sentence the relation mentions are kept sorted
/// (a) by increasing start position of their head span, and
/// (b) for equal starts, by increasing end position.
/// Assumes mSentenceRelationMentions already has a row per sentence
/// (rows are created in ParseDocument) — TODO confirm this method is only
/// ever called after ParseDocument, since no rows are added here.
/// </remarks>
public virtual void ConstructSentenceRelationMentions()
{
    //
    // construct the mRelationEntityMentions matrix
    //
    ICollection<string> relKeys = mRelationMentions.Keys;
    foreach (string key in relKeys)
    {
        AceRelationMention rm = mRelationMentions[key];
        // sentence index comes from the head token of the relation's first argument
        int sentence = mTokens[rm.GetArg(0).GetHead().GetTokenStart()].GetSentence();
        //
        // no need to adjust the number of rows: was done in parseDocument
        //
        // store the relation mentions in increasing order
        // (a) of the start position of their head, or
        // (b) if start is the same, in increasing order of ends
        List<AceRelationMention> sentRels = mSentenceRelationMentions[sentence];
        bool added = false;
        for (int i = 0; i < sentRels.Count; i++)
        {
            AceRelationMention crt = sentRels[i];
            if ((crt.GetMinTokenStart() > rm.GetMinTokenStart()) ||
                (crt.GetMinTokenStart() == rm.GetMinTokenStart() && crt.GetMaxTokenEnd() > rm.GetMaxTokenEnd()))
            {
                // FIX: was sentRels.Add(i, rm) — List<T> has no Add(int, T) overload.
                // Java's List.add(int, E) translates to Insert in C#.
                sentRels.Insert(i, rm);
                added = true;
                break;
            }
        }
        if (!added)
        {
            sentRels.Add(rm);
        }
    }
}
/// <summary>Parses an ACE document.</summary>
/// <remarks>
/// Parses an ACE document. Works in the following steps: (a) reads both the
/// XML annotations; (b) reads the tokens; (c) matches the tokens against the
/// annotations (d) constructs mSentenceEntityMentions and
/// mRelationEntityMentions
/// </remarks>
/// <param name="prefix">Path prefix of the document (without extension).</param>
/// <param name="usePredictedBoundaries">
/// When true, skips the gold XML annotations and creates an empty document
/// (predicted entity boundaries are filled in elsewhere).
/// </param>
/// <returns>The fully constructed <c>AceDocument</c>.</returns>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Xml.Sax.SAXException"/>
/// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
public static Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceDocument ParseDocument(string prefix, bool usePredictedBoundaries)
{
    mLog.Fine("Reading document " + prefix);
    Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceDocument doc = null;
    //
    // read the ACE XML annotations
    //
    if (!usePredictedBoundaries)
    {
        doc = AceDomReader.ParseDocument(new File(prefix + XmlExt));
    }
    else
    {
        // log.info("Parsed " + doc.getEntityMentions().size() +
        // " entities in document " + prefix);
        //
        // will use the predicted entity boundaries (see below)
        //
        int lastSlash = prefix.LastIndexOf(File.separator);
        System.Diagnostics.Debug.Assert((lastSlash > 0 && lastSlash < prefix.Length - 1));
        string id = Sharpen.Runtime.Substring(prefix, lastSlash + 1);
        // log.info(id + ": " + prefix);
        doc = new Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceDocument(id);
    }
    doc.SetPrefix(prefix);
    //
    // read the raw byte stream
    //
    string trueCasedFileName = prefix + OrigExt + ".truecase";
    if ((new File(trueCasedFileName).Exists()))
    {
        mLog.Severe("Using truecased file: " + trueCasedFileName);
        doc.ReadRawBytes(trueCasedFileName);
    }
    else
    {
        doc.ReadRawBytes(prefix + OrigExt);
    }
    //
    // read the AceTokens
    //
    int offsetToSubtract = 0;
    IList<IList<AceToken>> sentences = AceSentenceSegmenter.TokenizeAndSegmentSentences(prefix);
    doc.SetSentences(sentences);
    foreach (IList<AceToken> sentence in sentences)
    {
        foreach (AceToken token in sentence)
        {
            offsetToSubtract = token.AdjustPhrasePositions(offsetToSubtract, token.GetLiteral());
            doc.AddToken(token);
        }
    }
    //
    // match char sequences to phrases
    //
    doc.MatchCharSeqs(prefix);
    //
    // construct the mEntityMentions matrix
    //
    ICollection<string> entityKeys = doc.mEntityMentions.Keys;
    int sentence_1;
    foreach (string key in entityKeys)
    {
        AceEntityMention em = doc.mEntityMentions[key];
        sentence_1 = doc.mTokens[em.GetHead().GetTokenStart()].GetSentence();
        // adjust the number of rows if necessary
        while (sentence_1 >= doc.mSentenceEntityMentions.Count)
        {
            doc.mSentenceEntityMentions.Add(new List<AceEntityMention>());
            doc.mSentenceRelationMentions.Add(new List<AceRelationMention>());
            doc.mSentenceEventMentions.Add(new List<AceEventMention>());
        }
        // store the entity mentions in increasing order:
        // (a) of the start position of their head
        // (b) if start is the same, in increasing order of the head end
        List<AceEntityMention> sentEnts = doc.mSentenceEntityMentions[sentence_1];
        bool added = false;
        for (int i = 0; i < sentEnts.Count; i++)
        {
            AceEntityMention crt = sentEnts[i];
            if ((crt.GetHead().GetTokenStart() > em.GetHead().GetTokenStart()) ||
                (crt.GetHead().GetTokenStart() == em.GetHead().GetTokenStart() && crt.GetHead().GetTokenEnd() > em.GetHead().GetTokenEnd()))
            {
                // FIX: was sentEnts.Add(i, em) — List<T> has no Add(int, T) overload;
                // Java's List.add(int, E) must become Insert in C#.
                sentEnts.Insert(i, em);
                added = true;
                break;
            }
        }
        if (!added)
        {
            sentEnts.Add(em);
        }
    }
    //
    // construct the mRelationMentions matrix
    //
    ICollection<string> relKeys = doc.mRelationMentions.Keys;
    foreach (string key_1 in relKeys)
    {
        AceRelationMention rm = doc.mRelationMentions[key_1];
        sentence_1 = doc.mTokens[rm.GetArg(0).GetHead().GetTokenStart()].GetSentence();
        //
        // no need to adjust the number of rows: was done above
        //
        // store the relation mentions in increasing order
        // (a) of the start position of their head, or
        // (b) if start is the same, in increasing order of ends
        List<AceRelationMention> sentRels = doc.mSentenceRelationMentions[sentence_1];
        bool added = false;
        for (int i = 0; i < sentRels.Count; i++)
        {
            AceRelationMention crt = sentRels[i];
            if ((crt.GetMinTokenStart() > rm.GetMinTokenStart()) ||
                (crt.GetMinTokenStart() == rm.GetMinTokenStart() && crt.GetMaxTokenEnd() > rm.GetMaxTokenEnd()))
            {
                // FIX: Insert, not Add(i, rm) — see entity-mention loop above.
                sentRels.Insert(i, rm);
                added = true;
                break;
            }
        }
        if (!added)
        {
            sentRels.Add(rm);
        }
    }
    //
    // construct the mEventMentions matrix
    //
    ICollection<string> eventKeys = doc.mEventMentions.Keys;
    foreach (string key_2 in eventKeys)
    {
        AceEventMention em = doc.mEventMentions[key_2];
        sentence_1 = doc.mTokens[em.GetMinTokenStart()].GetSentence();
        /*
         * adjust the number of rows if necessary -- if you're wondering why we do
         * this here again, (after we've done it for entities) it's because we can
         * have an event with no entities near the end of the document and thus
         * won't have created rows in mSentence*Mentions
         */
        while (sentence_1 >= doc.mSentenceEntityMentions.Count)
        {
            doc.mSentenceEntityMentions.Add(new List<AceEntityMention>());
            doc.mSentenceRelationMentions.Add(new List<AceRelationMention>());
            doc.mSentenceEventMentions.Add(new List<AceEventMention>());
        }
        // store the event mentions in increasing order
        // (a) first, event mentions with no arguments
        // (b) then by the start position of their head, or
        // (c) if start is the same, in increasing order of ends
        List<AceEventMention> sentEvents = doc.mSentenceEventMentions[sentence_1];
        bool added = false;
        for (int i = 0; i < sentEvents.Count; i++)
        {
            AceEventMention crt = sentEvents[i];
            if ((crt.GetMinTokenStart() > em.GetMinTokenStart()) ||
                (crt.GetMinTokenStart() == em.GetMinTokenStart() && crt.GetMaxTokenEnd() > em.GetMaxTokenEnd()))
            {
                // FIX: Insert, not Add(i, em) — see entity-mention loop above.
                sentEvents.Insert(i, em);
                added = true;
                break;
            }
        }
        if (!added)
        {
            sentEvents.Add(em);
        }
    }
    return doc;
}