// // heeyoung : skip relation, event parsing part - for ACE2004 //
/// <summary>
/// Reads an ACE document rooted at the given file-name prefix: XML annotations
/// (or an empty shell when boundaries are predicted), raw bytes, tokens and
/// sentences, then builds the per-sentence entity-mention matrix.
/// </summary>
/// <param name="prefix">Path prefix of the document files, without extension.</param>
/// <param name="usePredictedBoundaries">
/// When false, entity mentions come from the gold ACE XML file; when true, an
/// empty document is created and predicted boundaries are filled in later.
/// </param>
/// <param name="AceVersion">Unused here; kept for interface compatibility.</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Org.Xml.Sax.SAXException"/>
/// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
public static Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceDocument ParseDocument(string prefix, bool usePredictedBoundaries, string AceVersion)
{
    mLog.Fine("Reading document " + prefix);
    Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceDocument doc = null;
    //
    // read the ACE XML annotations, or start from an empty document whose
    // entity boundaries will be predicted downstream
    //
    if (!usePredictedBoundaries)
    {
        doc = AceDomReader.ParseDocument(new File(prefix + XmlExt));
    }
    else
    {
        // derive the document id from the last path component of the prefix
        int lastSlash = prefix.LastIndexOf(File.separator);
        System.Diagnostics.Debug.Assert((lastSlash > 0 && lastSlash < prefix.Length - 1));
        string id = Sharpen.Runtime.Substring(prefix, lastSlash + 1);
        doc = new Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceDocument(id);
    }
    doc.SetPrefix(prefix);
    //
    // read the raw byte stream; prefer a truecased variant when one exists
    //
    string trueCasedFileName = prefix + OrigExt + ".truecase";
    if ((new File(trueCasedFileName).Exists()))
    {
        // NOTE(review): informational message logged at Severe level — confirm intent
        mLog.Severe("Using truecased file: " + trueCasedFileName);
        doc.ReadRawBytes(trueCasedFileName);
    }
    else
    {
        doc.ReadRawBytes(prefix + OrigExt);
    }
    //
    // read the AceTokens; offsets are adjusted cumulatively so phrase
    // positions refer to the cleaned text
    //
    int offsetToSubtract = 0;
    IList<IList<AceToken>> sentences = AceSentenceSegmenter.TokenizeAndSegmentSentences(prefix);
    doc.SetSentences(sentences);
    foreach (IList<AceToken> sentence in sentences)
    {
        foreach (AceToken token in sentence)
        {
            offsetToSubtract = token.AdjustPhrasePositions(offsetToSubtract, token.GetLiteral());
            doc.AddToken(token);
        }
    }
    //
    // match char sequences to phrases
    //
    doc.MatchCharSeqs(prefix);
    //
    // construct the mEntityMentions matrix
    //
    ICollection<string> entityKeys = doc.mEntityMentions.Keys;
    int sentenceIndex;
    foreach (string key in entityKeys)
    {
        AceEntityMention em = doc.mEntityMentions[key];
        sentenceIndex = doc.mTokens[em.GetHead().GetTokenStart()].GetSentence();
        // adjust the number of rows if necessary
        while (sentenceIndex >= doc.mSentenceEntityMentions.Count)
        {
            doc.mSentenceEntityMentions.Add(new List<AceEntityMention>());
            doc.mSentenceRelationMentions.Add(new List<AceRelationMention>());
            doc.mSentenceEventMentions.Add(new List<AceEventMention>());
        }
        // store the entity mentions in increasing order:
        // (a) of the start position of their head
        // (b) if start is the same, in increasing order of the head end
        List<AceEntityMention> sentEnts = doc.mSentenceEntityMentions[sentenceIndex];
        bool added = false;
        for (int i = 0; i < sentEnts.Count; i++)
        {
            AceEntityMention crt = sentEnts[i];
            if ((crt.GetHead().GetTokenStart() > em.GetHead().GetTokenStart()) || (crt.GetHead().GetTokenStart() == em.GetHead().GetTokenStart() && crt.GetHead().GetTokenEnd() > em.GetHead().GetTokenEnd()))
            {
                // BUGFIX: List<T> has no Add(int, T) overload; the Java original
                // inserted at index i, which is Insert in C#
                sentEnts.Insert(i, em);
                added = true;
                break;
            }
        }
        if (!added)
        {
            sentEnts.Add(em);
        }
    }
    return doc;
}
/// <summary>Matches all relevant mentions, i.e.</summary>
/// <remarks>
/// Matches all relevant mentions, i.e. entities and anchors, to tokens Note:
/// entity mentions may match with multiple tokens!
/// </remarks>
public virtual void MatchCharSeqs(string filePrefix)
{
    // Align every entity mention's head and extent with the token stream.
    // Any alignment failure is treated as fatal for the whole reader run.
    foreach (string entityKey in mEntityMentions.Keys)
    {
        AceEntityMention mention = mEntityMentions[entityKey];
        // head charseq -> 1+ phrase(s)
        try
        {
            mention.GetHead().Match(mTokens);
        }
        catch (MatchException)
        {
            var head = mention.GetHead();
            mLog.Severe($"READER ERROR: Failed to match entity mention head: [{head.GetText()}, {head.GetByteStart()}, {head.GetByteEnd()}]");
            mLog.Severe("Document tokens: " + TokensWithByteSpan(head.GetByteStart(), head.GetByteEnd()));
            mLog.Severe("Document prefix: " + filePrefix);
            System.Environment.Exit(1);
        }
        // extent charseq -> 1+ phrase(s)
        try
        {
            mention.GetExtent().Match(mTokens);
        }
        catch (MatchException)
        {
            var extent = mention.GetExtent();
            mLog.Severe($"READER ERROR: Failed to match entity mention extent: [{extent.GetText()}, {extent.GetByteStart()}, {extent.GetByteEnd()}]");
            mLog.Severe("Document tokens: " + TokensWithByteSpan(extent.GetByteStart(), extent.GetByteEnd()));
            System.Environment.Exit(1);
        }
        // pick the head word now that both spans are token-aligned
        mention.DetectHeadToken(this);
    }
    // Events need the same extent alignment even when they carry no
    // AceEntityMention arguments at all.
    foreach (string eventKey in mEventMentions.Keys)
    {
        AceEventMention mention = mEventMentions[eventKey];
        try
        {
            mention.GetExtent().Match(mTokens);
        }
        catch (MatchException)
        {
            var extent = mention.GetExtent();
            mLog.Severe($"READER ERROR: Failed to match event mention extent: [{extent.GetText()}, {extent.GetByteStart()}, {extent.GetByteEnd()}]");
            mLog.Severe("Document tokens: " + TokensWithByteSpan(extent.GetByteStart(), extent.GetByteEnd()));
            System.Environment.Exit(1);
        }
    }
}