コード例 #1
0
        // TODO: never used?
        /// <summary>
        /// Distributes every relation mention of this document into the
        /// per-sentence lists held in mSentenceRelationMentions.
        /// </summary>
        /// <remarks>
        /// The sentence of a relation mention is taken from the token at the
        /// start of the head of its first argument. Inside each sentence list the
        /// mentions are kept in increasing order of GetMinTokenStart(), with ties
        /// broken by increasing GetMaxTokenEnd(). This method does not create
        /// rows in mSentenceRelationMentions; it indexes into existing ones.
        /// </remarks>
        public virtual void ConstructSentenceRelationMentions()
        {
            //
            // construct the mSentenceRelationMentions matrix
            //
            ICollection <string> relKeys = mRelationMentions.Keys;

            foreach (string key in relKeys)
            {
                AceRelationMention rm = mRelationMentions[key];
                int sentence          = mTokens[rm.GetArg(0).GetHead().GetTokenStart()].GetSentence();
                //
                // no need to adjust the number of rows: was done in parseDocument
                //
                List <AceRelationMention> sentRels = mSentenceRelationMentions[sentence];

                // Find the first existing mention that must come after rm:
                // (a) it starts later, or
                // (b) it starts at the same token but ends later.
                // If there is none, rm goes at the end (index == Count).
                int insertAt = sentRels.Count;
                for (int i = 0; i < sentRels.Count; i++)
                {
                    AceRelationMention crt = sentRels[i];
                    bool startsLater = crt.GetMinTokenStart() > rm.GetMinTokenStart();
                    bool sameStartEndsLater = crt.GetMinTokenStart() == rm.GetMinTokenStart() && crt.GetMaxTokenEnd() > rm.GetMaxTokenEnd();
                    if (startsLater || sameStartEndsLater)
                    {
                        insertAt = i;
                        break;
                    }
                }
                // List<T>.Add has no (index, item) overload; positional insertion
                // is Insert, which appends when the index equals Count.
                sentRels.Insert(insertAt, rm);
            }
        }
コード例 #2
0
        /// <summary>Parses an ACE document.</summary>
        /// <remarks>
        /// Works in the following steps:
        /// (a) reads the XML annotations, or creates an empty document when
        /// predicted boundaries are requested;
        /// (b) reads the raw bytes and the tokens;
        /// (c) matches char sequences to phrases via MatchCharSeqs;
        /// (d) fills mSentenceEntityMentions, mSentenceRelationMentions and
        /// mSentenceEventMentions, each kept in sorted order per sentence.
        /// </remarks>
        /// <param name="prefix">
        /// Path prefix of the document; XmlExt and OrigExt are appended to it to
        /// locate the annotation and raw text files.
        /// </param>
        /// <param name="usePredictedBoundaries">
        /// When false, the annotations are read from prefix + XmlExt. When true,
        /// no annotation file is read and an empty document is created whose id
        /// is the part of prefix after the last File.separator.
        /// </param>
        /// <returns>The parsed document.</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Org.Xml.Sax.SAXException"/>
        /// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
        public static Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceDocument ParseDocument(string prefix, bool usePredictedBoundaries)
        {
            mLog.Fine("Reading document " + prefix);
            Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceDocument doc = null;
            //
            // read the ACE XML annotations
            //
            if (!usePredictedBoundaries)
            {
                doc = AceDomReader.ParseDocument(new File(prefix + XmlExt));
            }
            else
            {
                //
                // will use the predicted entity boundaries (see below)
                //
                int lastSlash = prefix.LastIndexOf(File.separator);
                System.Diagnostics.Debug.Assert((lastSlash > 0 && lastSlash < prefix.Length - 1));
                string id = Sharpen.Runtime.Substring(prefix, lastSlash + 1);
                doc = new Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceDocument(id);
            }
            doc.SetPrefix(prefix);
            //
            // read the raw byte stream, preferring a ".truecase" variant if present
            //
            string trueCasedFileName = prefix + OrigExt + ".truecase";

            if ((new File(trueCasedFileName).Exists()))
            {
                // NOTE(review): informational message logged at Severe level -- confirm intent
                mLog.Severe("Using truecased file: " + trueCasedFileName);
                doc.ReadRawBytes(trueCasedFileName);
            }
            else
            {
                doc.ReadRawBytes(prefix + OrigExt);
            }
            //
            // read the AceTokens
            //
            int offsetToSubtract = 0;
            IList <IList <AceToken> > sentences = AceSentenceSegmenter.TokenizeAndSegmentSentences(prefix);

            doc.SetSentences(sentences);
            foreach (IList <AceToken> sentence in sentences)
            {
                foreach (AceToken token in sentence)
                {
                    // the running offset returned by one token is fed into the next
                    offsetToSubtract = token.AdjustPhrasePositions(offsetToSubtract, token.GetLiteral());
                    doc.AddToken(token);
                }
            }
            //
            // match char sequences to phrases
            //
            doc.MatchCharSeqs(prefix);
            //
            // construct the mSentenceEntityMentions matrix
            //
            ICollection <string> entityKeys = doc.mEntityMentions.Keys;
            int sentence_1;

            foreach (string key in entityKeys)
            {
                AceEntityMention em = doc.mEntityMentions[key];
                sentence_1 = doc.mTokens[em.GetHead().GetTokenStart()].GetSentence();
                // adjust the number of rows if necessary (all three matrices grow together)
                while (sentence_1 >= doc.mSentenceEntityMentions.Count)
                {
                    doc.mSentenceEntityMentions.Add(new List <AceEntityMention>());
                    doc.mSentenceRelationMentions.Add(new List <AceRelationMention>());
                    doc.mSentenceEventMentions.Add(new List <AceEventMention>());
                }
                // store the entity mentions in increasing order:
                // (a) of the start position of their head
                // (b) if start is the same, in increasing order of the head end
                List <AceEntityMention> sentEnts = doc.mSentenceEntityMentions[sentence_1];
                int entInsertAt = sentEnts.Count;
                for (int i = 0; i < sentEnts.Count; i++)
                {
                    AceEntityMention crt = sentEnts[i];
                    if ((crt.GetHead().GetTokenStart() > em.GetHead().GetTokenStart()) || (crt.GetHead().GetTokenStart() == em.GetHead().GetTokenStart() && crt.GetHead().GetTokenEnd() > em.GetHead().GetTokenEnd()))
                    {
                        entInsertAt = i;
                        break;
                    }
                }
                // List<T>.Add has no (index, item) overload; Insert appends when index == Count
                sentEnts.Insert(entInsertAt, em);
            }
            //
            // construct the mSentenceRelationMentions matrix
            //
            ICollection <string> relKeys = doc.mRelationMentions.Keys;

            foreach (string key_1 in relKeys)
            {
                AceRelationMention rm = doc.mRelationMentions[key_1];
                sentence_1 = doc.mTokens[rm.GetArg(0).GetHead().GetTokenStart()].GetSentence();
                //
                // no need to adjust the number of rows: was done above
                //
                // store the relation mentions in increasing order
                // (a) of their minimum token start, or
                // (b) if start is the same, in increasing order of ends
                List <AceRelationMention> sentRels = doc.mSentenceRelationMentions[sentence_1];
                int relInsertAt = sentRels.Count;
                for (int i = 0; i < sentRels.Count; i++)
                {
                    AceRelationMention crt = sentRels[i];
                    if ((crt.GetMinTokenStart() > rm.GetMinTokenStart()) || (crt.GetMinTokenStart() == rm.GetMinTokenStart() && crt.GetMaxTokenEnd() > rm.GetMaxTokenEnd()))
                    {
                        relInsertAt = i;
                        break;
                    }
                }
                sentRels.Insert(relInsertAt, rm);
            }
            //
            // construct the mSentenceEventMentions matrix
            //
            ICollection <string> eventKeys = doc.mEventMentions.Keys;

            foreach (string key_2 in eventKeys)
            {
                AceEventMention ev = doc.mEventMentions[key_2];
                sentence_1 = doc.mTokens[ev.GetMinTokenStart()].GetSentence();

                /*
                 * adjust the number of rows if necessary -- if you're wondering why we do
                 * this here again, (after we've done it for entities) it's because we can
                 * have an event with no entities near the end of the document and thus
                 * won't have created rows in mSentence*Mentions
                 */
                while (sentence_1 >= doc.mSentenceEntityMentions.Count)
                {
                    doc.mSentenceEntityMentions.Add(new List <AceEntityMention>());
                    doc.mSentenceRelationMentions.Add(new List <AceRelationMention>());
                    doc.mSentenceEventMentions.Add(new List <AceEventMention>());
                }
                // store the event mentions in increasing order
                // (a) of their minimum token start, or
                // (b) if start is the same, in increasing order of ends
                // NOTE(review): an earlier comment also said event mentions with no
                // arguments come first; no explicit check for that is visible here --
                // presumably it depends on what GetMinTokenStart() returns; confirm.
                List <AceEventMention> sentEvents = doc.mSentenceEventMentions[sentence_1];
                int evInsertAt = sentEvents.Count;
                for (int i = 0; i < sentEvents.Count; i++)
                {
                    AceEventMention crt = sentEvents[i];
                    if ((crt.GetMinTokenStart() > ev.GetMinTokenStart()) || (crt.GetMinTokenStart() == ev.GetMinTokenStart() && crt.GetMaxTokenEnd() > ev.GetMaxTokenEnd()))
                    {
                        evInsertAt = i;
                        break;
                    }
                }
                sentEvents.Insert(evInsertAt, ev);
            }
            return doc;
        }