コード例 #1
0
        /// <summary>
        /// Converts an ACE entity mention via the four-argument overload and then
        /// stamps the given coreference ID on the result.
        /// </summary>
        /// <param name="entityMention">ACE entity mention to convert</param>
        /// <param name="docId">ID of the document containing the mention</param>
        /// <param name="sentence">Sentence the mention belongs to</param>
        /// <param name="tokenOffset">Token offset forwarded to the four-argument overload</param>
        /// <param name="corefID">Coreference ID to set on the converted mention</param>
        /// <returns>The converted mention, with its coreference ID set</returns>
        private EntityMention ConvertAceEntityMention(AceEntityMention entityMention, string docId, ICoreMap sentence, int tokenOffset, string corefID)
        {
            EntityMention result = ConvertAceEntityMention(entityMention, docId, sentence, tokenOffset);
            result.SetCorefID(corefID);
            return result;
        }
コード例 #2
0
        // nothing to do by default
        /// <summary>
        /// Converts NamedEntityTagAnnotation tags into
        /// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
        /// objects. Each maximal run of consecutive tokens whose tag equals
        /// <paramref name="nerTag"/> becomes one mention; the mention's extent and
        /// head span both cover the whole run.
        /// </summary>
        /// <remarks>
        /// Tokens that carry no NamedEntityTagAnnotation are treated as non-matching
        /// rather than causing a NullReferenceException. If the sentence has no
        /// entity mention list yet, a new one is created, mirroring
        /// MakeAnnotationFromAllNERTags.
        /// </remarks>
        /// <param name="sentence">A sentence, ideally annotated with NamedEntityTagAnnotation</param>
        /// <param name="nerTag">The name of the NER tag to copy, e.g. "DATE".</param>
        /// <param name="entityType">
        /// The type of the
        /// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
        /// objects created
        /// </param>
        public virtual void MakeAnnotationFromGivenNERTag(ICoreMap sentence, string nerTag, string entityType)
        {
            IList <CoreLabel>     words    = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));

            System.Diagnostics.Debug.Assert(words != null);
            if (mentions == null)
            {
                // Debug.Assert is compiled out of release builds, so fall back to an
                // empty list instead of failing later on mentions.Add.
                mentions = new List <EntityMention>();
            }
            for (int start = 0; start < words.Count; start++)
            {
                int end;
                // find the first token at or after start that isn't tagged nerTag
                for (end = start; end < words.Count; end++)
                {
                    string ne = words[end].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                    // an untagged token (null) ends the run just like a different tag
                    if (ne == null || !ne.Equals(nerTag))
                    {
                        break;
                    }
                }
                if (end > start)
                {
                    // found a match! tokens [start, end) all carry nerTag
                    EntityMention m = entityMentionFactory.ConstructEntityMention(EntityMention.MakeUniqueId(), sentence, new Span(start, end), new Span(start, end), entityType, null, null);
                    logger.Info("Created " + entityType + " entity mention: " + m);
                    // resume after the run; the loop's start++ moves to index end
                    start = end - 1;
                    mentions.Add(m);
                }
            }
            sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), mentions);
        }
コード例 #3
0
        /// <summary>
        /// Serializes an entity mention to an <c>entity</c> XML element.
        /// </summary>
        /// <remarks>
        /// The element carries an <c>id</c> attribute and these children, in order:
        /// <c>type</c>, optional <c>normalized</c>, optional <c>subtype</c>,
        /// <c>span</c> (head token start/end attributes) and the element produced by
        /// MakeProbabilitiesElement. Previously the <c>type</c> element was built but
        /// never attached; the type text was appended directly to <c>entity</c>.
        /// </remarks>
        /// <param name="entity">Entity mention to serialize</param>
        /// <param name="curNS">Namespace URI used for every created element</param>
        /// <returns>The populated <c>entity</c> element</returns>
        private static Element ToXML(EntityMention entity, string curNS)
        {
            Element top = new Element("entity", curNS);

            top.AddAttribute(new Attribute("id", entity.GetObjectId()));
            Element type = new Element("type", curNS);

            type.AppendChild(entity.GetType());
            // attach the <type> element itself, not a second copy of the raw text
            top.AppendChild(type);
            if (entity.GetNormalizedName() != null)
            {
                Element nm = new Element("normalized", curNS);
                nm.AppendChild(entity.GetNormalizedName());
                top.AppendChild(nm);
            }
            if (entity.GetSubType() != null)
            {
                Element subtype = new Element("subtype", curNS);
                subtype.AppendChild(entity.GetSubType());
                top.AppendChild(subtype);
            }
            Element span = new Element("span", curNS);

            // int has no static ToString(int); format the values culture-invariantly
            // because this is machine-readable output
            span.AddAttribute(new Attribute("start", entity.GetHeadTokenStart().ToString(System.Globalization.CultureInfo.InvariantCulture)));
            span.AddAttribute(new Attribute("end", entity.GetHeadTokenEnd().ToString(System.Globalization.CultureInfo.InvariantCulture)));
            top.AppendChild(span);
            top.AppendChild(MakeProbabilitiesElement(entity, curNS));
            return(top);
        }
コード例 #4
0
        /// <summary>
        /// Builds an entity mention covering tokens [start, end) of a sentence and
        /// assigns its type a probability of 1.0.
        /// </summary>
        /// <param name="sentence">Sentence containing the mention</param>
        /// <param name="start">Start token index of the span</param>
        /// <param name="end">End token index of the span</param>
        /// <param name="label">
        /// Entity label; a leading "B-" or "I-" marker is stripped to obtain the type
        /// </param>
        /// <param name="identifier">Identifier passed to the mention factory</param>
        /// <returns>The newly constructed mention</returns>
        public virtual EntityMention MakeEntityMention(ICoreMap sentence, int start, int end, string label, string identifier)
        {
            Span tokenSpan = new Span(start, end);

            // Labels may carry a two-character BIO marker that is not part of the type.
            bool hasBioMarker = label.StartsWith("B-") || label.StartsWith("I-");
            string entityType = hasBioMarker ? Sharpen.Runtime.Substring(label, 2) : label;
            // TODO: add support for subtypes! (needed at least in ACE)
            string entitySubtype = null;

            // The same span serves as both extent and head.
            EntityMention mention = entityMentionFactory.ConstructEntityMention(identifier, sentence, tokenSpan, tokenSpan, entityType, entitySubtype, null);

            ICounter <string> typeProbabilities = new ClassicCounter <string>();
            typeProbabilities.SetCount(mention.GetType(), 1.0);
            mention.SetTypeProbabilities(typeProbabilities);
            return mention;
        }
コード例 #5
0
        /// <summary>
        /// Creates an entity mention for tokens [start, end) and appends it to
        /// <paramref name="entities"/>.
        /// </summary>
        /// <param name="sentence">Sentence containing the mention</param>
        /// <param name="start">Start token index; asserted to be non-negative</param>
        /// <param name="end">End token index</param>
        /// <param name="label">Entity label for the mention</param>
        /// <param name="entities">List that receives the new mention; its current size feeds the identifier</param>
        /// <param name="sentCount">Sentence counter used when building the identifier</param>
        public virtual void MakeEntityMention(ICoreMap sentence, int start, int end, string label, IList <EntityMention> entities, int sentCount)
        {
            System.Diagnostics.Debug.Assert(start >= 0);

            // The identifier is computed before the list grows, so it reflects the mention's index.
            string mentionId = MakeEntityMentionIdentifier(sentence, sentCount, entities.Count);
            entities.Add(MakeEntityMention(sentence, start, end, label, mentionId));
        }
コード例 #6
0
        /// <summary>
        /// Builds the label for a mention: its type, followed by "-" and the subtype
        /// when subtypes are enabled and the mention has one.
        /// </summary>
        /// <param name="m">Mention to label</param>
        /// <returns>Either "type" or "type-subtype"</returns>
        private string MakeLabel(EntityMention m)
        {
            string typeLabel = m.GetType();

            if (!useSubTypes || m.GetSubType() == null)
            {
                return typeLabel;
            }
            return typeLabel + "-" + m.GetSubType();
        }
コード例 #7
0
        /// <summary>
        /// Converts an
        /// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Domains.Ace.Reader.AceEntityMention"/>
        /// into an
        /// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
        /// whose spans are relative to the sentence. Out-of-range extent or head
        /// boundaries are logged and clamped to the sentence, and the head span is
        /// clamped to lie inside the extent.
        /// </summary>
        /// <param name="entityMention">The ACE entity mention to convert</param>
        /// <param name="docId">ID of the document containing this entity mention</param>
        /// <param name="sentence">Sentence containing the mention; its token list bounds the spans</param>
        /// <param name="tokenOffset">
        /// Subtracted from the ACE token positions: the ace.reader stores absolute
        /// token offsets from the beginning of the document, but we need offsets
        /// from the beginning of the sentence
        /// </param>
        /// <returns>
        /// The mention as an
        /// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
        /// </returns>
        private EntityMention ConvertAceEntityMention(AceEntityMention entityMention, string docId, ICoreMap sentence, int tokenOffset)
        {
            AceCharSeq extentSeq = entityMention.GetExtent();
            AceCharSeq headSeq   = entityMention.GetHead();

            // In ace.reader the end token position is inclusive, in our setup it is
            // exclusive, hence the +1 on every end position.
            int extStart = extentSeq.GetTokenStart() - tokenOffset;
            int extEnd   = extentSeq.GetTokenEnd() - tokenOffset + 1;

            if (extStart < 0)
            {
                logger.Severe("READER ERROR: Invalid extent start " + extStart + " for entity mention " + entityMention.GetId() + " in document " + docId + " in sentence " + sentence);
                logger.Severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
                extStart = 0;
            }

            int sentenceLength = sentence.Get(typeof(CoreAnnotations.TokensAnnotation)).Count;

            if (extEnd > sentenceLength)
            {
                logger.Severe("READER ERROR: Invalid extent end " + extEnd + " for entity mention " + entityMention.GetId() + " in document " + docId + " in sentence " + sentence);
                logger.Severe("This may happen due to incorrect EOS detection. Adjusting entity extent.");
                extEnd = sentenceLength;
            }

            int headStart = headSeq.GetTokenStart() - tokenOffset;
            int headEnd   = headSeq.GetTokenEnd() - tokenOffset + 1;

            if (headStart < 0)
            {
                logger.Severe("READER ERROR: Invalid head start " + headStart + " for entity mention " + entityMention.GetId() + " in document " + docId + " in sentence " + sentence);
                logger.Severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
                headStart = 0;
            }
            if (headEnd > sentenceLength)
            {
                logger.Severe("READER ERROR: Invalid head end " + headEnd + " for entity mention " + entityMention.GetId() + " in document " + docId + " in sentence " + sentence);
                logger.Severe("This may happen due to incorrect EOS detection. Adjusting entity head span.");
                headEnd = sentenceLength;
            }

            // The head must lie inside the extent; adjust due to possible incorrect EOS detection.
            headStart = System.Math.Max(headStart, extStart);
            headEnd   = System.Math.Min(headEnd, extEnd);
            System.Diagnostics.Debug.Assert(headStart < headEnd);

            Span extentSpan = new Span(extStart, extEnd);
            Span headSpan   = new Span(headStart, headEnd);

            return new EntityMention(entityMention.GetId(), sentence, extentSpan, headSpan, entityMention.GetParent().GetType(), entityMention.GetParent().GetSubtype(), entityMention.GetLdctype());
        }
コード例 #8
0
        /// <summary>
        /// Sets the head word position and head span for an entity, given the parse
        /// tree for the sentence containing the entity.
        /// </summary>
        /// <remarks>
        /// This code is no longer used, but it has been kept around (at least for
        /// now) as reference when modifying preProcessSentences().
        /// Note that this method treats GetExtentTokenEnd() as an inclusive index.
        /// </remarks>
        /// <param name="entity">Entity whose head token position and head span are set</param>
        /// <param name="tree">Parse tree of the sentence containing the entity</param>
        private void SetHeadWord(EntityMention entity, Tree tree)
        {
            IList <Tree> leaves        = tree.GetLeaves();
            Tree         argRoot       = tree.JoinNode(leaves[entity.GetExtentTokenStart()], leaves[entity.GetExtentTokenEnd()]);
            Tree         headWordNode  = argRoot.HeadTerminal(headFinder);
            int          headWordIndex = GetIndexByObjectEquality(leaves, headWordNode);

            // If the extent ends in punctuation and the head fell outside the extent,
            // retry with the trailing punctuation token excluded.
            if (StringUtils.IsPunct(leaves[entity.GetExtentTokenEnd()].Label().Value().Trim()) && (headWordIndex >= entity.GetExtentTokenEnd() || headWordIndex < entity.GetExtentTokenStart()))
            {
                argRoot       = tree.JoinNode(leaves[entity.GetExtentTokenStart()], leaves[entity.GetExtentTokenEnd() - 1]);
                headWordNode  = argRoot.HeadTerminal(headFinder);
                headWordIndex = GetIndexByObjectEquality(leaves, headWordNode);
                if (headWordIndex >= entity.GetExtentTokenStart() && headWordIndex <= entity.GetExtentTokenEnd() - 1)
                {
                    entity.SetHeadTokenPosition(headWordIndex);
                    entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
                }
            }
            if (headWordIndex >= entity.GetExtentTokenStart() && headWordIndex <= entity.GetExtentTokenEnd())
            {
                entity.SetHeadTokenPosition(headWordIndex);
                entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
            }
            else
            {
                // Re-parse the argument words by themselves
                // Get the list of words in the arg by looking at the leaves between
                // GetExtentTokenStart() and GetExtentTokenEnd() inclusive
                IList <string> argWords = new List <string>();
                for (int i = entity.GetExtentTokenStart(); i <= entity.GetExtentTokenEnd(); i++)
                {
                    argWords.Add(leaves[i].Label().Value());
                }
                if (StringUtils.IsPunct(argWords[argWords.Count - 1]))
                {
                    // Drop the trailing punctuation by index. IList<string>.Remove takes
                    // an element, not an index, so RemoveAt is the required call here.
                    argWords.RemoveAt(argWords.Count - 1);
                }
                Tree argTree = ParseStrings(argWords);
                headWordNode  = argTree.HeadTerminal(headFinder);
                // index within the local parse, shifted back to sentence coordinates
                headWordIndex = GetIndexByObjectEquality(argTree.GetLeaves(), headWordNode) + entity.GetExtentTokenStart();
                entity.SetHeadTokenPosition(headWordIndex);
                entity.SetHeadTokenSpan(new Span(headWordIndex, headWordIndex + 1));
            }
        }
コード例 #9
0
        /// <summary>
        /// Determines the head token index of an entity mention and stores it on the
        /// mention. A previously assigned syntactic head position is returned as-is.
        /// </summary>
        /// <param name="ent">The entity mention</param>
        /// <param name="tree">The Tree for the entire sentence in which it occurs.</param>
        /// <param name="tokens">The tokens of the sentence in which it occurs</param>
        /// <param name="setHeadSpan">Whether to also set the head span to the single head token.</param>
        /// <returns>The index of the entity head</returns>
        public virtual int AssignSyntacticHead(EntityMention ent, Tree tree, IList <CoreLabel> tokens, bool setHeadSpan)
        {
            int knownHead = ent.GetSyntacticHeadTokenPosition();

            if (knownHead != -1)
            {
                return knownHead;
            }
            logger.Finest("Finding syntactic head for entity: " + ent + " in tree: " + tree.ToString());
            logger.Finest("Flat sentence is: " + tokens);

            Tree syntacticHead = null;

            try
            {
                syntacticHead = FindSyntacticHead(ent, tree, tokens);
            }
            catch (Exception e)
            {
                // Best effort: a failure here falls through to the right-most token below.
                logger.Severe("WARNING: failed to parse sentence. Will continue with the right-most head heuristic: " + SentenceToString(tokens));
                Sharpen.Runtime.PrintStackTrace(e);
            }

            // Default to the last token of the extent.
            int headPos = ent.GetExtentTokenEnd() - 1;

            if (syntacticHead == null)
            {
                logger.Fine("WARNING: failed to find syntactic head for entity: " + ent + " in tree: " + tree);
                logger.Fine("Fallback strategy: will set head to last token in mention: " + tokens[headPos]);
            }
            else
            {
                CoreLabel headLabel = (CoreLabel)syntacticHead.Label();
                headPos = headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
            }

            ent.SetHeadTokenPosition(headPos);
            if (setHeadSpan)
            {
                // Make the head span match exactly the syntactic head; this is needed
                // for some corpora where the head span is not given.
                ent.SetHeadTokenSpan(new Span(headPos, headPos + 1));
            }
            return headPos;
        }
コード例 #10
0
        /// <summary>
        /// Finds the syntactic head of the given entity mention. This is the original
        /// version of
        /// <see cref="FindSyntacticHead(Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention, Edu.Stanford.Nlp.Trees.Tree, System.Collections.Generic.IList{E})"/>
        /// before Chris's modifications; there's no good reason to use it except for
        /// producing historical results.
        /// </summary>
        /// <param name="ent">The entity mention</param>
        /// <param name="root">The Tree for the entire sentence in which it occurs.</param>
        /// <param name="tokens">The tokens of the sentence in which it occurs</param>
        /// <returns>
        /// The tree object corresponding to the head. This MUST be a child of root.
        /// It will be a leaf in the parse tree.
        /// </returns>
        public virtual Tree OriginalFindSyntacticHead(EntityMention ent, Tree root, IList <CoreLabel> tokens)
        {
            logger.Fine("Searching for tree matching " + ent);

            // Case 1: some constituent of the sentence tree spans exactly the mention extent.
            Tree spanMatch = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());

            if (spanMatch != null)
            {
                logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanMatch));
                return SafeHead(spanMatch);
            }

            // Case 2: no exact match, so parse only the tokens of the extent.
            int extentStart = ent.GetExtentTokenStart();
            int extentEnd   = ent.GetExtentTokenEnd();
            IList <CoreLabel> extentTokens = new List <CoreLabel>();

            for (int idx = extentStart; idx < extentEnd; idx++)
            {
                extentTokens.Add(tokens[idx]);
            }

            Tree localParse = Parse(extentTokens);

            logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
            ConvertToCoreLabels(localParse);
            // Index the local parse from the extent start so its spans line up with the sentence.
            localParse.IndexSpans(extentStart);

            Tree localHead = SafeHead(localParse);

            System.Diagnostics.Debug.Assert(localHead != null);

            // localHead is a node of the local extent parse; look up the node with the
            // same begin/end indices in the main tree.
            CoreLabel headLabel = (CoreLabel)localHead.Label();
            Tree      mainHead  = FindTreeWithSpan(root, headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), headLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation)));

            System.Diagnostics.Debug.Assert(mainHead != null);
            return mainHead;
        }
コード例 #11
0
        /// <summary>
        /// Converts NamedEntityTagAnnotation tags into
        /// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
        /// objects. Each maximal run of consecutive tokens sharing the same
        /// non-background tag becomes one mention whose extent and head span both
        /// cover the whole run.
        /// </summary>
        /// <remarks>
        /// Tokens without a NamedEntityTagAnnotation are treated like background
        /// tokens instead of causing a NullReferenceException. If the sentence has no
        /// entity mention list yet, a new one is created and stored.
        /// </remarks>
        /// <param name="sentence">A sentence annotated with NamedEntityTagAnnotation</param>
        public virtual void MakeAnnotationFromAllNERTags(ICoreMap sentence)
        {
            IList <CoreLabel>     words    = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));

            System.Diagnostics.Debug.Assert(words != null);
            if (mentions == null)
            {
                this.logger.Info("mentions are null");
                mentions = new List <EntityMention>();
            }
            for (int start = 0; start < words.Count; start++)
            {
                int end;
                // tag shared by the run that begins at start; null until a tagged token is seen
                string lastneTag = null;
                // find the first token at or after start that ends the run
                for (end = start; end < words.Count; end++)
                {
                    string ne = words[end].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                    // the run ends at an untagged token, a background token, or a different tag
                    if (ne == null || ne.Equals(SeqClassifierFlags.DefaultBackgroundSymbol) || (lastneTag != null && !ne.Equals(lastneTag)))
                    {
                        break;
                    }
                    lastneTag = ne;
                }
                if (end > start)
                {
                    // found a match! tokens [start, end) all carry lastneTag
                    string        entityType = this.GetEntityTypeForTag(lastneTag);
                    EntityMention m          = entityMentionFactory.ConstructEntityMention(EntityMention.MakeUniqueId(), sentence, new Span(start, end), new Span(start, end), entityType, null, null);
                    //TODO: changed entityType in the above sentence to nerTag - Sonal
                    logger.Info("Created " + entityType + " entity mention: " + m);
                    // resume after the run; the loop's start++ moves to index end
                    start = end - 1;
                    mentions.Add(m);
                }
            }
            sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), mentions);
        }
コード例 #12
0
        /// <summary>
        /// Reads a single ACE document and converts each of its sentences into an
        /// ICoreMap carrying the tokens plus the converted entity, relation and
        /// event mentions.
        /// </summary>
        /// <remarks>
        /// You probably should call parse() instead of this method. Single-token
        /// "sentences" that are just an SGML tag are skipped, but their tokens still
        /// count towards the running token offset.
        /// </remarks>
        /// <param name="prefix">
        /// prefix of ACE filename to read (e.g.
        /// "/u/mcclosky/scr/data/ACE2005/english_test/bc/CNN_CF_20030827.1630.01"
        /// ) (no ".apf.xml" extension)
        /// </param>
        /// <param name="corpus">Not read by this method</param>
        /// <returns>one ICoreMap per retained sentence, in document order</returns>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="Org.Xml.Sax.SAXException"/>
        /// <exception cref="Javax.Xml.Parsers.ParserConfigurationException"/>
        private IList <ICoreMap> ReadDocument(string prefix, Annotation corpus)
        {
            logger.Info("Reading document: " + prefix);
            IList <ICoreMap> convertedSentences = new List <ICoreMap>();

            // ACE2004 documents are parsed with the version passed explicitly.
            AceDocument aceDocument = aceVersion.Equals("ACE2004")
                ? AceDocument.ParseDocument(prefix, false, aceVersion)
                : AceDocument.ParseDocument(prefix, false);
            string docId = aceDocument.GetId();

            // Maps entity mention ID strings to their EntityMention counterparts, so
            // relation and event mentions can refer to converted entity mentions.
            IDictionary <string, EntityMention> mentionsById = Generics.NewHashMap();

            // Number of document tokens that precede the current sentence.
            int tokenOffset = 0;

            for (int sentenceIndex = 0; sentenceIndex < aceDocument.GetSentenceCount(); sentenceIndex++)
            {
                IList <AceToken>  aceTokens = aceDocument.GetSentence(sentenceIndex);
                IList <CoreLabel> words     = new List <CoreLabel>();
                StringBuilder     joined    = new StringBuilder();

                for (int i = 0; i < aceTokens.Count; i++)
                {
                    AceToken  aceToken = aceTokens[i];
                    CoreLabel word     = new CoreLabel();
                    word.SetWord(aceToken.GetLiteral());
                    word.Set(typeof(CoreAnnotations.ValueAnnotation), word.Word());
                    word.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), aceToken.GetByteStart());
                    word.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), aceToken.GetByteEnd());
                    words.Add(word);

                    // Token literals are joined with single spaces to form the sentence text.
                    if (i > 0)
                    {
                        joined.Append(" ");
                    }
                    joined.Append(aceToken.GetLiteral());
                }

                // skip "sentences" that are really just SGML tags (which come from using the RobustTokenizer)
                if (words.Count == 1)
                {
                    string onlyWord  = words[0].Word();
                    bool   isSgmlTag = onlyWord.StartsWith("<") && onlyWord.EndsWith(">");
                    if (isSgmlTag)
                    {
                        tokenOffset += aceTokens.Count;
                        continue;
                    }
                }

                string   sentenceText = joined.ToString();
                ICoreMap sentence     = new Annotation(sentenceText);
                sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
                sentence.Set(typeof(CoreAnnotations.TokensAnnotation), words);
                logger.Info("Reading sentence: \"" + sentenceText + "\"");

                IList <AceEntityMention>   aceEntityMentions   = aceDocument.GetEntityMentions(sentenceIndex);
                IList <AceRelationMention> aceRelationMentions = aceDocument.GetRelationMentions(sentenceIndex);
                IList <AceEventMention>    aceEventMentions    = aceDocument.GetEventMentions(sentenceIndex);

                // convert entity mentions
                foreach (AceEntityMention aceEntityMention in aceEntityMentions)
                {
                    // The coreference ID is the ID of the first entity whose mention list
                    // contains this mention; it stays empty when there is none.
                    string corefID = string.Empty;
                    foreach (string entityID in aceDocument.GetKeySetEntities())
                    {
                        AceEntity candidate = aceDocument.GetEntity(entityID);
                        if (!candidate.GetMentions().Contains(aceEntityMention))
                        {
                            continue;
                        }
                        corefID = entityID;
                        break;
                    }

                    EntityMention entity = ConvertAceEntityMention(aceEntityMention, docId, sentence, tokenOffset, corefID);
                    entityCounts.IncrementCount(entity.GetType());
                    logger.Info("CONVERTED MENTION HEAD SPAN: " + entity.GetHead());
                    logger.Info("CONVERTED ENTITY MENTION: " + entity);
                    AnnotationUtils.AddEntityMention(sentence, entity);
                    mentionsById[aceEntityMention.GetId()] = entity;
                }
                // TODO: make Entity objects as needed

                // convert relation mentions
                foreach (AceRelationMention aceRelationMention in aceRelationMentions)
                {
                    RelationMention relation = ConvertAceRelationMention(aceRelationMention, docId, sentence, mentionsById);
                    if (relation == null)
                    {
                        continue;
                    }
                    relationCounts.IncrementCount(relation.GetType());
                    logger.Info("CONVERTED RELATION MENTION: " + relation);
                    AnnotationUtils.AddRelationMention(sentence, relation);
                }
                // TODO: make Relation objects

                // convert event mentions
                foreach (AceEventMention aceEventMention in aceEventMentions)
                {
                    EventMention evt = ConvertAceEventMention(aceEventMention, docId, sentence, mentionsById, tokenOffset);
                    if (evt == null)
                    {
                        continue;
                    }
                    eventCounts.IncrementCount(evt.GetType());
                    logger.Info("CONVERTED EVENT MENTION: " + evt);
                    AnnotationUtils.AddEventMention(sentence, evt);
                }
                // TODO: make Event objects

                convertedSentences.Add(sentence);
                tokenOffset += aceTokens.Count;
            }
            return convertedSentences;
        }
コード例 #13
0
        /// <summary>
        /// Finds the syntactic head of the given entity mention. Delegates to
        /// OriginalFindSyntacticHead when the new head finder is disabled.
        /// </summary>
        /// <param name="ent">The entity mention</param>
        /// <param name="root">The Tree for the entire sentence in which it occurs.</param>
        /// <param name="tokens">The tokens of the sentence in which it occurs</param>
        /// <returns>
        /// The tree object corresponding to the head. This MUST be a child of root.
        /// It will be a leaf in the parse tree.
        /// </returns>
        public virtual Tree FindSyntacticHead(EntityMention ent, Tree root, IList <CoreLabel> tokens)
        {
            if (!useNewHeadFinder)
            {
                return OriginalFindSyntacticHead(ent, root, tokens);
            }
            logger.Fine("Searching for tree matching " + ent);

            // Case 1: some constituent of the sentence tree spans exactly the mention extent.
            Tree spanMatch = FindTreeWithSpan(root, ent.GetExtentTokenStart(), ent.GetExtentTokenEnd());

            if (spanMatch != null)
            {
                logger.Fine("Mention \"" + ent + "\" mapped to tree: " + PrintTree(spanMatch));
                return SafeHead(spanMatch);
            }

            // Case 2: no exact match. Parse the extent embedded in an artificial
            // sentence context ("It was <extent> ."), so as to make the parser work better.
            const int addedWords = 2;   // the "It" and "was" tokens placed before the extent
            int droppedDashes = 0;      // dash tokens left out of the local parse
            int extentStart   = ent.GetExtentTokenStart();
            int extentEnd     = ent.GetExtentTokenEnd();
            IList <CoreLabel> localTokens = new List <CoreLabel>();

            localTokens.Add(InitCoreLabel("It"));
            localTokens.Add(InitCoreLabel("was"));
            for (int idx = extentStart; idx < extentEnd; idx++)
            {
                // Add everything except separated dashes! The separated dashes mess with the parser too badly.
                CoreLabel token = tokens[idx];
                if ("-".Equals(token.Word()))
                {
                    droppedDashes++;
                    continue;
                }
                localTokens.Add(token);
            }
            localTokens.Add(InitCoreLabel("."));

            // Constrain the parse to the part we're interested in: start after the
            // added words and stop before the final period. Any kind of constituent
            // is accepted, since there are VP and S ones as well as nominal ones.
            ParserConstraint         extentConstraint = new ParserConstraint(addedWords, localTokens.Count - 1, ".*");
            IList <ParserConstraint> constraints      = Java.Util.Collections.SingletonList(extentConstraint);
            Tree localParse = Parse(localTokens, constraints);

            logger.Fine("No exact match found. Local parse:\n" + localParse.PennString());
            ConvertToCoreLabels(localParse);
            // The local parse has addedWords extra words at the beginning, so shift the
            // indexing to make the extent's first token land on its sentence index.
            localParse.IndexSpans(extentStart - addedWords);

            Tree extentSubtree = FindPartialSpan(localParse, extentStart);
            Tree localHead     = SafeHead(extentSubtree);

            logger.Fine("Head is: " + localHead);
            System.Diagnostics.Debug.Assert(localHead != null);

            // localHead is a node of the local extent parse; find the corresponding
            // leaf in the main tree. Because dashes were dropped, its index in the
            // main tree will be >= its index in the extent parse.
            CoreLabel headLabel = (CoreLabel)localHead.Label();
            Tree      mainHead  = FunkyFindLeafWithApproximateSpan(root, headLabel.Value(), headLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), droppedDashes);

            if (mainHead != null)
            {
                logger.Fine("Chosen head: " + mainHead);
            }
            return mainHead;
        }
コード例 #14
0
        /// <summary>
        /// Runs named-entity recognition over one section: consecutive words that share the same
        /// entity flag are merged into one run, every run is emitted through <c>addText</c>, and
        /// runs whose flag denotes an entity are recorded as entity mentions.
        /// </summary>
        /// <param name="sectionIndex">Position of the section within the news item.</param>
        /// <param name="content">Space-separated "word/flag" tokens of the section.</param>
        private void splitEntity(int sectionIndex, string content)
        {
            if (content.Equals("") || content.Equals(" "))
            {
                return;
            }
            addMargin(); // indent the start of the paragraph

            string sentence   = "";   // text of the run currently being accumulated
            int    entityFlag = -1;   // index into Const.entityList of the current run, -1 when not listed
            int    wordIndex  = -1;   // index of the last word consumed so far

            foreach (string wordWithFlag in content.Split(' '))
            {
                if (wordWithFlag.Length == 0)
                {
                    // Consecutive or trailing spaces produce empty tokens; they carry no word.
                    continue;
                }

                string[] tempArray = wordWithFlag.Split('/');
                string   word      = tempArray[0];
                // A token without a '/' has no flag; treat it as "not an entity" instead of
                // failing with an IndexOutOfRangeException.
                int wordFlag = tempArray.Length > 1 ? Array.IndexOf(Const.entityList, tempArray[1]) : -1;

                if (sentence != "" && entityFlag == wordFlag)
                {
                    // Same flag as the current run: extend it.
                    sentence += word;
                }
                else
                {
                    if (sentence != "")
                    {
                        // The flag changed: flush the finished run before starting a new one.
                        addText(sentence, entityFlag);
                        recordEntityMention(sentence, entityFlag, wordIndex, sectionIndex);
                    }
                    entityFlag = wordFlag;
                    sentence   = word;
                }
                wordIndex++;
            }

            if (sentence != "")
            {
                addText(sentence, entityFlag);
                // The last run must be recorded too, otherwise an entity that ends the section is lost.
                recordEntityMention(sentence, entityFlag, wordIndex, sectionIndex);
                addLineBreak();
            }
        }

        /// <summary>
        /// Records a finished run as an entity mention when its flag denotes an entity, and queues a
        /// new <c>NamedEntity</c> in <c>entityMap</c> when the value is neither queued nor in the database.
        /// </summary>
        /// <param name="value">Text of the run.</param>
        /// <param name="entityFlag">Index of the run's flag in <c>Const.entityList</c>, or -1.</param>
        /// <param name="wordIndex">Index (within the section) of the last word of the run.</param>
        /// <param name="sectionIndex">Position of the section within the news item.</param>
        private void recordEntityMention(string value, int entityFlag, int wordIndex, int sectionIndex)
        {
            // NOTE(review): flag index 4 is deliberately excluded; its meaning is defined by
            // Const.entityList, which is not visible here.
            if (entityFlag < 0 || entityFlag == 4)
            {
                return;
            }

            var entityMention = new EntityMention();
            entityMention.indexInSection = wordIndex;
            entityMention.indexInNews    = wordIndex + sectionIndex;
            entityMention.newsId         = this.newsId;
            entityMention.value          = value;
            entityMentionList.Add(entityMention);

            if (!entityMap.ContainsKey(value))
            {
                var entity = DBHelper.db.Queryable <NamedEntity>().Where(it => it.value == value).First();
                if (entity == null)    // only keep entities that are not stored in the database yet
                {
                    entity       = new NamedEntity();
                    entity.value = value;
                    entityMap.Add(value, entity);
                }
            }
        }
コード例 #15
0
        /// <summary>
        /// Persists a document's annotations to disk: the document-level sentiment (".snt"),
        /// the sentence-level sentiments (".csv") and the user annotations in the format named by
        /// <c>doc.Type</c> ("default", "xml", "stanford" or "luis").
        /// </summary>
        /// <param name="doc">Document carrying the raw text, file name, sentiments and JSON annotations.</param>
        /// <returns>
        /// <c>true</c> when the annotation file was written; <c>false</c> when <c>doc.Type</c> is not a
        /// known format or writing the annotation file failed. Failures while writing the two
        /// sentiment files are traced and do not affect the result.
        /// </returns>
        private bool TransformAnnotationDocument(Models.Document doc)
        {
            string text = doc.RawText;
            string type = doc.Type;

            string user             = "******";
            string todayString      = DateTime.Today.ToString("MMddyyyy");
            string originalFilename = doc.FileName;

            //*****************************************************************************

            // Document-level sentiment: a single line in its own file (best effort).
            string sentiment            = doc.DocumentSentiment;
            var    newSentimentFilename = Path.ChangeExtension(originalFilename, ".snt");

            try
            {
                using (StreamWriter sentFile = OpenAnnotationOutputFile(todayString, user, newSentimentFilename))
                {
                    sentFile.WriteLine(sentiment);
                }
            }
            catch (Exception e)
            {
                // Deliberately non-fatal, but no longer silent.
                System.Diagnostics.Trace.TraceWarning("Could not write sentiment file " + newSentimentFilename + ": " + e);
            }

            //*****************************************************************************

            // Sentence-level sentiment: CSV with one "Sentiment,Sentence" row per sentence (best effort).
            List <string> senSentiment = doc.SentenceSentiment;
            List <string> docSentences = doc.Sentences;
            var           newSentenceSentimentFilename = Path.ChangeExtension(originalFilename, ".csv");

            try
            {
                using (StreamWriter sentFile = OpenAnnotationOutputFile(todayString, user, newSentenceSentimentFilename))
                using (var writer = new CsvWriter(sentFile))
                {
                    writer.Configuration.Delimiter = ",";

                    // Write the header
                    writer.WriteField("Sentiment");
                    writer.WriteField("Sentence");
                    writer.NextRecord();

                    for (int sen = 0; sen < senSentiment.Count; sen++)
                    {
                        var sentence = docSentences[sen];
                        var senSen   = senSentiment[sen];
                        writer.WriteField(senSen == null ? "Unknown" : senSen);
                        writer.WriteField(sentence);
                        writer.NextRecord();
                    }
                }
            }
            catch (Exception e)
            {
                // Deliberately non-fatal, but no longer silent.
                System.Diagnostics.Trace.TraceWarning("Could not write sentence sentiment file " + newSentenceSentimentFilename + ": " + e);
            }

            //*****************************************************************************

            // Parse the user-entered annotations; an empty payload deserializes to null.
            string            annotations       = string.IsNullOrEmpty(doc.Annotations) ? "" : doc.Annotations;
            List <Annotation> clientAnnotations = JsonConvert.DeserializeObject <List <Annotation> >(annotations);

            if (clientAnnotations != null)
            {
                // All writers below walk the text left to right, so order by start offset.
                clientAnnotations.Sort((first, second) => first.begin.CompareTo(second.begin));
            }

            // Write the annotations in the chosen format.
            switch (type)
            {
                case "default":
                {
                    // One EntityMention per line, rendered through its ToString().
                    var newFilename = Path.ChangeExtension(originalFilename, ".ann");
                    var ems         = new List <EntityMention>();
                    if (clientAnnotations != null)
                    {
                        foreach (Annotation clientAnnotation in clientAnnotations)
                        {
                            EntityMention em = new EntityMention();
                            em.begin = clientAnnotation.begin;
                            em.end   = clientAnnotation.end;
                            em.type  = clientAnnotation.type;
                            em.text  = text.Substring(clientAnnotation.begin, clientAnnotation.end - clientAnnotation.begin);
                            ems.Add(em);
                        }
                    }
                    return TryWriteAnnotationFile(todayString, user, newFilename, type, ems);
                }

                case "xml":
                {
                    // Raw text with every annotated span wrapped in <TYPE>...</TYPE>.
                    var newFilename     = Path.ChangeExtension(originalFilename, ".xml");
                    var markup          = new System.Text.StringBuilder();
                    int currentLocation = 0;
                    if (clientAnnotations != null)
                    {
                        foreach (Annotation clientAnnotation in clientAnnotations)
                        {
                            int    begin      = clientAnnotation.begin;
                            int    end        = clientAnnotation.end;
                            string entityType = clientAnnotation.type;
                            markup.Append(text.Substring(currentLocation, begin - currentLocation));
                            markup.Append("<" + entityType + ">");
                            markup.Append(text.Substring(begin, end - begin));
                            markup.Append("</" + entityType + ">");
                            currentLocation = end;
                        }
                    }
                    // Always emit the remaining text, so a document without annotations is
                    // still written out in full instead of as an empty body.
                    if (text != null)
                    {
                        markup.Append(text.Substring(currentLocation));
                    }
                    return TryWriteAnnotationFile(todayString, user, newFilename, type, new[] { markup.ToString() });
                }

                case "stanford":
                {
                    // One "token NER" line per token, with a blank line after each sentence.
                    var    newFilename            = Path.ChangeExtension(originalFilename, ".conll");
                    var    conll                  = new System.Text.StringBuilder();
                    int    clientAnnotationNumber = 0;
                    int    clientAnnotationSize   = 0;
                    int    clientAnnotationBegin  = Int32.MaxValue;
                    int    clientAnnotationEnd    = Int32.MaxValue;
                    string clientAnnotationType   = "";
                    if (clientAnnotations != null && clientAnnotations.Count > 0)
                    {
                        clientAnnotationSize  = clientAnnotations.Count;
                        clientAnnotationBegin = clientAnnotations[0].begin;
                        clientAnnotationEnd   = clientAnnotations[0].end;
                        clientAnnotationType  = clientAnnotations[0].type;
                    }
                    edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(text);
                    PipelineDispenser.StanfordPipeline.annotate(document);
                    List <CoreMap> sentences = JavaExtensions.ToList <CoreMap>((java.util.List)document.get(typeof(SentencesAnnotation)));
                    foreach (CoreMap sentence in sentences)
                    {
                        // Take the tokens of THIS sentence; asking the document would repeat
                        // every token of the document once per sentence.
                        List <CoreLabel> tokens = JavaExtensions.ToList <CoreLabel>((java.util.List)sentence.get(typeof(TokensAnnotation)));
                        foreach (CoreLabel token in tokens)
                        {
                            int    tokenBegin = token.beginPosition();
                            int    tokenEnd   = token.endPosition();
                            string chosenNer  = "O";
                            if (isContainedIn(tokenBegin, tokenEnd, clientAnnotationBegin, clientAnnotationEnd))
                            {
                                chosenNer = clientAnnotationType;
                                if (tokenEnd == clientAnnotationEnd)
                                {
                                    // Last token of the current annotation: move on to the next one.
                                    clientAnnotationNumber++;
                                    if (clientAnnotationNumber < clientAnnotationSize)
                                    {
                                        Annotation next = clientAnnotations[clientAnnotationNumber];
                                        clientAnnotationBegin = next.begin;
                                        clientAnnotationEnd   = next.end;
                                        clientAnnotationType  = next.type;
                                    }
                                }
                            }
                            conll.Append(token.value() + " " + chosenNer + Environment.NewLine);
                        }
                        conll.Append(Environment.NewLine);
                    }
                    return TryWriteAnnotationFile(todayString, user, newFilename, type, new[] { conll.ToString() });
                }

                case "luis":
                {
                    // One {"entity", "startPos", "endPos"} entry per annotation; endPos is inclusive.
                    var newFilename = Path.ChangeExtension(originalFilename, ".lou");
                    var entries     = new System.Text.StringBuilder();
                    if (clientAnnotations != null)
                    {
                        foreach (Annotation clientAnnotation in clientAnnotations)
                        {
                            entries.Append(
                                "{" +
                                "\"entity\": \"" + clientAnnotation.type
                                + "\", \"startPos\": " + clientAnnotation.begin
                                + ", \"endPos\": " + (clientAnnotation.end - 1)
                                + "}," + "\n"
                                );
                        }
                    }
                    return TryWriteAnnotationFile(todayString, user, newFilename, type, new[] { entries.ToString() });
                }

                default:
                    return false;
            }
        }

        /// <summary>
        /// Builds the output path "filesRoot + date + / + user + / + fileName" for the configured
        /// environment; in the Debug environment the path is mapped through the web server.
        /// </summary>
        /// <exception cref="InvalidOperationException">The "environment" app setting is neither Debug nor Release.</exception>
        private string ResolveAnnotationOutputPath(string dateFolder, string user, string fileName)
        {
            string environment  = ConfigurationManager.AppSettings["environment"];
            string relativePath = ConfigurationManager.AppSettings["filesRoot"] + dateFolder + "/" + user + "/" + fileName;

            if (environment == Debug)
            {
                return System.Web.HttpContext.Current.Server.MapPath(relativePath);
            }
            if (environment == Release)
            {
                return relativePath;
            }
            throw new InvalidOperationException("Unsupported \"environment\" app setting: " + environment);
        }

        /// <summary>
        /// Resolves the output path, creates its directory when missing and opens the file for
        /// writing, overwriting any existing content. The caller disposes the returned writer.
        /// </summary>
        private StreamWriter OpenAnnotationOutputFile(string dateFolder, string user, string fileName)
        {
            System.IO.FileInfo file = new System.IO.FileInfo(ResolveAnnotationOutputPath(dateFolder, user, fileName));
            file.Directory.Create();
            return new StreamWriter(file.FullName, false);
        }

        /// <summary>
        /// Writes an annotation file: the two-line comment/format header followed by one line per
        /// item of <paramref name="bodyLines"/>. Returns <c>false</c> (and traces the error) when
        /// anything goes wrong.
        /// </summary>
        private bool TryWriteAnnotationFile(string dateFolder, string user, string fileName, string format, System.Collections.IEnumerable bodyLines)
        {
            try
            {
                using (StreamWriter annotationFile = OpenAnnotationOutputFile(dateFolder, user, fileName))
                {
                    annotationFile.WriteLine("###THIS IS A COMMENT BLOCK###");
                    annotationFile.WriteLine("###FORMAT: " + format + " ###");
                    foreach (object line in bodyLines)
                    {
                        annotationFile.WriteLine(line);
                    }
                }
                return true;
            }
            catch (Exception e)
            {
                System.Diagnostics.Trace.TraceError("Could not write annotation file " + fileName + ": " + e);
                return false;
            }
        }
コード例 #16
0
        /// <summary>
        /// Loads a previously saved annotation file together with its sentiment file and returns
        /// both as JSON.
        /// </summary>
        /// <param name="location">
        /// Path of the annotation file. The sentiment is read from the same path with the
        /// extension replaced by ".snt".
        /// </param>
        /// <returns>
        /// A JSON array whose first element is the sentiment ("UNK" when the sentiment file cannot be
        /// read) and whose second element is the list of annotations; <c>null</c> when the annotation
        /// file cannot be read or parsed, or when its format is neither "default" nor "xml".
        /// </returns>
        public string OpenFile(string location)
        {
            List <object> annotationParts = new List <object>();

            // First, read the sentiment file. It is ok for it to not exist.
            try
            {
                var      sentimentLocation = Path.ChangeExtension(location, ".snt");
                string[] sentimentLines    = File.ReadAllLines(sentimentLocation);
                annotationParts.Add(sentimentLines[0]);
            }
            catch (Exception)
            {
                annotationParts.Add("UNK");
            }

            // Second, read the annotation file.
            // The configured entity types are needed to recognise inline tags.
            List <EntityType> entityTypes = fetchEntityTypesFromConfiguration();

            try
            {
                // Line 0 is the comment marker, line 1 names the format, the body starts at line 2.
                string[] allLines = File.ReadAllLines(location);
                string   format   = Regex.Match(allLines[1], @"###FORMAT: ([A-Za-z]+) ###").Groups[1].Value;

                if (format == "default")
                {
                    // One serialized EntityMention per line.
                    List <Annotation> annotations = new List <Annotation>();
                    for (int index = 2; index < allLines.Length; index++)
                    {
                        EntityMention em  = EntityMention.FromString(allLines[index]);
                        Annotation    ann = new Annotation();
                        ann.begin = em.begin;
                        ann.end   = em.end;
                        ann.type  = em.type;
                        ann.color = colorDict[ann.type];
                        annotations.Add(ann);
                    }
                    annotationParts.Add(annotations);
                    return(JsonConvert.SerializeObject(annotationParts));
                }
                else if (format == "xml")
                {
                    List <Annotation> annotations = new List <Annotation>();
                    // Everything after the two header lines. The count must be Length - 2: asking
                    // for Length items starting at index 2 runs past the end and throws.
                    // NOTE(review): the lines are joined without separators, so offsets in
                    // multi-line documents do not account for the line breaks - confirm intent.
                    string fulltext      = String.Join("", allLines, 2, allLines.Length - 2);
                    Regex  inlineRegex   = new Regex(BuildEntityTypePattern(entityTypes));
                    int    xmlJunkOffset = 0; // number of tag characters seen before the current match
                    foreach (Match inlineMatch in inlineRegex.Matches(fulltext))
                    {
                        // NOTE(review): assumes each match covers a whole "<TYPE>text</TYPE>" element,
                        // i.e. the shape the "xml" writer produces - confirm against BuildEntityTypePattern.
                        string taggedText    = inlineMatch.Value;
                        int    openTagLength = taggedText.IndexOf('>') + 1;
                        int    closeTagStart = taggedText.LastIndexOf('<');
                        if (openTagLength <= 0 || closeTagStart < openTagLength)
                        {
                            // The match is not a complete element: treat it as plain text.
                            openTagLength = 0;
                            closeTagStart = taggedText.Length;
                        }
                        int closeTagLength = taggedText.Length - closeTagStart;

                        // Translate positions in the tagged text back to positions in the raw
                        // text by discounting every tag character seen so far.
                        int begin = inlineMatch.Index - xmlJunkOffset;
                        int end   = begin + (closeTagStart - openTagLength);

                        Annotation ann = new Annotation();
                        ann.begin = begin;
                        ann.end   = end;
                        SetAnnotationType(ann, entityTypes, taggedText);
                        ann.color = colorDict[ann.type];
                        annotations.Add(ann);

                        xmlJunkOffset += openTagLength + closeTagLength;
                    }
                    annotationParts.Add(annotations);
                    return(JsonConvert.SerializeObject(annotationParts));
                }
                else
                {
                    return(null);
                }
            }
            catch (Exception)
            {
                return(null);
            }
        }
コード例 #17
0
        /// <summary>
        /// Reads one sentence, with its entity mentions and relation mentions, from the line
        /// iterator. Reading stops after two single-piece (blank) lines or when the iterator is
        /// exhausted.
        /// </summary>
        /// <param name="docId">Document id stored on the sentence as <c>DocIDAnnotation</c>.</param>
        /// <param name="lineIterator">
        /// Source of lines. It is advanced only as far as the end of this sentence, so the caller
        /// can keep using it for the next one.
        /// </param>
        /// <returns>
        /// The sentence annotation with its text, value, tokens and sentence id set, and with the
        /// entity and relation mentions that were read attached.
        /// </returns>
        private Annotation ReadSentence(string docId, IEnumerator <string> lineIterator)
        {
            Annotation sentence = new Annotation(string.Empty);

            sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docId);
            sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), new List <EntityMention>());
            // we'll need to set things like the tokens and textContent after we've
            // fully read the sentence
            // contains the full text that we've read so far
            StringBuilder textContent = new StringBuilder();
            // how many tokens we've seen so far
            int tokenCount = 0;
            IList <CoreLabel> tokens = new List <CoreLabel>();
            // when we've seen two blank lines in a row, this sentence is over (one
            // blank line separates the sentence and the relations)
            int    numBlankLinesSeen = 0;
            string sentenceID        = null;
            // keeps track of entities we've seen so far for use by relations
            IDictionary <string, EntityMention> indexToEntityMention = new Dictionary <string, EntityMention>();

            // Check the blank-line counter BEFORE advancing: MoveNext() consumes a line, so
            // evaluating it first would swallow the first line that follows this sentence.
            while (numBlankLinesSeen < 2 && lineIterator.MoveNext())
            {
                string currentLine = lineIterator.Current;
                currentLine = currentLine.Replace("COMMA", ",");
                IList <string> pieces = StringUtils.Split(currentLine);
                string         identifier;
                int            size = pieces.Count;
                switch (size)
                {
                case 1:
                {
                    // blank line between sentences or relations
                    numBlankLinesSeen++;
                    break;
                }

                case 3:
                {
                    // relation: <entity index 1> <entity index 2> <type>
                    string type = pieces[2];
                    IList <ExtractionObject> args    = new List <ExtractionObject>();
                    EntityMention            entity1 = indexToEntityMention[pieces[0]];
                    EntityMention            entity2 = indexToEntityMention[pieces[1]];
                    args.Add(entity1);
                    args.Add(entity2);
                    Span span = new Span(entity1.GetExtentTokenStart(), entity2.GetExtentTokenEnd());
                    identifier = RelationMention.MakeUniqueId();
                    RelationMention relationMention = new RelationMention(identifier, sentence, span, type, null, args);
                    AnnotationUtils.AddRelationMention(sentence, relationMention);
                    break;
                }

                case 9:
                {
                    // token

                    /*
                     * Roth token lines look like this:
                     *
                     * 19 Peop 9 O NNP/NNP Jamal/Ghosheh O O O
                     */
                    // Entities may be multiple words joined by '/'; we split these up
                    IList <string> words = StringUtils.Split(pieces[5], "/");
                    string text = StringUtils.Join(words, " ");
                    identifier = "entity" + pieces[0] + '-' + pieces[2];
                    // entity type of the word/expression
                    string nerTag = GetNormalizedNERTag(pieces[1]);
                    if (sentenceID == null)
                    {
                        sentenceID = pieces[0];
                    }
                    if (!nerTag.Equals("O"))
                    {
                        Span extentSpan = new Span(tokenCount, tokenCount + words.Count);
                        // Temporarily sets the head span to equal the extent span.
                        // This is so the entity has a head (in particular, getValue() works) even if preprocessSentences isn't called.
                        // The head span is later modified if preprocessSentences is called.
                        EntityMention entity = new EntityMention(identifier, sentence, extentSpan, extentSpan, nerTag, null, null);
                        AnnotationUtils.AddEntityMention(sentence, entity);
                        // we can get by using these indices as strings since we only use them
                        // as a hash key
                        string index = pieces[2];
                        indexToEntityMention[index] = entity;
                    }
                    foreach (string word in words)
                    {
                        CoreLabel label = new CoreLabel();
                        label.SetWord(word);
                        label.Set(typeof(CoreAnnotations.TextAnnotation), word);
                        label.Set(typeof(CoreAnnotations.ValueAnnotation), word);
                        // we don't set TokenBeginAnnotation or TokenEndAnnotation since we're
                        // not keeping track of character offsets
                        tokens.Add(label);
                    }
                    textContent.Append(text);
                    textContent.Append(' ');
                    tokenCount += words.Count;
                    break;
                }
                }
            }
            sentence.Set(typeof(CoreAnnotations.TextAnnotation), textContent.ToString());
            sentence.Set(typeof(CoreAnnotations.ValueAnnotation), textContent.ToString());
            sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            sentence.Set(typeof(CoreAnnotations.SentenceIDAnnotation), sentenceID);
            return(sentence);
        }