Esempio n. 1
0
        /// <summary>
        /// Adds features describing the governor side of a dependency edge to
        /// <paramref name="features"/>, each key starting with <paramref name="prefix"/>.
        /// </summary>
        /// <param name="features">Counter that receives the feature increments.</param>
        /// <param name="prefix">Label used to build every feature key emitted here.</param>
        /// <param name="e">Dependency edge to describe; when null only a "no-" + prefix feature is added.</param>
        /// <param name="addWord">When true, a word-indicator feature for the edge source is emitted as well.</param>
        private void AddDependencyFeatures(ICounter <string> features, string prefix, SemanticGraphEdge e, bool addWord)
        {
            if (e != null)
            {
                // Read everything we need from the edge up front.
                IndexedWord governor     = e.GetSource();
                string      governorTag  = governor.Tag();
                string      governorWord = governor.Word();
                string      relationName = e.GetRelation().ToString();

                // Note: a left/right direction feature (source position vs. target position)
                // was considered at some point but is not emitted.
                if (addWord)
                {
                    string wordKey = prefix + "-word=" + WordIndicator(governorWord, governorTag);
                    features.IncrementCount(wordKey);
                }

                string posKey      = prefix + "-POS=" + governorTag;
                string relationKey = prefix + "-relation=" + relationName;
                features.IncrementCount(posKey);
                features.IncrementCount(relationKey);
            }
            else
            {
                // No edge available: record its absence and nothing else.
                features.IncrementCount("no-" + prefix);
            }
        }
        /// <summary>
        /// Builds the embedding representation of a mention by concatenating averaged
        /// span/context embeddings, the document embedding, and embeddings of individual
        /// words around the mention and of the word governing the mention head.
        /// </summary>
        /// <param name="m">Mention to embed.</param>
        /// <param name="docEmbedding">Document embedding, inserted unchanged into the concatenation.</param>
        /// <returns>The result of <c>NeuralUtils.Concatenate</c> over all parts, in the order listed below.</returns>
        public virtual SimpleMatrix GetMentionEmbeddings(Mention m, SimpleMatrix docEmbedding)
        {
            // Take the first incoming dependency edge of the head word, if there is one.
            IEnumerator <SemanticGraphEdge> incomingEdges = m.enhancedDependency.IncomingEdgeIterator(m.headIndexedWord);
            SemanticGraphEdge headEdge = null;
            if (incomingEdges.MoveNext())
            {
                headEdge = incomingEdges.Current;
            }

            // Averaged embeddings: the mention span, five positions on either side,
            // and the sentence with its last token dropped.
            var spanAverage     = GetAverageEmbedding(m.sentenceWords, m.startIndex, m.endIndex);
            var leftAverage     = GetAverageEmbedding(m.sentenceWords, m.startIndex - 5, m.startIndex);
            var rightAverage    = GetAverageEmbedding(m.sentenceWords, m.endIndex, m.endIndex + 5);
            var sentenceAverage = GetAverageEmbedding(m.sentenceWords.SubList(0, m.sentenceWords.Count - 1));

            // Single-word embeddings at fixed offsets relative to the mention boundaries.
            var headWord       = GetWordEmbedding(m.sentenceWords, m.headIndex);
            var firstWord      = GetWordEmbedding(m.sentenceWords, m.startIndex);
            var lastWord       = GetWordEmbedding(m.sentenceWords, m.endIndex - 1);
            var previousWord   = GetWordEmbedding(m.sentenceWords, m.startIndex - 1);
            var followingWord  = GetWordEmbedding(m.sentenceWords, m.endIndex);
            var secondPrevious = GetWordEmbedding(m.sentenceWords, m.startIndex - 2);
            var secondFollowing = GetWordEmbedding(m.sentenceWords, m.endIndex + 1);

            // Word at the source of the head's incoming edge; null when the head has no incoming edge.
            string governorWord = headEdge == null ? null : headEdge.GetSource().Word();
            var governor = GetWordEmbedding(governorWord);

            return NeuralUtils.Concatenate(
                spanAverage,
                leftAverage,
                rightAverage,
                sentenceAverage,
                docEmbedding,
                headWord,
                firstWord,
                lastWord,
                previousWord,
                followingWord,
                secondPrevious,
                secondFollowing,
                governor);
        }
Esempio n. 3
0
        /// <summary>
        /// Builds the single-mention feature counter for <paramref name="m"/>.
        /// </summary>
        /// <remarks>
        /// Features are emitted in groups: mention type, length and location, lexical context,
        /// dependency path, constituency-parse features (only when <c>useConstituencyParse</c>
        /// is set), mention containment, and features derived from dcoref rules.
        /// </remarks>
        /// <param name="doc">Document containing the mention; read for mention/sentence totals and document type.</param>
        /// <param name="m">Mention to featurize.</param>
        /// <param name="mentionsByHeadIndex">Mentions grouped by head index; it is indexed with <c>m.headIndex</c>.</param>
        /// <returns>A newly created counter holding all features of the mention.</returns>
        private ICounter <string> GetFeatures(Document doc, Mention m, IDictionary <int, IList <Mention> > mentionsByHeadIndex)
        {
            ICounter <string> features = new ClassicCounter <string>();

            // type features
            features.IncrementCount("mention-type=" + m.mentionType);
            features.IncrementCount("gender=" + m.gender);
            features.IncrementCount("person-fine=" + m.person);
            features.IncrementCount("head-ne-type=" + m.nerString);
            IList <string> singletonFeatures = m.GetSingletonFeatures(dictionaries);

            // SingletonFeatures (class-level) maps a position in the mention's singleton
            // feature list to the feature name; positions beyond the list are skipped.
            foreach (KeyValuePair <int, string> e in SingletonFeatures)
            {
                if (e.Key < singletonFeatures.Count)
                {
                    features.IncrementCount(e.Value + "=" + singletonFeatures[e.Key]);
                }
            }
            // length and location features
            AddNumeric(features, "mention-length", m.SpanToString().Length);
            AddNumeric(features, "mention-words", m.originalSpan.Count);
            AddNumeric(features, "sentence-words", m.sentenceWords.Count);
            features.IncrementCount("sentence-words=" + Bin(m.sentenceWords.Count));
            features.IncrementCount("mention-position", m.mentionNum / (double)doc.predictedMentions.Count);
            features.IncrementCount("sentence-position", m.sentNum / (double)doc.numSentences);
            // lexical features
            CoreLabel firstWord    = FirstWord(m);
            CoreLabel lastWord     = LastWord(m);
            CoreLabel headWord     = HeadWord(m);
            CoreLabel prevWord     = PrevWord(m);
            CoreLabel nextWord     = NextWord(m);
            CoreLabel prevprevWord = PrevprevWord(m);
            CoreLabel nextnextWord = NextnextWord(m);
            string    headPOS      = GetPOS(headWord);
            string    firstPOS     = GetPOS(firstWord);
            string    lastPOS      = GetPOS(lastWord);
            string    prevPOS      = GetPOS(prevWord);
            string    nextPOS      = GetPOS(nextWord);
            string    prevprevPOS  = GetPOS(prevprevWord);
            string    nextnextPOS  = GetPOS(nextnextWord);

            features.IncrementCount("first-word=" + WordIndicator(firstWord, firstPOS));
            features.IncrementCount("last-word=" + WordIndicator(lastWord, lastPOS));
            features.IncrementCount("head-word=" + WordIndicator(headWord, headPOS));
            features.IncrementCount("next-word=" + WordIndicator(nextWord, nextPOS));
            features.IncrementCount("prev-word=" + WordIndicator(prevWord, prevPOS));
            features.IncrementCount("next-bigram=" + WordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
            features.IncrementCount("prev-bigram=" + WordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
            features.IncrementCount("next-pos=" + nextPOS);
            features.IncrementCount("prev-pos=" + prevPOS);
            features.IncrementCount("first-pos=" + firstPOS);
            features.IncrementCount("last-pos=" + lastPOS);
            features.IncrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
            features.IncrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
            AddDependencyFeatures(features, "parent", GetDependencyParent(m), true);
            AddFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
            AddFeature(features, "is-generic", m.originalSpan.Count == 1 && firstPOS.Equals("NNS"));
            // syntax features
            // Walk up from the head word through at most three dependency edges, emitting
            // the accumulated relation path ("rel1", "rel1_rel2", ...) after each step.
            IndexedWord w       = m.headIndexedWord;
            string      depPath = string.Empty;
            int         depth   = 0;

            while (w != null)
            {
                SemanticGraphEdge e_1 = GetDependencyParent(m, w);
                depth++;
                if (depth <= 3 && e_1 != null)
                {
                    depPath += (depPath.IsEmpty() ? string.Empty : "_") + e_1.GetRelation().ToString();
                    features.IncrementCount("dep-path=" + depPath);
                    w = e_1.GetSource();
                }
                else
                {
                    // Depth limit reached or no further parent: stop the walk.
                    w = null;
                }
            }
            if (useConstituencyParse)
            {
                int fullEmbeddingLevel    = HeadEmbeddingLevel(m.contextParseTree, m.headIndex);
                int mentionEmbeddingLevel = HeadEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
                // -1 is treated as "level could not be determined" for either tree.
                if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1)
                {
                    features.IncrementCount("mention-embedding-level=" + Bin(fullEmbeddingLevel - mentionEmbeddingLevel));
                    features.IncrementCount("head-embedding-level=" + Bin(mentionEmbeddingLevel));
                }
                else
                {
                    features.IncrementCount("undetermined-embedding-level");
                }
                features.IncrementCount("num-embedded-nps=" + Bin(NumEmbeddedNps(m.mentionSubTree)));
                string syntaxPath = string.Empty;
                Tree   tree       = m.contextParseTree;
                Tree   head       = tree.GetLeaves()[m.headIndex].Ancestor(1, tree);
                depth = 0;
                // Emit the growing label path from the head's parent node towards the root,
                // stopping after four nodes or once an "S" node has been included.
                foreach (Tree node in tree.PathNodeToNode(head, tree))
                {
                    syntaxPath += node.Value() + "-";
                    features.IncrementCount("syntax-path=" + syntaxPath);
                    depth++;
                    if (depth >= 4 || node.Value().Equals("S"))
                    {
                        break;
                    }
                }
            }
            // mention containment features
            // NOTE(review): this code previously passed a null predicate to AnyMatch (the
            // lambda was lost in conversion). The test below is reconstructed from the
            // feature names: another mention sharing this head index, in the same
            // sentence, whose span encloses / is enclosed by this mention's span.
            // Confirm against the upstream implementation.
            bool containedInOther = false;
            bool containsOther    = false;

            foreach (Mention other in mentionsByHeadIndex[m.headIndex])
            {
                if (object.ReferenceEquals(other, m) || other.sentNum != m.sentNum)
                {
                    continue;
                }
                if (other.startIndex <= m.startIndex && m.endIndex <= other.endIndex)
                {
                    containedInOther = true;
                }
                if (m.startIndex <= other.startIndex && other.endIndex <= m.endIndex)
                {
                    containsOther = true;
                }
            }
            AddFeature(features, "contained-in-other-mention", containedInOther);
            AddFeature(features, "contains-other-mention", containsOther);
            // features from dcoref rules
            AddFeature(features, "bare-plural", m.originalSpan.Count == 1 && headPOS.Equals("NNS"));
            AddFeature(features, "quantifier-start", dictionaries.quantifiers.Contains(firstWord.Word().ToLower()));
            AddFeature(features, "negative-start", firstWord.Word().ToLower().Matches("none|no|nothing|not"));
            AddFeature(features, "partitive", RuleBasedCorefMentionFinder.PartitiveRule(m, m.sentenceWords, dictionaries));
            AddFeature(features, "adjectival-demonym", dictionaries.IsAdjectivalDemonym(m.SpanToString()));
            // "you know" outside of articles is flagged as a generic use of "you".
            if (doc.docType != Document.DocType.Article && m.person == Dictionaries.Person.You && nextWord != null && Sharpen.Runtime.EqualsIgnoreCase(nextWord.Word(), "know"))
            {
                features.IncrementCount("generic-you");
            }
            return(features);
        }
        /// <summary>
        /// Exports one document as JSON: the gold coreference clusters are written to
        /// <c>goldClusterWriter</c>, and the sentences, mentions, pairwise features,
        /// pair labels and document-level features are written to <c>dataWriter</c>.
        /// </summary>
        /// <param name="id">Document identifier; used as the JSON key for the gold clusters and as "doc_id".</param>
        /// <param name="document">Document whose mentions and annotations are exported.</param>
        public virtual void Process(int id, Document document)
        {
            // Gold clusters: one array of mention IDs per cluster.
            IJsonArrayBuilder clusters = Javax.Json.Json.CreateArrayBuilder();

            foreach (CorefCluster gold in document.goldCorefClusters.Values)
            {
                IJsonArrayBuilder c = Javax.Json.Json.CreateArrayBuilder();
                foreach (Mention m in gold.corefMentions)
                {
                    c.Add(m.mentionID);
                }
                clusters.Add(c.Build());
            }
            goldClusterWriter.Println(Javax.Json.Json.CreateObjectBuilder().Add(id.ToString(), clusters.Build()).Build());
            IDictionary <Pair <int, int>, bool> mentionPairs = CorefUtils.GetLabeledMentionPairs(document);
            IList <Mention> mentionsList = CorefUtils.GetSortedMentions(document);
            IDictionary <int, IList <Mention> > mentionsByHeadIndex = new Dictionary <int, IList <Mention> >();

            // Group mentions by head index. The previous ComputeIfAbsent call was handed a
            // null factory, so no list could be created for a new key; create it explicitly.
            foreach (Mention m_1 in mentionsList)
            {
                IList <Mention> withIndex;
                if (!mentionsByHeadIndex.TryGetValue(m_1.headIndex, out withIndex))
                {
                    withIndex = new List <Mention>();
                    mentionsByHeadIndex[m_1.headIndex] = withIndex;
                }
                withIndex.Add(m_1);
            }
            IJsonObjectBuilder docFeatures = Javax.Json.Json.CreateObjectBuilder();

            docFeatures.Add("doc_id", id);
            docFeatures.Add("type", document.docType == Document.DocType.Article ? 1 : 0);
            // "source" is the part of the DOC_ID entry before the first "/".
            docFeatures.Add("source", document.docInfo["DOC_ID"].Split("/")[0]);
            IJsonArrayBuilder sentences = Javax.Json.Json.CreateArrayBuilder();

            foreach (ICoreMap sentence in document.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                sentences.Add(GetSentenceArray(sentence.Get(typeof(CoreAnnotations.TokensAnnotation))));
            }
            IJsonObjectBuilder mentions = Javax.Json.Json.CreateObjectBuilder();

            foreach (Mention m_2 in document.predictedMentionsByID.Values)
            {
                // First incoming dependency edge of the head word, if any.
                IEnumerator <SemanticGraphEdge> iterator = m_2.enhancedDependency.IncomingEdgeIterator(m_2.headIndexedWord);
                SemanticGraphEdge relation    = iterator.MoveNext() ? iterator.Current : null;
                string            depRelation = relation == null ? "no-parent" : relation.GetRelation().ToString();
                string            depParent   = relation == null ? "<missing>" : relation.GetSource().Word();

                // NOTE(review): this code previously passed a null predicate to AnyMatch (the
                // lambda was lost in conversion). The test below is reconstructed from the
                // field name: another mention with the same head index, in the same sentence,
                // whose span encloses this mention's span. Confirm against upstream.
                bool containedInOther = false;
                foreach (Mention other in mentionsByHeadIndex[m_2.headIndex])
                {
                    if (!object.ReferenceEquals(other, m_2) && other.sentNum == m_2.sentNum && other.startIndex <= m_2.startIndex && m_2.endIndex <= other.endIndex)
                    {
                        containedInOther = true;
                        break;
                    }
                }

                mentions.Add(m_2.mentionNum.ToString(), Javax.Json.Json.CreateObjectBuilder()
                             .Add("doc_id", id)
                             .Add("mention_id", m_2.mentionID)
                             .Add("mention_num", m_2.mentionNum)
                             .Add("sent_num", m_2.sentNum)
                             .Add("start_index", m_2.startIndex)
                             .Add("end_index", m_2.endIndex)
                             .Add("head_index", m_2.headIndex)
                             .Add("mention_type", m_2.mentionType.ToString())
                             .Add("dep_relation", depRelation)
                             .Add("dep_parent", depParent)
                             .Add("sentence", GetSentenceArray(m_2.sentenceWords))
                             .Add("contained-in-other-mention", containedInOther ? 1 : 0)
                             .Build());
            }
            // Names recorded under "pair_feature_names" alongside the per-pair feature arrays.
            IJsonArrayBuilder featureNames = Javax.Json.Json.CreateArrayBuilder()
                                             .Add("same-speaker")
                                             .Add("antecedent-is-mention-speaker")
                                             .Add("mention-is-antecedent-speaker")
                                             .Add("relaxed-head-match")
                                             .Add("exact-string-match")
                                             .Add("relaxed-string-match");
            IJsonObjectBuilder features = Javax.Json.Json.CreateObjectBuilder();
            IJsonObjectBuilder labels   = Javax.Json.Json.CreateObjectBuilder();

            // One entry per labeled mention pair, keyed by "<mentionNum1> <mentionNum2>".
            foreach (KeyValuePair <Pair <int, int>, bool> e in mentionPairs)
            {
                Mention           m1      = document.predictedMentionsByID[e.Key.first];
                Mention           m2      = document.predictedMentionsByID[e.Key.second];
                string            key     = m1.mentionNum + " " + m2.mentionNum;
                IJsonArrayBuilder builder = Javax.Json.Json.CreateArrayBuilder();
                foreach (int val in CategoricalFeatureExtractor.PairwiseFeatures(document, m1, m2, dictionaries, conll))
                {
                    builder.Add(val);
                }
                features.Add(key, builder.Build());
                labels.Add(key, e.Value ? 1 : 0);
            }
            IJsonObject docData = Javax.Json.Json.CreateObjectBuilder()
                                  .Add("sentences", sentences.Build())
                                  .Add("mentions", mentions.Build())
                                  .Add("labels", labels.Build())
                                  .Add("pair_feature_names", featureNames.Build())
                                  .Add("pair_features", features.Build())
                                  .Add("document_features", docFeatures.Build())
                                  .Build();

            dataWriter.Println(docData);
        }