Example #1
0
 /// <summary>
 /// Tags PERSON entity mentions with a gender label. The lower-cased first
 /// token of each PERSON mention is looked up in <c>maleNames</c> first and
 /// then in <c>femaleNames</c>; a hit is recorded via
 /// <c>AnnotateEntityMention</c> as "MALE" or "FEMALE". Mentions whose first
 /// name is in neither list are left untouched.
 /// </summary>
 /// <param name="annotation">Annotation whose sentences and entity mentions are scanned.</param>
 public virtual void Annotate(Annotation annotation)
 {
     foreach (ICoreMap sentence in annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         foreach (ICoreMap entityMention in sentence.Get(typeof(CoreAnnotations.MentionsAnnotation)))
         {
             // Literal-first comparison so a mention without an entity type is skipped instead of throwing.
             if (!"PERSON".Equals(entityMention.Get(typeof(CoreAnnotations.EntityTypeAnnotation))))
             {
                 continue;
             }
             var mentionTokens = entityMention.Get(typeof(CoreAnnotations.TokensAnnotation));
             if (mentionTokens == null || mentionTokens.Count == 0)
             {
                 // Nothing to look up for a mention that carries no tokens.
                 continue;
             }
             // Lower-case once; both lists are queried with the same key.
             string firstName = mentionTokens[0].Word().ToLower();
             if (maleNames.Contains(firstName))
             {
                 AnnotateEntityMention(entityMention, "MALE");
             }
             else if (femaleNames.Contains(firstName))
             {
                 AnnotateEntityMention(entityMention, "FEMALE");
             }
         }
     }
 }
        /// <summary>
        /// Re-tags acronym-like tokens as ORGANIZATION when
        /// <c>AcronymMatcher.IsAcronym</c> says they abbreviate an organization
        /// mention already present in the document. Documents with more than
        /// 100 organization mentions are skipped.
        /// </summary>
        /// <param name="ann">Document annotation whose sentences are inspected and whose tokens may be re-tagged.</param>
        private void AddAcronyms(Annotation ann)
        {
            // Pass 1: collect every mention of the document, then keep the token lists of the organizations.
            IList <ICoreMap> documentMentions = new List <ICoreMap>();
            foreach (ICoreMap current in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                Sharpen.Collections.AddAll(documentMentions, current.Get(typeof(CoreAnnotations.MentionsAnnotation)));
            }

            IList <IList <CoreLabel> > orgTokenLists = new List <IList <CoreLabel> >();
            foreach (ICoreMap candidate in documentMentions)
            {
                if (!"ORGANIZATION".Equals(candidate.Get(nerCoreAnnotationClass)))
                {
                    continue;
                }
                orgTokenLists.Add(candidate.Get(typeof(CoreAnnotations.TokensAnnotation)));
            }

            // Skip very long documents
            if (orgTokenLists.Count > 100)
            {
                return;
            }

            // Pass 2: look for untagged, all-caps tokens of length >= 3 that abbreviate one of the organizations.
            foreach (ICoreMap scanned in ann.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                // NOTE(review): chunks are collected here but never stored back on the sentence within this method.
                IList <ICoreMap>  acronymChunks   = new List <ICoreMap>();
                IList <CoreLabel> sentenceTokens  = scanned.Get(typeof(CoreAnnotations.TokensAnnotation));
                int               sentenceOffset  = scanned.Get(typeof(CoreAnnotations.TokenBeginAnnotation));
                for (int pos = 0; pos < sentenceTokens.Count; ++pos)
                {
                    CoreLabel candidateToken = sentenceTokens[pos];
                    bool      looksLikeAcronym = "O".Equals(candidateToken.Ner()) &&
                                                 candidateToken.Word().ToUpper().Equals(candidateToken.Word()) &&
                                                 candidateToken.Word().Length >= 3;
                    if (!looksLikeAcronym)
                    {
                        continue;
                    }
                    foreach (IList <CoreLabel> orgTokens in orgTokenLists)
                    {
                        if (!AcronymMatcher.IsAcronym(candidateToken.Word(), orgTokens))
                        {
                            continue;
                        }
                        // Confirmed acronym: re-tag the token and build a one-token chunk for it.
                        candidateToken.SetNER("ORGANIZATION");
                        ICoreMap acronymChunk = ChunkAnnotationUtils.GetAnnotatedChunk(sentenceTokens, pos, pos + 1, sentenceOffset, null, null, null);
                        acronymChunk.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), "ORGANIZATION");
                        acronymChunks.Add(acronymChunk);
                    }
                }
            }
        }
        /// <summary>
        /// Decides whether a token satisfies the NER and POS-tag restrictions
        /// configured for <paramref name="label"/>.
        /// </summary>
        /// <param name="coreLabel">Token to test.</param>
        /// <param name="label">Label whose restriction lists are consulted.</param>
        /// <returns>
        /// <c>true</c> when the NER restriction is disabled or matched, and the
        /// tag-initial restriction (if configured for the label) is matched too.
        /// </returns>
        private bool MatchedRestriction(CoreLabel coreLabel, string label)
        {
            bool accepted;

            // Stage 1: NER restriction (only enforced when the factory flag is on).
            if (!PatternFactory.useTargetNERRestriction)
            {
                //System.out.println("not matching NER");
                accepted = true;
            }
            else
            {
                accepted = false;
                foreach (string nerPattern in constVars.allowedNERsforLabels[label])
                {
                    if (coreLabel.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)).Matches(nerPattern))
                    {
                        accepted = true;
                        break;
                    }
                }
            }

            // Stage 2: tag-initial restriction, only reached by tokens that passed stage 1.
            if (accepted)
            {
                string posTag = coreLabel.Tag();
                if (constVars.allowedTagsInitials != null && constVars.allowedTagsInitials.Contains(label))
                {
                    // The token stays accepted if the list is empty; otherwise it needs one matching prefix.
                    foreach (string prefix in constVars.allowedTagsInitials[label])
                    {
                        accepted = posTag.StartsWith(prefix);
                        if (accepted)
                        {
                            break;
                        }
                    }
                }
            }

            if (constVars.debug >= 4)
            {
                string verdict = accepted ? " matched restriction " : " did not matched restrict ";
                System.Console.Out.WriteLine(coreLabel.Word() + verdict + (PatternFactory.useTargetNERRestriction ? constVars.allowedNERsforLabels[label] : string.Empty) + "and" + PatternFactory.useTargetNERRestriction + " and " + (constVars.allowedTagsInitials != null ? constVars.allowedTagsInitials[label] : string.Empty));
            }
            return accepted;
        }
 /// <summary>
 /// Tells whether a token counts as a stop word: its lemma or its surface
 /// form is in <paramref name="commonEngWords"/>, or its lemma fully matches
 /// <paramref name="ignoreWordRegex"/>.
 /// </summary>
 /// <param name="l">Token to test.</param>
 /// <param name="commonEngWords">Collection of common words to treat as stop words.</param>
 /// <param name="ignoreWordRegex">Optional pattern for lemmas to ignore; may be null.</param>
 /// <returns><c>true</c> if any of the three checks succeeds.</returns>
 private static bool ContainsStopWord(CoreLabel l, ICollection <string> commonEngWords, Pattern ignoreWordRegex)
 {
     // An earlier revision also carried a result cache and a fuzzy-match fallback; both are disabled.
     if (commonEngWords.Contains(l.Lemma()))
     {
         return true;
     }
     if (commonEngWords.Contains(l.Word()))
     {
         return true;
     }
     return ignoreWordRegex != null && ignoreWordRegex.Matcher(l.Lemma()).Matches();
 }
Example #5
0
 /// <summary>
 /// Produces the next word. Words come from segmenting the tokens of
 /// <c>tok</c> one at a time with <c>wordSegmenter</c>; a newline token is
 /// passed through unsegmented.
 /// </summary>
 /// <returns>
 /// The next word, or <c>null</c> when the token stream is exhausted or a
 /// token has no word.
 /// </returns>
 protected internal override IHasWord GetNext()
 {
     while (true)
     {
         // Drain the segmentation of the current token first.
         if (wordIter != null && wordIter.MoveNext())
         {
             return wordIter.Current;
         }
         if (!tok.MoveNext())
         {
             return null;
         }
         CoreLabel rawToken = tok.Current;
         string    text     = rawToken.Word();
         if (text == null)
         {
             return null;
         }
         // if newlines were significant, we should make sure to return
         // them when we see them
         IList <IHasWord> pieces = text.Equals(WhitespaceLexer.Newline)
             ? Java.Util.Collections.SingletonList <IHasWord>(rawToken)
             : wordSegmenter.Segment(text);
         wordIter = pieces.GetEnumerator();
     }
 }
Example #6
0
        /// <summary>
        /// Handles contractions like del and al, marked by the lexer
        /// del =&gt; de + l =&gt; de + el
        /// al =&gt; a + l =&gt; a + el
        /// con[mts]igo =&gt; con + [mts]i
        /// </summary>
        /// <remarks>
        /// The second part is appended to <c>compoundBuffer</c>; the first part is
        /// returned. The contraction is recognised case-insensitively using the
        /// invariant culture, so the result does not depend on the thread's
        /// current culture (e.g. Turkish dotless-i casing of "CONTIGO").
        /// </remarks>
        /// <param name="cl">Token holding the contraction; its ParentAnnotation is removed.</param>
        /// <returns>A copy of <paramref name="cl"/> representing the first part of the contraction.</returns>
        /// <exception cref="ArgumentException">The token's word is not one of the supported contractions.</exception>
        private CoreLabel ProcessContraction(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
            string word = cl.Word();
            string first;
            string second;
            int    secondOffset = 0;
            int    secondLength = 0;
            // Invariant lowering: the switch labels are fixed ASCII strings, not user-facing text.
            string lowered      = word.ToLowerInvariant();

            switch (lowered)
            {
            case "del":
            case "al":
            {
                first = Sharpen.Runtime.Substring(word, 0, lowered.Length - 1);
                // The article takes the case of the contraction's final letter.
                char lastChar = word[lowered.Length - 1];
                second       = char.IsLower(lastChar) ? "el" : "EL";
                secondOffset = 1;
                secondLength = lowered.Length - 1;
                break;
            }

            case "conmigo":
            case "consigo":
            {
                first        = Sharpen.Runtime.Substring(word, 0, 3);
                // Keep the original m/s and add the accented vowel: mí / sí.
                second       = word[3] + "í";
                secondOffset = 3;
                secondLength = 4;
                break;
            }

            case "contigo":
            {
                first        = Sharpen.Runtime.Substring(word, 0, 3);
                second       = Sharpen.Runtime.Substring(word, 3, 5);
                secondOffset = 3;
                secondLength = 4;
                break;
            }

            default:
            {
                throw new ArgumentException("Invalid contraction provided to processContraction");
            }
            }
            int secondStart = cl.BeginPosition() + secondOffset;
            int secondEnd   = secondStart + secondLength;

            compoundBuffer.Add(CopyCoreLabel(cl, second, secondStart, secondEnd));
            return(CopyCoreLabel(cl, first, cl.BeginPosition(), secondStart));
        }
Example #7
0
        /// <summary>
        /// Builds the indicator for a pair of tokens by joining their lower-cased
        /// words with an underscore; a missing token contributes "NONE".
        /// </summary>
        /// <param name="cl1">First token, or null.</param>
        /// <param name="cl2">Second token, or null.</param>
        /// <param name="Pos">Forwarded unchanged to the string-based overload.</param>
        private string WordIndicator(CoreLabel cl1, CoreLabel cl2, string Pos)
        {
            string firstWord = "NONE";
            if (cl1 != null)
            {
                firstWord = cl1.Word().ToLower();
            }

            string secondWord = "NONE";
            if (cl2 != null)
            {
                secondWord = cl2.Word().ToLower();
            }

            string pairKey = firstWord + "_" + secondWord;
            return WordIndicator(pairKey, Pos);
        }
        /// <summary>Returns the text of this entity's head span.</summary>
        /// <remarks>
        /// The words of the sentence tokens covered by <c>headTokenSpan</c> are
        /// joined with single spaces. Character offsets are not guaranteed to be
        /// present on the tokens, so original spacing is not reconstructed.
        /// The headTokenSpan MUST be set before calling this method!
        /// </remarks>
        public override string GetValue()
        {
            IList <CoreLabel> sentenceTokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            StringBuilder     text           = new StringBuilder();
            int               from           = headTokenSpan.Start();
            int               to             = headTokenSpan.End();
            string            separator      = string.Empty;

            for (int pos = from; pos < to; pos++)
            {
                text.Append(separator);
                text.Append(sentenceTokens[pos].Word());
                // Every word after the first is preceded by one space.
                separator = " ";
            }
            return text.ToString();
        }
Example #9
0
        /// <summary>
        /// Extracts the single-position ("clique C") features for the element at
        /// <paramref name="loc"/>: a five-character window, a length flag,
        /// per-character Unicode block/type features, punctuation/digit flags and
        /// token-level position features.
        /// </summary>
        /// <param name="cInfo">Padded sequence of labels; positions loc-2 .. loc+2 are read.</param>
        /// <param name="loc">Index of the current element.</param>
        /// <returns>The feature strings, in the order they were generated.</returns>
        protected internal virtual ICollection <string> FeaturesC(PaddedList <In> cInfo, int loc)
        {
            ICollection <string> features = new List <string>();
            CoreLabel            c        = cInfo[loc];
            CoreLabel            n        = cInfo[loc + 1];
            CoreLabel            n2       = cInfo[loc + 2];
            CoreLabel            p        = cInfo[loc - 1];
            CoreLabel            p2       = cInfo[loc - 2];
            string charc  = c.Get(typeof(CoreAnnotations.CharAnnotation));
            string charn  = n.Get(typeof(CoreAnnotations.CharAnnotation));
            string charn2 = n2.Get(typeof(CoreAnnotations.CharAnnotation));
            string charp  = p.Get(typeof(CoreAnnotations.CharAnnotation));
            string charp2 = p2.Get(typeof(CoreAnnotations.CharAnnotation));

            // Default feature set...a 5 character window
            // plus a few other language-independent features
            features.Add(charc + "-c");
            features.Add(charn + "-n1");
            features.Add(charn2 + "-n2");
            features.Add(charp + "-p");
            features.Add(charp2 + "-p2");
            // Length feature
            if (charc.Length > 1)
            {
                features.Add("length");
            }
            // Character-level class features
            bool seenPunc  = false;
            bool seenDigit = false;

            // Walk every char of the current element; the bound is the element's own length.
            for (int i = 0; i < charc.Length; ++i)
            {
                char charcC = charc[i];
                seenPunc  = seenPunc || Characters.IsPunctuation(charcC);
                seenDigit = seenDigit || char.IsDigit(charcC);
                string cuBlock = Characters.UnicodeBlockStringOf(charcC);
                features.Add(cuBlock + "-uBlock");
                string cuType = char.GetType(charcC).ToString();
                features.Add(cuType + "-uType");
            }
            if (seenPunc)
            {
                features.Add("haspunc");
            }
            if (seenDigit)
            {
                features.Add("hasdigit");
            }
            // Token-level features
            string word  = c.Word();
            int    index = c.Index();

            // Distances are capped by MaxBefore / MaxAfter / MaxLength.
            features.Add(Math.Min(MaxBefore, index) + "-before");
            features.Add(Math.Min(MaxAfter, word.Length - charc.Length - index) + "-after");
            features.Add(Math.Min(MaxLength, word.Length) + "-length");
            // Indicator transition feature
            features.Add("cliqueC");
            return(features);
        }
Example #10
0
 /// <summary>
 /// Builds the indicator for a single token from its lower-cased word, or
 /// returns "NONE" when the token is missing.
 /// </summary>
 /// <param name="cl">Token, or null.</param>
 /// <param name="Pos">Forwarded unchanged to the string-based overload.</param>
 private string WordIndicator(CoreLabel cl, string Pos)
 {
     return cl == null
         ? "NONE"
         : WordIndicator(cl.Word().ToLower(), Pos);
 }
        /// <summary>
        /// Builds the training/classification datum for token <paramref name="i"/>
        /// of a sentence. The label is <c>answerLabel</c> when the token's
        /// <c>answerClass</c> annotation stringifies to it, otherwise "O". The
        /// features are one "Cluster-N" indicator per matched phrase (N is -1 for a
        /// phrase with no known cluster) plus context-word features within
        /// <c>window</c> tokens on either side.
        /// </summary>
        /// <param name="sent">Tokens of the sentence.</param>
        /// <param name="i">Index of the token to describe.</param>
        /// <returns>The datum pairing the feature counter with the label.</returns>
        private RVFDatum <string, string> GetDatum(CoreLabel[] sent, int i)
        {
            ICounter <string> feat = new ClassicCounter <string>();
            CoreLabel         l    = sent[i];
            string            label;

            if (l.Get(answerClass).ToString().Equals(answerLabel))
            {
                label = answerLabel;
            }
            else
            {
                label = "O";
            }
            CollectionValuedMap <string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));

            if (matchedPhrases == null)
            {
                // No phrase was matched on this token: fall back to the token's own word.
                matchedPhrases = new CollectionValuedMap <string, CandidatePhrase>();
                matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
            }
            foreach (CandidatePhrase w in matchedPhrases.AllValues())
            {
                // An int can never be null and the indexer throws for unknown keys,
                // so the "unknown cluster" fallback has to go through TryGetValue.
                int num;
                if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
                {
                    num = -1;
                }
                feat.SetCount("Cluster-" + num, 1.0);
            }
            // feat.incrementCount("WORD-" + l.word());
            // feat.incrementCount("LEMMA-" + l.lemma());
            // feat.incrementCount("TAG-" + l.tag());
            // With window == 0 neither context loop below executes.
            int window = 0;

            for (int j = Math.Max(0, i - window); j < i; j++)
            {
                CoreLabel lj = sent[j];
                feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
                feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
            }
            for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
            {
                CoreLabel lj = sent[j_1];
                feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
                feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
            }
            // System.out.println("adding " + l.word() + " as " + label);
            return(new RVFDatum <string, string>(feat, label));
        }
        /// <summary>
        /// Extends the base clique features with a "wrapper" feature built from
        /// the first two and last two characters of the current word.
        /// </summary>
        /// <remarks>
        /// This helps detect ma+_+sh in dialect, as well as avoiding segmenting
        /// possessive pronouns if the word starts with al-. The feature is only
        /// produced for words longer than three characters, at index 2
        /// ("-begin-wrap") and at the word's last index ("-end-wrap").
        /// </remarks>
        /// <param name="cInfo">Padded sequence of labels.</param>
        /// <param name="loc">Index of the current element.</param>
        /// <returns>The base features plus any wrapper features.</returns>
        protected internal override ICollection <string> FeaturesCpC(PaddedList <IN> cInfo, int loc)
        {
            ICollection <string> result  = base.FeaturesCpC(cInfo, loc);
            CoreLabel            current = cInfo[loc];

            if (current.Word().Length <= 3)
            {
                return result;
            }

            string prefix  = Sharpen.Runtime.Substring(current.Word(), 0, 2);
            string suffix  = Sharpen.Runtime.Substring(current.Word(), current.Word().Length - 2);
            string wrapper = prefix + "_" + suffix;

            if (current.Index() == 2)
            {
                result.Add(wrapper + "-begin-wrap");
            }
            if (current.Index() == current.Word().Length - 1)
            {
                result.Add(wrapper + "-end-wrap");
            }
            return result;
        }
Example #13
0
 /// <summary>Splits a compound marked by the lexer.</summary>
 /// <remarks>
 /// Dashes are padded with spaces and the word is split on whitespace, so
 /// each dash becomes its own part. One copy of <paramref name="cl"/> per
 /// part is appended to <c>compoundBuffer</c>.
 /// </remarks>
 /// <param name="cl">Token holding the compound; its ParentAnnotation is removed.</param>
 /// <returns>The element removed from the front of <c>compoundBuffer</c>.</returns>
 private CoreLabel ProcessCompound(CoreLabel cl)
 {
     cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

     string   spaced = cl.Word().ReplaceAll("-", " - ");
     string[] pieces = spaced.Split("\\s+");

     for (int k = 0; k < pieces.Length; k++)
     {
         string    piece      = pieces[k];
         CoreLabel pieceLabel = new CoreLabel(cl);
         pieceLabel.SetWord(piece);
         pieceLabel.SetValue(piece);
         pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
         compoundBuffer.Add(pieceLabel);
     }
     return compoundBuffer.Remove(0);
 }
Example #14
0
        /// <summary>Splits a contraction marked by the lexer.</summary>
        /// <remarks>
        /// Splits a contraction marked by the lexer.
        /// au =&gt; a + u =&gt; à + le
        /// aux =&gt; a + ux =&gt; à + les
        /// du =&gt; d + u =&gt; de + le
        /// Only "au", "aux" and "du" are handled; any other word, including
        /// "des", is rejected with an <see cref="ArgumentException"/>.
        /// The second part is appended to <c>compoundBuffer</c>; the first part is
        /// returned. Matching is case-insensitive under the invariant culture so
        /// that it does not depend on the thread's current culture.
        /// </remarks>
        /// <param name="cl">Token holding the contraction; its ParentAnnotation is removed.</param>
        /// <returns>A copy of <paramref name="cl"/> representing the first part of the contraction.</returns>
        /// <exception cref="ArgumentException">The token's word is not a supported contraction.</exception>
        private CoreLabel ProcessContraction(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
            string word = cl.Word();
            string first;
            string second;
            int    secondOffset = 0;
            int    secondLength = 0;
            // Invariant lowering: the switch labels are fixed ASCII strings, not user-facing text.
            string lowered      = word.ToLowerInvariant();

            switch (lowered)
            {
            case "au":
            {
                first        = "à";
                second       = "le";
                secondOffset = 1;
                secondLength = 1;
                break;
            }

            case "aux":
            {
                first        = "à";
                second       = "les";
                secondOffset = 1;
                secondLength = 2;
                break;
            }

            case "du":
            {
                first        = "de";
                second       = "le";
                secondOffset = 1;
                secondLength = 1;
                break;
            }

            default:
            {
                throw new ArgumentException("Invalid contraction provided to processContraction");
            }
            }
            // The second part starts one character into the original token.
            int secondStart = cl.BeginPosition() + secondOffset;
            int secondEnd   = secondStart + secondLength;

            compoundBuffer.Add(CopyCoreLabel(cl, second, secondStart, secondEnd));
            return(CopyCoreLabel(cl, first, cl.BeginPosition(), secondStart));
        }
Example #15
0
        /// <summary>
        /// Collects the lower-cased words of a mention whose part-of-speech tag is
        /// NN, NNS, NNP or NNPS.
        /// </summary>
        /// <param name="m">Mention whose tokens in [startIndex, endIndex) are scanned.</param>
        /// <returns>The matching words in token order (duplicates kept).</returns>
        private static IList <string> GetContentWords(Mention m)
        {
            IList <string> contentWords = new List <string>();
            int            position     = m.startIndex;

            while (position < m.endIndex)
            {
                CoreLabel token  = m.sentenceWords[position];
                string    tag    = token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                bool      isNoun = tag.Equals("NN") || tag.Equals("NNS") || tag.Equals("NNP") || tag.Equals("NNPS");
                if (isNoun)
                {
                    contentWords.Add(token.Word().ToLower());
                }
                position++;
            }
            return contentWords;
        }
Example #16
0
        /// <summary>
        /// Finds the document token indices within a span whose word is in
        /// <c>animacySet</c>.
        /// </summary>
        /// <param name="span">Inclusive (first, second) token index range; clipped to the token count.</param>
        /// <returns>The matching token indices in ascending order.</returns>
        public virtual IList <int> ScanForAnimates(Pair <int, int> span)
        {
            IList <CoreLabel> docTokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList <int>       hits      = new List <int>();
            int               position  = span.first;

            while (position <= span.second && position < docTokens.Count)
            {
                if (animacySet.Contains(docTokens[position].Word()))
                {
                    hits.Add(position);
                }
                position++;
            }
            return hits;
        }
Example #17
0
        /// <summary>
        /// Collects the distinct lower-cased words of a mention whose
        /// part-of-speech tag is contained in <c>Propers</c>.
        /// </summary>
        /// <param name="m">Mention whose tokens in [startIndex, endIndex) are scanned.</param>
        /// <returns>A set of the matching lower-cased words.</returns>
        private static ICollection <string> GetPropers(Mention m)
        {
            ICollection <string> found    = new HashSet <string>();
            int                  position = m.startIndex;

            while (position < m.endIndex)
            {
                CoreLabel token   = m.sentenceWords[position];
                string    tag     = token.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                // The word is lower-cased for every token, before the tag is checked.
                string    lowered = token.Word().ToLower();
                if (Propers.Contains(tag))
                {
                    found.Add(lowered);
                }
                position++;
            }
            return found;
        }
Example #18
0
        /// <summary>
        /// Returns the words of the sentence tokens covered by
        /// <c>extentTokenSpan</c>, joined with single spaces.
        /// </summary>
        public virtual string GetExtentString()
        {
            IList <CoreLabel> sentenceTokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            StringBuilder     extent         = new StringBuilder();
            int               from           = extentTokenSpan.Start();
            int               to             = extentTokenSpan.End();
            bool              needsSpace     = false;

            for (int pos = from; pos < to; pos++)
            {
                if (needsSpace)
                {
                    extent.Append(" ");
                }
                extent.Append(sentenceTokens[pos].Word());
                needsSpace = true;
            }
            return extent.ToString();
        }
Example #19
0
        /// <summary>
        /// Derives the true-cased text of a token from its TrueCaseAnnotation and
        /// stores it in TrueCaseTextAnnotation; when <c>overwriteText</c> is set,
        /// the token's text and value annotations are replaced as well.
        /// </summary>
        /// <remarks>
        /// "UPPER" and "LOWER" change the whole word, "INIT_UPPER" title-cases the
        /// first character and lower-cases the rest, and "O" (mixed case) looks the
        /// lower-cased word up in <c>mixedCaseMap</c>. Any other value, or a
        /// mixed-case word missing from the map, leaves the text unchanged.
        /// </remarks>
        /// <param name="l">Token to annotate.</param>
        private void SetTrueCaseText(CoreLabel l)
        {
            string predictedCase = l.GetString <CoreAnnotations.TrueCaseAnnotation>();
            string original      = l.Word();
            string recased       = original;

            if (predictedCase == "UPPER")
            {
                recased = original.ToUpper();
            }
            else if (predictedCase == "LOWER")
            {
                recased = original.ToLower();
            }
            else if (predictedCase == "INIT_UPPER")
            {
                recased = char.ToTitleCase(original[0]) + Sharpen.Runtime.Substring(original, 1).ToLower();
            }
            else if (predictedCase == "O")
            {
                // The model predicted mixed case, so lookup the map:
                string key = original.ToLower();
                if (mixedCaseMap.Contains(key))
                {
                    recased = mixedCaseMap[key];
                }
            }

            l.Set(typeof(CoreAnnotations.TrueCaseTextAnnotation), recased);
            if (!overwriteText)
            {
                return;
            }
            l.Set(typeof(CoreAnnotations.TextAnnotation), recased);
            l.Set(typeof(CoreAnnotations.ValueAnnotation), recased);
        }
Example #20
0
        /// <summary>Splits a compound marked by the lexer.</summary>
        /// <remarks>
        /// Dashes matched by <c>pDash</c> are padded with spaces and the result is
        /// split with <c>pSpace</c>. One copy of <paramref name="cl"/> per part is
        /// appended to <c>compoundBuffer</c>, with begin/end positions advanced by
        /// the accumulated length of the preceding parts.
        /// </remarks>
        /// <param name="cl">Token holding the compound; its ParentAnnotation is removed.</param>
        /// <returns>The element removed from the front of <c>compoundBuffer</c>.</returns>
        private CoreLabel ProcessCompound(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

            string   spaced   = pDash.Matcher(cl.Word()).ReplaceAll(" - ");
            string[] pieces   = pSpace.Split(spaced);
            int      consumed = 0;

            for (int k = 0; k < pieces.Length; k++)
            {
                string    piece      = pieces[k];
                CoreLabel pieceLabel = new CoreLabel(cl);
                pieceLabel.SetWord(piece);
                pieceLabel.SetValue(piece);
                pieceLabel.SetBeginPosition(cl.BeginPosition() + consumed);
                pieceLabel.SetEndPosition(cl.BeginPosition() + consumed + piece.Length);
                pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
                compoundBuffer.Add(pieceLabel);
                consumed += piece.Length;
            }
            return compoundBuffer.Remove(0);
        }
        /// <summary>
        /// Reads a small multi-document string through an
        /// <c>ObjectBankWrapper</c> and checks that the tokens and their
        /// "chris2" word shapes come out in the expected order.
        /// </summary>
        /// <remarks>
        /// Only the out-of-range access that happens when the iterator yields more
        /// tokens than expected is translated into a "too many things" failure.
        /// Assertion failures are NUnit exceptions too, so a blanket
        /// <c>catch (Exception)</c> would hide the real mismatch behind that message.
        /// </remarks>
        public virtual void TestUsingIterator()
        {
            string s = "\n\n@@123\nthis\nis\na\nsentence\n\n@@12\nThis\nis another\n.\n\n";

            string[] output = new string[] { "@@", "123", "this", "is", "a", "sentence", "@@", "12", "This", "is", "another", "." };
            string[] outWSs = new string[] { "@@", "ddd", "xxxx", "xx", "x", "xxxxx", "@@", "dd", "Xxxx", "xx", "xxxxx", "." };
            NUnit.Framework.Assert.AreEqual(output.Length, outWSs.Length, "Two output arrays should have same length");
            Properties         props = PropertiesUtils.AsProperties("wordShape", "chris2");
            SeqClassifierFlags flags = new SeqClassifierFlags(props);
            PlainTextDocumentReaderAndWriter <CoreLabel> readerAndWriter = new PlainTextDocumentReaderAndWriter <CoreLabel>();

            readerAndWriter.Init(flags);
            ReaderIteratorFactory           rif          = new ReaderIteratorFactory(new StringReader(s));
            ObjectBank <IList <CoreLabel> > di           = new ObjectBank <IList <CoreLabel> >(rif, readerAndWriter);
            ICollection <string>            knownLCWords = new HashSet <string>();
            ObjectBankWrapper <CoreLabel>   obw          = new ObjectBankWrapper <CoreLabel>(flags, di, knownLCWords);

            int outIdx = 0;
            try
            {
                // foreach disposes the enumerators, unlike the hand-written MoveNext loops.
                foreach (IList <CoreLabel> sent in obw)
                {
                    foreach (CoreLabel cl in sent)
                    {
                        string tok   = cl.Word();
                        string shape = cl.Get(typeof(CoreAnnotations.ShapeAnnotation));
                        NUnit.Framework.Assert.AreEqual(output[outIdx], tok);
                        NUnit.Framework.Assert.AreEqual(outWSs[outIdx], shape);
                        outIdx++;
                    }
                }
            }
            catch (IndexOutOfRangeException e)
            {
                // output[outIdx] ran past the expected arrays: the iterator produced extra tokens.
                NUnit.Framework.Assert.Fail("Probably too many things in iterator: " + e);
            }
            if (outIdx < output.Length)
            {
                NUnit.Framework.Assert.Fail("Too few things in iterator, lacking: " + output[outIdx]);
            }
        }
Example #22
0
        //Note: this doesn't necessarily find all possible candidates, but is kind of a greedy version.
        // E.g. "Elizabeth and Jane" will return only "Elizabeth and Jane", but not "Elizabeth", and "Jane" as well.
        /// <summary>
        /// Walks the name trie rooted at <c>rootNameNode</c> over the document
        /// tokens of a text run and reports every full name the walk ends on.
        /// </summary>
        /// <param name="textRun">Inclusive (first, second) token index range; clipped to the token count.</param>
        /// <returns>
        /// The matched full names paired with their inclusive (start, end) token
        /// index ranges, in the order they were found.
        /// </returns>
        public virtual Pair <List <string>, List <Pair <int, int> > > ScanForNamesNew(Pair <int, int> textRun)
        {
            List <string>           potentialNames = new List <string>();
            List <Pair <int, int> > nameIndices    = new List <Pair <int, int> >();
            IList <CoreLabel>       tokens         = doc.Get(typeof(CoreAnnotations.TokensAnnotation));

            Sieve.TokenNode pointer = rootNameNode;
            for (int index = textRun.first; index <= textRun.second && index < tokens.Count; index++)
            {
                CoreLabel token     = tokens[index];
                string    tokenText = token.Word();
                if (pointer.childNodes.Keys.Contains(tokenText))
                {
                    // The token continues the current path through the trie.
                    pointer = pointer.childNodes[tokenText];
                }
                else
                {
                    if (!pointer.token.Equals("$ROOT"))
                    {
                        // The path ended on the previous token; record it if it spells a full name.
                        if (pointer.fullName != null)
                        {
                            potentialNames.Add(pointer.fullName);
                            nameIndices.Add(new Pair <int, int>(index - 1 - pointer.level, index - 1));
                        }
                        pointer = rootNameNode;
                    }
                }
            }
            // One past the last token actually visited. The loop may have stopped at the end
            // of the token list rather than at textRun.second, so clip to tokens.Count.
            int index_1 = System.Math.Min(textRun.second + 1, tokens.Count);

            if (!pointer.token.Equals("$ROOT"))
            {
                //catch the end case
                if (pointer.fullName != null)
                {
                    potentialNames.Add(pointer.fullName);
                    nameIndices.Add(new Pair <int, int>(index_1 - 1 - pointer.level, index_1 - 1));
                }
            }
            return(new Pair <List <string>, List <Pair <int, int> > >(potentialNames, nameIndices));
        }
Example #23
0
 /// <summary>
 /// Writes sentences and their dependency trees to a file, one line per
 /// token with 1-based token numbers, and a blank line after each sentence.
 /// </summary>
 /// <param name="outFile">Path handed to <c>IOUtils.GetPrintWriter</c>.</param>
 /// <param name="sentences">Sentences to write; each must carry a TokensAnnotation.</param>
 /// <param name="trees">Dependency trees, index-aligned with <paramref name="sentences"/>.</param>
 /// <exception cref="RuntimeIOException">Wraps any exception raised while writing.</exception>
 public static void WriteConllFile(string outFile, IList <ICoreMap> sentences, IList <DependencyTree> trees)
 {
     try
     {
         PrintWriter output = IOUtils.GetPrintWriter(outFile);
         try
         {
             for (int i = 0; i < sentences.Count; i++)
             {
                 ICoreMap          sentence = sentences[i];
                 DependencyTree    tree     = trees[i];
                 IList <CoreLabel> tokens   = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                 // Token numbers are 1-based; the bound is this sentence's token count.
                 int size = tokens.Count;
                 for (int j = 1; j <= size; ++j)
                 {
                     CoreLabel token = tokens[j - 1];
                     output.Printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.Word(), token.Tag(), token.Tag(), tree.GetHead(j), tree.GetLabel(j));
                 }
                 output.Println();
             }
         }
         finally
         {
             // Close the writer even when writing fails part-way through.
             output.Close();
         }
     }
     catch (Exception e)
     {
         throw new RuntimeIOException(e);
     }
 }
Example #24
0
        // && !text.contains("+") &&
        // !text.contains("*");// && !
        // text.contains("$") && !text.contains("\"");
        /// <summary>
        /// Builds, for every token index of a sentence, the set of surface patterns
        /// around that token. Tokens rejected by <c>PatternFactory.DoNotUse</c>
        /// (stop words) map to an empty set.
        /// </summary>
        /// <param name="sent">Sentence whose tokens are processed.</param>
        /// <param name="stopWords">Phrases around which no patterns are created.</param>
        /// <returns>A map from token index to the patterns created for that token.</returns>
        public static IDictionary <int, ISet> GetPatternsAroundTokens(DataInstance sent, ICollection <CandidatePhrase> stopWords)
        {
            IList <CoreLabel>       sentenceTokens  = sent.GetTokens();
            IDictionary <int, ISet> patternsByIndex = new Dictionary <int, ISet>();

            for (int position = 0; position < sentenceTokens.Count; position++)
            {
                CoreLabel current = sentenceTokens[position];
                if (PatternFactory.DoNotUse(current.Word(), stopWords))
                {
                    // do not create patterns around stop words!
                    patternsByIndex[position] = new HashSet <SurfacePattern>();
                }
                else
                {
                    ICollection <SurfacePattern> context = GetContext(sent.GetTokens(), position, stopWords);
                    patternsByIndex[position] = context;
                }
            }
            return patternsByIndex;
        }
 /// <summary>
 /// Applies every pattern in <c>patterns</c> to every sentence listed in <c>sentids</c> and
 /// collects the phrases captured by the <c>$term</c> group of each match.
 /// </summary>
 /// <remarks>
 /// Side effect: each token inside a match is annotated with <c>MatchedPattern = true</c> and
 /// has the matching pattern added to its <c>MatchedPatterns</c> set, even when the phrase
 /// itself is later discarded.
 /// </remarks>
 /// <returns>
 /// A triple of (phrase-by-pattern counts, matched token spans keyed by pattern as
 /// (sentence id, start, inclusive end), phrases whose included tokens all already carry
 /// the label).
 /// </returns>
 /// <exception cref="System.Exception"/>
 public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
 {
     // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
     // CollectionValuedMap<String, Integer>();
     try
     {
         ICollection <CandidatePhrase> alreadyLabeledPhrases                    = new HashSet <CandidatePhrase>();
         TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
         CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
         foreach (string sentid in sentids)
         {
             IList <CoreLabel> sent = sents[sentid].GetTokens();
             foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
             {
                 if (pEn.Key == null)
                 {
                     throw new Exception("why is the pattern " + pEn + " null?");
                 }
                 TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                 //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                 //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                 // Higher branch limits make matching faster but use more memory
                 m.SetBranchLimit(5);
                 while (m.Find())
                 {
                     // [s, e) is the token span captured by the "$term" group
                     int s = m.Start("$term");
                     int e = m.End("$term");
                     System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label "
                                                     + label);
                     string phrase            = string.Empty;
                     string phraseLemma       = string.Empty;
                     // becomes true once any token kept in the phrase does not already carry the label
                     bool   useWordNotLabeled = false;
                     // becomes true when the match must be discarded
                     bool   doNotUse          = false;
                     //find if the neighboring words are labeled - if so - club them together
                     if (constVars.clubNeighboringLabeledWords)
                     {
                         // Walk left until the first token not labeled with `label`.
                         // NOTE(review): if every token before s is labeled, no break occurs and s stays unchanged - confirm this is intended.
                         for (int i = s - 1; i >= 0; i--)
                         {
                             if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 s = i + 1;
                                 break;
                             }
                         }
                         // Walk right until the first token not labeled with `label`.
                         // NOTE(review): likewise, e stays unchanged when all tokens up to the sentence end are labeled.
                         for (int i_1 = e; i_1 < sent.Count; i_1++)
                         {
                             if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 e = i_1;
                                 break;
                             }
                         }
                     }
                     //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                     bool[] addedindices = new bool[e - s];
                     // Arrays.fill(addedindices, false); // not needed as initialized false
                     for (int i_2 = s; i_2 < e; i_2++)
                     {
                         CoreLabel l = sent[i_2];
                         // Record on the token itself that a pattern matched it, creating the pattern set on first use.
                         l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                         if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                         {
                             l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                         }
                         SurfacePattern pSur = (SurfacePattern)pEn.Value;
                         System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                         System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                         l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                         // Discard the match if the token carries any (class, value) pair configured to be ignored for this label.
                         foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                         {
                             if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                             {
                                 doNotUse = true;
                             }
                         }
                         bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                         if (removePhrasesWithStopWords && containsStop)
                         {
                             doNotUse = true;
                         }
                         else
                         {
                             // Keep the token unless it is a stop word that should be stripped from the phrase.
                             if (!containsStop || !removeStopWordsFromSelectedPhrases)
                             {
                                 if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                 {
                                     useWordNotLabeled = true;
                                 }
                                 phrase               += " " + l.Word();
                                 phraseLemma          += " " + l.Lemma();
                                 addedindices[i_2 - s] = true;
                             }
                         }
                     }
                     // A kept / skipped / kept sequence means a token was removed from the middle of the phrase: discard the match.
                     for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                     {
                         if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                         {
                             doNotUse = true;
                             break;
                         }
                     }
                     if (!doNotUse)
                     {
                         // The span is stored with an inclusive end index (e - 1).
                         matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                         phrase = phrase.Trim();
                         if (!phrase.IsEmpty())
                         {
                             phraseLemma = phraseLemma.Trim();
                             CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                             allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                             if (!useWordNotLabeled)
                             {
                                 alreadyLabeledPhrases.Add(candPhrase);
                             }
                         }
                     }
                 }
             }
         }
         return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
     }
     catch (Exception e)
     {
         // Log, then rethrow with the original stack trace intact.
         logger.Error(e);
         throw;
     }
 }
        /// <summary>
        /// Rewrites the leaf labels of <paramref name="tree"/> in place, optionally replacing
        /// each word by its lemma and/or appending lemma and morphological analysis to the leaf.
        /// Increments the <c>nTokens</c> counter for every leaf and <c>nMorphAnalyses</c> for
        /// every leaf that has a non-empty original text.
        /// </summary>
        /// <param name="tree">Tree whose yield consists of <c>CoreLabel</c> instances.</param>
        /// <param name="lemmasAsLeaves">If true, word and value of each leaf are set to the lemma.</param>
        /// <param name="addMorphoToLeaves">
        /// If true, each leaf becomes value + MorphoMark + lemma + LemmaMark + morphological analysis.
        /// </param>
        /// <exception cref="ArgumentException">A leaf label is not a <c>CoreLabel</c>.</exception>
        public static void MungeLeaves(Tree tree, bool lemmasAsLeaves, bool addMorphoToLeaves)
        {
            IList <ILabel> labels = tree.Yield();

            foreach (ILabel label in labels)
            {
                ++nTokens;
                if (!(label is CoreLabel))
                {
                    throw new ArgumentException("Only works with CoreLabels trees");
                }
                CoreLabel coreLabel = (CoreLabel)label;
                string    lemma     = coreLabel.Lemma();
                //PTB escaping since we're going to put this in the leaf
                if (lemma == null)
                {
                    // No lemma, so just add the surface form
                    lemma = coreLabel.Word();
                }
                else if (lemma.Equals("("))
                {
                    lemma = "-LRB-";
                }
                else if (lemma.Equals(")"))
                {
                    lemma = "-RRB-";
                }
                if (lemmasAsLeaves)
                {
                    string escapedLemma = lemma;
                    coreLabel.SetWord(escapedLemma);
                    coreLabel.SetValue(escapedLemma);
                    coreLabel.SetLemma(lemma);
                }
                if (addMorphoToLeaves)
                {
                    string morphStr = coreLabel.OriginalText();
                    if (string.IsNullOrEmpty(morphStr))
                    {
                        morphStr = MorphoFeatureSpecification.NoAnalysis;
                    }
                    else
                    {
                        ++nMorphAnalyses;
                    }
                    // Normalize punctuation analyses
                    if (morphStr.StartsWith("PONCT", StringComparison.Ordinal))
                    {
                        morphStr = "PUNC";
                    }
                    // Plain concatenation: the previous Java-style "%s" placeholders are not
                    // understood by .NET's string.Format, which returned the format text verbatim.
                    string newLeaf = string.Concat(coreLabel.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, morphStr);
                    coreLabel.SetValue(newLeaf);
                    coreLabel.SetWord(newLeaf);
                }
            }
        }
Example #27
0
        /// <summary>
        /// Finds the tree node acting as syntactic head of mention <paramref name="m"/>.
        /// Tries, in order: a constituent exactly covering the mention; a reparse of the
        /// mention embedded in an "It was ... ." frame (only when <c>allowReparsing</c> is set);
        /// the head of the smallest covering constituent; and finally a leaf picked by POS tag.
        /// </summary>
        /// <param name="m">Mention whose head is wanted.</param>
        /// <param name="root">Parse tree of the sentence containing the mention.</param>
        /// <param name="tokens">Tokens of that sentence.</param>
        /// <returns>The tree node chosen as head.</returns>
        protected internal virtual Tree FindSyntacticHead(Mention m, Tree root, IList <CoreLabel> tokens)
        {
            // A trailing possessive marker is excluded from the span, unless it is the whole mention.
            int spanEnd  = m.endIndex;
            int spanSize = m.originalSpan.Count;

            if (spanSize > 0)
            {
                string finalWord          = m.originalSpan[spanSize - 1].Get(typeof(CoreAnnotations.TextAnnotation));
                bool   isPossessiveMarker = finalWord.Equals("'s") || finalWord.Equals("'");
                if (isPossessiveMarker && spanSize != 1)
                {
                    spanEnd--;
                }
            }

            // Case 1: a constituent spans the mention exactly.
            Tree spanTree = FindTreeWithSpan(root, m.startIndex, spanEnd);
            if (spanTree != null)
            {
                return(SafeHead(spanTree, spanEnd));
            }

            // Case 2: no exact constituent; parse the mention extent inside a small sentence frame.
            if (allowReparsing)
            {
                IList <CoreLabel> framed = new List <CoreLabel>();
                framed.Add(InitCoreLabel("It"));
                framed.Add(InitCoreLabel("was"));
                int framePrefix   = 2;
                int droppedDashes = 0;
                for (int pos = m.startIndex; pos < spanEnd; pos++)
                {
                    CoreLabel current = tokens[pos];
                    // Separated dashes are left out of the reparse; count them so indices can be realigned later.
                    if ("-".Equals(current.Word()))
                    {
                        droppedDashes++;
                        continue;
                    }
                    // Copy the token so the parser can re-index it without touching the original.
                    framed.Add((CoreLabel)current.LabelFactory().NewLabel(current));
                }
                framed.Add(InitCoreLabel("."));
                // Constrain the parse to the mention itself: after the frame prefix, before the final period.
                ParserConstraint         mentionConstraint = new ParserConstraint(framePrefix, framed.Count - 1, Pattern.Compile(".*"));
                IList <ParserConstraint> constraintList    = Java.Util.Collections.SingletonList(mentionConstraint);
                Tree reparsed = Parse(framed, constraintList);
                ConvertToCoreLabels(reparsed);
                // Shift span indices to compensate for the words added in front.
                reparsed.IndexSpans(m.startIndex - framePrefix);
                Tree partial   = FindPartialSpan(reparsed, m.startIndex);
                Tree localHead = SafeHead(partial, spanEnd);
                System.Diagnostics.Debug.Assert((localHead != null));
                // Map the head of the local parse back onto a leaf of the main tree.
                CoreLabel localLabel = (CoreLabel)localHead.Label();
                Tree      mappedHead = FunkyFindLeafWithApproximateSpan(root, localLabel.Value(), localLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)), droppedDashes);
                System.Diagnostics.Debug.Assert((mappedHead != null));
                return(mappedHead);
            }

            // Case 3: head of the smallest covering constituent, accepted only if it lies inside the mention.
            Tree covering  = FindTreeWithSmallestSpan(root, m.startIndex, spanEnd);
            Tree candidate = null;
            if (covering != null)
            {
                candidate = SafeHead(covering, spanEnd);
            }
            if (candidate != null)
            {
                int zeroBased = ((CoreLabel)candidate.Label()).Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
                if (zeroBased >= m.startIndex && zeroBased < spanEnd)
                {
                    return(candidate);
                }
            }

            // Case 4: default to the last word, moved to the latest "N"-tagged token seen before any "W"-tagged token.
            int fallbackIdx = spanEnd - 1;
            for (int scan = m.startIndex; scan < m.endIndex; scan++)
            {
                string tag = tokens[scan].Tag();
                if (tag.StartsWith("N"))
                {
                    fallbackIdx = scan;
                    continue;
                }
                if (tag.StartsWith("W"))
                {
                    break;
                }
            }
            IList <Tree> leafNodes = root.GetLeaves();

            return(leafNodes[fallbackIdx]);
        }
Example #28
0
 /*
  * public void applyPats(Counter<E> patterns, String label, boolean computeDataFreq,  TwoDimensionalCounter<Pair<String, String>, Integer> wordsandLemmaPatExtracted,
  * CollectionValuedMap<Integer, Triple<String, Integer, Integer>> matchedTokensByPat) throws ClassNotFoundException, IOException, InterruptedException, ExecutionException{
  * Counter<E> patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter<E>();
  * Counter<E> patternsLearnedThisIterRest = new ClassicCounter<E>();
  * Set<String> specialWords = constVars.invertedIndex.getSpecialWordsList();
  * List<String> extremelySmallStopWordsList = Arrays.asList(".",",","in","on","of","a","the","an");
  *
  * for(Entry<Integer, Double> en: patterns.entrySet()){
  * Integer pindex = en.getKey();
  * SurfacePattern p = constVars.getPatternIndex().get(pindex);
  * String[] n = p.getSimplerTokensNext();
  * String[] pr = p.getSimplerTokensPrev();
  * boolean rest = false;
  * if(n!=null){
  * for(String e: n){
  * if(!specialWords.contains(e)){
  * rest = true;
  * break;
  * }
  * }
  * }
  * if(rest == false && pr!=null){
  * for(String e: pr){
  * if(!specialWords.contains(e) && !extremelySmallStopWordsList.contains(e)){
  * rest = true;
  * break;
  * }
  * }
  * }
  * if(rest)
  * patternsLearnedThisIterRest.setCount(en.getKey(), en.getValue());
  * else
  * patternsLearnedThisIterConsistsOnlyGeneralized.setCount(en.getKey(), en.getValue());
  * }
  *
  *
  *
  * Map<String, Set<String>> sentidswithfilerest = constVars.invertedIndex.getFileSentIdsFromPats(patternsLearnedThisIterRest.keySet(), constVars.getPatternIndex());
  *
  * if (constVars.batchProcessSents) {
  * List<File> filesToLoad;
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0)
  * filesToLoad = Data.sentsFiles;
  * else{
  * filesToLoad = new ArrayList<File>();
  * for (String fname : sentidswithfilerest.keySet()) {
  * String filename;
  * //          if(!constVars.usingDirForSentsInIndex)
  * //            filename = constVars.saveSentencesSerDir+"/"+fname;
  * //          else
  * filename = fname;
  * filesToLoad.add(new File(filename));
  * }
  * }
  *
  * for (File fname : filesToLoad) {
  * Redwood.log(Redwood.DBG, "Applying patterns to sents from " + fname);
  * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(fname);
  *
  * if(sentidswithfilerest != null && !sentidswithfilerest.isEmpty()){
  *
  * String filename;
  * //          if(constVars.usingDirForSentsInIndex)
  * //            filename = constVars.saveSentencesSerDir+"/"+fname.getName();
  * //          else
  * filename = fname.getAbsolutePath();
  *
  * Set<String> sentIDs = sentidswithfilerest.get(filename);
  * if (sentIDs != null){
  * this.runParallelApplyPats(sents, sentIDs, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
  * } else
  * Redwood.log(Redwood.DBG, "No sentIds for " + filename  + " in the index for the keywords from the patterns! The index came up with these files: " + sentidswithfilerest.keySet());
  * }
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
  * this.runParallelApplyPats(sents, sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
  * }
  *
  * if (computeDataFreq){
  * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
  * Data.fileNamesUsedToComputeRawFreq.add(fname.getName());
  * }
  * }
  *
  * //Compute Frequency from the files not loaded using the invertedindex query. otherwise, later on there is an error.
  * if(computeDataFreq){
  * for(File f: Data.sentsFiles){
  * if(!Data.fileNamesUsedToComputeRawFreq.contains(f.getName())){
  * Map<String, List<CoreLabel>> sents = IOUtils.readObjectFromFile(f);
  * Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
  * Data.fileNamesUsedToComputeRawFreq.add(f.getName());
  * }
  * }
  * }
  *
  * } else {
  *
  * if (sentidswithfilerest != null && !sentidswithfilerest.isEmpty()) {
  * String filename = CollectionUtils.toList(sentidswithfilerest.keySet()).get(0);
  * Set<String> sentids = sentidswithfilerest.get(filename);
  * if (sentids != null) {
  * this.runParallelApplyPats(Data.sents, sentids, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
  * } else
  * throw new RuntimeException("How come no sentIds for " + filename  + ". Index keyset is " + constVars.invertedIndex.getKeySet());
  * }
  * if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
  * this.runParallelApplyPats(Data.sents, Data.sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
  * }
  * Data.computeRawFreqIfNull(Data.sents, constVars.numWordsCompound);
  * }
  * Redwood.log(Redwood.DBG, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.size());
  * }
  */
 /// <summary>
 /// Counts, without running the matcher, which learned patterns are already recorded for
 /// each token: for every token whose pattern set contains a pattern from
 /// <paramref name="patternsLearnedThisIter"/>, the (word, lemma) phrase of that token is
 /// incremented against that pattern in <paramref name="wordsandLemmaPatExtracted"/>.
 /// </summary>
 /// <param name="sents">Sentences keyed by sentence id.</param>
 /// <param name="patternsForEachToken">Source of the per-token pattern sets for a sentence id.</param>
 /// <param name="patternsLearnedThisIter">Patterns whose key set is looked up in each token's pattern set.</param>
 /// <param name="wordsandLemmaPatExtracted">Counter updated in place.</param>
 /// <exception cref="Exception">No pattern map is available for one of the sentences.</exception>
 private void StatsWithoutApplyingPatterns(IDictionary <string, DataInstance> sents, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, TwoDimensionalCounter <CandidatePhrase, E> wordsandLemmaPatExtracted)
 {
     foreach (KeyValuePair <string, DataInstance> sentenceEntry in sents)
     {
         string sentenceId = sentenceEntry.Key;
         IDictionary <int, ICollection <E> > patternsByToken = patternsForEachToken.GetPatternsForAllTokens(sentenceId);
         if (patternsByToken == null)
         {
             throw new Exception("How come there are no patterns for " + sentenceId);
         }
         foreach (KeyValuePair <int, ICollection <E> > tokenEntry in patternsByToken)
         {
             ICollection <E> tokenPatterns = tokenEntry.Value;
             // Looked up lazily: only fetched once the first learned pattern is found for this token.
             CoreLabel matchedToken = null;
             foreach (E learned in patternsLearnedThisIter.KeySet())
             {
                 if (!tokenPatterns.Contains(learned))
                 {
                     continue;
                 }
                 if (matchedToken == null)
                 {
                     matchedToken = sentenceEntry.Value.GetTokens()[tokenEntry.Key];
                 }
                 wordsandLemmaPatExtracted.IncrementCount(CandidatePhrase.CreateOrGet(matchedToken.Word(), matchedToken.Lemma()), learned);
             }
         }
     }
 }
Example #29
0
        /// <summary>
        /// Builds the feature counter describing a single mention: type, length/location,
        /// lexical context, dependency path, optional constituency features, containment
        /// relative to other mentions sharing its head index, and a few rule-based flags.
        /// </summary>
        /// <param name="doc">Document containing the mention.</param>
        /// <param name="m">Mention to featurize.</param>
        /// <param name="mentionsByHeadIndex">
        /// Mentions grouped by head index; must contain an entry for <c>m.headIndex</c>.
        /// </param>
        /// <returns>Counter mapping feature names to values.</returns>
        private ICounter <string> GetFeatures(Document doc, Mention m, IDictionary <int, IList <Mention> > mentionsByHeadIndex)
        {
            ICounter <string> features = new ClassicCounter <string>();

            // type features
            features.IncrementCount("mention-type=" + m.mentionType);
            features.IncrementCount("gender=" + m.gender);
            features.IncrementCount("person-fine=" + m.person);
            features.IncrementCount("head-ne-type=" + m.nerString);
            IList <string> singletonFeatures = m.GetSingletonFeatures(dictionaries);

            // SingletonFeatures maps an index into the mention's singleton-feature list to a feature name
            foreach (KeyValuePair <int, string> e in SingletonFeatures)
            {
                if (e.Key < singletonFeatures.Count)
                {
                    features.IncrementCount(e.Value + "=" + singletonFeatures[e.Key]);
                }
            }
            // length and location features
            AddNumeric(features, "mention-length", m.SpanToString().Length);
            AddNumeric(features, "mention-words", m.originalSpan.Count);
            AddNumeric(features, "sentence-words", m.sentenceWords.Count);
            features.IncrementCount("sentence-words=" + Bin(m.sentenceWords.Count));
            features.IncrementCount("mention-position", m.mentionNum / (double)doc.predictedMentions.Count);
            features.IncrementCount("sentence-position", m.sentNum / (double)doc.numSentences);
            // lexical features
            CoreLabel firstWord    = FirstWord(m);
            CoreLabel lastWord     = LastWord(m);
            CoreLabel headWord     = HeadWord(m);
            CoreLabel prevWord     = PrevWord(m);
            CoreLabel nextWord     = NextWord(m);
            CoreLabel prevprevWord = PrevprevWord(m);
            CoreLabel nextnextWord = NextnextWord(m);
            string    headPOS      = GetPOS(headWord);
            string    firstPOS     = GetPOS(firstWord);
            string    lastPOS      = GetPOS(lastWord);
            string    prevPOS      = GetPOS(prevWord);
            string    nextPOS      = GetPOS(nextWord);
            string    prevprevPOS  = GetPOS(prevprevWord);
            string    nextnextPOS  = GetPOS(nextnextWord);

            features.IncrementCount("first-word=" + WordIndicator(firstWord, firstPOS));
            features.IncrementCount("last-word=" + WordIndicator(lastWord, lastPOS));
            features.IncrementCount("head-word=" + WordIndicator(headWord, headPOS));
            features.IncrementCount("next-word=" + WordIndicator(nextWord, nextPOS));
            features.IncrementCount("prev-word=" + WordIndicator(prevWord, prevPOS));
            features.IncrementCount("next-bigram=" + WordIndicator(nextWord, nextnextWord, nextPOS + "_" + nextnextPOS));
            features.IncrementCount("prev-bigram=" + WordIndicator(prevprevWord, prevWord, prevprevPOS + "_" + prevPOS));
            features.IncrementCount("next-pos=" + nextPOS);
            features.IncrementCount("prev-pos=" + prevPOS);
            features.IncrementCount("first-pos=" + firstPOS);
            features.IncrementCount("last-pos=" + lastPOS);
            features.IncrementCount("next-pos-bigram=" + nextPOS + "_" + nextnextPOS);
            features.IncrementCount("prev-pos-bigram=" + prevprevPOS + "_" + prevPOS);
            AddDependencyFeatures(features, "parent", GetDependencyParent(m), true);
            AddFeature(features, "ends-with-head", m.headIndex == m.endIndex - 1);
            AddFeature(features, "is-generic", m.originalSpan.Count == 1 && firstPOS.Equals("NNS"));
            // syntax features
            IndexedWord w       = m.headIndexedWord;
            string      depPath = string.Empty;
            int         depth   = 0;

            // Follow dependency parents upward from the head word, for at most three edges,
            // emitting one cumulative "dep-path" feature per edge.
            while (w != null)
            {
                SemanticGraphEdge e_1 = GetDependencyParent(m, w);
                depth++;
                if (depth <= 3 && e_1 != null)
                {
                    depPath += (depPath.IsEmpty() ? string.Empty : "_") + e_1.GetRelation().ToString();
                    features.IncrementCount("dep-path=" + depPath);
                    w = e_1.GetSource();
                }
                else
                {
                    w = null;
                }
            }
            if (useConstituencyParse)
            {
                int fullEmbeddingLevel    = HeadEmbeddingLevel(m.contextParseTree, m.headIndex);
                int mentionEmbeddingLevel = HeadEmbeddingLevel(m.mentionSubTree, m.headIndex - m.startIndex);
                // -1 is treated as "level could not be determined"
                if (fullEmbeddingLevel != -1 && mentionEmbeddingLevel != -1)
                {
                    features.IncrementCount("mention-embedding-level=" + Bin(fullEmbeddingLevel - mentionEmbeddingLevel));
                    features.IncrementCount("head-embedding-level=" + Bin(mentionEmbeddingLevel));
                }
                else
                {
                    features.IncrementCount("undetermined-embedding-level");
                }
                features.IncrementCount("num-embedded-nps=" + Bin(NumEmbeddedNps(m.mentionSubTree)));
                string syntaxPath = string.Empty;
                Tree   tree       = m.contextParseTree;
                Tree   head       = tree.GetLeaves()[m.headIndex].Ancestor(1, tree);
                depth = 0;
                // Cumulative path of node labels from the head's parent toward the root,
                // cut off after four nodes or at the first "S" node.
                foreach (Tree node in tree.PathNodeToNode(head, tree))
                {
                    syntaxPath += node.Value() + "-";
                    features.IncrementCount("syntax-path=" + syntaxPath);
                    depth++;
                    if (depth >= 4 || node.Value().Equals("S"))
                    {
                        break;
                    }
                }
            }
            // mention containment features
            // The two features previously passed a null predicate to AnyMatch (the lambdas were
            // lost in translation), which made them indistinguishable. They are computed here
            // with an explicit loop over the other mentions sharing this head index.
            // NOTE(review): predicates reconstructed as "m inside other" / "other inside m" via
            // Mention.InsideIn - confirm that method exists in this port.
            bool containedInOther = false;
            bool containsOther    = false;
            foreach (Mention other in mentionsByHeadIndex[m.headIndex])
            {
                // A mention is never compared against itself (identity, not equality).
                if (object.ReferenceEquals(m, other))
                {
                    continue;
                }
                containedInOther = containedInOther || m.InsideIn(other);
                containsOther    = containsOther || other.InsideIn(m);
            }
            AddFeature(features, "contained-in-other-mention", containedInOther);
            AddFeature(features, "contains-other-mention", containsOther);
            // features from dcoref rules
            AddFeature(features, "bare-plural", m.originalSpan.Count == 1 && headPOS.Equals("NNS"));
            AddFeature(features, "quantifier-start", dictionaries.quantifiers.Contains(firstWord.Word().ToLower()));
            AddFeature(features, "negative-start", firstWord.Word().ToLower().Matches("none|no|nothing|not"));
            AddFeature(features, "partitive", RuleBasedCorefMentionFinder.PartitiveRule(m, m.sentenceWords, dictionaries));
            AddFeature(features, "adjectival-demonym", dictionaries.IsAdjectivalDemonym(m.SpanToString()));
            if (doc.docType != Document.DocType.Article && m.person == Dictionaries.Person.You && nextWord != null && Sharpen.Runtime.EqualsIgnoreCase(nextWord.Word(), "know"))
            {
                features.IncrementCount("generic-you");
            }
            return(features);
        }
Example #30
0
        private ICounter <string> GetFeatures(Document doc, Mention m1, Mention m2)
        {
            System.Diagnostics.Debug.Assert((m1.AppearEarlierThan(m2)));
            ICounter <string> features = new ClassicCounter <string>();

            // global features
            features.IncrementCount("bias");
            if (useDocSource)
            {
                features.IncrementCount("doc-type=" + doc.docType);
                if (doc.docInfo != null && doc.docInfo.Contains("DOC_ID"))
                {
                    features.IncrementCount("doc-source=" + doc.docInfo["DOC_ID"].Split("/")[1]);
                }
            }
            // singleton feature conjunctions
            IList <string> singletonFeatures1 = m1.GetSingletonFeatures(dictionaries);
            IList <string> singletonFeatures2 = m2.GetSingletonFeatures(dictionaries);

            foreach (KeyValuePair <int, string> e in SingletonFeatures)
            {
                if (e.Key < singletonFeatures1.Count && e.Key < singletonFeatures2.Count)
                {
                    features.IncrementCount(e.Value + "=" + singletonFeatures1[e.Key] + "_" + singletonFeatures2[e.Key]);
                }
            }
            SemanticGraphEdge p1 = GetDependencyParent(m1);
            SemanticGraphEdge p2 = GetDependencyParent(m2);

            features.IncrementCount("dep-relations=" + (p1 == null ? "null" : p1.GetRelation()) + "_" + (p2 == null ? "null" : p2.GetRelation()));
            features.IncrementCount("roles=" + GetRole(m1) + "_" + GetRole(m2));
            CoreLabel headCL1  = HeadWord(m1);
            CoreLabel headCL2  = HeadWord(m2);
            string    headPOS1 = GetPOS(headCL1);
            string    headPOS2 = GetPOS(headCL2);

            features.IncrementCount("head-pos-s=" + headPOS1 + "_" + headPOS2);
            features.IncrementCount("head-words=" + WordIndicator("h_" + headCL1.Word().ToLower() + "_" + headCL2.Word().ToLower(), headPOS1 + "_" + headPOS2));
            // agreement features
            AddFeature(features, "animacies-agree", m2.AnimaciesAgree(m1));
            AddFeature(features, "attributes-agree", m2.AttributesAgree(m1, dictionaries));
            AddFeature(features, "entity-types-agree", m2.EntityTypesAgree(m1, dictionaries));
            AddFeature(features, "numbers-agree", m2.NumbersAgree(m1));
            AddFeature(features, "genders-agree", m2.GendersAgree(m1));
            AddFeature(features, "ner-strings-equal", m1.nerString.Equals(m2.nerString));
            // string matching features
            AddFeature(features, "antecedent-head-in-anaphor", HeadContainedIn(m1, m2));
            AddFeature(features, "anaphor-head-in-antecedent", HeadContainedIn(m2, m1));
            if (m1.mentionType != Dictionaries.MentionType.Pronominal && m2.mentionType != Dictionaries.MentionType.Pronominal)
            {
                AddFeature(features, "antecedent-in-anaphor", m2.SpanToString().ToLower().Contains(m1.SpanToString().ToLower()));
                AddFeature(features, "anaphor-in-antecedent", m1.SpanToString().ToLower().Contains(m2.SpanToString().ToLower()));
                AddFeature(features, "heads-equal", Sharpen.Runtime.EqualsIgnoreCase(m1.headString, m2.headString));
                AddFeature(features, "heads-agree", m2.HeadsAgree(m1));
                AddFeature(features, "exact-match", m1.ToString().Trim().ToLower().Equals(m2.ToString().Trim().ToLower()));
                AddFeature(features, "partial-match", RelaxedStringMatch(m1, m2));
                double editDistance = StringUtils.EditDistance(m1.SpanToString(), m2.SpanToString()) / (double)(m1.SpanToString().Length + m2.SpanToString().Length);
                features.IncrementCount("edit-distance", editDistance);
                features.IncrementCount("edit-distance=" + ((int)(editDistance * 10) / 10.0));
                double headEditDistance = StringUtils.EditDistance(m1.headString, m2.headString) / (double)(m1.headString.Length + m2.headString.Length);
                features.IncrementCount("head-edit-distance", headEditDistance);
                features.IncrementCount("head-edit-distance=" + ((int)(headEditDistance * 10) / 10.0));
            }
            // distance features
            AddNumeric(features, "mention-distance", m2.mentionNum - m1.mentionNum);
            AddNumeric(features, "sentence-distance", m2.sentNum - m1.sentNum);
            if (m2.sentNum == m1.sentNum)
            {
                AddNumeric(features, "word-distance", m2.startIndex - m1.endIndex);
                if (m1.endIndex > m2.startIndex)
                {
                    features.IncrementCount("spans-intersect");
                }
            }
            // setup for dcoref features
            ICollection <Mention> ms1 = new HashSet <Mention>();

            ms1.Add(m1);
            ICollection <Mention> ms2 = new HashSet <Mention>();

            ms2.Add(m2);
            Random       r  = new Random();
            CorefCluster c1 = new CorefCluster(20000 + r.NextInt(10000), ms1);
            CorefCluster c2 = new CorefCluster(10000 + r.NextInt(10000), ms2);
            string       s2 = m2.LowercaseNormalizedSpanString();
            string       s1 = m1.LowercaseNormalizedSpanString();

            // discourse dcoref features
            AddFeature(features, "mention-speaker-PER0", Sharpen.Runtime.EqualsIgnoreCase(m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)), "PER0"));
            AddFeature(features, "antecedent-is-anaphor-speaker", CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
            AddFeature(features, "same-speaker", CorefRules.EntitySameSpeaker(doc, m2, m1));
            AddFeature(features, "person-disagree-same-speaker", CorefRules.EntityPersonDisagree(doc, m2, m1, dictionaries) && CorefRules.EntitySameSpeaker(doc, m2, m1));
            AddFeature(features, "antecedent-matches-anaphor-speaker", CorefRules.AntecedentMatchesMentionSpeakerAnnotation(m2, m1, doc));
            AddFeature(features, "discourse-you-PER0", m2.person == Dictionaries.Person.You && doc.docType == Document.DocType.Article && m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)).Equals("PER0"));
            AddFeature(features, "speaker-match-i-i", m2.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s1) && m1.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s2) && CorefRules.
                       EntitySameSpeaker(doc, m2, m1));
            AddFeature(features, "speaker-match-speaker-i", m2.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s2) && CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1, dictionaries));
            AddFeature(features, "speaker-match-i-speaker", m1.number == Dictionaries.Number.Singular && dictionaries.firstPersonPronouns.Contains(s1) && CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries));
            AddFeature(features, "speaker-match-you-you", dictionaries.secondPersonPronouns.Contains(s1) && dictionaries.secondPersonPronouns.Contains(s2) && CorefRules.EntitySameSpeaker(doc, m2, m1));
            AddFeature(features, "discourse-between-two-person", ((m2.person == Dictionaries.Person.I && m1.person == Dictionaries.Person.You || (m2.person == Dictionaries.Person.You && m1.person == Dictionaries.Person.I)) && (m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation
                                                                                                                                                                                                                                                          )) - m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) == 1) && doc.docType == Document.DocType.Conversation));
            AddFeature(features, "incompatible-not-match", m1.person != Dictionaries.Person.I && m2.person != Dictionaries.Person.I && (CorefRules.AntecedentIsMentionSpeaker(doc, m1, m2, dictionaries) || CorefRules.AntecedentIsMentionSpeaker(doc, m2, m1
                                                                                                                                                                                                                                                  , dictionaries)));
            int utteranceDist = Math.Abs(m1.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)) - m2.headWord.Get(typeof(CoreAnnotations.UtteranceAnnotation)));

            if (doc.docType != Document.DocType.Article && utteranceDist == 1 && !CorefRules.EntitySameSpeaker(doc, m2, m1))
            {
                AddFeature(features, "speaker-mismatch-i-i", m1.person == Dictionaries.Person.I && m2.person == Dictionaries.Person.I);
                AddFeature(features, "speaker-mismatch-you-you", m1.person == Dictionaries.Person.You && m2.person == Dictionaries.Person.You);
                AddFeature(features, "speaker-mismatch-we-we", m1.person == Dictionaries.Person.We && m2.person == Dictionaries.Person.We);
            }
            // other dcoref features
            string firstWord1 = FirstWord(m1).Word().ToLower();

            AddFeature(features, "indefinite-article-np", (m1.appositions == null && m1.predicateNominatives == null && (firstWord1.Equals("a") || firstWord1.Equals("an"))));
            AddFeature(features, "far-this", m2.LowercaseNormalizedSpanString().Equals("this") && Math.Abs(m2.sentNum - m1.sentNum) > 3);
            AddFeature(features, "per0-you-in-article", m2.person == Dictionaries.Person.You && doc.docType == Document.DocType.Article && m2.headWord.Get(typeof(CoreAnnotations.SpeakerAnnotation)).Equals("PER0"));
            AddFeature(features, "inside-in", m2.InsideIn(m1) || m1.InsideIn(m2));
            AddFeature(features, "indefinite-determiners", dictionaries.indefinitePronouns.Contains(m1.originalSpan[0].Lemma()) || dictionaries.indefinitePronouns.Contains(m2.originalSpan[0].Lemma()));
            AddFeature(features, "entity-attributes-agree", CorefRules.EntityAttributesAgree(c2, c1));
            AddFeature(features, "entity-token-distance", CorefRules.EntityTokenDistance(m2, m1));
            AddFeature(features, "i-within-i", CorefRules.EntityIWithinI(m2, m1, dictionaries));
            AddFeature(features, "exact-string-match", CorefRules.EntityExactStringMatch(c2, c1, dictionaries, doc.roleSet));
            AddFeature(features, "entity-relaxed-heads-agree", CorefRules.EntityRelaxedHeadsAgreeBetweenMentions(c2, c1, m2, m1));
            AddFeature(features, "is-acronym", CorefRules.EntityIsAcronym(doc, c2, c1));
            AddFeature(features, "demonym", m2.IsDemonym(m1, dictionaries));
            AddFeature(features, "incompatible-modifier", CorefRules.EntityHaveIncompatibleModifier(m2, m1));
            AddFeature(features, "head-lemma-match", m1.headWord.Lemma().Equals(m2.headWord.Lemma()));
            AddFeature(features, "words-included", CorefRules.EntityWordsIncluded(c2, c1, m2, m1));
            AddFeature(features, "extra-proper-noun", CorefRules.EntityHaveExtraProperNoun(m2, m1, new HashSet <string>()));
            AddFeature(features, "number-in-later-mentions", CorefRules.EntityNumberInLaterMention(m2, m1));
            AddFeature(features, "sentence-context-incompatible", CorefRules.SentenceContextIncompatible(m2, m1, dictionaries));
            // syntax features
            if (useConstituencyParse)
            {
                if (m1.sentNum == m2.sentNum)
                {
                    int  clauseCount = 0;
                    Tree tree        = m2.contextParseTree;
                    Tree current     = m2.mentionSubTree;
                    while (true)
                    {
                        current = current.Ancestor(1, tree);
                        if (current.Label().Value().StartsWith("S"))
                        {
                            clauseCount++;
                        }
                        if (current.Dominates(m1.mentionSubTree))
                        {
                            break;
                        }
                        if (current.Label().Value().Equals("ROOT") || current.Ancestor(1, tree) == null)
                        {
                            break;
                        }
                    }
                    features.IncrementCount("clause-count", clauseCount);
                    features.IncrementCount("clause-count=" + Bin(clauseCount));
                }
                if (RuleBasedCorefMentionFinder.IsPleonastic(m2, m2.contextParseTree) || RuleBasedCorefMentionFinder.IsPleonastic(m1, m1.contextParseTree))
                {
                    features.IncrementCount("pleonastic-it");
                }
                if (MaximalNp(m1.mentionSubTree) == MaximalNp(m2.mentionSubTree))
                {
                    features.IncrementCount("same-maximal-np");
                }
                bool m1Embedded = HeadEmbeddingLevel(m1.mentionSubTree, m1.headIndex - m1.startIndex) > 1;
                bool m2Embedded = HeadEmbeddingLevel(m2.mentionSubTree, m2.headIndex - m2.startIndex) > 1;
                features.IncrementCount("embedding=" + m1Embedded + "_" + m2Embedded);
            }
            return(features);
        }