/// <summary>
/// Searches <paramref name="tree"/> depth-first for the node whose begin/end index
/// annotations equal <paramref name="span"/> and overwrites that node's label value.
/// </summary>
/// <param name="tree">Subtree to search; every visited node must be labeled with a <c>CoreLabel</c>.</param>
/// <param name="span">Pair of (begin index, end index) compared with the node annotations.</param>
/// <param name="value">Value written to the label of the matching node.</param>
/// <returns><c>true</c> if a node with exactly that span was found and relabeled; otherwise <c>false</c>.</returns>
public static bool SetSpanLabel(Tree tree, Pair <int, int> span, string value)
        {
            CoreLabel nodeLabel = tree.Label() as CoreLabel;

            if (nodeLabel == null)
            {
                throw new AssertionError("Expected CoreLabels");
            }

            // Exact match: relabel this node and stop searching.
            bool sameBegin = nodeLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)).Equals(span.first);
            if (sameBegin && nodeLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation)).Equals(span.second))
            {
                nodeLabel.SetValue(value);
                return true;
            }

            // A node lying strictly inside the requested span is not searched any further.
            bool strictlyInside = nodeLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation)) > span.first
                                  && nodeLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation)) < span.second;
            if (strictlyInside)
            {
                return false;
            }

            foreach (Tree subtree in tree.Children())
            {
                bool relabeled = SetSpanLabel(subtree, span, value);
                if (relabeled)
                {
                    return true;
                }
            }
            return false;
        }
Example #2
0
        /// <summary>
        /// Extracts the string for one feature component from a label.
        /// </summary>
        /// <param name="label">Label to read from; when <c>null</c> the <c>Null</c> placeholder is returned.</param>
        /// <param name="feature">Which component to extract: head word, head tag or the label's own value.</param>
        /// <returns>The requested string, or <c>Null</c> when <paramref name="label"/> is <c>null</c>.</returns>
        /// <exception cref="System.ArgumentException"><paramref name="feature"/> is not one of the handled components.</exception>
        public static string GetFeatureFromCoreLabel(CoreLabel label, FeatureFactory.FeatureComponent feature)
        {
            // The feature kind is examined before the null check, so an unknown
            // feature is rejected even when no label is supplied.
            if (feature == FeatureFactory.FeatureComponent.Headword)
            {
                if (label == null)
                {
                    return Null;
                }
                return label.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)).Value();
            }

            if (feature == FeatureFactory.FeatureComponent.Headtag)
            {
                if (label == null)
                {
                    return Null;
                }
                return label.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)).Value();
            }

            if (feature == FeatureFactory.FeatureComponent.Value)
            {
                if (label == null)
                {
                    return Null;
                }
                return label.Value();
            }

            throw new ArgumentException("Unexpected feature type: " + feature);
        }
        /// <summary>
        /// Returns the index of the token that ends the block of text starting at
        /// <paramref name="startIndex"/>.
        /// </summary>
        /// <remarks>
        /// Key assumption: blocks are delimited by tokens (i.e. no token spans two blocks).
        /// The character span is measured from the begin offset of the start token to the end
        /// offset of the candidate token and compared with the length of the trimmed text.
        /// </remarks>
        /// <param name="startIndex">Index in <paramref name="tokens"/> of the first token of the block.</param>
        /// <param name="tokens">Tokens carrying character begin/end offset annotations.</param>
        /// <param name="text">Text of the block; only its trimmed length is used.</param>
        /// <returns>
        /// The index of the token whose end offset closes a span exactly as long as the text; the
        /// index just before the first token that overshoots that length (which is
        /// <c>startIndex - 1</c> when the start token itself overshoots); or the last token index
        /// when the tokens run out first.
        /// </returns>
        public static int GetEndIndex(int startIndex, IList <CoreLabel> tokens, string text)
        {
            // Trim so that surrounding newlines/whitespace do not throw off the text length.
            int targetLength   = text.Trim().Length;
            int tokenBeginChar = tokens[startIndex].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));

            for (int currIndex = startIndex; currIndex < tokens.Count; currIndex++)
            {
                int tokenEndChar = tokens[currIndex].Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                int spanLength   = tokenEndChar - tokenBeginChar;
                if (spanLength == targetLength)
                {
                    return currIndex;
                }
                if (spanLength > targetLength)
                {
                    return currIndex - 1;
                }
            }
            // Ran out of tokens before covering the text: the last token ends the block.
            return tokens.Count - 1;
        }
Example #4
0
        /// <summary>
        /// Builds the context representation of a token: a surface <c>Token</c> carrying an
        /// OR-restriction for every non-background class found on the token, together with the
        /// names of those classes joined by "|".
        /// </summary>
        /// <param name="tokenj">Token to describe; it must carry a non-null value for every class
        /// returned by <c>ConstantsAndVariables.GetGeneralizeClasses()</c>.</param>
        /// <returns>
        /// A triple of: whether only background labels were seen, the restricted token, and the
        /// "|"-separated class names (empty when none applied).
        /// </returns>
        /// <exception cref="System.Exception">A generalize class is missing or null on the token.</exception>
        internal static Triple <bool, Token, string> GetContextTokenStr(CoreLabel tokenj)
        {
            Token  restricted     = new Token(PatternFactory.PatternType.Surface);
            string joinedNames    = string.Empty;
            bool   onlyBackground = true;

            // An earlier variant (disabled in the original source) iterated over the answer
            // classes here; only the generalize classes are consulted now.
            foreach (KeyValuePair <string, Type> entry in ConstantsAndVariables.GetGeneralizeClasses())
            {
                Type annotationClass = entry.Value;
                if (!tokenj.ContainsKey(annotationClass) || tokenj.Get(annotationClass) == null)
                {
                    throw new Exception(" Why does the token not have the class " + annotationClass + " set? Existing classes " + tokenj.ToString(CoreLabel.OutputFormat.All));
                }
                if (tokenj.Get(annotationClass).Equals(ConstantsAndVariables.backgroundSymbol))
                {
                    // Background value: contributes nothing to the restriction.
                    continue;
                }
                onlyBackground = false;
                joinedNames    = joinedNames.IsEmpty() ? entry.Key : joinedNames + "|" + entry.Key;
                restricted.AddORRestriction(annotationClass, entry.Key);
            }

            if (useContextNERRestriction)
            {
                string nerTag    = tokenj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                bool   hasEntity = nerTag != null && !nerTag.Equals(SeqClassifierFlags.DefaultBackgroundSymbol);
                if (hasEntity)
                {
                    onlyBackground = false;
                    joinedNames    = joinedNames.IsEmpty() ? nerTag : joinedNames + "|" + nerTag;
                    restricted.AddORRestriction(typeof(CoreAnnotations.NamedEntityTagAnnotation), nerTag);
                }
            }
            return new Triple <bool, Token, string>(onlyBackground, restricted, joinedNames);
        }
Example #5
0
        /// <summary>
        /// Asserts that two trees are labeled with <c>CoreLabel</c>s carrying equal head-word
        /// and head-tag label annotations.
        /// </summary>
        /// <param name="t1">Tree providing the expected head annotations.</param>
        /// <param name="t2">Tree providing the actual head annotations.</param>
        public virtual void CheckHeads(Tree t1, Tree t2)
        {
            NUnit.Framework.Assert.IsTrue(t1.Label() is CoreLabel);
            NUnit.Framework.Assert.IsTrue(t2.Label() is CoreLabel);

            CoreLabel expected = (CoreLabel)t1.Label();
            CoreLabel actual   = (CoreLabel)t2.Label();

            // Head words first, then head tags.
            var expectedWord = expected.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation));
            var actualWord   = actual.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation));
            NUnit.Framework.Assert.AreEqual(expectedWord, actualWord);

            var expectedTag = expected.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation));
            var actualTag   = actual.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation));
            NUnit.Framework.Assert.AreEqual(expectedTag, actualTag);
        }
Example #6
0
        /// <summary>
        /// Returns the node containing the head word for this node (or <c>null</c> if none),
        /// as recorded in this node's <c>CoreLabel</c>.
        /// </summary>
        /// <remarks>
        /// In contrast to <c>edu.stanford.nlp.ling.CategoryWordTag</c>, head words and head tags
        /// are stored as references to nodes, not merely as strings. A recorded head node that
        /// belongs to a different, non-null tree graph than this node is reported as <c>null</c>.
        /// </remarks>
        /// <returns>The node containing the head word for this node, or <c>null</c>.</returns>
        public TreeGraphNode HeadWordNode()
        {
            TreeGraphNode candidate = SafeCast(_label.Get(typeof(TreeCoreAnnotations.HeadWordAnnotation)));

            if (candidate == null)
            {
                return null;
            }

            // Reject a head that is attached to some other tree graph.
            bool belongsElsewhere = candidate.TreeGraph() != null && !candidate.TreeGraph().Equals(this.TreeGraph());
            return belongsElsewhere ? null : candidate;
        }
        /// <summary>
        /// Runs the classifier over every sentence of <paramref name="annotation"/> and copies its
        /// answer labels onto the tokens' named-entity tags wherever the answer span is compatible
        /// with the named-entity span already present.
        /// </summary>
        /// <param name="annotation">Annotation to process; it must contain a sentences annotation.</param>
        /// <exception cref="System.Exception">The annotation contains no sentences annotation.</exception>
        public virtual void Annotate(Annotation annotation)
        {
            if (verbose)
            {
                log.Info("Adding RegexNER annotations ... ");
            }

            bool hasSentences = annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation));
            if (!hasSentences)
            {
                throw new Exception("Unable to find sentences in " + annotation);
            }

            IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
            foreach (ICoreMap sentence in sentences)
            {
                IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                classifier.Classify(tokens);

                // Tokens without any named-entity tag receive the background symbol.
                foreach (CoreLabel untagged in tokens)
                {
                    if (untagged.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) == null)
                    {
                        untagged.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), classifier.flags.backgroundSymbol);
                    }
                }

                int start = 0;
                while (start < tokens.Count)
                {
                    CoreLabel current    = tokens[start];
                    string    answerType = current.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    if (answerType == null)
                    {
                        start++;
                        continue;
                    }

                    string nerType   = current.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                    int    answerEnd = FindEndOfAnswerAnnotation(tokens, start);
                    int    nerStart  = FindStartOfNERAnnotation(tokens, start);
                    int    nerEnd    = FindEndOfNERAnnotation(tokens, start);

                    // Check that the spans are the same, specially handling the case of
                    // tokens with background named entity tags ("other").
                    bool startsAgree = nerStart == start || nerType.Equals(classifier.flags.backgroundSymbol);
                    bool spansAgree  = startsAgree
                                       && (answerEnd == nerEnd || (nerType.Equals(classifier.flags.backgroundSymbol) && nerEnd >= answerEnd));
                    if (spansAgree)
                    {
                        // Annotate each token in the span.
                        for (int position = start; position < answerEnd; position++)
                        {
                            tokens[position].Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), answerType);
                        }
                    }

                    // Continue with the first token after the answer span.
                    start = answerEnd;
                }
            }

            if (verbose)
            {
                log.Info("done.");
            }
        }
        /// <summary>
        /// Builds the feature datum for the token at position <paramref name="i"/> of a sentence.
        /// </summary>
        /// <remarks>
        /// The datum label is <c>answerLabel</c> when the token's answer-class value matches it and
        /// "O" otherwise. Each matched phrase of the token (or, when none are recorded, the token's
        /// own word) contributes one "Cluster-&lt;id&gt;" feature; phrases with no known cluster id
        /// use id -1.
        /// </remarks>
        /// <param name="sent">Tokens of the sentence.</param>
        /// <param name="i">Index of the token in <paramref name="sent"/> to build the datum for.</param>
        /// <returns>A datum holding the collected feature counts and the label.</returns>
        private RVFDatum <string, string> GetDatum(CoreLabel[] sent, int i)
        {
            ICounter <string> feat = new ClassicCounter <string>();
            CoreLabel         l    = sent[i];
            string            label;

            if (l.Get(answerClass).ToString().Equals(answerLabel))
            {
                label = answerLabel;
            }
            else
            {
                label = "O";
            }
            CollectionValuedMap <string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));

            if (matchedPhrases == null)
            {
                // No recorded phrase matches: fall back to the token's own word.
                matchedPhrases = new CollectionValuedMap <string, CandidatePhrase>();
                matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
            }
            foreach (CandidatePhrase w in matchedPhrases.AllValues())
            {
                // Phrases without a cluster id map to -1. An int can never equal null and a
                // dictionary indexer throws on a missing key, so look the id up explicitly.
                // NOTE(review): assumes clusterIds exposes the IDictionary<string, int> TryGetValue contract — confirm.
                int num;
                if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num))
                {
                    num = -1;
                }
                feat.SetCount("Cluster-" + num, 1.0);
            }
            // Word/lemma/tag features for the token itself are currently disabled:
            // feat.incrementCount("WORD-" + l.word());
            // feat.incrementCount("LEMMA-" + l.lemma());
            // feat.incrementCount("TAG-" + l.tag());

            // With a window of 0 neither context loop below executes; they are kept so the
            // window can be widened by changing this one value.
            int window = 0;

            for (int j = Math.Max(0, i - window); j < i; j++)
            {
                CoreLabel lj = sent[j];
                feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
                feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
            }
            for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++)
            {
                CoreLabel lj = sent[j_1];
                feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
                feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
                feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
            }
            return new RVFDatum <string, string>(feat, label);
        }
        /// <summary>
        /// Appends one feature describing the head tag and head word of a label.
        /// </summary>
        /// <param name="features">List the feature string is appended to.</param>
        /// <param name="label">Label to describe; when <c>null</c> the <c>Null</c> placeholder is used instead.</param>
        /// <param name="wtFeature">Prefix placed in front of the feature string.</param>
        public static void AddUnaryQueueFeatures(IList <string> features, CoreLabel label, string wtFeature)
        {
            if (label != null)
            {
                string headTag  = label.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)).Value();
                string headWord = label.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)).Value();

                // TODO: check to see if this is slow because of the string concat
                string described = wtFeature + headTag + "-" + headWord;
                features.Add(described);
            }
            else
            {
                features.Add(wtFeature + Null);
            }
        }
        /// <summary>
        /// Classifies a copy of the sentence tokens with the number sequence classifier and
        /// transfers the results back onto the original tokens.
        /// </summary>
        /// <remarks>
        /// An original token's NER tag is replaced by the new guess only when the original tag is
        /// missing, the background symbol or "MISC", and the new guess is not the background symbol.
        /// Other annotations are transferred through <c>NumberSequenceClassifier.TransferAnnotations</c>.
        /// </remarks>
        /// <param name="words">Original tokens of the sentence; updated in place.</param>
        /// <param name="doc">Document annotation passed to the classifier.</param>
        /// <param name="sentence">Sentence the tokens belong to.</param>
        /// <exception cref="System.InvalidOperationException">The copied token list is shorter than <paramref name="words"/>.</exception>
        private void DoOneSentenceNew(IList <CoreLabel> words, Annotation doc, ICoreMap sentence)
        {
            IList <CoreLabel> newWords = NumberSequenceClassifier.CopyTokens(words, sentence);

            nsc.ClassifyWithGlobalInformation(newWords, doc, sentence);

            using (IEnumerator <CoreLabel> newFLIter = newWords.GetEnumerator())
            {
                foreach (CoreLabel origWord in words)
                {
                    // Advance the copy in lockstep with the originals. Reading Current without
                    // MoveNext() never yields the matching copied token.
                    if (!newFLIter.MoveNext())
                    {
                        throw new System.InvalidOperationException("Copied token list is shorter than the original token list");
                    }
                    CoreLabel newWord  = newFLIter.Current;
                    string    before   = origWord.Ner();
                    string    newGuess = newWord.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    if (Verbose)
                    {
                        log.Info(newWord);
                    }
                    if ((before == null || before.Equals(BackgroundSymbol) || before.Equals("MISC")) && !newGuess.Equals(BackgroundSymbol))
                    {
                        origWord.SetNER(newGuess);
                    }
                    // transfer other annotations generated by SUTime or NumberNormalizer
                    NumberSequenceClassifier.TransferAnnotations(newWord, origWord);
                }
            }
        }
Example #11
0
        /// <summary>
        /// Helper method for creating a version of the document text without XML.
        /// </summary>
        /// <remarks>
        /// The result consists of the tokens' original text; everything before the first token,
        /// between consecutive tokens and after the last token is taken from
        /// <paramref name="documentText"/> and passed through <c>ReplaceAll("\\S", " ")</c>.
        /// </remarks>
        /// <param name="documentText">Full document text the token character offsets refer to.</param>
        /// <param name="annotation">Annotation holding the tokens; the token list must not be empty.</param>
        /// <returns>The cleaned text.</returns>
        public static string XmlFreeText(string documentText, Annotation annotation)
        {
            IList <CoreLabel> tokens     = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
            int               firstChar  = tokens[0].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));

            // add white space for all text before first token
            string cleaned = Sharpen.Runtime.Substring(documentText, 0, firstChar).ReplaceAll("\\S", " ");

            for (int position = 0; position < tokens.Count; position++)
            {
                CoreLabel token = tokens[position];
                // add the current token's text
                cleaned += token.OriginalText();

                int following = position + 1;
                if (following >= tokens.Count)
                {
                    continue;
                }

                // add whitespace for non-tokens and xml in between these tokens
                int    gapStart = token.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
                int    gapEnd   = tokens[following].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
                string gap      = Sharpen.Runtime.Substring(documentText, gapStart, gapEnd);
                cleaned += gap.ReplaceAll("\\S", " ");
            }

            // add white space for all non-token content after last token
            string tail = Sharpen.Runtime.Substring(documentText, cleaned.Length, documentText.Length);
            cleaned += tail.ReplaceAll("\\S", " ");
            return cleaned;
        }
Example #12
0
        /// <summary>
        /// Produces the character features built from the current, next and previous characters
        /// around position <paramref name="loc"/>.
        /// </summary>
        /// <param name="cInfo">Padded sequence of character labels.</param>
        /// <param name="loc">Position of the current character in <paramref name="cInfo"/>.</param>
        /// <returns>The features; empty when <c>flags.useWordn</c> is off.</returns>
        public virtual ICollection <string> FeaturesCnC(PaddedList <IN> cInfo, int loc)
        {
            CoreLabel current  = cInfo[loc];
            CoreLabel next     = cInfo[loc + 1];
            CoreLabel previous = cInfo[loc - 1];

            string curChar  = current.Get(typeof(CoreAnnotations.CharAnnotation));
            string nextChar = next.Get(typeof(CoreAnnotations.CharAnnotation));
            string prevChar = previous.Get(typeof(CoreAnnotations.CharAnnotation));

            ICollection <string> features = new List <string>();
            if (!flags.useWordn)
            {
                return features;
            }

            features.Add(curChar + "c");
            features.Add(nextChar + "c1");
            features.Add(prevChar + "p");
            features.Add(prevChar + curChar + "pc");

            // The two extra bigrams are only emitted when one of these flags is set.
            bool wantsExtraBigrams = flags.useAs || flags.useMsr || flags.usePk || flags.useHk;
            if (wantsExtraBigrams)
            {
                features.Add(curChar + nextChar + "cc1");
                features.Add(prevChar + nextChar + "pc1");
            }
            features.Add("|wordn");
            return features;
        }
        /// <summary>
        /// Creates a new tree node labeled <paramref name="label"/> whose head-word and head-tag
        /// label annotations are copied from the label of <paramref name="top"/>.
        /// </summary>
        /// <param name="top">Tree whose <c>CoreLabel</c> supplies the head annotations.</param>
        /// <param name="label">Value of the new node's label.</param>
        /// <param name="children">Children added to the new node, in order.</param>
        /// <returns>The newly created node.</returns>
        internal static Tree CreateNode(Tree top, string label, params Tree[] children)
        {
            CoreLabel source = (CoreLabel)top.Label();

            CoreLabel nodeLabel = new CoreLabel();
            nodeLabel.SetValue(label);

            // Carry the head information over from the source label.
            var headWord = source.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation));
            nodeLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), headWord);
            var headTag = source.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation));
            nodeLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), headTag);

            Tree created = new LabeledScoredTreeNode(nodeLabel);
            for (int position = 0; position < children.Length; position++)
            {
                created.AddChild(children[position]);
            }
            return created;
        }
Example #14
0
        /// <summary>
        /// Finds the leaf of <paramref name="root"/> whose value equals <paramref name="token"/>
        /// and whose 0-based position lies in
        /// [<paramref name="index"/>, <paramref name="index"/> + <paramref name="approximateness"/>].
        /// </summary>
        /// <remarks>
        /// Fallbacks, each logged as a warning: first any leaf whose value equals the token, then
        /// the second-to-last leaf (or the only leaf when there is just one).
        /// </remarks>
        /// <param name="root">Tree whose leaves are searched.</param>
        /// <param name="token">Leaf value to look for.</param>
        /// <param name="index">Expected 0-based position of the leaf.</param>
        /// <param name="approximateness">How far past <paramref name="index"/> the leaf may lie.</param>
        /// <returns>The best matching leaf according to the rules above.</returns>
        private static Tree FunkyFindLeafWithApproximateSpan(Tree root, string token, int index, int approximateness)
        {
            IList <Tree> leaves = root.GetLeaves();

            foreach (Tree leaf in leaves)
            {
                CoreLabel label = (CoreLabel)leaf.Label();
                // Skip leaves that carry no index. Comparing an int with null is always false,
                // so the presence of the annotation is tested explicitly.
                if (!label.ContainsKey(typeof(CoreAnnotations.IndexAnnotation)))
                {
                    continue;
                }
                // The index annotation is shifted down by one to get the 0-based position.
                int ind = label.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
                if (token.Equals(leaf.Value()) && ind >= index && ind <= index + approximateness)
                {
                    return leaf;
                }
            }
            // this shouldn't happen
            SieveCoreferenceSystem.logger.Warning("RuleBasedCorefMentionFinder: Failed to find head token:\n" + "Tree is: " + root + "\n" + "token = |" + token + "|" + index + "|, approx=" + approximateness);
            foreach (Tree leaf_1 in leaves)
            {
                if (token.Equals(leaf_1.Value()))
                {
                    return leaf_1;
                }
            }
            int fallback = Math.Max(0, leaves.Count - 2);

            SieveCoreferenceSystem.logger.Warning("RuleBasedCorefMentionFinder: Last resort: returning as head: " + leaves[fallback]);
            return leaves[fallback];
        }
Example #15
0
        /// <summary>Returns the 0-based index of the head of the tree.</summary>
        /// <remarks>Assumes the leaves had been indexed from 1, so the head label's index is reduced by one.</remarks>
        /// <param name="tree">Tree whose label carries a head-word label annotation.</param>
        /// <returns>The head word label's index minus one.</returns>
        internal static int HeadIndex(Tree tree)
        {
            CoreLabel nodeLabel = ErasureUtils.UncheckedCast(tree.Label());
            CoreLabel headWord  = nodeLabel.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation));

            int oneBasedIndex = headWord.Index();
            return oneBasedIndex - 1;
        }
Example #16
0
        /// <summary>Creates a datum (a new <c>CoreLabel</c>) from a string.</summary>
        /// <remarks>
        /// The CoreAnnotations must correspond to those used by SequenceClassifier. The token is
        /// stored as both the text and the char annotation, and the label as both the answer and
        /// the gold answer annotation. The domain annotation is copied from
        /// <paramref name="cl"/> if present. Both character offsets of the new datum are computed
        /// from the character begin offset of <paramref name="cl"/>: begin is that offset plus
        /// <paramref name="startOffset"/>, end is that offset plus <paramref name="endOffset"/>.
        /// </remarks>
        /// <param name="cl">Label the datum is derived from; must not be <c>null</c>.</param>
        /// <param name="token">Text of the new datum.</param>
        /// <param name="label">Answer label of the new datum.</param>
        /// <param name="startOffset">Offset of the datum's first character relative to the begin offset of <paramref name="cl"/>.</param>
        /// <param name="endOffset">Offset of the datum's end relative to the begin offset of <paramref name="cl"/>.</param>
        /// <returns>The newly created label.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="cl"/> is <c>null</c>.</exception>
        private static CoreLabel CreateDatum(CoreLabel cl, string token, string label, int startOffset, int endOffset)
        {
            // The offsets cannot be computed without the source label, so reject null up front
            // instead of failing halfway through with a null dereference.
            if (cl == null)
            {
                throw new System.ArgumentNullException("cl");
            }
            var baseOffset = cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));

            CoreLabel newTok = new CoreLabel();

            newTok.Set(typeof(CoreAnnotations.TextAnnotation), token);
            newTok.Set(typeof(CoreAnnotations.CharAnnotation), token);
            newTok.Set(typeof(CoreAnnotations.AnswerAnnotation), label);
            newTok.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
            newTok.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), baseOffset + startOffset);
            newTok.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), baseOffset + endOffset);
            if (cl.ContainsKey(typeof(CoreAnnotations.DomainAnnotation)))
            {
                newTok.Set(typeof(CoreAnnotations.DomainAnnotation), cl.Get(typeof(CoreAnnotations.DomainAnnotation)));
            }
            return newTok;
        }
Example #17
0
        /// <summary>
        /// Determines the head index, head word and head string of every mention in a sentence.
        /// </summary>
        /// <remarks>
        /// For Chinese the work is delegated to <c>FindHeadChinese</c>; otherwise the syntactic head
        /// found in the sentence's parse tree is used. When the resulting head falls outside the
        /// mention's original span, the head is reset to the start of the mention.
        /// </remarks>
        /// <param name="s">Sentence providing the parse tree and the tokens.</param>
        /// <param name="mentions">Mentions of the sentence; updated in place.</param>
        public virtual void FindHead(ICoreMap s, IList <Mention> mentions)
        {
            Tree parse = s.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
            IList <CoreLabel> tokens = s.Get(typeof(CoreAnnotations.TokensAnnotation));

            parse.IndexSpans(0);
            foreach (Mention m in mentions)
            {
                if (lang != Locale.Chinese)
                {
                    CoreLabel headLabel = (CoreLabel)FindSyntacticHead(m, parse, tokens).Label();
                    // The index annotation is reduced by one to address the token list.
                    m.headIndex  = headLabel.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
                    m.headWord   = tokens[m.headIndex];
                    m.headString = m.headWord.Get(typeof(CoreAnnotations.TextAnnotation)).ToLower(Locale.English);
                }
                else
                {
                    FindHeadChinese(tokens, m);
                }

                int offsetInSpan = m.headIndex - m.startIndex;
                bool insideSpan  = offsetInSpan >= 0 && offsetInSpan < m.originalSpan.Count;
                if (insideSpan)
                {
                    continue;
                }

                Redwood.Log("Invalid index for head " + offsetInSpan + "=" + m.headIndex + "-" + m.startIndex + ": originalSpan=[" + StringUtils.JoinWords(m.originalSpan, " ") + "], head=" + m.headWord);
                Redwood.Log("Setting head string to entire mention");
                m.headIndex = m.startIndex;
                if (m.originalSpan.Count > 0)
                {
                    m.headWord = m.originalSpan[0];
                }
                else
                {
                    m.headWord = tokens[m.startIndex];
                }
                // NOTE(review): ToString() on a standard .NET list yields the type name, not the
                // joined words — confirm that originalSpan's type renders its contents.
                m.headString = m.originalSpan.ToString();
            }
        }
        /// <summary>
        /// Combines the two topmost trees of the stack into a new binary node and pushes it.
        /// </summary>
        /// <remarks>
        /// The new node is labeled with this transition's label and takes its head-word and
        /// head-tag annotations from the left or right operand, depending on this transition's side.
        /// </remarks>
        /// <param name="state">State to apply the transition to.</param>
        /// <param name="scoreDelta">Amount added to the state's score.</param>
        /// <returns>A new state with the updated stack, transition history and score.</returns>
        /// <exception cref="System.ArgumentException">The side is unknown, or the head node is not labeled with a <c>CoreLabel</c>.</exception>
        public virtual State Apply(State state, double scoreDelta)
        {
            // The topmost tree is the right operand; the one beneath it is the left operand.
            Tree right = state.stack.Peek();
            TreeShapedStack <Tree> belowRight = state.stack.Pop();
            Tree left = belowRight.Peek();
            TreeShapedStack <Tree> remainder = belowRight.Pop();

            Tree head;
            if (side == BinaryTransition.Side.Left)
            {
                head = left;
            }
            else if (side == BinaryTransition.Side.Right)
            {
                head = right;
            }
            else
            {
                throw new ArgumentException("Unknown side " + side);
            }

            CoreLabel headLabel = head.Label() as CoreLabel;
            if (headLabel == null)
            {
                throw new ArgumentException("Stack should have CoreLabel nodes");
            }

            CoreLabel production = new CoreLabel();
            production.SetValue(label);
            var headWord = headLabel.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation));
            production.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), headWord);
            var headTag = headLabel.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation));
            production.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), headTag);

            Tree combined = new LabeledScoredTreeNode(production);
            combined.AddChild(left);
            combined.AddChild(right);

            TreeShapedStack <Tree> updated = remainder.Push(combined);
            return new State(updated, state.transitions.Push(this), state.separators, state.sentence, state.tokenPosition, state.score + scoreDelta, false);
        }
Example #19
0
        /// <summary>
        /// Produces the default feature set for the character at position <paramref name="loc"/>:
        /// a five character window, character-class features and token-level features.
        /// </summary>
        /// <param name="cInfo">Padded sequence of character labels.</param>
        /// <param name="loc">Position of the current character in <paramref name="cInfo"/>.</param>
        /// <returns>The collected feature strings.</returns>
        protected internal virtual ICollection <string> FeaturesC(PaddedList <In> cInfo, int loc)
        {
            ICollection <string> features = new List <string>();
            CoreLabel            c        = cInfo[loc];
            CoreLabel            n        = cInfo[loc + 1];
            CoreLabel            n2       = cInfo[loc + 2];
            CoreLabel            p        = cInfo[loc - 1];
            CoreLabel            p2       = cInfo[loc - 2];
            string charc  = c.Get(typeof(CoreAnnotations.CharAnnotation));
            string charn  = n.Get(typeof(CoreAnnotations.CharAnnotation));
            string charn2 = n2.Get(typeof(CoreAnnotations.CharAnnotation));
            string charp  = p.Get(typeof(CoreAnnotations.CharAnnotation));
            string charp2 = p2.Get(typeof(CoreAnnotations.CharAnnotation));

            // Default feature set...a 5 character window
            // plus a few other language-independent features
            features.Add(charc + "-c");
            features.Add(charn + "-n1");
            features.Add(charn2 + "-n2");
            features.Add(charp + "-p");
            features.Add(charp2 + "-p2");
            // Length feature
            if (charc.Length > 1)
            {
                features.Add("length");
            }
            // Character-level class features: visit every UTF-16 unit of the current character
            // string (the loop bound was previously an undeclared name).
            bool seenPunc  = false;
            bool seenDigit = false;

            foreach (char charcC in charc)
            {
                seenPunc  = seenPunc || Characters.IsPunctuation(charcC);
                seenDigit = seenDigit || char.IsDigit(charcC);
                string cuBlock = Characters.UnicodeBlockStringOf(charcC);
                features.Add(cuBlock + "-uBlock");
                // NOTE(review): this emits the .NET Unicode category name; the Java source this
                // was ported from presumably emitted a numeric category code — confirm that
                // feature names need not match previously trained models.
                string cuType = char.GetUnicodeCategory(charcC).ToString();
                features.Add(cuType + "-uType");
            }
            if (seenPunc)
            {
                features.Add("haspunc");
            }
            if (seenDigit)
            {
                features.Add("hasdigit");
            }
            // Token-level features
            string word  = c.Word();
            int    index = c.Index();

            features.Add(Math.Min(MaxBefore, index) + "-before");
            features.Add(Math.Min(MaxAfter, word.Length - charc.Length - index) + "-after");
            features.Add(Math.Min(MaxLength, word.Length) + "-length");
            // Indicator transition feature
            features.Add("cliqueC");
            return features;
        }
Example #20
0
 /// <summary>
 /// Returns the next token, splitting compounds, verb+pronoun forms and contractions when the
 /// corresponding options are enabled.
 /// </summary>
 /// <returns>The next token, or <c>null</c> when the source yields <c>null</c>.</returns>
 /// <exception cref="RuntimeIOException">Wraps any <c>IOException</c> raised while reading.</exception>
 protected internal override T GetNext()
 {
     try
     {
         T candidate;
         do
         {
             // Depending on the orthographic normalization options, some tokens can be
             // obliterated. In this case, keep iterating until we see a non-zero length token.
             // Buffered pieces of an earlier split are handed out before the lexer is consulted.
             bool drainBuffer = splitAny && !compoundBuffer.IsEmpty();
             candidate = drainBuffer ? (T)compoundBuffer.Remove(0) : (T)lexer.Next();
         }
         while (candidate != null && candidate.Word().IsEmpty());

         // Only CoreLabel tokens carrying a parent annotation can be split.
         if (!splitAny || !(candidate is CoreLabel))
         {
             return candidate;
         }
         CoreLabel cl = (CoreLabel)candidate;
         if (!cl.ContainsKey(typeof(CoreAnnotations.ParentAnnotation)))
         {
             return candidate;
         }

         var parent = cl.Get(typeof(CoreAnnotations.ParentAnnotation));
         if (splitCompounds && parent.Equals(SpanishLexer.CompoundAnnotation))
         {
             return (T)ProcessCompound(cl);
         }
         if (splitVerbs && parent.Equals(SpanishLexer.VbPronAnnotation))
         {
             return (T)ProcessVerb(cl);
         }
         if (splitContractions && parent.Equals(SpanishLexer.ContrAnnotation))
         {
             return (T)ProcessContraction(cl);
         }
         return candidate;
     }
     catch (IOException e)
     {
         throw new RuntimeIOException(e);
     }
 }
Example #21
0
        /// <summary>
        /// Searches <paramref name="tree"/> depth-first for a node whose begin/end index
        /// annotations equal the given span.
        /// </summary>
        /// <param name="tree">Subtree to search.</param>
        /// <param name="start">Begin index of the wanted span.</param>
        /// <param name="end">End index of the wanted span.</param>
        /// <returns>The first matching node in depth-first order, or <c>null</c> if there is none.</returns>
        private static Tree FindTreeWithSpan(Tree tree, int start, int end)
        {
            CoreLabel nodeLabel = (CoreLabel)tree.Label();

            bool hasSpan = nodeLabel != null
                           && nodeLabel.ContainsKey(typeof(CoreAnnotations.BeginIndexAnnotation))
                           && nodeLabel.ContainsKey(typeof(CoreAnnotations.EndIndexAnnotation));
            if (hasSpan)
            {
                int nodeStart = nodeLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
                int nodeEnd   = nodeLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation));

                if (start == nodeStart && end == nodeEnd)
                {
                    // found perfect match
                    return tree;
                }
                // The wanted span ends before this node starts, or starts at/after its end:
                // nothing below this node is searched.
                if (end < nodeStart || start >= nodeEnd)
                {
                    return null;
                }
            }

            // otherwise, check inside children - a match is possible
            foreach (Tree kid in tree.Children())
            {
                Tree match = kid == null ? null : FindTreeWithSpan(kid, start, end);
                if (match != null)
                {
                    return match;
                }
            }

            // no match
            return null;
        }
        // TODO not called any more, but possibly useful as a reference
        /// <summary>
        /// Re-classifies every sentence of <paramref name="testSet"/> and tallies, per answer label,
        /// the true positives, false positives and false negatives against the expected answers.
        /// Each counted token is echoed to standard output.
        /// </summary>
        /// <remarks>
        /// Intended to be called once the classifier has been trained. The tallies are held in
        /// local counters only; this method does not compute or return precision, recall or F1.
        /// </remarks>
        /// <param name="testSet">Sentences whose tokens carry the expected answer annotation.</param>
        public virtual void RunTestSet(IList <IList <CoreLabel> > testSet)
        {
            ICounter <string> truePositives  = new ClassicCounter <string>();
            ICounter <string> falsePositives = new ClassicCounter <string>();
            ICounter <string> falseNegatives = new ClassicCounter <string>();
            ICounter <string> goldCounts     = new ClassicCounter <string>();
            string            background     = SeqClassifierFlags.DefaultBackgroundSymbol;

            foreach (IList <CoreLabel> sentence in testSet)
            {
                // Build copies carrying only the annotationForWord value and the POS tag,
                // so the expected answers are not visible to the classifier.
                IList <CoreLabel> stripped = new List <CoreLabel>();
                foreach (CoreLabel source in sentence)
                {
                    CoreLabel copy = new CoreLabel();
                    copy.Set(annotationForWord, source.Get(annotationForWord));
                    copy.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), source.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)));
                    stripped.Add(copy);
                }
                IList <CoreLabel> predictions = this.classifier.Classify(stripped);

                for (int position = 0; position < sentence.Count; position++)
                {
                    CoreLabel goldLabel       = sentence[position];
                    CoreLabel predictedLabel  = predictions[position];
                    string    predictedAnswer = predictedLabel.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    string    goldAnswer      = goldLabel.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    goldCounts.IncrementCount(goldAnswer);

                    bool goldIsBackground = background.Equals(goldAnswer);
                    if (!goldIsBackground && goldAnswer.Equals(predictedAnswer))
                    {
                        // true positive: a non-background label was reproduced exactly
                        truePositives.IncrementCount(predictedAnswer);
                        System.Console.Out.WriteLine("True Positive:" + predictedLabel);
                        continue;
                    }
                    if (!background.Equals(predictedAnswer))
                    {
                        // false positive: a non-background prediction that is not a true positive
                        falsePositives.IncrementCount(predictedAnswer);
                        System.Console.Out.WriteLine("False Positive:" + predictedLabel);
                        continue;
                    }
                    if (!goldIsBackground)
                    {
                        // false negative: background predicted where a label was expected
                        falseNegatives.IncrementCount(goldAnswer);
                        System.Console.Out.WriteLine("False Negative:" + goldLabel);
                    }
                    // anything else is a true negative and is not counted
                }
            }
            goldCounts.Remove(background);
        }
Example #23
0
 /// <summary>
 /// Records a speaker for the utterance <paramref name="paragraphUtterIndex"/> when none is
 /// stored yet, then returns the result of FindNextParagraphSpeaker for the same paragraph.
 /// </summary>
 /// <remarks>
 /// If <paramref name="nextParagraphSpeaker"/> is non-empty it is stored as the speaker.
 /// Otherwise the tokens of the paragraph's last sentence are scanned: a token whose POS tag
 /// starts with "V" stops the scan and suppresses any assignment; a token whose NER tag starts
 /// with "PER" and whose position is a known mention head supplies that mention's id as speaker.
 /// Tokens lacking a POS or NER annotation are treated as neither verb nor person.
 /// </remarks>
 /// <param name="paragraph">Sentences of the paragraph; the last one is inspected.</param>
 /// <param name="paragraphUtterIndex">Key under which the speaker is stored.</param>
 /// <param name="nextParagraphSpeaker">Speaker determined earlier, or the empty string.</param>
 /// <param name="paragraphOffset">Added to the in-paragraph sentence index to form the mention head position.</param>
 /// <param name="dict">Passed through to FindNextParagraphSpeaker.</param>
 /// <returns>Whatever FindNextParagraphSpeaker returns for this paragraph.</returns>
 private string FindParagraphSpeaker(IList <ICoreMap> paragraph, int paragraphUtterIndex, string nextParagraphSpeaker, int paragraphOffset, Dictionaries dict)
 {
     if (!speakers.Contains(paragraphUtterIndex))
     {
         if (!nextParagraphSpeaker.Equals(string.Empty))
         {
             speakers[paragraphUtterIndex] = nextParagraphSpeaker;
         }
         else
         {
             // find the speaker of this paragraph (John, nbc news)
             ICoreMap lastSent = paragraph[paragraph.Count - 1];
             // Fetch the token list once instead of on every loop test and iteration.
             IList <CoreLabel> tokens = lastSent.Get(typeof(CoreAnnotations.TokensAnnotation));
             string speaker = string.Empty;
             bool   hasVerb = false;
             for (int i = 0; i < tokens.Count; i++)
             {
                 CoreLabel w   = tokens[i];
                 string    pos = w.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                 string    ner = w.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                 // Tags are identifiers, so compare ordinally; a missing tag simply does not match.
                 if (pos != null && pos.StartsWith("V", System.StringComparison.Ordinal))
                 {
                     hasVerb = true;
                     break;
                 }
                 if (ner != null && ner.StartsWith("PER", System.StringComparison.Ordinal))
                 {
                     IntTuple headPosition = new IntTuple(2);
                     headPosition.Set(0, paragraph.Count - 1 + paragraphOffset);
                     headPosition.Set(1, i);
                     if (mentionheadPositions.Contains(headPosition))
                     {
                         // Culture-invariant so the stored id has the same text on every machine.
                         speaker = System.Convert.ToString(mentionheadPositions[headPosition].mentionID, System.Globalization.CultureInfo.InvariantCulture);
                     }
                 }
             }
             // Only a verb-free last sentence that named a known person yields a speaker.
             if (!hasVerb && !speaker.Equals(string.Empty))
             {
                 speakers[paragraphUtterIndex] = speaker;
             }
         }
     }
     return(FindNextParagraphSpeaker(paragraph, paragraphOffset, dict));
 }
Example #24
0
        /// <summary>
        /// Reads a tab-separated token file and groups the matching tokens of <paramref name="novel"/>
        /// by the character id given on each line.
        /// </summary>
        /// <remarks>
        /// The main output here is data/tokens/dickens.oliver.tokens, which contains the original book, one token per line, with part of speech, syntax, NER, coreference and other annotations. The (tab-separated) format is:
        /// Paragraph id
        /// Sentence id
        /// Token id
        /// Byte start
        /// Byte end
        /// Whitespace following the token (useful for pretty-printing the original text)
        /// Syntactic head id (-1 for the sentence root)
        /// Original token
        /// Normalized token (for quotes etc.)
        /// Lemma
        /// Penn Treebank POS tag
        /// NER tag (PERSON, NUMBER, DATE, DURATION, MISC, TIME, LOCATION, ORDINAL, MONEY, ORGANIZATION, SET, O)
        /// Stanford basic dependency label
        /// Within-quotation flag
        /// Character id (all coreferent tokens share the same character id)
        /// Only columns 2 (token id), 7 (original token), 8 (normalized token) and 14 (character id)
        /// are read. The first line of the file is skipped (presumably a header row). Lines whose
        /// character id is -1 are ignored, and a token whose text differs from the normalized token
        /// is reported on standard error instead of being added.
        /// </remarks>
        /// <param name="filename">Path of the token file to read.</param>
        /// <param name="novel">Annotation whose token list is indexed by the (offset-adjusted) token id.</param>
        /// <returns>Map from character id to the tokens of <paramref name="novel"/> attributed to it.</returns>
        public static IDictionary <int, IList <CoreLabel> > ReadTokenFile(string filename, Annotation novel)
        {
            IList <string> lines = IOUtils.LinesFromFile(filename);
            IDictionary <int, IList <CoreLabel> > charsToTokens = new Dictionary <int, IList <CoreLabel> >();
            bool first       = true;
            int  tokenOffset = 0;

            foreach (string line in lines)
            {
                if (first)
                {
                    first = false;
                    continue;
                }
                // char overload of Split works on every framework version
                string[] pieces = line.Split('\t');
                // The file is machine-written, so parse numbers culture-invariantly.
                int       tokenId       = System.Convert.ToInt32(pieces[2], System.Globalization.CultureInfo.InvariantCulture) + tokenOffset;
                string    token         = pieces[7];
                string    normalizedTok = pieces[8];
                int       characterId   = System.Convert.ToInt32(pieces[14], System.Globalization.CultureInfo.InvariantCulture);
                CoreLabel novelTok      = novel.Get(typeof(CoreAnnotations.TokensAnnotation))[tokenId];
                // CoreNLP sometimes splits ". . . ." as ". . ." and "." and sometimes lemmatizes it. (The Steppe)
                // The shift takes effect from the next line onwards.
                if (token.Equals(". . . .") && !novelTok.Get(typeof(CoreAnnotations.OriginalTextAnnotation)).Equals(". . . ."))
                {
                    tokenOffset++;
                }
                if (characterId == -1)
                {
                    continue;
                }
                string novelText = novelTok.Get(typeof(CoreAnnotations.TextAnnotation));
                if (!novelText.Equals(normalizedTok))
                {
                    System.Console.Error.WriteLine(token + " != " + novelText);
                    continue;
                }
                // Single lookup: fetch the character's token list, creating it on first use.
                IList <CoreLabel> tokensForCharacter;
                if (!charsToTokens.TryGetValue(characterId, out tokensForCharacter))
                {
                    tokensForCharacter         = new List <CoreLabel>();
                    charsToTokens[characterId] = tokensForCharacter;
                }
                tokensForCharacter.Add(novelTok);
            }
            return(charsToTokens);
        }
Example #25
0
        /// <summary>
        /// Asserts that the XML context annotation of <paramref name="label"/> has exactly the
        /// entries of <paramref name="expectedContext"/>, in the same order.
        /// </summary>
        /// <param name="label">Token whose XmlContextAnnotation is checked.</param>
        /// <param name="expectedContext">Expected context entries, in order.</param>
        private static void CheckContext(CoreLabel label, params string[] expectedContext)
        {
            IList <string> actualContext = label.Get(typeof(CoreAnnotations.XmlContextAnnotation));
            int            expectedCount = expectedContext.Length;

            // Compare sizes first so the element-wise check cannot run past either list.
            NUnit.Framework.Assert.AreEqual(expectedCount, actualContext.Count);
            int index = 0;
            foreach (string expectedEntry in expectedContext)
            {
                NUnit.Framework.Assert.AreEqual(expectedEntry, actualContext[index]);
                index++;
            }
        }
 // public void getDecisionTree(Map<String, List<CoreLabel>> sents,
 // List<Pair<String, Integer>> chosen, Counter<String> weights, String
 // wekaOptions) {
 // RVFDataset<String, String> dataset = new RVFDataset<String, String>();
 // for (Pair<String, Integer> d : chosen) {
 // CoreLabel l = sents.get(d.first).get(d.second());
 // String w = l.word();
 // Integer num = this.clusterIds.get(w);
 // if (num == null)
 // num = -1;
 // double wt = weights.getCount("Cluster-" + num);
 // String label;
 // if (l.get(answerClass).toString().equals(answerLabel))
 // label = answerLabel;
 // else
 // label = "O";
 // Counter<String> feat = new ClassicCounter<String>();
 // feat.setCount("DIST", wt);
 // dataset.add(new RVFDatum<String, String>(feat, label));
 // }
 // WekaDatumClassifierFactory wekaFactory = new
 // WekaDatumClassifierFactory("weka.classifiers.trees.J48", wekaOptions);
 // WekaDatumClassifier classifier = wekaFactory.trainClassifier(dataset);
 // Classifier cls = classifier.getClassifier();
 // J48 j48decisiontree = (J48) cls;
 // System.out.println(j48decisiontree.toSummaryString());
 // System.out.println(j48decisiontree.toString());
 //
 // }
 /// <summary>
 /// Walks every token of every sentence in <paramref name="sents"/>, decides whether to select it,
 /// and for each selected token records its (sentence key, index) in <paramref name="chosen"/>
 /// and adds its datum to <paramref name="dataset"/>.
 /// </summary>
 /// <remarks>
 /// A token is selected when its answer equals answerLabel; otherwise when its answer is not "O"
 /// or its lower-cased word is in negativeWords and a draw with <paramref name="perSelectNeg"/>
 /// succeeds; otherwise when a draw with <paramref name="perSelectRand"/> succeeds, which also
 /// increments the running random-selection count.
 /// NOTE(review): <paramref name="rneg"/> is not used; both draws use <paramref name="r"/> - confirm this is intended.
 /// </remarks>
 /// <param name="sents">Sentences keyed by id.</param>
 /// <param name="r">Random source passed to GetRandomBoolean for both draws.</param>
 /// <param name="rneg">Unused in this method.</param>
 /// <param name="perSelectNeg">Second argument of GetRandomBoolean for the negative draw.</param>
 /// <param name="perSelectRand">Second argument of GetRandomBoolean for the random draw.</param>
 /// <param name="numrand">Count of random selections made so far.</param>
 /// <param name="chosen">Receives one (sentence key, token index) pair per selected token.</param>
 /// <param name="dataset">Receives one datum per selected token.</param>
 /// <returns><paramref name="numrand"/> plus the number of tokens selected by the random draw.</returns>
 private int Sample(IDictionary <string, DataInstance> sents, Random r, Random rneg, double perSelectNeg, double perSelectRand, int numrand, IList <Pair <string, int> > chosen, RVFDataset <string, string> dataset)
 {
     foreach (KeyValuePair <string, DataInstance> en in sents)
     {
         CoreLabel[] sent = Sharpen.Collections.ToArray(en.Value.GetTokens(), new CoreLabel[0]);
         for (int i = 0; i < sent.Length; i++)
         {
             CoreLabel l = sent[i];
             bool      chooseThis;
             if (l.Get(answerClass).Equals(answerLabel))
             {
                 chooseThis = true;
             }
             else if ((!l.Get(answerClass).Equals("O") || negativeWords.Contains(l.Word().ToLower())) && GetRandomBoolean(r, perSelectNeg))
             {
                 chooseThis = true;
             }
             else if (GetRandomBoolean(r, perSelectRand))
             {
                 numrand++;
                 chooseThis = true;
             }
             else
             {
                 chooseThis = false;
             }
             if (chooseThis)
             {
                 // Typed pair: an untyped Pair cannot be stored in IList<Pair<string, int>>.
                 chosen.Add(new Pair <string, int>(en.Key, i));
                 RVFDatum <string, string> d = GetDatum(sent, i);
                 // Instance ToString with the invariant culture keeps the index text stable everywhere.
                 dataset.Add(d, en.Key, i.ToString(System.Globalization.CultureInfo.InvariantCulture));
             }
         }
     }
     return(numrand);
 }
Example #27
0
        /// <summary>Find and annotate chunks.</summary>
        /// <remarks>
        /// Find and annotate chunks.  Returns list of CoreMap (Annotation) objects
        /// each representing a chunk. Each chunk is built by ChunkAnnotationUtils.GetAnnotatedChunk,
        /// which is documented to set the following annotations:
        /// CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
        /// CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
        /// TokensAnnotation - List of tokens in this chunk
        /// TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
        /// TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
        /// TextAnnotation - String representing tokens in this chunks (token text separated by space)
        /// In addition this method stores the chunk's tag type under <paramref name="labelKey"/>.
        /// The end index handed to GetAnnotatedChunk is one past the chunk's last token.
        /// </remarks>
        /// <param name="tokens">- List of tokens to look for chunks</param>
        /// <param name="totalTokensOffset">- Index of tokens to offset by</param>
        /// <param name="textKey">- Key to use to find the token text</param>
        /// <param name="labelKey">- Key to use to find the token label (to determine if inside chunk or not)</param>
        /// <param name="tokenChunkKey">- If not null, each token is annotated with the chunk using this key</param>
        /// <param name="tokenLabelKey">- If not null, each token is annotated with the text associated with the chunk using this key</param>
        /// <param name="checkTokensCompatible">- If not null, additional check to see if this token and the previous are compatible; it receives the pair (current token, previous token), with a null previous token at index 0</param>
        /// <returns>List of annotations (each as a CoreMap) representing the chunks of tokens</returns>
        /// <exception cref="System.InvalidOperationException">A new chunk starts while the previous one is still open.</exception>
        public virtual IList <ICoreMap> GetAnnotatedChunks(IList <CoreLabel> tokens, int totalTokensOffset, Type textKey, Type labelKey, Type tokenChunkKey, Type tokenLabelKey, IPredicate <Pair <CoreLabel, CoreLabel> > checkTokensCompatible)
        {
            // Generic list: a non-generic ArrayList is not an IList<ICoreMap>.
            IList <ICoreMap> chunks = new List <ICoreMap>();

            LabeledChunkIdentifier.LabelTagType prevTagType = null;
            // Index of the first token of the currently open chunk, or -1 when none is open.
            int tokenBegin = -1;

            for (int i = 0; i < tokens.Count; i++)
            {
                CoreLabel token = tokens[i];
                string    label = (string)token.Get(labelKey);
                LabeledChunkIdentifier.LabelTagType curTagType = GetTagType(label);
                bool isCompatible = true;
                if (checkTokensCompatible != null)
                {
                    CoreLabel prev = null;
                    if (i > 0)
                    {
                        prev = tokens[i - 1];
                    }
                    Pair <CoreLabel, CoreLabel> p = Pair.MakePair(token, prev);
                    isCompatible = checkTokensCompatible.Test(p);
                }
                // Close the open chunk when the tags say so or the tokens are incompatible.
                if (IsEndOfChunk(prevTagType, curTagType) || !isCompatible)
                {
                    int tokenEnd = i;
                    if (tokenBegin >= 0 && tokenEnd > tokenBegin)
                    {
                        ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenBegin, tokenEnd, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey);
                        chunk.Set(labelKey, prevTagType.type);
                        chunks.Add(chunk);
                        tokenBegin = -1;
                    }
                }
                // An incompatible token that is itself inside a chunk starts a fresh chunk.
                if (IsStartOfChunk(prevTagType, curTagType) || (!isCompatible && IsChunk(curTagType)))
                {
                    if (tokenBegin >= 0)
                    {
                        throw new System.InvalidOperationException("New chunk started, prev chunk not ended yet!");
                    }
                    tokenBegin = i;
                }
                prevTagType = curTagType;
            }
            // A chunk still open at the end of the input runs to the last token.
            if (tokenBegin >= 0)
            {
                ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenBegin, tokens.Count, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey);
                chunk.Set(labelKey, prevTagType.type);
                chunks.Add(chunk);
            }
            //    System.out.println("number of chunks " +  chunks.size());
            return(chunks);
        }
Example #28
0
        /// <summary>
        /// Descends from <paramref name="root"/> to the first node whose begin index equals
        /// <paramref name="start"/>, at each level following the first child whose
        /// [begin, end) index range contains <paramref name="start"/>.
        /// </summary>
        /// <param name="root">Tree whose labels are CoreLabels carrying begin/end indices.</param>
        /// <param name="start">Begin index to look for.</param>
        /// <returns>The node found on the way down whose begin index is <paramref name="start"/>.</returns>
        private static Tree FindPartialSpan(Tree root, int start)
        {
            Tree current = root;
            while (true)
            {
                CoreLabel currentLabel = (CoreLabel)current.Label();
                int       currentBegin = currentLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
                if (currentBegin == start)
                {
                    return(current);
                }

                // Pick the first child covering 'start' and continue the descent from it.
                Tree covering = null;
                foreach (Tree child in current.Children())
                {
                    CoreLabel childLabel = (CoreLabel)child.Label();
                    int       childBegin = childLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
                    int       childEnd   = childLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation));
                    if (childBegin <= start && start < childEnd)
                    {
                        covering = child;
                        break;
                    }
                }
                if (covering == null)
                {
                    throw new Exception("Shouldn't happen: " + start + " " + current);
                }
                current = covering;
            }
        }
Example #29
0
        /// <summary>
        /// Extends the base class features with the characters three positions after and three
        /// positions before <paramref name="loc"/>.
        /// </summary>
        /// <param name="cInfo">Padded token sequence.</param>
        /// <param name="loc">Current position in <paramref name="cInfo"/>.</param>
        /// <returns>The base feature collection with the "-n3" and "-p3" features added.</returns>
        protected internal override ICollection <string> FeaturesC(PaddedList <In> cInfo, int loc)
        {
            ICollection <string> result = base.FeaturesC(cInfo, loc);

            // Original author's note: a 7 character window instead of a 5 character window.
            CoreLabel threeAhead  = cInfo[loc + 3];
            CoreLabel threeBehind = cInfo[loc - 3];
            string    aheadChar   = threeAhead.Get(typeof(CoreAnnotations.CharAnnotation));
            string    behindChar  = threeBehind.Get(typeof(CoreAnnotations.CharAnnotation));

            result.Add(string.Concat(aheadChar, "-n3"));
            result.Add(string.Concat(behindChar, "-p3"));
            return(result);
        }
Example #30
0
        /// <summary>
        /// Builds the features for the clique made of the current and the previous position:
        /// a character bigram feature and a constant indicator feature.
        /// </summary>
        /// <param name="cInfo">Padded token sequence.</param>
        /// <param name="loc">Current position in <paramref name="cInfo"/>.</param>
        /// <returns>A new list holding the "-cngram" feature followed by "cliqueCpC".</returns>
        protected internal virtual ICollection <string> FeaturesCpC(PaddedList <In> cInfo, int loc)
        {
            CoreLabel current      = cInfo[loc];
            CoreLabel previous     = cInfo[loc - 1];
            string    currentChar  = current.Get(typeof(CoreAnnotations.CharAnnotation));
            string    previousChar = previous.Get(typeof(CoreAnnotations.CharAnnotation));

            ICollection <string> result = new List <string>();
            // Current character first, then the previous one.
            result.Add(string.Concat(currentChar, previousChar, "-cngram"));
            // Indicator transition feature
            result.Add("cliqueCpC");
            return(result);
        }