/// <summary>
/// Wraps a piece of text in a new CoreLabel carrying its word, value, original text and character offsets.
/// </summary>
/// <remarks>
/// Text matching <c>separatorPattern</c> is replaced by <c>AbstractTokenizer.NewlineToken</c>. Otherwise,
/// when both <paramref name="doNormalization"/> and <c>normalizeSpace</c> are true, every plain space is
/// replaced by a non-breaking space (U+00A0). The original-text annotation always receives the unmodified input.
/// </remarks>
/// <param name="tokenText">Text of the token as found in the input.</param>
/// <param name="doNormalization">Whether space normalization may be applied to this token.</param>
/// <param name="charOffsetBegin">Value stored as the CharacterOffsetBeginAnnotation.</param>
/// <param name="charOffsetEnd">Value stored as the CharacterOffsetEndAnnotation.</param>
/// <returns>The newly created token.</returns>
private CoreLabel MakeXmlToken(string tokenText, bool doNormalization, int charOffsetBegin, int charOffsetEnd)
        {
            CoreLabel xmlToken = new CoreLabel();

            // Keep the untouched input before the surface form is rewritten.
            xmlToken.SetOriginalText(tokenText);

            string surface = tokenText;
            if (separatorPattern.Matcher(tokenText).Matches())
            {
                // Map to CoreNLP newline token
                surface = AbstractTokenizer.NewlineToken;
            }
            else if (doNormalization && normalizeSpace)
            {
                // change space to non-breaking space
                surface = tokenText.Replace(' ', '\u00A0');
            }

            xmlToken.SetWord(surface);
            xmlToken.SetValue(surface);
            xmlToken.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charOffsetBegin);
            xmlToken.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charOffsetEnd);
            if (Verbose)
            {
                log.Info("Adding token " + xmlToken.ToShorterString());
            }
            return xmlToken;
        }
Example #2
0
        // last except for the added period.
        /// <summary>
        /// Creates a CoreLabel whose TextAnnotation and ValueAnnotation are both set to <paramref name="token"/>.
        /// </summary>
        /// <param name="token">String stored as both text and value.</param>
        /// <returns>The new label.</returns>
        private static CoreLabel InitCoreLabel(string token)
        {
            CoreLabel result = new CoreLabel();
            result.Set(typeof(CoreAnnotations.TextAnnotation), token);
            result.Set(typeof(CoreAnnotations.ValueAnnotation), token);
            return result;
        }
        // TODO not called any more, but possibly useful as a reference
        /// <summary>
        /// Re-classifies every sequence of <paramref name="testSet"/> with <c>this.classifier</c> and tallies
        /// true positives, false positives and false negatives per answer label, printing each of those
        /// outcomes to standard output.
        /// </summary>
        /// <remarks>
        /// The tallies are only accumulated in local counters; the method returns nothing, so precision,
        /// recall and F1 are not computed here.
        /// </remarks>
        /// <param name="testSet">Sequences whose AnswerAnnotation holds the expected answer.</param>
        public virtual void RunTestSet(IList <IList <CoreLabel> > testSet)
        {
            ICounter <string> truePositives  = new ClassicCounter <string>();
            ICounter <string> falsePositives = new ClassicCounter <string>();
            ICounter <string> falseNegatives = new ClassicCounter <string>();
            ICounter <string> actual         = new ClassicCounter <string>();

            foreach (IList <CoreLabel> goldSequence in testSet)
            {
                // Copy only the word and POS annotations, so the classifier input carries no answer annotation.
                IList <CoreLabel> inputs = new List <CoreLabel>();
                foreach (CoreLabel gold in goldSequence)
                {
                    CoreLabel stripped = new CoreLabel();
                    stripped.Set(annotationForWord, gold.Get(annotationForWord));
                    stripped.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), gold.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)));
                    inputs.Add(stripped);
                }
                IList <CoreLabel> outputs = this.classifier.Classify(inputs);

                for (int position = 0; position < goldSequence.Count; position++)
                {
                    CoreLabel expectedLabel  = goldSequence[position];
                    CoreLabel annotatedLabel = outputs[position];
                    string    answer         = annotatedLabel.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    string    expectedAnswer = expectedLabel.Get(typeof(CoreAnnotations.AnswerAnnotation));
                    actual.IncrementCount(expectedAnswer);

                    // match only non background symbols
                    bool expectedIsEntity = !SeqClassifierFlags.DefaultBackgroundSymbol.Equals(expectedAnswer);
                    if (expectedIsEntity && expectedAnswer.Equals(answer))
                    {
                        // true positives
                        truePositives.IncrementCount(answer);
                        System.Console.Out.WriteLine("True Positive:" + annotatedLabel);
                    }
                    else if (!SeqClassifierFlags.DefaultBackgroundSymbol.Equals(answer))
                    {
                        // false positives
                        falsePositives.IncrementCount(answer);
                        System.Console.Out.WriteLine("False Positive:" + annotatedLabel);
                    }
                    else if (expectedIsEntity)
                    {
                        // false negatives
                        falseNegatives.IncrementCount(expectedAnswer);
                        System.Console.Out.WriteLine("False Negative:" + expectedLabel);
                    }
                    // anything else is a true negative and is not counted
                }
            }
            actual.Remove(SeqClassifierFlags.DefaultBackgroundSymbol);
        }
Example #4
0
        // last except for the added period.
        /// <summary>
        /// Creates a CoreLabel whose text and value are <paramref name="token"/> and whose
        /// part-of-speech annotation is <paramref name="posTag"/>.
        /// </summary>
        /// <param name="token">String stored as both TextAnnotation and ValueAnnotation.</param>
        /// <param name="posTag">String stored as the PartOfSpeechAnnotation.</param>
        /// <returns>The new label.</returns>
        private static CoreLabel InitCoreLabel(string token, string posTag)
        {
            CoreLabel result = new CoreLabel();
            result.Set(typeof(CoreAnnotations.TextAnnotation), token);
            result.Set(typeof(CoreAnnotations.ValueAnnotation), token);
            result.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), posTag);
            return result;
        }
        /// <summary>Parses one tab-separated serialized token line into a CoreLabel.</summary>
        /// <remarks>
        /// Columns, in order: word, lemma, POS tag, NE tag, normalized NE tag, character offset begin,
        /// character offset end, and optionally an antecedent in the eighth column. An empty column leaves
        /// the corresponding annotation unset, except that the lemma is also stored when the word column
        /// itself is empty. Occurrences of <c>SpaceHolder</c> in the word, lemma and antecedent columns are
        /// replaced by a single space.
        /// </remarks>
        /// <param name="line">The serialized token line.</param>
        /// <param name="haveExplicitAntecedent">
        /// When true and a non-empty eighth column exists, it is stored as the AntecedentAnnotation.
        /// </param>
        /// <returns>The token built from the line.</returns>
        /// <exception cref="RuntimeIOException">The line has fewer than seven columns.</exception>
        private static CoreLabel LoadToken(string line, bool haveExplicitAntecedent)
        {
            CoreLabel token = new CoreLabel();

            string[] bits = line.Split("\t", -1);
            if (bits.Length < 7)
            {
                throw new RuntimeIOException("ERROR: Invalid format token for serialized token (only " + bits.Length + " tokens): " + line);
            }
            // word
            string word = bits[0].ReplaceAll(SpaceHolder, " ");

            token.Set(typeof(CoreAnnotations.TextAnnotation), word);
            token.Set(typeof(CoreAnnotations.ValueAnnotation), word);
            // lemma (also stored, possibly empty, when the word itself is empty)
            if (bits[1].Length > 0 || bits[0].Length == 0)
            {
                string lemma = bits[1].ReplaceAll(SpaceHolder, " ");
                token.Set(typeof(CoreAnnotations.LemmaAnnotation), lemma);
            }
            // POS tag
            if (bits[2].Length > 0)
            {
                token.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), bits[2]);
            }
            // NE tag
            if (bits[3].Length > 0)
            {
                token.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), bits[3]);
            }
            // Normalized NE tag
            if (bits[4].Length > 0)
            {
                token.Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), bits[4]);
            }
            // Character offsets. The serialized form is machine-written, so parse it with the invariant
            // culture rather than whatever culture the current thread happens to run under.
            if (bits[5].Length > 0)
            {
                token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), System.Convert.ToInt32(bits[5], System.Globalization.CultureInfo.InvariantCulture));
            }
            if (bits[6].Length > 0)
            {
                token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), System.Convert.ToInt32(bits[6], System.Globalization.CultureInfo.InvariantCulture));
            }
            // This block is specific to KBP: an optional eighth column may carry the AntecedentAnnotation.
            if (haveExplicitAntecedent && bits.Length > 7)
            {
                string aa = bits[7].ReplaceAll(SpaceHolder, " ");
                if (aa.Length > 0)
                {
                    token.Set(typeof(CoreAnnotations.AntecedentAnnotation), aa);
                }
            }
            return token;
        }
        /// <summary>
        /// Builds a one-token list used as arbitrary segmenter input; the content only has to be
        /// something that can be segmented on multiple threads to reproduce the issue.
        /// </summary>
        /// <returns>A list holding a single CoreLabel.</returns>
        private static IList <CoreLabel> CreateTestTokens()
        {
            CoreLabel sample = new CoreLabel();
            sample.SetWord("你好,世界");
            sample.SetValue("你好,世界");
            sample.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
            sample.Set(typeof(CoreAnnotations.AnswerAnnotation), "0");

            IList <CoreLabel> tokens = new List <CoreLabel>();
            tokens.Add(sample);
            return tokens;
        }
        /// <summary>
        /// Builds a sentence of CoreLabels, one per index, pairing <paramref name="gold"/>[i] as the
        /// GoldAnswerAnnotation with <paramref name="guess"/>[i] as the AnswerAnnotation.
        /// </summary>
        /// <param name="gold">Gold answers.</param>
        /// <param name="guess">Guessed answers; must have the same length as <paramref name="gold"/>.</param>
        /// <returns>The list of labels, in input order.</returns>
        private static IList <CoreLabel> MakeListCoreLabel(string[] gold, string[] guess)
        {
            // NUnit's argument order is (expected, actual, message); the message must come last,
            // unlike JUnit where it comes first.
            NUnit.Framework.Assert.AreEqual(gold.Length, guess.Length, "Cannot run test on lists of different length");
            IList <CoreLabel> sentence = new List <CoreLabel>();

            for (int i = 0; i < gold.Length; ++i)
            {
                CoreLabel word = new CoreLabel();
                word.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), gold[i]);
                word.Set(typeof(CoreAnnotations.AnswerAnnotation), guess[i]);
                sentence.Add(word);
            }
            return sentence;
        }
        /// <summary>
        /// Checks how <c>CoreLabel.SetWord</c> interacts with the lemma annotation, and that equality and
        /// hash codes against an <c>ArrayCoreMap</c> copy follow those changes.
        /// </summary>
        public virtual void TestCoreLabelSetWordBehavior()
        {
            CoreLabel foo = new CoreLabel();

            foo.Set(typeof(CoreAnnotations.TextAnnotation), "foo");
            foo.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "B");
            foo.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
            // Lemma gets removed with word
            ArrayCoreMap copy = new ArrayCoreMap(foo);

            NUnit.Framework.Assert.AreEqual(copy, foo);
            // Setting the same word again leaves the label equal to the copy
            foo.SetWord("foo");
            NUnit.Framework.Assert.AreEqual(copy, foo);
            // A different word: lemma removed, so the label no longer equals the copy
            foo.SetWord("bar");
            NUnit.Framework.Assert.IsFalse(copy.Equals(foo));
            // Restoring the word does not restore the lemma: still removed
            foo.SetWord("foo");
            NUnit.Framework.Assert.IsFalse(copy.Equals(foo));
            // Putting the lemma back by hand brings the label back to normal
            foo.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
            NUnit.Framework.Assert.AreEqual(copy, foo);
            // Hash code is consistent
            int hashCode = foo.GetHashCode();

            NUnit.Framework.Assert.AreEqual(copy.GetHashCode(), hashCode);
            // The hash changes when the word changes, and stays different after the word is restored
            foo.SetWord("bar");
            NUnit.Framework.Assert.IsFalse(hashCode == foo.GetHashCode());
            foo.SetWord("foo");
            NUnit.Framework.Assert.IsFalse(hashCode == foo.GetHashCode());
            // Hash code doesn't care between a value of null and the key not existing
            NUnit.Framework.Assert.IsTrue(foo.Lemma() == null);
            int lemmalessHashCode = foo.GetHashCode();

            foo.Remove(typeof(CoreAnnotations.LemmaAnnotation));
            NUnit.Framework.Assert.AreEqual(lemmalessHashCode, foo.GetHashCode());
            foo.SetLemma(null);
            NUnit.Framework.Assert.AreEqual(lemmalessHashCode, foo.GetHashCode());
            foo.SetLemma("fool");
            NUnit.Framework.Assert.AreEqual(hashCode, foo.GetHashCode());
            // Check equals: a copy taken after the word round-trip must equal the label both before
            // and after the lemma key is explicitly removed
            foo.SetWord("bar");
            foo.SetWord("foo");
            ArrayCoreMap nulledCopy = new ArrayCoreMap(foo);

            NUnit.Framework.Assert.AreEqual(nulledCopy, foo);
            foo.Remove(typeof(CoreAnnotations.LemmaAnnotation));
            NUnit.Framework.Assert.AreEqual(nulledCopy, foo);
        }
Example #9
0
        /// <summary>
        /// Builds the initial parser State from a tagged sentence: one preterminal (tag node with a single
        /// word child) per input word.
        /// </summary>
        /// <remarks>
        /// Inputs that already are CoreLabels are reused as the word label; for any other input a new
        /// CoreLabel is created from its word and tag. Both the word and tag labels receive a 1-based index
        /// and the head word / head tag label annotations.
        /// </remarks>
        /// <param name="words">The tagged words of the sentence, in order.</param>
        /// <returns>A State constructed from the list of preterminals.</returns>
        /// <exception cref="ArgumentException">
        /// A non-CoreLabel word does not implement IHasTag, or a word's tag is null.
        /// </exception>
        public static State InitialStateFromTaggedSentence <_T0>(IList <_T0> words)
            where _T0 : IHasWord
        {
            IList <Tree> preterminals = Generics.NewArrayList();

            for (int i = 0; i < words.Count; ++i)
            {
                IHasWord  current   = words[i];
                CoreLabel wordLabel = current as CoreLabel;
                string    tag;
                if (wordLabel != null)
                {
                    tag = wordLabel.Tag();
                }
                else
                {
                    wordLabel = new CoreLabel();
                    wordLabel.SetValue(current.Word());
                    wordLabel.SetWord(current.Word());
                    IHasTag tagged = current as IHasTag;
                    if (tagged == null)
                    {
                        throw new ArgumentException("Expected tagged words");
                    }
                    tag = tagged.Tag();
                    wordLabel.SetTag(tag);
                }
                if (tag == null)
                {
                    throw new ArgumentException("Input word not tagged");
                }

                CoreLabel tagLabel = new CoreLabel();
                tagLabel.SetValue(tag);

                // Index from 1.  Tools downstream from the parser expect that
                // Internally this parser uses the index, so we have to
                // overwrite incorrect indices if the label is already indexed
                int oneBasedIndex = i + 1;
                wordLabel.SetIndex(oneBasedIndex);
                tagLabel.SetIndex(oneBasedIndex);

                LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
                LabeledScoredTreeNode tagNode  = new LabeledScoredTreeNode(tagLabel);
                tagNode.AddChild(wordNode);

                // TODO: can we get away with not setting these on the wordLabel?
                wordLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
                wordLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
                tagLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
                tagLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);

                preterminals.Add(tagNode);
            }
            return new State(preterminals);
        }
        /// <summary>
        /// Creates a new tree node with the given label and children, copying the head word and head tag
        /// label annotations from the label of <paramref name="top"/>.
        /// </summary>
        /// <param name="top">Tree whose label (cast to CoreLabel) supplies the head annotations.</param>
        /// <param name="label">Value of the new node's label.</param>
        /// <param name="children">Children appended to the new node, in order.</param>
        /// <returns>The newly created node.</returns>
        internal static Tree CreateNode(Tree top, string label, params Tree[] children)
        {
            CoreLabel source = (CoreLabel)top.Label();

            CoreLabel nodeLabel = new CoreLabel();
            nodeLabel.SetValue(label);
            nodeLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), source.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)));
            nodeLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), source.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)));

            Tree node = new LabeledScoredTreeNode(nodeLabel);
            for (int i = 0; i < children.Length; i++)
            {
                node.AddChild(children[i]);
            }
            return node;
        }
 /// <summary>
 /// Copies the predicted class of every non-leaf node of the stored predicted trees onto the
 /// corresponding node of the given gold trees.
 /// </summary>
 /// <remarks>
 /// Gold and predicted trees are walked in lockstep, so each pair must enumerate the same number of nodes.
 /// </remarks>
 /// <param name="trees">Gold trees; their non-leaf labels (cast to CoreLabel) receive the PredictedClass annotation.</param>
 /// <exception cref="ArgumentException">
 /// The number of trees differs from the number of predicted trees, or a gold/predicted pair
 /// does not enumerate the same number of nodes.
 /// </exception>
 public override void PopulatePredictedLabels(IList <Tree> trees)
 {
     if (trees.Count != this.predicted.Count)
     {
         throw new ArgumentException("Number of gold and predicted trees not equal!");
     }
     for (int i = 0; i < trees.Count; i++)
     {
         using (IEnumerator <Tree> goldTree = trees[i].GetEnumerator())
         using (IEnumerator <Tree> predictedTree = this.predicted[i].GetEnumerator())
         {
             while (true)
             {
                 // Advance BOTH enumerators on every step. Combining the two MoveNext() calls with
                 // `||` short-circuits and leaves the predicted enumerator behind the gold one.
                 bool hasGold      = goldTree.MoveNext();
                 bool hasPredicted = predictedTree.MoveNext();
                 if (!hasGold && !hasPredicted)
                 {
                     break;
                 }
                 if (!hasGold || !hasPredicted)
                 {
                     throw new ArgumentException("Trees not of equal length");
                 }
                 Tree goldNode      = goldTree.Current;
                 Tree predictedNode = predictedTree.Current;
                 if (goldNode == null || predictedNode == null)
                 {
                     throw new ArgumentException("Trees not of equal length");
                 }
                 if (goldNode.IsLeaf())
                 {
                     continue;
                 }
                 CoreLabel label = (CoreLabel)goldNode.Label();
                 label.Set(typeof(RNNCoreAnnotations.PredictedClass), RNNCoreAnnotations.GetPredictedClass(predictedNode));
             }
         }
     }
 }
 /// <summary>
 /// Adds sentiment annotations to every sentence of <paramref name="annotation"/>.
 /// </summary>
 /// <remarks>
 /// For each sentence, the binarized tree is transformed with <c>transformer</c>, forward-propagated
 /// through the sentiment model, stored as the SentimentAnnotatedTree, and its predicted class is stored
 /// as the sentence's SentimentClass. If the sentence also has a TreeAnnotation, the sentiment of each
 /// span of the transformed tree is copied onto the nodes of that tree that cover the same span.
 /// </remarks>
 /// <param name="annotation">Annotation that must contain a SentencesAnnotation.</param>
 /// <exception cref="Exception">The annotation has no SentencesAnnotation.</exception>
 /// <exception cref="AssertionError">A sentence has no BinarizedTreeAnnotation.</exception>
 /// <exception cref="InvalidOperationException">The root label of a sentence's tree already has a SpanAnnotation.</exception>
 public virtual void Annotate(Annotation annotation)
 {
     if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
     {
         throw new Exception("unable to find sentences in: " + annotation);
     }
     // TODO: parallelize
     IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
     foreach (ICoreMap sentence in sentences)
     {
         Tree binarized = sentence.Get(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation));
         if (binarized == null)
         {
             throw new AssertionError("Binarized sentences not built by parser");
         }
         Tree collapsedUnary             = transformer.TransformTree(binarized);
         SentimentCostAndGradient scorer = new SentimentCostAndGradient(model, null);
         scorer.ForwardPropagateTree(collapsedUnary);
         sentence.Set(typeof(SentimentCoreAnnotations.SentimentAnnotatedTree), collapsedUnary);
         int sentiment = RNNCoreAnnotations.GetPredictedClass(collapsedUnary);
         sentence.Set(typeof(SentimentCoreAnnotations.SentimentClass), SentimentUtils.SentimentString(model, sentiment));
         Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
         if (tree == null)
         {
             continue;
         }
         collapsedUnary.SetSpans();
         // map the sentiment annotations onto the tree
         IDictionary <IntPair, string> spanSentiment = Generics.NewHashMap();
         foreach (Tree bt in collapsedUnary)
         {
             IntPair p       = bt.GetSpan();
             int     sen     = RNNCoreAnnotations.GetPredictedClass(bt);
             string  sentStr = SentimentUtils.SentimentString(model, sen);
             if (!spanSentiment.ContainsKey(p))
             {
                 // we'll take the first = highest one discovered
                 spanSentiment[p] = sentStr;
             }
         }
         if (((CoreLabel)tree.Label()).ContainsKey(typeof(CoreAnnotations.SpanAnnotation)))
         {
             throw new InvalidOperationException("This code assumes you don't have SpanAnnotation");
         }
         tree.SetSpans();
         foreach (Tree t in tree)
         {
             IntPair p = t.GetSpan();
             // A span of the original tree need not exist in the transformed tree. The dictionary
             // indexer would throw for a missing key, so look it up with TryGetValue.
             string str;
             if (spanSentiment.TryGetValue(p, out str) && str != null)
             {
                 CoreLabel cl = (CoreLabel)t.Label();
                 cl.Set(typeof(SentimentCoreAnnotations.SentimentClass), str);
                 cl.Remove(typeof(CoreAnnotations.SpanAnnotation));
             }
         }
     }
 }
        /// <summary>
        /// Takes the two topmost trees from the state's stack and pushes a new node, labelled with this
        /// transition's <c>label</c>, that has them as its left and right children.
        /// </summary>
        /// <remarks>
        /// The head word and head tag label annotations of the new node are copied from the left or the
        /// right child, depending on <c>side</c>.
        /// </remarks>
        /// <param name="state">State supplying the stack, transitions, separators, sentence, token position and score.</param>
        /// <param name="scoreDelta">Amount added to the state's score in the returned State.</param>
        /// <returns>A new State built from the resulting stack with this transition pushed onto the transitions.</returns>
        /// <exception cref="ArgumentException">
        /// <c>side</c> is neither Left nor Right, or the head tree's label is not a CoreLabel.
        /// </exception>
        public virtual State Apply(State state, double scoreDelta)
        {
            TreeShapedStack <Tree> remaining = state.stack;

            // The right child is on top of the stack, the left child directly below it.
            Tree right = remaining.Peek();
            remaining = remaining.Pop();
            Tree left = remaining.Peek();
            remaining = remaining.Pop();

            Tree head;
            if (side == BinaryTransition.Side.Left)
            {
                head = left;
            }
            else if (side == BinaryTransition.Side.Right)
            {
                head = right;
            }
            else
            {
                throw new ArgumentException("Unknown side " + side);
            }

            CoreLabel headLabel = head.Label() as CoreLabel;
            if (headLabel == null)
            {
                throw new ArgumentException("Stack should have CoreLabel nodes");
            }

            CoreLabel production = new CoreLabel();
            production.SetValue(label);
            production.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), headLabel.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)));
            production.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), headLabel.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)));

            Tree newTop = new LabeledScoredTreeNode(production);
            newTop.AddChild(left);
            newTop.AddChild(right);
            remaining = remaining.Push(newTop);

            return new State(remaining, state.transitions.Push(this), state.separators, state.sentence, state.tokenPosition, state.score + scoreDelta, false);
        }
Example #14
0
        /// <summary>
        /// Derives the true-cased form of a token from its TrueCaseAnnotation and stores it as the
        /// TrueCaseTextAnnotation.
        /// </summary>
        /// <remarks>
        /// "UPPER" and "LOWER" change the case of the whole word, "INIT_UPPER" title-cases the first
        /// character and lower-cases the rest, and "O" looks the lower-cased word up in
        /// <c>mixedCaseMap</c>. Any other value, and an "O" word missing from the map, leaves the text as
        /// it is. When <c>overwriteText</c> is set, the token's text and value annotations are replaced
        /// by the true-cased form as well.
        /// </remarks>
        /// <param name="l">Token to annotate; it is modified in place.</param>
        private void SetTrueCaseText(CoreLabel l)
        {
            string trueCase     = l.GetString <CoreAnnotations.TrueCaseAnnotation>();
            string text         = l.Word();
            string trueCaseText = text;

            switch (trueCase)
            {
            case "UPPER":
            {
                trueCaseText = text.ToUpper();
                break;
            }

            case "LOWER":
            {
                trueCaseText = text.ToLower();
                break;
            }

            case "INIT_UPPER":
            {
                // An empty token has no first character to capitalize; indexing text[0] would throw,
                // so leave it unchanged.
                if (text.Length > 0)
                {
                    trueCaseText = char.ToTitleCase(text[0]) + Sharpen.Runtime.Substring(text, 1).ToLower();
                }
                break;
            }

            case "O":
            {
                // The model predicted mixed case, so lookup the map:
                string lower = text.ToLower();
                if (mixedCaseMap.Contains(lower))
                {
                    trueCaseText = mixedCaseMap[lower];
                }
                // else leave it as it was?
                break;
            }
            }
            // System.err.println(text + " was classified as " + trueCase + " and so became " + trueCaseText);
            l.Set(typeof(CoreAnnotations.TrueCaseTextAnnotation), trueCaseText);
            if (overwriteText)
            {
                l.Set(typeof(CoreAnnotations.TextAnnotation), trueCaseText);
                l.Set(typeof(CoreAnnotations.ValueAnnotation), trueCaseText);
            }
        }
Example #15
0
        /// <summary>
        /// Creates a new CoreLabel holding every key of <paramref name="base"/> followed by every key of
        /// <paramref name="toBeMerged"/>. See also merge(CoreMap base, CoreMap toBeMerged).
        /// </summary>
        /// <param name="base">Label whose entries are written first.</param>
        /// <param name="toBeMerged">Label whose entries are written second, so for a shared key its value is the last one set.</param>
        /// <returns>The new label; neither input is written to by this method.</returns>
        public static CoreLabel Merge(CoreLabel @base, CoreLabel toBeMerged)
        {
            CoreLabel merged = new CoreLabel(@base.Size());

            // Write the sources in order: base first, then toBeMerged.
            CoreLabel[] sources = new CoreLabel[] { @base, toBeMerged };
            foreach (CoreLabel source in sources)
            {
                foreach (Type key in source.KeySet())
                {
                    merged.Set(key, source.Get(key));
                }
            }
            return merged;
        }
Example #16
0
        /// <summary>
        /// Creates a copy of <paramref name="cl"/> whose word, value and original text are
        /// <paramref name="part"/> and whose begin/end positions are the given ones.
        /// </summary>
        /// <param name="cl">Label to copy.</param>
        /// <param name="part">New word text for the copy.</param>
        /// <param name="beginPosition">Begin position set on the copy.</param>
        /// <param name="endPosition">End position set on the copy.</param>
        /// <returns>The modified copy.</returns>
        private static CoreLabel CopyCoreLabel(CoreLabel cl, string part, int beginPosition, int endPosition)
        {
            CoreLabel piece = new CoreLabel(cl);

            // Text first, then positions, then the original-text annotation.
            piece.SetWord(part);
            piece.SetValue(part);

            piece.SetBeginPosition(beginPosition);
            piece.SetEndPosition(endPosition);

            piece.Set(typeof(CoreAnnotations.OriginalTextAnnotation), part);
            return piece;
        }
Example #17
0
        /// <summary>
        /// Segments <paramref name="line"/> and returns one CoreLabel per token span found in the IOB labelling.
        /// </summary>
        /// <remarks>
        /// Each token's word, value and text are the string produced by <c>IOBUtils.IOBToString</c> for its
        /// span. Its character offsets are the begin position of the span's first labelled element and the
        /// end position of its last one; the original text is the substring of <paramref name="line"/>
        /// between those offsets.
        /// </remarks>
        /// <param name="line">Text to segment.</param>
        /// <returns>The tokens, in span order.</returns>
        public virtual IList <CoreLabel> SegmentStringToTokenList(string line)
        {
            IList <CoreLabel> tokens     = CollectionUtils.MakeList();
            IList <CoreLabel> iobLabeled = SegmentStringToIOB(line);

            foreach (IntPair span in IOBUtils.TokenSpansForIOB(iobLabeled))
            {
                CoreLabel segment = new CoreLabel();

                string surface = IOBUtils.IOBToString(iobLabeled, prefixMarker, suffixMarker, span.GetSource(), span.GetTarget());
                segment.SetWord(surface);
                segment.SetValue(surface);
                segment.Set(typeof(CoreAnnotations.TextAnnotation), surface);
                segment.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");

                // The span target is exclusive, hence the "- 1" for the last element of the span.
                int offsetBegin = iobLabeled[span.GetSource()].BeginPosition();
                int offsetEnd   = iobLabeled[span.GetTarget() - 1].EndPosition();
                segment.SetOriginalText(Sharpen.Runtime.Substring(line, offsetBegin, offsetEnd));
                segment.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), offsetBegin);
                segment.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), offsetEnd);

                tokens.Add(segment);
            }
            return tokens;
        }
Example #18
0
 /// <summary>Splits a compound marked by the lexer into its parts.</summary>
 /// <remarks>
 /// The ParentAnnotation is removed from <paramref name="cl"/>, every "-" in its word is surrounded by
 /// spaces, and the result is split on whitespace. For each part a copy of <paramref name="cl"/> with that
 /// part as word, value and original text is appended to <c>compoundBuffer</c>.
 /// </remarks>
 /// <param name="cl">The compound token; its ParentAnnotation is removed.</param>
 /// <returns>The result of <c>compoundBuffer.Remove(0)</c> (presumably the first buffered part — confirm against the buffer type).</returns>
 private CoreLabel ProcessCompound(CoreLabel cl)
 {
     cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

     string   spaced = cl.Word().ReplaceAll("-", " - ");
     string[] pieces = spaced.Split("\\s+");
     for (int i = 0; i < pieces.Length; i++)
     {
         string    piece      = pieces[i];
         CoreLabel pieceLabel = new CoreLabel(cl);
         pieceLabel.SetWord(piece);
         pieceLabel.SetValue(piece);
         pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
         compoundBuffer.Add(pieceLabel);
     }
     return compoundBuffer.Remove(0);
 }
        /// <summary>Replaces the label of each leaf of a Tree with the corresponding CoreLabel of the sentence.</summary>
        /// <remarks>
        /// Leaves and sentence tokens are paired by position. Before a CoreLabel becomes a leaf's label,
        /// its ValueAnnotation is overwritten with the leaf's current value, so the tree keeps its original
        /// leaf values. Afterwards the leaves of the tree are re-indexed.
        /// </remarks>
        /// <param name="tree">Tree whose leaf labels are replaced.</param>
        /// <param name="sentence">Labels to install; each one used has its ValueAnnotation modified.</param>
        public static void MergeLabels(Tree tree, IList <CoreLabel> sentence)
        {
            // todo [cdm 2015]: This clearly shouldn't be here! Maybe it's not needed at all now since parsing code does this?
            int tokenIndex = 0;

            foreach (Tree leaf in tree.GetLeaves())
            {
                CoreLabel tokenLabel = sentence[tokenIndex];
                tokenIndex++;
                tokenLabel.Set(typeof(CoreAnnotations.ValueAnnotation), leaf.Value());
                leaf.SetLabel(tokenLabel);
            }
            tree.IndexLeaves();
        }
Example #20
0
 /// <summary>
 /// Walks <paramref name="t"/> and collects, for every preterminal, the CoreLabel of its first child
 /// after storing the preterminal's own label on it as the TagLabelAnnotation.
 /// </summary>
 /// <param name="t">Tree (or subtree) to walk.</param>
 /// <param name="l">List receiving the collected labels, in traversal order.</param>
 private static void TaggedLeafLabels(Tree t, IList <CoreLabel> l)
 {
     if (!t.IsPreTerminal())
     {
         // Not a preterminal yet: keep descending.
         foreach (Tree child in t.Children())
         {
             TaggedLeafLabels(child, l);
         }
         return;
     }

     CoreLabel leafLabel = (CoreLabel)t.GetChild(0).Label();
     leafLabel.Set(typeof(CoreAnnotations.TagLabelAnnotation), t.Label());
     l.Add(leafLabel);
 }
Example #21
0
        /// <summary>Create a datum from a string.</summary>
        /// <remarks>
        /// The CoreAnnotations must correspond to those used by SequenceClassifier. The new token gets
        /// <paramref name="token"/> as its TextAnnotation and CharAnnotation and <paramref name="label"/>
        /// as its AnswerAnnotation and GoldAnswerAnnotation. Its
        /// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/> and
        /// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetEndAnnotation"/> are the
        /// CharacterOffsetBeginAnnotation of <paramref name="cl"/> plus <paramref name="startOffset"/> and
        /// plus <paramref name="endOffset"/> respectively. The DomainAnnotation is copied from
        /// <paramref name="cl"/>, if present.
        /// </remarks>
        /// <param name="cl">Source label supplying the base character offset and the optional domain; must not be null.</param>
        /// <param name="token">Text of the datum.</param>
        /// <param name="label">Answer and gold answer of the datum.</param>
        /// <param name="startOffset">Offset of the datum's start relative to the begin offset of <paramref name="cl"/>.</param>
        /// <param name="endOffset">Offset of the datum's end relative to the begin offset of <paramref name="cl"/>.</param>
        /// <returns>The new datum.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="cl"/> is null.</exception>
        private static CoreLabel CreateDatum(CoreLabel cl, string token, string label, int startOffset, int endOffset)
        {
            // cl is dereferenced unconditionally for the offsets below, so a later null test cannot
            // protect anything; fail fast with a clear error instead.
            if (cl == null)
            {
                throw new System.ArgumentNullException("cl");
            }
            CoreLabel newTok = new CoreLabel();

            newTok.Set(typeof(CoreAnnotations.TextAnnotation), token);
            newTok.Set(typeof(CoreAnnotations.CharAnnotation), token);
            newTok.Set(typeof(CoreAnnotations.AnswerAnnotation), label);
            newTok.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
            // Both offsets are relative to the BEGIN offset of cl.
            newTok.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) + startOffset);
            newTok.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) + endOffset);
            if (cl.ContainsKey(typeof(CoreAnnotations.DomainAnnotation)))
            {
                newTok.Set(typeof(CoreAnnotations.DomainAnnotation), cl.Get(typeof(CoreAnnotations.DomainAnnotation)));
            }
            return newTok;
        }
        /// <summary>Remove everything but the skeleton, the predictions, and the labels</summary>
        /// <remarks>
        /// Recursively rebuilds <paramref name="tree"/> with the tree's own factory. Every node of the copy
        /// gets a fresh CoreLabel that carries only the Predictions annotation and the value of the
        /// original node's label.
        /// </remarks>
        /// <param name="tree">Tree to simplify.</param>
        /// <returns>The simplified copy.</returns>
        private Tree SimplifyTree(Tree tree)
        {
            CoreLabel newLabel = new CoreLabel();

            newLabel.Set(typeof(RNNCoreAnnotations.Predictions), RNNCoreAnnotations.GetPredictions(tree));
            newLabel.SetValue(tree.Label().Value());
            if (tree.IsLeaf())
            {
                return tree.TreeFactory().NewLeaf(newLabel);
            }

            // Fetch the child array once instead of calling Children() again for every loop test and access.
            Tree[]       originalChildren = tree.Children();
            IList <Tree> children         = Generics.NewArrayList(originalChildren.Length);

            foreach (Tree child in originalChildren)
            {
                children.Add(SimplifyTree(child));
            }
            return tree.TreeFactory().NewTreeNode(newLabel, children);
        }
Example #23
0
        /// <summary>Splits a compound marked by the lexer into its parts.</summary>
        /// <remarks>
        /// The ParentAnnotation is removed from <paramref name="cl"/>. Its word is rewritten with
        /// <c>pDash</c> (matches replaced by " - ") and split with <c>pSpace</c>. Each part becomes a copy of
        /// <paramref name="cl"/> with that part as word, value and original text, appended to
        /// <c>compoundBuffer</c>. The positions of the parts are laid out back to back, starting at the begin
        /// position of <paramref name="cl"/>.
        /// </remarks>
        /// <param name="cl">The compound token; its ParentAnnotation is removed.</param>
        /// <returns>The result of <c>compoundBuffer.Remove(0)</c> (presumably the first buffered part — confirm against the buffer type).</returns>
        private CoreLabel ProcessCompound(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

            string   spaced   = pDash.Matcher(cl.Word()).ReplaceAll(" - ");
            string[] pieces   = pSpace.Split(spaced);
            int      consumed = 0;

            for (int i = 0; i < pieces.Length; i++)
            {
                string    piece      = pieces[i];
                CoreLabel pieceLabel = new CoreLabel(cl);
                pieceLabel.SetWord(piece);
                pieceLabel.SetValue(piece);
                // consumed counts the characters of all earlier parts.
                pieceLabel.SetBeginPosition(cl.BeginPosition() + consumed);
                pieceLabel.SetEndPosition(cl.BeginPosition() + consumed + piece.Length);
                pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
                compoundBuffer.Add(pieceLabel);
                consumed += piece.Length;
            }
            return compoundBuffer.Remove(0);
        }
 /// <exception cref="System.Exception"/>
 public virtual Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> > Call()
 {
     // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
     // CollectionValuedMap<String, Integer>();
     try
     {
         ICollection <CandidatePhrase> alreadyLabeledPhrases                    = new HashSet <CandidatePhrase>();
         TwoDimensionalCounter <CandidatePhrase, E>          allFreq            = new TwoDimensionalCounter <CandidatePhrase, E>();
         CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat = new CollectionValuedMap <E, Triple <string, int, int> >();
         foreach (string sentid in sentids)
         {
             IList <CoreLabel> sent = sents[sentid].GetTokens();
             foreach (KeyValuePair <TokenSequencePattern, E> pEn in patterns)
             {
                 if (pEn.Key == null)
                 {
                     throw new Exception("why is the pattern " + pEn + " null?");
                 }
                 TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                 //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                 //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                 //Higher branch values makes the faster but uses more memory
                 m.SetBranchLimit(5);
                 while (m.Find())
                 {
                     int s = m.Start("$term");
                     int e = m.End("$term");
                     System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label "
                                                     + label);
                     string phrase            = string.Empty;
                     string phraseLemma       = string.Empty;
                     bool   useWordNotLabeled = false;
                     bool   doNotUse          = false;
                     //find if the neighboring words are labeled - if so - club them together
                     if (constVars.clubNeighboringLabeledWords)
                     {
                         for (int i = s - 1; i >= 0; i--)
                         {
                             if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 s = i + 1;
                                 break;
                             }
                         }
                         for (int i_1 = e; i_1 < sent.Count; i_1++)
                         {
                             if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                             {
                                 e = i_1;
                                 break;
                             }
                         }
                     }
                     //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                     bool[] addedindices = new bool[e - s];
                     // Arrays.fill(addedindices, false); // not needed as initialized false
                     for (int i_2 = s; i_2 < e; i_2++)
                     {
                         CoreLabel l = sent[i_2];
                         l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                         if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                         {
                             l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet <Pattern>());
                         }
                         SurfacePattern pSur = (SurfacePattern)pEn.Value;
                         System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                         System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                         l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                         foreach (KeyValuePair <Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                         {
                             if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                             {
                                 doNotUse = true;
                             }
                         }
                         bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                         if (removePhrasesWithStopWords && containsStop)
                         {
                             doNotUse = true;
                         }
                         else
                         {
                             if (!containsStop || !removeStopWordsFromSelectedPhrases)
                             {
                                 if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                 {
                                     useWordNotLabeled = true;
                                 }
                                 phrase               += " " + l.Word();
                                 phraseLemma          += " " + l.Lemma();
                                 addedindices[i_2 - s] = true;
                             }
                         }
                     }
                     for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                     {
                         if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                         {
                             doNotUse = true;
                             break;
                         }
                     }
                     if (!doNotUse)
                     {
                         matchedTokensByPat.Add(pEn.Value, new Triple <string, int, int>(sentid, s, e - 1));
                         phrase = phrase.Trim();
                         if (!phrase.IsEmpty())
                         {
                             phraseLemma = phraseLemma.Trim();
                             CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                             allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                             if (!useWordNotLabeled)
                             {
                                 alreadyLabeledPhrases.Add(candPhrase);
                             }
                         }
                     }
                 }
             }
         }
         return(new Triple <TwoDimensionalCounter <CandidatePhrase, E>, CollectionValuedMap <E, Triple <string, int, int> >, ICollection <CandidatePhrase> >(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
     }
     catch (Exception e)
     {
         logger.Error(e);
         throw;
     }
 }
        /// <summary>
        /// Find the operators in this sentence, annotating the head word (only!) of each operator with the
        /// <see cref="OperatorAnnotation"/>.
        /// </summary>
        /// <remarks>
        /// The basic dependency graph is used when present, otherwise the enhanced dependency graph.
        /// Every pattern in <c>Patterns</c> is matched against that graph; for each match the nodes named
        /// "Subject", "quantifier", "subject", "object" and "pivot" are read, a quantifier span is
        /// validated, and the resulting <see cref="OperatorSpec"/> is stored on the token at the
        /// quantifier's (1-based) index. A second pass drops a sentence-initial "no" that is followed by
        /// a pronoun, and removes operator annotations from every non-head token inside a quantifier span.
        /// </remarks>
        /// <param name="sentence">
        /// As in
        /// <see cref="DoOneSentence(Edu.Stanford.Nlp.Pipeline.Annotation, Edu.Stanford.Nlp.Util.ICoreMap)"/>
        /// </param>
        private void AnnotateOperators(ICoreMap sentence)
        {
            SemanticGraph tree = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
            IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));

            if (tree == null)
            {
                // No basic dependencies on this sentence: fall back to the enhanced graph.
                tree = sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
            }
            foreach (SemgrexPattern pattern in Patterns)
            {
                SemgrexMatcher matcher = pattern.Matcher(tree);
                while (matcher.Find())
                {
                    // Get terms. A match on the capitalized "Subject" node means the subject itself
                    // acts as the quantifier (named-entity case).
                    IndexedWord properSubject = matcher.GetNode("Subject");
                    IndexedWord quantifier;
                    IndexedWord subject;
                    bool namedEntityQuantifier = false;
                    if (properSubject != null)
                    {
                        quantifier = subject = properSubject;
                        namedEntityQuantifier = true;
                    }
                    else
                    {
                        quantifier = matcher.GetNode("quantifier");
                        subject = matcher.GetNode("subject");
                    }
                    IndexedWord @object = matcher.GetNode("object");

                    // Validate quantifier.
                    // At the end of this, quantifierInfo holds the operator and its span, if any.
                    Optional<Triple<Operator, int, int>> quantifierInfo;
                    if (namedEntityQuantifier)
                    {
                        // named entities have the "all" semantics by default.
                        if (!neQuantifiers)
                        {
                            continue;
                        }
                        // note: empty quantifier span given
                        quantifierInfo = Optional.Of(Triple.MakeTriple(Operator.ImplicitNamedEntity, quantifier.Index(), quantifier.Index()));
                    }
                    else
                    {
                        // find the quantifier, and return some info about it.
                        quantifierInfo = ValidateQuantifierByHead(sentence, quantifier, @object == null || subject == null);
                    }

                    // Awful hacks to regularize the subject of things like "one of" and "there are"
                    // (fix up 'there are')
                    if ("be".Equals(subject == null ? null : subject.Lemma()))
                    {
                        bool hasExpl = false;
                        IndexedWord newSubject = null;
                        foreach (SemanticGraphEdge outgoingEdge in tree.OutgoingEdgeIterable(subject))
                        {
                            string relation = outgoingEdge.GetRelation().ToString();
                            if ("nsubj".Equals(relation))
                            {
                                newSubject = outgoingEdge.GetDependent();
                            }
                            else if ("expl".Equals(relation))
                            {
                                hasExpl = true;
                            }
                        }
                        if (hasExpl)
                        {
                            // May become null when an expletive is present but no nsubj edge was seen.
                            subject = newSubject;
                        }
                    }
                    // (fix up '$n$ of')
                    if ("CD".Equals(subject == null ? null : subject.Tag()))
                    {
                        // The edge sequence is obtained once from the original subject; reassigning
                        // subject inside the loop keeps the dependent of the last nmod* edge.
                        foreach (SemanticGraphEdge outgoingEdge in tree.OutgoingEdgeIterable(subject))
                        {
                            string rel = outgoingEdge.GetRelation().ToString();
                            // Relation names are identifiers, so compare ordinally rather than by culture.
                            if (rel.StartsWith("nmod", System.StringComparison.Ordinal))
                            {
                                subject = outgoingEdge.GetDependent();
                            }
                        }
                    }

                    // Set tokens
                    if (quantifierInfo.IsPresent())
                    {
                        // Compute span; the object doubles as the pivot when the pattern names none.
                        IndexedWord pivot = matcher.GetNode("pivot");
                        if (pivot == null)
                        {
                            pivot = @object;
                        }
                        OperatorSpec scope = ComputeScope(tree, quantifierInfo.Get().first, pivot, Pair.MakePair(quantifierInfo.Get().second, quantifierInfo.Get().third), subject, namedEntityQuantifier, @object, tokens.Count);

                        // Set annotation on the quantifier's head token (Index() is 1-based).
                        CoreLabel token = tokens[quantifier.Index() - 1];
                        OperatorSpec oldScope = token.Get(typeof(NaturalLogicAnnotations.OperatorAnnotation));
                        if (oldScope == null || oldScope.QuantifierLength() < scope.QuantifierLength() || oldScope.instance != scope.instance)
                        {
                            token.Set(typeof(NaturalLogicAnnotations.OperatorAnnotation), scope);
                        }
                        else
                        {
                            token.Set(typeof(NaturalLogicAnnotations.OperatorAnnotation), OperatorSpec.Merge(oldScope, scope));
                        }
                    }
                }
            }

            // Ensure we didn't select overlapping quantifiers. For example, "a" and "a few" can often overlap.
            // In these cases, take the longer quantifier match.
            IList<OperatorSpec> quantifiers = new List<OperatorSpec>();
            for (int i = 0; i < tokens.Count; ++i)
            {
                CoreLabel token = tokens[i];
                OperatorSpec @operator = token.Get(typeof(NaturalLogicAnnotations.OperatorAnnotation));
                if (@operator == null)
                {
                    continue;
                }
                if (i == 0 && @operator.instance == Operator.No && tokens.Count > 2 && "PRP".Equals(tokens[1].Get(typeof(CoreAnnotations.PartOfSpeechAnnotation))))
                {
                    // This is pragmatically not a negation -- ignore it
                    // For example, "no I don't like candy" or "no you like cats"
                    token.Remove(typeof(NaturalLogicAnnotations.OperatorAnnotation));
                }
                else
                {
                    quantifiers.Add(@operator);
                }
            }

            // Visit the longest quantifier first. The ordering key is spelled out explicitly (and the
            // sort is stable) so it does not depend on any default comparison of OperatorSpec.
            foreach (OperatorSpec longestFirst in System.Linq.Enumerable.OrderByDescending(quantifiers, q => q.QuantifierLength()))
            {
                for (int k = longestFirst.quantifierBegin; k < longestFirst.quantifierEnd; ++k)
                {
                    if (k != longestFirst.quantifierHead)
                    {
                        tokens[k].Remove(typeof(NaturalLogicAnnotations.OperatorAnnotation));
                    }
                }
            }
        }
        /// <summary>
        /// Annotate any unary quantifiers that weren't found in the main
        /// <see cref="AnnotateOperators(Edu.Stanford.Nlp.Util.ICoreMap)"/>
        /// method.
        /// </summary>
        /// <remarks>
        /// Two passes are made: a Semgrex pass over the dependency graph (basic, or enhanced when
        /// basic is absent) using <c>UnaryPattern</c>, and a TokensRegex pass over the tokens using
        /// <c>DoubtPattern</c>. Tokens already covered by an existing operator span are left alone in
        /// the first pass.
        /// </remarks>
        /// <param name="sentence">The sentence to annotate.</param>
        private static void AnnotateUnaries(ICoreMap sentence)
        {
            // Dependency graph: prefer basic dependencies, otherwise use the enhanced ones.
            SemanticGraph graph = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
            if (graph == null)
            {
                graph = sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
            }
            IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));

            // Mark every token position that lies inside an already-annotated quantifier span.
            bool[] coveredByOperator = new bool[tokens.Count];
            for (int pos = 0; pos < coveredByOperator.Length; ++pos)
            {
                OperatorSpec existing = tokens[pos].Get(typeof(NaturalLogicAnnotations.OperatorAnnotation));
                if (existing == null)
                {
                    continue;
                }
                for (int covered = existing.quantifierBegin; covered < existing.quantifierEnd; ++covered)
                {
                    coveredByOperator[covered] = true;
                }
            }

            // Pass 1: Semgrex matches on the dependency graph.
            SemgrexMatcher unaryMatcher = UnaryPattern.Matcher(graph);
            while (unaryMatcher.Find())
            {
                IndexedWord quantifier = unaryMatcher.GetNode("quantifier");
                string lowered = quantifier.Word().ToLower();
                bool isArticle = lowered.Equals("a") || lowered.Equals("an") || lowered.Equals("the");
                // Articles and cardinal numbers are absurdly common and uninformative; matching them
                // would mostly surface parsing errors and idiomatic expressions.
                if (isArticle || "CD".Equals(quantifier.Tag()))
                {
                    continue;
                }
                IndexedWord subject = unaryMatcher.GetNode("subject");

                // Skip positions that already carry an operator (Index() is 1-based).
                int headPosition = quantifier.Index() - 1;
                if (coveredByOperator[headPosition])
                {
                    continue;
                }

                // Skip matches for which no quantifier span could be validated.
                Optional<Triple<Operator, int, int>> quantifierInfo = ValidateQuantifierByHead(sentence, quantifier, true);
                if (!quantifierInfo.IsPresent())
                {
                    continue;
                }

                // Then add the unary operator!
                Triple<Operator, int, int> located = quantifierInfo.Get();
                OperatorSpec unaryScope = ComputeScope(graph, located.first, subject, Pair.MakePair(located.second, located.third), null, false, null, tokens.Count);
                tokens[headPosition].Set(typeof(NaturalLogicAnnotations.OperatorAnnotation), unaryScope);
            }

            // Pass 2: TokensRegex matches on the token sequence.
            TokenSequenceMatcher doubtMatcher = DoubtPattern.Matcher(tokens);
            while (doubtMatcher.Find())
            {
                IList<CoreLabel> doubtTokens = (IList<CoreLabel>)doubtMatcher.GroupNodes("$doubt");
                IList<CoreLabel> targetTokens = (IList<CoreLabel>)doubtMatcher.GroupNodes("$target");
                foreach (CoreLabel doubtToken in doubtTokens)
                {
                    // The operator sits on the doubt token itself; its first argument span runs from
                    // the first to the last token of the $target group.
                    OperatorSpec doubtSpec = new OperatorSpec(
                        Operator.GeneralNegPolarity,
                        doubtToken.Index() - 1,
                        doubtToken.Index(),
                        targetTokens[0].Index() - 1,
                        targetTokens[targetTokens.Count - 1].Index(),
                        0,
                        0,
                        tokens.Count);
                    doubtToken.Set(typeof(NaturalLogicAnnotations.OperatorAnnotation), doubtSpec);
                }
            }
        }
        /// <summary>Annotate every token for its polarity, based on the operators found.</summary>
        /// <remarks>
        /// Annotate every token for its polarity, based on the operators found. This function will set the
        /// <see cref="PolarityAnnotation"/>
        /// for every token, and then a <c>PolarityDirectionAnnotation</c> of "up", "down" or "flat"
        /// derived from it. Vertices of any dependency graph attached to the sentence are first given
        /// <c>Polarity.Default</c>.
        /// </remarks>
        /// <param name="sentence">
        /// As in
        /// <see cref="DoOneSentence(Edu.Stanford.Nlp.Pipeline.Annotation, Edu.Stanford.Nlp.Util.ICoreMap)"/>
        /// </param>
        private static void AnnotatePolarity(ICoreMap sentence)
        {
            // Collect all the operators in this sentence
            IList<OperatorSpec> operators = new List<OperatorSpec>();
            IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));

            foreach (CoreLabel token in tokens)
            {
                OperatorSpec specOrNull = token.Get(typeof(NaturalLogicAnnotations.OperatorAnnotation));
                if (specOrNull != null)
                {
                    operators.Add(specOrNull);
                }
            }

            // Make sure every node of the dependency tree has a polarity.
            // This is separate from the code below in case the tokens in the dependency
            // tree don't correspond to the tokens in the sentence. This happens at least
            // when the constituency parser craps out on a long sentence, and the
            // dependency tree is put together haphazardly.
            if (sentence.ContainsKey(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)))
            {
                foreach (IndexedWord vertex in sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)).VertexSet())
                {
                    vertex.Set(typeof(NaturalLogicAnnotations.PolarityAnnotation), Polarity.Default);
                }
            }
            if (sentence.ContainsKey(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)))
            {
                foreach (IndexedWord vertex in sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)).VertexSet())
                {
                    vertex.Set(typeof(NaturalLogicAnnotations.PolarityAnnotation), Polarity.Default);
                }
            }
            if (sentence.ContainsKey(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation)))
            {
                foreach (IndexedWord vertex in sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation)).VertexSet())
                {
                    vertex.Set(typeof(NaturalLogicAnnotations.PolarityAnnotation), Polarity.Default);
                }
            }

            // Set polarity for each token
            for (int i = 0; i < tokens.Count; ++i)
            {
                CoreLabel current = tokens[i];

                // Get operators in scope: (argument span length, monotonicity, monotonicity type).
                // A token inside an operator's subject span takes the subject monotonicity; otherwise,
                // if it is inside the object span, it takes the object monotonicity.
                IList<Triple<int, Monotonicity, MonotonicityType>> inScope = new List<Triple<int, Monotonicity, MonotonicityType>>(4);
                foreach (OperatorSpec @operator in operators)
                {
                    if (i >= @operator.subjectBegin && i < @operator.subjectEnd)
                    {
                        inScope.Add(Triple.MakeTriple(@operator.subjectEnd - @operator.subjectBegin, @operator.instance.subjMono, @operator.instance.subjType));
                    }
                    else if (i >= @operator.objectBegin && i < @operator.objectEnd)
                    {
                        inScope.Add(Triple.MakeTriple(@operator.objectEnd - @operator.objectBegin, @operator.instance.objMono, @operator.instance.objType));
                    }
                }

                // Sort the operators by their scope (approximated by the size of their argument span),
                // widest span first. The key is given explicitly and the sort is stable, so the order
                // handed to Polarity never depends on a default comparison of Triple.
                // NOTE(review): widest-first mirrors the upstream CoreNLP comparator -- confirm against
                // the operator order Polarity expects.
                IList<Pair<Monotonicity, MonotonicityType>> info = new List<Pair<Monotonicity, MonotonicityType>>(inScope.Count);
                foreach (Triple<int, Monotonicity, MonotonicityType> term in System.Linq.Enumerable.OrderByDescending(inScope, t => t.first))
                {
                    info.Add(Pair.MakePair(term.second, term.third));
                }

                // Create and set polarity
                current.Set(typeof(NaturalLogicAnnotations.PolarityAnnotation), new Polarity(info));
            }

            // Set the PolarityDirectionAnnotation
            foreach (CoreLabel annotated in tokens)
            {
                Polarity polarity = annotated.Get(typeof(NaturalLogicAnnotations.PolarityAnnotation));
                if (polarity == null)
                {
                    continue;
                }
                string direction;
                if (polarity.IsUpwards())
                {
                    direction = "up";
                }
                else if (polarity.IsDownwards())
                {
                    direction = "down";
                }
                else
                {
                    direction = "flat";
                }
                annotated.Set(typeof(NaturalLogicAnnotations.PolarityDirectionAnnotation), direction);
            }
        }
        /// <summary>
        /// Assigns node vectors and class predictions to every non-leaf node of the given tree.
        /// </summary>
        /// <remarks>
        /// The tree is walked recursively. Preterminals get the tanh of their word vector; binary
        /// nodes get the tanh of the binary transform applied to their children's vectors (plus the
        /// tensor product when <c>model.op.useTensors</c> is set). Each visited node's label then
        /// receives the RNNCoreAnnotations.Predictions, PredictedClass and NodeVector annotations.
        /// Leaves, unary non-preterminals and nodes with more than two children are rejected with an
        /// AssertionError, as are nodes whose label is not a <see cref="CoreLabel"/>.
        /// </remarks>
        public virtual void ForwardPropagateTree(Tree tree)
        {
            if (tree.IsLeaf())
            {
                // Leaves carry no computation of their own: the preterminal above handles the
                // word/tag, so recursion should never arrive here (except for a degenerate
                // single-leaf tree).
                log.Info("SentimentCostAndGradient: warning: We reached leaves in forwardPropagate: " + tree);
                throw new AssertionError("We should not have reached leaves in forwardPropagate");
            }

            SimpleMatrix classification;
            SimpleMatrix nodeVector;

            if (tree.IsPreTerminal())
            {
                classification = model.GetUnaryClassification(tree.Label().Value());
                string leafWord = tree.Children()[0].Label().Value();
                nodeVector = NeuralUtils.ElementwiseApplyTanh(model.GetWordVector(leafWord));
            }
            else
            {
                int childCount = tree.Children().Length;
                if (childCount == 1)
                {
                    log.Info("SentimentCostAndGradient: warning: Non-preterminal nodes of size 1: " + tree);
                    throw new AssertionError("Non-preterminal nodes of size 1 should have already been collapsed");
                }
                if (childCount != 2)
                {
                    log.Info("SentimentCostAndGradient: warning: Tree not correctly binarized: " + tree);
                    throw new AssertionError("Tree not correctly binarized");
                }

                // Binary node: children first, so their node vectors exist when we combine them.
                Tree leftChild = tree.Children()[0];
                Tree rightChild = tree.Children()[1];
                ForwardPropagateTree(leftChild);
                ForwardPropagateTree(rightChild);

                string leftCategory = leftChild.Label().Value();
                string rightCategory = rightChild.Label().Value();
                SimpleMatrix transform = model.GetBinaryTransform(leftCategory, rightCategory);
                classification = model.GetBinaryClassification(leftCategory, rightCategory);

                SimpleMatrix leftVector = RNNCoreAnnotations.GetNodeVector(leftChild);
                SimpleMatrix rightVector = RNNCoreAnnotations.GetNodeVector(rightChild);
                SimpleMatrix stackedWithBias = NeuralUtils.ConcatenateWithBias(leftVector, rightVector);

                SimpleMatrix preActivation;
                if (model.op.useTensors)
                {
                    SimpleTensor tensor = model.GetBinaryTensor(leftCategory, rightCategory);
                    SimpleMatrix tensorInput = NeuralUtils.Concatenate(leftVector, rightVector);
                    SimpleMatrix tensorOutput = tensor.BilinearProducts(tensorInput);
                    preActivation = transform.Mult(stackedWithBias).Plus(tensorOutput);
                }
                else
                {
                    preActivation = transform.Mult(stackedWithBias);
                }
                nodeVector = NeuralUtils.ElementwiseApplyTanh(preActivation);
            }

            SimpleMatrix predictions = NeuralUtils.Softmax(classification.Mult(NeuralUtils.ConcatenateWithBias(nodeVector)));
            int predictedClass = GetPredictedClass(predictions);

            // The results are stored on the node's label, which therefore has to be a CoreLabel.
            CoreLabel nodeLabel = tree.Label() as CoreLabel;
            if (nodeLabel == null)
            {
                log.Info("SentimentCostAndGradient: warning: No CoreLabels in nodes: " + tree);
                throw new AssertionError("Expected CoreLabels in the nodes");
            }

            nodeLabel.Set(typeof(RNNCoreAnnotations.Predictions), predictions);
            nodeLabel.Set(typeof(RNNCoreAnnotations.PredictedClass), predictedClass);
            nodeLabel.Set(typeof(RNNCoreAnnotations.NodeVector), nodeVector);
        }
        /// <summary>
        /// Reads labeled text line by line, tokenizes it, and turns inline
        /// <c>&lt;LABEL&gt; ... &lt;/LABEL&gt;</c> markers into per-token annotations.
        /// </summary>
        /// <remarks>
        /// Each input line is split on the first tab into <c>id</c> and <c>text</c>; a line without a
        /// tab is treated as text only and the zero-based line number is used as its id. The text is
        /// split into sentences and tokens by a <c>DocumentPreprocessor</c>. A token that matches an
        /// opening marker switches the current label to that category, a token that matches a closing
        /// marker switches it back to the background symbol "O", and neither kind of marker is kept
        /// as a token. The current label is reset to "O" at the start of every line.
        /// </remarks>
        /// <param name="reader">Source of the input lines; it is read to the end but not closed here.</param>
        /// <param name="categoriesAllowed">
        /// Category names recognized inside the markers. They are joined with "|" into a regular
        /// expression without escaping.
        /// </param>
        /// <param name="setClassForTheseLabels">
        /// Optional map from label to the annotation key under which that label is stored on a token;
        /// may be null.
        /// </param>
        /// <param name="setGoldClass">
        /// When true, every token also receives the current label as its GoldAnswerAnnotation.
        /// </param>
        /// <param name="sentIDprefix">Prefix prepended to every line id.</param>
        /// <returns>
        /// One map per sentence carrying the space-joined token text, the token list and a document id
        /// of the form <c>prefix + id + "-" + sentenceNumber</c>.
        /// </returns>
        /// <exception cref="System.IO.IOException"/>
        public static IList<ICoreMap> ParseFile(BufferedReader reader, ICollection<string> categoriesAllowed, IDictionary<string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
        {
            Pattern startingLabelToken = Pattern.Compile("<(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
            Pattern endLabelToken = Pattern.Compile("</(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
            string backgroundSymbol = "O";
            // The tokenizer configuration is the same for every line, so build the factory once.
            PTBTokenizer.PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
            IList<ICoreMap> sentences = new List<ICoreMap>();
            int lineNum = -1;
            string l = null;

            while ((l = reader.ReadLine()) != null)
            {
                lineNum++;
                string[] t = l.Split("\t", 2);
                string id = null;
                string text = null;
                if (t.Length == 2)
                {
                    id = t[0];
                    text = t[1];
                }
                else if (t.Length == 1)
                {
                    // No explicit id on this line: fall back to the line number.
                    text = t[0];
                    id = lineNum.ToString();
                }
                id = sentIDprefix + id;

                DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
                dp.SetTokenizerFactory(tokenizerFactory);

                // The label carries over from sentence to sentence within a line, but not across lines.
                string label = backgroundSymbol;
                int sentNum = -1;
                foreach (IList<IHasWord> sentence in dp)
                {
                    sentNum++;
                    System.Text.StringBuilder sentText = new System.Text.StringBuilder();
                    IList<CoreLabel> sent = new List<CoreLabel>();
                    foreach (IHasWord tokw in sentence)
                    {
                        string tok = tokw.Word();
                        Matcher startingMatcher = startingLabelToken.Matcher(tok);
                        if (startingMatcher.Matches())
                        {
                            // Opening marker: remember the category, emit no token.
                            label = startingMatcher.Group(1);
                            continue;
                        }
                        if (endLabelToken.Matcher(tok).Matches())
                        {
                            // Closing marker: back to the background label, emit no token.
                            label = backgroundSymbol;
                            continue;
                        }

                        sentText.Append(' ').Append(tok);
                        CoreLabel c = new CoreLabel();
                        c.SetWord(tok);
                        c.SetLemma(tok);
                        c.SetValue(tok);
                        c.Set(typeof(CoreAnnotations.TextAnnotation), tok);
                        c.Set(typeof(CoreAnnotations.OriginalTextAnnotation), tok);
                        if (setGoldClass)
                        {
                            c.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                        }
                        // Look the label up by key; a single TryGetValue avoids a second lookup.
                        Type labelClass;
                        if (setClassForTheseLabels != null && setClassForTheseLabels.TryGetValue(label, out labelClass))
                        {
                            c.Set(labelClass, label);
                        }
                        sent.Add(c);
                    }

                    ICoreMap sentcm = new ArrayCoreMap();
                    sentcm.Set(typeof(CoreAnnotations.TextAnnotation), sentText.ToString().Trim());
                    sentcm.Set(typeof(CoreAnnotations.TokensAnnotation), sent);
                    sentcm.Set(typeof(CoreAnnotations.DocIDAnnotation), id + "-" + sentNum);
                    sentences.Add(sentcm);
                }
            }
            return sentences;
        }
Example #30
0
        /// <summary>Create an Annotation object (with a single sentence) from the given specification.</summary>
        /// <remarks>
        /// Token character offsets are synthesized as if the words were joined by exactly one separator
        /// character each; they are not derived from <paramref name="gloss"/>.
        /// </remarks>
        /// <param name="docid">Document id stored on every token, the sentence and the document; "???" is used when absent.</param>
        /// <param name="sentenceIndex">Sentence index stored on every token, the sentence and the document; -1 is used when absent.</param>
        /// <param name="gloss">Sentence text. Literal backslash-n and backslash-t sequences are replaced by real newline and tab characters.</param>
        /// <param name="tree">Builds, from the token list, the graph stored under the basic, collapsed and CC-processed dependency keys.</param>
        /// <param name="maltTree">Builds, from the token list, the graph stored under the alternative dependency key.</param>
        /// <param name="words">Surface form of each token.</param>
        /// <param name="lemmas">Lemma of each token; must have as many entries as <paramref name="words"/>.</param>
        /// <param name="pos">Part-of-speech tag of each token; must have as many entries as <paramref name="words"/>.</param>
        /// <param name="ner">Named-entity tag of each token; must have as many entries as <paramref name="words"/>.</param>
        /// <param name="sentenceid">Sentence identifier; only used to build error messages.</param>
        /// <returns>A document annotation holding the tokens and a singleton sentence list.</returns>
        /// <exception cref="ArgumentException">
        /// If <paramref name="lemmas"/>, <paramref name="pos"/> or <paramref name="ner"/> differs in length from <paramref name="words"/>.
        /// </exception>
        private static Annotation ParseSentence(
            Optional <string> docid,
            Optional <int> sentenceIndex,
            string gloss,
            Func <IList <CoreLabel>, SemanticGraph> tree,
            Func <IList <CoreLabel>, SemanticGraph> maltTree,
            IList <string> words,
            IList <string> lemmas,
            IList <string> pos,
            IList <string> ner,
            Optional <string> sentenceid)
        {
            // Error checks: every per-token list must line up with the words.
            // Each message names the offending list so the three failures can be told apart.
            if (lemmas.Count != words.Count)
            {
                throw new ArgumentException("Array lengths don't match: " + words.Count + " words vs " + lemmas.Count + " lemmas (sentence " + sentenceid.OrElse("???") + ")");
            }
            if (pos.Count != words.Count)
            {
                throw new ArgumentException("Array lengths don't match: " + words.Count + " words vs " + pos.Count + " pos tags (sentence " + sentenceid.OrElse("???") + ")");
            }
            if (ner.Count != words.Count)
            {
                throw new ArgumentException("Array lengths don't match: " + words.Count + " words vs " + ner.Count + " ner tags (sentence " + sentenceid.OrElse("???") + ")");
            }

            // These values are identical for every token, the sentence and the document; resolve them once.
            string docIdValue       = docid.OrElse("???");
            int    sentenceIdxValue = sentenceIndex.OrElse(-1);

            // Create structure
            IList <CoreLabel> tokens = new List <CoreLabel>(words.Count);
            int beginChar            = 0;

            for (int i = 0; i < words.Count; ++i)
            {
                string    word  = words[i];
                CoreLabel token = new CoreLabel(12);

                token.SetWord(word);
                token.SetValue(word);
                token.SetBeginPosition(beginChar);
                token.SetEndPosition(beginChar + word.Length);
                // Skip one extra character for the assumed single separator between tokens.
                beginChar += word.Length + 1;
                token.SetLemma(lemmas[i]);
                token.SetTag(pos[i]);
                token.SetNER(ner[i]);
                token.Set(typeof(CoreAnnotations.DocIDAnnotation), docIdValue);
                token.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIdxValue);
                // IndexAnnotation is 1-based, while the token span [i, i + 1) is 0-based.
                token.Set(typeof(CoreAnnotations.IndexAnnotation), i + 1);
                token.Set(typeof(CoreAnnotations.TokenBeginAnnotation), i);
                token.Set(typeof(CoreAnnotations.TokenEndAnnotation), i + 1);
                tokens.Add(token);
            }

            // Un-escape newline and tab sequences in the gloss.
            gloss = gloss.Replace("\\n", "\n").Replace("\\t", "\t");

            ICoreMap sentence = new ArrayCoreMap(16);

            sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);

            // The same graph instance is registered under all three primary dependency keys.
            SemanticGraph graph = tree.Apply(tokens);

            sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), graph);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), graph);
            sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), graph);

            SemanticGraph maltGraph = maltTree.Apply(tokens);

            sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), maltGraph);
            sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docIdValue);
            sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIdxValue);
            sentence.Set(typeof(CoreAnnotations.TextAnnotation), gloss);
            sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
            sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Count);

            // Wrap the single sentence in a document-level annotation.
            Annotation doc = new Annotation(gloss);

            doc.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
            doc.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
            doc.Set(typeof(CoreAnnotations.DocIDAnnotation), docIdValue);
            doc.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIdxValue);
            return(doc);
        }