Ejemplo n.º 1
0
        /// <summary>
        /// Builds a dependency from two plain strings, wrapping each one in a new
        /// <c>CoreLabel</c> whose value and word are both set to that string.
        /// </summary>
        /// <param name="regent">Text of the governing word; must not be null.</param>
        /// <param name="dependent">Text of the dependent word; must not be null.</param>
        /// <exception cref="System.ArgumentException">If either argument is null.</exception>
        public UnnamedDependency(string regent, string dependent)
        {
            bool missingArgument = regent == null || dependent == null;
            if (missingArgument)
            {
                throw new ArgumentException("governor or dependent cannot be null");
            }

            // The label text is also kept in separate string fields.  While
            // unserializing, an object may be asked for its hash code in a
            // partially reconstructed state (e.g. a TreeGraphNode asking for the
            // hash code of an UnnamedDependency that would in turn read a
            // not-yet-filled member of the same TreeGraphNode).  Holding the text
            // directly breaks that possible cycle.
            regentText    = regent;
            dependentText = dependent;

            CoreLabel governorLabel = new CoreLabel();
            governorLabel.SetValue(regent);
            governorLabel.SetWord(regent);
            this.regent = governorLabel;

            CoreLabel dependentLabel = new CoreLabel();
            dependentLabel.SetValue(dependent);
            dependentLabel.SetWord(dependent);
            this.dependent = dependentLabel;
        }
        /// <summary>
        /// Builds a <c>CoreLabel</c> token for <paramref name="tokenText"/> and records its
        /// character offsets.
        /// </summary>
        /// <remarks>
        /// The unmodified text is stored as the token's original text.  Text matching
        /// <c>separatorPattern</c> is replaced by <c>AbstractTokenizer.NewlineToken</c>;
        /// otherwise, when both <paramref name="doNormalization"/> and <c>normalizeSpace</c>
        /// are set, each space is replaced by a non-breaking space.
        /// </remarks>
        /// <param name="tokenText">Raw token text.</param>
        /// <param name="doNormalization">Whether space normalization may be applied.</param>
        /// <param name="charOffsetBegin">Value stored as the character begin offset.</param>
        /// <param name="charOffsetEnd">Value stored as the character end offset.</param>
        /// <returns>The newly created token.</returns>
        private CoreLabel MakeXmlToken(string tokenText, bool doNormalization, int charOffsetBegin, int charOffsetEnd)
        {
            CoreLabel xmlToken = new CoreLabel();
            xmlToken.SetOriginalText(tokenText);

            string surface = tokenText;
            if (separatorPattern.Matcher(tokenText).Matches())
            {
                // Map to CoreNLP newline token
                surface = AbstractTokenizer.NewlineToken;
            }
            else if (doNormalization && normalizeSpace)
            {
                // change space to non-breaking space
                surface = tokenText.Replace(' ', '\u00A0');
            }

            xmlToken.SetWord(surface);
            xmlToken.SetValue(surface);
            xmlToken.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charOffsetBegin);
            xmlToken.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charOffsetEnd);

            if (Verbose)
            {
                log.Info("Adding token " + xmlToken.ToShorterString());
            }
            return xmlToken;
        }
        /// <summary>
        /// Searches <paramref name="tree"/> recursively for a node whose begin and end index
        /// annotations equal <paramref name="span"/> and, if one is found, sets its value to
        /// <paramref name="value"/>.
        /// </summary>
        /// <param name="tree">Subtree to search; its label must be a <c>CoreLabel</c>.</param>
        /// <param name="span">Begin index (first) and end index (second) to look for.</param>
        /// <param name="value">New value for the matching node's label.</param>
        /// <returns>true if a node was relabelled; false if no node matched.</returns>
        public static bool SetSpanLabel(Tree tree, Pair <int, int> span, string value)
        {
            CoreLabel nodeLabel = tree.Label() as CoreLabel;
            if (nodeLabel == null)
            {
                throw new AssertionError("Expected CoreLabels");
            }

            var beginKey = typeof(CoreAnnotations.BeginIndexAnnotation);
            var endKey   = typeof(CoreAnnotations.EndIndexAnnotation);

            bool exactMatch = nodeLabel.Get(beginKey).Equals(span.first) &&
                              nodeLabel.Get(endKey).Equals(span.second);
            if (exactMatch)
            {
                nodeLabel.SetValue(value);
                return true;
            }

            // A node lying strictly inside the requested span cannot contain it,
            // so there is no point in descending further.
            bool strictlyInside = nodeLabel.Get(beginKey) > span.first &&
                                  nodeLabel.Get(endKey) < span.second;
            if (strictlyInside)
            {
                return false;
            }

            Tree[] kids = tree.Children();
            for (int k = 0; k < kids.Length; k++)
            {
                if (SetSpanLabel(kids[k], span, value))
                {
                    return true;
                }
            }
            return false;
        }
        /// <summary>
        /// Replaces the value and tag of each preterminal label in <paramref name="tree"/>
        /// with the alternate tag produced by a <c>FrenchMorphoFeatureSpecification</c>.
        /// </summary>
        /// <remarks>
        /// The morphological string is the leaf's original text when that is non-empty.
        /// Otherwise it is built from the preterminal's value followed by
        /// <c>-subcategory--</c>, or by <c>---</c> when the leaf has no category.
        /// Preterminals for which no alternate tag is produced are left unchanged.
        /// </remarks>
        /// <param name="tree">Tree whose leaf and preterminal labels are <c>CoreLabel</c>s.</param>
        private static void ReplacePOSTags(Tree tree)
        {
            IList<ILabel> words = tree.Yield();
            IList<ILabel> tags  = tree.PreTerminalYield();

            System.Diagnostics.Debug.Assert(words.Count == tags.Count);
            MorphoFeatureSpecification featureSpec = new FrenchMorphoFeatureSpecification();

            for (int pos = 0; pos < words.Count; pos++)
            {
                CoreLabel leaf = (CoreLabel)words[pos];

                // Morphological Analysis
                string analysis = leaf.OriginalText();
                if (string.IsNullOrEmpty(analysis))
                {
                    string tagValue = tags[pos].Value();
                    // POS subcategory
                    string subCategory = leaf.Category();
                    string suffix = string.IsNullOrEmpty(subCategory) ? "---" : "-" + subCategory + "--";
                    analysis = tagValue + suffix;
                }

                string altTag = featureSpec.StrToFeatures(analysis).GetAltTag();
                if (string.IsNullOrEmpty(altTag))
                {
                    continue;
                }

                CoreLabel tagLabel = (CoreLabel)tags[pos];
                tagLabel.SetValue(altTag);
                tagLabel.SetTag(altTag);
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Creates a <c>CoreLabel</c> whose word, value, text annotation and value
        /// annotation are all set to <paramref name="token"/>.
        /// </summary>
        /// <param name="token">Text to store in the label.</param>
        /// <returns>The newly created label.</returns>
        private static CoreLabel InitCoreLabel(string token)
        {
            var result = new CoreLabel();
            result.SetWord(token);
            result.SetValue(token);

            // The same text is also written to the two annotation keys explicitly.
            result.Set(typeof(CoreAnnotations.TextAnnotation), token);
            result.Set(typeof(CoreAnnotations.ValueAnnotation), token);
            return result;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Creates a copy of <paramref name="cl"/> that carries a different word part and
        /// different begin/end positions.
        /// </summary>
        /// <param name="cl">Label to copy.</param>
        /// <param name="part">Text stored as the copy's word, value and original text.</param>
        /// <param name="beginPosition">Begin position stored on the copy.</param>
        /// <param name="endPosition">End position stored on the copy.</param>
        /// <returns>The new label; <paramref name="cl"/> itself is not touched by this method's calls.</returns>
        private static CoreLabel CopyCoreLabel(CoreLabel cl, string part, int beginPosition, int endPosition)
        {
            var copy = new CoreLabel(cl);

            // Text of the part.
            copy.SetWord(part);
            copy.SetValue(part);

            // Location of the part.
            copy.SetBeginPosition(beginPosition);
            copy.SetEndPosition(endPosition);

            copy.Set(typeof(CoreAnnotations.OriginalTextAnnotation), part);
            return copy;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Creates a <c>CoreLabel</c> whose word and value are <paramref name="gloss"/>.
        /// </summary>
        /// <param name="gloss">Text of the word.</param>
        /// <param name="index">Index to store on the label; a negative value leaves the index unset.</param>
        /// <returns>The newly created label.</returns>
        protected internal virtual CoreLabel MkWord(string gloss, int index)
        {
            var wordLabel = new CoreLabel();
            wordLabel.SetWord(gloss);
            wordLabel.SetValue(gloss);

            if (index < 0)
            {
                return wordLabel;
            }
            wordLabel.SetIndex(index);
            return wordLabel;
        }
        /// <summary>Create a dummy word, just with a given word at a given index.</summary>
        /// <remarks>Mostly useful for making semantic graphs.</remarks>
        /// <param name="gloss">Text stored as both the word and the value of the label.</param>
        /// <param name="index">Index to store on the label; a negative value leaves the index unset.</param>
        /// <returns>The newly created label.</returns>
        public static CoreLabel MkWord(string gloss, int index)
        {
            var dummy = new CoreLabel();
            dummy.SetWord(gloss);
            dummy.SetValue(gloss);

            bool hasIndex = index >= 0;
            if (hasIndex)
            {
                dummy.SetIndex(index);
            }
            return dummy;
        }
Ejemplo n.º 9
0
 /// <summary>
 /// Builds a new <c>CoreLabel</c> describing the wrapped parse node.
 /// </summary>
 /// <remarks>
 /// A leaf yields a label with word, value and begin/end positions taken from the
 /// parse node.  Any other node yields a label with category and value set to the
 /// node type, plus the tag when <c>Depth()</c> is 1.  A fresh label is created on
 /// every call.
 /// </remarks>
 /// <returns>The newly created label.</returns>
 public override ILabel Label()
 {
     // TODO: move this CoreLabel construction logic somewhere appropriate
     var result = new CoreLabel();

     if (!this.parse.IsLeaf)
     {
         result.SetCategory(this.parse.Type);
         result.SetValue(this.parse.Type);
         if (this.Depth() == 1)
         {
             result.SetTag(this.parse.Type);
         }
         return result;
     }

     result.SetWord(this.parse.Value);
     result.SetBeginPosition(this.parse.Span.Start);
     result.SetEndPosition(this.parse.Span.End);
     result.SetValue(this.parse.Value);
     return result;
 }
        // Arbitrary test input.  Any text will do: the only goal is to have something
        // to segment on multiple threads so that the issue can be reproduced.
        private static IList <CoreLabel> CreateTestTokens()
        {
            const string sampleText = "你好,世界";

            var sample = new CoreLabel();
            sample.SetWord(sampleText);
            sample.SetValue(sampleText);
            sample.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
            sample.Set(typeof(CoreAnnotations.AnswerAnnotation), "0");

            // Single-element list holding the token built above.
            return new List <CoreLabel> { sample };
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Builds the initial parser <c>State</c> from a tagged sentence: one preterminal
        /// (tag node over word node) per input word.
        /// </summary>
        /// <remarks>
        /// A word that already is a <c>CoreLabel</c> is reused as the word label (and its
        /// index is overwritten); any other word is copied into a new <c>CoreLabel</c>.
        /// </remarks>
        /// <param name="words">Tagged words of the sentence.</param>
        /// <returns>A state created from the list of preterminal trees.</returns>
        /// <exception cref="System.ArgumentException">
        /// If a non-CoreLabel word does not implement <c>IHasTag</c>, or a word's tag is null.
        /// </exception>
        public static State InitialStateFromTaggedSentence <_T0>(IList <_T0> words)
            where _T0 : IHasWord
        {
            IList<Tree> preterminals = Generics.NewArrayList();

            for (int i = 0; i < words.Count; i++)
            {
                IHasWord  item      = words[i];
                CoreLabel wordLabel = item as CoreLabel;
                string    tag;

                if (wordLabel != null)
                {
                    tag = wordLabel.Tag();
                }
                else
                {
                    wordLabel = new CoreLabel();
                    wordLabel.SetValue(item.Word());
                    wordLabel.SetWord(item.Word());

                    IHasTag tagged = item as IHasTag;
                    if (tagged == null)
                    {
                        throw new ArgumentException("Expected tagged words");
                    }
                    tag = tagged.Tag();
                    wordLabel.SetTag(tag);
                }

                if (tag == null)
                {
                    throw new ArgumentException("Input word not tagged");
                }

                CoreLabel tagLabel = new CoreLabel();
                tagLabel.SetValue(tag);

                // Index from 1.  Tools downstream from the parser expect that.
                // Internally this parser uses the index, so incorrect indices on
                // an already-indexed label have to be overwritten.
                int oneBasedIndex = i + 1;
                wordLabel.SetIndex(oneBasedIndex);
                tagLabel.SetIndex(oneBasedIndex);

                LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
                LabeledScoredTreeNode tagNode  = new LabeledScoredTreeNode(tagLabel);
                tagNode.AddChild(wordNode);

                // Both labels point at the same head word and head tag labels.
                // TODO: can we get away with not setting these on the wordLabel?
                var headWordKey = typeof(TreeCoreAnnotations.HeadWordLabelAnnotation);
                var headTagKey  = typeof(TreeCoreAnnotations.HeadTagLabelAnnotation);
                wordLabel.Set(headWordKey, wordLabel);
                wordLabel.Set(headTagKey, tagLabel);
                tagLabel.Set(headWordKey, wordLabel);
                tagLabel.Set(headTagKey, tagLabel);

                preterminals.Add(tagNode);
            }
            return new State(preterminals);
        }
Ejemplo n.º 12
0
 /// <summary>Splits a compound marked by the lexer.</summary>
 /// <remarks>
 /// The parent annotation is removed from <paramref name="cl"/>, dashes in its word are
 /// padded with spaces, and the result is split into parts.  Each part becomes a copy of
 /// <paramref name="cl"/> with its word, value and original text set to that part, and is
 /// appended to <c>compoundBuffer</c>.
 /// NOTE(review): <c>Split("\\s+")</c> presumably has regex semantics supplied by a
 /// ported helper; confirm it does not resolve to a literal-separator split.
 /// </remarks>
 /// <param name="cl">The compound token.</param>
 /// <returns>The result of <c>compoundBuffer.Remove(0)</c>.</returns>
 private CoreLabel ProcessCompound(CoreLabel cl)
 {
     cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

     string   spacedWord = cl.Word().ReplaceAll("-", " - ");
     string[] pieces     = spacedWord.Split("\\s+");

     for (int k = 0; k < pieces.Length; k++)
     {
         string    piece      = pieces[k];
         CoreLabel pieceLabel = new CoreLabel(cl);
         pieceLabel.SetWord(piece);
         pieceLabel.SetValue(piece);
         pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
         compoundBuffer.Add(pieceLabel);
     }
     return compoundBuffer.Remove(0);
 }
Ejemplo n.º 13
0
        /// <summary>
        /// Creates a <c>CoreLabel</c> for the wrapped parse node; a new label is built on
        /// each call.
        /// </summary>
        /// <remarks>
        /// For a leaf the label carries the node value (as word and value) and the span
        /// start/end as begin/end positions.  For any other node it carries the node type
        /// as category and value, and additionally as tag when <c>Depth()</c> equals 1.
        /// </remarks>
        /// <returns>The newly created label.</returns>
        public override ILabel Label()
        {
            // TODO: move this CoreLabel construction logic somewhere appropriate
            var built = new CoreLabel();
            bool isLeaf = this.parse.IsLeaf;

            if (!isLeaf)
            {
                built.SetCategory(this.parse.Type);
                built.SetValue(this.parse.Type);

                bool directlyAboveLeaves = this.Depth() == 1;
                if (directlyAboveLeaves)
                {
                    built.SetTag(this.parse.Type);
                }
            }
            else
            {
                built.SetWord(this.parse.Value);
                built.SetBeginPosition(this.parse.Span.Start);
                built.SetEndPosition(this.parse.Span.End);
                built.SetValue(this.parse.Value);
            }
            return built;
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Recursively replaces every label in <paramref name="tree"/> that is not a
        /// <c>CoreLabel</c> with a new <c>CoreLabel</c> carrying the same value.
        /// </summary>
        /// <remarks>
        /// This probably isn't needed now; everything is always a core label, in which
        /// case the method changes nothing.
        /// </remarks>
        /// <param name="tree">Root of the subtree to convert.</param>
        private static void ConvertToCoreLabels(Tree tree)
        {
            ILabel existing = tree.Label();
            bool alreadyCoreLabel = existing is CoreLabel;

            if (!alreadyCoreLabel)
            {
                var replacement = new CoreLabel();
                replacement.SetValue(existing.Value());
                tree.SetLabel(replacement);
            }

            Tree[] kids = tree.Children();
            for (int k = 0; k < kids.Length; k++)
            {
                ConvertToCoreLabels(kids[k]);
            }
        }
        /// <summary>
        /// Creates a new tree node labelled <paramref name="label"/> that inherits the head
        /// word and head tag annotations from the label of <paramref name="top"/>.
        /// </summary>
        /// <param name="top">Tree whose <c>CoreLabel</c> supplies the head annotations.</param>
        /// <param name="label">Value for the new node's label.</param>
        /// <param name="children">Children added to the new node, in the given order.</param>
        /// <returns>The new <c>LabeledScoredTreeNode</c>.</returns>
        internal static Tree CreateNode(Tree top, string label, params Tree[] children)
        {
            CoreLabel source    = (CoreLabel)top.Label();
            CoreLabel nodeLabel = new CoreLabel();
            nodeLabel.SetValue(label);

            // Propagate the head information upwards.
            var headWordKey = typeof(TreeCoreAnnotations.HeadWordLabelAnnotation);
            var headTagKey  = typeof(TreeCoreAnnotations.HeadTagLabelAnnotation);
            nodeLabel.Set(headWordKey, source.Get(headWordKey));
            nodeLabel.Set(headTagKey, source.Get(headTagKey));

            Tree created = new LabeledScoredTreeNode(nodeLabel);
            for (int k = 0; k < children.Length; k++)
            {
                created.AddChild(children[k]);
            }
            return created;
        }
        /// <summary>Add a binary node to the existing node on top of the stack</summary>
        /// <remarks>
        /// The two topmost trees are popped (the first popped is the right child, the second
        /// the left child).  The child selected by <c>side</c> acts as head: its head word
        /// and head tag annotations are copied onto the label of a new node valued
        /// <c>label</c>.  That node receives both children and is pushed onto the stack.
        /// </remarks>
        /// <param name="state">State to apply the transition to.</param>
        /// <param name="scoreDelta">Amount added to the state's score.</param>
        /// <returns>A new state holding the updated stack, transition list and score.</returns>
        /// <exception cref="System.ArgumentException">
        /// If <c>side</c> is neither Left nor Right, or the head's label is not a <c>CoreLabel</c>.
        /// </exception>
        public virtual State Apply(State state, double scoreDelta)
        {
            TreeShapedStack<Tree> remaining = state.stack;

            Tree right = remaining.Peek();
            remaining = remaining.Pop();
            Tree left = remaining.Peek();
            remaining = remaining.Pop();

            Tree head;
            if (side == BinaryTransition.Side.Left)
            {
                head = left;
            }
            else if (side == BinaryTransition.Side.Right)
            {
                head = right;
            }
            else
            {
                throw new ArgumentException("Unknown side " + side);
            }

            CoreLabel headLabel = head.Label() as CoreLabel;
            if (headLabel == null)
            {
                throw new ArgumentException("Stack should have CoreLabel nodes");
            }

            CoreLabel production = new CoreLabel();
            production.SetValue(label);
            var headWordKey = typeof(TreeCoreAnnotations.HeadWordLabelAnnotation);
            var headTagKey  = typeof(TreeCoreAnnotations.HeadTagLabelAnnotation);
            production.Set(headWordKey, headLabel.Get(headWordKey));
            production.Set(headTagKey, headLabel.Get(headTagKey));

            Tree combined = new LabeledScoredTreeNode(production);
            combined.AddChild(left);
            combined.AddChild(right);
            remaining = remaining.Push(combined);

            return new State(remaining, state.transitions.Push(this), state.separators, state.sentence, state.tokenPosition, state.score + scoreDelta, false);
        }
        /// <summary>Remove everything but the skeleton, the predictions, and the labels</summary>
        /// <remarks>
        /// Each node of the result has a new <c>CoreLabel</c> holding only the predictions
        /// annotation and the value of the original label.  Nodes are created through the
        /// original tree's factory.
        /// </remarks>
        /// <param name="tree">Tree to simplify; it is read, not modified, by this method.</param>
        /// <returns>The simplified copy.</returns>
        private Tree SimplifyTree(Tree tree)
        {
            var slimLabel = new CoreLabel();
            slimLabel.Set(typeof(RNNCoreAnnotations.Predictions), RNNCoreAnnotations.GetPredictions(tree));
            slimLabel.SetValue(tree.Label().Value());

            if (tree.IsLeaf())
            {
                return tree.TreeFactory().NewLeaf(slimLabel);
            }

            Tree[] originalKids = tree.Children();
            IList<Tree> slimKids = Generics.NewArrayList(originalKids.Length);
            foreach (Tree kid in originalKids)
            {
                slimKids.Add(SimplifyTree(kid));
            }
            return tree.TreeFactory().NewTreeNode(slimLabel, slimKids);
        }
Ejemplo n.º 18
0
        /// <summary>Splits a compound marked by the lexer.</summary>
        /// <remarks>
        /// The parent annotation is removed from <paramref name="cl"/>.  Matches of
        /// <c>pDash</c> in its word are replaced by <c>" - "</c> and the result is split
        /// with <c>pSpace</c>.  Every part becomes a copy of <paramref name="cl"/> whose
        /// begin/end positions are offset from the compound's begin position by the total
        /// length of the preceding parts, and is appended to <c>compoundBuffer</c>.
        /// </remarks>
        /// <param name="cl">The compound token.</param>
        /// <returns>The result of <c>compoundBuffer.Remove(0)</c>.</returns>
        private CoreLabel ProcessCompound(CoreLabel cl)
        {
            cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

            string   spacedWord = pDash.Matcher(cl.Word()).ReplaceAll(" - ");
            string[] pieces     = pSpace.Split(spacedWord);

            // Number of characters covered by the parts emitted so far.
            int consumed = 0;
            for (int k = 0; k < pieces.Length; k++)
            {
                string piece       = pieces[k];
                int    pieceLength = piece.Length;

                CoreLabel pieceLabel = new CoreLabel(cl);
                pieceLabel.SetWord(piece);
                pieceLabel.SetValue(piece);
                pieceLabel.SetBeginPosition(cl.BeginPosition() + consumed);
                pieceLabel.SetEndPosition(cl.BeginPosition() + consumed + pieceLength);
                pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
                compoundBuffer.Add(pieceLabel);

                consumed += pieceLength;
            }
            return compoundBuffer.Remove(0);
        }
        /// <summary>
        /// Replaces the value and tag of a preterminal's label with the alternate tag that
        /// <paramref name="morpho"/> derives from the morphological analysis of its word.
        /// </summary>
        /// <remarks>
        /// The analysis string is the child's original text when that is non-empty.
        /// Otherwise it is the preterminal's value followed by <c>-subcategory--</c>, or by
        /// <c>---</c> when the child has no category.  Nothing changes when no alternate
        /// tag is produced.
        /// </remarks>
        /// <param name="t">A preterminal whose label and first child's label are <c>CoreLabel</c>s.</param>
        /// <param name="morpho">Specification used to parse the analysis string.</param>
        /// <exception cref="System.ArgumentException">
        /// If <paramref name="t"/> is not a preterminal or one of the two labels is not a <c>CoreLabel</c>.
        /// </exception>
        private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
        {
            if (!t.IsPreTerminal())
            {
                throw new ArgumentException("Can only operate on preterminals");
            }

            CoreLabel tagLabel = t.Label() as CoreLabel;
            if (tagLabel == null)
            {
                throw new ArgumentException("Only operates on CoreLabels");
            }

            Tree      wordNode  = t.Children()[0];
            CoreLabel wordLabel = wordNode.Label() as CoreLabel;
            if (wordLabel == null)
            {
                throw new ArgumentException("Only operates on CoreLabels");
            }

            // Morphological Analysis
            string analysis = wordLabel.OriginalText();
            if (string.IsNullOrEmpty(analysis))
            {
                string tagValue = tagLabel.Value();
                // POS subcategory
                string subCategory = wordLabel.Category();
                string suffix = string.IsNullOrEmpty(subCategory) ? "---" : "-" + subCategory + "--";
                analysis = tagValue + suffix;
            }

            string altTag = morpho.StrToFeatures(analysis).GetAltTag();
            if (!string.IsNullOrEmpty(altTag))
            {
                tagLabel.SetValue(altTag);
                tagLabel.SetTag(altTag);
            }
        }
Ejemplo n.º 20
0
        // static methods
        /// <summary>
        /// Sets the labels on the tree (except the leaves) to be the integer
        /// value of the sentiment prediction.
        /// </summary>
        /// <remarks>
        /// Makes it easy to print out with Tree.toString().  Children are relabelled
        /// before their parent.
        /// </remarks>
        /// <param name="tree">Tree whose non-leaf labels are <c>CoreLabel</c>s.</param>
        /// <exception cref="System.ArgumentException">
        /// If a non-leaf node carries a label that is not a <c>CoreLabel</c>.
        /// </exception>
        private static void SetSentimentLabels(Tree tree)
        {
            if (tree.IsLeaf())
            {
                return;
            }
            foreach (Tree child in tree.Children())
            {
                SetSentimentLabels(child);
            }
            ILabel label = tree.Label();

            if (!(label is CoreLabel))
            {
                throw new ArgumentException("Required a tree with CoreLabels");
            }
            CoreLabel cl = (CoreLabel)label;

            // Int32.ToString is an instance method in .NET (the static form is the Java
            // Integer.toString idiom).  The invariant culture keeps the digits and sign
            // independent of the current locale.
            // NOTE(review): assumes GetPredictedClass returns int - confirm.
            cl.SetValue(RNNCoreAnnotations.GetPredictedClass(tree).ToString(System.Globalization.CultureInfo.InvariantCulture));
        }
Ejemplo n.º 21
0
        /// <summary>
        /// Segments <paramref name="line"/> and returns one <c>CoreLabel</c> per token span
        /// reported by <c>IOBUtils.TokenSpansForIOB</c>.
        /// </summary>
        /// <remarks>
        /// Each token's word, value and text annotation hold the string produced by
        /// <c>IOBUtils.IOBToString</c> for its span.  Its original text is the substring of
        /// <paramref name="line"/> between the begin position of the span's first label and
        /// the end position of its last label, and those two positions are stored as the
        /// character offsets.
        /// </remarks>
        /// <param name="line">Text to segment.</param>
        /// <returns>The tokens in span order.</returns>
        public virtual IList <CoreLabel> SegmentStringToTokenList(string line)
        {
            IList<CoreLabel> tokens      = CollectionUtils.MakeList();
            IList<CoreLabel> iobSequence = SegmentStringToIOB(line);

            foreach (IntPair span in IOBUtils.TokenSpansForIOB(iobSequence))
            {
                int firstLabel = span.GetSource();
                int pastLast   = span.GetTarget();

                var    segment = new CoreLabel();
                string surface = IOBUtils.IOBToString(iobSequence, prefixMarker, suffixMarker, firstLabel, pastLast);
                segment.SetWord(surface);
                segment.SetValue(surface);
                segment.Set(typeof(CoreAnnotations.TextAnnotation), surface);
                segment.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");

                // Character range of the span within the input line.
                int startOffset = iobSequence[firstLabel].BeginPosition();
                int endOffset   = iobSequence[pastLast - 1].EndPosition();
                segment.SetOriginalText(Sharpen.Runtime.Substring(line, startOffset, endOffset));
                segment.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), startOffset);
                segment.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), endOffset);

                tokens.Add(segment);
            }
            return tokens;
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Builds a dependency from two plain strings.  Each string is wrapped in a new
        /// <c>CoreLabel</c> (value and word both set to it) and also kept as text.
        /// </summary>
        /// <param name="regent">Text of the governing word; must not be null.</param>
        /// <param name="dependent">Text of the dependent word; must not be null.</param>
        /// <exception cref="System.ArgumentException">If either argument is null.</exception>
        public UnnamedDependency(string regent, string dependent)
        {
            bool missingArgument = regent == null || dependent == null;
            if (missingArgument)
            {
                throw new ArgumentException("governor or dependent cannot be null");
            }

            CoreLabel governorLabel = new CoreLabel();
            governorLabel.SetValue(regent);
            governorLabel.SetWord(regent);

            CoreLabel dependentLabel = new CoreLabel();
            dependentLabel.SetValue(dependent);
            dependentLabel.SetWord(dependent);

            // Labels first, then the plain-text members, in the original assignment order.
            this._regent    = governorLabel;
            this._dependent = dependentLabel;
            RegentText      = regent;
            DependentText   = dependent;
        }
        /// <summary>Parse a sentence represented as a List of tokens.</summary>
        /// <remarks>
        /// Parse a sentence represented as a List of tokens.
        /// The text must already have been tokenized and
        /// normalized into tokens that are appropriate to the treebank
        /// which was used to train the parser.  The tokens can be of
        /// multiple types, and the list items need not be homogeneous as to type
        /// (in particular, only some words might be given tags):
        /// <ul>
        /// <li>If a token implements HasWord, then the word to be parsed is
        /// given by its word() value.</li>
        /// <li>If a token implements HasTag and the tag() value is not
        /// null or the empty String, then the parser is strongly advised to assign
        /// a part of speech tag that <i>begins</i> with this String.</li>
        /// </ul>
        /// The method resets the per-parse status fields, builds a working copy of the
        /// sentence, appends a boundary token, and then runs the PCFG, dependency and
        /// combined parsers as enabled by <c>op</c>.  Between parser runs it throws a
        /// RuntimeInterruptedException if <c>Thread.Interrupted()</c> reports true.
        /// </remarks>
        /// <param name="sentence">The sentence to parse</param>
        /// <returns>
        /// true if no enabled parser rejected the sentence; otherwise the value of
        /// <c>parseSucceeded</c>, which is still false at each early return.
        /// </returns>
        /// <exception cref="System.NotSupportedException">
        /// If the Sentence is too long or
        /// of zero length or the parse
        /// otherwise fails for resource reasons
        /// </exception>
        private bool ParseInternal <_T0>(IList <_T0> sentence)
            where _T0 : IHasWord
        {
            // Reset all per-parse status fields before doing anything else.
            parseSucceeded   = false;
            parseNoMemory    = false;
            parseUnparsable  = false;
            parseSkipped     = false;
            parseFallback    = false;
            whatFailed       = null;
            addedPunct       = false;
            originalSentence = sentence;
            int length = sentence.Count;

            if (length == 0)
            {
                parseSkipped = true;
                throw new NotSupportedException("Can't parse a zero-length sentence!");
            }
            // Working copy of the sentence that is handed to the parsers.
            IList <IHasWord> sentenceB;

            if (op.wordFunction != null)
            {
                // A word function is configured: build new token objects and apply the
                // function to those (presumably so the caller's tokens keep their
                // original words - confirm).
                sentenceB = Generics.NewArrayList();
                foreach (IHasWord word in originalSentence)
                {
                    if (word is ILabel)
                    {
                        // Labels are duplicated through their own label factory.
                        ILabel label    = (ILabel)word;
                        ILabel newLabel = label.LabelFactory().NewLabel(label);
                        if (newLabel is IHasWord)
                        {
                            sentenceB.Add((IHasWord)newLabel);
                        }
                        else
                        {
                            throw new AssertionError("This should have been a HasWord");
                        }
                    }
                    else
                    {
                        // Non-label tokens are rebuilt as TaggedWord (when tagged) or Word.
                        if (word is IHasTag)
                        {
                            TaggedWord tw = new TaggedWord(word.Word(), ((IHasTag)word).Tag());
                            sentenceB.Add(tw);
                        }
                        else
                        {
                            sentenceB.Add(new Word(word.Word()));
                        }
                    }
                }
                // Rewrite the word of every new token with the configured function.
                foreach (IHasWord word_1 in sentenceB)
                {
                    word_1.SetWord(op.wordFunction.Apply(word_1.Word()));
                }
            }
            else
            {
                // No word function: a new list holding the same token objects.
                sentenceB = new List <IHasWord>(sentence);
            }
            if (op.testOptions.addMissingFinalPunctuation)
            {
                addedPunct = AddSentenceFinalPunctIfNeeded(sentenceB, length);
            }
            // The limit is checked against the original token count, captured before
            // any punctuation or boundary token was appended.
            if (length > op.testOptions.maxLength)
            {
                parseSkipped = true;
                throw new NotSupportedException("Sentence too long: length " + length);
            }
            TreePrint   treePrint = GetTreePrint();
            PrintWriter pwOut     = op.tlpParams.Pw();

            // Insert the boundary symbol, using the same token kind as the first input token.
            if (sentence[0] is CoreLabel)
            {
                CoreLabel boundary = new CoreLabel();
                boundary.SetWord(LexiconConstants.Boundary);
                boundary.SetValue(LexiconConstants.Boundary);
                boundary.SetTag(LexiconConstants.BoundaryTag);
                boundary.SetIndex(sentence.Count + 1);
                //1-based indexing used in the parser
                sentenceB.Add(boundary);
            }
            else
            {
                sentenceB.Add(new TaggedWord(LexiconConstants.Boundary, LexiconConstants.BoundaryTag));
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doPCFG)
            {
                if (!pparser.Parse(sentenceB))
                {
                    // parseSucceeded is still false here.
                    return(parseSucceeded);
                }
                if (op.testOptions.verbose)
                {
                    pwOut.Println("PParser output");
                    // Alternative that prints scores on nodes:
                    // getBestPCFGParse(false).pennPrint(pwOut);
                    // The call below prints without scores on nodes.
                    treePrint.PrintTree(GetBestPCFGParse(false), pwOut);
                }
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doDep && !op.testOptions.useFastFactored)
            {
                if (!dparser.Parse(sentenceB))
                {
                    // parseSucceeded is still false here.
                    return(parseSucceeded);
                }
                // cdm nov 2006: should move these printing bits to the main printing section,
                // so don't calculate the best parse twice!
                if (op.testOptions.verbose)
                {
                    pwOut.Println("DParser output");
                    treePrint.PrintTree(dparser.GetBestParse(), pwOut);
                }
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doPCFG && op.doDep)
            {
                if (!bparser.Parse(sentenceB))
                {
                    // parseSucceeded is still false here.
                    return(parseSucceeded);
                }
                else
                {
                    parseSucceeded = true;
                }
            }
            // NOTE(review): parseSucceeded is only set to true when both doPCFG and
            // doDep are enabled, yet true is returned in every other non-failing
            // configuration as well - confirm callers do not rely on the field alone.
            return(true);
        }
 /// <summary>
 /// Normalizes a whole tree: prunes and splices nodes, moves morphological analyses
 /// from leaf values into the labels' original text, copies preterminal values into
 /// tags, splices dummy preterminals over bare leaves, applies the optional PRD/NP-SBJ
 /// relabelling and finally wraps the result under <c>rootLabel</c>.
 /// </summary>
 /// <param name="tree">Tree to normalize.</param>
 /// <param name="tf">Factory used for pruning, splicing and for the nodes created here.</param>
 /// <returns>
 /// The normalized tree, or null when unwrapping the empty-valued enclosing nodes leaves
 /// nothing (see the comment before the final loop).
 /// </returns>
 public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
 {
     tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);
     foreach (Tree t in tree)
     {
         if (t.IsLeaf())
         {
             //Strip off morphological analyses and place them in the OriginalTextAnnotation, which is
             //specified by HasContext.
             if (t.Value().Contains(MorphoFeatureSpecification.MorphoMark))
             {
                 string[] toks = t.Value().Split(MorphoFeatureSpecification.MorphoMark);
                 if (toks.Length != 2)
                 {
                     // .NET composite formatting uses {n} placeholders; Java-style %s
                     // would be printed literally and the arguments dropped.
                     log.Err(string.Format("{0}: Word contains malformed morph annotation: {1}", this.GetType().FullName, t.Value()));
                 }
                 else
                 {
                     if (t.Label() is CoreLabel)
                     {
                         CoreLabel cl = (CoreLabel)t.Label();
                         cl.SetValue(string.Intern(toks[0].Trim()));
                         cl.SetWord(string.Intern(toks[0].Trim()));
                         Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(toks[0], toks[1]);
                         string lemma         = lemmaMorph.First();
                         string morphAnalysis = lemmaMorph.Second();
                         if (lemma.Equals(toks[0]))
                         {
                             // Lemma equals the word part: store the raw analysis.
                             cl.SetOriginalText(string.Intern(toks[1].Trim()));
                         }
                         else
                         {
                             // TODO(spenceg): Does this help?
                             string newLemma = lexMapper.Map(null, lemma);
                             if (newLemma == null || newLemma.Trim().IsEmpty())
                             {
                                 // Mapping produced nothing usable; keep the lemma as is.
                                 newLemma = lemma;
                             }
                             string newMorphAnalysis = newLemma + MorphoFeatureSpecification.LemmaMark + morphAnalysis;
                             cl.SetOriginalText(string.Intern(newMorphAnalysis));
                         }
                     }
                     else
                     {
                         log.Error(string.Format("{0}: Cannot store morph analysis in non-CoreLabel: {1}", this.GetType().FullName, t.Label().GetType().FullName));
                     }
                 }
             }
         }
         else
         {
             if (t.IsPreTerminal())
             {
                 if (t.Value() == null || t.Value().IsEmpty())
                 {
                     log.Warn(string.Format("{0}: missing tag for {1}", this.GetType().FullName, t.PennString()));
                 }
                 else
                 {
                     // Mirror the node value into the tag when the label supports tags.
                     if (t.Label() is IHasTag)
                     {
                         ((IHasTag)t.Label()).SetTag(t.Value());
                     }
                 }
             }
             else
             {
                 //Phrasal nodes
                 // there are some nodes "/" missing preterminals.  We'll splice in a tag for these.
                 int          nk      = t.NumChildren();
                 IList <Tree> newKids = new List <Tree>(nk);
                 for (int j = 0; j < nk; j++)
                 {
                     Tree child = t.GetChild(j);
                     if (child.IsLeaf())
                     {
                         log.Warn(string.Format("{0}: Splicing in DUMMYTAG for {1}", this.GetType().FullName, t.ToString()));
                         newKids.Add(tf.NewTreeNode("DUMMYTAG", Java.Util.Collections.SingletonList(child)));
                     }
                     else
                     {
                         newKids.Add(child);
                     }
                 }
                 t.SetChildren(newKids);
             }
         }
     }
     //Every node in the tree has now been processed
     //
     // Additional processing for specific phrasal annotations
     //
     // special global coding for moving PRD annotation from constituent to verb tag.
     if (markPRDverb)
     {
         TregexMatcher m     = prdVerbPattern.Matcher(tree);
         Tree          match = null;
         while (m.Find())
         {
             // Skip consecutive matches on the same node so it is only marked once.
             if (m.GetMatch() != match)
             {
                 match = m.GetMatch();
                 match.Label().SetValue(match.Label().Value() + "-PRDverb");
                 Tree prd = m.GetNode("prd");
                 prd.Label().SetValue(base.NormalizeNonterminal(prd.Label().Value()));
             }
         }
     }
     //Mark *only* subjects in verb-initial clauses
     if (retainNPSbj)
     {
         TregexMatcher m = npSbjPattern.Matcher(tree);
         while (m.Find())
         {
             Tree match = m.GetMatch();
             match.Label().SetValue("NP");
         }
     }
     if (tree.IsPreTerminal())
     {
         // The whole tree is a bare tag: bad!
         string val = tree.Label().Value();
         // Ordinal prefix test: tag names are identifiers, not linguistic text.
         if (val.Equals("CC") || val.StartsWith("PUNC", System.StringComparison.Ordinal) || val.Equals("CONJ"))
         {
             log.Warn(string.Format("{0}: Bare tagged word being wrapped in FRAG {1}", this.GetType().FullName, tree.PennString()));
             tree = tf.NewTreeNode("FRAG", Java.Util.Collections.SingletonList(tree));
         }
         else
         {
             log.Warn(string.Format("{0}: Bare tagged word {1}", this.GetType().FullName, tree.PennString()));
         }
     }
     //Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
     //If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
     //will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
     while (tree != null && (tree.Value() == null || tree.Value().IsEmpty()) && tree.NumChildren() <= 1)
     {
         tree = tree.FirstChild();
     }
     if (tree != null && !tree.Value().Equals(rootLabel))
     {
         tree = tf.NewTreeNode(rootLabel, Java.Util.Collections.SingletonList(tree));
     }
     return(tree);
 }
Ejemplo n.º 25
0
        /*/**
   * Simple tree reading utility method.  Given a tree formatted as a PTB string, returns a Tree made by a specific TreeFactory.
   #1#
  public static Tree readTree(string ptbTreeString, TreeFactory treeFactory) {
    try {
      PennTreeReader ptr = new PennTreeReader(new StringReader(ptbTreeString), treeFactory);
      return ptr.readTree();
    } catch (IOException ex) {
      throw new SystemException(ex);
    }
  }*/

        /**
   * Simple tree reading utility method.  Given a tree formatted as a PTB string, returns a Tree made by the default TreeFactory (LabeledScoredTreeFactory)
   */
        /*public static Tree readTree(string str) {
    return readTree(str, defaultTreeFactory);
  }*/

        /// <summary>
        /// Walks the tree top-down and swaps every label that is not already a
        /// <c>CoreLabel</c> for a new <c>CoreLabel</c> holding the same value.
        /// </summary>
        /// <remarks>
        /// The conversion is needed because additional information, such as the
        /// token span, is stored in the <c>CoreLabel</c>.
        /// </remarks>
        /// <param name="tree">Root of the (sub)tree whose labels are converted in place.</param>
        public static void ConvertToCoreLabels(Tree tree)
        {
            ILabel existing      = tree.Label();
            bool   isCoreAlready = existing is CoreLabel;

            if (!isCoreAlready)
            {
                // Only the label's value is carried over to the replacement.
                CoreLabel replacement = new CoreLabel();
                replacement.SetValue(existing.Value());
                tree.SetLabel(replacement);
            }

            // Recurse into each child so the whole subtree ends up converted.
            foreach (Tree child in tree.Children())
            {
                ConvertToCoreLabels(child);
            }
        }
        /// <summary>
        /// Rewrites the leaf labels of <paramref name="tree"/> in place, optionally
        /// replacing each leaf with its lemma and/or appending the lemma and the
        /// morphological analysis to the leaf text.
        /// </summary>
        /// <param name="tree">Tree whose yield must consist solely of <c>CoreLabel</c> instances.</param>
        /// <param name="lemmasAsLeaves">
        /// When true, the word, value and lemma of every leaf are set to the
        /// (PTB-escaped) lemma, or to the surface word if the leaf has no lemma.
        /// </param>
        /// <param name="addMorphoToLeaves">
        /// When true, every leaf becomes
        /// <c>value + MorphoMark + lemma + LemmaMark + analysis</c>, where the analysis is
        /// read from the label's original text.
        /// </param>
        /// <exception cref="ArgumentException">A label in the yield is not a <c>CoreLabel</c>.</exception>
        public static void MungeLeaves(Tree tree, bool lemmasAsLeaves, bool addMorphoToLeaves)
        {
            foreach (ILabel label in tree.Yield())
            {
                // Counted before validation, so a rejected label is still tallied.
                ++nTokens;

                CoreLabel coreLabel = label as CoreLabel;
                if (coreLabel == null)
                {
                    throw new ArgumentException("Only works with CoreLabels trees");
                }

                // PTB escaping since we're going to put this in the leaf
                string lemma = coreLabel.Lemma();
                if (lemma == null)
                {
                    // No lemma, so just add the surface form
                    lemma = coreLabel.Word();
                }
                else if (lemma.Equals("("))
                {
                    lemma = "-LRB-";
                }
                else if (lemma.Equals(")"))
                {
                    lemma = "-RRB-";
                }

                if (lemmasAsLeaves)
                {
                    coreLabel.SetWord(lemma);
                    coreLabel.SetValue(lemma);
                    coreLabel.SetLemma(lemma);
                }

                if (addMorphoToLeaves)
                {
                    string morphStr = coreLabel.OriginalText();
                    if (string.IsNullOrEmpty(morphStr))
                    {
                        morphStr = MorphoFeatureSpecification.NoAnalysis;
                    }
                    else
                    {
                        ++nMorphAnalyses;
                    }

                    // Normalize punctuation analyses
                    if (morphStr.StartsWith("PONCT", StringComparison.Ordinal))
                    {
                        morphStr = "PUNC";
                    }

                    // .NET composite formatting uses {n} placeholders; the Java-style "%s"
                    // pattern would be emitted verbatim instead of being substituted.
                    string newLeaf = string.Format("{0}{1}{2}{3}{4}", coreLabel.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, morphStr);
                    coreLabel.SetValue(newLeaf);
                    coreLabel.SetWord(newLeaf);
                }
            }
        }
        // static class
        /// <summary>
        /// Builds a tree for one sentence from a parent-pointer encoding and labels
        /// every node with a sentiment class derived from its phrase score.
        /// </summary>
        /// <param name="parentPointers">
        /// For each node index, the index of its parent node; the value -1 marks the root.
        /// </param>
        /// <param name="sentence">The words of the sentence; word <c>i</c> becomes the leaf under node <c>i</c>.</param>
        /// <param name="phraseIds">
        /// Lookup from a phrase (list of words) to its phrase id. The version of the phrase
        /// with parentheses transformed by <c>TransformParens</c> is tried first, then the raw words.
        /// NOTE(review): keys are lists, so lookups only succeed by content if the dictionary
        /// was built with a sequence-equality comparer — confirm at the caller.
        /// </param>
        /// <param name="sentimentScores">Lookup from phrase id to sentiment score (presumably in [0, 1] — verify).</param>
        /// <param name="escaper">Used to escape the text of each leaf after labelling.</param>
        /// <param name="numClasses">Number of sentiment classes; labels are clamped to <c>numClasses - 1</c>.</param>
        /// <returns>The root after all tregex/tsurgeon pattern pairs have been applied; null if no root was found.</returns>
        /// <exception cref="InvalidOperationException">
        /// More than one root is present, or a phrase id or sentiment score cannot be found.
        /// </exception>
        public static Tree ConvertTree(IList <int> parentPointers, IList <string> sentence, IDictionary <IList <string>, int> phraseIds, IDictionary <int, double> sentimentScores, PTBEscapingProcessor escaper, int numClasses)
        {
            int maxNode = 0;

            foreach (int parent in parentPointers)
            {
                maxNode = Math.Max(maxNode, parent);
            }

            // Nodes [0, sentence.Count) are preterminals over a single word leaf;
            // the remaining nodes up to maxNode are empty internal nodes.
            Tree[] subtrees = new Tree[maxNode + 1];
            for (int i = 0; i < sentence.Count; ++i)
            {
                CoreLabel word = new CoreLabel();
                word.SetValue(sentence[i]);
                Tree leaf = new LabeledScoredTreeNode(word);
                subtrees[i] = new LabeledScoredTreeNode(new CoreLabel());
                subtrees[i].AddChild(leaf);
            }
            for (int i = sentence.Count; i <= maxNode; ++i)
            {
                subtrees[i] = new LabeledScoredTreeNode(new CoreLabel());
            }

            bool[] connected = new bool[maxNode + 1];
            Tree   root      = null;

            for (int index = 0; index < parentPointers.Count; ++index)
            {
                if (parentPointers[index] == -1)
                {
                    if (root != null)
                    {
                        throw new InvalidOperationException("Found two roots for sentence " + string.Join(" ", sentence));
                    }
                    root = subtrees[index];
                }
                else
                {
                    // Walk up the tree structure to make sure that leftmost
                    // phrases are added first.  Otherwise, if the numbers are
                    // inverted, we might get the right phrase added to a parent
                    // first, resulting in "case zero in this", for example,
                    // instead of "in this case zero"
                    // Note that because we keep track of which ones are already
                    // connected, we process this at most once per parent, so the
                    // overall construction time is still efficient.
                    Connect(parentPointers, subtrees, connected, index);
                }
            }

            for (int node = 0; node <= maxNode; ++node)
            {
                IList <Tree>   leaves = subtrees[node].GetLeaves();
                IList <string> words  = CollectionUtils.TransformAsList(leaves, TransformTreeToWord);
                // First we look for a copy of the phrase with -LRB- -RRB-
                // instead of ().  The sentiment trees sometimes have both, and
                // the escaped versions seem to have more reasonable scores.
                // If a particular phrase doesn't have -LRB- -RRB- we fall back
                // to the unescaped versions.
                IList <string> escapedWords = CollectionUtils.TransformAsList(words, TransformParens);

                // TryGetValue is required here: the indexer throws on a missing key and an
                // int can never compare equal to null, so the fallback would be unreachable.
                int phraseId;
                if (!phraseIds.TryGetValue(escapedWords, out phraseId) && !phraseIds.TryGetValue(words, out phraseId))
                {
                    throw new InvalidOperationException("Could not find phrase id for phrase " + string.Join(" ", sentence));
                }

                // TODO: should we make this an option?  Perhaps we want cases
                // where the trees have the phrase id and not their class
                double score;
                if (!sentimentScores.TryGetValue(phraseId, out score))
                {
                    throw new InvalidOperationException("Could not find sentiment score for phrase id " + phraseId);
                }

                // TODO: make this a numClasses option
                // Bucket the score into a class index; the top edge is folded into the last class.
                int classLabel = (int)Math.Floor(score * numClasses);
                if (classLabel > numClasses - 1)
                {
                    classLabel = numClasses - 1;
                }
                subtrees[node].Label().SetValue(classLabel.ToString(System.Globalization.CultureInfo.InvariantCulture));
            }

            // Escape the leaf text only after the phrase lookups, which use the raw words.
            for (int i = 0; i < sentence.Count; ++i)
            {
                Tree leaf = subtrees[i].Children()[0];
                leaf.Label().SetValue(escaper.EscapeString(leaf.Label().Value()));
            }

            for (int p = 0; p < tregexPatterns.Length; ++p)
            {
                root = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPattern(tregexPatterns[p], tsurgeonPatterns[p], root);
            }
            return(root);
        }
Ejemplo n.º 28
0
        /// <summary>
        /// Assembles a one-sentence <c>Annotation</c> from parallel per-token lists
        /// and two dependency-graph builders.
        /// </summary>
        /// <param name="docid">Document id; "???" is stored when absent.</param>
        /// <param name="sentenceIndex">Sentence index; -1 is stored when absent.</param>
        /// <param name="gloss">Sentence text; literal "\n" and "\t" escape sequences are expanded.</param>
        /// <param name="tree">Builds the graph stored as basic, collapsed and CC-processed dependencies.</param>
        /// <param name="maltTree">Builds the graph stored as the alternative dependencies.</param>
        /// <param name="words">Token surface forms.</param>
        /// <param name="lemmas">Token lemmas; must be as long as <paramref name="words"/>.</param>
        /// <param name="pos">Token part-of-speech tags; must be as long as <paramref name="words"/>.</param>
        /// <param name="ner">Token NER tags; must be as long as <paramref name="words"/>.</param>
        /// <param name="sentenceid">Sentence id, used only in error messages.</param>
        /// <exception cref="ArgumentException">One of the per-token lists differs in length from <paramref name="words"/>.</exception>
        private static Annotation ParseSentence(Optional <string> docid, Optional <int> sentenceIndex, string gloss, Func <IList <CoreLabel>, SemanticGraph> tree, Func <IList <CoreLabel>, SemanticGraph> maltTree, IList <string> words, IList <string> lemmas, IList <string> pos, IList <string> ner, Optional <string> sentenceid)
        {
            // Error checks: every per-token column must line up with the words.
            foreach (IList <string> column in new[] { lemmas, pos, ner })
            {
                if (column.Count != words.Count)
                {
                    throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + column.Count + " (sentence " + sentenceid.OrElse("???") + ")");
                }
            }

            // Resolve the optional identifiers once; they are reused on every level.
            var resolvedDocId     = docid.OrElse("???");
            var resolvedSentIndex = sentenceIndex.OrElse(-1);

            // Create structure
            int wordCount = words.Count;
            IList <CoreLabel> tokenList = new List <CoreLabel>(wordCount);
            int offset = 0;

            for (int idx = 0; idx < wordCount; ++idx)
            {
                string    surface = words[idx];
                int       endChar = offset + surface.Length;
                CoreLabel tok     = new CoreLabel(12);

                tok.SetWord(surface);
                tok.SetValue(surface);
                tok.SetBeginPosition(offset);
                tok.SetEndPosition(endChar);
                // Offsets assume tokens are separated by exactly one character.
                offset = endChar + 1;
                tok.SetLemma(lemmas[idx]);
                tok.SetTag(pos[idx]);
                tok.SetNER(ner[idx]);
                tok.Set(typeof(CoreAnnotations.DocIDAnnotation), resolvedDocId);
                tok.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), resolvedSentIndex);
                tok.Set(typeof(CoreAnnotations.IndexAnnotation), idx + 1);
                tok.Set(typeof(CoreAnnotations.TokenBeginAnnotation), idx);
                tok.Set(typeof(CoreAnnotations.TokenEndAnnotation), idx + 1);
                tokenList.Add(tok);
            }

            // Expand escaped newlines and tabs in the gloss.
            gloss = gloss.Replace("\\n", "\n").Replace("\\t", "\t");

            // Sentence-level map
            ICoreMap sentenceMap = new ArrayCoreMap(16);
            sentenceMap.Set(typeof(CoreAnnotations.TokensAnnotation), tokenList);

            // The same graph backs all three primary dependency views.
            SemanticGraph primaryGraph = tree.Apply(tokenList);
            sentenceMap.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), primaryGraph);
            sentenceMap.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), primaryGraph);
            sentenceMap.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), primaryGraph);

            SemanticGraph alternativeGraph = maltTree.Apply(tokenList);
            sentenceMap.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), alternativeGraph);
            sentenceMap.Set(typeof(CoreAnnotations.DocIDAnnotation), resolvedDocId);
            sentenceMap.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), resolvedSentIndex);
            sentenceMap.Set(typeof(CoreAnnotations.TextAnnotation), gloss);
            sentenceMap.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
            sentenceMap.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokenList.Count);

            // Document-level wrapper around the single sentence
            Annotation document = new Annotation(gloss);
            document.Set(typeof(CoreAnnotations.TokensAnnotation), tokenList);
            document.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentenceMap));
            document.Set(typeof(CoreAnnotations.DocIDAnnotation), resolvedDocId);
            document.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), resolvedSentIndex);
            return(document);
        }
        /// <summary>
        /// Reads inline-tagged text line by line and converts it into sentences of labelled tokens.
        /// </summary>
        /// <remarks>
        /// Each input line is either <c>id&lt;TAB&gt;text</c> or just <c>text</c>; in the latter case the
        /// zero-based line number is used as the id. A token of the form <c>&lt;CATEGORY&gt;</c> switches
        /// the current label to that category and <c>&lt;/CATEGORY&gt;</c> switches it back to the
        /// background symbol "O". The tag tokens themselves are not added to the sentence.
        /// The current label is reset at the start of every line, not at every sentence.
        /// </remarks>
        /// <param name="reader">Source of input lines; it is not closed by this method.</param>
        /// <param name="categoriesAllowed">
        /// Category names recognised as tags. They are joined with "|" into a regular expression
        /// without escaping, so names containing regex metacharacters change the pattern.
        /// </param>
        /// <param name="setClassForTheseLabels">
        /// Optional lookup from label to the annotation key under which that label is stored on the token; may be null.
        /// </param>
        /// <param name="setGoldClass">When true, the current label is stored as the gold answer of each token.</param>
        /// <param name="sentIDprefix">Prefix prepended to every line id.</param>
        /// <returns>One map per sentence, with its text, tokens and a doc id of the form <c>prefix + id + "-" + sentenceNumber</c>.</returns>
        /// <exception cref="System.IO.IOException"/>
        public static IList <ICoreMap> ParseFile(BufferedReader reader, ICollection <string> categoriesAllowed, IDictionary <string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
        {
            string           categoryAlternation = StringUtils.Join(categoriesAllowed, "|");
            Pattern          startingLabelToken  = Pattern.Compile("<(" + categoryAlternation + ")>");
            Pattern          endLabelToken       = Pattern.Compile("</(" + categoryAlternation + ")>");
            string           backgroundSymbol    = "O";
            IList <ICoreMap> sentences           = new List <ICoreMap>();
            int    lineNum = -1;
            string l;

            while ((l = reader.ReadLine()) != null)
            {
                lineNum++;

                // Split on the first tab only; a count of 2 always yields one or two parts.
                string[] t = l.Split(new[] { '\t' }, 2);
                string   id;
                string   text;
                if (t.Length == 2)
                {
                    id   = t[0];
                    text = t[1];
                }
                else
                {
                    text = t[0];
                    id   = lineNum.ToString();
                }
                id = sentIDprefix + id;

                DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
                PTBTokenizer.PTBTokenizerFactory <CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
                dp.SetTokenizerFactory(tokenizerFactory);

                string label   = backgroundSymbol;
                int    sentNum = -1;
                foreach (IList <IHasWord> sentence in dp)
                {
                    sentNum++;
                    System.Text.StringBuilder sentText = new System.Text.StringBuilder();
                    IList <CoreLabel>         sent     = new List <CoreLabel>();

                    foreach (IHasWord tokw in sentence)
                    {
                        string tok = tokw.Word();

                        Matcher startingMatcher = startingLabelToken.Matcher(tok);
                        if (startingMatcher.Matches())
                        {
                            // Opening tag: following tokens carry this category.
                            label = startingMatcher.Group(1);
                            continue;
                        }
                        if (endLabelToken.Matcher(tok).Matches())
                        {
                            // Closing tag: fall back to the background label.
                            label = backgroundSymbol;
                            continue;
                        }

                        sentText.Append(' ').Append(tok);

                        CoreLabel c = new CoreLabel();
                        c.SetWord(tok);
                        c.SetLemma(tok);
                        c.SetValue(tok);
                        c.Set(typeof(CoreAnnotations.TextAnnotation), tok);
                        c.Set(typeof(CoreAnnotations.OriginalTextAnnotation), tok);
                        if (setGoldClass)
                        {
                            c.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                        }

                        // Key lookup; IDictionary.Contains would test for a key/value pair instead.
                        Type labelClass;
                        if (setClassForTheseLabels != null && setClassForTheseLabels.TryGetValue(label, out labelClass))
                        {
                            c.Set(labelClass, label);
                        }
                        sent.Add(c);
                    }

                    ICoreMap sentcm = new ArrayCoreMap();
                    sentcm.Set(typeof(CoreAnnotations.TextAnnotation), sentText.ToString().Trim());
                    sentcm.Set(typeof(CoreAnnotations.TokensAnnotation), sent);
                    sentcm.Set(typeof(CoreAnnotations.DocIDAnnotation), id + "-" + sentNum);
                    sentences.Add(sentcm);
                }
            }
            return(sentences);
        }