Example #1
0
 /// <summary>
 /// Back-fills word-level information on every datum from
 /// <paramref name="wordStartIndex"/> through the end of the list.
 /// </summary>
 /// <remarks>
 /// Each datum in that range receives its zero-based offset from
 /// <paramref name="wordStartIndex"/> as its index and <paramref name="currentWord"/>
 /// as its word. This is done post hoc because the CoreLabel lists coming from
 /// testing data sets are pre-segmented, so treating each of those CoreLabels as a
 /// "word" would let the classifier cheat by just looking at whether it is at the
 /// beginning of a "word".
 /// </remarks>
 /// <param name="iobList">The datums built so far; entries from <paramref name="wordStartIndex"/> onward are updated in place.</param>
 /// <param name="currentWord">The word text to store on each updated datum.</param>
 /// <param name="wordStartIndex">Position in <paramref name="iobList"/> of the first datum to update.</param>
 private static void FillInWordStatistics(IList <CoreLabel> iobList, string currentWord, int wordStartIndex)
 {
     int cursor         = wordStartIndex;
     int positionInWord = 0;

     while (cursor < iobList.Count)
     {
         CoreLabel datum = iobList[cursor];
         datum.SetIndex(positionInWord);
         datum.SetWord(currentWord);
         cursor++;
         positionInWord++;
     }
 }
Example #2
0
        /// <summary>
        /// Builds a new CoreLabel whose word and value are both <paramref name="gloss"/>.
        /// </summary>
        /// <param name="gloss">Text stored as both the word and the value of the label.</param>
        /// <param name="index">Index to assign to the label; a negative value leaves the index unset.</param>
        /// <returns>The newly created label.</returns>
        protected internal virtual CoreLabel MkWord(string gloss, int index)
        {
            CoreLabel label = new CoreLabel();
            label.SetWord(gloss);
            label.SetValue(gloss);

            if (index < 0)
            {
                // Negative index: return the label without assigning an index.
                return label;
            }
            label.SetIndex(index);
            return label;
        }
        /// <summary>Creates a dummy word: a fresh CoreLabel carrying only a word, a value and optionally an index.</summary>
        /// <remarks>
        /// Mostly useful for making semantic graphs.
        /// </remarks>
        /// <param name="gloss">Text stored as both the word and the value of the label.</param>
        /// <param name="index">Index to assign to the label; it is only assigned when non-negative.</param>
        /// <returns>The newly created label.</returns>
        public static CoreLabel MkWord(string gloss, int index)
        {
            bool      hasIndex = index >= 0;
            CoreLabel dummy    = new CoreLabel();

            dummy.SetWord(gloss);
            dummy.SetValue(gloss);
            if (hasIndex)
            {
                dummy.SetIndex(index);
            }
            return dummy;
        }
Example #4
0
        /// <summary>
        /// Builds the initial parser state from a list of tagged words by creating one
        /// preterminal (a tag node with a single word node child) per input word.
        /// </summary>
        /// <remarks>
        /// A word that already is a CoreLabel is reused as the word label and its index is
        /// overwritten. Any other word is copied into a fresh CoreLabel and must implement
        /// IHasTag. Word and tag labels are indexed from 1, and both receive the head-word
        /// and head-tag label annotations.
        /// </remarks>
        /// <param name="words">The tagged words of the sentence, in order.</param>
        /// <returns>A State constructed from the list of preterminal trees.</returns>
        /// <exception cref="System.ArgumentException">
        /// If a non-CoreLabel word does not implement IHasTag, or if a word's tag is null.
        /// </exception>
        public static State InitialStateFromTaggedSentence <_T0>(IList <_T0> words)
            where _T0 : IHasWord
        {
            IList <Tree> preterminals = Generics.NewArrayList();
            int          position     = 0;

            while (position < words.Count)
            {
                IHasWord  current   = words[position];
                CoreLabel wordLabel = current as CoreLabel;
                string    tag;

                if (wordLabel != null)
                {
                    // Already a CoreLabel: reuse it directly and read its tag.
                    tag = wordLabel.Tag();
                }
                else
                {
                    // Copy the word into a fresh label; the tag must come from IHasTag.
                    wordLabel = new CoreLabel();
                    wordLabel.SetValue(current.Word());
                    wordLabel.SetWord(current.Word());
                    IHasTag tagged = current as IHasTag;
                    if (tagged == null)
                    {
                        throw new ArgumentException("Expected tagged words");
                    }
                    tag = tagged.Tag();
                    wordLabel.SetTag(tag);
                }
                if (tag == null)
                {
                    throw new ArgumentException("Input word not tagged");
                }

                CoreLabel tagLabel = new CoreLabel();
                tagLabel.SetValue(tag);

                // Index from 1.  Tools downstream from the parser expect that.
                // Internally this parser uses the index, so incorrect indices on an
                // already-indexed label have to be overwritten.
                int oneBasedIndex = position + 1;
                wordLabel.SetIndex(oneBasedIndex);
                tagLabel.SetIndex(oneBasedIndex);

                LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
                LabeledScoredTreeNode tagNode  = new LabeledScoredTreeNode(tagLabel);
                tagNode.AddChild(wordNode);

                // TODO: can we get away with not setting these on the wordLabel?
                wordLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
                wordLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
                tagLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
                tagLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);

                preterminals.Add(tagNode);
                position++;
            }
            return new State(preterminals);
        }
Example #5
0
        /// <summary>
        /// Converts a list of tokens into a flat list of datums suitable for labeling in an IOB
        /// segmentation model.
        /// </summary>
        /// <remarks>
        /// A token accepted by <c>ShouldNotSegment</c> becomes a single datum; every other token
        /// is expanded by <c>TokenToDatums</c>. A word boundary becomes pending after a
        /// non-segmentable token or a token whose type is <c>BeginMarker</c> or <c>NoMarker</c>.
        /// When a boundary is pending, the datums of the word that just ended are back-filled
        /// via <c>FillInWordStatistics</c> and a boundary datum (index 0, empty word) is inserted
        /// before the next token. The final word is back-filled after the loop.
        /// </remarks>
        /// <param name="tokenList">The tokens to convert; each token's word text is read.</param>
        /// <param name="segMarker">The segmentation marker character, used to classify and strip markers on tokens.</param>
        /// <param name="applyRewriteRules">add rewrite labels (for training data)</param>
        /// <param name="stripRewrites">
        /// revert training data to old Green and DeNero model (remove
        /// rewrite labels but still rewrite to try to preserve raw text)
        /// </param>
        /// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
        /// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
        /// <returns>The list of datums, including inserted boundary datums.</returns>
        public static IList <CoreLabel> StringToIOB(IList <CoreLabel> tokenList, char segMarker, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory <CoreLabel> tf, string origText)
        {
            int               numTokens       = tokenList.Count;
            IList <CoreLabel> iobList         = new List <CoreLabel>(numTokens * 7 + numTokens);
            string            markerText      = segMarker.ToString();
            bool              boundaryPending = false;
            string            previousToken   = string.Empty;
            string            wordSoFar       = string.Empty;
            int               wordStart       = 0;

            foreach (CoreLabel label in tokenList)
            {
                if (boundaryPending)
                {
                    // Close out the previous word, then emit the boundary datum.
                    FillInWordStatistics(iobList, wordSoFar, wordStart);
                    wordSoFar = string.Empty;

                    CoreLabel boundaryDatum = CreateDatum(label, BoundaryChar, BoundarySymbol);
                    iobList.Add(boundaryDatum);
                    // The next word starts right after the boundary datum just appended.
                    wordStart = iobList.Count;
                    boundaryDatum.SetIndex(0);
                    boundaryDatum.SetWord(string.Empty);
                    boundaryPending = false;
                }

                // What type of token is this
                string             token   = label.Word();
                IOBUtils.TokenType tokType = GetTokenType(token, markerText);
                token = StripSegmentationMarkers(token, tokType);
                System.Diagnostics.Debug.Assert(token.Length != 0);

                bool keepWhole = ShouldNotSegment(token);
                if (keepWhole)
                {
                    iobList.Add(CreateDatum(label, token, NosegSymbol));
                    boundaryPending = true;
                }
                else
                {
                    // Iterate over the characters in the token
                    TokenToDatums(iobList, label, token, tokType, label, previousToken, applyRewriteRules, stripRewrites, tf, origText);
                    bool beginsWord = tokType == IOBUtils.TokenType.BeginMarker;
                    boundaryPending = beginsWord || tokType == IOBUtils.TokenType.NoMarker;
                }

                wordSoFar     = wordSoFar + token;
                previousToken = token;
            }

            FillInWordStatistics(iobList, wordSoFar, wordStart);
            return iobList;
        }
Example #6
0
        /// <summary>
        /// Iterative depth-first walk over the tree that records the current label stack as a
        /// lineage every time a preterminal node is reached.
        /// </summary>
        /// <remarks>
        /// This implementation uses the Index annotation to store depth: the root label is set
        /// to 0 and each child label to its parent's depth plus one, overwriting any index
        /// already present. Every tree label is cast to IHasIndex, so labels that do not
        /// implement it will cause the cast to fail.
        /// NOTE(review): the order of labels inside each lineage is whatever order the Stack
        /// type enumerates in; confirm that this matches what callers expect.
        /// </remarks>
        /// <param name="t">The tree</param>
        /// <returns>A list of lineages, or null when <paramref name="t"/> is null.</returns>
        private static IList <IList <CoreLabel> > MakeLineages(Tree t)
        {
            if (t == null)
            {
                return null;
            }

            IList <IList <CoreLabel> > lineages  = new List <IList <CoreLabel> >();
            Stack <Tree>               pending   = new Stack <Tree>();
            Stack <CoreLabel>          ancestors = new Stack <CoreLabel>();

            // Seed both stacks with the root at depth 0.
            ((IHasIndex)t.Label()).SetIndex(0);
            pending.Push(t);
            CoreLabel rootCopy = new CoreLabel(t.Label());
            rootCopy.SetIndex(0);
            ancestors.Push(rootCopy);

            while (!pending.IsEmpty())
            {
                Tree current     = pending.Pop();
                int  depth       = ((IHasIndex)current.Label()).Index();
                int  parentDepth = depth - 1;

                // Unwind the label stack until its top sits at the parent's depth.
                while (true)
                {
                    if (ancestors.IsEmpty())
                    {
                        break;
                    }
                    if (ancestors.Peek().Index() == parentDepth)
                    {
                        break;
                    }
                    ancestors.Pop();
                }

                if (current.IsPreTerminal())
                {
                    // Snapshot the current label stack as this preterminal's lineage.
                    lineages.Add(new List <CoreLabel>(ancestors));
                    continue;
                }

                int childDepth = depth + 1;
                foreach (Tree child in current.Children())
                {
                    ((IHasIndex)child.Label()).SetIndex(childDepth);
                    pending.Push(child);
                }
                CoreLabel currentCopy = new CoreLabel(current.Label());
                currentCopy.SetIndex(depth);
                ancestors.Add(currentCopy);
            }
            return lineages;
        }
Example #7
0
 /// <summary>
 /// Assigns <paramref name="index"/> to the current node by forwarding it to the node's label.
 /// </summary>
 /// <param name="index">The index value to store on the label.</param>
 public void SetIndex(int index) => _label.SetIndex(index);
        /// <summary>Parse a sentence represented as a List of tokens.</summary>
        /// <remarks>
        /// Parse a sentence represented as a List of tokens.
        /// The text must already have been tokenized and
        /// normalized into tokens that are appropriate to the treebank
        /// which was used to train the parser.  The tokens can be of
        /// multiple types, and the list items need not be homogeneous as to type
        /// (in particular, only some words might be given tags):
        /// <ul>
        /// <li>If a token implements HasWord, then the word to be parsed is
        /// given by its word() value.</li>
        /// <li>If a token implements HasTag and the tag() value is not
        /// null or the empty String, then the parser is strongly advised to assign
        /// a part of speech tag that <i>begins</i> with this String.</li>
        /// </ul>
        /// The method resets the per-parse status fields on entry, builds a working copy of
        /// the sentence (applying <c>op.wordFunction</c> to copied tokens when one is set),
        /// optionally adds final punctuation, appends a boundary token, and then runs the
        /// PCFG, dependency and combined parsers according to the <c>op</c> flags.
        /// </remarks>
        /// <param name="sentence">The sentence to parse</param>
        /// <returns>
        /// true when no parser that was run rejected the sentence; otherwise the current value
        /// of <c>parseSucceeded</c>, which is false at every early return because it is reset on
        /// entry and only set to true after the combined parser succeeds.
        /// </returns>
        /// <exception cref="System.NotSupportedException">
        /// If the sentence is of zero length or is longer than
        /// <c>op.testOptions.maxLength</c>; <c>parseSkipped</c> is set to true first.
        /// </exception>
        private bool ParseInternal <_T0>(IList <_T0> sentence)
            where _T0 : IHasWord
        {
            // Reset all per-parse status fields before doing any work.
            parseSucceeded   = false;
            parseNoMemory    = false;
            parseUnparsable  = false;
            parseSkipped     = false;
            parseFallback    = false;
            whatFailed       = null;
            addedPunct       = false;
            originalSentence = sentence;
            // Length of the input as given; it is not updated when tokens are appended below.
            int length = sentence.Count;

            if (length == 0)
            {
                parseSkipped = true;
                throw new NotSupportedException("Can't parse a zero-length sentence!");
            }
            // Working copy of the sentence that is actually handed to the parsers.
            IList <IHasWord> sentenceB;

            if (op.wordFunction != null)
            {
                // Copy each token so that applying the word function below does not modify
                // the caller's tokens.
                sentenceB = Generics.NewArrayList();
                foreach (IHasWord word in originalSentence)
                {
                    if (word is ILabel)
                    {
                        ILabel label    = (ILabel)word;
                        ILabel newLabel = label.LabelFactory().NewLabel(label);
                        if (newLabel is IHasWord)
                        {
                            sentenceB.Add((IHasWord)newLabel);
                        }
                        else
                        {
                            throw new AssertionError("This should have been a HasWord");
                        }
                    }
                    else
                    {
                        if (word is IHasTag)
                        {
                            TaggedWord tw = new TaggedWord(word.Word(), ((IHasTag)word).Tag());
                            sentenceB.Add(tw);
                        }
                        else
                        {
                            sentenceB.Add(new Word(word.Word()));
                        }
                    }
                }
                // Rewrite the word of every copied token through the word function.
                foreach (IHasWord word_1 in sentenceB)
                {
                    word_1.SetWord(op.wordFunction.Apply(word_1.Word()));
                }
            }
            else
            {
                // No word function: a shallow copy of the list is enough.
                sentenceB = new List <IHasWord>(sentence);
            }
            if (op.testOptions.addMissingFinalPunctuation)
            {
                addedPunct = AddSentenceFinalPunctIfNeeded(sentenceB, length);
            }
            // The length limit is checked against the original input length.
            if (length > op.testOptions.maxLength)
            {
                parseSkipped = true;
                throw new NotSupportedException("Sentence too long: length " + length);
            }
            TreePrint   treePrint = GetTreePrint();
            PrintWriter pwOut     = op.tlpParams.Pw();

            // Insert the boundary symbol, matching the token type of the first input token.
            if (sentence[0] is CoreLabel)
            {
                CoreLabel boundary = new CoreLabel();
                boundary.SetWord(LexiconConstants.Boundary);
                boundary.SetValue(LexiconConstants.Boundary);
                boundary.SetTag(LexiconConstants.BoundaryTag);
                // 1-based indexing used in the parser
                boundary.SetIndex(sentence.Count + 1);
                sentenceB.Add(boundary);
            }
            else
            {
                sentenceB.Add(new TaggedWord(LexiconConstants.Boundary, LexiconConstants.BoundaryTag));
            }
            // Interruption is checked before each parsing stage.
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doPCFG)
            {
                if (!pparser.Parse(sentenceB))
                {
                    return(parseSucceeded);
                }
                if (op.testOptions.verbose)
                {
                    pwOut.Println("PParser output");
                    // getBestPCFGParse(false).pennPrint(pwOut); // with scores on nodes
                    treePrint.PrintTree(GetBestPCFGParse(false), pwOut); // without scores on nodes
                }
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            if (op.doDep && !op.testOptions.useFastFactored)
            {
                if (!dparser.Parse(sentenceB))
                {
                    return(parseSucceeded);
                }
                // cdm nov 2006: should move these printing bits to the main printing section,
                // so don't calculate the best parse twice!
                if (op.testOptions.verbose)
                {
                    pwOut.Println("DParser output");
                    treePrint.PrintTree(dparser.GetBestParse(), pwOut);
                }
            }
            if (Thread.Interrupted())
            {
                throw new RuntimeInterruptedException();
            }
            // The combined parser only runs when both the PCFG and dependency parsers are enabled.
            if (op.doPCFG && op.doDep)
            {
                if (!bparser.Parse(sentenceB))
                {
                    return(parseSucceeded);
                }
                else
                {
                    parseSucceeded = true;
                }
            }
            return(true);
        }
 /// <summary>
 /// Sets the index for the current node by forwarding <paramref name="index"/> to the node's label.
 /// </summary>
 /// <param name="index">The index value to store on the label.</param>
 protected internal virtual void SetIndex(int index) => label.SetIndex(index);