/// <summary>
/// Stamps word-level statistics onto every datum inserted for the most recent word.
/// </summary>
/// <remarks>
/// Visits each datum from <paramref name="wordStartIndex"/> to the end of the list and
/// records on it both its offset within the word and the full word text. This has to
/// be done post hoc because the CoreLabel lists coming from testing data sets
/// are pre-segmented (so treating each of those CoreLabels as a "word" would let
/// us cheat and get 100% classification accuracy by just looking at whether
/// we're at the beginning of a "word").
/// </remarks>
/// <param name="iobList">Datums produced so far; entries from <paramref name="wordStartIndex"/> onward are modified in place.</param>
/// <param name="currentWord">Text stored as the word on every affected datum.</param>
/// <param name="wordStartIndex">Position in <paramref name="iobList"/> of the first datum of the word.</param>
private static void FillInWordStatistics(IList<CoreLabel> iobList, string currentWord, int wordStartIndex)
{
    // offsetInWord is always (position - wordStartIndex): 0 for the first datum of the word.
    int offsetInWord = 0;
    for (int position = wordStartIndex; position < iobList.Count; position++, offsetInWord++)
    {
        CoreLabel datum = iobList[position];
        datum.SetIndex(offsetInWord);
        datum.SetWord(currentWord);
    }
}
/// <summary>
/// Builds a new CoreLabel whose word and value are both <paramref name="gloss"/>.
/// </summary>
/// <param name="gloss">Text stored as both the word and the value of the label.</param>
/// <param name="index">Index to set on the label; when negative, SetIndex is not called.</param>
/// <returns>The newly created label.</returns>
protected internal virtual CoreLabel MkWord(string gloss, int index)
{
    CoreLabel label = new CoreLabel();
    label.SetWord(gloss);
    label.SetValue(gloss);

    // A negative index means "no index": hand the label back untouched.
    if (index < 0)
    {
        return label;
    }

    label.SetIndex(index);
    return label;
}
/// <summary>Create a dummy word, just with a given word at a given index.</summary>
/// <remarks>Mostly useful for making semantic graphs.</remarks>
/// <param name="gloss">Text stored as both the word and the value of the label.</param>
/// <param name="index">Index to set on the label; when negative, SetIndex is not called.</param>
/// <returns>The newly created label.</returns>
public static CoreLabel MkWord(string gloss, int index)
{
    CoreLabel dummy = new CoreLabel();
    dummy.SetWord(gloss);
    dummy.SetValue(gloss);

    bool hasIndex = index >= 0;
    if (hasIndex)
    {
        dummy.SetIndex(index);
    }

    return dummy;
}
/// <summary>
/// Builds the initial parser state from a sentence whose words already carry tags.
/// </summary>
/// <remarks>
/// Every input word becomes one preterminal: a tag node with a single word node as its child.
/// A word that is already a CoreLabel is reused (and re-indexed) as the word label; any other
/// word is copied into a fresh CoreLabel and must implement IHasTag.
/// </remarks>
/// <param name="words">The tagged words of the sentence, in order.</param>
/// <returns>A State constructed from the list of preterminal trees.</returns>
/// <exception cref="System.ArgumentException">
/// If a non-CoreLabel word does not implement IHasTag, or if a word's tag is null.
/// </exception>
public static State InitialStateFromTaggedSentence<_T0>(IList<_T0> words)
    where _T0 : IHasWord
{
    IList<Tree> preterminals = Generics.NewArrayList();

    for (int i = 0; i < words.Count; ++i)
    {
        IHasWord token = words[i];

        // Reuse the token itself when it is already a CoreLabel; otherwise copy it.
        CoreLabel wordLabel = token as CoreLabel;
        string tag;
        if (wordLabel != null)
        {
            tag = wordLabel.Tag();
        }
        else
        {
            wordLabel = new CoreLabel();
            wordLabel.SetValue(token.Word());
            wordLabel.SetWord(token.Word());

            IHasTag tagged = token as IHasTag;
            if (tagged == null)
            {
                throw new ArgumentException("Expected tagged words");
            }
            tag = tagged.Tag();
            wordLabel.SetTag(tag);
        }

        if (tag == null)
        {
            throw new ArgumentException("Input word not tagged");
        }

        CoreLabel tagLabel = new CoreLabel();
        tagLabel.SetValue(tag);

        // Index from 1. Tools downstream from the parser expect that.
        // Internally this parser uses the index, so we have to
        // overwrite incorrect indices if the label is already indexed.
        int oneBasedIndex = i + 1;
        wordLabel.SetIndex(oneBasedIndex);
        tagLabel.SetIndex(oneBasedIndex);

        LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
        LabeledScoredTreeNode tagNode = new LabeledScoredTreeNode(tagLabel);
        tagNode.AddChild(wordNode);

        // Both labels get the same head word / head tag annotations.
        // TODO: can we get away with not setting these on the wordLabel?
        foreach (CoreLabel target in new CoreLabel[] { wordLabel, tagLabel })
        {
            target.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
            target.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
        }

        preterminals.Add(tagNode);
    }

    return new State(preterminals);
}
/// <summary>
/// Convert a String to a list of characters suitable for labeling in an IOB
/// segmentation model.
/// </summary>
/// <remarks>
/// Tokens are processed in order. After a token that should not be segmented, or one whose
/// token type is BeginMarker or NoMarker, a boundary datum is inserted before the next token
/// and the word statistics of the datums emitted since the previous boundary are filled in.
/// </remarks>
/// <param name="tokenList">The tokens to convert.</param>
/// <param name="segMarker">The segmentation marker character; its string form is used to classify and strip each token.</param>
/// <param name="applyRewriteRules">add rewrite labels (for training data)</param>
/// <param name="stripRewrites">
/// revert training data to old Green and DeNero model (remove
/// rewrite labels but still rewrite to try to preserve raw text)
/// </param>
/// <param name="tf">a TokenizerFactory returning ArabicTokenizers (for determining original segment boundaries)</param>
/// <param name="origText">the original string before tokenization (for determining original segment boundaries)</param>
/// <returns>The list of datums built from the tokens, including any inserted boundary datums.</returns>
public static IList<CoreLabel> StringToIOB(IList<CoreLabel> tokenList, char segMarker, bool applyRewriteRules, bool stripRewrites, ITokenizerFactory<CoreLabel> tf, string origText)
{
    int numTokens = tokenList.Count;
    IList<CoreLabel> iobList = new List<CoreLabel>(numTokens * 7 + numTokens);
    string strSegMarker = segMarker.ToString();

    bool boundaryPending = false;
    string previousToken = string.Empty;
    string wordSoFar = string.Empty;
    int wordStart = 0;

    foreach (CoreLabel cl in tokenList)
    {
        if (boundaryPending)
        {
            // Close out the previous word before starting a new one.
            FillInWordStatistics(iobList, wordSoFar, wordStart);
            wordSoFar = string.Empty;

            // The next word starts one past the boundary datum that is about to be appended.
            wordStart = iobList.Count + 1;

            CoreLabel boundaryDatum = CreateDatum(cl, BoundaryChar, BoundarySymbol);
            iobList.Add(boundaryDatum);
            boundaryDatum.SetIndex(0);
            boundaryDatum.SetWord(string.Empty);
            boundaryPending = false;
        }

        // What type of token is this
        string rawToken = cl.Word();
        IOBUtils.TokenType tokType = GetTokenType(rawToken, strSegMarker);
        string token = StripSegmentationMarkers(rawToken, tokType);
        System.Diagnostics.Debug.Assert(token.Length != 0);

        if (ShouldNotSegment(token))
        {
            iobList.Add(CreateDatum(cl, token, NosegSymbol));
            boundaryPending = true;
        }
        else
        {
            // Iterate over the characters in the token
            TokenToDatums(iobList, cl, token, tokType, cl, previousToken, applyRewriteRules, stripRewrites, tf, origText);
            boundaryPending = tokType == IOBUtils.TokenType.BeginMarker
                || tokType == IOBUtils.TokenType.NoMarker;
        }

        wordSoFar += token;
        previousToken = token;
    }

    // Fill in statistics for the final word, which has no trailing boundary.
    FillInWordStatistics(iobList, wordSoFar, wordStart);
    return iobList;
}
/// <summary>
/// Depth-first traversal of the tree with an explicit stack, recording the current label
/// stack as a lineage every time a preterminal is reached.
/// </summary>
/// <remarks>
/// This implementation uses the Index annotation to store depth, overwriting the index on
/// the labels of the visited tree nodes. Every node label is cast to IHasIndex, so if
/// CoreLabels are not present in the trees (or at least something that implements HasIndex),
/// an exception will result.
/// </remarks>
/// <param name="t">The tree</param>
/// <returns>A list of lineages, or null when <paramref name="t"/> is null.</returns>
private static IList<IList<CoreLabel>> MakeLineages(Tree t)
{
    if (t == null)
    {
        return null;
    }

    // The root sits at depth 0.
    ((IHasIndex)t.Label()).SetIndex(0);

    Stack<Tree> pending = new Stack<Tree>();
    pending.Push(t);

    Stack<CoreLabel> ancestors = new Stack<CoreLabel>();
    CoreLabel rootLabel = new CoreLabel(t.Label());
    rootLabel.SetIndex(0);
    ancestors.Push(rootLabel);

    IList<IList<CoreLabel>> lineages = new List<IList<CoreLabel>>();

    while (!pending.IsEmpty())
    {
        Tree current = pending.Pop();
        int depth = ((IHasIndex)current.Label()).Index();
        int parentDepth = depth - 1;

        // Unwind the label stack until its top is at the depth of this node's parent.
        while (!ancestors.IsEmpty() && ancestors.Peek().Index() != parentDepth)
        {
            ancestors.Pop();
        }

        if (current.IsPreTerminal())
        {
            // Snapshot the current label stack as this preterminal's lineage.
            lineages.Add(new List<CoreLabel>(ancestors));
            continue;
        }

        int childDepth = depth + 1;
        foreach (Tree child in current.Children())
        {
            ((IHasIndex)child.Label()).SetIndex(childDepth);
            pending.Push(child);
        }

        // Record a copy of this node's label, tagged with its depth.
        CoreLabel currentLabel = new CoreLabel(current.Label());
        currentLabel.SetIndex(depth);
        ancestors.Add(currentLabel);
    }

    return lineages;
}
/// <summary>
/// Sets the index for the current node by forwarding it to the node's label.
/// </summary>
/// <param name="index">The index value passed on to the label.</param>
public void SetIndex(int index) => _label.SetIndex(index);
/// <summary>Parse a sentence represented as a List of tokens.</summary>
/// <remarks>
/// The text must already have been tokenized and
/// normalized into tokens that are appropriate to the treebank
/// which was used to train the parser. The tokens can be of
/// multiple types, and the list items need not be homogeneous as to type
/// (in particular, only some words might be given tags):
/// <ul>
/// <li>If a token implements HasWord, then the word to be parsed is
/// given by its word() value.</li>
/// <li>If a token implements HasTag and the tag() value is not
/// null or the empty String, then the parser is strongly advised to assign
/// a part of speech tag that <i>begins</i> with this String.</li>
/// </ul>
/// The method resets the per-parse status fields, builds a working copy of the sentence
/// (applying the configured word function when there is one), appends a boundary token,
/// and then runs whichever of the PCFG, dependency and combined parsers the options enable.
/// </remarks>
/// <param name="sentence">The sentence to parse</param>
/// <returns>
/// true if every enabled parser accepted the sentence; when one of them rejects it, the
/// current value of the parseSucceeded field is returned instead.
/// </returns>
/// <exception cref="System.NotSupportedException">
/// If the sentence is of zero length or longer than the configured maximum length.
/// </exception>
private bool ParseInternal<_T0>(IList<_T0> sentence)
    where _T0 : IHasWord
{
    // Reset the status of the previous parse.
    parseSucceeded = false;
    parseNoMemory = false;
    parseUnparsable = false;
    parseSkipped = false;
    parseFallback = false;
    whatFailed = null;
    addedPunct = false;
    originalSentence = sentence;

    int length = sentence.Count;
    if (length == 0)
    {
        parseSkipped = true;
        throw new NotSupportedException("Can't parse a zero-length sentence!");
    }

    // Build the working copy of the sentence.
    IList<IHasWord> sentenceB;
    if (op.wordFunction == null)
    {
        sentenceB = new List<IHasWord>(sentence);
    }
    else
    {
        // Copy every token first so the word function never mutates the caller's objects.
        sentenceB = Generics.NewArrayList();
        foreach (IHasWord word in originalSentence)
        {
            ILabel label = word as ILabel;
            if (label != null)
            {
                ILabel newLabel = label.LabelFactory().NewLabel(label);
                IHasWord copied = newLabel as IHasWord;
                if (copied == null)
                {
                    throw new AssertionError("This should have been a HasWord");
                }
                sentenceB.Add(copied);
            }
            else if (word is IHasTag)
            {
                sentenceB.Add(new TaggedWord(word.Word(), ((IHasTag)word).Tag()));
            }
            else
            {
                sentenceB.Add(new Word(word.Word()));
            }
        }

        foreach (IHasWord word_1 in sentenceB)
        {
            word_1.SetWord(op.wordFunction.Apply(word_1.Word()));
        }
    }

    if (op.testOptions.addMissingFinalPunctuation)
    {
        addedPunct = AddSentenceFinalPunctIfNeeded(sentenceB, length);
    }

    // The limit is checked against the original token count.
    if (length > op.testOptions.maxLength)
    {
        parseSkipped = true;
        throw new NotSupportedException("Sentence too long: length " + length);
    }

    TreePrint treePrint = GetTreePrint();
    PrintWriter pwOut = op.tlpParams.Pw();

    // Insert the boundary symbol, matching the token type of the first word.
    if (sentence[0] is CoreLabel)
    {
        CoreLabel boundary = new CoreLabel();
        boundary.SetWord(LexiconConstants.Boundary);
        boundary.SetValue(LexiconConstants.Boundary);
        boundary.SetTag(LexiconConstants.BoundaryTag);
        // 1-based indexing used in the parser
        boundary.SetIndex(sentence.Count + 1);
        sentenceB.Add(boundary);
    }
    else
    {
        sentenceB.Add(new TaggedWord(LexiconConstants.Boundary, LexiconConstants.BoundaryTag));
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }

    if (op.doPCFG)
    {
        if (!pparser.Parse(sentenceB))
        {
            return parseSucceeded;
        }
        if (op.testOptions.verbose)
        {
            pwOut.Println("PParser output");
            // without scores on nodes
            treePrint.PrintTree(GetBestPCFGParse(false), pwOut);
        }
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }

    if (op.doDep && !op.testOptions.useFastFactored)
    {
        if (!dparser.Parse(sentenceB))
        {
            return parseSucceeded;
        }
        // cdm nov 2006: should move these printing bits to the main printing section,
        // so don't calculate the best parse twice!
        if (op.testOptions.verbose)
        {
            pwOut.Println("DParser output");
            treePrint.PrintTree(dparser.GetBestParse(), pwOut);
        }
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }

    if (op.doPCFG && op.doDep)
    {
        if (!bparser.Parse(sentenceB))
        {
            return parseSucceeded;
        }
        parseSucceeded = true;
    }

    return true;
}
/// <summary>Sets the index for the current node by forwarding it to the node's label.</summary>
/// <param name="index">The index value passed on to the label.</param>
protected internal virtual void SetIndex(int index) => label.SetIndex(index);