/// <summary>
/// Creates a dependency between two words given only their text. Each string is
/// wrapped in a new <c>CoreLabel</c> whose value and word are both set to it.
/// </summary>
/// <param name="regent">Text of the governing word.</param>
/// <param name="dependent">Text of the dependent word.</param>
/// <exception cref="ArgumentException">Either argument is null.</exception>
public UnnamedDependency(string regent, string dependent)
{
    bool anyMissing = regent == null || dependent == null;
    if (anyMissing)
    {
        throw new ArgumentException("governor or dependent cannot be null");
    }

    CoreLabel governorLabel = new CoreLabel();
    governorLabel.SetValue(regent);
    governorLabel.SetWord(regent);
    this.regent = governorLabel;

    CoreLabel dependentLabel = new CoreLabel();
    dependentLabel.SetValue(dependent);
    dependentLabel.SetWord(dependent);
    this.dependent = dependentLabel;

    // The text of the labels is stored separately because it looks like an
    // object can request a hash code using itself in a partially reconstructed
    // state when unserializing. For example, a TreeGraphNode might ask for the
    // hash code of an UnnamedDependency, which then uses an unfilled member of
    // the same TreeGraphNode to get the hash code. Keeping the text of the
    // labels breaks that possible cycle.
    regentText = regent;
    dependentText = dependent;
}
/// <summary>
/// Builds a <c>CoreLabel</c> for one piece of text with the given character offsets.
/// </summary>
/// <remarks>
/// The original text annotation always holds the unmodified input. The word and
/// value hold either the newline token (when <c>separatorPattern</c> matches the
/// whole text), or the text with spaces replaced by non-breaking spaces (when both
/// <paramref name="doNormalization"/> and <c>normalizeSpace</c> are set), or the
/// text unchanged.
/// </remarks>
/// <param name="tokenText">Raw token text.</param>
/// <param name="doNormalization">Whether space normalization may be applied.</param>
/// <param name="charOffsetBegin">Value for the character-offset-begin annotation.</param>
/// <param name="charOffsetEnd">Value for the character-offset-end annotation.</param>
/// <returns>The populated token.</returns>
private CoreLabel MakeXmlToken(string tokenText, bool doNormalization, int charOffsetBegin, int charOffsetEnd)
{
    CoreLabel xmlToken = new CoreLabel();
    xmlToken.SetOriginalText(tokenText);

    string surface;
    if (separatorPattern.Matcher(tokenText).Matches())
    {
        // Map to CoreNLP newline token
        surface = AbstractTokenizer.NewlineToken;
    }
    else if (doNormalization && normalizeSpace)
    {
        // change space to non-breaking space
        surface = tokenText.Replace(' ', '\u00A0');
    }
    else
    {
        surface = tokenText;
    }

    xmlToken.SetWord(surface);
    xmlToken.SetValue(surface);
    xmlToken.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charOffsetBegin);
    xmlToken.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charOffsetEnd);

    if (Verbose)
    {
        log.Info("Adding token " + xmlToken.ToShorterString());
    }
    return xmlToken;
}
/// <summary>
/// Searches the tree depth-first for the node whose begin/end index annotations
/// equal <paramref name="span"/> and sets that node's value.
/// </summary>
/// <param name="tree">Tree whose labels must be <c>CoreLabel</c>s.</param>
/// <param name="span">Begin index (first) and end index (second) to look for.</param>
/// <param name="value">Value to assign to the matching node's label.</param>
/// <returns>true if a matching node was found and relabeled; false otherwise.</returns>
public static bool SetSpanLabel(Tree tree, Pair <int, int> span, string value)
{
    if (!(tree.Label() is CoreLabel))
    {
        throw new AssertionError("Expected CoreLabels");
    }
    CoreLabel nodeLabel = (CoreLabel)tree.Label();
    var begin = nodeLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
    var end = nodeLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation));

    // Exact match: relabel this node and stop.
    if (begin.Equals(span.first) && end.Equals(span.second))
    {
        nodeLabel.SetValue(value);
        return true;
    }

    // This node lies strictly inside the span, so the span cannot be below it.
    if (begin > span.first && end < span.second)
    {
        return false;
    }

    Tree[] kids = tree.Children();
    for (int k = 0; k < kids.Length; k++)
    {
        if (SetSpanLabel(kids[k], span, value))
        {
            return true;
        }
    }
    return false;
}
/// <summary>
/// Replaces each preterminal's value and tag with the alternate tag derived from
/// the morphological analysis of its leaf, when such a tag is available.
/// </summary>
/// <remarks>
/// The analysis string is the leaf's original text. When that is missing, a
/// string is built from the preterminal value and the leaf's category instead.
/// </remarks>
/// <param name="tree">Tree whose leaves and preterminals are <c>CoreLabel</c>s.</param>
private static void ReplacePOSTags(Tree tree)
{
    IList <ILabel> leaves = tree.Yield();
    IList <ILabel> tags = tree.PreTerminalYield();
    System.Diagnostics.Debug.Assert(leaves.Count == tags.Count);

    MorphoFeatureSpecification spec = new FrenchMorphoFeatureSpecification();
    for (int pos = 0; pos < leaves.Count; pos++)
    {
        CoreLabel leaf = (CoreLabel)leaves[pos];

        // Morphological Analysis
        string analysis = leaf.OriginalText();
        if (string.IsNullOrEmpty(analysis))
        {
            analysis = tags[pos].Value();

            // POS subcategory
            string subCat = leaf.Category();
            analysis += string.IsNullOrEmpty(subCat) ? "---" : "-" + subCat + "--";
        }

        MorphoFeatures feats = spec.StrToFeatures(analysis);
        var altTag = feats.GetAltTag();
        if (string.IsNullOrEmpty(altTag))
        {
            continue;
        }

        CoreLabel tagLabel = (CoreLabel)tags[pos];
        tagLabel.SetValue(altTag);
        tagLabel.SetTag(altTag);
    }
}
/// <summary>
/// Creates a <c>CoreLabel</c> whose word, value, text annotation and value
/// annotation are all set to <paramref name="token"/>.
/// </summary>
/// <param name="token">Text to store on the label.</param>
/// <returns>The new label.</returns>
private static CoreLabel InitCoreLabel(string token)
{
    CoreLabel result = new CoreLabel();

    // Convenience setters first, then the explicit annotation keys.
    result.SetWord(token);
    result.SetValue(token);
    result.Set(typeof(CoreAnnotations.TextAnnotation), token);
    result.Set(typeof(CoreAnnotations.ValueAnnotation), token);

    return result;
}
/// <summary>
/// Copies <paramref name="cl"/> and overwrites the copy's word, value, original
/// text and character positions with the supplied values.
/// </summary>
/// <param name="cl">Label to copy; it is not modified here.</param>
/// <param name="part">New word part for the copy.</param>
/// <param name="beginPosition">Begin position for the copy.</param>
/// <param name="endPosition">End position for the copy.</param>
/// <returns>The modified copy.</returns>
private static CoreLabel CopyCoreLabel(CoreLabel cl, string part, int beginPosition, int endPosition)
{
    CoreLabel copy = new CoreLabel(cl);

    copy.SetWord(part);
    copy.SetValue(part);

    copy.SetBeginPosition(beginPosition);
    copy.SetEndPosition(endPosition);

    copy.Set(typeof(CoreAnnotations.OriginalTextAnnotation), part);
    return copy;
}
/// <summary>
/// Creates a <c>CoreLabel</c> with the given text as word and value.
/// </summary>
/// <param name="gloss">Text of the word.</param>
/// <param name="index">Index to assign; negative values leave the index unset.</param>
/// <returns>The new label.</returns>
protected internal virtual CoreLabel MkWord(string gloss, int index)
{
    CoreLabel word = new CoreLabel();
    word.SetWord(gloss);
    word.SetValue(gloss);

    if (index < 0)
    {
        return word;
    }

    word.SetIndex(index);
    return word;
}
/// <summary>Creates a dummy word carrying only its text and, optionally, an index.</summary>
/// <remarks>Mostly useful for making semantic graphs.</remarks>
/// <param name="gloss">Text stored as both word and value.</param>
/// <param name="index">Index to assign; skipped when negative.</param>
/// <returns>The new label.</returns>
public static CoreLabel MkWord(string gloss, int index)
{
    CoreLabel dummy = new CoreLabel();
    dummy.SetWord(gloss);
    dummy.SetValue(gloss);

    bool hasIndex = index >= 0;
    if (hasIndex)
    {
        dummy.SetIndex(index);
    }

    return dummy;
}
/// <summary>
/// Builds a new <c>CoreLabel</c> describing the wrapped parse node.
/// </summary>
/// <remarks>
/// Leaves get word, value and begin/end positions from the parse. Other nodes
/// get category and value from the parse type, plus a tag when the node's depth is 1.
/// </remarks>
/// <returns>A freshly constructed label on every call.</returns>
public override ILabel Label()
{
    // TODO: move this CoreLabel construction logic somewhere appropriate
    var result = new CoreLabel();

    if (!this.parse.IsLeaf)
    {
        var nodeType = this.parse.Type;
        result.SetCategory(nodeType);
        result.SetValue(nodeType);
        if (this.Depth() == 1)
        {
            result.SetTag(nodeType);
        }
        return result;
    }

    result.SetWord(this.parse.Value);
    result.SetBeginPosition(this.parse.Span.Start);
    result.SetEndPosition(this.parse.Span.End);
    result.SetValue(this.parse.Value);
    return result;
}
/// <summary>
/// Produces arbitrary test input: a one-element list holding a single token.
/// </summary>
/// <remarks>
/// The content is unimportant; something just needs to be segmented on multiple
/// threads to reproduce the issue.
/// </remarks>
/// <returns>A new list containing the single test token.</returns>
private static IList <CoreLabel> CreateTestTokens()
{
    const string sampleText = "你好,世界";

    CoreLabel sample = new CoreLabel();
    sample.SetWord(sampleText);
    sample.SetValue(sampleText);
    sample.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
    sample.Set(typeof(CoreAnnotations.AnswerAnnotation), "0");

    IList <CoreLabel> tokens = new List <CoreLabel> { sample };
    return tokens;
}
/// <summary>
/// Builds the initial parser <c>State</c> from a tagged sentence: one
/// tag-over-word preterminal tree per input word.
/// </summary>
/// <remarks>
/// <c>CoreLabel</c> inputs are reused as the word label (and re-indexed); other
/// inputs are copied into a new <c>CoreLabel</c> and must implement <c>IHasTag</c>.
/// </remarks>
/// <param name="words">The tagged words of the sentence.</param>
/// <returns>A state wrapping the list of preterminals.</returns>
/// <exception cref="ArgumentException">
/// A non-CoreLabel word does not implement IHasTag, or a word's tag is null.
/// </exception>
public static State InitialStateFromTaggedSentence <_T0>(IList <_T0> words) where _T0 : IHasWord
{
    IList <Tree> preterminals = Generics.NewArrayList();
    for (int position = 0; position < words.Count; ++position)
    {
        IHasWord item = words[position];
        CoreLabel wordLabel = item as CoreLabel;
        string tag;
        if (wordLabel != null)
        {
            tag = wordLabel.Tag();
        }
        else
        {
            wordLabel = new CoreLabel();
            wordLabel.SetValue(item.Word());
            wordLabel.SetWord(item.Word());
            IHasTag tagged = item as IHasTag;
            if (tagged == null)
            {
                throw new ArgumentException("Expected tagged words");
            }
            tag = tagged.Tag();
            wordLabel.SetTag(tag);
        }
        if (tag == null)
        {
            throw new ArgumentException("Input word not tagged");
        }

        CoreLabel tagLabel = new CoreLabel();
        tagLabel.SetValue(tag);

        // Index from 1. Tools downstream from the parser expect that.
        // Internally this parser uses the index, so incorrect indices must be
        // overwritten if the label is already indexed.
        int oneBased = position + 1;
        wordLabel.SetIndex(oneBased);
        tagLabel.SetIndex(oneBased);

        LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
        LabeledScoredTreeNode tagNode = new LabeledScoredTreeNode(tagLabel);
        tagNode.AddChild(wordNode);

        // TODO: can we get away with not setting these on the wordLabel?
        wordLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
        wordLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
        tagLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
        tagLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);

        preterminals.Add(tagNode);
    }
    return new State(preterminals);
}
/// <summary>Splits a compound marked by the lexer into separate labels.</summary>
/// <remarks>
/// The parent annotation is removed from <paramref name="cl"/>. Each part of the
/// word becomes a copy of <paramref name="cl"/> with word, value and original
/// text set to that part, and the copies are appended to <c>compoundBuffer</c>.
/// NOTE(review): <c>ReplaceAll</c>, <c>Split("\\s+")</c> and <c>Remove(0)</c> look
/// like Java-style helpers (regex split, remove-at-index returning the element);
/// confirm they are not resolved to the BCL overloads with different semantics.
/// </remarks>
/// <param name="cl">The compound label to split.</param>
/// <returns>The result of <c>compoundBuffer.Remove(0)</c>.</returns>
private CoreLabel ProcessCompound(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

    string spacedOut = cl.Word().ReplaceAll("-", " - ");
    string[] pieces = spacedOut.Split("\\s+");

    for (int p = 0; p < pieces.Length; p++)
    {
        string piece = pieces[p];
        CoreLabel pieceLabel = new CoreLabel(cl);
        pieceLabel.SetWord(piece);
        pieceLabel.SetValue(piece);
        pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
        compoundBuffer.Add(pieceLabel);
    }

    return compoundBuffer.Remove(0);
}
/// <summary>
/// Creates a <c>CoreLabel</c> for the wrapped parse node.
/// </summary>
/// <remarks>
/// For a leaf the label carries the parse value (as word and value) and the span
/// start/end as begin/end position. For any other node it carries the parse type
/// as category and value, and additionally as tag when <c>Depth()</c> is 1.
/// </remarks>
/// <returns>A new label; nothing is cached between calls.</returns>
public override ILabel Label()
{
    // TODO: move this CoreLabel construction logic somewhere appropriate
    var built = new CoreLabel();
    bool isLeaf = this.parse.IsLeaf;

    if (isLeaf)
    {
        var leafText = this.parse.Value;
        built.SetWord(leafText);
        built.SetBeginPosition(this.parse.Span.Start);
        built.SetEndPosition(this.parse.Span.End);
        built.SetValue(leafText);
        return built;
    }

    var category = this.parse.Type;
    built.SetCategory(category);
    built.SetValue(category);
    if (this.Depth() == 1)
    {
        built.SetTag(category);
    }
    return built;
}
/// <summary>
/// Recursively replaces every label that is not a <c>CoreLabel</c> with a new
/// <c>CoreLabel</c> carrying the same value.
/// </summary>
/// <remarks>
/// This probably isn't needed now, since everything is always a core label; in
/// that case the method is a no-op.
/// </remarks>
/// <param name="tree">Root of the subtree to convert in place.</param>
private static void ConvertToCoreLabels(Tree tree)
{
    ILabel current = tree.Label();
    bool alreadyCore = current is CoreLabel;
    if (!alreadyCore)
    {
        CoreLabel replacement = new CoreLabel();
        replacement.SetValue(current.Value());
        tree.SetLabel(replacement);
    }

    Tree[] kids = tree.Children();
    for (int k = 0; k < kids.Length; k++)
    {
        ConvertToCoreLabels(kids[k]);
    }
}
/// <summary>
/// Creates a new node labeled <paramref name="label"/> above the given children,
/// copying the head word and head tag annotations from <paramref name="top"/>.
/// </summary>
/// <param name="top">Tree whose label (a <c>CoreLabel</c>) supplies the head annotations.</param>
/// <param name="label">Value for the new node's label.</param>
/// <param name="children">Children to attach, in order.</param>
/// <returns>The new node.</returns>
internal static Tree CreateNode(Tree top, string label, params Tree[] children)
{
    CoreLabel source = (CoreLabel)top.Label();
    var headWord = source.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation));
    var headTag = source.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation));

    CoreLabel nodeLabel = new CoreLabel();
    nodeLabel.SetValue(label);
    nodeLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), headWord);
    nodeLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), headTag);

    Tree node = new LabeledScoredTreeNode(nodeLabel);
    for (int c = 0; c < children.Length; c++)
    {
        node.AddChild(children[c]);
    }
    return node;
}
/// <summary>
/// Pops the top two trees off the stack and pushes a new binary node above them.
/// </summary>
/// <remarks>
/// The new node's head word/tag annotations are copied from the left or right
/// child according to <c>side</c>. The input state's stack is not mutated; a new
/// <c>State</c> is returned.
/// </remarks>
/// <param name="state">State to transition from.</param>
/// <param name="scoreDelta">Amount added to the state's score.</param>
/// <returns>The successor state.</returns>
/// <exception cref="ArgumentException">
/// <c>side</c> is neither Left nor Right, or the head's label is not a CoreLabel.
/// </exception>
public virtual State Apply(State state, double scoreDelta)
{
    TreeShapedStack <Tree> remaining = state.stack;
    Tree right = remaining.Peek();
    remaining = remaining.Pop();
    Tree left = remaining.Peek();
    remaining = remaining.Pop();

    Tree head;
    if (side == BinaryTransition.Side.Left)
    {
        head = left;
    }
    else if (side == BinaryTransition.Side.Right)
    {
        head = right;
    }
    else
    {
        throw new ArgumentException("Unknown side " + side);
    }

    CoreLabel headLabel = head.Label() as CoreLabel;
    if (headLabel == null)
    {
        throw new ArgumentException("Stack should have CoreLabel nodes");
    }

    CoreLabel production = new CoreLabel();
    production.SetValue(label);
    production.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), headLabel.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)));
    production.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), headLabel.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)));

    Tree combined = new LabeledScoredTreeNode(production);
    combined.AddChild(left);
    combined.AddChild(right);
    remaining = remaining.Push(combined);

    return new State(remaining, state.transitions.Push(this), state.separators, state.sentence, state.tokenPosition, state.score + scoreDelta, false);
}
/// <summary>
/// Returns a copy of the tree that keeps only the skeleton, the predictions and
/// the label values.
/// </summary>
/// <param name="tree">Tree to simplify; it is not modified.</param>
/// <returns>A new tree built with <paramref name="tree"/>'s tree factory.</returns>
private Tree SimplifyTree(Tree tree)
{
    CoreLabel slimLabel = new CoreLabel();
    slimLabel.Set(typeof(RNNCoreAnnotations.Predictions), RNNCoreAnnotations.GetPredictions(tree));
    slimLabel.SetValue(tree.Label().Value());

    if (tree.IsLeaf())
    {
        return tree.TreeFactory().NewLeaf(slimLabel);
    }

    var kids = tree.Children();
    IList <Tree> simplifiedKids = Generics.NewArrayList(kids.Length);
    foreach (Tree kid in kids)
    {
        simplifiedKids.Add(SimplifyTree(kid));
    }
    return tree.TreeFactory().NewTreeNode(slimLabel, simplifiedKids);
}
/// <summary>Splits a compound marked by the lexer into separate labels.</summary>
/// <remarks>
/// The parent annotation is removed from <paramref name="cl"/>. The word has
/// <c>pDash</c> matches replaced by " - " and is then split with <c>pSpace</c>.
/// Each part becomes a copy of <paramref name="cl"/> that is appended to
/// <c>compoundBuffer</c>. Begin/end positions are derived from
/// <paramref name="cl"/>'s begin position plus the accumulated length of the
/// preceding parts.
/// </remarks>
/// <param name="cl">The compound label to split.</param>
/// <returns>The result of <c>compoundBuffer.Remove(0)</c>.</returns>
private CoreLabel ProcessCompound(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));

    string spacedOut = pDash.Matcher(cl.Word()).ReplaceAll(" - ");
    string[] pieces = pSpace.Split(spacedOut);

    int consumed = 0;
    for (int p = 0; p < pieces.Length; p++)
    {
        string piece = pieces[p];
        int pieceStart = cl.BeginPosition() + consumed;

        CoreLabel pieceLabel = new CoreLabel(cl);
        pieceLabel.SetWord(piece);
        pieceLabel.SetValue(piece);
        pieceLabel.SetBeginPosition(pieceStart);
        pieceLabel.SetEndPosition(pieceStart + piece.Length);
        pieceLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
        compoundBuffer.Add(pieceLabel);

        consumed += piece.Length;
    }

    return compoundBuffer.Remove(0);
}
/// <summary>
/// Replaces a preterminal's value and tag with the alternate tag obtained from
/// the morphological analysis of its leaf, when one is available.
/// </summary>
/// <remarks>
/// The analysis string is the leaf's original text. When that is missing, a
/// string is built from the preterminal value and the leaf's category instead.
/// </remarks>
/// <param name="t">A preterminal whose label and child label are <c>CoreLabel</c>s.</param>
/// <param name="morpho">Specification used to turn the analysis string into features.</param>
/// <exception cref="ArgumentException">
/// <paramref name="t"/> is not a preterminal, or one of the labels is not a CoreLabel.
/// </exception>
private static void ReplacePOSTag(Tree t, MorphoFeatureSpecification morpho)
{
    if (!t.IsPreTerminal())
    {
        throw new ArgumentException("Can only operate on preterminals");
    }

    CoreLabel tagLabel = t.Label() as CoreLabel;
    if (tagLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    Tree leaf = t.Children()[0];
    CoreLabel leafLabel = leaf.Label() as CoreLabel;
    if (leafLabel == null)
    {
        throw new ArgumentException("Only operates on CoreLabels");
    }

    // Morphological Analysis
    string analysis = leafLabel.OriginalText();
    if (string.IsNullOrEmpty(analysis))
    {
        analysis = tagLabel.Value();

        // POS subcategory
        string subCat = leafLabel.Category();
        analysis += string.IsNullOrEmpty(subCat) ? "---" : "-" + subCat + "--";
    }

    MorphoFeatures feats = morpho.StrToFeatures(analysis);
    var altTag = feats.GetAltTag();
    if (string.IsNullOrEmpty(altTag))
    {
        return;
    }

    tagLabel.SetValue(altTag);
    tagLabel.SetTag(altTag);
}
// static methods

/// <summary>
/// Sets the labels on the tree (except the leaves) to be the integer
/// value of the sentiment prediction.
/// </summary>
/// <remarks>
/// Children are relabeled before their parent. Having the predicted class as the
/// label value makes the tree easy to print with its string conversion.
/// </remarks>
/// <param name="tree">Tree whose non-leaf labels are <c>CoreLabel</c>s; modified in place.</param>
/// <exception cref="ArgumentException">A non-leaf node's label is not a CoreLabel.</exception>
private static void SetSentimentLabels(Tree tree)
{
    if (tree.IsLeaf())
    {
        return;
    }

    foreach (Tree child in tree.Children())
    {
        SetSentimentLabels(child);
    }

    CoreLabel cl = tree.Label() as CoreLabel;
    if (cl == null)
    {
        throw new ArgumentException("Required a tree with CoreLabels");
    }

    // C# has no static int.ToString(int) (a leftover of Java's Integer.toString);
    // convert explicitly, and culture-invariantly so the label is always plain digits.
    cl.SetValue(System.Convert.ToString(RNNCoreAnnotations.GetPredictedClass(tree), System.Globalization.CultureInfo.InvariantCulture));
}
/// <summary>
/// Segments a line and returns one <c>CoreLabel</c> per token span found in the
/// IOB-labeled sequence.
/// </summary>
/// <remarks>
/// Each token's word, value and text annotation hold the string reconstructed
/// from the IOB sequence. Its original text is the substring of
/// <paramref name="line"/> between the begin position of the span's first
/// element and the end position of its last element.
/// </remarks>
/// <param name="line">Text to segment.</param>
/// <returns>The tokens in span order.</returns>
public virtual IList <CoreLabel> SegmentStringToTokenList(string line)
{
    IList <CoreLabel> result = CollectionUtils.MakeList();
    IList <CoreLabel> iobSequence = SegmentStringToIOB(line);

    foreach (IntPair span in IOBUtils.TokenSpansForIOB(iobSequence))
    {
        CoreLabel segment = new CoreLabel();

        string surface = IOBUtils.IOBToString(iobSequence, prefixMarker, suffixMarker, span.GetSource(), span.GetTarget());
        segment.SetWord(surface);
        segment.SetValue(surface);
        segment.Set(typeof(CoreAnnotations.TextAnnotation), surface);
        segment.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");

        // The span target is exclusive, hence the "- 1" for the last element.
        int charStart = iobSequence[span.GetSource()].BeginPosition();
        int charEnd = iobSequence[span.GetTarget() - 1].EndPosition();
        segment.SetOriginalText(Sharpen.Runtime.Substring(line, charStart, charEnd));
        segment.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charStart);
        segment.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charEnd);

        result.Add(segment);
    }

    return result;
}
/// <summary>
/// Creates a dependency from the text of its governor and dependent. Each string
/// is wrapped in a new <c>CoreLabel</c> (value and word) and also stored as text.
/// </summary>
/// <param name="regent">Text of the governing word.</param>
/// <param name="dependent">Text of the dependent word.</param>
/// <exception cref="ArgumentException">Either argument is null.</exception>
public UnnamedDependency(string regent, string dependent)
{
    bool anyMissing = regent == null || dependent == null;
    if (anyMissing)
    {
        throw new ArgumentException("governor or dependent cannot be null");
    }

    var governorLabel = new CoreLabel();
    governorLabel.SetValue(regent);
    governorLabel.SetWord(regent);
    this._regent = governorLabel;

    var dependentLabel = new CoreLabel();
    dependentLabel.SetValue(dependent);
    dependentLabel.SetWord(dependent);
    this._dependent = dependentLabel;

    // Plain-text copies of both labels.
    RegentText = regent;
    DependentText = dependent;
}
/// <summary>Parse a sentence represented as a list of tokens.</summary>
/// <remarks>
/// The text must already have been tokenized and normalized into tokens that are
/// appropriate to the treebank which was used to train the parser. The tokens
/// can be of multiple types, and the list items need not be homogeneous as to
/// type (in particular, only some words might be given tags):
/// <ul>
/// <li>If a token implements HasWord, then the word to be parsed is given by its
/// word() value.</li>
/// <li>If a token implements HasTag and the tag() value is not null or the empty
/// String, then the parser is strongly advised to assign a part of speech tag
/// that <i>begins</i> with this String.</li>
/// </ul>
/// All parse status flags are reset on entry. When a word function is
/// configured, the parsers work on copies of the tokens with that function
/// applied to each word. A boundary token is always appended before parsing.
/// </remarks>
/// <param name="sentence">The sentence to parse</param>
/// <returns>true iff the sentence was accepted by the grammar</returns>
/// <exception cref="System.NotSupportedException">
/// If the sentence is of zero length or longer than the configured maximum
/// </exception>
private bool ParseInternal <_T0>(IList <_T0> sentence) where _T0 : IHasWord
{
    // Reset all per-parse status.
    parseSucceeded = false;
    parseNoMemory = false;
    parseUnparsable = false;
    parseSkipped = false;
    parseFallback = false;
    whatFailed = null;
    addedPunct = false;
    originalSentence = sentence;

    int length = sentence.Count;
    if (length == 0)
    {
        parseSkipped = true;
        throw new NotSupportedException("Can't parse a zero-length sentence!");
    }

    IList <IHasWord> sentenceB;
    if (op.wordFunction == null)
    {
        sentenceB = new List <IHasWord>(sentence);
    }
    else
    {
        // Copy every token so the word function never touches the caller's objects.
        sentenceB = Generics.NewArrayList();
        foreach (IHasWord word in originalSentence)
        {
            ILabel asLabel = word as ILabel;
            if (asLabel != null)
            {
                ILabel copiedLabel = asLabel.LabelFactory().NewLabel(asLabel);
                IHasWord copiedWord = copiedLabel as IHasWord;
                if (copiedWord == null)
                {
                    throw new AssertionError("This should have been a HasWord");
                }
                sentenceB.Add(copiedWord);
            }
            else if (word is IHasTag)
            {
                TaggedWord tw = new TaggedWord(word.Word(), ((IHasTag)word).Tag());
                sentenceB.Add(tw);
            }
            else
            {
                sentenceB.Add(new Word(word.Word()));
            }
        }
        foreach (IHasWord copied in sentenceB)
        {
            copied.SetWord(op.wordFunction.Apply(copied.Word()));
        }
    }

    if (op.testOptions.addMissingFinalPunctuation)
    {
        addedPunct = AddSentenceFinalPunctIfNeeded(sentenceB, length);
    }
    if (length > op.testOptions.maxLength)
    {
        parseSkipped = true;
        throw new NotSupportedException("Sentence too long: length " + length);
    }

    TreePrint treePrint = GetTreePrint();
    PrintWriter pwOut = op.tlpParams.Pw();

    // Insert the boundary symbol
    if (sentence[0] is CoreLabel)
    {
        CoreLabel boundary = new CoreLabel();
        boundary.SetWord(LexiconConstants.Boundary);
        boundary.SetValue(LexiconConstants.Boundary);
        boundary.SetTag(LexiconConstants.BoundaryTag);
        // 1-based indexing used in the parser
        boundary.SetIndex(sentence.Count + 1);
        sentenceB.Add(boundary);
    }
    else
    {
        sentenceB.Add(new TaggedWord(LexiconConstants.Boundary, LexiconConstants.BoundaryTag));
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }
    if (op.doPCFG)
    {
        if (!pparser.Parse(sentenceB))
        {
            return parseSucceeded;
        }
        if (op.testOptions.verbose)
        {
            pwOut.Println("PParser output");
            // getBestPCFGParse(false).pennPrint(pwOut); // with scores on nodes
            // without scores on nodes
            treePrint.PrintTree(GetBestPCFGParse(false), pwOut);
        }
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }
    if (op.doDep && !op.testOptions.useFastFactored)
    {
        if (!dparser.Parse(sentenceB))
        {
            return parseSucceeded;
        }
        // cdm nov 2006: should move these printing bits to the main printing section,
        // so don't calculate the best parse twice!
        if (op.testOptions.verbose)
        {
            pwOut.Println("DParser output");
            treePrint.PrintTree(dparser.GetBestParse(), pwOut);
        }
    }

    if (Thread.Interrupted())
    {
        throw new RuntimeInterruptedException();
    }
    if (op.doPCFG && op.doDep)
    {
        if (!bparser.Parse(sentenceB))
        {
            return parseSucceeded;
        }
        parseSucceeded = true;
    }
    return true;
}
/// <summary>
/// Normalizes a whole tree: prunes and splices nodes, processes every remaining
/// node, applies the optional phrasal re-annotations and ensures the result is
/// rooted at <c>rootLabel</c>.
/// </summary>
/// <remarks>
/// Per-node processing:
/// <ul>
/// <li>Leaves containing the morphology mark are split. The word part becomes the
/// label's value and word; the analysis is stored as the label's original
/// text.</li>
/// <li>Preterminals copy their value into the label's tag.</li>
/// <li>Phrasal nodes get a "DUMMYTAG" preterminal spliced in above any direct leaf
/// child.</li>
/// </ul>
/// Log messages are built with .NET composite format placeholders.
/// </remarks>
/// <param name="tree">The tree to normalize.</param>
/// <param name="tf">Factory used for any newly created nodes.</param>
/// <returns>
/// The normalized tree, or null when the tree consists entirely of enclosing
/// brackets (readers are expected to move on to the next tree in that case).
/// </returns>
public override Tree NormalizeWholeTree(Tree tree, ITreeFactory tf)
{
    tree = tree.Prune(emptyFilter, tf).SpliceOut(aOverAFilter, tf);
    foreach (Tree t in tree)
    {
        if (t.IsLeaf())
        {
            // Strip off morphological analyses and place them in the OriginalTextAnnotation, which is
            // specified by HasContext.
            if (t.Value().Contains(MorphoFeatureSpecification.MorphoMark))
            {
                string[] toks = t.Value().Split(MorphoFeatureSpecification.MorphoMark);
                if (toks.Length != 2)
                {
                    log.Err(string.Format("{0}: Word contains malformed morph annotation: {1}", this.GetType().FullName, t.Value()));
                }
                else if (t.Label() is CoreLabel)
                {
                    CoreLabel cl = (CoreLabel)t.Label();
                    cl.SetValue(string.Intern(toks[0].Trim()));
                    cl.SetWord(string.Intern(toks[0].Trim()));
                    Pair <string, string> lemmaMorph = MorphoFeatureSpecification.SplitMorphString(toks[0], toks[1]);
                    string lemma = lemmaMorph.First();
                    string morphAnalysis = lemmaMorph.Second();
                    if (lemma.Equals(toks[0]))
                    {
                        cl.SetOriginalText(string.Intern(toks[1].Trim()));
                    }
                    else
                    {
                        // TODO(spenceg): Does this help?
                        string newLemma = lexMapper.Map(null, lemma);
                        if (newLemma == null || newLemma.Trim().IsEmpty())
                        {
                            newLemma = lemma;
                        }
                        string newMorphAnalysis = newLemma + MorphoFeatureSpecification.LemmaMark + morphAnalysis;
                        cl.SetOriginalText(string.Intern(newMorphAnalysis));
                    }
                }
                else
                {
                    log.Error(string.Format("{0}: Cannot store morph analysis in non-CoreLabel: {1}", this.GetType().FullName, t.Label().GetType().FullName));
                }
            }
        }
        else if (t.IsPreTerminal())
        {
            if (t.Value() == null || t.Value().IsEmpty())
            {
                log.Warn(string.Format("{0}: missing tag for {1}", this.GetType().FullName, t.PennString()));
            }
            else if (t.Label() is IHasTag)
            {
                ((IHasTag)t.Label()).SetTag(t.Value());
            }
        }
        else
        {
            // Phrasal nodes
            // there are some nodes "/" missing preterminals. We'll splice in a tag for these.
            int nk = t.NumChildren();
            IList <Tree> newKids = new List <Tree>(nk);
            for (int j = 0; j < nk; j++)
            {
                Tree child = t.GetChild(j);
                if (child.IsLeaf())
                {
                    log.Warn(string.Format("{0}: Splicing in DUMMYTAG for {1}", this.GetType().FullName, t.ToString()));
                    newKids.Add(tf.NewTreeNode("DUMMYTAG", Java.Util.Collections.SingletonList(child)));
                }
                else
                {
                    newKids.Add(child);
                }
            }
            t.SetChildren(newKids);
        }
    }

    // Every node in the tree has now been processed
    //
    // Additional processing for specific phrasal annotations
    //
    // special global coding for moving PRD annotation from constituent to verb tag.
    if (markPRDverb)
    {
        TregexMatcher m = prdVerbPattern.Matcher(tree);
        Tree match = null;
        while (m.Find())
        {
            // Only handle each distinct matched node once.
            if (m.GetMatch() != match)
            {
                match = m.GetMatch();
                match.Label().SetValue(match.Label().Value() + "-PRDverb");
                Tree prd = m.GetNode("prd");
                prd.Label().SetValue(base.NormalizeNonterminal(prd.Label().Value()));
            }
        }
    }

    // Mark *only* subjects in verb-initial clauses
    if (retainNPSbj)
    {
        TregexMatcher m = npSbjPattern.Matcher(tree);
        while (m.Find())
        {
            Tree match = m.GetMatch();
            match.Label().SetValue("NP");
        }
    }

    if (tree.IsPreTerminal())
    {
        // The whole tree is a bare tag: bad!
        string val = tree.Label().Value();
        if (val.Equals("CC") || val.StartsWith("PUNC", System.StringComparison.Ordinal) || val.Equals("CONJ"))
        {
            log.Warn(string.Format("{0}: Bare tagged word being wrapped in FRAG {1}", this.GetType().FullName, tree.PennString()));
            tree = tf.NewTreeNode("FRAG", Java.Util.Collections.SingletonList(tree));
        }
        else
        {
            log.Warn(string.Format("{0}: Bare tagged word {1}", this.GetType().FullName, tree.PennString()));
        }
    }

    // Add start symbol so that the root has only one sub-state. Escape any enclosing brackets.
    // If the "tree" consists entirely of enclosing brackets e.g. ((())) then this method
    // will return null. In this case, readers e.g. PennTreeReader will try to read the next tree.
    while (tree != null && (tree.Value() == null || tree.Value().IsEmpty()) && tree.NumChildren() <= 1)
    {
        tree = tree.FirstChild();
    }
    if (tree != null && !tree.Value().Equals(rootLabel))
    {
        tree = tf.NewTreeNode(rootLabel, Java.Util.Collections.SingletonList(tree));
    }
    return tree;
}
/*/**
 * Simple tree reading utility method. Given a tree formatted as a PTB string, returns a Tree made by a specific TreeFactory.
 #1#
public static Tree readTree(string ptbTreeString, TreeFactory treeFactory) {
    try {
        PennTreeReader ptr = new PennTreeReader(new StringReader(ptbTreeString), treeFactory);
        return ptr.readTree();
    } catch (IOException ex) {
        throw new SystemException(ex);
    }
}*/

/**
 * Simple tree reading utility method. Given a tree formatted as a PTB string, returns a Tree made by the default TreeFactory (LabeledScoredTreeFactory)
 */
/*public static Tree readTree(string str) {
    return readTree(str, defaultTreeFactory);
}*/

/// <summary>
/// Converts the labels of the tree to <c>CoreLabel</c>s, recursively and in place.
/// This is needed because additional info, like token span, is stored in the CoreLabel.
/// </summary>
/// <param name="tree">Root of the subtree to convert.</param>
public static void ConvertToCoreLabels(Tree tree)
{
    ILabel existing = tree.Label();
    bool needsConversion = !(existing is CoreLabel);
    if (needsConversion)
    {
        // Only the value is carried over to the replacement label.
        var converted = new CoreLabel();
        converted.SetValue(existing.Value());
        tree.SetLabel(converted);
    }

    Tree[] kids = tree.Children();
    for (int k = 0; k < kids.Length; k++)
    {
        ConvertToCoreLabels(kids[k]);
    }
}
/// <summary>
/// Rewrites the leaves of a tree so that they carry the lemma and/or the
/// morphological analysis.
/// </summary>
/// <remarks>
/// For every leaf label the token counter is incremented. The lemma falls back
/// to the word when absent; otherwise a lemma of "(" or ")" is PTB-escaped. With
/// <paramref name="lemmasAsLeaves"/> the lemma replaces word and value. With
/// <paramref name="addMorphoToLeaves"/> the leaf becomes
/// value + morpho mark + lemma + lemma mark + analysis.
/// </remarks>
/// <param name="tree">Tree whose leaves are <c>CoreLabel</c>s; modified in place.</param>
/// <param name="lemmasAsLeaves">Replace each leaf's word/value with its lemma.</param>
/// <param name="addMorphoToLeaves">Append lemma and morphological analysis to each leaf.</param>
/// <exception cref="ArgumentException">A leaf label is not a CoreLabel.</exception>
public static void MungeLeaves(Tree tree, bool lemmasAsLeaves, bool addMorphoToLeaves)
{
    IList <ILabel> labels = tree.Yield();
    foreach (ILabel label in labels)
    {
        ++nTokens;
        if (!(label is CoreLabel))
        {
            throw new ArgumentException("Only works with CoreLabels trees");
        }
        CoreLabel coreLabel = (CoreLabel)label;

        // PTB escaping since we're going to put this in the leaf
        string lemma = coreLabel.Lemma();
        if (lemma == null)
        {
            // No lemma, so just add the surface form
            lemma = coreLabel.Word();
        }
        else if (lemma.Equals("("))
        {
            lemma = "-LRB-";
        }
        else if (lemma.Equals(")"))
        {
            lemma = "-RRB-";
        }

        if (lemmasAsLeaves)
        {
            coreLabel.SetWord(lemma);
            coreLabel.SetValue(lemma);
            coreLabel.SetLemma(lemma);
        }

        if (addMorphoToLeaves)
        {
            string morphStr = coreLabel.OriginalText();
            if (string.IsNullOrEmpty(morphStr))
            {
                morphStr = MorphoFeatureSpecification.NoAnalysis;
            }
            else
            {
                ++nMorphAnalyses;
            }

            // Normalize punctuation analyses
            if (morphStr.StartsWith("PONCT", System.StringComparison.Ordinal))
            {
                morphStr = "PUNC";
            }

            // Plain concatenation: the former Java-style "%s" format string is not
            // understood by string.Format and was emitted literally.
            string newLeaf = string.Concat(coreLabel.Value(), MorphoFeatureSpecification.MorphoMark, lemma, MorphoFeatureSpecification.LemmaMark, morphStr);
            coreLabel.SetValue(newLeaf);
            coreLabel.SetWord(newLeaf);
        }
    }
}
// static class

/// <summary>
/// Builds a sentiment tree from parent pointers and labels every node with its
/// sentiment class.
/// </summary>
/// <remarks>
/// One preterminal-over-leaf subtree is created per word and one empty node per
/// remaining index up to the largest parent pointer. Nodes are connected via
/// <c>Connect</c>. Each node is then labeled with the class derived from the
/// sentiment score of the phrase it spans. Finally the leaves are escaped and the
/// configured tregex/tsurgeon patterns are applied to the root.
/// NOTE(review): <paramref name="phraseIds"/> is keyed by <c>IList&lt;string&gt;</c>;
/// lookups with freshly built lists only succeed if the dictionary was created
/// with a structural comparer. Confirm at the call site.
/// </remarks>
/// <param name="parentPointers">Parent index for every node; -1 marks the root.</param>
/// <param name="sentence">The words of the sentence.</param>
/// <param name="phraseIds">Map from phrase (list of words) to phrase id.</param>
/// <param name="sentimentScores">Map from phrase id to sentiment score.</param>
/// <param name="escaper">Used to escape the leaf values.</param>
/// <param name="numClasses">Number of sentiment classes.</param>
/// <returns>The root after the tsurgeon operations have been applied.</returns>
/// <exception cref="ArgumentException">More than one root (-1) is present.</exception>
/// <exception cref="KeyNotFoundException">A phrase id or sentiment score cannot be found.</exception>
public static Tree ConvertTree(IList <int> parentPointers, IList <string> sentence, IDictionary <IList <string>, int> phraseIds, IDictionary <int, double> sentimentScores, PTBEscapingProcessor escaper, int numClasses)
{
    int maxNode = 0;
    foreach (int parent in parentPointers)
    {
        maxNode = Math.Max(maxNode, parent);
    }

    Tree[] subtrees = new Tree[maxNode + 1];
    for (int i = 0; i < sentence.Count; ++i)
    {
        CoreLabel word = new CoreLabel();
        word.SetValue(sentence[i]);
        Tree leaf = new LabeledScoredTreeNode(word);
        subtrees[i] = new LabeledScoredTreeNode(new CoreLabel());
        subtrees[i].AddChild(leaf);
    }
    for (int i = sentence.Count; i <= maxNode; ++i)
    {
        subtrees[i] = new LabeledScoredTreeNode(new CoreLabel());
    }

    bool[] connected = new bool[maxNode + 1];
    Tree root = null;
    for (int index = 0; index < parentPointers.Count; ++index)
    {
        if (parentPointers[index] == -1)
        {
            if (root != null)
            {
                throw new ArgumentException("Found two roots for sentence " + string.Join(" ", sentence));
            }
            root = subtrees[index];
        }
        else
        {
            // Walk up the tree structure to make sure that leftmost
            // phrases are added first. Otherwise, if the numbers are
            // inverted, we might get the right phrase added to a parent
            // first, resulting in "case zero in this", for example,
            // instead of "in this case zero".
            // Note that because we keep track of which ones are already
            // connected, we process this at most once per parent, so the
            // overall construction time is still efficient.
            Connect(parentPointers, subtrees, connected, index);
        }
    }

    for (int i = 0; i <= maxNode; ++i)
    {
        IList <Tree> leaves = subtrees[i].GetLeaves();
        IList <string> words = CollectionUtils.TransformAsList(leaves, TransformTreeToWord);

        // First we look for a copy of the phrase with -LRB- -RRB-
        // instead of (). The sentiment trees sometimes have both, and
        // the escaped versions seem to have more reasonable scores.
        // If a particular phrase doesn't have -LRB- -RRB- we fall back
        // to the unescaped versions.
        int phraseId;
        if (!phraseIds.TryGetValue(CollectionUtils.TransformAsList(words, TransformParens), out phraseId)
            && !phraseIds.TryGetValue(words, out phraseId))
        {
            throw new KeyNotFoundException("Could not find phrase id for phrase " + string.Join(" ", sentence));
        }

        // TODO: should we make this an option? Perhaps we want cases
        // where the trees have the phrase id and not their class
        double score;
        if (!sentimentScores.TryGetValue(phraseId, out score))
        {
            throw new KeyNotFoundException("Could not find sentiment score for phrase id " + phraseId);
        }

        // TODO: make this a numClasses option
        // Scale the score by the class count and clamp to the top class.
        int classLabel = (int)Math.Floor(score * numClasses);
        if (classLabel > numClasses - 1)
        {
            classLabel = numClasses - 1;
        }
        subtrees[i].Label().SetValue(classLabel.ToString(System.Globalization.CultureInfo.InvariantCulture));
    }

    for (int i = 0; i < sentence.Count; ++i)
    {
        Tree leaf = subtrees[i].Children()[0];
        leaf.Label().SetValue(escaper.EscapeString(leaf.Label().Value()));
    }

    for (int i = 0; i < tregexPatterns.Length; ++i)
    {
        root = Edu.Stanford.Nlp.Trees.Tregex.Tsurgeon.Tsurgeon.ProcessPattern(tregexPatterns[i], tsurgeonPatterns[i], root);
    }
    return root;
}
/// <summary>
/// Creates an <c>Annotation</c> (with a single sentence) from the given specification.
/// </summary>
/// <remarks>
/// One token is built per word. Character positions assume a single separator
/// character between consecutive words. The graph produced by
/// <paramref name="tree"/> is stored as basic, collapsed and CC-processed
/// dependencies; the graph produced by <paramref name="maltTree"/> as the
/// alternative dependencies.
/// </remarks>
/// <param name="docid">Document id; "???" is used when absent.</param>
/// <param name="sentenceIndex">Sentence index; -1 is used when absent.</param>
/// <param name="gloss">Sentence text, with newline and tab written as \n and \t escapes.</param>
/// <param name="tree">Builds the main dependency graph from the tokens.</param>
/// <param name="maltTree">Builds the alternative dependency graph from the tokens.</param>
/// <param name="words">Token texts.</param>
/// <param name="lemmas">Lemma per token.</param>
/// <param name="pos">POS tag per token.</param>
/// <param name="ner">NER tag per token.</param>
/// <param name="sentenceid">Sentence id, used in error messages only.</param>
/// <returns>The document annotation.</returns>
/// <exception cref="ArgumentException">
/// lemmas, pos or ner differ in length from words.
/// </exception>
private static Annotation ParseSentence(Optional <string> docid, Optional <int> sentenceIndex, string gloss, Func <IList <CoreLabel>, SemanticGraph> tree, Func <IList <CoreLabel>, SemanticGraph> maltTree, IList <string> words, IList <string> lemmas, IList <string> pos, IList <string> ner, Optional <string> sentenceid)
{
    // Error checks
    int wordCount = words.Count;
    if (lemmas.Count != wordCount)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + lemmas.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (pos.Count != wordCount)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + pos.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (ner.Count != wordCount)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + ner.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }

    // Create structure
    IList <CoreLabel> tokens = new List <CoreLabel>(wordCount);
    int charCursor = 0;
    for (int i = 0; i < wordCount; ++i)
    {
        string wordText = words[i];
        CoreLabel token = new CoreLabel(12);
        token.SetWord(wordText);
        token.SetValue(wordText);
        token.SetBeginPosition(charCursor);
        token.SetEndPosition(charCursor + wordText.Length);
        // Skip over the word plus one separator character.
        charCursor += wordText.Length + 1;
        token.SetLemma(lemmas[i]);
        token.SetTag(pos[i]);
        token.SetNER(ner[i]);
        token.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
        token.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
        token.Set(typeof(CoreAnnotations.IndexAnnotation), i + 1);
        token.Set(typeof(CoreAnnotations.TokenBeginAnnotation), i);
        token.Set(typeof(CoreAnnotations.TokenEndAnnotation), i + 1);
        tokens.Add(token);
    }

    // Turn the escaped newline/tab sequences in the gloss back into real characters.
    gloss = gloss.Replace("\\n", "\n").Replace("\\t", "\t");

    ICoreMap sentence = new ArrayCoreMap(16);
    sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);

    SemanticGraph mainGraph = tree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), mainGraph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), mainGraph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), mainGraph);

    SemanticGraph alternativeGraph = maltTree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), alternativeGraph);

    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
    sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
    sentence.Set(typeof(CoreAnnotations.TextAnnotation), gloss);
    sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
    sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Count);

    Annotation document = new Annotation(gloss);
    document.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    document.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
    document.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
    document.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
    return document;
}
/// <summary>
/// Reads labeled text line by line and converts it into sentences of
/// <c>CoreLabel</c> tokens.
/// </summary>
/// <remarks>
/// Each line is either "id&lt;TAB&gt;text" or just "text" (the line number then
/// serves as id). Tokens of the form <c>&lt;CATEGORY&gt;</c> and
/// <c>&lt;/CATEGORY&gt;</c>, for the allowed categories, switch the current label
/// on and off and are not emitted as tokens. The current label is reset to the
/// background symbol "O" at the start of every line.
/// </remarks>
/// <param name="reader">Source of the lines; it is not closed here.</param>
/// <param name="categoriesAllowed">Category names recognized inside the label tokens.</param>
/// <param name="setClassForTheseLabels">
/// Optional map from label to annotation key; when the current label is present,
/// the token gets that annotation set to the label.
/// </param>
/// <param name="setGoldClass">Whether to store the current label as gold answer on each token.</param>
/// <param name="sentIDprefix">Prefix prepended to every line id.</param>
/// <returns>One core map per sentence, with text, tokens and document id set.</returns>
/// <exception cref="System.IO.IOException"/>
public static IList <ICoreMap> ParseFile(BufferedReader reader, ICollection <string> categoriesAllowed, IDictionary <string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
{
    Pattern startingLabelToken = Pattern.Compile("<(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    Pattern endLabelToken = Pattern.Compile("</(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    string backgroundSymbol = "O";
    IList <ICoreMap> sentences = new List <ICoreMap>();
    int lineNum = -1;
    string l = null;
    while ((l = reader.ReadLine()) != null)
    {
        lineNum++;
        string[] t = l.Split("\t", 2);
        string id = null;
        string text = null;
        if (t.Length == 2)
        {
            id = t[0];
            text = t[1];
        }
        else if (t.Length == 1)
        {
            text = t[0];
            id = lineNum.ToString();
        }
        id = sentIDprefix + id;

        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
        PTBTokenizer.PTBTokenizerFactory <CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
        dp.SetTokenizerFactory(tokenizerFactory);

        // The label persists across the sentences of one line.
        string label = backgroundSymbol;
        int sentNum = -1;
        foreach (IList <IHasWord> sentence in dp)
        {
            sentNum++;
            // A builder avoids re-copying the sentence text for every token.
            System.Text.StringBuilder sentText = new System.Text.StringBuilder();
            IList <CoreLabel> sent = new List <CoreLabel>();
            foreach (IHasWord tokw in sentence)
            {
                string tok = tokw.Word();
                Matcher startingMatcher = startingLabelToken.Matcher(tok);
                if (startingMatcher.Matches())
                {
                    label = startingMatcher.Group(1);
                    continue;
                }
                if (endLabelToken.Matcher(tok).Matches())
                {
                    label = backgroundSymbol;
                    continue;
                }

                sentText.Append(' ').Append(tok);

                CoreLabel c = new CoreLabel();
                c.SetWord(tok);
                c.SetLemma(tok);
                c.SetValue(tok);
                c.Set(typeof(CoreAnnotations.TextAnnotation), tok);
                c.Set(typeof(CoreAnnotations.OriginalTextAnnotation), tok);
                if (setGoldClass)
                {
                    c.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                }

                // Single key lookup instead of a membership test followed by the indexer.
                Type labelClass;
                if (setClassForTheseLabels != null && setClassForTheseLabels.TryGetValue(label, out labelClass))
                {
                    c.Set(labelClass, label);
                }
                sent.Add(c);
            }

            ICoreMap sentcm = new ArrayCoreMap();
            sentcm.Set(typeof(CoreAnnotations.TextAnnotation), sentText.ToString().Trim());
            sentcm.Set(typeof(CoreAnnotations.TokensAnnotation), sent);
            sentcm.Set(typeof(CoreAnnotations.DocIDAnnotation), id + "-" + sentNum);
            sentences.Add(sentcm);
        }
    }
    return sentences;
}