/// <summary>
/// Builds a CoreLabel for one XML-derived token, keeping the raw text as OriginalText and
/// recording the character offsets. Tokens matching separatorPattern are mapped to the
/// CoreNLP newline token; otherwise spaces may be normalized to non-breaking spaces.
/// </summary>
/// <param name="tokenText">The raw token text.</param>
/// <param name="doNormalization">Whether space normalization may be applied (also requires normalizeSpace).</param>
/// <param name="charOffsetBegin">Begin character offset of the token.</param>
/// <param name="charOffsetEnd">End character offset of the token.</param>
/// <returns>The populated CoreLabel.</returns>
private CoreLabel MakeXmlToken(string tokenText, bool doNormalization, int charOffsetBegin, int charOffsetEnd)
{
    CoreLabel token = new CoreLabel();
    // Keep the unmodified text before any mapping/normalization below.
    token.SetOriginalText(tokenText);
    if (separatorPattern.Matcher(tokenText).Matches())
    {
        // Map to CoreNLP newline token
        tokenText = AbstractTokenizer.NewlineToken;
    }
    else
    {
        if (doNormalization && normalizeSpace)
        {
            // change space to non-breaking space (U+00A0)
            tokenText = tokenText.Replace(' ', '\u00A0');
        }
    }
    token.SetWord(tokenText);
    token.SetValue(tokenText);
    token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), charOffsetBegin);
    token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), charOffsetEnd);
    if (Verbose)
    {
        log.Info("Adding token " + token.ToShorterString());
    }
    return(token);
}
/// <summary>Creates a CoreLabel whose text and value annotations are both the given token.</summary>
private static CoreLabel InitCoreLabel(string token)
{
    CoreLabel result = new CoreLabel();
    result.Set(typeof(CoreAnnotations.TextAnnotation), token);
    result.Set(typeof(CoreAnnotations.ValueAnnotation), token);
    return result;
}
// TODO not called any more, but possibly useful as a reference
/// <summary>
/// This should be called after the classifier has been trained and
/// parseAndTrain has been called to accumulate test set
/// This will return precision,recall and F1 measure
/// </summary>
public virtual void RunTestSet(IList<IList<CoreLabel>> testSet)
{
    // Per-class counters: true positives, false positives, false negatives, and gold counts.
    ICounter<string> tp = new ClassicCounter<string>();
    ICounter<string> fp = new ClassicCounter<string>();
    ICounter<string> fn = new ClassicCounter<string>();
    ICounter<string> actual = new ClassicCounter<string>();
    foreach (IList<CoreLabel> labels in testSet)
    {
        IList<CoreLabel> unannotatedLabels = new List<CoreLabel>();
        // create a new label without answer annotation
        foreach (CoreLabel label in labels)
        {
            CoreLabel newLabel = new CoreLabel();
            newLabel.Set(annotationForWord, label.Get(annotationForWord));
            newLabel.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), label.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)));
            unannotatedLabels.Add(newLabel);
        }
        IList<CoreLabel> annotatedLabels = this.classifier.Classify(unannotatedLabels);
        // Walk the gold labels and the classifier output in parallel by index.
        int ind = 0;
        foreach (CoreLabel expectedLabel in labels)
        {
            CoreLabel annotatedLabel = annotatedLabels[ind];
            string answer = annotatedLabel.Get(typeof(CoreAnnotations.AnswerAnnotation));
            string expectedAnswer = expectedLabel.Get(typeof(CoreAnnotations.AnswerAnnotation));
            actual.IncrementCount(expectedAnswer);
            // match only non background symbols
            if (!SeqClassifierFlags.DefaultBackgroundSymbol.Equals(expectedAnswer) && expectedAnswer.Equals(answer))
            {
                // true positives
                tp.IncrementCount(answer);
                System.Console.Out.WriteLine("True Positive:" + annotatedLabel);
            }
            else
            {
                if (!SeqClassifierFlags.DefaultBackgroundSymbol.Equals(answer))
                {
                    // false positives
                    fp.IncrementCount(answer);
                    System.Console.Out.WriteLine("False Positive:" + annotatedLabel);
                }
                else
                {
                    if (!SeqClassifierFlags.DefaultBackgroundSymbol.Equals(expectedAnswer))
                    {
                        // false negatives
                        fn.IncrementCount(expectedAnswer);
                        System.Console.Out.WriteLine("False Negative:" + expectedLabel);
                    }
                }
            }
            // else true negatives
            ind++;
        }
    }
    // The background symbol is not an entity class, so drop it from the gold counts.
    actual.Remove(SeqClassifierFlags.DefaultBackgroundSymbol);
}
/// <summary>Creates a CoreLabel from a token and its part-of-speech tag, mirroring text into value.</summary>
private static CoreLabel InitCoreLabel(string token, string posTag)
{
    CoreLabel tagged = new CoreLabel();
    tagged.Set(typeof(CoreAnnotations.TextAnnotation), token);
    tagged.Set(typeof(CoreAnnotations.ValueAnnotation), token);
    tagged.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), posTag);
    return tagged;
}
/// <summary>
/// Deserializes a single token from one tab-separated line.
/// Column layout: word, lemma, POS tag, NE tag, normalized NE tag,
/// begin character offset, end character offset, and (optionally) antecedent.
/// </summary>
/// <exception cref="RuntimeIOException">If the line has fewer than 7 columns.</exception>
private static CoreLabel LoadToken(string line, bool haveExplicitAntecedent)
{
    CoreLabel token = new CoreLabel();
    // Java-style split with negative limit: trailing empty fields are preserved,
    // so optional empty columns remain addressable by index.
    string[] bits = line.Split("\t", -1);
    if (bits.Length < 7)
    {
        throw new RuntimeIOException("ERROR: Invalid format token for serialized token (only " + bits.Length + " tokens): " + line);
    }
    // word (SpaceHolder placeholders were substituted for spaces at serialization time)
    string word = bits[0].ReplaceAll(SpaceHolder, " ");
    token.Set(typeof(CoreAnnotations.TextAnnotation), word);
    token.Set(typeof(CoreAnnotations.ValueAnnotation), word);
    // if(word.length() == 0) log.info("FOUND 0-LENGTH TOKEN!");
    // lemma
    // NOTE(review): the lemma is also stored when the word itself is empty — presumably the
    // serializer's convention for zero-length tokens; confirm against the writer side.
    if (bits[1].Length > 0 || bits[0].Length == 0)
    {
        string lemma = bits[1].ReplaceAll(SpaceHolder, " ");
        token.Set(typeof(CoreAnnotations.LemmaAnnotation), lemma);
    }
    // POS tag
    if (bits[2].Length > 0)
    {
        token.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), bits[2]);
    }
    // NE tag
    if (bits[3].Length > 0)
    {
        token.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), bits[3]);
    }
    // Normalized NE tag
    if (bits[4].Length > 0)
    {
        token.Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), bits[4]);
    }
    // Character offsets
    if (bits[5].Length > 0)
    {
        token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), System.Convert.ToInt32(bits[5]));
    }
    if (bits[6].Length > 0)
    {
        token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), System.Convert.ToInt32(bits[6]));
    }
    if (haveExplicitAntecedent)
    {
        // This block is specific to KBP
        // We may have AntecedentAnnotation
        if (bits.Length > 7)
        {
            string aa = bits[7].ReplaceAll(SpaceHolder, " ");
            if (aa.Length > 0)
            {
                token.Set(typeof(CoreAnnotations.AntecedentAnnotation), aa);
            }
        }
    }
    return(token);
}
// Arbitrary test input. We just need to segment something on multiple threads to reproduce
// the issue
/// <summary>Builds a one-token Chinese sentence used as shared input for the multithreaded segmenter test.</summary>
private static IList<CoreLabel> CreateTestTokens()
{
    CoreLabel token = new CoreLabel();
    // "你好，世界" ("Hello, world"). Written with Unicode escapes because the previous
    // literal had been corrupted into mojibake ("ä½ å¥½ï¼Œä¸–ç•Œ") by a UTF-8 bytes /
    // Latin-1 decoding mix-up, which made the test segment garbage instead of Chinese text.
    string text = "\u4F60\u597D\uFF0C\u4E16\u754C";
    token.SetWord(text);
    token.SetValue(text);
    token.Set(typeof(CoreAnnotations.ChineseSegAnnotation), "1");
    token.Set(typeof(CoreAnnotations.AnswerAnnotation), "0");
    IList<CoreLabel> labels = new List<CoreLabel>();
    labels.Add(token);
    return labels;
}
/// <summary>Builds a token sequence whose gold and guessed answer annotations come from parallel arrays.</summary>
private static IList<CoreLabel> MakeListCoreLabel(string[] gold, string[] guess)
{
    NUnit.Framework.Assert.AreEqual("Cannot run test on lists of different length", gold.Length, guess.Length);
    IList<CoreLabel> sentence = new List<CoreLabel>();
    for (int position = 0; position < gold.Length; position++)
    {
        CoreLabel token = new CoreLabel();
        token.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), gold[position]);
        token.Set(typeof(CoreAnnotations.AnswerAnnotation), guess[position]);
        sentence.Add(token);
    }
    return sentence;
}
/// <summary>
/// Verifies how CoreLabel.SetWord interacts with the lemma annotation and with
/// equality/hash-code semantics: setting a *different* word clears the lemma
/// (and stays cleared even if the old word is restored), while hash codes treat
/// a null lemma value the same as a missing lemma key.
/// </summary>
public virtual void TestCoreLabelSetWordBehavior()
{
    CoreLabel foo = new CoreLabel();
    foo.Set(typeof(CoreAnnotations.TextAnnotation), "foo");
    foo.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), "B");
    foo.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
    // Lemma gets removed with word
    ArrayCoreMap copy = new ArrayCoreMap(foo);
    NUnit.Framework.Assert.AreEqual(copy, foo);
    foo.SetWord("foo");
    NUnit.Framework.Assert.AreEqual(copy, foo);
    // same word set
    foo.SetWord("bar");
    NUnit.Framework.Assert.IsFalse(copy.Equals(foo));
    // lemma removed
    foo.SetWord("foo");
    NUnit.Framework.Assert.IsFalse(copy.Equals(foo));
    // still removed
    foo.Set(typeof(CoreAnnotations.LemmaAnnotation), "fool");
    NUnit.Framework.Assert.AreEqual(copy, foo);
    // back to normal
    // Hash code is consistent
    int hashCode = foo.GetHashCode();
    NUnit.Framework.Assert.AreEqual(copy.GetHashCode(), hashCode);
    foo.SetWord("bar");
    NUnit.Framework.Assert.IsFalse(hashCode == foo.GetHashCode());
    foo.SetWord("foo");
    NUnit.Framework.Assert.IsFalse(hashCode == foo.GetHashCode());
    // Hash code doesn't care between a value of null and the key not existing
    NUnit.Framework.Assert.IsTrue(foo.Lemma() == null);
    int lemmalessHashCode = foo.GetHashCode();
    foo.Remove(typeof(CoreAnnotations.LemmaAnnotation));
    NUnit.Framework.Assert.AreEqual(lemmalessHashCode, foo.GetHashCode());
    foo.SetLemma(null);
    NUnit.Framework.Assert.AreEqual(lemmalessHashCode, foo.GetHashCode());
    foo.SetLemma("fool");
    NUnit.Framework.Assert.AreEqual(hashCode, foo.GetHashCode());
    // Check equals
    foo.SetWord("bar");
    foo.SetWord("foo");
    ArrayCoreMap nulledCopy = new ArrayCoreMap(foo);
    NUnit.Framework.Assert.AreEqual(nulledCopy, foo);
    foo.Remove(typeof(CoreAnnotations.LemmaAnnotation));
    NUnit.Framework.Assert.AreEqual(nulledCopy, foo);
}
/// <summary>
/// Builds the initial parser State for a tagged sentence: each word becomes a preterminal
/// tree (tag node over word node) carrying head word/tag annotations and 1-based indices.
/// </summary>
/// <param name="words">The tagged words; each element must be or carry a tag.</param>
/// <returns>A State whose queue holds one preterminal per input word.</returns>
/// <exception cref="ArgumentException">If a word is untagged or cannot carry a tag.</exception>
public static State InitialStateFromTaggedSentence<_T0>(IList<_T0> words)
    where _T0 : IHasWord
{
    IList<Tree> preterminals = Generics.NewArrayList();
    for (int index = 0; index < words.Count; ++index)
    {
        IHasWord hw = words[index];
        CoreLabel wordLabel;
        string tag;
        if (hw is CoreLabel)
        {
            // Reuse the existing CoreLabel directly (its index may be overwritten below).
            wordLabel = (CoreLabel)hw;
            tag = wordLabel.Tag();
        }
        else
        {
            // Wrap other IHasWord implementations in a fresh CoreLabel.
            wordLabel = new CoreLabel();
            wordLabel.SetValue(hw.Word());
            wordLabel.SetWord(hw.Word());
            if (!(hw is IHasTag))
            {
                throw new ArgumentException("Expected tagged words");
            }
            tag = ((IHasTag)hw).Tag();
            wordLabel.SetTag(tag);
        }
        if (tag == null)
        {
            throw new ArgumentException("Input word not tagged");
        }
        CoreLabel tagLabel = new CoreLabel();
        tagLabel.SetValue(tag);
        // Index from 1. Tools downstream from the parser expect that
        // Internally this parser uses the index, so we have to
        // overwrite incorrect indices if the label is already indexed
        wordLabel.SetIndex(index + 1);
        tagLabel.SetIndex(index + 1);
        LabeledScoredTreeNode wordNode = new LabeledScoredTreeNode(wordLabel);
        LabeledScoredTreeNode tagNode = new LabeledScoredTreeNode(tagLabel);
        tagNode.AddChild(wordNode);
        // TODO: can we get away with not setting these on the wordLabel?
        wordLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
        wordLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
        tagLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), wordLabel);
        tagLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), tagLabel);
        preterminals.Add(tagNode);
    }
    return(new State(preterminals));
}
/// <summary>
/// Creates a new tree node with the given label value, inheriting the head word/tag
/// annotations from <paramref name="top"/>, and attaches the supplied children.
/// </summary>
internal static Tree CreateNode(Tree top, string label, params Tree[] children)
{
    CoreLabel topLabel = (CoreLabel)top.Label();
    CoreLabel nodeLabel = new CoreLabel();
    nodeLabel.SetValue(label);
    // Propagate the head annotations from the node we are building on top of.
    nodeLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), topLabel.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)));
    nodeLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), topLabel.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)));
    Tree result = new LabeledScoredTreeNode(nodeLabel);
    for (int i = 0; i < children.Length; ++i)
    {
        result.AddChild(children[i]);
    }
    return result;
}
/// <summary>
/// Copies the predicted class from each node of the predicted trees onto the corresponding
/// node of the gold trees, traversing both trees in lockstep.
/// </summary>
/// <param name="trees">The gold trees; must parallel this.predicted in count and shape.</param>
/// <exception cref="ArgumentException">If the tree lists or individual trees differ in length.</exception>
public override void PopulatePredictedLabels(IList<Tree> trees)
{
    if (trees.Count != this.predicted.Count)
    {
        throw new ArgumentException("Number of gold and predicted trees not equal!");
    }
    for (int i = 0; i < trees.Count; i++)
    {
        IEnumerator<Tree> goldTree = trees[i].GetEnumerator();
        IEnumerator<Tree> predictedTree = this.predicted[i].GetEnumerator();
        // Fix: use the non-short-circuiting | so BOTH enumerators advance every iteration.
        // With the original ||, predictedTree.MoveNext() was skipped whenever the gold
        // enumerator still had elements, so the predicted cursor never moved and the two
        // traversals immediately fell out of lockstep.
        while (goldTree.MoveNext() | predictedTree.MoveNext())
        {
            Tree goldNode = goldTree.Current;
            Tree predictedNode = predictedTree.Current;
            // One side exhausted before the other => structurally different trees.
            if (goldNode == null || predictedNode == null)
            {
                throw new ArgumentException("Trees not of equal length");
            }
            if (goldNode.IsLeaf())
            {
                continue;
            }
            CoreLabel label = (CoreLabel)goldNode.Label();
            label.Set(typeof(RNNCoreAnnotations.PredictedClass), RNNCoreAnnotations.GetPredictedClass(predictedNode));
        }
    }
}
/// <summary>
/// Runs sentiment analysis over each sentence: forward-propagates the sentiment model over
/// the (unary-collapsed) binarized tree, stores the annotated tree and the sentence-level
/// sentiment class, and — when the original parse tree is present — projects span-level
/// sentiment predictions onto its nodes.
/// </summary>
/// <exception cref="AssertionError">If a sentence has no binarized tree.</exception>
/// <exception cref="Exception">If the annotation contains no sentences.</exception>
public virtual void Annotate(Annotation annotation)
{
    if (annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        // TODO: parallelize
        IList<ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
        foreach (ICoreMap sentence in sentences)
        {
            Tree binarized = sentence.Get(typeof(TreeCoreAnnotations.BinarizedTreeAnnotation));
            if (binarized == null)
            {
                throw new AssertionError("Binarized sentences not built by parser");
            }
            Tree collapsedUnary = transformer.TransformTree(binarized);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(model, null);
            scorer.ForwardPropagateTree(collapsedUnary);
            sentence.Set(typeof(SentimentCoreAnnotations.SentimentAnnotatedTree), collapsedUnary);
            // The root's predicted class is the sentence-level sentiment.
            int sentiment = RNNCoreAnnotations.GetPredictedClass(collapsedUnary);
            sentence.Set(typeof(SentimentCoreAnnotations.SentimentClass), SentimentUtils.SentimentString(model, sentiment));
            Tree tree = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
            if (tree != null)
            {
                collapsedUnary.SetSpans();
                // map the sentiment annotations onto the tree
                IDictionary<IntPair, string> spanSentiment = Generics.NewHashMap();
                foreach (Tree bt in collapsedUnary)
                {
                    IntPair p = bt.GetSpan();
                    int sen = RNNCoreAnnotations.GetPredictedClass(bt);
                    string sentStr = SentimentUtils.SentimentString(model, sen);
                    if (!spanSentiment.Contains(p))
                    {
                        // we'll take the first = highest one discovered
                        spanSentiment[p] = sentStr;
                    }
                }
                if (((CoreLabel)tree.Label()).ContainsKey(typeof(CoreAnnotations.SpanAnnotation)))
                {
                    throw new InvalidOperationException("This code assumes you don't have SpanAnnotation");
                }
                tree.SetSpans();
                // Copy the per-span sentiment onto matching nodes of the original tree.
                foreach (Tree t in tree)
                {
                    IntPair p = t.GetSpan();
                    string str = spanSentiment[p];
                    if (str != null)
                    {
                        CoreLabel cl = (CoreLabel)t.Label();
                        cl.Set(typeof(SentimentCoreAnnotations.SentimentClass), str);
                        // SpanAnnotation was only needed for this mapping; remove it again.
                        cl.Remove(typeof(CoreAnnotations.SpanAnnotation));
                    }
                }
            }
        }
    }
    else
    {
        throw new Exception("unable to find sentences in: " + annotation);
    }
}
/// <summary>Add a binary node to the existing node on top of the stack</summary>
/// <param name="state">The current parser state.</param>
/// <param name="scoreDelta">Score increment contributed by this transition.</param>
/// <returns>A new State with the top two stack items combined under a new parent.</returns>
public virtual State Apply(State state, double scoreDelta)
{
    // Pop the two subtrees being combined; right was pushed last.
    TreeShapedStack<Tree> stack = state.stack;
    Tree right = stack.Peek();
    stack = stack.Pop();
    Tree left = stack.Peek();
    stack = stack.Pop();
    // The head child determines which annotations the new parent inherits.
    Tree head;
    if (side == BinaryTransition.Side.Left)
    {
        head = left;
    }
    else if (side == BinaryTransition.Side.Right)
    {
        head = right;
    }
    else
    {
        throw new ArgumentException("Unknown side " + side);
    }
    if (!(head.Label() is CoreLabel))
    {
        throw new ArgumentException("Stack should have CoreLabel nodes");
    }
    CoreLabel headLabel = (CoreLabel)head.Label();
    CoreLabel parentLabel = new CoreLabel();
    parentLabel.SetValue(label);
    parentLabel.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), headLabel.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)));
    parentLabel.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), headLabel.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)));
    Tree parent = new LabeledScoredTreeNode(parentLabel);
    parent.AddChild(left);
    parent.AddChild(right);
    stack = stack.Push(parent);
    return new State(stack, state.transitions.Push(this), state.separators, state.sentence, state.tokenPosition, state.score + scoreDelta, false);
}
/// <summary>
/// Applies the predicted true case (UPPER / LOWER / INIT_UPPER / O) to the token's word and
/// stores it as TrueCaseTextAnnotation; optionally overwrites the text/value annotations.
/// </summary>
/// <param name="l">The token to update; its TrueCaseAnnotation must already be set.</param>
private void SetTrueCaseText(CoreLabel l)
{
    string trueCase = l.GetString<CoreAnnotations.TrueCaseAnnotation>();
    string text = l.Word();
    string trueCaseText = text;
    switch (trueCase)
    {
        case "UPPER":
        {
            trueCaseText = text.ToUpper();
            break;
        }
        case "LOWER":
        {
            trueCaseText = text.ToLower();
            break;
        }
        case "INIT_UPPER":
        {
            // Fix: guard against a zero-length token — text[0] previously threw
            // IndexOutOfRangeException on empty words. Empty text is left unchanged.
            if (text.Length > 0)
            {
                trueCaseText = char.ToTitleCase(text[0]) + Sharpen.Runtime.Substring(text, 1).ToLower();
            }
            break;
        }
        case "O":
        {
            // The model predicted mixed case, so lookup the map:
            string lower = text.ToLower();
            if (mixedCaseMap.Contains(lower))
            {
                trueCaseText = mixedCaseMap[lower];
            }
            // else leave it as it was?
            break;
        }
    }
    // System.err.println(text + " was classified as " + trueCase + " and so became " + trueCaseText);
    l.Set(typeof(CoreAnnotations.TrueCaseTextAnnotation), trueCaseText);
    if (overwriteText)
    {
        l.Set(typeof(CoreAnnotations.TextAnnotation), trueCaseText);
        l.Set(typeof(CoreAnnotations.ValueAnnotation), trueCaseText);
    }
}
/// <summary>see merge(CoreMap base, CoreMap toBeMerged)</summary>
/// <remarks>Starts from a copy of <paramref name="@base"/>; keys present in <paramref name="toBeMerged"/> win on conflict.</remarks>
public static CoreLabel Merge(CoreLabel @base, CoreLabel toBeMerged)
{
    CoreLabel merged = new CoreLabel(@base.Size());
    // Copy every annotation from the base label first.
    foreach (Type baseKey in @base.KeySet())
    {
        merged.Set(baseKey, @base.Get(baseKey));
    }
    // Then overlay the second label, overwriting any shared keys.
    foreach (Type overrideKey in toBeMerged.KeySet())
    {
        merged.Set(overrideKey, toBeMerged.Get(overrideKey));
    }
    return merged;
}
/// <summary>Copies the CoreLabel cl with the new word part</summary>
/// <remarks>The copy keeps all of cl's annotations, but word/value/original text become the given part and the positions are replaced.</remarks>
private static CoreLabel CopyCoreLabel(CoreLabel cl, string part, int beginPosition, int endPosition)
{
    CoreLabel copy = new CoreLabel(cl);
    copy.SetWord(part);
    copy.SetValue(part);
    copy.SetBeginPosition(beginPosition);
    copy.SetEndPosition(endPosition);
    copy.Set(typeof(CoreAnnotations.OriginalTextAnnotation), part);
    return copy;
}
/// <summary>
/// Segments a line of text into CoreLabel tokens by running IOB labeling and then
/// collapsing each IOB span into a single token with character offsets into the line.
/// </summary>
public virtual IList<CoreLabel> SegmentStringToTokenList(string line)
{
    IList<CoreLabel> tokenList = CollectionUtils.MakeList();
    IList<CoreLabel> labeledSequence = SegmentStringToIOB(line);
    foreach (IntPair span in IOBUtils.TokenSpansForIOB(labeledSequence))
    {
        // Rebuild the surface text for this span and locate it in the original line.
        string text = IOBUtils.IOBToString(labeledSequence, prefixMarker, suffixMarker, span.GetSource(), span.GetTarget());
        int start = labeledSequence[span.GetSource()].BeginPosition();
        int end = labeledSequence[span.GetTarget() - 1].EndPosition();
        CoreLabel token = new CoreLabel();
        token.SetWord(text);
        token.SetValue(text);
        token.Set(typeof(CoreAnnotations.TextAnnotation), text);
        token.Set(typeof(CoreAnnotations.ArabicSegAnnotation), "1");
        token.SetOriginalText(Sharpen.Runtime.Substring(line, start, end));
        token.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), start);
        token.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), end);
        tokenList.Add(token);
    }
    return tokenList;
}
/// <summary>Splits a compound marked by the lexer.</summary>
/// <remarks>
/// Dashes are padded with spaces so they become standalone parts; all parts are queued in
/// compoundBuffer and the first one is returned immediately.
/// </remarks>
private CoreLabel ProcessCompound(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    string[] parts = cl.Word().ReplaceAll("-", " - ").Split("\\s+");
    foreach (string piece in parts)
    {
        CoreLabel partLabel = new CoreLabel(cl);
        partLabel.SetWord(piece);
        partLabel.SetValue(piece);
        partLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), piece);
        compoundBuffer.Add(partLabel);
    }
    return compoundBuffer.Remove(0);
}
/// <summary>Sets the label of the leaf nodes of a Tree to be the CoreLabels in the given sentence.</summary>
/// <remarks>
/// Sets the label of the leaf nodes of a Tree to be the CoreLabels in the given sentence.
/// The original value() of the Tree nodes is preserved, and otherwise the label of tree
/// leaves becomes the label from the List.
/// </remarks>
public static void MergeLabels(Tree tree, IList<CoreLabel> sentence)
{
    // todo [cdm 2015]: This clearly shouldn't be here! Maybe it's not needed at all now since parsing code does this?
    int next = 0;
    foreach (Tree leaf in tree.GetLeaves())
    {
        CoreLabel replacement = sentence[next];
        next += 1;
        // Keep the leaf's original value on the replacement label before swapping it in.
        replacement.Set(typeof(CoreAnnotations.ValueAnnotation), leaf.Value());
        leaf.SetLabel(replacement);
    }
    tree.IndexLeaves();
}
/// <summary>
/// Recursively collects the labels of the leaves under <paramref name="t"/> into
/// <paramref name="l"/>, tagging each leaf label with its preterminal's label.
/// </summary>
private static void TaggedLeafLabels(Tree t, IList<CoreLabel> l)
{
    if (t.IsPreTerminal())
    {
        // A preterminal has exactly its leaf as child; record its tag on the leaf label.
        CoreLabel leafLabel = (CoreLabel)t.GetChild(0).Label();
        leafLabel.Set(typeof(CoreAnnotations.TagLabelAnnotation), t.Label());
        l.Add(leafLabel);
        return;
    }
    foreach (Tree child in t.Children())
    {
        TaggedLeafLabels(child, l);
    }
}
/// <summary>Create a datum from a string.</summary>
/// <remarks>
/// Create a datum from a string. The CoreAnnotations must correspond to those used by
/// SequenceClassifier. The following annotations are copied from the provided
/// CoreLabel cl, if present:
/// DomainAnnotation
/// startOffset and endOffset will be added to the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/>
/// of
/// the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreLabel"/>
/// cl to give the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/>
/// and
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetEndAnnotation"/>
/// of the resulting datum. cl must be non-null.
/// </remarks>
private static CoreLabel CreateDatum(CoreLabel cl, string token, string label, int startOffset, int endOffset)
{
    CoreLabel newTok = new CoreLabel();
    newTok.Set(typeof(CoreAnnotations.TextAnnotation), token);
    newTok.Set(typeof(CoreAnnotations.CharAnnotation), token);
    newTok.Set(typeof(CoreAnnotations.AnswerAnnotation), label);
    newTok.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
    newTok.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) + startOffset);
    newTok.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) + endOffset);
    // Fix: the original guarded this copy with "cl != null", but cl is dereferenced
    // unconditionally above, so a null cl would already have thrown before reaching here.
    // The dead check only obscured the contract that cl must be non-null.
    if (cl.ContainsKey(typeof(CoreAnnotations.DomainAnnotation)))
    {
        newTok.Set(typeof(CoreAnnotations.DomainAnnotation), cl.Get(typeof(CoreAnnotations.DomainAnnotation)));
    }
    return newTok;
}
/// <summary>Remove everything but the skeleton, the predictions, and the labels</summary>
private Tree SimplifyTree(Tree tree)
{
    // Build a stripped-down label keeping only the prediction vector and the node value.
    CoreLabel strippedLabel = new CoreLabel();
    strippedLabel.Set(typeof(RNNCoreAnnotations.Predictions), RNNCoreAnnotations.GetPredictions(tree));
    strippedLabel.SetValue(tree.Label().Value());
    if (tree.IsLeaf())
    {
        return tree.TreeFactory().NewLeaf(strippedLabel);
    }
    // Recurse into the children, preserving the original arity.
    IList<Tree> simplifiedChildren = Generics.NewArrayList(tree.Children().Length);
    foreach (Tree child in tree.Children())
    {
        simplifiedChildren.Add(SimplifyTree(child));
    }
    return tree.TreeFactory().NewTreeNode(strippedLabel, simplifiedChildren);
}
/// <summary>Splits a compound marked by the lexer.</summary>
/// <remarks>
/// Dashes are padded with spaces so each dash becomes its own part; the parts (including
/// the dashes) are queued in compoundBuffer with begin/end positions derived from the
/// original token's begin position, and the first part is returned immediately.
/// </remarks>
private CoreLabel ProcessCompound(CoreLabel cl)
{
    cl.Remove(typeof(CoreAnnotations.ParentAnnotation));
    string[] parts = pSpace.Split(pDash.Matcher(cl.Word()).ReplaceAll(" - "));
    // Running length of the parts consumed so far. Because the dashes themselves appear as
    // parts (only artificial spaces are dropped by the split), this tracks positions in the
    // original word correctly.
    int lengthAccum = 0;
    foreach (string part in parts)
    {
        CoreLabel newLabel = new CoreLabel(cl);
        newLabel.SetWord(part);
        newLabel.SetValue(part);
        newLabel.SetBeginPosition(cl.BeginPosition() + lengthAccum);
        newLabel.SetEndPosition(cl.BeginPosition() + lengthAccum + part.Length);
        newLabel.Set(typeof(CoreAnnotations.OriginalTextAnnotation), part);
        compoundBuffer.Add(newLabel);
        lengthAccum += part.Length;
    }
    // Return the first split token; the rest remain queued in compoundBuffer.
    return(compoundBuffer.Remove(0));
}
/// <summary>
/// Applies every token-sequence pattern to every assigned sentence, collecting (a) candidate
/// phrase counts per pattern, (b) the matched token spans per pattern, and (c) phrases that
/// were already fully labeled. Tokens inside a match are marked with MatchedPattern(s).
/// </summary>
/// <exception cref="System.Exception"/>
public virtual Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>> Call()
{
    // CollectionValuedMap<String, Integer> tokensMatchedPattern = new
    // CollectionValuedMap<String, Integer>();
    try
    {
        ICollection<CandidatePhrase> alreadyLabeledPhrases = new HashSet<CandidatePhrase>();
        TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<CandidatePhrase, E>();
        CollectionValuedMap<E, Triple<string, int, int>> matchedTokensByPat = new CollectionValuedMap<E, Triple<string, int, int>>();
        foreach (string sentid in sentids)
        {
            IList<CoreLabel> sent = sents[sentid].GetTokens();
            foreach (KeyValuePair<TokenSequencePattern, E> pEn in patterns)
            {
                if (pEn.Key == null)
                {
                    throw new Exception("why is the pattern " + pEn + " null?");
                }
                TokenSequenceMatcher m = ((TokenSequenceMatcher)pEn.Key.GetMatcher(sent));
                // //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                // m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                //Higher branch values makes the faster but uses more memory
                m.SetBranchLimit(5);
                while (m.Find())
                {
                    // [s, e) is the token span captured by the $term group.
                    int s = m.Start("$term");
                    int e = m.End("$term");
                    System.Diagnostics.Debug.Assert(e - s <= PatternFactory.numWordsCompoundMapped[label], "How come the pattern " + pEn.Key + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped[label] + " for label " + label);
                    string phrase = string.Empty;
                    string phraseLemma = string.Empty;
                    bool useWordNotLabeled = false;
                    bool doNotUse = false;
                    //find if the neighboring words are labeled - if so - club them together
                    if (constVars.clubNeighboringLabeledWords)
                    {
                        // Expand s leftwards over contiguously labeled tokens.
                        for (int i = s - 1; i >= 0; i--)
                        {
                            if (!sent[i].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                s = i + 1;
                                break;
                            }
                        }
                        // Expand e rightwards over contiguously labeled tokens.
                        for (int i_1 = e; i_1 < sent.Count; i_1++)
                        {
                            if (!sent[i_1].Get(constVars.GetAnswerClass()[label]).Equals(label))
                            {
                                e = i_1;
                                break;
                            }
                        }
                    }
                    //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    bool[] addedindices = new bool[e - s];
                    // Arrays.fill(addedindices, false);
                    // not needed as initialized false
                    for (int i_2 = s; i_2 < e; i_2++)
                    {
                        CoreLabel l = sent[i_2];
                        // Mark the token as matched (and record which pattern matched it).
                        l.Set(typeof(PatternsAnnotations.MatchedPattern), true);
                        if (!l.ContainsKey(typeof(PatternsAnnotations.MatchedPatterns)) || l.Get(typeof(PatternsAnnotations.MatchedPatterns)) == null)
                        {
                            l.Set(typeof(PatternsAnnotations.MatchedPatterns), new HashSet<Pattern>());
                        }
                        SurfacePattern pSur = (SurfacePattern)pEn.Value;
                        System.Diagnostics.Debug.Assert(pSur != null, "Why is " + pEn.Value + " not present in the index?!");
                        System.Diagnostics.Debug.Assert(l.Get(typeof(PatternsAnnotations.MatchedPatterns)) != null, "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.KeySet());
                        l.Get(typeof(PatternsAnnotations.MatchedPatterns)).Add(pSur);
                        // Discard phrases containing tokens carrying any of the ignore classes.
                        foreach (KeyValuePair<Type, object> ig in constVars.GetIgnoreWordswithClassesDuringSelection()[label])
                        {
                            if (l.ContainsKey(ig.Key) && l.Get(ig.Key).Equals(ig.Value))
                            {
                                doNotUse = true;
                            }
                        }
                        bool containsStop = ContainsStopWord(l, constVars.GetCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop)
                        {
                            doNotUse = true;
                        }
                        else
                        {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases)
                            {
                                // Track whether at least one span token is not yet labeled.
                                if (label == null || l.Get(constVars.GetAnswerClass()[label]) == null || !l.Get(constVars.GetAnswerClass()[label]).Equals(label))
                                {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.Word();
                                phraseLemma += " " + l.Lemma();
                                addedindices[i_2 - s] = true;
                            }
                        }
                    }
                    // Reject phrases where a stopword was dropped in the MIDDLE of the span
                    // (an un-added index flanked by added ones), as opposed to at the ends.
                    for (int i_3 = 0; i_3 < addedindices.Length; i_3++)
                    {
                        if (i_3 > 0 && i_3 < addedindices.Length - 1 && addedindices[i_3 - 1] == true && addedindices[i_3] == false && addedindices[i_3 + 1] == true)
                        {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse)
                    {
                        matchedTokensByPat.Add(pEn.Value, new Triple<string, int, int>(sentid, s, e - 1));
                        phrase = phrase.Trim();
                        if (!phrase.IsEmpty())
                        {
                            phraseLemma = phraseLemma.Trim();
                            CandidatePhrase candPhrase = CandidatePhrase.CreateOrGet(phrase, phraseLemma);
                            allFreq.IncrementCount(candPhrase, pEn.Value, 1.0);
                            if (!useWordNotLabeled)
                            {
                                // Every token in the span already carried the label.
                                alreadyLabeledPhrases.Add(candPhrase);
                            }
                        }
                    }
                }
            }
        }
        return(new Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<string, int, int>>, ICollection<CandidatePhrase>>(allFreq, matchedTokensByPat, alreadyLabeledPhrases));
    }
    catch (Exception e)
    {
        logger.Error(e);
        throw;
    }
}
/// <summary>
/// Find the operators in this sentence, annotating the head word (only!) of each operator with the
/// <see cref="OperatorAnnotation"/>
/// .
/// </summary>
/// <param name="sentence">
/// As in
/// <see cref="DoOneSentence(Edu.Stanford.Nlp.Pipeline.Annotation, Edu.Stanford.Nlp.Util.ICoreMap)"/>
/// </param>
private void AnnotateOperators(ICoreMap sentence)
{
    SemanticGraph tree = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
    IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    if (tree == null)
    {
        // Fall back to enhanced dependencies when basic dependencies are absent.
        tree = sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
    }
    foreach (SemgrexPattern pattern in Patterns)
    {
        SemgrexMatcher matcher = pattern.Matcher(tree);
        while (matcher.Find())
        {
            // Get terms
            IndexedWord properSubject = matcher.GetNode("Subject");
            IndexedWord quantifier;
            IndexedWord subject;
            bool namedEntityQuantifier = false;
            if (properSubject != null)
            {
                // A proper-noun subject acts as both quantifier and subject.
                quantifier = subject = properSubject;
                namedEntityQuantifier = true;
            }
            else
            {
                quantifier = matcher.GetNode("quantifier");
                subject = matcher.GetNode("subject");
            }
            IndexedWord @object = matcher.GetNode("object");
            // Validate quantifier
            // At the end of this
            Optional<Triple<Operator, int, int>> quantifierInfo;
            if (namedEntityQuantifier)
            {
                // named entities have the "all" semantics by default.
                if (!neQuantifiers)
                {
                    continue;
                }
                quantifierInfo = Optional.Of(Triple.MakeTriple(Operator.ImplicitNamedEntity, quantifier.Index(), quantifier.Index()));
            }
            else
            {
                // note: empty quantifier span given
                // find the quantifier, and return some info about it.
                quantifierInfo = ValidateQuantifierByHead(sentence, quantifier, @object == null || subject == null);
            }
            // Awful hacks to regularize the subject of things like "one of" and "there are"
            // (fix up 'there are')
            if ("be".Equals(subject == null ? null : subject.Lemma()))
            {
                bool hasExpl = false;
                IndexedWord newSubject = null;
                foreach (SemanticGraphEdge outgoingEdge in tree.OutgoingEdgeIterable(subject))
                {
                    if ("nsubj".Equals(outgoingEdge.GetRelation().ToString()))
                    {
                        newSubject = outgoingEdge.GetDependent();
                    }
                    else
                    {
                        if ("expl".Equals(outgoingEdge.GetRelation().ToString()))
                        {
                            hasExpl = true;
                        }
                    }
                }
                if (hasExpl)
                {
                    // Existential "there": the real subject is the nsubj dependent.
                    subject = newSubject;
                }
            }
            // (fix up '$n$ of')
            if ("CD".Equals(subject == null ? null : subject.Tag()))
            {
                foreach (SemanticGraphEdge outgoingEdge in tree.OutgoingEdgeIterable(subject))
                {
                    string rel = outgoingEdge.GetRelation().ToString();
                    if (rel.StartsWith("nmod"))
                    {
                        subject = outgoingEdge.GetDependent();
                    }
                }
            }
            // Set tokens
            if (quantifierInfo.IsPresent())
            {
                // Compute span
                IndexedWord pivot = matcher.GetNode("pivot");
                if (pivot == null)
                {
                    pivot = @object;
                }
                OperatorSpec scope = ComputeScope(tree, quantifierInfo.Get().first, pivot, Pair.MakePair(quantifierInfo.Get().second, quantifierInfo.Get().third), subject, namedEntityQuantifier, @object, tokens.Count);
                // Set annotation
                CoreLabel token = sentence.Get(typeof(CoreAnnotations.TokensAnnotation))[quantifier.Index() - 1];
                OperatorSpec oldScope = token.Get(typeof(NaturalLogicAnnotations.OperatorAnnotation));
                // Keep the longer quantifier span; merge when the same operator instance repeats.
                if (oldScope == null || oldScope.QuantifierLength() < scope.QuantifierLength() || oldScope.instance != scope.instance)
                {
                    token.Set(typeof(NaturalLogicAnnotations.OperatorAnnotation), scope);
                }
                else
                {
                    token.Set(typeof(NaturalLogicAnnotations.OperatorAnnotation), OperatorSpec.Merge(oldScope, scope));
                }
            }
        }
    }
    // Ensure we didn't select overlapping quantifiers. For example, "a" and "a few" can often overlap.
    // In these cases, take the longer quantifier match.
    IList<OperatorSpec> quantifiers = new List<OperatorSpec>();
    for (int i = 0; i < tokens.Count; ++i)
    {
        CoreLabel token = tokens[i];
        OperatorSpec @operator;
        if ((@operator = token.Get(typeof(NaturalLogicAnnotations.OperatorAnnotation))) != null)
        {
            if (i == 0 && @operator.instance == Operator.No && tokens.Count > 2 && "PRP".Equals(tokens[1].Get(typeof(CoreAnnotations.PartOfSpeechAnnotation))))
            {
                // This is pragmatically not a negation -- ignore it
                // For example, "no I don't like candy" or "no you like cats"
                token.Remove(typeof(NaturalLogicAnnotations.OperatorAnnotation));
            }
            else
            {
                quantifiers.Add(@operator);
            }
        }
    }
    quantifiers.Sort(null);
    // Drop annotations on non-head tokens within each quantifier span.
    foreach (OperatorSpec quantifier_1 in quantifiers)
    {
        for (int i_1 = quantifier_1.quantifierBegin; i_1 < quantifier_1.quantifierEnd; ++i_1)
        {
            if (i_1 != quantifier_1.quantifierHead)
            {
                tokens[i_1].Remove(typeof(NaturalLogicAnnotations.OperatorAnnotation));
            }
        }
    }
}
/// <summary>
/// Annotate any unary quantifiers that weren't found in the main
/// <see cref="AnnotateOperators(Edu.Stanford.Nlp.Util.ICoreMap)"/>
/// method.
/// </summary>
/// <param name="sentence">The sentence to annotate.</param>
private static void AnnotateUnaries(ICoreMap sentence)
{
    // Prefer the basic dependency graph; fall back to the enhanced one if absent.
    SemanticGraph tree = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
    if (tree == null)
    {
        tree = sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
    }
    IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    // Mark every token index already covered by an existing operator's quantifier span.
    bool[] covered = new bool[tokens.Count];
    for (int tokenIndex = 0; tokenIndex < covered.Length; ++tokenIndex)
    {
        OperatorSpec existing = tokens[tokenIndex].Get(typeof(NaturalLogicAnnotations.OperatorAnnotation));
        if (existing == null)
        {
            continue;
        }
        for (int k = existing.quantifierBegin; k < existing.quantifierEnd; ++k)
        {
            covered[k] = true;
        }
    }
    // Pass 1: Semgrex matches for unary quantifiers over the dependency tree.
    SemgrexMatcher matcher = UnaryPattern.Matcher(tree);
    while (matcher.Find())
    {
        IndexedWord quantifier = matcher.GetNode("quantifier");
        string word = quantifier.Word().ToLower();
        // Articles and cardinal numbers are absurdly common and uninformative;
        // annotating them mostly amplifies parse errors and idiomatic expressions.
        bool uninformative = word.Equals("a") || word.Equals("an") || word.Equals("the") || "CD".Equals(quantifier.Tag());
        if (uninformative)
        {
            continue;
        }
        IndexedWord subject = matcher.GetNode("subject");
        // Skip positions that already carry an operator annotation.
        if (covered[quantifier.Index() - 1])
        {
            continue;
        }
        Optional<Triple<Operator, int, int>> quantifierInfo = ValidateQuantifierByHead(sentence, quantifier, true);
        if (!quantifierInfo.IsPresent())
        {
            continue;
        }
        // A quantifier span was found: compute its scope and register the unary operator.
        OperatorSpec scope = ComputeScope(tree, quantifierInfo.Get().first, subject, Pair.MakePair(quantifierInfo.Get().second, quantifierInfo.Get().third), null, false, null, tokens.Count);
        tokens[quantifier.Index() - 1].Set(typeof(NaturalLogicAnnotations.OperatorAnnotation), scope);
    }
    // Pass 2: TokensRegex matches for "doubt"-style triggers, which get a
    // general negative-polarity operator scoped over the matched target tokens.
    TokenSequenceMatcher tokenMatcher = DoubtPattern.Matcher(tokens);
    while (tokenMatcher.Find())
    {
        IList<CoreLabel> doubt = (IList<CoreLabel>)tokenMatcher.GroupNodes("$doubt");
        IList<CoreLabel> target = (IList<CoreLabel>)tokenMatcher.GroupNodes("$target");
        foreach (CoreLabel trigger in doubt)
        {
            OperatorSpec spec = new OperatorSpec(Operator.GeneralNegPolarity, trigger.Index() - 1, trigger.Index(), target[0].Index() - 1, target[target.Count - 1].Index(), 0, 0, tokens.Count);
            trigger.Set(typeof(NaturalLogicAnnotations.OperatorAnnotation), spec);
        }
    }
}
/// <summary>Annotate every token for its polarity, based on the operators found.</summary>
/// <remarks>
/// Annotate every token for its polarity, based on the operators found. This function will set the
/// <see cref="PolarityAnnotation"/>
/// for every token, and additionally sets a coarse "up"/"down"/"flat"
/// <see cref="PolarityDirectionAnnotation"/> on tokens that received a polarity.
/// </remarks>
/// <param name="sentence">
/// As in
/// <see cref="DoOneSentence(Edu.Stanford.Nlp.Pipeline.Annotation, Edu.Stanford.Nlp.Util.ICoreMap)"/>
/// </param>
private static void AnnotatePolarity(ICoreMap sentence)
{
    // Collect all the operators in this sentence
    IList<OperatorSpec> operators = new List<OperatorSpec>();
    IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    foreach (CoreLabel token in tokens)
    {
        OperatorSpec specOrNull = token.Get(typeof(NaturalLogicAnnotations.OperatorAnnotation));
        if (specOrNull != null)
        {
            operators.Add(specOrNull);
        }
    }
    // Make sure every node of the dependency tree has a polarity.
    // This is separate from the code below in case the tokens in the dependency
    // tree don't correspond to the tokens in the sentence. This happens at least
    // when the constituency parser craps out on a long sentence, and the
    // dependency tree is put together haphazardly.
    if (sentence.ContainsKey(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)))
    {
        foreach (IndexedWord token_1 in sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)).VertexSet())
        {
            token_1.Set(typeof(NaturalLogicAnnotations.PolarityAnnotation), Polarity.Default);
        }
    }
    if (sentence.ContainsKey(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)))
    {
        foreach (IndexedWord token_1 in sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)).VertexSet())
        {
            token_1.Set(typeof(NaturalLogicAnnotations.PolarityAnnotation), Polarity.Default);
        }
    }
    if (sentence.ContainsKey(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation)))
    {
        foreach (IndexedWord token_1 in sentence.Get(typeof(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation)).VertexSet())
        {
            token_1.Set(typeof(NaturalLogicAnnotations.PolarityAnnotation), Polarity.Default);
        }
    }
    // Set polarity for each token
    for (int i = 0; i < tokens.Count; ++i)
    {
        CoreLabel token_1 = tokens[i];
        // Get operators in scope. Each triple is
        // (argument-span length, monotonicity, monotonicity type); the span
        // length (subjectEnd - subjectBegin / objectEnd - objectBegin) is used
        // below as a proxy for how wide the operator's scope is.
        IList<Triple<int, Monotonicity, MonotonicityType>> inScope = new List<Triple<int, Monotonicity, MonotonicityType>>(4);
        foreach (OperatorSpec @operator in operators)
        {
            if (i >= @operator.subjectBegin && i < @operator.subjectEnd)
            {
                // Token i falls inside this operator's subject span.
                inScope.Add(Triple.MakeTriple(@operator.subjectEnd - @operator.subjectBegin, @operator.instance.subjMono, @operator.instance.subjType));
            }
            else
            {
                if (i >= @operator.objectBegin && i < @operator.objectEnd)
                {
                    // Token i falls inside this operator's object span.
                    inScope.Add(Triple.MakeTriple(@operator.objectEnd - @operator.objectBegin, @operator.instance.objMono, @operator.instance.objType));
                }
            }
        }
        // Sort the operators by their scope (approximated by the size of their argument span)
        // NOTE(review): Sort(null) is the Sharpen translation of Java's sort(null),
        // i.e. the elements' natural ordering — presumably ascending by span length;
        // confirm against Triple's comparator.
        inScope.Sort(null);
        // Create polarity
        IList<Pair<Monotonicity, MonotonicityType>> info = new List<Pair<Monotonicity, MonotonicityType>>(inScope.Count);
        foreach (Triple<int, Monotonicity, MonotonicityType> term in inScope)
        {
            // Drop the span length; keep only (monotonicity, type) for the Polarity.
            info.Add(Pair.MakePair(term.second, term.third));
        }
        Polarity polarity = new Polarity(info);
        // Set polarity
        token_1.Set(typeof(NaturalLogicAnnotations.PolarityAnnotation), polarity);
    }
    // Set the PolarityDirectionAnnotation
    foreach (CoreLabel token_2 in tokens)
    {
        Polarity polarity = token_2.Get(typeof(NaturalLogicAnnotations.PolarityAnnotation));
        if (polarity != null)
        {
            if (polarity.IsUpwards())
            {
                token_2.Set(typeof(NaturalLogicAnnotations.PolarityDirectionAnnotation), "up");
            }
            else
            {
                if (polarity.IsDownwards())
                {
                    token_2.Set(typeof(NaturalLogicAnnotations.PolarityDirectionAnnotation), "down");
                }
                else
                {
                    // Neither upward nor downward monotone: mark as flat.
                    token_2.Set(typeof(NaturalLogicAnnotations.PolarityDirectionAnnotation), "flat");
                }
            }
        }
    }
}
/// <summary>
/// This is the method to call for assigning labels and node vectors
/// to the Tree.
/// </summary>
/// <remarks>
/// After calling this, each of the non-leaf nodes will have the node
/// vector and the predictions of their classes assigned to that
/// subtree's node. The annotations filled in are the
/// RNNCoreAnnotations.NodeVector, Predictions, and PredictedClass.
/// In general, PredictedClass will be the most useful annotation
/// except when training.
/// </remarks>
public virtual void ForwardPropagateTree(Tree tree)
{
    SimpleMatrix nodeVector;      // assigned in exactly one branch below, or an AssertionError is thrown
    SimpleMatrix classification;  // likewise
    if (tree.IsLeaf())
    {
        // Leaves are consumed by their preterminal parent; the recursion
        // should never land here (unless the tree is a degenerate single leaf).
        log.Info("SentimentCostAndGradient: warning: We reached leaves in forwardPropagate: " + tree);
        throw new AssertionError("We should not have reached leaves in forwardPropagate");
    }
    else if (tree.IsPreTerminal())
    {
        // Preterminal: node vector is tanh of the word's embedding.
        classification = model.GetUnaryClassification(tree.Label().Value());
        string word = tree.Children()[0].Label().Value();
        SimpleMatrix wordVector = model.GetWordVector(word);
        nodeVector = NeuralUtils.ElementwiseApplyTanh(wordVector);
    }
    else if (tree.Children().Length == 1)
    {
        log.Info("SentimentCostAndGradient: warning: Non-preterminal nodes of size 1: " + tree);
        throw new AssertionError("Non-preterminal nodes of size 1 should have already been collapsed");
    }
    else if (tree.Children().Length == 2)
    {
        // Binary node: recurse into both children, then compose their vectors.
        ForwardPropagateTree(tree.Children()[0]);
        ForwardPropagateTree(tree.Children()[1]);
        string leftCategory = tree.Children()[0].Label().Value();
        string rightCategory = tree.Children()[1].Label().Value();
        SimpleMatrix transform = model.GetBinaryTransform(leftCategory, rightCategory);
        classification = model.GetBinaryClassification(leftCategory, rightCategory);
        SimpleMatrix leftVector = RNNCoreAnnotations.GetNodeVector(tree.Children()[0]);
        SimpleMatrix rightVector = RNNCoreAnnotations.GetNodeVector(tree.Children()[1]);
        SimpleMatrix childrenVector = NeuralUtils.ConcatenateWithBias(leftVector, rightVector);
        if (model.op.useTensors)
        {
            // Tensor variant: add the bilinear tensor product of the child vectors.
            SimpleTensor tensor = model.GetBinaryTensor(leftCategory, rightCategory);
            SimpleMatrix tensorIn = NeuralUtils.Concatenate(leftVector, rightVector);
            SimpleMatrix tensorOut = tensor.BilinearProducts(tensorIn);
            nodeVector = NeuralUtils.ElementwiseApplyTanh(transform.Mult(childrenVector).Plus(tensorOut));
        }
        else
        {
            nodeVector = NeuralUtils.ElementwiseApplyTanh(transform.Mult(childrenVector));
        }
    }
    else
    {
        log.Info("SentimentCostAndGradient: warning: Tree not correctly binarized: " + tree);
        throw new AssertionError("Tree not correctly binarized");
    }
    // Classify this node's vector and record everything on its CoreLabel.
    SimpleMatrix predictions = NeuralUtils.Softmax(classification.Mult(NeuralUtils.ConcatenateWithBias(nodeVector)));
    int index = GetPredictedClass(predictions);
    CoreLabel label = tree.Label() as CoreLabel;
    if (label == null)
    {
        log.Info("SentimentCostAndGradient: warning: No CoreLabels in nodes: " + tree);
        throw new AssertionError("Expected CoreLabels in the nodes");
    }
    label.Set(typeof(RNNCoreAnnotations.Predictions), predictions);
    label.Set(typeof(RNNCoreAnnotations.PredictedClass), index);
    label.Set(typeof(RNNCoreAnnotations.NodeVector), nodeVector);
}
/// <summary>
/// Parses lines of text containing inline label markers (e.g. "w1 &lt;tag&gt; w2 w3 &lt;/tag&gt; w4")
/// into tokenized sentences, transferring each label onto the tokens it encloses.
/// Marker tokens themselves are consumed and do not appear in the output.
/// </summary>
/// <param name="reader">Source of input lines; each line is either "id\ttext" or just text (then the line number is used as id).</param>
/// <param name="categoriesAllowed">Label names recognized as &lt;label&gt; ... &lt;/label&gt; markers (joined into a regex alternation).</param>
/// <param name="setClassForTheseLabels">Optional map from label name to the annotation class under which to store that label.</param>
/// <param name="setGoldClass">If true, also record the current label as each token's GoldAnswerAnnotation.</param>
/// <param name="sentIDprefix">Prefix prepended to every sentence's DocIDAnnotation.</param>
/// <returns>One ICoreMap per sentence with its text, tokens, and a "{prefix}{id}-{sentNum}" doc id.</returns>
/// <exception cref="System.IO.IOException"/>
public static IList<ICoreMap> ParseFile(BufferedReader reader, ICollection<string> categoriesAllowed, IDictionary<string, Type> setClassForTheseLabels, bool setGoldClass, string sentIDprefix)
{
    // Whole-token patterns for opening and closing label markers, e.g. <PER> and </PER>.
    Pattern startingLabelToken = Pattern.Compile("<(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    Pattern endLabelToken = Pattern.Compile("</(" + StringUtils.Join(categoriesAllowed, "|") + ")>");
    // Tokens outside any marker pair get this background label.
    string backgroundSymbol = "O";
    IList<ICoreMap> sentences = new List<ICoreMap>();
    int lineNum = -1;
    string l = null;
    while ((l = reader.ReadLine()) != null)
    {
        lineNum++;
        // Java-style split with limit 2: at most one split at the first tab.
        // NOTE(review): this is the Sharpen string-split extension — confirm
        // it follows Java String.split semantics, not .NET string.Split.
        string[] t = l.Split("\t", 2);
        string id = null;
        string text = null;
        if (t.Length == 2)
        {
            // "id<TAB>text" form.
            id = t[0];
            text = t[1];
        }
        else
        {
            if (t.Length == 1)
            {
                // Bare text: fall back to the line number as the id.
                text = t[0];
                id = lineNum.ToString();
            }
        }
        id = sentIDprefix + id;
        // Sentence-split and tokenize the text with PTB escaping disabled so
        // the raw marker tokens like <PER> survive tokenization intact.
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
        PTBTokenizer.PTBTokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.NewCoreLabelTokenizerFactory("ptb3Escaping=false,normalizeParentheses=false,escapeForwardSlashAsterisk=false");
        dp.SetTokenizerFactory(tokenizerFactory);
        // Current label; persists across tokens (and sentences) until a closing marker resets it.
        string label = backgroundSymbol;
        int sentNum = -1;
        foreach (IList<IHasWord> sentence in dp)
        {
            sentNum++;
            string sentStr = string.Empty;
            IList<CoreLabel> sent = new List<CoreLabel>();
            foreach (IHasWord tokw in sentence)
            {
                string tok = tokw.Word();
                Matcher startingMatcher = startingLabelToken.Matcher(tok);
                Matcher endMatcher = endLabelToken.Matcher(tok);
                if (startingMatcher.Matches())
                {
                    // Opening marker: subsequent tokens carry this label.
                    label = startingMatcher.Group(1);
                }
                else
                {
                    if (endMatcher.Matches())
                    {
                        // Closing marker: revert to the background label.
                        label = backgroundSymbol;
                    }
                    else
                    {
                        // Ordinary token: build a CoreLabel carrying the current label.
                        CoreLabel c = new CoreLabel();
                        // NOTE(review): toks always holds exactly one element (tok),
                        // so this loop body runs once per token.
                        IList<string> toks = new List<string>();
                        toks.Add(tok);
                        foreach (string toksplit in toks)
                        {
                            sentStr += " " + toksplit;
                            c.SetWord(toksplit);
                            c.SetLemma(toksplit);
                            c.SetValue(toksplit);
                            c.Set(typeof(CoreAnnotations.TextAnnotation), toksplit);
                            c.Set(typeof(CoreAnnotations.OriginalTextAnnotation), tok);
                            if (setGoldClass)
                            {
                                c.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
                            }
                            // NOTE(review): Contains on IDictionary here is presumably the
                            // Sharpen containsKey extension — confirm; double lookup with the
                            // indexer below is intentional in the translated code.
                            if (setClassForTheseLabels != null && setClassForTheseLabels.Contains(label))
                            {
                                c.Set(setClassForTheseLabels[label], label);
                            }
                            sent.Add(c);
                        }
                    }
                }
            }
            // Package the sentence: trimmed text, its tokens, and a composite doc id.
            ICoreMap sentcm = new ArrayCoreMap();
            sentcm.Set(typeof(CoreAnnotations.TextAnnotation), sentStr.Trim());
            sentcm.Set(typeof(CoreAnnotations.TokensAnnotation), sent);
            sentcm.Set(typeof(CoreAnnotations.DocIDAnnotation), id + "-" + sentNum);
            sentences.Add(sentcm);
        }
    }
    return (sentences);
}
/// <summary>Create an Annotation object (with a single sentence) from the given specification.</summary>
/// <param name="docid">Optional document id; "???" is used when absent.</param>
/// <param name="sentenceIndex">Optional sentence index; -1 when absent.</param>
/// <param name="gloss">The sentence text; literal "\n" and "\t" sequences are unescaped.</param>
/// <param name="tree">Builds the primary dependency graph from the token list.</param>
/// <param name="maltTree">Builds the alternative dependency graph from the token list.</param>
/// <param name="words">The sentence's words; all other per-token lists must have the same length.</param>
/// <param name="lemmas">Per-token lemmas.</param>
/// <param name="pos">Per-token part-of-speech tags.</param>
/// <param name="ner">Per-token named-entity tags.</param>
/// <param name="sentenceid">Optional sentence id, used only in error messages.</param>
/// <returns>A document Annotation wrapping the single constructed sentence.</returns>
private static Annotation ParseSentence(Optional<string> docid, Optional<int> sentenceIndex, string gloss, Func<IList<CoreLabel>, SemanticGraph> tree, Func<IList<CoreLabel>, SemanticGraph> maltTree, IList<string> words, IList<string> lemmas, IList<string> pos, IList<string> ner, Optional<string> sentenceid)
{
    // Every per-token list must line up with the word list.
    if (lemmas.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + lemmas.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (pos.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + pos.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    if (ner.Count != words.Count)
    {
        throw new ArgumentException("Array lengths don't match: " + words.Count + " vs " + ner.Count + " (sentence " + sentenceid.OrElse("???") + ")");
    }
    // Build the token list, assigning character offsets as if the words were
    // joined by single spaces.
    IList<CoreLabel> tokens = new List<CoreLabel>(words.Count);
    int charOffset = 0;
    for (int i = 0; i < words.Count; ++i)
    {
        string w = words[i];
        CoreLabel tok = new CoreLabel(12);
        tok.SetWord(w);
        tok.SetValue(w);
        tok.SetBeginPosition(charOffset);
        tok.SetEndPosition(charOffset + w.Length);
        tok.SetLemma(lemmas[i]);
        tok.SetTag(pos[i]);
        tok.SetNER(ner[i]);
        tok.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
        tok.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
        tok.Set(typeof(CoreAnnotations.IndexAnnotation), i + 1);
        tok.Set(typeof(CoreAnnotations.TokenBeginAnnotation), i);
        tok.Set(typeof(CoreAnnotations.TokenEndAnnotation), i + 1);
        tokens.Add(tok);
        charOffset += w.Length + 1;  // +1 for the assumed separating space
    }
    // Unescape literal "\n" and "\t" sequences in the gloss.
    gloss = gloss.Replace("\\n", "\n").Replace("\\t", "\t");
    // Assemble the sentence CoreMap. Note the same graph object is registered
    // under all three (basic / collapsed / CC-processed) dependency keys.
    ICoreMap sentence = new ArrayCoreMap(16);
    sentence.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    SemanticGraph graph = tree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation), graph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation), graph);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation), graph);
    SemanticGraph maltGraph = maltTree.Apply(tokens);
    sentence.Set(typeof(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation), maltGraph);
    sentence.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
    sentence.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
    sentence.Set(typeof(CoreAnnotations.TextAnnotation), gloss);
    sentence.Set(typeof(CoreAnnotations.TokenBeginAnnotation), 0);
    sentence.Set(typeof(CoreAnnotations.TokenEndAnnotation), tokens.Count);
    // Wrap the single sentence in a document-level Annotation.
    Annotation doc = new Annotation(gloss);
    doc.Set(typeof(CoreAnnotations.TokensAnnotation), tokens);
    doc.Set(typeof(CoreAnnotations.SentencesAnnotation), Java.Util.Collections.SingletonList(sentence));
    doc.Set(typeof(CoreAnnotations.DocIDAnnotation), docid.OrElse("???"));
    doc.Set(typeof(CoreAnnotations.SentenceIndexAnnotation), sentenceIndex.OrElse(-1));
    return doc;
}