/// <summary>
/// Recursively searches the tree for the node whose begin/end token indices
/// exactly match the given span, and sets that node's label value.
/// </summary>
/// <param name="tree">tree whose labels must all be CoreLabels</param>
/// <param name="span">begin/end token indices of the target node</param>
/// <param name="value">new label value for the matching node</param>
/// <returns>true if a node with exactly this span was found and relabeled</returns>
/// <exception cref="AssertionError">if a node's label is not a CoreLabel</exception>
public static bool SetSpanLabel(Tree tree, Pair <int, int> span, string value) {
    if (!(tree.Label() is CoreLabel)) {
        throw new AssertionError("Expected CoreLabels");
    }
    CoreLabel label = (CoreLabel)tree.Label();
    // Exact span match: relabel this node and stop searching.
    if (label.Get(typeof(CoreAnnotations.BeginIndexAnnotation)).Equals(span.first) && label.Get(typeof(CoreAnnotations.EndIndexAnnotation)).Equals(span.second)) {
        label.SetValue(value);
        return(true);
    }
    // This node lies strictly inside the target span, so no descendant can
    // cover the whole span either -- prune this subtree.
    if (label.Get(typeof(CoreAnnotations.BeginIndexAnnotation)) > span.first && label.Get(typeof(CoreAnnotations.EndIndexAnnotation)) < span.second) {
        return(false);
    }
    foreach (Tree child in tree.Children()) {
        if (SetSpanLabel(child, span, value)) {
            return(true);
        }
    }
    return(false);
}
/// <summary>
/// Returns the string form of one feature component of the given label:
/// its head word, its head tag, or its value.  A null label yields the
/// Null constant for any recognized component.
/// </summary>
/// <param name="label">label to read the feature from (may be null)</param>
/// <param name="feature">which component to extract</param>
/// <returns>the feature string, or the Null constant for a null label</returns>
/// <exception cref="ArgumentException">for an unrecognized feature component</exception>
public static string GetFeatureFromCoreLabel(CoreLabel label, FeatureFactory.FeatureComponent feature) {
    if (feature == FeatureFactory.FeatureComponent.Headword) {
        return (label == null) ? Null : label.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)).Value();
    }
    if (feature == FeatureFactory.FeatureComponent.Headtag) {
        return (label == null) ? Null : label.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)).Value();
    }
    if (feature == FeatureFactory.FeatureComponent.Value) {
        return (label == null) ? Null : label.Value();
    }
    // Unknown component always throws, even when label is null (matches the
    // original switch-with-default behavior).
    throw new ArgumentException("Unexpected feature type: " + feature);
}
//return index of the token that ends this block of text.
//key assumption: blocks are delimited by tokens (i.e. no token spans two blocks.)
/// <summary>
/// Walks forward from startIndex until the accumulated character span
/// (relative to the first token's begin offset) reaches the trimmed text
/// length, and returns the index of the last token inside the block.
/// </summary>
/// <param name="startIndex">index of the first token of the block</param>
/// <param name="tokens">all tokens of the document</param>
/// <param name="text">text of the block (trimmed before measuring)</param>
/// <returns>index of the token that ends the block</returns>
public static int GetEndIndex(int startIndex, IList <CoreLabel> tokens, string text) {
    text = text.Trim(); //remove newlines that may throw off text length
    int currIndex = startIndex;
    CoreLabel token = tokens[startIndex];
    // Character offset where the block starts; the block ends at the first
    // token whose end offset reaches (or passes) tokenBeginChar + text.Length.
    // (FIX: removed an unused local that computed text.IndexOf of the first
    // token's original text and never read the result.)
    int tokenBeginChar = token.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
    while (true) {
        int tokenEndChar = token.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
        if (tokenEndChar - tokenBeginChar == text.Length) {
            // This token ends exactly at the end of the block.
            return currIndex;
        }
        if (tokenEndChar - tokenBeginChar > text.Length) {
            // Overshot: the previous token was the last one inside the block.
            return currIndex - 1;
        }
        currIndex++;
        if (currIndex == tokens.Count) {
            // Ran out of tokens; treat the last token as the block end.
            return currIndex - 1;
        }
        token = tokens[currIndex];
    }
}
/// <summary>
/// Builds a surface-pattern context token for tokenj: an OR-restriction over
/// every generalization class whose value is not the background symbol
/// (optionally also the NER tag), a "|"-joined string of the matching keys,
/// and a flag saying whether the token is unlabeled (background everywhere).
/// </summary>
/// <param name="tokenj">token whose generalization classes must all be set</param>
/// <returns>triple of (isLabeledO, restricted pattern token, joined key string)</returns>
/// <exception cref="Exception">if a generalization class is missing on the token</exception>
internal static Triple <bool, Token, string> GetContextTokenStr(CoreLabel tokenj) {
    Token strgeneric = new Token(PatternFactory.PatternType.Surface);
    string strOriginal = string.Empty;
    bool isLabeledO = true;
    // (original Java kept for reference)
    // for (Entry<String, Class<? extends TypesafeMap.Key<String>>> e : getAnswerClass().entrySet()) {
    //   if (!tokenj.get(e.getValue()).equals(backgroundSymbol)) {
    //     isLabeledO = false;
    //     if (strOriginal.isEmpty()) {
    //       strOriginal = e.getKey();
    //     } else {
    //       strOriginal += "|" + e.getKey();
    //     }
    //     strgeneric.addRestriction(e.getKey(), e.getKey());
    //   }
    // }
    foreach (KeyValuePair <string, Type> e in ConstantsAndVariables.GetGeneralizeClasses()) {
        // Every generalization class is required to be present and non-null.
        if (!tokenj.ContainsKey(e.Value) || tokenj.Get(e.Value) == null) {
            throw new Exception(" Why does the token not have the class " + e.Value + " set? Existing classes " + tokenj.ToString(CoreLabel.OutputFormat.All));
        }
        if (!tokenj.Get(e.Value).Equals(ConstantsAndVariables.backgroundSymbol)) {
            isLabeledO = false;
            // Accumulate matching keys as "k1|k2|...".
            if (strOriginal.IsEmpty()) {
                strOriginal = e.Key;
            } else {
                strOriginal += "|" + e.Key;
            }
            strgeneric.AddORRestriction(e.Value, e.Key);
        }
    }
    if (useContextNERRestriction) {
        // Optionally also restrict on a non-background NER tag.
        string nerTag = tokenj.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
        if (nerTag != null && !nerTag.Equals(SeqClassifierFlags.DefaultBackgroundSymbol)) {
            isLabeledO = false;
            if (strOriginal.IsEmpty()) {
                strOriginal = nerTag;
            } else {
                strOriginal += "|" + nerTag;
            }
            strgeneric.AddORRestriction(typeof(CoreAnnotations.NamedEntityTagAnnotation), nerTag);
        }
    }
    return(new Triple <bool, Token, string>(isLabeledO, strgeneric, strOriginal));
}
/// <summary>
/// Asserts that two trees carry CoreLabel labels agreeing on their
/// head-word and head-tag annotations.
/// </summary>
/// <param name="t1">first tree to compare</param>
/// <param name="t2">second tree to compare</param>
public virtual void CheckHeads(Tree t1, Tree t2) {
    NUnit.Framework.Assert.IsTrue(t1.Label() is CoreLabel);
    NUnit.Framework.Assert.IsTrue(t2.Label() is CoreLabel);
    CoreLabel l1 = (CoreLabel)t1.Label();
    CoreLabel l2 = (CoreLabel)t2.Label();
    NUnit.Framework.Assert.AreEqual(l1.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)), l2.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)));
    NUnit.Framework.Assert.AreEqual(l1.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)), l2.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)));
}
/// <summary>
/// Return the node containing the head word for this node (or
/// <code>null</code> if none), as recorded in this node's
/// <code>CoreLabel</code>.  (In contrast to
/// <code>edu.stanford.nlp.ling.CategoryWordTag</code>, head words and head
/// tags are stored as references to nodes, not merely as strings.)
/// </summary>
/// <returns>the node containing the head word for this node</returns>
public TreeGraphNode HeadWordNode() {
    TreeGraphNode headNode = SafeCast(_label.Get(typeof(TreeCoreAnnotations.HeadWordAnnotation)));
    if (headNode == null) {
        return null;
    }
    // A recorded head is only valid when it belongs to this node's tree graph.
    if (headNode.TreeGraph() != null && !(headNode.TreeGraph().Equals(this.TreeGraph()))) {
        return null;
    }
    return headNode;
}
/// <summary>
/// Runs the regex NER classifier over each sentence and merges its answers
/// into the NamedEntityTagAnnotation: untagged tokens first default to the
/// background symbol, then each answer span overwrites the NER tags when it
/// lines up with the existing NER span (or the existing tag is background).
/// </summary>
/// <param name="annotation">document; must contain SentencesAnnotation</param>
/// <exception cref="Exception">if the annotation has no sentences</exception>
public virtual void Annotate(Annotation annotation) {
    if (verbose) {
        log.Info("Adding RegexNER annotations ... ");
    }
    if (!annotation.ContainsKey(typeof(CoreAnnotations.SentencesAnnotation))) {
        throw new Exception("Unable to find sentences in " + annotation);
    }
    IList <ICoreMap> sentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    foreach (ICoreMap sentence in sentences) {
        IList <CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
        classifier.Classify(tokens);
        // Default every untagged token to the background symbol so the span
        // comparisons below never see a null NER tag.
        foreach (CoreLabel token in tokens) {
            if (token.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation)) == null) {
                token.Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), classifier.flags.backgroundSymbol);
            }
        }
        for (int start = 0; start < tokens.Count; start++) {
            CoreLabel token_1 = tokens[start];
            string answerType = token_1.Get(typeof(CoreAnnotations.AnswerAnnotation));
            if (answerType == null) {
                continue;
            }
            string NERType = token_1.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            int answerEnd = FindEndOfAnswerAnnotation(tokens, start);
            int NERStart = FindStartOfNERAnnotation(tokens, start);
            int NEREnd = FindEndOfNERAnnotation(tokens, start);
            // check that the spans are the same, specially handling the case of
            // tokens with background named entity tags ("other")
            if ((NERStart == start || NERType.Equals(classifier.flags.backgroundSymbol)) && (answerEnd == NEREnd || (NERType.Equals(classifier.flags.backgroundSymbol) && NEREnd >= answerEnd))) {
                // annotate each token in the span
                for (int i = start; i < answerEnd; i++) {
                    tokens[i].Set(typeof(CoreAnnotations.NamedEntityTagAnnotation), answerType);
                }
            }
            // Skip past this answer span (the loop increment lands on answerEnd).
            start = answerEnd - 1;
        }
    }
    if (verbose) {
        log.Info("done.");
    }
}
/// <summary>
/// Builds an RVF datum for token i of the sentence: the label is answerLabel
/// when the token's answer-class value matches it (else "O"), and the
/// features are the cluster ids of the phrases matched at this token, plus
/// (when the window is nonzero) word/lemma/tag features of neighbors.
/// </summary>
/// <param name="sent">tokens of the sentence</param>
/// <param name="i">index of the token to featurize</param>
/// <returns>datum with cluster features and the derived label</returns>
private RVFDatum <string, string> GetDatum(CoreLabel[] sent, int i) {
    ICounter <string> feat = new ClassicCounter <string>();
    CoreLabel l = sent[i];
    string label;
    if (l.Get(answerClass).ToString().Equals(answerLabel)) {
        label = answerLabel;
    } else {
        label = "O";
    }
    CollectionValuedMap <string, CandidatePhrase> matchedPhrases = l.Get(typeof(PatternsAnnotations.MatchedPhrases));
    if (matchedPhrases == null) {
        // No phrase annotation: fall back to the token's own word.
        matchedPhrases = new CollectionValuedMap <string, CandidatePhrase>();
        matchedPhrases.Add(label, CandidatePhrase.CreateOrGet(l.Word()));
    }
    foreach (CandidatePhrase w in matchedPhrases.AllValues()) {
        // FIX: the original indexed clusterIds directly and compared the
        // non-nullable int result to null; a missing key would throw
        // KeyNotFoundException and the null test could never fire.
        // Use TryGetValue and default unclustered phrases to -1.
        int num;
        if (!this.clusterIds.TryGetValue(w.GetPhrase(), out num)) {
            num = -1;
        }
        feat.SetCount("Cluster-" + num, 1.0);
    }
    // feat.incrementCount("WORD-" + l.word());
    // feat.incrementCount("LEMMA-" + l.lemma());
    // feat.incrementCount("TAG-" + l.tag());
    int window = 0;
    // NOTE: with window == 0 both context loops are no-ops; they are kept so
    // the context window can be widened by changing one constant.
    for (int j = Math.Max(0, i - window); j < i; j++) {
        CoreLabel lj = sent[j];
        feat.IncrementCount("PREV-" + "WORD-" + lj.Word());
        feat.IncrementCount("PREV-" + "LEMMA-" + lj.Lemma());
        feat.IncrementCount("PREV-" + "TAG-" + lj.Tag());
    }
    for (int j_1 = i + 1; j_1 < sent.Length && j_1 <= i + window; j_1++) {
        CoreLabel lj = sent[j_1];
        feat.IncrementCount("NEXT-" + "WORD-" + lj.Word());
        feat.IncrementCount("NEXT-" + "LEMMA-" + lj.Lemma());
        feat.IncrementCount("NEXT-" + "TAG-" + lj.Tag());
    }
    // System.out.println("adding " + l.word() + " as " + label);
    return(new RVFDatum <string, string>(feat, label));
}
/// <summary>
/// Appends one feature combining the head tag and head word of the given
/// label to the feature list; a null label contributes the Null marker.
/// </summary>
/// <param name="features">feature list to append to</param>
/// <param name="label">label supplying head word/tag (may be null)</param>
/// <param name="wtFeature">feature-name prefix</param>
public static void AddUnaryQueueFeatures(IList <string> features, CoreLabel label, string wtFeature) {
    if (label == null) {
        features.Add(wtFeature + Null);
        return;
    }
    string headTag = label.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)).Value();
    string headWord = label.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)).Value();
    // TODO: check to see if this is slow because of the string concat
    features.Add(wtFeature + headTag + "-" + headWord);
}
/// <summary>
/// Reclassifies a copy of the sentence's tokens with global information and
/// transfers the new NER guesses back onto the original tokens, overwriting
/// only background/MISC/absent tags, then copies SUTime/NumberNormalizer
/// annotations across.
/// </summary>
/// <param name="words">original tokens (mutated in place)</param>
/// <param name="doc">the whole document annotation</param>
/// <param name="sentence">the sentence the tokens belong to</param>
private void DoOneSentenceNew(IList <CoreLabel> words, Annotation doc, ICoreMap sentence) {
    IList <CoreLabel> newWords = NumberSequenceClassifier.CopyTokens(words, sentence);
    nsc.ClassifyWithGlobalInformation(newWords, doc, sentence);
    IEnumerator <CoreLabel> newFLIter = newWords.GetEnumerator();
    foreach (CoreLabel origWord in words) {
        // FIX: a C# enumerator starts BEFORE the first element, so MoveNext()
        // must be called before reading Current.  The original read Current
        // without ever advancing, which is invalid (and would pair every
        // original word with no classified token).
        if (!newFLIter.MoveNext()) {
            break;
        }
        CoreLabel newWord = newFLIter.Current;
        string before = origWord.Ner();
        string newGuess = newWord.Get(typeof(CoreAnnotations.AnswerAnnotation));
        // log.info(origWord.word());
        // log.info(origWord.ner());
        if (Verbose) {
            log.Info(newWord);
        }
        // log.info("-------------------------------------");
        // Only overwrite tags the global classifier can improve on.
        if ((before == null || before.Equals(BackgroundSymbol) || before.Equals("MISC")) && !newGuess.Equals(BackgroundSymbol)) {
            origWord.SetNER(newGuess);
        }
        // transfer other annotations generated by SUTime or NumberNormalizer
        NumberSequenceClassifier.TransferAnnotations(newWord, origWord);
    }
}
/// <summary>helper method for creating version of document text without xml.</summary>
/// <remarks>
/// Every non-token character (xml and inter-token text) is replaced by a
/// space, so character offsets into the result line up with the original
/// document text while xml content is blanked out.
/// </remarks>
/// <param name="documentText">original document text with xml</param>
/// <param name="annotation">annotation whose TokensAnnotation covers the text</param>
/// <returns>text of the same length with non-token characters blanked</returns>
public static string XmlFreeText(string documentText, Annotation annotation) {
    int firstTokenCharIndex = annotation.Get(typeof(CoreAnnotations.TokensAnnotation))[0].Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
    // add white space for all text before first token
    // (PERF FIX: use a StringBuilder -- the original re-concatenated the
    // whole string on every token, which is O(n^2) in document length.)
    StringBuilder cleanedText = new StringBuilder(Sharpen.Runtime.Substring(documentText, 0, firstTokenCharIndex).ReplaceAll("\\S", " "));
    int tokenIndex = 0;
    IList <CoreLabel> tokens = annotation.Get(typeof(CoreAnnotations.TokensAnnotation));
    foreach (CoreLabel token in tokens) {
        // add the current token's text
        cleanedText.Append(token.OriginalText());
        // add whitespace for non-tokens and xml in between these tokens
        tokenIndex += 1;
        if (tokenIndex < tokens.Count) {
            CoreLabel nextToken = tokens[tokenIndex];
            int inBetweenStart = token.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation));
            int inBetweenEnd = nextToken.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation));
            string inBetweenTokenText = Sharpen.Runtime.Substring(documentText, inBetweenStart, inBetweenEnd);
            inBetweenTokenText = inBetweenTokenText.ReplaceAll("\\S", " ");
            cleanedText.Append(inBetweenTokenText);
        }
    }
    // add white space for all non-token content after last token
    cleanedText.Append(Sharpen.Runtime.Substring(documentText, cleanedText.Length, documentText.Length).ReplaceAll("\\S", " "));
    return(cleanedText.ToString());
}
/// <summary>
/// Word-boundary clique features over the current, next, and previous
/// characters; only emitted when flags.useWordn is enabled.
/// </summary>
/// <param name="cInfo">padded character sequence</param>
/// <param name="loc">position of the current character</param>
/// <returns>the feature strings for this clique</returns>
public virtual ICollection <string> FeaturesCnC(PaddedList <IN> cInfo, int loc) {
    ICollection <string> features = new List <string>();
    string cur = cInfo[loc].Get(typeof(CoreAnnotations.CharAnnotation));
    string next = cInfo[loc + 1].Get(typeof(CoreAnnotations.CharAnnotation));
    string prev = cInfo[loc - 1].Get(typeof(CoreAnnotations.CharAnnotation));
    if (flags.useWordn) {
        // Unigram and adjacent-bigram character features.
        features.Add(cur + "c");
        features.Add(next + "c1");
        features.Add(prev + "p");
        features.Add(prev + cur + "pc");
        if (flags.useAs || flags.useMsr || flags.usePk || flags.useHk) {
            features.Add(cur + next + "cc1");
            features.Add(prev + next + "pc1");
        }
        // Indicator that this feature group was active.
        features.Add("|wordn");
    }
    return features;
}
/// <summary>
/// Creates a new tree node with the given label value and children,
/// propagating the head word/tag annotations from <paramref name="top"/>.
/// </summary>
/// <param name="top">node whose head annotations are copied</param>
/// <param name="label">value for the new node's label</param>
/// <param name="children">children attached, in order, to the new node</param>
/// <returns>the newly built node</returns>
internal static Tree CreateNode(Tree top, string label, params Tree[] children) {
    CoreLabel source = (CoreLabel)top.Label();
    CoreLabel production = new CoreLabel();
    production.SetValue(label);
    // The new node keeps the head word/tag of the node it is built over.
    production.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), source.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)));
    production.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), source.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)));
    Tree result = new LabeledScoredTreeNode(production);
    foreach (Tree child in children) {
        result.AddChild(child);
    }
    return result;
}
/// <summary>
/// Finds a leaf whose text matches <paramref name="token"/> and whose
/// 0-based index lies in [index, index + approximateness].  If none matches,
/// logs a warning and falls back to the first leaf with matching text, then
/// to the second-to-last leaf.
/// </summary>
/// <returns>the best-matching leaf (never null for a non-empty tree)</returns>
private static Tree FunkyFindLeafWithApproximateSpan(Tree root, string token, int index, int approximateness) {
    // log.info("Searching " + root + "\n for " + token + " at position " + index + " (plus up to " + approximateness + ")");
    IList <Tree> leaves = root.GetLeaves();
    foreach (Tree leaf in leaves) {
        CoreLabel label = typeof(CoreLabel).Cast(leaf.Label());
        int indexInteger = label.Get(typeof(CoreAnnotations.IndexAnnotation));
        if (indexInteger == null) {
            continue;
        }
        // IndexAnnotation is 1-based; convert to 0-based for the comparison.
        int ind = indexInteger - 1;
        if (token.Equals(leaf.Value()) && ind >= index && ind <= index + approximateness) {
            return(leaf);
        }
    }
    // this shouldn't happen
    // throw new RuntimeException("RuleBasedCorefMentionFinder: ERROR: Failed to find head token");
    SieveCoreferenceSystem.logger.Warning("RuleBasedCorefMentionFinder: Failed to find head token:\n" + "Tree is: " + root + "\n" + "token = |" + token + "|" + index + "|, approx=" + approximateness);
    // Fallback 1: any leaf with matching text, ignoring the index window.
    foreach (Tree leaf_1 in leaves) {
        if (token.Equals(leaf_1.Value())) {
            //log.info("Found something: returning " + leaf);
            return(leaf_1);
        }
    }
    // Fallback 2: the second-to-last leaf (clamped for tiny trees).
    int fallback = Math.Max(0, leaves.Count - 2);
    SieveCoreferenceSystem.logger.Warning("RuleBasedCorefMentionFinder: Last resort: returning as head: " + leaves[fallback]);
    return(leaves[fallback]);
}
/// <summary>Returns a 0-based index of the head of the tree.</summary>
/// <remarks>Returns a 0-based index of the head of the tree. Assumes the leaves had been indexed from 1</remarks>
/// <param name="tree">tree whose label carries a head-word annotation</param>
/// <returns>0-based index of the head token</returns>
internal static int HeadIndex(Tree tree) {
    CoreLabel treeLabel = ErasureUtils.UncheckedCast(tree.Label());
    CoreLabel head = treeLabel.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation));
    // Leaf indices are 1-based, so subtract one.
    return head.Index() - 1;
}
/// <summary>Create a datum from a string.</summary>
/// <remarks>
/// Create a datum from a string. The CoreAnnotations must correspond to those used by
/// SequenceClassifier. The following annotations are copied from the provided
/// CoreLabel cl, if present: DomainAnnotation.
/// startOffset and endOffset will be added to the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/>
/// of the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreLabel"/>
/// cl to give the
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetBeginAnnotation"/>
/// and
/// <see cref="Edu.Stanford.Nlp.Ling.CoreAnnotations.CharacterOffsetEndAnnotation"/>
/// of the resulting datum.
/// </remarks>
private static CoreLabel CreateDatum(CoreLabel cl, string token, string label, int startOffset, int endOffset) {
    CoreLabel newTok = new CoreLabel();
    newTok.Set(typeof(CoreAnnotations.TextAnnotation), token);
    newTok.Set(typeof(CoreAnnotations.CharAnnotation), token);
    newTok.Set(typeof(CoreAnnotations.AnswerAnnotation), label);
    newTok.Set(typeof(CoreAnnotations.GoldAnswerAnnotation), label);
    // Both new offsets are relative to cl's begin offset.
    newTok.Set(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation), cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) + startOffset);
    newTok.Set(typeof(CoreAnnotations.CharacterOffsetEndAnnotation), cl.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) + endOffset);
    // FIX: the original guarded this copy with "cl != null", but cl was
    // already dereferenced above, so the check was dead and misleading;
    // a null cl would have thrown before reaching it either way.
    if (cl.ContainsKey(typeof(CoreAnnotations.DomainAnnotation))) {
        newTok.Set(typeof(CoreAnnotations.DomainAnnotation), cl.Get(typeof(CoreAnnotations.DomainAnnotation)));
    }
    return(newTok);
}
/// <summary>
/// Sets headIndex/headWord/headString on each mention: via the Chinese
/// head finder for Chinese, otherwise via the syntactic head of the parse
/// tree.  If the computed head falls outside the mention's original span,
/// falls back to the first token of the mention.
/// </summary>
/// <param name="s">sentence providing the parse tree and tokens</param>
/// <param name="mentions">mentions to annotate in place</param>
public virtual void FindHead(ICoreMap s, IList <Mention> mentions) {
    Tree tree = s.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
    IList <CoreLabel> sent = s.Get(typeof(CoreAnnotations.TokensAnnotation));
    // Make sure the tree's begin/end span annotations are populated.
    tree.IndexSpans(0);
    foreach (Mention m in mentions) {
        if (lang == Locale.Chinese) {
            FindHeadChinese(sent, m);
        } else {
            CoreLabel head = (CoreLabel)FindSyntacticHead(m, tree, sent).Label();
            // IndexAnnotation is 1-based; headIndex is 0-based.
            m.headIndex = head.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
            m.headWord = sent[m.headIndex];
            m.headString = m.headWord.Get(typeof(CoreAnnotations.TextAnnotation)).ToLower(Locale.English);
        }
        int start = m.headIndex - m.startIndex;
        // Sanity check: the head must lie within the mention's original span;
        // otherwise fall back to the first token of the mention.
        if (start < 0 || start >= m.originalSpan.Count) {
            Redwood.Log("Invalid index for head " + start + "=" + m.headIndex + "-" + m.startIndex + ": originalSpan=[" + StringUtils.JoinWords(m.originalSpan, " ") + "], head=" + m.headWord);
            Redwood.Log("Setting head string to entire mention");
            m.headIndex = m.startIndex;
            m.headWord = m.originalSpan.Count > 0 ? m.originalSpan[0] : sent[m.startIndex];
            m.headString = m.originalSpan.ToString();
        }
    }
}
/// <summary>Add a binary node to the existing node on top of the stack</summary>
/// <param name="state">parser state; its top two stack items become children</param>
/// <param name="scoreDelta">score contribution of this transition</param>
/// <returns>the successor state with the combined node pushed</returns>
/// <exception cref="ArgumentException">for an unknown side or non-CoreLabel nodes</exception>
public virtual State Apply(State state, double scoreDelta) {
    TreeShapedStack <Tree> stack = state.stack;
    Tree right = stack.Peek();
    stack = stack.Pop();
    Tree left = stack.Peek();
    stack = stack.Pop();
    // The head child determines the new node's head word/tag.
    Tree head;
    if (side == BinaryTransition.Side.Left) {
        head = left;
    } else if (side == BinaryTransition.Side.Right) {
        head = right;
    } else {
        throw new ArgumentException("Unknown side " + side);
    }
    if (!(head.Label() is CoreLabel)) {
        throw new ArgumentException("Stack should have CoreLabel nodes");
    }
    CoreLabel headLabel = (CoreLabel)head.Label();
    CoreLabel production = new CoreLabel();
    production.SetValue(label);
    production.Set(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation), headLabel.Get(typeof(TreeCoreAnnotations.HeadWordLabelAnnotation)));
    production.Set(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation), headLabel.Get(typeof(TreeCoreAnnotations.HeadTagLabelAnnotation)));
    Tree newTop = new LabeledScoredTreeNode(production);
    newTop.AddChild(left);
    newTop.AddChild(right);
    stack = stack.Push(newTop);
    return new State(stack, state.transitions.Push(this), state.separators, state.sentence, state.tokenPosition, state.score + scoreDelta, false);
}
/// <summary>
/// Base character-clique features: a 5-character window around loc,
/// character-class features (punctuation/digit/unicode block/type), and
/// token-level position/length features.
/// </summary>
/// <param name="cInfo">padded character sequence</param>
/// <param name="loc">position of the current character</param>
/// <returns>the feature strings for this clique</returns>
protected internal virtual ICollection <string> FeaturesC(PaddedList <In> cInfo, int loc) {
    ICollection <string> features = new List <string>();
    CoreLabel c = cInfo[loc];
    CoreLabel n = cInfo[loc + 1];
    CoreLabel n2 = cInfo[loc + 2];
    CoreLabel p = cInfo[loc - 1];
    CoreLabel p2 = cInfo[loc - 2];
    string charc = c.Get(typeof(CoreAnnotations.CharAnnotation));
    string charn = n.Get(typeof(CoreAnnotations.CharAnnotation));
    string charn2 = n2.Get(typeof(CoreAnnotations.CharAnnotation));
    string charp = p.Get(typeof(CoreAnnotations.CharAnnotation));
    string charp2 = p2.Get(typeof(CoreAnnotations.CharAnnotation));
    // Default feature set...a 5 character window
    // plus a few other language-independent features
    features.Add(charc + "-c");
    features.Add(charn + "-n1");
    features.Add(charn2 + "-n2");
    features.Add(charp + "-p");
    features.Add(charp2 + "-p2");
    // Length feature
    if (charc.Length > 1) {
        features.Add("length");
    }
    // Character-level class features
    bool seenPunc = false;
    bool seenDigit = false;
    // NOTE(review): "limit" is not declared in this method -- presumably a
    // field bounding how many chars of charc are inspected; confirm it never
    // exceeds charc.Length or the indexer below will throw.
    for (int i = 0; i < limit; ++i) {
        char charcC = charc[i];
        seenPunc = seenPunc || Characters.IsPunctuation(charcC);
        seenDigit = seenDigit || char.IsDigit(charcC);
        string cuBlock = Characters.UnicodeBlockStringOf(charcC);
        features.Add(cuBlock + "-uBlock");
        string cuType = char.GetType(charcC).ToString();
        features.Add(cuType + "-uType");
    }
    if (seenPunc) {
        features.Add("haspunc");
    }
    if (seenDigit) {
        features.Add("hasdigit");
    }
    // Token-level features
    string word = c.Word();
    int index = c.Index();
    features.Add(Math.Min(MaxBefore, index) + "-before");
    features.Add(Math.Min(MaxAfter, word.Length - charc.Length - index) + "-after");
    features.Add(Math.Min(MaxLength, word.Length) + "-length");
    // Indicator transition feature
    features.Add("cliqueC");
    return(features);
}
/// <summary>
/// Returns the next token: drains the compound buffer first, skips tokens
/// whose text was normalized away to the empty string, and -- when the
/// corresponding split flags are on -- routes compounds, verb+clitic forms,
/// and contractions through their splitting processors.
/// </summary>
/// <returns>the next token, or null at end of input</returns>
/// <exception cref="RuntimeIOException">wrapping any IOException from the lexer</exception>
protected internal override T GetNext() {
    try {
        T nextToken;
        do {
            // initialized in do-while
            // Depending on the orthographic normalization options,
            // some tokens can be obliterated. In this case, keep iterating
            // until we see a non-zero length token.
            nextToken = (splitAny && !compoundBuffer.IsEmpty()) ? (T)compoundBuffer.Remove(0) : (T)lexer.Next();
        } while (nextToken != null && nextToken.Word().IsEmpty());
        // Check for compounds to split
        if (splitAny && nextToken is CoreLabel) {
            CoreLabel cl = (CoreLabel)nextToken;
            // The lexer marks splittable tokens via ParentAnnotation.
            if (cl.ContainsKey(typeof(CoreAnnotations.ParentAnnotation))) {
                if (splitCompounds && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.CompoundAnnotation)) {
                    nextToken = (T)ProcessCompound(cl);
                } else {
                    if (splitVerbs && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.VbPronAnnotation)) {
                        nextToken = (T)ProcessVerb(cl);
                    } else {
                        if (splitContractions && cl.Get(typeof(CoreAnnotations.ParentAnnotation)).Equals(SpanishLexer.ContrAnnotation)) {
                            nextToken = (T)ProcessContraction(cl);
                        }
                    }
                }
            }
        }
        return(nextToken);
    } catch (IOException e) {
        throw new RuntimeIOException(e);
    }
}
/// <summary>
/// Recursively searches for the subtree whose begin/end index annotations
/// exactly equal the given span; prunes subtrees disjoint from the span.
/// </summary>
/// <param name="tree">tree to search</param>
/// <param name="start">begin index of the span</param>
/// <param name="end">end index of the span</param>
/// <returns>the exactly-matching subtree, or null if none exists</returns>
private static Tree FindTreeWithSpan(Tree tree, int start, int end) {
    CoreLabel l = (CoreLabel)tree.Label();
    if (l != null && l.ContainsKey(typeof(CoreAnnotations.BeginIndexAnnotation)) && l.ContainsKey(typeof(CoreAnnotations.EndIndexAnnotation))) {
        int myStart = l.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
        int myEnd = l.Get(typeof(CoreAnnotations.EndIndexAnnotation));
        if (start == myStart && end == myEnd) {
            // found perfect match
            return tree;
        }
        if (end < myStart || start >= myEnd) {
            // Requested span is disjoint from this subtree: nothing below
            // can match, so prune.
            return null;
        }
    }
    // otherwise, check inside children - a match is possible
    foreach (Tree kid in tree.Children()) {
        if (kid == null) {
            continue;
        }
        Tree match = FindTreeWithSpan(kid, start, end);
        if (match != null) {
            // found matching child
            return match;
        }
    }
    // no match
    return null;
}
// TODO not called any more, but possibly useful as a reference
/// <summary>
/// This should be called after the classifier has been trained and
/// parseAndTrain has been called to accumulate test set
/// This will return precision,recall and F1 measure
/// </summary>
public virtual void RunTestSet(IList <IList <CoreLabel> > testSet) {
    ICounter <string> tp = new ClassicCounter <string>();
    ICounter <string> fp = new ClassicCounter <string>();
    ICounter <string> fn = new ClassicCounter <string>();
    ICounter <string> actual = new ClassicCounter <string>();
    foreach (IList <CoreLabel> labels in testSet) {
        IList <CoreLabel> unannotatedLabels = new List <CoreLabel>();
        // create a new label without answer annotation
        foreach (CoreLabel label in labels) {
            CoreLabel newLabel = new CoreLabel();
            newLabel.Set(annotationForWord, label.Get(annotationForWord));
            newLabel.Set(typeof(CoreAnnotations.PartOfSpeechAnnotation), label.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation)));
            unannotatedLabels.Add(newLabel);
        }
        IList <CoreLabel> annotatedLabels = this.classifier.Classify(unannotatedLabels);
        int ind = 0;
        foreach (CoreLabel expectedLabel in labels) {
            CoreLabel annotatedLabel = annotatedLabels[ind];
            string answer = annotatedLabel.Get(typeof(CoreAnnotations.AnswerAnnotation));
            string expectedAnswer = expectedLabel.Get(typeof(CoreAnnotations.AnswerAnnotation));
            actual.IncrementCount(expectedAnswer);
            // match only non background symbols
            if (!SeqClassifierFlags.DefaultBackgroundSymbol.Equals(expectedAnswer) && expectedAnswer.Equals(answer)) {
                // true positives
                tp.IncrementCount(answer);
                System.Console.Out.WriteLine("True Positive:" + annotatedLabel);
            } else {
                if (!SeqClassifierFlags.DefaultBackgroundSymbol.Equals(answer)) {
                    // false positives
                    fp.IncrementCount(answer);
                    System.Console.Out.WriteLine("False Positive:" + annotatedLabel);
                } else {
                    if (!SeqClassifierFlags.DefaultBackgroundSymbol.Equals(expectedAnswer)) {
                        // false negatives
                        fn.IncrementCount(expectedAnswer);
                        System.Console.Out.WriteLine("False Negative:" + expectedLabel);
                    }
                }
            }
            // else true negatives
            ind++;
        }
    }
    actual.Remove(SeqClassifierFlags.DefaultBackgroundSymbol);
    // NOTE(review): tp/fp/fn are accumulated but never reported or returned
    // here, despite the summary promising precision/recall/F1 -- presumably
    // the reporting code was lost; confirm before relying on this method.
}
/// <summary>
/// Determines the speaker for the paragraph at the given utterance index:
/// prefers the following paragraph's speaker; otherwise scans the last
/// sentence of this paragraph for a PERSON mention head (rejected when a
/// verb appears first, since that suggests narration rather than an
/// attribution line).  Then recurses on to the next paragraph.
/// </summary>
/// <returns>the speaker found for the NEXT paragraph</returns>
private string FindParagraphSpeaker(IList <ICoreMap> paragraph, int paragraphUtterIndex, string nextParagraphSpeaker, int paragraphOffset, Dictionaries dict) {
    // NOTE(review): Contains(...) on speakers/mentionheadPositions looks like
    // a Sharpen rendering of Java Map.containsKey -- verify the map types.
    if (!speakers.Contains(paragraphUtterIndex)) {
        if (!nextParagraphSpeaker.Equals(string.Empty)) {
            speakers[paragraphUtterIndex] = nextParagraphSpeaker;
        } else {
            // find the speaker of this paragraph (John, nbc news)
            ICoreMap lastSent = paragraph[paragraph.Count - 1];
            string speaker = string.Empty;
            bool hasVerb = false;
            for (int i = 0; i < lastSent.Get(typeof(CoreAnnotations.TokensAnnotation)).Count; i++) {
                CoreLabel w = lastSent.Get(typeof(CoreAnnotations.TokensAnnotation))[i];
                string pos = w.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
                string ner = w.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                if (pos.StartsWith("V")) {
                    // A verb before any PERSON head: treat as narration, not
                    // an attribution, and stop scanning.
                    hasVerb = true;
                    break;
                }
                if (ner.StartsWith("PER")) {
                    IntTuple headPosition = new IntTuple(2);
                    headPosition.Set(0, paragraph.Count - 1 + paragraphOffset);
                    headPosition.Set(1, i);
                    if (mentionheadPositions.Contains(headPosition)) {
                        // Use the mention id of the PERSON head as the speaker.
                        speaker = int.ToString(mentionheadPositions[headPosition].mentionID);
                    }
                }
            }
            if (!hasVerb && !speaker.Equals(string.Empty)) {
                speakers[paragraphUtterIndex] = speaker;
            }
        }
    }
    return(FindNextParagraphSpeaker(paragraph, paragraphOffset, dict));
}
/// <summary>The main output here is data/tokens/dickens.oliver.tokens, which contains the original book, one token per line, with part of speech, syntax, NER, coreference and other annotations.</summary>
/// <remarks>
/// The main output here is data/tokens/dickens.oliver.tokens, which contains the original book, one token per line, with part of speech, syntax, NER, coreference and other annotations. The (tab-separated) format is:
/// Paragraph id
/// Sentence id
/// Token id
/// Byte start
/// Byte end
/// Whitespace following the token (useful for pretty-printing the original text)
/// Syntactic head id (-1 for the sentence root)
/// Original token
/// Normalized token (for quotes etc.)
/// Lemma
/// Penn Treebank POS tag
/// NER tag (PERSON, NUMBER, DATE, DURATION, MISC, TIME, LOCATION, ORDINAL, MONEY, ORGANIZATION, SET, O)
/// Stanford basic dependency label
/// Within-quotation flag
/// Character id (all coreferent tokens share the same character id)
/// </remarks>
/// <param name="filename"/>
/// <returns>map from character id to the novel tokens mentioning that character</returns>
public static IDictionary <int, IList <CoreLabel> > ReadTokenFile(string filename, Annotation novel) {
    IList <string> lines = IOUtils.LinesFromFile(filename);
    IDictionary <int, IList <CoreLabel> > charsToTokens = new Dictionary <int, IList <CoreLabel> >();
    bool first = true;
    int tokenOffset = 0;
    foreach (string line in lines) {
        if (first) {
            // skip the header row
            first = false;
            continue;
        }
        string[] pieces = line.Split("\t");
        // tokenOffset compensates for tokenization drift between the file
        // and CoreNLP's tokenization of the novel.
        int tokenId = System.Convert.ToInt32(pieces[2]) + tokenOffset;
        string token = pieces[7];
        string normalizedTok = pieces[8];
        int characterId = System.Convert.ToInt32(pieces[14]);
        CoreLabel novelTok = novel.Get(typeof(CoreAnnotations.TokensAnnotation))[tokenId];
        // CoreNLP sometimes splits ". . . ." as ". . ." and "." and sometimes lemmatizes it. (The Steppe)
        if (pieces[7].Equals(". . . .") && !novelTok.Get(typeof(CoreAnnotations.OriginalTextAnnotation)).Equals(". . . .")) {
            tokenOffset++;
        }
        if (characterId != -1) {
            if (!novelTok.Get(typeof(CoreAnnotations.TextAnnotation)).Equals(normalizedTok)) {
                // Misaligned token: report and skip rather than mis-attribute.
                System.Console.Error.WriteLine(token + " != " + novelTok.Get(typeof(CoreAnnotations.TextAnnotation)));
            } else {
                if (!charsToTokens.Contains(characterId)) {
                    charsToTokens[characterId] = new List <CoreLabel>();
                }
                charsToTokens[characterId].Add(novelTok);
            }
        }
    }
    return(charsToTokens);
}
/// <summary>
/// Asserts that the label's xml-context annotation has exactly the expected
/// entries, in order.
/// </summary>
/// <param name="label">label carrying the XmlContextAnnotation</param>
/// <param name="expectedContext">expected context strings, in order</param>
private static void CheckContext(CoreLabel label, params string[] expectedContext) {
    IList <string> actualContext = label.Get(typeof(CoreAnnotations.XmlContextAnnotation));
    NUnit.Framework.Assert.AreEqual(expectedContext.Length, actualContext.Count);
    int position = 0;
    foreach (string expected in expectedContext) {
        NUnit.Framework.Assert.AreEqual(expected, actualContext[position]);
        position++;
    }
}
// public void getDecisionTree(Map<String, List<CoreLabel>> sents,
// List<Pair<String, Integer>> chosen, Counter<String> weights, String
// wekaOptions) {
// RVFDataset<String, String> dataset = new RVFDataset<String, String>();
// for (Pair<String, Integer> d : chosen) {
// CoreLabel l = sents.get(d.first).get(d.second());
// String w = l.word();
// Integer num = this.clusterIds.get(w);
// if (num == null)
// num = -1;
// double wt = weights.getCount("Cluster-" + num);
// String label;
// if (l.get(answerClass).toString().equals(answerLabel))
// label = answerLabel;
// else
// label = "O";
// Counter<String> feat = new ClassicCounter<String>();
// feat.setCount("DIST", wt);
// dataset.add(new RVFDatum<String, String>(feat, label));
// }
// WekaDatumClassifierFactory wekaFactory = new
// WekaDatumClassifierFactory("weka.classifiers.trees.J48", wekaOptions);
// WekaDatumClassifier classifier = wekaFactory.trainClassifier(dataset);
// Classifier cls = classifier.getClassifier();
// J48 j48decisiontree = (J48) cls;
// System.out.println(j48decisiontree.toSummaryString());
// System.out.println(j48decisiontree.toString());
//
// }
/// <summary>
/// Samples training tokens from the sentences into chosen/dataset: every
/// positive token is kept, likely negatives are kept at rate perSelectNeg,
/// and remaining tokens at rate perSelectRand (counted in numrand).
/// </summary>
/// <returns>the updated count of randomly chosen tokens</returns>
private int Sample(IDictionary <string, DataInstance> sents, Random r, Random rneg, double perSelectNeg, double perSelectRand, int numrand, IList <Pair <string, int> > chosen, RVFDataset <string, string> dataset) {
    // NOTE(review): rneg is never used; r drives both the negative and the
    // random sampling decisions -- confirm whether that is intended.
    foreach (KeyValuePair <string, DataInstance> en in sents) {
        CoreLabel[] sent = Sharpen.Collections.ToArray(en.Value.GetTokens(), new CoreLabel[0]);
        for (int i = 0; i < sent.Length; i++) {
            CoreLabel l = sent[i];
            bool chooseThis = false;
            if (l.Get(answerClass).Equals(answerLabel)) {
                // Always keep positive examples.
                chooseThis = true;
            } else {
                if ((!l.Get(answerClass).Equals("O") || negativeWords.Contains(l.Word().ToLower())) && GetRandomBoolean(r, perSelectNeg)) {
                    // Subsample likely negatives at rate perSelectNeg.
                    chooseThis = true;
                } else {
                    if (GetRandomBoolean(r, perSelectRand)) {
                        // Random background tokens at rate perSelectRand.
                        numrand++;
                        chooseThis = true;
                    } else {
                        chooseThis = false;
                    }
                }
            }
            if (chooseThis) {
                chosen.Add(new Pair(en.Key, i));
                RVFDatum <string, string> d = GetDatum(sent, i);
                dataset.Add(d, en.Key, int.ToString(i));
            }
        }
    }
    return(numrand);
}
/// <summary>Find and annotate chunks.</summary>
/// <remarks>
/// Find and annotate chunks. Returns list of CoreMap (Annotation) objects
/// each representing a chunk with the following annotations set:
/// CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
/// CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
/// TokensAnnotation - List of tokens in this chunk
/// TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
/// TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
/// TextAnnotation - String representing tokens in this chunks (token text separated by space)
/// </remarks>
/// <param name="tokens">- List of tokens to look for chunks</param>
/// <param name="totalTokensOffset">- Index of tokens to offset by</param>
/// <param name="labelKey">- Key to use to find the token label (to determine if inside chunk or not)</param>
/// <param name="textKey">- Key to use to find the token text</param>
/// <param name="tokenChunkKey">- If not null, each token is annotated with the chunk using this key</param>
/// <param name="tokenLabelKey">- If not null, each token is annotated with the text associated with the chunk using this key</param>
/// <param name="checkTokensCompatible">- If not null, additional check to see if this token and the previous are compatible</param>
/// <returns>List of annotations (each as a CoreMap) representing the chunks of tokens</returns>
public virtual IList <ICoreMap> GetAnnotatedChunks(IList <CoreLabel> tokens, int totalTokensOffset, Type textKey, Type labelKey, Type tokenChunkKey, Type tokenLabelKey, IPredicate <Pair <CoreLabel, CoreLabel> > checkTokensCompatible) {
    IList <ICoreMap> chunks = new ArrayList();
    LabeledChunkIdentifier.LabelTagType prevTagType = null;
    int tokenBegin = -1;
    for (int i = 0; i < tokens.Count; i++) {
        CoreLabel token = tokens[i];
        string label = (string)token.Get(labelKey);
        LabeledChunkIdentifier.LabelTagType curTagType = GetTagType(label);
        bool isCompatible = true;
        if (checkTokensCompatible != null) {
            CoreLabel prev = null;
            if (i > 0) {
                prev = tokens[i - 1];
            }
            Pair <CoreLabel, CoreLabel> p = Pair.MakePair(token, prev);
            isCompatible = checkTokensCompatible.Test(p);
        }
        // Close the open chunk when the tag sequence ends it here or the
        // adjacent tokens are incompatible.
        if (IsEndOfChunk(prevTagType, curTagType) || !isCompatible) {
            int tokenEnd = i;
            if (tokenBegin >= 0 && tokenEnd > tokenBegin) {
                ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenBegin, tokenEnd, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey);
                chunk.Set(labelKey, prevTagType.type);
                chunks.Add(chunk);
                tokenBegin = -1;
            }
        }
        // Open a new chunk when the tag sequence starts one here.
        if (IsStartOfChunk(prevTagType, curTagType) || (!isCompatible && IsChunk(curTagType))) {
            if (tokenBegin >= 0) {
                throw new Exception("New chunk started, prev chunk not ended yet!");
            }
            tokenBegin = i;
        }
        prevTagType = curTagType;
    }
    // Flush a chunk still open at the end of the token list.
    if (tokenBegin >= 0) {
        ICoreMap chunk = ChunkAnnotationUtils.GetAnnotatedChunk(tokens, tokenBegin, tokens.Count, totalTokensOffset, tokenChunkKey, textKey, tokenLabelKey);
        chunk.Set(labelKey, prevTagType.type);
        chunks.Add(chunk);
    }
    // System.out.println("number of chunks " + chunks.size());
    return(chunks);
}
/// <summary>Find the tree that covers the portion of interest.</summary>
/// <param name="root">tree to descend into</param>
/// <param name="start">begin index the returned subtree must start at</param>
/// <returns>the deepest-reachable subtree beginning exactly at start</returns>
/// <exception cref="Exception">if no child covers the start index</exception>
private static Tree FindPartialSpan(Tree root, int start) {
    CoreLabel rootLabel = (CoreLabel)root.Label();
    int rootStart = rootLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
    if (rootStart == start) {
        return root;
    }
    // Descend into whichever child's span contains the start index.
    foreach (Tree kid in root.Children()) {
        CoreLabel kidLabel = (CoreLabel)kid.Label();
        int kidBegin = kidLabel.Get(typeof(CoreAnnotations.BeginIndexAnnotation));
        int kidEnd = kidLabel.Get(typeof(CoreAnnotations.EndIndexAnnotation));
        if (kidBegin <= start && kidEnd > start) {
            return FindPartialSpan(kid, start);
        }
    }
    throw new Exception("Shouldn't happen: " + start + " " + root);
}
/// <summary>
/// Extends the base character features with the characters three positions
/// to either side, widening the base 5-character window to 7.
/// </summary>
/// <param name="cInfo">padded character sequence</param>
/// <param name="loc">position of the current character</param>
/// <returns>the base features plus the two extra window features</returns>
protected internal override ICollection <string> FeaturesC(PaddedList <In> cInfo, int loc) {
    ICollection <string> features = base.FeaturesC(cInfo, loc);
    string charAtPlus3 = cInfo[loc + 3].Get(typeof(CoreAnnotations.CharAnnotation));
    string charAtMinus3 = cInfo[loc - 3].Get(typeof(CoreAnnotations.CharAnnotation));
    // a 7 character window instead of a 5 character window
    features.Add(charAtPlus3 + "-n3");
    features.Add(charAtMinus3 + "-p3");
    return features;
}
/// <summary>
/// Transition-clique features for the current and previous characters:
/// one character bigram plus the clique indicator.
/// </summary>
/// <param name="cInfo">padded character sequence</param>
/// <param name="loc">position of the current character</param>
/// <returns>the feature strings for this clique</returns>
protected internal virtual ICollection <string> FeaturesCpC(PaddedList <In> cInfo, int loc) {
    ICollection <string> features = new List <string>();
    string cur = cInfo[loc].Get(typeof(CoreAnnotations.CharAnnotation));
    string prev = cInfo[loc - 1].Get(typeof(CoreAnnotations.CharAnnotation));
    // Character bigram (current + previous).
    features.Add(cur + prev + "-cngram");
    // Indicator transition feature
    features.Add("cliqueCpC");
    return features;
}