/// <summary>
/// Returns the highest-scoring key of <c>logProbs</c> for the given word.
/// </summary>
/// <remarks>
/// The word is first added to <c>wordIndex</c>, and <c>EnsureProbs</c> is invoked with the
/// resulting index before <c>logProbs</c> is consulted.
/// </remarks>
/// <param name="word">The word to tag.</param>
/// <returns>The argmax of <c>logProbs</c> after <c>EnsureProbs</c> has run.</returns>
private string GetTag(string word)
{
    int wordId = wordIndex.AddToIndex(word);
    EnsureProbs(wordId, false);
    string bestTag = Counters.Argmax(logProbs);
    return bestTag;
}
/// <summary>
/// Re-types every entity mention in the document with the most frequent NER tag of its tokens.
/// </summary>
/// <remarks>
/// A pipeline configured with the annotators "pos, lemma, ner" is run over the document.
/// Then, for each sentence that carries entity mentions, the NER tags of the tokens inside
/// each mention's extent are tallied and the argmax tag is set as the mention's type.
/// </remarks>
/// <param name="doc">The document to annotate; its entity mentions are modified in place.</param>
private static void ModifyUsingCoreNLPNER(Annotation doc)
{
    Properties pipelineProps = new Properties();
    pipelineProps.SetProperty("annotators", "pos, lemma, ner");
    new StanfordCoreNLP(pipelineProps, false).Annotate(doc);

    foreach (ICoreMap sent in doc.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        IList<EntityMention> mentions = sent.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
        if (mentions == null)
        {
            // Sentences without entity mentions are left untouched.
            continue;
        }

        IList<CoreLabel> sentTokens = sent.Get(typeof(CoreAnnotations.TokensAnnotation));
        foreach (EntityMention mention in mentions)
        {
            Span extent = mention.GetExtent();
            ICounter<string> tagVotes = new ClassicCounter<string>();
            for (int tokenIdx = extent.Start(); tokenIdx < extent.End(); tokenIdx++)
            {
                tagVotes.IncrementCount(sentTokens[tokenIdx].Ner());
            }
            mention.SetType(Counters.Argmax(tagVotes));
        }
    }
}
/// <summary>
/// Attempt to infer the part of speech of the given preterminal node, which
/// was created during the expansion of a multi-word token.
/// </summary>
/// <remarks>
/// The following strategies are tried in order, and the first one that yields a tag wins:
/// <list type="number">
/// <item><description>a manual override from <c>MultiWordPreprocessor.ManualUWModel.GetOverrideTag</c>;</description></item>
/// <item><description>the verb hypothesis: the word is stripped of clitic pronouns and, if the stem is
/// known to the unigram tagger and its best tag starts with "v", that tag is used;</description></item>
/// <item><description>the unigram tagger's best tag for the word itself (ties broken by
/// <c>MultiWordPreprocessor.POSTieBreaker</c>);</description></item>
/// <item><description>the fallback <c>MultiWordPreprocessor.ManualUWModel.GetTag</c>.</description></item>
/// </list>
/// </remarks>
/// <param name="t">The preterminal node whose tag is to be inferred; its first child holds the word.</param>
/// <param name="parent">The parent of <paramref name="t"/>, used to compute the containing phrase.</param>
/// <param name="unigramTagger">Counts of tags per word, keyed first by word.</param>
/// <returns>The inferred part-of-speech tag.</returns>
private static string InferPOS(Tree t, Tree parent, TwoDimensionalCounter<string, string> unigramTagger)
{
    string word = t.FirstChild().Value();
    string containingPhraseStr = GetContainingPhrase(t, parent);

    // Overrides: let the manual POS model handle a few special cases first
    string overrideTag = MultiWordPreprocessor.ManualUWModel.GetOverrideTag(word, containingPhraseStr);
    if (overrideTag != null)
    {
        return overrideTag;
    }

    ICollection<string> unigramTaggerKeys = unigramTagger.FirstKeySet();

    // Try treating this word as a verb and stripping any clitic
    // pronouns. If the stripped version exists in the unigram
    // tagger, then stick with the verb hypothesis
    SpanishVerbStripper.StrippedVerb strippedVerb = verbStripper.SeparatePronouns(word);
    if (strippedVerb != null && unigramTaggerKeys.Contains(strippedVerb.GetStem()))
    {
        string pos = Counters.Argmax(unigramTagger.GetCounter(strippedVerb.GetStem()));
        // Tags are identifiers, not natural-language text: compare ordinally so the result
        // does not depend on the current culture. A null argmax falls through to the
        // remaining strategies instead of throwing.
        if (pos != null && pos.StartsWith("v", System.StringComparison.Ordinal))
        {
            return pos;
        }
    }

    // Reuse the key set fetched above; nothing in between adds keys to the tagger
    // (GetCounter is only called for a key that is already present).
    if (unigramTaggerKeys.Contains(word))
    {
        return Counters.Argmax(unigramTagger.GetCounter(word), new MultiWordPreprocessor.POSTieBreaker());
    }

    return MultiWordPreprocessor.ManualUWModel.GetTag(word, containingPhraseStr);
}
/// <summary>
/// Prunes the predicted mentions so that, within each inner list, at most one mention
/// survives per head index.
/// </summary>
/// <remarks>
/// First, the lowercase-normalized span strings of all mentions whose head word has an NER
/// tag other than "O" are collected. Then, within each inner list, mentions are grouped by
/// <c>headIndex</c>; for every group with more than one member, each member is scored with
/// <c>ProbabilityOf</c>, the argmax is kept and the other members are removed from the list.
/// </remarks>
/// <param name="predictedMentions">Lists of predicted mentions; the inner lists are modified in place.</param>
/// <param name="dict">Dictionaries forwarded to <c>ProbabilityOf</c>.</param>
/// <param name="props">Properties forwarded to <c>ProbabilityOf</c>.</param>
public virtual void ClassifyMentions(IList<IList<Mention>> predictedMentions, Dictionaries dict, Properties props)
{
    ICollection<string> neStrings = Generics.NewHashSet();
    foreach (IList<Mention> predictedMention in predictedMentions)
    {
        foreach (Mention m in predictedMention)
        {
            string ne = m.headWord.Ner();
            if (ne.Equals("O"))
            {
                continue;
            }
            // NOTE(review): the `continue` below only advances this inner token loop, so the
            // loop never prevents the mention from being added to neStrings. It looks like
            // the intent was to skip mentions whose tokens do not all share the head's NER
            // tag -- confirm before changing, since neStrings feeds ProbabilityOf.
            foreach (CoreLabel cl in m.originalSpan)
            {
                if (!cl.Ner().Equals(ne))
                {
                    continue;
                }
            }
            neStrings.Add(m.LowercaseNormalizedSpanString());
        }
    }

    foreach (IList<Mention> predicts in predictedMentions)
    {
        // Group the mentions of this list by head index (one dictionary lookup per mention).
        IDictionary<int, ICollection<Mention>> headPositions = Generics.NewHashMap();
        foreach (Mention p in predicts)
        {
            ICollection<Mention> sameHead;
            if (!headPositions.TryGetValue(p.headIndex, out sameHead))
            {
                sameHead = Generics.NewHashSet();
                headPositions[p.headIndex] = sameHead;
            }
            sameHead.Add(p);
        }

        ICollection<Mention> remove = Generics.NewHashSet();
        foreach (ICollection<Mention> shares in headPositions.Values)
        {
            if (shares.Count <= 1)
            {
                continue;
            }
            ICounter<Mention> probs = new ClassicCounter<Mention>();
            foreach (Mention candidate in shares)
            {
                double trueProb = ProbabilityOf(candidate, shares, neStrings, dict, props);
                probs.IncrementCount(candidate, trueProb);
            }
            // Keep the best-scoring mention; everything else sharing this head is removed.
            Mention keep = Counters.Argmax(probs, null);
            probs.Remove(keep);
            Sharpen.Collections.AddAll(remove, probs.KeySet());
        }

        foreach (Mention r in remove)
        {
            predicts.Remove(r);
        }
    }
}
/// <summary>Returns the best-scoring label for the given example.</summary>
/// <param name="example">The datum to classify.</param>
/// <returns>
/// The argmax of <c>ScoresOf(example)</c>, or <c>defaultLabel</c> when <c>ScoresOf</c>
/// returns null.
/// </returns>
public virtual L ClassOf(IDatum<L, F> example)
{
    ICounter<L> labelScores = ScoresOf(example);
    if (labelScores == null)
    {
        return defaultLabel;
    }
    return Counters.Argmax(labelScores);
}
/// <summary>Select the most common element of the given type in the given span.</summary>
/// <remarks>
/// This is useful for, e.g., finding the most likely NER span of a given span, or the most
/// likely POS tag of a given span.
/// Null entries are removed before the mode is computed.
/// The selector is applied to the sentence exactly once, after the span has been validated.
/// </remarks>
/// <param name="span">The span of the sentence to find the mode element in. This must be entirely contained in the sentence.</param>
/// <param name="selector">The property of the sentence we are getting the mode of. For example, <code>Sentence::posTags</code></param>
/// <typeparam name="E">The type of the elements produced by the selector.</typeparam>
/// <returns>The most common element of the given property in the sentence.</returns>
/// <exception cref="ArgumentException">If the span is not entirely contained in the sentence.</exception>
public virtual E ModeInSpan<E>(Span span, IFunction<Sentence, IList<E>> selector)
{
    if (!Span.FromValues(0, sentence.Length()).Contains(span))
    {
        throw new ArgumentException("Span must be entirely contained in the sentence: " + span + " (sentence length=" + sentence.Length() + ")");
    }

    // The selected property does not depend on the loop index, so fetch it once rather than
    // re-applying the selector for every position in the span.
    IList<E> values = selector.Apply(sentence);

    ICounter<E> candidates = new ClassicCounter<E>();
    foreach (int i in span)
    {
        candidates.IncrementCount(values[i]);
    }
    candidates.Remove(null);
    return Counters.Argmax(candidates);
}
/// <summary>A utility to get useful information out of a CorefMention.</summary>
/// <remarks>
/// In particular, it returns the CoreLabels which are associated with this mention, and it
/// returns a score for how much we think this mention should be the canonical mention.
/// The score is <c>votes * votes / tokenCount</c>, where <c>votes</c> is the number of
/// mention tokens carrying the most frequent NER tag (tokens with a null or "O" tag do not vote).
/// </remarks>
/// <param name="doc">The document this mention is referenced into.</param>
/// <param name="mention">The mention itself.</param>
/// <returns>A pair of the tokens in the mention, and a score for how much we like this mention as the canonical mention.</returns>
private static Pair<IList<CoreLabel>, double> GrokCorefMention(Annotation doc, CorefChain.CorefMention mention)
{
    // sentNum, startIndex and endIndex are shifted down by one to index into the token list.
    IList<CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.SentencesAnnotation))[mention.sentNum - 1].Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<CoreLabel> mentionAsTokens = tokens.SubList(mention.startIndex - 1, mention.endIndex - 1);

    // Try to assess this mention's NER type.
    // The previous Stream().Filter(null).ForEach(null) chain had lost its lambdas, so no
    // vote was ever recorded. Count the votes explicitly instead.
    // NOTE(review): the predicate (skip null and "O" tags) is reconstructed to match
    // GuessNER's handling of NER votes -- confirm against the upstream implementation.
    ICounter<string> nerVotes = new ClassicCounter<string>();
    foreach (CoreLabel token in mentionAsTokens)
    {
        string tokenNer = token.Ner();
        if (tokenNer != null && !"O".Equals(tokenNer))
        {
            nerVotes.IncrementCount(tokenNer);
        }
    }

    // NOTE(review): the tie-breaker argument is null here; confirm whether Argmax expects
    // a real comparator for deterministic ties.
    string ner = Counters.Argmax(nerVotes, null);
    // With no votes there is no winning tag; treat that as zero votes rather than
    // looking up a null key.
    double nerCount = ner == null ? 0.0 : nerVotes.GetCount(ner);
    double nerScore = nerCount * nerCount / ((double)mentionAsTokens.Count);

    // Return
    return Pair.MakePair(mentionAsTokens, nerScore);
}
/// <summary>
/// Classifies the given input and reports the chosen relation together with its probability.
/// </summary>
/// <remarks>
/// The classifier scores are exponentiated and normalized. Starting from the best-scoring
/// relation, any relation that does not type check against the input (object type not among
/// the relation's valid named entity labels, or subject type different from the relation's
/// entity type) is removed and the scores are re-normalized, until a type-correct relation,
/// the no-relation label, or the last remaining candidate is reached.
/// </remarks>
/// <param name="input">The input to classify.</param>
/// <returns>A pair with the relation we classified into, along with its confidence.</returns>
public virtual Pair<string, double> Classify(KBPRelationExtractor.KBPInput input)
{
    ICounter<string> distribution = classifier.ScoresOf(new RVFDatum<string, string>(Features(input)));
    Counters.ExpInPlace(distribution);
    Counters.Normalize(distribution);

    string candidate = Counters.Argmax(distribution);
    while (true)
    {
        // NO_RELATION is always acceptable, and the last remaining candidate is never dropped.
        if (KBPRelationExtractorConstants.NoRelation.Equals(candidate) || distribution.Size() <= 1)
        {
            break;
        }

        var relation = KBPRelationExtractor.RelationType.FromString(candidate).Get();
        bool typeChecks = relation.validNamedEntityLabels.Contains(input.objectType)
            && relation.entityType == input.subjectType;
        if (typeChecks)
        {
            break;
        }

        // Discard the ill-typed candidate and move on to the next best one.
        distribution.Remove(candidate);
        Counters.Normalize(distribution);
        candidate = Counters.Argmax(distribution);
    }

    return Pair.MakePair(candidate, distribution.GetCount(candidate));
}
/// <summary>Guesses a single NER tag for a span of tokens by majority vote.</summary>
/// <remarks>
/// Every token in the span votes with its NER tag; votes for "O" and for null are discarded.
/// The most frequent remaining tag is returned if its count is at least
/// <c>span.Size() / 2</c> (integer division); otherwise "O" is returned.
/// </remarks>
/// <param name="tokens">The tokens that the span indexes into.</param>
/// <param name="span">The token positions to vote over.</param>
/// <returns>The winning NER tag, or "O" when no tag qualifies.</returns>
public static string GuessNER(IList<CoreLabel> tokens, Span span)
{
    ICounter<string> votes = new ClassicCounter<string>();
    foreach (int position in span)
    {
        votes.IncrementCount(tokens[position].Ner());
    }
    votes.Remove("O");
    votes.Remove(null);

    if (votes.Size() <= 0)
    {
        return "O";
    }
    if (Counters.Max(votes) >= span.Size() / 2)
    {
        return Counters.Argmax(votes);
    }
    return "O";
}
/// <summary>
/// Evaluates the classifier on every datum of the dataset and records the resulting statistics.
/// </summary>
/// <remarks>
/// For each datum the label with the highest log probability is taken as the guess. The
/// method resets and then fills <c>total</c>, <c>correct</c>, <c>logLikelihood</c> (sum of
/// the log probabilities of the gold labels) and <c>accuracy</c>. It also fills the
/// <c>scores</c> and <c>isCorrect</c> arrays with, per datum, the guess's log probability
/// and whether the guess matched the gold label, in the order produced by the priority
/// queue's <c>ToSortedList</c> (each entry is queued with priority <c>-guessScore</c>).
/// A guess counts as correct when its index in the dataset's label index equals the gold
/// label's index.
/// </remarks>
/// <param name="classifier">The classifier to evaluate.</param>
/// <param name="data">The dataset to evaluate on.</param>
public virtual void InitMC<F>(IProbabilisticClassifier<L, F> classifier, GeneralDataset<L, F> data)
{
    IPriorityQueue<Pair<int, Pair<double, bool>>> q = new BinaryHeapPriorityQueue<Pair<int, Pair<double, bool>>>();
    total = 0;
    correct = 0;
    logLikelihood = 0.0;
    for (int i = 0; i < data.Size(); i++)
    {
        IDatum<L, F> d = data.GetRVFDatum(i);
        // Named differently from the `scores` member that is assigned further down.
        ICounter<L> labelLogProbs = classifier.LogProbabilityOf(d);
        L guess = Counters.Argmax(labelLogProbs);
        L correctLab = d.Label();
        double guessScore = labelLogProbs.GetCount(guess);
        double correctScore = labelLogProbs.GetCount(correctLab);
        int guessInd = data.LabelIndex().IndexOf(guess);
        int correctInd = data.LabelIndex().IndexOf(correctLab);
        bool guessIsCorrect = guessInd == correctInd;

        total++;
        if (guessIsCorrect)
        {
            correct++;
        }
        logLikelihood += correctScore;
        // The datum index and the correctness flag are already an int and a bool, so they
        // are passed as-is (int.Parse takes a string and bool has no ValueOf in .NET).
        q.Add(new Pair<int, Pair<double, bool>>(i, new Pair<double, bool>(guessScore, guessIsCorrect)), -guessScore);
    }
    accuracy = (double)correct / (double)total;

    IList<Pair<int, Pair<double, bool>>> sorted = q.ToSortedList();
    scores = new double[sorted.Count];
    isCorrect = new bool[sorted.Count];
    for (int rank = 0; rank < sorted.Count; rank++)
    {
        Pair<double, bool> next = sorted[rank].Second();
        scores[rank] = next.First();
        isCorrect[rank] = next.Second();
    }
}
/// <summary>
/// Recursively replaces placeholder labels for missing POS tags and missing phrasal
/// categories in the given tree.
/// </summary>
/// <remarks>
/// A preterminal labelled <c>FrenchXMLTreeReader.MissingPos</c> receives the unigram
/// tagger's best tag for its word, or the tag from <c>MWEPreprocessor.ManualUWModel</c>
/// when the word is unknown to the tagger. After its children have been processed, a node
/// labelled <c>FrenchXMLTreeReader.MissingPhrasal</c> receives the best label recorded in
/// <paramref name="pretermLabel"/> for the space-joined sequence of its children's labels;
/// if none is recorded, a message is printed and the node is left unchanged.
/// The counters <c>nMissingPOS</c> and <c>nMissingPhrasal</c> are incremented accordingly.
/// </remarks>
/// <param name="t">The (sub)tree to fix in place.</param>
/// <param name="pretermLabel">Counts of phrasal labels per child-label sequence.</param>
/// <param name="unigramTagger">Counts of tags per word.</param>
public static void TraverseAndFix(Tree t, TwoDimensionalCounter<string, string> pretermLabel, TwoDimensionalCounter<string, string> unigramTagger)
{
    if (t.IsPreTerminal())
    {
        if (!t.Value().Equals(FrenchXMLTreeReader.MissingPos))
        {
            return;
        }
        nMissingPOS++;
        string token = t.FirstChild().Value();
        string inferredTag;
        if (unigramTagger.FirstKeySet().Contains(token))
        {
            inferredTag = Counters.Argmax(unigramTagger.GetCounter(token));
        }
        else
        {
            inferredTag = MWEPreprocessor.ManualUWModel.GetTag(token);
        }
        t.SetValue(inferredTag);
        return;
    }

    foreach (Tree child in t.Children())
    {
        TraverseAndFix(child, pretermLabel, unigramTagger);
    }

    // Post-order visit: the children's labels are final by now.
    if (!t.Value().Equals(FrenchXMLTreeReader.MissingPhrasal))
    {
        return;
    }
    nMissingPhrasal++;

    StringBuilder labelSequence = new StringBuilder();
    foreach (Tree child in t.Children())
    {
        labelSequence.Append(child.Value());
        labelSequence.Append(" ");
    }
    string posSequence = labelSequence.ToString().Trim();

    if (!pretermLabel.FirstKeySet().Contains(posSequence))
    {
        System.Console.Out.WriteLine("No phrasal cat for: " + posSequence);
        return;
    }
    t.SetValue(Counters.Argmax(pretermLabel.GetCounter(posSequence)));
}
/// <summary>
/// Finds the best sequence for the given sequence model.
/// </summary>
/// <remarks>
/// Delegates to <c>KBestSequences</c> with k = 1 and returns the argmax of its result.
/// This sequence finder only works on SequenceModel's with rightWindow == 0.
/// </remarks>
/// <param name="ts">The sequence model to decode.</param>
/// <returns>An array containing the int tags of the best sequence</returns>
public virtual int[] BestSequence(ISequenceModel ts)
{
    var topSequences = KBestSequences(ts, 1);
    return Counters.Argmax(topSequences);
}
/// <summary>
/// Exercises the counter under test <c>c</c> through a long scripted sequence of updates,
/// checking totals, min/max/argmin/argmax, key ordering, the in-place arithmetic helpers
/// in <c>Counters</c>, and (when <c>c</c> is serializable) a serialization round trip.
/// </summary>
/// <remarks>
/// The arithmetic section only runs for non-integral counters because it relies on
/// fractional counts. All assertions use NUnit's (expected, actual) argument order so that
/// failure messages report the values the right way round.
/// </remarks>
public virtual void TestClassicCounterHistoricalMain()
{
    c.SetCount("p", 0);
    c.SetCount("q", 2);
    ClassicCounter<string> small_c = new ClassicCounter<string>(c);
    // c7 is not inspected; creating and filling it exercises the factory and AddAll.
    ICounter<string> c7 = c.GetFactory().Create();
    c7.AddAll(c);
    NUnit.Framework.Assert.AreEqual(2.0, c.TotalCount());
    c.IncrementCount("p");
    NUnit.Framework.Assert.AreEqual(3.0, c.TotalCount());
    c.IncrementCount("p", 2.0);
    NUnit.Framework.Assert.AreEqual(2.0, Counters.Min(c));
    NUnit.Framework.Assert.AreEqual("q", Counters.Argmin(c));
    // Now p is p=3.0, q=2.0
    c.SetCount("w", -5.0);
    c.SetCount("x", -4.5);
    IList<string> biggestKeys = new List<string>(c.KeySet());
    NUnit.Framework.Assert.AreEqual(4, biggestKeys.Count);
    biggestKeys.Sort(Counters.ToComparator(c, false, true));
    NUnit.Framework.Assert.AreEqual("w", biggestKeys[0]);
    NUnit.Framework.Assert.AreEqual("x", biggestKeys[1]);
    NUnit.Framework.Assert.AreEqual("p", biggestKeys[2]);
    NUnit.Framework.Assert.AreEqual("q", biggestKeys[3]);
    NUnit.Framework.Assert.AreEqual(-5.0, Counters.Min(c), Tolerance);
    NUnit.Framework.Assert.AreEqual("w", Counters.Argmin(c));
    NUnit.Framework.Assert.AreEqual(3.0, Counters.Max(c), Tolerance);
    NUnit.Framework.Assert.AreEqual("p", Counters.Argmax(c));
    if (integral)
    {
        NUnit.Framework.Assert.AreEqual(-1.0, Counters.Mean(c));
    }
    else
    {
        NUnit.Framework.Assert.AreEqual(-1.125, Counters.Mean(c), Tolerance);
    }

    if (!integral)
    {
        // only do this for floating point counters. Too much bother to rewrite
        c.SetCount("x", -2.5);
        ClassicCounter<string> c2 = new ClassicCounter<string>(c);
        NUnit.Framework.Assert.AreEqual(3.0, c2.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(2.0, c2.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(-5.0, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(-2.5, c2.GetCount("x"));

        ICounter<string> c3 = c.GetFactory().Create();
        foreach (string str in c2.KeySet())
        {
            c3.IncrementCount(str);
        }
        NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("x"));

        Counters.AddInPlace(c2, c3, 10.0);
        NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(12.0, c2.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(5.0, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(7.5, c2.GetCount("x"));

        c3.AddAll(c);
        NUnit.Framework.Assert.AreEqual(4.0, c3.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(3.0, c3.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(-4.0, c3.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(-1.5, c3.GetCount("x"));

        Counters.SubtractInPlace(c3, c);
        NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("x"));

        foreach (string str_1 in c.KeySet())
        {
            c3.IncrementCount(str_1);
        }
        NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("x"));

        Counters.DivideInPlace(c2, c3);
        NUnit.Framework.Assert.AreEqual(6.5, c2.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(6.0, c2.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(2.5, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(3.75, c2.GetCount("x"));

        Counters.DivideInPlace(c2, 0.5);
        NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(12.0, c2.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(5.0, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(7.5, c2.GetCount("x"));

        Counters.MultiplyInPlace(c2, 2.0);
        NUnit.Framework.Assert.AreEqual(26.0, c2.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(24.0, c2.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(10.0, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("x"));

        Counters.DivideInPlace(c2, 2.0);
        NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(12.0, c2.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(5.0, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(7.5, c2.GetCount("x"));

        // c2 is updated while walking its own keys: iterate over a snapshot so the loop
        // does not depend on the key view tolerating modification during enumeration.
        foreach (string str_2 in new List<string>(c2.KeySet()))
        {
            c2.IncrementCount(str_2);
        }
        NUnit.Framework.Assert.AreEqual(14.0, c2.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(6.0, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(8.5, c2.GetCount("x"));

        foreach (string str_3 in c.KeySet())
        {
            c2.IncrementCount(str_3);
        }
        NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(14.0, c2.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x"));

        c2.AddAll(small_c);
        NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(16.0, c2.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x"));

        NUnit.Framework.Assert.AreEqual(new HashSet<string>(Arrays.AsList("p", "q")), Counters.KeysAbove(c2, 14));
        NUnit.Framework.Assert.AreEqual(new HashSet<string>(Arrays.AsList("q")), Counters.KeysAt(c2, 16));
        NUnit.Framework.Assert.AreEqual(new HashSet<string>(Arrays.AsList("x", "w")), Counters.KeysBelow(c2, 9.5));

        Counters.AddInPlace(c2, small_c, -6);
        NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p"));
        NUnit.Framework.Assert.AreEqual(4.0, c2.GetCount("q"));
        NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x"));

        Counters.SubtractInPlace(c2, small_c);
        Counters.SubtractInPlace(c2, small_c);
        Counters.RetainNonZeros(c2);
        NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p"));
        NUnit.Framework.Assert.IsFalse(c2.ContainsKey("q"));
        NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w"));
        NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x"));
    }

    // serialize to Stream
    if (c is ISerializable)
    {
        try
        {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            ObjectOutputStream @out = new ObjectOutputStream(new BufferedOutputStream(baos));
            @out.WriteObject(c);
            @out.Close();

            // reconstitute
            byte[] bytes = baos.ToByteArray();
            ObjectInputStream @in = new ObjectInputStream(new BufferedInputStream(new ByteArrayInputStream(bytes)));
            c = IOUtils.ReadObjectFromObjectStream(@in);
            @in.Close();

            if (!this.integral)
            {
                NUnit.Framework.Assert.AreEqual(-2.5, c.TotalCount());
                NUnit.Framework.Assert.AreEqual(-5.0, Counters.Min(c));
                NUnit.Framework.Assert.AreEqual("w", Counters.Argmin(c));
            }
            c.Clear();
            if (!this.integral)
            {
                NUnit.Framework.Assert.AreEqual(0.0, c.TotalCount());
            }
        }
        catch (IOException ioe)
        {
            Fail("IOException: " + ioe);
        }
        catch (TypeLoadException cce)
        {
            Fail("ClassNotFoundException: " + cce);
        }
    }
}
/// <summary>
/// Trains a factored lexicon on a training treebank and reports its tagging accuracy, plus
/// a breakdown of errors by gold tag, on a development treebank.
/// </summary>
/// <param name="args">
/// Exactly four arguments: the language, a comma-separated list of morphological features,
/// the training file and the development file.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 4)
    {
        System.Console.Error.Printf("Usage: java %s language features train_file dev_file%n", typeof(Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon).FullName);
        System.Environment.Exit(-1);
    }

    // Command line options
    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    Treebank trainTreebank = tlpp.DiskTreebank();
    trainTreebank.LoadPath(args[2]);
    Treebank devTreebank = tlpp.DiskTreebank();
    devTreebank.LoadPath(args[3]);

    MorphoFeatureSpecification morphoSpec;
    Options options = GetOptions(language);
    if (language.Equals(Language.Arabic))
    {
        morphoSpec = new ArabicMorphoFeatureSpecification();
        tlpp.SetOptionFlag(new string[] { "-arabicFactored" }, 0);
    }
    else if (language.Equals(Language.French))
    {
        morphoSpec = new FrenchMorphoFeatureSpecification();
        tlpp.SetOptionFlag(new string[] { "-frenchFactored" }, 0);
    }
    else
    {
        // Only Arabic and French are handled.
        throw new NotSupportedException();
    }

    string[] featureNames = args[1].Trim().Split(",");
    foreach (string featureName in featureNames)
    {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(featureName));
    }
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.WriteLine("Features: " + args[1]);

    // Create word and tag indices
    // Save trees in a collection since the interface requires that....
    System.Console.Out.Write("Loading training trees...");
    IList<Tree> trainTrees = new List<Tree>(19000);
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    foreach (Tree tree in trainTreebank)
    {
        foreach (Tree node in tree)
        {
            if (node.IsLeaf())
            {
                continue;
            }
            tlpp.TransformTree(node, tree);
        }
        trainTrees.Add(tree);
    }
    System.Console.Out.Printf("Done! (%d trees)%n", trainTrees.Count);

    // Setup and train the lexicon.
    System.Console.Out.Write("Collecting sufficient statistics for lexicon...");
    Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon lexicon = new Edu.Stanford.Nlp.Parser.Lexparser.FactoredLexicon(options, morphoSpec, wordIndex, tagIndex);
    lexicon.InitializeTraining(trainTrees.Count);
    lexicon.Train(trainTrees, null);
    lexicon.FinishTraining();
    System.Console.Out.WriteLine("Done!");
    trainTrees = null;

    // Load the tuning set
    System.Console.Out.Write("Loading tuning set...");
    IList<FactoredLexiconEvent> tuningSet = GetTuningSet(devTreebank, lexicon, tlpp);
    System.Console.Out.Printf("...Done! (%d events)%n", tuningSet.Count);

    // Print the probabilities that we obtain
    // TODO(spenceg): Implement tagging accuracy with FactLex
    int nCorrect = 0;
    ICounter<string> errors = new ClassicCounter<string>();
    foreach (FactoredLexiconEvent ev in tuningSet)
    {
        IEnumerator<IntTaggedWord> taggings = lexicon.RuleIteratorByWord(ev.Word(), ev.GetLoc(), ev.FeatureStr());
        ICounter<int> logScores = new ClassicCounter<int>();
        bool noRules = true;
        int goldTagId = -1;
        while (taggings.MoveNext())
        {
            noRules = false;
            IntTaggedWord tagging = taggings.Current;
            if (tagging.Tag() == ev.TagId())
            {
                log.Info("GOLD-");
                goldTagId = tagging.Tag();
            }
            float tagScore = lexicon.Score(tagging, ev.GetLoc(), ev.Word(), ev.FeatureStr());
            logScores.IncrementCount(tagging.Tag(), tagScore);
        }

        if (noRules)
        {
            System.Console.Error.Printf("NO TAGGINGS: %s %s%n", ev.Word(), ev.FeatureStr());
        }
        else if (Counters.Argmax(logScores) == goldTagId)
        {
            // Score the tagging: the best-scoring tag is the gold tag.
            ++nCorrect;
        }
        else
        {
            // A gold tag that never showed up among the candidate taggings is "UNSEEN".
            string goldTag;
            if (goldTagId < 0)
            {
                goldTag = "UNSEEN";
            }
            else
            {
                goldTag = lexicon.tagIndex.Get(goldTagId);
            }
            errors.IncrementCount(goldTag);
        }
        log.Info();
    }

    // Output accuracy
    double acc = (double)nCorrect / (double)tuningSet.Count;
    System.Console.Error.Printf("%n%nACCURACY: %.2f%n%n", acc * 100.0);
    log.Info("% of errors by type:");
    IList<string> errorTags = new List<string>(errors.KeySet());
    errorTags.Sort(Counters.ToComparator(errors, false, true));
    Counters.Normalize(errors);
    foreach (string errorTag in errorTags)
    {
        System.Console.Error.Printf("%s\t%.2f%n", errorTag, errors.GetCount(errorTag) * 100.0);
    }
}
/// <summary>
/// Builds a textual error report for a mention whose antecedent decision was wrong.
/// </summary>
/// <remarks>
/// The decision counts as correct when the mention is a first mention and no antecedent
/// scored above <c>sieve.thresMerge</c>, or when an antecedent was found and the
/// best-scoring one is really coreferent; in that case an empty string is returned.
/// Otherwise the report contains the resolver type, the document id, the sentences up to
/// the mention (rendered twice via <c>SentenceStringWithMention</c>), details of the
/// mention, and one line per candidate antecedent with the oracle decision, the first
/// dcoref sieve that links the pair (with the antecedent's position in the dcoref
/// ordering) and the candidate's probability.
/// </remarks>
/// <param name="m">The mention being resolved.</param>
/// <param name="document">The document containing the mention.</param>
/// <param name="probs">Scores of candidate antecedents, keyed by mention id.</param>
/// <param name="mIdx">Index of the mention, forwarded to <c>GetOrderedAntecedents</c>.</param>
/// <param name="dict">Dictionaries forwarded to the sieves.</param>
/// <param name="sieve">The sieve that made the decision.</param>
/// <returns>The error report, or an empty string when the decision was correct.</returns>
/// <exception cref="System.Exception"/>
public static string PrintErrorLog(Mention m, Document document, ICounter<int> probs, int mIdx, Dictionaries dict, RFSieve sieve)
{
    StringBuilder report = new StringBuilder();
    report.Append("\nERROR START-----------------------------------------------------------------------\n");
    report.Append("RESOLVER TYPE: mType: " + sieve.mType + ", aType: " + sieve.aType).Append("\n");
    report.Append("DOCUMENT: " + document.docInfo["DOC_ID"] + ", " + document.docInfo["DOC_PART"]).Append("\n");
    IList<Mention> orderedAnts = new List<Mention>();

    // Sentences are listed from the start of the document up to the mention's sentence.
    report.Append("\nGOLD CLUSTER ID\n");
    for (int sentIdx = 0; sentIdx <= m.sentNum; sentIdx++)
    {
        if (m.sentNum - sentIdx == sieve.maxSentDist)
        {
            report.Append("\tstart compare from here-------------\n");
        }
        report.Append("\tSENT " + sentIdx + "\t" + SentenceStringWithMention(sentIdx, document, true, true)).Append("\n");
    }

    report.Append("\nMENTION ID\n");
    for (int sentIdx = 0; sentIdx <= m.sentNum; sentIdx++)
    {
        if (m.sentNum - sentIdx == sieve.maxSentDist)
        {
            report.Append("\tstart compare from here-------------\n");
        }
        report.Append("\tSENT " + sentIdx + "\t" + SentenceStringWithMention(sentIdx, document, false, false)).Append("\n");
    }

    // get dcoref antecedents ordering
    for (int dist = 0; dist <= Math.Min(sieve.maxSentDist, m.sentNum); dist++)
    {
        Sharpen.Collections.AddAll(orderedAnts, Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve.GetOrderedAntecedents(m, m.sentNum - dist, mIdx, document.predictedMentions, dict));
    }
    IDictionary<int, int> orders = Generics.NewHashMap();
    for (int rank = 0; rank < orderedAnts.Count; rank++)
    {
        orders[orderedAnts[rank].mentionID] = rank;
    }

    CorefCluster mC = document.corefClusters[m.corefClusterID];
    bool isFirstMention = IsFirstMention(m, document);
    bool foundCorefAnt = probs.Size() > 0 && Counters.Max(probs) > sieve.thresMerge;
    bool correctDecision;
    if (isFirstMention && !foundCorefAnt)
    {
        correctDecision = true;
    }
    else
    {
        correctDecision = foundCorefAnt && Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve.IsReallyCoref(document, m.mentionID, Counters.Argmax(probs));
    }
    bool barePlural = m.originalSpan.Count == 1 && m.headWord.Tag().Equals("NNS");
    if (correctDecision)
    {
        return string.Empty;
    }

    report.Append("\nMENTION: " + m.SpanToString() + " (" + m.mentionID + ")\tperson: " + m.person + "\tsingleton? " + (!m.hasTwin) + "\t\tisFirstMention? " + isFirstMention + "\t\tfoundAnt? " + foundCorefAnt + "\t\tcorrectDecision? " + correctDecision + "\tbarePlural? " + barePlural);
    report.Append("\n\ttype: " + m.mentionType + "\tHeadword: " + m.headWord.Word() + "\tNEtype: " + m.nerString + "\tnumber: " + m.number + "\tgender: " + m.gender + "\tanimacy: " + m.animacy).Append("\n");
    if (m.contextParseTree != null)
    {
        report.Append(m.contextParseTree.PennString());
    }
    report.Append("\n\n\t\tOracle\t\tDcoref\t\t\tRF\t\tAntecedent\n");

    foreach (int antID in Counters.ToSortedList(probs))
    {
        Mention candidate = document.predictedMentionsByID[antID];
        CorefCluster aC = document.corefClusters[candidate.corefClusterID];
        bool oracle = Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve.IsReallyCoref(document, m.mentionID, antID);
        double prob = probs.GetCount(antID);
        int order = orders[antID];
        string oracleStr = oracle ? "coref " : "notcoref";

        // The first sieve (in this fixed order) that links the pair names the dcoref decision.
        string dcorefStr = "notcoref";
        if (dcorefDiscourse.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-discourse";
        }
        else if (dcorefExactString.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-exactString";
        }
        else if (dcorefRelaxedExactString.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-relaxedExact";
        }
        else if (dcorefPreciseConstructs.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-preciseConstruct";
        }
        else if (dcorefHead1.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-head1";
        }
        else if (dcorefHead2.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-head2";
        }
        else if (dcorefHead3.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-head3";
        }
        else if (dcorefHead4.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-head4";
        }
        else if (dcorefRelaxedHead.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-relaxedHead";
        }
        else if (dcorefPronounSieve.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-pronounSieve";
        }
        else if (dcorefSpeaker.Coreferent(document, mC, aC, m, candidate, dict, null))
        {
            dcorefStr = "coref-speaker";
        }
        dcorefStr += "\t" + order.ToString();

        string probStr = df.Format(prob);
        report.Append("\t\t" + oracleStr + "\t" + dcorefStr + "\t" + probStr + "\t\t" + candidate.SpanToString() + " (" + candidate.mentionID + ")\n");
    }
    report.Append("ERROR END -----------------------------------------------------------------------\n");
    return report.ToString();
}
/// <summary>Returns the argmax of the underlying <c>counter</c>.</summary>
/// <returns>The result of <c>Counters.Argmax</c> applied to <c>counter</c>.</returns>
public virtual E Argmax()
{
    E best = Counters.Argmax(counter);
    return best;
}
/// <exception cref="System.IO.IOException"/> /// <exception cref="System.TypeLoadException"/> private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, ICounter <E> allSelectedPatterns, ICollection <CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat, ICounter <CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter <CandidatePhrase, E> terms, TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter <E , CandidatePhrase> patternsAndWords4Label, string identifier, ICollection <CandidatePhrase> ignoreWords, bool computeProcDataFreq) { ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>(); if (constVars.doNotApplyPatterns) { // if want to get the stats by the lossy way of just counting without // applying the patterns ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents); while (sentsIter.MoveNext()) { Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current; this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted); } } else { if (patternsLearnedThisIter.Size() > 0) { this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords); } } if (computeProcDataFreq) { if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None)) { Redwood.Log(Redwood.Dbg, "computing processed freq"); foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet()) { double @in = fq.Value; if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt)) { @in = Math.Sqrt(@in); } else { if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log)) { @in = 1 + Math.Log(@in); } else { throw new Exception("can't understand the normalization"); } } 
System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in); Data.processedDataFreq.SetCount(fq.Key, @in); } } else { Data.processedDataFreq = Data.rawFreq; } } if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm)) { foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet()) { if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en)) { terms.AddAll(en, wordsPatExtracted.GetCounter(en)); } } RemoveKeys(terms, ConstantsAndVariables.GetStopWords()); ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false); System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S."))); ICollection <CandidatePhrase> ignoreWordsAll; if (ignoreWords != null && !ignoreWords.IsEmpty()) { ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords()); } else { ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords()); } Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]); Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet()); System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. 
is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S."))); ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract); phraseScorer.PrintReasonForChoosing(finalwords); scoreForAllWordsThisIteration.Clear(); Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores); Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t")); if (constVars.goldEntities != null) { IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label]; if (goldEntities4Label != null) { StringBuilder s = new StringBuilder(); finalwords.KeySet().Stream().ForEach(null); Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString()); } else { Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label); } } if (constVars.outDir != null && !constVars.outDir.IsEmpty()) { string outputdir = constVars.outDir + "/" + identifier + "/" + label; IOUtils.EnsureDir(new File(outputdir)); TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>(); foreach (CandidatePhrase word in finalwords.KeySet()) { foreach (E l in wordsPatExtracted.GetCounter(word).KeySet()) { foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l)) { reasonForWords.IncrementCount(word, w2); } } } Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir); string filename = outputdir + "/words.json"; // the json object is an array corresponding to each iteration - of list // of objects, // each of which is a bean of entity and reasons IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder(); if (writtenInJustification.Contains(label) && writtenInJustification[label]) { IJsonReader jsonReader = Javax.Json.Json.CreateReader(new 
BufferedInputStream(new FileInputStream(filename))); IJsonArray objarr = jsonReader.ReadArray(); foreach (IJsonValue o in objarr) { obj.Add(o); } jsonReader.Close(); } IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder(); foreach (CandidatePhrase w in reasonForWords.FirstKeySet()) { IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder(); IJsonArrayBuilder l = Javax.Json.Json.CreateArrayBuilder(); foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet()) { l.Add(w2.GetPhrase()); } IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder(); foreach (E p in wordsPatExtracted.GetCounter(w)) { pats.Add(p.ToStringSimple()); } objinner.Add("reasonwords", l); objinner.Add("patterns", pats); objinner.Add("score", finalwords.GetCount(w)); objinner.Add("entity", w.GetPhrase()); objThisIter.Add(objinner.Build()); } obj.Add(objThisIter); // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger, // "Writing justification at " + filename); IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII"); writtenInJustification[label] = true; } if (constVars.justify) { Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n"); foreach (CandidatePhrase word in finalwords.KeySet()) { Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n")); } } // if (usePatternResultAsLabel) // if (answerLabel != null) // labelWords(sents, commonEngWords, finalwords.keySet(), // patterns.keySet(), outFile); // else // throw new RuntimeException("why is the answer label null?"); return(finalwords); } else { if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb)) { Counters.AddInPlace(terms, wordsPatExtracted); ICounter <CandidatePhrase> maxPatWeightTerms = new ClassicCounter <CandidatePhrase>(); IDictionary <CandidatePhrase, 
E> wordMaxPat = new Dictionary <CandidatePhrase, E>(); foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet()) { ICounter <E> weights = new ClassicCounter <E>(); foreach (E k in en.Value.KeySet()) { weights.SetCount(k, patternsLearnedThisIter.GetCount(k)); } maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights)); wordMaxPat[en.Key] = Counters.Argmax(weights); } Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords); double maxvalue = Counters.Max(maxPatWeightTerms); ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10); CandidatePhrase bestw = null; if (words.Count > 1) { double max = double.NegativeInfinity; foreach (CandidatePhrase w in words) { if (terms.GetCount(w, wordMaxPat[w]) > max) { max = terms.GetCount(w, wordMaxPat[w]); bestw = w; } } } else { if (words.Count == 1) { bestw = words.GetEnumerator().Current; } else { return(new ClassicCounter <CandidatePhrase>()); } } Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw); return(Counters.AsCounter(Arrays.AsList(bestw))); } else { throw new Exception("wordscoring " + constVars.wordScoring + " not identified"); } } }
/// <summary>
/// Classifies a datum by scoring it and picking the label with the top score.
/// </summary>
/// <param name="example">The datum handed to <c>ScoresOf</c>.</param>
/// <returns>
/// Whatever <c>Counters.Argmax</c> yields for the scores computed by
/// <c>ScoresOf(example)</c>.
/// </returns>
public virtual L ClassOf(IDatum <L, F> example)
{
    // Keep the score counter's static type exactly as ScoresOf declares it.
    var labelScores = ScoresOf(example);
    return Counters.Argmax(labelScores);
}
/// <summary>
/// Classifies a real-valued-feature datum: the datum is scored via
/// <c>ScoresOf</c> and the label selected by <c>Counters.Argmax</c> is returned.
/// </summary>
/// <param name="example">The datum handed to <c>ScoresOf</c>.</param>
/// <returns>The result of <c>Counters.Argmax</c> over the per-label scores.</returns>
public virtual L ClassOf(RVFDatum <L, F> example)
{
    ICounter <L> perLabelScores = ScoresOf(example);
    L bestLabel = Counters.Argmax(perLabelScores);
    return bestLabel;
}
/// <summary>
/// Returns the argmax of the counter obtained by combining this object's
/// counter with the prior's counter.
/// </summary>
/// <returns>
/// The result of <c>Counters.Argmax</c> applied to
/// <c>Counters.LinearCombination(counter, 1.0, prior.counter, priorMultiplier)</c>.
/// </returns>
public override E Argmax()
{
    // Own counter is passed with coefficient 1.0, the prior's with priorMultiplier.
    var blended = Counters.LinearCombination(this.counter, 1.0, prior.counter, priorMultiplier);
    return Counters.Argmax(blended);
}