/// <exception cref="System.IO.IOException"/> /// <exception cref="System.TypeLoadException"/> private ICounter <CandidatePhrase> LearnNewPhrasesPrivate(string label, PatternsForEachToken patternsForEachToken, ICounter <E> patternsLearnedThisIter, ICounter <E> allSelectedPatterns, ICollection <CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap <E, Triple <string, int, int> > matchedTokensByPat, ICounter <CandidatePhrase> scoreForAllWordsThisIteration, TwoDimensionalCounter <CandidatePhrase, E> terms, TwoDimensionalCounter <CandidatePhrase, E> wordsPatExtracted, TwoDimensionalCounter <E , CandidatePhrase> patternsAndWords4Label, string identifier, ICollection <CandidatePhrase> ignoreWords, bool computeProcDataFreq) { ICollection <CandidatePhrase> alreadyLabeledWords = new HashSet <CandidatePhrase>(); if (constVars.doNotApplyPatterns) { // if want to get the stats by the lossy way of just counting without // applying the patterns ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents); while (sentsIter.MoveNext()) { Pair <IDictionary <string, DataInstance>, File> sentsf = sentsIter.Current; this.StatsWithoutApplyingPatterns(sentsf.First(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted); } } else { if (patternsLearnedThisIter.Size() > 0) { this.ApplyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords); } } if (computeProcDataFreq) { if (!phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.None)) { Redwood.Log(Redwood.Dbg, "computing processed freq"); foreach (KeyValuePair <CandidatePhrase, double> fq in Data.rawFreq.EntrySet()) { double @in = fq.Value; if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Sqrt)) { @in = Math.Sqrt(@in); } else { if (phraseScorer.wordFreqNorm.Equals(PhraseScorer.Normalization.Log)) { @in = 1 + Math.Log(@in); } else { throw new Exception("can't understand the normalization"); } } System.Diagnostics.Debug.Assert(!double.IsNaN(@in), "Why is processed freq nan when rawfreq is " + @in); Data.processedDataFreq.SetCount(fq.Key, @in); } } else { Data.processedDataFreq = Data.rawFreq; } } if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Weightednorm)) { foreach (CandidatePhrase en in wordsPatExtracted.FirstKeySet()) { if (!constVars.GetOtherSemanticClassesWords().Contains(en) && (en.GetPhraseLemma() == null || !constVars.GetOtherSemanticClassesWords().Contains(CandidatePhrase.CreateOrGet(en.GetPhraseLemma()))) && !alreadyLabeledWords.Contains(en)) { terms.AddAll(en, wordsPatExtracted.GetCounter(en)); } } RemoveKeys(terms, ConstantsAndVariables.GetStopWords()); ICounter <CandidatePhrase> phraseScores = phraseScorer.ScorePhrases(label, terms, wordsPatExtracted, allSelectedPatterns, alreadyIdentifiedWords, false); System.Console.Out.WriteLine("count for word U.S. is " + phraseScores.GetCount(CandidatePhrase.CreateOrGet("U.S."))); ICollection <CandidatePhrase> ignoreWordsAll; if (ignoreWords != null && !ignoreWords.IsEmpty()) { ignoreWordsAll = CollectionUtils.UnionAsSet(ignoreWords, constVars.GetOtherSemanticClassesWords()); } else { ignoreWordsAll = new HashSet <CandidatePhrase>(constVars.GetOtherSemanticClassesWords()); } Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetSeedLabelDictionary()[label]); Sharpen.Collections.AddAll(ignoreWordsAll, constVars.GetLearnedWords(label).KeySet()); System.Console.Out.WriteLine("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.Contains(CandidatePhrase.CreateOrGet("U.S."))); ICounter <CandidatePhrase> finalwords = ChooseTopWords(phraseScores, terms, phraseScores, ignoreWordsAll, constVars.thresholdWordExtract); phraseScorer.PrintReasonForChoosing(finalwords); scoreForAllWordsThisIteration.Clear(); Counters.AddInPlace(scoreForAllWordsThisIteration, phraseScores); Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Selected Words for " + label + " : " + Counters.ToSortedString(finalwords, finalwords.Size(), "%1$s:%2$.2f", "\t")); if (constVars.goldEntities != null) { IDictionary <string, bool> goldEntities4Label = constVars.goldEntities[label]; if (goldEntities4Label != null) { StringBuilder s = new StringBuilder(); finalwords.KeySet().Stream().ForEach(null); Redwood.Log(ConstantsAndVariables.minimaldebug, "\n\n## Gold labels for selected words for label " + label + " : " + s.ToString()); } else { Redwood.Log(Redwood.Dbg, "No gold entities provided for label " + label); } } if (constVars.outDir != null && !constVars.outDir.IsEmpty()) { string outputdir = constVars.outDir + "/" + identifier + "/" + label; IOUtils.EnsureDir(new File(outputdir)); TwoDimensionalCounter <CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter <CandidatePhrase, CandidatePhrase>(); foreach (CandidatePhrase word in finalwords.KeySet()) { foreach (E l in wordsPatExtracted.GetCounter(word).KeySet()) { foreach (CandidatePhrase w2 in patternsAndWords4Label.GetCounter(l)) { reasonForWords.IncrementCount(word, w2); } } } Redwood.Log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir); string filename = outputdir + "/words.json"; // the json object is an array corresponding to each iteration - of list // of objects, // each of which is a bean of entity and reasons IJsonArrayBuilder obj = Javax.Json.Json.CreateArrayBuilder(); if (writtenInJustification.Contains(label) && writtenInJustification[label]) { IJsonReader jsonReader = Javax.Json.Json.CreateReader(new BufferedInputStream(new FileInputStream(filename))); IJsonArray objarr = jsonReader.ReadArray(); foreach (IJsonValue o in objarr) { obj.Add(o); } jsonReader.Close(); } IJsonArrayBuilder objThisIter = Javax.Json.Json.CreateArrayBuilder(); foreach (CandidatePhrase w in reasonForWords.FirstKeySet()) { IJsonObjectBuilder objinner = Javax.Json.Json.CreateObjectBuilder(); IJsonArrayBuilder l = Javax.Json.Json.CreateArrayBuilder(); foreach (CandidatePhrase w2 in reasonForWords.GetCounter(w).KeySet()) { l.Add(w2.GetPhrase()); } IJsonArrayBuilder pats = Javax.Json.Json.CreateArrayBuilder(); foreach (E p in wordsPatExtracted.GetCounter(w)) { pats.Add(p.ToStringSimple()); } objinner.Add("reasonwords", l); objinner.Add("patterns", pats); objinner.Add("score", finalwords.GetCount(w)); objinner.Add("entity", w.GetPhrase()); objThisIter.Add(objinner.Build()); } obj.Add(objThisIter); // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger, // "Writing justification at " + filename); IOUtils.WriteStringToFile(StringUtils.Normalize(StringUtils.ToAscii(obj.Build().ToString())), filename, "ASCII"); writtenInJustification[label] = true; } if (constVars.justify) { Redwood.Log(Redwood.Dbg, "\nJustification for phrases:\n"); foreach (CandidatePhrase word in finalwords.KeySet()) { Redwood.Log(Redwood.Dbg, "Phrase " + word + " extracted because of patterns: \t" + Counters.ToSortedString(wordsPatExtracted.GetCounter(word), wordsPatExtracted.GetCounter(word).Size(), "%1$s:%2$f", "\n")); } } // if (usePatternResultAsLabel) // if (answerLabel != null) // labelWords(sents, commonEngWords, finalwords.keySet(), // patterns.keySet(), outFile); // else // throw new RuntimeException("why is the answer label null?"); return(finalwords); } else { if (constVars.wordScoring.Equals(GetPatternsFromDataMultiClass.WordScoring.Bpb)) { Counters.AddInPlace(terms, wordsPatExtracted); ICounter <CandidatePhrase> maxPatWeightTerms = new ClassicCounter <CandidatePhrase>(); IDictionary <CandidatePhrase, E> wordMaxPat = new Dictionary <CandidatePhrase, E>(); foreach (KeyValuePair <CandidatePhrase, ClassicCounter <E> > en in terms.EntrySet()) { ICounter <E> weights = new ClassicCounter <E>(); foreach (E k in en.Value.KeySet()) { weights.SetCount(k, patternsLearnedThisIter.GetCount(k)); } maxPatWeightTerms.SetCount(en.Key, Counters.Max(weights)); wordMaxPat[en.Key] = Counters.Argmax(weights); } Counters.RemoveKeys(maxPatWeightTerms, alreadyIdentifiedWords); double maxvalue = Counters.Max(maxPatWeightTerms); ICollection <CandidatePhrase> words = Counters.KeysAbove(maxPatWeightTerms, maxvalue - 1e-10); CandidatePhrase bestw = null; if (words.Count > 1) { double max = double.NegativeInfinity; foreach (CandidatePhrase w in words) { if (terms.GetCount(w, wordMaxPat[w]) > max) { max = terms.GetCount(w, wordMaxPat[w]); bestw = w; } } } else { if (words.Count == 1) { bestw = words.GetEnumerator().Current; } else { return(new ClassicCounter <CandidatePhrase>()); } } Redwood.Log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw); return(Counters.AsCounter(Arrays.AsList(bestw))); } else { throw new Exception("wordscoring " + constVars.wordScoring + " not identified"); } } }
protected virtual void SetUp() { c.Clear(); }
public virtual void TestClassicCounterHistoricalMain() { c.SetCount("p", 0); c.SetCount("q", 2); ClassicCounter <string> small_c = new ClassicCounter <string>(c); ICounter <string> c7 = c.GetFactory().Create(); c7.AddAll(c); NUnit.Framework.Assert.AreEqual(c.TotalCount(), 2.0); c.IncrementCount("p"); NUnit.Framework.Assert.AreEqual(c.TotalCount(), 3.0); c.IncrementCount("p", 2.0); NUnit.Framework.Assert.AreEqual(Counters.Min(c), 2.0); NUnit.Framework.Assert.AreEqual(Counters.Argmin(c), "q"); // Now p is p=3.0, q=2.0 c.SetCount("w", -5.0); c.SetCount("x", -4.5); IList <string> biggestKeys = new List <string>(c.KeySet()); NUnit.Framework.Assert.AreEqual(biggestKeys.Count, 4); biggestKeys.Sort(Counters.ToComparator(c, false, true)); NUnit.Framework.Assert.AreEqual("w", biggestKeys[0]); NUnit.Framework.Assert.AreEqual("x", biggestKeys[1]); NUnit.Framework.Assert.AreEqual("p", biggestKeys[2]); NUnit.Framework.Assert.AreEqual("q", biggestKeys[3]); NUnit.Framework.Assert.AreEqual(Counters.Min(c), -5.0, Tolerance); NUnit.Framework.Assert.AreEqual(Counters.Argmin(c), "w"); NUnit.Framework.Assert.AreEqual(Counters.Max(c), 3.0, Tolerance); NUnit.Framework.Assert.AreEqual(Counters.Argmax(c), "p"); if (integral) { NUnit.Framework.Assert.AreEqual(Counters.Mean(c), -1.0); } else { NUnit.Framework.Assert.AreEqual(Counters.Mean(c), -1.125, Tolerance); } if (!integral) { // only do this for floating point counters. Too much bother to rewrite c.SetCount("x", -2.5); ClassicCounter <string> c2 = new ClassicCounter <string>(c); NUnit.Framework.Assert.AreEqual(3.0, c2.GetCount("p")); NUnit.Framework.Assert.AreEqual(2.0, c2.GetCount("q")); NUnit.Framework.Assert.AreEqual(-5.0, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(-2.5, c2.GetCount("x")); ICounter <string> c3 = c.GetFactory().Create(); foreach (string str in c2.KeySet()) { c3.IncrementCount(str); } NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("p")); NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("q")); NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("w")); NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("x")); Counters.AddInPlace(c2, c3, 10.0); NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("p")); NUnit.Framework.Assert.AreEqual(12.0, c2.GetCount("q")); NUnit.Framework.Assert.AreEqual(5.0, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(7.5, c2.GetCount("x")); c3.AddAll(c); NUnit.Framework.Assert.AreEqual(4.0, c3.GetCount("p")); NUnit.Framework.Assert.AreEqual(3.0, c3.GetCount("q")); NUnit.Framework.Assert.AreEqual(-4.0, c3.GetCount("w")); NUnit.Framework.Assert.AreEqual(-1.5, c3.GetCount("x")); Counters.SubtractInPlace(c3, c); NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("p")); NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("q")); NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("w")); NUnit.Framework.Assert.AreEqual(1.0, c3.GetCount("x")); foreach (string str_1 in c.KeySet()) { c3.IncrementCount(str_1); } NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("p")); NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("q")); NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("w")); NUnit.Framework.Assert.AreEqual(2.0, c3.GetCount("x")); Counters.DivideInPlace(c2, c3); NUnit.Framework.Assert.AreEqual(6.5, c2.GetCount("p")); NUnit.Framework.Assert.AreEqual(6.0, c2.GetCount("q")); NUnit.Framework.Assert.AreEqual(2.5, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(3.75, c2.GetCount("x")); Counters.DivideInPlace(c2, 0.5); NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("p")); NUnit.Framework.Assert.AreEqual(12.0, c2.GetCount("q")); NUnit.Framework.Assert.AreEqual(5.0, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(7.5, c2.GetCount("x")); Counters.MultiplyInPlace(c2, 2.0); NUnit.Framework.Assert.AreEqual(26.0, c2.GetCount("p")); NUnit.Framework.Assert.AreEqual(24.0, c2.GetCount("q")); NUnit.Framework.Assert.AreEqual(10.0, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("x")); Counters.DivideInPlace(c2, 2.0); NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("p")); NUnit.Framework.Assert.AreEqual(12.0, c2.GetCount("q")); NUnit.Framework.Assert.AreEqual(5.0, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(7.5, c2.GetCount("x")); foreach (string str_2 in c2.KeySet()) { c2.IncrementCount(str_2); } NUnit.Framework.Assert.AreEqual(14.0, c2.GetCount("p")); NUnit.Framework.Assert.AreEqual(13.0, c2.GetCount("q")); NUnit.Framework.Assert.AreEqual(6.0, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(8.5, c2.GetCount("x")); foreach (string str_3 in c.KeySet()) { c2.IncrementCount(str_3); } NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p")); NUnit.Framework.Assert.AreEqual(14.0, c2.GetCount("q")); NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x")); c2.AddAll(small_c); NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p")); NUnit.Framework.Assert.AreEqual(16.0, c2.GetCount("q")); NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x")); NUnit.Framework.Assert.AreEqual(new HashSet <string>(Arrays.AsList("p", "q")), Counters.KeysAbove(c2, 14)); NUnit.Framework.Assert.AreEqual(new HashSet <string>(Arrays.AsList("q")), Counters.KeysAt(c2, 16)); NUnit.Framework.Assert.AreEqual(new HashSet <string>(Arrays.AsList("x", "w")), Counters.KeysBelow(c2, 9.5)); Counters.AddInPlace(c2, small_c, -6); NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p")); NUnit.Framework.Assert.AreEqual(4.0, c2.GetCount("q")); NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x")); Counters.SubtractInPlace(c2, small_c); Counters.SubtractInPlace(c2, small_c); Counters.RetainNonZeros(c2); NUnit.Framework.Assert.AreEqual(15.0, c2.GetCount("p")); NUnit.Framework.Assert.IsFalse(c2.ContainsKey("q")); NUnit.Framework.Assert.AreEqual(7.0, c2.GetCount("w")); NUnit.Framework.Assert.AreEqual(9.5, c2.GetCount("x")); } // serialize to Stream if (c is ISerializable) { try { ByteArrayOutputStream baos = new ByteArrayOutputStream(); ObjectOutputStream @out = new ObjectOutputStream(new BufferedOutputStream(baos)); @out.WriteObject(c); @out.Close(); // reconstitute byte[] bytes = baos.ToByteArray(); ObjectInputStream @in = new ObjectInputStream(new BufferedInputStream(new ByteArrayInputStream(bytes))); c = IOUtils.ReadObjectFromObjectStream(@in); @in.Close(); if (!this.integral) { NUnit.Framework.Assert.AreEqual(-2.5, c.TotalCount()); NUnit.Framework.Assert.AreEqual(-5.0, Counters.Min(c)); NUnit.Framework.Assert.AreEqual("w", Counters.Argmin(c)); } c.Clear(); if (!this.integral) { NUnit.Framework.Assert.AreEqual(0.0, c.TotalCount()); } } catch (IOException ioe) { Fail("IOException: " + ioe); } catch (TypeLoadException cce) { Fail("ClassNotFoundException: " + cce); } } }