/// <summary>
/// Scores one gold/guess label sequence with <c>IOBUtils.CountEntityResults</c> and asserts the
/// totals of the resulting true-positive, false-positive and false-negative counters.
/// </summary>
/// <param name="gold">Gold labels handed to <c>MakeListCoreLabel</c>.</param>
/// <param name="guess">Guessed labels handed to <c>MakeListCoreLabel</c>.</param>
/// <param name="tp">Expected total count of the true-positive counter.</param>
/// <param name="fp">Expected total count of the false-positive counter.</param>
/// <param name="fn">Expected total count of the false-negative counter.</param>
private static void RunIOBResultsTest(string[] gold, string[] guess, double tp, double fp, double fn)
{
    IList<CoreLabel> sentence = MakeListCoreLabel(gold, guess);

    ICounter<string> entityTP = new ClassicCounter<string>();
    ICounter<string> entityFP = new ClassicCounter<string>();
    ICounter<string> entityFN = new ClassicCounter<string>();

    // NOTE(review): Bg is presumably the background ("outside") label -- confirm against its declaration.
    IOBUtils.CountEntityResults(sentence, entityTP, entityFP, entityFN, Bg);

    // NUnit's overload is (expected, actual, delta, message); the message-first order is JUnit's
    // and does not bind to any NUnit AreEqual overload.
    const double tolerance = 0.0001;
    NUnit.Framework.Assert.AreEqual(tp, entityTP.TotalCount(), tolerance, "For true positives");
    NUnit.Framework.Assert.AreEqual(fp, entityFP.TotalCount(), tolerance, "For false positives");
    NUnit.Framework.Assert.AreEqual(fn, entityFN.TotalCount(), tolerance, "For false negatives");
}
// private static String stripTag(String tag) {
//   if (tag.startsWith("DT")) {
//     String newTag = tag.substring(2, tag.length());
//     return newTag.length() > 0 ? newTag : tag;
//   }
//   return tag;
// }

/// <summary>
/// Loads a treebank, counts words, tags, lemmas and morphological tags over every token, and
/// prints summary statistics to standard output.
/// </summary>
/// <param name="args">
/// Exactly three values: the language name, the treebank path, and a comma-separated list of
/// morphological feature names to activate.
/// </param>
public static void Main(string[] args)
{
    if (args.Length != 3)
    {
        System.Console.Error.Printf("Usage: java %s language filename features%n", typeof(TreebankFactoredLexiconStats).FullName);
        System.Environment.Exit(-1);
    }

    Language language = Language.ValueOf(args[0]);
    ITreebankLangParserParams tlpp = language.@params;
    bool isArabic = language.Equals(Language.Arabic);

    // Every language other than Arabic is configured as French.
    string[] options = new string[] { isArabic ? "-arabicFactored" : "-frenchFactored" };
    tlpp.SetOptionFlag(options, 0);

    Treebank tb = tlpp.DiskTreebank();
    tb.LoadPath(args[1]);

    MorphoFeatureSpecification morphoSpec;
    if (isArabic)
    {
        morphoSpec = new ArabicMorphoFeatureSpecification();
    }
    else
    {
        morphoSpec = new FrenchMorphoFeatureSpecification();
    }

    foreach (string feature in args[2].Trim().Split(','))
    {
        morphoSpec.Activate(MorphoFeatureSpecification.MorphoFeatureType.ValueOf(feature));
    }

    // Counters
    ICounter<string> wordTagCounter = new ClassicCounter<string>(30000);
    ICounter<string> morphTagCounter = new ClassicCounter<string>(500);
    // Counter<String> signatureTagCounter = new ClassicCounter<String>();
    ICounter<string> morphCounter = new ClassicCounter<string>(500);
    ICounter<string> wordCounter = new ClassicCounter<string>(30000);
    ICounter<string> tagCounter = new ClassicCounter<string>(300);
    ICounter<string> lemmaCounter = new ClassicCounter<string>(25000);
    ICounter<string> lemmaTagCounter = new ClassicCounter<string>(25000);
    ICounter<string> richTagCounter = new ClassicCounter<string>(1000);
    ICounter<string> reducedTagCounter = new ClassicCounter<string>(500);
    ICounter<string> reducedTagLemmaCounter = new ClassicCounter<string>(500);
    IDictionary<string, ICollection<string>> wordLemmaMap = Generics.NewHashMap();
    TwoDimensionalIntCounter<string, string> lemmaReducedTagCounter = new TwoDimensionalIntCounter<string, string>(30000);
    TwoDimensionalIntCounter<string, string> reducedTagTagCounter = new TwoDimensionalIntCounter<string, string>(500);
    TwoDimensionalIntCounter<string, string> tagReducedTagCounter = new TwoDimensionalIntCounter<string, string>(300);

    int numTrees = 0;
    foreach (Tree tree in tb)
    {
        foreach (Tree subTree in tree)
        {
            if (!subTree.IsLeaf())
            {
                tlpp.TransformTree(subTree, tree);
            }
        }

        IList<ILabel> pretermList = tree.PreTerminalYield();
        IList<ILabel> yield = tree.Yield();
        System.Diagnostics.Debug.Assert(yield.Count == pretermList.Count);

        int yieldLen = yield.Count;
        for (int i = 0; i < yieldLen; ++i)
        {
            string tag = pretermList[i].Value();
            string word = yield[i].Value();
            string morph = ((CoreLabel)yield[i]).OriginalText();

            // Note: if there is no lemma, then we use the surface form.
            Pair<string, string> lemmaTag = MorphoFeatureSpecification.SplitMorphString(word, morph);
            string lemma = lemmaTag.First();
            string richTag = lemmaTag.Second();

            // WSGDEBUG
            if (tag.Contains("MW"))
            {
                lemma += "-MWE";
            }

            lemmaCounter.IncrementCount(lemma);
            lemmaTagCounter.IncrementCount(lemma + tag);
            richTagCounter.IncrementCount(richTag);

            string reducedTag = morphoSpec.StrToFeatures(richTag).ToString();
            reducedTagCounter.IncrementCount(reducedTag);
            reducedTagLemmaCounter.IncrementCount(reducedTag + lemma);
            wordTagCounter.IncrementCount(word + tag);
            morphTagCounter.IncrementCount(morph + tag);
            morphCounter.IncrementCount(morph);
            wordCounter.IncrementCount(word);
            tagCounter.IncrementCount(tag);

            // The one-dimensional counters above keep the empty string; only the
            // two-dimensional counters below see the "NONE" placeholder.
            reducedTag = reducedTag.Equals(string.Empty) ? "NONE" : reducedTag;

            // Record the lemma on first sight of the word as well; previously the freshly
            // created set was stored empty, so the first lemma of every word was lost.
            ICollection<string> knownLemmas;
            if (!wordLemmaMap.TryGetValue(word, out knownLemmas))
            {
                knownLemmas = Generics.NewHashSet(1);
                wordLemmaMap[word] = knownLemmas;
            }
            knownLemmas.Add(lemma);

            lemmaReducedTagCounter.IncrementCount(lemma, reducedTag);
            reducedTagTagCounter.IncrementCount(lemma + reducedTag, tag);
            tagReducedTagCounter.IncrementCount(tag, reducedTag);
        }
        ++numTrees;
    }

    // Barf...
    System.Console.Out.WriteLine("Language: " + language.ToString());
    System.Console.Out.Printf("#trees:\t%d%n", numTrees);
    System.Console.Out.Printf("#tokens:\t%d%n", (int)wordCounter.TotalCount());
    System.Console.Out.Printf("#words:\t%d%n", wordCounter.KeySet().Count);
    System.Console.Out.Printf("#tags:\t%d%n", tagCounter.KeySet().Count);
    System.Console.Out.Printf("#wordTagPairs:\t%d%n", wordTagCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmas:\t%d%n", lemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#lemmaTagPairs:\t%d%n", lemmaTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattags:\t%d%n", reducedTagCounter.KeySet().Count);
    System.Console.Out.Printf("#feattag+lemmas:\t%d%n", reducedTagLemmaCounter.KeySet().Count);
    System.Console.Out.Printf("#richtags:\t%d%n", richTagCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemma:\t%d%n", morphCounter.KeySet().Count);
    System.Console.Out.Printf("#richtag+lemmaTagPairs:\t%d%n", morphTagCounter.KeySet().Count);

    // Extra
    System.Console.Out.WriteLine("==================");
    StringBuilder sbNoLemma = new StringBuilder();
    StringBuilder sbMultLemmas = new StringBuilder();
    foreach (KeyValuePair<string, ICollection<string>> wordLemmas in wordLemmaMap)
    {
        string word = wordLemmas.Key;
        ICollection<string> lemmas = wordLemmas.Value;
        if (lemmas.Count == 0)
        {
            sbNoLemma.Append("NO LEMMAS FOR WORD: " + word + "\n");
            continue;
        }
        if (lemmas.Count > 1)
        {
            sbMultLemmas.Append("MULTIPLE LEMMAS: " + word + " " + SetToString(lemmas) + "\n");
            continue;
        }

        // Exactly one lemma at this point. An enumerator's Current is only valid after
        // MoveNext(), so take the element through foreach instead of reading Current directly.
        string lemma = null;
        foreach (string onlyLemma in lemmas)
        {
            lemma = onlyLemma;
            break;
        }

        ICollection<string> reducedTags = lemmaReducedTagCounter.GetCounter(lemma).KeySet();
        if (reducedTags.Count > 1)
        {
            System.Console.Out.Printf("%s --> %s%n", word, lemma);
            foreach (string reducedTag in reducedTags)
            {
                int count = lemmaReducedTagCounter.GetCount(lemma, reducedTag);
                string posTags = SetToString(reducedTagTagCounter.GetCounter(lemma + reducedTag).KeySet());
                System.Console.Out.Printf("\t%s\t%d\t%s%n", reducedTag, count, posTags);
            }
            System.Console.Out.WriteLine();
        }
    }

    System.Console.Out.WriteLine("==================");
    System.Console.Out.WriteLine(sbNoLemma.ToString());
    System.Console.Out.WriteLine(sbMultLemmas.ToString());
    System.Console.Out.WriteLine("==================");

    // Ordinal ordering keeps the listing independent of the current culture.
    List<string> tags = new List<string>(tagReducedTagCounter.FirstKeySet());
    tags.Sort(System.StringComparer.Ordinal);
    foreach (string tag_1 in tags)
    {
        System.Console.Out.WriteLine(tag_1);
        ICollection<string> reducedTags = tagReducedTagCounter.GetCounter(tag_1).KeySet();
        foreach (string reducedTag in reducedTags)
        {
            int count = tagReducedTagCounter.GetCount(tag_1, reducedTag);
            // reducedTag = reducedTag.equals("") ? "NONE" : reducedTag;
            System.Console.Out.Printf("\t%s\t%d%n", reducedTag, count);
        }
        System.Console.Out.WriteLine();
    }
    System.Console.Out.WriteLine("==================");
}
/// <summary>
/// Writes character- and word-level statistics for a collection of trees: how many characters
/// and tagged words have a count below 1.5 ("singletons"), plus distributions over singleton
/// word tags, singleton character radicals and word length.
/// </summary>
/// <param name="trees">Trees whose tagged yields are counted.</param>
/// <param name="pw">Writer that receives the report.</param>
public static void PrintStats(ICollection<Tree> trees, PrintWriter pw)
{
    ClassicCounter<int> wordLengthCounter = new ClassicCounter<int>();
    ClassicCounter<TaggedWord> wordCounter = new ClassicCounter<TaggedWord>();
    ClassicCounter<ChineseCharacterBasedLexicon.Symbol> charCounter = new ClassicCounter<ChineseCharacterBasedLexicon.Symbol>();

    int counter = 0;
    foreach (Tree tree in trees)
    {
        counter++;
        IList<TaggedWord> taggedWords = tree.TaggedYield();
        foreach (TaggedWord taggedWord in taggedWords)
        {
            string word = taggedWord.Word();
            if (word.Equals(LexiconConstants.Boundary))
            {
                continue;
            }

            wordCounter.IncrementCount(taggedWord);

            // The length is both a counter key and the loop bound; it was previously passed
            // through int.Parse (which expects a string) and the bound was never declared.
            int length = word.Length;
            wordLengthCounter.IncrementCount(length);
            for (int j = 0; j < length; j++)
            {
                ChineseCharacterBasedLexicon.Symbol sym = ChineseCharacterBasedLexicon.Symbol.CannonicalSymbol(word[j]);
                charCounter.IncrementCount(sym);
            }
            charCounter.IncrementCount(ChineseCharacterBasedLexicon.Symbol.EndWord);
        }
    }

    // A threshold of 1.5 selects the keys whose count is below 1.5.
    ICollection<ChineseCharacterBasedLexicon.Symbol> singletonChars = Counters.KeysBelow(charCounter, 1.5);
    ICollection<TaggedWord> singletonWords = Counters.KeysBelow(wordCounter, 1.5);

    ClassicCounter<string> singletonWordPOSes = new ClassicCounter<string>();
    foreach (TaggedWord taggedWord_1 in singletonWords)
    {
        singletonWordPOSes.IncrementCount(taggedWord_1.Tag());
    }
    Distribution<string> singletonWordPOSDist = Distribution.GetDistribution(singletonWordPOSes);

    ClassicCounter<char> singletonCharRads = new ClassicCounter<char>();
    foreach (ChineseCharacterBasedLexicon.Symbol s in singletonChars)
    {
        // NOTE(review): assumes RadicalMap.GetRadical returns a char -- confirm. The former
        // char.ValueOf wrapper was a boxing leftover from Java and does not exist in C#.
        singletonCharRads.IncrementCount(RadicalMap.GetRadical(s.GetCh()));
    }
    Distribution<char> singletonCharRadDist = Distribution.GetDistribution(singletonCharRads);
    Distribution<int> wordLengthDist = Distribution.GetDistribution(wordLengthCounter);

    NumberFormat percent = new DecimalFormat("##.##%");

    pw.Println("There are " + singletonChars.Count + " singleton chars out of " + (int)charCounter.TotalCount() + " tokens and " + charCounter.Size() + " types found in " + counter + " trees.");
    pw.Println("Thus singletonChars comprise " + percent.Format(singletonChars.Count / charCounter.TotalCount()) + " of tokens and " + percent.Format((double)singletonChars.Count / charCounter.Size()) + " of types.");
    pw.Println();
    pw.Println("There are " + singletonWords.Count + " singleton words out of " + (int)wordCounter.TotalCount() + " tokens and " + wordCounter.Size() + " types.");
    pw.Println("Thus singletonWords comprise " + percent.Format(singletonWords.Count / wordCounter.TotalCount()) + " of tokens and " + percent.Format((double)singletonWords.Count / wordCounter.Size()) + " of types.");
    pw.Println();
    pw.Println("Distribution over singleton word POS:");
    pw.Println(singletonWordPOSDist.ToString());
    pw.Println();
    pw.Println("Distribution over singleton char radicals:");
    pw.Println(singletonCharRadDist.ToString());
    pw.Println();
    pw.Println("Distribution over word length:");
    pw.Println(wordLengthDist);
}
/// <summary>
/// Adds to <paramref name="splitters"/> the name of every parent-annotated ("node^parent") and
/// grandparent-annotated ("node^parent~grandparent") category whose score, the KL divergence
/// from its less specific category multiplied by its own support, is at least
/// <paramref name="cutOff"/>.
/// </summary>
/// <param name="cutOff">Minimum score for a category to be added.</param>
/// <param name="nr">Rule counters keyed by node label.</param>
/// <param name="pr">Rule counters keyed by a list whose first two elements are node and parent.</param>
/// <param name="gpr">Rule counters keyed by a list whose first three elements are node, parent and grandparent.</param>
/// <param name="splitters">Collection that receives the selected names, highest score first within each group.</param>
private static void GetSplitters(double cutOff, IDictionary<string, ClassicCounter<IList<string>>> nr, IDictionary<IList<string>, ClassicCounter<IList<string>>> pr, IDictionary<IList<string>, ClassicCounter<IList<string>>> gpr, ICollection<string> splitters)
{
    // do value of parent
    foreach (KeyValuePair<string, ClassicCounter<IList<string>>> nodeEntry in nr)
    {
        string node = nodeEntry.Key;
        ClassicCounter<IList<string>> cntr = nodeEntry.Value;
        List<Pair<IList<string>, double>> answers = new List<Pair<IList<string>, double>>();

        foreach (IList<string> key in pr.Keys)
        {
            if (key[0].Equals(node))
            {
                // only do it if they match
                ClassicCounter<IList<string>> cntr2 = pr[key];
                double support2 = cntr2.TotalCount();
                double kl = Counters.KlDivergence(cntr2, cntr);
                answers.Add(new Pair<IList<string>, double>(key, kl * support2));
            }
        }

        // Highest score first. Sorting with no comparer falls back to a default ordering of
        // pairs whose first element is a list, which is not the intended score ordering.
        answers.Sort((left, right) => right.Second().CompareTo(left.Second()));

        foreach (Pair<IList<string>, double> answer in answers)
        {
            if (answer.Second() >= cutOff)
            {
                IList<string> lst = answer.First();
                splitters.Add(lst[0] + "^" + lst[1]);
            }
        }
    }

    // TODO: an information-gain variant of the parent pass was sketched here but never finished.

    // do value of grandparent
    foreach (IList<string> node_1 in pr.Keys)
    {
        ClassicCounter<IList<string>> cntr = pr[node_1];
        double support = cntr.TotalCount();
        if (support < Suppcutoff)
        {
            // Skip parent-annotated categories with support below Suppcutoff.
            continue;
        }

        List<Pair<IList<string>, double>> answers = new List<Pair<IList<string>, double>>();
        foreach (IList<string> key in gpr.Keys)
        {
            if (key[0].Equals(node_1[0]) && key[1].Equals(node_1[1]))
            {
                // only do it if they match
                ClassicCounter<IList<string>> cntr2 = gpr[key];
                double support2 = cntr2.TotalCount();
                double kl = Counters.KlDivergence(cntr2, cntr);
                answers.Add(new Pair<IList<string>, double>(key, kl * support2));
            }
        }

        answers.Sort((left, right) => right.Second().CompareTo(left.Second()));

        foreach (Pair<IList<string>, double> answer in answers)
        {
            if (answer.Second() >= cutOff)
            {
                IList<string> lst = answer.First();
                splitters.Add(lst[0] + "^" + lst[1] + "~" + lst[2]);
            }
        }
    }
}
/// <summary>
/// Prints, for every node and every sufficiently supported parent-annotated node, the KL
/// divergence and support of each more specific category, the categories sorted by
/// support * KL, all scores in priority order, and finally generated Java source listing the
/// categories that reach each cutoff in <c>Cutoffs</c>.
/// </summary>
public virtual void PrintStats()
{
    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(2);

    // System.out.println("Node rules");
    // System.out.println(nodeRules);
    // System.out.println("Parent rules");
    // System.out.println(pRules);
    // System.out.println("Grandparent rules");
    // System.out.println(gPRules);

    // Store java code for selSplit
    StringBuilder[] javaSB = new StringBuilder[Cutoffs.Length];
    for (int i = 0; i < Cutoffs.Length; i++)
    {
        javaSB[i] = new StringBuilder(" private static String[] splitters" + (i + 1) + " = new String[] {");
    }

    ClassicCounter<IList<string>> allScores = new ClassicCounter<IList<string>>();

    // do value of parent
    foreach (string node in nodeRules.Keys)
    {
        List<Pair<IList<string>, double>> answers = new List<Pair<IList<string>, double>>();
        ClassicCounter<IList<string>> cntr = nodeRules[node];
        double support = cntr.TotalCount();
        System.Console.Out.WriteLine("Node " + node + " support is " + support);

        foreach (IList<string> key in pRules.Keys)
        {
            if (key[0].Equals(node))
            {
                // only do it if they match
                ClassicCounter<IList<string>> cntr2 = pRules[key];
                double support2 = cntr2.TotalCount();
                double kl = Counters.KlDivergence(cntr2, cntr);
                System.Console.Out.WriteLine("KL(" + key + "||" + node + ") = " + nf.Format(kl) + "\t" + "support(" + key + ") = " + support2);
                double score = kl * support2;
                answers.Add(new Pair<IList<string>, double>(key, score));
                allScores.SetCount(key, score);
            }
        }

        System.Console.Out.WriteLine("----");
        System.Console.Out.WriteLine("Sorted descending support * KL");
        // Sort by score, highest first, as the heading promises.
        answers.Sort((left, right) => right.Second().CompareTo(left.Second()));

        foreach (Pair<IList<string>, double> answer in answers)
        {
            IList<string> rule = answer.First();
            double psd = answer.Second();
            System.Console.Out.WriteLine(rule + ": " + nf.Format(psd));
            if (psd >= Cutoffs[0])
            {
                string nd = rule[0];
                string par = rule[1];
                for (int j = 0; j < Cutoffs.Length; j++)
                {
                    if (psd >= Cutoffs[j])
                    {
                        javaSB[j].Append("\"").Append(nd).Append("^");
                        javaSB[j].Append(par).Append("\", ");
                    }
                }
            }
        }
        System.Console.Out.WriteLine();
    }

    // TODO: an information-gain variant of the parent pass was sketched here but never finished.

    // do value of grandparent
    foreach (IList<string> node_1 in pRules.Keys)
    {
        ClassicCounter<IList<string>> cntr = pRules[node_1];
        double support = cntr.TotalCount();
        if (support < Suppcutoff)
        {
            continue;
        }
        System.Console.Out.WriteLine("Node " + node_1 + " support is " + support);

        List<Pair<IList<string>, double>> answers = new List<Pair<IList<string>, double>>();
        foreach (IList<string> key in gPRules.Keys)
        {
            if (key[0].Equals(node_1[0]) && key[1].Equals(node_1[1]))
            {
                // only do it if they match
                ClassicCounter<IList<string>> cntr2 = gPRules[key];
                double support2 = cntr2.TotalCount();
                double kl = Counters.KlDivergence(cntr2, cntr);
                System.Console.Out.WriteLine("KL(" + key + "||" + node_1 + ") = " + nf.Format(kl) + "\t" + "support(" + key + ") = " + support2);
                double score = kl * support2;
                answers.Add(new Pair<IList<string>, double>(key, score));
                allScores.SetCount(key, score);
            }
        }

        System.Console.Out.WriteLine("----");
        System.Console.Out.WriteLine("Sorted descending support * KL");
        answers.Sort((left, right) => right.Second().CompareTo(left.Second()));

        foreach (Pair<IList<string>, double> answer in answers)
        {
            IList<string> rule = answer.First();
            double psd = answer.Second();
            System.Console.Out.WriteLine(rule + ": " + nf.Format(psd));
            if (psd >= Cutoffs[0])
            {
                string nd = rule[0];
                string par = rule[1];
                string gpar = rule[2];
                for (int j = 0; j < Cutoffs.Length; j++)
                {
                    if (psd >= Cutoffs[j])
                    {
                        javaSB[j].Append("\"").Append(nd).Append("^");
                        javaSB[j].Append(par).Append("~");
                        javaSB[j].Append(gpar).Append("\", ");
                    }
                }
            }
        }
        System.Console.Out.WriteLine();
    }

    System.Console.Out.WriteLine();
    System.Console.Out.WriteLine("All scores:");
    IPriorityQueue<IList<string>> pq = Counters.ToPriorityQueue(allScores);
    while (!pq.IsEmpty())
    {
        IList<string> key = pq.GetFirst();
        double score = pq.GetPriority(key);
        pq.RemoveFirst();
        System.Console.Out.WriteLine(key + "\t" + score);
    }

    System.Console.Out.WriteLine(" // Automatically generated by ParentAnnotationStats -- preferably don't edit");
    for (int i_1 = 0; i_1 < Cutoffs.Length; i_1++)
    {
        StringBuilder generated = javaSB[i_1];
        int len = generated.Length;
        // Drop the trailing ", " left by the last entry. When no entry reached this cutoff
        // there is nothing to drop, and trimming anyway would cut into "new String[] {".
        if (len >= 2 && generated[len - 2] == ',' && generated[len - 1] == ' ')
        {
            generated.Length = len - 2;
        }
        generated.Append("};");
        System.Console.Out.WriteLine(generated);
    }

    System.Console.Out.Write(" public static HashSet splitters = new HashSet(Arrays.asList(");
    for (int i_2 = Cutoffs.Length; i_2 > 0; i_2--)
    {
        if (i_2 == 1)
        {
            System.Console.Out.Write("splitters1");
        }
        else
        {
            System.Console.Out.Write("selectiveSplit" + i_2 + " ? splitters" + i_2 + " : (");
        }
    }

    // Cutoffs.Length - 1 parentheses come from the conditionals, plus two for
    // "new HashSet(Arrays.asList(" -- hence Cutoffs.Length + 1 closers.
    for (int i_3 = Cutoffs.Length; i_3 >= 0; i_3--)
    {
        System.Console.Out.Write(")");
    }
    System.Console.Out.WriteLine(";");
}
/// <summary>
/// Prints, for every node label, the KL divergence and support of each left-sister ("=l=") and
/// right-sister ("=r=") annotated category, the categories sorted by support * KL, the overall
/// ranking, and finally generated Java source that assigns each category to the highest cutoff
/// in <c>Cutoffs</c> that its score reaches.
/// </summary>
public virtual void PrintStats()
{
    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(2);

    // System.out.println("Node rules");
    // System.out.println(nodeRules);
    // System.out.println("Parent rules");
    // System.out.println(pRules);
    // System.out.println("Grandparent rules");
    // System.out.println(gPRules);

    // Store java code for selSplit
    StringBuilder[] javaSB = new StringBuilder[Cutoffs.Length];
    for (int i = 0; i < Cutoffs.Length; i++)
    {
        javaSB[i] = new StringBuilder(" private static String[] sisterSplit" + (i + 1) + " = new String[] {");
    }

    // Orders (label, score) pairs by score, highest first. Both the printed ranking and the
    // cutoff bucketing further down depend on this order.
    System.Collections.IComparer byScoreDescending = System.Collections.Generic.Comparer<object>.Create(
        (x, y) => ((double)((Pair)y).Second()).CompareTo((double)((Pair)x).Second()));

    ArrayList topScores = new ArrayList();
    foreach (object o in nodeRules.Keys)
    {
        ArrayList answers = new ArrayList();
        string label = (string)o;
        ClassicCounter cntr = (ClassicCounter)nodeRules[label];
        double support = cntr.TotalCount();
        System.Console.Out.WriteLine("Node " + label + " support is " + support);

        foreach (object o4 in ((Hashtable)leftRules[label]).Keys)
        {
            string sis = (string)o4;
            ClassicCounter cntr2 = (ClassicCounter)((Hashtable)leftRules[label])[sis];
            double support2 = cntr2.TotalCount();
            // The score uses the full distribution. An alternative that held out the
            // test-context data was abandoned because it can lead to zero-probability data
            // points and hence infinite divergence.
            double kl = Counters.KlDivergence(cntr2, cntr);
            string annotatedLabel = label + "=l=" + sis;
            System.Console.Out.WriteLine("KL(" + annotatedLabel + "||" + label + ") = " + nf.Format(kl) + "\t" + "support(" + sis + ") = " + support2);
            answers.Add(new Pair(annotatedLabel, kl * support2));
            topScores.Add(new Pair(annotatedLabel, kl * support2));
        }

        foreach (object o3 in ((Hashtable)rightRules[label]).Keys)
        {
            string sis = (string)o3;
            ClassicCounter cntr2 = (ClassicCounter)((Hashtable)rightRules[label])[sis];
            double support2 = cntr2.TotalCount();
            double kl = Counters.KlDivergence(cntr2, cntr);
            string annotatedLabel = label + "=r=" + sis;
            System.Console.Out.WriteLine("KL(" + annotatedLabel + "||" + label + ") = " + nf.Format(kl) + "\t" + "support(" + sis + ") = " + support2);
            answers.Add(new Pair(annotatedLabel, kl * support2));
            topScores.Add(new Pair(annotatedLabel, kl * support2));
        }

        // upto
        System.Console.Out.WriteLine("----");
        System.Console.Out.WriteLine("Sorted descending support * KL");
        answers.Sort(byScoreDescending);
        foreach (object answer in answers)
        {
            Pair p = (Pair)answer;
            double psd = (double)p.Second();
            System.Console.Out.WriteLine(p.First() + ": " + nf.Format(psd));
        }
        System.Console.Out.WriteLine();
    }

    topScores.Sort(byScoreDescending);
    foreach (object topScore in topScores)
    {
        Pair p = (Pair)topScore;
        double psd = (double)p.Second();
        System.Console.Out.WriteLine(p.First() + ": " + nf.Format(psd));
    }
    System.Console.Out.WriteLine();

    System.Console.Out.WriteLine(" // Automatically generated by SisterAnnotationStats -- preferably don't edit");

    // Walk the ranking from the top. Each entry lands in the buffer of the highest cutoff it
    // still reaches, and the walk stops at the first entry below the lowest cutoff.
    int k = Cutoffs.Length - 1;
    foreach (object ranked in topScores)
    {
        Pair p = (Pair)ranked;
        double psd = (double)p.Second();
        while (k > 0 && psd < Cutoffs[k])
        {
            k--;
        }
        if (psd < Cutoffs[k])
        {
            break;
        }
        javaSB[k].Append("\"").Append(p.First());
        javaSB[k].Append("\",");
    }

    for (int i_1 = 0; i_1 < Cutoffs.Length; i_1++)
    {
        StringBuilder generated = javaSB[i_1];
        int len = generated.Length;
        // Entries end with a single ',' (no space). Trimming two characters would also strip
        // the closing quote of the last entry, and would cut into "new String[] {" when the
        // bucket is empty.
        if (len >= 1 && generated[len - 1] == ',')
        {
            generated.Length = len - 1;
        }
        generated.Append("};");
        System.Console.Out.WriteLine(generated);
    }

    System.Console.Out.Write(" public static String[] sisterSplit = ");
    for (int i_2 = Cutoffs.Length; i_2 > 0; i_2--)
    {
        if (i_2 == 1)
        {
            System.Console.Out.Write("sisterSplit1");
        }
        else
        {
            System.Console.Out.Write("selectiveSisterSplit" + i_2 + " ? sisterSplit" + i_2 + " : (");
        }
    }

    // Only the conditionals above open parentheses (one per cutoff beyond the first), so close
    // exactly that many; the prefix of this statement opens none.
    for (int i_3 = Cutoffs.Length - 1; i_3 > 0; i_3--)
    {
        System.Console.Out.Write(")");
    }
    System.Console.Out.WriteLine(";");
}
/// <summary>
/// Count some stats on what occurs in a file: the number of documents, tokens and entities,
/// plus a count for every distinct MISC entity string.
/// </summary>
/// <param name="args">The first element names the input handed to the reader.</param>
/// <exception cref="System.IO.IOException"/>
/// <exception cref="System.TypeLoadException"/>
public static void Main(string[] args)
{
    if (args.Length < 1)
    {
        System.Console.Error.WriteLine("Usage: expected one argument naming the file to read");
        return;
    }

    CoNLLDocumentReaderAndWriter rw = new CoNLLDocumentReaderAndWriter();
    rw.Init(new SeqClassifierFlags());

    int numDocs = 0;
    int numTokens = 0;
    int numEntities = 0;
    string lastAnsBase = string.Empty;
    ICounter<string> miscCounter = new ClassicCounter<string>();
    StringBuilder inProgressMisc = new StringBuilder();

    IEnumerator<IList<CoreLabel>> it = rw.GetIterator(IOUtils.ReaderFromString(args[0]));
    while (it.MoveNext())
    {
        IList<CoreLabel> doc = it.Current;
        numDocs++;
        foreach (CoreLabel fl in doc)
        {
            string word = fl.Word();
            // System.out.println("FL " + (++i) + " was " + fl);
            if (word.Equals(Boundary))
            {
                continue;
            }

            // Split the answer into an optional prefix and a base class, e.g. "B-MISC".
            string ans = fl.Get(typeof(CoreAnnotations.AnswerAnnotation));
            string[] bits = ans.Split('-');
            string ansBase;
            string ansPrefix;
            if (bits.Length == 1)
            {
                ansBase = bits[0];
                ansPrefix = string.Empty;
            }
            else
            {
                ansBase = bits[1];
                ansPrefix = bits[0];
            }

            numTokens++;
            if (ansBase.Equals("O"))
            {
                // Leaving any entity closes a MISC string that may be in progress.
                inProgressMisc = MaybeIncrementCounter(inProgressMisc, miscCounter);
            }
            else
            {
                // A new entity starts when the class changes, or on an explicit "B" prefix
                // within a run of the same class.
                bool startsEntity = !ansBase.Equals(lastAnsBase) || ansPrefix.Equals("B");
                if (startsEntity)
                {
                    numEntities++;
                    inProgressMisc = MaybeIncrementCounter(inProgressMisc, miscCounter);
                }
                if (ansBase.Equals("MISC"))
                {
                    if (inProgressMisc.Length > 0)
                    {
                        // already something there
                        inProgressMisc.Append(' ');
                    }
                    inProgressMisc.Append(word);
                }
            }
            lastAnsBase = ansBase;
        }
        // for tokens
    }
    // for documents

    // A MISC entity that runs up to the end of the input has no following token to close it,
    // so flush it here; otherwise it would be missing from the counts.
    inProgressMisc = MaybeIncrementCounter(inProgressMisc, miscCounter);

    System.Console.Out.WriteLine("File " + args[0] + " has " + numDocs + " documents, " + numTokens + " (non-blank line) tokens and " + numEntities + " entities.");
    System.Console.Out.Printf("Here are the %.0f MISC items with counts:%n", miscCounter.TotalCount());
    System.Console.Out.WriteLine(Counters.ToVerticalString(miscCounter, "%.0f\t%s"));
}