/// <summary>
/// Formats the F-measure of <paramref name="label"/> for display.
/// </summary>
/// <param name="numDigits">Maximum number of fraction digits in the formatted value.</param>
/// <param name="label">Label whose F-measure is looked up via <c>GetFMeasure</c>.</param>
/// <returns>The formatted F-measure.</returns>
public virtual string GetF1Description(int numDigits, L label)
{
    NumberFormat formatter = NumberFormat.GetNumberInstance();
    formatter.SetMaximumFractionDigits(numDigits);

    var fMeasure = GetFMeasure(label);
    return formatter.Format(fMeasure);
}
/// <summary>
/// Builds a multi-line, human-readable report of the accuracy statistics.
/// </summary>
/// <remarks>
/// When <c>saveFile</c> is non-null this method also has side effects: it
/// writes the accuracy/recall arrays to <c>&lt;saveFile&gt;-&lt;saveIndex&gt;.accuracy</c>
/// and <c>.optimal_accuracy</c> and then increments <c>saveIndex</c>.
/// </remarks>
/// <param name="numDigits">Maximum number of fraction digits for the formatted values.</param>
/// <returns>The report text, one statistic per line.</returns>
public virtual string GetDescription(int numDigits)
{
    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(numDigits);

    StringBuilder sb = new StringBuilder();
    sb.Append("--- Accuracy Stats ---").Append('\n');
    sb.Append("accuracy: ").Append(nf.Format(accuracy)).Append('\n');
    sb.Append("optimal fn accuracy: ").Append(nf.Format(optAccuracy)).Append('\n');
    // The label used to read "confidence weighted accuracy :" (colon detached,
    // no space before the value); it now follows the "name: value" layout of
    // every other line in the report.
    sb.Append("confidence weighted accuracy: ").Append(nf.Format(confWeightedAccuracy)).Append('\n');
    sb.Append("optimal confidence weighted accuracy: ").Append(nf.Format(optConfWeightedAccuracy)).Append('\n');
    // The log-likelihood is appended unformatted (numDigits is not applied to it).
    sb.Append("log-likelihood: ").Append(logLikelihood).Append('\n');

    if (saveFile != null)
    {
        string f = saveFile + '-' + saveIndex;
        sb.Append("saving accuracy info to ").Append(f).Append(".accuracy\n");
        StringUtils.PrintToFile(f + ".accuracy", ToStringArr(accrecall));
        sb.Append("saving optimal accuracy info to ").Append(f).Append(".optimal_accuracy\n");
        StringUtils.PrintToFile(f + ".optimal_accuracy", ToStringArr(optaccrecall));
        // Advance the index so the next call writes to a fresh pair of files.
        saveIndex++;
    }

    // The accuracy/coverage arrays are only written to files above; they are
    // not inlined into the returned text.
    return sb.ToString();
}
/// <summary>
/// Produces a printable recall summary of the form <c>value (tp/(tp+fn))</c>.
/// </summary>
/// <param name="numDigits">Maximum number of fraction digits for the recall value.</param>
/// <returns>The formatted recall followed by the counts it is based on.</returns>
public virtual string GetRecallDescription(int numDigits)
{
    NumberFormat formatter = NumberFormat.GetNumberInstance();
    formatter.SetMaximumFractionDigits(numDigits);

    string recallText = formatter.Format(GetRecall());
    var denominator = tpCount + fnCount;
    return recallText + " (" + tpCount + "/" + denominator + ")";
}
/// <summary>
/// Produces a printable recall summary for a single label, of the form
/// <c>value (second/(second+third))</c> using the triple from <c>GetRecallInfo</c>.
/// </summary>
/// <param name="numDigits">Maximum number of fraction digits for the recall value.</param>
/// <param name="label">Label whose recall information is requested.</param>
/// <returns>The formatted recall followed by the counts it is based on.</returns>
public virtual string GetRecallDescription(int numDigits, L label)
{
    NumberFormat formatter = NumberFormat.GetNumberInstance();
    formatter.SetMaximumFractionDigits(numDigits);

    Triple<double, int, int> info = GetRecallInfo(label);
    string valueText = formatter.Format(info.First());
    int numerator = info.Second();
    int denominator = info.Second() + info.Third();
    return valueText + " (" + numerator + "/" + denominator + ")";
}
/// <summary>
/// Produces a printable precision summary for a single label, of the form
/// <c>value (second/(second+third))</c> using the triple from <c>GetPrecisionInfo</c>.
/// </summary>
/// <param name="numDigits">Maximum number of fraction digits for the precision value.</param>
/// <param name="label">Label whose precision information is requested.</param>
/// <returns>The formatted precision followed by the counts it is based on.</returns>
public virtual string GetPrecisionDescription(int numDigits, L label)
{
    NumberFormat formatter = NumberFormat.GetNumberInstance();
    formatter.SetMaximumFractionDigits(numDigits);

    Triple<double, int, int> info = GetPrecisionInfo(label);
    string valueText = formatter.Format(info.First());
    int numerator = info.Second();
    int denominator = info.Second() + info.Third();
    return valueText + " (" + numerator + "/" + denominator + ")";
}
/// <summary>
/// Produces a printable summary of overall accuracy, of the form
/// <c>value (second/(second+third))</c> using the triple from <c>GetAccuracyInfo</c>.
/// </summary>
/// <param name="numDigits">Maximum number of fraction digits for the accuracy value.</param>
/// <returns>The formatted accuracy followed by the counts it is based on.</returns>
public virtual string GetAccuracyDescription(int numDigits)
{
    NumberFormat formatter = NumberFormat.GetNumberInstance();
    formatter.SetMaximumFractionDigits(numDigits);

    Triple<double, int, int> info = GetAccuracyInfo();
    string valueText = formatter.Format(info.First());
    int numerator = info.Second();
    int denominator = info.Second() + info.Third();
    return valueText + " (" + numerator + "/" + denominator + ")";
}
/// <summary>
/// Creates the <c>NumberFormat</c> that corresponds to a format entry.
/// </summary>
/// <remarks>
/// Hex, scientific, decimal and currency entries each get a dedicated
/// formatter; any other format type falls back to the locale's number
/// instance, to which the entry's pattern (if any) is applied when that
/// instance is a <c>DecimalFormat</c>.
/// </remarks>
/// <param name="type">Entry describing format type, digits, casing, pattern and provider.</param>
/// <returns>A formatter configured according to <paramref name="type"/>.</returns>
private static NumberFormat CreateFormat(Entry type)
{
    // Resolved up front, exactly once, even for branches that do not use it.
    var locale = type.FormatProvider.ToLocale();

    if (type.FormatType == FormatHex)
    {
        return new HexFormat(type.Digits, type.IsUpperCase);
    }

    if (type.FormatType == FormatScientific)
    {
        // Without an explicit pattern, derive one from the digit count and
        // pick the exponent marker's case from the entry.
        var scientificPattern = type.Pattern
            ?? (GetPatternByDigits(type.Digits) + (type.IsUpperCase ? "E0" : "e0"));
        return new ScientificFormat(scientificPattern, locale);
    }

    if (type.FormatType == FormatDecimal)
    {
        DecimalFormatSymbols decimalSymbols = new DecimalFormatSymbols(locale);
        decimalSymbols.Infinity = "Infinity";
        string decimalPattern = type.Pattern ?? GetPatternByDigits(type.Digits);
        return new DecimalFormat(decimalPattern, decimalSymbols);
    }

    if (type.FormatType == FormatCurrency)
    {
        return NumberFormat.GetCurrencyInstance(locale);
    }

    // Default: the locale's generic number format, optionally customised.
    var fallback = NumberFormat.GetNumberInstance(locale);
    if (type.Pattern == null)
    {
        return fallback;
    }

    var patterned = fallback as DecimalFormat;
    if (patterned != null)
    {
        patterned.ApplyLocalizedPattern(type.Pattern);
    }
    return fallback;
}
/// <summary>
/// Initialises the number, decimal, currency and percent formatters for a locale.
/// </summary>
/// <remarks>
/// The decimal formatter is the locale's number instance when that is a
/// <c>DecimalFormat</c>, otherwise a default-constructed one; it doubles as
/// the fallback for the currency and percent formatters when the locale's
/// instances are not <c>DecimalFormat</c>s.
/// </remarks>
/// <param name="locale">Locale whose formatting conventions are used.</param>
internal NumberFormatInfo(Locale locale)
{
    _locale = locale;
    _numbers = NumberFormat.GetNumberInstance(locale);

    _symbols = new DecimalFormatSymbols(locale);
    _symbols.Infinity = "Infinity";

    _decimals = _numbers as DecimalFormat;
    if (_decimals == null)
    {
        _decimals = new DecimalFormat();
    }
    _decimals.DecimalFormatSymbols = _symbols;

    var localeCurrency = NumberFormat.GetCurrencyInstance(locale) as DecimalFormat;
    _currency = localeCurrency != null ? localeCurrency : _decimals;

    var localePercent = NumberFormat.GetPercentInstance(locale) as DecimalFormat;
    _percent = localePercent != null ? localePercent : _decimals;
}
/// <summary>
/// Checks whether the model's expected feature values match the empirical
/// ones, logging each discrepancy it finds.
/// </summary>
/// <remarks>
/// Three passes are made: oversized weights (absolute value above 100) are
/// logged; constraints whose expected value differs from the empirical value
/// by more than 0.001 are logged and make the result <c>false</c>; rows of
/// conditional probabilities that do not sum to 1 (tolerance 0.0001) are
/// logged. Only the second pass influences the return value.
/// </remarks>
/// <returns><c>true</c> when every constraint is satisfied; otherwise <c>false</c>.</returns>
public override bool CheckCorrectness()
{
    log.Info("Checking model correctness; x size " + p.data.xSize + ' ' + ", ysize " + p.data.ySize);

    NumberFormat fmt = NumberFormat.GetNumberInstance();
    fmt.SetMaximumFractionDigits(4);

    // Pass 1: flag suspiciously large weights (informational only).
    for (int idx = 0; idx < lambda.Length; idx++)
    {
        if (Math.Abs(lambda[idx]) > 100)
        {
            log.Info(" Lambda too big " + lambda[idx]);
            log.Info(" empirical " + ftildeArr[idx] + " expected " + FExpected(p.functions.Get(idx)));
        }
    }

    // Pass 2: compare empirical and expected values of every constraint.
    bool allSatisfied = true;
    for (int c = 0; c < ftildeArr.Length; c++)
    {
        double gap = Math.Abs(ftildeArr[c] - FExpected(p.functions.Get(c)));
        if (gap > 0.001)
        {
            allSatisfied = false;
            string message = "Constraint " + c + " not satisfied emp " + fmt.Format(ftildeArr[c])
                + " exp " + fmt.Format(FExpected(p.functions.Get(c)))
                + " diff " + fmt.Format(gap)
                + " lambda " + fmt.Format(lambda[c]);
            log.Info(message);
        }
    }

    // Pass 3: each row of conditional probabilities should sum to one.
    for (int row = 0; row < p.data.xSize; row++)
    {
        double total = 0.0;
        for (int col = 0; col < p.data.ySize; col++)
        {
            total += probConds[row][col];
        }

        if (Math.Abs(total - 1) > 0.0001)
        {
            for (int col = 0; col < p.data.ySize; col++)
            {
                log.Info(col + " : " + probConds[row][col]);
            }
            log.Info("probabilities do not sum to one " + row + ' ' + (float)total);
        }
    }

    return allSatisfied;
}
/// <summary>
/// Returns a short description consisting of the unqualified runtime type
/// name followed by the number of tag bins and word tokens.
/// </summary>
/// <returns>
/// Text of the form <c>TypeName[tagbins=N,wordTokens=M; head -&gt; arg\n]</c>.
/// </returns>
public override string ToString()
{
    // A NumberFormat local used to be created here but was only referenced by
    // a commented-out per-dependency dump, so it has been removed.
    StringBuilder sb = new StringBuilder(2000);

    // Keep only the part of the full type name after the last '.'.
    string cl = GetType().FullName;
    sb.Append(Sharpen.Runtime.Substring(cl, cl.LastIndexOf('.') + 1)).Append("[tagbins=");
    sb.Append(numTagBins).Append(",wordTokens=").Append(numWordTokens).Append("; head -> arg\n");
    sb.Append("]");
    return sb.ToString();
}
/// <summary>
/// Prints parent- and grandparent-annotation statistics to standard output,
/// followed by generated Java source declaring the selected splitters.
/// </summary>
/// <remarks>
/// For every node, each matching parent-annotated rule set is scored as
/// KL divergence times support; the same is done for grandparent-annotated
/// rule sets whose parent rule has at least <c>Suppcutoff</c> support.
/// Candidates are listed in descending score order, and every candidate
/// whose score reaches <c>Cutoffs[j]</c> is added to the j-th generated
/// <c>splittersN</c> array.
/// </remarks>
public virtual void PrintStats()
{
    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(2);

    // One generated Java array declaration per cutoff level.
    StringBuilder[] javaSB = new StringBuilder[Cutoffs.Length];
    for (int i = 0; i < Cutoffs.Length; i++)
    {
        javaSB[i] = new StringBuilder(" private static String[] splitters" + (i + 1) + " = new String[] {");
    }

    ClassicCounter<IList<string>> allScores = new ClassicCounter<IList<string>>();

    // do value of parent
    foreach (string node in nodeRules.Keys)
    {
        List<Pair<IList<string>, double>> answers = Generics.NewArrayList();
        ClassicCounter<IList<string>> cntr = nodeRules[node];
        double support = cntr.TotalCount();
        System.Console.Out.WriteLine("Node " + node + " support is " + support);
        foreach (IList<string> key in pRules.Keys)
        {
            if (key[0].Equals(node))
            {
                // only do it if they match
                ClassicCounter<IList<string>> cntr2 = pRules[key];
                double support2 = cntr2.TotalCount();
                double kl = Counters.KlDivergence(cntr2, cntr);
                System.Console.Out.WriteLine("KL(" + key + "||" + node + ") = " + nf.Format(kl) + "\t" + "support(" + key + ") = " + support2);
                double score = kl * support2;
                answers.Add(new Pair<IList<string>, double>(key, score));
                allScores.SetCount(key, score);
            }
        }
        System.Console.Out.WriteLine("----");
        System.Console.Out.WriteLine("Sorted descending support * KL");
        // Sort by score, highest first, as the heading promises. The previous
        // Sort(null) had lost the descending comparator.
        answers.Sort((a, b) => b.Second().CompareTo(a.Second()));
        foreach (Pair<IList<string>, double> answer in answers)
        {
            double psd = answer.Second();
            System.Console.Out.WriteLine(answer.First() + ": " + nf.Format(psd));
            if (psd >= Cutoffs[0])
            {
                IList<string> lst = answer.First();
                string nd = lst[0];
                string par = lst[1];
                for (int j = 0; j < Cutoffs.Length; j++)
                {
                    if (psd >= Cutoffs[j])
                    {
                        javaSB[j].Append("\"").Append(nd).Append("^");
                        javaSB[j].Append(par).Append("\", ");
                    }
                }
            }
        }
        System.Console.Out.WriteLine();
    }

    // (An unfinished information-gain variant of the parent scoring, which
    // existed only as commented-out code, has been dropped from this spot.)

    // do value of grandparent
    foreach (IList<string> node_1 in pRules.Keys)
    {
        List<Pair<IList<string>, double>> answers = Generics.NewArrayList();
        ClassicCounter<IList<string>> cntr = pRules[node_1];
        double support = cntr.TotalCount();
        if (support < Suppcutoff)
        {
            continue;
        }
        System.Console.Out.WriteLine("Node " + node_1 + " support is " + support);
        foreach (IList<string> key in gPRules.Keys)
        {
            if (key[0].Equals(node_1[0]) && key[1].Equals(node_1[1]))
            {
                // only do it if they match
                ClassicCounter<IList<string>> cntr2 = gPRules[key];
                double support2 = cntr2.TotalCount();
                double kl = Counters.KlDivergence(cntr2, cntr);
                System.Console.Out.WriteLine("KL(" + key + "||" + node_1 + ") = " + nf.Format(kl) + "\t" + "support(" + key + ") = " + support2);
                double score = kl * support2;
                answers.Add(Pair.MakePair(key, score));
                allScores.SetCount(key, score);
            }
        }
        System.Console.Out.WriteLine("----");
        System.Console.Out.WriteLine("Sorted descending support * KL");
        // Highest score first (see the note on the parent loop above).
        answers.Sort((a, b) => b.Second().CompareTo(a.Second()));
        foreach (Pair<IList<string>, double> answer in answers)
        {
            double psd = answer.Second();
            System.Console.Out.WriteLine(answer.First() + ": " + nf.Format(psd));
            if (psd >= Cutoffs[0])
            {
                IList<string> lst = answer.First();
                string nd = lst[0];
                string par = lst[1];
                string gpar = lst[2];
                for (int j = 0; j < Cutoffs.Length; j++)
                {
                    if (psd >= Cutoffs[j])
                    {
                        javaSB[j].Append("\"").Append(nd).Append("^");
                        javaSB[j].Append(par).Append("~");
                        javaSB[j].Append(gpar).Append("\", ");
                    }
                }
            }
        }
        System.Console.Out.WriteLine();
    }

    System.Console.Out.WriteLine();
    System.Console.Out.WriteLine("All scores:");
    IPriorityQueue<IList<string>> pq = Counters.ToPriorityQueue(allScores);
    while (!pq.IsEmpty())
    {
        IList<string> key = pq.GetFirst();
        double score = pq.GetPriority(key);
        pq.RemoveFirst();
        System.Console.Out.WriteLine(key + "\t" + score);
    }

    System.Console.Out.WriteLine(" // Automatically generated by ParentAnnotationStats -- preferably don't edit");
    for (int i = 0; i < Cutoffs.Length; i++)
    {
        StringBuilder declaration = javaSB[i];
        int len = declaration.Length;
        // Swap the trailing ", " separator for the closing "};". .NET's
        // StringBuilder has no Replace(start, end, text) overload, so the
        // separator is removed explicitly. When no entry was added, nothing is
        // stripped, so the result is a well-formed empty array "{};".
        if (len >= 2 && declaration[len - 2] == ',' && declaration[len - 1] == ' ')
        {
            declaration.Remove(len - 2, 2);
        }
        declaration.Append("};");
        System.Console.Out.WriteLine(declaration);
    }

    System.Console.Out.Write(" public static HashSet splitters = new HashSet(Arrays.asList(");
    for (int i = Cutoffs.Length; i > 0; i--)
    {
        if (i == 1)
        {
            System.Console.Out.Write("splitters1");
        }
        else
        {
            System.Console.Out.Write("selectiveSplit" + i + " ? splitters" + i + " : (");
        }
    }
    // Cutoffs.Length - 1 conditional parentheses plus "HashSet(" and
    // "asList(" are open, hence Cutoffs.Length + 1 closers.
    for (int i = Cutoffs.Length; i >= 0; i--)
    {
        System.Console.Out.Write(")");
    }
    System.Console.Out.WriteLine(";");
}
/// <summary>Print some statistics about this lexicon to standard output.</summary>
/// <remarks>
/// Reports basic sizes, a histogram of how many taggings each word has
/// (capped at <c>StatsBins - 1</c>), the unseen-word counter, and - for very
/// small lexicons (fewer than 50 words and fewer than 10 tags) - the full
/// table of tagging scores.
/// </remarks>
public virtual void PrintLexStats()
{
    System.Console.Out.WriteLine("BaseLexicon statistics");
    System.Console.Out.WriteLine("unknownLevel is " + GetUnknownWordModel().GetUnknownLevel());
    System.Console.Out.WriteLine("Sum of rulesWithWord: " + NumRules());
    System.Console.Out.WriteLine("Tags size: " + tags.Count);
    int wsize = words.Count;
    System.Console.Out.WriteLine("Words size: " + wsize);
    System.Console.Out.WriteLine("rulesWithWord length: " + rulesWithWord.Length + " [should be sum of words + unknown sigs]");

    // lengths[k] counts the words having k taggings; wArr[k] remembers which
    // words those are. The array used to be created as "new ArrayList[...]",
    // which does not match the declared element type.
    int[] lengths = new int[StatsBins];
    List<string>[] wArr = new List<string>[StatsBins];
    for (int j = 0; j < StatsBins; j++)
    {
        wArr[j] = new List<string>();
    }

    for (int i = 0; i < rulesWithWord.Length; i++)
    {
        int num = rulesWithWord[i].Count;
        if (num > StatsBins - 1)
        {
            // Everything beyond the last bin is folded into it.
            num = StatsBins - 1;
        }
        lengths[num]++;
        // Only remember the words themselves for tiny lexicons or for the
        // upper half of the bins.
        if (wsize <= 20 || num >= StatsBins / 2)
        {
            wArr[num].Add(wordIndex.Get(i));
        }
    }

    System.Console.Out.WriteLine("Stats on how many taggings for how many words");
    for (int j = 0; j < StatsBins; j++)
    {
        System.Console.Out.Write(j + " taggings: " + lengths[j] + " words ");
        if (wsize <= 20 || j >= StatsBins / 2)
        {
            // Writing the List<string> directly would print its type name,
            // so the elements are joined explicitly.
            System.Console.Out.Write("[" + string.Join(", ", wArr[j]) + "]");
        }
        System.Console.Out.WriteLine();
    }

    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(0);
    System.Console.Out.WriteLine("Unseen counter: " + Counters.ToString(uwModel.UnSeenCounter(), nf));

    if (wsize < 50 && tags.Count < 10)
    {
        nf.SetMaximumFractionDigits(3);
        StringWriter sw = new StringWriter();
        PrintWriter pw = new PrintWriter(sw);
        pw.Println("Tagging probabilities log P(word|tag)");

        // Header row: one column per tag.
        for (int t = 0; t < tags.Count; t++)
        {
            pw.Print('\t');
            pw.Print(tagIndex.Get(t));
        }
        pw.Println();

        // One row per word, one tab-separated score per tag.
        for (int w = 0; w < wsize; w++)
        {
            pw.Print(wordIndex.Get(w));
            pw.Print('\t');
            for (int t = 0; t < tags.Count; t++)
            {
                IntTaggedWord iTW = new IntTaggedWord(w, t);
                pw.Print(nf.Format(Score(iTW, 1, wordIndex.Get(w), null)));
                if (t == tags.Count - 1)
                {
                    pw.Println();
                }
                else
                {
                    pw.Print('\t');
                }
            }
        }
        pw.Close();
        System.Console.Out.WriteLine(sw.ToString());
    }
}
/// <summary>
/// Provides some testing and opportunities for exploration of the
/// probabilities of a BaseLexicon.
/// </summary>
/// <remarks>
/// What's here currently probably only works for the English Penn Treebank,
/// as it uses default constructors. Of the words given to test on, the first
/// is treated as sentence initial, and the rest as not sentence initial.
/// </remarks>
/// <param name="args">
/// The command line arguments:
/// java BaseLexicon treebankPath fileRange unknownWordModel words
/// </param>
public static void Main(string[] args)
{
    if (args.Length < 3)
    {
        log.Info("java BaseLexicon treebankPath fileRange unknownWordModel words*");
        return;
    }

    System.Console.Out.Write("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
    Treebank tb = new DiskTreebank();
    tb.LoadPath(args[0], new NumberRangesFileFilter(args[1], true));
    // TODO: change this interface so the lexicon creates its own indices?
    IIndex<string> wordIndex = new HashIndex<string>();
    IIndex<string> tagIndex = new HashIndex<string>();
    Options op = new Options();
    op.lexOptions.useUnknownWordSignatures = System.Convert.ToInt32(args[2]);
    Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon lex = new Edu.Stanford.Nlp.Parser.Lexparser.BaseLexicon(op, wordIndex, tagIndex);
    lex.InitializeTraining(tb.Count);
    lex.Train(tb);
    lex.FinishTraining();
    System.Console.Out.WriteLine("done.");
    System.Console.Out.WriteLine();

    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(4);
    List<string> impos = new List<string>();

    // Words to test start at args[3]; (i - 3) is the word's position, so the
    // first test word is the sentence-initial one.
    for (int i = 3; i < args.Length; i++)
    {
        if (lex.IsKnown(args[i]))
        {
            System.Console.Out.WriteLine(args[i] + " is a known word. Log probabilities [log P(w|t)] for its taggings are:");
            // The enumerator is disposable, so it is released deterministically.
            using (IEnumerator<IntTaggedWord> it = lex.RuleIteratorByWord(wordIndex.AddToIndex(args[i]), i - 3, null))
            {
                while (it.MoveNext())
                {
                    IntTaggedWord iTW = it.Current;
                    System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(lex.Score(iTW, i - 3, wordIndex.Get(iTW.word), null)));
                }
            }
        }
        else
        {
            string sig = lex.GetUnknownWordModel().GetSignature(args[i], i - 3);
            // " non-init" used to lack its leading space, gluing it onto the
            // unknown-word level in the output.
            System.Console.Out.WriteLine(args[i] + " is an unknown word. Signature with uwm " + lex.GetUnknownWordModel().GetUnknownLevel() + ((i == 3) ? " init" : " non-init") + " is: " + sig);
            impos.Clear();

            // A concrete List<string> is needed for Sort (IList<string> has
            // none); ordinal comparison keeps the order culture-independent.
            List<string> lis = new List<string>(tagIndex.ObjectsList());
            lis.Sort(string.CompareOrdinal);
            foreach (string tStr in lis)
            {
                IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
                double score = lex.Score(iTW, 1, args[i], null);
                if (double.IsNegativeInfinity(score))
                {
                    // A score of negative infinity marks the tag as impossible.
                    impos.Add(tStr);
                }
                else
                {
                    System.Console.Out.WriteLine(StringUtils.Pad(iTW, 24) + nf.Format(score));
                }
            }
            if (impos.Count > 0)
            {
                // Join explicitly: concatenating the list would print its type name.
                System.Console.Out.WriteLine(args[i] + " impossible tags: " + "[" + string.Join(", ", impos) + "]");
            }
        }
        System.Console.Out.WriteLine();
    }
}
/// <summary>
/// Renders this graph as Graphviz DOT source.
/// </summary>
/// <remarks>
/// The page size is chosen heuristically from the number of arcs. Every node
/// is emitted with a label followed by its outgoing arcs; an arc is labelled
/// "input : output", gets a weight and optional bold style when its output is
/// numeric, and is drawn dashed when its input prints as "EPSILON" and solid
/// otherwise.
/// </remarks>
/// <returns>The DOT text for the graph.</returns>
public virtual string AsDOTString()
{
    NumberFormat weightFormat = NumberFormat.GetNumberInstance();
    weightFormat.SetMaximumFractionDigits(3);
    weightFormat.SetMinimumFractionDigits(1);

    StringBuilder dot = new StringBuilder();
    ISet allNodes = GetNodes();
    dot.Append("digraph G {\n");

    // Heuristic number of pages: grow the height each time the arc count
    // doubles past 250, and the width each time it quadruples past 500.
    int arcCount = arcs.Count;
    int heightTenths = 105;
    for (int limit = 250; arcCount > limit; limit *= 2)
    {
        heightTenths += 105;
    }
    int width = 8;
    for (int limit = 500; arcCount > limit; limit *= 4)
    {
        width += 8;
    }
    double height = heightTenths / 10.0;

    dot.Append("size = \"").Append(width).Append(",").Append(height).Append("\";\n");
    dot.Append("graph [rankdir = \"LR\"];\n");
    dot.Append("graph [ranksep = \"0.2\"];\n");

    foreach (object node in allNodes)
    {
        // Node declaration.
        dot.Append(StringUtils.FileNameClean(node.ToString()));
        dot.Append(" [ ");
        dot.Append("label=\"").Append(node.ToString()).Append("\"");
        dot.Append("height=\"0.3\", width=\"0.3\"");
        dot.Append(" ];\n");

        // Outgoing arcs of this node.
        foreach (TransducerGraph.Arc arc in GetArcsBySource(node))
        {
            dot.Append(StringUtils.FileNameClean(arc.GetSourceNode().ToString()));
            dot.Append(" -> ");
            dot.Append(StringUtils.FileNameClean(arc.GetTargetNode().ToString()));
            dot.Append(" [ ");
            dot.Append("label=\"");
            dot.Append(arc.GetInput());
            dot.Append(" : ");

            object output = arc.GetOutput();
            string extraAttributes = string.Empty;
            if (output is Number)
            {
                double numeric = ((Number)output);
                // Zero is formatted from a literal 0.0 rather than from the output object.
                if (numeric == -0.0d)
                {
                    dot.Append(weightFormat.Format(0.0d));
                }
                else
                {
                    dot.Append(weightFormat.Format(output));
                }

                int weight = dotWeightInverted ? (int)(20.0 - numeric) : (int)numeric;
                if (weight > 0)
                {
                    extraAttributes = ", weight = \"" + weight + "\"";
                }

                bool emphasise = dotWeightInverted ? numeric <= 2.0 : numeric >= 20.0;
                if (emphasise)
                {
                    extraAttributes += ", style=bold";
                }
            }
            else
            {
                dot.Append(output);
            }

            dot.Append("\"");
            dot.Append(extraAttributes);

            bool isEpsilon = arc.GetInput().ToString().Equals("EPSILON");
            dot.Append(isEpsilon ? ", style = \"dashed\" " : ", style = \"solid\" ");
            dot.Append("];\n");
        }
    }

    dot.Append("}\n");
    return dot.ToString();
}
/// <summary>
/// Trains a classifier by writing the dataset to a temporary file, running
/// the external SVM learner on it and reading back the resulting model.
/// </summary>
/// <remarks>
/// The learner executable is chosen from the struct, perf or light variant
/// depending on the number of classes and <c>useSVMPerf</c>. When
/// <c>doEval</c> is set, the external classifier is also run on the training
/// data and the trained classifier's scores are written to a second
/// temporary file. When <c>useSigmoid</c> is set, a sigmoid is fitted and
/// attached via <c>SetPlatt</c>. If <c>useAlphaFile</c> is set, the
/// <c>alphaFile</c> field is replaced by a newly created temporary file.
/// </remarks>
/// <param name="dataset">Training data.</param>
/// <returns>The trained classifier.</returns>
/// <exception cref="Exception">
/// Wraps any failure during training; the original error is available as
/// <c>InnerException</c>.
/// </exception>
public virtual SVMLightClassifier<L, F> TrainClassifierBasic(GeneralDataset<L, F> dataset)
{
    IIndex<L> labelIndex = dataset.LabelIndex();
    IIndex<F> featureIndex = dataset.featureIndex;
    bool multiclass = (dataset.NumClasses() > 2);
    try
    {
        // this is the file that the model will be saved to
        File modelFile = File.CreateTempFile("svm-", ".model");
        if (deleteTempFilesOnExit)
        {
            modelFile.DeleteOnExit();
        }

        // this is the file that the svm light formatted dataset will be printed to
        File dataFile = File.CreateTempFile("svm-", ".data");
        if (deleteTempFilesOnExit)
        {
            dataFile.DeleteOnExit();
        }

        // print the dataset; the writer is closed even if printing fails
        PrintWriter pw = new PrintWriter(new FileWriter(dataFile));
        try
        {
            dataset.PrintSVMLightFormat(pw);
        }
        finally
        {
            pw.Close();
        }

        // -v 0 makes it not verbose
        // -m 400 gives it a larger cache, for faster training
        string cmd = (multiclass ? svmStructLearn : (useSVMPerf ? svmPerfLearn : svmLightLearn)) + " -v " + svmLightVerbosity + " -m 400 ";

        // set the value of C if we have one specified
        if (C > 0.0)
        {
            // Format with the invariant culture so the command line always
            // uses '.' as the decimal separator.
            cmd = cmd + " -c " + C.ToString(System.Globalization.CultureInfo.InvariantCulture) + " ";
        }
        else if (useSVMPerf)
        {
            // It's required to specify this parameter for SVM perf
            cmd = cmd + " -c 0.01 ";
        }

        // Alpha File
        if (useAlphaFile)
        {
            File newAlphaFile = File.CreateTempFile("svm-", ".alphas");
            if (deleteTempFilesOnExit)
            {
                newAlphaFile.DeleteOnExit();
            }
            cmd = cmd + " -a " + newAlphaFile.GetAbsolutePath();
            if (alphaFile != null)
            {
                cmd = cmd + " -y " + alphaFile.GetAbsolutePath();
            }
            alphaFile = newAlphaFile;
        }

        // File and Model Data
        cmd = cmd + " " + dataFile.GetAbsolutePath() + " " + modelFile.GetAbsolutePath();
        if (verbose)
        {
            logger.Info("<< " + cmd + " >>");
        }

        // Run the learner; both of its output streams are forwarded to stderr.
        SystemUtils.Run(new ProcessBuilder(whitespacePattern.Split(cmd)), new PrintWriter(System.Console.Error), new PrintWriter(System.Console.Error));

        if (doEval)
        {
            File predictFile = File.CreateTempFile("svm-", ".pred");
            if (deleteTempFilesOnExit)
            {
                predictFile.DeleteOnExit();
            }
            string evalCmd = (multiclass ? svmStructClassify : (useSVMPerf ? svmPerfClassify : svmLightClassify)) + " " + dataFile.GetAbsolutePath() + " " + modelFile.GetAbsolutePath() + " " + predictFile.GetAbsolutePath();
            if (verbose)
            {
                logger.Info("<< " + evalCmd + " >>");
            }
            SystemUtils.Run(new ProcessBuilder(whitespacePattern.Split(evalCmd)), new PrintWriter(System.Console.Error), new PrintWriter(System.Console.Error));
        }

        // read in the model file
        Pair<double, ClassicCounter<int>> weightsAndThresh = ReadModel(modelFile, multiclass);
        double threshold = weightsAndThresh.First();
        ClassicCounter<Pair<F, L>> weights = ConvertWeights(weightsAndThresh.Second(), featureIndex, labelIndex, multiclass);
        ClassicCounter<L> thresholds = new ClassicCounter<L>();
        if (!multiclass)
        {
            // Binary case: the two labels get the threshold with opposite signs.
            thresholds.SetCount(labelIndex.Get(0), -threshold);
            thresholds.SetCount(labelIndex.Get(1), threshold);
        }
        SVMLightClassifier<L, F> classifier = new SVMLightClassifier<L, F>(weights, thresholds);

        if (doEval)
        {
            File predictFile = File.CreateTempFile("svm-", ".pred2");
            if (deleteTempFilesOnExit)
            {
                predictFile.DeleteOnExit();
            }
            PrintWriter pw2 = new PrintWriter(predictFile);
            try
            {
                NumberFormat nf = NumberFormat.GetNumberInstance();
                nf.SetMaximumFractionDigits(5);
                foreach (IDatum<L, F> datum in dataset)
                {
                    ICounter<L> scores = classifier.ScoresOf(datum);
                    pw2.Println(Counters.ToString(scores, nf));
                }
            }
            finally
            {
                pw2.Close();
            }
        }

        if (useSigmoid)
        {
            if (verbose)
            {
                System.Console.Out.Write("fitting sigmoid...");
            }
            classifier.SetPlatt(FitSigmoid(classifier, dataset));
            if (verbose)
            {
                System.Console.Out.WriteLine("done");
            }
        }
        return classifier;
    }
    catch (Exception e)
    {
        // Exception has no constructor taking only an inner exception; pass
        // the message along and keep the original as InnerException so its
        // stack trace is not lost.
        throw new Exception(e.Message, e);
    }
}
/// <summary>
/// Prints sister-annotation statistics to standard output, followed by
/// generated Java source declaring the selected sister splits.
/// </summary>
/// <remarks>
/// For every node label, each left- and right-sister annotated rule set is
/// scored as KL divergence (against the unannotated node) times support.
/// Candidates are listed in descending score order per node and overall, and
/// each candidate is then written to the generated <c>sisterSplitN</c> array
/// of the highest cutoff it reaches.
/// </remarks>
public virtual void PrintStats()
{
    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(2);

    // Orders Pair entries by their (boxed double) score, highest first. Both
    // the "Sorted descending" output and the bucket assignment below rely on
    // this order; the previous Sort(null) used the default ordering instead.
    System.Collections.IComparer byScoreDescending = System.Collections.Generic.Comparer<object>.Create(
        (x, y) => ((double)((Pair)y).Second()).CompareTo((double)((Pair)x).Second()));

    // One generated Java array declaration per cutoff level.
    StringBuilder[] javaSB = new StringBuilder[Cutoffs.Length];
    for (int i = 0; i < Cutoffs.Length; i++)
    {
        javaSB[i] = new StringBuilder(" private static String[] sisterSplit" + (i + 1) + " = new String[] {");
    }

    ArrayList topScores = new ArrayList();
    foreach (object o in nodeRules.Keys)
    {
        ArrayList answers = new ArrayList();
        string label = (string)o;
        ClassicCounter cntr = (ClassicCounter)nodeRules[label];
        double support = cntr.TotalCount();
        System.Console.Out.WriteLine("Node " + label + " support is " + support);

        // Left sisters. The full distribution is used to calculate the score.
        foreach (object o4 in ((Hashtable)leftRules[label]).Keys)
        {
            string sis = (string)o4;
            ClassicCounter cntr2 = (ClassicCounter)((Hashtable)leftRules[label])[sis];
            double support2 = cntr2.TotalCount();
            double kl = Counters.KlDivergence(cntr2, cntr);
            string annotatedLabel = label + "=l=" + sis;
            System.Console.Out.WriteLine("KL(" + annotatedLabel + "||" + label + ") = " + nf.Format(kl) + "\t" + "support(" + sis + ") = " + support2);
            answers.Add(new Pair(annotatedLabel, kl * support2));
            topScores.Add(new Pair(annotatedLabel, kl * support2));
        }

        // Right sisters, scored the same way.
        foreach (object o3 in ((Hashtable)rightRules[label]).Keys)
        {
            string sis = (string)o3;
            ClassicCounter cntr2 = (ClassicCounter)((Hashtable)rightRules[label])[sis];
            double support2 = cntr2.TotalCount();
            double kl = Counters.KlDivergence(cntr2, cntr);
            string annotatedLabel = label + "=r=" + sis;
            System.Console.Out.WriteLine("KL(" + annotatedLabel + "||" + label + ") = " + nf.Format(kl) + "\t" + "support(" + sis + ") = " + support2);
            answers.Add(new Pair(annotatedLabel, kl * support2));
            topScores.Add(new Pair(annotatedLabel, kl * support2));
        }

        System.Console.Out.WriteLine("----");
        System.Console.Out.WriteLine("Sorted descending support * KL");
        answers.Sort(byScoreDescending);
        foreach (object answer in answers)
        {
            Pair p = (Pair)answer;
            double psd = (double)p.Second();
            System.Console.Out.WriteLine(p.First() + ": " + nf.Format(psd));
        }
        System.Console.Out.WriteLine();
    }

    // All enriched categories, sorted by score.
    topScores.Sort(byScoreDescending);
    foreach (object topScore in topScores)
    {
        Pair p = (Pair)topScore;
        double psd = (double)p.Second();
        System.Console.Out.WriteLine(p.First() + ": " + nf.Format(psd));
    }
    System.Console.Out.WriteLine();
    System.Console.Out.WriteLine(" // Automatically generated by SisterAnnotationStats -- preferably don't edit");

    // Walk the scores from highest to lowest, stepping down to a lower cutoff
    // bucket whenever the current score no longer reaches the current one.
    int k = Cutoffs.Length - 1;
    for (int j = 0; j < topScores.Count; j++)
    {
        Pair p = (Pair)topScores[j];
        double psd = (double)p.Second();
        while (k > 0 && psd < Cutoffs[k])
        {
            k--;
        }
        if (psd < Cutoffs[k])
        {
            // Below even the lowest cutoff: nothing further qualifies.
            break;
        }
        javaSB[k].Append("\"").Append(p.First());
        javaSB[k].Append("\",");
    }

    for (int i = 0; i < Cutoffs.Length; i++)
    {
        StringBuilder declaration = javaSB[i];
        int len = declaration.Length;
        // Entries end in "\"," so only the trailing comma is dropped before
        // closing the array; removing two characters would also delete the
        // closing quote of the last literal.
        if (len > 0 && declaration[len - 1] == ',')
        {
            declaration.Remove(len - 1, 1);
        }
        declaration.Append("};");
        System.Console.Out.WriteLine(declaration);
    }

    System.Console.Out.Write(" public static String[] sisterSplit = ");
    for (int i = Cutoffs.Length; i > 0; i--)
    {
        if (i == 1)
        {
            System.Console.Out.Write("sisterSplit1");
        }
        else
        {
            System.Console.Out.Write("selectiveSisterSplit" + i + " ? sisterSplit" + i + " : (");
        }
    }
    // Exactly Cutoffs.Length - 1 parentheses were opened above, so close that
    // many and no more.
    for (int i = Cutoffs.Length; i > 1; i--)
    {
        System.Console.Out.Write(")");
    }
    System.Console.Out.WriteLine(";");
}
/// <summary>
/// Return various statistics about the treebank (number of sentences,
/// words, tag set, etc.).
/// </summary>
/// <remarks>
/// While scanning, nodes whose label is null, or whose label value is null,
/// are given an empty-string label/value, so this method can modify the
/// trees it inspects.
/// </remarks>
/// <param name="tlp">
/// The TreebankLanguagePack used to recognise punctuation tags; may be null,
/// in which case no punctuation statistics are collected.
/// </param>
/// <returns>A big string for human consumption describing the treebank</returns>
/// <exception cref="InvalidOperationException">
/// A subtree is neither a leaf, a preterminal nor phrasal.
/// </exception>
public virtual string TextualSummary(ITreebankLanguagePack tlp)
{
    int numTrees = 0;
    int numTreesLE40 = 0;
    int numNonUnaryRoots = 0;
    Tree nonUnaryEg = null;
    ClassicCounter<Tree> nonUnaries = new ClassicCounter<Tree>();
    ClassicCounter<string> roots = new ClassicCounter<string>();
    ClassicCounter<string> starts = new ClassicCounter<string>();
    ClassicCounter<string> puncts = new ClassicCounter<string>();
    int numUnenclosedLeaves = 0;
    int numLeaves = 0;
    int numNonPhrasal = 0;
    int numPreTerminalWithMultipleChildren = 0;
    int numWords = 0;
    int numTags = 0;
    int shortestSentence = int.MaxValue;
    int longestSentence = 0;
    int numNullLabel = 0;
    ICollection<string> words = Generics.NewHashSet();
    ClassicCounter<string> tags = new ClassicCounter<string>();
    ClassicCounter<string> cats = new ClassicCounter<string>();
    Tree leafEg = null;
    Tree preTerminalMultipleChildrenEg = null;
    Tree nullLabelEg = null;
    Tree rootRewritesAsTaggedWordEg = null;

    foreach (Tree t in this)
    {
        // --- Per-tree statistics ---
        roots.IncrementCount(t.Value());
        numTrees++;
        int leng = t.Yield().Count;
        if (leng <= 40)
        {
            numTreesLE40++;
        }
        if (leng < shortestSentence)
        {
            shortestSentence = leng;
        }
        if (leng > longestSentence)
        {
            longestSentence = leng;
        }

        if (t.NumChildren() > 1)
        {
            if (numNonUnaryRoots == 0)
            {
                nonUnaryEg = t;
            }
            // Only the first 100 non-unary root rewrites are recorded.
            if (numNonUnaryRoots < 100)
            {
                nonUnaries.IncrementCount(t.LocalTree());
            }
            numNonUnaryRoots++;
        }
        else if (t.IsLeaf())
        {
            numUnenclosedLeaves++;
        }
        else
        {
            Tree t2 = t.FirstChild();
            if (t2.IsLeaf())
            {
                numLeaves++;
                leafEg = t;
            }
            else if (t2.IsPreTerminal())
            {
                if (numNonPhrasal == 0)
                {
                    rootRewritesAsTaggedWordEg = t;
                }
                numNonPhrasal++;
            }
            starts.IncrementCount(t2.Value());
        }

        // --- Per-node statistics ---
        foreach (Tree subtree in t)
        {
            ILabel lab = subtree.Label();
            if (lab == null || lab.Value() == null || lab.Value().IsEmpty())
            {
                if (numNullLabel == 0)
                {
                    nullLabelEg = subtree;
                }
                numNullLabel++;
                // Repair null labels/values so the Value() calls below are safe.
                if (lab == null)
                {
                    subtree.SetLabel(new StringLabel(string.Empty));
                }
                else if (lab.Value() == null)
                {
                    subtree.Label().SetValue(string.Empty);
                }
            }

            if (subtree.IsLeaf())
            {
                numWords++;
                words.Add(subtree.Value());
            }
            else if (subtree.IsPreTerminal())
            {
                numTags++;
                tags.IncrementCount(subtree.Value());
                if (tlp != null && tlp.IsPunctuationTag(subtree.Value()))
                {
                    puncts.IncrementCount(subtree.FirstChild().Value());
                }
            }
            else if (subtree.IsPhrasal())
            {
                bool hasLeafChild = false;
                foreach (Tree kt in subtree.Children())
                {
                    if (kt.IsLeaf())
                    {
                        hasLeafChild = true;
                    }
                }
                if (hasLeafChild)
                {
                    numPreTerminalWithMultipleChildren++;
                    if (preTerminalMultipleChildrenEg == null)
                    {
                        preTerminalMultipleChildrenEg = subtree;
                    }
                }
                cats.IncrementCount(subtree.Value());
            }
            else
            {
                throw new InvalidOperationException("Treebank: Bad tree in treebank!: " + subtree);
            }
        }
    }

    StringWriter sw = new StringWriter(2000);
    PrintWriter pw = new PrintWriter(sw);
    NumberFormat nf = NumberFormat.GetNumberInstance();
    nf.SetMaximumFractionDigits(0);
    pw.Println("Treebank has " + numTrees + " trees (" + numTreesLE40 + " of length <= 40) and " + numWords + " words (tokens)");

    if (numTrees > 0)
    {
        if (numTags != numWords)
        {
            pw.Println(" Warning! numTags differs and is " + numTags);
        }
        if (roots.Size() == 1)
        {
            string root = (string)Sharpen.Collections.ToArray(roots.KeySet())[0];
            pw.Println(" The root category is: " + root);
        }
        else
        {
            pw.Println(" Warning! " + roots.Size() + " different roots in treebank: " + Counters.ToString(roots, nf));
        }
        if (numNonUnaryRoots > 0)
        {
            pw.Print(" Warning! " + numNonUnaryRoots + " trees without unary initial rewrite. ");
            if (numNonUnaryRoots > 100)
            {
                pw.Print("First 100 ");
            }
            pw.Println("Rewrites: " + Counters.ToString(nonUnaries, nf));
            pw.Println(" Example: " + nonUnaryEg);
        }
        if (numUnenclosedLeaves > 0 || numLeaves > 0 || numNonPhrasal > 0)
        {
            pw.Println(" Warning! Non-phrasal trees: " + numUnenclosedLeaves + " bare leaves; " + numLeaves + " root rewrites as leaf; and " + numNonPhrasal + " root rewrites as tagged word");
            if (numLeaves > 0)
            {
                pw.Println(" Example bad root rewrites as leaf: " + leafEg);
            }
            if (numNonPhrasal > 0)
            {
                pw.Println(" Example bad root rewrites as tagged word: " + rootRewritesAsTaggedWordEg);
            }
        }
        if (numNullLabel > 0)
        {
            pw.Println(" Warning! " + numNullLabel + " tree nodes with null or empty string labels, e.g.:");
            pw.Println(" " + nullLabelEg);
        }
        if (numPreTerminalWithMultipleChildren > 0)
        {
            pw.Println(" Warning! " + numPreTerminalWithMultipleChildren + " preterminal nodes with multiple children.");
            pw.Println(" Example: " + preTerminalMultipleChildrenEg);
        }

        // Average length truncated to two decimals. The multiplication is
        // done in 64-bit arithmetic: numWords * 100 overflows int once the
        // treebank has more than about 21 million tokens.
        double averageLength = ((numWords * 100L) / numTrees) / 100.0;
        pw.Println(" Sentences range from " + shortestSentence + " to " + longestSentence + " words, with an average length of " + averageLength + " words.");
        pw.Println(" " + cats.Size() + " phrasal category types, " + tags.Size() + " tag types, and " + words.Count + " word types");

        string[] empties = new string[] { "*", "0", "*T*", "*RNR*", "*U*", "*?*", "*EXP*", "*ICH*", "*NOT*", "*PPA*", "*OP*", "*pro*", "*PRO*" };
        // What a dopey choice using 0 as an empty element name!!
        // The problem with the below is that words aren't turned into a basic
        // category, but empties commonly are indexed.... Would need to look
        // for them with a suffix of -[0-9]+
        ICollection<string> knownEmpties = Generics.NewHashSet(Arrays.AsList(empties));
        ICollection<string> emptiesIntersection = Sets.Intersection(words, knownEmpties);
        if (!emptiesIntersection.IsEmpty())
        {
            pw.Println(" Caution! " + emptiesIntersection.Count + " word types are known empty elements: " + emptiesIntersection);
        }
        ICollection<string> joint = Sets.Intersection(cats.KeySet(), tags.KeySet());
        if (!joint.IsEmpty())
        {
            pw.Println(" Warning! " + joint.Count + " items are tags and categories: " + joint);
        }

        // Report only the first offending category / tag containing '@'.
        foreach (string cat in cats.KeySet())
        {
            if (cat != null && cat.Contains("@"))
            {
                pw.Println(" Warning!! Stanford Parser does not work with categories containing '@' like: " + cat);
                break;
            }
        }
        foreach (string cat_1 in tags.KeySet())
        {
            if (cat_1 != null && cat_1.Contains("@"))
            {
                pw.Println(" Warning!! Stanford Parser does not work with tags containing '@' like: " + cat_1);
                break;
            }
        }

        pw.Println(" Cats: " + Counters.ToString(cats, nf));
        pw.Println(" Tags: " + Counters.ToString(tags, nf));
        pw.Println(" " + starts.Size() + " start categories: " + Counters.ToString(starts, nf));
        if (!puncts.IsEmpty())
        {
            pw.Println(" Puncts: " + Counters.ToString(puncts, nf));
        }
    }
    return sw.ToString();
}