/// <summary>
/// Runs leave-one-out validation for every language and returns the source of a
/// Python/matplotlib script that box-plots the per-document error rates.
/// </summary>
/// <param name="languages">Language descriptors to validate.</param>
/// <param name="corpusDirs">Corpus directory for each language; <c>corpusDirs[i]</c> is used with <c>languages[i]</c>.</param>
/// <param name="imageFileName">File name (placed under <c>images/</c>) that the generated script saves its figure to.</param>
/// <returns>The text of the generated Python script.</returns>
public static string testAllLanguages(LangDescriptor[] languages, string[] corpusDirs, string imageFileName)
{
    // The output is Python source, so numbers must always use '.' as the decimal separator.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Names of the Python variables that will hold each language's error list.
    IList<string> languageNames = BuffUtils.map(languages, l => l.name + "_err");

    // Number of corpus files per language; shown in the x-axis labels.
    IDictionary<string, int?> corpusSizes = new Dictionary<string, int?>();
    for (int i = 0; i < languages.Length; i++)
    {
        LangDescriptor language = languages[i];
        IList<string> filenames = Tool.getFilenames(corpusDirs[i], language.fileRegex);
        corpusSizes[language.name] = filenames.Count;
    }

    // Quoted labels of the form "name\nn=size" (the \n is emitted as a Python escape).
    IList<string> languageNamesAsStr = BuffUtils.map(languages, l => "\"" + l.name + "\\nn=" + corpusSizes[l.name] + "\"");

    // One "<name>_err = [e1, e2, ...]" assignment per language.
    StringBuilder data = new StringBuilder();
    for (int i = 0; i < languages.Length; i++)
    {
        LangDescriptor language = languages[i];
        LeaveOneOutValidator validator = new LeaveOneOutValidator(corpusDirs[i], language);
        Triple<IList<Formatter>, IList<float>, IList<float>> results = validator.validateDocuments(true, "/tmp");
        IList<float> errors = results.c;

        // An IList's default ToString() is its type name, so render the elements explicitly.
        List<string> errorLiterals = new List<string>(errors.Count);
        foreach (float error in errors)
        {
            errorLiterals.Add(error.ToString(invariant));
        }
        data.Append(language.name + "_err = [" + string.Join(", ", errorLiterals) + "]\n");
    }

    // Composite-format template: {0}=version, {1}=timestamp, {2}=data assignments,
    // {3}=list of data variable names, {4}=list of labels, {5}=image file name.
    // The template must not contain literal '{' or '}' characters.
    string python =
        "#\n" +
        "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
        "# CodeBuff {0} '{1}'\n" +
        "#\n" +
        "import numpy as np\n" +
        "import pylab\n" +
        "import matplotlib.pyplot as plt\n\n" +
        "{2}\n" +
        "language_data = {3}\n" +
        "labels = {4}\n" +
        "fig = plt.figure()\n" +
        "ax = plt.subplot(111)\n" +
        "ax.boxplot(language_data,\n" +
        " whis=[10, 90], # 10 and 90 % whiskers\n" +
        " widths=.35,\n" +
        " labels=labels,\n" +
        " showfliers=False)\n" +
        "ax.set_xticklabels(labels, rotation=60, fontsize=18)\n" +
        "ax.tick_params(axis='both', which='major', labelsize=18)\n" +
        "plt.xticks(range(1,len(labels)+1), labels, rotation=60, fontsize=18)\n" +
        "pylab.ylim([0,.28])\n" +
        "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n" +
        "ax.set_xlabel(\"Grammar and corpus size\", fontsize=20)\n" +
        "ax.set_ylabel(\"Misclassification Error Rate\", fontsize=20)\n" +
        "# ax.set_title(\"Leave-one-out Validation Using Error Rate\\nBetween Formatted and Original File\")\n" +
        "plt.tight_layout()\n" +
        "fig.savefig('images/{5}', format='pdf')\n" +
        "plt.show()\n";

    string languageDataLiteral = "[" + string.Join(", ", languageNames) + "]";
    string labelsLiteral = "[" + string.Join(", ", languageNamesAsStr) + "]";

    return string.Format(invariant, python, Tool.version, DateTime.Now, data, languageDataLiteral, labelsLiteral, imageFileName);
}
/// <summary>
/// Entry point: generates the leave-one-out plotting script for the ANTLR4 corpus
/// and writes it to <c>python/src/leave_one_out.py</c>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    const string outputPath = "python/src/leave_one_out.py";

    LangDescriptor[] languages = { Tool.ANTLR4_DESCR };

    // One corpus directory per descriptor, in the same order as the descriptors.
    string[] dirs = BuffUtils.map(languages, l => l.corpusDir).ToArray();

    string script = testAllLanguages(languages, dirs, "leave_one_out.pdf");
    org.antlr.codebuff.misc.Utils.writeFile(outputPath, script);

    Console.WriteLine("wrote python code to " + outputPath);
}
/// <summary>
/// Entry point: generates the leave-one-out plotting script for the three Java
/// corpora and writes it to <c>python/src/all_java_leave_one_out.py</c>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
public static void Main(string[] args)
{
    const string outputPath = "python/src/all_java_leave_one_out.py";

    LangDescriptor[] languages = { JAVA_DESCR, JAVA8_DESCR, JAVA_GUAVA_DESCR };

    // One corpus directory per descriptor, in the same order as the descriptors.
    string[] dirs = BuffUtils.map(languages, l => l.corpusDir).ToArray();

    string script = LeaveOneOutValidator.testAllLanguages(languages, dirs, "all_java_leave_one_out.pdf");
    org.antlr.codebuff.misc.Utils.writeFile(outputPath, script);

    Log.WriteLine("wrote python code to " + outputPath);
}
/// <summary>
/// For each file in the language's corpus, formats the file twice (once with a
/// corpus trained on that file alone, once with a corpus trained on the whole
/// corpus directory), records the normalized Levenshtein distance between the
/// original text and each formatted output, and writes a Python/matplotlib
/// script plotting the results to <c>python/src/&lt;language&gt;_one_file_capture.py</c>.
/// </summary>
/// <param name="language">Descriptor of the language whose corpus is analyzed.</param>
public static void runCaptureForOneLanguage(LangDescriptor language)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);

    // Pass 1: train on each file by itself, then format that same file.
    IList<float> selfEditDistances = new List<float>();
    foreach (string fileName in filenames)
    {
        Corpus singleFileCorpus = new Corpus(fileName, language);
        singleFileCorpus.train();
        float editDistance = formatAndMeasureEditDistance(singleFileCorpus, language, fileName);
        Log.WriteLine(fileName + " edit distance " + editDistance);
        selfEditDistances.Add(editDistance);
    }

    // Pass 2: train once on the whole corpus directory, then format every file.
    Corpus fullCorpus = new Corpus(language.corpusDir, language);
    fullCorpus.train();
    IList<float> corpusEditDistances = new List<float>();
    foreach (string fileName in filenames)
    {
        float editDistance = formatAndMeasureEditDistance(fullCorpus, language, fileName);
        Log.WriteLine(fileName + "+corpus edit distance " + editDistance);
        corpusEditDistances.Add(editDistance);
    }

    // heh this gives info on within-corpus variability. i.e., how good/consistent is my corpus?
    // those files with big difference are candidates for dropping from corpus or for cleanup.
    IList<string> labels = BuffUtils.map(filenames, f => "\"" + System.IO.Path.GetFileName(f) + "\"");

    // Template attributes are written as <name> and filled in through ST below.
    string python =
        "#\n" +
        "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
        "# CodeBuff <version> '<date>'\n" +
        "#\n" +
        "import numpy as np\n" +
        "import matplotlib.pyplot as plt\n\n" +
        "fig = plt.figure()\n" +
        "ax = plt.subplot(111)\n" +
        "labels = <labels>\n" +
        "N = len(labels)\n\n" +
        "featureIndexes = range(0,N)\n" +
        "<lang>_self = <selfEditDistances>\n" +
        "<lang>_corpus = <corpusEditDistances>\n" +
        "<lang>_diff = np.abs(np.subtract(<lang>_self, <lang>_corpus))\n\n" +
        "all = zip(<lang>_self, <lang>_corpus, <lang>_diff, labels)\n" +
        "all = sorted(all, key=lambda x : x[2], reverse=True)\n" +
        "<lang>_self, <lang>_corpus, <lang>_diff, labels = zip(*all)\n\n" +
        "ax.plot(featureIndexes, <lang>_self, label=\"<lang>_self\")\n" +
        "#ax.plot(featureIndexes, <lang>_corpus, label=\"<lang>_corpus\")\n" +
        "ax.plot(featureIndexes, <lang>_diff, label=\"<lang>_diff\")\n" +
        "ax.set_xticklabels(labels, rotation=60, fontsize=8)\n" +
        "plt.xticks(featureIndexes, labels, rotation=60)\n" +
        "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n\n" +
        "ax.text(1, .25, 'median $f$ self distance = %5.3f, corpus+$f$ distance = %5.3f' %" +
        " (np.median(<lang>_self),np.median(<lang>_corpus)))\n" +
        "ax.set_xlabel(\"File Name\")\n" +
        "ax.set_ylabel(\"Edit Distance\")\n" +
        "ax.set_title(\"Difference between Formatting File <lang> $f$\\nwith Training=$f$ and Training=$f$+Corpus\")\n" +
        "plt.legend()\n" +
        "plt.tight_layout()\n" +
        "fig.savefig(\"images/" + language.name + "_one_file_capture.pdf\", format='pdf')\n" +
        "plt.show()\n";

    ST pythonST = new ST(python);
    pythonST.add("lang", language.name);
    pythonST.add("version", version);
    pythonST.add("date", DateTime.Now);
    // An IList's default ToString() is its type name, so the lists are rendered
    // explicitly as Python list literals before being handed to the template.
    pythonST.add("labels", "[" + string.Join(", ", labels) + "]");
    pythonST.add("selfEditDistances", toPythonFloatList(selfEditDistances));
    pythonST.add("corpusEditDistances", toPythonFloatList(corpusEditDistances));
    string code = pythonST.render();

    string scriptFileName = "python/src/" + language.name + "_one_file_capture.py";
    org.antlr.codebuff.misc.Utils.writeFile(scriptFileName, code);
    Log.WriteLine("wrote python code to " + scriptFileName);
}

// Formats fileName using the given trained corpus and returns the normalized
// Levenshtein distance between the file's original content and the formatted output.
private static float formatAndMeasureEditDistance(Corpus corpus, LangDescriptor language, string fileName)
{
    InputDocument testDoc = Tool.parse(fileName, corpus.language);
    Formatter formatter = new Formatter(corpus, language.indentSize);
    string output = formatter.format(testDoc, false);
    return Dbg.normalizedLevenshteinDistance(testDoc.content, output);
}

// Renders values as a Python list literal, e.g. "[0.1, 0.25]", using the
// invariant culture so the decimal separator is always '.'.
private static string toPythonFloatList(IList<float> values)
{
    List<string> literals = new List<string>(values.Count);
    foreach (float value in values)
    {
        literals.Add(value.ToString(System.Globalization.CultureInfo.InvariantCulture));
    }
    return "[" + string.Join(", ", literals) + "]";
}
/// <summary>
/// Trains a corpus for the language and prints statistics on how consistent it
/// is: exemplars are grouped by feature vector (separately for the inject-whitespace
/// and hpos feature subsets), and every group whose exemplars map to more than one
/// category is counted as ambiguous and contributes a normalized category entropy.
/// </summary>
/// <param name="language">Descriptor of the language whose corpus is analyzed.</param>
/// <param name="report">When true, also prints each ambiguous group with its entropy and exemplars.</param>
public static void computeConsistency(LangDescriptor language, bool report)
{
    if (report)
    {
        Console.WriteLine("-----------------------------------");
        Console.WriteLine(language.name);
        Console.WriteLine("-----------------------------------");
    }

    Corpus corpus = new Corpus(language.corpusDir, language);
    corpus.train();

    // a map of feature vector to list of exemplar indexes of that feature
    MyMultiMap<FeatureVectorAsObject, int> wsContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();
    MyMultiMap<FeatureVectorAsObject, int> hposContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();

    int n = corpus.featureVectors.Count;
    for (int i = 0; i < n; i++)
    {
        int[] features = corpus.featureVectors[i];
        wsContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), i);
        hposContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), i);
    }

    // Total number of exemplars that belong to an ambiguous feature-vector group.
    int numAmbiguousWsVectors = 0;
    int numAmbiguousHposVectors = 0;

    // Dump output grouped by ws vs hpos then feature vector then category
    if (report)
    {
        Console.WriteLine(" --- INJECT WS ---");
    }
    IList<double> wsEntropies = new List<double>();
    foreach (FeatureVectorAsObject fo in wsContextToIndex.Keys)
    {
        var exemplarIndexes = wsContextToIndex[fo];

        // we have group by feature vector, now group by cat with that set for ws
        MyMultiMap<int, int> wsCatToIndexes = new MyMultiMap<int, int>();
        foreach (int i in exemplarIndexes)
        {
            wsCatToIndexes.Map(corpus.injectWhitespace[i], i);
        }
        if (wsCatToIndexes.Count == 1)
        {
            continue; // a single category for this feature vector: not ambiguous
        }

        if (report)
        {
            Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
        }
        IList<int> catCounts = BuffUtils.map(wsCatToIndexes.Values, (x) => x.size());
        double wsEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
        if (report)
        {
            Console.Write("entropy={0,5:F4}\n", wsEntropy);
        }

        // Weight the group's entropy by its exemplar count before accumulating.
        wsEntropy *= exemplarIndexes.size();
        wsEntropies.Add(wsEntropy);
        numAmbiguousWsVectors += exemplarIndexes.size();

        if (report)
        {
            Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_INJECT_WS));
            foreach (int cat in wsCatToIndexes.Keys)
            {
                var indexes = wsCatToIndexes[cat];
                foreach (int i in indexes)
                {
                    string display = getExemplarDisplay(Trainer.FEATURES_INJECT_WS, corpus, corpus.injectWhitespace, i);
                    Console.WriteLine(display);
                }
                Console.WriteLine();
            }
        }
    }

    if (report)
    {
        Console.WriteLine(" --- HPOS ---");
    }
    IList<double> hposEntropies = new List<double>();
    foreach (FeatureVectorAsObject fo in hposContextToIndex.Keys)
    {
        var exemplarIndexes = hposContextToIndex[fo];

        // we have group by feature vector, now group by cat with that set for hpos
        MyMultiMap<int, int> hposCatToIndexes = new MyMultiMap<int, int>();
        foreach (int i in exemplarIndexes)
        {
            hposCatToIndexes.Map(corpus.hpos[i], i);
        }
        if (hposCatToIndexes.Count == 1)
        {
            continue; // a single category for this feature vector: not ambiguous
        }

        if (report)
        {
            Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
        }
        IList<int> catCounts = BuffUtils.map(hposCatToIndexes.Values, (x) => x.size());
        double hposEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
        if (report)
        {
            Console.Write("entropy={0,5:F4}\n", hposEntropy);
        }

        // Weight the group's entropy by its exemplar count before accumulating.
        hposEntropy *= exemplarIndexes.size();
        hposEntropies.Add(hposEntropy);
        numAmbiguousHposVectors += exemplarIndexes.size();

        if (report)
        {
            Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_HPOS));
            foreach (int cat in hposCatToIndexes.Keys)
            {
                var indexes = hposCatToIndexes[cat];
                foreach (int i in indexes)
                {
                    string display = getExemplarDisplay(Trainer.FEATURES_HPOS, corpus, corpus.hpos, i);
                    Console.WriteLine(display);
                }
                Console.WriteLine();
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine(language.name);
    Console.WriteLine("There are " + wsContextToIndex.Count + " unique ws feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * wsContextToIndex.Count / n));
    Console.WriteLine("There are " + hposContextToIndex.Count + " unique hpos feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * hposContextToIndex.Count / n));

    float probWsAmbiguous = numAmbiguousWsVectors / (float)n;
    Console.Write("num_ambiguous_ws_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", numAmbiguousWsVectors, n, probWsAmbiguous);
    float probHposAmbiguous = numAmbiguousHposVectors / (float)n;
    Console.Write("num_ambiguous_hpos_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", numAmbiguousHposVectors, n, probHposAmbiguous);

    Console.WriteLine("ws median,mean = " + BuffUtils.median(wsEntropies) + "," + BuffUtils.mean(wsEntropies));
    // With no ambiguous exemplars the quotient below would be 0/0 (NaN); the
    // expected entropy of a fully consistent corpus is reported as 0 instead.
    double expectedWsEntropy = numAmbiguousWsVectors == 0
        ? 0.0
        : (BuffUtils.sumDoubles(wsEntropies) / numAmbiguousWsVectors) * probWsAmbiguous;
    Console.WriteLine("expected_ws_entropy=" + expectedWsEntropy);

    Console.WriteLine("hpos median,mean = " + BuffUtils.median(hposEntropies) + "," + BuffUtils.mean(hposEntropies));
    // Same zero-ambiguity guard as for the ws statistic above.
    double expectedHposEntropy = numAmbiguousHposVectors == 0
        ? 0.0
        : (BuffUtils.sumDoubles(hposEntropies) / numAmbiguousHposVectors) * probHposAmbiguous;
    Console.WriteLine("expected_hpos_entropy=" + expectedHposEntropy);
}