public static Triple<Formatter, float, float> validate(LangDescriptor language, IList<InputDocument> documents, InputDocument testDoc, bool saveOutput, bool computeEditDistance)
{
    // kNNClassifier.resetCache();
    Corpus corpus = new Corpus(documents, language);
    corpus.train();
    // System.out.printf("%d feature vectors\n", corpus.featureVectors.size());
    Formatter formatter = new Formatter(corpus, language.indentSize);
    string output = formatter.format(testDoc, false);
    float editDistance = 0;
    if (computeEditDistance)
    {
        editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
    }
    ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.AnalysisPerToken);
    // System.out.println(testDoc.fileName+": edit distance = "+editDistance+", error rate = "+analysis.getErrorRate());
    if (saveOutput)
    {
        string dir = outputDir + "/" + language.name;
        if (!System.IO.Directory.Exists(dir))
        {
            System.IO.Directory.CreateDirectory(dir);
        }
        org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
    }
    return new Triple<Formatter, float, float>(formatter, editDistance, analysis.ErrorRate);
}
public virtual Triple<Formatter, float, float> validate(LangDescriptor language, IList<InputDocument> documents, string fileToExclude, int k, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures, string outputDir, bool computeEditDistance, bool collectAnalysis)
{
    string path = System.IO.Path.GetFullPath(fileToExclude);
    IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
    IList<InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));
    Debug.Assert(others.Count == documents.Count - 1);
    // kNNClassifier.resetCache();
    if (excluded.Count == 0)
    {
        Console.Error.WriteLine("Doc not in corpus: " + path);
        return null;
    }
    InputDocument testDoc = excluded[0];

    DateTime start = System.DateTime.Now;
    Corpus corpus = new Corpus(others, language);
    corpus.train();
    DateTime stop = System.DateTime.Now;

    Formatter formatter = new Formatter(corpus, language.indentSize, k, injectWSFeatures, alignmentFeatures);
    InputDocument originalDoc = testDoc;
    DateTime format_start = System.DateTime.Now;
    string output = formatter.format(testDoc, collectAnalysis);
    DateTime format_stop = System.DateTime.Now;

    float editDistance = 0;
    if (computeEditDistance)
    {
        editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
    }
    ClassificationAnalysis analysis = new ClassificationAnalysis(originalDoc, formatter.AnalysisPerToken);
    Console.WriteLine(testDoc.fileName + ": edit distance = " + editDistance + ", error rate = " + analysis.ErrorRate);

    if (outputDir != null)
    {
        string dir = outputDir + "/" + language.name + "/" + Tool.version;
        if (!System.IO.Directory.Exists(dir))
        {
            System.IO.Directory.CreateDirectory(dir);
        }
        org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
    }

    TimeSpan tms = stop - start;
    TimeSpan fms = format_stop - format_start;
    trainingTimes.Add(tms.TotalMilliseconds); // TotalMilliseconds, not Milliseconds, which only reports the 0-999 ms component
    float tokensPerMS = testDoc.tokens.Size / (float)fms.TotalMilliseconds;
    formattingTokensPerMS.Add((double)tokensPerMS);
    Console.Write("Training time = {0:D} ms, formatting {1:D} ms, {2,5:F3} tokens/ms ({3:D} tokens)\n",
                  (int)tms.TotalMilliseconds, (int)fms.TotalMilliseconds, tokensPerMS, testDoc.tokens.Size);
    // System.out.printf("classify calls %d, hits %d rate %f\n",
    //     kNNClassifier.nClassifyCalls, kNNClassifier.nClassifyCacheHits,
    //     kNNClassifier.nClassifyCacheHits/(float) kNNClassifier.nClassifyCalls);
    // System.out.printf("kNN calls %d, hits %d rate %f\n",
    //     kNNClassifier.nNNCalls, kNNClassifier.nNNCacheHits,
    //     kNNClassifier.nNNCacheHits/(float) kNNClassifier.nNNCalls);
    return new Triple<Formatter, float, float>(formatter, editDistance, analysis.ErrorRate);
}
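// Sketch (not part of the original source): one plausible way to drive the
// leave-one-out overload above across an entire corpus. It reuses
// Tool.getFilenames/Tool.load, Formatter.DEFAULT_K and the Trainer feature
// arrays that appear elsewhere in this section; the driver method itself and
// its name are assumptions, not CodeBuff API.
public virtual void validateAllFiles(LangDescriptor language, string outputDir)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<InputDocument> documents = Tool.load(filenames, language);
    foreach (string fileName in filenames)
    {
        // exclude each file from training, then format it and report the edit distance
        validate(language, documents, fileName, Formatter.DEFAULT_K,
                 Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS,
                 outputDir, true, false);
    }
}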
public static void runCaptureForOneLanguage(LangDescriptor language)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<InputDocument> documents = Tool.load(filenames, language);
    foreach (string fileName in filenames)
    {
        // Examine info for this file in isolation
        Corpus fileCorpus = new Corpus(fileName, language);
        fileCorpus.train();
        Console.WriteLine(fileName);
        // examineCorpus(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> ws = getWSContextCategoryMap(fileCorpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> hpos = getHPosContextCategoryMap(fileCorpus);

        // Compare with corpus minus this file
        string path = fileName;
        IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
        Corpus corpus = new Corpus(others, language);
        corpus.train();
        // examineCorpus(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> corpus_ws = getWSContextCategoryMap(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> corpus_hpos = getHPosContextCategoryMap(corpus);

        foreach (FeatureVectorAsObject x in ws.Keys)
        {
            HashBag<int> fwsCats = getCategoriesBag(ws[x]);
            IList<float> fwsRatios = getCategoryRatios(fwsCats.Values);
            HashBag<int> wsCats = getCategoriesBag(corpus_ws[x]);
            IList<float> wsRatios = getCategoryRatios(wsCats.Values);
            // compare file predictions with corpus predictions
            if (!fwsRatios.SequenceEqual(wsRatios))
            {
                // join the ratios explicitly; printing the lists directly would only show their type names
                Console.WriteLine("[" + string.Join(", ", fwsRatios) + "] vs [" + string.Join(", ", wsRatios) + "]");
            }
            HashBag<int> fhposCats = getCategoriesBag(hpos[x]);
            HashBag<int> hposCats = getCategoriesBag(corpus_hpos[x]);
        }
        break; // only the first file is examined
    }
}
public static org.antlr.codebuff.misc.Pair<int, int> test(LangDescriptor language, IList<InputDocument> others, InputDocument testDoc)
{
    var train_start = System.DateTime.Now;
    Corpus corpus = new Corpus(others, language);
    corpus.train();
    var train_stop = System.DateTime.Now;

    var format_start = System.DateTime.Now;
    Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, FEATURES_INJECT_WS, FEATURES_HPOS);
    formatter.format(testDoc, false);
    var format_stop = System.DateTime.Now;

    // DateTime subtraction yields a TimeSpan; report whole milliseconds
    long train_time = (long)(train_stop - train_start).TotalMilliseconds;
    long format_time = (long)(format_stop - format_start).TotalMilliseconds;
    Log.Write("{0} training of {1} = {2:D}ms formatting = {3:D}ms\n", language.name, testDoc.fileName, train_time, format_time);
    return new org.antlr.codebuff.misc.Pair<int, int>((int)train_time, (int)format_time);
}
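// Sketch (not part of the original source): a hypothetical driver that runs
// the timing test above over every corpus file with a leave-one-out split and
// reports the mean training and formatting times. Tool.getFilenames, Tool.load
// and BuffUtils.filter are used as elsewhere in this section; the assumption
// that Pair exposes its two values as fields a and b may not match the port.
public static void timeAllFiles(LangDescriptor language)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<InputDocument> documents = Tool.load(filenames, language);
    long totalTrain = 0;
    long totalFormat = 0;
    foreach (InputDocument testDoc in documents)
    {
        IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(testDoc.fileName));
        org.antlr.codebuff.misc.Pair<int, int> times = test(language, others, testDoc);
        totalTrain += times.a;
        totalFormat += times.b;
    }
    Console.WriteLine("avg training = " + totalTrain / documents.Count +
                      "ms, avg formatting = " + totalFormat / documents.Count + "ms");
}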
public static void runCaptureForOneLanguage(LangDescriptor language)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<float> selfEditDistances = new List<float>();
    foreach (string fileName in filenames)
    {
        Corpus corpus = new Corpus(fileName, language);
        corpus.train();
        InputDocument testDoc = Tool.parse(fileName, corpus.language);
        Formatter formatter = new Formatter(corpus, language.indentSize);
        string output = formatter.format(testDoc, false);
        // System.out.println(output);
        float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
        Log.WriteLine(fileName + " edit distance " + editDistance);
        selfEditDistances.Add(editDistance);
    }

    {
        Corpus corpus = new Corpus(language.corpusDir, language);
        corpus.train();
        IList<float> corpusEditDistances = new List<float>();
        foreach (string fileName in filenames)
        {
            InputDocument testDoc = Tool.parse(fileName, corpus.language);
            Formatter formatter = new Formatter(corpus, language.indentSize);
            string output = formatter.format(testDoc, false);
            // System.out.println(output);
            float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
            Log.WriteLine(fileName + "+corpus edit distance " + editDistance);
            corpusEditDistances.Add(editDistance);
        }
        // heh this gives info on within-corpus variability. i.e., how good/consistent is my corpus?
        // those files with big difference are candidates for dropping from corpus or for cleanup.
        IList<string> labels = BuffUtils.map(filenames, f => '"' + System.IO.Path.GetFileName(f) + '"');
        string python = "#\n" +
            "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
            "# CodeBuff <version> '<date>'\n" +
            "#\n" +
            "import numpy as np\n" +
            "import matplotlib.pyplot as plt\n\n" +
            "fig = plt.figure()\n" +
            "ax = plt.subplot(111)\n" +
            "labels = <labels>\n" +
            "N = len(labels)\n\n" +
            "featureIndexes = range(0,N)\n" +
            "<lang>_self = <selfEditDistances>\n" +
            "<lang>_corpus = <corpusEditDistances>\n" +
            "<lang>_diff = np.abs(np.subtract(<lang>_self, <lang>_corpus))\n\n" +
            "all = zip(<lang>_self, <lang>_corpus, <lang>_diff, labels)\n" +
            "all = sorted(all, key=lambda x : x[2], reverse=True)\n" +
            "<lang>_self, <lang>_corpus, <lang>_diff, labels = zip(*all)\n\n" +
            "ax.plot(featureIndexes, <lang>_self, label=\"<lang>_self\")\n" +
            "#ax.plot(featureIndexes, <lang>_corpus, label=\"<lang>_corpus\")\n" +
            "ax.plot(featureIndexes, <lang>_diff, label=\"<lang>_diff\")\n" +
            "ax.set_xticklabels(labels, rotation=60, fontsize=8)\n" +
            "plt.xticks(featureIndexes, labels, rotation=60)\n" +
            "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n\n" +
            "ax.text(1, .25, 'median $f$ self distance = %5.3f, corpus+$f$ distance = %5.3f' %" +
            " (np.median(<lang>_self),np.median(<lang>_corpus)))\n" +
            "ax.set_xlabel(\"File Name\")\n" +
            "ax.set_ylabel(\"Edit Distance\")\n" +
            "ax.set_title(\"Difference between Formatting File <lang> $f$\\nwith Training=$f$ and Training=$f$+Corpus\")\n" +
            "plt.legend()\n" +
            "plt.tight_layout()\n" +
            "fig.savefig(\"images/" + language.name + "_one_file_capture.pdf\", format='pdf')\n" +
            "plt.show()\n";
        ST pythonST = new ST(python);
        pythonST.add("lang", language.name);
        pythonST.add("version", version);
        pythonST.add("date", DateTime.Now);
        // render the lists as Python-style literals; List<T>.ToString() would only print the type name
        pythonST.add("labels", "[" + string.Join(", ", labels) + "]");
        pythonST.add("selfEditDistances", "[" + string.Join(", ", selfEditDistances) + "]");
        pythonST.add("corpusEditDistances", "[" + string.Join(", ", corpusEditDistances) + "]");
        string code = pythonST.render();

        {
            string fileName = "python/src/" + language.name + "_one_file_capture.py";
            org.antlr.codebuff.misc.Utils.writeFile(fileName, code);
            Log.WriteLine("wrote python code to " + fileName);
        }
    }
}
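// Sketch (not part of the original source): the edit distances reported above
// come from Dbg.normalizedLevenshteinDistance. A common definition, and one
// plausible reading of that helper, is raw Levenshtein distance divided by the
// longer string's length, so 0 means identical output and 1 means completely
// different; the real Dbg implementation may differ in detail.
public static float normalizedLevenshtein(string a, string b)
{
    int[,] d = new int[a.Length + 1, b.Length + 1];
    for (int i = 0; i <= a.Length; i++) d[i, 0] = i; // delete all of a
    for (int j = 0; j <= b.Length; j++) d[0, j] = j; // insert all of b
    for (int i = 1; i <= a.Length; i++)
    {
        for (int j = 1; j <= b.Length; j++)
        {
            int cost = a[i - 1] == b[j - 1] ? 0 : 1;
            d[i, j] = Math.Min(Math.Min(d[i - 1, j] + 1,      // deletion
                                        d[i, j - 1] + 1),     // insertion
                               d[i - 1, j - 1] + cost);       // substitution
        }
    }
    int maxLen = Math.Max(a.Length, b.Length);
    return maxLen == 0 ? 0 : d[a.Length, b.Length] / (float)maxLen;
}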
public static void computeConsistency(LangDescriptor language, bool report)
{
    if (report)
    {
        Console.WriteLine("-----------------------------------");
        Console.WriteLine(language.name);
        Console.WriteLine("-----------------------------------");
    }
    Corpus corpus = new Corpus(language.corpusDir, language);
    corpus.train();

    // map each feature vector to the list of exemplar indexes with that vector
    MyMultiMap<FeatureVectorAsObject, int> wsContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();
    MyMultiMap<FeatureVectorAsObject, int> hposContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();
    int n = corpus.featureVectors.Count;
    for (int i = 0; i < n; i++)
    {
        int[] features = corpus.featureVectors[i];
        wsContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), i);
        hposContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), i);
    }

    int num_ambiguous_ws_vectors = 0;
    int num_ambiguous_hpos_vectors = 0;

    // Dump output grouped by ws vs hpos, then feature vector, then category
    if (report)
    {
        Console.WriteLine(" --- INJECT WS ---");
    }
    IList<double> ws_entropies = new List<double>();
    foreach (FeatureVectorAsObject fo in wsContextToIndex.Keys)
    {
        var exemplarIndexes = wsContextToIndex[fo];
        // we have grouped by feature vector; now group that set by ws category
        MyMultiMap<int, int> wsCatToIndexes = new MyMultiMap<int, int>();
        foreach (int i in exemplarIndexes)
        {
            wsCatToIndexes.Map(corpus.injectWhitespace[i], i);
        }
        if (wsCatToIndexes.Count == 1)
        {
            continue; // unambiguous: only one ws category for this context
        }
        if (report)
        {
            Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
        }
        IList<int> catCounts = BuffUtils.map(wsCatToIndexes.Values, (x) => x.size());
        double wsEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
        if (report)
        {
            Console.Write("entropy={0,5:F4}\n", wsEntropy);
        }
        wsEntropy *= exemplarIndexes.size(); // weight entropy by the number of exemplars in this context
        ws_entropies.Add(wsEntropy);
        num_ambiguous_ws_vectors += exemplarIndexes.size();
        if (report)
        {
            Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_INJECT_WS));
            foreach (int cat in wsCatToIndexes.Keys)
            {
                var indexes = wsCatToIndexes[cat];
                foreach (int i in indexes)
                {
                    string display = getExemplarDisplay(Trainer.FEATURES_INJECT_WS, corpus, corpus.injectWhitespace, i);
                    Console.WriteLine(display);
                }
                Console.WriteLine();
            }
        }
    }

    if (report)
    {
        Console.WriteLine(" --- HPOS ---");
    }
    IList<double> hpos_entropies = new List<double>();
    foreach (FeatureVectorAsObject fo in hposContextToIndex.Keys)
    {
        MyHashSet<int> exemplarIndexes = hposContextToIndex[fo];
        // we have grouped by feature vector; now group that set by hpos category
        MyMultiMap<int, int> hposCatToIndexes = new MyMultiMap<int, int>();
        foreach (int i in exemplarIndexes)
        {
            hposCatToIndexes.Map(corpus.hpos[i], i);
        }
        if (hposCatToIndexes.Count == 1)
        {
            continue; // unambiguous: only one hpos category for this context
        }
        if (report)
        {
            Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
        }
        IList<int> catCounts = BuffUtils.map(hposCatToIndexes.Values, (x) => x.size());
        double hposEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
        if (report)
        {
            Console.Write("entropy={0,5:F4}\n", hposEntropy);
        }
        hposEntropy *= exemplarIndexes.size();
        hpos_entropies.Add(hposEntropy);
        num_ambiguous_hpos_vectors += exemplarIndexes.size();
        if (report)
        {
            Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_HPOS));
            foreach (int cat in hposCatToIndexes.Keys)
            {
                var indexes = hposCatToIndexes[cat];
                foreach (int i in indexes)
                {
                    string display = getExemplarDisplay(Trainer.FEATURES_HPOS, corpus, corpus.hpos, i);
                    Console.WriteLine(display);
                }
                Console.WriteLine();
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine(language.name);
    Console.WriteLine("There are " + wsContextToIndex.Count + " unique ws feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * wsContextToIndex.Count / n));
    Console.WriteLine("There are " + hposContextToIndex.Count + " unique hpos feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * hposContextToIndex.Count / n));
    float prob_ws_ambiguous = num_ambiguous_ws_vectors / (float)n;
    Console.Write("num_ambiguous_ws_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_ws_vectors, n, prob_ws_ambiguous);
    float prob_hpos_ambiguous = num_ambiguous_hpos_vectors / (float)n;
    Console.Write("num_ambiguous_hpos_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_hpos_vectors, n, prob_hpos_ambiguous);
    // Collections.sort(ws_entropies);
    // System.out.println("ws_entropies="+ws_entropies);
    Console.WriteLine("ws median,mean = " + BuffUtils.median(ws_entropies) + "," + BuffUtils.mean(ws_entropies));
    double expected_ws_entropy = (BuffUtils.sumDoubles(ws_entropies) / num_ambiguous_ws_vectors) * prob_ws_ambiguous;
    Console.WriteLine("expected_ws_entropy=" + expected_ws_entropy);
    Console.WriteLine("hpos median,mean = " + BuffUtils.median(hpos_entropies) + "," + BuffUtils.mean(hpos_entropies));
    double expected_hpos_entropy = (BuffUtils.sumDoubles(hpos_entropies) / num_ambiguous_hpos_vectors) * prob_hpos_ambiguous;
    Console.WriteLine("expected_hpos_entropy=" + expected_hpos_entropy);
}
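// Sketch (not part of the original source): computeConsistency relies on
// Entropy.getCategoryRatios and Entropy.getNormalizedCategoryEntropy. One
// plausible reading of that pair is shown below: category counts are turned
// into probabilities, Shannon entropy is computed in bits, and the result is
// divided by log2 of the number of categories so a uniform split scores 1.0.
// The real Entropy class may differ in detail.
public static double normalizedCategoryEntropy(IList<int> catCounts)
{
    double total = 0;
    foreach (int c in catCounts)
    {
        total += c;
    }
    double h = 0;
    foreach (int c in catCounts)
    {
        if (c == 0) continue;
        double p = c / total;
        h += -p * Math.Log(p, 2); // Shannon entropy contribution in bits
    }
    // normalize so a perfectly uniform split over the observed categories gives 1.0
    return catCounts.Count <= 1 ? 0 : h / Math.Log(catCounts.Count, 2);
}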