public static IList<float?> checkStability(LangDescriptor language)
{
    IList<float?> errorRates = new List<float?>();

    // format the corpus into tmp dir
    LeaveOneOutValidator validator0 = new LeaveOneOutValidator(language.corpusDir, language);
    Triple<IList<Formatter>, IList<float>, IList<float>> results0 = validator0.validateDocuments(false, "/tmp/stability/1");
    errorRates.Add(BuffUtils.median(results0.c));

    IList<Formatter> formatters0 = results0.a;

    // now try formatting it over and over
    for (int i = 1; i <= STAGES; i++)
    {
        string inputDir = "/tmp/stability/" + i;
        string outputDir = "/tmp/stability/" + (i + 1);
        LeaveOneOutValidator validator = new LeaveOneOutValidator(inputDir, language);
        Triple<IList<Formatter>, IList<float>, IList<float>> results = validator.validateDocuments(false, outputDir);
        IList<Formatter> formatters = results.a;
        IList<float?> distances = new List<float?>();
        for (int j = 0; j < formatters.Count; j++)
        {
            Formatter f0 = formatters0[j];
            Formatter f = formatters[j];
            float editDistance = Dbg.normalizedLevenshteinDistance(f.Output, f0.Output);
            distances.Add(editDistance);
        }
        errorRates.Add(BuffUtils.median(distances));
    }
    return errorRates;
}
public static Triple<Formatter, float, float> validate(LangDescriptor language, IList<InputDocument> documents, InputDocument testDoc, bool saveOutput, bool computeEditDistance)
{
    // kNNClassifier.resetCache();
    Corpus corpus = new Corpus(documents, language);
    corpus.train();
    // System.out.printf("%d feature vectors\n", corpus.featureVectors.size());
    Formatter formatter = new Formatter(corpus, language.indentSize);
    string output = formatter.format(testDoc, false);
    float editDistance = 0;
    if (computeEditDistance)
    {
        editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
    }
    ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.AnalysisPerToken);
    // System.out.println(testDoc.fileName+": edit distance = "+editDistance+", error rate = "+analysis.getErrorRate());
    if (saveOutput)
    {
        string dir = outputDir + "/" + language.name;
        if (!System.IO.Directory.Exists(dir))
        {
            System.IO.Directory.CreateDirectory(dir);
        }
        org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
    }
    return new Triple<Formatter, float, float>(formatter, editDistance, analysis.ErrorRate);
}
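// Usage sketch (hedged): train on all documents except one held-out file and
// score that file, mirroring what the subset and leave-one-out drivers in this
// file do. validateOneHeldOut is a hypothetical helper name; everything it
// calls (Tool.getFilenames, Tool.load, BuffUtils.filter, validate) appears above.
public static void validateOneHeldOut(LangDescriptor language)
{
    IList<string> files = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<InputDocument> documents = Tool.load(files, language);
    InputDocument testDoc = documents[0];                                        // hold out the first doc
    IList<InputDocument> training = BuffUtils.filter(documents, d => d != testDoc);
    Triple<Formatter, float, float> result = validate(language, training, testDoc, false, true);
    Console.WriteLine(testDoc.fileName + ": edit distance = " + result.b + ", error rate = " + result.c);
}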
public static void Main(string[] args)
{
    LangDescriptor[] languages = new LangDescriptor[] { Tool.ANTLR4_DESCR };

    int maxNumFiles = 30;
    int trials = 50;
    IDictionary<string, float[]> results = new Dictionary<string, float[]>();
    foreach (LangDescriptor language in languages)
    {
        float[] medians = getMedianErrorRates(language, maxNumFiles, trials);
        results[language.name] = medians;
    }
    string python =
        "#\n" +
        "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
        "# CodeBuff <version> '<date>'\n" +
        "#\n" +
        "import numpy as np\n" +
        "import matplotlib.pyplot as plt\n\n" +
        "fig = plt.figure()\n" +
        "ax = plt.subplot(111)\n" +
        "N = <maxNumFiles>\n" +
        "sizes = range(1,N+1)\n" +
        "<results:{r |\n" +
        "<r> = [<rest(results.(r)); separator={,}>]\n" +
        "ax.plot(range(1,len(<r>)+1), <r>, label=\"<r>\", marker='<markers.(r)>', color='<colors.(r)>')\n" +
        "}>\n" +
        "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n" +
        "ax.set_xlabel(\"Number of training files in sample corpus subset\", fontsize=14)\n" +
        "ax.set_ylabel(\"Median Error rate for <trials> trials\", fontsize=14)\n" +
        "ax.set_title(\"Effect of Corpus size on Median Leave-one-out Validation Error Rate\")\n" +
        "plt.legend()\n" +
        "plt.tight_layout()\n" +
        "fig.savefig('images/subset_validator.pdf', format='pdf')\n" +
        "plt.show()\n";
    ST pythonST = new ST(python);
    pythonST.add("results", results);
    pythonST.add("markers", LeaveOneOutValidator.nameToGraphMarker);
    pythonST.add("colors", LeaveOneOutValidator.nameToGraphColor);
    pythonST.add("version", version);
    pythonST.add("date", DateTime.Now);
    pythonST.add("trials", trials);
    pythonST.add("maxNumFiles", maxNumFiles);
    string fileName = "python/src/subset_validator.py";
    org.antlr.codebuff.misc.Utils.writeFile(fileName, pythonST.render());
    Console.WriteLine("wrote python code to " + fileName);
}
public static void Main(string[] args)
{
    LeaveOneOutValidator.FORCE_SINGLE_THREADED = true; // need this when we compare results file by file
    LangDescriptor[] languages = new LangDescriptor[] { QUORUM_DESCR };

    IDictionary<string, IList<float?>> results = new Dictionary<string, IList<float?>>();
    foreach (LangDescriptor language in languages)
    {
        IList<float?> errorRates = checkStability(language);
        Log.WriteLine(language.name + " " + errorRates);
        results[language.name] = errorRates;
    }
    foreach (string name in results.Keys)
    {
        Log.WriteLine(name + " = " + results[name]);
    }
    string python =
        "#\n" +
        "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
        "# CodeBuff <version> '<date>'\n" +
        "#\n" +
        "import numpy as np\n" +
        "import matplotlib.pyplot as plt\n\n" +
        "import matplotlib\n" +
        "fig = plt.figure()\n" +
        "ax = plt.subplot(111)\n" +
        "N = <N>\n" +
        "sizes = range(0,N)\n" +
        "<results:{r |\n" +
        "<r> = [<results.(r); separator={,}>]\n" +
        "ax.plot(sizes, <r>, label=\"<r>\", marker='o')\n" +
        "}>\n" +
        "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n" +
        "xa = ax.get_xaxis()\n" +
        "xa.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))\n" +
        "ax.set_xlabel(\"Formatting Stage; stage 0 is first formatting pass\")\n" +
        "ax.set_ylabel(\"Median Leave-one-out Validation Error Rate\")\n" +
        "ax.set_title(\"<N>-Stage Formatting Stability\\nStage $n$ is formatted output of stage $n-1$\")\n" +
        "plt.legend()\n" +
        "plt.tight_layout()\n" +
        "fig.savefig('images/stability.pdf', format='pdf')\n" +
        "plt.show()\n";
    ST pythonST = new ST(python);
    pythonST.add("results", results);
    pythonST.add("version", version);
    pythonST.add("date", DateTime.Now);
    pythonST.add("N", STAGES + 1);
    string fileName = "python/src/stability.py";
    org.antlr.codebuff.misc.Utils.writeFile(fileName, pythonST.render());
    Log.WriteLine("wrote python code to " + fileName);
}
public static string testAllLanguages(LangDescriptor[] languages, string[] corpusDirs, string imageFileName)
{
    IList<string> languageNames = BuffUtils.map(languages, l => l.name + "_err");
    // Collections.sort(languageNames);
    IDictionary<string, int?> corpusSizes = new Dictionary<string, int?>();
    for (int i = 0; i < languages.Length; i++)
    {
        LangDescriptor language = languages[i];
        IList<string> filenames = Tool.getFilenames(corpusDirs[i], language.fileRegex);
        corpusSizes[language.name] = filenames.Count;
    }
    IList<string> languageNamesAsStr = BuffUtils.map(languages, l => '"' + l.name + "\\nn=" + corpusSizes[l.name] + '"');
    // Collections.sort(languageNamesAsStr);

    // note: embedding lists below via string concatenation relies on the port's
    // list ToString() printing elements Python-style, as the Java original did
    StringBuilder data = new StringBuilder();
    for (int i = 0; i < languages.Length; i++)
    {
        LangDescriptor language = languages[i];
        string corpus = corpusDirs[i];
        LeaveOneOutValidator validator = new LeaveOneOutValidator(corpus, language);
        Triple<IList<Formatter>, IList<float>, IList<float>> results = validator.validateDocuments(true, "/tmp");
        IList<Formatter> formatters = results.a;
        IList<float> distances = results.b;
        IList<float> errors = results.c;
        // data.append(language.name+"_dist = "+distances+"\n");
        data.Append(language.name + "_err = " + errors + "\n");
    }

    string python =
        "#\n" +
        "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
        "# CodeBuff %s '%s'\n" +
        "#\n" +
        "import numpy as np\n" +
        "import pylab\n" +
        "import matplotlib.pyplot as plt\n\n" +
        "%s\n" +
        "language_data = %s\n" +
        "labels = %s\n" +
        "fig = plt.figure()\n" +
        "ax = plt.subplot(111)\n" +
        "ax.boxplot(language_data,\n" +
        "           whis=[10, 90], # 10 and 90 %% whiskers\n" +
        "           widths=.35,\n" +
        "           labels=labels,\n" +
        "           showfliers=False)\n" +
        "ax.set_xticklabels(labels, rotation=60, fontsize=18)\n" +
        "ax.tick_params(axis='both', which='major', labelsize=18)\n" +
        "plt.xticks(range(1,len(labels)+1), labels, rotation=60, fontsize=18)\n" +
        "pylab.ylim([0,.28])\n" +
        "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n" +
        "ax.set_xlabel(\"Grammar and corpus size\", fontsize=20)\n" +
        "ax.set_ylabel(\"Misclassification Error Rate\", fontsize=20)\n" +
        "# ax.set_title(\"Leave-one-out Validation Using Error Rate\\nBetween Formatted and Original File\")\n" +
        "plt.tight_layout()\n" +
        "fig.savefig('images/%s', format='pdf')\n" +
        "plt.show()\n";
    return string.Format(python, Tool.version, DateTime.Now, data, languageNames, languageNamesAsStr, imageFileName);
}
public virtual Triple<Formatter, float, float> validate(LangDescriptor language, IList<InputDocument> documents, string fileToExclude, int k, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures, string outputDir, bool computeEditDistance, bool collectAnalysis)
{
    string path = System.IO.Path.GetFullPath(fileToExclude);
    IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
    IList<InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));
    Debug.Assert(others.Count == documents.Count - 1);
    // kNNClassifier.resetCache();
    if (excluded.Count == 0)
    {
        Console.Error.WriteLine("Doc not in corpus: " + path);
        return null;
    }
    InputDocument testDoc = excluded[0];

    DateTime start = System.DateTime.Now;
    Corpus corpus = new Corpus(others, language);
    corpus.train();
    DateTime stop = System.DateTime.Now;

    Formatter formatter = new Formatter(corpus, language.indentSize, k, injectWSFeatures, alignmentFeatures);
    InputDocument originalDoc = testDoc;
    DateTime format_start = System.DateTime.Now;
    string output = formatter.format(testDoc, collectAnalysis);
    DateTime format_stop = System.DateTime.Now;

    float editDistance = 0;
    if (computeEditDistance)
    {
        editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
    }
    ClassificationAnalysis analysis = new ClassificationAnalysis(originalDoc, formatter.AnalysisPerToken);
    Console.WriteLine(testDoc.fileName + ": edit distance = " + editDistance + ", error rate = " + analysis.ErrorRate);
    if (!string.ReferenceEquals(outputDir, null))
    {
        string dir = outputDir + "/" + language.name + "/" + Tool.version;
        if (!System.IO.Directory.Exists(dir))
        {
            System.IO.Directory.CreateDirectory(dir);
        }
        org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
    }
    var tms = stop - start;
    var fms = format_stop - format_start;
    trainingTimes.Add(tms.TotalMilliseconds); // TotalMilliseconds, not Milliseconds: we want the full duration
    float tokensPerMS = testDoc.tokens.Size / (float)fms.TotalMilliseconds;
    formattingTokensPerMS.Add((double)tokensPerMS);
    Console.Write("Training time = {0:D} ms, formatting {1:D} ms, {2,5:F3} tokens/ms ({3:D} tokens)\n",
                  (int)tms.TotalMilliseconds, (int)fms.TotalMilliseconds, tokensPerMS, testDoc.tokens.Size);
    // System.out.printf("classify calls %d, hits %d rate %f\n",
    //     kNNClassifier.nClassifyCalls, kNNClassifier.nClassifyCacheHits,
    //     kNNClassifier.nClassifyCacheHits/(float) kNNClassifier.nClassifyCalls);
    // System.out.printf("kNN calls %d, hits %d rate %f\n",
    //     kNNClassifier.nNNCalls, kNNClassifier.nNNCacheHits,
    //     kNNClassifier.nNNCacheHits/(float) kNNClassifier.nNNCalls);
    return new Triple<Formatter, float, float>(formatter, editDistance, analysis.ErrorRate);
}
public static void Main(string[] args)
{
    LangDescriptor[] languages = new LangDescriptor[]
    {
        QUORUM_DESCR,
        JAVA_DESCR,
        JAVA8_DESCR,
        ANTLR4_DESCR,
        SQLITE_NOISY_DESCR,
        SQLITE_CLEAN_DESCR,
        TSQL_NOISY_DESCR,
        TSQL_CLEAN_DESCR
    };
    for (int i = 0; i < languages.Length; i++)
    {
        LangDescriptor language = languages[i];
        runCaptureForOneLanguage(language);
    }
}
public static void Main(string[] args)
{
    string langname = args[0].Substring(1);
    string testFilename = args[1];
    LangDescriptor language = null;
    for (int i = 0; i < languages.Length; i++)
    {
        if (languages[i].name.Equals(langname))
        {
            language = languages[i];
            break;
        }
    }
    if (language == null)
    {
        Log.WriteLine("Language " + langname + " unknown");
        return;
    }

    // load all files up front
    DateTime load_start = System.DateTime.Now;
    IList<string> allFiles = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<InputDocument> documents = Tool.load(allFiles, language);
    DateTime load_stop = System.DateTime.Now;
    long load_time = (long)(load_stop - load_start).TotalMilliseconds; // DateTime subtraction yields a TimeSpan
    Log.Write("Loaded {0:D} files in {1:D}ms\n", documents.Count, load_time);

    string path = System.IO.Path.GetFullPath(testFilename);
    IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
    IList<InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));
    Debug.Assert(others.Count == documents.Count - 1);
    if (excluded.Count == 0)
    {
        Log.WriteLine("Doc not in corpus: " + path);
        return;
    }
    InputDocument testDoc = excluded[0];
    List<int> training = new List<int>();
    List<int> formatting = new List<int>();
    for (int i = 1; i <= TRIALS; i++)
    {
        org.antlr.codebuff.misc.Pair<int, int> timing = test(language, others, testDoc);
        training.Add(timing.a);
        formatting.Add(timing.b);
    }
    // drop the first five trials to let the JIT warm up
    training = training.GetRange(5, training.Count - 5);
    formatting = formatting.GetRange(5, formatting.Count - 5);
    Log.Write("median of [5:{0:D}] training {1:D}ms\n", TRIALS - 1, BuffUtils.median(training));
    Log.Write("median of [5:{0:D}] formatting {1:D}ms\n", TRIALS - 1, BuffUtils.median(formatting));
}
public Corpus(string rootDir, LangDescriptor language)
{
    this.rootDir = rootDir;
    this.language = language;
    if (documents == null)
    {
        IList<string> allFiles = Tool.getFilenames(rootDir, language.fileRegex);
        documents = Tool.load(allFiles, language);
    }
}
public static float[] getMedianErrorRates(LangDescriptor language, int maxNumFiles, int trials)
{
    SubsetValidator validator = new SubsetValidator(language.corpusDir, language);
    IList<InputDocument> documents = Tool.load(validator.allFiles, language);
    float[] medians = new float[Math.Min(documents.Count, maxNumFiles) + 1];

    int ncpu = Environment.ProcessorCount;
    if (FORCE_SINGLE_THREADED)
    {
        ncpu = 2;
    }
    // the original Java used an ExecutorService with one Callable per subset
    // size; Parallel.For is the TPL equivalent here
    var options = new ParallelOptions { MaxDegreeOfParallelism = Math.Max(1, ncpu - 1) };
    int maxSubsetSize = Math.Min(validator.allFiles.Count, maxNumFiles);
    Parallel.For(1, maxSubsetSize + 1, options, corpusSubsetSize =>
    {
        try
        {
            List<float> errorRates = new List<float>();
            for (int trial = 1; trial <= trials; trial++)
            {
                // multiple trials per subset size
                org.antlr.codebuff.misc.Pair<InputDocument, IList<InputDocument>> sample = validator.selectSample(documents, corpusSubsetSize);
                Triple<Formatter, float, float> results = validate(language, sample.b, sample.a, true, false);
                // System.out.println(sample.a.fileName+" n="+corpusSubsetSize+": error="+results.c);
                // System.out.println("\tcorpus =\n\t\t"+Utils.join(sample.b.iterator(), "\n\t\t"));
                errorRates.Add(results.c);
            }
            errorRates.Sort();
            int n = errorRates.Count;
            float median = errorRates[n / 2];
            Console.WriteLine("median " + language.name + " error rate for n=" + corpusSubsetSize + " is " + median);
            medians[corpusSubsetSize] = median; // each job writes a distinct slot
        }
        catch (Exception t)
        {
            Console.Error.WriteLine(t);
        }
    });
    return medians;
}
public static void Main(string[] args)
{
    LangDescriptor[] languages = new LangDescriptor[] { JAVA_DESCR, JAVA8_DESCR, JAVA_GUAVA_DESCR };
    IList<string> corpusDirs = BuffUtils.map(languages, l => l.corpusDir);
    string[] dirs = corpusDirs.ToArray();
    string python = LeaveOneOutValidator.testAllLanguages(languages, dirs, "all_java_leave_one_out.pdf");
    string fileName = "python/src/all_java_leave_one_out.py";
    org.antlr.codebuff.misc.Utils.writeFile(fileName, python);
    Log.WriteLine("wrote python code to " + fileName);
}
public static void Main(string[] args)
{
    LangDescriptor[] languages = new LangDescriptor[] { Tool.ANTLR4_DESCR };
    IList<string> corpusDirs = BuffUtils.map(languages, l => l.corpusDir);
    string[] dirs = corpusDirs.ToArray();
    string python = testAllLanguages(languages, dirs, "leave_one_out.pdf");
    string fileName = "python/src/leave_one_out.py";
    org.antlr.codebuff.misc.Utils.writeFile(fileName, python);
    Console.WriteLine("wrote python code to " + fileName);
}
/// <summary>
/// Get all file contents into input doc list
/// </summary>
public static IList<InputDocument> load(IList<string> fileNames, LangDescriptor language)
{
    IList<InputDocument> documents = new List<InputDocument>();
    foreach (string fileName in fileNames)
    {
        documents.Add(parse(fileName, language));
    }
    if (documents.Count > 0)
    {
        documents[0].parser.Interpreter.ClearDFA(); // free up memory
    }
    return documents;
}
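// Usage sketch (hedged): the typical load-then-train pipeline built from the
// helpers in this file (Tool.getFilenames, load, Corpus, corpus.train()).
// trainOnDirectory is a hypothetical helper name, not part of the codebase.
public static Corpus trainOnDirectory(string corpusDir, LangDescriptor language)
{
    IList<string> files = Tool.getFilenames(corpusDir, language.fileRegex);
    IList<InputDocument> documents = load(files, language);
    Corpus corpus = new Corpus(documents, language);
    corpus.train();
    return corpus;
}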
public static IList<float> getAlignmentErrorRates(LangDescriptor language, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures)
{
    LeaveOneOutValidator validator = new LeaveOneOutValidator(language.corpusDir, language);
    Triple<IList<Formatter>, IList<float>, IList<float>> results = validator.validateDocuments(injectWSFeatures, alignmentFeatures, false, null);
    IList<Formatter> formatters = results.a;
    IList<float> alignErrorRates = new List<float>(); // don't include align errors
    foreach (Formatter formatter in formatters)
    {
        ClassificationAnalysis analysis = new ClassificationAnalysis(formatter.testDoc, formatter.AnalysisPerToken);
        alignErrorRates.Add(analysis.AlignmentErrorRate);
    }
    // System.out.println(results.c);
    // System.out.println("vs");
    // System.out.println(alignErrorRates);
    return alignErrorRates;
}
public static InputDocument parse(string fileName, string content, LangDescriptor language)
{
    ANTLRInputStream input = new ANTLRInputStream(content);
    Lexer lexer = getLexer(language.lexerClass, input);
    input.name = fileName;

    InputDocument doc = new InputDocument(fileName, content, language);
    doc.tokens = new CodeBuffTokenStream(lexer);
    doc.parser = getParser(language.parserClass, doc.tokens);
    doc.parser.BuildParseTree = true;

    // two-stage parsing: try with the fast SLL prediction mode first
    doc.parser.Interpreter.PredictionMode = Antlr4.Runtime.Atn.PredictionMode.SLL;
    doc.parser.ErrorHandler = new BailErrorStrategy();
    doc.parser.RemoveErrorListeners();

    MethodInfo startRule = language.parserClass.GetMethod(language.startRuleName);
    try
    {
        doc.Tree = (ParserRuleContext)startRule.Invoke(doc.parser, (object[])null);
    }
    catch (Exception ex)
    {
        if (ex.InnerException is ParseCanceledException)
        {
            doc.parser.Reset();
            doc.tokens.Reset(); // rewind input stream
            // back to standard listeners/handlers and full LL prediction
            doc.parser.AddErrorListener(new ANTLRErrorListenerAnonymousInnerClass());
            doc.parser.ErrorHandler = new DefaultErrorStrategy();
            doc.parser.Interpreter.PredictionMode = PredictionMode.LL;
            doc.Tree = (ParserRuleContext)startRule.Invoke(doc.parser, (object[])null);
            if (doc.parser.NumberOfSyntaxErrors > 0)
            {
                doc.Tree = null;
            }
        }
        else
        {
            throw; // not an SLL bail-out; don't silently swallow real errors
        }
    }
    return doc;
}
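// Usage sketch (hedged): parse one file and confirm the two-stage parse
// (fast SLL with bail-out, then full LL on ParseCanceledException) produced a
// tree. parsesCleanly is a hypothetical helper name; parse() and doc.Tree
// come from this file.
public static bool parsesCleanly(string fileName, LangDescriptor language)
{
    InputDocument doc = parse(fileName, language);
    return doc.Tree != null && doc.parser.NumberOfSyntaxErrors == 0;
}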
public static void runCaptureForOneLanguage(LangDescriptor language)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<InputDocument> documents = Tool.load(filenames, language);
    foreach (string fileName in filenames)
    {
        // Examine info for this file in isolation
        Corpus fileCorpus = new Corpus(fileName, language);
        fileCorpus.train();
        Console.WriteLine(fileName);
        // examineCorpus(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> ws = getWSContextCategoryMap(fileCorpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> hpos = getHPosContextCategoryMap(fileCorpus);

        // Compare with corpus minus this file
        string path = fileName;
        IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
        Corpus corpus = new Corpus(others, language);
        corpus.train();
        // examineCorpus(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> corpus_ws = getWSContextCategoryMap(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> corpus_hpos = getHPosContextCategoryMap(corpus);

        foreach (FeatureVectorAsObject x in ws.Keys)
        {
            HashBag<int> fwsCats = getCategoriesBag(ws[x]);
            IList<float> fwsRatios = getCategoryRatios(fwsCats.Values);
            HashBag<int> wsCats = getCategoriesBag(corpus_ws[x]);
            IList<float> wsRatios = getCategoryRatios(wsCats.Values);
            // compare file predictions with corpus predictions
            if (!fwsRatios.SequenceEqual(wsRatios))
            {
                Console.WriteLine(fwsRatios + " vs " + wsRatios);
            }
            HashBag<int> fhposCats = getCategoriesBag(hpos[x]);
            HashBag<int> hposCats = getCategoriesBag(corpus_hpos[x]);
        }
        break; // examine only the first file for now
    }
}
public static org.antlr.codebuff.misc.Pair<int, int> test(LangDescriptor language, IList<InputDocument> others, InputDocument testDoc)
{
    var train_start = System.DateTime.Now;
    Corpus corpus = new Corpus(others, language);
    corpus.train();
    var train_stop = System.DateTime.Now;

    var format_start = System.DateTime.Now;
    Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, FEATURES_INJECT_WS, FEATURES_HPOS);
    formatter.format(testDoc, false);
    var format_stop = System.DateTime.Now;

    // DateTime subtraction yields a TimeSpan; report whole milliseconds
    var train_time = (long)(train_stop - train_start).TotalMilliseconds;
    var format_time = (long)(format_stop - format_start).TotalMilliseconds;
    Log.Write("{0} training of {1} = {2:D}ms formatting = {3:D}ms\n", language.name, testDoc.fileName, train_time, format_time);
    return new org.antlr.codebuff.misc.Pair<int, int>((int)train_time, (int)format_time);
}
public static void writePython(LangDescriptor[] languages, IList<int?> ks, float?[][] medians)
{
    StringBuilder data = new StringBuilder();
    StringBuilder plot = new StringBuilder();
    for (int i = 0; i < languages.Length; i++)
    {
        LangDescriptor language = languages[i];
        IList<float?> filteredMedians = BuffUtils.filter(medians[i], m => m != null);
        data.Append(language.name + '=' + filteredMedians + '\n');
        plot.Append(string.Format("ax.plot(ks, {0}, label=\"{1}\", marker='{2}', color='{3}')\n",
                                  language.name, language.name,
                                  nameToGraphMarker[language.name], nameToGraphColor[language.name]));
    }

    string python =
        "#\n" +
        "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
        "# CodeBuff %s '%s'\n" +
        "#\n" +
        "import numpy as np\n" +
        "import matplotlib.pyplot as plt\n\n" +
        "%s\n" +
        "ks = %s\n" +
        "fig = plt.figure()\n" +
        "ax = plt.subplot(111)\n" +
        "%s" +
        "ax.tick_params(axis='both', which='major', labelsize=18)\n" +
        "ax.set_xlabel(\"$k$ nearest neighbors\", fontsize=20)\n" +
        "ax.set_ylabel(\"Median error rate\", fontsize=20)\n" +
        "#ax.set_title(\"k Nearest Neighbors vs\\nLeave-one-out Validation Error Rate\")\n" +
        "plt.legend(fontsize=18)\n\n" +
        "fig.savefig('images/vary_k.pdf', format='pdf')\n" +
        "plt.show()\n";

    string code = string.Format(python, Tool.version, DateTime.Now, data, ks, plot);
    string fileName = "python/src/vary_k.py";
    org.antlr.codebuff.misc.Utils.writeFile(fileName, code);
    Log.WriteLine("wrote python code to " + fileName);
}
public Corpus(IList<InputDocument> documents, LangDescriptor language)
{
    this.documents = documents;
    this.language = language;
}
public SubsetValidator(string rootDir, LangDescriptor language)
{
    this.rootDir = rootDir;
    this.language = language;
    allFiles = Tool.getFilenames(rootDir, language.fileRegex);
}
public virtual Triple<Formatter, float, float> validate(LangDescriptor language, IList<InputDocument> documents, string fileToExclude, int k, string outputDir, bool computeEditDistance, bool collectAnalysis)
{
    return validate(language, documents, fileToExclude, k, Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS, outputDir, computeEditDistance, collectAnalysis);
}
public static void computeConsistency(LangDescriptor language, bool report)
{
    if (report)
    {
        Console.WriteLine("-----------------------------------");
        Console.WriteLine(language.name);
        Console.WriteLine("-----------------------------------");
    }
    Corpus corpus = new Corpus(language.corpusDir, language);
    corpus.train();

    // a map of feature vector to list of exemplar indexes of that feature
    MyMultiMap<FeatureVectorAsObject, int> wsContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();
    MyMultiMap<FeatureVectorAsObject, int> hposContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();

    int n = corpus.featureVectors.Count;
    for (int i = 0; i < n; i++)
    {
        int[] features = corpus.featureVectors[i];
        wsContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), i);
        hposContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), i);
    }

    int num_ambiguous_ws_vectors = 0;
    int num_ambiguous_hpos_vectors = 0;

    // Dump output grouped by ws vs hpos, then feature vector, then category
    if (report)
    {
        Console.WriteLine(" --- INJECT WS ---");
    }
    IList<double> ws_entropies = new List<double>();
    foreach (FeatureVectorAsObject fo in wsContextToIndex.Keys)
    {
        var exemplarIndexes = wsContextToIndex[fo];

        // we have grouped by feature vector; now group by category within that set for ws
        MyMultiMap<int, int> wsCatToIndexes = new MyMultiMap<int, int>();
        foreach (int i in exemplarIndexes)
        {
            wsCatToIndexes.Map(corpus.injectWhitespace[i], i);
        }
        if (wsCatToIndexes.Count == 1)
        {
            continue;
        }
        if (report)
        {
            Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
        }
        IList<int> catCounts = BuffUtils.map(wsCatToIndexes.Values, (x) => x.size());
        double wsEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
        if (report)
        {
            Console.Write("entropy={0,5:F4}\n", wsEntropy);
        }
        wsEntropy *= exemplarIndexes.size();
        ws_entropies.Add(wsEntropy);
        num_ambiguous_ws_vectors += exemplarIndexes.size();
        if (report)
        {
            Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_INJECT_WS));
            foreach (int cat in wsCatToIndexes.Keys)
            {
                var indexes = wsCatToIndexes[cat];
                foreach (int i in indexes)
                {
                    string display = getExemplarDisplay(Trainer.FEATURES_INJECT_WS, corpus, corpus.injectWhitespace, i);
                    Console.WriteLine(display);
                }
                Console.WriteLine();
            }
        }
    }

    if (report)
    {
        Console.WriteLine(" --- HPOS ---");
    }
    IList<double> hpos_entropies = new List<double>();
    foreach (FeatureVectorAsObject fo in hposContextToIndex.Keys)
    {
        MyHashSet<int> exemplarIndexes = hposContextToIndex[fo];

        // we have grouped by feature vector; now group by category within that set for hpos
        MyMultiMap<int, int> hposCatToIndexes = new MyMultiMap<int, int>();
        foreach (int i in exemplarIndexes)
        {
            hposCatToIndexes.Map(corpus.hpos[i], i);
        }
        if (hposCatToIndexes.Count == 1)
        {
            continue;
        }
        if (report)
        {
            Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
        }
        IList<int> catCounts = BuffUtils.map(hposCatToIndexes.Values, (x) => x.size());
        double hposEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
        if (report)
        {
            Console.Write("entropy={0,5:F4}\n", hposEntropy);
        }
        hposEntropy *= exemplarIndexes.size();
        hpos_entropies.Add(hposEntropy);
        num_ambiguous_hpos_vectors += exemplarIndexes.size();
        if (report)
        {
            Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_HPOS));
            foreach (int cat in hposCatToIndexes.Keys)
            {
                var indexes = hposCatToIndexes[cat];
                foreach (int i in indexes)
                {
                    string display = getExemplarDisplay(Trainer.FEATURES_HPOS, corpus, corpus.hpos, i);
                    Console.WriteLine(display);
                }
                Console.WriteLine();
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine(language.name);
    Console.WriteLine("There are " + wsContextToIndex.Count + " unique ws feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * wsContextToIndex.Count / n));
    Console.WriteLine("There are " + hposContextToIndex.Count + " unique hpos feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * hposContextToIndex.Count / n));
    float prob_ws_ambiguous = num_ambiguous_ws_vectors / (float)n;
    Console.Write("num_ambiguous_ws_vectors   = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_ws_vectors, n, prob_ws_ambiguous);
    float prob_hpos_ambiguous = num_ambiguous_hpos_vectors / (float)n;
    Console.Write("num_ambiguous_hpos_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_hpos_vectors, n, prob_hpos_ambiguous);
    // Collections.sort(ws_entropies);
    // System.out.println("ws_entropies="+ws_entropies);
    Console.WriteLine("ws median,mean = " + BuffUtils.median(ws_entropies) + "," + BuffUtils.mean(ws_entropies));
    double expected_ws_entropy = (BuffUtils.sumDoubles(ws_entropies) / num_ambiguous_ws_vectors) * prob_ws_ambiguous;
    Console.WriteLine("expected_ws_entropy=" + expected_ws_entropy);
    Console.WriteLine("hpos median,mean = " + BuffUtils.median(hpos_entropies) + "," + BuffUtils.mean(hpos_entropies));
    double expected_hpos_entropy = (BuffUtils.sumDoubles(hpos_entropies) / num_ambiguous_hpos_vectors) * prob_hpos_ambiguous;
    Console.WriteLine("expected_hpos_entropy=" + expected_hpos_entropy);
}
public static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.Error.WriteLine("Dbg [-leave-one-out] [-java|-java8|-antlr|-sqlite|-tsql] test-file");
        return; // usage error; nothing to do
    }
    int arg = 0;
    bool collectAnalysis = true;
    string language = args[arg++];
    language = language.Substring(1);
    string testFilename = args[arg];
    string output = "???";
    InputDocument testDoc = null;
    IList<TokenPositionAnalysis> analysisPerToken = null;
    LangDescriptor lang = null;
    System.DateTime start, stop;
    for (int i = 0; i < Tool.languages.Length; i++)
    {
        if (Tool.languages[i].name.Equals(language))
        {
            lang = Tool.languages[i];
            break;
        }
    }
    if (lang != null)
    {
        start = System.DateTime.Now;
        LeaveOneOutValidator validator = new LeaveOneOutValidator(lang.corpusDir, lang);
        Triple<Formatter, float, float> val = validator.validateOneDocument(testFilename, null, collectAnalysis);
        testDoc = Tool.parse(testFilename, lang);
        stop = System.DateTime.Now;
        Formatter formatter = val.a;
        output = formatter.Output;
        Console.WriteLine("output len = " + output.Length);

        float editDistance = normalizedLevenshteinDistance(testDoc.content, output);
        Console.WriteLine("normalized Levenshtein distance: " + editDistance);

        analysisPerToken = formatter.AnalysisPerToken;

        // compare the whitespace-only token streams of original vs formatted text
        Regex rex = new Regex("^\\s+$");
        CommonTokenStream original_tokens = Tool.tokenize(testDoc.content, lang.lexerClass);
        IList<Token> wsTokens = BuffUtils.filter(original_tokens.GetTokens(), t => rex.IsMatch(t.Text));
        string originalWS = tokenText(wsTokens);
        Console.WriteLine("origin ws tokens len: " + originalWS.Length);

        CommonTokenStream formatted_tokens = Tool.tokenize(output, lang.lexerClass);
        wsTokens = BuffUtils.filter(formatted_tokens.GetTokens(), t => rex.IsMatch(t.Text));
        string formattedWS = tokenText(wsTokens);
        Console.WriteLine("formatted ws tokens len: " + formattedWS.Length);

        editDistance = levenshteinDistance(originalWS, formattedWS);
        editDistance /= Math.Max(testDoc.content.Length, output.Length);
        Console.WriteLine("Levenshtein distance of ws normalized to output len: " + editDistance);

        ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, analysisPerToken);
        Console.WriteLine(analysis);

        // GUIController controller;
        // controller = new GUIController(analysisPerToken, testDoc, output, lang.lexerClass);
        // controller.show();
        // System.out.println(output);
        // Console.Write("formatting time {0:D}s\n", (stop - start) / 1000000);
        Console.Write("classify calls {0:D}, hits {1:D} rate {2:F}\n", kNNClassifier.nClassifyCalls, kNNClassifier.nClassifyCacheHits, kNNClassifier.nClassifyCacheHits / (float)kNNClassifier.nClassifyCalls);
        Console.Write("kNN calls {0:D}, hits {1:D} rate {2:F}\n", kNNClassifier.nNNCalls, kNNClassifier.nNNCacheHits, kNNClassifier.nNNCacheHits / (float)kNNClassifier.nNNCalls);
    }
}
public static void Main(string[] args)
{
    LangDescriptor[] languages = new LangDescriptor[] { Tool.ANTLR4_DESCR };
    testFeatures(languages, false);
}
/// <summary>
/// Parse doc and fill tree and tokens fields
/// </summary>
public static InputDocument parse(string fileName, LangDescriptor language)
{
    string content = load(fileName, language.indentSize);
    return parse(fileName, content, language);
}
public static string Main(object[] args)
{
    Log.Reset();
    try
    {
        if (args.Length < 7)
        {
            Log.WriteLine("org.antlr.codebuff.Tool -g grammar-name -rule start-rule -corpus root-dir-of-samples \\\n" +
                          "   [-files file-extension] [-indent num-spaces] \\" +
                          "   [-comment line-comment-name] [-o output-file] file-to-format");
            return Log.Message();
        }
        formatted_output = null;
        string outputFileName = "";
        string grammarName = null;
        string startRule = null;
        string corpusDir = null;
        string indentS = "4";
        string commentS = null;
        string input_file_name = null;
        string fileExtension = null;
        int i = 0;
        Type parserClass = null;
        Type lexerClass = null;
        while (i < args.Length && ((string)args[i]).StartsWith("-", StringComparison.Ordinal))
        {
            switch ((string)args[i])
            {
            case "-g":
                i++;
                grammarName = (string)args[i++];
                break;
            case "-lexer":
                i++;
                lexerClass = (Type)args[i++];
                break;
            case "-parser":
                i++;
                parserClass = (Type)args[i++];
                break;
            case "-rule":
                i++;
                startRule = (string)args[i++];
                break;
            case "-corpus":
                i++;
                corpusDir = (string)args[i++];
                break;
            case "-files":
                i++;
                fileExtension = (string)args[i++];
                break;
            case "-indent":
                i++;
                indentS = (string)args[i++];
                break;
            case "-comment":
                i++;
                commentS = (string)args[i++];
                break;
            case "-o":
                i++;
                outputFileName = (string)args[i++];
                break;
            case "-inoutstring":
                i++;
                formatted_output = "";
                outputFileName = null;
                break;
            }
        }
        input_file_name = (string)args[i]; // must be last
        Log.WriteLine("gramm: " + grammarName);
        string lexerClassName = grammarName + "Lexer";
        Lexer lexer = null;
        if (lexerClass == null || parserClass == null)
        {
            Log.WriteLine("You must specify a lexer and parser.");
            return Log.Message();
        }
        int indentSize = int.Parse(indentS);
        int singleLineCommentType = -1;
        if (!string.ReferenceEquals(commentS, null))
        {
            try
            {
                lexer = getLexer(lexerClass, null);
            }
            catch (Exception e)
            {
                Log.WriteLine("Can't instantiate lexer " + lexerClassName);
                Log.WriteLine(e.StackTrace);
            }
            if (lexer == null)
            {
                return Log.Message();
            }
            IDictionary<string, int> tokenTypeMap = lexer.TokenTypeMap;
            if (tokenTypeMap.ContainsKey(commentS))
            {
                singleLineCommentType = tokenTypeMap[commentS];
            }
        }
        string fileRegex = null;
        if (!string.ReferenceEquals(fileExtension, null))
        {
            // build a regex matching any of the ;-separated extensions
            var pattern = "";
            var allowable_suffices = fileExtension.Split(';').ToList<string>();
            foreach (var s in allowable_suffices)
            {
                var no_dot = s.Substring(s.IndexOf('.') + 1);
                pattern = pattern == "" ? ("(" + no_dot) : (pattern + "|" + no_dot);
            }
            pattern = pattern + ")";
            fileRegex = ".*\\." + pattern;
        }
        LangDescriptor language = new LangDescriptor(grammarName, corpusDir, fileRegex, lexerClass, parserClass, startRule, indentSize, singleLineCommentType);

        // load all corpus files up front
        IList<string> allFiles = getFilenames(language.corpusDir, language.fileRegex);
        IList<InputDocument> documents = load(allFiles, language);

        // Handle formatting whether the document is passed as a file or a string.
        if (unformatted_input == null)
        {
            // Don't include the file to format in the corpus itself.
            string path = System.IO.Path.GetFullPath(input_file_name);
            IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
            // Train the formatter.
            Corpus corpus = new Corpus(others, language);
            corpus.train();
            // Parse the code contained in the file.
            InputDocument unformatted_document = parse(input_file_name, language);
            // Format the document.
            Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS);
            formatted_output = formatter.format(unformatted_document, false);
        }
        else
        {
            // Train the formatter.
            Corpus corpus = new Corpus(documents, language);
            corpus.train();
            // Parse the code that was passed as a string.
            InputDocument unformatted_document = parse(input_file_name, unformatted_input, language);
            // Format the document.
            Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS);
            formatted_output = formatter.format(unformatted_document, false);
        }
        if (outputFileName != null && outputFileName == "")
        {
            Log.WriteLine(formatted_output);
        }
        else if (!string.IsNullOrEmpty(outputFileName))
        {
            org.antlr.codebuff.misc.Utils.writeFile(outputFileName, formatted_output);
        }
    }
    catch (Exception)
    {
        throw; // rethrow without resetting the stack trace
    }
    return formatted_output;
}
public static void Main(string[] args)
{
    LangDescriptor[] languages = new LangDescriptor[] { Tool.ANTLR4_DESCR };

    int MAX_K = 98; // should be odd
    int OUTLIER_K = 99;
    IList<int?> ks = new List<int?>();
    for (int i = 1; i <= MAX_K; i += 2)
    {
        ks.Add(i);
    }
    ks.Add(OUTLIER_K);

    // track medians[language][k]; entries stay null for k values we never test
    float?[][] medians = new float?[languages.Length + 1][];

    int ncpu = 1;
    if (FORCE_SINGLE_THREADED)
    {
        ncpu = 2;
    }
    // the original Java used an ExecutorService; Parallel.Invoke is the TPL equivalent
    var jobs = new List<Action>();
    for (int i = 0; i < languages.Length; i++)
    {
        LangDescriptor language = languages[i];
        int langIndex = i;
        Log.WriteLine(language.name);
        medians[langIndex] = new float?[OUTLIER_K + 1]; // allocate once per language, not once per k
        foreach (int k in ks)
        {
            int kk = k; // stable copy for the closure
            jobs.Add(() =>
            {
                try
                {
                    TestK tester = new TestK(language.corpusDir, language, kk);
                    List<float?> errorRates = new List<float?>(tester.scoreDocuments());
                    errorRates.Sort();
                    int n = errorRates.Count;
                    float median = errorRates[n / 2].Value;
                    // double var = BuffUtils.varianceFloats(errorRates);
                    // String display = String.format("%5.4f, %5.4f, %5.4f, %5.4f, %5.4f", min, quart, median, quart3, max);
                    medians[langIndex][kk] = median;
                }
                catch (Exception t)
                {
                    Console.Error.WriteLine(t);
                }
            });
        }
    }
    var options = new ParallelOptions { MaxDegreeOfParallelism = Math.Max(1, ncpu - 1) };
    Parallel.Invoke(options, jobs.ToArray());
    writePython(languages, ks, medians);
}
public static void runCaptureForOneLanguage(LangDescriptor language)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<float> selfEditDistances = new List<float>();
    foreach (string fileName in filenames)
    {
        Corpus fileCorpus = new Corpus(fileName, language);
        fileCorpus.train();
        InputDocument testDoc = Tool.parse(fileName, fileCorpus.language);
        Formatter formatter = new Formatter(fileCorpus, language.indentSize);
        string output = formatter.format(testDoc, false);
        // System.out.println(output);
        float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
        Log.WriteLine(fileName + " edit distance " + editDistance);
        selfEditDistances.Add(editDistance);
    }

    Corpus corpus = new Corpus(language.corpusDir, language);
    corpus.train();
    IList<float> corpusEditDistances = new List<float>();
    foreach (string fileName in filenames)
    {
        InputDocument testDoc = Tool.parse(fileName, corpus.language);
        Formatter formatter = new Formatter(corpus, language.indentSize);
        string output = formatter.format(testDoc, false);
        // System.out.println(output);
        float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
        Log.WriteLine(fileName + "+corpus edit distance " + editDistance);
        corpusEditDistances.Add(editDistance);
    }

    // heh this gives info on within-corpus variability. i.e., how good/consistent is my corpus?
    // those files with big differences are candidates for dropping from the corpus or for cleanup.
    IList<string> labels = BuffUtils.map(filenames, f => '"' + System.IO.Path.GetFileName(f) + '"');
    string python =
        "#\n" +
        "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
        "# CodeBuff <version> '<date>'\n" +
        "#\n" +
        "import numpy as np\n" +
        "import matplotlib.pyplot as plt\n\n" +
        "fig = plt.figure()\n" +
        "ax = plt.subplot(111)\n" +
        "labels = <labels>\n" +
        "N = len(labels)\n\n" +
        "featureIndexes = range(0,N)\n" +
        "<lang>_self = <selfEditDistances>\n" +
        "<lang>_corpus = <corpusEditDistances>\n" +
        "<lang>_diff = np.abs(np.subtract(<lang>_self, <lang>_corpus))\n\n" +
        "all = zip(<lang>_self, <lang>_corpus, <lang>_diff, labels)\n" +
        "all = sorted(all, key=lambda x : x[2], reverse=True)\n" +
        "<lang>_self, <lang>_corpus, <lang>_diff, labels = zip(*all)\n\n" +
        "ax.plot(featureIndexes, <lang>_self, label=\"<lang>_self\")\n" +
        "#ax.plot(featureIndexes, <lang>_corpus, label=\"<lang>_corpus\")\n" +
        "ax.plot(featureIndexes, <lang>_diff, label=\"<lang>_diff\")\n" +
        "ax.set_xticklabels(labels, rotation=60, fontsize=8)\n" +
        "plt.xticks(featureIndexes, labels, rotation=60)\n" +
        "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n\n" +
        "ax.text(1, .25, 'median $f$ self distance = %5.3f, corpus+$f$ distance = %5.3f' %" +
        " (np.median(<lang>_self),np.median(<lang>_corpus)))\n" +
        "ax.set_xlabel(\"File Name\")\n" +
        "ax.set_ylabel(\"Edit Distance\")\n" +
        "ax.set_title(\"Difference between Formatting File <lang> $f$\\nwith Training=$f$ and Training=$f$+Corpus\")\n" +
        "plt.legend()\n" +
        "plt.tight_layout()\n" +
        "fig.savefig(\"images/" + language.name + "_one_file_capture.pdf\", format='pdf')\n" +
        "plt.show()\n";
    ST pythonST = new ST(python);
    pythonST.add("lang", language.name);
    pythonST.add("version", version);
    pythonST.add("date", DateTime.Now);
    pythonST.add("labels", labels.ToString());
    pythonST.add("selfEditDistances", selfEditDistances.ToString());
    pythonST.add("corpusEditDistances", corpusEditDistances.ToString());
    string code = pythonST.render();
    string fileName2 = "python/src/" + language.name + "_one_file_capture.py";
    org.antlr.codebuff.misc.Utils.writeFile(fileName2, code);
    Log.WriteLine("wrote python code to " + fileName2);
}
public LeaveOneOutValidator(string rootDir, LangDescriptor language)
{
    this.rootDir = rootDir;
    this.language = language;
    random = new Random(DOCLIST_RANDOM_SEED);
}
public TestK(string rootDir, LangDescriptor language, int k) : base(rootDir, language)
{
    this.k = k;
}
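// Usage sketch (hedged): score every corpus document for a fixed k, as each
// job in the vary-k driver above does. medianErrorRateForK is a hypothetical
// helper name; TestK and scoreDocuments() come from this codebase.
public static float medianErrorRateForK(LangDescriptor language, int k)
{
    TestK tester = new TestK(language.corpusDir, language, k);
    List<float?> errorRates = new List<float?>(tester.scoreDocuments());
    errorRates.Sort();
    return errorRates[errorRates.Count / 2].Value;
}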