Exemplo n.º 1
0
        /// <summary>
        /// Formats the language's corpus, then repeatedly re-formats the previous
        /// stage's output, collecting one median figure per stage.
        /// </summary>
        /// <param name="language">Descriptor of the language whose corpus is formatted.</param>
        /// <returns>
        /// A list whose first entry is the median of the third result list of the
        /// initial validation run; each later entry is the median normalized
        /// Levenshtein distance between a stage's output and the initial output.
        /// </returns>
        public static IList <float?> checkStability(LangDescriptor language)
        {
            const string stageRoot = "/tmp/stability/";

            IList <float?> mediansPerStage = new List <float?>();

            // Stage 0: format the original corpus into the first stage directory.
            LeaveOneOutValidator baselineValidator = new LeaveOneOutValidator(language.corpusDir, language);
            Triple <IList <Formatter>, IList <float>, IList <float> > baseline = baselineValidator.validateDocuments(false, stageRoot + 1);
            IList <Formatter> baselineFormatters = baseline.a;

            mediansPerStage.Add(BuffUtils.median(baseline.c));

            // Every later stage reads the previous stage's output directory.
            int stage = 1;
            while (stage <= STAGES)
            {
                LeaveOneOutValidator stageValidator = new LeaveOneOutValidator(stageRoot + stage, language);
                Triple <IList <Formatter>, IList <float>, IList <float> > stageResults = stageValidator.validateDocuments(false, stageRoot + (stage + 1));
                IList <Formatter> stageFormatters = stageResults.a;

                // Compare each document's output with the stage-0 output of the document at the same index.
                IList <float?> stageDistances = new List <float?>();
                for (int doc = 0; doc < stageFormatters.Count; doc++)
                {
                    Formatter original = baselineFormatters[doc];
                    Formatter current  = stageFormatters[doc];
                    stageDistances.Add(Dbg.normalizedLevenshteinDistance(current.Output, original.Output));
                }
                mediansPerStage.Add(BuffUtils.median(stageDistances));
                stage++;
            }

            return mediansPerStage;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Trains a corpus on <paramref name="documents"/>, formats
        /// <paramref name="testDoc"/> with it and reports how the result compares
        /// with the original document.
        /// </summary>
        /// <param name="language">Language descriptor; supplies the indent size and the output sub-directory name.</param>
        /// <param name="documents">Documents used to train the corpus.</param>
        /// <param name="testDoc">Document to format and analyze.</param>
        /// <param name="saveOutput">When true, the formatted text is written under <c>outputDir/&lt;language name&gt;</c>.</param>
        /// <param name="computeEditDistance">When true, the normalized Levenshtein distance between the original content and the output is computed; otherwise 0 is reported.</param>
        /// <returns>The formatter used, the edit distance (or 0), and the classification error rate.</returns>
        public static Triple <Formatter, float, float> validate(LangDescriptor language, IList <InputDocument> documents, InputDocument testDoc, bool saveOutput, bool computeEditDistance)
        {
            Corpus corpus = new Corpus(documents, language);

            corpus.train();

            Formatter formatter    = new Formatter(corpus, language.indentSize);
            string    output       = formatter.format(testDoc, false);
            float     editDistance = 0;

            if (computeEditDistance)
            {
                editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
            }
            ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.AnalysisPerToken);

            if (saveOutput)
            {
                // outputDir is not a parameter; presumably a member of the enclosing class -- verify.
                string dir = outputDir + "/" + language.name;

                // CreateDirectory is a no-op when the directory already exists.
                System.IO.Directory.CreateDirectory(dir);
                org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
            }

            // Construct exactly the declared return type (non-nullable floats).
            return(new Triple <Formatter, float, float>(formatter, editDistance, analysis.ErrorRate));
        }
Exemplo n.º 3
0
        /// <summary>
        /// Computes median error rates per corpus-subset size for each configured
        /// language and writes a matplotlib script plotting them to
        /// <c>python/src/subset_validator.py</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            LangDescriptor[] languages = new LangDescriptor[] { Tool.ANTLR4_DESCR };

            int maxNumFiles = 30;
            int trials      = 50;
            IDictionary <string, float[]> results = new Dictionary <string, float[]>();

            foreach (LangDescriptor language in languages)
            {
                results[language.name] = getMedianErrorRates(language, maxNumFiles, trials);
            }

            // StringTemplate source; <rest(...)> skips the first element of each median array.
            string python =
                "#\n" +
                "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
                "# CodeBuff <version> '<date>'\n" +
                "#\n" +
                "import numpy as np\n" +
                "import matplotlib.pyplot as plt\n\n" +
                "fig = plt.figure()\n" +
                "ax = plt.subplot(111)\n" +
                "N = <maxNumFiles>\n" +
                "sizes = range(1,N+1)\n" +
                "<results:{r |\n" +
                "<r> = [<rest(results.(r)); separator={,}>]\n" +
                "ax.plot(range(1,len(<r>)+1), <r>, label=\"<r>\", marker='<markers.(r)>', color='<colors.(r)>')\n" +
                "}>\n" +
                "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n" +
                "ax.set_xlabel(\"Number of training files in sample corpus subset\", fontsize=14)\n" +
                "ax.set_ylabel(\"Median Error rate for <trials> trials\", fontsize=14)\n" +
                "ax.set_title(\"Effect of Corpus size on Median Leave-one-out Validation Error Rate\")\n" +
                "plt.legend()\n" +
                "plt.tight_layout()\n" +
                "fig.savefig('images/subset_validator.pdf', format='pdf')\n" +
                "plt.show()\n";
            ST pythonST = new ST(python);

            pythonST.add("results", results);
            pythonST.add("markers", LeaveOneOutValidator.nameToGraphMarker);
            pythonST.add("colors", LeaveOneOutValidator.nameToGraphColor);
            pythonST.add("version", version);
            pythonST.add("date", DateTime.Now);
            pythonST.add("trials", trials);
            pythonST.add("maxNumFiles", maxNumFiles);

            string fileName = "python/src/subset_validator.py";

            org.antlr.codebuff.misc.Utils.writeFile(fileName, pythonST.render());
            Console.WriteLine("wrote python code to " + fileName);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Runs the stability check for each configured language, logs the
        /// per-stage values and writes a matplotlib script plotting them to
        /// <c>python/src/stability.py</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            LeaveOneOutValidator.FORCE_SINGLE_THREADED = true;             // need this when we compare results file by file
            LangDescriptor[] languages = new LangDescriptor[] { QUORUM_DESCR };

            IDictionary <string, IList <float?> > results = new Dictionary <string, IList <float?> >();

            foreach (LangDescriptor language in languages)
            {
                IList <float?> errorRates = checkStability(language);

                // Join the values explicitly: concatenating an IList prints its type name, not its contents.
                Log.WriteLine(language.name + " [" + string.Join(", ", errorRates) + "]");
                results[language.name] = errorRates;
            }
            foreach (KeyValuePair <string, IList <float?> > entry in results)
            {
                Log.WriteLine(entry.Key + " = [" + string.Join(", ", entry.Value) + "]");
            }

            // StringTemplate source for the generated plotting script.
            string python =
                "#\n" +
                "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
                "# CodeBuff <version> '<date>'\n" +
                "#\n" +
                "import numpy as np\n" +
                "import matplotlib.pyplot as plt\n\n" +
                "import matplotlib\n" +
                "fig = plt.figure()\n" +
                "ax = plt.subplot(111)\n" +
                "N = <N>\n" +
                "sizes = range(0,N)\n" +
                "<results:{r |\n" +
                "<r> = [<results.(r); separator={,}>]\n" +
                "ax.plot(sizes, <r>, label=\"<r>\", marker='o')\n" +
                "}>\n" +
                "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n" +
                "xa = ax.get_xaxis()\n" +
                "xa.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))\n" +
                "ax.set_xlabel(\"Formatting Stage; stage 0 is first formatting pass\")\n" +
                "ax.set_ylabel(\"Median Leave-one-out Validation Error Rate\")\n" +
                "ax.set_title(\"<N>-Stage Formatting Stability\\nStage $n$ is formatted output of stage $n-1$\")\n" +
                "plt.legend()\n" +
                "plt.tight_layout()\n" +
                "fig.savefig('images/stability.pdf', format='pdf')\n" +
                "plt.show()\n";
            ST pythonST = new ST(python);

            pythonST.add("results", results);
            pythonST.add("version", version);
            pythonST.add("date", DateTime.Now);
            pythonST.add("N", STAGES + 1);             // one initial pass plus STAGES re-formatting passes
            string fileName = "python/src/stability.py";

            org.antlr.codebuff.misc.Utils.writeFile(fileName, pythonST.render());
            Log.WriteLine("wrote python code to " + fileName);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Runs leave-one-out validation for every language and returns the source
        /// of a matplotlib script that box-plots the per-document error rates.
        /// </summary>
        /// <param name="languages">Languages to validate.</param>
        /// <param name="corpusDirs">Corpus directory for each language, index-aligned with <paramref name="languages"/>.</param>
        /// <param name="imageFileName">File name (under <c>images/</c>) the generated script saves the figure to.</param>
        /// <returns>The generated Python source text.</returns>
        public static string testAllLanguages(LangDescriptor[] languages, string[] corpusDirs, string imageFileName)
        {
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            IList <string> languageNames = BuffUtils.map(languages, l => l.name + "_err");
            IDictionary <string, int> corpusSizes = new Dictionary <string, int>();

            for (int i = 0; i < languages.Length; i++)
            {
                LangDescriptor language  = languages[i];
                IList <string> filenames = Tool.getFilenames(corpusDirs[i], language.fileRegex);
                corpusSizes[language.name] = filenames.Count;
            }
            IList <string> languageNamesAsStr = BuffUtils.map(languages, l => "\"" + l.name + "\\nn=" + corpusSizes[l.name] + "\"");

            StringBuilder data = new StringBuilder();

            for (int i = 0; i < languages.Length; i++)
            {
                LangDescriptor       language  = languages[i];
                LeaveOneOutValidator validator = new LeaveOneOutValidator(corpusDirs[i], language);
                Triple <IList <Formatter>, IList <float>, IList <float> > results = validator.validateDocuments(true, "/tmp");
                IList <float> errors = results.c;

                // Emit a Python list literal; invariant culture keeps '.' as the decimal separator.
                string errorList = string.Join(", ", System.Linq.Enumerable.Select(errors, e => e.ToString(invariant)));
                data.Append(language.name + "_err = [" + errorList + "]\n");
            }

            // Python list literals of the per-language variable names and of the quoted axis labels.
            string languageData = "[" + string.Join(", ", languageNames) + "]";
            string labels       = "[" + string.Join(", ", languageNamesAsStr) + "]";

            // Composite-format template: {0} version, {1} date, {2} data, {3} variables, {4} labels, {5} image file.
            string python =
                "#\n" +
                "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
                "# CodeBuff {0} '{1}'\n" +
                "#\n" +
                "import numpy as np\n" +
                "import pylab\n" +
                "import matplotlib.pyplot as plt\n\n" +
                "{2}\n" +
                "language_data = {3}\n" +
                "labels = {4}\n" +
                "fig = plt.figure()\n" +
                "ax = plt.subplot(111)\n" +
                "ax.boxplot(language_data,\n" +
                "           whis=[10, 90], # 10 and 90 % whiskers\n" +
                "           widths=.35,\n" +
                "           labels=labels,\n" +
                "           showfliers=False)\n" +
                "ax.set_xticklabels(labels, rotation=60, fontsize=18)\n" +
                "ax.tick_params(axis='both', which='major', labelsize=18)\n" +
                "plt.xticks(range(1,len(labels)+1), labels, rotation=60, fontsize=18)\n" +
                "pylab.ylim([0,.28])\n" +
                "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n" +
                "ax.set_xlabel(\"Grammar and corpus size\", fontsize=20)\n" +
                "ax.set_ylabel(\"Misclassification Error Rate\", fontsize=20)\n" +
                "# ax.set_title(\"Leave-one-out Validation Using Error Rate\\nBetween Formatted and Original File\")\n" +
                "plt.tight_layout()\n" +
                "fig.savefig('images/{5}', format='pdf')\n" +
                "plt.show()\n";

            return string.Format(invariant, python, Tool.version, DateTime.Now, data, languageData, labels, imageFileName);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Leave-one-out validation of a single file: trains on every document except
        /// <paramref name="fileToExclude"/>, formats the excluded document, and records
        /// training time and formatting throughput.
        /// </summary>
        /// <param name="language">Language descriptor; supplies the indent size and the output sub-directory name.</param>
        /// <param name="documents">All loaded documents, including the one to exclude.</param>
        /// <param name="fileToExclude">Path of the document to hold out; resolved to a full path before matching on <c>fileName</c>.</param>
        /// <param name="k">Passed through to the <c>Formatter</c> constructor.</param>
        /// <param name="injectWSFeatures">Whitespace-injection feature set passed to the formatter.</param>
        /// <param name="alignmentFeatures">Alignment feature set passed to the formatter.</param>
        /// <param name="outputDir">Root directory for the formatted output, or null to skip writing.</param>
        /// <param name="computeEditDistance">When true, the normalized Levenshtein distance between original and output is computed; otherwise 0 is reported.</param>
        /// <param name="collectAnalysis">Passed through to <c>Formatter.format</c>.</param>
        /// <returns>Formatter, edit distance and error rate; null when the file is not among <paramref name="documents"/>.</returns>
        public virtual Triple <Formatter, float, float> validate(LangDescriptor language, IList <InputDocument> documents, string fileToExclude, int k, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures, string outputDir, bool computeEditDistance, bool collectAnalysis)
        {
            string path = System.IO.Path.GetFullPath(fileToExclude);
            IList <InputDocument> others   = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
            IList <InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));

            // Report a missing document before asserting, so debug builds also show the message.
            if (excluded.Count == 0)
            {
                Console.Error.WriteLine("Doc not in corpus: " + path);
                return(null);
            }
            Debug.Assert(others.Count == documents.Count - 1);

            InputDocument testDoc = excluded[0];

            // Time corpus construction plus training.
            System.Diagnostics.Stopwatch trainingTimer = System.Diagnostics.Stopwatch.StartNew();
            Corpus corpus = new Corpus(others, language);

            corpus.train();
            trainingTimer.Stop();

            Formatter formatter = new Formatter(corpus, language.indentSize, k, injectWSFeatures, alignmentFeatures);

            // Time only the formatting call itself.
            System.Diagnostics.Stopwatch formatTimer = System.Diagnostics.Stopwatch.StartNew();
            string output = formatter.format(testDoc, collectAnalysis);

            formatTimer.Stop();

            float editDistance = 0;

            if (computeEditDistance)
            {
                editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
            }
            ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.AnalysisPerToken);

            Console.WriteLine(testDoc.fileName + ": edit distance = " + editDistance + ", error rate = " + analysis.ErrorRate);
            if (outputDir != null)
            {
                string dir = outputDir + "/" + language.name + "/" + Tool.version;

                // CreateDirectory is a no-op when the directory already exists.
                System.IO.Directory.CreateDirectory(dir);
                org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
            }

            // Use total elapsed milliseconds; TimeSpan.Milliseconds is only the 0-999 component.
            trainingTimes.Add(trainingTimer.Elapsed.TotalMilliseconds);
            float tokensPerMS = testDoc.tokens.Size / (float)formatTimer.Elapsed.TotalMilliseconds;

            formattingTokensPerMS.Add((double)tokensPerMS);

            // The "D" specifier needs integral arguments, so pass whole milliseconds rather than TimeSpan values.
            Console.Write("Training time = {0:D} ms, formatting {1:D} ms, {2,5:F3} tokens/ms ({3:D} tokens)\n", trainingTimer.ElapsedMilliseconds, formatTimer.ElapsedMilliseconds, tokensPerMS, testDoc.tokens.Size);
            return(new Triple <Formatter, float, float>(formatter, editDistance, analysis.ErrorRate));
        }
Exemplo n.º 7
0
 /// <summary>
 /// Runs <c>runCaptureForOneLanguage</c> once for each of the configured
 /// language descriptors, in the order listed.
 /// </summary>
 /// <param name="args">Command-line arguments; not used.</param>
 public static void Main(string[] args)
 {
     LangDescriptor[] targets = new LangDescriptor[]
     {
         QUORUM_DESCR,
         JAVA_DESCR,
         JAVA8_DESCR,
         ANTLR4_DESCR,
         SQLITE_NOISY_DESCR,
         SQLITE_CLEAN_DESCR,
         TSQL_NOISY_DESCR,
         TSQL_CLEAN_DESCR
     };

     foreach (LangDescriptor target in targets)
     {
         runCaptureForOneLanguage(target);
     }
 }
Exemplo n.º 8
0
        /// <summary>
        /// Times training and formatting of one test file against the rest of its
        /// language's corpus over <c>TRIALS</c> runs and logs the median timings.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c> is the language name preceded by one prefix character
        /// (the first character is stripped); <c>args[1]</c> is the test file name.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length < 2 || args[0].Length == 0)
            {
                Log.WriteLine("Expected two arguments: a language name preceded by one prefix character, and a test file name");
                return;
            }
            string         langname     = args[0].Substring(1);
            string         testFilename = args[1];
            LangDescriptor language     = null;

            foreach (LangDescriptor candidate in languages)
            {
                if (candidate.name.Equals(langname))
                {
                    language = candidate;
                    break;
                }
            }
            if (language == null)
            {
                Log.WriteLine("Language " + langname + " unknown");
                return;
            }

            // load all files up front
            System.Diagnostics.Stopwatch loadTimer = System.Diagnostics.Stopwatch.StartNew();
            IList <string>        allFiles  = Tool.getFilenames(language.corpusDir, language.fileRegex);
            IList <InputDocument> documents = Tool.load(allFiles, language);

            loadTimer.Stop();
            long load_time = loadTimer.ElapsedMilliseconds;

            Log.Write("Loaded {0:D} files in {1:D}ms\n", documents.Count, load_time);

            string path = System.IO.Path.GetFullPath(testFilename);
            IList <InputDocument> others   = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
            IList <InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));

            // Report a missing document before asserting, so debug builds also show the message.
            if (excluded.Count == 0)
            {
                Log.WriteLine("Doc not in corpus: " + path);
                return;
            }
            Debug.Assert(others.Count == documents.Count - 1);

            InputDocument testDoc = excluded[0];

            IList <int> training   = new List <int>();
            IList <int> formatting = new List <int>();

            for (int i = 1; i <= TRIALS; i++)
            {
                org.antlr.codebuff.misc.Pair <int, int> timing = test(language, others, testDoc);
                training.Add(timing.a);
                formatting.Add(timing.b);
            }

            // Drop the first five trials (indexes 0-4) before taking medians.
            training   = System.Linq.Enumerable.ToList(System.Linq.Enumerable.Skip(training, 5));
            formatting = System.Linq.Enumerable.ToList(System.Linq.Enumerable.Skip(formatting, 5));
            Log.Write("median of [5:{0:D}] training {1:D}ms\n", TRIALS - 1, BuffUtils.median(training));
            Log.Write("median of [5:{0:D}] formatting {1:D}ms\n", TRIALS - 1, BuffUtils.median(formatting));
        }
Exemplo n.º 9
0
 /// <summary>
 /// Creates a corpus rooted at <paramref name="rootDir"/>. If no documents are
 /// set yet, every file under the root that matches the language's file regex
 /// is loaded.
 /// </summary>
 /// <param name="rootDir">Directory (or path) handed to <c>Tool.getFilenames</c>.</param>
 /// <param name="language">Language descriptor supplying the file regex.</param>
 public Corpus(string rootDir, LangDescriptor language)
 {
     this.rootDir  = rootDir;
     this.language = language;

     if (documents != null)
     {
         return;
     }

     IList <string> matchingFiles = Tool.getFilenames(rootDir, language.fileRegex);

     documents = Tool.load(matchingFiles, language);
 }
Exemplo n.º 10
0
        /// <summary>
        /// For every corpus-subset size from 1 up to the smaller of the corpus size
        /// and <paramref name="maxNumFiles"/>, runs <paramref name="trials"/> random
        /// sample validations and records the median error rate.
        /// </summary>
        /// <param name="language">Language whose corpus is sampled.</param>
        /// <param name="maxNumFiles">Upper bound on the subset size.</param>
        /// <param name="trials">Number of random samples evaluated per subset size.</param>
        /// <returns>
        /// An array indexed by subset size; index 0 is never written and keeps its
        /// default value. A size whose job failed also keeps the default value.
        /// </returns>
        public static float[] getMedianErrorRates(LangDescriptor language, int maxNumFiles, int trials)
        {
            SubsetValidator       validator = new SubsetValidator(language.corpusDir, language);
            IList <InputDocument> documents = Tool.load(validator.allFiles, language);

            // One bound sizes the result array and drives the loop, so they cannot disagree.
            int     maxSubsetSize = Math.Min(documents.Count, maxNumFiles);
            float[] medians       = new float[maxSubsetSize + 1];

            int ncpu = Environment.ProcessorCount;

            if (FORCE_SINGLE_THREADED)
            {
                ncpu = 2;             // yields exactly one worker below
            }
            System.Threading.Tasks.ParallelOptions options = new System.Threading.Tasks.ParallelOptions();

            // Leave one processor free, but never ask for fewer than one worker.
            options.MaxDegreeOfParallelism = Math.Max(1, ncpu - 1);

            // Each subset size is an independent job writing its own slot of medians.
            System.Threading.Tasks.Parallel.For(1, maxSubsetSize + 1, options, corpusSubsetSize =>
            {
                try
                {
                    List <float> errorRates = new List <float>();
                    for (int trial = 1; trial <= trials; trial++)
                    {                 // multiple trials per subset size
                        org.antlr.codebuff.misc.Pair <InputDocument, IList <InputDocument> > sample = validator.selectSample(documents, corpusSubsetSize);
                        Triple <Formatter, float, float> results = validate(language, sample.b, sample.a, true, false);
                        errorRates.Add(results.c);
                    }
                    errorRates.Sort();
                    int   n      = errorRates.Count;
                    float median = errorRates[n / 2];
                    Console.WriteLine("median " + language.name + " error rate for n=" + corpusSubsetSize + " is " + median);
                    medians[corpusSubsetSize] = median;
                }
                catch (Exception t)
                {
                    // Best effort: report the failure and let the other subset sizes continue.
                    Console.Error.WriteLine(t);
                }
            });

            return(medians);
        }
        /// <summary>
        /// Generates the leave-one-out box-plot script for the three Java language
        /// descriptors and writes it to <c>python/src/all_java_leave_one_out.py</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            LangDescriptor[] javaLanguages = new LangDescriptor[] { JAVA_DESCR, JAVA8_DESCR, JAVA_GUAVA_DESCR };

            // Each language is validated against its own default corpus directory.
            IList <string> corpusDirList = BuffUtils.map(javaLanguages, l => l.corpusDir);
            string[]       corpusDirArr  = corpusDirList.ToArray();

            string script     = LeaveOneOutValidator.testAllLanguages(javaLanguages, corpusDirArr, "all_java_leave_one_out.pdf");
            string scriptPath = "python/src/all_java_leave_one_out.py";

            org.antlr.codebuff.misc.Utils.writeFile(scriptPath, script);
            Log.WriteLine("wrote python code to " + scriptPath);
        }
Exemplo n.º 12
0
        /// <summary>
        /// Generates the leave-one-out box-plot script for the ANTLR4 language
        /// descriptor and writes it to <c>python/src/leave_one_out.py</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            LangDescriptor[] targets = new LangDescriptor[] { Tool.ANTLR4_DESCR };

            // Each language is validated against its own default corpus directory.
            IList <string> corpusDirList = BuffUtils.map(targets, l => l.corpusDir);
            string[]       corpusDirArr  = corpusDirList.ToArray();

            string script     = testAllLanguages(targets, corpusDirArr, "leave_one_out.pdf");
            string scriptPath = "python/src/leave_one_out.py";

            org.antlr.codebuff.misc.Utils.writeFile(scriptPath, script);
            Console.WriteLine("wrote python code to " + scriptPath);
        }
Exemplo n.º 13
0
        /// <summary>
        /// Parses every named file into an <c>InputDocument</c>, preserving the
        /// order of <paramref name="fileNames"/>.
        /// </summary>
        /// <param name="fileNames">Files to parse.</param>
        /// <param name="language">Language descriptor passed to <c>parse</c>.</param>
        /// <returns>One document per file name; empty when no names were given.</returns>
        public static IList <InputDocument> load(IList <string> fileNames, LangDescriptor language)
        {
            IList <InputDocument> parsed = new List <InputDocument>();

            for (int idx = 0; idx < fileNames.Count; idx++)
            {
                parsed.Add(parse(fileNames[idx], language));
            }

            if (parsed.Count == 0)
            {
                return parsed;
            }

            // Clear the DFA through the first document's parser to free up memory.
            parsed[0].parser.Interpreter.ClearDFA();
            return parsed;
        }
Exemplo n.º 14
0
        /// <summary>
        /// Runs leave-one-out validation over the language's corpus with the given
        /// feature sets and collects the alignment error rate of every formatted
        /// document.
        /// </summary>
        /// <param name="language">Language whose corpus directory is validated.</param>
        /// <param name="injectWSFeatures">Whitespace-injection feature set passed to the validator.</param>
        /// <param name="alignmentFeatures">Alignment feature set passed to the validator.</param>
        /// <returns>One alignment error rate per formatter returned by the validator, in the same order.</returns>
        public static IList <float> getAlignmentErrorRates(LangDescriptor language, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures)
        {
            LeaveOneOutValidator looValidator = new LeaveOneOutValidator(language.corpusDir, language);

            // Only the formatters (first element of the triple) are needed here.
            Triple <IList <Formatter>, IList <float>, IList <float> > outcome = looValidator.validateDocuments(injectWSFeatures, alignmentFeatures, false, null);
            IList <Formatter> perDocFormatters = outcome.a;

            IList <float> rates = new List <float>();

            for (int idx = 0; idx < perDocFormatters.Count; idx++)
            {
                Formatter current = perDocFormatters[idx];
                ClassificationAnalysis perDoc = new ClassificationAnalysis(current.testDoc, current.AnalysisPerToken);

                rates.Add(perDoc.AlignmentErrorRate);
            }

            return rates;
        }
Exemplo n.º 15
0
        /// <summary>
        /// Lexes and parses <paramref name="content"/> with the lexer and parser
        /// classes named by <paramref name="language"/> and returns the resulting
        /// document. Parsing is attempted in two stages: first in SLL prediction
        /// mode with a bail-out error strategy, then, if that attempt is cancelled,
        /// again in LL mode with the default error strategy.
        /// </summary>
        /// <param name="fileName">Name recorded on the input stream and on the document.</param>
        /// <param name="content">Source text to parse.</param>
        /// <param name="language">Supplies the lexer class, parser class and start rule name.</param>
        /// <returns>
        /// The document with its token stream and parser set. <c>Tree</c> is set to
        /// null when the second-stage parse reports syntax errors.
        /// </returns>
        public static InputDocument parse(string fileName, string content, LangDescriptor language)
        {
            ANTLRInputStream input = new ANTLRInputStream(content);
            Lexer            lexer = getLexer(language.lexerClass, input);

            input.name = fileName;

            InputDocument doc = new InputDocument(fileName, content, language);

            doc.tokens = new CodeBuffTokenStream(lexer);

            doc.parser = getParser(language.parserClass, doc.tokens);
            doc.parser.BuildParseTree = true;

            // two-stage parsing. Try with SLL first
            doc.parser.Interpreter.PredictionMode = Antlr4.Runtime.Atn.PredictionMode.SLL;
            doc.parser.ErrorHandler = new BailErrorStrategy();
            doc.parser.RemoveErrorListeners();

            // The start rule is looked up by name on the parser class and invoked reflectively.
            MethodInfo startRule = language.parserClass.GetMethod(language.startRuleName);

            try
            {
                doc.Tree = (ParserRuleContext)startRule.Invoke(doc.parser, (object[])null);
            }
            catch (Exception ex)
            {
                // Only an exception whose InnerException is a ParseCanceledException triggers the retry.
                // NOTE(review): any other exception caught here is silently dropped and the
                // document is returned without Tree being assigned -- confirm this is intended.
                if (ex.InnerException is ParseCanceledException)
                {
                    doc.parser.Reset();
                    doc.tokens.Reset();                     // rewind input stream
                    // back to standard listeners/handlers
                    doc.parser.AddErrorListener(new ANTLRErrorListenerAnonymousInnerClass());
                    doc.parser.ErrorHandler = new DefaultErrorStrategy();
                    doc.parser.Interpreter.PredictionMode = PredictionMode.LL;
                    // An exception thrown by this second invocation is not caught here and propagates to the caller.
                    doc.Tree = (ParserRuleContext)startRule.Invoke(doc.parser, (object[])null);
                    if (doc.parser.NumberOfSyntaxErrors > 0)
                    {
                        // Discard the tree when the full LL parse still reports syntax errors.
                        doc.Tree = null;
                    }
                }
            }

            return(doc);
        }
Exemplo n.º 16
0
        /// <summary>
        /// Compares the whitespace category ratios predicted from one corpus file
        /// in isolation with those predicted from the rest of the corpus, printing
        /// every feature vector whose ratios differ. Only the first file of the
        /// corpus is examined because the loop ends with an unconditional break.
        /// </summary>
        /// <param name="language">Language whose corpus directory and file regex select the files.</param>
        public static void runCaptureForOneLanguage(LangDescriptor language)
        {
            IList <string>        filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
            IList <InputDocument> documents = Tool.load(filenames, language);

            foreach (string fileName in filenames)
            {
                // Examine info for this file in isolation
                Corpus fileCorpus = new Corpus(fileName, language);
                fileCorpus.train();
                Console.WriteLine(fileName);
                ArrayListMultiMap <FeatureVectorAsObject, int> ws   = getWSContextCategoryMap(fileCorpus);
                ArrayListMultiMap <FeatureVectorAsObject, int> hpos = getHPosContextCategoryMap(fileCorpus);

                // Compare with corpus minus this file
                string path = fileName;
                IList <InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
                Corpus corpus = new Corpus(others, language);
                corpus.train();
                ArrayListMultiMap <FeatureVectorAsObject, int> corpus_ws   = getWSContextCategoryMap(corpus);
                ArrayListMultiMap <FeatureVectorAsObject, int> corpus_hpos = getHPosContextCategoryMap(corpus);

                foreach (FeatureVectorAsObject x in ws.Keys)
                {
                    HashBag <int> fwsCats   = getCategoriesBag(ws[x]);
                    IList <float> fwsRatios = getCategoryRatios(fwsCats.Values);
                    HashBag <int> wsCats    = getCategoriesBag(corpus_ws[x]);
                    IList <float> wsRatios  = getCategoryRatios(wsCats.Values);
                    // compare file predictions with corpus predictions
                    if (!fwsRatios.SequenceEqual(wsRatios))
                    {
                        // Join the values explicitly: concatenating an IList prints its type name, not the ratios.
                        Console.WriteLine("[" + string.Join(", ", fwsRatios) + "] vs [" + string.Join(", ", wsRatios) + "]");
                    }

                    // NOTE(review): the hpos bags are computed but not compared yet; kept as in the original.
                    HashBag <int> fhposCats = getCategoriesBag(hpos[x]);
                    HashBag <int> hposCats  = getCategoriesBag(corpus_hpos[x]);
                }

                // Stop after the first file.
                break;
            }
        }
Exemplo n.º 17
0
        /// <summary>
        /// Trains a corpus on <paramref name="others"/>, formats
        /// <paramref name="testDoc"/> once, logs both durations and returns them.
        /// </summary>
        /// <param name="language">Language descriptor; supplies the indent size and the name used in the log line.</param>
        /// <param name="others">Training documents.</param>
        /// <param name="testDoc">Document to format.</param>
        /// <returns>A pair of (training milliseconds, formatting milliseconds).</returns>
        public static org.antlr.codebuff.misc.Pair <int, int> test(LangDescriptor language, IList <InputDocument> others, InputDocument testDoc)
        {
            // Training time covers corpus construction plus train().
            System.Diagnostics.Stopwatch trainTimer = System.Diagnostics.Stopwatch.StartNew();
            Corpus corpus = new Corpus(others, language);

            corpus.train();
            trainTimer.Stop();

            // Formatting time covers formatter construction plus format().
            System.Diagnostics.Stopwatch formatTimer = System.Diagnostics.Stopwatch.StartNew();
            Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, FEATURES_INJECT_WS, FEATURES_HPOS);

            formatter.format(testDoc, false);
            formatTimer.Stop();

            // Whole milliseconds as integers, as the "D" format specifier and the int pair require.
            long train_time  = trainTimer.ElapsedMilliseconds;
            long format_time = formatTimer.ElapsedMilliseconds;

            Log.Write("{0} training of {1} = {2:D}ms formatting = {3:D}ms\n", language.name, testDoc.fileName, train_time, format_time);

            return(new org.antlr.codebuff.misc.Pair <int, int>((int)train_time, (int)format_time));
        }
Exemplo n.º 18
0
        /// <summary>
        /// Writes a matplotlib script to <c>python/src/vary_k.py</c> that plots each
        /// language's median error rates against the values in <paramref name="ks"/>.
        /// </summary>
        /// <param name="languages">Languages to plot; index-aligned with <paramref name="medians"/>.</param>
        /// <param name="ks">Values emitted as the Python list <c>ks</c> (the x axis).</param>
        /// <param name="medians">Per-language median error rates.</param>
        public static void writePython(LangDescriptor[] languages, IList <int?> ks, float[][] medians)
        {
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            StringBuilder data = new StringBuilder();
            StringBuilder plot = new StringBuilder();

            for (int i = 0; i < languages.Length; i++)
            {
                LangDescriptor language = languages[i];

                // Emit a Python list literal; invariant culture keeps '.' as the decimal separator.
                // The elements are non-nullable floats, so there is nothing to filter out.
                string medianList = string.Join(", ", System.Linq.Enumerable.Select(medians[i], m => m.ToString(invariant)));
                data.Append(language.name + "=[" + medianList + "]\n");

                // NOTE(review): .get(...) is kept from the original; confirm the marker/color maps expose it.
                plot.Append(string.Format("ax.plot(ks, {0}, label=\"{1}\", marker='{2}', color='{3}')\n", language.name, language.name, nameToGraphMarker.get(language.name), nameToGraphColor.get(language.name)));
            }

            string ksList = "[" + string.Join(", ", ks) + "]";

            // Composite-format template: {0} version, {1} date, {2} data, {3} ks, {4} plot calls.
            string python =
                "#\n" +
                "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
                "# CodeBuff {0} '{1}'\n" +
                "#\n" +
                "import numpy as np\n" +
                "import matplotlib.pyplot as plt\n\n" +
                "{2}\n" +
                "ks = {3}\n" +
                "fig = plt.figure()\n" +
                "ax = plt.subplot(111)\n" +
                "{4}" +
                "ax.tick_params(axis='both', which='major', labelsize=18)\n" +
                "ax.set_xlabel(\"$k$ nearest neighbors\", fontsize=20)\n" +
                "ax.set_ylabel(\"Median error rate\", fontsize=20)\n" +
                "#ax.set_title(\"k Nearest Neighbors vs\\nLeave-one-out Validation Error Rate\")\n" +
                "plt.legend(fontsize=18)\n\n" +
                "fig.savefig('images/vary_k.pdf', format='pdf')\n" +
                "plt.show()\n";
            string code = string.Format(invariant, python, Tool.version, DateTime.Now, data, ksList, plot);

            string fileName = "python/src/vary_k.py";

            org.antlr.codebuff.misc.Utils.writeFile(fileName, code);
            Log.WriteLine("wrote python code to " + fileName);
        }
Exemplo n.º 19
0
 /// <summary>
 /// Creates a corpus over documents that have already been loaded.
 /// </summary>
 /// <param name="documents">Documents making up the corpus; stored as given, not copied.</param>
 /// <param name="language">Language descriptor associated with the corpus.</param>
 public Corpus(IList <InputDocument> documents, LangDescriptor language)
 {
     this.language  = language;
     this.documents = documents;
 }
Exemplo n.º 20
0
 /// <summary>
 /// Creates a validator for the corpus under <paramref name="rootDir"/> and
 /// records the names of all files matching the language's file regex.
 /// </summary>
 /// <param name="rootDir">Corpus root directory.</param>
 /// <param name="language">Language descriptor supplying the file regex.</param>
 public SubsetValidator(string rootDir, LangDescriptor language)
 {
     this.language = language;
     this.rootDir  = rootDir;

     // Resolve the file list once, at construction time.
     this.allFiles = Tool.getFilenames(rootDir, language.fileRegex);
 }
Exemplo n.º 21
0
 /// <summary>
 /// Convenience overload that validates with the trainer's standard feature
 /// sets (<c>Trainer.FEATURES_INJECT_WS</c> and <c>Trainer.FEATURES_HPOS</c>).
 /// </summary>
 /// <returns>Whatever the full overload returns for the same arguments.</returns>
 public virtual Triple <Formatter, float, float> validate(LangDescriptor language, IList <InputDocument> documents, string fileToExclude, int k, string outputDir, bool computeEditDistance, bool collectAnalysis)
 {
     Triple <Formatter, float, float> outcome = validate(
         language,
         documents,
         fileToExclude,
         k,
         Trainer.FEATURES_INJECT_WS,
         Trainer.FEATURES_HPOS,
         outputDir,
         computeEditDistance,
         collectAnalysis);

     return outcome;
 }
Exemplo n.º 22
0
        /// <summary>
        /// Trains a corpus from <c>language.corpusDir</c>, groups its exemplars by feature vector
        /// (once per feature set: inject-whitespace and hpos) and prints how many exemplars share a
        /// feature vector that maps to more than one category, together with entropy statistics.
        /// </summary>
        /// <param name="language">Descriptor providing the corpus directory and the printed name.</param>
        /// <param name="report">
        /// When true, every ambiguous feature vector is dumped with its exemplars;
        /// the closing summary is printed regardless.
        /// </param>
        public static void computeConsistency(LangDescriptor language, bool report)
        {
            if (report)
            {
                Console.WriteLine("-----------------------------------");
                Console.WriteLine(language.name);
                Console.WriteLine("-----------------------------------");
            }

            Corpus corpus = new Corpus(language.corpusDir, language);
            corpus.train();

            // feature vector (restricted to one feature set) -> indexes of the exemplars having it
            var wsContextToIndex   = new MyMultiMap <FeatureVectorAsObject, int>();
            var hposContextToIndex = new MyMultiMap <FeatureVectorAsObject, int>();

            int exemplarTotal = corpus.featureVectors.Count;
            for (int idx = 0; idx < exemplarTotal; idx++)
            {
                int[] vector = corpus.featureVectors[idx];
                wsContextToIndex.Map(new FeatureVectorAsObject(vector, Trainer.FEATURES_INJECT_WS), idx);
                hposContextToIndex.Map(new FeatureVectorAsObject(vector, Trainer.FEATURES_HPOS), idx);
            }

            int ambiguousWsCount   = 0;
            int ambiguousHposCount = 0;

            // ---- pass 1: whitespace-injection categories ----
            if (report)
            {
                Console.WriteLine(" --- INJECT WS ---");
            }
            IList <double> wsEntropies = new List <double>();

            foreach (FeatureVectorAsObject key in wsContextToIndex.Keys)
            {
                var members = wsContextToIndex[key];

                // within one feature vector, bucket the exemplars by their ws category
                var byCategory = new MyMultiMap <int, int>();
                foreach (int member in members)
                {
                    byCategory.Map(corpus.injectWhitespace[member], member);
                }
                if (byCategory.Count == 1)
                {
                    continue; // a single category: this vector is unambiguous
                }

                int groupSize = members.size();
                if (report)
                {
                    Console.WriteLine("Feature vector has " + groupSize + " exemplars");
                }

                IList <int> bucketSizes = BuffUtils.map(byCategory.Values, (x) => x.size());
                double      entropy     = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(bucketSizes));

                // entropies are stored weighted by the number of exemplars in the group
                wsEntropies.Add(entropy * groupSize);
                ambiguousWsCount += groupSize;

                if (report)
                {
                    Console.Write("entropy={0,5:F4}\n", entropy);
                    Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_INJECT_WS));
                    foreach (int category in byCategory.Keys)
                    {
                        foreach (int exemplar in byCategory[category])
                        {
                            Console.WriteLine(getExemplarDisplay(Trainer.FEATURES_INJECT_WS, corpus, corpus.injectWhitespace, exemplar));
                        }
                        Console.WriteLine();
                    }
                }
            }

            // ---- pass 2: hpos categories ----
            if (report)
            {
                Console.WriteLine(" --- HPOS ---");
            }
            IList <double> hposEntropies = new List <double>();

            foreach (FeatureVectorAsObject key in hposContextToIndex.Keys)
            {
                var members = hposContextToIndex[key];

                // within one feature vector, bucket the exemplars by their hpos category
                var byCategory = new MyMultiMap <int, int>();
                foreach (int member in members)
                {
                    byCategory.Map(corpus.hpos[member], member);
                }
                if (byCategory.Count == 1)
                {
                    continue; // a single category: this vector is unambiguous
                }

                int groupSize = members.size();
                if (report)
                {
                    Console.WriteLine("Feature vector has " + groupSize + " exemplars");
                }

                IList <int> bucketSizes = BuffUtils.map(byCategory.Values, (x) => x.size());
                double      entropy     = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(bucketSizes));

                hposEntropies.Add(entropy * groupSize);
                ambiguousHposCount += groupSize;

                if (report)
                {
                    Console.Write("entropy={0,5:F4}\n", entropy);
                    Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_HPOS));
                    foreach (int category in byCategory.Keys)
                    {
                        foreach (int exemplar in byCategory[category])
                        {
                            Console.WriteLine(getExemplarDisplay(Trainer.FEATURES_HPOS, corpus, corpus.hpos, exemplar));
                        }
                        Console.WriteLine();
                    }
                }
            }

            // ---- summary (always printed) ----
            float wsAmbiguousShare   = ambiguousWsCount / (float)exemplarTotal;
            float hposAmbiguousShare = ambiguousHposCount / (float)exemplarTotal;

            Console.WriteLine();
            Console.WriteLine(language.name);
            Console.WriteLine("There are " + wsContextToIndex.Count + " unique ws feature vectors out of " + exemplarTotal + " = " + string.Format("{0,3:F1}%", 100.0 * wsContextToIndex.Count / exemplarTotal));
            Console.WriteLine("There are " + hposContextToIndex.Count + " unique hpos feature vectors out of " + exemplarTotal + " = " + string.Format("{0,3:F1}%", 100.0 * hposContextToIndex.Count / exemplarTotal));

            Console.Write("num_ambiguous_ws_vectors   = {0,5:D}/{1,5:D} = {2,5:F3}\n", ambiguousWsCount, exemplarTotal, wsAmbiguousShare);
            Console.Write("num_ambiguous_hpos_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", ambiguousHposCount, exemplarTotal, hposAmbiguousShare);

            Console.WriteLine("ws median,mean = " + BuffUtils.median(wsEntropies) + "," + BuffUtils.mean(wsEntropies));
            double expectedWsEntropy = (BuffUtils.sumDoubles(wsEntropies) / ambiguousWsCount) * wsAmbiguousShare;
            Console.WriteLine("expected_ws_entropy=" + expectedWsEntropy);

            Console.WriteLine("hpos median,mean = " + BuffUtils.median(hposEntropies) + "," + BuffUtils.mean(hposEntropies));
            double expectedHposEntropy = (BuffUtils.sumDoubles(hposEntropies) / ambiguousHposCount) * hposAmbiguousShare;
            Console.WriteLine("expected_hpos_entropy=" + expectedHposEntropy);
        }
Exemplo n.º 23
0
        /// <summary>
        /// Debug driver. Formats one file with a <c>LeaveOneOutValidator</c> built over the corpus of
        /// the selected language, then prints edit-distance figures, the classification analysis and
        /// the kNN classifier cache counters.
        /// </summary>
        /// <param name="args">
        /// <c>args[0]</c>: the language name preceded by one character (e.g. <c>-java</c>); the first
        /// character is stripped and the rest is matched against <c>Tool.languages[*].name</c>.
        /// <c>args[1]</c>: the file to format.
        /// </param>
        public static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                Console.Error.WriteLine("Dbg [-leave-one-out] [-java|-java8|-antlr|-sqlite|-tsql] test-file");
                // Without both arguments the indexing below would throw, so stop after the usage line.
                return;
            }

            bool   collectAnalysis = true;
            string language        = args[0].Substring(1); // drop the leading option character
            string testFilename    = args[1];

            // Look up the descriptor whose name matches the requested language.
            LangDescriptor lang = null;
            for (int i = 0; i < Tool.languages.Length; i++)
            {
                if (Tool.languages[i].name.Equals(language))
                {
                    lang = Tool.languages[i];
                    break;
                }
            }
            if (lang == null)
            {
                Console.Error.WriteLine("Unknown language: " + language);
                return;
            }

            LeaveOneOutValidator             validator = new LeaveOneOutValidator(lang.corpusDir, lang);
            Triple <Formatter, float, float> val       = validator.validateOneDocument(testFilename, null, collectAnalysis);
            InputDocument testDoc   = Tool.parse(testFilename, lang);
            Formatter     formatter = val.a;
            string        output    = formatter.Output;

            Console.WriteLine("output len = " + output.Length);
            float editDistance = normalizedLevenshteinDistance(testDoc.content, output);
            Console.WriteLine("normalized Levenshtein distance: " + editDistance);
            IList <TokenPositionAnalysis> analysisPerToken = formatter.AnalysisPerToken;

            // Compare only the whitespace tokens of the original and the formatted text.
            Regex             rex             = new Regex("^\\s+$");
            CommonTokenStream original_tokens = Tool.tokenize(testDoc.content, lang.lexerClass);
            IList <Token>     wsTokens        = BuffUtils.filter(original_tokens.GetTokens(), t => rex.IsMatch(t.Text));
            string            originalWS      = tokenText(wsTokens);
            Console.WriteLine("origin ws tokens len: " + originalWS.Length);

            CommonTokenStream formatted_tokens = Tool.tokenize(output, lang.lexerClass);
            wsTokens = BuffUtils.filter(formatted_tokens.GetTokens(), t => rex.IsMatch(t.Text));
            string formattedWS = tokenText(wsTokens);
            Console.WriteLine("formatted ws tokens len: " + formattedWS.Length);

            // The whitespace distance is scaled by the longer of the two full texts.
            editDistance  = levenshteinDistance(originalWS, formattedWS);
            editDistance /= Math.Max(testDoc.content.Length, output.Length);
            Console.WriteLine("Levenshtein distance of ws normalized to output len: " + editDistance);

            ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, analysisPerToken);
            Console.WriteLine(analysis);

            // The GUIController display that used to follow here is disabled in this port.
            Console.Write("classify calls {0:D}, hits {1:D} rate {2:F}\n", kNNClassifier.nClassifyCalls, kNNClassifier.nClassifyCacheHits, kNNClassifier.nClassifyCacheHits / (float)kNNClassifier.nClassifyCalls);
            Console.Write("kNN calls {0:D}, hits {1:D} rate {2:F}\n", kNNClassifier.nNNCalls, kNNClassifier.nNNCacheHits, kNNClassifier.nNNCacheHits / (float)kNNClassifier.nNNCalls);
        }
Exemplo n.º 24
0
 /// <summary>
 /// Runs <c>testFeatures</c> for the ANTLR4 descriptor only, passing <c>false</c> as the second argument.
 /// </summary>
 public static void Main(string[] args)
 {
     testFeatures(new LangDescriptor[] { Tool.ANTLR4_DESCR }, false);
 }
Exemplo n.º 25
0
        /// <summary>
        /// Loads the text of <paramref name="fileName"/> (using the language's indent size)
        /// and hands it to the content-based <c>parse</c> overload.
        /// </summary>
        /// <param name="fileName">File to load and parse.</param>
        /// <param name="language">Descriptor supplying <c>indentSize</c> and passed on to the parser.</param>
        /// <returns>The document produced by <c>parse(fileName, content, language)</c>.</returns>
        public static InputDocument parse(string fileName, LangDescriptor language)
        {
            string text = load(fileName, language.indentSize);
            return parse(fileName, text, language);
        }
Exemplo n.º 26
0
        /// <summary>
        /// Programmatic entry point of the formatter: reads the options from <paramref name="args"/>,
        /// loads and trains on the corpus, formats the input and returns the formatted text.
        /// </summary>
        /// <param name="args">
        /// Options (strings starting with "-"), each followed by its value, then the file to format as
        /// the last element. <c>-lexer</c> and <c>-parser</c> take <see cref="Type"/> values, the other
        /// option values are strings, and <c>-inoutstring</c> takes no value. When the static
        /// <c>unformatted_input</c> is non-null it is formatted instead of the file's content.
        /// </param>
        /// <returns>
        /// The formatted text. When the arguments are rejected (too few, unknown option, missing input
        /// file, missing lexer/parser, lexer cannot be created) the accumulated <c>Log.Message()</c>
        /// is returned instead.
        /// </returns>
        public static string Main(object[] args)
        {
            Log.Reset();
            if (args.Length < 7)
            {
                Log.WriteLine("org.antlr.codebuff.Tool -g grammar-name -rule start-rule -corpus root-dir-of-samples \\\n" + "   [-files file-extension] [-indent num-spaces] \\" + "   [-comment line-comment-name] [-o output-file] file-to-format");
                return(Log.Message());
            }

            formatted_output = null;
            string outputFileName = "";
            string grammarName    = null;
            string startRule      = null;
            string corpusDir      = null;
            string indentS        = "4";
            string commentS       = null;
            string fileExtension  = null;
            Type   parserClass    = null;
            Type   lexerClass     = null;

            int i = 0;
            while (i < args.Length && ((string)args[i]).StartsWith("-", StringComparison.Ordinal))
            {
                string option = (string)args[i];
                switch (option)
                {
                case "-g":
                    i++;
                    grammarName = (string)args[i++];
                    break;

                case "-lexer":
                    i++;
                    lexerClass = (Type)args[i++];
                    break;

                case "-parser":
                    i++;
                    parserClass = (Type)args[i++];
                    break;

                case "-rule":
                    i++;
                    startRule = (string)args[i++];
                    break;

                case "-corpus":
                    i++;
                    corpusDir = (string)args[i++];
                    break;

                case "-files":
                    i++;
                    fileExtension = (string)args[i++];
                    break;

                case "-indent":
                    i++;
                    indentS = (string)args[i++];
                    break;

                case "-comment":
                    i++;
                    commentS = (string)args[i++];
                    break;

                case "-o":
                    i++;
                    outputFileName = (string)args[i++];
                    break;

                case "-inoutstring":
                    i++;
                    formatted_output = "";
                    outputFileName   = null;
                    break;

                default:
                    // An unrecognized option would otherwise never advance i and spin forever.
                    Log.WriteLine("Unknown option " + option);
                    return(Log.Message());
                }
            }
            if (i >= args.Length)
            {
                // Every element was consumed as an option or option value: no file left to format.
                Log.WriteLine("Missing file-to-format argument.");
                return(Log.Message());
            }
            string input_file_name = (string)args[i]; // must be last

            Log.WriteLine("gramm: " + grammarName);
            string lexerClassName = grammarName + "Lexer";
            if (lexerClass == null || parserClass == null)
            {
                Log.WriteLine("You must specify a lexer and parser.");
                return(Log.Message());
            }

            int indentSize            = int.Parse(indentS);
            int singleLineCommentType = -1;
            if (commentS != null)
            {
                // The lexer is only needed to translate the comment token name into its token type.
                Lexer lexer = null;
                try
                {
                    lexer = getLexer(lexerClass, null);
                }
                catch (Exception e)
                {
                    Log.WriteLine("Can't instantiate lexer " + lexerClassName);
                    Log.WriteLine(e.StackTrace);
                }
                if (lexer == null)
                {
                    return(Log.Message());
                }
                int commentType;
                if (lexer.TokenTypeMap.TryGetValue(commentS, out commentType))
                {
                    singleLineCommentType = commentType;
                }
            }

            // Turn "a;.b;c" style extension lists into a regex of the form ".*\.(a|b|c)".
            string fileRegex = null;
            if (fileExtension != null)
            {
                var pattern = "";
                foreach (var s in fileExtension.Split(';'))
                {
                    var no_dot = s.Substring(s.IndexOf('.') + 1);
                    pattern = pattern == "" ? ("(" + no_dot) : (pattern + "|" + no_dot);
                }
                pattern   = pattern + ")";
                fileRegex = ".*\\." + pattern;
            }
            LangDescriptor language = new LangDescriptor(grammarName, corpusDir, fileRegex, lexerClass, parserClass, startRule, indentSize, singleLineCommentType);

            // load all corpus files up front
            IList <string>        allFiles  = getFilenames(language.corpusDir, language.fileRegex);
            IList <InputDocument> documents = load(allFiles, language);

            // The input comes either from the named file or from the unformatted_input string.
            bool formatFile = unformatted_input == null;
            IList <InputDocument> trainingDocs = documents;
            if (formatFile)
            {
                // Don't include file to format in corpus itself.
                string path = System.IO.Path.GetFullPath(input_file_name);
                trainingDocs = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
            }

            // Perform training of formatter.
            Corpus corpus = new Corpus(trainingDocs, language);
            corpus.train();

            InputDocument unformatted_document = formatFile
                ? parse(input_file_name, language)
                : parse(input_file_name, unformatted_input, language);

            // Format document.
            Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS);
            formatted_output = formatter.format(unformatted_document, false);

            // "" (the default) means log the result; a real name means write it to that file;
            // null (set by -inoutstring) means only return it.
            if (outputFileName == "")
            {
                Log.WriteLine(formatted_output);
            }
            else if (outputFileName != null)
            {
                org.antlr.codebuff.misc.Utils.writeFile(outputFileName, formatted_output);
            }

            // Exceptions propagate unchanged; the former catch { throw e; } only reset their stack trace.
            return(formatted_output);
        }
Exemplo n.º 27
0
        /// <summary>
        /// For each language and each k in the list built below, queues a job that scores the corpus
        /// with a <c>TestK</c> validator and stores the median error rate in <c>medians[language][k]</c>;
        /// after the jobs are submitted to the pool the table is passed to <c>writePython</c>.
        /// </summary>
        public static void Main(string[] args)
        {
            LangDescriptor[] languages = new LangDescriptor[] { Tool.ANTLR4_DESCR };

            // NOTE(review): the comment says "should be odd" but the value is 98; with the step-2 loop
            // starting at 1 the largest k added below is 97 — confirm the intended bound.
            int          MAX_K     = 98; // should be odd
            int          OUTLIER_K = 99;
            IList <int?> ks        = new List <int?>();

            // k = 1, 3, 5, ... up to MAX_K, plus the single outlier value
            for (int i = 1; i <= MAX_K; i += 2)
            {
                ks.Add(i);
            }
            ks.Add(OUTLIER_K);
            // track medians[language][k]
            float[][] medians = new float[languages.Length + 1][];

            // NOTE(review): ncpu is hard-coded to 1, so the pool below is created with size 0 unless
            // FORCE_SINGLE_THREADED is set (size 1) — presumably a leftover of the Java
            // availableProcessors() call; confirm.
            int ncpu = 1;

            if (FORCE_SINGLE_THREADED)
            {
                ncpu = 2;
            }
            ExecutorService          pool = Executors.newFixedThreadPool(ncpu - 1);
            IList <Callable <Void> > jobs = new List <Callable <Void> >();

            for (int i = 0; i < languages.Length; i++)
            {
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final org.antlr.codebuff.misc.LangDescriptor language = languages[i];
                LangDescriptor language = languages[i];
//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
//ORIGINAL LINE: final int langIndex = i;
                int langIndex = i;
                Log.WriteLine(language.name);
                foreach (int k in ks)
                {
                    // NOTE(review): the row is re-allocated for every k, and a float?[] is assigned
                    // to a float[] slot — looks like a conversion artifact; confirm it compiles and
                    // whether the allocation belongs outside this loop.
                    medians[langIndex] = new float?[OUTLIER_K + 1];
                    // The job only runs once the pool executes it; it captures language, langIndex and k.
                    Callable <Void> job = () =>
                    {
                        try
                        {
                            TestK          tester     = new TestK(language.corpusDir, language, k);
                            IList <float?> errorRates = tester.scoreDocuments();
                            errorRates.Sort();
                            // median = element at index n/2 of the sorted error rates
                            int   n      = errorRates.Count;
                            float median = errorRates[n / 2].Value;
//						double var = BuffUtils.varianceFloats(errorRates);
//						String display = String.format("%5.4f, %5.4f, %5.4f, %5.4f, %5.4f", min, quart, median, quart3, max);
                            medians[langIndex][k] = median;
                        }
                        catch (Exception t)
                        {
                            // NOTE(review): printStackTrace/System.err are Java APIs — verify a shim exists.
                            t.printStackTrace(System.err);
                        }
                        return(null);
                    };
                    jobs.Add(job);
                }
            }

            pool.invokeAll(jobs);
            pool.shutdown();
            // The result of awaitTermination is stored but not checked afterwards.
            bool terminated = pool.awaitTermination(60, TimeUnit.MINUTES);

            writePython(languages, ks, medians);
        }
Exemplo n.º 28
0
        /// <summary>
        /// For every corpus file of <paramref name="language"/>, formats the file twice — once with a
        /// corpus built from that file alone and once with a corpus built from
        /// <c>language.corpusDir</c> — logs both edit distances and writes a Python plotting script
        /// to <c>python/src/&lt;language name&gt;_one_file_capture.py</c>.
        /// </summary>
        /// <param name="language">Descriptor supplying corpus directory, file regex, indent size and name.</param>
        public static void runCaptureForOneLanguage(LangDescriptor language)
        {
            IList <string> filenames         = Tool.getFilenames(language.corpusDir, language.fileRegex);
            IList <float>  selfEditDistances = new List <float>();

            // Pass 1: train on the single file, then format that same file.
            foreach (string fileName in filenames)
            {
                Corpus corpus = new Corpus(fileName, language);
                corpus.train();
                InputDocument testDoc   = Tool.parse(fileName, corpus.language);
                Formatter     formatter = new Formatter(corpus, language.indentSize);
                string        output    = formatter.format(testDoc, false);
                float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
                Log.WriteLine(fileName + " edit distance " + editDistance);
                selfEditDistances.Add(editDistance);
            }

            // Pass 2: train once on the corpus directory, then format every file with it.
            Corpus fullCorpus = new Corpus(language.corpusDir, language);
            fullCorpus.train();

            IList <float> corpusEditDistances = new List <float>();
            foreach (string fileName in filenames)
            {
                InputDocument testDoc   = Tool.parse(fileName, fullCorpus.language);
                Formatter     formatter = new Formatter(fullCorpus, language.indentSize);
                string        output    = formatter.format(testDoc, false);
                float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
                Log.WriteLine(fileName + "+corpus edit distance " + editDistance);
                corpusEditDistances.Add(editDistance);
            }
            // heh this gives info on within-corpus variability. i.e., how good/consistent is my corpus?
            // those files with big difference are candidates for dropping from corpus or for cleanup.
            IList <string> labels = BuffUtils.map(filenames, f => '"' + System.IO.Path.GetFileName(f) + '"');

            string python = "#\n" + "# AUTO-GENERATED FILE. DO NOT EDIT\n" + "# CodeBuff <version> '<date>'\n" + "#\n" +
                            "import numpy as np\n" + "import matplotlib.pyplot as plt\n\n" + "fig = plt.figure()\n" +
                            "ax = plt.subplot(111)\n" + "labels = <labels>\n" + "N = len(labels)\n\n" +
                            "featureIndexes = range(0,N)\n" + "<lang>_self = <selfEditDistances>\n" +
                            "<lang>_corpus = <corpusEditDistances>\n" +
                            "<lang>_diff = np.abs(np.subtract(<lang>_self, <lang>_corpus))\n\n" +
                            "all = zip(<lang>_self, <lang>_corpus, <lang>_diff, labels)\n" +
                            "all = sorted(all, key=lambda x : x[2], reverse=True)\n" +
                            "<lang>_self, <lang>_corpus, <lang>_diff, labels = zip(*all)\n\n" +
                            "ax.plot(featureIndexes, <lang>_self, label=\"<lang>_self\")\n" +
                            "#ax.plot(featureIndexes, <lang>_corpus, label=\"<lang>_corpus\")\n" +
                            "ax.plot(featureIndexes, <lang>_diff, label=\"<lang>_diff\")\n" +
                            "ax.set_xticklabels(labels, rotation=60, fontsize=8)\n" +
                            "plt.xticks(featureIndexes, labels, rotation=60)\n" +
                            "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n\n" +
                            "ax.text(1, .25, 'median $f$ self distance = %5.3f, corpus+$f$ distance = %5.3f' %" +
                            "    (np.median(<lang>_self),np.median(<lang>_corpus)))\n" + "ax.set_xlabel(\"File Name\")\n" +
                            "ax.set_ylabel(\"Edit Distance\")\n" +
                            "ax.set_title(\"Difference between Formatting File <lang> $f$\\nwith Training=$f$ and Training=$f$+Corpus\")\n" +
                            "plt.legend()\n" + "plt.tight_layout()\n" + "fig.savefig(\"images/" + language.name +
                            "_one_file_capture.pdf\", format='pdf')\n" + "plt.show()\n";
            ST pythonST = new ST(python);

            pythonST.add("lang", language.name);
            pythonST.add("version", version);
            pythonST.add("date", DateTime.Now);
            // A .NET list's ToString() yields its type name, not its elements (unlike Java),
            // so the Python list literals are rendered explicitly.
            pythonST.add("labels", "[" + string.Join(", ", labels) + "]");
            pythonST.add("selfEditDistances", floatsToPythonList(selfEditDistances));
            pythonST.add("corpusEditDistances", floatsToPythonList(corpusEditDistances));

            string code = pythonST.render();

            string scriptName = "python/src/" + language.name + "_one_file_capture.py";
            org.antlr.codebuff.misc.Utils.writeFile(scriptName, code);
            Log.WriteLine("wrote python code to " + scriptName);
        }

        /// <summary>
        /// Renders floats as a bracketed, comma-separated list, using the invariant culture so the
        /// decimal separator is always '.'.
        /// </summary>
        private static string floatsToPythonList(IEnumerable <float> values)
        {
            List <string> rendered = new List <string>();
            foreach (float value in values)
            {
                rendered.Add(value.ToString(System.Globalization.CultureInfo.InvariantCulture));
            }
            return "[" + string.Join(", ", rendered) + "]";
        }
Exemplo n.º 29
0
 /// <summary>
 /// Remembers the corpus root and language descriptor and creates the <c>random</c>
 /// generator seeded with <c>DOCLIST_RANDOM_SEED</c>.
 /// </summary>
 /// <param name="rootDir">Root directory stored in <c>this.rootDir</c>.</param>
 /// <param name="language">Descriptor stored in <c>this.language</c>.</param>
 public LeaveOneOutValidator(string rootDir, LangDescriptor language)
 {
     random = new Random(DOCLIST_RANDOM_SEED);

     this.language = language;
     this.rootDir  = rootDir;
 }
Exemplo n.º 30
0
 /// <summary>
 /// Passes <paramref name="rootDir"/> and <paramref name="language"/> to the base constructor
 /// and stores <paramref name="k"/> in the <c>k</c> field.
 /// </summary>
 public TestK(string rootDir, LangDescriptor language, int k) : base(rootDir, language)
 {
     this.k = k;
 }