Beispiel #1
0
        /// <summary>
        /// Runs leave-one-out validation for every language and returns the source of a Python
        /// (matplotlib) script that box-plots the resulting per-language error rates.
        /// </summary>
        /// <param name="languages">Language descriptors to validate.</param>
        /// <param name="corpusDirs">Corpus directories; <c>corpusDirs[i]</c> is used for <c>languages[i]</c>.</param>
        /// <param name="imageFileName">File name (under <c>images/</c>) that the generated script saves its figure to.</param>
        /// <returns>The text of the generated Python script.</returns>
        public static string testAllLanguages(LangDescriptor[] languages, string[] corpusDirs, string imageFileName)
        {
            // Names of the Python variables that hold each language's error list.
            IList<string> languageNames = BuffUtils.map(languages, l => l.name + "_err");

            // Number of corpus files per language, shown in the x-axis labels.
            IDictionary<string, int> corpusSizes = new Dictionary<string, int>();
            for (int i = 0; i < languages.Length; i++)
            {
                LangDescriptor language = languages[i];
                IList<string> filenames = Tool.getFilenames(corpusDirs[i], language.fileRegex);
                corpusSizes[language.name] = filenames.Count;
            }
            IList<string> languageNamesAsStr = BuffUtils.map(languages, l => "\"" + l.name + "\\nn=" + corpusSizes[l.name] + "\"");

            // One "<lang>_err = [e1, e2, ...]" assignment per language.
            StringBuilder data = new StringBuilder();
            for (int i = 0; i < languages.Length; i++)
            {
                LangDescriptor language = languages[i];
                LeaveOneOutValidator validator = new LeaveOneOutValidator(corpusDirs[i], language);
                Triple<IList<Formatter>, IList<float>, IList<float>> results = validator.validateDocuments(true, "/tmp");
                IList<float> errors = results.c;

                // Render the values explicitly: concatenating the list itself would emit its .NET
                // type name, and the invariant culture guarantees '.' as the decimal separator.
                List<string> errorTexts = new List<string>(errors.Count);
                foreach (float error in errors)
                {
                    errorTexts.Add(error.ToString(System.Globalization.CultureInfo.InvariantCulture));
                }
                data.Append(language.name + "_err = [" + string.Join(", ", errorTexts) + "]\n");
            }

            // Composite-format template. The placeholders are:
            // {0} version, {1} timestamp, {2} data assignments, {3} data variable list,
            // {4} label list, {5} image file name. The template contains no literal braces.
            string python =
                "#\n" +
                "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
                "# CodeBuff {0} '{1}'\n" +
                "#\n" +
                "import numpy as np\n" +
                "import pylab\n" +
                "import matplotlib.pyplot as plt\n\n" +
                "{2}\n" +
                "language_data = {3}\n" +
                "labels = {4}\n" +
                "fig = plt.figure()\n" +
                "ax = plt.subplot(111)\n" +
                "ax.boxplot(language_data,\n" +
                "           whis=[10, 90], # 10 and 90 % whiskers\n" +
                "           widths=.35,\n" +
                "           labels=labels,\n" +
                "           showfliers=False)\n" +
                "ax.set_xticklabels(labels, rotation=60, fontsize=18)\n" +
                "ax.tick_params(axis='both', which='major', labelsize=18)\n" +
                "plt.xticks(range(1,len(labels)+1), labels, rotation=60, fontsize=18)\n" +
                "pylab.ylim([0,.28])\n" +
                "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n" +
                "ax.set_xlabel(\"Grammar and corpus size\", fontsize=20)\n" +
                "ax.set_ylabel(\"Misclassification Error Rate\", fontsize=20)\n" +
                "# ax.set_title(\"Leave-one-out Validation Using Error Rate\\nBetween Formatted and Original File\")\n" +
                "plt.tight_layout()\n" +
                "fig.savefig('images/{5}', format='pdf')\n" +
                "plt.show()\n";

            return string.Format(
                System.Globalization.CultureInfo.InvariantCulture,
                python,
                Tool.version,
                DateTime.Now,
                data,
                "[" + string.Join(", ", languageNames) + "]",
                "[" + string.Join(", ", languageNamesAsStr) + "]",
                imageFileName);
        }
Beispiel #2
0
        /// <summary>
        /// Generates the leave-one-out plotting script for the ANTLR4 corpus and writes it to
        /// <c>python/src/leave_one_out.py</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            const string outputPath = "python/src/leave_one_out.py";
            LangDescriptor[] descriptors = { Tool.ANTLR4_DESCR };

            // One corpus directory per descriptor, in the same order.
            string[] corpusRoots = BuffUtils.map(descriptors, d => d.corpusDir).ToArray();
            string script = testAllLanguages(descriptors, corpusRoots, "leave_one_out.pdf");

            org.antlr.codebuff.misc.Utils.writeFile(outputPath, script);
            Console.WriteLine("wrote python code to " + outputPath);
        }
        /// <summary>
        /// Generates the leave-one-out plotting script for the three Java corpora and writes it to
        /// <c>python/src/all_java_leave_one_out.py</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            const string outputPath = "python/src/all_java_leave_one_out.py";
            LangDescriptor[] descriptors = { JAVA_DESCR, JAVA8_DESCR, JAVA_GUAVA_DESCR };

            // One corpus directory per descriptor, in the same order.
            string[] corpusRoots = BuffUtils.map(descriptors, d => d.corpusDir).ToArray();
            string script = LeaveOneOutValidator.testAllLanguages(descriptors, corpusRoots, "all_java_leave_one_out.pdf");

            org.antlr.codebuff.misc.Utils.writeFile(outputPath, script);
            Log.WriteLine("wrote python code to " + outputPath);
        }
Beispiel #4
0
        /// <summary>
        /// For every file in the language's corpus, formats the file twice (once with a corpus
        /// built from that file alone, once with a corpus built from the whole corpus directory),
        /// records the normalized Levenshtein distance between the original and formatted text,
        /// and writes a Python (matplotlib) script that plots both series to
        /// <c>python/src/&lt;language&gt;_one_file_capture.py</c>.
        /// </summary>
        /// <param name="language">Descriptor of the language whose corpus is analysed.</param>
        public static void runCaptureForOneLanguage(LangDescriptor language)
        {
            IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);

            // Pass 1: train on each file alone, then format that same file.
            IList<float> selfEditDistances = new List<float>();
            foreach (string fileName in filenames)
            {
                Corpus fileCorpus = new Corpus(fileName, language);
                fileCorpus.train();
                InputDocument testDoc = Tool.parse(fileName, fileCorpus.language);
                Formatter formatter = new Formatter(fileCorpus, language.indentSize);
                string output = formatter.format(testDoc, false);
                float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
                Log.WriteLine(fileName + " edit distance " + editDistance);
                selfEditDistances.Add(editDistance);
            }

            // Pass 2: train once on the whole corpus directory, then format each file.
            Corpus fullCorpus = new Corpus(language.corpusDir, language);
            fullCorpus.train();

            IList<float> corpusEditDistances = new List<float>();
            foreach (string fileName in filenames)
            {
                InputDocument testDoc = Tool.parse(fileName, fullCorpus.language);
                Formatter formatter = new Formatter(fullCorpus, language.indentSize);
                string output = formatter.format(testDoc, false);
                float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
                Log.WriteLine(fileName + "+corpus edit distance " + editDistance);
                corpusEditDistances.Add(editDistance);
            }

            // The gap between the two series indicates within-corpus variability: files with a
            // large difference are candidates for cleanup or for dropping from the corpus.
            IList<string> labels = BuffUtils.map(filenames, f => '"' + System.IO.Path.GetFileName(f) + '"');

            string python = "#\n" + "# AUTO-GENERATED FILE. DO NOT EDIT\n" + "# CodeBuff <version> '<date>'\n" + "#\n" +
                            "import numpy as np\n" + "import matplotlib.pyplot as plt\n\n" + "fig = plt.figure()\n" +
                            "ax = plt.subplot(111)\n" + "labels = <labels>\n" + "N = len(labels)\n\n" +
                            "featureIndexes = range(0,N)\n" + "<lang>_self = <selfEditDistances>\n" +
                            "<lang>_corpus = <corpusEditDistances>\n" +
                            "<lang>_diff = np.abs(np.subtract(<lang>_self, <lang>_corpus))\n\n" +
                            "all = zip(<lang>_self, <lang>_corpus, <lang>_diff, labels)\n" +
                            "all = sorted(all, key=lambda x : x[2], reverse=True)\n" +
                            "<lang>_self, <lang>_corpus, <lang>_diff, labels = zip(*all)\n\n" +
                            "ax.plot(featureIndexes, <lang>_self, label=\"<lang>_self\")\n" +
                            "#ax.plot(featureIndexes, <lang>_corpus, label=\"<lang>_corpus\")\n" +
                            "ax.plot(featureIndexes, <lang>_diff, label=\"<lang>_diff\")\n" +
                            "ax.set_xticklabels(labels, rotation=60, fontsize=8)\n" +
                            "plt.xticks(featureIndexes, labels, rotation=60)\n" +
                            "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n\n" +
                            "ax.text(1, .25, 'median $f$ self distance = %5.3f, corpus+$f$ distance = %5.3f' %" +
                            "    (np.median(<lang>_self),np.median(<lang>_corpus)))\n" + "ax.set_xlabel(\"File Name\")\n" +
                            "ax.set_ylabel(\"Edit Distance\")\n" +
                            "ax.set_title(\"Difference between Formatting File <lang> $f$\\nwith Training=$f$ and Training=$f$+Corpus\")\n" +
                            "plt.legend()\n" + "plt.tight_layout()\n" + "fig.savefig(\"images/" + language.name +
                            "_one_file_capture.pdf\", format='pdf')\n" + "plt.show()\n";

            // Render a float sequence as a Python list literal. Calling ToString() on the list
            // itself would yield its .NET type name, and the invariant culture guarantees '.'
            // as the decimal separator.
            Func<IList<float>, string> asPythonList = values =>
            {
                List<string> texts = new List<string>(values.Count);
                foreach (float value in values)
                {
                    texts.Add(value.ToString(System.Globalization.CultureInfo.InvariantCulture));
                }
                return "[" + string.Join(", ", texts) + "]";
            };

            ST pythonST = new ST(python);
            pythonST.add("lang", language.name);
            pythonST.add("version", version);
            pythonST.add("date", DateTime.Now);
            pythonST.add("labels", "[" + string.Join(", ", labels) + "]");
            pythonST.add("selfEditDistances", asPythonList(selfEditDistances));
            pythonST.add("corpusEditDistances", asPythonList(corpusEditDistances));

            string code = pythonST.render();

            string outputFileName = "python/src/" + language.name + "_one_file_capture.py";
            org.antlr.codebuff.misc.Utils.writeFile(outputFileName, code);
            Log.WriteLine("wrote python code to " + outputFileName);
        }
        /// <summary>
        /// Trains a corpus for <paramref name="language"/> and prints how consistently identical
        /// feature vectors map to a single whitespace-injection category and a single hpos category.
        /// </summary>
        /// <remarks>
        /// A feature vector counts as ambiguous when its exemplars fall into more than one
        /// category. For each ambiguous vector the normalized category entropy is computed and
        /// weighted by the vector's exemplar count. The summary at the end is always printed;
        /// <paramref name="report"/> only controls the banner and the per-vector dump.
        /// </remarks>
        /// <param name="language">Descriptor of the language whose corpus is analysed.</param>
        /// <param name="report">When true, also prints every ambiguous feature vector with its exemplars.</param>
        public static void computeConsistency(LangDescriptor language, bool report)
        {
            if (report)
            {
                Console.WriteLine("-----------------------------------");
                Console.WriteLine(language.name);
                Console.WriteLine("-----------------------------------");
            }
            Corpus corpus = new Corpus(language.corpusDir, language);

            corpus.train();
            // Maps each feature vector (restricted to the ws or hpos features) to the indexes of
            // the exemplars that share it.
            MyMultiMap<FeatureVectorAsObject, int> wsContextToIndex   = new MyMultiMap<FeatureVectorAsObject, int>();
            MyMultiMap<FeatureVectorAsObject, int> hposContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();

            int n = corpus.featureVectors.Count;

            for (int i = 0; i < n; i++)
            {
                int[] features = corpus.featureVectors[i];
                wsContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), i);
                hposContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), i);
            }

            int num_ambiguous_ws_vectors   = 0;
            int num_ambiguous_hpos_vectors = 0;

            // Dump output grouped by ws vs hpos, then by feature vector, then by category.
            if (report)
            {
                Console.WriteLine(" --- INJECT WS ---");
            }
            IList<double> ws_entropies = new List<double>();

            foreach (FeatureVectorAsObject fo in wsContextToIndex.Keys)
            {
                var exemplarIndexes = wsContextToIndex[fo];

                // Within one feature vector, regroup its exemplars by ws category.
                MyMultiMap<int, int> wsCatToIndexes = new MyMultiMap<int, int>();
                foreach (int i in exemplarIndexes)
                {
                    wsCatToIndexes.Map(corpus.injectWhitespace[i], i);
                }
                if (wsCatToIndexes.Count == 1)
                {
                    continue; // every exemplar agrees on the category: not ambiguous
                }
                int exemplarCount = exemplarIndexes.size();
                if (report)
                {
                    Console.WriteLine("Feature vector has " + exemplarCount + " exemplars");
                }
                IList<int> catCounts = BuffUtils.map(wsCatToIndexes.Values, (x) => x.size());
                double     wsEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
                if (report)
                {
                    Console.Write("entropy={0,5:F4}\n", wsEntropy);
                }
                // Weight the entropy by how many exemplars share this feature vector.
                wsEntropy *= exemplarCount;
                ws_entropies.Add(wsEntropy);
                num_ambiguous_ws_vectors += exemplarCount;

                if (report)
                {
                    Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_INJECT_WS));
                    foreach (int cat in wsCatToIndexes.Keys)
                    {
                        var indexes = wsCatToIndexes[cat];
                        foreach (int i in indexes)
                        {
                            string display = getExemplarDisplay(Trainer.FEATURES_INJECT_WS, corpus, corpus.injectWhitespace, i);
                            Console.WriteLine(display);
                        }
                        Console.WriteLine();
                    }
                }
            }

            if (report)
            {
                Console.WriteLine(" --- HPOS ---");
            }
            IList<double> hpos_entropies = new List<double>();

            foreach (FeatureVectorAsObject fo in hposContextToIndex.Keys)
            {
                MyHashSet<int> exemplarIndexes = hposContextToIndex[fo];

                // Within one feature vector, regroup its exemplars by hpos category.
                MyMultiMap<int, int> hposCatToIndexes = new MyMultiMap<int, int>();
                foreach (int i in exemplarIndexes)
                {
                    hposCatToIndexes.Map(corpus.hpos[i], i);
                }
                if (hposCatToIndexes.Count == 1)
                {
                    continue; // every exemplar agrees on the category: not ambiguous
                }
                int exemplarCount = exemplarIndexes.size();
                if (report)
                {
                    Console.WriteLine("Feature vector has " + exemplarCount + " exemplars");
                }
                IList<int> catCounts   = BuffUtils.map(hposCatToIndexes.Values, (x) => x.size());
                double     hposEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
                if (report)
                {
                    Console.Write("entropy={0,5:F4}\n", hposEntropy);
                }
                // Weight the entropy by how many exemplars share this feature vector.
                hposEntropy *= exemplarCount;
                hpos_entropies.Add(hposEntropy);
                num_ambiguous_hpos_vectors += exemplarCount;

                if (report)
                {
                    Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_HPOS));
                    foreach (int cat in hposCatToIndexes.Keys)
                    {
                        var indexes = hposCatToIndexes[cat];
                        foreach (int i in indexes)
                        {
                            string display = getExemplarDisplay(Trainer.FEATURES_HPOS, corpus, corpus.hpos, i);
                            Console.WriteLine(display);
                        }
                        Console.WriteLine();
                    }
                }
            }

            Console.WriteLine();
            Console.WriteLine(language.name);
            Console.WriteLine("There are " + wsContextToIndex.Count + " unique ws feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * wsContextToIndex.Count / n));
            Console.WriteLine("There are " + hposContextToIndex.Count + " unique hpos feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * hposContextToIndex.Count / n));
            float prob_ws_ambiguous = num_ambiguous_ws_vectors / (float)n;

            Console.Write("num_ambiguous_ws_vectors   = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_ws_vectors, n, prob_ws_ambiguous);
            float prob_hpos_ambiguous = num_ambiguous_hpos_vectors / (float)n;

            Console.Write("num_ambiguous_hpos_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_hpos_vectors, n, prob_hpos_ambiguous);

            Console.WriteLine("ws median,mean = " + BuffUtils.median(ws_entropies) + "," + BuffUtils.mean(ws_entropies));
            // With no ambiguous vectors the quotient below would be 0.0/0 = NaN; there is no
            // uncertainty in that case, so report 0 instead.
            double expected_ws_entropy = num_ambiguous_ws_vectors > 0
                ? (BuffUtils.sumDoubles(ws_entropies) / num_ambiguous_ws_vectors) * prob_ws_ambiguous
                : 0.0;

            Console.WriteLine("expected_ws_entropy=" + expected_ws_entropy);

            Console.WriteLine("hpos median,mean = " + BuffUtils.median(hpos_entropies) + "," + BuffUtils.mean(hpos_entropies));
            // Same zero-ambiguity guard as for ws.
            double expected_hpos_entropy = num_ambiguous_hpos_vectors > 0
                ? (BuffUtils.sumDoubles(hpos_entropies) / num_ambiguous_hpos_vectors) * prob_hpos_ambiguous
                : 0.0;

            Console.WriteLine("expected_hpos_entropy=" + expected_hpos_entropy);
        }