Example #1
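        // Trains a corpus on the supplied documents, formats testDoc with the resulting
        // model, and returns the formatter plus the normalized edit distance and the
        // per-token classification error rate.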
        public static Triple <Formatter, float, float> validate(LangDescriptor language, IList <InputDocument> documents, InputDocument testDoc, bool saveOutput, bool computeEditDistance)
        {
            //		kNNClassifier.resetCache();
            Corpus corpus = new Corpus(documents, language);

            corpus.train();
            //		System.out.printf("%d feature vectors\n", corpus.featureVectors.size());
            Formatter formatter    = new Formatter(corpus, language.indentSize);
            string    output       = formatter.format(testDoc, false);
            float     editDistance = 0;

            if (computeEditDistance)
            {
                editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
            }
            ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.AnalysisPerToken);

            //		System.out.println(testDoc.fileName+": edit distance = "+editDistance+", error rate = "+analysis.getErrorRate());
            if (saveOutput)
            {
                string dir = outputDir + "/" + language.name;
                if (!System.IO.Directory.Exists(dir))
                {
                    System.IO.Directory.CreateDirectory(dir);
                }
                org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
            }
            return(new Triple <Formatter, float, float>(formatter, editDistance, analysis.ErrorRate));
        }
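
A minimal usage sketch (not from the original source) showing how this overload could drive a leave-one-out run; it assumes the Tool and BuffUtils helpers seen in the other examples, and that Triple exposes its components as a/b/c fields as in the ANTLR runtime.

        // Hypothetical driver, assuming Triple exposes a/b/c fields (ANTLR runtime style).
        public static void validateAll(LangDescriptor language)
        {
            IList <string>        filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
            IList <InputDocument> documents = Tool.load(filenames, language);

            foreach (InputDocument testDoc in documents)
            {
                // train on everything except the document under test
                IList <InputDocument> others = BuffUtils.filter(documents, d => d != testDoc);
                Triple <Formatter, float, float> result = validate(language, others, testDoc, false, true);
                Console.WriteLine(testDoc.fileName + ": edit distance = " + result.b + ", error rate = " + result.c);
            }
        }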
Example #2
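        // Leave-one-out validation: trains on every document except fileToExclude, formats
        // the excluded document, and reports edit distance, error rate, and timing stats.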
        public virtual Triple <Formatter, float, float> validate(LangDescriptor language, IList <InputDocument> documents, string fileToExclude, int k, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures, string outputDir, bool computeEditDistance, bool collectAnalysis)
        {
            string path = System.IO.Path.GetFullPath(fileToExclude);
            IList <InputDocument> others   = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
            IList <InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));

            Debug.Assert(others.Count == documents.Count - 1);
            //		kNNClassifier.resetCache();
            if (excluded.Count == 0)
            {
                Console.Error.WriteLine("Doc not in corpus: " + path);
                return(null);
            }
            InputDocument testDoc = excluded[0];
            DateTime      start   = System.DateTime.Now;
            Corpus        corpus  = new Corpus(others, language);

            corpus.train();
            DateTime      stop         = System.DateTime.Now;
            Formatter     formatter    = new Formatter(corpus, language.indentSize, k, injectWSFeatures, alignmentFeatures);
            InputDocument originalDoc  = testDoc;
            DateTime      format_start = System.DateTime.Now;
            string        output       = formatter.format(testDoc, collectAnalysis);
            DateTime      format_stop  = System.DateTime.Now;
            float         editDistance = 0;

            if (computeEditDistance)
            {
                editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
            }
            ClassificationAnalysis analysis = new ClassificationAnalysis(originalDoc, formatter.AnalysisPerToken);

            Console.WriteLine(testDoc.fileName + ": edit distance = " + editDistance + ", error rate = " + analysis.ErrorRate);
            if (outputDir != null)
            {
                string dir = outputDir + "/" + language.name + "/" + Tool.version;
                if (!System.IO.Directory.Exists(dir))
                {
                    System.IO.Directory.CreateDirectory(dir);
                }
                org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
            }
            var tms = (stop - start);
            var fms = format_stop - format_start;

            trainingTimes.Add(tms.TotalMilliseconds);
            float tokensPerMS = testDoc.tokens.Size / (float)fms.TotalMilliseconds;

            formattingTokensPerMS.Add((double)tokensPerMS);
            Console.Write("Training time = {0:F0} ms, formatting {1:F0} ms, {2,5:F3} tokens/ms ({3:D} tokens)\n", tms.TotalMilliseconds, fms.TotalMilliseconds, tokensPerMS, testDoc.tokens.Size);
            //		System.out.printf("classify calls %d, hits %d rate %f\n",
            //		                  kNNClassifier.nClassifyCalls, kNNClassifier.nClassifyCacheHits,
            //		                  kNNClassifier.nClassifyCacheHits/(float) kNNClassifier.nClassifyCalls);
            //		System.out.printf("kNN calls %d, hits %d rate %f\n",
            //						  kNNClassifier.nNNCalls, kNNClassifier.nNNCacheHits,
            //						  kNNClassifier.nNNCacheHits/(float) kNNClassifier.nNNCalls);
            return(new Triple <Formatter, float, float>(formatter, editDistance, analysis.ErrorRate));
        }
Example #3
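        // For each corpus file, compares the whitespace-category predictions of a corpus
        // trained on that file alone against a corpus trained on all the other files.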
        public static void runCaptureForOneLanguage(LangDescriptor language)
        {
            IList <string>        filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
            IList <InputDocument> documents = Tool.load(filenames, language);

            foreach (string fileName in filenames)
            {
                // Examine info for this file in isolation
                Corpus fileCorpus = new Corpus(fileName, language);
                fileCorpus.train();
                Console.WriteLine(fileName);
                //			examineCorpus(corpus);
                ArrayListMultiMap <FeatureVectorAsObject, int> ws   = getWSContextCategoryMap(fileCorpus);
                ArrayListMultiMap <FeatureVectorAsObject, int> hpos = getHPosContextCategoryMap(fileCorpus);

                // Compare with corpus minus this file
                string path = fileName;
                IList <InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
                Corpus corpus = new Corpus(others, language);
                corpus.train();
                //			examineCorpus(corpus);
                ArrayListMultiMap <FeatureVectorAsObject, int> corpus_ws   = getWSContextCategoryMap(corpus);
                ArrayListMultiMap <FeatureVectorAsObject, int> corpus_hpos = getHPosContextCategoryMap(corpus);

                foreach (FeatureVectorAsObject x in ws.Keys)
                {
                    HashBag <int> fwsCats   = getCategoriesBag(ws[x]);
                    IList <float> fwsRatios = getCategoryRatios(fwsCats.Values);
                    HashBag <int> wsCats    = getCategoriesBag(corpus_ws[x]);
                    IList <float> wsRatios  = getCategoryRatios(wsCats.Values);
                    // compare file predictions with corpus predictions
                    if (!fwsRatios.SequenceEqual(wsRatios))
                    {
                        Console.WriteLine(fwsRatios + " vs " + wsRatios);
                    }

                    // hpos categories are collected for the same comparison but not yet compared here
                    HashBag <int> fhposCats = getCategoriesBag(hpos[x]);
                    HashBag <int> hposCats  = getCategoriesBag(corpus_hpos[x]);
                }

                break; // examine only the first file
            }
        }
Example #4
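        // Times corpus training and test-document formatting, returning both durations
        // in milliseconds as a Pair.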
        public static org.antlr.codebuff.misc.Pair <int, int> test(LangDescriptor language, IList <InputDocument> others, InputDocument testDoc)
        {
            var    train_start = System.DateTime.Now;
            Corpus corpus      = new Corpus(others, language);

            corpus.train();
            var train_stop = System.DateTime.Now;

            var       format_start = System.DateTime.Now;
            Formatter formatter    = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, FEATURES_INJECT_WS, FEATURES_HPOS);

            formatter.format(testDoc, false);
            var format_stop = System.DateTime.Now;

            var train_time  = (train_stop - train_start).TotalMilliseconds;
            var format_time = (format_stop - format_start).TotalMilliseconds;

            Log.Write("{0} training of {1} = {2:F0}ms formatting = {3:F0}ms\n", language.name, testDoc.fileName, train_time, format_time);

            return(new org.antlr.codebuff.misc.Pair <int, int>((int)train_time, (int)format_time));
        }
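
A hedged sketch of aggregating these timing pairs over a corpus; the leave-one-out split mirrors Example #2, and the a/b field names on Pair are an assumption.

        // Hypothetical aggregation, assuming Pair exposes a/b fields.
        public static void timeLanguage(LangDescriptor language, IList <InputDocument> documents)
        {
            double trainSum = 0, formatSum = 0;

            foreach (InputDocument testDoc in documents)
            {
                IList <InputDocument> others = BuffUtils.filter(documents, d => d != testDoc);
                org.antlr.codebuff.misc.Pair <int, int> t = test(language, others, testDoc);
                trainSum  += t.a;
                formatSum += t.b;
            }
            Console.Write("avg training = {0:F0} ms, avg formatting = {1:F0} ms\n", trainSum / documents.Count, formatSum / documents.Count);
        }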
Example #5
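        // Formats each file against a corpus of just itself, then against the full corpus,
        // and emits a matplotlib script comparing the two edit-distance series per file.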
        public static void runCaptureForOneLanguage(LangDescriptor language)
        {
            IList <string> filenames         = Tool.getFilenames(language.corpusDir, language.fileRegex);
            IList <float>  selfEditDistances = new List <float>();

            foreach (string fileName in filenames)
            {
                Corpus corpus = new Corpus(fileName, language);
                corpus.train();
                InputDocument testDoc   = Tool.parse(fileName, corpus.language);
                Formatter     formatter = new Formatter(corpus, language.indentSize);
                string        output    = formatter.format(testDoc, false);
                //		System.out.println(output);
                float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
                Log.WriteLine(fileName + " edit distance " + editDistance);
                selfEditDistances.Add(editDistance);
            }

            {
                Corpus corpus = new Corpus(language.corpusDir, language);
                corpus.train();

                IList <float> corpusEditDistances = new List <float>();
                foreach (string fileName in filenames)
                {
                    InputDocument testDoc   = Tool.parse(fileName, corpus.language);
                    Formatter     formatter = new Formatter(corpus, language.indentSize);
                    string        output    = formatter.format(testDoc, false);
                    //		System.out.println(output);
                    float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
                    Log.WriteLine(fileName + "+corpus edit distance " + editDistance);
                    corpusEditDistances.Add(editDistance);
                }
                // This gives info on within-corpus variability, i.e., how good/consistent the corpus is.
                // Files with a big difference are candidates for dropping from the corpus or for cleanup.
                IList <string> labels = BuffUtils.map(filenames, f => '"' + System.IO.Path.GetFileName(f) + '"');

                string python = "#\n" + "# AUTO-GENERATED FILE. DO NOT EDIT\n" + "# CodeBuff <version> '<date>'\n" + "#\n" +
                                "import numpy as np\n" + "import matplotlib.pyplot as plt\n\n" + "fig = plt.figure()\n" +
                                "ax = plt.subplot(111)\n" + "labels = <labels>\n" + "N = len(labels)\n\n" +
                                "featureIndexes = range(0,N)\n" + "<lang>_self = <selfEditDistances>\n" +
                                "<lang>_corpus = <corpusEditDistances>\n" +
                                "<lang>_diff = np.abs(np.subtract(<lang>_self, <lang>_corpus))\n\n" +
                                "all = zip(<lang>_self, <lang>_corpus, <lang>_diff, labels)\n" +
                                "all = sorted(all, key=lambda x : x[2], reverse=True)\n" +
                                "<lang>_self, <lang>_corpus, <lang>_diff, labels = zip(*all)\n\n" +
                                "ax.plot(featureIndexes, <lang>_self, label=\"<lang>_self\")\n" +
                                "#ax.plot(featureIndexes, <lang>_corpus, label=\"<lang>_corpus\")\n" +
                                "ax.plot(featureIndexes, <lang>_diff, label=\"<lang>_diff\")\n" +
                                "ax.set_xticklabels(labels, rotation=60, fontsize=8)\n" +
                                "plt.xticks(featureIndexes, labels, rotation=60)\n" +
                                "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n\n" +
                                "ax.text(1, .25, 'median $f$ self distance = %5.3f, corpus+$f$ distance = %5.3f' %" +
                                "    (np.median(<lang>_self),np.median(<lang>_corpus)))\n" + "ax.set_xlabel(\"File Name\")\n" +
                                "ax.set_ylabel(\"Edit Distance\")\n" +
                                "ax.set_title(\"Difference between Formatting File <lang> $f$\\nwith Training=$f$ and Training=$f$+Corpus\")\n" +
                                "plt.legend()\n" + "plt.tight_layout()\n" + "fig.savefig(\"images/" + language.name +
                                "_one_file_capture.pdf\", format='pdf')\n" + "plt.show()\n";
                ST pythonST = new ST(python);

                pythonST.add("lang", language.name);
                pythonST.add("version", version);
                pythonST.add("date", DateTime.Now);
                pythonST.add("labels", labels.ToString());
                pythonST.add("selfEditDistances", selfEditDistances.ToString());
                pythonST.add("corpusEditDistances", corpusEditDistances.ToString());

                string code = pythonST.render();

                {
                    string fileName = "python/src/" + language.name + "_one_file_capture.py";
                    org.antlr.codebuff.misc.Utils.writeFile(fileName, code);
                    Log.WriteLine("wrote python code to " + fileName);
                }
            }
        }
Example #6
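        // Measures corpus self-consistency: how often identical feature vectors map to
        // different whitespace or horizontal-position categories, weighted by entropy.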
        public static void computeConsistency(LangDescriptor language, bool report)
        {
            if (report)
            {
                Console.WriteLine("-----------------------------------");
                Console.WriteLine(language.name);
                Console.WriteLine("-----------------------------------");
            }
            Corpus corpus = new Corpus(language.corpusDir, language);

            corpus.train();
            // a map of feature vector to list of exemplar indexes of that feature
            MyMultiMap <FeatureVectorAsObject, int> wsContextToIndex   = new MyMultiMap <FeatureVectorAsObject, int>();
            MyMultiMap <FeatureVectorAsObject, int> hposContextToIndex = new MyMultiMap <FeatureVectorAsObject, int>();

            int n = corpus.featureVectors.Count;

            for (int i = 0; i < n; i++)
            {
                int[] features = corpus.featureVectors[i];
                wsContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), i);
                hposContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), i);
            }

            int num_ambiguous_ws_vectors   = 0;
            int num_ambiguous_hpos_vectors = 0;

            // Dump output grouped by ws vs hpos then feature vector then category
            if (report)
            {
                Console.WriteLine(" --- INJECT WS ---");
            }
            IList <double> ws_entropies = new List <double>();

            foreach (FeatureVectorAsObject fo in wsContextToIndex.Keys)
            {
                var exemplarIndexes = wsContextToIndex[fo];

                // we have grouped by feature vector; now group by category within that set for ws
                MyMultiMap <int, int> wsCatToIndexes = new MyMultiMap <int, int>();
                foreach (int i in exemplarIndexes)
                {
                    wsCatToIndexes.Map(corpus.injectWhitespace[i], i);
                }
                if (wsCatToIndexes.Count == 1)
                {
                    continue;
                }
                if (report)
                {
                    Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
                }
                IList <int> catCounts = BuffUtils.map(wsCatToIndexes.Values, (x) => x.size());
                double      wsEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
                if (report)
                {
                    Console.Write("entropy={0,5:F4}\n", wsEntropy);
                }
                wsEntropy *= exemplarIndexes.size();
                ws_entropies.Add(wsEntropy);
                num_ambiguous_ws_vectors += exemplarIndexes.size();
                if (report)
                {
                    Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_INJECT_WS));
                }

                if (report)
                {
                    foreach (int cat in wsCatToIndexes.Keys)
                    {
                        var indexes = wsCatToIndexes[cat];
                        foreach (int i in indexes)
                        {
                            string display = getExemplarDisplay(Trainer.FEATURES_INJECT_WS, corpus, corpus.injectWhitespace, i);
                            Console.WriteLine(display);
                        }
                        Console.WriteLine();
                    }
                }
            }

            if (report)
            {
                Console.WriteLine(" --- HPOS ---");
            }
            IList <double> hpos_entropies = new List <double>();

            foreach (FeatureVectorAsObject fo in hposContextToIndex.Keys)
            {
                MyHashSet <int> exemplarIndexes = hposContextToIndex[fo];

                // we have grouped by feature vector; now group by category within that set for hpos
                MyMultiMap <int, int> hposCatToIndexes = new MyMultiMap <int, int>();
                foreach (int i in exemplarIndexes)
                {
                    hposCatToIndexes.Map(corpus.hpos[i], i);
                }
                if (hposCatToIndexes.Count == 1)
                {
                    continue;
                }
                if (report)
                {
                    Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
                }
                IList <int> catCounts   = BuffUtils.map(hposCatToIndexes.Values, (x) => x.size());
                double      hposEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
                if (report)
                {
                    Console.Write("entropy={0,5:F4}\n", hposEntropy);
                }
                hposEntropy *= exemplarIndexes.size();
                hpos_entropies.Add(hposEntropy);
                num_ambiguous_hpos_vectors += exemplarIndexes.size();
                if (report)
                {
                    Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_HPOS));
                }

                if (report)
                {
                    foreach (int cat in hposCatToIndexes.Keys)
                    {
                        var indexes = hposCatToIndexes[cat];
                        foreach (int i in indexes)
                        {
                            string display = getExemplarDisplay(Trainer.FEATURES_HPOS, corpus, corpus.hpos, i);
                            Console.WriteLine(display);
                        }
                        Console.WriteLine();
                    }
                }
            }
            Console.WriteLine();
            Console.WriteLine(language.name);
            Console.WriteLine("There are " + wsContextToIndex.Count + " unique ws feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * wsContextToIndex.Count / n));
            Console.WriteLine("There are " + hposContextToIndex.Count + " unique hpos feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * hposContextToIndex.Count / n));
            float prob_ws_ambiguous = num_ambiguous_ws_vectors / (float)n;

            Console.Write("num_ambiguous_ws_vectors   = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_ws_vectors, n, prob_ws_ambiguous);
            float prob_hpos_ambiguous = num_ambiguous_hpos_vectors / (float)n;

            Console.Write("num_ambiguous_hpos_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_hpos_vectors, n, prob_hpos_ambiguous);
            //		Collections.sort(ws_entropies);
            //		System.out.println("ws_entropies="+ws_entropies);
            Console.WriteLine("ws median,mean = " + BuffUtils.median(ws_entropies) + "," + BuffUtils.mean(ws_entropies));
            double expected_ws_entropy = (BuffUtils.sumDoubles(ws_entropies) / num_ambiguous_ws_vectors) * prob_ws_ambiguous;

            Console.WriteLine("expected_ws_entropy=" + expected_ws_entropy);

            Console.WriteLine("hpos median,mean = " + BuffUtils.median(hpos_entropies) + "," + BuffUtils.mean(hpos_entropies));
            double expected_hpos_entropy = (BuffUtils.sumDoubles(hpos_entropies) / num_ambiguous_hpos_vectors) * prob_hpos_ambiguous;

            Console.WriteLine("expected_hpos_entropy=" + expected_hpos_entropy);
        }
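
For reference, a sketch of the semantics assumed for the Entropy helpers used above: category counts become ratios, and the Shannon entropy of those ratios is normalized by log2 of the number of categories so the result lies in [0,1] (0 = unambiguous, 1 = evenly split). NormalizedCategoryEntropy is a hypothetical stand-in, not the library implementation.

        // Sketch of the assumed entropy computation; not the original Entropy class.
        public static double NormalizedCategoryEntropy(IList <int> catCounts)
        {
            double total = 0;
            foreach (int c in catCounts)
            {
                total += c;
            }
            double h = 0;
            foreach (int c in catCounts)
            {
                if (c == 0)
                {
                    continue;
                }
                double p = c / total;        // category ratio
                h += -p * Math.Log(p, 2);    // Shannon entropy in bits
            }
            // normalize by the maximum possible entropy, log2(K), so 1.0 = evenly split
            return catCounts.Count > 1 ? h / Math.Log(catCounts.Count, 2) : 0.0;
        }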