Example #1
        public static IList <float?> checkStability(LangDescriptor language)
        {
            IList <float?> errorRates = new List <float?>();

            // format the corpus into tmp dir
            LeaveOneOutValidator validator0 = new LeaveOneOutValidator(language.corpusDir, language);
            Triple <IList <Formatter>, IList <float>, IList <float> > results0 = validator0.validateDocuments(false, "/tmp/stability/1");

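            // stage 0: median error rate of the initial formatting pass
            // (results0.c holds the per-document error rates)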
            errorRates.Add(BuffUtils.median(results0.c));

            IList <Formatter> formatters0 = results0.a;

            // now try formatting it over and over
            for (int i = 1; i <= STAGES; i++)
            {
                string inputDir  = "/tmp/stability/" + i;
                string outputDir = "/tmp/stability/" + (i + 1);
                LeaveOneOutValidator validator = new LeaveOneOutValidator(inputDir, language);
                Triple <IList <Formatter>, IList <float>, IList <float> > results = validator.validateDocuments(false, outputDir);
                IList <Formatter> formatters = results.a;
                IList <float?>    distances  = new List <float?>();
                for (int j = 0; j < formatters.Count; j++)
                {
                    Formatter f0           = formatters0[j];
                    Formatter f            = formatters[j];
                    float     editDistance = Dbg.normalizedLevenshteinDistance(f.Output, f0.Output);
                    distances.Add(editDistance);
                }
                errorRates.Add(BuffUtils.median(distances));
            }

            return(errorRates);
        }
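
A hedged usage sketch for checkStability; the driver below is illustrative and assumes the static languages array that Example #2 iterates over:

        public static void Main(string[] args)
        {
            // Illustrative only: report the median error rate per stage
            // for each configured language.
            foreach (LangDescriptor language in languages)
            {
                IList <float?> rates = checkStability(language);
                for (int stage = 0; stage < rates.Count; stage++)
                {
                    Console.WriteLine(language.name + " stage " + stage + ": median error rate " + rates[stage]);
                }
            }
        }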
Example #2
        public static void Main(string[] args)
        {
            string         langname     = args[0].Substring(1);
            string         testFilename = args[1];
            LangDescriptor language     = null;

            for (int i = 0; i < languages.Length; i++)
            {
                if (languages[i].name.Equals(langname))
                {
                    language = languages[i];
                    break;
                }
            }
            if (language == null)
            {
                Log.WriteLine("Language " + langname + " unknown");
                return;
            }

            // load all files up front
            DateTime              load_start = System.DateTime.Now;
            IList <string>        allFiles   = Tool.getFilenames(language.corpusDir, language.fileRegex);
            IList <InputDocument> documents  = Tool.load(allFiles, language);
            DateTime              load_stop  = System.DateTime.Now;
            long                  load_time  = (long)(load_stop - load_start).TotalMilliseconds;

            Log.Write("Loaded {0:D} files in {1:D}ms\n", documents.Count, load_time);

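            // split the corpus: every document except the test file forms the training set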
            string path = System.IO.Path.GetFullPath(testFilename);
            IList <InputDocument> others   = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
            IList <InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));

            Debug.Assert(others.Count == documents.Count - 1);
            if (excluded.Count == 0)
            {
                Log.WriteLine("Doc not in corpus: " + path);
                return;
            }
            InputDocument testDoc = excluded[0];

            IList <int> training   = new List <int>();
            IList <int> formatting = new List <int>();

            for (int i = 1; i <= TRIALS; i++)
            {
                org.antlr.codebuff.misc.Pair <int, int> timing = test(language, others, testDoc);
                training.Add(timing.a);
                formatting.Add(timing.b);
            }
            // drop the first five warm-up trials (indexes 0-4); IList<T> has no
            // subList, so see the extension sketch after this example
            training   = training.subList(5, training.Count);
            formatting = formatting.subList(5, formatting.Count);
            Log.Write("median of [5:{0:D}] training {1:D}ms\n", TRIALS - 1, BuffUtils.median(training));
            Log.Write("median of [5:{0:D}] formatting {1:D}ms\n", TRIALS - 1, BuffUtils.median(formatting));
        }
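
Since IList<T> has no Java-style subList, the trimming step above needs a helper. A minimal sketch (the class and method names are mine, not the library's):

    // Extension methods must live in a top-level, non-generic static class.
    public static class ListUtils
    {
        // Java-style subList for IList<T>: returns a copy of elements
        // [fromIndex, toIndex), unlike Java's live view.
        public static IList <T> subList <T>(this IList <T> list, int fromIndex, int toIndex)
        {
            IList <T> slice = new List <T>();
            for (int i = fromIndex; i < toIndex; i++)
            {
                slice.Add(list[i]);
            }
            return(slice);
        }
    }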
Example #3
        public virtual Triple <IList <Formatter>, IList <float>, IList <float> > validateDocuments(FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures, bool computeEditDistance, string outputDir)
        {
            IList <Formatter> formatters = new List <Formatter>();
            IList <float>     distances  = new List <float>();
            IList <float>     errors     = new List <float>();

            System.DateTime start = System.DateTime.Now;
            try
            {
                IList <string>        allFiles          = Tool.getFilenames(rootDir, language.fileRegex);
                IList <InputDocument> documents         = Tool.load(allFiles, language);
                IList <InputDocument> parsableDocuments = BuffUtils.filter(documents, d => d.tree != null);
                System.DateTime       stop = System.DateTime.Now;
                //Console.Write("Load/parse all docs from {0} time {1:D} ms\n", rootDir, (stop - start) / 1000000);

                // use all cores by default; FORCE_SINGLE_THREADED pins us to one
                int ncpu = Environment.ProcessorCount;
                if (FORCE_SINGLE_THREADED)
                {
                    ncpu = 1;
                }

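                // leave-one-out validation of each parsable document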
                for (int i = 0; i < parsableDocuments.Count; i++)
                {
                    string fileName = parsableDocuments[i].fileName;
                    try
                    {
                        Triple <Formatter, float, float> results = validate(language, parsableDocuments, fileName,
                                                                            Formatter.DEFAULT_K, injectWSFeatures, alignmentFeatures, outputDir, computeEditDistance, false);
                        formatters.Add(results.a);
                        float editDistance = results.b;
                        distances.Add(editDistance);
                        float errorRate = results.c;
                        errors.Add(errorRate);
                    }
                    catch (Exception t)
                    {
                        // keep going; one bad document should not abort the run
                        System.Console.WriteLine(t.StackTrace);
                    }
                }
            }
            finally
            {
                DateTime final_stop            = System.DateTime.Now;
                // trainingTimes and formattingTokensPerMS are assumed to be
                // instance fields populated by validate()
                double   medianTrainingTime    = BuffUtils.median(trainingTimes);
                double   medianFormattingPerMS = BuffUtils.median(formattingTokensPerMS);
                Console.Write("Total time {0:F0}ms\n", (final_stop - start).TotalMilliseconds);
                Console.Write("Median training time {0:F0}ms\n", medianTrainingTime);
                Console.Write("Median formatting speed {0,5:F4} tokens/ms, min {1,5:F4} max {2,5:F4}\n", medianFormattingPerMS, BuffUtils.min(formattingTokensPerMS), BuffUtils.max(formattingTokensPerMS));
            }
            return(new Triple <IList <Formatter>, IList <float>, IList <float> >(formatters, distances, errors));
        }
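
Example #1 calls a two-argument overload of validateDocuments. A plausible sketch of that overload delegating to the version above, with the Trainer feature sets from Example #4 as assumed defaults:

        public virtual Triple <IList <Formatter>, IList <float>, IList <float> > validateDocuments(bool computeEditDistance, string outputDir)
        {
            // Assumed defaults: the whitespace-injection and alignment (hpos)
            // feature sets referenced in Example #4.
            return(validateDocuments(Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS, computeEditDistance, outputDir));
        }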
Example #4
        public static void computeConsistency(LangDescriptor language, bool report)
        {
            if (report)
            {
                Console.WriteLine("-----------------------------------");
                Console.WriteLine(language.name);
                Console.WriteLine("-----------------------------------");
            }
            Corpus corpus = new Corpus(language.corpusDir, language);

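            // train on the whole corpus; featureVectors, injectWhitespace, and
            // hpos used below are assumed to be populated by this call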
            corpus.train();
            // a map of feature vector to list of exemplar indexes of that feature
            MyMultiMap <FeatureVectorAsObject, int> wsContextToIndex   = new MyMultiMap <FeatureVectorAsObject, int>();
            MyMultiMap <FeatureVectorAsObject, int> hposContextToIndex = new MyMultiMap <FeatureVectorAsObject, int>();

            int n = corpus.featureVectors.Count;

            for (int i = 0; i < n; i++)
            {
                int[] features = corpus.featureVectors[i];
                wsContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), i);
                hposContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), i);
            }

            int num_ambiguous_ws_vectors   = 0;
            int num_ambiguous_hpos_vectors = 0;

            // Dump output grouped by ws vs hpos then feature vector then category
            if (report)
            {
                Console.WriteLine(" --- INJECT WS ---");
            }
            IList <double> ws_entropies = new List <double>();

            foreach (FeatureVectorAsObject fo in wsContextToIndex.Keys)
            {
                var exemplarIndexes = wsContextToIndex[fo];

                // we have grouped by feature vector; now group by category within that set for ws
                MyMultiMap <int, int> wsCatToIndexes = new MyMultiMap <int, int>();
                foreach (int i in exemplarIndexes)
                {
                    wsCatToIndexes.Map(corpus.injectWhitespace[i], i);
                }
                if (wsCatToIndexes.Count == 1)
                {
                    continue;
                }
                if (report)
                {
                    Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
                }
                IList <int> catCounts = BuffUtils.map(wsCatToIndexes.Values, (x) => x.size());
                double      wsEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
                if (report)
                {
                    Console.Write("entropy={0,5:F4}\n", wsEntropy);
                }
                wsEntropy *= exemplarIndexes.size();
                ws_entropies.Add(wsEntropy);
                num_ambiguous_ws_vectors += exemplarIndexes.size();
                if (report)
                {
                    Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_INJECT_WS));
                }

                if (report)
                {
                    foreach (int cat in wsCatToIndexes.Keys)
                    {
                        var indexes = wsCatToIndexes[cat];
                        foreach (int i in indexes)
                        {
                            string display = getExemplarDisplay(Trainer.FEATURES_INJECT_WS, corpus, corpus.injectWhitespace, i);
                            Console.WriteLine(display);
                        }
                        Console.WriteLine();
                    }
                }
            }

            if (report)
            {
                Console.WriteLine(" --- HPOS ---");
            }
            IList <double> hpos_entropies = new List <double>();

            foreach (FeatureVectorAsObject fo in hposContextToIndex.Keys)
            {
                MyHashSet <int> exemplarIndexes = hposContextToIndex[fo];

                // we have grouped by feature vector; now group by category within that set for hpos
                MyMultiMap <int, int> hposCatToIndexes = new MyMultiMap <int, int>();
                foreach (int i in exemplarIndexes)
                {
                    hposCatToIndexes.Map(corpus.hpos[i], i);
                }
                if (hposCatToIndexes.Count == 1)
                {
                    continue;
                }
                if (report)
                {
                    Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
                }
                IList <int> catCounts   = BuffUtils.map(hposCatToIndexes.Values, (x) => x.size());
                double      hposEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
                if (report)
                {
                    Console.Write("entropy={0,5:F4}\n", hposEntropy);
                }
                hposEntropy *= exemplarIndexes.size();
                hpos_entropies.Add(hposEntropy);
                num_ambiguous_hpos_vectors += exemplarIndexes.size();
                if (report)
                {
                    Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_HPOS));
                }

                if (report)
                {
                    foreach (int cat in hposCatToIndexes.Keys)
                    {
                        var indexes = hposCatToIndexes[cat];
                        foreach (int? i in indexes)
                        {
                            string display = getExemplarDisplay(Trainer.FEATURES_HPOS, corpus, corpus.hpos, i.Value);
                            Console.WriteLine(display);
                        }
                        Console.WriteLine();
                    }
                }
            }
            Console.WriteLine();
            Console.WriteLine(language.name);
            Console.WriteLine("There are " + wsContextToIndex.Count + " unique ws feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * wsContextToIndex.Count / n));
            Console.WriteLine("There are " + hposContextToIndex.Count + " unique hpos feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * hposContextToIndex.Count / n));
            float prob_ws_ambiguous = num_ambiguous_ws_vectors / (float)n;

            Console.Write("num_ambiguous_ws_vectors   = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_ws_vectors, n, prob_ws_ambiguous);
            float prob_hpos_ambiguous = num_ambiguous_hpos_vectors / (float)n;

            Console.Write("num_ambiguous_hpos_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_hpos_vectors, n, prob_hpos_ambiguous);
            //		Collections.sort(ws_entropies);
            //		System.out.println("ws_entropies="+ws_entropies);
            Console.WriteLine("ws median,mean = " + BuffUtils.median(ws_entropies) + "," + BuffUtils.mean(ws_entropies));
            double expected_ws_entropy = (BuffUtils.sumDoubles(ws_entropies) / num_ambiguous_ws_vectors) * prob_ws_ambiguous;

            Console.WriteLine("expected_ws_entropy=" + expected_ws_entropy);

            Console.WriteLine("hpos median,mean = " + BuffUtils.median(hpos_entropies) + "," + BuffUtils.mean(hpos_entropies));
            double expected_hpos_entropy = (BuffUtils.sumDoubles(hpos_entropies) / num_ambiguous_hpos_vectors) * prob_hpos_ambiguous;

            Console.WriteLine("expected_hpos_entropy=" + expected_hpos_entropy);
        }
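
A hedged driver for computeConsistency, again assuming the languages array from Example #2:

        public static void Main(string[] args)
        {
            // Illustrative only: print the consistency report for each language.
            foreach (LangDescriptor language in languages)
            {
                computeConsistency(language, true);
            }
        }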