public static IList<float?> checkStability(LangDescriptor language)
{
	IList<float?> errorRates = new List<float?>();

	// format the corpus into tmp dir; stage 0 gives the baseline formatters
	LeaveOneOutValidator validator0 = new LeaveOneOutValidator(language.corpusDir, language);
	Triple<IList<Formatter>, IList<float>, IList<float>> results0 =
		validator0.validateDocuments(false, "/tmp/stability/1");
	errorRates.Add(BuffUtils.median(results0.c));
	IList<Formatter> formatters0 = results0.a;

	// now try formatting it over and over, each stage reformatting the
	// previous stage's output and measuring drift from the stage-0 output
	for (int i = 1; i <= STAGES; i++)
	{
		string inputDir = "/tmp/stability/" + i;
		string outputDir = "/tmp/stability/" + (i + 1);
		LeaveOneOutValidator validator = new LeaveOneOutValidator(inputDir, language);
		Triple<IList<Formatter>, IList<float>, IList<float>> results =
			validator.validateDocuments(false, outputDir);
		IList<Formatter> formatters = results.a;
		IList<float?> distances = new List<float?>();
		for (int j = 0; j < formatters.Count; j++)
		{
			Formatter f0 = formatters0[j];
			Formatter f = formatters[j];
			float editDistance = Dbg.normalizedLevenshteinDistance(f.Output, f0.Output);
			distances.Add(editDistance);
		}
		errorRates.Add(BuffUtils.median(distances));
	}
	return errorRates;
}
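// Usage sketch (hypothetical driver, not in the original source): prints the
// per-stage medians produced by checkStability. The first entry is the median
// error rate of the initial formatting pass; each later entry is the median
// normalized Levenshtein distance between stage-i output and stage-0 output,
// so a stable formatter should show values near 0 after the first entry.
public static void demoStability(LangDescriptor language)
{
	IList<float?> rates = checkStability(language);
	for (int i = 0; i < rates.Count; i++)
	{
		Console.Write("stage {0}: median distance {1,5:F4}\n", i, rates[i]);
	}
}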
public static void Main(string[] args)
{
	string langname = args[0].Substring(1);
	string testFilename = args[1];
	LangDescriptor language = null;
	for (int i = 0; i < languages.Length; i++)
	{
		if (languages[i].name.Equals(langname))
		{
			language = languages[i];
			break;
		}
	}
	if (language == null)
	{
		Log.WriteLine("Language " + langname + " unknown");
		return;
	}

	// load all files up front
	DateTime load_start = System.DateTime.Now;
	IList<string> allFiles = Tool.getFilenames(language.corpusDir, language.fileRegex);
	IList<InputDocument> documents = Tool.load(allFiles, language);
	DateTime load_stop = System.DateTime.Now;
	long load_time = (long)(load_stop - load_start).TotalMilliseconds;
	Log.Write("Loaded {0:D} files in {1:D}ms\n", documents.Count, load_time);

	string path = System.IO.Path.GetFullPath(testFilename);
	IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
	IList<InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));
	Debug.Assert(others.Count == documents.Count - 1);
	if (excluded.Count == 0)
	{
		Log.WriteLine("Doc not in corpus: " + path);
		return;
	}
	InputDocument testDoc = excluded[0];

	IList<int> training = new List<int>();
	IList<int> formatting = new List<int>();
	for (int i = 1; i <= TRIALS; i++)
	{
		org.antlr.codebuff.misc.Pair<int, int> timing = test(language, others, testDoc);
		training.Add(timing.a);
		formatting.Add(timing.b);
	}

	// drop first five trials as warm-up; requires using System.Linq
	training = new List<int>(training.Skip(5));
	formatting = new List<int>(formatting.Skip(5));
	Log.Write("median of [5:{0:D}] training {1:D}ms\n", TRIALS - 1, BuffUtils.median(training));
	Log.Write("median of [5:{0:D}] formatting {1:D}ms\n", TRIALS - 1, BuffUtils.median(formatting));
}
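// Invocation sketch (the binary name below is an assumption; the argument
// shape follows the parsing above: "-<langname> <testFilename>"):
//
//   $ mono TestPerf.exe -java /path/to/corpus/java/Foo.java
//
// args[0].Substring(1) strips the leading '-' to get the language name, and
// args[1] must resolve to a file inside language.corpusDir, otherwise Main()
// exits with "Doc not in corpus".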
public virtual Triple<IList<Formatter>, IList<float>, IList<float>> validateDocuments(FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures, bool computeEditDistance, string outputDir)
{
	IList<Formatter> formatters = new List<Formatter>();
	IList<float> distances = new List<float>();
	IList<float> errors = new List<float>();
	System.DateTime start = System.DateTime.Now;
	try
	{
		IList<string> allFiles = Tool.getFilenames(rootDir, language.fileRegex);
		IList<InputDocument> documents = Tool.load(allFiles, language);
		IList<InputDocument> parsableDocuments = BuffUtils.filter(documents, d => d.tree != null);
		System.DateTime stop = System.DateTime.Now;
		//Console.Write("Load/parse all docs from {0} time {1:F0} ms\n", rootDir, (stop - start).TotalMilliseconds);

		// ncpu is computed but unused: this port runs the loop serially
		int ncpu = Environment.ProcessorCount;
		if (FORCE_SINGLE_THREADED)
		{
			ncpu = 1;
		}
		for (int i = 0; i < parsableDocuments.Count; i++)
		{
			string fileName = parsableDocuments[i].fileName;
			try
			{
				Triple<Formatter, float, float> results =
					validate(language, parsableDocuments, fileName, Formatter.DEFAULT_K,
					         injectWSFeatures, alignmentFeatures, outputDir, computeEditDistance, false);
				formatters.Add(results.a);
				float editDistance = results.b;
				distances.Add(editDistance);
				float errorRate = results.c;
				errors.Add(errorRate);
			}
			catch (Exception t)
			{
				// skip documents that fail to format; keep validating the rest
				System.Console.WriteLine(t.StackTrace);
			}
		}
	}
	finally
	{
		DateTime final_stop = System.DateTime.Now;
		double medianTrainingTime = BuffUtils.median(trainingTimes);
		double medianFormattingPerMS = BuffUtils.median(formattingTokensPerMS);
		Console.Write("Total time {0:F0}ms\n", (final_stop - start).TotalMilliseconds);
		Console.Write("Median training time {0:F0}ms\n", medianTrainingTime);
		Console.Write("Median formatting time tokens per ms {0,5:F4}ms, min {1,5:F4} max {2,5:F4}\n",
		              medianFormattingPerMS, BuffUtils.min(formattingTokensPerMS), BuffUtils.max(formattingTokensPerMS));
	}
	return new Triple<IList<Formatter>, IList<float>, IList<float>>(formatters, distances, errors);
}
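// Hypothetical caller (not in the original source): run leave-one-out
// validation over a whole corpus with the feature sets used elsewhere in
// this codebase and report the median error rate. The output directory
// "/tmp/validation" is an arbitrary choice for the sketch.
public static void printMedianErrorRate(LangDescriptor language)
{
	LeaveOneOutValidator validator = new LeaveOneOutValidator(language.corpusDir, language);
	Triple<IList<Formatter>, IList<float>, IList<float>> results =
		validator.validateDocuments(Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS,
		                            true, "/tmp/validation");
	Console.Write("median error rate {0,5:F4}\n", BuffUtils.median(results.c));
}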
public static void computeConsistency(LangDescriptor language, bool report)
{
	if (report)
	{
		Console.WriteLine("-----------------------------------");
		Console.WriteLine(language.name);
		Console.WriteLine("-----------------------------------");
	}
	Corpus corpus = new Corpus(language.corpusDir, language);
	corpus.train();

	// a map of feature vector to list of exemplar indexes with that feature vector
	MyMultiMap<FeatureVectorAsObject, int> wsContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();
	MyMultiMap<FeatureVectorAsObject, int> hposContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();

	int n = corpus.featureVectors.Count;
	for (int i = 0; i < n; i++)
	{
		int[] features = corpus.featureVectors[i];
		wsContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), i);
		hposContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), i);
	}

	int num_ambiguous_ws_vectors = 0;
	int num_ambiguous_hpos_vectors = 0;

	// Dump output grouped by ws vs hpos, then feature vector, then category
	if (report)
	{
		Console.WriteLine(" --- INJECT WS ---");
	}
	IList<double> ws_entropies = new List<double>();
	foreach (FeatureVectorAsObject fo in wsContextToIndex.Keys)
	{
		var exemplarIndexes = wsContextToIndex[fo];

		// we have grouped by feature vector; now group that set by ws category
		MyMultiMap<int, int> wsCatToIndexes = new MyMultiMap<int, int>();
		foreach (int i in exemplarIndexes)
		{
			wsCatToIndexes.Map(corpus.injectWhitespace[i], i);
		}
		if (wsCatToIndexes.Count == 1)
		{
			continue; // unambiguous: all exemplars with this vector agree
		}
		if (report)
		{
			Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
		}
		IList<int> catCounts = BuffUtils.map(wsCatToIndexes.Values, (x) => x.size());
		double wsEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
		if (report)
		{
			Console.Write("entropy={0,5:F4}\n", wsEntropy);
		}
		wsEntropy *= exemplarIndexes.size(); // weight entropy by exemplar count
		ws_entropies.Add(wsEntropy);
		num_ambiguous_ws_vectors += exemplarIndexes.size();
		if (report)
		{
			Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_INJECT_WS));
			foreach (int cat in wsCatToIndexes.Keys)
			{
				var indexes = wsCatToIndexes[cat];
				foreach (int i in indexes)
				{
					string display = getExemplarDisplay(Trainer.FEATURES_INJECT_WS, corpus, corpus.injectWhitespace, i);
					Console.WriteLine(display);
				}
				Console.WriteLine();
			}
		}
	}

	if (report)
	{
		Console.WriteLine(" --- HPOS ---");
	}
	IList<double> hpos_entropies = new List<double>();
	foreach (FeatureVectorAsObject fo in hposContextToIndex.Keys)
	{
		MyHashSet<int> exemplarIndexes = hposContextToIndex[fo];

		// we have grouped by feature vector; now group that set by hpos category
		MyMultiMap<int, int> hposCatToIndexes = new MyMultiMap<int, int>();
		foreach (int i in exemplarIndexes)
		{
			hposCatToIndexes.Map(corpus.hpos[i], i);
		}
		if (hposCatToIndexes.Count == 1)
		{
			continue;
		}
		if (report)
		{
			Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
		}
		IList<int> catCounts = BuffUtils.map(hposCatToIndexes.Values, (x) => x.size());
		double hposEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
		if (report)
		{
			Console.Write("entropy={0,5:F4}\n", hposEntropy);
		}
		hposEntropy *= exemplarIndexes.size();
		hpos_entropies.Add(hposEntropy);
		num_ambiguous_hpos_vectors += exemplarIndexes.size();
		if (report)
		{
			Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_HPOS));
			foreach (int cat in hposCatToIndexes.Keys)
			{
				var indexes = hposCatToIndexes[cat];
				foreach (int? i in indexes)
				{
					string display = getExemplarDisplay(Trainer.FEATURES_HPOS, corpus, corpus.hpos, i.Value);
					Console.WriteLine(display);
				}
				Console.WriteLine();
			}
		}
	}

	Console.WriteLine();
	Console.WriteLine(language.name);
	Console.WriteLine("There are " + wsContextToIndex.Count + " unique ws feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * wsContextToIndex.Count / n));
	Console.WriteLine("There are " + hposContextToIndex.Count + " unique hpos feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * hposContextToIndex.Count / n));
	float prob_ws_ambiguous = num_ambiguous_ws_vectors / (float)n;
	Console.Write("num_ambiguous_ws_vectors   = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_ws_vectors, n, prob_ws_ambiguous);
	float prob_hpos_ambiguous = num_ambiguous_hpos_vectors / (float)n;
	Console.Write("num_ambiguous_hpos_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_hpos_vectors, n, prob_hpos_ambiguous);
	Console.WriteLine("ws median,mean = " + BuffUtils.median(ws_entropies) + "," + BuffUtils.mean(ws_entropies));
	double expected_ws_entropy = (BuffUtils.sumDoubles(ws_entropies) / num_ambiguous_ws_vectors) * prob_ws_ambiguous;
	Console.WriteLine("expected_ws_entropy=" + expected_ws_entropy);
	Console.WriteLine("hpos median,mean = " + BuffUtils.median(hpos_entropies) + "," + BuffUtils.mean(hpos_entropies));
	double expected_hpos_entropy = (BuffUtils.sumDoubles(hpos_entropies) / num_ambiguous_hpos_vectors) * prob_hpos_ambiguous;
	Console.WriteLine("expected_hpos_entropy=" + expected_hpos_entropy);
}
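// Worked sketch (hypothetical, not in the original source): what the entropy
// weighting above computes for one ambiguous feature vector. Suppose a ws
// feature vector has 10 exemplars split 7/3 between two whitespace categories.
// getCategoryRatios gives {0.7, 0.3}; assuming the normalization divides by
// log2 of the number of categories, the normalized entropy is
// -(0.7*log2(0.7) + 0.3*log2(0.3)) / log2(2) ≈ 0.8813. computeConsistency then
// weights this by the exemplar count (10 * 0.8813 = 8.813) before averaging
// over num_ambiguous_ws_vectors and scaling by prob_ws_ambiguous to get
// expected_ws_entropy.
public static void entropyExample()
{
	IList<int> catCounts = new List<int> { 7, 3 };
	double e = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
	Console.Write("normalized entropy for a 7/3 split = {0,5:F4}\n", e); // expect ≈ 0.8813
}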