/// <summary>
/// Trains a corpus on <paramref name="documents"/>, formats <paramref name="testDoc"/> with a
/// formatter built from that corpus, and reports how the result compares with the original.
/// </summary>
/// <param name="language">Language descriptor; supplies the indent size and the output subdirectory name.</param>
/// <param name="documents">Documents used to build and train the corpus.</param>
/// <param name="testDoc">Document to format and analyze.</param>
/// <param name="saveOutput">When true, the formatted text is written under <c>outputDir/language.name</c>.</param>
/// <param name="computeEditDistance">When true, the normalized Levenshtein distance between the original content and the formatted output is computed; otherwise 0 is reported.</param>
/// <returns>A triple of (formatter, edit distance, classification error rate).</returns>
public static Triple<Formatter, float, float> validate(LangDescriptor language, IList<InputDocument> documents, InputDocument testDoc, bool saveOutput, bool computeEditDistance)
{
    Corpus corpus = new Corpus(documents, language);
    corpus.train();

    Formatter formatter = new Formatter(corpus, language.indentSize);
    string output = formatter.format(testDoc, false);

    float editDistance = 0;
    if (computeEditDistance)
    {
        editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
    }

    ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.AnalysisPerToken);

    if (saveOutput)
    {
        // outputDir is declared outside this method.
        string dir = outputDir + "/" + language.name;
        // CreateDirectory is a no-op when the directory already exists and also creates
        // any missing parent directories.
        System.IO.Directory.CreateDirectory(dir);
        org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
    }

    // The constructed type must match the declared return type (non-nullable floats).
    return new Triple<Formatter, float, float>(formatter, editDistance, analysis.ErrorRate);
}
/// <summary>
/// Leave-one-out validation of a single file: trains a corpus on every document except
/// <paramref name="fileToExclude"/>, formats the excluded document, and reports the edit
/// distance and classification error rate. Training time and formatting throughput are
/// appended to <c>trainingTimes</c> and <c>formattingTokensPerMS</c>.
/// </summary>
/// <param name="language">Language descriptor; supplies the indent size and the output subdirectory name.</param>
/// <param name="documents">All documents, including the one to exclude.</param>
/// <param name="fileToExclude">Path of the document to hold out; resolved to a full path and compared against each document's <c>fileName</c>.</param>
/// <param name="k">Passed through to the <c>Formatter</c> constructor.</param>
/// <param name="injectWSFeatures">Passed through to the <c>Formatter</c> constructor.</param>
/// <param name="alignmentFeatures">Passed through to the <c>Formatter</c> constructor.</param>
/// <param name="outputDir">When non-null, the formatted text is written under <c>outputDir/language.name/Tool.version</c>.</param>
/// <param name="computeEditDistance">When true, the normalized Levenshtein distance between the original content and the formatted output is computed; otherwise 0 is reported.</param>
/// <param name="collectAnalysis">Passed through to <c>Formatter.format</c>.</param>
/// <returns>
/// A triple of (formatter, edit distance, classification error rate), or <c>null</c> when no
/// document in <paramref name="documents"/> matches <paramref name="fileToExclude"/>.
/// </returns>
public virtual Triple<Formatter, float, float> validate(LangDescriptor language, IList<InputDocument> documents, string fileToExclude, int k, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures, string outputDir, bool computeEditDistance, bool collectAnalysis)
{
    string path = System.IO.Path.GetFullPath(fileToExclude);
    IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
    IList<InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));

    // Handle the "not found" case first; the size assertion below only holds when
    // exactly one document was excluded.
    if (excluded.Count == 0)
    {
        Console.Error.WriteLine("Doc not in corpus: " + path);
        return null;
    }
    Debug.Assert(others.Count == documents.Count - 1);

    InputDocument testDoc = excluded[0];

    // Stopwatch is monotonic and higher resolution than sampling the wall clock.
    System.Diagnostics.Stopwatch trainingWatch = System.Diagnostics.Stopwatch.StartNew();
    Corpus corpus = new Corpus(others, language);
    corpus.train();
    trainingWatch.Stop();

    Formatter formatter = new Formatter(corpus, language.indentSize, k, injectWSFeatures, alignmentFeatures);

    System.Diagnostics.Stopwatch formatWatch = System.Diagnostics.Stopwatch.StartNew();
    string output = formatter.format(testDoc, collectAnalysis);
    formatWatch.Stop();

    float editDistance = 0;
    if (computeEditDistance)
    {
        editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
    }

    ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.AnalysisPerToken);
    Console.WriteLine(testDoc.fileName + ": edit distance = " + editDistance + ", error rate = " + analysis.ErrorRate);

    if (outputDir != null)
    {
        string dir = outputDir + "/" + language.name + "/" + Tool.version;
        if (!System.IO.Directory.Exists(dir))
        {
            System.IO.Directory.CreateDirectory(dir);
        }
        org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
    }

    // Total elapsed milliseconds, not the 0-999 "Milliseconds" component of a TimeSpan.
    long trainingMs = trainingWatch.ElapsedMilliseconds;
    long formattingMs = formatWatch.ElapsedMilliseconds;
    trainingTimes.Add((double)trainingMs);

    // Fractional milliseconds keep the throughput meaningful for very fast runs.
    float tokensPerMS = testDoc.tokens.Size / (float)formatWatch.Elapsed.TotalMilliseconds;
    formattingTokensPerMS.Add((double)tokensPerMS);

    // The "D" format specifier requires integral arguments; TimeSpan values would throw FormatException.
    Console.Write("Training time = {0:D} ms, formatting {1:D} ms, {2,5:F3} tokens/ms ({3:D} tokens)\n", trainingMs, formattingMs, tokensPerMS, testDoc.tokens.Size);

    return new Triple<Formatter, float, float>(formatter, editDistance, analysis.ErrorRate);
}
/// <summary>
/// Runs a <c>LeaveOneOutValidator</c> over <c>language.corpusDir</c> and collects, for each
/// formatter it returns, the alignment error rate of a fresh <c>ClassificationAnalysis</c>
/// built from that formatter's test document and per-token analysis.
/// </summary>
/// <param name="language">Language descriptor; supplies the corpus directory.</param>
/// <param name="injectWSFeatures">Passed through to <c>validateDocuments</c>.</param>
/// <param name="alignmentFeatures">Passed through to <c>validateDocuments</c>.</param>
/// <returns>One alignment error rate per formatter, in the order the formatters were returned.</returns>
public static IList<float> getAlignmentErrorRates(LangDescriptor language, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures)
{
    LeaveOneOutValidator looValidator = new LeaveOneOutValidator(language.corpusDir, language);
    Triple<IList<Formatter>, IList<float>, IList<float>> outcome =
        looValidator.validateDocuments(injectWSFeatures, alignmentFeatures, false, null);

    // Only the formatters (first component) are used; the other two components are ignored.
    IList<Formatter> perDocFormatters = outcome.a;
    IList<float> rates = new List<float>();
    for (int i = 0; i < perDocFormatters.Count; i++)
    {
        Formatter current = perDocFormatters[i];
        ClassificationAnalysis perDocAnalysis = new ClassificationAnalysis(current.testDoc, current.AnalysisPerToken);
        rates.Add(perDocAnalysis.AlignmentErrorRate);
    }
    return rates;
}