コード例 #1
0
        /// <summary>
        /// Trains a corpus on <paramref name="documents"/>, formats <paramref name="testDoc"/> with the
        /// resulting formatter, and reports how the formatted text compares with the original.
        /// </summary>
        /// <param name="language">Language descriptor; supplies the indent size and the output sub-directory name.</param>
        /// <param name="documents">Documents used to build and train the corpus.</param>
        /// <param name="testDoc">Document to format and analyze.</param>
        /// <param name="saveOutput">When true, the formatted text is written under <c>outputDir/language.name</c>.</param>
        /// <param name="computeEditDistance">When true, the edit distance between the original and formatted text is computed; otherwise it is reported as 0.</param>
        /// <returns>
        /// A triple of the formatter, the value returned by <c>Dbg.normalizedLevenshteinDistance</c>
        /// (or 0 when not requested), and <c>analysis.ErrorRate</c>.
        /// </returns>
        public static Triple <Formatter, float, float> validate(LangDescriptor language, IList <InputDocument> documents, InputDocument testDoc, bool saveOutput, bool computeEditDistance)
        {
            Corpus corpus = new Corpus(documents, language);

            corpus.train();
            Formatter formatter    = new Formatter(corpus, language.indentSize);
            string    output       = formatter.format(testDoc, false);
            float     editDistance = 0;

            if (computeEditDistance)
            {
                editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
            }
            ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.AnalysisPerToken);

            if (saveOutput)
            {
                // outputDir is declared outside this method (presumably a string path -- TODO confirm).
                string dir = outputDir + "/" + language.name;

                // CreateDirectory is a no-op when the directory already exists.
                System.IO.Directory.CreateDirectory(dir);
                org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
            }
            // The type arguments must match the declared return type exactly (float, not float?).
            return(new Triple <Formatter, float, float>(formatter, editDistance, analysis.ErrorRate));
        }
コード例 #2
0
        /// <summary>
        /// Leave-one-out validation step: trains on every document except <paramref name="fileToExclude"/>,
        /// formats the excluded document, and records timing statistics.
        /// </summary>
        /// <param name="language">Language descriptor; supplies the indent size and the output sub-directory name.</param>
        /// <param name="documents">All corpus documents, including the one to exclude.</param>
        /// <param name="fileToExclude">Path of the document to hold out; resolved to a full path before matching on <c>fileName</c>.</param>
        /// <param name="k">Passed through to the <c>Formatter</c> constructor.</param>
        /// <param name="injectWSFeatures">Passed through to the <c>Formatter</c> constructor.</param>
        /// <param name="alignmentFeatures">Passed through to the <c>Formatter</c> constructor.</param>
        /// <param name="outputDir">When non-null, the formatted text is written under <c>outputDir/language.name/Tool.version</c>.</param>
        /// <param name="computeEditDistance">When true, the edit distance between the original and formatted text is computed; otherwise it is reported as 0.</param>
        /// <param name="collectAnalysis">Passed through to <c>formatter.format</c>.</param>
        /// <returns>
        /// A triple of the formatter, the edit distance and <c>analysis.ErrorRate</c>, or <c>null</c>
        /// when no document matches <paramref name="fileToExclude"/>.
        /// </returns>
        public virtual Triple <Formatter, float, float> validate(LangDescriptor language, IList <InputDocument> documents, string fileToExclude, int k, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures, string outputDir, bool computeEditDistance, bool collectAnalysis)
        {
            string path = System.IO.Path.GetFullPath(fileToExclude);
            IList <InputDocument> others   = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
            IList <InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));

            // Report a missing document first; asserting before this check would pre-empt
            // the error message in debug builds.
            if (excluded.Count == 0)
            {
                Console.Error.WriteLine("Doc not in corpus: " + path);
                return(null);
            }
            Debug.Assert(others.Count == documents.Count - 1);
            InputDocument testDoc = excluded[0];

            // Stopwatch measures elapsed time; wall-clock DateTime.Now can jump.
            var    trainTimer = System.Diagnostics.Stopwatch.StartNew();
            Corpus corpus     = new Corpus(others, language);

            corpus.train();
            trainTimer.Stop();

            Formatter formatter   = new Formatter(corpus, language.indentSize, k, injectWSFeatures, alignmentFeatures);
            var       formatTimer = System.Diagnostics.Stopwatch.StartNew();
            string    output      = formatter.format(testDoc, collectAnalysis);

            formatTimer.Stop();
            float editDistance = 0;

            if (computeEditDistance)
            {
                editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
            }
            ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.AnalysisPerToken);

            Console.WriteLine(testDoc.fileName + ": edit distance = " + editDistance + ", error rate = " + analysis.ErrorRate);
            if (outputDir != null)
            {
                string dir = outputDir + "/" + language.name + "/" + Tool.version;

                // CreateDirectory is a no-op when the directory already exists.
                System.IO.Directory.CreateDirectory(dir);
                org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
            }

            // Use the total duration: TimeSpan.Milliseconds is only the 0-999 component.
            double trainingMs   = trainTimer.Elapsed.TotalMilliseconds;
            double formattingMs = formatTimer.Elapsed.TotalMilliseconds;

            trainingTimes.Add(trainingMs);
            float tokensPerMS = testDoc.tokens.Size / (float)formattingMs;

            formattingTokensPerMS.Add((double)tokensPerMS);
            // The "D" specifier is valid only for integral values, so pass whole milliseconds
            // rather than TimeSpan instances (which would throw FormatException).
            Console.Write("Training time = {0:D} ms, formatting {1:D} ms, {2,5:F3} tokens/ms ({3:D} tokens)\n", (long)trainingMs, (long)formattingMs, tokensPerMS, testDoc.tokens.Size);
            return(new Triple <Formatter, float, float>(formatter, editDistance, analysis.ErrorRate));
        }
コード例 #3
0
        /// <summary>
        /// Runs leave-one-out validation over the corpus in <c>language.corpusDir</c> and collects the
        /// alignment error rate of each resulting formatter.
        /// </summary>
        /// <param name="language">Language descriptor; supplies the corpus directory.</param>
        /// <param name="injectWSFeatures">Passed through to <c>validateDocuments</c>.</param>
        /// <param name="alignmentFeatures">Passed through to <c>validateDocuments</c>.</param>
        /// <returns>One <c>AlignmentErrorRate</c> per formatter, in the order the formatters were returned.</returns>
        public static IList <float> getAlignmentErrorRates(LangDescriptor language, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures)
        {
            LeaveOneOutValidator looValidator = new LeaveOneOutValidator(language.corpusDir, language);
            Triple <IList <Formatter>, IList <float>, IList <float> > outcome =
                looValidator.validateDocuments(injectWSFeatures, alignmentFeatures, false, null);

            // Only the formatters (first element) are needed; the rates are recomputed per formatter below.
            IList <float> rates = new List <float>();

            foreach (Formatter trained in outcome.a)
            {
                rates.Add(new ClassificationAnalysis(trained.testDoc, trained.AnalysisPerToken).AlignmentErrorRate);
            }
            return rates;
        }