public static Triple<Formatter, float, float> validate(LangDescriptor language, IList<InputDocument> documents, InputDocument testDoc, bool saveOutput, bool computeEditDistance)
{
    // kNNClassifier.resetCache();
    Corpus corpus = new Corpus(documents, language);
    corpus.train();
    // System.out.printf("%d feature vectors\n", corpus.featureVectors.size());
    Formatter formatter = new Formatter(corpus, language.indentSize);
    string output = formatter.format(testDoc, false);
    float editDistance = 0;
    if (computeEditDistance)
    {
        editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
    }
    ClassificationAnalysis analysis = new ClassificationAnalysis(testDoc, formatter.AnalysisPerToken);
    // System.out.println(testDoc.fileName+": edit distance = "+editDistance+", error rate = "+analysis.getErrorRate());
    if (saveOutput)
    {
        string dir = outputDir + "/" + language.name;
        if (!System.IO.Directory.Exists(dir))
        {
            System.IO.Directory.CreateDirectory(dir);
        }
        org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
    }
    return new Triple<Formatter, float, float>(formatter, editDistance, analysis.ErrorRate);
}
public virtual Triple<Formatter, float, float> validate(LangDescriptor language, IList<InputDocument> documents, string fileToExclude, int k, FeatureMetaData[] injectWSFeatures, FeatureMetaData[] alignmentFeatures, string outputDir, bool computeEditDistance, bool collectAnalysis)
{
    string path = System.IO.Path.GetFullPath(fileToExclude);
    IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
    IList<InputDocument> excluded = BuffUtils.filter(documents, d => d.fileName.Equals(path));
    Debug.Assert(others.Count == documents.Count - 1);
    // kNNClassifier.resetCache();
    if (excluded.Count == 0)
    {
        Console.Error.WriteLine("Doc not in corpus: " + path);
        return null;
    }
    InputDocument testDoc = excluded[0];

    DateTime start = System.DateTime.Now;
    Corpus corpus = new Corpus(others, language);
    corpus.train();
    DateTime stop = System.DateTime.Now;

    Formatter formatter = new Formatter(corpus, language.indentSize, k, injectWSFeatures, alignmentFeatures);
    InputDocument originalDoc = testDoc;
    DateTime format_start = System.DateTime.Now;
    string output = formatter.format(testDoc, collectAnalysis);
    DateTime format_stop = System.DateTime.Now;

    float editDistance = 0;
    if (computeEditDistance)
    {
        editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
    }
    ClassificationAnalysis analysis = new ClassificationAnalysis(originalDoc, formatter.AnalysisPerToken);
    Console.WriteLine(testDoc.fileName + ": edit distance = " + editDistance + ", error rate = " + analysis.ErrorRate);

    if (outputDir != null)
    {
        string dir = outputDir + "/" + language.name + "/" + Tool.version;
        if (!System.IO.Directory.Exists(dir))
        {
            System.IO.Directory.CreateDirectory(dir);
        }
        org.antlr.codebuff.misc.Utils.writeFile(dir + "/" + System.IO.Path.GetFileName(testDoc.fileName), output);
    }

    TimeSpan tms = stop - start;
    TimeSpan fms = format_stop - format_start;
    trainingTimes.Add(tms.TotalMilliseconds); // TotalMilliseconds, not Milliseconds, which only reports the 0-999 ms component
    float tokensPerMS = testDoc.tokens.Size / (float)fms.TotalMilliseconds;
    formattingTokensPerMS.Add((double)tokensPerMS);
    Console.Write("Training time = {0:D} ms, formatting {1:D} ms, {2,5:F3} tokens/ms ({3:D} tokens)\n",
                  (int)tms.TotalMilliseconds, (int)fms.TotalMilliseconds, tokensPerMS, testDoc.tokens.Size);
    // System.out.printf("classify calls %d, hits %d rate %f\n",
    //     kNNClassifier.nClassifyCalls, kNNClassifier.nClassifyCacheHits,
    //     kNNClassifier.nClassifyCacheHits/(float) kNNClassifier.nClassifyCalls);
    // System.out.printf("kNN calls %d, hits %d rate %f\n",
    //     kNNClassifier.nNNCalls, kNNClassifier.nNNCacheHits,
    //     kNNClassifier.nNNCacheHits/(float) kNNClassifier.nNNCalls);
    return new Triple<Formatter, float, float>(formatter, editDistance, analysis.ErrorRate);
}
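// Sketch (not part of the original source): one plausible way to drive the
// leave-one-out overload above across an entire corpus. It reuses
// Tool.getFilenames/Tool.load, Formatter.DEFAULT_K and the Trainer feature
// arrays that appear elsewhere in this section; the driver method itself and
// its name are assumptions, not CodeBuff API.
public virtual void validateAllFiles(LangDescriptor language, string outputDir)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<InputDocument> documents = Tool.load(filenames, language);
    foreach (string fileName in filenames)
    {
        // exclude each file from training, then format it and report the edit distance
        validate(language, documents, fileName, Formatter.DEFAULT_K,
                 Trainer.FEATURES_INJECT_WS, Trainer.FEATURES_HPOS,
                 outputDir, true, false);
    }
}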
public static void runCaptureForOneLanguage(LangDescriptor language)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<InputDocument> documents = Tool.load(filenames, language);
    foreach (string fileName in filenames)
    {
        // Examine info for this file in isolation
        Corpus fileCorpus = new Corpus(fileName, language);
        fileCorpus.train();
        Console.WriteLine(fileName);
        // examineCorpus(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> ws = getWSContextCategoryMap(fileCorpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> hpos = getHPosContextCategoryMap(fileCorpus);

        // Compare with corpus minus this file
        string path = fileName;
        IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(path));
        Corpus corpus = new Corpus(others, language);
        corpus.train();
        // examineCorpus(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> corpus_ws = getWSContextCategoryMap(corpus);
        ArrayListMultiMap<FeatureVectorAsObject, int> corpus_hpos = getHPosContextCategoryMap(corpus);

        foreach (FeatureVectorAsObject x in ws.Keys)
        {
            HashBag<int> fwsCats = getCategoriesBag(ws[x]);
            IList<float> fwsRatios = getCategoryRatios(fwsCats.Values);
            HashBag<int> wsCats = getCategoriesBag(corpus_ws[x]);
            IList<float> wsRatios = getCategoryRatios(wsCats.Values);
            // compare file predictions with corpus predictions
            if (!fwsRatios.SequenceEqual(wsRatios))
            {
                // join the ratios explicitly; printing the lists directly would only show their type names
                Console.WriteLine("[" + string.Join(", ", fwsRatios) + "] vs [" + string.Join(", ", wsRatios) + "]");
            }
            HashBag<int> fhposCats = getCategoriesBag(hpos[x]);
            HashBag<int> hposCats = getCategoriesBag(corpus_hpos[x]);
        }
        break; // only the first file is examined
    }
}
public static org.antlr.codebuff.misc.Pair<int, int> test(LangDescriptor language, IList<InputDocument> others, InputDocument testDoc)
{
    var train_start = System.DateTime.Now;
    Corpus corpus = new Corpus(others, language);
    corpus.train();
    var train_stop = System.DateTime.Now;

    var format_start = System.DateTime.Now;
    Formatter formatter = new Formatter(corpus, language.indentSize, Formatter.DEFAULT_K, FEATURES_INJECT_WS, FEATURES_HPOS);
    formatter.format(testDoc, false);
    var format_stop = System.DateTime.Now;

    // DateTime subtraction yields a TimeSpan; report whole milliseconds
    long train_time = (long)(train_stop - train_start).TotalMilliseconds;
    long format_time = (long)(format_stop - format_start).TotalMilliseconds;
    Log.Write("{0} training of {1} = {2:D}ms formatting = {3:D}ms\n", language.name, testDoc.fileName, train_time, format_time);
    return new org.antlr.codebuff.misc.Pair<int, int>((int)train_time, (int)format_time);
}
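// Sketch (not part of the original source): a hypothetical driver that runs
// the timing test above over every corpus file with a leave-one-out split and
// reports the mean training and formatting times. Tool.getFilenames, Tool.load
// and BuffUtils.filter are used as elsewhere in this section; the assumption
// that Pair exposes its two values as fields a and b may not match the port.
public static void timeAllFiles(LangDescriptor language)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<InputDocument> documents = Tool.load(filenames, language);
    long totalTrain = 0;
    long totalFormat = 0;
    foreach (InputDocument testDoc in documents)
    {
        IList<InputDocument> others = BuffUtils.filter(documents, d => !d.fileName.Equals(testDoc.fileName));
        org.antlr.codebuff.misc.Pair<int, int> times = test(language, others, testDoc);
        totalTrain += times.a;
        totalFormat += times.b;
    }
    Console.WriteLine("avg training = " + totalTrain / documents.Count +
                      "ms, avg formatting = " + totalFormat / documents.Count + "ms");
}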
public static void runCaptureForOneLanguage(LangDescriptor language)
{
    IList<string> filenames = Tool.getFilenames(language.corpusDir, language.fileRegex);
    IList<float> selfEditDistances = new List<float>();
    foreach (string fileName in filenames)
    {
        Corpus corpus = new Corpus(fileName, language);
        corpus.train();
        InputDocument testDoc = Tool.parse(fileName, corpus.language);
        Formatter formatter = new Formatter(corpus, language.indentSize);
        string output = formatter.format(testDoc, false);
        // System.out.println(output);
        float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
        Log.WriteLine(fileName + " edit distance " + editDistance);
        selfEditDistances.Add(editDistance);
    }

    {
        Corpus corpus = new Corpus(language.corpusDir, language);
        corpus.train();
        IList<float> corpusEditDistances = new List<float>();
        foreach (string fileName in filenames)
        {
            InputDocument testDoc = Tool.parse(fileName, corpus.language);
            Formatter formatter = new Formatter(corpus, language.indentSize);
            string output = formatter.format(testDoc, false);
            // System.out.println(output);
            float editDistance = Dbg.normalizedLevenshteinDistance(testDoc.content, output);
            Log.WriteLine(fileName + "+corpus edit distance " + editDistance);
            corpusEditDistances.Add(editDistance);
        }
        // heh this gives info on within-corpus variability. i.e., how good/consistent is my corpus?
        // those files with big difference are candidates for dropping from corpus or for cleanup.
        IList<string> labels = BuffUtils.map(filenames, f => '"' + System.IO.Path.GetFileName(f) + '"');
        string python = "#\n" +
            "# AUTO-GENERATED FILE. DO NOT EDIT\n" +
            "# CodeBuff <version> '<date>'\n" +
            "#\n" +
            "import numpy as np\n" +
            "import matplotlib.pyplot as plt\n\n" +
            "fig = plt.figure()\n" +
            "ax = plt.subplot(111)\n" +
            "labels = <labels>\n" +
            "N = len(labels)\n\n" +
            "featureIndexes = range(0,N)\n" +
            "<lang>_self = <selfEditDistances>\n" +
            "<lang>_corpus = <corpusEditDistances>\n" +
            "<lang>_diff = np.abs(np.subtract(<lang>_self, <lang>_corpus))\n\n" +
            "all = zip(<lang>_self, <lang>_corpus, <lang>_diff, labels)\n" +
            "all = sorted(all, key=lambda x : x[2], reverse=True)\n" +
            "<lang>_self, <lang>_corpus, <lang>_diff, labels = zip(*all)\n\n" +
            "ax.plot(featureIndexes, <lang>_self, label=\"<lang>_self\")\n" +
            "#ax.plot(featureIndexes, <lang>_corpus, label=\"<lang>_corpus\")\n" +
            "ax.plot(featureIndexes, <lang>_diff, label=\"<lang>_diff\")\n" +
            "ax.set_xticklabels(labels, rotation=60, fontsize=8)\n" +
            "plt.xticks(featureIndexes, labels, rotation=60)\n" +
            "ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5)\n\n" +
            "ax.text(1, .25, 'median $f$ self distance = %5.3f, corpus+$f$ distance = %5.3f' %" +
            " (np.median(<lang>_self),np.median(<lang>_corpus)))\n" +
            "ax.set_xlabel(\"File Name\")\n" +
            "ax.set_ylabel(\"Edit Distance\")\n" +
            "ax.set_title(\"Difference between Formatting File <lang> $f$\\nwith Training=$f$ and Training=$f$+Corpus\")\n" +
            "plt.legend()\n" +
            "plt.tight_layout()\n" +
            "fig.savefig(\"images/" + language.name + "_one_file_capture.pdf\", format='pdf')\n" +
            "plt.show()\n";
        ST pythonST = new ST(python);
        pythonST.add("lang", language.name);
        pythonST.add("version", version);
        pythonST.add("date", DateTime.Now);
        // render the lists as Python-style literals; List<T>.ToString() would only print the type name
        pythonST.add("labels", "[" + string.Join(", ", labels) + "]");
        pythonST.add("selfEditDistances", "[" + string.Join(", ", selfEditDistances) + "]");
        pythonST.add("corpusEditDistances", "[" + string.Join(", ", corpusEditDistances) + "]");
        string code = pythonST.render();

        {
            string fileName = "python/src/" + language.name + "_one_file_capture.py";
            org.antlr.codebuff.misc.Utils.writeFile(fileName, code);
            Log.WriteLine("wrote python code to " + fileName);
        }
    }
}
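// Sketch (not part of the original source): the edit distances reported above
// come from Dbg.normalizedLevenshteinDistance. A common definition, and one
// plausible reading of that helper, is raw Levenshtein distance divided by the
// longer string's length, so 0 means identical output and 1 means completely
// different; the real Dbg implementation may differ in detail.
public static float normalizedLevenshtein(string a, string b)
{
    int[,] d = new int[a.Length + 1, b.Length + 1];
    for (int i = 0; i <= a.Length; i++) d[i, 0] = i; // delete all of a
    for (int j = 0; j <= b.Length; j++) d[0, j] = j; // insert all of b
    for (int i = 1; i <= a.Length; i++)
    {
        for (int j = 1; j <= b.Length; j++)
        {
            int cost = a[i - 1] == b[j - 1] ? 0 : 1;
            d[i, j] = Math.Min(Math.Min(d[i - 1, j] + 1,      // deletion
                                        d[i, j - 1] + 1),     // insertion
                               d[i - 1, j - 1] + cost);       // substitution
        }
    }
    int maxLen = Math.Max(a.Length, b.Length);
    return maxLen == 0 ? 0 : d[a.Length, b.Length] / (float)maxLen;
}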
public static void computeConsistency(LangDescriptor language, bool report)
{
    if (report)
    {
        Console.WriteLine("-----------------------------------");
        Console.WriteLine(language.name);
        Console.WriteLine("-----------------------------------");
    }
    Corpus corpus = new Corpus(language.corpusDir, language);
    corpus.train();

    // map each feature vector to the list of exemplar indexes with that vector
    MyMultiMap<FeatureVectorAsObject, int> wsContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();
    MyMultiMap<FeatureVectorAsObject, int> hposContextToIndex = new MyMultiMap<FeatureVectorAsObject, int>();
    int n = corpus.featureVectors.Count;
    for (int i = 0; i < n; i++)
    {
        int[] features = corpus.featureVectors[i];
        wsContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_INJECT_WS), i);
        hposContextToIndex.Map(new FeatureVectorAsObject(features, Trainer.FEATURES_HPOS), i);
    }

    int num_ambiguous_ws_vectors = 0;
    int num_ambiguous_hpos_vectors = 0;

    // Dump output grouped by ws vs hpos, then feature vector, then category
    if (report)
    {
        Console.WriteLine(" --- INJECT WS ---");
    }
    IList<double> ws_entropies = new List<double>();
    foreach (FeatureVectorAsObject fo in wsContextToIndex.Keys)
    {
        var exemplarIndexes = wsContextToIndex[fo];
        // we have grouped by feature vector; now group that set by ws category
        MyMultiMap<int, int> wsCatToIndexes = new MyMultiMap<int, int>();
        foreach (int i in exemplarIndexes)
        {
            wsCatToIndexes.Map(corpus.injectWhitespace[i], i);
        }
        if (wsCatToIndexes.Count == 1)
        {
            continue; // unambiguous: only one ws category for this context
        }
        if (report)
        {
            Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
        }
        IList<int> catCounts = BuffUtils.map(wsCatToIndexes.Values, (x) => x.size());
        double wsEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
        if (report)
        {
            Console.Write("entropy={0,5:F4}\n", wsEntropy);
        }
        wsEntropy *= exemplarIndexes.size(); // weight entropy by the number of exemplars in this context
        ws_entropies.Add(wsEntropy);
        num_ambiguous_ws_vectors += exemplarIndexes.size();
        if (report)
        {
            Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_INJECT_WS));
            foreach (int cat in wsCatToIndexes.Keys)
            {
                var indexes = wsCatToIndexes[cat];
                foreach (int i in indexes)
                {
                    string display = getExemplarDisplay(Trainer.FEATURES_INJECT_WS, corpus, corpus.injectWhitespace, i);
                    Console.WriteLine(display);
                }
                Console.WriteLine();
            }
        }
    }

    if (report)
    {
        Console.WriteLine(" --- HPOS ---");
    }
    IList<double> hpos_entropies = new List<double>();
    foreach (FeatureVectorAsObject fo in hposContextToIndex.Keys)
    {
        MyHashSet<int> exemplarIndexes = hposContextToIndex[fo];
        // we have grouped by feature vector; now group that set by hpos category
        MyMultiMap<int, int> hposCatToIndexes = new MyMultiMap<int, int>();
        foreach (int i in exemplarIndexes)
        {
            hposCatToIndexes.Map(corpus.hpos[i], i);
        }
        if (hposCatToIndexes.Count == 1)
        {
            continue; // unambiguous: only one hpos category for this context
        }
        if (report)
        {
            Console.WriteLine("Feature vector has " + exemplarIndexes.size() + " exemplars");
        }
        IList<int> catCounts = BuffUtils.map(hposCatToIndexes.Values, (x) => x.size());
        double hposEntropy = Entropy.getNormalizedCategoryEntropy(Entropy.getCategoryRatios(catCounts));
        if (report)
        {
            Console.Write("entropy={0,5:F4}\n", hposEntropy);
        }
        hposEntropy *= exemplarIndexes.size();
        hpos_entropies.Add(hposEntropy);
        num_ambiguous_hpos_vectors += exemplarIndexes.size();
        if (report)
        {
            Console.Write(Trainer.featureNameHeader(Trainer.FEATURES_HPOS));
            foreach (int cat in hposCatToIndexes.Keys)
            {
                var indexes = hposCatToIndexes[cat];
                foreach (int i in indexes)
                {
                    string display = getExemplarDisplay(Trainer.FEATURES_HPOS, corpus, corpus.hpos, i);
                    Console.WriteLine(display);
                }
                Console.WriteLine();
            }
        }
    }

    Console.WriteLine();
    Console.WriteLine(language.name);
    Console.WriteLine("There are " + wsContextToIndex.Count + " unique ws feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * wsContextToIndex.Count / n));
    Console.WriteLine("There are " + hposContextToIndex.Count + " unique hpos feature vectors out of " + n + " = " + string.Format("{0,3:F1}%", 100.0 * hposContextToIndex.Count / n));
    float prob_ws_ambiguous = num_ambiguous_ws_vectors / (float)n;
    Console.Write("num_ambiguous_ws_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_ws_vectors, n, prob_ws_ambiguous);
    float prob_hpos_ambiguous = num_ambiguous_hpos_vectors / (float)n;
    Console.Write("num_ambiguous_hpos_vectors = {0,5:D}/{1,5:D} = {2,5:F3}\n", num_ambiguous_hpos_vectors, n, prob_hpos_ambiguous);
    // Collections.sort(ws_entropies);
    // System.out.println("ws_entropies="+ws_entropies);
    Console.WriteLine("ws median,mean = " + BuffUtils.median(ws_entropies) + "," + BuffUtils.mean(ws_entropies));
    double expected_ws_entropy = (BuffUtils.sumDoubles(ws_entropies) / num_ambiguous_ws_vectors) * prob_ws_ambiguous;
    Console.WriteLine("expected_ws_entropy=" + expected_ws_entropy);
    Console.WriteLine("hpos median,mean = " + BuffUtils.median(hpos_entropies) + "," + BuffUtils.mean(hpos_entropies));
    double expected_hpos_entropy = (BuffUtils.sumDoubles(hpos_entropies) / num_ambiguous_hpos_vectors) * prob_hpos_ambiguous;
    Console.WriteLine("expected_hpos_entropy=" + expected_hpos_entropy);
}
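// Sketch (not part of the original source): computeConsistency relies on
// Entropy.getCategoryRatios and Entropy.getNormalizedCategoryEntropy. One
// plausible reading of that pair is shown below: category counts are turned
// into probabilities, Shannon entropy is computed in bits, and the result is
// divided by log2 of the number of categories so a uniform split scores 1.0.
// The real Entropy class may differ in detail.
public static double normalizedCategoryEntropy(IList<int> catCounts)
{
    double total = 0;
    foreach (int c in catCounts)
    {
        total += c;
    }
    double h = 0;
    foreach (int c in catCounts)
    {
        if (c == 0) continue;
        double p = c / total;
        h += -p * Math.Log(p, 2); // Shannon entropy contribution in bits
    }
    // normalize so a perfectly uniform split over the observed categories gives 1.0
    return catCounts.Count <= 1 ? 0 : h / Math.Log(catCounts.Count, 2);
}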