Beispiel #1
0
        /// <summary>
        /// Initializes the TF-IDF codebook (logarithmic term frequency,
        /// default inverse document frequency) and trains it on the
        /// sentences extracted for the "IDF_Dictionary" data type.
        /// </summary>
        public TF_IDF()
        {
            // Initialize TFIDF
            this.codebook = new TFIDF()
            {
                Tf  = TermFrequency.Log,
                Idf = InverseDocumentFrequency.Default
            };

            // Load the training sentences directly; the previous dead
            // empty-array initializer was immediately overwritten anyway.
            string[][] sentences = Extract_sentences("IDF_Dictionary");

            // TFIDF Document Train: learn IDF statistics from the corpus.
            codebook.Learn(sentences);
        }
        /// <summary>
        /// Demo handler: tokenizes a set of paragraphs, learns TF-IDF and
        /// bag-of-words representations, trains a multiclass linear SVM on
        /// the bag-of-words features, and finally finds the document with
        /// the highest TF-IDF score for the query term "Ottomans".
        /// </summary>
        private void Button_Click(object sender, RoutedEventArgs e)
        {
            string[] texts =
            {
                @"The concept of grouping students together in a centralized location for learning has existed since Classical antiquity. Formal schools have existed at least since ancient Greece (see Academy), ancient Rome (see Education in Ancient Rome) ancient India (see Gurukul), and ancient China (see History of education in China). The Byzantine Empire had an established schooling system beginning at the primary level. According to Traditions and Encounters, the founding of the primary education system began in 425 AD and ... military personnel usually had at least a primary education .... The sometimes efficient and often large government of the Empire meant that educated citizens were a must. Although Byzantium lost much of the grandeur of Roman culture and extravagance in the process of surviving, the Empire emphasized efficiency in its war manuals. The Byzantine education system continued until the empire's collapse in 1453 AD.[4]",
                @"In Western Europe a considerable number of cathedral schools were founded during the Early Middle Ages in order to teach future clergy and administrators, with the oldest still existing, and continuously operated, cathedral schools being The King's School, Canterbury (established 597 CE), King's School, Rochester (established 604 CE), St Peter's School, York (established 627 CE) and Thetford Grammar School (established 631 CE). Beginning in the 5th century CE monastic schools were also established throughout Western Europe, teaching both religious and secular subjects.",
                @"Islam was another culture that developed a school system in the modern sense of the word. Emphasis was put on knowledge, which required a systematic way of teaching and spreading knowledge, and purpose-built structures. At first, mosques combined both religious performance and learning activities, but by the 9th century, the madrassa was introduced, a school that was built independently from the mosque, such as al-Qarawiyyin, founded in 859 CE. They were also the first to make the Madrassa system a public domain under the control of the Caliph.",
                @"Under the Ottomans, the towns of Bursa and Edirne became the main centers of learning. The Ottoman system of Külliye, a building complex containing a mosque, a hospital, madrassa, and public kitchen and dining areas, revolutionized the education system, making learning accessible to a wider public through its free meals, health care and sometimes free accommodation.",
                @"In Europe, universities emerged during the 12th century; here, scholasticism was an important tool, and the academicians were called schoolmen. During the Middle Ages and much of the Early Modern period, the main purpose of schools (as opposed to universities) was to teach the Latin language. This led to the term grammar school, which in the United States informally refers to a primary school, but in the United Kingdom means a school that selects entrants based on ability or aptitude. Following this, the school curriculum has gradually broadened to include literacy in the vernacular language as well as technical, artistic, scientific and practical subjects.",
                @"Obligatory school attendance became common in parts of Europe during the 18th century. In Denmark-Norway, this was introduced as early as in 1739-1741, the primary end being to increase the literacy of the almue, i.e. the regular people.[5] Many of the earlier public schools in the United States and elsewhere were one-room schools where a single teacher taught seven grades of boys and girls in the same classroom. Beginning in the 1920s, one-room schools were consolidated into multiple classroom facilities with transportation increasingly provided by kid hacks and school buses."
            };

            // Break each paragraph into an array of word tokens.
            string[][] words = texts.Tokenize();

            // Plain term-count representation used to train the SVM below.
            var bow = new BagOfWords(words);

            // Create a new TF-IDF with options:
            var codebook = new TFIDF()
            {
                Tf  = TermFrequency.Log,
                Idf = InverseDocumentFrequency.Default,
            };

            // Compute the codebook (note: this would have to be done only for the training set)
            codebook.Learn(words);

            // Extract a fixed-length TF-IDF score vector for every paragraph.
            var lstDocumentsScores = new List<double[]>();
            for (int i = 0; i < texts.Length; i++)
            {
                lstDocumentsScores.Add(codebook.Transform(words[i]));
            }

            var teacher = new MulticlassSupportVectorLearning<Linear>()
            {
                // using LIBLINEAR's L2-loss SVC dual for each SVM
                Learner = (p) => new LinearDualCoordinateDescent()
                {
                    Loss = Loss.L2
                }
            };

            double[][] features = bow.Transform(words);

            teacher.ParallelOptions.MaxDegreeOfParallelism = 1; // (Remove, comment, or change this line to enable full parallelism)

            // Learn a machine on hard-coded demo labels (docs 2 and 3 are class 1).
            var machine = teacher.Learn(features, new int[] { 0, 0, 1, 1, 0, 0 });

            int[] predicted = machine.Decide(features);

            // Training-set misclassification rate (demo only; not displayed).
            double error = new ZeroOneLoss(new int[] { 0, 0, 1, 1, 0, 0 }).Loss(predicted);

            // Look up the query term's feature index. ToLowerInvariant avoids
            // culture-dependent casing (e.g. the Turkish dotless-i problem).
            // Guarded lookup: an absent term would otherwise throw.
            // NOTE(review): this index comes from the BagOfWords vocabulary but is
            // applied to vectors produced by the TFIDF codebook — this assumes both
            // assign identical indices to each word; verify against Accord.NET.
            if (!bow.StringToCode.TryGetValue("Ottomans".ToLowerInvariant(), out int indexSearchedTerm))
                return; // term not in vocabulary: nothing to search for

            // Find the document with the highest TF-IDF score for the term.
            double dblMaxScore     = double.MinValue;
            int    irWhichDocument = int.MinValue;

            for (int i = 0; i < texts.Length; i++)
            {
                if (lstDocumentsScores[i][indexSearchedTerm] > dblMaxScore)
                {
                    irWhichDocument = i;
                    dblMaxScore     = lstDocumentsScores[i][indexSearchedTerm];
                }
            }
        }
Beispiel #3
0
        /// <summary>
        /// Benchmarks averaged SGD on the RCV1-v2 text dataset: builds (or
        /// loads) TF-IDF features, trains a sparse linear SVM, and prints
        /// timing plus train/test accuracy.
        /// </summary>
        private static void TestLinearASGD()
        {
            // http://leon.bottou.org/projects/sgd

            string codebookPath = "codebook.bin";
            string x_train_fn   = "x_train.txt.gz";
            string x_test_fn    = "x_test.txt.gz";

            Sparse <double>[] xTrain = null, xTest = null;
            bool[]            yTrain = null, yTest = null;

            // Check if we have the precomputed dataset on disk.
            // BUG FIX: the second check previously tested x_train_fn twice,
            // so a missing test file never triggered regeneration.
            if (!File.Exists(x_train_fn) || !File.Exists(x_test_fn))
            {
                Console.WriteLine("Downloading dataset");
                RCV1v2 rcv1v2 = new RCV1v2(@"C:\Temp\");

                // Note: Leon Bottou's SGD inverts training and
                // testing when benchmarking in this dataset
                var trainWords = rcv1v2.Testing.Item1;
                var testWords  = rcv1v2.Training.Item1;

                // Binary task: does the document belong to the CCAT category?
                string positiveClass = "CCAT";
                yTrain = rcv1v2.Testing.Item2.Apply(x => x.Contains(positiveClass));
                yTest  = rcv1v2.Training.Item2.Apply(x => x.Contains(positiveClass));

                TFIDF tfidf;
                if (!File.Exists(codebookPath))
                {
                    Console.WriteLine("Learning TF-IDF");
                    // Create a TF-IDF considering only words that
                    // exist in both the training and testing sets
                    tfidf = new TFIDF(testWords)
                    {
                        Tf  = TermFrequency.Log,
                        Idf = InverseDocumentFrequency.Default,
                    };

                    // Learn the training set
                    tfidf.Learn(trainWords);

                    Console.WriteLine("Saving codebook");
                    tfidf.Save(codebookPath);
                }
                else
                {
                    Console.WriteLine("Loading codebook");
                    Serializer.Load(codebookPath, out tfidf);
                }

                if (!File.Exists(x_train_fn))
                {
                    // Transform and normalize training set
                    Console.WriteLine("Pre-processing training set");
                    xTrain = tfidf.Transform(trainWords, out xTrain);

                    // L2-normalize each sample vector in place.
                    Console.WriteLine("Post-processing training set");
                    xTrain = xTrain.Divide(Norm.Euclidean(xTrain, dimension: 1), result: xTrain);

                    Console.WriteLine("Saving training set to disk");
                    SparseFormat.Save(xTrain, yTrain, x_train_fn, compression: SerializerCompression.GZip);
                }

                if (!File.Exists(x_test_fn))
                {
                    // Transform and normalize testing set
                    Console.WriteLine("Pre-processing testing set");
                    xTest = tfidf.Transform(testWords, out xTest);

                    Console.WriteLine("Post-processing testing set");
                    xTest = xTest.Divide(Norm.Euclidean(xTest, dimension: 1), result: xTest);

                    Console.WriteLine("Saving testing set to disk");
                    SparseFormat.Save(xTest, yTest, x_test_fn, compression: SerializerCompression.GZip);
                }
            }
            else
            {
                // Both files exist and nothing was loaded yet on this path,
                // so load them unconditionally (the previous null checks
                // were always true here).
                Console.WriteLine("Loading dataset from disk");
                SparseFormat.Load(x_train_fn, out xTrain, out yTrain, compression: SerializerCompression.GZip);
                SparseFormat.Load(x_test_fn, out xTest, out yTest, compression: SerializerCompression.GZip);
            }

            int positiveTrain = yTrain.Count(x => x);
            int positiveTest  = yTest.Count(x => x);
            int negativeTrain = yTrain.Length - positiveTrain;
            int negativeTest  = yTest.Length - positiveTest;

            Console.WriteLine("Training samples: {0} [{1}+, {2}-]", positiveTrain + negativeTrain, positiveTrain, negativeTrain);
            // BUG FIX: this line reports the testing set, not "Negative samples".
            Console.WriteLine("Testing samples: {0} [{1}+, {2}-]", positiveTest + negativeTest, positiveTest, negativeTest);

            // Create and learn a linear sparse binary support vector machine
            var learn = new AveragedStochasticGradientDescent <Linear, Sparse <double> >()
            {
                MaxIterations = 5,
                Tolerance     = 0,
            };

            Console.WriteLine("Learning training set");
            Stopwatch sw  = Stopwatch.StartNew();
            var       svm = learn.Learn(xTrain, yTrain);

            Console.WriteLine(sw.Elapsed);


            Console.WriteLine("Predicting training set");
            sw = Stopwatch.StartNew();
            bool[] trainPred = svm.Decide(xTrain);
            Console.WriteLine(sw.Elapsed);

            var train = new ConfusionMatrix(trainPred, yTrain);

            Console.WriteLine("Train acc: " + train.Accuracy);


            Console.WriteLine("Predicting testing set");
            sw = Stopwatch.StartNew();
            bool[] testPred = svm.Decide(xTest);
            Console.WriteLine(sw.Elapsed);

            var test = new ConfusionMatrix(testPred, yTest);

            Console.WriteLine("Test acc: " + test.Accuracy);
        }
Beispiel #4
0
    /// <summary>
    /// Loads N documents, builds TF-IDF-based feature vectors, trains one
    /// Gaussian Naive Bayes classifier per prediction target, then reports
    /// per-target disagreements and training-set accuracy.
    /// </summary>
    public static void Main()
    {
        // Load every document and collect its remark vocabulary.
        var documents = new Document[N];
        var words     = new string[N][];
        for (int i = 0; i < N; ++i)
        {
            var doc      = new Document(i);
            documents[i] = doc;
            words[i]     = doc.备注词汇;
        }

        // Learn TF-IDF weights over all remark vocabularies.
        var tfIdf = new TFIDF();
        tfIdf.Learn(words);

        // Feature vector layout: four binary status flags followed by the
        // TF-IDF scores of the document's remark words.
        var inputs = new double[N][];
        for (int i = 0; i < N; ++i)
        {
            var doc = documents[i];
            doc.备注特征向量 = tfIdf.Transform(doc.备注词汇);

            var features = new double[doc.备注特征向量.Length + 4];
            features[0] = doc.能查到正在营业 ? 1.0 : 0.0;
            features[1] = doc.能查到曾经营业 ? 1.0 : 0.0;
            features[2] = doc.无营业信息 ? 1.0 : 0.0;
            features[3] = doc.GPS定位 ? 1.0 : 0.0;
            doc.备注特征向量.CopyTo(features, 4);

            doc.特征向量 = features;
            inputs[i]   = features;
        }

        // Parse the ground-truth labels for the three prediction targets.
        var labels通讯情况 = new int[N];
        var labels存在状况 = new int[N];
        var labels数据有效 = new int[N];
        for (int i = 0; i < N; ++i)
        {
            documents[i].ParseOutput();
            labels通讯情况[i] = documents[i].通讯情况;
            labels存在状况[i] = documents[i].存在状况;
            labels数据有效[i] = documents[i].数据有效 ? 1 : 0;
        }

        // One regularized Gaussian Naive Bayes learner per target; the
        // tiny regularization keeps zero-variance features from degenerating.
        NaiveBayesLearning<NormalDistribution> NewTeacher()
        {
            var teacher = new NaiveBayesLearning<NormalDistribution>();
            teacher.Options.InnerOption = new NormalOptions {
                Regularization = 1e-12
            };
            return teacher;
        }

        var model通讯情况 = NewTeacher().Learn(inputs, labels通讯情况);
        var model存在状况 = NewTeacher().Learn(inputs, labels存在状况);
        var model数据有效 = NewTeacher().Learn(inputs, labels数据有效);

        // Re-classify the training set, counting matches and printing
        // every disagreement between the labels and the models.
        int correct通讯情况 = 0, correct存在状况 = 0, correct数据有效 = 0;
        for (int i = 0; i < N; ++i)
        {
            var doc = documents[i];

            var 预测通讯 = model通讯情况.Decide(doc.特征向量);
            if (doc.通讯情况 == 预测通讯)
            {
                ++correct通讯情况;
            }
            else
            {
                Console.WriteLine("Input{0}.txt的通讯情况 你认为:{1} 电脑认为:{2}",
                                  i, Document.通讯情况说明[doc.通讯情况],
                                  Document.通讯情况说明[预测通讯]);
            }

            var 预测存在 = model存在状况.Decide(doc.特征向量);
            if (doc.存在状况 == 预测存在)
            {
                ++correct存在状况;
            }
            else
            {
                Console.WriteLine("Input{0}.txt的存在状况 你认为:{1} 电脑认为:{2}",
                                  i, Document.存在状况说明[doc.存在状况],
                                  Document.存在状况说明[预测存在]);
            }

            bool 预测有效 = model数据有效.Decide(doc.特征向量) == 1;
            if (doc.数据有效 == 预测有效)
            {
                ++correct数据有效;
            }
            else
            {
                Console.WriteLine("Input{0}.txt的数据有效 你认为:{1} 电脑认为:{2}",
                                  i, doc.数据有效, 预测有效);
            }
        }

        // Report training-set accuracy for each target.
        Console.WriteLine("通讯情况准确率: {0:F2} %",
                          (double)correct通讯情况 / N * 100);
        Console.WriteLine("存在状况准确率: {0:F2} %",
                          (double)correct存在状况 / N * 100);
        Console.WriteLine("数据有效准确率: {0:F2} %",
                          (double)correct数据有效 / N * 100);
    }