Example 1
        public LogClassifier(string fileName, int countLayers, int countEpoch)
        {
            Codebook = new BagOfWords()
            {
                MaximumOccurance = 1
            };

            int samples    = 0;
            var dictionary = Utilities.ReadHostFile(fileName, ref samples);

            Samples = samples;

            if (dictionary.Item1.Length != 0 && dictionary.Item2.Length != 0)
            {
                Codebook.Learn(dictionary.Item1);
                double[][] inputs = Codebook.Transform(dictionary.Item1);
                int count = inputs.Length;

                double[][] outputs = Utilities.BoolToDouble(dictionary.Item2);
                classifier = new SimpleClassifierNN(inputs, outputs, count, countLayers, countEpoch);
                var trainingResult = classifier.Train(inputs, outputs);
                Error        = trainingResult.Item1;
                TrainingTime = trainingResult.Item2;
            }
        }
Example 2
        /// <summary>
        /// Formats a list of tweets using a Bag of Words and saves the
        /// tokenized form for future use.
        /// </summary>
        /// <param name="inputDoc">Name of the training tweets document</param>
        /// <returns>Tweets converted to Bag-of-Words feature vectors</returns>
        double[][] ReadInput(string inputDoc)
        {
            double[][] input;

            using (StreamReader r = new StreamReader(Constants.PROGRAM_DATA_FILEPATH + @"\" + inputDoc))
            {
                List <string> tweets = new List <string>();

                while (!r.EndOfStream)
                {
                    tweets.Add(r.ReadLine());
                }

                //Use custom tokenizer
                string[][] tokens = tp.Tokenizer(tweets);

                FileHelper.WriteObjectToFile("BagOfWords90.txt", tokens);

                bagOfWords.Learn(tokens);

                input = bagOfWords.Transform(tokens);
            }

            return(input);
        }
Example 3
        private static void bagOfWords(int[][] inputs, int[] outputs)
        {
            var bow = new BagOfWords <int>();

            var quantizer = bow.Learn(inputs);

            double[][] histograms = quantizer.Transform(inputs);

            // One way to perform sequence classification with an SVM is to use
            // a kernel defined over histograms, such as the Chi-Square kernel.

            // Create the multi-class learning algorithm as one-vs-one:
            var teacher = new MulticlassSupportVectorLearning <ChiSquare, double[]>()
            {
                Learner = (p) => new SequentialMinimalOptimization <ChiSquare, double[]>()
                {
                    Complexity = 10000.0 // Create a hard SVM
                }
            };

            // Learn a multi-class SVM using the teacher
            var svm = teacher.Learn(histograms, outputs);

            // Get the predictions for the inputs
            int[] predicted = svm.Decide(histograms);

            // Create a confusion matrix to check the quality of the predictions:
            var cm = new ConfusionMatrix(predicted: predicted, expected: outputs);

            // Check the accuracy measure:
            double accuracy = cm.Accuracy;
        }
Example 4
        public static List <string> GetKeywords(string[] body, int count, int minOccurance = 0)
        {
            var bow = new BagOfWords()
            {
                MaximumOccurance = 500
            };

            bow.Learn(body);
            int[] codedBody = new int[body.Length];
            bow.Transform(body, codedBody);

            var dictionary = codedBody.Select((value, index) => new { value, index })
                             .ToDictionary(pair => pair.index, pair => pair.value)
                             .OrderByDescending(x => x.Value)
                             .Where(x => x.Value > minOccurance)
                             .Select(x => x.Key)
                             .Take(count)
                             .ToList();

            List <string> result   = new List <string>();
            var           codebook = bow.CodeToString;

            foreach (var keyWord in dictionary)
            {
                result.Add(codebook[keyWord]);
            }
            return(result);
        }
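A brief usage sketch (the sample text and arguments are hypothetical); `Tokenize` on a single string is the same Accord extension used in the other examples:

        // Hypothetical usage: extract the three most frequent words
        // that occur more than once in a tokenized document.
        string[] tokens = "the cat sat on the mat and the cat slept".Tokenize();
        List<string> keywords = GetKeywords(tokens, count: 3, minOccurance: 1);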
Example 5
 private double[][] CreateTextBagOfWords(string[][] inputs)
 {
     _textBagOfWords = new BagOfWords()
     {
         MaximumOccurance = 1
     };
     _textBagOfWords.Learn(inputs);
     return(_textBagOfWords.Transform(inputs));
 }
Example 6
        //Make this class a singleton so that it is not retrained for every class that uses it
        private TextAnalyzer()
        {
            //Usage of a Naive Bayes classifier
            //Create the trainer, allowing for some regularization
            var teacher = new NaiveBayesLearning <NormalDistribution, NormalOptions>()
            {
                Options = { InnerOption = { Regularization = 1e-6 } }
            };

            //Read in the training data and stop words
            string liberalTrainingPath      = System.Web.Hosting.HostingEnvironment.MapPath(@"~/Data/liberal_training.txt");
            string conservativeTrainingPath = System.Web.Hosting.HostingEnvironment.MapPath(@"~/Data/conservative_training.txt");
            string stopWordsPath            = System.Web.Hosting.HostingEnvironment.MapPath(@"~/Data/stop_words.txt");

            string[] liberalSamples      = File.ReadAllLines(liberalTrainingPath);
            string[] conservativeSamples = File.ReadAllLines(conservativeTrainingPath);
            stopWords = File.ReadAllLines(stopWordsPath);

            //Concat the samples into one array (They are first read into their own array to allow us to know the amount of samples in each file)
            string[] samples = liberalSamples.Concat(conservativeSamples).ToArray();

            //Break the text up into individual words
            string[][] words = samples.Tokenize();

            //If for some reason we didn't actually read any training data, throw an exception because the classifier won't work
            if (words.Length == 0)
            {
                throw new Exception("No training data for TextAnalyzer");
            }

            //Remove common English words
            words = TrimStopWords(words);

            //Create a bag of words using the tokenized sample data
            bagOfWords = new BagOfWords();
            bagOfWords.Learn(words);

            //Populate the output array using the known lengths of the sample files
            int[] outputs = new int[samples.Length];
            for (int i = 0; i < samples.Length; i++)
            {
                if (i < liberalSamples.Length)
                {
                    outputs[i] = 0;
                }
                else
                {
                    outputs[i] = 1;
                }
            }

            //Train the classifier
            double[][] inputs = bagOfWords.Transform(words);
            nbClassifier = teacher.Learn(inputs, outputs);
        }
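The constructor only trains the classifier; the original source shows no consumption method. A minimal sketch of one, assuming `TrimStopWords` accepts any tokenized batch:

        // Hypothetical helper (not in the original source): classifies a single
        // text as liberal (0) or conservative (1) with the same pipeline used
        // for training: tokenize -> trim stop words -> bag of words -> Bayes.
        public int Analyze(string text)
        {
            string[][] words = TrimStopWords(new[] { text }.Tokenize());
            double[] input = bagOfWords.Transform(words[0]);
            return nbClassifier.Decide(input);
        }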
Example 7
        public void learn_generic1()
        {
            // Declare some testing data
            int[][] sequences = new int[][]
            {
                new int[] { 0, 0, 1, 2 },     // Class 0
                new int[] { 0, 1, 1, 2 },     // Class 0
                new int[] { 0, 0, 0, 1, 2 },  // Class 0
                new int[] { 0, 1, 2, 2, 2 },  // Class 0

                new int[] { 2, 2, 1, 0 },     // Class 1
                new int[] { 2, 2, 2, 1, 0 },  // Class 1
                new int[] { 2, 2, 2, 1, 0 },  // Class 1
                new int[] { 2, 2, 2, 2, 1 },  // Class 1
            };

            int[] outputs = new int[]
            {
                0, 0, 0, 0, // First four sequences are of class 0
                1, 1, 1, 1, // Last four sequences are of class 1
            };

            // Create a Bag-of-Words learning algorithm
            var bow = new BagOfWords <int>();

            bow.ParallelOptions.MaxDegreeOfParallelism = 1;

            // Use the BoW to create a quantizer
            var quantizer = bow.Learn(sequences);

            // Extract vector representations from the integer sequences
            double[][] representations = quantizer.Transform(sequences);

            // Create a new learning algorithm for support vector machines
            var teacher = new MulticlassSupportVectorLearning <ChiSquare, double[]>
            {
                Learner = (p) => new SequentialMinimalOptimization <ChiSquare, double[]>()
                {
                    Complexity = 100000
                }
            };

            // Use the learning algorithm to create a classifier
            var svm = teacher.Learn(representations, outputs);

            // Compute predictions for the training set
            int[] predicted = svm.Decide(representations);

            var    cm  = new ConfusionMatrix(predicted: predicted, expected: outputs);
            double acc = cm.Accuracy;

            Assert.AreEqual(0.75, acc);
        }
Example 8
        public void ExecuteTest()
        {
            string[][] words =
            {
                new string[] { "今日", "は", "いい", "天気", "です"   },
                new string[] { "明日", "も", "いい", "天気", "でしょう" }
            };

            var codebook = new BagOfWords()
            {
                //MaximumOccurance = 1 // the resulting vector will have only 0's and 1's
                MaximumOccurance = int.MaxValue
            };

            // Compute the codebook (note: this would have to be done only for the training set)
            codebook.Learn(words);

            // Now, we can use the learned codebook to extract fixed-length
            // representations of the different texts (paragraphs) above:

            // Extract a feature vector from the text 1:
            double[] bow1 = codebook.Transform(words[0]);

            // Extract a feature vector from the text 2:
            double[] bow2 = codebook.Transform(words[1]);

            // we could also have transformed everything at once, i.e.
            double[][] bow = codebook.Transform(words);


            // Now, since we have finite length representations (both bow1 and bow2 should
            // have the same size), we can pass them to any classifier or machine learning
            // method. For example, we can pass them to a Logistic Regression Classifier to
            // discern between the first and second paragraphs

            // Let's create a Logistic classifier to separate the two paragraphs:
            var learner = new IterativeReweightedLeastSquares <LogisticRegression>()
            {
                Tolerance      = 1e-4, // Let's set some convergence parameters
                Iterations     = 100,  // maximum number of iterations to perform
                Regularization = 0
            };

            // Now, we use the learning algorithm to learn the distinction between the two:
            LogisticRegression reg = learner.Learn(new[] { bow1, bow2 }, new[] { false, true });

            // Finally, we can predict using the classifier:
            bool c1 = reg.Decide(bow1); // Should be false
            bool c2 = reg.Decide(bow2); // Should be true

            Console.WriteLine(c1);
            Console.WriteLine(c2);
        }
Example 9
 /// <summary>
 /// Gets the trained Bag of Words, loading it from disk on first use.
 /// </summary>
 /// <returns>The trained BagOfWords instance</returns>
 public static BagOfWords GetBagOfWords()
 {
     if (bagOfWords == null)
     {
         bagOfWords = new BagOfWords()
         {
             MaximumOccurance = 1
         };
         bagOfWords.Learn(ReadObjectFromFile <string[][]>(@"BagOfWords90.txt"));
     }
     return(bagOfWords);
 }
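The lazy initialization above is not thread-safe. If `GetBagOfWords` can be called concurrently, a `Lazy<T>` wrapper is a standard alternative; a sketch, assuming the same `ReadObjectFromFile` helper:

 // Thread-safe variant of the same lazy initialization (sketch):
 private static readonly Lazy<BagOfWords> lazyBagOfWords = new Lazy<BagOfWords>(() =>
 {
     var bow = new BagOfWords { MaximumOccurance = 1 };
     bow.Learn(ReadObjectFromFile<string[][]>(@"BagOfWords90.txt"));
     return bow;
 });

 public static BagOfWords GetBagOfWords() => lazyBagOfWords.Value;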
Example 10
        public LogClassifier(string networkFileName, string dictionaryFileName)
        {
            Codebook = new BagOfWords()
            {
                MaximumOccurance = 1
            };

            int samples    = 0;
            var dictionary = Utilities.ReadHostFile(dictionaryFileName, ref samples);

            if (dictionary.Item1.Length != 0)
            {
                Codebook.Learn(dictionary.Item1);
            }

            classifier = new SimpleClassifierNN(networkFileName);
        }
Example 11
        static void Main(string[] args)
        {
            DiferenciasEntities db = new DiferenciasEntities();
            var query = db.Edicion.Where(x => x.Tecnico.Equals("CSM05"));

            string[]   edits    = query.Select(x => x.CadenaInicial).ToArray();
            string[][] words    = edits.Tokenize();
            var        codebook = new BagOfWords()
            {
                MaximumOccurance = 1
            };

            codebook.Learn(words);

            double[] bow1   = codebook.Transform(words[0]);
            double[] bow2   = codebook.Transform(words[1]);
            Cosine   cosine = new Cosine();

            Console.WriteLine(cosine.Similarity(bow1, bow2));
        }
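The example compares only the first two records. A sketch extending it to every pair, using the same `Transform` and `Similarity` calls:

            // Sketch: pairwise cosine similarity over all records.
            double[][] bows = codebook.Transform(words);
            for (int i = 0; i < bows.Length; i++)
                for (int j = i + 1; j < bows.Length; j++)
                    Console.WriteLine("{0} vs {1}: {2}", i, j, cosine.Similarity(bows[i], bows[j]));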
Example 12
        public TrainingResult Train()
        {
            var result = new TrainingResult();

            // load training data
            result.StartMeasure(TrainingResult.RecordType.LoadDataset);
            var       reader    = new ExcelReader(Helpers.DatasetPath);
            DataTable dataStore = reader.GetWorksheet("Training");

            int[]    labels    = dataStore.ToVector <int>("Label");
            string[] learnData = dataStore.ToVector <string>("Sentiment");
            result.StopMeasure();

            // tokenize
            result.StartMeasure(TrainingResult.RecordType.Tokenization);
            string[][] tokenized = learnData.Select(x => _preprocessor.Process(x)).ToArray();
            result.StopMeasure();

            // train bag of words
            result.StartMeasure(TrainingResult.RecordType.BagOfWordsLearning);
            _bagOfWords = new BagOfWords();
            _bagOfWords.Learn(tokenized);
            result.StopMeasure();

            // vectorization of tokens
            result.StartMeasure(TrainingResult.RecordType.Featurization);
            int[][] featurized = _bagOfWords.Transform(tokenized).ToInt32();
            result.StopMeasure();

            // train
            result.StartMeasure(TrainingResult.RecordType.NaiveBayesLearning);
            var teacher = new NaiveBayesLearning();

            _bayes = teacher.Learn(featurized, labels);
            result.StopMeasure();

            return(result);
        }
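`Train()` only fits the model. A hypothetical companion method (not in the original source) would have to reuse the same preprocessor and Bag of Words before calling `Decide`:

        // Hypothetical companion to Train(): featurize a new text the same way.
        public int Predict(string text)
        {
            string[][] tokenized = { _preprocessor.Process(text) };
            int[] featurized = _bagOfWords.Transform(tokenized)[0].ToInt32();
            return _bayes.Decide(featurized);
        }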
Example 13
        static void Main()
        {
            // Path to the folder with classifier models
            var jarRoot = @"\Users\devir\OneDrive\Documents\Visual Studio 2015\Projects\ner";
            var classifiersDirectory = jarRoot + @"\classifiers";

            // Load the 3-class classifier model
            var classifier = CRFClassifier.getClassifierNoExceptions(
                classifiersDirectory + @"\english.muc.7class.distsim.crf.ser.gz");

            var s1 = " She got up this morning at 9:00 am and went to a shop to spend five dollars to buy a 50% off toothbrush.";


            var s2 = "Tell the latest on olympics from the New York.";

            Console.WriteLine("{0}\n", classifier.classifyToCharacterOffsets(s1));
            Console.WriteLine("{0}\n", classifier.classifyWithInlineXML(s1));

            //Print the NER results one by one
            string result = classifier.classifyWithInlineXML(s1);
            string total1, total2, total3, total4, total5, total6, total7;

            string[] hasiltime   = GetStringInBetween("<TIME>", "</TIME>", result, false, false);
            string   output_time = hasiltime[0];
            string   next_time   = hasiltime[1];

            total1 = output_time;
            //Console.WriteLine(output_time);

            string[] hasillocation   = GetStringInBetween("<LOCATION>", "</LOCATION>", result, false, false);
            string   output_location = hasillocation[0];
            string   next_loc        = hasillocation[1];

            //Console.WriteLine(output_location);
            total2 = output_location;

            string[] hasilperson   = GetStringInBetween("<PERSON>", "</PERSON>", result, false, false);
            string   output_person = hasilperson[0];
            string   next_person   = hasilperson[1];

            //Console.WriteLine(hasilperson);
            total3 = output_person;

            string[] hasilORGANIZATION   = GetStringInBetween("<ORGANIZATION>", "</ORGANIZATION>", result, false, false);
            string   output_ORGANIZATION = hasilORGANIZATION[0];
            string   next_ORGANIZATION   = hasilORGANIZATION[1];

            //Console.WriteLine(output_ORGANIZATION);
            total4 = output_ORGANIZATION;

            string[] hasilMONEY   = GetStringInBetween("<MONEY>", "</MONEY>", result, false, false);
            string   output_MONEY = hasilMONEY[0];
            string   next_MONEY   = hasilMONEY[1];

            // Console.WriteLine(output_MONEY);
            total5 = output_MONEY;

            string[] hasilPercent   = GetStringInBetween("<Percent>", "</Percent>", result, false, false);
            string   output_Percent = hasilPercent[0];
            string   next_Percent   = hasilPercent[1];

            //Console.WriteLine(output_Percent);
            total6 = output_Percent;

            string[] hasilDate   = GetStringInBetween("<Date>", "</Date>", result, false, false);
            string   output_Date = hasilDate[0];
            string   next_Date   = hasilDate[1];

            //Console.WriteLine(output_Date);
            total7 = output_Date;


            //BOW
            string semua = total1 + ";" + total2 + ";" + total3 + ";" + total4 + ";" + total5 + ";" + total6 + ";" + total7 + ";";

            Console.WriteLine(semua);
            string[] gabungan = { total1, total2, total3, total4, total5, total6, total7 };

            foreach (var a in gabungan)
            {
                Console.WriteLine(a);
            }
            string[][] words = gabungan.Tokenize();
            //var codebook = new TFIDF()
            //{
            //    Tf = TermFrequency.Log,
            //    Idf = InverseDocumentFrequency.Default
            //};
            var codebook = new BagOfWords()
            {
                MaximumOccurance = 1 // the resulting vector will have only 0's and 1's
            };

            codebook.Learn(words);
            double[]   bow1            = codebook.Transform(words[0]);
            double[]   bow2            = codebook.Transform(words[1]);
            double[]   bow3            = codebook.Transform(words[2]);
            double[]   bow4            = codebook.Transform(words[3]);
            double[]   bow5            = codebook.Transform(words[4]);
            double[]   bow6            = codebook.Transform(words[5]);
            double[]   bow7            = codebook.Transform(words[6]);
            double[][] keseluruhanBOW1 = { bow1, bow2, bow3, bow4, bow5, bow6, bow7 };

            //Interactive test loop
            bool quitNow = false;

            while (!quitNow)
            {
                Console.Write("Enter question: ");
                string val = Console.ReadLine();
                string[] textss = { val };



                string[][] wordss = textss.Tokenize();
                //var codebook2 = new TFIDF()
                //{
                //    Tf = TermFrequency.Log,
                //    Idf = InverseDocumentFrequency.Default
                //};
                var codebook2 = new BagOfWords()
                {
                    MaximumOccurance = 1 // the resulting vector will have only 0's and 1's
                };
                codebook2.Learn(wordss);
                double[] c1   = codebook2.Transform(wordss[0]);
                string   path = @"C:\Users\devir\OneDrive\Documents\Visual Studio 2015\Projects\ner";
                LibSvmModel model = LibSvmModel.Load(Path.Combine(path, "pelatihanSVMbayardanpergi.txt"));

                // Now, we can use the model class to create the equivalent Accord.NET SVM:
                SupportVectorMachine svm = model.CreateMachine();

                // Compute classification error
                bool predicted = svm.Decide(c1);

                if (predicted == false)
                {
                    Console.WriteLine("BAYAR"); // Indonesian: "pay"
                }
                else
                {
                    Console.WriteLine("PERGI"); // Indonesian: "go"
                }
                Console.ReadLine();
            }

        }
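`GetStringInBetween` is referenced above but not included in the example. Judging from the call sites, element 0 holds the text between the tags and element 1 the remainder after the closing tag; a minimal sketch consistent with that (the parameter names are assumptions) might be:

        // Sketch of the missing helper, inferred from its call sites:
        // [0] = text between the first begin/end pair (empty if not found),
        // [1] = remainder of the source after the end tag.
        static string[] GetStringInBetween(string begin, string end,
            string source, bool includeBegin, bool includeEnd)
        {
            string[] result = { string.Empty, string.Empty };
            int start = source.IndexOf(begin);
            if (start < 0) return result;
            start += begin.Length;
            int stop = source.IndexOf(end, start);
            if (stop < 0) return result;
            result[0] = (includeBegin ? begin : "")
                      + source.Substring(start, stop - start)
                      + (includeEnd ? end : "");
            result[1] = source.Substring(stop + end.Length);
            return result;
        }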
Example 14
        public void issue_168()
        {
            // Text naive bayes classification gives wrong results #168
            // https://github.com/accord-net/framework/issues/168
            // Some sample texts
            string[] spamTokens  = Tokenize(@"I decided to sign up for the Disney Half Marathon. Half of a marathon is 13.1 miles. A full marathon is 26.2 miles. You may wonder why the strange number of miles. “26.2” is certainly not an even number. And after running 26 miles who cares about the point two? You might think that 26.2 miles is a whole number of kilometers. It isn’t. In fact, it is even worse in kilometers – 42.1648128. I bet you don’t see many t-shirts in England with that number printed on the front.");
            string[] loremTokens = Tokenize(@"Lorem ipsum dolor sit amet,  Nulla nec tortor. Donec id elit quis purus consectetur consequat. Nam congue semper tellus. Sed erat dolor, dapibus sit amet, venenatis ornare, ultrices ut, nisi. Aliquam ante. Suspendisse scelerisque dui nec velit. Duis augue augue, gravida euismod, vulputate ac, facilisis id, sem. Morbi in orci. Nulla purus lacus, pulvinar vel, malesuada ac, mattis nec, quam. Nam molestie scelerisque quam. Nullam feugiat cursus lacus.orem ipsum dolor sit amet.");

            // Their respective classes
            string[] classes = { "spam", "lorem" };


            // Create a new Bag-of-Words for the texts
            BagOfWords bow = new BagOfWords()
            {
                // Limit the maximum number of occurrences in
                // the feature vector to a single instance
                MaximumOccurance = 1
            };

            bow.Learn(new[] { spamTokens, loremTokens });

            string word = bow.CodeToString[52];

            Assert.AreEqual("in", word);

            // Create input and outputs for training
            int[][] inputs =
            {
                bow.GetFeatureVector(spamTokens),
                bow.GetFeatureVector(loremTokens)
            };

            int[] outputs =
            {
                0, // spam
                1, // lorem
            };

            // Create the naïve bayes model
            var teacher = new NaiveBayesLearning()
            {
                Empirical = true,
                Options   = new IndependentOptions <GeneralDiscreteOptions>()
                {
                    InnerOption = new GeneralDiscreteOptions()
                    {
                        //UseLaplaceRule = true
                    }
                }
            };

            // The following line is only needed to ensure reproducible results. Please remove it to enable full parallelization
            teacher.ParallelOptions.MaxDegreeOfParallelism = 1; // (Remove, comment, or change this line to enable full parallelism)


            // Estimate the model
            var nb = teacher.Learn(inputs, outputs);


            double[][] spamDist  = nb.Distributions.GetRow(0);
            double[][] loremDist = nb.Distributions.GetRow(1);

            for (int i = 0; i < spamDist.Length; i++)
            {
                if (i == 52)
                {
                    Assert.AreEqual(spamDist[i][0], 0.0, 1e-8);
                    Assert.AreEqual(spamDist[i][1], 1.0, 1e-8);
                    Assert.AreEqual(loremDist[i][0], 0.0, 1e-8);
                    Assert.AreEqual(loremDist[i][1], 1.0, 1e-8);
                }
                else
                {
                    if (i < 68)
                    {
                        Assert.AreEqual(spamDist[i][0], 0.0, 1e-8);
                        Assert.AreEqual(spamDist[i][1], 1.0, 1e-8);
                        Assert.AreEqual(loremDist[i][0], 1.0, 1e-8);
                        Assert.AreEqual(loremDist[i][1], 0.0, 1e-8);
                    }
                    else
                    {
                        Assert.AreEqual(spamDist[i][0], 1.0, 1e-8);
                        Assert.AreEqual(spamDist[i][1], 0.0, 1e-8);
                        Assert.AreEqual(loremDist[i][0], 0.0, 1e-8);
                        Assert.AreEqual(loremDist[i][1], 1.0, 1e-8);
                    }
                }
            }

            // Consume the model
            {
                // This classifies as lorem
                string text   = @"I decided to sign up for";
                int[]  input  = bow.GetFeatureVector(Tokenize(text));
                int    answer = nb.Decide(input);
                string result = classes[answer];
                Assert.AreEqual("lorem", result);
            }

            {
                // This classifies as spam
                string text   = @"I decided to sign up for the";
                int[]  input  = bow.GetFeatureVector(Tokenize(text));
                int    answer = nb.Decide(input);
                string result = classes[answer];
                Assert.AreEqual("spam", result);
            }

            {
                // This classifies as lorem
                string text   = @"I decided to lorem ipsum nulla nec tortor purus sit amet";
                int[]  input  = bow.GetFeatureVector(Tokenize(text));
                int    answer = nb.Decide(input);
                string result = classes[answer];
                Assert.AreEqual("lorem", result);
            }
        }
Example 15
        public void ClassifyText()
        {
            readFile();

            string[] words = { sbGeneratedByProgram.ToString(), sbNotGeneratedByProgram.ToString() };

            string[][] word = words.Tokenize();
            // Create a new Bag-of-Words for the texts
            var bow = new BagOfWords()
            {
                MaximumOccurance = 1 // the resulting vector will have only 0's and 1's
            };

            bow.Learn(word);

            // Create input and outputs for training
            double[] inputP  = bow.Transform(word[0]);
            double[] inputNP = bow.Transform(word[1]);

            int[][] inputs =
            {
                inputP.ToInt32(),
                inputNP.ToInt32()
            };

            int[] outputs =
            {
                0, // Program Generated
                1  // Not Program Generated
            };

            // Create the naïve bayes model
            var learner = new NaiveBayesLearning();

            learner.Options.InnerOption.UseLaplaceRule = true;

            var nb = learner.Learn(inputs, outputs);

            string[] text   = "Yup! Two years without a car! Downtown living is morning too I got that this morning too too If the wi. Here's hoping!".Tokenize();
            int[]    input  = bow.Transform(text).ToInt32();
            int      answer = nb.Decide(input);

            Console.WriteLine(answer);
            Console.WriteLine(nb.Probabilities(input)[0] + " " + nb.Probabilities(input)[1]);
        }
Example 16
        public void learn_pendigits_normalization()
        {
            Console.WriteLine("Starting BagOfWordsTest.learn_pendigits_normalization");

            using (var travis = new KeepTravisAlive())
            {
                #region doc_learn_pendigits
                // The Bag-Of-Words model can be used to extract finite-length feature
                // vectors from sequences of arbitrary length, like handwritten digits

                // Ensure we get reproducible results
                Accord.Math.Random.Generator.Seed = 0;

                // Download the PENDIGITS dataset from UCI ML repository
                var pendigits = new Pendigits(path: Path.GetTempPath());

                // Get and pre-process the training set
                double[][][] trainInputs  = pendigits.Training.Item1;
                int[]        trainOutputs = pendigits.Training.Item2;

                // Pre-process the digits so each of them is centered and scaled
                trainInputs = trainInputs.Apply(Accord.Statistics.Tools.ZScores);

                // Create a Bag-of-Words learning algorithm
                var bow = new BagOfWords <double[], KMeans>()
                {
                    Clustering = new KMeans(5),
                };

                // Use the BoW to create a quantizer
                var quantizer = bow.Learn(trainInputs);

                // Extract vector representations from the pen sequences
                double[][] trainVectors = quantizer.Transform(trainInputs);

                // Create a new learning algorithm for support vector machines
                var teacher = new MulticlassSupportVectorLearning <ChiSquare, double[]>
                {
                    Learner = (p) => new SequentialMinimalOptimization <ChiSquare, double[]>()
                    {
                        Complexity = 1
                    }
                };

                // Use the learning algorithm to create a classifier
                var svm = teacher.Learn(trainVectors, trainOutputs);

                // Compute predictions for the training set
                int[] trainPredicted = svm.Decide(trainVectors);

                // Check the performance of the classifier by comparing with the ground-truth:
                var    m1       = new GeneralConfusionMatrix(predicted: trainPredicted, expected: trainOutputs);
                double trainAcc = m1.Accuracy; // should be 0.690


                // Prepare the testing set
                double[][][] testInputs  = pendigits.Testing.Item1;
                int[]        testOutputs = pendigits.Testing.Item2;

                // Apply the same normalizations
                testInputs = testInputs.Apply(Accord.Statistics.Tools.ZScores);

                double[][] testVectors = quantizer.Transform(testInputs);

                // Compute predictions for the test set
                int[] testPredicted = svm.Decide(testVectors);

                // Check the performance of the classifier by comparing with the ground-truth:
                var    m2      = new GeneralConfusionMatrix(predicted: testPredicted, expected: testOutputs);
                double testAcc = m2.Accuracy; // should be 0.600
                #endregion

#if NET35
                Assert.AreEqual(0.89594053744997137d, trainAcc, 1e-10);
                Assert.AreEqual(0.89605017347211102d, testAcc, 1e-10);
#else
                Assert.AreEqual(0.69039451114922812, trainAcc, 1e-10);
                Assert.AreEqual(0.600880704563651, testAcc, 1e-10);
#endif
            }
        }
Example 17
        public void learn_test()
        {
            #region doc_learn
            // The Bag-Of-Words model can be used to extract finite-length feature
            // vectors from sequences of arbitrary length, like for example, texts:


            string[] texts =
            {
                @"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas molestie malesuada 
                  nisi et placerat. Curabitur blandit porttitor suscipit. Nunc facilisis ultrices felis,
                  vitae luctus arcu semper in. Fusce ut felis ipsum. Sed faucibus tortor ut felis placerat
                  euismod. Vestibulum pharetra velit et dolor ornare quis malesuada leo aliquam. Aenean 
                  lobortis, tortor iaculis vestibulum dictum, tellus nisi vestibulum libero, ultricies 
                  pretium nisi ante in neque. Integer et massa lectus. Aenean ut sem quam. Mauris at nisl 
                  augue, volutpat tempus nisl. Suspendisse luctus convallis metus, vitae pretium risus 
                  pretium vitae. Duis tristique euismod aliquam",

                @"Sed consectetur nisl et diam mattis varius. Aliquam ornare tincidunt arcu eget adipiscing. 
                  Etiam quis augue lectus, vel sollicitudin lorem. Fusce lacinia, leo non porttitor adipiscing, 
                  mauris purus lobortis ipsum, id scelerisque erat neque eget nunc. Suspendisse potenti. Etiam 
                  non urna non libero pulvinar consequat ac vitae turpis. Nam urna eros, laoreet id sagittis eu,
                  posuere in sapien. Phasellus semper convallis faucibus. Nulla fermentum faucibus tellus in 
                  rutrum. Maecenas quis risus augue, eu gravida massa."
            };

            string[][] words = texts.Tokenize();

            // Create a new BoW with options:
            var codebook = new BagOfWords()
            {
                MaximumOccurance = 1 // the resulting vector will have only 0's and 1's
            };

            // Compute the codebook (note: this would have to be done only for the training set)
            codebook.Learn(words);


            // Now, we can use the learned codebook to extract fixed-length
            // representations of the different texts (paragraphs) above:

            // Extract a feature vector from the text 1:
            double[] bow1 = codebook.Transform(words[0]);

            // Extract a feature vector from the text 2:
            double[] bow2 = codebook.Transform(words[1]);

            // we could also have transformed everything at once, i.e.
            // double[][] bow = codebook.Transform(words);


            // Now, since we have finite length representations (both bow1 and bow2 should
            // have the same size), we can pass them to any classifier or machine learning
            // method. For example, we can pass them to a Logistic Regression Classifier to
            // discern between the first and second paragraphs

            // Let's create a Logistic classifier to separate the two paragraphs:
            var learner = new IterativeReweightedLeastSquares <LogisticRegression>()
            {
                Tolerance      = 1e-4, // Let's set some convergence parameters
                Iterations     = 100,  // maximum number of iterations to perform
                Regularization = 0
            };

            // Now, we use the learning algorithm to learn the distinction between the two:
            LogisticRegression reg = learner.Learn(new[] { bow1, bow2 }, new[] { false, true });

            // Finally, we can predict using the classifier:
            bool c1 = reg.Decide(bow1); // Should be false
            bool c2 = reg.Decide(bow2); // Should be true
            #endregion

            Assert.AreEqual(bow1.Length, 99);
            Assert.AreEqual(bow2.Length, 99);

            Assert.AreEqual(bow1.Sum(), 67);
            Assert.AreEqual(bow2.Sum(), 63);

            Assert.IsFalse(c1);
            Assert.IsTrue(c2);
        }
Example 18
        /// <summary>
        /// Initializes the required components and runs 4 accuracy tests.
        /// Each test uses 90 tweets for training and 30 tweets for testing.
        /// </summary>
        /// <param name="inputFile">Tweets</param>
        /// <param name="outputFile">Labels</param>
        public void TestNaiveBayes(string inputFile, string outputFile)
        {
            //Create the training features: 4 sets of 90 tweets each
            string[][] tokens1 = ReadInputEx(inputFile, 0, 29);
            string[][] tokens2 = ReadInputEx(inputFile, 30, 59);
            string[][] tokens3 = ReadInputEx(inputFile, 60, 89);
            string[][] tokens4 = ReadInputEx(inputFile, 90, 119);

            //Read training output
            int[] outputs1 = ReadOutputEx(outputFile, 0, 29);
            int[] outputs2 = ReadOutputEx(outputFile, 30, 59);
            int[] outputs3 = ReadOutputEx(outputFile, 60, 89);
            int[] outputs4 = ReadOutputEx(outputFile, 90, 119);

            //Create BOW for each training set
            BagOfWords bow1 = new BagOfWords()
            {
                MaximumOccurance = 1
            };

            bow1.Learn(tokens1);

            BagOfWords bow2 = new BagOfWords()
            {
                MaximumOccurance = 1
            };

            bow2.Learn(tokens2);

            BagOfWords bow3 = new BagOfWords()
            {
                MaximumOccurance = 1
            };

            bow3.Learn(tokens3);

            BagOfWords bow4 = new BagOfWords()
            {
                MaximumOccurance = 1
            };

            bow4.Learn(tokens4);

            //Transform to feature vector
            double[][] inputs1 = bow1.Transform(tokens1);
            double[][] inputs2 = bow2.Transform(tokens2);
            double[][] inputs3 = bow3.Transform(tokens3);
            double[][] inputs4 = bow4.Transform(tokens4);

            //Create teachers
            var teacher1 = new NaiveBayesLearning <NormalDistribution>();

            teacher1.Options.InnerOption = new NormalOptions
            {
                Regularization = 1e-6 // to avoid zero variances
            };
            var teacher2 = new NaiveBayesLearning <NormalDistribution>();

            teacher2.Options.InnerOption = new NormalOptions
            {
                Regularization = 1e-6 // to avoid zero variances
            };
            var teacher3 = new NaiveBayesLearning <NormalDistribution>();

            teacher3.Options.InnerOption = new NormalOptions
            {
                Regularization = 1e-6 // to avoid zero variances
            };
            var teacher4 = new NaiveBayesLearning <NormalDistribution>();

            teacher4.Options.InnerOption = new NormalOptions
            {
                Regularization = 1e-6 // to avoid zero variances
            };

            //Create the Naive Bayes
            var nb1 = teacher1.Learn(inputs1, outputs1);
            var nb2 = teacher2.Learn(inputs2, outputs2);
            var nb3 = teacher3.Learn(inputs3, outputs3);
            var nb4 = teacher4.Learn(inputs4, outputs4);

            //Create the test sets: the remaining 30 tweets for each fold
            double[][] testInputs1  = bow1.Transform(ReadInputIn(inputFile, 0, 30));
            double[][] testInputs2  = bow2.Transform(ReadInputIn(inputFile, 30, 60));
            double[][] testInputs3  = bow3.Transform(ReadInputIn(inputFile, 60, 90));
            double[][] testInputs4  = bow4.Transform(ReadInputIn(inputFile, 90, 120));
            int[]      testOutputs1 = ReadOutputIn(outputFile, 0, 30);
            int[]      testOutputs2 = ReadOutputIn(outputFile, 30, 60);
            int[]      testOutputs3 = ReadOutputIn(outputFile, 60, 90);
            int[]      testOutputs4 = ReadOutputIn(outputFile, 90, 120);

            //predict answers
            int[] answers1 = nb1.Decide(testInputs1);
            int[] answers2 = nb2.Decide(testInputs2);
            int[] answers3 = nb3.Decide(testInputs3);
            int[] answers4 = nb4.Decide(testInputs4);

            int correct1 = 0;
            int correct2 = 0;
            int correct3 = 0;
            int correct4 = 0;

            int[][] confusionMatrix = { new int[3], new int[3], new int[3] };


            for (int i = 0; i < testOutputs1.Length; i++)
            {
                confusionMatrix[testOutputs1[i]][answers1[i]]++;
                confusionMatrix[testOutputs2[i]][answers2[i]]++;
                confusionMatrix[testOutputs3[i]][answers3[i]]++;
                confusionMatrix[testOutputs4[i]][answers4[i]]++;
                if (answers1[i] == testOutputs1[i])
                {
                    correct1++;
                }
                if (answers2[i] == testOutputs2[i])
                {
                    correct2++;
                }
                if (answers3[i] == testOutputs3[i])
                {
                    correct3++;
                }
                if (answers4[i] == testOutputs4[i])
                {
                    correct4++;
                }
            }

            double accuracy1       = ((double)correct1 / 30);
            double accuracy2       = ((double)correct2 / 30);
            double accuracy3       = ((double)correct3 / 30);
            double accuracy4       = ((double)correct4 / 30);
            double averageAccuracy = (((double)(correct1 + correct2 + correct3 + correct4)) / 120);

            Console.WriteLine(accuracy1 + " " + accuracy2 + " " + accuracy3 + " " + accuracy4);
            Console.WriteLine(averageAccuracy);


            Console.WriteLine();
            for (int i = 0; i < 3; i++)
            {
                for (int j = 0; j < 3; j++)
                {
                    Console.Write(confusionMatrix[i][j] + " ");
                }
                Console.WriteLine();
            }
        }
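The manual counting loop above can also be expressed with the `GeneralConfusionMatrix` type that Example 16 uses; a sketch of the equivalent for the first fold, which would sit inside the method:

            // Sketch: accuracy for the first fold via Accord's confusion matrix,
            // mirroring the GeneralConfusionMatrix usage in Example 16.
            var cm1 = new GeneralConfusionMatrix(predicted: answers1, expected: testOutputs1);
            Console.WriteLine(cm1.Accuracy);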