//LogisticRegression reg;

/// <summary>
/// Builds a classifier from a host file: learns a bag-of-words codebook from the
/// file contents and trains a <c>SimpleClassifierNN</c> on the encoded samples.
/// </summary>
/// <param name="fileName">File handed to <c>Utilities.ReadHostFile</c> to obtain the training data.</param>
/// <param name="countLayers">Forwarded unchanged to the <c>SimpleClassifierNN</c> constructor.</param>
/// <param name="countEpoch">Forwarded unchanged to the <c>SimpleClassifierNN</c> constructor.</param>
public LogClassifier(string fileName, int countLayers, int countEpoch)
{
    // MaximumOccurance = 1 caps every word count at one occurrence.
    Codebook = new BagOfWords() { MaximumOccurance = 1 };

    int sampleCount = 0;
    var hostData = Utilities.ReadHostFile(fileName, ref sampleCount);
    Samples = sampleCount;

    // Nothing to train on: classifier, Error and TrainingTime are left untouched.
    if (hostData.Item1.Length == 0 || hostData.Item2.Length == 0)
    {
        return;
    }

    // Learn the vocabulary and encode the same data with it.
    Codebook.Learn(hostData.Item1);
    double[][] features = Codebook.Transform(hostData.Item1);
    double[][] targets = Utilities.BoolToDouble(hostData.Item2);

    classifier = new SimpleClassifierNN(features, targets, features.Length, countLayers, countEpoch);

    // Train() returns a pair: Item1 is stored as Error, Item2 as TrainingTime.
    var outcome = classifier.Train(features, targets);
    Error = outcome.Item1;
    TrainingTime = outcome.Item2;
}
/// <summary>
/// Reads the training tweets, tokenizes them, saves the tokens for future use
/// and encodes them with the shared Bag of Words.
/// </summary>
/// <param name="inputDoc">Name of the training tweets document, resolved against <c>Constants.PROGRAM_DATA_FILEPATH</c>.</param>
/// <returns>The tweets encoded in Bag of Words format.</returns>
double[][] ReadInput(string inputDoc)
{
    List<string> tweets = new List<string>();

    // Hold the file open only while reading. Disposing the reader closes it,
    // so no explicit DiscardBufferedData()/Close() call is required.
    using (StreamReader reader = new StreamReader(Constants.PROGRAM_DATA_FILEPATH + @"\" + inputDoc))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            tweets.Add(line);
        }
    }

    // Use custom tokenizer
    string[][] tokens = tp.Tokenizer(tweets);

    // Persist the tokens so the codebook can be rebuilt later without re-reading the tweets.
    FileHelper.WriteObjectToFile("BagOfWords90.txt", tokens);

    bagOfWords.Learn(tokens);
    return bagOfWords.Transform(tokens);
}
/// <summary>
/// Encodes integer sequences as bag-of-words histograms, trains a multi-class
/// SVM with a Chi-Square kernel on them and measures the accuracy on the same
/// data. The accuracy is computed but not returned or reported.
/// </summary>
/// <param name="inputs">Integer sequences to encode and learn from.</param>
/// <param name="outputs">Class label of each sequence in <paramref name="inputs"/>.</param>
private static void bagOfWords(int[][] inputs, int[] outputs)
{
    // Learn the codebook and turn every sequence into a histogram.
    var codebook = new BagOfWords<int>();
    var encoder = codebook.Learn(inputs);
    double[][] features = encoder.Transform(inputs);

    // Multi-class SVM trainer; each inner machine is trained with SMO.
    var trainer = new MulticlassSupportVectorLearning<ChiSquare, double[]>();
    trainer.Learner = (parameters) => new SequentialMinimalOptimization<ChiSquare, double[]>()
    {
        Complexity = 10000.0 // Create a hard SVM
    };

    // Fit the machine and classify the training data itself.
    var machine = trainer.Learn(features, outputs);
    int[] guesses = machine.Decide(features);

    // Compare predictions with the expected labels.
    var confusion = new ConfusionMatrix(predicted: guesses, expected: outputs);
    double accuracy = confusion.Accuracy;
}
/// <summary>
/// Returns the most frequent words of <paramref name="body"/>, most frequent first.
/// </summary>
/// <param name="body">Text to analyse, passed as-is to <c>BagOfWords.Learn</c> and <c>BagOfWords.Transform</c>.</param>
/// <param name="count">Maximum number of keywords to return.</param>
/// <param name="minOccurance">Only words counted strictly more often than this are returned.</param>
/// <returns>Up to <paramref name="count"/> keywords, ordered by descending occurrence count.</returns>
public static List<string> GetKeywords(string[] body, int count, int minOccurance = 0)
{
    // Per-word counts are capped at 500 by MaximumOccurance.
    var bow = new BagOfWords() { MaximumOccurance = 500 };
    bow.Learn(body);

    // occurrences[code] receives the count of the word with that code.
    int[] occurrences = new int[body.Length];
    bow.Transform(body, occurrences);

    var codebook = bow.CodeToString;

    // Filter before sorting so only qualifying words are ordered. The sort is
    // stable, so ties keep their code order exactly as before.
    return occurrences
        .Select((value, index) => new { value, index })
        .Where(entry => entry.value > minOccurance)
        .OrderByDescending(entry => entry.value)
        .Take(count)
        .Select(entry => codebook[entry.index])
        .ToList();
}
/// <summary>
/// Replaces the text bag-of-words codebook with a fresh one learned from
/// <paramref name="inputs"/> and returns those inputs encoded with it.
/// </summary>
/// <param name="inputs">Tokenized texts used both to learn the codebook and to be encoded.</param>
/// <returns>One feature vector per entry of <paramref name="inputs"/>.</returns>
private double[][] CreateTextBagOfWords(string[][] inputs)
{
    // Word counts are capped at one occurrence per text.
    var freshCodebook = new BagOfWords { MaximumOccurance = 1 };

    // Publish the new codebook before learning, then learn and encode.
    _textBagOfWords = freshCodebook;
    _textBagOfWords.Learn(inputs);

    double[][] encoded = _textBagOfWords.Transform(inputs);
    return encoded;
}
//Make this class a singleton so that it is not retrained for every class it is used by
/// <summary>
/// Loads the liberal and conservative training samples and the stop-word list,
/// learns a bag-of-words codebook from the samples and trains the Naive Bayes
/// classifier (class 0 = liberal file, class 1 = conservative file).
/// </summary>
/// <exception cref="InvalidOperationException">No training samples could be read.</exception>
private TextAnalyzer()
{
    //Read in the training data and stop words
    string liberalTrainingPath = System.Web.Hosting.HostingEnvironment.MapPath(@"~/Data/liberal_training.txt");
    string conservativeTrainingPath = System.Web.Hosting.HostingEnvironment.MapPath(@"~/Data/conservative_training.txt");
    string stopWordsPath = System.Web.Hosting.HostingEnvironment.MapPath(@"~/Data/stop_words.txt");

    string[] liberalSamples = File.ReadAllLines(liberalTrainingPath);
    string[] conservativeSamples = File.ReadAllLines(conservativeTrainingPath);
    stopWords = File.ReadAllLines(stopWordsPath);

    //Concat the samples into one array (the per-file arrays are kept so the
    //number of samples of each class is known when building the labels)
    string[] samples = liberalSamples.Concat(conservativeSamples).ToArray();

    //Break the text up into individual words
    string[][] words = samples.Tokenize();

    //Without any training data the classifier cannot work, so fail loudly.
    //InvalidOperationException is still caught by existing catch (Exception) handlers.
    if (words.Length == 0)
    {
        throw new InvalidOperationException("No training data for TextAnalyzer");
    }

    //Remove common english words
    words = TrimStopWords(words);

    //Create a bag of words using the tokenized sample data
    bagOfWords = new BagOfWords();
    bagOfWords.Learn(words);

    //Label the samples: the liberal samples come first (0), the conservative ones follow (1)
    int[] outputs = new int[samples.Length];
    for (int i = 0; i < outputs.Length; i++)
    {
        outputs[i] = i < liberalSamples.Length ? 0 : 1;
    }

    //Create the Naive Bayes trainer, allowing for some regularization
    var teacher = new NaiveBayesLearning<NormalDistribution, NormalOptions>()
    {
        Options = { InnerOption = { Regularization = 1e-6 } }
    };

    //Train the classifier
    double[][] inputs = bagOfWords.Transform(words);
    nbClassifier = teacher.Learn(inputs, outputs);
}
/// <summary>
/// Checks that a Chi-Square SVM trained on bag-of-words histograms of small
/// integer sequences reaches the expected training accuracy of 0.75.
/// </summary>
public void learn_generic1()
{
    // Eight toy sequences: the first four belong to class 0, the last four to class 1.
    int[][] sequences =
    {
        new[] { 0, 0, 1, 2 },
        new[] { 0, 1, 1, 2 },
        new[] { 0, 0, 0, 1, 2 },
        new[] { 0, 1, 2, 2, 2 },

        new[] { 2, 2, 1, 0 },
        new[] { 2, 2, 2, 1, 0 },
        new[] { 2, 2, 2, 1, 0 },
        new[] { 2, 2, 2, 2, 1 },
    };

    int[] outputs = { 0, 0, 0, 0, 1, 1, 1, 1 };

    // Single-threaded bag-of-words learner.
    var codebook = new BagOfWords<int>();
    codebook.ParallelOptions.MaxDegreeOfParallelism = 1;

    // Learn the quantizer and encode the sequences as vectors.
    var encoder = codebook.Learn(sequences);
    double[][] vectors = encoder.Transform(sequences);

    // Multi-class SVM trainer whose inner machines are trained with SMO.
    var trainer = new MulticlassSupportVectorLearning<ChiSquare, double[]>();
    trainer.Learner = (parameters) => new SequentialMinimalOptimization<ChiSquare, double[]>()
    {
        Complexity = 100000
    };

    // Train, then classify the training set itself.
    var machine = trainer.Learn(vectors, outputs);
    int[] guesses = machine.Decide(vectors);

    var confusion = new ConfusionMatrix(predicted: guesses, expected: outputs);
    double acc = confusion.Accuracy;

    Assert.AreEqual(0.75, acc);
}
/// <summary>
/// Demonstrates bag-of-words encoding on two pre-tokenized Japanese sentences
/// and separates them with a logistic regression classifier, printing both
/// predictions to the console.
/// </summary>
public void ExecuteTest()
{
    string[][] words = new string[][]
    {
        new[] { "今日", "は", "いい", "天気", "です" },
        new[] { "明日", "も", "いい", "天気", "でしょう" }
    };

    // Word counts are not capped here. With MaximumOccurance = 1 the vectors
    // would contain only 0's and 1's.
    var codebook = new BagOfWords();
    codebook.MaximumOccurance = int.MaxValue;

    // Learn the vocabulary (for real use: from the training set only).
    codebook.Learn(words);

    // Fixed-length representation of each sentence...
    double[] firstVector = codebook.Transform(words[0]);
    double[] secondVector = codebook.Transform(words[1]);

    // ...or of all sentences at once (result not used further).
    double[][] bow = codebook.Transform(words);

    // Both vectors have the same length, so they can be fed to any
    // classifier; here a logistic regression tells the sentences apart.
    var learner = new IterativeReweightedLeastSquares<LogisticRegression>();
    learner.Tolerance = 1e-4;   // convergence tolerance
    learner.Iterations = 100;   // maximum number of iterations to perform
    learner.Regularization = 0;

    LogisticRegression reg = learner.Learn(
        new[] { firstVector, secondVector },
        new[] { false, true });

    bool firstDecision = reg.Decide(firstVector);   // Should be false
    bool secondDecision = reg.Decide(secondVector); // Should be true

    Console.WriteLine(firstDecision);
    Console.WriteLine(secondDecision);
}
/// <summary>
/// Gets the trained Bag of Words, building it on first use from the tokens
/// stored in <c>BagOfWords90.txt</c>.
/// </summary>
/// <returns>The cached, trained Bag of Words instance.</returns>
public static BagOfWords GetBagOfWords()
{
    if (bagOfWords == null)
    {
        // Build into a local first. If reading or learning throws, the cache
        // stays null and the next call retries, instead of handing out an
        // untrained codebook.
        string[][] tokens = ReadObjectFromFile<string[][]>(@"BagOfWords90.txt");

        var trained = new BagOfWords() { MaximumOccurance = 1 };
        trained.Learn(tokens);

        bagOfWords = trained;
    }

    return bagOfWords;
}
/// <summary>
/// Restores a classifier from a saved network file and rebuilds the
/// bag-of-words codebook from the dictionary file.
/// </summary>
/// <param name="networkFileName">File passed to the <c>SimpleClassifierNN</c> constructor.</param>
/// <param name="dictionaryFileName">File handed to <c>Utilities.ReadHostFile</c> to obtain the codebook vocabulary.</param>
public LogClassifier(string networkFileName, string dictionaryFileName)
{
    // Word counts are capped at one occurrence.
    Codebook = new BagOfWords() { MaximumOccurance = 1 };

    // The sample count reported by ReadHostFile is not stored by this constructor.
    int ignoredSampleCount = 0;
    var hostData = Utilities.ReadHostFile(dictionaryFileName, ref ignoredSampleCount);

    // Learn the vocabulary only when the dictionary actually has entries.
    if (hostData.Item1.Length > 0)
    {
        Codebook.Learn(hostData.Item1);
    }

    classifier = new SimpleClassifierNN(networkFileName);
}
/// <summary>
/// Loads the initial strings of the edits made by technician "CSM05", encodes
/// the first two as bag-of-words vectors and prints their cosine similarity.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // NOTE(review): if DiferenciasEntities is disposable (e.g. an Entity
    // Framework context), it should be wrapped in a using block - confirm.
    DiferenciasEntities db = new DiferenciasEntities();

    string[] edits = db.Edicion
        .Where(x => x.Tecnico.Equals("CSM05"))
        .Select(x => x.CadenaInicial)
        .ToArray();

    // The comparison below needs two texts. Without this guard, indexing
    // words[1] throws when the query returns fewer rows.
    if (edits.Length < 2)
    {
        Console.WriteLine("At least two edits are required to compute a similarity.");
        return;
    }

    string[][] words = edits.Tokenize();

    // Word counts are capped at one occurrence.
    var codebook = new BagOfWords() { MaximumOccurance = 1 };
    codebook.Learn(words);

    double[] bow1 = codebook.Transform(words[0]);
    double[] bow2 = codebook.Transform(words[1]);

    Cosine cosine = new Cosine();
    Console.WriteLine(cosine.Similarity(bow1, bow2));
}
/// <summary>
/// Runs the training pipeline: loads the "Training" worksheet, tokenizes the
/// sentences, learns the bag of words, featurizes the tokens and trains the
/// Naive Bayes model. Each stage is bracketed by
/// <c>StartMeasure</c>/<c>StopMeasure</c> calls on the returned result.
/// </summary>
/// <returns>The <c>TrainingResult</c> on which the stages were measured.</returns>
public TrainingResult Train()
{
    var measurements = new TrainingResult();

    // Stage 1: load the data set (labels and raw sentences).
    measurements.StartMeasure(TrainingResult.RecordType.LoadDataset);
    var excel = new ExcelReader(Helpers.DatasetPath);
    DataTable sheet = excel.GetWorksheet("Training");
    int[] classLabels = sheet.ToVector<int>("Label");
    string[] sentences = sheet.ToVector<string>("Sentiment");
    measurements.StopMeasure();

    // Stage 2: tokenize every sentence with the preprocessor.
    measurements.StartMeasure(TrainingResult.RecordType.Tokenization);
    string[][] tokens = sentences.Select(sentence => _preprocessor.Process(sentence)).ToArray();
    measurements.StopMeasure();

    // Stage 3: learn the bag-of-words vocabulary.
    measurements.StartMeasure(TrainingResult.RecordType.BagOfWordsLearning);
    _bagOfWords = new BagOfWords();
    _bagOfWords.Learn(tokens);
    measurements.StopMeasure();

    // Stage 4: turn the tokens into integer feature vectors.
    measurements.StartMeasure(TrainingResult.RecordType.Featurization);
    int[][] features = _bagOfWords.Transform(tokens).ToInt32();
    measurements.StopMeasure();

    // Stage 5: train the Naive Bayes model.
    measurements.StartMeasure(TrainingResult.RecordType.NaiveBayesLearning);
    var bayesTeacher = new NaiveBayesLearning();
    _bayes = bayesTeacher.Learn(features, classLabels);
    measurements.StopMeasure();

    return measurements;
}
/// <summary>
/// Demo entry point: runs the Stanford NER classifier on a sample sentence,
/// prints the first entity found for each of seven tags, then repeatedly reads
/// a question from the console and prints the SVM decision for it
/// ("BAYAR" for false, "PERGI" for true) until the input stream ends.
/// </summary>
static void Main()
{
    // Path to the folder with classifiers models
    var jarRoot = @"\Users\devir\OneDrive\Documents\Visual Studio 2015\Projects\ner";
    var classifiersDirecrory = jarRoot + @"\classifiers";

    // Load the classifier model (the 7-class MUC model, per the file name)
    var classifier = CRFClassifier.getClassifierNoExceptions(
        classifiersDirecrory + @"\english.muc.7class.distsim.crf.ser.gz");

    var s1 = " She got up this morning at 9:00 am and went to a shop to spend five dollars to buy a 50% off toothbrush.";

    Console.WriteLine("{0}\n", classifier.classifyToCharacterOffsets(s1));

    // Classify once and reuse the inline-XML result for printing and extraction.
    string result = classifier.classifyWithInlineXML(s1);
    Console.WriteLine("{0}\n", result);

    // Show the NER results one by one: take the first match for every tag.
    string[] tags = { "TIME", "LOCATION", "PERSON", "ORGANIZATION", "MONEY", "Percent", "Date" };
    string[] gabungan = new string[tags.Length];
    for (int i = 0; i < tags.Length; i++)
    {
        string[] found = GetStringInBetween("<" + tags[i] + ">", "</" + tags[i] + ">", result, false, false);
        gabungan[i] = found[0];
    }

    // All entities on one line, each followed by ';', then one per line.
    string semua = string.Join(";", gabungan) + ";";
    Console.WriteLine(semua);
    foreach (var a in gabungan)
    {
        Console.WriteLine(a);
    }

    // Bag of words over the extracted entities (binary vectors).
    // NOTE(review): the resulting vectors are not consumed anywhere below.
    string[][] words = gabungan.Tokenize();
    var codebook = new BagOfWords()
    {
        MaximumOccurance = 1 // the resulting vector will have only 0's and 1's
    };
    codebook.Learn(words);
    double[][] keseluruhanBOW1 = codebook.Transform(words);

    // Load the SVM once. The model file does not change between questions,
    // so there is no need to re-read it on every iteration.
    string path = @"C:\Users\devir\OneDrive\Documents\Visual Studio 2015\Projects\ner";
    LibSvmModel model = LibSvmModel.Load(Path.Combine(path, "pelatihanSVMbayardanpergi.txt"));
    SupportVectorMachine svm = model.CreateMachine();

    while (true)
    {
        Console.Write("Enter question: ");
        string val = Console.ReadLine();

        // ReadLine returns null at end of input: stop instead of passing null on.
        if (val == null)
        {
            break;
        }

        string[] textss = { val };
        string[][] wordss = textss.Tokenize();

        // NOTE(review): this codebook is learned from the question alone, so its
        // feature indices presumably do not match those the SVM was trained with.
        var codebook2 = new BagOfWords()
        {
            MaximumOccurance = 1 // the resulting vector will have only 0's and 1's
        };
        codebook2.Learn(wordss);
        double[] c1 = codebook2.Transform(wordss[0]);

        bool predicted = svm.Decide(c1);
        Console.WriteLine(predicted ? "PERGI" : "BAYAR");

        // Wait for Enter before asking the next question.
        Console.ReadLine();
    }
}
/// <summary>
/// Regression test for "Text naive bayes classification gives wrong results" (#168,
/// https://github.com/accord-net/framework/issues/168): trains a discrete Naive
/// Bayes model on two texts and checks the learned distributions and three predictions.
/// </summary>
public void issue_168()
{
    // The two sample texts
    string[] spamTokens = Tokenize(@"I decided to sign up for the Disney Half Marathon. Half of a marathon is 13.1 miles. A full marathon is 26.2 miles. You may wonder why the strange number of miles. “26.2” is certainly not an even number. And after running 26 miles who cares about the point two? You might think that 26.2 miles is a whole number of kilometers. It isn’t. In fact, it is even worse in kilometers – 42.1648128. I bet you don’t see many t-shirts in England with that number printed on the front.");
    string[] loremTokens = Tokenize(@"Lorem ipsum dolor sit amet, Nulla nec tortor. Donec id elit quis purus consectetur consequat. Nam congue semper tellus. Sed erat dolor, dapibus sit amet, venenatis ornare, ultrices ut, nisi. Aliquam ante. Suspendisse scelerisque dui nec velit. Duis augue augue, gravida euismod, vulputate ac, facilisis id, sem. Morbi in orci. Nulla purus lacus, pulvinar vel, malesuada ac, mattis nec, quam. Nam molestie scelerisque quam. 
Nullam feugiat cursus lacus.orem ipsum dolor sit amet.");

    // Class names, indexed by class label
    string[] classes = { "spam", "lorem" };

    // Binary bag of words: every word counts at most once per text
    BagOfWords bow = new BagOfWords();
    bow.MaximumOccurance = 1;
    bow.Learn(new[] { spamTokens, loremTokens });

    // The word "in" appears in both sample texts and is expected at code 52
    string word = bow.CodeToString[52];
    Assert.AreEqual("in", word);

    // Training data: one feature vector per text, labelled 0 (spam) and 1 (lorem)
    int[][] inputs = new int[][]
    {
        bow.GetFeatureVector(spamTokens),
        bow.GetFeatureVector(loremTokens)
    };
    int[] outputs = { 0, 1 };

    // Discrete naive Bayes learner (Laplace rule left disabled)
    var teacher = new NaiveBayesLearning()
    {
        Empirical = true,
        Options = new IndependentOptions<GeneralDiscreteOptions>()
        {
            InnerOption = new GeneralDiscreteOptions()
            {
                //UseLaplaceRule = true
            }
        }
    };

    // Only needed for reproducible results; remove, comment out or change
    // this line to enable full parallelism
    teacher.ParallelOptions.MaxDegreeOfParallelism = 1;

    // Estimate the model
    var nb = teacher.Learn(inputs, outputs);

    double[][] spamDist = nb.Distributions.GetRow(0);
    double[][] loremDist = nb.Distributions.GetRow(1);

    for (int i = 0; i < spamDist.Length; i++)
    {
        // Expected value at index 0 of each distribution; index 1 must hold the complement.
        double spamAtZero;
        double loremAtZero;

        if (i == 52)
        {
            // the shared word
            spamAtZero = 0.0;
            loremAtZero = 0.0;
        }
        else if (i < 68)
        {
            spamAtZero = 0.0;
            loremAtZero = 1.0;
        }
        else
        {
            spamAtZero = 1.0;
            loremAtZero = 0.0;
        }

        Assert.AreEqual(spamDist[i][0], spamAtZero, 1e-8);
        Assert.AreEqual(spamDist[i][1], 1.0 - spamAtZero, 1e-8);
        Assert.AreEqual(loremDist[i][0], loremAtZero, 1e-8);
        Assert.AreEqual(loremDist[i][1], 1.0 - loremAtZero, 1e-8);
    }

    // Consume the model

    // Expected to be classified as lorem
    int[] firstInput = bow.GetFeatureVector(Tokenize(@"I decided to sign up for"));
    int firstAnswer = nb.Decide(firstInput);
    Assert.AreEqual("lorem", classes[firstAnswer]);

    // Expected to be classified as spam
    int[] secondInput = bow.GetFeatureVector(Tokenize(@"I decided to sign up for the"));
    int secondAnswer = nb.Decide(secondInput);
    Assert.AreEqual("spam", classes[secondAnswer]);

    // Expected to be classified as lorem
    int[] thirdInput = bow.GetFeatureVector(Tokenize(@"I decided to lorem ipsum nulla nec tortor purus sit amet"));
    int thirdAnswer = nb.Decide(thirdInput);
    Assert.AreEqual("lorem", classes[thirdAnswer]);
}
/// <summary>
/// Trains a Naive Bayes model that separates program-generated text (class 0)
/// from other text (class 1), classifies a fixed sample sentence, and prints the
/// predicted class followed by the two class probabilities.
/// </summary>
public void ClassifyText()
{
    readFile();

    // One document per class, built from the two string builders.
    string[] documents =
    {
        sbGeneratedByProgram.ToString(),
        sbNotGeneratedByProgram.ToString()
    };
    string[][] tokens = documents.Tokenize();

    // Binary bag of words: the resulting vectors contain only 0's and 1's.
    var bow = new BagOfWords();
    bow.MaximumOccurance = 1;
    bow.Learn(tokens);

    // Training data: one integer feature vector per document.
    int[][] inputs = new int[][]
    {
        bow.Transform(tokens[0]).ToInt32(),
        bow.Transform(tokens[1]).ToInt32()
    };
    int[] outputs =
    {
        0, // Program Generated
        1  // Not Program Generated
    };

    // Naive Bayes learner with the Laplace rule enabled.
    var learner = new NaiveBayesLearning();
    learner.Options.InnerOption.UseLaplaceRule = true;
    var nb = learner.Learn(inputs, outputs);

    // Encode the sample sentence with the learned codebook and classify it.
    string[] sample = "Yup! Two years without a car! Downtown living is morning too I got that this morning too too If the wi. Here's hoping!".Tokenize();
    int[] sampleVector = bow.Transform(sample).ToInt32();

    int answer = nb.Decide(sampleVector);
    Console.WriteLine(answer);

    var probabilities = nb.Probabilities(sampleVector);
    Console.WriteLine(probabilities[0] + " " + probabilities[1]);
}
/// <summary>
/// Trains a Chi-Square SVM on K-Means bag-of-words vectors of the z-score
/// normalized PENDIGITS sequences and checks the training and testing accuracies.
/// </summary>
public void learn_pendigits_normalization()
{
    Console.WriteLine("Starting BagOfWordsTest.learn_pendigits_normalization");

    using (var travis = new KeepTravisAlive())
    {
        #region doc_learn_pendigits
        // A Bag-of-Words model turns sequences of arbitrary length, such as
        // handwritten digit strokes, into feature vectors of finite length.

        // Fix the random seed so that results are reproducible
        Accord.Math.Random.Generator.Seed = 0;

        // Fetch the PENDIGITS dataset from the UCI ML repository
        var pendigits = new Pendigits(path: Path.GetTempPath());

        // Training sequences and labels
        double[][][] trainInputs = pendigits.Training.Item1;
        int[] trainOutputs = pendigits.Training.Item2;

        // Center and scale every digit
        trainInputs = trainInputs.Apply(Accord.Statistics.Tools.ZScores);

        // Bag-of-Words learner clustering the points with K-Means (k = 5)
        var bow = new BagOfWords<double[], KMeans>();
        bow.Clustering = new KMeans(5);

        // Learn the quantizer, then encode the pen sequences as vectors
        var quantizer = bow.Learn(trainInputs);
        double[][] trainVectors = quantizer.Transform(trainInputs);

        // Multi-class SVM learner whose inner machines are trained with SMO
        var teacher = new MulticlassSupportVectorLearning<ChiSquare, double[]>();
        teacher.Learner = (parameters) => new SequentialMinimalOptimization<ChiSquare, double[]>()
        {
            Complexity = 1
        };

        // Train the classifier and predict the training set
        var svm = teacher.Learn(trainVectors, trainOutputs);
        int[] trainPredicted = svm.Decide(trainVectors);

        // Training performance against the ground-truth
        var trainMatrix = new GeneralConfusionMatrix(predicted: trainPredicted, expected: trainOutputs);
        double trainAcc = trainMatrix.Accuracy; // should be 0.690

        // Testing sequences and labels
        double[][][] testInputs = pendigits.Testing.Item1;
        int[] testOutputs = pendigits.Testing.Item2;

        // Same normalization and encoding as for the training set
        testInputs = testInputs.Apply(Accord.Statistics.Tools.ZScores);
        double[][] testVectors = quantizer.Transform(testInputs);

        // Predict the test set and compare with the ground-truth
        int[] testPredicted = svm.Decide(testVectors);
        var testMatrix = new GeneralConfusionMatrix(predicted: testPredicted, expected: testOutputs);
        double testAcc = testMatrix.Accuracy; // should be 0.600
        #endregion

#if NET35
        Assert.AreEqual(0.89594053744997137d, trainAcc, 1e-10);
        Assert.AreEqual(0.89605017347211102d, testAcc, 1e-10);
#else
        Assert.AreEqual(0.69039451114922812, trainAcc, 1e-10);
        Assert.AreEqual(0.600880704563651, testAcc, 1e-10);
#endif
    }
}
/// <summary>
/// Documentation sample and test: encodes two paragraphs with a binary bag of
/// words, separates them with logistic regression and checks the vector sizes,
/// sums and predictions.
/// </summary>
public void learn_test()
{
    #region doc_learn
    // A Bag-of-Words model turns sequences of arbitrary length, for
    // example texts, into feature vectors of finite length:

    string[] texts = new string[]
    {
        @"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Maecenas molestie malesuada nisi et placerat. Curabitur blandit porttitor suscipit. Nunc facilisis ultrices felis, vitae luctus arcu semper in. Fusce ut felis ipsum. Sed faucibus tortor ut felis placerat euismod. Vestibulum pharetra velit et dolor ornare quis malesuada leo aliquam. Aenean lobortis, tortor iaculis vestibulum dictum, tellus nisi vestibulum libero, ultricies pretium nisi ante in neque. Integer et massa lectus. Aenean ut sem quam. Mauris at nisl augue, volutpat tempus nisl. Suspendisse luctus convallis metus, vitae pretium risus pretium vitae. Duis tristique euismod aliquam",
        @"Sed consectetur nisl et diam mattis varius. Aliquam ornare tincidunt arcu eget adipiscing. Etiam quis augue lectus, vel sollicitudin lorem. Fusce lacinia, leo non porttitor adipiscing, mauris purus lobortis ipsum, id scelerisque erat neque eget nunc. Suspendisse potenti. Etiam non urna non libero pulvinar consequat ac vitae turpis. Nam urna eros, laoreet id sagittis eu, posuere in sapien. Phasellus semper convallis faucibus. Nulla fermentum faucibus tellus in rutrum. Maecenas quis risus augue, eu gravida massa."
    };

    string[][] words = texts.Tokenize();

    // New BoW whose vectors will contain only 0's and 1's
    var codebook = new BagOfWords();
    codebook.MaximumOccurance = 1;

    // Learn the codebook (for real use: from the training set only)
    codebook.Learn(words);

    // The learned codebook now yields a fixed-length
    // representation for each of the paragraphs above:
    double[] bow1 = codebook.Transform(words[0]); // text 1
    double[] bow2 = codebook.Transform(words[1]); // text 2

    // everything could also have been transformed at once, i.e.
    // double[][] bow = codebook.Transform(words);

    // Both representations have the same size, so any classifier or machine
    // learning method can consume them. Here a Logistic Regression classifier
    // is used to discern between the first and second paragraphs:
    var learner = new IterativeReweightedLeastSquares<LogisticRegression>();
    learner.Tolerance = 1e-4;   // convergence tolerance
    learner.Iterations = 100;   // maximum number of iterations to perform
    learner.Regularization = 0;

    // Learn the distinction between the two paragraphs:
    LogisticRegression reg = learner.Learn(
        new[] { bow1, bow2 },
        new[] { false, true });

    // And predict with the learned classifier:
    bool c1 = reg.Decide(bow1); // Should be false
    bool c2 = reg.Decide(bow2); // Should be true
    #endregion

    Assert.AreEqual(bow1.Length, 99);
    Assert.AreEqual(bow2.Length, 99);

    Assert.AreEqual(bow1.Sum(), 67);
    Assert.AreEqual(bow2.Sum(), 63);

    Assert.IsFalse(c1);
    Assert.IsTrue(c2);
}
/// <summary>
/// Initiates the required components and runs 4 accuracy tests.
/// Each test uses 90 tweets for training and the remaining 30 for testing.
/// Prints the four accuracies, the average accuracy and a 3x3 confusion
/// matrix (rows: expected class, columns: predicted class).
/// </summary>
/// <param name="inputFile">Tweets</param>
/// <param name="outputFile">Labels</param>
public void TestNaiveBayes(string inputFile, string outputFile)
{
    const int FoldCount = 4;
    const int FoldSize = 30;
    const int ClassCount = 3;

    // Confusion matrix shared by all folds, initialised to zeros
    int[][] confusionMatrix = new int[ClassCount][];
    for (int c = 0; c < ClassCount; c++)
    {
        confusionMatrix[c] = new int[ClassCount];
    }

    double[] accuracies = new double[FoldCount];
    int totalCorrect = 0;
    int totalTested = 0;

    for (int fold = 0; fold < FoldCount; fold++)
    {
        int tested;
        int correct = EvaluateNaiveBayesFold(
            inputFile, outputFile, fold * FoldSize, FoldSize, confusionMatrix, out tested);

        // Divide by the number of samples actually tested rather than a fixed 30
        accuracies[fold] = (double)correct / tested;
        totalCorrect += correct;
        totalTested += tested;
    }

    double averageAccuracy = (double)totalCorrect / totalTested;

    Console.WriteLine(accuracies[0] + " " + accuracies[1] + " " + accuracies[2] + " " + accuracies[3]);
    Console.WriteLine(averageAccuracy);
    Console.WriteLine();

    for (int i = 0; i < ClassCount; i++)
    {
        for (int j = 0; j < ClassCount; j++)
        {
            Console.Write(confusionMatrix[i][j] + " ");
        }
        Console.WriteLine();
    }
}

/// <summary>
/// Trains a Naive Bayes classifier for one fold and evaluates it on that fold's
/// test samples, adding the outcomes to <paramref name="confusionMatrix"/>.
/// </summary>
/// <param name="inputFile">Tweets</param>
/// <param name="outputFile">Labels</param>
/// <param name="first">Index of the first sample of the fold.</param>
/// <param name="size">Number of samples in the fold.</param>
/// <param name="confusionMatrix">Matrix updated in place, indexed [expected][predicted].</param>
/// <param name="tested">Receives the number of test samples evaluated.</param>
/// <returns>The number of correctly classified test samples.</returns>
private int EvaluateNaiveBayesFold(string inputFile, string outputFile, int first, int size, int[][] confusionMatrix, out int tested)
{
    // Training data. The "Ex" readers are called with an inclusive end index
    // (first + size - 1), the "In" readers below with first + size.
    // NOTE(review): presumably "Ex" returns everything outside the range and
    // "In" only the range itself - verify against the reader implementations.
    string[][] trainTokens = ReadInputEx(inputFile, first, first + size - 1);
    int[] trainOutputs = ReadOutputEx(outputFile, first, first + size - 1);

    // Binary bag of words learned from the training tokens only
    BagOfWords bow = new BagOfWords() { MaximumOccurance = 1 };
    bow.Learn(trainTokens);
    double[][] trainInputs = bow.Transform(trainTokens);

    var teacher = new NaiveBayesLearning<NormalDistribution>();
    teacher.Options.InnerOption = new NormalOptions
    {
        Regularization = 1e-6 // to avoid zero variances
    };
    var nb = teacher.Learn(trainInputs, trainOutputs);

    // Test data for this fold, encoded with the codebook learned above
    double[][] testInputs = bow.Transform(ReadInputIn(inputFile, first, first + size));
    int[] testOutputs = ReadOutputIn(outputFile, first, first + size);

    // Predict answers
    int[] answers = nb.Decide(testInputs);

    int correct = 0;
    for (int i = 0; i < testOutputs.Length; i++)
    {
        confusionMatrix[testOutputs[i]][answers[i]]++;
        if (answers[i] == testOutputs[i])
        {
            correct++;
        }
    }

    tested = testOutputs.Length;
    return correct;
}