/// <summary>
/// Entry point: builds a small hard-coded list of labeled records, constructs a
/// <c>NaiveBayesClassifier</c> from it, and stores the classifier's answer on a
/// record whose label is initially the placeholder "Unknown".
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    // Each record is a label followed by three boolean values
    // (presumably feature flags - confirm against the Record constructor).
    var labeledRecords = new List<Record>();
    labeledRecords.Add(new Record("Conservative", false, false, true));
    labeledRecords.Add(new Record("Conservative", true, true, false));
    labeledRecords.Add(new Record("Conservative", true, false, false));
    labeledRecords.Add(new Record("Socialist", false, true, false));
    labeledRecords.Add(new Record("Socialist", true, true, true));
    labeledRecords.Add(new Record("Libertarian", true, true, true));

    // The record to classify; its "Unknown" label is overwritten below.
    var candidate = new Record("Unknown", true, true, true);

    var classifier = new NaiveBayesClassifier(labeledRecords);
    var predictedLabel = classifier.GetClassification(candidate);
    candidate.classification = predictedLabel;
}
/// <summary>
/// Entry point: reads the CSV data, splits it into training/test pairs, fits a
/// fresh <c>NaiveBayesClassifier</c> on each training part, scores it on the
/// matching test part, and prints the arithmetic mean of the collected scores.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    var records = DataReader.ReadCsv();
    var splits = TrainTest.Split(records);

    // One score per training/test pair, in iteration order.
    var scores = new List<double>();

    // The splits are materialized into a list before any model is fitted.
    foreach (var split in splits.ToList())
    {
        var classifier = new NaiveBayesClassifier();
        classifier.Fit(split.training);

        // Solve's return value is recorded as this pair's accuracy.
        var score = classifier.Solve(split.test);
        scores.Add(score);
    }

    // Blank separator line before the summary.
    Console.WriteLine();

    double averageAccuracy = scores.Sum() / scores.Count;
    Console.WriteLine($"Average accuracy: {averageAccuracy}");
}
/// <summary>
/// Entry point for the SMS spam experiment: loads the training and test messages,
/// builds a vocabulary over both, converts each split to bag-of-words features,
/// fits a <c>NaiveBayesClassifier</c> with a chosen class prior, predicts the test
/// split, prints the metrics and writes the test messages with their predictions
/// to "results.tsv".
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    // Reading data from .txt files
    List<_DataSetItem> raw_data_train = ReadInData(Properties.Resources.SMSSpamTrain);
    List<_DataSetItem> raw_data_test = ReadInData(Properties.Resources.SMSSpamTest);

    // Train and test combined; used for the vocabulary and the dataset prior.
    List<_DataSetItem> full_dataset = new List<_DataSetItem>(raw_data_test.Count + raw_data_train.Count);
    full_dataset.AddRange(raw_data_train);
    full_dataset.AddRange(raw_data_test);

    // Extracting the vocabulary of the dataset. Only words that occur 3 or more times
    // are considered part of the vocabulary
    HashSet<string> vocabulary = ExtractVocabulary(full_dataset, 3);

    // Transforming the raw SMS data into a bag of words from the previously extracted vocabulary
    DataFeaturesContainer X_train = BagOfWordsFromVocabularyAndSMS(raw_data_train, vocabulary);
    DataFeaturesContainer X_test = BagOfWordsFromVocabularyAndSMS(raw_data_test, vocabulary);

    // Assigning the target values: "spam" is encoded as 0, every other target as 1
    List<int> Y_train = raw_data_train.Select(x => x.target == "spam" ? 0 : 1).ToList();
    List<int> Y_test = raw_data_test.Select(x => x.target == "spam" ? 0 : 1).ToList();

    // Defining the prior. Keys are the class labels used above (0 = spam, 1 = not spam).
    // Three candidates are built; only the one passed to Fit below is actually used.

    // Unbiased prior, carries no information, every class is as likely to occur
    Dictionary<int, double> unbiased_prior = new Dictionary<int, double>() { { 1, 0.5 }, { 0, 0.5 } };

    // Researched prior found in the Wikipedia article
    // @"https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering"
    Dictionary<int, double> wiki_prior = new Dictionary<int, double>() { { 1, 0.2 }, { 0, 0.8 } };

    // Calculating the prior from our dataset, which includes finding the frequency of
    // actual spam messages vs ham messages
    List<string> targets = full_dataset.Select(x => x.target).ToList();
    double spam_prior = CalculateDatasetPrior(targets);
    Dictionary<int, double> dataset_calculated_prior = new Dictionary<int, double>() { { 1, 1.0 - spam_prior }, { 0, spam_prior } };

    // Training the model and doing the predictions
    NaiveBayesClassifier model = new NaiveBayesClassifier();
    model.Fit(X_train, Y_train, unbiased_prior);
    List<int> predictions = model.Predict(X_test.Data);

    // Measuring model performance
    // NOTE(review): the third argument (0) is presumably the label treated as the
    // positive class, i.e. spam - confirm against ModelMetrics.
    ModelMetrics modelMetrics = new ModelMetrics(Y_test, predictions, 0);
    modelMetrics.PrintMeasures();

    // Saving the results. The predictions were produced from the test split, so the
    // messages written next to them must be the test messages, not the training ones.
    List<string> test_sms = raw_data_test.Select(x => x.sms.Trim()).ToList();
    WriteResults("results.tsv", test_sms, predictions);
}