/// <summary>
/// Classifies a single news item by running its header against every main category.
/// </summary>
/// <param name="articleId">Id of the news item to classify.</param>
/// <returns>
/// The name of the first main category for which the classifier reports a match. When no category
/// matches, the name of the category that produced the largest value in the classifier's
/// <c>ref</c> output (presumably the match probability). An empty string when the news item does
/// not exist or no category produced a value above zero.
/// </returns>
public string ClassifyArticle(int articleId)
{
    NewsItem article = this.Data.NewsItems.FirstOrDefault(s => s.Id == articleId);
    if (article == null)
    {
        // FirstOrDefault yields null for an unknown id; there is nothing to classify, so report
        // "no category" instead of failing with a NullReferenceException on article.Header.
        return string.Empty;
    }

    BayesianClassifier classifier = new BayesianClassifier(this, new DefaultTokenizer(), new CustomizableStopWordProvider());
    List<Category> mainCategories = this.Data.Categories.All().DistinctBy(s => s.BaseCategoryId).ToList();

    string bestCategory = string.Empty;
    decimal? probResult = 0;
    decimal? maxProbResult = 0;

    foreach (Category candidate in mainCategories)
    {
        bool isMatch = classifier.IsMatch(candidate.Name, article.Header, ref probResult, candidate.Id);
        if (isMatch)
        {
            // The first definite match wins; later categories are not evaluated.
            return candidate.Name;
        }

        // A null probResult compares as false here, so it never replaces the current best.
        if (probResult > maxProbResult)
        {
            maxProbResult = probResult;
            bestCategory = candidate.Name;
        }
    }

    return bestCategory;
}
/// <summary>
/// Classifies every <c>NewsItemViewModel</c> contained in the given data source result, writing
/// the chosen category name and its score into the view model's
/// <c>ClassificationCategory</c> and <c>ClassificationProbability</c> properties.
/// </summary>
/// <param name="news">
/// Result set whose <c>Data</c> sequence holds the view models to classify. A null result, a null
/// <c>Data</c> sequence, and rows that are not <c>NewsItemViewModel</c> instances are skipped.
/// </param>
public void ClassifyDataSourceResult(DataSourceResult news)
{
    if (news == null || news.Data == null)
    {
        return;
    }

    BayesianClassifier classifier = new BayesianClassifier(this, new DefaultTokenizer(), new CustomizableStopWordProvider());
    List<Category> mainCategories = this.Data.Categories.All().DistinctBy(s => s.BaseCategoryId).ToList();

    // foreach (rather than a hand-driven enumerator) guarantees the enumerator is disposed.
    foreach (object row in news.Data)
    {
        NewsItemViewModel article = row as NewsItemViewModel;
        if (article == null)
        {
            // Not a news view model (or a null row): nothing to classify.
            continue;
        }

        decimal? categoryProbability = 0;
        decimal? maxProbability = 0;
        string maxProbabilityCategory = "";

        foreach (Category candidate in mainCategories)
        {
            bool isMatch = classifier.IsMatch(candidate.Name, article.Header, ref categoryProbability, candidate.Id);
            if (isMatch)
            {
                // The first definite match wins; later categories are not evaluated.
                article.ClassificationCategory = candidate.Name;
                article.ClassificationProbability = categoryProbability.Value;
                break;
            }

            // A null categoryProbability compares as false, so maxProbability is never null.
            if (categoryProbability > maxProbability)
            {
                maxProbability = categoryProbability;
                maxProbabilityCategory = candidate.Name;
            }
        }

        // No definite match (and no category set beforehand): fall back to the best-scoring one.
        if (string.IsNullOrEmpty(article.ClassificationCategory))
        {
            article.ClassificationCategory = maxProbabilityCategory;
            article.ClassificationProbability = maxProbability.Value;
        }
    }
}
/// <summary>
/// Trains the Bayesian classifier with the headers of all news items that have not yet been used
/// for training and are not reserved for testing.
/// </summary>
/// <remarks>
/// For each news item the base category of its first category is taught as a match, and every
/// other base category found among the main categories is taught as a non-match. Each
/// successfully trained item is flagged as used and saved immediately, so progress is kept even
/// if a later item fails. Failures are logged to "errors.txt" and do not stop the run.
/// </remarks>
/// <returns>The number of news items that were successfully used for training.</returns>
public int TrainModel()
{
    BayesianClassifier classifier = new BayesianClassifier(this, new DefaultTokenizer(), new CustomizableStopWordProvider("DefaultStopWords.txt"));
    int countTrainedNews = 0;
    List<Category> mainCategories = this.Data.Categories.All().DistinctBy(s => s.BaseCategoryId).ToList();

    // Only news items that have not been used for training yet and are not reserved for testing.
    var newsItems = this.Data.NewsItems.All(new string[] { "Categories" })
        .Where(s => s.UsedForClassication != true && s.IsForTest != true)
        .ToList();

    foreach (var article in newsItems)
    {
        var category = article.Categories.FirstOrDefault();
        if (category == null || category.BaseCategory == null)
        {
            // Without a base category there is no label to train against.
            continue;
        }

        // Train on the header only; using the full clean content was found to be too slow.
        string text = article.Header;

        try
        {
            // Teach the article's base category as a match for this text.
            classifier.TeachMatch(category.BaseCategory.Name, text, category.BaseCategoryId);

            foreach (Category other in mainCategories)
            {
                if (other.BaseCategory == null)
                {
                    continue;
                }

                // Compare base categories, not the categories themselves: the article's category
                // may be a different row than the representative of its base category, and the
                // base category that was just taught as a match must not also be taught as a
                // non-match for the same text.
                if (other.BaseCategory.Id != category.BaseCategory.Id)
                {
                    classifier.TeachNonMatch(other.BaseCategory.Name, text, other.BaseCategory.Id);
                }
            }

            countTrainedNews++;
            article.UsedForClassication = true;
            this.Data.NewsItems.Update(article);
            this.Data.SaveChanges();
        }
        catch (Exception ex)
        {
            // Best effort: log the failure and continue with the next news item.
            BaseHelper.WriteInFile("errors.txt", ex.Message);
        }
    }

    return countTrainedNews;
}