/// <summary>
/// Builds a training repository from the leading portion of the given
/// repository's articles, sized by the requested training fraction.
/// </summary>
/// <param name="wholeRepository">Repository holding every available article.</param>
/// <param name="trainingSetSize">Fraction (0..1) of the articles to take from the front.</param>
/// <returns>A new repository whose article list is the leading slice.</returns>
public ArticleRepo SelectTrainigSet(ArticleRepo wholeRepository, double trainingSetSize) {
    // Cast truncates toward zero, matching the original split arithmetic.
    int takeCount = (int)(wholeRepository.articles.Count * trainingSetSize);
    ArticleRepo trainingRepository = new ArticleRepo();
    trainingRepository.articles = wholeRepository.articles.GetRange(0, takeCount);
    return trainingRepository;
}
/// <summary>
/// Builds a testing repository from the trailing portion of the given
/// repository's articles, sized by the requested testing fraction.
/// Complements <see cref="SelectTrainigSet"/>, which takes the leading slice.
/// </summary>
/// <param name="wholeRepository">Repository holding every available article.</param>
/// <param name="testingSetSize">Fraction (0..1) of the articles to take from the end.</param>
/// <returns>A new repository whose article list is the trailing slice.</returns>
public ArticleRepo SelectTestingSet(ArticleRepo wholeRepository, double testingSetSize) {
    // Cast truncates toward zero, matching the original split arithmetic.
    int takeCount = (int)(wholeRepository.articles.Count * testingSetSize);
    int firstIndex = wholeRepository.articles.Count - takeCount;
    ArticleRepo testingRepository = new ArticleRepo();
    testingRepository.articles = wholeRepository.articles.GetRange(firstIndex, takeCount);
    return testingRepository;
}
/// <summary>
/// Deserializes an <see cref="ArticleRepo"/> from the binary file at
/// <paramref name="path"/>. On any failure the error's stack trace is written
/// to the console and a fresh empty repository is returned.
/// </summary>
/// <param name="path">Path to the serialized repository file.</param>
/// <returns>The deserialized repository, or an empty one on failure.</returns>
public ArticleRepo Deserialize(string path) {
    ArticleRepo articleRepo = new ArticleRepo();
    // SECURITY NOTE: BinaryFormatter is inherently insecure (arbitrary code
    // execution on untrusted input) and is removed in .NET 9. Kept only
    // because existing cache files were written with it; never feed it
    // externally supplied data.
    BinaryFormatter bf = new BinaryFormatter();
    try {
        // Open the stream inside the try so a missing or locked file is
        // handled the same way as a corrupt payload. (The original created
        // the FileStream before the try — letting the constructor's
        // exception escape unhandled — and also disposed the stream twice,
        // once via using and again in a finally block.)
        using (FileStream fsin = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.None)) {
            articleRepo = (ArticleRepo)bf.Deserialize(fsin);
        }
    } catch (Exception e) {
        Console.WriteLine(e.StackTrace);
    }
    return articleRepo;
}
/// <summary>
/// k-nearest-neighbours classification: for each article in the testing set,
/// finds the <paramref name="neighboursNumber"/> training articles with the
/// smallest distance under <paramref name="metric"/>, predicts the majority
/// label among them, and counts predictions that match the article's label.
/// </summary>
/// <param name="trainingSet">Labelled articles used as neighbours.</param>
/// <param name="testedSet">Articles to classify.</param>
/// <param name="neighboursNumber">k — number of nearest neighbours to vote.</param>
/// <param name="metric">Distance metric over the characteristic vectors.</param>
/// <returns>Number of correctly classified testing articles.</returns>
public int Classify(ArticleRepo trainingSet, ArticleRepo testedSet, int neighboursNumber, IMetric metric) {
    int correctCount = 0;
    foreach (Article testedArticle in testedSet.articles) {
        // Distance from this tested article to every training article.
        // (The original keyed these by a freshly allocated Article[] inside a
        // Dictionary — reference-equality keys that were never looked up — and
        // relied on the dictionary's unspecified enumeration order. A plain
        // list keeps insertion order deterministically.)
        List<KeyValuePair<Article, double>> distances = new List<KeyValuePair<Article, double>>(trainingSet.articles.Count);
        foreach (Article trainingArticle in trainingSet.articles) {
            double distance = metric.CalculateDistance(trainingArticle.AllCharacteristicValues, testedArticle.AllCharacteristicValues);
            distances.Add(new KeyValuePair<Article, double>(trainingArticle, distance));
        }
        // k nearest neighbours; LINQ OrderBy is stable, so distance ties keep
        // training-set order, matching the original's practical behavior.
        List<string> neighbourLabels = distances
            .OrderBy(x => x.Value)
            .Take(neighboursNumber)
            .Select(x => x.Key.Label)
            .ToList();
        // Majority vote; GroupBy preserves first-occurrence order and
        // OrderByDescending is stable, so vote ties resolve to the label seen
        // first among the neighbours — same tie-break as the original.
        string predictedLabel = neighbourLabels.GroupBy(x => x).OrderByDescending(g => g.Count()).First().Key;
        if (predictedLabel == testedArticle.Label) {
            correctCount++;
        }
    }
    return correctCount;
}
/// <summary>
/// Runs the full classification pipeline for one configuration: loads (or
/// rebuilds and caches) the preprocessed article repository, splits it into
/// training and testing sets, extracts the enabled characteristics per label
/// tag, classifies the testing articles with kNN, and returns the fraction of
/// correctly classified articles.
/// </summary>
/// <param name="selectedTrainingSet">Key into this.trainingSet giving the training split ratio.</param>
/// <param name="neighbours">Number of nearest neighbours (k) passed to the classifier.</param>
/// <param name="selectedMetric">Key into the metrics dictionary selecting the distance metric.</param>
/// <param name="label">Label family to classify by: "PLACES" or "TOPICS".</param>
/// <param name="characteristics">Feature-name -> enabled switches controlling which characteristics are extracted.</param>
/// <returns>correctArticles divided by the testing-set size of the LAST processed tag
/// (NOTE(review): suspected latent bug — see comment in the loop; verify the intended denominator).</returns>
public double MainProcess(string selectedTrainingSet, int neighbours, string selectedMetric, string label, Dictionary <string, bool> characteristics) {
    // Pick the label vocabulary matching the requested label family.
    // An unrecognised label leaves labelType empty, so the tag loop below
    // never runs and the final division is 0/0 — presumably callers only pass
    // the two known values; TODO confirm.
    List <string> labelType = new List <string>();
    if (label == "PLACES") {
        labelType = places;
    } else if (label == "TOPICS") {
        labelType = topics;
    }
    // Initial processing (SGML-to-XML conversion, cleanup, stemming, caching).
    double trainingSetSize = this.trainingSet[selectedTrainingSet];
    double testingSetSize = 1.0 - trainingSetSize;
    IMetric metric = metrics[selectedMetric];
    ArticleRepo articleRepo = new ArticleRepo();
    XmlHandler XmlHandler = new XmlHandler();
    FileHandler fileHandler = new FileHandler();
    // Rebuilding the corpus is expensive, so the preprocessed repository is
    // cached on disk; re-use it when the label family has not changed since
    // the last run.
    string lastLabel = fileHandler.ReadLabelFromFile();
    if (lastLabel != label) {
        XmlDocument xmlDoc = XmlHandler.GetMergedXmlDocuments();
        XmlNodeList xmlNodeList = XmlHandler.GetAllCorrectNodes(xmlDoc, label);
        articleRepo = new ArticleRepo(xmlNodeList, label);
        articleRepo.SelectValidArticles(labelType);
        articleRepo.CleanUpTextAndRemoveStopwords();
        articleRepo.PerformStemmingAndListWords();
        fileHandler.Serialize(articleRepo, allSerializedArticlesPath);
        fileHandler.WriteLabelToFile(label);
    } else {
        articleRepo = fileHandler.Deserialize(allSerializedArticlesPath);
    }
    // Reset every article's feature vector before extraction starts.
    foreach (Article article in articleRepo.articles) {
        article.AllCharacteristicValues = new List <double>();
    }
    // Mark training set: leading slice of the repository, then filtered to a
    // valid number of articles per label.
    ArticleRepo trainingSet = articleRepo.SelectTrainigSet(articleRepo, trainingSetSize);
    trainingSet.articles = articleRepo.SelectValidNumberOfArticles(labelType, trainingSet.articles);
    // Prepare extracts: per tag, compute the enabled characteristics and
    // classify that tag's testing articles.
    Extractor extractor = new Extractor();
    int correctArticles = 0;
    int testingSetNumberOfArticles = 0;
    foreach (string tag in labelType) {
        ArticleRepo testingSet = articleRepo.SelectTestingSet(articleRepo, testingSetSize);
        // NOTE(review): overwritten on every iteration, so only the LAST
        // tag's (pre-filter) testing count survives to the final division —
        // looks like the total was intended; confirm against requirements.
        testingSetNumberOfArticles = testingSet.articles.Count;
        testingSet.articles = testingSet.articles.Where(i => i.Label == tag).ToList();
        // Characteristics are computed over training + testing articles together.
        List <Article> processingList = trainingSet.articles.Concat(testingSet.articles).ToList();
        List <Article> trainingArticlesWithLabel = trainingSet.articles.Where(i => i.Label == tag).ToList();
        string calcKeyword = extractor.GetKeyword(trainingArticlesWithLabel);
        // Each enabled switch runs one extractor pass over the processing list.
        if (characteristics["numberOfWords"]) { processingList = extractor.CountAllWords(processingList); }
        if (characteristics["wordsWithLess4Chars"]) { processingList = extractor.CountWordsWith4CharsOrLess(processingList); }
        if (characteristics["wordsWithMore4Chars"]) { processingList = extractor.CountWordsWithMoreThan4Chars(processingList); }
        if (characteristics["numberOfKeywords"]) { processingList = extractor.CountKeywords(calcKeyword, processingList); }
        if (characteristics["hasKeyword"]) { processingList = extractor.CheckExistingKeywords(processingList); }
        if (characteristics["keywordPosition"]) { processingList = extractor.CheckKeywordPosition(calcKeyword, processingList); }
        if (characteristics["keywordFrequency"]) { processingList = extractor.CheckKeywordFrequency(calcKeyword, processingList); }
        correctArticles += extractor.Classify(trainingSet, testingSet, neighbours, metric);
        // Clear the feature vectors so the next tag starts from scratch.
        foreach (Article article in processingList) {
            article.AllCharacteristicValues.Clear();
        }
    }
    return((double)correctArticles / testingSetNumberOfArticles);
}