/// <summary>
        /// Builds a training repository from the leading part of <paramref name="wholeRepository"/>.
        /// </summary>
        /// <param name="wholeRepository">Repository whose <c>articles</c> list is sliced.</param>
        /// <param name="trainingSetSize">
        /// Factor applied to the article count; the product is truncated to an integer
        /// to obtain the number of elements taken.
        /// </param>
        /// <returns>
        /// A new <see cref="ArticleRepo"/> whose <c>articles</c> holds the first elements
        /// of the source list (a copy of that range, starting at index 0).
        /// </returns>
        public ArticleRepo SelectTrainigSet(ArticleRepo wholeRepository, double trainingSetSize)
        {
            // The cast truncates toward zero, so partial elements are dropped.
            int takeCount = (int)(wholeRepository.articles.Count * trainingSetSize);

            ArticleRepo result = new ArticleRepo();
            result.articles = wholeRepository.articles.GetRange(0, takeCount);
            return result;
        }
        /// <summary>
        /// Builds a testing repository from the trailing part of <paramref name="wholeRepository"/>.
        /// </summary>
        /// <param name="wholeRepository">Repository whose <c>articles</c> list is sliced.</param>
        /// <param name="testingSetSize">
        /// Factor applied to the article count; the product is truncated to an integer
        /// to obtain the number of elements taken.
        /// </param>
        /// <returns>
        /// A new <see cref="ArticleRepo"/> whose <c>articles</c> holds the last elements
        /// of the source list (a copy of that range, ending at the final element).
        /// </returns>
        public ArticleRepo SelectTestingSet(ArticleRepo wholeRepository, double testingSetSize)
        {
            // The cast truncates toward zero, so partial elements are dropped.
            int takeCount = (int)(wholeRepository.articles.Count * testingSetSize);

            // The slice is anchored at the end of the list.
            int firstIndex = wholeRepository.articles.Count - takeCount;

            ArticleRepo testingRepository = new ArticleRepo();
            testingRepository.articles = wholeRepository.articles.GetRange(firstIndex, takeCount);
            return testingRepository;
        }
// Example #3
// 0
        /// <summary>
        /// Reads an <see cref="ArticleRepo"/> from the binary-serialized file at <paramref name="path"/>.
        /// </summary>
        /// <param name="path">Path of the file to read; it is opened read-only with no sharing.</param>
        /// <returns>
        /// The deserialized repository, or a freshly constructed <see cref="ArticleRepo"/> when
        /// deserialization fails (the failure is written to the console instead of being thrown).
        /// </returns>
        /// <remarks>
        /// Errors raised while opening the file are not caught here and propagate to the caller.
        /// SECURITY: <see cref="BinaryFormatter"/> is unsafe on untrusted input and is obsolete in
        /// current .NET versions; only use this on files the application itself produced.
        /// </remarks>
        public ArticleRepo Deserialize(string path)
        {
            // Fallback result returned when deserialization fails.
            ArticleRepo articleRepo = new ArticleRepo();
            BinaryFormatter bf = new BinaryFormatter();

            // A single using statement owns the stream; no additional finally/Dispose is needed.
            using (FileStream fsin = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.None))
            {
                try
                {
                    articleRepo = (ArticleRepo)bf.Deserialize(fsin);
                }
                catch (Exception e)
                {
                    // Best-effort load: report the full exception (type, message and stack trace)
                    // and fall through to return the fallback repository.
                    Console.WriteLine(e);
                }
            }

            return articleRepo;
        }
        /// <summary>
        /// Labels every article of <paramref name="testedSet"/> by majority vote among its nearest
        /// articles in <paramref name="trainingSet"/> and counts how many predictions equal the
        /// tested article's own label.
        /// </summary>
        /// <param name="trainingSet">Repository whose articles act as labelled neighbours.</param>
        /// <param name="testedSet">Repository whose articles are classified.</param>
        /// <param name="neighboursNumber">Maximum number of nearest training articles that vote.</param>
        /// <param name="metric">Distance function applied to the articles' <c>AllCharacteristicValues</c>.</param>
        /// <returns>The number of tested articles whose predicted label equals their <c>Label</c>.</returns>
        /// <remarks>
        /// A tested article with no available neighbour (empty training set or a non-positive
        /// <paramref name="neighboursNumber"/>) is counted as not correctly classified.
        /// </remarks>
        public int Classify(ArticleRepo trainingSet, ArticleRepo testedSet, int neighboursNumber, IMetric metric)
        {
            int truePositiveCounter = 0;

            foreach (Article testedArticle in testedSet.articles)
            {
                // OrderBy is a stable sort, so training articles at equal distance keep
                // their order in the training list.
                List<string> nearestLabels = trainingSet.articles
                    .Select(trainingArticle => new
                    {
                        Label    = trainingArticle.Label,
                        Distance = metric.CalculateDistance(trainingArticle.AllCharacteristicValues, testedArticle.AllCharacteristicValues)
                    })
                    .OrderBy(candidate => candidate.Distance)
                    .Take(neighboursNumber)
                    .Select(candidate => candidate.Label)
                    .ToList();

                if (nearestLabels.Count == 0)
                {
                    // Nothing can vote, so no prediction is possible for this article.
                    continue;
                }

                // Majority vote; on a tie the label that appears first among the nearest wins,
                // because GroupBy keeps first-occurrence order and OrderByDescending is stable.
                string predictedLabel = nearestLabels
                    .GroupBy(votedLabel => votedLabel)
                    .OrderByDescending(votes => votes.Count())
                    .First()
                    .Key;

                if (predictedLabel == testedArticle.Label)
                {
                    truePositiveCounter++;
                }
            }

            return truePositiveCounter;
        }
// Example #5
// 0
        /// <summary>
        /// Runs the classification pipeline: loads or rebuilds the article repository, splits it into
        /// training and testing sets, extracts the enabled characteristics for each tag and classifies
        /// the testing articles carrying that tag.
        /// </summary>
        /// <param name="selectedTrainingSet">Key into the <c>trainingSet</c> member that yields the training-set fraction.</param>
        /// <param name="neighbours">Number of neighbours passed to the classifier.</param>
        /// <param name="selectedMetric">Key into the <c>metrics</c> member that yields the distance metric.</param>
        /// <param name="label">
        /// Label category; <c>"PLACES"</c> and <c>"TOPICS"</c> select the matching tag list,
        /// any other value leaves the tag list empty.
        /// </param>
        /// <param name="characteristics">
        /// Switches for the individual characteristics. Every key read below must be present,
        /// otherwise the dictionary indexer throws.
        /// </param>
        /// <returns>
        /// The number of correctly classified articles divided by the size of the whole testing set,
        /// or <c>0.0</c> when the testing set is empty.
        /// </returns>
        public double MainProcess(string selectedTrainingSet,
                                  int neighbours, string selectedMetric, string label, Dictionary <string, bool> characteristics)
        {
            List <string> labelType = new List <string>();

            if (label == "PLACES")
            {
                labelType = places;
            }
            else if (label == "TOPICS")
            {
                labelType = topics;
            }

            // Resolve the run configuration.
            double  trainingSetSize = this.trainingSet[selectedTrainingSet];
            double  testingSetSize  = 1.0 - trainingSetSize;
            IMetric metric          = metrics[selectedMetric];

            ArticleRepo articleRepo;
            // NOTE: the local intentionally keeps the same name as its type so that member access
            // below resolves exactly as it did before.
            XmlHandler  XmlHandler  = new XmlHandler();
            FileHandler fileHandler = new FileHandler();
            string      lastLabel   = fileHandler.ReadLabelFromFile();

            if (lastLabel != label)
            {
                // The stored label differs from the requested one: rebuild the repository from the
                // XML documents, then store both the repository and the label.
                XmlDocument xmlDoc      = XmlHandler.GetMergedXmlDocuments();
                XmlNodeList xmlNodeList = XmlHandler.GetAllCorrectNodes(xmlDoc, label);
                articleRepo = new ArticleRepo(xmlNodeList, label);
                articleRepo.SelectValidArticles(labelType);

                articleRepo.CleanUpTextAndRemoveStopwords();
                articleRepo.PerformStemmingAndListWords();

                fileHandler.Serialize(articleRepo, allSerializedArticlesPath);

                fileHandler.WriteLabelToFile(label);
            }
            else
            {
                // Same label as the stored one: load the previously serialized repository.
                articleRepo = fileHandler.Deserialize(allSerializedArticlesPath);
            }

            // Start every article with an empty characteristic vector.
            foreach (Article article in articleRepo.articles)
            {
                article.AllCharacteristicValues = new List <double>();
            }

            // Mark training set
            ArticleRepo trainingRepo = articleRepo.SelectTrainigSet(articleRepo, trainingSetSize);

            trainingRepo.articles = articleRepo.SelectValidNumberOfArticles(labelType, trainingRepo.articles);

            // The testing slice does not depend on the tag, so it is selected once and only the
            // per-tag filter is applied inside the loop.
            ArticleRepo    testingSet                 = articleRepo.SelectTestingSet(articleRepo, testingSetSize);
            List <Article> allTestingArticles         = testingSet.articles;
            int            testingSetNumberOfArticles = allTestingArticles.Count;

            // Prepare extracts
            Extractor extractor       = new Extractor();
            int       correctArticles = 0;

            foreach (string tag in labelType)
            {
                testingSet.articles = allTestingArticles.Where(i => i.Label == tag).ToList();

                List <Article> processingList = trainingRepo.articles.Concat(testingSet.articles).ToList();

                List <Article> trainingArticlesWithLabel = trainingRepo.articles.Where(i => i.Label == tag).ToList();
                string         calcKeyword = extractor.GetKeyword(trainingArticlesWithLabel);

                // The order of the extraction steps is kept fixed.
                if (characteristics["numberOfWords"])
                {
                    processingList = extractor.CountAllWords(processingList);
                }
                if (characteristics["wordsWithLess4Chars"])
                {
                    processingList = extractor.CountWordsWith4CharsOrLess(processingList);
                }
                if (characteristics["wordsWithMore4Chars"])
                {
                    processingList = extractor.CountWordsWithMoreThan4Chars(processingList);
                }
                if (characteristics["numberOfKeywords"])
                {
                    processingList = extractor.CountKeywords(calcKeyword, processingList);
                }
                if (characteristics["hasKeyword"])
                {
                    processingList = extractor.CheckExistingKeywords(processingList);
                }
                if (characteristics["keywordPosition"])
                {
                    processingList = extractor.CheckKeywordPosition(calcKeyword, processingList);
                }
                if (characteristics["keywordFrequency"])
                {
                    processingList = extractor.CheckKeywordFrequency(calcKeyword, processingList);
                }

                correctArticles += extractor.Classify(trainingRepo, testingSet, neighbours, metric);

                // Reset the characteristic vectors before the next tag is processed.
                foreach (Article article in processingList)
                {
                    article.AllCharacteristicValues.Clear();
                }
            }

            if (testingSetNumberOfArticles == 0)
            {
                // Avoid 0/0 (NaN) when there is nothing to classify.
                return 0.0;
            }

            return (double)correctArticles / testingSetNumberOfArticles;
        }