Example #1
0
        /// <summary>
        /// Feeds two bags of words that share exactly one token ("b") to
        /// <c>CosineSimilarityCalculator.CalculateSimilarity</c> and checks the result is about 0.294.
        /// </summary>
        public void CalculateSimilarityTest()
        {
            var sut = new CosineSimilarityCalculator();

            // Three stub tokens that differ only in the Text they report.
            var tokenA = MockRepository.GenerateStub<IToken>();
            tokenA.Stub(t => t.Text).Return("a");

            var tokenB = MockRepository.GenerateStub<IToken>();
            tokenB.Stub(t => t.Text).Return("b");

            var tokenC = MockRepository.GenerateStub<IToken>();
            tokenC.Stub(t => t.Text).Return("c");

            // First article: tokens "a" and "b".
            var firstBag = new Dictionary<IToken, TfIdf>();
            firstBag.Add(tokenA, new TfIdf(0.3, tokenA));
            firstBag.Add(tokenB, new TfIdf(0.1, tokenB));

            // Second article: tokens "b" and "c"; only "b" overlaps with the first.
            var secondBag = new Dictionary<IToken, TfIdf>();
            secondBag.Add(tokenB, new TfIdf(0.5, tokenB));
            secondBag.Add(tokenC, new TfIdf(0.2, tokenC));

            double actual = sut.CalculateSimilarity(
                new ProcessedArticle(firstBag),
                new ProcessedArticle(secondBag));

            // Presumably the first TfIdf argument is the weight; if so the cosine is
            // (0.1 * 0.5) / (sqrt(0.3^2 + 0.1^2) * sqrt(0.5^2 + 0.2^2)) ~= 0.2936.
            Assert.IsTrue(actual.IsAbout(0.294, 0.001));
        }
Example #2
0
        /// <summary>
        /// Calls <c>CalculateSimilarity</c> with an empty first article and <c>null</c> as the second.
        /// NOTE(review): the method name suggests an exception is expected, but no assertion or
        /// expected-exception marker is visible in this method body - confirm against the test attributes.
        /// </summary>
        public void CalculateSimilariteArticle2NullExc()
        {
            var sut = new CosineSimilarityCalculator();

            var emptyBag     = new Dictionary<IToken, TfIdf>();
            var firstArticle = new ProcessedArticle(emptyBag);

            sut.CalculateSimilarity(firstArticle, null);
        }
        /// <summary>
        /// Runs <c>InitializeComponent</c>, performs one cosine-similarity call on two fixed sample
        /// strings, calls <c>ReadCredentials</c> and sets the default thread culture to "en-US".
        /// </summary>
        public GitMonitorHome()
        {
            InitializeComponent();

            // NOTE(review): nothing is done with the outcome of this call - it looks like a
            // leftover smoke check; confirm it has no needed side effect before removing it.
            var similarityProbe = new CosineSimilarityCalculator();
            similarityProbe.CalculateCosineSimilarity("aa bb cc", "aa bb dd");

            ReadCredentials();

            // Applied after the credentials are read, exactly as before.
            var enUs = new CultureInfo("en-US");
            CultureInfo.DefaultThreadCurrentCulture = enUs;
        }
Example #4
0
        /// <summary>
        /// Console entry point: wires up an <c>ArticleComparer</c> and prints the similarity score of
        /// five hard-coded URL pairs, grouped under three headings, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        private static void Main(string[] args)
        {
            // Article retrieval: site-specific cleaners wrapped by HtmlCleaner, plus the loader.
            var siteCleaners = new ISiteHtmlCleaner[] { new DailyHtmlCleaner(), new MirrorHtmlCleaner() };
            var provider     = new ArticleProvider(new HtmlCleaner(siteCleaners), new HtmlLoader());

            var similarityCalculator = new CosineSimilarityCalculator();

            // Article processing: TF-IDF calculator fed by the loaded frequencies, plus a tokenizer.
            IDocumentFrequencyProvider documentFrequencies = LoadFrequencies();
            var processor = new ArticleProcessor(new TfIdfCalculator(documentFrequencies), new Tokenizer());

            var comparer = new TextProcessing.ArticleComparer(provider, similarityCalculator, processor);

            // Each score is held in a double local so the same Console.WriteLine overload is used.
            Console.WriteLine("Similar articles:");
            double spyChiefsScore = comparer.Compare(
                @"http://www.dailymail.co.uk/news/article-2489957/Britains-spy-chiefs-grilled-MPs-television-time.html",
                @"http://www.mirror.co.uk/news/uk-news/mi6-mi5-gchq-bosses-questioned-2685310");
            Console.WriteLine(spyChiefsScore);

            double danceStoryScore = comparer.Compare(
                @"http://www.dailymail.co.uk/news/article-2489640/80-parents-caught-children-copying-p**n-style-dances-offensive-lyrics.html",
                @"http://www.mirror.co.uk/news/uk-news/miley-cyrus-twerking-kids-copying-2685363");
            Console.WriteLine(danceStoryScore);

            Console.WriteLine("Same article:");
            double identicalScore = comparer.Compare(
                @"http://www.dailymail.co.uk/news/article-2490296/You-STILL-likely-lose-job-recession-25s-shop-workers-risk.html",
                @"http://www.dailymail.co.uk/news/article-2490296/You-STILL-likely-lose-job-recession-25s-shop-workers-risk.html");
            Console.WriteLine(identicalScore);

            Console.WriteLine("Different articles:");
            double unrelatedScore1 = comparer.Compare(
                @"http://www.dailymail.co.uk/femail/article-2489984/Needy-people-likely-cheat.html",
                @"http://www.dailymail.co.uk/news/article-2490531/Worlds-oldest-paperboy-deliver-round-71-years-route.html");
            Console.WriteLine(unrelatedScore1);

            double unrelatedScore2 = comparer.Compare(
                @"http://www.dailymail.co.uk/news/article-2490412/Wikileaks-journalist-spent-4-months-Edward-Snowden-leaves-Russia.html",
                @"http://www.dailymail.co.uk/news/article-2489994/Twitter-share-prices-soar-firms-day-trading.html");
            Console.WriteLine(unrelatedScore2);

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }
        /// <summary>
        /// Computes the VSM (vector space model) result for one query: weights the query terms by
        /// TF-IDF, scores every entry of <c>TfIdfDictionary</c> against the query with
        /// <c>CosineSimilarityCalculator</c>, and passes the scores to
        /// <c>WriteDocumentVectorToFileOrderedDescending</c>.
        /// </summary>
        /// <param name="outputFolderPath">Prefix that <c>VsmFileName</c> is appended to, forming the output path.</param>
        /// <param name="bugName">Name used in the start/completion status messages.</param>
        /// <param name="queryText">Query terms; every element is added to the query dictionary. May be empty.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="queryText"/> is <c>null</c>.</exception>
        public static void ComputeVsm(string outputFolderPath, string bugName, List <string> queryText)
        {
            // Fail with a descriptive exception instead of a NullReferenceException further down.
            if (queryText == null)
            {
                throw new System.ArgumentNullException("queryText");
            }

            Utility.Status("Creating VSM: " + bugName);

            // Build the TF-IDF dictionary for the query text.
            MyDoubleDictionary queryTfIdfDictionary = new MyDoubleDictionary();

            queryText.ForEach(queryTfIdfDictionary.Add);

            // Maximum frequency. Max() throws InvalidOperationException on an empty sequence, so it is
            // only evaluated when the query produced at least one term; with no terms the loop below
            // never runs and the placeholder value is never read.
            double maxFrequency = queryTfIdfDictionary.Any()
                ? queryTfIdfDictionary.Max(x => x.Value)
                : 0.0;

            // Compute TF-IDF: normalise each frequency by the maximum and scale by the term's IDF.
            // Terms missing from IdfDictionary get weight 0. ToList() snapshots the entries so the
            // dictionary can be updated while looping.
            foreach (var queryWordWithTf in queryTfIdfDictionary.ToList())
            {
                queryTfIdfDictionary[queryWordWithTf.Key] = IdfDictionary.ContainsKey(queryWordWithTf.Key)
                    ? (queryWordWithTf.Value / maxFrequency) * IdfDictionary[queryWordWithTf.Key]
                    : 0;
            }

            // Similarity dictionary: one score per TfIdfDictionary key.
            MyDoubleDictionary         similarityDictionary       = new MyDoubleDictionary();
            CosineSimilarityCalculator cosineSimilarityCalculator = new CosineSimilarityCalculator(queryTfIdfDictionary);

            // Score each code file's TF-IDF vector against the query.
            // NOTE(review): for an empty query the score is whatever GetSimilarity returns for an
            // empty query vector - confirm that is acceptable for downstream consumers.
            foreach (var codeFileWithTfIdfDictionary in TfIdfDictionary)
            {
                double cosineSimilarityWithUseCase = cosineSimilarityCalculator.GetSimilarity(codeFileWithTfIdfDictionary.Value);
                similarityDictionary.Add(codeFileWithTfIdfDictionary.Key, cosineSimilarityWithUseCase);
            }

            // Write the document vector in descending order to the VSM results file
            // (e.g. Project\001\Results\Vsm.txt).
            WriteDocumentVectorToFileOrderedDescending(outputFolderPath + VsmFileName, similarityDictionary);

            Utility.Status("Completed VSM: " + bugName);
        }