/// <summary>
/// Checks that <c>CosineSimilarityCalculator.CalculateSimilarity</c> yields roughly 0.294
/// (tolerance 0.001) for two bags of words that share exactly one token.
/// </summary>
public void CalculateSimilarityTest()
{
    // Arrange: three stub tokens whose Text values are "a", "b" and "c".
    var calculator = new CosineSimilarityCalculator();

    var tokenA = MockRepository.GenerateStub<IToken>();
    tokenA.Stub(t => t.Text).Return("a");

    var tokenB = MockRepository.GenerateStub<IToken>();
    tokenB.Stub(t => t.Text).Return("b");

    var tokenC = MockRepository.GenerateStub<IToken>();
    tokenC.Stub(t => t.Text).Return("c");

    // First bag holds tokens "a" and "b".
    var firstBag = new Dictionary<IToken, TfIdf>();
    firstBag.Add(tokenA, new TfIdf(0.3, tokenA));
    firstBag.Add(tokenB, new TfIdf(0.1, tokenB));

    // Second bag holds tokens "b" and "c"; only "b" is common to both bags.
    var secondBag = new Dictionary<IToken, TfIdf>();
    secondBag.Add(tokenB, new TfIdf(0.5, tokenB));
    secondBag.Add(tokenC, new TfIdf(0.2, tokenC));

    var firstArticle = new ProcessedArticle(firstBag);
    var secondArticle = new ProcessedArticle(secondBag);

    // Act
    double similarity = calculator.CalculateSimilarity(firstArticle, secondArticle);

    // Assert
    bool closeEnough = similarity.IsAbout(0.294, 0.001);
    Assert.IsTrue(closeEnough);
}
/// <summary>
/// Calls <c>CalculateSimilarity</c> with a valid (empty) first article and <c>null</c>
/// as the second article.
/// </summary>
/// <remarks>
/// NOTE(review): the method makes no assertion itself; the expected exception is
/// presumably declared by a test attribute that is not visible here - confirm.
/// </remarks>
public void CalculateSimilariteArticle2NullExc()
{
    var calculator = new CosineSimilarityCalculator();

    var emptyBag = new Dictionary<IToken, TfIdf>();
    var firstArticle = new ProcessedArticle(emptyBag);

    calculator.CalculateSimilarity(firstArticle, null);
}
/// <summary>
/// Creates the home view: initializes the designer components, runs a cosine-similarity
/// call on two fixed strings, reads the credentials and sets the default thread culture
/// to en-US.
/// </summary>
public GitMonitorHome()
{
    InitializeComponent();

    // NOTE(review): the result of this call is discarded; it looks like a leftover
    // smoke test - confirm it has no required side effect before removing it.
    var similarityCalculator = new CosineSimilarityCalculator();
    similarityCalculator.CalculateCosineSimilarity("aa bb cc", "aa bb dd");

    ReadCredentials();

    var defaultCulture = new CultureInfo("en-US");
    CultureInfo.DefaultThreadCurrentCulture = defaultCulture;
}
/// <summary>
/// Console entry point: wires up the article-comparison pipeline, prints the similarity
/// score for several hard-coded pairs of article URLs, then waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
private static void Main(string[] args)
{
    // Compose the pipeline.
    var siteCleaners = new ISiteHtmlCleaner[] { new DailyHtmlCleaner(), new MirrorHtmlCleaner() };
    var cleaner = new HtmlCleaner(siteCleaners);
    var loader = new HtmlLoader();
    var provider = new ArticleProvider(cleaner, loader);
    var similarityCalculator = new CosineSimilarityCalculator();
    IDocumentFrequencyProvider frequencies = LoadFrequencies();
    var tfIdf = new TfIdfCalculator(frequencies);
    var wordTokenizer = new Tokenizer();
    var processor = new ArticleProcessor(tfIdf, wordTokenizer);
    var comparer = new TextProcessing.ArticleComparer(provider, similarityCalculator, processor);

    // Compares two URLs and writes the resulting score on its own line.
    Action<string, string> printSimilarity = (firstUrl, secondUrl) =>
    {
        double score = comparer.Compare(firstUrl, secondUrl);
        Console.WriteLine(score);
    };

    Console.WriteLine("Similar articles:");
    printSimilarity(
        @"http://www.dailymail.co.uk/news/article-2489957/Britains-spy-chiefs-grilled-MPs-television-time.html",
        @"http://www.mirror.co.uk/news/uk-news/mi6-mi5-gchq-bosses-questioned-2685310");
    printSimilarity(
        @"http://www.dailymail.co.uk/news/article-2489640/80-parents-caught-children-copying-p**n-style-dances-offensive-lyrics.html",
        @"http://www.mirror.co.uk/news/uk-news/miley-cyrus-twerking-kids-copying-2685363");

    Console.WriteLine("Same article:");
    printSimilarity(
        @"http://www.dailymail.co.uk/news/article-2490296/You-STILL-likely-lose-job-recession-25s-shop-workers-risk.html",
        @"http://www.dailymail.co.uk/news/article-2490296/You-STILL-likely-lose-job-recession-25s-shop-workers-risk.html");

    Console.WriteLine("Different articles:");
    printSimilarity(
        @"http://www.dailymail.co.uk/femail/article-2489984/Needy-people-likely-cheat.html",
        @"http://www.dailymail.co.uk/news/article-2490531/Worlds-oldest-paperboy-deliver-round-71-years-route.html");
    printSimilarity(
        @"http://www.dailymail.co.uk/news/article-2490412/Wikileaks-journalist-spent-4-months-Edward-Snowden-leaves-Russia.html",
        @"http://www.dailymail.co.uk/news/article-2489994/Twitter-share-prices-soar-firms-day-trading.html");

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}
/// <summary>
/// Computes the VSM (vector space model) result: scores the query against every entry
/// of <c>TfIdfDictionary</c> using cosine similarity and writes the scores to a file.
/// </summary>
/// <param name="outputFolderPath">Path prefix to which <c>VsmFileName</c> is appended to form the output file path.</param>
/// <param name="bugName">Name used only in the start/completion status messages.</param>
/// <param name="queryText">Query text, as a list of words.</param>
public static void ComputeVsm(string outputFolderPath, string bugName, List<string> queryText)
{
    Utility.Status("Creating VSM: " + bugName);

    // Build the query dictionary by adding every query word to it.
    MyDoubleDictionary queryVector = new MyDoubleDictionary();
    foreach (string word in queryText)
    {
        queryVector.Add(word);
    }

    // Highest value in the query dictionary, used to normalize each term frequency.
    // NOTE(review): presumably throws when queryText is empty (Max over an empty
    // sequence) - confirm whether callers can pass an empty query.
    double maxFrequency = queryVector.Max(entry => entry.Value);

    // Turn each entry into a TF-IDF weight. Iterate over a snapshot because the
    // dictionary is written to inside the loop.
    foreach (var entry in queryVector.ToList())
    {
        if (IdfDictionary.ContainsKey(entry.Key))
        {
            queryVector[entry.Key] = (entry.Value / maxFrequency) * IdfDictionary[entry.Key];
        }
        else
        {
            // Words without an IDF entry get a weight of zero.
            queryVector[entry.Key] = 0;
        }
    }

    // Score the query against each code file's TF-IDF vector.
    MyDoubleDictionary scores = new MyDoubleDictionary();
    CosineSimilarityCalculator calculator = new CosineSimilarityCalculator(queryVector);
    foreach (var codeFile in TfIdfDictionary)
    {
        double score = calculator.GetSimilarity(codeFile.Value);
        scores.Add(codeFile.Key, score);
    }

    // Write the scores in descending order to the output file
    // (e.g. Project\001\Results\Vsm.txt).
    WriteDocumentVectorToFileOrderedDescending(outputFolderPath + VsmFileName, scores);

    Utility.Status("Completed VSM: " + bugName);
}