/// <summary>
/// Console entry point: loads the data sets, compares a fixed sample sentence with the
/// content of the first target record, and prints the value returned by the analyser
/// (labelled as a Hamming distance in the output) together with the elapsed milliseconds.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    var targetDataSet = GetTargetDataset();

    // Regenerating the result file is currently switched off:
    // BuildResultDataset(targetDataSet);

    // NOTE(review): resultDataSet is only consumed by a disabled loop that compared the
    // sample text against every record's TextHashVector. The load is kept because
    // GetResultDataset() may have side effects that are not visible from here.
    var resultDataSet = GetResultDataset();

    ITextAnalyser analyser = new SimHashAnalyser();

    // Timing covers the comparison and the console output below.
    var sw = Stopwatch.StartNew();

    var testText = "您好呀,我是叶敏华";

    // First() throws if the target data set is empty.
    var firstTarget = targetDataSet.First();
    var tagItem = firstTarget.Content;
    Console.WriteLine($"目标内容是:{tagItem}");

    var result = analyser.GetSimilarityValue(testText, tagItem);
    Console.WriteLine($"海明距离:{result}");

    sw.Stop();
    Console.WriteLine($"用时:{sw.ElapsedMilliseconds} ms");
}
/// <summary>
/// Writes one line per record to "result.txt" in the form "&lt;text hash&gt;,&lt;QID&gt;",
/// where the hash is computed from the record's content after it has been passed through
/// <c>ReplaceHtmlTag</c>. Prints a confirmation message to the console when finished.
/// </summary>
/// <param name="spamWordses">Records to hash and export; must not be null.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="spamWordses"/> is null.
/// </exception>
static void BuildResultDataset(IEnumerable<SpamWords> spamWordses)
{
    // Validate before opening the file so a null input does not truncate an existing result.txt.
    if (spamWordses == null)
    {
        throw new ArgumentNullException(nameof(spamWordses));
    }

    // The using block flushes and closes the writer on exit; no explicit Close() is needed.
    using (var sw = new StreamWriter("result.txt"))
    {
        foreach (var r in spamWordses)
        {
            var text = ReplaceHtmlTag(r.Content);

            // NOTE(review): a new analyser is created per record, as in the original code.
            // If SimHashAnalyser is stateless this could be hoisted out of the loop - confirm.
            ITextAnalyser analyser = new SimHashAnalyser();
            var textHash = analyser.GetTextHashVector(text);

            sw.WriteLine($"{textHash.ToString()},{r.QID}");
        }
    }

    Console.WriteLine("生成样例数据成功......");
}
/// <summary>
/// Returns the likeness value that a <c>SimHashAnalyser</c> reports for the two strings,
/// multiplied by 100 (presumably to express it as a percentage - confirm against
/// <c>GetLikenessValue</c>).
/// </summary>
/// <param name="str1">First string to compare.</param>
/// <param name="str2">Second string to compare.</param>
/// <returns>The analyser's likeness value scaled by 100.</returns>
private static float GetSimHash(string str1, string str2)
{
    IAnalyser simHash = new SimHashAnalyser();
    var likeness = simHash.GetLikenessValue(str1, str2);
    return likeness * 100;
}
// Pairwise combinations of strings.
// Requires a dedicated result type (UrlCombination).
/// <summary>
/// Builds one <c>UrlCombination</c> for every row produced by
/// <c>new Combination(list.Count, 2).GetRows()</c>, storing the two selected strings and
/// their likeness value multiplied by 100.
/// </summary>
/// <param name="list">Strings to combine two at a time.</param>
/// <returns>The combinations in the order the rows are enumerated.</returns>
private static List<UrlCombination> GetCombinatorics(List<string> list)
{
    IAnalyser analyser = new SimHashAnalyser();
    List<UrlCombination> pairs = new List<UrlCombination>();

    // Per the original note, each row stores the "choose n out of m" selection and the result count.
    var pairSelector = new Combination(list.Count, 2);
    foreach (var row in pairSelector.GetRows())
    {
        // Combination.Permute(row, list) returns one combination of the input strings.
        List<string> picked = Combination.Permute(row, list);

        pairs.Add(new UrlCombination
        {
            Url1 = picked[0],
            Url2 = picked[1],
            // SimHash computation for the pair.
            SimHash = analyser.GetLikenessValue(picked[0], picked[1]) * 100
        });
    }

    return pairs;
}