/// <summary>
/// Checks that a set of short, similar strings each produce a distinct simhash value.
/// </summary>
public void test_short()
{
    List<string> inputs = new List<string>() { "aa", "aaa", "aaaa", "aaaab", "aaaaabb", "aaaaabbb" };
    List<Simhash> hashes = new List<Simhash>();
    foreach (string input in inputs)
    {
        var hash = new Simhash();
        hash.GenerateSimhash(input);
        hashes.Add(hash);
    }

    // Compare the computed fingerprints pairwise by position. Asserting that two
    // Simhash *objects* differ, guarded by an object `!=` check, cannot fail when
    // both checks fall back to reference identity, so the `value` members are
    // compared instead. Inequality is symmetric, so each unordered pair is
    // checked once.
    for (int i = 0; i < hashes.Count; i++)
    {
        for (int j = i + 1; j < hashes.Count; j++)
        {
            Assert.AreNotEqual(hashes[i].value, hashes[j].value);
        }
    }
}
/// <summary>
/// Builds a one-entry index (k = 10) from an MD5-based simhash and checks that
/// <c>get_the_keys</c> returns the eleven pinned bucket keys in order.
/// </summary>
public void test_get_keys()
{
    Dictionary<long, string> testdata = new Dictionary<long, string>();
    testdata.Add(1, "How are you? I Am fine. blar blar blar blar blar Thanks.");

    Dictionary<long, Simhash> simHashObjs = new Dictionary<long, Simhash>();
    foreach (var it in testdata)
    {
        var simHash = new Simhash(hashingType: Simhash.HashingType.MD5);
        simHash.GenerateSimhash(it.Value);
        simHashObjs.Add(it.Key, simHash);
    }

    var simHashIndex = new SimhashIndex(objs: simHashObjs, k: 10);
    var listOfKeys = simHashIndex.get_the_keys(simHashObjs[1]);

    // Expected keys, in the order the index is required to return them.
    string[] expectedKeys =
    {
        "26,0", "3,1", "7,2", "12,3", "17,4", "0,5",
        "13,6", "30,7", "1,8", "14,9", "7496,10"
    };

    // AreEqual (rather than IsTrue on a comparison) reports the actual count on failure.
    Assert.AreEqual(expectedKeys.Length, listOfKeys.Count);
    for (int i = 0; i < expectedKeys.Length; i++)
    {
        Assert.AreEqual(expectedKeys[i], listOfKeys[i]);
    }
}
/// <summary>
/// Pins the MD5-based simhash of a fixed sentence to a known value.
/// </summary>
public void test_value_by_string()
{
    Simhash hash = new Simhash(hashingType: Simhash.HashingType.MD5);
    hash.GenerateSimhash("aaa bbb test test testing.happy time = -).");

    ulong pinnedValue = 5683413558821905382;
    Assert.AreEqual(pinnedValue, hash.value);
}
/// <summary>
/// Queries an index built with k = 1 using a default-constructed simhash of
/// unrelated text and expects no near duplicates.
/// </summary>
public void test_get_near_dup_hash_jenkins_not_close()
{
    SimhashIndex nearDupIndex = setUpIndex(1);

    // Default constructor: no hashing type is passed (presumably Jenkins, per the
    // test name -- confirm against the Simhash class).
    Simhash probe = new Simhash();
    probe.GenerateSimhash("This is not even close to the text that is loaded by default");

    var matches = nearDupIndex.get_near_dups(probe);
    Assert.AreEqual(0, matches.Count);
}
/// <summary>
/// Pins the MD5-based simhash of the single-feature list { "aaa" } to a known value.
/// </summary>
public void test_value_control()
{
    List<string> singleFeature = new List<string>();
    singleFeature.Add("aaa");

    Simhash hash = new Simhash(hashingType: Simhash.HashingType.MD5);
    hash.GenerateSimhash(singleFeature);

    ulong pinnedValue = 7483809945577191432;
    Assert.AreEqual(pinnedValue, hash.value);
}
/// <summary>
/// Pins the MD5-based simhash of the feature list { "aaa", "bbb" } to a known value.
/// </summary>
public void test_value()
{
    List<string> featureList = new List<string>();
    featureList.Add("aaa");
    featureList.Add("bbb");

    Simhash hash = new Simhash(hashingType: Simhash.HashingType.MD5);
    hash.GenerateSimhash(featureList);

    ulong pinnedValue = 8637903533912358349;
    Assert.AreEqual(pinnedValue, hash.value);
}
/// <summary>
/// Queries an index built with k = 1 using the simhash of fixture entry 1,
/// tokenized the same way the index entries are, and expects exactly one match.
/// </summary>
public void test_get_near_dup_hash_jenkins_find_one()
{
    SimhashIndex nearDupIndex = setUpIndex(1);

    Simhash probe = new Simhash();
    Shingling tokenizer = new Shingling();
    // Same tokenize(text, 3) call that setUpIndex applies to every fixture entry.
    var probeFeatures = tokenizer.tokenize(testData[1], 3);
    probe.GenerateSimhash(probeFeatures);

    var matches = nearDupIndex.get_near_dups(probe);
    Assert.AreEqual(1, matches.Count);
}
/// <summary>
/// Builds a <c>SimhashIndex</c> over every entry of the <c>testData</c> fixture,
/// hashing each text from its <c>Shingling.tokenize(text, 3)</c> features with a
/// default-constructed <c>Simhash</c>.
/// </summary>
/// <param name="kValue">Value forwarded as the <c>k</c> argument of the index constructor.</param>
/// <returns>The newly constructed index.</returns>
private SimhashIndex setUpIndex(int kValue)
{
    Dictionary<long, Simhash> hashesById = new Dictionary<long, Simhash>();

    foreach (var entry in testData)
    {
        Simhash entryHash = new Simhash();
        Shingling tokenizer = new Shingling();
        entryHash.GenerateSimhash(tokenizer.tokenize(entry.Value, 3));
        hashesById.Add(entry.Key, entryHash);
    }

    SimhashIndex built = new SimhashIndex(objs: hashesById, k: kValue);
    return built;
}
/// <summary>
/// Populates the fixture: four sample texts in <c>testData</c>, their MD5-based
/// simhashes in <c>objs</c>, and an <c>index</c> built over those hashes with k = 10.
/// </summary>
public void setUp()
{
    // Indexer assignment instead of Add keeps this method re-runnable: Add throws
    // on a duplicate key, which would happen if the same fixture instance (and
    // therefore the same dictionaries) is set up more than once.
    testData[1] = "How are you? I Am fine. blar blar blar blar blar Thanks.";
    testData[2] = "How are you i am fine. blar blar blar blar blar than";
    testData[3] = "This is simhash test.";
    testData[4] = "How are you i am fine. blar blar blar blar blar thank1";

    foreach (var it in testData)
    {
        var simHash = new Simhash(hashingType: Simhash.HashingType.MD5);
        simHash.GenerateSimhash(it.Value);
        objs[it.Key] = simHash;
    }

    index = new SimhashIndex(objs: objs, k: 10);
}
/// <summary>
/// Checks <c>distance</c>: positive between two different sentences, zero between
/// a simhash and a copy constructed from it, and non-zero against the hash of "1".
/// </summary>
public void test_distance()
{
    Simhash first = new Simhash();
    first.GenerateSimhash("How are you? I AM fine. Thanks. And you?");
    Simhash second = new Simhash();
    second.GenerateSimhash("How old are you? :-) i am fine. Thanks. And you?");

    // Different texts: expect a strictly positive distance.
    int differingDistance = first.distance(second);
    Assert.IsTrue(differingDistance > 0);

    // A Simhash constructed from another Simhash: expect distance zero.
    Simhash secondCopy = new Simhash(second);
    int copyDistance = second.distance(secondCopy);
    Assert.AreEqual(0, copyDistance);

    Simhash unrelated = new Simhash();
    unrelated.GenerateSimhash("1");
    Assert.AreNotEqual(0, unrelated.distance(secondCopy));
}
/// <summary>
/// Checks that two Chinese strings differing only in punctuation and spacing have
/// distance 0, and that three near-identical English sentences stay within
/// distance 3 of each other.
/// </summary>
public void test_chinese()
{
    var sh = new Simhash();
    sh.GenerateSimhash("你好 世界! 呼噜。");
    var sh2 = new Simhash();
    sh2.GenerateSimhash("你好,世界呼噜");

    // Expected value goes first so a failure reports expected/actual correctly.
    Assert.AreEqual(0, sh.distance(sh2));

    var sh4 = new Simhash();
    sh4.GenerateSimhash("How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.");
    var sh5 = new Simhash();
    sh5.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than");
    var sh6 = new Simhash();
    sh6.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

    Assert.IsTrue(sh4.distance(sh6) < 3);
    Assert.IsTrue(sh5.distance(sh6) < 3);
}
/// <summary>
/// Exercises <c>get_near_dups</c> on the fixture <c>index</c> while entry 1 is
/// deleted twice and then added twice, checking the match count after each step:
/// 3, 2, 2, 3, 3.
/// </summary>
public void test_get_near_dup_hash()
{
    Simhash probe = new Simhash(hashingType: Simhash.HashingType.MD5);
    probe.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

    // Untouched fixture index.
    var matches = index.get_near_dups(probe);
    Assert.AreEqual(3, matches.Count);

    // First delete of entry 1 drops one match.
    Simhash firstDelete = new Simhash(hashingType: Simhash.HashingType.MD5);
    firstDelete.GenerateSimhash(testData[1]);
    index.delete(1, firstDelete);
    matches = index.get_near_dups(probe);
    Assert.AreEqual(2, matches.Count);

    // Deleting the same entry again leaves the count unchanged.
    Simhash secondDelete = new Simhash(hashingType: Simhash.HashingType.MD5);
    secondDelete.GenerateSimhash(testData[1]);
    index.delete(1, secondDelete);
    matches = index.get_near_dups(probe);
    Assert.AreEqual(2, matches.Count);

    // Adding entry 1 back restores the original count.
    Simhash firstAdd = new Simhash(hashingType: Simhash.HashingType.MD5);
    firstAdd.GenerateSimhash(testData[1]);
    index.add(1, firstAdd);
    matches = index.get_near_dups(probe);
    Assert.AreEqual(3, matches.Count);

    // Adding the same entry a second time does not produce an extra match.
    Simhash secondAdd = new Simhash(hashingType: Simhash.HashingType.MD5);
    secondAdd.GenerateSimhash(testData[1]);
    index.add(1, secondAdd);
    matches = index.get_near_dups(probe);
    Assert.AreEqual(3, matches.Count);
}