/// <summary>
/// Collects the ids of all indexed objects whose fingerprint differs from
/// <paramref name="simhash"/> by at most <c>kDistance</c> bits.
/// </summary>
/// <param name="simhash">Fingerprint to look up; its <c>fpSize</c> must equal the <c>fpSize</c> of this index.</param>
/// <returns>The set of matching object ids; empty when no stored fingerprint is close enough.</returns>
/// <exception cref="Exception">Thrown when the fingerprint sizes differ.</exception>
public HashSet<long> get_near_dups(Simhash simhash)
{
    if (simhash.fpSize != this.fpSize)
    {
        throw new Exception("The fingerprint size of the query does not match the fingerprint size of the index.");
    }

    var ans = new HashSet<long>();
    foreach (string key in get_keys(simhash))
    {
        // TryGetValue performs the existence check and the fetch in one lookup.
        if (!bucket.TryGetValue(key, out var dups))
        {
            continue;
        }

        foreach (var dup in dups)
        {
            // Each entry is split on ',' into a fingerprint and an object id.
            string[] parts = dup.Split(',');
            ulong fp = Convert.ToUInt64(parts[0]);
            long obj_id = Convert.ToInt64(parts[1]);

            int d = simhash.distance(new Simhash(fp));
            if (d <= kDistance)
            {
                ans.Add(obj_id);
            }
        }
    }

    return ans;
}
/// <summary>
/// Returns the ids of the indexed objects that are near duplicates of
/// <paramref name="simhash"/>, i.e. whose stored fingerprint is within
/// <c>kDistance</c> differing bits.
/// </summary>
/// <param name="simhash">Query fingerprint; must have the same <c>fpSize</c> as this index.</param>
/// <returns>A set of object ids (type <c>long</c>); empty when nothing matches.</returns>
/// <exception cref="Exception">Thrown when the query and the index use different fingerprint sizes.</exception>
public HashSet<long> get_near_dups(Simhash simhash)
{
    if (simhash.fpSize != this.fpSize)
    {
        throw new Exception("Query fingerprint size differs from the index fingerprint size.");
    }

    var ans = new HashSet<long>();
    foreach (string key in get_keys(simhash))
    {
        // One dictionary lookup per key; keys without a bucket are skipped.
        if (bucket.TryGetValue(key, out var candidates))
        {
            foreach (var candidate in candidates)
            {
                // Candidate entries hold the fingerprint and the object id, separated by ','.
                string[] parts = candidate.Split(',');
                ulong fp = Convert.ToUInt64(parts[0]);
                long obj_id = Convert.ToInt64(parts[1]);

                var stored = new Simhash(fp);
                if (simhash.distance(stored) <= kDistance)
                {
                    ans.Add(obj_id);
                }
            }
        }
    }

    return ans;
}
/// <summary>
/// Checks that <c>hashfuncjenkins</c> returns the expected 64-bit value for the input "aaa".
/// </summary>
public void test_gethashcode_specialhashing_to64bit()
{
    const ulong expected = 18323053351575752945;
    const string input = "aaa";
    var hasher = new Simhash();

    ulong actual = hasher.hashfuncjenkins(input);

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Exercises <c>get_near_dups</c> on the shared index while object 1 is deleted
/// and re-added; repeating the delete or the add must leave the match count unchanged.
/// </summary>
public void test_get_near_dup_hash()
{
    var probe = new Simhash(hashingType: Simhash.HashingType.MD5);
    probe.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

    // Starting state of the shared index: three matches are expected.
    var found = index.get_near_dups(probe);
    Assert.AreEqual(3, found.Count);

    // Deleting object 1 is expected to drop one match.
    var firstDelete = new Simhash(hashingType: Simhash.HashingType.MD5);
    firstDelete.GenerateSimhash(testData[1]);
    index.delete(1, firstDelete);
    found = index.get_near_dups(probe);
    Assert.AreEqual(2, found.Count);

    // Deleting the same object again must not change the count.
    var secondDelete = new Simhash(hashingType: Simhash.HashingType.MD5);
    secondDelete.GenerateSimhash(testData[1]);
    index.delete(1, secondDelete);
    found = index.get_near_dups(probe);
    Assert.AreEqual(2, found.Count);

    // Adding object 1 back is expected to restore the third match.
    var firstAdd = new Simhash(hashingType: Simhash.HashingType.MD5);
    firstAdd.GenerateSimhash(testData[1]);
    index.add(1, firstAdd);
    found = index.get_near_dups(probe);
    Assert.AreEqual(3, found.Count);

    // Adding the same object again must not produce a fourth match.
    var secondAdd = new Simhash(hashingType: Simhash.HashingType.MD5);
    secondAdd.GenerateSimhash(testData[1]);
    index.add(1, secondAdd);
    found = index.get_near_dups(probe);
    Assert.AreEqual(3, found.Count);
}
/// <summary>
/// Builds an index with k = 10 over a single MD5-hashed document and checks
/// the exact list of bucket keys produced for that document.
/// </summary>
public void test_get_keys()
{
    var texts = new Dictionary<long, string>
    {
        { 1, "How are you? I Am fine. blar blar blar blar blar Thanks." }
    };

    var hashes = new Dictionary<long, Simhash>();
    foreach (var entry in texts)
    {
        var hash = new Simhash(hashingType: Simhash.HashingType.MD5);
        hash.GenerateSimhash(entry.Value);
        hashes.Add(entry.Key, hash);
    }

    var simHashIndex = new SimhashIndex(objs: hashes, k: 10);
    var listOfKeys = simHashIndex.get_the_keys(hashes[1]);

    // Expected keys, in the order they are returned.
    string[] expectedKeys =
    {
        "26,0", "3,1", "7,2", "12,3", "17,4", "0,5",
        "13,6", "30,7", "1,8", "14,9", "7496,10"
    };

    Assert.IsTrue(listOfKeys.Count == 11);
    for (int i = 0; i < expectedKeys.Length; i++)
    {
        Assert.AreEqual(expectedKeys[i], listOfKeys[i]);
    }
}
/// <summary>
/// Builds an index with k = 10 over a single MD5-hashed document and checks
/// the exact list of keys returned by <c>GetListKeys</c> for that document.
/// </summary>
public void Get_Keys()
{
    var hasher = new SimhashLib.Simhash();
    var texts = new Dictionary<long, string>
    {
        { 1, "How are you? I Am fine. blar blar blar blar blar Thanks." }
    };

    var hashes = new Dictionary<long, SimhashResult>();
    foreach (var entry in texts)
    {
        hashes.Add(entry.Key, hasher.ComputeHashByMd5(entry.Value));
    }

    var simHashIndex = new SimhashIndex(objs: hashes, k: 10);
    var listOfKeys = simHashIndex.GetListKeys(hashes[1]);

    // Expected keys, in the order they are returned.
    string[] expectedKeys =
    {
        "26,0", "3,1", "7,2", "12,3", "17,4", "0,5",
        "13,6", "30,7", "1,8", "14,9", "7496,10"
    };

    Assert.True(listOfKeys.Count == 11);
    for (int i = 0; i < expectedKeys.Length; i++)
    {
        Assert.Equal(expectedKeys[i], listOfKeys[i]);
    }
}
/// <summary>
/// Checks that <c>hashfunc_hashstringtobignasty</c> turns a fixed hex string
/// into a <c>BigInteger</c> with the expected decimal representation.
/// </summary>
public void test_hashstringtobigint()
{
    const string hexInput = "47bce5c74f589f4867dbd57e9ca9f808";
    const string expectedDecimal = "95355999972893604581396806948474189832";
    var hasher = new Simhash();

    BigInteger converted = hasher.hashfunc_hashstringtobignasty(hexInput);

    Assert.AreEqual(expectedDecimal, converted.ToString());
}
/// <summary>
/// Exercises <c>GetNearDups</c> on the shared index while object 1 is deleted
/// and re-added; repeating the delete or the add must leave the match count unchanged.
/// </summary>
public void Get_Near_Dup_Hash()
{
    var hasher = new SimhashLib.Simhash();
    var probe = hasher.ComputeHashByMd5("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

    // Starting state of the shared index: three matches are expected.
    var found = index.GetNearDups(probe);
    Assert.Equal(3, found.Count);

    // Deleting object 1 is expected to drop one match.
    var firstDelete = hasher.ComputeHashByMd5(testData[1]);
    index.Delete(1, firstDelete);
    found = index.GetNearDups(probe);
    Assert.Equal(2, found.Count);

    // Deleting the same object again must not change the count.
    var secondDelete = hasher.ComputeHashByMd5(testData[1]);
    index.Delete(1, secondDelete);
    found = index.GetNearDups(probe);
    Assert.Equal(2, found.Count);

    // Adding object 1 back is expected to restore the third match.
    var firstAdd = hasher.ComputeHashByMd5(testData[1]);
    index.Add(1, firstAdd);
    found = index.GetNearDups(probe);
    Assert.Equal(3, found.Count);

    // Adding the same object again must not produce a fourth match.
    var secondAdd = hasher.ComputeHashByMd5(testData[1]);
    index.Add(1, secondAdd);
    found = index.GetNearDups(probe);
    Assert.Equal(3, found.Count);
}
/// <summary>
/// Checks the MD5-based simhash value produced for the feature list { "aaa", "bbb" }.
/// </summary>
public void test_value()
{
    const ulong expected = 8637903533912358349;
    var features = new List<string> { "aaa", "bbb" };

    var hash = new Simhash(hashingType: Simhash.HashingType.MD5);
    hash.GenerateSimhash(features);

    Assert.AreEqual(expected, hash.value);
}
/// <summary>
/// With an index built by <c>setUpIndex(1)</c>, a text unrelated to the indexed
/// data must produce no near duplicates.
/// </summary>
public void test_get_near_dup_hash_jenkins_not_close()
{
    var localIndex = setUpIndex(1);

    var unrelated = new Simhash();
    unrelated.GenerateSimhash("This is not even close to the text that is loaded by default");

    var found = localIndex.get_near_dups(unrelated);
    Assert.AreEqual(0, found.Count);
}
/// <summary>
/// Tokenizes <paramref name="content"/> with <c>Shingling.Tokenize</c> and passes the
/// result, together with <paramref name="hash"/>, to the matching <c>ComputeHash</c> overload.
/// </summary>
/// <param name="self">The simhash instance the computation is invoked on.</param>
/// <param name="content">Text to hash; its length is used as the initial capacity of the scratch <c>StringBuilder</c>.</param>
/// <param name="hash">Hash implementation applied to the tokens.</param>
/// <returns>The result of the forwarded <c>ComputeHash</c> call.</returns>
private static SimhashResult ComputeHash<THash, TRes>(this Simhash self, string content, THash hash)
    where THash : struct, IHash<TRes>
    where TRes : IHashResult<TRes>
    => self.ComputeHash<THash, TRes>(
        Shingling.Tokenize(content, new StringBuilder(content.Length)),
        hash);
/// <summary>
/// With an index built by <c>SetUpIndex(1)</c>, a text unrelated to the indexed
/// data must produce no near duplicates.
/// </summary>
public void Get_Near_Dup_Hash_Jenkins_Not_Close()
{
    // NOTE(review): the method name mentions Jenkins but the hash is computed with MD5 - confirm this is intended.
    var hasher = new SimhashLib.Simhash();
    var localIndex = SetUpIndex(1);

    var unrelated = hasher.ComputeHashByMd5("This is not even close to the text that is loaded by default");

    Assert.Empty(localIndex.GetNearDups(unrelated));
}
/// <summary>
/// Removes the entry for <paramref name="obj_id"/> with fingerprint
/// <paramref name="simhash"/> from every bucket the fingerprint maps to.
/// Keys without a bucket, and buckets that do not contain the entry, are left untouched.
/// </summary>
/// <param name="obj_id">Id of the object to remove.</param>
/// <param name="simhash">Fingerprint the object was indexed under.</param>
public void delete(long obj_id, Simhash simhash)
{
    // The entry text is the same for every key, so it is built once.
    string v = string.Format("{0},{1}", simhash.value, obj_id);

    foreach (string key in get_keys(simhash))
    {
        // TryGetValue performs the existence check and the fetch in one lookup.
        if (bucket.TryGetValue(key, out var entries))
        {
            entries.Remove(v);
        }
    }
}
/// <summary>
/// With an index built by <c>setUpIndex(1)</c>, hashing the 3-wide shingles of
/// <c>testData[1]</c> must find exactly one near duplicate.
/// </summary>
public void test_get_near_dup_hash_jenkins_find_one()
{
    var localIndex = setUpIndex(1);

    var probe = new Simhash();
    var tokenizer = new Shingling();
    probe.GenerateSimhash(tokenizer.tokenize(testData[1], 3));

    var found = localIndex.get_near_dups(probe);
    Assert.AreEqual(1, found.Count);
}
/// <summary>
/// Counts the bit positions in which this fingerprint and
/// <paramref name="another"/> differ.
/// </summary>
/// <param name="another">Fingerprint to compare with; must have the same <c>fpSize</c>.</param>
/// <returns>The number of differing bits.</returns>
/// <exception cref="Exception">Thrown when the two fingerprints have different <c>fpSize</c> values.</exception>
public int distance(Simhash another)
{
    if (fpSize != another.fpSize)
    {
        throw new Exception();
    }

    // XOR leaves a 1 in every position where the two values disagree.
    ulong diff = value ^ another.value;

    // Each step clears the lowest set bit, so the loop runs once per differing bit.
    int bits = 0;
    for (; diff != 0; diff &= diff - 1)
    {
        bits++;
    }

    return bits;
}
/// <summary>
/// With an index built by <c>SetUpIndex(1)</c>, hashing the 3-wide shingles of
/// <c>testData[1]</c> must find exactly one near duplicate.
/// </summary>
public void Get_Near_Dup_Hash_Jenkins_Find_One()
{
    // NOTE(review): the method name mentions Jenkins but the hash is computed with MD5 - confirm this is intended.
    var hasher = new SimhashLib.Simhash();
    var localIndex = SetUpIndex(1);

    var shingles = Shingling.Tokenize(testData[1], new StringBuilder(), 3);
    var probe = hasher.ComputeHashByMd5(shingles);

    Assert.Single(localIndex.GetNearDups(probe));
}
/// <summary>
/// Fills <c>testData</c> with four sample texts, hashes each with MD5 into
/// <c>objs</c>, and builds the shared index with k = 10.
/// </summary>
public TestSimhashIndexMD5()
{
    testData.Add(1, "How are you? I Am fine. blar blar blar blar blar Thanks.");
    testData.Add(2, "How are you i am fine. blar blar blar blar blar than");
    testData.Add(3, "This is simhash test.");
    testData.Add(4, "How are you i am fine. blar blar blar blar blar thank1");

    var hasher = new SimhashLib.Simhash();
    foreach (var entry in testData)
    {
        var hashed = hasher.ComputeHashByMd5(entry.Value);
        objs.Add(entry.Key, hashed);
    }

    index = new SimhashIndex(objs: objs, k: 10);
}
/// <summary>
/// Builds a fresh index over <c>testData</c>, hashing the 3-wide shingles of each
/// text with a default-constructed <c>Simhash</c>.
/// </summary>
/// <param name="kValue">Value passed to the index as <c>k</c>.</param>
/// <returns>The newly built index.</returns>
private SimhashIndex setUpIndex(int kValue)
{
    var hashesById = new Dictionary<long, Simhash>();

    foreach (var entry in testData)
    {
        var hash = new Simhash();
        var tokenizer = new Shingling();
        hash.GenerateSimhash(tokenizer.tokenize(entry.Value, 3));
        hashesById.Add(entry.Key, hash);
    }

    return new SimhashIndex(objs: hashesById, k: kValue);
}
/// <summary>
/// Builds a fresh index over <c>testData</c>, hashing the 3-wide shingles of each
/// text with MD5.
/// </summary>
/// <param name="kValue">Value passed to the index as <c>k</c>.</param>
/// <returns>The newly built index.</returns>
private SimhashIndex SetUpIndex(int kValue)
{
    var hasher = new SimhashLib.Simhash();
    var hashesById = new Dictionary<long, SimhashResult>();

    // One StringBuilder is reused for all texts; it is cleared before each tokenization.
    var scratch = new StringBuilder();
    foreach (var entry in testData)
    {
        var shingles = Shingling.Tokenize(entry.Value, scratch.Clear(), 3);
        var hashed = hasher.ComputeHashByMd5(shingles);
        hashesById.Add(entry.Key, hashed);
    }

    return new SimhashIndex(objs: hashesById, k: kValue);
}
/// <summary>
/// Fills <c>testData</c> with four sample texts, hashes each with MD5 into
/// <c>objs</c>, and builds the shared index with k = 10.
/// </summary>
public void setUp()
{
    testData.Add(1, "How are you? I Am fine. blar blar blar blar blar Thanks.");
    testData.Add(2, "How are you i am fine. blar blar blar blar blar than");
    testData.Add(3, "This is simhash test.");
    testData.Add(4, "How are you i am fine. blar blar blar blar blar thank1");

    foreach (var entry in testData)
    {
        var hash = new Simhash(hashingType: Simhash.HashingType.MD5);
        hash.GenerateSimhash(entry.Value);
        objs.Add(entry.Key, hash);
    }

    index = new SimhashIndex(objs: objs, k: 10);
}
/// <summary>
/// Returns how many bits differ between this fingerprint and <paramref name="another"/>.
/// </summary>
/// <param name="another">Fingerprint to compare with; must have the same <c>fpSize</c>.</param>
/// <returns>The number of differing bits.</returns>
/// <exception cref="Exception">Thrown when the two fingerprints have different <c>fpSize</c> values.</exception>
public int distance(Simhash another)
{
    if (fpSize != another.fpSize)
    {
        throw new Exception();
    }

    // A set bit in the XOR marks a position where the two values differ.
    ulong remaining = value ^ another.value;
    int count = 0;

    while (remaining != 0)
    {
        // Clear the lowest set bit and count it.
        remaining &= remaining - 1;
        count++;
    }

    return count;
}
/// <summary>
/// Checks <c>distance</c>: two different texts are more than 0 apart, a copy of a
/// hash is 0 apart from its source, and the hash of "1" is not 0 apart from that copy.
/// </summary>
public void test_distance()
{
    var first = new Simhash();
    first.GenerateSimhash("How are you? I AM fine. Thanks. And you?");

    var second = new Simhash();
    second.GenerateSimhash("How old are you? :-) i am fine. Thanks. And you?");

    // Different texts: the distance must be positive.
    int betweenTexts = first.distance(second);
    Assert.IsTrue(betweenTexts > 0);

    // A copy made through the copy constructor must be at distance 0.
    var copyOfSecond = new Simhash(second);
    int toCopy = second.distance(copyOfSecond);
    Assert.AreEqual(0, toCopy);

    var single = new Simhash();
    single.GenerateSimhash("1");
    Assert.AreNotEqual(0, single.distance(copyOfSecond));
}
/// <summary>
/// Indexes <paramref name="obj_id"/> under <paramref name="simhash"/>: the entry
/// "fingerprint,obj_id" is added to the bucket of every key the fingerprint maps to,
/// creating buckets that do not exist yet.
/// </summary>
/// <param name="obj_id">Id of the object being indexed.</param>
/// <param name="simhash">Fingerprint of the object.</param>
public void add(long obj_id, Simhash simhash)
{
    // The entry text is the same for every key, so it is built once.
    string v = string.Format("{0},{1}", simhash.value, obj_id);

    foreach (string key in get_keys(simhash))
    {
        // TryGetValue performs the existence check and the fetch in one lookup.
        if (bucket.TryGetValue(key, out var values))
        {
            values.Add(v);
        }
        else
        {
            bucket.Add(key, new HashSet<string>() { v });
        }
    }
}
/// <summary>
/// Checks that two Chinese texts differing only in spacing and punctuation are at
/// distance 0, and that three near-identical English texts are fewer than 3 bits apart.
/// </summary>
public void test_chinese()
{
    var sh = new Simhash();
    sh.GenerateSimhash("你好 世界!  呼噜。");
    var sh2 = new Simhash();
    sh2.GenerateSimhash("你好,世界呼噜");

    // Expected value goes first so a failure message reports expected and actual correctly.
    Assert.AreEqual(0, sh.distance(sh2));

    var sh4 = new Simhash();
    sh4.GenerateSimhash("How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.");
    var sh5 = new Simhash();
    sh5.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than");
    var sh6 = new Simhash();
    sh6.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

    Assert.IsTrue(sh4.distance(sh6) < 3);
    Assert.IsTrue(sh5.distance(sh6) < 3);
}
/// <summary>
/// Checks, for the supplied hash implementation, that two Chinese texts differing
/// only in spacing and punctuation are at distance 0, and that three near-identical
/// English texts are fewer than 3 bits apart.
/// </summary>
/// <param name="hash">Hash implementation passed to every <c>ComputeHash</c> call.</param>
private void Chinese<THash, TRes>(THash hash)
    where THash : IHash<TRes>
    where TRes : IHashResult<TRes>
{
    var hasher = new SimhashLib.Simhash();

    var spaced = hasher.ComputeHash<THash, TRes>("你好 世界!  呼噜。", hash);
    var compact = hasher.ComputeHash<THash, TRes>("你好,世界呼噜", hash);
    Assert.Equal(0, spaced.Distance(compact));

    var withThanks = hasher.ComputeHash<THash, TRes>("How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.", hash);
    var withThan = hasher.ComputeHash<THash, TRes>("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than", hash);
    var withThank = hasher.ComputeHash<THash, TRes>("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank", hash);

    Assert.True(withThanks.Distance(withThank) < 3);
    Assert.True(withThan.Distance(withThank) < 3);
}
/// <summary>
/// Adds the entry "fingerprint,obj_id" for <paramref name="obj_id"/> to the bucket of
/// every key derived from <paramref name="simhash"/>. A bucket is created the first
/// time its key is seen.
/// </summary>
/// <param name="obj_id">Id of the object being indexed.</param>
/// <param name="simhash">Fingerprint of the object.</param>
public void add(long obj_id, Simhash simhash)
{
    // Built once: the entry does not depend on the key.
    string v = string.Format("{0},{1}", simhash.value, obj_id);

    foreach (string key in get_keys(simhash))
    {
        // One lookup decides between creating the bucket and extending it.
        if (!bucket.TryGetValue(key, out var existing))
        {
            bucket.Add(key, new HashSet<string>() { v });
            continue;
        }

        existing.Add(v);
    }
}
/// <summary>
/// Checks, for the supplied hash implementation, that six short and similar
/// strings all produce pairwise different hash results.
/// </summary>
/// <param name="hash">Hash implementation passed to every <c>ComputeHash</c> call.</param>
private void Short<THash, TRes>(THash hash)
    where THash : IHash<TRes>
    where TRes : IHashResult<TRes>
{
    var simhash = new SimhashLib.Simhash();
    var ss = new List<string>() { "aa", "aaa", "aaaa", "aaaab", "aaaaabb", "aaaaabbb" };
    var shs = ss.Select(s => simhash.ComputeHash<THash, TRes>(s, hash)).ToList();

    // Pairs are chosen by position. Selecting pairs with !Equals and then asserting
    // NotEqual on them could never fail, so a hash collision would go unnoticed.
    for (int i = 0; i < shs.Count; i++)
    {
        for (int j = 0; j < shs.Count; j++)
        {
            if (i != j)
            {
                Assert.NotEqual(shs[i], shs[j]);
            }
        }
    }
}
/// <summary>
/// Hashes six short and similar strings and asserts that every pair of distinct
/// <c>Simhash</c> objects compares as not equal.
/// </summary>
public void test_short()
{
    var hashes = new List<Simhash>();
    var inputs = new List<string> { "aa", "aaa", "aaaa", "aaaab", "aaaaabb", "aaaaabbb" };

    foreach (string text in inputs)
    {
        var hash = new Simhash();
        hash.GenerateSimhash(text);
        hashes.Add(hash);
    }

    foreach (Simhash left in hashes)
    {
        foreach (Simhash right in hashes)
        {
            // Pairs for which != is false (an element paired with itself) are skipped.
            if (left != right)
            {
                Assert.AreNotEqual(left, right);
            }
        }
    }
}
/// <summary>
/// Cuts the fingerprint into one bit slice per entry of <c>offsets</c> and yields
/// a key of the form "slice,index" for each slice.
/// </summary>
/// <param name="simhash">Fingerprint whose <c>value</c> is sliced.</param>
/// <returns>One key per offset, in offset order.</returns>
private static IEnumerable<string> get_keys(Simhash simhash)
{
    for (int i = 0; i < offsets.Count; i++)
    {
        // Slice width: up to the next offset, or up to the end of the fingerprint for the last slice.
        int off = (i == offsets.Count - 1)
            ? fpSizeStatic - offsets[i]
            : offsets[i + 1] - offsets[i];

        // The mask is built with integer arithmetic. A double cannot hold 2^off - 1
        // exactly once off exceeds 53, and converting 2^64 to ulong overflows, so a
        // Math.Pow-based mask is wrong for wide slices. A 64-bit slice keeps every bit.
        ulong mask = off >= 64 ? ulong.MaxValue : (1UL << off) - 1;

        ulong c = (simhash.value >> offsets[i]) & mask;
        yield return string.Format("{0},{1}", c, i);
    }
}
/// <summary>
/// Checks <c>Distance</c> for the supplied hash implementation: two different texts
/// are more than 0 apart, a result is 0 apart from itself, and the hash of "1" is
/// not 0 apart from the second text.
/// </summary>
/// <param name="hash">Hash implementation passed to every <c>ComputeHash</c> call.</param>
public void Distance<THash, TRes>(THash hash)
    where THash : IHash<TRes>
    where TRes : IHashResult<TRes>
{
    var hasher = new SimhashLib.Simhash();
    var first = hasher.ComputeHash<THash, TRes>("How are you? I AM fine. Thanks. And you?", hash);
    var second = hasher.ComputeHash<THash, TRes>("How old are you? :-) i am fine. Thanks. And you?", hash);

    // Each distance is computed twice; both results must meet the same expectation.
    var betweenTextsA = first.Distance(second);
    var betweenTextsB = first.Distance(second);
    Assert.True(betweenTextsA > 0);
    Assert.True(betweenTextsB > 0);

    var toSelfA = second.Distance(second);
    var toSelfB = second.Distance(second);
    Assert.Equal(0, toSelfA);
    Assert.Equal(0, toSelfB);

    var single = hasher.ComputeHash<THash, TRes>("1", hash);
    Assert.NotEqual(0, single.Distance(second));
}
/// <summary>
/// Computes the simhash of <paramref name="tokens"/> using a new <c>Md5Hash</c>.
/// </summary>
/// <param name="self">The simhash instance the computation is invoked on.</param>
/// <param name="tokens">Tokens to hash.</param>
/// <returns>The result of the generic <c>ComputeHash</c> call.</returns>
public static SimhashResult ComputeHashByMd5(this Simhash self, List<string> tokens)
{
    return self.ComputeHash<Md5Hash, Md5HashResult>(tokens, new Md5Hash());
}
/// <summary>
/// Computes the simhash of <paramref name="tokens"/> using a new <c>MurmurHash3</c>.
/// </summary>
/// <param name="self">The simhash instance the computation is invoked on.</param>
/// <param name="tokens">Tokens to hash.</param>
/// <returns>The result of the generic <c>ComputeHash</c> call.</returns>
public static SimhashResult ComputeHashByMurmurHash3(this Simhash self, List<string> tokens)
{
    return self.ComputeHash<MurmurHash3, MurmurHash3Result>(tokens, new MurmurHash3());
}
/// <summary>
/// Computes the simhash of <paramref name="content"/> using a <c>JenkinsHash</c>
/// constructed with <paramref name="seed"/>.
/// </summary>
/// <param name="self">The simhash instance the computation is invoked on.</param>
/// <param name="content">Text to hash.</param>
/// <param name="seed">Seed passed to the <c>JenkinsHash</c> constructor; defaults to 0.</param>
/// <returns>The result of the generic <c>ComputeHash</c> call.</returns>
public static SimhashResult ComputeHashByJenkins(this Simhash self, string content, uint seed = default)
{
    return self.ComputeHash<JenkinsHash, JenkinsHashResult>(content, new JenkinsHash(seed));
}
/// <summary>
/// Computes the simhash of <paramref name="content"/> using a new <c>MurmurHash3</c>.
/// </summary>
/// <param name="self">The simhash instance the computation is invoked on.</param>
/// <param name="content">Text to hash.</param>
/// <returns>The result of the generic <c>ComputeHash</c> call.</returns>
public static SimhashResult ComputeHashByMurmurHash3(this Simhash self, string content)
{
    return self.ComputeHash<MurmurHash3, MurmurHash3Result>(content, new MurmurHash3());
}
/// <summary>
/// Creates a fingerprint that carries the same <c>value</c> as <paramref name="simHash"/>.
/// </summary>
/// <param name="simHash">Fingerprint to copy the value from.</param>
public Simhash(Simhash simHash)
{
    // NOTE(review): only value is copied here - confirm no other state (e.g. fpSize) needs to be carried over.
    this.value = simHash.value;
}
/// <summary>
/// Computes the simhash of <paramref name="tokens"/> using a <c>JenkinsHash</c>
/// constructed with <paramref name="seed"/>.
/// </summary>
/// <param name="self">The simhash instance the computation is invoked on.</param>
/// <param name="tokens">Tokens to hash.</param>
/// <param name="seed">Seed passed to the <c>JenkinsHash</c> constructor; defaults to 0.</param>
/// <returns>The result of the generic <c>ComputeHash</c> call.</returns>
public static SimhashResult ComputeHashByJenkins(this Simhash self, List<string> tokens, uint seed = default)
{
    return self.ComputeHash<JenkinsHash, JenkinsHashResult>(tokens, new JenkinsHash(seed));
}
/// <summary>
/// Computes the simhash of <paramref name="content"/> using a new <c>Md5Hash</c>.
/// </summary>
/// <param name="self">The simhash instance the computation is invoked on.</param>
/// <param name="content">Text to hash.</param>
/// <returns>The result of the generic <c>ComputeHash</c> call.</returns>
public static SimhashResult ComputeHashByMd5(this Simhash self, string content)
{
    return self.ComputeHash<Md5Hash, Md5HashResult>(content, new Md5Hash());
}
/// <summary>
/// Yields one bucket key per entry of <c>offsets</c>. Each key has the form
/// "slice,index", where slice is the group of fingerprint bits that starts at that offset.
/// </summary>
/// <param name="simhash">Fingerprint whose <c>value</c> is sliced.</param>
/// <returns>The keys, in offset order.</returns>
private static IEnumerable<string> get_keys(Simhash simhash)
{
    int last = offsets.Count - 1;
    for (int i = 0; i < offsets.Count; i++)
    {
        // The last slice runs to the end of the fingerprint; the others run to the next offset.
        int off = i == last
            ? fpSizeStatic - offsets[i]
            : offsets[i + 1] - offsets[i];

        // Integer mask of off one-bits. Computing it through Math.Pow loses the low
        // bits once off exceeds 53 (double precision) and makes Convert.ToUInt64
        // throw when off is 64, so the shift form is used instead.
        ulong mask = off >= 64 ? ulong.MaxValue : (1UL << off) - 1;

        ulong c = (simhash.value >> offsets[i]) & mask;
        yield return string.Format("{0},{1}", c, i);
    }
}
/// <summary>
/// Checks that <c>hashfunc_hashtostring</c> returns the expected hex string for "aaa".
/// </summary>
public void test_hashtostringvalue()
{
    var simHash = new Simhash();

    string val = simHash.hashfunc_hashtostring("aaa");

    // Expected value goes first so a failure message reports expected and actual correctly.
    Assert.AreEqual("47bce5c74f589f4867dbd57e9ca9f808", val);
}
/// <summary>
/// Checks the MD5-based simhash value produced for the single feature "aaa".
/// </summary>
public void test_value_control()
{
    const ulong expected = 7483809945577191432;
    var features = new List<string> { "aaa" };

    var hash = new Simhash(hashingType: Simhash.HashingType.MD5);
    hash.GenerateSimhash(features);

    Assert.AreEqual(expected, hash.value);
}
/// <summary>
/// Returns the keys produced by <c>get_keys</c> for <paramref name="simhash"/> as a list.
/// </summary>
/// <param name="simhash">Fingerprint to derive the keys from.</param>
/// <returns>A new list holding the keys in the order they are produced.</returns>
public List<string> get_the_keys(Simhash simhash) => get_keys(simhash).ToList();
/// <summary>
/// Checks the MD5-based simhash value produced when a raw string is passed to
/// <c>GenerateSimhash</c>.
/// </summary>
public void test_value_by_string()
{
    const ulong expected = 5683413558821905382;
    const string text = "aaa bbb test test testing.happy time = -).";

    var hash = new Simhash(hashingType: Simhash.HashingType.MD5);
    hash.GenerateSimhash(text);

    Assert.AreEqual(expected, hash.value);
}
/// <summary>
/// Materializes the keys that <c>get_keys</c> yields for <paramref name="simhash"/>.
/// </summary>
/// <param name="simhash">Fingerprint to derive the keys from.</param>
/// <returns>A new list holding the keys in the order they are yielded.</returns>
public List<string> get_the_keys(Simhash simhash)
{
    IEnumerable<string> keys = get_keys(simhash);
    return keys.ToList();
}