/// <summary>
/// Collects the ids of stored objects whose fingerprint lies within
/// <c>kDistance</c> of <paramref name="simhash"/>.
/// </summary>
/// <param name="simhash">The Simhash instance to look up.</param>
/// <returns>
/// A set of obj_id values (typed as <c>long</c> for now); empty when no
/// bucket entry is close enough.
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="simhash"/> has a different fpSize than this index.
/// </exception>
public HashSet<long> get_near_dups(Simhash simhash)
{
    if (simhash.fpSize != this.fpSize)
    {
        // A specific exception type with a message instead of a bare Exception;
        // ArgumentException still derives from Exception for existing catch blocks.
        throw new ArgumentException(
            "simhash.fpSize (" + simhash.fpSize + ") does not match the index fpSize (" + this.fpSize + ").",
            "simhash");
    }

    var ans = new HashSet<long>();

    foreach (string key in get_keys(simhash))
    {
        if (!bucket.ContainsKey(key))
        {
            continue;
        }

        foreach (var dup in bucket[key])
        {
            // Each bucket entry is split on ',' into "<fingerprint>,<obj_id>".
            string[] parts = dup.Split(',');
            ulong fp = Convert.ToUInt64(parts[0]);
            long obj_id = Convert.ToInt64(parts[1]);

            // Rebuild a Simhash from the stored fingerprint to measure the distance.
            var candidate = new Simhash(fp);
            if (simhash.distance(candidate) <= kDistance)
            {
                ans.Add(obj_id);
            }
        }
    }

    return ans;
}
/// <summary>
/// Exercises <c>Simhash.distance</c>: two different sentences must be at a
/// non-zero distance, a Simhash built from another Simhash must be at
/// distance zero from it, and a hash of "1" must differ from both.
/// </summary>
public void test_distance()
{
    var firstHash = new Simhash();
    firstHash.GenerateSimhash("How are you? I AM fine. Thanks. And you?");

    var secondHash = new Simhash();
    secondHash.GenerateSimhash("How old are you? :-) i am fine. Thanks. And you?");

    // Different wording is expected to give different fingerprints.
    int differentTextDistance = firstHash.distance(secondHash);
    Assert.IsTrue(differentTextDistance > 0);

    // A Simhash constructed from an existing one must compare as identical.
    var secondHashClone = new Simhash(secondHash);
    int cloneDistance = secondHash.distance(secondHashClone);
    Assert.AreEqual(0, cloneDistance);

    // A hash of "1" must not collide with the sentence hash.
    var shortTextHash = new Simhash();
    shortTextHash.GenerateSimhash("1");
    Assert.AreNotEqual(0, shortTextHash.distance(secondHashClone));
}
/// <summary>
/// Checks that Chinese text differing only in punctuation and spacing hashes
/// to the same fingerprint, and that near-identical English sentences stay
/// within a distance of less than 3 from each other.
/// </summary>
public void test_chinese()
{
    var sh = new Simhash();
    sh.GenerateSimhash("你好 世界!  呼噜。");
    var sh2 = new Simhash();
    sh2.GenerateSimhash("你好,世界 呼噜");

    // Expected value goes first so a failure message reports the values the right way round.
    Assert.AreEqual(0, sh.distance(sh2));

    var sh4 = new Simhash();
    sh4.GenerateSimhash("How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.");
    var sh5 = new Simhash();
    sh5.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than");
    var sh6 = new Simhash();
    sh6.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

    // The sentences differ only in punctuation, casing and the final word.
    Assert.IsTrue(sh4.distance(sh6) < 3);
    Assert.IsTrue(sh5.distance(sh6) < 3);
}
/// <summary>
/// Looks up near-duplicates of <paramref name="simhash"/>: every stored entry
/// found under one of its keys whose fingerprint distance is at most
/// <c>kDistance</c> contributes its obj_id to the result.
/// </summary>
/// <param name="simhash">An instance of Simhash to search for.</param>
/// <returns>
/// The set of matching obj_id values (of type <c>long</c> for now); empty if
/// nothing is close enough.
/// </returns>
/// <exception cref="ArgumentException">
/// Thrown when the fpSize of <paramref name="simhash"/> differs from this index's fpSize.
/// </exception>
public HashSet<long> get_near_dups(Simhash simhash)
{
    if (simhash.fpSize != this.fpSize)
    {
        // Use a specific, descriptive exception rather than a bare Exception;
        // ArgumentException is still caught by existing catch (Exception) handlers.
        throw new ArgumentException(
            "simhash.fpSize (" + simhash.fpSize + ") does not match the index fpSize (" + this.fpSize + ").",
            "simhash");
    }

    var ans = new HashSet<long>();

    foreach (string key in get_keys(simhash))
    {
        if (!bucket.ContainsKey(key))
        {
            continue;
        }

        foreach (var dup in bucket[key])
        {
            // Entries are split on ',' into fingerprint and obj_id.
            string[] parts = dup.Split(',');
            ulong fp = Convert.ToUInt64(parts[0]);
            long obj_id = Convert.ToInt64(parts[1]);

            // Wrap the stored fingerprint in a Simhash so distance() can be used.
            var stored = new Simhash(fp);
            if (simhash.distance(stored) <= kDistance)
            {
                ans.Add(obj_id);
            }
        }
    }

    return ans;
}