Beispiel #1
0
        /// <summary>
        /// Collects the ids of indexed objects whose fingerprint is within
        /// <c>kDistance</c> of <paramref name="simhash"/>, as measured by <c>Simhash.distance</c>.
        /// </summary>
        /// <param name="simhash">Query fingerprint; its <c>fpSize</c> must equal this index's <c>fpSize</c>.</param>
        /// <returns>A set of matching object ids; empty when no candidate is close enough.</returns>
        /// <exception cref="ArgumentException">The query's <c>fpSize</c> differs from the index's.</exception>
        public HashSet <long> get_near_dups(Simhash simhash)
        {
            if (simhash.fpSize != this.fpSize)
            {
                throw new ArgumentException("Fingerprint size does not match the size used by this index.", nameof(simhash));
            }

            var ans = new HashSet <long>();

            foreach (string key in get_keys(simhash))
            {
                // One dictionary lookup instead of ContainsKey followed by the indexer.
                if (!bucket.TryGetValue(key, out var dups))
                {
                    continue;
                }

                foreach (var dup in dups)
                {
                    // Each candidate is parsed as "<fingerprint>,<obj_id>".
                    string[] parts  = dup.Split(',');
                    ulong    fp     = Convert.ToUInt64(parts[0]);
                    long     obj_id = Convert.ToInt64(parts[1]);

                    // Sharing a key only makes an entry a candidate; the distance check decides.
                    if (simhash.distance(new Simhash(fp)) <= kDistance)
                    {
                        ans.Add(obj_id);
                    }
                }
            }
            return(ans);
        }
        /// <summary>
        /// Returns the ids of indexed objects whose fingerprint is at most
        /// <c>kDistance</c> away from <paramref name="simhash"/> according to <c>Simhash.distance</c>.
        /// </summary>
        /// <param name="simhash">Query fingerprint; its <c>fpSize</c> must match this index's <c>fpSize</c>.</param>
        /// <returns>The set of matching object ids (currently of type <c>long</c>); empty if none match.</returns>
        /// <exception cref="ArgumentException">The query's <c>fpSize</c> differs from the index's.</exception>
        public HashSet<long> get_near_dups(Simhash simhash)
        {
            if (simhash.fpSize != this.fpSize)
            {
                throw new ArgumentException("Fingerprint size does not match the size used by this index.", nameof(simhash));
            }

            var ans = new HashSet<long>();

            foreach (string key in get_keys(simhash))
            {
                // TryGetValue avoids hashing the key twice (ContainsKey + indexer).
                if (!bucket.TryGetValue(key, out var dups))
                {
                    continue;
                }

                foreach (var dup in dups)
                {
                    // Each candidate is parsed as "<fingerprint>,<obj_id>".
                    string[] parts = dup.Split(',');
                    ulong fp = Convert.ToUInt64(parts[0]);
                    long obj_id = Convert.ToInt64(parts[1]);

                    // A shared key is not enough: only entries within kDistance are reported.
                    int d = simhash.distance(new Simhash(fp));
                    if (d <= kDistance)
                    {
                        ans.Add(obj_id);
                    }
                }
            }
            return ans;
        }
Beispiel #3
0
 /// <summary>
 /// Pins the value that <c>hashfuncjenkins</c> returns for the input "aaa".
 /// </summary>
 public void test_gethashcode_specialhashing_to64bit()
 {
     const ulong expectedHash = 18323053351575752945;

     ulong actualHash = new Simhash().hashfuncjenkins("aaa");

     Assert.AreEqual(expectedHash, actualHash);
 }
        /// <summary>
        /// Checks the near-duplicate count for a fixed query while entry 1 is
        /// deleted twice and then added twice on the shared <c>index</c>.
        /// </summary>
        public void test_get_near_dup_hash()
        {
            var query = new Simhash(hashingType: Simhash.HashingType.MD5);
            query.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");
            Assert.AreEqual(3, index.get_near_dups(query).Count);

            // First delete of entry 1: one match fewer.
            var firstDelete = new Simhash(hashingType: Simhash.HashingType.MD5);
            firstDelete.GenerateSimhash(testData[1]);
            index.delete(1, firstDelete);
            Assert.AreEqual(2, index.get_near_dups(query).Count);

            // Deleting the same entry again leaves the count unchanged.
            var secondDelete = new Simhash(hashingType: Simhash.HashingType.MD5);
            secondDelete.GenerateSimhash(testData[1]);
            index.delete(1, secondDelete);
            Assert.AreEqual(2, index.get_near_dups(query).Count);

            // Adding entry 1 back restores the original count.
            var firstAdd = new Simhash(hashingType: Simhash.HashingType.MD5);
            firstAdd.GenerateSimhash(testData[1]);
            index.add(1, firstAdd);
            Assert.AreEqual(3, index.get_near_dups(query).Count);

            // Adding it a second time must not produce an extra match.
            var secondAdd = new Simhash(hashingType: Simhash.HashingType.MD5);
            secondAdd.GenerateSimhash(testData[1]);
            index.add(1, secondAdd);
            Assert.AreEqual(3, index.get_near_dups(query).Count);
        }
        /// <summary>
        /// Builds a one-entry index with k = 10 and pins the eleven keys
        /// that <c>get_the_keys</c> reports for that entry.
        /// </summary>
        public void test_get_keys()
        {
            var texts = new Dictionary<long, string>
            {
                { 1, "How are you? I Am fine. blar blar blar blar blar Thanks." }
            };

            var hashesById = new Dictionary<long, Simhash>();
            foreach (var entry in texts)
            {
                var fingerprint = new Simhash(hashingType: Simhash.HashingType.MD5);
                fingerprint.GenerateSimhash(entry.Value);
                hashesById.Add(entry.Key, fingerprint);
            }

            var simHashIndex = new SimhashIndex(objs: hashesById, k: 10);
            var listOfKeys = simHashIndex.get_the_keys(hashesById[1]);

            string[] expectedKeys =
            {
                "26,0", "3,1", "7,2", "12,3", "17,4", "0,5",
                "13,6", "30,7", "1,8", "14,9", "7496,10"
            };

            Assert.IsTrue(listOfKeys.Count == 11);
            for (int position = 0; position < expectedKeys.Length; position++)
            {
                Assert.AreEqual(expectedKeys[position], listOfKeys[position]);
            }
        }
Beispiel #6
0
        /// <summary>
        /// Builds a one-entry index with k = 10 from an MD5-based hash and pins
        /// the eleven keys that <c>GetListKeys</c> reports for that entry.
        /// </summary>
        public void Get_Keys()
        {
            var hasher = new SimhashLib.Simhash();

            var texts = new Dictionary<long, string>
            {
                { 1, "How are you? I Am fine. blar blar blar blar blar Thanks." }
            };

            var hashesById = new Dictionary<long, SimhashResult>();
            foreach (var entry in texts)
            {
                hashesById.Add(entry.Key, hasher.ComputeHashByMd5(entry.Value));
            }

            var simHashIndex = new SimhashIndex(objs: hashesById, k: 10);
            var listOfKeys = simHashIndex.GetListKeys(hashesById[1]);

            string[] expectedKeys =
            {
                "26,0", "3,1", "7,2", "12,3", "17,4", "0,5",
                "13,6", "30,7", "1,8", "14,9", "7496,10"
            };

            Assert.True(listOfKeys.Count == 11);
            for (int position = 0; position < expectedKeys.Length; position++)
            {
                Assert.Equal(expectedKeys[position], listOfKeys[position]);
            }
        }
Beispiel #7
0
 /// <summary>
 /// Pins the decimal representation of the <c>BigInteger</c> that
 /// <c>hashfunc_hashstringtobignasty</c> produces for a fixed hex string.
 /// </summary>
 public void test_hashstringtobigint()
 {
     const string expectedDigits = "95355999972893604581396806948474189832";

     BigInteger parsed = new Simhash().hashfunc_hashstringtobignasty("47bce5c74f589f4867dbd57e9ca9f808");

     Assert.AreEqual(expectedDigits, parsed.ToString());
 }
Beispiel #8
0
        /// <summary>
        /// Checks the near-duplicate count for a fixed query while entry 1 is
        /// deleted twice and then added twice on the shared <c>index</c>.
        /// </summary>
        public void Get_Near_Dup_Hash()
        {
            var hasher = new SimhashLib.Simhash();

            var query = hasher.ComputeHashByMd5("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");
            Assert.Equal(3, index.GetNearDups(query).Count);

            // First delete of entry 1: one match fewer.
            index.Delete(1, hasher.ComputeHashByMd5(testData[1]));
            Assert.Equal(2, index.GetNearDups(query).Count);

            // Deleting the same entry again leaves the count unchanged.
            index.Delete(1, hasher.ComputeHashByMd5(testData[1]));
            Assert.Equal(2, index.GetNearDups(query).Count);

            // Adding entry 1 back restores the original count.
            index.Add(1, hasher.ComputeHashByMd5(testData[1]));
            Assert.Equal(3, index.GetNearDups(query).Count);

            // Adding it a second time must not produce an extra match.
            index.Add(1, hasher.ComputeHashByMd5(testData[1]));
            Assert.Equal(3, index.GetNearDups(query).Count);
        }
Beispiel #9
0
 /// <summary>
 /// Pins the MD5-based simhash value for the feature list { "aaa", "bbb" }.
 /// </summary>
 public void test_value()
 {
     const ulong expected = 8637903533912358349;

     var simHash = new Simhash(hashingType: Simhash.HashingType.MD5);
     simHash.GenerateSimhash(new List<string> { "aaa", "bbb" });

     Assert.AreEqual(expected, simHash.value);
 }
 /// <summary>
 /// An unrelated sentence must find no near-duplicates in an index built with k = 1.
 /// </summary>
 public void test_get_near_dup_hash_jenkins_not_close()
 {
     var narrowIndex = setUpIndex(1);

     var unrelated = new Simhash();
     unrelated.GenerateSimhash("This is not even close to the text that is loaded by default");

     Assert.AreEqual(0, narrowIndex.get_near_dups(unrelated).Count);
 }
Beispiel #11
0
        /// <summary>
        /// Tokenizes <paramref name="content"/> with <c>Shingling.Tokenize</c> and forwards
        /// the result to the <c>ComputeHash</c> overload that takes those tokens.
        /// </summary>
        /// <param name="self">Simhash instance the computation is performed on.</param>
        /// <param name="content">Raw text to hash.</param>
        /// <param name="hash">Hash implementation passed through to the token overload.</param>
        /// <returns>The result of the token-based <c>ComputeHash</c> call.</returns>
        private static SimhashResult ComputeHash <THash, TRes>(this Simhash self, string content, THash hash)
            where THash : struct, IHash <TRes>
            where TRes : IHashResult <TRes>
        {
            // The builder handed to the tokenizer is pre-sized to the input length.
            var tokens = Shingling.Tokenize(content, new StringBuilder(content.Length));

            return self.ComputeHash<THash, TRes>(tokens, hash);
        }
Beispiel #12
0
        /// <summary>
        /// An unrelated sentence must find no near-duplicates in an index built with k = 1.
        /// Note that the query is hashed with <c>ComputeHashByMd5</c>, despite the test's name.
        /// </summary>
        public void Get_Near_Dup_Hash_Jenkins_Not_Close()
        {
            var hasher = new SimhashLib.Simhash();
            var narrowIndex = SetUpIndex(1);

            var unrelated = hasher.ComputeHashByMd5("This is not even close to the text that is loaded by default");

            Assert.Empty(narrowIndex.GetNearDups(unrelated));
        }
Beispiel #13
0
 /// <summary>
 /// Removes the "&lt;value&gt;,&lt;obj_id&gt;" entry from every existing bucket that
 /// <c>get_keys</c> yields for <paramref name="simhash"/>. Buckets that do not exist are skipped.
 /// </summary>
 /// <param name="obj_id">Id of the object to remove.</param>
 /// <param name="simhash">Fingerprint the object was stored under.</param>
 public void delete(long obj_id, Simhash simhash)
 {
     // The entry text is identical for every key, so it is built once up front.
     string v = string.Format("{0},{1}", simhash.value, obj_id);

     foreach (string key in get_keys(simhash))
     {
         // Single lookup instead of ContainsKey followed by the indexer.
         if (bucket.TryGetValue(key, out var entries))
         {
             entries.Remove(v);
         }
     }
 }
Beispiel #14
0
 /// <summary>
 /// Deletes the "&lt;value&gt;,&lt;obj_id&gt;" entry from each bucket addressed by the keys of
 /// <paramref name="simhash"/>; keys without a bucket are ignored.
 /// </summary>
 /// <param name="obj_id">Id of the object to remove.</param>
 /// <param name="simhash">Fingerprint the object was stored under.</param>
 public void delete(long obj_id, Simhash simhash)
 {
     // Loop-invariant: the stored entry does not depend on the key.
     string v = string.Format("{0},{1}", simhash.value, obj_id);

     foreach (string key in get_keys(simhash))
     {
         // TryGetValue performs the existence check and the fetch in one lookup.
         if (bucket.TryGetValue(key, out var entries))
         {
             entries.Remove(v);
         }
     }
 }
        /// <summary>
        /// With k = 1, hashing the 3-wide shingles of test entry 1 must find exactly one near-duplicate.
        /// </summary>
        public void test_get_near_dup_hash_jenkins_find_one()
        {
            var narrowIndex = setUpIndex(1);

            var probe = new Simhash();
            probe.GenerateSimhash(new Shingling().tokenize(testData[1], 3));

            Assert.AreEqual(1, narrowIndex.get_near_dups(probe).Count);
        }
Beispiel #16
0
 /// <summary>
 /// Counts the bit positions in which this fingerprint's <c>value</c> differs
 /// from <paramref name="another"/>'s.
 /// </summary>
 /// <param name="another">Fingerprint to compare against; must have the same <c>fpSize</c>.</param>
 /// <returns>The number of set bits in the XOR of the two values.</returns>
 /// <exception cref="Exception">The two fingerprints have different <c>fpSize</c>.</exception>
 public int distance(Simhash another)
 {
     if (fpSize != another.fpSize)
     {
         throw new Exception();
     }

     int differingBits = 0;

     // Each step clears the lowest set bit, so the loop runs once per differing bit.
     for (ulong diff = value ^ another.value; diff != 0; diff &= diff - 1)
     {
         differingBits++;
     }

     return differingBits;
 }
Beispiel #17
0
        /// <summary>
        /// With k = 1, hashing the 3-wide shingles of test entry 1 must find exactly one near-duplicate.
        /// Note that the hash is computed with <c>ComputeHashByMd5</c>, despite the test's name.
        /// </summary>
        public void Get_Near_Dup_Hash_Jenkins_Find_One()
        {
            var hasher = new SimhashLib.Simhash();
            var narrowIndex = SetUpIndex(1);

            var tokens = Shingling.Tokenize(testData[1], new StringBuilder(), 3);
            var probe = hasher.ComputeHashByMd5(tokens);

            Assert.Single(narrowIndex.GetNearDups(probe));
        }
Beispiel #18
0
        /// <summary>
        /// Test fixture setup: registers four sample texts, hashes each with MD5
        /// and builds the shared <c>index</c> with k = 10.
        /// </summary>
        public TestSimhashIndexMD5()
        {
            var hasher = new SimhashLib.Simhash();

            testData.Add(1, "How are you? I Am fine. blar blar blar blar blar Thanks.");
            testData.Add(2, "How are you i am fine. blar blar blar blar blar than");
            testData.Add(3, "This is simhash test.");
            testData.Add(4, "How are you i am fine. blar blar blar blar blar thank1");

            foreach (var entry in testData)
            {
                var fingerprint = hasher.ComputeHashByMd5(entry.Value);
                objs.Add(entry.Key, fingerprint);
            }

            index = new SimhashIndex(objs: objs, k: 10);
        }
        /// <summary>
        /// Builds an index over <c>testData</c>, hashing the 3-wide shingles of each text
        /// with a default-constructed <c>Simhash</c>.
        /// </summary>
        /// <param name="kValue">Value passed to the index as <c>k</c>.</param>
        /// <returns>A new <c>SimhashIndex</c> containing one fingerprint per test entry.</returns>
        private SimhashIndex setUpIndex(int kValue)
        {
            var hashesById = new Dictionary<long, Simhash>();

            foreach (var entry in testData)
            {
                var fingerprint = new Simhash();
                fingerprint.GenerateSimhash(new Shingling().tokenize(entry.Value, 3));
                hashesById.Add(entry.Key, fingerprint);
            }

            return new SimhashIndex(objs: hashesById, k: kValue);
        }
Beispiel #20
0
        /// <summary>
        /// Builds an index over <c>testData</c>, hashing the 3-wide shingles of each text with MD5.
        /// </summary>
        /// <param name="kValue">Value passed to the index as <c>k</c>.</param>
        /// <returns>A new <c>SimhashIndex</c> containing one hash per test entry.</returns>
        private SimhashIndex SetUpIndex(int kValue)
        {
            var hasher = new SimhashLib.Simhash();
            var scratch = new StringBuilder();
            var hashesById = new Dictionary<long, SimhashResult>();

            foreach (var entry in testData)
            {
                // The same builder is cleared and handed to the tokenizer on every iteration.
                var tokens = Shingling.Tokenize(entry.Value, scratch.Clear(), 3);
                hashesById.Add(entry.Key, hasher.ComputeHashByMd5(tokens));
            }

            return new SimhashIndex(objs: hashesById, k: kValue);
        }
        /// <summary>
        /// Test setup: registers four sample texts, generates an MD5-based simhash
        /// for each and builds the shared <c>index</c> with k = 10.
        /// </summary>
        public void setUp()
        {
            testData.Add(1, "How are you? I Am fine. blar blar blar blar blar Thanks.");
            testData.Add(2, "How are you i am fine. blar blar blar blar blar than");
            testData.Add(3, "This is simhash test.");
            testData.Add(4, "How are you i am fine. blar blar blar blar blar thank1");

            foreach (var entry in testData)
            {
                var fingerprint = new Simhash(hashingType: Simhash.HashingType.MD5);
                fingerprint.GenerateSimhash(entry.Value);
                objs.Add(entry.Key, fingerprint);
            }

            index = new SimhashIndex(objs: objs, k: 10);
        }
Beispiel #22
0
        /// <summary>
        /// Returns how many bits differ between this fingerprint's <c>value</c>
        /// and that of <paramref name="another"/>.
        /// </summary>
        /// <param name="another">Fingerprint to compare against; must have the same <c>fpSize</c>.</param>
        /// <returns>The number of set bits in the XOR of the two values.</returns>
        /// <exception cref="Exception">The two fingerprints have different <c>fpSize</c>.</exception>
        public int distance(Simhash another)
        {
            if (fpSize != another.fpSize)
            {
                throw new Exception();
            }

            ulong remaining = value ^ another.value;
            int   count     = 0;

            // Clearing the lowest set bit per step makes the loop run once per differing bit.
            while (remaining != 0)
            {
                remaining &= remaining - 1;
                count++;
            }
            return count;
        }
Beispiel #23
0
        /// <summary>
        /// Different texts must be a positive distance apart, a copy must be at
        /// distance zero, and an unrelated short text must not be at distance zero.
        /// </summary>
        public void test_distance()
        {
            var first = new Simhash();
            first.GenerateSimhash("How are you? I AM fine. Thanks. And you?");
            var second = new Simhash();
            second.GenerateSimhash("How old are you? :-) i am fine. Thanks. And you?");
            Assert.IsTrue(first.distance(second) > 0);

            // A fingerprint built with the copy constructor is identical to its source.
            var copyOfSecond = new Simhash(second);
            Assert.AreEqual(0, second.distance(copyOfSecond));

            var single = new Simhash();
            single.GenerateSimhash("1");
            Assert.AreNotEqual(0, single.distance(copyOfSecond));
        }
Beispiel #24
0
 /// <summary>
 /// Stores the "&lt;value&gt;,&lt;obj_id&gt;" entry in the bucket of every key that
 /// <c>get_keys</c> yields for <paramref name="simhash"/>, creating buckets as needed.
 /// </summary>
 /// <param name="obj_id">Id of the object being indexed.</param>
 /// <param name="simhash">Fingerprint of the object.</param>
 public void add(long obj_id, Simhash simhash)
 {
     // The entry text is identical for every key, so it is built once up front.
     string v = string.Format("{0},{1}", simhash.value, obj_id);

     foreach (string key in get_keys(simhash))
     {
         // Single lookup instead of ContainsKey followed by the indexer.
         if (bucket.TryGetValue(key, out var values))
         {
             values.Add(v);
         }
         else
         {
             bucket.Add(key, new HashSet<string>() { v });
         }
     }
 }
Beispiel #25
0
        /// <summary>
        /// Two Chinese sentences that differ only in punctuation and spacing must be at
        /// distance zero, and three near-identical English sentences must be fewer than
        /// three bits apart.
        /// </summary>
        public void test_chinese()
        {
            var sh = new Simhash();
            sh.GenerateSimhash("你好 世界!  呼噜。");
            var sh2 = new Simhash();
            sh2.GenerateSimhash("你好,世界呼噜");

            // Expected value first, actual second, so a failure message labels them correctly.
            Assert.AreEqual(0, sh.distance(sh2));

            var sh4 = new Simhash();
            sh4.GenerateSimhash("How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.");
            var sh5 = new Simhash();
            sh5.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than");
            var sh6 = new Simhash();
            sh6.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

            Assert.IsTrue(sh4.distance(sh6) < 3);
            Assert.IsTrue(sh5.distance(sh6) < 3);
        }
Beispiel #26
0
        /// <summary>
        /// Shared test body: two Chinese sentences that differ only in punctuation and
        /// spacing must be at distance zero, and three near-identical English sentences
        /// must be fewer than three bits apart under the supplied hash.
        /// </summary>
        /// <param name="hash">Hash implementation forwarded to <c>ComputeHash</c>.</param>
        private void Chinese <THash, TRes>(THash hash)
            where THash : IHash <TRes>
            where TRes : IHashResult <TRes>
        {
            var hasher = new SimhashLib.Simhash();

            var spaced  = hasher.ComputeHash<THash, TRes>("你好 世界!  呼噜。", hash);
            var compact = hasher.ComputeHash<THash, TRes>("你好,世界呼噜", hash);
            Assert.Equal(0, spaced.Distance(compact));

            var withThanks = hasher.ComputeHash<THash, TRes>("How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.", hash);
            var withThan   = hasher.ComputeHash<THash, TRes>("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than", hash);
            var withThank  = hasher.ComputeHash<THash, TRes>("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank", hash);

            Assert.True(withThanks.Distance(withThank) < 3);
            Assert.True(withThan.Distance(withThank) < 3);
        }
Beispiel #27
0
 /// <summary>
 /// Inserts the "&lt;value&gt;,&lt;obj_id&gt;" entry into the bucket for each key of
 /// <paramref name="simhash"/>; a bucket is created the first time its key is seen.
 /// </summary>
 /// <param name="obj_id">Id of the object being indexed.</param>
 /// <param name="simhash">Fingerprint of the object.</param>
 public void add(long obj_id, Simhash simhash)
 {
     // Loop-invariant: the stored entry does not depend on the key.
     string v = string.Format("{0},{1}", simhash.value, obj_id);

     foreach (string key in get_keys(simhash))
     {
         // TryGetValue performs the existence check and the fetch in one lookup.
         if (bucket.TryGetValue(key, out var values))
         {
             values.Add(v);
         }
         else
         {
             bucket.Add(key, new HashSet <string>()
             {
                 v
             });
         }
     }
 }
Beispiel #28
0
        /// <summary>
        /// Shared test body: six short, distinct strings must all produce distinct hashes
        /// under the supplied hash implementation.
        /// </summary>
        /// <param name="hash">Hash implementation forwarded to <c>ComputeHash</c>.</param>
        private void Short <THash, TRes>(THash hash)
            where THash : IHash <TRes>
            where TRes : IHashResult <TRes>
        {
            var simhash = new SimhashLib.Simhash();

            var ss = new List <string>()
            {
                "aa", "aaa", "aaaa", "aaaab", "aaaaabb", "aaaaabbb"
            };

            var shs = ss.Select(s => simhash.ComputeHash <THash, TRes>(s, hash)).ToList();

            // Pairs are chosen by position. Filtering on !Equals first (as before) meant the
            // assertion only ever saw pairs already known to differ and could never fail.
            for (int i = 0; i < shs.Count; i++)
            {
                for (int j = 0; j < shs.Count; j++)
                {
                    if (i != j)
                    {
                        Assert.NotEqual(shs[i], shs[j]);
                    }
                }
            }
        }
Beispiel #29
0
        /// <summary>
        /// Six short, distinct strings must all produce distinct fingerprint values.
        /// </summary>
        public void test_short()
        {
            List<Simhash> shs = new List<Simhash>();
            List<string> ss = new List<string>() { "aa", "aaa", "aaaa", "aaaab", "aaaaabb", "aaaaabbb" };
            foreach (string s in ss)
            {
                var simHash = new Simhash();
                simHash.GenerateSimhash(s);
                shs.Add(simHash);
            }

            // Pairs are chosen by position and compared on their fingerprint value.
            // Selecting pairs with != and then asserting inequality on the same objects
            // (as before) could not detect two inputs that hash to the same value.
            for (int i = 0; i < shs.Count; i++)
            {
                for (int j = 0; j < shs.Count; j++)
                {
                    if (i != j)
                    {
                        Assert.AreNotEqual(shs[i].value, shs[j].value);
                    }
                }
            }
        }
Beispiel #30
0
        /// <summary>
        /// Splits the fingerprint into consecutive bit slices, one per entry in <c>offsets</c>,
        /// and yields a key "&lt;slice value&gt;,&lt;slice index&gt;" for each.
        /// </summary>
        /// <param name="simhash">Fingerprint whose <c>value</c> is sliced.</param>
        /// <returns>A lazily evaluated sequence with one key per offset.</returns>
        private static IEnumerable <string> get_keys(Simhash simhash)
        {
            for (int i = 0; i < offsets.Count; i++)
            {
                // Slice width: up to the next offset, or to the end of the fingerprint for the last slice.
                int off = (i == offsets.Count - 1)
                    ? fpSizeStatic - offsets[i]
                    : offsets[i + 1] - offsets[i];

                // Build the mask with integer arithmetic. The previous Math.Pow(2, off) - 1 went
                // through double, which cannot represent 2^off - 1 exactly above 53 bits and made
                // Convert.ToUInt64 overflow for a 64-bit slice. A ulong shift count is taken
                // modulo 64, so the full-width case needs the explicit ulong.MaxValue.
                ulong mask = off >= 64 ? ulong.MaxValue : (1UL << off) - 1;
                ulong c    = (simhash.value >> offsets[i]) & mask;

                yield return(string.Format("{0},{1}", c, i));
            }
        }
Beispiel #31
0
        /// <summary>
        /// Shared test body: different texts must be a positive distance apart, a hash must be
        /// at distance zero from itself, and an unrelated short text must not be at distance zero.
        /// Each of the first two distances is evaluated twice and both results are checked.
        /// </summary>
        /// <param name="hash">Hash implementation forwarded to <c>ComputeHash</c>.</param>
        public void Distance <THash, TRes>(THash hash)
            where THash : IHash <TRes>
            where TRes : IHashResult <TRes>
        {
            var hasher = new SimhashLib.Simhash();

            var first  = hasher.ComputeHash<THash, TRes>("How are you? I AM fine. Thanks. And you?", hash);
            var second = hasher.ComputeHash<THash, TRes>("How old are you? :-) i am fine. Thanks. And you?", hash);

            var crossA = first.Distance(second);
            var crossB = first.Distance(second);
            Assert.True(crossA > 0);
            Assert.True(crossB > 0);

            var selfA = second.Distance(second);
            var selfB = second.Distance(second);
            Assert.Equal(0, selfA);
            Assert.Equal(0, selfB);

            var single = hasher.ComputeHash<THash, TRes>("1", hash);
            Assert.NotEqual(0, single.Distance(second));
        }
Beispiel #32
0
 /// <summary>
 /// Computes the hash of a token list by calling <c>ComputeHash</c> with a new <c>Md5Hash</c>.
 /// </summary>
 /// <param name="self">Simhash instance the computation is performed on.</param>
 /// <param name="tokens">Tokens to hash.</param>
 /// <returns>The result of the underlying <c>ComputeHash</c> call.</returns>
 public static SimhashResult ComputeHashByMd5(this Simhash self, List <string> tokens)
 {
     var hasher = new Md5Hash();
     return self.ComputeHash<Md5Hash, Md5HashResult>(tokens, hasher);
 }
Beispiel #33
0
 /// <summary>
 /// Computes the hash of a token list by calling <c>ComputeHash</c> with a new <c>MurmurHash3</c>.
 /// </summary>
 /// <param name="self">Simhash instance the computation is performed on.</param>
 /// <param name="tokens">Tokens to hash.</param>
 /// <returns>The result of the underlying <c>ComputeHash</c> call.</returns>
 public static SimhashResult ComputeHashByMurmurHash3(this Simhash self, List <string> tokens)
 {
     var hasher = new MurmurHash3();
     return self.ComputeHash<MurmurHash3, MurmurHash3Result>(tokens, hasher);
 }
Beispiel #34
0
 /// <summary>
 /// Computes the hash of raw text by calling <c>ComputeHash</c> with a new
 /// <c>JenkinsHash</c> constructed from <paramref name="seed"/>.
 /// </summary>
 /// <param name="self">Simhash instance the computation is performed on.</param>
 /// <param name="content">Text to hash.</param>
 /// <param name="seed">Seed passed to the <c>JenkinsHash</c> constructor; defaults to 0.</param>
 /// <returns>The result of the underlying <c>ComputeHash</c> call.</returns>
 public static SimhashResult ComputeHashByJenkins(this Simhash self, string content, uint seed = default)
 {
     var hasher = new JenkinsHash(seed);
     return self.ComputeHash<JenkinsHash, JenkinsHashResult>(content, hasher);
 }
Beispiel #35
0
 /// <summary>
 /// Computes the hash of raw text by calling <c>ComputeHash</c> with a new <c>MurmurHash3</c>.
 /// </summary>
 /// <param name="self">Simhash instance the computation is performed on.</param>
 /// <param name="content">Text to hash.</param>
 /// <returns>The result of the underlying <c>ComputeHash</c> call.</returns>
 public static SimhashResult ComputeHashByMurmurHash3(this Simhash self, string content)
 {
     var hasher = new MurmurHash3();
     return self.ComputeHash<MurmurHash3, MurmurHash3Result>(content, hasher);
 }
Beispiel #36
0
 /// <summary>
 /// Copy constructor: takes over the <c>value</c> of an existing fingerprint.
 /// Only <c>value</c> is copied here.
 /// </summary>
 /// <param name="simHash">Fingerprint to copy the value from.</param>
 public Simhash(Simhash simHash)
 {
     this.value = simHash.value;
 }
Beispiel #37
0
 /// <summary>
 /// Computes the hash of a token list by calling <c>ComputeHash</c> with a new
 /// <c>JenkinsHash</c> constructed from <paramref name="seed"/>.
 /// </summary>
 /// <param name="self">Simhash instance the computation is performed on.</param>
 /// <param name="tokens">Tokens to hash.</param>
 /// <param name="seed">Seed passed to the <c>JenkinsHash</c> constructor; defaults to 0.</param>
 /// <returns>The result of the underlying <c>ComputeHash</c> call.</returns>
 public static SimhashResult ComputeHashByJenkins(this Simhash self, List <string> tokens, uint seed = default)
 {
     var hasher = new JenkinsHash(seed);
     return self.ComputeHash<JenkinsHash, JenkinsHashResult>(tokens, hasher);
 }
Beispiel #38
0
 /// <summary>
 /// Computes the hash of raw text by calling <c>ComputeHash</c> with a new <c>Md5Hash</c>.
 /// </summary>
 /// <param name="self">Simhash instance the computation is performed on.</param>
 /// <param name="content">Text to hash.</param>
 /// <returns>The result of the underlying <c>ComputeHash</c> call.</returns>
 public static SimhashResult ComputeHashByMd5(this Simhash self, string content)
 {
     var hasher = new Md5Hash();
     return self.ComputeHash<Md5Hash, Md5HashResult>(content, hasher);
 }
Beispiel #39
0
        /// <summary>
        /// Cuts the fingerprint into consecutive bit slices, one per entry in <c>offsets</c>,
        /// and yields a key "&lt;slice value&gt;,&lt;slice index&gt;" for each slice.
        /// </summary>
        /// <param name="simhash">Fingerprint whose <c>value</c> is sliced.</param>
        /// <returns>A lazily evaluated sequence with one key per offset.</returns>
        private static IEnumerable<string> get_keys(Simhash simhash)
        {
            for (int i = 0; i < offsets.Count; i++)
            {
                // The last slice runs to the end of the fingerprint; every other slice
                // ends where the next offset begins.
                bool isLast = i == offsets.Count - 1;
                int off = isLast ? fpSizeStatic - offsets[i] : offsets[i + 1] - offsets[i];

                // Integer mask of `off` low bits. The earlier double-based Math.Pow(2, off) - 1
                // loses precision above 53 bits and overflows Convert.ToUInt64 at 64 bits.
                // Shifting a ulong uses only the low six bits of the count, hence the
                // explicit ulong.MaxValue for a full-width slice.
                ulong mask = off >= 64 ? ulong.MaxValue : (1UL << off) - 1;
                ulong c = (simhash.value >> offsets[i]) & mask;

                yield return string.Format("{0},{1}", c, i);
            }
        }
Beispiel #40
0
 /// <summary>
 /// Pins the string that <c>hashfunc_hashtostring</c> returns for the input "aaa".
 /// </summary>
 public void test_hashtostringvalue()
 {
     var simHash = new Simhash();
     string val = simHash.hashfunc_hashtostring("aaa");

     // Expected value first, actual second, so a failure message labels them correctly.
     Assert.AreEqual("47bce5c74f589f4867dbd57e9ca9f808", val);
 }
Beispiel #41
0
 /// <summary>
 /// Creates a fingerprint that carries the same <c>value</c> as <paramref name="simHash"/>.
 /// No other member is assigned here.
 /// </summary>
 /// <param name="simHash">Source fingerprint.</param>
 public Simhash(Simhash simHash)
 {
     this.value = simHash.value;
 }
Beispiel #42
0
 /// <summary>
 /// Pins the MD5-based simhash value for the single-feature list { "aaa" }.
 /// </summary>
 public void test_value_control()
 {
     const ulong expected = 7483809945577191432;

     var simHash = new Simhash(hashingType: Simhash.HashingType.MD5);
     simHash.GenerateSimhash(new List<string> { "aaa" });

     Assert.AreEqual(expected, simHash.value);
 }
Beispiel #43
0
 /// <summary>
 /// Materializes the keys that <c>get_keys</c> yields for <paramref name="simhash"/> into a list.
 /// </summary>
 /// <param name="simhash">Fingerprint to derive keys from.</param>
 /// <returns>A new list holding the keys in the order they are produced.</returns>
 public List <string> get_the_keys(Simhash simhash)
 {
     var keys = new List<string>(get_keys(simhash));
     return keys;
 }
Beispiel #44
0
 /// <summary>
 /// Pins the MD5-based simhash value generated directly from a raw string.
 /// </summary>
 public void test_value_by_string()
 {
     const ulong expected = 5683413558821905382;

     var simHash = new Simhash(hashingType: Simhash.HashingType.MD5);
     simHash.GenerateSimhash("aaa bbb test test testing.happy time = -).");

     Assert.AreEqual(expected, simHash.value);
 }
Beispiel #45
0
 /// <summary>
 /// Returns the keys produced by <c>get_keys</c> for <paramref name="simhash"/> as a list.
 /// </summary>
 /// <param name="simhash">Fingerprint to derive keys from.</param>
 /// <returns>A new list holding the keys in the order they are produced.</returns>
 public List<string> get_the_keys(Simhash simhash)
 {
     var keys = new List<string>(get_keys(simhash));
     return keys;
 }