Beispiel #1
0
        /// <summary>
        /// Finds stored entries that share at least one key with
        /// <paramref name="simhash"/> and returns the ids of those whose
        /// fingerprint lies within <c>kDistance</c> of it.
        /// </summary>
        /// <param name="simhash">
        /// The fingerprint to search for. Its <c>fpSize</c> must equal this
        /// instance's <c>fpSize</c>.
        /// </param>
        /// <returns>
        /// A set of distinct object ids (type <c>long</c>, for now); empty when
        /// nothing matches.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="simhash"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="simhash"/> has a different <c>fpSize</c> than this instance.
        /// </exception>
        public HashSet<long> get_near_dups(Simhash simhash)
        {
            if (object.ReferenceEquals(simhash, null))
            {
                throw new ArgumentNullException("simhash");
            }

            if (simhash.fpSize != this.fpSize)
            {
                throw new ArgumentException(
                    "Simhash fpSize (" + simhash.fpSize + ") does not match the expected fpSize (" + this.fpSize + ").",
                    "simhash");
            }

            var ans = new HashSet<long>();

            foreach (string key in get_keys(simhash))
            {
                if (!bucket.ContainsKey(key))
                {
                    continue;
                }

                foreach (var dup in bucket[key])
                {
                    // Each stored entry is read as "<fingerprint>,<obj_id>".
                    string[] parts = dup.Split(',');
                    ulong fp = Convert.ToUInt64(parts[0]);
                    long obj_id = Convert.ToInt64(parts[1]);

                    // Sharing a key only makes the entry a candidate; the real
                    // distance decides whether it counts as a near duplicate.
                    int d = simhash.distance(new Simhash(fp));
                    if (d <= kDistance)
                    {
                        ans.Add(obj_id);
                    }
                }
            }

            return ans;
        }
Beispiel #2
0
        // Checks Simhash.distance on three cases: two different sentences,
        // a fingerprint against its own copy, and an unrelated short input.
        public void test_distance()
        {
            var first = new Simhash();
            first.GenerateSimhash("How are you? I AM fine. Thanks. And you?");

            var second = new Simhash();
            second.GenerateSimhash("How old are you? :-) i am fine. Thanks. And you?");

            // The two sentences differ, so a non-zero distance is expected.
            int differentTexts = first.distance(second);
            Assert.IsTrue(differentTexts > 0);

            // A Simhash built from another Simhash must be at distance zero from it.
            var secondCopy = new Simhash(second);
            int copyDistance = second.distance(secondCopy);
            Assert.AreEqual(0, copyDistance);

            // An unrelated input must not collide with the copied fingerprint.
            var unrelated = new Simhash();
            unrelated.GenerateSimhash("1");
            Assert.AreNotEqual(0, unrelated.distance(secondCopy));
        }
Beispiel #3
0
        // Checks that two Chinese strings differing only in whitespace and
        // punctuation yield identical fingerprints, and that English sentences
        // differing only in case, punctuation and a truncated last word stay
        // within a distance of 3 of each other.
        public void test_chinese()
        {
            var sh = new Simhash();
            sh.GenerateSimhash("你好 世界!  呼噜。");
            var sh2 = new Simhash();
            sh2.GenerateSimhash("你好,世界呼噜");

            // Expected value goes first so a failure reports expected/actual correctly.
            Assert.AreEqual(0, sh.distance(sh2));

            var sh4 = new Simhash();
            sh4.GenerateSimhash("How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.");
            var sh5 = new Simhash();
            sh5.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than");
            var sh6 = new Simhash();
            sh6.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

            Assert.IsTrue(sh4.distance(sh6) < 3);
            Assert.IsTrue(sh5.distance(sh6) < 3);
        }
        /// <summary>
        /// Returns the ids of stored entries that share at least one key with
        /// <paramref name="simhash"/> and whose fingerprint is at most
        /// <c>kDistance</c> away from it.
        /// </summary>
        /// <param name="simhash">
        /// The fingerprint to look up. Its <c>fpSize</c> must match this
        /// instance's <c>fpSize</c>.
        /// </param>
        /// <returns>
        /// A set of distinct object ids (type <c>long</c>, for now); empty when
        /// no stored entry is close enough.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="simhash"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="simhash"/> has a different <c>fpSize</c> than this instance.
        /// </exception>
        public HashSet<long> get_near_dups(Simhash simhash)
        {
            if (object.ReferenceEquals(simhash, null))
            {
                throw new ArgumentNullException("simhash");
            }

            if (simhash.fpSize != this.fpSize)
            {
                throw new ArgumentException(
                    "Simhash fpSize (" + simhash.fpSize + ") does not match the expected fpSize (" + this.fpSize + ").",
                    "simhash");
            }

            var ans = new HashSet<long>();

            foreach (string key in get_keys(simhash))
            {
                if (!bucket.ContainsKey(key))
                {
                    continue;
                }

                foreach (var dup in bucket[key])
                {
                    // Each stored entry is read as "<fingerprint>,<obj_id>".
                    string[] parts = dup.Split(',');
                    ulong fp = Convert.ToUInt64(parts[0]);
                    long obj_id = Convert.ToInt64(parts[1]);

                    // A shared key only nominates a candidate; keep it only if
                    // the actual distance is within the configured tolerance.
                    int d = simhash.distance(new Simhash(fp));
                    if (d <= kDistance)
                    {
                        ans.Add(obj_id);
                    }
                }
            }

            return ans;
        }