コード例 #1
0
        // Verifies that several short, distinct strings produce distinct simhash values.
        // Each input is hashed with a default-constructed Simhash, then every pair of
        // different inputs is required to have a different fingerprint value.
        public void test_short()
        {
            List <string> inputs = new List <string>()
            {
                "aa", "aaa", "aaaa", "aaaab", "aaaaabb", "aaaaabbb"
            };
            List <Simhash> hashes = new List <Simhash>();

            foreach (string input in inputs)
            {
                var simHash = new Simhash();
                simHash.GenerateSimhash(input);
                hashes.Add(simHash);
            }

            // Pair entries by position and compare the raw fingerprint values. Comparing
            // the Simhash objects themselves would make the outcome depend on whether the
            // type defines value equality, and could let the check pass trivially.
            for (int i = 0; i < hashes.Count; i++)
            {
                for (int j = 0; j < hashes.Count; j++)
                {
                    if (i != j)
                    {
                        Assert.AreNotEqual(hashes[i].value, hashes[j].value);
                    }
                }
            }
        }
コード例 #2
0
        // Builds a one-entry index (k = 10) from an MD5-configured simhash and checks
        // that get_the_keys returns the expected eleven bucket keys, in order.
        public void test_get_keys()
        {
            Dictionary <long, string> testdata = new Dictionary <long, string>();

            testdata.Add(1, "How are you? I Am fine. blar blar blar blar blar Thanks.");

            Dictionary <long, Simhash> simHashObjs = new Dictionary <long, Simhash>();

            foreach (var it in testdata)
            {
                var simHash = new Simhash(hashingType: Simhash.HashingType.MD5);
                simHash.GenerateSimhash(it.Value);
                simHashObjs.Add(it.Key, simHash);
            }
            var simHashIndex = new SimhashIndex(objs: simHashObjs, k: 10);
            var listOfKeys   = simHashIndex.get_the_keys(simHashObjs[1]);

            // Expected keys in the order the index is required to return them.
            string[] expectedKeys =
            {
                "26,0", "3,1", "7,2", "12,3", "17,4", "0,5",
                "13,6", "30,7", "1,8", "14,9", "7496,10"
            };

            // AreEqual (rather than IsTrue on a comparison) reports the actual count on failure.
            Assert.AreEqual(expectedKeys.Length, listOfKeys.Count);

            for (int i = 0; i < expectedKeys.Length; i++)
            {
                Assert.AreEqual(expectedKeys[i], listOfKeys[i]);
            }
        }
コード例 #3
0
        // Hashes a fixed sentence with an MD5-configured Simhash and compares the
        // resulting value against a known fingerprint.
        public void test_value_by_string()
        {
            ulong expected = 5683413558821905382;

            var hasher = new Simhash(hashingType: Simhash.HashingType.MD5);
            hasher.GenerateSimhash("aaa bbb test test testing.happy time = -).");

            Assert.AreEqual(expected, hasher.value);
        }
コード例 #4
0
        // Queries an index built with k = 1 using text unrelated to the indexed data
        // and expects no near-duplicates to be reported.
        public void test_get_near_dup_hash_jenkins_not_close()
        {
            SimhashIndex nearDupIndex = setUpIndex(1);

            var unrelated = new Simhash();
            unrelated.GenerateSimhash("This is not even close to the text that is loaded by default");

            var matches = nearDupIndex.get_near_dups(unrelated);
            Assert.AreEqual(0, matches.Count);
        }
コード例 #5
0
        // Hashes a single-feature list with an MD5-configured Simhash and compares
        // the resulting value against a known fingerprint.
        public void test_value_control()
        {
            ulong expected = 7483809945577191432;
            var singleFeature = new List<string> { "aaa" };

            var hasher = new Simhash(hashingType: Simhash.HashingType.MD5);
            hasher.GenerateSimhash(singleFeature);

            Assert.AreEqual(expected, hasher.value);
        }
コード例 #6
0
        // Hashes a two-feature list with an MD5-configured Simhash and compares
        // the resulting value against a known fingerprint.
        public void test_value()
        {
            ulong expected = 8637903533912358349;
            var featurePair = new List<string> { "aaa", "bbb" };

            var hasher = new Simhash(hashingType: Simhash.HashingType.MD5);
            hasher.GenerateSimhash(featurePair);

            Assert.AreEqual(expected, hasher.value);
        }
コード例 #7
0
        // Queries an index built with k = 1 using the shingled text of entry 1
        // and expects exactly one near-duplicate to be reported.
        public void test_get_near_dup_hash_jenkins_find_one()
        {
            SimhashIndex nearDupIndex = setUpIndex(1);

            var query     = new Simhash();
            var tokenizer = new Shingling();

            // Tokenize with width 3, the same width setUpIndex uses for the indexed entries.
            var shingles = tokenizer.tokenize(testData[1], 3);
            query.GenerateSimhash(shingles);

            var matches = nearDupIndex.get_near_dups(query);
            Assert.AreEqual(1, matches.Count);
        }
コード例 #8
0
        // Builds a SimhashIndex over every entry in testData.
        // Each entry's text is tokenized with width 3 and hashed with a
        // default-constructed Simhash; kValue is forwarded as the index's k argument.
        private SimhashIndex setUpIndex(int kValue)
        {
            var hashesById = new Dictionary<long, Simhash>();

            foreach (var entry in testData)
            {
                var tokenizer = new Shingling();
                var shingles  = tokenizer.tokenize(entry.Value, 3);

                var hash = new Simhash();
                hash.GenerateSimhash(shingles);

                hashesById.Add(entry.Key, hash);
            }

            var built = new SimhashIndex(objs: hashesById, k: kValue);
            return built;
        }
コード例 #9
0
        // Populates the shared fixture state: four sample texts go into testData,
        // an MD5-configured Simhash of each goes into objs, and index is rebuilt
        // over objs with k = 10.
        public void setUp()
        {
            testData.Add(1, "How are you? I Am fine. blar blar blar blar blar Thanks.");
            testData.Add(2, "How are you i am fine. blar blar blar blar blar than");
            testData.Add(3, "This is simhash test.");
            testData.Add(4, "How are you i am fine. blar blar blar blar blar thank1");

            foreach (var entry in testData)
            {
                var hash = new Simhash(hashingType: Simhash.HashingType.MD5);

                // Hash the raw text directly; no explicit tokenization step here.
                hash.GenerateSimhash(entry.Value);
                objs.Add(entry.Key, hash);
            }

            index = new SimhashIndex(objs: objs, k: 10);
        }
コード例 #10
0
        // Exercises Simhash.distance: two different sentences are a positive distance
        // apart, a Simhash constructed from another is at distance 0 from it, and an
        // unrelated short input is at a non-zero distance.
        public void test_distance()
        {
            var first = new Simhash();
            first.GenerateSimhash("How are you? I AM fine. Thanks. And you?");

            var second = new Simhash();
            second.GenerateSimhash("How old are you? :-) i am fine. Thanks. And you?");

            int differentTextDistance = first.distance(second);
            Assert.IsTrue(differentTextDistance > 0);

            // A Simhash built from an existing one must match it exactly.
            var secondCopy       = new Simhash(second);
            int copyDistance     = second.distance(secondCopy);
            Assert.AreEqual(0, copyDistance);

            var digitOnly = new Simhash();
            digitOnly.GenerateSimhash("1");

            Assert.AreNotEqual(0, digitOnly.distance(secondCopy));
        }
コード例 #11
0
        // Checks simhash behavior on non-ASCII text and on near-identical sentences.
        // Two Chinese strings that differ only in punctuation and whitespace must be
        // at distance 0; three English sentences that differ in case, punctuation and
        // the final word must be within a distance of less than 3 of each other.
        public void test_chinese()
        {
            var sh = new Simhash();

            sh.GenerateSimhash("你好 世界!  呼噜。");
            var sh2 = new Simhash();

            sh2.GenerateSimhash("你好,世界呼噜");

            // Expected value goes first so a failure message reports the values correctly.
            Assert.AreEqual(0, sh.distance(sh2));

            var sh4 = new Simhash();

            sh4.GenerateSimhash("How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.");
            var sh5 = new Simhash();

            sh5.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than");
            var sh6 = new Simhash();

            sh6.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

            Assert.IsTrue(sh4.distance(sh6) < 3);
            Assert.IsTrue(sh5.distance(sh6) < 3);
        }
コード例 #12
0
        // Exercises get_near_dups together with delete and add on the shared index.
        // The same query is re-run after each mutation: 3 matches initially, 2 after
        // deleting entry 1, still 2 after deleting it again, 3 after adding it back,
        // and still 3 after adding it a second time.
        public void test_get_near_dup_hash()
        {
            var query = new Simhash(hashingType: Simhash.HashingType.MD5);
            query.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

            var matches = index.get_near_dups(query);
            Assert.AreEqual(3, matches.Count);

            // First delete of entry 1: one fewer match.
            var firstDelete = new Simhash(hashingType: Simhash.HashingType.MD5);
            firstDelete.GenerateSimhash(testData[1]);
            index.delete(1, firstDelete);

            matches = index.get_near_dups(query);
            Assert.AreEqual(2, matches.Count);

            // Deleting the same entry again must not change the result.
            var secondDelete = new Simhash(hashingType: Simhash.HashingType.MD5);
            secondDelete.GenerateSimhash(testData[1]);
            index.delete(1, secondDelete);

            matches = index.get_near_dups(query);
            Assert.AreEqual(2, matches.Count);

            // Adding entry 1 back restores the original match count.
            var firstAdd = new Simhash(hashingType: Simhash.HashingType.MD5);
            firstAdd.GenerateSimhash(testData[1]);
            index.add(1, firstAdd);

            matches = index.get_near_dups(query);
            Assert.AreEqual(3, matches.Count);

            // Adding the same entry again must not produce an extra match.
            var secondAdd = new Simhash(hashingType: Simhash.HashingType.MD5);
            secondAdd.GenerateSimhash(testData[1]);
            index.add(1, secondAdd);

            matches = index.get_near_dups(query);
            Assert.AreEqual(3, matches.Count);
        }