// Builds an index (MD5 hashing, k = 10) over a single document and checks the
// list of key strings that get_the_keys returns for that document's hash.
public void test_get_keys()
        {
            var testdata = new Dictionary<long, string>
            {
                { 1, "How are you? I Am fine. blar blar blar blar blar Thanks." }
            };

            var simHashObjs = new Dictionary<long, Simhash>();
            foreach (var it in testdata)
            {
                var simHash = new Simhash(hashingType: Simhash.HashingType.MD5);
                simHash.GenerateSimhash(it.Value);
                simHashObjs.Add(it.Key, simHash);
            }

            var simHashIndex = new SimhashIndex(objs: simHashObjs, k: 10);
            var listOfKeys = simHashIndex.get_the_keys(simHashObjs[1]);

            // Expected keys in order; each one ends in ",<its position in the list>".
            string[] expectedKeys =
            {
                "26,0", "3,1", "7,2", "12,3", "17,4", "0,5",
                "13,6", "30,7", "1,8", "14,9", "7496,10"
            };

            // AreEqual (instead of IsTrue on a comparison) reports the actual count on failure.
            Assert.AreEqual(expectedKeys.Length, listOfKeys.Count);
            for (int i = 0; i < expectedKeys.Length; i++)
            {
                Assert.AreEqual(expectedKeys[i], listOfKeys[i]);
            }
        }
        // Exercises get_near_dups on the shared index field while entry 1 is deleted
        // twice and then added twice, checking the near-duplicate count after each step.
        public void test_get_near_dup_hash()
        {
            var probe = new Simhash(hashingType: Simhash.HashingType.MD5);
            probe.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

            // Index as prepared before the test: three near-duplicates expected.
            var matches = index.get_near_dups(probe);
            Assert.AreEqual(3, matches.Count);

            // First delete of entry 1: one match fewer.
            var firstDelete = new Simhash(hashingType: Simhash.HashingType.MD5);
            firstDelete.GenerateSimhash(testData[1]);
            index.delete(1, firstDelete);
            matches = index.get_near_dups(probe);
            Assert.AreEqual(2, matches.Count);

            // Second delete of the same entry: count stays the same.
            var secondDelete = new Simhash(hashingType: Simhash.HashingType.MD5);
            secondDelete.GenerateSimhash(testData[1]);
            index.delete(1, secondDelete);
            matches = index.get_near_dups(probe);
            Assert.AreEqual(2, matches.Count);

            // First add of entry 1: back to three matches.
            var firstAdd = new Simhash(hashingType: Simhash.HashingType.MD5);
            firstAdd.GenerateSimhash(testData[1]);
            index.add(1, firstAdd);
            matches = index.get_near_dups(probe);
            Assert.AreEqual(3, matches.Count);

            // Second add of the same entry: count stays at three.
            var secondAdd = new Simhash(hashingType: Simhash.HashingType.MD5);
            secondAdd.GenerateSimhash(testData[1]);
            index.add(1, secondAdd);
            matches = index.get_near_dups(probe);
            Assert.AreEqual(3, matches.Count);
        }
 // Builds an index with k = 1 through setUpIndex and checks that an unrelated
 // sentence (hashed with the default Simhash constructor) has no near-duplicates.
 public void test_get_near_dup_hash_jenkins_not_close()
 {
     var tightIndex = setUpIndex(1);

     var unrelated = new Simhash();
     unrelated.GenerateSimhash("This is not even close to the text that is loaded by default");

     Assert.AreEqual(0, tightIndex.get_near_dups(unrelated).Count);
 }
        // Builds an index with k = 1 through setUpIndex, hashes testData[1] from its
        // width-3 shingles, and checks that exactly one near-duplicate is reported.
        public void test_get_near_dup_hash_jenkins_find_one()
        {
            var tightIndex = setUpIndex(1);

            var probe = new Simhash();
            var shingler = new Shingling();
            probe.GenerateSimhash(shingler.tokenize(testData[1], 3));

            Assert.AreEqual(1, tightIndex.get_near_dups(probe).Count);
        }
        // Creates a SimhashIndex over every entry of testData.
        // Each text is tokenized with Shingling.tokenize(text, 3) and hashed with a
        // default-constructed Simhash; kValue is passed through as the index's k.
        private SimhashIndex setUpIndex(int kValue)
        {
            var hashesById = new Dictionary<long, Simhash>();

            foreach (var entry in testData)
            {
                var hash = new Simhash();
                var tokens = new Shingling().tokenize(entry.Value, 3);
                hash.GenerateSimhash(tokens);
                hashesById.Add(entry.Key, hash);
            }

            return new SimhashIndex(objs: hashesById, k: kValue);
        }
        // Fills the shared fixture: four sample texts in testData, their MD5-based
        // Simhash objects in objs, and an index over objs with k = 10.
        public void setUp()
        {
            // Start from empty collections so that running setUp more than once on the
            // same instance does not throw on duplicate keys in Add.
            testData.Clear();
            objs.Clear();

            testData.Add(1, "How are you? I Am fine. blar blar blar blar blar Thanks.");
            testData.Add(2, "How are you i am fine. blar blar blar blar blar than");
            testData.Add(3, "This is simhash test.");
            testData.Add(4, "How are you i am fine. blar blar blar blar blar thank1");

            foreach (var it in testData)
            {
                var simHash = new Simhash(hashingType: Simhash.HashingType.MD5);
                simHash.GenerateSimhash(it.Value);
                objs.Add(it.Key, simHash);
            }

            index = new SimhashIndex(objs: objs, k: 10);
        }
        // Example #7
        // Checks Simhash.distance: positive for two different sentences, zero between
        // a hash and a copy constructed from it, and non-zero against the hash of "1".
        public void test_distance()
        {
            var first = new Simhash();
            first.GenerateSimhash("How are you? I AM fine. Thanks. And you?");
            var second = new Simhash();
            second.GenerateSimhash("How old are you? :-) i am fine. Thanks. And you?");

            // Different texts must be some distance apart.
            Assert.IsTrue(first.distance(second) > 0);

            // A copy made through the Simhash(Simhash) constructor is at distance zero.
            var copyOfSecond = new Simhash(second);
            Assert.AreEqual(0, second.distance(copyOfSecond));

            var single = new Simhash();
            single.GenerateSimhash("1");
            Assert.AreNotEqual(0, single.distance(copyOfSecond));
        }
        // Example #8
        // Checks that two Chinese strings differing only in punctuation and spacing are at
        // distance zero, and that three close English variants are within distance 3.
        public void test_chinese()
        {
            var sh = new Simhash();
            sh.GenerateSimhash("你好 世界!  呼噜。");
            var sh2 = new Simhash();
            sh2.GenerateSimhash("你好,世界呼噜");
            // Expected value goes first so a failure message reports the values the right way round.
            Assert.AreEqual(0, sh.distance(sh2));

            var sh4 = new Simhash();
            sh4.GenerateSimhash("How are you? I Am fine. ablar ablar xyz blar blar blar blar blar blar blar Thanks.");
            var sh5 = new Simhash();
            sh5.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar than");
            var sh6 = new Simhash();
            sh6.GenerateSimhash("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

            Assert.IsTrue(sh4.distance(sh6) < 3);
            Assert.IsTrue(sh5.distance(sh6) < 3);
        }
        // Example #9
        // Hashes six short strings and checks that no two of them get the same Simhash value.
        public void test_short()
        {
            var inputs = new List<string> { "aa", "aaa", "aaaa", "aaaab", "aaaaabb", "aaaaabbb" };

            var shs = new List<Simhash>();
            foreach (string s in inputs)
            {
                var simHash = new Simhash();
                simHash.GenerateSimhash(s);
                shs.Add(simHash);
            }

            // Compare the computed values of every distinct pair (selected by position).
            // The earlier form asserted AreNotEqual on the Simhash objects under an
            // `sh1 != sh2` guard, which cannot fail unless Simhash defines value
            // equality - NOTE(review): confirm whether Simhash overrides Equals.
            for (int i = 0; i < shs.Count; i++)
            {
                for (int j = i + 1; j < shs.Count; j++)
                {
                    Assert.AreNotEqual(shs[i].value, shs[j].value);
                }
            }
        }
 // Example #10
 // Pins the MD5-based Simhash value produced from a raw sentence.
 public void test_value_by_string()
 {
     const ulong expected = 5683413558821905382;

     var hash = new Simhash(hashingType: Simhash.HashingType.MD5);
     hash.GenerateSimhash("aaa bbb test test testing.happy time = -).");

     Assert.AreEqual(expected, hash.value);
 }
 // Example #11
 // Pins the MD5-based Simhash value produced from the feature list { "aaa", "bbb" }.
 public void test_value()
 {
     const ulong expected = 8637903533912358349;

     var hash = new Simhash(hashingType: Simhash.HashingType.MD5);
     hash.GenerateSimhash(new List<string> { "aaa", "bbb" });

     Assert.AreEqual(expected, hash.value);
 }
 // Example #12
 // Pins the MD5-based Simhash value produced from the single-feature list { "aaa" }.
 public void test_value_control()
 {
     const ulong expected = 7483809945577191432;

     var hash = new Simhash(hashingType: Simhash.HashingType.MD5);
     hash.GenerateSimhash(new List<string> { "aaa" });

     Assert.AreEqual(expected, hash.value);
 }