Beispiel #1
0
        /// <summary>
        /// Checks that the number of near duplicates reported by the shared index
        /// follows deletions and re-insertions of entry 1, including repeated ones.
        /// </summary>
        public void Get_Near_Dup_Hash()
        {
            var hasher = new SimhashLib.Simhash();

            var probe = hasher.ComputeHashByMd5("How are you i am fine.ablar ablar xyz blar blar blar blar blar blar blar thank");

            // Initial state of the shared index: three matches are expected for the probe.
            var matches = index.GetNearDups(probe);
            Assert.Equal(3, matches.Count);

            // Removing entry 1 is expected to drop one match.
            var firstRemoval = hasher.ComputeHashByMd5(testData[1]);
            index.Delete(1, firstRemoval);
            matches = index.GetNearDups(probe);
            Assert.Equal(2, matches.Count);

            // Removing the same entry a second time is expected to leave the count unchanged.
            var secondRemoval = hasher.ComputeHashByMd5(testData[1]);
            index.Delete(1, secondRemoval);
            matches = index.GetNearDups(probe);
            Assert.Equal(2, matches.Count);

            // Putting entry 1 back is expected to restore the original count.
            var firstInsert = hasher.ComputeHashByMd5(testData[1]);
            index.Add(1, firstInsert);
            matches = index.GetNearDups(probe);
            Assert.Equal(3, matches.Count);

            // Adding the same entry again is expected not to produce an extra match.
            var secondInsert = hasher.ComputeHashByMd5(testData[1]);
            index.Add(1, secondInsert);
            matches = index.GetNearDups(probe);
            Assert.Equal(3, matches.Count);
        }
Beispiel #2
0
        /// <summary>
        /// Builds an index with k = 10 from a single hashed sentence and checks the
        /// list of keys that <c>GetListKeys</c> returns for that entry's hash.
        /// </summary>
        public void Get_Keys()
        {
            var simhash = new SimhashLib.Simhash();

            var testdata = new Dictionary<long, string>
            {
                { 1, "How are you? I Am fine. blar blar blar blar blar Thanks." },
            };

            var simHashObjs = new Dictionary<long, SimhashResult>();

            foreach (var it in testdata)
            {
                simHashObjs.Add(it.Key, simhash.ComputeHashByMd5(it.Value));
            }

            var simHashIndex = new SimhashIndex(objs: simHashObjs, k: 10);
            var listOfKeys   = simHashIndex.GetListKeys(simHashObjs[1]);

            // Assert.Equal reports the actual count on failure, unlike Assert.True(count == 11).
            Assert.Equal(11, listOfKeys.Count);

            // Each expected key has the form "<value>,<position>" with positions 0 through 10.
            Assert.Equal("26,0", listOfKeys[0]);
            Assert.Equal("3,1", listOfKeys[1]);
            Assert.Equal("7,2", listOfKeys[2]);
            Assert.Equal("12,3", listOfKeys[3]);
            Assert.Equal("17,4", listOfKeys[4]);
            Assert.Equal("0,5", listOfKeys[5]);
            Assert.Equal("13,6", listOfKeys[6]);
            Assert.Equal("30,7", listOfKeys[7]);
            Assert.Equal("1,8", listOfKeys[8]);
            Assert.Equal("14,9", listOfKeys[9]);
            Assert.Equal("7496,10", listOfKeys[10]);
        }
Beispiel #3
0
        /// <summary>
        /// Queries an index built with k = 1 using a sentence unrelated to the test
        /// data and expects no near duplicates to be returned.
        /// </summary>
        public void Get_Near_Dup_Hash_Jenkins_Not_Close()
        {
            var hasher = new SimhashLib.Simhash();
            var tightIndex = SetUpIndex(1);

            // NOTE(review): the method name mentions Jenkins but the hash is computed with MD5 - confirm intent.
            var unrelated = hasher.ComputeHashByMd5("This is not even close to the text that is loaded by default");
            var matches = tightIndex.GetNearDups(unrelated);

            Assert.Empty(matches);
        }
Beispiel #4
0
        /// <summary>
        /// Tokenizes test entry 1 the same way <c>SetUpIndex</c> does, hashes the
        /// features, and expects exactly one near duplicate from an index built with k = 1.
        /// </summary>
        public void Get_Near_Dup_Hash_Jenkins_Find_One()
        {
            var hasher = new SimhashLib.Simhash();
            var tightIndex = SetUpIndex(1);

            // Same Tokenize arguments (fresh builder, 3) as used when the index was populated.
            var shingles = Shingling.Tokenize(testData[1], new StringBuilder(), 3);
            var probe = hasher.ComputeHashByMd5(shingles);

            var matches = tightIndex.GetNearDups(probe);

            Assert.Single(matches);
        }
Beispiel #5
0
        /// <summary>
        /// Builds a <c>SimhashIndex</c> from every entry in <c>testData</c>, hashing the
        /// features that <c>Shingling.Tokenize</c> produces for each text.
        /// </summary>
        /// <param name="kValue">Value passed as the <c>k</c> argument of the index constructor.</param>
        /// <returns>A new index populated with one hash per test-data entry.</returns>
        private SimhashIndex SetUpIndex(int kValue)
        {
            var hasher = new SimhashLib.Simhash();
            var hashesById = new Dictionary<long, SimhashResult>();

            // One builder is shared across iterations and reset before each use.
            var scratch = new StringBuilder();

            foreach (var entry in testData)
            {
                scratch.Clear();
                var shingles = Shingling.Tokenize(entry.Value, scratch, 3);
                var hashed = hasher.ComputeHashByMd5(shingles);
                hashesById.Add(entry.Key, hashed);
            }

            return new SimhashIndex(objs: hashesById, k: kValue);
        }
Beispiel #6
0
        /// <summary>
        /// Populates <c>testData</c> with four sample sentences, stores an MD5-based
        /// simhash of each in <c>objs</c>, and builds the shared <c>index</c> with k = 10.
        /// </summary>
        public TestSimhashIndexMD5()
        {
            // Sample sentences keyed by id.
            testData.Add(1, "How are you? I Am fine. blar blar blar blar blar Thanks.");
            testData.Add(2, "How are you i am fine. blar blar blar blar blar than");
            testData.Add(3, "This is simhash test.");
            testData.Add(4, "How are you i am fine. blar blar blar blar blar thank1");

            var hasher = new SimhashLib.Simhash();

            foreach (var entry in testData)
            {
                var hashed = hasher.ComputeHashByMd5(entry.Value);
                objs.Add(entry.Key, hashed);
            }

            index = new SimhashIndex(objs: objs, k: 10);
        }