Пример #1
0
        // Checks how many shingles Shingling.Tokenize produces for a sentence
        // when the shingle width is explicitly set to 3.
        public void Tokenize_Width_Three()
        {
            // Arrange
            const string sentence = "This is a test for really cool content. yeah! =)";
            var scratch = new StringBuilder();

            // Act
            var shingles = Shingling.Tokenize(sentence, scratch, width: 3);

            // Assert
            // Expected shingles look like: thi, his, isi, sis, isa, ...
            // NOTE(review): presumably the tokenizer lower-cases the text and drops
            // whitespace/punctuation, leaving 35 letters => 35 - 3 + 1 = 33 shingles;
            // confirm against Shingling.Tokenize.
            Assert.Equal(33, shingles.Count);
        }
Пример #2
0
        // Checks how many shingles Shingling.Tokenize produces for "aaabbb"
        // when no width argument is supplied.
        public void Tokenize_Width_Default()
        {
            // Arrange
            const string text = "aaabbb";
            var scratch = new StringBuilder();

            // Act
            var shingles = Shingling.Tokenize(text, scratch);

            // Assert
            // Expected shingles: aaab, aabb, abbb
            // NOTE(review): this implies a default width of 4 - confirm in Shingling.Tokenize.
            Assert.Equal(3, shingles.Count);
        }
Пример #3
0
        // Builds an index with k = 1, hashes the testData[1] entry again, and
        // expects the near-duplicate lookup to return exactly one result.
        // NOTE(review): the method name mentions Jenkins, but the hash is computed with
        // ComputeHashByMd5 (as it is in SetUpIndex) - confirm which hash was intended.
        public void Get_Near_Dup_Hash_Jenkins_Find_One()
        {
            // Arrange
            var hasher = new SimhashLib.Simhash();
            var nearDupIndex = SetUpIndex(1);

            // Act: tokenize with width 3, the same width SetUpIndex uses for every entry.
            var shingles = Shingling.Tokenize(testData[1], new StringBuilder(), 3);
            var queryHash = hasher.ComputeHashByMd5(shingles);
            var matches = nearDupIndex.GetNearDups(queryHash);

            // Assert
            Assert.Single(matches);
        }
Пример #4
0
        /// <summary>
        /// Builds a <see cref="SimhashIndex"/> from every entry in <c>testData</c>:
        /// each value is tokenized with width 3, hashed via <c>ComputeHashByMd5</c>,
        /// and stored under the entry's key.
        /// </summary>
        /// <param name="kValue">Value passed through as the index's <c>k</c> argument.</param>
        /// <returns>A new index constructed from the computed hashes.</returns>
        private SimhashIndex SetUpIndex(int kValue)
        {
            var hasher = new SimhashLib.Simhash();
            var hashesByKey = new Dictionary<long, SimhashResult>();

            // A single builder is shared across all entries and emptied before each use.
            var scratch = new StringBuilder();

            foreach (var entry in testData)
            {
                scratch.Clear();

                var shingles = Shingling.Tokenize(entry.Value, scratch, 3);
                var hash = hasher.ComputeHashByMd5(shingles);

                hashesByKey.Add(entry.Key, hash);
            }

            return new SimhashIndex(objs: hashesByKey, k: kValue);
        }