Beispiel #1
0
        /// <summary>
        /// Tokenizes a sample sentence with an explicit width of 3 and checks
        /// that 33 pieces are produced.
        /// </summary>
        public void test_tokenize_width_three()
        {
            var tokenizer = new Shingling();

            List<string> shingles = tokenizer.tokenize(
                "This is a test for really cool content. yeah! =)",
                width: 3);

            // Expected pieces begin: thi, his, isi, sis, isa, ...
            // NOTE(review): that sequence suggests whitespace is dropped before
            // shingling - confirm against Shingling.tokenize.
            Assert.AreEqual(33, shingles.Count);
        }
Beispiel #2
0
        /// <summary>
        /// Tokenizes "aaabbb" without passing a width and checks that three
        /// pieces are produced.
        /// </summary>
        public void test_tokenize_width_default()
        {
            var tokenizer = new Shingling();

            List<string> shingles = tokenizer.tokenize("aaabbb");

            // Expected pieces: aaab, aabb, abbb
            // (presumably the default width is 4 - verify in Shingling.tokenize).
            Assert.AreEqual(3, shingles.Count);
        }
Beispiel #3
0
        /// <summary>
        /// Builds an index via <c>setUpIndex(1)</c>, generates a simhash from the
        /// tokenized <c>testData[1]</c> entry, and checks that the near-duplicate
        /// lookup returns exactly one result.
        /// </summary>
        public void test_get_near_dup_hash_jenkins_find_one()
        {
            var nearDupIndex = setUpIndex(1);

            // Hash the probe entry; the second tokenize argument (3) is
            // presumably the shingle width - the same value setUpIndex uses.
            var probe     = new Simhash();
            var tokenizer = new Shingling();
            var shingles  = tokenizer.tokenize(testData[1], 3);
            probe.GenerateSimhash(shingles);

            var matches = nearDupIndex.get_near_dups(probe);

            Assert.AreEqual(1, matches.Count);
        }
Beispiel #4
0
        /// <summary>
        /// Builds a <c>SimhashIndex</c> containing one simhash per entry of
        /// <c>testData</c>, keyed by the entry's key.
        /// </summary>
        /// <param name="kValue">Value forwarded as the <c>k</c> argument of the index constructor.</param>
        /// <returns>The index constructed over all hashed entries.</returns>
        private SimhashIndex setUpIndex(int kValue)
        {
            var hashesByKey = new Dictionary<long, Simhash>();

            foreach (var entry in testData)
            {
                // A fresh Simhash and Shingling are created for every entry.
                var hash   = new Simhash();
                var tokens = new Shingling().tokenize(entry.Value, 3);
                hash.GenerateSimhash(tokens);

                hashesByKey.Add(entry.Key, hash);
            }

            return new SimhashIndex(objs: hashesByKey, k: kValue);
        }