示例#1
0
        /// <summary>
        /// Verifies that <c>scrub</c> turns a sentence containing spaces and
        /// punctuation into the expected uninterrupted run of letters.
        /// </summary>
        public void test_clean()
        {
            const string input    = "aaa bbb test test testing. happy time =-).";
            const string expected = "aaabbbtesttesttestinghappytime";

            string actual = new Shingling().scrub(input);

            Assert.AreEqual(expected, actual);
        }
示例#2
0
        /// <summary>
        /// Verifies that sliding a window of width 4 across "aaabbb" yields
        /// three pieces.
        /// </summary>
        public void Slide()
        {
            // Expected windows: aaab, aabb, abbb
            const int expectedCount = 3;

            var windows = Shingling.Slide("aaabbb", width: 4);

            Assert.Equal(expectedCount, windows.Count);
        }
示例#3
0
        /// <summary>
        /// Verifies that tokenizing a sample sentence with an explicit width of 3
        /// produces 33 shingles.
        /// </summary>
        public void Tokenize_Width_Three()
        {
            const string text = "This is a test for really cool content. yeah! =)";
            var buffer = new StringBuilder();

            var shingles = Shingling.Tokenize(text, buffer, width: 3);

            // Expected shingles begin: thi, his, isi, sis, isa, ...
            Assert.Equal(33, shingles.Count);
        }
示例#4
0
        /// <summary>
        /// Verifies that tokenizing "aaabbb" without specifying a width yields
        /// three shingles.
        /// </summary>
        public void Tokenize_Width_Default()
        {
            var buffer = new StringBuilder();

            var shingles = Shingling.Tokenize("aaabbb", buffer);

            // Expected shingles: aaab, aabb, abbb
            // (presumably the default width is 4 -- confirm against Shingling)
            Assert.Equal(3, shingles.Count);
        }
示例#5
0
        /// <summary>
        /// Verifies that the instance <c>tokenize</c> method, given a width of 3,
        /// returns 33 shingles for the sample sentence.
        /// </summary>
        public void test_tokenize_width_three()
        {
            const string text = "This is a test for really cool content. yeah! =)";

            List<string> shingles = new Shingling().tokenize(text, width: 3);

            // Expected shingles begin: thi, his, isi, sis, isa, ...
            Assert.AreEqual(33, shingles.Count);
        }
示例#6
0
        /// <summary>
        /// Verifies that the instance <c>tokenize</c> method, called without a
        /// width, returns three shingles for "aaabbb".
        /// </summary>
        public void test_tokenize_width_default()
        {
            List<string> shingles = new Shingling().tokenize("aaabbb");

            // Expected shingles: aaab, aabb, abbb
            // (presumably the default width is 4 -- confirm against Shingling)
            Assert.AreEqual(3, shingles.Count);
        }
示例#7
0
        /// <summary>
        /// Verifies that the instance <c>slide</c> method with a width of 4
        /// splits "aaabbb" into three pieces.
        /// </summary>
        public void test_slide()
        {
            // Expected windows: aaab, aabb, abbb
            const int expectedCount = 3;

            List<string> windows = new Shingling().slide("aaabbb", width: 4);

            Assert.AreEqual(expectedCount, windows.Count);
        }
示例#8
0
        /// <summary>
        /// Builds an index with k = 1, hashes the width-3 shingles of
        /// <c>testData[1]</c>, and asserts that the index reports exactly one
        /// near duplicate for that hash.
        /// </summary>
        public void Get_Near_Dup_Hash_Jenkins_Find_One()
        {
            // NOTE(review): the test name mentions Jenkins, but the hash is computed
            // with ComputeHashByMd5 (as it is in SetUpIndex) -- confirm the intent.
            var hasher = new SimhashLib.Simhash();
            var nearDupIndex = SetUpIndex(1);

            var shingles = Shingling.Tokenize(testData[1], new StringBuilder(), 3);
            var fingerprint = hasher.ComputeHashByMd5(shingles);

            Assert.Single(nearDupIndex.GetNearDups(fingerprint));
        }
示例#9
0
        /// <summary>
        /// Builds an index with k = 1, generates a simhash from the width-3
        /// shingles of <c>testData[1]</c>, and asserts that the index returns
        /// exactly one near duplicate for it.
        /// </summary>
        public void test_get_near_dup_hash_jenkins_find_one()
        {
            var nearDupIndex = setUpIndex(1);

            var fingerprint = new Simhash();
            var shingles    = new Shingling().tokenize(testData[1], 3);
            fingerprint.GenerateSimhash(shingles);

            var matches = nearDupIndex.get_near_dups(fingerprint);

            Assert.AreEqual(1, matches.Count);
        }
示例#10
0
        /// <summary>
        /// Creates a <c>SimhashIndex</c> over every entry in <c>testData</c>.
        /// Each entry's value is tokenized with width 3, turned into a simhash,
        /// and stored under the entry's key.
        /// </summary>
        /// <param name="kValue">Value forwarded as the <c>k</c> argument of the index constructor.</param>
        /// <returns>The index built from the hashed test data.</returns>
        private SimhashIndex setUpIndex(int kValue)
        {
            var fingerprintsByKey = new Dictionary<long, Simhash>();

            foreach (var entry in testData)
            {
                var fingerprint = new Simhash();
                var shingles    = new Shingling().tokenize(entry.Value, 3);

                fingerprint.GenerateSimhash(shingles);
                fingerprintsByKey.Add(entry.Key, fingerprint);
            }

            return new SimhashIndex(objs: fingerprintsByKey, k: kValue);
        }
示例#11
0
        /// <summary>
        /// Creates a <c>SimhashIndex</c> over every entry in <c>testData</c>.
        /// Each entry's value is tokenized with width 3 and hashed with
        /// <c>ComputeHashByMd5</c>; the result is stored under the entry's key.
        /// </summary>
        /// <param name="kValue">Value forwarded as the <c>k</c> argument of the index constructor.</param>
        /// <returns>The index built from the hashed test data.</returns>
        private SimhashIndex SetUpIndex(int kValue)
        {
            var hasher       = new SimhashLib.Simhash();
            var hashesByKey  = new Dictionary<long, SimhashResult>();
            var scratch      = new StringBuilder();

            foreach (var entry in testData)
            {
                // One builder is shared across entries; it is emptied before each use.
                scratch.Clear();

                var shingles = Shingling.Tokenize(entry.Value, scratch, 3);
                hashesByKey.Add(entry.Key, hasher.ComputeHashByMd5(shingles));
            }

            return new SimhashIndex(objs: hashesByKey, k: kValue);
        }
示例#12
0
        /// <summary>
        /// Verifies that <c>Shingling.Scrub</c> turns a sentence containing spaces
        /// and punctuation into the expected uninterrupted run of letters.
        /// </summary>
        public void Clean()
        {
            const string input    = "aaa bbb test test testing. happy time =-).";
            const string expected = "aaabbbtesttesttestinghappytime";
            var buffer = new StringBuilder();

            var scrubbed = Shingling.Scrub(input, buffer);

            Assert.Equal(expected, scrubbed.ToString());
        }