예제 #1
0
        /// <summary>
        /// Tokenizes <paramref name="content"/> with a freshly created <c>Shingling</c>
        /// instance and forwards the result to the <c>GenerateSimhash</c> overload that
        /// accepts the tokenizer's output.
        /// </summary>
        /// <param name="content">Text to tokenize; no width is supplied, so the tokenizer's own default applies.</param>
        public void GenerateSimhash(string content)
        {
            var tokens = new Shingling().tokenize(content);
            GenerateSimhash(tokens);
        }
예제 #2
0
 /// <summary>
 /// Verifies that tokenizing the sample sentence with <c>width: 3</c> yields 33 pieces.
 /// </summary>
 public void test_tokenize_width_three()
 {
     // Arrange
     const int expectedPieceCount = 33;
     var tokenizer = new Shingling();

     // Act
     // Expected pieces per the original author's note: thi, his, isi, sis, isa, ...
     List<string> shingles = tokenizer.tokenize("This is a test for really cool content. yeah! =)", width: 3);

     // Assert
     Assert.AreEqual(expectedPieceCount, shingles.Count);
 }
예제 #3
0
 /// <summary>
 /// Verifies that tokenizing "aaabbb" without an explicit width yields 3 pieces.
 /// </summary>
 public void test_tokenize_width_default()
 {
     // Arrange
     const int expectedPieceCount = 3;
     var tokenizer = new Shingling();

     // Act
     // Expected pieces: aaab, aabb, abbb (suggests a default width of 4 -- confirm against Shingling).
     List<string> shingles = tokenizer.tokenize("aaabbb");

     // Assert
     Assert.AreEqual(expectedPieceCount, shingles.Count);
 }
예제 #4
0
 /// <summary>
 /// Verifies that sliding over "aaabbb" with <c>width: 4</c> yields 3 pieces.
 /// </summary>
 public void test_slide()
 {
     // Arrange
     const int expectedWindowCount = 3;
     var tokenizer = new Shingling();

     // Act
     // Expected windows: aaab, aabb, abbb
     List<string> windows = tokenizer.slide("aaabbb", width: 4);

     // Assert
     Assert.AreEqual(expectedWindowCount, windows.Count);
 }
예제 #5
0
        /// <summary>
        /// Splits <paramref name="content"/> into shingles via <c>Shingling.Tokenize</c> and
        /// delegates to the <c>ComputeHash</c> overload that accepts those shingles.
        /// </summary>
        /// <typeparam name="THash">Hash implementation; constrained to <c>IHash&lt;TRes&gt;</c>.</typeparam>
        /// <typeparam name="TRes">Hash result type; constrained to <c>IHashResult&lt;TRes&gt;</c>.</typeparam>
        /// <param name="content">Text to hash. Must not be <c>null</c>.</param>
        /// <param name="hash">Hash instance, passed through unchanged.</param>
        /// <param name="encoding">Optional encoding, passed through unchanged (defaults to <c>null</c>).</param>
        /// <returns>The result produced by the shingle-based overload.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="content"/> is <c>null</c>.</exception>
        public SimhashResult ComputeHash <THash, TRes>(string content, THash hash, Encoding encoding = null)
            where THash : IHash <TRes>
            where TRes : IHashResult <TRes>
        {
            // Fail with a descriptive exception instead of a NullReferenceException
            // raised by the content.Length access below.
            if (content == null)
            {
                throw new System.ArgumentNullException(nameof(content));
            }

            // The builder is pre-sized to the input length and handed to the tokenizer.
            var builder  = new StringBuilder(content.Length);
            var shingles = Shingling.Tokenize(content, builder);

            return ComputeHash <THash, TRes>(shingles, hash, encoding);
        }
예제 #6
0
        /// <summary>
        /// Extension helper: tokenizes <paramref name="content"/> with <c>Shingling.Tokenize</c>
        /// and passes the shingles to <paramref name="self"/>'s shingle-based <c>ComputeHash</c>.
        /// </summary>
        /// <param name="self">The <c>Simhash</c> instance that performs the hashing.</param>
        /// <param name="content">Text to tokenize; its length pre-sizes the scratch builder.</param>
        /// <param name="hash">Hash instance, passed through unchanged.</param>
        /// <returns>The result returned by the shingle-based overload.</returns>
        private static SimhashResult ComputeHash <THash, TRes>(this Simhash self, string content, THash hash)
            where THash : struct, IHash <TRes>
            where TRes : IHashResult <TRes>
        {
            var scratch = new StringBuilder(content.Length);

            return self.ComputeHash <THash, TRes>(Shingling.Tokenize(content, scratch), hash);
        }
        /// <summary>
        /// Builds an index with <c>setUpIndex(1)</c>, hashes <c>testData[1]</c>, and verifies
        /// that exactly one near-duplicate is reported for that hash.
        /// </summary>
        public void test_get_near_dup_hash_jenkins_find_one()
        {
            // Arrange
            var index = setUpIndex(1);

            var probe = new Simhash();
            var tokens = new Shingling().tokenize(testData[1], 3);
            probe.GenerateSimhash(tokens);

            // Act
            var matches = index.get_near_dups(probe);

            // Assert
            Assert.AreEqual(1, matches.Count);
        }
        /// <summary>
        /// Builds a <c>SimhashIndex</c> holding one <c>Simhash</c> per entry of <c>testData</c>,
        /// keyed by the entry's key.
        /// </summary>
        /// <param name="kValue">Passed to the index constructor as <c>k</c>.</param>
        /// <returns>The newly constructed index.</returns>
        private SimhashIndex setUpIndex(int kValue)
        {
            var hashesByKey = new Dictionary<long, Simhash>();

            foreach (var entry in testData)
            {
                // A separate hash and tokenizer are created for every entry.
                var hash = new Simhash();

                // Second argument is 3 -- presumably the shingle width; confirm against Shingling.tokenize.
                var tokens = new Shingling().tokenize(entry.Value, 3);
                hash.GenerateSimhash(tokens);

                hashesByKey.Add(entry.Key, hash);
            }

            return new SimhashIndex(objs: hashesByKey, k: kValue);
        }
예제 #9
0
 /// <summary>
 /// Convenience overload: tokenizes <paramref name="content"/> with a new <c>Shingling</c>
 /// and hands the tokens to the <c>GenerateSimhash</c> overload that accepts them.
 /// </summary>
 /// <param name="content">Text to tokenize using the tokenizer's default settings.</param>
 public void GenerateSimhash(string content)
 {
     GenerateSimhash(new Shingling().tokenize(content));
 }
예제 #10
0
 /// <summary>
 /// Verifies that <c>scrub</c> turns the sample sentence into the expected
 /// string, which contains no spaces or punctuation.
 /// </summary>
 public void test_clean()
 {
     // Arrange
     const string raw = "aaa bbb test test testing. happy time =-).";
     const string expected = "aaabbbtesttesttestinghappytime";
     var scrubber = new Shingling();

     // Act
     string actual = scrubber.scrub(raw);

     // Assert
     Assert.AreEqual(expected, actual);
 }