/// <summary>
/// Tokenizes <paramref name="content"/> with a freshly created <c>Shingling</c>
/// instance and forwards the resulting shingles to the overload of
/// <c>GenerateSimhash</c> that accepts them.
/// </summary>
/// <param name="content">Text to tokenize; passed to <c>tokenize</c> unchanged.</param>
public void GenerateSimhash(string content)
{
    // tokenize is called with no width argument, so whatever default it declares applies.
    var tokenizer = new Shingling();
    GenerateSimhash(tokenizer.tokenize(content));
}
/// <summary>
/// Verifies that <c>tokenize</c> with <c>width: 3</c> returns 33 pieces for the
/// sample sentence.
/// </summary>
public void test_tokenize_width_three()
{
    const string input = "This is a test for really cool content. yeah! =)";
    var tokenizer = new Shingling();

    List<string> shingles = tokenizer.tokenize(input, width: 3);

    // Expected pieces: thi, his, isi, sis, isa .. etc....
    Assert.AreEqual(33, shingles.Count);
}
/// <summary>
/// Verifies that <c>tokenize</c>, called without an explicit width, returns
/// three pieces for the input "aaabbb".
/// </summary>
public void test_tokenize_width_default()
{
    const string input = "aaabbb";
    var tokenizer = new Shingling();

    List<string> shingles = tokenizer.tokenize(input);

    // Expected pieces: aaab, aabb, abbb
    Assert.AreEqual(3, shingles.Count);
}
/// <summary>
/// Verifies that <c>slide</c> with <c>width: 4</c> returns three pieces for
/// the input "aaabbb".
/// </summary>
public void test_slide()
{
    const string input = "aaabbb";
    var tokenizer = new Shingling();

    List<string> windows = tokenizer.slide(input, width: 4);

    // Expected pieces: aaab, aabb, abbb
    Assert.AreEqual(3, windows.Count);
}
/// <summary>
/// Tokenizes <paramref name="content"/> via <c>Shingling.Tokenize</c> and delegates
/// to the <c>ComputeHash</c> overload that accepts the resulting shingles.
/// </summary>
/// <typeparam name="THash">Hash implementation; must implement <c>IHash&lt;TRes&gt;</c>.</typeparam>
/// <typeparam name="TRes">Hash result type; must implement <c>IHashResult&lt;TRes&gt;</c>.</typeparam>
/// <param name="content">Text to tokenize. Its <c>Length</c> is read, so it must not be null.</param>
/// <param name="hash">Hash instance forwarded unchanged to the shingle overload.</param>
/// <param name="encoding">Optional encoding forwarded unchanged; defaults to null.</param>
/// <returns>The <c>SimhashResult</c> produced by the shingle overload.</returns>
public SimhashResult ComputeHash<THash, TRes>(string content, THash hash, Encoding encoding = null)
    where THash : IHash<TRes>
    where TRes : IHashResult<TRes>
{
    // StringBuilder with an initial capacity of content.Length, handed to the tokenizer.
    var buffer = new StringBuilder(content.Length);
    return ComputeHash<THash, TRes>(Shingling.Tokenize(content, buffer), hash, encoding);
}
/// <summary>
/// Extension helper that tokenizes <paramref name="content"/> via
/// <c>Shingling.Tokenize</c> and delegates to <paramref name="self"/>'s
/// <c>ComputeHash</c> overload that accepts the resulting shingles.
/// </summary>
/// <typeparam name="THash">Value-type hash implementation of <c>IHash&lt;TRes&gt;</c>.</typeparam>
/// <typeparam name="TRes">Hash result type; must implement <c>IHashResult&lt;TRes&gt;</c>.</typeparam>
/// <param name="self">The <c>Simhash</c> instance that performs the computation.</param>
/// <param name="content">Text to tokenize. Its <c>Length</c> is read, so it must not be null.</param>
/// <param name="hash">Hash instance forwarded unchanged.</param>
/// <returns>The <c>SimhashResult</c> returned by <paramref name="self"/>.</returns>
private static SimhashResult ComputeHash<THash, TRes>(this Simhash self, string content, THash hash)
    where THash : struct, IHash<TRes>
    where TRes : IHashResult<TRes>
{
    // StringBuilder with an initial capacity of content.Length, handed to the tokenizer.
    var buffer = new StringBuilder(content.Length);
    return self.ComputeHash<THash, TRes>(Shingling.Tokenize(content, buffer), hash);
}
/// <summary>
/// Builds an index with k = 1, hashes the <c>testData[1]</c> entry using width-3
/// shingles, and expects exactly one near-duplicate to be reported for it.
/// </summary>
public void test_get_near_dup_hash_jenkins_find_one()
{
    var index = setUpIndex(1);
    var query = new Simhash();
    var tokenizer = new Shingling();
    query.GenerateSimhash(tokenizer.tokenize(testData[1], 3));

    var matches = index.get_near_dups(query);

    Assert.AreEqual(1, matches.Count);
}
/// <summary>
/// Creates a <c>SimhashIndex</c> holding one <c>Simhash</c> per entry of
/// <c>testData</c>, each generated from width-3 shingles of the entry's value.
/// </summary>
/// <param name="kValue">Value passed to the index constructor as <c>k</c>.</param>
/// <returns>The newly constructed index.</returns>
private SimhashIndex setUpIndex(int kValue)
{
    var hashesByKey = new Dictionary<long, Simhash>();

    foreach (var entry in testData)
    {
        var hash = new Simhash();
        // A new Shingling is created for every entry, as in the original.
        hash.GenerateSimhash(new Shingling().tokenize(entry.Value, 3));
        hashesByKey.Add(entry.Key, hash);
    }

    return new SimhashIndex(objs: hashesByKey, k: kValue);
}
/// <summary>
/// Verifies that <c>scrub</c> turns the sample sentence into the expected
/// string, which contains only its letters.
/// </summary>
public void test_clean()
{
    const string raw = "aaa bbb test test testing. happy time =-).";
    const string expected = "aaabbbtesttesttestinghappytime";

    string actual = new Shingling().scrub(raw);

    Assert.AreEqual(expected, actual);
}