/// <summary>
/// Tokenizes a sample sentence with an explicit shingle width of 3 and
/// checks how many pieces are produced.
/// </summary>
public void test_tokenize_width_three()
{
    var shingling = new Shingling();

    List<string> pieces = shingling.tokenize("This is a test for really cool content. yeah! =)", width: 3);

    // Expected shingles: thi, his, isi, sis, isa, ... and so on.
    // NOTE(review): the expected count of 33 matches 35 normalized characters
    // (lower-cased, non-alphanumerics dropped) minus width plus one — confirm
    // against Shingling.tokenize.
    // The assertion must sit on its own line: a trailing "//" comment on the
    // same line would comment it out together with the closing brace.
    Assert.AreEqual(33, pieces.Count);
}
/// <summary>
/// Tokenizes a short string without passing a width and checks how many
/// pieces the default width produces.
/// </summary>
public void test_tokenize_width_default()
{
    var shingling = new Shingling();

    List<string> pieces = shingling.tokenize("aaabbb");

    // Expected shingles: aaab, aabb, abbb.
    // NOTE(review): three 4-character pieces imply a default width of 4 —
    // confirm against Shingling.tokenize.
    // The assertion must sit on its own line: a trailing "//" comment on the
    // same line would comment it out together with the closing brace.
    Assert.AreEqual(3, pieces.Count);
}
/// <summary>
/// Builds an index with k = 1, hashes the features of <c>testData[1]</c>, and
/// expects the near-duplicate lookup to return exactly one entry.
/// </summary>
public void test_get_near_dup_hash_jenkins_find_one()
{
    // Arrange: index over the shared test data.
    var nearDupIndex = setUpIndex(1);

    // Act: hash the shingles of the entry stored under key 1 and query with it.
    var probe = new Simhash();
    var tokenizer = new Shingling();
    var probeFeatures = tokenizer.tokenize(testData[1], 3);
    probe.GenerateSimhash(probeFeatures);
    var matches = nearDupIndex.get_near_dups(probe);

    // Assert
    Assert.AreEqual(1, matches.Count);
}
/// <summary>
/// Creates a <see cref="SimhashIndex"/> holding one <see cref="Simhash"/> per
/// entry of <c>testData</c>, keyed by the entry's key.
/// </summary>
/// <param name="kValue">Value forwarded as the <c>k</c> argument of the index constructor.</param>
/// <returns>The index built over all hashed test entries.</returns>
private SimhashIndex setUpIndex(int kValue)
{
    var hashesByKey = new Dictionary<long, Simhash>();

    foreach (var entry in testData)
    {
        var entryHash = new Simhash();
        var tokenizer = new Shingling();

        // Hash the entry's shingles (tokenize is called with 3 as its second argument).
        var entryFeatures = tokenizer.tokenize(entry.Value, 3);
        entryHash.GenerateSimhash(entryFeatures);

        hashesByKey.Add(entry.Key, entryHash);
    }

    return new SimhashIndex(objs: hashesByKey, k: kValue);
}