/// <summary>
/// Checks that tokenizing a sentence with an explicit shingle width of 3
/// yields the expected number of pieces.
/// </summary>
public void Tokenize_Width_Three()
{
    // Arrange / Act
    var pieces = Shingling.Tokenize(
        "This is a test for really cool content. yeah! =)",
        new StringBuilder(),
        width: 3);

    // Expected shingles: thi, his, isi, sis, isa ... etc.
    // The comment must end on its own line: a trailing "//" comment on the
    // same line as the assertion would comment out the assertion and the
    // closing brace.

    // Assert
    Assert.Equal(33, pieces.Count);
}
/// <summary>
/// Checks that tokenizing "aaabbb" without specifying a width yields
/// three pieces.
/// </summary>
public void Tokenize_Width_Default()
{
    // Arrange / Act: no width argument, so Tokenize uses its default.
    var pieces = Shingling.Tokenize("aaabbb", new StringBuilder());

    // Expected shingles: aaab, aabb, abbb
    // Kept on its own line so the comment cannot swallow the assertion below.

    // Assert
    Assert.Equal(3, pieces.Count);
}
/// <summary>
/// Builds an index with k = 1, hashes the <c>testData[1]</c> entry and expects
/// the near-duplicate lookup to return exactly one result.
/// </summary>
/// <remarks>
/// NOTE(review): the method name mentions Jenkins, but the hash is computed with
/// <c>ComputeHashByMd5</c>, the same call <c>SetUpIndex</c> uses to build the
/// index. Confirm whether the name or the hash function is the intended one.
/// </remarks>
public void Get_Near_Dup_Hash_Jenkins_Find_One()
{
    // Arrange
    var hasher = new SimhashLib.Simhash();
    var nearDupIndex = SetUpIndex(1);
    var shingles = Shingling.Tokenize(testData[1], new StringBuilder(), width: 3);

    // Act
    var queryHash = hasher.ComputeHashByMd5(shingles);
    var matches = nearDupIndex.GetNearDups(queryHash);

    // Assert
    Assert.Single(matches);
}
/// <summary>
/// Creates a <c>SimhashIndex</c> containing one hash per entry of <c>testData</c>.
/// Each entry's value is tokenized with width 3 and hashed via
/// <c>ComputeHashByMd5</c>; the entry's key becomes the key in the index.
/// </summary>
/// <param name="kValue">Value forwarded to the index constructor as <c>k</c>.</param>
/// <returns>The index built over all entries of <c>testData</c>.</returns>
private SimhashIndex SetUpIndex(int kValue)
{
    var hasher = new SimhashLib.Simhash();
    var hashesByKey = new Dictionary<long, SimhashResult>();

    // One builder is shared across iterations and cleared before each use.
    var scratch = new StringBuilder();

    foreach (var entry in testData)
    {
        var shingles = Shingling.Tokenize(entry.Value, scratch.Clear(), width: 3);
        var hash = hasher.ComputeHashByMd5(shingles);
        hashesByKey.Add(entry.Key, hash);
    }

    return new SimhashIndex(objs: hashesByKey, k: kValue);
}