/// <summary>
/// Checks that <c>scrub</c> turns the sample sentence into the expected
/// string, which contains only the letters of the input with the spaces
/// and punctuation removed.
/// </summary>
public void test_clean()
{
    const string input = "aaa bbb test test testing. happy time =-).";
    const string expected = "aaabbbtesttesttestinghappytime";

    var shingler = new Shingling();
    string actual = shingler.scrub(input);

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Checks that sliding a window of width 4 over a six-character string
/// yields three pieces.
/// </summary>
public void Slide()
{
    const string input = "aaabbb";

    var windows = Shingling.Slide(input, width: 4);

    // Expected pieces: aaab, aabb, abbb
    Assert.Equal(3, windows.Count);
}
/// <summary>
/// Checks that tokenizing the sample sentence with a width of 3 yields
/// 33 pieces.
/// </summary>
public void Tokenize_Width_Three()
{
    const string input = "This is a test for really cool content. yeah! =)";
    var buffer = new StringBuilder();

    var shingles = Shingling.Tokenize(input, buffer, width: 3);

    // Expected pieces begin: thi, his, isi, sis, isa, ...
    Assert.Equal(33, shingles.Count);
}
/// <summary>
/// Checks that tokenizing a six-character string without specifying a
/// width yields three pieces.
/// </summary>
public void Tokenize_Width_Default()
{
    const string input = "aaabbb";
    var buffer = new StringBuilder();

    var shingles = Shingling.Tokenize(input, buffer);

    // Expected pieces: aaab, aabb, abbb
    Assert.Equal(3, shingles.Count);
}
/// <summary>
/// Checks that the instance <c>tokenize</c> method, called with a width
/// of 3 on the sample sentence, yields 33 pieces.
/// </summary>
public void test_tokenize_width_three()
{
    const string input = "This is a test for really cool content. yeah! =)";
    var shingler = new Shingling();

    List<string> shingles = shingler.tokenize(input, width: 3);

    // Expected pieces begin: thi, his, isi, sis, isa, ...
    Assert.AreEqual(33, shingles.Count);
}
/// <summary>
/// Checks that the instance <c>tokenize</c> method, called without a
/// width on a six-character string, yields three pieces.
/// </summary>
public void test_tokenize_width_default()
{
    const string input = "aaabbb";
    var shingler = new Shingling();

    List<string> shingles = shingler.tokenize(input);

    // Expected pieces: aaab, aabb, abbb
    Assert.AreEqual(3, shingles.Count);
}
/// <summary>
/// Checks that the instance <c>slide</c> method, called with a width of 4
/// on a six-character string, yields three pieces.
/// </summary>
public void test_slide()
{
    const string input = "aaabbb";
    var shingler = new Shingling();

    List<string> windows = shingler.slide(input, width: 4);

    // Expected pieces: aaab, aabb, abbb
    Assert.AreEqual(3, windows.Count);
}
/// <summary>
/// Builds an index with k = 1, hashes the width-3 tokens of
/// <c>testData[1]</c>, and checks that the near-duplicate lookup returns
/// exactly one entry.
/// </summary>
/// <remarks>
/// NOTE(review): the method name mentions "Jenkins", but the hash is
/// computed with <c>ComputeHashByMd5</c> - confirm which is intended.
/// </remarks>
public void Get_Near_Dup_Hash_Jenkins_Find_One()
{
    var hasher = new SimhashLib.Simhash();
    var nearDupIndex = SetUpIndex(1);

    var shingles = Shingling.Tokenize(testData[1], new StringBuilder(), 3);
    var fingerprint = hasher.ComputeHashByMd5(shingles);
    var matches = nearDupIndex.GetNearDups(fingerprint);

    Assert.Single(matches);
}
/// <summary>
/// Builds an index with k = 1, generates a simhash from the width-3
/// tokens of <c>testData[1]</c>, and checks that the near-duplicate
/// lookup returns exactly one entry.
/// </summary>
public void test_get_near_dup_hash_jenkins_find_one()
{
    var nearDupIndex = setUpIndex(1);
    var fingerprint = new Simhash();
    var shingler = new Shingling();

    var shingles = shingler.tokenize(testData[1], 3);
    fingerprint.GenerateSimhash(shingles);
    var matches = nearDupIndex.get_near_dups(fingerprint);

    Assert.AreEqual(1, matches.Count);
}
/// <summary>
/// Builds a <c>SimhashIndex</c> containing one simhash per entry of
/// <c>testData</c>, each generated from the entry's width-3 tokens and
/// stored under the entry's key.
/// </summary>
/// <param name="kValue">Value forwarded as the index's <c>k</c> argument.</param>
/// <returns>The populated index.</returns>
private SimhashIndex setUpIndex(int kValue)
{
    var fingerprints = new Dictionary<long, Simhash>();

    foreach (var entry in testData)
    {
        // A fresh hash and shingler are created for every entry.
        var fingerprint = new Simhash();
        var shingler = new Shingling();

        var shingles = shingler.tokenize(entry.Value, 3);
        fingerprint.GenerateSimhash(shingles);

        fingerprints.Add(entry.Key, fingerprint);
    }

    return new SimhashIndex(objs: fingerprints, k: kValue);
}
/// <summary>
/// Builds a <c>SimhashIndex</c> containing one MD5-based hash result per
/// entry of <c>testData</c>, each computed from the entry's width-3
/// tokens and stored under the entry's key.
/// </summary>
/// <param name="kValue">Value forwarded as the index's <c>k</c> argument.</param>
/// <returns>The populated index.</returns>
private SimhashIndex SetUpIndex(int kValue)
{
    var hasher = new SimhashLib.Simhash();
    var fingerprints = new Dictionary<long, SimhashResult>();
    var buffer = new StringBuilder();

    foreach (var entry in testData)
    {
        // One StringBuilder is shared across entries and emptied before each use.
        buffer.Clear();
        var shingles = Shingling.Tokenize(entry.Value, buffer, 3);
        var fingerprint = hasher.ComputeHashByMd5(shingles);

        fingerprints.Add(entry.Key, fingerprint);
    }

    return new SimhashIndex(objs: fingerprints, k: kValue);
}
/// <summary>
/// Checks that <c>Shingling.Scrub</c> turns the sample sentence into the
/// expected text, which contains only the letters of the input with the
/// spaces and punctuation removed.
/// </summary>
public void Clean()
{
    const string input = "aaa bbb test test testing. happy time =-).";
    const string expected = "aaabbbtesttesttestinghappytime";
    var buffer = new StringBuilder();

    var scrubbed = Shingling.Scrub(input, buffer);

    Assert.Equal(expected, scrubbed.ToString());
}