/// <summary>
/// Tests a word against the Bloom filter after applying the same preprocessing
/// that is used when words are added to the filter.
/// </summary>
/// <param name="word">The raw word to test.</param>
/// <param name="filter">The Bloom filter to query.</param>
/// <param name="parameters">Bloom filter settings; when <c>Disabled</c> is set the filter is bypassed.</param>
/// <returns>
/// <c>true</c> when the filter is disabled or when the filter reports the processed word as present;
/// otherwise <c>false</c>.
/// </returns>
private static bool MatchBloomFilter(string word, BloomFilter filter, BloomFilterParameters parameters)
{
    // Short-circuit: a disabled filter accepts every word without touching the filter at all.
    // Debug aid: Console.WriteLine("processed: " + <processed word>);
    return parameters.Disabled
        || filter.test(DataGenerator.ProcessWordForBloomFilter(parameters, word));
}
//private void WriteRetouched(int id, Tuple<string, int>[] retouched)
//{
//    var path = Path.Combine(Settings.TempFolder, String.Format("{0:000}-retouched.txt", id));
//    if (File.Exists(path)) File.Delete(path);
//    File.WriteAllLines(path, retouched.Select(r => String.Format("{0} {1}", r.Item1, r.Item2)));
//}

/// <summary>
/// Converts a word into the key that is stored in / looked up from the Bloom filter.
/// The steps are applied in order: common-sequence substitution, optional
/// pre-sampling plus truncation, and optional character re-basing.
/// </summary>
/// <param name="parameters">
/// Bloom filter settings. <c>SubstitutionCount</c> is always used; the substring step runs only
/// when <c>SubstringStartIndex</c> is set; the character shift runs only when <c>CharOffset</c> is set.
/// </param>
/// <param name="word">The raw word to process.</param>
/// <returns>The processed key.</returns>
public static string ProcessWordForBloomFilter(BloomFilterParameters parameters, string word)
{
    var processed = SubstituteMostCommonSequences(word, parameters.SubstitutionCount);

    if (parameters.SubstringStartIndex != null)
    {
        // NOTE(review): only the presence of SubstringStartIndex is checked here; its value is
        // not used in this method (the cut below always starts at index 0) - confirm that
        // PreSampleForBFValue is what accounts for the start position.
        processed = SampleSplitter.PreSampleForBFValue(processed);

        // The guard above does not cover SubstringLength, so reading .Value directly would throw
        // InvalidOperationException when only the start index is configured. A missing length is
        // treated as "no truncation".
        var length = Math.Min(parameters.SubstringLength ?? processed.Length, processed.Length);
        processed = processed.Substring(0, length);
    }

    if (parameters.CharOffset != null)
    {
        var offset = parameters.CharOffset.Value;

        // Re-base every character so that 'a' maps to the code point `offset`.
        processed = new String(processed
            .Select(c => (char)(c - 'a' + offset))
            .ToArray());
    }

    return processed;
}
/// <summary>
/// Builds the Bloom filter for <paramref name="data"/> and stores it in <c>data.BloomFilter</c>,
/// then optionally "retouches" it using fake words that still match once the filter is in place.
/// Does nothing when <c>parameters.Disabled</c> is set.
/// </summary>
/// <param name="parameters">
/// Bloom filter settings: size in bytes, hash function count and the optional retouch options
/// (<c>RetouchWordCount</c> / <c>RetouchMaxWeight</c>, or <c>RetouchMinRelWeight</c>).
/// </param>
/// <param name="data">Generated data that receives the filter and is used for matching.</param>
private void BuildBloomFilter(BloomFilterParameters parameters, GeneratedData data)
{
    if (parameters.Disabled)
    {
        return;
    }

    using (new Timer("DataGenerator.BuildBloomFilter"))
    {
        // FilterSizeBytes is given in bytes; the BloomFilter constructor receives it times 8.
        var bloom = new BloomFilter(parameters.FilterSizeBytes * 8, parameters.HashFunctionsCount);
        var matchCounter = new TestExecutor.MatchCounter();

        foreach (var candidate in Words.Value)
        {
            // if (candidate.Length < 3) continue;

            // Only words accepted by the other checks (Bloom filter skipped) go into the filter.
            if (TestExecutor.Match(data, candidate, data.Parameters, matchCounter, skipBloomFilter: true))
            {
                bloom.add(ProcessWordForBloomFilter(parameters, candidate));
            }
        }

        // Publish the filter before retouching: the Match calls below do not skip the Bloom filter.
        data.BloomFilter = bloom;

        using (new Timer("DataGenerator.BuildBloomFilter[retouch]"))
        {
            if (parameters.RetouchWordCount > 0)
            {
                // Materialize first so the matching pass is finished before the filter is retouched.
                var selected = FakeWordsByFrequency.Value
                    .Where(entry => entry.Item2 > 2
                                    && TestExecutor.Match(data, entry.Item1, data.Parameters, matchCounter))
                    .Take(parameters.RetouchWordCount.Value)
                    .ToArray();

                var maxWeight = parameters.RetouchMaxWeight ?? 0;
                foreach (var entry in selected)
                {
                    bloom.retouch(ProcessWordForBloomFilter(parameters, entry.Item1), maxWeight);
                }

                // WriteRetouched(data.Parameters.Id.Value, selected);
            }
            else if (parameters.RetouchMinRelWeight != null)
            {
                var matchingFakes = FakeWordsByFrequency.Value
                    .Where(entry => entry.Item2 > 2
                                    && TestExecutor.Match(data, entry.Item1, data.Parameters, matchCounter))
                    .ToArray();

                // Second filter built with the same m and k, filled with the matching fake words
                // and their Item2 values.
                var fakesFilter = new BloomFilter(bloom.m, bloom.k);
                foreach (var entry in matchingFakes)
                {
                    fakesFilter.add(ProcessWordForBloomFilter(parameters, entry.Item1), entry.Item2);
                }

                bloom.retouch(fakesFilter, parameters.RetouchMinRelWeight.Value);
            }
        }
    }
}