/// <summary>
/// Checks a word against the Bloom filter, unless the filter stage is disabled.
/// </summary>
/// <param name="word">Raw word; it is run through
/// <c>DataGenerator.ProcessWordForBloomFilter</c> before being tested.</param>
/// <param name="filter">Filter whose <c>test</c> method is queried with the processed word.</param>
/// <param name="parameters">Filter settings; when <c>Disabled</c> is set the filter is not consulted.</param>
/// <returns>
/// <c>true</c> when the filter is disabled; otherwise the result of <c>filter.test</c>
/// for the processed word.
/// </returns>
private static bool MatchBloomFilter(string word, BloomFilter filter, BloomFilterParameters parameters)
        {
            // Short-circuit: a disabled filter accepts every word without processing it.
            return parameters.Disabled
                   || filter.test(DataGenerator.ProcessWordForBloomFilter(parameters, word));
        }
Exemplo n.º 2
0
        //private void WriteRetouched(int id, Tuple<string, int>[] retouched)
        //{
        //	var path = Path.Combine(Settings.TempFolder, String.Format("{0:000}-retouched.txt", id));
        //	if (File.Exists(path)) File.Delete(path);
        //	File.WriteAllLines(path, retouched.Select(r => String.Format("{0} {1}", r.Item1, r.Item2)));
        //}

        /// <summary>
        /// Turns a word into the string that is stored in / looked up from the Bloom filter.
        /// The steps are applied in order: common-sequence substitution, optional
        /// pre-sampling plus truncation, optional character shift.
        /// </summary>
        /// <param name="parameters">Settings read here: <c>SubstitutionCount</c>,
        /// <c>SubstringStartIndex</c>, <c>SubstringLength</c> and <c>CharOffset</c>.</param>
        /// <param name="word">The word to transform.</param>
        /// <returns>The transformed word.</returns>
        public static string ProcessWordForBloomFilter(BloomFilterParameters parameters, string word)
        {
            var result = SubstituteMostCommonSequences(word, parameters.SubstitutionCount);

            // NOTE(review): this step is gated on SubstringStartIndex, but the value of
            // SubstringStartIndex is never used - the cut always starts at index 0 and its
            // length comes from SubstringLength.Value (which presumably fails if
            // SubstringLength is unset). Confirm this is intended.
            if (parameters.SubstringStartIndex != null)
            {
                var sampled = SampleSplitter.PreSampleForBFValue(result);
                var keep    = Math.Min(parameters.SubstringLength.Value, sampled.Length);
                result = sampled.Substring(0, keep);
            }

            if (parameters.CharOffset != null)
            {
                // Re-base every character: 'a' maps to the code unit CharOffset,
                // 'b' to CharOffset + 1, and so on.
                var offset  = parameters.CharOffset.Value;
                var shifted = new char[result.Length];
                for (var i = 0; i < shifted.Length; i++)
                {
                    shifted[i] = (char)(result[i] - 'a' + offset);
                }

                result = new String(shifted);
            }

            return result;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Builds a Bloom filter from every entry of <c>Words.Value</c> that passes
        /// <c>TestExecutor.Match</c> with the Bloom-filter stage skipped, stores it in
        /// <c>data.BloomFilter</c>, and then optionally retouches it using entries from
        /// <c>FakeWordsByFrequency.Value</c>.
        /// </summary>
        /// <param name="parameters">Filter settings. Nothing happens when <c>Disabled</c> is set.
        /// <c>RetouchWordCount</c> (when greater than zero) takes precedence over
        /// <c>RetouchMinRelWeight</c> for choosing the retouch strategy.</param>
        /// <param name="data">Generated data passed to <c>TestExecutor.Match</c>; its
        /// <c>BloomFilter</c> property is assigned the new filter.</param>
        private void BuildBloomFilter(BloomFilterParameters parameters, GeneratedData data)
        {
            if (parameters.Disabled)
            {
                return;
            }

            using (new Timer("DataGenerator.BuildBloomFilter"))
            {
                // FilterSizeBytes * 8: presumably the BloomFilter constructor takes its size in bits - confirm.
                var bloom        = new BloomFilter(parameters.FilterSizeBytes * 8, parameters.HashFunctionsCount);
                var matchCounter = new TestExecutor.MatchCounter();

                foreach (var candidate in Words.Value)
                {
                    // Only words accepted by the remaining checks are inserted.
                    if (TestExecutor.Match(data, candidate, data.Parameters, matchCounter, skipBloomFilter: true))
                    {
                        bloom.add(ProcessWordForBloomFilter(parameters, candidate));
                    }
                }

                data.BloomFilter = bloom;

                using (new Timer("DataGenerator.BuildBloomFilter[retouch]"))
                {
                    // From here on Match is called without skipBloomFilter, so it presumably
                    // consults the filter assigned just above (verify in TestExecutor.Match).
                    if (parameters.RetouchWordCount > 0)
                    {
                        // Materialize the candidates first: retouch() below changes the filter,
                        // which could otherwise influence later Match calls of a lazy query.
                        var toRetouch = FakeWordsByFrequency.Value
                                        .Where(entry => entry.Item2 > 2
                                                        && TestExecutor.Match(data, entry.Item1, data.Parameters, matchCounter))
                                        .Take(parameters.RetouchWordCount.Value)
                                        .ToArray();

                        foreach (var hit in toRetouch)
                        {
                            bloom.retouch(ProcessWordForBloomFilter(parameters, hit.Item1),
                                          parameters.RetouchMaxWeight ?? 0);
                        }
                    }
                    else if (parameters.RetouchMinRelWeight != null)
                    {
                        var matchingFakes = FakeWordsByFrequency.Value
                                            .Where(entry => entry.Item2 > 2
                                                            && TestExecutor.Match(data, entry.Item1, data.Parameters, matchCounter))
                                            .ToArray();

                        // Collect the matching fake words, with their Item2 values, in a second
                        // filter constructed from the main filter's m and k.
                        var fakesFilter = new BloomFilter(bloom.m, bloom.k);
                        foreach (var fake in matchingFakes)
                        {
                            fakesFilter.add(ProcessWordForBloomFilter(parameters, fake.Item1), fake.Item2);
                        }

                        bloom.retouch(fakesFilter, parameters.RetouchMinRelWeight.Value);
                    }
                }
            }
        }