/// <summary>
/// Produces a <see cref="GeneratedData"/> for the given parameters: collects the three excluded-sample
/// sets and the two excluded-pair sets, prunes redundant entries, then builds the bloom filter.
/// </summary>
/// <param name="generationParameters">
/// Supplies the sample exclusion, pair exclusion and bloom filter settings; also stored on the result.
/// </param>
/// <returns>The populated data object.</returns>
public GeneratedData Generate(GenerationParameters generationParameters)
{
    // The whole generation runs inside a Timer scope labelled "DataGenerator.Generate".
    using (new Timer("DataGenerator.Generate"))
    {
        // Sample sets are gathered before pair sets, in numeric order.
        var samplesFirst = GetExcludedSamples(generationParameters.SampleExclusion_1);
        var samplesSecond = GetExcludedSamples(generationParameters.SampleExclusion_2);
        var samplesThird = GetExcludedSamples(generationParameters.SampleExclusion_3);

        var pairsFirst = GetExcludedPairs(generationParameters.PairExclusion_1);
        var pairsSecond = GetExcludedPairs(generationParameters.PairExclusion_2);

        var generated = new GeneratedData
        {
            Parameters = generationParameters,
            ExcludedSamples_1 = samplesFirst,
            ExcludedSamples_2 = samplesSecond,
            ExcludedSamples_3 = samplesThird,
            ExcludedPairs_1 = pairsFirst,
            ExcludedPairs_2 = pairsSecond,
        };

        // Pruning happens before the bloom filter is built, so the filter is built against the pruned sets.
        RemoveRedundantData(generated);
        BuildBloomFilter(generationParameters.BloomFilter, generated);

        return generated;
    }
}
/// <summary>
/// Runs <paramref name="value"/> through the filter stages in a fixed order: pre-exclusion, the three
/// sample exclusions, the two pair exclusions and finally the bloom filter. The first stage that rejects
/// the value increments its own field on <paramref name="counter"/> and ends the check.
/// </summary>
/// <param name="data">Excluded samples, excluded pairs and bloom filter to test against.</param>
/// <param name="value">The value being tested.</param>
/// <param name="parameters">Settings passed to the sample and pair exclusion stages.</param>
/// <param name="counter">Receives one increment for the stage that rejected the value, if any.</param>
/// <param name="skipBloomFilter">When <c>true</c>, the bloom filter stage is not evaluated.</param>
/// <returns><c>true</c> when no evaluated stage rejected the value; otherwise <c>false</c>.</returns>
public static bool Match(GeneratedData data, string value, GenerationParameters parameters, MatchCounter counter, bool skipBloomFilter = false)
{
    var preExcluded = SampleSplitter.PreExcludeValue(value);
    if (preExcluded)
    {
        counter.PreExclusion++;
        return false;
    }

    var passesSamples1 = MatchSampleExclusion(value, data.ExcludedSamples_1, parameters.SampleExclusion_1);
    if (!passesSamples1)
    {
        counter.SampleExclusion_1++;
        return false;
    }

    var passesSamples2 = MatchSampleExclusion(value, data.ExcludedSamples_2, parameters.SampleExclusion_2);
    if (!passesSamples2)
    {
        counter.SampleExclusion_2++;
        return false;
    }

    var passesSamples3 = MatchSampleExclusion(value, data.ExcludedSamples_3, parameters.SampleExclusion_3);
    if (!passesSamples3)
    {
        counter.SampleExclusion_3++;
        return false;
    }

    var passesPairs1 = MatchPairExclusion(value, data.ExcludedPairs_1, parameters.PairExclusion_1);
    if (!passesPairs1)
    {
        counter.PairExclusion++;
        return false;
    }

    var passesPairs2 = MatchPairExclusion(value, data.ExcludedPairs_2, parameters.PairExclusion_2);
    if (!passesPairs2)
    {
        counter.PairExclusion2++;
        return false;
    }

    if (skipBloomFilter)
    {
        return true;
    }

    // NOTE(review): the bloom filter settings come from data.Parameters, not from the
    // 'parameters' argument used by the earlier stages - confirm this is intended.
    if (MatchBloomFilter(value, data.BloomFilter, data.Parameters.BloomFilter))
    {
        return true;
    }

    counter.BloomFilter++;
    return false;
}
/// <summary>
/// Writes <paramref name="data"/> to disk: the excluded samples and excluded pairs go to the text file,
/// and the bloom filter goes to the binary file unless it is disabled in <c>data.Parameters</c>.
/// </summary>
/// <param name="data">The generated data to persist.</param>
/// <param name="txtFilePath">Destination of the ASCII text file; an existing file is deleted first.</param>
/// <param name="dataBinaryFilePath">
/// Destination of the bloom filter file; an existing file is deleted first, even when the bloom filter
/// is disabled and no new file is written.
/// </param>
public void Serialize(GeneratedData data, string txtFilePath, string dataBinaryFilePath)
{
    // Remove any previous output, text file first, then binary file.
    foreach (var stalePath in new[] { txtFilePath, dataBinaryFilePath })
    {
        if (File.Exists(stalePath))
        {
            File.Delete(stalePath);
        }
    }

    using (var stream = File.OpenWrite(txtFilePath))
    using (var output = new StreamWriter(stream, Encoding.ASCII))
    {
        // Each sample set is written in ordinal order.
        foreach (var sampleSet in data.GetExcludedSamples())
        {
            var orderedSamples = sampleSet.OrderBy(sample => sample, StringComparer.Ordinal);
            WriteSameSizeLines(orderedSamples, output);
        }

        // Each pair entry is written as its key followed by its values in ordinal order.
        foreach (var pairSet in data.GetExcludedPairs())
        {
            foreach (var entry in pairSet)
            {
                var orderedValues = entry.Value.OrderBy(partner => partner, StringComparer.Ordinal);
                var keyThenValues = entry.Key.Yield().Concat(orderedValues);
                WriteSameSizeLines(keyThenValues, output);
            }
        }

        output.Flush();
    }

    if (data.Parameters.BloomFilter.Disabled)
    {
        return;
    }

    BloomFilter.Serialize(data.BloomFilter, dataBinaryFilePath);
}
/// <summary>
/// Runs every word of every test case in <c>TestCases.Value</c> through <c>Match</c> and summarises the
/// outcome: per-stage rejection averages, confusion-matrix averages and score statistics.
/// </summary>
/// <param name="data">Generated data to evaluate; its <c>Parameters</c> are passed on to <c>Match</c>.</param>
/// <returns>
/// A <see cref="TestResult"/> whose <c>Avg*</c> values are totals divided by the number of test cases,
/// and whose score statistics are computed over the per-test-case count of correctly classified words.
/// </returns>
/// <exception cref="System.InvalidOperationException">
/// Thrown by <c>scores.Min()</c> when there are no test cases.
/// </exception>
public TestResult Test(GeneratedData data)
{
    var generationParameters = data.Parameters;

    var scores = new List<int>();
    var falsePositives = new List<string>();
    var totalTruePositiveCount = 0;
    var totalTrueNegativeCount = 0;
    var totalFalsePositiveCount = 0;
    var totalFalseNegativeCount = 0;
    var matchCounter = new MatchCounter();

    var testCases = TestCases.Value;
    foreach (var testCase in testCases)
    {
        var score = 0;
        var testData = testCase.Item2;

        foreach (var value in testData)
        {
            var word = value.Key;
            var expected = value.Value;
            var result = Match(data, word, generationParameters, matchCounter);

            if (result == expected)
            {
                score++;
                if (result)
                {
                    totalTruePositiveCount++;
                }
                else
                {
                    totalTrueNegativeCount++;
                }
            }
            else if (result)
            {
                falsePositives.Add(word);
                totalFalsePositiveCount++;
            }
            else
            {
                totalFalseNegativeCount++;
            }
        }

        scores.Add(score);
    }

    // Exactly one score is recorded per test case, so scores.Count is the number of test cases.
    // Using it avoids re-enumerating the test case sequence for each of the averages below.
    double testCaseCount = scores.Count;

    return new TestResult
    {
        FalsePositives = falsePositives,
        AvgPreExclusionMatches = matchCounter.PreExclusion / testCaseCount,
        AvgSampleExclusion1Matches = matchCounter.SampleExclusion_1 / testCaseCount,
        AvgSampleExclusion2Matches = matchCounter.SampleExclusion_2 / testCaseCount,
        AvgSampleExclusion3Matches = matchCounter.SampleExclusion_3 / testCaseCount,
        AvgPairExclusionMatches = matchCounter.PairExclusion / testCaseCount,
        AvgPairExclusion2Matches = matchCounter.PairExclusion2 / testCaseCount,
        AvgBloomFilterMatches = matchCounter.BloomFilter / testCaseCount,
        AvgFalsePositives = totalFalsePositiveCount / testCaseCount,
        AvgFalseNegatives = totalFalseNegativeCount / testCaseCount,
        AvgTruePositives = totalTruePositiveCount / testCaseCount,
        AvgTrueNegatives = totalTrueNegativeCount / testCaseCount,
        MinScore = scores.Min(),
        MaxScore = scores.Max(),
        StdDevScore = scores.Select(s => (double)s).StdDev(),
        AvgScore = scores.Average(),
    };
}
/// <summary>
/// Rebuilds a <see cref="GeneratedData"/> from the text file at <paramref name="txtPath"/> and, unless
/// the bloom filter is disabled in <paramref name="parameters"/>, from the binary file at
/// <paramref name="binPath"/>.
/// </summary>
/// <remarks>
/// The text file is consumed as sections that each end with an empty line. First comes one section per
/// entry of <c>parameters.GetSampleExclusions()</c>, whose lines are added to the matching excluded-sample
/// set. Every later section is a pair entry: its first line is the key and the remaining lines are the
/// values. A pair entry is stored in <c>ExcludedPairs_1</c> when the key length equals
/// <c>PairExclusion_1.SampleSize</c>, otherwise in <c>ExcludedPairs_2</c>. When <c>DELTA_ENCODING</c> is
/// set, a line shorter than the expected size is completed with the leading characters of the previously
/// decoded line of the same section.
/// </remarks>
/// <param name="parameters">Generation parameters; stored on the result and used to interpret the file.</param>
/// <param name="txtPath">Path of the text file holding excluded samples and pairs.</param>
/// <param name="binPath">Path of the bloom filter file; only read when the bloom filter is enabled.</param>
/// <returns>The reconstructed data.</returns>
/// <exception cref="System.IO.InvalidDataException">
/// The text file ends in the middle of a pair entry, contains an empty line where a pair key is expected,
/// or contains a shortened line that has no usable previous line to take its prefix from.
/// </exception>
public GeneratedData Deserialize(GenerationParameters parameters, string txtPath, string binPath)
{
    var data = new GeneratedData
    {
        Parameters = parameters,
    };

    // A null sentinel is appended to both arrays so that advancing past the last sample
    // section still indexes a valid slot; that slot is never dereferenced.
    var sampleParameters = parameters.GetSampleExclusions()
        .Concat(default(SampleExclusionParameters).Yield())
        .ToArray();
    var sampleSets = data.GetExcludedSamples()
        .Concat(default(HashSet<string>).Yield())
        .ToArray();

    var sectionIndex = 0;
    var currentSamples = sampleSets[sectionIndex];
    var currentParameters = sampleParameters[sectionIndex];

    var previous = default(string);
    var pairKey = default(string);
    var pairValues = default(List<string>);

    foreach (var line in File.ReadLines(txtPath))
    {
        if (sectionIndex < sampleParameters.Length - 1)
        {
            // Still inside the excluded-sample sections.
            if (line.Length == 0)
            {
                sectionIndex++;
                currentParameters = sampleParameters[sectionIndex];
                currentSamples = sampleSets[sectionIndex];
                previous = null;
                continue;
            }

            var sample = line;
            if (DELTA_ENCODING && line.Length < currentParameters.SampleSize)
            {
                sample = RestoreSharedPrefix(previous, line, currentParameters.SampleSize);
            }

            currentSamples.Add(sample);
            previous = sample;
        }
        else if (line.Length == 0)
        {
            // End of a pair entry: store the key with its collected values.
            if (pairKey == null)
            {
                throw new System.IO.InvalidDataException("Unexpected empty line: no pair key has been read");
            }

            var targetPairs = pairKey.Length == data.Parameters.PairExclusion_1.SampleSize
                ? data.ExcludedPairs_1
                : data.ExcludedPairs_2;
            targetPairs.Add(pairKey, new HashSet<string>(pairValues, StringComparer.Ordinal));

            pairKey = null;
            pairValues = null;
            previous = null;
        }
        else if (pairKey == null)
        {
            // First line of a pair entry is the key; it also seeds the delta decoding of the values.
            pairKey = line;
            pairValues = new List<string>();
            previous = line;
        }
        else
        {
            // Values of a pair entry have the same length as the key.
            var pairValue = line;
            if (DELTA_ENCODING && line.Length < pairKey.Length)
            {
                pairValue = RestoreSharedPrefix(previous, line, pairKey.Length);
            }

            pairValues.Add(pairValue);
            previous = pairValue;
        }
    }

    if (pairKey != null)
    {
        // The last pair entry was never closed by an empty line.
        throw new System.IO.InvalidDataException("Unexpected end of data");
    }

    if (!parameters.BloomFilter.Disabled)
    {
        data.BloomFilter = BloomFilter.Deserialize(binPath, parameters.BloomFilter.HashFunctionsCount);
    }

    return data;
}

// Completes a shortened line to fullLength characters using the leading characters of the previous line.
private static string RestoreSharedPrefix(string previous, string suffix, int fullLength)
{
    var prefixLength = fullLength - suffix.Length;
    if (previous == null || previous.Length < prefixLength)
    {
        throw new System.IO.InvalidDataException(
            "Shortened line has no previous line to take its prefix from: " + suffix);
    }

    return previous.Substring(0, prefixLength) + suffix;
}
/// <summary>
/// Builds the bloom filter for <paramref name="data"/> and stores it in <c>data.BloomFilter</c>, then
/// optionally retouches it using entries of <c>FakeWordsByFrequency</c> that still pass every stage.
/// Does nothing when <c>parameters.Disabled</c> is set.
/// </summary>
/// <param name="parameters">Bloom filter size, hash function count and retouch settings.</param>
/// <param name="data">Generated data whose exclusion sets pre-filter the words and which receives the filter.</param>
private void BuildBloomFilter(BloomFilterParameters parameters, GeneratedData data)
{
    if (parameters.Disabled)
    {
        return;
    }

    using (new Timer("DataGenerator.BuildBloomFilter"))
    {
        var bloom = new BloomFilter(parameters.FilterSizeBytes * 8, parameters.HashFunctionsCount);
        var tally = new TestExecutor.MatchCounter();

        // Only words that pass the non-bloom stages are added to the filter.
        foreach (var candidate in Words.Value)
        {
            if (TestExecutor.Match(data, candidate, data.Parameters, tally, skipBloomFilter: true))
            {
                var prepared = ProcessWordForBloomFilter(parameters, candidate);
                bloom.add(prepared);
            }
        }

        // From here on Match consults the freshly built filter as well.
        data.BloomFilter = bloom;

        using (new Timer("DataGenerator.BuildBloomFilter[retouch]"))
        {
            if (parameters.RetouchWordCount > 0)
            {
                // Materialised up front so that every candidate is matched against the un-retouched filter.
                var toRetouch = FakeWordsByFrequency.Value
                    .Where(t => t.Item2 > 2 && TestExecutor.Match(data, t.Item1, data.Parameters, tally))
                    .Take(parameters.RetouchWordCount.Value)
                    .ToArray();

                foreach (var entry in toRetouch)
                {
                    var prepared = ProcessWordForBloomFilter(parameters, entry.Item1);
                    bloom.retouch(prepared, parameters.RetouchMaxWeight ?? 0);
                }
            }
            else if (parameters.RetouchMinRelWeight != null)
            {
                var survivors = FakeWordsByFrequency.Value
                    .Where(t => t.Item2 > 2 && TestExecutor.Match(data, t.Item1, data.Parameters, tally))
                    .ToArray();

                // A second filter with the same m and k collects the surviving entries with their Item2 value.
                var survivorFilter = new BloomFilter(bloom.m, bloom.k);
                foreach (var survivor in survivors)
                {
                    var prepared = ProcessWordForBloomFilter(parameters, survivor.Item1);
                    survivorFilter.add(prepared, survivor.Item2);
                }

                bloom.retouch(survivorFilter, parameters.RetouchMinRelWeight.Value);
            }
        }
    }
}
/// <summary>
/// Removes excluded samples already covered by smaller sized excluded samples.
/// </summary>
/// <remarks>
/// The non-empty sample sets are ordered by the length of their first element. Each set is then passed,
/// together with every set that precedes it in that order, to <c>RemoveRedundantExclusionSamples</c>.
/// Two further reductions (samples covered by excluded pairs, and pairs covered by smaller pairs)
/// previously existed here only as commented-out code and are not performed.
/// </remarks>
/// <param name="data">Generated data whose excluded-sample sets are pruned in place.</param>
private void RemoveRedundantData(GeneratedData data)
{
    var setsBySampleLength = data.GetExcludedSamples()
        .Where(set => set.Count > 0)
        .OrderBy(set => set.First().Length)
        .ToArray();

    for (var current = 1; current < setsBySampleLength.Length; current++)
    {
        var candidates = setsBySampleLength[current];

        // Compare against every set with a smaller position, from the first upwards.
        foreach (var covering in setsBySampleLength.Take(current))
        {
            RemoveRedundantExclusionSamples(candidates, covering);
        }
    }
}