Ejemplo n.º 1
0
        /// <summary>
        /// Builds a <see cref="GeneratedData"/> instance for the supplied parameters: computes the
        /// three excluded-sample sets and the two excluded-pair sets, removes redundant entries and
        /// finally builds the bloom filter.
        /// </summary>
        /// <param name="generationParameters">
        /// Settings for every exclusion stage and for the bloom filter; also stored on the result.
        /// </param>
        /// <returns>The populated data object.</returns>
        public GeneratedData Generate(GenerationParameters generationParameters)
        {
            // The whole run executes inside a Timer scope (presumably for timing diagnostics).
            using (new Timer("DataGenerator.Generate"))
            {
                // Sample exclusions are computed first, then pair exclusions.
                var samplesStage1 = GetExcludedSamples(generationParameters.SampleExclusion_1);
                var samplesStage2 = GetExcludedSamples(generationParameters.SampleExclusion_2);
                var samplesStage3 = GetExcludedSamples(generationParameters.SampleExclusion_3);

                var pairsStage1 = GetExcludedPairs(generationParameters.PairExclusion_1);
                var pairsStage2 = GetExcludedPairs(generationParameters.PairExclusion_2);

                var generated = new GeneratedData
                {
                    Parameters        = generationParameters,
                    ExcludedSamples_1 = samplesStage1,
                    ExcludedSamples_2 = samplesStage2,
                    ExcludedSamples_3 = samplesStage3,
                    ExcludedPairs_1   = pairsStage1,
                    ExcludedPairs_2   = pairsStage2,
                };

                // Redundant data must be dropped before the bloom filter is built from the final sets.
                RemoveRedundantData(generated);
                BuildBloomFilter(generationParameters.BloomFilter, generated);

                return generated;
            }
        }
        /// <summary>
        /// Runs <paramref name="value"/> through the filter stages in a fixed order: pre-exclusion,
        /// sample exclusions 1-3, pair exclusions 1-2 and, unless skipped, the bloom filter.
        /// The first stage that rejects the value increments its counter and ends the evaluation.
        /// </summary>
        /// <param name="data">Generated exclusion sets and bloom filter to test against.</param>
        /// <param name="value">The value being tested.</param>
        /// <param name="parameters">Parameters used by the sample- and pair-exclusion stages.</param>
        /// <param name="counter">Receives one increment on the counter of the rejecting stage.</param>
        /// <param name="skipBloomFilter">When <c>true</c>, the bloom filter stage is not evaluated.</param>
        /// <returns><c>true</c> if no stage rejected the value; otherwise <c>false</c>.</returns>
        public static bool Match(GeneratedData data, string value, GenerationParameters parameters, MatchCounter counter, bool skipBloomFilter = false)
        {
            if (SampleSplitter.PreExcludeValue(value))
            {
                counter.PreExclusion++;
            }
            else if (!MatchSampleExclusion(value, data.ExcludedSamples_1, parameters.SampleExclusion_1))
            {
                counter.SampleExclusion_1++;
            }
            else if (!MatchSampleExclusion(value, data.ExcludedSamples_2, parameters.SampleExclusion_2))
            {
                counter.SampleExclusion_2++;
            }
            else if (!MatchSampleExclusion(value, data.ExcludedSamples_3, parameters.SampleExclusion_3))
            {
                counter.SampleExclusion_3++;
            }
            else if (!MatchPairExclusion(value, data.ExcludedPairs_1, parameters.PairExclusion_1))
            {
                counter.PairExclusion++;
            }
            else if (!MatchPairExclusion(value, data.ExcludedPairs_2, parameters.PairExclusion_2))
            {
                counter.PairExclusion2++;
            }
            // NOTE: the bloom filter settings come from data.Parameters, not from the
            // 'parameters' argument used by the earlier stages.
            else if (!skipBloomFilter && !MatchBloomFilter(value, data.BloomFilter, data.Parameters.BloomFilter))
            {
                counter.BloomFilter++;
            }
            else
            {
                // Every stage accepted the value.
                return true;
            }

            // Exactly one of the stages above rejected the value and recorded it.
            return false;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Writes the generated data to disk. Excluded samples and excluded pairs go to an
        /// ASCII text file; the bloom filter goes to a separate binary file unless it is disabled.
        /// </summary>
        /// <param name="data">The data to persist.</param>
        /// <param name="txtFilePath">Destination of the text file; an existing file is replaced.</param>
        /// <param name="dataBinaryFilePath">
        /// Destination of the bloom filter file. An existing file is always deleted, even when the
        /// bloom filter is disabled and no new file is written.
        /// </param>
        public void Serialize(GeneratedData data, string txtFilePath, string dataBinaryFilePath)
        {
            // Remove leftovers from a previous run (text file first, then binary file).
            foreach (var stalePath in new[] { txtFilePath, dataBinaryFilePath })
            {
                if (File.Exists(stalePath))
                {
                    File.Delete(stalePath);
                }
            }

            using (var stream = File.OpenWrite(txtFilePath))
            using (var output = new StreamWriter(stream, Encoding.ASCII))
            {
                // Each excluded-sample set is written as one group, in ordinal order.
                foreach (var sampleSet in data.GetExcludedSamples())
                {
                    var orderedSamples = sampleSet.OrderBy(s => s, StringComparer.Ordinal);
                    WriteSameSizeLines(orderedSamples, output);
                }

                // Each pair entry is written as one group: the key followed by its ordinally sorted values.
                foreach (var pairSet in data.GetExcludedPairs())
                {
                    foreach (var entry in pairSet)
                    {
                        var orderedValues = entry.Value.OrderBy(s => s, StringComparer.Ordinal);
                        var group         = entry.Key.Yield().Concat(orderedValues);

                        WriteSameSizeLines(group, output);
                    }
                }

                output.Flush();
            }

            if (data.Parameters.BloomFilter.Disabled)
            {
                return;
            }

            BloomFilter.Serialize(data.BloomFilter, dataBinaryFilePath);
        }
        /// <summary>
        /// Evaluates <paramref name="data"/> against every test case in <c>TestCases.Value</c> and
        /// aggregates the outcome.
        /// </summary>
        /// <remarks>
        /// For each word of each test case the result of <c>Match</c> is compared with the expected
        /// value. A test case's score is the number of words classified correctly. All "Avg*" values
        /// are totals divided by the number of test cases.
        /// </remarks>
        /// <param name="data">Generated data to evaluate; its <c>Parameters</c> are passed to <c>Match</c>.</param>
        /// <returns>Per-stage rejection averages, confusion-matrix averages and score statistics.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown by the LINQ score aggregates (<c>Min</c>, <c>Max</c>, <c>Average</c>) when there are
        /// no test cases.
        /// </exception>
        public TestResult Test(GeneratedData data)
        {
            var generationParameters = data.Parameters;

            var scores                  = new List<int>();
            var falsePositives          = new List<string>();
            var totalTruePositiveCount  = 0;
            var totalTrueNegativeCount  = 0;
            var totalFalsePositiveCount = 0;
            var totalFalseNegativeCount = 0;
            var matchCounter            = new MatchCounter();

            var testCases = TestCases.Value;

            foreach (var testCase in testCases)
            {
                var score = 0;

                foreach (var value in testCase.Item2)
                {
                    var word     = value.Key;
                    var expected = value.Value;

                    var result = Match(data, word, generationParameters, matchCounter);

                    if (result == expected)
                    {
                        // Correct classification: counts towards the score of this test case.
                        score++;
                        if (result)
                        {
                            totalTruePositiveCount++;
                        }
                        else
                        {
                            totalTrueNegativeCount++;
                        }
                    }
                    else if (result)
                    {
                        // Accepted although it should have been rejected; keep the word for reporting.
                        falsePositives.Add(word);
                        totalFalsePositiveCount++;
                    }
                    else
                    {
                        totalFalseNegativeCount++;
                    }
                }

                scores.Add(score);
            }

            // Evaluate the count once: Count() may enumerate the whole source on every call,
            // and it was previously invoked separately for each average below.
            var testCaseCount = (double)testCases.Count();

            return new TestResult
            {
                FalsePositives = falsePositives,
                AvgPreExclusionMatches = matchCounter.PreExclusion / testCaseCount,
                AvgSampleExclusion1Matches = matchCounter.SampleExclusion_1 / testCaseCount,
                AvgSampleExclusion2Matches = matchCounter.SampleExclusion_2 / testCaseCount,
                AvgSampleExclusion3Matches = matchCounter.SampleExclusion_3 / testCaseCount,
                AvgPairExclusionMatches = matchCounter.PairExclusion / testCaseCount,
                AvgPairExclusion2Matches = matchCounter.PairExclusion2 / testCaseCount,
                AvgBloomFilterMatches = matchCounter.BloomFilter / testCaseCount,
                AvgFalsePositives = totalFalsePositiveCount / testCaseCount,
                AvgFalseNegatives = totalFalseNegativeCount / testCaseCount,
                AvgTruePositives = totalTruePositiveCount / testCaseCount,
                AvgTrueNegatives = totalTrueNegativeCount / testCaseCount,
                MinScore = scores.Min(),
                MaxScore = scores.Max(),
                StdDevScore = scores.Select(s => (double)s).StdDev(),
                AvgScore = scores.Average(),
            };
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Rebuilds a <see cref="GeneratedData"/> instance from the text file at
        /// <paramref name="txtPath"/> and, unless the bloom filter is disabled, the binary file at
        /// <paramref name="binPath"/>.
        /// </summary>
        /// <remarks>
        /// The text file is read line by line as a sequence of groups, each terminated by an empty line.
        /// The first groups (one per entry of <c>parameters.GetSampleExclusions()</c>) hold the
        /// excluded samples. Every following group is a pair entry: its first line is the key and
        /// the remaining lines are the values. A pair entry is stored in <c>ExcludedPairs_1</c> when
        /// the key length equals <c>PairExclusion_1.SampleSize</c>, otherwise in <c>ExcludedPairs_2</c>.
        /// When <c>DELTA_ENCODING</c> is on, a line shorter than the sample size is completed with
        /// the leading characters of the previously decoded line of the same group.
        /// </remarks>
        /// <param name="parameters">Generation parameters; stored on the result and used to interpret the file.</param>
        /// <param name="txtPath">Path of the text file with excluded samples and pairs.</param>
        /// <param name="binPath">Path of the bloom filter file; only read when the filter is enabled.</param>
        /// <returns>The reconstructed data.</returns>
        /// <exception cref="InvalidDataException">
        /// The text file ends in the middle of a pair entry, contains an empty line where a pair key
        /// is expected, or contains a shortened line that cannot be completed from the previous line.
        /// </exception>
        public GeneratedData Deserialize(GenerationParameters parameters, string txtPath, string binPath)
        {
            var data = new GeneratedData
            {
                Parameters = parameters,
            };

            var prev = default(string);
            var exclusionSampleIndex = 0;

            // Both arrays get one trailing default (null) entry so that the index reached after the
            // last sample group is still valid; reaching it switches the loop to pair parsing.
            var allSampleExclusionParameters = parameters.GetSampleExclusions()
                                               .Concat(default(SampleExclusionParameters).Yield())
                                               .ToArray();

            var allExcludedParameters = data.GetExcludedSamples()
                                        .Concat(default(HashSet<string>).Yield())
                                        .ToArray();

            var excludedSamples           = allExcludedParameters[exclusionSampleIndex];
            var sampleExclusionParameters = allSampleExclusionParameters[exclusionSampleIndex];

            var exclusionPairKey    = default(string);
            var exclusionPairValues = default(List<string>);

            foreach (var line in File.ReadLines(txtPath))
            {
                if (exclusionSampleIndex < allSampleExclusionParameters.Length - 1)
                {
                    // --- excluded-sample groups ---
                    if (line.Length == 0)
                    {
                        // End of the current group: advance to the next sample set (or to the sentinel).
                        exclusionSampleIndex++;
                        sampleExclusionParameters = allSampleExclusionParameters[exclusionSampleIndex];
                        excludedSamples           = allExcludedParameters[exclusionSampleIndex];
                        prev = null;
                        continue;
                    }

                    var exclusionSample = DELTA_ENCODING
                        ? RestoreDeltaPrefix(line, prev, sampleExclusionParameters.SampleSize)
                        : line;

                    excludedSamples.Add(exclusionSample);
                    prev = exclusionSample;
                }
                else
                {
                    // --- excluded-pair groups ---
                    if (line.Length == 0)
                    {
                        if (exclusionPairKey == null)
                        {
                            // Previously surfaced as a NullReferenceException.
                            throw new InvalidDataException("Unexpected empty line: no pair key has been read");
                        }

                        var excludedPairs = exclusionPairKey.Length == data.Parameters.PairExclusion_1.SampleSize ?
                                            data.ExcludedPairs_1 : data.ExcludedPairs_2;
                        excludedPairs.Add(exclusionPairKey, new HashSet<string>(exclusionPairValues, StringComparer.Ordinal));
                        exclusionPairKey    = null;
                        exclusionPairValues = null;
                        prev = null;
                        continue;
                    }
                    else if (exclusionPairKey == null)
                    {
                        // First line of a group is the key; it also seeds delta decoding of the values.
                        exclusionPairKey    = line;
                        exclusionPairValues = new List<string>();
                        prev = line;
                    }
                    else
                    {
                        // Values have the same size as their key.
                        var exclusionValue = DELTA_ENCODING
                            ? RestoreDeltaPrefix(line, prev, exclusionPairKey.Length)
                            : line;

                        exclusionPairValues.Add(exclusionValue);
                        prev = exclusionValue;
                    }
                }
            }

            // A pair entry is only committed by its terminating empty line.
            if (exclusionPairKey != null)
            {
                throw new InvalidDataException("Unexpected end of data");
            }

            if (!parameters.BloomFilter.Disabled)
            {
                data.BloomFilter = BloomFilter.Deserialize(binPath, parameters.BloomFilter.HashFunctionsCount);
            }

            return data;
        }

        /// <summary>
        /// Completes a shortened line with the leading characters of the previously decoded line so
        /// that the result has <paramref name="sampleSize"/> characters. Lines that already have at
        /// least <paramref name="sampleSize"/> characters are returned unchanged.
        /// </summary>
        private static string RestoreDeltaPrefix(string line, string prev, int sampleSize)
        {
            if (line.Length >= sampleSize)
            {
                return line;
            }

            var missing = sampleSize - line.Length;
            if (prev == null || prev.Length < missing)
            {
                // Previously surfaced as a NullReferenceException / ArgumentOutOfRangeException.
                throw new InvalidDataException("Shortened line cannot be completed from the previous line: " + line);
            }

            return prev.Substring(0, missing) + line;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Creates the bloom filter for <paramref name="data"/> and stores it in
        /// <c>data.BloomFilter</c>. Does nothing when the filter is disabled.
        /// </summary>
        /// <remarks>
        /// Only words from <c>Words.Value</c> that pass every exclusion stage (bloom filter stage
        /// skipped) are added. Afterwards the filter is optionally "retouched" using entries of
        /// <c>FakeWordsByFrequency.Value</c> that the complete matcher still accepts: either a limited
        /// number of entries (<c>RetouchWordCount</c>), or all of them, collected into a second
        /// filter (<c>RetouchMinRelWeight</c>).
        /// </remarks>
        /// <param name="parameters">Bloom filter settings.</param>
        /// <param name="data">Generated data that receives the filter; also used for matching.</param>
        private void BuildBloomFilter(BloomFilterParameters parameters, GeneratedData data)
        {
            if (parameters.Disabled)
            {
                return;
            }

            using (new Timer("DataGenerator.BuildBloomFilter"))
            {
                var bloom        = new BloomFilter(parameters.FilterSizeBytes * 8, parameters.HashFunctionsCount);
                var matchCounter = new TestExecutor.MatchCounter();

                foreach (var candidate in Words.Value)
                {
                    // The filter does not exist yet, so only the exclusion stages are evaluated.
                    if (TestExecutor.Match(data, candidate, data.Parameters, matchCounter, skipBloomFilter: true))
                    {
                        bloom.add(ProcessWordForBloomFilter(parameters, candidate));
                    }
                }

                // Must be assigned before retouching: the matches below include the bloom filter stage.
                data.BloomFilter = bloom;

                using (new Timer("DataGenerator.BuildBloomFilter[retouch]"))
                {
                    if (parameters.RetouchWordCount > 0)
                    {
                        // Materialise first; the filter is modified while the entries are processed.
                        var selected = FakeWordsByFrequency.Value
                                       .Where(t => t.Item2 > 2 && TestExecutor.Match(data, t.Item1, data.Parameters, matchCounter))
                                       .Take(parameters.RetouchWordCount.Value)
                                       .ToArray();

                        foreach (var entry in selected)
                        {
                            var key = ProcessWordForBloomFilter(parameters, entry.Item1);
                            bloom.retouch(key, parameters.RetouchMaxWeight ?? 0);
                        }
                    }
                    else if (parameters.RetouchMinRelWeight.HasValue)
                    {
                        var accepted = FakeWordsByFrequency.Value
                                       .Where(t => t.Item2 > 2 && TestExecutor.Match(data, t.Item1, data.Parameters, matchCounter))
                                       .ToArray();

                        // Second filter with the same m and k, fed with the accepted entries and their Item2 values.
                        var acceptedFilter = new BloomFilter(bloom.m, bloom.k);
                        foreach (var entry in accepted)
                        {
                            acceptedFilter.add(ProcessWordForBloomFilter(parameters, entry.Item1), entry.Item2);
                        }

                        bloom.retouch(acceptedFilter, parameters.RetouchMinRelWeight.Value);
                    }
                }
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Removes entries from the excluded-sample sets of <paramref name="data"/> that are
        /// already covered by a set of smaller samples.
        /// </summary>
        /// <remarks>
        /// Non-empty sets are ordered by the length of their first element (this presumably relies
        /// on all samples of a set having the same length). Each set is then reduced against every
        /// set that precedes it in that order. Steps 2 and 3 below are disabled and kept for reference.
        /// </remarks>
        /// <param name="data">Generated data whose excluded-sample sets are reduced in place.</param>
        private void RemoveRedundantData(GeneratedData data)
        {
            var setsBySampleSize = data.GetExcludedSamples()
                                   .Where(set => set.Count > 0)
                                   .OrderBy(set => set.First().Length)
                                   .ToArray();

            // step 1: remove excluded samples already covered by smaller sized excluded samples
            for (var current = 1; current < setsBySampleSize.Length; current++)
            {
                var largerSet = setsBySampleSize[current];

                for (var smaller = 0; smaller < current; smaller++)
                {
                    RemoveRedundantExclusionSamples(largerSet, setsBySampleSize[smaller]);
                }
            }

            //// step 2 (disabled): remove excluded samples already covered by pairs
            //{

            //	for (var index = 0; index < excludedSamplesSets.Length; index++)
            //	{
            //		var excludedSamples = excludedSamplesSets[index];
            //		var excludedSampleSize = excludedSamples.First().Length;
            //		var excludedPairsSets = data.GetExcludedPairs()
            //			.Where(s => s.Count > 0)
            //			.ToArray();

            //		foreach (var pairExclusion in excludedPairsSets)
            //		{
            //			var pairSampleSize = pairExclusion.First().Key.Length;

            //			if (excludedSampleSize <= pairSampleSize) continue;

            //			foreach (var excludedSample in excludedSamples.ToArray())
            //			{
            //				var lineSamples = SampleSplitter.SplitSamples(excludedSample, pairSampleSize);
            //				var linePairs =
            //					from x in lineSamples //.Select((s, i) => new { s, i })
            //					from y in lineSamples //.Select((s, i) => new { s, i })
            //					where x.CompareTo(y) < 0
            //					select Tuple.Create(x, y);
            //				//where x.i < y.i
            //				//select Tuple.Create(x.s, y.s);

            //				foreach (var linePair in linePairs)
            //				{
            //					var excludedItem2s = default(HashSet<string>);
            //					if (pairExclusion.TryGetValue(linePair.Item1, out excludedItem2s))
            //					{
            //						if (excludedItem2s.Contains(linePair.Item2))
            //						{
            //							excludedSamples.Remove(excludedSample);
            //							break;
            //						}
            //					}
            //				}
            //			}
            //		}
            //	}
            //}

            //// step 3 (disabled): remove pairs already covered by smaller pairs
            //{
            //	var excludedPairsSets = data.GetExcludedPairs()
            //		.Where(es => es.Count > 0)
            //		.OrderBy(es => es.First().Key.Length)
            //		.ToArray();

            //	for (var index = 1; index < excludedPairsSets.Length; index++)
            //	{
            //		for (var pIndex = 0; pIndex < index; pIndex++)
            //		{
            //			var excludedPairs = excludedPairsSets[index];
            //			var alreadyExcludedPairs = excludedPairsSets[pIndex];
            //			RemoveRedundantExclusionPairs(excludedPairs, alreadyExcludedPairs);
            //		}
            //	}
            //}
        }