public void Serialize(GeneratedData data, string txtFilePath, string dataBinaryFilePath)
{
    if (File.Exists(txtFilePath))
    {
        File.Delete(txtFilePath);
    }

    if (File.Exists(dataBinaryFilePath))
    {
        File.Delete(dataBinaryFilePath);
    }

    using (var file = File.OpenWrite(txtFilePath))
    using (var writer = new StreamWriter(file, Encoding.ASCII))
    {
        // Excluded samples: one block per sample size, each sorted ordinally.
        foreach (var excludedSamples in data.GetExcludedSamples())
        {
            var excludedSamplesSorted = excludedSamples
                .OrderBy(v => v, StringComparer.Ordinal);

            WriteSameSizeLines(excludedSamplesSorted, writer);
        }

        // Excluded pairs: for each key, write the key line followed by its sorted values.
        foreach (var excludedPairs in data.GetExcludedPairs())
        {
            foreach (var kv in excludedPairs)
            {
                var sortedValues = kv.Value
                    .OrderBy(v => v, StringComparer.Ordinal);

                var lines = kv.Key.Yield().Concat(sortedValues);

                WriteSameSizeLines(lines, writer);
            }
        }

        writer.Flush();
    }

    if (!data.Parameters.BloomFilter.Disabled)
    {
        BloomFilter.Serialize(data.BloomFilter, dataBinaryFilePath);
    }
}
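// Note: Serialize and Deserialize both use a Yield() extension that is not part of this
// listing. It is assumed to be the usual "single value to one-element sequence" helper; the
// class name below is a placeholder and the project's actual implementation may differ.
//
// internal static class EnumerableExtensions
// {
//     public static IEnumerable<T> Yield<T>(this T item)
//     {
//         yield return item;
//     }
// }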
public GeneratedData Deserialize(GenerationParameters parameters, string txtPath, string binPath)
{
    var data = new GeneratedData
    {
        Parameters = parameters,
    };

    var prev = default(string);
    var exclusionSampleIndex = 0;

    // A default sentinel is appended to both arrays so that, after the last sample-size block
    // has been read, the index points past the real entries and the loop switches to pairs.
    var allSampleExclusionParameters = parameters.GetSampleExclusions()
        .Concat(default(SampleExclusionParameters).Yield())
        .ToArray();
    var allExcludedSamplesSets = data.GetExcludedSamples()
        .Concat(default(HashSet<string>).Yield())
        .ToArray();

    var excludedSamples = allExcludedSamplesSets[exclusionSampleIndex];
    var sampleExclusionParameters = allSampleExclusionParameters[exclusionSampleIndex];
    var exclusionPairKey = default(string);
    var exclusionPairValues = default(List<string>);

    foreach (var line in File.ReadLines(txtPath))
    {
        if (exclusionSampleIndex < allSampleExclusionParameters.Length - 1)
        {
            // Excluded-samples section: a blank line ends the block for the current sample size.
            if (line.Length == 0)
            {
                exclusionSampleIndex++;
                sampleExclusionParameters = allSampleExclusionParameters[exclusionSampleIndex];
                excludedSamples = allExcludedSamplesSets[exclusionSampleIndex];
                prev = null;
                continue;
            }

            var exclusionSample = line;
            if (DELTA_ENCODING)
            {
                // A shortened line only carries the suffix that differs from the previous
                // sample; restore the shared prefix from the previously decoded value.
                if (line.Length < sampleExclusionParameters.SampleSize)
                {
                    exclusionSample = prev.Substring(0, sampleExclusionParameters.SampleSize - line.Length) + line;
                }
            }

            excludedSamples.Add(exclusionSample);
            prev = exclusionSample;
        }
        else
        {
            // Excluded-pairs section: a key line, then its values, then a blank line closing the key.
            if (line.Length == 0)
            {
                var excludedPairs = exclusionPairKey.Length == data.Parameters.PairExclusion_1.SampleSize
                    ? data.ExcludedPairs_1
                    : data.ExcludedPairs_2;
                excludedPairs.Add(exclusionPairKey, new HashSet<string>(exclusionPairValues, StringComparer.Ordinal));
                exclusionPairKey = null;
                exclusionPairValues = null;
                prev = null;
                continue;
            }
            else if (exclusionPairKey == null)
            {
                exclusionPairKey = line;
                exclusionPairValues = new List<string>();
                prev = line;
            }
            else
            {
                var sampleSize = exclusionPairKey.Length;
                var exclusionValue = line;
                if (DELTA_ENCODING)
                {
                    if (line.Length < sampleSize)
                    {
                        exclusionValue = prev.Substring(0, sampleSize - line.Length) + line;
                    }
                }

                exclusionPairValues.Add(exclusionValue);
                prev = exclusionValue;
            }
        }
    }

    if (exclusionPairKey != null)
    {
        throw new Exception("Unexpected end of data");
    }

    if (!parameters.BloomFilter.Disabled)
    {
        data.BloomFilter = BloomFilter.Deserialize(binPath, parameters.BloomFilter.HashFunctionsCount);
    }

    return data;
}
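// WriteSameSizeLines is not shown in this listing. Judging by how Deserialize reverses the
// encoding (prev.Substring(0, sampleSize - line.Length) + line), it presumably front-codes
// each block of sorted, fixed-length lines: only the suffix that differs from the previous
// line is written, and a blank line terminates the block. The sketch below is an assumption
// of that scheme (hence the "Sketch" suffix), not the project's actual helper.
private static void WriteSameSizeLinesSketch(IEnumerable<string> sortedLines, StreamWriter writer)
{
    var prev = default(string);
    foreach (var line in sortedLines)
    {
        var output = line;
        if (DELTA_ENCODING && prev != null)
        {
            // Length of the prefix shared with the previous line; the decoder restores it
            // from the previously decoded value, so only the differing tail is written.
            var common = 0;
            while (common < line.Length && common < prev.Length && line[common] == prev[common])
            {
                common++;
            }

            output = line.Substring(common);
        }

        writer.WriteLine(output);
        prev = line;
    }

    // Blank separator line: Deserialize uses it to advance to the next sample size
    // or to close the current pair-exclusion key.
    writer.WriteLine();
}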
private void RemoveRedundantData(GeneratedData data)
{
    var excludedSamplesSets = data.GetExcludedSamples()
        .Where(es => es.Count > 0)
        .OrderBy(es => es.First().Length)
        .ToArray();

    // step 1: remove excluded samples already covered by smaller sized excluded samples
    {
        for (var index = 1; index < excludedSamplesSets.Length; index++)
        {
            for (var pIndex = 0; pIndex < index; pIndex++)
            {
                var excludedSamples = excludedSamplesSets[index];
                var alreadyExcludedSamples = excludedSamplesSets[pIndex];
                RemoveRedundantExclusionSamples(excludedSamples, alreadyExcludedSamples);
            }
        }
    }

    //// step 2: remove excluded samples already covered by pairs
    //{
    //    for (var index = 0; index < excludedSamplesSets.Length; index++)
    //    {
    //        var excludedSamples = excludedSamplesSets[index];
    //        var excludedSampleSize = excludedSamples.First().Length;
    //        var excludedPairsSets = data.GetExcludedPairs()
    //            .Where(s => s.Count > 0)
    //            .ToArray();

    //        foreach (var pairExclusion in excludedPairsSets)
    //        {
    //            var pairSampleSize = pairExclusion.First().Key.Length;
    //            if (excludedSampleSize <= pairSampleSize) continue;

    //            foreach (var excludedSample in excludedSamples.ToArray())
    //            {
    //                var lineSamples = SampleSplitter.SplitSamples(excludedSample, pairSampleSize);
    //                var linePairs =
    //                    from x in lineSamples //.Select((s, i) => new { s, i })
    //                    from y in lineSamples //.Select((s, i) => new { s, i })
    //                    where x.CompareTo(y) < 0
    //                    select Tuple.Create(x, y);
    //                //where x.i < y.i
    //                //select Tuple.Create(x.s, y.s);

    //                foreach (var linePair in linePairs)
    //                {
    //                    var excludedItem2s = default(HashSet<string>);
    //                    if (pairExclusion.TryGetValue(linePair.Item1, out excludedItem2s))
    //                    {
    //                        if (excludedItem2s.Contains(linePair.Item2))
    //                        {
    //                            excludedSamples.Remove(excludedSample);
    //                            break;
    //                        }
    //                    }
    //                }
    //            }
    //        }
    //    }
    //}

    //// step 3: remove pairs already covered by smaller pairs
    //{
    //    var excludedPairsSets = data.GetExcludedPairs()
    //        .Where(es => es.Count > 0)
    //        .OrderBy(es => es.First().Key.Length)
    //        .ToArray();

    //    for (var index = 1; index < excludedPairsSets.Length; index++)
    //    {
    //        for (var pIndex = 0; pIndex < index; pIndex++)
    //        {
    //            var excludedPairs = excludedPairsSets[index];
    //            var alreadyExcludedPairs = excludedPairsSets[pIndex];
    //            RemoveRedundantExclusionPairs(excludedPairs, alreadyExcludedPairs);
    //        }
    //    }
    //}
}
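// RemoveRedundantExclusionSamples is called above but not shown. Given that the commented-out
// step 2 uses SampleSplitter.SplitSamples to break a sample into smaller sub-samples, step 1
// presumably drops any larger sample that already contains a smaller excluded sample, since such
// a sample can never survive the smaller-size exclusion anyway. The sketch below is a guess at
// that logic (the "Sketch" suffix and the assumption that SplitSamples returns the smaller-size
// sub-samples are both mine, not the project's):
private static void RemoveRedundantExclusionSamplesSketch(
    HashSet<string> excludedSamples,
    HashSet<string> alreadyExcludedSamples)
{
    if (alreadyExcludedSamples.Count == 0)
    {
        return;
    }

    var smallerSampleSize = alreadyExcludedSamples.First().Length;

    foreach (var excludedSample in excludedSamples.ToArray())
    {
        // If any smaller-size sub-sample is already excluded, keeping the larger sample
        // adds no new exclusions.
        var subSamples = SampleSplitter.SplitSamples(excludedSample, smallerSampleSize);
        if (subSamples.Any(alreadyExcludedSamples.Contains))
        {
            excludedSamples.Remove(excludedSample);
        }
    }
}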