/// <summary>
/// Reads the file at <c>Settings.WordsPath</c> line by line and keeps every line
/// that <c>SampleSplitter.PreExcludeValue</c> does not reject.
/// </summary>
/// <returns>
/// The retained lines, in file order, materialized into an array.
/// Duplicate lines are kept; no Distinct() step is applied.
/// </returns>
private static IEnumerable<string> GetWords()
{
    var kept = new List<string>();

    foreach (var line in File.ReadLines(Settings.WordsPath))
    {
        if (!SampleSplitter.PreExcludeValue(line))
        {
            kept.Add(line);
        }
    }

    return kept.ToArray();
}
/// <summary>
/// Checks <paramref name="value"/> against a set of excluded samples.
/// </summary>
/// <param name="value">The candidate string to split into samples.</param>
/// <param name="data">The excluded samples to look up.</param>
/// <param name="parameters">Splitting settings; when <c>Disabled</c> is set the check always passes.</param>
/// <returns>
/// <c>false</c> as soon as one sample of <paramref name="value"/> is found in
/// <paramref name="data"/>; <c>true</c> otherwise.
/// </returns>
private static bool MatchSampleExclusion(string value, HashSet<string> data, SampleExclusionParameters parameters)
{
    if (!parameters.Disabled)
    {
        var candidates = SampleSplitter.SplitSamples(
            value,
            parameters.SampleSize,
            parameters.BeginAnchor,
            parameters.EndAnchor,
            parameters.GetMinWordSize(),
            parameters.StartIndex,
            parameters.Length);

        // Any() stops at the first hit, just like an early return from a loop.
        if (candidates.Any(candidate => data.Contains(candidate)))
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Runs <paramref name="value"/> through every filter stage in a fixed order and
/// reports whether it passes all of them.
/// </summary>
/// <param name="data">Generated lookup data (excluded samples, excluded pairs, bloom filter).</param>
/// <param name="value">The candidate string.</param>
/// <param name="parameters">Settings for the sample and pair exclusion stages.</param>
/// <param name="counter">Receives one increment on the field of the first stage that rejects the value.</param>
/// <param name="skipBloomFilter">When <c>true</c> the final bloom filter stage is not evaluated.</param>
/// <returns><c>true</c> when no stage rejects the value; otherwise <c>false</c>.</returns>
/// <remarks>
/// The bloom filter stage takes its settings from <c>data.Parameters</c>, not from
/// the <paramref name="parameters"/> argument.
/// </remarks>
public static bool Match(GeneratedData data, string value, GenerationParameters parameters, MatchCounter counter, bool skipBloomFilter = false)
{
    // The chain is evaluated top to bottom; only the first rejecting stage is counted.
    if (SampleSplitter.PreExcludeValue(value))
    {
        // if (!skipBloomFilter) Console.WriteLine("preex: " + value);
        counter.PreExclusion++;
    }
    else if (!MatchSampleExclusion(value, data.ExcludedSamples_1, parameters.SampleExclusion_1))
    {
        counter.SampleExclusion_1++;
    }
    else if (!MatchSampleExclusion(value, data.ExcludedSamples_2, parameters.SampleExclusion_2))
    {
        // if (!skipBloomFilter) Console.WriteLine("se: " + value);
        counter.SampleExclusion_2++;
    }
    else if (!MatchSampleExclusion(value, data.ExcludedSamples_3, parameters.SampleExclusion_3))
    {
        counter.SampleExclusion_3++;
    }
    else if (!MatchPairExclusion(value, data.ExcludedPairs_1, parameters.PairExclusion_1))
    {
        counter.PairExclusion++;
    }
    else if (!MatchPairExclusion(value, data.ExcludedPairs_2, parameters.PairExclusion_2))
    {
        // if (!skipBloomFilter) Console.WriteLine("pe: " + value);
        counter.PairExclusion2++;
    }
    else if (!skipBloomFilter && !MatchBloomFilter(value, data.BloomFilter, data.Parameters.BloomFilter))
    {
        // if (!skipBloomFilter) Console.WriteLine("bf: " + value);
        counter.BloomFilter++;
    }
    else
    {
        // if (!skipBloomFilter) Console.WriteLine("pass: " + value);
        return true;
    }

    return false;
}
/// <summary>
/// Builds the sequence of characters a sample may contain: the 26 letters
/// 'a'..'z', the apostrophe, <c>SampleSplitter.ANCHOR</c>, and whatever
/// <c>SampleSplitter.GetAdditionalChars()</c> supplies, in that order.
/// </summary>
/// <returns>
/// A lazily evaluated character sequence. When <c>EXCLUDE_APOS</c> is set, every
/// apostrophe is filtered out of the whole sequence, additional characters included.
/// </returns>
internal static IEnumerable<char> GetPossibleChars()
{
    var letters = Enumerable.Range('a', 26).Select(code => (char)code);
    var apostrophe = '\''.Yield();
    var anchor = SampleSplitter.ANCHOR.Yield();
    var extras = SampleSplitter.GetAdditionalChars();

    var all = letters
        .Concat(apostrophe)
        .Concat(anchor)
        .Concat(extras);

    return EXCLUDE_APOS
        ? all.Where(c => c != '\'')
        : all;
}
/// <summary>
/// Removes from <paramref name="excluded"/> every sample that is already covered
/// by <paramref name="alreadyExcluded"/>, i.e. every sample that has at least one
/// sub-sample (split at the length of the already-excluded samples) present in
/// <paramref name="alreadyExcluded"/>.
/// </summary>
/// <param name="excluded">The set to prune; modified in place.</param>
/// <param name="alreadyExcluded">Samples excluded by an earlier stage; only read.</param>
/// <remarks>
/// If either set is empty there is nothing to prune and the method returns
/// without touching <paramref name="excluded"/> (calling <c>First()</c> on an
/// empty set would otherwise throw <see cref="InvalidOperationException"/>).
/// </remarks>
private static void RemoveRedundantExclusionSamples(HashSet<string> excluded, HashSet<string> alreadyExcluded)
{
    if (excluded.Count == 0 || alreadyExcluded.Count == 0)
    {
        return;
    }

    // The sub-sample length is taken from an arbitrary element of the earlier set.
    // NOTE(review): this presumes all entries of alreadyExcluded share one length - confirm.
    var previousSampleSize = alreadyExcluded.First().Length;

    excluded.RemoveWhere(sample =>
        SampleSplitter.SplitSamples(sample, previousSampleSize)
            .Any(subSample => alreadyExcluded.Contains(subSample)));
}
/// <summary>
/// Removes from <paramref name="excluded"/> every pair that is already covered by
/// a pair in <paramref name="alreadyExcluded"/>, then drops keys whose value set
/// has become empty.
/// </summary>
/// <param name="excluded">
/// Pairs to prune, stored as key -> set of second items; modified in place.
/// Only pairs whose key sorts ordinally before the second item are considered.
/// </param>
/// <param name="alreadyExcluded">Pairs excluded by an earlier stage; only read.</param>
/// <remarks>
/// A pair is covered when some sub-sample of its first item and some sub-sample of
/// its second item (both split at the key length of <paramref name="alreadyExcluded"/>,
/// ordered ordinally) form a pair found in <paramref name="alreadyExcluded"/>.
/// When either dictionary is empty the pruning step is skipped instead of throwing
/// <see cref="InvalidOperationException"/> from <c>First()</c>.
/// </remarks>
private void RemoveRedundantExclusionPairs(Dictionary<string, HashSet<string>> excluded, Dictionary<string, HashSet<string>> alreadyExcluded)
{
    if (excluded.Count > 0 && alreadyExcluded.Count > 0)
    {
        // NOTE(review): presumes every key of alreadyExcluded has the same length - confirm.
        var previousSampleSize = alreadyExcluded.First().Key.Length;

        // Snapshot the pairs first: the value sets are mutated while we iterate.
        var pairs = (from e in excluded
                     from y in e.Value
                     where String.CompareOrdinal(e.Key, y) < 0
                     select Tuple.Create(e.Key, y)).ToArray();

        foreach (var pair in pairs)
        {
            var subSamples1 = SampleSplitter.SplitSamples(pair.Item1, previousSampleSize);
            // Materialized because it is re-enumerated once per element of subSamples1.
            var subSamples2 = SampleSplitter.SplitSamples(pair.Item2, previousSampleSize).ToArray();

            var itemPairs = from x in subSamples1
                            from y in subSamples2
                            where String.CompareOrdinal(x, y) < 0
                            select Tuple.Create(x, y);

            foreach (var ip in itemPairs)
            {
                HashSet<string> excludedItem2s;
                if (alreadyExcluded.TryGetValue(ip.Item1, out excludedItem2s)
                    && excludedItem2s.Contains(ip.Item2))
                {
                    excluded[pair.Item1].Remove(pair.Item2);
                    break;
                }
            }
        }
    }

    // Drop keys that no longer exclude anything.
    foreach (var p in excluded.ToArray())
    {
        if (p.Value.Count == 0)
        {
            excluded.Remove(p.Key);
        }
    }
}
//private void WriteRetouched(int id, Tuple<string, int>[] retouched)
//{
//    var path = Path.Combine(Settings.TempFolder, String.Format("{0:000}-retouched.txt", id));
//    if (File.Exists(path)) File.Delete(path);
//    File.WriteAllLines(path, retouched.Select(r => String.Format("{0} {1}", r.Item1, r.Item2)));
//}

/// <summary>
/// Transforms <paramref name="word"/> into the form used with the bloom filter.
/// </summary>
/// <param name="parameters">
/// Controls the three steps: <c>SubstitutionCount</c> is forwarded to
/// <c>SubstituteMostCommonSequences</c>; a non-null <c>SubstringStartIndex</c>
/// enables pre-sampling and truncation to <c>SubstringLength</c>; a non-null
/// <c>CharOffset</c> enables the per-character shift.
/// </param>
/// <param name="word">The input word.</param>
/// <returns>The processed string.</returns>
public static string ProcessWordForBloomFilter(BloomFilterParameters parameters, string word)
{
    var result = SubstituteMostCommonSequences(word, parameters.SubstitutionCount);

    if (parameters.SubstringStartIndex != null)
    {
        // NOTE(review): SubstringStartIndex only switches this step on; the cut below
        // always starts at index 0, and SubstringLength.Value is read without a null
        // check - confirm both are intended.
        result = SampleSplitter.PreSampleForBFValue(result);
        var keep = Math.Min(parameters.SubstringLength.Value, result.Length);
        result = result.Substring(0, keep);
    }

    if (parameters.CharOffset != null)
    {
        // Re-base each character: distance from 'a' plus the configured offset.
        var shifted = new char[result.Length];
        for (var i = 0; i < shifted.Length; i++)
        {
            shifted[i] = (char)(result[i] - 'a' + parameters.CharOffset.Value);
        }

        result = new String(shifted);
    }

    return result;
}
/// <summary>
/// Counts how often each sample occurs across all entries of <c>Words.Value</c>.
/// </summary>
/// <param name="size">Sample size forwarded to <c>SampleSplitter.SplitSamples</c>.</param>
/// <param name="beginAnchor">Forwarded to the splitter.</param>
/// <param name="endAnchor">Forwarded to the splitter.</param>
/// <param name="minWordSize">Forwarded to the splitter.</param>
/// <param name="startIndex">Forwarded to the splitter.</param>
/// <param name="length">Optional length forwarded to the splitter.</param>
/// <returns>
/// A dictionary (ordinal key comparison) mapping each sample to its number of
/// occurrences; a sample appearing several times in one word is counted each time.
/// </returns>
public static Dictionary<string, int> GetSamples(int size, bool beginAnchor, bool endAnchor, int minWordSize, int startIndex, int? length = null)
{
    var counts = new Dictionary<string, int>(StringComparer.Ordinal);

    foreach (var word in Words.Value)
    {
        foreach (var sample in SampleSplitter.SplitSamples(word, size, beginAnchor, endAnchor, minWordSize, startIndex, length))
        {
            // A missing key leaves occurrences at 0, so the first sighting stores 1.
            int occurrences;
            counts.TryGetValue(sample, out occurrences);
            counts[sample] = occurrences + 1;
        }
    }

    return counts;
}
/// <summary>
/// Enumerates every character combination of length <paramref name="sampleSize"/>
/// over <c>SampleSplitter.GetPossibleChars()</c>, then passes the result through
/// <c>SampleSplitter.ExcludeImpossibleSamples</c>.
/// </summary>
/// <param name="sampleSize">Number of characters per sample.</param>
/// <param name="beginAnchor">When <c>true</c> the anchor character is allowed in the first position.</param>
/// <param name="endAnchor">When <c>true</c> the anchor character is allowed in the last position.</param>
/// <returns>The filtered combinations; the combination query itself is built lazily.</returns>
private static IEnumerable<string> GetAllPossibleSamples(int sampleSize, bool beginAnchor, bool endAnchor)
{
    var allChars = SampleSplitter.GetPossibleChars();
    var nonAnchorChars = allChars.Where(c => c != SampleSplitter.ANCHOR);

    var firstChars = beginAnchor ? allChars : nonAnchorChars;
    var combos = firstChars.Select(c => c.ToString());

    for (var position = 1; position < sampleSize; position++)
    {
        // Declared inside the loop so each deferred stage captures its own alphabet.
        var alphabet = (endAnchor && position == (sampleSize - 1)) ? allChars : nonAnchorChars;
        combos = combos.SelectMany(prefix => alphabet, (prefix, next) => prefix + next);
    }

    return SampleSplitter.ExcludeImpossibleSamples(combos, sampleSize, beginAnchor, endAnchor);
}
/// <summary>
/// Checks <paramref name="value"/> against a table of excluded sample pairs.
/// </summary>
/// <param name="value">The candidate string to split into samples.</param>
/// <param name="data">
/// Excluded pairs, stored as first sample -> set of second samples, where the
/// first sample sorts ordinally before the second.
/// </param>
/// <param name="parameters">Splitting settings; when <c>Disabled</c> is set the check always passes.</param>
/// <returns>
/// <c>false</c> when any two samples of <paramref name="value"/> form an excluded
/// pair; <c>true</c> otherwise.
/// </returns>
/// <remarks>
/// Pairs are ordered with <see cref="String.CompareOrdinal(string, string)"/>, the
/// same ordering used when the pair table is built. A culture-sensitive comparison
/// can order samples containing apostrophes differently and would then look the
/// pair up under the wrong key.
/// </remarks>
public static bool MatchPairExclusion(string value, Dictionary<string, HashSet<string>> data, PairExclusionParameters parameters)
{
    if (parameters.Disabled)
    {
        return true;
    }

    // Materialized once: the samples are scanned once per candidate first item.
    var wordSamples = SampleSplitter.SplitSamples(
        value,
        parameters.SampleSize,
        parameters.BeginAnchor,
        parameters.EndAnchor,
        parameters.GetMinWordSize(),
        parameters.StartIndex).ToArray();

    foreach (var first in wordSamples)
    {
        HashSet<string> excludedSeconds;
        if (!data.TryGetValue(first, out excludedSeconds))
        {
            continue;
        }

        foreach (var second in wordSamples)
        {
            if (String.CompareOrdinal(first, second) < 0 && excludedSeconds.Contains(second))
            {
                return false;
            }
        }
    }

    return true;
}
/// <summary>
/// Builds the table of excluded sample pairs: pairs of frequent samples that
/// co-occur in at most <c>MaxCount</c> entries of <c>Words.Value</c>.
/// </summary>
/// <param name="parameters">
/// Pair exclusion settings. When <c>Disabled</c> is set an empty dictionary is returned.
/// </param>
/// <returns>
/// A dictionary (ordinal comparison) mapping the first sample of each excluded pair
/// to the set of second samples; within a pair the first sample sorts ordinally
/// before the second.
/// </returns>
private Dictionary<string, HashSet<string>> GetExcludedPairs(PairExclusionParameters parameters)
{
    if (parameters.Disabled)
    {
        return new Dictionary<string, HashSet<string>>(StringComparer.Ordinal);
    }

    var sampleSize = parameters.SampleSize;
    // NOTE(review): a non-positive TopSamplesLimit falls back to Int16.MaxValue (32767),
    // which still caps the number of samples - confirm that is the intended "no limit".
    var sampleLimit = parameters.TopSamplesLimit > 0 ? parameters.TopSamplesLimit : Int16.MaxValue;
    var maxCount = parameters.MaxCount;
    var totalLimit = parameters.TotalLimit;

    var sampleCounts = GetSamples(
        sampleSize,
        parameters.BeginAnchor,
        parameters.EndAnchor,
        parameters.GetMinWordSize(),
        parameters.StartIndex);

    // Most frequent samples first; ties broken by ordinal key order.
    var rankedSamples = sampleCounts
        .OrderByDescending(entry => entry.Value)
        .ThenBy(entry => entry.Key, StringComparer.Ordinal)
        .Select(entry => entry.Key)
        .Take(sampleLimit);
    var topSamples = new HashSet<string>(rankedSamples, StringComparer.Ordinal);

    var lastIndex = sampleSize - 1;

    // False only when both samples carry the anchor at the given position
    // (two begin samples, or two end samples, cannot occur together).
    Func<string, string, int, bool> notBothAnchoredAt = (a, b, index) =>
    {
        var aAnchored = a[index] == SampleSplitter.ANCHOR;
        var bAnchored = b[index] == SampleSplitter.ANCHOR;
        return !(aAnchored && bAnchored);
    };

    // Seed every candidate pair (keyed by the two samples concatenated) with a count of 0.
    var pairCounts = new Dictionary<string, int>(StringComparer.Ordinal);
    foreach (var first in topSamples)
    {
        foreach (var second in topSamples)
        {
            if (String.CompareOrdinal(first, second) < 0
                && notBothAnchoredAt(first, second, 0)
                && notBothAnchoredAt(first, second, lastIndex))
            {
                pairCounts.Add(first + second, 0);
            }
        }
    }

    // Count, per word, every ordered pair of top samples it contains.
    foreach (var word in Words.Value)
    {
        var wordSamples = SampleSplitter.SplitSamples(
            word,
            sampleSize,
            parameters.BeginAnchor,
            parameters.EndAnchor,
            parameters.GetMinWordSize(),
            parameters.StartIndex);

        var wordTopSamples = new List<string>();
        foreach (var sample in wordSamples)
        {
            if (topSamples.Contains(sample))
            {
                wordTopSamples.Add(sample);
            }
        }

        foreach (var first in wordTopSamples)
        {
            foreach (var second in wordTopSamples)
            {
                if (String.CompareOrdinal(first, second) >= 0)
                {
                    continue;
                }

                // Pairs filtered out during seeding are not in the table and are skipped.
                var pairKey = first + second;
                int seen;
                if (pairCounts.TryGetValue(pairKey, out seen))
                {
                    pairCounts[pairKey] = seen + 1;
                }
            }
        }
    }

    // Keep rare pairs only; highest count first, ties by ordinal key order.
    var selectedPairs = pairCounts
        .Where(entry => entry.Value <= maxCount)
        .OrderByDescending(entry => entry.Value)
        .ThenBy(entry => entry.Key, StringComparer.Ordinal)
        .Select(entry => entry.Key)
        .ToArray();

    if (totalLimit != 0)
    {
        if (totalLimit > selectedPairs.Length)
        {
            Console.WriteLine("ExclusionTotalLimit ({0}) > exludedPairs.Length ({1}), you can increase ExclusionTopSampleLimit or ExclusionMaxCount", totalLimit, selectedPairs.Length);
        }
        else if (totalLimit < selectedPairs.Length)
        {
            Console.WriteLine("ExclusionTotalLimit ({0}) < exludedPairs.Length ({1}), croping data", totalLimit, selectedPairs.Length);
            selectedPairs = selectedPairs.Take(totalLimit).ToArray();
        }
        else
        {
            Console.WriteLine("ExclusionTotalLimit ({0}) = exludedPairs.Length ({1})", totalLimit, selectedPairs.Length);
        }
    }

    // Split each concatenated key back into its two samples and group by the first.
    var result = new Dictionary<string, HashSet<string>>(StringComparer.Ordinal);
    foreach (var pairKey in selectedPairs)
    {
        var first = pairKey.Substring(0, sampleSize);
        var second = pairKey.Substring(sampleSize, sampleSize);

        HashSet<string> seconds;
        if (!result.TryGetValue(first, out seconds))
        {
            seconds = new HashSet<string>(StringComparer.Ordinal);
            result.Add(first, seconds);
        }

        seconds.Add(second);
    }

    return result;
}