Example #1
0
 /// <summary>
 /// Reads the word file at <c>Settings.WordsPath</c> and keeps every line that
 /// <c>SampleSplitter.PreExcludeValue</c> does not flag.
 /// </summary>
 /// <returns>The retained lines, materialized as an array in file order. Duplicates are not removed.</returns>
 private static IEnumerable <string> GetWords()
 {
     var keptLines =
         from line in File.ReadLines(Settings.WordsPath)
         where !SampleSplitter.PreExcludeValue(line)
         select line;

     // Materialize once so the file is not re-read on every enumeration.
     return keptLines.ToArray();
 }
        /// <summary>
        /// Tests <paramref name="value"/> against one set of excluded samples.
        /// </summary>
        /// <param name="value">The word that is split into samples.</param>
        /// <param name="data">The excluded samples; a single hit rejects the word.</param>
        /// <param name="parameters">Split settings (size, anchors, minimum word size, start index, length) and the <c>Disabled</c> switch.</param>
        /// <returns>
        /// <c>true</c> when the check is disabled or none of the word's samples is in <paramref name="data"/>;
        /// <c>false</c> as soon as one sample is found there.
        /// </returns>
        private static bool MatchSampleExclusion(string value, HashSet <string> data, SampleExclusionParameters parameters)
        {
            if (!parameters.Disabled)
            {
                var pieces = SampleSplitter.SplitSamples(
                    value,
                    parameters.SampleSize,
                    parameters.BeginAnchor,
                    parameters.EndAnchor,
                    parameters.GetMinWordSize(),
                    parameters.StartIndex,
                    parameters.Length);

                // Any() stops at the first excluded sample, like an early return from a loop.
                return !pieces.Any(piece => data.Contains(piece));
            }

            return true;
        }
        /// <summary>
        /// Runs <paramref name="value"/> through the exclusion stages in a fixed order and stops at the
        /// first stage that rejects it, incrementing that stage's field on <paramref name="counter"/>.
        /// </summary>
        /// <param name="data">Generated exclusion data: three sample sets, two pair tables and the bloom filter.</param>
        /// <param name="value">The word to test.</param>
        /// <param name="parameters">Settings for the sample and pair exclusion stages.</param>
        /// <param name="counter">Receives exactly one increment when the word is rejected; untouched when it passes.</param>
        /// <param name="skipBloomFilter">When <c>true</c>, the final bloom filter stage is not evaluated.</param>
        /// <returns><c>true</c> when no stage rejects the word; otherwise <c>false</c>.</returns>
        /// <remarks>
        /// Stage order: pre-exclusion, sample exclusions 1 to 3, pair exclusions 1 and 2, bloom filter.
        /// The bloom filter stage reads its settings from <c>data.Parameters</c>, not from <paramref name="parameters"/>.
        /// </remarks>
        public static bool Match(GeneratedData data, string value, GenerationParameters parameters, MatchCounter counter, bool skipBloomFilter = false)
        {
            var accepted = false;

            // Each branch is only evaluated when every earlier stage let the word through.
            if (SampleSplitter.PreExcludeValue(value))
            {
                counter.PreExclusion++;
            }
            else if (!MatchSampleExclusion(value, data.ExcludedSamples_1, parameters.SampleExclusion_1))
            {
                counter.SampleExclusion_1++;
            }
            else if (!MatchSampleExclusion(value, data.ExcludedSamples_2, parameters.SampleExclusion_2))
            {
                counter.SampleExclusion_2++;
            }
            else if (!MatchSampleExclusion(value, data.ExcludedSamples_3, parameters.SampleExclusion_3))
            {
                counter.SampleExclusion_3++;
            }
            else if (!MatchPairExclusion(value, data.ExcludedPairs_1, parameters.PairExclusion_1))
            {
                counter.PairExclusion++;
            }
            else if (!MatchPairExclusion(value, data.ExcludedPairs_2, parameters.PairExclusion_2))
            {
                counter.PairExclusion2++;
            }
            else if (!skipBloomFilter && !MatchBloomFilter(value, data.BloomFilter, data.Parameters.BloomFilter))
            {
                counter.BloomFilter++;
            }
            else
            {
                accepted = true;
            }

            return accepted;
        }
        /// <summary>
        /// Builds the sequence of characters a sample may contain: the letters 'a' to 'z', the apostrophe,
        /// <c>SampleSplitter.ANCHOR</c>, and whatever <c>SampleSplitter.GetAdditionalChars()</c> returns, in that order.
        /// </summary>
        /// <returns>
        /// A lazily evaluated character sequence. When <c>EXCLUDE_APOS</c> is set, every apostrophe is
        /// filtered out of the whole sequence, including any contributed by the additional characters.
        /// </returns>
        internal static IEnumerable <char> GetPossibleChars()
        {
            var letters =
                from offset in Enumerable.Range(0, 26)
                select (char)('a' + offset);

            var alphabet = letters
                           .Concat('\''.Yield())
                           .Concat(SampleSplitter.ANCHOR.Yield())
                           .Concat(SampleSplitter.GetAdditionalChars());

            return EXCLUDE_APOS
                       ? alphabet.Where(c => c != '\'')
                       : alphabet;
        }
Example #5
0
        /// <summary>
        /// Removes from <paramref name="excluded"/> every sample that is already covered by
        /// <paramref name="alreadyExcluded"/>, i.e. every sample for which at least one of its sub-samples
        /// (split at the sample length of <paramref name="alreadyExcluded"/>) is in that set.
        /// </summary>
        /// <param name="excluded">The set to prune; modified in place.</param>
        /// <param name="alreadyExcluded">
        /// The previously built exclusion set. Its sample length is taken from its first element.
        /// </param>
        /// <remarks>
        /// If either set is empty there is nothing to prune, so the method returns without doing anything
        /// (calling <c>First()</c> on an empty set would throw <see cref="InvalidOperationException"/>).
        /// </remarks>
        private static void RemoveRedundantExclusionSamples(HashSet <string> excluded, HashSet <string> alreadyExcluded)
        {
            if (excluded.Count == 0 || alreadyExcluded.Count == 0)
            {
                return;
            }

            var previousSampleSize = alreadyExcluded.First().Length;

            // RemoveWhere handles removal during traversal, so no snapshot copy of the set is needed.
            excluded.RemoveWhere(sample =>
                                 SampleSplitter.SplitSamples(sample, previousSampleSize)
                                 .Any(subSample => alreadyExcluded.Contains(subSample)));
        }
Example #6
0
        /// <summary>
        /// Removes from <paramref name="excluded"/> every pair that is already covered by
        /// <paramref name="alreadyExcluded"/>, then drops the keys whose value set is empty.
        /// </summary>
        /// <param name="excluded">
        /// Pair table to prune, modified in place. Only pairs whose key sorts ordinally before the value are examined.
        /// </param>
        /// <param name="alreadyExcluded">
        /// Previously built pair table. Its sample length is taken from the key of its first entry.
        /// </param>
        /// <remarks>
        /// A pair (a, b) is covered when some sub-sample x of a and some sub-sample y of b, with x ordinally
        /// before y, form a pair present in <paramref name="alreadyExcluded"/>.
        /// Empty inputs are handled explicitly, because <c>First()</c> on an empty dictionary would throw
        /// <see cref="InvalidOperationException"/>.
        /// </remarks>
        private void RemoveRedundantExclusionPairs(Dictionary <string, HashSet <string> > excluded, Dictionary <string, HashSet <string> > alreadyExcluded)
        {
            if (excluded.Count == 0)
            {
                return;
            }

            // With nothing previously excluded no pair can be redundant; only the cleanup below applies.
            if (alreadyExcluded.Count > 0)
            {
                var previousSampleSize = alreadyExcluded.First().Key.Length;

                // Snapshot the pairs first: the value sets are modified inside the loop.
                var pairs =
                    (from e in excluded
                     from y in e.Value
                     where String.CompareOrdinal(e.Key, y) < 0
                     select Tuple.Create(e.Key, y)).ToArray();

                foreach (var pair in pairs)
                {
                    var subSamples1 = SampleSplitter.SplitSamples(pair.Item1, previousSampleSize);
                    var subSamples2 = SampleSplitter.SplitSamples(pair.Item2, previousSampleSize);

                    var itemPairs =
                        from x in subSamples1
                        from y in subSamples2
                        where String.CompareOrdinal(x, y) < 0
                        select Tuple.Create(x, y);

                    foreach (var ip in itemPairs)
                    {
                        HashSet <string> excludedItem2s;
                        if (alreadyExcluded.TryGetValue(ip.Item1, out excludedItem2s) &&
                            excludedItem2s.Contains(ip.Item2))
                        {
                            excluded[pair.Item1].Remove(pair.Item2);
                            break;
                        }
                    }
                }
            }

            // Drop keys that no longer have any partner (or never had one).
            foreach (var p in excluded.ToArray())
            {
                if (p.Value.Count == 0)
                {
                    excluded.Remove(p.Key);
                }
            }
        }
Example #7
0
        //private void WriteRetouched(int id, Tuple<string, int>[] retouched)
        //{
        //	var path = Path.Combine(Settings.TempFolder, String.Format("{0:000}-retouched.txt", id));
        //	if (File.Exists(path)) File.Delete(path);
        //	File.WriteAllLines(path, retouched.Select(r => String.Format("{0} {1}", r.Item1, r.Item2)));
        //}

        /// <summary>
        /// Transforms <paramref name="word"/> into the form that is used with the bloom filter.
        /// </summary>
        /// <param name="parameters">
        /// Supplies <c>SubstitutionCount</c>, the optional <c>SubstringStartIndex</c> / <c>SubstringLength</c>
        /// and the optional <c>CharOffset</c>.
        /// </param>
        /// <param name="word">The word to transform.</param>
        /// <returns>The transformed word.</returns>
        /// <remarks>
        /// Steps, in order:
        /// 1. <c>SubstituteMostCommonSequences</c> is applied with <c>SubstitutionCount</c>.
        /// 2. If <c>SubstringStartIndex</c> is set, the value goes through <c>SampleSplitter.PreSampleForBFValue</c>
        ///    and is truncated to at most <c>SubstringLength</c> characters.
        /// 3. If <c>CharOffset</c> is set, every character c is replaced by <c>c - 'a' + CharOffset</c>.
        /// </remarks>
        public static string ProcessWordForBloomFilter(BloomFilterParameters parameters, string word)
        {
            var result = SubstituteMostCommonSequences(word, parameters.SubstitutionCount);

            if (parameters.SubstringStartIndex != null)
            {
                // NOTE(review): SubstringStartIndex only gates this step; the cut always starts at index 0
                // and SubstringLength is assumed to be set as well - confirm this is intended.
                string sampled = SampleSplitter.PreSampleForBFValue(result);
                var keep = Math.Min(parameters.SubstringLength.Value, sampled.Length);
                result = sampled.Substring(0, keep);
            }

            if (parameters.CharOffset != null)
            {
                var shifted = new char[result.Length];
                for (var i = 0; i < shifted.Length; i++)
                {
                    shifted[i] = (char)(result[i] - 'a' + parameters.CharOffset.Value);
                }

                result = new String(shifted);
            }

            return result;
        }
Example #8
0
        /// <summary>
        /// Counts how often each sample occurs across all words in <c>Words.Value</c>.
        /// </summary>
        /// <param name="size">Sample size passed to <c>SampleSplitter.SplitSamples</c>.</param>
        /// <param name="beginAnchor">Passed through to the splitter.</param>
        /// <param name="endAnchor">Passed through to the splitter.</param>
        /// <param name="minWordSize">Passed through to the splitter.</param>
        /// <param name="startIndex">Passed through to the splitter.</param>
        /// <param name="length">Optional length passed through to the splitter; <c>null</c> by default.</param>
        /// <returns>
        /// A dictionary with ordinal key comparison mapping each sample to its number of occurrences.
        /// A sample that appears several times in one word is counted each time.
        /// </returns>
        public static Dictionary <string, int> GetSamples(int size, bool beginAnchor, bool endAnchor, int minWordSize, int startIndex, int?length = null)
        {
            var samples = new Dictionary <string, int>(StringComparer.Ordinal);

            foreach (var word in Words.Value)
            {
                var wordSamples = SampleSplitter.SplitSamples(word, size, beginAnchor, endAnchor, minWordSize, startIndex, length);
                foreach (var sample in wordSamples)
                {
                    // TryGetValue leaves count at 0 for an unseen sample, so one lookup plus one store
                    // replaces the ContainsKey / indexer-read / indexer-write sequence.
                    int count;
                    samples.TryGetValue(sample, out count);
                    samples[sample] = count + 1;
                }
            }

            return(samples);
        }
Example #9
0
        /// <summary>
        /// Enumerates every string of <paramref name="sampleSize"/> characters that can be built from
        /// <c>SampleSplitter.GetPossibleChars()</c>, then filters the result through
        /// <c>SampleSplitter.ExcludeImpossibleSamples</c>.
        /// </summary>
        /// <param name="sampleSize">Number of characters per sample.</param>
        /// <param name="beginAnchor">When <c>true</c>, the anchor character is allowed in the first position.</param>
        /// <param name="endAnchor">When <c>true</c>, the anchor character is allowed in the last position.</param>
        /// <returns>The candidate samples as a lazily evaluated sequence.</returns>
        /// <remarks>The anchor character is never allowed in positions other than the first and the last.</remarks>
        private static IEnumerable <string> GetAllPossibleSamples(int sampleSize, bool beginAnchor, bool endAnchor)
        {
            var alphabet              = SampleSplitter.GetPossibleChars();
            var alphabetWithoutAnchor = alphabet.Where(c => c != SampleSplitter.ANCHOR);

            var firstPosition = beginAnchor ? alphabet : alphabetWithoutAnchor;
            var candidates    = firstPosition.Select(c => c.ToString());

            var lastPosition = sampleSize - 1;
            for (var position = 1; position < sampleSize; position++)
            {
                // Declared inside the loop so that each deferred SelectMany captures its own character set.
                var anchorAllowed = position == lastPosition && endAnchor;
                var nextChars     = anchorAllowed ? alphabet : alphabetWithoutAnchor;

                candidates = candidates.SelectMany(prefix => nextChars, (prefix, c) => prefix + c);
            }

            return SampleSplitter.ExcludeImpossibleSamples(candidates, sampleSize, beginAnchor, endAnchor);
        }
        /// <summary>
        /// Tests <paramref name="value"/> against a table of excluded sample pairs.
        /// </summary>
        /// <param name="value">The word that is split into samples.</param>
        /// <param name="data">
        /// Pair table: maps a sample to the set of samples it must not occur together with.
        /// The key is looked up with the ordinally smaller sample of each pair.
        /// </param>
        /// <param name="parameters">Split settings (size, anchors, minimum word size, start index) and the <c>Disabled</c> switch.</param>
        /// <returns>
        /// <c>true</c> when the check is disabled or no pair of the word's samples is in <paramref name="data"/>;
        /// <c>false</c> as soon as one excluded pair is found.
        /// </returns>
        /// <remarks>
        /// Pairs are ordered with <see cref="String.CompareOrdinal(string, string)"/>, the same ordering
        /// <c>GetExcludedPairs</c> uses when it builds the table. The culture-sensitive
        /// <c>String.CompareTo</c> can order samples containing apostrophes or other symbols differently,
        /// which would look the pair up under the wrong key and miss it.
        /// </remarks>
        public static bool MatchPairExclusion(string value, Dictionary <string, HashSet <string> > data, PairExclusionParameters parameters)
        {
            if (parameters.Disabled)
            {
                return(true);
            }

            var wordSamples = SampleSplitter.SplitSamples(value,
                                                          parameters.SampleSize,
                                                          parameters.BeginAnchor,
                                                          parameters.EndAnchor,
                                                          parameters.GetMinWordSize(),
                                                          parameters.StartIndex);

            // Every unordered pair of distinct samples, oriented so that Item1 is ordinally smaller.
            var linePairs =
                from x in wordSamples
                from y in wordSamples
                where String.CompareOrdinal(x, y) < 0
                select Tuple.Create(x, y);

            foreach (var linePair in linePairs)
            {
                HashSet <string> excludedItem2s;
                if (data.TryGetValue(linePair.Item1, out excludedItem2s) &&
                    excludedItem2s.Contains(linePair.Item2))
                {
                    return(false);
                }
            }

            return(true);
        }
Example #11
0
        /// <summary>
        /// Builds the table of sample pairs to exclude: pairs of frequent samples that occur together
        /// in at most <c>parameters.MaxCount</c> places in <c>Words.Value</c>.
        /// </summary>
        /// <param name="parameters">
        /// Supplies the <c>Disabled</c> switch, the split settings (<c>SampleSize</c>, anchors, minimum word
        /// size, <c>StartIndex</c>) and the limits <c>TopSamplesLimit</c>, <c>MaxCount</c> and <c>TotalLimit</c>.
        /// </param>
        /// <returns>
        /// A dictionary (ordinal key comparison) that maps the ordinally smaller sample of each excluded pair
        /// to the set of its larger partners. Empty when <c>parameters.Disabled</c> is set.
        /// </returns>
        /// <remarks>
        /// Writes a diagnostic line to the console whenever <c>TotalLimit</c> is non-zero.
        /// </remarks>
        private Dictionary <string, HashSet <string> > GetExcludedPairs(PairExclusionParameters parameters)
        {
            if (parameters.Disabled)
            {
                return(new Dictionary <string, HashSet <string> >(StringComparer.Ordinal));
            }

            var sampleSize     = parameters.SampleSize;
            // A non-positive TopSamplesLimit falls back to Int16.MaxValue samples.
            var topSampleLimit = parameters.TopSamplesLimit > 0 ? parameters.TopSamplesLimit : Int16.MaxValue;
            var maxCount       = parameters.MaxCount;
            var totalLimit     = parameters.TotalLimit;

            // Sample -> count, as returned by GetSamples for these split settings.
            var excludedSampleCandidates = GetSamples(
                sampleSize,
                parameters.BeginAnchor,
                parameters.EndAnchor,
                parameters.GetMinWordSize(),
                parameters.StartIndex);

            // Keep the samples with the highest counts; ties are broken by ordinal key order so the
            // selection is deterministic.
            var topExclusionSamples = new HashSet <string>(
                excludedSampleCandidates
                .OrderByDescending(kv => kv.Value)
                .ThenBy(kv => kv.Key, StringComparer.Ordinal)
                .Select(kv => kv.Key)
                .Take(topSampleLimit),
                StringComparer.Ordinal);

            // Candidate pairs: every combination of two distinct top samples with x ordinally before y,
            // encoded as the concatenation x + y. Pairs in which both samples start with the anchor, or
            // both end with it, are left out.
            var exclusionLastIdx = sampleSize - 1;
            var allPairs         =
                from x in topExclusionSamples
                from y in topExclusionSamples
                where String.CompareOrdinal(x, y) < 0
                where
                ((x[0] != SampleSplitter.ANCHOR && y[0] != SampleSplitter.ANCHOR) ||
                 ((x[0] == SampleSplitter.ANCHOR) != (y[0] == SampleSplitter.ANCHOR))) &&                                // 2 begin samples not possible
                ((x[exclusionLastIdx] != SampleSplitter.ANCHOR && y[exclusionLastIdx] != SampleSplitter.ANCHOR) ||
                 ((x[exclusionLastIdx] == SampleSplitter.ANCHOR) != (y[exclusionLastIdx] == SampleSplitter.ANCHOR)))                           // 2 end samples not possible
                select x + y;

            // Every candidate pair starts with a count of zero.
            var pairResults = allPairs
                              .ToDictionary(c => c, c => 0, StringComparer.Ordinal);

            // Count, for each candidate pair, how often it occurs among the words.
            foreach (var line in Words.Value)
            {
                var lineExclusionSamples = SampleSplitter.SplitSamples(
                    line,
                    sampleSize,
                    parameters.BeginAnchor,
                    parameters.EndAnchor,
                    parameters.GetMinWordSize(),
                    parameters.StartIndex);
                // Only samples that belong to the top set can take part in a candidate pair.
                var lineTopSamples = new List <string>();
                foreach (var sample in lineExclusionSamples)
                {
                    if (topExclusionSamples.Contains(sample))
                    {
                        lineTopSamples.Add(sample);
                    }
                }

                // Same x-before-y orientation and x + y encoding as the candidate pairs above.
                var lineTopPairs =
                    from x in lineTopSamples                     //.Select((s, i) => new { s, i })
                    from y in lineTopSamples                     //.Select((s, i) => new { s, i })
                    where String.CompareOrdinal(x, y) < 0
                    select x + y;
                //where x.i < y.i
                //select x.s + y.s;

                foreach (var linePair in lineTopPairs)
                {
                    // A pair built from a word is not necessarily a candidate (for example, the anchor
                    // filter above is not applied here), hence the key check.
                    if (pairResults.ContainsKey(linePair))                     // Could have been excluded by ExcludeWithPreviousPairs
                    {
                        pairResults[linePair]++;
                    }
                }
            }

            // Pairs seen at most maxCount times, highest count first, ties in ordinal key order.
            var exludedPairs = pairResults
                               .Where(r => r.Value <= maxCount)
                               .OrderByDescending(r => r.Value)
                               .ThenBy(r => r.Key, StringComparer.Ordinal)
                               .Select(r => r.Key)
                               .ToArray();

            // A totalLimit of zero means no cap; otherwise report how the limit relates to the number
            // of pairs found and crop to the first totalLimit pairs when there are more.
            if (totalLimit != 0)
            {
                if (totalLimit > exludedPairs.Length)
                {
                    Console.WriteLine("ExclusionTotalLimit ({0}) > exludedPairs.Length ({1}), you can increase ExclusionTopSampleLimit or ExclusionMaxCount",
                                      totalLimit, exludedPairs.Length);
                }
                else if (totalLimit < exludedPairs.Length)
                {
                    Console.WriteLine("ExclusionTotalLimit ({0}) < exludedPairs.Length ({1}), croping data",
                                      totalLimit, exludedPairs.Length);
                    exludedPairs = exludedPairs.Take(totalLimit).ToArray();
                }
                else
                {
                    Console.WriteLine("ExclusionTotalLimit ({0}) = exludedPairs.Length ({1})", totalLimit, exludedPairs.Length);
                }
            }

            // Decode x + y: the first sampleSize characters are the key, the next sampleSize characters
            // are the partner stored in that key's set.
            return(exludedPairs
                   .GroupBy(t => t.Substring(0, sampleSize))
                   .ToDictionary(g => g.Key, g => new HashSet <string>(
                                     g.Select(t => t.Substring(sampleSize, sampleSize)),
                                     StringComparer.Ordinal),
                                 StringComparer.Ordinal));
        }