/// <summary>
/// Tests <paramref name="value"/> against a set of excluded samples.
/// </summary>
/// <param name="value">Text that is split into samples via <c>SampleSplitter.SplitSamples</c>.</param>
/// <param name="data">Excluded samples; any sample of <paramref name="value"/> found here rejects the value.</param>
/// <param name="parameters">Split settings (sample size, anchors, minimum word size, start index, length) and the <c>Disabled</c> switch.</param>
/// <returns>
/// <c>true</c> when the exclusion is disabled or none of the samples of <paramref name="value"/>
/// is contained in <paramref name="data"/>; <c>false</c> as soon as one sample is found in it.
/// </returns>
private static bool MatchSampleExclusion(string value, HashSet <string> data, SampleExclusionParameters parameters)
        {
            // Only scan the samples when the exclusion is switched on.
            if (!parameters.Disabled)
            {
                var candidates = SampleSplitter.SplitSamples(
                    value,
                    parameters.SampleSize,
                    parameters.BeginAnchor,
                    parameters.EndAnchor,
                    parameters.GetMinWordSize(),
                    parameters.StartIndex,
                    parameters.Length);

                foreach (var candidate in candidates)
                {
                    // A single excluded sample is enough to reject the value.
                    if (data.Contains(candidate))
                    {
                        return false;
                    }
                }
            }

            return true;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Removes from <paramref name="excluded"/> every sample that contains, as one of its
        /// sub-samples, an entry of <paramref name="alreadyExcluded"/>.
        /// </summary>
        /// <param name="excluded">Set to prune in place.</param>
        /// <param name="alreadyExcluded">
        /// Previously excluded samples. The sub-sample size is taken from the length of its first
        /// element (assumes all entries share that length; TODO confirm with the caller).
        /// </param>
        /// <remarks>
        /// If either set is empty there is nothing to prune and the method returns without changes
        /// (calling <c>First()</c> on an empty set would throw <see cref="InvalidOperationException"/>).
        /// </remarks>
        private static void RemoveRedundantExclusionSamples(HashSet <string> excluded, HashSet <string> alreadyExcluded)
        {
            if (excluded.Count == 0 || alreadyExcluded.Count == 0)
            {
                return;
            }

            var previousSampleSize = alreadyExcluded.First().Length;

            // RemoveWhere mutates the set safely, so no snapshot copy of it is needed.
            excluded.RemoveWhere(sample =>
            {
                foreach (var subSample in SampleSplitter.SplitSamples(sample, previousSampleSize))
                {
                    if (alreadyExcluded.Contains(subSample))
                    {
                        return true;
                    }
                }

                return false;
            });
        }
Exemplo n.º 3
0
        /// <summary>
        /// Removes from <paramref name="excluded"/> every pair that is already covered by a pair in
        /// <paramref name="alreadyExcluded"/>, then drops keys whose value set is empty.
        /// </summary>
        /// <param name="excluded">
        /// Pair table to prune in place (key = first sample, value = set of second samples).
        /// Only pairs whose key sorts before the value by ordinal comparison are examined.
        /// </param>
        /// <param name="alreadyExcluded">
        /// Previously excluded pairs. The sub-sample size is taken from the length of its first key
        /// (assumes all keys share that length; TODO confirm with the caller).
        /// </param>
        /// <remarks>
        /// A pair is removed when some sub-sample of its first item and some sub-sample of its second
        /// item (ordered ordinally) form a pair present in <paramref name="alreadyExcluded"/>.
        /// If either table is empty no pair is removed, and only the empty-set cleanup runs
        /// (calling <c>First()</c> on an empty table would throw <see cref="InvalidOperationException"/>).
        /// </remarks>
        private void RemoveRedundantExclusionPairs(Dictionary <string, HashSet <string> > excluded, Dictionary <string, HashSet <string> > alreadyExcluded)
        {
            if (excluded.Count > 0 && alreadyExcluded.Count > 0)
            {
                var previousSampleSize = alreadyExcluded.First().Key.Length;

                var pairs =
                    from e in excluded
                    from y in e.Value
                    where String.CompareOrdinal(e.Key, y) < 0
                    select Tuple.Create(e.Key, y);

                // Snapshot the pairs: the value sets are modified while we iterate.
                foreach (var pair in pairs.ToArray())
                {
                    var subSamples1 = SampleSplitter.SplitSamples(pair.Item1, previousSampleSize);
                    var subSamples2 = SampleSplitter.SplitSamples(pair.Item2, previousSampleSize);

                    var itemPairs =
                        from x in subSamples1
                        from y in subSamples2
                        where String.CompareOrdinal(x, y) < 0
                        select Tuple.Create(x, y);

                    foreach (var ip in itemPairs)
                    {
                        HashSet<string> excludedItem2s;
                        if (alreadyExcluded.TryGetValue(ip.Item1, out excludedItem2s) &&
                            excludedItem2s.Contains(ip.Item2))
                        {
                            excluded[pair.Item1].Remove(pair.Item2);
                            break;
                        }
                    }
                }
            }

            // Drop keys that no longer exclude anything.
            foreach (var p in excluded.ToArray())
            {
                if (p.Value.Count == 0)
                {
                    excluded.Remove(p.Key);
                }
            }
        }
Exemplo n.º 4
0
        /// <summary>
        /// Counts how often each sample occurs across all entries of <c>Words.Value</c>.
        /// </summary>
        /// <param name="size">Sample size passed to <c>SampleSplitter.SplitSamples</c>.</param>
        /// <param name="beginAnchor">Begin-anchor flag passed to the splitter.</param>
        /// <param name="endAnchor">End-anchor flag passed to the splitter.</param>
        /// <param name="minWordSize">Minimum word size passed to the splitter.</param>
        /// <param name="startIndex">Start index passed to the splitter.</param>
        /// <param name="length">Optional length passed to the splitter.</param>
        /// <returns>A dictionary (ordinal key comparison) mapping each sample to its occurrence count.</returns>
        public static Dictionary <string, int> GetSamples(int size, bool beginAnchor, bool endAnchor, int minWordSize, int startIndex, int?length = null)
        {
            var samples = new Dictionary<string, int>(StringComparer.Ordinal);

            foreach (var word in Words.Value)
            {
                var wordSamples = SampleSplitter.SplitSamples(word, size, beginAnchor, endAnchor, minWordSize, startIndex, length);
                foreach (var sample in wordSamples)
                {
                    // TryGetValue leaves count at 0 for an unseen sample, so one
                    // lookup plus one store covers both the new and the existing case.
                    int count;
                    samples.TryGetValue(sample, out count);
                    samples[sample] = count + 1;
                }
            }

            return samples;
        }
        /// <summary>
        /// Tests <paramref name="value"/> against a table of excluded sample pairs.
        /// </summary>
        /// <param name="value">Text that is split into samples via <c>SampleSplitter.SplitSamples</c>.</param>
        /// <param name="data">
        /// Excluded pairs: key = first sample, value = set of second samples.
        /// Presumably produced by <c>GetExcludedPairs</c>, which stores each pair with the
        /// ordinally smaller sample as the key.
        /// </param>
        /// <param name="parameters">Split settings and the <c>Disabled</c> switch.</param>
        /// <returns>
        /// <c>true</c> when the exclusion is disabled or no pair of samples from
        /// <paramref name="value"/> is present in <paramref name="data"/>; otherwise <c>false</c>.
        /// </returns>
        public static bool MatchPairExclusion(string value, Dictionary <string, HashSet <string> > data, PairExclusionParameters parameters)
        {
            if (parameters.Disabled)
            {
                return true;
            }

            var wordSamples = SampleSplitter.SplitSamples(value,
                                                          parameters.SampleSize,
                                                          parameters.BeginAnchor,
                                                          parameters.EndAnchor,
                                                          parameters.GetMinWordSize(),
                                                          parameters.StartIndex);

            // Pairs must be ordered with an ordinal comparison, the same way the table is built.
            // string.CompareTo is culture-sensitive and can order two samples the other way round
            // (e.g. "a" vs "B"), which would look up the pair reversed and miss the exclusion.
            var linePairs =
                from x in wordSamples
                from y in wordSamples
                where String.CompareOrdinal(x, y) < 0
                select Tuple.Create(x, y);

            foreach (var linePair in linePairs)
            {
                HashSet<string> excludedItem2s;
                if (data.TryGetValue(linePair.Item1, out excludedItem2s) &&
                    excludedItem2s.Contains(linePair.Item2))
                {
                    return false;
                }
            }

            return true;
        }
Exemplo n.º 6
0
        /// <summary>
        /// Builds the table of excluded sample pairs: pairs of the most frequent samples whose
        /// co-occurrence count over <c>Words.Value</c> does not exceed <c>parameters.MaxCount</c>.
        /// </summary>
        /// <param name="parameters">
        /// Split settings plus <c>TopSamplesLimit</c> (how many top samples to combine; values
        /// &lt;= 0 fall back to <see cref="Int16.MaxValue"/>), <c>MaxCount</c> (highest
        /// co-occurrence count still excluded) and <c>TotalLimit</c> (0 = no cap on the result size).
        /// </param>
        /// <returns>
        /// A dictionary (ordinal comparison) mapping the ordinally smaller sample of each excluded
        /// pair to the set of its partner samples. Empty when the exclusion is disabled.
        /// </returns>
        private Dictionary <string, HashSet <string> > GetExcludedPairs(PairExclusionParameters parameters)
        {
            if (parameters.Disabled)
            {
                return new Dictionary<string, HashSet<string>>(StringComparer.Ordinal);
            }

            var sampleSize     = parameters.SampleSize;
            var topSampleLimit = parameters.TopSamplesLimit > 0 ? parameters.TopSamplesLimit : Int16.MaxValue;
            var maxCount       = parameters.MaxCount;
            var totalLimit     = parameters.TotalLimit;

            // Read the split settings once so the frequency pass and the pair-counting
            // pass below are guaranteed to split words identically.
            var beginAnchor = parameters.BeginAnchor;
            var endAnchor   = parameters.EndAnchor;
            var minWordSize = parameters.GetMinWordSize();
            var startIndex  = parameters.StartIndex;

            var excludedSampleCandidates = GetSamples(sampleSize, beginAnchor, endAnchor, minWordSize, startIndex);

            // Most frequent samples first; ties are broken ordinally to keep the result deterministic.
            var topExclusionSamples = new HashSet<string>(
                excludedSampleCandidates
                .OrderByDescending(kv => kv.Value)
                .ThenBy(kv => kv.Key, StringComparer.Ordinal)
                .Select(kv => kv.Key)
                .Take(topSampleLimit),
                StringComparer.Ordinal);

            var exclusionLastIdx = sampleSize - 1;

            // Candidate pairs (x < y ordinally), encoded as the concatenation x + y.
            // Two begin-anchored or two end-anchored samples cannot occur in the same word.
            var allPairs =
                from x in topExclusionSamples
                from y in topExclusionSamples
                where String.CompareOrdinal(x, y) < 0
                where !BothAnchoredAt(x, y, 0) && !BothAnchoredAt(x, y, exclusionLastIdx)
                select x + y;

            var pairResults = allPairs.ToDictionary(c => c, c => 0, StringComparer.Ordinal);

            foreach (var line in Words.Value)
            {
                var lineExclusionSamples = SampleSplitter.SplitSamples(
                    line,
                    sampleSize,
                    beginAnchor,
                    endAnchor,
                    minWordSize,
                    startIndex);

                var lineTopSamples = new List<string>();
                foreach (var sample in lineExclusionSamples)
                {
                    if (topExclusionSamples.Contains(sample))
                    {
                        lineTopSamples.Add(sample);
                    }
                }

                var lineTopPairs =
                    from x in lineTopSamples
                    from y in lineTopSamples
                    where String.CompareOrdinal(x, y) < 0
                    select x + y;

                foreach (var linePair in lineTopPairs)
                {
                    // Only candidate pairs are counted; anything else was filtered out above.
                    int count;
                    if (pairResults.TryGetValue(linePair, out count))
                    {
                        pairResults[linePair] = count + 1;
                    }
                }
            }

            var excludedPairs = pairResults
                                .Where(r => r.Value <= maxCount)
                                .OrderByDescending(r => r.Value)
                                .ThenBy(r => r.Key, StringComparer.Ordinal)
                                .Select(r => r.Key)
                                .ToArray();

            if (totalLimit != 0)
            {
                if (totalLimit > excludedPairs.Length)
                {
                    Console.WriteLine("ExclusionTotalLimit ({0}) > exludedPairs.Length ({1}), you can increase ExclusionTopSampleLimit or ExclusionMaxCount",
                                      totalLimit, excludedPairs.Length);
                }
                else if (totalLimit < excludedPairs.Length)
                {
                    Console.WriteLine("ExclusionTotalLimit ({0}) < exludedPairs.Length ({1}), croping data",
                                      totalLimit, excludedPairs.Length);
                    excludedPairs = excludedPairs.Take(totalLimit).ToArray();
                }
                else
                {
                    Console.WriteLine("ExclusionTotalLimit ({0}) = exludedPairs.Length ({1})", totalLimit, excludedPairs.Length);
                }
            }

            // Decode "x + y" back into key x -> set of y.
            return excludedPairs
                   .GroupBy(t => t.Substring(0, sampleSize))
                   .ToDictionary(g => g.Key,
                                 g => new HashSet<string>(
                                     g.Select(t => t.Substring(sampleSize, sampleSize)),
                                     StringComparer.Ordinal),
                                 StringComparer.Ordinal);
        }

        /// <summary>
        /// Returns <c>true</c> when both samples carry <c>SampleSplitter.ANCHOR</c> at position
        /// <paramref name="index"/>.
        /// </summary>
        private static bool BothAnchoredAt(string x, string y, int index)
        {
            // Read both characters unconditionally, as the inline conditions this replaces did.
            var xAnchored = x[index] == SampleSplitter.ANCHOR;
            var yAnchored = y[index] == SampleSplitter.ANCHOR;
            return xAnchored && yAnchored;
        }