/// <summary>
/// Checks that <paramref name="value"/> contains no excluded pair of samples.
/// </summary>
/// <param name="value">Text that is split into samples via <c>SampleSplitter.SplitSamples</c>.</param>
/// <param name="data">
/// Exclusion table: maps a sample to the set of samples it must not co-occur with.
/// Only pairs whose first sample sorts ordinally before the second are probed, so the
/// table is presumably keyed by the ordinally smaller sample of each pair (verify against
/// the code that builds it).
/// </param>
/// <param name="parameters">Splitting settings and the <c>Disabled</c> switch.</param>
/// <returns>
/// <c>true</c> when the check is disabled or no excluded pair is found;
/// <c>false</c> as soon as one excluded pair is found.
/// </returns>
public static bool MatchPairExclusion(string value, Dictionary <string, HashSet <string> > data, PairExclusionParameters parameters)
        {
            if (parameters.Disabled)
            {
                return(true);
            }

            // Materialized once: the samples are walked in a nested loop below, and
            // re-running the splitter for every outer element would be wasted work.
            var wordSamples = SampleSplitter.SplitSamples(value,
                                                          parameters.SampleSize,
                                                          parameters.BeginAnchor,
                                                          parameters.EndAnchor,
                                                          parameters.GetMinWordSize(),
                                                          parameters.StartIndex)
                              .ToArray();

            foreach (var first in wordSamples)
            {
                // One dictionary probe per first sample instead of one per pair.
                HashSet <string> excludedSeconds;
                if (!data.TryGetValue(first, out excludedSeconds))
                {
                    continue;
                }

                foreach (var second in wordSamples)
                {
                    // Ordinal ordering (not the culture-sensitive String.CompareTo) so the
                    // pair orientation is culture-independent and agrees with tables that
                    // were ordered with String.CompareOrdinal.
                    if (String.CompareOrdinal(first, second) < 0 && excludedSeconds.Contains(second))
                    {
                        return(false);
                    }
                }
            }

            return(true);
        }
// Example #2
// 0
        /// <summary>
        /// Builds the pair-exclusion table: pairs of top samples that co-occur in at most
        /// <c>parameters.MaxCount</c> counted occurrences across <c>Words.Value</c>.
        /// </summary>
        /// <param name="parameters">
        /// Settings read here: <c>Disabled</c>, <c>SampleSize</c>, <c>TopSamplesLimit</c>
        /// (a non-positive value falls back to <c>Int16.MaxValue</c>), <c>MaxCount</c>,
        /// <c>TotalLimit</c> (0 disables cropping and the console report), plus the anchor,
        /// minimum word size and start index forwarded to the sample splitter.
        /// </param>
        /// <returns>
        /// A dictionary (ordinal comparer) mapping the first sample of each excluded pair to
        /// the set (ordinal comparer) of second samples; the first sample is the ordinally
        /// smaller one. Empty when <c>parameters.Disabled</c> is set.
        /// </returns>
        private Dictionary <string, HashSet <string> > GetExcludedPairs(PairExclusionParameters parameters)
        {
            if (parameters.Disabled)
            {
                return(new Dictionary <string, HashSet <string> >(StringComparer.Ordinal));
            }

            var sampleSize     = parameters.SampleSize;
            var topSampleLimit = parameters.TopSamplesLimit > 0 ? parameters.TopSamplesLimit : Int16.MaxValue;
            var maxCount       = parameters.MaxCount;
            var totalLimit     = parameters.TotalLimit;

            var excludedSampleCandidates = GetSamples(
                sampleSize,
                parameters.BeginAnchor,
                parameters.EndAnchor,
                parameters.GetMinWordSize(),
                parameters.StartIndex);

            // Keep the highest-valued candidates; ties are broken by ordinal key order so the
            // selection is deterministic.
            var topExclusionSamples = new HashSet <string>(
                excludedSampleCandidates
                .OrderByDescending(kv => kv.Value)
                .ThenBy(kv => kv.Key, StringComparer.Ordinal)
                .Select(kv => kv.Key)
                .Take(topSampleLimit),
                StringComparer.Ordinal);

            // Candidate pairs are stored as the concatenation x + y with x ordinally before y.
            // Each anchor condition below is true unless BOTH samples carry the anchor at that
            // position; it is kept in its original form so evaluation order is unchanged.
            var exclusionLastIdx = sampleSize - 1;
            var allPairs         =
                from x in topExclusionSamples
                from y in topExclusionSamples
                where String.CompareOrdinal(x, y) < 0
                where
                ((x[0] != SampleSplitter.ANCHOR && y[0] != SampleSplitter.ANCHOR) ||
                 ((x[0] == SampleSplitter.ANCHOR) != (y[0] == SampleSplitter.ANCHOR))) &&                                // 2 begin samples not possible
                ((x[exclusionLastIdx] != SampleSplitter.ANCHOR && y[exclusionLastIdx] != SampleSplitter.ANCHOR) ||
                 ((x[exclusionLastIdx] == SampleSplitter.ANCHOR) != (y[exclusionLastIdx] == SampleSplitter.ANCHOR)))                           // 2 end samples not possible
                select x + y;

            // Occurrence counter per candidate pair, starting at zero.
            var pairResults = allPairs
                              .ToDictionary(c => c, c => 0, StringComparer.Ordinal);

            foreach (var line in Words.Value)
            {
                var lineExclusionSamples = SampleSplitter.SplitSamples(
                    line,
                    sampleSize,
                    parameters.BeginAnchor,
                    parameters.EndAnchor,
                    parameters.GetMinWordSize(),
                    parameters.StartIndex);

                // Only samples that made it into the top set can form a counted pair.
                var lineTopSamples = new List <string>();
                foreach (var sample in lineExclusionSamples)
                {
                    if (topExclusionSamples.Contains(sample))
                    {
                        lineTopSamples.Add(sample);
                    }
                }

                var lineTopPairs =
                    from x in lineTopSamples
                    from y in lineTopSamples
                    where String.CompareOrdinal(x, y) < 0
                    select x + y;

                foreach (var linePair in lineTopPairs)
                {
                    // Pairs missing from the table are skipped
                    // (original note: could have been excluded by ExcludeWithPreviousPairs).
                    int seenCount;
                    if (pairResults.TryGetValue(linePair, out seenCount))
                    {
                        pairResults[linePair] = seenCount + 1;
                    }
                }
            }

            // Excluded pairs are those counted at most maxCount times, ordered by count
            // descending and then ordinally, so that cropping below is deterministic.
            var excludedPairs = pairResults
                                .Where(r => r.Value <= maxCount)
                                .OrderByDescending(r => r.Value)
                                .ThenBy(r => r.Key, StringComparer.Ordinal)
                                .Select(r => r.Key)
                                .ToArray();

            if (totalLimit != 0)
            {
                if (totalLimit > excludedPairs.Length)
                {
                    Console.WriteLine("ExclusionTotalLimit ({0}) > exludedPairs.Length ({1}), you can increase ExclusionTopSampleLimit or ExclusionMaxCount",
                                      totalLimit, excludedPairs.Length);
                }
                else if (totalLimit < excludedPairs.Length)
                {
                    Console.WriteLine("ExclusionTotalLimit ({0}) < exludedPairs.Length ({1}), croping data",
                                      totalLimit, excludedPairs.Length);
                    excludedPairs = excludedPairs.Take(totalLimit).ToArray();
                }
                else
                {
                    Console.WriteLine("ExclusionTotalLimit ({0}) = exludedPairs.Length ({1})", totalLimit, excludedPairs.Length);
                }
            }

            // Split each concatenated pair back into its two samples of sampleSize characters
            // and group the second samples under the first.
            return(excludedPairs
                   .GroupBy(t => t.Substring(0, sampleSize))
                   .ToDictionary(g => g.Key, g => new HashSet <string>(
                                     g.Select(t => t.Substring(sampleSize, sampleSize)),
                                     StringComparer.Ordinal),
                                 StringComparer.Ordinal));
        }