/// <summary>
/// Tests whether <paramref name="value"/> is free of excluded sample pairs.
/// </summary>
/// <param name="value">Text that is split into samples with <c>SampleSplitter.SplitSamples</c>.</param>
/// <param name="data">
/// Exclusion table: maps the first sample of a pair to the set of second samples
/// that must not appear together with it. A pair is only looked up with the
/// ordinally smaller sample as the key, matching the ordinal ordering that
/// <c>GetExcludedPairs</c> uses when it builds the table.
/// </param>
/// <param name="parameters">Splitting settings; when <c>Disabled</c> is set no check is performed.</param>
/// <returns>
/// <c>true</c> when the check is disabled or no pair of samples from
/// <paramref name="value"/> is present in <paramref name="data"/>;
/// <c>false</c> as soon as one excluded pair is found.
/// </returns>
public static bool MatchPairExclusion(string value, Dictionary<string, HashSet<string>> data, PairExclusionParameters parameters)
{
    if (parameters.Disabled)
    {
        return true;
    }

    // Materialize once: the samples are walked repeatedly by the nested pairing below.
    var wordSamples = SampleSplitter.SplitSamples(
        value,
        parameters.SampleSize,
        parameters.BeginAnchor,
        parameters.EndAnchor,
        parameters.GetMinWordSize(),
        parameters.StartIndex).ToArray();

    foreach (var first in wordSamples)
    {
        // One lookup per candidate key instead of one per generated pair.
        HashSet<string> excludedSeconds;
        if (!data.TryGetValue(first, out excludedSeconds))
        {
            continue;
        }

        foreach (var second in wordSamples)
        {
            // Ordinal ordering (not the culture-sensitive String.CompareTo) so that
            // the pair is oriented exactly as it was when the table was built.
            if (String.CompareOrdinal(first, second) < 0 && excludedSeconds.Contains(second))
            {
                return false;
            }
        }
    }

    return true;
}
/// <summary>
/// Builds the pair-exclusion table: pairs of top samples that occur together in
/// at most <c>MaxCount</c> of the lines in <c>Words.Value</c>.
/// </summary>
/// <param name="parameters">
/// Sampling settings (sample size, anchors, minimum word size, start index) and
/// limits: <c>TopSamplesLimit</c> (number of top samples considered; values of zero
/// or below fall back to <c>Int16.MaxValue</c>), <c>MaxCount</c> (highest co-occurrence
/// count a pair may have and still be excluded) and <c>TotalLimit</c> (maximum number
/// of pairs kept; <c>0</c> skips the limit check entirely).
/// </param>
/// <returns>
/// A dictionary with ordinal key comparison that maps the first sample of each
/// excluded pair to the set of second samples. The first sample is always the
/// ordinally smaller one. An empty dictionary is returned when
/// <c>parameters.Disabled</c> is set.
/// </returns>
/// <remarks>
/// Writes a diagnostic line to the console whenever <c>TotalLimit</c> is non-zero.
/// Pair keys are the two samples concatenated and are split again at
/// <c>SampleSize</c>, so every sample is expected to be exactly
/// <c>SampleSize</c> characters long (presumably guaranteed by the splitter; verify).
/// </remarks>
private Dictionary<string, HashSet<string>> GetExcludedPairs(PairExclusionParameters parameters)
{
    if (parameters.Disabled)
    {
        return new Dictionary<string, HashSet<string>>(StringComparer.Ordinal);
    }

    var sampleSize = parameters.SampleSize;
    var topSampleLimit = parameters.TopSamplesLimit > 0 ? parameters.TopSamplesLimit : Int16.MaxValue;
    var maxCount = parameters.MaxCount;
    var totalLimit = parameters.TotalLimit;

    var excludedSampleCandidates = GetSamples(
        sampleSize,
        parameters.BeginAnchor,
        parameters.EndAnchor,
        parameters.GetMinWordSize(),
        parameters.StartIndex);

    // Highest-valued samples first; ties are broken ordinally so the selection is deterministic.
    var topExclusionSamples = new HashSet<string>(
        excludedSampleCandidates
            .OrderByDescending(kv => kv.Value)
            .ThenBy(kv => kv.Key, StringComparer.Ordinal)
            .Select(kv => kv.Key)
            .Take(topSampleLimit),
        StringComparer.Ordinal);

    var exclusionLastIdx = sampleSize - 1;

    // Every ordered combination of two distinct top samples, except combinations
    // where both samples start with the anchor or both end with it: a line cannot
    // contain two begin samples or two end samples.
    var allPairs =
        from x in topExclusionSamples
        from y in topExclusionSamples
        where String.CompareOrdinal(x, y) < 0
        where !(x[0] == SampleSplitter.ANCHOR && y[0] == SampleSplitter.ANCHOR)
           && !(x[exclusionLastIdx] == SampleSplitter.ANCHOR && y[exclusionLastIdx] == SampleSplitter.ANCHOR)
        select x + y;

    // Co-occurrence counter for each candidate pair, starting at zero.
    var pairResults = allPairs.ToDictionary(c => c, c => 0, StringComparer.Ordinal);

    foreach (var line in Words.Value)
    {
        var lineTopSamples = SampleSplitter.SplitSamples(
                line,
                sampleSize,
                parameters.BeginAnchor,
                parameters.EndAnchor,
                parameters.GetMinWordSize(),
                parameters.StartIndex)
            .Where(sample => topExclusionSamples.Contains(sample))
            .ToList();

        foreach (var first in lineTopSamples)
        {
            foreach (var second in lineTopSamples)
            {
                if (String.CompareOrdinal(first, second) >= 0)
                {
                    continue;
                }

                // Pairs that are not in the candidate table (for example those
                // removed by the anchor rule above) are ignored. A single
                // TryGetValue replaces the ContainsKey + indexer double lookup.
                var pairKey = first + second;
                int occurrences;
                if (pairResults.TryGetValue(pairKey, out occurrences))
                {
                    pairResults[pairKey] = occurrences + 1;
                }
            }
        }
    }

    // Keep the pairs seen at most maxCount times, most frequent first, ties ordinal.
    var excludedPairs = pairResults
        .Where(r => r.Value <= maxCount)
        .OrderByDescending(r => r.Value)
        .ThenBy(r => r.Key, StringComparer.Ordinal)
        .Select(r => r.Key)
        .ToArray();

    if (totalLimit != 0)
    {
        if (totalLimit > excludedPairs.Length)
        {
            Console.WriteLine("ExclusionTotalLimit ({0}) > exludedPairs.Length ({1}), you can increase ExclusionTopSampleLimit or ExclusionMaxCount", totalLimit, excludedPairs.Length);
        }
        else if (totalLimit < excludedPairs.Length)
        {
            // The message reports the length before cropping.
            Console.WriteLine("ExclusionTotalLimit ({0}) < exludedPairs.Length ({1}), croping data", totalLimit, excludedPairs.Length);
            excludedPairs = excludedPairs.Take(totalLimit).ToArray();
        }
        else
        {
            Console.WriteLine("ExclusionTotalLimit ({0}) = exludedPairs.Length ({1})", totalLimit, excludedPairs.Length);
        }
    }

    // Split each concatenated key back into its two samples and group by the first one.
    return excludedPairs
        .GroupBy(t => t.Substring(0, sampleSize))
        .ToDictionary(
            g => g.Key,
            g => new HashSet<string>(
                g.Select(t => t.Substring(sampleSize, sampleSize)),
                StringComparer.Ordinal),
            StringComparer.Ordinal);
}