/// <summary> /// Merger Two Histograms /// </summary> /// <param name="curStatistics"></param> /// <param name="newStatistics"></param> /// <param name="currentJoin"></param> /// <returns></returns> internal static Statistics UpdateHistogram(Statistics curStatistics, Statistics newStatistics, out double joinSelectivity) { joinSelectivity = 1.0; if (curStatistics == null) return newStatistics; else if (newStatistics == null) return curStatistics; var resHistogram = new Dictionary<long, Tuple<double, bool>>(); var curHistogram = curStatistics.Histogram; var newHistogram = newStatistics.Histogram; if (!curHistogram.Any()) { return new Statistics { Density = newStatistics.Density, Histogram = newHistogram, }; } if (!newHistogram.Any()) { return new Statistics { Density = curStatistics.Density, Histogram = curHistogram, }; } var curNotPopularCount = 0.0; var newNotPopularCount = 0.0; var curDefaultRow = curStatistics.Density * curStatistics.RowCount; var newDefaultRow = newStatistics.Density * newStatistics.RowCount; IEnumerator<KeyValuePair<long, Tuple<double, bool>>> newEntry = null; bool fisrstMatch = false; bool newHistogramEnd = false; double resRowCount = 0.0; List<long> notPopularValues = new List<long>(); foreach (var entry in curHistogram) { if (!fisrstMatch) { if (newHistogram.ContainsKey(entry.Key)) { fisrstMatch = true; var entry1 = entry; newEntry = newHistogram.SkipWhile(e => e.Key != entry1.Key).GetEnumerator(); newEntry.MoveNext(); if (!entry.Value.Item2) curNotPopularCount -= entry.Value.Item1; if (!newEntry.Current.Value.Item2) newNotPopularCount -= newEntry.Current.Value.Item1; } } if (fisrstMatch) { if (newHistogramEnd || entry.Key < newEntry.Current.Key) { var curTuple = entry.Value; if (curTuple.Item2 == true) { var tmpCount = curTuple.Item1 * newDefaultRow; resRowCount += tmpCount; resHistogram.Add(entry.Key, new Tuple<double, bool>(tmpCount, true)); } else { notPopularValues.Add(entry.Key); curNotPopularCount += curTuple.Item1; resHistogram.Add(entry.Key, null); } } else if (entry.Key > newEntry.Current.Key) { while (entry.Key > newEntry.Current.Key) { var newTuple = newEntry.Current.Value; if (newTuple.Item2 == true) { var tmpCount = newTuple.Item1 * curDefaultRow; resRowCount += tmpCount; resHistogram.Add(newEntry.Current.Key, new Tuple<double, bool>(tmpCount, true)); } else { notPopularValues.Add(newEntry.Current.Key); newNotPopularCount += newTuple.Item1; resHistogram.Add(newEntry.Current.Key, null); } if (!newEntry.MoveNext()) { newHistogramEnd = true; break; } } if (newHistogramEnd) { break; } } else { var curTuple = entry.Value; var newTuple = newEntry.Current.Value; if (curTuple.Item2 == false && newTuple.Item2 == false) { notPopularValues.Add(entry.Key); curNotPopularCount += curTuple.Item1; newNotPopularCount += newTuple.Item1; resHistogram.Add(entry.Key, null); } else { var count1 = curTuple.Item2 ? curTuple.Item1 : curDefaultRow; var count2 = newTuple.Item2 ? newTuple.Item1 : newDefaultRow; var tmpCount = count1 * count2; resRowCount += tmpCount; resHistogram.Add(entry.Key, new Tuple<double, bool>(tmpCount, true)); } if (!newEntry.MoveNext()) { newHistogramEnd = true; } } } } double density = -1; if (notPopularValues.Any()) { var resDefaultRow = curNotPopularCount * newNotPopularCount * Math.Min(curDefaultRow / curNotPopularCount, newDefaultRow / newNotPopularCount); resRowCount += resDefaultRow; resDefaultRow = resDefaultRow / notPopularValues.Count; density = resDefaultRow / resRowCount; foreach (var value in notPopularValues) { resHistogram[value] = new Tuple<double, bool>(resDefaultRow, false); } } joinSelectivity = resRowCount/(curStatistics.RowCount*newStatistics.RowCount); return new Statistics { Histogram = resHistogram, Density = density < 0 ? Math.Max(curStatistics.Density, newStatistics.Density) : density, MaxValue = Math.Max(curStatistics.MaxValue, newStatistics.MaxValue), RowCount = resRowCount, }; }
/// <summary> /// Updates the statistics histogram for the edge given the sink id list. /// Bucket size is pre-defined /// </summary> /// <param name="edge"></param> /// <param name="sinkList">sink id of the edge sampling</param> internal static void UpdateEdgeHistogram(MatchEdge edge, List<long> sinkList) { sinkList.Sort(); var rowCount = sinkList.Count; var statistics = new Statistics { RowCount = rowCount, }; var height = (int)(rowCount / BucketNum); var popBucketCount = 0; var popValueCount = 0; var bucketCount = 0; // If number in each bucket is very small, then generate a Frequency Histogram if (height < 2) { bucketCount = rowCount; long preValue = sinkList[0]; int count = 1; int distCount = 1; for (int i = 1; i < rowCount; i++) { var curValue = sinkList[i]; if (curValue == preValue) { count++; } else { if (count > 1) { popBucketCount += count; popValueCount++; } statistics.Histogram.Add(preValue, new Tuple<double, bool>(count, count > 1)); count = 1; preValue = curValue; distCount++; } } if (count > 1) { popBucketCount += count; popValueCount++; } statistics.Histogram.Add(preValue, new Tuple<double, bool>(count, count > 1)); statistics.MaxValue = preValue; // Simple Denstity //statistics.Density = 1.0 / distCount; // Advanced Density statistics.Density = bucketCount == popBucketCount ? 0 : 1.0 * (bucketCount - popBucketCount) / bucketCount / (distCount - popValueCount); } // Generates a Height-balanced Histogram else { long preValue = sinkList[0]; int count = 0; int distCount = 1; for (int i = 1; i < rowCount; i++) { if (i % height == height - 1) { bucketCount++; var curValue = sinkList[i]; if (curValue == preValue) count += height; else { distCount++; if (count > height) { popBucketCount += count / height; popValueCount++; } //count = count == 0 ? height : count; statistics.Histogram.Add(preValue, new Tuple<double, bool>(count, count > height)); preValue = curValue; count = height; } } } if (count > height) { popBucketCount += count / height; popValueCount++; } statistics.Histogram.Add(preValue, new Tuple<double, bool>(count, count > height)); statistics.MaxValue = preValue; // Simple Density //statistics.Density = 1.0 / distCount; // Advanced Density statistics.Density = bucketCount == popBucketCount ? 0 : 1.0 * (bucketCount - popBucketCount) / bucketCount / (distCount - popValueCount); } edge.Statistics = statistics; }