/// <summary>
/// Returns the merged statistics for joining two edges on their sinks, computing them
/// on first use and caching them per (node edge alias, component edge alias) pair.
/// </summary>
/// <param name="nodeEdge">Edge whose alias forms the first half of the cache key.</param>
/// <param name="componentEdge">Edge whose alias forms the second half of the cache key.</param>
/// <returns>
/// The cached entry when one exists; otherwise the result of
/// <c>ColumnStatistics.UpdateHistogram</c> over the two edges' statistics from the context.
/// </returns>
public ColumnStatistics GetLeafToLeafStatistics(MatchEdge nodeEdge, MatchEdge componentEdge)
{
    var edgeTuple = new Tuple<string, string>(nodeEdge.EdgeAlias, componentEdge.EdgeAlias);

    // Single lookup instead of ContainsKey followed by the indexer.
    ColumnStatistics cachedStatistics;
    if (LeafToLeafSelectivity.TryGetValue(edgeTuple, out cachedStatistics))
    {
        return cachedStatistics;
    }

    var mergedStatistics = ColumnStatistics.UpdateHistogram(
        Context.GetEdgeStatistics(nodeEdge),
        Context.GetEdgeStatistics(componentEdge));
    LeafToLeafSelectivity[edgeTuple] = mergedStatistics;
    return mergedStatistics;
}
/// <summary>
/// Initializes a component that consists of the single node <paramref name="node"/>.
/// </summary>
/// <param name="node">
/// Node that seeds the component. Its row counts scale the size estimates and its
/// alias and table object name become the component's table reference.
/// </param>
public MatchComponent(MatchNode node)
    : this()
{
    // Register the node as materialized with no splits so far.
    Nodes.Add(node);
    MaterializedNodeSplitCount[node] = 0;

    // Initial statistics: only a selectivity of one row out of the node's table.
    var seedStatistics = new ColumnStatistics();
    seedStatistics.Selectivity = 1.0 / node.TableRowCount;
    StatisticsDict[node] = seedStatistics;

    Size *= node.EstimatedRows;
    EstimateSize *= node.EstimatedRows;

    // The component starts out as a plain reference to the node's table.
    var seedAlias = new Identifier();
    seedAlias.Value = node.RefAlias;

    var seedTable = new WNamedTableReference();
    seedTable.Alias = seedAlias;
    seedTable.TableObjectName = node.TableObjectName;
    TableRef = seedTable;
}
/// <summary>
/// Transits from the current component to the component of the next state by joining in
/// the one-height tree rooted at <c>candidateTree.TreeRoot</c>. All updates are applied to
/// a component created with <c>new MatchComponent(this)</c>, which is then returned.
/// </summary>
/// <param name="candidateTree">
/// Node unit to add: supplies the root node, the edges to materialize
/// (<c>MaterializedEdges</c>) and the edges left unmaterialized (<c>UnmaterializedEdges</c>).
/// </param>
/// <param name="densityDict">
/// Density per key. Only keys that match the table object name of a node joined in this
/// step contribute to the estimated join selectivity.
/// </param>
/// <param name="statisticsCalculator">
/// Supplies the leaf-to-leaf statistics used when a new edge meets a still unmaterialized
/// edge of the component at the same sink node.
/// </param>
/// <returns>
/// The new component, whose <c>TableRef</c> is the result of <c>GetPlanAndUpdateCost</c>.
/// </returns>
public MatchComponent GetNextState(
    OneHeightTree candidateTree,
    Dictionary<string, double> densityDict,
    IMatchJoinStatisticsCalculator statisticsCalculator)
{
    var newComponent = new MatchComponent(this);
    var root = candidateTree.TreeRoot;

    WBooleanExpression joinCondition = null;
    string nodeName;

    // Update Nodes
    if (newComponent.MaterializedNodeSplitCount.ContainsKey(root))
    {
        // The root is already materialized in the component: add another copy of it
        // and tie the copy to the original through GlobalNodeId.
        newComponent.MaterializedNodeSplitCount[root]++;
        nodeName = newComponent.GetNodeRefName(root);
        joinCondition = BuildColumnEquality(root.RefAlias, "GlobalNodeId", nodeName, "GlobalNodeId");
    }
    else
    {
        nodeName = root.RefAlias;
        newComponent.Nodes.Add(root);
        newComponent.MaterializedNodeSplitCount[root] = 0;
        newComponent.StatisticsDict[root] = new ColumnStatistics
        {
            Selectivity = 1.0 / root.TableRowCount
        };
    }

    // Constructs table reference
    WTableReference nodeTable = new WNamedTableReference
    {
        Alias = new Identifier { Value = nodeName },
        TableObjectName = root.TableObjectName
    };
    WTableReference compTable = newComponent.TableRef;

    // Updates join conditions
    double selectivity = 1.0;
    double degrees = 1.0;

    // Number of joins per table object name; the comparer is kept from the original code.
    var joinCountByTable = new Dictionary<string, int>(StringComparer.CurrentCultureIgnoreCase);

    List<MatchEdge> inEdges;
    if (newComponent.UnmaterializedNodeMapping.TryGetValue(root, out inEdges))
    {
        var firstEdge = inEdges.First();
        bool materialized = newComponent.EdgeMaterilizedDict[firstEdge];
        newComponent.UnmaterializedNodeMapping.Remove(root);
        selectivity *= 1.0 / root.TableRowCount;

        var rootTableName = root.TableObjectName.ToString();

        // Component materialized edge to root
        if (materialized)
        {
            joinCondition = WBooleanBinaryExpression.Conjunction(
                joinCondition,
                BuildColumnEquality(firstEdge.EdgeAlias, "Sink", nodeName, "GlobalNodeId"));

            AddJoinCount(joinCountByTable, rootTableName, 1);
        }
        // Component unmaterialized edge to root
        else
        {
            ColumnStatistics statistics = null;
            foreach (var edge in inEdges)
            {
                // Update component table
                compTable = SpanTableRef(compTable, edge, newComponent.GetNodeRefName(edge.SourceNode));
                newComponent.EdgeMaterilizedDict[edge] = true;

                joinCondition = WBooleanBinaryExpression.Conjunction(
                    joinCondition,
                    BuildColumnEquality(edge.EdgeAlias, "Sink", nodeName, "GlobalNodeId"));

                statistics = ColumnStatistics.UpdateHistogram(
                    statistics,
                    newComponent.Context.GetEdgeStatistics(edge));
                selectivity *= statistics.Selectivity;
            }
            newComponent.StatisticsDict[root] = statistics;

            AddJoinCount(joinCountByTable, rootTableName, inEdges.Count);
        }
    }

    var jointEdges = candidateTree.MaterializedEdges;
    int sinkToSinkCount = 0;
    foreach (var jointEdge in jointEdges)
    {
        // Update node table
        nodeTable = SpanTableRef(nodeTable, jointEdge, nodeName);
        degrees *= jointEdge.AverageDegree;

        newComponent.EdgeMaterilizedDict[jointEdge] = true;
        var sinkNode = jointEdge.SinkNode;

        // Leaf to component materialized node
        if (newComponent.MaterializedNodeSplitCount.ContainsKey(sinkNode))
        {
            // NOTE(review): this joins against sinkNode.RefAlias rather than
            // GetNodeRefName(sinkNode) - confirm that is intended for split nodes.
            joinCondition = WBooleanBinaryExpression.Conjunction(
                joinCondition,
                BuildColumnEquality(jointEdge.EdgeAlias, "Sink", sinkNode.RefAlias, "GlobalNodeId"));

            var statistics = ColumnStatistics.UpdateHistogram(
                newComponent.StatisticsDict[sinkNode],
                newComponent.Context.GetEdgeStatistics(jointEdge));
            selectivity *= statistics.Selectivity;
            newComponent.StatisticsDict[sinkNode] = statistics;

            AddJoinCount(joinCountByTable, sinkNode.TableObjectName.ToString(), 1);
        }
        // Leaf to component unmaterialized node
        else
        {
            inEdges = newComponent.UnmaterializedNodeMapping[sinkNode];
            var firstEdge = inEdges.First();
            bool materlizedEdge = newComponent.EdgeMaterilizedDict[firstEdge];

            // Leaf to materialized leaf
            if (materlizedEdge)
            {
                joinCondition = WBooleanBinaryExpression.Conjunction(
                    joinCondition,
                    BuildColumnEquality(jointEdge.EdgeAlias, "Sink", firstEdge.EdgeAlias, "Sink"));

                sinkToSinkCount++;
                var statistics = ColumnStatistics.UpdateHistogram(
                    newComponent.StatisticsDict[sinkNode],
                    newComponent.Context.GetEdgeStatistics(jointEdge));
                selectivity *= statistics.Selectivity;
                newComponent.StatisticsDict[sinkNode] = statistics;
            }
            // Leaf to unmaterialized leaf
            else
            {
                ColumnStatistics compSinkNodeStatistics = null;
                foreach (var inEdge in inEdges)
                {
                    compTable = SpanTableRef(compTable, inEdge, newComponent.GetNodeRefName(inEdge.SourceNode));
                    newComponent.EdgeMaterilizedDict[inEdge] = true;

                    joinCondition = WBooleanBinaryExpression.Conjunction(
                        joinCondition,
                        BuildColumnEquality(jointEdge.EdgeAlias, "Sink", inEdge.EdgeAlias, "Sink"));

                    sinkToSinkCount++;
                    var leafToLeafStatistics = statisticsCalculator.GetLeafToLeafStatistics(jointEdge, inEdge);
                    selectivity *= leafToLeafStatistics.Selectivity;
                    compSinkNodeStatistics = ColumnStatistics.UpdateHistogram(
                        compSinkNodeStatistics,
                        newComponent.Context.GetEdgeStatistics(inEdge));
                }
                newComponent.StatisticsDict[sinkNode] = compSinkNodeStatistics;
            }
        }
    }

    // Edges that stay unmaterialized only register their sink node with the component.
    var unmatEdges = candidateTree.UnmaterializedEdges;
    foreach (var unmatEdge in unmatEdges)
    {
        newComponent.EdgeMaterilizedDict[unmatEdge] = false;
        newComponent.Nodes.Add(unmatEdge.SinkNode);
        var sinkNodeInEdges = newComponent.UnmaterializedNodeMapping.GetOrCreate(unmatEdge.SinkNode);
        sinkNodeInEdges.Add(unmatEdge);
        degrees *= unmatEdge.AverageDegree;
    }

    // Calculate Estimated Join Selectivity & Estimated Node Size
    double estimatedSelectivity = 1.0;
    int count = 0;
    bool sinkJoin = false;
    foreach (var item in densityDict)
    {
        int curJoinCount;
        if (!joinCountByTable.TryGetValue(item.Key, out curJoinCount))
        {
            // No join against this table in this step.
            continue;
        }

        var density = item.Value;
        var curJoinSelectivity = Math.Pow(density, 2 - Math.Pow(2, 1 - curJoinCount));

        // The sink-to-sink joins are folded in once, before the first entry whose
        // density exceeds the default density.
        if (!sinkJoin && ColumnStatistics.DefaultDensity < density)
        {
            var curSinkJoinSelectivity = Math.Pow(
                ColumnStatistics.DefaultDensity,
                2 - Math.Pow(2, 1 - sinkToSinkCount));
            estimatedSelectivity *= Math.Pow(curSinkJoinSelectivity, Math.Pow(2, -count));
            count += sinkToSinkCount;
            sinkJoin = true;
        }

        estimatedSelectivity *= Math.Pow(curJoinSelectivity, Math.Pow(2, -count));
        count += curJoinCount;
    }

    var estimatedNodeUnitSize = root.EstimatedRows *
        Math.Pow(1000, candidateTree.MaterializedEdges.Count + candidateTree.UnmaterializedEdges.Count);

    // Update Table Reference
    newComponent.TableRef = GetPlanAndUpdateCost(
        candidateTree,
        newComponent,
        nodeTable,
        compTable,
        joinCondition,
        degrees,
        selectivity,
        estimatedNodeUnitSize,
        estimatedSelectivity);

    return newComponent;
}

/// <summary>
/// Builds the comparison <c>leftTable.leftColumn = rightTable.rightColumn</c>.
/// </summary>
private static WBooleanComparisonExpression BuildColumnEquality(
    string leftTable,
    string leftColumn,
    string rightTable,
    string rightColumn)
{
    return new WBooleanComparisonExpression
    {
        FirstExpr = BuildColumnReference(leftTable, leftColumn),
        SecondExpr = BuildColumnReference(rightTable, rightColumn),
        ComparisonType = BooleanComparisonType.Equals
    };
}

/// <summary>
/// Builds a regular two-part column reference <c>tableAlias.columnName</c>.
/// </summary>
private static WColumnReferenceExpression BuildColumnReference(string tableAlias, string columnName)
{
    return new WColumnReferenceExpression
    {
        ColumnType = ColumnType.Regular,
        MultiPartIdentifier = new WMultiPartIdentifier(
            new Identifier { Value = tableAlias },
            new Identifier { Value = columnName })
    };
}

/// <summary>
/// Adds <paramref name="increment"/> to the join count of <paramref name="tableName"/>,
/// starting from zero when the table has no count yet.
/// </summary>
private static void AddJoinCount(Dictionary<string, int> joinCountByTable, string tableName, int increment)
{
    int current;
    joinCountByTable.TryGetValue(tableName, out current);
    joinCountByTable[tableName] = current + increment;
}
/// <summary>
/// Merges two column statistics into the statistics of their join on the histogram keys.
/// </summary>
/// <remarks>
/// Each histogram entry is a (count, flag) tuple. Entries whose flag is true are joined
/// individually. Entries whose flag is false (called "not popular" in this code) are
/// pooled and share one averaged estimate.
/// NOTE(review): the merge walks both histograms in enumeration order and compares keys
/// with &lt; and &gt;, so it presumably expects both to enumerate in ascending key order -
/// confirm against how the histograms are built.
/// </remarks>
/// <param name="curStatistics">Statistics accumulated so far; may be null.</param>
/// <param name="newStatistics">Statistics to merge in; may be null.</param>
/// <returns>
/// The other argument when one is null; a pass-through of the non-empty side (with the
/// selectivities multiplied) when one histogram is empty; otherwise the merged statistics.
/// </returns>
internal static ColumnStatistics UpdateHistogram(ColumnStatistics curStatistics, ColumnStatistics newStatistics)
{
    if (curStatistics == null)
    {
        return newStatistics;
    }
    if (newStatistics == null)
    {
        return curStatistics;
    }

    var curHistogram = curStatistics.Histogram;
    var newHistogram = newStatistics.Histogram;

    // Pass-through cases. RowCount and MaxValue are carried along because the full merge
    // below reads them from its inputs; leaving them unset would make a later merge of
    // this result divide by a zero row count.
    if (!curHistogram.Any())
    {
        return new ColumnStatistics
        {
            Density = newStatistics.Density,
            Histogram = newHistogram,
            MaxValue = newStatistics.MaxValue,
            RowCount = newStatistics.RowCount,
            Selectivity = curStatistics.Selectivity * newStatistics.Selectivity,
        };
    }
    if (!newHistogram.Any())
    {
        return new ColumnStatistics
        {
            Density = curStatistics.Density,
            Histogram = curHistogram,
            MaxValue = curStatistics.MaxValue,
            RowCount = curStatistics.RowCount,
            Selectivity = curStatistics.Selectivity * newStatistics.Selectivity,
        };
    }

    var resHistogram = new Dictionary<long, Tuple<double, bool>>();
    var curNotPopularCount = 0.0;
    var newNotPopularCount = 0.0;

    // Rows assumed for a value that has no individual entry on that side.
    var curDefaultRow = curStatistics.Density * curStatistics.RowCount;
    var newDefaultRow = newStatistics.Density * newStatistics.RowCount;

    IEnumerator<KeyValuePair<long, Tuple<double, bool>>> newEntry = null;
    bool firstMatch = false;
    bool newHistogramEnd = false;
    double resRowCount = 0.0;
    var notPopularValues = new List<long>();

    try
    {
        foreach (var entry in curHistogram)
        {
            // Entries of the current histogram are ignored until the first key that the
            // new histogram also contains; the new histogram is positioned on that key.
            if (!firstMatch)
            {
                if (newHistogram.ContainsKey(entry.Key))
                {
                    firstMatch = true;
                    var matchedKey = entry.Key;
                    newEntry = newHistogram.SkipWhile(e => e.Key != matchedKey).GetEnumerator();
                    newEntry.MoveNext();

                    // NOTE(review): the first matched entry's count is subtracted here and
                    // only added back below when both sides are not popular - confirm the
                    // negative contribution in the mixed case is intended.
                    if (!entry.Value.Item2)
                    {
                        curNotPopularCount -= entry.Value.Item1;
                    }
                    if (!newEntry.Current.Value.Item2)
                    {
                        newNotPopularCount -= newEntry.Current.Value.Item1;
                    }
                }
            }

            if (!firstMatch)
            {
                continue;
            }

            if (newHistogramEnd || entry.Key < newEntry.Current.Key)
            {
                // Key exists only in the current histogram.
                var curTuple = entry.Value;
                if (curTuple.Item2)
                {
                    var tmpCount = curTuple.Item1 * newDefaultRow;
                    resRowCount += tmpCount;
                    resHistogram.Add(entry.Key, new Tuple<double, bool>(tmpCount, true));
                }
                else
                {
                    notPopularValues.Add(entry.Key);
                    curNotPopularCount += curTuple.Item1;
                    // Placeholder; replaced with the averaged estimate after the loop.
                    resHistogram.Add(entry.Key, null);
                }
            }
            else if (entry.Key > newEntry.Current.Key)
            {
                // Emit the keys that exist only in the new histogram until it catches up.
                // NOTE(review): the current entry itself is not emitted by this branch,
                // and the outer loop stops here once the new histogram is exhausted -
                // confirm both are intended.
                while (entry.Key > newEntry.Current.Key)
                {
                    var newTuple = newEntry.Current.Value;
                    if (newTuple.Item2)
                    {
                        var tmpCount = newTuple.Item1 * curDefaultRow;
                        resRowCount += tmpCount;
                        resHistogram.Add(newEntry.Current.Key, new Tuple<double, bool>(tmpCount, true));
                    }
                    else
                    {
                        notPopularValues.Add(newEntry.Current.Key);
                        newNotPopularCount += newTuple.Item1;
                        resHistogram.Add(newEntry.Current.Key, null);
                    }

                    if (!newEntry.MoveNext())
                    {
                        newHistogramEnd = true;
                        break;
                    }
                }

                if (newHistogramEnd)
                {
                    break;
                }
            }
            else
            {
                // Key exists in both histograms.
                var curTuple = entry.Value;
                var newTuple = newEntry.Current.Value;
                if (!curTuple.Item2 && !newTuple.Item2)
                {
                    notPopularValues.Add(entry.Key);
                    curNotPopularCount += curTuple.Item1;
                    newNotPopularCount += newTuple.Item1;
                    resHistogram.Add(entry.Key, null);
                }
                else
                {
                    var count1 = curTuple.Item2 ? curTuple.Item1 : curDefaultRow;
                    var count2 = newTuple.Item2 ? newTuple.Item1 : newDefaultRow;
                    var tmpCount = count1 * count2;
                    resRowCount += tmpCount;
                    resHistogram.Add(entry.Key, new Tuple<double, bool>(tmpCount, true));
                }

                if (!newEntry.MoveNext())
                {
                    newHistogramEnd = true;
                }
            }
        }
    }
    finally
    {
        // The SkipWhile iterator is disposable and was previously never released.
        if (newEntry != null)
        {
            newEntry.Dispose();
        }
    }

    double density = -1;
    if (notPopularValues.Any())
    {
        // For positive counts the expression equals
        // Min(curDefaultRow * newNotPopularCount, newDefaultRow * curNotPopularCount),
        // which is 0 when either count is 0. Evaluating the quotient form at exactly 0
        // gives 0 * Infinity or 0 / 0, i.e. NaN, so that boundary is handled explicitly.
        var resDefaultRow = (curNotPopularCount == 0 || newNotPopularCount == 0)
            ? 0.0
            : curNotPopularCount * newNotPopularCount *
              Math.Min(curDefaultRow / curNotPopularCount, newDefaultRow / newNotPopularCount);

        resRowCount += resDefaultRow;
        resDefaultRow = resDefaultRow / notPopularValues.Count;

        // With no joined rows the quotient would be 0 / 0; keep the sentinel so the
        // fallback density below is used instead.
        if (resRowCount != 0)
        {
            density = resDefaultRow / resRowCount;
        }

        foreach (var value in notPopularValues)
        {
            resHistogram[value] = new Tuple<double, bool>(resDefaultRow, false);
        }
    }

    return new ColumnStatistics
    {
        Histogram = resHistogram,
        Density = density < 0 ? Math.Max(curStatistics.Density, newStatistics.Density) : density,
        MaxValue = Math.Max(curStatistics.MaxValue, newStatistics.MaxValue),
        RowCount = resRowCount,
        Selectivity = resRowCount / (curStatistics.RowCount * newStatistics.RowCount),
    };
}
/// <summary>
/// Initializes a component that consists of the single node <paramref name="node"/>.
/// </summary>
/// <param name="node">
/// Node that seeds the component. Its row counts scale the size estimates and its
/// alias and table object name become the component's table reference.
/// </param>
public MatchComponent(MatchNode node)
    : this()
{
    // Register the node as materialized with no splits so far.
    Nodes.Add(node);
    MaterializedNodeSplitCount[node] = 0;

    // Initial statistics: only a selectivity of one row out of the node's table.
    var seedStatistics = new ColumnStatistics();
    seedStatistics.Selectivity = 1.0 / node.TableRowCount;
    StatisticsDict[node] = seedStatistics;

    Size *= node.EstimatedRows;
    EstimateSize *= node.EstimatedRows;

    // The component starts out as a plain reference to the node's table.
    var seedAlias = new Identifier();
    seedAlias.Value = node.RefAlias;

    var seedTable = new WNamedTableReference();
    seedTable.Alias = seedAlias;
    seedTable.TableObjectName = node.TableObjectName;
    TableRef = seedTable;
}
/// <summary>
/// Builds the sink-id histogram for an edge and registers it with the context.
/// </summary>
/// <remarks>
/// The bucket height is <c>(int)(rowCount / BucketNum)</c>. When it is below 2, a frequency
/// histogram with one entry per distinct value is built. Otherwise every height-th value
/// of the sorted list is sampled into a height-balanced histogram. Each entry stores
/// (count, flag), where the flag is true when the count exceeds 1 (frequency) or the
/// bucket height (height-balanced).
/// </remarks>
/// <param name="edge">Edge the statistics are registered for.</param>
/// <param name="sinkList">Sink ids of the edge. The list is sorted in place.</param>
private void UpdateEdgeHistogram(MatchEdge edge, List<long> sinkList)
{
    sinkList.Sort();
    var rowCount = sinkList.Count;
    var statistics = new ColumnStatistics { RowCount = rowCount };

    // Nothing to bucket: register the empty statistics instead of failing on sinkList[0].
    if (rowCount == 0)
    {
        _context.AddEdgeStatistics(edge, statistics);
        return;
    }

    var height = (int)(rowCount / BucketNum);
    var popBucketCount = 0;
    var popValueCount = 0;
    var bucketCount = 0;
    var distCount = 1;
    long preValue = sinkList[0];
    int count;

    // If number in each bucket is very small, then generate a Frequency Histogram
    if (height < 2)
    {
        bucketCount = rowCount;
        count = 1;
        for (int i = 1; i < rowCount; i++)
        {
            var curValue = sinkList[i];
            if (curValue == preValue)
            {
                count++;
                continue;
            }

            // The run of preValue ended: record it and start the next run.
            if (count > 1)
            {
                popBucketCount += count;
                popValueCount++;
            }
            statistics.Histogram.Add(preValue, new Tuple<double, bool>(count, count > 1));
            count = 1;
            preValue = curValue;
            distCount++;
        }

        // Record the final run.
        if (count > 1)
        {
            popBucketCount += count;
            popValueCount++;
        }
        statistics.Histogram.Add(preValue, new Tuple<double, bool>(count, count > 1));
    }
    // Generate a Height-balanced Histogram
    else
    {
        count = 0;
        for (int i = 1; i < rowCount; i++)
        {
            // Only the last value of each bucket of `height` rows is sampled.
            if (i % height != height - 1)
            {
                continue;
            }

            bucketCount++;
            var curValue = sinkList[i];
            if (curValue == preValue)
            {
                count += height;
                continue;
            }

            distCount++;
            if (count > height)
            {
                popBucketCount += count / height;
                popValueCount++;
            }
            statistics.Histogram.Add(preValue, new Tuple<double, bool>(count, count > height));
            preValue = curValue;
            count = height;
        }

        // Record the last sampled value.
        if (count > height)
        {
            popBucketCount += count / height;
            popValueCount++;
        }
        statistics.Histogram.Add(preValue, new Tuple<double, bool>(count, count > height));
    }

    statistics.MaxValue = preValue;

    // Density: share of buckets not covered by flagged values, spread evenly over the
    // distinct values that are not flagged; 0 when every bucket is covered.
    statistics.Density = bucketCount == popBucketCount
        ? 0
        : 1.0 * (bucketCount - popBucketCount) / bucketCount / (distCount - popValueCount);

    _context.AddEdgeStatistics(edge, statistics);
}
/// <summary>
/// Stores the statistics of an edge in the edge-statistics collection, keyed by the edge.
/// </summary>
/// <param name="edge">Edge the statistics belong to.</param>
/// <param name="statistics">Statistics to store for <paramref name="edge"/>.</param>
public void AddEdgeStatistics(MatchEdge edge, ColumnStatistics statistics)
{
    // NOTE(review): uses Add rather than the indexer, so registering the same edge twice
    // presumably fails if the collection is a dictionary - confirm callers add each edge once.
    _edgeStatisticses.Add(edge, statistics);
}
/// <summary>
/// Merges two column statistics into the statistics of their join on the histogram keys.
/// </summary>
/// <remarks>
/// Each histogram entry is a (count, flag) tuple. Entries whose flag is true are joined
/// individually. Entries whose flag is false (called "not popular" in this code) are
/// pooled and share one averaged estimate.
/// NOTE(review): the merge walks both histograms in enumeration order and compares keys
/// with &lt; and &gt;, so it presumably expects both to enumerate in ascending key order -
/// confirm against how the histograms are built.
/// </remarks>
/// <param name="curStatistics">Statistics accumulated so far; may be null.</param>
/// <param name="newStatistics">Statistics to merge in; may be null.</param>
/// <returns>
/// The other argument when one is null; a pass-through of the non-empty side (with the
/// selectivities multiplied) when one histogram is empty; otherwise the merged statistics.
/// </returns>
internal static ColumnStatistics UpdateHistogram(ColumnStatistics curStatistics, ColumnStatistics newStatistics)
{
    if (curStatistics == null)
    {
        return newStatistics;
    }
    if (newStatistics == null)
    {
        return curStatistics;
    }

    var curHistogram = curStatistics.Histogram;
    var newHistogram = newStatistics.Histogram;

    // Pass-through cases. RowCount and MaxValue are carried along because the full merge
    // below reads them from its inputs; leaving them unset would make a later merge of
    // this result divide by a zero row count.
    if (!curHistogram.Any())
    {
        return new ColumnStatistics
        {
            Density = newStatistics.Density,
            Histogram = newHistogram,
            MaxValue = newStatistics.MaxValue,
            RowCount = newStatistics.RowCount,
            Selectivity = curStatistics.Selectivity * newStatistics.Selectivity,
        };
    }
    if (!newHistogram.Any())
    {
        return new ColumnStatistics
        {
            Density = curStatistics.Density,
            Histogram = curHistogram,
            MaxValue = curStatistics.MaxValue,
            RowCount = curStatistics.RowCount,
            Selectivity = curStatistics.Selectivity * newStatistics.Selectivity,
        };
    }

    var resHistogram = new Dictionary<long, Tuple<double, bool>>();
    var curNotPopularCount = 0.0;
    var newNotPopularCount = 0.0;

    // Rows assumed for a value that has no individual entry on that side.
    var curDefaultRow = curStatistics.Density * curStatistics.RowCount;
    var newDefaultRow = newStatistics.Density * newStatistics.RowCount;

    IEnumerator<KeyValuePair<long, Tuple<double, bool>>> newEntry = null;
    bool firstMatch = false;
    bool newHistogramEnd = false;
    double resRowCount = 0.0;
    var notPopularValues = new List<long>();

    try
    {
        foreach (var entry in curHistogram)
        {
            // Entries of the current histogram are ignored until the first key that the
            // new histogram also contains; the new histogram is positioned on that key.
            if (!firstMatch)
            {
                if (newHistogram.ContainsKey(entry.Key))
                {
                    firstMatch = true;
                    var matchedKey = entry.Key;
                    newEntry = newHistogram.SkipWhile(e => e.Key != matchedKey).GetEnumerator();
                    newEntry.MoveNext();

                    // NOTE(review): the first matched entry's count is subtracted here and
                    // only added back below when both sides are not popular - confirm the
                    // negative contribution in the mixed case is intended.
                    if (!entry.Value.Item2)
                    {
                        curNotPopularCount -= entry.Value.Item1;
                    }
                    if (!newEntry.Current.Value.Item2)
                    {
                        newNotPopularCount -= newEntry.Current.Value.Item1;
                    }
                }
            }

            if (!firstMatch)
            {
                continue;
            }

            if (newHistogramEnd || entry.Key < newEntry.Current.Key)
            {
                // Key exists only in the current histogram.
                var curTuple = entry.Value;
                if (curTuple.Item2)
                {
                    var tmpCount = curTuple.Item1 * newDefaultRow;
                    resRowCount += tmpCount;
                    resHistogram.Add(entry.Key, new Tuple<double, bool>(tmpCount, true));
                }
                else
                {
                    notPopularValues.Add(entry.Key);
                    curNotPopularCount += curTuple.Item1;
                    // Placeholder; replaced with the averaged estimate after the loop.
                    resHistogram.Add(entry.Key, null);
                }
            }
            else if (entry.Key > newEntry.Current.Key)
            {
                // Emit the keys that exist only in the new histogram until it catches up.
                // NOTE(review): the current entry itself is not emitted by this branch,
                // and the outer loop stops here once the new histogram is exhausted -
                // confirm both are intended.
                while (entry.Key > newEntry.Current.Key)
                {
                    var newTuple = newEntry.Current.Value;
                    if (newTuple.Item2)
                    {
                        var tmpCount = newTuple.Item1 * curDefaultRow;
                        resRowCount += tmpCount;
                        resHistogram.Add(newEntry.Current.Key, new Tuple<double, bool>(tmpCount, true));
                    }
                    else
                    {
                        notPopularValues.Add(newEntry.Current.Key);
                        newNotPopularCount += newTuple.Item1;
                        resHistogram.Add(newEntry.Current.Key, null);
                    }

                    if (!newEntry.MoveNext())
                    {
                        newHistogramEnd = true;
                        break;
                    }
                }

                if (newHistogramEnd)
                {
                    break;
                }
            }
            else
            {
                // Key exists in both histograms.
                var curTuple = entry.Value;
                var newTuple = newEntry.Current.Value;
                if (!curTuple.Item2 && !newTuple.Item2)
                {
                    notPopularValues.Add(entry.Key);
                    curNotPopularCount += curTuple.Item1;
                    newNotPopularCount += newTuple.Item1;
                    resHistogram.Add(entry.Key, null);
                }
                else
                {
                    var count1 = curTuple.Item2 ? curTuple.Item1 : curDefaultRow;
                    var count2 = newTuple.Item2 ? newTuple.Item1 : newDefaultRow;
                    var tmpCount = count1 * count2;
                    resRowCount += tmpCount;
                    resHistogram.Add(entry.Key, new Tuple<double, bool>(tmpCount, true));
                }

                if (!newEntry.MoveNext())
                {
                    newHistogramEnd = true;
                }
            }
        }
    }
    finally
    {
        // The SkipWhile iterator is disposable and was previously never released.
        if (newEntry != null)
        {
            newEntry.Dispose();
        }
    }

    double density = -1;
    if (notPopularValues.Any())
    {
        // For positive counts the expression equals
        // Min(curDefaultRow * newNotPopularCount, newDefaultRow * curNotPopularCount),
        // which is 0 when either count is 0. Evaluating the quotient form at exactly 0
        // gives 0 * Infinity or 0 / 0, i.e. NaN, so that boundary is handled explicitly.
        var resDefaultRow = (curNotPopularCount == 0 || newNotPopularCount == 0)
            ? 0.0
            : curNotPopularCount * newNotPopularCount *
              Math.Min(curDefaultRow / curNotPopularCount, newDefaultRow / newNotPopularCount);

        resRowCount += resDefaultRow;
        resDefaultRow = resDefaultRow / notPopularValues.Count;

        // With no joined rows the quotient would be 0 / 0; keep the sentinel so the
        // fallback density below is used instead.
        if (resRowCount != 0)
        {
            density = resDefaultRow / resRowCount;
        }

        foreach (var value in notPopularValues)
        {
            resHistogram[value] = new Tuple<double, bool>(resDefaultRow, false);
        }
    }

    return new ColumnStatistics
    {
        Histogram = resHistogram,
        Density = density < 0 ? Math.Max(curStatistics.Density, newStatistics.Density) : density,
        MaxValue = Math.Max(curStatistics.MaxValue, newStatistics.MaxValue),
        RowCount = resRowCount,
        Selectivity = resRowCount / (curStatistics.RowCount * newStatistics.RowCount),
    };
}