/// <summary>
///  Build a TermExpression which compares <paramref name="left"/> to <paramref name="right"/>
///  using <paramref name="op"/>.
/// </summary>
/// <remarks>
///  The constructor normalizes the comparison before picking a comparer:
///   - A constant on the left is moved to the right when the operator can be inverted.
///   - The right side is cast to the left side's type when the types differ.
///   - Comparisons to a null constant use WhereIsNull / WhereIsNotNull.
///   - Enum column to constant comparisons are converted to compare indices.
///   - String8 column Contains constant uses the raw String8 component when the column exposes one.
/// </remarks>
/// <param name="source">Table passed to CastedColumn.Build and SetComparer.ConvertToEnumIndexComparer</param>
/// <param name="left">Left side of the comparison</param>
/// <param name="op">Operator to compare with</param>
/// <param name="right">Right side of the comparison</param>
/// <exception cref="ArgumentException">
///  Thrown when both sides are constants, when an unquoted literal is compared to a String8 column,
///  when an operator other than Equal or NotEqual is used against null, or when no comparer is
///  available for the left side type and operator.
/// </exception>
public TermExpression(IXTable source, IXColumn left, CompareOperator op, IXColumn right)
{
    _evaluate = EvaluateNormal;

    // Save arguments as-is for ToString()
    // NOTE(review): _cOp keeps the operator as passed while _left/_right may be swapped below;
    // confirm ToString() output is still the intended one after a swap.
    _left = left;
    _cOp = op;
    _right = right;

    // Disallow constant <op> constant [likely error not wrapping column name]
    if (_left is ConstantColumn && _right is ConstantColumn)
    {
        throw new ArgumentException($"({left} {op.ToQueryForm()} {right}) is comparing two constants. Wrap [ColumnNames] in braces.");
    }

    // If the left side is a constant and the operator can be swapped, move it to the right side.
    // Comparers can check if the right side is constant and run a faster loop when that's the case.
    if (_left.IsConstantColumn() && !_right.IsConstantColumn())
    {
        // Invert into a local so that 'op' is only replaced when the inversion succeeded;
        // an 'out' argument is always overwritten, even when the call returns false.
        CompareOperator invertedOp;
        if (op.TryInvertCompareOperator(out invertedOp))
        {
            op = invertedOp;
            _left = right;
            _right = left;
        }
    }

    // Disallow unquoted constants used as strings
    if (_right.IsConstantColumn() && _left.ColumnDetails.Type == typeof(String8) && _right.ColumnDetails.Type == typeof(String8))
    {
        ConstantColumn constantRight = _right as ConstantColumn;
        if (constantRight != null && !constantRight.IsNull && constantRight.WasUnwrappedLiteral)
        {
            // Report _right (the constant actually being checked), which is the original 'left' after a swap
            throw new ArgumentException($"{_right} is compared to a string, but is unquoted. Strings must be quoted.");
        }
    }

    // Convert the right side to the left side type if required
    // This means constants will always be casted to the other side type.
    if (_left.ColumnDetails.Type != _right.ColumnDetails.Type)
    {
        _right = CastedColumn.Build(source, _right, _left.ColumnDetails.Type, ValueKinds.Invalid);
    }

    // Get the left and right getters
    _leftGetter = _left.CurrentGetter();
    _rightGetter = _right.CurrentGetter();

    bool rightIsNull = _right.IsNullConstant();
    if (rightIsNull || _left.IsNullConstant())
    {
        // Null comparison is generic
        if (!rightIsNull)
        {
            // The null constant is on the left: make the other side the column under test,
            // and keep the getter in step with it (presumably evaluation reads _leftGetter - verify against EvaluateNormal).
            _left = _right;
            _leftGetter = _rightGetter;
        }

        if (op == CompareOperator.Equal)
        {
            _comparer = WhereIsNull;
        }
        else if (op == CompareOperator.NotEqual)
        {
            _comparer = WhereIsNotNull;
        }
        else
        {
            throw new ArgumentException("Only equals and not equals operators are supported against null.");
        }
    }
    else
    {
        // Get a comparer which can compare the values.
        // Use _left (not the 'left' argument): after a swap the argument is the constant,
        // while both sides are now in the type of _left.
        Type comparedType = _left.ColumnDetails.Type;
        _comparer = TypeProviderFactory.Get(comparedType).TryGetComparer(op);
        if (_comparer == null)
        {
            throw new ArgumentException($"No comparer found for type {comparedType.Name}.");
        }
    }

    // Optimize Enum to Constant comparisons to use the underlying indices
    if (_left.IsEnumColumn() && _right.IsConstantColumn())
    {
        // Get an optimized comparer against the indices rather than values
        IXColumn replacedRight = _right;
        _comparer = SetComparer.ConvertToEnumIndexComparer(_left, _comparer, ref replacedRight, source);

        // Get the indices on the left side
        _leftGetter = _left.IndicesCurrentGetter();

        // Use the updated value for the right side
        _rightGetter = replacedRight.CurrentGetter();
    }

    // Allow String8 to constant Contains queries to compare on the raw byte[] and int[]
    if (op == CompareOperator.Contains && _right.IsConstantColumn() && _left.ColumnDetails.Type == typeof(String8) && !_left.IsEnumColumn())
    {
        Func<object> rawGetter = _left.ComponentGetter(ColumnComponent.String8Raw);

        // Columns which return no raw getter keep the normal evaluation path
        if (rawGetter != null)
        {
            // Read the constant once; it is captured by the evaluator below
            String8 rightValue = (String8)_right.ValuesGetter()().Array.GetValue(0);
            String8Comparer string8Comparer = new String8Comparer();

            _evaluate = (vector) =>
            {
                String8Raw raw = (String8Raw)rawGetter();
                string8Comparer.WhereContains(raw, rightValue, vector);
            };
        }
    }
}
/// <summary>
///  Build the GroupBy Dictionary used by Peek.
/// </summary>
/// <remarks>
///  Peek reports each common distinct value along with roughly what share of rows carry it.
///  With many matching rows a sample is sufficient, because common values will appear in the sample.
///  The number of matches isn't known up front, so a Dictionary is kept for all rows and for
///  successively smaller samples, each about one eighth of the one before it.
///  Once a smaller sample has seen more than RequiredSampleSize rows, the larger set before it is dropped.
///  This lets the query run only once, end with a big enough sample, and avoid building giant Dictionaries.
/// </remarks>
/// <param name="cancellationToken">CancellationToken to request early stop</param>
private void BuildDictionary(CancellationToken cancellationToken)
{
    // A single EnumColumn key has its own dedicated path
    if (_column.IsEnumColumn())
    {
        BuildSingleEnumColumnDictionary(cancellationToken);
        return;
    }

    // Random source used to pick the sampled rows
    Random random = new Random();

    // One Dictionary, one CountAggregator, and one remap buffer per sample level
    GroupByDictionary[] levelDictionaries = new GroupByDictionary[SampleCount];
    CountAggregator[] levelCounts = new CountAggregator[SampleCount];
    int[][] remapBuffers = new int[SampleCount][];

    for (int level = 0; level < SampleCount; ++level)
    {
        levelDictionaries[level] = new GroupByDictionary(new ColumnDetails[] { _column.ColumnDetails });
        levelCounts[level] = new CountAggregator();
    }

    // Getter for the key column values
    Func<XArray> getValues = _column.CurrentGetter();

    // The level whose results would be reported if the source ended now
    int reportedLevel = 0;

    XArray[] keys = new XArray[1];

    while (true)
    {
        int rowCount = _source.Next(XTableExtensions.DefaultBatchSize, cancellationToken);
        if (rowCount == 0)
        {
            break;
        }

        // Start each batch from the full set of column values
        keys[0] = getValues();

        for (int level = 0; level < SampleCount; ++level)
        {
            // Levels below the reported one have already been dropped
            if (level >= reportedLevel)
            {
                GroupByDictionary dictionary = levelDictionaries[level];
                CountAggregator counter = levelCounts[level];

                // Find the bucket for each row, then count rows per bucket
                XArray bucketIndices = dictionary.FindOrAdd(keys);
                counter.Add(bucketIndices, dictionary.Count);

                // When the level just after the reported one is big enough, stop collecting the bigger set
                bool isNextLevel = (level == reportedLevel + 1);
                if (isNextLevel && counter.TotalRowCount > RequiredSampleSize)
                {
                    // If every row was unique, stop early and don't set outputs (zero rows)
                    if (ShouldStopEarly(levelDictionaries[reportedLevel], levelCounts[reportedLevel]))
                    {
                        return;
                    }

                    levelDictionaries[reportedLevel] = null;
                    levelCounts[reportedLevel] = null;
                    reportedLevel++;
                }
            }

            // Narrow the rows to about one eighth for the next level (nothing follows the last level)
            if (level < SampleCount - 1)
            {
                ArraySelector eighth = Sampler.Eighth(keys[0].Selector, random, ref remapBuffers[level]);
                keys[0] = keys[0].Reselect(eighth);
            }
        }
    }

    // The source is exhausted: report distinct values and counts from the current level
    GroupByDictionary reportedDictionary = levelDictionaries[reportedLevel];
    CountAggregator reportedCounts = levelCounts[reportedLevel];
    PostSortAndFilter(reportedDictionary.DistinctKeys()[0], reportedCounts.Values, reportedCounts.TotalRowCount, reportedLevel == 0);
}