/// <summary>
///  Build a one-column table from <paramref name="input"/>, run <paramref name="queryText"/> over it,
///  and verify that <paramref name="outputColumnName"/> matches <paramref name="expected"/> when the
///  results are paged one row, one more row, the remainder, and finally everything at once after a Reset.
/// </summary>
/// <param name="input">Values for the single input column</param>
/// <param name="inputColumnName">Name to give the input column</param>
/// <param name="expected">Values the output column must contain, in order</param>
/// <param name="outputColumnName">Name of the query output column to check</param>
/// <param name="queryText">Query to run over the input table</param>
public static void RunAndCompare(XArray input, string inputColumnName, XArray expected, string outputColumnName, string queryText)
{
    XDatabaseContext context = new XDatabaseContext();
    context.RequestedAsOfDateTime = TestAsOfDateTime;

    // The column type is taken from the element type of the array backing the input
    ColumnDetails inputColumn = new ColumnDetails(inputColumnName, input.Array.GetType().GetElementType());

    IXTable query = TableTestHarness.DatabaseContext
        .FromArrays(input.Count)
        .WithColumn(inputColumn, input)
        .Query(queryText, context);

    Func<XArray> readOutput = query.Columns.Find(outputColumnName).CurrentGetter();
    int rowsReturned;

    // Page 1: a single row
    rowsReturned = query.Next(1);
    TableTestHarness.AssertAreEqual(TableTestHarness.Slice(expected, 0, 1), readOutput(), rowsReturned);

    // Page 2: another single row (the getter must reflect the new page)
    rowsReturned = query.Next(1);
    TableTestHarness.AssertAreEqual(TableTestHarness.Slice(expected, 1, 2), readOutput(), rowsReturned);

    // Page 3: everything that is left (arrays must grow if needed)
    int remaining = expected.Count - 2;
    rowsReturned = query.Next(remaining);
    Assert.AreEqual(remaining, rowsReturned);
    TableTestHarness.AssertAreEqual(TableTestHarness.Slice(expected, 2, expected.Count), readOutput(), rowsReturned);

    // Start over and request more rows than exist; exactly the expected rows must come back
    query.Reset();
    rowsReturned = query.Next(expected.Count + 1);
    Assert.AreEqual(expected.Count, rowsReturned);
    TableTestHarness.AssertAreEqual(expected, readOutput(), rowsReturned);
}
/// <summary>
///  Return the next page of chosen rows. On the first call the whole source is walked once
///  (via BuildChooseDictionary) and then reset; afterwards source pages are read until one
///  contains at least one chosen row.
/// </summary>
/// <param name="desiredCount">Row count to request from the source for each page</param>
/// <param name="cancellationToken">Token passed through to the source</param>
/// <returns>Count of chosen rows in the current page; zero when the source has no more rows</returns>
public int Next(int desiredCount, CancellationToken cancellationToken)
{
    // If this is the first call, walk all rows once to find best rows
    if (!_isDictionaryBuilt)
    {
        _isDictionaryBuilt = true;
        BuildChooseDictionary(cancellationToken);
        _source.Reset();
    }

    int outerCount;
    while ((outerCount = _source.Next(desiredCount, cancellationToken)) > 0)
    {
        // Ask for the indices of rows which were chosen in this page
        XArray chosenRows = _dictionary.GetChosenRows(_totalRowsRead, _totalRowsRead + outerCount, _totalRowsRead);

        // Track the total row count (so we know which rows to ask for chosens each time)
        _totalRowsRead += outerCount;

        // Tell the remapper to filter to chosen rows
        _chosenRowsFilter.SetMatches((int[])chosenRows.Array, chosenRows.Count);

        // Pages with no chosen rows are skipped; keep reading until something was chosen
        if (chosenRows.Count > 0)
        {
            CurrentRowCount = chosenRows.Count;
            return CurrentRowCount;
        }
    }

    // Source exhausted: keep CurrentRowCount in step with the returned count
    // rather than leaving the previous page's count in place.
    CurrentRowCount = 0;
    return 0;
}
/// <summary>
///  Read the whole source, assign each row to a bucket by its key column values, feed the
///  bucket indices to every aggregator, and then publish the distinct keys and aggregate
///  results to the output columns.
/// </summary>
/// <param name="cancellationToken">Token passed through to the source</param>
private void BuildDictionary(CancellationToken cancellationToken)
{
    // A lone enum key column has a dedicated code path
    if (_keyColumns.Length == 1 && _keyColumns[0].IsEnumColumn())
    {
        BuildSingleEnumColumnDictionary(cancellationToken);
        return;
    }

    int keyColumnCount = _keyColumns.Length;

    // Request a getter for each key column before reading any rows
    Func<XArray>[] keyGetters = new Func<XArray>[keyColumnCount];
    for (int k = 0; k < keyColumnCount; ++k)
    {
        keyGetters[k] = _keyColumns[k].CurrentGetter();
    }

    XArray[] currentKeys = new XArray[keyColumnCount];
    while (_source.Next(XTableExtensions.DefaultBatchSize, cancellationToken) != 0)
    {
        // Fetch this page's values for every key column
        for (int k = 0; k < currentKeys.Length; ++k)
        {
            currentKeys[k] = keyGetters[k]();
        }

        // Find (or create) the bucket for each row
        XArray bucketForEachRow = _dictionary.FindOrAdd(currentKeys);

        // Let every aggregator fold this page into its buckets
        for (int a = 0; a < _aggregators.Length; ++a)
        {
            _aggregators[a].Add(bucketForEachRow, _dictionary.Count);
        }
    }

    // All rows seen; the distinct count is now final
    _distinctCount = _dictionary.Count;

    // Output columns: the key columns first, followed by one column per aggregator
    XArray[] distinctKeys = _dictionary.DistinctKeys();
    for (int k = 0; k < _keyColumns.Length; ++k)
    {
        _columns[k].SetValues(distinctKeys[k]);
    }

    for (int a = 0; a < _aggregators.Length; ++a)
    {
        _columns[_keyColumns.Length + a].SetValues(_aggregators[a].Values);
    }
}
/// <summary>
///  Pull batches from the pipeline until it is exhausted or the timeout elapses, and report
///  how many rows were seen, whether the pipeline finished, and the elapsed time.
///
///  NOTE: The pipeline is not disposed here; the caller owns it.
/// </summary>
/// <param name="pipeline">IXTable of source or query to run</param>
/// <param name="timeout">Time limit; TimeSpan.Zero and TimeSpan.MaxValue both mean no limit is scheduled</param>
/// <param name="batchSize">Number of rows to request in each iteration</param>
/// <returns>RunResult with the timeout, row count so far, completion flag, and elapsed time</returns>
public static RunResult RunUntilTimeout(this IXTable pipeline, TimeSpan timeout, int batchSize = DefaultBatchSize)
{
    using (CancellationTokenSource cancellation = new CancellationTokenSource())
    {
        CancellationToken token = cancellation.Token;

        // Only schedule a cancellation when a real limit was supplied
        bool noLimit = (timeout == TimeSpan.Zero || timeout == TimeSpan.MaxValue);
        if (!noLimit)
        {
            cancellation.CancelAfter(timeout);
        }

        RunResult result = new RunResult();
        result.Timeout = timeout;

        Stopwatch timer = Stopwatch.StartNew();

        bool finished = false;
        while (!finished)
        {
            int count = pipeline.Next(batchSize, token);
            result.RowCount += count;

            if (token.IsCancellationRequested)
            {
                // Timed out: stop without marking the run complete
                finished = true;
            }
            else if (count == 0)
            {
                // Ran out of rows before the timeout
                result.IsComplete = true;
                finished = true;
            }
        }

        result.Elapsed = timer.Elapsed;
        return result;
    }
}
/// <summary>
///  Count the rows in a source: seekable tables report their Count directly, concatenated
///  tables have each part counted in parallel and summed, and anything else is paged
///  through to the end.
/// </summary>
/// <param name="source">Table to count</param>
/// <param name="desiredCount">Requested page size; the default batch size is used if it is larger</param>
/// <param name="cancellationToken">Token passed through to the source</param>
/// <returns>Total row count found</returns>
private static long CountSource(IXTable source, int desiredCount, CancellationToken cancellationToken)
{
    // Seekable tables already know how many rows they have
    if (source is ISeekableXTable)
    {
        return ((ISeekableXTable)source).Count;
    }

    // Concatenated tables: count every part in parallel and add the results
    if (source is ConcatenatedTable)
    {
        List<IXTable> parts = ((ConcatenatedTable)source).Sources.ToList();
        long[] partCounts = new long[parts.Count];

        Parallel.For(0, parts.Count, (i) =>
        {
            partCounts[i] = CountSource(parts[i], desiredCount, cancellationToken);
        });

        return partCounts.Sum();
    }

    // Otherwise, page through every row and total the batch sizes
    int pageSize = Math.Max(desiredCount, XTableExtensions.DefaultBatchSize);
    long total = 0;

    int batchCount;
    while ((batchCount = source.Next(pageSize, cancellationToken)) != 0)
    {
        total += batchCount;
    }

    return total;
}
/// <summary>
///  Call Reset() on this instance, then fetch the next page from the underlying source
///  and remember its row count.
/// </summary>
/// <param name="desiredCount">Row count to request from the source</param>
/// <param name="cancellationToken">Token passed through to the source</param>
/// <returns>Count of rows in the page the source returned</returns>
public int SourceNext(int desiredCount, CancellationToken cancellationToken)
{
    // NOTE(review): presumably clears per-page state before the new page is read - confirm against Reset()
    Reset();

    // Get a page from the real source and record how many rows it holds
    int pageCount = _source.Next(desiredCount, cancellationToken);
    _currentPageCount = pageCount;

    return pageCount;
}
/// <summary>
///  Sample showing how to consume a query page by page with typed array access:
///  finds the largest ResponseBytes value among the rows returned by the query.
/// </summary>
public void Sample()
{
    SampleDatabase.EnsureBuilt();

    const int rowsPerPage = 100;
    string xqlQuery = @" read WebRequest select [ServerPort], Cast([ResponseBytes], Int32) where [ServerPort] = ""80"" limit 1000 ";
    int maxResponseBytes = -1;

    // Building the pipeline inside a using statement ensures it is disposed when done.
    using (IXTable pipeline = SampleDatabase.XDatabaseContext.Query(xqlQuery))
    {
        // Request (and cache) a getter for each column you will read.
        // Getters must be requested before the first call to Next().
        Func<XArray> responseBytesGetter = pipeline.Columns.Find("ResponseBytes").CurrentGetter();

        // Next() may return fewer rows than requested, but returns zero only once the input is exhausted.
        while (pipeline.Next(rowsPerPage) > 0)
        {
            // Fetch the current page of values for the column
            XArray page = responseBytesGetter();

            // The column type is known here, so cast to the typed array (no boxing, no interface calls)
            int[] values = (int[])page.Array;

            for (int row = 0; row < page.Count; ++row)
            {
                // Index() maps the logical row to its real position in the array
                // (whole array, array slice, or lookup indices).
                int responseBytes = values[page.Index(row)];

                // Custom logic goes here; this sample tracks the maximum
                maxResponseBytes = Math.Max(maxResponseBytes, responseBytes);
            }
        }
    }

    Assert.AreEqual(1335, maxResponseBytes);
}
/// <summary>
///  Get a single value from the source (the first column in the first row).
///  The pipeline is disposed before returning.
/// </summary>
/// <typeparam name="T">Data Type of result</typeparam>
/// <param name="pipeline">IXTable to run</param>
/// <param name="cancellationToken">Token passed through to the pipeline</param>
/// <returns>Single value result (first column, first row)</returns>
/// <exception cref="InvalidOperationException">The pipeline returned no rows</exception>
public static T Single<T>(this IXTable pipeline, CancellationToken cancellationToken = default(CancellationToken))
{
    using (pipeline)
    {
        // Request the getter inside the using block so the pipeline is still disposed if this throws
        Func<XArray> getter = pipeline.Columns[0].CurrentGetter();

        // There is no first row to read if the pipeline is empty; fail clearly instead of indexing an empty page
        if (pipeline.Next(1, cancellationToken) == 0)
        {
            throw new InvalidOperationException("Single requires a source with at least one row, but the source returned no rows.");
        }

        // Read the page once and use the same XArray for both the array and the index lookup
        XArray xarray = getter();
        T[] array = (T[])xarray.Array;
        return array[xarray.Index(0)];
    }
}
/// <summary>
///  Forward Next to the wrapped table while asserting pipeline contracts: a non-default
///  CancellationToken must be supplied, and the count returned by the inner Next must
///  equal the inner table's CurrentRowCount afterward.
/// </summary>
/// <param name="desiredCount">Row count to request from the inner table</param>
/// <param name="cancellationToken">Token to pass through; must not be default(CancellationToken)</param>
/// <returns>Count of rows the inner table returned</returns>
public int Next(int desiredCount, CancellationToken cancellationToken)
{
    // A default token means some stage in the pipeline dropped the caller's token
    if (cancellationToken == default(CancellationToken))
    {
        Assert.Fail("CancellationToken must be passed through the table pipeline.");
    }

    NextCalled = true;
    CurrentRowCount = _inner.Next(desiredCount, cancellationToken);

    // The value returned from Next and the CurrentRowCount property must agree
    Assert.AreEqual(
        CurrentRowCount,
        _inner.CurrentRowCount,
        $"Enumerator must return the same row count from Next {CurrentRowCount:n0} that it saves in CurrentRowCount {_inner.CurrentRowCount:n0}.");

    return CurrentRowCount;
}
/// <summary>
///  Pull every batch from the pipeline without disposing it, and return the total row count.
/// </summary>
/// <param name="pipeline">IXTable of source or query to run</param>
/// <param name="cancellationToken">Token passed through to the pipeline</param>
/// <param name="batchSize">Number of rows to request on each iteration</param>
/// <returns>Sum of the row counts returned by Next until it returned zero</returns>
public static long RunWithoutDispose(this IXTable pipeline, CancellationToken cancellationToken = default(CancellationToken), int batchSize = DefaultBatchSize)
{
    long totalRows = 0;

    // A zero-row batch marks the end of the pipeline
    int batchCount;
    while ((batchCount = pipeline.Next(batchSize, cancellationToken)) != 0)
    {
        totalRows += batchCount;
    }

    return totalRows;
}
/// <summary>
///  Read the next page from the source and write each row to the tabular writer.
///  The writer is created on the first call that finds none set: "cout" (case-insensitive)
///  selects a console writer, any other path is opened through the stream provider.
/// </summary>
/// <param name="desiredCount">Row count to request from the source</param>
/// <param name="cancellationToken">Token passed through to the source</param>
/// <returns>Count of rows read and written; zero when the source has no more rows</returns>
/// <exception cref="InvalidOperationException">No writer is set and no output file path is available to build one</exception>
public int Next(int desiredCount, CancellationToken cancellationToken)
{
    // Build the writer only when we start getting rows
    if (_writer == null)
    {
        if (_outputFilePath == null)
        {
            throw new InvalidOperationException("TabularFileWriter can't reset when passed an ITabularWriter instance");
        }

        bool writeToConsole = _outputFilePath.Equals("cout", StringComparison.OrdinalIgnoreCase);
        if (writeToConsole)
        {
            _writer = new ConsoleTabularWriter();
        }
        else
        {
            _writer = TabularFactory.BuildWriter(_streamProvider.OpenWrite(_outputFilePath), _outputFilePath);
        }

        // The header is the source column names, in source order
        _writer.SetColumns(_source.Columns.Select((col) => col.ColumnDetails.Name));
    }

    // Or smaller batch?
    int rowCount = _source.Next(desiredCount, cancellationToken);
    if (rowCount == 0)
    {
        return 0;
    }

    // Fetch the current page of every column as String8 values
    int columnCount = _stringColumnGetters.Length;
    XArray[] pageColumns = new XArray[columnCount];
    for (int c = 0; c < columnCount; ++c)
    {
        pageColumns[c] = _stringColumnGetters[c]();
    }

    // Emit the page row by row, one cell per column
    for (int row = 0; row < rowCount; ++row)
    {
        for (int c = 0; c < columnCount; ++c)
        {
            String8[] cells = (String8[])pageColumns[c].Array;
            String8 value = cells[pageColumns[c].Index(row)];
            _writer.Write(value);
        }

        _writer.NextRow();
    }

    return rowCount;
}
/// <summary>
///  Return the next page of joined rows. The join dictionary is built on the first call;
///  source pages are then read until one has at least one match, the left side is filtered
///  to the matching rows, and the right-side columns are pointed at the matched rows.
/// </summary>
/// <param name="desiredCount">Row count to request from the source for each page</param>
/// <param name="cancellationToken">Token passed through to the source</param>
/// <returns>Count of joined rows in the current page; zero when the source has no more rows</returns>
public int Next(int desiredCount, CancellationToken cancellationToken)
{
    // First call: build the lookup dictionary for the join
    if (_joinDictionary == null)
    {
        BuildJoinDictionary(cancellationToken);
    }

    BitVector matchedRows;
    do
    {
        // Pull the next page from the left side; stop when it has no more rows
        int sourceCount = _source.Next(desiredCount, cancellationToken);
        if (sourceCount == 0)
        {
            CurrentRowCount = 0;
            return 0;
        }

        // Look up this page's join values to find the matching rows and their right-side indices
        XArray joinFromValues = _joinFromColumnGetter();
        matchedRows = _joinDictionary.TryGetValues(joinFromValues, out _currentRightSideSelector);

        // Pages without a single match are skipped
    }
    while (_currentRightSideSelector.Count <= 0);

    // Inner join: keep only the left-side rows which matched
    _sourceJoinedRowsFilter.SetMatches(matchedRows);

    // Point each right-side column at the matched rows
    for (int c = 0; c < _rightSideColumns.Length; ++c)
    {
        _rightSideColumns[c].Set(_currentRightSideSelector);
    }

    CurrentRowCount = _currentRightSideSelector.Count;
    return _currentRightSideSelector.Count;
}
/// <summary>
///  Write the first page of a table to the Tracing system for debugging, then Reset the table.
/// </summary>
/// <param name="table">Table to trace</param>
/// <param name="rowCount">Row count to request for the traced page</param>
public static void TraceWrite(IXTable table, int rowCount = XTableExtensions.DefaultBatchSize)
{
    int columnCount = table.Columns.Count;

    // Getters are requested for every column before Next is called
    Func<XArray>[] getters = new Func<XArray>[columnCount];
    for (int c = 0; c < columnCount; ++c)
    {
        getters[c] = table.Columns[c].CurrentGetter();
    }

    table.Next(rowCount);

    // Capture the page of values for each column
    XArray[] pageColumns = new XArray[columnCount];
    for (int c = 0; c < columnCount; ++c)
    {
        pageColumns[c] = getters[c]();
    }

    ColumnDetails[] details = table.Columns.Select((col) => col.ColumnDetails).ToArray();
    TraceWrite(pageColumns, details);

    // Leave the table reset after tracing
    table.Reset();
}
/// <summary>
///  Get pages of Lists of values from a single column from the source.
///  Each page is returned in its own List, so pages stay valid if the caller keeps them
///  after asking for the next one. The pipeline is disposed when enumeration ends.
/// </summary>
/// <example>
///  foreach (List&lt;int&gt; page in XFormTable.FromArrays(10000).With("Values", array).ToList&lt;int&gt;("Values", batchSize: 1024))
///  {
///      ...
///  }
/// </example>
/// <typeparam name="T">Type of values in column</typeparam>
/// <param name="pipeline">IXTable to run</param>
/// <param name="columnName">Column Name to retrieve values from</param>
/// <param name="cancellationToken">Token passed through to the pipeline</param>
/// <param name="batchSize">Maximum row count to retrieve per page</param>
/// <returns>Pages of the result column in a List</returns>
public static IEnumerable<List<T>> ToList<T>(this IXTable pipeline, string columnName, CancellationToken cancellationToken = default(CancellationToken), int batchSize = DefaultBatchSize)
{
    using (pipeline)
    {
        Func<XArray> getter = pipeline.Columns.Find(columnName).CurrentGetter();

        while (pipeline.Next(batchSize, cancellationToken) != 0)
        {
            XArray xarray = getter();
            T[] array = (T[])xarray.Array;

            // A fresh list per page: reusing and clearing one shared list would empty
            // pages the caller had already received and held on to.
            List<T> page = new List<T>(xarray.Count);
            for (int i = 0; i < xarray.Count; ++i)
            {
                page.Add(array[xarray.Index(i)]);
            }

            yield return page;
        }
    }
}
/// <summary>
///  Return a single row holding the row count of the source. The first call computes the
///  count (directly for seekable sources, otherwise by paging through every row); each
///  later call returns zero rows while the stored count is anything other than -1.
/// </summary>
/// <param name="desiredCount">Requested page size; the default batch size is used if it is larger</param>
/// <param name="cancellationToken">Token passed through to the source</param>
/// <returns>One on the call that computes the count; zero afterward</returns>
public int Next(int desiredCount, CancellationToken cancellationToken)
{
    // -1 marks "not counted yet"; once counted there are no more rows to return
    bool alreadyCounted = (_count != -1);
    if (alreadyCounted)
    {
        CurrentRowCount = 0;
        return CurrentRowCount;
    }

    if (_source is ISeekableXTable)
    {
        // Seekable sources report their count directly
        _count = ((ISeekableXTable)_source).Count;
    }
    else
    {
        // Otherwise total the batch sizes over every page of the source
        int pageSize = Math.Max(desiredCount, XTableExtensions.DefaultBatchSize);
        _count = 0;

        int batchCount;
        while ((batchCount = _source.Next(pageSize, cancellationToken)) != 0)
        {
            _count += batchCount;
        }
    }

    // Publish the count through the constant column
    _countColumn[0].Set(_count);

    // Exactly one row (the count) is available
    CurrentRowCount = 1;
    return CurrentRowCount;
}
/// <summary>
///  Run a query without disposing the source, stopping when the pipeline is exhausted or
///  cancellation has been requested.
/// </summary>
/// <param name="pipeline">IXTable of source or query to run</param>
/// <param name="cancellationToken">Token to allow early cancellation</param>
/// <param name="batchSize">Number of rows to request on each iteration</param>
/// <returns>RunResult with the row count seen, whether the pipeline ran to completion, and the elapsed time</returns>
public static RunResult RunWithoutDispose(this IXTable pipeline, CancellationToken cancellationToken = default(CancellationToken), int batchSize = DefaultBatchSize)
{
    RunResult result = new RunResult();
    Stopwatch timer = Stopwatch.StartNew();

    for (; ; )
    {
        int batchCount = pipeline.Next(batchSize, cancellationToken);
        result.RowCount += batchCount;

        // Cancellation is checked first, so a cancelled run is never marked complete
        if (cancellationToken.IsCancellationRequested)
        {
            break;
        }

        if (batchCount == 0)
        {
            result.IsComplete = true;
            break;
        }
    }

    result.Elapsed = timer.Elapsed;
    return result;
}
/// <summary>
///  Row count requested per batch when the caller does not specify one.
/// </summary>
public const int DefaultBatchSize = 20480; // 10240;

#region Next Overloads
/// <summary>
///  Get the next batch of rows from the table, requesting DefaultBatchSize rows
///  and passing no cancellation token.
/// </summary>
/// <param name="table">IXTable to enumerate</param>
/// <returns>Count of rows returned; zero means no more rows</returns>
public static int Next(this IXTable table)
{
    int rowCount = table.Next(DefaultBatchSize, CancellationToken.None);
    return rowCount;
}
/// <summary>
///  Pass the request straight through to the wrapped source.
/// </summary>
/// <param name="desiredCount">Row count to request from the source</param>
/// <param name="cancellationToken">Token passed through to the source</param>
/// <returns>Count of rows the source returned</returns>
public virtual int Next(int desiredCount, CancellationToken cancellationToken)
{
    int rowCount = _source.Next(desiredCount, cancellationToken);
    return rowCount;
}
/// <summary>
///  Build a GroupBy Dictionary for Peek.
/// </summary>
/// <remarks>
///  Peek identifies each distinct common value and the approximate percentage of rows with it.
///  If we have many matching rows, we can sample - the sample will have any common values in it.
///  However, we don't know how many matches we have in advance.
///  Therefore, we build a Dictionary of all rows, 1/8 of rows, 1/64 of rows, and 1/512 of rows.
///  As soon as a given sample has enough samples to be statistically valid, we stop collecting the larger subsets.
///  This strategy allows us to run the overall query only once, end up with a large enough sample, and avoid building giant Dictionaries.
///
///  The method returns without setting any output if ShouldStopEarly reports true for the sample being retired.
/// </remarks>
/// <param name="cancellationToken">CancellationToken to request early stop</param>
private void BuildDictionary(CancellationToken cancellationToken)
{
    // Short-circuit path if the column is an EnumColumn
    if (_column.IsEnumColumn())
    {
        BuildSingleEnumColumnDictionary(cancellationToken);
        return;
    }

    // Build a Random instance to sample rows
    Random r = new Random();

    // Build a Dictionary and CountAggregator for each sample.
    // remapArrays holds one int[] slot per sample, passed by ref to Sampler.Eighth below.
    GroupByDictionary[] dictionaries = new GroupByDictionary[SampleCount];
    CountAggregator[] counts = new CountAggregator[SampleCount];
    int[][] remapArrays = new int[SampleCount][];
    for (int i = 0; i < SampleCount; ++i)
    {
        dictionaries[i] = new GroupByDictionary(new ColumnDetails[] { _column.ColumnDetails });
        counts[i] = new CountAggregator();
    }

    // Retrieve the column getter
    Func <XArray> columnGetter = _column.CurrentGetter();

    // Track which sample we'll currently report (index 0 is the one fed every row)
    int currentSample = 0;

    // Single-element array because FindOrAdd takes an array of key columns
    XArray[] arrays = new XArray[1];
    int count;
    while ((count = _source.Next(XTableExtensions.DefaultBatchSize, cancellationToken)) != 0)
    {
        // Get the column values
        arrays[0] = columnGetter();

        // Build the GroupBy count for all rows and successive 1/8 samples
        for (int i = 0; i < SampleCount; ++i)
        {
            // Samples before currentSample have been retired (set to null); only feed the rest
            if (i >= currentSample)
            {
                // Choose buckets for each row
                XArray indicesForRows = dictionaries[i].FindOrAdd(arrays);

                // Identify the bucket for each row and aggregate them
                counts[i].Add(indicesForRows, dictionaries[i].Count);

                // If this sample now has enough values, stop collecting bigger row sets.
                // Only the sample immediately after currentSample can retire it; because currentSample
                // is incremented here, the same check can fire again for the next sample in this pass.
                if (currentSample == i - 1 && counts[i].TotalRowCount > RequiredSampleSize)
                {
                    // If every row was unique, stop early and don't set outputs (zero rows)
                    if (ShouldStopEarly(dictionaries[currentSample], counts[currentSample]))
                    {
                        return;
                    }

                    // Release the larger sample and report from the next one
                    dictionaries[currentSample] = null;
                    counts[currentSample] = null;
                    currentSample++;
                }
            }

            // Each successive dictionary has ~1/8 of the rows of the previous one.
            // This runs even for retired samples, so later samples still see successively reduced rows.
            if (i < SampleCount - 1)
            {
                ArraySelector sample = Sampler.Eighth(arrays[0].Selector, r, ref remapArrays[i]);
                arrays[0] = arrays[0].Reselect(sample);
            }
        }
    }

    // Once the loop is done, get the distinct values and aggregation results.
    // The last argument is true only when the reported sample is the unsampled (all rows) one.
    PostSortAndFilter(dictionaries[currentSample].DistinctKeys()[0], counts[currentSample].Values, counts[currentSample].TotalRowCount, currentSample == 0);
}
/// <summary>
///  Assert that two tables contain the same rows for every column in <paramref name="expected"/>.
///  Both tables are Reset first, then paged with <paramref name="pageSize"/>; each side keeps its
///  own cursor into its current page, so the comparison still works when the two tables return
///  pages of different sizes. On a mismatch, the remaining rows of both pages are traced and the
///  test fails with a message describing the difference.
/// </summary>
/// <param name="expected">Table with the expected rows; its columns define what is compared</param>
/// <param name="actual">Table to verify; columns are found by the names of the expected columns</param>
/// <param name="pageSize">Row count to request from each table per page</param>
public static void AssertAreEqual(IXTable expected, IXTable actual, int pageSize)
{
    // Reset both tables (so they can be used for repeated scenarios)
    expected.Reset();
    actual.Reset();

    // Get the column getters for every expected column and the columns of the same names in actual
    // NOTE(review): actualGetters is sized by actual.Columns.Count but indexed by expected column index -
    // confirm that Find fails first when actual has fewer columns than expected.
    Func <XArray>[] expectedGetters = new Func <XArray> [expected.Columns.Count];
    Func <XArray>[] actualGetters = new Func <XArray> [actual.Columns.Count];
    for (int i = 0; i < expected.Columns.Count; ++i)
    {
        expectedGetters[i] = expected.Columns[i].CurrentGetter();
        actualGetters[i] = actual.Columns.Find(expected.Columns[i].ColumnDetails.Name).CurrentGetter();
    }

    // Loop over rows, comparing as many rows as available each time.
    // For each side: CurrentCount is the size of the page in hand, NextIndex is the first row in it not yet compared.
    int totalRowCount = 0;
    int expectedCurrentCount = 0, expectedNextIndex = 0;
    int actualCurrentCount = 0, actualNextIndex = 0;
    XArray[] expectedArrays = new XArray[expected.Columns.Count];
    XArray[] actualArrays = new XArray[expected.Columns.Count];

    while (true)
    {
        // Get new expected rows if we've compared all of the current ones already
        if (expectedNextIndex >= expectedCurrentCount)
        {
            expectedNextIndex = 0;
            expectedCurrentCount = expected.Next(pageSize);
            for (int i = 0; i < expected.Columns.Count; ++i)
            {
                expectedArrays[i] = expectedGetters[i]();
            }
        }

        // Get new actual rows if we've compared all of the current ones already
        if (actualNextIndex >= actualCurrentCount)
        {
            actualNextIndex = 0;
            actualCurrentCount = actual.Next(pageSize);
            for (int i = 0; i < expected.Columns.Count; ++i)
            {
                actualArrays[i] = actualGetters[i]();
            }
        }

        // If we're out of rows from both sides, stop
        if (expectedCurrentCount == 0 && actualCurrentCount == 0)
        {
            break;
        }

        // Figure out how many rows we can compare this time (the minimum available from both sides)
        int countToCompare = Math.Min(expectedCurrentCount - expectedNextIndex, actualCurrentCount - actualNextIndex);

        string errorMessage = "";
        int firstMismatchedRow = -1;

        // If we ran out of rows on one side before the other, fail
        if (countToCompare == 0)
        {
            errorMessage = $"Ran out of rows after {totalRowCount + expectedCurrentCount - expectedNextIndex:n0} Expected rows but {totalRowCount + actualCurrentCount - actualNextIndex:n0} Actual rows.";

            // NOTE(review): this is the larger remaining count, so the start index passed to TraceWrite below
            // lands at or past the end of the current page and the count passed is zero or negative -
            // confirm TraceWrite tolerates that and that this is the intended output.
            firstMismatchedRow = Math.Max(expectedCurrentCount - expectedNextIndex, actualCurrentCount - actualNextIndex);
        }
        else
        {
            // Get the current xarray for each column, slice to the set of rows to compare, and compare them.
            // Stops at the first column which reports an error.
            for (int i = 0; i < expected.Columns.Count; ++i)
            {
                XArray expectedArray = expectedArrays[i].Slice(expectedNextIndex, expectedNextIndex + countToCompare);
                XArray actualArray = actualArrays[i].Slice(actualNextIndex, actualNextIndex + countToCompare);

                firstMismatchedRow = FirstMismatchedRow(
                    expectedArray,
                    actualArray,
                    countToCompare,
                    expected.Columns[i].ColumnDetails.Name,
                    out errorMessage);

                if (!String.IsNullOrEmpty(errorMessage))
                {
                    break;
                }
            }
        }

        // If the table spans weren't equal, show the rows and error message.
        // Both traces use the expected table's column details, and start at the first mismatched row of each page.
        if (!String.IsNullOrEmpty(errorMessage))
        {
            Trace.WriteLine("Expected:");
            TraceWrite(expectedArrays, expected.Columns.Select((col) => col.ColumnDetails).ToArray(), expectedNextIndex + firstMismatchedRow, expectedCurrentCount - (expectedNextIndex + firstMismatchedRow));

            Trace.WriteLine("Actual:");
            TraceWrite(actualArrays, expected.Columns.Select((col) => col.ColumnDetails).ToArray(), actualNextIndex + firstMismatchedRow, actualCurrentCount - (actualNextIndex + firstMismatchedRow));

            Assert.Fail(errorMessage);
        }

        // Advance both cursors past the rows just compared
        expectedNextIndex += countToCompare;
        actualNextIndex += countToCompare;
        totalRowCount += countToCompare;
    }
}
/// <summary>
///  Get up to a specific number of rows from the table, passing no cancellation token.
/// </summary>
/// <param name="table">IXTable to enumerate</param>
/// <param name="desiredCount">Maximum number of rows to return</param>
/// <returns>Count of rows returned; zero means no more rows</returns>
public static int Next(this IXTable table, int desiredCount)
{
    int rowCount = table.Next(desiredCount, CancellationToken.None);
    return rowCount;
}