// Completes the sort and builds _output, the source downstream consumers read from.
//
// Threaded mode: calls _SortAndWriteThreaded() and then joins every outstanding
// worker thread. Non-threaded mode: sorts the remaining records in memory and
// keeps them as _sortedRecords instead of writing them to disk.
//
// Whenever at least one temp file exists, the temp files (plus the in-memory
// records, if any) are combined through a SortedRecordMerger. Previously the
// merger was skipped when there was exactly one temp file and no in-memory
// records, which left _output pointing at the empty/null _sortedRecords and
// lost the contents of that temp file.
private void _Finish() {
    if (_doThreading) {
        _SortAndWriteThreaded();

        // wait on all previous threads, sorts and writes
        foreach (Thread t in _threads) {
            t.Join();
        }
        _threads.Clear();
    }
    else {
        // don't write the last one to disk
        _recordsToSort.Sort();
        _sortedRecords = _recordsToSort;
    }

    bool haveInMemoryRecords = _sortedRecords != null && _sortedRecords.NumRecords != 0;

    if (_numTempFiles > 0) {
        // Anything on disk must be read back through the merger, even when it
        // is the only input.
        _merger = new SortedRecordMerger();

        if (haveInMemoryRecords) {
            _merger.AddInput(_sortedRecords);
        }

        for (int i = 0; i < _numTempFiles; i++) {
            InternalRecordSource source = new InternalRecordFileReader(GetTempFilename(i));
            _merger.AddInput(source);
        }

        _output = _merger;
    }
    else {
        // Nothing was spilled to disk: the in-memory records are the whole output.
        // NOTE(review): in threaded mode with no temp files _sortedRecords may be
        // null here - confirm whether that state is reachable.
        _output = _sortedRecords;
    }

    // set up reduction filter for reduction across merged sources
    if (_internalReductionEnabled) {
        ReduceFilter reducer = new ReduceFilter();
        RecordFilterDriver filterDriver = new RecordFilterDriver(reducer);
        filterDriver.AddInput(_output);
        _output = filterDriver;
    }

    // this is kind of a hack till i figure out how these should be set
    TotalRecordBytesEstimate = _output.TotalRecordBytesEstimate;
    TotalRecordsEstimate = _output.TotalRecordsEstimate;
}
// Writes the current chunk (_recordsToWrite) to the next temp file, creating
// the temp directory first if it does not exist. When internal reduction is
// enabled the chunk is routed through a ReduceFilter on its way to disk, so
// this is where reduction happens if it happens.
private void _WriteSortedRecords() {
    bool tempDirMissing = !Directory.Exists(_tempDir);
    if (tempDirMissing) {
        Directory.CreateDirectory(_tempDir);
    }

    InternalRecordSource chunk = _recordsToWrite;

    // The whole reason we have a sorterReducer and not just a sorter is so
    // that we can reduce before the temp files are written to disk.
    if (_internalReductionEnabled) {
        RecordFilterDriver reducingDriver = new RecordFilterDriver(new ReduceFilter());
        reducingDriver.AddInput(chunk);
        chunk = reducingDriver;
    }

    // The temp file is named after the current count; the count is bumped
    // once the writer has been constructed.
    RecordFileWriter chunkWriter = new RecordFileWriter(GetTempFilename(_numTempFiles));
    _numTempFiles += 1;

    try {
        chunkWriter.Write(chunk);
    }
    finally {
        // Close the chunk being written whether or not the write succeeded.
        _recordsToWrite.Close();
    }
}
/// <summary>
/// Converts a record source into a source of DataRecords that describe the
/// input source itself. The input's own records are ignored completely;
/// information such as source name, estimated size and sorted-ness flows
/// from the source.
/// </summary>
/// <param name="input">Input record source.</param>
/// <returns>Output record source carrying the statistics records.</returns>
public RecordSource GetStatistics(RecordSource input) {
    InternalRecordSource inner = input.InternalSource;

    // Implemented like a filter: the pseudo-filter is given the source it
    // reports on, and the driver is fed from that same source.
    StatisticsPseudoFilter statisticsFilter = new StatisticsPseudoFilter(inner);
    RecordFilterDriver driver = new RecordFilterDriver(statisticsFilter);
    driver.AddInput(inner);

    RecordSource result = new RecordSource(this);
    result.InternalSource = driver;
    return result;
}
/// <summary>
/// Filters a RecordSource using a user provided filter.
/// </summary>
/// <param name="input">The input record source.</param>
/// <param name="filter">The user provided record filter.</param>
/// <returns>A RecordSource for further processing, fed by the filter.</returns>
public RecordSource Filter(RecordSource input, IRecordFilter filter) {
    // A filter that implements ISetRecordSource is handed the input
    // RecordSource so the filter itself has access to it.
    ISetRecordSource sourceAware = filter as ISetRecordSource;
    if (sourceAware != null) {
        sourceAware.Source = input;
    }

    RecordFilterDriver driver = new RecordFilterDriver(filter);
    driver.AddInput(input.InternalSource);

    RecordSource filtered = new RecordSource(this);
    filtered.InternalSource = driver;
    return filtered;
}
// Inserts whatever is needed after the current InternalSource to get output
// in the requested order, optionally reduced:
//   - order already matches and no reduction requested: nothing is inserted;
//   - order differs: a RecordSorterReducer is inserted (it also reduces when
//     reductionEnabled is true);
//   - order matches but reduction is requested: only a ReduceFilter is inserted.
//
// Aside: reduction is combined into the sorter, rather than kept as a separate
// stage, so that it can happen in memory before temp files are written to disk.
private void _SortReduce(bool sortAscending, bool reductionEnabled) {
    // Encode orderings as numbers for easy comparison:
    // no sort = 0, sort ascending = 1, sort descending = 2.
    int requestedOrder = sortAscending ? 1 : 2;

    bool alreadySorted = InternalSource.Sorting.IsSorted;
    bool alreadyAscending = InternalSource.Sorting.IsSortedAscending;

    int currentOrder;
    if (!alreadySorted) {
        currentOrder = 0;
    }
    else if (alreadyAscending) {
        currentOrder = 1;
    }
    else {
        currentOrder = 2;
    }

    if (requestedOrder == currentOrder) {
        if (!reductionEnabled) {
            return; // don't insert a sorter or reducer
        }

        // Input is already in the requested order: reduce only.
        IRecordFilter reducer = new ReduceFilter();
        RecordFilterDriver reducingDriver = new RecordFilterDriver(reducer);
        reducingDriver.AddInput(InternalSource);
        InternalSource = reducingDriver;
        return;
    }

    // Order differs: sort (and reduce, if enabled) with a sorterReducer
    // configured from the processor's settings.
    RecordSorterReducer sorterReducer = new RecordSorterReducer(Processor.TempDir, sortAscending, reductionEnabled);
    sorterReducer.MaxMemorySize = Processor.MaxMemorySize;
    sorterReducer.DeleteTempFiles = Processor.DeleteTempFiles;

    InternalRecordSource upstream = InternalSource; // grab our input
    sorterReducer.AddInput(upstream);               // pipe it into the sorterReducer
    this.InternalSource = sorterReducer;            // the sorterReducer is now this source's output
}
/// <summary>
/// Chooses random records from source.
/// </summary>
/// <remarks>
/// The output's record estimate is set to <paramref name="numToKeep"/>. The
/// byte estimate is the input's byte estimate scaled by the fraction of
/// records kept, with that fraction limited to the range [0, 1]. If the
/// input's record estimate is zero or negative the fraction cannot be
/// computed, and the input's byte estimate is used unchanged.
/// If the current source is a LoggingSource, the filter is inserted between
/// the logging source and its first input, and the logging source stays the
/// outermost source.
/// </remarks>
/// <param name="numToKeep">number of random records to pass through from source.</param>
/// <param name="seed">a seed to the random number generator.</param>
public void Random(int numToKeep, int seed) {
    RandomFilter filter = new RandomFilter(numToKeep, seed);
    RecordFilterDriver filterDriver = new RecordFilterDriver(filter);

    // Start from the input's byte estimate and scale it down only when a
    // meaningful fraction exists. Dividing by a zero record estimate would
    // give Infinity/NaN, and casting an out-of-range double to long is
    // unspecified, so both cases are guarded.
    long bytesEstimate = (long)InternalSource.TotalRecordBytesEstimate;
    double inputRecords = (double)InternalSource.TotalRecordsEstimate;
    if (inputRecords > 0.0) {
        double fractionKept = (double)numToKeep / inputRecords;
        if (fractionKept < 0.0) {
            fractionKept = 0.0;
        }
        if (fractionKept < 1.0) {
            bytesEstimate = (long)(fractionKept * bytesEstimate);
        }
    }

    if (InternalSource is LoggingSource) {
        // Feed the filter from the logging source's first input, then make
        // the filter the logging source's only input.
        filterDriver.AddInput(InternalSource.Inputs[0]);
        filterDriver.TotalRecordsEstimate = (long)numToKeep; // unfortunately must be tweaked from outside after AddInput
        filterDriver.TotalRecordBytesEstimate = bytesEstimate;
        InternalSource.ClearInputs();
        InternalSource.AddInput(filterDriver);
    }
    else {
        filterDriver.AddInput(InternalSource);
        filterDriver.TotalRecordsEstimate = (long)numToKeep; // unfortunately must be tweaked from outside after AddInput
        filterDriver.TotalRecordBytesEstimate = bytesEstimate;
        InternalSource = filterDriver;
    }
}
/// <summary>
/// Truncates the number of records coming from the record source.
/// </summary>
/// <remarks>
/// The output's record estimate is set to <paramref name="recordLimit"/>. The
/// byte estimate is the input's byte estimate scaled by the fraction of
/// records kept, with that fraction limited to the range [0, 1]. A limit
/// larger than the input's record estimate (for example a very large
/// "no limit" value) therefore leaves the byte estimate equal to the input's.
/// If the input's record estimate is zero or negative the fraction cannot be
/// computed, and the input's byte estimate is used unchanged.
/// If the current source is a LoggingSource, the filter is inserted between
/// the logging source and its first input, and the logging source stays the
/// outermost source.
/// </remarks>
/// <param name="recordLimit">Number of records to limit source to.</param>
public void Limit(long recordLimit) {
    LimitFilter filter = new LimitFilter(recordLimit);
    RecordFilterDriver filterDriver = new RecordFilterDriver(filter);

    // Start from the input's byte estimate and scale it down only when a
    // meaningful fraction exists. Dividing by a zero record estimate would
    // give Infinity/NaN, and a huge limit would push the product outside the
    // range of long; casting such a double to long is unspecified, so both
    // cases are guarded.
    long bytesEstimate = (long)InternalSource.TotalRecordBytesEstimate;
    double inputRecords = (double)InternalSource.TotalRecordsEstimate;
    if (inputRecords > 0.0) {
        double fractionKept = (double)recordLimit / inputRecords;
        if (fractionKept < 0.0) {
            fractionKept = 0.0;
        }
        if (fractionKept < 1.0) {
            bytesEstimate = (long)(fractionKept * bytesEstimate);
        }
    }

    if (InternalSource is LoggingSource) {
        // Feed the filter from the logging source's first input, then make
        // the filter the logging source's only input.
        filterDriver.AddInput(InternalSource.Inputs[0]);
        filterDriver.TotalRecordsEstimate = recordLimit; // unfortunately must be tweaked from outside after AddInput
        filterDriver.TotalRecordBytesEstimate = bytesEstimate;
        InternalSource.ClearInputs();
        InternalSource.AddInput(filterDriver);
    }
    else {
        filterDriver.AddInput(InternalSource);
        filterDriver.TotalRecordsEstimate = recordLimit; // unfortunately must be tweaked from outside after AddInput
        filterDriver.TotalRecordBytesEstimate = bytesEstimate;
        InternalSource = filterDriver;
    }
}