/// <summary>
/// Prepares this source to service <paramref name="request"/>: resets timing stopwatches,
/// sorts the extraction columns, ensures a query builder exists, records the runtime names
/// of all release identifier substitutions and wires up the optional validation / time
/// coverage helpers declared on the Catalogue.
/// </summary>
/// <param name="request">The dataset extraction command to initialize from.  The
/// <see cref="ExtractDatasetCommand.EmptyCommand"/> sentinel short-circuits all setup.</param>
protected virtual void Initialize(ExtractDatasetCommand request)
{
    Request = request;

    // The empty sentinel command carries nothing worth setting up.
    if (request == ExtractDatasetCommand.EmptyCommand)
        return;

    _timeSpentValidating = new Stopwatch();
    _timeSpentCalculatingDISTINCT = new Stopwatch();
    _timeSpentBuckettingDates = new Stopwatch();

    // Ensure columns are in the right order so we can record the release identifiers.
    Request.ColumnsToExtract.Sort();

    // Only build a query builder when the request does not already carry a cached one.
    if (request.QueryBuilder == null)
        request.GenerateQueryBuilder();

    // Remember the runtime column name of every release identifier substitution so
    // GetChunk can count distinct release identifiers as rows stream past.
    foreach (ReleaseIdentifierSubstitution substitution in Request.ReleaseIdentifierSubstitutions)
        _extractionIdentifiersidx.Add(substitution.GetRuntimeName());

    UniqueReleaseIdentifiersEncountered = new HashSet<object>();

    _catalogue = request.Catalogue;

    // Validation is opt-in: only catalogues that declare ValidatorXML get a validator.
    if (!string.IsNullOrWhiteSpace(_catalogue.ValidatorXML))
        ExtractionTimeValidator = new ExtractionTimeValidator(_catalogue, request.ColumnsToExtract);

    // Time coverage aggregation only applies when the catalogue declares a time periodicity
    // ExtractionInformation AND that column is among those the user selected to extract.
    bool timeCoverageColumnSelected =
        _catalogue.TimeCoverage_ExtractionInformation_ID != null
        && request.ColumnsToExtract.Cast<ExtractableColumn>()
            .Any(c => c.CatalogueExtractionInformation_ID == _catalogue.TimeCoverage_ExtractionInformation_ID);

    ExtractionTimeTimeCoverageAggregator = timeCoverageColumnSelected
        ? new ExtractionTimeTimeCoverageAggregator(_catalogue, request.ExtractableCohort)
        : null;
}
/// <summary>
/// Reads the next batch of rows from the dataset being extracted.  For global (non dataset)
/// requests returns a single empty <see cref="DataTable"/> then null.  For dataset requests,
/// lazily builds the hosted SQL source on first call, then per chunk: optionally applies
/// in-memory DISTINCT by release identifier, validates rows, buckets dates for time coverage
/// and records unique release identifiers seen.  Returns null when the data is exhausted.
/// </summary>
/// <param name="listener">Receives progress and notification events.</param>
/// <param name="cancellationToken">Checked after each read; cancellation aborts with an exception.</param>
/// <returns>The next chunk of rows, or null when there are no more.</returns>
public virtual DataTable GetChunk(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken)
{
    // We are in the Global Commands case: return an empty DataTable (not null) once
    // so we can trigger the destination to extract the globals docs and sql.
    if (GlobalsRequest != null)
    {
        GlobalsRequest.ElevateState(ExtractCommandState.WaitingForSQLServer);

        if (firstGlobalChunk)
        {
            //unless we are checking, start auditing
            StartAuditGlobals();

            firstGlobalChunk = false;
            return new DataTable(ExtractionDirectory.GLOBALS_DATA_NAME);
        }

        return null;
    }

    if (Request == null)
        throw new Exception("Component has not been initialized before being asked to GetChunk(s)");

    Request.ElevateState(ExtractCommandState.WaitingForSQLServer);

    if (_cancel)
        throw new Exception("User cancelled data extraction");

    // Lazily construct the underlying database reader on first call.
    if (_hostedSource == null)
    {
        StartAudit(Request.QueryBuilder.SQL);

        if (Request.DatasetBundle.DataSet.DisableExtraction)
            throw new Exception("Cannot extract " + Request.DatasetBundle.DataSet + " because DisableExtraction is set to true");

        _hostedSource = new DbDataCommandDataFlowSource(GetCommandSQL(listener),
            "ExecuteDatasetExtraction " + Request.DatasetBundle.DataSet,
            Request.GetDistinctLiveDatabaseServer().Builder,
            ExecutionTimeout);

        // If we are running in batches then always allow empty extractions
        _hostedSource.AllowEmptyResultSets = AllowEmptyExtractions || Request.IsBatchResume;
        _hostedSource.BatchSize = BatchSize;
    }

    DataTable chunk = null;

    try
    {
        chunk = _hostedSource.GetChunk(listener, cancellationToken);
        chunk = _peeker.AddPeekedRowsIfAny(chunk);

        // If we are trying to DISTINCT the records in memory based on release id we must
        // not split one release id's rows across two chunks, so peek ahead until the
        // release id changes before running MakeDistinct.
        if (DistinctStrategy == DistinctStrategy.OrderByAndDistinctInMemory)
        {
            var releaseIdentifierColumn = Request.ReleaseIdentifierSubstitutions.First().GetRuntimeName();

            if (chunk != null)
            {
                //last release id in the current chunk
                var lastReleaseId = chunk.Rows[chunk.Rows.Count - 1][releaseIdentifierColumn];

                _peeker.AddWhile(_hostedSource, r => Equals(r[releaseIdentifierColumn], lastReleaseId), chunk);
                chunk = MakeDistinct(chunk, listener, cancellationToken);
            }
        }
    }
    catch (AggregateException a)
    {
        // Task cancellation surfaces as an AggregateException; remember it so the next
        // call fails fast with "User cancelled data extraction".
        if (a.GetExceptionIfExists<TaskCanceledException>() != null)
            _cancel = true;

        throw;
    }
    catch (Exception e)
    {
        // NOTE(review): non-cancellation read failures are reported to the listener but
        // swallowed — chunk stays null and falls through to the "data exhausted" path
        // below, so a mid-stream failure can look like a clean end of data.  Confirm this
        // best-effort behaviour is intentional before changing it.
        listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Error, "Read from source failed", e));
    }

    if (cancellationToken.IsCancellationRequested)
        throw new Exception("Data read cancelled because our cancellationToken was set, aborting data reading");

    //if the first chunk is null
    if (firstChunk && chunk == null && !AllowEmptyExtractions)
        throw new Exception("There is no data to load, query returned no rows, query was:" + Environment.NewLine +
                            (_hostedSource.Sql ?? Request.QueryBuilder.SQL));

    //not the first chunk anymore
    firstChunk = false;

    //data exhausted
    if (chunk == null)
    {
        listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information,
            "Data exhausted after reading " + _rowsRead + " rows of data (" +
            UniqueReleaseIdentifiersEncountered.Count + " unique release identifiers seen)"));

        // Batch resumes only see part of the cohort so the distinct count would be wrong;
        // record -1 as the "not applicable" marker.
        if (Request != null)
            Request.CumulativeExtractionResults.DistinctReleaseIdentifiersEncountered =
                Request.IsBatchResume ? -1 : UniqueReleaseIdentifiersEncountered.Count;

        return null;
    }

    _rowsRead += chunk.Rows.Count;

    // Chunk will have datatypes for all the things in the buffer so we can populate our
    // dictionary of facts about what columns/catalogue items have spontaneously changed
    // name/type etc (only needs doing once, from the first non-null chunk).
    if (ExtractTimeTransformationsObserved == null)
        GenerateExtractionTransformObservations(chunk);

    // See if the SqlDataReader has a column with the same name as the ReleaseIdentifierSQL
    // (if so then we can use it to count the number of distinct subjects written out).
    bool includesReleaseIdentifier = _extractionIdentifiersidx.Count > 0;

    _timeSpentValidating.Start();
    // Build up the validation report (Missing/Wrong/Etc) - this has no mechanical effect on
    // the extracted data, just some metadata that goes into a flat file.
    if (ExtractionTimeValidator != null && Request.IncludeValidation)
    {
        try
        {
            chunk.Columns.Add(ValidationColumnName);
            ExtractionTimeValidator.Validate(chunk, ValidationColumnName);

            _rowsValidated += chunk.Rows.Count;
            listener.OnProgress(this, new ProgressEventArgs("Validation",
                new ProgressMeasurement(_rowsValidated, ProgressType.Records), _timeSpentValidating.Elapsed));
        }
        catch (Exception ex)
        {
            // Validation is best-effort: record the failure and stop validating further chunks.
            listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Error, "Could not validate data chunk", ex));
            ValidationFailureException = ex;
            ExtractionTimeValidator = null;
        }
    }
    _timeSpentValidating.Stop();

    _timeSpentBuckettingDates.Start();
    if (ExtractionTimeTimeCoverageAggregator != null)
    {
        _rowsBucketted += chunk.Rows.Count;

        foreach (DataRow row in chunk.Rows)
            ExtractionTimeTimeCoverageAggregator.ProcessRow(row);

        // BUGFIX: previously reported _timeSpentCalculatingDISTINCT.Elapsed here (copy-paste
        // error) - the bucketting progress event should report the bucketting stopwatch.
        listener.OnProgress(this, new ProgressEventArgs("Bucketting Dates",
            new ProgressMeasurement(_rowsBucketted, ProgressType.Records), _timeSpentBuckettingDates.Elapsed));
    }
    _timeSpentBuckettingDates.Stop();

    _timeSpentCalculatingDISTINCT.Start();
    //record unique release identifiers found
    if (includesReleaseIdentifier)
    {
        foreach (string idx in _extractionIdentifiersidx)
        {
            foreach (DataRow r in chunk.Rows)
            {
                if (r[idx] == DBNull.Value)
                {
                    // A single release identifier column must never be null; with multiple
                    // extraction identifiers it is fine if one or two are null.
                    if (_extractionIdentifiersidx.Count == 1)
                        throw new Exception("Null release identifier found in extract of dataset " + Request.DatasetBundle.DataSet);

                    continue;
                }

                // HashSet.Add is a no-op for duplicates; no Contains pre-check needed.
                UniqueReleaseIdentifiersEncountered.Add(r[idx]);
            }

            listener.OnProgress(this, new ProgressEventArgs("Calculating Distinct Release Identifiers",
                new ProgressMeasurement(UniqueReleaseIdentifiersEncountered.Count, ProgressType.Records),
                _timeSpentCalculatingDISTINCT.Elapsed));
        }
    }
    _timeSpentCalculatingDISTINCT.Stop();

    return chunk;
}
/// <summary>
/// Intentionally a no-op.  This method once rendered a "Dataset Timespan" chart (record count
/// and distinct patient count per date bucket) into a Word document via Office interop, but
/// that implementation was entirely commented out in this file and has now been removed.
/// Recover it from source control history if the charting feature is ever reinstated.
/// </summary>
/// <param name="toGraph">The time coverage buckets that would have been graphed.</param>
private void CreateTimespanGraph(ExtractionTimeTimeCoverageAggregator toGraph)
{
    // Deliberately empty: dead commented-out Word/Excel interop code deleted (see VCS history).
}