Example #1
0
        /// <summary>
        /// Prepares this source to service <paramref name="request"/>: resets timing stopwatches,
        /// ensures a query builder exists, records release identifier column names and wires up the
        /// optional validator / time coverage aggregator. Does nothing for the empty placeholder command.
        /// </summary>
        /// <param name="request">The dataset extraction command this source will execute.</param>
        protected virtual void Initialize(ExtractDatasetCommand request)
        {
            Request = request;

            //the empty placeholder command carries no dataset - nothing to set up
            if (request == ExtractDatasetCommand.EmptyCommand)
            {
                return;
            }

            _timeSpentValidating          = new Stopwatch();
            _timeSpentCalculatingDISTINCT = new Stopwatch();
            _timeSpentBuckettingDates     = new Stopwatch();

            //sort so the release identifiers are recorded in a deterministic order
            Request.ColumnsToExtract.Sort();

            //build the extraction query unless the request already carries a cached builder
            if (request.QueryBuilder == null)
            {
                request.GenerateQueryBuilder();
            }

            //remember the runtime name of every release identifier column
            foreach (ReleaseIdentifierSubstitution sub in Request.ReleaseIdentifierSubstitutions)
            {
                _extractionIdentifiersidx.Add(sub.GetRuntimeName());
            }

            UniqueReleaseIdentifiersEncountered = new HashSet<object>();

            _catalogue = request.Catalogue;

            //only create a validator when the catalogue declares validation rules
            if (!string.IsNullOrWhiteSpace(_catalogue.ValidatorXML))
            {
                ExtractionTimeValidator = new ExtractionTimeValidator(_catalogue, request.ColumnsToExtract);
            }

            //a time coverage aggregator only makes sense when the catalogue declares a time periodicity
            //column AND the user actually chose to extract that column
            var timeCoverageId = _catalogue.TimeCoverage_ExtractionInformation_ID;
            bool timeColumnExtracted = timeCoverageId != null &&
                                       request.ColumnsToExtract.Cast<ExtractableColumn>()
                                              .Any(c => c.CatalogueExtractionInformation_ID == timeCoverageId);

            ExtractionTimeTimeCoverageAggregator = timeColumnExtracted
                ? new ExtractionTimeTimeCoverageAggregator(_catalogue, request.ExtractableCohort)
                : null;
        }
Example #2
0
        /// <summary>
        /// Returns the next batch of extracted rows. For a globals request the first call returns an
        /// empty globals <see cref="DataTable"/> and subsequent calls return null. For a dataset request,
        /// lazily creates the hosted database source, then per chunk: optionally applies in-memory
        /// DISTINCT, validates rows, buckets dates and records unique release identifiers.
        /// Returns null once the data is exhausted.
        /// </summary>
        /// <param name="listener">Receives progress and error notifications raised while reading.</param>
        /// <param name="cancellationToken">Checked after each read; cancellation aborts with an Exception.</param>
        /// <returns>The next chunk of data, or null when there are no more rows.</returns>
        /// <exception cref="Exception">Thrown if the component was never initialized, the user cancelled,
        /// the dataset has DisableExtraction set, a sole release identifier is null, or the first chunk is
        /// empty while AllowEmptyExtractions is false.</exception>
        public virtual DataTable GetChunk(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken)
        {
            // we are in the Global Commands case, let's return an empty DataTable (not null)
            // so we can trigger the destination to extract the globals docs and sql
            if (GlobalsRequest != null)
            {
                GlobalsRequest.ElevateState(ExtractCommandState.WaitingForSQLServer);
                if (firstGlobalChunk)
                {
                    //unless we are checking, start auditing
                    StartAuditGlobals();

                    firstGlobalChunk = false;
                    return new DataTable(ExtractionDirectory.GLOBALS_DATA_NAME);
                }

                return null;
            }

            if (Request == null)
            {
                throw new Exception("Component has not been initialized before being asked to GetChunk(s)");
            }

            Request.ElevateState(ExtractCommandState.WaitingForSQLServer);

            if (_cancel)
            {
                throw new Exception("User cancelled data extraction");
            }

            //lazily create the hosted source on the first chunk request
            if (_hostedSource == null)
            {
                StartAudit(Request.QueryBuilder.SQL);

                if (Request.DatasetBundle.DataSet.DisableExtraction)
                {
                    throw new Exception("Cannot extract " + Request.DatasetBundle.DataSet + " because DisableExtraction is set to true");
                }

                _hostedSource = new DbDataCommandDataFlowSource(GetCommandSQL(listener),
                                                                "ExecuteDatasetExtraction " + Request.DatasetBundle.DataSet,
                                                                Request.GetDistinctLiveDatabaseServer().Builder,
                                                                ExecutionTimeout);

                // If we are running in batches then always allow empty extractions
                _hostedSource.AllowEmptyResultSets = AllowEmptyExtractions || Request.IsBatchResume;
                _hostedSource.BatchSize            = BatchSize;
            }

            DataTable chunk = null;

            try
            {
                chunk = _hostedSource.GetChunk(listener, cancellationToken);

                chunk = _peeker.AddPeekedRowsIfAny(chunk);

                //if we are trying to distinct the records in memory based on release id
                if (DistinctStrategy == DistinctStrategy.OrderByAndDistinctInMemory)
                {
                    var releaseIdentifierColumn = Request.ReleaseIdentifierSubstitutions.First().GetRuntimeName();

                    if (chunk != null)
                    {
                        //last release id in the current chunk
                        var lastReleaseId = chunk.Rows[chunk.Rows.Count - 1][releaseIdentifierColumn];

                        //pull any further rows sharing that release id into this chunk so MakeDistinct sees them together
                        _peeker.AddWhile(_hostedSource, r => Equals(r[releaseIdentifierColumn], lastReleaseId), chunk);
                        chunk = MakeDistinct(chunk, listener, cancellationToken);
                    }
                }
            }
            catch (AggregateException a)
            {
                if (a.GetExceptionIfExists <TaskCanceledException>() != null)
                {
                    _cancel = true;
                }

                throw;
            }
            catch (Exception e)
            {
                // NOTE(review): read failures are reported to the listener but deliberately not rethrown,
                // leaving chunk null so the method proceeds as if data were exhausted - confirm intended
                listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Error, "Read from source failed", e));
            }

            if (cancellationToken.IsCancellationRequested)
            {
                throw new Exception("Data read cancelled because our cancellationToken was set, aborting data reading");
            }

            //if the first chunk is null
            if (firstChunk && chunk == null && !AllowEmptyExtractions)
            {
                throw new Exception("There is no data to load, query returned no rows, query was:" + Environment.NewLine +
                                    (_hostedSource.Sql ?? Request.QueryBuilder.SQL));
            }

            //not the first chunk anymore
            firstChunk = false;

            //data exhausted
            if (chunk == null)
            {
                listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information, "Data exhausted after reading " + _rowsRead + " rows of data (" + UniqueReleaseIdentifiersEncountered.Count + " unique release identifiers seen)"));
                if (Request != null)
                {
                    Request.CumulativeExtractionResults.DistinctReleaseIdentifiersEncountered = Request.IsBatchResume ? -1 : UniqueReleaseIdentifiersEncountered.Count;
                }
                return null;
            }

            _rowsRead += chunk.Rows.Count;

            //chunk will have datatypes for all the things in the buffer so we can populate our dictionary of facts about what columns/catalogue items have spontaneously changed name/type etc
            if (ExtractTimeTransformationsObserved == null)
            {
                GenerateExtractionTransformObservations(chunk);
            }

            //see if the SqlDataReader has a column with the same name as the ReleaseIdentifierSQL (if so then we can use it to count the number of distinct subjects written out to the csv)
            bool includesReleaseIdentifier = _extractionIdentifiersidx.Count > 0;

            _timeSpentValidating.Start();
            //build up the validation report (Missing/Wrong/Etc) - this has no mechanical effect on the extracted data just some metadata that goes into a flat file
            if (ExtractionTimeValidator != null && Request.IncludeValidation)
            {
                try
                {
                    chunk.Columns.Add(ValidationColumnName);

                    ExtractionTimeValidator.Validate(chunk, ValidationColumnName);

                    _rowsValidated += chunk.Rows.Count;
                    listener.OnProgress(this, new ProgressEventArgs("Validation", new ProgressMeasurement(_rowsValidated, ProgressType.Records), _timeSpentValidating.Elapsed));
                }
                catch (Exception ex)
                {
                    //validation is best effort - record the failure and disable it for later chunks rather than failing the extraction
                    listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Error, "Could not validate data chunk", ex));
                    ValidationFailureException = ex;
                    ExtractionTimeValidator    = null;
                }
            }
            _timeSpentValidating.Stop();

            _timeSpentBuckettingDates.Start();
            if (ExtractionTimeTimeCoverageAggregator != null)
            {
                _rowsBucketted += chunk.Rows.Count;

                foreach (DataRow row in chunk.Rows)
                {
                    ExtractionTimeTimeCoverageAggregator.ProcessRow(row);
                }

                //BUGFIX: this progress event previously reported _timeSpentCalculatingDISTINCT.Elapsed
                listener.OnProgress(this, new ProgressEventArgs("Bucketting Dates", new ProgressMeasurement(_rowsBucketted, ProgressType.Records), _timeSpentBuckettingDates.Elapsed));
            }
            _timeSpentBuckettingDates.Stop();

            _timeSpentCalculatingDISTINCT.Start();
            //record unique release identifiers found
            if (includesReleaseIdentifier)
            {
                foreach (string idx in _extractionIdentifiersidx)
                {
                    foreach (DataRow r in chunk.Rows)
                    {
                        if (r[idx] == DBNull.Value)
                        {
                            if (_extractionIdentifiersidx.Count == 1)
                            {
                                throw new Exception("Null release identifier found in extract of dataset " + Request.DatasetBundle.DataSet);
                            }

                            continue; //there are multiple extraction identifiers thats fine if one or two are null
                        }

                        //HashSet<object>.Add is a no-op for duplicates so no Contains pre-check is needed
                        UniqueReleaseIdentifiersEncountered.Add(r[idx]);
                    }

                    listener.OnProgress(this, new ProgressEventArgs("Calculating Distinct Release Identifiers", new ProgressMeasurement(UniqueReleaseIdentifiersEncountered.Count, ProgressType.Records), _timeSpentCalculatingDISTINCT.Elapsed));
                }
            }
            _timeSpentCalculatingDISTINCT.Stop();

            return chunk;
        }
Example #3
0
 // NOTE(review): dead method - the entire body is commented-out Microsoft Word/Excel interop code
 // (chart generation for the dataset timespan report). As written this method does nothing with
 // <paramref name="toGraph"/> and has no effect. Consider deleting the commented-out block (it is
 // preserved in version control) or reinstating it behind a supported reporting path - TODO confirm.
 private void CreateTimespanGraph(ExtractionTimeTimeCoverageAggregator toGraph)
 {/*
   * Chart wdChart = wrdDoc.InlineShapes.AddChart(Microsoft.Office.Core.XlChartType.xl3DColumn, ref oMissing).Chart;
   * ChartData wdChartData = wdChart.ChartData;
   * Workbook dataWorkbook = (Workbook)wdChartData.Workbook;
   * dataWorkbook.Application.Visible = DEBUG_WORD;
   * Worksheet dataSheet = (Worksheet)dataWorkbook.Worksheets[1];
   *
   * //set title before putting any data in
   * wdChart.ApplyLayout(1);
   * wdChart.ChartTitle.Text = "Dataset Timespan";
   * wdChart.ChartTitle.Font.Italic = true;
   * wdChart.ChartTitle.Font.Size = 18;
   * wdChart.ChartTitle.Font.Color = Color.Black.ToArgb();
   *
   * //fill in sheet data
   * dataSheet.Cells.Range["A1", oMissing].FormulaR1C1 = "Date";
   * dataSheet.Cells.Range["B1", oMissing].FormulaR1C1 = "Record Count";
   * dataSheet.Cells.Range["C1", oMissing].FormulaR1C1 = "Distinct Patient Count";
   *
   * //get rid of default microsoft crap
   * dataSheet.Cells.Range["D1", oMissing].FormulaR1C1 = "";
   * dataSheet.Cells.Range["D2", oMissing].FormulaR1C1 = "";
   * dataSheet.Cells.Range["D3", oMissing].FormulaR1C1 = "";
   * dataSheet.Cells.Range["D4", oMissing].FormulaR1C1 = "";
   * dataSheet.Cells.Range["D5", oMissing].FormulaR1C1 = "";
   *
   * //get first of the month
   * int futureRecordsCount = 0;
   * int recordsPriorToDatasetStartCount = 0;
   *
   * int currentRow = 2;
   *
   * var cata = Executer.Source.Request.Catalogue;
   * DateTime startDateCutoff;
   *
   * if (cata.DatasetStartDate == null)
   *     startDateCutoff = DateTime.MinValue;
   * else
   *     startDateCutoff = (DateTime)cata.DatasetStartDate;
   *
   * foreach (DateTime key in Executer.Source.ExtractionTimeTimeCoverageAggregator.Buckets.Keys.OrderBy(d => d))
   * {
   *     if (key < startDateCutoff)
   *     {
   *         recordsPriorToDatasetStartCount += Executer.Source.ExtractionTimeTimeCoverageAggregator.Buckets[key].CountOfTimesSeen;
   *         continue;
   *     }
   *     if (key > DateTime.Now)
   *     {
   *         futureRecordsCount += Executer.Source.ExtractionTimeTimeCoverageAggregator.Buckets[key].CountOfTimesSeen;
   *         continue;
   *     }
   *
   *     dataSheet.Cells.Range["A" + currentRow, oMissing].FormulaR1C1 = key.ToString("yyyy-MM-dd");
   *     ExtractionTimeTimeCoverageAggregatorBucket currentBucket = Executer.Source.ExtractionTimeTimeCoverageAggregator.Buckets[key];
   *     dataSheet.Cells.Range["B" + currentRow, oMissing].FormulaR1C1 = currentBucket.CountOfTimesSeen;
   *     dataSheet.Cells.Range["C" + currentRow, oMissing].FormulaR1C1 = currentBucket.CountOfDistinctIdentifiers;
   *
   *     currentRow++;
   * }
   *
   * //set max size of sheet
   * Microsoft.Office.Interop.Excel.Range tRange = dataSheet.Cells.get_Range("A1", "C" + (currentRow-1));
   *
   * ListObject tbl1 = dataSheet.ListObjects["Table1"];
   * tbl1.Resize(tRange);
   *
   * wdChart.ApplyDataLabels(XlDataLabelsType.xlDataLabelsShowNone, oMissing, oMissing, oMissing, oMissing, oMissing, oMissing, oMissing, oMissing, oMissing);
   * wdChart.ChartType = XlChartType.xlLine;
   * Microsoft.Office.Interop.Word.Axis xAxis = (Microsoft.Office.Interop.Word.Axis)wdChart.Axes(Microsoft.Office.Interop.Word.XlAxisType.xlCategory, XlAxisGroup.xlPrimary);
   * xAxis.BaseUnit = Microsoft.Office.Interop.Word.XlTimeUnit.xlMonths;
   *
   *
   * wdChart.Refresh();
   *
   *  if (Executer.Destination.TableLoadInfo.Inserts == 0)
   *     return;
   *
   * float futureRecordsCountAsFraction = ((float)futureRecordsCount) / ((float)Executer.Destination.TableLoadInfo.Inserts) * 100.0f;
   * float nullCountAsFraction = ((float)toGraph.countOfNullsSeen) / ((float)Executer.Destination.TableLoadInfo.Inserts) * 100.0f;
   * float brokenDatesAsFraction = ((float)toGraph.countOfBrokenDates) / ((float)Executer.Destination.TableLoadInfo.Inserts) * 100.0f;
   * float recordsPriorToDatasetStartCountAsFraction = ((float)recordsPriorToDatasetStartCount) / ((float)Executer.Destination.TableLoadInfo.Inserts) * 100.0f;
   *
   * wordHelper.WriteLine("Nulls:" + toGraph.countOfNullsSeen + "(" + nullCountAsFraction + "%)", WdBuiltinStyle.wdStyleNormal);
   * wordHelper.WriteLine("Invalid Date formats:" + toGraph.countOfBrokenDates + "(" + brokenDatesAsFraction + "%)", WdBuiltinStyle.wdStyleNormal);
   * wordHelper.WriteLine("Future Dates:" + futureRecordsCount +"("+ futureRecordsCountAsFraction + "%)",WdBuiltinStyle.wdStyleNormal);
   * wordHelper.WriteLine("Timespan Field: " + cata.TimeCoverage_ExtractionInformation.GetRuntimeName(), WdBuiltinStyle.wdStyleNormal);
   *
   * if (cata.DatasetStartDate != null)
   *     wordHelper.WriteLine("Dates before dataset start date(" + cata.DatasetStartDate.Value.ToString(_destination.DateFormat) + "):" + recordsPriorToDatasetStartCount + "(" + recordsPriorToDatasetStartCountAsFraction + "%)", WdBuiltinStyle.wdStyleNormal);
   *
   * dataWorkbook.Application.Quit();
   */
 }