Code example #1
File: RemoteTableAttacher.cs Project: HDRUK/RDMP
        public override ExitCodeType Attach(IDataLoadJob job, GracefulCancellationToken cancellationToken)
        {
            if (job == null)
                throw new Exception("Job is Null, we require to know the job to build a DataFlowPipeline");
      
            ThrowIfInvalidRemoteTableName();

            var syntax = _remoteDatabase.Server.GetQuerySyntaxHelper();

            string sql;

            if (!string.IsNullOrWhiteSpace(RemoteSelectSQL))
                sql = RemoteSelectSQL;
            else
                sql = "Select * from " + syntax.EnsureWrapped(RemoteTableName);
            
            bool scheduleMismatch = false;

            //if there is a load progress 
            if (Progress != null)
                try
                {
                    //get appropriate date declaration SQL if any
                    sql = GetScheduleParameterDeclarations(job, out scheduleMismatch) + sql;
                }
                catch (Exception e)
                {
                    //if the date range is in the future then GetScheduleParameterDeclarations will throw Exception about future dates
                    if(e.Message.StartsWith(FutureLoadMessage))
                        return ExitCodeType.OperationNotRequired;//if this is the case then don't bother with the data load

                    throw;
                }
            if (scheduleMismatch)
            {
                job.OnNotify(this, new NotifyEventArgs(ProgressEventType.Warning, "Skipping LoadProgress '" + Progress + "' because it is not the correct Schedule for this component"));
                return ExitCodeType.Success;
            }

            job.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information, "About to execute SQL:" + Environment.NewLine + sql));

            var source = new DbDataCommandDataFlowSource(sql, "Fetch data from " + _remoteDatabase.Server + " to populate RAW table " + RAWTableName, _remoteDatabase.Server.Builder, Timeout == 0 ? 50000 : Timeout);

            //For Oracle / Postgres we have to add the parameters to the DbCommand directly
            if (_minDateParam.HasValue && _maxDateParam.HasValue && !syntax.SupportsEmbeddedParameters())
            {
                source.CommandAdjuster = (cmd) =>
                {
                    var pmin = cmd.CreateParameter();
                    pmin.Value = _minDateParam.Value;
                    pmin.ParameterName = StartDateParameter;
                    cmd.Parameters.Add(pmin);

                    var pmax = cmd.CreateParameter();
                    pmax.Value = _maxDateParam.Value;
                    pmax.ParameterName = EndDateParameter;
                    cmd.Parameters.Add(pmax);
                };
            }
                
            var destination = new SqlBulkInsertDestination(_dbInfo, RAWTableName, Enumerable.Empty<string>());

            var contextFactory = new DataFlowPipelineContextFactory<DataTable>();
            var context = contextFactory.Create(PipelineUsage.LogsToTableLoadInfo | PipelineUsage.FixedDestination);

            var engine = new DataFlowPipelineEngine<DataTable>(context, source, destination, job);

            ITableLoadInfo loadInfo = job.DataLoadInfo.CreateTableLoadInfo("Truncate RAW table " + RAWTableName,
                _dbInfo.Server.Name + "." + _dbInfo.GetRuntimeName(),
                new []
                {
                    new DataSource(
                        "Remote SqlServer Servername=" + _remoteDatabase.Server + "Database=" + _dbInfo.GetRuntimeName() +
                        
                        //Either list the table or the query depending on what is populated
                        (RemoteTableName != null?" Table=" + RemoteTableName
                            :" Query = " + sql), DateTime.Now)
                }, -1);

            engine.Initialize(loadInfo);
            engine.ExecutePipeline(new GracefulCancellationToken());

            if (source.TotalRowsRead == 0 && LoadNotRequiredIfNoRowsRead)
            {
                job.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information, "No rows were read from the remote table and LoadNotRequiredIfNoRowsRead is true so returning ExitCodeType.LoadNotRequired"));
                return ExitCodeType.OperationNotRequired;
            }

            job.OnNotify(this, new NotifyEventArgs(source.TotalRowsRead > 0 ? ProgressEventType.Information : ProgressEventType.Warning, "Finished after reading " + source.TotalRowsRead + " rows"));


            if (Progress != null)
            {
                if(ProgressUpdateStrategy == null)
                    throw new Exception("ProgressUpdateStrategy is null but there is a Progress");

                ProgressUpdateStrategy.AddAppropriateDisposeStep((ScheduledDataLoadJob) job, _dbInfo);
            }

            return ExitCodeType.Success;
        }
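
The CommandAdjuster hook in example #1 is how parameters reach providers such as Oracle and Postgres that cannot take embedded parameter declarations in the SQL text. Below is a minimal sketch of the same idea using only System.Data.Common; the helper name and parameter names are illustrative, not part of RDMP.

    using System;
    using System.Data.Common;

    public static class CommandParameterHelper
    {
        //adds a named parameter to any provider's DbCommand, mirroring the CommandAdjuster lambda above
        public static void AddParameter(DbCommand cmd, string name, object value)
        {
            var p = cmd.CreateParameter();   //provider-specific DbParameter
            p.ParameterName = name;
            p.Value = value ?? DBNull.Value; //ADO.NET expects DBNull, never a raw null
            cmd.Parameters.Add(p);
        }
    }

    //usage, e.g. inside a CommandAdjuster-style callback:
    //source.CommandAdjuster = cmd =>
    //{
    //    CommandParameterHelper.AddParameter(cmd, "@startDate", new DateTime(2020, 1, 1));
    //    CommandParameterHelper.AddParameter(cmd, "@endDate", new DateTime(2020, 12, 31));
    //};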
Code example #2
        public virtual DataTable GetChunk(IDataLoadEventListener listener, GracefulCancellationToken cancellationToken)
        {
            // we are in the Global Commands case, let's return an empty DataTable (not null)
            // so we can trigger the destination to extract the globals docs and sql
            if (GlobalsRequest != null)
            {
                GlobalsRequest.ElevateState(ExtractCommandState.WaitingForSQLServer);
                if (firstGlobalChunk)
                {
                    //unless we are checking, start auditing
                    StartAuditGlobals();

                    firstGlobalChunk = false;
                    return new DataTable(ExtractionDirectory.GLOBALS_DATA_NAME);
                }

                return null;
            }

            if (Request == null)
            {
                throw new Exception("Component has not been initialized before being asked to GetChunk(s)");
            }

            Request.ElevateState(ExtractCommandState.WaitingForSQLServer);

            if (_cancel)
            {
                throw new Exception("User cancelled data extraction");
            }

            if (_hostedSource == null)
            {
                StartAudit(Request.QueryBuilder.SQL);

                if (Request.DatasetBundle.DataSet.DisableExtraction)
                {
                    throw new Exception("Cannot extract " + Request.DatasetBundle.DataSet + " because DisableExtraction is set to true");
                }

                _hostedSource = new DbDataCommandDataFlowSource(GetCommandSQL(listener),
                                                                "ExecuteDatasetExtraction " + Request.DatasetBundle.DataSet,
                                                                Request.GetDistinctLiveDatabaseServer().Builder,
                                                                ExecutionTimeout);

                // If we are running in batches then always allow empty extractions
                _hostedSource.AllowEmptyResultSets = AllowEmptyExtractions || Request.IsBatchResume;
                _hostedSource.BatchSize            = BatchSize;
            }

            DataTable chunk = null;

            try
            {
                chunk = _hostedSource.GetChunk(listener, cancellationToken);

                chunk = _peeker.AddPeekedRowsIfAny(chunk);

                //if we are trying to distinct the records in memory based on release id
                if (DistinctStrategy == DistinctStrategy.OrderByAndDistinctInMemory)
                {
                    var releaseIdentifierColumn = Request.ReleaseIdentifierSubstitutions.First().GetRuntimeName();

                    if (chunk != null)
                    {
                        //last release id in the current chunk
                        var lastReleaseId = chunk.Rows[chunk.Rows.Count - 1][releaseIdentifierColumn];

                        _peeker.AddWhile(_hostedSource, r => Equals(r[releaseIdentifierColumn], lastReleaseId), chunk);
                        chunk = MakeDistinct(chunk, listener, cancellationToken);
                    }
                }
            }
            catch (AggregateException a)
            {
                if (a.GetExceptionIfExists<TaskCanceledException>() != null)
                {
                    _cancel = true;
                }

                throw;
            }
            catch (Exception e)
            {
                listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Error, "Read from source failed", e));
            }

            if (cancellationToken.IsCancellationRequested)
            {
                throw new Exception("Data read cancelled because our cancellationToken was set, aborting data reading");
            }

            //if the first chunk is null
            if (firstChunk && chunk == null && !AllowEmptyExtractions)
            {
                throw new Exception("There is no data to load, query returned no rows, query was:" + Environment.NewLine +
                                    (_hostedSource.Sql ?? Request.QueryBuilder.SQL));
            }

            //not the first chunk anymore
            firstChunk = false;

            //data exhausted
            if (chunk == null)
            {
                listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information, "Data exhausted after reading " + _rowsRead + " rows of data (" + UniqueReleaseIdentifiersEncountered.Count + " unique release identifiers seen)"));
                if (Request != null)
                {
                    Request.CumulativeExtractionResults.DistinctReleaseIdentifiersEncountered = Request.IsBatchResume ? -1 : UniqueReleaseIdentifiersEncountered.Count;
                }
                return null;
            }

            _rowsRead += chunk.Rows.Count;
            //chunk will have datatypes for all the things in the buffer so we can populate our dictionary of facts about what columns/catalogue items have spontaneously changed name/type etc
            if (ExtractTimeTransformationsObserved == null)
            {
                GenerateExtractionTransformObservations(chunk);
            }


            //see if the SqlDataReader has a column with the same name as the ReleaseIdentifierSQL (if so then we can use it to count the number of distinct subjects written out to the csv)
            bool includesReleaseIdentifier = _extractionIdentifiersidx.Count > 0;


            //first line - let's see what columns we wrote out
            //looks at the buffer and computes any transforms performed on the column
            _timeSpentValidating.Start();
            //build up the validation report (Missing/Wrong/Etc) - this has no mechanical effect on the extracted data just some metadata that goes into a flat file
            if (ExtractionTimeValidator != null && Request.IncludeValidation)
            {
                try
                {
                    chunk.Columns.Add(ValidationColumnName);

                    ExtractionTimeValidator.Validate(chunk, ValidationColumnName);

                    _rowsValidated += chunk.Rows.Count;
                    listener.OnProgress(this, new ProgressEventArgs("Validation", new ProgressMeasurement(_rowsValidated, ProgressType.Records), _timeSpentValidating.Elapsed));
                }
                catch (Exception ex)
                {
                    listener.OnNotify(this, new NotifyEventArgs(ProgressEventType.Error, "Could not validate data chunk", ex));
                    ValidationFailureException = ex;
                    ExtractionTimeValidator    = null;
                }
            }
            _timeSpentValidating.Stop();

            _timeSpentBuckettingDates.Start();
            if (ExtractionTimeTimeCoverageAggregator != null)
            {
                _rowsBucketted += chunk.Rows.Count;

                foreach (DataRow row in chunk.Rows)
                {
                    ExtractionTimeTimeCoverageAggregator.ProcessRow(row);
                }

                listener.OnProgress(this, new ProgressEventArgs("Bucketting Dates", new ProgressMeasurement(_rowsBucketted, ProgressType.Records), _timeSpentCalculatingDISTINCT.Elapsed));
            }
            _timeSpentBuckettingDates.Stop();

            _timeSpentCalculatingDISTINCT.Start();
            //record unique release identifiers found
            if (includesReleaseIdentifier)
            {
                foreach (string idx in _extractionIdentifiersidx)
                {
                    foreach (DataRow r in chunk.Rows)
                    {
                        if (r[idx] == DBNull.Value)
                        {
                            if (_extractionIdentifiersidx.Count == 1)
                            {
                                throw new Exception("Null release identifier found in extract of dataset " + Request.DatasetBundle.DataSet);
                            }
                            else
                            {
                                continue; //there are multiple extraction identifiers, that's fine if one or two are null
                            }
                        }
                        if (!UniqueReleaseIdentifiersEncountered.Contains(r[idx]))
                        {
                            UniqueReleaseIdentifiersEncountered.Add(r[idx]);
                        }
                    }

                    listener.OnProgress(this, new ProgressEventArgs("Calculating Distinct Release Identifiers", new ProgressMeasurement(UniqueReleaseIdentifiersEncountered.Count, ProgressType.Records), _timeSpentCalculatingDISTINCT.Elapsed));
                }
            }
            _timeSpentCalculatingDISTINCT.Stop();

            return chunk;
        }
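
The OrderByAndDistinctInMemory branch in example #2 depends on the peeker: after each chunk is read, the source keeps pulling rows while they share the chunk's last release identifier, so all rows for one subject land in the same chunk before MakeDistinct runs. A simplified, self-contained sketch of that carry-over logic follows; Row and RowPeeker are illustrative stand-ins for RDMP's actual types, and the sketch assumes the source is ordered by release identifier.

    using System.Collections.Generic;

    public record Row(string ReleaseId, string Payload);

    //simplified stand-in for the _peeker used above: reads ahead of the current
    //chunk and remembers the single row it overshot so the next chunk starts with it
    public class RowPeeker
    {
        private Row _pending;

        //replay the overshot row (if any) at the front of the next chunk
        public List<Row> AddPeekedRowsIfAny(List<Row> chunk)
        {
            if (_pending != null && chunk != null)
            {
                chunk.Insert(0, _pending);
                _pending = null;
            }
            return chunk;
        }

        //keep pulling rows while they match the last ReleaseId in 'chunk'
        public void AddWhile(IEnumerator<Row> source, List<Row> chunk)
        {
            if (chunk == null || chunk.Count == 0)
                return;

            var lastId = chunk[chunk.Count - 1].ReleaseId;

            while (source.MoveNext())
            {
                if (!Equals(source.Current.ReleaseId, lastId))
                {
                    _pending = source.Current; //first row of the next subject
                    return;
                }
                chunk.Add(source.Current);
            }
        }
    }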
Code example #3
File: RemoteDatabaseAttacher.cs Project: HDRUK/RDMP
        public override ExitCodeType Attach(IDataLoadJob job, GracefulCancellationToken cancellationToken)
        {
            if (job == null)
            {
                throw new Exception("Job is Null, we require to know the job to build a DataFlowPipeline");
            }

            string sql;

            var dbFrom = RemoteSource.Discover(DataAccessContext.DataLoad);

            var remoteTables = new HashSet<string>(dbFrom.DiscoverTables(true).Select(t => t.GetRuntimeName()), StringComparer.CurrentCultureIgnoreCase);
            var loadables    = job.RegularTablesToLoad.Union(job.LookupTablesToLoad).ToArray();

            var syntaxFrom = dbFrom.Server.GetQuerySyntaxHelper();

            foreach (var tableInfo in loadables)
            {
                var table = tableInfo.GetRuntimeName();
                if (!remoteTables.Contains(table))
                {
                    throw new Exception("Loadable table " + table + " was NOT found on the remote DB!");
                }

                if (LoadRawColumnsOnly)
                {
                    //only fetch the columns that exist at the RAW stage
                    var rawColumns = tableInfo.GetColumnsAtStage(LoadStage.AdjustRaw);
                    sql = "SELECT " + string.Join(",", rawColumns.Select(c =>
                                                                         syntaxFrom.EnsureWrapped(c.GetRuntimeName(LoadStage.AdjustRaw)))) + " FROM " + syntaxFrom.EnsureWrapped(table);
                }
                else
                {
                    sql = "SELECT * FROM " + syntaxFrom.EnsureWrapped(table);
                }

                job.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information, "About to execute SQL:" + Environment.NewLine + sql));

                var source = new DbDataCommandDataFlowSource(sql, "Fetch data from " + dbFrom + " to populate RAW table " + table, dbFrom.Server.Builder, Timeout == 0 ? 50000 : Timeout);

                var destination = new SqlBulkInsertDestination(_dbInfo, table, Enumerable.Empty<string>());

                var contextFactory = new DataFlowPipelineContextFactory<DataTable>();
                var context        = contextFactory.Create(PipelineUsage.LogsToTableLoadInfo | PipelineUsage.FixedDestination);

                var engine = new DataFlowPipelineEngine<DataTable>(context, source, destination, job);

                ITableLoadInfo loadInfo = job.DataLoadInfo.CreateTableLoadInfo("Truncate RAW table " + table,
                                                                               _dbInfo.Server.Name + "." + _dbInfo.GetRuntimeName(),
                                                                               new[]
                {
                    new DataSource(
                        "Remote SqlServer Servername=" + dbFrom.Server + ";Database=" + _dbInfo.GetRuntimeName() +

                        //Either list the table or the query depending on what is populated
                        (table != null ? " Table=" + table : " Query = " + sql), DateTime.Now)
                }, -1);

                engine.Initialize(loadInfo);
                engine.ExecutePipeline(new GracefulCancellationToken());

                if (source.TotalRowsRead == 0)
                {
                    job.OnNotify(this, new NotifyEventArgs(ProgressEventType.Information, "No rows were read from the remote table " + table + "."));
                }

                job.OnNotify(this, new NotifyEventArgs(source.TotalRowsRead > 0 ? ProgressEventType.Information : ProgressEventType.Warning, "Finished after reading " + source.TotalRowsRead + " rows"));
            }

            return ExitCodeType.Success;
        }
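
Stripped of auditing and notifications, examples #1 and #3 wire the same three objects together for every table. The skeleton below distills that wiring into one method; it reuses the RDMP types shown above, while the parameter names (and the omitted namespace imports) are illustrative.

    //sketch of the pipeline skeleton both attachers share
    public static void RunFetchPipeline(string sql, DiscoveredDatabase rawDb, string rawTable,
                                        DbConnectionStringBuilder builder, int timeout,
                                        IDataLoadJob job, ITableLoadInfo loadInfo)
    {
        //source streams the query results in DataTable chunks
        var source = new DbDataCommandDataFlowSource(sql, "Fetch data into " + rawTable, builder, timeout);

        //destination bulk inserts each chunk into the RAW table
        var destination = new SqlBulkInsertDestination(rawDb, rawTable, Enumerable.Empty<string>());

        var context = new DataFlowPipelineContextFactory<DataTable>()
            .Create(PipelineUsage.LogsToTableLoadInfo | PipelineUsage.FixedDestination);

        var engine = new DataFlowPipelineEngine<DataTable>(context, source, destination, job);
        engine.Initialize(loadInfo); //hook up the audit object created via job.DataLoadInfo
        engine.ExecutePipeline(new GracefulCancellationToken());
    }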
Code example #4
File: SourceTests.cs Project: 24418863/rdm
        public void RetrieveChunks()
        {
            var source = new DbDataCommandDataFlowSource("Select top 3 * from master.sys.tables", "Query Sys tables", DiscoveredServerICanCreateRandomDatabasesAndTablesOn.Builder, 30);

            Assert.AreEqual(3, source.GetChunk(new ThrowImmediatelyDataLoadEventListener(), new GracefulCancellationToken()).Rows.Count);
        }
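
The test in example #4 reads a single chunk, but GetChunk returns one DataTable per batch and null once the reader is exhausted (the behaviour example #2 relies on), so draining a source is a simple loop. A sketch using the same test fixture, with a deliberately small BatchSize to force multiple chunks:

    var source = new DbDataCommandDataFlowSource("Select * from master.sys.tables", "Query Sys tables",
                                                 DiscoveredServerICanCreateRandomDatabasesAndTablesOn.Builder, 30);
    source.BatchSize = 2; //small batches so more than one chunk comes back

    int totalRows = 0;
    DataTable chunk;
    while ((chunk = source.GetChunk(new ThrowImmediatelyDataLoadEventListener(), new GracefulCancellationToken())) != null)
        totalRows += chunk.Rows.Count;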
Code example #5
        public override ExitCodeType Run(IDataLoadJob job, GracefulCancellationToken cancellationToken)
        {
            if (_pipeline != null)
            {
                throw new Exception("Pipeline already executed once");
            }

            var contextFactory = new DataFlowPipelineContextFactory<DataTable>();
            var context        = contextFactory.Create(PipelineUsage.LoadsSingleTableInfo | PipelineUsage.FixedDestination | PipelineUsage.LogsToTableLoadInfo);

            //where we are coming from (source)
            var sourceConvention = LoadBubble.Raw;
            DiscoveredDatabase sourceDatabase = _databaseConfiguration.DeployInfo[sourceConvention];
            var sourceTableName = _tableInfo.GetRuntimeName(sourceConvention, _databaseConfiguration.DatabaseNamer);

            //What to do if where we are coming from does not have the table existing on it
            if (!sourceDatabase.ExpectTable(sourceTableName).Exists())
            {
                if (_isLookupTable)
                {
                    job.OnNotify(this,
                                 new NotifyEventArgs(ProgressEventType.Warning,
                                                     "Lookup table " + sourceTableName + " did not exist on RAW so was not migrated to STAGING"));
                    return ExitCodeType.Success;
                }
                else
                {
                    job.OnNotify(this,
                                 new NotifyEventArgs(ProgressEventType.Error,
                                                     "Table " + sourceTableName + " did not exist in RAW database " + sourceDatabase +
                                                     " when it came time to migrate RAW to STAGING (and the table is not a lookup)"));
                }
            }


            // where we are going to (destination)
            // ignore any columns that are marked for discard
            var destinationConvention = LoadBubble.Staging;
            DiscoveredDatabase destinationDatabase = _databaseConfiguration.DeployInfo[destinationConvention];
            var destinationTableName = _tableInfo.GetRuntimeName(destinationConvention, _databaseConfiguration.DatabaseNamer);

            DeleteFullyNullRecords(sourceTableName, sourceDatabase, job);

            //audit
            ITableLoadInfo tableLoadInfo = job.DataLoadInfo.CreateTableLoadInfo(
                "None required, if fails then simply drop Staging database and reload dataset", "STAGING:" + destinationTableName,
                new DataSource[] { new DataSource("RAW:" + sourceTableName, DateTime.Now) }, -1);

            var syntax = sourceDatabase.Server.GetQuerySyntaxHelper();

            //connect to the source and open a reader! Note that GetReaderForRAW will at this point preserve the state of the database, so commands such as deletes will not have any effect even though ExecutePipeline has not been called!
            var source = new DbDataCommandDataFlowSource(
                "Select distinct * from " + syntax.EnsureWrapped(sourceTableName),
                "Fetch data from " + syntax.EnsureWrapped(sourceTableName),
                sourceDatabase.Server.Builder, 50000);

            //ignore those that are pre load discarded columns (unless they are dilution, in which case they get passed through in a decrepit state instead of dumped entirely - these fields will still be in ANODump in pristine state btw)
            var columnNamesToIgnoreForBulkInsert = _tableInfo.PreLoadDiscardedColumns.Where(c => c.Destination != DiscardedColumnDestination.Dilute).Select(column => column.RuntimeColumnName).ToList();

            //pass pre load discard
            var destination = new SqlBulkInsertDestination(destinationDatabase, destinationTableName, columnNamesToIgnoreForBulkInsert);

            //engine that will move data
            _pipeline = new DataFlowPipelineEngine<DataTable>(context, source, destination, job);

            //add clean strings component
            _pipeline.ComponentObjects.Add(new CleanStrings());

            //add dropping of preload discard columns
            _pipeline.ComponentObjects.Add(new BasicAnonymisationEngine());

            _pipeline.Initialize(tableLoadInfo, _tableInfo);

            //tell it to move data
            _pipeline.ExecutePipeline(cancellationToken);

            return ExitCodeType.Success;
        }
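
Unlike the attachers, the RAW-to-STAGING pipeline in example #5 has middle components: anything added to ComponentObjects runs between the source and destination on every chunk. A hypothetical per-chunk transform in the style of CleanStrings is sketched below; the exact component interface is not shown in these examples, so only the transform body is illustrated.

    //illustrative per-chunk transform: trims whitespace from every string cell
    //before the chunk reaches the bulk insert destination
    public DataTable ProcessChunk(DataTable chunk)
    {
        foreach (DataRow row in chunk.Rows)
            foreach (DataColumn col in chunk.Columns)
                if (col.DataType == typeof(string) && row[col] != DBNull.Value)
                    row[col] = ((string)row[col]).Trim();

        return chunk;
    }

    //registered the same way as the real components above:
    //_pipeline.ComponentObjects.Add(new CleanStrings());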