Ejemplo n.º 1
0
        /// <summary>
        /// Lets the wizard build the parameters of the "stage" activity from the logs-etl pipeline,
        /// verifies the resulting linked services and datasets, and finally executes a test activity with them.
        /// </summary>
        public void TestInitParameters()
        {
            DfRuntime.IActivityLogger activityLogger = new ActivityLogger();

            // test data: the pipeline definition on disk and the activity to pick out of it
            string pipelineFile      = Path.GetFullPath(@"..\..\Data\logs-etl-pipeline.json");
            string stageActivityName = "stage";

            Assert.IsTrue(File.Exists(pipelineFile));

            // the wizard assigns all three out parameters
            List <DfModels.LinkedService> services;
            List <DfModels.Dataset>       tables;
            DfModels.Activity             stageActivity;

            Wizard.InitParameters(pipelineFile, stageActivityName, out services, out tables, out stageActivity);

            Assert.AreEqual(2, services.Count);
            Assert.AreEqual(3, tables.Count);

            // every expected dataset must be present exactly once and carry the right type properties
            DfModels.Dataset importBlob = (from table in tables
                                           where table.Name == "import-month-dataset"
                                           select table).Single();
            Assert.IsInstanceOfType(importBlob.Properties.TypeProperties, typeof(DfModels.AzureBlobDataset));

            DfModels.Dataset factBlob = (from table in tables
                                         where table.Name == "fact-month-dataset"
                                         select table).Single();
            Assert.IsInstanceOfType(factBlob.Properties.TypeProperties, typeof(DfModels.AzureBlobDataset));

            DfModels.Dataset factAzureTable = (from table in tables
                                               where table.Name == "fact-month-table"
                                               select table).Single();
            Assert.IsInstanceOfType(factAzureTable.Properties.TypeProperties, typeof(DfModels.AzureTableDataset));

            // run the activity with the parameters produced by the wizard
            DfRuntime.IDotNetActivity activityUnderTest = new TestActivity();
            activityUnderTest.Execute(services, tables, stageActivity, activityLogger);
        }
        /// <summary>
        /// Transforms the values of a column in an Azure table. The column may be a normal column or the RowKey column, but cannot be the PartitionKey column.
        /// The column to be transformed is specified using the following extended properties
        /// Extended Properties
        ///     columnName - Name of the column to be transformed
        ///     columnType - Data type of the column. Only supported types right now are: int32, bool, and string
        ///     ifColumnValueMatches - The transformation is applied only if the contents of column value matches the specified value.
        ///     replaceColumnValueWith - Replace the contents of the matched column value with the specified value.
        ///     ifRowKeyContains - The transformation is applied only if the contents of row key contains the specified value.
        ///     replaceRowKeySubStrWith - Replace the contents of the matched row key with the specified value to generate a new row key.
        ///     rowKeyPrefixes - Comma separated rowkey prefixes of the rows in which the column transformation will be applied. This is optional and will identify the subset of rows to do this operation.
        /// You can specify columnName,columnType,ifColumnValueMatches,replaceColumnValueWith or ifRowKeyContains,replaceRowKeySubStrWith or both as they work on different column types.
        /// When columnName is present its three companion properties are required; when ifRowKeyContains is present it must be non-empty and replaceRowKeySubStrWith is required.
        /// Extended Properties Example
        ///   "columnName": "IdentityProviderType",
        ///   "columnType": "string",
        ///   "ifColumnValueMatches": "Beihai",
        ///   "replaceColumnValueWith": "AADS2S",
        ///   "ifRowKeyContains": "Beihai",
        ///   "replaceRowKeySubStrWith": "AADS2S"
        ///  Activity Operation
        ///     The activity iterates through all the rows from the input table with the matching rowKeyPrefixes,
        ///     checks for the column, applies the column transformation if the column value match is found
        ///     checks for the row key update, applies the row key transformation if the row key match is found
        ///     runs a replace table operation in case of column transformation only
        ///     runs a delete insert operation in case of row key transformation
        /// </summary>
        /// <param name="linkedServices">Linked services referenced by activity definition.</param>
        /// <param name="datasets">Datasets referenced by activity definition.</param>
        /// <param name="activity">Activity definition.</param>
        /// <param name="logger">Used to log messages during activity execution.</param>
        /// <returns>Activity state at the end of execution; this implementation always returns an empty dictionary.</returns>
        /// <exception cref="System.ArgumentException">A required companion extended property is missing, or ifRowKeyContains is empty.</exception>
        /// <exception cref="System.InvalidOperationException">The input dataset is not an Azure table dataset, or its linked service is not an Azure storage linked service.</exception>
        public IDictionary <string, string> Execute(
            IEnumerable <LinkedService> linkedServices,
            IEnumerable <Dataset> datasets,
            Microsoft.Azure.Management.DataFactories.Models.Activity activity,
            IActivityLogger logger)
        {
            DotNetActivity dotNetActivity = (DotNetActivity)activity.TypeProperties;
            IDictionary <string, string> extendedProperties = dotNetActivity.ExtendedProperties;

            logger.Write("Logging extended properties if any...");
            foreach (KeyValuePair <string, string> entry in extendedProperties)
            {
                logger.Write("<key:{0}> <value:{1}>", entry.Key, entry.Value);
            }

            // Optional row filter: null means "process every row".
            string[] rowKeyPrefixes = null;
            string   rowKeyPrefixList;
            if (extendedProperties.TryGetValue("rowKeyPrefixes", out rowKeyPrefixList))
            {
                rowKeyPrefixes = rowKeyPrefixList.Split(',');
            }

            string columnName = string.Empty, columnType = string.Empty, ifColumnValueMatches = string.Empty, replaceColumnValueWith = string.Empty;
            bool   hasColumnUpdate = extendedProperties.TryGetValue("columnName", out columnName);

            if (hasColumnUpdate)
            {
                // A column update is only meaningful together with its three companion properties;
                // report a missing one by name instead of failing with a bare dictionary lookup error.
                if (!extendedProperties.TryGetValue("columnType", out columnType))
                {
                    throw new System.ArgumentException("Column type is required when columnName is specified", "columnType");
                }

                if (!extendedProperties.TryGetValue("ifColumnValueMatches", out ifColumnValueMatches))
                {
                    throw new System.ArgumentException("Column value match criteria is required when columnName is specified", "ifColumnValueMatches");
                }

                if (!extendedProperties.TryGetValue("replaceColumnValueWith", out replaceColumnValueWith))
                {
                    throw new System.ArgumentException("Column replacement value is required when columnName is specified", "replaceColumnValueWith");
                }
            }

            string ifRowKeyContains = string.Empty, replaceRowKeySubStrWith = string.Empty;
            bool   hasRowKeyUpdate = extendedProperties.TryGetValue("ifRowKeyContains", out ifRowKeyContains);

            if (hasRowKeyUpdate)
            {
                // string.Replace rejects an empty search value, so fail before any row has been touched.
                if (string.IsNullOrEmpty(ifRowKeyContains))
                {
                    throw new System.ArgumentException("Row key match criteria must not be empty", "ifRowKeyContains");
                }

                if (!extendedProperties.TryGetValue("replaceRowKeySubStrWith", out replaceRowKeySubStrWith))
                {
                    throw new System.ArgumentException("Row key substring replacement value is required when ifRowKeyContains is specified", "replaceRowKeySubStrWith");
                }
            }

            // For activities working on a single dataset, the first entry is the input dataset.
            // The activity.Inputs can have multiple datasets for building pipeline workflow dependencies. We can ignore the rest of the datasets
            string  inputDatasetName = activity.Inputs.First().Name;
            Dataset inputDataset     = datasets.Single(dataset => dataset.Name == inputDatasetName);

            AzureTableDataset sourceTable = inputDataset.Properties.TypeProperties as AzureTableDataset;
            if (sourceTable == null)
            {
                throw new System.InvalidOperationException(
                          string.Format("Input dataset '{0}' is not an Azure table dataset.", inputDataset.Name));
            }

            logger.Write("input table:{0}", sourceTable.TableName);

            AzureStorageLinkedService inputLinkedService = linkedServices.First(
                ls => ls.Name == inputDataset.Properties.LinkedServiceName).Properties.TypeProperties
                                                           as AzureStorageLinkedService;
            if (inputLinkedService == null)
            {
                throw new System.InvalidOperationException(
                          string.Format("Linked service '{0}' is not an Azure storage linked service.", inputDataset.Properties.LinkedServiceName));
            }

            string inputConnectionString = inputLinkedService.ConnectionString;

            // create storage client for input. Pass the connection string.
            CloudStorageAccount inputStorageAccount = CloudStorageAccount.Parse(inputConnectionString);
            CloudTableClient    inputTableClient    = inputStorageAccount.CreateCloudTableClient();
            CloudTable          inputTable          = inputTableClient.GetTableReference(sourceTable.TableName);

            long totalProcessedRecords = 0;
            long actualAffectedRecords = 0;
            TableContinuationToken tableContinuationToken = null;
            List <Task>            tasks = new List <Task>();

            do
            {
                var resultSegment = inputTable.ExecuteQuerySegmented(new TableQuery(), tableContinuationToken);
                tableContinuationToken = resultSegment.ContinuationToken;

                // Batches are built per partition key.
                var partitionGroups = (from s in resultSegment.Results
                                       where rowKeyPrefixes == null || rowKeyPrefixes.Length == 0 || this.IsMatch(s.RowKey, rowKeyPrefixes)
                                       select s).GroupBy(a => a.PartitionKey);

                foreach (IGrouping <string, DynamicTableEntity> g in partitionGroups)
                {
                    TableBatchOperation batch = new TableBatchOperation();
                    foreach (DynamicTableEntity e in g)
                    {
                        // Snapshot the entity identity before it is modified; the snapshot addresses the row to delete.
                        string cachedRowkey = e.RowKey;
                        IDictionary <string, EntityProperty> cachedProperties = new Dictionary <string, EntityProperty>(e.Properties);

                        bool recordUpdated  = hasColumnUpdate && this.ReplaceIfMatch(e, columnName, columnType, ifColumnValueMatches, replaceColumnValueWith);
                        bool requiresDelete = false;

                        if (hasRowKeyUpdate && e.RowKey.Contains(ifRowKeyContains))
                        {
                            e.RowKey = e.RowKey.Replace(ifRowKeyContains, replaceRowKeySubStrWith);

                            // An entity may appear only once in a table batch, so the insert + delete pair
                            // is only valid when the row key really differs from the original one.
                            if (e.RowKey != cachedRowkey)
                            {
                                recordUpdated  = true;
                                requiresDelete = true;
                            }
                        }

                        if (!recordUpdated)
                        {
                            continue;
                        }

                        if (requiresDelete)
                        {
                            // The row key cannot be updated in place: write the re-keyed row and drop the old one.
                            batch.Insert(e);
                            batch.Delete(new DynamicTableEntity(e.PartitionKey, cachedRowkey, "*", cachedProperties));
                        }
                        else
                        {
                            batch.Replace(e);
                        }

                        actualAffectedRecords++;
                        logger.Write("<partition key:{0}>, <row key:{1}> added to batch", e.PartitionKey, e.RowKey);
                    }

                    if (batch.Count > 0)
                    {
                        tasks.Add(inputTable.ExecuteBatchInChunkAsync(batch));
                    }

                    logger.Write("Updated partition: {0}", g.Key);
                }

                totalProcessedRecords += resultSegment.Results.Count;
                logger.Write("Processed records count: {0}", totalProcessedRecords);
                logger.Write("Affected records count: {0}", actualAffectedRecords);
            }while (tableContinuationToken != null);

            // Execute is synchronous, so block until every submitted batch has finished (or failed).
            Task.WaitAll(tasks.ToArray());
            logger.Write("Updated {0} records", actualAffectedRecords);

            return(new Dictionary <string, string>());
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Placeholder activity body: does no work with the supplied linked services, datasets, activity or logger.
 /// </summary>
 /// <param name="linkedServices">Linked services referenced by the activity definition (unused).</param>
 /// <param name="datasets">Datasets referenced by the activity definition (unused).</param>
 /// <param name="activity">Activity definition (unused).</param>
 /// <param name="logger">Logger for activity execution (unused).</param>
 /// <returns>An empty dictionary, never null, so callers can use the result without a null check.</returns>
 public IDictionary <string, string> Execute(IEnumerable <DfModels.LinkedService> linkedServices, IEnumerable <DfModels.Dataset> datasets, DfModels.Activity activity, DfRuntime.IActivityLogger logger)
 {
     // do your elephant business
     return(new Dictionary <string, string>());
 }
Ejemplo n.º 4
0
        /// <summary>
        /// Loads the activity's input datasets, runs the configured transform over them and saves the
        /// transformed result to every output dataset.
        /// Extended Properties
        ///     SliceStart - Start of the slice, parsed with Convert.ToDateTime
        ///     SliceEnd - End of the slice, parsed with Convert.ToDateTime
        ///     transform - Transform definition handed to FactoryWorkerTransformer.Transform
        /// Input datasets for which no provider matches are skipped; every output dataset must have a provider.
        /// </summary>
        /// <param name="linkedServices">Linked services referenced by activity definition.</param>
        /// <param name="datasets">Datasets referenced by activity definition.</param>
        /// <param name="activity">Activity definition.</param>
        /// <param name="logger">Used to log messages during activity execution.</param>
        /// <returns>An empty dictionary.</returns>
        /// <exception cref="KeyNotFoundException">An extended property is missing, or an output dataset has no matching provider.</exception>
        public IDictionary<string, string> Execute(
            IEnumerable<LinkedService> linkedServices,
            IEnumerable<Dataset> datasets,
            ADFActivity activity,
            IActivityLogger logger)
        {
            logger.Write("Activity start.\n");
            Func<string, LinkedService> linkedServiceResolver = name => linkedServices.Single(ls => ls.Name == name);

            // Materialize the names once; they are enumerated several times below.
            IList<string> inputNames = activity.Inputs.Select(input => input.Name).ToList();
            IList<string> outputNames = activity.Outputs.Select(output => output.Name).ToList();
            IList<CustomDbDataset> dbDatasets = new List<CustomDbDataset>();

            // convert transform properties from activity's one.
            var dotNetActivity = (DotNetActivity)activity.TypeProperties;
            Slice slice = new Slice
            {
                Start = Convert.ToDateTime(dotNetActivity.ExtendedProperties["SliceStart"].ToString()),
                End = Convert.ToDateTime(dotNetActivity.ExtendedProperties["SliceEnd"].ToString())
            };
            string transform = dotNetActivity.ExtendedProperties["transform"].ToString();

            logger.Write("Slice from {0} to {1}\n", slice.Start, slice.End);

            // create providers: one per distinct dataset name; the first matching provider kind wins.
            logger.Write("create providers\n");
            IDictionary<string, IDatasetProvider> providers = new Dictionary<string, IDatasetProvider>();
            foreach (string datasetName in inputNames.Concat(outputNames).Distinct())
            {
                var dataset = datasets.Single(d => d.Name == datasetName);
                var linkedService = linkedServiceResolver(dataset.Properties.LinkedServiceName);

                if (CustomDbDatasetProvider.IsMatch(dataset, linkedService))
                {
                    var dbProvider = new CustomDbDatasetProvider(dataset, linkedService, linkedServiceResolver);

                    // The transformer receives the db datasets separately from the loaded model.
                    dbDatasets.Add(dbProvider.Dataset);
                    providers.Add(datasetName, dbProvider);
                    logger.Write("{0} is CustomDbDataset\n", datasetName);
                }
                else if (CustomAzureBlobProvider.IsMatch(dataset, linkedService))
                {
                    providers.Add(datasetName, new CustomAzureBlobProvider(dataset, linkedService, slice));
                    logger.Write("{0} is CustomAzureBlobDataset\n", datasetName);
                }
                else if (AzureBlobProvider.IsMatch(dataset, linkedService))
                {
                    providers.Add(datasetName, new AzureBlobProvider(dataset, linkedService, slice));
                    logger.Write("{0} is AzureBlobDataset\n", datasetName);
                }
                else
                {
                    logger.Write("{0} is UnknownDataset\n", datasetName);
                }
            }

            // An output without a provider cannot be saved. Fail before loading or transforming anything
            // rather than after part of the outputs has already been written.
            foreach (string name in outputNames)
            {
                if (!providers.ContainsKey(name))
                {
                    throw new KeyNotFoundException(
                        string.Format("No dataset provider is available for output dataset '{0}'.", name));
                }
            }

            // create model for transform razor.
            dynamic model = new ExpandoObject();
            var dict = (IDictionary<string, dynamic>)model;
            foreach (var name in inputNames)
            {
                IDatasetProvider provider;
                if (!providers.TryGetValue(name, out provider))
                {
                    // Inputs of an unknown dataset kind are not part of the model.
                    continue;
                }

                logger.Write("Try load {0}\n", name);
                dict[provider.InstanceName] = provider.Load(slice);
                logger.Write("Success\n");
            }

            // transform
            logger.Write("Start transform\n");
            var transformer = new FactoryWorkerTransformer(dbDatasets, logger);
            dynamic transformed = transformer.Transform(transform, model, slice);
            logger.Write("End\n");

            // save all
            foreach (var name in outputNames)
            {
                logger.Write("Try save {0}\n", name);
                providers[name].Save(slice, transformed);
                logger.Write("Success\n");
            }

            return new Dictionary<string, string>();
        }
        /// <summary>
        /// Transforms Azure table partition key
        /// The partition key to be transformed is specified using the following extended properties
        /// Extended Properties
        ///     ifPartitionKeyContains - The transformation is applied only if the contents of partition key contains the specified value.
        ///     replacePartitionKeySubStrWith - Replace the contents of the matched partition key with the specified value to generate a new partition key.
        ///     rowKeyPrefixes - Comma separated rowkey prefixes of the rows in which the partition key transformation will be applied. This is optional and will identify the subset of rows to do this operation.
        /// ifPartitionKeyContains,replacePartitionKeySubStrWith are mandatory; ifPartitionKeyContains must not be empty
        /// Extended Properties Example
        ///   "ifPartitionKeyContains": "Beihai",
        ///   "replacePartitionKeySubStrWith": "AADS2S"
        ///  Activity Operation
        ///     The activity iterates through all the rows from the input table with the matching rowKeyPrefixes,
        ///     checks for the partition key update, applies the partition key transformation if the partition key match is found
        ///     runs an insert operation for entities with new partition key and, once that insert has completed, a delete operation on the existing entities with matching partition keys
        ///     partitions whose key is unchanged by the replacement are left untouched
        /// </summary>
        /// <param name="linkedServices">Linked services referenced by activity definition.</param>
        /// <param name="datasets">Datasets referenced by activity definition.</param>
        /// <param name="activity">Activity definition.</param>
        /// <param name="logger">Used to log messages during activity execution.</param>
        /// <returns>Activity state at the end of execution; this implementation always returns an empty dictionary.</returns>
        /// <exception cref="ArgumentException">A mandatory extended property is missing, or ifPartitionKeyContains is empty.</exception>
        /// <exception cref="InvalidOperationException">The input dataset is not an Azure table dataset, or its linked service is not an Azure storage linked service.</exception>
        public IDictionary <string, string> Execute(
            IEnumerable <LinkedService> linkedServices,
            IEnumerable <Dataset> datasets,
            Microsoft.Azure.Management.DataFactories.Models.Activity activity,
            IActivityLogger logger)
        {
            DotNetActivity dotNetActivity = (DotNetActivity)activity.TypeProperties;
            IDictionary <string, string> extendedProperties = dotNetActivity.ExtendedProperties;

            logger.Write("Logging extended properties if any...");
            foreach (KeyValuePair <string, string> entry in extendedProperties)
            {
                logger.Write("<key:{0}> <value:{1}>", entry.Key, entry.Value);
            }

            // Optional row filter: null means "process every row".
            string[] rowKeyPrefixes = null;
            string   rowKeyPrefixList;
            if (extendedProperties.TryGetValue("rowKeyPrefixes", out rowKeyPrefixList))
            {
                rowKeyPrefixes = rowKeyPrefixList.Split(',');
            }

            string ifPartitionKeyContains;
            if (!extendedProperties.TryGetValue("ifPartitionKeyContains", out ifPartitionKeyContains))
            {
                throw new ArgumentException("Partition key match criteria is required", "ifPartitionKeyContains");
            }

            string replacePartitionKeySubStrWith;
            if (!extendedProperties.TryGetValue("replacePartitionKeySubStrWith", out replacePartitionKeySubStrWith))
            {
                throw new ArgumentException("Partition key substring replacement value is required", "replacePartitionKeySubStrWith");
            }

            // string.Replace rejects an empty search value, so fail before any row has been touched.
            if (string.IsNullOrEmpty(ifPartitionKeyContains))
            {
                throw new ArgumentException("Partition key match criteria must not be empty", "ifPartitionKeyContains");
            }

            // For activities working on a single dataset, the first entry is the input dataset.
            // The activity.Inputs can have multiple datasets for building pipeline workflow dependencies. We can ignore the rest of the datasets
            string  inputDatasetName = activity.Inputs.First().Name;
            Dataset inputDataset     = datasets.Single(dataset => dataset.Name == inputDatasetName);

            AzureTableDataset sourceTable = inputDataset.Properties.TypeProperties as AzureTableDataset;
            if (sourceTable == null)
            {
                throw new InvalidOperationException(
                          string.Format("Input dataset '{0}' is not an Azure table dataset.", inputDataset.Name));
            }

            logger.Write("input table:{0}", sourceTable.TableName);

            AzureStorageLinkedService inputLinkedService = linkedServices.First(
                ls => ls.Name == inputDataset.Properties.LinkedServiceName).Properties.TypeProperties
                                                           as AzureStorageLinkedService;
            if (inputLinkedService == null)
            {
                throw new InvalidOperationException(
                          string.Format("Linked service '{0}' is not an Azure storage linked service.", inputDataset.Properties.LinkedServiceName));
            }

            string inputConnectionString = inputLinkedService.ConnectionString;

            // create storage client for input. Pass the connection string.
            CloudStorageAccount inputStorageAccount = CloudStorageAccount.Parse(inputConnectionString);
            CloudTableClient    inputTableClient    = inputStorageAccount.CreateCloudTableClient();
            CloudTable          inputTable          = inputTableClient.GetTableReference(sourceTable.TableName);

            long totalProcessedRecords = 0;
            long actualAffectedRecords = 0;
            TableContinuationToken tableContinuationToken = null;
            List <Task>            tasks = new List <Task>();

            do
            {
                var resultSegment = inputTable.ExecuteQuerySegmented(new TableQuery(), tableContinuationToken);
                tableContinuationToken = resultSegment.ContinuationToken;

                var partitionGroups = (from s in resultSegment.Results
                                       where rowKeyPrefixes == null || rowKeyPrefixes.Length == 0 || this.IsMatch(s.RowKey, rowKeyPrefixes)
                                       select s).GroupBy(a => a.PartitionKey);

                foreach (IGrouping <string, DynamicTableEntity> g in partitionGroups)
                {
                    // Every entity of the group shares g.Key, so the target key is computed once per group.
                    string newPartitionKey = g.Key.Contains(ifPartitionKeyContains)
                                             ? g.Key.Replace(ifPartitionKeyContains, replacePartitionKeySubStrWith)
                                             : g.Key;

                    // When the key does not change there is nothing to move; writing and then deleting the
                    // same key would at best fail and at worst remove the row instead of re-keying it.
                    if (newPartitionKey != g.Key)
                    {
                        TableBatchOperation deleteBatch = new TableBatchOperation();
                        TableBatchOperation insertBatch = new TableBatchOperation();
                        foreach (DynamicTableEntity e in g)
                        {
                            // The partition key cannot be changed in place: copy the row under the new key.
                            DynamicTableEntity newEntity = new DynamicTableEntity(newPartitionKey, e.RowKey);
                            foreach (KeyValuePair <string, EntityProperty> property in e.Properties)
                            {
                                newEntity.Properties.Add(property);
                            }

                            insertBatch.InsertOrReplace(newEntity);
                            deleteBatch.Delete(e);
                            actualAffectedRecords++;
                        }

                        if (insertBatch.Count > 0)
                        {
                            tasks.Add(this.MovePartitionAsync(inputTable, insertBatch, deleteBatch, logger));
                        }
                    }

                    logger.Write("Updated partition: {0}", g.Key);
                }

                totalProcessedRecords += resultSegment.Results.Count;
                logger.Write("Processed records count: {0}", totalProcessedRecords);
                logger.Write("Affected records count: {0}", actualAffectedRecords);
            }while (tableContinuationToken != null);

            // Execute is synchronous, so block until every submitted move has finished (or failed).
            Task.WaitAll(tasks.ToArray());
            logger.Write("Updated {0} records", actualAffectedRecords);

            return(new Dictionary <string, string>());
        }

        /// <summary>
        /// Writes the re-keyed entities and removes the originals only after that write has completed,
        /// so a failed insert can never be followed by a successful delete of the source rows.
        /// </summary>
        /// <param name="table">Table that holds both the original and the re-keyed entities.</param>
        /// <param name="insertBatch">Insert-or-replace operations for the entities under their new partition key.</param>
        /// <param name="deleteBatch">Delete operations for the original entities.</param>
        /// <param name="logger">Used to log messages during activity execution.</param>
        /// <returns>A task that completes when both batches have been executed.</returns>
        private async Task MovePartitionAsync(
            CloudTable table,
            TableBatchOperation insertBatch,
            TableBatchOperation deleteBatch,
            IActivityLogger logger)
        {
            // ConfigureAwait(false): the caller blocks in Task.WaitAll, so continuations must not depend on its context.
            await this.RetryOnStorageTimeout(table.ExecuteBatchInChunkAsync(insertBatch), numRetriesOnTimeout, numMsDelayOnTimeout, logger).ConfigureAwait(false);
            await this.RetryOnStorageTimeout(table.ExecuteBatchInChunkAsync(deleteBatch), numRetriesOnTimeout, numMsDelayOnTimeout, logger).ConfigureAwait(false);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds the linked services, datasets and activity definition of one pipeline activity
        /// from the JSON files that are stored in the same directory as the pipeline file.
        /// </summary>
        /// <param name="pipelinePath">Path of the pipeline JSON file. Dataset and linked service definitions are read from "name.json" files in its directory.</param>
        /// <param name="activityName">Name of the activity to look up in the pipeline.</param>
        /// <param name="linkedServices">Receives the storage services of the activity's datasets and the activity's own linked service.</param>
        /// <param name="datasets">Receives one dataset per input or output of the activity.</param>
        /// <param name="activity">Receives the activity with its name, inputs and outputs set.</param>
        /// <exception cref="Exception">A dataset has a type other than AzureBlob or AzureTable, or the activity was not found.</exception>
        public static void InitParameters(
            string pipelinePath,
            string activityName,
            out List <Models.LinkedService> linkedServices,
            out List <Models.Dataset> datasets,
            out Models.Activity activity)
        {
            // start from empty results
            linkedServices = new List <Models.LinkedService>();
            datasets       = new List <Models.Dataset>();
            activity       = new Models.Activity();

            // load the pipeline definition
            var parsedPipeline = JsonConvert.DeserializeObject <Dummy.Pipeline>(File.ReadAllText(pipelinePath));

            foreach (var candidate in parsedPipeline.Properties.Activities)
            {
                // only the requested activity is of interest
                if (candidate.Name != activityName)
                {
                    continue;
                }

                activity.Name = candidate.Name;

                // all referenced definition files live next to the pipeline file
                var definitionFolder = Path.GetDirectoryName(pipelinePath);

                // inputs and outputs are handled alike, so collect them in one set
                var referencedData = new HashSet <Dummy.ActivityData>();
                referencedData.UnionWith(candidate.Inputs);
                referencedData.UnionWith(candidate.Outputs);

                var referencedServices = new HashSet <Dummy.LinkedService>();

                foreach (var dataRef in referencedData)
                {
                    // load the table definition of the dataset
                    var tableFile   = Path.Combine(definitionFolder, dataRef.Name + ".json");
                    var parsedTable = JsonConvert.DeserializeObject <Dummy.Table>(File.ReadAllText(tableFile));

                    // translate the type specific properties
                    Models.DatasetTypeProperties typeProperties;
                    string tableType = parsedTable.Properties.Type;
                    if (tableType == "AzureBlob")
                    {
                        typeProperties = new Models.AzureBlobDataset
                        {
                            FolderPath = parsedTable.Properties.TypeProperties.FolderPath,
                            FileName   = parsedTable.Properties.TypeProperties.FileName
                        };
                    }
                    else if (tableType == "AzureTable")
                    {
                        typeProperties = new Models.AzureTableDataset
                        {
                            TableName = parsedTable.Properties.TypeProperties.TableName
                        };
                    }
                    else
                    {
                        throw new Exception(string.Format("Unexpected Dataset.Type {0}", parsedTable.Properties.Type));
                    }

                    // build the dataset model and attach its linked service name
                    var availability  = new CommonModels.Availability();
                    var datasetModel  = new Models.Dataset(
                        dataRef.Name,
                        new Models.DatasetProperties(typeProperties, availability, ""));
                    datasetModel.Properties.LinkedServiceName = parsedTable.Properties.LinkedServiceName;
                    datasets.Add(datasetModel);

                    // register the dataset as input and/or output of the activity
                    if (dataRef is Dummy.ActivityInput)
                    {
                        activity.Inputs.Add(new CommonModels.ActivityInput(dataRef.Name));
                    }

                    if (dataRef is Dummy.ActivityOutput)
                    {
                        activity.Outputs.Add(new CommonModels.ActivityOutput(dataRef.Name));
                    }

                    // remember the storage service of the dataset; it is converted further below
                    var storageFile = Path.Combine(definitionFolder, parsedTable.Properties.LinkedServiceName + ".json");
                    referencedServices.Add(JsonConvert.DeserializeObject <Dummy.StorageService>(File.ReadAllText(storageFile)));
                }

                // the activity's own linked service is parsed as a compute service
                var computeFile = Path.Combine(definitionFolder, candidate.LinkedServiceName + ".json");
                referencedServices.Add(JsonConvert.DeserializeObject <Dummy.ComputeService>(File.ReadAllText(computeFile)));

                // convert the collected services into their models
                foreach (var serviceRef in referencedServices)
                {
                    Models.LinkedService converted = null;

                    // storage service: carries a connection string
                    var storageRef = serviceRef as Dummy.StorageService;
                    if (storageRef != null)
                    {
                        var storageModel = new Models.AzureStorageLinkedService
                        {
                            ConnectionString = storageRef.Properties.TypeProperties.ConnectionString
                        };
                        converted = new Models.LinkedService(
                            serviceRef.Name,
                            new Models.LinkedServiceProperties(storageModel));
                    }

                    // compute service: converted to an HDInsight linked service without further settings
                    if (serviceRef is Dummy.ComputeService)
                    {
                        var computeModel = new Models.HDInsightLinkedService();
                        converted = new Models.LinkedService(
                            serviceRef.Name,
                            new Models.LinkedServiceProperties(computeModel));
                    }

                    linkedServices.Add(converted);
                }
            }

            // the name is only set when a matching activity was found
            if (activity.Name == null)
            {
                throw new Exception(string.Format("Activity {0} not found.", activityName));
            }
        }