/// <summary>
/// Resolves the "stage" activity of the sample pipeline through <c>Wizard.InitParameters</c>,
/// verifies the number and types of the resolved linked services and datasets,
/// and finally executes a <c>TestActivity</c> with the resolved parameters.
/// </summary>
public void TestInitParameters()
{
    DfRuntime.IActivityLogger logger = new ActivityLogger();

    // test data
    string pipelinePath = Path.GetFullPath(@"..\..\Data\logs-etl-pipeline.json");
    const string activityName = "stage";
    Assert.IsTrue(File.Exists(pipelinePath));

    // run the wizard; all three results are produced through out parameters
    List<DfModels.LinkedService> linkedServices;
    List<DfModels.Dataset> datasets;
    DfModels.Activity activity;
    Wizard.InitParameters(pipelinePath, activityName, out linkedServices, out datasets, out activity);

    Assert.AreEqual(2, linkedServices.Count);
    Assert.AreEqual(3, datasets.Count);

    // each expected dataset must exist exactly once and carry the expected type properties
    DfModels.Dataset importDataset = datasets.Single(d => d.Name == "import-month-dataset");
    Assert.IsInstanceOfType(importDataset.Properties.TypeProperties, typeof(DfModels.AzureBlobDataset));

    DfModels.Dataset factDataset = datasets.Single(d => d.Name == "fact-month-dataset");
    Assert.IsInstanceOfType(factDataset.Properties.TypeProperties, typeof(DfModels.AzureBlobDataset));

    DfModels.Dataset factTable = datasets.Single(d => d.Name == "fact-month-table");
    Assert.IsInstanceOfType(factTable.Properties.TypeProperties, typeof(DfModels.AzureTableDataset));

    // run the activity with the parameters the wizard produced
    DfRuntime.IDotNetActivity testedActivity = new TestActivity();
    testedActivity.Execute(linkedServices, datasets, activity, logger);
}
/// <summary>
/// Transforms the values of a column in an Azure table. The column may be a normal column or the RowKey column,
/// but cannot be the PartitionKey column.
/// The column to be transformed is specified using the following extended properties.
/// Extended Properties
///   columnName - Name of the column to be transformed
///   columnType - Data type of the column. Only supported types right now are: int32, bool, and string
///   ifColumnValueMatches - The transformation is applied only if the contents of column value matches the specified value.
///   replaceColumnValueWith - Replace the contents of the matched column value with the specified value.
///   ifRowKeyContains - The transformation is applied only if the contents of row key contains the specified value.
///   replaceRowKeySubStrWith - Replace the contents of the matched row key with the specified value to generate a new row key.
///   rowKeyPrefixes - Rowkey prefixes of the rows in which the column transformation will be applied. This is optional
///                    and will identify the subset of rows to do this operation.
/// You can specify columnName,columnType,ifColumnValueMatches,replaceColumnValueWith or
/// ifRowKeyContains,replaceRowKeySubStrWith or both as they work on different column types.
/// Within each group, every property becomes mandatory as soon as the first one (columnName / ifRowKeyContains) is given.
/// Extended Properties Example
///   "columnName": "IdentityProviderType",
///   "columnType": "string",
///   "ifColumnValueMatches": "Beihai",
///   "replaceColumnValueWith": "AADS2S",
///   "ifRowKeyContains": "Beihai",
///   "replaceRowKeySubStrWith": "AADS2S"
/// Activity Operation
///   The activity iterates through all the rows from the input table with the matching rowKeyPrefixes,
///   checks for the column, applies the column transformation if the column value match is found,
///   checks for the row key update, applies the row key transformation if the row key match is found,
///   runs a replace table operation in case of column transformation only,
///   runs a delete insert operation in case of row key transformation.
/// </summary>
/// <param name="linkedServices">Linked services referenced by activity definition.</param>
/// <param name="datasets">Datasets referenced by activity definition.</param>
/// <param name="activity">Activity definition.</param>
/// <param name="logger">Used to log messages during activity execution.</param>
/// <returns>Activity state at the end of execution (always an empty dictionary).</returns>
/// <exception cref="System.ArgumentException">
/// A companion extended property of columnName or ifRowKeyContains is missing, or ifRowKeyContains is empty.
/// </exception>
/// <exception cref="System.InvalidOperationException">
/// The first input dataset is not an Azure table dataset, or its linked service is not an Azure storage linked service.
/// </exception>
public IDictionary<string, string> Execute(
    IEnumerable<LinkedService> linkedServices,
    IEnumerable<Dataset> datasets,
    Microsoft.Azure.Management.DataFactories.Models.Activity activity,
    IActivityLogger logger)
{
    DotNetActivity dotNetActivity = (DotNetActivity)activity.TypeProperties;
    IDictionary<string, string> extendedProperties = dotNetActivity.ExtendedProperties;

    logger.Write("Logging extended properties if any...");
    foreach (KeyValuePair<string, string> entry in extendedProperties)
    {
        logger.Write("<key:{0}> <value:{1}>", entry.Key, entry.Value);
    }

    // Optional filter: only rows whose row key matches one of these prefixes are touched.
    string[] rowKeyPrefixes = null;
    string rowKeyPrefixList;
    if (extendedProperties.TryGetValue("rowKeyPrefixes", out rowKeyPrefixList))
    {
        rowKeyPrefixes = rowKeyPrefixList.Split(',');
    }

    bool hasPrefixFilter = rowKeyPrefixes != null && rowKeyPrefixes.Length > 0;

    // Column transformation settings: columnName switches the feature on, the other three are then required.
    string columnName;
    string columnType = string.Empty, ifColumnValueMatches = string.Empty, replaceColumnValueWith = string.Empty;
    bool hasColumnUpdate = extendedProperties.TryGetValue("columnName", out columnName);
    if (hasColumnUpdate)
    {
        columnType = GetRequiredExtendedProperty(extendedProperties, "columnType", "columnName");
        ifColumnValueMatches = GetRequiredExtendedProperty(extendedProperties, "ifColumnValueMatches", "columnName");
        replaceColumnValueWith = GetRequiredExtendedProperty(extendedProperties, "replaceColumnValueWith", "columnName");
    }
    else
    {
        columnName = string.Empty;
    }

    // Row key transformation settings: ifRowKeyContains switches the feature on.
    string ifRowKeyContains;
    string replaceRowKeySubStrWith = string.Empty;
    bool hasRowKeyUpdate = extendedProperties.TryGetValue("ifRowKeyContains", out ifRowKeyContains);
    if (hasRowKeyUpdate)
    {
        // string.Replace rejects an empty search string, so fail before any row is read.
        if (string.IsNullOrEmpty(ifRowKeyContains))
        {
            throw new System.ArgumentException("Row key match criteria must not be empty", "ifRowKeyContains");
        }

        replaceRowKeySubStrWith = GetRequiredExtendedProperty(extendedProperties, "replaceRowKeySubStrWith", "ifRowKeyContains");
    }
    else
    {
        ifRowKeyContains = string.Empty;
    }

    // For activities working on a single dataset, the first entry is the input dataset.
    // The activity.Inputs can have multiple datasets for building pipeline workflow dependencies. We can ignore the rest of the datasets
    string inputDatasetName = activity.Inputs.First().Name;
    Dataset inputDataset = datasets.Single(dataset => dataset.Name == inputDatasetName);

    AzureTableDataset sourceTable = inputDataset.Properties.TypeProperties as AzureTableDataset;
    if (sourceTable == null)
    {
        throw new System.InvalidOperationException(
            string.Format("Input dataset '{0}' is not an Azure table dataset", inputDataset.Name));
    }

    logger.Write("input table:{0}", sourceTable.TableName);

    AzureStorageLinkedService inputLinkedService = linkedServices.First(
        ls => ls.Name == inputDataset.Properties.LinkedServiceName).Properties.TypeProperties as AzureStorageLinkedService;
    if (inputLinkedService == null)
    {
        throw new System.InvalidOperationException(
            string.Format("Linked service '{0}' is not an Azure storage linked service", inputDataset.Properties.LinkedServiceName));
    }

    string inputConnectionString = inputLinkedService.ConnectionString;

    // create storage client for input. Pass the connection string.
    CloudStorageAccount inputStorageAccount = CloudStorageAccount.Parse(inputConnectionString);
    CloudTableClient inputTableClient = inputStorageAccount.CreateCloudTableClient();
    CloudTable inputTable = inputTableClient.GetTableReference(sourceTable.TableName);

    long totalProcessedRecords = 0;
    long actualAffectedRecords = 0;
    TableContinuationToken tableContinuationToken = null;
    List<Task> tasks = new List<Task>();

    do
    {
        var resultSegment = inputTable.ExecuteQuerySegmented(new TableQuery(), tableContinuationToken);
        tableContinuationToken = resultSegment.ContinuationToken;

        // Batches are built per partition key, so group the (optionally prefix-filtered) rows by partition.
        var partitionGroups = resultSegment.Results
            .Where(row => !hasPrefixFilter || this.IsMatch(row.RowKey, rowKeyPrefixes))
            .GroupBy(row => row.PartitionKey);

        foreach (IGrouping<string, DynamicTableEntity> g in partitionGroups)
        {
            TableBatchOperation batch = new TableBatchOperation();

            foreach (DynamicTableEntity e in g)
            {
                // Snapshot the identity and properties of the stored row before any transformation,
                // so the old row can still be addressed for deletion when the row key changes.
                string cachedRowkey = e.RowKey;
                IDictionary<string, EntityProperty> cachedProperties = new Dictionary<string, EntityProperty>();
                foreach (KeyValuePair<string, EntityProperty> p in e.Properties)
                {
                    cachedProperties.Add(p);
                }

                bool recordUpdated = false, requiresDelete = false;

                if (hasColumnUpdate)
                {
                    recordUpdated = this.ReplaceIfMatch(e, columnName, columnType, ifColumnValueMatches, replaceColumnValueWith);
                }

                if (hasRowKeyUpdate && e.RowKey.Contains(ifRowKeyContains))
                {
                    e.RowKey = e.RowKey.Replace(ifRowKeyContains, replaceRowKeySubStrWith);

                    // Only a row key that really differs from the stored one needs insert + delete.
                    // Queuing both operations for an unchanged key would put the same entity into
                    // the batch twice, which the table service rejects.
                    if (e.RowKey != cachedRowkey)
                    {
                        recordUpdated = true;
                        requiresDelete = true;
                    }
                }

                if (recordUpdated)
                {
                    if (requiresDelete)
                    {
                        batch.Insert(e);
                        batch.Delete(new DynamicTableEntity(e.PartitionKey, cachedRowkey, "*", cachedProperties));
                    }
                    else
                    {
                        batch.Replace(e);
                    }

                    actualAffectedRecords++;
                    logger.Write("<partition key:{0}>, <row key:{1}> added to batch", e.PartitionKey, e.RowKey);
                }
            }

            if (batch.Count > 0)
            {
                tasks.Add(inputTable.ExecuteBatchInChunkAsync(batch));
            }

            logger.Write("Updated partition: {0}", g.Key);
        }

        totalProcessedRecords += resultSegment.Results.Count;
        logger.Write("Processed records count: {0}", totalProcessedRecords);
        logger.Write("Affected records count: {0}", actualAffectedRecords);
    }
    while (tableContinuationToken != null);

    // The batches were started asynchronously; wait for all of them before reporting completion.
    Task.WaitAll(tasks.ToArray());
    logger.Write("Updated {0} records", actualAffectedRecords);

    return new Dictionary<string, string>();
}

/// <summary>
/// Reads an extended property that is mandatory once another property has been specified.
/// </summary>
/// <param name="extendedProperties">Extended properties of the activity.</param>
/// <param name="key">Name of the mandatory property.</param>
/// <param name="requiredBy">Name of the property whose presence makes <paramref name="key"/> mandatory.</param>
/// <returns>The configured value of <paramref name="key"/>.</returns>
/// <exception cref="System.ArgumentException">The property is not present.</exception>
private static string GetRequiredExtendedProperty(IDictionary<string, string> extendedProperties, string key, string requiredBy)
{
    string value;
    if (!extendedProperties.TryGetValue(key, out value))
    {
        throw new System.ArgumentException(
            string.Format("Extended property '{0}' is required when '{1}' is specified", key, requiredBy),
            key);
    }

    return value;
}
/// <summary>
/// Placeholder activity implementation: performs no work.
/// </summary>
/// <param name="linkedServices">Linked services referenced by activity definition (unused).</param>
/// <param name="datasets">Datasets referenced by activity definition (unused).</param>
/// <param name="activity">Activity definition (unused).</param>
/// <param name="logger">Used to log messages during activity execution (unused).</param>
/// <returns>An empty dictionary; never <c>null</c>.</returns>
public IDictionary<string, string> Execute(
    IEnumerable<DfModels.LinkedService> linkedServices,
    IEnumerable<DfModels.Dataset> datasets,
    DfModels.Activity activity,
    DfRuntime.IActivityLogger logger)
{
    // do your elephant business

    // Return an empty dictionary rather than null so that callers can enumerate
    // the result without a null check.
    return new Dictionary<string, string>();
}
/// <summary>
/// Loads every input dataset of the activity through a matching dataset provider, runs the transform named by
/// the "transform" extended property over the loaded data, and saves the result to every output dataset.
/// The processed slice is taken from the "SliceStart" and "SliceEnd" extended properties.
/// </summary>
/// <param name="linkedServices">Linked services referenced by activity definition.</param>
/// <param name="datasets">Datasets referenced by activity definition.</param>
/// <param name="activity">Activity definition.</param>
/// <param name="logger">Used to log messages during activity execution.</param>
/// <returns>An empty dictionary.</returns>
public IDictionary<string, string> Execute(
    IEnumerable<LinkedService> linkedServices,
    IEnumerable<Dataset> datasets,
    ADFActivity activity,
    IActivityLogger logger)
{
    logger.Write("Actiity start.\n");

    Func<string, LinkedService> linkedServiceResolver =
        serviceName => linkedServices.Single(service => service.Name == serviceName);
    IEnumerable<string> inputNames = activity.Inputs.Select(input => input.Name);
    IEnumerable<string> outputNames = activity.Outputs.Select(output => output.Name);
    IList<CustomDbDataset> dbDatasets = new List<CustomDbDataset>();

    // Read the slice window and the transform from the activity's extended properties.
    var dotNetActivity = (DotNetActivity)activity.TypeProperties;
    Slice slice = new Slice();
    slice.Start = Convert.ToDateTime(dotNetActivity.ExtendedProperties["SliceStart"].ToString());
    slice.End = Convert.ToDateTime(dotNetActivity.ExtendedProperties["SliceEnd"].ToString());
    string transform = dotNetActivity.ExtendedProperties["transform"].ToString();
    logger.Write("Slice from {0} to {1}\n", slice.Start, slice.End);

    // Create one provider per distinct dataset name; datasets of an unknown kind get no provider.
    logger.Write("create providers\n");
    IDictionary<string, IDatasetProvider> providers = new Dictionary<string, IDatasetProvider>();
    foreach (string datasetName in inputNames.Concat(outputNames).Distinct())
    {
        Dataset dataset = datasets.Single(candidate => candidate.Name == datasetName);
        LinkedService linkedService = linkedServiceResolver(dataset.Properties.LinkedServiceName);

        if (CustomDbDatasetProvider.IsMatch(dataset, linkedService))
        {
            var dbProvider = new CustomDbDatasetProvider(dataset, linkedService, linkedServiceResolver);
            dbDatasets.Add(dbProvider.Dataset);
            logger.Write("{0} is CustomDbDataset\n", datasetName);
            providers.Add(datasetName, dbProvider);
        }
        else if (CustomAzureBlobProvider.IsMatch(dataset, linkedService))
        {
            IDatasetProvider customBlobProvider = new CustomAzureBlobProvider(dataset, linkedService, slice);
            logger.Write("{0} is CustomAzureBlobDataset\n", datasetName);
            providers.Add(datasetName, customBlobProvider);
        }
        else if (AzureBlobProvider.IsMatch(dataset, linkedService))
        {
            IDatasetProvider blobProvider = new AzureBlobProvider(dataset, linkedService, slice);
            logger.Write("{0} is AzureBlobDataset\n", datasetName);
            providers.Add(datasetName, blobProvider);
        }
        else
        {
            logger.Write("{0} is UnknownDataset\n", datasetName);
        }
    }

    // Inputs without a provider are skipped.
    inputNames = inputNames.Where(inputName => providers.ContainsKey(inputName)).ToList();

    // Build the model for the transform: one member per loaded input, keyed by the provider's instance name.
    dynamic model = new ExpandoObject();
    var modelMembers = (IDictionary<string, dynamic>)model;
    foreach (string inputName in inputNames)
    {
        logger.Write("Try load {0}\n", inputName);
        IDatasetProvider inputProvider = providers[inputName];
        modelMembers[inputProvider.InstanceName] = inputProvider.Load(slice);
        logger.Write("Success\n");
    }

    // transform
    logger.Write("Start transform\n");
    var transformer = new FactoryWorkerTransformer(dbDatasets, logger);
    dynamic transformed = transformer.Transform(transform, model, slice);
    logger.Write("End\n");

    // Save the transformed data to every output (outputs are not filtered by provider availability).
    foreach (string outputName in outputNames)
    {
        logger.Write("Try save {0}\n", outputName);
        IDatasetProvider outputProvider = providers[outputName];
        outputProvider.Save(slice, transformed);
        logger.Write("Success\n");
    }

    return new Dictionary<string, string>();
}
/// <summary>
/// Transforms Azure table partition key.
/// The partition key to be transformed is specified using the following extended properties.
/// Extended Properties
///   ifPartitionKeyContains - The transformation is applied only if the contents of partition key contains the specified value.
///   replacePartitionKeySubStrWith - Replace the contents of the matched partition key with the specified value to generate a new partition key.
///   rowKeyPrefixes - Rowkey prefixes of the rows in which the partition key transformation will be applied. This is optional
///                    and will identify the subset of rows to do this operation.
/// ifPartitionKeyContains,replacePartitionKeySubStrWith are mandatory; ifPartitionKeyContains must not be empty.
/// Extended Properties Example
///   "ifPartitionKeyContains": "Beihai",
///   "replacePartitionKeySubStrWith": "AADS2S"
/// Activity Operation
///   The activity iterates through all the rows from the input table with the matching rowKeyPrefixes,
///   checks for the partition key update, applies the partition key transformation if the partition key match is found,
///   runs an insert operation for entities with new partition key and, once that insert has succeeded,
///   a delete operation on existing entities with matching partition keys.
///   Entities whose partition key would not change are left untouched.
/// </summary>
/// <param name="linkedServices">Linked services referenced by activity definition.</param>
/// <param name="datasets">Datasets referenced by activity definition.</param>
/// <param name="activity">Activity definition.</param>
/// <param name="logger">Used to log messages during activity execution.</param>
/// <returns>Activity state at the end of execution (always an empty dictionary).</returns>
/// <exception cref="ArgumentException">A mandatory extended property is missing, or ifPartitionKeyContains is empty.</exception>
/// <exception cref="InvalidOperationException">
/// The first input dataset is not an Azure table dataset, or its linked service is not an Azure storage linked service.
/// </exception>
public IDictionary<string, string> Execute(
    IEnumerable<LinkedService> linkedServices,
    IEnumerable<Dataset> datasets,
    Microsoft.Azure.Management.DataFactories.Models.Activity activity,
    IActivityLogger logger)
{
    DotNetActivity dotNetActivity = (DotNetActivity)activity.TypeProperties;
    IDictionary<string, string> extendedProperties = dotNetActivity.ExtendedProperties;

    logger.Write("Logging extended properties if any...");
    foreach (KeyValuePair<string, string> entry in extendedProperties)
    {
        logger.Write("<key:{0}> <value:{1}>", entry.Key, entry.Value);
    }

    // Optional filter: only rows whose row key matches one of these prefixes are touched.
    string[] rowKeyPrefixes = null;
    string rowKeyPrefixList;
    if (extendedProperties.TryGetValue("rowKeyPrefixes", out rowKeyPrefixList))
    {
        rowKeyPrefixes = rowKeyPrefixList.Split(',');
    }

    bool hasPrefixFilter = rowKeyPrefixes != null && rowKeyPrefixes.Length > 0;

    // An empty search string is treated like a missing one: string.Replace rejects it,
    // so the activity would otherwise fail at the first row instead of up front.
    string ifPartitionKeyContains;
    if (!extendedProperties.TryGetValue("ifPartitionKeyContains", out ifPartitionKeyContains)
        || string.IsNullOrEmpty(ifPartitionKeyContains))
    {
        throw new ArgumentException("Partition key match criteria is required", "ifPartitionKeyContains");
    }

    string replacePartitionKeySubStrWith;
    if (!extendedProperties.TryGetValue("replacePartitionKeySubStrWith", out replacePartitionKeySubStrWith))
    {
        throw new ArgumentException("Partition key substring replacement value is required", "replacePartitionKeySubStrWith");
    }

    // For activities working on a single dataset, the first entry is the input dataset.
    // The activity.Inputs can have multiple datasets for building pipeline workflow dependencies. We can ignore the rest of the datasets
    string inputDatasetName = activity.Inputs.First().Name;
    Dataset inputDataset = datasets.Single(dataset => dataset.Name == inputDatasetName);

    AzureTableDataset sourceTable = inputDataset.Properties.TypeProperties as AzureTableDataset;
    if (sourceTable == null)
    {
        throw new InvalidOperationException(
            string.Format("Input dataset '{0}' is not an Azure table dataset", inputDataset.Name));
    }

    logger.Write("input table:{0}", sourceTable.TableName);

    AzureStorageLinkedService inputLinkedService = linkedServices.First(
        ls => ls.Name == inputDataset.Properties.LinkedServiceName).Properties.TypeProperties as AzureStorageLinkedService;
    if (inputLinkedService == null)
    {
        throw new InvalidOperationException(
            string.Format("Linked service '{0}' is not an Azure storage linked service", inputDataset.Properties.LinkedServiceName));
    }

    string inputConnectionString = inputLinkedService.ConnectionString;

    // create storage client for input. Pass the connection string.
    CloudStorageAccount inputStorageAccount = CloudStorageAccount.Parse(inputConnectionString);
    CloudTableClient inputTableClient = inputStorageAccount.CreateCloudTableClient();
    CloudTable inputTable = inputTableClient.GetTableReference(sourceTable.TableName);

    long totalProcessedRecords = 0;
    long actualAffectedRecords = 0;
    TableContinuationToken tableContinuationToken = null;
    List<Task> tasks = new List<Task>();

    do
    {
        var resultSegment = inputTable.ExecuteQuerySegmented(new TableQuery(), tableContinuationToken);
        tableContinuationToken = resultSegment.ContinuationToken;

        // Batches are built per partition key, so group the (optionally prefix-filtered) rows by partition.
        var partitionGroups = resultSegment.Results
            .Where(row => !hasPrefixFilter || this.IsMatch(row.RowKey, rowKeyPrefixes))
            .GroupBy(row => row.PartitionKey);

        foreach (IGrouping<string, DynamicTableEntity> g in partitionGroups)
        {
            TableBatchOperation deleteBatch = new TableBatchOperation();
            TableBatchOperation insertBatch = new TableBatchOperation();

            foreach (DynamicTableEntity e in g)
            {
                if (!e.PartitionKey.Contains(ifPartitionKeyContains))
                {
                    continue;
                }

                string newPartitionKey = e.PartitionKey.Replace(ifPartitionKeyContains, replacePartitionKeySubStrWith);
                if (newPartitionKey == e.PartitionKey)
                {
                    // The replacement is a no-op for this key. Writing the copy and then deleting
                    // the "original" would address the very same entity and remove the row.
                    continue;
                }

                DynamicTableEntity newEntity = new DynamicTableEntity(newPartitionKey, e.RowKey);
                foreach (KeyValuePair<string, EntityProperty> property in e.Properties)
                {
                    newEntity.Properties.Add(property);
                }

                insertBatch.InsertOrReplace(newEntity);
                deleteBatch.Delete(e);
                actualAffectedRecords++;
            }

            // Both batches always grow together, so one check covers both.
            if (insertBatch.Count > 0)
            {
                tasks.Add(this.InsertThenDeleteAsync(inputTable, insertBatch, deleteBatch, logger));
            }

            logger.Write("Updated partition: {0}", g.Key);
        }

        totalProcessedRecords += resultSegment.Results.Count;
        logger.Write("Processed records count: {0}", totalProcessedRecords);
        logger.Write("Affected records count: {0}", actualAffectedRecords);
    }
    while (tableContinuationToken != null);

    // The per-partition work was started asynchronously; wait for all of it before reporting completion.
    Task.WaitAll(tasks.ToArray());
    logger.Write("Updated {0} records", actualAffectedRecords);

    return new Dictionary<string, string>();
}

/// <summary>
/// Writes the re-keyed copies of a partition and deletes the originals only after that write has completed
/// successfully, so that a failed insert can never leave the data deleted but not re-created.
/// </summary>
/// <param name="table">Table that holds both the original and the re-keyed entities.</param>
/// <param name="insertBatch">Batch that writes the entities under their new partition key.</param>
/// <param name="deleteBatch">Batch that removes the entities stored under the old partition key.</param>
/// <param name="logger">Used to log messages during activity execution.</param>
/// <returns>A task that completes when both batches have been executed.</returns>
private async Task InsertThenDeleteAsync(
    CloudTable table,
    TableBatchOperation insertBatch,
    TableBatchOperation deleteBatch,
    IActivityLogger logger)
{
    // ConfigureAwait(false): the caller blocks in Task.WaitAll, so continuations must not
    // depend on being marshalled back to the calling context.
    await this.RetryOnStorageTimeout(
        table.ExecuteBatchInChunkAsync(insertBatch), numRetriesOnTimeout, numMsDelayOnTimeout, logger).ConfigureAwait(false);
    await this.RetryOnStorageTimeout(
        table.ExecuteBatchInChunkAsync(deleteBatch), numRetriesOnTimeout, numMsDelayOnTimeout, logger).ConfigureAwait(false);
}
/// <summary>
/// Builds the parameters of a custom activity run from the JSON definitions stored next to a pipeline file.
/// The pipeline JSON is searched for the activity called <paramref name="activityName"/>; for each of its inputs
/// and outputs the file "&lt;dataset name&gt;.json" in the pipeline's directory is read, and for each linked
/// service referenced by those datasets or by the activity itself the file "&lt;linked service name&gt;.json".
/// </summary>
/// <param name="pipelinePath">Path of the pipeline JSON file.</param>
/// <param name="activityName">Name of the activity to look up in the pipeline.</param>
/// <param name="linkedServices">Receives the linked services used by the activity and its datasets.</param>
/// <param name="datasets">Receives the input and output datasets of the activity.</param>
/// <param name="activity">Receives the activity with its name, inputs and outputs.</param>
/// <exception cref="NotSupportedException">A dataset has a type other than "AzureBlob" or "AzureTable".</exception>
/// <exception cref="ArgumentException">The pipeline contains no activity called <paramref name="activityName"/>.</exception>
public static void InitParameters(
    string pipelinePath,
    string activityName,
    out List<Models.LinkedService> linkedServices,
    out List<Models.Dataset> datasets,
    out Models.Activity activity)
{
    // init the parameters
    linkedServices = new List<Models.LinkedService>();
    datasets = new List<Models.Dataset>();
    activity = new Models.Activity();

    // parse the pipeline json source
    var pipelineJson = File.ReadAllText(pipelinePath);
    var dummyPipeline = JsonConvert.DeserializeObject<Dummy.Pipeline>(pipelineJson);

    // all dataset and linked service definitions are looked up next to the pipeline file
    string pipelineDirectory = Path.GetDirectoryName(pipelinePath);

    foreach (var dummyActivity in dummyPipeline.Properties.Activities)
    {
        // find the relevant activity in the pipeline
        if (dummyActivity.Name != activityName)
        {
            continue;
        }

        activity.Name = dummyActivity.Name;

        // get the input and output tables
        var dummyDatasets = new HashSet<Dummy.ActivityData>();
        dummyDatasets.UnionWith(dummyActivity.Inputs);
        dummyDatasets.UnionWith(dummyActivity.Outputs);

        var dummyServices = new HashSet<Dummy.LinkedService>();

        // init the data tables
        foreach (var dummyDataset in dummyDatasets)
        {
            // parse the table json source
            var dataPath = Path.Combine(pipelineDirectory, dummyDataset.Name + ".json");
            var dataJson = File.ReadAllText(dataPath);
            var dummyTable = JsonConvert.DeserializeObject<Dummy.Table>(dataJson);

            // initialize dataset properties
            Models.DatasetTypeProperties datasetProperties;
            switch (dummyTable.Properties.Type)
            {
                case "AzureBlob":
                    datasetProperties = new Models.AzureBlobDataset
                    {
                        FolderPath = dummyTable.Properties.TypeProperties.FolderPath,
                        FileName = dummyTable.Properties.TypeProperties.FileName
                    };
                    break;

                case "AzureTable":
                    datasetProperties = new Models.AzureTableDataset
                    {
                        TableName = dummyTable.Properties.TypeProperties.TableName
                    };
                    break;

                default:
                    throw new NotSupportedException(
                        string.Format("Unexpected Dataset.Type {0}", dummyTable.Properties.Type));
            }

            // initialize dataset
            var dataDataset = new Models.Dataset(
                dummyDataset.Name,
                new Models.DatasetProperties(datasetProperties, new CommonModels.Availability(), ""));
            dataDataset.Properties.LinkedServiceName = dummyTable.Properties.LinkedServiceName;
            datasets.Add(dataDataset);

            // register the input or output in the activity
            if (dummyDataset is Dummy.ActivityInput)
            {
                activity.Inputs.Add(new CommonModels.ActivityInput(dummyDataset.Name));
            }

            if (dummyDataset is Dummy.ActivityOutput)
            {
                activity.Outputs.Add(new CommonModels.ActivityOutput(dummyDataset.Name));
            }

            // parse the linked service json source for later use
            var storageServicePath = Path.Combine(pipelineDirectory, dummyTable.Properties.LinkedServiceName + ".json");
            var storageServiceJson = File.ReadAllText(storageServicePath);
            var storageService = JsonConvert.DeserializeObject<Dummy.StorageService>(storageServiceJson);
            dummyServices.Add(storageService);
        }

        // parse the hd insight service json source
        var computeServicePath = Path.Combine(pipelineDirectory, dummyActivity.LinkedServiceName + ".json");
        var computeServiceJson = File.ReadAllText(computeServicePath);
        var computeService = JsonConvert.DeserializeObject<Dummy.ComputeService>(computeServiceJson);
        dummyServices.Add(computeService);

        // init the services
        foreach (var dummyService in dummyServices)
        {
            Models.LinkedService linkedService = null;

            // init if it is a storage service
            var dummyStorageService = dummyService as Dummy.StorageService;
            if (dummyStorageService != null)
            {
                var service = new Models.AzureStorageLinkedService();
                service.ConnectionString = dummyStorageService.Properties.TypeProperties.ConnectionString;
                linkedService = new Models.LinkedService(
                    dummyService.Name,
                    new Models.LinkedServiceProperties(service));
            }

            // init if it is a hd insight service
            if (dummyService is Dummy.ComputeService)
            {
                var service = new Models.HDInsightLinkedService();
                linkedService = new Models.LinkedService(
                    dummyService.Name,
                    new Models.LinkedServiceProperties(service));
            }

            linkedServices.Add(linkedService);
        }
    }

    // activity.Name is only assigned when a matching activity was found in the pipeline
    if (activity.Name == null)
    {
        throw new ArgumentException(string.Format("Activity {0} not found.", activityName), "activityName");
    }
}