public static async Task ClearAthenaTable(this EtlSettings etlSettings, string tableName, string s3Path) { if (etlSettings.SourceType != EtlSourceEnum.AmazonAthenaPipes) { return; } var pipesSource = etlSettings.AthenaQueryPipesSource; var athenaApi = etlSettings.CreatePipesSourceAthenaAPI(); Console.WriteLine($"DROP TABLE IF EXISTS {tableName}"); var executionId = await athenaApi.StartQuery($"DROP TABLE IF EXISTS {tableName}"); while (!await athenaApi.IsExecutionCompleted(executionId)) { Thread.Sleep(2000); } var s3Api = etlSettings.CreatePipesSourceS3API(); var s3Object = s3Path.ParseS3URI(); if (s3Object is S3Object) { Console.WriteLine($"Delete S3: {s3Path}"); var files = await s3Api.ListFiles(s3Object.Key, "/", s3Object.BucketName); await s3Api.Delete(files.Select(key => $"{s3Object.Key}{key}"), s3Object.BucketName); Console.WriteLine($"{s3Path}: {files.Count} S3 Files Deleted"); } }
public static async Task DeleteFromS3EventEtlList(this EtlSettings etlSettings, AWSS3API awsS3API, string listKey) { if (etlSettings.SourceType == EtlSourceEnum.S3BucketEvent) { var list = new List <S3EventHandler>(); if (await awsS3API.FileExists(listKey)) { try { var json = await awsS3API.ReadAsString(listKey); list = JsonConvert.DeserializeObject <List <S3EventHandler> >(json); } catch (Exception ex) { list = new List <S3EventHandler>(); } } // remove any entries that match that name list = list.Where(handler => handler.EtlName != etlSettings.Name).ToList(); // write back to s3 await awsS3API.UploadAsJson(listKey, list); } }
public static AWSS3API CreateTargetS3API(this EtlSettings etlSettings) { return(new AWSS3API(new AWSS3Options() { Key = etlSettings.TargetAWSKey, Secret = etlSettings.TargetAWSSecret, Region = etlSettings.TargetS3Region, Bucket = etlSettings.TargetS3BucketName })); }
public static AWSS3API CreatePipesSourceS3API(this EtlSettings etlSettings) { var athenaPipes = etlSettings.AthenaQueryPipesSource; if (athenaPipes == null) { throw new Exception("The ETL has an empty Athena source setting."); } return(new AWSS3API(new AWSS3Options() { Key = athenaPipes.Key, Secret = athenaPipes.Secret, Region = athenaPipes.Region })); }
public static AWSAthenaAPI CreateSourceAthenaAPI(this EtlSettings etlSettings) { var athena = etlSettings.AthenaQuerySource; if (athena == null) { throw new Exception("The ETL has an empty Athena source setting."); } return(new AWSAthenaAPI(new AWSAthenaOptions() { Key = athena.Key, Secret = athena.Secret, DefaultOutputLocation = athena.DefaultOutputLocation, Region = athena.Region })); }
public static async Task UpdateS3EventEtlList(this EtlSettings etlSettings, AWSS3API awsS3API, string listKey) { var list = new List <S3EventHandler>(); if (await awsS3API.FileExists(listKey)) { try { var json = await awsS3API.ReadAsString(listKey); list = JsonConvert.DeserializeObject <List <S3EventHandler> >(json); } catch (Exception ex) { list = new List <S3EventHandler>(); } } if (etlSettings.SourceType == EtlSourceEnum.S3BucketEvent) { // find the key and update var found = list.FirstOrDefault(handler => handler.EtlName == etlSettings.Name); if (found == null) { list.Add(new S3EventHandler() { EtlName = etlSettings.Name, BucketName = etlSettings.S3EventSource.BucketName, PathRegex = etlSettings.S3EventSource.PathRegex }); } else { found.BucketName = etlSettings.S3EventSource.BucketName; found.PathRegex = etlSettings.S3EventSource.PathRegex; } } else { // remove any entries that match that name list = list.Where(handler => handler.EtlName != etlSettings.Name).ToList(); } // write back to s3 await awsS3API.UploadAsJson(listKey, list); }
public static async Task GetAthenaQueryResultSampleByDate(this EtlSettings etlSettings, int lines) { var athena = etlSettings.AthenaQuerySource; if (athena == null) { throw new Exception("The ETL has an empty Athena source setting."); } var athenaApi = etlSettings.CreateSourceAthenaAPI(); var query = athena.AthenaSQL; var today = DateTime.Now; var date = today.AddDays(-athena.DaysAgo); query = query.Replace("{date}", date.ToString(athena.DateFormat)); query = rgxDateOffset.Replace(query, m => { var offset = int.Parse(m.Groups[1].Value); return(date.AddDays(offset).ToString(athena.DateFormat)); }); query += $"\nlimit {lines}"; var getResultRequest = await athenaApi.ExecuteQuery(query); var response = await athenaApi.ReadOneResult(getResultRequest); etlSettings.Mappings = response.ToFieldMapping(); // load data schema to the etlsetting schema var sample = new DataSample() { Rows = new List <DataRow>() }; var data = response.ReadData(); foreach (var row in data) { var dataRow = new DataRow() { Items = row.Select(item => item.ToString()).ToList() }; sample.Rows.Add(dataRow); } etlSettings.Sample = sample; }
public static async Task <List <string> > LoadAllPartitions(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI) { var results = new List <string>(); var targetS3Api = etlSettings.CreateTargetS3API(); var allPaths = await targetS3Api.ListPaths($"{etlSettings.TargetS3Prefix}/", "/"); foreach (var path in allPaths) { var dateKey = path.Replace("/", ""); await awsAthenaAPI.LoadPartition( $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`", $"`{etlSettings.DatePartitionKey}` = '{dateKey}'", $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/"); results.Add($"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/"); } return(results); }
/// <summary> /// run the athena query pipes /// </summary> /// <param name="etlSettings"></param> /// <param name="useDate"></param> /// <returns></returns> public static async Task RunAthenaQueryPipes(this EtlSettings etlSettings, DateTime?useDate = null) { if (etlSettings.SourceType != EtlSourceEnum.AmazonAthenaPipes) { return; } var pipesSource = etlSettings.AthenaQueryPipesSource; AthenaParserSetting parserSetting = new AthenaParserSetting(); parserSetting.DefaultExportPath = $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}".FixPathEnd(); parserSetting.DefaultTableName = $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`"; parserSetting.Date = useDate == null?DateTime.UtcNow.AddDays(-pipesSource.DaysAgo) : useDate.Value.AddDays(-pipesSource.DaysAgo); parserSetting.DateFormat = pipesSource.DateFormat; parserSetting.TempDatabase = pipesSource.TempDatabase; parserSetting.TempTablePath = pipesSource.TempDataPath.FixPathEnd(); var caches = JsonConvert.DeserializeObject <List <CacheSetting> >(etlSettings.AthenaQueryPipesSource.Caches); foreach (var cache in caches) { if (!cache.S3Path.EndsWith("/")) { cache.S3Path += "/"; } parserSetting.Caches.Add(cache.Key, cache); } var parsed = pipesSource.AthenaSQL.ParseAthenaPipes(parserSetting); await etlSettings.ExecuteControlFlow(parsed, parserSetting); var athenaApi = etlSettings.CreatePipesSourceAthenaAPI(); foreach (var kvp in parserSetting.Partitions) { await athenaApi.LoadPartitionIfNotExists(parserSetting.DefaultTableName, kvp.Key, kvp.Value); } }
public static async Task <bool> CreateAthenaTable(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI) { //create athena database if not exists if (etlSettings.AthenaDatabaseName == null || !regexAthena.IsMatch(etlSettings.AthenaDatabaseName)) { throw new Exception($@"Invalid Athena Database Name '{etlSettings.AthenaDatabaseName}'"); } ; if (etlSettings.AthenaTableName == null || !regexAthena.IsMatch(etlSettings.AthenaTableName)) { throw new Exception($@"Invalid Athena Table Name '{etlSettings.AthenaDatabaseName}'"); } if (etlSettings.Mappings == null || etlSettings.Mappings.Count == 0) { throw new Exception($@"No Fields found for ETL Setting '{etlSettings.Name}'"); } await awsAthenaAPI.ExecuteQuery($@"create database if not exists `{etlSettings.AthenaDatabaseName}`"); // drop the table if it exists await awsAthenaAPI.ExecuteQuery($@"drop table if exists `{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`"); var query = $@"CREATE EXTERNAL TABLE IF NOT EXISTS `{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`( {etlSettings.MapAthenaFields()} ) PARTITIONED BY ( `{etlSettings.DatePartitionKey}` string ) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' WITH SERDEPROPERTIES ( 'serialization.format' = '1' ) LOCATION 's3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/' "; await awsAthenaAPI.ExecuteQuery(query); return(true); }
public static string MakeTargetS3Key(this EtlSettings etlSettings, string dateKey, string filename, bool keepOriginalName, int parquetIndex) { return($"{etlSettings.MakeTargetS3Prefix(dateKey, filename, keepOriginalName)}{parquetIndex.ToString().PadLeft(5, '0')}.parquet"); }
public static async Task <List <string> > TransferAthenaQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI) { var result = new List <string>(); var athena = etlSettings.AthenaQuerySource; if (athena == null) { throw new Exception("The ETL has an empty Athena source setting."); } var athenaApi = etlSettings.CreateSourceAthenaAPI(); var query = athena.AthenaSQL; var today = DateTime.Now; var date = today.AddDays(-athena.DaysAgo); query = query.Replace("{date}", date.ToString(athena.DateFormat)); var dateKey = date.ToString("yyyyMMdd"); // var response = await athenaApi.ExecuteQuery(query); var getResultRequest = await athenaApi.ExecuteQuery(query); //var response = await athenaApi.ReadOneResult(getResultRequest); //var enumerator = response.ResultSet.Rows.GetEnumerator(); ResultSetMetadata resultSetMetadata = null; var enumerator = athenaApi.EnumerateRows(getResultRequest, res => resultSetMetadata = res.ResultSet.ResultSetMetadata).GetEnumerator(); List <Row> rows = new List <Row>(); int parquetIndex = 0; var targetS3 = etlSettings.CreateTargetS3API(); //skip first row; enumerator.MoveNext(); while (enumerator.MoveNext()) { rows.Add(enumerator.Current); if (rows.Count >= etlSettings.NumberOfItemsPerParquet) { var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex); await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key); result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}"); parquetIndex += 1; } } // write what ever left less than 200000 if (rows.Count > 0) { var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex); await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key); result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}"); parquetIndex += 1; } { // load partition to athena table await awsAthenaAPI.LoadPartition( $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`", $"`{etlSettings.DatePartitionKey}` = '{dateKey}'", $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/"); } return(result); }
public static async Task WriteResultRowsToS3Bucket(this AWSS3API awsS3Api, List <Row> rows, ResultSetMetadata metadata, EtlSettings etlSettings, string s3Key) { using (MemoryStream gaStream = new MemoryStream()) { gaStream.WriteAthenaRowsAsParquet(metadata, etlSettings.Mappings, rows); using (MemoryStream uploadStream = new MemoryStream(gaStream.ToArray())) { await awsS3Api.Upload(s3Key, uploadStream); } } rows.Clear(); }
public static async Task GetBigQueryResultSampleByDate(this EtlSettings etlSettings, int lines) { var awsS3Api = etlSettings.CreateTargetS3API(); var ga = etlSettings.GoogleAnalyticsQuerySource; Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}"); BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId); string sql = ga.BigQuerySQL; string dateQueryKey = DateTime.Now.AddDays(-ga.DaysAgo).ToString(ga.DateFormat); // make sure the query is limited by 20 sql = sql.Replace("{date}", dateQueryKey) + $"\nlimit {lines}"; var job = await client.CreateQueryJobAsync(sql, new List <BigQueryParameter>()); BigQueryResults results = null; results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions() { StartIndex = 0, PageSize = 20000, }); var enumerator = results.GetEnumerator(); List <BigQueryRow> rows = new List <BigQueryRow>(); while (enumerator.MoveNext()) { rows.Add(enumerator.Current); } // map schema to athena types etlSettings.Mappings = results.ToFieldMappings(); var sample = new DataSample() { Rows = new List <DataRow>() }; // convert big query data to sample data foreach (var row in rows) { sample.Rows.Add(new DataRow() { Items = row.RawRow.F.Select(item => { if (item.V == null) { return(""); } else if (item.V.GetType() == typeof(DateTime)) { return(((DateTime)item.V).ToString("o")); } else if (item.V.GetType() == typeof(byte[])) { return(Convert.ToBase64String((byte[])item.V)); } else { return(item.V.ToString()); } }).ToList() }); } etlSettings.Sample = sample; }
public static async Task <EtlReportResponse> GetReports(this EtlSettings etlSettings, EtlReportRequest request) { var awsS3Api = etlSettings.CreateTargetS3API(); var paths = await awsS3Api.ListPaths(etlSettings.TargetS3Prefix + "/", "/"); var dateFrom = DateTime.ParseExact(request.DateFrom, "yyyy-MM-dd", null); var dateTo = DateTime.ParseExact(request.DateTo, "yyyy-MM-dd", null); var dateIntFrom = int.Parse(dateFrom.ToString("yyyyMMdd")); var dateIntTo = int.Parse(dateTo.ToString("yyyyMMdd")); Regex dateKeyPathRegex = new Regex(@"^(\d+)\/$"); var dateKeys = paths.Where(p => dateKeyPathRegex.IsMatch(p)) .Where(p => { var dateKey = dateKeyPathRegex.Match(p).Groups[1].Value; var dateInt = int.Parse(dateKey); return(dateInt >= dateIntFrom && dateInt <= dateIntTo); }) .ToList(); var allObjectsList = await Task.WhenAll(dateKeys.Select(async dateKey => { return(await awsS3Api.ListAllObjectsInBucket(prefix: $"{etlSettings.TargetS3Prefix}/{dateKey}")); }).ToArray()); var allObjects = allObjectsList.Aggregate(new List <S3Object>(), (seed, list) => { if (list != null) { seed.AddRange(list); } return(seed); }); var partitionKey = etlSettings.DatePartitionKey; var partitionPrefexLength = etlSettings.TargetS3Prefix.Length + 1; // read all parquet files var allDictLists = await Task.WhenAll(allObjects.Select(async s3Obj => { using (var parquetStream = await awsS3Api.OpenReadAsync(s3Obj.Key)) { var dictList = parquetStream.ReadParquetAdDictData(etlSettings.Mappings.Select(m => m.MappedName).ToList()); var relativePath = s3Obj.Key.Substring(partitionPrefexLength); var dateKey = relativePath.Substring(0, relativePath.IndexOf("/")); foreach (var dict in dictList) { dict.Add(partitionKey, dateKey); } return(dictList); } }).ToArray()); var resultDictData = allDictLists.Aggregate(new List <Dictionary <string, object> >(), (seed, list) => { if (list != null) { seed.AddRange(list); } return(seed); }); var schema = etlSettings.Mappings.Select(m => m.MappedName).ToList(); schema.Add(partitionKey); return(new EtlReportResponse() { Name = request.Name, DateFrom = request.DateFrom, DateTo = request.DateTo, Schema = schema, Data = resultDictData }); }
public static async Task WriteResultRowsToS3Bucket(this AWSS3API awsS3Api, List <BigQueryRow> rows, BigQueryResults results, EtlSettings etlSettings, string s3Key) { using (MemoryStream gaStream = new MemoryStream()) { gaStream.WriteGARowsAsParquet(results.Schema, etlSettings.Mappings, rows); using (MemoryStream uploadStream = new MemoryStream(gaStream.ToArray())) { await awsS3Api.Upload(s3Key, uploadStream); } } rows.Clear(); }
/// <summary> /// /// </summary> /// <param name="etlSettings"></param> /// <returns></returns> public static async Task <List <string> > TransferBigQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI) { var result = new List <string>(); var awsS3Api = etlSettings.CreateTargetS3API(); var ga = etlSettings.GoogleAnalyticsQuerySource; Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}"); BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId); string sql = ga.BigQuerySQL; string dateQueryKey = DateTime.Now.AddDays(-ga.DaysAgo).ToString(ga.DateFormat); string dateKey = DateTime.Now.AddDays(-ga.DaysAgo).ToString("yyyyMMdd"); sql = sql.Replace("{date}", dateKey); var job = await client.CreateQueryJobAsync(sql, new List <BigQueryParameter>()); BigQueryResults results = null; results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions() { StartIndex = 0, PageSize = 20000, }); var enumerator = results.GetEnumerator(); List <BigQueryRow> rows = new List <BigQueryRow>(); int parquetIndex = 0; var targetS3 = etlSettings.CreateTargetS3API(); while (enumerator.MoveNext()) { rows.Add(enumerator.Current); if (rows.Count >= etlSettings.NumberOfItemsPerParquet) { var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex); await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key); result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}"); parquetIndex += 1; } } // write what ever left less than 200000 if (rows.Count > 0) { var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex); await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key); result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}"); parquetIndex += 1; } { // load partition to athena table await awsAthenaAPI.LoadPartition( $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`", $"`{etlSettings.DatePartitionKey}` = '{dateKey}'", $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/"); } return(result); }
public static string MakeTargetS3Prefix(this EtlSettings etlSettings, string dateKey, string filename, bool keepOriginalName) { return($"{etlSettings.TargetS3Prefix}/{dateKey}/{(keepOriginalName ? filename + "/" : "")}"); }
public static string TargetFlagFile(this EtlSettings etlSettings, string filename) { return($"{etlSettings.TargetS3Prefix}/{filename}"); }
public static async Task <List <string> > TransferCsvStream(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, Stream stream, string dateKey, string filename, bool keepOriginalName) { var result = new List <string>(); var config = new CsvConfiguration(CultureInfo.InvariantCulture) { Delimiter = etlSettings.CsvSourceOptoins.Delimiter }; var csvStream = stream; if (etlSettings.CsvSourceOptoins.GZip) { csvStream = new GZipStream(stream, CompressionMode.Decompress); } using (var csvStreamReader = new StreamReader(csvStream)) { using (var csvReader = new CsvReader(csvStreamReader, config)) { var headers = new List <string>(); int parquetIndex = 0; var targetS3 = etlSettings.CreateTargetS3API(); if (etlSettings.HasHeader) { csvReader.Read(); string header = null; int index = 0; while (csvReader.TryGetField(index, out header)) { headers.Add(header); index++; } } var mappings = etlSettings.Mappings.ToDictionary(m => m.SourceFieldName, m => m); List <List <string> > data = new List <List <string> >(); while (csvReader.Read()) { int index = 0; string value = null; var row = new List <string>(); while (csvReader.TryGetField(index, out value)) { if (headers.Count == index) { headers.Add($"Col{index}"); } row.Add(value); index++; } data.Add(row); if (data.Count >= etlSettings.NumberOfItemsPerParquet) { var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex); using (var bufferStream = new MemoryStream()) { bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data); await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray())); } data.Clear(); result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}"); parquetIndex++; } } { var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex); using (var bufferStream = new MemoryStream()) { bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data); await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray())); } data.Clear(); result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}"); parquetIndex++; } { // load partition to athena table await awsAthenaAPI.LoadPartition( $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`", $"`{etlSettings.DatePartitionKey}` = '{dateKey}'", $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/"); } { // upload the flag file var s3key = etlSettings.TargetFlagFile(filename); await targetS3.Upload(s3key, new MemoryStream(Encoding.UTF8.GetBytes("OK"))); result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}"); } } } return(result); }
private static string MapAthenaFields(this EtlSettings etlSettings) { return(string.Join(",\n", etlSettings.Mappings.Select(field => $"`{field.MappedName}` {field.MapAthenaField()}"))); }
public static async Task <List <string> > TransferData(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, GenericLogger logger = null, DateTime?useDate = null) { var result = new List <string>(); logger?.Log?.Invoke($"ETL Mode: {etlSettings.SourceType}"); switch (etlSettings.SourceType) { case EtlSourceEnum.SFTP: { var sftp = etlSettings.SFTPSource; var nameRegex = new Regex(sftp.PathRegex); var dateRegex = new Regex(sftp.DateKeyRegex); using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password)) { sftpClient.Connect(); var files = sftpClient.ListDirectory(sftp.BasePath); files = files .Where(f => nameRegex.IsMatch(f.FullName) && dateRegex.IsMatch(f.Name)) .OrderByDescending(f => f.Name) .ToList(); // find in the target to work out if there is the corresponding parquet file var targetS3 = etlSettings.CreateTargetS3API(); SftpFile first = null; foreach (var file in files) { Console.WriteLine($"Check File: {file.FullName}"); var s3Key = etlSettings.TargetFlagFile(file.Name); if (!await targetS3.FileExists(s3Key)) { first = file; break; } } // transfer that file if (first != null) { Console.WriteLine($"Transfer File: {first.FullName}"); var dateKey = first.Name.MakeRegexExtraction(dateRegex); using (var sftpStream = sftpClient.OpenRead(first.FullName)) { result = await etlSettings.TransferCsvStream(awsAthenaAPI, sftpStream, dateKey, first.Name, false); } } sftpClient.Disconnect(); } } break; case EtlSourceEnum.S3BucketCheck: { } break; case EtlSourceEnum.S3BucketEvent: { var sourceAwsS3Api = new AWSS3API(new AWSS3Options() { Key = etlSettings.S3EventSource.Key, Secret = etlSettings.S3EventSource.Secret, Bucket = etlSettings.S3EventSource.BucketName, Region = etlSettings.S3EventSource.Region }); var s3Event = etlSettings.S3EventSource; var nameRegex = new Regex(s3Event.PathRegex); var keyRegex = new Regex(s3Event.FileNameRegex); // do nothing if it does not match the path pattern if (!nameRegex.IsMatch(s3Event.ExamplePath) || (!keyRegex.IsMatch(s3Event.ExamplePath))) { return(result); } // generate dateKey var dateKey = DateTime.UtcNow.ToString("yyyyMMdd"); Regex dateRegex = null; if (!s3Event.UseEventDateAsDateKey) { dateRegex = new Regex(s3Event.DateKeyRegex); if (!dateRegex.IsMatch(s3Event.ExamplePath)) { return(result); } dateKey = s3Event.ExamplePath.MakeRegexExtraction(dateRegex); } // generate file name var filename = s3Event.ExamplePath.MakeRegexExtraction(keyRegex); // it will overwrite by default we need to workout datekey first of all var prefixUpToDate = etlSettings.MakeTargetS3Prefix(dateKey, filename, true); // check files that should be deleted var targetAwsS3Api = etlSettings.CreateTargetS3API(); var oldObjects = await targetAwsS3Api.ListAllObjectsInBucket(prefix : prefixUpToDate); // delete the files with those prefix foreach (var oldObj in oldObjects) { await targetAwsS3Api.Delete(oldObj.Key); } // open file stream and transfer data using (var awsS3Stream = await sourceAwsS3Api.OpenReadAsync(s3Event.ExamplePath)) { result = await etlSettings.TransferCsvStream(awsAthenaAPI, awsS3Stream, dateKey, filename, true); } } break; case EtlSourceEnum.GoogleAnalytics: { result = await etlSettings.TransferBigQueryResultByDate(awsAthenaAPI, useDate); } break; case EtlSourceEnum.AmazonAthena: { result = await etlSettings.TransferAthenaQueryResultByDate(awsAthenaAPI, useDate); } break; case EtlSourceEnum.AmazonAthenaPipes: { await etlSettings.RunAthenaQueryPipes(useDate); } break; } return(result); }
/// <summary> /// compile the pipes and run the definition query /// </summary> /// <param name="etlSettings"></param> /// <returns></returns> public static async Task ParseAthenaQueryPipes(this EtlSettings etlSettings) { if (etlSettings.SourceType != EtlSourceEnum.AmazonAthenaPipes) { return; } var pipesSource = etlSettings.AthenaQueryPipesSource; AthenaParserSetting parserLogger = new AthenaParserSetting(); parserLogger.DefaultExportPath = $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}".FixPathEnd(); parserLogger.DefaultTableName = $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`"; parserLogger.Date = DateTime.UtcNow.AddDays(-pipesSource.DaysAgo); parserLogger.DateFormat = pipesSource.DateFormat; parserLogger.TempDatabase = pipesSource.TempDatabase; parserLogger.TempTablePath = pipesSource.TempDataPath.FixPathEnd(); pipesSource.ParseErrors = ""; if (!string.IsNullOrWhiteSpace(etlSettings.AthenaQueryPipesSource.Caches)) { try { var caches = JsonConvert.DeserializeObject <List <CacheSetting> >(etlSettings.AthenaQueryPipesSource.Caches); foreach (var cache in caches) { if (!cache.S3Path.EndsWith("/")) { cache.S3Path += "/"; } parserLogger.Caches.Add(cache.Key, cache); } } catch (Exception ex) { pipesSource.ParseErrors += ex.Message; pipesSource.ParseErrors += "\n"; } } try { var parsed = pipesSource.AthenaSQL.ParseAthenaPipes(parserLogger); pipesSource.ParsedQuery = parsed.ToQueryString(); } catch (Exception ex) { pipesSource.ParseErrors = parserLogger.ToString(); pipesSource.ParseErrors += "\n"; pipesSource.ParseErrors += ex.Message; pipesSource.ParsedQuery = ""; } // run if there is definition query if (Regex.IsMatch(pipesSource.AthenaDefinitionSQL, @"\S+")) { var athenaApi = etlSettings.CreatePipesSourceAthenaAPI(); var getResultRequest = await athenaApi.ExecuteQuery(pipesSource.AthenaDefinitionSQL); var response = await athenaApi.ReadOneResult(getResultRequest); etlSettings.Mappings = response.ToFieldMapping(); // load data schema to the etlsetting schema var sample = new DataSample() { Rows = new List <DataRow>() }; var data = response.ReadData(); foreach (var row in data) { var dataRow = new DataRow() { Items = row.Select(item => item.ToString()).ToList() }; sample.Rows.Add(dataRow); } etlSettings.Sample = sample; } }
public static void ReadFromCSVFile(this EtlSettings etlSettings, Stream stream, int lines = 20) { var newMapptings = new List <FieldMapping>(); etlSettings.Sample = new DataSample() { Rows = new List <DataRow>() }; var config = new CsvConfiguration(CultureInfo.InvariantCulture) { Delimiter = etlSettings.CsvSourceOptoins.Delimiter }; var csvStream = stream; if (etlSettings.CsvSourceOptoins.GZip) { csvStream = new GZipStream(stream, CompressionMode.Decompress); } using (var streamReader = new StreamReader(csvStream)) { using (var csvReader = new CsvReader(streamReader, config)) { int numberOfColoumns = 0; if (etlSettings.HasHeader) { if (csvReader.Read()) { var value = ""; int i = 0; while (csvReader.TryGetField(i, out value)) { newMapptings.Add(new FieldMapping() { SourceFieldName = value, MappedName = value.ToMappedName() }); i++; } numberOfColoumns = i; } } int rowCount = 0; while (csvReader.Read() && rowCount < lines) { var value = ""; int i = 0; var row = new List <string>(); while (csvReader.TryGetField(i, out value)) { row.Add(value); i++; } numberOfColoumns = numberOfColoumns <= i ? numberOfColoumns : i; etlSettings.Sample.Rows.Add(new DataRow() { Items = row }); rowCount++; } if (!etlSettings.HasHeader) { for (int i = 0; i < numberOfColoumns; i++) { var field = new FieldMapping() { SourceFieldName = $"Col{i}", MappedName = $"Col{i}" }; newMapptings.Add(field); } } for (int i = 0; i < newMapptings.Count; i++) { newMapptings[i].MappedType = etlSettings.Sample.Rows .Select(row => row.Items.Count > i ? row.Items[i].ToString() : "") .DetectTypeString() .DetectedTypeToAthenaType(); } // update the mappings if (etlSettings.Mappings != null && etlSettings.Mappings.Count > 0) { var oldMappings = etlSettings.Mappings; etlSettings.Mappings = new List <FieldMapping>(); for (int i = 0; i < newMapptings.Count; i++) { if (oldMappings.Count > i && oldMappings[i].SourceFieldName == newMapptings[i].SourceFieldName) { etlSettings.Mappings.Add(oldMappings[i]); } else { etlSettings.Mappings.Add(newMapptings[i]); } } } else { etlSettings.Mappings = newMapptings; } } } }
public static StateMachineQueryContext BuildStateMachineQueryContext(this EtlSettings etlSettings, DateTime?useDate = null) { if (etlSettings.SourceType != EtlSourceEnum.AmazonAthenaPipes) { return(null); } var pipesSource = etlSettings.AthenaQueryPipesSource; AthenaParserSetting parserSetting = new AthenaParserSetting(); parserSetting.DefaultExportPath = $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}".FixPathEnd(); parserSetting.DefaultTableName = $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`"; parserSetting.Date = useDate == null?DateTime.UtcNow.AddDays(-pipesSource.DaysAgo) : useDate.Value.AddDays(-pipesSource.DaysAgo); parserSetting.DateFormat = pipesSource.DateFormat; parserSetting.TempDatabase = pipesSource.TempDatabase; parserSetting.TempTablePath = pipesSource.TempDataPath.FixPathEnd(); var caches = JsonConvert.DeserializeObject <List <CacheSetting> >(etlSettings.AthenaQueryPipesSource.Caches); foreach (var cache in caches) { if (!cache.S3Path.EndsWith("/")) { cache.S3Path += "/"; } parserSetting.Caches.Add(cache.Key, cache); } StateMachineQueryContext context = new StateMachineQueryContext(); context.raw = pipesSource.AthenaSQL; context.settings = new StateMachineSettings() { DefaultExportPath = parserSetting.DefaultExportPath, DefaultTableName = parserSetting.DefaultTableName, Date = parserSetting.Date, DateFormat = parserSetting.DateFormat, TempDatabase = parserSetting.TempDatabase, TempTablePath = parserSetting.TempTablePath, Caches = parserSetting.Caches.Values.ToList(), Clearings = parserSetting.Clearings.Select(kvp => new KeyValueEntry() { Key = kvp.Key, Value = kvp.Value }).ToList(), Commands = parserSetting.Commands, DroppingTables = parserSetting.DroppingTables, Partitions = parserSetting.Partitions.Select(kvp => new KeyValueEntry() { Key = kvp.Key, Value = kvp.Value }).ToList(), Variables = parserSetting.Variables.Select(kvp => new KeyValueEntry() { Key = kvp.Key, Value = kvp.Value }).ToList() }; //var parsed = pipesSource.AthenaSQL.ParseAthenaPipes(parserSetting); return(context); //var athenaApi = etlSettings.CreatePipesSourceAthenaAPI(); //foreach (var kvp in parserSetting.Partitions) //{ // await athenaApi.LoadPartitionIfNotExists(parserSetting.DefaultTableName, kvp.Key, kvp.Value); //} }
public static async Task <EtlSettings> ReadEtlSampleData(this EtlSettings etlSettings, int lines = 20) { etlSettings.Sample = new DataSample(); switch (etlSettings.SourceType) { case EtlSourceEnum.SFTP: { var sftp = etlSettings.SFTPSource; var nameRegex = new Regex(sftp.PathRegex); using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password)) { sftpClient.Connect(); var files = sftpClient.ListDirectory(sftp.BasePath); files = files.Where(f => nameRegex.IsMatch(f.FullName)).ToList(); var first = files.FirstOrDefault(); if (first != null) { switch (etlSettings.FileType) { case EtlFileType.CSV: { using (var sftpStream = sftpClient.OpenRead(first.FullName)) { etlSettings.ReadFromCSVFile(sftpStream, lines); } } break; } } sftpClient.Disconnect(); } } break; case EtlSourceEnum.S3BucketCheck: { var s3 = etlSettings.S3CheckSource; var awsS3API = new AWSS3API(new AWSS3Options() { Key = s3.Key, Secret = s3.Secret, Bucket = s3.BucketName, Region = s3.Region, }); var objects = await awsS3API.ListAllObjectsInBucket(s3.BucketName, s3.Prefix); var nameRegex = new Regex(s3.PathRegex); objects = objects.Where(f => nameRegex.IsMatch(f.Key)).ToList(); var first = objects.FirstOrDefault(); if (first != null) { switch (etlSettings.FileType) { case EtlFileType.CSV: { using (var s3Stream = await awsS3API.OpenReadAsync(first.Key, first.BucketName)) { etlSettings.ReadFromCSVFile(s3Stream, lines); } } break; } } } break; case EtlSourceEnum.S3BucketEvent: { var s3 = etlSettings.S3EventSource; var awsS3API = new AWSS3API(new AWSS3Options() { Key = s3.Key, Secret = s3.Secret, Bucket = s3.BucketName, Region = s3.Region, }); if (await awsS3API.FileExists(s3.ExamplePath, s3.BucketName)) { switch (etlSettings.FileType) { case EtlFileType.CSV: { using (var s3Stream = await awsS3API.OpenReadAsync(s3.ExamplePath, s3.BucketName)) { etlSettings.ReadFromCSVFile(s3Stream, lines); } } break; } } } break; case EtlSourceEnum.GoogleAnalytics: { await etlSettings.GetBigQueryResultSampleByDate(lines); } break; case EtlSourceEnum.AmazonAthena: { await etlSettings.GetAthenaQueryResultSampleByDate(lines); } break; case EtlSourceEnum.AmazonAthenaPipes: { // need to compile the query await etlSettings.ParseAthenaQueryPipes(); } break; } // make the sample data smaller foreach (var row in etlSettings.Sample.Rows.ToList()) { row.Items = row.Items.Select(item => item.Length < 100 ? item : item.Substring(0, 50) + "..." + item.Substring(item.Length - 50)).ToList(); } return(etlSettings); }