/// <summary>
/// Registers every date partition found under the target S3 prefix with the Athena table.
/// </summary>
public static async Task<List<string>> LoadAllPartitions(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
{
    var results = new List<string>();
    var targetS3Api = etlSettings.CreateTargetS3API();
    var allPaths = await targetS3Api.ListPaths($"{etlSettings.TargetS3Prefix}/", "/");
    foreach (var path in allPaths)
    {
        // each folder name under the prefix is a date key like "20190101/"
        var dateKey = path.Replace("/", "");
        var partitionLocation = $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/";
        await awsAthenaAPI.LoadPartition(
            $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
            $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
            partitionLocation);
        results.Add(partitionLocation);
    }
    return results;
}
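// Usage sketch (hypothetical driver code; assumes an EtlSettings instance and an
// AWSAthenaAPI client are already configured elsewhere):
//
//   var locations = await etlSettings.LoadAllPartitions(athenaApi);
//   locations.ForEach(Console.WriteLine); // one s3://... URI per registered partition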
/// <summary>
/// Runs the configured Athena source query for a given date, writes the results to
/// parquet files on S3, and loads the resulting date partition into the Athena table.
/// </summary>
/// <param name="etlSettings">The ETL settings that describe the source query and target table.</param>
/// <param name="awsAthenaAPI">The Athena API used to load the partition.</param>
/// <param name="useDate">Optional base date for the query; defaults to today.</param>
/// <returns>The S3 locations of the parquet files that were written.</returns>
public static async Task<List<string>> TransferAthenaQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, DateTime? useDate = null)
{
    var result = new List<string>();
    var athena = etlSettings.AthenaQuerySource;
    if (athena == null)
    {
        throw new Exception("The ETL has an empty Athena source setting.");
    }
    var athenaApi = etlSettings.CreateSourceAthenaAPI();
    var query = athena.AthenaSQL;
    var today = useDate ?? DateTime.Now;
    var date = today.AddDays(-athena.DaysAgo);
    query = query.Replace("{date}", date.ToString(athena.DateFormat));
    var dateKey = date.ToString("yyyyMMdd");
    var getResultRequest = await athenaApi.ExecuteQuery(query);
    ResultSetMetadata resultSetMetadata = null;
    var enumerator = athenaApi
        .EnumerateRows(getResultRequest, res => resultSetMetadata = res.ResultSet.ResultSetMetadata)
        .GetEnumerator();
    List<Row> rows = new List<Row>();
    int parquetIndex = 0;
    var targetS3 = etlSettings.CreateTargetS3API();
    // skip the header row
    enumerator.MoveNext();
    while (enumerator.MoveNext())
    {
        rows.Add(enumerator.Current);
        if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
        {
            var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
            await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            parquetIndex += 1;
            rows.Clear(); // start a fresh batch, otherwise earlier rows are re-written into every file
        }
    }
    // write whatever is left over (fewer than NumberOfItemsPerParquet rows)
    if (rows.Count > 0)
    {
        var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
        await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);
        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
        parquetIndex += 1;
    }
    // load the partition into the Athena table
    await awsAthenaAPI.LoadPartition(
        $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
        $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
        $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
    return result;
}
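// Usage sketch (hypothetical; assumes settings with an AthenaQuerySource configured):
//
//   var written = await etlSettings.TransferAthenaQueryResultByDate(athenaApi, useDate: DateTime.UtcNow);
//   // written contains the s3://... URI of every parquet file produced for the date partition.
//
// The batching design keeps each parquet object at most NumberOfItemsPerParquet rows,
// so no single file grows unbounded for large query results.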
/// <summary>
/// Reads back all parquet files whose date partitions fall inside the requested range
/// and returns their rows, together with the schema, as a report response.
/// </summary>
public static async Task<EtlReportResponse> GetReports(this EtlSettings etlSettings, EtlReportRequest request)
{
    var awsS3Api = etlSettings.CreateTargetS3API();
    var paths = await awsS3Api.ListPaths(etlSettings.TargetS3Prefix + "/", "/");
    var dateFrom = DateTime.ParseExact(request.DateFrom, "yyyy-MM-dd", null);
    var dateTo = DateTime.ParseExact(request.DateTo, "yyyy-MM-dd", null);
    var dateIntFrom = int.Parse(dateFrom.ToString("yyyyMMdd"));
    var dateIntTo = int.Parse(dateTo.ToString("yyyyMMdd"));
    Regex dateKeyPathRegex = new Regex(@"^(\d+)\/$");
    // keep only the date-keyed folders that fall inside the requested range
    var dateKeys = paths
        .Where(p => dateKeyPathRegex.IsMatch(p))
        .Where(p =>
        {
            var dateKey = dateKeyPathRegex.Match(p).Groups[1].Value;
            var dateInt = int.Parse(dateKey);
            return dateInt >= dateIntFrom && dateInt <= dateIntTo;
        })
        .ToList();
    var allObjectsList = await Task.WhenAll(dateKeys
        .Select(dateKey => awsS3Api.ListAllObjectsInBucket(prefix: $"{etlSettings.TargetS3Prefix}/{dateKey}"))
        .ToArray());
    var allObjects = allObjectsList.Aggregate(new List<S3Object>(), (seed, list) =>
    {
        if (list != null)
        {
            seed.AddRange(list);
        }
        return seed;
    });
    var partitionKey = etlSettings.DatePartitionKey;
    var partitionPrefixLength = etlSettings.TargetS3Prefix.Length + 1;
    // read every parquet file and tag each row with its date partition key
    var allDictLists = await Task.WhenAll(allObjects.Select(async s3Obj =>
    {
        using (var parquetStream = await awsS3Api.OpenReadAsync(s3Obj.Key))
        {
            var dictList = parquetStream.ReadParquetAdDictData(etlSettings.Mappings.Select(m => m.MappedName).ToList());
            var relativePath = s3Obj.Key.Substring(partitionPrefixLength);
            var dateKey = relativePath.Substring(0, relativePath.IndexOf("/"));
            foreach (var dict in dictList)
            {
                dict.Add(partitionKey, dateKey);
            }
            return dictList;
        }
    }).ToArray());
    var resultDictData = allDictLists.Aggregate(new List<Dictionary<string, object>>(), (seed, list) =>
    {
        if (list != null)
        {
            seed.AddRange(list);
        }
        return seed;
    });
    var schema = etlSettings.Mappings.Select(m => m.MappedName).ToList();
    schema.Add(partitionKey);
    return new EtlReportResponse()
    {
        Name = request.Name,
        DateFrom = request.DateFrom,
        DateTo = request.DateTo,
        Schema = schema,
        Data = resultDictData
    };
}
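// Usage sketch (hypothetical; the request field values are examples only):
//
//   var report = await etlSettings.GetReports(new EtlReportRequest()
//   {
//       Name = "daily-report",
//       DateFrom = "2019-01-01",
//       DateTo = "2019-01-07"
//   });
//   // report.Schema lists the mapped column names plus the date partition key;
//   // report.Data holds one dictionary per row across all matching partitions.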
/// <summary>
/// Runs the configured BigQuery source query for a given date, limited to a small number
/// of rows, and stores the result as a data sample (plus field mappings) on the settings.
/// </summary>
public static async Task GetBigQueryResultSampleByDate(this EtlSettings etlSettings, int lines)
{
    var ga = etlSettings.GoogleAnalyticsQuerySource;
    Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}");
    BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId);
    string sql = ga.BigQuerySQL;
    string dateQueryKey = DateTime.Now.AddDays(-ga.DaysAgo).ToString(ga.DateFormat);
    // limit the query to the requested number of sample lines
    sql = sql.Replace("{date}", dateQueryKey) + $"\nlimit {lines}";
    var job = await client.CreateQueryJobAsync(sql, new List<BigQueryParameter>());
    BigQueryResults results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions()
    {
        StartIndex = 0,
        PageSize = 20000,
    });
    var enumerator = results.GetEnumerator();
    List<BigQueryRow> rows = new List<BigQueryRow>();
    while (enumerator.MoveNext())
    {
        rows.Add(enumerator.Current);
    }
    // map the BigQuery schema to Athena field mappings
    etlSettings.Mappings = results.ToFieldMappings();
    var sample = new DataSample()
    {
        Rows = new List<DataRow>()
    };
    // convert the BigQuery rows into string-based sample rows
    foreach (var row in rows)
    {
        sample.Rows.Add(new DataRow()
        {
            Items = row.RawRow.F.Select(item =>
            {
                if (item.V == null)
                {
                    return "";
                }
                if (item.V is DateTime dateTime)
                {
                    return dateTime.ToString("o");
                }
                if (item.V is byte[] bytes)
                {
                    return Convert.ToBase64String(bytes);
                }
                return item.V.ToString();
            }).ToList()
        });
    }
    etlSettings.Sample = sample;
}
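// Usage sketch (hypothetical; assumes a GoogleAnalyticsQuerySource is configured):
//
//   await etlSettings.GetBigQueryResultSampleByDate(lines: 20);
//   // etlSettings.Mappings now describes the inferred Athena schema and
//   // etlSettings.Sample holds up to 20 stringified rows for preview.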
/// <summary>
/// Runs the configured BigQuery source query for a given date, writes the results to
/// parquet files on S3, and loads the resulting date partition into the Athena table.
/// </summary>
/// <param name="etlSettings">The ETL settings that describe the source query and target table.</param>
/// <param name="awsAthenaAPI">The Athena API used to load the partition.</param>
/// <param name="useDate">Optional base date for the query; defaults to today.</param>
/// <returns>The S3 locations of the parquet files that were written.</returns>
public static async Task<List<string>> TransferBigQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, DateTime? useDate = null)
{
    var result = new List<string>();
    var ga = etlSettings.GoogleAnalyticsQuerySource;
    Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}");
    BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId);
    string sql = ga.BigQuerySQL;
    var queryDate = (useDate ?? DateTime.Now).AddDays(-ga.DaysAgo);
    string dateQueryKey = queryDate.ToString(ga.DateFormat);
    string dateKey = queryDate.ToString("yyyyMMdd");
    // substitute the date in the source's own format, not the partition key format
    sql = sql.Replace("{date}", dateQueryKey);
    var job = await client.CreateQueryJobAsync(sql, new List<BigQueryParameter>());
    BigQueryResults results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions()
    {
        StartIndex = 0,
        PageSize = 20000,
    });
    var enumerator = results.GetEnumerator();
    List<BigQueryRow> rows = new List<BigQueryRow>();
    int parquetIndex = 0;
    var targetS3 = etlSettings.CreateTargetS3API();
    while (enumerator.MoveNext())
    {
        rows.Add(enumerator.Current);
        if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
        {
            var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
            await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            parquetIndex += 1;
            rows.Clear(); // start a fresh batch, otherwise earlier rows are re-written into every file
        }
    }
    // write whatever is left over (fewer than NumberOfItemsPerParquet rows)
    if (rows.Count > 0)
    {
        var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
        await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);
        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
        parquetIndex += 1;
    }
    // load the partition into the Athena table
    await awsAthenaAPI.LoadPartition(
        $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
        $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
        $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
    return result;
}
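// Usage sketch (hypothetical; mirrors the Athena transfer above):
//
//   var written = await etlSettings.TransferBigQueryResultByDate(athenaApi);
//   // written lists the s3://... parquet URIs; the date partition is already loaded.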
/// <summary>
/// Streams a CSV (optionally gzipped) into parquet files on S3, loads the date partition
/// into the Athena table, and uploads a flag file marking the source file as done.
/// </summary>
public static async Task<List<string>> TransferCsvStream(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, Stream stream, string dateKey, string filename, bool keepOriginalName)
{
    var result = new List<string>();
    var config = new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Delimiter = etlSettings.CsvSourceOptoins.Delimiter
    };
    var csvStream = stream;
    if (etlSettings.CsvSourceOptoins.GZip)
    {
        csvStream = new GZipStream(stream, CompressionMode.Decompress);
    }
    using (var csvStreamReader = new StreamReader(csvStream))
    using (var csvReader = new CsvReader(csvStreamReader, config))
    {
        var headers = new List<string>();
        int parquetIndex = 0;
        var targetS3 = etlSettings.CreateTargetS3API();
        if (etlSettings.HasHeader)
        {
            csvReader.Read();
            string header = null;
            int index = 0;
            while (csvReader.TryGetField(index, out header))
            {
                headers.Add(header);
                index++;
            }
        }
        List<List<string>> data = new List<List<string>>();
        while (csvReader.Read())
        {
            int index = 0;
            string value = null;
            var row = new List<string>();
            while (csvReader.TryGetField(index, out value))
            {
                // invent a column name when a row is wider than the header
                if (headers.Count == index)
                {
                    headers.Add($"Col{index}");
                }
                row.Add(value);
                index++;
            }
            data.Add(row);
            if (data.Count >= etlSettings.NumberOfItemsPerParquet)
            {
                var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
                using (var bufferStream = new MemoryStream())
                {
                    bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                    await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
                }
                data.Clear();
                result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                parquetIndex++;
            }
        }
        // write whatever is left over (fewer than NumberOfItemsPerParquet rows)
        if (data.Count > 0)
        {
            var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
            using (var bufferStream = new MemoryStream())
            {
                bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
            }
            data.Clear();
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            parquetIndex++;
        }
        // load the partition into the Athena table
        await awsAthenaAPI.LoadPartition(
            $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
            $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
            $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
        // upload the flag file so this source file is not transferred again
        {
            var s3key = etlSettings.TargetFlagFile(filename);
            await targetS3.Upload(s3key, new MemoryStream(Encoding.UTF8.GetBytes("OK")));
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
        }
    }
    return result;
}
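// Usage sketch (hypothetical; assumes a configured CSV source and an open stream):
//
//   using (var fileStream = File.OpenRead("export-20190101.csv.gz"))
//   {
//       var written = await etlSettings.TransferCsvStream(
//           athenaApi, fileStream, dateKey: "20190101",
//           filename: "export-20190101.csv.gz", keepOriginalName: false);
//   }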
/// <summary>
/// Dispatches an ETL transfer to the right handler based on the configured source type.
/// </summary>
public static async Task<List<string>> TransferData(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, GenericLogger logger = null, DateTime? useDate = null)
{
    var result = new List<string>();
    logger?.Log?.Invoke($"ETL Mode: {etlSettings.SourceType}");
    switch (etlSettings.SourceType)
    {
        case EtlSourceEnum.SFTP:
        {
            var sftp = etlSettings.SFTPSource;
            var nameRegex = new Regex(sftp.PathRegex);
            var dateRegex = new Regex(sftp.DateKeyRegex);
            using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password))
            {
                sftpClient.Connect();
                var files = sftpClient.ListDirectory(sftp.BasePath);
                files = files
                    .Where(f => nameRegex.IsMatch(f.FullName) && dateRegex.IsMatch(f.Name))
                    .OrderByDescending(f => f.Name)
                    .ToList();
                // find the first file that has no flag file in the target,
                // i.e. no corresponding parquet output yet
                var targetS3 = etlSettings.CreateTargetS3API();
                SftpFile first = null;
                foreach (var file in files)
                {
                    Console.WriteLine($"Check File: {file.FullName}");
                    var s3Key = etlSettings.TargetFlagFile(file.Name);
                    if (!await targetS3.FileExists(s3Key))
                    {
                        first = file;
                        break;
                    }
                }
                // transfer that file
                if (first != null)
                {
                    Console.WriteLine($"Transfer File: {first.FullName}");
                    var dateKey = first.Name.MakeRegexExtraction(dateRegex);
                    using (var sftpStream = sftpClient.OpenRead(first.FullName))
                    {
                        result = await etlSettings.TransferCsvStream(awsAthenaAPI, sftpStream, dateKey, first.Name, false);
                    }
                }
                sftpClient.Disconnect();
            }
        }
        break;
        case EtlSourceEnum.S3BucketCheck:
        {
            // not implemented yet
        }
        break;
        case EtlSourceEnum.S3BucketEvent:
        {
            var sourceAwsS3Api = new AWSS3API(new AWSS3Options()
            {
                Key = etlSettings.S3EventSource.Key,
                Secret = etlSettings.S3EventSource.Secret,
                Bucket = etlSettings.S3EventSource.BucketName,
                Region = etlSettings.S3EventSource.Region
            });
            var s3Event = etlSettings.S3EventSource;
            var nameRegex = new Regex(s3Event.PathRegex);
            var keyRegex = new Regex(s3Event.FileNameRegex);
            // do nothing if the event path does not match the configured patterns
            if (!nameRegex.IsMatch(s3Event.ExamplePath) || !keyRegex.IsMatch(s3Event.ExamplePath))
            {
                return result;
            }
            // generate the date key, either from the event date or from the path
            var dateKey = DateTime.UtcNow.ToString("yyyyMMdd");
            Regex dateRegex = null;
            if (!s3Event.UseEventDateAsDateKey)
            {
                dateRegex = new Regex(s3Event.DateKeyRegex);
                if (!dateRegex.IsMatch(s3Event.ExamplePath))
                {
                    return result;
                }
                dateKey = s3Event.ExamplePath.MakeRegexExtraction(dateRegex);
            }
            // generate the file name
            var filename = s3Event.ExamplePath.MakeRegexExtraction(keyRegex);
            // the transfer overwrites by default, so work out the prefix up to the
            // date key first and delete whatever is already under it
            var prefixUpToDate = etlSettings.MakeTargetS3Prefix(dateKey, filename, true);
            var targetAwsS3Api = etlSettings.CreateTargetS3API();
            var oldObjects = await targetAwsS3Api.ListAllObjectsInBucket(prefix: prefixUpToDate);
            foreach (var oldObj in oldObjects)
            {
                await targetAwsS3Api.Delete(oldObj.Key);
            }
            // open the source stream and transfer the data
            using (var awsS3Stream = await sourceAwsS3Api.OpenReadAsync(s3Event.ExamplePath))
            {
                result = await etlSettings.TransferCsvStream(awsAthenaAPI, awsS3Stream, dateKey, filename, true);
            }
        }
        break;
        case EtlSourceEnum.GoogleAnalytics:
        {
            result = await etlSettings.TransferBigQueryResultByDate(awsAthenaAPI, useDate);
        }
        break;
        case EtlSourceEnum.AmazonAthena:
        {
            result = await etlSettings.TransferAthenaQueryResultByDate(awsAthenaAPI, useDate);
        }
        break;
        case EtlSourceEnum.AmazonAthenaPipes:
        {
            await etlSettings.RunAthenaQueryPipes(useDate);
        }
        break;
    }
    return result;
}
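// Usage sketch (hypothetical entry point; logger and useDate are both optional):
//
//   var locations = await etlSettings.TransferData(athenaApi, logger, useDate: null);
//   logger?.Log?.Invoke($"Transferred {locations.Count} objects");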