public static async Task GetAthenaQueryResultSampleByDate(this EtlSettings etlSettings, int lines) { var athena = etlSettings.AthenaQuerySource; if (athena == null) { throw new Exception("The ETL has an empty Athena source setting."); } var athenaApi = etlSettings.CreateSourceAthenaAPI(); var query = athena.AthenaSQL; var today = DateTime.Now; var date = today.AddDays(-athena.DaysAgo); query = query.Replace("{date}", date.ToString(athena.DateFormat)); query = rgxDateOffset.Replace(query, m => { var offset = int.Parse(m.Groups[1].Value); return(date.AddDays(offset).ToString(athena.DateFormat)); }); query += $"\nlimit {lines}"; var getResultRequest = await athenaApi.ExecuteQuery(query); var response = await athenaApi.ReadOneResult(getResultRequest); etlSettings.Mappings = response.ToFieldMapping(); // load data schema to the etlsetting schema var sample = new DataSample() { Rows = new List <DataRow>() }; var data = response.ReadData(); foreach (var row in data) { var dataRow = new DataRow() { Items = row.Select(item => item.ToString()).ToList() }; sample.Rows.Add(dataRow); } etlSettings.Sample = sample; }
public static async Task <List <string> > TransferAthenaQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI) { var result = new List <string>(); var athena = etlSettings.AthenaQuerySource; if (athena == null) { throw new Exception("The ETL has an empty Athena source setting."); } var athenaApi = etlSettings.CreateSourceAthenaAPI(); var query = athena.AthenaSQL; var today = DateTime.Now; var date = today.AddDays(-athena.DaysAgo); query = query.Replace("{date}", date.ToString(athena.DateFormat)); var dateKey = date.ToString("yyyyMMdd"); // var response = await athenaApi.ExecuteQuery(query); var getResultRequest = await athenaApi.ExecuteQuery(query); //var response = await athenaApi.ReadOneResult(getResultRequest); //var enumerator = response.ResultSet.Rows.GetEnumerator(); ResultSetMetadata resultSetMetadata = null; var enumerator = athenaApi.EnumerateRows(getResultRequest, res => resultSetMetadata = res.ResultSet.ResultSetMetadata).GetEnumerator(); List <Row> rows = new List <Row>(); int parquetIndex = 0; var targetS3 = etlSettings.CreateTargetS3API(); //skip first row; enumerator.MoveNext(); while (enumerator.MoveNext()) { rows.Add(enumerator.Current); if (rows.Count >= etlSettings.NumberOfItemsPerParquet) { var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex); await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key); result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}"); parquetIndex += 1; } } // write what ever left less than 200000 if (rows.Count > 0) { var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex); await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key); result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}"); parquetIndex += 1; } { // load partition to athena table await awsAthenaAPI.LoadPartition( $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`", $"`{etlSettings.DatePartitionKey}` = '{dateKey}'", $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/"); } return(result); }