/// <summary>
/// Drops the `productsdb`.`sales` table if it exists, re-creates it as an external
/// parquet table located under the "sales/" prefix of the configured S3 bucket, and
/// then loads the partitions for 2019-05-26 and 2019-05-25.
/// </summary>
/// <returns>A task that completes once both partitions have been loaded.</returns>
public async Task CreateDailySalesTable()
{
    const string tableName = "`productsdb`.`sales`";

    // Remove any previous definition so the CREATE below always applies.
    await awsAthenaAPI.ExecuteQuery("DROP TABLE IF EXISTS " + tableName);

    // Root S3 location shared by the table and all of its partitions.
    var salesLocation = $"s3://{awsS3API.Options.Bucket}/sales/";

    // Column names are taken from SaleEntry's property names.
    var columnDefinitions = string.Join(", ", new[]
    {
        $"`{nameof(SaleEntry.ItemKey)}` STRING",
        $"`{nameof(SaleEntry.ItemName)}` STRING",
        $"`{nameof(SaleEntry.UnitPrice)}` DOUBLE",
        $"`{nameof(SaleEntry.Quantity)}` DOUBLE",
        $"`{nameof(SaleEntry.Price)}` DOUBLE",
    });

    var createTableSql =
        $"CREATE EXTERNAL TABLE IF NOT EXISTS {tableName}( {columnDefinitions} )"
        + " PARTITIONED BY ( `year` integer, `month` integer, `date` integer )"
        + " ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'"
        + " WITH SERDEPROPERTIES ( 'serialization.format' = '1' )"
        + $" LOCATION '{salesLocation}'";

    await awsAthenaAPI.ExecuteQuery(createTableSql);

    // Newest day first, matching the original call order.
    foreach (var day in new[] { 26, 25 })
    {
        await awsAthenaAPI.LoadPartition(
            tableName,
            $"`year`=2019, `month`=5, `date`={day}",
            $"{salesLocation}2019-05-{day}/");
    }
}
/// <summary>
/// Lists the paths found under the target S3 prefix and loads one Athena partition
/// per path, using the path (with every "/" removed) as the partition value.
/// </summary>
/// <param name="etlSettings">Settings supplying the target bucket, prefix, table and partition key.</param>
/// <param name="awsAthenaAPI">Athena API used to load each partition.</param>
/// <returns>The S3 location of every partition that was loaded, in listing order.</returns>
public static async Task<List<string>> LoadAllPartitions(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
{
    var loadedLocations = new List<string>();

    var s3 = etlSettings.CreateTargetS3API();
    var partitionPaths = await s3.ListPaths($"{etlSettings.TargetS3Prefix}/", "/");

    foreach (var partitionPath in partitionPaths)
    {
        // Strip the delimiters so only the partition value is left.
        var dateKey = partitionPath.Replace("/", "");

        var qualifiedTable = $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`";
        var partitionSpec = $"`{etlSettings.DatePartitionKey}` = '{dateKey}'";
        var location = $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/";

        await awsAthenaAPI.LoadPartition(qualifiedTable, partitionSpec, location);
        loadedLocations.Add(location);
    }

    return loadedLocations;
}
/// <summary>
/// Runs the configured Athena source query for the date that lies
/// <c>AthenaQuerySource.DaysAgo</c> days before now, writes the result rows to the
/// target S3 bucket in chunks of at most <c>NumberOfItemsPerParquet</c> rows, and
/// finally loads the matching date partition into the target Athena table.
/// </summary>
/// <param name="etlSettings">Settings holding the Athena source query and the S3/Athena target.</param>
/// <param name="awsAthenaAPI">Athena API used to load the partition of the target table.</param>
/// <returns>The S3 URLs of all files written by this call.</returns>
/// <exception cref="Exception">Thrown when <c>etlSettings.AthenaQuerySource</c> is null.</exception>
public static async Task<List<string>> TransferAthenaQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
{
    var athena = etlSettings.AthenaQuerySource;
    if (athena == null)
    {
        throw new Exception("The ETL has an empty Athena source setting.");
    }

    var result = new List<string>();
    var athenaApi = etlSettings.CreateSourceAthenaAPI();

    // The "{date}" placeholder uses the configured format; the S3 key / partition
    // value always uses yyyyMMdd.
    var date = DateTime.Now.AddDays(-athena.DaysAgo);
    var query = athena.AthenaSQL.Replace("{date}", date.ToString(athena.DateFormat));
    var dateKey = date.ToString("yyyyMMdd");

    var getResultRequest = await athenaApi.ExecuteQuery(query);

    // Captured through the callback handed to EnumerateRows.
    ResultSetMetadata resultSetMetadata = null;
    var rows = new List<Row>();
    int parquetIndex = 0;
    var targetS3 = etlSettings.CreateTargetS3API();

    // The first row is skipped (presumably the column-header row - confirm against EnumerateRows).
    bool isFirstRow = true;

    // foreach (rather than a hand-held enumerator) guarantees the enumerator is disposed.
    foreach (Row row in athenaApi.EnumerateRows(getResultRequest, res => resultSetMetadata = res.ResultSet.ResultSetMetadata))
    {
        if (isFirstRow)
        {
            isFirstRow = false;
            continue;
        }

        rows.Add(row);
        if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
        {
            var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
            await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            parquetIndex += 1;

            // Start a fresh chunk; without this every later row would re-write all
            // previously written rows into yet another file.
            rows.Clear();
        }
    }

    // Write whatever is left (fewer than NumberOfItemsPerParquet rows).
    if (rows.Count > 0)
    {
        var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
        await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);
        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
        rows.Clear();
    }

    // Load the partition into the Athena table.
    await awsAthenaAPI.LoadPartition(
        $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
        $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
        $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");

    return result;
}
/// <summary>
/// Runs the configured BigQuery SQL for the date that lies
/// <c>GoogleAnalyticsQuerySource.DaysAgo</c> days before now, writes the result rows to
/// the target S3 bucket in chunks of at most <c>NumberOfItemsPerParquet</c> rows, and
/// finally loads the matching date partition into the target Athena table.
/// </summary>
/// <remarks>
/// Sets the process-wide GOOGLE_APPLICATION_CREDENTIALS environment variable to the
/// configured setting file under <see cref="AppContext.BaseDirectory"/>.
/// </remarks>
/// <param name="etlSettings">Settings holding the BigQuery source and the S3/Athena target.</param>
/// <param name="awsAthenaAPI">Athena API used to load the partition of the target table.</param>
/// <returns>The S3 URLs of all files written by this call.</returns>
/// <exception cref="Exception">Thrown when <c>etlSettings.GoogleAnalyticsQuerySource</c> is null.</exception>
public static async Task<List<string>> TransferBigQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
{
    var ga = etlSettings.GoogleAnalyticsQuerySource;
    if (ga == null)
    {
        throw new Exception("The ETL has an empty Google Analytics source setting.");
    }

    var result = new List<string>();

    Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}");
    BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId);

    // Take the clock once so both keys describe the same day even across midnight.
    var date = DateTime.Now.AddDays(-ga.DaysAgo);
    // The "{date}" placeholder uses the configured format (as the Athena transfer does);
    // the S3 key / partition value always uses yyyyMMdd.
    string dateQueryKey = date.ToString(ga.DateFormat);
    string dateKey = date.ToString("yyyyMMdd");
    string sql = ga.BigQuerySQL.Replace("{date}", dateQueryKey);

    var job = await client.CreateQueryJobAsync(sql, new List<BigQueryParameter>());
    BigQueryResults results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions()
    {
        StartIndex = 0,
        PageSize = 20000,
    });

    var rows = new List<BigQueryRow>();
    int parquetIndex = 0;
    var targetS3 = etlSettings.CreateTargetS3API();

    foreach (BigQueryRow row in results)
    {
        rows.Add(row);
        if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
        {
            var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
            await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            parquetIndex += 1;

            // Start a fresh chunk; without this every later row would re-write all
            // previously written rows into yet another file.
            rows.Clear();
        }
    }

    // Write whatever is left (fewer than NumberOfItemsPerParquet rows).
    if (rows.Count > 0)
    {
        var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
        await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);
        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
        rows.Clear();
    }

    // Load the partition into the Athena table.
    await awsAthenaAPI.LoadPartition(
        $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
        $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
        $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");

    return result;
}
/// <summary>
/// Reads a CSV stream (optionally GZip-compressed), converts its rows to parquet files
/// of at most <c>NumberOfItemsPerParquet</c> rows each, uploads them to the target S3
/// bucket, loads the date partition into the target Athena table, and finally uploads
/// a flag file containing "OK".
/// </summary>
/// <param name="etlSettings">Settings describing the CSV options, column mappings and the S3/Athena target.</param>
/// <param name="awsAthenaAPI">Athena API used to load the partition of the target table.</param>
/// <param name="stream">The CSV content; it is disposed together with the reader when this method returns.</param>
/// <param name="dateKey">Partition value; also passed to <c>MakeTargetS3Key</c>.</param>
/// <param name="filename">Source file name, passed to <c>MakeTargetS3Key</c> and <c>TargetFlagFile</c>.</param>
/// <param name="keepOriginalName">Passed through to <c>MakeTargetS3Key</c>.</param>
/// <returns>The S3 URLs of every parquet file written, followed by the flag file URL.</returns>
public static async Task<List<string>> TransferCsvStream(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, Stream stream, string dateKey, string filename, bool keepOriginalName)
{
    var result = new List<string>();

    var config = new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Delimiter = etlSettings.CsvSourceOptoins.Delimiter
    };

    var csvStream = stream;
    if (etlSettings.CsvSourceOptoins.GZip)
    {
        csvStream = new GZipStream(stream, CompressionMode.Decompress);
    }

    using (var csvStreamReader = new StreamReader(csvStream))
    using (var csvReader = new CsvReader(csvStreamReader, config))
    {
        int parquetIndex = 0;
        var targetS3 = etlSettings.CreateTargetS3API();

        if (etlSettings.HasHeader)
        {
            // Consume the header record so it is not emitted as data.
            csvReader.Read();
        }

        var data = new List<List<string>>();
        bool hasRow;
        do
        {
            hasRow = csvReader.Read();
            if (hasRow)
            {
                // Collect every field of the current record, whatever its width.
                var row = new List<string>();
                string value;
                for (int index = 0; csvReader.TryGetField(index, out value); index++)
                {
                    row.Add(value);
                }
                data.Add(row);
            }

            bool chunkFull = hasRow && data.Count >= etlSettings.NumberOfItemsPerParquet;
            // At end of input flush the remainder. An empty file is only written when
            // nothing has been written at all, so an empty CSV still yields one file
            // while an exact multiple of the chunk size yields no trailing empty file.
            bool finalFlush = !hasRow && (data.Count > 0 || parquetIndex == 0);

            if (chunkFull || finalFlush)
            {
                var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
                using (var bufferStream = new MemoryStream())
                {
                    bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                    // Upload from a copy of the written bytes, as the original code did.
                    await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
                }
                data.Clear();
                result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                parquetIndex++;
            }
        } while (hasRow);

        // Load the partition into the Athena table.
        await awsAthenaAPI.LoadPartition(
            $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
            $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
            $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");

        // Upload the flag file last, after the data and the partition are in place.
        var flagKey = etlSettings.TargetFlagFile(filename);
        await targetS3.Upload(flagKey, new MemoryStream(Encoding.UTF8.GetBytes("OK")));
        result.Add($"s3://{etlSettings.TargetS3BucketName}/{flagKey}");
    }

    return result;
}