/// <summary>
/// Execute an Athena query and return sample data together with the inferred schema.
/// </summary>
/// <param name="athenaApi">The Athena API wrapper to run the query with.</param>
/// <param name="sql">The SQL to execute.</param>
/// <returns>The sample rows and their field mappings.</returns>
public static async Task<DataSampleWithSchema> GetSampleDataBySQL(this AWSAthenaAPI athenaApi, string sql)
{
    var result = new DataSampleWithSchema()
    {
        FieldMappings = new List<FieldMapping>(),
    };
    var sample = new DataSample()
    {
        Rows = new List<DataRow>()
    };
    result.DataSample = sample;
    var getResultRequest = await athenaApi.ExecuteQuery(sql);
    var response = await athenaApi.ReadOneResult(getResultRequest);
    var data = response.ReadData();
    result.FieldMappings = response.ToFieldMapping();
    foreach (var row in data)
    {
        var dataRow = new DataRow()
        {
            Items = row.Select(item => item.ToString()).ToList()
        };
        sample.Rows.Add(dataRow);
    }
    return result;
}
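A minimal usage sketch for GetSampleDataBySQL, assuming an already configured AWSAthenaAPI instance (here called athenaApi); the database and table names are placeholders.

// Sketch only: query text and instance setup are assumptions.
var sample = await athenaApi.GetSampleDataBySQL("SELECT * FROM my_database.my_table LIMIT 10");
Console.WriteLine($"Fields: {sample.FieldMappings.Count}");
foreach (var row in sample.DataSample.Rows)
{
    Console.WriteLine(string.Join(", ", row.Items));
}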
public static async Task<AthenaQueryFlatResult> GetQueryData(this AWSAthenaAPI athena, FormatedQuery query)
{
    var sql = query.BuildQuerySQL();
    var request = await athena.ExecuteQuery(sql);
    return await athena.GetFlatResult(request);
}
public MainWindow()
{
    componentContext = (App.Current as App).Services;
    s3 = componentContext.Resolve<AWSS3API>();
    athena = componentContext.Resolve<AWSAthenaAPI>();
    options = componentContext.Resolve<AthenaClientOptions>();
    InitializeComponent();
    s3Tree.ItemsSource = S3ItemsSource;
    tabQueries.ItemsSource = FormatedQuerySource;
    LoadS3();
}
public static async Task DropAthenaTable(this AWSAthenaAPI athenaApi, string tableName)
{
    string query = $"DROP TABLE IF EXISTS {tableName}";
    Console.WriteLine(query);
    var executionId = await athenaApi.StartQuery(query);
    while (!await athenaApi.IsExecutionCompleted(executionId))
    {
        await Task.Delay(2000);
    }
}
public static async Task<string> StartSampleDataBySQL(this AWSAthenaAPI athenaApi, string sql)
{
    // Fire off the query and hand back the execution id; the caller polls for the result.
    return await athenaApi.StartQuery(sql);
}
/// <summary>
/// Process an S3 ETL event.
/// </summary>
/// <param name="reportingAwsS3Api">The S3 bucket access for the reporting settings</param>
/// <param name="listKey">S3 key of the event handler list JSON</param>
/// <param name="etlPrefix">Prefix of the ETL settings keys</param>
/// <param name="awsAthenaAPI">The target Athena</param>
/// <param name="bucketName">Event source bucket name</param>
/// <param name="s3FileKey">Event source S3 key</param>
/// <param name="logger">Optional logger for progress messages</param>
/// <returns>A message describing the outcome of the transfer.</returns>
public static async Task<string> ProcessS3EtlEvent(this AWSS3API reportingAwsS3Api, string listKey, string etlPrefix, AWSAthenaAPI awsAthenaAPI, string bucketName, string s3FileKey, GenericLogger logger = null)
{
    if (!await reportingAwsS3Api.FileExists(listKey))
    {
        return $"event handler setting does not exist: '{listKey}'";
    }
    var json = await reportingAwsS3Api.ReadAsString(listKey);
    logger?.Log?.Invoke(json);
    var list = JsonConvert.DeserializeObject<List<S3EventHandler>>(json);
    var found = list.FirstOrDefault(handler => handler.BucketName == bucketName && Regex.IsMatch(s3FileKey, handler.PathRegex));
    if (found == null)
    {
        return $"event handler not found for object: 's3://{bucketName}/{s3FileKey}'";
    }
    var etlkey = $"{etlPrefix}{found.EtlName}.json";
    logger?.Log?.Invoke($"Found ETL setting: {etlkey}");
    if (!await reportingAwsS3Api.FileExists(etlkey))
    {
        return $"etl setting does not exist: '{etlkey}'";
    }
    var jsonEtl = await reportingAwsS3Api.ReadAsString(etlkey);
    var etlSettings = JsonConvert.DeserializeObject<EtlSettings>(jsonEtl);
    // assign the s3FileKey to ExamplePath so the transfer knows which object triggered the event
    etlSettings.S3EventSource.ExamplePath = s3FileKey;
    var results = await etlSettings.TransferData(awsAthenaAPI);
    return string.Join("\n", results);
}
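A sketch of how ProcessS3EtlEvent might be wired to an incoming S3 notification; the keys, bucket name, and object key below are placeholders, and reportingS3 / athena are assumed to be constructed elsewhere (for example as in the MainWindow constructors further down).

// Hypothetical values for illustration only.
var message = await reportingS3.ProcessS3EtlEvent(
    listKey: "etl/event-handlers.json",
    etlPrefix: "etl/settings/",
    awsAthenaAPI: athena,
    bucketName: "incoming-data-bucket",
    s3FileKey: "exports/2024/01/01/report.csv.gz");
Console.WriteLine(message); // either an error description or the joined transfer results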
public MainWindow()
{
    componentContext = (App.Current as App).Services;
    var encryptedOptions = componentContext.Resolve<EncryptedOptions>();
    DecryptedOptions decryptedOptions = LoadOptions(encryptedOptions);
    if (decryptedOptions == null)
    {
        Application.Current.Shutdown();
        return;
    }
    s3 = new AWSS3API(decryptedOptions.AWSS3Options);
    athena = new AWSAthenaAPI(decryptedOptions.AWSAthenaOptions);
    options = decryptedOptions.AthenaClientOptions;
    InitializeComponent();
    s3Tree.ItemsSource = S3ItemsSource;
    tabQueries.ItemsSource = FormatedQuerySource;
    dgJobList.ItemsSource = QueryTasks;
    LoadS3();
}
public static async Task LoadAthenaParition(this AWSAthenaAPI athenaApi, string tableName, string key, string location)
{
    string dropQuery = $"ALTER TABLE {tableName} DROP IF EXISTS PARTITION ({key})";
    Console.WriteLine(dropQuery);
    var dropExecutionId = await athenaApi.StartQuery(dropQuery);
    while (!await athenaApi.IsExecutionCompleted(dropExecutionId))
    {
        await Task.Delay(500);
    }
    string addQuery = $"ALTER TABLE {tableName} ADD IF NOT EXISTS PARTITION ({key}) LOCATION '{location}'";
    Console.WriteLine(addQuery);
    var addExecutionId = await athenaApi.StartQuery(addQuery);
    while (!await athenaApi.IsExecutionCompleted(addExecutionId))
    {
        await Task.Delay(500);
    }
}
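A sketch of calling LoadAthenaParition; the table name, partition key expression, and S3 location below are placeholders, following the same formatting LoadPartition receives elsewhere in these examples (backticked identifiers and a quoted date value).

// Hypothetical values; the key string is pasted verbatim into the ALTER TABLE statements above.
await athenaApi.LoadAthenaParition(
    "`my_database`.`my_table`",
    "`dt` = '20240101'",
    "s3://my-bucket/etl/my_table/20240101/");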
public static async Task ClearAthenaTable(this AWSAthenaAPI athenaApi, AWSS3API awsS3Api, string tableName, string s3Path)
{
    Console.WriteLine($"DROP TABLE IF EXISTS {tableName}");
    var executionId = await athenaApi.StartQuery($"DROP TABLE IF EXISTS {tableName}");
    while (!await athenaApi.IsExecutionCompleted(executionId))
    {
        await Task.Delay(2000);
    }
    var s3Object = s3Path.ParseS3URI();
    if (s3Object is S3Object)
    {
        Console.WriteLine($"Delete S3: {s3Path}");
        var files = await awsS3Api.ListFiles(s3Object.Key, "/", s3Object.BucketName);
        if (files.Any())
        {
            await awsS3Api.Delete(files.Select(key => $"{s3Object.Key}{key}"), s3Object.BucketName);
        }
        Console.WriteLine($"{s3Path}: {files.Count} S3 Files Deleted");
    }
}
public static async Task<DataSampleWithSchema> TryObtainSampleDataResult(this AWSAthenaAPI athenaApi, string executionId)
{
    if (!await athenaApi.IsExecutionCompleted(executionId))
    {
        return null;
    }
    var result = new DataSampleWithSchema()
    {
        FieldMappings = new List<FieldMapping>(),
    };
    var sample = new DataSample()
    {
        Rows = new List<DataRow>()
    };
    result.DataSample = sample;
    var response = await athenaApi.ReadOneResult(new GetQueryResultsRequest()
    {
        QueryExecutionId = executionId
    });
    var data = response.ReadData();
    result.FieldMappings = response.ToFieldMapping();
    foreach (var row in data)
    {
        var dataRow = new DataRow()
        {
            Items = row.Select(item => item.ToString()).ToList()
        };
        sample.Rows.Add(dataRow);
    }
    return result;
}
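StartSampleDataBySQL and TryObtainSampleDataResult are intended to be used as a pair: start the query, then poll until a non-null result comes back. A sketch follows; the SQL and the two-second polling interval are arbitrary choices.

var executionId = await athenaApi.StartSampleDataBySQL("SELECT * FROM my_database.my_table LIMIT 100");
DataSampleWithSchema sampleData = null;
while ((sampleData = await athenaApi.TryObtainSampleDataResult(executionId)) == null)
{
    await Task.Delay(2000); // query still running, poll again
}
Console.WriteLine($"Rows returned: {sampleData.DataSample.Rows.Count}");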
public static async Task<List<string>> TransferAthenaQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, DateTime? useDate = null)
{
    var result = new List<string>();
    var athena = etlSettings.AthenaQuerySource;
    if (athena == null)
    {
        throw new Exception("The ETL has an empty Athena source setting.");
    }
    var athenaApi = etlSettings.CreateSourceAthenaAPI();
    var query = athena.AthenaSQL;
    var today = useDate ?? DateTime.Now;
    var date = today.AddDays(-athena.DaysAgo);
    query = query.Replace("{date}", date.ToString(athena.DateFormat));
    var dateKey = date.ToString("yyyyMMdd");
    var getResultRequest = await athenaApi.ExecuteQuery(query);
    ResultSetMetadata resultSetMetadata = null;
    var enumerator = athenaApi.EnumerateRows(getResultRequest, res => resultSetMetadata = res.ResultSet.ResultSetMetadata).GetEnumerator();
    List<Row> rows = new List<Row>();
    int parquetIndex = 0;
    var targetS3 = etlSettings.CreateTargetS3API();
    // skip the header row
    enumerator.MoveNext();
    while (enumerator.MoveNext())
    {
        rows.Add(enumerator.Current);
        if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
        {
            var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
            await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            rows.Clear();
            parquetIndex += 1;
        }
    }
    // write whatever is left over (fewer than NumberOfItemsPerParquet rows)
    if (rows.Count > 0)
    {
        var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
        await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);
        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
        parquetIndex += 1;
    }
    // load the partition into the Athena table
    await awsAthenaAPI.LoadPartition(
        $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
        $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
        $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
    return result;
}
/// <summary>
/// Transfer a BigQuery (Google Analytics) query result for a given date into the target S3 bucket as parquet files.
/// </summary>
/// <param name="etlSettings">The ETL settings describing the source query and the target table.</param>
/// <param name="awsAthenaAPI">The target Athena API used to load the partition.</param>
/// <param name="useDate">Optional date to run the query for; defaults to today.</param>
/// <returns>The S3 URIs of the written parquet files.</returns>
public static async Task<List<string>> TransferBigQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, DateTime? useDate = null)
{
    var result = new List<string>();
    var awsS3Api = etlSettings.CreateTargetS3API();
    var ga = etlSettings.GoogleAnalyticsQuerySource;
    Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}");
    BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId);
    string sql = ga.BigQuerySQL;
    var queryDate = (useDate ?? DateTime.Now).AddDays(-ga.DaysAgo);
    string dateQueryKey = queryDate.ToString(ga.DateFormat);
    string dateKey = queryDate.ToString("yyyyMMdd");
    sql = sql.Replace("{date}", dateKey);
    var job = await client.CreateQueryJobAsync(sql, new List<BigQueryParameter>());
    BigQueryResults results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions()
    {
        StartIndex = 0,
        PageSize = 20000,
    });
    var enumerator = results.GetEnumerator();
    List<BigQueryRow> rows = new List<BigQueryRow>();
    int parquetIndex = 0;
    var targetS3 = etlSettings.CreateTargetS3API();
    while (enumerator.MoveNext())
    {
        rows.Add(enumerator.Current);
        if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
        {
            var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
            await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            rows.Clear();
            parquetIndex += 1;
        }
    }
    // write whatever is left over (fewer than NumberOfItemsPerParquet rows)
    if (rows.Count > 0)
    {
        var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
        await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);
        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
        parquetIndex += 1;
    }
    // load the partition into the Athena table
    await awsAthenaAPI.LoadPartition(
        $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
        $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
        $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
    return result;
}
public static async Task<List<string>> TransferCsvStream(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, Stream stream, string dateKey, string filename, bool keepOriginalName)
{
    var result = new List<string>();
    var config = new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        Delimiter = etlSettings.CsvSourceOptoins.Delimiter
    };
    var csvStream = stream;
    if (etlSettings.CsvSourceOptoins.GZip)
    {
        csvStream = new GZipStream(stream, CompressionMode.Decompress);
    }
    using (var csvStreamReader = new StreamReader(csvStream))
    using (var csvReader = new CsvReader(csvStreamReader, config))
    {
        var headers = new List<string>();
        int parquetIndex = 0;
        var targetS3 = etlSettings.CreateTargetS3API();
        if (etlSettings.HasHeader)
        {
            csvReader.Read();
            string header = null;
            int index = 0;
            while (csvReader.TryGetField(index, out header))
            {
                headers.Add(header);
                index++;
            }
        }
        var mappings = etlSettings.Mappings.ToDictionary(m => m.SourceFieldName, m => m);
        List<List<string>> data = new List<List<string>>();
        while (csvReader.Read())
        {
            int index = 0;
            string value = null;
            var row = new List<string>();
            while (csvReader.TryGetField(index, out value))
            {
                if (headers.Count == index)
                {
                    headers.Add($"Col{index}");
                }
                row.Add(value);
                index++;
            }
            data.Add(row);
            if (data.Count >= etlSettings.NumberOfItemsPerParquet)
            {
                var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
                using (var bufferStream = new MemoryStream())
                {
                    bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                    await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
                }
                data.Clear();
                result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                parquetIndex++;
            }
        }
        {
            // write whatever is left over
            var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
            using (var bufferStream = new MemoryStream())
            {
                bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
            }
            data.Clear();
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            parquetIndex++;
        }
        {
            // load the partition into the Athena table
            await awsAthenaAPI.LoadPartition(
                $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
        }
        {
            // upload the flag file
            var s3key = etlSettings.TargetFlagFile(filename);
            await targetS3.Upload(s3key, new MemoryStream(Encoding.UTF8.GetBytes("OK")));
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
        }
    }
    return result;
}
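A sketch that pushes a local CSV file through TransferCsvStream; the file name and date key are placeholders, and etlSettings / athena are assumed to be loaded already.

using (var fileStream = File.OpenRead("daily-report-20240101.csv"))
{
    var written = await etlSettings.TransferCsvStream(
        athena, fileStream, dateKey: "20240101",
        filename: "daily-report-20240101.csv", keepOriginalName: true);
    written.ForEach(Console.WriteLine); // s3:// URIs of the parquet files and the flag file
}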
public static async Task<List<string>> LoadAllPartitions(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
{
    var results = new List<string>();
    var targetS3Api = etlSettings.CreateTargetS3API();
    var allPaths = await targetS3Api.ListPaths($"{etlSettings.TargetS3Prefix}/", "/");
    foreach (var path in allPaths)
    {
        var dateKey = path.Replace("/", "");
        await awsAthenaAPI.LoadPartition(
            $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
            $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
            $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
        results.Add($"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
    }
    return results;
}
public static async Task DropAthenaTables(this StateMachineQueryContext context, AWSAthenaAPI athenaApi)
{
    foreach (var dropping in context.settings.DroppingTables)
    {
        await DropAthenaTable(athenaApi, dropping);
    }
}
public AthenaTableSetup(AWSS3API awsS3API, AWSAthenaAPI awsAthenaAPI)
{
    this.awsS3API = awsS3API;
    this.awsAthenaAPI = awsAthenaAPI;
}
public static async Task<bool> CreateAthenaTable(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
{
    if (etlSettings.AthenaDatabaseName == null || !regexAthena.IsMatch(etlSettings.AthenaDatabaseName))
    {
        throw new Exception($@"Invalid Athena Database Name '{etlSettings.AthenaDatabaseName}'");
    }
    if (etlSettings.AthenaTableName == null || !regexAthena.IsMatch(etlSettings.AthenaTableName))
    {
        throw new Exception($@"Invalid Athena Table Name '{etlSettings.AthenaTableName}'");
    }
    if (etlSettings.Mappings == null || etlSettings.Mappings.Count == 0)
    {
        throw new Exception($@"No Fields found for ETL Setting '{etlSettings.Name}'");
    }
    // create the Athena database if it does not exist
    await awsAthenaAPI.ExecuteQuery($@"create database if not exists `{etlSettings.AthenaDatabaseName}`");
    // drop the table if it exists
    await awsAthenaAPI.ExecuteQuery($@"drop table if exists `{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`");
    var query = $@"CREATE EXTERNAL TABLE IF NOT EXISTS `{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`(
{etlSettings.MapAthenaFields()}
)
PARTITIONED BY (
`{etlSettings.DatePartitionKey}` string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1'
)
LOCATION 's3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/'
";
    await awsAthenaAPI.ExecuteQuery(query);
    return true;
}
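CreateAthenaTable drops and recreates the external table, which discards any registered partitions, so a natural follow-up is LoadAllPartitions (defined above). A sketch, assuming etlSettings and athena are already available:

if (await etlSettings.CreateAthenaTable(athena))
{
    var reloaded = await etlSettings.LoadAllPartitions(athena);
    Console.WriteLine($"Re-registered {reloaded.Count} partition locations");
}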
public static async Task ClearAthenaTables(this StateMachineQueryContext context, AWSAthenaAPI athenaApi, AWSS3API awsS3Api)
{
    var parserSetting = context.BuildParserSetting();
    var pipes = context.raw.ParseAthenaPipes(parserSetting);
    foreach (var clearing in parserSetting.Clearings)
    {
        await athenaApi.ClearAthenaTable(awsS3Api, clearing.Key, clearing.Value);
    }
}
public static async Task LoadPartitions(this StateMachineQueryContext context, AWSAthenaAPI athenaApi)
{
    foreach (var partition in context.settings.Partitions)
    {
        await LoadAthenaParition(athenaApi, context.settings.DefaultTableName, partition.Key, partition.Value);
    }
}
public static async Task<List<string>> TransferData(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, GenericLogger logger = null, DateTime? useDate = null)
{
    var result = new List<string>();
    logger?.Log?.Invoke($"ETL Mode: {etlSettings.SourceType}");
    switch (etlSettings.SourceType)
    {
        case EtlSourceEnum.SFTP:
        {
            var sftp = etlSettings.SFTPSource;
            var nameRegex = new Regex(sftp.PathRegex);
            var dateRegex = new Regex(sftp.DateKeyRegex);
            using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password))
            {
                sftpClient.Connect();
                var files = sftpClient.ListDirectory(sftp.BasePath);
                files = files
                    .Where(f => nameRegex.IsMatch(f.FullName) && dateRegex.IsMatch(f.Name))
                    .OrderByDescending(f => f.Name)
                    .ToList();
                // check the target bucket for flag files to work out which file has not been transferred yet
                var targetS3 = etlSettings.CreateTargetS3API();
                SftpFile first = null;
                foreach (var file in files)
                {
                    Console.WriteLine($"Check File: {file.FullName}");
                    var s3Key = etlSettings.TargetFlagFile(file.Name);
                    if (!await targetS3.FileExists(s3Key))
                    {
                        first = file;
                        break;
                    }
                }
                // transfer that file
                if (first != null)
                {
                    Console.WriteLine($"Transfer File: {first.FullName}");
                    var dateKey = first.Name.MakeRegexExtraction(dateRegex);
                    using (var sftpStream = sftpClient.OpenRead(first.FullName))
                    {
                        result = await etlSettings.TransferCsvStream(awsAthenaAPI, sftpStream, dateKey, first.Name, false);
                    }
                }
                sftpClient.Disconnect();
            }
        }
        break;
        case EtlSourceEnum.S3BucketCheck:
        {
        }
        break;
        case EtlSourceEnum.S3BucketEvent:
        {
            var sourceAwsS3Api = new AWSS3API(new AWSS3Options()
            {
                Key = etlSettings.S3EventSource.Key,
                Secret = etlSettings.S3EventSource.Secret,
                Bucket = etlSettings.S3EventSource.BucketName,
                Region = etlSettings.S3EventSource.Region
            });
            var s3Event = etlSettings.S3EventSource;
            var nameRegex = new Regex(s3Event.PathRegex);
            var keyRegex = new Regex(s3Event.FileNameRegex);
            // do nothing if the object does not match the path pattern
            if (!nameRegex.IsMatch(s3Event.ExamplePath) || !keyRegex.IsMatch(s3Event.ExamplePath))
            {
                return result;
            }
            // generate the date key
            var dateKey = DateTime.UtcNow.ToString("yyyyMMdd");
            Regex dateRegex = null;
            if (!s3Event.UseEventDateAsDateKey)
            {
                dateRegex = new Regex(s3Event.DateKeyRegex);
                if (!dateRegex.IsMatch(s3Event.ExamplePath))
                {
                    return result;
                }
                dateKey = s3Event.ExamplePath.MakeRegexExtraction(dateRegex);
            }
            // generate the file name
            var filename = s3Event.ExamplePath.MakeRegexExtraction(keyRegex);
            // the transfer overwrites by default, so work out the date key prefix first
            var prefixUpToDate = etlSettings.MakeTargetS3Prefix(dateKey, filename, true);
            // find the existing files under that prefix
            var targetAwsS3Api = etlSettings.CreateTargetS3API();
            var oldObjects = await targetAwsS3Api.ListAllObjectsInBucket(prefix: prefixUpToDate);
            // delete the files with that prefix
            foreach (var oldObj in oldObjects)
            {
                await targetAwsS3Api.Delete(oldObj.Key);
            }
            // open the file stream and transfer the data
            using (var awsS3Stream = await sourceAwsS3Api.OpenReadAsync(s3Event.ExamplePath))
            {
                result = await etlSettings.TransferCsvStream(awsAthenaAPI, awsS3Stream, dateKey, filename, true);
            }
        }
        break;
        case EtlSourceEnum.GoogleAnalytics:
        {
            result = await etlSettings.TransferBigQueryResultByDate(awsAthenaAPI, useDate);
        }
        break;
        case EtlSourceEnum.AmazonAthena:
        {
            result = await etlSettings.TransferAthenaQueryResultByDate(awsAthenaAPI, useDate);
        }
        break;
        case EtlSourceEnum.AmazonAthenaPipes:
        {
            await etlSettings.RunAthenaQueryPipes(useDate);
        }
        break;
    }
    return result;
}
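A sketch of a single scheduled run using TransferData as the entry point; the settings key is a placeholder, and reportingS3 / athena are assumed to be configured elsewhere.

// Load one ETL setting from S3 and transfer yesterday's data.
var json = await reportingS3.ReadAsString("etl/settings/daily-report.json");
var settings = JsonConvert.DeserializeObject<EtlSettings>(json);
var uris = await settings.TransferData(athena, logger: null, useDate: DateTime.UtcNow.AddDays(-1));
uris.ForEach(Console.WriteLine);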
public AsyncLogic(AWSAthenaAPI awsAthenaAPI, AWSBatchAPI awsBatchAPI)
{
    this.awsAthenaAPI = awsAthenaAPI;
    this.awsBatchAPI = awsBatchAPI;
}
public static async Task ClearTempTables(this StateMachineQueryContext context, AWSAthenaAPI athenaApi, AWSS3API awsS3Api)
{
    foreach (var clearing in context.settings.Clearings)
    {
        await athenaApi.ClearAthenaTable(awsS3Api, clearing.Key, clearing.Value);
    }
    context.settings.Clearings.Clear();
}