/// <summary>
/// Builds the transfer core from collaborators supplied by the caller.
/// </summary>
/// <param name="awsS3Options">S3 options kept on the instance.</param>
/// <param name="awsS3">S3 API wrapper kept on the instance.</param>
/// <param name="sftpClient">SFTP client kept on the instance.</param>
public DataTransferCore(AWSS3Options awsS3Options, AWSS3API awsS3, SftpClient sftpClient)
{
    // Plain assignments only; nothing is validated or connected here.
    this.sftpClient = sftpClient;
    this.awsS3 = awsS3;
    this.awsS3Options = awsS3Options;
}
/// <summary>
/// Opens the S3 object identified by <paramref name="key"/> and reads it as parquet.
/// </summary>
/// <typeparam name="T">Row type produced by the parquet reader.</typeparam>
/// <param name="awsS3">S3 API wrapper used to open the object.</param>
/// <param name="key">Object key handed to <c>OpenReadAsync</c>.</param>
/// <param name="bucket">Bucket handed to <c>OpenReadAsync</c>; null is forwarded as-is.</param>
/// <returns>The rows returned by the stream-level <c>ReadParquet</c> extension.</returns>
public static async Task<List<T>> ReadParquet<T>(this AWSS3API awsS3, string key, string bucket = null)
    where T : class, new()
{
    Stream source = await awsS3.OpenReadAsync(key, bucket);

    // The stream is disposed as soon as the rows have been read.
    using (source)
    {
        return source.ReadParquet<T>();
    }
}
/// <summary>
/// Opens the S3 object identified by <paramref name="key"/> and reads it as CSV.
/// </summary>
/// <typeparam name="T">Record type produced by the CSV reader.</typeparam>
/// <param name="awsS3">S3 API wrapper used to open the object.</param>
/// <param name="key">Object key handed to <c>OpenReadAsync</c>.</param>
/// <param name="bucket">Bucket handed to <c>OpenReadAsync</c>; null is forwarded as-is.</param>
/// <param name="configuration">CSV configuration forwarded to the stream-level reader; may be null.</param>
/// <returns>The records returned by the stream-level <c>ReadCsv</c> extension.</returns>
public static async Task<List<T>> ReadCsv<T>(this AWSS3API awsS3, string key, string bucket = null, CsvConfiguration configuration = null)
    where T : class
{
    Stream source = await awsS3.OpenReadAsync(key, bucket);

    // The stream is disposed as soon as the records have been read.
    using (source)
    {
        return source.ReadCsv<T>(configuration);
    }
}
/// <summary>
/// Reads a parquet object addressed by an S3 URI.
/// </summary>
/// <typeparam name="T">Row type produced by the parquet reader.</typeparam>
/// <param name="awsS3">S3 API wrapper used to open the object.</param>
/// <param name="s3Uri">URI parsed with <c>ParseS3URI</c> to obtain the key and bucket name.</param>
/// <returns>The rows returned by the stream-level <c>ReadParquet</c> extension.</returns>
public static async Task<List<T>> ReadParquet<T>(this AWSS3API awsS3, string s3Uri)
    where T : class, new()
{
    var location = s3Uri.ParseS3URI();
    Stream source = await awsS3.OpenReadAsync(location.Key, location.BucketName);

    // The stream is disposed as soon as the rows have been read.
    using (source)
    {
        return source.ReadParquet<T>();
    }
}
/// <summary>
/// Serializes <paramref name="items"/> to parquet in memory and uploads the bytes to S3.
/// </summary>
/// <typeparam name="T">Item type written by the parquet writer.</typeparam>
/// <param name="awsS3">S3 API wrapper used for the upload.</param>
/// <param name="items">Items handed to the stream-level <c>WriteParquet</c> extension.</param>
/// <param name="key">Destination object key.</param>
/// <param name="bucket">Destination bucket forwarded to <c>Upload</c>; null is forwarded as-is.</param>
public static async Task WriteParquet<T>(this AWSS3API awsS3, IEnumerable<T> items, string key, string bucket = null)
    where T : class
{
    using (MemoryStream parquetBuffer = new MemoryStream())
    {
        parquetBuffer.WriteParquet(items);

        // The serialized bytes are copied into a second stream that starts at position 0.
        // NOTE(review): presumably needed because the writer leaves the first stream unusable for reading - confirm.
        byte[] payload = parquetBuffer.ToArray();
        using (MemoryStream uploadStream = new MemoryStream(payload))
        {
            await awsS3.Upload(key, uploadStream, bucket);
        }
    }
}
/// <summary>
/// Creates the main window: resolves the S3/Athena services from the application's
/// container, builds the UI, binds the item sources and starts loading S3.
/// </summary>
public MainWindow()
{
    // Services come from the running App instance.
    var application = App.Current as App;
    componentContext = application.Services;

    s3 = componentContext.Resolve<AWSS3API>();
    athena = componentContext.Resolve<AWSAthenaAPI>();
    options = componentContext.Resolve<AthenaClientOptions>();

    // Controls referenced below only exist after InitializeComponent has run.
    InitializeComponent();

    s3Tree.ItemsSource = S3ItemsSource;
    tabQueries.ItemsSource = FormatedQuerySource;

    LoadS3();
}
/// <summary>
/// Serializes untyped tabular data to parquet in memory and uploads the bytes to S3.
/// </summary>
/// <param name="awsS3">S3 API wrapper used for the upload.</param>
/// <param name="headers">Column names handed to the parquet writer.</param>
/// <param name="types">Column types handed to the parquet writer.</param>
/// <param name="data">Rows handed to the parquet writer.</param>
/// <param name="key">Destination object key.</param>
/// <param name="bucket">Destination bucket forwarded to <c>Upload</c>; null is forwarded as-is.</param>
public static async Task WriteParquet(this AWSS3API awsS3, List<string> headers, List<Type> types, IEnumerable<List<object>> data, string key, string bucket = null)
{
    using (MemoryStream parquetBuffer = new MemoryStream())
    {
        parquetBuffer.WriteParquet(headers, types, data);

        // Upload from a second stream built over a copy of the serialized bytes.
        byte[] payload = parquetBuffer.ToArray();
        using (MemoryStream uploadStream = new MemoryStream(payload))
        {
            await awsS3.Upload(key, uploadStream, bucket);
        }
    }
}
/// <summary>
/// Writes Athena result rows to S3 as a parquet object, then empties the row list.
/// </summary>
/// <param name="awsS3Api">S3 API wrapper used for the upload.</param>
/// <param name="rows">Rows to serialize; this list is cleared after a successful upload.</param>
/// <param name="metadata">Result-set metadata handed to the parquet writer.</param>
/// <param name="etlSettings">ETL settings; only <c>Mappings</c> is read here.</param>
/// <param name="s3Key">Destination object key (no bucket is passed to <c>Upload</c>).</param>
public static async Task WriteResultRowsToS3Bucket(this AWSS3API awsS3Api, List<Row> rows, ResultSetMetadata metadata, EtlSettings etlSettings, string s3Key)
{
    using (MemoryStream parquetBuffer = new MemoryStream())
    {
        parquetBuffer.WriteAthenaRowsAsParquet(metadata, etlSettings.Mappings, rows);

        // Upload from a second stream built over a copy of the serialized bytes.
        byte[] payload = parquetBuffer.ToArray();
        using (MemoryStream uploadStream = new MemoryStream(payload))
        {
            await awsS3Api.Upload(s3Key, uploadStream);
        }
    }

    // Reached only when serialization and upload did not throw.
    rows.Clear();
}
/// <summary>
/// Writes BigQuery result rows to S3 as a parquet object, then empties the row list.
/// </summary>
/// <param name="awsS3Api">S3 API wrapper used for the upload.</param>
/// <param name="rows">Rows to serialize; this list is cleared after a successful upload.</param>
/// <param name="results">BigQuery results; only <c>Schema</c> is read here.</param>
/// <param name="etlSettings">ETL settings; only <c>Mappings</c> is read here.</param>
/// <param name="s3Key">Destination object key (no bucket is passed to <c>Upload</c>).</param>
public static async Task WriteResultRowsToS3Bucket(this AWSS3API awsS3Api, List<BigQueryRow> rows, BigQueryResults results, EtlSettings etlSettings, string s3Key)
{
    using (MemoryStream parquetBuffer = new MemoryStream())
    {
        parquetBuffer.WriteGARowsAsParquet(results.Schema, etlSettings.Mappings, rows);

        // Upload from a second stream built over a copy of the serialized bytes.
        byte[] payload = parquetBuffer.ToArray();
        using (MemoryStream uploadStream = new MemoryStream(payload))
        {
            await awsS3Api.Upload(s3Key, uploadStream);
        }
    }

    // Reached only when serialization and upload did not throw.
    rows.Clear();
}
/// <summary>
/// Appends <paramref name="lines"/> to the text object at <paramref name="s3Uri"/> by reading the
/// current lines, adding the new ones and writing the whole content back as UTF-8 joined with "\n".
/// </summary>
/// <param name="awsS3">S3 API wrapper used for the read and the write.</param>
/// <param name="s3Uri">URI of the object to extend.</param>
/// <param name="lines">Lines added after the existing content.</param>
public static async Task AppendLines(this AWSS3API awsS3, string s3Uri, IEnumerable<string> lines)
{
    List<string> merged = new List<string>();

    try
    {
        var existing = await awsS3.ReadLines(s3Uri);
        merged.AddRange(existing);
    }
    catch
    {
        // Any read failure is treated as "no existing content".
        // NOTE(review): this also hides transient errors, in which case the object is
        // rewritten with only the new lines - confirm that is acceptable.
    }

    merged.AddRange(lines);

    string content = string.Join("\n", merged);
    byte[] payload = Encoding.UTF8.GetBytes(content);
    await awsS3.Put(s3Uri, payload);
}
/// <summary>
/// Serializes <paramref name="items"/> to parquet in memory and uploads the bytes to the
/// location described by <paramref name="s3Uri"/>.
/// </summary>
/// <typeparam name="T">Item type written by the parquet writer.</typeparam>
/// <param name="awsS3">S3 API wrapper used for the upload.</param>
/// <param name="items">Items handed to the stream-level <c>WriteParquet</c> extension.</param>
/// <param name="s3Uri">URI parsed with <c>ParseS3URI</c> to obtain the key and bucket name.</param>
public static async Task WriteParquet<T>(this AWSS3API awsS3, IEnumerable<T> items, string s3Uri)
    where T : class
{
    var destination = s3Uri.ParseS3URI();

    using (MemoryStream parquetBuffer = new MemoryStream())
    {
        parquetBuffer.WriteParquet(items);

        // Upload from a second stream built over a copy of the serialized bytes.
        byte[] payload = parquetBuffer.ToArray();
        using (MemoryStream uploadStream = new MemoryStream(payload))
        {
            await awsS3.Upload(destination.Key, uploadStream, destination.BucketName);
        }
    }
}
/// <summary>
/// Serializes untyped tabular data to parquet in memory and uploads the bytes to the
/// location described by <paramref name="s3Uri"/>.
/// </summary>
/// <param name="awsS3">S3 API wrapper used for the upload.</param>
/// <param name="headers">Column names handed to the parquet writer.</param>
/// <param name="types">Column types handed to the parquet writer.</param>
/// <param name="data">Rows handed to the parquet writer.</param>
/// <param name="s3Uri">URI parsed with <c>ParseS3URI</c> to obtain the key and bucket name.</param>
public static async Task WriteParquet(this AWSS3API awsS3, List<string> headers, List<Type> types, IEnumerable<List<object>> data, string s3Uri)
{
    var destination = s3Uri.ParseS3URI();

    using (MemoryStream parquetBuffer = new MemoryStream())
    {
        parquetBuffer.WriteParquet(headers, types, data);

        // Upload from a second stream built over a copy of the serialized bytes.
        byte[] payload = parquetBuffer.ToArray();
        using (MemoryStream uploadStream = new MemoryStream(payload))
        {
            await awsS3.Upload(destination.Key, uploadStream, destination.BucketName);
        }
    }
}
/// <summary>
/// Creates the deploy API: normalizes a null <c>S3BasePath</c> to an empty string on the
/// supplied options, creates the S3 client, and creates the CloudFront client only when a
/// distribution id is configured.
/// </summary>
/// <param name="cloudFrontDeployOptions">Deployment options; its <c>S3BasePath</c> may be modified in place.</param>
public CloudFrontDeployAPI(CloudFrontDeployOptions cloudFrontDeployOptions)
{
    // Note: this writes back into the caller's options object.
    if (cloudFrontDeployOptions.S3BasePath == null)
    {
        cloudFrontDeployOptions.S3BasePath = "";
    }

    this.cloudFrontDeployOptions = cloudFrontDeployOptions;
    awsS3API = new AWSS3API(cloudFrontDeployOptions.AWSS3Options);

    // Without a distribution id the CloudFront client field is left unassigned.
    bool hasDistribution = !string.IsNullOrWhiteSpace(cloudFrontDeployOptions.CloudFrontDistributionId);
    if (hasDistribution)
    {
        awsCloudFrontAPI = new AWSCloudFrontAPI(cloudFrontDeployOptions.AWSCloudFrontOptions);
    }
}
/// <summary>
/// Synchronizes the S3 event handler list stored at <paramref name="listKey"/> with this ETL definition.
/// For an <c>S3BucketEvent</c> source the entry named after the ETL is added or updated; for any other
/// source type every entry with that name is removed. The list is always written back as JSON.
/// </summary>
/// <param name="etlSettings">ETL definition whose name and S3 event source are mirrored into the list.</param>
/// <param name="awsS3API">S3 API wrapper holding the handler list.</param>
/// <param name="listKey">Object key of the handler list JSON.</param>
public static async Task UpdateS3EventEtlList(this EtlSettings etlSettings, AWSS3API awsS3API, string listKey)
{
    var list = new List<S3EventHandler>();
    if (await awsS3API.FileExists(listKey))
    {
        try
        {
            var json = await awsS3API.ReadAsString(listKey);
            // DeserializeObject yields null for an empty or literal "null" document;
            // fall back to an empty list so the LINQ calls below cannot hit a null source.
            list = JsonConvert.DeserializeObject<List<S3EventHandler>>(json) ?? new List<S3EventHandler>();
        }
        catch (Exception)
        {
            // An unreadable or malformed list is replaced by a fresh one (existing best-effort behavior).
            list = new List<S3EventHandler>();
        }
    }

    if (etlSettings.SourceType == EtlSourceEnum.S3BucketEvent)
    {
        // find the key and update
        var found = list.FirstOrDefault(handler => handler.EtlName == etlSettings.Name);
        if (found == null)
        {
            list.Add(new S3EventHandler()
            {
                EtlName = etlSettings.Name,
                BucketName = etlSettings.S3EventSource.BucketName,
                PathRegex = etlSettings.S3EventSource.PathRegex
            });
        }
        else
        {
            found.BucketName = etlSettings.S3EventSource.BucketName;
            found.PathRegex = etlSettings.S3EventSource.PathRegex;
        }
    }
    else
    {
        // remove any entries that match that name
        list = list.Where(handler => handler.EtlName != etlSettings.Name).ToList();
    }

    // write back to s3
    await awsS3API.UploadAsJson(listKey, list);
}
/// <summary>
/// Reads every object listed under <paramref name="prefix"/> as parquet, concurrently, and
/// concatenates the rows in listing order.
/// </summary>
/// <typeparam name="T">Row type produced by the parquet reader.</typeparam>
/// <param name="awsS3">S3 API wrapper used for listing and reading.</param>
/// <param name="prefix">Key prefix forwarded to <c>ListAllObjectsInBucket</c>.</param>
/// <param name="bucket">Bucket forwarded to <c>ListAllObjectsInBucket</c>; null is forwarded as-is.</param>
/// <returns>A single list holding the rows of all objects.</returns>
public static async Task<List<T>> ReadParquetPartitions<T>(this AWSS3API awsS3, string prefix, string bucket = null)
    where T : class, new()
{
    var listed = await awsS3.ListAllObjectsInBucket(bucket: bucket, prefix: prefix);

    // ToArray starts all reads before any of them is awaited.
    var pendingReads = listed
        .Select(async entry => await awsS3.ReadParquet<T>(entry.Key, entry.BucketName))
        .ToArray();

    var perObjectRows = await Task.WhenAll(pendingReads);

    // Task.WhenAll returns results in task order, so concatenation follows the listing order.
    var combined = new List<T>();
    foreach (var chunk in perObjectRows)
    {
        combined.AddRange(chunk);
    }

    return combined;
}
/// <summary>
/// Splits <paramref name="items"/> into partitions and writes one parquet object per non-empty
/// partition, running the uploads concurrently.
/// </summary>
/// <typeparam name="T">Item type written by the parquet writer.</typeparam>
/// <param name="awsS3">S3 API wrapper used for the uploads.</param>
/// <param name="items">Items to split and write.</param>
/// <param name="partitionSize">Partition size forwarded to <c>SplitIntoPartitions</c>.</param>
/// <param name="digitCount">Width the partition index is left-padded to with '0'.</param>
/// <param name="key">Base object key; text matched by <c>parquetSuffix</c> is removed before the index is appended.</param>
/// <param name="bucket">Destination bucket forwarded to <c>WriteParquet</c>; null is forwarded as-is.</param>
/// <returns>The keys of the written objects, ordered by partition index.</returns>
public static async Task<List<string>> WriteParquetPerPartition<T>(this AWSS3API awsS3, IEnumerable<T> items, int partitionSize, int digitCount, string key, string bucket = null)
    where T : class
{
    key = parquetSuffix.Replace(key, "");

    // Each task reports its own key instead of appending to a shared List<string>:
    // the continuations can run concurrently and List<T>.Add is not thread-safe.
    var writeTasks = items.SplitIntoPartitions(partitionSize).Select(async (partition, index) =>
    {
        if (partition.Count <= 0)
        {
            // Empty partitions produce no object and no key.
            return (string)null;
        }

        var fileKey = $"{key}-{index.ToString().PadLeft(digitCount, '0')}.parquet";
        await awsS3.WriteParquet(partition, fileKey, bucket);
        return fileKey;
    }).ToArray();

    var writtenKeys = await Task.WhenAll(writeTasks);

    // WhenAll preserves task order, so the result is deterministic.
    return writtenKeys.Where(fileKey => fileKey != null).ToList();
}
/// <summary>
/// Processes an S3 event: looks up the matching handler in the handler list, loads the
/// corresponding ETL settings and runs the transfer for the object that raised the event.
/// </summary>
/// <param name="reportingAwsS3Api">S3 access for the bucket holding the handler list and ETL settings.</param>
/// <param name="listKey">Object key of the S3 event handler list JSON.</param>
/// <param name="etlPrefix">Key prefix of the ETL settings; the settings key is prefix + ETL name + ".json".</param>
/// <param name="awsAthenaAPI">Target Athena API passed to the transfer.</param>
/// <param name="bucketName">Bucket name of the event source.</param>
/// <param name="s3FileKey">Object key of the event source.</param>
/// <param name="logger">Optional logger; used only when it and its <c>Log</c> delegate are non-null.</param>
/// <returns>
/// A message describing why nothing was processed, or the transfer results joined with "\n".
/// </returns>
public static async Task<string> ProcessS3EtlEvent(this AWSS3API reportingAwsS3Api, string listKey, string etlPrefix, AWSAthenaAPI awsAthenaAPI, string bucketName, string s3FileKey, GenericLogger logger = null)
{
    if (!await reportingAwsS3Api.FileExists(listKey))
    {
        return($"event handler setting does not exist: '{listKey}'");
    }

    var json = await reportingAwsS3Api.ReadAsString(listKey);
    logger?.Log?.Invoke(json);

    // An empty or literal "null" document deserializes to null; treat it as "no handlers"
    // so the lookup reports "not found" instead of throwing.
    var list = JsonConvert.DeserializeObject<List<S3EventHandler>>(json) ?? new List<S3EventHandler>();

    // Entries without a path pattern cannot match and would make Regex.IsMatch throw.
    var found = list.FirstOrDefault(handler =>
        handler != null
        && handler.BucketName == bucketName
        && handler.PathRegex != null
        && Regex.IsMatch(s3FileKey, handler.PathRegex));
    if (found == null)
    {
        return($"event handler not found for object: 's3://{bucketName}/{s3FileKey}'");
    }

    var etlkey = $"{etlPrefix}{found.EtlName}.json";
    logger?.Log?.Invoke($"Find ETL setting: {etlkey}");
    if (!await reportingAwsS3Api.FileExists(etlkey))
    {
        return($"etl setting does not exist: '{etlkey}'");
    }

    var jsonEtl = await reportingAwsS3Api.ReadAsString(etlkey);
    var etlSettings = JsonConvert.DeserializeObject<EtlSettings>(jsonEtl);

    // The transfer reads the object to process from ExamplePath, so point it at the event's key.
    etlSettings.S3EventSource.ExamplePath = s3FileKey;

    var results = await etlSettings.TransferData(awsAthenaAPI);
    return(string.Join("\n", results));
}
/// <summary>
/// Creates the main window from decrypted options. When <c>LoadOptions</c> yields no options the
/// application is shut down and the window is left uninitialized; otherwise the S3/Athena
/// clients are created, the UI is built, the item sources are bound and the S3 load is started.
/// </summary>
public MainWindow()
{
    var application = App.Current as App;
    componentContext = application.Services;

    var encryptedOptions = componentContext.Resolve<EncryptedOptions>();
    DecryptedOptions decryptedOptions = LoadOptions(encryptedOptions);

    // No usable options: stop the application before any UI is constructed.
    if (decryptedOptions == null)
    {
        Application.Current.Shutdown();
        return;
    }

    s3 = new AWSS3API(decryptedOptions.AWSS3Options);
    athena = new AWSAthenaAPI(decryptedOptions.AWSAthenaOptions);
    options = decryptedOptions.AthenaClientOptions;

    // Controls referenced below only exist after InitializeComponent has run.
    InitializeComponent();

    s3Tree.ItemsSource = S3ItemsSource;
    tabQueries.ItemsSource = FormatedQuerySource;
    dgJobList.ItemsSource = QueryTasks;

    LoadS3();
}
/// <summary>
/// Drops an Athena table (if it exists), waits for the drop query to complete and then deletes
/// the files listed under the table's S3 location.
/// </summary>
/// <param name="athenaApi">Athena API used to run and poll the DROP statement.</param>
/// <param name="awsS3Api">S3 API used to list and delete the table's files.</param>
/// <param name="tableName">Table name interpolated into the DROP statement as-is.</param>
/// <param name="s3Path">S3 URI of the table data; skipped when it does not parse to an <c>S3Object</c>.</param>
public static async Task ClearAthenaTable(this AWSAthenaAPI athenaApi, AWSS3API awsS3Api, string tableName, string s3Path)
{
    var dropStatement = $"DROP TABLE IF EXISTS {tableName}";
    Console.WriteLine(dropStatement);

    var executionId = await athenaApi.StartQuery(dropStatement);
    while (!await athenaApi.IsExecutionCompleted(executionId))
    {
        // Poll every two seconds without blocking the calling thread
        // (Thread.Sleep would hold the thread for the whole wait).
        await Task.Delay(2000);
    }

    var s3Object = s3Path.ParseS3URI();
    if (s3Object is S3Object)
    {
        Console.WriteLine($"Delete S3: {s3Path}");
        var files = await awsS3Api.ListFiles(s3Object.Key, "/", s3Object.BucketName);
        if (files.Any())
        {
            // Listed names are prefixed with the parsed key before deletion.
            await awsS3Api.Delete(files.Select(key => $"{s3Object.Key}{key}"), s3Object.BucketName);
        }
        Console.WriteLine($"{s3Path}: {files.Count} S3 Files Deleted");
    }
}
/// <summary>
/// Creates the table setup helper from the S3 and Athena APIs supplied by the caller.
/// </summary>
/// <param name="awsS3API">S3 API kept on the instance.</param>
/// <param name="awsAthenaAPI">Athena API kept on the instance.</param>
public AthenaTableSetup(AWSS3API awsS3API, AWSAthenaAPI awsAthenaAPI)
{
    // Plain assignments only; no validation or I/O happens here.
    this.awsAthenaAPI = awsAthenaAPI;
    this.awsS3API = awsS3API;
}
/// <summary>
/// Creates the Snowflake ETL API from the Snowflake and S3 APIs supplied by the caller.
/// </summary>
/// <param name="snowflakeAPI">Snowflake API kept on the instance.</param>
/// <param name="awsS3API">S3 API kept on the instance.</param>
public SnowflakeEtlAPI(SnowflakeAPI snowflakeAPI, AWSS3API awsS3API)
{
    // Plain assignments only; no validation or I/O happens here.
    this.awsS3API = awsS3API;
    this.snowflakeAPI = snowflakeAPI;
}
/// <summary>
/// Creates a tree item bound to the given S3 API.
/// </summary>
/// <param name="awsS3API">S3 API kept on the instance.</param>
public S3TreeItem(AWSS3API awsS3API)
{
    // Only stores the reference; nothing is loaded here.
    this.awsS3API = awsS3API;
}
/// <summary>
/// Builds the parser setting for the query context, parses the raw Athena pipes with it and then
/// clears every table listed in the setting's <c>Clearings</c>, one after another.
/// </summary>
/// <param name="context">Query context providing the parser setting and the raw pipe text.</param>
/// <param name="athenaApi">Athena API used to drop the tables.</param>
/// <param name="awsS3Api">S3 API used to delete the table files.</param>
public static async Task ClearAthenaTables(this StateMachineQueryContext context, AWSAthenaAPI athenaApi, AWSS3API awsS3Api)
{
    var parserSetting = context.BuildParserSetting();

    // The parse result itself is not used below.
    // NOTE(review): presumably parsing is what fills parserSetting.Clearings - confirm before removing.
    var parsedPipes = context.raw.ParseAthenaPipes(parserSetting);

    foreach (var entry in parserSetting.Clearings)
    {
        // Key and Value are passed as the table name and its S3 path respectively.
        await athenaApi.ClearAthenaTable(awsS3Api, entry.Key, entry.Value);
    }
}
/// <summary>
/// Removes this ETL definition from the S3 event handler list stored at <paramref name="listKey"/>.
/// Does nothing unless the ETL source type is <c>S3BucketEvent</c>; otherwise the filtered list is
/// written back as JSON.
/// </summary>
/// <param name="etlSettings">ETL definition whose name identifies the entries to remove.</param>
/// <param name="awsS3API">S3 API wrapper holding the handler list.</param>
/// <param name="listKey">Object key of the handler list JSON.</param>
public static async Task DeleteFromS3EventEtlList(this EtlSettings etlSettings, AWSS3API awsS3API, string listKey)
{
    if (etlSettings.SourceType != EtlSourceEnum.S3BucketEvent)
    {
        return;
    }

    var list = new List<S3EventHandler>();
    if (await awsS3API.FileExists(listKey))
    {
        try
        {
            var json = await awsS3API.ReadAsString(listKey);
            // DeserializeObject yields null for an empty or literal "null" document;
            // fall back to an empty list so the filter below cannot hit a null source.
            list = JsonConvert.DeserializeObject<List<S3EventHandler>>(json) ?? new List<S3EventHandler>();
        }
        catch (Exception)
        {
            // An unreadable or malformed list is replaced by a fresh one (existing best-effort behavior).
            list = new List<S3EventHandler>();
        }
    }

    // remove any entries that match that name
    list = list.Where(handler => handler.EtlName != etlSettings.Name).ToList();

    // write back to s3
    await awsS3API.UploadAsJson(listKey, list);
}
/// <summary>
/// Checks whether the object addressed by an S3 URI exists.
/// </summary>
/// <param name="awsS3">S3 API wrapper used for the check.</param>
/// <param name="s3Uri">URI parsed with <c>ParseS3URI</c> to obtain the key and bucket name.</param>
/// <returns>The result of <c>FileExists</c> for the parsed key and bucket.</returns>
public static async Task<bool> Exists(this AWSS3API awsS3, string s3Uri)
{
    var target = s3Uri.ParseS3URI();
    bool found = await awsS3.FileExists(target.Key, target.BucketName);
    return found;
}
/// <summary>
/// Fills <c>etlSettings.Sample</c> with a small data sample taken from the configured source and
/// shortens long sample values so the sample stays compact.
/// </summary>
/// <param name="etlSettings">ETL definition; its <c>Sample</c> is replaced and then populated.</param>
/// <param name="lines">Number of lines forwarded to the source-specific sample readers.</param>
/// <returns>The same <paramref name="etlSettings"/> instance.</returns>
public static async Task<EtlSettings> ReadEtlSampleData(this EtlSettings etlSettings, int lines = 20)
{
    // Values shorter than maxItemLength are kept verbatim; longer ones keep
    // keptEdgeLength characters at each end with "..." in between.
    const int maxItemLength = 100;
    const int keptEdgeLength = 50;

    etlSettings.Sample = new DataSample();

    switch (etlSettings.SourceType)
    {
        case EtlSourceEnum.SFTP:
        {
            var sftp = etlSettings.SFTPSource;
            var nameRegex = new Regex(sftp.PathRegex);
            using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password))
            {
                sftpClient.Connect();
                var files = sftpClient.ListDirectory(sftp.BasePath);
                files = files.Where(f => nameRegex.IsMatch(f.FullName)).ToList();

                // Sample the first file whose full name matches the path pattern.
                var first = files.FirstOrDefault();
                if (first != null)
                {
                    switch (etlSettings.FileType)
                    {
                        case EtlFileType.CSV:
                        {
                            using (var sftpStream = sftpClient.OpenRead(first.FullName))
                            {
                                etlSettings.ReadFromCSVFile(sftpStream, lines);
                            }
                        }
                        break;
                    }
                }
                sftpClient.Disconnect();
            }
        }
        break;

        case EtlSourceEnum.S3BucketCheck:
        {
            var s3 = etlSettings.S3CheckSource;
            var awsS3API = new AWSS3API(new AWSS3Options()
            {
                Key = s3.Key,
                Secret = s3.Secret,
                Bucket = s3.BucketName,
                Region = s3.Region,
            });
            var objects = await awsS3API.ListAllObjectsInBucket(s3.BucketName, s3.Prefix);
            var nameRegex = new Regex(s3.PathRegex);
            objects = objects.Where(f => nameRegex.IsMatch(f.Key)).ToList();

            // Sample the first listed object whose key matches the path pattern.
            var first = objects.FirstOrDefault();
            if (first != null)
            {
                switch (etlSettings.FileType)
                {
                    case EtlFileType.CSV:
                    {
                        using (var s3Stream = await awsS3API.OpenReadAsync(first.Key, first.BucketName))
                        {
                            etlSettings.ReadFromCSVFile(s3Stream, lines);
                        }
                    }
                    break;
                }
            }
        }
        break;

        case EtlSourceEnum.S3BucketEvent:
        {
            var s3 = etlSettings.S3EventSource;
            var awsS3API = new AWSS3API(new AWSS3Options()
            {
                Key = s3.Key,
                Secret = s3.Secret,
                Bucket = s3.BucketName,
                Region = s3.Region,
            });

            // Event sources are sampled from the configured example object, when it exists.
            if (await awsS3API.FileExists(s3.ExamplePath, s3.BucketName))
            {
                switch (etlSettings.FileType)
                {
                    case EtlFileType.CSV:
                    {
                        using (var s3Stream = await awsS3API.OpenReadAsync(s3.ExamplePath, s3.BucketName))
                        {
                            etlSettings.ReadFromCSVFile(s3Stream, lines);
                        }
                    }
                    break;
                }
            }
        }
        break;

        case EtlSourceEnum.GoogleAnalytics:
        {
            await etlSettings.GetBigQueryResultSampleByDate(lines);
        }
        break;

        case EtlSourceEnum.AmazonAthena:
        {
            await etlSettings.GetAthenaQueryResultSampleByDate(lines);
        }
        break;

        case EtlSourceEnum.AmazonAthenaPipes:
        {
            // need to compile the query
            await etlSettings.ParseAthenaQueryPipes();
        }
        break;
    }

    // make the sample data smaller
    foreach (var row in etlSettings.Sample.Rows.ToList())
    {
        row.Items = row.Items
            .Select(item =>
                // Null values are passed through untouched instead of dereferencing them.
                (item == null || item.Length < maxItemLength)
                    ? item
                    : item.Substring(0, keptEdgeLength) + "..." + item.Substring(item.Length - keptEdgeLength))
            .ToList();
    }

    return(etlSettings);
}
/// <summary>
/// Runs one transfer for the ETL definition, dispatching on its source type, and returns the
/// messages produced by the source-specific transfer (an empty list when nothing was transferred).
/// </summary>
/// <param name="etlSettings">ETL definition describing the source and the target.</param>
/// <param name="awsAthenaAPI">Athena API forwarded to the source-specific transfer routines.</param>
/// <param name="logger">Optional logger; used only when it and its <c>Log</c> delegate are non-null.</param>
/// <param name="useDate">Optional date forwarded to the BigQuery and Athena transfers.</param>
/// <returns>The result list of the transfer, or the initial empty list.</returns>
public static async Task<List<string>> TransferData(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, GenericLogger logger = null, DateTime? useDate = null)
{
    var result = new List<string>();
    logger?.Log?.Invoke($"ETL Mode: {etlSettings.SourceType}");

    switch (etlSettings.SourceType)
    {
        case EtlSourceEnum.SFTP:
        {
            var sftp = etlSettings.SFTPSource;
            var nameRegex = new Regex(sftp.PathRegex);
            var dateRegex = new Regex(sftp.DateKeyRegex);
            using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password))
            {
                sftpClient.Connect();

                // Candidates match both patterns and are visited in descending name order.
                var candidates = sftpClient.ListDirectory(sftp.BasePath);
                candidates = candidates
                    .Where(f => nameRegex.IsMatch(f.FullName) && dateRegex.IsMatch(f.Name))
                    .OrderByDescending(f => f.Name)
                    .ToList();

                // Pick the first candidate whose flag file is not yet present in the target.
                var targetS3 = etlSettings.CreateTargetS3API();
                SftpFile pendingFile = null;
                foreach (var candidate in candidates)
                {
                    Console.WriteLine($"Check File: {candidate.FullName}");
                    var flagKey = etlSettings.TargetFlagFile(candidate.Name);
                    if (!await targetS3.FileExists(flagKey))
                    {
                        pendingFile = candidate;
                        break;
                    }
                }

                // Transfer that single file, if any.
                if (pendingFile != null)
                {
                    Console.WriteLine($"Transfer File: {pendingFile.FullName}");
                    var dateKey = pendingFile.Name.MakeRegexExtraction(dateRegex);
                    using (var sftpStream = sftpClient.OpenRead(pendingFile.FullName))
                    {
                        result = await etlSettings.TransferCsvStream(awsAthenaAPI, sftpStream, dateKey, pendingFile.Name, false);
                    }
                }

                sftpClient.Disconnect();
            }
        }
        break;

        case EtlSourceEnum.S3BucketCheck:
        {
            // Nothing is transferred for this source type.
        }
        break;

        case EtlSourceEnum.S3BucketEvent:
        {
            var sourceAwsS3Api = new AWSS3API(new AWSS3Options()
            {
                Key = etlSettings.S3EventSource.Key,
                Secret = etlSettings.S3EventSource.Secret,
                Bucket = etlSettings.S3EventSource.BucketName,
                Region = etlSettings.S3EventSource.Region
            });
            var s3Event = etlSettings.S3EventSource;
            var nameRegex = new Regex(s3Event.PathRegex);
            var keyRegex = new Regex(s3Event.FileNameRegex);

            // Do nothing unless the path matches both the path pattern and the file name pattern.
            if (!(nameRegex.IsMatch(s3Event.ExamplePath) && keyRegex.IsMatch(s3Event.ExamplePath)))
            {
                return(result);
            }

            // The date key is either today's UTC date or extracted from the path.
            string dateKey;
            if (s3Event.UseEventDateAsDateKey)
            {
                dateKey = DateTime.UtcNow.ToString("yyyyMMdd");
            }
            else
            {
                var dateRegex = new Regex(s3Event.DateKeyRegex);
                if (!dateRegex.IsMatch(s3Event.ExamplePath))
                {
                    return(result);
                }
                dateKey = s3Event.ExamplePath.MakeRegexExtraction(dateRegex);
            }

            var filename = s3Event.ExamplePath.MakeRegexExtraction(keyRegex);

            // Existing target objects under the date/file prefix are deleted before the new transfer.
            var prefixUpToDate = etlSettings.MakeTargetS3Prefix(dateKey, filename, true);
            var targetAwsS3Api = etlSettings.CreateTargetS3API();
            var staleObjects = await targetAwsS3Api.ListAllObjectsInBucket(prefix: prefixUpToDate);
            foreach (var stale in staleObjects)
            {
                await targetAwsS3Api.Delete(stale.Key);
            }

            // Stream the source object into the CSV transfer.
            using (var awsS3Stream = await sourceAwsS3Api.OpenReadAsync(s3Event.ExamplePath))
            {
                result = await etlSettings.TransferCsvStream(awsAthenaAPI, awsS3Stream, dateKey, filename, true);
            }
        }
        break;

        case EtlSourceEnum.GoogleAnalytics:
        {
            result = await etlSettings.TransferBigQueryResultByDate(awsAthenaAPI, useDate);
        }
        break;

        case EtlSourceEnum.AmazonAthena:
        {
            result = await etlSettings.TransferAthenaQueryResultByDate(awsAthenaAPI, useDate);
        }
        break;

        case EtlSourceEnum.AmazonAthenaPipes:
        {
            // The pipes run produces no entries for the result list.
            await etlSettings.RunAthenaQueryPipes(useDate);
        }
        break;
    }

    return(result);
}
/// <summary>
/// Clears every table registered in <c>context.settings.Clearings</c>, one after another, and
/// then empties that collection.
/// </summary>
/// <param name="context">Query context whose settings hold the pending clearings.</param>
/// <param name="athenaApi">Athena API used to drop the tables.</param>
/// <param name="awsS3Api">S3 API used to delete the table files.</param>
public static async Task ClearTempTables(this StateMachineQueryContext context, AWSAthenaAPI athenaApi, AWSS3API awsS3Api)
{
    foreach (var entry in context.settings.Clearings)
    {
        // Key and Value are passed as the table name and its S3 path respectively.
        await athenaApi.ClearAthenaTable(awsS3Api, entry.Key, entry.Value);
    }

    // Reached only when every clearing completed without throwing.
    context.settings.Clearings.Clear();
}
/// <summary>
/// Downloads the object at <paramref name="s3Uri"/>, decodes it as UTF-8 and splits it into lines.
/// </summary>
/// <param name="awsS3">S3 API wrapper used for the download.</param>
/// <param name="s3Uri">URI of the text object.</param>
/// <returns>
/// The lines of the object. "\r\n", "\n" and "\r" each count as a single line break, so CRLF
/// content no longer produces an empty entry after every line.
/// </returns>
public static async Task<string[]> ReadLines(this AWSS3API awsS3, string s3Uri)
{
    byte[] content = await awsS3.Get(s3Uri);
    string text = Encoding.UTF8.GetString(content);

    // "\r\n" is listed first so a CRLF pair is consumed as one separator.
    return(Regex.Split(text, "\r\n|\n|\r"));
}
/// <summary>
/// Creates the SFTP ETL helper from the SFTP client and S3 API supplied by the caller.
/// </summary>
/// <param name="sftpClient">SFTP client kept on the instance.</param>
/// <param name="awsS3API">S3 API kept on the instance.</param>
public SftpEtl(SftpClient sftpClient, AWSS3API awsS3API)
{
    // Plain assignments only; no connection is opened here.
    this.awsS3API = awsS3API;
    this.sftpClient = sftpClient;
}