using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using CsvHelper;
using CsvHelper.Configuration;
using Renci.SshNet;
using Renci.SshNet.Sftp;

// Enclosing static class added so the extension methods compile; the class name is assumed.
public static class EtlSettingsExtensions
{
    public static async Task<List<string>> TransferCsvStream(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, Stream stream, string dateKey, string filename, bool keepOriginalName)
    {
        var result = new List<string>();
        var config = new CsvConfiguration(CultureInfo.InvariantCulture)
        {
            Delimiter = etlSettings.CsvSourceOptoins.Delimiter
        };

        // Transparently decompress gzipped sources.
        var csvStream = stream;
        if (etlSettings.CsvSourceOptoins.GZip)
        {
            csvStream = new GZipStream(stream, CompressionMode.Decompress);
        }

        using (var csvStreamReader = new StreamReader(csvStream))
        using (var csvReader = new CsvReader(csvStreamReader, config))
        {
            var headers = new List<string>();
            int parquetIndex = 0;
            var targetS3 = etlSettings.CreateTargetS3API();

            // Collect the column names from the header row, if the source has one.
            if (etlSettings.HasHeader)
            {
                csvReader.Read();
                int index = 0;
                while (csvReader.TryGetField(index, out string header))
                {
                    headers.Add(header);
                    index++;
                }
            }

            var data = new List<List<string>>();
            while (csvReader.Read())
            {
                int index = 0;
                var row = new List<string>();
                while (csvReader.TryGetField(index, out string value))
                {
                    // Synthesize a name for any column beyond the known headers.
                    if (headers.Count == index)
                    {
                        headers.Add($"Col{index}");
                    }
                    row.Add(value);
                    index++;
                }
                data.Add(row);

                // Flush a parquet file to the target bucket whenever the buffer reaches the batch size.
                if (data.Count >= etlSettings.NumberOfItemsPerParquet)
                {
                    var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
                    using (var bufferStream = new MemoryStream())
                    {
                        bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                        await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
                    }
                    data.Clear();
                    result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                    parquetIndex++;
                }
            }

            // Flush the remaining buffered rows as the final parquet file.
            {
                var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
                using (var bufferStream = new MemoryStream())
                {
                    bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                    await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
                }
                data.Clear();
                result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                parquetIndex++;
            }

            // Load the date partition into the Athena table.
            await awsAthenaAPI.LoadPartition(
                $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");

            // Upload the flag file that marks this source file as transferred.
            {
                var s3key = etlSettings.TargetFlagFile(filename);
                await targetS3.Upload(s3key, new MemoryStream(Encoding.UTF8.GetBytes("OK")));
                result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            }
        }

        return result;
    }
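    // A minimal usage sketch, not part of the original source: it streams a local
    // CSV file (decompressed automatically if CsvSourceOptoins.GZip is set) into
    // partitioned parquet files via TransferCsvStream. TransferLocalCsvExample and
    // the localPath parameter are hypothetical names introduced for illustration.
    public static async Task<List<string>> TransferLocalCsvExample(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, string localPath)
    {
        // Use the current UTC date as the Athena partition key.
        var dateKey = DateTime.UtcNow.ToString("yyyyMMdd");
        using (var fileStream = File.OpenRead(localPath))
        {
            // keepOriginalName: false lets MakeTargetS3Key derive the parquet file names.
            return await etlSettings.TransferCsvStream(
                awsAthenaAPI, fileStream, dateKey, Path.GetFileName(localPath), false);
        }
    }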
    public static async Task<List<string>> TransferData(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, GenericLogger logger = null, DateTime? useDate = null)
    {
        var result = new List<string>();
        logger?.Log?.Invoke($"ETL Mode: {etlSettings.SourceType}");
        switch (etlSettings.SourceType)
        {
            case EtlSourceEnum.SFTP:
                {
                    var sftp = etlSettings.SFTPSource;
                    var nameRegex = new Regex(sftp.PathRegex);
                    var dateRegex = new Regex(sftp.DateKeyRegex);
                    using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password))
                    {
                        sftpClient.Connect();
                        var files = sftpClient.ListDirectory(sftp.BasePath);
                        files = files
                            .Where(f => nameRegex.IsMatch(f.FullName) && dateRegex.IsMatch(f.Name))
                            .OrderByDescending(f => f.Name)
                            .ToList();

                        // Check the target bucket for flag files to find the newest
                        // source file that has not been transferred yet.
                        var targetS3 = etlSettings.CreateTargetS3API();
                        SftpFile first = null;
                        foreach (var file in files)
                        {
                            Console.WriteLine($"Check File: {file.FullName}");
                            var s3Key = etlSettings.TargetFlagFile(file.Name);
                            if (!await targetS3.FileExists(s3Key))
                            {
                                first = file;
                                break;
                            }
                        }

                        // Transfer that file.
                        if (first != null)
                        {
                            Console.WriteLine($"Transfer File: {first.FullName}");
                            var dateKey = first.Name.MakeRegexExtraction(dateRegex);
                            using (var sftpStream = sftpClient.OpenRead(first.FullName))
                            {
                                result = await etlSettings.TransferCsvStream(awsAthenaAPI, sftpStream, dateKey, first.Name, false);
                            }
                        }
                        sftpClient.Disconnect();
                    }
                }
                break;
            case EtlSourceEnum.S3BucketCheck:
                {
                    // No-op in this implementation.
                }
                break;
            case EtlSourceEnum.S3BucketEvent:
                {
                    var sourceAwsS3Api = new AWSS3API(new AWSS3Options()
                    {
                        Key = etlSettings.S3EventSource.Key,
                        Secret = etlSettings.S3EventSource.Secret,
                        Bucket = etlSettings.S3EventSource.BucketName,
                        Region = etlSettings.S3EventSource.Region
                    });
                    var s3Event = etlSettings.S3EventSource;
                    var nameRegex = new Regex(s3Event.PathRegex);
                    var keyRegex = new Regex(s3Event.FileNameRegex);

                    // Do nothing if the object key does not match the expected patterns.
                    if (!nameRegex.IsMatch(s3Event.ExamplePath) || !keyRegex.IsMatch(s3Event.ExamplePath))
                    {
                        return result;
                    }

                    // Generate the date key, either from the event date or extracted from the path.
                    var dateKey = DateTime.UtcNow.ToString("yyyyMMdd");
                    if (!s3Event.UseEventDateAsDateKey)
                    {
                        var dateRegex = new Regex(s3Event.DateKeyRegex);
                        if (!dateRegex.IsMatch(s3Event.ExamplePath))
                        {
                            return result;
                        }
                        dateKey = s3Event.ExamplePath.MakeRegexExtraction(dateRegex);
                    }

                    // Generate the file name.
                    var filename = s3Event.ExamplePath.MakeRegexExtraction(keyRegex);

                    // The transfer overwrites by default, so the date key is worked out first
                    // and any previous output under that date prefix is deleted.
                    var prefixUpToDate = etlSettings.MakeTargetS3Prefix(dateKey, filename, true);
                    var targetAwsS3Api = etlSettings.CreateTargetS3API();
                    var oldObjects = await targetAwsS3Api.ListAllObjectsInBucket(prefix: prefixUpToDate);
                    foreach (var oldObj in oldObjects)
                    {
                        await targetAwsS3Api.Delete(oldObj.Key);
                    }

                    // Open the source object and transfer the data.
                    using (var awsS3Stream = await sourceAwsS3Api.OpenReadAsync(s3Event.ExamplePath))
                    {
                        result = await etlSettings.TransferCsvStream(awsAthenaAPI, awsS3Stream, dateKey, filename, true);
                    }
                }
                break;
            case EtlSourceEnum.GoogleAnalytics:
                {
                    result = await etlSettings.TransferBigQueryResultByDate(awsAthenaAPI, useDate);
                }
                break;
            case EtlSourceEnum.AmazonAthena:
                {
                    result = await etlSettings.TransferAthenaQueryResultByDate(awsAthenaAPI, useDate);
                }
                break;
            case EtlSourceEnum.AmazonAthenaPipes:
                {
                    await etlSettings.RunAthenaQueryPipes(useDate);
                }
                break;
        }
        return result;
    }
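    // A minimal usage sketch, not part of the original source: it wires a console
    // logger into TransferData and prints the resulting S3 URIs. GenericLogger.Log
    // is assumed to be a settable Action<string>-compatible delegate, matching the
    // logger?.Log?.Invoke(...) call above; RunTransferExample is a hypothetical name.
    public static async Task RunTransferExample(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
    {
        var logger = new GenericLogger { Log = message => Console.WriteLine(message) };
        var uploaded = await etlSettings.TransferData(awsAthenaAPI, logger);
        foreach (var uri in uploaded)
        {
            Console.WriteLine($"Uploaded: {uri}");
        }
    }
}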