Ejemplo n.º 1
0
        /// <summary>
        /// Reads delimited text from <paramref name="stream"/>, writes the rows out as parquet files in
        /// chunks of <c>etlSettings.NumberOfItemsPerParquet</c> rows, uploads each chunk to the target S3
        /// API, loads the date partition into the Athena table and finally uploads a flag file.
        /// </summary>
        /// <param name="etlSettings">Settings supplying the CSV options, mappings, chunk size and target names.</param>
        /// <param name="awsAthenaAPI">Athena API used to load the partition after the parquet uploads.</param>
        /// <param name="stream">
        /// Source stream. It is wrapped in a <see cref="GZipStream"/> when the CSV options request GZip and is
        /// closed together with the reader when this method finishes.
        /// </param>
        /// <param name="dateKey">Date key passed to the S3 key builder and used as the partition value.</param>
        /// <param name="filename">Source file name passed to the S3 key builder and the flag-file key builder.</param>
        /// <param name="keepOriginalName">Passed through unchanged to <c>MakeTargetS3Key</c>.</param>
        /// <returns>The <c>s3://</c> URIs of every uploaded parquet file, followed by the URI of the flag file.</returns>
        public static async Task<List<string>> TransferCsvStream(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, Stream stream, string dateKey, string filename, bool keepOriginalName)
        {
            var result = new List<string>();
            var config = new CsvConfiguration(CultureInfo.InvariantCulture)
            {
                Delimiter = etlSettings.CsvSourceOptoins.Delimiter
            };

            var csvStream = stream;
            if (etlSettings.CsvSourceOptoins.GZip)
            {
                csvStream = new GZipStream(stream, CompressionMode.Decompress);
            }

            using (var csvStreamReader = new StreamReader(csvStream))
            using (var csvReader = new CsvReader(csvStreamReader, config))
            {
                var targetS3 = etlSettings.CreateTargetS3API();

                // Column names seen so far. Nothing later in this method reads the names themselves;
                // the list is only extended with "Col{n}" placeholders as wider rows appear.
                var headers = new List<string>();

                // Only read header fields when a header row actually exists (empty input has none).
                if (etlSettings.HasHeader && csvReader.Read())
                {
                    string header = null;
                    int index = 0;
                    while (csvReader.TryGetField(index, out header))
                    {
                        headers.Add(header);
                        index++;
                    }
                }

                var data = new List<List<string>>();
                int parquetIndex = 0;
                bool endOfInput = false;

                // One loop handles both full chunks and the final remainder, so the upload code exists once.
                while (!endOfInput)
                {
                    endOfInput = !csvReader.Read();
                    if (!endOfInput)
                    {
                        var row = new List<string>();
                        string value = null;
                        int index = 0;
                        while (csvReader.TryGetField(index, out value))
                        {
                            if (headers.Count == index)
                            {
                                headers.Add($"Col{index}");
                            }
                            row.Add(value);
                            index++;
                        }
                        data.Add(row);
                    }

                    bool chunkIsFull = data.Count >= etlSettings.NumberOfItemsPerParquet;

                    // At the end of the input, flush whatever is buffered. An empty buffer is only
                    // written when no chunk has been uploaded yet, which avoids a redundant zero-row
                    // parquet file when the row count is an exact multiple of the chunk size.
                    bool flushRemainder = endOfInput && (data.Count > 0 || parquetIndex == 0);

                    if (!chunkIsFull && !flushRemainder)
                    {
                        continue;
                    }

                    var parquetKey = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
                    using (var bufferStream = new MemoryStream())
                    {
                        bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                        // Upload a fresh stream over a copy of the bytes, positioned at the start.
                        await targetS3.Upload(parquetKey, new MemoryStream(bufferStream.ToArray()));
                    }
                    data.Clear();
                    result.Add($"s3://{etlSettings.TargetS3BucketName}/{parquetKey}");
                    parquetIndex++;
                }

                // load partition to athena table
                await awsAthenaAPI.LoadPartition(
                    $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                    $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                    $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");

                // upload the flag file; TransferData checks for this key to decide whether a file was already transferred
                var flagKey = etlSettings.TargetFlagFile(filename);
                await targetS3.Upload(flagKey, new MemoryStream(Encoding.UTF8.GetBytes("OK")));
                result.Add($"s3://{etlSettings.TargetS3BucketName}/{flagKey}");
            }

            return result;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Runs the transfer that corresponds to <c>etlSettings.SourceType</c>.
        /// </summary>
        /// <remarks>
        /// <list type="bullet">
        /// <item><description>SFTP: lists the base path, picks the first matching file (names sorted descending) whose flag file is missing in the target, and transfers it as CSV.</description></item>
        /// <item><description>S3BucketCheck: does nothing.</description></item>
        /// <item><description>S3BucketEvent: validates <c>ExamplePath</c> against the configured patterns, deletes existing target objects under the computed prefix, then transfers the source object as CSV.</description></item>
        /// <item><description>GoogleAnalytics / AmazonAthena: delegate to the by-date transfer helpers.</description></item>
        /// <item><description>AmazonAthenaPipes: runs the query pipes; nothing is added to the result.</description></item>
        /// </list>
        /// </remarks>
        /// <param name="etlSettings">Settings describing the source and the target.</param>
        /// <param name="awsAthenaAPI">Athena API forwarded to the transfer helpers.</param>
        /// <param name="logger">Optional logger; only the ETL mode line is written to it.</param>
        /// <param name="useDate">Optional date forwarded to the BigQuery, Athena and Athena-pipes helpers.</param>
        /// <returns>The list returned by the selected transfer, or an empty list when nothing was transferred.</returns>
        public static async Task<List<string>> TransferData(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, GenericLogger logger = null, DateTime? useDate = null)
        {
            var result = new List<string>();

            logger?.Log?.Invoke($"ETL Mode: {etlSettings.SourceType}");

            switch (etlSettings.SourceType)
            {
                case EtlSourceEnum.SFTP:
                {
                    var sftp      = etlSettings.SFTPSource;
                    var nameRegex = new Regex(sftp.PathRegex);
                    var dateRegex = new Regex(sftp.DateKeyRegex);
                    using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password))
                    {
                        sftpClient.Connect();

                        // Candidates: full path matches the path pattern and the name carries a date key.
                        // Sorted by name descending, so the first candidate is the greatest name.
                        var files = sftpClient.ListDirectory(sftp.BasePath);
                        files = files
                                .Where(f => nameRegex.IsMatch(f.FullName) && dateRegex.IsMatch(f.Name))
                                .OrderByDescending(f => f.Name)
                                .ToList();

                        // find in the target to work out if there is the corresponding flag file
                        var      targetS3 = etlSettings.CreateTargetS3API();
                        SftpFile first    = null;
                        foreach (var file in files)
                        {
                            Console.WriteLine($"Check File: {file.FullName}");
                            var s3Key = etlSettings.TargetFlagFile(file.Name);
                            if (!await targetS3.FileExists(s3Key))
                            {
                                first = file;
                                break;
                            }
                        }

                        // transfer that file (at most one file per call)
                        if (first != null)
                        {
                            Console.WriteLine($"Transfer File: {first.FullName}");
                            var dateKey = first.Name.MakeRegexExtraction(dateRegex);
                            using (var sftpStream = sftpClient.OpenRead(first.FullName))
                            {
                                result = await etlSettings.TransferCsvStream(awsAthenaAPI, sftpStream, dateKey, first.Name, false);
                            }
                        }
                        sftpClient.Disconnect();
                    }
                }
                break;

                case EtlSourceEnum.S3BucketCheck:
                {
                    // intentionally empty: nothing is transferred for this source type
                }
                break;

                case EtlSourceEnum.S3BucketEvent:
                {
                    var sourceAwsS3Api = new AWSS3API(new AWSS3Options()
                    {
                        Key    = etlSettings.S3EventSource.Key,
                        Secret = etlSettings.S3EventSource.Secret,
                        Bucket = etlSettings.S3EventSource.BucketName,
                        Region = etlSettings.S3EventSource.Region
                    });
                    var s3Event   = etlSettings.S3EventSource;
                    var nameRegex = new Regex(s3Event.PathRegex);
                    var keyRegex  = new Regex(s3Event.FileNameRegex);

                    // do nothing if it does not match the path pattern or the file name pattern
                    if (!nameRegex.IsMatch(s3Event.ExamplePath) || !keyRegex.IsMatch(s3Event.ExamplePath))
                    {
                        return result;
                    }

                    // generate dateKey: default is today's UTC date. The invariant culture is passed
                    // explicitly so the key always uses the Gregorian calendar, whatever the
                    // current thread culture is.
                    var dateKey = DateTime.UtcNow.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);

                    Regex dateRegex = null;
                    if (!s3Event.UseEventDateAsDateKey)
                    {
                        // the date key must come from the path itself; give up when it is absent
                        dateRegex = new Regex(s3Event.DateKeyRegex);
                        if (!dateRegex.IsMatch(s3Event.ExamplePath))
                        {
                            return result;
                        }
                        dateKey = s3Event.ExamplePath.MakeRegexExtraction(dateRegex);
                    }

                    // generate file name
                    var filename = s3Event.ExamplePath.MakeRegexExtraction(keyRegex);

                    // it will overwrite by default, so the date key has to be worked out first
                    var prefixUpToDate = etlSettings.MakeTargetS3Prefix(dateKey, filename, true);

                    // check files that should be deleted
                    var targetAwsS3Api = etlSettings.CreateTargetS3API();
                    var oldObjects     = await targetAwsS3Api.ListAllObjectsInBucket(prefix: prefixUpToDate);

                    // delete the files with that prefix
                    foreach (var oldObj in oldObjects)
                    {
                        await targetAwsS3Api.Delete(oldObj.Key);
                    }

                    // open file stream and transfer data
                    using (var awsS3Stream = await sourceAwsS3Api.OpenReadAsync(s3Event.ExamplePath))
                    {
                        result = await etlSettings.TransferCsvStream(awsAthenaAPI, awsS3Stream, dateKey, filename, true);
                    }
                }
                break;

                case EtlSourceEnum.GoogleAnalytics:
                {
                    result = await etlSettings.TransferBigQueryResultByDate(awsAthenaAPI, useDate);
                }
                break;

                case EtlSourceEnum.AmazonAthena:
                {
                    result = await etlSettings.TransferAthenaQueryResultByDate(awsAthenaAPI, useDate);
                }
                break;

                case EtlSourceEnum.AmazonAthenaPipes:
                {
                    // the pipes run for their side effects; the returned list stays empty
                    await etlSettings.RunAthenaQueryPipes(useDate);
                }
                break;
            }

            return result;
        }