public static async Task ClearAthenaTable(this EtlSettings etlSettings, string tableName, string s3Path)
        {
            if (etlSettings.SourceType != EtlSourceEnum.AmazonAthenaPipes)
            {
                return;
            }
            var pipesSource = etlSettings.AthenaQueryPipesSource;
            var athenaApi   = etlSettings.CreatePipesSourceAthenaAPI();

            Console.WriteLine($"DROP TABLE IF EXISTS {tableName}");
            var executionId = await athenaApi.StartQuery($"DROP TABLE IF EXISTS {tableName}");

            while (!await athenaApi.IsExecutionCompleted(executionId))
            {
                Thread.Sleep(2000);
            }
            var s3Api    = etlSettings.CreatePipesSourceS3API();
            var s3Object = s3Path.ParseS3URI();

            if (s3Object is S3Object)
            {
                Console.WriteLine($"Delete S3: {s3Path}");
                var files = await s3Api.ListFiles(s3Object.Key, "/", s3Object.BucketName);

                await s3Api.Delete(files.Select(key => $"{s3Object.Key}{key}"), s3Object.BucketName);

                Console.WriteLine($"{s3Path}: {files.Count} S3 Files Deleted");
            }
        }
Пример #2
0
        public static async Task DeleteFromS3EventEtlList(this EtlSettings etlSettings, AWSS3API awsS3API, string listKey)
        {
            if (etlSettings.SourceType == EtlSourceEnum.S3BucketEvent)
            {
                var list = new List <S3EventHandler>();

                if (await awsS3API.FileExists(listKey))
                {
                    try
                    {
                        var json = await awsS3API.ReadAsString(listKey);

                        list = JsonConvert.DeserializeObject <List <S3EventHandler> >(json);
                    }
                    catch (Exception ex)
                    {
                        list = new List <S3EventHandler>();
                    }
                }

                // remove any entries that match that name
                list = list.Where(handler => handler.EtlName != etlSettings.Name).ToList();

                // write back to s3
                await awsS3API.UploadAsJson(listKey, list);
            }
        }
Пример #3
0
 public static AWSS3API CreateTargetS3API(this EtlSettings etlSettings)
 {
     return(new AWSS3API(new AWSS3Options()
     {
         Key = etlSettings.TargetAWSKey,
         Secret = etlSettings.TargetAWSSecret,
         Region = etlSettings.TargetS3Region,
         Bucket = etlSettings.TargetS3BucketName
     }));
 }
        public static AWSS3API CreatePipesSourceS3API(this EtlSettings etlSettings)
        {
            var athenaPipes = etlSettings.AthenaQueryPipesSource;

            if (athenaPipes == null)
            {
                throw new Exception("The ETL has an empty Athena source setting.");
            }
            return(new AWSS3API(new AWSS3Options()
            {
                Key = athenaPipes.Key,
                Secret = athenaPipes.Secret,
                Region = athenaPipes.Region
            }));
        }
        public static AWSAthenaAPI CreateSourceAthenaAPI(this EtlSettings etlSettings)
        {
            var athena = etlSettings.AthenaQuerySource;

            if (athena == null)
            {
                throw new Exception("The ETL has an empty Athena source setting.");
            }
            return(new AWSAthenaAPI(new AWSAthenaOptions()
            {
                Key = athena.Key,
                Secret = athena.Secret,
                DefaultOutputLocation = athena.DefaultOutputLocation,
                Region = athena.Region
            }));
        }
Пример #6
0
        public static async Task UpdateS3EventEtlList(this EtlSettings etlSettings, AWSS3API awsS3API, string listKey)
        {
            var list = new List <S3EventHandler>();

            if (await awsS3API.FileExists(listKey))
            {
                try
                {
                    var json = await awsS3API.ReadAsString(listKey);

                    list = JsonConvert.DeserializeObject <List <S3EventHandler> >(json);
                }
                catch (Exception ex)
                {
                    list = new List <S3EventHandler>();
                }
            }

            if (etlSettings.SourceType == EtlSourceEnum.S3BucketEvent)
            {
                // find the key and update
                var found = list.FirstOrDefault(handler => handler.EtlName == etlSettings.Name);
                if (found == null)
                {
                    list.Add(new S3EventHandler()
                    {
                        EtlName    = etlSettings.Name,
                        BucketName = etlSettings.S3EventSource.BucketName,
                        PathRegex  = etlSettings.S3EventSource.PathRegex
                    });
                }
                else
                {
                    found.BucketName = etlSettings.S3EventSource.BucketName;
                    found.PathRegex  = etlSettings.S3EventSource.PathRegex;
                }
            }
            else
            {
                // remove any entries that match that name
                list = list.Where(handler => handler.EtlName != etlSettings.Name).ToList();
            }

            // write back to s3
            await awsS3API.UploadAsJson(listKey, list);
        }
        public static async Task GetAthenaQueryResultSampleByDate(this EtlSettings etlSettings, int lines)
        {
            var athena = etlSettings.AthenaQuerySource;

            if (athena == null)
            {
                throw new Exception("The ETL has an empty Athena source setting.");
            }
            var athenaApi = etlSettings.CreateSourceAthenaAPI();

            var query = athena.AthenaSQL;
            var today = DateTime.Now;
            var date  = today.AddDays(-athena.DaysAgo);

            query = query.Replace("{date}", date.ToString(athena.DateFormat));
            query = rgxDateOffset.Replace(query, m =>
            {
                var offset = int.Parse(m.Groups[1].Value);
                return(date.AddDays(offset).ToString(athena.DateFormat));
            });
            query += $"\nlimit {lines}";
            var getResultRequest = await athenaApi.ExecuteQuery(query);

            var response = await athenaApi.ReadOneResult(getResultRequest);

            etlSettings.Mappings = response.ToFieldMapping();
            // load data schema to the etlsetting schema

            var sample = new DataSample()
            {
                Rows = new List <DataRow>()
            };

            var data = response.ReadData();

            foreach (var row in data)
            {
                var dataRow = new DataRow()
                {
                    Items = row.Select(item => item.ToString()).ToList()
                };
                sample.Rows.Add(dataRow);
            }
            etlSettings.Sample = sample;
        }
Пример #8
0
        public static async Task <List <string> > LoadAllPartitions(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
        {
            var results     = new List <string>();
            var targetS3Api = etlSettings.CreateTargetS3API();

            var allPaths = await targetS3Api.ListPaths($"{etlSettings.TargetS3Prefix}/", "/");

            foreach (var path in allPaths)
            {
                var dateKey = path.Replace("/", "");
                await awsAthenaAPI.LoadPartition(
                    $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                    $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                    $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");

                results.Add($"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
            }
            return(results);
        }
        /// <summary>
        /// run the athena query pipes
        /// </summary>
        /// <param name="etlSettings"></param>
        /// <param name="useDate"></param>
        /// <returns></returns>
        public static async Task RunAthenaQueryPipes(this EtlSettings etlSettings, DateTime?useDate = null)
        {
            if (etlSettings.SourceType != EtlSourceEnum.AmazonAthenaPipes)
            {
                return;
            }

            var pipesSource = etlSettings.AthenaQueryPipesSource;

            AthenaParserSetting parserSetting = new AthenaParserSetting();

            parserSetting.DefaultExportPath = $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}".FixPathEnd();
            parserSetting.DefaultTableName  = $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`";
            parserSetting.Date = useDate == null?DateTime.UtcNow.AddDays(-pipesSource.DaysAgo) : useDate.Value.AddDays(-pipesSource.DaysAgo);

            parserSetting.DateFormat    = pipesSource.DateFormat;
            parserSetting.TempDatabase  = pipesSource.TempDatabase;
            parserSetting.TempTablePath = pipesSource.TempDataPath.FixPathEnd();

            var caches = JsonConvert.DeserializeObject <List <CacheSetting> >(etlSettings.AthenaQueryPipesSource.Caches);

            foreach (var cache in caches)
            {
                if (!cache.S3Path.EndsWith("/"))
                {
                    cache.S3Path += "/";
                }
                parserSetting.Caches.Add(cache.Key, cache);
            }

            var parsed = pipesSource.AthenaSQL.ParseAthenaPipes(parserSetting);

            await etlSettings.ExecuteControlFlow(parsed, parserSetting);

            var athenaApi = etlSettings.CreatePipesSourceAthenaAPI();

            foreach (var kvp in parserSetting.Partitions)
            {
                await athenaApi.LoadPartitionIfNotExists(parserSetting.DefaultTableName, kvp.Key, kvp.Value);
            }
        }
Пример #10
0
        public static async Task <bool> CreateAthenaTable(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
        {
            //create athena database if not exists

            if (etlSettings.AthenaDatabaseName == null || !regexAthena.IsMatch(etlSettings.AthenaDatabaseName))
            {
                throw new Exception($@"Invalid Athena Database Name '{etlSettings.AthenaDatabaseName}'");
            }
            ;
            if (etlSettings.AthenaTableName == null || !regexAthena.IsMatch(etlSettings.AthenaTableName))
            {
                throw new Exception($@"Invalid Athena Table Name '{etlSettings.AthenaDatabaseName}'");
            }
            if (etlSettings.Mappings == null || etlSettings.Mappings.Count == 0)
            {
                throw new Exception($@"No Fields found for ETL Setting '{etlSettings.Name}'");
            }
            await awsAthenaAPI.ExecuteQuery($@"create database if not exists `{etlSettings.AthenaDatabaseName}`");

            // drop the table if it exists
            await awsAthenaAPI.ExecuteQuery($@"drop table if exists `{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`");

            var query = $@"CREATE EXTERNAL TABLE IF NOT EXISTS `{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`(
{etlSettings.MapAthenaFields()}
)
PARTITIONED BY (
    `{etlSettings.DatePartitionKey}` string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES (
  'serialization.format' = '1'
)
LOCATION 's3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/'
";

            await awsAthenaAPI.ExecuteQuery(query);

            return(true);
        }
Пример #11
0
 public static string MakeTargetS3Key(this EtlSettings etlSettings, string dateKey, string filename, bool keepOriginalName, int parquetIndex)
 {
     return($"{etlSettings.MakeTargetS3Prefix(dateKey, filename, keepOriginalName)}{parquetIndex.ToString().PadLeft(5, '0')}.parquet");
 }
        public static async Task <List <string> > TransferAthenaQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
        {
            var result = new List <string>();
            var athena = etlSettings.AthenaQuerySource;

            if (athena == null)
            {
                throw new Exception("The ETL has an empty Athena source setting.");
            }
            var athenaApi = etlSettings.CreateSourceAthenaAPI();

            var query = athena.AthenaSQL;
            var today = DateTime.Now;
            var date  = today.AddDays(-athena.DaysAgo);

            query = query.Replace("{date}", date.ToString(athena.DateFormat));
            var dateKey = date.ToString("yyyyMMdd");

            // var response = await athenaApi.ExecuteQuery(query);

            var getResultRequest = await athenaApi.ExecuteQuery(query);

            //var response = await athenaApi.ReadOneResult(getResultRequest);

            //var enumerator = response.ResultSet.Rows.GetEnumerator();
            ResultSetMetadata resultSetMetadata = null;

            var enumerator = athenaApi.EnumerateRows(getResultRequest, res => resultSetMetadata = res.ResultSet.ResultSetMetadata).GetEnumerator();

            List <Row> rows = new List <Row>();

            int parquetIndex = 0;

            var targetS3 = etlSettings.CreateTargetS3API();

            //skip first row;
            enumerator.MoveNext();
            while (enumerator.MoveNext())
            {
                rows.Add(enumerator.Current);
                if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
                {
                    var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
                    await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);

                    result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                    parquetIndex += 1;
                }
            }

            // write what ever left less than 200000
            if (rows.Count > 0)
            {
                var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
                await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);

                result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                parquetIndex += 1;
            }

            {
                // load partition to athena table
                await awsAthenaAPI.LoadPartition(
                    $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                    $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                    $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
            }

            return(result);
        }
 public static async Task WriteResultRowsToS3Bucket(this AWSS3API awsS3Api, List <Row> rows, ResultSetMetadata metadata, EtlSettings etlSettings, string s3Key)
 {
     using (MemoryStream gaStream = new MemoryStream())
     {
         gaStream.WriteAthenaRowsAsParquet(metadata, etlSettings.Mappings, rows);
         using (MemoryStream uploadStream = new MemoryStream(gaStream.ToArray()))
         {
             await awsS3Api.Upload(s3Key, uploadStream);
         }
     }
     rows.Clear();
 }
Пример #14
0
        public static async Task GetBigQueryResultSampleByDate(this EtlSettings etlSettings, int lines)
        {
            var awsS3Api = etlSettings.CreateTargetS3API();
            var ga       = etlSettings.GoogleAnalyticsQuerySource;

            Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}");

            BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId);

            string sql = ga.BigQuerySQL;

            string dateQueryKey = DateTime.Now.AddDays(-ga.DaysAgo).ToString(ga.DateFormat);

            // make sure the query is limited by 20
            sql = sql.Replace("{date}", dateQueryKey) + $"\nlimit {lines}";

            var job = await client.CreateQueryJobAsync(sql, new List <BigQueryParameter>());

            BigQueryResults results = null;

            results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions()
            {
                StartIndex = 0,
                PageSize   = 20000,
            });

            var enumerator = results.GetEnumerator();

            List <BigQueryRow> rows = new List <BigQueryRow>();

            while (enumerator.MoveNext())
            {
                rows.Add(enumerator.Current);
            }

            // map schema to athena types

            etlSettings.Mappings = results.ToFieldMappings();
            var sample = new DataSample()
            {
                Rows = new List <DataRow>()
            };

            // convert big query data to sample data
            foreach (var row in rows)
            {
                sample.Rows.Add(new DataRow()
                {
                    Items = row.RawRow.F.Select(item =>
                    {
                        if (item.V == null)
                        {
                            return("");
                        }
                        else if (item.V.GetType() == typeof(DateTime))
                        {
                            return(((DateTime)item.V).ToString("o"));
                        }
                        else if (item.V.GetType() == typeof(byte[]))
                        {
                            return(Convert.ToBase64String((byte[])item.V));
                        }
                        else
                        {
                            return(item.V.ToString());
                        }
                    }).ToList()
                });
            }
            etlSettings.Sample = sample;
        }
Пример #15
0
        public static async Task <EtlReportResponse> GetReports(this EtlSettings etlSettings, EtlReportRequest request)
        {
            var awsS3Api = etlSettings.CreateTargetS3API();

            var paths = await awsS3Api.ListPaths(etlSettings.TargetS3Prefix + "/", "/");

            var dateFrom    = DateTime.ParseExact(request.DateFrom, "yyyy-MM-dd", null);
            var dateTo      = DateTime.ParseExact(request.DateTo, "yyyy-MM-dd", null);
            var dateIntFrom = int.Parse(dateFrom.ToString("yyyyMMdd"));
            var dateIntTo   = int.Parse(dateTo.ToString("yyyyMMdd"));

            Regex dateKeyPathRegex = new Regex(@"^(\d+)\/$");

            var dateKeys = paths.Where(p => dateKeyPathRegex.IsMatch(p))
                           .Where(p =>
            {
                var dateKey = dateKeyPathRegex.Match(p).Groups[1].Value;
                var dateInt = int.Parse(dateKey);
                return(dateInt >= dateIntFrom && dateInt <= dateIntTo);
            })
                           .ToList();

            var allObjectsList = await Task.WhenAll(dateKeys.Select(async dateKey =>
            {
                return(await awsS3Api.ListAllObjectsInBucket(prefix: $"{etlSettings.TargetS3Prefix}/{dateKey}"));
            }).ToArray());

            var allObjects = allObjectsList.Aggregate(new List <S3Object>(), (seed, list) =>
            {
                if (list != null)
                {
                    seed.AddRange(list);
                }
                return(seed);
            });

            var partitionKey          = etlSettings.DatePartitionKey;
            var partitionPrefexLength = etlSettings.TargetS3Prefix.Length + 1;
            // read all parquet files

            var allDictLists = await Task.WhenAll(allObjects.Select(async s3Obj =>
            {
                using (var parquetStream = await awsS3Api.OpenReadAsync(s3Obj.Key))
                {
                    var dictList = parquetStream.ReadParquetAdDictData(etlSettings.Mappings.Select(m => m.MappedName).ToList());
                    var relativePath = s3Obj.Key.Substring(partitionPrefexLength);
                    var dateKey = relativePath.Substring(0, relativePath.IndexOf("/"));
                    foreach (var dict in dictList)
                    {
                        dict.Add(partitionKey, dateKey);
                    }
                    return(dictList);
                }
            }).ToArray());

            var resultDictData = allDictLists.Aggregate(new List <Dictionary <string, object> >(), (seed, list) =>
            {
                if (list != null)
                {
                    seed.AddRange(list);
                }
                return(seed);
            });

            var schema = etlSettings.Mappings.Select(m => m.MappedName).ToList();

            schema.Add(partitionKey);

            return(new EtlReportResponse()
            {
                Name = request.Name,
                DateFrom = request.DateFrom,
                DateTo = request.DateTo,
                Schema = schema,
                Data = resultDictData
            });
        }
Пример #16
0
 public static async Task WriteResultRowsToS3Bucket(this AWSS3API awsS3Api, List <BigQueryRow> rows, BigQueryResults results, EtlSettings etlSettings, string s3Key)
 {
     using (MemoryStream gaStream = new MemoryStream())
     {
         gaStream.WriteGARowsAsParquet(results.Schema, etlSettings.Mappings, rows);
         using (MemoryStream uploadStream = new MemoryStream(gaStream.ToArray()))
         {
             await awsS3Api.Upload(s3Key, uploadStream);
         }
     }
     rows.Clear();
 }
Пример #17
0
        /// <summary>
        ///
        /// </summary>
        /// <param name="etlSettings"></param>
        /// <returns></returns>
        public static async Task <List <string> > TransferBigQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
        {
            var result = new List <string>();

            var awsS3Api = etlSettings.CreateTargetS3API();
            var ga       = etlSettings.GoogleAnalyticsQuerySource;

            Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}");

            BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId);

            string sql = ga.BigQuerySQL;

            string dateQueryKey = DateTime.Now.AddDays(-ga.DaysAgo).ToString(ga.DateFormat);
            string dateKey      = DateTime.Now.AddDays(-ga.DaysAgo).ToString("yyyyMMdd");

            sql = sql.Replace("{date}", dateKey);

            var job = await client.CreateQueryJobAsync(sql, new List <BigQueryParameter>());

            BigQueryResults results = null;

            results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions()
            {
                StartIndex = 0,
                PageSize   = 20000,
            });

            var enumerator = results.GetEnumerator();

            List <BigQueryRow> rows = new List <BigQueryRow>();

            int parquetIndex = 0;

            var targetS3 = etlSettings.CreateTargetS3API();

            while (enumerator.MoveNext())
            {
                rows.Add(enumerator.Current);
                if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
                {
                    var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
                    await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);

                    result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                    parquetIndex += 1;
                }
            }

            // write what ever left less than 200000
            if (rows.Count > 0)
            {
                var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
                await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);

                result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                parquetIndex += 1;
            }

            {
                // load partition to athena table
                await awsAthenaAPI.LoadPartition(
                    $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                    $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                    $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
            }

            return(result);
        }
Пример #18
0
 public static string MakeTargetS3Prefix(this EtlSettings etlSettings, string dateKey, string filename, bool keepOriginalName)
 {
     return($"{etlSettings.TargetS3Prefix}/{dateKey}/{(keepOriginalName ? filename + "/" : "")}");
 }
Пример #19
0
 public static string TargetFlagFile(this EtlSettings etlSettings, string filename)
 {
     return($"{etlSettings.TargetS3Prefix}/{filename}");
 }
Пример #20
0
        public static async Task <List <string> > TransferCsvStream(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, Stream stream, string dateKey, string filename, bool keepOriginalName)
        {
            var result = new List <string>();
            var config = new CsvConfiguration(CultureInfo.InvariantCulture)
            {
                Delimiter = etlSettings.CsvSourceOptoins.Delimiter
            };

            var csvStream = stream;

            if (etlSettings.CsvSourceOptoins.GZip)
            {
                csvStream = new GZipStream(stream, CompressionMode.Decompress);
            }

            using (var csvStreamReader = new StreamReader(csvStream))
            {
                using (var csvReader = new CsvReader(csvStreamReader, config))
                {
                    var headers      = new List <string>();
                    int parquetIndex = 0;

                    var targetS3 = etlSettings.CreateTargetS3API();

                    if (etlSettings.HasHeader)
                    {
                        csvReader.Read();
                        string header = null;
                        int    index  = 0;
                        while (csvReader.TryGetField(index, out header))
                        {
                            headers.Add(header);
                            index++;
                        }
                    }
                    var mappings = etlSettings.Mappings.ToDictionary(m => m.SourceFieldName, m => m);
                    List <List <string> > data = new List <List <string> >();
                    while (csvReader.Read())
                    {
                        int    index = 0;
                        string value = null;
                        var    row   = new List <string>();
                        while (csvReader.TryGetField(index, out value))
                        {
                            if (headers.Count == index)
                            {
                                headers.Add($"Col{index}");
                            }
                            row.Add(value);
                            index++;
                        }
                        data.Add(row);
                        if (data.Count >= etlSettings.NumberOfItemsPerParquet)
                        {
                            var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
                            using (var bufferStream = new MemoryStream())
                            {
                                bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                                await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
                            }
                            data.Clear();
                            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                            parquetIndex++;
                        }
                    }
                    {
                        var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
                        using (var bufferStream = new MemoryStream())
                        {
                            bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                            await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
                        }
                        data.Clear();
                        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                        parquetIndex++;
                    }
                    {
                        // load partition to athena table
                        await awsAthenaAPI.LoadPartition(
                            $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                            $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                            $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
                    }
                    {
                        // upload the flag file
                        var s3key = etlSettings.TargetFlagFile(filename);
                        await targetS3.Upload(s3key, new MemoryStream(Encoding.UTF8.GetBytes("OK")));

                        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                    }
                }
            }
            return(result);
        }
Пример #21
0
 private static string MapAthenaFields(this EtlSettings etlSettings)
 {
     return(string.Join(",\n", etlSettings.Mappings.Select(field => $"`{field.MappedName}` {field.MapAthenaField()}")));
 }
Пример #22
0
        public static async Task <List <string> > TransferData(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, GenericLogger logger = null, DateTime?useDate = null)
        {
            var result = new List <string>();

            logger?.Log?.Invoke($"ETL Mode: {etlSettings.SourceType}");

            switch (etlSettings.SourceType)
            {
            case EtlSourceEnum.SFTP:
            {
                var sftp      = etlSettings.SFTPSource;
                var nameRegex = new Regex(sftp.PathRegex);
                var dateRegex = new Regex(sftp.DateKeyRegex);
                using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password))
                {
                    sftpClient.Connect();
                    var files = sftpClient.ListDirectory(sftp.BasePath);
                    files = files
                            .Where(f => nameRegex.IsMatch(f.FullName) && dateRegex.IsMatch(f.Name))
                            .OrderByDescending(f => f.Name)
                            .ToList();
                    // find in the target to work out if there is the corresponding parquet file
                    var      targetS3 = etlSettings.CreateTargetS3API();
                    SftpFile first    = null;
                    foreach (var file in files)
                    {
                        Console.WriteLine($"Check File: {file.FullName}");
                        var s3Key = etlSettings.TargetFlagFile(file.Name);
                        if (!await targetS3.FileExists(s3Key))
                        {
                            first = file;
                            break;
                        }
                    }
                    // transfer that file
                    if (first != null)
                    {
                        Console.WriteLine($"Transfer File: {first.FullName}");
                        var dateKey = first.Name.MakeRegexExtraction(dateRegex);
                        using (var sftpStream = sftpClient.OpenRead(first.FullName))
                        {
                            result = await etlSettings.TransferCsvStream(awsAthenaAPI, sftpStream, dateKey, first.Name, false);
                        }
                    }
                    sftpClient.Disconnect();
                }
            }
            break;

            case EtlSourceEnum.S3BucketCheck:
            {
            }
            break;

            case EtlSourceEnum.S3BucketEvent:
            {
                var sourceAwsS3Api = new AWSS3API(new AWSS3Options()
                    {
                        Key    = etlSettings.S3EventSource.Key,
                        Secret = etlSettings.S3EventSource.Secret,
                        Bucket = etlSettings.S3EventSource.BucketName,
                        Region = etlSettings.S3EventSource.Region
                    });
                var s3Event   = etlSettings.S3EventSource;
                var nameRegex = new Regex(s3Event.PathRegex);
                var keyRegex  = new Regex(s3Event.FileNameRegex);
                // do nothing if it does not match the path pattern
                if (!nameRegex.IsMatch(s3Event.ExamplePath) || (!keyRegex.IsMatch(s3Event.ExamplePath)))
                {
                    return(result);
                }

                // generate dateKey
                var dateKey = DateTime.UtcNow.ToString("yyyyMMdd");

                Regex dateRegex = null;
                if (!s3Event.UseEventDateAsDateKey)
                {
                    dateRegex = new Regex(s3Event.DateKeyRegex);
                    if (!dateRegex.IsMatch(s3Event.ExamplePath))
                    {
                        return(result);
                    }
                    dateKey = s3Event.ExamplePath.MakeRegexExtraction(dateRegex);
                }

                // generate file name

                var filename = s3Event.ExamplePath.MakeRegexExtraction(keyRegex);

                // it will overwrite by default we need to workout datekey first of all
                var prefixUpToDate = etlSettings.MakeTargetS3Prefix(dateKey, filename, true);

                // check files that should be deleted
                var targetAwsS3Api = etlSettings.CreateTargetS3API();
                var oldObjects     = await targetAwsS3Api.ListAllObjectsInBucket(prefix : prefixUpToDate);

                // delete the files with those prefix
                foreach (var oldObj in oldObjects)
                {
                    await targetAwsS3Api.Delete(oldObj.Key);
                }

                // open file stream and transfer data
                using (var awsS3Stream = await sourceAwsS3Api.OpenReadAsync(s3Event.ExamplePath))
                {
                    result = await etlSettings.TransferCsvStream(awsAthenaAPI, awsS3Stream, dateKey, filename, true);
                }
            }
            break;

            case EtlSourceEnum.GoogleAnalytics:
            {
                result = await etlSettings.TransferBigQueryResultByDate(awsAthenaAPI, useDate);
            }
            break;

            case EtlSourceEnum.AmazonAthena:
            {
                result = await etlSettings.TransferAthenaQueryResultByDate(awsAthenaAPI, useDate);
            }
            break;

            case EtlSourceEnum.AmazonAthenaPipes:
            {
                await etlSettings.RunAthenaQueryPipes(useDate);
            }
            break;
            }
            return(result);
        }
        /// <summary>
        /// compile the pipes and run the definition query
        /// </summary>
        /// <param name="etlSettings"></param>
        /// <returns></returns>
        public static async Task ParseAthenaQueryPipes(this EtlSettings etlSettings)
        {
            if (etlSettings.SourceType != EtlSourceEnum.AmazonAthenaPipes)
            {
                return;
            }

            var pipesSource = etlSettings.AthenaQueryPipesSource;
            AthenaParserSetting parserLogger = new AthenaParserSetting();

            parserLogger.DefaultExportPath = $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}".FixPathEnd();
            parserLogger.DefaultTableName  = $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`";
            parserLogger.Date          = DateTime.UtcNow.AddDays(-pipesSource.DaysAgo);
            parserLogger.DateFormat    = pipesSource.DateFormat;
            parserLogger.TempDatabase  = pipesSource.TempDatabase;
            parserLogger.TempTablePath = pipesSource.TempDataPath.FixPathEnd();

            pipesSource.ParseErrors = "";

            if (!string.IsNullOrWhiteSpace(etlSettings.AthenaQueryPipesSource.Caches))
            {
                try
                {
                    var caches = JsonConvert.DeserializeObject <List <CacheSetting> >(etlSettings.AthenaQueryPipesSource.Caches);
                    foreach (var cache in caches)
                    {
                        if (!cache.S3Path.EndsWith("/"))
                        {
                            cache.S3Path += "/";
                        }
                        parserLogger.Caches.Add(cache.Key, cache);
                    }
                }
                catch (Exception ex)
                {
                    pipesSource.ParseErrors += ex.Message;
                    pipesSource.ParseErrors += "\n";
                }
            }

            try
            {
                var parsed = pipesSource.AthenaSQL.ParseAthenaPipes(parserLogger);
                pipesSource.ParsedQuery = parsed.ToQueryString();
            }
            catch (Exception ex)
            {
                pipesSource.ParseErrors  = parserLogger.ToString();
                pipesSource.ParseErrors += "\n";
                pipesSource.ParseErrors += ex.Message;
                pipesSource.ParsedQuery  = "";
            }

            // run if there is definition query
            if (Regex.IsMatch(pipesSource.AthenaDefinitionSQL, @"\S+"))
            {
                var athenaApi        = etlSettings.CreatePipesSourceAthenaAPI();
                var getResultRequest = await athenaApi.ExecuteQuery(pipesSource.AthenaDefinitionSQL);

                var response = await athenaApi.ReadOneResult(getResultRequest);

                etlSettings.Mappings = response.ToFieldMapping();
                // load data schema to the etlsetting schema

                var sample = new DataSample()
                {
                    Rows = new List <DataRow>()
                };

                var data = response.ReadData();
                foreach (var row in data)
                {
                    var dataRow = new DataRow()
                    {
                        Items = row.Select(item => item.ToString()).ToList()
                    };
                    sample.Rows.Add(dataRow);
                }

                etlSettings.Sample = sample;
            }
        }
Пример #24
0
        public static void ReadFromCSVFile(this EtlSettings etlSettings, Stream stream, int lines = 20)
        {
            var newMapptings = new List <FieldMapping>();

            etlSettings.Sample = new DataSample()
            {
                Rows = new List <DataRow>()
            };

            var config = new CsvConfiguration(CultureInfo.InvariantCulture)
            {
                Delimiter = etlSettings.CsvSourceOptoins.Delimiter
            };

            var csvStream = stream;

            if (etlSettings.CsvSourceOptoins.GZip)
            {
                csvStream = new GZipStream(stream, CompressionMode.Decompress);
            }

            using (var streamReader = new StreamReader(csvStream))
            {
                using (var csvReader = new CsvReader(streamReader, config))
                {
                    int numberOfColoumns = 0;
                    if (etlSettings.HasHeader)
                    {
                        if (csvReader.Read())
                        {
                            var value = "";
                            int i     = 0;
                            while (csvReader.TryGetField(i, out value))
                            {
                                newMapptings.Add(new FieldMapping()
                                {
                                    SourceFieldName = value,
                                    MappedName      = value.ToMappedName()
                                });
                                i++;
                            }
                            numberOfColoumns = i;
                        }
                    }
                    int rowCount = 0;
                    while (csvReader.Read() && rowCount < lines)
                    {
                        var value = "";
                        int i     = 0;
                        var row   = new List <string>();
                        while (csvReader.TryGetField(i, out value))
                        {
                            row.Add(value);
                            i++;
                        }
                        numberOfColoumns = numberOfColoumns <= i ? numberOfColoumns : i;
                        etlSettings.Sample.Rows.Add(new DataRow()
                        {
                            Items = row
                        });
                        rowCount++;
                    }
                    if (!etlSettings.HasHeader)
                    {
                        for (int i = 0; i < numberOfColoumns; i++)
                        {
                            var field = new FieldMapping()
                            {
                                SourceFieldName = $"Col{i}",
                                MappedName      = $"Col{i}"
                            };
                            newMapptings.Add(field);
                        }
                    }
                    for (int i = 0; i < newMapptings.Count; i++)
                    {
                        newMapptings[i].MappedType = etlSettings.Sample.Rows
                                                     .Select(row => row.Items.Count > i ? row.Items[i].ToString() : "")
                                                     .DetectTypeString()
                                                     .DetectedTypeToAthenaType();
                    }

                    // update the mappings

                    if (etlSettings.Mappings != null && etlSettings.Mappings.Count > 0)
                    {
                        var oldMappings = etlSettings.Mappings;
                        etlSettings.Mappings = new List <FieldMapping>();
                        for (int i = 0; i < newMapptings.Count; i++)
                        {
                            if (oldMappings.Count > i && oldMappings[i].SourceFieldName == newMapptings[i].SourceFieldName)
                            {
                                etlSettings.Mappings.Add(oldMappings[i]);
                            }
                            else
                            {
                                etlSettings.Mappings.Add(newMapptings[i]);
                            }
                        }
                    }
                    else
                    {
                        etlSettings.Mappings = newMapptings;
                    }
                }
            }
        }
        public static StateMachineQueryContext BuildStateMachineQueryContext(this EtlSettings etlSettings, DateTime?useDate = null)
        {
            if (etlSettings.SourceType != EtlSourceEnum.AmazonAthenaPipes)
            {
                return(null);
            }

            var pipesSource = etlSettings.AthenaQueryPipesSource;

            AthenaParserSetting parserSetting = new AthenaParserSetting();

            parserSetting.DefaultExportPath = $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}".FixPathEnd();
            parserSetting.DefaultTableName  = $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`";
            parserSetting.Date = useDate == null?DateTime.UtcNow.AddDays(-pipesSource.DaysAgo) : useDate.Value.AddDays(-pipesSource.DaysAgo);

            parserSetting.DateFormat    = pipesSource.DateFormat;
            parserSetting.TempDatabase  = pipesSource.TempDatabase;
            parserSetting.TempTablePath = pipesSource.TempDataPath.FixPathEnd();

            var caches = JsonConvert.DeserializeObject <List <CacheSetting> >(etlSettings.AthenaQueryPipesSource.Caches);

            foreach (var cache in caches)
            {
                if (!cache.S3Path.EndsWith("/"))
                {
                    cache.S3Path += "/";
                }
                parserSetting.Caches.Add(cache.Key, cache);
            }

            StateMachineQueryContext context = new StateMachineQueryContext();

            context.raw = pipesSource.AthenaSQL;

            context.settings = new StateMachineSettings()
            {
                DefaultExportPath = parserSetting.DefaultExportPath,
                DefaultTableName  = parserSetting.DefaultTableName,
                Date          = parserSetting.Date,
                DateFormat    = parserSetting.DateFormat,
                TempDatabase  = parserSetting.TempDatabase,
                TempTablePath = parserSetting.TempTablePath,
                Caches        = parserSetting.Caches.Values.ToList(),
                Clearings     = parserSetting.Clearings.Select(kvp => new KeyValueEntry()
                {
                    Key = kvp.Key, Value = kvp.Value
                }).ToList(),
                Commands       = parserSetting.Commands,
                DroppingTables = parserSetting.DroppingTables,
                Partitions     = parserSetting.Partitions.Select(kvp => new KeyValueEntry()
                {
                    Key = kvp.Key, Value = kvp.Value
                }).ToList(),
                Variables = parserSetting.Variables.Select(kvp => new KeyValueEntry()
                {
                    Key = kvp.Key, Value = kvp.Value
                }).ToList()
            };

            //var parsed = pipesSource.AthenaSQL.ParseAthenaPipes(parserSetting);

            return(context);

            //var athenaApi = etlSettings.CreatePipesSourceAthenaAPI();
            //foreach (var kvp in parserSetting.Partitions)
            //{
            //    await athenaApi.LoadPartitionIfNotExists(parserSetting.DefaultTableName, kvp.Key, kvp.Value);
            //}
        }
Пример #26
0
        public static async Task <EtlSettings> ReadEtlSampleData(this EtlSettings etlSettings, int lines = 20)
        {
            etlSettings.Sample = new DataSample();

            switch (etlSettings.SourceType)
            {
            case EtlSourceEnum.SFTP:
            {
                var sftp      = etlSettings.SFTPSource;
                var nameRegex = new Regex(sftp.PathRegex);
                using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password))
                {
                    sftpClient.Connect();
                    var files = sftpClient.ListDirectory(sftp.BasePath);
                    files = files.Where(f => nameRegex.IsMatch(f.FullName)).ToList();
                    var first = files.FirstOrDefault();
                    if (first != null)
                    {
                        switch (etlSettings.FileType)
                        {
                        case EtlFileType.CSV:
                        {
                            using (var sftpStream = sftpClient.OpenRead(first.FullName))
                            {
                                etlSettings.ReadFromCSVFile(sftpStream, lines);
                            }
                        }
                        break;
                        }
                    }
                    sftpClient.Disconnect();
                }
            }
            break;

            case EtlSourceEnum.S3BucketCheck:
            {
                var s3       = etlSettings.S3CheckSource;
                var awsS3API = new AWSS3API(new AWSS3Options()
                    {
                        Key    = s3.Key,
                        Secret = s3.Secret,
                        Bucket = s3.BucketName,
                        Region = s3.Region,
                    });
                var objects = await awsS3API.ListAllObjectsInBucket(s3.BucketName, s3.Prefix);

                var nameRegex = new Regex(s3.PathRegex);
                objects = objects.Where(f => nameRegex.IsMatch(f.Key)).ToList();
                var first = objects.FirstOrDefault();
                if (first != null)
                {
                    switch (etlSettings.FileType)
                    {
                    case EtlFileType.CSV:
                    {
                        using (var s3Stream = await awsS3API.OpenReadAsync(first.Key, first.BucketName))
                        {
                            etlSettings.ReadFromCSVFile(s3Stream, lines);
                        }
                    }
                    break;
                    }
                }
            }
            break;

            case EtlSourceEnum.S3BucketEvent:
            {
                var s3       = etlSettings.S3EventSource;
                var awsS3API = new AWSS3API(new AWSS3Options()
                    {
                        Key    = s3.Key,
                        Secret = s3.Secret,
                        Bucket = s3.BucketName,
                        Region = s3.Region,
                    });
                if (await awsS3API.FileExists(s3.ExamplePath, s3.BucketName))
                {
                    switch (etlSettings.FileType)
                    {
                    case EtlFileType.CSV:
                    {
                        using (var s3Stream = await awsS3API.OpenReadAsync(s3.ExamplePath, s3.BucketName))
                        {
                            etlSettings.ReadFromCSVFile(s3Stream, lines);
                        }
                    }
                    break;
                    }
                }
            }
            break;

            case EtlSourceEnum.GoogleAnalytics:
            {
                await etlSettings.GetBigQueryResultSampleByDate(lines);
            }
            break;

            case EtlSourceEnum.AmazonAthena:
            {
                await etlSettings.GetAthenaQueryResultSampleByDate(lines);
            }
            break;

            case EtlSourceEnum.AmazonAthenaPipes:
            {
                // need to compile the query
                await etlSettings.ParseAthenaQueryPipes();
            }
            break;
            }

            // make the sample data smaller
            foreach (var row in etlSettings.Sample.Rows.ToList())
            {
                row.Items = row.Items.Select(item => item.Length < 100 ? item : item.Substring(0, 50) + "..." + item.Substring(item.Length - 50)).ToList();
            }

            return(etlSettings);
        }