        /// <summary>
        /// Recreates the daily sales table in Athena and registers two existing date partitions.
        /// </summary>
        public async Task CreateDailySalesTable()
        {
            var salesTableName   = "`productsdb`.`sales`";

            // Drop any previous definition so the schema below always wins.
            var queryDeleteTable = $@"DROP TABLE IF EXISTS {salesTableName}";
            await awsAthenaAPI.ExecuteQuery(queryDeleteTable);

            // External Parquet table over the S3 sales prefix, partitioned by year/month/date.
            var query = $@"CREATE EXTERNAL TABLE IF NOT EXISTS {salesTableName}(
`{nameof(SaleEntry.ItemKey)}` STRING,
`{nameof(SaleEntry.ItemName)}` STRING,
`{nameof(SaleEntry.UnitPrice)}` DOUBLE,
`{nameof(SaleEntry.Quantity)}` DOUBLE,
`{nameof(SaleEntry.Price)}` DOUBLE
)
PARTITIONED BY (
    `year` INT,
    `month` INT,
    `date` INT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
WITH SERDEPROPERTIES (
  'serialization.format' = '1'
)
LOCATION 's3://{awsS3API.Options.Bucket}/sales/'";
            await awsAthenaAPI.ExecuteQuery(query);

            // Register the partitions that already have data in S3.
            await awsAthenaAPI.LoadPartition(salesTableName, "`year`=2019, `month`=5, `date`=26", $"s3://{awsS3API.Options.Bucket}/sales/2019-05-26/");
            await awsAthenaAPI.LoadPartition(salesTableName, "`year`=2019, `month`=5, `date`=25", $"s3://{awsS3API.Options.Bucket}/sales/2019-05-25/");
        }
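LoadPartition itself is not shown in these examples; it is presumably a thin wrapper around Athena's ALTER TABLE ... ADD PARTITION DDL. A minimal sketch of such a helper, assuming an ExecuteQuery method that submits an arbitrary Athena statement and awaits completion (the name and signature here are an assumption, not the library's confirmed API):

        // Hypothetical sketch only: registers a single partition by issuing Athena DDL.
        // partitionSpec is e.g. "`year`=2019, `month`=5, `date`=26".
        public async Task LoadPartition(string tableName, string partitionSpec, string s3Location)
        {
            var ddl = $@"ALTER TABLE {tableName}
ADD IF NOT EXISTS PARTITION ({partitionSpec})
LOCATION '{s3Location}'";
            await ExecuteQuery(ddl);
        }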
Example #2
        /// <summary>
        /// Registers every date-keyed folder under the target S3 prefix as a partition of the Athena table.
        /// </summary>
        public static async Task<List<string>> LoadAllPartitions(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
        {
            var results     = new List<string>();
            var targetS3Api = etlSettings.CreateTargetS3API();

            // Each "folder" directly under the target prefix is named by its date key, e.g. "20190526/".
            var allPaths = await targetS3Api.ListPaths($"{etlSettings.TargetS3Prefix}/", "/");

            foreach (var path in allPaths)
            {
                var dateKey = path.Replace("/", "");
                await awsAthenaAPI.LoadPartition(
                    $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                    $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                    $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");

                results.Add($"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
            }
            return results;
        }
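A hypothetical call site for the extension method above, assuming etlSettings and awsAthenaAPI have already been configured elsewhere:

        // Hypothetical usage: re-register every date partition found under the target prefix.
        var loadedLocations = await etlSettings.LoadAllPartitions(awsAthenaAPI);
        foreach (var location in loadedLocations)
        {
            Console.WriteLine($"Registered partition at {location}");
        }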
        /// <summary>
        /// Runs the source Athena query for the target date, writes the result rows to S3 as Parquet, and loads the date partition.
        /// </summary>
        public static async Task<List<string>> TransferAthenaQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
        {
            var result = new List<string>();
            var athena = etlSettings.AthenaQuerySource;

            if (athena == null)
            {
                throw new Exception("The ETL has an empty Athena source setting.");
            }
            var athenaApi = etlSettings.CreateSourceAthenaAPI();

            // Substitute the {date} placeholder with the run date, formatted as configured.
            var query = athena.AthenaSQL;
            var today = DateTime.Now;
            var date  = today.AddDays(-athena.DaysAgo);

            query = query.Replace("{date}", date.ToString(athena.DateFormat));
            var dateKey = date.ToString("yyyyMMdd");

            var getResultRequest = await athenaApi.ExecuteQuery(query);

            // Stream the result rows, capturing the column metadata from the first result page.
            ResultSetMetadata resultSetMetadata = null;

            var enumerator = athenaApi.EnumerateRows(getResultRequest, res => resultSetMetadata = res.ResultSet.ResultSetMetadata).GetEnumerator();

            List<Row> rows = new List<Row>();

            int parquetIndex = 0;

            var targetS3 = etlSettings.CreateTargetS3API();

            // Skip the header row that Athena returns as the first result row.
            enumerator.MoveNext();
            while (enumerator.MoveNext())
            {
                rows.Add(enumerator.Current);
                if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
                {
                    // Flush a full batch to its own Parquet object, then start a new batch.
                    var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
                    await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);

                    result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                    parquetIndex += 1;
                    rows.Clear();
                }
            }

            // Write whatever rows are left over (fewer than NumberOfItemsPerParquet).
            if (rows.Count > 0)
            {
                var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
                await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);

                result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                parquetIndex += 1;
            }

            {
                // Load the date partition into the Athena table.
                await awsAthenaAPI.LoadPartition(
                    $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                    $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                    $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
            }

            return result;
        }
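The {date} token in AthenaSQL is replaced with the run date formatted by DateFormat before the query is submitted. A hypothetical source setting, assuming DateFormat is "yyyy-MM-dd"; the table and column names below are placeholders for illustration, not part of the library:

        // Hypothetical query template; '{date}' becomes e.g. '2019-05-26' at run time.
        athena.AthenaSQL = @"SELECT itemkey, itemname, SUM(price) AS revenue
FROM productsdb.sales_flat
WHERE salesdate = '{date}'
GROUP BY itemkey, itemname";
        athena.DateFormat = "yyyy-MM-dd";
        athena.DaysAgo    = 1;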
Example #4
        /// <summary>
        /// Runs the configured BigQuery query for the target date, writes the rows to S3 as Parquet, and loads the date partition into Athena.
        /// </summary>
        /// <param name="etlSettings">ETL settings describing the BigQuery source and the S3/Athena target.</param>
        /// <param name="awsAthenaAPI">Athena client used to register the resulting partition.</param>
        /// <returns>The S3 URLs of the objects that were written.</returns>
        public static async Task<List<string>> TransferBigQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
        {
            var result = new List<string>();

            var awsS3Api = etlSettings.CreateTargetS3API();
            var ga       = etlSettings.GoogleAnalyticsQuerySource;

            // The BigQuery client reads its service-account credentials from this environment variable.
            Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}");

            BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId);

            string sql = ga.BigQuerySQL;

            // dateQueryKey uses the configured format for the SQL template; dateKey names the S3/Athena partition.
            string dateQueryKey = DateTime.Now.AddDays(-ga.DaysAgo).ToString(ga.DateFormat);
            string dateKey      = DateTime.Now.AddDays(-ga.DaysAgo).ToString("yyyyMMdd");

            sql = sql.Replace("{date}", dateQueryKey);

            var job = await client.CreateQueryJobAsync(sql, new List<BigQueryParameter>());

            BigQueryResults results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions()
            {
                StartIndex = 0,
                PageSize   = 20000,
            });

            var enumerator = results.GetEnumerator();

            List<BigQueryRow> rows = new List<BigQueryRow>();

            int parquetIndex = 0;

            var targetS3 = etlSettings.CreateTargetS3API();

            while (enumerator.MoveNext())
            {
                rows.Add(enumerator.Current);
                if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
                {
                    // Flush a full batch to its own Parquet object, then start a new batch.
                    var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
                    await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);

                    result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                    parquetIndex += 1;
                    rows.Clear();
                }
            }

            // Write whatever rows are left over (fewer than NumberOfItemsPerParquet).
            if (rows.Count > 0)
            {
                var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
                await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);

                result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                parquetIndex += 1;
            }

            {
                // Load the date partition into the Athena table.
                await awsAthenaAPI.LoadPartition(
                    $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                    $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                    $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
            }

            return result;
        }
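A hypothetical BigQuerySQL setting for a Google Analytics export dataset, where the daily export tables carry a yyyyMMdd suffix, assuming DateFormat is also "yyyyMMdd"; the project and dataset names are placeholders, not values from the library:

        // Hypothetical template; '{date}' becomes e.g. '20190526', selecting that day's export table.
        ga.BigQuerySQL = @"SELECT fullVisitorId, visitStartTime, totals.pageviews
FROM `my-project.my_ga_dataset.ga_sessions_{date}`";
        ga.DateFormat  = "yyyyMMdd";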
Example #5
        /// <summary>
        /// Streams a CSV (optionally gzip-compressed) into one or more Parquet objects on S3, loads the date partition, and writes a completion flag file.
        /// </summary>
        public static async Task<List<string>> TransferCsvStream(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, Stream stream, string dateKey, string filename, bool keepOriginalName)
        {
            var result = new List<string>();
            var config = new CsvConfiguration(CultureInfo.InvariantCulture)
            {
                Delimiter = etlSettings.CsvSourceOptoins.Delimiter
            };

            // Transparently decompress when the source is gzipped.
            var csvStream = stream;

            if (etlSettings.CsvSourceOptoins.GZip)
            {
                csvStream = new GZipStream(stream, CompressionMode.Decompress);
            }

            using (var csvStreamReader = new StreamReader(csvStream))
            {
                using (var csvReader = new CsvReader(csvStreamReader, config))
                {
                    var headers      = new List<string>();
                    int parquetIndex = 0;

                    var targetS3 = etlSettings.CreateTargetS3API();

                    // Read the header row if the source declares one.
                    if (etlSettings.HasHeader)
                    {
                        csvReader.Read();
                        string header = null;
                        int    index  = 0;
                        while (csvReader.TryGetField(index, out header))
                        {
                            headers.Add(header);
                            index++;
                        }
                    }
                    var mappings = etlSettings.Mappings.ToDictionary(m => m.SourceFieldName, m => m);
                    List<List<string>> data = new List<List<string>>();
                    while (csvReader.Read())
                    {
                        int    index = 0;
                        string value = null;
                        var    row   = new List<string>();
                        while (csvReader.TryGetField(index, out value))
                        {
                            // Pad the header list when a row has more columns than the header declared.
                            if (headers.Count == index)
                            {
                                headers.Add($"Col{index}");
                            }
                            row.Add(value);
                            index++;
                        }
                        data.Add(row);
                        if (data.Count >= etlSettings.NumberOfItemsPerParquet)
                        {
                            // Flush a full batch to its own Parquet object.
                            var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
                            using (var bufferStream = new MemoryStream())
                            {
                                bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                                await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
                            }
                            data.Clear();
                            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                            parquetIndex++;
                        }
                    }

                    // Write whatever rows remain (fewer than NumberOfItemsPerParquet).
                    if (data.Count > 0)
                    {
                        var s3key = etlSettings.MakeTargetS3Key(dateKey, filename, keepOriginalName, parquetIndex);
                        using (var bufferStream = new MemoryStream())
                        {
                            bufferStream.WriteParquet(etlSettings.Mappings.Select(m => m.ToParquetField()).ToList(), data);
                            await targetS3.Upload(s3key, new MemoryStream(bufferStream.ToArray()));
                        }
                        data.Clear();
                        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                        parquetIndex++;
                    }
                    {
                        // Load the date partition into the Athena table.
                        await awsAthenaAPI.LoadPartition(
                            $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                            $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                            $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");
                    }
                    {
                        // Upload a flag file so downstream consumers know the transfer is complete.
                        var s3key = etlSettings.TargetFlagFile(filename);
                        await targetS3.Upload(s3key, new MemoryStream(Encoding.UTF8.GetBytes("OK")));

                        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                    }
                }
            }
            return result;
        }