コード例 #1
0
        /// <summary>
        /// Executes the configured Athena source query, capped with a <c>limit</c> clause, and stores the
        /// resulting field mappings and a stringified data sample on <paramref name="etlSettings"/>.
        /// </summary>
        /// <param name="etlSettings">
        /// Settings whose <c>AthenaQuerySource</c> supplies the SQL text, date format and day offset.
        /// Its <c>Mappings</c> and <c>Sample</c> properties are overwritten by this call.
        /// </param>
        /// <param name="lines">Value appended to the query as its <c>limit</c> clause.</param>
        /// <exception cref="Exception">Thrown when <c>AthenaQuerySource</c> is null.</exception>
        public static async Task GetAthenaQueryResultSampleByDate(this EtlSettings etlSettings, int lines)
        {
            var source = etlSettings.AthenaQuerySource;
            if (source == null)
            {
                throw new Exception("The ETL has an empty Athena source setting.");
            }

            var sourceApi = etlSettings.CreateSourceAthenaAPI();

            // The reference date is the current local time shifted back by the configured number of days.
            var referenceDate = DateTime.Now.AddDays(-source.DaysAgo);

            // Expand the plain "{date}" placeholder first, then every rgxDateOffset match, whose first
            // capture group is parsed as a day offset relative to the reference date.
            var sql = source.AthenaSQL.Replace("{date}", referenceDate.ToString(source.DateFormat));
            sql = rgxDateOffset.Replace(
                sql,
                match => referenceDate.AddDays(int.Parse(match.Groups[1].Value)).ToString(source.DateFormat));
            sql += $"\nlimit {lines}";

            var queryHandle = await sourceApi.ExecuteQuery(sql);
            var queryResult = await sourceApi.ReadOneResult(queryHandle);

            // Derive the field mappings from the query result.
            etlSettings.Mappings = queryResult.ToFieldMapping();

            // Convert every cell of every returned row to its string form for the sample.
            var sampleRows = new List<DataRow>();
            foreach (var record in queryResult.ReadData())
            {
                sampleRows.Add(new DataRow
                {
                    Items = record.Select(cell => cell.ToString()).ToList()
                });
            }

            etlSettings.Sample = new DataSample
            {
                Rows = sampleRows
            };
        }
コード例 #2
0
        /// <summary>
        /// Runs the configured Athena source query for the target date, writes the result rows to the
        /// target S3 bucket in batches of at most <c>NumberOfItemsPerParquet</c> rows, and then loads the
        /// matching date partition into the target Athena table.
        /// </summary>
        /// <param name="etlSettings">
        /// Settings that supply the source query (<c>AthenaQuerySource</c>), the batch size, and the
        /// target S3 / Athena names used to build keys and the partition location.
        /// </param>
        /// <param name="awsAthenaAPI">Athena client on which <c>LoadPartition</c> is called.</param>
        /// <returns>The <c>s3://</c> URI of every object written, in write order.</returns>
        /// <exception cref="Exception">Thrown when <c>AthenaQuerySource</c> is null.</exception>
        public static async Task<List<string>> TransferAthenaQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
        {
            var result = new List<string>();
            var athena = etlSettings.AthenaQuerySource;

            if (athena == null)
            {
                throw new Exception("The ETL has an empty Athena source setting.");
            }
            var athenaApi = etlSettings.CreateSourceAthenaAPI();

            // The target date is the current local time shifted back by the configured number of days.
            var date = DateTime.Now.AddDays(-athena.DaysAgo);

            // NOTE(review): only the plain "{date}" placeholder is expanded here; the sample method in this
            // file additionally expands rgxDateOffset matches. Confirm whether that difference is intended.
            var query = athena.AthenaSQL.Replace("{date}", date.ToString(athena.DateFormat));
            var dateKey = date.ToString("yyyyMMdd");

            var getResultRequest = await athenaApi.ExecuteQuery(query);

            // Captured by the callback passed to EnumerateRows and reused for every batch written below.
            ResultSetMetadata resultSetMetadata = null;

            var rows = new List<Row>();
            int parquetIndex = 0;

            var targetS3 = etlSettings.CreateTargetS3API();

            // The first row is skipped (presumably the column-header row of the Athena result).
            // foreach also disposes the underlying enumerator, which the manual MoveNext loop did not.
            bool isFirstRow = true;
            foreach (var row in athenaApi.EnumerateRows(getResultRequest, res => resultSetMetadata = res.ResultSet.ResultSetMetadata))
            {
                if (isFirstRow)
                {
                    isFirstRow = false;
                    continue;
                }

                rows.Add(row);
                if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
                {
                    var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
                    await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);

                    result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
                    parquetIndex += 1;

                    // Start a fresh batch. Without this the buffer keeps growing and every subsequent
                    // row would re-write all previously written rows into yet another object.
                    rows.Clear();
                }
            }

            // Write whatever is left over (fewer rows than NumberOfItemsPerParquet).
            if (rows.Count > 0)
            {
                var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
                await targetS3.WriteResultRowsToS3Bucket(rows, resultSetMetadata, etlSettings, s3key);

                result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            }

            // Load the date partition into the target Athena table, pointing at the date's S3 prefix.
            await awsAthenaAPI.LoadPartition(
                $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
                $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
                $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");

            return result;
        }