/// <summary>
/// Submits <paramref name="query"/> as a BigQuery query job, polls the job until it has completed,
/// and then fetches the results of that job.
/// </summary>
/// <param name="bigQueryClient">Client used to create the job and to read its results.</param>
/// <param name="query">SQL text of the query; it is submitted without query parameters.</param>
/// <returns>The results of the completed query job.</returns>
public static async Task<BigQueryResults> RunBigQueryAsync(BigQueryClient bigQueryClient, string query)
{
    // No query parameters are supplied: the SQL text is sent exactly as given.
    BigQueryJob submittedJob = await bigQueryClient.CreateQueryJobAsync(sql: query, parameters: null);

    // Only ask for the results once the job has finished running.
    await submittedJob.PollUntilCompletedAsync();

    BigQueryResults queryResults = await bigQueryClient.GetQueryResultsAsync(submittedJob.Reference);
    return queryResults;
}
/// <summary>
/// Runs the configured BigQuery SQL for the target date with a row limit appended, then stores on
/// <paramref name="etlSettings"/> both the field mappings derived from the result set and a
/// string-valued sample of the returned rows.
/// </summary>
/// <param name="etlSettings">
/// Settings supplying the Google Analytics query source. Its <c>Mappings</c> and <c>Sample</c>
/// properties are overwritten by this call.
/// </param>
/// <param name="lines">Row count appended to the SQL as a <c>limit</c> clause.</param>
/// <returns>A task that completes once the mappings and the sample have been stored.</returns>
public static async Task GetBigQueryResultSampleByDate(this EtlSettings etlSettings, int lines)
{
    var ga = etlSettings.GoogleAnalyticsQuerySource;

    // Point GOOGLE_APPLICATION_CREDENTIALS at the configured settings file before the client is
    // created. Note that this changes the environment of the whole process.
    Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}");
    BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId);

    string dateQueryKey = DateTime.Now.AddDays(-ga.DaysAgo).ToString(ga.DateFormat);

    // Substitute the date placeholder and cap the query at the requested number of rows.
    string sql = ga.BigQuerySQL.Replace("{date}", dateQueryKey) + $"\nlimit {lines}";

    var job = await client.CreateQueryJobAsync(sql, new List<BigQueryParameter>());
    BigQueryResults results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions()
    {
        StartIndex = 0,
        PageSize = 20000,
    });

    // foreach (rather than a hand-driven enumerator) guarantees the enumerator is disposed,
    // including when reading the rows throws part-way through.
    List<BigQueryRow> rows = new List<BigQueryRow>();
    foreach (BigQueryRow row in results)
    {
        rows.Add(row);
    }

    // Derive the field mappings from the result set (per the original note, this maps the
    // schema to Athena types).
    etlSettings.Mappings = results.ToFieldMappings();

    // Convert every BigQuery cell into its string representation for the sample.
    var sample = new DataSample() { Rows = new List<DataRow>() };
    foreach (BigQueryRow row in rows)
    {
        sample.Rows.Add(new DataRow()
        {
            Items = row.RawRow.F.Select(item =>
            {
                object value = item.V;
                if (value == null)
                {
                    // Missing values are represented as empty strings.
                    return "";
                }

                if (value is DateTime)
                {
                    // Round-trip ("o") format.
                    return ((DateTime)value).ToString("o");
                }

                if (value is byte[])
                {
                    // Binary payloads are carried as Base64 text.
                    return Convert.ToBase64String((byte[])value);
                }

                return value.ToString();
            }).ToList()
        });
    }

    etlSettings.Sample = sample;
}
/// <summary>
/// Runs the configured BigQuery SQL for the target date, writes the result rows to the target S3
/// bucket in chunks of at most <c>NumberOfItemsPerParquet</c> rows, and then loads the matching
/// date partition into the configured Athena table.
/// </summary>
/// <param name="etlSettings">
/// Settings supplying the Google Analytics query source, the target S3 location, the chunk size
/// and the Athena database/table/partition names.
/// </param>
/// <param name="awsAthenaAPI">Athena API used to load the date partition once the rows are written.</param>
/// <returns>The <c>s3://</c> URIs of every object written, in the order they were written.</returns>
public static async Task<List<string>> TransferBigQueryResultByDate(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI)
{
    var result = new List<string>();
    var ga = etlSettings.GoogleAnalyticsQuerySource;

    // Point GOOGLE_APPLICATION_CREDENTIALS at the configured settings file before the client is
    // created. Note that this changes the environment of the whole process.
    Environment.SetEnvironmentVariable("GOOGLE_APPLICATION_CREDENTIALS", $"{AppContext.BaseDirectory}/{ga.GoogleAnalyticsSettingFile}");
    BigQueryClient client = BigQueryClient.Create(ga.GoogleAnalyticsProjectId);

    // Read the clock once so the query key and the partition key always describe the same day,
    // even when the call straddles midnight.
    DateTime targetDate = DateTime.Now.AddDays(-ga.DaysAgo);
    string dateQueryKey = targetDate.ToString(ga.DateFormat); // value substituted into the SQL
    string dateKey = targetDate.ToString("yyyyMMdd");         // S3 key / Athena partition value

    // Fill the placeholder with the query-formatted date, exactly as
    // GetBigQueryResultSampleByDate does, so that the sample and the transfer run the same query.
    string sql = ga.BigQuerySQL.Replace("{date}", dateQueryKey);

    var job = await client.CreateQueryJobAsync(sql, new List<BigQueryParameter>());
    BigQueryResults results = await client.GetQueryResultsAsync(job.Reference, new GetQueryResultsOptions()
    {
        StartIndex = 0,
        PageSize = 20000,
    });

    List<BigQueryRow> rows = new List<BigQueryRow>();
    int parquetIndex = 0;
    var targetS3 = etlSettings.CreateTargetS3API();

    // foreach (rather than a hand-driven enumerator) guarantees the enumerator is disposed,
    // including when a write fails part-way through.
    foreach (BigQueryRow row in results)
    {
        rows.Add(row);
        if (rows.Count >= etlSettings.NumberOfItemsPerParquet)
        {
            var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
            await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);
            result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
            parquetIndex += 1;

            // Start a fresh chunk. Without this the buffer stays at or above the threshold, so
            // every following row would re-write all accumulated rows under a new key.
            rows.Clear();
        }
    }

    // Write whatever is left over: a final chunk smaller than NumberOfItemsPerParquet.
    if (rows.Count > 0)
    {
        var s3key = etlSettings.MakeTargetS3Key(dateKey, "", false, parquetIndex);
        await targetS3.WriteResultRowsToS3Bucket(rows, results, etlSettings, s3key);
        result.Add($"s3://{etlSettings.TargetS3BucketName}/{s3key}");
        parquetIndex += 1;
    }

    // Load the date partition into the Athena table, pointing it at the date's S3 prefix.
    await awsAthenaAPI.LoadPartition(
        $"`{etlSettings.AthenaDatabaseName}`.`{etlSettings.AthenaTableName}`",
        $"`{etlSettings.DatePartitionKey}` = '{dateKey}'",
        $"s3://{etlSettings.TargetS3BucketName}/{etlSettings.TargetS3Prefix}/{dateKey}/");

    return result;
}