/// <summary>
/// Builds the transfer core by storing the supplied S3 options, S3 API and SFTP client on the instance.
/// </summary>
/// <param name="awsS3Options">S3 options kept in the <c>awsS3Options</c> field.</param>
/// <param name="awsS3">S3 API wrapper kept in the <c>awsS3</c> field.</param>
/// <param name="sftpClient">SFTP client kept in the <c>sftpClient</c> field.</param>
public DataTransferCore(AWSS3Options awsS3Options, AWSS3API awsS3, SftpClient sftpClient)
 {
     // plain field wiring; the three assignments are independent of each other
     this.sftpClient   = sftpClient;
     this.awsS3        = awsS3;
     this.awsS3Options = awsS3Options;
 }
Beispiel #2
0
 /// <summary>
 /// Opens the S3 object identified by <paramref name="key"/> / <paramref name="bucket"/> for reading
 /// and returns what the stream-level <c>ReadParquet</c> helper produces for <typeparamref name="T"/>.
 /// </summary>
 /// <param name="awsS3">S3 API used to open the object.</param>
 /// <param name="key">Object key passed to <c>OpenReadAsync</c>.</param>
 /// <param name="bucket">Bucket passed to <c>OpenReadAsync</c>; <c>null</c> is forwarded unchanged.</param>
 /// <returns>The items read from the stream.</returns>
 public static async Task <List <T> > ReadParquet <T>(this AWSS3API awsS3, string key, string bucket = null) where T : class, new()
 {
     List <T> parsedItems;

     using (Stream source = await awsS3.OpenReadAsync(key, bucket))
     {
         // the stream is fully consumed here, before it is disposed
         parsedItems = source.ReadParquet <T>();
     }

     return parsedItems;
 }
Beispiel #3
0
 /// <summary>
 /// Opens the S3 object identified by <paramref name="key"/> / <paramref name="bucket"/> for reading
 /// and returns what the stream-level <c>ReadCsv</c> helper produces for <typeparamref name="T"/>.
 /// </summary>
 /// <param name="awsS3">S3 API used to open the object.</param>
 /// <param name="key">Object key passed to <c>OpenReadAsync</c>.</param>
 /// <param name="bucket">Bucket passed to <c>OpenReadAsync</c>; <c>null</c> is forwarded unchanged.</param>
 /// <param name="configuration">CSV configuration forwarded to the stream reader; <c>null</c> is forwarded unchanged.</param>
 /// <returns>The records read from the stream.</returns>
 public static async Task <List <T> > ReadCsv <T>(this AWSS3API awsS3, string key, string bucket = null, CsvConfiguration configuration = null) where T : class
 {
     List <T> records;

     using (Stream source = await awsS3.OpenReadAsync(key, bucket))
     {
         // the stream is fully consumed here, before it is disposed
         records = source.ReadCsv <T>(configuration);
     }

     return records;
 }
Beispiel #4
0
        /// <summary>
        /// Parses <paramref name="s3Uri"/> into key and bucket, opens that object for reading and
        /// returns what the stream-level <c>ReadParquet</c> helper produces for <typeparamref name="T"/>.
        /// </summary>
        /// <param name="awsS3">S3 API used to open the object.</param>
        /// <param name="s3Uri">URI handed to <c>ParseS3URI</c> to obtain the key and bucket name.</param>
        /// <returns>The items read from the stream.</returns>
        public static async Task <List <T> > ReadParquet <T>(this AWSS3API awsS3, string s3Uri) where T : class, new()
        {
            var location = s3Uri.ParseS3URI();
            List <T> parsedItems;

            using (Stream source = await awsS3.OpenReadAsync(location.Key, location.BucketName))
            {
                // the stream is fully consumed here, before it is disposed
                parsedItems = source.ReadParquet <T>();
            }

            return parsedItems;
        }
Beispiel #5
0
 /// <summary>
 /// Serializes <paramref name="items"/> as parquet into memory and uploads the resulting bytes
 /// to <paramref name="key"/> / <paramref name="bucket"/>.
 /// </summary>
 /// <param name="awsS3">S3 API used for the upload.</param>
 /// <param name="items">Items handed to the stream-level <c>WriteParquet</c> helper.</param>
 /// <param name="key">Target object key.</param>
 /// <param name="bucket">Target bucket; <c>null</c> is forwarded unchanged.</param>
 public static async Task WriteParquet <T>(this AWSS3API awsS3, IEnumerable <T> items, string key, string bucket = null) where T : class
 {
     byte[] parquetBytes;

     using (MemoryStream parquetBuffer = new MemoryStream())
     {
         parquetBuffer.WriteParquet(items);
         parquetBytes = parquetBuffer.ToArray();
     }

     // upload from a fresh stream over the copied bytes, as the original did
     using (MemoryStream uploadSource = new MemoryStream(parquetBytes))
     {
         await awsS3.Upload(key, uploadSource, bucket);
     }
 }
 /// <summary>
 /// Resolves the S3 API, Athena API and client options from the application's service container,
 /// initializes the window and binds the tree / tab item sources before starting the S3 load.
 /// </summary>
 public MainWindow()
 {
     var currentApp = App.Current as App;

     componentContext = currentApp.Services;

     // resolve collaborators from the container
     s3      = componentContext.Resolve <AWSS3API>();
     athena  = componentContext.Resolve <AWSAthenaAPI>();
     options = componentContext.Resolve <AthenaClientOptions>();

     // controls only exist after InitializeComponent, so binding comes afterwards
     InitializeComponent();
     s3Tree.ItemsSource     = S3ItemsSource;
     tabQueries.ItemsSource = FormatedQuerySource;

     LoadS3();
 }
Beispiel #7
0
 /// <summary>
 /// Serializes the column-described rows as parquet into memory and uploads the resulting bytes
 /// to <paramref name="key"/> / <paramref name="bucket"/>.
 /// </summary>
 /// <param name="awsS3">S3 API used for the upload.</param>
 /// <param name="headers">Column headers forwarded to the stream-level <c>WriteParquet</c> helper.</param>
 /// <param name="types">Column types forwarded to the stream-level <c>WriteParquet</c> helper.</param>
 /// <param name="data">Row values forwarded to the stream-level <c>WriteParquet</c> helper.</param>
 /// <param name="key">Target object key.</param>
 /// <param name="bucket">Target bucket; <c>null</c> is forwarded unchanged.</param>
 public static async Task WriteParquet(this AWSS3API awsS3, List <string> headers, List <Type> types, IEnumerable <List <object> > data, string key, string bucket = null)
 {
     byte[] parquetBytes;

     using (MemoryStream parquetBuffer = new MemoryStream())
     {
         parquetBuffer.WriteParquet(headers, types, data);
         parquetBytes = parquetBuffer.ToArray();
     }

     // upload from a fresh stream over the copied bytes, as the original did
     using (MemoryStream uploadSource = new MemoryStream(parquetBytes))
     {
         await awsS3.Upload(key, uploadSource, bucket);
     }
 }
 /// <summary>
 /// Writes the Athena result rows as parquet into memory, uploads the bytes to
 /// <paramref name="s3Key"/> and then empties <paramref name="rows"/>.
 /// </summary>
 /// <param name="awsS3Api">S3 API used for the upload.</param>
 /// <param name="rows">Rows to serialize; cleared after a successful upload.</param>
 /// <param name="metadata">Result-set metadata forwarded to <c>WriteAthenaRowsAsParquet</c>.</param>
 /// <param name="etlSettings">Settings whose <c>Mappings</c> are forwarded to <c>WriteAthenaRowsAsParquet</c>.</param>
 /// <param name="s3Key">Target object key.</param>
 public static async Task WriteResultRowsToS3Bucket(this AWSS3API awsS3Api, List <Row> rows, ResultSetMetadata metadata, EtlSettings etlSettings, string s3Key)
 {
     byte[] parquetBytes;

     using (MemoryStream parquetBuffer = new MemoryStream())
     {
         parquetBuffer.WriteAthenaRowsAsParquet(metadata, etlSettings.Mappings, rows);
         parquetBytes = parquetBuffer.ToArray();
     }

     using (MemoryStream uploadSource = new MemoryStream(parquetBytes))
     {
         await awsS3Api.Upload(s3Key, uploadSource);
     }

     // only reached when serialization and upload both succeeded
     rows.Clear();
 }
Beispiel #9
0
 /// <summary>
 /// Writes the BigQuery result rows as parquet into memory, uploads the bytes to
 /// <paramref name="s3Key"/> and then empties <paramref name="rows"/>.
 /// </summary>
 /// <param name="awsS3Api">S3 API used for the upload.</param>
 /// <param name="rows">Rows to serialize; cleared after a successful upload.</param>
 /// <param name="results">BigQuery results whose <c>Schema</c> is forwarded to <c>WriteGARowsAsParquet</c>.</param>
 /// <param name="etlSettings">Settings whose <c>Mappings</c> are forwarded to <c>WriteGARowsAsParquet</c>.</param>
 /// <param name="s3Key">Target object key.</param>
 public static async Task WriteResultRowsToS3Bucket(this AWSS3API awsS3Api, List <BigQueryRow> rows, BigQueryResults results, EtlSettings etlSettings, string s3Key)
 {
     byte[] parquetBytes;

     using (MemoryStream parquetBuffer = new MemoryStream())
     {
         parquetBuffer.WriteGARowsAsParquet(results.Schema, etlSettings.Mappings, rows);
         parquetBytes = parquetBuffer.ToArray();
     }

     using (MemoryStream uploadSource = new MemoryStream(parquetBytes))
     {
         await awsS3Api.Upload(s3Key, uploadSource);
     }

     // only reached when serialization and upload both succeeded
     rows.Clear();
 }
Beispiel #10
0
        /// <summary>
        /// Appends <paramref name="lines"/> to the text object at <paramref name="s3Uri"/> by reading the
        /// current lines (when the object exists), adding the new ones and writing the whole content back
        /// joined with "\n" as UTF-8.
        /// </summary>
        /// <param name="awsS3">S3 API used for the existence check, the read and the write.</param>
        /// <param name="s3Uri">URI of the object to append to.</param>
        /// <param name="lines">Lines added after the existing content.</param>
        /// <remarks>
        /// This is a read-modify-write of the whole object, not an atomic append.
        /// A failure while reading an existing object now propagates instead of being swallowed:
        /// swallowing it would overwrite the object with only the new lines and lose the old content.
        /// </remarks>
        public static async Task AppendLines(this AWSS3API awsS3, string s3Uri, IEnumerable <string> lines)
        {
            List <string> list = new List <string>();

            // A missing object is the only case that should start from an empty list.
            if (await awsS3.Exists(s3Uri))
            {
                list.AddRange(await awsS3.ReadLines(s3Uri));
            }
            list.AddRange(lines);
            await awsS3.Put(s3Uri, Encoding.UTF8.GetBytes(string.Join("\n", list)));
        }
Beispiel #11
0
        /// <summary>
        /// Serializes <paramref name="items"/> as parquet into memory and uploads the resulting bytes
        /// to the key and bucket parsed from <paramref name="s3Uri"/>.
        /// </summary>
        /// <param name="awsS3">S3 API used for the upload.</param>
        /// <param name="items">Items handed to the stream-level <c>WriteParquet</c> helper.</param>
        /// <param name="s3Uri">URI handed to <c>ParseS3URI</c> to obtain the target key and bucket name.</param>
        public static async Task WriteParquet <T>(this AWSS3API awsS3, IEnumerable <T> items, string s3Uri) where T : class
        {
            var target = s3Uri.ParseS3URI();
            byte[] parquetBytes;

            using (MemoryStream parquetBuffer = new MemoryStream())
            {
                parquetBuffer.WriteParquet(items);
                parquetBytes = parquetBuffer.ToArray();
            }

            // upload from a fresh stream over the copied bytes, as the original did
            using (MemoryStream uploadSource = new MemoryStream(parquetBytes))
            {
                await awsS3.Upload(target.Key, uploadSource, target.BucketName);
            }
        }
Beispiel #12
0
        /// <summary>
        /// Serializes the column-described rows as parquet into memory and uploads the resulting bytes
        /// to the key and bucket parsed from <paramref name="s3Uri"/>.
        /// </summary>
        /// <param name="awsS3">S3 API used for the upload.</param>
        /// <param name="headers">Column headers forwarded to the stream-level <c>WriteParquet</c> helper.</param>
        /// <param name="types">Column types forwarded to the stream-level <c>WriteParquet</c> helper.</param>
        /// <param name="data">Row values forwarded to the stream-level <c>WriteParquet</c> helper.</param>
        /// <param name="s3Uri">URI handed to <c>ParseS3URI</c> to obtain the target key and bucket name.</param>
        public static async Task WriteParquet(this AWSS3API awsS3, List <string> headers, List <Type> types, IEnumerable <List <object> > data, string s3Uri)
        {
            var target = s3Uri.ParseS3URI();
            byte[] parquetBytes;

            using (MemoryStream parquetBuffer = new MemoryStream())
            {
                parquetBuffer.WriteParquet(headers, types, data);
                parquetBytes = parquetBuffer.ToArray();
            }

            // upload from a fresh stream over the copied bytes, as the original did
            using (MemoryStream uploadSource = new MemoryStream(parquetBytes))
            {
                await awsS3.Upload(target.Key, uploadSource, target.BucketName);
            }
        }
 /// <summary>
 /// Stores the deploy options, creates the S3 API from them and, when a CloudFront distribution id
 /// is configured, also creates the CloudFront API.
 /// </summary>
 /// <param name="cloudFrontDeployOptions">
 /// Deploy options. A <c>null</c> <c>S3BasePath</c> is replaced by an empty string on the passed instance.
 /// </param>
 public CloudFrontDeployAPI(CloudFrontDeployOptions cloudFrontDeployOptions)
 {
     // normalise the base path on the caller's options object before keeping a reference to it
     if (cloudFrontDeployOptions.S3BasePath == null)
     {
         cloudFrontDeployOptions.S3BasePath = "";
     }

     this.cloudFrontDeployOptions = cloudFrontDeployOptions;
     awsS3API = new AWSS3API(cloudFrontDeployOptions.AWSS3Options);

     // the CloudFront API is only created when a distribution id is present
     bool distributionMissing = string.IsNullOrWhiteSpace(cloudFrontDeployOptions.CloudFrontDistributionId);
     if (distributionMissing)
     {
         return;
     }

     awsCloudFrontAPI = new AWSCloudFrontAPI(cloudFrontDeployOptions.AWSCloudFrontOptions);
 }
Beispiel #14
0
        /// <summary>
        /// Synchronizes the S3 event handler list stored at <paramref name="listKey"/> with
        /// <paramref name="etlSettings"/>: for an S3 bucket event source the matching entry (by ETL name)
        /// is added or updated, for any other source type all entries with that ETL name are removed.
        /// The list is then written back as JSON.
        /// </summary>
        /// <param name="etlSettings">ETL settings providing the name, source type and S3 event source values.</param>
        /// <param name="awsS3API">S3 API used to read and write the list.</param>
        /// <param name="listKey">Key of the JSON list object.</param>
        /// <remarks>
        /// Reading the existing list is best-effort: if it cannot be read or parsed, an empty list is used.
        /// A list file that deserializes to <c>null</c> (for example an empty or "null" document) is also
        /// treated as empty instead of causing a <see cref="NullReferenceException"/>.
        /// </remarks>
        public static async Task UpdateS3EventEtlList(this EtlSettings etlSettings, AWSS3API awsS3API, string listKey)
        {
            List <S3EventHandler> list = null;

            if (await awsS3API.FileExists(listKey))
            {
                try
                {
                    var json = await awsS3API.ReadAsString(listKey);

                    list = JsonConvert.DeserializeObject <List <S3EventHandler> >(json);
                }
                catch (Exception)
                {
                    // deliberate best-effort (kept from the original): fall back to an empty list below
                    list = null;
                }
            }

            // DeserializeObject may hand back null; never continue with a null list
            list = list ?? new List <S3EventHandler>();

            if (etlSettings.SourceType == EtlSourceEnum.S3BucketEvent)
            {
                // find the key and update
                var found = list.FirstOrDefault(handler => handler.EtlName == etlSettings.Name);
                if (found == null)
                {
                    list.Add(new S3EventHandler()
                    {
                        EtlName    = etlSettings.Name,
                        BucketName = etlSettings.S3EventSource.BucketName,
                        PathRegex  = etlSettings.S3EventSource.PathRegex
                    });
                }
                else
                {
                    found.BucketName = etlSettings.S3EventSource.BucketName;
                    found.PathRegex  = etlSettings.S3EventSource.PathRegex;
                }
            }
            else
            {
                // remove any entries that match that name
                list = list.Where(handler => handler.EtlName != etlSettings.Name).ToList();
            }

            // write back to s3
            await awsS3API.UploadAsJson(listKey, list);
        }
Beispiel #15
0
        /// <summary>
        /// Lists every object under <paramref name="prefix"/>, reads each one as parquet concurrently and
        /// returns all items in a single list, in the order the objects were listed.
        /// </summary>
        /// <param name="awsS3">S3 API used for listing and reading.</param>
        /// <param name="prefix">Prefix passed to <c>ListAllObjectsInBucket</c>.</param>
        /// <param name="bucket">Bucket passed to <c>ListAllObjectsInBucket</c>; <c>null</c> is forwarded unchanged.</param>
        /// <returns>The concatenated items of all partitions.</returns>
        public static async Task <List <T> > ReadParquetPartitions <T>(this AWSS3API awsS3, string prefix, string bucket = null) where T : class, new()
        {
            var partitionObjects = await awsS3.ListAllObjectsInBucket(bucket : bucket, prefix : prefix);

            // ToArray starts every read before any of them is awaited
            var pendingReads = partitionObjects
                               .Select(async obj => await awsS3.ReadParquet <T>(obj.Key, obj.BucketName))
                               .ToArray();

            var merged = new List <T>();

            foreach (var partitionItems in await Task.WhenAll(pendingReads))
            {
                merged.AddRange(partitionItems);
            }

            return merged;
        }
Beispiel #16
0
        /// <summary>
        /// Splits <paramref name="items"/> into partitions and writes one parquet file per non-empty
        /// partition, running the uploads concurrently.
        /// </summary>
        /// <typeparam name="T">Item type written to parquet.</typeparam>
        /// <param name="awsS3">S3 API used for the uploads.</param>
        /// <param name="items">Items to split and write.</param>
        /// <param name="partitionSize">Size passed to <c>SplitIntoPartitions</c>.</param>
        /// <param name="digitCount">Width the partition index is left-padded to (with '0') in the file name.</param>
        /// <param name="key">Base key; whatever <c>parquetSuffix</c> matches is removed before the index is appended.</param>
        /// <param name="bucket">Target bucket; <c>null</c> is forwarded unchanged.</param>
        /// <returns>The keys of the files that were written, in partition order.</returns>
        /// <remarks>
        /// Each task returns its own key and the list is built after all tasks finished. The previous
        /// version added to a shared <see cref="List{T}"/> from concurrently running continuations,
        /// which is not thread-safe.
        /// </remarks>
        public static async Task <List <string> > WriteParquetPerPartition <T>(this AWSS3API awsS3, IEnumerable <T> items, int partitionSize, int digitCount,
                                                                               string key, string bucket = null) where T : class
        {
            key = parquetSuffix.Replace(key, "");
            var writeTasks = items.SplitIntoPartitions(partitionSize).Select(async(partition, index) =>
            {
                // null marks "nothing written" for an empty partition
                string fileKey = null;
                if (partition.Count > 0)
                {
                    fileKey = $"{key}-{index.ToString().PadLeft(digitCount, '0')}.parquet";
                    await awsS3.WriteParquet(partition, fileKey, bucket);
                }
                return fileKey;
            }).ToArray();
            var writtenKeys = await Task.WhenAll(writeTasks);

            return(writtenKeys.Where(fileKey => fileKey != null).ToList());
        }
Beispiel #17
0
        /// <summary>
        /// Process S3 Etl Event: looks up the handler registered for the event's bucket and key, loads the
        /// corresponding ETL settings and runs the data transfer for the event file.
        /// </summary>
        /// <param name="reportingAwsS3Api">The S3 bucket access for the reporting settings</param>
        /// <param name="listKey">s3 event list handler json key</param>
        /// <param name="etlPrefix">etl settings prefix; the settings key is built as prefix + ETL name + ".json"</param>
        /// <param name="awsAthenaAPI">The target athena</param>
        /// <param name="bucketName">event source bucket name</param>
        /// <param name="s3FileKey">event source s3 key</param>
        /// <param name="logger">Optional logger; when given, the handler list JSON and the ETL settings key are logged.</param>
        /// <returns>
        /// A message describing why nothing was processed (missing list, no matching handler, missing
        /// settings), otherwise the transfer results joined with "\n".
        /// </returns>
        public static async Task <string> ProcessS3EtlEvent(this AWSS3API reportingAwsS3Api, string listKey, string etlPrefix,
                                                            AWSAthenaAPI awsAthenaAPI, string bucketName, string s3FileKey, GenericLogger logger = null)
        {
            if (!await reportingAwsS3Api.FileExists(listKey))
            {
                return($"event handler setting does not exist: '{listKey}'");
            }

            var json = await reportingAwsS3Api.ReadAsString(listKey);

            logger?.Log?.Invoke(json);
            var list = JsonConvert.DeserializeObject <List <S3EventHandler> >(json);

            // DeserializeObject may return null (e.g. empty or "null" document); treat that as "no handler"
            var found = list?.FirstOrDefault(handler => handler.BucketName == bucketName && Regex.IsMatch(s3FileKey, handler.PathRegex));

            if (found == null)
            {
                return($"event handler not found for object: 's3://{bucketName}/{s3FileKey}'");
            }

            var etlkey = $"{etlPrefix}{found.EtlName}.json";

            logger?.Log?.Invoke($"Find ETL setting: {etlkey}");
            if (!await reportingAwsS3Api.FileExists(etlkey))
            {
                return($"etl setting does not exist: '{etlkey}'");
            }

            var jsonEtl = await reportingAwsS3Api.ReadAsString(etlkey);

            var etlSettings = JsonConvert.DeserializeObject <EtlSettings>(jsonEtl);

            // the event's file key becomes the ExamplePath, which is the path the S3 event transfer reads
            etlSettings.S3EventSource.ExamplePath = s3FileKey;

            var results = await etlSettings.TransferData(awsAthenaAPI);

            return(string.Join("\n", results));
        }
Beispiel #18
0
        /// <summary>
        /// Loads the decrypted options from the container's encrypted options, shuts the application down
        /// when none are available, and otherwise creates the S3 / Athena APIs, initializes the window and
        /// binds the tree, tab and job-list item sources before starting the S3 load.
        /// </summary>
        public MainWindow()
        {
            var currentApp = App.Current as App;

            componentContext = currentApp.Services;

            var encryptedOptions = componentContext.Resolve <EncryptedOptions>();
            var decryptedOptions = LoadOptions(encryptedOptions);

            // without usable options the window cannot work: request shutdown and stop constructing
            if (decryptedOptions == null)
            {
                Application.Current.Shutdown();
                return;
            }

            s3      = new AWSS3API(decryptedOptions.AWSS3Options);
            athena  = new AWSAthenaAPI(decryptedOptions.AWSAthenaOptions);
            options = decryptedOptions.AthenaClientOptions;

            // controls only exist after InitializeComponent, so binding comes afterwards
            InitializeComponent();
            s3Tree.ItemsSource     = S3ItemsSource;
            tabQueries.ItemsSource = FormatedQuerySource;
            dgJobList.ItemsSource  = QueryTasks;

            LoadS3();
        }
        /// <summary>
        /// Drops the Athena table <paramref name="tableName"/> (if it exists), waits for that query to
        /// complete and then deletes the files listed under <paramref name="s3Path"/>.
        /// </summary>
        /// <param name="athenaApi">Athena API used to start and poll the DROP TABLE query.</param>
        /// <param name="awsS3Api">S3 API used to list and delete the table's files.</param>
        /// <param name="tableName">Table name inserted verbatim into the DROP TABLE statement.</param>
        /// <param name="s3Path">S3 URI of the table data; nothing is deleted when it does not parse to an <c>S3Object</c>.</param>
        /// <remarks>
        /// The completion poll waits with <see cref="Task.Delay(int)"/> instead of <c>Thread.Sleep</c>, so
        /// the calling thread is not blocked while the query runs.
        /// </remarks>
        public static async Task ClearAthenaTable(this AWSAthenaAPI athenaApi, AWSS3API awsS3Api, string tableName, string s3Path)
        {
            var dropStatement = $"DROP TABLE IF EXISTS {tableName}";

            Console.WriteLine(dropStatement);
            var executionId = await athenaApi.StartQuery(dropStatement);

            // poll every two seconds until Athena reports the query as completed
            while (!await athenaApi.IsExecutionCompleted(executionId))
            {
                await Task.Delay(2000);
            }
            var s3Object = s3Path.ParseS3URI();

            if (s3Object is S3Object)
            {
                Console.WriteLine($"Delete S3: {s3Path}");
                var files = await awsS3Api.ListFiles(s3Object.Key, "/", s3Object.BucketName);

                if (files.Any())
                {
                    // listed names are prefixed with the parsed key to form the keys passed to Delete
                    await awsS3Api.Delete(files.Select(key => $"{s3Object.Key}{key}"), s3Object.BucketName);
                }
                Console.WriteLine($"{s3Path}: {files.Count} S3 Files Deleted");
            }
        }
 /// <summary>
 /// Creates the table setup helper and stores the S3 and Athena APIs it was given.
 /// </summary>
 /// <param name="awsS3API">S3 API kept in the <c>awsS3API</c> field.</param>
 /// <param name="awsAthenaAPI">Athena API kept in the <c>awsAthenaAPI</c> field.</param>
 public AthenaTableSetup(AWSS3API awsS3API, AWSAthenaAPI awsAthenaAPI)
 {
     // plain field wiring; the assignments are independent of each other
     this.awsAthenaAPI = awsAthenaAPI;
     this.awsS3API     = awsS3API;
 }
Beispiel #21
0
 /// <summary>
 /// Creates the Snowflake ETL API and stores the Snowflake and S3 APIs it was given.
 /// </summary>
 /// <param name="snowflakeAPI">Snowflake API kept in the <c>snowflakeAPI</c> field.</param>
 /// <param name="awsS3API">S3 API kept in the <c>awsS3API</c> field.</param>
 public SnowflakeEtlAPI(SnowflakeAPI snowflakeAPI, AWSS3API awsS3API)
 {
     // plain field wiring; the assignments are independent of each other
     this.awsS3API     = awsS3API;
     this.snowflakeAPI = snowflakeAPI;
 }
 /// <summary>
 /// Creates a tree item that keeps the given S3 API in its <c>awsS3API</c> field.
 /// </summary>
 /// <param name="awsS3API">S3 API stored on the item.</param>
 public S3TreeItem(AWSS3API awsS3API)
 {
     // the only state set here is the API reference
     this.awsS3API = awsS3API;
 }
        /// <summary>
        /// Builds the parser setting for <paramref name="context"/>, parses the context's raw Athena pipes
        /// with it and then clears every table listed in the setting's <c>Clearings</c>, one after another.
        /// </summary>
        /// <param name="context">Query context providing <c>BuildParserSetting</c> and the raw pipe text.</param>
        /// <param name="athenaApi">Athena API used to drop each table.</param>
        /// <param name="awsS3Api">S3 API used to delete each table's files.</param>
        public static async Task ClearAthenaTables(this StateMachineQueryContext context, AWSAthenaAPI athenaApi, AWSS3API awsS3Api)
        {
            var parserSetting = context.BuildParserSetting();

            // The parse result itself is not used here; the call is kept because it receives the
            // setting object whose Clearings are read below (presumably it fills them, confirm in ParseAthenaPipes).
            context.raw.ParseAthenaPipes(parserSetting);

            foreach (var entry in parserSetting.Clearings)
            {
                var tableName = entry.Key;
                var s3Path    = entry.Value;

                await athenaApi.ClearAthenaTable(awsS3Api, tableName, s3Path);
            }
        }
Beispiel #24
0
        /// <summary>
        /// For an S3 bucket event ETL, removes every entry with this ETL's name from the handler list
        /// stored at <paramref name="listKey"/> and writes the list back as JSON. Other source types are
        /// left untouched.
        /// </summary>
        /// <param name="etlSettings">ETL settings providing the name and source type.</param>
        /// <param name="awsS3API">S3 API used to read and write the list.</param>
        /// <param name="listKey">Key of the JSON list object.</param>
        /// <remarks>
        /// Reading the existing list is best-effort: if it cannot be read or parsed, an empty list is used.
        /// A list file that deserializes to <c>null</c> (for example an empty or "null" document) is also
        /// treated as empty instead of causing a <see cref="NullReferenceException"/>.
        /// </remarks>
        public static async Task DeleteFromS3EventEtlList(this EtlSettings etlSettings, AWSS3API awsS3API, string listKey)
        {
            if (etlSettings.SourceType != EtlSourceEnum.S3BucketEvent)
            {
                return;
            }

            List <S3EventHandler> list = null;

            if (await awsS3API.FileExists(listKey))
            {
                try
                {
                    var json = await awsS3API.ReadAsString(listKey);

                    list = JsonConvert.DeserializeObject <List <S3EventHandler> >(json);
                }
                catch (Exception)
                {
                    // deliberate best-effort (kept from the original): fall back to an empty list below
                    list = null;
                }
            }

            // DeserializeObject may hand back null; never continue with a null list
            list = list ?? new List <S3EventHandler>();

            // remove any entries that match that name
            list = list.Where(handler => handler.EtlName != etlSettings.Name).ToList();

            // write back to s3
            await awsS3API.UploadAsJson(listKey, list);
        }
Beispiel #25
0
        /// <summary>
        /// Parses <paramref name="s3Uri"/> into key and bucket and returns the result of
        /// <c>FileExists</c> for that object.
        /// </summary>
        /// <param name="awsS3">S3 API used for the check.</param>
        /// <param name="s3Uri">URI handed to <c>ParseS3URI</c>.</param>
        /// <returns>Whatever <c>FileExists</c> reports for the parsed key and bucket.</returns>
        public static async Task <bool> Exists(this AWSS3API awsS3, string s3Uri)
        {
            var location = s3Uri.ParseS3URI();
            var found    = await awsS3.FileExists(location.Key, location.BucketName);

            return found;
        }
Beispiel #26
0
        /// <summary>
        /// Replaces <c>etlSettings.Sample</c> with a new <c>DataSample</c>, fills it from the source selected
        /// by <c>etlSettings.SourceType</c> and finally shortens long sample values.
        /// </summary>
        /// <param name="etlSettings">Settings describing the source; modified in place and returned.</param>
        /// <param name="lines">
        /// Forwarded to the CSV, BigQuery and Athena sample readers; presumably the number of sample rows
        /// to read (confirm against those helpers). Not used for the Athena pipes source.
        /// </param>
        /// <returns>The same <paramref name="etlSettings"/> instance.</returns>
        public static async Task <EtlSettings> ReadEtlSampleData(this EtlSettings etlSettings, int lines = 20)
        {
            // any previous sample is discarded up front
            etlSettings.Sample = new DataSample();

            switch (etlSettings.SourceType)
            {
            case EtlSourceEnum.SFTP:
            {
                var sftp      = etlSettings.SFTPSource;
                var nameRegex = new Regex(sftp.PathRegex);
                using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password))
                {
                    sftpClient.Connect();
                    // list the base path and keep only entries whose full name matches PathRegex
                    var files = sftpClient.ListDirectory(sftp.BasePath);
                    files = files.Where(f => nameRegex.IsMatch(f.FullName)).ToList();
                    // only the first matching file is sampled
                    var first = files.FirstOrDefault();
                    if (first != null)
                    {
                        // CSV is the only file type handled here; other types read nothing
                        switch (etlSettings.FileType)
                        {
                        case EtlFileType.CSV:
                        {
                            using (var sftpStream = sftpClient.OpenRead(first.FullName))
                            {
                                etlSettings.ReadFromCSVFile(sftpStream, lines);
                            }
                        }
                        break;
                        }
                    }
                    sftpClient.Disconnect();
                }
            }
            break;

            case EtlSourceEnum.S3BucketCheck:
            {
                var s3       = etlSettings.S3CheckSource;
                // the source bucket is accessed with the credentials stored in the check source
                var awsS3API = new AWSS3API(new AWSS3Options()
                    {
                        Key    = s3.Key,
                        Secret = s3.Secret,
                        Bucket = s3.BucketName,
                        Region = s3.Region,
                    });
                var objects = await awsS3API.ListAllObjectsInBucket(s3.BucketName, s3.Prefix);

                // keep only objects whose key matches PathRegex; only the first one is sampled
                var nameRegex = new Regex(s3.PathRegex);
                objects = objects.Where(f => nameRegex.IsMatch(f.Key)).ToList();
                var first = objects.FirstOrDefault();
                if (first != null)
                {
                    // CSV is the only file type handled here; other types read nothing
                    switch (etlSettings.FileType)
                    {
                    case EtlFileType.CSV:
                    {
                        using (var s3Stream = await awsS3API.OpenReadAsync(first.Key, first.BucketName))
                        {
                            etlSettings.ReadFromCSVFile(s3Stream, lines);
                        }
                    }
                    break;
                    }
                }
            }
            break;

            case EtlSourceEnum.S3BucketEvent:
            {
                var s3       = etlSettings.S3EventSource;
                // the source bucket is accessed with the credentials stored in the event source
                var awsS3API = new AWSS3API(new AWSS3Options()
                    {
                        Key    = s3.Key,
                        Secret = s3.Secret,
                        Bucket = s3.BucketName,
                        Region = s3.Region,
                    });
                // the sample comes from the configured ExamplePath, and only when that object exists
                if (await awsS3API.FileExists(s3.ExamplePath, s3.BucketName))
                {
                    // CSV is the only file type handled here; other types read nothing
                    switch (etlSettings.FileType)
                    {
                    case EtlFileType.CSV:
                    {
                        using (var s3Stream = await awsS3API.OpenReadAsync(s3.ExamplePath, s3.BucketName))
                        {
                            etlSettings.ReadFromCSVFile(s3Stream, lines);
                        }
                    }
                    break;
                    }
                }
            }
            break;

            case EtlSourceEnum.GoogleAnalytics:
            {
                await etlSettings.GetBigQueryResultSampleByDate(lines);
            }
            break;

            case EtlSourceEnum.AmazonAthena:
            {
                await etlSettings.GetAthenaQueryResultSampleByDate(lines);
            }
            break;

            case EtlSourceEnum.AmazonAthenaPipes:
            {
                // need to compile the query
                await etlSettings.ParseAthenaQueryPipes();
            }
            break;
            }

            // make the sample data smaller:
            // values of 100 characters or more are cut to their first 50 + "..." + last 50 characters
            // NOTE(review): a null item would throw on item.Length - confirm items are never null
            foreach (var row in etlSettings.Sample.Rows.ToList())
            {
                row.Items = row.Items.Select(item => item.Length < 100 ? item : item.Substring(0, 50) + "..." + item.Substring(item.Length - 50)).ToList();
            }

            return(etlSettings);
        }
Beispiel #27
0
        /// <summary>
        /// Runs one data transfer for <paramref name="etlSettings"/>, choosing the procedure by
        /// <c>etlSettings.SourceType</c>, and returns the strings produced by the transfer helper.
        /// </summary>
        /// <param name="etlSettings">Settings describing the source and the target.</param>
        /// <param name="awsAthenaAPI">Athena API forwarded to the CSV, BigQuery and Athena transfer helpers.</param>
        /// <param name="logger">Optional logger; in this method it only receives the "ETL Mode" line.</param>
        /// <param name="useDate">Optional date forwarded to the BigQuery, Athena and Athena pipes helpers.</param>
        /// <returns>
        /// The helper's result list. The list is empty when nothing was transferred, for the
        /// S3BucketCheck source (no implementation here) and for the Athena pipes source (its result is
        /// not captured).
        /// </returns>
        public static async Task <List <string> > TransferData(this EtlSettings etlSettings, AWSAthenaAPI awsAthenaAPI, GenericLogger logger = null, DateTime?useDate = null)
        {
            var result = new List <string>();

            logger?.Log?.Invoke($"ETL Mode: {etlSettings.SourceType}");

            switch (etlSettings.SourceType)
            {
            case EtlSourceEnum.SFTP:
            {
                var sftp      = etlSettings.SFTPSource;
                var nameRegex = new Regex(sftp.PathRegex);
                var dateRegex = new Regex(sftp.DateKeyRegex);
                using (var sftpClient = new SftpClient(sftp.Host, sftp.Username, sftp.Password))
                {
                    sftpClient.Connect();
                    // candidates: full name matches PathRegex and name matches DateKeyRegex,
                    // ordered by name descending
                    var files = sftpClient.ListDirectory(sftp.BasePath);
                    files = files
                            .Where(f => nameRegex.IsMatch(f.FullName) && dateRegex.IsMatch(f.Name))
                            .OrderByDescending(f => f.Name)
                            .ToList();
                    // find in the target to work out if there is the corresponding parquet file
                    var      targetS3 = etlSettings.CreateTargetS3API();
                    SftpFile first    = null;
                    // pick the first candidate whose flag file does not exist in the target yet
                    foreach (var file in files)
                    {
                        Console.WriteLine($"Check File: {file.FullName}");
                        var s3Key = etlSettings.TargetFlagFile(file.Name);
                        if (!await targetS3.FileExists(s3Key))
                        {
                            first = file;
                            break;
                        }
                    }
                    // transfer that file (at most one file per call)
                    if (first != null)
                    {
                        Console.WriteLine($"Transfer File: {first.FullName}");
                        var dateKey = first.Name.MakeRegexExtraction(dateRegex);
                        using (var sftpStream = sftpClient.OpenRead(first.FullName))
                        {
                            // NOTE(review): the meaning of the final 'false' flag is defined by TransferCsvStream, not visible here
                            result = await etlSettings.TransferCsvStream(awsAthenaAPI, sftpStream, dateKey, first.Name, false);
                        }
                    }
                    sftpClient.Disconnect();
                }
            }
            break;

            case EtlSourceEnum.S3BucketCheck:
            {
                // intentionally empty in this version: nothing is transferred for this source type
            }
            break;

            case EtlSourceEnum.S3BucketEvent:
            {
                // the source bucket is accessed with the credentials stored in the event source
                var sourceAwsS3Api = new AWSS3API(new AWSS3Options()
                    {
                        Key    = etlSettings.S3EventSource.Key,
                        Secret = etlSettings.S3EventSource.Secret,
                        Bucket = etlSettings.S3EventSource.BucketName,
                        Region = etlSettings.S3EventSource.Region
                    });
                var s3Event   = etlSettings.S3EventSource;
                var nameRegex = new Regex(s3Event.PathRegex);
                var keyRegex  = new Regex(s3Event.FileNameRegex);
                // do nothing if it does not match the path pattern
                // (ExamplePath must match both PathRegex and FileNameRegex)
                if (!nameRegex.IsMatch(s3Event.ExamplePath) || (!keyRegex.IsMatch(s3Event.ExamplePath)))
                {
                    return(result);
                }

                // generate dateKey
                // default is today's UTC date; replaced below unless UseEventDateAsDateKey is set
                var dateKey = DateTime.UtcNow.ToString("yyyyMMdd");

                Regex dateRegex = null;
                if (!s3Event.UseEventDateAsDateKey)
                {
                    dateRegex = new Regex(s3Event.DateKeyRegex);
                    // no date in the path means nothing is transferred
                    if (!dateRegex.IsMatch(s3Event.ExamplePath))
                    {
                        return(result);
                    }
                    dateKey = s3Event.ExamplePath.MakeRegexExtraction(dateRegex);
                }

                // generate file name

                var filename = s3Event.ExamplePath.MakeRegexExtraction(keyRegex);

                // it will overwrite by default we need to workout datekey first of all
                var prefixUpToDate = etlSettings.MakeTargetS3Prefix(dateKey, filename, true);

                // check files that should be deleted
                var targetAwsS3Api = etlSettings.CreateTargetS3API();
                var oldObjects     = await targetAwsS3Api.ListAllObjectsInBucket(prefix : prefixUpToDate);

                // delete the files with those prefix (one request per object, before the new data is written)
                foreach (var oldObj in oldObjects)
                {
                    await targetAwsS3Api.Delete(oldObj.Key);
                }

                // open file stream and transfer data
                using (var awsS3Stream = await sourceAwsS3Api.OpenReadAsync(s3Event.ExamplePath))
                {
                    // NOTE(review): the meaning of the final 'true' flag is defined by TransferCsvStream, not visible here
                    result = await etlSettings.TransferCsvStream(awsAthenaAPI, awsS3Stream, dateKey, filename, true);
                }
            }
            break;

            case EtlSourceEnum.GoogleAnalytics:
            {
                result = await etlSettings.TransferBigQueryResultByDate(awsAthenaAPI, useDate);
            }
            break;

            case EtlSourceEnum.AmazonAthena:
            {
                result = await etlSettings.TransferAthenaQueryResultByDate(awsAthenaAPI, useDate);
            }
            break;

            case EtlSourceEnum.AmazonAthenaPipes:
            {
                // the pipes run is awaited but its outcome is not added to 'result'
                await etlSettings.RunAthenaQueryPipes(useDate);
            }
            break;
            }
            return(result);
        }
 /// <summary>
 /// Clears every table listed in <c>context.settings.Clearings</c>, one after another, and then
 /// empties that collection.
 /// </summary>
 /// <param name="context">Query context whose settings hold the pending clearings.</param>
 /// <param name="athenaApi">Athena API used to drop each table.</param>
 /// <param name="awsS3Api">S3 API used to delete each table's files.</param>
 public static async Task ClearTempTables(this StateMachineQueryContext context, AWSAthenaAPI athenaApi, AWSS3API awsS3Api)
 {
     foreach (var entry in context.settings.Clearings)
     {
         var tableName = entry.Key;
         var s3Path    = entry.Value;

         await athenaApi.ClearAthenaTable(awsS3Api, tableName, s3Path);
     }

     // only reached when every clearing succeeded; an exception above leaves the collection untouched
     context.settings.Clearings.Clear();
 }
Beispiel #29
0
 /// <summary>
 /// Downloads the object at <paramref name="s3Uri"/>, decodes it as UTF-8 and splits the text at
 /// every single '\n' or '\r' character.
 /// </summary>
 /// <param name="awsS3">S3 API used to fetch the object.</param>
 /// <param name="s3Uri">URI passed to <c>Get</c>.</param>
 /// <returns>The pieces between line-break characters; a "\r\n" pair therefore yields an empty entry.</returns>
 public static async Task <string[]> ReadLines(this AWSS3API awsS3, string s3Uri)
 {
     var content = await awsS3.Get(s3Uri);
     var text    = Encoding.UTF8.GetString(content);

     return Regex.Split(text, "[\n\r]");
 }
 /// <summary>
 /// Creates the SFTP ETL helper and stores the SFTP client and S3 API it was given.
 /// </summary>
 /// <param name="sftpClient">SFTP client kept in the <c>sftpClient</c> field.</param>
 /// <param name="awsS3API">S3 API kept in the <c>awsS3API</c> field.</param>
 public SftpEtl(SftpClient sftpClient, AWSS3API awsS3API)
 {
     // plain field wiring; the assignments are independent of each other
     this.awsS3API   = awsS3API;
     this.sftpClient = sftpClient;
 }