/// <summary>
/// Parses a set of local weather CSV files in parallel and bulk-writes the
/// converted records to SQL Server in batches of 80,000 rows.
/// </summary>
/// <param name="csvFiles">Paths of the CSV files to import.</param>
private static void ProcessLocalWeatherData(string[] csvFiles)
{
    // One processor is shared across all parallel workers.
    // NOTE(review): assumes LocalWeatherDataBatchProcessor.Write is thread-safe — confirm.
    var processor = new LocalWeatherDataBatchProcessor(ConnectionString);

    csvFiles
        .AsParallel()
        // Limit parallelism so we do not saturate the database connection pool:
        .WithDegreeOfParallelism(4)
        .ForAll(file =>
        {
            // Guard the log call, consistent with the other processing methods in this codebase:
            if (log.IsInfoEnabled)
            {
                log.Info($"Processing File: {file}");
            }

            // Access to the List of Parsers:
            var batches = Parsers
                // Use the LocalWeatherData Parser:
                .LocalWeatherDataParser
                // Read the File, skipping the header row:
                .ReadFromFile(file, Encoding.UTF8, 1)
                // Get the Valid Results:
                .Where(x => x.IsValid)
                // And get the populated Entities:
                .Select(x => x.Result)
                // Convert into the Sql Data Model:
                .Select(x => LocalWeatherDataConverter.Convert(x))
                // Batch for efficient bulk writes:
                .Batch(80000);

            foreach (var batch in batches)
            {
                processor.Write(batch);
            }
        });
}
/// <summary>
/// Parses a local weather CSV file and writes the measurements to InfluxDB as
/// LineProtocol payloads in batches of 10,000. Failed batches are logged and
/// skipped so a single bad batch does not abort the whole import.
/// </summary>
/// <param name="csvFilePath">Path of the CSV file to import.</param>
private static void ProcessLocalWeatherData(string csvFilePath)
{
    if (log.IsInfoEnabled)
    {
        log.Info($"Processing File: {csvFilePath}");
    }

    // Construct the Batch Processor:
    var processor = new LocalWeatherDataBatchProcessor(ConnectionString, Database);

    // Access to the List of Parsers:
    var batches = Parsers
        // Use the LocalWeatherData Parser:
        .LocalWeatherDataParser
        // Read the File, skipping the header row:
        .ReadFromFile(csvFilePath, Encoding.UTF8, 1)
        // Get the Valid Results:
        .Where(x => x.IsValid)
        // And get the populated Entities:
        .Select(x => x.Result)
        // Let's stay safe! Stop parallelism here:
        .AsEnumerable()
        // Evaluate:
        .Batch(10000)
        // Convert each Batch into a LineProtocolPayload:
        .Select(measurements => LocalWeatherDataConverter.Convert(measurements));

    foreach (var batch in batches)
    {
        try
        {
            // NOTE(review): blocking on async here is intentional (this is a sync console
            // entry point), but it would be cleaner to make this method async Task — TODO.
            var result = processor.WriteAsync(batch).GetAwaiter().GetResult();

            // Log all unsuccessful writes, but do not quit execution:
            if (!result.Success)
            {
                if (log.IsErrorEnabled)
                {
                    log.Error(result.ErrorMessage);
                }
            }
        }
        catch (Exception e)
        {
            // Some Pokemon Exception Handling here. I am seeing TaskCanceledExceptions with the
            // InfluxDB .NET Client. At the same time I do not want to quit execution, because
            // some batches fail:
            if (log.IsErrorEnabled)
            {
                log.Error(e, "Error occurred writing InfluxDB Payload");
            }
        }
    }
}
/// <summary>
/// Imports a local weather CSV file into the "weather_data" Elasticsearch index,
/// creating the index with bulk-load-friendly settings (no replicas, refresh
/// disabled) before writing batches of 30,000 documents.
///
/// https://www.elastic.co/guide/en/elasticsearch/reference/master/tune-for-indexing-speed.html
/// </summary>
/// <param name="csvFilePath">Path of the CSV file to import.</param>
private static void ProcessLocalWeatherData(string csvFilePath)
{
    if (log.IsInfoEnabled)
    {
        log.Info($"Processing File: {csvFilePath}");
    }

    // Construct the Batch Processor:
    var elasticClient = new ElasticSearchClient<Elastic.Model.LocalWeatherData>(ConnectionString, "weather_data");

    // We are creating the Index with special indexing options for initial load,
    // as suggested in the Elasticsearch documentation at [1].
    //
    // We disable the performance-heavy indexing during the initial load and also
    // disable any replicas of the data. This comes at a price of not being able
    // to query the data in realtime, but it will enhance the import speed.
    //
    // After the initial load I will revert to the standard settings for the Index
    // and set the default values for Shards and Refresh Interval.
    //
    // [1]: https://www.elastic.co/guide/en/elasticsearch/reference/master/tune-for-indexing-speed.html
    //
    elasticClient.CreateIndex(settings => settings
        .NumberOfReplicas(0)
        .RefreshInterval(-1));

    // Parse the file, keep only valid rows, convert them into the Elasticsearch
    // data model and group them into bulk-sized batches:
    var documentBatches = Parsers
        .LocalWeatherDataParser
        // Skip the header row:
        .ReadFromFile(csvFilePath, Encoding.UTF8, 1)
        .Where(parseResult => parseResult.IsValid)
        .Select(parseResult => parseResult.Result)
        .Select(measurement => LocalWeatherDataConverter.Convert(measurement))
        .Batch(30000);

    foreach (var documentBatch in documentBatches)
    {
        elasticClient.BulkInsert(documentBatch);
    }
}
/// <summary>
/// Joins the hourly weather measurements with their station metadata and
/// returns the flattened Elasticsearch entries. Measurements without a known
/// station are dropped.
/// </summary>
/// <returns>Lazily evaluated sequence of flattened weather entries.</returns>
private static IEnumerable<ElasticLocalWeatherDataType> GetData()
{
    // Create Lookup Dictionary to map stations from.
    //
    // The station file may contain duplicate WBAN entries, which would make
    // ToDictionary throw an ArgumentException — deduplicate by keeping the
    // first occurrence per WBAN (same approach as the other GetData variant):
    IDictionary<string, CsvStationType> stations = GetStations("C:\\Users\\philipp\\Downloads\\csv\\201503station.txt")
        .GroupBy(x => x.WBAN, x => x)
        .Select(x => x.First())
        .ToDictionary(station => station.WBAN, station => station);

    // Create the flattened Elasticsearch entry:
    return GetLocalWeatherData("C:\\Users\\philipp\\Downloads\\csv\\201503hourly.txt")
        .Where(x => stations.ContainsKey(x.WBAN))
        .Select(x =>
        {
            var station = stations[x.WBAN];

            return LocalWeatherDataConverter.Convert(station, x);
        });
}
/// <summary>
/// Joins the hourly weather measurements with their station metadata and
/// returns the flattened Elasticsearch entries. Measurements without a known
/// station are dropped.
/// </summary>
/// <returns>Lazily evaluated sequence of flattened weather entries.</returns>
private static IEnumerable<ElasticLocalWeatherDataType> GetData()
{
    // The station file can contain the same WBAN more than once; keep the first
    // occurrence per WBAN so the lookup dictionary can be built without key clashes:
    var uniqueStations = GetStations(@"D:\datasets\201503station.txt")
        .GroupBy(x => x.WBAN, x => x)
        .Select(group => group.First());

    IDictionary<string, CsvStationType> stationLookup = uniqueStations
        .ToDictionary(station => station.WBAN, station => station);

    // Flatten each measurement together with its station into an Elasticsearch entry:
    return GetLocalWeatherData(@"D:\datasets\201503hourly.txt")
        .Where(measurement => stationLookup.ContainsKey(measurement.WBAN))
        .Select(measurement =>
        {
            var matchingStation = stationLookup[measurement.WBAN];

            return LocalWeatherDataConverter.Convert(matchingStation, measurement);
        });
}
/// <summary>
/// Parses the CDC station description file, deduplicates stations by their
/// identifier per batch, converts them to the Neo4j model and writes them
/// through the given client in batches of 10,000.
/// </summary>
/// <param name="client">Neo4j client used to create the station nodes.</param>
private static async Task InsertStationsAsync(Neo4JClient client)
{
    // Read the Stations:
    string csvStationDataFile = @"D:\datasets\CDC\zehn_min_tu_Beschreibung_Stationen.txt";

    // Access to the List of Parsers:
    var batches = Parsers
        // Use the Station Parser:
        .StationParser
        // Read the File, skipping the header row:
        .ReadFromFile(csvStationDataFile, Encoding.UTF8, 1)
        // Get the Valid Results:
        .Where(x => x.IsValid)
        // And get the populated Entities:
        .Select(x => x.Result)
        // Let's stay safe! Stop parallelism here:
        .AsEnumerable()
        // Evaluate, prefer smaller batches for Neo4j:
        .Batch(10000)
        // Go Parallel again:
        .AsParallel()
        // As List:
        .Select(batch =>
        {
            return (batch
                // Group by the Station Identifier to avoid duplicates within this batch:
                .GroupBy(x => new { x.Identifier })
                // If there are duplicates then make a guess and select the first one:
                .Select(x => x.First())
                // Convert to Neo4j:
                .Select(x => LocalWeatherDataConverter.Convert(x))
                // And evaluate to prevent multiple iterations:
                .ToList());
        });

    foreach (var batch in batches)
    {
        // Finally write them with the Batch Writer:
        await client.CreateStationsAsync(batch);
    }
}
/// <summary>
/// Parses a station CSV file and bulk-writes the converted stations to the
/// database in batches of 500 rows.
/// </summary>
/// <param name="csvFilePath">Path of the station CSV file to import.</param>
private static void ProcessStationData(string csvFilePath)
{
    // Guard the log call, consistent with the other processing methods in this codebase:
    if (log.IsInfoEnabled)
    {
        log.Info($"Processing File: {csvFilePath}");
    }

    // Parse the file (skipping two header rows), keep only valid rows, convert
    // them into the database model and group them into small batches:
    var batches = Parsers
        .StationParser
        .ReadFromFile(csvFilePath, Encoding.UTF8, 2)
        .Where(x => x.IsValid)
        .Select(x => x.Result)
        .Select(x => LocalWeatherDataConverter.Convert(x))
        .Batch(500);

    // Construct the Batch Processor:
    var processor = new StationBatchProcessor(ConnectionString);

    foreach (var batch in batches)
    {
        // Finally write them with the Batch Writer:
        processor.Write(batch);
    }
}
/// <summary>
/// Parses a local weather CSV file, deduplicates measurements per batch by
/// station and timestamp, converts them to the Neo4j model and writes them
/// through the given client in batches of 30,000.
/// </summary>
/// <param name="client">Neo4j client used to create the weather data nodes.</param>
/// <param name="csvFilePath">Path of the CSV file to import.</param>
/// <param name="cancellationToken">Token to cancel the import between batches.</param>
/// <exception cref="OperationCanceledException">Thrown when cancellation is requested.</exception>
private static async Task ProcessLocalWeatherData(Neo4JClient client, string csvFilePath, CancellationToken cancellationToken = default(CancellationToken))
{
    Console.WriteLine($"Processing File: {csvFilePath}");

    // Access to the List of Parsers:
    var batches = Parsers
        // Use the LocalWeatherData Parser:
        .LocalWeatherDataParser
        // Read the File, skipping the header row:
        .ReadFromFile(csvFilePath, Encoding.UTF8, 1)
        // Get the Valid Results:
        .Where(x => x.IsValid)
        // And get the populated Entities:
        .Select(x => x.Result)
        // Let's stay safe! Stop parallelism here:
        .AsEnumerable()
        // Evaluate, prefer smaller batches for Neo4j:
        .Batch(30000)
        // Go Parallel again:
        .AsParallel()
        // As List:
        .Select(batch =>
        {
            return (batch
                // Group by WBAN, Date and Time to avoid duplicates for this batch:
                .GroupBy(x => new { x.StationIdentifier, x.TimeStamp })
                // If there are duplicates then make a guess and select the first one:
                .Select(x => x.First())
                // Convert to Neo4j:
                .Select(x => LocalWeatherDataConverter.Convert(x))
                // And evaluate to prevent multiple iterations:
                .ToList());
        });

    foreach (var batch in batches)
    {
        // Honor the caller's cancellation request between batches — the token
        // was previously accepted but never observed:
        cancellationToken.ThrowIfCancellationRequested();

        // Finally write them with the Batch Writer:
        await client.CreateLocalWeatherDataAsync(batch);
    }
}