public CsvPartition(string directory, CsvExporter owner)
{
    _owner = owner;
    Directory = directory;
}
public void NestedChildTablesWithBatchingInCsv()
{
    var csvDir = Path.Combine(Directory.GetCurrentDirectory(), "~tmp");
    if (Directory.Exists(csvDir)) Directory.Delete(csvDir, true);

    var csv = new CsvExporter(csvDir, binaryPartitions: true);
    var batchWriter = new TableDataBatchWriter(csv);

    //1000 visits.
    //1 row in Test per visit
    //3 rows in Events per visit
    //3 rows in Pages per visit
    //1 row in Pages2 per visit
    //--------------------------
    //8 rows per visit = 8000 in total
    //Batch size 2672 (ceiling(1000/3) = 334 visits per batch) gives two disk partitions with 2672 rows each and one with 2656 rows in memory

    var visitCount = 1000;
    var rowsPerVisit = 8;
    var visitPerBatch = (int)Math.Ceiling(visitCount / 3d);
    var rowsPerFilePartition = visitPerBatch * rowsPerVisit;
    var eventRowsPerVisit = 3;
    var expectedFilePartitions = 2;

    var tables = TestSets.Countries(visitCount, 37).Process(
        () => new SimpleTableMapper(new TableDefinition("Test")
            .Key("VisitId", s => s.Current<IVisitAggregationContext>().Visit.InteractionId)
            .Fact("Value", s => s.Current<IVisitAggregationContext>().Visit.Value)
            .Map(s => s.Current<IVisitAggregationContext>().Visit.Pages,
                new TableDefinition("Pages")
                    .Key("PageId", s => s.Current<PageData>().Item.Id)
                    .Fact("Value", s => s.Current<PageData>().PageEvents.Sum(pe => pe.Value))
                    .Map(s => s.Current<PageData>().PageEvents,
                        new TableDefinition("Events")
                            .Dimension("Event", s => s.Current<PageEventData>().PageEventDefinitionId)
                            .Fact("Value", s => s.Current<PageEventData>().Value)))
            .Map(s => new[] { s.Current<IVisitAggregationContext>().Visit.Pages.First() },
                new TableDefinition("Pages2")
                    .Key("Id", s => s.Current<PageData>().Item.Id))),
        initializer: p =>
        {
            p.BatchWriter = batchWriter;
            p.BatchSize = rowsPerFilePartition;
        });

    var partitions = new DirectoryInfo(csvDir).GetDirectories().Length;
    Assert.AreEqual(2, partitions,
        string.Format("{0:N0} rows should create 2 file partitions and one in memory", visitCount * rowsPerVisit));

    Assert.AreEqual(expectedFilePartitions * eventRowsPerVisit * visitPerBatch,
        batchWriter.Tables.FirstOrDefault(t => t.Schema.Name == "Events").Rows.Count(),
        string.Format("{0:N0} rows in event tables in file partitions", expectedFilePartitions * eventRowsPerVisit * visitPerBatch));

    Assert.AreEqual(expectedFilePartitions * rowsPerVisit * visitPerBatch,
        batchWriter.Tables.Sum(t => t.Rows.Count()),
        string.Format("{0:N0} total rows in file partitions", expectedFilePartitions * rowsPerVisit * visitPerBatch));

    Assert.AreEqual(visitCount * rowsPerVisit, tables.Sum(t => t.Rows.Count()),
        string.Format("{0:N0} rows in file + memory partitions", visitCount * rowsPerVisit));

    //Merge partitions
    tables = csv.Export(tables);

    //Delete partitions
    batchWriter.Dispose();

    partitions = new DirectoryInfo(csvDir).GetDirectories().Length;
    Assert.AreEqual(0, partitions, "Temporary partition directories are deleted");

    var visits = tables.FirstOrDefault(t => t.Schema.Name == "Test");
    var pages = tables.FirstOrDefault(t => t.Schema.Name == "Pages");
    var pages2 = tables.FirstOrDefault(t => t.Schema.Name == "Pages2");
    var events = tables.FirstOrDefault(t => t.Schema.Name == "Events");

    Assert.AreEqual(1000, visits.Rows.Count());
    Assert.AreEqual(14000, visits.Fields<int>("Value").Sum());
    Assert.AreEqual(14000, pages.Fields<int>("Value").Sum());
    Assert.AreEqual(14000, events.Fields<int>("Value").Sum());
    Assert.AreEqual(3000, pages.Rows.Count());
    Assert.AreEqual(3000, events.Rows.Count());
    Assert.AreEqual(1000, pages2.Rows.Count());

    if (Directory.Exists(csvDir))
    {
        Directory.Delete(csvDir, true);
    }
}
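//Illustrative sketch (not part of the test above): the partition arithmetic the assertions
//rely on, restated as a hypothetical standalone helper. Only the constants (1000 visits,
//8 rows per visit, 3 batches) come from the test itself; the method name is made up.
private static (int FilePartitions, int RowsInMemory) ExpectedPartitioning()
{
    var visitCount = 1000;
    var rowsPerVisit = 8;

    var visitsPerBatch = (int)Math.Ceiling(visitCount / 3d);   //334 visits per batch
    var batchSize = visitsPerBatch * rowsPerVisit;             //2672 rows per file partition
    var totalRows = visitCount * rowsPerVisit;                 //8000 rows in total

    var filePartitions = totalRows / batchSize;                //2 full partitions flushed to disk
    var rowsInMemory = totalRows - filePartitions * batchSize; //2656 rows stay in the in-memory partition

    return (filePartitions, rowsInMemory);                     //(2, 2656)
}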
public void Run()
{
    if (Status != JobStatus.Pending) throw new InvalidOperationException("Job is not pending");

    try
    {
        Directory.CreateDirectory(TempDirectory);
        File.WriteAllText(Path.Combine(TempDirectory, "specification.json"), Specification.ToString());

        SetStatus(JobStatus.Preparing);

        Specification.Initialize(this);

        var source = Specification.CreateDataSource();
        PostProcessors = Specification.CreatePostProcessors().ToArray();

        var jobDirectory = TempDirectory;

        //The processors will consume data from this collection
        var items = new BlockingCollection<object>(ExecutionSettings.DataSourceBufferSize);

        //Create the processors
        var processors = Enumerable.Range(0, ExecutionSettings.ProcessingThreads).Select(i =>
        {
            IItemFieldLookup lookup = null;
            try
            {
                lookup = new ItemDatabaseFieldLookup(Database.GetDatabase(ExecutionSettings.DatabaseName),
                    Specification.DefaultLanguage,
                    ExecutionSettings.FieldCacheSize);
            }
            catch (Exception ex)
            {
                Log.Error("Error initializing item field lookup", ex, this);
            }

            var proc = new DataProcessor(Specification.CreateRootMapper())
            {
                BatchSize = ExecutionSettings.BatchSize,
                FieldLookup = lookup
            };

            var exporter = Specification.CreateExporter(jobDirectory) as CsvExporter; //Move PartitionPrefix so this cast isn't necessary
            if (exporter == null)
            {
                exporter = new CsvExporter(jobDirectory);
            }
            exporter.PartitionPrefix = "~" + i + "_";
            exporter.KeepOutput = true; //Don't delete the job's main directory

            proc.BatchWriter = new TableDataBatchWriter(exporter)
            {
                SyncLock = this,
                MaximumSize = ExecutionSettings.SizeLimit
            };

            proc.Initialize();

            return proc;
        }).ToArray();

        var hasUpdates = false;

        //Allow post processors to validate their conditions (if any). This allows the job to fail before data is processed
        //Allow post processors to filter the data source for updates
        foreach (var pp in PostProcessors)
        {
            pp.Validate(processors[0].Tables, Specification);
            if (pp.UpdateDataSource(processors[0].Tables, source))
            {
                if (hasUpdates)
                {
                    throw new InvalidOperationException("Only one post processor can update the data source");
                }
                hasUpdates = true;
            }
        }

        EstimatedClientItemCount = source.Count;

        //Start the processors
        var processingThreads = processors.Select(p =>
        {
            var t = new Thread(() =>
            {
                try
                {
                    p.Process(items.GetConsumingEnumerable());
                }
                catch (Exception ex)
                {
                    LastException = ex;
                    SetStatus(JobStatus.Failed, ex.Message);
                }
            });
            t.Start();
            return t;
        }).ToArray();

        if (Status == JobStatus.Failed)
        {
            throw LastException;
        }

        SetStatus(JobStatus.Running);

        source.ItemLoaded += (sender, args) =>
        {
            if (StatusUpdateFrequency <= 0 || ItemsProcessed % StatusUpdateFrequency == 0)
            {
                OnProgress();
            }
            ItemsProcessed = args;
            RowsCreated = processors.Sum(p => p.RowsCreated);
        };

        //Add items to the collection that the processors consume
        foreach (var item in source)
        {
            if (Status != JobStatus.Running)
            {
                break;
            }
            if (processors.Any(p => p.BatchWriter.End))
            {
                break;
            }
            items.Add(item);
        }
        items.CompleteAdding();

        //Wait for processors to finish
        foreach (var p in processingThreads)
        {
            p.Join();
        }

        RowsCreated = processors.Sum(p => p.RowsCreated);

        if (Status == JobStatus.Running)
        {
            //Now we know how many items we got for sure. Update progress to 100%
            EstimatedClientItemCount = ItemsProcessed;

            SetStatus(JobStatus.Merging);
            using (var csvWriter = Specification.CreateExporter(TempDirectory))
            {
                var tables = MergedTableData.FromTableSets(processors.Select(p => p.Tables)).ToArray();

                var w = csvWriter as CsvExporter;
                if (w == null || w.KeepOutput)
                {
                    tables = csvWriter.Export(tables).ToArray();
                }

                File.WriteAllText(Path.Combine(jobDirectory, "schema.json"), tables.Select(t => t.Schema).Serialize());

                foreach (var postProcessor in PostProcessors)
                {
                    CurrentPostProcessor = postProcessor;
                    SetStatus(JobStatus.PostProcessing, postProcessor.Name);
                    postProcessor.Process(jobDirectory, tables, Specification);
                }
                CurrentPostProcessor = null;

                foreach (var proc in processors)
                {
                    SizeLimitExceeded = SizeLimitExceeded || proc.BatchWriter.End;
                    proc.BatchWriter.Dispose();
                }

                SetStatus(JobStatus.Completing);
            }
            SetStatus(JobStatus.Completed);
        }
    }
    catch (Exception ex)
    {
        Log.Error("Job failed", ex, this);
        LastException = ex;
        SetStatus(JobStatus.Failed, ex.ToString());
    }

    try
    {
        OnJobEnded();
    }
    catch (Exception ex)
    {
        Log.Error("Exception occurred after job ended", ex, this);
        LastException = ex;
    }

    EndDate = DateTime.Now;

    try
    {
        if (Status == JobStatus.Canceled || Status == JobStatus.Failed)
        {
            Delete();
        }
    }
    catch (Exception ex)
    {
        Log.Error("Exception occurred while deleting job", ex, this);
    }
}
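//Illustrative sketch (not part of the job class above): the producer/consumer pattern
//Run() is built around, reduced to the standard BlockingCollection<T> and Thread
//primitives. All names below are hypothetical; only the pattern mirrors Run(), which
//additionally handles status updates, batch writers, size limits and error reporting.
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading;

internal static class ProducerConsumerSketch
{
    public static void Process(IEnumerable<object> source, int threadCount, int bufferSize, Action<object> handle)
    {
        //Bounded buffer: the producer blocks when the consumers fall behind (back pressure)
        var items = new BlockingCollection<object>(bufferSize);

        //One consumer thread per processor, each draining the shared collection
        var consumers = Enumerable.Range(0, threadCount).Select(_ =>
        {
            var t = new Thread(() =>
            {
                foreach (var item in items.GetConsumingEnumerable())
                {
                    handle(item);
                }
            });
            t.Start();
            return t;
        }).ToArray();

        //Producer: feed the buffer, then signal that no more items will arrive
        foreach (var item in source)
        {
            items.Add(item);
        }
        items.CompleteAdding();

        //Wait for the consumers to finish the remaining items
        foreach (var t in consumers)
        {
            t.Join();
        }
    }
}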