Example #1
        public void NestedChildTablesWithBatchingInCsv()
        {
            var csvDir = Path.Combine(Directory.GetCurrentDirectory(), "~tmp");
            if (Directory.Exists(csvDir)) Directory.Delete(csvDir, true);

            var csv = new CsvExporter(csvDir, binaryPartitions:true);

            var batchWriter = new TableDataBatchWriter(csv);

            //1000 visits.
            //1 row in Test per visit
            //3 rows in Events per visit
            //3 rows in Pages per visit
            //1 row in Pages2 per visit
            //--------------------------
            //8 rows per visit = 8000 in total

            //Batch size 2672 (ceiling(1000/3) = 334 visits) gives two disk partitions with 2672 rows and one with 2656 rows in memory
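            //Worked out: visitPerBatch = ceil(1000 / 3) = 334, so each file partition holds 334 * 8 = 2672 rows;
            //2 * 2672 = 5344 rows go to disk and the remaining 8000 - 5344 = 2656 rows stay in memory.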

            var visitCount = 1000;
            var rowsPerVisit = 8;
            var visitPerBatch = (int)Math.Ceiling(visitCount / 3d);
            var rowsPerFilePartition = visitPerBatch * rowsPerVisit;
            var eventRowsPerVisit = 3;
            var expectedFilePartitions = 2;

            var tables = TestSets.Countries(visitCount, 37).Process(
                () => new SimpleTableMapper(new TableDefinition("Test")
                    .Key("VisitId", s => s.Current<IVisitAggregationContext>().Visit.InteractionId)
                    .Fact("Value", s => s.Current<IVisitAggregationContext>().Visit.Value)
                    .Map(s => s.Current<IVisitAggregationContext>().Visit.Pages,
                        new TableDefinition("Pages")
                            .Key("PageId", s => s.Current<PageData>().Item.Id)
                            .Fact("Value", s => s.Current<PageData>().PageEvents.Sum(pe => pe.Value))
                            .Map(s => s.Current<PageData>().PageEvents,
                                new TableDefinition("Events")
                                    .Dimension("Event", s => s.Current<PageEventData>().PageEventDefinitionId)
                                    .Fact("Value", s => s.Current<PageEventData>().Value)))
                    .Map(s => new[] { s.Current<IVisitAggregationContext>().Visit.Pages.First() },
                        new TableDefinition("Pages2")
                            .Key("Id", s => s.Current<PageData>().Item.Id))),
                initializer: p =>
                {
                    p.BatchWriter = batchWriter;
                    p.BatchSize = rowsPerFilePartition;
                });


            var partitions = new DirectoryInfo(csvDir).GetDirectories().Length;

            Assert.AreEqual(2, partitions, string.Format("{0:N0} rows should create 2 file partitions and one in memory", visitCount * rowsPerVisit));
            Assert.AreEqual(expectedFilePartitions * eventRowsPerVisit * visitPerBatch, batchWriter.Tables.FirstOrDefault(t => t.Schema.Name == "Events").Rows.Count(),
                string.Format("{0:N0} rows in event tables in file partitions", expectedFilePartitions * eventRowsPerVisit * visitPerBatch));

            Assert.AreEqual(expectedFilePartitions * rowsPerVisit * visitPerBatch, batchWriter.Tables.Sum(t => t.Rows.Count()),
                string.Format("{0:N0} total rows in file partitions", expectedFilePartitions * rowsPerVisit * visitPerBatch));

            Assert.AreEqual(visitCount*rowsPerVisit, tables.Sum(t => t.Rows.Count()), 
                string.Format("{0:N0} rows in file + memory partitions", visitCount*rowsPerVisit));

            //Merge partitions
            tables = csv.Export(tables);


            //Delete partitions
            batchWriter.Dispose();
            partitions = new DirectoryInfo(csvDir).GetDirectories().Length;
            Assert.AreEqual(0, partitions, "Temporary partition directories are deleted");


            var visits = tables.FirstOrDefault(t => t.Schema.Name == "Test");
            var pages = tables.FirstOrDefault(t => t.Schema.Name == "Pages");
            var pages2 = tables.FirstOrDefault(t => t.Schema.Name == "Pages2");
            var events = tables.FirstOrDefault(t => t.Schema.Name == "Events");


            Assert.AreEqual(1000, visits.Rows.Count());
            Assert.AreEqual(14000, visits.Fields<int>("Value").Sum());
            Assert.AreEqual(14000, pages.Fields<int>("Value").Sum());
            Assert.AreEqual(14000, events.Fields<int>("Value").Sum());

            Assert.AreEqual(3000, pages.Rows.Count());
            Assert.AreEqual(3000, events.Rows.Count());
            Assert.AreEqual(1000, pages2.Rows.Count());

            if (Directory.Exists(csvDir))
            {
                Directory.Delete(csvDir, true);
            }

        }
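For reference, a minimal sketch of the batching flow the test above exercises, assuming the same CsvExporter / TableDataBatchWriter / DataProcessor surface shown in these examples. The processor, items and outputDirectory variables are illustrative placeholders, and the mapper/table set-up is elided.

    // Full batches spill to disk partitions, the remainder stays in memory,
    // Export merges both, and Dispose removes the temporary partition directories.
    var exporter = new CsvExporter(outputDirectory, binaryPartitions: true);
    using (var batchWriter = new TableDataBatchWriter(exporter))
    {
        processor.BatchWriter = batchWriter;            // processor: a configured DataProcessor (see Example #4)
        processor.BatchSize = 2672;                     // rows per disk partition, as in the test above
        processor.Process(items);                       // items: the source enumerable to map into tables
        var merged = exporter.Export(processor.Tables); // merge disk partitions with the in-memory remainder
    }                                                   // temporary partition directories are deleted here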
Example #3
 public CsvPartition(string directory, CsvExporter owner)
 {
     _owner    = owner;
     Directory = directory;
 }
Example #4
        public void Run()
        {
            if (Status != JobStatus.Pending) throw new InvalidOperationException("Job is not pending");

            try
            {
                Directory.CreateDirectory(TempDirectory);

                File.WriteAllText(Path.Combine(TempDirectory, "specification.json"), Specification.ToString());

                SetStatus(JobStatus.Preparing);

                Specification.Initialize(this);

                var source = Specification.CreateDataSource();                

                PostProcessors = Specification.CreatePostProcessors().ToArray();

                var jobDirectory = TempDirectory;

                //The processors will consume data from this collection.
                var items = new BlockingCollection<object>(ExecutionSettings.DataSourceBufferSize);
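                //The bounded capacity (DataSourceBufferSize) provides backpressure so the data source
                //cannot run arbitrarily far ahead of the processing threads.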

                //Create the processors
                var processors = Enumerable.Range(0, ExecutionSettings.ProcessingThreads).Select(i =>
                {
                    IItemFieldLookup lookup = null;
                    try
                    {
                        lookup = new ItemDatabaseFieldLookup(Database.GetDatabase(ExecutionSettings.DatabaseName),
                            Specification.DefaultLanguage,
                            ExecutionSettings.FieldCacheSize);
                    }
                    catch (Exception ex)
                    {
                        Log.Error("Error initializing item field lookup", ex, this);
                    }

                    var proc = new DataProcessor(Specification.CreateRootMapper())
                    {
                        BatchSize = ExecutionSettings.BatchSize,                        
                        FieldLookup = lookup
                    };

                    var exporter = Specification.CreateExporter(jobDirectory) as CsvExporter; //Move PartitionPrefix so this cast isn't necessary
                    if (exporter == null)
                    {
                        exporter = new CsvExporter(jobDirectory);                        
                    }
                    exporter.PartitionPrefix = "~" + i + "_";
                    exporter.KeepOutput = true; //Don't delete the job's main directory
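                    //All batch writers share the job instance as their sync lock; a writer flags End
                    //once MaximumSize (ExecutionSettings.SizeLimit) is exceeded, which stops the feed below.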
                    proc.BatchWriter = new TableDataBatchWriter(exporter)                    
                    {
                        SyncLock = this,
                        MaximumSize = ExecutionSettings.SizeLimit
                    };

                    proc.Initialize();

                    return proc;
                }).ToArray();



                var hasUpdates = false;                
                //Allow post processors to validate their conditions (if any). This allows the job to fail before data is processed
                //Allow post processors to filter data source for updates
                foreach (var pp in PostProcessors)
                {
                    pp.Validate(processors[0].Tables, Specification);
                    if (pp.UpdateDataSource(processors[0].Tables, source))
                    {
                        if (hasUpdates)
                        {
                            throw new InvalidOperationException("Only one post processor can update the data source");
                        }
                        hasUpdates = true;
                    }
                }

                EstimatedClientItemCount = source.Count;
                
                //Start the processors
                var processingThreads = processors.Select(p =>
                {
                    var t = new Thread(() =>
                    {
                        try
                        {
                            p.Process(items.GetConsumingEnumerable());
                        }
                        catch (Exception ex)
                        {
                            LastException = ex;
                            SetStatus(JobStatus.Failed, ex.Message);
                        }
                    });
                    t.Start();
                    return t;
                }).ToArray();


                if (Status == JobStatus.Failed)
                {
                    throw LastException;
                }
                
                SetStatus(JobStatus.Running);


                source.ItemLoaded += (sender, args) =>
                {
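                    //Throttle progress callbacks: OnProgress fires for every item when StatusUpdateFrequency
                    //is not configured (<= 0), otherwise only every StatusUpdateFrequency items.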
                    if (StatusUpdateFrequency <= 0 || ItemsProcessed % StatusUpdateFrequency == 0)
                    {
                        OnProgress();
                    }
                    ItemsProcessed = args;
                    RowsCreated = processors.Sum(p => p.RowsCreated);
                };                

                //Add items to the collection that the processors consume
                foreach (var item in source)
                {
                    if (Status != JobStatus.Running)
                    {
                        break;
                    }
                    
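                    //Stop feeding the processors once any batch writer reports End (size limit reached).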
                    if (processors.Any(p => p.BatchWriter.End))
                    {
                        break;
                    }
                    items.Add(item);
                }
                items.CompleteAdding();

                //Wait for processors to finish
                foreach (var p in processingThreads)
                {
                    p.Join();
                }

                RowsCreated = processors.Sum(p => p.RowsCreated);

                if (Status == JobStatus.Running)
                {
                    //Now we know how many items we got for sure. Update progress to 100%
                    EstimatedClientItemCount = ItemsProcessed;

                    SetStatus(JobStatus.Merging);

                    using (var csvWriter = Specification.CreateExporter(TempDirectory))
                    {

                        var tables = MergedTableData.FromTableSets(processors.Select(p => p.Tables)).ToArray();

                        
                        var w = csvWriter as CsvExporter;
                        if (w == null || w.KeepOutput)
                        {                            
                            tables = csvWriter.Export(tables).ToArray();
                        }


                        File.WriteAllText(Path.Combine(jobDirectory, "schema.json"),
                            tables.Select(t => t.Schema).Serialize());


                        
                        foreach (var postProcessor in PostProcessors)
                        {
                            CurrentPostProcessor = postProcessor;
                            SetStatus(JobStatus.PostProcessing, postProcessor.Name);

                            postProcessor.Process(jobDirectory, tables, Specification);
                        }
                        CurrentPostProcessor = null;

                        foreach (var proc in processors)
                        {                            
                            SizeLimitExceeded = SizeLimitExceeded || proc.BatchWriter.End;
                            proc.BatchWriter.Dispose();
                        }

                        SetStatus(JobStatus.Completing);
                    }
                    SetStatus(JobStatus.Completed);
                }

            }
            catch (Exception ex)
            {
                Log.Error("Job failed", ex, this);
                LastException = ex;
                SetStatus(JobStatus.Failed, ex.ToString());
            }
            
            try
            {
                OnJobEnded();
            }
            catch (Exception ex)
            {
                Log.Error("Exception occurred after job ended", ex, this);
                LastException = ex;
            }
            EndDate = DateTime.Now;

            try
            {
                if (Status == JobStatus.Canceled || Status == JobStatus.Failed)
                {
                    Delete();
                }
            }
            catch (Exception ex)
            {
                Log.Error("Exception occurred while deleting job", ex, this);
            }
        }