        public static async Task Optimise(this ISimpleFileStore store, OptimiseCfg cfg, StringPath rootPath, string ts = null, ILogger log = null)
        {
            // all partition landing files (will group using directories)
            var sw = Stopwatch.StartNew();

            log.Debug("Optimise {Path} - reading current files", rootPath);
            var (byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();

            log.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
                      rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());

            var optimiseRes = await byDir.BlockFunc(p => Optimise(store, p.Key, p, cfg, log), cfg.Parallel);

            var optimiseIn  = optimiseRes.Sum(r => r.optimisedIn);
            var optimiseOut = optimiseRes.Sum(r => r.optimisedOut);

            log.Information("Optimise {Path} - optimised {FilesIn} into {FilesOut} files in {Duration}",
                            rootPath, optimiseIn, optimiseOut, sw.Elapsed.HumanizeShort());
        }
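
For context, a hypothetical call site for this overload (not from the source): the OptimiseCfg initializer shape, the StringPath constructor, and the path value are assumptions; only TargetBytes and Parallel are referenced by the method bodies on this page.

// Hypothetical usage sketch: optimise all partition files under a root path.
// `store` is any ISimpleFileStore implementation already in scope.
var cfg = new OptimiseCfg { TargetBytes = 200 * 1024 * 1024, Parallel = 4 };
await store.Optimise(cfg, new StringPath("db/videos"), ts: null, log: log);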
Example #2
        /// <summary>Join contiguous files (ordered by ts, the timestamp) together until they exceed TargetBytes</summary>
        static async Task<(long optimisedIn, long optimisedOut)> Optimise(ISimpleFileStore store, StringPath partitionPath, IEnumerable<StoreFileMd> files,
                                                                          OptimiseCfg cfg, ILogger log)
        {
            var toProcess = files.OrderBy(f => f.Ts).ToQueue();

            log.Debug("Optimise {Path} - Processing {Files} files in partition {Partition}",
                      partitionPath, toProcess.Count, partitionPath);

            var currentBatch = new List<StoreFileMd>();
            var optimisePlan = new List<StoreFileMd[]>();

            if (toProcess.None())
            {
                return (0, 0);
            }

            while (toProcess.Any())
            {
                var file = toProcess.Dequeue();

                var (nextBytes, nextIsFurther) = BatchSize(file);
                if (nextBytes > cfg.TargetBytes && nextIsFurther) // adding this file would overshoot the target by more than the batch undershoots it, so plan the current batch as is
                {
                    PlanCurrentBatch();
                }

                currentBatch.Add(file);
                if (toProcess.None() || currentBatch.Sum(f => f.Bytes) > cfg.TargetBytes) // plan if this was the last file, or the batch is now big enough
                {
                    PlanCurrentBatch();
                }
            }

            (long nextBytes, bool nextIsFurther) BatchSize(StoreFileMd file)
            {
                var bytes     = currentBatch.Sum(f => f.Bytes);
                var nextBytes = bytes + file.Bytes;
                // true when adding the file would leave the batch further from the target than it is now
                var nextIsFurther = nextBytes - cfg.TargetBytes > cfg.TargetBytes - bytes;

                return (nextBytes, nextIsFurther);
            }

            void PlanCurrentBatch()
            {
                if (currentBatch.Count > 1) // only plan a batch if there is more than one file in it
                {
                    optimisePlan.Add(currentBatch.ToArray());
                }
                currentBatch.Clear();
            }

            if (optimisePlan.None())
            {
                log.Debug("Optimise {Path} - already optimal", partitionPath);
            }
            else
            {
                log.Debug("Optimise {Path} - staring to execute optimisation plan", partitionPath);
                await optimisePlan.Select((b, i) => (b, i)).BlockAction(async b => {
                    var (batch, i)  = b;
                    var optimiseRes = await JoinFiles(store, batch, partitionPath, cfg.Parallel, log).WithDuration();
                    log.Debug("Optimise {Path} - optimised file {OptimisedFile} from {FilesIn} in {Duration}. batch {Batch}/{Total}",
                              partitionPath, optimiseRes.Result, batch.Length, optimiseRes.Duration.HumanizeShort(), i + 1, optimisePlan.Count);
                }, cfg.Parallel);
            }
            return (optimisePlan.Sum(p => p.Length), optimisePlan.Count);
        }
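
The BatchSize/nextIsFurther test above closes a batch early only when adding the next file would overshoot TargetBytes by more than the batch currently undershoots it. A minimal standalone sketch of the same greedy rule, with StoreFileMd and the plan execution stripped down to plain byte counts (an illustration, not the library's API):

using System.Collections.Generic;
using System.Linq;

static class BatchingSketch
{
    // Greedy batching as in Optimise above: close a batch early when adding the
    // next size would land further from target than the batch already is, and
    // leave single-file batches alone (they are already optimal).
    public static List<long[]> PlanBatches(long[] sizes, long targetBytes)
    {
        var plan  = new List<long[]>();
        var batch = new List<long>();

        void PlanCurrent()
        {
            if (batch.Count > 1) plan.Add(batch.ToArray());
            batch.Clear();
        }

        for (var i = 0; i < sizes.Length; i++)
        {
            var current = batch.Sum();
            var next    = current + sizes[i];
            // overshoot from adding exceeds the current undershoot -> plan the batch without this file
            if (next > targetBytes && next - targetBytes > targetBytes - current) PlanCurrent();
            batch.Add(sizes[i]);
            if (i == sizes.Length - 1 || batch.Sum() > targetBytes) PlanCurrent();
        }
        return plan;
    }
}

For sizes { 60, 60, 60, 120 } with a 100-byte target this plans a single batch [60, 60]: the first two files merge, while the remaining 60 and the 120 stay as one-file batches and are left untouched.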
Example #3
        /// <summary>Process new files in land into stage. Note on the different modes:
        ///   - LandAndStage: immutable operation; load data downstream from the stage directory.
        ///   - Default: mutates the files in the same directory. Downstream operations don't run while processing, and fully reload after this is complete.</summary>
        /// <returns>stats about the optimisation, and all new files (optimised or not) based on the timestamp</returns>
        // all partition landing files (will group using directories)
        public static async Task<(long optimiseIn, long optimiseOut, StoreFileMd[] files)> Optimise(this ISimpleFileStore store, OptimiseCfg cfg, StringPath rootPath, string ts = null, ILogger log = null)
        {
            var sw = Stopwatch.StartNew();

            log.Debug("Optimise {Path} - reading current files", rootPath);
            var (byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();

            log.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
                      rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());

            var optimiseRes = await byDir.BlockFunc(p => Optimise(store, p.Key, p, cfg, log), cfg.Parallel);

            var res = (
                optimiseIn: optimiseRes.Sum(r => r.optimisedIn),
                optimiseOut: optimiseRes.Sum(r => r.optimisedOut),
                files: byDir.SelectMany(df => df).ToArray());

            log.Information("Optimise {Path} - optimised {FilesIn} into {FilesOut} files in {Duration}",
                            rootPath, res.optimiseIn, res.optimiseOut, sw.Elapsed.HumanizeShort());

            return res;
        }
        public static Task<(long optimiseIn, long optimiseOut, StoreFileMd[] files)> Optimise(this IJsonlStore store, OptimiseCfg cfg, StringPath partition, string ts = null, ILogger log = null) =>
            store.Store.Optimise(cfg, partition != null ? store.Path.Add(partition) : store.Path, ts, log);
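
Finally, a hypothetical call to the IJsonlStore wrapper, deconstructing the result tuple. The partition name is invented, and StringPath is assumed to be constructible from a string:

// Hypothetical usage: optimise one partition of a jsonl store.
var (optimiseIn, optimiseOut, files) = await jsonlStore.Optimise(cfg, new StringPath("video_stats"), ts: null, log: log);
log.Information("Optimised {In} files into {Out}; {Files} files present after", optimiseIn, optimiseOut, files.Length);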