Esempio n. 1
0
    /// <summary>Join ts (timestamp) contiguous files together until they are > MaxBytes</summary>
    /// <param name="store">Store holding the files to optimise</param>
    /// <param name="destPath">Directory the joined (optimised) files are written under</param>
    /// <param name="files">Candidate files; batched in timestamp order by <c>OptimisePlan</c></param>
    /// <param name="cfg">Optimisation settings (target batch size, parallelism)</param>
    /// <param name="log">Logger; tolerated as null (all calls are null-conditional)</param>
    /// <returns>File counts before (<c>optimisedIn</c>) and after (<c>optimisedOut</c>) optimisation</returns>
    public static async Task <(long optimisedIn, long optimisedOut)> Optimise(this ISimpleFileStore store, SPath destPath, IEnumerable <StoreFileMd> files,
                                                                              OptimiseCfg cfg, ILogger log)
    {
        var plan = OptimisePlan(files, destPath, cfg);

        var(filesIn, filesOut) = (plan.Sum(p => p.SourceFileCount), plan.Count);
        if (plan.None())
        {
            // nothing to join - every file is already in an optimal batch.
            // was `log.Debug` (NRE when log is null); now null-conditional like the rest of the file
            log?.Debug("Optimise {Path} - already optimal", destPath);
        }
        else
        {
            log?.Debug("Optimise {Path} - starting optimization of {FilesIn} to {FilesOut} files", destPath, filesIn, filesOut);
            await Optimise(store, cfg, plan, log);
        }
        return(filesIn, filesOut);
    }
Esempio n. 2
0
    // Optimises every partition under rootPath: reads the current files grouped by directory,
    // then runs the per-partition Optimise for each group with cfg.ParallelFiles concurrency,
    // logging totals (files in/out and elapsed time) at the end.
    // NOTE(review): the declaration below appears truncated in this excerpt - modifiers and
    // return type are missing (the body awaits, so presumably `public static async Task`);
    // confirm against the full source.
    Optimise(this ISimpleFileStore store, OptimiseCfg cfg, SPath rootPath, string ts = null, ILogger log = null)
    {
        // all partition landing files (will group using directories)
        var sw = Stopwatch.StartNew();

        log?.Debug("Optimise {Path} - reading current files", rootPath);
        // ts presumably restricts which files are considered - confirm in ToOptimiseByDir
        var(byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();

        log?.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
                   rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());

        // one Optimise call per directory/partition, capped at cfg.ParallelFiles in flight
        var optimiseRes = await byDir.BlockDo(p => Optimise(store, p.Key, p, cfg, log), cfg.ParallelFiles).ToArrayAsync();

        var optimiseIn  = optimiseRes.Sum(r => r.optimisedIn);
        var optimiseOut = optimiseRes.Sum(r => r.optimisedOut);

        log?.Information("Optimise {Path} - optimised {FilesIn} into {FilesOut} files in {Duration}",
                         rootPath, optimiseIn, optimiseOut, sw.Elapsed.HumanizeShort());
    }
Esempio n. 3
0
    /// <summary>Builds (without executing) the optimisation plan for every partition under
    ///   <paramref name="rootPath"/>: reads the current files grouped by directory and plans which
    ///   contiguous files should be joined. NOTE(review): the previous summary here described
    ///   land/stage processing and a different return value - it appears copied from another
    ///   method; this one only reads files and returns planned batches.</summary>
    /// <returns>The planned batches across all partitions</returns>
    public static async Task <IReadOnlyCollection <OptimiseBatch> > OptimisePlan(this ISimpleFileStore store, OptimiseCfg cfg, SPath rootPath,
                                                                                 string ts = null, ILogger log = null)
    {
        log?.Debug("Optimise {Path} - reading current files", rootPath);
        // ts presumably restricts which files are considered - confirm in ToOptimiseByDir
        var(byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();

        log?.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
                   rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());
        // plan each directory independently against its own destination (the directory key)
        return(byDir.SelectMany(p => OptimisePlan(p, p.Key, cfg)).ToArray());
    }
Esempio n. 4
0
 // Convenience overload for IJsonlStore: forwards to the underlying store's Optimise,
 // targeting the store's root path, or the given sub-partition of it when partition is non-null.
 // NOTE(review): the declaration appears truncated in this excerpt (modifiers/return type
 // missing) - confirm against the full source.
 Optimise(this IJsonlStore store, OptimiseCfg cfg, SPath partition, string ts = null, ILogger log = null) =>
 store.Store.Optimise(cfg, partition != null ? store.Path.Add(partition) : store.Path, ts, log);
Esempio n. 5
0
 // Executes a previously computed plan: processes each batch (paired with its index) via BlockDo.
 // NOTE(review): this excerpt is truncated - the async lambda body passed to BlockDo is cut off,
 // so the per-batch work (and the concurrency argument) is not visible here.
 public static async Task Optimise(this ISimpleFileStore store, OptimiseCfg cfg, IReadOnlyCollection <OptimiseBatch> plan, ILogger log) =>
 await plan.Select((b, i) => (b, i)).BlockDo(async b => {
Esempio n. 6
0
    /// <summary>Plans how to join timestamp-ordered files into batches close to
    ///   <c>cfg.TargetBytes</c>. Pure planning - no store operations are performed.</summary>
    /// <param name="files">Files to batch; ordered by their Ts (timestamp) before planning</param>
    /// <param name="destPath">Directory used to name each joined batch's destination file</param>
    /// <param name="cfg">Supplies the target batch size in bytes</param>
    /// <returns>One entry per planned output file; a single-file batch keeps its original path
    ///   with no source files (i.e. it is left as-is)</returns>
    /// <exception cref="InvalidOperationException">When a file's byte size is missing</exception>
    public static IReadOnlyCollection <OptimiseBatch> OptimisePlan(this IEnumerable <StoreFileMd> files, SPath destPath, OptimiseCfg cfg)
    {
        var toProcess = files.OrderBy(f => f.Ts).ToQueue();

        var batch        = new List <StoreFileMd>();
        var optimisePlan = new List <OptimiseBatch>();

        if (toProcess.None())
        {
            return(Array.Empty <OptimiseBatch>());
        }

        while (toProcess.Any())
        {
            var file = toProcess.Dequeue();

            var(nextBytes, nextIsFurther) = BatchSize(file);
            // close the current batch first when adding this file would overshoot the target by
            // more than the batch currently undershoots it (stopping now lands closer to target)
            if (nextBytes > cfg.TargetBytes && nextIsFurther)
            {
                PlanCurrentBatch();
            }

            batch.Add(file);
            if (toProcess.None() || batch.Sum(f => f.Bytes) > cfg.TargetBytes) // join if big enough, or this is the last batch
            {
                PlanCurrentBatch();
            }
        }

        // size the batch would be if file were added, and whether that size is further from the
        // target than the batch's current size
        (long nextBytes, bool nextIsFurther) BatchSize(StoreFileMd file)
        {
            var bytes         = batch.Sum(f => f.Bytes);
            var nextBytes     = bytes + file.Bytes;
            var nextIsFurther = nextBytes - cfg.TargetBytes > cfg.TargetBytes - bytes;

            if (!nextBytes.HasValue)
            {
                // fixed typo in the message: "are know" -> "are known"
                throw new InvalidOperationException($"Optimisation requires the file bytes are known. Missing on file {file.Path}");
            }
            return(nextBytes.Value, nextIsFurther);
        }

        // records the current batch into the plan (single files stay in place; empty batches are
        // dropped) and clears the batch for the next round
        void PlanCurrentBatch()
        {
            OptimiseBatch batchPlan = batch.Count switch {
                1 => new(batch.First().Path, Array.Empty <StoreFileMd>()), // leave file as is
                > 1 => new(FilePath(destPath, batch.Last().Ts), batch.ToArray()),
                _ => null
            };

            if (batchPlan != null)
            {
                optimisePlan.Add(batchPlan);
            }
            batch.Clear();
        }

        return(optimisePlan);
    }