/// <summary>Joins ts (timestamp) contiguous files together until they are larger than <see cref="OptimiseCfg.TargetBytes"/>
/// (the plan is built by <see cref="OptimisePlan(IEnumerable{StoreFileMd},SPath,OptimiseCfg)"/>).</summary>
/// <param name="store">The store to read from and write the joined files to.</param>
/// <param name="destPath">Destination path used to name the joined output files.</param>
/// <param name="files">The candidate files to optimise; enumerated once when building the plan.</param>
/// <param name="cfg">Optimisation settings (target size, parallelism).</param>
/// <param name="log">Optional logger; other overloads in this file default it to null, so it is never dereferenced unconditionally.</param>
/// <returns>The number of files read in and the number of files written out.</returns>
public static async Task<(long optimisedIn, long optimisedOut)> Optimise(this ISimpleFileStore store, SPath destPath, IEnumerable<StoreFileMd> files, OptimiseCfg cfg, ILogger log) {
  var plan = OptimisePlan(files, destPath, cfg);
  var (filesIn, filesOut) = (plan.Sum(p => p.SourceFileCount), plan.Count);
  if (plan.None()) {
    // fix: was `log.Debug(...)` — the only unconditional dereference of `log` in this file.
    // Sibling overloads pass `log` through with a null default, so this could throw NullReferenceException.
    log?.Debug("Optimise {Path} - already optimal", destPath);
  }
  else {
    log?.Debug("Optimise {Path} - starting optimization of {FilesIn} to {FilesOut} files", destPath, filesIn, filesOut);
    await Optimise(store, cfg, plan, log);
  }
  return (filesIn, filesOut);
}
/// <summary>Optimises every partition directory under <paramref name="rootPath"/>: reads the current files,
/// groups them by directory, and runs the per-partition overload over each group with
/// <c>cfg.ParallelFiles</c> parallelism, then logs the aggregate in/out file counts.</summary>
/// NOTE(review): the declaration prefix (e.g. `public static async Task`) is not visible in this chunk —
/// the body uses `await`, so it must be async; confirm the signature against the full file.
Optimise(this ISimpleFileStore store, OptimiseCfg cfg, SPath rootPath, string ts = null, ILogger log = null) {
  // all partition landing files (will group using directories)
  var sw = Stopwatch.StartNew();
  log?.Debug("Optimise {Path} - reading current files", rootPath);
  var (byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();
  log?.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
    rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());
  // one Optimise call per directory group; p.Key is the partition path — ParallelFiles bounds concurrency
  var optimiseRes = await byDir.BlockDo(p => Optimise(store, p.Key, p, cfg, log), cfg.ParallelFiles).ToArrayAsync();
  var optimiseIn = optimiseRes.Sum(r => r.optimisedIn);
  var optimiseOut = optimiseRes.Sum(r => r.optimisedOut);
  log?.Information("Optimise {Path} - optimised {FilesIn} into {FilesOut} files in {Duration}",
    rootPath, optimiseIn, optimiseOut, sw.Elapsed.HumanizeShort());
}
/// <summary>Builds an optimisation plan for all partitions under <paramref name="rootPath"/> without modifying any files:
/// reads the current files, groups them by directory, and plans batches per partition.
/// NOTE(review): the previous summary here described land/stage processing modes and "stats about the optimisation" —
/// that text does not match this method (it only returns a plan) and appears to have been copied from another member; confirm where it belongs.</summary>
/// <param name="ts">Optional timestamp filter passed to ToOptimiseByDir — presumably limits which files are considered; verify its exact semantics there.</param>
/// <returns>The planned batches across all partitions; empty when everything is already optimal.</returns>
public static async Task<IReadOnlyCollection<OptimiseBatch>> OptimisePlan(this ISimpleFileStore store, OptimiseCfg cfg, SPath rootPath, string ts = null, ILogger log = null) {
  log?.Debug("Optimise {Path} - reading current files", rootPath);
  var (byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();
  log?.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
    rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());
  // p.Key is the partition path; flatten the per-partition plans into one collection
  return (byDir.SelectMany(p => OptimisePlan(p, p.Key, cfg)).ToArray());
}
/// <summary>Convenience overload for a jsonl store: optimises the given partition (or the store's root path
/// when <paramref name="partition"/> is null) by delegating to the underlying ISimpleFileStore overload.</summary>
/// NOTE(review): the declaration prefix (e.g. `public static Task`) is not visible in this chunk — confirm against the full file.
Optimise(this IJsonlStore store, OptimiseCfg cfg, SPath partition, string ts = null, ILogger log = null) =>
  store.Store.Optimise(cfg, partition != null ? store.Path.Add(partition) : store.Path, ts, log);
/// <summary>Executes a previously computed optimisation plan, processing the batches in parallel
/// (each batch is paired with its index <c>i</c> before dispatch, presumably for progress logging).</summary>
/// NOTE(review): the lambda body is truncated in this chunk (`async b => {` never closes here) —
/// the remainder of this method is not visible in this view.
public static async Task Optimise(this ISimpleFileStore store, OptimiseCfg cfg, IReadOnlyCollection<OptimiseBatch> plan, ILogger log) => await plan.Select((b, i) => (b, i)).BlockDo(async b => {
/// <summary>Plans how to batch <paramref name="files"/> (processed oldest-first by <c>Ts</c>) so each joined
/// output file ends up near <c>cfg.TargetBytes</c>. A single-file batch is left in place (planned with an
/// empty source list); a multi-file batch is joined into a new file under <paramref name="destPath"/> named
/// from the last (newest) file's timestamp.</summary>
/// <returns>The planned batches; empty when there are no input files.</returns>
public static IReadOnlyCollection<OptimiseBatch> OptimisePlan(this IEnumerable<StoreFileMd> files, SPath destPath, OptimiseCfg cfg) {
  // oldest-first so each joined file covers a contiguous timestamp range
  var toProcess = files.OrderBy(f => f.Ts).ToQueue();
  var batch = new List<StoreFileMd>();          // files accumulated for the batch currently being built
  var optimisePlan = new List<OptimiseBatch>(); // completed batches, in order

  if (toProcess.None()) { return (Array.Empty<OptimiseBatch>()); }

  while (toProcess.Any()) {
    var file = toProcess.Dequeue();
    var (nextBytes, nextIsFurther) = BatchSize(file);
    // Flush BEFORE adding when adding would overshoot the target by more than the
    // current batch undershoots it — i.e. the batch without this file is closer to target.
    if (nextBytes > cfg.TargetBytes && nextIsFurther) // if adding this file will make it too big, optimise the current batch as is
    {
      PlanCurrentBatch();
    }
    batch.Add(file);
    if (toProcess.None() || batch.Sum(f => f.Bytes) > cfg.TargetBytes) // join if big enough, or this is the last batch
    {
      PlanCurrentBatch();
    }
  }

  // Size the current batch would have if `file` were added, plus whether that result is
  // further from cfg.TargetBytes than the batch already is (overshoot > current undershoot).
  // Bytes appears to be a nullable long (hence .HasValue/.Value below) — TODO confirm on StoreFileMd.
  (long nextBytes, bool nextIsFurther) BatchSize(StoreFileMd file) {
    var bytes = batch.Sum(f => f.Bytes);
    var nextBytes = bytes + file.Bytes;
    // Comparisons on a null nullable evaluate false, so this is computed before the HasValue guard without throwing.
    var nextIsFurther = nextBytes - cfg.TargetBytes > cfg.TargetBytes - bytes;
    // Planning requires every file's size to be known up front.
    // NOTE(review): "know" in the message is a typo for "known" — left unchanged (runtime string).
    if (!nextBytes.HasValue) { throw new InvalidOperationException($"Optimisation requires the file bytes are know. Missing on file {file.Path}"); }
    return (nextBytes.Value, nextIsFurther);
  }

  // Converts the accumulated `batch` into a plan entry and resets it.
  void PlanCurrentBatch() {
    OptimiseBatch batchPlan = batch.Count switch {
      1 => new(batch.First().Path, Array.Empty<StoreFileMd>()), // leave file as is
      > 1 => new(FilePath(destPath, batch.Last().Ts), batch.ToArray()), // join into one file named from the newest Ts
      _ => null // empty batch (e.g. flushed twice in one iteration) — nothing to plan
    };
    if (batchPlan != null) { optimisePlan.Add(batchPlan); }
    batch.Clear();
  }

  return (optimisePlan);
}