public WebScraper(ProxyCfg proxy, YtCollectCfg collectCfg, ISimpleFileStore logStore) {
  Proxy = proxy;
  CollectCfg = collectCfg;
  LogStore = logStore;
  Clients = new ResourceCycle<HttpClient, ProxyConnectionCfg>(proxy.DirectAndProxies(), p => Task.FromResult(CreateHttpClient(p)));
}
public YtResults(SnowflakeConnectionProvider sf, ResultsCfg resCfg, ISimpleFileStore store, UserScrapeCfg userScrapeCfg) {
  Sf = sf;
  ResCfg = resCfg;
  Store = store;
  UserScrapeCfg = userScrapeCfg;
}
static async Task<StringPath> JoinFiles(ISimpleFileStore store, IReadOnlyCollection<StoreFileMd> toOptimise, StringPath destPath, int parallel, ILogger log) {
  var optimisedFile = FilePath(destPath, toOptimise.Last().Ts);
  using (var joinedStream = new MemoryStream()) {
    using (var zipWriter = new GZipStream(joinedStream, CompressionLevel.Optimal, leaveOpen: true)) {
      var inStreams = await toOptimise.BlockFunc(async s => {
        var inStream = await store.Load(s.Path, log).WithDuration();
        log.Debug("Optimise {Path} - loaded file {SourceFile} to be optimised in {Duration}",
          destPath, s.Path, inStream.Duration.HumanizeShort());
        return inStream.Result;
      }, parallel);
      foreach (var s in inStreams) {
        using var zr = new GZipStream(s, CompressionMode.Decompress, leaveOpen: false);
        await zr.CopyToAsync(zipWriter);
      }
    }
    joinedStream.Seek(0, SeekOrigin.Begin);
    await store.Save(optimisedFile, joinedStream);
  }

  // When in-place, this is dirty if we fail now. There is no transaction capability in cloud storage,
  // so the downstream process must handle duplicates.
  // Successfully staged files are deleted from land. Incremental loading using Ts would work without the
  // delete, but it's more efficient to delete the processed landed files.
  await toOptimise.BlockAction(f => store.Delete(f.Path), parallel)
    .WithWrappedException(e => "Failed to delete optimised files. Duplicate records need to be handled downstream");
  log.Debug("Optimise {Path} - deleted {Files} that were optimised into {OptimisedFile}",
    destPath, toOptimise.Count, optimisedFile);
  return optimisedFile;
}
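// Usage sketch for JoinFiles (hypothetical: `store`, `log` and the partition path are
// assumptions, not from this code). Joins a partition's landed files into one file.
// Because object stores have no transactions, a failure between Save and Delete can leave
// both the joined file and some sources behind, so downstream reads must de-duplicate.
var landed = await store.Files("db/videos/land");
var joined = await JoinFiles(store, landed.OrderBy(f => f.Ts).ToArray(), "db/videos/land", parallel: 4, log);
log.Information("Joined {Files} files into {File}", landed.Count, joined);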
public YtCollect(YtStore store, ISimpleFileStore simpleFileStore, AppCfg cfg, ILogger log) {
  Yt = store;
  Store = simpleFileStore;
  Cfg = cfg;
  Log = log;
}
public YtWeb(ProxyCfg proxy, YtCollectCfg collectCfg, ISimpleFileStore logStore) {
  Proxy = proxy;
  CollectCfg = collectCfg;
  LogStore = logStore;
  Clients = new(proxy.DirectAndProxies(), p => Task.FromResult(p.CreateHttpClient()));
}
/// <summary>Serializes item into the object store</summary>
/// <param name="path">The path to the object (no extension)</param>
public static async Task Set<T>(this ISimpleFileStore store, StringPath path, T item, bool zip = true,
  ILogger log = default, JsonSerializerSettings jCfg = default) {
  await using var memStream = new MemoryStream();
  var serializer = jCfg != null ? JsonSerializer.Create(jCfg) : JsonExtensions.DefaultSerializer;
  if (zip) {
    await using var zipWriter = new GZipStream(memStream, CompressionLevel.Optimal, leaveOpen: true);
    await using var tw = new StreamWriter(zipWriter, Encoding.UTF8);
    serializer.Serialize(new JsonTextWriter(tw), item);
  }
  else {
    await using var tw = new StreamWriter(memStream, Encoding.UTF8, leaveOpen: true);
    serializer.Serialize(new JsonTextWriter(tw), item);
  }
  var fullPath = path.AddJsonExtention(zip);
  memStream.Seek(0, SeekOrigin.Begin);
  await store.Save(fullPath, memStream, log);
}
public AppendCollectionStore(ISimpleFileStore store, StringPath path, Func<T, string> getTs, string version, ILogger log) {
  Store = store;
  Path = path;
  GetTs = getTs;
  Version = version;
  Log = log;
}
public UserScrape(AzureContainers containers, RootCfg rootCfg, UserScrapeCfg cfg, SemVersion version, YtStore store) {
  Containers = containers;
  RootCfg = rootCfg;
  Cfg = cfg;
  Version = version;
  Store = store.Store;
}
public FileCollection(ISimpleFileStore s3, Expression<Func<T, string>> getId, StringPath path,
  CollectionCacheType cacheType = CollectionCacheType.Memory, FPath localCacheDir = null) {
  Store = s3;
  GetId = getId.Compile();
  Path = path;
  CacheType = cacheType;
  LocalCacheDir = localCacheDir;
  Cache = new KeyedCollection<string, T>(getId, theadSafe: true);
}
/// <summary>Plans how new files in land will be optimised into stage. Note on the two modes:
/// - LandAndStage: immutable operation; load data downstream from the stage directory.
/// - Default: mutates the files in the same directory; downstream operations should not run while processing, and must fully reload after this completes.</summary>
/// <returns>the planned optimisation batches (contiguous source files grouped per join)</returns>
public static async Task<IReadOnlyCollection<OptimiseBatch>> OptimisePlan(this ISimpleFileStore store, OptimiseCfg cfg, SPath rootPath,
  string ts = null, ILogger log = null) {
  log?.Debug("Optimise {Path} - reading current files", rootPath);
  var (byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();
  log?.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
    rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());
  return byDir.SelectMany(p => OptimisePlan(p, p.Key, cfg)).ToArray();
}
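// Dry-run sketch using OptimisePlan (hypothetical: the OptimiseCfg initialiser and the
// paths are assumptions): inspect what would be joined without touching any files.
var cfg = new OptimiseCfg { TargetBytes = 200 * 1024 * 1024, Parallel = 4 };
var plan = await store.OptimisePlan(cfg, "db/videos", ts: null, log);
foreach (var batch in plan)
  log.Information("would join {Files} source files", batch.SourceFileCount);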
public ChromeScraper(ProxyCfg proxyCfg, YtCollectCfg collectCfg, ISimpleFileStore logStore) {
  ProxyCfg = proxyCfg;
  CollectCfg = collectCfg;
  LogStore = logStore;
  ExecutablePath = new(async () => {
    var revisionInfo = await new BrowserFetcher().DownloadAsync(802497); // revision needs to be recent to be able to use optional chaining
    return revisionInfo.ExecutablePath;
  });
}
public YtStore(YtClient reader, ISimpleFileStore store) {
  Yt = reader;
  Store = store;
  Videos = new FileCollection<VideoStored>(Store, v => v.VideoId, "Videos", Yt.Cfg.CacheType, CacheDataDir);
  Channels = new FileCollection<ChannelStored>(Store, v => v.ChannelId, "Channels", Yt.Cfg.CacheType, CacheDataDir);
  RecommendedVideosCollection = new FileCollection<RecommendedVideoStored>(Store, v => v.VideoId, "RecommendedVideos", Yt.Cfg.CacheType, CacheDataDir);
  ChannelVideosCollection = new FileCollection<ChannelVideosStored>(Store, c => c.ChannelId, "ChannelVideos", Yt.Cfg.CacheType, CacheDataDir);
}
/// <summary>A store of records appended as timestamped jsonl files</summary>
/// <param name="getTs">A function to get a timestamp for this file. The timestamp must always be greater for new records
/// when compared with an invariant string comparer</param>
public JsonlStore(ISimpleFileStore store, StringPath path, Func<T, string> getTs, ILogger log,
  string version = "", Func<T, string> getPartition = null, int parallel = 8) {
  Store = store;
  Path = path;
  GetTs = getTs;
  Log = log;
  GetPartition = getPartition;
  Parallel = parallel;
  Version = version;
}
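// Sketch of a getTs that satisfies the invariant-ordering requirement (hypothetical:
// the VideoRow type and its Updated property are assumptions). A fixed-width, UTC,
// ISO 8601-style format makes ordinal string order match chronological order.
var videos = new JsonlStore<VideoRow>(store, "db/videos", v => v.Updated.ToString("yyyy-MM-dd_HH-mm-ss"), log);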
public static async Task<T> GetOrCreate<T>(this ISimpleFileStore store, StringPath path, Func<T> create = null) where T : class, new() {
  var o = await store.Get<T>(path);
  if (o == null) {
    o = create == null ? new T() : create();
    await store.Set(path, o);
  }
  return o;
}
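// Usage sketch (hypothetical: the CollectState type and its path are assumptions):
// load persisted state, creating and saving a default the first time it's requested.
var state = await store.GetOrCreate<CollectState>("state/collect");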
public static async Task<T> Get<T>(this ISimpleFileStore store, StringPath path, bool zip = true, ILogger log = null) {
  using var stream = await store.Load(path.AddJsonExtention(zip), log);
  if (!zip) return stream.ToObject<T>();
  await using var zr = new GZipStream(stream, CompressionMode.Decompress, leaveOpen: true);
  return zr.ToObject<T>();
}
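// Round-trip sketch for Set/Get (hypothetical: the Rec type is an assumption). Set
// serialises and gzips to the path plus a json extension (via AddJsonExtention);
// Get reverses it using the same zip flag.
var rec = new Rec { Id = "a", Score = 1.5 };
await store.Set("recs/a", rec);
var loaded = await store.Get<Rec>("recs/a");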
static async IAsyncEnumerable<IReadOnlyCollection<StoreFileMd>> JsonStoreFiles(this ISimpleFileStore store, SPath path, bool allDirectories = false) {
  var allFiles = await store.List(path, allDirectories).ToArrayAsync();
  foreach (var batch in allFiles)
    yield return batch
      .Where(p => !p.Path.Name.StartsWith("_") && p.Path.Name.EndsWith(Extension))
      .Select(StoreFileMd.FromFileItem).ToArray();
}
/// <summary>Join ts (timestamp) contiguous files together until they are > MaxBytes</summary>
public static async Task<(long optimisedIn, long optimisedOut)> Optimise(this ISimpleFileStore store, SPath destPath,
  IEnumerable<StoreFileMd> files, OptimiseCfg cfg, ILogger log) {
  var plan = OptimisePlan(files, destPath, cfg);
  var (filesIn, filesOut) = (plan.Sum(p => p.SourceFileCount), plan.Count);
  if (plan.None()) {
    log?.Debug("Optimise {Path} - already optimal", destPath);
  }
  else {
    log?.Debug("Optimise {Path} - starting optimisation of {FilesIn} into {FilesOut} files", destPath, filesIn, filesOut);
    await Optimise(store, cfg, plan, log);
  }
  return (filesIn, filesOut);
}
/// <summary>Copies the given files into staging tables. NOTE: filesToCopy is relative to the store path. The resulting
/// LoadHistoryRow.File paths are converted to be equivalent</summary>
public static async Task<LoadHistoryRow[]> CopyInto(this StageTableCfg t, ISimpleFileStore store, string sfStage,
  ILoggedConnection<IDbConnection> db, [CanBeNull] SPath[] filesToCopy, ILogger log) {
  if (filesToCopy?.Length > 1000) throw new("copying 1k+ files not implemented");
  var stagePath = new string[] { sfStage, store.BasePathSansContainer().Dot(c => c.IsEmpty ? null : c) }
    .Concat(t.Dir.Tokens).NotNull().Join("/");
  var cols = await t.TableCols(db);
  // support subsets of columns (e.g. no loaded or updated columns)
  var selectCols = cols.Join(",", c => c.column_name.ToLowerInvariant() switch {
    "v" => "$1 v",
    "loaded" => "sysdate() loaded",
    "updated" => "v:Updated::timestamp_ntz updated",
    _ => throw new($"stage column {c.column_name} not supported")
  });
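// Usage sketch (hypothetical: the stage name, table cfg, connection and file list are
// assumptions): copy newly landed files for one table into its Snowflake staging table.
var history = await videoTableCfg.CopyInto(store, "@yt_data", db, newFiles, log);
log.Information("CopyInto - recorded {Rows} load-history rows", history.Length);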
public static async Task Optimise(this ISimpleFileStore store, OptimiseCfg cfg, SPath rootPath, string ts = null, ILogger log = null) {
  // all partition landing files (will group using directories)
  var sw = Stopwatch.StartNew();
  log?.Debug("Optimise {Path} - reading current files", rootPath);
  var (byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();
  log?.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
    rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());
  var optimiseRes = await byDir.BlockDo(p => Optimise(store, p.Key, p, cfg, log), cfg.ParallelFiles).ToArrayAsync();
  var optimiseIn = optimiseRes.Sum(r => r.optimisedIn);
  var optimiseOut = optimiseRes.Sum(r => r.optimisedOut);
  log?.Information("Optimise {Path} - optimised {FilesIn} into {FilesOut} files in {Duration}",
    rootPath, optimiseIn, optimiseOut, sw.Elapsed.HumanizeShort());
}
/// <summary>Join ts (timestamp) contiguous files together until they are > MaxBytes</summary>
static async Task<(long optimisedIn, long optimisedOut)> Optimise(ISimpleFileStore store, StringPath partitionPath,
  IEnumerable<StoreFileMd> files, OptimiseCfg cfg, ILogger log) {
  var toProcess = files.OrderBy(f => f.Ts).ToQueue();
  log.Debug("Optimise {Path} - Processing {Files} files in partition {Partition}", partitionPath, toProcess.Count, partitionPath);

  var currentBatch = new List<StoreFileMd>();
  var optimisePlan = new List<StoreFileMd[]>();

  if (toProcess.None()) return (0, 0);

  while (toProcess.Any()) {
    var file = toProcess.Dequeue();
    var (nextBytes, nextIsFurther) = BatchSize(file);
    if (nextBytes > cfg.TargetBytes && nextIsFurther) // if adding this file would overshoot the target by more than stopping short does, plan the current batch as-is
      PlanCurrentBatch();
    currentBatch.Add(file);
    if (toProcess.None() || currentBatch.Sum(f => f.Bytes) > cfg.TargetBytes) // join if big enough, or this is the last batch
      PlanCurrentBatch();
  }

  (long nextBytes, bool nextIsFurther) BatchSize(StoreFileMd file) {
    var bytes = currentBatch.Sum(f => f.Bytes);
    var nextBytes = bytes + file.Bytes;
    var nextIsFurther = nextBytes - cfg.TargetBytes > cfg.TargetBytes - bytes;
    return (nextBytes, nextIsFurther);
  }

  void PlanCurrentBatch() {
    if (currentBatch.Count > 1) // only plan a batch if there is more than one file in it
      optimisePlan.Add(currentBatch.ToArray());
    currentBatch.Clear();
  }

  if (optimisePlan.None()) {
    log.Debug("Optimise {Path} - already optimal", partitionPath);
  }
  else {
    log.Debug("Optimise {Path} - starting to execute optimisation plan", partitionPath);
    await optimisePlan.Select((b, i) => (b, i)).BlockAction(async b => {
      var (batch, i) = b;
      var optimiseRes = await JoinFiles(store, batch, partitionPath, cfg.Parallel, log).WithDuration();
      log.Debug("Optimise {Path} - optimised file {OptimisedFile} from {FilesIn} in {Duration}. batch {Batch}/{Total}",
        partitionPath, optimiseRes.Result, batch.Length, optimiseRes.Duration.HumanizeShort(), i + 1, optimisePlan.Count);
    }, cfg.Parallel);
  }
  return (optimisePlan.Sum(p => p.Length), optimisePlan.Count);
}
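// Worked example of the batching heuristic above (sizes are made up; TargetBytes = 100).
// Files in ts order: 60, 70, 30, 50.
//  - 60: nextBytes 60, under target: add.                          batch = [60]
//  - 70: nextBytes 130 overshoots, but 130 is closer to the target than 60 is
//        (30 over vs 40 under), so add first; the batch now exceeds the target
//        and is planned.                                           plan [60, 70]
//  - 30: under target: add.                                        batch = [30]
//  - 50: add; the queue is now empty, so the final batch is planned. plan [30, 50]
// A file that would overshoot by more than the current shortfall (e.g. 90 after 60)
// closes the batch first; single-file batches are dropped, since such a file is
// already as optimal as it can be.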
public static SPath BasePathSansContainer(this ISimpleFileStore store) => new(store.BasePath.Tokens.Skip(1));
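// Example (the base path is an assumption): strips the leading container token,
// giving the path inside the container as needed for stage paths in CopyInto.
var inContainer = store.BasePathSansContainer(); // "container/db" -> "db"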
public BlobIndex(ISimpleFileStore store) => Store = store;
public KeyedCollectionStore(ISimpleFileStore store, Func<T, string> getId, SPath path) { Store = store; GetId = getId; Path = path; }
public YtStore(ISimpleFileStore store, ILogger log) { Store = store; Log = log; }
public StoreUpgrader(AppCfg cfg, ISimpleFileStore store, ILogger log) { Cfg = cfg; Store = store; Log = log; }
public YtResults(SnowflakeCfg snowflakeCfg, ResultsCfg resCfg, ISimpleFileStore store, ILogger log) {
  SnowflakeCfg = snowflakeCfg;
  ResCfg = resCfg;
  Store = store;
  Log = log;
}
public YtConvertWatchTimeFiles(YtStores stores) { Store = stores.Store(DataStoreType.Root); }
static async Task<IGrouping<string, StoreFileMd>[]> ToOptimiseByDir(ISimpleFileStore store, StringPath landPath, string ts) =>
  (await store.Files(landPath, allDirectories: true))
  .Where(f => ts == null || string.CompareOrdinal(f.Ts, ts) > 0)
  .GroupBy(f => f.Path.Parent.ToString())
  .ToArray();
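// The ts filter above uses ordinal string comparison, so it only works when timestamps
// sort lexicographically (see the JsonlStore getTs note). Sketch, where the watermark
// value is an assumption: only files landed after the watermark are returned, grouped
// by partition directory.
var newByDir = await ToOptimiseByDir(store, "db/videos", ts: "2020-06-01_00-00-00");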
public static async Task<IReadOnlyCollection<StoreFileMd>> Files(this ISimpleFileStore store, StringPath path, bool allDirectories = false) {
  var list = (await store.List(path, allDirectories).SelectManyList())
    .Where(p => !p.Path.Name.StartsWith("_") && p.Path.Name.EndsWith(Extension));
  return list.Select(StoreFileMd.FromFileItem).ToList();
}
/// <summary>Process new files in land into stage. Note on the two modes:
/// - LandAndStage: immutable operation; load data downstream from the stage directory.
/// - Default: mutates the files in the same directory; downstream operations should not run while processing, and must fully reload after this completes.</summary>
/// <returns>stats about the optimisation, and all new files (optimised or not) based on the timestamp</returns>
public static async Task<(long optimiseIn, long optimiseOut, StoreFileMd[] files)> Optimise(this ISimpleFileStore store, OptimiseCfg cfg,
  StringPath rootPath, string ts = null, ILogger log = null) {
  // all partition landing files (will group using directories)
  var sw = Stopwatch.StartNew();
  log?.Debug("Optimise {Path} - reading current files", rootPath);
  var (byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();
  log?.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
    rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());
  var optimiseRes = await byDir.BlockFunc(p => Optimise(store, p.Key, p, cfg, log), cfg.Parallel);
  var res = (
    optimiseIn: optimiseRes.Sum(r => r.optimisedIn),
    optimiseOut: optimiseRes.Sum(r => r.optimisedOut),
    files: byDir.SelectMany(df => df).ToArray());
  log?.Information("Optimise {Path} - optimised {FilesIn} into {FilesOut} files in {Duration}",
    rootPath, res.optimiseIn, res.optimiseOut, sw.Elapsed.HumanizeShort());
  return res;
}
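// End-to-end sketch (hypothetical: the OptimiseCfg initialiser, paths and watermark are
// assumptions): optimise everything landed since the last run, then hand the full file
// list (optimised or not) to a downstream load such as CopyInto above.
var cfg = new OptimiseCfg { TargetBytes = 200 * 1024 * 1024, Parallel = 4 };
var (filesIn, filesOut, files) = await store.Optimise(cfg, "db/videos", ts: lastRunTs, log);
log.Information("optimised {In} files into {Out}; {Total} new files to load", filesIn, filesOut, files.Length);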