Example #1
 public WebScraper(ProxyCfg proxy, YtCollectCfg collectCfg, ISimpleFileStore logStore)
 {
     Proxy      = proxy;
     CollectCfg = collectCfg;
     LogStore   = logStore;
     Clients    = new ResourceCycle<HttpClient, ProxyConnectionCfg>(proxy.DirectAndProxies(), p => Task.FromResult(CreateHttpClient(p)));
 }
Example #2
 public YtResults(SnowflakeConnectionProvider sf, ResultsCfg resCfg, ISimpleFileStore store, UserScrapeCfg userScrapeCfg)
 {
     Sf            = sf;
     ResCfg        = resCfg;
     Store         = store;
     UserScrapeCfg = userScrapeCfg;
 }
Example #3
        static async Task<StringPath> JoinFiles(ISimpleFileStore store, IReadOnlyCollection<StoreFileMd> toOptimise, StringPath destPath, int parallel,
                                                 ILogger log)
        {
            var optimisedFile = FilePath(destPath, toOptimise.Last().Ts);

            using (var joinedStream = new MemoryStream()) {
                using (var zipWriter = new GZipStream(joinedStream, CompressionLevel.Optimal, true)) {
                    var inStreams = await toOptimise.BlockFunc(async s => {
                        var inStream = await store.Load(s.Path, log).WithDuration();
                        log.Debug("Optimise {Path} - loaded file {SourceFile} to be optimised in {Duration}",
                                  destPath, s.Path, inStream.Duration.HumanizeShort());
                        return inStream.Result;
                    }, parallel);

                    foreach (var s in inStreams)
                    {
                        using var zr = new GZipStream(s, CompressionMode.Decompress, false);
                        await zr.CopyToAsync(zipWriter);
                    }
                }
                joinedStream.Seek(0, SeekOrigin.Begin);
                await store.Save(optimisedFile, joinedStream);
            }

            // When optimising in place, a failure from here on leaves the store dirty: cloud storage has no transaction
            // capability, so downstream processes must handle duplicates. Delete the successfully staged files from land.
            // Incremental loading using Ts would work without the delete, but it is more efficient to delete already-processed land files.
            await toOptimise.BlockAction(f => store.Delete(f.Path), parallel)
                .WithWrappedException(e => "Failed to delete optimised files. Duplicate records need to be handled downstream");

            log.Debug("Optimise {Path} - deleted {Files} that were optimised into {OptimisedFile}",
                      destPath, toOptimise.Count, optimisedFile);

            return optimisedFile;
        }
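JoinFiles above works by decompressing each source file and re-compressing the concatenated content as a single gzip stream. A minimal standalone sketch of that technique (hypothetical helper names, no ISimpleFileStore involved):

    using System.IO;
    using System.IO.Compression;
    using System.Text;

    static byte[] Gzip(string text)
    {
        using var ms = new MemoryStream();
        using (var gz = new GZipStream(ms, CompressionLevel.Optimal, leaveOpen: true))
        {
            var bytes = Encoding.UTF8.GetBytes(text);
            gz.Write(bytes, 0, bytes.Length);
        }
        return ms.ToArray();
    }

    static byte[] JoinGzip(params byte[][] sources)
    {
        using var joined = new MemoryStream();
        using (var writer = new GZipStream(joined, CompressionLevel.Optimal, leaveOpen: true))
            foreach (var s in sources)
            {
                // decompress each source and append its content to the single output stream
                using var reader = new GZipStream(new MemoryStream(s), CompressionMode.Decompress);
                reader.CopyTo(writer);
            }
        return joined.ToArray();
    }

    // JoinGzip(Gzip("a\n"), Gzip("b\n")) decompresses to "a\nb\n"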
Example #4
 public YtCollect(YtStore store, ISimpleFileStore simpleFileStore, AppCfg cfg, ILogger log)
 {
     Yt    = store;
     Store = simpleFileStore;
     Cfg   = cfg;
     Log   = log;
 }
Example #5
 public YtWeb(ProxyCfg proxy, YtCollectCfg collectCfg, ISimpleFileStore logStore)
 {
     Proxy      = proxy;
     CollectCfg = collectCfg;
     LogStore   = logStore;
     Clients    = new(proxy.DirectAndProxies(), p => Task.FromResult(p.CreateHttpClient()));
 }
Example #6
        /// <summary>Serializes item into the object store</summary>
        /// <param name="path">The path to the object (no extensions)</param>
        public static async Task Set<T>(this ISimpleFileStore store, StringPath path, T item, bool zip = true, ILogger log = default,
                                         JsonSerializerSettings jCfg = default)
        {
            await using var memStream = new MemoryStream();

            var serializer = jCfg != null ? JsonSerializer.Create(jCfg) : JsonExtensions.DefaultSerializer;

            if (zip)
            {
                await using (var zipWriter = new GZipStream(memStream, CompressionLevel.Optimal, true)) {
                    await using var tw = new StreamWriter(zipWriter, Encoding.UTF8);
                    serializer.Serialize(new JsonTextWriter(tw), item);
                }
            }
            else
            {
                await using (var tw = new StreamWriter(memStream, Encoding.UTF8, leaveOpen: true))
                    serializer.Serialize(new JsonTextWriter(tw), item);
            }

            var fullPath = path.AddJsonExtention(zip);

            memStream.Seek(0, SeekOrigin.Begin);

            await store.Save(fullPath, memStream, log);
        }
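A usage sketch for Set (hypothetical path and record type; string-to-StringPath conversion is assumed, consistent with the string paths passed elsewhere in these examples):

    // serialized with gzip by default; AddJsonExtention presumably yields a .json.gz path when zip is true
    await store.Set("userscrape/review/UC123", new ChannelReview("UC123", "reviewed"), zip: true, log: log);

    record ChannelReview(string ChannelId, string Notes);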
Example #7
 public AppendCollectionStore(ISimpleFileStore store, StringPath path, Func<T, string> getTs, string version, ILogger log)
 {
     Store   = store;
     Path    = path;
     GetTs   = getTs;
     Version = version;
     Log     = log;
 }
Example #8
 public UserScrape(AzureContainers containers, RootCfg rootCfg, UserScrapeCfg cfg, SemVersion version, YtStore store)
 {
     Containers = containers;
     RootCfg    = rootCfg;
     Cfg        = cfg;
     Version    = version;
     Store      = store.Store;
 }
Example #9
 public FileCollection(ISimpleFileStore s3, Expression<Func<T, string>> getId, StringPath path, CollectionCacheType cacheType = CollectionCacheType.Memory, FPath localCacheDir = null)
 {
     Store         = s3;
     GetId         = getId.Compile();
     Path          = path;
     CacheType     = cacheType;
     LocalCacheDir = localCacheDir;
     Cache         = new KeyedCollection<string, T>(getId, theadSafe: true);
 }
Example #10
    /// <summary>Process new files in land into stage. Note on the different modes: - LandAndStage: Immutable operation, load
    ///   data downstream from the stage directory - Default: Mutates the files in the same directory. Downstream operations:
    ///   don't run while processing, and fully reload after this is complete</summary>
    /// <returns>stats about the optimisation, and all new files (optimised or not) based on the timestamp</returns>
    public static async Task<IReadOnlyCollection<OptimiseBatch>> OptimisePlan(this ISimpleFileStore store, OptimiseCfg cfg, SPath rootPath,
                                                                                 string ts = null, ILogger log = null)
    {
        log?.Debug("Optimise {Path} - reading current files", rootPath);
        var (byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();

        log?.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
                   rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());
        return byDir.SelectMany(p => OptimisePlan(p, p.Key, cfg)).ToArray();
    }
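A sketch of calling the planning step on its own (hypothetical root path and variables; OptimiseBatch exposing SourceFileCount is inferred from Example #17 below):

    var batches = await store.OptimisePlan(cfg, "db2/stage/videos", ts: lastRunTs, log);
    log?.Information("planned {Batches} batches covering {Files} files",
                     batches.Count, batches.Sum(b => b.SourceFileCount));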
Example #11
 public ChromeScraper(ProxyCfg proxyCfg, YtCollectCfg collectCfg, ISimpleFileStore logStore)
 {
     ProxyCfg       = proxyCfg;
     CollectCfg     = collectCfg;
     LogStore       = logStore;
     ExecutablePath = new(async () => {
         var revisionInfo = await new BrowserFetcher().DownloadAsync(802497); //revision needs to be recent to be able to use optional chaining
         return revisionInfo.ExecutablePath;
     });
 }
Example #12
        public YtStore(YtClient reader, ISimpleFileStore store)
        {
            Yt    = reader;
            Store = store;

            Videos   = new FileCollection<VideoStored>(Store, v => v.VideoId, "Videos", Yt.Cfg.CacheType, CacheDataDir);
            Channels = new FileCollection<ChannelStored>(Store, v => v.ChannelId, "Channels", Yt.Cfg.CacheType, CacheDataDir);
            RecommendedVideosCollection = new FileCollection<RecommendedVideoStored>(Store, v => v.VideoId, "RecommendedVideos", Yt.Cfg.CacheType, CacheDataDir);
            ChannelVideosCollection     = new FileCollection<ChannelVideosStored>(Store, c => c.ChannelId, "ChannelVideos", Yt.Cfg.CacheType, CacheDataDir);
        }
Example #13
 /// <summary></summary>
 /// <param name="getTs">A function to get a timestamp for this file. This must always be greater for new records using an
 ///   invariant string comparer</param>
 public JsonlStore(ISimpleFileStore store, StringPath path, Func<T, string> getTs,
                   ILogger log, string version = "", Func<T, string> getPartition = null, int parallel = 8)
 {
     Store        = store;
     Path         = path;
     GetTs        = getTs;
     Log          = log;
     GetPartition = getPartition;
     Parallel     = parallel;
     Version      = version;
 }
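The getTs requirement is satisfied by any fixed-width, lexically ordered timestamp; for UTC times the ISO-8601 round-trip format works. A conforming store, sketched with a hypothetical row type:

    // "o" formats UTC times as fixed-width yyyy-MM-ddTHH:mm:ss.fffffffZ, which sorts ordinally
    var videos = new JsonlStore<VideoRow>(store, "videos", v => v.Updated.ToString("o"), log);

    record VideoRow(string VideoId, DateTime Updated);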
Example #14
        public static async Task<T> GetOrCreate<T>(this ISimpleFileStore store, StringPath path, Func<T> create = null) where T : class, new()
        {
            var o = await store.Get<T>(path);

            if (o == null)
            {
                o = create == null ? new T() : create();
                await store.Set(path, o);
            }
            return o;
        }
Example #15
        public static async Task<T> Get<T>(this ISimpleFileStore store, StringPath path, bool zip = true, ILogger log = null)
        {
            using var stream = await store.Load(path.AddJsonExtention(zip), log);

            if (!zip)
            {
                return stream.ToObject<T>();
            }
            await using var zr = new GZipStream(stream, CompressionMode.Decompress, true);
            return zr.ToObject<T>();
        }
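Get is the read side of Set from Example #6; a round-trip sketch (hypothetical type and path):

    await store.Set("cfg/scraper", new ScraperCfg(3));
    var cfg = await store.Get<ScraperCfg>("cfg/scraper"); // zip defaults to true on both sides

    record ScraperCfg(int Retries);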
Example #16
    public static async IAsyncEnumerable<IReadOnlyCollection<StoreFileMd>> JsonStoreFiles(this ISimpleFileStore store, SPath path, bool allDirectories = false)
    {
        var allFiles = await store.List(path, allDirectories).ToArrayAsync();

        foreach (var b in allFiles)
        {
            yield return b
                .Where(p => !p.Path.Name.StartsWith("_") && p.Path.Name.EndsWith(Extension))
                .Select(StoreFileMd.FromFileItem).ToArray();
        }
    }
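Because JsonStoreFiles yields listing batches lazily, consumption is an await foreach (sketch, assuming the method is accessible from the call site and that string converts to SPath):

    await foreach (var batch in store.JsonStoreFiles("db2/stage/videos", allDirectories: true))
        foreach (var f in batch)
            log?.Debug("found {Path} ({Bytes} bytes)", f.Path, f.Bytes);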
Example #17
    /// <summary>Join ts (timestamp) contiguous files together until they are > MaxBytes</summary>
    public static async Task<(long optimisedIn, long optimisedOut)> Optimise(this ISimpleFileStore store, SPath destPath, IEnumerable<StoreFileMd> files,
                                                                              OptimiseCfg cfg, ILogger log)
    {
        var plan = OptimisePlan(files, destPath, cfg);

        var (filesIn, filesOut) = (plan.Sum(p => p.SourceFileCount), plan.Count);
        if (plan.None())
        {
            log.Debug("Optimise {Path} - already optimal", destPath);
        }
        else
        {
            log?.Debug("Optimise {Path} - starting optimization of {FilesIn} to {FilesOut} files", destPath, filesIn, filesOut);
            await Optimise(store, cfg, plan, log);
        }
        return (filesIn, filesOut);
    }
Beispiel #18
0
    /// <summary>Copies the given files into staging tables. NOTE: filesToCopy is relative to the store path. The resulting
    ///   LoadHistoryRow.File paths are converted to be equivalent</summary>
    public static async Task<LoadHistoryRow[]> CopyInto(this StageTableCfg t, ISimpleFileStore store, string sfStage, ILoggedConnection<IDbConnection> db,
                                                         [CanBeNull] SPath[] filesToCopy, ILogger log)
    {
        if (filesToCopy?.Length > 1000)
        {
            throw new("copying 1k+ files not implemented");
        }

        var stagePath = new string[] { sfStage, store.BasePathSansContainer().Dot(c => c.IsEmpty ? null : c) }.Concat(t.Dir.Tokens).NotNull().Join("/");

        var cols = await t.TableCols(db); // support subsets of columns (e.g. no loaded or updated columns)

        var selectCols = cols.Join(",", c => c.column_name.ToLowerInvariant() switch {
            "v" => "$1 v",
            "loaded" => "sysdate() loaded",
            "updated" => "v:Updated::timestamp_ntz updated",
            _ => throw new($"stage column {c.column_name} not supported")
        });
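Given a stage table with the columns v, loaded and updated, the selectCols expression above evaluates to "$1 v,sysdate() loaded,v:Updated::timestamp_ntz updated": the raw variant column plus load metadata derived from it, with any unrecognised staged column failing fast.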
Example #19
    public static async Task Optimise(this ISimpleFileStore store, OptimiseCfg cfg, SPath rootPath, string ts = null, ILogger log = null)
    {
        // all partition landing files (will group using directories)
        var sw = Stopwatch.StartNew();

        log?.Debug("Optimise {Path} - reading current files", rootPath);
        var (byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();

        log?.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
                   rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());

        var optimiseRes = await byDir.BlockDo(p => Optimise(store, p.Key, p, cfg, log), cfg.ParallelFiles).ToArrayAsync();

        var optimiseIn  = optimiseRes.Sum(r => r.optimisedIn);
        var optimiseOut = optimiseRes.Sum(r => r.optimisedOut);

        log?.Information("Optimise {Path} - optimised {FilesIn} into {FilesOut} files in {Duration}",
                         rootPath, optimiseIn, optimiseOut, sw.Elapsed.HumanizeShort());
    }
        /// <summary>Join ts (timestamp) contiguous files together until they are > MaxBytes</summary>
        static async Task<(long optimisedIn, long optimisedOut)> Optimise(ISimpleFileStore store, StringPath partitionPath, IEnumerable<StoreFileMd> files,
                                                                           OptimiseCfg cfg, ILogger log)
        {
            var toProcess = files.OrderBy(f => f.Ts).ToQueue();

            log.Debug("Optimise {Path} - Processing {Files} files in partition {Partition}",
                      partitionPath, toProcess.Count, partitionPath);

            var currentBatch = new List<StoreFileMd>();
            var optimisePlan = new List<StoreFileMd[]>();

            if (toProcess.None())
            {
                return (0, 0);
            }

            while (toProcess.Any())
            {
                var file = toProcess.Dequeue();

                var (nextBytes, nextIsFurther) = BatchSize(file);
                if (nextBytes > cfg.TargetBytes && nextIsFurther) // if adding this file will make it too big, optimise the current batch as is
                {
                    PlanCurrentBatch();
                }

                currentBatch.Add(file);
                if (toProcess.None() || currentBatch.Sum(f => f.Bytes) > cfg.TargetBytes) // join if big enough, or this is the last batch
                {
                    PlanCurrentBatch();
                }
            }

            (long nextBytes, bool nextIsFurther) BatchSize(StoreFileMd file)
            {
                var bytes         = currentBatch.Sum(f => f.Bytes);
                var nextBytes     = bytes + file.Bytes;
                var nextIsFurther = nextBytes - cfg.TargetBytes > cfg.TargetBytes - bytes;

                return (nextBytes, nextIsFurther);
            }

            void PlanCurrentBatch()
            {
                if (currentBatch.Count > 1) // only plan a batch if there is more than one file in it
                {
                    optimisePlan.Add(currentBatch.ToArray());
                }
                currentBatch.Clear();
            }

            if (optimisePlan.None())
            {
                log.Debug("Optimise {Path} - already optimal", partitionPath);
            }
            else
            {
                log.Debug("Optimise {Path} - staring to execute optimisation plan", partitionPath);
                await optimisePlan.Select((b, i) => (b, i)).BlockAction(async b => {
                    var (batch, i)  = b;
                    var optimiseRes = await JoinFiles(store, batch, partitionPath, cfg.Parallel, log).WithDuration();
                    log.Debug("Optimise {Path} - optimised file {OptimisedFile} from {FilesIn} in {Duration}. batch {Batch}/{Total}",
                              partitionPath, optimiseRes.Result, batch.Length, optimiseRes.Duration.HumanizeShort(), i, optimisePlan.Count);
                }, cfg.Parallel);
            }
            return (optimisePlan.Sum(p => p.Length), optimisePlan.Count);
        }
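To make the batching rule concrete: with TargetBytes of 100 MB and three 60 MB files, the first file starts a batch; adding the second reaches 120 MB, overshooting the target by 20 MB, while stopping at 60 MB would undershoot by 40 MB, so nextIsFurther is false, the file joins the batch, and the 120 MB batch is planned. The last 60 MB file ends up alone, and PlanCurrentBatch skips single-file batches, so it is left as-is.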
 public static SPath BasePathSansContainer(this ISimpleFileStore store) => new(store.BasePath.Tokens.Skip(1));
Example #22
 public BlobIndex(ISimpleFileStore store) => Store = store;
Example #23
 public KeyedCollectionStore(ISimpleFileStore store, Func<T, string> getId, SPath path)
 {
     Store = store;
     GetId = getId;
     Path  = path;
 }
Example #24
 public YtStore(ISimpleFileStore store, ILogger log)
 {
     Store = store;
     Log   = log;
 }
Example #25
 public StoreUpgrader(AppCfg cfg, ISimpleFileStore store, ILogger log)
 {
     Cfg   = cfg;
     Store = store;
     Log   = log;
 }
Example #26
 public YtResults(SnowflakeCfg snowflakeCfg, ResultsCfg resCfg, ISimpleFileStore store, ILogger log) {
   SnowflakeCfg = snowflakeCfg;
   ResCfg = resCfg;
   Store = store;
   Log = log;
 }
 public YtConvertWatchTimeFiles(YtStores stores)
 {
     Store = stores.Store(DataStoreType.Root);
 }
 static async Task<IGrouping<string, StoreFileMd>[]> ToOptimiseByDir(ISimpleFileStore store, StringPath landPath, string ts) =>
     (await store.Files(landPath, true))
     .Where(f => ts == null || string.CompareOrdinal(f.Ts, ts) > 0)
     .GroupBy(f => f.Path.Parent.ToString())
     .ToArray();
        public static async Task<IReadOnlyCollection<StoreFileMd>> Files(this ISimpleFileStore store, StringPath path, bool allDirectories = false)
        {
            var list = (await store.List(path, allDirectories).SelectManyList()).Where(p => !p.Path.Name.StartsWith("_") && p.Path.Name.EndsWith(Extension));

            return list.Select(StoreFileMd.FromFileItem).ToList();
        }
        /// <summary>Process new files in land into stage. Note on the different modes: - LandAndStage: Immutable operation, load
        ///   data downstream from the stage directory - Default: Mutates the files in the same directory. Downstream operations:
        ///   don't run while processing, and fully reload after this is complete</summary>
        /// <returns>stats about the optimisation, and all new files (optimised or not) based on the timestamp</returns>
        public static async Task<(long optimiseIn, long optimiseOut, StoreFileMd[] files)> Optimise(this ISimpleFileStore store, OptimiseCfg cfg, StringPath rootPath, string ts = null, ILogger log = null) // all partition landing files (will group using directories)
        {
            var sw = Stopwatch.StartNew();

            log.Debug("Optimise {Path} - reading current files", rootPath);
            var(byDir, duration) = await ToOptimiseByDir(store, rootPath, ts).WithDuration();

            log.Debug("Optimise {Path} - read {Files} files across {Partitions} partitions in {Duration}",
                      rootPath, byDir.Sum(p => p.Count()), byDir.Length, duration.HumanizeShort());

            var optimiseRes = await byDir.BlockFunc(p => Optimise(store, p.Key, p, cfg, log), cfg.Parallel);

            var res = (
                optimiseIn: optimiseRes.Sum(r => r.optimisedIn),
                optimiseOut: optimiseRes.Sum(r => r.optimisedOut),
                files: byDir.SelectMany(df => df).ToArray());

            log?.Information("Optimise {Path} - optimised {FilesIn} into {FilesOut} files in {Duration}",
                            rootPath, res.optimiseIn, res.optimiseOut, sw.Elapsed.HumanizeShort());

            return res;
        }
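An end-to-end sketch of this entry point (hypothetical path, timestamp and variables; cfg is an OptimiseCfg as used above):

    var (optimisedIn, optimisedOut, newFiles) = await store.Optimise(cfg, "db2/stage/videos", ts: lastRunTs, log);
    log?.Information("optimised {In} into {Out} files; {New} new files to load downstream",
                     optimisedIn, optimisedOut, newFiles.Length);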