Example #1
0
        /// <summary>
        ///     localise every x.html file whose WebPage row is fully downloaded and marked
        ///     WebPage.LocaliseEnum.ToLocalise, working through the database in batches
        /// </summary>
        /// <param name="repo">
        ///     IRepository to perform database work (fetch each batch, save the updated Localise states)
        /// </param>
        /// <param name="batchSize">
        ///     number of files in request from db
        /// </param>
        /// <param name="localise">
        ///     object that actually performs the localise (find+alter each link)
        /// </param>
        /// <param name="getMissing">
        ///     passed unchanged to <c>localise.Translate</c>; presumably allows it to fetch linked files that are
        ///     not yet downloaded (TODO: confirm against Localiser)
        /// </param>
        /// <returns>
        ///     Task although activity is heavily CPU-bound and HAP methods all sync, there is some database I/O conducted async
        /// </returns>
        /// <remarks>
        /// 1.  batchSize is set by caller [from App.config]
        /// 2.  a page becomes Localised when Translate reports changed links, otherwise Ignore (also after an exception)
        /// 3.  the loop ends when a fetch returns no rows, or when a whole batch was skipped (no row could change,
        ///     so fetching again would be expected to return the same rows forever)
        /// </remarks>
        async Task HtmlLocalise(IRepository repo, int batchSize, Localiser localise, bool getMissing = false)
        {
            List<WebPage> batch = await repo.GetWebPagesToLocaliseAsync(batchSize);    // get first batch (as List<WebPage>)

            while (batch != null && batch.Count > 0)
            {
                var processed = 0;                                              // rows in this batch that were actually translated/marked

                foreach (var webpage in batch)                                  // iterate through [re-]obtained List
                {
                    if (webpage.Download != WebPage.DownloadEnum.Downloaded ||  // this page already fully downloaded ?
                        webpage.Localise != WebPage.LocaliseEnum.ToLocalise)    //  and needs localisation ?
                    {
                        continue;                                               // no. [sproc should not have included it]/ TODO: make Debug.Assert instead
                    }
                    processed++;

                    Console.WriteLine($"<<<{webpage.Url}\t~~>\t{webpage.Filespec}>>>");
                    try
                    {
                        // [async because of Downloader] but complete current page before starting the next
                        var changedLinks = await localise.Translate(webpage, MaxLinks, getMissing);

                        webpage.Localise = changedLinks
                            ? WebPage.LocaliseEnum.Localised                          // show Localise success
                            : WebPage.LocaliseEnum.Ignore;                            // pretend it wasn't wanted anyway
                    }
                    catch (Exception excp)                                            // either explicit from FetchFileAsync or HTTP timeout [TODO: Polly retries]
                    {
                        Console.WriteLine($"HtmlLocalise EXCEPTION\t{excp.Message}"); // see Filespec like '~%'
                        webpage.Localise = WebPage.LocaliseEnum.Ignore;               // pretend it wasn't wanted anyway
                    }
                }

                if (processed == 0)
                {
                    // every row was skipped, so no row state changed; asking the database again would be
                    // expected to hand back the same rows and spin forever
                    Console.WriteLine($"HtmlLocalise STOPPED\tbatch of {batch.Count} contained no page eligible for localisation");
                    break;
                }

                repo.SaveChanges();                                             // flush to update any pending "webpage.Localise = Ignore/Localised" rows
                batch = await repo.GetWebPagesToLocaliseAsync(batchSize);       // get next batch
            }
        }
Example #2
0
        static int MaxFileSize;                 // don't download files bigger than this; Main loads it from App.config "maxfilesize", defaulting to 10_000_000 (10 MB)

        /// <summary>
        ///     Program entry point: builds the EF context, repository, retry policies and HttpClient, reads the
        ///     App.config settings, runs DownloadAndParse (up to 3 attempts) and then HtmlLocalise.
        /// </summary>
        /// <param name="_">
        ///     command-line arguments (unused)
        /// </param>
        /// <returns>
        ///     Task that completes after both phases have run and the user has pressed Enter
        /// </returns>
        static async Task Main(string[] _)
        {
            dbctx = new WebModel();                                                         // EF context defaults to config: "name=DefaultConnection"

            // TODO: probably should configure based on App.config
            IAsyncPolicy adoRetryPolicy =
                Policy.Handle<Exception>(ex => true)                                        // retry every exception! TODO: improve
                      .WaitAndRetryAsync(5, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 4)); // i.e. 0.5, 1, 2, 4, 8 second retries

            //IRepository repo = new Repository(dbctx);
            IRepository repo = new BulkRepository(dbctx, adoRetryPolicy);

            MimeCollection.Load(await repo.GetContentTypeToExtnsAsync());

            // working folders: CreateDirectory does nothing when the folder already exists, so no Exists check is needed
            htmldir = ConfigurationManager.AppSettings["htmldir"] ?? @"C:\Ligonier\webcache";
            Directory.CreateDirectory(htmldir);

            var otherdir = ConfigurationManager.AppSettings["otherdir"] ?? (htmldir + Path.DirectorySeparatorChar + OTHFOLDER);
            Directory.CreateDirectory(otherdir);

            backupdir = ConfigurationManager.AppSettings["backupdir"] ?? (htmldir + Path.DirectorySeparatorChar + BACKUPFOLDER);
            Directory.CreateDirectory(backupdir);

            // numeric settings: fall back to a built-in default when the key is missing or not an integer
            if (!int.TryParse(ConfigurationManager.AppSettings["batchsize"], out var batchSize))
            {
                batchSize = 4;
            }
            if (!int.TryParse(ConfigurationManager.AppSettings["maxlinks"], out MaxLinks))
            {
                MaxLinks = 1500;
            }
            if (!int.TryParse(ConfigurationManager.AppSettings["maxfilesize"], out MaxFileSize))
            {
                MaxFileSize = 10_000_000;               // 10 MB
            }

            // HTTP status codes for which a request is considered worth retrying
            var validRetry = new HttpStatusCode[] {
                HttpStatusCode.Ambiguous,               // 300
                HttpStatusCode.Conflict,                // 409
                HttpStatusCode.InternalServerError,     // 500
                HttpStatusCode.NotImplemented,          // 501
                HttpStatusCode.BadGateway,              // 502
                HttpStatusCode.ServiceUnavailable,      // 503
                HttpStatusCode.GatewayTimeout           // 504
            };

            // TODO: probably should configure based on App.config
            // NOTE(review): the retry count is 0, so this policy currently performs no retries at all; the
            // back-off formula would give 1, 2, 4 seconds if the count were raised to 3 - confirm the intent
            IAsyncPolicy<HttpResponseMessage> httpRetryPolicy =
                Policy.HandleResult<HttpResponseMessage>(rsp => validRetry.Contains(rsp.StatusCode))
                      .WaitAndRetryAsync(0, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 2));

#pragma warning disable GCop302                                                                                                                              // Since '{0}' implements IDisposable, wrap it in a using() statement
            //TODO: plug-in Polly as MessageProcessingHandler / whatever !
            // single HttpClient shared by every download for the lifetime of the process
            var client = new HttpClient(
                new HttpClientHandler
                {
                    AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
                    AllowAutoRedirect = true
                })
            {
                Timeout = new TimeSpan(0, 0, 20)
            };
#pragma warning restore GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement

            var program    = new Program();
            var retrycount = 2;                         // i.e. at most 3 attempts in total
            var downloaded = false;
            Downloader download;
            do
            {
                HParser    = new HapParser(MaxLinks);
                download   = new Downloader(repo, client, httpRetryPolicy, HParser, htmldir, otherdir, backupdir, MaxFileSize);
                downloaded = await program.DownloadAndParse(repo, batchSize, download);

                if (!downloaded)                        // failure may be due to tainted EF context so have to reset all these
                {
                    dbctx = new WebModel();             // EF context defaults to config: "name=DefaultConnection"
                    repo  = new BulkRepository(dbctx, adoRetryPolicy);
                    retrycount--;
                }
            } while (!downloaded && retrycount >= 0);

            if (!downloaded)
            {
                // the last Downloader still wraps the repository that was just discarded; rebuild it on the fresh
                // repository so the Localiser below does not work through the possibly-tainted EF context
                download = new Downloader(repo, client, httpRetryPolicy, HParser, htmldir, otherdir, backupdir, MaxFileSize);
                Console.WriteLine("*** DownloadAndParse FAILED on every attempt ***");
            }
            Console.WriteLine("*** DownloadAndParse FINISHED ***");

            var localise = new Localiser(HParser, htmldir, backupdir, download);
            await program.HtmlLocalise(repo, batchSize, localise, getMissing: true);

            Console.WriteLine("*** HtmlLocalise FINISHED ***");

#if DEBUG
            foreach (var extn in MimeCollection.MissingExtns.OrderBy(e => e))
            {
                Console.WriteLine($"missing extn\t{extn}");
            }
#endif

            Console.ReadLine();                         // keep the console window open until Enter is pressed
        }