/// <summary>
/// Localise every x.html file whose WebPage row is marked for localisation
/// (WebPage.LocaliseEnum.ToLocalise; the original note describes this as WebPages.Localise=2).
/// </summary>
/// <param name="repo">
/// IRepository to perform database work (fetching each batch and saving the updated Localise states)
/// </param>
/// <param name="batchSize">
/// number of files in request from db
/// </param>
/// <param name="localise">
/// object that actually performs the localise (find+alter each link)
/// </param>
/// <param name="getMissing">
/// passed unchanged to <c>localise.Translate</c> for every page (defaults to false)
/// </param>
/// <returns>
/// Task. Although activity is heavily CPU-bound and HAP methods are all sync, there is some database I/O conducted async
/// </returns>
/// <remarks>
/// 1. batchSize is set by caller [from App.config]
/// 2. a page whose Translate call throws is logged to the console and marked Ignore; processing continues with the next page
/// 3. NOTE(review): a page skipped by the state check is left unchanged, so if the fetch keeps returning it
///    this loop would presumably never terminate - confirm the sproc filters such rows
/// </remarks>
async Task HtmlLocalise(IRepository repo, int batchSize, Localiser localise, bool getMissing = false)
{
    List<WebPage> batch = await repo.GetWebPagesToLocaliseAsync(batchSize);     // get first batch (as List<WebPage>)

    while (batch.Count > 0)
    {
        foreach (var webpage in batch)                                          // iterate through [re-]obtained List
        {
            if (webpage.Download != WebPage.DownloadEnum.Downloaded             // this page already fully downloaded ?
                || webpage.Localise != WebPage.LocaliseEnum.ToLocalise)         // and needs localisation ?
            {
                continue;                                                       // no. [sproc should not have included it]. TODO: make Debug.Assert instead
            }

            var htmlFile = webpage.Filespec;
            Console.WriteLine($"<<<{webpage.Url}\t~~>\t{htmlFile}>>>");

            try
            {
                // [async because of Downloader] but complete current page before starting the next
                var changedLinks = await localise.Translate(webpage, MaxLinks, getMissing);
                webpage.Localise = (changedLinks)
                    ? WebPage.LocaliseEnum.Localised                            // show Localise success
                    : WebPage.LocaliseEnum.Ignore;                              // pretend it wasn't wanted anyway
            }
            catch (Exception excp)                                              // either explicit from FetchFileAsync or HTTP timeout [TODO: Polly retries]
            {
                Console.WriteLine($"HtmlLocalise EXCEPTION\t{excp.Message}");   // see Filespec like '~%'
                webpage.Localise = WebPage.LocaliseEnum.Ignore;                 // pretend it wasn't wanted anyway
            }
        }

        repo.SaveChanges();                                                     // flush to update any pending "webpage.Localise = Ignore/Localised" rows
        batch = await repo.GetWebPagesToLocaliseAsync(batchSize);               // get next batch
    }
}
static int MaxFileSize;     // largest file (in bytes) to download; Main falls back to 10 MB when "maxfilesize" is not configured

/// <summary>
/// Console entry point: builds the EF context, retry policies, repository and HTTP client,
/// runs DownloadAndParse (up to three attempts) and then HtmlLocalise, and finally waits for Enter.
/// </summary>
/// <param name="_">
/// command-line arguments (not used)
/// </param>
/// <returns>
/// Task that completes once both phases have run and a line has been read from the console
/// </returns>
/// <remarks>
/// App.config keys read here, with their fall-backs: htmldir, otherdir, backupdir, batchsize (4),
/// maxlinks (1500), maxfilesize (10 MB). The three folders are created if they do not yet exist.
/// </remarks>
static async Task Main(string[] _)
{
    dbctx = new WebModel();                         // EF context defaults to config: "name=DefaultConnection"

    // TODO: probably should configure based on App.config
    // retries on every exception [TODO: improve]; waits 2^attempt / 4 seconds, i.e. 0.5, 1, 2, 4, 8 second retries
    IAsyncPolicy adoRetryPolicy = Policy
        .Handle<Exception>(ex => true)
        .WaitAndRetryAsync(5, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 4));

    // bulk repository is used here (a plain Repository(dbctx) was the earlier alternative)
    IRepository repo = new BulkRepository(dbctx, adoRetryPolicy);
    MimeCollection.Load(await repo.GetContentTypeToExtnsAsync());

    // working folders: each comes from App.config or falls back to a default under htmldir
    htmldir = ConfigurationManager.AppSettings["htmldir"] ?? @"C:\Ligonier\webcache";
    EnsureDirectory(htmldir);

    var otherFolder = ConfigurationManager.AppSettings["otherdir"]
        ?? (htmldir + Path.DirectorySeparatorChar + OTHFOLDER);
    EnsureDirectory(otherFolder);

    backupdir = ConfigurationManager.AppSettings["backupdir"]
        ?? (htmldir + Path.DirectorySeparatorChar + BACKUPFOLDER);
    EnsureDirectory(backupdir);

    // numeric settings, each with a default for a missing or unparsable value
    var batchSize = IntSetting("batchsize", 4);
    MaxLinks = IntSetting("maxlinks", 1500);
    MaxFileSize = IntSetting("maxfilesize", 10_000_000);    // 10 MB

    var retryableStatuses = new[]
    {
        HttpStatusCode.Ambiguous,               // 300
        HttpStatusCode.Conflict,                // 409
        HttpStatusCode.InternalServerError,     // 500
        HttpStatusCode.NotImplemented,          // 501
        HttpStatusCode.BadGateway,              // 502
        HttpStatusCode.ServiceUnavailable,      // 503
        HttpStatusCode.GatewayTimeout           // 504
    };

    // TODO: probably should configure based on App.config
    // wait would be 2^attempt / 2 seconds, i.e. 1, 2, 4 seconds
    // NOTE(review): the retry count passed is 0, so presumably no retry is ever made - confirm whether 3 was intended
    IAsyncPolicy<HttpResponseMessage> httpRetryPolicy = Policy
        .HandleResult<HttpResponseMessage>(rsp => retryableStatuses.Contains(rsp.StatusCode))
        .WaitAndRetryAsync(0, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 2));

#pragma warning disable GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement
    //TODO: plug-in Polly as MessageProcessingHandler / whatever !
    var handler = new HttpClientHandler
    {
        AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
        AllowAutoRedirect = true
    };
    var httpClient = new HttpClient(handler) { Timeout = TimeSpan.FromSeconds(20) };
#pragma warning restore GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement

    var program = new Program();
    var attemptsLeft = 2;                           // i.e. one initial attempt plus up to two re-runs
    Downloader downloader;
    while (true)
    {
        HParser = new HapParser(MaxLinks);
        downloader = new Downloader(repo, httpClient, httpRetryPolicy, HParser, htmldir, otherFolder, backupdir, MaxFileSize);

        var succeeded = await program.DownloadAndParse(repo, batchSize, downloader);
        if (succeeded)
        {
            break;
        }

        // failure may be due to tainted EF context so have to reset all these
        dbctx = new WebModel();                     // EF context defaults to config: "name=DefaultConnection"
        repo = new BulkRepository(dbctx, adoRetryPolicy);

        attemptsLeft--;
        if (attemptsLeft < 0)
        {
            break;                                  // out of attempts: carry on to the localise phase regardless
        }
    }
    Console.WriteLine("*** DownloadAndParse FINISHED ***");

    // NOTE(review): after a failed final attempt, downloader was built with the previous repo
    // while HtmlLocalise receives the replacement - confirm this is intended
    var localise = new Localiser(HParser, htmldir, backupdir, downloader);
    await program.HtmlLocalise(repo, batchSize, localise, getMissing: true);
    Console.WriteLine("*** HtmlLocalise FINISHED ***");

#if DEBUG
    foreach (var extn in MimeCollection.MissingExtns.OrderBy(e => e))
    {
        Console.WriteLine($"missing extn\t{extn}");
    }
#endif

    Console.ReadLine();                             // keep the console window open until Enter is pressed

    // creates the folder only when it is not already present
    void EnsureDirectory(string path)
    {
        if (!Directory.Exists(path))
        {
            Directory.CreateDirectory(path);
        }
    }

    // reads an integer appSetting, yielding fallback when the value is absent or not a valid int
    int IntSetting(string key, int fallback)
    {
        return int.TryParse(ConfigurationManager.AppSettings[key], out var parsed) ? parsed : fallback;
    }
}