static int MaxFileSize; // upper size limit handed to the Downloader; "maxfilesize" app setting, default 10 MB

/// <summary>
/// Crawler entry point. Creates the EF context and repository, loads the MIME-type table,
/// makes sure the html / other / backup directories exist, reads the numeric app settings,
/// then runs <c>DownloadAndParse</c> (up to three attempts) followed by <c>HtmlLocalise</c>.
/// Finally waits for a line on the console before returning.
/// </summary>
/// <param name="_">Command-line arguments (not used).</param>
static async Task Main(string[] _)
{
    // Reads an integer app setting, falling back to the default when it is missing or not a number.
    int IntSetting(string key, int fallback)
    {
        return int.TryParse(ConfigurationManager.AppSettings[key], out var parsed) ? parsed : fallback;
    }

    // Resolves a directory from app settings (or the fallback) and creates it if necessary.
    // Directory.CreateDirectory does nothing when the directory already exists, so no Exists() pre-check is needed.
    string EnsureDirectory(string key, string fallback)
    {
        var path = ConfigurationManager.AppSettings[key] ?? fallback;
        Directory.CreateDirectory(path);
        return path;
    }

    dbctx = new WebModel();     // EF context defaults to config: "name=DefaultConnection"

    IAsyncPolicy AdoRetryPolicy =                       // TODO: probably should configure based on App.config
        Policy.Handle<Exception>(ex => true)            // retry every exception! TODO: improve
              .WaitAndRetryAsync(5, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 4)); // i.e. 0.5, 1, 2, 4, 8 second retries

    IRepository repo = new BulkRepository(dbctx, AdoRetryPolicy);
    MimeCollection.Load(await repo.GetContentTypeToExtnsAsync());

    htmldir = EnsureDirectory("htmldir", @"C:\Ligonier\webcache");
    var otherdir = EnsureDirectory("otherdir", htmldir + Path.DirectorySeparatorChar + OTHFOLDER);
    backupdir = EnsureDirectory("backupdir", htmldir + Path.DirectorySeparatorChar + BACKUPFOLDER);

    var batchSize = IntSetting("batchsize", 4);
    MaxLinks = IntSetting("maxlinks", 1500);
    MaxFileSize = IntSetting("maxfilesize", 10_000_000);    // 10 MB

    var ValidRetry = new HttpStatusCode[]
    {
        HttpStatusCode.Ambiguous,               // 300
        HttpStatusCode.Conflict,                // 409
        HttpStatusCode.InternalServerError,     // 500
        HttpStatusCode.NotImplemented,          // 501
        HttpStatusCode.BadGateway,              // 502
        HttpStatusCode.ServiceUnavailable,      // 503
        HttpStatusCode.GatewayTimeout           // 504
    };

    // NOTE(review): the retry count is 0, so this policy currently performs NO retries.
    // The back-off formula would give 1, 2, 4 seconds if the count were raised to 3 - confirm which is intended.
    const int HttpRetryCount = 0;
    IAsyncPolicy<HttpResponseMessage> HttpRetryPolicy = // TODO: probably should configure based on App.config
        Policy.HandleResult<HttpResponseMessage>(rsp => ValidRetry.Contains(rsp.StatusCode))
              .WaitAndRetryAsync(HttpRetryCount, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 2));

#pragma warning disable GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement
    //TODO: plug-in Polly as MessageProcessingHandler / whatever !
    // A single HttpClient is created here and handed to the Downloader.
    var Client = new HttpClient(
        new HttpClientHandler
        {
            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
            AllowAutoRedirect = true
        })
    {
        Timeout = TimeSpan.FromSeconds(20)
    };
#pragma warning restore GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement

    var p = new Program();

    const int MaxAttempts = 3;  // one initial run plus two retries
    var attempt = 0;
    var succeeded = false;
    Downloader download;
    do
    {
        attempt++;
        HParser = new HapParser(MaxLinks);
        download = new Downloader(repo, Client, HttpRetryPolicy, HParser, htmldir, otherdir, backupdir, MaxFileSize);
        succeeded = await p.DownloadAndParse(repo, batchSize, download);
        if (!succeeded)
        {
            // failure may be due to tainted EF context so have to reset all these
            dbctx = new WebModel();     // EF context defaults to config: "name=DefaultConnection"
            repo = new BulkRepository(dbctx, AdoRetryPolicy);
        }
    } while (!succeeded && attempt < MaxAttempts);

    if (succeeded)
    {
        Console.WriteLine("*** DownloadAndParse FINISHED ***");
    }
    else
    {
        // Previously an exhausted retry loop was reported as FINISHED; make the failure visible
        // but still carry on with localisation (best effort).
        Console.WriteLine($"*** DownloadAndParse FAILED after {attempt} attempts; continuing with HtmlLocalise ***");
    }

    var localise = new Localiser(HParser, htmldir, backupdir, download);
    await p.HtmlLocalise(repo, batchSize, localise, getMissing: true);
    Console.WriteLine("*** HtmlLocalise FINISHED ***");

#if DEBUG
    foreach (var extn in MimeCollection.MissingExtns.OrderBy(e => e))
    {
        Console.WriteLine($"missing extn\t{extn}");
    }
#endif

    Console.ReadLine();     // wait for a line of console input before returning
}
/// <summary>
/// Utility entry point. Loads the MIME-type table and all stored web pages, runs a couple of
/// ad-hoc parser / UriBuilder checks, then for every page calls <c>LookupPage2</c> twice:
/// once with the standardised URL (<c>changeUrl: true</c>) and once with the same URL under
/// the opposite http/https scheme (<c>changeUrl: false</c>).
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static async Task Main(string[] args)
{
    // Returns the URL with its http/https scheme swapped, or null when it uses neither scheme.
    // "https://" must be tested first: every https URL also starts with "http", which previously
    // turned "https://host" into "httpss://host".
    string SwapHttpScheme(string address)
    {
        var httpsPrefix = Uri.UriSchemeHttps + Uri.SchemeDelimiter;
        var httpPrefix = Uri.UriSchemeHttp + Uri.SchemeDelimiter;

        if (address.StartsWith(httpsPrefix, StringComparison.OrdinalIgnoreCase))
        {
            return Uri.UriSchemeHttp + address.Substring(Uri.UriSchemeHttps.Length);
        }

        if (address.StartsWith(httpPrefix, StringComparison.OrdinalIgnoreCase))
        {
            return Uri.UriSchemeHttps + address.Substring(Uri.UriSchemeHttp.Length);
        }

        return null;
    }

    ctx = new WebModel();
    repo = new Repository(ctx);
    MimeCollection.Load(await repo.GetContentTypeToExtnsAsync());
    allpages = ctx.WebPages.ToList();
    var p = new Program();      // NOTE(review): not referenced below - kept in case the constructor matters

    #region unused
    /*
     * var urls = new string[] {
     *     "http://www.ligonier.org", //
     *     "http://www.ligonier.org/blog",
     *     "http://www.ligonier.org/blog/category/ministry-news",
     *     "http://www.ligonier.org?", //
     *     "http://www.ligonier.org/blog?",
     *     "http://www.ligonier.org/blog/category/ministry-news?",
     *     "https://www.ligonier.org", //
     *     "https://www.ligonier.org/blog",
     *     "https://www.ligonier.org/blog/category/ministry-news",
     *     "https://www.ligonier.org?", //
     *     "https://www.ligonier.org/blog?",
     *     "https://www.ligonier.org/blog/category/ministry-news?",
     *     "https://www.ligonier.org/", //
     *     "https://www.ligonier.org/blog/", //
     *     "https://www.ligonier.org/blog/category/ministry-news/",
     *     "https://www.ligonier.org/?", //
     *     "https://www.ligonier.org/blog/?", //
     *     "https://www.ligonier.org/blog/category/ministry-news/?",
     *     "https://www.ligonier.org?abc=123", //
     *     "https://www.ligonier.org/blog?abc=123", //
     *     "https://www.ligonier.org/blog/category/ministry-news?abc=123",
     *     "https://www.ligonier.org/?abc=123", //
     *     "https://www.ligonier.org/blog/?abc=123", //
     *     "https://www.ligonier.org/blog/category/ministry-news/?abc=123",
     *     "https://www.ligonier.org?abc=123", //
     *     "https://www.ligonier.org/blog?abc=123", //
     *     "https://www.ligonier.org/blog/category/ministry-news?abc=123"
     * };
     * foreach (var url in urls)
     * {
     *     var u2 = StdUrl(url);
     * }
     */
    #endregion

    // Ad-hoc check: load a cached page into the parser and ask it for its links (result unused).
    var u = "http://www.ligonier.org/store/keyword/apologetics";
    var fs = @"C:\Ligonier\webcache\41m4uuk2.html";
    var parser = new HapParser();
    parser.LoadFromFile(u, fs);
    parser.GetLinks();

    // Ad-hoc check: report when UriBuilder round-trips the URL to a different absolute form.
    var url0 = "http://www.ligonier.org/store/keyword/apologetics";
    var url1 = new UriBuilder(url0).Uri.AbsoluteUri;
    if (url0 != url1)
    {
        Console.WriteLine($"{url0}\t->\t{url1}");
    }

    foreach (var webpage in allpages)
    {
        var url = webpage.Url;
        var url2 = StdUrl(url);
        LookupPage2(webpage, url, url2, changeUrl: true);

        // Second lookup under the opposite scheme; skipped for URLs that are neither http nor https.
        var swapped = SwapHttpScheme(url2);
        if (swapped != null)
        {
            LookupPage2(webpage, url, swapped, changeUrl: false);
        }
    }
}