/// <summary>
/// given HttpResponseMessage, invent two candidate target filespecs
/// </summary>
/// <param name="webpage">
/// current webpage being downloaded (DraftFilespec, Filespec and Url are read; DraftFilespec may be overwritten)
/// </param>
/// <param name="rsp">
/// HttpResponseMessage
/// </param>
/// <param name="extn">
/// extension without the dot (e.g. "html")
/// </param>
/// <param name="filespec2">
/// filespec supposed from basic data before the HttpRequest
/// </param>
/// <param name="filespec3">
/// filespec based on the HttpResponse content (or generated if borderline case)
/// </param>
/// <exception cref="ApplicationException">
/// no extension could be derived from the media type, the Content-Disposition header or DraftFilespec
/// </exception>
/// <remarks>
/// because this is executed DURING HttpResponse processing, it must be quick (no long debugging!) to avoid timeout
/// </remarks>
void TargetFilespecs(WebPage webpage, HttpResponseMessage rsp, out string extn, out string filespec2, out string filespec3)
{
    var filenameOnly = Utils.TrimOrNull(Path.GetFileNameWithoutExtension(webpage.DraftFilespec));

    // Content and the Content-Type header may both be absent, so every hop is null-tolerant
    var headers = rsp.Content?.Headers;
    var mtyp = headers?.ContentType?.MediaType;     // "application/json", "application/manifest+json"

    // MediaType takes priority over DraftFilespec for EXTN.
    // Path.GetExtension returns ".ext", so the leading dot is stripped to honour the "without the dot" contract.
    extn = (mtyp == null ? null : MimeCollection.LookupExtnFromMime(mtyp))
        ?? Utils.TrimOrNull(Path.GetExtension(webpage.DraftFilespec)?.TrimStart('.'));

    var contdisp = headers?.ContentDisposition;
    var rawName = contdisp?.FileName ?? contdisp?.FileNameStar ?? contdisp?.Name;
    if (!string.IsNullOrWhiteSpace(rawName))        // e.g. "Content-Disposition: inline" carries no name at all
    {
        var dispFileName = Path.GetFileName(        // e.g. "json.json" (prevent any malicious device/folder spec)
            Utils.MakeValid(rawName));              // filter out any spurious chars (e.g. double-quotes)
        if (!string.IsNullOrWhiteSpace(dispFileName))
        {
            var (dispName, dispExtn) = Utils.FileExtSplit(dispFileName);
            if (dispName != null)
            {
                filenameOnly = dispName;            // ContentDisposition.FileName takes priority over webpage.DraftFilespec for file NAME
            }                                       // (an unusable name leaves the draft name in place)
            if (!string.IsNullOrWhiteSpace(dispExtn))
            {
                extn = dispExtn;                    // ContentDisposition takes priority over MediaType for EXTN
            }
        }
    }

    if (extn == null)                               // abort: no source supplied an extension
    {
        // || !ct2extn.IsText       TODO: write non-UTF-8 file code
        /*
         * "application/manifest+json"
         */
        throw new ApplicationException($"unknown extn for Url={webpage.Url}");  // TODO: consider accepting a plain filename (no extn)
    }

    var filespec1 = (filenameOnly ?? Utils.RandomFilenameOnly())    // NB (original author's note) this produces a file5678 format
        + EXTN_SEPARATOR + extn;                                    // filename plus extension
    var folder = (extn == HTML) ? HtmlPath : OtherPath;             // device and folder path

    // if this is a reload, keep the original filespec (will compare later), unless it holds a previous error message
    var previous = Utils.TrimOrNull(webpage.Filespec);
    filespec2 = filespec3 = (previous != null && !previous.StartsWith(ERRTAG, StringComparison.Ordinal))
        ? previous
        : Path.Combine(folder, filespec1);

    if (File.Exists(filespec2) || filespec2.Length > FILESIZE)
    {
        webpage.DraftFilespec = filespec1;          // keep our 2nd choice of fn.extn [simple debug aid]
        do                                          // use alternate file target
        {
            filespec3 = Path.Combine(folder, Utils.RandomFilenameOnly() + EXTN_SEPARATOR + extn);   // no 100% guarantee that file5678.extn file doesn't exist
            Debug.Assert(filespec3.Length <= FILESIZE, "reduce folder length for htmldir / otherdir in App.config for AppSettings");
        } while (File.Exists(filespec3));           // hopefully rare and finite case !
    }
}
//static readonly char[] DIRSEP = { Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar };

/// <summary>
/// Splits a raw file name into its base name and its extension.
/// </summary>
/// <param name="instr">
/// raw file name; it is passed through MakeValid before being split
/// </param>
/// <returns>
/// (base name, extension without the leading dot) when MimeCollection.IsValidExtn accepts the extension;
/// (whole sanitised name, null) when it does not;
/// (null, null) when the sanitised input or its base name is null or blank
/// </returns>
public static (string filename, string extn) FileExtSplit(string instr)
{
    // original author's note: MakeValid removes any trailing "/" and finishes with .Trim() (not TrimOrNull())
    var cleaned = MakeValid(instr);
    if (string.IsNullOrWhiteSpace(cleaned))
    {
        return (null, null);
    }

    var baseName = Path.GetFileNameWithoutExtension(cleaned);
    if (string.IsNullOrWhiteSpace(baseName))    // there MUST be a filename part
    {
        return (null, null);
    }

    var suffix = Path.GetExtension(cleaned);
    var bareSuffix = (suffix.Length > 0 && suffix[0] == '.')
        ? suffix.Substring(1)                   // drop the leading dot
        : suffix;

    if (MimeCollection.IsValidExtn(bareSuffix))
    {
        return (baseName, bareSuffix);          // recognised: pass the extension through as-is
    }

    return (cleaned, null);                     // unrecognised: make no guess (content/type will prevail later)
}
static int MaxFileSize;     // download size limit; Main falls back to 10 MB when "maxfilesize" is not configured

/// <summary>
/// Program entry point: loads the MIME table, prepares the cache folders and limits from App.config,
/// runs DownloadAndParse (up to three attempts, with a fresh EF context after each failure),
/// then runs HtmlLocalise and waits for the user to press Enter.
/// </summary>
/// <param name="_">command-line arguments (unused)</param>
static async Task Main(string[] _)
{
    //string fs1 = @"C:\Ligonier\webcache\state - theology - does - sin - deserve - damnation.html",
    //       fs2 = @"C:\Ligonier\webcache\assets\bible - plan.pdf";
    //var rel = Utils.GetRelativePath(fs1, fs2);
    //Console.WriteLine(rel);

    // creates the folder when it is not there yet
    void EnsureFolder(string path)
    {
        if (!Directory.Exists(path))
        {
            Directory.CreateDirectory(path);
        }
    }

    dbctx = new WebModel();                                 // EF context defaults to config: "name=DefaultConnection"

    // TODO: probably should configure based on App.config
    IAsyncPolicy adoRetry = Policy
        .Handle<Exception>(ex => true)                      // retry every exception! TODO: improve
        .WaitAndRetryAsync(5, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 4));    // i.e. 0.5, 1, 2, 4, 8 second retries

    //IRepository repo = new Repository(dbctx);
    IRepository repo = new BulkRepository(dbctx, adoRetry);
    MimeCollection.Load(await repo.GetContentTypeToExtnsAsync());
    //var ct = new CancellationToken();

    // --- folders (each one is created on first use) ---
    var settings = ConfigurationManager.AppSettings;

    htmldir = settings["htmldir"] ?? @"C:\Ligonier\webcache";
    EnsureFolder(htmldir);

    var otherdir = settings["otherdir"] ?? (htmldir + Path.DirectorySeparatorChar + OTHFOLDER);
    EnsureFolder(otherdir);

    backupdir = settings["backupdir"] ?? (htmldir + Path.DirectorySeparatorChar + BACKUPFOLDER);
    EnsureFolder(backupdir);

    // --- numeric limits, each with a default for a missing or unparsable setting ---
    var batchSize = int.TryParse(settings["batchsize"], out var parsedBatch) ? parsedBatch : 4;
    MaxLinks = int.TryParse(settings["maxlinks"], out var parsedLinks) ? parsedLinks : 1500;
    MaxFileSize = int.TryParse(settings["maxfilesize"], out var parsedSize) ? parsedSize : 10_000_000;   // 10 MB

    // status codes for which the HTTP policy treats the response as retryable
    var retryableStatuses = new HttpStatusCode[]
    {
        HttpStatusCode.Ambiguous,               // 300
        HttpStatusCode.Conflict,                // 409
        HttpStatusCode.InternalServerError,     // 500
        HttpStatusCode.NotImplemented,          // 501
        HttpStatusCode.BadGateway,              // 502
        HttpStatusCode.ServiceUnavailable,      // 503
        HttpStatusCode.GatewayTimeout           // 504
    };

    // TODO: probably should configure based on App.config
    // NOTE(review): the retry count is 0, so this policy never retries; the earlier comment described
    // waits of 1, 2, 4 seconds, which would need a count of 3 - confirm which is intended
    IAsyncPolicy<HttpResponseMessage> httpRetry = Policy
        .HandleResult<HttpResponseMessage>(rsp => retryableStatuses.Contains(rsp.StatusCode))
        .WaitAndRetryAsync(0, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 2));

#pragma warning disable GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement
    //TODO: plug-in Polly as MessageProcessingHandler / whatever !
    var handler = new HttpClientHandler
    {
        AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
        AllowAutoRedirect = true
    };
    var httpClient = new HttpClient(handler)
    {
        Timeout = TimeSpan.FromSeconds(20)
    };
#pragma warning restore GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement

    var p = new Program();
    var retriesLeft = 2;
    Downloader download;
    while (true)
    {
        HParser = new HapParser(MaxLinks);
        download = new Downloader(repo, httpClient, httpRetry, HParser, htmldir, otherdir, backupdir, MaxFileSize);
        if (await p.DownloadAndParse(repo, batchSize, download))
        {
            break;                                          // success
        }

        // failure may be due to tainted EF context so have to reset all these
        dbctx = new WebModel();                             // EF context defaults to config: "name=DefaultConnection"
        repo = new BulkRepository(dbctx, adoRetry);
        if (--retriesLeft < 0)
        {
            break;                                          // attempts exhausted; carry on regardless
        }
    }
    Console.WriteLine("*** DownloadAndParse FINISHED ***");

    var localise = new Localiser(HParser, htmldir, backupdir, download);
    await p.HtmlLocalise(repo, batchSize, localise, getMissing: true);
    Console.WriteLine("*** HtmlLocalise FINISHED ***");

#if DEBUG
    foreach (var missing in MimeCollection.MissingExtns.OrderBy(e => e))
    {
        Console.WriteLine($"missing extn\t{missing}");
    }
#endif

    Console.ReadLine();                                     // hold the console open until Enter is pressed
}
/// <summary>
/// Diagnostic entry point: loads the MIME table and every WebPage row, parses one cached HTML file,
/// prints the UriBuilder-normalised form of a sample URL when it differs, then for every page looks up
/// its standardised URL (changeUrl: true) and the same URL with http/https swapped (changeUrl: false).
/// </summary>
/// <param name="args">command-line arguments (unused)</param>
static async Task Main(string[] args)
{
    ctx = new WebModel();
    repo = new Repository(ctx);
    MimeCollection.Load(await repo.GetContentTypeToExtnsAsync());
    allpages = ctx.WebPages.ToList();
    var p = new Program();          // not referenced again in this method

    #region unused
    /*
     * var urls = new string[] {
     * "http://www.ligonier.org", //
     * "http://www.ligonier.org/blog",
     * "http://www.ligonier.org/blog/category/ministry-news",
     * "http://www.ligonier.org?", //
     * "http://www.ligonier.org/blog?",
     * "http://www.ligonier.org/blog/category/ministry-news?",
     * "https://www.ligonier.org", //
     * "https://www.ligonier.org/blog",
     * "https://www.ligonier.org/blog/category/ministry-news",
     * "https://www.ligonier.org?", //
     * "https://www.ligonier.org/blog?",
     * "https://www.ligonier.org/blog/category/ministry-news?",
     * "https://www.ligonier.org/", //
     * "https://www.ligonier.org/blog/", //
     * "https://www.ligonier.org/blog/category/ministry-news/",
     * "https://www.ligonier.org/?", //
     * "https://www.ligonier.org/blog/?", //
     * "https://www.ligonier.org/blog/category/ministry-news/?",
     * "https://www.ligonier.org?abc=123", //
     * "https://www.ligonier.org/blog?abc=123", //
     * "https://www.ligonier.org/blog/category/ministry-news?abc=123",
     * "https://www.ligonier.org/?abc=123", //
     * "https://www.ligonier.org/blog/?abc=123", //
     * "https://www.ligonier.org/blog/category/ministry-news/?abc=123",
     * "https://www.ligonier.org?abc=123", //
     * "https://www.ligonier.org/blog?abc=123", //
     * "https://www.ligonier.org/blog/category/ministry-news?abc=123"
     * };
     * foreach (var url in urls)
     * {
     * var u2 = StdUrl(url);
     * }
     */
    #endregion

    // parse one cached page; the link list is not used further in this method
    var u = "http://www.ligonier.org/store/keyword/apologetics";
    var fs = @"C:\Ligonier\webcache\41m4uuk2.html";
    var parser = new HapParser();
    parser.LoadFromFile(u, fs);
    var lnks = parser.GetLinks();

    // show what UriBuilder does to a sample URL
    var url0 = "http://www.ligonier.org/store/keyword/apologetics";
    var bld = new UriBuilder(url0);
    var url1 = bld.Uri.AbsoluteUri;
    if (url0 != url1)
    {
        Console.WriteLine($"{url0}\t->\t{url1}");
    }

    // Full "scheme://" prefixes are compared: a bare StartsWith("http") is also true for "https://...",
    // which used to turn https URLs into "httpss://..." and left the http branch unreachable.
    var httpPrefix = Uri.UriSchemeHttp + Uri.SchemeDelimiter;       // "http://"
    var httpsPrefix = Uri.UriSchemeHttps + Uri.SchemeDelimiter;     // "https://"

    foreach (var webpage in allpages)
    {
        var url = webpage.Url;
        var url2 = StdUrl(url);
        LookupPage2(webpage, url, url2, changeUrl: true);

        // second lookup uses the same URL under the opposite scheme
        string swapped;
        if (url2.StartsWith(httpsPrefix, StringComparison.OrdinalIgnoreCase))
        {
            swapped = Uri.UriSchemeHttp + url2.Substring(Uri.UriSchemeHttps.Length);
        }
        else if (url2.StartsWith(httpPrefix, StringComparison.OrdinalIgnoreCase))
        {
            swapped = Uri.UriSchemeHttps + url2.Substring(Uri.UriSchemeHttp.Length);
        }
        else
        {
            continue;               // neither http nor https: there is no alternate scheme to try
        }

        LookupPage2(webpage, url, swapped, changeUrl: false);
    }
}