Exemplo n.º 1
0
        /// <summary>
        ///     given HttpResponseMessage, invent two candidate target filespecs
        /// </summary>
        /// <param name="webpage">
        ///     current webpage being downloaded. Its DraftFilespec, Filespec and Url are read;
        ///     DraftFilespec is overwritten with the first-choice "name.extn" when an alternate target has to be generated
        /// </param>
        /// <param name="rsp">
        ///     HttpResponseMessage (Content-Type and Content-Disposition headers are inspected; either may be absent)
        /// </param>
        /// <param name="extn">
        ///     extension without the dot (e.g. "html")
        /// </param>
        /// <param name="filespec2">
        ///     filespec supposed from basic data before the HttpRequest
        /// </param>
        /// <param name="filespec3">
        ///     filespec based on the HttpResponse content (or generated if borderline case)
        /// </param>
        /// <exception cref="ApplicationException">
        ///     no extension could be derived from the media type, the Content-Disposition filename or DraftFilespec
        /// </exception>
        /// <remarks>
        ///     because this is executed DURING HttpResponse processing, it must be quick (no long debugging!) to avoid timeout
        /// </remarks>
        void TargetFilespecs(WebPage webpage, HttpResponseMessage rsp, out string extn, out string filespec2, out string filespec3)
        {
            var filenameOnly = Utils.TrimOrNull(Path.GetFileNameWithoutExtension(webpage.DraftFilespec));
            var headers      = rsp.Content?.Headers;
            var mtyp         = headers?.ContentType?.MediaType;            // "application/json", "application/manifest+json"; null if no Content-Type header

            // Path.GetExtension returns ".html" (WITH the dot) but extn is dot-less everywhere else, so strip it here
            var draftExtn = Utils.TrimOrNull(Path.GetExtension(webpage.DraftFilespec)?.TrimStart('.'));

            extn = (mtyp == null ? null : MimeCollection.LookupExtnFromMime(mtyp))   // MediaType takes priority over DraftFilespec for EXTN
                   ?? draftExtn;

            var contdisp = headers?.ContentDisposition;
            var rawName  = contdisp?.FileName ?? contdisp?.FileNameStar ?? contdisp?.Name;

            if (!string.IsNullOrWhiteSpace(rawName))                       // header may be present without any filename parameter
            {
                var fileName = Path.GetFileName(                           // e.g. "json.json" (prevent any malicious device/folder spec)
                    Utils.MakeValid(rawName));                             // filter out any spurious chars(e.g. double-quotes)
                if (!string.IsNullOrWhiteSpace(fileName))
                {
                    var (name2, extn2) = Utils.FileExtSplit(fileName);
                    if (name2 != null)
                    {
                        filenameOnly = name2;                              // ContentDisposition.FileName takes priority over webpage.DraftFilespec for file NAME
                    }                                                      // (but an unusable name must not erase the one we already have)
                    if (!string.IsNullOrWhiteSpace(extn2))
                    {
                        extn = extn2;                                      // ContentDisposition takes priority over MediaType for EXTN
                    }
                }
            }

            if (extn == null)                                              // abort: neither MediaType, ContentDisposition nor DraftFilespec yielded an extension
            {
                //  || !ct2extn.IsText TODO: write non-UTF-8 file code

                /*
                 * "application/manifest+json"
                 */
                throw new ApplicationException($"unknown extn for Url={webpage.Url}");   // TODO: consider accepting a plain filename (no extn)
            }
            var filespec1 = (filenameOnly ?? Utils.RandomFilenameOnly())                 // NB this produces a file5678 format
                            + EXTN_SEPARATOR + extn;                                     // filename & extension
            var folder = (extn == HTML) ? HtmlPath : OtherPath;                          // device & folder path

            filespec2 = Utils.TrimOrNull(webpage.Filespec);                              // if this is a reload, assign the original to filespec2 (will compare later)
            filespec2 = filespec3 = (filespec2 != null && !filespec2.StartsWith(ERRTAG)) // skip any previous error message
                ? filespec2
                : Path.Combine(folder, filespec1);
            if (File.Exists(filespec2) || filespec2.Length > FILESIZE)                   // first choice already taken, or its path is longer than FILESIZE
            {
                webpage.DraftFilespec = filespec1;                                                        // keep our 2nd choice of fn.extn [simple debug aid]
                do                                                                                        // use alternate file target
                {
                    filespec3 = Path.Combine(folder, Utils.RandomFilenameOnly() + EXTN_SEPARATOR + extn); // no 100% guarantee that file5678.extn file doesn't exist
                    Debug.Assert(filespec3.Length <= FILESIZE, "reduce folder length for htmldir / otherdir in App.config for AppSettings");
                } while (File.Exists(filespec3));                                                         // hopefully rare and finite case !
            }
        }
Exemplo n.º 2
0
        //static readonly char[] DIRSEP = { Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar };

        /// <summary>
        ///     Splits a raw file name into its name part and its (dot-less) extension.
        /// </summary>
        /// <param name="instr">
        ///     raw file name; it is passed through MakeValid before being examined
        /// </param>
        /// <returns>
        ///     (null, null) when the sanitised input is blank or has no name part;
        ///     (name, extn) when MimeCollection.IsValidExtn accepts the extension;
        ///     otherwise (whole sanitised input, null) so that no guess is made about the extension
        /// </returns>
        public static (string filename, string extn) FileExtSplit(string instr)
        {
            var cleaned = MakeValid(instr);
            if (string.IsNullOrWhiteSpace(cleaned))
            {
                return (null, null);
            }

            var stem = Path.GetFileNameWithoutExtension(cleaned);
            if (string.IsNullOrWhiteSpace(stem))
            {
                return (null, null);                // there MUST be a file name
            }

            var suffix = Path.GetExtension(cleaned);
            if (suffix.Length > 0 && suffix[0] == '.')
            {
                suffix = suffix.Substring(1);       // drop the leading dot
            }

            if (MimeCollection.IsValidExtn(suffix))
            {
                return (stem, suffix);              // known extension: pass it on as-is
            }
            return (cleaned, null);                 // unknown: the content type will decide later
        }
Exemplo n.º 3
0
        static int MaxFileSize;                 // don't download files bigger than 10 MB

        /// <summary>
        ///     Console entry point: loads settings from App.config, makes sure the working folders exist,
        ///     runs DownloadAndParse (rebuilding the EF context and repository after a failed run, up to 3 runs in total),
        ///     then runs HtmlLocalise and waits for the user to press Enter.
        /// </summary>
        static async Task Main(string[] _)
        {
            //string fs1 = @"C:\Ligonier\webcache\state - theology - does - sin - deserve - damnation.html",
            //    fs2 = @"C:\Ligonier\webcache\assets\bible - plan.pdf";
            //var rel = Utils.GetRelativePath(fs1, fs2);
            //Console.WriteLine(rel);

            // --- small helpers, local to Main ---
            string Setting(string key) => ConfigurationManager.AppSettings[key];

            int IntSetting(string key, int fallback) => int.TryParse(Setting(key), out var parsed) ? parsed : fallback;

            void EnsureFolder(string path)
            {
                if (!Directory.Exists(path))
                {
                    Directory.CreateDirectory(path);
                }
            }

            // --- database access ---
            dbctx = new WebModel();                                         // EF context defaults to config: "name=DefaultConnection"

            // TODO: probably should configure based on App.config
            // retries EVERY exception (TODO: improve), 5 times, waiting 0.5, 1, 2, 4, 8 seconds
            IAsyncPolicy adoRetryPolicy = Policy
                                          .Handle <Exception>(ex => true)
                                          .WaitAndRetryAsync(5, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 4));

            //IRepository repo = new Repository(dbctx);
            IRepository repo = new BulkRepository(dbctx, adoRetryPolicy);

            MimeCollection.Load(await repo.GetContentTypeToExtnsAsync());

            // --- folders ---
            //var ct = new CancellationToken();
            htmldir = Setting("htmldir") ?? @"C:\Ligonier\webcache";
            EnsureFolder(htmldir);

            var otherdir = Setting("otherdir") ?? $"{htmldir}{Path.DirectorySeparatorChar}{OTHFOLDER}";
            EnsureFolder(otherdir);

            backupdir = Setting("backupdir") ?? $"{htmldir}{Path.DirectorySeparatorChar}{BACKUPFOLDER}";
            EnsureFolder(backupdir);

            // --- numeric settings (fall back to defaults when missing or not an integer) ---
            var batchSize = IntSetting("batchsize", 4);
            MaxLinks    = IntSetting("maxlinks", 1500);
            MaxFileSize = IntSetting("maxfilesize", 10_000_000);            // 10 MB

            // --- HTTP ---
            var retryableStatuses = new[]
            {
                HttpStatusCode.Ambiguous,               // 300
                HttpStatusCode.Conflict,                // 409
                HttpStatusCode.InternalServerError,     // 500
                HttpStatusCode.NotImplemented,          // 501
                HttpStatusCode.BadGateway,              // 502
                HttpStatusCode.ServiceUnavailable,      // 503
                HttpStatusCode.GatewayTimeout           // 504
            };

            // TODO: probably should configure based on App.config
            // NOTE(review): the retry count is 0, so this policy never actually retries; the delay
            //               formula would give 1, 2, 4 seconds for a count of 3 - confirm which is intended
            IAsyncPolicy <HttpResponseMessage> httpRetryPolicy = Policy
                                                                 .HandleResult <HttpResponseMessage>(rsp => retryableStatuses.Contains(rsp.StatusCode))
                                                                 .WaitAndRetryAsync(0, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 2));

#pragma warning disable GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement
            //TODO: plug-in Polly as MessageProcessingHandler / whatever !
            var handler = new HttpClientHandler
            {
                AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
                AllowAutoRedirect      = true
            };
            var client = new HttpClient(handler) { Timeout = TimeSpan.FromSeconds(20) };
#pragma warning restore GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement

            // --- download & parse, with up to 2 re-runs ---
            var        program      = new Program();
            var        attemptsLeft = 2;
            Downloader download;
            do
            {
                HParser  = new HapParser(MaxLinks);
                download = new Downloader(repo, client, httpRetryPolicy, HParser, htmldir, otherdir, backupdir, MaxFileSize);

                if (await program.DownloadAndParse(repo, batchSize, download))
                {
                    break;
                }

                // failure may be due to tainted EF context so have to reset all these
                dbctx = new WebModel();                 // EF context defaults to config: "name=DefaultConnection"
                repo  = new BulkRepository(dbctx, adoRetryPolicy);
                attemptsLeft--;
            } while (attemptsLeft >= 0);
            Console.WriteLine("*** DownloadAndParse FINISHED ***");

            // --- localise the downloaded HTML ---
            var localise = new Localiser(HParser, htmldir, backupdir, download);
            await program.HtmlLocalise(repo, batchSize, localise, getMissing : true);

            Console.WriteLine("*** HtmlLocalise FINISHED ***");

#if DEBUG
            foreach (var extn in MimeCollection.MissingExtns.OrderBy(e => e))
            {
                Console.WriteLine($"missing extn\t{extn}");
            }
#endif

            Console.ReadLine();                         // keep the console window open
        }
Exemplo n.º 4
0
        /// <summary>
        ///     Maintenance entry point: loads the MIME table and all WebPages, runs a couple of ad-hoc parser/URL checks,
        ///     then for every page calls LookupPage2 with the standardised URL (changeUrl: true) and again with the
        ///     same URL under the opposite http/https scheme (changeUrl: false).
        /// </summary>
        /// <param name="args">command-line arguments (not used)</param>
        static async Task Main(string[] args)
        {
            ctx  = new WebModel();
            repo = new Repository(ctx);
            MimeCollection.Load(await repo.GetContentTypeToExtnsAsync());
            allpages = ctx.WebPages.ToList();

            var p = new Program();

            #region unused

            /*
             * var urls = new string[] {
             * "http://www.ligonier.org",                        //
             * "http://www.ligonier.org/blog",
             * "http://www.ligonier.org/blog/category/ministry-news",
             * "http://www.ligonier.org?",                        //
             * "http://www.ligonier.org/blog?",
             * "http://www.ligonier.org/blog/category/ministry-news?",
             * "https://www.ligonier.org",                        //
             * "https://www.ligonier.org/blog",
             * "https://www.ligonier.org/blog/category/ministry-news",
             * "https://www.ligonier.org?",                        //
             * "https://www.ligonier.org/blog?",
             * "https://www.ligonier.org/blog/category/ministry-news?",
             * "https://www.ligonier.org/",                        //
             * "https://www.ligonier.org/blog/",                        //
             * "https://www.ligonier.org/blog/category/ministry-news/",
             * "https://www.ligonier.org/?",                        //
             * "https://www.ligonier.org/blog/?",                        //
             * "https://www.ligonier.org/blog/category/ministry-news/?",
             * "https://www.ligonier.org?abc=123",                        //
             * "https://www.ligonier.org/blog?abc=123",                        //
             * "https://www.ligonier.org/blog/category/ministry-news?abc=123",
             * "https://www.ligonier.org/?abc=123",                        //
             * "https://www.ligonier.org/blog/?abc=123",                        //
             * "https://www.ligonier.org/blog/category/ministry-news/?abc=123",
             * "https://www.ligonier.org?abc=123",                        //
             * "https://www.ligonier.org/blog?abc=123",                        //
             * "https://www.ligonier.org/blog/category/ministry-news?abc=123"
             * };
             * foreach (var url in urls)
             * {
             * var u2 = StdUrl(url);
             * }
             */
            #endregion

            // ad-hoc check: parse one cached page and extract its links (result is only inspected in the debugger)
            var u       = "http://www.ligonier.org/store/keyword/apologetics";
            var fs      = @"C:\Ligonier\webcache\41m4uuk2.html";
            var HParser = new HapParser();
            HParser.LoadFromFile(u, fs);
            var lnks = HParser.GetLinks();

            // ad-hoc check: report if UriBuilder changes the URL text
            var url0 = "http://www.ligonier.org/store/keyword/apologetics";
            var bld  = new UriBuilder(url0);
            var url1 = bld.Uri.AbsoluteUri;
            if (url0 != url1)
            {
                Console.WriteLine($"{url0}\t->\t{url1}");
            }

            foreach (var webpage in allpages)
            {
                var url  = webpage.Url;
                var url2 = StdUrl(url);
                LookupPage2(webpage, url, url2, changeUrl: true);

                // second pass with the scheme flipped (http <-> https); skipped when the URL is neither
                var swapped = SwapHttpScheme(url2);
                if (swapped != null)
                {
                    LookupPage2(webpage, url, swapped, changeUrl: false);
                }
            }
        }

        /// <summary>
        ///     Returns <paramref name="url"/> with an "http://" prefix replaced by "https://" or vice versa.
        /// </summary>
        /// <remarks>
        ///     The scheme is matched together with "://" because a bare StartsWith("http") is also true
        ///     for "https://..." and would turn it into "httpss://...".
        /// </remarks>
        /// <returns>the URL under the other scheme, or null when the URL is null or neither http nor https</returns>
        private static string SwapHttpScheme(string url)
        {
            if (url == null)
            {
                return null;
            }

            var httpPrefix  = Uri.UriSchemeHttp + Uri.SchemeDelimiter;      // "http://"
            var httpsPrefix = Uri.UriSchemeHttps + Uri.SchemeDelimiter;     // "https://"

            if (url.StartsWith(httpsPrefix, StringComparison.OrdinalIgnoreCase))
            {
                return httpPrefix + url.Substring(httpsPrefix.Length);
            }
            if (url.StartsWith(httpPrefix, StringComparison.OrdinalIgnoreCase))
            {
                return httpsPrefix + url.Substring(httpPrefix.Length);
            }
            return null;
        }