Beispiel #1
0
        static int MaxFileSize;                 // upper size limit for downloaded files; Main sets it from the "maxfilesize" app setting, falling back to 10_000_000 (10 MB)

        /// <summary>
        /// Program entry point: reads settings from the application configuration, makes sure the
        /// working directories exist, runs the download-and-parse phase (up to three attempts) and
        /// then the HTML localisation phase.
        /// </summary>
        /// <param name="_">Command-line arguments; not used.</param>
        static async Task Main(string[] _)
        {
            dbctx = new WebModel();                     // EF context defaults to config: "name=DefaultConnection"

            // TODO: probably should configure based on App.config
            // Retries on every exception (TODO: improve) up to 5 times, waiting 0.5, 1, 2, 4, 8 seconds.
            IAsyncPolicy adoRetryPolicy =
                Policy.Handle<Exception>(ex => true)
                      .WaitAndRetryAsync(5, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 4));

            IRepository repo = new BulkRepository(dbctx, adoRetryPolicy);

            MimeCollection.Load(await repo.GetContentTypeToExtnsAsync());

            // Directory.CreateDirectory does nothing when the directory already exists,
            // so no separate Directory.Exists check is needed.
            htmldir = ConfigurationManager.AppSettings["htmldir"] ?? @"C:\Ligonier\webcache";
            Directory.CreateDirectory(htmldir);

            var otherdir = ConfigurationManager.AppSettings["otherdir"] ?? Path.Combine(htmldir, OTHFOLDER);
            Directory.CreateDirectory(otherdir);

            backupdir = ConfigurationManager.AppSettings["backupdir"] ?? Path.Combine(htmldir, BACKUPFOLDER);
            Directory.CreateDirectory(backupdir);

            // Numeric settings fall back to built-in defaults when missing or not a valid integer.
            if (!int.TryParse(ConfigurationManager.AppSettings["batchsize"], out var batchSize))
            {
                batchSize = 4;
            }
            if (!int.TryParse(ConfigurationManager.AppSettings["maxlinks"], out MaxLinks))
            {
                MaxLinks = 1500;
            }
            if (!int.TryParse(ConfigurationManager.AppSettings["maxfilesize"], out MaxFileSize))
            {
                MaxFileSize = 10_000_000;               // 10 MB
            }

            // HTTP status codes that the HTTP retry policy treats as retryable.
            var validRetry = new HttpStatusCode[] {
                HttpStatusCode.Ambiguous,               // 300
                HttpStatusCode.Conflict,                // 409
                HttpStatusCode.InternalServerError,     // 500
                HttpStatusCode.NotImplemented,          // 501
                HttpStatusCode.BadGateway,              // 502
                HttpStatusCode.ServiceUnavailable,      // 503
                HttpStatusCode.GatewayTimeout           // 504
            };

            // TODO: probably should configure based on App.config
            // NOTE: the retry count is 0, so this policy currently performs NO retries. The back-off
            // formula would give waits of 1, 2, 4 seconds if the count were raised to 3.
            IAsyncPolicy<HttpResponseMessage> httpRetryPolicy =
                Policy.HandleResult<HttpResponseMessage>(rsp => validRetry.Contains(rsp.StatusCode))
                      .WaitAndRetryAsync(0, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt) / 2));

#pragma warning disable GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement
            //TODO: plug-in Polly as MessageProcessingHandler / whatever !
            // A single HttpClient is shared by every download for the rest of the run.
            var client = new HttpClient(
                new HttpClientHandler
                {
                    AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
                    AllowAutoRedirect      = true
                })
            {
                Timeout = TimeSpan.FromSeconds(20)
            };
#pragma warning restore GCop302 // Since '{0}' implements IDisposable, wrap it in a using() statement

            var        p            = new Program();
            var        attemptsLeft = 3;                // one initial attempt plus two retries
            bool       succeeded;
            Downloader download;
            do
            {
                HParser   = new HapParser(MaxLinks);
                download  = new Downloader(repo, client, httpRetryPolicy, HParser, htmldir, otherdir, backupdir, MaxFileSize);
                succeeded = await p.DownloadAndParse(repo, batchSize, download);

                if (!succeeded)                         // failure may be due to tainted EF context so have to reset all these
                {
                    dbctx = new WebModel();             // EF context defaults to config: "name=DefaultConnection"
                    repo  = new BulkRepository(dbctx, adoRetryPolicy);
                }
            } while (!succeeded && --attemptsLeft > 0);

            if (!succeeded)
            {
                Console.WriteLine("*** DownloadAndParse FAILED on every attempt ***");

                // The last Downloader was built around the repository that has just been replaced;
                // rebuild it so the localisation phase does not work through the discarded context.
                download = new Downloader(repo, client, httpRetryPolicy, HParser, htmldir, otherdir, backupdir, MaxFileSize);
            }
            Console.WriteLine("*** DownloadAndParse FINISHED ***");

            var localise = new Localiser(HParser, htmldir, backupdir, download);
            await p.HtmlLocalise(repo, batchSize, localise, getMissing : true);

            Console.WriteLine("*** HtmlLocalise FINISHED ***");

#if DEBUG
            // Debug builds list the entries collected in MimeCollection.MissingExtns.
            foreach (var extn in MimeCollection.MissingExtns.OrderBy(e => e))
            {
                Console.WriteLine($"missing extn\t{extn}");
            }
#endif

            Console.ReadLine();                         // keep the console window open until Enter is pressed
        }
Beispiel #2
0
        /// <summary>
        /// Diagnostic entry point: loads every stored web page, runs two ad-hoc checks (parsing one
        /// cached file and a UriBuilder round-trip), then looks each page up by its standardised URL
        /// and by the same URL with the http/https scheme swapped.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static async Task Main(string[] args)
        {
            ctx  = new WebModel();
            repo = new Repository(ctx);
            MimeCollection.Load(await repo.GetContentTypeToExtnsAsync());
            allpages = ctx.WebPages.ToList();           // materialise all pages once, up front

            var p = new Program();

            // Ad-hoc check: load one cached page into the parser and extract its links.
            // The extracted links are not used any further in this method.
            var u       = "http://www.ligonier.org/store/keyword/apologetics";
            var fs      = @"C:\Ligonier\webcache\41m4uuk2.html";
            var HParser = new HapParser();
            HParser.LoadFromFile(u, fs);
            var lnks = HParser.GetLinks();

            // Ad-hoc check: report when a UriBuilder round-trip alters the URL text.
            var url0 = "http://www.ligonier.org/store/keyword/apologetics";
            var bld  = new UriBuilder(url0);
            var url1 = bld.Uri.AbsoluteUri;
            if (url0 != url1)
            {
                Console.WriteLine($"{url0}\t->\t{url1}");
            }

            var httpPrefix  = Uri.UriSchemeHttp + Uri.SchemeDelimiter;      // "http://"
            var httpsPrefix = Uri.UriSchemeHttps + Uri.SchemeDelimiter;     // "https://"

            foreach (var webpage in allpages)
            {
                var url  = webpage.Url;
                var url2 = StdUrl(url);
                LookupPage2(webpage, url, url2, changeUrl: true);

                // Second lookup uses the opposite scheme. "https://" must be tested first (and with
                // the delimiter), because every https URL also starts with the text "http".
                if (url2.StartsWith(httpsPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    url2 = Uri.UriSchemeHttp + url2.Substring(Uri.UriSchemeHttps.Length);
                }
                else if (url2.StartsWith(httpPrefix, StringComparison.OrdinalIgnoreCase))
                {
                    url2 = Uri.UriSchemeHttps + url2.Substring(Uri.UriSchemeHttp.Length);
                }
                else
                {
                    continue;                           // neither http nor https: there is no alternate scheme to try
                }
                LookupPage2(webpage, url, url2, changeUrl: false);
            }
        }