示例#1
0
        /// <summary>
        /// Queues every anchor link of <paramref name="document"/> whose address starts with
        /// <c>https://www.webtoons.com</c> into <paramref name="uriBucket"/>.
        /// </summary>
        /// <param name="uriBucket">Bucket that receives one <see cref="WaitingPage"/> per accepted link.</param>
        /// <param name="document">Parsed HTML document whose <c>Links</c> collection is scanned.</param>
        /// <remarks>
        /// Anchors with a missing, empty or malformed <c>Href</c> are skipped instead of
        /// aborting the whole page with a <see cref="UriFormatException"/>.
        /// </remarks>
        private static void GetAllLinksToDB(IUriBucket <WaitingPage> uriBucket, IHtmlDocument document)
        {
            const string SitePrefix = "https://www.webtoons.com";

            foreach (var anchor in document.Links.OfType <IHtmlAnchorElement>())
            {
                var href = anchor.Href;

                // URLs are identifiers, not linguistic text: compare ordinally, not with the current culture.
                // NOTE(review): a plain prefix test also accepts hosts such as
                // "www.webtoons.com.example.org" - confirm whether a host comparison is wanted instead.
                if (string.IsNullOrEmpty(href) || !href.StartsWith(SitePrefix, StringComparison.Ordinal))
                {
                    continue;
                }

                // One broken href must not stop the remaining links from being queued.
                if (!Uri.TryCreate(href, UriKind.Absolute, out var link))
                {
                    continue;
                }

                // Uri.Scheme is already normalized to lower case, so no ToLower() is required.
                // The prefix test implies https today; the check is kept as a guard should the prefix change.
                if (link.Scheme != Uri.UriSchemeHttp && link.Scheme != Uri.UriSchemeHttps)
                {
                    continue;
                }

                uriBucket.Add(new WaitingPage {
                    Uri = link.ToString()
                });
            }
        }
示例#2
0
        /// <summary>
        /// Crawls a single page: downloads it, parses the result as HTML, queues the links it
        /// contains and hands the parsed document to the page-content processing step.
        /// </summary>
        /// <param name="pageToCrawl">
        /// Page to fetch. Its <c>Uri</c> and <c>Verb</c> build the request; its
        /// <c>DownloadedTime</c> is set once the download has completed.
        /// </param>
        /// <param name="uriBucket">Bucket that receives the links discovered on the page.</param>
        /// <returns>A task that completes when the page has been downloaded and processed.</returns>
        /// <exception cref="ArgumentException"><c>pageToCrawl.Verb</c> is null or empty.</exception>
        /// <exception cref="UriFormatException"><c>pageToCrawl.Uri</c> is not a valid absolute URI.</exception>
        public async Task DoCrawling(WaitingPage pageToCrawl, IUriBucket <WaitingPage> uriBucket)
        {
            Logger.LogInformation("Start crawling " + pageToCrawl.Uri);

            // 1. Resolve the downloader and build the request parts from the queued page.
            var downloader = Services.GetService <IDownloader>();
            var method     = new HttpMethod(pageToCrawl.Verb);
            var requestUri = new Uri(pageToCrawl.Uri);

            // Only http/https pages are crawled; anything else is skipped silently.
            // Uri.Scheme is already normalized to lower case, so no ToLower() is required.
            if (requestUri.Scheme != Uri.UriSchemeHttp && requestUri.Scheme != Uri.UriSchemeHttps)
            {
                return;
            }

            // 2. Download the page (IDownloader). The request message is disposable, so release it
            //    when this method finishes.
            using var request = new HttpRequestMessage(method, pageToCrawl.Uri);
            var downloadedContent = await downloader.GetPage(request);

            pageToCrawl.DownloadedTime = DateTime.UtcNow.ToString(Tools.StrDateTimeFormat);

            // 3. Parse the downloaded content with the HTML parser of the browsing context.
            var browsingContext = Services.GetService <IBrowsingContext>();
            var parser          = browsingContext.GetService <IHtmlParser>();
            var document        = parser.ParseDocument(downloadedContent);

            // 4. Hand the parsed data on:
            //    page links   -> uriBucket
            //    page content -> GetPageContentToDB
            GetAllLinksToDB(uriBucket, document);

            GetPageContentToDB(uriBucket, document, requestUri);
        }
示例#3
0
 /// <summary>
 /// Resolves an <see cref="IPageProcessor"/> from <c>Services</c>, lets it process the parsed
 /// page, and disposes the processor afterwards.
 /// </summary>
 /// <param name="uriBucket">Not used by this method.</param>
 /// <param name="document">Parsed HTML document passed to the processor.</param>
 /// <param name="requestUri">Address the document was requested from; passed to the processor.</param>
 private void GetPageContentToDB(IUriBucket <WaitingPage> uriBucket, IHtmlDocument document, Uri requestUri)
 {
     using (var processor = Services.GetService <IPageProcessor>())
     {
         processor.ProcessPageContent(document, requestUri);
     }
 }