/// <summary>
/// Extracts all absolute webtoons.com anchor links from the parsed document and
/// queues each one in the URI bucket as a page waiting to be crawled.
/// </summary>
/// <param name="uriBucket">Scheduler bucket that collects pages still to be crawled.</param>
/// <param name="document">Parsed HTML document to scan for anchor elements.</param>
private static void GetAllLinksToDB(IUriBucket<WaitingPage> uriBucket, IHtmlDocument document)
{
    var links = document
        .Links
        .OfType<IHtmlAnchorElement>()
        // Ordinal matches the original default StartsWith semantics but makes the
        // comparison explicit (CA1310).
        .Where(e => e.Href.StartsWith("https://www.webtoons.com", StringComparison.Ordinal))
        .Select(e => new Uri(e.Href));

    // Only enqueue http/https links. Compare schemes case-insensitively against the
    // framework scheme constants instead of allocating lowercase copies (CA1862).
    foreach (var link in links.Where(item =>
                 string.Equals(item.Scheme, Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase)
                 || string.Equals(item.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase)))
    {
        uriBucket.Add(new WaitingPage { Uri = link.ToString() });
    }
}
/// <summary>
/// Crawls a single page: downloads its content, parses the HTML, queues any
/// discovered webtoons links for later crawling, and stores the page content.
/// </summary>
/// <param name="pageToCrawl">Page record holding the URI and HTTP verb to use; its
/// <c>DownloadedTime</c> is stamped (UTC) once the download succeeds.</param>
/// <param name="uriBucket">Scheduler bucket that receives newly discovered links.</param>
public async Task DoCrawling(WaitingPage pageToCrawl, IUriBucket<WaitingPage> uriBucket)
{
    // Structured-logging message template instead of string concatenation, so the
    // URI is captured as a named property by the logging provider.
    Logger.LogInformation("Start crawling {Uri}", pageToCrawl.Uri);

    // Only http/https pages are crawlable. Guard first, before resolving any
    // services, and compare schemes case-insensitively without ToLower (CA1862).
    var requestUri = new Uri(pageToCrawl.Uri);
    if (!string.Equals(requestUri.Scheme, Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase)
        && !string.Equals(requestUri.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    // 1) Download the page using the verb recorded on the waiting-page entry.
    var downloader = Services.GetService<IDownloader>();
    var method = new HttpMethod(pageToCrawl.Verb);
    var downloadedContent = await downloader.GetPage(new HttpRequestMessage(method, pageToCrawl.Uri));
    pageToCrawl.DownloadedTime = DateTime.UtcNow.ToString(Tools.StrDateTimeFormat);

    // 2) Parse the downloaded HTML with the AngleSharp parser from the DI container.
    var browsingContext = Services.GetService<IBrowsingContext>();
    var parser = browsingContext.GetService<IHtmlParser>();
    var document = parser.ParseDocument(downloadedContent);

    // 3) Fan out: discovered links -> scheduler bucket; page content -> storage.
    GetAllLinksToDB(uriBucket, document);
    GetPageContentToDB(uriBucket, document, requestUri);
}
/// <summary>
/// Hands the parsed document, together with the URI it was fetched from, to the
/// page processor for storage.
/// </summary>
/// <param name="uriBucket">Scheduler bucket (currently unused by this step).</param>
/// <param name="document">Parsed HTML document to process.</param>
/// <param name="requestUri">URI the document was downloaded from.</param>
private void GetPageContentToDB(IUriBucket<WaitingPage> uriBucket, IHtmlDocument document, Uri requestUri)
{
    // Dispose the processor as soon as the hand-off completes.
    using (var processor = Services.GetService<IPageProcessor>())
    {
        processor.ProcessPageContent(document, requestUri);
    }
}