private static CrawlerDownloadPackageModel CreatePackge(BaseHttpCrawler crawler, HttpModel httpGet, HtmlNode anchor)
{
    var name = crawler.CreatePackageName(anchor, httpGet);
    var twoLetterLang = crawler.GetTwoLetterLanguage(httpGet, anchor);

    return CrawlerDownloadPackageModel.CreateSuccess(name, httpGet.Encoding, anchor.Href(), twoLetterLang);
}
private static IReadOnlyCollection<HtmlNode> GetAnchors(HttpModel httpGet, BaseHttpCrawler crawler)
{
    var html = httpGet.GetAsHtml();
    var anchors = html.DocumentNode.SelectNodes(".//a").ToList();
    var resultAnchors = anchors.ToList();

    foreach (var anchor in anchors)
    {
        var href = anchor.Href();
        var baseUri = new Uri(httpGet.Url);
        var uri = new Uri(baseUri, href);

        if (String.IsNullOrEmpty(uri.Fragment) == false && uri.Fragment.StartsWith("#"))
        {
            // hrefs with fragments (internal anchors) are unified because the fragment does not
            // represent a new url - it only points to a location within the same document
            // example: www.google.com#main -> www.google.com
            var (urlWithoutFragment, _) = uri.AbsoluteUri.TupleSplit("#");
            uri = new Uri(urlWithoutFragment);
        }

        anchor.SetAttributeValue("href", uri.AbsoluteUri.HtmlDecode());

        var additionalUrls = crawler.GenerateAdditionalUrls(anchor, null);
        resultAnchors.AddRange(additionalUrls.Select(x => HtmlNode.CreateNode($"<a href='{x}'></a>")));
    }

    // also add the caller url itself as an anchor
    var callerAnchor = html.CreateElement("a");
    callerAnchor.SetAttributeValue("href", httpGet.Url);
    resultAnchors.Add(callerAnchor);

    return resultAnchors;
}
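// GetAnchors relies on a few small helpers (Href, TupleSplit, HtmlDecode) that are not part of
// HtmlAgilityPack or the BCL. The nested class below is only a sketch of the behaviour they are
// assumed to have, inferred from how they are called above; the real extension methods live
// elsewhere in the project and may differ.
private static class AnchorHelpersSketch
{
    // Href(anchor): the raw href attribute of an <a> node, empty string when missing.
    public static String Href(HtmlNode node) =>
        node.GetAttributeValue("href", String.Empty);

    // TupleSplit(value, "#"): splits on the first occurrence of the separator into a (left, right) tuple.
    public static (String left, String right) TupleSplit(String value, String separator)
    {
        var index = value.IndexOf(separator, StringComparison.Ordinal);
        return index < 0
            ? (value, String.Empty)
            : (value.Substring(0, index), value.Substring(index + separator.Length));
    }

    // HtmlDecode(value): decodes html entities (e.g. &amp; -> &) so the stored href is a plain url.
    public static String HtmlDecode(String value) =>
        System.Net.WebUtility.HtmlDecode(value);
}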
private async Task AddOrUpdatePackageToDatabase(BaseHttpCrawler crawler, IReadOnlyCollection<CrawlerDownloadPackageModel> packages, int crawlerId)
{
    foreach (var package in packages.Where(x => x.IsProcessed == false && x.Success == true))
    {
        await this.AddOrUpdatePackageToDatabase(crawler, package, crawlerId);
    }
}
private async Task AddOrUpdatePackageToDatabase(BaseHttpCrawler crawler, CrawlerDownloadPackageModel package, int crawlerId)
{
    var documentGroupModel = new DocumentGroupModel
    {
        Name = package.Name.Value,
        TwoLetterLanguage = package.TwoLetterLanguage,
        CrawlerId = crawlerId
    };

    var failed = false;

    foreach (var downloadModel in package.Documents)
    {
        try
        {
            var httpGet = downloadModel.HttpGet;

            // download the document only if it was not already downloaded during crawling
            if (downloadModel.IsDownloaded == false)
            {
                httpGet = await crawler.Http.GetAsync(downloadModel.Url);
            }

            documentGroupModel.Documents.Add(new DocumentModel
            {
                Format = httpGet.MimeType,
                Raw = httpGet.Raw,
                Name = downloadModel.Name.Value,
                Url = downloadModel.Url,
            });
        }
        catch (Exception e)
        {
            this.logger.Error(message: $"package name: {package.Name}; url: {downloadModel.Url}", exception: e);
            failed = true;
            break;
        }
    }

    package.IsProcessed = true;
    package.FreeResource();
    this.logger.Info($"Processed package: {package.Url}");

    if (failed == false)
    {
        //var isNewOrUpdated = this.documentGroupManager.AddOrUpdateDocumentGroup(documentGroupModel);
        //if (isNewOrUpdated)
        //{
        //    this.logger.Info(message: $"New or updated: {documentGroupModel.Name}; {documentGroupModel.Identifier}");
        //}
    }
}
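// For reference, the DocumentGroupModel / DocumentModel instances built above are assumed to be plain
// persistence DTOs roughly shaped like the following. The property types are inferred from their usage
// in this method (and the Raw payload type in particular is an assumption); the real models may differ:
//
//     class DocumentGroupModel { String Name; String TwoLetterLanguage; int CrawlerId; List<DocumentModel> Documents; }
//     class DocumentModel      { String Format; byte[] Raw; String Name; String Url; }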
/// <summary>
/// Starts the crawling, creating and persisting of the packages to the database
/// </summary>
/// <param name="crawler">Crawler to start</param>
/// <returns>A task that completes when all initial pages of the crawler have been crawled</returns>
public async Task StartAsync(BaseHttpCrawler crawler)
{
    // todo: protect against recursive calls link1 -> link2 -> link1
    var crawlerName = crawler.GetType().FullName;
    var crawlerId = this.documentGroupManager.GetOrCreateCrawlerId(crawlerName);

    this.logger.Info($"Start {crawlerName} - {DateTime.UtcNow}");

    foreach (var initialPageUrl in crawler.InitialPageUrls)
    {
        await this.CrawlAsyncRecursivly(crawler, initialPageUrl, crawlerId);
    }

    this.logger.Info($"End {crawlerName} - {DateTime.UtcNow}");
}
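// A minimal convenience sketch (not part of the original class) showing how StartAsync is typically
// driven: each registered crawler is started in sequence. The crawlers parameter and this helper
// are illustrative only; the real hosting/scheduling code lives outside this class.
public async Task StartAllAsync(IEnumerable<BaseHttpCrawler> crawlers)
{
    foreach (var crawler in crawlers)
    {
        await this.StartAsync(crawler);
    }
}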
private async Task<IReadOnlyCollection<CrawlerDownloadPackageModel>> CrawlAsyncRecursivly(BaseHttpCrawler crawler, String url, int crawlerId)
{
    var result = new List<CrawlerDownloadPackageModel>();

    try
    {
        var httpGet = await crawler.Http.GetAsync(url);
        var packageInfo = await this.CrawlAsyncRecursivly(crawler, httpGet, null, crawlerId);

        // with the current implementation of CrawlAsyncRecursivly the last package could be skipped and
        // not pushed to the database, depending on the recursion level at which the package was downloaded,
        // so we must explicitly check for it and process it here
        var notProcessed = packageInfo.Where(x => x.Success && x.IsProcessed == false).SingleOrDefault();
        if (notProcessed != null)
        {
            await this.AddOrUpdatePackageToDatabase(crawler, notProcessed, crawlerId);
        }

        result.AddRange(packageInfo);

        // todo: log or send a notification via email (or similar) with the package info -
        // probably how many have failed / succeeded, etc.
    }
    catch (Exception e)
    {
        this.logger.Error(message: url, exception: e);
    }

    return result;
}
private static CrawlerDownloadDocumentModel CreatePackgeDocument(BaseHttpCrawler crawler, HttpModel httpGet, HtmlNode anchor)
{
    var name = crawler.CreatePackageDocumentName(anchor, httpGet);

    return new CrawlerDownloadDocumentModel(name, anchor.Href(), httpGet);
}
private async Task<(CrawlerDownloadPackageModel package, HttpModel httpGet)> CreatePackage(BaseHttpCrawler crawler, HtmlNode anchor, HttpModel parentHttpGet, HtmlNode parentAnchor)
{
    var httpGet = await crawler.Http.GetAsync(anchor.Href());
    var package = CreatePackge(crawler, httpGet, anchor);

    if (crawler.ShouldIncludePackgeAsDocument(anchor))
    {
        // the package page itself is also treated as a package document
        // (if this needs to be changed, just introduce a config option to control the behaviour)
        package.Documents.Add(CreatePackgeDocument(crawler, httpGet, anchor));
    }

    this.logger.Info($"Downloaded package: {package.Url}");

    if (crawler.ShouldIncludeParentPageAsDocument(parentAnchor, anchor))
    {
        package.Documents.Add(CreatePackgeDocument(crawler, parentHttpGet, parentAnchor));
    }

    return (package, httpGet);
}
private async Task<IReadOnlyCollection<CrawlerDownloadPackageModel>> CrawlAsyncRecursivly(BaseHttpCrawler crawler, HttpModel parentHttpGet, HtmlNode parentAnchor, int crawlerId)
{
    var packages = new List<CrawlerDownloadPackageModel>();
    var package = (CrawlerDownloadPackageModel)null;
    var anchors = GetAnchors(parentHttpGet, crawler);

    foreach (var anchor in anchors)
    {
        var httpGet = (HttpModel)null;

        // process the url only if it has not been processed already (protection against infinite recursion)
        if (this.processedUrls.Add(anchor.Href()))
        {
            try
            {
                if (crawler.IsPackage(anchor))
                {
                    // process the current package before creating a new one
                    if (package?.Success == true && package?.IsProcessed == false)
                    {
                        await this.AddOrUpdatePackageToDatabase(crawler, package, crawlerId);
                    }

                    (package, httpGet) = await this.CreatePackage(crawler, anchor, parentHttpGet, parentAnchor);
                    packages.Add(package);
                }
                // if the package has failed we should skip processing documents for it
                else if (crawler.IsPackageDocument(anchor) && package?.Success == true)
                {
                    // if the package is already processed we should not attach additional documents to it
                    if (package.IsProcessed)
                    {
                        throw new InvalidOperationException($"Not allowed to add documents to already processed package. Name: {package.Name}; Url: {anchor.Href()}");
                    }

                    httpGet = await crawler.Http.GetAsync(anchor.Href());
                    package.Documents.Add(CreatePackgeDocument(crawler, httpGet, anchor));
                    this.logger.Info($"Downloaded package document: {anchor.Href()}");
                }

                if (crawler.ShouldExamine(anchor))
                {
                    if (httpGet == null)
                    {
                        httpGet = await crawler.Http.GetAsync(anchor.Href());
                    }

                    this.logger.Debug($"Going recursively for: {anchor.Href()}");
                    packages.AddRange(await this.CrawlAsyncRecursivly(crawler, httpGet, anchor, crawlerId));

                    // process any packages that are ready
                    await this.AddOrUpdatePackageToDatabase(crawler, packages, crawlerId);
                }
                // nothing was downloaded for this anchor by any of the branches above
                else if (httpGet == null)
                {
                    if (crawler.IsPackageDocument(anchor) && package?.Success == false)
                    {
                        this.logger.Debug($"Package failed -> skip downloading document: {anchor.Href()}");
                    }
                    else
                    {
                        this.logger.Debug($"Skip downloading: {anchor.Href()}");
                    }
                }
            }
            catch (Exception e)
            {
                package = CrawlerDownloadPackageModel.CreateFailed(e.ToString());
                packages.Add(package);
                this.logger.Error(message: $"{anchor.Href()}", exception: e);
            }
        }
    }

    return packages;
}
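// The dedup guard above works because HashSet<T>.Add returns false when the item is already present,
// so an anchor whose url has been seen before is silently skipped - this is what prevents re-crawling
// the same link and, together with the processedUrls set being shared across recursion levels, bounds
// the recursion. A likely field declaration (assumed, not taken from the original source):
//
//     private readonly HashSet<String> processedUrls = new HashSet<String>();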