/// <summary>
/// Builds a successful download-package model for the given anchor: the package name and
/// two-letter language are derived by the crawler from the anchor/response pair.
/// </summary>
/// <param name="crawler">Crawler that supplies naming and language-detection logic.</param>
/// <param name="httpGet">HTTP response the anchor was discovered on.</param>
/// <param name="anchor">Anchor node pointing at the package.</param>
/// <returns>A success-state <see cref="CrawlerDownloadPackageModel"/>.</returns>
// NOTE(review): "Packge" spelling kept intentionally — it matches the sibling
// CreatePackgeDocument helper and existing callers.
private static CrawlerDownloadPackageModel CreatePackge(BaseHttpCrawler crawler, HttpModel httpGet, HtmlNode anchor)
{
    var packageName = crawler.CreatePackageName(anchor, httpGet);
    var language = crawler.GetTwoLetterLanguage(httpGet, anchor);

    return CrawlerDownloadPackageModel.CreateSuccess(packageName, httpGet.Encoding, anchor.Href(), language);
}
/// <summary>
/// Materializes all documents of <paramref name="package"/> into a <see cref="DocumentGroupModel"/>,
/// downloading any document that was not fetched during crawling. On the first failed document the
/// whole package is abandoned (logged, not persisted). The package is marked processed and its
/// resources freed regardless of success, so callers do not retry it.
/// </summary>
/// <param name="crawler">Crawler whose HTTP client is used for deferred downloads.</param>
/// <param name="package">Package whose documents are assembled.</param>
/// <param name="crawlerId">Identifier of the owning crawler, stored on the group.</param>
private async Task AddOrUpdatePackageToDatabase(BaseHttpCrawler crawler, CrawlerDownloadPackageModel package, int crawlerId)
{
    var documentGroupModel = new DocumentGroupModel
    {
        Name = package.Name.Value,
        TwoLetterLanguage = package.TwoLetterLanguage,
        CrawlerId = crawlerId,
    };

    var failed = false;
    foreach (var downloadModel in package.Documents)
    {
        try
        {
            var httpGet = downloadModel.HttpGet;

            // Fetch the raw content only when it was not already downloaded during crawling.
            // (Was "IsDownloaded == failed" — comparing against a mutable flag that merely
            // happened to be false; made the intent explicit.)
            if (!downloadModel.IsDownloaded)
            {
                httpGet = await crawler.Http.GetAsync(downloadModel.Url);
            }

            documentGroupModel.Documents.Add(new DocumentModel
            {
                Format = httpGet.MimeType,
                Raw = httpGet.Raw,
                Name = downloadModel.Name.Value,
                Url = downloadModel.Url,
            });
        }
        catch (Exception e)
        {
            this.logger.Error(message: $"package name: {package.Name}; url: {downloadModel.Url}", exception: e);
            failed = true;
            break; // one failed document invalidates the whole package
        }
    }

    // Mark processed even on failure so the caller never re-attaches documents to this package.
    package.IsProcessed = true;
    package.FreeResource();
    this.logger.Info($"Processed package: {package.Url}");

    if (!failed)
    {
        // NOTE(review): persistence is currently disabled; re-enable once documentGroupManager
        // is wired up, or remove this dead block entirely.
        //var isNewOrUpdated = this.documentGroupManager.AddOrUpdateDocumentGroup(documentGroupModel);
        //if (isNewOrUpdated)
        //{
        //    this.logger.Info(message: $"New or updated: {documentGroupModel.Name}; {documentGroupModel.Identifier}");
        //}
    }
}
/// <summary>
/// Recursively walks the anchors found on <paramref name="parentHttpGet"/>: opens new packages,
/// attaches package documents, and descends into anchors the crawler wants examined. URLs are
/// deduplicated through <c>processedUrls</c> to guard against infinite recursion. A failure while
/// handling an anchor converts the current package into a failed one so its remaining documents
/// are skipped.
/// </summary>
/// <param name="crawler">Crawler providing classification (IsPackage/IsPackageDocument/ShouldExamine) and HTTP access.</param>
/// <param name="parentHttpGet">Response whose anchors are enumerated.</param>
/// <param name="parentAnchor">Anchor that led to <paramref name="parentHttpGet"/>.</param>
/// <param name="crawlerId">Identifier of the owning crawler, propagated to persistence.</param>
/// <returns>All packages discovered in this subtree (successful and failed).</returns>
// NOTE(review): "Recursivly" spelling kept for compatibility with existing callers.
private async Task<IReadOnlyCollection<CrawlerDownloadPackageModel>> CrawlAsyncRecursivly(BaseHttpCrawler crawler, HttpModel parentHttpGet, HtmlNode parentAnchor, int crawlerId)
{
    var packages = new List<CrawlerDownloadPackageModel>();
    var package = (CrawlerDownloadPackageModel)null;
    var anchors = GetAnchors(parentHttpGet, crawler);

    foreach (var anchor in anchors)
    {
        var httpGet = (HttpModel)null;

        // Skip urls that were already processed (protection against infinite recursive calls).
        // HashSet.Add returns false when the url was seen before.
        if (!this.processedUrls.Add(anchor.Href()))
        {
            continue;
        }

        try
        {
            if (crawler.IsPackage(anchor))
            {
                // Process the current package before starting a new one.
                if (package?.Success == true && package?.IsProcessed == false)
                {
                    await this.AddOrUpdatePackageToDatabase(crawler, package, crawlerId);
                }

                (package, httpGet) = await this.CreatePackage(crawler, anchor, parentHttpGet, parentAnchor);
                packages.Add(package);
            }
            // If the package has failed we skip processing documents for it.
            else if (crawler.IsPackageDocument(anchor) && package?.Success == true)
            {
                // An already-processed package must not receive additional documents.
                // (Fixed: the original interpolated string was broken across a physical line break.)
                if (package.IsProcessed)
                {
                    throw new InvalidOperationException($"Not allowed to add documents to already processed package. Name: {package.Name}; Url: {anchor.Href()}");
                }

                httpGet = await crawler.Http.GetAsync(anchor.Href());
                package.Documents.Add(CreatePackgeDocument(crawler, httpGet, anchor));
                this.logger.Info($"Downloaded document package: {anchor.Href()}");
            }

            if (crawler.ShouldExamine(anchor))
            {
                if (httpGet == null)
                {
                    httpGet = await crawler.Http.GetAsync(anchor.Href());
                }

                this.logger.Debug($"Going recursivly for: {anchor.Href()}");
                packages.AddRange(await this.CrawlAsyncRecursivly(crawler, httpGet, anchor, crawlerId));

                // Process any packages gathered so far.
                // NOTE(review): this passes the whole list — presumably an overload of
                // AddOrUpdatePackageToDatabase accepting a collection exists elsewhere in the
                // file; confirm, since the single-package overload takes one model.
                await this.AddOrUpdatePackageToDatabase(crawler, packages, crawlerId);
            }
            // Anchor was neither downloaded nor examined.
            else if (httpGet == null)
            {
                if (crawler.IsPackageDocument(anchor) && package?.Success == false)
                {
                    this.logger.Debug($"Package failed -> skip downloading document: {anchor.Href()}");
                }
                else
                {
                    this.logger.Debug($"Skip downloading: {anchor.Href()}");
                }
            }
        }
        catch (Exception e)
        {
            // Replace the current package with a failed marker so subsequent documents
            // for it are skipped, and record the failure in the result set.
            package = CrawlerDownloadPackageModel.CreateFailed(e.ToString());
            packages.Add(package);
            this.logger.Error(message: $"{anchor.Href()}", e);
        }
    }

    return packages;
}