/// <summary>
/// Collects post URLs page by page, starting from the endpoint built from
/// <paramref name="urlDictionary"/> and following the dictionaries returned by
/// <c>GetNextPageUrlDictionary</c> until <c>IsStopCrawling</c> reports that a
/// page is the last one to process.
/// </summary>
/// <param name="urlDictionary">Values passed to <c>GetEndpoint</c> to build the first page's endpoint.</param>
/// <param name="cancellationToken">Token forwarded to the page load of every page.</param>
/// <returns>
/// The post URLs of all visited pages in visiting order. Each URL has surrounding
/// whitespace and leading/trailing '/' removed and is lower-cased (invariant culture).
/// </returns>
protected async Task<List<string>> BasicCrawlPostUrlAsync(Dictionary<string, string> urlDictionary, CancellationToken cancellationToken = default)
{
    var allPostUrls = new List<string>();

    // Iterate instead of recursing: a recursive call per page kept every page's
    // browsing context open until the deepest page had finished.
    while (true)
    {
        // Scoped to this iteration, so it is disposed before the next page is opened.
        using var browsingContext = GoblinCrawlerHelper.GetIBrowsingContext();

        var endpoint = GetEndpoint(urlDictionary);

        var htmlDocument = await browsingContext.OpenAsync(endpoint, cancellation: cancellationToken).ConfigureAwait(true);

        var rawPostUrls = await GetPostUrlsAsync(browsingContext, htmlDocument).ConfigureAwait(true);

        // Normalize so URLs can be compared by plain string equality later on.
        var pagePostUrls = rawPostUrls.Select(x => x.Trim().Trim('/').ToLowerInvariant()).ToList();

        allPostUrls.AddRange(pagePostUrls);

        // The stop decision is made on the current page's URLs only.
        if (IsStopCrawling(pagePostUrls))
        {
            return allPostUrls;
        }

        urlDictionary = GetNextPageUrlDictionary();
    }
}
/// <summary>
/// Runs one crawl pass: collects post URLs, keeps those that appear before the
/// source's <c>LastCrawledPostUrl</c>, fetches their metadata, stores the posts,
/// and updates the source's crawl statistics inside a single transaction.
/// </summary>
/// <param name="cancellationToken">
/// Token forwarded to the source lookup, the URL crawl, the transaction start and the final save.
/// </param>
/// <returns>A task that completes once the transaction has been committed.</returns>
public virtual async Task CrawlPostsAsync(CancellationToken cancellationToken = default)
{
    var startTime = GoblinDateTimeHelper.SystemTimeNow;

    var sourceEntity = await GetSourceEntity(cancellationToken);

    var allCrawledUrls = await CrawlPostUrlAsync(cancellationToken).ConfigureAwait(true);

    // Keep only the URLs listed before the one recorded by the previous crawl.
    // NOTE(review): relies on exact string equality with LastCrawledPostUrl; confirm both sides use the same URL form.
    var newPostUrls = allCrawledUrls
        .TakeWhile(url => url != sourceEntity.LastCrawledPostUrl)
        .ToList();

    var postsMetadata = await GoblinCrawlerHelper.GetListMetadataModelsAsync(newPostUrls).ConfigureAwait(true);

    using var transaction = await GoblinUnitOfWork.BeginTransactionAsync(cancellationToken).ConfigureAwait(true);

    // Posts Metadata to Post Crawled Database
    await GoblinCrawlerHelper
        .SavePostEntitiesAsync(Domain, postsMetadata, startTime, _postRepo, GoblinUnitOfWork)
        .ConfigureAwait(true);

    // Update Source
    sourceEntity.LastCrawlStartTime = startTime;
    sourceEntity.LastCrawlEndTime = GoblinDateTimeHelper.SystemTimeNow;
    sourceEntity.TimeSpent = sourceEntity.LastCrawlEndTime.Subtract(sourceEntity.LastCrawlStartTime);
    sourceEntity.TotalPostCrawledLastTime = postsMetadata.Count;
    sourceEntity.TotalPostCrawled += postsMetadata.Count;

    // Move the marker only when the first metadata entry carries a usable URL;
    // otherwise the previous marker is kept.
    var firstOriginalUrl = postsMetadata.FirstOrDefault()?.OriginalUrl;
    if (!string.IsNullOrWhiteSpace(firstOriginalUrl))
    {
        sourceEntity.LastCrawledPostUrl = firstOriginalUrl;
    }

    _sourceRepo.Update(
        sourceEntity,
        x => x.LastCrawlStartTime,
        x => x.LastCrawlEndTime,
        x => x.TimeSpent,
        x => x.TotalPostCrawledLastTime,
        x => x.TotalPostCrawled,
        x => x.LastCrawledPostUrl);

    await GoblinUnitOfWork.SaveChangesAsync(cancellationToken).ConfigureAwait(true);

    transaction.Commit();
}