示例#1
0
        /// <summary>
        /// Crawls post URLs page by page, starting at the page addressed by
        /// <paramref name="urlDictionary"/>, and keeps going until <c>IsStopCrawling</c>
        /// returns <c>true</c> for the page that was just read.
        /// </summary>
        /// <param name="urlDictionary">
        /// Values handed to <c>GetEndpoint</c> to build the address of the first page. For every
        /// following page the dictionary is replaced by the result of <c>GetNextPageUrlDictionary</c>.
        /// </param>
        /// <param name="cancellationToken">
        /// Checked before each page is requested and forwarded to the page load.
        /// </param>
        /// <returns>
        /// The post URLs of every visited page, in visiting order. Each URL is trimmed of
        /// surrounding whitespace and of leading/trailing '/' and converted to lower case.
        /// </returns>
        /// <exception cref="OperationCanceledException">
        /// <paramref name="cancellationToken"/> was cancelled before a page was requested.
        /// </exception>
        protected async Task <List <string> > BasicCrawlPostUrlAsync(Dictionary <string, string> urlDictionary, CancellationToken cancellationToken = default)
        {
            var allPostUrls = new List <string>();

            // Iterates instead of recursing: a recursive call per page kept every page's
            // browsing context alive until the very last page had been processed.
            while (true)
            {
                cancellationToken.ThrowIfCancellationRequested();

                // Declared inside the loop body so the context is disposed at the end of
                // each iteration, before the next page is opened.
                using var browsingContext = GoblinCrawlerHelper.GetIBrowsingContext();

                var endpoint = GetEndpoint(urlDictionary);

                var htmlDocument = await browsingContext.OpenAsync(endpoint, cancellation : cancellationToken).ConfigureAwait(true);

                var pagePostUrls = await GetPostUrlsAsync(browsingContext, htmlDocument).ConfigureAwait(true);

                pagePostUrls = pagePostUrls.Select(x => x.Trim().Trim('/').ToLowerInvariant()).ToList();

                // The stop decision is made on the current page's URLs only, and before they
                // are merged into the accumulated result.
                var isStopCrawling = IsStopCrawling(pagePostUrls);

                allPostUrls.AddRange(pagePostUrls);

                if (isStopCrawling)
                {
                    return(allPostUrls);
                }

                urlDictionary = GetNextPageUrlDictionary();
            }
        }
示例#2
0
        /// <summary>
        /// Runs one crawl pass: obtains post URLs from <c>CrawlPostUrlAsync</c>, keeps the leading
        /// ones that differ from the source's <c>LastCrawledPostUrl</c>, loads their metadata,
        /// saves the posts and updates the source's crawl statistics, then commits the
        /// transaction begun on <c>GoblinUnitOfWork</c>.
        /// </summary>
        /// <param name="cancellationToken">
        /// Forwarded to the source lookup, the URL crawl, the transaction start and the final save.
        /// </param>
        /// <returns>A task that completes once the transaction has been committed.</returns>
        public virtual async Task CrawlPostsAsync(CancellationToken cancellationToken = default)
        {
            // Captured first so the recorded start time covers the whole pass.
            var crawlStartedAt = GoblinDateTimeHelper.SystemTimeNow;

            var source = await GetSourceEntity(cancellationToken);

            var allCrawledUrls = await CrawlPostUrlAsync(cancellationToken).ConfigureAwait(true);

            // Stop at the first URL equal to the one remembered from the previous pass;
            // everything before it is treated as new.
            // NOTE(review): presumably both sides use the same URL normalization — confirm.
            var newPostUrls = allCrawledUrls
                              .TakeWhile(candidate => candidate != source.LastCrawledPostUrl)
                              .ToList();

            var metadataModels = await GoblinCrawlerHelper.GetListMetadataModelsAsync(newPostUrls).ConfigureAwait(true);

            using var transaction = await GoblinUnitOfWork.BeginTransactionAsync(cancellationToken).ConfigureAwait(true);

            // Store the crawled posts.
            await GoblinCrawlerHelper.SavePostEntitiesAsync(Domain, metadataModels, crawlStartedAt, _postRepo, GoblinUnitOfWork).ConfigureAwait(true);

            // Refresh the source's bookkeeping for this pass.
            var crawledCount = metadataModels.Count;

            source.LastCrawlStartTime       = crawlStartedAt;
            source.LastCrawlEndTime         = GoblinDateTimeHelper.SystemTimeNow;
            source.TimeSpent                = source.LastCrawlEndTime.Subtract(source.LastCrawlStartTime);
            source.TotalPostCrawledLastTime = crawledCount;
            source.TotalPostCrawled        += crawledCount;

            // Only move the marker when the first metadata entry actually carries a URL;
            // otherwise the previously stored value is kept.
            var newestPostUrl = metadataModels.FirstOrDefault()?.OriginalUrl;

            if (!string.IsNullOrWhiteSpace(newestPostUrl))
            {
                source.LastCrawledPostUrl = newestPostUrl;
            }

            _sourceRepo.Update(source,
                               s => s.LastCrawlStartTime,
                               s => s.LastCrawlEndTime,
                               s => s.TimeSpent,
                               s => s.TotalPostCrawledLastTime,
                               s => s.TotalPostCrawled,
                               s => s.LastCrawledPostUrl
                               );

            await GoblinUnitOfWork.SaveChangesAsync(cancellationToken).ConfigureAwait(true);

            transaction.Commit();
        }