コード例 #1
0
ファイル: CrawlScheduler.cs プロジェクト: vpetroff/NetCrawler
        /// <summary>
        /// Invokes the <c>PageCrawled</c> delegate with the given crawl result,
        /// doing nothing when no delegate is currently attached.
        /// </summary>
        /// <param name="result">The crawl result passed to the delegate.</param>
        private void RaisePageCrawled(PageCrawlResult result)
        {
            // Take a local copy so the null check and the invocation
            // both operate on the same delegate instance.
            var subscribers = PageCrawled;
            if (subscribers == null)
            {
                return;
            }

            subscribers.Invoke(result);
        }
コード例 #2
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and packages the outcome
        /// into a <see cref="PageCrawlResult"/>.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>
        /// A result carrying the download status code and the time the crawl ended.
        /// Contents and extracted links are filled in only when the download
        /// reports success.
        /// </returns>
        public PageCrawlResult Crawl(Uri url)
        {
            // The result object is created before the download starts, as in the
            // original flow.
            var result = new PageCrawlResult();
            var response = pageDownloader.Download(url);

            result.StatusCode = response.StatusCode;

            if (response.IsSuccessful)
            {
                // Links are extracted from the downloaded contents, with the
                // page's own URL passed to the parser alongside them.
                result.Contents = response.Contents;
                result.Links = htmlParser.ExtractLinks(url, response.Contents);
            }

            // Stamp the end time regardless of whether the download succeeded.
            result.CrawlEndedAt = DateTimeOffset.Now;
            return result;
        }
コード例 #3
0
        /// <summary>
        /// Stores the outcome of a crawl as a <c>Page</c> document. The page is
        /// looked up by the crawl URL's hash; when no match is found a new
        /// <c>Page</c> is created. Its fields are then overwritten from
        /// <paramref name="pageCrawlResult"/> and the session changes are saved.
        /// </summary>
        /// <param name="pageCrawlResult">The crawl result to persist.</param>
        public void Save(PageCrawlResult pageCrawlResult)
        {
            using (var session = documentStore.OpenSession())
            {
                // Look for a previously stored page with the same hash.
                var query = session.Advanced.LuceneQuery<Page, PagesToCrawlByUrl>();
                var hashClause = string.Format("Hash:\"{0}\"", pageCrawlResult.CrawlUrl.Hash);
                var page = query.Where(hashClause).FirstOrDefault();

                if (page == null)
                {
                    page = new Page();
                }

                // Identity fields come from the crawl URL.
                var crawlUrl = pageCrawlResult.CrawlUrl;
                page.WebsiteUrl = crawlUrl.WebsiteDefinition.Website.RootUrl;
                page.Url = crawlUrl.Url;
                page.Hash = crawlUrl.Hash;

                // Payload fields come from the crawl result itself.
                page.Contents = pageCrawlResult.Contents;
                page.StatusCode = pageCrawlResult.StatusCode;
                page.CrawledAt = pageCrawlResult.CrawlEndedAt;

                session.Store(page);
                session.SaveChanges();
            }
        }
コード例 #4
0
        /// <summary>
        /// Fetches <paramref name="url"/> through the page downloader and builds a
        /// <see cref="PageCrawlResult"/> describing what happened.
        /// </summary>
        /// <param name="url">The page address to crawl.</param>
        /// <returns>
        /// The crawl result: always carries the status code and end timestamp;
        /// carries contents and links only for a successful download.
        /// </returns>
        public PageCrawlResult Crawl(Uri url)
        {
            // Create the result first, then perform the download.
            var outcome = new PageCrawlResult();
            var download = pageDownloader.Download(url);

            outcome.StatusCode = download.StatusCode;

            if (download.IsSuccessful)
            {
                outcome.Contents = download.Contents;
                // The parser receives the page URL together with the contents.
                outcome.Links = htmlParser.ExtractLinks(url, download.Contents);
            }

            // Recorded on both the success and the failure path.
            outcome.CrawlEndedAt = DateTimeOffset.Now;

            return outcome;
        }
コード例 #5
0
ファイル: CrawlScheduler.cs プロジェクト: vpetroff/NetCrawler
 /// <summary>
 /// Passes <paramref name="result"/> to the <c>PageCrawled</c> delegate when
 /// one is attached; otherwise does nothing.
 /// </summary>
 /// <param name="result">The crawl result handed to the delegate.</param>
 private void RaisePageCrawled(PageCrawlResult result)
 {
     // Copy to a local so the null check and the call use the same delegate.
     var callback = PageCrawled;
     if (callback == null)
     {
         return;
     }

     callback.Invoke(result);
 }