Example #1
0
        private CrawlJob _ReportJobStep(CrawlJob job, TransformBlock fetchBlock, TransformBlock parseBlock)
        {
            // Pass-through pipeline step: register the job with the report,
            // refresh the queue-depth counters, repaint the console status,
            // then hand the job on unchanged.
            _Report.TrackJob(job);

            // Snapshot both block backlogs before printing so the status
            // line reflects a single consistent reading.
            _Stats.FetchPending = fetchBlock.InputCount;
            _Stats.ParsePending = parseBlock.InputCount;

            ConsoleMonitor.PrintStatus(_Stats, job);
            return job;
        }
        public void TrackJob(CrawlJob job)
        {
            // Register a job for reporting; duplicates (same depth + url key)
            // are silently ignored so each page is tracked only once.
            var key = job.GetJobKey();

            if (_Jobs.ContainsKey(key))
            {
                return;
            }

            // FIX: reuse the key computed above. The original called
            // job.GetJobKey() a second time here — redundant work, and a
            // mismatch hazard if the key ever depended on mutable job state.
            _Jobs.Add(key, job);
        }
Example #3
0
 private void _RequeueJobStep(CrawlJob newJob, TransformBlock fetchBlock, TransformBlock parseBlock, CrawlOptions options)
 {
     // Feed the new job back into the head of the pipeline, unless it is
     // already beyond the configured crawl depth.
     var maxDepth = options.MaxDepth.GetValueOrDefault(DEFAULT_MAX_DEPTH);
     if (newJob.Depth <= maxDepth)
     {
         fetchBlock.Post(newJob);
     }

     // If both block queues are empty no further work can appear, so
     // signal completion to let the dataflow pipeline drain and finish.
     var pipelineIdle = fetchBlock.InputCount == 0 && parseBlock.InputCount == 0;
     if (pipelineIdle)
     {
         fetchBlock.Complete();
     }
 }
 private object _BuildJobToTree(CrawlJob job)
 {
     // Build an anonymous tree node for the report: the page's url and
     // title, plus recursively built children for every outgoing href
     // that was actually crawled (i.e. present in _Jobs).
     // NOTE: 'children' is a deferred LINQ query — it is evaluated when
     // the report serializes the tree.
     return(new {
         url = job.Url,
         title = job.PageTitle,
         children = job.PageHrefs
                    .Select((url) => new JobKey(job.Depth + 1, url))
                    // FIX: single TryGetValue lookup instead of the original
                    // ContainsKey + indexer pair (two hash lookups per key).
                    .Select((key) => _Jobs.TryGetValue(key, out var childJob) ? childJob : null)
                    .Where((childJob) => childJob != null)
                    .Select((childJob) => this._BuildJobToTree(childJob)),
     });
 }
Example #5
0
 public static void PrintStatus(CrawlStats stats, CrawlJob currentJob)
 {
     // Redraw the in-place status panel: rewind the cursor to the saved
     // row, rewrite the four status lines, then remember where the panel
     // starts so the next call overwrites it instead of scrolling.
     Console.SetCursorPosition(0, _InitialCursorTop);
     Console.ForegroundColor = ConsoleColor.Yellow;
     Console.WriteLine("Fetched:\t{0:D4}\t(waiting {1:D4})", stats.FetchCount, stats.FetchPending);
     Console.WriteLine("Parsed:\t\t{0:D4}\t(waiting {1:D4})", stats.ParseCount, stats.ParsePending);
     Console.ForegroundColor = ConsoleColor.DarkRed;
     Console.WriteLine("Faulted:\t{0:D4}", stats.ErrorCount);
     Console.ForegroundColor = ConsoleColor.Gray;
     // FIX: the format string only references {0} and {1}; the original
     // additionally passed currentJob.PageHrefs.Count(), which was never
     // rendered and could throw NullReferenceException when printing a
     // job whose parse step has not populated PageHrefs yet.
     Console.WriteLine("Current: (depth: {0:D2}) {1}", currentJob.Depth, currentJob.Url.Truncate(55));
     Console.ResetColor();
     _InitialCursorTop = Math.Max(Console.CursorTop - 4, 0);
 }
Example #6
0
        private IEnumerable <CrawlJob> _SpanNewJobsStep(CrawlJob parentJob, TransformBlock fetchBlock, CrawlOptions options)
        {
            // Fan out: create a child job (one level deeper) for each href on
            // the parent page, capped at the configured links-per-node limit.
            //
            // FIX: materialize the query once. The original returned a deferred
            // enumerable that was enumerated here (Count()) and again by the
            // caller, constructing every CrawlJob twice.
            var newJobs = parentJob
                          .PageHrefs
                          .Select((href) => new CrawlJob(href, parentJob.Depth + 1))
                          .Take(options.MaxLinksPerNode.GetValueOrDefault(DEFAULT_MAX_LINKS_PER_NODE))
                          .ToList();

            // Root page produced no links at all: no work will ever be queued,
            // so complete the pipeline head immediately to let it shut down.
            if (parentJob.Depth == 0 && newJobs.Count == 0)
            {
                fetchBlock.Complete();
            }

            return(newJobs);
        }
Example #7
0
        private async Task <CrawlJob> _FetchHtmlStep(CrawlJob job)
        {
            // Download the page HTML for this job. Any failure is captured on
            // the job itself (and counted in the stats) so the pipeline keeps
            // flowing instead of faulting the dataflow block.
            try
            {
                job.Html = await HtmlFetcher.GetHtml(job.Url);
                _Stats.FetchCount++;
            }
            catch (Exception ex)
            {
                job.Exception = ex;
                _Stats.ErrorCount++;
            }

            return job;
        }
Example #8
0
        private CrawlJob _ParseHtmlStep(CrawlJob job, CrawlOptions options)
        {
            // Extract the title and outgoing links from the fetched HTML.
            // As with the fetch step, failures are recorded on the job and
            // counted rather than allowed to fault the pipeline.
            try
            {
                job.PageTitle = HtmlParser.GetTitle(job.Html);
                job.PageHrefs = HtmlParser.GetHrefs(job.Url, job.Html);
                _Stats.ParseCount++;
            }
            catch (Exception ex)
            {
                job.Exception = ex;
                _Stats.ErrorCount++;
            }

            // Deliberate pause so the console stats visibly change while
            // the crawl runs (demo aid, not a throttle requirement).
            System.Threading.Thread.Sleep(options.ParseDelay.GetValueOrDefault(DEFAULT_PARSE_DELAY));

            return job;
        }
Example #9
0
 public CrawlPipeline(string url)
 {
     // Seed the pipeline with a depth-0 job for the starting URL, then
     // wire up the report (keyed off the root job) and the stats tracker.
     _RootJob = new CrawlJob(url, 0);
     _Report = new ReportData(_RootJob);
     _Stats = new CrawlStats();
 }
Example #10
0
 public ReportData(CrawlJob rootJob)
 {
     // Remember the crawl's root job and start with an empty job registry;
     // TrackJob fills the dictionary as the crawl progresses.
     _RootJob = rootJob;
     _Jobs = new Dictionary <JobKey, CrawlJob>();
 }