private CrawlJob _ReportJobStep(CrawlJob job, TransformBlock<CrawlJob, CrawlJob> fetchBlock, TransformBlock<CrawlJob, CrawlJob> parseBlock)
{
    _Report.TrackJob(job);

    _Stats.FetchPending = fetchBlock.InputCount;
    _Stats.ParsePending = parseBlock.InputCount;

    ConsoleMonitor.PrintStatus(_Stats, job);

    return(job);
}
public void TrackJob(CrawlJob job)
{
    var key = job.GetJobKey();

    if (_Jobs.ContainsKey(key))
    {
        return;
    }

    _Jobs.Add(key, job);
}
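// NOTE: TrackJob and _BuildJobToTree both rebuild keys and look them up in _Jobs, which only works if
// JobKey compares by value. The real type is not part of this section; a minimal sketch of what it
// could look like, assuming CrawlJob.GetJobKey() simply pairs the job's depth with its URL:
public struct JobKey
{
    public readonly int Depth;
    public readonly string Url;

    public JobKey(int depth, string url)
    {
        Depth = depth;
        Url = url;
    }

    public override bool Equals(object obj)
    {
        return obj is JobKey other && Depth == other.Depth && Url == other.Url;
    }

    public override int GetHashCode()
    {
        return (Depth, Url).GetHashCode();
    }
}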
private void _RequeueJobStep(CrawlJob newJob, TransformBlock<CrawlJob, CrawlJob> fetchBlock, TransformBlock<CrawlJob, CrawlJob> parseBlock, CrawlOptions options)
{
    // only requeue jobs that are still within the configured depth limit
    if (newJob.Depth <= options.MaxDepth.GetValueOrDefault(DEFAULT_MAX_DEPTH))
    {
        fetchBlock.Post(newJob);
    }

    // nothing left in either queue: signal completion so the pipeline can drain
    if (fetchBlock.InputCount == 0 && parseBlock.InputCount == 0)
    {
        fetchBlock.Complete();
    }
}
private object _BuildJobToTree(CrawlJob job)
{
    return(new
    {
        url = job.Url,
        title = job.PageTitle,
        children = job.PageHrefs
            .Select((url) => new JobKey(job.Depth + 1, url))
            .Where((key) => _Jobs.ContainsKey(key))
            .Select((key) => _Jobs[key])
            .Select((childJob) => this._BuildJobToTree(childJob)),
    });
}
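// NOTE: a possible consumer of _BuildJobToTree, shown only to illustrate what the anonymous objects
// are for. This method and the Json.NET dependency are assumptions, not part of the original code:
public string ToJson()
{
    var tree = _BuildJobToTree(_RootJob);
    return Newtonsoft.Json.JsonConvert.SerializeObject(tree, Newtonsoft.Json.Formatting.Indented);
}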
public static void PrintStatus(CrawlStats stats, CrawlJob currentJob)
{
    Console.SetCursorPosition(0, _InitialCursorTop);

    Console.ForegroundColor = ConsoleColor.Yellow;
    Console.WriteLine("Fetched:\t{0:D4}\t(waiting {1:D4})", stats.FetchCount, stats.FetchPending);
    Console.WriteLine("Parsed:\t\t{0:D4}\t(waiting {1:D4})", stats.ParseCount, stats.ParsePending);

    Console.ForegroundColor = ConsoleColor.DarkRed;
    Console.WriteLine("Faulted:\t{0:D4}", stats.ErrorCount);

    Console.ForegroundColor = ConsoleColor.Gray;
    Console.WriteLine("Current: (depth: {0:D2}) {1}", currentJob.Depth, currentJob.Url.Truncate(55));

    Console.ResetColor();

    // rewind so the next call overwrites the same four lines
    _InitialCursorTop = Math.Max(Console.CursorTop - 4, 0);
}
private IEnumerable<CrawlJob> _SpanNewJobsStep(CrawlJob parentJob, TransformBlock<CrawlJob, CrawlJob> fetchBlock, CrawlOptions options)
{
    var newJobs = parentJob
        .PageHrefs
        .Select((href) => new CrawlJob(href, parentJob.Depth + 1))
        .Take(options.MaxLinksPerNode.GetValueOrDefault(DEFAULT_MAX_LINKS_PER_NODE));

    // the root page yielded no links at all, so there is nothing more to crawl
    if (parentJob.Depth == 0 && newJobs.Count() == 0)
    {
        fetchBlock.Complete();
    }

    return(newJobs);
}
private async Task<CrawlJob> _FetchHtmlStep(CrawlJob job)
{
    try
    {
        job.Html = await HtmlFetcher.GetHtml(job.Url);
        _Stats.FetchCount += 1;
    }
    catch (Exception ex)
    {
        job.Exception = ex;
        _Stats.ErrorCount += 1;
    }

    return(job);
}
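// NOTE: HtmlFetcher is referenced above but not shown in this section. A minimal sketch of what
// GetHtml could look like, assuming a shared HttpClient (System.Net.Http) and no retry or
// politeness/robots.txt handling:
public static class HtmlFetcher
{
    private static readonly HttpClient _Client = new HttpClient();

    public static async Task<string> GetHtml(string url)
    {
        using (var response = await _Client.GetAsync(url))
        {
            response.EnsureSuccessStatusCode();
            return await response.Content.ReadAsStringAsync();
        }
    }
}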
private CrawlJob _ParseHtmlStep(CrawlJob job, CrawlOptions options)
{
    try
    {
        job.PageTitle = HtmlParser.GetTitle(job.Html);
        job.PageHrefs = HtmlParser.GetHrefs(job.Url, job.Html);
        _Stats.ParseCount += 1;
    }
    catch (Exception ex)
    {
        job.Exception = ex;
        _Stats.ErrorCount += 1;
    }

    // delay a little bit, just to make the stats interesting
    System.Threading.Thread.Sleep(options.ParseDelay.GetValueOrDefault(DEFAULT_PARSE_DELAY));

    return(job);
}
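// NOTE: HtmlParser is also referenced but not shown here. A rough sketch of GetTitle/GetHrefs built
// on HtmlAgilityPack - an assumption; the real implementation may parse the markup differently:
public static class HtmlParser
{
    public static string GetTitle(string html)
    {
        var doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);
        return doc.DocumentNode.SelectSingleNode("//title")?.InnerText.Trim();
    }

    public static IEnumerable<string> GetHrefs(string baseUrl, string html)
    {
        var doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(html);

        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors == null)
        {
            return Enumerable.Empty<string>();
        }

        // resolve relative hrefs against the page they were found on
        return anchors
            .Select((a) => a.GetAttributeValue("href", null))
            .Where((href) => !string.IsNullOrWhiteSpace(href))
            .Select((href) => new Uri(new Uri(baseUrl), href).AbsoluteUri)
            .Distinct();
    }
}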
public CrawlPipeline(string url)
{
    _RootJob = new CrawlJob(url, 0);
    _Report = new ReportData(_RootJob);
    _Stats = new CrawlStats();
}
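// NOTE: the wiring that connects the step methods above is not part of this section. A hedged sketch
// of what a Crawl() method could look like, assuming TPL Dataflow (System.Threading.Tasks.Dataflow);
// the block options and degree of parallelism are illustrative guesses:
public async Task<ReportData> Crawl(CrawlOptions options)
{
    var fetchBlock = new TransformBlock<CrawlJob, CrawlJob>(
        (job) => _FetchHtmlStep(job),
        new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 });

    var parseBlock = new TransformBlock<CrawlJob, CrawlJob>((job) => _ParseHtmlStep(job, options));
    var reportBlock = new TransformBlock<CrawlJob, CrawlJob>((job) => _ReportJobStep(job, fetchBlock, parseBlock));
    var spanBlock = new TransformManyBlock<CrawlJob, CrawlJob>((job) => _SpanNewJobsStep(job, fetchBlock, options));
    var requeueBlock = new ActionBlock<CrawlJob>((job) => _RequeueJobStep(job, fetchBlock, parseBlock, options));

    var linkOptions = new DataflowLinkOptions { PropagateCompletion = true };
    fetchBlock.LinkTo(parseBlock, linkOptions);
    parseBlock.LinkTo(reportBlock, linkOptions);
    reportBlock.LinkTo(spanBlock, linkOptions);
    spanBlock.LinkTo(requeueBlock, linkOptions);

    fetchBlock.Post(_RootJob);
    await requeueBlock.Completion;

    return _Report;
}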
public ReportData(CrawlJob rootJob)
{
    _RootJob = rootJob;
    _Jobs = new Dictionary<JobKey, CrawlJob>();
}