public async Task<ReportData> RunAsync(CrawlOptions options)
{
    _Stats.Reset();
    _Report.Reset();

    // pipeline blocks: fetch -> parse -> report -> spawn new jobs -> requeue
    var fetchHtmlBlock = new TransformBlock<CrawlJob, CrawlJob>((job) => _FetchHtmlStep(job),
        new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = options.FetchWorkers.GetValueOrDefault(DEFAULT_FETCH_WORKERS) });
    var parseHtmlBlock = new TransformBlock<CrawlJob, CrawlJob>((job) => _ParseHtmlStep(job, options),
        new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = options.ParseWorkers.GetValueOrDefault(DEFAULT_PARSE_WORKERS) });
    var reportJobBlock = new TransformBlock<CrawlJob, CrawlJob>((job) => _ReportJobStep(job, fetchHtmlBlock, parseHtmlBlock));
    var spawnNewJobsBlock = new TransformManyBlock<CrawlJob, CrawlJob>((job) => _SpanNewJobsStep(job, fetchHtmlBlock, options));
    var requeueJobs = new ActionBlock<CrawlJob>((job) => _RequeueJobStep(job, fetchHtmlBlock, parseHtmlBlock, options));

    // link each stage to the next and let completion propagate down the chain
    fetchHtmlBlock.LinkTo(parseHtmlBlock, new DataflowLinkOptions { PropagateCompletion = true });
    parseHtmlBlock.LinkTo(reportJobBlock, new DataflowLinkOptions { PropagateCompletion = true });
    reportJobBlock.LinkTo(spawnNewJobsBlock, new DataflowLinkOptions { PropagateCompletion = true });
    spawnNewJobsBlock.LinkTo(requeueJobs, new DataflowLinkOptions { PropagateCompletion = true });

    // seed the pipeline with the root job and wait for the last block to finish
    fetchHtmlBlock.Post(_RootJob);
    await requeueJobs.Completion;

    return(_Report);
}
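// _FetchHtmlStep is referenced by the pipeline above but not shown in this section. A minimal
// sketch of what such a step might look like, assuming an HttpClient field named _HttpClient and a
// FetchCount counter on the stats object (both names are assumptions, not taken from the original):
private async Task<CrawlJob> _FetchHtmlStep(CrawlJob job)
{
    try
    {
        // download the raw page body; the parse step extracts the title and links from it
        job.Html = await _HttpClient.GetStringAsync(job.Url);
        _Stats.FetchCount += 1;
    }
    catch (Exception ex)
    {
        // record the failure the same way the parse step does, so the job still flows through the pipeline
        job.Exception = ex;
        _Stats.ErrorCount += 1;
    }
    return(job);
}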
static CommandLineApplication SetupCliApp()
{
    CommandLineApplication cliApp = new CommandLineApplication();
    cliApp.Name = "dotnet wdcrawler.dll";
    cliApp.Description = "A web crawler written in C# using the Dataflow library";
    cliApp.VersionOption("-v | --version", "wdcrawler.dll v0.4");
    cliApp.HelpOption("-h | --help");

    var urlArg = cliApp.Argument("url", "The seed url to crawl", false);
    var fetchWorkersOpt = cliApp.Option("-f | --fetch-workers", "Degree of parallel fetches", CommandOptionType.SingleValue);
    var parseWorkersOpt = cliApp.Option("-p | --parse-workers", "Degree of parallel parsers", CommandOptionType.SingleValue);
    var maxDepthOpt = cliApp.Option("-d | --max-depth", "Max crawl depth from seed url", CommandOptionType.SingleValue);
    var maxLinksOpt = cliApp.Option("-l | --max-links", "Max links to crawl per node", CommandOptionType.SingleValue);
    var parseWaitOpt = cliApp.Option("-w | --parse-wait", "Add artificial delay to parse step to play with stats", CommandOptionType.SingleValue);

    cliApp.OnExecute(() =>
    {
        var crawlOptions = new CrawlOptions
        {
            FetchWorkers = fetchWorkersOpt.ParseIntOption(),
            ParseWorkers = parseWorkersOpt.ParseIntOption(),
            MaxDepth = maxDepthOpt.ParseIntOption(),
            MaxLinksPerNode = maxLinksOpt.ParseIntOption(),
            ParseDelay = parseWaitOpt.ParseIntOption(),
        };

        var task = RunAsync(urlArg.Value, crawlOptions);
        task.Wait();
        return((task.IsFaulted) ? 1 : 0);
    });

    return(cliApp);
}
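// ParseIntOption() is used above but not defined in this section. A minimal sketch of such an
// extension method on CommandOption, assuming it returns null when the option was not supplied
// (the exact implementation is an assumption, not shown in the original code):
public static class CommandOptionExtensions
{
    public static int? ParseIntOption(this CommandOption option)
    {
        if (option.HasValue() && int.TryParse(option.Value(), out var parsed))
        {
            return parsed;
        }
        // null lets the crawler fall back to its DEFAULT_* constants via GetValueOrDefault
        return null;
    }
}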
private CrawlJob _ParseHtmlStep(CrawlJob job, CrawlOptions options)
{
    try
    {
        job.PageTitle = HtmlParser.GetTitle(job.Html);
        job.PageHrefs = HtmlParser.GetHrefs(job.Url, job.Html);
        _Stats.ParseCount += 1;
    }
    catch (Exception ex)
    {
        job.Exception = ex;
        _Stats.ErrorCount += 1;
    }

    // delay a little bit, just to make stats interesting
    System.Threading.Thread.Sleep(options.ParseDelay.GetValueOrDefault(DEFAULT_PARSE_DELAY));

    return(job);
}
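// HtmlParser.GetTitle and HtmlParser.GetHrefs are not shown in this section. A minimal sketch of
// what they might look like using HtmlAgilityPack; the library choice and everything below is an
// assumption, not confirmed by the original code:
using System;
using System.Collections.Generic;
using System.Linq;
using HtmlAgilityPack;

public static class HtmlParser
{
    public static string GetTitle(string html)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        // the <title> node may be missing on malformed pages
        return doc.DocumentNode.SelectSingleNode("//title")?.InnerText?.Trim();
    }

    public static IEnumerable<string> GetHrefs(string baseUrl, string html)
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors == null)
        {
            return Enumerable.Empty<string>();
        }

        // resolve relative links against the page url and keep only http(s) targets
        var baseUri = new Uri(baseUrl);
        return anchors
            .Select(a => a.GetAttributeValue("href", string.Empty))
            .Where(href => !string.IsNullOrWhiteSpace(href))
            .Select(href => Uri.TryCreate(baseUri, href, out var abs) ? abs : null)
            .Where(uri => uri != null && (uri.Scheme == "http" || uri.Scheme == "https"))
            .Select(uri => uri.ToString());
    }
}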
static async Task RunAsync(string url, CrawlOptions options)
{
    PrintTaskInfo("running crawl pipeline");
    var crawlPipeline = new CrawlPipeline(url);
    await crawlPipeline.RunAsync(options);

    PrintTaskInfo("uploading report to S3");
    var reportGuid = await ReportPublisher.UploadToS3Async(crawlPipeline.Report);
    var reportUrl = ReportPublisher.GeneratePublicUrl(reportGuid);

    PrintTaskInfo("saving backup data to file");
    System.IO.File.AppendAllText(String.Format("{0}.json", reportGuid), crawlPipeline.Report.ToJson());

    PrintTaskInfo(String.Format("closing up (report GUID: {0})", reportGuid));
    Console.ForegroundColor = ConsoleColor.Green;
    Console.WriteLine("========================");
    Console.WriteLine("View your report here:");
    Console.WriteLine(reportUrl);
    Console.WriteLine("========================");
    Console.ResetColor();
}
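// Neither the program entry point nor PrintTaskInfo appears in this section. A minimal sketch of
// how SetupCliApp() would typically be invoked, following the usual
// Microsoft.Extensions.CommandLineUtils pattern (the exact Main body and the console color used by
// PrintTaskInfo are assumptions):
static int Main(string[] args)
{
    var cliApp = SetupCliApp();
    try
    {
        return cliApp.Execute(args);
    }
    catch (CommandParsingException ex)
    {
        // bad arguments: show the error and the help text instead of a stack trace
        Console.WriteLine(ex.Message);
        cliApp.ShowHelp();
        return 1;
    }
}

static void PrintTaskInfo(string message)
{
    Console.ForegroundColor = ConsoleColor.Cyan;
    Console.WriteLine(String.Format("[wdcrawler] {0}", message));
    Console.ResetColor();
}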
private IEnumerable<CrawlJob> _SpanNewJobsStep(CrawlJob parentJob, TransformBlock<CrawlJob, CrawlJob> fetchBlock, CrawlOptions options)
{
    // materialize the query once so the count check below does not re-enumerate it
    var newJobs = parentJob
        .PageHrefs
        .Select((href) => new CrawlJob(href, parentJob.Depth + 1))
        .Take(options.MaxLinksPerNode.GetValueOrDefault(DEFAULT_MAX_LINKS_PER_NODE))
        .ToList();

    // if the root page produced no links there is nothing left to crawl, so shut the pipeline down
    if (parentJob.Depth == 0 && newJobs.Count == 0)
    {
        fetchBlock.Complete();
    }

    return(newJobs);
}
private void _RequeueJobStep(CrawlJob newJob, TransformBlock<CrawlJob, CrawlJob> fetchBlock, TransformBlock<CrawlJob, CrawlJob> parseBlock, CrawlOptions options)
{
    // only crawl deeper while the new job is within the configured max depth
    if (newJob.Depth <= options.MaxDepth.GetValueOrDefault(DEFAULT_MAX_DEPTH))
    {
        fetchBlock.Post(newJob);
    }

    // once both queues have drained there is no more work in flight, so complete the head of the pipeline
    if (fetchBlock.InputCount == 0 && parseBlock.InputCount == 0)
    {
        fetchBlock.Complete();
    }
}
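// The CrawlJob and CrawlOptions types used throughout are not shown in this section. A minimal
// sketch inferred from the members referenced above; property types are assumptions where the
// code does not make them explicit:
using System;
using System.Collections.Generic;

public class CrawlJob
{
    public CrawlJob(string url, int depth)
    {
        Url = url;
        Depth = depth;
    }

    public string Url { get; }
    public int Depth { get; }
    public string Html { get; set; }
    public string PageTitle { get; set; }
    // initialized so the spawn step can enumerate it even when fetching or parsing failed
    public IEnumerable<string> PageHrefs { get; set; } = new List<string>();
    public Exception Exception { get; set; }
}

public class CrawlOptions
{
    // nullable so GetValueOrDefault(...) can fall back to the DEFAULT_* constants
    public int? FetchWorkers { get; set; }
    public int? ParseWorkers { get; set; }
    public int? MaxDepth { get; set; }
    public int? MaxLinksPerNode { get; set; }
    public int? ParseDelay { get; set; }
}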