Example #1
        public async Task<ReportData> RunAsync(CrawlOptions options)
        {
            _Stats.Reset();
            _Report.Reset();

            var fetchHtmlBlock = new TransformBlock<CrawlJob, CrawlJob>((job) => _FetchHtmlStep(job), new ExecutionDataflowBlockOptions {
                MaxDegreeOfParallelism = options.FetchWorkers.GetValueOrDefault(DEFAULT_FETCH_WORKERS)
            });
            var parseHtmlBlock = new TransformBlock<CrawlJob, CrawlJob>((job) => _ParseHtmlStep(job, options), new ExecutionDataflowBlockOptions {
                MaxDegreeOfParallelism = options.ParseWorkers.GetValueOrDefault(DEFAULT_PARSE_WORKERS)
            });
            var reportJobBlock    = new TransformBlock<CrawlJob, CrawlJob>((job) => _ReportJobStep(job, fetchHtmlBlock, parseHtmlBlock));
            var spawnNewJobsBlock = new TransformManyBlock<CrawlJob, CrawlJob>((job) => _SpanNewJobsStep(job, fetchHtmlBlock, options));
            var requeueJobs       = new ActionBlock<CrawlJob>((job) => _RequeueJobStep(job, fetchHtmlBlock, parseHtmlBlock, options));

            // link the chain fetch -> parse -> report -> spawn -> requeue, propagating completion downstream
            fetchHtmlBlock.LinkTo(parseHtmlBlock, new DataflowLinkOptions {
                PropagateCompletion = true
            });
            parseHtmlBlock.LinkTo(reportJobBlock, new DataflowLinkOptions {
                PropagateCompletion = true
            });
            reportJobBlock.LinkTo(spawnNewJobsBlock, new DataflowLinkOptions {
                PropagateCompletion = true
            });
            spawnNewJobsBlock.LinkTo(requeueJobs, new DataflowLinkOptions {
                PropagateCompletion = true
            });

            fetchHtmlBlock.Post(_RootJob);

            await requeueJobs.Completion;

            return(_Report);
        }
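
For reference, the steps above pass a single job object between the blocks. The project's CrawlJob type is not included in this listing; a minimal sketch, reconstructed only from the members the steps actually touch, might look like this:

        // Hypothetical sketch of the job type flowing through the pipeline,
        // reconstructed from the members used in these examples; the real
        // definition in the project may differ.
        public class CrawlJob
        {
            public CrawlJob(string url, int depth = 0)
            {
                Url   = url;
                Depth = depth;
            }

            public string Url { get; }
            public int Depth { get; }
            public string Html { get; set; }
            public string PageTitle { get; set; }
            public IEnumerable<string> PageHrefs { get; set; }
            public Exception Exception { get; set; }
        }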
        static CommandLineApplication SetupCliApp()
        {
            CommandLineApplication cliApp = new CommandLineApplication();

            cliApp.Name        = "dotnet wdcrawler.dll";
            cliApp.Description = "A web crawler written in C# using the Dataflow library";
            cliApp.VersionOption("-v | --version", "wdcrawler.dll v0.4");
            cliApp.HelpOption("-h | --help");

            var urlArg          = cliApp.Argument("url", "The seed url to crawl", false);
            var fetchWorkersOpt = cliApp.Option("-f | --fetch-workers", "Degree of parallel fetches", CommandOptionType.SingleValue);
            var parseWorkersOpt = cliApp.Option("-p | --parse-workers", "Degree of parallel parsers", CommandOptionType.SingleValue);
            var maxDepthOpt     = cliApp.Option("-d | --max-depth", "Max crawl depth from seed url", CommandOptionType.SingleValue);
            var maxLinksOpt     = cliApp.Option("-l | --max-links", "Max links to crawl per node", CommandOptionType.SingleValue);
            var parseWaitOpt    = cliApp.Option("-w | --parse-wait", "Add artificial delay to parse step to play with stats", CommandOptionType.SingleValue);

            cliApp.OnExecute(() =>
            {
                var crawlOptions = new CrawlOptions
                {
                    FetchWorkers    = fetchWorkersOpt.ParseIntOption(),
                    ParseWorkers    = parseWorkersOpt.ParseIntOption(),
                    MaxDepth        = maxDepthOpt.ParseIntOption(),
                    MaxLinksPerNode = maxLinksOpt.ParseIntOption(),
                    ParseDelay      = parseWaitOpt.ParseIntOption(),
                };

                var task = RunAsync(urlArg.Value, crawlOptions);
                task.Wait();
                return((task.IsFaulted) ? 1 : 0);
            });

            return(cliApp);
        }
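
ParseIntOption is not part of Microsoft.Extensions.CommandLineUtils; it is an extension the project defines elsewhere. A plausible sketch, assuming it returns null when an option is absent (which matches the nullable options consumed via GetValueOrDefault above):

        // Hypothetical extension assumed by SetupCliApp: parse an option's value
        // as an int, or return null when the option was not supplied.
        public static class CommandOptionExtensions
        {
            public static int? ParseIntOption(this CommandOption option)
            {
                if (option.HasValue() && int.TryParse(option.Value(), out var parsed))
                {
                    return(parsed);
                }
                return(null);
            }
        }

Wiring the app up from Main is then a one-liner, e.g. return SetupCliApp().Execute(args); (Execute is the standard CommandLineApplication entry point and surfaces the 0/1 exit code returned by OnExecute).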
Example #3
        private CrawlJob _ParseHtmlStep(CrawlJob job, CrawlOptions options)
        {
            try
            {
                job.PageTitle      = HtmlParser.GetTitle(job.Html);
                job.PageHrefs      = HtmlParser.GetHrefs(job.Url, job.Html);
                _Stats.ParseCount += 1;
            }
            catch (Exception ex)
            {
                job.Exception      = ex;
                _Stats.ErrorCount += 1;
            }

            // delay a little bit, just to make the stats interesting
            System.Threading.Thread.Sleep(options.ParseDelay.GetValueOrDefault(DEFAULT_PARSE_DELAY));

            return(job);
        }
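
HtmlParser.GetTitle and HtmlParser.GetHrefs are project helpers that do not appear in this listing. A minimal stand-in, assuming HtmlAgilityPack as the parser (the project may use something else entirely):

        // Hypothetical stand-in for the HtmlParser helper used above, assuming
        // HtmlAgilityPack (requires the HtmlAgilityPack and System.Linq namespaces).
        public static class HtmlParser
        {
            public static string GetTitle(string html)
            {
                var doc = new HtmlDocument();
                doc.LoadHtml(html);
                return(doc.DocumentNode.SelectSingleNode("//title")?.InnerText?.Trim());
            }

            public static IEnumerable<string> GetHrefs(string baseUrl, string html)
            {
                var doc = new HtmlDocument();
                doc.LoadHtml(html);
                var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
                if (anchors == null)
                {
                    return(Enumerable.Empty<string>());
                }
                // resolve relative links against the page's own url; malformed
                // hrefs throw here and are caught by _ParseHtmlStep's try/catch
                return(anchors
                       .Select((a) => a.GetAttributeValue("href", String.Empty))
                       .Where((href) => !String.IsNullOrEmpty(href))
                       .Select((href) => new Uri(new Uri(baseUrl), href).ToString())
                       .Distinct());
            }
        }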
        static async Task RunAsync(string url, CrawlOptions options)
        {
            PrintTaskInfo("running crawl pipeline");
            var crawlPipeline = new CrawlPipeline(url);
            await crawlPipeline.RunAsync(options);

            PrintTaskInfo("uploading report to S3");
            var reportGuid = await ReportPublisher.UploadToS3Async(crawlPipeline.Report);

            var reportUrl = ReportPublisher.GeneratePublicUrl(reportGuid);

            PrintTaskInfo("saving backup data to file");
            System.IO.File.AppendAllText(String.Format("{0}.json", reportGuid), crawlPipeline.Report.ToJson());
            PrintTaskInfo(String.Format("closing up (report GUID: {0})", reportGuid));
            Console.ForegroundColor = ConsoleColor.Green;
            Console.WriteLine("========================");
            Console.WriteLine("View your report here:");
            Console.WriteLine(reportUrl);
            Console.WriteLine("========================");
            Console.ResetColor();
        }
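
ReportPublisher is likewise external to this listing. A rough sketch of the two members used above, assuming the AWS SDK for .NET (AWSSDK.S3) and a made-up bucket name:

        // Hypothetical sketch of the publisher used above, assuming AWSSDK.S3;
        // the bucket name is an assumption and the project's real implementation
        // is not shown in this listing.
        public static class ReportPublisher
        {
            const string BUCKET = "wdcrawler-reports";    // assumed, not from the source

            public static async Task<Guid> UploadToS3Async(ReportData report)
            {
                var guid = Guid.NewGuid();
                using (var client = new Amazon.S3.AmazonS3Client())
                {
                    await client.PutObjectAsync(new Amazon.S3.Model.PutObjectRequest
                    {
                        BucketName  = BUCKET,
                        Key         = String.Format("{0}.json", guid),
                        ContentBody = report.ToJson()
                    });
                }
                return(guid);
            }

            public static string GeneratePublicUrl(Guid reportGuid)
            {
                return(String.Format("https://{0}.s3.amazonaws.com/{1}.json", BUCKET, reportGuid));
            }
        }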
Example #5
        private IEnumerable<CrawlJob> _SpanNewJobsStep(CrawlJob parentJob, TransformBlock<CrawlJob, CrawlJob> fetchBlock, CrawlOptions options)
        {
            var newJobs = parentJob
                          .PageHrefs
                          .Select((href) => new CrawlJob(href, parentJob.Depth + 1))
                          .Take(options.MaxLinksPerNode.GetValueOrDefault(DEFAULT_MAX_LINKS_PER_NODE))
                          .ToList();    // materialize once so the empty check below doesn't re-run the query

            if (parentJob.Depth == 0 && newJobs.Count == 0)
            {
                fetchBlock.Complete();
            }

            return(newJobs);
        }
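
Taken together, the instance members these steps reference imply the rough shape of the enclosing class. A skeleton inferred from this listing alone; the default values and the stats type are assumptions:

        // Skeleton of the enclosing pipeline class, inferred from the members
        // referenced across these examples; defaults and CrawlStats are assumed.
        public class CrawlPipeline
        {
            const int DEFAULT_FETCH_WORKERS      = 4;    // assumed defaults, not from the source
            const int DEFAULT_PARSE_WORKERS      = 2;
            const int DEFAULT_MAX_DEPTH          = 2;
            const int DEFAULT_MAX_LINKS_PER_NODE = 10;
            const int DEFAULT_PARSE_DELAY        = 0;

            private readonly CrawlJob   _RootJob;
            private readonly CrawlStats _Stats  = new CrawlStats();    // hypothetical stats type
            private readonly ReportData _Report = new ReportData();

            public ReportData Report
            {
                get { return(_Report); }
            }

            public CrawlPipeline(string url)
            {
                _RootJob = new CrawlJob(url, 0);
            }

            // RunAsync and the _*Step methods shown in these examples live here
        }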
Example #6
        private void _RequeueJobStep(CrawlJob newJob, TransformBlock<CrawlJob, CrawlJob> fetchBlock, TransformBlock<CrawlJob, CrawlJob> parseBlock, CrawlOptions options)
        {
            if (newJob.Depth <= options.MaxDepth.GetValueOrDefault(DEFAULT_MAX_DEPTH))
            {
                fetchBlock.Post(newJob);
            }
            if (fetchBlock.InputCount == 0 && parseBlock.InputCount == 0)
            {
                fetchBlock.Complete();
            }
        }
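
The one step this listing never shows is _FetchHtmlStep, wired up at the top of RunAsync. TransformBlock also accepts async delegates (Func<TInput, Task<TOutput>>), so the fetch step can be a plain async method. A minimal sketch with HttpClient (requires System.Net.Http), mirroring the error handling of _ParseHtmlStep:

        // Hypothetical sketch of the fetch step referenced in RunAsync but not
        // included in this listing.
        private static readonly HttpClient _Http = new HttpClient();

        private async Task<CrawlJob> _FetchHtmlStep(CrawlJob job)
        {
            try
            {
                job.Html           = await _Http.GetStringAsync(job.Url);
                _Stats.FetchCount += 1;    // assumed counter, by analogy with ParseCount
            }
            catch (Exception ex)
            {
                job.Exception      = ex;
                _Stats.ErrorCount += 1;
            }

            return(job);
        }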