/// <summary>
/// Runs a freshly created crawl job for the given subscription, reusing the
/// per-subscription history of URLs that were already processed.
/// </summary>
private void ExecuteJobForSubscription(Subscription subscription)
{
    // Look up (or lazily create) the processed-URL history for this subscription.
    var processedHistory = _alreadyProcessedUrls.GetOrAdd(subscription.Id, _ => new List<Uri>());

    var crawlJob = _jobFactory.CreateInstance();
    crawlJob.Execute(subscription, processedHistory);
}
/// <summary>
/// Deserialization constructor. The <c>[JsonConstructor]</c> attribute tells
/// JSON.NET which overload to use when rehydrating a status update.
/// </summary>
[JsonConstructor]
public JobStatusUpdate(CrawlJob job, CrawlJobStats stats, JobStatus status, DateTime startTime, DateTime? endTime)
{
    Job = job;
    Stats = stats;
    Status = status;
    StartTime = startTime;
    EndTime = endTime;
}
/// <summary>
/// Sets up the master actor for <paramref name="job"/> and enters the
/// tracker-waiting behavior.
/// </summary>
public CrawlMaster(CrawlJob job)
{
    Job = job;
    RunningStatus = new JobStatusUpdate(Job);
    // Seed the totals so the index page itself counts as "discovered".
    TotalStats = new CrawlJobStats(Job).Copy(1);

    var receiveTimeout = TimeSpan.FromSeconds(5);
    Context.SetReceiveTimeout(receiveTimeout);
    WaitingForTracker();
}
/// <summary>
/// Sets up the master actor for <paramref name="job"/> and enters the
/// tracker-waiting behavior.
/// </summary>
public CrawlMaster(CrawlJob job)
{
    Job = job;
    RunningStatus = new JobStatusUpdate(Job);
    TotalStats = new CrawlJobStats(Job);

    var receiveTimeout = TimeSpan.FromSeconds(5);
    Context.SetReceiveTimeout(receiveTimeout);
    WaitingForTracker();
}
/// <summary>
/// Wires the coordinator to its collaborators and enters the receiving behavior.
/// </summary>
public DownloadCoordinator(CrawlJob job, IActorRef commander, IActorRef downloadsTracker, long maxConcurrentDownloads)
{
    Job = job;
    Commander = commander;
    DownloadsTracker = downloadsTracker;
    MaxConcurrentDownloads = maxConcurrentDownloads;
    Stats = new CrawlJobStats(Job);
    Receiving();
}
/// <summary>
/// Builds the download pipeline for <paramref name="job"/>: an actor-backed
/// source feeding separate throttled HTML and image flows, whose results are
/// delivered back to this actor as messages. Materializes the graph and enters
/// the receiving behavior.
/// </summary>
public DownloadCoordinator(CrawlJob job, IActorRef commander, IActorRef downloadsTracker, long maxConcurrentDownloads)
{
    Job = job;
    DownloadsTracker = downloadsTracker;
    MaxConcurrentDownloads = maxConcurrentDownloads;
    Commander = commander;
    Stats = new CrawlJobStats(Job);

    // Sinks that send each stream element back to this actor as a message;
    // StreamCompleteTick.Instance is delivered when the stream completes.
    var selfHtmlSink = Sink.ActorRef<CheckDocuments>(Self, StreamCompleteTick.Instance);
    var selfDocSink = Sink.ActorRef<CompletedDocument>(Self, StreamCompleteTick.Instance);
    var selfImgSink = Sink.ActorRef<CompletedDocument>(Self, StreamCompleteTick.Instance);

    // HTML pipeline: classify the document, throttle to 30 elements per
    // 5 seconds (burst of 100), then download the page.
    var htmlFlow = Flow.Create<CrawlDocument>().Via(DownloadFlow.SelectDocType())
        .Throttle(30, TimeSpan.FromSeconds(5), 100, ThrottleMode.Shaping)
        .Via(DownloadFlow.ProcessHtmlDownloadFor(DefaultMaxConcurrentDownloads, HttpClientFactory.GetClient()));

    // Image pipeline: throttled to 30 elements per 1 second (burst of 100),
    // downloads the image, then runs the completed-download stage inline
    // (the HTML pipeline's completed stage is added separately inside the graph).
    var imageFlow = Flow.Create<CrawlDocument>()
        .Via(DownloadFlow.SelectDocType())
        .Throttle(30, TimeSpan.FromSeconds(1), 100, ThrottleMode.Shaping)
        .Via(DownloadFlow.ProcessImageDownloadFor(DefaultMaxConcurrentDownloads, HttpClientFactory.GetClient()))
        .Via(DownloadFlow.ProcessCompletedDownload());

    // Entry point: documents are injected by sending CrawlDocument messages to
    // the materialized actor; past 5000 buffered elements the newest are dropped.
    var source = Source.ActorRef<CrawlDocument>(5000, OverflowStrategy.DropTail);

    var graph = GraphDsl.Create(source, (builder, s) =>
    {
        // html flows
        var downloadHtmlFlow = builder.Add(htmlFlow);
        var downloadBroadcast = builder.Add(new Broadcast<DownloadHtmlResult>(2));
        var completedDownload = builder.Add(DownloadFlow.ProcessCompletedHtmlDownload());
        var parseCompletedDownload = builder.Add(ParseFlow.GetParseFlow(Job));
        var htmlSink = builder.Add(selfHtmlSink);
        var docSink = builder.Add(selfDocSink);

        // Each downloaded page fans out two ways: one copy is marked complete,
        // the other is parsed for further links/images.
        builder.From(downloadHtmlFlow).To(downloadBroadcast);
        builder.From(downloadBroadcast.Out(0)).To(completedDownload.Inlet);
        builder.From(downloadBroadcast.Out(1)).To(parseCompletedDownload.Inlet);
        builder.From(parseCompletedDownload).To(htmlSink);
        builder.From(completedDownload).To(docSink);

        // image flows
        var imgSink = builder.Add(selfImgSink);
        var downloadImageFlow = builder.Add(imageFlow);
        builder.From(downloadImageFlow).To(imgSink);

        // Every incoming document is offered to BOTH pipelines — presumably each
        // pipeline's SelectDocType stage filters for its own kind; confirm.
        var sourceBroadcast = builder.Add(new Broadcast<CrawlDocument>(2));
        builder.From(sourceBroadcast.Out(0)).To(downloadImageFlow.Inlet);
        builder.From(sourceBroadcast.Out(1)).To(downloadHtmlFlow.Inlet);
        builder.From(s.Outlet).To(sourceBroadcast.In);

        return (ClosedShape.Instance);
    });

    // Materializing yields the source's IActorRef — the handle used to feed
    // documents into the stream.
    SourceActor = Context.Materializer().Materialize(graph);
    Receiving();
}
/// <summary>
/// Determines whether <paramref name="rawUri"/> can be turned into an absolute
/// http/https URI — either because it already is one, or because it resolves
/// against the job's root.
/// </summary>
public static bool CanMakeAbsoluteUri(CrawlJob jobRoot, string rawUri)
{
    // Already a well-formed absolute URI — nothing further to check.
    if (Uri.IsWellFormedUriString(rawUri, UriKind.Absolute))
    {
        return true;
    }

    try
    {
        var candidate = new Uri(jobRoot.Root, rawUri);
        return candidate.Scheme.Equals(Uri.UriSchemeHttp)
            || candidate.Scheme.Equals(Uri.UriSchemeHttps);
    }
    catch
    {
        // Resolution failed (malformed input) — treat as "cannot make absolute".
        return false;
    }
}
//
// GET: /Crawl/Run
/// <summary>
/// Kicks off a crawl (synchronous or asynchronous per <paramref name="sync"/>),
/// then returns the elapsed time plus the links still left unhandled as JSON.
/// </summary>
public ActionResult Run(bool sync = true)
{
    var timer = Stopwatch.StartNew();
    var job = new CrawlJob();

    if (sync)
    {
        job.SyncDigLinks();
    }
    else
    {
        job.DigLinks();
    }

    timer.Stop();

    // NOTE(review): 999 and the 30-minute window are magic numbers — confirm intent.
    var unhandledLinks = job.GetUnhandledLinks(999, DateTime.Now.AddMinutes(30));

    var payload = new
    {
        Span = timer.Elapsed.ToString(),
        UnhandledLinks = unhandledLinks
    };
    return Json(payload, JsonRequestBehavior.AllowGet);
}
/// <summary>
/// Smoke-tests the crawler against reddit.com — scrape image URLs until a
/// completion condition trips, then print a summary to the console.
/// </summary>
public async Task Test()
{
    var limiter = new RollingWindowRateLimiter(10000, TimeSpan.FromMinutes(1));
    var proxyService = new DefaultProxyService();
    var agent = new WebAgent(limiter, proxyService);

    // Completion conditions: page count, wall clock, result count
    // (presumably whichever trips first ends the crawl — confirm).
    var completionConditions = new List<ICrawlCompletionCondition>
    {
        new MaxPagesCrawledCondition(100),
        new MaxTimeCondition(TimeSpan.FromMinutes(3)),
        new MaxResultsFoundCondition(2000)
    };

    var job = new CrawlJob
    {
        Domain = new Uri("https://reddit.com/"),
        CompletionConditions = completionConditions,
        ThreadAllowance = 10,
        // Pre-set the over-18 cookie so age-gated pages are reachable.
        Cookies = new List<Cookie> { new Cookie("over18", "1", "/", "reddit.com") },
        Regex = "<img.+?src=\"(?<image>.+?)\""
    };

    using (var crawler = new Crawler(agent))
    {
        var results = await crawler.Crawl(job);

        Console.WriteLine(results.CrawlCount);
        Console.WriteLine(results.QueueSize);
        Console.WriteLine(results.ResultsCount);
        foreach (var item in results.Data)
        {
            Console.WriteLine(item.Item2);
        }
    }
}
/// <summary>
/// Binds the parse worker to its job root and coordinator, then enters the
/// download-actor-waiting behavior.
/// </summary>
public ParseWorker(CrawlJob jobRoot, IActorRef coordinatorActor)
{
    JobRoot = jobRoot;
    CoordinatorActor = coordinatorActor;

    WaitingForDownloadActor();
}
/// <summary>
/// Creates the initial "just starting" status update for <paramref name="job"/>.
/// </summary>
public JobStatusUpdate(CrawlJob job)
    : this(job, stats: null, status: JobStatus.Starting, startTime: DateTime.UtcNow, endTime: null)
{
}
/// <summary>Notification that a running crawl master was located for the given job key.</summary>
public JobFound(CrawlJob key, IActorRef crawlMaster)
{
    Key = key;
    CrawlMaster = crawlMaster;
}
/// <summary>Request to add <paramref name="subscriber"/> to a job's status updates.</summary>
public SubscribeToJob(CrawlJob job, IActorRef subscriber)
{
    Job = job;
    Subscriber = subscriber;
}
/// <summary>
/// Builds the stream stage that parses a downloaded HTML page and emits the
/// in-domain image and link URIs it contains as one <c>CheckDocuments</c> batch.
/// </summary>
public static Flow<DownloadHtmlResult, CheckDocuments, NotUsed> GetParseFlow(CrawlJob jobRoot)
{
    return (Flow.Create<DownloadHtmlResult>().Async()
        .Select(downloadHtmlResult =>
        {
            var requestedUrls = new List<CrawlDocument>();
            var htmlString = downloadHtmlResult.Content;
            var doc = new HtmlDocument();
            doc.LoadHtml(htmlString);

            //find all of the IMG tags via XPATH
            var imgs = doc.DocumentNode.SelectNodes("//img[@src]");

            //find all of the A...HREF tags via XPATH
            var links = doc.DocumentNode.SelectNodes("//a[@href]");

            /* PROCESS ALL IMAGES */
            // SelectNodes returns null (not empty) when nothing matched.
            if (imgs != null)
            {
                // Keep only src values that resolve to absolute URIs inside the job's domain.
                var validImgUris = imgs.Select(x => x.Attributes["src"].Value)
                    .Where(x => CanMakeAbsoluteUri(jobRoot, x))
                    .Select(x => ToAsboluteUri(jobRoot, x)) // (sic) existing helper name
                    .Where(x => AbsoluteUriIsInDomain(jobRoot, x))
                    .Select(y => new CrawlDocument(y, true)); // second arg true in the <img> branch
                requestedUrls = requestedUrls.Concat(validImgUris).ToList();
            }

            /* PROCESS ALL LINKS */
            if (links != null)
            {
                var validLinkUris = links.Select(x => x.Attributes["href"].Value)
                    .Where(x => CanMakeAbsoluteUri(jobRoot, x))
                    .Select(x => ToAsboluteUri(jobRoot, x))
                    .Where(x => AbsoluteUriIsInDomain(jobRoot, x))
                    .Select(y => new CrawlDocument(y, false)); // second arg false in the <a> branch
                requestedUrls = requestedUrls.Concat(validLinkUris).ToList();
            }

            // NOTE(review): the timeout scales at 5000 per URL but is passed through
            // FromMilliseconds — confirm the intended unit (5 s/URL looks large).
            return new CheckDocuments(requestedUrls, ActorRefs.NoSender, TimeSpan.FromMilliseconds(requestedUrls.Count * 5000));
        }));
}
/// <summary>Command to begin crawling <paramref name="job"/>, replying to <paramref name="requestor"/>.</summary>
public StartJob(CrawlJob job, ActorRef requestor)
{
    Job = job;
    Requestor = requestor;
}
/// <summary>Notification that a download tracker was created for the given job key.</summary>
public CreatedTracker(CrawlJob key, ActorRef tracker)
{
    Key = key;
    Tracker = tracker;
}
/// <summary>Reply carrying the tracker located for the given job key.</summary>
public TrackerFound(CrawlJob key, ActorRef tracker)
{
    Key = key;
    Tracker = tracker;
}
/// <summary>Notification that the tracker for the given job key has terminated.</summary>
public TrackerDead(CrawlJob key)
{
    Key = key;
}
/// <summary>Reply indicating no tracker exists for the given job key.</summary>
public TrackerNotFound(CrawlJob key)
{
    Key = key;
}
/// <summary>Query for the download tracker associated with the given job key.</summary>
public GetDownloadTracker(CrawlJob key)
{
    Key = key;
}
/// <summary>Request to locate/create a download tracker on behalf of <paramref name="originator"/>.</summary>
public RequestDownloadTrackerFor(CrawlJob key, ActorRef originator)
{
    Key = key;
    Originator = originator;
}
/// <summary>Query for a currently running crawl matching the given job key.</summary>
public FindRunningJob(CrawlJob key)
{
    Key = key;
}
/// <summary>Command to halt the crawl for <paramref name="job"/>, replying to <paramref name="requestor"/>.</summary>
public StopJob(CrawlJob job, IActorRef requestor)
{
    Job = job;
    Requestor = requestor;
}
/// <summary>Notification that a download tracker was created for the given job key.</summary>
public CreatedTracker(CrawlJob key, IActorRef tracker)
{
    Key = key;
    Tracker = tracker;
}
/// <summary>
/// True when <paramref name="otherUri"/>'s host matches the crawl job's domain.
/// NOTE(review): comparison uses <c>==</c> as-is — Uri.Host is lowercased, so
/// confirm <c>Domain</c> is stored lowercase if it is a string.
/// </summary>
public static bool AbsoluteUriIsInDomain(CrawlJob jobRoot, Uri otherUri)
    => jobRoot.Domain == otherUri.Host;
/// <summary>Creates an empty stats record keyed to the given crawl job.</summary>
public CrawlJobStats(CrawlJob key)
{
    Key = key;
}
/// <summary>Request to remove <paramref name="subscriber"/> from a job's status updates.</summary>
public UnsubscribeFromJob(CrawlJob job, ActorRef subscriber)
{
    Job = job;
    Subscriber = subscriber;
}
/// <summary>Human-readable progress message about a document within a crawl job.</summary>
public JobStatusMessage(CrawlJob job, string documentTitle, string message)
{
    Job = job;
    DocumentTitle = documentTitle;
    Message = message;
}
/// <summary>Reply indicating no running crawl exists for the given job key.</summary>
public JobNotFound(CrawlJob key)
{
    Key = key;
}
/// <summary>
/// Converts <paramref name="rawUri"/> to an absolute URI — used verbatim when
/// already absolute, otherwise resolved against the job's root.
/// (Name misspelling is pre-existing; callers depend on it.)
/// </summary>
public static Uri ToAsboluteUri(CrawlJob jobRoot, string rawUri)
{
    if (Uri.IsWellFormedUriString(rawUri, UriKind.Absolute))
    {
        return new Uri(rawUri, UriKind.Absolute);
    }

    // Relative — resolve against the crawl root.
    return new Uri(jobRoot.Root, rawUri);
}
/// <summary>
/// Creates the initial "just starting" status update for <paramref name="job"/>,
/// timestamped in UTC.
/// </summary>
public JobStatusUpdate(CrawlJob job)
{
    Job = job;
    Status = JobStatus.Starting;
    StartTime = DateTime.UtcNow;
}
/// <summary>Request to remove <paramref name="subscriber"/> from a job's status updates.</summary>
public UnsubscribeFromJob(CrawlJob job, IActorRef subscriber)
{
    Job = job;
    Subscriber = subscriber;
}
/// <summary>Reply carrying the tracker located for the given job key.</summary>
public TrackerFound(CrawlJob key, IActorRef tracker)
{
    Key = key;
    Tracker = tracker;
}
/// <summary>Request to locate/create a download tracker on behalf of <paramref name="originator"/>.</summary>
public RequestDownloadTrackerFor(CrawlJob key, IActorRef originator)
{
    Key = key;
    Originator = originator;
}