/// <summary>
/// Crawls the site starting at <paramref name="start"/>, following the links returned by
/// <c>Find</c>, and uploads every successfully downloaded resource to the S3 bucket named by
/// the "bucket" configuration setting.
/// </summary>
/// <param name="start">URL the crawl starts from (posted at level 0).</param>
/// <param name="basicUser">Optional HTTP Basic user name; may be null or empty.</param>
/// <param name="basicPassword">Optional HTTP Basic password; may be null or empty.</param>
/// <returns>
/// A task that completes when no links remain, the crawl is cancelled, or it faults.
/// Failures are logged and recorded in <c>_cancelledReason</c> instead of being propagated.
/// </returns>
public async Task Crawl(Uri start, string basicUser, string basicPassword)
{
    // Hard limits that keep a single crawl bounded.
    const int maxCrawledLinks = 10000;
    const int maxLinkDepth = 20;

    try
    {
        _startUrl = start;
        _timer.Enabled = true;
        HostingEnvironment.RegisterObject(this);

        var broadcastBlock = new BroadcastBlock<UriItem>(li => li, new DataflowBlockOptions { CancellationToken = _cts.Token });

        // The credentials do not change during a crawl, so build the header once.
        System.Net.Http.Headers.AuthenticationHeaderValue authorization = null;
        if (!string.IsNullOrEmpty(basicUser) || !string.IsNullOrEmpty(basicPassword))
        {
            authorization = new System.Net.Http.Headers.AuthenticationHeaderValue(
                "Basic",
                Convert.ToBase64String(Encoding.ASCII.GetBytes(basicUser + ":" + basicPassword)));
        }

        // Downloads one link, uploads its content and returns the links discovered in it.
        Func<UriItem, Task<IEnumerable<UriItem>>> downloadFromLink = async link =>
        {
            var nextLinks = new UriItem[] { };
            try
            {
                log.Debug(link.AbsoluteUri);

                if (_crawledLinks.Count >= maxCrawledLinks)
                {
                    _cancelledReason = CancelledReason.MaxLinks;
                    _cts.Cancel();
                }

                // TryAdd doubles as the "already crawled" check, so each URL is fetched at most once.
                if (link.Level < maxLinkDepth && !_cts.IsCancellationRequested && _crawledLinks.TryAdd(link.AbsoluteUri, link))
                {
                    // NOTE(review): a new HttpClient per link is kept from the original; sharing one
                    // instance per crawl would be cheaper but changes cookie/lifetime behavior.
                    var handler = new HttpClientHandler
                    {
                        AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
                        AllowAutoRedirect = false
                    };
                    using (var client = new HttpClient(handler))
                    {
                        client.DefaultRequestHeaders.UserAgent.ParseAdd("PrimocaCrawler (http://www.primoca.com)");
                        if (authorization != null)
                        {
                            client.DefaultRequestHeaders.Authorization = authorization;
                        }

                        HttpResponseMessage response = null;
                        try
                        {
                            response = await client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, _cts.Token);
                        }
                        catch (Exception ex)
                        {
                            // A request aborted by this crawl's own cancellation is not a broken link.
                            if (!(ex is OperationCanceledException && _cts.IsCancellationRequested))
                            {
                                log.Error(link.AbsoluteUri, ex);
                                _errorLinks.Add(link.AbsoluteUri);
                            }
                        }

                        if (response != null)
                        {
                            // With ResponseHeadersRead the body is still pending; disposing the response
                            // releases it even when the body is never read (redirects, error codes).
                            using (response)
                            {
                                if (response.IsSuccessStatusCode)
                                {
                                    using (var body = await response.Content.ReadAsStreamAsync())
                                    {
                                        try
                                        {
                                            var contentType = response.Content.Headers.ContentType;
                                            var mediaType = contentType != null ? contentType.MediaType : null;
                                            var isText = mediaType != null
                                                && (mediaType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase)
                                                    || mediaType.StartsWith("text/css", StringComparison.OrdinalIgnoreCase)
                                                    || mediaType.StartsWith("text/xml", StringComparison.OrdinalIgnoreCase));

                                            Stream payload;
                                            if (isText)
                                            {
                                                // Text is decoded so it can be scanned for links, then re-encoded as UTF-8 for upload.
                                                var html = await new StreamContent(body).ReadAsStringAsync();
                                                payload = new MemoryStream(Encoding.UTF8.GetBytes(html));
                                                nextLinks = Find(html, response.RequestMessage.RequestUri)
                                                    .Where(x => !_crawledLinks.ContainsKey(x.AbsoluteUri))
                                                    .Select(x => new UriItem(x, link.Level + 1))
                                                    .ToArray();
                                            }
                                            else
                                            {
                                                payload = new MemoryStream(await new StreamContent(body).ReadAsByteArrayAsync());
                                            }

                                            // Directory-style URLs are stored under the root action name;
                                            // the leading '/' is dropped to form the object key.
                                            var path = link.PathAndQuery;
                                            if (path.EndsWith("/", StringComparison.Ordinal))
                                            {
                                                path = path + _rootAction;
                                            }

                                            var request = new Amazon.S3.Model.PutObjectRequest
                                            {
                                                BucketName = CloudConfigurationManager.GetSetting("bucket"),
                                                InputStream = payload,
                                                Key = path.Remove(0, 1)
                                            };

                                            // NOTE(review): synchronous upload inside an async delegate blocks a worker thread.
                                            S3.PutObject(request);
                                        }
                                        catch (Exception ex)
                                        {
                                            log.Error("Error:" + response.StatusCode + " " + link.AbsoluteUri, ex);
                                            throw; // rethrow without resetting the stack trace
                                        }
                                    }
                                }
                                else if ((int)response.StatusCode >= 400)
                                {
                                    log.Debug("Crawl Error code: " + response.StatusCode + " " + link.AbsoluteUri);
                                    _errorLinks.Add(link.AbsoluteUri);
                                }
                                // Other statuses (e.g. 3xx, since redirects are not followed) are ignored.
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Any unexpected failure aborts the whole crawl through the broadcast block.
                ((IDataflowBlock)broadcastBlock).Fault(ex);
            }
            finally
            {
                // Bookkeeping that detects the end of the crawl: once no link is outstanding,
                // the broadcast block is completed, which ends the await at the bottom of Crawl.
                lock (_remainingLinks)
                {
                    _remainingLinks.Remove(link.AbsoluteUri);
                    foreach (var l in nextLinks)
                    {
                        _remainingLinks.Add(l.AbsoluteUri);
                    }

                    if (_remainingLinks.Count == 0)
                    {
                        broadcastBlock.Complete();
                    }
                }
            }

            return nextLinks;
        };

        var linkFinderBlock = new TransformManyBlock<UriItem, UriItem>(downloadFromLink, new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 });

        // The two blocks feed each other: discovered links loop back into the downloader.
        linkFinderBlock.LinkTo(broadcastBlock);
        broadcastBlock.LinkTo(linkFinderBlock);

        _remainingLinks.Add(start.AbsoluteUri);
        broadcastBlock.Post(new UriItem(start, 0));

        await broadcastBlock.Completion;
    }
    catch (Exception ex)
    {
        // Cancellation already recorded its reason; anything else is a genuine error.
        if (!(ex is OperationCanceledException))
        {
            _cancelledReason = CancelledReason.Error;
            log.Error(start.AbsoluteUri, ex);
        }
    }
    finally
    {
        // Runs even if the handler above throws, so the object is always unregistered.
        HostingEnvironment.UnregisterObject(this);
        _timer.Enabled = false;
        _timer.Dispose();
        _isCompleted = true;
    }
}
/// <summary>
/// Cancels the crawl on behalf of the user: records <c>CancelledReason.User</c>
/// and then signals the shared cancellation token source.
/// </summary>
public void Cancel()
{
    // The reason is stored before signalling so it is already set when cancellation is observed.
    _cancelledReason = CancelledReason.User;
    _cts.Cancel();
}
/// <summary>
/// Cancels the crawl with <c>CancelledReason.AppDomainShutdown</c>.
/// </summary>
/// <param name="immediate">Not used; the crawl is cancelled the same way for either value.</param>
/// <remarks>
/// NOTE(review): presumably the hosting environment's stop callback, since <c>Crawl</c>
/// registers this object with <c>HostingEnvironment</c> — confirm against the class declaration.
/// </remarks>
public void Stop(bool immediate)
{
    // The reason is stored before signalling so it is already set when cancellation is observed.
    _cancelledReason = CancelledReason.AppDomainShutdown;
    _cts.Cancel();
}
/// <summary>
/// Elapsed-event handler that cancels the crawl with <c>CancelledReason.Timeout</c>.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
/// <remarks>
/// NOTE(review): presumably subscribed to <c>_timer.Elapsed</c> elsewhere in the class — confirm.
/// </remarks>
void timer_Elapsed(object sender, ElapsedEventArgs e)
{
    // The reason is stored before signalling so it is already set when cancellation is observed.
    _cancelledReason = CancelledReason.Timeout;
    _cts.Cancel();
}