Beispiel #1
0
        /// <summary>
        /// Crawls the site reachable from <paramref name="start"/>, uploading every successfully
        /// downloaded resource to the configured S3 bucket and following the links found in
        /// HTML, CSS and XML responses.
        /// </summary>
        /// <remarks>
        /// Two dataflow blocks are linked in a loop: a <c>BroadcastBlock</c> feeds a
        /// <c>TransformManyBlock</c> (up to four concurrent downloads) whose output links are fed
        /// back into the broadcast block. <c>_remainingLinks</c> tracks outstanding work; once it
        /// is empty the broadcast block is completed and this method returns.
        /// The crawl cancels itself once 10000 links have been recorded, and links at level 20 or
        /// deeper are not downloaded.
        /// Exceptions raised by the pipeline are caught and logged here; any failure other than a
        /// cancellation sets <c>_cancelledReason</c> to <c>CancelledReason.Error</c>.
        /// </remarks>
        /// <param name="start">Absolute URI at which the crawl begins (level 0).</param>
        /// <param name="basicUser">User name for HTTP Basic authentication; may be null or empty.</param>
        /// <param name="basicPassword">Password for HTTP Basic authentication; may be null or empty.</param>
        /// <returns>A task that completes when the crawl has finished, been cancelled, or failed.</returns>
        public async Task Crawl(Uri start, string basicUser, string basicPassword)
        {
            // Crawl limits: number of recorded links at which the crawl cancels itself, and the
            // link level from which downloads are skipped.
            const int MaxCrawledLinks = 10000;
            const int MaxLinkLevel    = 20;

            try
            {
                _startUrl      = start;
                _timer.Enabled = true;
                HostingEnvironment.RegisterObject(this);

                var broadcastBlock = new BroadcastBlock<UriItem>(li => li, new DataflowBlockOptions
                {
                    CancellationToken = _cts.Token
                });

                // Downloads one link, stores it in S3 and returns the not-yet-crawled links found in it.
                Func<UriItem, Task<IEnumerable<UriItem>>> downloadFromLink =
                    async link =>
                {
                    var nextLinks = new UriItem[] { };
                    try
                    {
                        log.Debug(link.AbsoluteUri);
                        if (_crawledLinks.Count >= MaxCrawledLinks)
                        {
                            _cancelledReason = CancelledReason.MaxLinks;
                            _cts.Cancel();
                        }

                        // TryAdd doubles as the "already crawled" check: only the first caller for a URI proceeds.
                        if (link.Level < MaxLinkLevel && !_cts.IsCancellationRequested && _crawledLinks.TryAdd(link.AbsoluteUri, link))
                        {
                            // NOTE(review): a new HttpClient is created per link; sharing one instance
                            // for the whole crawl would reuse connections - confirm before changing.
                            using (var client = new HttpClient(new HttpClientHandler
                            {
                                AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
                                AllowAutoRedirect      = false
                            }))
                            {
                                client.DefaultRequestHeaders.UserAgent.ParseAdd("PrimocaCrawler (http://www.primoca.com)");
                                if (!string.IsNullOrEmpty(basicUser) || !string.IsNullOrEmpty(basicPassword))
                                {
                                    client.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Basic", Convert.ToBase64String(Encoding.ASCII.GetBytes(basicUser + ":" + basicPassword)));
                                }

                                HttpResponseMessage r = null;
                                try
                                {
                                    r = await client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, _cts.Token);
                                }
                                catch (Exception ex)
                                {
                                    // A failed request is recorded as an error link but does not abort the crawl.
                                    log.Error(link.AbsoluteUri, ex);
                                    _errorLinks.Add(link.AbsoluteUri);
                                }

                                if (r != null)
                                {
                                    // Dispose the response on every path; with ResponseHeadersRead the
                                    // body is not buffered, so an undisposed response keeps its connection.
                                    using (r)
                                    {
                                        if (r.IsSuccessStatusCode)
                                        {
                                            using (var s = await r.Content.ReadAsStreamAsync())
                                            {
                                                try
                                                {
                                                    var pst = s;

                                                    var contentType = r.Content.Headers.ContentType;
                                                    var mediaType   = contentType != null ? contentType.MediaType : null;
                                                    var isParsableText = mediaType != null &&
                                                        (mediaType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase) ||
                                                         mediaType.StartsWith("text/css", StringComparison.OrdinalIgnoreCase) ||
                                                         mediaType.StartsWith("text/xml", StringComparison.OrdinalIgnoreCase));

                                                    if (isParsableText)
                                                    {
                                                        var html = await new StreamContent(s).ReadAsStringAsync();

                                                        // The decoded text is re-encoded as UTF-8 for upload.
                                                        pst       = new MemoryStream(Encoding.UTF8.GetBytes(html));
                                                        nextLinks = Find(html, r.RequestMessage.RequestUri)
                                                            .Where(x => !_crawledLinks.ContainsKey(x.AbsoluteUri))
                                                            .Select(x => new UriItem(x, link.Level + 1))
                                                            .ToArray();
                                                    }
                                                    else
                                                    {
                                                        pst = new MemoryStream(await new StreamContent(s).ReadAsByteArrayAsync());
                                                    }

                                                    var request = new Amazon.S3.Model.PutObjectRequest
                                                    {
                                                        BucketName  = CloudConfigurationManager.GetSetting("bucket"),
                                                        InputStream = pst,
                                                        // Paths ending in '/' get _rootAction appended; the leading '/' is
                                                        // removed to form the object key.
                                                        Key         = (link.PathAndQuery.EndsWith("/") ? (link.PathAndQuery + _rootAction) : link.PathAndQuery).Remove(0, 1)
                                                    };
                                                    S3.PutObject(request);
                                                }
                                                catch (Exception ex)
                                                {
                                                    log.Error("Error:" + r.StatusCode + " " + link.AbsoluteUri, ex);
                                                    // Rethrow without resetting the stack trace; the outer handler faults the pipeline.
                                                    throw;
                                                }
                                            }
                                        }
                                        else if ((int)r.StatusCode >= 400)
                                        {
                                            log.Debug("Crawl Error code: " + r.StatusCode + " " + link.AbsoluteUri);
                                            _errorLinks.Add(link.AbsoluteUri);
                                        }
                                        // Other non-success responses (auto-redirect is disabled, so this
                                        // includes 3xx) are neither stored nor recorded as errors.
                                    }
                                }
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        // Any unhandled failure for a single link faults the whole pipeline.
                        ((IDataflowBlock)broadcastBlock).Fault(ex);
                    }
                    finally
                    {
                        // Work accounting: this link is done and its children become pending.
                        // When nothing is pending the pipeline is completed.
                        lock (_remainingLinks)
                        {
                            _remainingLinks.Remove(link.AbsoluteUri);
                            foreach (var l in nextLinks)
                            {
                                _remainingLinks.Add(l.AbsoluteUri);
                            }
                            if (_remainingLinks.Count == 0)
                            {
                                broadcastBlock.Complete();
                            }
                        }
                    }
                    return nextLinks;
                };

                var linkFinderBlock = new TransformManyBlock<UriItem, UriItem>(downloadFromLink, new ExecutionDataflowBlockOptions
                {
                    MaxDegreeOfParallelism = 4
                });

                // Close the loop: discovered links flow back into the downloader.
                linkFinderBlock.LinkTo(broadcastBlock);
                broadcastBlock.LinkTo(linkFinderBlock);
                _remainingLinks.Add(start.AbsoluteUri);
                broadcastBlock.Post(new UriItem(start, 0));

                await broadcastBlock.Completion;
            }
            catch (Exception ex)
            {
                // Cancellation is an expected outcome (its reason is set by whoever cancelled).
                // TaskCanceledException derives from OperationCanceledException, so both are covered.
                if (!(ex is OperationCanceledException))
                {
                    _cancelledReason = CancelledReason.Error;
                    log.Error(start.AbsoluteUri, ex);
                }
            }
            finally
            {
                // Runs even if the handler above throws, so the object is always unregistered
                // and the crawl is always marked as completed.
                HostingEnvironment.UnregisterObject(this);
                _timer.Enabled = false;
                _timer.Dispose();
                _isCompleted = true;
            }
        }
Beispiel #2
0
 /// <summary>
 /// Cancels the crawl on behalf of the user: sets <c>_cancelledReason</c> to
 /// <c>CancelledReason.User</c> and then signals <c>_cts</c>.
 /// </summary>
 public void Cancel()
 {
     //log.Warn("User cancel: " + CrawlerKey);
     // The reason is assigned before the token source is signalled.
     _cancelledReason = CancelledReason.User;
     _cts.Cancel();
 }
Beispiel #3
0
 /// <summary>
 /// Cancels the crawl with reason <c>CancelledReason.AppDomainShutdown</c> and then
 /// signals <c>_cts</c>.
 /// </summary>
 /// <remarks>
 /// NOTE(review): presumably the hosting environment's stop callback for registered
 /// objects - confirm against the class declaration.
 /// </remarks>
 /// <param name="immediate">Not used; cancellation is requested the same way for either value.</param>
 public void Stop(bool immediate)
 {
     //log.Warn("AppDomain shutdown cancel: " + CrawlerKey);
     // The reason is assigned before the token source is signalled.
     _cancelledReason = CancelledReason.AppDomainShutdown;
     _cts.Cancel();
 }
Beispiel #4
0
 /// <summary>
 /// Timer elapsed handler: cancels the crawl with reason <c>CancelledReason.Timeout</c>
 /// and then signals <c>_cts</c>.
 /// </summary>
 /// <remarks>
 /// NOTE(review): presumably subscribed to the crawl's timeout timer - the subscription is
 /// not visible here; confirm.
 /// </remarks>
 /// <param name="sender">Event source; not used.</param>
 /// <param name="e">Event data; not used.</param>
 void timer_Elapsed(object sender, ElapsedEventArgs e)
 {
     //log.Warn("Timeout cancel: " + CrawlerKey);
     // The reason is assigned before the token source is signalled.
     _cancelledReason = CancelledReason.Timeout;
     _cts.Cancel();
 }