Beispiel #1
0
        /// <summary>
        /// Downloads <paramref name="url"/>, runs one child container per rule group
        /// registered for it, then resumes the calling <paramref name="container"/>.
        /// </summary>
        /// <param name="container">The container that requested the load; its attributes are copied to every child container.</param>
        /// <param name="url">The URL item to fetch; its status is set to DOING at the start and DONE at the end.</param>
        /// <returns>A task that completes after the parent container's <c>NextAsync</c> has finished.</returns>
        private async Task LoadNext(ISpiderContainer container, UriItem url)
        {
            var spider = container.Application;

            spider.UrlProvider.UpdateItem(url, UriStatus.DOING);
            var content = await spider.RequestProvider.Getter().GetAsync(url.Source,
                                                                         spider.Option.HeaderItems,
                                                                         spider.ProxyProvider.Get());

            // NOTE(review): the downloaded content is only null-checked here and never handed
            // to the child containers — confirm whether they are expected to receive it.
            if (content != null)
            {
                var rules = spider.RuleProvider.Get(url.Source);
                var keys  = container.AttributeKeys;

                // GetContainerAsync treats a null result from RuleProvider.Get as "no rules";
                // do the same here instead of failing inside foreach.
                if (rules != null)
                {
                    foreach (var item in rules)
                    {
                        var con = spider.GetContainer(url, spider.PluginLoader.Render(item.Rules));
                        foreach (var key in keys)
                        {
                            // Copy from the parent container: reading the value back from the
                            // freshly created child would be a no-op.
                            con.SetAttribute(key, container.GetAttribute(key));
                        }
                        await con.NextAsync();
                    }
                }
            }

            // Whether or not anything was downloaded, the URL is finished and the parent continues.
            spider.UrlProvider.UpdateItem(url, UriStatus.DONE);
            await container.NextAsync();
        }
Beispiel #2
0
        /// <summary>
        /// Builds one container per rule group registered for <paramref name="url"/> and,
        /// when the rendered rules ask for it, downloads the page and attaches its content
        /// to every container.
        /// </summary>
        /// <param name="url">The URL item to build containers for; its <c>Title</c> is set when the page is downloaded.</param>
        /// <returns>
        /// The containers to run. The list is empty when no rules exist or when the
        /// required download returned no content.
        /// </returns>
        public async Task<IList<ISpiderContainer>> GetContainerAsync(UriItem url)
        {
            var containers = new List<ISpiderContainer>();
            var ruleGroups = RuleProvider.Get(url.Source);

            if (ruleGroups == null || ruleGroups.Count < 1)
            {
                return containers;
            }

            // Render sets this flag by reference when the content has to be fetched up front.
            var needsContent = false;
            foreach (var group in ruleGroups)
            {
                var rendered = PluginLoader.Render(group.Rules, ref needsContent);
                containers.Add(GetContainer(url, rendered));
            }

            if (needsContent)
            {
                var html = await RequestProvider.Getter().GetAsync(url.Source, Option.HeaderItems, ProxyProvider.Get());

                if (html == null)
                {
                    Logger?.Waining($"{url.Source} HTML EMPTY");
                    containers.Clear();
                }
                else
                {
                    url.Title = Html.MatchTitle(html);
                    foreach (var target in containers)
                    {
                        target.Data = new RuleString(html);
                    }
                }
            }

            return containers;
        }
Beispiel #3
0
 /// <summary>
 /// Creates a container bound to a spider, a URL item and the rules to run against it.
 /// </summary>
 /// <param name="spider">The owning spider; stored as <c>Application</c> and used as the source of <c>Logger</c>.</param>
 /// <param name="url">The URL item this container works on.</param>
 /// <param name="rules">The rules assigned to this container.</param>
 public SpiderContainer(ISpider spider, UriItem url, IList <IRule> rules)
 {
     Application = spider;
     // The container shares the spider's logger rather than owning one.
     Logger      = spider.Logger;
     Url         = url;
     Rules       = rules;
 }
        /// <summary>
        /// Adds a new URL entry unless <c>Contains</c> reports the URL as already present,
        /// then raises <c>UrlChanged</c> for the new entry.
        /// </summary>
        /// <param name="url">The URL to store as the entry's <c>Source</c>.</param>
        /// <param name="uriType">The type assigned to the new entry.</param>
        /// <param name="status">The initial status of the new entry.</param>
        public void Add(string url, UriType uriType, UriStatus status)
        {
            if (!Contains(url))
            {
                var entry = new UriItem
                {
                    Source = url,
                    Type   = uriType,
                    Status = status
                };

                Items.Add(entry);
                // The second argument is true here and false in UpdateItem;
                // presumably it flags a newly added entry — confirm against subscribers.
                UrlChanged?.Invoke(entry, true);
            }
        }
Beispiel #5
0
        /// <summary>
        /// Handles a click on one of the URL-list menu entries and removes the matching
        /// URLs from the provider, chosen by the menu item's header text.
        /// </summary>
        /// <param name="sender">The clicked menu item; any other sender is ignored.</param>
        /// <param name="e">Event data (unused).</param>
        private void MenuItem_Click(object sender, RoutedEventArgs e)
        {
            // Guard the cast: dereferencing an unchecked "as" result would throw a
            // NullReferenceException if the handler were wired to a non-MenuItem.
            var menuItem = sender as MenuItem;
            if (menuItem == null)
            {
                return;
            }

            switch (menuItem.Header as string)
            {
            case "选中": // "selected": remove the entries currently selected in the list box
                var items = new UriItem[UrlListBox.SelectedItems.Count];
                UrlListBox.SelectedItems.CopyTo(items, 0);
                ViewModel.Instance?.UrlProvider.Remove(items);
                break;

            case "已完成": // "completed": remove every entry whose status is DONE
                ViewModel.Instance?.UrlProvider.Remove(UriStatus.DONE);
                break;

            case "全部": // "all": clear the whole list
                ViewModel.Instance?.UrlProvider.Clear();
                break;

            default:
                break;
            }
        }
Beispiel #6
0
        /// <summary>
        /// Runs every rule-group container for <paramref name="url"/> in sequence and
        /// records the outcome in the URL's status.
        /// </summary>
        /// <param name="url">The URL item to process.</param>
        /// <returns>A task that completes when all containers have run, or when the run is paused.</returns>
        protected async Task RunTaskAsync(UriItem url)
        {
            var items = await GetContainerAsync(url);

            if (items.Count <= 0)
            {
                // Nothing to run for this URL: mark it as failed and skip it.
                Logger?.Info($"{url.Source} has 0 rule groups, jump");
                UrlProvider.UpdateItem(url, UriStatus.ERROR);
                return;
            }

            Logger?.Info($"{url.Source} has {items.Count} rule groups");
            UrlProvider.UpdateItem(url, UriStatus.DOING);

            foreach (var container in items)
            {
                await container.NextAsync();

                if (!Paused)
                {
                    continue;
                }

                // Paused mid-run: reset the status to NONE and stop.
                UrlProvider.UpdateItem(url, UriStatus.NONE);
                return;
            }

            UrlProvider.UpdateItem(url, UriStatus.DONE);
        }
Beispiel #7
0
 /// <summary>
 /// Asks the application's rule provider for the file name that belongs to the item's source URL.
 /// </summary>
 /// <param name="uri">The URL item whose <c>Source</c> is looked up.</param>
 /// <returns>Whatever <c>RuleProvider.GetFileName</c> returns for that source.</returns>
 public string GetFileName(UriItem uri)
     => Application.RuleProvider.GetFileName(uri.Source);
Beispiel #8
0
 /// <summary>
 /// Creates a new <c>SpiderContainer</c> owned by this spider for the given URL and rules.
 /// </summary>
 /// <param name="url">The URL item the container will process.</param>
 /// <param name="rules">The rules the container will run.</param>
 /// <returns>A freshly constructed container.</returns>
 public ISpiderContainer GetContainer(UriItem url, IList<IRule> rules)
     => new SpiderContainer(this, url, rules);
 /// <summary>
 /// Raises <c>UrlChanged</c> for <paramref name="item"/>, passing <c>false</c> as the second argument.
 /// </summary>
 /// <param name="item">The item that changed.</param>
 public void UpdateItem(UriItem item)
     => UrlChanged?.Invoke(item, false);
 /// <summary>
 /// Sets the item's status and then raises the change notification for it.
 /// </summary>
 /// <param name="item">The item to modify.</param>
 /// <param name="status">The status to assign.</param>
 public void UpdateItem(UriItem item, UriStatus status)
 {
     // Assign first so that subscribers observe the new status.
     item.Status = status;

     UpdateItem(item);
 }
 /// <summary>
 /// Replaces the entry at <paramref name="index"/> with <paramref name="item"/> and
 /// raises the change notification for it.
 /// </summary>
 /// <param name="index">Position in <c>Items</c> to overwrite.</param>
 /// <param name="item">The item to store at that position.</param>
 public void UpdateItem(int index, UriItem item)
 {
     Items[index] = item;
     UpdateItem(item);
 }
Beispiel #12
0
 /// <summary>
 /// Resolves the file name for <paramref name="uri"/> via <c>GetFileName</c> and forwards
 /// to the file-name overload of <c>OpenStreamAsync</c>.
 /// </summary>
 /// <param name="uri">The URL item whose file should be opened.</param>
 /// <returns>The task produced by the file-name overload.</returns>
 public Task<FileStream?> OpenStreamAsync(UriItem uri)
     => OpenStreamAsync(GetFileName(uri));
 /// <summary>
 /// Activates the hyperlink by passing the item's <c>Uri</c> to <c>Windows.Infrastructure.Navigate</c>.
 /// </summary>
 /// <param name="item">The hyperlink item to activate.</param>
 public void ActivateHyperlink(UriItem item)
     => Windows.Infrastructure.Navigate(item.Uri);
Beispiel #14
0
        /// <summary>
        /// Crawls outward from <paramref name="start"/>: every link is downloaded, successful
        /// response bodies are uploaded to the S3 bucket named by the "bucket" setting, and links
        /// found in HTML/CSS/XML responses are fed back into the pipeline.
        /// </summary>
        /// <remarks>
        /// The crawl ends when no links remain, when the cancellation token is triggered
        /// (including the 10000-link cap), or when the pipeline faults. Links at level 20 or
        /// deeper are not downloaded. Up to four downloads run concurrently.
        /// </remarks>
        /// <param name="start">The URL the crawl begins at (level 0).</param>
        /// <param name="basicUser">User name for HTTP Basic authentication; may be null or empty.</param>
        /// <param name="basicPassword">Password for HTTP Basic authentication; may be null or empty.</param>
        /// <returns>A task that completes once the crawl has finished and cleanup has run; exceptions are logged, not rethrown.</returns>
        public async Task Crawl(Uri start, string basicUser, string basicPassword)
        {
            try
            {
                _startUrl      = start;
                _timer.Enabled = true;
                HostingEnvironment.RegisterObject(this);

                var broadcastBlock = new BroadcastBlock <UriItem>(li => li, new DataflowBlockOptions {
                    CancellationToken = _cts.Token
                });

                // Downloads one link, stores its body and returns the not-yet-crawled links found in it.
                Func <UriItem, Task <IEnumerable <UriItem> > > downloadFromLink =
                    async link =>
                {
                    var nextLinks = new UriItem[] { };
                    try
                    {
                        log.Debug(link.AbsoluteUri);
                        if (_crawledLinks.Count >= 10000)
                        {
                            //log.Warn("Max Links cancel: " + _crawledLinks.Count + " " + CrawlerKey + " " + start.AbsoluteUri);
                            _cancelledReason = CancelledReason.MaxLinks;
                            _cts.Cancel();
                        }

                        // TryAdd doubles as the "already crawled" check, so each URL is fetched at most once.
                        if (link.Level < 20 && !_cts.IsCancellationRequested && _crawledLinks.TryAdd(link.AbsoluteUri, link))
                        {
                            // NOTE(review): a new HttpClient is created per link; consider sharing one instance.
                            using (var client = new HttpClient(new HttpClientHandler {
                                AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate, AllowAutoRedirect = false
                            }))
                            {
                                client.DefaultRequestHeaders.UserAgent.ParseAdd("PrimocaCrawler (http://www.primoca.com)");
                                if (!string.IsNullOrEmpty(basicUser) || !string.IsNullOrEmpty(basicPassword))
                                {
                                    client.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Basic", Convert.ToBase64String(Encoding.ASCII.GetBytes(basicUser + ":" + basicPassword)));
                                }
                                HttpResponseMessage r = null;
                                try
                                {
                                    r = await client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, _cts.Token);
                                }
                                catch (Exception ex)
                                {
                                    // A failed request is recorded and skipped; it does not abort the crawl.
                                    log.Error(link.AbsoluteUri, ex);
                                    _errorLinks.Add(link.AbsoluteUri);
                                }
                                if (r != null)
                                {
                                    if (r.IsSuccessStatusCode)
                                    {
                                        using (var s = await r.Content.ReadAsStreamAsync())
                                        {
                                            try
                                            {
                                                var pst = s;
                                                if (r.Content.Headers.ContentType != null && r.Content.Headers.ContentType.MediaType != null &&
                                                    (r.Content.Headers.ContentType.MediaType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase) ||
                                                     r.Content.Headers.ContentType.MediaType.StartsWith("text/css", StringComparison.OrdinalIgnoreCase) ||
                                                     r.Content.Headers.ContentType.MediaType.StartsWith("text/xml", StringComparison.OrdinalIgnoreCase)))
                                                {
                                                    // Text content is read fully so it can be scanned for further links.
                                                    var html = await new StreamContent(s).ReadAsStringAsync();
                                                    //var modHtml = Regex.Replace(html, start.GetLeftPart(UriPartial.Authority), "http://" + _app.ActiveHostname, RegexOptions.IgnoreCase);
                                                    //if (!Object.ReferenceEquals(html, modHtml))
                                                    //{
                                                    //	pst = new MemoryStream(Encoding.UTF8.GetBytes(modHtml));
                                                    //}
                                                    pst       = new MemoryStream(Encoding.UTF8.GetBytes(html));
                                                    nextLinks = Find(html, r.RequestMessage.RequestUri).Where(x => !_crawledLinks.ContainsKey(x.AbsoluteUri)).Select(x => new UriItem(x, link.Level + 1)).ToArray();
                                                }
                                                else
                                                {
                                                    pst = new MemoryStream(await new StreamContent(s).ReadAsByteArrayAsync());
                                                }
                                                var request = new Amazon.S3.Model.PutObjectRequest
                                                {
                                                    BucketName  = CloudConfigurationManager.GetSetting("bucket"),
                                                    InputStream = pst,
                                                    // Paths ending in "/" are stored under the root action name; the leading "/" is stripped.
                                                    Key         = (link.PathAndQuery.EndsWith("/") ? (link.PathAndQuery + _rootAction) : link.PathAndQuery).Remove(0, 1)
                                                };
                                                // NOTE(review): synchronous upload inside an async delegate — confirm this is intended.
                                                S3.PutObject(request);
                                            }
                                            catch (Exception ex)
                                            {
                                                log.Error("Error:" + r.StatusCode + " " + link.AbsoluteUri, ex);
                                                // Rethrow without resetting the stack trace; the outer catch faults the pipeline.
                                                throw;
                                            }
                                        }
                                    }
                                    else if ((int)r.StatusCode >= 400)
                                    {
                                        log.Debug("Crawl Error code: " + r.StatusCode + " " + link.AbsoluteUri);
                                        _errorLinks.Add(link.AbsoluteUri);
                                    }
                                }
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        ((IDataflowBlock)broadcastBlock).Fault(ex);
                    }
                    finally
                    {
                        // Bookkeeping of outstanding links; the pipeline completes once none remain.
                        lock (_remainingLinks)
                        {
                            _remainingLinks.Remove(link.AbsoluteUri);
                            foreach (var l in nextLinks)
                            {
                                _remainingLinks.Add(l.AbsoluteUri);
                            }
                            if (_remainingLinks.Count == 0)
                            {
                                broadcastBlock.Complete();
                            }
                        }
                    }
                    return(nextLinks);
                };

                var linkFinderBlock = new TransformManyBlock <UriItem, UriItem>(downloadFromLink, new ExecutionDataflowBlockOptions {
                    MaxDegreeOfParallelism = 4
                });

                // The two blocks form a cycle: discovered links are broadcast back into the downloader.
                linkFinderBlock.LinkTo(broadcastBlock);
                broadcastBlock.LinkTo(linkFinderBlock);
                _remainingLinks.Add(start.AbsoluteUri);
                broadcastBlock.Post(new UriItem(start, 0));

                await broadcastBlock.Completion;
            }
            catch (Exception ex)
            {
                // Cancellation is an expected way to stop; anything else is recorded as an error.
                if (!(ex is TaskCanceledException))
                {
                    _cancelledReason = CancelledReason.Error;
                    log.Error(start.AbsoluteUri, ex);
                }
            }

            HostingEnvironment.UnregisterObject(this);
            _timer.Enabled = false;
            _timer.Dispose();
            _isCompleted = true;
        }
Beispiel #15
0
 /// <summary>
 /// Activates the hyperlink by handing the item to <c>CopyToClipboard</c>.
 /// </summary>
 /// <param name="item">The hyperlink item to activate.</param>
 public void ActivateHyperlink(UriItem item)
     => CopyToClipboard(item);
Beispiel #16
0
 /// <summary>
 /// Maps the item's source URL to a file via <c>Disk.RenderFile</c> and forwards to the
 /// overload of <c>CreateStreamAsync</c> that takes that result.
 /// </summary>
 /// <param name="uri">The URL item whose file stream should be created.</param>
 /// <returns>The task produced by the forwarded overload.</returns>
 public Task<FileStream> CreateStreamAsync(UriItem uri)
     => CreateStreamAsync(Disk.RenderFile(uri.Source));
Beispiel #17
0
 /// <summary>
 /// Maps the item's source URL to a file via <c>Disk.RenderFile</c> and forwards the data
 /// to the overload of <c>CreateAsync</c> that takes that result.
 /// </summary>
 /// <param name="uri">The URL item that determines the target file.</param>
 /// <param name="data">The bytes passed through to the forwarded overload.</param>
 /// <returns>The task produced by the forwarded overload.</returns>
 public Task CreateAsync(UriItem uri, byte[] data)
     => CreateAsync(Disk.RenderFile(uri.Source), data);