/// <summary>
/// Requests <paramref name="url"/> and, when a response body comes back, runs every rule group
/// registered for that address in its own child container before resuming <paramref name="container"/>.
/// </summary>
/// <param name="container">The container currently executing; its attributes are copied into each child container.</param>
/// <param name="url">The URL entry to load. Its status is set to DOING at the start and DONE at the end.</param>
/// <returns>A task that completes after <paramref name="container"/> has been advanced.</returns>
private async Task LoadNext(ISpiderContainer container, UriItem url)
{
    var spider = container.Application;
    spider.UrlProvider.UpdateItem(url, UriStatus.DOING);

    var content = await spider.RequestProvider.Getter().GetAsync(url.Source, spider.Option.HeaderItems, spider.ProxyProvider.Get());
    if (content == null)
    {
        // Nothing was fetched: mark the entry finished and let the parent container continue.
        spider.UrlProvider.UpdateItem(url, UriStatus.DONE);
        await container.NextAsync();
        return;
    }

    var rules = spider.RuleProvider.Get(url.Source);
    var keys = container.AttributeKeys;
    // The rule provider may have nothing registered for this address; treat that as "no rule groups".
    if (rules != null)
    {
        foreach (var item in rules)
        {
            // NOTE(review): the fetched content is only used as a success check here and is not
            // handed to the child container - confirm the child is expected to load its own data.
            var con = spider.GetContainer(url, spider.PluginLoader.Render(item.Rules));
            foreach (var key in keys)
            {
                // Inherit from the parent container. Reading the value back from the freshly
                // created child (as the code did before) copied nothing.
                con.SetAttribute(key, container.GetAttribute(key));
            }
            await con.NextAsync();
        }
    }

    spider.UrlProvider.UpdateItem(url, UriStatus.DONE);
    await container.NextAsync();
}
/// <summary>
/// Creates one container per rule group registered for <paramref name="url"/> and, if any rendered
/// rule group raised the "prepare" flag, downloads the page once and hands its content to every container.
/// </summary>
/// <param name="url">The URL entry to build containers for; its title is updated when the page is downloaded.</param>
/// <returns>
/// The created containers. The list is empty when no rule group exists for the address or when the
/// download returned no content.
/// </returns>
public async Task<IList<ISpiderContainer>> GetContainerAsync(UriItem url)
{
    var containers = new List<ISpiderContainer>();
    var ruleGroups = RuleProvider.Get(url.Source);
    var hasRules = ruleGroups != null && ruleGroups.Count > 0;
    if (!hasRules)
    {
        return containers;
    }

    // Render receives the flag by reference; any rule group may switch it on.
    var needsContent = false;
    foreach (var group in ruleGroups)
    {
        var rendered = PluginLoader.Render(group.Rules, ref needsContent);
        containers.Add(GetContainer(url, rendered));
    }

    if (needsContent)
    {
        var html = await RequestProvider.Getter().GetAsync(url.Source, Option.HeaderItems, ProxyProvider.Get());
        if (html == null)
        {
            Logger?.Waining($"{url.Source} HTML EMPTY");
            containers.Clear();
        }
        else
        {
            url.Title = Html.MatchTitle(html);
            foreach (var target in containers)
            {
                // Each container gets its own RuleString wrapper around the same page text.
                target.Data = new RuleString(html);
            }
        }
    }

    return containers;
}
/// <summary>
/// Creates a container bound to a spider, a URL entry and the rules to run against it.
/// </summary>
/// <param name="spider">The owning spider; its logger is reused by this container.</param>
/// <param name="url">The URL entry this container works on.</param>
/// <param name="rules">The rules assigned to this container.</param>
public SpiderContainer(ISpider spider, UriItem url, IList<IRule> rules)
{
    this.Application = spider;
    this.Logger = spider.Logger;
    this.Url = url;
    this.Rules = rules;
}
/// <summary>
/// Appends a new entry for <paramref name="url"/> unless <c>Contains</c> reports it is already present,
/// and raises <c>UrlChanged</c> with <c>true</c> for the new entry.
/// </summary>
/// <param name="url">The address stored as the entry's source.</param>
/// <param name="uriType">The type assigned to the new entry.</param>
/// <param name="status">The initial status assigned to the new entry.</param>
public void Add(string url, UriType uriType, UriStatus status)
{
    if (!Contains(url))
    {
        var entry = new UriItem
        {
            Source = url,
            Type = uriType,
            Status = status
        };
        Items.Add(entry);
        UrlChanged?.Invoke(entry, true);
    }
}
/// <summary>
/// Handles a click on one of the "remove" menu entries and removes the matching URL entries
/// from the current view model's URL provider, chosen by the clicked item's header text.
/// </summary>
/// <param name="sender">The clicked menu item; any other sender is ignored.</param>
/// <param name="e">Event data (unused).</param>
private void MenuItem_Click(object sender, RoutedEventArgs e)
{
    // Guard the cast: dereferencing an unchecked "as" result throws for a non-MenuItem sender.
    var menuItem = sender as MenuItem;
    if (menuItem == null)
    {
        return;
    }

    switch (menuItem.Header as string)
    {
        case "选中":
            // "Selected": snapshot the selection first, because removing entries may change it.
            var items = new UriItem[UrlListBox.SelectedItems.Count];
            UrlListBox.SelectedItems.CopyTo(items, 0);
            ViewModel.Instance?.UrlProvider.Remove(items);
            break;
        case "已完成":
            // "Completed": remove every entry whose status is DONE.
            ViewModel.Instance?.UrlProvider.Remove(UriStatus.DONE);
            break;
        case "全部":
            // "All": clear the whole list.
            ViewModel.Instance?.UrlProvider.Clear();
            break;
        default:
            // Unknown or non-string header: nothing to do.
            break;
    }
}
/// <summary>
/// Runs every container built for <paramref name="url"/> in sequence and records the outcome
/// as the entry's status.
/// </summary>
/// <remarks>
/// The status becomes ERROR when no container could be built, NONE when <c>Paused</c> is observed
/// after a container finishes, and DONE otherwise.
/// </remarks>
/// <param name="url">The URL entry to process.</param>
protected async Task RunTaskAsync(UriItem url)
{
    var containers = await GetContainerAsync(url);
    if (containers.Count < 1)
    {
        Logger?.Info($"{url.Source} has 0 rule groups, jump");
        UrlProvider.UpdateItem(url, UriStatus.ERROR);
        return;
    }

    Logger?.Info($"{url.Source} has {containers.Count} rule groups");
    UrlProvider.UpdateItem(url, UriStatus.DOING);

    // Assume completion; a pause seen after any container downgrades the result.
    var finalStatus = UriStatus.DONE;
    foreach (var container in containers)
    {
        await container.NextAsync();
        if (Paused)
        {
            finalStatus = UriStatus.NONE;
            break;
        }
    }

    UrlProvider.UpdateItem(url, finalStatus);
}
/// <summary>
/// Returns the file name the application's rule provider produces for the entry's source address.
/// </summary>
/// <param name="uri">The URL entry whose <c>Source</c> is passed to the rule provider.</param>
/// <returns>The value returned by <c>RuleProvider.GetFileName</c>.</returns>
public string GetFileName(UriItem uri)
    => Application.RuleProvider.GetFileName(uri.Source);
/// <summary>
/// Creates a new <c>SpiderContainer</c> owned by this instance for the given URL entry and rules.
/// </summary>
/// <param name="url">The URL entry the container will work on.</param>
/// <param name="rules">The rules the container will run.</param>
/// <returns>The newly created container.</returns>
public ISpiderContainer GetContainer(UriItem url, IList<IRule> rules)
    => new SpiderContainer(this, url, rules);
/// <summary>
/// Raises <c>UrlChanged</c> for <paramref name="item"/> with <c>false</c> as the second argument
/// (the same event is raised with <c>true</c> when an entry is added).
/// </summary>
/// <param name="item">The entry to report as changed.</param>
public void UpdateItem(UriItem item)
    => UrlChanged?.Invoke(item, false);
/// <summary>
/// Assigns <paramref name="status"/> to <paramref name="item"/> and then reports the entry as changed.
/// </summary>
/// <param name="item">The entry to modify.</param>
/// <param name="status">The status to store on the entry.</param>
public void UpdateItem(UriItem item, UriStatus status)
{
    item.Status = status;

    // Delegate to the single-argument overload so the change is reported in one place.
    UpdateItem(item);
}
/// <summary>
/// Replaces the entry stored at <paramref name="index"/> with <paramref name="item"/> and then
/// reports the new entry as changed.
/// </summary>
/// <param name="index">Position in <c>Items</c> to overwrite.</param>
/// <param name="item">The entry to store at that position.</param>
public void UpdateItem(int index, UriItem item)
{
    Items[index] = item;
    UpdateItem(item);
}
/// <summary>
/// Opens the stream for a URL entry by resolving its file name with <c>GetFileName</c> and
/// forwarding to the overload that takes that name.
/// </summary>
/// <param name="uri">The URL entry to open a stream for.</param>
/// <returns>The task returned by the name-based overload; its result may be <c>null</c>.</returns>
public Task<FileStream?> OpenStreamAsync(UriItem uri)
    => OpenStreamAsync(GetFileName(uri));
/// <summary>
/// Activates a hyperlink by passing the entry's <c>Uri</c> to <c>Windows.Infrastructure.Navigate</c>.
/// </summary>
/// <param name="item">The entry whose <c>Uri</c> is navigated to.</param>
public void ActivateHyperlink(UriItem item)
    => Windows.Infrastructure.Navigate(item.Uri);
/// <summary>
/// Crawls outward from <paramref name="start"/>: every successfully fetched response is uploaded to
/// the configured S3 bucket, and the URIs that <c>Find</c> extracts from text/html, text/css and
/// text/xml responses are queued for crawling in turn.
/// </summary>
/// <remarks>
/// Links are processed by a dataflow loop (broadcast block to link finder and back) with up to four
/// downloads in parallel. The crawl is cancelled once 10000 links have been recorded, and links at
/// level 20 or deeper are not fetched. Failures are logged and recorded in <c>_cancelledReason</c>
/// rather than thrown to the caller.
/// </remarks>
/// <param name="start">The address the crawl starts from (level 0).</param>
/// <param name="basicUser">User name for HTTP Basic authentication; may be null or empty.</param>
/// <param name="basicPassword">Password for HTTP Basic authentication; may be null or empty.</param>
/// <returns>A task that completes when the crawl has finished, failed or been cancelled.</returns>
public async Task Crawl(Uri start, string basicUser, string basicPassword)
{
    try
    {
        _startUrl = start;
        _timer.Enabled = true;
        HostingEnvironment.RegisterObject(this);

        var broadcastBlock = new BroadcastBlock<UriItem>(li => li, new DataflowBlockOptions { CancellationToken = _cts.Token });

        Func<UriItem, Task<IEnumerable<UriItem>>> downloadFromLink = async link =>
        {
            var nextLinks = new UriItem[] { };
            try
            {
                log.Debug(link.AbsoluteUri);

                if (_crawledLinks.Count >= 10000)
                {
                    _cancelledReason = CancelledReason.MaxLinks;
                    _cts.Cancel();
                }

                // TryAdd doubles as the "already crawled" check, so it must stay last in the condition.
                if (link.Level < 20 && !_cts.IsCancellationRequested && _crawledLinks.TryAdd(link.AbsoluteUri, link))
                {
                    using (var client = new HttpClient(new HttpClientHandler { AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate, AllowAutoRedirect = false }))
                    {
                        client.DefaultRequestHeaders.UserAgent.ParseAdd("PrimocaCrawler (http://www.primoca.com)");
                        if (!string.IsNullOrEmpty(basicUser) || !string.IsNullOrEmpty(basicPassword))
                        {
                            client.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Basic", Convert.ToBase64String(Encoding.ASCII.GetBytes(basicUser + ":" + basicPassword)));
                        }

                        HttpResponseMessage r = null;
                        try
                        {
                            r = await client.GetAsync(link, HttpCompletionOption.ResponseHeadersRead, _cts.Token);
                        }
                        catch (Exception ex)
                        {
                            // A failed request is recorded as an error link; the crawl itself continues.
                            log.Error(link.AbsoluteUri, ex);
                            _errorLinks.Add(link.AbsoluteUri);
                        }

                        if (r != null)
                        {
                            // With ResponseHeadersRead the body is still pending, so the response must be
                            // disposed on every path (including non-success statuses whose body is never read).
                            using (r)
                            {
                                if (r.IsSuccessStatusCode)
                                {
                                    using (var s = await r.Content.ReadAsStreamAsync())
                                    {
                                        try
                                        {
                                            Stream pst;
                                            if (r.Content.Headers.ContentType != null
                                                && r.Content.Headers.ContentType.MediaType != null
                                                && (r.Content.Headers.ContentType.MediaType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase)
                                                    || r.Content.Headers.ContentType.MediaType.StartsWith("text/css", StringComparison.OrdinalIgnoreCase)
                                                    || r.Content.Headers.ContentType.MediaType.StartsWith("text/xml", StringComparison.OrdinalIgnoreCase)))
                                            {
                                                // Text content is stored as fetched (re-encoded as UTF-8) and scanned for further links.
                                                var html = await new StreamContent(s).ReadAsStringAsync();
                                                pst = new MemoryStream(Encoding.UTF8.GetBytes(html));
                                                nextLinks = Find(html, r.RequestMessage.RequestUri)
                                                    .Where(x => !_crawledLinks.ContainsKey(x.AbsoluteUri))
                                                    .Select(x => new UriItem(x, link.Level + 1))
                                                    .ToArray();
                                            }
                                            else
                                            {
                                                // Any other content type is buffered and uploaded unchanged.
                                                pst = new MemoryStream(await new StreamContent(s).ReadAsByteArrayAsync());
                                            }

                                            var request = new Amazon.S3.Model.PutObjectRequest
                                            {
                                                BucketName = CloudConfigurationManager.GetSetting("bucket"),
                                                InputStream = pst,
                                                // Directory-style paths get the root action appended; the leading '/' is dropped.
                                                Key = (link.PathAndQuery.EndsWith("/", StringComparison.Ordinal) ? (link.PathAndQuery + _rootAction) : link.PathAndQuery).Remove(0, 1)
                                            };
                                            S3.PutObject(request);
                                        }
                                        catch (Exception ex)
                                        {
                                            log.Error("Error:" + r.StatusCode + " " + link.AbsoluteUri, ex);
                                            // Rethrow without resetting the stack trace; the outer catch faults the block.
                                            throw;
                                        }
                                    }
                                }
                                else if ((int)r.StatusCode >= 400)
                                {
                                    log.Debug("Crawl Error code: " + r.StatusCode + " " + link.AbsoluteUri);
                                    _errorLinks.Add(link.AbsoluteUri);
                                }
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                ((IDataflowBlock)broadcastBlock).Fault(ex);
            }
            finally
            {
                // Bookkeeping of outstanding links decides when the cyclic pipeline can be completed.
                lock (_remainingLinks)
                {
                    _remainingLinks.Remove(link.AbsoluteUri);
                    foreach (var l in nextLinks)
                    {
                        _remainingLinks.Add(l.AbsoluteUri);
                    }
                    if (_remainingLinks.Count == 0)
                    {
                        broadcastBlock.Complete();
                    }
                }
            }
            return nextLinks;
        };

        var linkFinderBlock = new TransformManyBlock<UriItem, UriItem>(downloadFromLink, new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 4 });

        // The two blocks feed each other: discovered links are broadcast back into the finder.
        linkFinderBlock.LinkTo(broadcastBlock);
        broadcastBlock.LinkTo(linkFinderBlock);

        _remainingLinks.Add(start.AbsoluteUri);
        broadcastBlock.Post(new UriItem(start, 0));
        await broadcastBlock.Completion;
    }
    catch (Exception ex)
    {
        // Cancellation already set its own reason; anything else is an error.
        if (!(ex is TaskCanceledException))
        {
            _cancelledReason = CancelledReason.Error;
            log.Error(start.AbsoluteUri, ex);
        }
    }

    HostingEnvironment.UnregisterObject(this);
    _timer.Enabled = false;
    _timer.Dispose();
    _isCompleted = true;
}
/// <summary>
/// Activates a hyperlink by passing the entry to <c>CopyToClipboard</c>.
/// </summary>
/// <param name="item">The entry handed to <c>CopyToClipboard</c>.</param>
public void ActivateHyperlink(UriItem item)
    => CopyToClipboard(item);
/// <summary>
/// Creates the stream for a URL entry by rendering a path from its source address with
/// <c>Disk.RenderFile</c> and forwarding to the overload that takes that value.
/// </summary>
/// <param name="uri">The URL entry whose <c>Source</c> determines the target.</param>
/// <returns>The task returned by the forwarded overload.</returns>
public Task<FileStream> CreateStreamAsync(UriItem uri)
{
    var target = Disk.RenderFile(uri.Source);
    return CreateStreamAsync(target);
}
/// <summary>
/// Stores <paramref name="data"/> for a URL entry by rendering a path from its source address with
/// <c>Disk.RenderFile</c> and forwarding both to the overload that takes that value.
/// </summary>
/// <param name="uri">The URL entry whose <c>Source</c> determines the target.</param>
/// <param name="data">The bytes passed through to the forwarded overload.</param>
/// <returns>The task returned by the forwarded overload.</returns>
public Task CreateAsync(UriItem uri, byte[] data)
    => CreateAsync(Disk.RenderFile(uri.Source), data);