/// <summary>
/// Downloads the resource at the item's <c>Uri</c> as a string, using a handler
/// built from the item's cookie and proxy.
/// </summary>
/// <param name="catchItem">Item describing the request; may be null.</param>
/// <returns>
/// The response body, or <see cref="string.Empty"/> when <paramref name="catchItem"/>
/// is null or the request throws (the failure is logged at debug level).
/// </returns>
public async Task<string> RequestHTML(ICatchItem catchItem)
{
    if (catchItem is null)
    {
        return string.Empty;
    }

    Uri address = catchItem.Uri;
    Uri proxy = catchItem.Proxy;
    string cookie = catchItem.Cookie;

    try
    {
        var webCrawlerHttpClientHandler = new WebCrawlerHttpClientHandler(cookie, proxy);

        // The handler is built per request from this item's cookie/proxy, so the client
        // cannot be shared; dispose it (and, by default, its handler) once the body is read.
        using (var httpClient = new HttpClient(webCrawlerHttpClientHandler))
        {
            return await httpClient.GetStringAsync(address);
        }
    }
    catch (Exception ex)
    {
        // address may itself be null (which is one way to end up here), so it must not
        // be dereferenced unconditionally inside the handler.
        logger.DebugFormat("Error RequestHTML address:{0}: {1}", address?.AbsoluteUri, ex.Message);
        return string.Empty;
    }
}
/// <summary>
/// Parses a work-list page: for every "Gallery" node found by the
/// <c>PageType.AllWork</c> rule, builds a <c>Work</c> and an <c>Author</c> and
/// passes them to <c>workHelper</c> / <c>authorHelper</c>.
/// </summary>
/// <param name="response">HTML text of the page.</param>
/// <param name="catchItem">Request item; its Uri supplies the scheme and host used to complete work addresses.</param>
/// <remarks>Any exception is caught and logged; nothing is rethrown.</remarks>
protected override void WebResponseHandle(string response, ICatchItem catchItem)
{
    try
    {
        string domainPrefix = string.Format("{0}://{1}", catchItem.Uri.Scheme, catchItem.Uri.Host);

        var document = new HtmlDocument();
        document.LoadHtml(response);

        IPage rule = RuleConfig.PageRule.GetRule(PageType.AllWork);
        var galleryNodes = rule.GetNodes(document.DocumentNode, "Gallery");
        if (galleryNodes == null)
        {
            return;
        }

        logger.InfoFormat("获取作品列表: 个数:{0}", galleryNodes.Count);

        foreach (var galleryNode in galleryNodes.ToList())
        {
            // Originality and Role are not populated here; they are logged with
            // whatever value a freshly constructed Work carries.
            var work = new Work
            {
                Address = StringUtil.FillWithDomain(
                    rule.GetSingleNodeValue(galleryNode, "Gallery.Address"),
                    domainPrefix)?.AbsoluteUri,
                Name = rule.GetSingleNodeValue(galleryNode, "Gallery.Name")
            };

            // NOTE(review): the author address is stored exactly as the rule returns it
            // (not completed with the request domain like work.Address), and the author
            // name is copied from the work name — confirm both are intended.
            var author = new Author
            {
                Address = rule.GetSingleNodeValue(galleryNode, "Gallery.Author.Address"),
                Name = work.Name
            };

            authorHelper.Add(author);
            workHelper.Add(work);

            logger.InfoFormat("获得作品: [Address:{0}], [Originality:{1}], [Role:{2}], [Name:{3}]", work.Address, work.Originality, work.Role, work.Name);
        }
    }
    catch (Exception ex)
    {
        logger.ErrorFormat("WorkCatcher WebResponseHandle:{0}", ex);
    }
}
/// <summary>
/// Turns a proxy-list response into <c>Proxy</c> entries and stores the ones that
/// pass verification.
/// </summary>
/// <param name="response">Response text handed to the proxy handle.</param>
/// <param name="catchItem">Request item; its <c>Extend</c> value is cast to the string used to look up the proxy handle.</param>
/// <remarks>
/// Returns without doing anything when no handle is found or the handle yields no list.
/// Any exception is caught and logged; nothing is rethrown.
/// </remarks>
protected override void WebResponseHandle(string response, ICatchItem catchItem)
{
    try
    {
        var handleKey = (string)catchItem.Extend;
        IProxyHandle proxyHandle = proxyFactory.GetProxyHandle(handleKey);

        // A missing handle and a missing result list both mean there is nothing to verify.
        List<Proxy> candidates = proxyHandle?.DoHandle(response);
        if (candidates == null)
        {
            return;
        }

        foreach (Proxy candidate in candidates)
        {
            // The proxy is only stored from inside the verification callback,
            // and only when verification reports success.
            httpUtil.VerifyProxy(candidate.Address, verified =>
            {
                if (!verified)
                {
                    return;
                }

                proxyHelper.Add(candidate);
                logger.InfoFormat("添加代理:{0}", candidate.Address.AbsoluteUri);
            });
        }
    }
    catch (Exception ex)
    {
        logger.ErrorFormat("Error ProxyHandle:{0}", ex);
    }
}
/// <summary>
/// Parses a paged work-list page: records the next page (if any) in
/// <c>nextWorkPage</c>, then builds a <c>Work</c> and an <c>Author</c> for every
/// "Gallery" node found by the <c>PageType.AllWork</c> rule and passes them to
/// <c>workHelper</c> / <c>authorHelper</c>.
/// </summary>
/// <param name="response">HTML text of the page.</param>
/// <param name="catchItem">Request item; its Uri supplies the scheme and host used to complete relative addresses.</param>
/// <remarks>
/// A page without a usable pager (no pager items, fewer than two of them, or no
/// href on the link) is treated as "no next page": <c>nextWorkPage</c> is set to
/// null and the gallery entries on the page are still processed.
/// Any exception is caught and logged; nothing is rethrown.
/// </remarks>
protected override void WebResponseHandle(string response, ICatchItem catchItem)
{
    try
    {
        string requestDomain = string.Format("{0}://{1}", catchItem.Uri.Scheme, catchItem.Uri.Host);

        var doc = new HtmlDocument();
        doc.LoadHtml(response);
        IPage page = RuleConfig.PageRule.GetRule(PageType.AllWork);

        // SelectNodes yields null when nothing matches, and the second-to-last pager
        // item only exists when there are at least two items. Guard both so a missing
        // pager does not throw and skip the gallery parsing below.
        string href = null;
        var pagerNodes = doc.DocumentNode.SelectNodes("//li[@class='pager__item js-nologinLink']");
        if (pagerNodes != null && pagerNodes.Count >= 2)
        {
            var nextPageButtonNode = pagerNodes.ElementAt(pagerNodes.Count - 2);
            var nextPageNode = nextPageButtonNode?.SelectSingleNode("./a");
            href = nextPageNode?.Attributes["href"]?.Value;
        }

        if (string.IsNullOrEmpty(href))
        {
            nextWorkPage = null;
        }
        else
        {
            nextWorkPage = new AllWorkPage(StringUtil.FillWithDomain(href, requestDomain));
        }

        var galleryNodes = page.GetNodes(doc.DocumentNode, "Gallery");
        if (galleryNodes == null)
        {
            return;
        }

        logger.InfoFormat("获取作品列表: 个数:{0}", galleryNodes.Count);

        foreach (var item in galleryNodes.ToList())
        {
            // Originality and Role are not populated here; they are logged with
            // whatever value a freshly constructed Work carries.
            Work work = new Work();
            string workAddress = page.GetSingleNodeValue(item, "Gallery.Address");
            work.Address = StringUtil.FillWithDomain(workAddress, requestDomain)?.AbsoluteUri;
            work.Name = page.GetSingleNodeValue(item, "Gallery.Name");

            Author author = new Author();
            string authorAddress = page.GetSingleNodeValue(item, "Gallery.Author.Address");
            author.Address = StringUtil.FillWithDomain(authorAddress, requestDomain)?.AbsoluteUri;
            // NOTE(review): the author name is copied from the work name — confirm intended.
            author.Name = work.Name;

            authorHelper.Add(author);
            workHelper.Add(work);

            logger.InfoFormat("获得作品: [Address:{0}], [Originality:{1}], [Role:{2}], [Name:{3}]", work.Address, work.Originality, work.Role, work.Name);
        }
    }
    catch (Exception ex)
    {
        logger.ErrorFormat("WorkCatcher WebResponseHandle:{0}", ex);
    }
}
/// <summary>
/// Overridable hook that receives a response string together with the catch item
/// it belongs to. This base implementation is empty.
/// </summary>
/// <param name="response">Response text.</param>
/// <param name="catchItem">The item associated with the response.</param>
protected virtual void WebResponseHandle(string response, ICatchItem catchItem)
{
}
/// <summary>
/// Parses a work page: reads the coser name and title, then stores a
/// <c>WorkImage</c> for every "WorkImage" node found by the
/// <c>PageType.Images</c> rule and marks the work as having its images caught.
/// </summary>
/// <param name="response">HTML text of the page.</param>
/// <param name="catchItem">Request item; its <c>Extend</c> must be a <c>Work</c> for anything to be stored.</param>
/// <remarks>
/// Image nodes whose address is null or empty are skipped instead of aborting the
/// rest of the page. Any exception is caught and logged; nothing is rethrown.
/// </remarks>
protected override void WebResponseHandle(string response, ICatchItem catchItem)
{
    try
    {
        var doc = new HtmlDocument();
        doc.LoadHtml(response);
        IPage page = RuleConfig.PageRule.GetRule(PageType.Images);

        string coserName = page.GetSingleNodeValue(doc.DocumentNode, "CoserName");
        string title = page.GetSingleNodeValue(doc.DocumentNode, "Title");
        if (!string.IsNullOrEmpty(title))
        {
            // Strip the characters reported by GetInvalidPathChars, then a few more
            // that are removed explicitly.
            foreach (char invalidChar in System.IO.Path.GetInvalidPathChars())
            {
                title = title.Replace(invalidChar.ToString(), string.Empty);
            }

            const string extraInvalidChars = "\\/:*?";
            foreach (char invalidChar in extraInvalidChars)
            {
                title = title.Replace(invalidChar.ToString(), string.Empty);
            }
        }

        var imgNodes = page.GetNodes(doc.DocumentNode, "WorkImage");
        if (imgNodes == null)
        {
            return;
        }

        if (!(catchItem.Extend is Work work))
        {
            return;
        }

        string workAddress = catchItem.Uri.AbsoluteUri;

        // Matches addresses containing a "/w<digits>" segment (size restriction);
        // built once rather than once per image.
        var sizeSuffixPattern = new Regex(@"((http|https)://)(([a-zA-Z0-9\._-]+)/)+(w\d+)");

        foreach (var imgNode in imgNodes.ToList())
        {
            string address = page.GetSingleNodeValue(imgNode, "WorkImage.Src");
            if (string.IsNullOrEmpty(address))
            {
                // Regex.IsMatch throws on null, which would drop every remaining image
                // and leave the work unmarked; skip just this node instead.
                logger.InfoFormat("Skipping image without address: [WorkAddress:{0}]", workAddress);
                continue;
            }

            // Remove the trailing size restriction.
            if (sizeSuffixPattern.IsMatch(address))
            {
                address = address.Substring(0, address.LastIndexOf('/'));
            }

            logger.InfoFormat("添加图片:[Address:{0}, CoserName:{1}, WorkAddress:{2}]", address, coserName, workAddress);

            WorkImage workImage = new WorkImage()
            {
                Work = work,
                WorkId = work.Id,
                Address = address
            };
            workImageHelper.Add(workImage);

            // Notify subscribers that the image was added.
            syncWorkImageAdd?.Invoke(workImage);
        }

        logger.InfoFormat("完成抓取: {0}", catchItem.Uri.AbsoluteUri);

        // Mark the work as having had its images caught and record the cleaned title.
        work.IsCatchImage = true;
        work.Title = title;
    }
    catch (Exception ex)
    {
        logger.ErrorFormat("WorkImageCatcher WebResponseHandle:{0}", ex);
    }
}