/// <summary>
/// Gets the pagination links found on an image page.
/// </summary>
/// <param name="sex">Spider configuration; supplies the encoding, domain, filters and the <c>PageDiv</c> selector.</param>
/// <param name="url">Address of the page whose pagination links are collected.</param>
/// <returns>A lazily evaluated sequence with one <see cref="PageModel"/> per accepted link.</returns>
public static IEnumerable<PageModel> GetImagePage(SexSpider sex, string url)
{
    // An "ajax" image type is fetched through GetJSContent, anything else through GetHtmlContent.
    bool viaScript = sex.ImgType != null && sex.ImgType.Contains("ajax");
    string markup = viaScript
        ? GetJSContent(url, sex.PageEncode)
        : GetHtmlContent(url, sex.PageEncode, sex.Domain);

    // Site-level filter and replacement first, then the page filter chain.
    markup = FilterHtml(markup, sex.SiteFilter);
    markup = ReplaceHtml(markup, sex.SiteReplace);
    markup = LoadFilter(sex.PageFilter).DoFilter(markup);

    // Current page location: everything up to and including the last '/' of the URL.
    // It is handed to GetLink together with the domain (presumably to resolve relative links - confirm in GetLink).
    string currentFolder = url.Substring(0, url.LastIndexOf('/') + 1);

    var document = new HtmlParser().Parse(markup);
    foreach (var anchor in document.QuerySelectorAll(sex.PageDiv))
    {
        // Only elements whose inner HTML is empty or purely numeric are treated as page links.
        // Note carried over from the original author: for click-style pagers without content this
        // check has to be commented out, and it conflicts with the multi-page retrieval.
        if (Regex.IsMatch(anchor.InnerHtml, @"^\d*$"))
        {
            string href = anchor.GetAttribute("href");
            bool usable = href != null && href != "#" && !href.Contains("javascript");
            if (usable)
            {
                yield return new PageModel { PageUrl = GetLink(href, currentFolder, sex.Domain) };
            }
        }
    }
}
/// <summary>
/// Gets the total number of image pages as a string of digits.
/// </summary>
/// <param name="sex">Spider configuration; <c>PageFilter</c> is used here as the selector for the element holding the total.</param>
/// <param name="url">Address of the page to inspect.</param>
/// <returns>
/// The digits of the last element matched by <c>PageFilter</c> (all non-digit characters removed),
/// or an empty string when nothing matches.
/// </returns>
public static string GetPageTotal(SexSpider sex, string url)
{
    string markup;
    if (sex.ImgType != null && sex.ImgType.Contains("ajax"))
    {
        markup = GetJSContent(url, sex.PageEncode);
    }
    else
    {
        markup = GetHtmlContent(url, sex.PageEncode, sex.Domain);
    }

    // Site-level filter followed by the site-level replacement.
    markup = ReplaceHtml(FilterHtml(markup, sex.SiteFilter), sex.SiteReplace);

    var document = new HtmlParser().Parse(markup);

    // Every match overwrites the previous value, so the last matching element wins.
    string digits = "";
    foreach (var node in document.QuerySelectorAll(sex.PageFilter))
    {
        string decoded = System.Net.WebUtility.HtmlDecode(node.InnerHtml);
        digits = Regex.Replace(decoded, "[^\\d]", "");
    }

    return digits;
}
/// <summary>
/// Collects the images of a gallery that is spread over several pages.
/// </summary>
/// <param name="sex">
/// Spider configuration. <c>PageLevel</c> selects how the page list is built
/// (per the original author's note): 1 = default, 2 = total pages "[1][2]..[10]",
/// 3 = total page count obtained through the filter first, 4 = ajax "#page".
/// </param>
/// <param name="url">Address of the first page; it is always processed first.</param>
/// <returns>All images found on the first page and on every discovered page, in page order.</returns>
public static IEnumerable<ImageModel> GetListImagePage(SexSpider sex, string url)
{
    List<PageModel> pages;

    if (sex.PageLevel == 4)
    {
        // The ajax variant is built from the URL and the total alone, so the
        // pagination links are not scraped here (that would be a wasted download).
        string total = GetPageTotal(sex, url);
        pages = GetPageAjax(url, total);
    }
    else
    {
        var scrapedPages = SiteHelper.GetImagePage(sex, url).ToList();

        if (sex.PageLevel == 3)
        {
            string total = GetPageTotal(sex, url);
            pages = GetPageMany(url, scrapedPages, total);
        }
        else if (sex.PageLevel == 2)
        {
            pages = GetPageMany(url, scrapedPages, "");
        }
        else
        {
            pages = scrapedPages;
        }
    }

    // The page we started from goes first.
    pages.Insert(0, new PageModel { PageUrl = url });

    var images = new List<ImageModel>();
    foreach (var page in pages)
    {
        // AddRange enumerates the iterator itself; no intermediate list is needed.
        images.AddRange(SiteHelper.GetListImage(sex, page.PageUrl));
    }

    return images;
}
/// <summary>
/// Gets the images found on a single page.
/// </summary>
/// <param name="sex">Spider configuration; supplies the encoding, domain, the <c>ImageDiv</c> selector and the image filter.</param>
/// <param name="url">Address of the page to scan.</param>
/// <returns>A lazily evaluated sequence with one <see cref="ImageModel"/> per element that yields a non-empty link.</returns>
public static IEnumerable<ImageModel> GetListImage(SexSpider sex, string url)
{
    // An "ajax" image type is fetched through GetJSContent, anything else through GetHtmlContent.
    bool viaScript = sex.ImgType != null && sex.ImgType.Contains("ajax");
    string markup = viaScript
        ? GetJSContent(url, sex.PageEncode)
        : GetHtmlContent(url, sex.PageEncode, sex.Domain);

    // Site-level filter and replacement.
    markup = FilterHtml(markup, sex.SiteFilter);
    markup = ReplaceHtml(markup, sex.SiteReplace);

    FilterChain imageFilters = LoadFilter(sex.ImageFilter);
    var document = new HtmlParser().Parse(markup);

    foreach (var node in document.QuerySelectorAll(sex.ImageDiv))
    {
        // With filters configured the link is extracted from the element's outer HTML;
        // otherwise the plain "src" attribute is used.
        string source = imageFilters.Count() > 0
            ? imageFilters.DoFilter(node.OuterHtml)
            : node.GetAttribute("src");

        if (!String.IsNullOrEmpty(source))
        {
            yield return new ImageModel
            {
                ImageUrl = GetLink(source, sex.Domain),
                ImageDomain = sex.Domain
            };
        }
    }
}
/// <summary>
/// Gets the entries (title, link, thumbnail) of a site's list page.
/// </summary>
/// <param name="sex">
/// Spider configuration. <c>DocType</c> containing "ajax" switches the download to
/// <c>GetJSContent</c>; <c>DocType</c> containing "json" switches parsing to JSON.
/// In JSON mode <c>ListDiv</c> has the form <c>path||fields</c>: <c>path</c> is an
/// '&amp;'-separated chain of property names leading to the item collection (any depth),
/// and <c>fields</c> is an '&amp;'-separated list of item property names in the order
/// title, link, thumbnail, last-start. In HTML mode <c>ListDiv</c> (and optionally
/// <c>MainDiv</c> / <c>ThumbDiv</c>) are selectors.
/// </param>
/// <returns>A lazily evaluated sequence of list entries.</returns>
public static IEnumerable<ListModel> GetSiteList(SexSpider sex)
{
    string html = sex.DocType != null && sex.DocType.Contains("ajax")
        ? GetJSContent(sex.SiteLink, sex.PageEncode)
        : GetHtmlContent(sex.SiteLink, sex.PageEncode, sex.Domain);

    // Site-level filter and replacement.
    html = FilterHtml(html, sex.SiteFilter);
    html = ReplaceHtml(html, sex.SiteReplace);
    FilterChain chain = LoadFilter(sex.ListFilter);

    if (sex.DocType != null && sex.DocType.Contains("json"))
    {
        string[] root = Regex.Split(sex.ListDiv, "\\|\\|");
        string[] path = root[0].Split('&');

        // Field names are the same for every item, so split them once.
        // A ListDiv without a "||" part leaves this empty; the lookups below then
        // fail only when there actually is an item to map.
        string[] fields = root.Length > 1 ? root[1].Split('&') : new string[0];

        // Walk the property path one segment at a time, whatever its depth.
        Newtonsoft.Json.Linq.JToken node = Newtonsoft.Json.Linq.JObject.Parse(html);
        foreach (string segment in path)
        {
            node = node[segment];
        }

        foreach (var item in node)
        {
            yield return new ListModel
            {
                Thumb = item.Value<string>(fields[2]),
                Title = System.Net.WebUtility.HtmlDecode(item.Value<string>(fields[0])),
                Link = GetLink(item.Value<string>(fields[1]), sex.Domain),
                Domain = sex.Domain,
                LastStart = item.Value<string>(fields[3])
            };
        }
    }
    else
    {
        var parser = new HtmlParser();
        var document = parser.Parse(html);

        if (!string.IsNullOrWhiteSpace(sex.MainDiv))
        {
            // Each MainDiv match is re-parsed on its own so that the link (ListDiv)
            // and the thumbnail (ThumbDiv) are looked up within the same container.
            foreach (var container in document.QuerySelectorAll(sex.MainDiv))
            {
                var scope = parser.Parse(container.InnerHtml);
                var anchor = scope.QuerySelector(sex.ListDiv);
                if (anchor == null)
                {
                    continue;
                }

                string title = chain.DoFilter(anchor.InnerHtml);
                string link = GetLink(anchor.GetAttribute("href"), sex.Domain);
                if (String.IsNullOrEmpty(title))
                {
                    continue;
                }

                var thumb = scope.QuerySelector(sex.ThumbDiv);
                string thumbHtml = thumb == null ? "" : thumb.OuterHtml;

                yield return new ListModel
                {
                    Thumb = GetThumb(thumbHtml, sex.Domain),
                    Title = System.Net.WebUtility.HtmlDecode(title),
                    Link = link,
                    Domain = sex.Domain
                };
            }
        }
        else
        {
            // No container selector: every ListDiv match is an entry and the
            // thumbnail is taken from the entry's own inner HTML.
            foreach (var anchor in document.QuerySelectorAll(sex.ListDiv))
            {
                string title = chain.DoFilter(anchor.InnerHtml);
                string link = GetLink(anchor.GetAttribute("href"), sex.Domain);
                if (String.IsNullOrEmpty(title))
                {
                    continue;
                }

                yield return new ListModel
                {
                    Thumb = GetThumb(anchor.InnerHtml, sex.Domain),
                    Title = System.Net.WebUtility.HtmlDecode(title),
                    Link = link,
                    Domain = sex.Domain
                };
            }
        }
    }
}