Esempio n. 1
0
        /// <summary>
        /// Extracts the pagination links of an image page.
        /// </summary>
        /// <param name="sex">Spider configuration; this method reads ImgType, PageEncode, Domain,
        /// SiteFilter, SiteReplace, PageFilter and PageDiv.</param>
        /// <param name="url">Address of the page whose pagination links are wanted.</param>
        /// <returns>
        /// One <see cref="PageModel"/> per element matched by <c>sex.PageDiv</c> whose inner HTML
        /// is purely numeric and whose <c>href</c> is a usable link. The method is an iterator,
        /// so nothing is fetched or parsed until the result is enumerated.
        /// </returns>
        public static IEnumerable <PageModel> GetImagePage(SexSpider sex, string url)
        {
            // "ajax" image types go through GetJSContent, everything else through GetHtmlContent.
            string markup;
            if (sex.ImgType != null && sex.ImgType.Contains("ajax"))
            {
                markup = GetJSContent(url, sex.PageEncode);
            }
            else
            {
                markup = GetHtmlContent(url, sex.PageEncode, sex.Domain);
            }

            // Site-level filter and replacement first, then the page-specific filter chain.
            markup = ReplaceHtml(FilterHtml(markup, sex.SiteFilter), sex.SiteReplace);
            markup = LoadFilter(sex.PageFilter).DoFilter(markup);

            // Everything up to and including the last '/' of the current page address;
            // handed to GetLink together with the site domain.
            string currentFolder = url.Substring(0, url.LastIndexOf('/') + 1);

            var document = new HtmlParser().Parse(markup);

            foreach (var candidate in document.QuerySelectorAll(sex.PageDiv))
            {
                // Only elements whose body is made of digits (an empty body also passes) are
                // treated as page links. NOTE(review): the original author remarks that this
                // check conflicts with click-type paging that has no content and may need to
                // be commented out for such sites.
                if (Regex.IsMatch(candidate.InnerHtml, @"^\d*$"))
                {
                    string href = candidate.GetAttribute("href");

                    // Skip missing links, bare anchors and script pseudo-links.
                    bool usable = href != null && href != "#" && !href.Contains("javascript");
                    if (usable)
                    {
                        yield return new PageModel
                        {
                            PageUrl = GetLink(href, currentFolder, sex.Domain)
                        };
                    }
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Reads the total page count of an image page.
        /// </summary>
        /// <param name="sex">Spider configuration; this method reads ImgType, PageEncode, Domain,
        /// SiteFilter, SiteReplace and PageFilter (used here as a CSS selector).</param>
        /// <param name="url">Address of the page to inspect.</param>
        /// <returns>
        /// The digits contained in the HTML-decoded inner HTML of the last element matched by
        /// <c>sex.PageFilter</c>, or an empty string when nothing matches.
        /// </returns>
        public static string GetPageTotal(SexSpider sex, string url)
        {
            // "ajax" image types go through GetJSContent, everything else through GetHtmlContent.
            string markup;
            if (sex.ImgType != null && sex.ImgType.Contains("ajax"))
            {
                markup = GetJSContent(url, sex.PageEncode);
            }
            else
            {
                markup = GetHtmlContent(url, sex.PageEncode, sex.Domain);
            }

            // Site-level filter and replacement.
            markup = ReplaceHtml(FilterHtml(markup, sex.SiteFilter), sex.SiteReplace);

            var document = new HtmlParser().Parse(markup);

            // Every match overwrites the previous value, so the last match wins.
            string digitsOnly = "";
            foreach (var node in document.QuerySelectorAll(sex.PageFilter))
            {
                string decoded = System.Net.WebUtility.HtmlDecode(node.InnerHtml);
                digitsOnly = Regex.Replace(decoded, "[^\\d]", "");
            }

            return digitsOnly;
        }
Esempio n. 3
0
        /// <summary>
        /// Collects the images of a paginated gallery: the page at <paramref name="url"/> plus
        /// every further page determined by <c>sex.PageLevel</c>.
        /// </summary>
        /// <param name="sex">Spider configuration. <c>PageLevel</c> selects how the page list is
        /// built: 4 = ajax "#page" style (GetPageAjax with the total from GetPageTotal),
        /// 3 = total page count obtained through the filter first (GetPageMany with that total),
        /// 2 = numbered pages such as [1][2]..[10] (GetPageMany with an empty total),
        /// anything else = the links returned by GetImagePage as they are.</param>
        /// <param name="url">Address of the first page of the gallery.</param>
        /// <returns>The images of the original page followed by those of each further page,
        /// already materialised in a list.</returns>
        public static IEnumerable <ImageModel> GetListImagePage(SexSpider sex, string url)
        {
            List <PageModel> pages;

            if (sex.PageLevel == 4)
            {
                // The ajax variant is built from the page total alone, so the pagination links
                // are not collected here; doing so would cost a fetch whose result is unused.
                string total = GetPageTotal(sex, url);
                pages = GetPageAjax(url, total);
            }
            else
            {
                // GetImagePage is a lazy iterator; ToList() performs the actual fetch and parse.
                var newPages = SiteHelper.GetImagePage(sex, url).ToList();

                if (sex.PageLevel == 3)
                {
                    string total = GetPageTotal(sex, url);
                    pages = GetPageMany(url, newPages, total);
                }
                else if (sex.PageLevel == 2)
                {
                    pages = GetPageMany(url, newPages, "");
                }
                else
                {
                    pages = newPages;
                }
            }

            // The original page always comes first.
            pages.Insert(0, new PageModel {
                PageUrl = url
            });

            var images = new List <ImageModel>();
            foreach (var p in pages)
            {
                // AddRange enumerates the iterator directly; no intermediate list is required.
                images.AddRange(SiteHelper.GetListImage(sex, p.PageUrl));
            }

            return(images);
        }
Esempio n. 4
0
        /// <summary>
        /// Extracts the image links of a single page.
        /// </summary>
        /// <param name="sex">Spider configuration; this method reads ImgType, PageEncode, Domain,
        /// SiteFilter, SiteReplace, ImageFilter and ImageDiv.</param>
        /// <param name="url">Address of the page to scan.</param>
        /// <returns>
        /// One <see cref="ImageModel"/> per element matched by <c>sex.ImageDiv</c> that yields a
        /// non-empty link. The method is an iterator, so nothing is fetched or parsed until the
        /// result is enumerated.
        /// </returns>
        public static IEnumerable <ImageModel> GetListImage(SexSpider sex, string url)
        {
            // "ajax" image types go through GetJSContent, everything else through GetHtmlContent.
            string markup;
            if (sex.ImgType != null && sex.ImgType.Contains("ajax"))
            {
                markup = GetJSContent(url, sex.PageEncode);
            }
            else
            {
                markup = GetHtmlContent(url, sex.PageEncode, sex.Domain);
            }

            // Site-level filter and replacement.
            markup = ReplaceHtml(FilterHtml(markup, sex.SiteFilter), sex.SiteReplace);

            FilterChain imageFilter = LoadFilter(sex.ImageFilter);

            var document = new HtmlParser().Parse(markup);

            foreach (var node in document.QuerySelectorAll(sex.ImageDiv))
            {
                // With a non-empty filter chain the link is derived from the element's outer
                // HTML; otherwise the "src" attribute is taken as it is.
                string candidate = imageFilter.Count() > 0
                    ? imageFilter.DoFilter(node.OuterHtml)
                    : node.GetAttribute("src");

                if (!String.IsNullOrEmpty(candidate))
                {
                    yield return new ImageModel
                    {
                        ImageUrl = GetLink(candidate, sex.Domain),
                        ImageDomain = sex.Domain
                    };
                }
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Extracts the list entries (title, link, thumbnail) of a site's listing page.
        /// </summary>
        /// <param name="sex">Spider configuration; this method reads DocType, SiteLink, PageEncode,
        /// Domain, SiteFilter, SiteReplace, ListFilter, ListDiv, MainDiv and ThumbDiv.
        /// When DocType contains "json", ListDiv has the form <c>path||fields</c>: <c>path</c> is
        /// a property name or two/three names joined by '&amp;' for nested access, and
        /// <c>fields</c> is four property names joined by '&amp;' in the order
        /// title, link, thumbnail, last-start.</param>
        /// <returns>
        /// One <see cref="ListModel"/> per entry found. The method is an iterator, so nothing is
        /// fetched or parsed until the result is enumerated.
        /// </returns>
        public static IEnumerable <ListModel> GetSiteList(SexSpider sex)
        {
            // "ajax" document types go through GetJSContent, everything else through GetHtmlContent.
            string markup;
            if (sex.DocType != null && sex.DocType.Contains("ajax"))
            {
                markup = GetJSContent(sex.SiteLink, sex.PageEncode);
            }
            else
            {
                markup = GetHtmlContent(sex.SiteLink, sex.PageEncode, sex.Domain);
            }

            // Site-level filter and replacement.
            markup = ReplaceHtml(FilterHtml(markup, sex.SiteFilter), sex.SiteReplace);

            FilterChain titleFilter = LoadFilter(sex.ListFilter);

            if (sex.DocType != null && sex.DocType.Contains("json"))
            {
                // ListDiv = "<path>||<fields>".
                string[] spec = Regex.Split(sex.ListDiv, "\\|\\|");
                var      json = Newtonsoft.Json.Linq.JObject.Parse(markup);

                // Default: the whole path is one property name; two or three '&'-separated
                // segments address a nested token instead.
                var      entries  = json[spec[0]];
                string[] segments = spec[0].Split('&');
                if (segments.Length == 2)
                {
                    entries = json[segments[0]][segments[1]];
                }
                else if (segments.Length == 3)
                {
                    entries = json[segments[0]][segments[1]][segments[2]];
                }

                foreach (var entry in entries)
                {
                    // Field order: [0] title, [1] link, [2] thumbnail, [3] last-start.
                    string[] fields = spec[1].Split('&');

                    yield return new ListModel
                    {
                        Thumb = entry.Value <string>(fields[2]),
                        Title = System.Net.WebUtility.HtmlDecode(entry.Value <string>(fields[0])),
                        Link = GetLink(entry.Value <string>(fields[1]), sex.Domain),
                        Domain = sex.Domain,
                        LastStart = entry.Value <string>(fields[3])
                    };
                }

                yield break;
            }

            var htmlParser = new HtmlParser();
            var document   = htmlParser.Parse(markup);

            if (string.IsNullOrWhiteSpace(sex.MainDiv))
            {
                // No container selector: every ListDiv match is an entry, and its own inner
                // HTML is the thumbnail source.
                foreach (var anchor in document.QuerySelectorAll(sex.ListDiv))
                {
                    string caption = titleFilter.DoFilter(anchor.InnerHtml);
                    string target  = GetLink(anchor.GetAttribute("href"), sex.Domain);

                    if (!String.IsNullOrEmpty(caption))
                    {
                        yield return new ListModel
                        {
                            Thumb = GetThumb(anchor.InnerHtml, sex.Domain),
                            Title = System.Net.WebUtility.HtmlDecode(caption),
                            Link = target,
                            Domain = sex.Domain
                        };
                    }
                }
            }
            else
            {
                // Each MainDiv match is re-parsed as a document of its own; the first ListDiv
                // match inside it supplies title and link, the first ThumbDiv match the thumbnail.
                foreach (var container in document.QuerySelectorAll(sex.MainDiv))
                {
                    var fragment = htmlParser.Parse(container.InnerHtml);
                    var anchor   = fragment.QuerySelector(sex.ListDiv);

                    if (anchor != null)
                    {
                        string caption = titleFilter.DoFilter(anchor.InnerHtml);
                        string target  = GetLink(anchor.GetAttribute("href"), sex.Domain);

                        if (!String.IsNullOrEmpty(caption))
                        {
                            var    thumbNode   = fragment.QuerySelector(sex.ThumbDiv);
                            string thumbMarkup = "";
                            if (thumbNode != null)
                            {
                                thumbMarkup = thumbNode.OuterHtml;
                            }

                            yield return new ListModel
                            {
                                Thumb = GetThumb(thumbMarkup, sex.Domain),
                                Title = System.Net.WebUtility.HtmlDecode(caption),
                                Link = target,
                                Domain = sex.Domain
                            };
                        }
                    }
                }
            }
        }