예제 #1
0
        /// <summary>
        /// Downloads a listing page and builds one <c>TamllBase</c> entry per product fragment found in it.
        /// </summary>
        /// <param name="url">Address of the listing page to download.</param>
        /// <param name="next">
        /// Receives the URL-decoded result of matching the "ui-page-s-next" anchor pattern against the page;
        /// stays an empty string when the page could not be downloaded.
        /// </param>
        /// <returns>
        /// The entries built from the "product" fragments inside the "J_itemList" element; an empty list when
        /// the download returned nothing.
        /// </returns>
        private static IEnumerable <TamllBase> GetTUrlsFromHtml(string url, out string next)
        {
            next = "";
            var urls = new List <TamllBase>();

            // Tmall renders the page differently depending on the cookie that is sent.
            const string cookie =
                "x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; cna=qMo3B45XYmoCAct2enaIrZoT; t=9bfd6b376a1f1e450056f0e1b1c54240; tracknick=luoyong87610; mpp=t%3D0%26m%3D%26h%3D0%26l%3D0; uc1=x; cookie2=22291aea11e397a82512118642ac0abe; passtime=1341285069752; isFirstOpen=true";
            string docHtml = HtmlCls.GetHtmlByUrl(url, Encoding.Default, cookie);

            if (string.IsNullOrEmpty(docHtml))
            {
                return(urls);
            }

            docHtml = RegexHelper.ClearBr(docHtml);
            next    =
                Utils.UrlDecode(RegexHelper.Match(docHtml,
                                                  "<a[^>]*href=['\"]([^'\"\\s]+)['\"][^>]*class=['\"]ui-page-s-next['\"][^>]*>"));
            var listHtml = HtmlCls.GetHtmlById(docHtml, "J_itemList");
            var list     = HtmlCls.GetHtmlByCss(listHtml, "product");
            // Group 1: url, group 2: name.
            const string regStr =
                "<a[^>]*href=['\"]([^'\"\\s]+?)['\"][^>]*class=['\"]product-title['\"][^>]*title=['\"]([^'\"]+?)['\"][^>]*>";
            // Price, taken from the title attribute of the "product-normal" span.
            const string priceReg =
                "<span[^>]*class=['\"]product-normal['\"][^>]*title=['\"]([^'\"\\s]+)['\"][^>]*>";

            foreach (var item in list)
            {
                // A fragment without a parsable price must not abort the whole page, so fall back to 0.
                // The invariant culture keeps "12.50" meaning 12.5 whatever the machine's regional settings are.
                decimal price;
                if (!decimal.TryParse(RegexHelper.Match(item, priceReg),
                                      System.Globalization.NumberStyles.Number,
                                      System.Globalization.CultureInfo.InvariantCulture,
                                      out price))
                {
                    price = 0;
                }

                urls.Add(new TamllBase
                {
                    Url   = Utils.UrlDecode(RegexHelper.Match(item, regStr, 1)),
                    Title = RegexHelper.Match(item, regStr, 2),
                    Price = price
                });
            }
            return(urls);
        }
예제 #2
0
 /// <summary>
 /// Collects the product page addresses referenced by a listing page.
 /// </summary>
 /// <param name="listUrl">Address of the listing page to request.</param>
 /// <returns>
 /// A single-element list when the URL reported by the request is itself a product page; otherwise the
 /// distinct product links found inside the "itemGrid1" element, made absolute against the site's base URL.
 /// An empty list when anything throws.
 /// </returns>
 public override IEnumerable <string> GetUrlList(string listUrl)
 {
     try
     {
         using (var http = new HttpHelper(listUrl, SiteEncoding))
         {
             var html = http.GetHtml();
             var url  = http.GetRequestUrl();
             // Dots are escaped so that only the literal host and ".htm" extension match.
             if (Regex.IsMatch(url, "^http://www\\.newegg\\.com\\.cn/Product/[0-9a-zA-Z\\-]+\\.htm$"))
             {
                 return(new List <string> { url });
             }
             if (!html.IsNullOrEmpty())
             {
                 html = RegexHelper.ClearTrn(html);
             }
             var showList = HtmlCls.GetHtmlById(html, "itemGrid1");
             var list     =
                 RegexHelper.Matches(showList, "(http://www\\.newegg\\.com\\.cn/Product/[0-9a-zA-Z\\-]+\\.htm)").Distinct()
                 .ToList();
             return
                 (list.Where(t => !t.IsNullOrEmpty()).Select(
                      t => Utils.GetAbsoluteUrl(GetWebSiteInfo().BaseUrl, t)).ToList());
         }
     }
     catch (Exception)
     {
         // Best effort: any failure while fetching or parsing yields an empty result.
         return(new List <string>());
     }
 }
예제 #3
0
        /// <summary>
        /// Gets the main (first) product picture.
        /// </summary>
        /// <param name="docHtml">HTML of the product page to search.</param>
        /// <returns>
        /// The result of matching the img/src pattern against the element whose id is "spec-n1".
        /// </returns>
        public static string GetBigPic(string docHtml)
        {
            const string imgSrcPattern = "<img[^>]*src=['\"]([^'\"]*)['\"][^>]*>";

            string pictureBox = HtmlCls.GetHtmlById(docHtml, "spec-n1");
            return RegexHelper.Match(pictureBox, imgSrcPattern);
        }
예제 #4
0
        /// <summary>
        /// Downloads a listing page and extracts the distinct product links it contains.
        /// </summary>
        /// <param name="url">Address of the listing page to download.</param>
        /// <returns>
        /// The distinct href matches taken from the image cells of the listing; an empty list when the page
        /// could not be downloaded or no listing container was found.
        /// </returns>
        private static IEnumerable <string> GetUrlsFromHtml(string url)
        {
            string docHtml = HtmlCls.GetHtmlByUrl(url, SiteEncoding);

            if (string.IsNullOrEmpty(docHtml))
            {
                return(new List <string>());
            }

            docHtml = RegexHelper.ClearTrn(docHtml);
            var cssName  = "p-img";
            var listHtml = HtmlCls.GetHtmlById(docHtml, "plist");
            if (listHtml.IsNullOrEmpty())
            {
                // Some book pages use a different layout: the first "list-h" block with "i-img" cells.
                cssName  = "i-img";
                listHtml = HtmlCls.GetHtmlByCss(docHtml, "list-h").FirstOrDefault();
            }
            if (string.IsNullOrEmpty(listHtml))
            {
                // Neither layout was found (FirstOrDefault yields null); there is nothing to scan.
                return(new List <string>());
            }

            // The capture stops before any '#', so the fragment part of a link is left out of the result.
            var list =
                HtmlCls.GetHtmlByCss(listHtml, cssName).Select(
                    t => RegexHelper.Match(t, "<a[^>]*href=[\"']?([^\"'>#]+)(#[^\"'>]*)?[\"']?[^>]*>")).Distinct().
                ToList();
            return(list);
        }
예제 #5
0
        /// <summary>
        /// Loads the page and returns the match of the h1 pattern inside the element whose id is "name".
        /// </summary>
        /// <returns>The result of matching "&lt;h1&gt;([^&lt;]*)&lt;" against that element.</returns>
        public override string GetProName()
        {
            const string headingPattern = "<h1>([^<]*)<";

            GetHtml(SiteEncoding);
            string nameBlock = HtmlCls.GetHtmlById(DocHtml, "name");
            string heading   = RegexHelper.Match(nameBlock, headingPattern);
            return heading;
        }
예제 #6
0
 /// <summary>
 /// Runs a keyword search on the site and collects the product links from the result page.
 /// </summary>
 /// <param name="word">Keyword to search for; it is URL-encoded with the site encoding.</param>
 /// <returns>
 /// The distinct href matches taken from the "productImage" fragments of the "atfResults" and "btfResults"
 /// elements. An empty list when anything throws; the exception is written through <c>FileHelper</c>.
 /// </returns>
 public override IEnumerable <string> SearchWord(string word)
 {
     try
     {
         string searchUrl = GetWebSiteInfo().BaseUrl +
                            "/s/ref=nb_sb_noss_1?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&url=search-alias%3Daps&field-keywords={0}";
         var url = String.Format(searchUrl, Utils.UrlEncode(word, SiteEncoding));
         using (var http = new HttpHelper(url, SiteEncoding))
         {
             var html = http.GetHtml();
             if (!html.IsNullOrEmpty())
             {
                 html = RegexHelper.ClearTrn(html);
             }
             var showList = HtmlCls.GetHtmlById(html, "atfResults") + HtmlCls.GetHtmlById(html, "btfResults");
             // '#' is excluded from the captured group so that the optional fragment group can actually
             // take the "#..." part instead of it being swallowed into the link.
             var list     =
                 HtmlCls.GetHtmlByCss(showList, "productImage").Select(
                     t => RegexHelper.Match(t, "<a[^>]*href=[\"']?([^\"'>#]+)(#[^\"'>]*)?[\"']?[^>]*>")).Distinct().
                 ToList();
             return(list);
         }
     }
     catch (Exception ex)
     {
         FileHelper.WriteException(ex);
         return(new List <string>());
     }
 }
예제 #7
0
        /// <summary>
        /// Loads the page and returns the match of the img/src pattern inside the element whose id is "spec-n1".
        /// </summary>
        /// <returns>The result of matching the img/src pattern against that element.</returns>
        public override string GetProPic()
        {
            const string imgSrcPattern = "<img[^>]*src=['\"]([^'\"]*)['\"][^>]*>";

            GetHtml(SiteEncoding);
            string pictureBox = HtmlCls.GetHtmlById(DocHtml, "spec-n1");
            return RegexHelper.Match(pictureBox, imgSrcPattern);
        }
예제 #8
0
        /// <summary>
        /// Extracts the first piece of tag-enclosed text from the element whose id is "bzqd".
        /// </summary>
        /// <param name="docHtml">HTML of the product page to search.</param>
        /// <returns>The trimmed match, or an empty string when the element is missing or empty.</returns>
        public static string GetPackingList(string docHtml)
        {
            string section = HtmlCls.GetHtmlById(docHtml, "bzqd");
            if (string.IsNullOrEmpty(section))
            {
                return "";
            }

            string enclosedText = RegexHelper.Match(section, "<[^>]*>([^<]+)<[^>]*>");
            return enclosedText.Trim();
        }
예제 #9
0
        /// <summary>
        /// Extracts the manufacturer link text from the element whose id is "i-detail".
        /// </summary>
        /// <param name="docHtml">HTML of the product page to search.</param>
        /// <returns>The trimmed match, or an empty string when the element is missing or empty.</returns>
        public static string GetBrandName(string docHtml)
        {
            string detail = HtmlCls.GetHtmlById(docHtml, "i-detail");
            if (string.IsNullOrEmpty(detail))
            {
                return "";
            }

            // Matches the list item labelled with the manufacturer caption and captures the brand link text.
            string brand = RegexHelper.Match(detail, "<li[^>]*>生产厂家:<a[^>]*brand[^>]*>([^<]+)</a>");
            return brand.Trim();
        }
예제 #10
0
 /// <summary>
 /// Loads the page and returns the match of the src pattern inside the element whose id is "bgPics".
 /// </summary>
 /// <returns>The match result, or an empty string when anything throws.</returns>
 public override string GetProPic()
 {
     const string srcPattern = "\\s+src=[\"']([^\"'>]+)[\"']";

     try
     {
         GetHtml(SiteEncoding);
         string gallery = HtmlCls.GetHtmlById(DocHtml, "bgPics");
         string source  = RegexHelper.Match(gallery, srcPattern);
         return source;
     }
     catch (Exception)
     {
         // Failures are deliberately reduced to "no picture".
         return string.Empty;
     }
 }
예제 #11
0
 /// <summary>
 /// Loads the page and returns the content of the element whose id is "title-descript" with tags removed.
 /// </summary>
 /// <returns>The tag-free, trimmed text, or an empty string when anything throws.</returns>
 public override string GetProName()
 {
     try
     {
         GetHtml(SiteEncoding);
         string titleHtml   = HtmlCls.GetHtmlById(DocHtml, "title-descript");
         string withoutTags = Regex.Replace(titleHtml, "</?[0-9a-zA-Z]+[^>]*>", "");
         return withoutTags.Trim();
     }
     catch (Exception)
     {
         // Failures are deliberately reduced to "no name".
         return string.Empty;
     }
 }
예제 #12
0
        /// <summary>
        /// Downloads a listing page and extracts the product links from the "Id_prodItemList" element.
        /// </summary>
        /// <param name="url">Address of the listing page to download.</param>
        /// <returns>
        /// The matched links, with <c>SanfoUrl</c> prepended to those starting with "/"; an empty list when
        /// the download returned nothing.
        /// </returns>
        private static IEnumerable <string> GetUrlsFromHtml(string url)
        {
            const string linkPattern = "<div[^>]*class=['\"]proPic['\"][^>]*><a[^>]*href=['\"]([^'\"]+)['\"][^>]*>";

            string pageHtml = HtmlCls.GetHtmlByUrl(url);
            if (string.IsNullOrEmpty(pageHtml))
            {
                return new List <string>();
            }

            string itemListHtml = HtmlCls.GetHtmlById(pageHtml, "Id_prodItemList");
            List <string> hrefs = RegexHelper.Matches(itemListHtml, linkPattern);

            // Site-relative links are turned into absolute ones; everything else is passed through.
            return hrefs.Select(href => href.StartsWith("/") ? SanfoUrl + href : href).ToList();
        }
예제 #13
0
 /// <summary>
 /// Loads the page and returns the content of the element whose id is "btAsinTitle" with tags removed.
 /// </summary>
 /// <returns>
 /// The tag-free text, or an empty string when anything throws; the exception is written through
 /// <c>FileHelper</c>.
 /// </returns>
 public override string GetProName()
 {
     try
     {
         GetHtml(SiteEncoding);
         string titleHtml = HtmlCls.GetHtmlById(DocHtml, "btAsinTitle");
         return Regex.Replace(titleHtml, "</?[0-9a-zA-Z]+[^>]*>", "");
     }
     catch (Exception error)
     {
         FileHelper.WriteException(error);
         return string.Empty;
     }
 }
예제 #14
0
        /// <summary>
        /// Extracts and cleans up the product description.
        /// </summary>
        /// <param name="docHtml">HTML of the product page to search.</param>
        /// <returns>
        /// The content of the element whose id is "productDescription" with the shop name replaced, the id
        /// attribute dropped, anchors and script elements removed and numbered src attributes renamed to
        /// "src". The value is returned untouched when the element is missing or empty.
        /// </returns>
        public static string GetDescFromHtml(string docHtml)
        {
            string desc = HtmlCls.GetHtmlById(docHtml, "productDescription");

            if (!string.IsNullOrEmpty(desc))
            {
                desc = Regex.Replace(desc, @"红孩子母婴商城|红孩子", "本商场");
                desc = desc.Replace("id=\"productDescription\"", "");                           // avoid style conflicts
                // Remove anchors together with their content. "[^>]*" keeps the match inside the opening tag.
                desc = Regex.Replace(desc, "<a[^>]*href=[\"'][^'\"]*[\"'][^>]*>(.*?)</a>", "");
                // Remove script elements; Singleline lets the lazy body span line breaks and contain '<'.
                desc = Regex.Replace(desc, "<script[^>]*>.*?</script>", "", RegexOptions.Singleline);
                desc = Regex.Replace(desc, "src\\d=", "src=");                                  // turn "srcN=" into "src="
            }
            return(desc);
        }
예제 #15
0
        /// <summary>
        /// Extracts the first piece of tag-enclosed text from the third "mc tabcon hide" fragment of the
        /// element whose id is "detail".
        /// </summary>
        /// <param name="docHtml">HTML of the product page to search.</param>
        /// <returns>
        /// The trimmed match, or an empty string when the element is missing/empty or has fewer than three
        /// such fragments.
        /// </returns>
        public static string GetAftersaleService(string docHtml)
        {
            string detail = HtmlCls.GetHtmlById(docHtml, "detail");
            if (string.IsNullOrEmpty(detail))
            {
                return "";
            }

            var tabs = HtmlCls.GetHtmlByCss(detail, "mc tabcon hide").ToList();
            if (tabs.Count < 3)
            {
                return "";
            }

            string enclosedText = RegexHelper.Match(tabs[2], "<[^>]*>([^<]+)<[^>]*>");
            return enclosedText.Trim();
        }
예제 #16
0
 /// <summary>
 /// Loads the page and returns the "src340" attribute of the element whose id is "midImg", without its
 /// query string.
 /// </summary>
 /// <returns>
 /// The attribute value with everything from the first "?" removed, or an empty string when anything
 /// throws; the exception is written through <c>FileHelper</c>.
 /// </returns>
 public override string GetProPic()
 {
     try
     {
         GetHtml(SiteEncoding);
         string imageTag = HtmlCls.GetHtmlById(DocHtml, "midImg");
         string imageUrl = HtmlCls.GetAttrValue(imageTag, "src340");
         string bareUrl  = Regex.Replace(imageUrl, "\\?.*$", "");
         return bareUrl;
     }
     catch (Exception error)
     {
         FileHelper.WriteException(error);
         return string.Empty;
     }
 }
예제 #17
0
 /// <summary>
 /// Loads the page and returns the match of the src pattern inside the element whose id is
 /// "prodImageCell".
 /// </summary>
 /// <returns>
 /// The match result, or an empty string when anything throws; the exception is written through
 /// <c>FileHelper</c>.
 /// </returns>
 public override string GetProPic()
 {
     const string srcPattern = "\\s+src=[\"']([^\"'>]+)[\"']";

     try
     {
         GetHtml(SiteEncoding);
         string imageCell = HtmlCls.GetHtmlById(DocHtml, "prodImageCell");
         return RegexHelper.Match(imageCell, srcPattern);
     }
     catch (Exception error)
     {
         FileHelper.WriteException(error);
         return string.Empty;
     }
 }
예제 #18
0
 /// <summary>
 /// Collects the product page addresses referenced by a listing page.
 /// </summary>
 /// <param name="listUrl">Address of the listing page to request.</param>
 /// <returns>
 /// The distinct links of the form "{BaseUrl}/emall/prd_..._.html" found inside the "proShow" element.
 /// An empty list when anything throws; the exception is written through <c>FileHelper</c>.
 /// </returns>
 public override IEnumerable <string> GetUrlList(string listUrl)
 {
     try
     {
         using (var http = new HttpHelper(listUrl, SiteEncoding))
         {
             var html = http.GetHtml();
             html = RegexHelper.ClearTrn(html);
             var showList = HtmlCls.GetHtmlById(html, "proShow");
             // The base URL is plain text, not a pattern: escape it (and the dot of ".html") so that
             // its dots only match literal dots.
             var linkReg  = "<a[^>]*href=[\"']?(" +
                            System.Text.RegularExpressions.Regex.Escape(GetWebSiteInfo().BaseUrl) +
                            "/emall/prd_\\d+_\\d+_-\\d+_\\d+_\\.html)[\"']?[^>]*>";
             var list = RegexHelper.Matches(showList, linkReg).Distinct().ToList();
             return(list);
         }
     }
     catch (Exception ex)
     {
         FileHelper.WriteException(ex);
         return(new List <string>());
     }
 }
예제 #19
0
        /// <summary>
        /// Reads the struck-through price from the page.
        /// </summary>
        /// <param name="docHtml">HTML of the product page to search.</param>
        /// <returns>
        /// The price found in a <c>&lt;del&gt;</c> element of the "summary" element, or of the "book-price"
        /// element when the first lookup is empty. Values above 100 are rounded to whole numbers, others to
        /// one decimal. 0 when no price is found, it cannot be parsed, or anything throws.
        /// </returns>
        public static decimal GetMarketerPrice(string docHtml)
        {
            decimal mprice = 0;

            try
            {
                string str = HtmlCls.GetHtmlById(docHtml, "summary");
                str = RegexHelper.Match(str, "<del>¥([^<]+)</del>");
                if (string.IsNullOrEmpty(str))
                {
                    str = HtmlCls.GetHtmlById(docHtml, "book-price");
                    str = RegexHelper.Match(str, "<del>¥([^<]+)</del>");
                }

                // The page writes prices as "1,299.00". After dropping the group separators the text is
                // parsed with the invariant culture so '.' is always the decimal point, whatever the
                // regional settings of the machine are.
                if (string.IsNullOrEmpty(str) ||
                    !decimal.TryParse(str.Replace(",", ""),
                                      System.Globalization.NumberStyles.Number,
                                      System.Globalization.CultureInfo.InvariantCulture,
                                      out mprice))
                {
                    mprice = 0;
                }
            }
            catch (Exception)
            {
                // Failures in the lookup helpers are deliberately reduced to "no price".
                mprice = 0;
            }
            return(mprice > 100 ? Math.Round(mprice, 0) : Math.Round(mprice, 1));
        }
예제 #20
0
        /// <summary>
        /// Downloads a listing page and extracts the product links from the "J_itemList" element.
        /// </summary>
        /// <param name="url">Address of the listing page to download.</param>
        /// <param name="next">
        /// Receives the URL-decoded result of matching the "ui-page-s-next" anchor pattern against the page;
        /// stays an empty string when the page could not be downloaded.
        /// </param>
        /// <returns>
        /// The URL-decoded "product-Img" links, with <c>BaseUrl</c> prepended to those starting with "/";
        /// an empty list when the download returned nothing.
        /// </returns>
        private static IEnumerable <string> GetUrlsFromHtml(string url, out string next)
        {
            // Tmall renders the page differently depending on the cookie that is sent.
            const string sessionCookie =
                "x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; cna=qMo3B45XYmoCAct2enaIrZoT; t=9bfd6b376a1f1e450056f0e1b1c54240; tracknick=luoyong87610; mpp=t%3D0%26m%3D%26h%3D0%26l%3D0; uc1=x; cookie2=8eb29ff22cbe3bddcad34d264d01806f; passtime=1341280687588; isFirstOpen=false; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0";
            const string nextPagePattern =
                "<a[^>]*href=['\"]([^'\"\\s]+)['\"][^>]*class=['\"]ui-page-s-next['\"][^>]*>";
            const string productLinkPattern =
                "<a[^>]*href=['\"]([^'\"\\s]+?)['\"][^>]*class=['\"]product-Img['\"][^>]*>";

            next = "";

            string pageHtml = HtmlCls.GetHtmlByUrl(url, Encoding.Default, sessionCookie);
            if (string.IsNullOrEmpty(pageHtml))
            {
                return new List <string>();
            }

            pageHtml = RegexHelper.ClearBr(pageHtml);
            next     = Utils.UrlDecode(RegexHelper.Match(pageHtml, nextPagePattern));

            string itemListHtml = HtmlCls.GetHtmlById(pageHtml, "J_itemList");
            List <string> hrefs = RegexHelper.Matches(itemListHtml, productLinkPattern);

            // Site-relative links get the base URL in front; every link is URL-decoded.
            return hrefs.Select(href => (href.StartsWith("/") ? BaseUrl : "") + Utils.UrlDecode(href)).ToList();
        }
예제 #21
0
        /// <summary>
        /// Gets the product description: collects the description HTML from the page and strips
        /// links, inline styles, scripts and shop wording from it.
        /// </summary>
        /// <param name="docHtml">The HTML document.</param>
        /// <param name="version">Distinguishes books (1) from everything else (0).</param>
        /// <returns>
        /// The cleaned description HTML. When an exception is thrown it is written through
        /// <c>FileHelper</c> and whatever had been assembled up to that point is returned.
        /// </returns>
        public static string GetProDesc(string docHtml, int version)
        {
            string area = "";

            try
            {
                docHtml = RegexHelper.ClearTrn(docHtml);

                if (version == 0)
                {
                    // Added the specification description - 2012-02-29 shy
                    string pt = HtmlCls.GetHtmlByCss(docHtml, "Ptable").FirstOrDefault();
                    if (!string.IsNullOrEmpty(pt))
                    {
                        area += pt;
                    }

                    // Then the first "content" fragment (a null result appends nothing).
                    area += HtmlCls.GetHtmlByCss(docHtml, "content").FirstOrDefault();
                }
                else
                {
                    // Books: concatenate every "m m1" fragment.
                    var list = HtmlCls.GetHtmlByCss(docHtml, "m m1");
                    area = list.Aggregate(area, (current, s) => current + s);
                    // Drop the first "list-h" fragment from the collected HTML.
                    string listH = HtmlCls.GetHtmlByCss(area, "list-h").FirstOrDefault();
                    if (!string.IsNullOrEmpty(listH))
                    {
                        area = area.Replace(listH, "");
                    }
                    // Remove the "other works by this author" area.
                    listH = HtmlCls.GetHtmlById(area, "related-works");
                    if (!string.IsNullOrEmpty(listH))
                    {
                        area = area.Replace(listH, "");
                    }
                    string sum     = HtmlCls.GetHtmlById(docHtml, "summary"); // add the book information
                    // Only the first 9 list items of the summary are used, each wrapped in a <div>.
                    var    sumList = RegexHelper.Matches(sum, "<li[^>]*>(.*?)</li>").Take(9);
                    sum = sumList.Aggregate("", (current, s) => current + "<div>" + s + "</div>");
                    sum = Regex.Replace(sum, "<a[^>]*href=[\"']([^'\"]+?)[\"'][^>]*>(.*?)</a>", "$2"); // strip <a> tags, keeping their inner text

                    area = sum + area;
                }
                // Remove the authorization HTML (the first fragment carrying color="red").
                string red = HtmlCls.GetHtmlByAttr(area, "color=\"red\"").FirstOrDefault();
                if (!string.IsNullOrEmpty(red))
                {
                    area = area.Replace(red, "");
                }
                area = area.Replace("class=\"content\"", "");                                        // avoid style conflicts
                //area = Regex.Replace(area, "class=['\"][^'\"]*['\"]", "");// avoid style conflicts - the ultimate variant (disabled)
                area = Regex.Replace(area, "<a[^>]*href=[\"']([^'\"]+?)[\"'][^>]*>(.*?)</a>", "$2"); // strip <a> tags, keeping their inner text
                area = Regex.Replace(area, "\\sstyle=(['\"])[^'\"]+?\\1", "");                       // remove inline styles
                area = Regex.Replace(area, "<script[^>]*>(.*?)</script>", "");                       // remove script tags
                area = Regex.Replace(area, "src\\d=", "src=");                                       // turn "srcN=" attributes into "src="
                area = Regex.Replace(area, "京东商城|京东", "本商城");                                        // replace the JD shop name wording
            }
            catch (Exception ex)
            {
                FileHelper.WriteException(ex);
            }
            return(area);
        }
예제 #22
0
        /// <summary>
        /// Gets the product name.
        /// </summary>
        /// <param name="docHtml">HTML of the product page to search.</param>
        /// <returns>
        /// The result of matching "&lt;h1&gt;([^&lt;]*)&lt;" against the element whose id is "name".
        /// </returns>
        public static string GetProName(string docHtml)
        {
            const string headingPattern = "<h1>([^<]*)<";

            string nameBlock = HtmlCls.GetHtmlById(docHtml, "name");
            string heading   = RegexHelper.Match(nameBlock, headingPattern);
            return heading;
        }