Exemplo n.º 1
0
        /// <summary>
        /// Builds the HTML table that shows the large product pictures.
        /// </summary>
        /// <param name="jdNum">Product id substituted into the big-image page URL.</param>
        /// <returns>
        /// A centered table with one image row per match, or an empty string when the page is empty
        /// or contains no matching thumbnail URLs. Exceptions are handed to
        /// <c>FileHelper.WriteException</c> and whatever was assembled so far is returned.
        /// </returns>
        public static string GetProBigPics(string jdNum)
        {
            string picArea = "";

            try
            {
                const string bigUrl = BaseUrl + "/bigimage.aspx?id={0}";

                string pageHtml = HtmlCls.GetHtmlByUrl(String.Format(bigUrl, jdNum));
                if (string.IsNullOrEmpty(pageHtml))
                {
                    return picArea;
                }

                // Only the first "right" fragment of the page is searched for thumbnails.
                string rightPane = HtmlCls.GetHtmlByCss(pageHtml, "right").FirstOrDefault();

                // NOTE(review): presumably each match is the captured URL tail after ".../n5" - confirm
                // against RegexHelper.Matches.
                var thumbTails = RegexHelper.Matches(rightPane, "http://img10.360buyimg.com/n5([^'\"]*)");
                if (!thumbTails.Any())
                {
                    return picArea;
                }

                picArea = "<table width=\"750\" align=\"center\" border=\"0\" cellSpacing=\"0\" cellPadding=\"0\">";

                // Each tail is re-rooted under the ".../n0" prefix and wrapped in its own table row.
                string rows = thumbTails.Aggregate(
                    "",
                    (html, tail) => html + "<tr><td><img src=\"http://img10.360buyimg.com/n0" + tail + "\" /></td></tr>");

                picArea = picArea + rows;
                picArea += "</table>";
            }
            catch (Exception ex)
            {
                FileHelper.WriteException(ex);
            }

            return picArea;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Downloads a listing page and extracts the distinct link targets of its product image cells.
        /// </summary>
        /// <param name="url">Address of the listing page.</param>
        /// <returns>
        /// The distinct values matched by the anchor pattern, or an empty list when the page could not
        /// be downloaded.
        /// </returns>
        private static IEnumerable<string> GetUrlsFromHtml(string url)
        {
            string pageHtml = HtmlCls.GetHtmlByUrl(url, SiteEncoding);
            if (string.IsNullOrEmpty(pageHtml))
            {
                return new List<string>();
            }

            pageHtml = RegexHelper.ClearTrn(pageHtml);

            // Default layout: items live under the "plist" element, one "p-img" cell each.
            var itemCss = "p-img";
            var container = HtmlCls.GetHtmlById(pageHtml, "plist");
            if (container.IsNullOrEmpty())
            {
                // Fallback layout (original note: some book pages use different HTML markup).
                itemCss = "i-img";
                container = HtmlCls.GetHtmlByCss(pageHtml, "list-h").FirstOrDefault();
            }

            // The href is captured up to, but not including, any "#fragment".
            const string hrefPattern = "<a[^>]*href=[\"']?([^\"'>#]+)(#[^\"'>]*)?[\"']?[^>]*>";

            var links = HtmlCls.GetHtmlByCss(container, itemCss)
                               .Select(cell => RegexHelper.Match(cell, hrefPattern));

            return links.Distinct().ToList();
        }
Exemplo n.º 3
0
        /// <summary>
        /// Builds the cleaned-up product description HTML.
        /// </summary>
        /// <param name="docHtml">HTML of the product page.</param>
        /// <param name="sanId">Product id substituted into the thumbnail request URL.</param>
        /// <returns>
        /// The concatenated "detailBox" fragments with the product images injected and links, scripts,
        /// class/style attributes and brand names rewritten; an empty string when no "detailBox"
        /// fragment is found.
        /// </returns>
        public static string GetProDesc(string docHtml, string sanId)
        {
            // Materialize once so the fragments are not enumerated twice (emptiness check + concat).
            var boxes = HtmlCls.GetHtmlByCss(docHtml, "detailBox").ToList();
            if (boxes.Count == 0)
            {
                return "";
            }

            var desc = boxes.Aggregate("", (current, t) => current + t);

            // The product images are served by a separate request.
            const string imgUrl = "http://www.sanfo.com/shop/product.info.asp?command=findthumb&vid={0}";
            var imgs = HtmlCls.GetHtmlByUrl(String.Format(imgUrl, sanId), Encoding.UTF8);

            // The placeholder is a literal, so a plain string replacement is used: with Regex.Replace any
            // "$" inside the downloaded HTML would be interpreted as a substitution token.
            const string imgPlaceholder = "<dt class=\"detailImg\" id=\"item_product_images\"></dt>";
            desc = desc.Replace(imgPlaceholder,
                                "<dt class=\"detailImg\" id=\"item_product_images\">" + imgs + "</dt>");

            // Unwrap <a href=...> elements, keeping only their inner content.
            desc = Regex.Replace(desc, "<a[^>]*href=[\"'][^'\"]*[\"'][^>]*>(.*?)</a>", "$1");

            // Drop script elements; [\s\S]*? also covers bodies that contain '<' or line breaks.
            desc = Regex.Replace(desc, "<script[^>]*>[\\s\\S]*?</script>", "");

            // Remove class and style attributes.
            desc = Regex.Replace(desc, "(\\s*class=\"[^\"]+\")|(\\s*style=\"[^\"]+\")", "");

            // Turn root-relative src values into absolute URLs.
            desc = Regex.Replace(desc, "src=\"(/[^\"]+)\"", "src=\"" + SanfoUrl + "$1\"");

            // Replace the source site's brand name.
            desc = Regex.Replace(desc, "(三夫(户外?)?)", "本商城");

            return desc;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Downloads a listing page and extracts url, title and price of every product entry.
        /// </summary>
        /// <param name="url">Address of the listing page.</param>
        /// <param name="next">
        /// Receives the URL-decoded href of the "ui-page-s-next" link; stays empty when the page could
        /// not be downloaded.
        /// </param>
        /// <returns>One <c>TamllBase</c> per "product" fragment; empty when the page is empty.</returns>
        private static IEnumerable<TamllBase> GetTUrlsFromHtml(string url, out string next)
        {
            next = "";
            var urls = new List<TamllBase>();

            // The site renders different front-end markup depending on the cookie, so a fixed one is sent.
            // NOTE(review): this is a hard-coded session cookie committed to source - consider moving it
            // to configuration.
            const string cookie =
                "x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; cna=qMo3B45XYmoCAct2enaIrZoT; t=9bfd6b376a1f1e450056f0e1b1c54240; tracknick=luoyong87610; mpp=t%3D0%26m%3D%26h%3D0%26l%3D0; uc1=x; cookie2=22291aea11e397a82512118642ac0abe; passtime=1341285069752; isFirstOpen=true";
            string docHtml = HtmlCls.GetHtmlByUrl(url, Encoding.Default, cookie);

            if (string.IsNullOrEmpty(docHtml))
            {
                return urls;
            }

            docHtml = RegexHelper.ClearBr(docHtml);
            next    =
                Utils.UrlDecode(RegexHelper.Match(docHtml,
                                                  "<a[^>]*href=['\"]([^'\"\\s]+)['\"][^>]*class=['\"]ui-page-s-next['\"][^>]*>"));

            var listHtml = HtmlCls.GetHtmlById(docHtml, "J_itemList");
            var list     = HtmlCls.GetHtmlByCss(listHtml, "product");

            // Group 1: url, group 2: name.
            const string regStr =
                "<a[^>]*href=['\"]([^'\"\\s]+?)['\"][^>]*class=['\"]product-title['\"][^>]*title=['\"]([^'\"]+?)['\"][^>]*>";
            // Price.
            const string priceReg =
                "<span[^>]*class=['\"]product-normal['\"][^>]*title=['\"]([^'\"\\s]+)['\"][^>]*>";

            foreach (var item in list)
            {
                // Scraped prices use '.' as the decimal separator, so parse with the invariant culture
                // rather than the machine's current one. An entry without a parsable price keeps 0
                // instead of throwing and discarding every other entry on the page.
                decimal price;
                if (!decimal.TryParse(RegexHelper.Match(item, priceReg),
                                      System.Globalization.NumberStyles.Number,
                                      System.Globalization.CultureInfo.InvariantCulture,
                                      out price))
                {
                    price = 0m;
                }

                urls.Add(new TamllBase
                {
                    Url   = Utils.UrlDecode(RegexHelper.Match(item, regStr, 1)),
                    Title = RegexHelper.Match(item, regStr, 2),
                    Price = price
                });
            }

            return urls;
        }
Exemplo n.º 5
0
 /// <summary>
 /// Runs a keyword search on the site and returns the distinct product links of the result page.
 /// </summary>
 /// <param name="word">Search keyword; URL-encoded with <c>SiteEncoding</c> before the request.</param>
 /// <returns>
 /// The distinct link targets found in the "productImage" fragments of the "atfResults" and
 /// "btfResults" sections. An empty list is returned when the page is empty or when an exception
 /// occurs (the exception is passed to <c>FileHelper.WriteException</c>).
 /// </returns>
 public override IEnumerable<string> SearchWord(string word)
 {
     try
     {
         string searchUrl = GetWebSiteInfo().BaseUrl +
                            "/s/ref=nb_sb_noss_1?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&url=search-alias%3Daps&field-keywords={0}";
         var url = String.Format(searchUrl, Utils.UrlEncode(word, SiteEncoding));
         using (var http = new HttpHelper(url, SiteEncoding))
         {
             var html = http.GetHtml();
             if (html.IsNullOrEmpty())
             {
                 // Nothing was downloaded, so there is nothing to parse.
                 return new List<string>();
             }

             html = RegexHelper.ClearTrn(html);

             var showList = HtmlCls.GetHtmlById(html, "atfResults") + HtmlCls.GetHtmlById(html, "btfResults");

             // '#' is excluded from the captured href so the optional "#fragment" group can actually
             // take the fragment; otherwise the same link with different fragments defeats Distinct().
             const string hrefPattern = "<a[^>]*href=[\"']?([^\"'>#]+)(#[^\"'>]*)?[\"']?[^>]*>";

             return HtmlCls.GetHtmlByCss(showList, "productImage")
                           .Select(t => RegexHelper.Match(t, hrefPattern))
                           .Distinct()
                           .ToList();
         }
     }
     catch (Exception ex)
     {
         FileHelper.WriteException(ex);
         return new List<string>();
     }
 }
Exemplo n.º 6
0
        /// <summary>
        /// Extracts the after-sale service text from a product page.
        /// </summary>
        /// <param name="docHtml">HTML of the product page.</param>
        /// <returns>
        /// The trimmed text matched inside the third "mc tabcon hide" fragment of the "detail"
        /// element, or an empty string when the element is missing or has fewer than three fragments.
        /// </returns>
        public static string GetAftersaleService(string docHtml)
        {
            string detail = HtmlCls.GetHtmlById(docHtml, "detail");
            if (string.IsNullOrEmpty(detail))
            {
                return "";
            }

            var tabs = HtmlCls.GetHtmlByCss(detail, "mc tabcon hide").ToList();
            if (tabs.Count < 3)
            {
                return "";
            }

            // The third fragment is the one read; the text between the first pair of tags is taken.
            string text = RegexHelper.Match(tabs[2], "<[^>]*>([^<]+)<[^>]*>");
            return text.Trim();
        }
Exemplo n.º 7
0
 /// <summary>
 /// Determines stock availability from the loaded product page.
 /// </summary>
 /// <returns>
 /// 1 when an "availGreen" fragment is present, 0 when it is missing or empty, and -1 when an
 /// exception occurred (the exception is passed to <c>FileHelper.WriteException</c>).
 /// </returns>
 public override int GetStockCode()
 {
     try
     {
         GetHtml(SiteEncoding);

         var marker = HtmlCls.GetHtmlByCss(DocHtml, "availGreen").FirstOrDefault();
         return marker.IsNullOrEmpty() ? 0 : 1;
     }
     catch (Exception ex)
     {
         FileHelper.WriteException(ex);
         return -1;
     }
 }
Exemplo n.º 8
0
 /// <summary>
 /// Determines stock availability by querying the site's stock endpoint for the product number
 /// found on the loaded page.
 /// </summary>
 /// <returns>
 /// 1 when the endpoint's "result" value is "Y", 0 for any other value, and -1 when any exception
 /// occurs (for example when the page has fewer than two "prodNum" fragments).
 /// </returns>
 public override int GetStockCode()
 {
     try
     {
         GetHtml(SiteEncoding);

         // The second "prodNum" fragment carries the product number.
         var rawNum = HtmlCls.GetHtmlByCss(DocHtml, "prodNum").ToList()[1];

         // Strip the markup and the label so only the number itself is left.
         var withoutTags = Regex.Replace(rawNum, "</?[0-9a-zA-Z]+[^>]*>", "");
         var proNum = withoutTags.Replace("商品编号:", "").Trim();

         var urlTemplate = GetWebSiteInfo().BaseUrl + "/ec/homeus/browse/exactMethod.jsp?goodsNo={0}&city=71010000";
         var stockUrl = String.Format(urlTemplate, proNum);

         using (var http = new HttpHelper(stockUrl, SiteEncoding))
         {
             var flag = RegexHelper.Match(http.GetHtml(), "\"result\":\"([a-zA-Z])\"");
             if (flag == "Y")
             {
                 return 1;
             }
             return 0;
         }
     }
     catch (Exception)
     {
         // NOTE(review): the exception is discarded without being logged.
         return -1;
     }
 }
Exemplo n.º 9
0
 /// <summary>
 /// Downloads a listing page and returns the absolute URLs of the links found in its "pic"
 /// fragments.
 /// </summary>
 /// <param name="listUrl">Address of the listing page.</param>
 /// <returns>
 /// The distinct, non-empty link targets resolved against the site's base URL. An empty list is
 /// returned when the page stays empty after one retry or when any exception occurs.
 /// </returns>
 public override IEnumerable<string> GetUrlList(string listUrl)
 {
     try
     {
         using (var http = new HttpHelper(listUrl, SiteEncoding))
         {
             var html = http.GetHtml();
             if (html.IsNullOrEmpty())
             {
                 // Retry the download once before giving up.
                 html = http.GetHtml();
             }
             if (html.IsNullOrEmpty())
             {
                 return new List<string>();
             }

             // Normalize on both paths; previously a page obtained by the retry skipped this step.
             html = RegexHelper.ClearTrn(html);

             // The href is captured up to, but not including, any ";..." suffix.
             const string hrefPattern = "<a[^>]*href=[\"']?([^\"'>;]+)(;[^\"'>]*)?[\"']?[^>]*>";

             var links = HtmlCls.GetHtmlByCss(html, "pic")
                                .Select(t => RegexHelper.Match(t, hrefPattern))
                                .Distinct()
                                .Where(t => !t.IsNullOrEmpty())
                                .ToList();

             // Look the base URL up once instead of once per link.
             var baseUrl = GetWebSiteInfo().BaseUrl;
             return links.Select(t => Utils.GetAbsoluteUrl(baseUrl, t)).ToList();
         }
     }
     catch (Exception)
     {
         // NOTE(review): the exception is discarded without being logged.
         return new List<string>();
     }
 }
Exemplo n.º 10
0
 /// <summary>
 /// Gets the HTML of the right-hand area of the page.
 /// </summary>
 /// <param name="docHtml">HTML of the page.</param>
 /// <returns>The first "right-extra" fragment, or null when there is none.</returns>
 public static string GetRigthArea(string docHtml)
 {
     var fragments = HtmlCls.GetHtmlByCss(docHtml, "right-extra");
     return fragments.FirstOrDefault();
 }
Exemplo n.º 11
0
        /// <summary>
        /// Builds the cleaned-up product description HTML.
        /// </summary>
        /// <param name="docHtml">HTML of the product page.</param>
        /// <param name="version">0 for regular products; any other value selects the book layout (1).</param>
        /// <returns>
        /// The description with links unwrapped, inline styles and scripts removed and the source
        /// site's name replaced. Exceptions are passed to <c>FileHelper.WriteException</c> and whatever
        /// was assembled up to that point is returned.
        /// </returns>
        public static string GetProDesc(string docHtml, int version)
        {
            string area = "";

            try
            {
                // Unwraps <a href=...>text</a> to its inner text (group 2).
                const string anchorPattern = "<a[^>]*href=[\"']([^'\"]+?)[\"'][^>]*>(.*?)</a>";

                docHtml = RegexHelper.ClearTrn(docHtml);

                if (version != 0)
                {
                    // Book layout: the description is made of all "m m1" fragments.
                    area = HtmlCls.GetHtmlByCss(docHtml, "m m1").Aggregate(area, (acc, part) => acc + part);

                    string unwanted = HtmlCls.GetHtmlByCss(area, "list-h").FirstOrDefault();
                    if (!string.IsNullOrEmpty(unwanted))
                    {
                        area = area.Replace(unwanted, "");
                    }

                    // Remove the "other works by this author" area.
                    unwanted = HtmlCls.GetHtmlById(area, "related-works");
                    if (!string.IsNullOrEmpty(unwanted))
                    {
                        area = area.Replace(unwanted, "");
                    }

                    // Prepend the book information: at most the first nine <li> entries of "summary",
                    // each wrapped in a <div>, with links unwrapped.
                    string summary = HtmlCls.GetHtmlById(docHtml, "summary");
                    string bookInfo = RegexHelper.Matches(summary, "<li[^>]*>(.*?)</li>")
                                                 .Take(9)
                                                 .Aggregate("", (acc, entry) => acc + "<div>" + entry + "</div>");
                    bookInfo = Regex.Replace(bookInfo, anchorPattern, "$2");

                    area = bookInfo + area;
                }
                else
                {
                    // Specification table (added 2012-02-29, shy), followed by the main content.
                    string specTable = HtmlCls.GetHtmlByCss(docHtml, "Ptable").FirstOrDefault();
                    if (!string.IsNullOrEmpty(specTable))
                    {
                        area += specTable;
                    }

                    area += HtmlCls.GetHtmlByCss(docHtml, "content").FirstOrDefault();
                }

                // Drop the authorization notice, i.e. the first fragment carrying color="red".
                string redNotice = HtmlCls.GetHtmlByAttr(area, "color=\"red\"").FirstOrDefault();
                if (!string.IsNullOrEmpty(redNotice))
                {
                    area = area.Replace(redNotice, "");
                }

                // Avoid a style clash with the host page's "content" class.
                area = area.Replace("class=\"content\"", "");
                // Unwrap links.
                area = Regex.Replace(area, anchorPattern, "$2");
                // Remove inline styles.
                area = Regex.Replace(area, "\\sstyle=(['\"])[^'\"]+?\\1", "");
                // Remove script elements.
                area = Regex.Replace(area, "<script[^>]*>(.*?)</script>", "");
                // Rename "src<digit>=" attributes to plain "src=" so the images display.
                area = Regex.Replace(area, "src\\d=", "src=");
                // Replace the source site's name.
                area = Regex.Replace(area, "京东商城|京东", "本商城");
            }
            catch (Exception ex)
            {
                FileHelper.WriteException(ex);
            }

            return area;
        }