/// <summary>
/// Downloads a Tmall listing page and builds one <see cref="TamllBase"/> (URL, title, price)
/// per "product" fragment found inside the "J_itemList" element.
/// </summary>
/// <param name="url">Listing page URL to download.</param>
/// <param name="next">
/// Set to the URL-decoded result of matching the anchor carrying the "ui-page-s-next" class;
/// stays an empty string when no HTML was downloaded.
/// </param>
/// <returns>The extracted entries; an empty list when no HTML was downloaded.</returns>
private static IEnumerable<TamllBase> GetTUrlsFromHtml(string url, out string next)
{
    next = "";
    var urls = new List<TamllBase>();

    // Tmall renders different front-end markup depending on the cookie, so a fixed cookie is sent.
    // NOTE(review): this is a hard-coded session cookie; consider moving it to configuration.
    const string cookie = "x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; cna=qMo3B45XYmoCAct2enaIrZoT; t=9bfd6b376a1f1e450056f0e1b1c54240; tracknick=luoyong87610; mpp=t%3D0%26m%3D%26h%3D0%26l%3D0; uc1=x; cookie2=22291aea11e397a82512118642ac0abe; passtime=1341285069752; isFirstOpen=true";

    string docHtml = HtmlCls.GetHtmlByUrl(url, Encoding.Default, cookie);
    if (string.IsNullOrEmpty(docHtml))
    {
        return urls;
    }

    docHtml = RegexHelper.ClearBr(docHtml);
    next = Utils.UrlDecode(RegexHelper.Match(docHtml, "<a[^>]*href=['\"]([^'\"\\s]+)['\"][^>]*class=['\"]ui-page-s-next['\"][^>]*>"));

    var listHtml = HtmlCls.GetHtmlById(docHtml, "J_itemList");
    var list = HtmlCls.GetHtmlByCss(listHtml, "product");

    // Group 1: product URL, group 2: product name.
    const string regStr = "<a[^>]*href=['\"]([^'\"\\s]+?)['\"][^>]*class=['\"]product-title['\"][^>]*title=['\"]([^'\"]+?)['\"][^>]*>";
    // Price is read from the title attribute of the "product-normal" span.
    const string priceReg = "<span[^>]*class=['\"]product-normal['\"][^>]*title=['\"]([^'\"\\s]+)['\"][^>]*>";

    foreach (var item in list)
    {
        // A missing or malformed price must not abort the whole page: fall back to 0
        // instead of letting a FormatException escape (Convert.ToDecimal("") throws).
        decimal price;
        if (!decimal.TryParse(RegexHelper.Match(item, priceReg), out price))
        {
            price = 0m;
        }

        urls.Add(new TamllBase
        {
            Url = Utils.UrlDecode(RegexHelper.Match(item, regStr, 1)),
            Title = RegexHelper.Match(item, regStr, 2),
            Price = price
        });
    }

    return urls;
}
/// <summary>
/// Collects product page URLs from a newegg.com.cn listing page.
/// </summary>
/// <param name="listUrl">URL of the listing page to request.</param>
/// <returns>
/// A single-element list holding the final request URL when that URL is itself a product page;
/// otherwise the distinct product URLs found inside the "itemGrid1" element, passed through
/// <c>Utils.GetAbsoluteUrl</c>. An empty list is returned when any exception occurs.
/// </returns>
public override IEnumerable<string> GetUrlList(string listUrl)
{
    try
    {
        using (var http = new HttpHelper(listUrl, SiteEncoding))
        {
            var html = http.GetHtml();
            var url = http.GetRequestUrl();

            // The request URL reported by the helper may already be a product page.
            // Dots are escaped so that "." only matches a literal dot.
            if (Regex.IsMatch(url, "^http://www\\.newegg\\.com\\.cn/Product/[0-9a-zA-Z\\-]+\\.htm$"))
            {
                return new List<string> { url };
            }

            if (!html.IsNullOrEmpty())
            {
                html = RegexHelper.ClearTrn(html);
            }

            var showList = HtmlCls.GetHtmlById(html, "itemGrid1");
            var list = RegexHelper.Matches(showList, "(http://www\\.newegg\\.com\\.cn/Product/[0-9a-zA-Z\\-]+\\.htm)")
                .Distinct()
                .ToList();

            return list
                .Where(t => !t.IsNullOrEmpty())
                .Select(t => Utils.GetAbsoluteUrl(GetWebSiteInfo().BaseUrl, t))
                .ToList();
        }
    }
    catch (Exception)
    {
        // Best-effort scraping: any failure yields an empty result.
        return new List<string>();
    }
}
/// <summary>
/// Gets the main (first) product image.
/// </summary>
/// <param name="docHtml">HTML of the product page.</param>
/// <returns>
/// The result of matching the img <c>src</c> pattern against the "spec-n1" element.
/// </returns>
public static string GetBigPic(string docHtml)
{
    const string imgSrcPattern = "<img[^>]*src=['\"]([^'\"]*)['\"][^>]*>";

    var container = HtmlCls.GetHtmlById(docHtml, "spec-n1");
    return RegexHelper.Match(container, imgSrcPattern);
}
/// <summary>
/// Downloads a listing page and extracts the distinct product links from it.
/// </summary>
/// <param name="url">Listing page URL to download.</param>
/// <returns>
/// Distinct href matches (the "#fragment" part is excluded from the captured group);
/// an empty list when no HTML was downloaded.
/// </returns>
private static IEnumerable<string> GetUrlsFromHtml(string url)
{
    string docHtml = HtmlCls.GetHtmlByUrl(url, SiteEncoding);
    if (string.IsNullOrEmpty(docHtml))
    {
        return new List<string>();
    }

    docHtml = RegexHelper.ClearTrn(docHtml);

    // Some book pages use a different HTML layout: when the "plist" element is missing,
    // fall back to the first "list-h" block and its "i-img" items.
    string cssName;
    string listHtml = HtmlCls.GetHtmlById(docHtml, "plist");
    if (listHtml.IsNullOrEmpty())
    {
        cssName = "i-img";
        listHtml = HtmlCls.GetHtmlByCss(docHtml, "list-h").FirstOrDefault();
    }
    else
    {
        cssName = "p-img";
    }

    const string hrefPattern = "<a[^>]*href=[\"']?([^\"'>#]+)(#[^\"'>]*)?[\"']?[^>]*>";
    return HtmlCls.GetHtmlByCss(listHtml, cssName)
        .Select(fragment => RegexHelper.Match(fragment, hrefPattern))
        .Distinct()
        .ToList();
}
/// <summary>
/// Loads the page HTML and reads the product name from the <c>h1</c> inside the "name" element.
/// </summary>
/// <returns>The result of matching the <c>h1</c> text pattern.</returns>
public override string GetProName()
{
    const string headingPattern = "<h1>([^<]*)<";

    // GetHtml must run first; DocHtml is read afterwards.
    GetHtml(SiteEncoding);
    var nameBlock = HtmlCls.GetHtmlById(DocHtml, "name");
    return RegexHelper.Match(nameBlock, headingPattern);
}
/// <summary>
/// Runs a keyword search on the site and extracts the product links from the result page.
/// </summary>
/// <param name="word">Search keyword; it is URL-encoded with <c>SiteEncoding</c>.</param>
/// <returns>
/// Distinct href matches taken from the "productImage" fragments of the "atfResults" and
/// "btfResults" elements, without any "#fragment" suffix. An empty list is returned
/// (and the exception is logged) when anything fails.
/// </returns>
public override IEnumerable<string> SearchWord(string word)
{
    try
    {
        string searchUrl = GetWebSiteInfo().BaseUrl +
                           "/s/ref=nb_sb_noss_1?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&url=search-alias%3Daps&field-keywords={0}";
        var url = String.Format(searchUrl, Utils.UrlEncode(word, SiteEncoding));

        using (var http = new HttpHelper(url, SiteEncoding))
        {
            var html = http.GetHtml();
            if (!html.IsNullOrEmpty())
            {
                html = RegexHelper.ClearTrn(html);
            }

            var showList = HtmlCls.GetHtmlById(html, "atfResults") + HtmlCls.GetHtmlById(html, "btfResults");

            // '#' is excluded from the first group so the optional fragment group can actually
            // match; otherwise the greedy URL group swallows the fragment and the same product
            // with different fragments is not de-duplicated.
            const string hrefPattern = "<a[^>]*href=[\"']?([^\"'>#]+)(#[^\"'>]*)?[\"']?[^>]*>";

            var list = HtmlCls.GetHtmlByCss(showList, "productImage")
                .Select(t => RegexHelper.Match(t, hrefPattern))
                .Distinct()
                .ToList();
            return list;
        }
    }
    catch (Exception ex)
    {
        FileHelper.WriteException(ex);
        return new List<string>();
    }
}
/// <summary>
/// Loads the page HTML and reads the product picture from the "spec-n1" element.
/// </summary>
/// <returns>The result of matching the img <c>src</c> pattern.</returns>
public override string GetProPic()
{
    const string imgSrcPattern = "<img[^>]*src=['\"]([^'\"]*)['\"][^>]*>";

    GetHtml(SiteEncoding);
    var container = HtmlCls.GetHtmlById(DocHtml, "spec-n1");
    return RegexHelper.Match(container, imgSrcPattern);
}
/// <summary>
/// Reads the text of the first tag-enclosed run inside the "bzqd" element.
/// </summary>
/// <param name="docHtml">HTML of the product page.</param>
/// <returns>The trimmed match, or an empty string when the "bzqd" element is null or empty.</returns>
public static string GetPackingList(string docHtml)
{
    var section = HtmlCls.GetHtmlById(docHtml, "bzqd");
    if (string.IsNullOrEmpty(section))
    {
        return "";
    }

    var text = RegexHelper.Match(section, "<[^>]*>([^<]+)<[^>]*>");
    return text.Trim();
}
/// <summary>
/// Reads the manufacturer link text from the "i-detail" element.
/// </summary>
/// <param name="docHtml">HTML of the product page.</param>
/// <returns>The trimmed match, or an empty string when the "i-detail" element is null or empty.</returns>
public static string GetBrandName(string docHtml)
{
    var detail = HtmlCls.GetHtmlById(docHtml, "i-detail");
    if (string.IsNullOrEmpty(detail))
    {
        return "";
    }

    var brand = RegexHelper.Match(detail, "<li[^>]*>生产厂家:<a[^>]*brand[^>]*>([^<]+)</a>");
    return brand.Trim();
}
/// <summary>
/// Loads the page HTML and reads the first <c>src</c> attribute inside the "bgPics" element.
/// </summary>
/// <returns>The matched value, or an empty string when any exception occurs.</returns>
public override string GetProPic()
{
    const string srcPattern = "\\s+src=[\"']([^\"'>]+)[\"']";

    try
    {
        GetHtml(SiteEncoding);
        return RegexHelper.Match(HtmlCls.GetHtmlById(DocHtml, "bgPics"), srcPattern);
    }
    catch (Exception)
    {
        // Best-effort: failures are reported as "no picture".
        return "";
    }
}
/// <summary>
/// Loads the page HTML and returns the "title-descript" element with its tags stripped.
/// </summary>
/// <returns>The trimmed, tag-free text, or an empty string when any exception occurs.</returns>
public override string GetProName()
{
    try
    {
        GetHtml(SiteEncoding);
        var titleHtml = HtmlCls.GetHtmlById(DocHtml, "title-descript");

        // Remove every opening and closing tag, keeping only the text between them.
        var plainText = Regex.Replace(titleHtml, "</?[0-9a-zA-Z]+[^>]*>", "");
        return plainText.Trim();
    }
    catch (Exception)
    {
        return "";
    }
}
/// <summary>
/// Downloads a listing page and extracts the product links found in "proPic" blocks
/// of the "Id_prodItemList" element.
/// </summary>
/// <param name="url">Listing page URL to download.</param>
/// <returns>
/// The matched hrefs, with <c>SanfoUrl</c> prepended to those starting with "/";
/// an empty list when no HTML was downloaded.
/// </returns>
private static IEnumerable<string> GetUrlsFromHtml(string url)
{
    string docHtml = HtmlCls.GetHtmlByUrl(url);
    if (string.IsNullOrEmpty(docHtml))
    {
        return new List<string>();
    }

    const string regStr = "<div[^>]*class=['\"]proPic['\"][^>]*><a[^>]*href=['\"]([^'\"]+)['\"][^>]*>";
    var listHtml = HtmlCls.GetHtmlById(docHtml, "Id_prodItemList");
    List<string> hrefs = RegexHelper.Matches(listHtml, regStr);

    // Site-relative links are made absolute; everything else is passed through unchanged.
    return hrefs
        .Select(href => href.StartsWith("/") ? SanfoUrl + href : href)
        .ToList();
}
/// <summary>
/// Loads the page HTML and returns the "btAsinTitle" element with its tags stripped.
/// </summary>
/// <returns>
/// The tag-free text (not trimmed), or an empty string when an exception occurs;
/// the exception is logged through <c>FileHelper.WriteException</c>.
/// </returns>
public override string GetProName()
{
    try
    {
        GetHtml(SiteEncoding);
        var titleHtml = HtmlCls.GetHtmlById(DocHtml, "btAsinTitle");

        // Remove every opening and closing tag, keeping only the text between them.
        return Regex.Replace(titleHtml, "</?[0-9a-zA-Z]+[^>]*>", "");
    }
    catch (Exception ex)
    {
        FileHelper.WriteException(ex);
        return "";
    }
}
/// <summary>
/// Extracts the product description from the "productDescription" element and cleans it up.
/// </summary>
/// <param name="docHtml">HTML of the product page.</param>
/// <returns>
/// The cleaned description; when the element is null or empty, that value is returned unchanged.
/// </returns>
public static string GetDescFromHtml(string docHtml)
{
    string desc = HtmlCls.GetHtmlById(docHtml, "productDescription");
    if (string.IsNullOrEmpty(desc))
    {
        return desc;
    }

    // Replace the source shop's name with a neutral one.
    desc = Regex.Replace(desc, @"红孩子母婴商城|红孩子", "本商场");

    // Drop the id attribute to avoid style conflicts on the target page.
    desc = desc.Replace("id=\"productDescription\"", "");

    // Remove hyperlinks together with their content. The previous pattern started with
    // "<a[^]*href=[\"|']", which .NET parses as one negated character class, so it did not
    // require an href at all and accepted '|' as a quote; "[^>]*" was clearly intended.
    desc = Regex.Replace(desc, "<a[^>]*href=[\"'][^'\"]*[\"'][^>]*>(.*?)</a>", "");

    // Remove script blocks. Singleline + lazy ".*?" also covers scripts that contain '<'
    // or line breaks, which the former "[^<]*" body could not match.
    desc = Regex.Replace(desc, "<script[^>]*>.*?</script>", "", RegexOptions.Singleline);

    // Turn numbered src attributes (src2=, src3=, ...) into a real src so images display.
    desc = Regex.Replace(desc, "src\\d=", "src=");

    return desc;
}
/// <summary>
/// Reads the text of the third "mc tabcon hide" block inside the "detail" element.
/// </summary>
/// <param name="docHtml">HTML of the product page.</param>
/// <returns>
/// The trimmed match, or an empty string when the "detail" element is null or empty
/// or fewer than three blocks are found.
/// </returns>
public static string GetAftersaleService(string docHtml)
{
    var detail = HtmlCls.GetHtmlById(docHtml, "detail");
    if (string.IsNullOrEmpty(detail))
    {
        return "";
    }

    var tabs = HtmlCls.GetHtmlByCss(detail, "mc tabcon hide").ToList();
    if (tabs.Count < 3)
    {
        return "";
    }

    // The third block (index 2) is the one read here.
    return RegexHelper.Match(tabs[2], "<[^>]*>([^<]+)<[^>]*>").Trim();
}
/// <summary>
/// Loads the page HTML and reads the "src340" attribute of the "midImg" element,
/// with any query string removed.
/// </summary>
/// <returns>
/// The picture URL without its "?..." suffix, or an empty string when an exception occurs;
/// the exception is logged through <c>FileHelper.WriteException</c>.
/// </returns>
public override string GetProPic()
{
    try
    {
        GetHtml(SiteEncoding);
        var imgHtml = HtmlCls.GetHtmlById(DocHtml, "midImg");
        var source = HtmlCls.GetAttrValue(imgHtml, "src340");

        // Strip everything from the first '?' to the end.
        return Regex.Replace(source, "\\?.*$", "");
    }
    catch (Exception ex)
    {
        FileHelper.WriteException(ex);
        return "";
    }
}
/// <summary>
/// Loads the page HTML and reads the first <c>src</c> attribute inside the "prodImageCell" element.
/// </summary>
/// <returns>
/// The matched value, or an empty string when an exception occurs;
/// the exception is logged through <c>FileHelper.WriteException</c>.
/// </returns>
public override string GetProPic()
{
    const string srcPattern = "\\s+src=[\"']([^\"'>]+)[\"']";

    try
    {
        GetHtml(SiteEncoding);
        var cellHtml = HtmlCls.GetHtmlById(DocHtml, "prodImageCell");
        return RegexHelper.Match(cellHtml, srcPattern);
    }
    catch (Exception ex)
    {
        FileHelper.WriteException(ex);
        return "";
    }
}
/// <summary>
/// Collects product page URLs from the "proShow" element of a listing page.
/// </summary>
/// <param name="listUrl">URL of the listing page to request.</param>
/// <returns>
/// The distinct links of the form <c>{BaseUrl}/emall/prd_…_.html</c>. An empty list is
/// returned (and the exception is logged) when anything fails.
/// </returns>
public override IEnumerable<string> GetUrlList(string listUrl)
{
    try
    {
        using (var http = new HttpHelper(listUrl, SiteEncoding))
        {
            var html = http.GetHtml();
            html = RegexHelper.ClearTrn(html);
            var showList = HtmlCls.GetHtmlById(html, "proShow");

            // The base URL is literal text, not a pattern: escape it so that its dots (and any
            // other metacharacters) only match themselves. The dot in ".html" is escaped too.
            var baseUrl = System.Text.RegularExpressions.Regex.Escape(GetWebSiteInfo().BaseUrl ?? "");
            var linkReg = "<a[^>]*href=[\"']?(" + baseUrl +
                          "/emall/prd_\\d+_\\d+_-\\d+_\\d+_\\.html)[\"']?[^>]*>";

            var list = RegexHelper.Matches(showList, linkReg).Distinct().ToList();
            return list;
        }
    }
    catch (Exception ex)
    {
        FileHelper.WriteException(ex);
        return new List<string>();
    }
}
/// <summary>
/// Reads the struck-through (<c>&lt;del&gt;</c>) price from the "summary" element,
/// falling back to the "book-price" element when the first lookup is null or empty.
/// </summary>
/// <param name="docHtml">HTML of the product page.</param>
/// <returns>
/// The parsed price, rounded to 0 decimals when above 100 and to 1 decimal otherwise;
/// 0 when extraction or parsing throws.
/// </returns>
public static decimal GetMarketerPrice(string docHtml)
{
    const string delPattern = "<del>¥([^<]+)</del>";

    decimal mprice;
    try
    {
        string priceText = RegexHelper.Match(HtmlCls.GetHtmlById(docHtml, "summary"), delPattern);
        if (string.IsNullOrEmpty(priceText))
        {
            priceText = RegexHelper.Match(HtmlCls.GetHtmlById(docHtml, "book-price"), delPattern);
        }

        // Commas are removed before parsing.
        mprice = decimal.Parse(priceText.Replace(",", ""));
    }
    catch (Exception)
    {
        mprice = 0;
    }

    if (mprice > 100)
    {
        return Math.Round(mprice, 0);
    }

    return Math.Round(mprice, 1);
}
/// <summary>
/// Downloads a Tmall listing page and extracts the "product-Img" links found in the
/// "J_itemList" element.
/// </summary>
/// <param name="url">Listing page URL to download.</param>
/// <param name="next">
/// Set to the URL-decoded result of matching the anchor carrying the "ui-page-s-next" class;
/// stays an empty string when no HTML was downloaded.
/// </param>
/// <returns>
/// The URL-decoded links, with <c>BaseUrl</c> prepended to those starting with "/";
/// an empty list when no HTML was downloaded.
/// </returns>
private static IEnumerable<string> GetUrlsFromHtml(string url, out string next)
{
    next = "";

    // Tmall renders different front-end markup depending on the cookie, so a fixed cookie is sent.
    const string cookie = "x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; cna=qMo3B45XYmoCAct2enaIrZoT; t=9bfd6b376a1f1e450056f0e1b1c54240; tracknick=luoyong87610; mpp=t%3D0%26m%3D%26h%3D0%26l%3D0; uc1=x; cookie2=8eb29ff22cbe3bddcad34d264d01806f; passtime=1341280687588; isFirstOpen=false; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0";

    string docHtml = HtmlCls.GetHtmlByUrl(url, Encoding.Default, cookie);
    if (string.IsNullOrEmpty(docHtml))
    {
        return new List<string>();
    }

    docHtml = RegexHelper.ClearBr(docHtml);

    const string nextPattern = "<a[^>]*href=['\"]([^'\"\\s]+)['\"][^>]*class=['\"]ui-page-s-next['\"][^>]*>";
    next = Utils.UrlDecode(RegexHelper.Match(docHtml, nextPattern));

    const string regStr = "<a[^>]*href=['\"]([^'\"\\s]+?)['\"][^>]*class=['\"]product-Img['\"][^>]*>";
    var listHtml = HtmlCls.GetHtmlById(docHtml, "J_itemList");
    List<string> hrefs = RegexHelper.Matches(listHtml, regStr);

    // Site-relative links get the base URL as prefix; every link is URL-decoded.
    return hrefs
        .Select(href => (href.StartsWith("/") ? BaseUrl : "") + Utils.UrlDecode(href))
        .ToList();
}
/// <summary>
/// Builds the product description HTML from a product page and cleans it up.
/// </summary>
/// <param name="docHtml">HTML of the product page.</param>
/// <param name="version">
/// 0 selects the general product layout; any other value selects the book layout
/// (callers are documented to pass 1 for books).
/// </param>
/// <returns>
/// The cleaned description. When an exception occurs it is logged through
/// <c>FileHelper.WriteException</c> and whatever had been assembled so far is returned.
/// </returns>
public static string GetProDesc(string docHtml, int version)
{
    const string anchorPattern = "<a[^>]*href=[\"']([^'\"]+?)[\"'][^>]*>(.*?)</a>";

    string area = "";
    try
    {
        docHtml = RegexHelper.ClearTrn(docHtml);

        if (version != 0)
        {
            // Book layout: concatenate every "m m1" block.
            var blocks = HtmlCls.GetHtmlByCss(docHtml, "m m1");
            area = blocks.Aggregate(area, (acc, block) => acc + block);

            // Drop the first "list-h" block found in the concatenated HTML.
            string unwanted = HtmlCls.GetHtmlByCss(area, "list-h").FirstOrDefault();
            if (!string.IsNullOrEmpty(unwanted))
            {
                area = area.Replace(unwanted, "");
            }

            // Drop the "other works by this author" section.
            unwanted = HtmlCls.GetHtmlById(area, "related-works");
            if (!string.IsNullOrEmpty(unwanted))
            {
                area = area.Replace(unwanted, "");
            }

            // Prepend the book information: the first nine <li> entries of "summary",
            // each wrapped in a <div>, with hyperlinks reduced to their text.
            string summary = HtmlCls.GetHtmlById(docHtml, "summary");
            string bookInfo = "";
            foreach (var entry in RegexHelper.Matches(summary, "<li[^>]*>(.*?)</li>").Take(9))
            {
                bookInfo = bookInfo + "<div>" + entry + "</div>";
            }

            bookInfo = Regex.Replace(bookInfo, anchorPattern, "$2");
            area = bookInfo + area;
        }
        else
        {
            // General layout: specification table (added 2012-02-29, shy) followed by the content block.
            string specTable = HtmlCls.GetHtmlByCss(docHtml, "Ptable").FirstOrDefault();
            if (!string.IsNullOrEmpty(specTable))
            {
                area += specTable;
            }

            area += HtmlCls.GetHtmlByCss(docHtml, "content").FirstOrDefault();
        }

        // Remove the authorization notice (first element carrying color="red").
        string redNotice = HtmlCls.GetHtmlByAttr(area, "color=\"red\"").FirstOrDefault();
        if (!string.IsNullOrEmpty(redNotice))
        {
            area = area.Replace(redNotice, "");
        }

        // Avoid style conflicts on the target page.
        area = area.Replace("class=\"content\"", "");
        // Reduce hyperlinks to their text.
        area = Regex.Replace(area, anchorPattern, "$2");
        // Remove inline styles.
        area = Regex.Replace(area, "\\sstyle=(['\"])[^'\"]+?\\1", "");
        // Remove script blocks.
        area = Regex.Replace(area, "<script[^>]*>(.*?)</script>", "");
        // Turn numbered src attributes into a real src so images display.
        area = Regex.Replace(area, "src\\d=", "src=");
        // Replace the source shop's name with a neutral one.
        area = Regex.Replace(area, "京东商城|京东", "本商城");
    }
    catch (Exception ex)
    {
        FileHelper.WriteException(ex);
    }

    return area;
}
/// <summary>
/// Gets the product name.
/// </summary>
/// <param name="docHtml">HTML of the product page.</param>
/// <returns>
/// The result of matching the <c>h1</c> text pattern against the "name" element.
/// </returns>
public static string GetProName(string docHtml)
{
    const string headingPattern = "<h1>([^<]*)<";

    var nameBlock = HtmlCls.GetHtmlById(docHtml, "name");
    return RegexHelper.Match(nameBlock, headingPattern);
}