/// <summary>
/// Reloads the categories already stored for the current site, downloads the j1.com
/// sitemap and stores every category link whose id is not known yet.
/// </summary>
public void SaveAllSiteClass()
{
    HasBindClasslist = new SiteClassInfoDB().getAllSiteCatInfo(Baseinfo.SiteId);

    string url = "http://www.j1.com/sitemap.html";
    string page = HtmlAnalysis.Gethtmlcode(url);
    string content = RegGroupsX<string>(page, "<div class=\"sitemap_sortwrap qbfl\">(?<x>.*?)</div>");
    var list = RegGroupCollection(content, "<a target='_blank' href='(?<y>.*?)'>(?<x>.*?)</a>");

    // NOTE(review): RegGroupCollection appears to return null when nothing matches
    // (confirm against its implementation); without this guard a page that lacks the
    // sitemap block ends in a NullReferenceException.
    if (list == null)
    {
        return;
    }

    for (int i = 0; i < list.Count; i++)
    {
        string catUrl = list[i].Groups["y"].Value;
        string catName = list[i].Groups["x"].Value;
        string catid = RegGroupsX<string>(catUrl, "http://www.j1.com/p-(?<x>\\d+)");

        // Links that are not category pages yield no id; storing them would create a
        // category row without a ClassId.
        if (string.IsNullOrEmpty(catid))
        {
            continue;
        }

        if (HasBindClasslist.Exists(p => p.ClassId == catid))
        {
            continue;
        }

        SiteClassInfo cat = new SiteClassInfo
        {
            ParentUrl = "",
            ParentClass = "",
            ParentName = "",
            TotalProduct = 0,
            Urlinfo = catUrl,
            ClassId = catid,
            UpdateTime = DateTime.Now,
            IsDel = false,
            BindClassId = 0,
            BindClassName = "",
            HasChild = true,
            IsBind = false,
            IsHide = false,
            ClassName = catName,
            SiteId = Baseinfo.SiteId,
            ClassCrumble = "",
            CreateDate = DateTime.Now
        };
        HasBindClasslist.Add(cat);
        shopClasslist.Add(cat);
    }

    // Persist everything collected in one batch instead of once per loop iteration.
    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}
/// <summary>
/// Downloads a yhd.com category listing page and stores the categories found in its
/// category list that are not yet present in <c>HasBindClasslist</c>.
/// </summary>
/// <param name="url">
/// Category listing URL; the fixed filter suffix is appended when it is missing.
/// </param>
private void GetYhdClassInfo(string url)
{
    if (!url.Contains("b/a-s1-v4-p1-price-d0-f0d-m1-rt0-pid-mid0-k/"))
    {
        url += "b/a-s1-v4-p1-price-d0-f0d-m1-rt0-pid-mid0-k/";
    }

    string pageInfo = Gethtmlcode(url);
    string classInfo = RegGroupsX<string>(pageInfo, "<ul class=\"listCon clearfix\">(?<x>.*?)<a class=\"c_btn c_next iconSearch\"");
    var list = RegGroupCollection(classInfo, "href=\"(?<x>.*?)\".*?title=\"(?<y>.*?)\"");
    if (list == null)
    {
        return;
    }

    string pcatUrl = "";
    string pcatName = "";
    string pcatId = "";
    string classCrumble = "";
    int total = RegGroupsX<int>(pageInfo, "共(?<x>\\d+?)条");

    // Exactly one link: the id is rebuilt from the script variables embedded in the page.
    if (list.Count == 1)
    {
        string categoryname = RegGroupsX<string>(pageInfo, "var categoryName = '(?<x>.*?)'");
        string extid = RegGroupsX<string>(pageInfo, "var expectCategoryId = \"(?<x>\\d+)\"");
        string tempcurcatid = "c" + extid + "-" + categoryname;
        if (HasBindClasslist.Exists(p => p.ClassId == tempcurcatid))
        {
            return;
        }

        string current = RegGroupsX<string>(pageInfo, "<title>(?<x>.*?)品种齐全|<div class=\"guide_title\"><span title=\"(?<x>.*?)\">");
        SiteClassInfo single = new SiteClassInfo
        {
            Urlinfo = $"http://list.yhd.com/{tempcurcatid}/b/a-s1-v4-p1-price-d0-f0d-m1-rt0-pid-mid0-k",
            ClassId = tempcurcatid,
            ClassName = current,
            BindClassId = 0,
            BindClassName = "",
            CreateDate = DateTime.Now,
            UpdateTime = DateTime.Now,
            SiteId = Baseinfo.SiteId,
            IsBind = false,
            IsDel = false,
            IsHide = false
        };
        single.HasChild = HasBindClasslist.Exists(p => p.ParentClass == single.ClassId);
        single.ParentClass = pcatId;
        single.ParentName = pcatName;
        single.ParentUrl = pcatUrl;

        // Only ids of the form c<digits>-<digits>[-<digits>] are stored.
        if (regIsMatch(tempcurcatid, "^(?<x>c\\d+-\\d+(-\\d+)?)$")
            && !HasBindClasslist.Exists(p => p.ClassId == single.ClassId))
        {
            new SiteClassInfoDB().AddSiteClass(single);
            LogServer.WriteLog("线程id:" + Thread.CurrentThread.ManagedThreadId + "\t" + single.ClassId + "\t" + single.ClassName + "111111111111111", "addpro");
            HasBindClasslist.Add(single);
        }

        return;
    }

    // Several links: each entry is treated as the parent of the next one.
    for (int i = 0; i < list.Count; i++)
    {
        SiteClassInfo catInfo = new SiteClassInfo();
        catInfo.Urlinfo = list[i].Groups["x"].Value;
        catInfo.ClassId = RegGroupsX<string>(catInfo.Urlinfo, "http://list.yhd.com/(?<x>.*?)/");
        catInfo.ClassName = list[i].Groups["y"].Value;
        if (catInfo.Urlinfo == null || catInfo.ClassId == null || catInfo.ClassName == null)
        {
            LogServer.WriteLog(Baseinfo.SiteName + "分类抓取错误1url\t" + url, "AddClassError");
            continue;
        }

        // Extend the crumb for every valid entry, not only for new ones. Previously the
        // crumb was extended inside the "is new" branch, so an ancestor that already
        // existed was missing from the crumb of the categories listed after it.
        if (i != 0 && pcatId != "")
        {
            classCrumble += pcatId + ",";
        }

        if (!HasBindClasslist.Exists(p => p.ClassId == catInfo.ClassId))
        {
            catInfo.ParentClass = pcatId;
            catInfo.ParentName = pcatName;
            catInfo.ParentUrl = pcatUrl;
            catInfo.ClassCrumble = classCrumble.TrimEnd(',');
            catInfo.BindClassId = 0;
            catInfo.BindClassName = "";
            catInfo.CreateDate = DateTime.Now;
            catInfo.UpdateTime = DateTime.Now;
            catInfo.SiteId = Baseinfo.SiteId;
            catInfo.IsBind = false;
            catInfo.IsDel = false;
            catInfo.IsHide = false;
            catInfo.HasChild = true;

            // The last entry is the leaf: it has no children and gets the page's product count.
            if (list.Count - 1 == i)
            {
                catInfo.HasChild = false;
                catInfo.TotalProduct = total;
            }

            HasBindClasslist.Add(catInfo);
            LogServer.WriteLog("线程id:" + Thread.CurrentThread.ManagedThreadId + "\t" + catInfo.ClassId + "\t" + catInfo.ClassName + "111111111111111", "addpro");
            new SiteClassInfoDB().AddSiteClass(catInfo);
        }

        pcatUrl = catInfo.Urlinfo;
        pcatName = catInfo.ClassName;
        pcatId = catInfo.ClassId;
    }
}
/// <summary>
/// Refreshes one j1.com category: reads its parent from the breadcrumb, stores child
/// categories that are not known yet and writes the updated category back.
/// </summary>
/// <param name="siteClassInfo">Category to refresh; its <c>Urlinfo</c> is downloaded.</param>
private void UpdateCat(SiteClassInfo siteClassInfo)
{
    string pageinfo = HtmlAnalysis.Gethtmlcode(siteClassInfo.Urlinfo);
    string crumb = RegGroupsX<string>(pageinfo, "<div class=detailnav>(?<x>.*?)</div>");
    if (crumb == null)
    {
        return;
    }

    string parentUrl = "";
    string parentName = "";
    string parentId = "";

    // The last breadcrumb link that is neither the home page nor the category itself
    // is taken as the parent.
    var deep = RegGroupCollection(crumb, "href=\"(?<y>.*?)\"( target=\"_blank\")?>(?<x>.*?)</a>");
    if (deep != null)
    {
        for (int i = 0; i < deep.Count; i++)
        {
            string crumbName = deep[i].Groups["x"].Value;
            if (crumbName.Contains("首页") || crumbName == siteClassInfo.ClassName)
            {
                continue;
            }

            parentUrl = deep[i].Groups["y"].Value;
            parentName = crumbName;
            parentId = RegGroupsX<string>(parentUrl, "http://www.j1.com/p-(?<x>\\d+)");
        }
    }

    string children = RegGroupsX<string>(pageinfo, "<div class=\"listpageChooseBox\">(?<x>.*?)</div>");

    // A page without the chooser box used to throw on children.Contains; it is now
    // treated as "no children".
    var catlist = children == null
        ? null
        : RegGroupCollection(children, "<a href=\"(?<y>.*?)\">(?<x>.*?)<span>");
    siteClassInfo.HasChild = children != null && !children.Contains(siteClassInfo.ClassName);

    if (catlist != null)
    {
        for (int i = 0; i < catlist.Count; i++)
        {
            string url = catlist[i].Groups["y"].Value;
            string catid = RegGroupsX<string>(url, "http://www.j1.com/p-(?<x>\\d+)");

            // Links without a category id would be stored with a null ClassId.
            if (string.IsNullOrEmpty(catid))
            {
                continue;
            }

            if (HasBindClasslist.Exists(p => p.ClassId == catid))
            {
                continue;
            }

            string catName = catlist[i].Groups["x"].Value;
            SiteClassInfo cat = new SiteClassInfo
            {
                ParentUrl = "",
                ParentClass = "",
                ParentName = "",
                TotalProduct = 0,
                Urlinfo = url,
                ClassId = catid,
                UpdateTime = DateTime.Now,
                IsDel = false,
                BindClassId = 0,
                BindClassName = "",
                HasChild = true,
                IsBind = false,
                IsHide = false,
                ClassName = catName,
                SiteId = Baseinfo.SiteId,
                ClassCrumble = "",
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(cat);
            shopClasslist.Add(cat);
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }

    siteClassInfo.ParentUrl = parentUrl;
    siteClassInfo.ParentClass = parentId;
    siteClassInfo.ParentName = parentName;
    siteClassInfo.TotalProduct = RegGroupsX<int>(pageinfo, "共(?<x>\\d+)个商品");
    siteClassInfo.UpdateTime = DateTime.Now;
    new mmbSiteClassInfoDB().UpdateSiteClass(siteClassInfo);
}
/// <summary>
/// Refreshes one 360kxr.com category: registers the breadcrumb ancestors and the
/// categories returned by the page's getJSON endpoints, then writes the updated
/// category back.
/// </summary>
/// <param name="siteClassInfo">Category to refresh; its <c>Urlinfo</c> is downloaded.</param>
private void UpdateCat(SiteClassInfo siteClassInfo)
{
    string page = HtmlAnalysis.Gethtmlcode(siteClassInfo.Urlinfo);
    string cromb = RegGroupsX<string>(page, "您现在的位置:</span>(?<x>.*?)</div>");
    if (cromb == null)
    {
        return;
    }

    var plist = RegGroupCollection(cromb, "<a class=\"\" href=\"(?<x>.*?)\">(?<y>.*?)</a>");
    if (plist == null)
    {
        return;
    }

    string parentUrl = "";
    string parentName = "";
    string parentId = "";
    foreach (Match item in plist)
    {
        parentUrl = item.Groups["x"].Value;
        parentName = item.Groups["y"].Value;
        if (parentName == "首页")
        {
            continue;
        }

        if (!string.IsNullOrEmpty(parentName))
        {
            parentName = parentName.Trim();
        }

        if (parentName == "")
        {
            parentUrl = "";
            continue;
        }

        // Prefer the numeric id from "category/<id>-"; otherwise fall back to the
        // path segment before ".html".
        parentId = RegGroupsX<string>(parentUrl, "category/(?<x>\\d+)-");
        if (!ValidCatId(parentId))
        {
            parentId = RegGroupsX<string>(parentUrl, "/(?<x>.*?).html");
            if (string.IsNullOrEmpty(parentId))
            {
                continue;
            }
        }

        // Plain concatenation: the domain must not be interpreted as a format string.
        parentUrl = domain + parentUrl;
        if (!HasBindClasslist.Exists(c => c.ClassId == parentId))
        {
            SiteClassInfo ancestor = new SiteClassInfo
            {
                ParentClass = "",
                ParentName = "",
                ClassName = parentName,
                ClassId = parentId,
                ParentUrl = "",
                IsDel = false,
                IsBind = false,
                IsHide = false,
                BindClassId = 0,
                BindClassName = "",
                HasChild = true,
                ClassCrumble = "",
                TotalProduct = 0,
                SiteId = Baseinfo.SiteId,
                Urlinfo = parentUrl,
                UpdateTime = DateTime.Now,
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(ancestor);
            shopClasslist.Add(ancestor);
        }
    }

    // Pages without any getJSON call used to throw here on templist.Count.
    var templist = RegGroupCollection(page, "getJSON\\(\"(?<x>.*?)\"");
    if (templist != null)
    {
        for (int i = 0; i < templist.Count; i++)
        {
            var caturl = templist[i].Groups["x"].Value;
            string temppage = HtmlAnalysis.Gethtmlcode(domain + caturl);
            var catlist = RegGroupCollection(temppage, "n_(?<x>.*?)\"EntityState");
            if (catlist == null)
            {
                continue;
            }

            foreach (Match item in catlist)
            {
                string cat = item.Groups["x"].Value;
                string catid = RegGroupsX<string>(cat, "id\":(?<x>\\d+),");
                string catName = RegGroupsX<string>(cat, "\"n_name\":\"(?<x>.*?)\"");
                string catpid = RegGroupsX<string>(cat, "\"parentid\":(?<x>.*?),");

                // A fragment without an id cannot be turned into a usable category.
                if (string.IsNullOrEmpty(catid))
                {
                    continue;
                }

                string tempurl = string.Format("http://www.360kxr.com/category/{0}-0-2-1-15-1.html", catid);
                if (!HasBindClasslist.Exists(c => c.ClassId == catid))
                {
                    SiteClassInfo listed = new SiteClassInfo
                    {
                        ParentClass = catpid,
                        ParentName = "",
                        ClassName = catName,
                        ClassId = catid,
                        ParentUrl = "",
                        IsDel = false,
                        IsBind = false,
                        IsHide = false,
                        BindClassId = 0,
                        BindClassName = "",
                        HasChild = true,
                        ClassCrumble = "",
                        TotalProduct = 0,
                        SiteId = Baseinfo.SiteId,
                        Urlinfo = tempurl,
                        UpdateTime = DateTime.Now,
                        CreateDate = DateTime.Now
                    };
                    HasBindClasslist.Add(listed);
                    shopClasslist.Add(listed);
                }
            }
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }

    siteClassInfo.HasChild = HasBindClasslist.Exists(c => c.ParentClass == siteClassInfo.ClassId);
    siteClassInfo.ParentClass = parentId;
    siteClassInfo.ParentName = parentName;
    siteClassInfo.ParentUrl = parentUrl;
    siteClassInfo.UpdateTime = DateTime.Now;
    siteClassInfo.TotalProduct = RegGroupsX<int>(page, "<div class=\"goods-total\">共<b>(?<x>\\d+)</b>个商品</div>");
    new mmbSiteClassInfoDB().UpdateSiteClass(siteClassInfo);
}
/// <summary>
/// Writes the crawler-maintained columns of a category back to the row with the same Id.
/// </summary>
/// <param name="catinfo">Category whose values are written.</param>
/// <returns>
/// <c>true</c> when at least one row was updated; <c>false</c> when none was or when
/// the update threw (the exception is logged under "DBError").
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="catinfo"/> is null.</exception>
public bool updateSpiderOnly(SiteClassInfo catinfo)
{
    if (catinfo == null)
    {
        throw new ArgumentNullException(nameof(catinfo));
    }

    using (var connection = _dbFactory.OpenDbConnection())
    {
        try
        {
            int affected = connection.UpdateOnly(
                catinfo,
                c => new
                {
                    c.ClassId,
                    c.ClassName,
                    c.Urlinfo,
                    c.UpdateTime,
                    c.ParentClass,
                    c.ParentUrl,
                    c.ParentName,
                    c.ClassCrumble,
                    c.TotalProduct,
                    c.HasChild
                },
                c => c.Id == catinfo.Id);
            return affected > 0;
        }
        catch (Exception error)
        {
            LogServer.WriteLog(error, "DBError");
            return false;
        }
    }
}
/// <summary>
/// Sets the IsDel column to true on the row whose Id matches the given category.
/// Database errors are logged under "DBError" and not rethrown.
/// </summary>
/// <param name="catinfo">Category to flag; only its Id is used.</param>
/// <exception cref="ArgumentNullException"><paramref name="catinfo"/> is null.</exception>
public void SetIsDel(SiteClassInfo catinfo)
{
    if (catinfo == null)
    {
        throw new ArgumentNullException(nameof(catinfo));
    }

    using (var connection = _dbFactory.OpenDbConnection())
    {
        try
        {
            // Only the IsDel column of the template object is written.
            var deletedTemplate = new SiteClassInfo { IsDel = true };
            connection.UpdateOnly(deletedTemplate, row => row.IsDel, row => row.Id == catinfo.Id);
        }
        catch (Exception error)
        {
            LogServer.WriteLog(error, "DBError");
        }
    }
}
/// <summary>
/// Extracts the ehaier.com category links from the "all categories" area of a page and
/// stores the ones that are not known yet.
/// </summary>
/// <param name="directoryHtml">HTML of the page that contains the category directory.</param>
private void GetCatInfo(string directoryHtml)
{
    string catArea = RegGroupsX<string>(directoryHtml, "<div class=\"all-category-box\">(?<x>.*?)<div class=\"category-contact\">");
    if (catArea == null)
    {
        return;
    }

    var list = RegGroupCollection(catArea, "href=\"(?<x>.*?)\".*?>(?<y>.*?)</a>");

    // NOTE(review): RegGroupCollection appears to return null when nothing matches
    // (confirm against its implementation); iterating null would throw.
    if (list != null)
    {
        foreach (Match item in list)
        {
            string tempUrl = item.Groups["x"].Value;
            string tempName = item.Groups["y"].Value;

            // Skip the heading link of the directory itself.
            if (tempName == "商品分类")
            {
                continue;
            }

            string tempid = RegGroupsX<string>(tempUrl, "http://www.ehaier.com/l/(?<x>\\d+).html|http://www.ehaier.com/l/(?<x>\\d+-\\d+).html|http://www.ehaier.com/l/(?<x>\\d+-\\d+-\\d+).html");
            if (!ValidCatId(tempid))
            {
                continue;
            }

            if (HasBindClasslist.Exists(c => c.ClassId == tempid))
            {
                continue;
            }

            SiteClassInfo iteminfo = new SiteClassInfo
            {
                ParentClass = "",
                ParentName = "",
                ClassName = tempName,
                ClassId = tempid,
                ParentUrl = "",
                IsDel = false,
                IsBind = false,
                IsHide = false,
                BindClassId = 0,
                BindClassName = "",
                HasChild = tempid.Contains("-"),
                ClassCrumble = "",
                TotalProduct = 0,
                SiteId = Baseinfo.SiteId,
                Urlinfo = tempUrl,
                UpdateTime = DateTime.Now,
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(iteminfo);
            shopClasslist.Add(iteminfo);
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}
/// <summary>
/// Extracts the hangowa.com category links from the category box of a page and stores
/// the ones that are not known yet.
/// </summary>
/// <param name="directoryHtml">HTML of the page that contains the category box.</param>
private void GetCatInfo(string directoryHtml)
{
    string catArea = RegGroupsX<string>(directoryHtml, "<div class=\"category-box\" id=\"category_box\">(?<x>.*?)</ul>");
    if (catArea == null)
    {
        return;
    }

    var list = RegGroupCollection(catArea, "<a(?<x>.*?)</a>");

    // The page-level product count is the same for every link, so it is read at most
    // once and only when a new category is actually created.
    int? totalProducts = null;

    // NOTE(review): RegGroupCollection appears to return null when nothing matches
    // (confirm against its implementation); iterating null would throw.
    if (list != null)
    {
        foreach (Match item in list)
        {
            string anchor = item.ToString();
            string tempUrl = RegGroupsX<string>(anchor, "href=\"(?<x>.*?)\"");
            if (string.IsNullOrEmpty(tempUrl))
            {
                continue;
            }

            tempUrl = string.Format("http://www.hangowa.com{0}", tempUrl);
            string tempName = RegGroupsX<string>(anchor, ">(?<x>.*?)</a>");
            string tempid = RegGroupsX<string>(tempUrl, "gallery-(?<x>\\d+?).html");

            // Links that are not gallery pages have no id and must not be stored.
            if (string.IsNullOrEmpty(tempid))
            {
                continue;
            }

            if (HasBindClasslist.Exists(c => c.ClassId == tempid))
            {
                continue;
            }

            if (!totalProducts.HasValue)
            {
                totalProducts = RegGroupsX<int>(directoryHtml, "共<b class=\"op-search-result\">(?<x>\\d+?)</b>件");
            }

            SiteClassInfo iteminfo = new SiteClassInfo
            {
                ParentClass = "",
                ParentName = "",
                ClassName = tempName,
                ClassId = tempid,
                ParentUrl = "",
                IsDel = false,
                IsBind = false,
                IsHide = false,
                BindClassId = 0,
                BindClassName = "",
                // Anchors marked class="level3" are stored as having no children.
                HasChild = !anchor.Contains("class=\"level3\""),
                ClassCrumble = "",
                TotalProduct = totalProducts.Value,
                SiteId = Baseinfo.SiteId,
                Urlinfo = tempUrl,
                UpdateTime = DateTime.Now,
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(iteminfo);
            shopClasslist.Add(iteminfo);
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}
/// <summary>
/// Batch-adds categories: creates a local category named after a crawled site category,
/// placed under the parent posted in the "parCatName" form field, then adds its child
/// categories through <c>addChildCat</c>.
/// </summary>
/// <param name="catid">
/// Id of the crawled site category. When it is not an integer, the current
/// <c>SiteCatName</c> value and an empty site category are used.
/// </param>
private void bathAddCat(string catid)
{
    SiteClassBll bll = new SiteClassBll();
    int siteClassId;
    int parentId;
    SiteClassInfo siteCat = new SiteClassInfo();
    if (int.TryParse(catid, out siteClassId))
    {
        siteCat = bll.GetCatById(siteClassId);

        // An unknown id (or a category without a name) used to end in a
        // NullReferenceException; there is nothing to add in that case.
        if (siteCat == null || siteCat.ClassName == null)
        {
            return;
        }

        SiteCatName = siteCat.ClassName.Replace(" ", "");
    }

    if (!int.TryParse(Request.Form["parCatName"], out parentId))
    {
        return;
    }

    #region Add category
    ClassInfo cat = new ClassInfo();
    cat.CatName = SiteCatName;
    cat.SpellWord = WordCenter.GetShortPinyin(cat.CatName);
    cat.SEOWords = "";
    cat.Sort = 0;
    cat.CreateDate = DateTime.Now;
    cat.UpdateTime = DateTime.Now;
    cat.HasChild = false;

    ClassInfoBll catbll = new ClassInfoBll();
    if (parentId != 0)
    {
        ClassInfo parCat = catbll.getCat(parentId);

        // Do not create a category below a parent that cannot be loaded.
        if (parCat == null)
        {
            return;
        }

        cat.ParentId = parCat.Id;
        cat.Level = parCat.Level + 1;
        cat.ParentName = parCat.CatName;

        // The crumb of the new category is the parent's crumb followed by the parent itself.
        if (!string.IsNullOrEmpty(parCat.CatCrumbleIds))
        {
            cat.CatCrumbleIds = parCat.CatCrumbleIds + "," + parCat.Id;
            cat.CatCrumbleNames = parCat.CatCrumbleNames + "," + parCat.CatName;
        }
        else
        {
            cat.CatCrumbleIds = parCat.Id.ToString(CultureInfo.InvariantCulture);
            cat.CatCrumbleNames = parCat.CatName;
        }

        if (!parCat.HasChild)
        {
            parCat.HasChild = true;
            catbll.UpdateCat(parCat);
        }
    }
    else
    {
        // Parent id 0: the new category is created at level 1 without a parent.
        cat.ParentId = 0;
        cat.Level = 1;
        cat.ParentName = "";
        cat.CatCrumbleIds = "";
        cat.CatCrumbleNames = "";
    }

    cat.Id = catbll.AddCat(cat);
    #endregion

    #region Add child categories
    AllSiteCat = bll.GetClassInfo(siteCat.SiteId);
    addChildCat(siteCat.ClassId, cat);
    #endregion
}
/// <summary>
/// Refreshes a Tmall category whose ParentClass is empty: downloads its listing page,
/// walks the category breadcrumb, registers unknown ancestors, siblings and children,
/// and finally adds or updates the category itself.
/// </summary>
/// <param name="item">Category to refresh; only processed when its ParentClass is "".</param>
private void UpdateTmallNode(SiteClassInfo item)
{
    if (item.ParentClass != "")
    {
        return;
    }

    const string listUrlPrefix = "http://list.tmall.com/search_product.htm?cat=";

    string url = string.Format("http://list.tmall.com/search_product.htm?cat={0}", item.ClassId);
    HtmlAnalysis reqest = new HtmlAnalysis();
    // NOTE(review): this is a hard-coded, logged-in session cookie. It is kept unchanged
    // to preserve behavior, but credentials like this belong in configuration.
    reqest.Headers.Add("Cookie", "_med=dw:1440&dh:900&pw:1440&ph:900&ist:0; pnm_cku822=126UW5TcyMNYQwiAiwQRHhBfEF8QXtHcklnMWc%3D%7CUm5Ockt%2BQXVPdUp%2BQH9Dfyk%3D%7CU2xMHDJxPk82UjVOI1h2VnhCbExiPl85VTJMNhhOGA%3D%3D%7CVGhXd1llXGlWYlhiXWlXaFRoX2JAekN3TndMdUB1S3RAeUx0TmA2%7CVWldfS0SMgwzCCgULg4gWz0ReEB2Aix6LA%3D%3D%7CVmhIGCcYJAQ%2FAyMXLRc3DTQNORklHCUYOAwxDCwQKRAtDTgDPmg%2B%7CV25Tbk5zU2xMcEl1VWtTaUlwJg%3D%3D; cq=ccp%3D1; tt=login.taobao.com; res=scroll%3A990*776-client%3A977*290-offset%3A977*290-screen%3A1440*900; hng=; uss=BqRyb7nd5KLIbC5D91VCamaiwt66iy8KP0cAS24EJNQWFeWsxGZv%2FwEo%2BAs%3D; cna=cFJaEEwJdRsCATyy24A1yMNe; l=AkZGKGJIZ/WDVSsY65u6dVSj1jLItYph; isg=Alpa8TTm1nmgf1rVi7OVW5M1rADEst5lFaLZTWTTFO241_oRTBsudSCt8xs0; OZ_1U_2061=vid=v801c15a894bb1.0&ctime=1478143053<ime=1476512356; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; _tb_token_=eeed7bb353eb5; ck1=; uc1=cookie14=UoW%2FX9QwsnjAzg%3D%3D&lng=zh_CN&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&existShop=false&cookie21=V32FPkk%2FhSg%2F&tag=0&cookie15=UIHiLt3xD8xYTw%3D%3D&pas=0; uc3=sg2=AQI4ctClVx2ycnFp5kyAa%2F3VFKDYjzhZBJFC8KK2LVw%3D&nk2=D9ZNP7htc6w%3D&id2=UU8Lx7%2BmPirPbw%3D%3D&vt3=F8dARHfHI%2BnGtn3VuNA%3D&lg2=UtASsssmOIJ0bQ%3D%3D; lgc=lunce188; tracknick=lunce188; cookie2=10682dca3e46d779e26f299924785699; cookie1=AV0h8l61cg4iTp3AqqPZRlYP3nQGpHHQCAg%2FB5Sm3VI%3D; unb=2731635449; t=65336f3349d3648c68445898ef92bec2; skt=2c4d55251dbb75a9; _nk_=lunce188; _l_g_=Ug%3D%3D; cookie17=UU8Lx7%2BmPirPbw%3D%3D; login=true");
    reqest.RequestUserAgent = "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413";
    string catPage = reqest.HttpRequest(url);

    // Random 6-29 second pause after every request.
    Thread.Sleep(new Random().Next(6, 30) * 1000);

    // A failed request must not crash on catPage.Contains below.
    if (catPage == null)
    {
        return;
    }

    // The response points at the Tmall home page and the category has not been updated
    // for more than 15 days: delete it.
    if (catPage.Contains("ResponseUri:http://www.tmall.com/") && item.UpdateTime.AddDays(15) < DateTime.Now)
    {
        new SiteClassBll().delClass(item);
        return;
    }

    var crumbsList = RegGroupCollection(catPage, "<li data-tag=\"cat\">(?<x>.*?)</li>");
    if (crumbsList == null)
    {
        return;
    }

    SiteClassInfo catinfo = new SiteClassInfo();
    string paraInfo = "";
    string paraUrl = "";
    string paraCatId = "";
    string paraName = "";
    for (int i = 0; i < crumbsList.Count; i++)
    {
        // At this point the para* variables still describe crumb i-1, i.e. the parent
        // of crumb i.
        bool hasParent = !string.IsNullOrEmpty(paraCatId);
        catinfo.ParentName = paraName;
        catinfo.ParentClass = paraCatId;
        catinfo.ParentUrl = hasParent ? listUrlPrefix + paraCatId : "";

        // Register the parent when it is not known yet. The previous test
        // (paraCatId != "" twice) let a null id through.
        if (hasParent && !HasBindClasslist.Exists(c => c.ClassId == paraCatId))
        {
            string tempparaUrl = "";
            string tempparaCatId = "";
            string tempparaName = "";
            if (i > 1)
            {
                // The parent's own parent is crumb i-2. Its id has to come from that
                // crumb's link; it was previously parsed from paraUrl, which made every
                // registered parent its own parent.
                Match pnode = crumbsList[i - 2];
                string tempparaInfo = pnode.Groups["x"].Value;
                string tempparaHref = RegGroupsX<string>(tempparaInfo, "href=\"(?<x>.*?)\"");
                tempparaCatId = RegGroupsX<string>(tempparaHref, "cat=(?<x>\\d+)") ?? "";
                tempparaUrl = tempparaCatId == "" ? "" : listUrlPrefix + tempparaCatId;
                tempparaName = RegGroupsX<string>(tempparaInfo, "title=\"(?<x>.*?)\"");
            }

            SiteClassInfo catPareInfo = new SiteClassInfo
            {
                ClassName = paraName,
                ClassId = paraCatId,
                SiteId = Baseinfo.SiteId,
                CreateDate = DateTime.Now,
                UpdateTime = DateTime.Now,
                IsHide = false,
                ParentUrl = tempparaUrl,
                ParentName = tempparaName,
                ClassCrumble = tempparaCatId,
                ParentClass = tempparaCatId,
                Urlinfo = listUrlPrefix + paraCatId,
                IsDel = false,
                HasChild = true,
                IsBind = false
            };
            HasBindClasslist.Add(catPareInfo);
            shopClasslist.Add(catPareInfo);

            // Siblings of the parent.
            GetAllBrotherCats(catPareInfo);

            // Children.
            // NOTE(review): this passes the current category's page rather than the
            // parent's own page — confirm that is intended.
            GetChildCats(catPareInfo, catPage);
        }

        Match node = crumbsList[i];
        paraInfo = node.Groups["x"].Value;
        paraUrl = RegGroupsX<string>(paraInfo, "href=\"(?<x>.*?)\"");
        paraCatId = RegGroupsX<string>(paraUrl, "cat=(?<x>\\d+)");
        paraName = RegGroupsX<string>(paraInfo, "title=\"(?<x>.*?)\"");

        catinfo.ClassName = paraName;
        catinfo.ClassId = paraCatId;
        catinfo.SiteId = Baseinfo.SiteId;
        catinfo.CreateDate = DateTime.Now;
        catinfo.UpdateTime = DateTime.Now;
        catinfo.Urlinfo = listUrlPrefix + paraCatId;
        catinfo.TotalProduct = RegGroupsX<int>(catPage, "共<span> (?<x>\\d+)</span>件相关商品");
        catinfo.IsHide = false;
        catinfo.IsBind = false;
        catinfo.IsDel = false;
        catinfo.ClassCrumble += paraCatId + ",";
        GetAllBrotherCats(catinfo);
    }

    if (string.IsNullOrEmpty(catinfo.ClassId))
    {
        return;
    }

    // The category has children when the page contains a category-attribute section.
    if (regIsMatch(catPage, "<div class=\"cateAttrs\" data-spm=\".*?\">(?<x>.*?)<div class=\"propAttrs\""))
    {
        GetChildCats(catinfo, catPage);
        catinfo.HasChild = true;
    }
    else
    {
        catinfo.HasChild = false;
    }

    catinfo.ClassCrumble = catinfo.ClassCrumble.TrimEnd(',');

    var oldCatInfo = HasBindClasslist.Find(c => c.ClassId == catinfo.ClassId);
    if (oldCatInfo == null)
    {
        catinfo.HasChild = true;
        HasBindClasslist.Add(catinfo);
        shopClasslist.Add(catinfo);
    }
    else
    {
        oldCatInfo.Urlinfo = catinfo.Urlinfo;
        oldCatInfo.ClassId = catinfo.ClassId;
        oldCatInfo.ClassName = catinfo.ClassName;
        oldCatInfo.TotalProduct = catinfo.TotalProduct;
        oldCatInfo.ParentUrl = catinfo.ParentUrl;
        oldCatInfo.ParentClass = catinfo.ParentClass;
        // ParentUrl used to be copied a second time here and ParentName never.
        oldCatInfo.ParentName = catinfo.ParentName;
        oldCatInfo.UpdateTime = DateTime.Now;
        new SiteClassBll().UpdateSiteCat(oldCatInfo);
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}
/// <summary>
/// Downloads the Tmall listing page of a category id, walks its category breadcrumb and
/// registers unknown ancestors, siblings and children as well as the category itself.
/// New categories are written to the database once more than 100 are pending.
/// </summary>
/// <param name="catId">Tmall category id; ignored when <c>ValidCatId</c> rejects it.</param>
private void SaveCat(string catId)
{
    if (!ValidCatId(catId))
    {
        return;
    }

    const string listUrlPrefix = "http://list.tmall.com/search_product.htm?cat=";

    // Random 10-59 second pause before every request.
    Thread.Sleep(new Random().Next(10, 60) * 1000);
    string catPage = HtmlAnalysis.Gethtmlcode("http://list.tmall.com/search_product.htm?cat=" + catId);
    var crumbsList = RegGroupCollection(catPage, "<li data-tag=\"cat\">(?<x>.*?)</li>");
    if (crumbsList == null)
    {
        return;
    }

    SiteClassInfo catinfo = new SiteClassInfo();
    string paraInfo = "";
    string paraUrl = "";
    string paraCatId = "";
    string paraName = "";
    for (int i = 0; i < crumbsList.Count; i++)
    {
        // At this point the para* variables still describe crumb i-1, i.e. the parent
        // of crumb i.
        catinfo.ParentName = paraName;
        catinfo.ParentClass = paraCatId;
        catinfo.ParentUrl = string.IsNullOrEmpty(paraCatId) ? "" : listUrlPrefix + paraCatId;

        // Register the parent when it is not known yet.
        if (ValidCatId(paraCatId) && !HasBindClasslist.Exists(c => c.ClassId == paraCatId))
        {
            string tempparaUrl = "";
            string tempparaCatId = "";
            string tempparaName = "";
            if (i > 1)
            {
                // The parent's own parent is crumb i-2. Its id has to come from that
                // crumb's link; it was previously parsed from paraUrl, which made every
                // registered parent its own parent.
                Match pnode = crumbsList[i - 2];
                string tempparaInfo = pnode.Groups["x"].Value;
                string tempparaHref = RegGroupsX<string>(tempparaInfo, "href=\"(?<x>.*?)\"");
                tempparaCatId = RegGroupsX<string>(tempparaHref, "cat=(?<x>\\d+)") ?? "";
                tempparaUrl = tempparaCatId == "" ? "" : listUrlPrefix + tempparaCatId;
                tempparaName = RegGroupsX<string>(tempparaInfo, "title=\"(?<x>.*?)\"");
            }

            SiteClassInfo catPareInfo = new SiteClassInfo
            {
                ClassName = paraName,
                ClassId = paraCatId,
                SiteId = Baseinfo.SiteId,
                CreateDate = DateTime.Now,
                UpdateTime = DateTime.Now,
                IsHide = false,
                ParentUrl = tempparaUrl,
                ParentName = tempparaName,
                ClassCrumble = tempparaCatId,
                IsDel = false,
                ParentClass = tempparaCatId,
                Urlinfo = listUrlPrefix + paraCatId,
                HasChild = true,
                IsBind = false
            };
            HasBindClasslist.Add(catPareInfo);
            shopClasslist.Add(catPareInfo);

            // Siblings of the parent.
            GetAllBrotherCats(catPareInfo);

            // Children; an empty page makes GetChildCats download the parent's own page.
            GetChildCats(catPareInfo, "");
        }

        Match node = crumbsList[i];
        paraInfo = node.Groups["x"].Value;
        paraUrl = RegGroupsX<string>(paraInfo, "href=\"(?<x>.*?)\"");
        paraCatId = RegGroupsX<string>(paraUrl, "cat=(?<x>\\d+)");
        paraName = RegGroupsX<string>(paraInfo, "title=\"(?<x>.*?)\"");

        catinfo.ClassName = paraName;
        catinfo.ClassId = paraCatId;
        catinfo.SiteId = Baseinfo.SiteId;
        catinfo.CreateDate = DateTime.Now;
        catinfo.UpdateTime = DateTime.Now;
        catinfo.Urlinfo = listUrlPrefix + paraCatId;
        catinfo.TotalProduct = RegGroupsX<int>(catPage, "共<span> (?<x>\\d+)</span>件相关商品");
        catinfo.IsHide = false;
        catinfo.IsBind = false;
        catinfo.ClassCrumble += paraCatId + ",";
        catinfo.IsDel = false;
        GetAllBrotherCats(catinfo);
    }

    GetChildCats(catinfo, catPage);

    // ClassCrumble is still null when the breadcrumb was empty.
    catinfo.ClassCrumble = (catinfo.ClassCrumble ?? "").TrimEnd(',');

    // Null-safe checks: with an empty breadcrumb ClassId and ClassName are null, and the
    // previous != "" tests then stored an empty category.
    bool newById = !string.IsNullOrEmpty(catinfo.ClassId)
        && !HasBindClasslist.Exists(c => c.ClassId == catinfo.ClassId);
    bool newByName = !string.IsNullOrEmpty(catinfo.ClassName)
        && !HasBindClasslist.Exists(c => c.ClassName == catinfo.ClassName);
    if (newById || newByName)
    {
        catinfo.HasChild = true;
        HasBindClasslist.Add(catinfo);
        shopClasslist.Add(catinfo);
    }

    // Flush in batches of more than 100 pending categories.
    if (shopClasslist.Count > 100)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}
/// <summary>
/// Downloads the brand list of a Tmall category and saves the category/brand links and
/// the site-level brands that are not known yet.
/// </summary>
/// <param name="siteClassInfo">Category whose brands are fetched; its ClassId is used.</param>
private void SaveBand(SiteClassInfo siteClassInfo)
{
    string brandUrl = string.Format("http://list.tmall.com/ajax/allBrandShowForGaiBan.htm?cat={0}", siteClassInfo.ClassId);
    string page = HtmlAnalysis.Gethtmlcode(brandUrl);
    if (string.IsNullOrEmpty(page))
    {
        return;
    }

    page = page.Replace("\r", "").Replace("\n", "").Trim();
    if (page == "")
    {
        return;
    }

    // Every {...} fragment of the response is treated as one brand entry.
    var list = RegGroupCollection(page, "\\{(?<x>.*?)\\}");
    if (list == null)
    {
        return;
    }

    List<SiteClassBand> catBands = new List<SiteClassBand>();
    List<SiteBandInfo> siteBand = new List<SiteBandInfo>();
    for (int i = 0; i < list.Count; i++)
    {
        string templist = list[i].ToString();
        string url = RegGroupsX<string>(templist, "\"href\":\"(?<x>.*?)\"");
        if (string.IsNullOrEmpty(url))
        {
            continue;
        }

        // Decode HTML-escaped ampersands; the previous Replace("&", "&") did nothing.
        url = url.Replace("&amp;", "&");
        if (!url.Contains("http://"))
        {
            url = "http://list.tmall.com/search_product.htm" + url;
        }

        string bandId = RegGroupsX<string>(templist, "brand=(?<x>\\d+)");
        // A fragment without a title used to throw on disName.Split.
        string disName = RegGroupsX<string>(templist, "\"title\":\"(?<x>.*?)\"") ?? "";
        string img = RegGroupsX<string>(templist, "\"img\":\"(?<x>.*?)\"");
        string cnName = "";
        string enName = "";
        string key = Baseinfo.SiteId + "_" + siteClassInfo.ClassId + "_" + bandId;
        string key1 = Baseinfo.SiteId + "_" + bandId;

        // The display name is split on '/': a part containing a CJK character becomes
        // the Chinese name, any other part the English name.
        foreach (string part in disName.Split('/'))
        {
            if (regIsMatch(part, @"[\u4e00-\u9fa5]"))
            {
                cnName = part;
            }
            else
            {
                enName = part;
            }
        }

        SiteClassBand tempBand = new SiteClassBand
        {
            ImgUrl = img,
            UniqueKey = key,
            DisplayName = disName,
            CnName = cnName,
            EnName = enName,
            CommentCount = 0,
            ProductCount = 0,
            Urlinfo = url,
            SiteBandId = bandId,
            SiteClassId = siteClassInfo.ClassId,
            SiteId = Baseinfo.SiteId,
            IsHid = false,
            UpdateDate = DateTime.Now,
            CreateDate = DateTime.Now
        };

        // Category/brand link, unique per site + category + brand.
        if (!HasBindBandlist.Exists(p => p.UniqueKey == key))
        {
            HasBindBandlist.Add(tempBand);
            catBands.Add(tempBand);
        }

        // Site-level brand, unique per site + brand.
        if (!HasSiteBandlist.Exists(p => p.UniqueKey == key1))
        {
            SiteBandInfo tempsband = new SiteBandInfo
            {
                CatArea = "",
                EnName = tempBand.EnName,
                ImgUrl = tempBand.ImgUrl,
                Introduction = "",
                IsHid = false,
                Remark = "",
                TotalComments = 0,
                TotalProduts = 0,
                UniqueKey = key1,
                SiteId = tempBand.SiteId,
                SiteBandId = tempBand.SiteBandId,
                DisplayName = tempBand.DisplayName,
                CnName = tempBand.CnName,
                CreateDate = DateTime.Now,
                UpdateDate = DateTime.Now
            };
            HasSiteBandlist.Add(tempsband);
            siteBand.Add(tempsband);
        }
    }

    try
    {
        new SiteClassBandDb().Save(catBands);
        new SiteBandDb().Save(siteBand);
    }
    catch (Exception ex)
    {
        LogServer.WriteLog(ex);
    }
}
/// <summary>
/// Collects the child categories listed in the category-attribute section of a Tmall
/// listing page and queues the ones that are not known yet.
/// </summary>
/// <param name="catinfo">Parent category the children are attached to.</param>
/// <param name="pageinfo">
/// HTML of the parent's listing page. When it is an empty string, the page at
/// <c>catinfo.Urlinfo</c> is downloaded after a random 10-29 second pause.
/// </param>
private void GetChildCats(SiteClassInfo catinfo, string pageinfo)
{
    if (pageinfo == "")
    {
        Thread.Sleep(new Random().Next(10, 30) * 1000);
        pageinfo = HtmlAnalysis.Gethtmlcode(catinfo.Urlinfo);
    }

    string catInfo = RegGroupsX<string>(pageinfo, "<div class=\"cateAttrs\" data-spm=\".*?\">(?<x>.*?)<div class=\"propAttrs\"");
    if (catInfo == null)
    {
        return;
    }

    var catList = RegGroupCollection(catInfo, "<a title=\"(?<x>.*?)\">\r\n <b>(?<y>.*?)</b><span>\\((?<z>\\d+)\\)</span>\r\n </a>");
    if (catList == null)
    {
        return;
    }

    // Build the children's crumb once. Callers pass parents whose crumb may be empty,
    // may still end in a comma, or may already end with the parent's own id; plain
    // concatenation then produced values such as ",id" or "a,b,c,,c".
    string parentCrumble = (catinfo.ClassCrumble ?? "").Trim(',');
    string childCrumble;
    if (parentCrumble == "")
    {
        childCrumble = catinfo.ClassId;
    }
    else if (parentCrumble == catinfo.ClassId
        || parentCrumble.EndsWith("," + catinfo.ClassId, StringComparison.Ordinal))
    {
        childCrumble = parentCrumble;
    }
    else
    {
        childCrumble = parentCrumble + "," + catinfo.ClassId;
    }

    foreach (Match item in catList)
    {
        string catUrl = item.Groups["x"].Value;
        string catId = RegGroupsX<string>(catUrl, "cat=(?<x>\\d+)");
        if (string.IsNullOrEmpty(catId))
        {
            continue;
        }

        int total;
        int.TryParse(item.Groups["z"].Value, out total);

        if (HasBindClasslist.Exists(c => c.ClassId == catId))
        {
            continue;
        }

        SiteClassInfo cat = new SiteClassInfo
        {
            ClassId = catId,
            ClassCrumble = childCrumble,
            ParentClass = catinfo.ClassId,
            ParentName = catinfo.ClassName,
            ClassName = item.Groups["y"].Value,
            IsHide = false,
            ParentUrl = catinfo.Urlinfo,
            UpdateTime = DateTime.Now,
            IsBind = false,
            IsDel = false,
            SiteId = Baseinfo.SiteId,
            Urlinfo = "http://list.tmall.com/search_product.htm?cat=" + catId,
            TotalProduct = total,
            CreateDate = DateTime.Now
        };
        HasBindClasslist.Add(cat);
        shopClasslist.Add(cat);
    }
}
/// <summary>
/// Fetches the categories that share a parent with the given one from Tmall's
/// getAllBrotherCats endpoint and queues those that are not known yet.
/// </summary>
/// <param name="catinfo">
/// Category whose siblings are requested; its crumb and parent fields are copied onto
/// every sibling that gets created.
/// </param>
private void GetAllBrotherCats(SiteClassInfo catinfo)
{
    string requestUrl = "http://list.tmall.com/ajax/getAllBrotherCats.htm?cat=" + catinfo.ClassId;
    string response = HtmlAnalysis.Gethtmlcode(requestUrl);

    var siblings = RegGroupCollection(response, "\"href\":\"(?<x>.*?)\",\r\n\"title\":\"(?<y>.*?)\",\r\n\"atp\"");
    if (siblings == null)
    {
        return;
    }

    foreach (Match sibling in siblings)
    {
        string siblingId = RegGroupsX<string>(sibling.Groups["x"].Value, "cat=(?<x>\\d+)");
        if (string.IsNullOrEmpty(siblingId))
        {
            continue;
        }

        bool alreadyKnown = HasBindClasslist.Exists(c => c.ClassId == siblingId);
        if (alreadyKnown)
        {
            continue;
        }

        SiteClassInfo created = new SiteClassInfo
        {
            ClassId = siblingId,
            ClassCrumble = catinfo.ClassCrumble,
            ParentClass = catinfo.ParentClass,
            ParentName = catinfo.ParentName,
            ClassName = sibling.Groups["y"].Value,
            IsHide = false,
            ParentUrl = catinfo.ParentUrl,
            UpdateTime = DateTime.Now,
            IsBind = false,
            IsDel = false,
            SiteId = Baseinfo.SiteId,
            Urlinfo = "http://list.tmall.com/search_product.htm?cat=" + siblingId,
            CreateDate = DateTime.Now
        };
        HasBindClasslist.Add(created);
        shopClasslist.Add(created);
    }
}
/// <summary>
/// Re-crawls one yhd.com category page, discovers categories linked from it
/// (via <c>GetYhdClassInfo</c>), rebuilds the parent/crumb data from the page's
/// crumb list and persists the refreshed record through <c>SiteClassBll</c>.
/// </summary>
/// <param name="siteClassInfo">
/// Category to refresh. It is mutated in place (URL, id, name, parent data,
/// totals, timestamps) and may be deleted when the page reports no products
/// or when its new id already exists.
/// </param>
private void UpdateNode(SiteClassInfo siteClassInfo)
{
    siteClassInfo.Urlinfo = $"http://list.yhd.com/{siteClassInfo.ClassId}/b/a-s1-v4-p1-price-d0-f0d-m1-rt0-pid-mid0-k";
    string pageInfo = Gethtmlcode(siteClassInfo.Urlinfo);

    // The page exposes its own category id through two script variables.
    string categoryname = RegGroupsX<string>(pageInfo, "var categoryName = '(?<x>.*?)'");
    string extid = RegGroupsX<string>(pageInfo, "var expectCategoryId = \"(?<x>\\d+)\"");
    string tempcurcatid = "c" + extid + "-" + categoryname;
    if (!regIsMatch(tempcurcatid, "^(?<x>c\\d+-\\d+(-\\d+)?)$"))
    {
        return;
    }
    if (categoryname == "0-0")
    {
        tempcurcatid = "c" + extid;
    }
    if (extid == "0")
    {
        return;
    }

    // Gather every fragment of the page that can contain category links.
    string classInfo = RegGroupsX<string>(pageInfo, "<ul class=\"listCon clearfix\">(?<x>.*?)<a class=\"c_btn c_next iconSearch\"");
    string classinfo2 = RegGroupsX<string>(pageInfo, "<ul class=\"guide_con clearfix\">(?<x>.*?)</ul>");
    if (!string.IsNullOrEmpty(classinfo2))
    {
        classInfo += classinfo2;
    }

    string tempcatid = RegGroupsX<string>(siteClassInfo.Urlinfo, "c\\d+-0-(?<x>\\d+)/");
    if (!string.IsNullOrEmpty(tempcatid))
    {
        // Sibling categories are served by a separate lazy-load endpoint with escaped quotes.
        string classurl = string.Format(
            "http://list.yhd.com/lazyLoadBrotherCategory/nc{0}-a-f0d-mid0-k/?urlFilterSuffix=/b/a-s1-v4-p1-price-d0-f0d-m1-rt0-pid-mid0-k/",
            tempcatid);
        string catpage = Gethtmlcode(classurl);
        if (!string.IsNullOrEmpty(catpage))
        {
            classInfo += catpage.Replace("\\\"", "\"");
        }
    }

    var catList = RegGroupCollection(classInfo, "href=\"(?<x>.*?)\".*?title=\"(?<y>.*?)\"");
    if (catList == null)
    {
        // No links at all: delete the category when the page says there are no products.
        if (pageInfo.Contains("没有找到符合条件的商品,建议您更改下搜索条件") || pageInfo.Contains("很抱歉!没有找到与<span class=\"color_red\">\"\"</span>相关的商品,要不你换个关键词我帮你再找找吧"))
        {
            new SiteClassBll().delClass(siteClassInfo);
        }
        return;
    }

    // Crawl every linked category that is unknown by both id and name.
    for (int i = 0; i < catList.Count; i++)
    {
        string url = catList[i].Groups["x"].Value;
        string catname = catList[i].Groups["y"].Value;
        if (string.IsNullOrEmpty(url))
        {
            continue;
        }
        string catId = RegGroupsX<string>(url, "http://list.yhd.com/(?<x>.*?)/");
        if (!HasBindClasslist.Exists(p => p.ClassId == catId) && !HasBindClasslist.Exists(p => p.ClassName == catname))
        {
            GetYhdClassInfo(url);
        }
    }

    var list = RegGroupCollection(classInfo, "<li class=\"crumb_list\">(?<x>.*?)</li>");
    if (list == null)
    {
        return;
    }

    string current = RegGroupsX<string>(pageInfo, "<title>(?<x>.*?)品种齐全|<div class=\"guide_title\"><span title=\"(?<x>.*?)\">");
    string pcatUrl = "";
    string pcatName = "";
    string pcatId = "";
    string classCrumble = "";
    int total = RegGroupsX<int>(pageInfo, "共(?<x>\\d+?)条");

    // Walk the crumb entries in order; catInfo ends up describing the last entry
    // processed, and the pcat* variables describe the entry before it.
    SiteClassInfo catInfo = new SiteClassInfo();
    for (int i = 0; i < list.Count; i++)
    {
        string div = list[i].Groups["x"].Value;
        catInfo.Urlinfo = RegGroupsX<string>(div, "href=\"(?<x>.*?)\"");
        catInfo.ClassId = RegGroupsX<string>(catInfo.Urlinfo, "http://list.yhd.com/(?<x>.*?)/");
        if (string.IsNullOrEmpty(catInfo.Urlinfo))
        {
            continue;
        }
        if (!catInfo.Urlinfo.Contains("b/a-s1-v4-p1-price-d0-f0d-m1-rt0-pid-mid0-k/"))
        {
            catInfo.Urlinfo += "b/a-s1-v4-p1-price-d0-f0d-m1-rt0-pid-mid0-k/";
        }
        catInfo.ClassName = RegGroupsX<string>(div, "\" >(?<x>.*?)</a>|title=\"(?<x>.*?)\"");
        if (catInfo.ClassName == "全部结果")
        {
            continue;
        }
        if (catInfo.Urlinfo == null || catInfo.ClassId == null || catInfo.ClassName == null)
        {
            LogServer.WriteLog(Baseinfo.SiteName + "分类抓取错误1", "AddClassError");
            continue;
        }
        if (!HasBindClasslist.Exists(p => p.ClassId == catInfo.ClassId) && !HasBindClasslist.Exists(p => p.ClassName == catInfo.ClassName))
        {
            GetYhdClassInfo(catInfo.Urlinfo);
        }

        catInfo.ParentClass = pcatId;
        catInfo.ParentName = pcatName;
        catInfo.ParentUrl = pcatUrl;
        if (i != 0 && pcatId != "")
        {
            classCrumble += pcatId + ",";
        }
        catInfo.ClassCrumble = classCrumble.TrimEnd(',');
        if (list.Count - 1 == i)
        {
            catInfo.HasChild = false;
            catInfo.TotalProduct = total;
        }
        if (catInfo.ClassName == current)
        {
            break;
        }
        pcatUrl = catInfo.Urlinfo;
        pcatName = catInfo.ClassName;
        pcatId = catInfo.ClassId;
    }

    if (!string.IsNullOrEmpty(pcatId))
    {
        // NOTE(review): this re-crawls the parent only when it is ALREADY known; every other
        // check in this method crawls unknown categories. Confirm the condition is not inverted.
        if (HasBindClasslist.Any(p => p.ClassId == pcatId))
        {
            GetYhdClassInfo(pcatUrl);
        }
    }

    string childCat = RegGroupsX<string>(pageInfo, "<div class=\"classWrap\">(?<x>.*?)</ul>");
    if (childCat != null)
    {
        var childList = RegGroupCollection(childCat, "href=\"(?<x>.*?)\".*?<span title=\"(?<y>.*?)\">");
        // The helper's result is null-checked everywhere else in this method; iterating
        // it unchecked here would throw when the wrapper holds no links.
        if (childList != null)
        {
            foreach (Match item in childList)
            {
                string tempUrl = item.Groups["x"].Value;
                string tempName = item.Groups["y"].Value;
                string tempid = RegGroupsX<string>(tempUrl, "http://list.yhd.com/(?<x>.*?)/");
                if (!HasBindClasslist.Exists(c => c.ClassId == tempid) && !HasBindClasslist.Exists(p => p.ClassName == tempName))
                {
                    GetYhdClassInfo(tempUrl);
                }
            }
        }
        siteClassInfo.HasChild = true;
    }
    else
    {
        siteClassInfo.HasChild = false;
    }

    if (siteClassInfo.ClassId != catInfo.ClassId)
    {
        // The crumb reports a different id than the stored one (the record id was logged twice before).
        LogServer.WriteLog("分类id 更改 old id:" + siteClassInfo.Id + "oldclass:" + siteClassInfo.ClassId + "newclass:" + catInfo.ClassId);
        if (HasBindClasslist.Exists(c => c.ClassId == catInfo.ClassId))
        {
            // The new id is already stored, so this record is a duplicate.
            new SiteClassBll().delClass(siteClassInfo);
            return;
        }
        siteClassInfo.Urlinfo = catInfo.Urlinfo;
        siteClassInfo.ClassId = catInfo.ClassId;
    }

    // The id announced by the page script wins when it still has the full c<n>-<n>[-<n>] form.
    if (tempcurcatid != siteClassInfo.ClassId && regIsMatch(tempcurcatid, "^(?<x>c\\d+-\\d+(-\\d+)?)$"))
    {
        siteClassInfo.ClassId = tempcurcatid;
        siteClassInfo.Urlinfo = $"http://list.yhd.com/{tempcurcatid}/b/a-s1-v4-p1-price-d0-f0d-m1-rt0-pid-mid0-k";
    }

    siteClassInfo.ClassName = catInfo.ClassName;
    siteClassInfo.TotalProduct = catInfo.TotalProduct;
    siteClassInfo.ParentUrl = catInfo.ParentUrl;
    siteClassInfo.ParentClass = catInfo.ParentClass;

    if (regIsMatch(siteClassInfo.ParentClass, "c\\d+-0"))
    {
        // Resolve the parent's id from the parent page's own script variables.
        string parentpage = Gethtmlcode(catInfo.ParentUrl);
        string pcategoryname = RegGroupsX<string>(parentpage, "var categoryName = '(?<x>.*?)'");
        string pextid = RegGroupsX<string>(parentpage, "var expectCategoryId = \"(?<x>\\d+)\"");
        string ptempcurcatid = "c" + pextid + "-" + pcategoryname;
        if (regIsMatch(ptempcurcatid, "^(?<x>c\\d+-\\d+(-\\d+)?)$"))
        {
            siteClassInfo.ParentClass = ptempcurcatid;
            siteClassInfo.ParentUrl = $"http://list.yhd.com/{ptempcurcatid}/b/a-s1-v4-p1-price-d0-f0d-m1-rt0-pid-mid0-k";
        }
    }

    // Final HasChild comes from the known category list and overrides the value set above.
    siteClassInfo.HasChild = HasBindClasslist.Exists(p => p.ParentClass == siteClassInfo.ClassId);
    siteClassInfo.ParentName = catInfo.ParentName;
    siteClassInfo.UpdateTime = DateTime.Now;
    new SiteClassBll().UpdateSiteCat(siteClassInfo);
}
/// <summary>
/// Loads the known categories for the current site, scrapes the "all categories"
/// menu from the page at <c>domain</c> and stores every category that has a
/// valid id and is not yet known.
/// </summary>
public void SaveAllSiteClass()
{
    HasBindClasslist = new SiteClassInfoDB().getAllSiteCatInfo(Baseinfo.SiteId);
    string page = HtmlAnalysis.Gethtmlcode(domain);
    string content = RegGroupsX<string>(page, "<h2>所有商品分类</h2><div class=\"submenu\">(?<x>.*?)<div class=\"nav\">");
    var catList = RegGroupCollection(content, "<a href=\"(?<x>.*?)\">(?<y>.*?)</a>");

    // Other callers null-check this helper's result, so guard here too instead of
    // throwing on catList.Count.
    if (catList != null)
    {
        for (int i = 0; i < catList.Count; i++)
        {
            string tempurl = catList[i].Groups["x"].Value;
            if (string.IsNullOrEmpty(tempurl))
            {
                continue;
            }

            // The captured href is prefixed with the site domain.
            tempurl = domain + tempurl;
            string tempName = catList[i].Groups["y"].Value;
            string catid = RegGroupsX<string>(tempurl, "/list-(?<x>\\d+)");
            if (!ValidCatId(catid))
            {
                continue;
            }

            if (!HasBindClasslist.Exists(p => p.ClassId == catid))
            {
                SiteClassInfo cat = new SiteClassInfo
                {
                    ParentUrl = "",
                    ParentClass = "",
                    ParentName = "",
                    TotalProduct = 0,
                    Urlinfo = tempurl,
                    ClassId = catid,
                    UpdateTime = DateTime.Now,
                    IsDel = false,
                    BindClassId = 0,
                    BindClassName = "",
                    HasChild = true,
                    IsBind = false,
                    IsHide = false,
                    ClassName = tempName,
                    SiteId = Baseinfo.SiteId,
                    ClassCrumble = "",
                    CreateDate = DateTime.Now
                };
                HasBindClasslist.Add(cat);
                shopClasslist.Add(cat);
            }
        }
    }

    // Persist everything queued so far in one batch.
    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}
/// <summary>
/// Refreshes one hangowa.com category: queues unknown categories found in the
/// breadcrumb and in the category filter, then stores the parent data and the
/// product total on <paramref name="siteClassInfo"/>.
/// </summary>
/// <param name="siteClassInfo">Category to refresh; it is mutated and written back to the database.</param>
private void UpdateCat(SiteClassInfo siteClassInfo)
{
    string html = HtmlAnalysis.Gethtmlcode(siteClassInfo.Urlinfo);

    string breadcrumb = RegGroupsX<string>(html, "<span class=\"pos-front\">(?<x>.*?)</div>");
    if (breadcrumb == null)
    {
        return;
    }

    var crumbLinks = RegGroupCollection(breadcrumb, "<a href=\"(?<x>.*?)\" alt=\"\" title=\"\">(?<y>.*?)</a></span>");
    if (crumbLinks == null)
    {
        return;
    }

    // After the loop these hold the values taken from the last breadcrumb link processed.
    string parentUrl = "";
    string parentName = "";
    string parentId = "";
    foreach (Match crumb in crumbLinks)
    {
        // The home-page link is not a category.
        if (crumb.ToString().Contains("首页"))
        {
            continue;
        }

        parentUrl = crumb.Groups["x"].Value;
        if (string.IsNullOrEmpty(parentUrl))
        {
            continue;
        }

        parentUrl = "http://www.hangowa.com" + parentUrl;
        parentName = crumb.Groups["y"].Value;
        parentId = RegGroupsX<string>(parentUrl, "gallery-(?<x>\\d+?).html");
        if (HasBindClasslist.Exists(known => known.ClassId == parentId))
        {
            continue;
        }

        SiteClassInfo ancestor = new SiteClassInfo
        {
            ParentClass = "",
            ParentName = "",
            ClassName = parentName,
            ClassId = parentId,
            ParentUrl = "",
            IsDel = false,
            IsBind = false,
            IsHide = false,
            BindClassId = 0,
            BindClassName = "",
            HasChild = true,
            ClassCrumble = "",
            TotalProduct = 0,
            SiteId = Baseinfo.SiteId,
            Urlinfo = parentUrl,
            UpdateTime = DateTime.Now,
            CreateDate = DateTime.Now
        };
        HasBindClasslist.Add(ancestor);
        shopClasslist.Add(ancestor);
    }

    // Categories offered by the filter block are queued as children of this category.
    string filterBlock = RegGroupsX<string>(html, "<dt class=\"filter-entries-label\">分类:</dt>\n <dd class=\"filter-entries-values\">(?<x>.*?)</dd>");
    var filterLinks = filterBlock == null
        ? null
        : RegGroupCollection(filterBlock, "<a href=\"(?<x>.*?)\" class=\"handle action-cat-filter\">(?<y>.*?)</a>");
    if (filterLinks != null)
    {
        foreach (Match link in filterLinks)
        {
            string childUrl = link.Groups["x"].Value;
            if (string.IsNullOrEmpty(childUrl))
            {
                continue;
            }

            childUrl = "http://www.hangowa.com" + childUrl.TrimEnd('?');
            string childName = link.Groups["y"].Value;
            string childId = RegGroupsX<string>(childUrl, "gallery-(?<x>\\d+?).html");
            if (HasBindClasslist.Exists(known => known.ClassId == childId))
            {
                continue;
            }

            SiteClassInfo child = new SiteClassInfo
            {
                ParentClass = siteClassInfo.ClassId,
                ParentName = siteClassInfo.ClassName,
                ClassName = childName,
                ClassId = childId,
                ParentUrl = siteClassInfo.Urlinfo,
                IsDel = false,
                IsBind = false,
                IsHide = false,
                BindClassId = 0,
                BindClassName = "",
                HasChild = true,
                ClassCrumble = "",
                TotalProduct = 0,
                SiteId = Baseinfo.SiteId,
                Urlinfo = childUrl,
                UpdateTime = DateTime.Now,
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(child);
            shopClasslist.Add(child);
        }
    }

    // Persist the newly discovered categories before updating the current one.
    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }

    siteClassInfo.ParentClass = parentId;
    siteClassInfo.ParentName = parentName;
    siteClassInfo.ParentUrl = parentUrl;
    siteClassInfo.UpdateTime = DateTime.Now;
    siteClassInfo.TotalProduct = RegGroupsX<int>(html, "共<b class=\"op-search-result\">(?<x>\\d+)</b>件商品");
    new mmbSiteClassInfoDB().UpdateSiteClass(siteClassInfo);
}
/// <summary>
/// Refreshes one jxdyf.com category from its page: derives the parent from the
/// breadcrumb, updates the product total and HasChild flag, writes the record
/// back, and finally queues unknown categories found in the side menu.
/// </summary>
/// <param name="siteClassInfo">
/// Category to refresh. It is flagged as deleted (and saved) when the page has
/// no breadcrumb block.
/// </param>
private void UpdateCat(SiteClassInfo siteClassInfo)
{
    string pageinfo = HtmlAnalysis.Gethtmlcode(siteClassInfo.Urlinfo);
    string crumb = RegGroupsX<string>(pageinfo, "<div class=\"curr_position\">(?<x>.*?)</div>");
    if (crumb == null)
    {
        // Without a breadcrumb the page is not treated as a category page.
        siteClassInfo.IsDel = true;
        new mmbSiteClassInfoDB().UpdateSiteClass(siteClassInfo);
        LogServer.WriteLog(Baseinfo.SiteName + "分类抓取错误\turl:" + siteClassInfo.Urlinfo, "AddClassError");
        return;
    }

    var list = RegGroupCollection(crumb, "<a href=\"(?<x>.*?)\">(?<y>.*?)</a>");
    if (list == null)
    {
        return;
    }

    // Walk the breadcrumb until the current category is reached; the last
    // entry seen before it becomes the parent.
    string parentid = "";
    string parentName = "";
    string parentUrl = "";
    for (int i = 0; i < list.Count; i++)
    {
        string tempName = list[i].Groups["y"].Value;
        if (tempName.Contains("首页"))
        {
            continue;
        }

        string tempUrl = list[i].Groups["x"].Value;
        if (!tempUrl.Contains(domain))
        {
            tempUrl = domain + tempUrl;
        }

        string tempid = RegGroupsX<string>(tempUrl, "(?<x>\\d+)");
        if (string.IsNullOrEmpty(tempid))
        {
            continue;
        }
        if (tempid == siteClassInfo.ClassId)
        {
            break;
        }

        parentid = tempid;
        parentName = tempName;
        parentUrl = string.Format("http://www.jxdyf.com/category/{0}.html", tempid);
    }

    if (ValidCatId(parentid))
    {
        siteClassInfo.ParentName = parentName;
        siteClassInfo.ParentUrl = parentUrl;
        siteClassInfo.ParentClass = parentid;
    }

    siteClassInfo.TotalProduct = RegGroupsX<int>(pageinfo, "共有(?<x>\\d+)个商品");
    siteClassInfo.UpdateTime = DateTime.Now;
    siteClassInfo.HasChild = HasBindClasslist.Exists(c => c.ParentClass == siteClassInfo.ClassId);
    new mmbSiteClassInfoDB().UpdateSiteClass(siteClassInfo);

    string areaCat = RegGroupsX<string>(pageinfo, "<div class=\"menu outline_01\">(?<x>.*?)<div class=\"web_surfer outline_01\">");
    if (areaCat == null)
    {
        return;
    }

    // NOTE(review): the pattern ends in a lazy group, which normally captures an
    // empty string, so ClassName below is probably always "". Confirm against the
    // real markup before anchoring the pattern (for example on "</a>").
    var catList = RegGroupCollection(areaCat, "<a href=\"/category/(?<x>\\d+).html\" >(?<y>.*?)");

    // The match list is null-checked for the breadcrumb above; do the same here
    // instead of throwing on catList.Count.
    if (catList != null)
    {
        for (int i = 0; i < catList.Count; i++)
        {
            string tempid = catList[i].Groups["x"].Value;
            if (!ValidCatId(tempid) || HasBindClasslist.Exists(c => c.ClassId == tempid))
            {
                continue;
            }

            string tempName = catList[i].Groups["y"].Value;
            SiteClassInfo cat = new SiteClassInfo
            {
                ParentUrl = "",
                ParentClass = "",
                ParentName = "",
                TotalProduct = 0,
                Urlinfo = string.Format("http://www.jxdyf.com/category/{0}.html", tempid),
                ClassId = tempid,
                UpdateTime = DateTime.Now,
                IsDel = false,
                BindClassId = 0,
                BindClassName = "",
                HasChild = true,
                IsBind = false,
                IsHide = false,
                ClassName = tempName,
                SiteId = Baseinfo.SiteId,
                ClassCrumble = "",
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(cat);
            shopClasslist.Add(cat);
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}
/// <summary>
/// Refreshes one ehaier.com category: queues unknown categories found in the
/// breadcrumb and in the sub-navigation, then stores parent data, HasChild and
/// the product total on <paramref name="siteClassInfo"/>.
/// </summary>
/// <param name="siteClassInfo">Category to refresh; it is mutated and written back to the database.</param>
private void UpdateCat(SiteClassInfo siteClassInfo)
{
    // Category ids appear in list URLs with one, two or three dash-separated numbers.
    const string catIdPattern = "http://www.ehaier.com/l/(?<x>\\d+).html|http://www.ehaier.com/l/(?<x>\\d+-\\d+).html|http://www.ehaier.com/l/(?<x>\\d+-\\d+-\\d+).html";

    string page = HtmlAnalysis.Gethtmlcode(siteClassInfo.Urlinfo);
    string cromb = RegGroupsX<string>(page, "您现在的位置: <a href=\"http://www.ehaier.com\">海尔商城</a>(?<x>.*?)</div>");
    if (cromb == null)
    {
        return;
    }

    var plist = RegGroupCollection(cromb, "<a href=\"(?<x>.*?)\">(?<y>.*?)</a>");
    if (plist == null)
    {
        return;
    }

    // After the loop these hold the values from the last breadcrumb link processed.
    string parentUrl = "";
    string parentName = "";
    string parentId = "";
    foreach (Match item in plist)
    {
        parentUrl = item.Groups["x"].Value;
        parentName = item.Groups["y"].Value;
        parentId = RegGroupsX<string>(parentUrl, catIdPattern);
        if (!ValidCatId(parentId))
        {
            continue;
        }

        if (!HasBindClasslist.Exists(c => c.ClassId == parentId))
        {
            SiteClassInfo iteminfo = new SiteClassInfo
            {
                ParentClass = "",
                ParentName = "",
                ClassName = parentName,
                ClassId = parentId,
                ParentUrl = "",
                IsDel = false,
                IsBind = false,
                IsHide = false,
                BindClassId = 0,
                BindClassName = "",
                HasChild = true,
                ClassCrumble = "",
                TotalProduct = 0,
                SiteId = Baseinfo.SiteId,
                Urlinfo = parentUrl,
                UpdateTime = DateTime.Now,
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(iteminfo);
            shopClasslist.Add(iteminfo);
        }
    }

    string brotherCat = RegGroupsX<string>(page, "<div class=\"list-subnav\">(?<x>.*?)<dl class=\"dl-subnav dl-subnav-now\">");
    if (brotherCat != null)
    {
        var blist = RegGroupCollection(brotherCat, "href=\"(?<x>.*?)\">(?<y>.*?)</a>");
        if (blist != null)
        {
            foreach (Match item in blist)
            {
                string burl = item.Groups["x"].Value;
                string bName = item.Groups["y"].Value;
                string bId = RegGroupsX<string>(burl, catIdPattern);

                // Same validation as the breadcrumb loop: links that are not category
                // list pages must not be stored as categories with an unusable id.
                if (!ValidCatId(bId))
                {
                    continue;
                }

                if (!HasBindClasslist.Exists(c => c.ClassId == bId))
                {
                    SiteClassInfo iteminfo = new SiteClassInfo
                    {
                        ParentClass = "",
                        ParentName = "",
                        ClassName = bName,
                        ClassId = bId,
                        ParentUrl = "",
                        IsDel = false,
                        IsBind = false,
                        IsHide = false,
                        BindClassId = 0,
                        BindClassName = "",
                        HasChild = true,
                        ClassCrumble = "",
                        TotalProduct = 0,
                        SiteId = Baseinfo.SiteId,
                        Urlinfo = burl,
                        UpdateTime = DateTime.Now,
                        CreateDate = DateTime.Now
                    };
                    HasBindClasslist.Add(iteminfo);
                    shopClasslist.Add(iteminfo);
                }
            }
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }

    siteClassInfo.HasChild = HasBindClasslist.Exists(c => c.ParentClass == siteClassInfo.ClassId);
    siteClassInfo.ParentClass = parentId;
    siteClassInfo.ParentName = parentName;
    siteClassInfo.ParentUrl = parentUrl;
    siteClassInfo.UpdateTime = DateTime.Now;
    siteClassInfo.TotalProduct = RegGroupsX<int>(page, "共<strong class=\"haierred\">(?<x>\\d+)</strong> 件");
    new mmbSiteClassInfoDB().UpdateSiteClass(siteClassInfo);
}
/// <summary>
/// Loads the known categories for the current site, scrapes the jxdyf.com
/// category index page and stores every category that has a valid id and is
/// not yet known.
/// </summary>
public void SaveAllSiteClass()
{
    string url = "http://www.jxdyf.com/category";
    // Local batch of categories discovered by this call.
    List<SiteClassInfo> shopClasslist = new List<SiteClassInfo>();
    HasBindClasslist = new SiteClassInfoDB().getAllSiteCatInfo(Baseinfo.SiteId);

    string popHtml = HtmlAnalysis.Gethtmlcode(url);
    string content = RegGroupsX<string>(popHtml, "<div class=\"fl\">(?<x>.*?)<div id=\"footer\"");
    var catlist = RegGroupCollection(content, "<a href='(?<y>.*?)'( class='.*?')?>(?<x>.*?)</a>");

    // Other callers null-check this helper's result, so guard here too instead of
    // throwing on catlist.Count.
    if (catlist != null)
    {
        for (int i = 0; i < catlist.Count; i++)
        {
            string tempurl = catlist[i].Groups["y"].Value;
            // The first run of digits in the link is used as the category id.
            string catid = RegGroupsX<string>(tempurl, "(?<x>\\d+)");
            if (!ValidCatId(catid) || HasBindClasslist.Exists(c => c.ClassId == catid))
            {
                continue;
            }

            string catName = catlist[i].Groups["x"].Value;
            SiteClassInfo cat = new SiteClassInfo
            {
                ParentUrl = "",
                ParentClass = "",
                ParentName = "",
                TotalProduct = 0,
                Urlinfo = domain + tempurl,
                ClassId = catid,
                UpdateTime = DateTime.Now,
                IsDel = false,
                BindClassId = 0,
                BindClassName = "",
                HasChild = true,
                IsBind = false,
                IsHide = false,
                ClassName = catName,
                SiteId = Baseinfo.SiteId,
                ClassCrumble = "",
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(cat);
            shopClasslist.Add(cat);
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}
/// <summary>
/// Writes only the binding columns (IsBind, BindClassId, BindClassName) of the
/// row whose Id equals <c>cat.Id</c>.
/// </summary>
/// <param name="cat">Category carrying the new binding values and the Id of the row to update.</param>
/// <returns><c>true</c> when the update reports at least one affected row.</returns>
/// <exception cref="ArgumentNullException"><paramref name="cat"/> is null.</exception>
public bool BingCatInfo(SiteClassInfo cat)
{
    // Validate like the sibling data-access methods do, before opening a connection.
    if (cat == null)
    {
        throw new ArgumentNullException(nameof(cat));
    }

    using (var db = _dbFactory.OpenDbConnection())
    {
        int res = db.UpdateOnly(cat, u => new { u.IsBind, u.BindClassId, u.BindClassName }, u => u.Id == cat.Id);
        return res > 0;
    }
}
/// <summary>
/// Refreshes one vmall.com category: queues unknown categories found in the
/// breadcrumb and in the category filter, then stores parent data and the
/// HasChild flag on <paramref name="siteClassInfo"/>.
/// </summary>
/// <param name="siteClassInfo">Category to refresh; it is mutated and written back to the database.</param>
private void UpdateCat(SiteClassInfo siteClassInfo)
{
    string html = HtmlAnalysis.Gethtmlcode(siteClassInfo.Urlinfo);

    string breadcrumb = RegGroupsX<string>(html, "<div class=\"breadcrumb-area fcn\">(?<x>.*?)</div>");
    if (breadcrumb == null)
    {
        return;
    }

    var crumbLinks = RegGroupCollection(breadcrumb, "<a href=\"(?<x>.*?)\" title=\"(?<y>.*?)\">");
    if (crumbLinks == null)
    {
        return;
    }

    // URL and name keep the values of the last non-home link; the id is reset to ""
    // whenever that link does not carry a valid category id.
    string parentUrl = "";
    string parentName = "";
    string parentId = "";
    foreach (Match crumb in crumbLinks)
    {
        if (crumb.ToString().Contains("首页"))
        {
            continue;
        }

        parentUrl = crumb.Groups["x"].Value;
        parentName = crumb.Groups["y"].Value;
        parentId = RegGroupsX<string>(parentUrl, "list-(?<x>\\d+)");
        if (!ValidCatId(parentId))
        {
            parentId = "";
            continue;
        }
        if (HasBindClasslist.Exists(known => known.ClassId == parentId))
        {
            continue;
        }

        SiteClassInfo ancestor = new SiteClassInfo
        {
            ParentClass = "",
            ParentName = "",
            ClassName = parentName,
            ClassId = parentId,
            ParentUrl = "",
            IsDel = false,
            IsBind = false,
            IsHide = false,
            BindClassId = 0,
            BindClassName = "",
            HasChild = true,
            ClassCrumble = "",
            TotalProduct = 0,
            SiteId = Baseinfo.SiteId,
            Urlinfo = "http://www.vmall.com/" + parentUrl,
            UpdateTime = DateTime.Now,
            CreateDate = DateTime.Now
        };
        HasBindClasslist.Add(ancestor);
        shopClasslist.Add(ancestor);
    }

    string filterBlock = RegGroupsX<string>(html, "<div class=\"p-title\">分类:</div>(?<x>.*?)<div class=\"pro-cate-sort clearfix\">");
    var filterLinks = filterBlock == null
        ? null
        : RegGroupCollection(filterBlock, " <li ><a href=\"(?<x>.*?)\">(?<y>.*?)</a></li>");
    if (filterLinks != null)
    {
        foreach (Match link in filterLinks)
        {
            string linkUrl = link.Groups["x"].Value;
            string linkName = link.Groups["y"].Value;
            string linkId = RegGroupsX<string>(linkUrl, "list-(?<x>\\d+)$");
            if (!ValidCatId(linkId) || HasBindClasslist.Exists(known => known.ClassId == linkId))
            {
                continue;
            }

            SiteClassInfo listed = new SiteClassInfo
            {
                ParentClass = "",
                ParentName = "",
                ClassName = linkName,
                ClassId = linkId,
                ParentUrl = "",
                IsDel = false,
                IsBind = false,
                IsHide = false,
                BindClassId = 0,
                BindClassName = "",
                HasChild = true,
                ClassCrumble = "",
                TotalProduct = 0,
                SiteId = Baseinfo.SiteId,
                Urlinfo = "http://www.vmall.com/" + linkUrl,
                UpdateTime = DateTime.Now,
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(listed);
            shopClasslist.Add(listed);
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }

    // HasChild is true only when the page shows no category filter block.
    siteClassInfo.HasChild = brotherCat_IsMissing(filterBlock);
    siteClassInfo.ParentClass = parentId;
    siteClassInfo.ParentName = parentName;
    if (!string.IsNullOrEmpty(parentUrl))
    {
        siteClassInfo.ParentUrl = "http://www.vmall.com/" + parentUrl;
    }
    siteClassInfo.UpdateTime = DateTime.Now;
    new mmbSiteClassInfoDB().UpdateSiteClass(siteClassInfo);

    // Local predicate kept next to its single use.
    bool brotherCat_IsMissing(string block)
    {
        return block == null;
    }
}
/// <summary>
/// Updates the stored row for a category. The connection factory is recreated
/// from <c>MmbpriceDbConnectionString</c> on every call. Database errors are
/// logged under "DBError" and not rethrown.
/// </summary>
/// <param name="catinfo">Category record to write.</param>
/// <exception cref="ArgumentNullException"><paramref name="catinfo"/> is null.</exception>
public void UpdateSiteClass(SiteClassInfo catinfo)
{
    _dbFactory = new OrmLiteConnectionFactory(MmbpriceDbConnectionString, SqlServerDialect.Provider);

    if (catinfo == null)
    {
        throw new ArgumentNullException(nameof(catinfo));
    }

    using (var connection = _dbFactory.OpenDbConnection())
    {
        try
        {
            connection.Update(catinfo);
        }
        catch (Exception failure)
        {
            // Best effort: a failed update is recorded and the caller continues.
            LogServer.WriteLog(failure, "DBError");
        }
    }
}
/// <summary>
/// Extracts the category list from a vmall.com directory page and stores every
/// category that is not yet present in <c>HasBindClasslist</c>.
/// </summary>
/// <param name="directoryHtml">HTML that contains the <c>category-list</c> block.</param>
private void GetCatInfo(string directoryHtml)
{
    string catArea = RegGroupsX<string>(directoryHtml, "<ol class=\"category-list\">(?<x>.*?)<a href=\"http://app.vmall.com\" target=\"_blank\"><span>应用市场");
    if (catArea == null)
    {
        return;
    }

    // Strip line breaks and tabs so the link pattern can match across the original layout.
    catArea = catArea.Replace("\r", "").Replace("\n", "").Replace("\t", "");
    var list = RegGroupCollection(catArea, "<a href=\"(?<x>.*?)\" (target=\"_blank\")?><span>(?<y>.*?)</span>");

    // The helper's result is null-checked by other callers; iterating it unchecked
    // would throw when the block holds no links.
    if (list != null)
    {
        foreach (Match item in list)
        {
            string tempUrl = item.Groups["x"].Value;
            string tempName = item.Groups["y"].Value;
            string tempid = RegGroupsX<string>(tempUrl, "list-(?<x>\\d+)$");

            // Links that do not end in list-<digits> carry no category id; storing
            // them would create a record without a ClassId.
            if (string.IsNullOrEmpty(tempid))
            {
                continue;
            }

            if (!HasBindClasslist.Exists(c => c.ClassId == tempid))
            {
                SiteClassInfo iteminfo = new SiteClassInfo
                {
                    ParentClass = "",
                    ParentName = "",
                    ClassName = tempName,
                    ClassId = tempid,
                    ParentUrl = "",
                    IsDel = false,
                    IsBind = false,
                    IsHide = false,
                    BindClassId = 0,
                    BindClassName = "",
                    HasChild = false,
                    ClassCrumble = "",
                    TotalProduct = 0,
                    SiteId = Baseinfo.SiteId,
                    Urlinfo = string.Format("http://www.vmall.com{0}", tempUrl),
                    UpdateTime = DateTime.Now,
                    CreateDate = DateTime.Now
                };
                HasBindClasslist.Add(iteminfo);
                shopClasslist.Add(iteminfo);
            }
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}
/// <summary>
/// Inserts a category unless a row with the same SiteId and ClassId already
/// exists. Database errors are logged under "DBError" and rethrown.
/// </summary>
/// <param name="siteclass">Category record to insert.</param>
/// <exception cref="ArgumentNullException"><paramref name="siteclass"/> is null.</exception>
public void AddSiteClass(SiteClassInfo siteclass)
{
    if (siteclass == null)
    {
        throw new ArgumentNullException(nameof(siteclass));
    }

    using (var connection = _dbFactory.OpenDbConnection())
    {
        try
        {
            bool alreadyStored = connection.Exists<SiteClassInfo>(p => p.SiteId == siteclass.SiteId && p.ClassId == siteclass.ClassId);
            if (!alreadyStored)
            {
                connection.Insert(siteclass);
            }
        }
        catch (Exception failure)
        {
            LogServer.WriteLog(failure, "DBError");
            throw;
        }
    }
}
/// <summary>
/// Crawls one yixun.com search/category URL, queues the category it describes
/// when it is not yet known, and then recurses into unknown parent categories
/// (from the breadcrumb) and unknown categories linked from the category box.
/// </summary>
/// <param name="url">
/// Category URL of the form <c>http://searchex.yixun.com/&lt;id&gt;-...</c>; an id
/// containing 't' is split into parent part and category part.
/// </param>
private void AddNode(string url)
{
    string classid = RegGroupsX<string>(url, "^http://searchex.yixun.com/(?<x>.*?)\\-");
    if (string.IsNullOrEmpty(classid))
    {
        return;
    }

    // Stop early when the category (the part after the last 't', if any) is already known.
    if (classid.Contains("t"))
    {
        var tempids = classid.Split('t');
        string catid = tempids[tempids.Length - 1];
        if (HasBindClasslist.Exists(p => p.ClassId == catid))
        {
            return;
        }
    }
    else
    {
        if (HasBindClasslist.Exists(p => p.ClassId == classid))
        {
            return;
        }
    }

    string pageinfo = HtmlAnalysis.Gethtmlcode(url);
    // Only pages with a goods block are processed; a failed download (null or empty) is skipped too.
    if (string.IsNullOrEmpty(pageinfo) || !pageinfo.Contains("<div class=\"goods\""))
    {
        return;
    }

    string cromp = RegGroupsX<string>(pageinfo, "<div id=\"crumbBox\" class=\"crumb \">(?<x>.*?)<div class=\"crumb_search \">");
    if (string.IsNullOrEmpty(cromp))
    {
        return;
    }

    var caplist = RegGroupCollection(cromp, "<a class=\"crumb_lk\" href=\"(?<x>.*?)\" rg=\"[0-9_]+\" ytag=\"\\d+\">(?<y>.*?)</a>");
    if (caplist == null || caplist.Count < 1)
    {
        return;
    }

    // The last breadcrumb entry names the category itself.
    string proName = caplist[caplist.Count - 1].Groups["y"].Value;
    string parentUrl = "";
    string parentName = "";
    string parentid = "";
    string classCrumble = "";
    List<string> lessCat = new List<string>();

    // Entries between the first and the last are ancestors; unknown ones are queued for crawling.
    for (int i = 1; i < caplist.Count - 1; i++)
    {
        parentUrl = caplist[i].Groups["x"].Value;
        parentName = caplist[i].Groups["y"].Value;
        parentid = RegGroupsX<string>(parentUrl, "path=(?<x>[A-Za-z0-9]+)|^http://searchex.yixun.com/(?<x>.*?)\\-");
        if (!string.IsNullOrEmpty(parentid))
        {
            if (parentid.Contains(','))
            {
                parentid = parentid.Substring(parentid.LastIndexOf(',') + 1);
            }
            classCrumble += parentid + ",";
            if (!HasBindClasslist.Exists(p => p.ClassId == parentid))
            {
                lessCat.Add(string.Format("http://searchex.yixun.com/{0}-1-/", parentid));
            }
        }
    }

    if (classid.Contains("t"))
    {
        // NOTE(review): the early check above uses the LAST segment while this uses
        // segment [1]; they differ only for ids with more than one 't'. Confirm.
        var tempids = classid.Split('t');
        parentid = tempids[0];
        classid = tempids[1];
        if (classCrumble == "")
        {
            classCrumble = parentid;
        }
    }
    classCrumble = classCrumble.TrimEnd(',');

    if (HasBindClasslist.Exists(p => p.ClassId == classid))
    {
        return;
    }

    int total = RegGroupsX<int>(pageinfo, "共<b>(?<x>\\d+)</b>件商品");
    SiteClassInfo cat = new SiteClassInfo
    {
        ParentUrl = parentUrl,
        ParentClass = parentid,
        ParentName = parentName,
        TotalProduct = total,
        Urlinfo = url,
        ClassId = classid,
        UpdateTime = DateTime.Now,
        IsDel = false,
        BindClassId = 0,
        BindClassName = "",
        HasChild = false,
        IsBind = false,
        IsHide = false,
        ClassName = proName,
        SiteId = Baseinfo.SiteId,
        ClassCrumble = classCrumble,
        CreateDate = DateTime.Now
    };
    HasBindClasslist.Add(cat);
    shopClasslist.Add(cat);

    string catList = RegGroupsX<string>(pageinfo, "<div class=\"cate_bd\">(?<x>.*?)<div id=\"viewedGoods\"");
    var temCats = RegGroupCollection(catList, "href=\"(?<x>.*?)\"");

    // A page without a category box must not discard the ancestors queued above,
    // so this no longer returns early.
    if (temCats != null)
    {
        for (int i = 0; i < temCats.Count; i++)
        {
            string tempCatUrl = temCats[i].Groups["x"].Value;
            string tempcatid = RegGroupsX<string>(tempCatUrl, "path=(?<x>[A-Za-z0-9]+)|^http://searchex.yixun.com/(?<x>.*?)\\-");
            if (string.IsNullOrEmpty(tempcatid))
            {
                continue;
            }

            // Test the linked category itself; the previous code tested parentid,
            // so linked categories were skipped whenever the parent was already known.
            if (!HasBindClasslist.Exists(p => p.ClassId == tempcatid))
            {
                lessCat.Add(string.Format("http://searchex.yixun.com/{0}-1-/", tempcatid));
            }
        }
    }

    for (int i = 0; i < lessCat.Count; i++)
    {
        AddNode(lessCat[i]);
    }
}
/// <summary>
/// Extracts the "all categories" block from a directory page and stores every
/// link whose <c>category/&lt;digits&gt;-</c> id is valid and not yet present in
/// <c>HasBindClasslist</c>.
/// </summary>
/// <param name="directoryHtml">HTML that contains the category navigation block.</param>
private void GetCatInfo(string directoryHtml)
{
    string catArea = RegGroupsX<string>(directoryHtml, "全部商品分类</h2>(?<x>.*?)<div class=\"nav-right\">");
    if (catArea == null)
    {
        return;
    }

    var list = RegGroupCollection(catArea, "href=('|\")(?<x>.*?)('|\")\\s*>(?<y>.*?)</a>");

    // The helper's result is null-checked by other callers; iterating it unchecked
    // would throw when the block holds no links.
    if (list != null)
    {
        foreach (Match item in list)
        {
            string tempUrl = item.Groups["x"].Value;
            string tempName = item.Groups["y"].Value;
            if (!string.IsNullOrEmpty(tempName))
            {
                tempName = tempName.Trim();
            }

            string tempid = RegGroupsX<string>(tempUrl, "category/(?<x>\\d+)-");
            if (ValidCatId(tempid) && !HasBindClasslist.Exists(c => c.ClassId == tempid))
            {
                SiteClassInfo iteminfo = new SiteClassInfo
                {
                    ParentClass = "",
                    ParentName = "",
                    ClassName = tempName,
                    ClassId = tempid,
                    ParentUrl = "",
                    IsDel = false,
                    IsBind = false,
                    IsHide = false,
                    BindClassId = 0,
                    BindClassName = "",
                    HasChild = false,
                    ClassCrumble = "",
                    TotalProduct = 0,
                    SiteId = Baseinfo.SiteId,
                    Urlinfo = tempUrl,
                    UpdateTime = DateTime.Now,
                    CreateDate = DateTime.Now
                };
                HasBindClasslist.Add(iteminfo);
                shopClasslist.Add(iteminfo);
            }
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}
/// <summary>
/// Refreshes one yixun.com category from its page: updates the product total,
/// re-derives id, name, URL, parent and crumb from the breadcrumb, queues
/// unknown categories from the category list, and writes the record back.
/// </summary>
/// <param name="siteClassInfo">
/// Category to refresh; it is mutated in place. When the page reports zero
/// products it is flagged as deleted and passed to <c>SiteClassBll.delClass</c>.
/// </param>
private void UpdateCat(SiteClassInfo siteClassInfo)
{
    string pageinfo = HtmlAnalysis.Gethtmlcode(siteClassInfo.Urlinfo);
    string crumble = RegGroupsX<string>(pageinfo, "<div class=\"crumb_wrap\">(?<x>.*?)<div class=\"crumb_search \">");
    if (crumble == null)
    {
        LogServer.WriteLog(Baseinfo.SiteName + "分类抓取错误1\turl:" + siteClassInfo.Urlinfo, "AddClassError");
        return;
    }

    var crumblelist = RegGroupCollection(crumble, "<a class=\"crumb_lk\" href=\"(?<x>.*?)\" rg=\"\\d+_?\\d+\" ytag=\"\\d+\">(?<y>.*?)</a>");
    siteClassInfo.TotalProduct = RegGroupsX<int>(pageinfo, "<div class=\"sort_page_txt\">共<b>(?<x>\\d+)</b>件商品</div>");
    if (siteClassInfo.TotalProduct == 0)
    {
        // NOTE(review): processing continues after the delete and the record is
        // updated again at the end of the method; confirm this is intended.
        siteClassInfo.IsDel = true;
        new SiteClassBll().delClass(siteClassInfo);
    }

    if (crumblelist == null || crumblelist.Count == 0)
    {
        LogServer.WriteLog(Baseinfo.SiteName + "分类抓取错误2\turl:" + siteClassInfo.Urlinfo, "AddClassError");
        return;
    }

    string pcatUrl = "";
    string pcatName = "";
    string pcatId = "";
    string classCrumble = "";
    foreach (Match item in crumblelist)
    {
        if (item.ToString().Contains("首页"))
        {
            continue;
        }

        if (item.ToString().Contains(siteClassInfo.ClassName))
        {
            // This breadcrumb entry is the category itself: refresh id, name and URL from it.
            string itemurl = item.Groups["x"].Value;
            string tempid = RegGroupsX<string>(itemurl, "http://searchex.yixun.com/(?<x>.*?)-1-/");
            if (!ValidCatId(tempid))
            {
                continue;
            }

            if (tempid.Contains('t'))
            {
                // Ids of the form <parent>t<child>: the last segment is the category id.
                var tempids = tempid.Split('t');
                string catid = tempids[tempids.Length - 1];
                siteClassInfo.ClassId = catid;
                if (tempids.Length > 1)
                {
                    siteClassInfo.ParentClass = tempids[tempids.Length - 2];
                }
            }
            else
            {
                siteClassInfo.ClassId = tempid;
            }
            siteClassInfo.ClassName = item.Groups["y"].Value;
            siteClassInfo.Urlinfo = itemurl;
        }
        else
        {
            // Any other entry is an ancestor and contributes to the crumb.
            pcatUrl = item.Groups["x"].Value;
            pcatName = item.Groups["y"].Value;
            pcatId = RegGroupsX<string>(pcatUrl, "http://searchex.yixun.com/(?<x>.*?)-1-/");
            if (!string.IsNullOrEmpty(pcatId))
            {
                classCrumble += pcatId + ",";
            }
        }
    }

    if (siteClassInfo.ClassId.Contains('t'))
    {
        siteClassInfo.ParentClass = siteClassInfo.ClassId.Substring(0, siteClassInfo.ClassId.IndexOf('t'));
    }

    string catArea = RegGroupsX<string>(pageinfo, "<div class=\"cate cate_2\" id=\"cateList\">(?<x>.*?)<div id=\"zdmArticle\" class=\"article_relative hide\">");
    var tempcatlist = RegGroupCollection(catArea, "<a class=\"cate_lk2 \" href=\"(?<x>.*?)\" title=(?<y>.*?) navvalue");

    // The category list may be missing from the page; the match list is
    // null-checked for the breadcrumb above, so check it here as well.
    if (tempcatlist != null)
    {
        foreach (Match item in tempcatlist)
        {
            string tempurl = item.Groups["x"].Value;
            string tempId = RegGroupsX<string>(tempurl, "http://searchex.yixun.com/(?<x>.*?)-1-/");

            // Links that do not match the category URL form have no id; calling
            // Contains on a missing id would throw.
            if (string.IsNullOrEmpty(tempId))
            {
                continue;
            }
            if (tempId.Contains("t"))
            {
                tempId = tempId.Substring(tempId.IndexOf('t') + 1);
            }

            string tempName = item.Groups["y"].Value;
            if (!HasBindClasslist.Exists(c => c.ClassId == tempId))
            {
                SiteClassInfo iteminfo = new SiteClassInfo
                {
                    ParentClass = "",
                    ParentName = "",
                    ClassName = tempName,
                    ClassId = tempId,
                    ParentUrl = "",
                    IsDel = false,
                    IsBind = false,
                    IsHide = false,
                    BindClassId = 0,
                    BindClassName = "",
                    HasChild = true,
                    ClassCrumble = "",
                    TotalProduct = 0,
                    SiteId = Baseinfo.SiteId,
                    Urlinfo = tempurl,
                    UpdateTime = DateTime.Now,
                    CreateDate = DateTime.Now
                };
                HasBindClasslist.Add(iteminfo);
                shopClasslist.Add(iteminfo);
            }
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }

    // Only the parent name is taken from the breadcrumb; parent id and URL are left as set above.
    if (pcatName != "")
    {
        siteClassInfo.ParentName = pcatName;
    }
    siteClassInfo.ClassCrumble = classCrumble;
    siteClassInfo.UpdateTime = DateTime.Now;
    new mmbSiteClassInfoDB().UpdateSiteClass(siteClassInfo);
}
/// <summary>
/// Re-reads a category listing page and updates the given record with its
/// parent (taken from the breadcrumb), its product count and a fresh
/// timestamp. Sub-category links found in the sidebar that are not yet in
/// <c>HasBindClasslist</c> are queued and inserted in one batch.
/// </summary>
/// <param name="siteClassInfo">Category to refresh; it is mutated in place and then saved.</param>
private void UpdateCat(SiteClassInfo siteClassInfo)
{
    string html = HtmlAnalysis.Gethtmlcode(siteClassInfo.Urlinfo);

    string breadcrumbHtml = RegGroupsX<string>(html, "<div class=\"w1200 breadNav\">(?<x>.*?)<div class=\"w1200\">");
    var breadcrumbLinks = RegGroupCollection(breadcrumbHtml, "<a href=\"(?<x>.*?)\">(?<y>.*?)</a>");
    if (breadcrumbLinks == null)
    {
        // No breadcrumb could be parsed: leave the record untouched.
        return;
    }

    int depth = breadcrumbLinks.Count;
    bool hasParent = depth > 1;
    if (hasParent)
    {
        // The second-to-last breadcrumb link is the direct parent.
        var parentLink = breadcrumbLinks[depth - 2];
        siteClassInfo.ParentUrl = domain + parentLink.Groups["x"].Value;
        siteClassInfo.ParentName = parentLink.Groups["y"].Value;
        siteClassInfo.ParentClass = RegGroupsX<string>(siteClassInfo.ParentUrl, "/list-(?<x>\\d+)");
    }
    siteClassInfo.HasChild = !hasParent;

    siteClassInfo.TotalProduct = RegGroupsX<int>(html, "搜索到 <span class=\"red\">(?<x>\\d+)</span> 件相关商品");

    string sidebarHtml = RegGroupsX<string>(html, "<div class=\"sortlist mb10\">(?<x>.*?)<!--左侧产品分类列表 E--");
    var sidebarLinks = string.IsNullOrEmpty(sidebarHtml)
        ? null
        : RegGroupCollection(sidebarHtml, "<a href=\"(?<x>.*?)\"\\s*>(?<y>.*?)</a>");

    if (sidebarLinks != null)
    {
        foreach (Match link in sidebarLinks)
        {
            string childUrl = domain + link.Groups["x"].Value;
            string childId = RegGroupsX<string>(childUrl, "/list-(?<x>\\d+)");

            // Skip links without a valid id and categories that are already known.
            if (!ValidCatId(childId) || HasBindClasslist.Exists(p => p.ClassId == childId))
            {
                continue;
            }

            string childName = link.Groups["y"].Value;
            if (string.IsNullOrEmpty(childName))
            {
                continue;
            }

            SiteClassInfo child = new SiteClassInfo
            {
                ParentUrl = "",
                ParentClass = "",
                ParentName = "",
                TotalProduct = 0,
                Urlinfo = childUrl,
                ClassId = childId,
                UpdateTime = DateTime.Now,
                IsDel = false,
                BindClassId = 0,
                BindClassName = "",
                HasChild = true,
                IsBind = false,
                IsHide = false,
                ClassName = childName,
                SiteId = Baseinfo.SiteId,
                ClassCrumble = "",
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(child);
            shopClasslist.Add(child);
        }

        // Flush everything queued so far in a single insert.
        if (shopClasslist.Count > 0)
        {
            new SiteClassInfoDB().AddSiteClass(shopClasslist);
            shopClasslist.Clear();
        }
    }

    siteClassInfo.UpdateTime = DateTime.Now;
    new mmbSiteClassInfoDB().UpdateSiteClass(siteClassInfo);
}
/// <summary>
/// Extracts category links from the "brandContent" section of a directory
/// page and registers every category not already present in
/// <c>HasBindClasslist</c>, inserting the new ones in one batch.
/// </summary>
/// <param name="directoryHtml">Raw HTML of the directory page.</param>
private void GetCatInfo(string directoryHtml)
{
    string catArea = RegGroupsX<string>(directoryHtml, "<div class=\"brandContent\">(?<x>.*?)<div id=\"div2\"");
    if (catArea == null)
    {
        return;
    }

    // Strip tabs and line breaks before matching the anchor pattern.
    catArea = catArea.Replace("\t", "").Replace("\r", "").Replace("\n", "");

    var list = RegGroupCollection(catArea, "<a href=\"(?<x>.*?)\" target=\"_blank\" title=\"(?<y>.*?)\">");

    // The section may contain no matching anchors; only enumerate when some were found.
    if (list != null)
    {
        foreach (Match item in list)
        {
            string tempUrl = item.Groups["x"].Value;
            string tempName = item.Groups["y"].Value;
            if (!string.IsNullOrEmpty(tempName))
            {
                tempName = tempName.Trim();
            }

            string tempid = RegGroupsX<string>(tempUrl, "/Category/(?<x>\\d+)-");
            if (string.IsNullOrEmpty(tempid))
            {
                // Not a category link (no numeric id in the URL); storing it would
                // create a record without an id.
                continue;
            }

            if (HasBindClasslist.Exists(c => c.ClassId == tempid))
            {
                continue;
            }

            SiteClassInfo iteminfo = new SiteClassInfo
            {
                ParentClass = "",
                ParentName = "",
                ClassName = tempName,
                ClassId = tempid,
                ParentUrl = "",
                IsDel = false,
                IsBind = false,
                IsHide = false,
                BindClassId = 0,
                BindClassName = "",
                HasChild = false,
                ClassCrumble = "",
                TotalProduct = 0,
                SiteId = Baseinfo.SiteId,
                Urlinfo = string.Format("http://www.lbxcn.com/{0}", tempUrl),
                UpdateTime = DateTime.Now,
                CreateDate = DateTime.Now
            };
            HasBindClasslist.Add(iteminfo);
            shopClasslist.Add(iteminfo);
        }
    }

    if (shopClasslist.Count > 0)
    {
        new SiteClassInfoDB().AddSiteClass(shopClasslist);
        shopClasslist.Clear();
    }
}