/// <summary>
/// Walks every result page of an Amazon China category and locates the image and
/// title anchors of each product listed on it.
/// </summary>
/// <remarks>
/// The located anchors are not stored or returned yet; the method currently only
/// performs the crawl and the node look-ups.
/// </remarks>
/// <param name="url">
/// Absolute category URL whose query string carries a <c>node</c> parameter.
/// A null, empty, non-absolute or node-less URL makes the method return without doing anything.
/// </param>
public static void AmazonProductList(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return;
    }

    // TryCreate instead of "new Uri": a malformed URL is treated like any other unusable input.
    Uri uri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out uri))
    {
        return;
    }

    // The category id is taken from the "node" query parameter.
    NameValueCollection nameValue = UrlHelper.GetQueryString(uri.Query);
    string node = nameValue["node"];
    if (string.IsNullOrEmpty(node))
    {
        return;
    }

    const string urlTemplate = "http://www.amazon.cn/s/ref=?rh=n:{0}&page={1}";

    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(
        DownloadAmazonPage(string.Format(urlTemplate, node, 1)));

    // The disabled pager entry on the first page holds the number of the last page.
    var maxPageNode = htmlDocument.DocumentNode.SelectSingleNode("//span[@class='pagnDisabled']");
    int maxPageNumber;
    if (maxPageNode == null || !int.TryParse(maxPageNode.InnerText, out maxPageNumber))
    {
        return;
    }

    for (int page = 1; page <= maxPageNumber; page++)
    {
        // Page 1 is already loaded; every other page is downloaded on demand.
        if (page != 1)
        {
            htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(
                DownloadAmazonPage(string.Format(urlTemplate, node, page)));
        }

        var productNodes = htmlDocument.DocumentNode.SelectNodes(
            "//div[@class='result product'] | //div[@class='result lastRow product']");
        if (productNodes == null)
        {
            // A page without products ends the whole crawl (unchanged behavior).
            return;
        }

        foreach (HtmlNode productNode in productNodes)
        {
            // TODO: these nodes are located but not persisted anywhere yet.
            var imageNode = productNode.SelectSingleNode("div[@class='image']/a");
            var titleNode = productNode.SelectSingleNode("div[@class='data']/h3[@class='title']/a");
        }
    }
}

/// <summary>
/// Downloads one Amazon page with a 30 second timeout and returns its HTML-decoded markup.
/// </summary>
private static string DownloadAmazonPage(string pageUrl)
{
    // Dispose the client once the response text has been read.
    using (var client = new HttpClient(pageUrl))
    {
        client.Timeout = 30000;
        return HttpUtility.HtmlDecode(client.Request());
    }
}
/// <summary>
/// Scratch entry point used to try out individual components by hand.
/// </summary>
/// <remarks>
/// Only the first region runs: it requests http://www.soxuan.com and returns.
/// Every statement after that first unconditional <c>return;</c> is unreachable and is kept
/// only as a record of earlier experiments.
/// </remarks>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    // Active experiment: plain HTTP request through the project's HttpClient.
    #region
    HttpClient hc3 = new HttpClient("http://www.soxuan.com");
    var html2=hc3.Request();
    return;
    #endregion

    // ---- Everything below this point is unreachable. ----

    // Earlier experiment: FileRead.
    #region
    FileRead.Read();
    FileRead.Read("GetPromotionRulesByIdsWithoutRuleExpands");
    return;
    #endregion

    // Earlier experiment: MongoDB look-up by object id.
    #region mongodb
    //var objectid= MongoDBOfficialTest.Insert(new ShoppingCartEntity(){CartId = "123",Ha = "ssss",Promotion = new PromotionEntity(){Date1="满赠新促销",Date2 = new List<string>(){"测试"}}});
    var objectid = new ObjectId("50e78f8c9e3eca2d6c538b9d");
    MongoDBOfficialTest.GetById(objectid);
    //MongoDBOfficialTest.GetById(objectid);
    //MongoDBTest.Insert(new ShoppingCartEntity(){CartId = "123456"});
    //MongoDBTest.GetById("123456");
    //MongoDBTest.Update(new ShoppingCartEntity(){CartId = "123456"});
    return;
    #endregion

    // Earlier experiment: read urls.json, run GoldSpider.SpiderPrice on it, write data.json.
    //var indexUrls= GoldSpider.GetUrls();
    //var urls= GoldSpider.Spider(indexUrls);
    //string json = JsonHelper.ToJson(urls);
    //File.WriteAllText(Environment.CurrentDirectory+"\\urls.json",json);
    var data= JsonHelper.FromJson<List<string>>(File.ReadAllText(Environment.CurrentDirectory + "\\urls.json"));
    var finalDatas= GoldSpider.SpiderPrice(data);
    File.WriteAllText(Environment.CurrentDirectory + "\\data.json", JsonHelper.ToJson(finalDatas));
    return;

    // Earlier experiment: SQLite calls.
    //SqliteTest.Test();
    //string connectionstring1 = "Data Source=e:\\sqlite.db3";
    //string connectionstring2 = "Data Source=e:\\sqlite.db3;PRAGMA cache_size=10000";
    //SqliteTest.Query("select id from test1 limit 0,10000", connectionstring2);
    //SqliteTest.Query("select id from test1 limit 0,10000", connectionstring1);
    //SqliteTest.Query("select id from test1 limit 0,10000", connectionstring1);
    SqliteTest.Memory(100000);
    SqliteTest.MemoryQuery("select id from test1 limit 0,10000");
    return;

    // Earlier experiment: WebBrowerManager run with a filter registered under the ".css" key.
    WebBrowerManager.Instance.Setup(new cEXWB());
    WebBrowerManager.Instance.TimeOut = 15;
    WebBrowerManager.Instance.FilterRequest = true;
    WebBrowerManager.Instance.FilterAction.Add(".css", (string key, string source) =>
    {
        // The filter matches when the source ends with the registered key.
        if(source.EndsWith(key))
        {
            return true;
        }
        return false;
    });
    string html1 = WebBrowerManager.Instance.Run("http://www.sge.sh/publish/sge/xqzx/jyxq/index.htm");
    Console.WriteLine(html1);
    Console.Read();
    return;

    // Earlier experiment: TaskManager.
    TaskManager taskManager=new TaskManager();
    taskManager.Test02();
    Console.ReadKey();
    return;

    // Earlier experiments: launching a browser process, HttpClient inside a using block.
    Process.Start("IExplore.exe", "www.northwindtraders.comTest");
    //
    //EncodingTest.Test();
    //return;
    using(HttpClient hc1 = new HttpClient("http://www.cnblogs.com"))
    {
        string html = hc1.Request();
    }
    //WebPage page = new WebPage(html, "http://www.cnblogs.com", Encoding.UTF8);
    //page.SaveHtmlAndResource(@"1.html", false, new DirConfig(@"z:\1"));
    //return;
    HttpClientTest.Test();
    return;

    // Earlier experiments: SQLite test, WebBrowerManager run, service locator, file download,
    // XML serialization and generic equality comparison.
    SqliteTest.Test();
    var uri = new Uri("http://misc.360buyimg.com/lib/js/2012/base-v1.js");
    //WebBrowerManager.Instance.ToVisitUrls = new List<string> { "http://www.360buy.com" };
    WebBrowerManager.Instance.Setup(new cEXWB());
    WebBrowerManager.Instance.Run(uri.ToString());
    BootStrapperManager.Initialize(new NinjectBootstrapper());
    var add = CommonBootStrapper.ServiceLocator.GetInstance<Test>();
    //add.Alert("ceshi");
    add.Test1();
    HttpClient hc = new HttpClient("http://misc.360buyimg.com/lib/js/2012/base-v1.js");
    hc.SaveFile("e:\\1.js");
    hc.Request();
    hc.BeginRequest((h) => { Console.Write(h); });
    Console.ReadKey();
    var s= hc.Request();
    var list = new List<UnionOrderTransBFD>();
    list.Add(new UnionOrderTransBFD() { ActualPrice = 1, CommissionPrice = 1, Rate = 1, Source = ">123", SONumber = 111111111111, UpdateDate = DateTime.Now });
    list.Add(new UnionOrderTransBFD() { ActualPrice = 1, CommissionPrice = 1, Rate = 1, Source = ">123", SONumber = 111111111111, UpdateDate = DateTime.Now });
    var xml= ObjectXmlSerializer.ToXml(list,"ccc",true,false);
    var a=new A();
    a.name = 1;
    //a.ObjectB = new B() { ItemCode = "1", Qty = 1 };
    var b = new A();
    b.name =1;
    //b.ObjectB = new B() { ItemCode = "1", Qty = 1 };
    var isEqual = DotNet.Common.Utility.GenericEqualityComparer<A>.Equals(a, b);
    Console.WriteLine(isEqual);
}
/// <summary>
/// Scrapes the Yihaodian "all categories" page and stores its three-level category tree
/// through <c>Insert</c>.
/// </summary>
/// <remarks>
/// A key is drawn from the "ProductCategory2" sequence for every category that is visited.
/// Second- and third-level categories without an <c>href</c> are not inserted, although their
/// key is still consumed. A first-level category without an <c>href</c> is inserted with an
/// empty link.
/// </remarks>
/// <param name="url">URL of the all-categories page, e.g. http://www.yihaodian.com/product/listAll.do.</param>
public static void YiHaoDianSpider(string url)
{
    string allSortHtml;
    // Dispose the client once the response text has been read.
    using (HttpClient hc = new HttpClient(url))
    {
        hc.Timeout = 30000;
        allSortHtml = hc.Request();
    }

    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(allSortHtml);
    var firstCategoryContainer = htmlDocument.DocumentNode.SelectNodes("//div[@class='alonesort']");
    if (firstCategoryContainer == null)
    {
        // SelectNodes yields null (not an empty list) when nothing matches.
        return;
    }

    foreach (HtmlNode firstCategoryNode in firstCategoryContainer)
    {
        // First-level category: resolve the anchor once instead of re-running the selector.
        var firstCategoryAnchors = firstCategoryNode.CssSelect(".mt>h3>a");
        var firstCategoryAnchor = firstCategoryAnchors == null ? null : firstCategoryAnchors.FirstOrDefault();
        if (firstCategoryAnchor == null)
        {
            continue;
        }

        var firstHref = firstCategoryAnchor.Attributes["href"];
        var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
        Insert(firstKey.ToString(), firstCategoryAnchor.InnerText,
            firstHref != null ? firstHref.Value : string.Empty, "0", "2");

        foreach (HtmlNode secondContainer in firstCategoryNode.CssSelect(".mc>.fore"))
        {
            // Second-level category.
            var secondCategoryNode = secondContainer.CssSelect("dt>a").FirstOrDefault();
            if (secondCategoryNode == null)
            {
                // No heading anchor: there is no second-level category to attach children to.
                continue;
            }

            var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
            var secondHref = secondCategoryNode.Attributes["href"];
            if (secondHref != null)
            {
                Insert(secondKey.ToString(), secondCategoryNode.InnerText, secondHref.Value,
                    firstKey.ToString(), "2");
            }

            // Third-level categories.
            foreach (HtmlNode thirdCategoryNode in secondContainer.CssSelect("dd>em>span>a"))
            {
                var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                var thirdHref = thirdCategoryNode.Attributes["href"];
                if (thirdHref != null)
                {
                    Insert(thirdKey.ToString(), thirdCategoryNode.InnerText, thirdHref.Value,
                        secondKey.ToString(), "2");
                }
            }
        }
    }
}
/// <summary>
/// Requests one fixed Yihaodian product-list search page.
/// </summary>
/// <remarks>
/// The response is fetched with a 30 second timeout and then discarded.
/// </remarks>
public static void YiHaoDianProductList()
{
    const string searchPageUrl =
        "http://www.yihaodian.com/ctg/searchPage/c5484-%E5%A5%B6%E8%8C%B6/b0/a-s1-v0-p5-price-d0-f04-m1-rt0-pid-k/?callback=jsonp1352021900435";

    HttpClient client = new HttpClient(searchPageUrl) { Timeout = 30000 };
    client.Request();
}
/// <summary>
/// Crawls a Yixun (51buy) product-list page and all of its follow-up pages, locating the
/// name, image, link, comment and price nodes of every listed product.
/// </summary>
/// <remarks>
/// The located values are not stored or returned yet.
/// The URL of each follow-up page is derived from the pager's first link by replacing the
/// seventh '-'-separated segment with the page number.
/// </remarks>
/// <param name="url">URL of the first list page.</param>
public static void WuYiBuyProductList(string url)
{
    string firstPageHtml;
    using (var hcFirst = new HttpClient(url))
    {
        hcFirst.Timeout = 30000;
        firstPageHtml = hcFirst.Request();
    }
    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(firstPageHtml);

    // Build the per-page URL template from the link to the second page.
    var urlTemplate = string.Empty;
    var secondPageNode = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[4]/div[2]/div[6]/div[1]/a[1]");
    if (secondPageNode != null)
    {
        var secondPageHref = secondPageNode.Attributes["href"];
        if (secondPageHref != null && !string.IsNullOrEmpty(secondPageHref.Value))
        {
            var segments = secondPageHref.Value.Split('-');
            // Guard the index: a link with fewer segments cannot be turned into a template.
            if (segments.Length > 6)
            {
                segments[6] = "{0}";
                urlTemplate = string.Join("-", segments);
            }
        }
    }

    // Only trust the pager when a usable template exists; otherwise crawl just the first page.
    var maxPageNumber = 1;
    if (!string.IsNullOrEmpty(urlTemplate))
    {
        var maxPageNode = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[4]/div[2]/div[6]/div[1]/a[last()-1]");
        // Parse into a temporary: a failed TryParse writes 0 to its out argument, which would
        // otherwise stop even the first page from being processed.
        int parsedPageNumber;
        if (maxPageNode != null && int.TryParse(maxPageNode.InnerText, out parsedPageNumber) && parsedPageNumber > 1)
        {
            maxPageNumber = parsedPageNumber;
        }
    }

    for (int i = 1; i <= maxPageNumber; i++)
    {
        if (i != 1)
        {
            // Request the page that was just addressed (previously the first page was re-requested).
            using (var hc = new HttpClient(string.Format(urlTemplate, i)))
            {
                hc.Timeout = 50000;
                htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(hc.Request());
            }
        }

        // Product entries of the current page.
        var productListNodes = htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[4]/div[2]/div[5]/ul[1]/li");
        if (productListNodes == null)
        {
            // A page without products ends the whole crawl (unchanged behavior).
            return;
        }

        foreach (HtmlNode productNode in productListNodes)
        {
            // Product name anchor; entries without one are skipped.
            var productNameNode = productNode.SelectSingleNode("./div[1]/h4[1]/a[1]");
            if (productNameNode == null)
            {
                continue;
            }

            // TODO: the values below are located but not persisted anywhere yet.
            // List thumbnail.
            var productImageNode = productNode.SelectSingleNode("./a[1]/img[1]");
            // Product link; empty when the anchor carries no href.
            var productHrefAttribute = productNameNode.Attributes["href"];
            var productHref = productHrefAttribute != null ? productHrefAttribute.Value : string.Empty;
            // Comment count.
            var commentNode = productNode.SelectSingleNode("./div[1]/p[2]/a[1]");
            // Price.
            var productPriceNode = productNode.SelectSingleNode("./div[2]/p[2]/strong[1]");
            // TODO: extract the product's original id.
        }
    }
}
/// <summary>
/// Crawls every list page of a Suning category and stores each product that is not yet known.
/// </summary>
/// <param name="obj">
/// The category URL as a <see cref="string"/>; its query string must carry a numeric
/// <c>ci</c> (category id) parameter.
/// </param>
/// <returns>
/// A boxed <c>true</c> when the crawl ran to the end; a boxed <c>false</c> when the argument is
/// not a usable URL, the category id is missing or not numeric, or the pager of the first page
/// cannot be read.
/// </returns>
public static object SuNingProductList(object obj)
{
    var url = obj as string;
    if (string.IsNullOrEmpty(url))
    {
        return false;
    }

    Uri uri;
    if (!Uri.TryCreate(url, UriKind.Absolute, out uri))
    {
        return false;
    }

    // The category id is taken from the "ci" query parameter.
    NameValueCollection nameValue = UrlHelper.GetQueryString(uri.Query);
    string cid = nameValue["ci"];
    if (string.IsNullOrEmpty(cid))
    {
        return false;
    }

    // Validate once up front instead of throwing from int.Parse in the middle of the crawl.
    int categoryId;
    if (!int.TryParse(cid, out categoryId))
    {
        return false;
    }

    // {0} = category id, {1} = zero-based page index.
    const string urlTemplate = "http://search.suning.com/emall/strd.do?ci={0}&cityId=9017&cp={1}&il=0&si=5&st=14&iy=-1";
    const string productXPath = "/html[1]/body[1]/div[1]/div[7]/div[2]/div[6]/ul[1]/li";

    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(
        DownloadSuNingPage(string.Format(urlTemplate, cid, 0)));

    // The second-to-last pager link carries the number of the last page.
    var pageContainer = htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[1]/div[7]/div[2]/div[8]/a");
    if (pageContainer == null || pageContainer.Count < 2)
    {
        return false;
    }

    int lastPageNumber;
    if (!int.TryParse(pageContainer[pageContainer.Count - 2].InnerText, out lastPageNumber))
    {
        return false;
    }

    for (int i = 0; i < lastPageNumber; i++)
    {
        // Page 0 is already loaded; every other page is downloaded on demand.
        if (i != 0)
        {
            htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(
                DownloadSuNingPage(string.Format(urlTemplate, cid, i)));
        }

        var productLis = htmlDocument.DocumentNode.SelectNodes(productXPath);
        if (productLis == null)
        {
            // SelectNodes yields null when the page has no product entries; try the next page.
            continue;
        }

        foreach (var htmlNode in productLis)
        {
            SaveSuNingProduct(htmlNode, categoryId);
        }
    }

    return true;
}

/// <summary>
/// Downloads one Suning list page with a 30 second timeout.
/// </summary>
private static string DownloadSuNingPage(string pageUrl)
{
    using (var client = new HttpClient(pageUrl))
    {
        client.Timeout = 30000;
        return client.Request();
    }
}

/// <summary>
/// Reads name, link, picture and comment count from one product list item and inserts the
/// product unless its link is already stored. Items without a link are skipped.
/// </summary>
private static void SaveSuNingProduct(HtmlNode productItem, int categoryId)
{
    var aNode = productItem.SelectSingleNode("a");
    if (aNode == null)
    {
        return;
    }

    // The link identifies the product, so an entry without one cannot be stored.
    var hrefAttribute = aNode.Attributes["href"];
    if (hrefAttribute == null || string.IsNullOrEmpty(hrefAttribute.Value))
    {
        return;
    }
    var href = hrefAttribute.Value;

    // Product name; empty when the anchor has no title attribute.
    var titleAttribute = aNode.Attributes["title"];
    var name = titleAttribute != null ? titleAttribute.Value : string.Empty;

    // Picture URL is read from the "src2" attribute of the image.
    var picUrl = string.Empty;
    var imageNode = aNode.SelectSingleNode("img");
    if (imageNode != null)
    {
        var pictureAttribute = imageNode.Attributes["src2"];
        if (pictureAttribute != null && !string.IsNullOrEmpty(pictureAttribute.Value))
        {
            picUrl = pictureAttribute.Value;
        }
    }

    // Comment count; stays 0 when the node is missing or not numeric.
    int commentNum = 0;
    var commentNode = productItem.SelectSingleNode("div[1]/div[1]/p[1]/a[1]/i[1]");
    if (commentNode != null)
    {
        int.TryParse(commentNode.InnerText, out commentNum);
    }

    if (!DataAccess.IsExistUrl(href))
    {
        DataAccess.InsertProduct(name, href, categoryId, commentNum, picUrl);
    }
}
/// <summary>
/// Imports a fixed set of product pages from www.yihui-lighting.com: downloads every image
/// referenced by a page and stores the page's content block as a <c>ProductInfo</c> in category 2.
/// </summary>
/// <remarks>
/// The import is best-effort: a failed image download or a failed insert is traced and skipped
/// so that the remaining pages are still processed.
/// </remarks>
public void HtmlAgilityPack_Demo01()
{
    // Ids of the product pages to import.
    var productIds = new List<string>
    {
        "2", "3", "4", "5", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16",
        "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "31"
    };

    const string imageBaseUrl = "http://www.yihui-lighting.com/UpProduct";
    const string imageDirectory = "z:\\upload\\" + "images\\content";

    foreach (string productId in productIds)
    {
        HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
        using (HttpClient httpClient1 = new HttpClient("http://www.yihui-lighting.com/productshow.asp?id=" + productId))
        {
            document.LoadHtml(httpClient1.Request());
        }

        var contentNodes = document.DocumentNode.SelectNodes("//*[@id=\"mitte2\"]");
        if (contentNodes == null)
        {
            // SelectNodes yields null when the page has no content block; nothing to import.
            continue;
        }

        // Download the page's images once, not once per content node.
        var images = document.DocumentNode.SelectNodes("//img");
        if (images != null && images.Count > 0)
        {
            if (!Directory.Exists(imageDirectory))
            {
                Directory.CreateDirectory(imageDirectory);
            }

            foreach (HtmlNode imageNode in images)
            {
                var srcAttribute = imageNode.Attributes["src"];
                if (srcAttribute == null || srcAttribute.Value == null)
                {
                    continue;
                }

                // Only the file name of the src is used, both remotely and locally.
                string imageFileName = srcAttribute.Value.Substring(srcAttribute.Value.LastIndexOf('/') + 1);
                try
                {
                    downloadfile(imageBaseUrl + "//" + imageFileName, imageFileName);
                }
                catch (Exception ex)
                {
                    // Best-effort: a missing image must not abort the import.
                    System.Diagnostics.Trace.TraceWarning("Image download failed for '{0}': {1}", imageFileName, ex.Message);
                }
            }
        }

        // The title is shared by all content nodes of the page.
        var titleNode = document.DocumentNode.SelectSingleNode("/html/body/div/div[3]/div[3]/div[2]/div/h2/font");
        var title = titleNode != null ? (titleNode.InnerText ?? "") : "";

        foreach (HtmlNode contentNode in contentNodes)
        {
            if (string.IsNullOrEmpty(contentNode.InnerText))
            {
                continue;
            }

            var model = new ProductInfo();
            model.Title = title;
            model.CategoryId = 2;
            // Point image references at the local upload directory.
            model.Content = contentNode.InnerHtml.Replace("UpProduct/", "upload/images/content/");
            model.InDate = DateTime.Now;
            model.DisplayOrder = OrderGenerator.NewOrder();

            IProductService productService = new ProductService();
            try
            {
                productService.Add(model);
            }
            catch (Exception ex)
            {
                // Best-effort: keep importing the remaining pages.
                System.Diagnostics.Trace.TraceWarning("Product insert failed for page id '{0}': {1}", productId, ex.Message);
            }
        }
    }
}
/// <summary>
/// Crawls every list page of a Dangdang category and stores each product that is not yet known.
/// </summary>
/// <remarks>
/// The method returns without storing anything when the URL is unusable, the category id is
/// missing or not numeric, or the page count cannot be read from the first page. The crawl also
/// stops at the first page that contains no product entries.
/// </remarks>
/// <param name="categoryUrl">Absolute category URL whose query string carries a numeric <c>cat</c> parameter.</param>
public static void DangDangProductList(string categoryUrl)
{
    Uri uri;
    if (string.IsNullOrEmpty(categoryUrl) || !Uri.TryCreate(categoryUrl, UriKind.Absolute, out uri))
    {
        return;
    }

    // The category id is taken from the "cat" query parameter.
    NameValueCollection nameValue = UrlHelper.GetQueryString(uri.Query);
    string cid = nameValue["cat"];
    if (string.IsNullOrEmpty(cid))
    {
        return;
    }

    // Validate once up front instead of throwing from int.Parse in the middle of the crawl.
    int categoryId;
    if (!int.TryParse(cid, out categoryId))
    {
        return;
    }

    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(DownloadDangDangPage(categoryUrl));

    // The pager text has the form "1 / 19"; the number after the slash is the page count.
    var maxPageNode = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[2]/div[3]/div[5]/div[1]/div[2]/span[1]");
    if (maxPageNode == null || string.IsNullOrEmpty(maxPageNode.InnerText))
    {
        return;
    }

    var pagerText = maxPageNode.InnerText;
    var slashIndex = pagerText.LastIndexOf('/');
    int maxPageNumber;
    if (slashIndex < 0 || !int.TryParse(pagerText.Substring(slashIndex + 1), out maxPageNumber))
    {
        return;
    }

    // {0} = category id, {1} = one-based page index.
    const string tempUrl = "http://category.dangdang.com/all/?category_id={0}&page_index={1}";
    for (int j = 1; j <= maxPageNumber; j++)
    {
        if (j != 1)
        {
            var html = DownloadDangDangPage(string.Format(tempUrl, cid, j));
            if (string.IsNullOrEmpty(html))
            {
                // Nothing was downloaded: skip this page instead of re-processing the previous one.
                continue;
            }
            htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);
        }

        var productList = htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[2]/div[3]/div[7]/div");
        if (productList == null || !productList.Any())
        {
            return;
        }

        foreach (HtmlNode htmlNode in productList)
        {
            SaveDangDangProduct(htmlNode, categoryId);
        }
    }
}

/// <summary>
/// Downloads one Dangdang page with a 30 second timeout.
/// </summary>
private static string DownloadDangDangPage(string pageUrl)
{
    using (var client = new HttpClient(pageUrl))
    {
        client.Timeout = 30000;
        return client.Request();
    }
}

/// <summary>
/// Reads link, picture, title and comment count from one list entry and inserts the product
/// unless its link is already stored. Entries that are not list items or have no link are skipped.
/// </summary>
private static void SaveDangDangProduct(HtmlNode productItem, int categoryId)
{
    var classAttribute = productItem.Attributes["class"];
    if (classAttribute == null || !classAttribute.Value.Contains("listitem"))
    {
        return;
    }

    // Anchor to the product page; the link identifies the product.
    var a = productItem.SelectSingleNode("p[1]/a[1]");
    if (a == null || a.Attributes["href"] == null || string.IsNullOrEmpty(a.Attributes["href"].Value))
    {
        return;
    }
    var productUrl = a.Attributes["href"].Value;

    // List picture; empty when the image or its src is missing.
    var image = a.SelectSingleNode("img[1]");
    var imageSource = image != null ? image.Attributes["src"] : null;
    var productImageUrl = imageSource != null ? imageSource.Value : string.Empty;

    // Product name; empty when the title anchor is missing.
    var titleNode = productItem.SelectSingleNode("p[3]/a[1]");
    var title = titleNode != null ? titleNode.InnerText : string.Empty;

    // Comment count: the digits between the first character and "条".
    var commentCount = 0;
    var commentNode = productItem.CssSelect("p.starlevel").FirstOrDefault();
    if (commentNode != null && !string.IsNullOrEmpty(commentNode.InnerText))
    {
        var commentText = commentNode.InnerText;
        var index = commentText.IndexOf("条", System.StringComparison.Ordinal);
        if (index >= 1)
        {
            int.TryParse(commentText.Substring(1, index - 1), out commentCount);
        }
    }

    if (!DataAccess.IsExistUrl(productUrl))
    {
        DataAccess.InsertProduct(title, productUrl, categoryId, commentCount, productImageUrl);
    }
}
/// <summary>
/// Crawls all list pages of the 360buy category 670-671-672 and stores every product link
/// that is not yet known through the "InsertUrls" data command.
/// </summary>
/// <remarks>
/// The page count is read from the fourth pager link of the first page. The method waits one
/// second after each successfully processed page and shows a message box when it is finished,
/// whether or not any page was crawled.
/// </remarks>
public void Process()
{
    const string url = "http://www.360buy.com/products/670-671-672-0-0-0-0-0-0-0-1-1-{0}.html";

    // The first page is only used to find out how many pages exist.
    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(Download360BuyListPage(string.Format(url, 1)));
    var maxPager = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='pagin fr']/a[4]");

    // Stays 0 (no pages crawled) when the pager is missing or not numeric.
    int maxPageNumber = 0;
    if (maxPager != null)
    {
        int.TryParse(maxPager.InnerText, out maxPageNumber);
    }

    for (int i = 1; i <= maxPageNumber; i++)
    {
        var doc = HtmlAgilityPackHelper.GetHtmlDocument(Download360BuyListPage(string.Format(url, i)));
        var productContainer = doc.GetElementbyId("plist");
        if (productContainer == null)
        {
            continue;
        }

        // ".//a" keeps the search inside the product container; a leading "//" would select
        // every anchor of the whole document regardless of the node it is called on.
        var productNodes = productContainer.SelectNodes(".//a");
        if (productNodes == null || productNodes.Count == 0)
        {
            continue;
        }

        foreach (var productNode in productNodes)
        {
            var hrefAttribute = productNode.Attributes["href"];
            if (hrefAttribute == null || hrefAttribute.Value == null || !hrefAttribute.Value.Contains("product/"))
            {
                continue;
            }

            var href = hrefAttribute.Value;
            if (IsExistUrl(href))
            {
                continue;
            }

            using (DataCommand cmd = DataCommandManager.GetDataCommand("InsertUrls"))
            {
                cmd.SetParameterValue("@Guid", System.Guid.NewGuid().ToString());
                cmd.SetParameterValue("@Url", href);
                cmd.ExecuteNonQuery();
            }
        }

        // Throttle between pages.
        Thread.Sleep(1000);
    }

    MessageBox.Show("笔记本数据采集完了");
}

/// <summary>
/// Downloads one 360buy list page and releases the client afterwards.
/// </summary>
private static string Download360BuyListPage(string pageUrl)
{
    using (HttpClient client = new HttpClient(pageUrl))
    {
        return client.Request();
    }
}
/// <summary>
/// Crawls all list pages of one 360buy category and inserts every product that is not yet known.
/// </summary>
/// <remarks>
/// The page count is read from the fourth pager link of page 1. Each request reports its status
/// to <c>Message</c>. The method waits five seconds after each successfully processed page.
/// </remarks>
/// <param name="url">Format string for the list pages; <c>{0}</c> is replaced by the page number.</param>
/// <param name="categoryId">Numeric category id under which the products are inserted.</param>
public void ProcessCategory(string url,string categoryId)
{
    // Page 1 is only used to find out how many pages exist.
    HttpClient firstPageClient = new HttpClient(string.Format(url, 1));
    firstPageClient.Timeout = 1000000;
    firstPageClient.StatusUpdate += Message;
    var firstPageDocument = HtmlAgilityPackHelper.GetHtmlDocument(firstPageClient.Request());

    var pagerLink = firstPageDocument.DocumentNode.SelectSingleNode("//div[@class='pagin fr']/a[4]");
    if (pagerLink == null)
    {
        return;
    }

    int pageCount = 0;
    if (!int.TryParse(pagerLink.InnerText, out pageCount) || pageCount <= 0)
    {
        return;
    }

    for (int page = 1; page <= pageCount; page++)
    {
        HttpClient pageClient = new HttpClient(string.Format(url, page));
        pageClient.Timeout = 1000000;
        pageClient.StatusUpdate += Message;
        var pageDocument = HtmlAgilityPackHelper.GetHtmlDocument(pageClient.Request());

        var container = pageDocument.GetElementbyId("plist");
        if (container == null)
        {
            continue;
        }

        var anchors = container.CssSelect("div.p-name>a");
        if (anchors == null || !anchors.Any())
        {
            continue;
        }

        foreach (var anchor in anchors)
        {
            // Only anchors whose href points at a product page are of interest.
            var hrefAttribute = anchor.Attributes["href"];
            if (hrefAttribute == null || hrefAttribute.Value == null || !hrefAttribute.Value.Contains("product/"))
            {
                continue;
            }

            var link = hrefAttribute.Value;
            if (IsExistUrl(link))
            {
                continue;
            }

            InsertProduct(anchor.InnerText, link, int.Parse(categoryId), 0);
        }

        // Throttle between pages.
        Thread.Sleep(5000);
    }
}
/// <summary>
/// Downloads a 360buy product page and extracts the textual price that follows the
/// "京东价:¥" marker inside the page's price script.
/// </summary>
/// <remarks>
/// The extracted price is not stored or returned yet. The promotion information of the page is
/// loaded via AJAX and is therefore not available in the downloaded markup.
/// </remarks>
/// <param name="productId">Id of the product; currently not used.</param>
/// <param name="url">URL of the product page.</param>
public void GatherPrice(string productId,string url)
{
    string html;
    using (HttpClient client = new HttpClient(url))
    {
        html = client.Request();
    }
    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);

    // Title.
    var title = htmlDocument.DocumentNode.SelectSingleNode("/html/body/div[5]/div/div/h1");
    // Price image.
    var price = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='p-price']/img");
    // Script block that contains the price as text.
    var priceText = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[5]/div[1]/div[2]/ul[1]/script[1]");
    // Product picture.
    var defaultImage = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[5]/div[1]/div[2]/div[1]");

    if (title == null || price == null || priceText == null)
    {
        return;
    }

    const string priceMarker = "京东价:¥";
    var scriptText = priceText.InnerText;

    // A missing marker or terminator previously fed -1 into IndexOf/Substring and threw.
    var beginIndex = scriptText.IndexOf(priceMarker, System.StringComparison.Ordinal);
    if (beginIndex < 0)
    {
        return;
    }

    var priceStart = beginIndex + priceMarker.Length;
    var endIndex = scriptText.IndexOf("。", priceStart, System.StringComparison.Ordinal);
    if (endIndex < 0)
    {
        return;
    }

    // TODO: the price is extracted but not persisted anywhere yet.
    var readPrice = scriptText.Substring(priceStart, endIndex - priceStart);
}
/// <summary>
/// Scrapes the 360buy "all categories" page and stores its three-level category tree
/// through <c>Insert</c>.
/// </summary>
/// <remarks>
/// A key is drawn from the "ProductCategory" sequence for every category that is visited.
/// First- and second-level categories are inserted with an empty link. A third-level category
/// is only inserted when it has a non-empty <c>href</c>, although its key is still consumed.
/// </remarks>
public void GatherAll()
{
    const string url = "http://www.360buy.com/allSort.aspx";

    string allSortHtml;
    using (HttpClient hc = new HttpClient(url))
    {
        hc.Timeout = 30000;
        allSortHtml = hc.Request();
    }

    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(allSortHtml);

    // Without the category container the page is not the expected one.
    var categoryContainer = htmlDocument.GetElementbyId("allsort");
    if (categoryContainer == null)
    {
        return;
    }

    var firstCategory = htmlDocument.DocumentNode.SelectNodes("/html/body/div[5]/div/div/div/h2");
    if (firstCategory == null)
    {
        // SelectNodes yields null (not an empty list) when nothing matches.
        return;
    }

    foreach (var first in firstCategory)
    {
        // First-level category.
        var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory");
        Insert(firstKey.ToString(), first.InnerText, "", "0");

        // Second-level categories are the <dt> elements two levels above the heading.
        var secondCategorys = first.ParentNode.ParentNode.CssSelect("dt");
        foreach (var secondCategory in secondCategorys)
        {
            var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory");
            Insert(secondKey.ToString(), secondCategory.InnerText, "", firstKey.ToString());

            // Third-level categories are the links next to the <dt>.
            var thirdCategorys = secondCategory.ParentNode.CssSelect("dd>em>a");
            foreach (var thirdCategory in thirdCategorys)
            {
                var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory");
                var hrefAttribute = thirdCategory.Attributes["href"];
                if (hrefAttribute != null && !string.IsNullOrEmpty(hrefAttribute.Value))
                {
                    Insert(thirdKey.ToString(), thirdCategory.InnerText, hrefAttribute.Value, secondKey.ToString());
                }
            }
        }
    }
}
/// <summary>
/// Saves the content held in an <c>HttpClient</c>'s <c>MemoryStream</c> to a local file.
/// </summary>
/// <param name="httpClient">Client whose <c>MemoryStream</c> is written to disk
/// (presumably after a request has been made; confirm against callers).</param>
/// <param name="dirPath">Directory the file is written to.</param>
/// <param name="fileName">File name; when null or empty, the name reported by
/// <c>httpClient.GetFileName()</c> is used.</param>
/// <returns>The value returned by <c>FileUtility.SaveFile</c> for the resolved full path.</returns>
internal static string SaveFile(HttpClient httpClient, string dirPath, string fileName)
{
    // Fall back to the client's own file name when the caller did not supply one.
    string targetName = string.IsNullOrEmpty(fileName) ? httpClient.GetFileName() : fileName;

    // GetFullPath normalizes the combined path.
    string targetPath = Path.GetFullPath(dirPath + "\\" + targetName);

    return FileUtility.SaveFile(targetPath, httpClient.MemoryStream);
}
/// <summary>
/// Synchronously saves the page's HTML as a local file together with the CSS, scripts, images
/// and Flash files it references, rewriting the references in the saved HTML through
/// <c>MatchUrl</c> / <c>ReplaceBackgroundUrl</c>.
/// </summary>
/// <param name="fileName">Name of the HTML file to write inside <c>dirConfig.HtmlDirPath</c>.</param>
/// <param name="ignoreScript">When true, the HTML is first passed through <c>FilterScript()</c>
/// (per the original documentation: script tags are ignored when saving).</param>
/// <param name="dirConfig">Directory configuration for the download (HTML, CSS, JS, image and Flash folders).</param>
/// <returns>The value returned by <c>FileUtility.SaveText</c> for the HTML file; the original
/// documentation describes this as the local file path, or null on failure.</returns>
/// <exception cref="Exception">Thrown when <c>dirConfig.CheckLegal()</c> fails or when no
/// current url is set.</exception>
public string SaveHtmlAndResource(string fileName, bool ignoreScript, DirConfig dirConfig)
{
    string htmlContent = this.html;
    // Message: "the configured paths are not on the same disk".
    if (!dirConfig.CheckLegal())
    {
        throw new Exception("路径配置不在统一磁盘下");
    }
    dirConfig.CreateDir();
    // Message: "the current url has not been specified".
    if (string.IsNullOrEmpty(this.url))
    {
        throw new Exception("未指定当前url");
    }
    // Optionally drop scripts before anything else is processed.
    if (ignoreScript)
    {
        htmlContent = this.FilterScript();
    }
    // Region: save all CSS files and the images they reference.
    #region 保存所有css和css中引用的图片
    htmlContent = RegexLibrary.RegCssLink.Replace(htmlContent, delegate(Match match)
    {
        // A CSS file may itself reference images, so a custom download callback is supplied:
        // it downloads those images and rewrites the references inside the CSS file.
        // NOTE(review): the attribute name passed here is "src", the same as for scripts and
        // images — confirm this is what MatchUrl expects for CSS links.
        return this.MatchUrl(match, dirConfig, this.url, dirConfig.HtmlDirPath, dirConfig.CssDirPath, "src", null, delegate(string cssUrl)
        {
            // CSS content and CSS save path; the path defaults to the remote URL so that the
            // reference is left as-is when the download yields nothing.
            string cssContent, cssSavePath = cssUrl;
            // Request the CSS file.
            //cRequest = Spider.CreateRequest(cssUrl);
            //cRequest.SetHeader("Referer", this.Url);
            //cResponse = Spider.Get(cRequest);
            HttpClient hc = new HttpClient(cssUrl);
            string content = hc.Request();
            // Only continue when the request returned content.
            if (!string.IsNullOrEmpty(content))
            {
                string tempFileName = hc.GetFileName();
                // Determine the absolute path the CSS file is saved to: either a directory
                // derived from the page and CSS URLs, or the configured CSS directory.
                cssSavePath = dirConfig.UseWebSite ? Path.GetFullPath(PathUtility.GetSaveDir(this.Url, cssUrl, dirConfig.HtmlDirPath) + tempFileName) : Path.GetFullPath(dirConfig.CssDirPath + "\\" + tempFileName);
                // Take the CSS content from the response.
                cssContent = content;// cResponse.GetContent(null, false);
                // Download the images referenced by the CSS and rewrite their references.
                cssContent = this.ReplaceBackgroundUrl(cssContent, cssUrl, dirConfig, Path.GetDirectoryName(cssSavePath), dirConfig.ImgDirPath, false, null);
                // Save the CSS locally.
                cssSavePath = FileUtility.SaveText(cssSavePath, cssContent, hc.Encoding);
            }
            // Return the local path of the saved CSS (or the remote URL when nothing was saved).
            return cssSavePath;
        });
    });
    #endregion
    // Region: save all script files.
    #region 保存所有JS
    htmlContent = RegexLibrary.RegScriptLink.Replace(htmlContent, delegate(Match match)
    {
        return this.MatchUrl(match, dirConfig, this.url, dirConfig.HtmlDirPath, dirConfig.JsDirPath, "src", null, null);
    });
    #endregion
    // Region: save all images.
    #region 保存所有图片
    htmlContent = RegexLibrary.RegImg.Replace(htmlContent, delegate(Match match)
    {
        return this.MatchUrl(match, dirConfig, this.url, dirConfig.HtmlDirPath, dirConfig.ImgDirPath, "src", null, null);
    });
    #endregion
    // Region: save all Flash files.
    #region 保存所有Flash
    htmlContent = RegexLibrary.RegFlash.Replace(htmlContent, delegate(Match match)
    {
        return this.MatchUrl(match, dirConfig, this.url, dirConfig.HtmlDirPath, dirConfig.FlashDirPath, "src", null, null);
    });
    #endregion
    // Region: images referenced by CSS embedded in the page itself.
    #region 当前页面内嵌css中的图片
    htmlContent = this.ReplaceBackgroundUrl(htmlContent, this.Url, dirConfig, dirConfig.HtmlDirPath, dirConfig.ImgDirPath, false, null);
    #endregion
    // Finally write the rewritten HTML using the page's encoding.
    return FileUtility.SaveText(Path.GetFullPath(dirConfig.HtmlDirPath + "\\" + fileName), htmlContent, this.encode);
}
/// <summary>
/// Synchronously downloads a resource (CSS, script, Flash, image, ...) and saves it to a local directory.
/// </summary>
/// <param name="fileUrl">Public URL of the resource.</param>
/// <param name="dirPath">Local directory the resource is saved to.</param>
/// <param name="fileName">File name; when null or empty, <c>SaveFile</c> derives it from the client.</param>
/// <returns>
/// An empty string when the request returned no content; otherwise the value returned by <c>SaveFile</c>.
/// </returns>
public static string SaveResource(string fileUrl, string dirPath, string fileName)
{
    HttpClient client = new HttpClient(fileUrl);

    // Nothing to save when the response body is null or empty.
    return string.IsNullOrEmpty(client.Request())
        ? string.Empty
        : SaveFile(client, dirPath, fileName);
}