/// <summary>
/// Crawls every category listing URL returned by <c>GetURLList</c> page by page and,
/// for each product found on a page, downloads its detail page and hands it to
/// <c>AnalyticalDetailHtml</c>.
/// </summary>
/// <remarks>
/// Each category URL is used as a format string whose <c>{0}</c> placeholder receives the
/// 1-based page number. Paging for a category stops at the first page whose HTML is
/// missing or does not contain the text "itemid".
/// </remarks>
public void GetWebSiteForYiYaoWang()
{
    // User agent sent with every request (presents the crawler as the Baidu spider).
    const string spiderUserAgent = "Baiduspider+(+http://www.baidu.com/search/spider.htm)";
    // Value assigned to HttpItem.Timeout for every request.
    const int requestTimeout = 10000;

    // Category listing URL templates to crawl.
    IList<string> categoryURLList = GetURLList();
    // Helper used to download pages.
    HttpHelper helper = new HttpHelper();

    for (int i = 0; i < categoryURLList.Count; i++)
    {
        string webStieUrl = categoryURLList[i];

        // The page counter is scoped to the category so it always restarts at page 1.
        int pageNumber = 0;
        while (true)
        {
            pageNumber++;
            string url = string.Format(webStieUrl, pageNumber);

            HttpItem item = new HttpItem();
            item.URL = url;
            item.Timeout = requestTimeout;
            item.ResultType = ResultType.String;
            item.UserAgent = spiderUserAgent;
            HttpResult result = helper.GetHtml(item);

            // No (more) data for this category: move on to the next one.
            // A missing response body is treated the same way instead of throwing.
            if (result == null
                || string.IsNullOrEmpty(result.Html)
                || result.Html.IndexOf("itemid", System.StringComparison.Ordinal) == -1)
            {
                break;
            }

            IList<SpiderProductInfo> productInfoList = AnalyticalHtml(result);
            // Resolve the count once, null-safely, before it is used in the log line.
            int productCount = productInfoList == null ? 0 : productInfoList.Count;

            Log("---------------------------------------------------");
            Log("===目标:" + url + " ===");
            Log("===已完成抓取,总条数:" + productCount + "时间:" + System.DateTime.Now + "===");

            // Fetch and process the detail page of every product on this listing page.
            for (int j = 0; j < productCount; j++)
            {
                SpiderProductInfo spInfo = productInfoList[j];

                HttpItem detailItem = new HttpItem();
                detailItem.URL = spInfo.Url;
                detailItem.Timeout = requestTimeout;
                detailItem.ResultType = ResultType.String;
                detailItem.UserAgent = spiderUserAgent;
                HttpResult detailResult = helper.GetHtml(detailItem);

                // AnalyticalDetailHtml parses detailResult.Html; skip products whose
                // detail page could not be downloaded rather than aborting the crawl.
                if (detailResult == null || string.IsNullOrEmpty(detailResult.Html))
                {
                    Log("===目标:" + spInfo.Url + " 详情页获取失败,已跳过===");
                    continue;
                }

                AnalyticalDetailHtml(detailResult, spInfo);
            }

            Log("===目标:" + url + "===");
            Log("===已完成存储!===");
        }
    }
}
/// <summary>
/// Collects product details by delegating to the spider registered in
/// <c>spiderDictionary</c> under the product's <c>ECPlatformId</c>.
/// </summary>
/// <param name="spiderProduct">Product to collect; its <c>ECPlatformId</c> selects the spider.</param>
/// <returns>
/// The value returned by the registered spider's <c>SpiderProductDetail</c>, or
/// <c>null</c> when no spider is registered for that <c>ECPlatformId</c>.
/// </returns>
public static ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
{
    // No spider registered for this platform: nothing to collect.
    if (!spiderDictionary.ContainsKey(spiderProduct.ECPlatformId))
    {
        return null;
    }

    var platformSpider = spiderDictionary[spiderProduct.ECPlatformId];
    return platformSpider.SpiderProductDetail(spiderProduct);
}
/// <summary>
/// Looks up the spider registered in <c>spiderDictionary</c> for the product's
/// <c>ECPlatformId</c> and lets it collect the product details.
/// </summary>
/// <param name="spiderProduct">Product whose details should be collected.</param>
/// <returns>
/// Whatever the registered spider's <c>SpiderProductDetail</c> returns; <c>null</c> if
/// <c>spiderDictionary</c> has no entry for the product's <c>ECPlatformId</c>.
/// </returns>
public static ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
{
    var platformId = spiderProduct.ECPlatformId;

    return spiderDictionary.ContainsKey(platformId)
        ? spiderDictionary[platformId].SpiderProductDetail(spiderProduct)
        : null;
}
/// <summary>
/// Builds a <c>ProductInfo</c> for the given product from its already-downloaded HTML,
/// reading the price from the price image inside <c>div.p-price</c>.
/// </summary>
/// <param name="spiderProduct">
/// Product to process; its <c>HtmlSource</c>, <c>ProductId</c> and <c>Url</c> are copied
/// to the result.
/// </param>
/// <returns>
/// A new <c>ProductInfo</c>. <c>Price</c> is 0 when the price image is missing, has no
/// <c>src</c>, or the text recognized from it cannot be parsed as a decimal.
/// </returns>
public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
{
    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(spiderProduct.HtmlSource);

    // Price is rendered as an image; SelectSingleNode yields null when the page has none.
    var price = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='p-price']/img");

    decimal realPrice = 0;
    if (price != null)
    {
        var priceSource = price.Attributes["src"];
        if (priceSource != null && !string.IsNullOrEmpty(priceSource.Value))
        {
            // TryParse leaves realPrice at 0 when recognition produces no usable number.
            decimal.TryParse(ImageProcess.Recognize(priceSource.Value), out realPrice);
        }
    }

    // NOTE: the former title / text-price lookup was removed. Its parsed value was never
    // used, and it threw ArgumentOutOfRangeException whenever the expected price text
    // markers were absent from the page script.
    return new ProductInfo()
    {
        Source = spiderProduct.HtmlSource,
        ProductId = spiderProduct.ProductId,
        Url = spiderProduct.Url,
        Price = realPrice
    };
}
/// <summary>
/// Splits a category listing page into one <c>SpiderProductInfo</c> per product.
/// </summary>
/// <remarks>
/// Products are the <c>div</c> elements with class <c>itemSearchResultCon</c>. For each one
/// the id comes from the div's <c>itemid</c> attribute, the URL from the <c>href</c> of a
/// direct child <c>a</c> element (the last such anchor wins), and the name from the
/// <c>alt</c> of an <c>img</c> directly inside that anchor. Divs without a numeric
/// <c>itemid</c> or without a linked anchor are skipped.
/// </remarks>
/// <param name="result">Download result whose <c>Html</c> holds the listing page markup.</param>
/// <returns>The products found; an empty list (never <c>null</c>) when there are none.</returns>
private IList<SpiderProductInfo> AnalyticalHtml(HttpResult result)
{
    IList<SpiderProductInfo> productInfoList = new List<SpiderProductInfo>();

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(result.Html);

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var collection = doc.DocumentNode.SelectNodes("//div[@class='itemSearchResultCon']");
    if (collection == null)
    {
        return productInfoList;
    }

    foreach (HtmlAgilityPack.HtmlNode item in collection)
    {
        // The product id lives on the container div; skip containers without a usable one.
        var itemIdAttribute = item.Attributes["itemid"];
        int spiderPid;
        if (itemIdAttribute == null || !int.TryParse(itemIdAttribute.Value, out spiderPid))
        {
            continue;
        }

        SpiderProductInfo spInfo = null;
        foreach (var citem in item.ChildNodes)
        {
            if (!string.Equals(citem.Name, "a", System.StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            // An anchor without href cannot lead to a detail page.
            var hrefAttribute = citem.Attributes["href"];
            if (hrefAttribute == null)
            {
                continue;
            }

            spInfo = new SpiderProductInfo();
            spInfo.SpiderPID = spiderPid;
            spInfo.Url = hrefAttribute.Value;

            // The product name is the alt text of the image inside the anchor.
            foreach (var ccItem in citem.ChildNodes)
            {
                if (ccItem.Name != "img")
                {
                    continue;
                }

                var altAttribute = ccItem.Attributes["alt"];
                if (altAttribute != null)
                {
                    spInfo.ProductName = altAttribute.Value;
                }
            }
        }

        if (spInfo != null)
        {
            productInfoList.Add(spInfo);
        }
    }

    return productInfoList;
}
/// <summary>
/// Creates a <c>ProductInfo</c> from the product's downloaded HTML, taking the price from
/// the image found under <c>div.p-price</c>.
/// </summary>
/// <param name="spiderProduct">
/// Product being processed; <c>HtmlSource</c>, <c>ProductId</c> and <c>Url</c> are carried
/// over to the result.
/// </param>
/// <returns>
/// A new <c>ProductInfo</c>; its <c>Price</c> stays 0 if there is no price image, the image
/// has no <c>src</c>, or the recognized text is not a valid decimal.
/// </returns>
public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
{
    var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(spiderProduct.HtmlSource);

    // The price is published as an image; the node is null when the page does not have one.
    var price = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='p-price']/img");

    decimal realPrice = 0;
    if (price != null)
    {
        var priceSource = price.Attributes["src"];
        if (priceSource != null && !string.IsNullOrEmpty(priceSource.Value))
        {
            // On a failed parse TryParse resets realPrice to 0, which is the fallback price.
            decimal.TryParse(ImageProcess.Recognize(priceSource.Value), out realPrice);
        }
    }

    // NOTE: the previous title / text-price extraction was dropped: its result was
    // discarded, and it raised ArgumentOutOfRangeException when the price text markers
    // were not present in the page script.
    return new ProductInfo()
    {
        Source = spiderProduct.HtmlSource,
        ProductId = spiderProduct.ProductId,
        Url = spiderProduct.Url,
        Price = realPrice
    };
}
/// <summary>
/// Adds a product to <c>SpiderUrlQueue</c>.
/// </summary>
/// <param name="spiderProduct">Product to enqueue.</param>
public void Enqueue(SpiderProductInfo spiderProduct)
{
    SpiderUrlQueue.Enqueue(spiderProduct);
}
/// <summary>
/// Forwards the given product to <c>SpiderUrlQueue.Enqueue</c>.
/// </summary>
/// <param name="spiderProduct">Product handed to the queue.</param>
public void Enqueue(SpiderProductInfo spiderProduct)
{
    SpiderUrlQueue.Enqueue(spiderProduct);
}
/// <summary>
/// Product-detail collection is not implemented in this class.
/// </summary>
/// <param name="spiderProduct">Unused.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
{
    throw new NotImplementedException();
}
/// <summary>
/// Placeholder implementation: performs no collection.
/// </summary>
/// <param name="spiderProduct">Unused.</param>
/// <returns>A newly constructed <c>ProductInfo</c> with no properties assigned here.</returns>
public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
{
    ProductInfo emptyProduct = new ProductInfo();
    return emptyProduct;
}
/// <summary>
/// Parses a product detail page, extracts the rows of its specification table(s) and
/// stores them through <c>SpiderBLL.Instance.InsertSpider</c>.
/// </summary>
/// <remarks>
/// The product name is re-read from the page's <c>h1</c> (text before the first nested tag)
/// and written back to <paramref name="spInfo"/>. One <c>SpiderProductInfo</c> is produced
/// per table row that has both a <c>th</c> (key) and a <c>td</c> (value). Nothing is stored
/// when the name block, the <c>h1</c>, or the specification table is missing.
/// </remarks>
/// <param name="detailResult">Download result whose <c>Html</c> holds the detail page markup.</param>
/// <param name="spInfo">Product being processed; supplies id and URL, receives the parsed name.</param>
private void AnalyticalDetailHtml(HttpResult detailResult, SpiderProductInfo spInfo)
{
    IList<SpiderProductInfo> spInfoList = new List<SpiderProductInfo>();

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(detailResult.Html);

    // The name block appears under one of two class variants.
    var nameCollection = doc.DocumentNode.SelectNodes("//div[@class='middle_property']");
    if (nameCollection == null)
    {
        nameCollection = doc.DocumentNode.SelectNodes("//div[@class='middle_property middle_propertyO2o']");
        if (nameCollection == null)
        {
            return;
        }
    }

    // The name indexer returns null when the block has no direct h1 child.
    HtmlAgilityPack.HtmlNode heading = nameCollection[0].ChildNodes["h1"];
    if (heading == null)
    {
        return;
    }

    spInfo.ProductName = heading.InnerHtml;
    // The h1 can embed other tags; keep only the text in front of the first one.
    if (spInfo.ProductName.IndexOf("<") != -1)
    {
        spInfo.ProductName = spInfo.ProductName.Split('<')[0];
    }

    var collection = doc.DocumentNode.SelectNodes("//table[@class='specificationBox']");
    if (collection == null)
    {
        return;
    }

    foreach (HtmlAgilityPack.HtmlNode tableItem in collection)
    {
        // Rows normally sit under tbody; fall back to the table itself when the markup
        // omits it (the parser does not synthesize a tbody element).
        HtmlAgilityPack.HtmlNode rowContainer = tableItem.ChildNodes["tbody"] ?? tableItem;

        foreach (var trItem in rowContainer.ChildNodes)
        {
            if (trItem.ChildNodes.Count == 1)
            {
                continue;
            }

            // Whitespace text nodes between rows, and rows that are not a th/td pair,
            // carry no key/value data.
            HtmlAgilityPack.HtmlNode keyCell = trItem.ChildNodes["th"];
            HtmlAgilityPack.HtmlNode valueCell = trItem.ChildNodes["td"];
            if (keyCell == null || valueCell == null)
            {
                continue;
            }

            SpiderProductInfo spInfoFlag = new SpiderProductInfo();
            spInfoFlag.SpiderPID = spInfo.SpiderPID;
            spInfoFlag.ProductName = spInfo.ProductName;
            spInfoFlag.Url = spInfo.Url;
            spInfoFlag.KeyName = keyCell.InnerText.Trim();
            spInfoFlag.KeyValue = valueCell.InnerHtml.Trim();
            spInfoList.Add(spInfoFlag);
        }
    }

    if (spInfoList.Count == 0)
    {
        return;
    }

    var oResult = SpiderBLL.Instance.InsertSpider(spInfoList);
    if (oResult.ResultType != OperationResultType.Success)
    {
        Log(string.Format("写入异常:数据ID {0},异常内容:{1}", spInfo.SpiderPID, oResult.Message));
    }
    else
    {
        Console.WriteLine(string.Format("数据ID:{0},名称:{1}, 已完成添加", spInfo.SpiderPID, spInfo.ProductName));
    }
}
/// <summary>
/// Inserts all given rows into the <c>Spider</c> table inside a single transaction.
/// </summary>
/// <remarks>
/// Either every row is committed or, on any failure, the transaction is rolled back while
/// the connection is still open. Connection, transaction and command are always disposed.
/// Exceptions are not propagated; they are reported through the returned result.
/// </remarks>
/// <param name="spInfolist">Rows to insert.</param>
/// <returns>
/// A result with <c>OperationResultType.Success</c> and <c>true</c> when all rows were
/// committed; otherwise <c>OperationResultType.Error</c>, the failure message and <c>false</c>.
/// </returns>
public OperationResult<bool> InsertSpider(IList<SpiderProductInfo> spInfolist)
{
    if (spInfolist == null)
    {
        return new OperationResult<bool>(OperationResultType.Error, "spInfolist is null.", false);
    }

    try
    {
        using (DbConnection con = dbw.CreateConnection())
        {
            con.Open();

            using (DbTransaction transcation = con.BeginTransaction())
            {
                try
                {
                    using (DbCommand command = con.CreateCommand())
                    {
                        var sql = string.Empty;
                        sql += " INSERT INTO Spider ";
                        sql += " (SpiderPID,ProductName,Url,KeyName,KeyValue ) ";
                        sql += " VALUES ";
                        sql += " (@SpiderPID,@ProductName,@Url,@KeyName,@KeyValue) ";

                        command.Connection = con;
                        command.Transaction = transcation;
                        command.CommandText = sql;

                        // Parameters are created once; only their values change per row.
                        DbParameter spiderPID = AddSpiderParameter(command, "SpiderPID", DbType.Int32);
                        DbParameter productName = AddSpiderParameter(command, "ProductName", DbType.String);
                        DbParameter url = AddSpiderParameter(command, "Url", DbType.String);
                        DbParameter keyName = AddSpiderParameter(command, "KeyName", DbType.String);
                        DbParameter keyValue = AddSpiderParameter(command, "KeyValue", DbType.String);

                        for (int i = 0; i < spInfolist.Count; i++)
                        {
                            SpiderProductInfo spInfo = spInfolist[i];

                            spiderPID.Value = spInfo.SpiderPID;
                            // A CLR null is not a valid parameter value; send SQL NULL instead.
                            productName.Value = (object)spInfo.ProductName ?? DBNull.Value;
                            url.Value = (object)spInfo.Url ?? DBNull.Value;
                            keyName.Value = (object)spInfo.KeyName ?? DBNull.Value;
                            keyValue.Value = (object)spInfo.KeyValue ?? DBNull.Value;

                            command.ExecuteNonQuery();
                        }
                    }

                    transcation.Commit();
                }
                catch (Exception)
                {
                    // Roll back here, before the connection is disposed. A failure of the
                    // rollback itself must not hide the error that caused it.
                    try
                    {
                        transcation.Rollback();
                    }
                    catch (Exception)
                    {
                        // Deliberately ignored: the original exception is rethrown below.
                    }

                    throw;
                }
            }
        }

        return new OperationResult<bool>(OperationResultType.Success, null, true);
    }
    catch (Exception e)
    {
        return new OperationResult<bool>(OperationResultType.Error, e.Message, false);
    }
}

/// <summary>
/// Creates a parameter with the given name and type and adds it to the command.
/// </summary>
private static DbParameter AddSpiderParameter(DbCommand command, string parameterName, DbType dbType)
{
    DbParameter parameter = command.CreateParameter();
    parameter.DbType = dbType;
    parameter.ParameterName = parameterName;
    command.Parameters.Add(parameter);
    return parameter;
}
/// <summary>
/// Not implemented: this class does not collect product details.
/// </summary>
/// <param name="spiderProduct">Unused.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
{
    throw new NotImplementedException();
}
/// <summary>
/// Stub implementation that does not inspect the product.
/// </summary>
/// <param name="spiderProduct">Unused.</param>
/// <returns>A freshly constructed <c>ProductInfo</c>; no properties are set by this method.</returns>
public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
{
    ProductInfo placeholder = new ProductInfo();
    return placeholder;
}