/// <summary>
/// Gathers the product-introduction detail entries of the goods returned by
/// <c>test.test1()</c>.
/// </summary>
/// <returns>
/// The inner text of every <c>li</c> below the element with id <c>parameter2</c>,
/// collected from at most the first three product pages.
/// </returns>
public static HashSet<string> test5()
{
    HashSet<string> productUrls = test.test1();
    HashSet<string> details = new HashSet<string>();
    int visited = 0;

    foreach (string productUrl in productUrls)
    {
        // The scheme is prepended to each collected link before it is downloaded.
        string html = SpiderUtils.download("https:" + productUrl);
        HtmlDocument page = new HtmlDocument();
        page.LoadHtml(html);

        HtmlNodeCollection items = page.DocumentNode.SelectNodes(".//*[@id='parameter2']/li");
        if (items != null)
        {
            foreach (HtmlNode item in items)
            {
                details.Add(item.InnerText);
            }
        }

        // Stop as soon as the third page has been processed.
        if (++visited > 2)
        {
            break;
        }
    }

    return details;
}
/// <summary>
/// Gathers the product names of the goods returned by <c>test.test1()</c>.
/// </summary>
/// <returns>
/// The inner text of the <c>h1</c> below the element with id <c>name</c>, taken from
/// at most the first seventeen product pages; pages without that node contribute nothing.
/// </returns>
public static HashSet<string> test2()
{
    const int maxPages = 17;

    HashSet<string> productUrls = test.test1();
    HashSet<string> names = new HashSet<string>();
    int visited = 0;

    foreach (string productUrl in productUrls)
    {
        string html = SpiderUtils.download("https:" + productUrl);

        // HtmlAgilityPack parses the downloaded markup so it can be queried with XPath.
        HtmlDocument page = new HtmlDocument();
        page.LoadHtml(html);

        // Node holding the product name.
        HtmlNode nameNode = page.DocumentNode.SelectSingleNode(".//*[@id='name']/h1");
        if (nameNode != null)
        {
            names.Add(nameNode.InnerText);
        }

        visited++;
        if (visited >= maxPages)
        {
            break;
        }
    }

    return names;
}
/// <summary>
/// Downloads the sale page and extracts the product-page links it contains.
/// </summary>
/// <returns>
/// The distinct links of the form <c>//item.jd.com/&lt;digits&gt;.html</c> found in the
/// page (without a scheme); an empty set when nothing was downloaded.
/// </returns>
public static HashSet<string> test1()
{
    HashSet<string> links = new HashSet<string>();

    string content = SpiderUtils.download("https://sale.jd.com/act/6hd0T3HtkcEmqjpM.html");
    if (string.IsNullOrEmpty(content))
    {
        // Regex.Matches throws on null input; with nothing downloaded there are no links.
        return links;
    }

    // The dots are escaped so they match a literal '.' only; unescaped they would
    // match any character (e.g. "//itemXjd.com/1Yhtml").
    Regex linkPattern = new Regex(@"//item\.jd\.com/([0-9]+)\.html", RegexOptions.Compiled);
    foreach (Match match in linkPattern.Matches(content))
    {
        links.Add(match.Value);
    }

    return links;
}
/// <summary>
/// Processes one product page: downloads <c>url</c>, parses the markup, resolves the
/// goods id and then runs the four insert steps for that page.
/// </summary>
/// <remarks>
/// All per-page work is kept in this single method so that every page can be crawled
/// on its own thread.
/// </remarks>
public void run()
{
    // Fetch the response for this instance's URL.
    string html = SpiderUtils.download(url);

    // HtmlAgilityPack parses the downloaded markup.
    HtmlDocument page = new HtmlDocument();
    page.LoadHtml(html);

    string id = this.goodsId(page);

    // NOTE: the id is not checked for being empty here (an earlier guard is disabled),
    // so all four steps run unconditionally.
    insertGoodInfo(page);
    insertComments(id);
    insertHotComment(id);
    insertCommentSummary(id);
}
/// <summary>
/// Looks up the price of every goods id returned by <c>test3()</c>.
/// </summary>
/// <returns>
/// The distinct values of the <c>p</c> field of the first element of each price response.
/// </returns>
public static HashSet<string> test6()
{
    HashSet<string> ids = test3();
    HashSet<string> prices = new HashSet<string>();

    foreach (string id in ids)
    {
        string priceUrl = "https://p.3.cn/prices/get?type=1&area=1_72_2799&pdtk=&pduid=1340247559&pdpin=&pdbp=0&skuid=J_" + id;
        string json = SpiderUtils.download(priceUrl);

        // The response is deserialized as a JSON array; its first element carries the price.
        JArray entries = (JArray)JsonConvert.DeserializeObject(json);
        JObject first = (JObject)entries[0];
        prices.Add(first["p"].ToString());
    }

    return prices;
}
/// <summary>
/// Fetches the price of the product shown in <paramref name="htmlDoc"/>.
/// </summary>
/// <param name="htmlDoc">Parsed product page; its goods id is resolved via <c>goodsId</c>.</param>
/// <returns>
/// The <c>p</c> field of the first element of the price response, or the literal
/// string <c>"null"</c> when the goods id is empty.
/// </returns>
private string goodsPrice(HtmlDocument htmlDoc)
{
    string id = goodsId(htmlDoc);

    // Without an id there is nothing to query; report the placeholder value.
    if (id.Length == 0)
    {
        return "null";
    }

    string priceUrl = "https://p.3.cn/prices/get?type=1&area=1_72_2799&pdtk=&pduid=1481166803710777284406&pdpin=&pdbp=0&skuid=J_" + id;
    string json = SpiderUtils.download(priceUrl);

    // The response is deserialized as a JSON array; its first element carries the price.
    JArray entries = (JArray)JsonConvert.DeserializeObject(json);
    JObject first = (JObject)entries[0];
    return first["p"].ToString();
}
/// <summary>
/// Downloads the first comment page of a product and stores its
/// <c>productCommentSummary</c> figures in the <c>CommentSummary</c> table.
/// </summary>
/// <param name="goodsId">Goods id used both in the request and as the stored <c>goodId</c>.</param>
private void insertCommentSummary(string goodsId)
{
    // Goods id 2879902 is requested and stored under id 3752769 instead.
    if (goodsId == "2879902")
    {
        goodsId = "3752769";
    }

    string url = "https://sclub.jd.com/comment/productPageComments.action?productId=" + goodsId + "&score=0&sortType=3&page=0&pageSize=10&isShadowSku=0";
    string content = SpiderUtils.download(url);
    JObject jsonObject = (JObject)JsonConvert.DeserializeObject(content);
    JToken summary = jsonObject["productCommentSummary"];

    // Column values, in the order of the column list below.
    string[] values =
    {
        goodsId,                                // goods id
        summary["productId"].ToString(),        // product id
        url,                                    // URL the summary was read from
        summary["commentCount"].ToString(),     // number of comments
        summary["goodRateShow"].ToString(),     // positive rating share
        summary["generalRateShow"].ToString(),  // neutral rating share
        summary["poorRateShow"].ToString(),     // negative rating share
        summary["goodCount"].ToString(),        // positive comment count
        summary["generalCount"].ToString(),     // neutral comment count
        summary["poorCount"].ToString()         // negative comment count
    };

    // The values come from a remote response, so embedded single quotes are doubled
    // before the values are placed inside SQL string literals; otherwise an apostrophe
    // would break (or alter) the statement.
    // NOTE(review): switch to a parameterized command if DBHelper offers one.
    for (int i = 0; i < values.Length; i++)
    {
        values[i] = "'" + (values[i] ?? string.Empty).Replace("'", "''") + "'";
    }

    string sSql = "insert into CommentSummary(goodId, productId, commentUrl, commentCount, goodRateShow, generalRateShow, poorRateShow, goodCount, generalCount, poorCount) values ("
                  + string.Join(",", values) + ")";

    DBHelper helper = new DBHelper("JD_Online_Shop");
    helper.Update(sSql);
}
/// <summary>
/// Gathers the product image URLs of the goods returned by <c>test.test1()</c>.
/// </summary>
/// <returns>
/// The <c>src</c> attribute (empty string when absent) of the <c>img</c> below the
/// element with id <c>spec-n1</c>, taken from at most the first seventeen product pages.
/// </returns>
public static HashSet<string> test4()
{
    const int maxPages = 17;

    HashSet<string> productUrls = test.test1();
    HashSet<string> imageUrls = new HashSet<string>();
    int visited = 0;

    foreach (string productUrl in productUrls)
    {
        string html = SpiderUtils.download("https:" + productUrl);
        HtmlDocument page = new HtmlDocument();
        page.LoadHtml(html);

        HtmlNode imageNode = page.DocumentNode.SelectSingleNode("//*[@id='spec-n1']/img");
        if (imageNode != null)
        {
            string source = imageNode.GetAttributeValue("src", "");
            imageUrls.Add(source);
        }

        visited++;
        if (visited >= maxPages)
        {
            break;
        }
    }

    return imageUrls;
}
/// <summary>
/// Gathers the goods ids of the products returned by <c>test.test1()</c>.
/// </summary>
/// <returns>
/// For at most the first thirty-two product pages, the text of the second <c>li</c>
/// below the element with id <c>parameter2</c> with its first five characters removed.
/// Pages where that node is missing, or where nothing follows the five-character
/// prefix, contribute nothing.
/// </returns>
public static HashSet<string> test3()
{
    // Number of leading characters stripped from the node text before the id.
    // NOTE(review): presumably the length of a label such as "商品编号：" — confirm.
    const int prefixLength = 5;

    HashSet<string> hashSet = test.test1();
    HashSet<string> data = new HashSet<string>();
    int count = 0;

    foreach (string goodURL in hashSet)
    {
        string content = SpiderUtils.download("https:" + goodURL);
        HtmlDocument htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(content);

        HtmlNode Hnode = htmlDoc.DocumentNode.SelectSingleNode(".//*[@id='parameter2']/li[2]");
        if (Hnode != null)
        {
            string text = Hnode.InnerText;

            // Substring would throw on text shorter than the prefix, and text of exactly
            // the prefix length would yield an empty id; skip both cases.
            if (text.Length > prefixLength)
            {
                data.Add(text.Substring(prefixLength));
            }
        }

        // Stop once the thirty-second page has been processed.
        if (count++ > 30)
        {
            return data;
        }
    }

    return data;
}
/// <summary>
/// Downloads the first comment page of a product and stores every entry of its
/// <c>hotCommentTagStatistics</c> array in the <c>hotComment</c> table.
/// </summary>
/// <param name="goodsId">Goods id used both in the request and as the stored <c>goodId</c>.</param>
private void insertHotComment(string goodsId)
{
    // Goods id 2879902 is requested and stored under id 3752769 instead.
    if (goodsId == "2879902")
    {
        goodsId = "3752769";
    }

    string url = "https://sclub.jd.com/comment/productPageComments.action?productId=" + goodsId + "&score=0&sortType=3&page=0&pageSize=10&isShadowSku=0";
    string content = SpiderUtils.download(url);
    JObject jsonObj = (JObject)JsonConvert.DeserializeObject(content);
    string val = jsonObj.GetValue("hotCommentTagStatistics").ToString();
    JArray jsonArray = JArray.Parse(val);

    for (int i = 0; i < jsonArray.Count; i++)
    {
        JObject tag = (JObject)jsonArray[i];

        // Column values, in the order of the column list below.
        string[] values =
        {
            goodsId,                                // goods id
            tag.GetValue("productId").ToString(),   // product id
            tag.GetValue("id").ToString(),          // id of this user-impression tag
            tag.GetValue("count").ToString(),       // count reported for this tag
            tag.GetValue("status").ToString(),      // status of this tag
            tag.GetValue("rid").ToString(),         // rid
            tag.GetValue("name").ToString(),        // tag text (e.g. "fits well", "responsive")
            tag.GetValue("modified").ToString()     // time the tag was last updated
        };

        // The values come from a remote response, so embedded single quotes are doubled
        // before the values are placed inside SQL string literals; otherwise an
        // apostrophe in a tag name would break (or alter) the statement.
        // NOTE(review): switch to a parameterized command if DBHelper offers one.
        for (int k = 0; k < values.Length; k++)
        {
            values[k] = "'" + (values[k] ?? string.Empty).Replace("'", "''") + "'";
        }

        string sSql = "insert into hotComment(goodId, productId, tagId, count, status, rid, name, modified) values ("
                      + string.Join(",", values) + ")";

        DBHelper helper = new DBHelper("JD_Online_Shop");
        helper.Update(sSql);
    }
}
/// <summary>
/// Product comments: downloads the comment pages of a product and stores every entry
/// of their <c>comments</c> arrays in the <c>Comment</c> table.
/// </summary>
/// <param name="goodsId">Goods id used both in the request and as the stored <c>goodId</c>.</param>
private void insertComments(string goodsId)
{
    // Goods id 2879902 is requested and stored under id 3752769 instead.
    if (goodsId == "2879902")
    {
        goodsId = "3752769";
    }

    // The total page count could be derived from productCommentSummary.commentCount
    // (commentCount / 10 + 1), but products have too many comments, so only one page
    // of ten comments is fetched.
    int pageCount = 1;

    for (int j = 0; j < pageCount; j++)
    {
        string urlStr = "https://sclub.jd.com/comment/productPageComments.action?productId=" + goodsId + "&score=0&sortType=3&page=" + j + "&pageSize=10&isShadowSku=0";
        string contentStr = SpiderUtils.download(urlStr);
        JObject jsonObj = (JObject)JsonConvert.DeserializeObject(contentStr);
        string val = jsonObj.GetValue("comments").ToString();
        JArray jsonArray = JArray.Parse(val);

        for (int i = 0; i < jsonArray.Count; i++)
        {
            JObject comment = (JObject)jsonArray[i];

            // Column values, in the order of the column list below.
            string[] values =
            {
                goodsId,                                        // goods id
                comment.GetValue("guid").ToString(),            // guid
                comment.GetValue("content").ToString(),         // comment text
                comment.GetValue("creationTime").ToString(),    // time the comment was created
                comment.GetValue("isTop").ToString(),           // isTop
                comment.GetValue("referenceImage").ToString(),  // reference image URL
                comment.GetValue("referenceName").ToString(),   // reference name
                comment.GetValue("referenceTime").ToString(),   // reference creation date
                comment.GetValue("referenceType").ToString(),   // comment type
                comment.GetValue("referenceTypeId").ToString(), // comment type id
                comment.GetValue("firstCategory").ToString(),
                comment.GetValue("secondCategory").ToString(),
                comment.GetValue("thirdCategory").ToString()
            };

            // Comment text is free-form user input, so embedded single quotes are doubled
            // before the values are placed inside SQL string literals; otherwise an
            // apostrophe in a comment would break (or alter) the statement.
            // NOTE(review): switch to a parameterized command if DBHelper offers one.
            for (int k = 0; k < values.Length; k++)
            {
                values[k] = "'" + (values[k] ?? string.Empty).Replace("'", "''") + "'";
            }

            string sSql = "insert into Comment(goodId, guid, content, creationTime, isTop, referenceImage, referenceName, "
                          + "referenceTime,referenceType ,referenceTypeId ,firstCategory ,secondCategory ,thirdCategory) values ("
                          + string.Join(",", values) + ")";

            DBHelper helper = new DBHelper("JD_Online_Shop");
            helper.Update(sSql);
        }
    }
}