/// <summary>
/// Parses the product entries out of a listing page.
/// </summary>
/// <param name="htmlSource">The HTML source of the listing page.</param>
/// <param name="listOnly">
/// When <c>true</c>, only the list is parsed; the price update step (which is the
/// part that needs further network access) is skipped.
/// </param>
/// <returns>One result per <c>//ul/li</c> node from which a SKU could be extracted.</returns>
private IResut[] ParseCurrentItems(string htmlSource, bool listOnly = false)
{
    const string SkuIdKey = "ProductSku";

    var products = new List<IResut>();
    var pageNavigator = HtmlDocumentHelper.CreateNavigator(htmlSource);

    foreach (XPathNavigator productNode in pageNavigator.Select(@"//ul/li"))
    {
        // Preferred layout: name and link live under div.jDesc.
        var productName = HtmlDocumentHelper.GetNodeValue(productNode, ".//div[@class='jDesc']/a/text()");
        var productUrl = HtmlDocumentHelper.GetNodeValue(productNode, ".//div[@class='jDesc']//@href");
        if (string.IsNullOrEmpty(productName))
        {
            // Alternative layout: fall back to div.jTitle for both name and link.
            productName = HtmlDocumentHelper.GetNodeValue(productNode, ".//div[@class='jTitle']/a/text()");
            productUrl = HtmlDocumentHelper.GetNodeValue(productNode, ".//div[@class='jTitle']//@href");
        }

        var productImage = HtmlDocumentHelper.GetNodeValue(productNode, ".//div[@class='jPic']//@original");

        // The SKU is the run of digits between the last '/' and ".html" in the link.
        var skuMatch = Regex.Match(productUrl, @"(?<=/)\d+(?=\.html)");
        if (!skuMatch.Success || string.IsNullOrEmpty(skuMatch.Value))
        {
            // Entries without a recognisable SKU are not products; skip them.
            continue;
        }

        // Review data.
        var productComments = ParseComments(productNode);

        IResut product = new Resut();
        product[SkuIdKey] = skuMatch.Value;
        product["ShopId"] = ShopUrl;
        product["ProductName"] = productName;
        product["ProductUrl"] = productUrl;
        product["ProductImage"] = productImage;
        product["ProductComments"] = productComments;
        products.Add(product);
    }

    if (listOnly)
    {
        return products.ToArray();
    }

    this.UpdateResultsPrices(products, SkuIdKey);
    return products.ToArray();
}
/// <summary>
/// Parses the information of every product on the current page.
/// </summary>
/// <param name="htmlSource">The HTML source of the listing page.</param>
/// <param name="listOnly">
/// When <c>true</c>, the price update step is skipped and only the parsed list is returned.
/// </param>
/// <returns>One result per <c>//ul/li</c> node from which a SKU could be extracted.</returns>
private IResut[] ParseCurrentItems(string htmlSource, bool listOnly = false)
{
    const string SkuIdKey = "ProductSku";
    var resultList = new List<IResut>();

    // Build the XPath navigator used to query the page.
    var navigator = HtmlDocumentHelper.CreateNavigator(htmlSource);
    var iterator = navigator.Select(@"//ul/li");
    foreach (XPathNavigator item in iterator)
    {
        var title = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jDesc']/a/text()");
        var href = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jDesc']//@href");
        if (string.IsNullOrEmpty(title))
        {
            // Alternative layout: name and link live under div.jTitle.
            title = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jTitle']/a/text()");
            href = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jTitle']//@href");
        }

        var imgSrc = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jPic']//@original");
        if (string.IsNullOrEmpty(imgSrc))
        {
            // No "original" attribute: use the plain src attribute instead. The query
            // must be relative to the current item so that each product gets its own
            // image rather than the first image found anywhere in the page.
            imgSrc = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jPic']//@src");
        }

        // The SKU is the run of digits between a '/' and the literal ".html" suffix;
        // the dot is escaped so that it only matches a real dot.
        var skuMatchResults = Regex.Match(href, @"(?<=/)\d+(?=\.html)");
        var sku = skuMatchResults.Success ? skuMatchResults.Value : string.Empty;
        if (string.IsNullOrEmpty(sku))
        {
            // Entries without a recognisable SKU are skipped.
            continue;
        }

        // Review data.
        var comments = ParseComments(item);

        IResut resut = new Resut();
        resut[SkuIdKey] = sku;
        resut["ShopUrl"] = this.ShopUrl;
        resut["ProductName"] = title;
        resut["ProductUrl"] = href;
        resut["ProductImage"] = imgSrc;
        resut["ProductComments"] = comments;
        resultList.Add(resut);
    }

    if (!listOnly)
    {
        this.UpdateResultsPrices(resultList, SkuIdKey);
    }

    return resultList.ToArray();
}
/// <summary>
/// Builds the ajax URL used to request the search module.
/// </summary>
/// <param name="webContent">
/// Content of the web page. It must contain a <c>var params = {...}</c> assignment,
/// and its hidden inputs <c>shop_id</c> and <c>vender_id</c> are read as well.
/// </param>
/// <param name="renderStructure">
/// The render structure; the <c>m_render_*</c> values are looked up in it.
/// </param>
/// <returns>The base URL combined with the assembled query collection.</returns>
/// <exception cref="System.NotSupportedException">
/// The search parameters could not be parsed out of the page.
/// </exception>
private string BuildAjaxSearchUrl(string webContent, IDictionary<string, string> renderStructure)
{
    var paramsMatch = Regex.Match(webContent, "(?<=var params = ){[^}]+}");
    if (!paramsMatch.Success)
    {
        throw new NotSupportedException("无法从页面中解析出搜索参数");
    }

    // Example of the matched text:
    // {"appId":"435517","orderBy":"5","direction":"0","categoryId":"0","pageSize":"24","venderId":"1000004373","isGlobalSearch":"0","maxPrice":"0","pagePrototypeId":"17","pageNo":"1","shopId":"1000004373","minPrice":"0"}
    var pageParams = JObject.Parse(paramsMatch.Value);
    var pageNavigator = HtmlDocumentHelper.CreateNavigator(webContent);

    // Three value sources: the parsed params object, the render structure, and the
    // hidden <input> elements of the page.
    System.Func<string, string> fromParams =
        name => JsonHelper.TryReadJobjectValue(pageParams, name, (string)null);
    System.Func<string, string> fromRenderStructure = name =>
    {
        string found;
        renderStructure.TryGetValue(name, out found);
        return found;
    };
    System.Func<string, string> fromInput =
        name => HtmlDocumentHelper.GetNodeValue(pageNavigator, $@"//input[@id='{name}']/@value");

    var query = Url.CreateQueryCollection();
    query["appId"] = fromParams("appId");
    query["orderBy"] = "5";
    query["pageNo"] = "1";
    query["direction"] = "1";
    query["categoryId"] = fromParams("categoryId");
    query["pageSize"] = "24";
    query["pagePrototypeId"] = fromParams("pagePrototypeId");
    query["pageInstanceId"] = fromRenderStructure("m_render_pageInstance_id");
    query["moduleInstanceId"] = fromRenderStructure("m_render_instance_id");
    query["prototypeId"] = fromRenderStructure("m_render_prototype_id");
    query["templateId"] = fromRenderStructure("m_render_template_id");
    query["layoutInstanceId"] = fromRenderStructure("m_render_layout_instance_id");
    query["origin"] = fromRenderStructure("m_render_origin");
    query["shopId"] = fromInput("shop_id");
    query["venderId"] = fromInput("vender_id");

    // No "callback" parameter is sent (per the original author's note, leaving it
    // out makes the endpoint return a JSON structure directly).
    query["_"] = $"{JsCodeHelper.GetDateTime()}";

    // The endpoint depends on the "m_render_is_search" flag. This key is read
    // through the indexer, not through TryGetValue like the other m_render_* keys.
    string endpoint;
    if (renderStructure["m_render_is_search"] == "true")
    {
        endpoint = "http://module-jshop.jd.com/module/getModuleHtml.html";
    }
    else
    {
        endpoint = "http://mall.jd.com/view/getModuleHtml.html";
    }

    return Url.CombinUrl(endpoint, query);
}
/// <summary>
/// Handles the special case: builds an <c>advance_search</c> URL from the hidden
/// inputs of the shop page, stores it in <c>CurrentUrl</c> and downloads it.
/// </summary>
/// <param name="shopUrl">URL of the shop page to read the hidden inputs from.</param>
/// <returns>The content downloaded from the advance-search URL.</returns>
private string GetSpecialSearchPageContent(string shopUrl)
{
    var shopPageHtml = this.GetWebContent(shopUrl);

    // All three ids are stored as the value of a hidden <input> with a known id.
    System.Func<string, string> inputValue =
        id => HtmlDocumentHelper.GetNodeValue(shopPageHtml, $@"//input[@id='{id}']/@value");

    var appId = inputValue("pageInstance_appId");
    var venderId = inputValue("vender_id");
    var shopId = inputValue("shop_id");

    this.CurrentUrl = $"http://mall.jd.com/advance_search-{appId}-{venderId}-{shopId}-5-0-0-1-1-24.html";
    return this.GetWebContent(this.CurrentUrl);
}
/// <summary>
/// Builds the ajax URL used to request the search module.
/// </summary>
/// <param name="webContent">
/// Content of the web page. It must contain a <c>var params = {...}</c> assignment,
/// and its hidden inputs <c>shop_id</c> and <c>vender_id</c> are read as well.
/// </param>
/// <param name="renderStructure">
/// The render structure; the <c>m_render_*</c> values are looked up in it.
/// </param>
/// <returns>The base URL combined with the assembled query collection.</returns>
/// <exception cref="System.NotSupportedException">
/// The search parameters could not be parsed out of the page.
/// </exception>
private string BuildAjaxSearchUrl(string webContent, IDictionary<string, string> renderStructure)
{
    var matchResults = Regex.Match(webContent, "(?<=var params = ){[^}]+}");
    if (!matchResults.Success)
    {
        throw new NotSupportedException("无法从页面中解析出搜索参数");
    }

    var jObject = JObject.Parse(matchResults.Value);
    var navigator = HtmlDocumentHelper.CreateNavigator(webContent);

    // Three value sources: the parsed params object, the render structure, and the
    // hidden <input> elements of the page.
    System.Func<string, string> readJsonFunction =
        key => JsonHelper.TryReadJobjectValue(jObject, key, (string)null);
    System.Func<string, string> readHtmlFunc = key =>
    {
        string value;
        renderStructure.TryGetValue(key, out value);
        return value;
    };
    System.Func<string, string> readInputFunc =
        key => HtmlDocumentHelper.GetNodeValue(navigator, $@"//input[@id='{key}']/@value");

    var collection = Url.CreateQueryCollection();
    collection[@"appId"] = readJsonFunction("appId");
    collection[@"orderBy"] = "5";
    collection[@"pageNo"] = "1";
    collection[@"direction"] = "1";
    collection[@"categoryId"] = readJsonFunction(@"categoryId");
    collection[@"pageSize"] = @"24";
    collection[@"pagePrototypeId"] = readJsonFunction(@"pagePrototypeId");
    collection[@"pageInstanceId"] = readHtmlFunc(@"m_render_pageInstance_id");
    collection[@"moduleInstanceId"] = readHtmlFunc("m_render_instance_id");
    collection[@"prototypeId"] = readHtmlFunc(@"m_render_prototype_id");
    collection[@"templateId"] = readHtmlFunc(@"m_render_template_id");
    collection[@"layoutInstanceId"] = readHtmlFunc(@"m_render_layout_instance_id");
    collection[@"origin"] = readHtmlFunc(@"m_render_origin");

    // The shop id is the value of the hidden <input id="shop_id"> element;
    // "m_render_origin" is a render-structure key, not an input id.
    collection[@"shopId"] = readInputFunc(@"shop_id");

    // The query parameter is spelled "venderId", matching the "vender_id" input.
    collection[@"venderId"] = readInputFunc(@"vender_id");

    collection[@"_"] = $"{JsCodeHelper.GetDateTime()}";

    // The endpoint depends on the "m_render_is_search" flag. This key is read
    // through the indexer, not through TryGetValue like the other m_render_* keys.
    var baseUrl = renderStructure[@"m_render_is_search"] == "true"
        ? @"http://module-jshop.jd.com/module/getModuleHtml.html"
        : @"http://mall.jd.com/view/getModuleHtml.html";

    return Url.CombinUrl(baseUrl, collection);
}
/// <summary>
/// Parses the number of reviews out of a product node.
/// </summary>
/// <param name="item">The product node to inspect.</param>
/// <returns>
/// The digits found in front of "人评价" in the node text; otherwise the digits inside
/// parentheses in the <c>span.evaluate</c> value; otherwise <c>"-1"</c>.
/// </returns>
private static string ParseComments(XPathNavigator item)
{
    var countBeforeLabel = Regex.Match(item.Value, @"\d+(?= *人评价)");
    if (!countBeforeLabel.Success)
    {
        // Second layout; the original author noted that no live example of it
        // had been found yet.
        var evaluateText = HtmlDocumentHelper.GetNodeValue(item, ".//span[@class='evaluate']");
        var countInParentheses = Regex.Match(evaluateText, @"(?<=\()\d+(?=\))");
        return countInParentheses.Success ? countInParentheses.Value : "-1";
    }

    return countBeforeLabel.Value;
}
/// <summary>
/// Returns the content of the shop's search page.
/// </summary>
/// <param name="shopUrl">URL of the shop page that carries the page app id.</param>
/// <returns>The content downloaded from the <c>view_search</c> URL.</returns>
private string GetSearchPageContent(string shopUrl)
{
    var shopPageHtml = this.GetWebContent(shopUrl);

    // Take the value of <input type="hidden" value="504028" id="pageInstance_appId"/>.
    var appId = HtmlDocumentHelper.GetNodeValue(shopPageHtml, @"//input[@id='pageInstance_appId']/@value");

    // URL layout (from the original author's notes):
    //   view_search-{shop page id}-0-{sort type}-{sort direction}-{page size}-{page number}.html
    //   sort type:      5 = sales, 4 = price, 3 = favourites, 2 = time
    //   page size:      24 at most
    //   sort direction: 1 = descending, 0 = ascending
    //   page number:    starts at 1
    // Example: http://mall.jd.com//view_search-337310-0-5-0-24-5.html
    return this.GetWebContent($"http://mall.jd.com/view_search-{appId}-0-5-0-24-1.html");
}
/// <summary>
/// Reads the <c>value</c> attribute of the <c>input</c> element with the given id.
/// </summary>
/// <param name="navigator">Navigator to run the XPath query against.</param>
/// <param name="key">The id of the input element.</param>
/// <returns>Whatever <c>HtmlDocumentHelper.GetNodeValue</c> yields for that query.</returns>
public string ReadInputFunc(XPathNavigator navigator, string key)
{
    var valueQuery = $@"//input[@id='{key}']/@value";
    var inputValue = HtmlDocumentHelper.GetNodeValue(navigator, valueQuery);
    return inputValue;
}