/// <summary>
/// Reads the total shown in the pagination bar of a shop search page.
/// </summary>
/// <remarks>
/// The number is taken from the <c>em</c> element directly under <c>div.jPage</c>, whose text
/// has the form <c>共104条记录</c>.
/// NOTE(review): that text reads as a record count rather than a page count, despite the
/// method name - confirm how callers interpret the value.
/// </remarks>
/// <param name="content">HTML of the page; may be <c>null</c>.</param>
/// <returns>
/// The parsed number, or <c>-1</c> when <paramref name="content"/> is <c>null</c>, the
/// pagination element is missing, or its text holds no number that fits in an <see cref="int"/>.
/// </returns>
private int ParseAmountPage(string content)
{
    if (content == null)
    {
        return -1;
    }

    var navigator = HtmlDocumentHelper.CreateNavigator(content);
    var node = navigator.SelectSingleNode(@"//div[@class='jPage']/em");
    if (node == null)
    {
        return -1;
    }

    /*
     * Sample of the markup this method reads:
     * <div class="jPage">
     *   <em>共104条记录</em>
     *   <span>上一页</span>
     *   <a class="current">1</a>
     *   <a href="//mall.jd.com/view_search-504028-1000007084-1000007084-0-5-0-0-1-2-24.html?isGlobalSearch=0">2</a>
     *   <a href="//mall.jd.com/view_search-504028-1000007084-1000007084-0-5-0-0-1-3-24.html?isGlobalSearch=0">3</a>
     *   <span>...</span>
     *   <a href="//mall.jd.com/view_search-504028-1000007084-1000007084-0-5-0-0-1-5-24.html?isGlobalSearch=0">5</a>
     *   <a href="//mall.jd.com/view_search-504028-1000007084-1000007084-0-5-0-0-1-2-24.html?isGlobalSearch=0">下一页</a>
     * </div>
     */
    var matchResults = Regex.Match(node.Value, @"(?<=共)\d+(?=条记录)");
    if (!matchResults.Success)
    {
        return -1;
    }

    // TryParse rather than Parse: \d also matches non-ASCII digits and the digit run is
    // unbounded, so Parse could throw FormatException/OverflowException on odd page content.
    int total;
    return int.TryParse(matchResults.Value, out total) ? total : -1;
}
/// <summary>
/// Heuristically guesses whether the given HTML is a product listing page by counting
/// links that point at JD item pages.
/// </summary>
/// <param name="webContent">HTML to inspect.</param>
/// <returns>
/// <c>true</c> as soon as eight anchors whose <c>href</c> starts with
/// <c>//item.jd.com/{digits}.html</c> (case-insensitive) have been seen; otherwise
/// <c>false</c>, including when <paramref name="webContent"/> is null or empty.
/// </returns>
private bool GuessIsSearchWebContent(string webContent)
{
    if (string.IsNullOrEmpty(webContent))
    {
        return false;
    }

    var productLinkCount = 0;
    var anchors = HtmlDocumentHelper.CreateNavigator(webContent).Select(@"//a");

    foreach (XPathNavigator anchor in anchors)
    {
        var link = anchor.GetAttribute(@"href", string.Empty);
        if (link == null || !Regex.IsMatch(link, @"^//item\.jd\.com/\d+\.html", RegexOptions.IgnoreCase))
        {
            continue;
        }

        // Stop scanning once enough product links have been found.
        productLinkCount++;
        if (productLinkCount >= 8)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Extracts the products listed in the given page.
/// </summary>
/// <param name="htmlSource">The HTML source.</param>
/// <param name="listOnly">
/// When <c>true</c>, only the list is parsed and the price update step is skipped
/// (per the original note, prices and similar data require further network access).
/// </param>
/// <returns>One result per <c>ul/li</c> element from whose link a SKU could be extracted.</returns>
private IResut[] ParseCurrentItems(string htmlSource, bool listOnly = false)
{
    const string SkuIdKey = "ProductSku";

    var products = new List<IResut>();
    var listItems = HtmlDocumentHelper.CreateNavigator(htmlSource).Select(@"//ul/li");

    foreach (XPathNavigator listItem in listItems)
    {
        // Name and link are looked up under div.jDesc first ...
        var productName = HtmlDocumentHelper.GetNodeValue(listItem, ".//div[@class='jDesc']/a/text()");
        var productUrl = HtmlDocumentHelper.GetNodeValue(listItem, ".//div[@class='jDesc']//@href");
        if (string.IsNullOrEmpty(productName))
        {
            // ... and under div.jTitle when no name was found there.
            productName = HtmlDocumentHelper.GetNodeValue(listItem, ".//div[@class='jTitle']/a/text()");
            productUrl = HtmlDocumentHelper.GetNodeValue(listItem, ".//div[@class='jTitle']//@href");
        }

        var imageUrl = HtmlDocumentHelper.GetNodeValue(listItem, ".//div[@class='jPic']//@original");

        // The SKU is the digit run between the last '/' and ".html" in the product link.
        var skuMatch = Regex.Match(productUrl, @"(?<=/)\d+(?=\.html)");
        if (!skuMatch.Success)
        {
            continue;
        }

        // Review/comment data for this item.
        var productComments = ParseComments(listItem);

        IResut product = new Resut();
        product[SkuIdKey] = skuMatch.Value;
        product["ShopId"] = ShopUrl;
        product["ProductName"] = productName;
        product["ProductUrl"] = productUrl;
        product["ProductImage"] = imageUrl;
        product["ProductComments"] = productComments;
        products.Add(product);
    }

    if (listOnly)
    {
        return products.ToArray();
    }

    this.UpdateResultsPrices(products, SkuIdKey);
    return products.ToArray();
}
/// <summary>
/// Parses the information of every product on the current page.
/// </summary>
/// <param name="htmlSource">HTML of the page.</param>
/// <param name="listOnly">When <c>true</c>, the price update step is skipped.</param>
/// <returns>One result per <c>ul/li</c> element from whose link a SKU could be extracted.</returns>
private IResut[] ParseCurrentItems(string htmlSource, bool listOnly = false)
{
    const string SkuIdKey = "ProductSku";

    var resultList = new List<IResut>();

    // XPath navigator over the whole page.
    var navigator = HtmlDocumentHelper.CreateNavigator(htmlSource);
    var iterator = navigator.Select(@"//ul/li");

    foreach (XPathNavigator item in iterator)
    {
        var title = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jDesc']/a/text()");
        var href = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jDesc']//@href");
        if (string.IsNullOrEmpty(title))
        {
            // No name under div.jDesc: fall back to div.jTitle.
            title = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jTitle']/a/text()");
            href = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jTitle']//@href");
        }

        var imgSrc = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jPic']//@original");
        if (string.IsNullOrEmpty(imgSrc))
        {
            // Fall back to the plain src attribute of THIS item. The lookup previously ran
            // against the whole page source, so it was not scoped to the current list item.
            imgSrc = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jPic']//@src");
        }

        // SKU = digit run directly before ".html" (dot escaped so only a literal '.' matches).
        var skuMatchResults = Regex.Match(href, @"(?<=/)\d+(?=\.html)");
        if (!skuMatchResults.Success)
        {
            continue;
        }

        var sku = skuMatchResults.Value;

        // Review/comment data for this item.
        var comments = ParseComments(item);

        IResut resut = new Resut();
        resut[SkuIdKey] = sku;
        resut["ShopUrl"] = this.ShopUrl;
        resut["ProductName"] = title;
        resut["ProductUrl"] = href;
        resut["ProductImage"] = imgSrc;
        resut["ProductComments"] = comments;
        resultList.Add(resut);
    }

    if (!listOnly)
    {
        this.UpdateResultsPrices(resultList, SkuIdKey);
    }

    return resultList.ToArray();
}
/// <summary>
/// Entry point for parsing a page. Currently unfinished: after handing a fresh result and
/// the page navigator to <c>ParseShopScoreResult</c>, the method always throws.
/// </summary>
/// <param name="webContent">Content of the web page.</param>
/// <returns>Nothing at present; the method never returns normally.</returns>
/// <exception cref="NotImplementedException">Always thrown at the end of the method.</exception>
public IResut Parse(string webContent)
{
    IResut shopResult = new Resut();
    var pageNavigator = HtmlDocumentHelper.CreateNavigator(webContent);

    ParseShopScoreResult(shopResult, pageNavigator);

    throw new NotImplementedException();
}
/*
 * Disabled helper, kept for reference:
 *
 * /// <summary>
 * /// Returns the list of shop ids of the self-operated shops.
 * /// </summary>
 * /// <param name="shopDictionary">The shop dictionary.</param>
 * /// <returns></returns>
 * private string[] GetSelfSupportShopIds(IDictionary<string, IResut> shopDictionary)
 * {
 *     var shopIdList = new List<string>();
 *
 *     foreach (var shopItem in shopDictionary)
 *     {
 *         var shopId = shopItem.Key;
 *         if (shopItem.Value.GetStringValue(@"vender_type") == "0")
 *         {
 *             shopIdList.Add(shopId);
 *         }
 *     }
 *
 *     return shopIdList.ToArray();
 * }
 */

/// <summary>
/// Collects the values of all nodes selected by an XPath expression and passes them
/// through <c>DictionaryHelper.Distinct</c>.
/// </summary>
/// <param name="htmlSource">The HTML source to query.</param>
/// <param name="xpath">The XPath expression selecting the id nodes.</param>
/// <returns>The result of <c>DictionaryHelper.Distinct</c> over the selected node values.</returns>
private string[] ParseIdFromXpath(string htmlSource, string xpath)
{
    var selectedNodes = HtmlDocumentHelper.CopyNodeToArray(
        HtmlDocumentHelper.CreateNavigator(htmlSource).Select(xpath));

    var nodeValues = Array.ConvertAll(selectedNodes, node => node.Value);

    return DictionaryHelper.Distinct(nodeValues);
}
/// <summary>
/// Builds the AJAX URL used to request the shop's search/module HTML.
/// </summary>
/// <remarks>
/// Parameters come from three places: the <c>var params = {...}</c> JSON embedded in the page,
/// the <paramref name="renderStructure"/> dictionary, and the <c>shop_id</c>/<c>vender_id</c>
/// keys passed to <c>ReadInputFunc</c>. Sorting, paging and page size are fixed
/// (<c>orderBy=5</c>, <c>pageNo=1</c>, <c>direction=1</c>, <c>pageSize=24</c>).
/// </remarks>
/// <param name="webContent">Content of the web page.</param>
/// <param name="renderStructure">
/// The render structure values; must contain the <c>m_render_is_search</c> key.
/// </param>
/// <returns>The base URL combined with the query collection.</returns>
/// <exception cref="NotSupportedException">
/// The page does not contain a <c>var params = {...}</c> block.
/// </exception>
/// <exception cref="KeyNotFoundException">
/// <paramref name="renderStructure"/> has no <c>m_render_is_search</c> entry.
/// </exception>
private string BuildAjaxSearchUrl(string webContent, IDictionary<string, string> renderStructure)
{
    var matchResults = Regex.Match(webContent, "(?<=var params = ){[^}]+}");
    if (!matchResults.Success)
    {
        throw new NotSupportedException("无法从页面中解析出搜索参数");
    }

    var jObject = JObject.Parse(matchResults.Value);
    var navigator = HtmlDocumentHelper.CreateNavigator(webContent);

    var collection = Url.CreateQueryCollection();
    collection[@"appId"] = ReadJsonFunc(jObject, "appId");
    collection[@"orderBy"] = "5";
    collection[@"pageNo"] = "1";
    collection[@"direction"] = "1";
    collection[@"categoryId"] = ReadJsonFunc(jObject, @"categoryId");
    collection[@"pageSize"] = @"24";
    collection[@"pagePrototypeId"] = ReadJsonFunc(jObject, @"pagePrototypeId");
    collection[@"pageInstanceId"] = ReadHtmlFunc(renderStructure, @"m_render_pageInstance_id");
    collection[@"moduleInstanceId"] = ReadHtmlFunc(renderStructure, "m_render_instance_id");
    collection[@"prototypeId"] = ReadHtmlFunc(renderStructure, @"m_render_prototype_id");
    collection[@"templateId"] = ReadHtmlFunc(renderStructure, @"m_render_template_id");
    collection[@"layoutInstanceId"] = ReadHtmlFunc(renderStructure, @"m_render_layout_instance_id");
    collection[@"origin"] = ReadHtmlFunc(renderStructure, @"m_render_origin");
    collection[@"shopId"] = ReadInputFunc(navigator, @"shop_id");

    // The key was previously misspelled "verderId", so the vendor id was sent under a wrong name.
    collection[@"venderId"] = ReadInputFunc(navigator, @"vender_id");

    // "_" carries the value of JsCodeHelper.GetDateTime() formatted as a string.
    collection[@"_"] = $"{JsCodeHelper.GetDateTime()}";

    var baseUrl = renderStructure[@"m_render_is_search"] == "true"
        ? @"http://module-jshop.jd.com/module/getModuleHtml.html"
        : @"http://mall.jd.com/view/getModuleHtml.html";

    return Url.CombinUrl(baseUrl, collection);
}
/// <summary>
/// Builds the search AJAX URL.
/// </summary>
/// <param name="webContent">Content of the web page.</param>
/// <param name="renderStructure">The render structure.</param>
/// <returns>The base URL combined with the assembled query collection.</returns>
/// <exception cref="System.NotSupportedException">
/// The search parameters (<c>var params = {...}</c>) cannot be found in the page.
/// </exception>
private string BuildAjaxSearchUrl(string webContent, IDictionary<string, string> renderStructure)
{
    var paramsMatch = Regex.Match(webContent, "(?<=var params = ){[^}]+}");
    if (!paramsMatch.Success)
    {
        throw new NotSupportedException("无法从页面中解析出搜索参数");
    }

    // Example of the matched text:
    // {"appId":"435517","orderBy":"5","direction":"0","categoryId":"0","pageSize":"24","venderId":"1000004373","isGlobalSearch":"0","maxPrice":"0","pagePrototypeId":"17","pageNo":"1","shopId":"1000004373","minPrice":"0"}
    var pageParams = JObject.Parse(paramsMatch.Value);
    var pageNavigator = HtmlDocumentHelper.CreateNavigator(webContent);

    // Value from the page's params JSON.
    System.Func<string, string> fromJson =
        key => JsonHelper.TryReadJobjectValue(pageParams, key, (string)null);

    // Value from the render structure; whatever TryGetValue leaves in the out variable is returned.
    System.Func<string, string> fromRenderStructure = key =>
    {
        string found;
        renderStructure.TryGetValue(key, out found);
        return found;
    };

    // Value attribute of the input element with the given id.
    System.Func<string, string> fromInput =
        key => HtmlDocumentHelper.GetNodeValue(pageNavigator, $@"//input[@id='{key}']/@value");

    var query = Url.CreateQueryCollection();
    query[@"appId"] = fromJson("appId");
    query[@"orderBy"] = "5";
    query[@"pageNo"] = "1";
    query[@"direction"] = "1";
    query[@"categoryId"] = fromJson(@"categoryId");
    query[@"pageSize"] = @"24";
    query[@"pagePrototypeId"] = fromJson(@"pagePrototypeId");
    query[@"pageInstanceId"] = fromRenderStructure(@"m_render_pageInstance_id");
    query[@"moduleInstanceId"] = fromRenderStructure(@"m_render_instance_id");
    query[@"prototypeId"] = fromRenderStructure(@"m_render_prototype_id");
    query[@"templateId"] = fromRenderStructure(@"m_render_template_id");
    query[@"layoutInstanceId"] = fromRenderStructure(@"m_render_layout_instance_id");
    query[@"origin"] = fromRenderStructure(@"m_render_origin");
    query[@"shopId"] = fromInput(@"shop_id");
    query[@"venderId"] = fromInput(@"vender_id");

    // A "callback" = "jshop_module_render_callback" parameter is deliberately not sent;
    // per the original note, the endpoint then returns a JSON structure directly.
    query[@"_"] = $"{JsCodeHelper.GetDateTime()}";

    string endpoint;
    if (renderStructure[@"m_render_is_search"] == "true")
    {
        endpoint = @"http://module-jshop.jd.com/module/getModuleHtml.html";
    }
    else
    {
        endpoint = @"http://mall.jd.com/view/getModuleHtml.html";
    }

    return Url.CombinUrl(endpoint, query);
}
/// <summary>
/// Parses the total reported by the page's pagination bar.
/// </summary>
/// <remarks>
/// The value is the number found in the text of <c>div.jPage/em</c>, between <c>共</c> and
/// <c>条记录</c>.
/// NOTE(review): that wording reads as a record count, not a page count, although the
/// method name says "page" - confirm against the callers.
/// </remarks>
/// <param name="content">HTML of the page; may be <c>null</c>.</param>
/// <returns>
/// The parsed number, or <c>-1</c> when the content is <c>null</c>, the element is missing,
/// the text does not match, or the number does not fit in an <see cref="int"/>.
/// </returns>
private int ParseAmountPage(string content)
{
    if (content == null)
    {
        return -1;
    }

    var navigator = HtmlDocumentHelper.CreateNavigator(content);
    var node = navigator.SelectSingleNode(@"//div[@class='jPage']/em");
    if (node == null)
    {
        return -1;
    }

    var matchResults = Regex.Match(node.Value, @"(?<=共)\d+(?=条记录)");
    if (!matchResults.Success)
    {
        return -1;
    }

    // int.Parse would throw on a digit run that overflows int or that uses non-ASCII
    // digits (which \d also matches); report such content as "not found" instead.
    int total;
    if (!int.TryParse(matchResults.Value, out total))
    {
        return -1;
    }

    return total;
}