Пример #1
0
        /// <summary>
        ///     Reads the total record count shown in the pager of a listing page.
        ///     Despite the method name, the value extracted is the number inside the
        ///     pager text "共N条记录" ("N records in total"), not a page count.
        /// </summary>
        /// <param name="content">HTML of the listing page; may be <c>null</c>.</param>
        /// <returns>
        ///     The parsed number, or <c>-1</c> when <paramref name="content" /> is <c>null</c>,
        ///     the pager element is not found, the text does not match, or the digits do not
        ///     form a valid 32-bit integer.
        /// </returns>
        private int ParseAmountPage(string content)
        {
            if (content == null)
            {
                return -1;
            }

            var navigator = HtmlDocumentHelper.CreateNavigator(content);
            var node      = navigator.SelectSingleNode(@"//div[@class='jPage']/em");

            if (node == null)
            {
                return -1;
            }

            /*
             * Sample of the pager markup this method reads (the <em> holds the record count):
             *
             *   <div class="jPage">
             *  <em>共104条记录</em>
             *  <span>上一页</span>
             *  <a class="current">1</a>
             *  <a href="//mall.jd.com/view_search-504028-1000007084-1000007084-0-5-0-0-1-2-24.html?isGlobalSearch=0">2</a>
             *  <a href="//mall.jd.com/view_search-504028-1000007084-1000007084-0-5-0-0-1-3-24.html?isGlobalSearch=0">3</a>
             *  <span>...</span>
             *  <a href="//mall.jd.com/view_search-504028-1000007084-1000007084-0-5-0-0-1-5-24.html?isGlobalSearch=0">5</a>
             *  <a href="//mall.jd.com/view_search-504028-1000007084-1000007084-0-5-0-0-1-2-24.html?isGlobalSearch=0">下一页</a>
             *          </div>
             */
            var matchResults = Regex.Match(node.Value, @"(?<=共)\d+(?=条记录)");

            if (!matchResults.Success)
            {
                return -1;
            }

            // \d+ accepts any Unicode decimal digit and any length, so the capture may be
            // unparsable or overflow Int32; report that as -1 like every other failure
            // instead of letting int.Parse throw.
            int amount;
            var parsed = int.TryParse(
                matchResults.Value,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out amount);

            return parsed ? amount : -1;
        }
Пример #2
0
        /// <summary>
        ///     Guesses whether the given HTML is a product listing page by counting the
        ///     anchors whose <c>href</c> points at a JD item page
        ///     (<c>//item.jd.com/&lt;digits&gt;.html</c>, matched case-insensitively).
        /// </summary>
        /// <param name="webContent">HTML to inspect; may be <c>null</c> or empty.</param>
        /// <returns>
        ///     <c>true</c> as soon as eight matching links have been seen; <c>false</c> when
        ///     the input is null/empty or fewer than eight such links exist.
        /// </returns>
        private bool GuessIsSearchWebContent(string webContent)
        {
            if (string.IsNullOrEmpty(webContent))
            {
                return false;
            }

            var anchors          = HtmlDocumentHelper.CreateNavigator(webContent).Select(@"//a");
            var productLinkCount = 0;

            foreach (XPathNavigator anchor in anchors)
            {
                var link = anchor.GetAttribute(@"href", string.Empty);

                if (link == null || !Regex.IsMatch(link, @"^//item\.jd\.com/\d+\.html", RegexOptions.IgnoreCase))
                {
                    continue;
                }

                // The counter only changes here, so this is the only place the
                // threshold can be crossed.
                if (++productLinkCount >= 8)
                {
                    return true;
                }
            }

            return false;
        }
Пример #3
0
        /// <summary>
        ///     Parses the products out of a listing page.
        /// </summary>
        /// <param name="htmlSource">The HTML source of the listing page.</param>
        /// <param name="listOnly">
        ///     When <c>true</c>, only the list itself is parsed and the follow-up
        ///     <c>UpdateResultsPrices</c> step is skipped (per the original author's note, that
        ///     step covers prices and other data that need another network request).
        /// </param>
        /// <returns>One result per <c>ul/li</c> entry whose link contains a numeric SKU.</returns>
        private IResut[] ParseCurrentItems(string htmlSource, bool listOnly = false)
        {
            const string SkuIdKey = "ProductSku";

            var products  = new List <IResut>();
            var listItems = HtmlDocumentHelper.CreateNavigator(htmlSource).Select(@"//ul/li");

            foreach (XPathNavigator listItem in listItems)
            {
                // Title and link normally live under div.jDesc; fall back to div.jTitle
                // when no title text is found there.
                var name = HtmlDocumentHelper.GetNodeValue(listItem, ".//div[@class='jDesc']/a/text()");
                var link = HtmlDocumentHelper.GetNodeValue(listItem, ".//div[@class='jDesc']//@href");
                if (string.IsNullOrEmpty(name))
                {
                    name = HtmlDocumentHelper.GetNodeValue(listItem, ".//div[@class='jTitle']/a/text()");
                    link = HtmlDocumentHelper.GetNodeValue(listItem, ".//div[@class='jTitle']//@href");
                }

                var image    = HtmlDocumentHelper.GetNodeValue(listItem, ".//div[@class='jPic']//@original");
                var skuMatch = Regex.Match(link, @"(?<=/)\d+(?=\.html)");

                // A successful match always holds at least one digit, so a failed match
                // is the only way to end up without a SKU; such entries are skipped.
                if (!skuMatch.Success)
                {
                    continue;
                }

                // Review/comment data for this entry.
                var comments = ParseComments(listItem);

                IResut product = new Resut();

                product[SkuIdKey]          = skuMatch.Value;
                product["ShopId"]          = ShopUrl;
                product["ProductName"]     = name;
                product["ProductUrl"]      = link;
                product["ProductImage"]    = image;
                product["ProductComments"] = comments;

                products.Add(product);
            }

            if (listOnly == false)
            {
                this.UpdateResultsPrices(products, SkuIdKey);
            }

            return products.ToArray();
        }
Пример #4
0
        /// <summary>
        ///     Parses the information of every product on the current listing page.
        /// </summary>
        /// <param name="htmlSource">The HTML source of the listing page.</param>
        /// <param name="listOnly">
        ///     When <c>true</c>, the follow-up <c>UpdateResultsPrices</c> step is skipped and
        ///     only the data parsed from <paramref name="htmlSource" /> is returned.
        /// </param>
        /// <returns>One result per <c>ul/li</c> entry whose link contains a numeric SKU.</returns>
        private IResut[] ParseCurrentItems(string htmlSource, bool listOnly = false)
        {
            const string SkuIdKey   = "ProductSku";
            var          resultList = new List <IResut>();

            // XPath navigator over the whole page.
            var navigator = HtmlDocumentHelper.CreateNavigator(htmlSource);
            var iterator  = navigator.Select(@"//ul/li");

            foreach (XPathNavigator item in iterator)
            {
                // Title and link normally live under div.jDesc; fall back to div.jTitle
                // when no title text is found there.
                var title = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jDesc']/a/text()");
                var href  = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jDesc']//@href");
                if (string.IsNullOrEmpty(title))
                {
                    title = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jTitle']/a/text()");
                    href  = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jTitle']//@href");
                }

                // Prefer the "original" attribute and fall back to "src". The fallback must
                // be evaluated against the current list item, not the whole page source,
                // otherwise every item would receive the same page-level image.
                var imgSrc = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jPic']//@original");
                if (string.IsNullOrEmpty(imgSrc))
                {
                    imgSrc = HtmlDocumentHelper.GetNodeValue(item, ".//div[@class='jPic']//@src");
                }

                // The dot is escaped so that only a literal ".html" suffix qualifies.
                var skuMatchResults = Regex.Match(href ?? string.Empty, @"(?<=/)\d+(?=\.html)");
                var sku             = skuMatchResults.Success ? skuMatchResults.Value : string.Empty;

                if (string.IsNullOrEmpty(sku))
                {
                    continue;
                }

                // Review/comment data for this entry.
                var comments = ParseComments(item);

                IResut resut = new Resut();

                resut[SkuIdKey]          = sku;
                resut["ShopUrl"]         = this.ShopUrl;
                resut["ProductName"]     = title;
                resut["ProductUrl"]      = href;
                resut["ProductImage"]    = imgSrc;
                resut["ProductComments"] = comments;
                resultList.Add(resut);
            }

            if (!listOnly)
            {
                this.UpdateResultsPrices(resultList, SkuIdKey);
            }

            return resultList.ToArray();
        }
Пример #5
0
        /// <summary>
        ///     Starts parsing the given page. The implementation is unfinished: it runs
        ///     <c>ParseShopScoreResult</c> on a fresh result and then always throws.
        /// </summary>
        /// <param name="webContent">Content of the web page.</param>
        /// <returns>Nothing at present; the method never returns normally.</returns>
        /// <exception cref="System.NotImplementedException">Always thrown after the shop-score step.</exception>
        public IResut Parse(string webContent)
        {
            IResut parsedResult = new Resut();
            var    pageNavigator = HtmlDocumentHelper.CreateNavigator(webContent);

            ParseShopScoreResult(parsedResult, pageNavigator);

            // NOTE(review): the populated result is never returned - presumably a stub.
            throw new NotImplementedException();
        }
Пример #6
0
        /*        /// <summary>
         * ///     返回自营店的店铺编号列表
         * /// </summary>
         * /// <param name="shopDictionary">The shop dictionary.</param>
         * /// <returns></returns>
         * private string[] GetSelfSupportShopIds(IDictionary<string, IResut> shopDictionary)
         * {
         *  var shopIdList = new List<string>();
         *
         *  foreach (var shopItem in shopDictionary)
         *  {
         *      var shopId = shopItem.Key;
         *      if (shopItem.Value.GetStringValue(@"vender_type") == "0")
         *      {
         *          shopIdList.Add(shopId);
         *      }
         *  }
         *
         *  return shopIdList.ToArray();
         * }*/

        /// <summary>
        ///     Collects the values of all nodes selected by an XPath expression and passes
        ///     them through <c>DictionaryHelper.Distinct</c>.
        /// </summary>
        /// <param name="htmlSource">The HTML source to query.</param>
        /// <param name="xpath">The XPath expression selecting the nodes whose values are wanted.</param>
        /// <returns>The array produced by <c>DictionaryHelper.Distinct</c> for the selected node values.</returns>
        private string[] ParseIdFromXpath(string htmlSource, string xpath)
        {
            var selection = HtmlDocumentHelper.CreateNavigator(htmlSource).Select(xpath);
            var nodes     = HtmlDocumentHelper.CopyNodeToArray(selection);

            // Project each copied node to its string value before de-duplicating.
            var values = Array.ConvertAll(nodes, node => node.Value);

            return DictionaryHelper.Distinct(values);
        }
Пример #7
0
        /// <summary>
        ///     Builds the AJAX search URL for a shop page from the inline
        ///     <c>var params = {...}</c> script object, the render structure values and the
        ///     hidden <c>shop_id</c> / <c>vender_id</c> inputs.
        /// </summary>
        /// <param name="webContent">Content of the web page.</param>
        /// <param name="renderStructure">
        ///     The render structure values; must contain the key <c>m_render_is_search</c>,
        ///     which selects the base URL.
        /// </param>
        /// <returns>The base URL combined with the assembled query collection.</returns>
        /// <exception cref="System.NotSupportedException">
        ///     The search parameters object cannot be found in <paramref name="webContent" />.
        /// </exception>
        private string BuildAjaxSearchUrl(string webContent, IDictionary <string, string> renderStructure)
        {
            var matchResults = Regex.Match(webContent, "(?<=var params = ){[^}]+}");

            if (!matchResults.Success)
            {
                throw new NotSupportedException("无法从页面中解析出搜索参数");
            }

            var jObject   = JObject.Parse(matchResults.Value);
            var navigator = HtmlDocumentHelper.CreateNavigator(webContent);

            var collection = Url.CreateQueryCollection();

            collection[@"appId"]            = ReadJsonFunc(jObject, "appId");
            collection[@"orderBy"]          = "5";
            collection[@"pageNo"]           = "1";
            collection[@"direction"]        = "1";
            collection[@"categoryId"]       = ReadJsonFunc(jObject, @"categoryId");
            collection[@"pageSize"]         = @"24";
            collection[@"pagePrototypeId"]  = ReadJsonFunc(jObject, @"pagePrototypeId");
            collection[@"pageInstanceId"]   = ReadHtmlFunc(renderStructure, @"m_render_pageInstance_id");
            collection[@"moduleInstanceId"] = ReadHtmlFunc(renderStructure, "m_render_instance_id");
            collection[@"prototypeId"]      = ReadHtmlFunc(renderStructure, @"m_render_prototype_id");
            collection[@"templateId"]       = ReadHtmlFunc(renderStructure, @"m_render_template_id");
            collection[@"layoutInstanceId"] = ReadHtmlFunc(renderStructure, @"m_render_layout_instance_id");
            collection[@"origin"]           = ReadHtmlFunc(renderStructure, @"m_render_origin");
            collection[@"shopId"]           = ReadInputFunc(navigator, @"shop_id");

            // The query key is "venderId" (it was misspelled "verderId"), matching the
            // vender_id input it is read from.
            collection[@"venderId"]         = ReadInputFunc(navigator, @"vender_id");

            // Trailing "_" parameter carries the value of JsCodeHelper.GetDateTime().
            collection[@"_"] = $"{JsCodeHelper.GetDateTime()}";

            // NOTE(review): the indexer fails if "m_render_is_search" is absent - confirm
            // that callers always supply it.
            var baseUrl = renderStructure[@"m_render_is_search"] == "true"
                            ? @"http://module-jshop.jd.com/module/getModuleHtml.html"
                            : @"http://mall.jd.com/view/getModuleHtml.html";

            return Url.CombinUrl(baseUrl, collection);
        }
Пример #8
0
        /// <summary>
        ///     Builds the search AJAX URL.
        /// </summary>
        /// <param name="webContent">Content of the web page.</param>
        /// <param name="renderStructure">The render structure.</param>
        /// <returns>The base URL combined with the assembled query collection.</returns>
        /// <exception cref="System.NotSupportedException">
        ///     The search parameters cannot be parsed out of the page.
        /// </exception>
        private string BuildAjaxSearchUrl(string webContent, IDictionary <string, string> renderStructure)
        {
            var paramsMatch = Regex.Match(webContent, "(?<=var params = ){[^}]+}");

            if (paramsMatch.Success == false)
            {
                throw new NotSupportedException("无法从页面中解析出搜索参数");
            }

            // Example of the matched object:
            // {"appId":"435517","orderBy":"5","direction":"0","categoryId":"0","pageSize":"24","venderId":"1000004373","isGlobalSearch":"0","maxPrice":"0","pagePrototypeId":"17","pageNo":"1","shopId":"1000004373","minPrice":"0"}
            var searchParams = JObject.Parse(paramsMatch.Value);
            var document     = HtmlDocumentHelper.CreateNavigator(webContent);

            // Three small readers, one per data source: the params JSON, the render
            // structure dictionary, and <input id="..."> values in the page.
            System.Func <string, string> fromJson =
                name => JsonHelper.TryReadJobjectValue(searchParams, name, (string)null);

            System.Func <string, string> fromRenderStructure = name =>
            {
                string found;
                renderStructure.TryGetValue(name, out found);
                return found;
            };

            System.Func <string, string> fromInput =
                id => HtmlDocumentHelper.GetNodeValue(document, $@"//input[@id='{id}']/@value");

            var query = Url.CreateQueryCollection();

            query[@"appId"]            = fromJson("appId");
            query[@"orderBy"]          = "5";
            query[@"pageNo"]           = "1";
            query[@"direction"]        = "1";
            query[@"categoryId"]       = fromJson(@"categoryId");
            query[@"pageSize"]         = @"24";
            query[@"pagePrototypeId"]  = fromJson(@"pagePrototypeId");
            query[@"pageInstanceId"]   = fromRenderStructure(@"m_render_pageInstance_id");
            query[@"moduleInstanceId"] = fromRenderStructure(@"m_render_instance_id");
            query[@"prototypeId"]      = fromRenderStructure(@"m_render_prototype_id");
            query[@"templateId"]       = fromRenderStructure(@"m_render_template_id");
            query[@"layoutInstanceId"] = fromRenderStructure(@"m_render_layout_instance_id");
            query[@"origin"]           = fromRenderStructure(@"m_render_origin");
            query[@"shopId"]           = fromInput(@"shop_id");
            query[@"venderId"]         = fromInput(@"vender_id");

            // No "callback" parameter is sent; per the original author's note, leaving it
            // out makes the endpoint return a plain JSON structure.
            query[@"_"] = $"{JsCodeHelper.GetDateTime()}";

            string endpoint;
            if (renderStructure[@"m_render_is_search"] == "true")
            {
                endpoint = @"http://module-jshop.jd.com/module/getModuleHtml.html";
            }
            else
            {
                endpoint = @"http://mall.jd.com/view/getModuleHtml.html";
            }

            return Url.CombinUrl(endpoint, query);
        }
Пример #9
0
        /// <summary>
        ///     Reads the total record count shown in the pager of a listing page.
        ///     Despite the method name, the value extracted is the number inside the
        ///     pager text "共N条记录" ("N records in total"), not a page count.
        /// </summary>
        /// <param name="content">HTML of the listing page; may be <c>null</c>.</param>
        /// <returns>
        ///     The parsed number, or <c>-1</c> when <paramref name="content" /> is <c>null</c>,
        ///     the pager element is not found, the text does not match, or the digits do not
        ///     form a valid 32-bit integer.
        /// </returns>
        private int ParseAmountPage(string content)
        {
            if (content == null)
            {
                return -1;
            }

            var navigator = HtmlDocumentHelper.CreateNavigator(content);
            var node      = navigator.SelectSingleNode(@"//div[@class='jPage']/em");

            if (node == null)
            {
                return -1;
            }

            var matchResults = Regex.Match(node.Value, @"(?<=共)\d+(?=条记录)");

            if (!matchResults.Success)
            {
                return -1;
            }

            // \d+ accepts any Unicode decimal digit and any length, so the capture may be
            // unparsable or overflow Int32; report that as -1 like every other failure
            // instead of letting int.Parse throw.
            int amount;
            var parsed = int.TryParse(
                matchResults.Value,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out amount);

            return parsed ? amount : -1;
        }