Ejemplo n.º 1
0
        /// <summary>
        /// 解析商品节点
        /// </summary>
        /// <param name="modelProduct"></param>
        /// <param name="productDom"></param>
        private TmallProduct ResolverProductDom(IElement productDom)
        {
            TmallProduct modelProduct = null;

            if (null == productDom)
            {
                return(modelProduct);
            }
            modelProduct = new TmallProduct();
            try
            {
                //id
                string itemId = productDom.GetAttribute("data-id");
                if (string.IsNullOrEmpty(itemId))
                {
                    return(modelProduct);//凡是没有id 的商品,要么是广告 要么是其他非正常的商品
                }
                long.TryParse(itemId, out long _ItemId);
                modelProduct.ItemId = _ItemId;

                //title
                var titleDom = productDom.QuerySelector(".productTitle>a");
                modelProduct.Title   = titleDom.TextContent.Replace("\n", "");
                modelProduct.ItemUrl = titleDom.GetAttribute("href").GetHttpsUrl();

                //price
                var priceDom = productDom.QuerySelector(".productPrice>em");
                if (null != priceDom)
                {
                    decimal.TryParse(priceDom.GetAttribute("title"), out decimal _price);
                    modelProduct.Price = _price;
                }
                //pic
                var picDom = productDom.QuerySelector("div.productImg-wrap>a>img");
                if (null != picDom)
                {
                    if (picDom.HasAttribute("src"))
                    {
                        modelProduct.PicUrl = picDom.GetAttribute("src").GetHttpsUrl();
                    }
                    else if (picDom.HasAttribute("data-ks-lazyload"))
                    {
                        modelProduct.PicUrl = picDom.GetAttribute("data-ks-lazyload").GetHttpsUrl();
                    }

                    //modelProduct.PicUrl = picDom.GetAttribute("src").GetHttpsUrl();
                }

                //shop
                var shopDom = productDom.QuerySelector("div.productShop>a");
                if (null != shopDom)
                {
                    string shopHref = shopDom.GetAttribute("href");
                    modelProduct.ShopUrl = shopHref.GetHttpsUrl();
                    if (shopHref.Contains("user_number_id"))
                    {
                        var    queryString = shopHref.Substring(shopHref.IndexOf('?'));
                        string shopId      = HttpUtility.ParseQueryString(queryString, Encoding.UTF8)["user_number_id"];
                        long.TryParse(shopId, out long _shopId);
                        //modelProduct.ShopId = _shopId;//天猫店铺id 在搜索列表未出现
                        modelProduct.SellerId = _shopId;
                    }


                    modelProduct.ShopName = shopDom.TextContent.Replace("\n", "");
                }

                //status
                var statusDom = productDom.QuerySelector("p.productStatus");
                //成交量
                if (null != statusDom)
                {
                    var biz30dayDomSpan = statusDom.Children[0];
                    if (null != biz30dayDomSpan)
                    {
                        string bizTotal = biz30dayDomSpan.Children[0].TextContent;
                        if (!string.IsNullOrEmpty(bizTotal))
                        {
                            modelProduct.Biz30Day = bizTotal.Trim();
                        }
                    }


                    //评论量
                    var remarkDomSpan = statusDom.Children[1];
                    if (null != remarkDomSpan && remarkDomSpan.OuterHtml.Contains("评价"))
                    {
                        string remarkTotal = remarkDomSpan.Children[0].TextContent;
                        if (!string.IsNullOrEmpty(remarkTotal))
                        {
                            modelProduct.TotalBizRemarkCount = remarkTotal.Trim();
                        }
                        modelProduct.RemarkUrl = remarkDomSpan.Children[0].GetAttribute("href")
                                                 .GetHttpsUrl();
                    }
                }
                //sku list
                var skuListDom = productDom.QuerySelector("div.proThumb-wrap");
                if (null != skuListDom)
                {
                    var skuDomArry = skuListDom.QuerySelectorAll("b.proThumb-img");
                    if (skuDomArry != null && skuDomArry.Length > 0)
                    {
                        foreach (var itemSkuDom in skuDomArry)
                        {
                            var skuItemObj = new SkuItem();
                            skuItemObj.SkuId = itemSkuDom.GetAttribute("data-sku");
                            //skuItemObj.SkuName = itemSkuDom.GetAttribute("title");//天猫没有小图的名称
                            skuItemObj.SkuUrl = string.Concat(modelProduct.ItemUrl, "&sku_properties=", skuItemObj.SkuId)
                                                .GetHttpsUrl();

                            var imgSkuDom = itemSkuDom.Children[0];
                            if (imgSkuDom.HasAttribute("data-ks-lazyload"))
                            {
                                skuItemObj.SkuImgUrl = imgSkuDom.GetAttribute("data-ks-lazyload").GetHttpsUrl();
                            }
                            else if (imgSkuDom.HasAttribute("data-ks-lazyload-custom"))
                            {
                                skuItemObj.SkuImgUrl = imgSkuDom.GetAttribute("data-ks-lazyload-custom").GetHttpsUrl();
                            }



                            modelProduct.SkuList.Add(skuItemObj);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                PluginContext.Logger.Error(ex);
            }
            return(modelProduct);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// 执行内容解析
        /// </summary>
        ///<param name="webArgs"> </param>
        /// <param name="content">要解析的内容</param>
        /// <returns>返回需要的字段对应的字典</returns>
        public override Dictionary <string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
        {
            var resultBag = new Dictionary <string, object>();

            if (!string.IsNullOrEmpty(content))
            {
                if (content.Contains("环境有异常"))
                {
                    PluginContext.Logger.Error("天猫查询被进行蜘蛛验证!关键词:" + webArgs.KeyWord);
                    return(resultBag);
                }
                if (content.Contains("member/login"))
                {
                    PluginContext.Logger.Error("天猫查询结果页面被强制跳转到了登录页!关键词:" + webArgs.KeyWord);
                    return(resultBag);
                }
            }


            try
            {
                //创建html 文档对象
                HtmlParser htmlParser   = new HtmlParser();
                var        htmlDoc      = htmlParser.Parse(content);
                var        div_AttrsDom = htmlDoc.QuerySelector("div.j_NavAttrs");

                if (webArgs.IsNeedResolveHeaderTags == true && null != div_AttrsDom)
                {
                    #region 品牌解析
                    var lstBrands = new List <BrandTag>();
                    var brandDom  = div_AttrsDom.QuerySelector("div.j_Brand");
                    if (null != brandDom)
                    {
                        //从属性区域解析dom-品牌内容
                        var brandULDom = brandDom.QuerySelector("div.attrValues>ul");//ulDomArray[0];//

                        if (null != brandULDom)
                        {
                            var regex_MatchBrandId = new Regex(@"brand=(\d+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
                            var li_ADomArray       = brandULDom.QuerySelectorAll("li>a");
                            foreach (var itemADom in li_ADomArray)
                            {
                                var model = new BrandTag();
                                model.Platform    = SupportPlatformEnum.Tmall;
                                model.FilterField = "brand";//使用的过滤字段参数
                                var urlBrand = itemADom.GetAttribute("href");
                                if (!string.IsNullOrEmpty(urlBrand) && urlBrand.Contains("brand="))
                                {
                                    model.BrandId = regex_MatchBrandId.Match(urlBrand).Groups[1].Value;//new//品牌id   href="?brand=110910&amp;q=%B4%F3%C3%D7&amp;sort=s&amp;style=g&amp;from=sn_1_brand-qp&amp;spm=a220m.1000858.1000720.1.348abe64rj5JVg#J_crumbs
                                }
                                model.BrandName = itemADom.GetAttribute("title");
                                model.CharIndex = PinYin.GetFirstLetter(model.BrandName);
                                lstBrands.Add(model);
                            }
                        }
                    }
                    resultBag.Add("Brands", lstBrands);

                    #endregion

                    // tags 解析
                    //var lstTags = new List<KeyWordTag> {
                    //new KeyWordTag {
                    //    Platform = NTCPMessage.EntityPackage.SupportPlatformEnum.Tmall,
                    //    TagName = "大衣", Value = "dayi", FilterFiled = "sku"
                    //} };
                    var ulDomArray = div_AttrsDom.QuerySelectorAll("div.attrValues>ul");

                    var lstTags = new List <KeyWordTagGroup>();
                    if (null != div_AttrsDom)
                    {
                        var blockList = new BlockingCollection <KeyWordTagGroup>();

                        //分类 or 属性;品牌是第一个,其他属性是后续
                        int startIdx = brandDom == null ? 0 : 1;// //是否存在品牌的判断

                        var taskArray = new Task[ulDomArray.Length - startIdx];
                        int counter   = 0;


                        for (int i = startIdx; i < ulDomArray.Length; i++)
                        {
                            int cursor = i;

                            var taskResolveAEmelems = Task.Factory.StartNew(() =>
                            {
                                var itemUl = ulDomArray[cursor];

                                //找到归属的组
                                var attrKeyDom   = itemUl.ParentElement.ParentElement.QuerySelector("div.attrKey");
                                string groupName = "";
                                if (null != attrKeyDom)
                                {
                                    groupName = attrKeyDom.TextContent.Replace("\n", "").Trim();
                                }

                                var tagGroup = new KeyWordTagGroup(groupName);

                                var childLiADomArray = itemUl.QuerySelectorAll("li>a");
                                foreach (var itemADom in childLiADomArray)
                                {
                                    var modelTag           = new KeyWordTag();
                                    modelTag.Platform      = SupportPlatformEnum.Tmall;
                                    modelTag.TagName       = itemADom.TextContent.Replace("\n", "");//标签名称
                                    modelTag.GroupShowName = groupName;

                                    //////----解析 a标签开始-------
                                    //////检查 a 的href 中的参数;cat 或者prop
                                    string hrefValue = itemADom.GetAttribute("href");
                                    if (!string.IsNullOrEmpty(hrefValue))
                                    {
                                        var urlParas = HttpUtility.ParseQueryString(hrefValue, Encoding.UTF8);
                                        if (null != urlParas)
                                        {
                                            if (hrefValue.IndexOf("cat=") > -1)
                                            {
                                                //1 cat
                                                string catValue      = urlParas["cat"];
                                                modelTag.FilterFiled = "cat";
                                                modelTag.Value       = catValue;
                                            }
                                            else if (hrefValue.IndexOf("prop=") > -1)
                                            {
                                                //2 prop
                                                string propValue     = urlParas["prop"];
                                                modelTag.FilterFiled = "prop";
                                                modelTag.Value       = propValue;
                                            }
                                        }
                                    }
                                    tagGroup.Tags.Add(modelTag);
                                }

                                //----解析 a标签完毕-------
                                blockList.Add(tagGroup);
                            });
                            //将并行任务放到数组
                            taskArray[counter] = taskResolveAEmelems;
                            counter           += 1;
                        }
                        var safeTaskArray = taskArray.Where(x => null != x).ToArray();
                        Task.WaitAll(safeTaskArray);
                        lstTags = blockList.ToList();
                    }
                    resultBag.Add("Tags", lstTags);
                }

                #region products  解析
                //ProductBaseCollection lstProducts = new ProductBaseCollection()
                //{
                //    new TmallProduct { ItemId=1,Title="测试大衣"}
                //};
                var lstProducts = new ProductBaseCollection();
                //多任务并行解析商品
                //BlockingCollection<TmallProduct> blockingList_Products = new BlockingCollection<TmallProduct>();
                ConcurrentDictionary <string, ProductOrdered <TmallProduct> > blockingList_Products = new ConcurrentDictionary <string, ProductOrdered <TmallProduct> >();

                var div_J_ItemListDom = htmlDoc.QuerySelector("div#J_ItemList");
                if (null != div_J_ItemListDom)
                {
                    var div_productDomArray = div_J_ItemListDom.QuerySelectorAll("div.product");
                    if (null != div_productDomArray && div_productDomArray.Any())
                    {
                        var pids = div_productDomArray
                                   .Select(x => { return(x.GetAttribute("data-id")); });
                        //设定排序对象
                        int counter_pid = 0;
                        foreach (var itemPid in pids)
                        {
                            if (null != itemPid)
                            {
                                blockingList_Products.TryAdd(itemPid, new ProductOrdered <TmallProduct> {
                                    UniqKey = itemPid, IndexOrder = counter_pid
                                });
                                counter_pid++;
                            }
                        }

                        //并行解析 并保留原序列
                        div_productDomArray.AsParallel()
                        .ForAll((itemProductDom) =>
                        {
                            //解析一个商品的节点
                            TmallProduct modelProduct = this.ResolverProductDom(itemProductDom);
                            if (null != modelProduct && modelProduct.ItemId > 0)
                            {
                                var orderedObj     = blockingList_Products[modelProduct.ItemId.ToString()];
                                orderedObj.Product = modelProduct;
                            }
                        });

                        //进行排序
                        var productsList = blockingList_Products
                                           .Where(x => x.Value != null)
                                           .OrderBy(x => x.Value.IndexOrder)
                                           .Select(x => x.Value.Product);
                        lstProducts.AddRange(productsList);
                    }
                }
                resultBag.Add("Products", lstProducts);

                #endregion
            }
            catch (Exception ex)
            {
                PluginContext.Logger.Error(ex);
            }
            return(resultBag);// string.Concat("has process input :" + content);
        }