Ejemplo n.º 1
0
        /// <summary>
        /// Builds one <see cref="KeyWordTagGroup"/> named <paramref name="groupName"/> from the
        /// given sub items and appends it to <paramref name="lstTags"/>.
        /// </summary>
        /// <param name="lstTags">
        /// List that receives the new group. When it is null the method does nothing: the
        /// parameter is passed by value, so a list created here could never reach the caller.
        /// </param>
        /// <param name="lstSub">
        /// Source items; each one becomes a Taobao <see cref="KeyWordTag"/>. When null no group
        /// is added. An empty sequence still adds an (empty) group.
        /// </param>
        /// <param name="groupName">Display name of the group, also copied onto every tag.</param>
        private static void ProcessTags(List <KeyWordTagGroup> lstTags, IEnumerable <TaobaoPageJsonResut.sub> lstSub, string groupName)
        {
            // Without a target list the result would be thrown away, and without source
            // items the foreach below would throw a NullReferenceException.
            if (null == lstTags || null == lstSub)
            {
                return;
            }

            var tagGroup = new KeyWordTagGroup(groupName);

            foreach (var itemSub in lstSub)
            {
                tagGroup.Tags.Add(new KeyWordTag
                {
                    Platform      = SupportPlatformEnum.Taobao,
                    TagName       = itemSub.text,
                    GroupShowName = groupName,
                    FilterFiled   = itemSub.key,
                    Value         = itemSub.value
                });
            }

            lstTags.Add(tagGroup);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Parses the HTML of a Tmall search result page into brands, tag groups and products.
        /// </summary>
        /// <param name="webArgs">
        /// Fetch arguments. <c>KeyWord</c> is used in log messages and
        /// <c>IsNeedResolveHeaderTags</c> decides whether brands and tag groups are parsed.
        /// </param>
        /// <param name="content">The page HTML to parse.</param>
        /// <returns>
        /// A dictionary that may contain the keys "Brands" (list of <c>BrandTag</c>) and "Tags"
        /// (list of <c>KeyWordTagGroup</c>) - both only when header tags were requested and the
        /// attribute area exists - and "Products" (<c>ProductBaseCollection</c>).
        /// The dictionary is empty when the page is the anti-crawler verification page or the
        /// login page, and may be incomplete when parsing throws (the exception is logged).
        /// </returns>
        public override Dictionary <string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
        {
            var resultBag = new Dictionary<string, object>();

            if (!string.IsNullOrEmpty(content))
            {
                // The site answered with its anti-crawler verification page instead of results.
                if (content.Contains("环境有异常"))
                {
                    PluginContext.Logger.Error("天猫查询被进行蜘蛛验证!关键词:" + webArgs.KeyWord);
                    return resultBag;
                }
                // The request was redirected to the login page.
                if (content.Contains("member/login"))
                {
                    PluginContext.Logger.Error("天猫查询结果页面被强制跳转到了登录页!关键词:" + webArgs.KeyWord);
                    return resultBag;
                }
            }

            try
            {
                // Build the HTML document object.
                HtmlParser htmlParser = new HtmlParser();
                var htmlDoc      = htmlParser.Parse(content);
                var div_AttrsDom = htmlDoc.QuerySelector("div.j_NavAttrs");

                if (webArgs.IsNeedResolveHeaderTags == true && null != div_AttrsDom)
                {
                    #region Brand parsing
                    var  lstBrands      = new List<BrandTag>();
                    bool brandListFound = false;
                    var  brandDom       = div_AttrsDom.QuerySelector("div.j_Brand");
                    if (null != brandDom)
                    {
                        // The brand links live in the value list of the brand block.
                        var brandULDom = brandDom.QuerySelector("div.attrValues>ul");
                        if (null != brandULDom)
                        {
                            brandListFound = true;
                            foreach (var itemADom in brandULDom.QuerySelectorAll("li>a"))
                            {
                                var model = new BrandTag();
                                model.Platform    = SupportPlatformEnum.Tmall;
                                model.FilterField = "brand"; // query parameter used for filtering

                                // href looks like "?brand=110910&q=...&spm=...#J_crumbs".
                                var urlBrand = itemADom.GetAttribute("href");
                                if (!string.IsNullOrEmpty(urlBrand) && urlBrand.Contains("brand="))
                                {
                                    // The static Regex.Match goes through the regex cache, so the
                                    // pattern is not rebuilt on every call of this method.
                                    model.BrandId = Regex.Match(urlBrand, @"brand=(\d+)", RegexOptions.IgnoreCase).Groups[1].Value;
                                }
                                model.BrandName = itemADom.GetAttribute("title");
                                model.CharIndex = PinYin.GetFirstLetter(model.BrandName);
                                lstBrands.Add(model);
                            }
                        }
                    }
                    resultBag.Add("Brands", lstBrands);
                    #endregion

                    #region Tag group parsing
                    var ulDomArray = div_AttrsDom.QuerySelectorAll("div.attrValues>ul");

                    // The brand list, when it exists, is the first <ul>; the category / property
                    // groups follow. Skip it only if it was really found, otherwise the first
                    // real group would be lost.
                    int startIdx = brandListFound ? 1 : 0;

                    // Never negative, even when no <ul> matched at all.
                    int groupCount = Math.Max(0, ulDomArray.Length - startIdx);

                    // Each task writes into its own slot, so the groups keep their page order
                    // no matter in which order the tasks finish.
                    var groupSlots = new KeyWordTagGroup[groupCount];
                    var taskArray  = new Task[groupCount];

                    for (int i = startIdx; i < ulDomArray.Length; i++)
                    {
                        // Copies for the closure; the loop variable itself keeps changing.
                        int cursor = i;
                        int slot   = i - startIdx;

                        taskArray[slot] = Task.Factory.StartNew(() =>
                        {
                            var itemUl = ulDomArray[cursor];

                            // The group caption sits next to the value list, two levels up.
                            var    attrKeyDom = itemUl.ParentElement.ParentElement.QuerySelector("div.attrKey");
                            string groupName  = "";
                            if (null != attrKeyDom)
                            {
                                groupName = attrKeyDom.TextContent.Replace("\n", "").Trim();
                            }

                            var tagGroup = new KeyWordTagGroup(groupName);

                            foreach (var itemADom in itemUl.QuerySelectorAll("li>a"))
                            {
                                var modelTag = new KeyWordTag();
                                modelTag.Platform      = SupportPlatformEnum.Tmall;
                                modelTag.TagName       = itemADom.TextContent.Replace("\n", ""); // tag caption
                                modelTag.GroupShowName = groupName;

                                // The filter is carried by the link's "cat" or "prop" parameter.
                                string hrefValue = itemADom.GetAttribute("href");
                                if (!string.IsNullOrEmpty(hrefValue))
                                {
                                    // Keep only the query part: a trailing "#fragment" would end up
                                    // in the last value and a leading path in the first key.
                                    string query       = hrefValue;
                                    int    fragmentPos = query.IndexOf('#');
                                    if (fragmentPos >= 0)
                                    {
                                        query = query.Substring(0, fragmentPos);
                                    }
                                    int queryPos = query.IndexOf('?');
                                    if (queryPos >= 0)
                                    {
                                        query = query.Substring(queryPos + 1);
                                    }

                                    var urlParas = HttpUtility.ParseQueryString(query, Encoding.UTF8);
                                    if (null != urlParas)
                                    {
                                        if (query.IndexOf("cat=", StringComparison.Ordinal) > -1)
                                        {
                                            modelTag.FilterFiled = "cat";
                                            modelTag.Value       = urlParas["cat"];
                                        }
                                        else if (query.IndexOf("prop=", StringComparison.Ordinal) > -1)
                                        {
                                            modelTag.FilterFiled = "prop";
                                            modelTag.Value       = urlParas["prop"];
                                        }
                                    }
                                }
                                tagGroup.Tags.Add(modelTag);
                            }

                            groupSlots[slot] = tagGroup;
                        });
                    }

                    Task.WaitAll(taskArray);
                    var lstTags = new List<KeyWordTagGroup>(groupSlots);
                    resultBag.Add("Tags", lstTags);
                    #endregion
                }

                #region Product parsing
                var lstProducts = new ProductBaseCollection();

                // Maps product id -> holder carrying the product's position on the page, so the
                // page order can be restored after the parallel parse.
                var orderedProducts = new ConcurrentDictionary<string, ProductOrdered<TmallProduct>>();

                var div_J_ItemListDom = htmlDoc.QuerySelector("div#J_ItemList");
                if (null != div_J_ItemListDom)
                {
                    var div_productDomArray = div_J_ItemListDom.QuerySelectorAll("div.product");
                    if (null != div_productDomArray && div_productDomArray.Any())
                    {
                        // Register every product id with its position on the page.
                        int counter_pid = 0;
                        foreach (var itemProductDom in div_productDomArray)
                        {
                            var itemPid = itemProductDom.GetAttribute("data-id");
                            if (null != itemPid)
                            {
                                orderedProducts.TryAdd(itemPid, new ProductOrdered<TmallProduct> {
                                    UniqKey = itemPid, IndexOrder = counter_pid
                                });
                                counter_pid++;
                            }
                        }

                        // Parse the product nodes in parallel.
                        div_productDomArray.AsParallel()
                        .ForAll((itemProductDom) =>
                        {
                            TmallProduct modelProduct = this.ResolverProductDom(itemProductDom);
                            if (null != modelProduct && modelProduct.ItemId > 0)
                            {
                                // A product whose id was not registered above (for example a node
                                // without data-id) is kept and sorted to the end instead of
                                // failing the whole batch with a KeyNotFoundException.
                                var orderedObj = orderedProducts.GetOrAdd(
                                    modelProduct.ItemId.ToString(),
                                    key => new ProductOrdered<TmallProduct> {
                                        UniqKey = key, IndexOrder = int.MaxValue
                                    });
                                orderedObj.Product = modelProduct;
                            }
                        });

                        // Restore the page order; holders whose node could not be parsed have
                        // no product and are left out.
                        var productsList = orderedProducts
                                           .Where(x => x.Value != null && x.Value.Product != null)
                                           .OrderBy(x => x.Value.IndexOrder)
                                           .Select(x => x.Value.Product);
                        lstProducts.AddRange(productsList);
                    }
                }
                resultBag.Add("Products", lstProducts);
                #endregion
            }
            catch (Exception ex)
            {
                // Best effort: log and return whatever was collected so far.
                PluginContext.Logger.Error(ex);
            }
            return resultBag;
        }