/// <summary>
/// Builds one <see cref="KeyWordTagGroup"/> named <paramref name="groupName"/> from the given
/// entries and appends it to <paramref name="lstTags"/>.
/// </summary>
/// <param name="lstTags">
/// Destination list that receives the new group. When it is <c>null</c> the method does nothing:
/// the parameter is passed by value, so a list created here could never reach the caller.
/// </param>
/// <param name="lstSub">
/// Source entries; each one becomes a tag (its <c>text</c>, <c>key</c> and <c>value</c> are copied).
/// A <c>null</c> sequence is treated like an empty one, so an empty group is appended.
/// </param>
/// <param name="groupName">Name of the group; also stored on every tag as its group display name.</param>
private static void ProcessTags(List<KeyWordTagGroup> lstTags, IEnumerable<TaobaoPageJsonResut.sub> lstSub, string groupName)
{
    // Without a destination there is nowhere to publish the group, so skip the work entirely
    // instead of filling a throw-away list.
    if (null == lstTags)
    {
        return;
    }

    var tagGroup = new KeyWordTagGroup(groupName);

    // Guard against a missing source so the foreach cannot throw NullReferenceException.
    if (null != lstSub)
    {
        foreach (var itemSub in lstSub)
        {
            var modelTag = new KeyWordTag
            {
                Platform = SupportPlatformEnum.Taobao,
                TagName = itemSub.text,
                GroupShowName = groupName,
                FilterFiled = itemSub.key,
                Value = itemSub.value
            };
            tagGroup.Tags.Add(modelTag);
        }
    }

    lstTags.Add(tagGroup);
}
/// <summary>
/// Extracts the numeric brand id from a brand link such as <c>?brand=110910&amp;q=...</c>.
/// Built once and reused, because constructing a compiled regex on every call is expensive.
/// </summary>
private static readonly Regex TmallBrandIdRegex = new Regex(@"brand=(\d+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

/// <summary>
/// Parses the HTML of a Tmall search result page.
/// </summary>
/// <param name="webArgs">
/// Fetch arguments. <c>KeyWord</c> is only used in log messages; <c>IsNeedResolveHeaderTags</c>
/// decides whether the brand and tag filters in the page header are parsed.
/// </param>
/// <param name="content">The page content to parse.</param>
/// <returns>
/// A dictionary of the parsed fields. "Brands" and "Tags" are present only when header tags were
/// requested and the attribute area (<c>div.j_NavAttrs</c>) exists; "Products" is added after the
/// product list has been processed. The dictionary is empty when the page is a spider-verification
/// or login page, and may be partially filled when parsing throws (the exception is logged, not rethrown).
/// </returns>
public override Dictionary<string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
{
    var resultBag = new Dictionary<string, object>();

    if (!string.IsNullOrEmpty(content))
    {
        // The site answered with its anti-spider verification page instead of results.
        if (content.Contains("环境有异常"))
        {
            PluginContext.Logger.Error("天猫查询被进行蜘蛛验证!关键词:" + webArgs.KeyWord);
            return resultBag;
        }

        // The result page was redirected to the login page.
        if (content.Contains("member/login"))
        {
            PluginContext.Logger.Error("天猫查询结果页面被强制跳转到了登录页!关键词:" + webArgs.KeyWord);
            return resultBag;
        }
    }

    try
    {
        // Build the HTML document object.
        HtmlParser htmlParser = new HtmlParser();
        var htmlDoc = htmlParser.Parse(content);

        var div_AttrsDom = htmlDoc.QuerySelector("div.j_NavAttrs");
        if (webArgs.IsNeedResolveHeaderTags == true && null != div_AttrsDom)
        {
            // ---- Brands ----
            var lstBrands = new List<BrandTag>();
            var brandDom = div_AttrsDom.QuerySelector("div.j_Brand");

            // Kept in the outer scope so the tag pass below can skip exactly this list.
            var brandULDom = null != brandDom ? brandDom.QuerySelector("div.attrValues>ul") : null;
            if (null != brandULDom)
            {
                foreach (var itemADom in brandULDom.QuerySelectorAll("li>a"))
                {
                    var model = new BrandTag();
                    model.Platform = SupportPlatformEnum.Tmall;
                    model.FilterField = "brand"; // query parameter used when filtering by brand

                    // Example href: ?brand=110910&q=...&sort=s&style=g&from=sn_1_brand-qp#J_crumbs
                    var urlBrand = itemADom.GetAttribute("href");
                    if (!string.IsNullOrEmpty(urlBrand) && urlBrand.Contains("brand="))
                    {
                        model.BrandId = TmallBrandIdRegex.Match(urlBrand).Groups[1].Value;
                    }

                    model.BrandName = itemADom.GetAttribute("title");
                    model.CharIndex = PinYin.GetFirstLetter(model.BrandName);
                    lstBrands.Add(model);
                }
            }
            resultBag.Add("Brands", lstBrands);

            // ---- Tags (categories / properties) ----
            var ulDomArray = div_AttrsDom.QuerySelectorAll("div.attrValues>ul");

            // One task per attribute list. Results are read back in task order, so the groups keep
            // the order they have in the document regardless of which task finishes first.
            var groupTasks = new List<Task<KeyWordTagGroup>>();
            foreach (var ulDom in ulDomArray)
            {
                // The brand list was handled above. Skipping it by identity (instead of "skip index 0")
                // stays correct when the brand block has no list or is not the first block.
                if (object.ReferenceEquals(ulDom, brandULDom))
                {
                    continue;
                }

                var itemUl = ulDom; // per-iteration copy so the closure never sees a later element
                groupTasks.Add(Task.Factory.StartNew(() =>
                {
                    // The group name sits in the div.attrKey element two levels above the list.
                    string groupName = "";
                    var valuesDom = itemUl.ParentElement;
                    var rowDom = null != valuesDom ? valuesDom.ParentElement : null;
                    var attrKeyDom = null != rowDom ? rowDom.QuerySelector("div.attrKey") : null;
                    if (null != attrKeyDom)
                    {
                        groupName = attrKeyDom.TextContent.Replace("\n", "").Trim();
                    }

                    var tagGroup = new KeyWordTagGroup(groupName);
                    foreach (var itemADom in itemUl.QuerySelectorAll("li>a"))
                    {
                        var modelTag = new KeyWordTag();
                        modelTag.Platform = SupportPlatformEnum.Tmall;
                        modelTag.TagName = itemADom.TextContent.Replace("\n", ""); // tag display name
                        modelTag.GroupShowName = groupName;

                        // The filter field and value come from the link's query string: "cat" wins over "prop".
                        string hrefValue = itemADom.GetAttribute("href");
                        if (!string.IsNullOrEmpty(hrefValue))
                        {
                            var urlParas = HttpUtility.ParseQueryString(hrefValue, Encoding.UTF8);
                            if (null != urlParas)
                            {
                                if (hrefValue.IndexOf("cat=") > -1)
                                {
                                    modelTag.FilterFiled = "cat";
                                    modelTag.Value = urlParas["cat"];
                                }
                                else if (hrefValue.IndexOf("prop=") > -1)
                                {
                                    modelTag.FilterFiled = "prop";
                                    modelTag.Value = urlParas["prop"];
                                }
                            }
                        }

                        tagGroup.Tags.Add(modelTag);
                    }

                    return tagGroup;
                }));
            }

            Task.WaitAll(groupTasks.ToArray());
            var lstTags = groupTasks.Select(t => t.Result).ToList();
            resultBag.Add("Tags", lstTags);
        }

        // ---- Products ----
        var lstProducts = new ProductBaseCollection();

        // Maps each product's data-id to a slot remembering its position on the page, so the
        // parallel parse below can fill the slots in any order without losing the page order.
        var orderedProducts = new ConcurrentDictionary<string, ProductOrdered<TmallProduct>>();

        var div_J_ItemListDom = htmlDoc.QuerySelector("div#J_ItemList");
        if (null != div_J_ItemListDom)
        {
            var div_productDomArray = div_J_ItemListDom.QuerySelectorAll("div.product");
            if (null != div_productDomArray && div_productDomArray.Any())
            {
                // Reserve one ordered slot per product node that carries a data-id.
                int counter_pid = 0;
                foreach (var productDom in div_productDomArray)
                {
                    var itemPid = productDom.GetAttribute("data-id");
                    if (null != itemPid)
                    {
                        orderedProducts.TryAdd(itemPid, new ProductOrdered<TmallProduct> { UniqKey = itemPid, IndexOrder = counter_pid });
                        counter_pid++;
                    }
                }

                // Parse the product nodes in parallel.
                div_productDomArray.AsParallel()
                    .ForAll((itemProductDom) =>
                    {
                        TmallProduct modelProduct = this.ResolverProductDom(itemProductDom);
                        if (null == modelProduct || modelProduct.ItemId <= 0)
                        {
                            return;
                        }

                        // A parsed id without a reserved slot is skipped rather than looked up with
                        // the indexer: one KeyNotFoundException here would abort the whole page.
                        ProductOrdered<TmallProduct> orderedObj;
                        if (orderedProducts.TryGetValue(modelProduct.ItemId.ToString(), out orderedObj))
                        {
                            orderedObj.Product = modelProduct;
                        }
                    });

                // Restore page order; slots whose node could not be parsed stay empty and are dropped
                // so the collection never contains null products.
                var productsList = orderedProducts.Values
                    .Where(x => null != x && null != x.Product)
                    .OrderBy(x => x.IndexOrder)
                    .Select(x => x.Product);
                lstProducts.AddRange(productsList);
            }
        }
        resultBag.Add("Products", lstProducts);
    }
    catch (Exception ex)
    {
        // Best effort: log and return whatever was collected before the failure.
        PluginContext.Logger.Error(ex);
    }

    return resultBag;
}