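        /// <summary>
        /// Parses the fetched page content.
        /// </summary>
        /// <param name="webArgs">Arguments received from the web request.</param>
        /// <param name="pageContent">The content to parse.</param>
        /// <returns>The view model populated from the parsed content.</returns>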
        public virtual SearchProductViewModel ResolvePageContent(BaseFetchWebPageArgument webArgs, string pageContent)
        {
            // Result container; it is returned even when the plugin supplies none of the known keys.
            var viewModel = new SearchProductViewModel();

            // Parsing is delegated to the plugin instance obtained from GetNeedPluginInstance().
            IPlugin plugin = this.GetNeedPluginInstance();
            var parsed = plugin.ResolveSearchPageContent(webArgs, pageContent) as Dictionary<string, object>;
            if (parsed == null)
            {
                // Either the plugin returned nothing or its result was not a string-keyed dictionary.
                throw new Exception("插件:" + NeedPluginName + " ;未能正确解析内容:" + pageContent);
            }

            object entry;

            // Header data (brands / tags) is copied only when the caller asked for it.
            if (webArgs.IsNeedResolveHeaderTags == true)
            {
                if (parsed.TryGetValue("Brands", out entry))
                {
                    viewModel.Brands = entry as List<BrandTag>;
                }
                if (parsed.TryGetValue("Tags", out entry))
                {
                    viewModel.Tags = entry as List<KeyWordTagGroup>;
                }
            }

            // Products are copied whenever the plugin provided them.
            if (parsed.TryGetValue("Products", out entry))
            {
                viewModel.Products = entry as ProductBaseCollection;
            }

            return viewModel;
        }
Exemple #2
0
        /// <summary>
        /// Builds a sample <see cref="BaseFetchWebPageArgument"/> for the Tmall platform containing one
        /// attached parameter, one blank brand, one blank tag and an ascending order field.
        /// </summary>
        /// <returns>The populated argument object.</returns>
        public BaseFetchWebPageArgument Get()
        {
            // Placeholder entries: every text field is an empty string.
            var blankBrand = new BrandTag
            {
                BrandId     = "",
                BrandName   = "",
                CharIndex   = "",
                FilterField = "",
                IconUrl     = "",
                Platform    = SupportPlatformEnum.Tmall
            };
            var blankTag = new KeyWordTag
            {
                Platform      = SupportPlatformEnum.Tmall,
                TagName       = "",
                FilterFiled   = "",
                GroupShowName = "",
                Value         = ""
            };
            var ascendingOrder = new OrderField
            {
                DisplayName = "",
                FieldValue  = "",
                Rule        = OrderRule.ASC
            };

            var args = new BaseFetchWebPageArgument();
            args.Platform    = SupportPlatformEnum.Tmall;
            args.AttachParas = new Dictionary<string, object>();
            args.AttachParas.Add("key-1", 1);
            args.Brands      = new List<BrandTag> { blankBrand };
            args.TagGroup    = new KeyWordTagGroup { Tags = new List<KeyWordTag> { blankTag } };
            args.OrderFiled  = ascendingOrder;

            return args;
        }
        /// <summary>
        /// Returns a fixed, hard-coded result bag; neither argument is read.
        /// </summary>
        /// <param name="webArgs">Web arguments (not used by this implementation).</param>
        /// <param name="content">The content to parse (not used by this implementation).</param>
        /// <returns>A dictionary holding a "Tags" entry and a "Products" entry.</returns>
        public override Dictionary <string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
        {
            // 1. Tags: a single ETao tag.
            var sampleTag = new KeyWordTag
            {
                Platform    = NTCPMessage.EntityPackage.SupportPlatformEnum.ETao,
                TagName     = "大衣",
                Value       = "dayi",
                FilterFiled = "sku"
            };
            var tags = new List<KeyWordTag>();
            tags.Add(sampleTag);

            // 2. Products: a single ETao product.
            var products = new ProductBaseCollection();
            products.Add(new ETaoProduct { ItemId = 1, Title = "测试大衣" });

            // Assemble the bag in the same order: tags first, then products.
            return new Dictionary<string, object>
            {
                { "Tags", tags },
                { "Products", products }
            };
        }
Exemple #4
0
        /// <summary>
        /// Reads a previously stored page-fetch result from the cache.
        /// </summary>
        /// <param name="webArgs">Fetch arguments; its <c>CacheKey</c> is used as the lookup key.</param>
        /// <returns>Whatever <c>RedisClient.Get</c> returns for that key.</returns>
        public static SearchProductViewModel GetFetchPageResultFromCache(BaseFetchWebPageArgument webArgs)
        {
            return RedisClient.Get<SearchProductViewModel>(webArgs.CacheKey);
        }
        /// <summary>
        /// 解析搜索地址
        /// </summary>
        /// <param name="webArgs"></param>
        /// <returns></returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();


            StringBuilder sbSearchUrl = new StringBuilder("http://apiv4.yangkeduo.com/search?q=@###@");

            #region 品牌
            if (null != webArgs.Brands && webArgs.Brands.Count > 0)
            {
                //2 非当前平台的品牌--选择其中的一个 作为关键词 分割
                var otherPlatformBrands = webArgs.Brands.FirstOrDefault();
                if (null != otherPlatformBrands)
                {
                    webArgs.KeyWord += " " + otherPlatformBrands.BrandName;
                }
            }
            #endregion

            #region  属性标签
            if (null != webArgs.TagGroup)
            {
                //2 其他平台的tag 作为关键词的一部分
                var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault();
                if (null != otherPlatformTag)
                {
                    webArgs.KeyWord += " " + otherPlatformTag.TagName;
                }
            }
            #endregion

            #region 关键词
            sbSearchUrl.Replace("@###@", webArgs.KeyWord);
            #endregion

            #region  排序
            if (null == webArgs.OrderFiled || webArgs.OrderFiled.Rule == OrderRule.Default)
            {
                sbSearchUrl.Append("&sort=default");//默认综合排序
            }
            else
            {
                sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);//默认综合排序
            }
            #endregion

            #region  筛选-价格区间
            #endregion

            #region  页码

            sbSearchUrl.Append("&page=").Append(webArgs.PageIndex + 1);
            sbSearchUrl.Append("&size=50");
            #endregion
            # region 杂项
        /// <summary>
        /// Resolves the web arguments into a platform-specific search address (with its parameters)
        /// by delegating to the plugin instance.
        /// </summary>
        /// <param name="webArgs">Arguments received from the web request.</param>
        /// <returns>The value returned by the plugin's <c>ResolveSearchUrl</c>.</returns>
        public virtual ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            // The plugin obtained from GetNeedPluginInstance() performs the actual resolution.
            IPlugin plugin = this.GetNeedPluginInstance();

            return plugin.ResolveSearchUrl(webArgs);
        }
        /// <summary>
        /// 解析搜索地址
        /// </summary>
        /// <param name="webArgs"></param>
        /// <returns></returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();
            var timeToken = JavascriptContext.getUnixTimestamp();
            //http://list.mogujie.com/search?callback=jQuery211013398370030336082_{0}&_version=8193&q=%E5%8F%A3%E7%BA%A2&cKey=43&minPrice=&_mgjuuid=66b111f4-e6ce-4b8b-bf0c-311fa8cf0c31&ppath=&page=1&maxPrice=&sort=pop&userId=&cpc_offset=&ratio=2%3A3&_=1500446274789

            StringBuilder sbSearchUrl = new StringBuilder(string.Format("http://list.mogujie.com/search?callback=jQuery211013398370030336082_{0}&_version=8193&q=@###@&cKey=43", timeToken));

            #region  属性 分类 都在参数 ppath 中 标签
            if (null != webArgs.TagGroup)
            {
                //1 当前平台的
                var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Mogujie);
                if (null != currentPlatformTag)
                {
                    var dicPara = new Dictionary <string, string>();
                    foreach (var item in currentPlatformTag)
                    {
                        dicPara.Add(item.FilterFiled, item.Value);
                    }
                    //将参数序列化为json
                    sbSearchUrl.Append("&ppath=").Append(JsonConvert.SerializeObject(dicPara));
                }

                //2 其他平台的tag 作为关键词的一部分
                var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Mogujie);
                if (null != otherPlatformTag)
                {
                    webArgs.KeyWord += " " + otherPlatformTag.TagName;
                }
            }
            #endregion

            #region 关键词
            sbSearchUrl.Replace("@###@", webArgs.KeyWord);
            #endregion

            #region  排序
            if (null != webArgs.OrderFiled)
            {
                sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);
            }
            #endregion

            #region  筛选-价格区间
            sbSearchUrl.Append("&minPrice=");
            sbSearchUrl.Append("&maxPrice=");
            #endregion

            #region  页码
            sbSearchUrl.Append("&page=").Append(webArgs.PageIndex + 1);
            #endregion

            # region 杂项
        /// <summary>
        /// 解析搜索地址
        /// </summary>
        /// <param name="webArgs"></param>
        /// <returns></returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();

            //http://www.meilishuo.com/search/goods?page=1&searchKey=围巾&ppath={"2048":"10112"}&cpc_offset=0

            StringBuilder sbSearchUrl = new StringBuilder("http://www.meilishuo.com/search/goods?searchKey=@###@");



            #region  属性 分类 都在参数 ppath 中 标签
            if (null != webArgs.TagGroup)
            {
                //1 当前平台的
                var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Meilishuo);
                if (null != currentPlatformTag)
                {
                    var dicPara = new Dictionary <string, string>();
                    foreach (var item in currentPlatformTag)
                    {
                        dicPara.Add(item.FilterFiled, item.Value);
                    }
                    //将参数序列化为json
                    sbSearchUrl.Append("&ppath=").Append(JsonConvert.SerializeObject(dicPara));
                }

                //2 其他平台的tag 作为关键词的一部分
                var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Meilishuo);
                if (null != otherPlatformTag)
                {
                    webArgs.KeyWord += " " + otherPlatformTag.TagName;
                }
            }
            #endregion

            #region 关键词
            sbSearchUrl.Replace("@###@", webArgs.KeyWord);
            #endregion

            #region  排序
            if (null != webArgs.OrderFiled)
            {
                sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);
            }
            #endregion

            #region  筛选-价格区间
            #endregion

            #region  页码
            sbSearchUrl.Append("&page=").Append(webArgs.PageIndex + 1);
            #endregion
            # region 杂项
Exemple #9
0
        /// <summary>
        /// Builds the POST parameters for a search: a brand query, a category-tree query and the
        /// product list query, each serialized to JSON.
        /// </summary>
        /// <param name="webArgs">Search arguments: keyword, page index, ordering, brands and tags.</param>
        /// <returns>
        /// A result whose <c>ParasPost</c> contains the entries "para_brand", "para_categoryTree"
        /// and "para_searchList".
        /// </returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();

            resultUrl.ParasPost = new Dictionary <string, object> ();
            var urlParaContainer = resultUrl.ParasPost;

            // 1. Brand query
            var brandParaModel = new VipSearchParaBrand(webArgs.KeyWord);

            urlParaContainer.Add("para_brand", JsonConvert.SerializeObject(brandParaModel));

            // 2. Category query
            var categoryTreeParaModel = new VipSearchParaCategoryTree(webArgs.KeyWord);

            urlParaContainer.Add("para_categoryTree", JsonConvert.SerializeObject(categoryTreeParaModel));

            // 3. Product list query
            var searchListParaModel = new VipSearchParaSearchList(webArgs.KeyWord);

            // Paging: np is PageIndex + 1.
            searchListParaModel.paramsDetails.np = webArgs.PageIndex + 1;

            // Sorting: stays 0 when no order field was supplied or its value is not an integer.
            // The null guard prevents a NullReferenceException for requests without ordering.
            int tempSort = 0;
            if (null != webArgs.OrderFiled)
            {
                int.TryParse(webArgs.OrderFiled.FieldValue, out tempSort);
            }
            searchListParaModel.paramsDetails.sort = tempSort;

            // Brands: comma-separated brand ids.
            if (null != webArgs.Brands && webArgs.Brands.Any())
            {
                searchListParaModel.paramsDetails.brand_store_sn = string.Join(",", webArgs.Brands.Select(x => x.BrandId));
            }

            // Categories and specifications: tags are routed to a parameter by their FilterFiled.
            if (null != webArgs.TagGroup)
            {
                // Categories (comma-separated)
                var category_id_1_5_show = webArgs.TagGroup.Tags.Where(x => x.FilterFiled == "category_id_1_5_show");
                searchListParaModel.paramsDetails.category_id_1_5_show = string.Join(",", category_id_1_5_show.Select(x => x.Value));
                var category_id_1_show = webArgs.TagGroup.Tags.Where(x => x.FilterFiled == "category_id_1_show");
                searchListParaModel.paramsDetails.category_id_1_show = string.Join(",", category_id_1_show.Select(x => x.Value));
                var category_id_2_show = webArgs.TagGroup.Tags.Where(x => x.FilterFiled == "category_id_2_show");
                searchListParaModel.paramsDetails.category_id_2_show = string.Join(",", category_id_2_show.Select(x => x.Value));
                var category_id_3_show = webArgs.TagGroup.Tags.Where(x => x.FilterFiled == "category_id_3_show");
                searchListParaModel.paramsDetails.category_id_3_show = string.Join(",", category_id_3_show.Select(x => x.Value));

                // Specifications (semicolon-separated)
                var props = webArgs.TagGroup.Tags.Where(x => x.FilterFiled == "props");
                searchListParaModel.paramsDetails.props = string.Join(";", props.Select(x => x.Value));
            }

            urlParaContainer.Add("para_searchList", JsonConvert.SerializeObject(searchListParaModel));

            return(resultUrl);
        }
        /// <summary>
        /// 解析搜索地址
        /// </summary>
        /// <param name="webArgs"></param>
        /// <returns></returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();

            StringBuilder sbSearchUrl = new StringBuilder("http://search.yhd.com/c0-0/");


            #region 品牌
            //string brandString = "mbname";
            if (null != webArgs.Brands && webArgs.Brands.Count > 0)
            {
                //1 当前平台的品牌
                var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Yhd);
                if (currentPlatformBrands.Any())
                {
                    //http://search.yhd.com/c0-0/mbname金龙鱼,十月稻田-b9429,15840/
                    //多个品牌
                    string brandNames = string.Join(",", currentPlatformBrands.Select(x => x.BrandName));
                    string brandIds   = string.Join(",", currentPlatformBrands.Select(x => x.BrandId));
                    sbSearchUrl.Append("mbname").Append(brandNames).Append("-").Append(brandIds).Append("/");
                }
                else
                {
                    sbSearchUrl.Append("mbname-b/");
                }

                //2 非当前平台的品牌--选择其中的一个 作为关键词 分割
                var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Yhd);
                if (null != otherPlatformBrands)
                {
                    webArgs.KeyWord += " " + otherPlatformBrands.BrandName;
                }
            }
            #endregion

            #region  属性标签
            string attrString = "a";
            if (null != webArgs.TagGroup)
            {
                //1 当前平台的
                var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Yhd);
                if (currentPlatformTag.Any())
                {
                    //http://search.yhd.com/c0-0/mbname十月稻田-b15840/a83213||83214::1916_268464519||268472939::268435461

                    var attrIdGroups = currentPlatformTag.GroupBy(x => x.FilterFiled); //string.Join("-", currentPlatformTag.Select(x => x.Value));//&att=1000012:1985-1000012:1986
                    foreach (var gp in attrIdGroups)
                    {
                        string attrIds = string.Join("||", gp.Select(x => x.Value));
                        attrString += string.Concat(attrIds, "::", gp.Key);
                    }
                }
                //2 其他平台的tag 作为关键词的一部分
                var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Yhd);
                if (null != otherPlatformTag)
                {
                    webArgs.KeyWord += " " + otherPlatformTag.TagName;
                }
            }
            sbSearchUrl.Append(attrString);

            //http://search.yhd.com/c0-0/mbname十月稻田-b15840/a83213||83214::1916_268464519||268472939::268435461-s1-v4-p1-price-d0-f0-m1-rt0-pid-mid0-color-size-k大米/#page=1&sort=5
            sbSearchUrl.Append("-s1-v4-p2-price-d0-f0-m1-rt0-pid-mid0-color-size-");//p1:不开启; p2:开启 控制后面的分页参数是否启用
            #endregion


            #region 关键词
            sbSearchUrl.AppendFormat("k{0}", webArgs.KeyWord);//将关键词的占位符 进行替换
            #endregion

            #region  页码

            sbSearchUrl.Append("#page=").Append(webArgs.PageIndex + 1);

            #endregion

            #region  排序
            if (null == webArgs.OrderFiled)
            {
                sbSearchUrl.Append("&sort=1");//默认综合排序
            }
            else
            {
                sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);//默认综合排序
            }
            #endregion

            #region  筛选-价格区间
            #endregion


            # region 杂项
        /// <summary>
        /// Queries the target platform for products matching the keyword and filter conditions and
        /// returns that platform's product item list.
        /// </summary>
        /// <remarks>
        /// Failures inside the fetch/parse pipeline are logged and swallowed; the method then returns
        /// the empty model instead of throwing.
        /// </remarks>
        /// <param name="webArgs">Search arguments (keyword, platform, filters, paging).</param>
        /// <returns>
        /// The cached or freshly parsed result, or an empty <see cref="SearchProductViewModel"/> when the
        /// arguments are invalid or the request fails. Never <c>null</c>.
        /// </returns>
        public SearchProductViewModel QueryProductsByKeyWords(BaseFetchWebPageArgument webArgs)
        {
            SearchProductViewModel dataModel = new SearchProductViewModel();

            if (webArgs.IsValid() == false)
            {
                return(dataModel);
            }

            try
            {
                // Register the search keyword with the hot-word service.
                HotWordService.AddWord(webArgs.KeyWord);

                // When content caching is enabled, try to serve the result from the cache.
                if (true == WorkContext.IsFetchPageCacheaAble)
                {
                    // Use a separate local so a cache miss does not replace the empty model with
                    // null; otherwise a later failure would make this method return null.
                    var cachedModel = WorkContext.GetFetchPageResultFromCache(webArgs);
                    if (null != cachedModel)
                    {
                        return(cachedModel);
                    }
                }

                // Factory: get the content resolver for the requested platform.
                var resolver = ResolverFactory.GetSearchProductResolver(webArgs.Platform);

                // Resolve the search address from the page arguments.
                var searchUrl = resolver.ResolveSearchUrl(webArgs);
                if (null == searchUrl)
                {
                    // A resolver may return null; fail with an explicit message instead of letting
                    // the dereferences below raise a NullReferenceException.
                    throw new InvalidOperationException("The resolver for platform " + webArgs.Platform + " did not return a search url.");
                }
                webArgs.ResolvedUrl = searchUrl;

                string pageContent = string.Empty;

                using (var connMgr = new WebCrawlerConnConfigManager())
                {
                    var connStrConfig = connMgr.Connection;
                    // Register the connection settings in the attach parameters for later stages.
                    webArgs.SystemAttachParas["SoapTcpConnectionString"] = connStrConfig;

                    if (searchUrl.IsNeedPreRequest == true)
                    {
                        // 1. open the tcp connection
                        // 2. send the arguments
                        // 3. read the result
                        using (var conn = new SoapTcpConnection(connStrConfig))
                        {
                            if (conn.State == ConnectionState.Closed)
                            {
                                conn.Open();
                            }

                            // Send the soap command.
                            var soapCmd = new SoapMessage()
                            {
                                Head = CommandConstants.CMD_FetchPage
                            };
                            soapCmd.Body = webArgs.ToJson();
                            var dataContainer = conn.SendSoapMessage(soapCmd);
                            if (null != dataContainer && dataContainer.Status == 1)
                            {
                                pageContent = dataContainer.Result;
                            }
                            else
                            {
                                StringBuilder errMsg = new StringBuilder("抓取网页请求失败!参数:");
                                errMsg.Append(soapCmd.Body);
                                if (null != dataContainer && !string.IsNullOrEmpty(dataContainer.ErrorMsg))
                                {
                                    errMsg.Append(";服务端错误消息:")
                                    .Append(dataContainer.ErrorMsg);
                                }
                                throw new Exception(errMsg.ToString());
                            }
                        }
                    }
                }

                // Parse the content string.
                // Note: when no pre-request is needed, the resolver receives an empty pageContent
                // and is responsible for requesting and parsing the address itself.
                if (!string.IsNullOrEmpty(pageContent) || !searchUrl.IsNeedPreRequest)
                {
                    var resolvedModel = resolver.ResolvePageContent(webArgs, pageContent);
                    if (null != resolvedModel)
                    {
                        resolvedModel.KeyWord = webArgs.KeyWord;
                        resolvedModel.IsNeedResolveHeaderTags = webArgs.IsNeedResolveHeaderTags;
                        dataModel = resolvedModel;
                    }
                }
            }
            catch (Exception ex)
            {
                Logger.Error(ex);
            }

            // Store the page result when caching is enabled and there is something to cache.
            if (true == WorkContext.IsFetchPageCacheaAble &&
                null != dataModel &&
                null != dataModel.Products &&
                dataModel.Products.IsNotEmpty())
            {
                int cacheTime = ConfigHelper.AppSettingsConfiguration.GetConfigInt("FetchPageCacheTime");
                if (cacheTime <= 0)
                {
                    cacheTime = 60; // default: cache the page result for 60 seconds
                }
                WorkContext.SetFetchPageResultFromCache(webArgs, dataModel, cacheTime);
            }
            return(dataModel);
        }
Exemple #12
0
 /// <summary>
 /// Parses the content of a search result list page.
 /// </summary>
 /// <param name="webArgs">Arguments describing the search request.</param>
 /// <param name="content">The content to parse.</param>
 /// <returns>A string-keyed dictionary holding the parsed values.</returns>
 public abstract Dictionary <string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content);
Exemple #13
0
 /// <summary>
 /// Default search-address resolution: resolves nothing.
 /// Being virtual, it can be overridden to produce a platform-specific address with parameters.
 /// </summary>
 /// <param name="webArgs">Arguments received from the web request (not read here).</param>
 /// <returns>Always <c>null</c>.</returns>
 public virtual ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
 {
     return null;
 }
Exemple #14
0
        /// <summary>
        /// Builds the Taobao search address from the web arguments.
        /// </summary>
        /// <remarks>
        /// Brands and tags that belong to other platforms are not used as filters; the first of each
        /// is appended to <c>webArgs.KeyWord</c> instead, so the caller's argument object is modified.
        /// Any exception is logged through <c>PluginContext.Logger</c> and the result is then returned
        /// without its <c>Url</c> being set.
        /// </remarks>
        /// <param name="webArgs">Search arguments: keyword, brands, tags, ordering and page index.</param>
        /// <returns>A result object whose <c>Url</c> holds the assembled address on success.</returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();

            try
            {
                // "@###@" is the keyword placeholder; it is substituted after brands and tags have
                // had the chance to extend the keyword.
                StringBuilder sbSearchUrl = new StringBuilder("https://s.taobao.com/search?q=@###@&imgfile=");

                // Accumulates every value that goes into the single "ppath" query parameter.
                string filerValueString = "";

                #region Brands
                if (null != webArgs.Brands && webArgs.Brands.Count > 0)
                {
                    // 1. Brands of the current platform: ids joined with ';'.
                    var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Taobao);
                    if (currentPlatformBrands.Any())
                    {
                        string brandIds = string.Join(";", currentPlatformBrands.Select(x => x.BrandId));
                        filerValueString += brandIds;
                    }

                    // 2. Brands of other platforms: the first one becomes part of the keyword.
                    var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Taobao);
                    if (null != otherPlatformBrands)
                    {
                        webArgs.KeyWord += " " + otherPlatformBrands.BrandName;
                    }
                }
                #endregion

                #region Attribute tags
                if (null != webArgs.TagGroup)
                {
                    // 1. Tags of the current platform (Where never returns null, so no null test).
                    var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Taobao);

                    // 1.1 Category filter: "cat"
                    var catFilter = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "cat");
                    if (null != catFilter)
                    {
                        sbSearchUrl.Append("&cat=").Append(catFilter.Value);
                    }

                    // 1.2 Remaining "ppath" tags
                    var ppathFilter = currentPlatformTag.Where(x => x.FilterFiled == "ppath");
                    if (ppathFilter.Any())
                    {
                        string ppathIds = string.Join(";", ppathFilter.Select(x => x.Value));
                        // Insert the separator only when something precedes it; otherwise the
                        // parameter would start with a stray ';' when no brand id was added.
                        if (filerValueString.Length > 0)
                        {
                            filerValueString += ";";
                        }
                        filerValueString += ppathIds;
                    }

                    // 2. Tags of other platforms: the first one becomes part of the keyword.
                    var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Taobao);
                    if (null != otherPlatformTag)
                    {
                        webArgs.KeyWord += " " + otherPlatformTag.TagName;
                    }
                }

                // Append the collected filter values.
                if (!string.IsNullOrEmpty(filerValueString))
                {
                    sbSearchUrl.Append("&ppath=").Append(filerValueString);
                }
                #endregion

                #region Keyword
                // Substitute the keyword placeholder.
                sbSearchUrl.Replace("@###@", webArgs.KeyWord);
                #endregion

                #region Ordering
                // No sort parameter is sent when the caller supplied no order field.
                if (null != webArgs.OrderFiled)
                {
                    sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);
                }
                #endregion

                #region Filter - price range
                #endregion

                #region Paging
                var pageNumber = webArgs.PageIndex + 1;
                if (pageNumber > 0)
                {
                    // The "s" parameter is sent as page index * 44.
                    sbSearchUrl.Append("&s=").Append(webArgs.PageIndex * 44);
                }
                #endregion

                #region Miscellaneous
                sbSearchUrl.Append("&ie=utf8");
                sbSearchUrl.Append("&js=1");
                sbSearchUrl.Append("&stats_click=search_radio_all%3A1");
                sbSearchUrl.Append("&bcoffset=4");
                sbSearchUrl.Append("&ntoffset=4");
                sbSearchUrl.Append("&p4ppushleft=1%2C48");

                sbSearchUrl.AppendFormat("&initiative_id=staobaoz_{0}", DateTime.Now.ToString("yyyyMMdd"));
                #endregion

                resultUrl.Url = sbSearchUrl.ToString();
            }
            catch (Exception ex)
            {
                PluginContext.Logger.Error(ex);
            }
            return(resultUrl);
        }
Exemple #15
0
        /// <summary>
        /// 执行内容解析
        /// </summary>
        ///<param name="webArgs"></param>
        /// <param name="content">要解析的内容</param>
        /// <returns>返回需要的字段对应的字典</returns>
        public override Dictionary <string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
        {
            var resultBag = new Dictionary <string, object>();

            try
            {
                string jsonData = string.Empty;

                if (content.IndexOf("g_page_config") < 0)
                {
                    return(null);//无效的页面结果数据
                }


                //send request for load other data of first search page
                Task <string> tskSilcedJsonpContent = null;
                if (webArgs.PageIndex == 0)
                {
                    tskSilcedJsonpContent = Task.Factory.StartNew(() =>
                    {
                        string jsonpContent = "";
                        ////1 打开tcp 链接
                        ////2 发送参数
                        ////3 解析结果
                        if (!webArgs.SystemAttachParas.ContainsKey("SoapTcpConnectionString"))
                        {
                            return(jsonpContent);
                        }
                        var connStrConfig = webArgs.SystemAttachParas["SoapTcpConnectionString"] as WebCrawlerConnection;
                        if (null == connStrConfig)
                        {
                            return(jsonpContent);
                        }
                        //重写解析地址-首页的分片jsonp地址
                        string urlOfSlicedJsonp = this.ResolveSlicedSearchPageSilcedUrl(webArgs);
                        webArgs.ResolvedUrl     = new ResolvedSearchUrlWithParas {
                            Url = urlOfSlicedJsonp
                        };
                        using (var conn = new SoapTcpConnection(connStrConfig))
                        {
                            if (conn.State == ConnectionState.Closed)
                            {
                                conn.Open();
                            }

                            //发送soap
                            var soapCmd = new SoapMessage()
                            {
                                Head = CommandConstants.CMD_FetchPage
                            };
                            soapCmd.Body      = JsonConvert.SerializeObject(webArgs);
                            var dataContainer = conn.SendSoapMessage(soapCmd);
                            if (null != dataContainer && dataContainer.Status == 1)
                            {
                                jsonpContent = dataContainer.Result;
                            }
                            else
                            {
                                StringBuilder errMsg = new StringBuilder("抓取网页请求失败!参数:");
                                errMsg.Append(soapCmd.Body);
                                if (null != dataContainer && !string.IsNullOrEmpty(dataContainer.ErrorMsg))
                                {
                                    errMsg.Append(";服务端错误消息:")
                                    .Append(dataContainer.ErrorMsg);
                                }
                                PluginContext.Logger.Error(errMsg.ToString());
                            }
                        }

                        return(jsonpContent);
                    });
                }


                int startPos      = content.IndexOf("g_page_config");
                int endPos        = content.IndexOf("g_srp_loadCss") - startPos;
                var secondContent = content.Substring(startPos, endPos);
                int secStartPos   = secondContent.IndexOf('{');
                int secEndPos     = secondContent.IndexOf("};") - secStartPos + 1;
                jsonData = secondContent.Substring(secStartPos, secEndPos);



                TaobaoPageJsonResut pageJsonObj = JsonConvert.DeserializeObject <TaobaoPageJsonResut>(jsonData);
                if (null == pageJsonObj)
                {
                    return(null);
                }

                if (webArgs.IsNeedResolveHeaderTags == true)
                {
                    var navNode = pageJsonObj.mods.nav;
                    if (null != navNode && null != navNode.data)
                    {
                        var commonNode = navNode.data.common;
                        var advNode    = navNode.data.adv;

                        //解析common节点
                        if (null != commonNode && commonNode.Any())
                        {
                            //1 检测是否有品牌,有的话 解析品牌
                            #region 品牌解析


                            var brandNode = commonNode.FirstOrDefault(x => x.text == "品牌" && x.sub != null);
                            if (null != brandNode && brandNode.sub != null)
                            {
                                var lstBrands = new List <BrandTag>();
                                foreach (var subItem in brandNode.sub)
                                {
                                    var model = new BrandTag();
                                    model.Platform    = SupportPlatformEnum.Taobao;
                                    model.FilterField = "ppath";//使用的过滤字段参数

                                    model.BrandId   = subItem.value;
                                    model.BrandName = subItem.text;
                                    model.CharIndex = PinYin.GetFirstLetter(model.BrandName);
                                    lstBrands.Add(model);
                                }
                                //解析完毕品牌
                                resultBag.Add("Brands", lstBrands);
                            }

                            #endregion
                        }


                        //2其他筛选节点的分析

                        #region tags 解析


                        var lstTags = new List <KeyWordTagGroup>();

                        var otherFilterNode1 = commonNode.Where(x => x.text != "品牌" && x.sub != null);
                        foreach (var itemNode in otherFilterNode1)
                        {
                            //找到归属的组
                            string groupName = itemNode.text;
                            ProcessTags(lstTags, itemNode.sub, groupName);
                        }
                        ////////if (null!= advNode)----高级筛选不要了
                        ////////{
                        ////////    //advNode 的解析
                        ////////    foreach (var itemNode in advNode)
                        ////////    {
                        ////////        //找到归属的组
                        ////////        string groupName = itemNode.text;
                        ////////        ProcessTags(lstTags, itemNode.sub, groupName);
                        ////////    }
                        ////////}

                        resultBag.Add("Tags", lstTags);

                        #endregion
                    }
                }

                #region products  解析
                var lstProducts = new ProductBaseCollection();
                resultBag.Add("Products", lstProducts);

                var itemListNode = pageJsonObj.mods.itemlist;
                if (null != itemListNode && itemListNode.data != null && null != itemListNode.data.auctions)
                {
                    foreach (var itemProduct in itemListNode.data.auctions)
                    {
                        TaobaoProduct modelProduct = this.ResolverProductDom(itemProduct);

                        if (null != modelProduct)
                        {
                            lstProducts.Add(modelProduct);
                        }
                    }
                }

                //淘宝的搜索列表 - 第一页的数据是进行了分片的,在加载html ;36条数据, 后续会进行一次jsonp的请求;加载12条数据
                if (webArgs.PageIndex == 0 && null != tskSilcedJsonpContent)
                {
                    string jsonpContent = tskSilcedJsonpContent.Result;
                    if (!string.IsNullOrEmpty(jsonpContent) && jsonpContent.Contains("API.CustomizedApi"))
                    {
                        int    startIdx         = jsonpContent.IndexOf(':') + 1;
                        int    endIdx           = jsonpContent.Length - startIdx - 3;
                        string pureJsonContent  = jsonpContent.Substring(startIdx, endIdx);
                        var    slicedJsonpResut = JsonConvert.DeserializeObject <TaobaoSlicedJsonpResut>(pureJsonContent);


                        if (null != slicedJsonpResut)
                        {
                            var itemList = slicedJsonpResut.itemlist;
                            if (null != itemList && itemList.auctions != null)
                            {
                                foreach (var itemProduct in itemList.auctions)
                                {
                                    TaobaoProduct modelProduct = this.ResolverProductDom(itemProduct);

                                    if (null != modelProduct)
                                    {
                                        lstProducts.Add(modelProduct);
                                    }
                                }
                            }
                        }
                    }
                }
                #endregion
            }
            catch (Exception ex)
            {
                PluginContext.Logger.Error(ex);
            }
            return(resultBag);// string.Concat("has process input :" + content);
        }
Exemple #16
0
        /// <summary>
        /// Builds the URL of the follow-up JSONP request (https://s.taobao.com/api) that loads the
        /// remaining, "sliced" products of the first Taobao search-result page.
        /// </summary>
        /// <param name="webArgs">
        /// Search arguments (keyword, brands, tag group, sort field).
        /// Note: <c>KeyWord</c> is modified in place — the name of one brand and of one tag that belong
        /// to another platform are appended to it.
        /// </param>
        /// <returns>
        /// The assembled URL. If an exception occurs while assembling, it is logged and the URL built
        /// up to that point is returned (the keyword placeholder may then still be present).
        /// </returns>
        private string ResolveSlicedSearchPageSilcedUrl(BaseFetchWebPageArgument webArgs)
        {
            // "@###@" is the keyword placeholder; it is substituted once all keyword additions are known.
            StringBuilder sbSearchUrl = new StringBuilder("https://s.taobao.com/api?q=@###@&imgfile=");

            try
            {
                // Collects every value that goes into the single "ppath" parameter (brands first, then tags).
                string filerValueString = "";

                #region brands
                if (null != webArgs.Brands && webArgs.Brands.Count > 0)
                {
                    // 1. Brands of the current platform: their ids are joined with ';'.
                    var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Taobao);
                    if (currentPlatformBrands.Any())
                    {
                        filerValueString += string.Join(";", currentPlatformBrands.Select(x => x.BrandId));
                    }

                    // 2. Brands of other platforms: one of them is added to the keyword instead.
                    var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Taobao);
                    if (null != otherPlatformBrands)
                    {
                        webArgs.KeyWord += " " + otherPlatformBrands.BrandName;
                    }
                }
                #endregion

                #region attribute tags
                if (null != webArgs.TagGroup)
                {
                    // 1. Tags of the current platform.
                    var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Taobao);

                    // 1.1 Category filter ("cat") gets its own query parameter.
                    var catFilter = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "cat");
                    if (null != catFilter)
                    {
                        sbSearchUrl.Append("&cat=").Append(catFilter.Value);
                    }

                    // 1.2 All "ppath" tags are merged into the shared ppath value.
                    var ppathFilter = currentPlatformTag.Where(x => x.FilterFiled == "ppath");
                    if (ppathFilter.Any())
                    {
                        string ppathIds = string.Join(";", ppathFilter.Select(x => x.Value));

                        // Only insert the separator when brand ids precede the tag values;
                        // otherwise the parameter would start with a stray ';'.
                        if (!string.IsNullOrEmpty(filerValueString))
                        {
                            filerValueString += ";";
                        }
                        filerValueString += ppathIds;
                    }

                    // 2. Tags of other platforms: one of them is added to the keyword instead.
                    var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Taobao);
                    if (null != otherPlatformTag)
                    {
                        webArgs.KeyWord += " " + otherPlatformTag.TagName;
                    }
                }

                // Emit the combined brand/tag filter.
                if (!string.IsNullOrEmpty(filerValueString))
                {
                    sbSearchUrl.Append("&ppath=").Append(filerValueString);
                }
                #endregion

                #region keyword
                // Substitute the keyword placeholder now that the keyword is final.
                sbSearchUrl.Replace("@###@", webArgs.KeyWord);
                #endregion

                #region sorting
                if (null != webArgs.OrderFiled)
                {
                    sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);
                }
                #endregion

                #region paging
                sbSearchUrl.Append("&s=36");//this must be a constant value  36 !!!!!!
                #endregion

                #region miscellaneous fixed parameters
                string timeToken = JavascriptContext.getUnixTimestamp();
                sbSearchUrl.AppendFormat("&_ksTS={0}_897", timeToken);

                sbSearchUrl.Append("&callback=jsonp2822");
                sbSearchUrl.Append("&m=customized");
                sbSearchUrl.Append("&ps=1");

                sbSearchUrl.Append("&ie=utf8");
                sbSearchUrl.Append("&ajax=true");
                sbSearchUrl.Append("&js=1");
                sbSearchUrl.Append("&p4ppushleft=1,48");
                sbSearchUrl.Append("&stats_click=search_radio_all:1");
                sbSearchUrl.Append("&bcoffset=0");
                sbSearchUrl.Append("&ntoffset=4");
                sbSearchUrl.Append("&rn=ee5b33aee4d18bf96ab0ad083eadc7f0");

                // Invariant culture keeps the date in Gregorian yyyyMMdd digits regardless of the
                // calendar of the thread's current culture.
                sbSearchUrl.AppendFormat(
                    "&initiative_id=staobaoz_{0}",
                    DateTime.Now.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture));
                #endregion
            }
            catch (Exception ex)
            {
                PluginContext.Logger.Error(ex);
            }
            return(sbSearchUrl.ToString());
        }
Exemple #17
0
        /// <summary>
        /// Assembles the Dangdang search URL (http://search.dangdang.com/) from the keyword, brand,
        /// tag, sort and page settings carried by <paramref name="webArgs"/>.
        /// NOTE(review): the end of this method is not visible in this file excerpt; how the built
        /// string is stored in the result and returned cannot be confirmed from here.
        /// </summary>
        /// <param name="webArgs">
        /// Search arguments. <c>KeyWord</c> is modified in place: the name of one brand and of one tag
        /// that belong to another platform are appended to it.
        /// </param>
        /// <returns>A <c>ResolvedSearchUrlWithParas</c> instance (populated in the part of the method that is not visible here).</returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();

            // "@###@" is the keyword placeholder, substituted further down once the keyword is final.
            StringBuilder sbSearchUrl = new StringBuilder("http://search.dangdang.com/?key=@###@");


            #region 品牌
            if (null != webArgs.Brands && webArgs.Brands.Count > 0)
            {
                // 1. Brands of the current platform
                var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Dangdang);
                if (currentPlatformBrands.Any())
                {
                    // Multiple brand ids are separated by '_'
                    string brandIds = string.Join("_", currentPlatformBrands.Select(x => x.BrandId));
                    sbSearchUrl.Append("&att=1:").Append(brandIds);
                }

                // 2. Brands of other platforms: one of them is appended to the keyword instead
                var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Dangdang);
                if (null != otherPlatformBrands)
                {
                    webArgs.KeyWord += " " + otherPlatformBrands.BrandName;
                }
            }
            #endregion

            #region  属性标签
            if (null != webArgs.TagGroup)
            {
                // 1. Tags of the current platform
                var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Dangdang);
                if (currentPlatformTag.Any())
                {
                    // 1.1 Category id
                    var catFilter = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "category_id");
                    if (null != catFilter)
                    {
                        sbSearchUrl.Append("&category_id=").Append(catFilter.Value);
                    }
                    // 1.2 Attributes ("att")
                    var attFilter = currentPlatformTag.Where(x => x.FilterFiled == "att");
                    if (attFilter.Any())
                    {
                        // NOTE(review): the join below enumerates currentPlatformTag, not attFilter, so the
                        // values of category_id / category_path tags end up in "att" as well — confirm
                        // whether attFilter was intended.
                        string attrIds = string.Join("-", currentPlatformTag.Select(x => x.Value));// e.g. &att=1000012:1985-1000012:1986
                        sbSearchUrl.Append("&att=").Append(attrIds);
                    }

                    // 1.3 Category path
                    var catePathFilter = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "category_path");
                    if (null != catePathFilter)
                    {
                        sbSearchUrl.Append("&category_path=").Append(catePathFilter.Value);
                    }
                }
                // 2. A tag of another platform becomes part of the keyword
                // NOTE(review): this compares against Tmall while every other check in this method uses
                // Dangdang — looks like a copy/paste leftover; confirm.
                var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Tmall);
                if (null != otherPlatformTag)
                {
                    webArgs.KeyWord += " " + otherPlatformTag.TagName;
                }
            }
            #endregion

            #region 关键词
            sbSearchUrl.Replace("@###@", webArgs.KeyWord);// substitute the keyword placeholder
            #endregion

            #region  排序
            if (null == webArgs.OrderFiled)
            {
                sbSearchUrl.Append("&sort_type=sort_default");// no sort field given: default sort
            }
            else
            {
                sbSearchUrl.Append("&sort_type=").Append(webArgs.OrderFiled.FieldValue);// sort field supplied by the caller
            }
            #endregion

            #region  筛选-价格区间
            #endregion

            #region  页码

            // The page_index sent is PageIndex + 1.
            sbSearchUrl.Append("&page_index=").Append(webArgs.PageIndex + 1);

            #endregion
            # region 杂项
        /// <summary>
        /// Assembles the Tmall product-search URL for the supplied search arguments.
        /// </summary>
        /// <param name="webArgs">
        /// Keyword, brand and tag filters, sort field and page index of the search.
        /// Its <c>KeyWord</c> is extended in place with the name of one brand and of one tag that
        /// belong to a platform other than Tmall.
        /// </param>
        /// <returns>
        /// A result whose <c>Url</c> carries the assembled address. When assembling throws, the
        /// exception is logged and the result is returned without a <c>Url</c> assigned.
        /// </returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            var resolved = new ResolvedSearchUrlWithParas();

            try
            {
                var address = new StringBuilder("https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000720.1.348abe64rj5JVg");

                // ---- brands ----
                if (webArgs.Brands != null && webArgs.Brands.Count > 0)
                {
                    // Tmall brands are sent as a comma-separated id list.
                    var tmallBrands = webArgs.Brands.Where(b => b.Platform == SupportPlatformEnum.Tmall);
                    if (tmallBrands.Any())
                    {
                        string joinedIds = string.Join(",", tmallBrands.Select(b => b.BrandId));
                        address.Append("&brand=").Append(joinedIds);
                    }

                    // One brand from a foreign platform is folded into the keyword.
                    var foreignBrand = webArgs.Brands.FirstOrDefault(b => b.Platform != SupportPlatformEnum.Tmall);
                    if (foreignBrand != null)
                    {
                        webArgs.KeyWord = webArgs.KeyWord + " " + foreignBrand.BrandName;
                    }
                }

                // ---- attribute tags ----
                if (webArgs.TagGroup != null)
                {
                    // The first Tmall tag becomes the "prop" filter.
                    var tmallTag = webArgs.TagGroup.Tags.FirstOrDefault(t => t.Platform == SupportPlatformEnum.Tmall);
                    if (tmallTag != null)
                    {
                        address.Append("&prop=").Append(tmallTag.Value);
                    }

                    // One tag from a foreign platform is folded into the keyword.
                    var foreignTag = webArgs.TagGroup.Tags.FirstOrDefault(t => t.Platform != SupportPlatformEnum.Tmall);
                    if (foreignTag != null)
                    {
                        webArgs.KeyWord = webArgs.KeyWord + " " + foreignTag.TagName;
                    }
                }

                // ---- keyword (after all additions above) ----
                address.Append("&q=").Append(webArgs.KeyWord);

                // ---- sorting: caller's field when given, otherwise "s" ----
                if (webArgs.OrderFiled != null)
                {
                    address.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);
                }
                else
                {
                    address.Append("&sort=s");
                }

                // ---- paging: only pages after the first carry an offset (page index * 60) ----
                var pageNumber = webArgs.PageIndex;
                if (pageNumber > 0)
                {
                    address.Append("&s=").Append(pageNumber * 60);
                    address.Append("&search_condition=2");
                }

                // ---- fixed trailing parameters ----
                address.Append("&from=mallfp..pc_1_searchbutton")
                       .Append("&type=pc")
                       .Append("&style=g");

                resolved.Url = address.ToString();
            }
            catch (Exception ex)
            {
                PluginContext.Logger.Error(ex);
            }

            return(resolved);
        }
        /// <summary>
        /// Parses the HTML of a Tmall search-result page into brand filters, tag groups and products.
        /// </summary>
        /// <param name="webArgs">
        /// Arguments of the search. <c>IsNeedResolveHeaderTags</c> decides whether brands and tag groups
        /// are extracted; <c>KeyWord</c> is only used in log messages.
        /// </param>
        /// <param name="content">HTML of the search-result page.</param>
        /// <returns>
        /// A dictionary that may contain the keys "Brands" (list of <c>BrandTag</c>), "Tags" (list of
        /// <c>KeyWordTagGroup</c>, in page order) and "Products" (<c>ProductBaseCollection</c>, in page order).
        /// It is empty when the page is recognised as a verification or login page. If parsing throws,
        /// the exception is logged and whatever was collected up to that point is returned.
        /// </returns>
        public override Dictionary <string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
        {
            var resultBag = new Dictionary <string, object>();

            if (!string.IsNullOrEmpty(content))
            {
                // Page text the crawler treats as a spider-verification page.
                if (content.Contains("环境有异常"))
                {
                    PluginContext.Logger.Error("天猫查询被进行蜘蛛验证!关键词:" + webArgs.KeyWord);
                    return(resultBag);
                }
                // The result page was redirected to the login page.
                if (content.Contains("member/login"))
                {
                    PluginContext.Logger.Error("天猫查询结果页面被强制跳转到了登录页!关键词:" + webArgs.KeyWord);
                    return(resultBag);
                }
            }


            try
            {
                // Build the HTML document object.
                HtmlParser htmlParser   = new HtmlParser();
                var        htmlDoc      = htmlParser.Parse(content);
                var        div_AttrsDom = htmlDoc.QuerySelector("div.j_NavAttrs");

                if (webArgs.IsNeedResolveHeaderTags == true && null != div_AttrsDom)
                {
                    #region brands
                    var lstBrands = new List <BrandTag>();
                    var brandDom  = div_AttrsDom.QuerySelector("div.j_Brand");
                    if (null != brandDom)
                    {
                        var brandULDom = brandDom.QuerySelector("div.attrValues>ul");

                        if (null != brandULDom)
                        {
                            // The brand id is taken from the "brand=<digits>" part of each link, e.g. href="?brand=110910&q=..."
                            var regex_MatchBrandId = new Regex(@"brand=(\d+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);
                            var li_ADomArray       = brandULDom.QuerySelectorAll("li>a");
                            foreach (var itemADom in li_ADomArray)
                            {
                                var model = new BrandTag();
                                model.Platform    = SupportPlatformEnum.Tmall;
                                model.FilterField = "brand";// query parameter used to filter by this brand
                                var urlBrand = itemADom.GetAttribute("href");
                                if (!string.IsNullOrEmpty(urlBrand) && urlBrand.Contains("brand="))
                                {
                                    model.BrandId = regex_MatchBrandId.Match(urlBrand).Groups[1].Value;
                                }
                                model.BrandName = itemADom.GetAttribute("title");
                                model.CharIndex = PinYin.GetFirstLetter(model.BrandName);
                                lstBrands.Add(model);
                            }
                        }
                    }
                    resultBag.Add("Brands", lstBrands);
                    #endregion

                    #region tag groups
                    var ulDomArray = div_AttrsDom.QuerySelectorAll("div.attrValues>ul");

                    // When a brand block exists its list is assumed to be the first <ul>; the remaining
                    // lists are categories / attributes.
                    int startIdx = brandDom == null ? 0 : 1;

                    // Guard against a negative size when the brand block exists but no <ul> was matched.
                    int groupCount = ulDomArray.Length > startIdx ? ulDomArray.Length - startIdx : 0;

                    // Each task fills only its own slot, so the groups keep their page order and no
                    // concurrent collection is needed.
                    var tagGroups = new KeyWordTagGroup[groupCount];
                    var taskArray = new Task[groupCount];

                    for (int i = 0; i < groupCount; i++)
                    {
                        int slot = i;// per-iteration copy for the closure

                        taskArray[slot] = Task.Factory.StartNew(() =>
                        {
                            var itemUl = ulDomArray[slot + startIdx];

                            // The group caption sits in div.attrKey, two levels above the <ul>.
                            var attrKeyDom   = itemUl.ParentElement.ParentElement.QuerySelector("div.attrKey");
                            string groupName = "";
                            if (null != attrKeyDom)
                            {
                                groupName = attrKeyDom.TextContent.Replace("\n", "").Trim();
                            }

                            var tagGroup = new KeyWordTagGroup(groupName);

                            var childLiADomArray = itemUl.QuerySelectorAll("li>a");
                            foreach (var itemADom in childLiADomArray)
                            {
                                var modelTag           = new KeyWordTag();
                                modelTag.Platform      = SupportPlatformEnum.Tmall;
                                modelTag.TagName       = itemADom.TextContent.Replace("\n", "");
                                modelTag.GroupShowName = groupName;

                                // The link's query string tells whether the tag filters by "cat" or by "prop".
                                string hrefValue = itemADom.GetAttribute("href");
                                if (!string.IsNullOrEmpty(hrefValue))
                                {
                                    var urlParas = HttpUtility.ParseQueryString(hrefValue, Encoding.UTF8);
                                    if (null != urlParas)
                                    {
                                        if (hrefValue.IndexOf("cat=") > -1)
                                        {
                                            modelTag.FilterFiled = "cat";
                                            modelTag.Value       = urlParas["cat"];
                                        }
                                        else if (hrefValue.IndexOf("prop=") > -1)
                                        {
                                            modelTag.FilterFiled = "prop";
                                            modelTag.Value       = urlParas["prop"];
                                        }
                                    }
                                }
                                tagGroup.Tags.Add(modelTag);
                            }

                            tagGroups[slot] = tagGroup;
                        });
                    }

                    Task.WaitAll(taskArray);
                    resultBag.Add("Tags", tagGroups.ToList());
                    #endregion
                }

                #region products
                var lstProducts = new ProductBaseCollection();

                // data-id -> ordered slot; lets products be parsed in parallel and put back in page order.
                var orderedProducts = new ConcurrentDictionary <string, ProductOrdered <TmallProduct> >();

                var div_J_ItemListDom = htmlDoc.QuerySelector("div#J_ItemList");
                if (null != div_J_ItemListDom)
                {
                    var div_productDomArray = div_J_ItemListDom.QuerySelectorAll("div.product");
                    if (null != div_productDomArray && div_productDomArray.Any())
                    {
                        // Register one slot per product node that carries a data-id, remembering its position.
                        int counter_pid = 0;
                        foreach (var productDom in div_productDomArray)
                        {
                            var itemPid = productDom.GetAttribute("data-id");
                            if (null != itemPid)
                            {
                                orderedProducts.TryAdd(itemPid, new ProductOrdered <TmallProduct> {
                                    UniqKey = itemPid, IndexOrder = counter_pid
                                });
                                counter_pid++;
                            }
                        }

                        // Parse the product nodes in parallel.
                        div_productDomArray.AsParallel()
                        .ForAll((itemProductDom) =>
                        {
                            TmallProduct modelProduct = this.ResolverProductDom(itemProductDom);
                            if (null != modelProduct && modelProduct.ItemId > 0)
                            {
                                // A product whose id has no registered slot is skipped instead of
                                // throwing KeyNotFoundException and aborting the whole page.
                                ProductOrdered <TmallProduct> orderedObj;
                                if (orderedProducts.TryGetValue(modelProduct.ItemId.ToString(), out orderedObj))
                                {
                                    orderedObj.Product = modelProduct;
                                }
                            }
                        });

                        // Restore page order; slots whose product could not be parsed are dropped so
                        // the collection never contains null entries.
                        var productsList = orderedProducts
                                           .Where(x => x.Value != null && x.Value.Product != null)
                                           .OrderBy(x => x.Value.IndexOrder)
                                           .Select(x => x.Value.Product);
                        lstProducts.AddRange(productsList);
                    }
                }
                resultBag.Add("Products", lstProducts);

                #endregion
            }
            catch (Exception ex)
            {
                PluginContext.Logger.Error(ex);
            }
            return(resultBag);
        }
        /// <summary>
        /// Assembles the Suning search URL (https://search.suning.com/{keyword}/...) from the keyword,
        /// brand, tag, sort and page settings carried by <paramref name="webArgs"/>.
        /// NOTE(review): the end of this method is not visible in this file excerpt; how the built
        /// string is stored in the result and returned cannot be confirmed from here.
        /// </summary>
        /// <param name="webArgs">
        /// Search arguments. <c>KeyWord</c> is modified in place: the name of one brand and of one tag
        /// that belong to another platform are appended to it.
        /// </param>
        /// <returns>A <c>ResolvedSearchUrlWithParas</c> instance with <c>IsNeedPreRequest</c> set to false.</returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();

            resultUrl.IsNeedPreRequest = false;// Suning's search page and its data list are separate; content is requested directly during parsing, so no pre-request is needed


            // "@###@" is the keyword placeholder, substituted further down once the keyword is final.
            StringBuilder sbSearchUrl = new StringBuilder("https://search.suning.com/@###@/");



            #region 品牌

            if (null != webArgs.Brands && webArgs.Brands.Count > 0)
            {
                // 1. Brands of the current platform
                var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Suning);
                if (currentPlatformBrands.Any())
                {
                    // Brand names (not ids) are joined with ';'.
                    // NOTE(review): the previous comment here described concatenated ids for another
                    // platform (Guomei) and did not match this code.
                    string brandNames = string.Join(";", currentPlatformBrands.Select(x => x.BrandName));


                    sbSearchUrl.Append("&hf=brand_Name_FacetAll:").Append(brandNames);
                }

                // 2. Brands of other platforms: one of them is appended to the keyword instead
                var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Suning);
                if (null != otherPlatformBrands)
                {
                    webArgs.KeyWord += " " + otherPlatformBrands.BrandName;
                }
            }
            #endregion



            #region  属性标签
            if (null != webArgs.TagGroup)
            {
                // 1. Tags of the current platform
                var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Suning);
                if (currentPlatformTag.Any())
                {
                    #region 分类
                    // Category id ("ci") gets its own parameter.
                    var catIdTag = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "ci");
                    if (null != catIdTag)
                    {
                        sbSearchUrl.Append("&ci=").Append(catIdTag.Value);
                    }
                    #endregion

                    // Example URL: https://search.suning.com/羽绒服/&iy=0&sc=0&hf=solr_13696_attrId:收腰型;常规&st=0#search-path-box
                    // NOTE(review): the values of ALL current-platform tags (including a "ci" tag, if any)
                    // are joined with ';' and prefixed with the FilterFiled of the first tag only —
                    // confirm this is intended when tags with different filter fields are mixed.
                    string attrIds = string.Join(";", currentPlatformTag.Select(x => x.Value));
                    sbSearchUrl.Append("&cf=")
                    .Append(currentPlatformTag.First().FilterFiled)
                    .Append("_attrId:")
                    .Append(attrIds);
                }

                // 2. A tag of another platform becomes part of the keyword
                var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Suning);
                if (null != otherPlatformTag)
                {
                    webArgs.KeyWord += " " + otherPlatformTag.TagName;
                }
            }


            #endregion

            #region 关键词
            sbSearchUrl.Replace("@###@", webArgs.KeyWord);// substitute the keyword placeholder
            #endregion

            #region  排序
            if (null == webArgs.OrderFiled)
            {
                sbSearchUrl.Append("&st=0");// no sort field given: default sort
            }
            else
            {
                sbSearchUrl.Append("&st=").Append(webArgs.OrderFiled.FieldValue);// sort field supplied by the caller
            }
            #endregion

            #region  筛选-价格区间
            #endregion

            #region  页码

            // PageIndex is appended unchanged as the "cp" parameter.
            sbSearchUrl.Append("&cp=").Append(webArgs.PageIndex);

            #endregion
            # region 杂项
Exemple #21
0
        /// <summary>
        /// Builds the Gome (Guomei) search URL for the given fetch arguments.
        /// Starts from a template URL with a keyword placeholder, then appends brand,
        /// category/attribute ("facets"), sort and page parameters.
        /// NOTE(review): this method mutates <c>webArgs.KeyWord</c> — a brand name and a tag name
        /// from other platforms are appended to it, so repeated calls with the same
        /// argument object keep growing the keyword. Confirm callers expect this.
        /// </summary>
        /// <param name="webArgs">Fetch arguments; Brands, TagGroup, KeyWord, OrderFiled and PageIndex are read here.</param>
        /// <returns>
        /// A <c>ResolvedSearchUrlWithParas</c>. Only the assignment of <c>IsNeedPreRequest = false</c> is
        /// visible in this excerpt; how the built URL is stored on the result is not shown here.
        /// </returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();

            resultUrl.IsNeedPreRequest = false;//Gome's search page and its data list are served separately, so no preliminary HTML request is needed


            // "@###@" is the keyword placeholder; it is replaced in the keyword region below.
            StringBuilder sbSearchUrl = new StringBuilder("https://search.gome.com.cn/search?question=@###@");



            #region 品牌
            // Brand filters. facetsString accumulates brand ids here and attribute values later;
            // it is emitted once as the "facets" parameter.
            string facetsString = string.Empty;
            if (null != webArgs.Brands && webArgs.Brands.Count > 0)
            {
                //1) Brands that belong to the current platform (Guomei)
                var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Guomei);
                if (currentPlatformBrands.Any())
                {
                    //sbSearchUrl.Append("&pzpq=0");
                    //sbSearchUrl.Append("&pzin=v4");

                    //Multiple brands: the ids are concatenated with no separator. Per the original author, Gome brand ids are 4-character encoded codes.
                    string brandIds = string.Join("", currentPlatformBrands.Select(x => x.BrandId));
                    //sbSearchUrl.Append("&facets=").Append(brandIds);
                    facetsString += brandIds;

                    //When a brand filter is present the Gome front end adds an extra "intcmp" parameter; per the original author it has no real effect, so it is hard-coded.
                    sbSearchUrl.Append("&intcmp=search-9000001100-1");
                }

                //2) Brands from other platforms: take the first one and append its name to the keyword, separated by a space
                var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Guomei);
                if (null != otherPlatformBrands)
                {
                    webArgs.KeyWord += " " + otherPlatformBrands.BrandName;
                }
            }
            #endregion



            #region  属性标签
            // Attribute tags.
            // NOTE(review): webArgs.TagGroup.Tags is dereferenced without a null check — confirm it is always initialized.
            if (null != webArgs.TagGroup)
            {
                //1) Tags that belong to the current platform (Guomei)
                var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Guomei);
                if (currentPlatformTag.Any())
                {
                    #region 分类
                    // Category: a tag whose FilterFiled is "catId" is emitted as its own query parameter.
                    var catIdTag = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "catId");
                    if (null != catIdTag)
                    {
                        sbSearchUrl.Append("&catId=").Append(catIdTag.Value);
                    }
                    #endregion

                    // NOTE(review): the catId tag's Value is not excluded here, so it is also concatenated into facets — confirm that is intended.
                    string attrIds = string.Join("", currentPlatformTag.Select(x => x.Value));//concatenation of the facet ids
                    facetsString += attrIds;
                }

                //2) A tag from another platform: the first one's name becomes part of the keyword
                var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Guomei);
                if (null != otherPlatformTag)
                {
                    webArgs.KeyWord += " " + otherPlatformTag.TagName;
                }
            }

            // Emit brand ids and attribute values together as a single parameter, only when something was collected.
            if (!string.IsNullOrEmpty(facetsString))
            {
                sbSearchUrl.Append("&facets=").Append(facetsString);//per the original author, Gome parses all attributes out of this one parameter as 4-character string values
            }
            #endregion

            #region 关键词
            // Keyword.
            // NOTE(review): the keyword is inserted as-is, with no URL encoding in this method — confirm whether it is encoded elsewhere.
            sbSearchUrl.Replace("@###@", webArgs.KeyWord);//replace the keyword placeholder with the (possibly extended) keyword
            #endregion

            #region  排序
            // Sorting.
            if (null == webArgs.OrderFiled)
            {
                sbSearchUrl.Append("&sort=00");//default: comprehensive sort
            }
            else
            {
                sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);//use the sort value supplied by the caller
            }
            #endregion

            #region  筛选-价格区间
            // Price-range filter: nothing is emitted for it here.
            #endregion

            #region  页码
            // Page number: the "page" parameter is PageIndex + 1 (presumably PageIndex is zero-based — confirm).

            sbSearchUrl.Append("&page=").Append(webArgs.PageIndex + 1);

            #endregion
            # region 杂项
Exemple #22
0
        /// <summary>
        /// Stores the resolved result of a fetched page in the cache, using
        /// <c>webArgs.CacheKey</c> as the cache key.
        /// </summary>
        /// <param name="webArgs">Fetch arguments whose <c>CacheKey</c> identifies the cache entry. Must not be null.</param>
        /// <param name="reultModel">The resolved page result to cache.</param>
        /// <param name="timeOut">Expiry passed through to <c>RedisClient.SetAsync</c>, in seconds; defaults to 30.</param>
        /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="webArgs"/> is null.</exception>
        public static void SetFetchPageResultFromCache(BaseFetchWebPageArgument webArgs, SearchProductViewModel reultModel, int timeOut = 30)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException on webArgs.CacheKey.
            if (null == webArgs)
            {
                throw new System.ArgumentNullException("webArgs");
            }

            // NOTE(review): the result of SetAsync is not observed here; if it returns a Task,
            // a failed cache write goes unnoticed by the caller — confirm that best-effort caching is intended.
            RedisClient.SetAsync(webArgs.CacheKey, reultModel, timeOut);
        }
        /// <summary>
        /// 解析搜索地址
        /// </summary>
        /// <param name="webArgs"></param>
        /// <returns></returns>
        public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
        {
            ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();


            StringBuilder sbSearchUrl = new StringBuilder("https://search.jd.com/Search?keyword=@###@&enc=utf-8&wq=@###@");


            #region 品牌 规格 分类 都在参数ev 中
            //例如:exbrand_娇兰(Guerlain)||NARS^1107_82376||8240^
            string paraBrandAndSkusEv = "";
            if (null != webArgs.Brands && webArgs.Brands.Count > 0)
            {
                //1 当前平台的品牌
                var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Jingdong);
                if (currentPlatformBrands.Any())
                {
                    //多个品牌用 , 号分割
                    string brandNames = string.Join("||", currentPlatformBrands.Select(x => x.BrandName));
                    paraBrandAndSkusEv = string.Concat("exbrand_", brandNames, "^");
                }

                //2 非当前平台的品牌--选择其中的一个 作为关键词 分割
                var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Jingdong);
                if (null != otherPlatformBrands)
                {
                    webArgs.KeyWord += " " + otherPlatformBrands.BrandName;
                }
            }
            #endregion

            #region  属性标签
            if (null != webArgs.TagGroup)
            {
                //1 当前平台的
                var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Jingdong);
                if (null != currentPlatformTag)
                {
                    //归属科目 cid2
                    var cid2Para = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "cid2");
                    if (null != cid2Para)
                    {
                        sbSearchUrl.Append("&cid2=").Append(cid2Para.Value);
                    }
                    //归属科目 cid3
                    var cid3Para = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "cid3");
                    if (null != cid3Para)
                    {
                        sbSearchUrl.Append("&cid3=").Append(cid3Para.Value);
                    }

                    var    tagGroup = currentPlatformTag.GroupBy(x => x.FilterFiled);
                    string skuAttrs = "";
                    foreach (var itemGroup in tagGroup)
                    {
                        string key    = itemGroup.Key + "_";//属性_
                        string values = string.Join("||", itemGroup.Select(x => x.Value));
                        skuAttrs += string.Concat(key, values);
                        skuAttrs += "^";
                    }
                    paraBrandAndSkusEv += skuAttrs;
                }
                if (!string.IsNullOrEmpty(paraBrandAndSkusEv))
                {
                    sbSearchUrl.Append("&ev=").Append(paraBrandAndSkusEv);
                }
                //2 其他平台的tag 作为关键词的一部分
                var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Jingdong);
                if (null != otherPlatformTag)
                {
                    webArgs.KeyWord += " " + otherPlatformTag.TagName;
                }
            }
            #endregion

            #region 关键词
            sbSearchUrl.Replace("@###@", webArgs.KeyWord);
            #endregion

            #region  排序
            if (null != webArgs.OrderFiled && webArgs.OrderFiled.Rule != OrderRule.Default)
            {
                sbSearchUrl.Append("&psort=").Append(webArgs.OrderFiled.FieldValue);
            }
            #endregion

            #region  筛选-价格区间
            #endregion

            #region  页码
            int pageNumber = (webArgs.PageIndex * 2) + 1;//京东每页分割为2个子页,按照页索引0开始,倍乘2,然后加1 为正确的页码
            sbSearchUrl.Append("&page=").Append(pageNumber);

            //京东前后翻页的时候 需要这个s 参数,前为prev 参数 ,后翻为next 参数
            if (null != webArgs.AttachParas && webArgs.AttachParas.ContainsKey("jd_pager_s"))
            {
                sbSearchUrl.Append("&s=").Append(webArgs.AttachParas["jd_pager_s"]);
            }
            #endregion
            # region 杂项