/// <summary>
/// Parses fetched page content into a search view model by delegating the
/// actual parsing to the plugin returned by <c>GetNeedPluginInstance</c>.
/// </summary>
/// <param name="webArgs">The search arguments that originated from the web request.</param>
/// <param name="pageContent">The raw page content to parse.</param>
/// <returns>
/// A view model filled from the plugin's result bag. "Brands" and "Tags" are
/// copied only when <c>webArgs.IsNeedResolveHeaderTags</c> is true; "Products"
/// is copied whenever the key is present.
/// </returns>
/// <exception cref="Exception">The plugin did not return a result dictionary.</exception>
public virtual SearchProductViewModel ResolvePageContent(BaseFetchWebPageArgument webArgs, string pageContent)
{
    SearchProductViewModel viewModel = new SearchProductViewModel();

    // Load the required plugin and let it do the parsing.
    IPlugin plugin = this.GetNeedPluginInstance();
    var parsed = plugin.ResolveSearchPageContent(webArgs, pageContent) as Dictionary<string, object>;
    if (parsed == null)
    {
        throw new Exception("插件:" + NeedPluginName + " ;未能正确解析内容:" + pageContent);
    }

    object rawValue;

    if (webArgs.IsNeedResolveHeaderTags == true)
    {
        if (parsed.TryGetValue("Brands", out rawValue))
        {
            viewModel.Brands = rawValue as List<BrandTag>;
        }

        if (parsed.TryGetValue("Tags", out rawValue))
        {
            viewModel.Tags = rawValue as List<KeyWordTagGroup>;
        }
    }

    if (parsed.TryGetValue("Products", out rawValue))
    {
        viewModel.Products = rawValue as ProductBaseCollection;
    }

    return viewModel;
}
/// <summary>
/// Builds a sample <c>BaseFetchWebPageArgument</c> for the Tmall platform with
/// one attached parameter, one empty brand, one empty tag and an ascending
/// order field.
/// </summary>
/// <returns>The populated sample argument.</returns>
public BaseFetchWebPageArgument Get()
{
    return new BaseFetchWebPageArgument
    {
        Platform = SupportPlatformEnum.Tmall,
        AttachParas = new Dictionary<string, object>
        {
            { "key-1", 1 }
        },
        Brands = new List<BrandTag>
        {
            new BrandTag
            {
                BrandId = "",
                BrandName = "",
                CharIndex = "",
                FilterField = "",
                IconUrl = "",
                Platform = SupportPlatformEnum.Tmall
            }
        },
        TagGroup = new KeyWordTagGroup
        {
            Tags = new List<KeyWordTag>
            {
                new KeyWordTag
                {
                    Platform = SupportPlatformEnum.Tmall,
                    TagName = "",
                    FilterFiled = "",
                    GroupShowName = "",
                    Value = ""
                }
            }
        },
        OrderFiled = new OrderField
        {
            DisplayName = "",
            FieldValue = "",
            Rule = OrderRule.ASC
        }
    };
}
/// <summary>
/// Returns a fixed, hard-coded parse result (one tag group and one product).
/// The incoming arguments and content are not inspected.
/// </summary>
/// <param name="webArgs">The search arguments (unused).</param>
/// <param name="content">The content to parse (unused).</param>
/// <returns>
/// A dictionary with a "Tags" entry (<c>List&lt;KeyWordTagGroup&gt;</c>) and a
/// "Products" entry (<c>ProductBaseCollection</c>).
/// </returns>
public override Dictionary<string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
{
    var resultBag = new Dictionary<string, object>();

    // 1. Tags. Published as a list of tag groups: the consumer reads "Tags" with
    //    "as List<KeyWordTagGroup>", so a bare List<KeyWordTag> would be cast to
    //    null and silently dropped.
    var lstTags = new List<KeyWordTagGroup>
    {
        new KeyWordTagGroup
        {
            Tags = new List<KeyWordTag>
            {
                new KeyWordTag
                {
                    Platform = NTCPMessage.EntityPackage.SupportPlatformEnum.ETao,
                    TagName = "大衣",
                    Value = "dayi",
                    FilterFiled = "sku"
                }
            }
        }
    };
    resultBag.Add("Tags", lstTags);

    // 2. Products.
    var lstProducts = new ProductBaseCollection()
    {
        new ETaoProduct { ItemId = 1, Title = "测试大衣" }
    };
    resultBag.Add("Products", lstProducts);

    return resultBag;
}
/// <summary>
/// Reads a previously fetched page result from the cache.
/// </summary>
/// <param name="webArgs">The search arguments; their <c>CacheKey</c> is used as the lookup key.</param>
/// <returns>Whatever <c>RedisClient.Get</c> returns for that key.</returns>
public static SearchProductViewModel GetFetchPageResultFromCache(BaseFetchWebPageArgument webArgs)
{
    return RedisClient.Get<SearchProductViewModel>(webArgs.CacheKey);
}
/// <summary> /// 解析搜索地址 /// </summary> /// <param name="webArgs"></param> /// <returns></returns> public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs) { ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas(); StringBuilder sbSearchUrl = new StringBuilder("http://apiv4.yangkeduo.com/search?q=@###@"); #region 品牌 if (null != webArgs.Brands && webArgs.Brands.Count > 0) { //2 非当前平台的品牌--选择其中的一个 作为关键词 分割 var otherPlatformBrands = webArgs.Brands.FirstOrDefault(); if (null != otherPlatformBrands) { webArgs.KeyWord += " " + otherPlatformBrands.BrandName; } } #endregion #region 属性标签 if (null != webArgs.TagGroup) { //2 其他平台的tag 作为关键词的一部分 var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(); if (null != otherPlatformTag) { webArgs.KeyWord += " " + otherPlatformTag.TagName; } } #endregion #region 关键词 sbSearchUrl.Replace("@###@", webArgs.KeyWord); #endregion #region 排序 if (null == webArgs.OrderFiled || webArgs.OrderFiled.Rule == OrderRule.Default) { sbSearchUrl.Append("&sort=default");//默认综合排序 } else { sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);//默认综合排序 } #endregion #region 筛选-价格区间 #endregion #region 页码 sbSearchUrl.Append("&page=").Append(webArgs.PageIndex + 1); sbSearchUrl.Append("&size=50"); #endregion # region 杂项
/// <summary>
/// Resolves the web arguments into a platform-specific search address (with
/// its attached parameters) by delegating to the required plugin.
/// </summary>
/// <param name="webArgs">The search arguments that originated from the web request.</param>
/// <returns>The value produced by the plugin's <c>ResolveSearchUrl</c>.</returns>
public virtual ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
{
    // Load the required plugin and let it resolve the address.
    IPlugin plugin = this.GetNeedPluginInstance();
    return plugin.ResolveSearchUrl(webArgs);
}
/// <summary> /// 解析搜索地址 /// </summary> /// <param name="webArgs"></param> /// <returns></returns> public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs) { ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas(); var timeToken = JavascriptContext.getUnixTimestamp(); //http://list.mogujie.com/search?callback=jQuery211013398370030336082_{0}&_version=8193&q=%E5%8F%A3%E7%BA%A2&cKey=43&minPrice=&_mgjuuid=66b111f4-e6ce-4b8b-bf0c-311fa8cf0c31&ppath=&page=1&maxPrice=&sort=pop&userId=&cpc_offset=&ratio=2%3A3&_=1500446274789 StringBuilder sbSearchUrl = new StringBuilder(string.Format("http://list.mogujie.com/search?callback=jQuery211013398370030336082_{0}&_version=8193&q=@###@&cKey=43", timeToken)); #region 属性 分类 都在参数 ppath 中 标签 if (null != webArgs.TagGroup) { //1 当前平台的 var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Mogujie); if (null != currentPlatformTag) { var dicPara = new Dictionary <string, string>(); foreach (var item in currentPlatformTag) { dicPara.Add(item.FilterFiled, item.Value); } //将参数序列化为json sbSearchUrl.Append("&ppath=").Append(JsonConvert.SerializeObject(dicPara)); } //2 其他平台的tag 作为关键词的一部分 var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Mogujie); if (null != otherPlatformTag) { webArgs.KeyWord += " " + otherPlatformTag.TagName; } } #endregion #region 关键词 sbSearchUrl.Replace("@###@", webArgs.KeyWord); #endregion #region 排序 if (null != webArgs.OrderFiled) { sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue); } #endregion #region 筛选-价格区间 sbSearchUrl.Append("&minPrice="); sbSearchUrl.Append("&maxPrice="); #endregion #region 页码 sbSearchUrl.Append("&page=").Append(webArgs.PageIndex + 1); #endregion # region 杂项
/// <summary> /// 解析搜索地址 /// </summary> /// <param name="webArgs"></param> /// <returns></returns> public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs) { ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas(); //http://www.meilishuo.com/search/goods?page=1&searchKey=围巾&ppath={"2048":"10112"}&cpc_offset=0 StringBuilder sbSearchUrl = new StringBuilder("http://www.meilishuo.com/search/goods?searchKey=@###@"); #region 属性 分类 都在参数 ppath 中 标签 if (null != webArgs.TagGroup) { //1 当前平台的 var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Meilishuo); if (null != currentPlatformTag) { var dicPara = new Dictionary <string, string>(); foreach (var item in currentPlatformTag) { dicPara.Add(item.FilterFiled, item.Value); } //将参数序列化为json sbSearchUrl.Append("&ppath=").Append(JsonConvert.SerializeObject(dicPara)); } //2 其他平台的tag 作为关键词的一部分 var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Meilishuo); if (null != otherPlatformTag) { webArgs.KeyWord += " " + otherPlatformTag.TagName; } } #endregion #region 关键词 sbSearchUrl.Replace("@###@", webArgs.KeyWord); #endregion #region 排序 if (null != webArgs.OrderFiled) { sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue); } #endregion #region 筛选-价格区间 #endregion #region 页码 sbSearchUrl.Append("&page=").Append(webArgs.PageIndex + 1); #endregion # region 杂项
/// <summary>
/// Resolves the search request into three JSON-serialized POST parameters
/// ("para_brand", "para_categoryTree", "para_searchList") stored in
/// <c>ParasPost</c>. No URL is set on the result.
/// </summary>
/// <param name="webArgs">The search arguments (keyword, page index, sort field, brands, tags).</param>
/// <returns>A result whose <c>ParasPost</c> holds the serialized parameter models.</returns>
public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
{
    ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();
    resultUrl.ParasPost = new Dictionary<string, object>();
    var urlParaContainer = resultUrl.ParasPost;

    // 1. Brand query.
    var brandParaModel = new VipSearchParaBrand(webArgs.KeyWord);
    urlParaContainer.Add("para_brand", JsonConvert.SerializeObject(brandParaModel));

    // 2. Category tree query.
    var categoryTreeParaModel = new VipSearchParaCategoryTree(webArgs.KeyWord);
    urlParaContainer.Add("para_categoryTree", JsonConvert.SerializeObject(categoryTreeParaModel));

    // 3. Search list query.
    var searchListParaModel = new VipSearchParaSearchList(webArgs.KeyWord);

    // Paging: the request page number is the zero-based index plus one.
    searchListParaModel.paramsDetails.np = webArgs.PageIndex + 1;

    // Sorting: a missing order field or a non-numeric value falls back to 0
    // instead of throwing a NullReferenceException.
    int tempSort = 0;
    if (null != webArgs.OrderFiled)
    {
        int.TryParse(webArgs.OrderFiled.FieldValue, out tempSort);
    }
    searchListParaModel.paramsDetails.sort = tempSort;

    // Brands.
    if (null != webArgs.Brands && webArgs.Brands.Any())
    {
        searchListParaModel.paramsDetails.brand_store_sn = string.Join(",", webArgs.Brands.Select(x => x.BrandId));
    }

    // Categories and specifications, selected by each tag's filter field.
    if (null != webArgs.TagGroup && null != webArgs.TagGroup.Tags)
    {
        var tags = webArgs.TagGroup.Tags;

        // Categories.
        searchListParaModel.paramsDetails.category_id_1_5_show =
            string.Join(",", tags.Where(x => x.FilterFiled == "category_id_1_5_show").Select(x => x.Value));
        searchListParaModel.paramsDetails.category_id_1_show =
            string.Join(",", tags.Where(x => x.FilterFiled == "category_id_1_show").Select(x => x.Value));
        searchListParaModel.paramsDetails.category_id_2_show =
            string.Join(",", tags.Where(x => x.FilterFiled == "category_id_2_show").Select(x => x.Value));
        searchListParaModel.paramsDetails.category_id_3_show =
            string.Join(",", tags.Where(x => x.FilterFiled == "category_id_3_show").Select(x => x.Value));

        // Specifications are joined with ';' rather than ','.
        searchListParaModel.paramsDetails.props =
            string.Join(";", tags.Where(x => x.FilterFiled == "props").Select(x => x.Value));
    }

    urlParaContainer.Add("para_searchList", JsonConvert.SerializeObject(searchListParaModel));

    return resultUrl;
}
/// <summary> /// 解析搜索地址 /// </summary> /// <param name="webArgs"></param> /// <returns></returns> public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs) { ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas(); StringBuilder sbSearchUrl = new StringBuilder("http://search.yhd.com/c0-0/"); #region 品牌 //string brandString = "mbname"; if (null != webArgs.Brands && webArgs.Brands.Count > 0) { //1 当前平台的品牌 var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Yhd); if (currentPlatformBrands.Any()) { //http://search.yhd.com/c0-0/mbname金龙鱼,十月稻田-b9429,15840/ //多个品牌 string brandNames = string.Join(",", currentPlatformBrands.Select(x => x.BrandName)); string brandIds = string.Join(",", currentPlatformBrands.Select(x => x.BrandId)); sbSearchUrl.Append("mbname").Append(brandNames).Append("-").Append(brandIds).Append("/"); } else { sbSearchUrl.Append("mbname-b/"); } //2 非当前平台的品牌--选择其中的一个 作为关键词 分割 var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Yhd); if (null != otherPlatformBrands) { webArgs.KeyWord += " " + otherPlatformBrands.BrandName; } } #endregion #region 属性标签 string attrString = "a"; if (null != webArgs.TagGroup) { //1 当前平台的 var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Yhd); if (currentPlatformTag.Any()) { //http://search.yhd.com/c0-0/mbname十月稻田-b15840/a83213||83214::1916_268464519||268472939::268435461 var attrIdGroups = currentPlatformTag.GroupBy(x => x.FilterFiled); //string.Join("-", currentPlatformTag.Select(x => x.Value));//&att=1000012:1985-1000012:1986 foreach (var gp in attrIdGroups) { string attrIds = string.Join("||", gp.Select(x => x.Value)); attrString += string.Concat(attrIds, "::", gp.Key); } } //2 其他平台的tag 作为关键词的一部分 var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Yhd); if (null != otherPlatformTag) { webArgs.KeyWord += " " + 
otherPlatformTag.TagName; } } sbSearchUrl.Append(attrString); //http://search.yhd.com/c0-0/mbname十月稻田-b15840/a83213||83214::1916_268464519||268472939::268435461-s1-v4-p1-price-d0-f0-m1-rt0-pid-mid0-color-size-k大米/#page=1&sort=5 sbSearchUrl.Append("-s1-v4-p2-price-d0-f0-m1-rt0-pid-mid0-color-size-");//p1:不开启; p2:开启 控制后面的分页参数是否启用 #endregion #region 关键词 sbSearchUrl.AppendFormat("k{0}", webArgs.KeyWord);//将关键词的占位符 进行替换 #endregion #region 页码 sbSearchUrl.Append("#page=").Append(webArgs.PageIndex + 1); #endregion #region 排序 if (null == webArgs.OrderFiled) { sbSearchUrl.Append("&sort=1");//默认综合排序 } else { sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);//默认综合排序 } #endregion #region 筛选-价格区间 #endregion # region 杂项
/// <summary>
/// Queries the target platform for the products matching the keyword and
/// filter conditions carried by <paramref name="webArgs"/>.
/// </summary>
/// <remarks>
/// Flow: register the keyword as a hot word, try the page-result cache (when
/// enabled), resolve the platform search address, optionally pre-fetch the page
/// over a SOAP/TCP crawler connection, parse the content, and finally cache a
/// non-empty result. Exceptions raised during this flow are logged, not rethrown.
/// </remarks>
/// <param name="webArgs">The search arguments.</param>
/// <returns>
/// The resolved view model. An empty (never null) model is returned when the
/// arguments are invalid or when fetching/parsing fails.
/// </returns>
public SearchProductViewModel QueryProductsByKeyWords(BaseFetchWebPageArgument webArgs)
{
    SearchProductViewModel dataModel = new SearchProductViewModel();
    if (webArgs.IsValid() == false)
    {
        return dataModel;
    }

    try
    {
        // Register the search keyword with the hot-word service.
        HotWordService.AddWord(webArgs.KeyWord);

        // When page caching is enabled, serve the result from the cache if present.
        // A separate local is used so a cache miss does not replace the empty
        // fallback model with null.
        if (true == WorkContext.IsFetchPageCacheaAble)
        {
            var cachedModel = WorkContext.GetFetchPageResultFromCache(webArgs);
            if (null != cachedModel)
            {
                return cachedModel;
            }
        }

        // Factory: get the content resolver for the requested platform.
        var resolver = ResolverFactory.GetSearchProductResolver(webArgs.Platform);

        // Resolve the search address for the page arguments. A resolver may
        // return null; fail with an explicit message rather than a
        // NullReferenceException further down.
        var searchUrl = resolver.ResolveSearchUrl(webArgs);
        if (null == searchUrl)
        {
            throw new Exception("未能解析搜索地址!平台:" + webArgs.Platform);
        }
        webArgs.ResolvedUrl = searchUrl;

        string pageContent = string.Empty;

        using (var connMgr = new WebCrawlerConnConfigManager())
        {
            var connStrConfig = connMgr.Connection;

            // Expose the connection config to later stages through the attached parameters.
            webArgs.SystemAttachParas["SoapTcpConnectionString"] = connStrConfig;

            if (searchUrl.IsNeedPreRequest == true)
            {
                // 1. open the TCP connection, 2. send the arguments, 3. read the result.
                using (var conn = new SoapTcpConnection(connStrConfig))
                {
                    if (conn.State == ConnectionState.Closed)
                    {
                        conn.Open();
                    }

                    var soapCmd = new SoapMessage() { Head = CommandConstants.CMD_FetchPage };
                    soapCmd.Body = webArgs.ToJson();
                    var dataContainer = conn.SendSoapMessage(soapCmd);

                    if (null != dataContainer && dataContainer.Status == 1)
                    {
                        pageContent = dataContainer.Result;
                    }
                    else
                    {
                        StringBuilder errMsg = new StringBuilder("抓取网页请求失败!参数:");
                        errMsg.Append(soapCmd.Body);
                        if (null != dataContainer && !string.IsNullOrEmpty(dataContainer.ErrorMsg))
                        {
                            errMsg.Append(";服务端错误消息:")
                            .Append(dataContainer.ErrorMsg);
                        }
                        throw new Exception(errMsg.ToString());
                    }
                }
            }
        }

        // Parse the content. Note: when no pre-request is needed, the resolver
        // itself is responsible for requesting and parsing the address.
        if (!string.IsNullOrEmpty(pageContent) || !searchUrl.IsNeedPreRequest)
        {
            var resolvedModel = resolver.ResolvePageContent(webArgs, pageContent);
            if (null != resolvedModel)
            {
                resolvedModel.KeyWord = webArgs.KeyWord;
                resolvedModel.IsNeedResolveHeaderTags = webArgs.IsNeedResolveHeaderTags;
                dataModel = resolvedModel;
            }
        }
    }
    catch (Exception ex)
    {
        Logger.Error(ex);
    }

    // Cache the page result when caching is enabled and there is something to cache.
    if (true == WorkContext.IsFetchPageCacheaAble
        && null != dataModel
        && null != dataModel.Products
        && dataModel.Products.IsNotEmpty())
    {
        int cacheTime = ConfigHelper.AppSettingsConfiguration.GetConfigInt("FetchPageCacheTime");
        if (cacheTime <= 0)
        {
            cacheTime = 60; // default: cache the page result for 60 seconds
        }
        WorkContext.SetFetchPageResultFromCache(webArgs, dataModel, cacheTime);
    }

    return dataModel;
}
/// <summary>
/// Parses the content of a search result list page.
/// </summary>
/// <param name="webArgs">The search arguments the content was fetched for.</param>
/// <param name="content">The content to parse.</param>
/// <returns>
/// A dictionary of the parsed fields keyed by field name. Implementations in
/// this codebase use keys such as "Brands", "Tags" and "Products".
/// </returns>
public abstract Dictionary <string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content);
/// <summary>
/// Resolves the web arguments into a platform-specific search address with its
/// attached parameters. This default implementation resolves nothing.
/// </summary>
/// <param name="webArgs">The search arguments that originated from the web request.</param>
/// <returns>Always null; overrides supply the real address.</returns>
public virtual ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
{
    return null;
}
/// <summary>
/// Builds the Taobao search URL for the given arguments.
/// </summary>
/// <remarks>
/// Side effect: the name of the first brand and the first tag that belong to
/// another platform are appended to <c>webArgs.KeyWord</c>.
/// Any exception is logged and a result without a URL is returned.
/// </remarks>
/// <param name="webArgs">The search arguments (keyword, brands, tags, sort field, page index).</param>
/// <returns>A result whose <c>Url</c> is the assembled address.</returns>
public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
{
    ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas();

    try
    {
        StringBuilder sbSearchUrl = new StringBuilder("https://s.taobao.com/search?q=@###@&imgfile=");

        // Values of the "ppath" filter (brand ids first, then ppath tag values).
        // Collected in a list and joined once so that the value never starts
        // with a stray ';' when there are tags but no brand.
        var ppathSegments = new List<string>();

        #region Brands
        if (null != webArgs.Brands && webArgs.Brands.Count > 0)
        {
            // 1. Brands of the current platform become ppath values.
            var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Taobao);
            ppathSegments.AddRange(currentPlatformBrands.Select(x => x.BrandId).Where(id => !string.IsNullOrEmpty(id)));

            // 2. One brand of another platform is used as an extra keyword.
            var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Taobao);
            if (null != otherPlatformBrands)
            {
                webArgs.KeyWord += " " + otherPlatformBrands.BrandName;
            }
        }
        #endregion

        #region Attribute tags
        if (null != webArgs.TagGroup)
        {
            // 1. Tags of the current platform.
            var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Taobao);

            // 1.1 Category ("cat").
            var catFilter = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "cat");
            if (null != catFilter)
            {
                sbSearchUrl.Append("&cat=").Append(catFilter.Value);
            }

            // 1.2 Remaining "ppath" tags.
            var ppathFilter = currentPlatformTag.Where(x => x.FilterFiled == "ppath");
            ppathSegments.AddRange(ppathFilter.Select(x => x.Value).Where(v => !string.IsNullOrEmpty(v)));

            // 2. One tag of another platform is used as an extra keyword.
            var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Taobao);
            if (null != otherPlatformTag)
            {
                webArgs.KeyWord += " " + otherPlatformTag.TagName;
            }
        }

        // Append the combined filter; multiple values are separated by ';'.
        if (ppathSegments.Count > 0)
        {
            sbSearchUrl.Append("&ppath=").Append(string.Join(";", ppathSegments));
        }
        #endregion

        #region Keyword
        // Replace the keyword placeholder.
        sbSearchUrl.Replace("@###@", webArgs.KeyWord);
        #endregion

        #region Sorting
        if (null != webArgs.OrderFiled)
        {
            sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);
        }
        #endregion

        #region Paging
        var pageNumber = webArgs.PageIndex + 1;
        if (pageNumber > 0)
        {
            // Taobao paging is an item offset: page index * 44.
            sbSearchUrl.Append("&s=").Append(webArgs.PageIndex * 44);
        }
        #endregion

        #region Misc
        sbSearchUrl.Append("&ie=utf8");
        sbSearchUrl.Append("&js=1");
        sbSearchUrl.Append("&stats_click=search_radio_all%3A1");
        sbSearchUrl.Append("&bcoffset=4");
        sbSearchUrl.Append("&ntoffset=4");
        sbSearchUrl.Append("&p4ppushleft=1%2C48");
        sbSearchUrl.AppendFormat("&initiative_id=staobaoz_{0}", DateTime.Now.ToString("yyyyMMdd"));
        #endregion

        resultUrl.Url = sbSearchUrl.ToString();
    }
    catch (Exception ex)
    {
        PluginContext.Logger.Error(ex);
    }

    return resultUrl;
}
/// <summary>
/// Parses a Taobao search result page into a dictionary of fields.
/// </summary>
/// <remarks>
/// The page data is read from the JSON object embedded after the
/// "g_page_config" marker. For the first page (<c>PageIndex == 0</c>) a
/// background task additionally fetches the remaining items through a JSONP
/// request and its products are appended to the same collection.
/// Exceptions are logged and whatever was collected so far is returned.
/// </remarks>
/// <param name="webArgs">The search arguments the content was fetched for.</param>
/// <param name="content">The page content to parse.</param>
/// <returns>
/// A dictionary that may contain "Brands" and "Tags" (only when
/// <c>webArgs.IsNeedResolveHeaderTags</c> is true) and "Products";
/// null when the page has no "g_page_config" data or it deserializes to null.
/// </returns>
public override Dictionary<string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
{
    var resultBag = new Dictionary<string, object>();

    try
    {
        if (content.IndexOf("g_page_config") < 0)
        {
            return null; // not a valid result page
        }

        // Kick off the request for the remaining data of the first search page
        // so that it runs while the HTML is being parsed.
        Task<string> tskSilcedJsonpContent = null;
        if (webArgs.PageIndex == 0)
        {
            tskSilcedJsonpContent = Task.Factory.StartNew(() =>
            {
                string jsonpContent = "";

                // The crawler connection config is handed over through the attached parameters.
                if (!webArgs.SystemAttachParas.ContainsKey("SoapTcpConnectionString"))
                {
                    return jsonpContent;
                }
                var connStrConfig = webArgs.SystemAttachParas["SoapTcpConnectionString"] as WebCrawlerConnection;
                if (null == connStrConfig)
                {
                    return jsonpContent;
                }

                // Point the arguments at the sliced JSONP address of the first page.
                string urlOfSlicedJsonp = this.ResolveSlicedSearchPageSilcedUrl(webArgs);
                webArgs.ResolvedUrl = new ResolvedSearchUrlWithParas { Url = urlOfSlicedJsonp };

                // 1. open the TCP connection, 2. send the arguments, 3. read the result.
                using (var conn = new SoapTcpConnection(connStrConfig))
                {
                    if (conn.State == ConnectionState.Closed)
                    {
                        conn.Open();
                    }

                    var soapCmd = new SoapMessage() { Head = CommandConstants.CMD_FetchPage };
                    soapCmd.Body = JsonConvert.SerializeObject(webArgs);
                    var dataContainer = conn.SendSoapMessage(soapCmd);

                    if (null != dataContainer && dataContainer.Status == 1)
                    {
                        jsonpContent = dataContainer.Result;
                    }
                    else
                    {
                        StringBuilder errMsg = new StringBuilder("抓取网页请求失败!参数:");
                        errMsg.Append(soapCmd.Body);
                        if (null != dataContainer && !string.IsNullOrEmpty(dataContainer.ErrorMsg))
                        {
                            errMsg.Append(";服务端错误消息:")
                            .Append(dataContainer.ErrorMsg);
                        }
                        PluginContext.Logger.Error(errMsg.ToString());
                    }
                }

                return jsonpContent;
            });
        }

        // Cut out the text between "g_page_config" and "g_srp_loadCss", then the
        // JSON object literal inside it (from the first '{' up to the closing "};").
        int startPos = content.IndexOf("g_page_config");
        int endPos = content.IndexOf("g_srp_loadCss") - startPos;
        var secondContent = content.Substring(startPos, endPos);

        int secStartPos = secondContent.IndexOf('{');
        int secEndPos = secondContent.IndexOf("};") - secStartPos + 1;
        string jsonData = secondContent.Substring(secStartPos, secEndPos);

        TaobaoPageJsonResut pageJsonObj = JsonConvert.DeserializeObject<TaobaoPageJsonResut>(jsonData);
        if (null == pageJsonObj)
        {
            return null;
        }

        if (webArgs.IsNeedResolveHeaderTags == true)
        {
            var navNode = pageJsonObj.mods.nav;
            if (null != navNode && null != navNode.data)
            {
                var commonNode = navNode.data.common;
                var lstTags = new List<KeyWordTagGroup>();

                // Both the brand and the tag parsing read the "common" node, so both
                // live under the same null/empty guard. Previously the tag parsing ran
                // outside it and a missing node aborted the whole parse, products included.
                if (null != commonNode && commonNode.Any())
                {
                    #region Brands
                    var brandNode = commonNode.FirstOrDefault(x => x.text == "品牌" && x.sub != null);
                    if (null != brandNode && brandNode.sub != null)
                    {
                        var lstBrands = new List<BrandTag>();
                        foreach (var subItem in brandNode.sub)
                        {
                            var model = new BrandTag();
                            model.Platform = SupportPlatformEnum.Taobao;
                            model.FilterField = "ppath"; // filter parameter used for this brand
                            model.BrandId = subItem.value;
                            model.BrandName = subItem.text;
                            model.CharIndex = PinYin.GetFirstLetter(model.BrandName);
                            lstBrands.Add(model);
                        }
                        resultBag.Add("Brands", lstBrands);
                    }
                    #endregion

                    #region Tags
                    // Every other filter node with children becomes a tag group named after the node.
                    var otherFilterNodes = commonNode.Where(x => x.text != "品牌" && x.sub != null);
                    foreach (var itemNode in otherFilterNodes)
                    {
                        string groupName = itemNode.text;
                        ProcessTags(lstTags, itemNode.sub, groupName);
                    }
                    #endregion
                }

                resultBag.Add("Tags", lstTags);
            }
        }

        #region Products
        // Registered before being filled, so products parsed before a later
        // failure are still returned.
        var lstProducts = new ProductBaseCollection();
        resultBag.Add("Products", lstProducts);

        var itemListNode = pageJsonObj.mods.itemlist;
        if (null != itemListNode && itemListNode.data != null && null != itemListNode.data.auctions)
        {
            foreach (var itemProduct in itemListNode.data.auctions)
            {
                TaobaoProduct modelProduct = this.ResolverProductDom(itemProduct);
                if (null != modelProduct)
                {
                    lstProducts.Add(modelProduct);
                }
            }
        }

        // The first page of the Taobao list is delivered in slices: part of the
        // items is in the HTML, the rest comes from the follow-up JSONP request.
        if (webArgs.PageIndex == 0 && null != tskSilcedJsonpContent)
        {
            string jsonpContent = tskSilcedJsonpContent.Result;
            if (!string.IsNullOrEmpty(jsonpContent) && jsonpContent.Contains("API.CustomizedApi"))
            {
                // Strip the JSONP wrapper: everything after the first ':' minus the last three characters.
                int startIdx = jsonpContent.IndexOf(':') + 1;
                int endIdx = jsonpContent.Length - startIdx - 3;
                string pureJsonContent = jsonpContent.Substring(startIdx, endIdx);

                var slicedJsonpResut = JsonConvert.DeserializeObject<TaobaoSlicedJsonpResut>(pureJsonContent);
                if (null != slicedJsonpResut)
                {
                    var itemList = slicedJsonpResut.itemlist;
                    if (null != itemList && itemList.auctions != null)
                    {
                        foreach (var itemProduct in itemList.auctions)
                        {
                            TaobaoProduct modelProduct = this.ResolverProductDom(itemProduct);
                            if (null != modelProduct)
                            {
                                lstProducts.Add(modelProduct);
                            }
                        }
                    }
                }
            }
        }
        #endregion
    }
    catch (Exception ex)
    {
        PluginContext.Logger.Error(ex);
    }

    return resultBag;
}
/// <summary>
/// Builds the JSONP address that delivers the remaining products of the first
/// Taobao search page.
/// </summary>
/// <remarks>
/// The keyword is assembled in a local copy: <c>webArgs.KeyWord</c> is not
/// modified, and the name of an other-platform brand/tag is appended only when
/// the keyword does not already contain it. This avoids a duplicated term when
/// <c>ResolveSearchUrl</c> has already extended the keyword on the same arguments.
/// Any exception is logged and the address built so far is returned.
/// </remarks>
/// <param name="webArgs">The search arguments (keyword, brands, tags, sort field).</param>
/// <returns>The assembled address.</returns>
private string ResolveSlicedSearchPageSilcedUrl(BaseFetchWebPageArgument webArgs)
{
    StringBuilder sbSearchUrl = new StringBuilder("https://s.taobao.com/api?q=@###@&imgfile=");

    try
    {
        string keyWord = webArgs.KeyWord ?? string.Empty;

        // Values of the "ppath" filter (brand ids first, then ppath tag values),
        // joined once so the value never starts with a stray ';'.
        var ppathSegments = new List<string>();

        #region Brands
        if (null != webArgs.Brands && webArgs.Brands.Count > 0)
        {
            // 1. Brands of the current platform become ppath values.
            var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Taobao);
            ppathSegments.AddRange(currentPlatformBrands.Select(x => x.BrandId).Where(id => !string.IsNullOrEmpty(id)));

            // 2. One brand of another platform is used as an extra keyword.
            var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Taobao);
            if (null != otherPlatformBrands
                && !string.IsNullOrEmpty(otherPlatformBrands.BrandName)
                && keyWord.IndexOf(otherPlatformBrands.BrandName, StringComparison.Ordinal) < 0)
            {
                keyWord += " " + otherPlatformBrands.BrandName;
            }
        }
        #endregion

        #region Attribute tags
        if (null != webArgs.TagGroup)
        {
            // 1. Tags of the current platform.
            var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Taobao);

            // 1.1 Category ("cat").
            var catFilter = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "cat");
            if (null != catFilter)
            {
                sbSearchUrl.Append("&cat=").Append(catFilter.Value);
            }

            // 1.2 Remaining "ppath" tags.
            var ppathFilter = currentPlatformTag.Where(x => x.FilterFiled == "ppath");
            ppathSegments.AddRange(ppathFilter.Select(x => x.Value).Where(v => !string.IsNullOrEmpty(v)));

            // 2. One tag of another platform is used as an extra keyword.
            var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Taobao);
            if (null != otherPlatformTag
                && !string.IsNullOrEmpty(otherPlatformTag.TagName)
                && keyWord.IndexOf(otherPlatformTag.TagName, StringComparison.Ordinal) < 0)
            {
                keyWord += " " + otherPlatformTag.TagName;
            }
        }

        // Append the combined filter; multiple values are separated by ';'.
        if (ppathSegments.Count > 0)
        {
            sbSearchUrl.Append("&ppath=").Append(string.Join(";", ppathSegments));
        }
        #endregion

        #region Keyword
        // Replace the keyword placeholder.
        sbSearchUrl.Replace("@###@", keyWord);
        #endregion

        #region Sorting
        if (null != webArgs.OrderFiled)
        {
            sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);
        }
        #endregion

        #region Paging
        // Must stay the constant 36 (per the original author's note).
        sbSearchUrl.Append("&s=36");
        #endregion

        #region Misc
        string timeToken = JavascriptContext.getUnixTimestamp();
        sbSearchUrl.AppendFormat("&_ksTS={0}_897", timeToken);
        sbSearchUrl.Append("&callback=jsonp2822");
        sbSearchUrl.Append("&m=customized");
        sbSearchUrl.Append("&ps=1");
        sbSearchUrl.Append("&ie=utf8");
        sbSearchUrl.Append("&ajax=true");
        sbSearchUrl.Append("&js=1");
        sbSearchUrl.Append("&p4ppushleft=1,48");
        sbSearchUrl.Append("&stats_click=search_radio_all:1");
        sbSearchUrl.Append("&bcoffset=0");
        sbSearchUrl.Append("&ntoffset=4");
        sbSearchUrl.Append("&rn=ee5b33aee4d18bf96ab0ad083eadc7f0");
        sbSearchUrl.AppendFormat("&initiative_id=staobaoz_{0}", DateTime.Now.ToString("yyyyMMdd"));
        #endregion
    }
    catch (Exception ex)
    {
        PluginContext.Logger.Error(ex);
    }

    return sbSearchUrl.ToString();
}
/// <summary> /// 解析搜索地址 /// </summary> /// <param name="webArgs"></param> /// <returns></returns> public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs) { ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas(); StringBuilder sbSearchUrl = new StringBuilder("http://search.dangdang.com/?key=@###@"); #region 品牌 if (null != webArgs.Brands && webArgs.Brands.Count > 0) { //1 当前平台的品牌 var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Dangdang); if (currentPlatformBrands.Any()) { //多个品牌用 _ 号分割 string brandIds = string.Join("_", currentPlatformBrands.Select(x => x.BrandId)); sbSearchUrl.Append("&att=1:").Append(brandIds); } //2 非当前平台的品牌--选择其中的一个 作为关键词 分割 var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Dangdang); if (null != otherPlatformBrands) { webArgs.KeyWord += " " + otherPlatformBrands.BrandName; } } #endregion #region 属性标签 if (null != webArgs.TagGroup) { //1 当前平台的 var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Dangdang); if (currentPlatformTag.Any()) { //1 分类 cat var catFilter = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "category_id"); if (null != catFilter) { sbSearchUrl.Append("&category_id=").Append(catFilter.Value); } //2 属性 att var attFilter = currentPlatformTag.Where(x => x.FilterFiled == "att"); if (attFilter.Any()) { string attrIds = string.Join("-", currentPlatformTag.Select(x => x.Value));//&att=1000012:1985-1000012:1986 sbSearchUrl.Append("&att=").Append(attrIds); } //3 其他分类路径 var catePathFilter = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "category_path"); if (null != catePathFilter) { sbSearchUrl.Append("&category_path=").Append(catePathFilter.Value); } } //2 其他平台的tag 作为关键词的一部分 var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Tmall); if (null != otherPlatformTag) { webArgs.KeyWord += " " + 
otherPlatformTag.TagName; } } #endregion #region 关键词 sbSearchUrl.Replace("@###@", webArgs.KeyWord);//将关键词的占位符 进行替换 #endregion #region 排序 if (null == webArgs.OrderFiled) { sbSearchUrl.Append("&sort_type=sort_default");//默认综合排序 } else { sbSearchUrl.Append("&sort_type=").Append(webArgs.OrderFiled.FieldValue);//默认综合排序 } #endregion #region 筛选-价格区间 #endregion #region 页码 sbSearchUrl.Append("&page_index=").Append(webArgs.PageIndex + 1); #endregion # region 杂项
/// <summary>
/// Builds the Tmall search URL for the given arguments.
/// </summary>
/// <remarks>
/// Side effect: the name of the first brand and the first tag that belong to
/// another platform are appended to <c>webArgs.KeyWord</c>.
/// Any exception is logged and a result without a URL is returned.
/// </remarks>
/// <param name="webArgs">The search arguments (keyword, brands, tags, sort field, page index).</param>
/// <returns>A result whose <c>Url</c> is the assembled address.</returns>
public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs)
{
    var resolved = new ResolvedSearchUrlWithParas();

    try
    {
        var url = new StringBuilder("https://list.tmall.com/search_product.htm?spm=a220m.1000858.1000720.1.348abe64rj5JVg");

        // ---- Brands ----
        if (webArgs.Brands != null && webArgs.Brands.Count > 0)
        {
            // Brands of this platform go into the "brand" parameter, comma separated.
            var tmallBrandIds = webArgs.Brands
                .Where(b => b.Platform == SupportPlatformEnum.Tmall)
                .Select(b => b.BrandId)
                .ToList();
            if (tmallBrandIds.Count > 0)
            {
                url.Append("&brand=");
                url.Append(string.Join(",", tmallBrandIds));
            }

            // One brand of another platform is used as an extra keyword.
            var foreignBrand = webArgs.Brands
                .Where(b => b.Platform != SupportPlatformEnum.Tmall)
                .FirstOrDefault();
            if (foreignBrand != null)
            {
                webArgs.KeyWord = string.Concat(webArgs.KeyWord, " ", foreignBrand.BrandName);
            }
        }

        // ---- Attribute tags ----
        if (webArgs.TagGroup != null)
        {
            // The first tag of this platform goes into the "prop" parameter.
            var tmallTag = webArgs.TagGroup.Tags
                .Where(t => t.Platform == SupportPlatformEnum.Tmall)
                .FirstOrDefault();
            if (tmallTag != null)
            {
                url.Append("&prop=");
                url.Append(tmallTag.Value);
            }

            // One tag of another platform is used as an extra keyword.
            var foreignTag = webArgs.TagGroup.Tags
                .Where(t => t.Platform != SupportPlatformEnum.Tmall)
                .FirstOrDefault();
            if (foreignTag != null)
            {
                webArgs.KeyWord = string.Concat(webArgs.KeyWord, " ", foreignTag.TagName);
            }
        }

        // ---- Keyword ----
        url.Append("&q=");
        url.Append(webArgs.KeyWord);

        // ---- Sorting ("s" is the default when no order field is given) ----
        url.Append("&sort=");
        if (webArgs.OrderFiled != null)
        {
            url.Append(webArgs.OrderFiled.FieldValue);
        }
        else
        {
            url.Append("s");
        }

        // ---- Paging: only pages after the first carry an offset (page index * 60) ----
        var pageIndex = webArgs.PageIndex;
        if (pageIndex > 0)
        {
            url.Append("&s=");
            url.Append(pageIndex * 60);
            url.Append("&search_condition=2");
        }

        // ---- Misc ----
        url.Append("&from=mallfp..pc_1_searchbutton")
           .Append("&type=pc")
           .Append("&style=g");

        resolved.Url = url.ToString();
    }
    catch (Exception ex)
    {
        PluginContext.Logger.Error(ex);
    }

    return resolved;
}
/// <summary>
/// Captures the numeric brand id from a <c>brand=</c> query parameter.
/// Built once instead of on every parse call.
/// </summary>
private static readonly Regex TmallBrandIdRegex = new Regex(@"brand=(\d+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

/// <summary>
/// Returns only the query-string part of an href: everything after the first '?'
/// and before the first '#'. When there is no '?', the (fragment-stripped) input
/// is returned unchanged.
/// </summary>
/// <param name="href">Raw href attribute value; may be null or empty.</param>
/// <returns>The query string without leading '?' and without fragment; never null.</returns>
private static string ExtractQueryString(string href)
{
    if (string.IsNullOrEmpty(href))
    {
        return string.Empty;
    }

    int fragmentStart = href.IndexOf('#');
    if (fragmentStart >= 0)
    {
        href = href.Substring(0, fragmentStart);
    }

    int queryStart = href.IndexOf('?');
    return queryStart >= 0 ? href.Substring(queryStart + 1) : href;
}

/// <summary>
/// Parses a Tmall search-result page into a dictionary of result parts.
/// </summary>
/// <remarks>
/// Keys that may be present in the returned dictionary:
/// <list type="bullet">
/// <item><description>"Brands" - <c>List&lt;BrandTag&gt;</c>, only when header tags are requested and the attribute area exists.</description></item>
/// <item><description>"Tags" - <c>List&lt;KeyWordTagGroup&gt;</c> in page order, under the same condition.</description></item>
/// <item><description>"Products" - <c>ProductBaseCollection</c> in page order.</description></item>
/// </list>
/// When the page is an anti-spider check or a login redirect, an empty dictionary is
/// returned and the event is logged. Exceptions raised while parsing are logged and
/// the parts collected so far are returned.
/// </remarks>
/// <param name="webArgs">Arguments of the search that produced <paramref name="content"/>.</param>
/// <param name="content">HTML of the search-result page.</param>
/// <returns>Dictionary holding the parsed parts, keyed as described above.</returns>
public override Dictionary<string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
{
    var resultBag = new Dictionary<string, object>();

    // Nothing to parse: same outcome an empty document produced before (an empty
    // product list), but without sending null into the HTML parser.
    if (string.IsNullOrEmpty(content))
    {
        resultBag.Add("Products", new ProductBaseCollection());
        return resultBag;
    }

    if (content.Contains("环境有异常"))
    {
        PluginContext.Logger.Error("天猫查询被进行蜘蛛验证!关键词:" + webArgs.KeyWord);
        return resultBag;
    }
    if (content.Contains("member/login"))
    {
        PluginContext.Logger.Error("天猫查询结果页面被强制跳转到了登录页!关键词:" + webArgs.KeyWord);
        return resultBag;
    }

    try
    {
        // Build the HTML document object.
        HtmlParser htmlParser = new HtmlParser();
        var htmlDoc = htmlParser.Parse(content);

        var div_AttrsDom = htmlDoc.QuerySelector("div.j_NavAttrs");
        if (webArgs.IsNeedResolveHeaderTags == true && null != div_AttrsDom)
        {
            #region Brands
            var lstBrands = new List<BrandTag>();
            var brandDom = div_AttrsDom.QuerySelector("div.j_Brand");
            // The <ul> that lists the brands; remembered so the tag pass can skip it.
            var brandULDom = null != brandDom ? brandDom.QuerySelector("div.attrValues>ul") : null;
            if (null != brandULDom)
            {
                foreach (var itemADom in brandULDom.QuerySelectorAll("li>a"))
                {
                    var model = new BrandTag();
                    model.Platform = SupportPlatformEnum.Tmall;
                    model.FilterField = "brand"; // filter parameter used when searching by this brand

                    // e.g. href="?brand=110910&q=...&sort=s&style=g#J_crumbs"
                    var urlBrand = itemADom.GetAttribute("href");
                    if (!string.IsNullOrEmpty(urlBrand) && urlBrand.Contains("brand="))
                    {
                        model.BrandId = TmallBrandIdRegex.Match(urlBrand).Groups[1].Value;
                    }
                    model.BrandName = itemADom.GetAttribute("title");
                    model.CharIndex = PinYin.GetFirstLetter(model.BrandName);
                    lstBrands.Add(model);
                }
            }
            resultBag.Add("Brands", lstBrands);
            #endregion

            #region Tags (categories / properties)
            // Every attribute <ul> except the brand one is a tag group. The brand list is
            // excluded by identity rather than by "index 0", so a brand block without a
            // <ul> can no longer yield a negative array size or hide the first real group.
            // AsOrdered keeps the groups in page order while they are parsed in parallel.
            var lstTags = div_AttrsDom.QuerySelectorAll("div.attrValues>ul")
                .Where(itemUl => !object.ReferenceEquals(itemUl, brandULDom))
                .AsParallel()
                .AsOrdered()
                .Select(itemUl =>
                {
                    // Locate the caption of the group this <ul> belongs to.
                    var attrValuesDom = itemUl.ParentElement;
                    var attrRowDom = null != attrValuesDom ? attrValuesDom.ParentElement : null;
                    var attrKeyDom = null != attrRowDom ? attrRowDom.QuerySelector("div.attrKey") : null;
                    string groupName = "";
                    if (null != attrKeyDom)
                    {
                        groupName = attrKeyDom.TextContent.Replace("\n", "").Trim();
                    }

                    var tagGroup = new KeyWordTagGroup(groupName);
                    foreach (var itemADom in itemUl.QuerySelectorAll("li>a"))
                    {
                        var modelTag = new KeyWordTag();
                        modelTag.Platform = SupportPlatformEnum.Tmall;
                        modelTag.TagName = itemADom.TextContent.Replace("\n", ""); // tag caption
                        modelTag.GroupShowName = groupName;

                        // The link carries the filter as either a "cat" or a "prop" parameter.
                        string hrefValue = itemADom.GetAttribute("href");
                        if (!string.IsNullOrEmpty(hrefValue))
                        {
                            // Parse only the query part; a path prefix would corrupt the first
                            // key and a "#fragment" would leak into the last value.
                            var urlParas = HttpUtility.ParseQueryString(ExtractQueryString(hrefValue), Encoding.UTF8);
                            string catValue = urlParas["cat"];
                            string propValue = urlParas["prop"];
                            if (null != catValue)
                            {
                                modelTag.FilterFiled = "cat";
                                modelTag.Value = catValue;
                            }
                            else if (null != propValue)
                            {
                                modelTag.FilterFiled = "prop";
                                modelTag.Value = propValue;
                            }
                        }
                        tagGroup.Tags.Add(modelTag);
                    }
                    return tagGroup;
                })
                .ToList();
            resultBag.Add("Tags", lstTags);
            #endregion
        }

        #region Products
        var lstProducts = new ProductBaseCollection();
        // data-id -> slot remembering the position of the product on the page.
        var orderedProducts = new ConcurrentDictionary<string, ProductOrdered<TmallProduct>>();

        var div_J_ItemListDom = htmlDoc.QuerySelector("div#J_ItemList");
        if (null != div_J_ItemListDom)
        {
            var div_productDomArray = div_J_ItemListDom.QuerySelectorAll("div.product");
            if (null != div_productDomArray && div_productDomArray.Any())
            {
                // Register the page order first.
                int counter_pid = 0;
                foreach (var itemProductDom in div_productDomArray)
                {
                    var itemPid = itemProductDom.GetAttribute("data-id");
                    if (null != itemPid)
                    {
                        orderedProducts.TryAdd(itemPid, new ProductOrdered<TmallProduct> { UniqKey = itemPid, IndexOrder = counter_pid });
                        counter_pid++;
                    }
                }

                // Parse the product nodes in parallel; the slots restore the order afterwards.
                div_productDomArray.AsParallel()
                    .ForAll(itemProductDom =>
                    {
                        TmallProduct modelProduct = this.ResolverProductDom(itemProductDom);
                        if (null != modelProduct && modelProduct.ItemId > 0)
                        {
                            // A product whose id has no registered data-id used to throw
                            // KeyNotFoundException and discard the whole page; it is now
                            // kept and sorted after the registered ones.
                            var orderedObj = orderedProducts.GetOrAdd(
                                modelProduct.ItemId.ToString(),
                                key => new ProductOrdered<TmallProduct> { UniqKey = key, IndexOrder = int.MaxValue });
                            orderedObj.Product = modelProduct;
                        }
                    });

                // Restore page order, dropping slots whose node did not yield a product
                // (they previously ended up as null entries in the result).
                var productsList = orderedProducts.Values
                    .Where(x => null != x && null != x.Product)
                    .OrderBy(x => x.IndexOrder)
                    .Select(x => x.Product);
                lstProducts.AddRange(productsList);
            }
        }
        resultBag.Add("Products", lstProducts);
        #endregion
    }
    catch (Exception ex)
    {
        PluginContext.Logger.Error(ex);
    }
    return resultBag;
}
/// <summary> /// 解析搜索地址 /// </summary> /// <param name="webArgs"></param> /// <returns></returns> public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs) { ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas(); resultUrl.IsNeedPreRequest = false;//苏宁的搜索页面和数据列表是分离的,直接在解析中进行内容请求,不需要预先请求 StringBuilder sbSearchUrl = new StringBuilder("https://search.suning.com/@###@/"); #region 品牌 if (null != webArgs.Brands && webArgs.Brands.Count > 0) { //1 当前平台的品牌 var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Suning); if (currentPlatformBrands.Any()) { //多个品牌直接将id拼接为字符串,国美家的 是4位加密码进行的拼接组 string brandNames = string.Join(";", currentPlatformBrands.Select(x => x.BrandName)); sbSearchUrl.Append("&hf=brand_Name_FacetAll:").Append(brandNames); } //2 非当前平台的品牌--选择其中的一个 作为关键词 分割 var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Suning); if (null != otherPlatformBrands) { webArgs.KeyWord += " " + otherPlatformBrands.BrandName; } } #endregion #region 属性标签 if (null != webArgs.TagGroup) { //1 当前平台的 var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Suning); if (currentPlatformTag.Any()) { #region 分类 var catIdTag = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "ci"); if (null != catIdTag) { sbSearchUrl.Append("&ci=").Append(catIdTag.Value); } #endregion //https://search.suning.com/羽绒服/&iy=0&sc=0&hf=solr_13696_attrId:收腰型;常规&st=0#search-path-box string attrIds = string.Join(";", currentPlatformTag.Select(x => x.Value)); sbSearchUrl.Append("&cf=") .Append(currentPlatformTag.First().FilterFiled) .Append("_attrId:") .Append(attrIds); } //2 其他平台的tag 作为关键词的一部分 var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Suning); if (null != otherPlatformTag) { webArgs.KeyWord += " " + otherPlatformTag.TagName; } } #endregion #region 关键词 sbSearchUrl.Replace("@###@", 
webArgs.KeyWord);//将关键词的占位符 进行替换 #endregion #region 排序 if (null == webArgs.OrderFiled) { sbSearchUrl.Append("&st=0");//默认综合排序 } else { sbSearchUrl.Append("&st=").Append(webArgs.OrderFiled.FieldValue);//默认综合排序 } #endregion #region 筛选-价格区间 #endregion #region 页码 sbSearchUrl.Append("&cp=").Append(webArgs.PageIndex); #endregion # region 杂项
/// <summary> /// 解析搜索地址 /// </summary> /// <param name="webArgs"></param> /// <returns></returns> public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs) { ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas(); resultUrl.IsNeedPreRequest = false;//国美的搜索页面和数据列表是分离的,不需要预先请求html StringBuilder sbSearchUrl = new StringBuilder("https://search.gome.com.cn/search?question=@###@"); #region 品牌 string facetsString = string.Empty; if (null != webArgs.Brands && webArgs.Brands.Count > 0) { //1 当前平台的品牌 var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Guomei); if (currentPlatformBrands.Any()) { //sbSearchUrl.Append("&pzpq=0"); //sbSearchUrl.Append("&pzin=v4"); //多个品牌直接将id拼接为字符串,国美家的 是4位加密码进行的拼接组 string brandIds = string.Join("", currentPlatformBrands.Select(x => x.BrandId)); //sbSearchUrl.Append("&facets=").Append(brandIds); facetsString += brandIds; //有品牌参数的时候,国美前端有个附加参数 intcmp 没什么用,直接固定 sbSearchUrl.Append("&intcmp=search-9000001100-1"); } //2 非当前平台的品牌--选择其中的一个 作为关键词 分割 var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Guomei); if (null != otherPlatformBrands) { webArgs.KeyWord += " " + otherPlatformBrands.BrandName; } } #endregion #region 属性标签 if (null != webArgs.TagGroup) { //1 当前平台的 var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Guomei); if (currentPlatformTag.Any()) { #region 分类 var catIdTag = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "catId"); if (null != catIdTag) { sbSearchUrl.Append("&catId=").Append(catIdTag.Value); } #endregion string attrIds = string.Join("", currentPlatformTag.Select(x => x.Value));//facetsid 的组合 facetsString += attrIds; } //2 其他平台的tag 作为关键词的一部分 var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Guomei); if (null != otherPlatformTag) { webArgs.KeyWord += " " + otherPlatformTag.TagName; } } if 
(!string.IsNullOrEmpty(facetsString)) { sbSearchUrl.Append("&facets=").Append(facetsString);//国美是把所有的属性作为4字符串值作为参数解析的 } #endregion #region 关键词 sbSearchUrl.Replace("@###@", webArgs.KeyWord);//将关键词的占位符 进行替换 #endregion #region 排序 if (null == webArgs.OrderFiled) { sbSearchUrl.Append("&sort=00");//默认综合排序 } else { sbSearchUrl.Append("&sort=").Append(webArgs.OrderFiled.FieldValue);//默认综合排序 } #endregion #region 筛选-价格区间 #endregion #region 页码 sbSearchUrl.Append("&page=").Append(webArgs.PageIndex + 1); #endregion # region 杂项
/// <summary>
/// Stores the parsed result of a fetched page in the cache, keyed by the
/// <c>CacheKey</c> of the arguments that produced it.
/// </summary>
/// <param name="webArgs">Fetch arguments; their <c>CacheKey</c> is used as the cache key.</param>
/// <param name="reultModel">Parsed page result to cache.</param>
/// <param name="timeOut">Expiry passed to the cache client, in seconds; defaults to 30.</param>
public static void SetFetchPageResultFromCache(BaseFetchWebPageArgument webArgs, SearchProductViewModel reultModel, int timeOut = 30)
{
    // The value returned by SetAsync is neither awaited nor inspected here.
    RedisClient.SetAsync(webArgs.CacheKey, reultModel, timeOut);
}
/// <summary> /// 解析搜索地址 /// </summary> /// <param name="webArgs"></param> /// <returns></returns> public override ResolvedSearchUrlWithParas ResolveSearchUrl(BaseFetchWebPageArgument webArgs) { ResolvedSearchUrlWithParas resultUrl = new ResolvedSearchUrlWithParas(); StringBuilder sbSearchUrl = new StringBuilder("https://search.jd.com/Search?keyword=@###@&enc=utf-8&wq=@###@"); #region 品牌 规格 分类 都在参数ev 中 //例如:exbrand_娇兰(Guerlain)||NARS^1107_82376||8240^ string paraBrandAndSkusEv = ""; if (null != webArgs.Brands && webArgs.Brands.Count > 0) { //1 当前平台的品牌 var currentPlatformBrands = webArgs.Brands.Where(x => x.Platform == SupportPlatformEnum.Jingdong); if (currentPlatformBrands.Any()) { //多个品牌用 , 号分割 string brandNames = string.Join("||", currentPlatformBrands.Select(x => x.BrandName)); paraBrandAndSkusEv = string.Concat("exbrand_", brandNames, "^"); } //2 非当前平台的品牌--选择其中的一个 作为关键词 分割 var otherPlatformBrands = webArgs.Brands.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Jingdong); if (null != otherPlatformBrands) { webArgs.KeyWord += " " + otherPlatformBrands.BrandName; } } #endregion #region 属性标签 if (null != webArgs.TagGroup) { //1 当前平台的 var currentPlatformTag = webArgs.TagGroup.Tags.Where(x => x.Platform == SupportPlatformEnum.Jingdong); if (null != currentPlatformTag) { //归属科目 cid2 var cid2Para = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "cid2"); if (null != cid2Para) { sbSearchUrl.Append("&cid2=").Append(cid2Para.Value); } //归属科目 cid3 var cid3Para = currentPlatformTag.FirstOrDefault(x => x.FilterFiled == "cid3"); if (null != cid3Para) { sbSearchUrl.Append("&cid3=").Append(cid3Para.Value); } var tagGroup = currentPlatformTag.GroupBy(x => x.FilterFiled); string skuAttrs = ""; foreach (var itemGroup in tagGroup) { string key = itemGroup.Key + "_";//属性_ string values = string.Join("||", itemGroup.Select(x => x.Value)); skuAttrs += string.Concat(key, values); skuAttrs += "^"; } paraBrandAndSkusEv += skuAttrs; } if 
(!string.IsNullOrEmpty(paraBrandAndSkusEv)) { sbSearchUrl.Append("&ev=").Append(paraBrandAndSkusEv); } //2 其他平台的tag 作为关键词的一部分 var otherPlatformTag = webArgs.TagGroup.Tags.FirstOrDefault(x => x.Platform != SupportPlatformEnum.Jingdong); if (null != otherPlatformTag) { webArgs.KeyWord += " " + otherPlatformTag.TagName; } } #endregion #region 关键词 sbSearchUrl.Replace("@###@", webArgs.KeyWord); #endregion #region 排序 if (null != webArgs.OrderFiled && webArgs.OrderFiled.Rule != OrderRule.Default) { sbSearchUrl.Append("&psort=").Append(webArgs.OrderFiled.FieldValue); } #endregion #region 筛选-价格区间 #endregion #region 页码 int pageNumber = (webArgs.PageIndex * 2) + 1;//京东每页分割为2个子页,按照页索引0开始,倍乘2,然后加1 为正确的页码 sbSearchUrl.Append("&page=").Append(pageNumber); //京东前后翻页的时候 需要这个s 参数,前为prev 参数 ,后翻为next 参数 if (null != webArgs.AttachParas && webArgs.AttachParas.ContainsKey("jd_pager_s")) { sbSearchUrl.Append("&s=").Append(webArgs.AttachParas["jd_pager_s"]); } #endregion # region 杂项