/// <summary>
/// Forwards a SOAP message to a slave node and hands back that node's reply.
/// </summary>
/// <param name="slaveNode">Target node; its IpAddress and Port are used to open the connection.</param>
/// <param name="soaMsg">The SOAP message to forward.</param>
/// <returns>
/// The container returned by the connection, or null when an exception was
/// thrown before a reply was received (the exception is logged, not rethrown).
/// </returns>
private IDataContainer TransferMsgToSlave(PeekerClusterNode slaveNode, SoapMessage soaMsg)
{
    IDataContainer reply = null;

    try
    {
        using (var connection = new SoapTcpConnection(slaveNode.IpAddress, slaveNode.Port))
        {
            // Only open the connection when it is not already open.
            bool isClosed = connection.State == ConnectionState.Closed;
            if (isClosed)
            {
                connection.Open();
            }

            // Send the SOAP message and keep whatever the node answers.
            reply = connection.SendSoapMessage(soaMsg);
        }
    }
    catch (Exception error)
    {
        // Best effort: a failed forward is logged and reported as "no reply".
        Logger.Error(error);
    }

    return reply;
}
/// <summary>
/// Registers a slave with the master node and returns the port the master
/// assigned to it. The master is contacted on 127.0.0.1 at
/// GlobalContext.MasterSocketPort.
/// </summary>
/// <param name="slaveIdentity">Identity of the slave, sent as RegisterPortArgument.SlaveIdentity.</param>
/// <returns>
/// The port parsed from the master's reply, or -1 when the ping fails, no reply
/// is received, or the reply status is not 1.
/// </returns>
/// <remarks>
/// Exceptions raised while connecting or sending are not caught here and
/// propagate to the caller.
/// </remarks>
public static int RegisterSlaveToMaster(string slaveIdentity)
{
    var result = -1;

    using (var conn = new SoapTcpConnection("127.0.0.1", GlobalContext.MasterSocketPort))
    {
        if (conn.State == ConnectionState.Closed)
        {
            conn.Open();
        }

        // Do not attempt the registration when the master does not answer a ping.
        if (conn.Ping() == false)
        {
            return result;
        }

        // Build and send the registration command.
        var paras = new RegisterPortArgument { SlaveIdentity = slaveIdentity };
        var sopMsg = new SoapMessage
        {
            Head = CommandConstants.CMD_RegisterSlavePort,
            Body = JsonConvert.SerializeObject(paras)
        };

        var repResult = conn.SendSoapMessage(sopMsg);

        // The reply may be null (other callers of SendSoapMessage guard against it);
        // treat a missing reply like a failed registration instead of crashing.
        if (null != repResult && repResult.Status == 1)
        {
            result = repResult.Result.ToInt();
        }
    }

    return result;
}
/// <summary>
/// Searches a single platform for products matching the keyword and filters in
/// <paramref name="webArgs"/> and returns that platform's product item list.
/// </summary>
/// <param name="webArgs">
/// Search arguments. This method writes to it: ResolvedUrl is set to the resolved
/// search URL and SystemAttachParas["SoapTcpConnectionString"] is set to the
/// crawler connection so the resolver can issue further requests.
/// </param>
/// <returns>
/// The cached model when caching is enabled and a cached entry exists; otherwise
/// the model produced by the platform resolver (which may be null if the resolver
/// returns null). An empty model is returned when the arguments are null/invalid
/// or when fetching fails; failures are logged, not thrown.
/// </returns>
public SearchProductViewModel QueryProductsByKeyWords(BaseFetchWebPageArgument webArgs)
{
    SearchProductViewModel dataModel = new SearchProductViewModel();
    if (null == webArgs || webArgs.IsValid() == false)
    {
        return dataModel;
    }

    try
    {
        // Register the search term with the hot-word service.
        HotWordService.AddWord(webArgs.KeyWord);

        // When page caching is enabled, serve the cached result if there is one.
        // A separate local is used so that a cache miss does not replace the
        // empty default model with null.
        if (true == WorkContext.IsFetchPageCacheaAble)
        {
            var cachedModel = WorkContext.GetFetchPageResultFromCache(webArgs);
            if (null != cachedModel)
            {
                return cachedModel;
            }
        }

        // Factory lookup of the content resolver for the requested platform.
        var resolver = ResolverFactory.GetSearchProductResolver(webArgs.Platform);

        // Resolve the search URL for these arguments. Everything below reads
        // searchUrl.IsNeedPreRequest, so a missing URL is a hard failure here
        // rather than a NullReferenceException further down.
        var searchUrl = resolver.ResolveSearchUrl(webArgs);
        if (null == searchUrl)
        {
            throw new InvalidOperationException(
                "The platform resolver could not resolve a search url for the given arguments.");
        }
        webArgs.ResolvedUrl = searchUrl;

        string pageContent = string.Empty;
        using (var connMgr = new WebCrawlerConnConfigManager())
        {
            var connStrConfig = connMgr.Connection;

            // Register the connection in the attach parameters.
            webArgs.SystemAttachParas["SoapTcpConnectionString"] = connStrConfig;

            if (searchUrl.IsNeedPreRequest)
            {
                // 1. open the tcp connection, 2. send the arguments, 3. read the result.
                using (var conn = new SoapTcpConnection(connStrConfig))
                {
                    if (conn.State == ConnectionState.Closed)
                    {
                        conn.Open();
                    }

                    var soapCmd = new SoapMessage() { Head = CommandConstants.CMD_FetchPage };
                    soapCmd.Body = webArgs.ToJson();

                    var dataContainer = conn.SendSoapMessage(soapCmd);
                    if (null != dataContainer && dataContainer.Status == 1)
                    {
                        pageContent = dataContainer.Result;
                    }
                    else
                    {
                        StringBuilder errMsg = new StringBuilder("抓取网页请求失败!参数:");
                        errMsg.Append(soapCmd.Body);
                        if (null != dataContainer && !string.IsNullOrEmpty(dataContainer.ErrorMsg))
                        {
                            errMsg.Append(";服务端错误消息:")
                                  .Append(dataContainer.ErrorMsg);
                        }

                        // Caught and logged by the catch block of this method.
                        throw new InvalidOperationException(errMsg.ToString());
                    }
                }
            }
        }

        // Parse the content. NOTE: for URLs that do not need a pre-request the
        // resolver is called with empty content and is expected to request the
        // address and parse it itself.
        if (!string.IsNullOrEmpty(pageContent) || !searchUrl.IsNeedPreRequest)
        {
            dataModel = resolver.ResolvePageContent(webArgs, pageContent);
            if (null != dataModel)
            {
                dataModel.KeyWord = webArgs.KeyWord;
                dataModel.IsNeedResolveHeaderTags = webArgs.IsNeedResolveHeaderTags;
            }
        }
    }
    catch (Exception ex)
    {
        Logger.Error(ex);
    }

    // When caching is enabled, store non-empty results.
    if (true == WorkContext.IsFetchPageCacheaAble
        && null != dataModel
        && dataModel.Products.IsNotEmpty())
    {
        int cacheTime = ConfigHelper.AppSettingsConfiguration.GetConfigInt("FetchPageCacheTime");
        if (cacheTime <= 0)
        {
            cacheTime = 60; // default: cache page results for 60 seconds
        }

        WorkContext.SetFetchPageResultFromCache(webArgs, dataModel, cacheTime);
    }

    return dataModel;
}
/// <summary>
/// Parses a Taobao search result page into a bag of named results.
/// </summary>
/// <param name="webArgs">
/// Search arguments. When PageIndex is 0 a background request for the sliced
/// JSONP data of the first page is started, which overwrites webArgs.ResolvedUrl.
/// </param>
/// <param name="content">The page content to parse.</param>
/// <returns>
/// A dictionary that can contain "Brands" and "Tags" (only when
/// webArgs.IsNeedResolveHeaderTags is true) and "Products". Returns null when
/// the content has no "g_page_config" marker or the embedded JSON deserializes
/// to null. When an exception occurs it is logged and whatever was collected
/// up to that point is returned.
/// </returns>
public override Dictionary<string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
{
    var resultBag = new Dictionary<string, object>();
    try
    {
        int configPos = content.IndexOf("g_page_config", StringComparison.Ordinal);
        if (configPos < 0)
        {
            return null; // not a valid result page
        }

        // For the first search page, request the remaining (sliced) data in the
        // background while the main page is being parsed.
        Task<string> tskSilcedJsonpContent = null;
        if (webArgs.PageIndex == 0)
        {
            tskSilcedJsonpContent = Task.Factory.StartNew(() => this.FetchSlicedJsonpContent(webArgs));
        }

        string jsonData = ExtractPageConfigJson(content, configPos);
        TaobaoPageJsonResut pageJsonObj = JsonConvert.DeserializeObject<TaobaoPageJsonResut>(jsonData);
        if (null == pageJsonObj)
        {
            return null;
        }

        if (webArgs.IsNeedResolveHeaderTags == true)
        {
            var navNode = pageJsonObj.mods.nav;
            if (null != navNode && null != navNode.data)
            {
                var commonNode = navNode.data.common;

                // 1. Brands: taken from the "品牌" entry of the common node, if present.
                if (null != commonNode && commonNode.Any())
                {
                    var brandNode = commonNode.FirstOrDefault(x => x.text == "品牌" && x.sub != null);
                    if (null != brandNode)
                    {
                        var lstBrands = new List<BrandTag>();
                        foreach (var subItem in brandNode.sub)
                        {
                            var model = new BrandTag();
                            model.Platform = SupportPlatformEnum.Taobao;
                            model.FilterField = "ppath"; // query parameter used to filter by brand
                            model.BrandId = subItem.value;
                            model.BrandName = subItem.text;
                            model.CharIndex = PinYin.GetFirstLetter(model.BrandName);
                            lstBrands.Add(model);
                        }

                        resultBag.Add("Brands", lstBrands);
                    }
                }

                // 2. Tags: every other common entry that has sub items becomes a tag group.
                // The null guard matters: without it a missing common node throws and
                // aborts the whole parse, including the product list below.
                // The advanced ("adv") filters are intentionally not parsed.
                var lstTags = new List<KeyWordTagGroup>();
                if (null != commonNode)
                {
                    foreach (var itemNode in commonNode.Where(x => x.text != "品牌" && x.sub != null))
                    {
                        // The entry text is the name of the group the tags belong to.
                        ProcessTags(lstTags, itemNode.sub, itemNode.text);
                    }
                }

                resultBag.Add("Tags", lstTags);
            }
        }

        // Products. The collection is registered before it is filled so that a
        // later failure still returns the products parsed so far.
        var lstProducts = new ProductBaseCollection();
        resultBag.Add("Products", lstProducts);

        var itemListNode = pageJsonObj.mods.itemlist;
        if (null != itemListNode && itemListNode.data != null && null != itemListNode.data.auctions)
        {
            foreach (var itemProduct in itemListNode.data.auctions)
            {
                TaobaoProduct modelProduct = this.ResolverProductDom(itemProduct);
                if (null != modelProduct)
                {
                    lstProducts.Add(modelProduct);
                }
            }
        }

        // Taobao slices the first result page: the html carries 36 items and a
        // follow-up jsonp request loads 12 more. The task is only created for
        // PageIndex == 0.
        if (null != tskSilcedJsonpContent)
        {
            string jsonpContent = tskSilcedJsonpContent.Result;
            if (!string.IsNullOrEmpty(jsonpContent) && jsonpContent.Contains("API.CustomizedApi"))
            {
                // Keep what follows the first ':' and drop the last three characters
                // (presumably the jsonp wrapper around the JSON payload).
                int payloadStart = jsonpContent.IndexOf(':') + 1;
                int payloadLength = jsonpContent.Length - payloadStart - 3;
                string pureJsonContent = jsonpContent.Substring(payloadStart, payloadLength);

                var slicedJsonpResut = JsonConvert.DeserializeObject<TaobaoSlicedJsonpResut>(pureJsonContent);
                if (null != slicedJsonpResut)
                {
                    var itemList = slicedJsonpResut.itemlist;
                    if (null != itemList && itemList.auctions != null)
                    {
                        foreach (var itemProduct in itemList.auctions)
                        {
                            TaobaoProduct modelProduct = this.ResolverProductDom(itemProduct);
                            if (null != modelProduct)
                            {
                                lstProducts.Add(modelProduct);
                            }
                        }
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        PluginContext.Logger.Error(ex);
    }

    return resultBag;
}

/// <summary>
/// Requests the sliced JSONP data of the first search page through the crawler
/// connection stored in webArgs.SystemAttachParas["SoapTcpConnectionString"].
/// Overwrites webArgs.ResolvedUrl with the sliced JSONP url. Never throws:
/// every failure is logged and reported as an empty string, so the background
/// task that runs this method cannot end up with an unobserved exception.
/// </summary>
private string FetchSlicedJsonpContent(BaseFetchWebPageArgument webArgs)
{
    string jsonpContent = "";
    try
    {
        if (!webArgs.SystemAttachParas.ContainsKey("SoapTcpConnectionString"))
        {
            return jsonpContent;
        }

        var connStrConfig = webArgs.SystemAttachParas["SoapTcpConnectionString"] as WebCrawlerConnection;
        if (null == connStrConfig)
        {
            return jsonpContent;
        }

        // Point the arguments at the sliced jsonp address of the first page.
        string urlOfSlicedJsonp = this.ResolveSlicedSearchPageSilcedUrl(webArgs);
        webArgs.ResolvedUrl = new ResolvedSearchUrlWithParas { Url = urlOfSlicedJsonp };

        // 1. open the tcp connection, 2. send the arguments, 3. read the result.
        using (var conn = new SoapTcpConnection(connStrConfig))
        {
            if (conn.State == ConnectionState.Closed)
            {
                conn.Open();
            }

            var soapCmd = new SoapMessage() { Head = CommandConstants.CMD_FetchPage };
            soapCmd.Body = JsonConvert.SerializeObject(webArgs);

            var dataContainer = conn.SendSoapMessage(soapCmd);
            if (null != dataContainer && dataContainer.Status == 1)
            {
                jsonpContent = dataContainer.Result;
            }
            else
            {
                StringBuilder errMsg = new StringBuilder("抓取网页请求失败!参数:");
                errMsg.Append(soapCmd.Body);
                if (null != dataContainer && !string.IsNullOrEmpty(dataContainer.ErrorMsg))
                {
                    errMsg.Append(";服务端错误消息:")
                          .Append(dataContainer.ErrorMsg);
                }

                PluginContext.Logger.Error(errMsg.ToString());
            }
        }
    }
    catch (Exception ex)
    {
        PluginContext.Logger.Error(ex);
    }

    return jsonpContent;
}

/// <summary>
/// Cuts the JSON object out of the page: takes the text from "g_page_config" up
/// to "g_srp_loadCss" and, inside it, the span from the first '{' through the
/// '}' of the first "};". Throws when the markers are missing or out of order;
/// the caller's catch block logs that.
/// </summary>
private static string ExtractPageConfigJson(string content, int configPos)
{
    int sectionLength = content.IndexOf("g_srp_loadCss", StringComparison.Ordinal) - configPos;
    string section = content.Substring(configPos, sectionLength);

    int jsonStart = section.IndexOf('{');
    int jsonLength = section.IndexOf("};", StringComparison.Ordinal) - jsonStart + 1;
    return section.Substring(jsonStart, jsonLength);
}