예제 #1
0
        /// <summary>
        /// Forwards a SOAP message to a slave node and hands back the node's reply.
        /// </summary>
        /// <param name="slaveNode">Node whose <c>IpAddress</c> and <c>Port</c> are used to open the connection.</param>
        /// <param name="soaMsg">Message passed to <c>SendSoapMessage</c>.</param>
        /// <returns>
        /// The container returned by <c>SendSoapMessage</c>, or <c>null</c> when an
        /// exception is thrown before a reply was received (the exception is logged, not rethrown).
        /// </returns>
        private IDataContainer TransferMsgToSlave(PeekerClusterNode slaveNode, SoapMessage soaMsg)
        {
            IDataContainer reply = null;

            try
            {
                using (var connection = new SoapTcpConnection(slaveNode.IpAddress, slaveNode.Port))
                {
                    bool isClosed = connection.State == ConnectionState.Closed;
                    if (isClosed)
                    {
                        connection.Open();
                    }

                    // Send the SOAP message and keep whatever the slave answers.
                    reply = connection.SendSoapMessage(soaMsg);
                }
            }
            catch (Exception error)
            {
                // Best effort: a failed transfer is logged and reported to the caller as a null reply.
                Logger.Error(error);
            }

            return reply;
        }
예제 #2
0
        /// <summary>
        /// Sends a registration message for a slave to the master listening on
        /// 127.0.0.1:<c>GlobalContext.MasterSocketPort</c> and returns the value the
        /// master replies with (the allocated port, going by the command name
        /// <c>CMD_RegisterSlavePort</c>).
        /// </summary>
        /// <param name="slaveIdentity">Identity sent to the master as <c>RegisterPortArgument.SlaveIdentity</c>.</param>
        /// <returns>
        /// The reply's <c>Result</c> converted with <c>ToInt()</c> when the reply status is 1;
        /// otherwise -1 (ping failed, no reply received, or a non-success status).
        /// </returns>
        public static int RegisterSlaveToMaster(string slaveIdentity)
        {
            var result = -1;

            using (var conn = new SoapTcpConnection("127.0.0.1", GlobalContext.MasterSocketPort))
            {
                if (conn.State == ConnectionState.Closed)
                {
                    conn.Open();
                }

                // Do not attempt registration when the master does not answer the ping.
                if (conn.Ping() == false)
                {
                    return result;
                }

                // Build and send the registration SOAP message.
                var paras = new RegisterPortArgument {
                    SlaveIdentity = slaveIdentity
                };
                string      msg    = JsonConvert.SerializeObject(paras);
                SoapMessage sopMsg = new SoapMessage()
                {
                    Head = CommandConstants.CMD_RegisterSlavePort,
                    Body = msg
                };

                var repResult = conn.SendSoapMessage(sopMsg);

                // Other callers of SendSoapMessage treat its result as nullable, so guard here
                // as well instead of failing with a NullReferenceException.
                if (null != repResult && repResult.Status == 1)
                {
                    result = repResult.Result.ToInt();
                }
            }

            return result;
        }
예제 #3
0
        /// <summary>
        /// Queries the platform named in <paramref name="webArgs"/> for the given keyword and
        /// filter conditions and returns that platform's product item list.
        /// </summary>
        /// <param name="webArgs">
        /// Search arguments. The method also writes to it: <c>ResolvedUrl</c> is set to the
        /// resolved search URL and <c>SystemAttachParas["SoapTcpConnectionString"]</c> is set to
        /// the crawler connection so the resolver can reuse it.
        /// </param>
        /// <returns>
        /// An empty model when <paramref name="webArgs"/> is not valid; the cached model on a
        /// cache hit; otherwise whatever the resolver's <c>ResolvePageContent</c> returns.
        /// Failures are logged, not thrown. The result may be <c>null</c> (see the note at the
        /// cache lookup below).
        /// </returns>
        public SearchProductViewModel QueryProductsByKeyWords(BaseFetchWebPageArgument webArgs)
        {
            SearchProductViewModel dataModel = new SearchProductViewModel();

            if (webArgs.IsValid() == false)
            {
                return dataModel;
            }

            try
            {
                // Register the search term with the hot-word service.
                HotWordService.AddWord(webArgs.KeyWord);

                // When page caching is enabled, try to serve the result from the cache.
                if (true == WorkContext.IsFetchPageCacheaAble)
                {
                    // NOTE: on a cache miss this leaves dataModel null, so any failure further
                    // down makes the method return null rather than an empty model.
                    dataModel = WorkContext.GetFetchPageResultFromCache(webArgs);
                    if (null != dataModel)
                    {
                        return dataModel;
                    }
                }

                // Factory lookup of the content resolver for the requested platform.
                var resolver = ResolverFactory.GetSearchProductResolver(webArgs.Platform);

                // Resolve the search URL for these arguments. Everything below dereferences it,
                // so fail fast with a descriptive error (logged by the catch below) instead of
                // an opaque NullReferenceException after a crawler connection was acquired.
                var searchUrl = resolver.ResolveSearchUrl(webArgs);
                if (null == searchUrl)
                {
                    throw new InvalidOperationException(
                        "ResolveSearchUrl returned null; cannot fetch search page for keyword: " + webArgs.KeyWord);
                }
                webArgs.ResolvedUrl = searchUrl;

                string pageContent = string.Empty;

                using (var connMgr = new WebCrawlerConnConfigManager())
                {
                    var connStrConfig = connMgr.Connection;

                    // Expose the connection to the resolver through the attached parameters.
                    webArgs.SystemAttachParas["SoapTcpConnectionString"] = connStrConfig;

                    if (searchUrl.IsNeedPreRequest == true)
                    {
                        // 1. open the TCP connection, 2. send the arguments, 3. read the result.
                        using (var conn = new SoapTcpConnection(connStrConfig))
                        {
                            if (conn.State == ConnectionState.Closed)
                            {
                                conn.Open();
                            }

                            var soapCmd = new SoapMessage()
                            {
                                Head = CommandConstants.CMD_FetchPage
                            };
                            soapCmd.Body = webArgs.ToJson();

                            var dataContainer = conn.SendSoapMessage(soapCmd);
                            if (null != dataContainer && dataContainer.Status == 1)
                            {
                                pageContent = dataContainer.Result;
                            }
                            else
                            {
                                StringBuilder errMsg = new StringBuilder("抓取网页请求失败!参数:");
                                errMsg.Append(soapCmd.Body);
                                if (null != dataContainer && !string.IsNullOrEmpty(dataContainer.ErrorMsg))
                                {
                                    errMsg.Append(";服务端错误消息:")
                                    .Append(dataContainer.ErrorMsg);
                                }
                                // Caught and logged by the catch at the end of this method.
                                throw new InvalidOperationException(errMsg.ToString());
                            }
                        }
                    }
                }

                // Parse the page content. URLs that do not need a pre-request are handed to the
                // resolver with empty content; the resolver then fetches and parses them itself.
                if (!string.IsNullOrEmpty(pageContent) || !searchUrl.IsNeedPreRequest)
                {
                    dataModel = resolver.ResolvePageContent(webArgs, pageContent);
                    if (null != dataModel)
                    {
                        dataModel.KeyWord = webArgs.KeyWord;
                        dataModel.IsNeedResolveHeaderTags = webArgs.IsNeedResolveHeaderTags;
                    }
                }
            }
            catch (Exception ex)
            {
                Logger.Error(ex);
            }

            // Store non-empty results in the page cache when caching is enabled.
            if (true == WorkContext.IsFetchPageCacheaAble &&
                null != dataModel &&
                dataModel.Products.IsNotEmpty())
            {
                int cacheTime = ConfigHelper.AppSettingsConfiguration.GetConfigInt("FetchPageCacheTime");
                if (cacheTime <= 0)
                {
                    cacheTime = 60; // default: cache page results for 60 seconds
                }
                WorkContext.SetFetchPageResultFromCache(webArgs, dataModel, cacheTime);
            }
            return dataModel;
        }
예제 #4
0
        /// <summary>
        /// Parses a search result page: extracts the JSON object that follows the
        /// <c>g_page_config</c> marker and converts it into brands, filter tags and products.
        /// For the first page (<c>PageIndex == 0</c>) a second, sliced JSONP request is issued
        /// in a background task and its products are appended.
        /// </summary>
        /// <param name="webArgs">
        /// Fetch arguments. When the sliced request runs, <c>webArgs.ResolvedUrl</c> is
        /// overwritten with the sliced JSONP URL from inside the background task.
        /// </param>
        /// <param name="content">Page content to parse.</param>
        /// <returns>
        /// A dictionary with a "Products" entry and, when header tags are requested and the
        /// page has navigation data, a "Tags" entry and optionally a "Brands" entry.
        /// <c>null</c> when <paramref name="content"/> does not contain <c>g_page_config</c> or
        /// the JSON deserializes to <c>null</c>. On an exception the error is logged and the
        /// entries collected so far are returned.
        /// </returns>
        public override Dictionary<string, object> ResolveSearchPageContent(BaseFetchWebPageArgument webArgs, string content)
        {
            var resultBag = new Dictionary<string, object>();

            try
            {
                // Ordinal search: the markers are fixed identifiers, not linguistic text.
                int startPos = content.IndexOf("g_page_config", StringComparison.Ordinal);
                if (startPos < 0)
                {
                    return null; // not a valid result page
                }

                // Kick off the request for the remaining items of the first search page.
                Task<string> tskSilcedJsonpContent = null;
                if (webArgs.PageIndex == 0)
                {
                    tskSilcedJsonpContent = Task.Factory.StartNew(() =>
                    {
                        string jsonpContent = "";
                        try
                        {
                            // 1. open the TCP connection, 2. send the arguments, 3. read the result.
                            if (!webArgs.SystemAttachParas.ContainsKey("SoapTcpConnectionString"))
                            {
                                return jsonpContent;
                            }
                            var connStrConfig = webArgs.SystemAttachParas["SoapTcpConnectionString"] as WebCrawlerConnection;
                            if (null == connStrConfig)
                            {
                                return jsonpContent;
                            }

                            // Point the arguments at the sliced JSONP URL of the first page.
                            string urlOfSlicedJsonp = this.ResolveSlicedSearchPageSilcedUrl(webArgs);
                            webArgs.ResolvedUrl = new ResolvedSearchUrlWithParas {
                                Url = urlOfSlicedJsonp
                            };

                            using (var conn = new SoapTcpConnection(connStrConfig))
                            {
                                if (conn.State == ConnectionState.Closed)
                                {
                                    conn.Open();
                                }

                                var soapCmd = new SoapMessage()
                                {
                                    Head = CommandConstants.CMD_FetchPage
                                };
                                soapCmd.Body = JsonConvert.SerializeObject(webArgs);

                                var dataContainer = conn.SendSoapMessage(soapCmd);
                                if (null != dataContainer && dataContainer.Status == 1)
                                {
                                    jsonpContent = dataContainer.Result;
                                }
                                else
                                {
                                    StringBuilder errMsg = new StringBuilder("抓取网页请求失败!参数:");
                                    errMsg.Append(soapCmd.Body);
                                    if (null != dataContainer && !string.IsNullOrEmpty(dataContainer.ErrorMsg))
                                    {
                                        errMsg.Append(";服务端错误消息:")
                                        .Append(dataContainer.ErrorMsg);
                                    }
                                    PluginContext.Logger.Error(errMsg.ToString());
                                }
                            }
                        }
                        catch (Exception taskEx)
                        {
                            // Handle the failure inside the task: the method may leave before
                            // reading Result, which would leave the exception unobserved.
                            PluginContext.Logger.Error(taskEx);
                        }

                        return jsonpContent;
                    });
                }

                // Cut out the text between the two markers, then the JSON object inside it
                // (from the first '{' up to and including the '}' of the first "};").
                int endPos        = content.IndexOf("g_srp_loadCss", startPos, StringComparison.Ordinal) - startPos;
                var secondContent = content.Substring(startPos, endPos);
                int secStartPos   = secondContent.IndexOf('{');
                int secEndPos     = secondContent.IndexOf("};", StringComparison.Ordinal) - secStartPos + 1;
                string jsonData   = secondContent.Substring(secStartPos, secEndPos);

                TaobaoPageJsonResut pageJsonObj = JsonConvert.DeserializeObject<TaobaoPageJsonResut>(jsonData);
                if (null == pageJsonObj)
                {
                    return null;
                }

                if (webArgs.IsNeedResolveHeaderTags == true)
                {
                    var navNode = pageJsonObj.mods.nav;
                    if (null != navNode && null != navNode.data)
                    {
                        // Only the common filters are parsed; the advanced filters
                        // (navNode.data.adv) are intentionally not used.
                        var commonNode = navNode.data.common;
                        var lstTags    = new List<KeyWordTagGroup>();

                        if (null != commonNode && commonNode.Any())
                        {
                            #region brand parsing

                            // 1. If a brand filter is present, turn its entries into brand tags.
                            var brandNode = commonNode.FirstOrDefault(x => x.text == "品牌" && x.sub != null);
                            if (null != brandNode)
                            {
                                var lstBrands = new List<BrandTag>();
                                foreach (var subItem in brandNode.sub)
                                {
                                    var model = new BrandTag();
                                    model.Platform    = SupportPlatformEnum.Taobao;
                                    model.FilterField = "ppath"; // filter field parameter to use

                                    model.BrandId   = subItem.value;
                                    model.BrandName = subItem.text;
                                    model.CharIndex = PinYin.GetFirstLetter(model.BrandName);
                                    lstBrands.Add(model);
                                }
                                resultBag.Add("Brands", lstBrands);
                            }

                            #endregion

                            #region tag parsing

                            // 2. Every other filter node becomes a tag group named after the node.
                            // This used to run even when commonNode was null; the resulting
                            // exception was swallowed and the product list was lost with it.
                            foreach (var itemNode in commonNode.Where(x => x.text != "品牌" && x.sub != null))
                            {
                                ProcessTags(lstTags, itemNode.sub, itemNode.text);
                            }

                            #endregion
                        }

                        resultBag.Add("Tags", lstTags);
                    }
                }

                #region product parsing

                // Registered before filling so that products parsed so far survive a later failure.
                var lstProducts = new ProductBaseCollection();
                resultBag.Add("Products", lstProducts);

                var itemListNode = pageJsonObj.mods.itemlist;
                if (null != itemListNode && itemListNode.data != null && null != itemListNode.data.auctions)
                {
                    foreach (var itemProduct in itemListNode.data.auctions)
                    {
                        TaobaoProduct modelProduct = this.ResolverProductDom(itemProduct);
                        if (null != modelProduct)
                        {
                            lstProducts.Add(modelProduct);
                        }
                    }
                }

                // Taobao slices the first result page: the HTML carries 36 items and a
                // follow-up JSONP request loads 12 more.
                if (webArgs.PageIndex == 0 && null != tskSilcedJsonpContent)
                {
                    string jsonpContent = tskSilcedJsonpContent.Result; // waits for the background request
                    if (!string.IsNullOrEmpty(jsonpContent) && jsonpContent.Contains("API.CustomizedApi"))
                    {
                        // Keep what follows the first ':' and drop the last 3 characters
                        // (presumably the JSONP wrapper's closing characters; verify against a real response).
                        int    startIdx        = jsonpContent.IndexOf(':') + 1;
                        int    endIdx          = jsonpContent.Length - startIdx - 3;
                        string pureJsonContent = jsonpContent.Substring(startIdx, endIdx);
                        var slicedJsonpResut   = JsonConvert.DeserializeObject<TaobaoSlicedJsonpResut>(pureJsonContent);

                        if (null != slicedJsonpResut)
                        {
                            var itemList = slicedJsonpResut.itemlist;
                            if (null != itemList && itemList.auctions != null)
                            {
                                foreach (var itemProduct in itemList.auctions)
                                {
                                    TaobaoProduct modelProduct = this.ResolverProductDom(itemProduct);
                                    if (null != modelProduct)
                                    {
                                        lstProducts.Add(modelProduct);
                                    }
                                }
                            }
                        }
                    }
                }

                #endregion
            }
            catch (Exception ex)
            {
                PluginContext.Logger.Error(ex);
            }
            return resultBag;
        }