Exemplo n.º 1
0
        /// <summary>
        /// Reads the body of a <c>text/html</c> response into a new <see cref="HtmlResponse"/>.
        /// </summary>
        /// <param name="httpWebResponse">Response whose body stream is read to the end and then disposed.</param>
        /// <param name="encoding">
        /// Name of the character encoding used to decode the body. When it is null, empty or
        /// whitespace, UTF-8 is used instead of throwing.
        /// </param>
        /// <returns>
        /// A new <see cref="HtmlResponse"/> whose <c>Html</c> is the decoded body. When the response
        /// exposes no stream, the freshly constructed instance is returned as is.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="encoding"/> is non-blank but is not a supported encoding name.
        /// </exception>
        private HtmlResponse GetTextHtmlResponse(HttpWebResponse httpWebResponse, string encoding)
        {
            var response = new HtmlResponse();

            using (var stream = httpWebResponse.GetResponseStream())
            {
                if (stream == null)
                {
                    return response;
                }

                // Encoding.GetEncoding rejects a null/empty name, which is what a response
                // without a declared charset typically yields; fall back to UTF-8 in that case.
                var textEncoding = string.IsNullOrWhiteSpace(encoding)
                    ? Encoding.UTF8
                    : Encoding.GetEncoding(encoding);

                // The using blocks close and dispose both reader and stream; no manual
                // Close()/Dispose() calls are needed.
                using (var reader = new StreamReader(stream, textEncoding))
                {
                    response.Html = reader.ReadToEnd();
                }
            }

            return response;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Turns an HTML response into a message context.
        ///
        /// When the originating request has <c>IsExtractLink</c> set, the response is first
        /// passed to <c>ExtractLink</c>. The returned context then carries a clone of the
        /// response and the raw HTML text.
        /// </summary>
        /// <param name="htmlResponse">The HTML response to process; its <c>Request</c> is read.</param>
        /// <returns>
        /// A new <see cref="HtmlMessageContext"/> whose <c>Response</c> is
        /// <c>htmlResponse.Clone()</c> cast with <c>as Response</c> (null if the clone is not a
        /// <c>Response</c>) and whose <c>Text</c> is <c>htmlResponse.Html</c>.
        /// </returns>
        private HtmlMessageContext ExtractHtmlResponse(HtmlResponse htmlResponse)
        {
            // Link extraction is opt-in per request.
            var shouldExtractLinks = htmlResponse.Request.IsExtractLink;
            if (shouldExtractLinks)
            {
                ExtractLink(htmlResponse);
            }

            var context = new HtmlMessageContext();
            context.Response = htmlResponse.Clone() as Response;
            context.Text = htmlResponse.Html;

            return context;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Extracts the links of an HTML document and schedules the ones not seen before.
        ///
        /// Each distinct URL returned by <c>GetHrefUrlList</c> is hashed with <c>ToMd5</c> into a
        /// cache key of the form <c>spider:url:{hash}</c>. When <c>Exist</c> reports the key as
        /// absent and the URL is not a file URI, a clone of the originating request (with
        /// <c>Url</c> and <c>Referer</c> updated) is pushed to the scheduler and its JSON
        /// serialization is stored in the cache under that key.
        /// </summary>
        /// <param name="htmlResponse">
        /// The response whose <c>Html</c> is scanned. When its request has
        /// <c>IgnoreThirdpartyDomain</c> set, only URLs whose host equals the request host
        /// (ignoring a leading "www.") or is a sub-domain of it are kept.
        /// </param>
        private void ExtractLink(HtmlResponse htmlResponse)
        {
            // Parse out every URL and drop duplicates.
            var urlList = GetHrefUrlList(htmlResponse.Html, htmlResponse.Request.Url).Distinct();

            // Filter out third-party domains.
            if (htmlResponse.Request.IgnoreThirdpartyDomain)
            {
                var siteHost = TrimLeadingWww(new Uri(htmlResponse.Request.Url).Host);

                // Compare parsed hosts rather than searching the raw URL text: a substring
                // test would accept foreign URLs that merely mention the host in their path
                // or query string. URLs that cannot be parsed are dropped here.
                urlList = urlList.Where(p => IsUrlOnHost(p, siteHost)).ToList();
            }

            // Process the URL list one by one; a failure on one URL must not stop the rest.
            foreach (var url in urlList)
            {
                try
                {
                    var urlHash  = url.ToMd5();
                    var redisKey = string.Format("spider:url:{0}", urlHash);

                    // Only URLs that are not yet recorded qualify for crawling.
                    if (Exist(redisKey) == false)
                    {
                        var requestUri = new Uri(url);
                        var request    = htmlResponse.Request.Clone() as Request;
                        if (request != null && requestUri.IsFile == false)
                        {
                            request.Url     = url;
                            request.Referer = htmlResponse.Request.Url;

                            // Push to the scheduling queue.
                            SchedulerManage.Instance.Push(request);

                            var redisValue = JsonConvert.SerializeObject(request);

                            // Record the URL in the cache.
                            _cacheContext.Set(redisKey, redisValue);
                        }
                    }
                }
                catch (Exception exception)
                {
                    _logger.Error(exception);
                }
            }
        }

        /// <summary>
        /// Removes a single leading "www." label from a host name, leaving any other
        /// occurrence of that text untouched.
        /// </summary>
        private static string TrimLeadingWww(string host)
        {
            const string prefix = "www.";

            return host.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)
                ? host.Substring(prefix.Length)
                : host;
        }

        /// <summary>
        /// Tells whether <paramref name="url"/> is an absolute URL whose host is
        /// <paramref name="siteHost"/> itself or one of its sub-domains.
        /// </summary>
        private static bool IsUrlOnHost(string url, string siteHost)
        {
            Uri candidate;
            if (Uri.TryCreate(url, UriKind.Absolute, out candidate) == false)
            {
                return false;
            }

            var host = candidate.Host;

            return host.Equals(siteHost, StringComparison.OrdinalIgnoreCase)
                   || host.EndsWith("." + siteHost, StringComparison.OrdinalIgnoreCase);
        }