/// <summary>
/// Reads the body of a response with ContentType text/html and returns it
/// as the Html of a new <see cref="HtmlResponse"/>.
/// </summary>
/// <param name="httpWebResponse">The HTTP response whose body stream is read.</param>
/// <param name="encoding">
/// Name of the character encoding used to decode the body. When null or blank,
/// UTF-8 is used; an unknown non-blank name still makes
/// <see cref="Encoding.GetEncoding(string)"/> throw.
/// </param>
/// <returns>
/// A new <see cref="HtmlResponse"/>. Its Html holds the decoded body, or is not
/// assigned at all when the response exposes no stream.
/// </returns>
private HtmlResponse GetTextHtmlResponse(HttpWebResponse httpWebResponse, string encoding)
{
    var response = new HtmlResponse();

    using (var stream = httpWebResponse.GetResponseStream())
    {
        if (stream == null)
        {
            return response;
        }

        // A missing charset name would make Encoding.GetEncoding throw,
        // so decode as UTF-8 in that case instead of failing the whole page.
        var bodyEncoding = string.IsNullOrWhiteSpace(encoding)
            ? Encoding.UTF8
            : Encoding.GetEncoding(encoding);

        // The using blocks dispose both reader and stream; no explicit
        // Close()/Dispose() calls are needed.
        using (var reader = new StreamReader(stream, bodyEncoding))
        {
            response.Html = reader.ReadToEnd();
        }
    }

    return response;
}
/// <summary>
/// Turns a downloaded HTML response into the message handed on to consumers.
///
/// Steps:
/// 1. When the originating request has IsExtractLink set, run
///    <see cref="ExtractLink"/> on the response.
/// 2. Build an <see cref="HtmlMessageContext"/> carrying a clone of the
///    response and the page's HTML text.
/// </summary>
/// <param name="htmlResponse">The HTML response to process.</param>
/// <returns>
/// The message context; its Response is the clone cast with <c>as</c>, so it is
/// null when the clone is not a <c>Response</c>.
/// </returns>
private HtmlMessageContext ExtractHtmlResponse(HtmlResponse htmlResponse)
{
    // Link extraction is opt-in per request.
    var wantsLinks = htmlResponse.Request.IsExtractLink;
    if (wantsLinks)
    {
        ExtractLink(htmlResponse);
    }

    var context = new HtmlMessageContext();
    context.Response = htmlResponse.Clone() as Response;
    context.Text = htmlResponse.Html;
    return context;
}
/// <summary>
/// Extracts the links of a page and schedules the ones that are not known yet.
///
/// Steps:
/// 1. Collect the distinct URLs returned by GetHrefUrlList for the page.
/// 2. When the request has IgnoreThirdpartyDomain set, keep only URLs whose
///    host is the page's host (ignoring a leading "www.") or a sub-domain of it.
/// 3. For each URL build the key "spider:url:{md5(url)}" and skip it when
///    Exist reports the key as present.
/// 4. Otherwise clone the originating request, point it at the URL, push it to
///    the scheduler and store its JSON form in the cache under that key.
/// A failure on one URL is logged and does not stop the remaining URLs.
/// </summary>
/// <param name="htmlResponse">The response whose Html is scanned for links.</param>
private void ExtractLink(HtmlResponse htmlResponse)
{
    var pageUrl = htmlResponse.Request.Url;

    // Parse out every URL and drop duplicates.
    var urlList = GetHrefUrlList(htmlResponse.Html, pageUrl).Distinct();

    // Filter out third-party domains.
    if (htmlResponse.Request.IgnoreThirdpartyDomain)
    {
        var siteHost = StripWwwPrefix(new Uri(pageUrl).Host);

        // Compare parsed hosts rather than searching the raw URL text, so that
        // "http://evil.com/?ref=example.com" is not mistaken for a first-party link.
        // A page without a host (siteHost empty) gives nothing to compare against,
        // so no filtering is applied in that case.
        if (siteHost.Length > 0)
        {
            urlList = urlList.Where(p => IsSameSiteUrl(p, siteHost)).ToList();
        }
    }

    // Process the URL list one by one.
    foreach (var url in urlList)
    {
        try
        {
            var urlHash = url.ToMd5();
            var redisKey = string.Format("spider:url:{0}", urlHash);

            // Already recorded: nothing to schedule.
            if (Exist(redisKey))
            {
                continue;
            }

            // file:// links are never crawled; check before paying for a clone.
            var requestUri = new Uri(url);
            if (requestUri.IsFile)
            {
                continue;
            }

            var request = htmlResponse.Request.Clone() as Request;
            if (request == null)
            {
                continue;
            }

            request.Url = url;
            request.Referer = pageUrl;

            // Push to the scheduling queue.
            SchedulerManage.Instance.Push(request);

            // Record the request in the cache under the URL's key.
            var redisValue = JsonConvert.SerializeObject(request);
            _cacheContext.Set(redisKey, redisValue);
        }
        catch (Exception exception)
        {
            // Best effort: one bad URL must not abort the rest of the page.
            _logger.Error(exception);
        }
    }
}

/// <summary>
/// Tells whether <paramref name="url"/> is an absolute URL whose host equals
/// <paramref name="siteHost"/> (ignoring a leading "www.") or is a sub-domain of it.
/// </summary>
private static bool IsSameSiteUrl(string url, string siteHost)
{
    Uri candidate;
    if (string.IsNullOrEmpty(url) || !Uri.TryCreate(url, UriKind.Absolute, out candidate))
    {
        return false;
    }

    var host = StripWwwPrefix(candidate.Host);
    return host.Equals(siteHost, StringComparison.OrdinalIgnoreCase)
        || host.EndsWith("." + siteHost, StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Removes a single leading "www." label from <paramref name="host"/>, if present.
/// </summary>
private static string StripWwwPrefix(string host)
{
    const string prefix = "www.";
    return host.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)
        ? host.Substring(prefix.Length)
        : host;
}