/// <summary>
/// Gets a unique 64-bit identifier for a URL: the first 8 bytes of the
/// MD5 digest of its string form, read as a little-endian <see cref="ulong"/>.
/// </summary>
/// <param name="urlInfo">
/// The url info whose <c>UrlString</c> is hashed.
/// </param>
/// <returns>
/// The <see cref="ulong"/> built from the first 8 bytes of the MD5 hash.
/// </returns>
public static ulong GetUniqueIdentifier(this UrlInfo urlInfo)
{
    // NOTE: Encoding.Default is platform dependent; kept deliberately so that
    // identifiers already computed by this method remain stable.
    byte[] bytes = Encoding.Default.GetBytes(urlInfo.UrlString);

    // BUG FIX: the original created an MD5CryptoServiceProvider and never
    // disposed it, leaking a native crypto handle on every call.
    using (var md5 = MD5.Create())
    {
        byte[] hashValue = md5.ComputeHash(bytes);
        return BitConverter.ToUInt64(hashValue, 0);
    }
}
/// <summary>
/// The parse links: generic hyperlink extraction from a downloaded page.
/// Anchors are pulled out with a regex, filtered through the configured
/// rules (escape suffixes, href keywords, host lock, regex match, and the
/// AddUrlEvent veto) and the survivors are enqueued for crawling.
/// No interface is exposed for site-specific link parsing; use
/// <c>CustomParseLinkEvent3</c> to add/remove candidates (e.g. URLs that are
/// produced by JavaScript and therefore invisible to the regex).
/// </summary>
/// <param name="urlInfo">
/// The url info of the page the HTML came from.
/// </param>
/// <param name="html">
/// The html of that page.
/// </param>
private void ParseLinks(UrlInfo urlInfo, string html)
{
    // Stop descending once the configured depth (when > 0) is reached.
    if (this.Settings.Depth > 0 && urlInfo.Depth >= this.Settings.Depth)
    {
        return;
    }

    // Maps href -> anchor text (with any inner tags stripped).
    var urlDictionary = new Dictionary<string, string>();

    Match match = Regex.Match(html, "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>");
    while (match.Success)
    {
        // Group 1 is the href, group 2 the anchor body; strip nested tags
        // from the body so only the visible text remains.
        string urlKey = match.Groups[1].Value;
        string urlValue = Regex.Replace(match.Groups[2].Value, "(?i)<.*?>", string.Empty);
        urlDictionary[urlKey] = urlValue;
        match = match.NextMatch();
    }

    // Let subscribers post-process the candidate set: add URLs the regex
    // missed (e.g. JS-generated paging links) or drop unwanted ones.
    if (CustomParseLinkEvent3 != null)
    {
        CustomParseLinkEvent3Args linkArgs = new CustomParseLinkEvent3Args
        {
            UrlInfo = urlInfo,
            UrlDictionary = urlDictionary,
            Html = html
        };
        CustomParseLinkEvent3(linkArgs);
        urlDictionary = linkArgs.UrlDictionary;
    }

    foreach (var item in urlDictionary)
    {
        string href = item.Key;
        string text = item.Value;

        if (string.IsNullOrEmpty(href))
        {
            continue;
        }

        // Skip links ending in one of the configured escape suffixes
        // (case-insensitive comparison).
        if (this.Settings.EscapeLinks != null && this.Settings.EscapeLinks.Count > 0
            && this.Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        // When href keywords are configured, only keep links that contain
        // at least one of them.
        if (this.Settings.HrefKeywords != null && this.Settings.HrefKeywords.Count > 0
            && !this.Settings.HrefKeywords.Any(href.Contains))
        {
            continue;
        }

        // Decode a few common escapes found in hrefs.
        // BUG FIX: the original ended with Replace("&", "&") — a no-op; the
        // intent was to decode the HTML entity "&amp;" back to "&".
        string url = href.Replace("%3f", "?")
                         .Replace("%3d", "=")
                         .Replace("%2f", "/")
                         .Replace("&amp;", "&");

        // Drop fragments and non-crawlable schemes.
        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#")
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // A link not starting with "http" is treated as relative and is
        // resolved against the page it was found on.
        var baseUri = new Uri(urlInfo.UrlString);
        Uri currentUri = url.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                             ? new Uri(url)
                             : new Uri(baseUri, url);
        url = currentUri.AbsoluteUri;

        if (this.Settings.LockHost)
        {
            // Compare hosts with the first (sub-domain) label removed, so
            // e.g. mail.pzcast.com and www.pzcast.com count as one site.
            // BUG FIX: the original used Split('.').Skip(1).Aggregate(...),
            // which throws InvalidOperationException for single-label hosts
            // such as "localhost"; the helper below is safe for those.
            if (StripFirstHostLabel(baseUri.Host) != StripFirstHostLabel(currentUri.Host))
            {
                continue;
            }
        }

        // Skip URLs that fail the user-supplied regular-expression filter.
        if (!this.IsMatchRegular(url))
        {
            continue;
        }

        // Final veto point: a subscriber returning false suppresses the URL.
        var addUrlEventArgs = new AddUrlEventArgs
        {
            Title = text,
            Depth = urlInfo.Depth + 1,
            Url = url
        };
        if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs))
        {
            continue;
        }

        // All filters passed: enqueue for crawling one level deeper.
        UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = urlInfo.Depth + 1 });
    }
}

/// <summary>
/// Returns <paramref name="host"/> with its first dot-separated label
/// removed ("mail.pzcast.com" -> "pzcast.com"); a host without a dot is
/// returned unchanged.
/// </summary>
private static string StripFirstHostLabel(string host)
{
    int dotIndex = host.IndexOf('.');
    return dotIndex < 0 ? host : host.Substring(dotIndex + 1);
}
/// <summary>
/// The crawl process. Worker loop run by each crawler thread: dequeues
/// URLs, downloads them, decompresses gzip responses, then hands the HTML
/// to <c>ParseLinks</c> and the <c>DataReceivedEvent</c> subscribers.
/// The loop exits only when the queue is empty AND every worker thread has
/// marked itself idle in <c>ThreadStatus</c>.
/// </summary>
/// <param name="threadIndex">
/// The thread index.
/// </param>
private void CrawlProcess(object threadIndex)
{
    var currentThreadIndex = (int)threadIndex;
    while (true)
    {
        // Based on the queue size and the number of idle threads, decide
        // whether this thread sleeps or exits: exit only once all threads
        // are idle, otherwise sleep and re-check.
        if (UrlQueue.Instance.Count == 0)
        {
            this.ThreadStatus[currentThreadIndex] = true;
            if (!this.ThreadStatus.Any(t => t == false))
            {
                break;
            }
            Thread.Sleep(2000);
            continue;
        }

        this.ThreadStatus[currentThreadIndex] = false;

        // NOTE(review): another thread may drain the queue between the two
        // Count checks, so DeQueue below can still return null — that case
        // is handled by the null check inside the try block.
        if (UrlQueue.Instance.Count == 0)
        {
            continue;
        }

        UrlInfo urlInfo = UrlQueue.Instance.DeQueue();
        HttpWebRequest request = null;
        HttpWebResponse response = null;
        try
        {
            if (urlInfo == null)
            {
                continue;
            }

            // Automatic rate limiting: a random 1–5 second pause per request.
            if (this.Settings.AutoSpeedLimit)
            {
                int span = this.random.Next(1000, 5000);
                Thread.Sleep(span);
            }

            // Create and configure the web request (headers, proxy, cookies
            // — whatever ConfigRequest applies to the reference it receives).
            request = WebRequest.Create(urlInfo.UrlString) as HttpWebRequest;
            this.ConfigRequest(request);
            if (request != null)
            {
                response = request.GetResponse() as HttpWebResponse;
            }

            if (response != null) // a response was obtained; data is available
            {
                this.PersistenceCookie(response); // persist cookies for later requests

                Stream stream = null;

                // If the page is compressed, decompress the stream.
                // NOTE(review): this comparison is case-sensitive; a server
                // replying "GZIP" would bypass decompression — confirm.
                if (response.ContentEncoding == "gzip")
                {
                    Stream responseStream = response.GetResponseStream();
                    if (responseStream != null)
                    {
                        stream = new GZipStream(responseStream, CompressionMode.Decompress);
                    }
                }
                else
                {
                    stream = response.GetResponseStream();
                }

                using (stream)
                {
                    // Convert the stream to a string; encoding issues
                    // (garbled pages) are handled inside ParseContent.
                    string html = this.ParseContent(stream, response.CharacterSet);

                    // Extract links from the page HTML and enqueue them.
                    this.ParseLinks(urlInfo, html);

                    if (this.DataReceivedEvent != null)
                    {
                        // Deliver the downloaded page to subscribers.
                        this.DataReceivedEvent(
                            new DataReceivedEventArgs
                                {
                                    Url = urlInfo.UrlString,
                                    Depth = urlInfo.Depth,
                                    Html = html
                                });

                        #region 20150930之前处理的方式
                        /*20150930之前处理的方式
                         * if (this.CustomParseLinkEvent != null)
                         * {
                         *     //自定义事件,将DataReceivedEventArgs传入,将处理过程暴露给外部,将自定义方法处理获得的UrlInfo(主要是url)加入队列
                         *     var customUrlInfo = this.CustomParseLinkEvent(new DataReceivedEventArgs
                         *     {
                         *         Url = urlInfo.UrlString,
                         *         Depth = urlInfo.Depth,
                         *         Html = html
                         *     });
                         *     if (customUrlInfo != null)
                         *     {
                         *         UrlQueue.Instance.EnQueue(customUrlInfo);
                         *     }
                         * }
                         */
                        #endregion 20150930之前处理的方式
                    }

                    // NOTE(review): redundant — the enclosing using block
                    // already disposes the stream.
                    if (stream != null)
                    {
                        stream.Close();
                    }
                }
            }
        }
        catch (Exception exception)
        {
            // Report failures through the error event instead of letting
            // the worker thread die; errors without a URL are dropped.
            if (this.CrawlErrorEvent != null)
            {
                if (urlInfo != null)
                {
                    this.CrawlErrorEvent(
                        new CrawlErrorEventArgs { Url = urlInfo.UrlString, Exception = exception });
                }
            }
        }
        finally
        {
            // Always release the connection resources for this iteration.
            if (request != null)
            {
                request.Abort();
            }
            if (response != null)
            {
                response.Close();
            }
        }
    }
}