/// <summary>
/// Computes a 64-bit identifier for a URL by taking the first eight bytes of the
/// MD5 digest of its string form.
/// </summary>
/// <param name="urlInfo">
/// The URL descriptor whose <c>UrlString</c> is hashed.
/// </param>
/// <returns>
/// The first eight digest bytes converted to a <see cref="ulong"/>.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="urlInfo"/> is <c>null</c>.
/// </exception>
public static ulong GetUniqueIdentifier(this UrlInfo urlInfo)
{
    if (urlInfo == null)
    {
        throw new ArgumentNullException("urlInfo");
    }

    // UTF-8 instead of Encoding.Default: the default encoding depends on the
    // platform, so the same non-ASCII URL could hash differently between machines.
    // Pure ASCII URLs produce the same bytes as before.
    byte[] urlBytes = Encoding.UTF8.GetBytes(urlInfo.UrlString);

    // The hash provider is IDisposable; release it deterministically.
    using (var md5 = new MD5CryptoServiceProvider())
    {
        byte[] digest = md5.ComputeHash(urlBytes);
        return BitConverter.ToUInt64(digest, 0);
    }
}
/// <summary>
/// Extracts the anchor links from a downloaded page, filters and normalizes them,
/// and enqueues every accepted link one level deeper than the current page.
/// </summary>
/// <param name="urlInfo">
/// The page the HTML came from; supplies the base URL and the current depth.
/// </param>
/// <param name="html">
/// The HTML text of the page.
/// </param>
private void ParseLinks(UrlInfo urlInfo, string html)
{
    // A positive Settings.Depth acts as a limit: a page already at that depth
    // contributes no further links.
    if (this.Settings.Depth > 0 && urlInfo.Depth >= this.Settings.Depth)
    {
        return;
    }

    // Created on first use so the page URL is only parsed when a link needs it.
    Uri baseUri = null;

    foreach (var item in ExtractAnchorLinks(html))
    {
        string href = item.Key;
        string text = item.Value;

        if (string.IsNullOrEmpty(href) || !this.IsHrefAllowedBySettings(href))
        {
            continue;
        }

        string url = DecodeHrefEscapes(href);

        // Fragments, mail links and script pseudo-links are never crawled.
        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#", StringComparison.Ordinal)
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        baseUri = baseUri ?? new Uri(urlInfo.UrlString);

        // A malformed href must only drop that one link, not abort the whole page.
        Uri currentUri;
        if (!TryResolveLink(baseUri, url, out currentUri))
        {
            continue;
        }

        url = currentUri.AbsoluteUri;

        if (this.Settings.LockHost)
        {
            // Two hosts count as the same site when they are equal after the first
            // label is removed, e.g. mail.pzcast.com and www.pzcast.com.
            if (StripFirstHostLabel(baseUri.Host) != StripFirstHostLabel(currentUri.Host))
            {
                continue;
            }
        }

        if (!this.IsMatchRegular(url))
        {
            continue;
        }

        var addUrlEventArgs = new AddUrlEventArgs
        {
            Title = text,
            Depth = urlInfo.Depth + 1,
            Url = url
        };

        // Read the delegate once so an unsubscription on another thread cannot
        // null it out between the check and the call.
        var addUrlHandler = this.AddUrlEvent;
        if (addUrlHandler != null && !addUrlHandler(addUrlEventArgs))
        {
            continue;
        }

        UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = urlInfo.Depth + 1 });
    }
}

/// <summary>
/// Collects href/text pairs from the anchors in the HTML; a repeated href keeps the last text seen.
/// </summary>
private static Dictionary<string, string> ExtractAnchorLinks(string html)
{
    var links = new Dictionary<string, string>();

    for (Match anchor = Regex.Match(html, "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>");
         anchor.Success;
         anchor = anchor.NextMatch())
    {
        // The href is the key; the anchor text with nested tags removed is the value.
        string hrefKey = anchor.Groups[1].Value;
        string linkText = Regex.Replace(anchor.Groups[2].Value, "(?i)<.*?>", string.Empty);
        links[hrefKey] = linkText;
    }

    return links;
}

/// <summary>
/// Applies the Settings.EscapeLinks suffix exclusions and the Settings.HrefKeywords requirement to a raw href.
/// </summary>
private bool IsHrefAllowedBySettings(string href)
{
    if (this.Settings.EscapeLinks != null && this.Settings.EscapeLinks.Count > 0)
    {
        if (this.Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        {
            return false;
        }
    }

    if (this.Settings.HrefKeywords != null && this.Settings.HrefKeywords.Count > 0)
    {
        if (!this.Settings.HrefKeywords.Any(href.Contains))
        {
            return false;
        }
    }

    return true;
}

/// <summary>
/// Decodes the percent-escapes for '?', '=' and '/' (either hex case) and the HTML entity for '&amp;'.
/// </summary>
private static string DecodeHrefEscapes(string href)
{
    // Percent-encoding hex digits are case-insensitive, so both spellings are handled.
    // The last replacement previously mapped "&" to itself and did nothing; attribute
    // values carry the ampersand as the entity "&amp;".
    return href.Replace("%3f", "?").Replace("%3F", "?")
               .Replace("%3d", "=").Replace("%3D", "=")
               .Replace("%2f", "/").Replace("%2F", "/")
               .Replace("&amp;", "&");
}

/// <summary>
/// Resolves a link against the page URL, returning false instead of throwing when it is malformed.
/// </summary>
private static bool TryResolveLink(Uri baseUri, string url, out Uri resolved)
{
    // Links that start with "http" are tried as absolute first; if that fails
    // (e.g. a relative path such as "httpdocs/a.html") they fall through to
    // resolution against the page URL.
    if (url.StartsWith("http", StringComparison.OrdinalIgnoreCase)
        && Uri.TryCreate(url, UriKind.Absolute, out resolved))
    {
        return true;
    }

    try
    {
        resolved = new Uri(baseUri, url);
        return true;
    }
    catch (UriFormatException)
    {
        resolved = null;
        return false;
    }
}

/// <summary>
/// Returns the host without its first dot-separated label; a host with no dot is returned unchanged.
/// </summary>
private static string StripFirstHostLabel(string host)
{
    // The previous Skip(1).Aggregate(...) threw on single-label hosts such as "localhost".
    int firstDot = host.IndexOf('.');
    return firstDot < 0 ? host : host.Substring(firstDot + 1);
}
/// <summary>
/// Worker loop for one crawl thread: repeatedly dequeues a URL, downloads it,
/// parses its links, and raises the data-received or crawl-error event. The loop
/// ends when the queue is empty and every worker has marked itself idle.
/// </summary>
/// <param name="threadIndex">
/// The boxed <see cref="int"/> index of this worker's slot in the thread-status array.
/// </param>
private void CrawlProcess(object threadIndex)
{
    var currentThreadIndex = (int)threadIndex;

    while (true)
    {
        // Decide from the queue length and the idle flags of all workers whether
        // this thread should sleep or exit.
        if (UrlQueue.Instance.Count == 0)
        {
            this.threadStatus[currentThreadIndex] = true;
            if (!this.threadStatus.Any(t => t == false))
            {
                break;
            }

            Thread.Sleep(2000);
            continue;
        }

        this.threadStatus[currentThreadIndex] = false;

        // Another worker may have drained the queue since the first check.
        if (UrlQueue.Instance.Count == 0)
        {
            continue;
        }

        UrlInfo urlInfo = UrlQueue.Instance.DeQueue();
        if (urlInfo == null)
        {
            continue;
        }

        HttpWebRequest request = null;
        HttpWebResponse response = null;

        try
        {
            // Automatic rate limiting: random pause drawn from Next(1000, 5000) milliseconds.
            // NOTE(review): this.random is shared by all workers; if it is a
            // System.Random it is not thread-safe - confirm and guard if needed.
            if (this.Settings.AutoSpeedLimit)
            {
                int span = this.random.Next(1000, 5000);
                Thread.Sleep(span);
            }

            // Create and configure the web request. WebRequest.Create returns a
            // non-HTTP request type for other schemes, so the cast can yield null;
            // only configure and send a real HTTP request.
            request = WebRequest.Create(urlInfo.UrlString) as HttpWebRequest;
            if (request != null)
            {
                this.ConfigRequest(request);
                response = request.GetResponse() as HttpWebResponse;
            }

            if (response != null)
            {
                this.PersistenceCookie(response);

                Stream stream = response.GetResponseStream();

                // Decompress gzip-encoded pages. Content-coding tokens are
                // case-insensitive, so "GZIP" must be recognized too.
                if (stream != null
                    && string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    stream = new GZipStream(stream, CompressionMode.Decompress);
                }

                // The using block closes the stream; no explicit Close is needed.
                using (stream)
                {
                    string html = this.ParseContent(stream, response.CharacterSet);
                    this.ParseLinks(urlInfo, html);

                    // Read the delegate once so it cannot become null between
                    // the check and the call.
                    var dataReceived = this.DataReceivedEvent;
                    if (dataReceived != null)
                    {
                        dataReceived(
                            new DataReceivedEventArgs
                            {
                                Url = urlInfo.UrlString,
                                Depth = urlInfo.Depth,
                                Html = html
                            });
                    }
                }
            }
        }
        catch (Exception exception)
        {
            // Best effort: a failure on one URL is reported to subscribers (if any)
            // and the worker moves on to the next URL.
            var crawlError = this.CrawlErrorEvent;
            if (crawlError != null)
            {
                crawlError(
                    new CrawlErrorEventArgs
                    {
                        Url = urlInfo.UrlString,
                        Exception = exception
                    });
            }
        }
        finally
        {
            if (request != null)
            {
                request.Abort();
            }

            if (response != null)
            {
                response.Close();
            }
        }
    }
}