/// <summary>
/// Extracts the anchor links from a downloaded page, filters them against the
/// crawler settings and enqueues the accepted ones one level deeper than the page.
/// </summary>
/// <param name="urlInfo">
/// The page the HTML belongs to; supplies the base URL and the current depth.
/// </param>
/// <param name="html">
/// The HTML source of the page.
/// </param>
private void ParseLinks(UrlInfo urlInfo, string html)
{
    // A positive Settings.Depth acts as a depth limit: pages at or beyond it are not expanded.
    if (this.Settings.Depth > 0 && urlInfo.Depth >= this.Settings.Depth)
    {
        return;
    }

    // href -> link text. A repeated href keeps the text of its last occurrence.
    var urlDictionary = new Dictionary<string, string>();

    Match match = Regex.Match(html, "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>");
    while (match.Success)
    {
        // The href is the key.
        string urlKey = match.Groups[1].Value;

        // The anchor content with nested tags stripped is the value.
        string urlValue = Regex.Replace(match.Groups[2].Value, "(?i)<.*?>", string.Empty);

        urlDictionary[urlKey] = urlValue;
        match = match.NextMatch();
    }

    // Removes the first label of a host name ("www.pzcast.com" -> "pzcast.com").
    // A single-label host (e.g. "localhost") is returned unchanged; the previous
    // Skip(1).Aggregate(...) threw InvalidOperationException for such hosts.
    Func<string, string> stripFirstLabel = host =>
    {
        string[] labels = host.Split('.');
        return labels.Length > 1
            ? string.Join(".", labels, 1, labels.Length - 1)
            : host;
    };

    foreach (var item in urlDictionary)
    {
        string href = item.Key;
        string text = item.Value;

        if (string.IsNullOrEmpty(href))
        {
            continue;
        }

        // Skip links that end with one of the configured suffixes.
        if (this.Settings.EscapeLinks != null
            && this.Settings.EscapeLinks.Count > 0
            && this.Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        // When keywords are configured, the href must contain at least one of them.
        if (this.Settings.HrefKeywords != null
            && this.Settings.HrefKeywords.Count > 0
            && !this.Settings.HrefKeywords.Any(href.Contains))
        {
            continue;
        }

        // Undo the escapes commonly found inside href attributes. The last step decodes
        // the HTML entity for '&'; it used to replace "&" with "&", which did nothing.
        string url = href.Replace("%3f", "?")
            .Replace("%3d", "=")
            .Replace("%2f", "/")
            .Replace("&amp;", "&");

        // Fragments, mail links and script links are not crawlable.
        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#", StringComparison.Ordinal)
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // As before, this throws if the page's own URL is not a valid absolute URI.
        var baseUri = new Uri(urlInfo.UrlString);

        // TryCreate instead of the constructors: a malformed href only skips this link
        // rather than aborting the whole page with a UriFormatException.
        Uri currentUri;
        bool resolved = url.StartsWith("http", StringComparison.OrdinalIgnoreCase)
            ? Uri.TryCreate(url, UriKind.Absolute, out currentUri)
            : Uri.TryCreate(baseUri, url, out currentUri);
        if (!resolved)
        {
            continue;
        }

        url = currentUri.AbsoluteUri;

        // Compare the hosts with their first label (the sub-domain) removed; equal
        // remainders count as the same site, e.g. mail.pzcast.com and www.pzcast.com.
        if (this.Settings.LockHost
            && stripFirstLabel(baseUri.Host) != stripFirstLabel(currentUri.Host))
        {
            continue;
        }

        if (!this.IsMatchRegular(url))
        {
            continue;
        }

        var addUrlEventArgs = new AddUrlEventArgs { Title = text, Depth = urlInfo.Depth + 1, Url = url };

        // A subscriber returning false vetoes the URL.
        if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs))
        {
            continue;
        }

        UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = urlInfo.Depth + 1 });
    }
}
/// <summary>
/// Extracts the links from a downloaded page, filters them against the crawler
/// settings and enqueues the accepted ones one level deeper than the page.
/// </summary>
/// <param name="urlInfo">
/// The page the HTML belongs to; supplies the base URL and the current depth.
/// </param>
/// <param name="html">
/// The HTML source of the page.
/// </param>
private void ParseLinks(UrlInfo urlInfo, string html)
{
    // A positive Settings.Depth acts as a depth limit: pages at or beyond it are not expanded.
    if (this.Settings.Depth > 0 && urlInfo.Depth >= this.Settings.Depth)
    {
        return;
    }

    // href -> link text. A repeated href keeps the text of its last occurrence.
    var urlDictionary = new Dictionary<string, string>();

    // Settings.UrlRegex, when set, replaces the built-in anchor pattern. Group 1 is read
    // as the href and group 2 as the link text.
    string urlRegex = string.IsNullOrEmpty(Settings.UrlRegex)
        ? "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>"
        : Settings.UrlRegex;

    Match match = Regex.Match(html, urlRegex);
    while (match.Success)
    {
        // The href is the key.
        string urlKey = match.Groups[1].Value;

        // The link text with nested tags stripped is the value.
        string urlValue = Regex.Replace(match.Groups[2].Value, "(?i)<.*?>", string.Empty);

        urlDictionary[urlKey] = urlValue;
        match = match.NextMatch();
    }

    // Removes the first label of a host name ("www.pzcast.com" -> "pzcast.com").
    // A single-label host (e.g. "localhost") is returned unchanged; the previous
    // Skip(1).Aggregate(...) threw InvalidOperationException for such hosts.
    Func<string, string> stripFirstLabel = host =>
    {
        string[] labels = host.Split('.');
        return labels.Length > 1
            ? string.Join(".", labels, 1, labels.Length - 1)
            : host;
    };

    foreach (var item in urlDictionary)
    {
        string href = item.Key;
        string text = item.Value;

        if (string.IsNullOrEmpty(href))
        {
            continue;
        }

        // Skip links that end with one of the configured suffixes.
        if (this.Settings.EscapeLinks != null
            && this.Settings.EscapeLinks.Count > 0
            && this.Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        // When keywords are configured, the href must contain at least one of them.
        if (this.Settings.HrefKeywords != null
            && this.Settings.HrefKeywords.Count > 0
            && !this.Settings.HrefKeywords.Any(href.Contains))
        {
            continue;
        }

        // Undo the escapes commonly found inside href attributes. The last step decodes
        // the HTML entity for '&'; it used to replace "&" with "&", which did nothing.
        string url = href.Replace("%3f", "?")
            .Replace("%3d", "=")
            .Replace("%2f", "/")
            .Replace("&amp;", "&");

        // Fragments, mail links and script links are not crawlable.
        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#", StringComparison.Ordinal)
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // As before, this throws if the page's own URL is not a valid absolute URI.
        var baseUri = new Uri(urlInfo.UrlString);

        // TryCreate instead of the constructors: a malformed href only skips this link
        // rather than aborting the whole page with a UriFormatException.
        Uri currentUri;
        bool resolved = url.StartsWith("http", StringComparison.OrdinalIgnoreCase)
            ? Uri.TryCreate(url, UriKind.Absolute, out currentUri)
            : Uri.TryCreate(baseUri, url, out currentUri);
        if (!resolved)
        {
            continue;
        }

        url = currentUri.AbsoluteUri;

        // Compare the hosts with their first label (the sub-domain) removed; equal
        // remainders count as the same site, e.g. mail.pzcast.com and www.pzcast.com.
        if (this.Settings.LockHost
            && stripFirstLabel(baseUri.Host) != stripFirstLabel(currentUri.Host))
        {
            continue;
        }

        if (!this.IsMatchRegular(url))
        {
            continue;
        }

        var addUrlEventArgs = new AddUrlEventArgs { Title = text, Depth = urlInfo.Depth + 1, Url = url };

        // A subscriber returning false vetoes the URL.
        if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs))
        {
            continue;
        }

        UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = urlInfo.Depth + 1 });
    }
}
/// <summary>
/// Records the URL carried by <paramref name="args"/> in <c>filter</c> the first time
/// it is seen and reports whether it was new.
/// </summary>
/// <param name="args">
/// The event arguments; only <c>Url</c> is read.
/// </param>
/// <returns>
/// <c>true</c> when the URL was not yet in <c>filter</c> (it is added now);
/// <c>false</c> when it was already there.
/// </returns>
private bool MasterAddUrlEvent(AddUrlEventArgs args)
{
    bool alreadySeen = filter.Contains(args.Url);
    if (alreadySeen)
    {
        // Returning false means: do not add this URL to the queue.
        return false;
    }

    filter.Add(args.Url);
    return true;
}
/// <summary>
/// Extracts the anchor links from a downloaded page, filters them against the
/// crawler settings and enqueues the accepted ones one level deeper than the page.
/// </summary>
/// <param name="urlInfo">
/// The page the HTML belongs to; supplies the base URL and the current depth.
/// </param>
/// <param name="html">
/// The HTML source of the page.
/// </param>
private void ParseLinks(UrlInfo urlInfo, string html)
{
    // A positive Settings.Depth acts as a depth limit: pages at or beyond it are not expanded.
    if (this.Settings.Depth > 0 && urlInfo.Depth >= this.Settings.Depth)
    {
        return;
    }

    // href -> link text. A repeated href keeps the text of its last occurrence.
    var urlDictionary = new Dictionary<string, string>();

    // Single quotes are turned into double quotes first so that href='...' matches too.
    // This rewrites every apostrophe in the page, including those in the link text.
    // The content part is [^<]*, so only anchors without nested tags are matched.
    Match match = Regex.Match(html.Replace("'","\""), "(?i)<a .*?href=[\",']([^\"]+)[\",'][^>]*>[^<]*</a>");
    while (match.Success)
    {
        // The href is the key.
        string urlKey = match.Groups[1].Value;

        // The whole matched anchor with its tags stripped is the value.
        string urlValue = Regex.Replace(match.Groups[0].Value, "(?i)<.*?>", string.Empty);

        urlDictionary[urlKey] = urlValue;
        match = match.NextMatch();
    }

    // Removes the first label of a host name ("www.pzcast.com" -> "pzcast.com").
    // A single-label host (e.g. "localhost") is returned unchanged; the previous
    // Skip(1).Aggregate(...) threw InvalidOperationException for such hosts.
    Func<string, string> stripFirstLabel = host =>
    {
        string[] labels = host.Split('.');
        return labels.Length > 1
            ? string.Join(".", labels, 1, labels.Length - 1)
            : host;
    };

    foreach (var item in urlDictionary)
    {
        string href = item.Key;
        string text = item.Value;

        if (string.IsNullOrEmpty(href))
        {
            continue;
        }

        // Skip links that end with one of the configured suffixes.
        if (this.Settings.EscapeLinks != null
            && this.Settings.EscapeLinks.Count > 0
            && this.Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        // When keywords are configured, the href must contain at least one of them.
        if (this.Settings.HrefKeywords != null
            && this.Settings.HrefKeywords.Count > 0
            && !this.Settings.HrefKeywords.Any(href.Contains))
        {
            continue;
        }

        // Undo the escapes commonly found inside href attributes. The last step decodes
        // the HTML entity for '&'; it used to replace "&" with "&", which did nothing.
        string url = href.Replace("%3f", "?")
            .Replace("%3d", "=")
            .Replace("%2f", "/")
            .Replace("&amp;", "&");

        // Fragments, mail links and script links are not crawlable.
        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#", StringComparison.Ordinal)
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // As before, this throws if the page's own URL is not a valid absolute URI.
        var baseUri = new Uri(urlInfo.UrlString);

        // TryCreate instead of the constructors: a malformed href only skips this link
        // rather than aborting the whole page with a UriFormatException.
        Uri currentUri;
        bool resolved = url.StartsWith("http", StringComparison.OrdinalIgnoreCase)
            ? Uri.TryCreate(url, UriKind.Absolute, out currentUri)
            : Uri.TryCreate(baseUri, url, out currentUri);
        if (!resolved)
        {
            continue;
        }

        url = currentUri.AbsoluteUri;

        // Compare the hosts with their first label (the sub-domain) removed; equal
        // remainders count as the same site, e.g. mail.pzcast.com and www.pzcast.com.
        if (this.Settings.LockHost
            && stripFirstLabel(baseUri.Host) != stripFirstLabel(currentUri.Host))
        {
            continue;
        }

        if (!this.IsMatchRegular(url))
        {
            continue;
        }

        var addUrlEventArgs = new AddUrlEventArgs { Title = text, Depth = urlInfo.Depth + 1, Url = url };

        // A subscriber returning false vetoes the URL.
        if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs))
        {
            continue;
        }

        UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = urlInfo.Depth + 1 });
    }
}
/// <summary>
/// Extracts the anchor links from a downloaded page, filters them against the
/// crawler settings and enqueues the accepted ones one level deeper than the page.
/// Links that cannot be resolved to an absolute URI are skipped.
/// </summary>
/// <param name="urlInfo">
/// The page the HTML belongs to; supplies the base URL and the current depth.
/// </param>
/// <param name="html">
/// The HTML source of the page.
/// </param>
private void ParseLinks(UrlInfo urlInfo, string html)
{
    // A positive Settings.Depth acts as a depth limit: pages at or beyond it are not expanded.
    if (this.Settings.Depth > 0 && urlInfo.Depth >= this.Settings.Depth)
    {
        return;
    }

    // href -> link text. A repeated href keeps the text of its last occurrence.
    var urlDictionary = new Dictionary<string, string>();

    // Single quotes are turned into double quotes first so that href='...' matches too.
    // This rewrites every apostrophe in the page, including those in the link text.
    // 2016-05-24: the content part is ".*?" (previously "[^<]*") so that anchors wrapping
    // nested tags, e.g. <a href="xxx"><span>123</span></a>, are matched as well.
    Match match = Regex.Match(html.Replace("'", "\""), "(?i)<a .*?href=[\",']([^\"]+)[\",'][^>]*>" + @".*?</a>");
    while (match.Success)
    {
        // The href is the key.
        string urlKey = match.Groups[1].Value;

        // The whole matched anchor with its tags stripped is the value.
        string urlValue = Regex.Replace(match.Groups[0].Value, "(?i)<.*?>", string.Empty);

        urlDictionary[urlKey] = urlValue;
        match = match.NextMatch();
    }

    // The base URL is the same for every link, so it is parsed once. If it is unusable
    // no link can be resolved; the previous code skipped every link in that case.
    Uri baseUri;
    if (!Uri.TryCreate(urlInfo.UrlString, UriKind.Absolute, out baseUri))
    {
        return;
    }

    // Removes the first label of a host name ("www.pzcast.com" -> "pzcast.com").
    // A single-label host (e.g. "localhost") is returned unchanged; the previous
    // Skip(1).Aggregate(...) threw InvalidOperationException for such hosts.
    Func<string, string> stripFirstLabel = host =>
    {
        string[] labels = host.Split('.');
        return labels.Length > 1
            ? string.Join(".", labels, 1, labels.Length - 1)
            : host;
    };

    foreach (var item in urlDictionary)
    {
        string href = item.Key;
        string text = item.Value;

        if (string.IsNullOrEmpty(href))
        {
            continue;
        }

        // Skip links that end with one of the configured suffixes.
        if (this.Settings.EscapeLinks != null
            && this.Settings.EscapeLinks.Count > 0
            && this.Settings.EscapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        // When keywords are configured, the href must contain at least one of them.
        if (this.Settings.HrefKeywords != null
            && this.Settings.HrefKeywords.Count > 0
            && !this.Settings.HrefKeywords.Any(href.Contains))
        {
            continue;
        }

        // Undo the escapes commonly found inside href attributes. The last step decodes
        // the HTML entity for '&'; it used to replace "&" with "&", which did nothing.
        string url = href.Replace("%3f", "?")
            .Replace("%3d", "=")
            .Replace("%2f", "/")
            .Replace("&amp;", "&");

        // Fragments, mail links and script links are not crawlable.
        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#", StringComparison.Ordinal)
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // TryCreate replaces the former try/catch (Exception): a link that cannot be
        // parsed is skipped without using exceptions as control flow.
        Uri currentUri;
        bool resolved = url.StartsWith("http", StringComparison.OrdinalIgnoreCase)
            ? Uri.TryCreate(url, UriKind.Absolute, out currentUri)
            : Uri.TryCreate(baseUri, url, out currentUri);
        if (!resolved)
        {
            continue;
        }

        url = currentUri.AbsoluteUri;

        // Compare the hosts with their first label (the sub-domain) removed; equal
        // remainders count as the same site, e.g. mail.pzcast.com and www.pzcast.com.
        if (this.Settings.LockHost
            && stripFirstLabel(baseUri.Host) != stripFirstLabel(currentUri.Host))
        {
            continue;
        }

        if (!this.IsMatchRegular(url))
        {
            continue;
        }

        var addUrlEventArgs = new AddUrlEventArgs { Title = text, Depth = urlInfo.Depth + 1, Url = url };

        // A subscriber returning false vetoes the URL.
        if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs))
        {
            continue;
        }

        UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = urlInfo.Depth + 1 });
    }
}