/// <summary>
/// Decides whether a discovered URL should be queued, rejecting URLs already recorded in <c>filter</c>.
/// </summary>
/// <param name="args">Event data carrying the candidate URL.</param>
/// <returns>
/// <c>true</c> if the URL was not yet in <c>filter</c> (it is recorded before returning);
/// <c>false</c> if it was already present, which means it is not added to the queue.
/// </returns>
private static bool MasterAddUrlEvent(AddUrlEventArgs args)
{
    // Already recorded: tell the caller not to queue it.
    if (filter.Contains(args.Url))
    {
        return false;
    }

    // First sighting: remember it and accept.
    filter.Add(args.Url);
    return true;
}
/// <summary>
/// Decides whether a discovered URL should be queued, and echoes every newly accepted URL to the console.
/// </summary>
/// <param name="args">
/// Event data carrying the candidate URL.
/// </param>
/// <returns>
/// <c>true</c> if the URL was not yet in <c>filter</c> (it is recorded and printed);
/// <c>false</c> if it was already present, which means it is not added to the queue.
/// </returns>
private static bool MasterAddUrlEvent(AddUrlEventArgs args)
{
    if (filter.Contains(args.Url))
    {
        // Returning false means: do not add to the queue.
        return false;
    }

    filter.Add(args.Url);
    Console.WriteLine(args.Url);
    return true;
}
/// <summary>
/// Handles an add-URL notification: registers the URL in <c>urlPool</c>, queues it one level deeper
/// when the pool reports it as not already present, and then forwards the notification to
/// <c>AddUrlEventHandler</c> if one is attached.
/// </summary>
/// <param name="sender">The object that raised the notification; passed through unchanged.</param>
/// <param name="args">Event data carrying the URL and the depth at which it was found.</param>
private void OnAddUrlEventHandler(object sender, AddUrlEventArgs args)
{
    // NOTE(review): the result of urlPool.Add is interpreted as "the URL was already in the pool".
    // If urlPool behaves like HashSet<T> (Add returns true on insertion) this test is inverted — confirm.
    bool alreadyPresent = urlPool.Add(args.Url);
    if (alreadyPresent == false)
    {
        int nextDepth = args.Depth + 1;
        urlQueue.EnQueue(new UrlInfo(args.Url, nextDepth));
    }

    // Forward to the outer handler regardless of whether the URL was queued.
    if (AddUrlEventHandler == null)
    {
        return;
    }

    AddUrlEventHandler(sender, args);
}
/// <summary>
/// Decides whether a discovered URL should be queued: the crawler's own <c>CanAddUrl</c> rule is
/// consulted first, then URLs already recorded in <c>filter</c> are rejected.
/// </summary>
/// <param name="args">
/// Event data carrying the candidate URL.
/// </param>
/// <returns>
/// <c>true</c> if the URL passed both checks (it is recorded in <c>filter</c> and printed);
/// <c>false</c> otherwise, which means it is not added to the queue.
/// </returns>
private static bool MasterAddUrlEvent(AddUrlEventArgs args)
{
    // Short-circuit keeps the original order: the filter is only consulted
    // for URLs the crawler itself is willing to accept.
    bool rejected = !simpleCrawler.CanAddUrl(args) || filter.Contains(args.Url);
    if (rejected)
    {
        return false;
    }

    filter.Add(args.Url);
    Console.WriteLine(args.Url);
    return true;
}
/// <summary>
/// Checks whether a discovered URL is excluded or a duplicate before it is queued.
/// </summary>
/// <param name="args">Event data carrying the candidate URL.</param>
/// <returns>
/// <c>false</c> if the URL contains any non-blank entry of <c>UrlDebar</c> or is already in
/// <c>filter</c> (meaning it is not added to the queue); otherwise the URL is recorded in
/// <c>filter</c> and <c>true</c> is returned.
/// </returns>
private static bool MasterAddUrlEvent(AddUrlEventArgs args)
{
    foreach (string excluded in UrlDebar)
    {
        // Blank entries would match every URL, so they are ignored.
        if (string.IsNullOrWhiteSpace(excluded))
        {
            continue;
        }

        if (args.Url.Contains(excluded))
        {
            // Returning false means: do not add to the queue.
            return false;
        }
    }

    if (filter.Contains(args.Url))
    {
        return false;
    }

    filter.Add(args.Url);
    return true;
}
/// <summary>
/// URL screening: decides whether the given URL may be added.
/// </summary>
/// <param name="args">Event data carrying the candidate URL.</param>
/// <returns>
/// <c>false</c> when the URL's query string has a non-empty <c>building</c> value that is
/// contained in <c>BloomBuildingIds</c>; <c>true</c> in every other case.
/// </returns>
public bool CanAddUrl(AddUrlEventArgs args)
{
    // TODO: temporary check for duplicate "building" ids.
    var query = GetQueryString(args.Url);
    if (string.IsNullOrEmpty(query))
    {
        return true;
    }

    var parameters = HttpUtility.ParseQueryString(query);
    var rawBuilding = parameters["building"];
    var buildingId = rawBuilding != null ? rawBuilding.ToString() : string.Empty;
    if (string.IsNullOrEmpty(buildingId))
    {
        return true;
    }

    return !BloomBuildingIds.Contains(buildingId);
}
/// <summary>
/// URL screening: decides whether the given URL may be added. This implementation accepts every URL.
/// </summary>
/// <param name="args">Event data carrying the candidate URL; not inspected here.</param>
/// <returns>Always <c>true</c>.</returns>
public bool CanAddUrl(AddUrlEventArgs args) => true;
/// <summary>
/// Decides whether a discovered URL should be queued, printing each newly accepted URL.
/// </summary>
/// <param name="args">
/// Event data carrying the candidate URL.
/// </param>
/// <returns>
/// <c>true</c> if the URL was not yet in <c>filter</c>; <c>false</c> if it was,
/// which means it is not added to the queue.
/// </returns>
private static bool MasterAddUrlEvent(AddUrlEventArgs args)
{
    bool isNewUrl = !filter.Contains(args.Url);

    if (isNewUrl)
    {
        // Record first, then report on the console.
        filter.Add(args.Url);
        Console.WriteLine(args.Url);
    }

    return isNewUrl;
}
/// <summary>
/// Decides whether a discovered URL should be queued. Each newly accepted URL is recorded in
/// <c>filter</c>, printed to the console and appended as one line to the file at <c>urlFilePath</c>.
/// </summary>
/// <param name="args">
/// Event data carrying the candidate URL.
/// </param>
/// <returns>
/// <c>true</c> if the URL was not yet in <c>filter</c>; <c>false</c> if it was,
/// which means it is not added to the queue.
/// </returns>
private static bool MasterAddUrlEvent(AddUrlEventArgs args)
{
    // Known URL: nothing is recorded, printed or written.
    if (filter.Contains(args.Url))
    {
        return false;
    }

    // Unknown URL: add it to the filter, then log it to console and file.
    filter.Add(args.Url);
    Console.WriteLine(args.Url);
    File.AppendAllText(urlFilePath, args.Url + "\r\n");
    return true;
}
/// <summary>
/// Parses the anchor links of a downloaded page and queues those that pass every configured filter.
/// </summary>
/// <param name="currentUrl">The page the HTML came from; supplies the base address and the current depth.</param>
/// <param name="html">The HTML source of the page.</param>
private void ParseLinks(UrlInfo currentUrl, string html)
{
    // Stop once the configured depth is reached; the limit only applies when CrawlDepth is positive.
    if (_settings.CrawlDepth > 0 && currentUrl.Depth >= _settings.CrawlDepth)
    {
        return;
    }

    // Collect every link on the page: href is the key, the tag-stripped link text is the value.
    Dictionary<string, string> urls = new Dictionary<string, string>();
    Match match = Regex.Match(html, "(?i)<a .*?href=\"([^\"]+)\"[^>]*>(.*?)</a>");
    while (match.Success)
    {
        urls[match.Groups[1].Value] = Regex.Replace(match.Groups[2].Value, "(?i)<.*?>", "");
        match = match.NextMatch();
    }

    // Built on first use so a page without usable links never parses currentUrl.Url.
    Uri baseUri = null;

    foreach (var linknode in urls)
    {
        string href = linknode.Key;
        string linkText = linknode.Value;

        // Skip links whose href ends with one of the excluded suffixes.
        if (_escapeLinks != null
            && _escapeLinks.Any(suffix => href.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)))
        {
            continue;
        }

        // When keywords are configured, the link text must contain at least one of them.
        if (_keywords != null && !_keywords.Any(linkText.Contains))
        {
            continue;
        }

        // Undo the escapes commonly found in hrefs. The last replacement decodes the HTML
        // entity for '&'; the previous form replaced "&" with itself and therefore did nothing.
        string url = href
            .Replace("%3f", "?")
            .Replace("%3d", "=")
            .Replace("%2f", "/")
            .Replace("&amp;", "&");

        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#", StringComparison.Ordinal)
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (baseUri == null)
        {
            baseUri = new Uri(currentUrl.Url);
        }

        // Resolves absolute and relative hrefs alike; a malformed href is skipped
        // instead of aborting the whole page with a UriFormatException.
        Uri thisUri;
        if (!Uri.TryCreate(baseUri, url, out thisUri))
        {
            continue;
        }

        url = thisUri.AbsoluteUri;

        if (_settings.LockHost)
        {
            // For news.baidu.com and www.baidu.com: if the hosts are equal once the
            // first label is removed, they are treated as the same site.
            if (!string.Equals(
                    StripFirstHostLabel(baseUri.Host),
                    StripFirstHostLabel(thisUri.Host),
                    StringComparison.Ordinal))
            {
                continue;
            }
        }

        if (!IsUrlMatchRegex(url))
        {
            continue;
        }

        AddUrlEventArgs args = new AddUrlEventArgs
        {
            Title = linkText,
            Depth = currentUrl.Depth + 1,
            Url = url
        };

        // A subscriber returning false vetoes the URL.
        if (AddUrlEvent != null && !AddUrlEvent(args))
        {
            continue;
        }

        // NOTE(review): lock (this) is kept because other members may synchronise on the same
        // monitor; a private lock object would be safer if that is not the case — confirm.
        lock (this)
        {
            _urlQueue.Enqueue(new UrlInfo
            {
                Url = url,
                Depth = currentUrl.Depth + 1
            });
        }
    }
}

/// <summary>
/// Returns <paramref name="host"/> without its first dot-separated label
/// ("news.baidu.com" becomes "baidu.com"); a host without dots yields an empty string.
/// </summary>
private static string StripFirstHostLabel(string host)
{
    // string.Join copes with an empty sequence, unlike a seedless Aggregate, which throws.
    return string.Join(".", host.Split('.').Skip(1).ToArray());
}
/// <summary>
/// Decides whether a discovered URL should be queued by handing it to <c>_linkStorage.TryAdd</c>.
/// </summary>
/// <param name="args">Event data carrying the candidate URL.</param>
/// <returns>
/// Whatever <c>_linkStorage.TryAdd</c> returns for the URL
/// (presumably <c>true</c> only when the URL was not stored before — confirm against the storage type).
/// </returns>
private bool MasterAddUrlEvent(AddUrlEventArgs args)
{
    bool accepted = _linkStorage.TryAdd(args.Url);
    return accepted;
}
/// <summary>
/// Decides whether a discovered URL should be queued: URLs containing any entry of
/// <c>urlFilterKeyWord</c> are rejected first, then URLs already recorded in <c>filter</c>.
/// </summary>
/// <param name="args">
/// Event data carrying the candidate URL.
/// </param>
/// <returns>
/// <c>true</c> if the URL passed both checks (it is recorded in <c>filter</c> and printed);
/// <c>false</c> otherwise, which means it is not added to the queue.
/// </returns>
private static bool MasterAddUrlEvent(AddUrlEventArgs args)
{
    // URL filtering by keyword.
    bool containsBlockedKeyword = urlFilterKeyWord.Any(keyword => args.Url.Contains(keyword));
    if (containsBlockedKeyword)
    {
        return false;
    }

    bool isNewUrl = !filter.Contains(args.Url);
    if (isNewUrl)
    {
        filter.Add(args.Url);
        Console.WriteLine(args.Url);
    }

    return isNewUrl;
}
/// <summary>
/// Worker loop of one crawl thread: takes URLs from <c>UrlQueue.Instance</c>, downloads them,
/// extracts further URLs from successful pages and reports every result through
/// <c>DataReceivedEvent</c>. The loop ends when the queue is empty and every entry of
/// <c>threadStatus</c> is <c>true</c>.
/// </summary>
/// <param name="threadIndex">
/// The boxed <see cref="int"/> index of this thread's slot in <c>threadStatus</c>.
/// </param>
private void CrawlProcess(object threadIndex)
{
    var currentThreadIndex = (int)threadIndex;

    while (true)
    {
        // Decide from the queue length and the idle flags whether this thread sleeps or exits.
        if (UrlQueue.Instance.Count == 0)
        {
            this.threadStatus[currentThreadIndex] = true;
            if (!this.threadStatus.Any(t => t == false))
            {
                break;
            }

            Thread.Sleep(2000);
            continue;
        }

        this.threadStatus[currentThreadIndex] = false;

        // Checked a second time, as in the original, before dequeuing.
        if (UrlQueue.Instance.Count == 0)
        {
            continue;
        }

        UrlInfo urlInfo = UrlQueue.Instance.DeQueue();
        if (urlInfo == null)
        {
            continue;
        }

        HttpHandle.HttpResult httpResult = HttpHandle.Get(urlInfo);
        if (httpResult == null)
        {
            continue;
        }

        string html = httpResult.Body;
        string code = httpResult.Code.ToString();

        if (urlInfo.Type && httpResult.Code == 200)
        {
            this.ParseLinks(urlInfo, html);

            Match mc = Regex.Match(
                urlInfo.UrlString,
                @"((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+(:[0-9]+)?|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)",
                RegexOptions.IgnoreCase);

            // Without a match there is nothing to append suffixes to; the unguarded
            // Substring on an empty string used to throw ArgumentOutOfRangeException here.
            if (mc.Success)
            {
                // The base URL is the same for every suffix, so it is prepared once:
                // the matched text with a single trailing slash removed.
                string baseUrl = mc.Value;
                if (baseUrl.EndsWith("/", StringComparison.Ordinal))
                {
                    baseUrl = baseUrl.Substring(0, baseUrl.Length - 1);
                }

                for (int i = 0; i < Program.UrlSuffix.Count; i++)
                {
                    string url1 = baseUrl + Program.UrlSuffix[i];

                    var addUrlEventArgs1 = new AddUrlEventArgs
                    {
                        Title = url1,
                        Depth = urlInfo.Depth + 1,
                        Url = url1
                    };

                    // A subscriber returning false vetoes this candidate.
                    if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs1))
                    {
                        continue;
                    }

                    // NOTE(review): the event args carry urlInfo.Depth + 1 but the queued entry
                    // gets Depth = 1 — confirm which value is intended.
                    UrlQueue.Instance.EnQueue(new UrlInfo(url1) { Depth = 1, Type = false });
                }
            }
        }
        else
        {
            // Extract the page title and strip quote characters from it.
            String regex = @"<title>.+</title>";
            String title = Regex.Match(html, regex).ToString();

            // NOTE(review): html is replaced by the cleaned title, so subscribers of
            // DataReceivedEvent receive only the title in this branch — confirm this is intended.
            html = Regex.Replace(title, @"[\""]+", "");

            // A title containing any of these markers is reported as a 404.
            string[] notFoundMarkers =
            {
                "404", "Page Not Found", "未找到", "不存在", "错误", "网站防火墙", "请联系空间提供商"
            };
            string titleText = html;
            if (notFoundMarkers.Any(marker => titleText.IndexOf(marker) > -1))
            {
                code = "404";
            }
        }

        if (this.DataReceivedEvent != null)
        {
            this.DataReceivedEvent(
                new DataReceivedEventArgs
                {
                    Url = urlInfo.UrlString,
                    Depth = urlInfo.Depth,
                    Html = html,
                    Code = code,
                    Type = urlInfo.Type
                });
        }
    }
}
/// <summary>
/// Scans the page text for URL-like substrings, keeps those that look like absolute
/// http(s) URLs and queues each one that the <c>AddUrlEvent</c> subscriber does not veto.
/// </summary>
/// <param name="urlInfo">
/// The page the HTML came from; supplies the base address and the current depth.
/// </param>
/// <param name="html">
/// The HTML source of the page.
/// </param>
private void ParseLinks(UrlInfo urlInfo, string html)
{
    // Validation pattern for accepted URLs; declared once instead of on every match.
    const string absoluteHttpUrlPattern = @"^http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?$";

    var urlDictionary = new Dictionary<string, string>();
    Match match = Regex.Match(html, @"((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+(:[0-9]+)?|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[\w]*))?)");
    while (match.Success)
    {
        // Key: the matched text. Value: the same text with any markup removed.
        string urlKey = match.Value;
        string urlValue = Regex.Replace(match.Value, "(?i)<.*?>", string.Empty);

        if (Regex.IsMatch(urlValue, absoluteHttpUrlPattern))
        {
            urlDictionary[urlKey] = urlValue;
        }

        match = match.NextMatch();
    }

    // Built on first use so a page without accepted URLs never parses urlInfo.UrlString.
    Uri baseUri = null;

    foreach (var item in urlDictionary)
    {
        string text = item.Value;

        // Undo the escapes commonly found in hrefs. The last replacement decodes the HTML
        // entity for '&'; the previous form replaced "&" with itself and therefore did nothing.
        string url = item.Key
            .Replace("%3f", "?")
            .Replace("%3d", "=")
            .Replace("%2f", "/")
            .Replace("&amp;", "&");

        if (string.IsNullOrEmpty(url)
            || url.StartsWith("#", StringComparison.Ordinal)
            || url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        if (baseUri == null)
        {
            baseUri = new Uri(urlInfo.UrlString);
        }

        // Resolves absolute and relative forms alike; text that passed the loose pattern
        // above but is not a valid URI is skipped instead of throwing UriFormatException.
        Uri currentUri;
        if (!Uri.TryCreate(baseUri, url, out currentUri))
        {
            continue;
        }

        url = currentUri.AbsoluteUri;

        var addUrlEventArgs = new AddUrlEventArgs
        {
            Title = text,
            Depth = urlInfo.Depth + 1,
            Url = url
        };

        // A subscriber returning false vetoes the URL.
        if (this.AddUrlEvent != null && !this.AddUrlEvent(addUrlEventArgs))
        {
            continue;
        }

        UrlQueue.Instance.EnQueue(new UrlInfo(url) { Depth = urlInfo.Depth + 1, Type = true });
    }
}