/// <summary>
/// Processes a downloaded web page: every <c>href</c> value that contains an
/// <c>http://</c> or <c>https://</c> URL is pushed onto the shared URL queue, and the
/// page title (when present) is printed together with the URL of this page.
/// </summary>
/// <param name="html">Raw HTML of the page. Null or empty input is ignored.</param>
protected override void Process(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    try
    {
        // The quoted forms are listed first on purpose: .NET alternation is ordered, so
        // putting the unquoted "href=...>" form first would make it win every time and
        // capture everything up to the next '>' (quotes and other attributes included).
        Regex linkPattern = new Regex(
            @"href=""(?<web_url>[\s\S]*?)""|href='(?<web_url>[\s\S]*?)'|href=(?<web_url>[\s\S]*?)>",
            RegexOptions.IgnoreCase);

        foreach (Match link in linkPattern.Matches(html))
        {
            string url = this.RemoveQuotation(link.Groups["web_url"].Value);
            if (string.IsNullOrEmpty(url))
            {
                continue;
            }

            // Ordinal comparison: URL schemes are not linguistic text.
            bool isAbsoluteHttp =
                url.IndexOf("http://", StringComparison.OrdinalIgnoreCase) != -1 ||
                url.IndexOf("https://", StringComparison.OrdinalIgnoreCase) != -1;
            if (isAbsoluteHttp)
            {
                UrlQueue.GetInstance().Enqueue(url);
            }
        }

        // Match case-insensitively instead of lower-casing the whole document, so the
        // printed title keeps its original casing.
        Match titleMatch = Regex.Match(
            html,
            @"<title[\s\S]*?>(?<title>[\s\S]*?)</title>",
            RegexOptions.IgnoreCase);
        string title = titleMatch.Groups["title"].Value;
        if (!string.IsNullOrEmpty(title))
        {
            Console.WriteLine("网页标题:{0}", title);
            Console.WriteLine("网页URL:{0}", this.Url);
        }
    }
    catch (Exception ex)
    {
        // Processing stays best-effort (one bad page must not stop the crawl), but the
        // failure is reported instead of being silently discarded.
        Console.Error.WriteLine("Failed to process page {0}: {1}", this.Url, ex.Message);
    }
}
/// <summary>
/// Wipes the crawler's stored state on command: writes a <c>CrawlrQueueSize(0, 0)</c>
/// record to the status table (the XML and URL queue sizes), empties the XML and URL
/// queues, and deletes the URL table and the error table.
/// </summary>
/// <remarks>
/// The URL and error tables are deleted, not emptied, so they must be re-created
/// before the crawler can run again (the worker role is expected to do that).
/// </remarks>
public void ClearAll()
{
    // Record both queue sizes as zero in the status table.
    var zeroedSizes = new CrawlrQueueSize(0, 0);
    TableOperation resetSizes = TableOperation.InsertOrReplace(zeroedSizes);
    StatusTable.Execute(resetSizes);

    // Drop everything still waiting in the two queues.
    XmlQueue.Clear();
    UrlQueue.Clear();

    // Remove the tables themselves.
    UrlTable.Delete();
    ErrorTable.Delete();
}
/// <summary>
/// Private constructor: puts every field of the spider into its initial,
/// not-yet-started state.
/// </summary>
private SpiderBase()
{
    // Queues for URLs and for fetched content.
    urlQueue = new UrlQueue();
    contentQueue = new ContentQueue();

    // Thread lists start empty.
    requestThreads = new List<Thread>();
    processThreads = new List<Thread>();

    // Run-state flags.
    runStatus = SpiderRunStatus.UnStarted;
    pauseCalled = false;
    completed = false;

    // Timing information: zero elapsed time, no start time recorded yet.
    startTimespan = TimeSpan.Zero;
    startTime = DateTime.MinValue;
    runtime = new SpiderRuntime();
}
/// <summary>
/// Console entry point: reads the window title and the seed URL from the application
/// settings, enqueues the seed URL, starts the thread manager, and then blocks until
/// Enter is pressed.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        var settings = System.Configuration.ConfigurationManager.AppSettings;

        // AppSettings yields null for a missing key; a missing title is not fatal.
        string title = settings["Title"];
        if (!string.IsNullOrEmpty(title))
        {
            Console.Title = title;
        }

        Console.WriteLine("Process is running!");

        string url = settings["URL"];
        if (string.IsNullOrEmpty(url))
        {
            // Without a seed URL there is nothing to crawl; say so instead of failing silently.
            Console.Error.WriteLine("No seed URL configured: add a \"URL\" key to appSettings.");
        }
        else
        {
            UrlQueue.GetInstance().Enqueue(url);
            ThreadManager thread = new ThreadManager();
            thread.Start();
        }
    }
    catch (Exception ex)
    {
        // Report startup failures; previously they were swallowed and the console
        // just sat waiting with no indication that anything went wrong.
        Console.Error.WriteLine("Startup failed: {0}", ex);
    }

    // Keep the console window open.
    Console.ReadLine();
}
/// <summary>
/// Fetches one page, records it in the done queue, runs the special-URL matcher over
/// every link found on it, and queues the remaining new links for crawling one level
/// deeper.
/// </summary>
/// <param name="urlGroup">The page to fetch; skipped when the done queue already contains it.</param>
/// <param name="depath">
/// Depth of <paramref name="urlGroup"/>. Links are queued with depth <c>depath + 1</c>,
/// and nothing is queued once that would exceed <c>MaxSearchDepth</c>.
/// </param>
public void HandUrl(UrlGroup urlGroup, int depath)
{
    if (DoneQueue.Contains(urlGroup))
    {
        return;
    }

    string html = HttpHelper.GetString(urlGroup.Url, Encoding.Default);
    lock (DoneQueueLock)
    {
        DoneQueue.Enqueue(urlGroup);
    }

    // 1. Collect all URLs found on the page.
    List<string> listAllUrl = MatchDomainURL(html);

    // 2. Filter: drop js / image URLs and URLs that are already known.
    //    (Off-site filtering is presumably done by MatchDomainURL - confirm.)
    string[] extArray = new string[] { ".jpg", ".png", ".gif", ".js" };
    List<string> mlstUrl = new List<string>();
    HashSet<string> seenOnPage = new HashSet<string>();
    foreach (string url in listAllUrl)
    {
        bool isfile = false;
        foreach (string ext in extArray)
        {
            // Ordinal, case-insensitive test; ToLower() is culture-sensitive and
            // mis-handles e.g. ".GIF" under Turkish casing rules.
            if (url.IndexOf(ext, System.StringComparison.OrdinalIgnoreCase) >= 0)
            {
                isfile = true;
                break;
            }
        }

        // Match special URLs. This runs for every link, whether or not it is queued below.
        SepcialMatch(url);

        // NOTE(review): DoneQueue and UrlQueue are filled with UrlGroup items (see the
        // Enqueue calls in this method) but are probed here with a string; whether these
        // lookups can ever succeed depends on UrlGroup's equality - confirm.
        if (!isfile && !DoneQueue.Contains(url) && !UrlQueue.Contains(url))
        {
            // A link repeated on the same page is queued only once.
            if (seenOnPage.Add(url))
            {
                mlstUrl.Add(url);
            }
        }
    }

    if (depath + 1 > MaxSearchDepth)
    {
        return; // Maximum search depth reached.
    }

    lock (UrlQueueLock)
    {
        foreach (string url in mlstUrl)
        {
            UrlQueue.Enqueue(new UrlGroup() { Url = url, DepathNumber = depath + 1 });

            // Throttle the producer as the backlog grows. The larger threshold has to
            // be tested first; otherwise the 10 s branch can never be reached.
            if (UrlQueue.Count > 50000)
            {
                Thread.Sleep(10000);
            }
            else if (UrlQueue.Count > 10000)
            {
                Thread.Sleep(3000);
            }
        }
    }
}
/// <summary>
/// Starts a crawl. The start page is handled synchronously on the calling thread, then
/// two thread-pool work items are queued: one drains <c>UrlQueue</c> through
/// <c>HandUrl</c>, the other drains <c>SpecUrlQueue</c> through <c>HandSpecialUrl</c>.
/// </summary>
/// <param name="url">
/// Start URL. When non-empty it replaces <c>BaseURL</c>; when null or empty the current
/// <c>BaseURL</c> is used, and the call does nothing if that is empty as well.
/// </param>
/// <remarks>
/// Exceptions thrown inside a worker are not caught here; they propagate out of the
/// work item with their original stack trace.
/// </remarks>
public void Search(string url)
{
    if (string.IsNullOrEmpty(url) && string.IsNullOrEmpty(BaseURL))
    {
        return;
    }

    if (!string.IsNullOrEmpty(url))
    {
        BaseURL = url;
    }

    // Handle the start page first, before any worker is queued.
    HandUrl(new UrlGroup() { Url = BaseURL, DepathNumber = 0 }, 0);
    Thread.Sleep(1000);

    // URL worker. The loop that would have started MaxThreadCount of these is disabled
    // in the original code, so exactly one is started.
    ThreadPool.QueueUserWorkItem(x =>
    {
        while (true)
        {
            if (IsStop)
            {
                break;
            }

            if (IsAllDone)
            {
                break;
            }

            if (IsPause)
            {
                // Wait instead of spinning a core at 100% while paused.
                Thread.Sleep(1000);
                continue;
            }

            if (UrlQueue.Count == 0 && SpecUrlQueue.Count == 0)
            {
                break;
            }

            if (UrlQueue.Count == 0)
            {
                // Nothing to do yet: wait, then re-evaluate every condition. Falling
                // through to Dequeue() here would throw if the queue were still empty.
                Thread.Sleep(10000);
                continue;
            }

            UrlGroup urlGroup = (UrlGroup)UrlQueue.Dequeue();
            HandUrl(urlGroup, urlGroup.DepathNumber);
            Thread.Sleep(1000);
        }
    });

    // Special-URL worker.
    ThreadPool.QueueUserWorkItem(x =>
    {
        while (true)
        {
            if (IsStop)
            {
                break;
            }

            if (IsPause)
            {
                // Wait instead of spinning a core at 100% while paused.
                Thread.Sleep(1000);
                continue;
            }

            if (IsAllDone && SpecUrlQueue.Count == 0)
            {
                break;
            }

            if (SpecUrlQueue.Count == 0)
            {
                // Same as above: wait and re-check rather than dequeue from an empty queue.
                Thread.Sleep(10000);
                continue;
            }

            HandSpecialUrl(SpecUrlQueue.Dequeue().ToString());
            Thread.Sleep(1000);
        }
    });
}