/// <summary>
/// Worker loop for a single crawler: repeatedly takes a URL from the downloader's
/// URL frontier and fetches it, idling briefly whenever no URL is available.
/// </summary>
/// <param name="data">The <c>CrawlerThread</c> this loop works for, passed as <c>object</c>.</param>
private static void DoWork(object data)
{
    CrawlerThread worker = (CrawlerThread)data;
    IQueueManager frontier = worker.m_downloader.UrlsQueueFrontier;

    for (;;)
    {
        // Blocks (without timeout) until the crawler's suspend event is signaled.
        worker.m_suspendEvent.WaitOne(Timeout.Infinite);

        // Nothing queued: idle for a while and check again.
        if (frontier.Count <= 0)
        {
            SleepWhenQueueIsEmpty(worker);
            continue;
        }

        try
        {
            // Take the next URL from the queue and fetch the page.
            string nextUrl = (string)frontier.Dequeue();
            Fetch(worker, nextUrl);

            // TODO: detect when the crawl is complete and leave the loop.
        }
        catch (InvalidOperationException)
        {
            // Presumably the queue became empty between the Count check and Dequeue
            // (e.g. another worker took the last URL); treat it like an empty queue.
            SleepWhenQueueIsEmpty(worker);
        }
    }
}
/// <summary>
/// Puts the crawler into the idle state and sleeps for a configured interval,
/// so that polling an empty queue does not hog the CPU.
/// </summary>
/// <param name="crawler">The crawler whose status is reset before sleeping.</param>
private static void SleepWhenQueueIsEmpty(CrawlerThread crawler)
{
    // Publish the idle state (no current URL) before going to sleep.
    crawler.Status = CrawlerStatusType.Idle;
    crawler.Url = string.Empty;
    crawler.Flush();

    var idleDelay = MemCache.ThreadSleepTimeWhenQueueIsEmptyMs;
    Thread.Sleep(idleDelay);
}
/// <summary>
/// Creates and starts <c>MemCache.ThreadCount</c> crawler threads, then marks the
/// downloader as running. Does nothing if the crawler collection already exists.
/// </summary>
public void Start()
{
    // Already started: nothing to do.
    if (m_crawlerThreads != null)
    {
        return;
    }

    m_crawlerThreads = new Collection<CrawlerThread>();

    int launched = 0;
    while (launched < MemCache.ThreadCount)
    {
        CrawlerThread worker = new CrawlerThread(this);

        // Subscribe before starting so no status change of the new crawler is missed.
        worker.StatusChanged += CrawlerStatusChanged;
        worker.Start();

        m_crawlerThreads.Add(worker);
        launched++;
    }

    this.Status = DownloaderStatusType.Running;
}
/// <summary>
/// foamliu, 2009/12/27.
/// Processes a single URL in three steps:
/// 1. Downloads the page.
/// 2. Saves the page (to the page repository).
/// 3. Extracts its links and adds them to the URL queue.
/// Failures are handled inside the method: expected I/O errors are ignored,
/// everything else is logged; no exception is propagated to the caller.
/// </summary>
/// <param name="crawler">The crawler performing the fetch; its Url, Status and MimeType are updated as work progresses.</param>
/// <param name="url">The absolute URL to download.</param>
private static void Fetch(CrawlerThread crawler, string url)
{
    // Newly discovered links are only enqueued while the frontier holds fewer URLs than this.
    const int MaxQueuedUrls = 1000;
    // Longer links are skipped as likely crawler traps.
    const int MaxLinkLength = 256;

    try
    {
        // Step 1: download the page.
        crawler.Url = url;
        crawler.Status = CrawlerStatusType.Fetch;
        crawler.Flush();

        NWebRequest req = new NWebRequest(new Uri(url), true);
        // Use a timeout so slow servers or oversized pages do not stall the crawler.
        req.Timeout = MemCache.ConnectionTimeoutMs;

        string contentType;
        byte[] buffer;
        NWebResponse response = req.GetResponse();
        try
        {
            contentType = crawler.MimeType = response.ContentType;

            // Skip content types that are neither HTML nor explicitly allowed.
            if (contentType != "text/html" && !MemCache.AllowAllMimeTypes && !MemCache.AllowedFileTypes.Contains(contentType))
            {
                return;
            }

            buffer = response.GetResponseStream();
        }
        finally
        {
            // Always release the response, including on the early return above
            // and when reading the body throws.
            response.Close();
        }

        // Step 2: save the page (to the page repository).
        crawler.Status = CrawlerStatusType.Save;
        crawler.Flush();

        // Data store mode "1" (SQLite repository) currently persists nothing: its insert
        // call was disabled in the original code. Every other mode stores to the file system.
        if (Settings.DataStoreMode != "1")
        {
            FileSystemUtility.StoreWebFile(url, buffer);
        }

        crawler.m_downloader.CrawledUrlSet.Add(url);
        crawler.m_downloader.CrawleHistroy.Add(new CrawlHistroyEntry() { Timestamp = DateTime.UtcNow, Url = url, Size = response.ContentLength });
        lock (crawler.m_downloader.TotalSizelock)
        {
            crawler.m_downloader.TotalSize += response.ContentLength;
        }

        // Step 3: extract links and add them to the queue.
        IQueueManager queue = crawler.m_downloader.UrlsQueueFrontier;
        if (contentType == "text/html" && queue.Count < MaxQueuedUrls)
        {
            crawler.Status = CrawlerStatusType.Parse;
            crawler.Flush();

            // Decode and parse only here, where the links are actually used, so that
            // non-HTML payloads are never decoded and scanned for links.
            // NOTE(review): the body is always decoded as UTF-8; pages in other
            // encodings may yield garbled links - confirm whether a charset is available.
            string html = Encoding.UTF8.GetString(buffer);
            string baseUri = Utility.GetBaseUri(url);

            foreach (string link in Parser.ExtractLinks(baseUri, html))
            {
                // Avoid crawler traps.
                if (link.Length > MaxLinkLength)
                {
                    continue;
                }

                // Avoid cycles: skip URLs that were already crawled.
                if (crawler.m_downloader.CrawledUrlSet.Contains(link))
                {
                    continue;
                }

                queue.Enqueue(link);
            }
        }

        Console.WriteLine("[{1}] Url: {0}", crawler.Url, crawler.m_downloader.CrawleHistroy.Count);

        crawler.Url = string.Empty;
        crawler.Status = CrawlerStatusType.Idle;
        crawler.MimeType = string.Empty;
        crawler.Flush();
    }
    catch (IOException ioEx)
    {
        // Connection resets and file-sharing violations are expected while crawling
        // and are ignored; any other I/O failure is logged instead of being dropped.
        if (!IsIgnorableFetchIoError(ioEx))
        {
            Logger.Error(ioEx.Message);
        }
    }
    catch (NotSupportedException)
    {
        // The URI prefix is not recognized; nothing can be done for this URL.
        // TODO: find a way to handle or report such URLs.
    }
    catch (Exception ex)
    {
        Logger.Error(ex.Message);
    }
}

/// <summary>
/// Tells whether an I/O failure raised during a fetch is one of the known, expected
/// cases: the remote host reset the connection, or the target file is in use by
/// another process.
/// </summary>
/// <param name="ioEx">The I/O exception caught while fetching or storing a page.</param>
/// <returns><c>true</c> when the error is a known case that can be ignored; otherwise <c>false</c>.</returns>
private static bool IsIgnorableFetchIoError(IOException ioEx)
{
    // WSAECONNRESET: an existing connection was forcibly closed by the remote host.
    const int ConnectionResetErrorCode = 10054;
    // 0x80070020 (ERROR_SHARING_VIOLATION): the file is being used by another process.
    const int SharingViolationHResult = -2147024864;

    SocketException socketEx = ioEx.InnerException as SocketException;
    if (socketEx != null)
    {
        return socketEx.NativeErrorCode == ConnectionResetErrorCode;
    }

    // Exception.HResult has a protected getter before .NET Framework 4.5 and a public
    // one afterwards, so look it up with both visibilities and tolerate a missing
    // property rather than dereferencing null inside an exception handler.
    System.Reflection.PropertyInfo hresultProperty = typeof(Exception).GetProperty(
        "HResult",
        System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.Public | System.Reflection.BindingFlags.NonPublic);
    if (hresultProperty == null)
    {
        return false;
    }

    object hresult = hresultProperty.GetValue(ioEx, null);
    return hresult is int && (int)hresult == SharingViolationHResult;
}