/// <summary>
/// Worker loop for a collector thread: repeatedly takes a URL from the
/// owning collector's queue, downloads the page as UTF-8 and hands the
/// HTML to <c>CollectHelper.CollectEmail</c>.
/// </summary>
/// <param name="data">
/// The thread state object. It is cast to <c>CollectorThread</c> without a
/// type check, so any other type raises <see cref="InvalidCastException"/>.
/// </param>
/// <remarks>
/// The loop has no exit condition of its own; it only ends when the thread
/// is aborted (the <see cref="ThreadAbortException"/> is caught here) or
/// when some other exception escapes.
/// </remarks>
public static void DoWork(object data)
{
    try
    {
        CollectorThread worker = (CollectorThread)data;

        // The collector that owns this worker.
        Collector owner = worker.Collector;

        // Queue of URLs that are still waiting to be visited.
        UrlQueue pending = owner.urlQueue;

        while (true)
        {
            // Nothing queued: back off and re-check.
            if (pending.Count <= 0)
            {
                SleepWhenQueueIsEmpty(worker);
                continue;
            }

            try
            {
                // Take the next URL off the queue and publish it on the worker.
                string url = (string)pending.Dequeue();
                worker.Url = url;

                if (worker.Dirty)
                {
                    worker.CollectotThreadCallbacked(owner);
                }

                // Download the page and scan it for e-mail addresses.
                string html = HtmlHelper.GetHtml(url, "UTF-8");
                CollectHelper.CollectEmail(html);

                if (worker.Dirty)
                {
                    worker.CollectotThreadCallbacked(owner);
                }
            }
            catch (InvalidOperationException)
            {
                // NOTE(review): presumably raised by Dequeue when the queue was
                // emptied between the Count check and the Dequeue call — confirm
                // against UrlQueue. Any InvalidOperationException thrown by the
                // other calls in this try block is handled the same way.
                SleepWhenQueueIsEmpty(worker);
            }
        }
    }
    catch (ThreadAbortException)
    {
        // The thread was aborted; leave the work loop.
    }
}
/// <summary>
/// Thread entry point for a linker thread: repeatedly takes a URL from the
/// owning linker's queue, downloads the page as UTF-8 and passes the URL
/// and HTML to <c>LinkHelper.CollectEmail</c>.
/// </summary>
/// <param name="data">
/// The thread state object. It is cast to <c>LinkerThread</c> without a
/// type check, so any other type raises <see cref="InvalidCastException"/>.
/// </param>
/// <remarks>
/// The loop has no exit condition of its own; it only ends when the thread
/// is aborted (the <see cref="ThreadAbortException"/> is caught here) or
/// when some other exception escapes.
/// </remarks>
public static void DoWork(object data)
{
    try
    {
        LinkerThread worker = (LinkerThread)data;

        // The linker that owns this worker.
        Linker owner = worker.Linker;

        // Queue of URLs that are still waiting to be visited.
        UrlQueue pending = owner.urlQueue;

        while (true)
        {
            // Nothing queued: back off and re-check.
            if (pending.Count <= 0)
            {
                SleepWhenQueueIsEmpty(worker);
                continue;
            }

            try
            {
                // Take the next URL off the queue and publish it on the worker.
                string url = (string)pending.Dequeue();
                worker.Url = url;

                if (worker.IsComplete)
                {
                    worker.LinkerThreadStatusChanged(owner);
                }

                // Download the page and let the helper process it.
                string html = HtmlHelper.GetHtml(url, "UTF-8");
                LinkHelper.CollectEmail(url, html);

                if (worker.IsComplete)
                {
                    worker.LinkerThreadStatusChanged(owner);
                }
            }
            catch (InvalidOperationException)
            {
                // NOTE(review): presumably raised by Dequeue when the queue was
                // emptied between the Count check and the Dequeue call — confirm
                // against UrlQueue. Any InvalidOperationException thrown by the
                // other calls in this try block is handled the same way.
                SleepWhenQueueIsEmpty(worker);
            }
        }
    }
    catch (ThreadAbortException)
    {
        // The thread was aborted; leave the work loop.
    }
}
/// <summary>
/// foamliu, 2009/12/27.
/// Downloads the page at <paramref name="url"/> as UTF-8, extracts its links
/// and enqueues the acceptable ones on the owning collector's URL queue.
/// The thread's callback is invoked before the download and again after the
/// links have been queued, each time only if the thread reports Dirty.
/// </summary>
/// <param name="collector">
/// The collector thread performing the fetch; its <c>Url</c> is set to
/// <paramref name="url"/> and its owning <c>Collector</c> supplies the queue
/// and the visited-URL set.
/// </param>
/// <param name="url">Address of the page to download.</param>
/// <remarks>
/// No e-mail extraction is performed by this method itself. Every exception
/// is caught and its message shown in a message box.
/// </remarks>
private static void Fetch(CollectorThread collector, string url)
{
    try
    {
        // Publish the URL being processed and notify if the thread is dirty.
        collector.Url = url;
        if (collector.Dirty)
        {
            collector.CollectotThreadCallbacked(collector);
        }

        // Download the page and pull out its links, resolved against the base URI.
        string html = HtmlHelper.GetHtml(url, "UTF-8");
        string[] extracted = UrlHelper.ExtractLinks(UrlHelper.GetBaseUri(url), html);

        UrlQueue pending = collector.Collector.urlQueue;
        foreach (string candidate in extracted)
        {
            // Keep a link only if it is at most 256 characters long (guards
            // against crawler traps) and is not in the collector's visited set
            // (guards against cycles). This method does not add to that set.
            if (candidate.Length <= 256
                && !collector.Collector.HSCollectorUrl.Contains(candidate))
            {
                pending.Enqueue(candidate);
            }
        }

        // Notify again once the links have been queued.
        if (collector.Dirty)
        {
            collector.CollectotThreadCallbacked(collector);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }
}