示例#1
0
 /// <summary>
 /// Worker loop for a collector thread: repeatedly takes a URL from the
 /// collector's URL queue, fetches the page and scans it for e-mail addresses.
 /// The loop never returns on its own; it only ends when the thread is aborted
 /// or an exception other than <see cref="InvalidOperationException"/> escapes.
 /// </summary>
 /// <param name="data">
 /// The thread argument; it is cast to <c>CollectorThread</c>, so any other
 /// type results in an <see cref="InvalidCastException"/>.
 /// </param>
 public static void DoWork(object data)
 {
     try
     {
         CollectorThread worker = (CollectorThread)data;
         // The collector that owns this worker.
         Collector owner = worker.Collector;
         // Queue of URLs that are still waiting to be visited.
         UrlQueue pending = owner.urlQueue;

         for (;;)
         {
             bool hasWork = pending.Count > 0;
             if (!hasWork)
             {
                 // Nothing queued right now; hand over to the idle helper and re-check.
                 SleepWhenQueueIsEmpty(worker);
                 continue;
             }

             try
             {
                 // Take the next URL off the queue and record it on the worker.
                 string target = (string)pending.Dequeue();
                 worker.Url = target;
                 if (worker.Dirty)
                 {
                     worker.CollectotThreadCallbacked(owner);
                 }

                 // Fetch the page, then scan it for e-mail addresses.
                 string page = HtmlHelper.GetHtml(target, "UTF-8");
                 CollectHelper.CollectEmail(page);
                 if (worker.Dirty)
                 {
                     worker.CollectotThreadCallbacked(owner);
                 }
             }
             catch (InvalidOperationException)
             {
                 // Handled exactly like an empty queue.
                 // NOTE(review): presumably raised by Dequeue when the queue was
                 // drained between the Count check and the call - confirm against UrlQueue.
                 SleepWhenQueueIsEmpty(worker);
             }
         }
     }
     catch (ThreadAbortException)
     {
         // The thread was aborted; there is nothing to clean up here.
         // On .NET Framework the runtime re-raises this exception once the catch
         // block completes unless Thread.ResetAbort is called, so the thread still ends.
     }
 }
示例#2
0
 /// <summary>
 /// Worker loop for a linker thread: repeatedly takes a URL from the linker's
 /// URL queue, fetches the page and passes the URL and its HTML to
 /// <c>LinkHelper.CollectEmail</c>. The loop never returns on its own; it only
 /// ends when the thread is aborted or an exception other than
 /// <see cref="InvalidOperationException"/> escapes.
 /// </summary>
 /// <param name="data">
 /// The thread argument; it is cast to <c>LinkerThread</c>, so any other type
 /// results in an <see cref="InvalidCastException"/>.
 /// </param>
 public static void DoWork(object data)
 {
     try
     {
         LinkerThread worker = (LinkerThread)data;
         // The linker that owns this worker.
         Linker owner = worker.Linker;
         // Queue of URLs that are still waiting to be visited.
         UrlQueue pending = owner.urlQueue;

         for (;;)
         {
             bool hasWork = pending.Count > 0;
             if (!hasWork)
             {
                 // Nothing queued right now; hand over to the idle helper and re-check.
                 SleepWhenQueueIsEmpty(worker);
                 continue;
             }

             try
             {
                 // Take the next URL off the queue and record it on the worker.
                 string target = (string)pending.Dequeue();
                 worker.Url = target;
                 if (worker.IsComplete)
                 {
                     worker.LinkerThreadStatusChanged(owner);
                 }

                 // Fetch the page, then let LinkHelper process it together with its URL.
                 string page = HtmlHelper.GetHtml(target, "UTF-8");
                 LinkHelper.CollectEmail(target, page);
                 if (worker.IsComplete)
                 {
                     worker.LinkerThreadStatusChanged(owner);
                 }
             }
             catch (InvalidOperationException)
             {
                 // Handled exactly like an empty queue.
                 // NOTE(review): presumably raised by Dequeue when the queue was
                 // drained between the Count check and the call - confirm against UrlQueue.
                 SleepWhenQueueIsEmpty(worker);
             }
         }
     }
     catch (ThreadAbortException)
     {
         // The thread was aborted; there is nothing to clean up here.
         // On .NET Framework the runtime re-raises this exception once the catch
         // block completes unless Thread.ResetAbort is called, so the thread still ends.
     }
 }
示例#3
0
 /// <summary>
 /// foamliu, 2009/12/27.
 /// Fetches the page at <paramref name="url"/>, extracts its links and adds
 /// every acceptable link to the owning collector's URL queue. A link is
 /// skipped when it is longer than 256 characters (crawler-trap guard) or when
 /// it is already contained in <c>HSCollectorUrl</c> (cycle guard).
 /// Any exception raised along the way is shown to the user in a message box
 /// and not rethrown.
 /// </summary>
 /// <remarks>
 /// NOTE(review): the original summary also listed "collect the e-mail
 /// addresses on the page" as a step, but no such call is made in this method.
 /// </remarks>
 /// <param name="collector">The worker whose Url is updated and whose owning collector supplies the queue.</param>
 /// <param name="url">Address of the page to fetch.</param>
 private static void Fetch(CollectorThread collector, string url)
 {
     try
     {
         // Record the URL being processed and notify if the worker reports Dirty.
         collector.Url = url;
         if (collector.Dirty)
         {
             collector.CollectotThreadCallbacked(collector);
         }

         // Fetch the page and pull the links out of it, resolved against the base URI.
         string page = HtmlHelper.GetHtml(url, "UTF-8");
         string[] found = UrlHelper.ExtractLinks(UrlHelper.GetBaseUri(url), page);

         // Queue every link that passes both filters.
         UrlQueue pending = collector.Collector.urlQueue;
         foreach (string candidate in found)
         {
             // Over-long URLs are dropped first (crawler-trap guard); only then is
             // the visited set consulted (cycle guard).
             bool rejected = candidate.Length > 256
                 || collector.Collector.HSCollectorUrl.Contains(candidate);
             if (!rejected)
             {
                 pending.Enqueue(candidate);
             }
         }

         // Second notification point, after the links have been queued.
         if (collector.Dirty)
         {
             collector.CollectotThreadCallbacked(collector);
         }
     }
     catch (Exception ex)
     {
         MessageBox.Show(ex.Message);
     }
 }