Example #1
0
        /// <summary>
        /// Puts the calling thread to sleep while the URL queue is empty, so that an idle
        /// worker does not hog the CPU.
        /// </summary>
        /// <param name="collector">
        /// Worker whose <c>Url</c> is cleared and whose callback is raised when its
        /// <c>Dirty</c> flag is set.
        /// </param>
        private static void SleepWhenQueueIsEmpty(CollectorThread collector)
        {
            // Length of one idle pause: ten seconds, expressed in milliseconds.
            const int idleMilliseconds = 10 * 1000;

            // Nothing is being fetched during the pause, so clear the current URL.
            collector.Url = string.Empty;

            // Raise the callback only when the Dirty flag is set.
            if (collector.Dirty)
            {
                collector.CollectotThreadCallbacked(collector);
            }

            Thread.Sleep(idleMilliseconds);
        }
Example #2
0
 /// <summary>
 /// Starts the crawl: creates <c>ThreadCount</c> collector threads, subscribes the
 /// status handler on each of them and starts them one after another.
 /// </summary>
 public void Start()
 {
     // Allocate one slot per configured worker thread.
     _collectorThreads = new CollectorThread[ThreadCount];

     int index = 0;
     while (index < ThreadCount)
     {
         // Each worker is named after its zero-based index.
         CollectorThread worker = new CollectorThread(this);
         worker.Name = index.ToString();

         // Subscribe the status handler before the worker starts running.
         worker.CollectotThreadCallbacked += CollectorThreadStatusChanged;
         worker.Start();

         // NOTE(review): the array allocated above is _collectorThreads, but the worker is
         // stored through CrawlerThreads - confirm both refer to the same array.
         CrawlerThreads[index] = worker;

         index++;
     }
 }
Example #3
0
 /// <summary>
 /// Worker loop: repeatedly takes a URL from the owning collector's queue, downloads the
 /// page and passes its HTML to the e-mail collector. Sleeps whenever the queue is empty.
 /// The loop never exits on its own; a <see cref="ThreadAbortException"/> is caught and
 /// ignored at the outermost level.
 /// </summary>
 /// <param name="data">
 /// The <c>CollectorThread</c> running this loop. It is cast directly, so any other type
 /// results in an <see cref="InvalidCastException"/>.
 /// </param>
 public static void DoWork(object data)
 {
     try
     {
         CollectorThread worker = (CollectorThread)data;
         // The collector that owns this worker.
         Collector owner = worker.Collector;
         // Queue of URLs that are about to be visited.
         UrlQueue pending = owner.urlQueue;

         for (;;)
         {
             bool hasWork = pending.Count > 0;
             if (!hasWork)
             {
                 SleepWhenQueueIsEmpty(worker);
                 continue;
             }

             try
             {
                 // Take the next URL from the queue and record it on the worker.
                 string address = (string)pending.Dequeue();
                 worker.Url = address;
                 if (worker.Dirty)
                 {
                     // NOTE(review): the callback receives the Collector here, not the
                     // CollectorThread - confirm this is the intended argument.
                     worker.CollectotThreadCallbacked(owner);
                 }

                 // Download the page, then scan it for e-mail addresses.
                 string markup = HtmlHelper.GetHtml(address, "UTF-8");
                 CollectHelper.CollectEmail(markup);
                 if (worker.Dirty)
                 {
                     worker.CollectotThreadCallbacked(owner);
                 }
             }
             catch (InvalidOperationException)
             {
                 // Any InvalidOperationException raised in this iteration is handled like
                 // an empty queue (presumably Dequeue lost a race with another worker -
                 // TODO confirm).
                 SleepWhenQueueIsEmpty(worker);
             }
         }
     }
     catch (ThreadAbortException)
     {
         // The thread was aborted; nothing to clean up here.
     }
 }
Example #4
0
 /// <summary>
 /// foamliu, 2009/12/27.
 /// Downloads a page, extracts its links and enqueues every link that passes the length
 /// and already-visited checks. Any exception is reported through a message box.
 /// NOTE(review): the original header also listed e-mail extraction as a third step, but
 /// no such call is present in this method - confirm whether it was moved elsewhere.
 /// </summary>
 /// <param name="collector">Worker whose <c>Url</c> is updated and whose collector supplies the queue.</param>
 /// <param name="url">Address of the page to download.</param>
 private static void Fetch(CollectorThread collector, string url)
 {
     try
     {
         // Record the URL being fetched and raise the callback if the Dirty flag is set.
         collector.Url = url;
         if (collector.Dirty)
         {
             collector.CollectotThreadCallbacked(collector);
         }

         // Download the page and extract its links using the URL's base URI.
         string pageHtml = HtmlHelper.GetHtml(url, "UTF-8");
         string[] found = UrlHelper.ExtractLinks(UrlHelper.GetBaseUri(url), pageHtml);

         UrlQueue pending = collector.Collector.urlQueue;
         foreach (string candidate in found)
         {
             // Overlong URLs are skipped to avoid crawler traps; links already present in
             // the collector's URL set are skipped to avoid cycles.
             bool tooLong = candidate.Length > 256;
             if (!tooLong && !collector.Collector.HSCollectorUrl.Contains(candidate))
             {
                 pending.Enqueue(candidate);
             }
         }

         if (collector.Dirty)
         {
             collector.CollectotThreadCallbacked(collector);
         }
     }
     catch (Exception failure)
     {
         MessageBox.Show(failure.Message);
     }
 }