コード例 #1
0
        /// <summary>
        /// Downloads a single page and processes it (foamliu, 2009/12/27). The method:
        /// 1. fetches the page over HTTP,
        /// 2. stores the page through <c>FileSystemUtility.StoreWebFile</c> when the URL matches the output rule, and
        /// 3. extracts the links of the page and enqueues the ones that have not been crawled yet.
        /// Exceptions are traced and swallowed; they are not propagated to the caller.
        /// </summary>
        /// <param name="crawler">Crawler whose status is reported and whose downloader owns the URL queue and the statistics.</param>
        /// <param name="url">URL of the page to fetch.</param>
        /// <param name="regexOutput">Regular expression a URL must match for its page to be stored; when null or empty every page is stored.</param>
        /// <param name="regexFollow">Passed to <c>Parser.ExtractLinks</c> together with <paramref name="regexOutput"/>; links are only extracted when both are non-empty.</param>
        private static void Fetch(CrawlerThread crawler, string url, string regexOutput, string regexFollow)
        {
            try
            {
                // Fetch the page.
                crawler.Url = url;
                crawler.Status = CrawlerStatusType.Fetch;
                crawler.Flush();

                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
                // Use a timeout so slow servers or oversized pages do not hold the crawler up.
                req.Timeout = MemCache.ConnectionTimeoutMs;

                string contentType;
                long contentLength;
                byte[] buffer;

                // Dispose the response even when reading the body fails, so the connection is released.
                using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                {
                    contentType = crawler.MimeType = response.ContentType;
                    // Read the length while the response is still open.
                    contentLength = response.ContentLength;
                    buffer = ReadInstreamIntoMemory(response.GetResponseStream());
                }

                // ContentLength is -1 when the server sent no Content-Length header; fall back to
                // the number of bytes actually downloaded so the statistics never shrink.
                if (contentLength < 0)
                {
                    contentLength = buffer.Length;
                }

                // NOTE: no MIME-type filtering is applied here. The original check against
                // MemCache.AllowAllMimeTypes / MemCache.AllowedFileTypes had its early return disabled.

                // Save the page.
                crawler.Status = CrawlerStatusType.Save;
                crawler.Flush();

                // NOTE(review): the body is always decoded as UTF-8, whatever charset the server declared.
                string html = Encoding.UTF8.GetString(buffer);
                string baseUri = Utility.GetBaseUri(url);

                // Rewrite root-relative src attributes (src="/...") into absolute ones.
                html = html.Replace("src=\"/", "src=\"" + baseUri);
                buffer = Encoding.UTF8.GetBytes(html);

                // Links are only extracted when both rules are supplied; otherwise nothing is followed.
                string[] links = null;
                if (!string.IsNullOrEmpty(regexOutput) && !string.IsNullOrEmpty(regexFollow))
                {
                    links = Parser.ExtractLinks(baseUri, html, regexOutput, regexFollow);
                }

                // Data store mode "1" (SQLite repository) currently stores nothing; every other
                // mode writes the page to the file system.
                if (Settings.DataStoreMode != "1")
                {
                    // 2012.12.5: an empty output rule means "store every page". The URL itself must
                    // not be used as the pattern, because URLs contain regex metacharacters.
                    bool storeEveryPage = string.IsNullOrEmpty(regexOutput);
                    if (storeEveryPage || Regex.IsMatch(url, regexOutput))
                    {
                        FileSystemUtility.StoreWebFile(url, buffer, url);
                    }
                }

                crawler.m_downloader.CrawledUrlSet.Add(url);
                crawler.m_downloader.CrawleHistroy.Add(new CrawlHistroyEntry() { Timestamp = DateTime.UtcNow, Url = url, Size = contentLength });
                lock (crawler.m_downloader.TotalSizelock)
                {
                    crawler.m_downloader.TotalSize += contentLength;
                }

                // Extract the URLs and add them to the queue.
                if (links != null && contentType != null && contentType.Contains("text/html"))
                {
                    crawler.Status = CrawlerStatusType.Parse;
                    crawler.Flush();

                    UrlQueueManager queue = crawler.m_downloader.UrlsQueueFrontier;
                    foreach (string link in links)
                    {
                        // Avoid crawler traps.
                        if (link.Length > 256) continue;
                        // Avoid cycles.
                        if (crawler.m_downloader.CrawledUrlSet.Contains(link)) continue;
                        // Add to the queue.
                        queue.Enqueue(link);
                    }
                }

                crawler.Url = string.Empty;
                crawler.Status = CrawlerStatusType.Idle;
                crawler.MimeType = string.Empty;
                crawler.Flush();
            }
            catch (IOException ioEx)
            {
                // Typically the remote host reset the connection (socket error 10054) or the output
                // file is in use by another process (HRESULT 0x80070020). Neither is fatal.
                System.Diagnostics.Trace.TraceWarning("Fetch: I/O error for '{0}': {1}", url, ioEx.Message);
            }
            catch (NotSupportedException nsEx)
            {
                // The URI prefix is not recognised.
                System.Diagnostics.Trace.TraceWarning("Fetch: unsupported URI '{0}': {1}", url, nsEx.Message);
            }
            catch (Exception ex)
            {
                System.Diagnostics.Trace.TraceError("Fetch: failed for '{0}': {1}", url, ex);
            }
        }
コード例 #2
0
        /// <summary>
        /// Marks the crawler as idle, clears its current URL, publishes the change and then
        /// sleeps, so that an empty queue does not hog the CPU.
        /// </summary>
        /// <param name="crawler">Crawler to put to sleep.</param>
        private static void SleepWhenQueueIsEmpty(CrawlerThread crawler)
        {
            // Report the idle state before blocking.
            crawler.Status = CrawlerStatusType.Idle;
            crawler.Url = string.Empty;
            crawler.Flush();

            // Pause for the configured interval.
            var idleDelay = MemCache.ThreadSleepTimeWhenQueueIsEmptyMs;
            Thread.Sleep(idleDelay);
        }
コード例 #3
0
ファイル: Downloader.cs プロジェクト: phiree/ContentExtractor
        /// <summary>
        /// Creates and starts <c>MemCache.ThreadCount</c> crawler threads, subscribes to their
        /// status changes and marks the downloader as running. Does nothing when the crawler
        /// collection already exists.
        /// </summary>
        public void Start()
        {
            // A non-null collection is treated as "already started".
            if (m_crawlerThreads != null)
            {
                return;
            }

            m_crawlerThreads = new Collection<CrawlerThread>();

            for (int started = 0; started < MemCache.ThreadCount; started++)
            {
                CrawlerThread worker = new CrawlerThread(this, Regexcon, Regexcon2);
                worker.StatusChanged += CrawlerStatusChanged;
                worker.Start();

                m_crawlerThreads.Add(worker);
            }

            Status = DownloaderStatusType.Running;
        }