Пример #1
0
        /// <summary>
        /// Integration test: issues a keep-alive request to a live site and checks
        /// the content type, content length and keep-alive flag reported by the response.
        /// Requires network access.
        /// </summary>
        public void GetResponseTest()
        {
            Uri          uri        = new Uri("http://www.sohu.com");
            bool         bKeepAlive = true;
            NWebRequest  request    = new NWebRequest(uri, bKeepAlive);
            NWebResponse response   = request.GetResponse();

            try
            {
                Assert.AreEqual("text/html", response.ContentType);
                // NOTE(review): this threshold depends on the current size of the live
                // page and may need adjusting if the site changes.
                Assert.IsTrue(response.ContentLength > 360000);
                Assert.AreEqual(true, response.KeepAlive);
            }
            finally
            {
                // Release the connection even when an assertion fails.
                response.Close();
            }
        }
Пример #2
0
        /// <summary>
        /// foamliu, 2009/12/27.
        /// Processes a single URL on behalf of a crawler thread. It does three things:
        /// 1. Downloads the page.
        /// 2. Saves the page (to the page repository).
        /// 3. Extracts the links from HTML pages and adds them to the URL queue.
        /// Failures are handled inside the method: the two known I/O conditions
        /// (connection reset, file sharing violation) and unsupported URI schemes are
        /// ignored, every other exception is logged. No exception is rethrown.
        /// </summary>
        /// <param name="crawler">Crawler thread whose status is updated while the URL is processed.</param>
        /// <param name="url">Absolute URL of the page to download.</param>
        private static void Fetch(CrawlerThread crawler, string url)
        {
            try
            {
                // Download the page.
                crawler.Url    = url;
                crawler.Status = CrawlerStatusType.Fetch;
                crawler.Flush();

                NWebRequest req = new NWebRequest(new Uri(url), true);
                // Set a timeout so no time is wasted on slow servers or oversized pages.
                req.Timeout = MemCache.ConnectionTimeoutMs;
                NWebResponse response = req.GetResponse();

                string contentType;
                byte[] buffer;
                try
                {
                    contentType = crawler.MimeType = response.ContentType;

                    if (contentType != "text/html" &&
                        !MemCache.AllowAllMimeTypes &&
                        !MemCache.AllowedFileTypes.Contains(contentType))
                    {
                        // MIME type is not wanted; the finally block still closes the response.
                        return;
                    }

                    buffer = response.GetResponseStream();
                }
                finally
                {
                    // Always release the connection, including on the early return above
                    // and when reading the body throws.
                    response.Close();
                }

                // Save the page (to the page repository).
                crawler.Status = CrawlerStatusType.Save;
                crawler.Flush();

                // Data store mode "1" (database-backed repository) currently has no
                // implementation, so nothing is persisted in that mode.
                if (Settings.DataStoreMode != "1")
                {
                    FileSystemUtility.StoreWebFile(url, buffer);
                }

                crawler.m_downloader.CrawledUrlSet.Add(url);
                crawler.m_downloader.CrawleHistroy.Add(new CrawlHistroyEntry()
                {
                    Timestamp = DateTime.UtcNow, Url = url, Size = response.ContentLength
                });
                lock (crawler.m_downloader.TotalSizelock)
                {
                    crawler.m_downloader.TotalSize += response.ContentLength;
                }

                // Extract the links and add them to the queue.
                IQueueManager queue = crawler.m_downloader.UrlsQueueFrontier;

                if (contentType == "text/html" &&
                    queue.Count < 1000)
                {
                    crawler.Status = CrawlerStatusType.Parse;
                    crawler.Flush();

                    // Decode and parse only when the links are actually going to be used.
                    // NOTE(review): the body is always decoded as UTF-8; pages served in
                    // another charset may yield garbled links - confirm whether that matters.
                    string   html    = Encoding.UTF8.GetString(buffer);
                    string   baseUri = Utility.GetBaseUri(url);
                    string[] links   = Parser.ExtractLinks(baseUri, html);

                    foreach (string link in links)
                    {
                        // Guard against crawler traps (extremely long URLs).
                        if (link.Length > 256)
                        {
                            continue;
                        }
                        // Avoid cycles: skip URLs that were already crawled.
                        if (crawler.m_downloader.CrawledUrlSet.Contains(link))
                        {
                            continue;
                        }
                        // Add to the queue.
                        queue.Enqueue(link);
                    }
                }

                Console.WriteLine("[{1}] Url: {0}", crawler.Url, crawler.m_downloader.CrawleHistroy.Count);

                crawler.Url      = string.Empty;
                crawler.Status   = CrawlerStatusType.Idle;
                crawler.MimeType = string.Empty;
                crawler.Flush();
            }
            catch (IOException ioEx)
            {
                // The two known conditions are ignored on purpose; anything else is
                // logged so unexpected I/O failures do not disappear silently.
                if (!IsKnownIoFailure(ioEx))
                {
                    Logger.Error(ioEx.Message);
                }
            }
            catch (NotSupportedException /*nsEx*/)
            {
                // The URI prefix (scheme) is not recognized.
                // TODO: nothing can be done about it here yet; think of a way to handle it.
            }
            catch (Exception ex)
            {
                Logger.Error(ex.Message);
            }
        }

        /// <summary>
        /// Tells whether an I/O failure is one of the two conditions the crawler
        /// deliberately ignores: a connection reset by the remote host, or a file
        /// that is in use by another process.
        /// </summary>
        /// <param name="ioEx">The I/O exception raised while fetching or storing a page.</param>
        /// <returns>true when the failure is one of the known, ignorable conditions.</returns>
        private static bool IsKnownIoFailure(IOException ioEx)
        {
            // Winsock WSAECONNRESET: the remote host forcibly closed an existing connection.
            const int connectionResetErrorCode = 10054;
            // HRESULT 0x80070020 (ERROR_SHARING_VIOLATION): the file is in use by another process.
            const int sharingViolationHResult = -2147024864;

            SocketException socketEx = ioEx.InnerException as SocketException;
            if (socketEx != null)
            {
                return socketEx.NativeErrorCode == connectionResetErrorCode;
            }

            // Exception.HResult is protected on older .NET Framework versions and has a
            // public getter on newer ones, so search both visibilities; a NonPublic-only
            // lookup returns null when the getter is public.
            System.Reflection.PropertyInfo hresultProperty = typeof(Exception).GetProperty(
                "HResult",
                System.Reflection.BindingFlags.Instance |
                System.Reflection.BindingFlags.Public |
                System.Reflection.BindingFlags.NonPublic);
            if (hresultProperty == null)
            {
                return false;
            }

            object hresult = hresultProperty.GetValue(ioEx, null);
            return hresult is int && (int)hresult == sharingViolationHResult;
        }