Exemple #1
0
 /// <summary>
 /// Processes a downloaded page: enqueues every link found in an <c>href</c> attribute
 /// whose value contains <c>http://</c>, then prints the page title and the page URL
 /// to the console when a non-empty title is found.
 /// </summary>
 /// <param name="html">Raw HTML of the page. A null or empty value is ignored.</param>
 protected override void Process(string html)
 {
     // Nothing to parse. Previously a null value only "worked" because the
     // resulting exception was swallowed by the catch-all below.
     if (string.IsNullOrEmpty(html))
     {
         return;
     }

     try
     {
         // The quoted alternatives are tried first. With the unquoted form first,
         // a quoted href such as href="a" target="b"> captured everything up to
         // the closing '>' instead of just the attribute value.
         Regex linkPattern = new Regex(@"href=""(?<web_url>[\s\S]*?)""|href='(?<web_url>[\s\S]*?)'|href=(?<web_url>[\s\S]*?)>");
         foreach (Match link in linkPattern.Matches(html))
         {
             string url = this.RemoveQuotation(link.Groups["web_url"].Value);

             // Ordinal search: a URL scheme is not linguistic text.
             // NOTE(review): https:// links are still skipped, as before — confirm
             // whether the crawler is meant to follow them.
             if (url.IndexOf("http://", StringComparison.Ordinal) != -1)
             {
                 UrlQueue.GetInstance().Enqueue(url);
             }
         }

         // Match the tag case-insensitively on the original text rather than on a
         // lower-cased copy, so the reported title keeps its original casing.
         Regex titlePattern = new Regex(@"<title[\s\S]*?>(?<title>[\s\S]*?)</title>", RegexOptions.IgnoreCase);
         string title = titlePattern.Match(html).Groups["title"].Value;
         if (!string.IsNullOrEmpty(title))
         {
             Console.WriteLine("网页标题:{0}", title);
             Console.WriteLine("网页URL:{0}", this.Url);
         }
     }
     catch (Exception ex)
     {
         // Page processing stays best-effort (one bad page must not stop the
         // crawl), but the failure is reported instead of silently discarded.
         Console.Error.WriteLine("Failed to process page {0}: {1}", this.Url, ex.Message);
     }
 }
Exemple #2
0
        /// <summary>
        ///     Wipes the crawler's state from Azure Storage on demand: stores a
        ///     <c>CrawlrQueueSize(0, 0)</c> record in the status table, clears the XML and
        ///     URL queues, and deletes the URL and error tables.
        /// </summary>
        /// <remarks>
        ///     Because both tables are deleted, they have to be created again before the
        ///     crawler can run (per the original note, the worker role takes care of that).
        /// </remarks>
        public void ClearAll()
        {
            // Reset the recorded queue sizes before emptying the queues themselves.
            StatusTable.Execute(TableOperation.InsertOrReplace(new CrawlrQueueSize(0, 0)));

            // Drop all pending work.
            XmlQueue.Clear();
            UrlQueue.Clear();

            // Remove the result and error tables entirely.
            UrlTable.Delete();
            ErrorTable.Delete();
        }
        /// <summary>
        /// Private constructor. Creates the URL and content queues, the (initially empty)
        /// request and process thread lists, and resets the run status, flags and timing
        /// fields to their initial values.
        /// </summary>
        private SpiderBase()
        {
            // Work queues.
            urlQueue = new UrlQueue();
            contentQueue = new ContentQueue();

            // Thread bookkeeping; no threads exist yet.
            requestThreads = new List<Thread>();
            processThreads = new List<Thread>();

            // Lifecycle state: not started, not paused, not completed.
            runStatus = SpiderRunStatus.UnStarted;
            pauseCalled = false;
            completed = false;

            // Timing: zero elapsed time and no start time recorded.
            startTimespan = TimeSpan.Zero;
            startTime = DateTime.MinValue;

            runtime = new SpiderRuntime();
        }
Exemple #4
0
        /// <summary>
        /// Console entry point. Reads the "Title" and "URL" application settings, seeds
        /// the URL queue with the start URL, starts the thread manager, and then blocks
        /// until a line is read from standard input.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                var settings = System.Configuration.ConfigurationManager.AppSettings;

                // A missing "Title" key yields null; skip it rather than abort start-up.
                string title = settings["Title"];
                if (title != null)
                {
                    Console.Title = title;
                }
                Console.WriteLine("Process is running!");

                string url = settings["URL"];
                if (string.IsNullOrEmpty(url))
                {
                    // Without a start URL there is nothing to crawl; say so explicitly
                    // instead of failing with a swallowed NullReferenceException.
                    Console.Error.WriteLine("AppSettings key \"URL\" is missing or empty; nothing to crawl.");
                }
                else
                {
                    UrlQueue.GetInstance().Enqueue(url);
                    ThreadManager thread = new ThreadManager();
                    thread.Start();
                }
            }
            catch (Exception ex)
            {
                // Keep the console open (as before), but report the failure.
                Console.Error.WriteLine("Startup failed: {0}", ex);
            }
            Console.ReadLine();
        }
        /// <summary>
        /// Downloads one page, records it as done, passes every URL found on it to
        /// <c>SepcialMatch</c>, and enqueues the not-yet-seen, non-asset URLs for the next
        /// depth level unless the maximum search depth has been reached.
        /// </summary>
        /// <param name="urlGroup">The page to fetch; skipped if already in <c>DoneQueue</c>.</param>
        /// <param name="depath">Depth of <paramref name="urlGroup"/>; discovered links get <c>depath + 1</c>.</param>
        public void HandUrl(UrlGroup urlGroup, int depath)
        {
            if (DoneQueue.Contains(urlGroup))
            {
                return;
            }
            string html = HttpHelper.GetString(urlGroup.Url, Encoding.Default);

            lock (DoneQueueLock)
            {
                DoneQueue.Enqueue(urlGroup);
            }

            // 1. Collect every URL on the page.
            List<string> listAllUrl = MatchDomainURL(html);

            // 2. Filter out static assets (images, scripts) and URLs already known.
            string[] extArray = new string[] { ".jpg", ".png", ".gif", ".js" };
            List<string> mlstUrl = new List<string>();

            foreach (string url in listAllUrl)
            {
                var tmpurl = url.ToLower();
                bool isfile = false;
                foreach (string ext in extArray)
                {
                    if (tmpurl.Contains(ext))
                    {
                        isfile = true;
                        break;
                    }
                }

                // Special URLs are matched for every link, including filtered ones.
                SepcialMatch(url);

                // NOTE(review): the queues are filled with UrlGroup items but are probed
                // with a string here — confirm the element types/equality make this match.
                if (!isfile && !DoneQueue.Contains(url) && !UrlQueue.Contains(url))
                {
                    mlstUrl.Add(url);
                }
            }

            // Maximum search depth reached: do not enqueue the next level. This check
            // stays after the loop so SepcialMatch still runs for the deepest pages.
            if (depath + 1 > MaxSearchDepth)
            {
                return;
            }

            lock (UrlQueueLock)
            {
                foreach (string url in mlstUrl)
                {
                    UrlQueue.Enqueue(new UrlGroup()
                    {
                        Url = url, DepathNumber = depath + 1
                    });

                    // Back-pressure: pause longer the fuller the queue is. The larger
                    // threshold must be tested first; in the previous order the 50000
                    // branch could never be reached.
                    // NOTE(review): these sleeps happen while UrlQueueLock is held.
                    if (UrlQueue.Count > 50000)
                    {
                        Thread.Sleep(10000);
                    }
                    else if (UrlQueue.Count > 10000)
                    {
                        Thread.Sleep(3000);
                    }
                }
            }
        }
        /// <summary>
        /// Starts a crawl. The base URL is crawled synchronously at depth 0, after which
        /// two thread-pool workers are queued: one drains <c>UrlQueue</c> through
        /// <c>HandUrl</c>, the other drains <c>SpecUrlQueue</c> through <c>HandSpecialUrl</c>.
        /// </summary>
        /// <param name="url">
        /// Start URL. When non-empty it replaces <c>BaseURL</c>; when empty the current
        /// <c>BaseURL</c> is used. If both are empty the method returns without doing anything.
        /// </param>
        public void Search(string url)
        {
            if (string.IsNullOrEmpty(url) && string.IsNullOrEmpty(BaseURL))
            {
                return;
            }
            if (!string.IsNullOrEmpty(url))
            {
                BaseURL = url;
            }

            HandUrl(new UrlGroup()
            {
                Url = BaseURL, DepathNumber = 0
            }, 0);

            Thread.Sleep(1000);

            // Worker 1: regular URLs. Exceptions are deliberately not caught here; the
            // previous handler only rethrew them via "throw ex;", which added nothing
            // except discarding the original stack trace.
            ThreadPool.QueueUserWorkItem(state =>
            {
                while (true)
                {
                    if (IsStop || IsAllDone)
                    {
                        break;
                    }
                    if (IsPause)
                    {
                        // Yield while paused instead of spinning at full CPU.
                        Thread.Sleep(100);
                        continue;
                    }
                    if (UrlQueue.Count == 0 && SpecUrlQueue.Count == 0)
                    {
                        break;
                    }
                    if (UrlQueue.Count == 0)
                    {
                        // Wait for more work, then re-evaluate every condition; falling
                        // through would dequeue from a queue that may still be empty.
                        Thread.Sleep(10000);
                        continue;
                    }

                    UrlGroup urlGroup = (UrlGroup)UrlQueue.Dequeue();
                    HandUrl(urlGroup, urlGroup.DepathNumber);
                    Thread.Sleep(1000);
                }
            });

            // Worker 2: special URLs. Keeps running until the crawl is flagged as done
            // and the special queue has been emptied.
            ThreadPool.QueueUserWorkItem(state =>
            {
                while (true)
                {
                    if (IsStop)
                    {
                        break;
                    }
                    if (IsPause)
                    {
                        // Yield while paused instead of spinning at full CPU.
                        Thread.Sleep(100);
                        continue;
                    }
                    if (IsAllDone && SpecUrlQueue.Count == 0)
                    {
                        break;
                    }
                    if (SpecUrlQueue.Count == 0)
                    {
                        // Same as above: wait, then re-check rather than dequeue blindly.
                        Thread.Sleep(10000);
                        continue;
                    }

                    HandSpecialUrl(SpecUrlQueue.Dequeue().ToString());
                    Thread.Sleep(1000);
                }
            });
        }