Ejemplo n.º 1
0
        /// <summary>
        /// Handles the Start button click: normalizes the address typed into
        /// <c>txtWebSite</c>, seeds the URL set and work queue from that page when
        /// they are empty, and then launches the crawl workers.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        private async void btnStart_Click(object sender, EventArgs e)
        {
            // Switch the buttons to the "running" state.
            btnStart.Enabled = false;
            btnStop.Enabled = true;

            #region Extract and normalize the URL from the text box
            string webSite = txtWebSite.Text;
            if (string.IsNullOrWhiteSpace(webSite))
            {
                MessageBox.Show("域名为空!");
                btnStart.Enabled = true;
                btnStop.Enabled = false;
                return;
            }

            // Surrounding whitespace would otherwise end up inside the URL.
            webSite = webSite.Trim();

            // Only a real scheme prefix counts; "http" appearing elsewhere in the
            // address (e.g. in the path) must not suppress the default scheme.
            bool hasScheme =
                webSite.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                webSite.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                webSite = "http://" + webSite;
            }

            // Drop trailing slashes so the same site always compares equal.
            webSite = webSite.TrimEnd('/');

            // A different site than last time invalidates everything collected so far.
            if (websiteCopy != webSite)
            {
                dataGridViewTitleURL.Rows.Clear();
                urls.Clear();
                urlsToBrowse.Clear();
            }
            #endregion

            try
            {
                #region Seed the URL set and queue from the site's start page
                if (urls.Count == 0 && urlsToBrowse.Count == 0)
                {
                    Crawler crawler = new Crawler();
                    try
                    {
                        List<string> hostBasedHttpURL = await crawler.GetTitleAndHostBasedHTTPURLsFromURL(webSite);

                        // The first entry is the page title, not a URL: show it and remove it.
                        if (hostBasedHttpURL.Count > 0)
                        {
                            dataGridViewTitleURL.Rows.Add(hostBasedHttpURL[0], webSite);
                            // Count - 2 addresses the row just added; this presumably relies on the
                            // grid showing a trailing "new row" placeholder — TODO confirm.
                            dataGridViewTitleURL.Rows[dataGridViewTitleURL.Rows.Count - 2].HeaderCell.Value = (dataGridViewTitleURL.Rows.Count - 1).ToString();
                            hostBasedHttpURL.RemoveAt(0);
                        }

                        urls.Add(webSite);

                        // Queue only URLs that have not been seen before.
                        foreach (string url in hostBasedHttpURL)
                        {
                            if (urls.Add(url))
                            {
                                urlsToBrowse.Enqueue(url);
                            }
                        }
                    }
                    finally
                    {
                        crawler.Dispose();
                    }
                }
                #endregion

                #region Start the crawl workers
                const int workerCount = 4;
                Method method = this.ThreadMethod;

                // Each Invoke starts one worker loop. ThreadMethod is async void, so each
                // call returns at its first await and the loops proceed concurrently.
                for (int worker = 0; worker < workerCount; worker++)
                {
                    this.Invoke(method);
                }
                #endregion
            }
            catch (Exception except)
            {
                MessageBox.Show(except.Message);

                // The run was aborted: return the buttons to the idle state so the user
                // can retry. Workers poll btnStart.Enabled and stop once it is true.
                btnStart.Enabled = true;
                btnStop.Enabled = false;
            }

            // Remember the site so the next click can tell whether the address changed.
            websiteCopy = webSite;

            // The buttons are intentionally not reset on success: the workers keep
            // running after this handler returns.
            lblUrls.Text = "URL:" + urls.Count;
            lblTitles.Text = "TITLE:" + (dataGridViewTitleURL.Rows.Count - 1);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Crawl worker loop. While the work queue is non-empty and the Start button
        /// is still disabled, it dequeues a URL, fetches its title and same-host links,
        /// appends a row to the grid, and enqueues any links not seen before.
        /// </summary>
        /// <remarks>
        /// This method is <c>async void</c>, so callers cannot observe its completion
        /// or its exceptions; failures are therefore handled inside the loop.
        /// </remarks>
        private async void ThreadMethod()
        {
            Crawler crawler = new Crawler();

            try
            {
                // btnStart being re-enabled is the stop signal for every worker.
                while (btnStart.Enabled == false)
                {
                    string url;
                    string title = "";

                    // Check and dequeue under the same lock so the queue cannot be
                    // emptied between the test and the Dequeue call.
                    lock (queueLock)
                    {
                        if (urlsToBrowse.Count == 0)
                        {
                            break;
                        }

                        url = urlsToBrowse.Dequeue();
                    }

                    // Fetch the page title and the http links on the same host.
                    List<string> hostBasedHttpURL;
                    try
                    {
                        hostBasedHttpURL = await crawler.GetTitleAndHostBasedHTTPURLsFromURL(url);
                    }
                    catch (Exception fetchError)
                    {
                        // One unreachable page must not end the crawl; an exception that
                        // escapes an async void method cannot be caught by any caller.
                        System.Diagnostics.Debug.WriteLine("Failed to crawl " + url + ": " + fetchError.Message);
                        continue;
                    }

                    // The first entry is the page title, not a URL.
                    if (hostBasedHttpURL.Count > 0)
                    {
                        title = hostBasedHttpURL[0];
                        hostBasedHttpURL.RemoveAt(0);
                    }

                    #region Append the title and URL to the grid
                    try
                    {
                        lock (UILock)
                        {
                            dataGridViewTitleURL.Rows.Add(title, url);
                            // Count - 2 addresses the row just added; this presumably relies on the
                            // grid showing a trailing "new row" placeholder — TODO confirm.
                            dataGridViewTitleURL.Rows[dataGridViewTitleURL.Rows.Count - 2].HeaderCell.Value = (dataGridViewTitleURL.Rows.Count - 1).ToString();
                            if (btnStopRoll.Text == "停止滚动")
                            {
                                // Auto-scroll is on: keep the newest row visible.
                                dataGridViewTitleURL.FirstDisplayedScrollingRowIndex = dataGridViewTitleURL.Rows.Count - 1;
                            }
                            this.lblUrls.Text = "URL: " + urls.Count.ToString();
                            this.lblTitles.Text = "TITLE: " + (dataGridViewTitleURL.Rows.Count - 1).ToString();
                        }
                    }
                    catch (Exception uiError)
                    {
                        // A display failure is best-effort and must not stop the crawl,
                        // but leave a trace instead of discarding it silently.
                        System.Diagnostics.Debug.WriteLine("Failed to update the grid for " + url + ": " + uiError.Message);
                    }
                    #endregion

                    // Queue every link that has not been seen before.
                    foreach (string each in hostBasedHttpURL)
                    {
                        lock (URLLock)
                        {
                            if (urls.Add(each))
                            {
                                lock (queueLock)
                                {
                                    urlsToBrowse.Enqueue(each);
                                }
                            }
                        }
                    }
                }
            }
            finally
            {
                // Release the crawler on every exit path, including unexpected exceptions.
                crawler.Dispose();
            }
        }