/// <summary>
/// Handles a click on the Start button: normalizes the address typed into
/// <c>txtWebSite</c>, resets the crawl state when the address differs from the
/// previous run, seeds the pending queue from the start page when nothing is
/// queued yet, and finally launches the crawl loops.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data; not used.</param>
private async void btnStart_Click(object sender, EventArgs e)
{
    // Number of concurrent crawl loops started per click.
    const int crawlLoopCount = 4;

    // Put the buttons into the "running" state.
    btnStart.Enabled = false;
    btnStop.Enabled = !btnStart.Enabled;

    // ---- Read and normalize the address from the text box ----
    string webSite = txtWebSite.Text;
    if (string.IsNullOrWhiteSpace(webSite))
    {
        MessageBox.Show("域名为空!");
        btnStart.Enabled = true;
        btnStop.Enabled = !btnStart.Enabled;
        return;
    }

    webSite = webSite.Trim();

    // Test for a real scheme prefix. A substring test would treat hosts or
    // paths that merely contain "http" (e.g. "httpbin.org") as already
    // having a scheme and leave them without one.
    bool hasScheme =
        webSite.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
        webSite.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    if (!hasScheme)
    {
        webSite = "http://" + webSite;
    }

    // Drop every trailing slash so the same site always compares equal.
    webSite = webSite.TrimEnd('/');

    // A different address than last time invalidates everything collected so far.
    if (websiteCopy != webSite)
    {
        dataGridViewTitleURL.Rows.Clear();
        urls.Clear();
        urlsToBrowse.Clear();
    }

    Crawler crawler = new Crawler();
    try
    {
        // ---- Seed the queue from the start page (only when nothing is pending) ----
        if (urls.Count == 0 && urlsToBrowse.Count == 0)
        {
            List<string> seedResult = await crawler.GetTitleAndHostBasedHTTPURLsFromURL(webSite);

            // The first element is the page title, not a link: show it, then remove it.
            if (seedResult.Count > 0)
            {
                // Rows.Add returns the index of the new row; using it avoids
                // depending on the grid's trailing new-row placeholder.
                int rowIndex = dataGridViewTitleURL.Rows.Add(seedResult[0], webSite);
                dataGridViewTitleURL.Rows[rowIndex].HeaderCell.Value = (rowIndex + 1).ToString();
                seedResult.RemoveAt(0);
            }

            urls.Add(webSite);
            foreach (string link in seedResult)
            {
                // urls.Add reports whether the link is new; only new links are queued.
                if (urls.Add(link))
                {
                    urlsToBrowse.Enqueue(link);
                }
            }
        }

        // ---- Launch the crawl loops ----
        // Each ThreadMethod call returns at its first await, so the loops
        // interleave rather than run one after another.
        Method method = this.ThreadMethod;
        for (int i = 0; i < crawlLoopCount; i++)
        {
            this.Invoke(method);
        }
    }
    catch (Exception except)
    {
        MessageBox.Show(except.Message);

        // Start-up failed, so return the buttons to the idle state,
        // exactly as the empty-input path above does.
        btnStart.Enabled = true;
        btnStop.Enabled = !btnStart.Enabled;
    }
    finally
    {
        crawler.Dispose();
    }

    // Remember the address so the next click can detect a change.
    websiteCopy = webSite;

    // On success the buttons intentionally stay in the "running" state:
    // ThreadMethod keeps looping while btnStart is disabled.
    lblUrls.Text = "URL:" + urls.Count;
    lblTitles.Text = "TITLE:" + (dataGridViewTitleURL.Rows.Count - 1);
}
/// <summary>
/// Crawl loop. Repeatedly takes one URL off <c>urlsToBrowse</c>, fetches its
/// title and same-host links, appends a row to the grid, and queues every
/// link that <c>urls</c> has not seen before. The loop ends when the queue is
/// empty or <c>btnStart</c> has been enabled again.
/// </summary>
/// <remarks>
/// The method is <c>async void</c> because it is assigned to the <c>Method</c>
/// delegate, so no caller can observe an exception thrown here. Every failure
/// is therefore handled inside the method.
/// NOTE(review): a loop that finds the queue empty exits for good, even when
/// other loops are mid-fetch and about to enqueue more links, so fewer loops
/// than were started may remain active. Confirm whether that is acceptable.
/// </remarks>
private async void ThreadMethod()
{
    Crawler crawler = new Crawler();
    try
    {
        while (urlsToBrowse.Count != 0 && btnStart.Enabled == false)
        {
            string url;
            string title = "";

            // Queue lock.
            lock (queueLock)
            {
                // Re-check under the lock: the loop condition was evaluated
                // outside it, and Dequeue throws on an empty queue.
                if (urlsToBrowse.Count == 0)
                {
                    break;
                }
                url = urlsToBrowse.Dequeue();
            }

            // Fetch the title and the same-host http links of this page.
            List<string> hostBasedHttpURL;
            try
            {
                hostBasedHttpURL = await crawler.GetTitleAndHostBasedHTTPURLsFromURL(url);
            }
            catch (Exception fetchError)
            {
                // One unreachable page must not end the whole crawl: skip it.
                System.Diagnostics.Debug.WriteLine("Skipping " + url + ": " + fetchError.Message);
                continue;
            }

            // The first element is the page title, not a link.
            if (hostBasedHttpURL.Count > 0)
            {
                title = hostBasedHttpURL[0];
                hostBasedHttpURL.RemoveAt(0);
            }

            // ---- Add the title and URL to the grid (best effort) ----
            try
            {
                // UI lock.
                lock (UILock)
                {
                    // Rows.Add returns the index of the new row; using it avoids
                    // depending on the grid's trailing new-row placeholder.
                    int rowIndex = dataGridViewTitleURL.Rows.Add(title, url);
                    dataGridViewTitleURL.Rows[rowIndex].HeaderCell.Value = (rowIndex + 1).ToString();

                    // Auto-scroll only while the scroll toggle shows its "stop" caption.
                    if (btnStopRoll.Text == "停止滚动")
                    {
                        dataGridViewTitleURL.FirstDisplayedScrollingRowIndex = dataGridViewTitleURL.Rows.Count - 1;
                    }

                    this.lblUrls.Text = "URL: " + urls.Count.ToString();
                    this.lblTitles.Text = "TITLE: " + (dataGridViewTitleURL.Rows.Count - 1).ToString();
                }
            }
            catch (Exception uiError)
            {
                // A display failure does not stop the crawl, but it leaves a trace.
                System.Diagnostics.Debug.WriteLine("Grid update failed for " + url + ": " + uiError.Message);
            }

            // ---- Queue every link that has not been seen before ----
            foreach (string each in hostBasedHttpURL)
            {
                // Write lock for the seen-URL set; the queue lock is always
                // taken inside it, never the other way round.
                lock (URLLock)
                {
                    if (urls.Add(each))
                    {
                        lock (queueLock)
                        {
                            urlsToBrowse.Enqueue(each);
                        }
                    }
                }
            }
        }
    }
    finally
    {
        // Release the crawler even when the loop is left by an exception.
        crawler.Dispose();
    }
}