/// <summary>
/// Fetches coach information: downloads the given page and appends one
/// <c>DetailedInfo</c> entry to <c>listDetailedInfo</c> for every list item found.
/// </summary>
/// <param name="urlAddress">Address of the page to crawl.</param>
private static void GetDetail(string urlAddress)
{
    try
    {
        string html = HTTPGeneral.HTTPCrawler(urlAddress, "GET");

        // The <ul ... class="list..."> block that holds the entries.
        Match listBlock = new Regex("(?i)<ul[^>]+class[=\"\'\\s]+list[\"\']?[^>]*>(?:(?!<\\/ul>)[\\s\\S])+<\\/ul>").Match(html);

        // Every <li>...</li> element inside that block.
        MatchCollection entries = new Regex("(?i)<li[^>]*>(?:(?!<\\/li>)[\\s\\S])+<\\/li>").Matches(listBlock.Value);

        int position = 0;
        foreach (Match entry in entries)
        {
            position++;
            string markup = entry.Value;

            // Pull the link target, image source and alt text out of the item markup.
            Match hrefMatch = Regex.Match(markup, string.Format(regexA, "href"), RegexOptions.IgnoreCase);
            Match srcMatch = Regex.Match(markup, string.Format(regex, "src"), RegexOptions.IgnoreCase);
            Match altMatch = Regex.Match(markup, string.Format(regex, "alt"), RegexOptions.IgnoreCase);

            // Assemble the record; Index is 1-based.
            DetailedInfo info = new DetailedInfo();
            info.Index = position;
            info.Name = altMatch.Groups["alt"].Value;
            info.CommentAddress = $"{urlCommon}{hrefMatch.Groups["href"].Value}";
            info.ImgAddress = $"{urlCommon}{srcMatch.Groups["src"].Value}";
            listDetailedInfo.Add(info);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message, "错误提示", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
}
/// <summary>
/// "Crawl" menu handler: downloads the list page selected in <c>list_url</c>,
/// finds the highest page number among its paginated links
/// (<c>...list..._N.html</c>) and fills <c>list_sonurl</c> with one URL per page.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void 爬取ToolStripMenuItem_Click(object sender, EventArgs e)
{
    if (list_url.SelectedItem == null)
    {
        MessageBox.Show("请选择地址", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
        return;
    }

    // Request the first-level list page; a null response is treated as an empty page.
    string selectedUrl = list_url.SelectedItem.ToString();
    string result = HTTPGeneral.HTTPCrawler(selectedUrl, "GET") ?? string.Empty;

    List<string> listUrl = new List<string>();
    string prefix_url = string.Empty;
    bool foundPage = false;
    int maxPage = 0;

    var matches = Regex.Matches(result, @"<a href=""([^>]+?)"">([^<]+?)</a>", RegexOptions.Multiline);
    foreach (Match match in matches)
    {
        var matchHref = Regex.Match(match.Value, @"<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*>", RegexOptions.IgnoreCase).Groups["href"].Value;

        // Only paginated list links ("...list..._N.html") are of interest.
        if (!matchHref.Contains("list") || !matchHref.Contains("_"))
        {
            continue;
        }

        string absoluteUrl = $"http://www.46ek.com{matchHref}";
        if (listUrl.Contains(absoluteUrl))
        {
            continue;
        }
        listUrl.Add(absoluteUrl);

        // "<prefix>_<page>.html": the page number sits between the first '_' and the next '.'.
        string[] parts = absoluteUrl.Split('_');
        int dot = parts[1].IndexOf('.');
        int page;
        if (dot < 0 || !int.TryParse(parts[1].Substring(0, dot), out page))
        {
            // Malformed pagination link: skip it instead of throwing.
            continue;
        }

        prefix_url = parts[0];
        foundPage = true;
        if (page > maxPage)
        {
            maxPage = page;
        }
    }

    // Second-level menu.
    list_sonurl.Items.Clear();
    if (!foundPage)
    {
        // No pagination links were found (previously an ArgumentOutOfRangeException):
        // the selected page is the only page.
        list_sonurl.Items.Add(selectedUrl);
    }
    else
    {
        for (int i = 1; i <= maxPage; i++)
        {
            // Page 1 has no "_N" suffix.
            list_sonurl.Items.Add(i == 1 ? $"{prefix_url}.html" : $"{prefix_url}_{i}.html");
        }
    }

    list_sonurl.Enabled = true;
}
/// <summary>
/// Crawler worker. Processes this thread's share of the id range
/// [<c>minValue</c>, <c>maxValue</c>]: requests <c>http://www.46ek.com/view/{id}.html</c>,
/// applies <c>regexString</c> to the response and records every non-empty match in <c>fileList</c>.
/// </summary>
/// <remarks>
/// The current thread's <c>Name</c> must be its 1-based worker number (1..<c>threadNumber</c>).
/// The range is split evenly; the last worker also takes the remainder.
/// </remarks>
public void Crawler()
{
    DateTime beginTime = DateTime.Now;
    int threadOrder = Convert.ToInt32(Thread.CurrentThread.Name);

    long total = maxValue - minValue + 1;
    long step = total / threadNumber;
    long beginValue = minValue + step * (threadOrder - 1);
    long endValue = beginValue + step;
    if (threadOrder == threadNumber)
    {
        // The last worker picks up whatever the integer division left over.
        endValue += total % threadNumber;
    }

    for (long i = beginValue; i < endValue; i++)
    {
        string url = $"http://www.46ek.com/view/{i}.html";
        try
        {
            string result = HTTPGeneral.HTTPCrawler(url, "GET");

            // The static Regex.Match overload reuses the framework's cached Regex
            // instead of constructing a new one for every page.
            Match m = Regex.Match(result, regexString);
            if (!string.IsNullOrEmpty(m.Value))
            {
                lock (o)
                {
                    fileList.Add(new Model.FileInfo()
                    {
                        FileID = i.ToString(),
                        FileName = m.Value.Substring(m.Value.LastIndexOf('/') + 1),
                        FileSize = GetFileSize(m.Value),
                        FileUrl = m.Value,
                        SynProgress = "0%",
                        Tag = "等待",
                        SynSpeed = "0KB",
                        DownPath = m.Value,
                        Async = true
                    });
                    Console.WriteLine(m.Value);
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: report the failing page (previously swallowed silently),
            // back off for a second and move on to the next id.
            Console.WriteLine($"爬取地址[{url}]访问错误,信息: {ex.Message}");
            Thread.Sleep(1000);
        }
    }

    TimeSpan timeSpan = DateTime.Now - beginTime;
    Console.WriteLine($"线程ID:{Thread.CurrentThread.Name} ----> Start: {beginValue} -- End:{endValue}, 耗时:{timeSpan.TotalMinutes}分钟。");

    // No Thread.CurrentThread.Abort() here: returning ends the thread, and
    // Thread.Abort throws PlatformNotSupportedException on .NET Core / .NET 5+.
}
/// <summary>
/// "Start crawling" button handler: downloads the page entered in <c>txt_Url</c>,
/// collects every distinct link whose href contains "list" and shows them,
/// sorted, in <c>list_url</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void But_Crawler_Click(object sender, EventArgs e)
{
    /*
     * Addresses:
     *   http://www.46ek.com
     *   http://www.46ek.com/list/2.html
     *   http://www.46ek.com/view/22157.html
     * Regex:
     *   http://m4.26ts.com/[.0-9-a-zA-Z]*.mp4
     */
    string address = txt_Url.Text.Trim();
    if (string.IsNullOrEmpty(address))
    {
        MessageBox.Show("请填写地址", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
        // Stop here: previously the request below was still sent with an empty URL.
        return;
    }

    // Request the page; a null response is treated as an empty page.
    string result = HTTPGeneral.HTTPCrawler(address, "GET") ?? string.Empty;

    // Absolute URLs of the list pages found on the page.
    List<string> listUrl = new List<string>();

    // Extract the page's <a> tags.
    var matches = Regex.Matches(result, @"<a href=""([^>]+?)"">([^<]+?)</a>", RegexOptions.Multiline);
    foreach (Match match in matches)
    {
        var matchHref = Regex.Match(match.Value, @"<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*>", RegexOptions.IgnoreCase).Groups["href"].Value;

        // Movie section: keep each "list" link once.
        string absoluteUrl = $"http://www.46ek.com{matchHref}";
        if (matchHref.Contains("list") && !listUrl.Contains(absoluteUrl))
        {
            listUrl.Add(absoluteUrl);
        }
    }

    listUrl.Sort();

    list_url.Items.Clear();
    foreach (var item in listUrl)
    {
        list_url.Items.Add(item);
    }

    // Enable the list now that it has been populated.
    list_url.Enabled = true;
}
/// <summary>
/// Crawler worker. Processes this thread's share of the positions
/// [<c>minValue</c>, <c>maxValue</c>] in <c>listIndex</c>: requests
/// <c>http://www.46ek.com/view/{id}.html</c> for each id, tries the comma-separated
/// patterns in <c>regexString</c> in order and records the first non-empty match in <c>fileList</c>.
/// </summary>
/// <remarks>
/// The current thread's <c>Name</c> must be its 1-based worker number (1..<c>threadNumber</c>).
/// The range is split evenly; the last worker also takes the remainder.
/// </remarks>
public void Crawlers()
{
    DateTime beginTime = DateTime.Now;
    int threadOrder = Convert.ToInt32(Thread.CurrentThread.Name); // worker number

    // fileList is shared by all workers. Previously every worker replaced it on
    // start, so a late-starting worker discarded results the others had already
    // added. Now only worker 1 starts a fresh list for the run; the others create
    // one only if none exists yet.
    // NOTE(review): this assumes a run always includes a worker named "1". The
    // fully race-free fix is to reset the list once before any worker is started.
    lock (o)
    {
        if (threadOrder == 1 || fileList == null)
        {
            fileList = new List<Model.FileInfo>();
        }
    }

    long total = maxValue - minValue + 1;
    long step = total / threadNumber;
    long beginValue = minValue + step * (threadOrder - 1);
    long endValue = beginValue + step;
    if (threadOrder == threadNumber)
    {
        // The last worker picks up whatever the integer division left over.
        endValue += total % threadNumber;
    }

    for (long i = beginValue; i < endValue; i++)
    {
        // Address reported if this iteration fails. It starts as the page URL so a
        // failed page request is logged with the right address, not a stale one.
        string msgUrl = string.Empty;
        try
        {
            // Positions are 1-based; listIndex is 0-based.
            string url = $"http://www.46ek.com/view/{listIndex[Convert.ToInt32(i) - 1]}.html";
            msgUrl = url;
            string result = HTTPGeneral.HTTPCrawler(url, "GET");

            // Try each pattern in order and stop at the first that matches.
            // The static Regex.Match overload reuses the framework's Regex cache.
            string[] regexStrings = regexString.Split(',');
            Match match = null;
            for (int j = 0; j < regexStrings.Length; j++)
            {
                match = Regex.Match(result, regexStrings[j]);
                if (match.Success)
                {
                    break;
                }
            }

            if (match != null && !string.IsNullOrEmpty(match.Value))
            {
                // From here on a failure concerns the media link (e.g. a 404 from the size probe).
                msgUrl = match.Value;
                lock (o)
                {
                    fileList.Add(new Model.FileInfo()
                    {
                        FileID = i.ToString(),
                        FileName = match.Value.Substring(match.Value.LastIndexOf('/') + 1),
                        FileSize = GetFileSize(match.Value),
                        FileUrl = match.Value,
                        SynProgress = "0%",
                        Tag = "等待",
                        SynSpeed = "0KB",
                        DownPath = match.Value,
                        Async = true
                    });
                    Console.WriteLine(match.Value);
                    // NOTE(review): this sleep runs while holding the lock, so it
                    // stalls every other worker that wants to add a result. Kept
                    // as-is in case it is a deliberate global throttle — confirm.
                    Thread.Sleep(1000);
                }
            }
        }
        catch (Exception ex)
        {
            Log4net.log4netCreate(typeof(ThreadCrawler), log4netPath).Error($"爬取地址[{msgUrl}]访问错误,信息: " + ex.Message);
            Thread.Sleep(1000);
        }
    }

    TimeSpan timeSpan = DateTime.Now - beginTime;
    Console.WriteLine($"线程ID:{Thread.CurrentThread.Name} ----> Start: {beginValue} -- End:{endValue}, 耗时:{timeSpan.TotalMinutes}分钟。");

    // No Thread.CurrentThread.Abort() here: returning ends the thread, and
    // Thread.Abort throws PlatformNotSupportedException on .NET Core / .NET 5+.
}
/// <summary>
/// "Second-level crawl" menu handler: downloads the page selected in
/// <c>list_sonurl</c>, extracts the numeric ids of its "view" links, crawls them
/// with <c>threadNumber</c> worker threads and queues every file found for download.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void 二级爬取ToolStripMenuItem_Click(object sender, EventArgs e)
{
    if (list_sonurl.SelectedItem == null)
    {
        MessageBox.Show("请选择地址", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
        return;
    }

    // Request the selected page; a null response is treated as an empty page.
    string result = HTTPGeneral.HTTPCrawler(list_sonurl.SelectedItem.ToString(), "GET") ?? string.Empty;

    // Collect each distinct "view" link once.
    List<string> listUrl = new List<string>();
    var matches = Regex.Matches(result, @"<a href=""([^>]+?)"">([^<]+?)</a>", RegexOptions.Multiline);
    foreach (Match match in matches)
    {
        var matchHref = Regex.Match(match.Value, @"<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*>", RegexOptions.IgnoreCase).Groups["href"].Value;
        if (matchHref.Contains("view") && !listUrl.Contains(matchHref))
        {
            listUrl.Add(matchHref);
        }
    }

    // ".../view/22157.html" -> 22157. The id is everything between the last '/'
    // and the following '.', so ids of any length work (previously exactly five
    // characters were assumed, which threw on shorter ids and truncated longer ones).
    List<int> listIndex = new List<int>();
    foreach (var item in listUrl)
    {
        string fileName = item.Substring(item.LastIndexOf('/') + 1);
        int dot = fileName.IndexOf('.');
        string idText = dot >= 0 ? fileName.Substring(0, dot) : fileName;
        int id;
        if (int.TryParse(idText, out id))
        {
            listIndex.Add(id);
        }
    }

    if (listIndex.Count <= 0)
    {
        MessageBox.Show("没有爬取到信息数据", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
        // Nothing to crawl: stop instead of starting workers on an empty list.
        return;
    }

    // Worker count: must be a positive integer.
    int requestedThreads;
    if (!int.TryParse(txt_thread.Text.Trim(), out requestedThreads) || requestedThreads < 1)
    {
        MessageBox.Show("请填写正确的线程数", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
        return;
    }
    threadNumber = requestedThreads;

    // Crawl the videos: workers share positions 1..Count of listIndex and are named "1".."N".
    ThreadCrawler threadCrawler = new ThreadCrawler(1, listIndex.Count, threadNumber, listIndex);
    Thread[] threads = new Thread[threadNumber];
    for (int i = 1; i <= threadNumber; i++)
    {
        Thread worker = new Thread(new ThreadStart(threadCrawler.Crawlers));
        worker.SetApartmentState(ApartmentState.MTA);
        worker.Name = i.ToString();
        worker.IsBackground = true;
        threads[i - 1] = worker;
        worker.Start();
    }

    // The timer is switched off here (the original comment said "start timer").
    timerList.Enabled = false;

    // Block until every worker has finished.
    // NOTE(review): presumably this runs on the UI thread, so the window is
    // unresponsive while the crawl is in progress.
    for (int i = 0; i < threadNumber; i++)
    {
        threads[i].Join();
    }

    // Show each crawled file in the list view and queue it for download.
    foreach (var item in ThreadCrawler.fileList)
    {
        ListViewItem listViewItem = listView.Items.Add(
            new ListViewItem(
                new string[]
                {
                    (listView.Items.Count + 1).ToString(),
                    item.FileName,
                    item.FileSize,
                    "0",
                    "0%",
                    "0",
                    "0",
                    DateTime.Now.ToString(),
                    "等待中",
                    item.FileUrl
                }));
        downLoadFileGeneral.AddDown(item.FileUrl, txtDownPath.Text.Trim(), listViewItem.Index);
    }
}