/// <summary>
        /// Fetches a listing page and records one <c>DetailedInfo</c> entry per
        /// list item found on it (the original comment calls these "coach" entries).
        /// </summary>
        /// <param name="urlAddress">Address of the page to crawl.</param>
        private static void GetDetail(string urlAddress)
        {
            try
            {
                string pageHtml = HTTPGeneral.HTTPCrawler(urlAddress, "GET");

                // First <ul> element whose class attribute value starts with "list".
                Match listBlock = Regex.Match(pageHtml, "(?i)<ul[^>]+class[=\"\'\\s]+list[\"\']?[^>]*>(?:(?!<\\/ul>)[\\s\\S])+<\\/ul>");

                // Every <li> element inside that list.
                MatchCollection entries = Regex.Matches(listBlock.Value, "(?i)<li[^>]*>(?:(?!<\\/li>)[\\s\\S])+<\\/li>");

                int position = 0;
                foreach (Match entry in entries)
                {
                    position++;
                    string entryHtml = entry.Value;

                    // Pull the link target, image source and alt text out of the item markup.
                    string href = Regex.Match(entryHtml, string.Format(regexA, "href"), RegexOptions.IgnoreCase).Groups["href"].Value;
                    string src  = Regex.Match(entryHtml, string.Format(regex, "src"), RegexOptions.IgnoreCase).Groups["src"].Value;
                    string alt  = Regex.Match(entryHtml, string.Format(regex, "alt"), RegexOptions.IgnoreCase).Groups["alt"].Value;

                    // Assemble the record; Index is 1-based.
                    var info = new DetailedInfo();
                    info.Index          = position;
                    info.Name           = alt;
                    info.CommentAddress = $"{urlCommon}{href}";
                    info.ImgAddress     = $"{urlCommon}{src}";
                    listDetailedInfo.Add(info);
                }
            }
            catch (Exception ex)
            {
                // Any failure (request or parsing) is reported to the user in a dialog.
                MessageBox.Show(ex.Message, "错误提示", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
Exemple #2
0
        // "Crawl" menu handler: requests the selected first-level page, finds its
        // pagination links ("list" hrefs containing "_<page>.") and fills list_sonurl
        // with one URL per page from 1 up to the highest page number found.
        private void 爬取ToolStripMenuItem_Click(object sender, EventArgs e)
        {
            if (list_url.SelectedItem == null)
            {
                MessageBox.Show("请选择地址", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
                return;
            }

            // Request the selected first-level page and collect its anchor tags.
            string result  = HTTPGeneral.HTTPCrawler(list_url.SelectedItem.ToString(), "GET");
            var    matches = Regex.Matches(result, @"<a href=""([^>]+?)"">([^<]+?)</a>", RegexOptions.Multiline);

            List <string> listUrl   = new List <string>();
            string        prefixUrl = string.Empty;
            int           maxPage   = 0;

            foreach (Match match in matches)
            {
                string matchHref = Regex.Match(match.Value, @"<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*>", RegexOptions.IgnoreCase).Groups["href"].Value;

                // Only pagination links are of interest: "list" hrefs that carry a "_<page>" suffix.
                if (!matchHref.Contains("list") || !matchHref.Contains("_"))
                {
                    continue;
                }

                string pageUrl = $"http://www.46ek.com{matchHref}";
                if (listUrl.Contains(pageUrl))
                {
                    continue;
                }
                listUrl.Add(pageUrl);

                // The page number sits between the first '_' and the following '.'.
                // Links that do not follow that shape are skipped instead of throwing.
                string[] parts = pageUrl.Split('_');
                int      dot   = parts[1].IndexOf('.');
                int      page;
                if (dot < 0 || !int.TryParse(parts[1].Substring(0, dot), out page))
                {
                    continue;
                }

                prefixUrl = parts[0];
                if (page > maxPage)
                {
                    maxPage = page;
                }
            }

            list_sonurl.Enabled = true;
            list_sonurl.Items.Clear();

            // Previously an empty result crashed on newlistUrl[Count - 1]; tell the user instead.
            if (maxPage < 1)
            {
                MessageBox.Show("没有爬取到信息数据", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
                return;
            }

            // Page 1 has no "_<n>" suffix; every later page does.
            list_sonurl.Items.Add($"{prefixUrl}.html");
            for (int i = 2; i <= maxPage; i++)
            {
                list_sonurl.Items.Add($"{prefixUrl}_{i}.html");
            }
        }
Exemple #3
0
        /// <summary>
        /// Worker-thread body: crawls this thread's share of the id range
        /// [minValue, maxValue] and appends every matched link to <c>fileList</c>.
        /// </summary>
        /// <remarks>
        /// The current thread's <c>Name</c> is read as its 1-based order; the range is
        /// split evenly across <c>threadNumber</c> threads and the last thread also
        /// takes the remainder.
        /// </remarks>
        public void Crawler()
        {
            DateTime beginTime   = DateTime.Now;
            int      threadOrder = Convert.ToInt32(Thread.CurrentThread.Name);
            long     step        = (maxValue - minValue + 1) / threadNumber;
            long     beginValue  = minValue + step * (threadOrder - 1);
            long     endValue    = beginValue + step; // exclusive upper bound

            if (threadOrder == threadNumber)
            {
                // Last thread picks up the ids left over by the integer division.
                endValue = beginValue + step + ((maxValue - minValue + 1) % threadNumber);
            }

            // Built lazily inside the try so an invalid pattern is still handled per
            // iteration as before, but a valid pattern is only parsed once.
            Regex regex = null;

            for (long i = beginValue; i < endValue; i++)
            {
                try
                {
                    string url = $"http://www.46ek.com/view/{i}.html";
                    if (regex == null)
                    {
                        regex = new Regex(regexString);
                    }
                    string result = HTTPGeneral.HTTPCrawler(url, "GET");
                    Match  m      = regex.Match(result);
                    if (!string.IsNullOrEmpty(m.Value))
                    {
                        lock (o)
                        {
                            fileList.Add(new Model.FileInfo()
                            {
                                FileID      = i.ToString(), // the crawled page id
                                FileName    = m.Value.Substring(m.Value.LastIndexOf('/') + 1),
                                FileSize    = GetFileSize(m.Value),
                                FileUrl     = m.Value,
                                SynProgress = "0%",
                                Tag         = "等待",
                                SynSpeed    = "0KB",
                                DownPath    = m.Value,
                                Async       = true
                            });
                            Console.WriteLine(m.Value);
                        }
                    }
                }
                // Best effort: a failed id is skipped after a short pause.
                catch (Exception) { Thread.Sleep(1000); continue; }
            }
            DateTime endTime  = DateTime.Now;
            TimeSpan timeSpan = endTime - beginTime;
            string   message  = "线程ID:" + Thread.CurrentThread.Name + " ----> Start: " + beginValue.ToString() + " -- End:" + endValue.ToString() + ", 耗时:" + timeSpan.TotalMinutes.ToString() + "分钟。";

            Console.WriteLine(message);
            // No Thread.Abort() here: returning from the method ends the work, and
            // Abort throws PlatformNotSupportedException on .NET Core / .NET 5+.
        }
Exemple #4
0
        // "Start crawling" button handler: requests the address typed into txt_Url,
        // extracts every distinct "list" link from the page and shows them, sorted,
        // in list_url.
        private void But_Crawler_Click(object sender, EventArgs e)
        {
            /*
             * Addresses:
             * http://www.46ek.com
             * http://www.46ek.com/list/2.html
             * http://www.46ek.com/view/22157.html
             * Regex:
             * http://m4.26ts.com/[.0-9-a-zA-Z]*.mp4
             */

            string address = txt_Url.Text.Trim();
            if (string.IsNullOrEmpty(address))
            {
                MessageBox.Show("请填写地址", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
                // Stop here; previously execution fell through and requested an empty URL.
                return;
            }

            // Request the page.
            string result = HTTPGeneral.HTTPCrawler(address, "GET");
            // Distinct list URLs extracted from the page.
            List <string> listUrl = new List <string>();
            // Anchor tags of the page.
            var matches = Regex.Matches(result, @"<a href=""([^>]+?)"">([^<]+?)</a>", RegexOptions.Multiline);

            foreach (Match match in matches)
            {
                var matchHref = Regex.Match(match.Value, @"<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*>", RegexOptions.IgnoreCase).Groups["href"].Value.ToString();
                // Keep only "list" links, each one once.
                if (matchHref.Contains("list") && !listUrl.Contains($"http://www.46ek.com{matchHref}"))
                {
                    listUrl.Add($"http://www.46ek.com{matchHref}");
                }
            }

            listUrl.Sort();
            list_url.Items.Clear();
            foreach (var item in listUrl)
            {
                list_url.Items.Add(item);
            }

            // Enable the list control now that it has been filled.
            list_url.Enabled = true;
        }
Exemple #5
0
        /// <summary>
        /// Worker-thread body: crawls this thread's share of <c>listIndex</c>
        /// (positions minValue..maxValue, 1-based) and appends every matched link
        /// to <c>fileList</c>.
        /// </summary>
        /// <remarks>
        /// The current thread's <c>Name</c> is read as its 1-based order; the range is
        /// split evenly across <c>threadNumber</c> threads and the last thread also
        /// takes the remainder. <c>regexString</c> is treated as a comma-separated
        /// list of patterns; the first one that matches the page is used.
        /// </remarks>
        public void Crawlers()
        {
            // NOTE(review): every worker thread replaces fileList here. If fileList is
            // shared between workers, a thread that starts late could discard entries
            // already added by another. The reset probably belongs in the code that
            // starts the threads; it is kept here so results do not pile up across runs.
            fileList = new List <Model.FileInfo>();
            DateTime beginTime   = DateTime.Now;
            int      threadOrder = Convert.ToInt32(Thread.CurrentThread.Name); // 1-based thread order
            long     step        = (maxValue - minValue + 1) / threadNumber;

            long beginValue = minValue + step * (threadOrder - 1);
            long endValue   = beginValue + step; // exclusive upper bound

            if (threadOrder == threadNumber)
            {
                // Last thread picks up the positions left over by the integer division.
                endValue = beginValue + step + ((maxValue - minValue + 1) % threadNumber);
            }

            for (long i = beginValue; i < endValue; i++)
            {
                // Address reported if this iteration fails. Scoped per iteration so a
                // failure never logs the link of an earlier page.
                string msgUrl = string.Empty;
                try
                {
                    string url = $"http://www.46ek.com/view/{listIndex[Convert.ToInt32(i) - 1]}.html";
                    msgUrl = url;
                    string result = HTTPGeneral.HTTPCrawler(url, "GET");

                    // Try each configured pattern in order and stop at the first hit.
                    string[] regexStrings = regexString.Split(',');
                    Match    match        = null;
                    for (int j = 0; j < regexStrings.Length; j++)
                    {
                        match = new Regex(regexStrings[j]).Match(result);
                        if (match.Success)
                        {
                            break;
                        }
                    }

                    if (match != null && !string.IsNullOrEmpty(match.Value))
                    {
                        msgUrl = match.Value; // from here on a failure concerns the matched link
                        lock (o)
                        {
                            fileList.Add(new Model.FileInfo()
                            {
                                FileID      = i.ToString(), // 1-based position in listIndex
                                FileName    = match.Value.Substring(match.Value.LastIndexOf('/') + 1),
                                FileSize    = GetFileSize(match.Value),
                                FileUrl     = match.Value,
                                SynProgress = "0%",
                                Tag         = "等待",
                                SynSpeed    = "0KB",
                                DownPath    = match.Value,
                                Async       = true
                            });
                            Console.WriteLine(match.Value);
                            // NOTE(review): sleeping while holding the lock stalls every other
                            // worker for a second per hit — confirm this throttle is intended.
                            Thread.Sleep(1000);
                        }
                    }
                }
                catch (Exception ex)
                {
                    Log4net.log4netCreate(typeof(ThreadCrawler), log4netPath).Error($"爬取地址[{msgUrl}]访问错误,信息: " + ex.Message);
                    Thread.Sleep(1000); continue;
                }
            }
            DateTime endTime  = DateTime.Now;
            TimeSpan timeSpan = endTime - beginTime;
            string   message  = "线程ID:" + Thread.CurrentThread.Name + " ----> Start: " + beginValue.ToString() + " -- End:" + endValue.ToString() + ", 耗时:" + timeSpan.TotalMinutes.ToString() + "分钟。";

            Console.WriteLine(message);
            // No Thread.Abort() here: returning from the method ends the work, and
            // Abort throws PlatformNotSupportedException on .NET Core / .NET 5+.
        }
Exemple #6
0
        // "Second-level crawl" menu handler: requests the selected page from
        // list_sonurl, collects the numeric ids of its "view" links, crawls them on
        // worker threads and queues every file found for download.
        private void 二级爬取ToolStripMenuItem_Click(object sender, EventArgs e)
        {
            if (list_sonurl.SelectedItem == null)
            {
                MessageBox.Show("请选择地址", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
                return;
            }

            // Request the selected second-level page.
            string result = HTTPGeneral.HTTPCrawler(list_sonurl.SelectedItem.ToString(), "GET");

            List <string> listUrl = new List <string>();
            var           matches = Regex.Matches(result, @"<a href=""([^>]+?)"">([^<]+?)</a>", RegexOptions.Multiline);

            foreach (Match match in matches)
            {
                // Keep each distinct "view" link once.
                var matchHref = Regex.Match(match.Value, @"<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*>", RegexOptions.IgnoreCase).Groups["href"].Value.ToString();
                if (matchHref.Contains("view") && !listUrl.Contains(matchHref))
                {
                    listUrl.Add(matchHref);
                }
            }

            List <int> listIndex = new List <int>();

            foreach (var item in listUrl)
            {
                // The id is the file name without its extension (".../view/22157.html" -> 22157).
                // Ids of any length are accepted and non-numeric links are skipped; the
                // previous fixed 5-character Substring threw or truncated on other lengths.
                string fileName = item.Substring(item.LastIndexOf('/') + 1);
                int    dot      = fileName.IndexOf('.');
                string idText   = dot >= 0 ? fileName.Substring(0, dot) : fileName;
                int    id;
                if (int.TryParse(idText, out id))
                {
                    listIndex.Add(id);
                }
            }

            // Nothing to crawl: report it and stop (previously execution continued).
            if (listIndex.Count <= 0)
            {
                MessageBox.Show("没有爬取到信息数据", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
                return;
            }

            // Number of worker threads; must be a positive integer because the
            // workers divide the id range by it.
            int requestedThreads;
            if (!int.TryParse(txt_thread.Text.Trim(), out requestedThreads) || requestedThreads <= 0)
            {
                MessageBox.Show("请输入有效的线程数", "提示", MessageBoxButtons.OK, MessageBoxIcon.Asterisk);
                return;
            }
            threadNumber = requestedThreads;

            ThreadCrawler threadCrawler = new ThreadCrawler(1, listIndex.Count, threadNumber, listIndex);

            Thread[] threads = new Thread[threadNumber];
            for (int i = 1; i <= threadNumber; i++)
            {
                threads[i - 1] = new Thread(new ThreadStart(threadCrawler.Crawlers));
                threads[i - 1].SetApartmentState(ApartmentState.MTA);
                threads[i - 1].Name         = i.ToString(); // Crawlers reads the name as its 1-based order
                threads[i - 1].IsBackground = true;
                threads[i - 1].Start();
            }

            // NOTE(review): the original comment said "start the timer", but this
            // line disables timerList — confirm which one is intended.
            timerList.Enabled = false;
            // Block the calling thread until every worker has finished.
            for (int i = 0; i < threadNumber; i++)
            {
                threads[i].Join();
            }
            // Show each crawled file in the list view and queue it for download.
            foreach (var item in ThreadCrawler.fileList)
            {
                ListViewItem listViewItem = listView.Items.Add(
                    new ListViewItem(
                        new string[] { (listView.Items.Count + 1).ToString(), item.FileName, item.FileSize, "0", "0%", "0", "0", DateTime.Now.ToString(), "等待中", item.FileUrl }));
                downLoadFileGeneral.AddDown(item.FileUrl, txtDownPath.Text.Trim(), listViewItem.Index);
            }
        }