예제 #1
0
        /// <summary>
        /// Crawls the Baidu Tieba "linux" forum list pages and collects post links.
        /// </summary>
        /// <param name="maxPages">Number of list pages to crawl (each page offset is 50 posts).</param>
        /// <returns>De-duplicated list of post links found across all crawled pages.</returns>
        public List <ExampleMyHref> HrefCrawler(int maxPages)
        {
            var    hrefList    = new List <ExampleMyHref>();
            var    urlList     = new List <string>();
            string urlTemplate = "http://tieba.baidu.com/f?kw=linux&ie=utf-8&pn={0}";

            for (var i = 0; i < maxPages; i++)
            {
                urlList.Add(string.Format(urlTemplate, (i + 1) * 50));
            }

            var hrefCrawler = new SimpleCrawler();

            // Subscribe exactly once, BEFORE the loop. The previous version used "+="
            // inside the foreach, so every page re-added both handlers: by page N each
            // event fired N times, re-parsing results and duplicating error output.
            hrefCrawler.OnError     += (s, e) => { Response.Write($"爬虫抓取出现错误, 异常信息: {e.Exception.Message}"); };
            hrefCrawler.OnCompleted += (s, e) =>
            {
                // Extract anchors whose href points at a post path ("/p...") and whose
                // inner text does not contain an <img> tag.
                var links = Regex.Matches(e.PageSource,
                                          @"<a[^>]+href=""*(?<href>/p[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
                                          RegexOptions.IgnoreCase);
                foreach (Match match in links)
                {
                    var h = new ExampleMyHref
                    {
                        HrefTitle   = match.Groups["text"].Value,
                        HrefSrc     = "https://tieba.baidu.com" + match.Groups["href"].Value,
                        KeywordList = null
                    };
                    // NOTE(review): de-duplication relies on ExampleMyHref.Equals —
                    // confirm it implements value equality, otherwise Contains never matches.
                    if (!hrefList.Contains(h))
                    {
                        hrefList.Add(h);
                    }
                }
            };

            foreach (var url in urlList)
            {
                hrefCrawler.url = new Uri(url);
                hrefCrawler.start();
            }

            return(hrefList);
        }
예제 #2
0
        /// <summary>
        /// Crawls a single Baidu Tieba forum page (ECNU board) and writes every
        /// extracted hyperlink, plus timing/thread statistics, to the HTTP response.
        /// </summary>
        public void HrefCrawler()
        {
            string initurl =
                "http://tieba.baidu.com/f?kw=%E5%8D%8E%E4%B8%9C%E5%B8%88%E8%8C%83%E5%A4%A7%E5%AD%A6&ie=utf-8";

            var collected = new List <ExampleMyHref>();
            var report    = string.Empty;

            var hrefCrawler = new SimpleCrawler {
                url = new Uri(initurl)
            };

            Response.Write($"爬虫开始抓取地址: {hrefCrawler.url.ToString()} <br />");

            hrefCrawler.OnError += (s, e) =>
            {
                Response.Write($"爬虫抓取出现错误, 异常信息: {e.Exception.Message}");
            };

            hrefCrawler.OnCompleted += (s, e) =>
            {
                // Scrub the raw page source with a regex: keep only anchors whose
                // inner text does not contain an <img> tag.
                var links = Regex.Matches(
                    e.PageSource,
                    @"<a[^>]+href=""*(?<href>[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
                    RegexOptions.IgnoreCase);

                foreach (Match link in links)
                {
                    var entry = new ExampleMyHref
                    {
                        HrefTitle = link.Groups["text"].Value,
                        HrefSrc   = link.Groups["href"].Value
                    };

                    // Skip links already seen on this page.
                    if (collected.Contains(entry))
                    {
                        continue;
                    }

                    collected.Add(entry);
                    report += entry.HrefTitle + "|" + entry.HrefSrc + "<br />";
                }

                Response.Write("======================================<br />");
                Response.Write($"爬虫抓取任务完成!合计 {links.Count} 个超级链接。 <br />");
                Response.Write($"耗时: {e.Milliseconds} 毫秒<br />");
                Response.Write($"线程: {e.ThreadId} <br />");
                Response.Write(report);
                Response.Write("======================================<br />");
            };

            hrefCrawler.start();
        }