예제 #1
0
        // GET: Crawler
        /// <summary>
        /// Crawls a whole novel from 23us.so: downloads the index page, extracts the novel name
        /// and the chapter links, then downloads every chapter and hands its title and text to
        /// <c>Novel</c> together with the novel name and the output directory.
        /// </summary>
        public void Index()
        {
            CrawlerController cra  = new CrawlerController();
            string            html = cra.HttpGet("http://www.23us.so/files/article/html/13/13655/index.html", "");

            // Novel name: first comma-separated entry of the "keywords" meta tag.
            // A failed match yields an empty group value, so name becomes "".
            Match  nameMatch = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
            string name      = nameMatch.Groups[1].Value.Split(',')[0];

            // Chapter directory: the table carrying id="at". The id attribute was missing from
            // the pattern, so it could never match and no chapter was ever processed.
            // NOTE(review): id="at" is restored by analogy with id="contents" below — confirm against the live page.
            // [\s\S]*? matches across newlines without creating one capture per character.
            Regex  directoryRegex = new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">[\s\S]*?</table>");
            string directoryHtml  = directoryRegex.Match(html).Value;

            // Anchor tags inside the directory:
            //   group 0 = whole <a ...>...</a> tag, group 1 = href (chapter URL), group 2 = link text (chapter title).
            Regex           linkRegex    = new Regex("<a[^>]+?href=\"([^\"]+)\"[^>]*>([^<]+)</a>", RegexOptions.Compiled);
            MatchCollection chapterLinks = linkRegex.Matches(directoryHtml);

            // Chapter body: the <dd id="contents"> element. The pattern previously read
            // <dd contents">, which never matched the tag that is stripped further down.
            Regex contentRegex = new Regex(@"<dd id=""contents"">[\s\S]*?</dd>");

            // Output directory; loop-invariant, so computed once.
            string path = AppDomain.CurrentDomain.BaseDirectory.Replace("\\", "/") + "Txt/";

            foreach (Match chapterLink in chapterLinks)
            {
                string title       = chapterLink.Groups[2].Value;
                string chapterHtml = cra.HttpGet(chapterLink.Groups[1].Value, "");

                // Strip the wrapping tags and turn the HTML spacing/line breaks into plain text.
                // When the element is not found the content is simply empty.
                string content = contentRegex.Match(chapterHtml).Value
                    .Replace("<dd id=\"contents\">", "")
                    .Replace("</dd>", "")
                    .Replace("&nbsp;", "")
                    .Replace("<br />", "\r\n");

                // Emit the chapter (title on the first line, then the text).
                Novel(title + "\r\n" + content, name, path);
            }
        }
예제 #2
0
        /// <summary>
        /// Program entry point: creates a <c>CrawlerController</c> and runs its <c>Index</c> method.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            CrawlerController crawler = new CrawlerController();
            crawler.Index();
        }