// GET: Crawler
/// <summary>
/// Crawls one complete novel: downloads the hard-coded index page, extracts the novel
/// name and the chapter links, then downloads each chapter and passes its title and
/// text to <c>Novel</c> together with the novel name and the output directory.
/// </summary>
public void Index()
{
    CrawlerController cra = new CrawlerController();

    // Index (table of contents) page of the novel on the 23us.so site.
    string html = cra.HttpGet("http://www.23us.so/files/article/html/13/13655/index.html", "");

    // Novel name: the first comma-separated entry of the "keywords" meta tag.
    Match nameMatch = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
    string name = nameMatch.Groups[1].Value.Split(',')[0];

    // Chapter directory: the table holding the chapter links.
    // The attribute was previously written as ` at">`, which is not valid markup; it is
    // restored to id="at" here. NOTE(review): confirm against the live page source.
    // Singleline lets '.' match newlines, replacing the slower (.|\n) alternation.
    Regex directoryRegex = new Regex(
        @"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" id=""at"">.*?</table>",
        RegexOptions.Singleline);
    string directoryHtml = directoryRegex.Match(html).Value;

    // Anchor tags inside the directory:
    //   group 0 = the whole tag, e.g. <a href="http://www.23us.so/files/article/html/13/13655/5638725.html">...</a>
    //   group 1 = the href URL
    //   group 2 = the link text (chapter title)
    Regex linkRegex = new Regex("<a[^>]+?href=\"([^\"]+)\"[^>]*>([^<]+)</a>", RegexOptions.Compiled);

    // Chapter body container. The pattern must agree with the "<dd id=\"contents\">"
    // prefix stripped below; the earlier pattern lacked `id=` and so could not match it.
    Regex contentRegex = new Regex(@"<dd id=""contents"">.*?</dd>", RegexOptions.Singleline);

    // Output directory, relative to the application base directory (loop-invariant).
    string path = AppDomain.CurrentDomain.BaseDirectory.Replace("\\", "/") + "Txt/";

    // Walk the chapter links in document order and fetch each chapter page.
    foreach (Match link in linkRegex.Matches(directoryHtml))
    {
        string title = link.Groups[2].Value;
        string chapterHtml = cra.HttpGet(link.Groups[1].Value, "");

        // Convert <br /> to line breaks BEFORE stripping spaces: removing spaces first
        // turns "<br />" into "<br/>", after which the line-break replacement never matches.
        // NOTE(review): the stripped " " may originally have been "&nbsp;" - confirm.
        string content = contentRegex.Match(chapterHtml).Value
            .Replace("<dd id=\"contents\">", "")
            .Replace("</dd>", "")
            .Replace("<br />", "\r\n")
            .Replace(" ", "");

        // Text output: chapter title on the first line, followed by the chapter body.
        Novel(title + "\r\n" + content, name, path);
    }
}
/// <summary>
/// Program entry point: creates a <c>CrawlerController</c> and runs its <c>Index</c> method.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    new CrawlerController().Index();
}