// Entry point: crawl the JLU notice board (landing page plus archive pages
// 39 down to 1), print aggregate counters, then persist and analyze results.
public static void Main(string[] ar)
{
    string website_prefix = "https://www.jlu.edu.cn/index/tzgg";
    CrawlerController crawler = new CrawlerController();

    // The landing page carries no page-number suffix.
    crawler.Index(website_prefix + ".htm");

    // Archive pages are numbered; walk them from 39 down to 1,
    // matching the site's newest-to-oldest ordering.
    for (int page = 39; page > 0; page--)
    {
        string website = website_prefix + "/" + page.ToString() + ".htm";
        crawler.Index(website);
    }

    Console.WriteLine("The total amount of news: " + total_amount_news);
    Console.WriteLine("The total amount of department detected: " + total_department);

    // NOTE(review): presumably department_date maps department name -> date
    // string; the map's population is not visible here — confirm upstream.
    foreach (KeyValuePair<string, string> dep_date in department_date)
    {
        Console.WriteLine(dep_date.Key + dep_date.Value);
    }

    crawler.saveAsCSV();
    crawler.jieba_analysis();
}
// GET: Crawler
// Fetches one notice-list page, extracts every article link of the form
// "/info/NNNN/NNNNN.htm" from the <ul class="list fl"> block, converts each
// relative href to an absolute URL, and hands it to processURL.
//
// Fixes vs. the original:
//  - `html.Replace("\n", "")` discarded its result (C# strings are
//    immutable), so newlines were never actually stripped; assign it back.
//  - `IndexOf` results fed straight into `Substring` with no -1 guard,
//    which throws ArgumentOutOfRangeException on any page missing the
//    expected markup (e.g. an error page); bail out quietly instead.
//  - Non-generic ArrayList replaced with List<string>.
//  - Removed the dead commented-out novel-scraper code and the needless
//    `new CrawlerController()` (HttpGet is an instance method of this class).
public void Index(string website)
{
    string html = HttpGet(website, "");

    // Narrow the document down to the notice list; skip the page entirely
    // when the expected markup is absent rather than throwing.
    int startIndex = html.IndexOf(@"<ul class=""list fl"">");
    if (startIndex < 0)
    {
        return;
    }
    int endIndex = html.IndexOf("</ul>", startIndex);
    if (endIndex < 0)
    {
        return;
    }
    html = html.Substring(startIndex, endIndex - startIndex);

    // Strings are immutable — the result must be assigned back.
    html = html.Replace("\n", "");

    // Match anchors pointing at article pages, e.g.
    // <li id="lineu11_0"><a href="../info/1095/45890.htm">...</a><span>2019-06-24</span></li>
    Regex tmpreg = new Regex(@"<a href=""[\s\S]+?/info/\d{4,}/\d{5,}.htm"">", RegexOptions.Compiled);
    MatchCollection sMC = tmpreg.Matches(html);

    // Build the absolute URL for every match.
    List<string> urls = new List<string>();
    for (int i = 0; i < sMC.Count; i++)
    {
        string anchor = sMC[i].Groups[0].Value;
        int startInd = anchor.IndexOf("/info/");
        if (startInd < 0)
        {
            continue; // defensive: the regex guarantees "/info/" is present
        }
        // The last 2 characters of the match are the closing `">` — drop them.
        int len = anchor.Length - startInd - 2;
        string completedURL = "https://www.jlu.edu.cn" + anchor.Substring(startInd, len);
        urls.Add(completedURL);
    }

    // Scrape each discovered article page.
    foreach (string url in urls)
    {
        processURL(url);
    }
}