Example #1
        public static void Main(string[] ar)
        {
            string            website_prefix = "https://www.jlu.edu.cn/index/tzgg";
            CrawlerController c = new CrawlerController();

            c.Index(website_prefix + ".htm");   // first page of the notice list
            // the remaining archive pages are numbered 39 down to 1
            for (int i = 39; i > 0; i--)
            {
                string website = website_prefix + "/" + i.ToString() + ".htm";
                //Console.WriteLine(website);
                c.Index(website);
            }
            Console.WriteLine("The total amount of news: " + total_amount_news);
            Console.WriteLine("The total amount of department detected: " + total_department);
            // print each department together with the date recorded for it
            foreach (KeyValuePair<string, string> dep_date in department_date)
            {
                Console.WriteLine(dep_date.Key + dep_date.Value);
            }
            c.saveAsCSV();        // export the collected data to CSV
            c.jieba_analysis();   // run jieba word-segmentation analysis on the collected text
        }
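
Main also reads total_amount_news, total_department and department_date, which are not part of this example. The sketch below shows one way such members might be declared; the names are taken from the call site, but the class name Program, the static placement and the Dictionary<string, string> type are assumptions inferred from the foreach over KeyValuePair<string, string>, not the example's actual code.

        using System.Collections.Generic;

        public partial class Program
        {
            // Hypothetical scaffolding; these members are not shown in the example itself.
            public static int total_amount_news = 0;   // number of news items crawled
            public static int total_department  = 0;   // number of distinct departments detected

            // Department name -> most recent publication date seen for that department,
            // presumably filled in while each news page is processed.
            public static Dictionary<string, string> department_date = new Dictionary<string, string>();
        }
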
Example #2
        // GET: Crawler
        public void Index(string website)
        {
            //crawl an entire novel
            CrawlerController cra = new CrawlerController();// Dingdian: crawl novels from the novel site
            //string html = cra.HttpGet("http://www.23us.so/files/article/html/13/13655/index.html", "");
            //string html = cra.HttpGet("http://www.baidu.com", "");
            //string html = cra.HttpGet("https://www.jlu.edu.cn/index/tzgg.htm", "");
            string html = cra.HttpGet(website, "");
            //Console.WriteLine(html);

            // get the novel title
            //Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)""/>");
            //string name = ma_name.Groups[1].Value.ToString().Split(',')[0];

            /*
             * // get the chapter index
             * Regex reg_mulu = new Regex(@"<table cellspacing=""1"" cellpadding=""0"" bgcolor=""#E4E4E4"" at"">(.|\n)*?</table>");
             * var mat_mulu = reg_mulu.Match(html);
             * string mulu = mat_mulu.Groups[0].ToString();
             */
            //cut out the <ul class="list fl"> block that contains the news list
            int startIndex = html.IndexOf(@"<ul class=""list fl"">");
            int endIndex   = html.IndexOf("</ul>", startIndex);

            html = html.Substring(startIndex, endIndex - startIndex);


            // match the <a> tags whose href points at an /info/.../....htm news page
            html = html.Replace("\n", "");   // Replace returns a new string, so assign the result back
            Regex           tmpreg = new Regex(@"<a href=""[\s\S]+?/info/\d{4,}/\d{5,}.htm"">", RegexOptions.Compiled);
            MatchCollection sMC    = tmpreg.Matches(html);
            //<li id="lineu11_0">< a href = "../info/1095/45890.htm" > 长春吉大附中实验学校(高中)公开招聘教师的启事 </ a >< span > 2019 - 06 - 24 </ span ></ li >


            //extract the URLs
            List<string> URLS = new List<string>();

            for (int i = 0; i < sMC.Count; i++)
            {
                //Console.WriteLine(sMC[i].Groups[0].Value);
                // keep the relative path starting at "/info/"; the last two characters of the match (the closing quote and bracket) are dropped
                int    startInd     = sMC[i].Groups[0].Value.IndexOf("/info/");
                int    len          = sMC[i].Groups[0].Value.Length - startInd - 2;
                String subURL       = sMC[i].Groups[0].Value.Substring(startInd, len);
                String prefix       = "https://www.jlu.edu.cn";
                String completedURL = prefix + subURL;
                URLS.Add(completedURL);
            }
            //process every URL in turn
            foreach (String i in URLS)
            {
                //Console.WriteLine(i);
                processURL(i);
            }


            //save the page content

            //FileStream fs = new FileStream(@"\sample\sample.txt", FileMode.Create, FileAccess.Write);
            //StreamWriter sr = new StreamWriter(fs);
            //sr.WriteLine(html);// start writing the value

            /*
             * if (sMC.Count != 0)
             * {
             *  // loop over the chapter URLs and fetch the body text
             *  for (int i = 0; i < sMC.Count; i++)
             *  {
             *      //sMC[i].Groups[1].Value
             *      //group 0 is <a href="http://www.23us.so/files/article/html/13/13655/5638725.html">Chapter 1: The Summit of Mount Tai</a>
             *      //group 1 is http://www.23us.so/files/article/html/13/13655/5638725.html
             *      //group 2 is Chapter 1: The Summit of Mount Tai
             *
             *      // get the chapter title
             *      string title = sMC[i].Groups[2].Value;
             *
             *      // get the article content
             *      //string html_z = cra.HttpGet(sMC[i].Groups[1].Value, "");
             *
             *      // get the novel title; it can also be found on the chapter page
             *      //Match ma_name = Regex.Match(html, @"<meta name=""keywords"".+content=""(.+)"" />");
             *      //string name = ma_name.Groups[1].Value.ToString().Split(',')[0];
             *
             *      // get the title; the chapter title can also be obtained by parsing the <h1> tag
             *      //string title = html_z.Replace("<h1>", "*").Replace("</h1>", "*").Split('*')[1];
             *
             *      // get the body text
             *      Regex reg = new Regex(@"<dd id=""contents"">(.|\n)*?</dd>");
             *      MatchCollection mc = reg.Matches(html_z);
             *      var mat = reg.Match(html_z);
             *      string content = mat.Groups[0].ToString().Replace("<dd id=\"contents\">", "").Replace("</dd>", "").Replace("&nbsp;", "").Replace("<br />", "\r\n");
             *      // write out as a .txt file
             *      string path = Directory.GetCurrentDirectory()+"\\";
             *      Novel(title + "\r\n" + content, name, path);
             *  }
             * }
             */
        }
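
Both examples rely on CrawlerController.HttpGet(url, "") and on processURL, neither of which is shown here. As a rough illustration only, the following is a minimal sketch of what an HttpGet helper with this shape could look like; the second parameter is assumed to be an optional Cookie header and the pages are assumed to be UTF-8, neither of which is confirmed by the examples.

        using System.IO;
        using System.Net;
        using System.Text;

        public partial class CrawlerController
        {
            // Hypothetical HttpGet: download a page and return its HTML as a string.
            // The cookie parameter is treated as optional; pass "" to send the request without one.
            public string HttpGet(string url, string cookie)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Method    = "GET";
                request.UserAgent = "Mozilla/5.0";   // some servers reject requests without a User-Agent
                if (!string.IsNullOrEmpty(cookie))
                {
                    request.Headers.Add("Cookie", cookie);
                }

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream stream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))   // assumes UTF-8 pages
                {
                    return reader.ReadToEnd();
                }
            }
        }
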