/// <summary>
/// For every book returned by <c>DataService.GetBooklist()</c>, loads the book's
/// question page and stores one category row per link found on it.
/// </summary>
public void InitCategory()
{
    foreach (var book in DataService.GetBooklist())
    {
        var pageUrl = $"https://www.zujuan.com/question?bookversion={book.BookVersionId}&chid={book.Child}&xd={book.Degree}";
        var pageHtml = new HttpUnitHelper().GetRealHtmlTrice(pageUrl);
        var page = NSoupClient.Parse(pageHtml);

        // Links inside the second "con-items" group of the "search-type" panel.
        var filterGroups = page.Select("div.search-type div.con-items");
        var categoryLinks = filterGroups[1].GetElementsByTag("a");

        // Number shown in the page's "div.total b" element; the same value is
        // passed along with every category of this book.
        var totalText = page.Select("div.total b")[0].Text();
        var total = totalText.NullToInt();

        foreach (var link in categoryLinks)
        {
            // data-bcaid is presumably the category id — confirm against DataService.AddCategory.
            var rawId = link.Attr("data-bcaid");
            var label = link.Text();
            DataService.AddCategory(book.Id, rawId.NullToInt(), label, total);
        }
    }
    // Example page: https://www.zujuan.com/question?chid=2&xd=1
}
/// <summary>
/// Walks every (child, degree) combination taken from <c>ChildEnum</c> and
/// <c>DegreeEnum</c>, loads the matching question page and stores one book row
/// per link found in the first "con-items" group of that page.
/// </summary>
/// <remarks>
/// A pair of nested <c>Parallel.ForEach</c> loops with an empty body used to run
/// before the real work; it produced nothing and has been removed.
/// </remarks>
public void InitBook()
{
    // Looked up once instead of once per outer iteration.
    var children = typeof(ChildEnum).GetEnumSource();
    var degrees = typeof(DegreeEnum).GetEnumSource();

    foreach (var child in children)
    {
        var childId = child.Item1.NullToInt();

        foreach (var degree in degrees)
        {
            var degreeId = degree.Item1.NullToInt();

            var url = $"https://www.zujuan.com/question?chid={child.Item1}&xd={degree.Item1}";
            var html = new HttpUnitHelper().GetRealHtmlTrice(url);
            var doc = NSoupClient.Parse(html);

            // Get the textbook versions for the subject under the current degree:
            // they are the links of the first "con-items" group.
            var bookTypeDoc = doc.Select("div.search-type div.con-items")[0].GetElementsByTag("a");

            foreach (var element in bookTypeDoc)
            {
                var elementId = element.Attr("data-bcaid");
                var name = element.Text();
                DataService.AddBook(childId, degreeId, name, elementId.NullToInt());
            }
        }
    }
}
/// <summary>
/// Downloads the detail page of one question and extracts its answer material.
/// </summary>
/// <param name="questionId">
/// Id substituted into <c>detail-{id}.shtml</c> (a sample id used during
/// development was "8630746").
/// </param>
/// <returns>
/// A <c>JObject</c> with the keys "kaodian" and "jiexi" (first and second child of
/// the first "analyticbox-brick" element). When the first "exam-con" element
/// contains an "exam-qlist", an "answer_list" array of
/// <c>{ "question", "answer" }</c> objects is added as well.
/// </returns>
public JObject ParseAnswer(string questionId)
{
    var answer = new JObject();
    var url = string.Format("https://www.zujuan.com/question/detail-{0}.shtml", questionId);
    var html = new HttpUnitHelper().GetRealHtmlOnce(url);

    // One parse serves both the "is there a sub-question list?" check and the
    // extraction; previously the page was parsed once for each.
    var subQuestionLists = NSoupClient.Parse(html)
        .GetElementById("J_QuestionList")
        .GetElementsByClass("exam-con")[0]
        .GetElementsByClass("exam-qlist");

    if (subQuestionLists.Count > 0)
    {
        var qlist = subQuestionLists[0];
        var exam_cons = qlist.GetElementsByClass("exam-con");
        var exam_cons_answers = qlist.Select("div.analyticbox.replace_anawer");

        var answer_list = new JArray();
        for (int i = 0; i < exam_cons.Count; i++)
        {
            // Answers are matched to sub-questions by position.
            var result = new JObject();
            result["question"] = exam_cons[i].GetElementsByClass("analyticbox")[0].ProcessHtmlImageElement().Html();
            result["answer"] = exam_cons_answers[i].ProcessHtmlImageElement().Html();
            answer_list.Add(result);
        }
        answer["answer_list"] = answer_list;
    }

    // Read from its own parse of the page, as before, so that anything
    // ProcessHtmlImageElement changed above cannot show up here.
    // NOTE(review): if ProcessHtmlImageElement does not mutate the document,
    // a single shared parse would be enough.
    var analyticbox_brick = NSoupClient.Parse(html)
        .GetElementById("J_QuestionList")
        .GetElementsByClass("analyticbox-brick")[0];

    answer["kaodian"] = analyticbox_brick.Children[0].ProcessHtmlImageElement().Html();
    answer["jiexi"] = analyticbox_brick.Children[1].ProcessHtmlImageElement().Html();
    return answer;
}
/// <summary>
/// Whole-site crawl: when the complete data set is traversed again, the per-page
/// URLs under each category have to be regenerated.
/// </summary>
/// <remarks>
/// Categories already listed by <c>DataService.GetCrawleredCatelist()</c> are skipped.
/// For every other category the total is read from the <c>question/list</c> JSON
/// endpoint, and one row per result page (page URL, API URL and the API's JSON
/// response) is handed to <c>DataService.AddCateUrl</c>.
/// An older approach that scraped the total from the page's "div.total b"
/// element was abandoned in favour of the JSON endpoint.
/// </remarks>
public void InitCatePageUrl()
{
    // Matches the "per-page=10" parameter of the generated page URLs.
    const int pageSize = 10;

    var listCategory = DataService.GetCategorylist();
    var hasCateID = DataService.GetCrawleredCatelist();

    // One generator for the whole run instead of a new one per category.
    var random = new Random();

    // Example: https://www.zujuan.com/question?categories=47854&bookversion=47832&nianji=47854&chid=3&xd=1
    foreach (var category in listCategory)
    {
        if (hasCateID.Any(t => t == category.CategoryId))
        {
            continue;
        }

        var sw = Stopwatch.StartNew();

        var api = $"https://www.zujuan.com/question/list?categories={category.CategoryId}&sortField=time&page=1&_=1540841532659";
        var url = $"https://www.zujuan.com/question?categories={category.CategoryId}&bookversion={category.BookVersionId}&nianji={category.CategoryId}&chid={category.Child}&xd={category.Degree}";

        // The page is still requested before the cookies are read; its HTML is
        // not used any further.
        // NOTE(review): presumably this request is what populates the cookies
        // sent to the API below — confirm before removing it.
        new HttpUnitHelper().GetRealHtmlOnceNotWaitJs(url);

        // Value for the "_" query parameter of the per-page API calls.
        long randomId = 1540840000000 + random.Next(0, 9999999);

        // TODO: filter by question type
        var cookies = new HttpUnitHelper().webClient.GetCookies(new URL("https://www.zujuan.com/"));
        var headerCookie = string.Empty;
        foreach (var cookie in cookies)
        {
            headerCookie += $"{cookie.Name}={cookie.Value};";
        }

        var json = HttpClientHolder.Execute(api, headerCookie);
        var total = JObject.Parse(json);
        var totalCount = total["total"].NullToInt();
        if (totalCount <= 0)
        {
            continue;
        }

        // Ceiling division: 15 questions -> 2 pages, 20 questions -> 2 pages.
        var pageCount = (totalCount + pageSize - 1) / pageSize;

        // Parallel.For's upper bound is exclusive, so pageCount + 1 is needed to
        // include the last page (it used to be skipped whenever it was partial).
        Parallel.For(1, pageCount + 1, (i) =>
        {
            var currentApi = $"https://www.zujuan.com/question/list?categories={category.CategoryId}&sortField=time&page={i}&_={randomId}";
            var currentJson = HttpClientHolder.Execute(currentApi, headerCookie);

            // Page 1 is the plain category URL; later pages use the index URL.
            var grabUrl = i > 1
                ? $"https://www.zujuan.com/question/index?categories={category.CategoryId}&bookversion={category.BookVersionId}&nianji=7741&chid={category.Child}&xd={category.Degree}&page={i}&per-page=10"
                : url;

            // Fire-and-forget: the result is never collected with EndInvoke, so
            // this method neither waits for the write nor sees its failures.
            Action saveRow = () =>
            {
                DataService.AddCateUrl(grabUrl, category.CategoryId, i, currentApi, currentJson);
            };
            saveRow.BeginInvoke(null, null);
        });

        sw.Stop();
        Debug.WriteLine("cost" + sw.ElapsedMilliseconds);
    }
}
/// <summary>
/// Downloads the detail page of one question and extracts its title and answer
/// material.
/// </summary>
/// <param name="questionId">
/// Id substituted into <c>detail-{id}.shtml</c> (a sample id used during
/// development was "8630746").
/// </param>
/// <returns>
/// A <c>JObject</c> that always has "question_title", "kaodian" and "jiexi".
/// When the first "exam-con" element contains an "exam-qlist", an "answer_list"
/// array of <c>{ "question", "answer" }</c> objects is present and "jiexi" is the
/// second child of the first "analyticbox-brick" element. Otherwise "daan" is
/// the second child and "jiexi" the third.
/// </returns>
/// <remarks>
/// A login-status check on "div.analyticbox-tips" existed here once but was
/// disabled; no such check is performed.
/// </remarks>
public JObject ParseAnswer(string questionId)
{
    var answer = new JObject();
    var url = string.Format("https://www.zujuan.com/question/detail-{0}.shtml", questionId);
    var html = new HttpUnitHelper().GetRealHtmlOnce(url, wait: 3000);

    // Each section below is read from its own parse of the page, as before, so
    // that anything ProcessHtmlImageElement changes in one section cannot show
    // up in the next.
    // NOTE(review): if ProcessHtmlImageElement does not mutate the document,
    // a single shared parse would be enough.

    // Declared once; a second declaration of the same local did not compile.
    var doc = NSoupClient.Parse(html);
    answer["question_title"] = doc.GetElementById("J_QuestionList")
        .GetElementsByClass("exam-con")[0]
        .GetElementsByClass("exam-q")[0]
        .ProcessHtmlImageElement()
        .Html();

    // One parse serves both the "is there a sub-question list?" check and the
    // extraction.
    var subQuestionLists = NSoupClient.Parse(html)
        .GetElementById("J_QuestionList")
        .GetElementsByClass("exam-con")[0]
        .GetElementsByClass("exam-qlist");
    var hasSubQuestions = subQuestionLists.Count > 0;

    if (hasSubQuestions)
    {
        var qlist = subQuestionLists[0];
        var exam_cons = qlist.GetElementsByClass("exam-con");
        var exam_cons_answers = qlist.Select("div.analyticbox.replace_anawer");

        var answer_list = new JArray();
        for (int i = 0; i < exam_cons.Count; i++)
        {
            // Answers are matched to sub-questions by position.
            var result = new JObject();
            result["question"] = exam_cons[i].GetElementsByClass("analyticbox")[0].ProcessHtmlImageElement().Html();
            result["answer"] = exam_cons_answers[i].ProcessHtmlImageElement().Html();
            answer_list.Add(result);
        }
        answer["answer_list"] = answer_list;
    }

    // Both layouts read the same element; only the meaning of its children differs.
    var analyticbox_brick = NSoupClient.Parse(html)
        .GetElementById("J_QuestionList")
        .GetElementsByClass("analyticbox-brick")[0];

    answer["kaodian"] = analyticbox_brick.Children[0].ProcessHtmlImageElement().Html();
    if (hasSubQuestions)
    {
        answer["jiexi"] = analyticbox_brick.Children[1].ProcessHtmlImageElement().Html();
    }
    else
    {
        answer["daan"] = analyticbox_brick.Children[1].ProcessHtmlImageElement().Html();
        answer["jiexi"] = analyticbox_brick.Children[2].ProcessHtmlImageElement().Html();
    }

    return answer;
}