public void Crawler() { while (ExistCrawlerJcSource()) { var listJiaocai = GetRandom10CrawlerJcSource(); Parallel.ForEach(listJiaocai, new ParallelOptions() { MaxDegreeOfParallelism = 1 }, (source) => { try { var total = source.Total; var pageCount = total / 10 + 1; Parallel.For(1, pageCount + 1, new ParallelOptions() { MaxDegreeOfParallelism = 10 }, (i) => { if (ExistQuestionJiaocaiSourceResult(source.Id, i)) { return; } var url = $"http://zujuan.xkw.com/{source.Prefix}/zj{source.JiaocaiId}/a{source.AreaId}p{i}/"; Console.WriteLine(url); //CookieCollection cc = new CookieCollection(); //cc.Add(new System.Net.Cookie("bankId", source.SubjectId.ToString(), "/", "zujuan.xkw.com")); //cc.Add(new System.Net.Cookie("bankname", source.SubjectName.ToString(), "/", "zujuan.xkw.com")); var html = HttpWebResponseProxyHuake.ExecuteCreateGetHttpResponseProxy(url, 18000, $" isshowAnswer=false; bankname={source.Prefix}; bankId={source.SubjectId.ToString()};"); //var html = HttpWebResponseUtility.ExecuteCreateGetHttpResponseProxy(url, 3000, HttpUtility.UrlEncode( $"bankname={source.Prefix.ToString()};bankId={source.SubjectId.ToString()};categoryId=58550;categoryClick=58550;UM_distinctid=166e9518cbe298-01a36d572eaa51-43480420-144000-166e9518cbf41c; isshowAnswer=false;allbankCount=0%2c0; Hm_lpvt_68fb48a14b4fce9d823df8a437386f93=1541947025; _cnzz_CV1274198201=%E6%98%AF%E5%90%A6%E7%99%BB%E5%BD%95%7C%E6%9C%AA%E7%99%BB%E5%BD%95%7C1541947026394;ASP.NET_SessionId=ztqe22oqdjmbv1fsrru1il3i; ")); //var html = HttpWebResponseUtility.ExecuteCreateGetHttpResponseProxy(url, 3000, "UM_distinctid=166e9518cbe298-01a36d572eaa51-43480420-144000-166e9518cbf41c; isshowAnswer=false; bankname=czsx; pro_bank=0%2412; bankId=2; categoryClick=58550; categoryId=58550; CNZZDATA1261546733=2039576993-1541507459-https%253A%252F%252Fwww.baidu.com%252F%7C1541945470; pts=%2fczsx%2fzj58550%2fpts1a610000%2f; ds=%2fczsx%2fzj58550%2fds1a610000%2f; CNZZDATA1274198201=1092295544-1541511793-https%253A%252F%252Fwww.baidu.com%252F%7C1541946465; cn_5816665539db22708e01_dplus=%7B%22distinct_id%22%3A%20%22166e9518cbe298-01a36d572eaa51-43480420-144000-166e9518cbf41c%22%2C%22sp%22%3A%20%7B%22%24_sessionid%22%3A%200%2C%22%24_sessionTime%22%3A%201541946858%2C%22%24dp%22%3A%200%2C%22%24_sessionPVTime%22%3A%201541946858%2C%22%24recent_outside_referrer%22%3A%20%22%24direct%22%7D%2C%22initial_view_time%22%3A%20%221541513972%22%2C%22initial_referrer%22%3A%20%22http%3A%2F%2Fzujuan.xkw.com%2Fgzsx%2Fzsd27929%2Fpt3%2F%22%2C%22initial_referrer_domain%22%3A%20%22zujuan.xkw.com%22%7D; ASPSESSIONIDAQQRSDDD=ICJPMAGDNJMDDHFODCEOJIJD; ASP.NET_SessionId=ztqe22oqdjmbv1fsrru1il3i; Hm_lvt_68fb48a14b4fce9d823df8a437386f93=1541678249,1541797743,1541907309,1541946958; allbankCount=0%2c0; Hm_lpvt_68fb48a14b4fce9d823df8a437386f93=1541947025; _cnzz_CV1274198201=%E6%98%AF%E5%90%A6%E7%99%BB%E5%BD%95%7C%E6%9C%AA%E7%99%BB%E5%BD%95%7C1541947026394"); if (html.IndexOf("queslistbox") < 0) { throw new Exception("invalid html result"); } if (html.IndexOf("拒绝访问") >= 0) { ProxyManager.HasExpire = true; Console.WriteLine("invalid proxy"); throw new Exception("invalid proxy"); } if (html.IndexOf("涉嫌恶意操作") >= 0) { ProxyManager.HasExpire = true; Console.WriteLine("invalid proxy"); throw new Exception("invalid proxy"); } if (html.IndexOf("questioncount") < 0) { throw new Exception("invalid html result"); } var doc = NSoupClient.Parse(html); var questionHtml = doc.GetElementById("queslistbox").Html(); var questions = doc.GetElementById("queslistbox").GetElementsByClass("quesbox"); if (url != "http://zujuan.xkw.com/gzsx/zj81565/a500000p117/") { if (i < pageCount - 1 && questions.Count < 10) { var totalCount = doc.GetElementById("questioncount").Text().NullToInt(); if (source.Total != totalCount) { UpdateQuestionJiaocaiSourceTotalCount(source.Id, totalCount); } throw new Exception(); } } var elements = doc.GetElementsByTag("img"); var dicImageStaus = new ParseQuestionXkw().SaveImage(elements, url); ////// var listQuestion = new ParseQuestionXkw().AddQuestion(html, questionHtml, source.JiaocaiId, source.SubjectId.ToString(), url, source.AreaId, source.Id, source.Total, i, dicImageStaus); //AddQuestion(questionHtml, source.Id, url); //AddQuestionJiaocaiSourceResult(source.AreaId, source.JiaocaiId, source.Id, questionHtml, source.Total, i, url, listQuestion); }); UpdateQuestionJiaocaiSourceResultStatus(source.Id); } catch { } finally { } }); } }
public List <QuestionXkw> AddQuestion2(string html, string questionHtml, int jiaocaiId, string subjectId, string sourceUrl, int areaId, int sourceId, int total, int pageNum, int QuestionJiaoCaiDetailSourceId, Dictionary <string, bool> dicImageStaus) { List <QuestionXkw> result = new List <QuestionXkw>(); var elements = NSoupClient.Parse(html).GetElementsByClass("quesbox"); foreach (var element in elements) { try { QuestionXkw entity = new QuestionXkw(); entity.OriginHtml = element.Html(); var detail = element.Select("div.join-sj>a")[0]; entity.QuestionId = detail.Attr("quesid").NullToInt(); entity.@class = detail.Attr("class").NullToString(); entity.guid = detail.Attr("guid").NullToString(); entity.childnum = detail.Attr("childnum").NullToInt(); entity.questitle = detail.Attr("questitle").NullToString(); entity.categories = detail.Attr("categories").NullToString(); entity.qyid = detail.Attr("qyid").NullToInt(); entity.qdid = detail.Attr("qdid").NullToInt(); entity.qyname = detail.Attr("qyname").NullToString(); entity.qdname = detail.Attr("qdname").NullToString(); entity.JiaocaiId = jiaocaiId; var source = element.Select("div.quesource")[0]; entity.source = source.Html(); entity.SourceUrl = sourceUrl; var questiontitle = element.Select("div.question-inner")[0]; entity.key = questiontitle.Attr("key").NullToString(); entity.question_text = questiontitle.Html(); var href = element.Select("a.detail")[0]; entity.CrawlerUrl = href.Attr("href").NullToString(); var str = entity.CrawlerUrl.Replace("http://zujuan.xkw.com/", "") .Replace("https://zujuan.xkw.com/", ""); var bankId = str.Substring(0, str.IndexOf("q", StringComparison.OrdinalIgnoreCase)); ; var analysisUrl = $"http://im.zujuan.xkw.com/Parse/{entity.QuestionId}/{bankId}/700/14/28/{entity.key}"; var answerUrl = $"http://im.zujuan.xkw.com/Answer/{entity.QuestionId}/{bankId}/700/14/28/{entity.key}"; var paths = new ParseQuestionXkw().SaveAnswerImage(entity.QuestionId.ToString(), entity.key, subjectId); entity.AnalysisImg = paths[0]; entity.AnswerImg = paths[1]; entity.QuestionAnalysis = analysisUrl; entity.QuestionAnswer = answerUrl; entity.CreateTime = DateTime.Now; entity.QuestionJiaoCaiDetailSourceId = QuestionJiaoCaiDetailSourceId; if (dicImageStaus.Any(t => entity.OriginHtml.IndexOf(t.Key, StringComparison.OrdinalIgnoreCase) > 0)) { entity.ImageStatus = false; } else { entity.ImageStatus = true; } result.Add(entity); } //catch (DbUpdateException exception) //{ // var msg = string.Empty; // foreach (var validationError in ((DbUpdateException)exception).Data) // { // var o = validationError; // } // throw new Exception(); //} //catch (DbEntityValidationException ex) //{ // var msg = string.Empty; // foreach (var validationError in ((DbEntityValidationException)ex).EntityValidationErrors) // foreach (var error in validationError.ValidationErrors) // msg += string.Format("Property: {0} Error: {1}", error.PropertyName, error.ErrorMessage); // var fail = new Exception(msg); // throw fail; //} catch (Exception ex) { WriteLog(questionHtml, ex.ToString(), sourceUrl); throw ex; } } try { using (var db = new XKWEntities2()) { db.QuestionXkw.AddRange(result); if ( !db.QuestionJiaocaiSourceDetailResult.Any( t => t.JiaocaiDetailId == jiaocaiId && t.AreaId == areaId && t.PageNum == pageNum)) { var entity = new QuestionJiaocaiSourceDetailResult(); entity.AreaId = areaId; entity.JiaocaiId = jiaocaiId; entity.Html = html; entity.Total = total; entity.PageNum = pageNum; entity.QuestionJiaoCaiDetailSourceId = sourceId; entity.CrawlerUrl = sourceUrl; db.QuestionJiaocaiSourceDetailResult.Add(entity); } db.SaveChanges(); } } catch (Exception ex) { WriteLog(questionHtml, ex.ToString(), sourceUrl); throw ex; } return(result); }