Beispiel #1
0
        /// <summary>
        /// Crawls the answer data for a single question id, fetches the matching
        /// question-list API response and saves both as a new <c>QuestionAll</c> entity.
        /// </summary>
        /// <param name="qid">Id of the question to crawl.</param>
        public static void CrawlerAnswer(int qid)
        {
            var proxyList  = GrabAnswer.GetProxyListFromCache();
            var answerData = CrawlerSingleQuestion(qid, proxyList);

            // Nothing usable was returned: leave the database untouched.
            var hasAnswer = answerData != null && !string.IsNullOrWhiteSpace(answerData.ToString());
            if (!hasAnswer)
            {
                return;
            }

            // Both values are read from the first question of the first element.
            var firstQuestion = answerData[0]["questions"][0];
            var xd            = firstQuestion["xd"].ToString();
            var chid          = firstQuestion["chid"].ToString();

            var apiUrl  = $"https://www.zujuan.com/question/list?question_id={qid}&xd={xd}&chid={chid}";
            var apiJson = HttpClientHolder.Execute(apiUrl);

            using (var db = new CrawlerEntities())
            {
                var record = new QuestionAll
                {
                    QuestionId    = qid,
                    IsDelete      = false,
                    AnswerJson    = answerData.ToString(),
                    ApiJson       = apiJson,
                    CrawlerUrl    = $"https://www.zujuan.com/question/detail-{qid}.shtml",
                    CrawlerApiUrl = apiUrl,
                    child         = chid.NullToInt(),
                    xd            = xd.NullToInt()
                };

                db.QuestionAll.Add(record);
                db.SaveChanges();
            }
        }
Beispiel #2
0
        /// <summary>
        /// Crawls the answer data for <paramref name="item"/>, fetches the matching
        /// question-list API response and passes everything to <c>UpdateQuesion</c>.
        /// </summary>
        /// <param name="item">Question whose <c>QuestionId</c> is crawled.</param>
        /// <param name="proxys">Proxy list handed through to <c>CrawlerSingleQuestion</c>.</param>
        public static void CrawlerAnswer(QuestionAll item, List <string> proxys)
        {
            var answerData = CrawlerSingleQuestion(item.QuestionId.ToString(), proxys);

            // Nothing usable was returned: skip this question without updating it.
            var hasAnswer = answerData != null && !string.IsNullOrWhiteSpace(answerData.ToString());
            if (!hasAnswer)
            {
                return;
            }

            // Both values are read from the first question of the first element.
            var firstQuestion = answerData[0]["questions"][0];
            var xd            = firstQuestion["xd"].ToString();
            var chid          = firstQuestion["chid"].ToString();

            var apiUrl      = $"https://www.zujuan.com/question/list?question_id={item.QuestionId}&xd={xd}&chid={chid}";
            var cookieState = GetCookieState(chid.NullToInt(), xd.NullToInt());
            var apiJson     = HttpClientHolder.Execute(apiUrl, cookieState);

            Console.WriteLine("save sql");
            UpdateQuesion(item.QuestionId, answerData, apiUrl, apiJson, xd, chid);
        }
Beispiel #3
0
        /// <summary>
        /// Takes the next question id from the shared counter and processes it:
        /// ids that are already crawled are skipped, deleted questions are recorded
        /// through <c>AddDeleteQuestion</c>, everything else goes to <c>CrawlerAnswer</c>.
        /// </summary>
        /// <remarks>
        /// Once the counter passes the id limit the application is asked to exit.
        /// </remarks>
        public static void StartWithIndex()
        {
            // Highest question id that is still crawled.
            const int MaxQuestionId = 10000000;
            // Text looked for in the detail page to detect a removed question
            // (the literal translates to "the question has been deleted").
            const string DeletedMarker = "试题已经被删除";

            int qid = CacheManager.IncrementValue(key).NullToInt();

            if (qid > MaxQuestionId)
            {
                System.Windows.Forms.Application.Exit();
                return;
            }
            if (IsCrawlered(qid))
            {
                return;
            }

            var result = HttpClientHolder.Execute($"https://www.zujuan.com/question/detail-{qid}.shtml");

            // Ordinal search: the marker is fixed page text, so the match must not
            // depend on the current thread culture.
            // NOTE(review): a null result still throws here, as before — confirm whether Execute can return null.
            if (result.IndexOf(DeletedMarker, System.StringComparison.Ordinal) >= 0)
            {
                AddDeleteQuestion(qid);
            }
            else
            {
                CrawlerAnswer(qid);
            }
        }
        /// <summary>
        /// Whole-site crawl: once the full data set has been processed, the per-page
        /// URLs under every category have to be regenerated; this method does that.
        /// </summary>
        /// <remarks>
        /// Categories whose id is already returned by <c>DataService.GetCrawleredCatelist()</c>
        /// are skipped. For each remaining category the question-list API is queried once to
        /// read <c>total</c>; then every result page (10 entries per page) is requested and
        /// handed to <c>DataService.AddCateUrl</c> together with its page URL.
        /// An earlier variant that scraped the total from the HTML page was abandoned.
        /// </remarks>
        public void InitCatePageUrl()
        {
            // Entries per result page; matches the "per-page=10" used in the page URLs below.
            const int PageSize = 10;

            var listCategory = DataService.GetCategorylist();
            var hasCateID    = DataService.GetCrawleredCatelist();

            // One generator for the whole run: creating a new Random per category in a
            // tight loop can hand out identical sequences on time-seeded runtimes.
            var random = new Random();

            //https://www.zujuan.com/question?categories=47854&bookversion=47832&nianji=47854&chid=3&xd=1
            foreach (var category in listCategory)
            {
                if (hasCateID.Any(t => t == category.CategoryId))
                {
                    continue;
                }
                var sw = Stopwatch.StartNew();

                var api =
                    $"https://www.zujuan.com/question/list?categories={category.CategoryId}&sortField=time&page=1&_=1540841532659";

                var url = $"https://www.zujuan.com/question?categories={category.CategoryId}&bookversion={category.BookVersionId}&nianji={category.CategoryId}&chid={category.Child}&xd={category.Degree}";

                // The returned HTML itself is not used. The request is kept because the
                // cookies read below presumably originate from this visit — TODO confirm.
                new HttpUnitHelper().GetRealHtmlOnceNotWaitJs(url);

                // Value for the "_" query parameter of the per-page API calls.
                long randomId = 1540840000000 + random.Next(0000000, 9999999);
                //todo: filter by question type

                // Flatten the cookies of the site into a single "name=value;" header string.
                var cookies      = new HttpUnitHelper().webClient.GetCookies(new URL("https://www.zujuan.com/"));
                var headerCookie = string.Empty;
                foreach (var cookie in cookies)
                {
                    headerCookie += $"{cookie.Name}={cookie.Value};";
                }

                var json = HttpClientHolder.Execute(api, headerCookie);
                if (string.IsNullOrWhiteSpace(json))
                {
                    // No response body, so there is no total to read for this category.
                    continue;
                }

                var total      = JObject.Parse(json);
                var totalCount = total["total"].NullToInt();
                if (totalCount <= 0)
                {
                    continue;
                }

                // Ceiling division gives the number of pages. Parallel.For treats its second
                // argument as exclusive, hence the "+ 1" so that the last page is included.
                var pageCount = (totalCount + PageSize - 1) / PageSize;
                Parallel.For(1, pageCount + 1, (i) =>
                {
                    var currentApi =
                        $"https://www.zujuan.com/question/list?categories={category.CategoryId}&sortField=time&page={i}&_={randomId}";
                    var currentJson = HttpClientHolder.Execute(currentApi, headerCookie);

                    // Page 1 is the plain category URL; later pages use the paged index URL.
                    // NOTE(review): the paged URL hard-codes nianji=7741 while the first page
                    // uses the category id — confirm this is intended.
                    var grabUrl = i > 1
                        ? $"https://www.zujuan.com/question/index?categories={category.CategoryId}&bookversion={category.BookVersionId}&nianji=7741&chid={category.Child}&xd={category.Degree}&page={i}&per-page=10"
                        : url;

                    // Fire-and-forget: the write is started through the delegate's BeginInvoke
                    // and never completed with EndInvoke, so a failure inside AddCateUrl is
                    // not observed by this method.
                    Action saveAction = () =>
                    {
                        DataService.AddCateUrl(grabUrl, category.CategoryId, i, currentApi, currentJson);
                    };
                    saveAction.BeginInvoke(null, null);
                });

                sw.Stop();

                Debug.WriteLine("cost" + sw.ElapsedMilliseconds);
            }
        }