예제 #1
0
        /// <summary>
        /// 搜索开始
        /// </summary>
        /// <param name="url"></param>
        /// <param name="fileName"></param>
        public async void SearchStart(string url, string fileName, int type)
        {
            ViewEnum.searchEnum searchEmum = (ViewEnum.searchEnum)type;
            string        JsonContentPath  = ConfigurationManager.AppSettings["JsonContentPath"];//E:\CrawlerNproData\jsonFileSave\
            Elements      elements;
            string        strGetConetex = null;
            CrHttpRequest crHttpRequest = new CrHttpRequest();

            strGetConetex = await crHttpRequest.SentDataAsync(HttpMethod.Post, url);

            if (strGetConetex.Length > 0)
            {
                Regex regUrl   = new Regex(@"(/p/[0-9]{10})");
                Regex regTitle = new Regex(">(.*?)<");
                //this.txtContent.Text = strGetConetex;
                Document document       = NSoup.NSoupClient.Parse(strGetConetex);
                Elements elementTbNum   = document.GetElementsByClass("card_infoNum");
                Elements elementTbName  = document.GetElementsByClass("card_title_fname");
                Elements elementTbTitle = document.Select("li.j_thread_list").Select("li.clearfix");
                List <TitleContentEntity> listContentEntity = new List <TitleContentEntity>(); //内容 //有错误
                List <ResultEntity>       listResultEntity  = new List <ResultEntity>();       //所有内容
                try
                {
                    for (int i = 0; i < elementTbTitle.Count; i++)
                    {
                        var      elementss             = elementTbTitle[i];
                        Elements elementTbChildReplies = elementss.Select("span.threadlist_rep_num").Select("span.center_text");
                        Elements elementTbUrl          = elementss.Select("a.j_th_tit");
                        Debug.WriteLine(elementTbChildReplies + "/n/r" + elementTbUrl);
                        ResultEntity resultEntity = new ResultEntity();
                        if (elementTbUrl.ToString().Contains("href"))
                        {
                            var resultUrl   = regUrl.Match(elementTbUrl.ToString()).ToString();
                            var tempData    = regTitle.Match(elementTbUrl.ToString()).ToString();
                            var resultTitle = tempData.Substring(1, tempData.Length - 2);
                            resultEntity.Title      = resultTitle;
                            resultEntity.Url        = resultUrl;
                            resultEntity.CreateCode = resultUrl.Substring(3, resultUrl.Length - 3);//校验码
                        }
                        if (elementTbChildReplies.ToString().Contains("回复"))
                        {
                            Regex regReplies    = new Regex(@">(.*?)<");
                            var   tempData      = regReplies.Match(elementTbChildReplies.ToString()).ToString();
                            var   resultReplies = tempData.Substring(1, tempData.Length - 2);
                            resultEntity.Replies  = resultReplies;
                            resultEntity.Visiable = 1;//默认显示1
                        }
                        listResultEntity.Add(resultEntity);
                    }
                    CompareHelper       compareHelper          = new CompareHelper();
                    List <ResultEntity> filterListResultEntity = new List <ResultEntity>();//所有内容
                    var listkeyWorld = SelectKeyWorld();
                    if (listkeyWorld == null)
                    {
                        Debug.WriteLine("--------------获取关键字失败------------------");
                    }
                    foreach (var item in listResultEntity)
                    {
                        if (compareHelper.Compare(item.Title.Trim(), listkeyWorld, "KeyWorld"))
                        {
                            filterListResultEntity.Add(item);
                        }
                    }
                    IOFileHelper ioFileHelper = new IOFileHelper();
                    JsonHelper   jsonHelper   = new JsonHelper();
                    var          jsonData     = jsonHelper.SerializerJson(filterListResultEntity);
                    if (jsonData == null)
                    {
                        Debug.WriteLine("写入json之前 序列化出错");
                    }
                    /////写入json 文件之前 把实体转换为json字符串
                    //if (ioFileHelper.SaveJsonFile(JsonContentPath, "" + fileName + ".json", jsonData, false) == false)
                    //{
                    //    Debug.WriteLine("文件写入出错");
                    //}
                    switch (searchEmum)
                    {
                    case ViewEnum.searchEnum.WriteMysql:
                        TBTitleListService tBTitleListService = new TBTitleListService();
                        tBTitleListService.InsertTBTItleList(filterListResultEntity);    //添加到mysql数据库//
                        break;

                    case ViewEnum.searchEnum.WriteSQLite:
                        LocTitleListService locTitleListService = new LocTitleListService();
                        locTitleListService.InsertLocalTBTitleEx(filterListResultEntity);    //添加到sqlite,去重update
                        break;

                    case ViewEnum.searchEnum.WriteAll:
                        TBTitleListService tbTitleListService = new TBTitleListService();
                        tbTitleListService.InsertTBTItleList(filterListResultEntity);    //添加到mysql数据库
                        LocTitleListService loctitleListService = new LocTitleListService();
                        loctitleListService.InsertLocalTBTitle(filterListResultEntity);
                        break;
                    }
                }
                catch (Exception e)
                {
                    Debug.WriteLine("-------CrawlerNproView Error------" + e);
                }
                #region foreach
                //      //********************************************************************************//
                //    List<ContentEntity> listContentEntity = new List<ContentEntity>();//内容 有错误 应该是标题
                //    List<RepliesEneity> listRepliesEneity = new List<RepliesEneity>();//回复
                //    this.txtContent.Text = elementTbUrl.ToString();
                //    foreach (var itemPrplies in elementTbChildReplies)
                //    {
                //        RepliesEneity repliesEneity = new RepliesEneity();
                //        if (itemPrplies.ToString().Contains("回复"))
                //        {
                //            Regex regReplies = new Regex(@">(.*?)<");
                //            var tempData = regReplies.Match(itemPrplies.ToString()).ToString();
                //            var resultReplies = tempData.Substring(1, tempData.Length - 2);
                //            repliesEneity.Replies = resultReplies;
                //            listRepliesEneity.Add(repliesEneity);
                //        }
                //    }
                //    foreach (var itemUrl in elementTbUrl)
                //    {
                //        ContentEntity contentEntity = new ContentEntity();
                //        if (itemUrl.ToString().Contains("href"))
                //        {
                //            // string reg = @"<a> *href=([""'])?(?(.*?)[^'""]+)\1[^>]*>";
                //            Regex regUrl = new Regex(@"(/p/[0-9]{10})");
                //            //<a href="/p/5388999580" title="求大佬帮忙p个图" target="_blank" class="j_th_tit ">求大佬帮忙p个图</a>
                //            //Regex regUrl = new Regex(@"<a?* href=\/p\/^(.*?) */a> ");
                //            var resultUrl = regUrl.Match(itemUrl.ToString()).ToString();
                //            Regex regTitle = new Regex(">(.*?)<");
                //            var tempData = regTitle.Match(itemUrl.ToString()).ToString();
                //            var resultTitle = tempData.Substring(1, tempData.Length - 2);
                //            contentEntity.Title = resultTitle;
                //            contentEntity.Url = resultUrl;
                //            listContentEntity.Add(contentEntity);
                //        }
                //    }
                //    List<ResultEntity> listResultEntity = new List<ResultEntity>();//所有的
                //    //TBTitleListService tBTitleListService = new TBTitleListService();
                //    // tBTitleListService.InsertTBTItleList(MySqlConn.GetMysqlConn(), listTitleListEntity);添加到数据库
                //    //Elements elementTbChildTitle = docomentChild.Select("a.j_th_tit");
                //    //this.txtContent.Text = elementTbChildReplies.ToString()+"//**//"+"/n/r"+ elementTbChildTitle.ToString() +"//***//";
                #endregion
            }
        }
예제 #2
0
        /// <summary>
        /// 内容
        /// </summary>
        /// <param name="url"></param>
        /// <param name="pageSize"></param>
        /// <returns></returns>
        public async Task <GetContextModel> GetContext(string url, string pageSize = "1")
        {
            GetContextModel getContextModel = new GetContextModel();
            string          JsonContentPath = ConfigurationManager.AppSettings["JsonContentPath"];

            if (url == null || pageSize == null)
            {
                //错误报告?pn=3
                //  url = @"5463733135";
            }
            var           strUrl        = @"https://tieba.baidu.com/p/" + url + "?pn=" + pageSize;
            Regex         getAttContex  = new Regex(@">(.*?)<");
            Regex         regTime       = new Regex(@"20\d{2}(-|\/)((0[1-9])|(1[0-2]))(-|\/)((0[1-9])|([1-2][0-9])|(3[0-1]))(T|\s)(([0-1][0-9])|(2[0-3])):([0-5][0-9])");
            string        strGetConetex = null;
            string        pageNumber    = null;
            string        repelyNumber  = null;
            string        fileName      = "5463733135";
            CrHttpRequest crHttpRequest = new CrHttpRequest();

            strGetConetex = await crHttpRequest.SentDataAsync(HttpMethod.Post, strUrl);

            List <ContentEntity> listContent  = new List <ContentEntity>();
            ContentEntity        contenEntity = new ContentEntity();

            if (strGetConetex.Length > 0)
            {
                Document document = NSoup.NSoupClient.Parse(strGetConetex);
                //Elements elementTbNum = document.GetElementsByClass("card_infoNum");
                //Elements elementTbName = document.GetElementsByClass("card_title_fname");
                Elements elementTbTitle = document.Select("div.l_post").Select("div.l_post_bright").Select("div.j_l_post").Select("div.clearfix");
                //获取回复和分页数
                var      tempAttribute = document.Select("li.l_reply_num").First().ToString();
                Elements pageNum       = NSoup.NSoupClient.Parse(tempAttribute).Select("li.l_reply_num");
                contenEntity.PageSize   = pageSize;//页数赋值
                contenEntity.Visiable   = "1";
                contenEntity.CreateCode = Convert.ToInt32(url);
                foreach (var item in pageNum)
                {
                    var replyNumPage = NSoup.NSoupClient.Parse(item.ToString()).Select("span").ToString();
                    if (replyNumPage.Contains("style"))// 回复数
                    {
                        var tempRn = getAttContex.Match(replyNumPage).ToString();
                        repelyNumber = tempRn.Substring(1, tempRn.Length - 2);
                    }
                    if (replyNumPage.Contains("red"))//分页数
                    {
                        var tempPn = getAttContex.Match(replyNumPage).NextMatch().ToString();
                        pageNumber = tempPn.Substring(1, tempPn.Length - 2);
                    }
                }
                //this.txtContent.Text = elementTbTitle.ToString();
                foreach (var item in elementTbTitle)
                {
                    ContentEntity contenEntityTemp = new ContentEntity();
                    var           data             = NSoup.NSoupClient.Parse(item.ToString());
                    Debug.WriteLine(data);
                    //cc 标签
                    var content = NSoup.NSoupClient.Parse(data.ToString()).Select("cc");
                    if (content.Count == 0)
                    {
                        getContextModel.State = "300";
                        Debug.WriteLine("-------------------贴内容为空----error400-------");
                    }
                    else
                    {
                        //span  楼层 和时间
                        var span = data.Select("span.tail-info");
                        contenEntityTemp.Url = url;
                        var tempContent = Regex.Replace(content.ToString(), "<[^>]+>", "");
                        contenEntityTemp.Content = tempContent.Replace("\n", "").Replace(" ", "").Replace("\t", "").Replace("\r", "");
                        foreach (var itemspan in span)
                        {
                            if (itemspan.ToString().Contains("楼"))//可以不要
                            {
                                var tempFloor = getAttContex.Match(itemspan.ToString()).ToString();
                                var floor     = tempFloor.Substring(1, tempFloor.Length - 2);
                                contenEntityTemp.Floor = floor.Trim();
                            }
                            else if (itemspan.ToString().Contains(":"))//可以不要
                            {
                                var floorTIme = regTime.Match(itemspan.ToString()).ToString();
                                contenEntityTemp.ReplyTime = floorTIme.Trim();
                            }
                            contenEntityTemp.ReplyNum = repelyNumber;
                            contenEntityTemp.PageNum  = pageNumber;
                        }
                        contenEntity = contenEntityTemp;
                        listContent.Add(contenEntity);
                    }
                }
                #region 写入到sqlite
                LocContentListService locContentListService = new LocContentListService();
                var result = locContentListService.InsertLocalContent(listContent);

                //  locContentListService
                #endregion
                #region 写入到json文件

                /*  JsonHelper jsonHelper = new JsonHelper();
                 * var strlistJson = jsonHelper.SerializerJson(listContent);
                 * IOFileHelper ioFileHelper = new IOFileHelper();
                 * if (pageSize == "1")
                 * {
                 *     ioFileHelper.SaveJsonFile(JsonContentPath, "" + fileName + ".json", strlistJson);
                 * }
                 * else
                 * {
                 *     ioFileHelper.SaveJsonFile(JsonContentPath, "" + fileName + ".json", strlistJson, true);
                 * }
                 */
                #endregion
                getContextModel.Flag         = true;
                getContextModel.State        = "200";
                getContextModel.Total        = repelyNumber;
                getContextModel.FullPageSIze = pageNumber;
                //var list = jsonHelper.DeserializeJsonTo<List<ContentEntity>>(strlistJson);//反序列化测试
                return(getContextModel);
            }
            else
            {
                getContextModel.Flag = false;
                return(getContextModel);
            }
        }