Esempio n. 1
0
        /// <summary>
        /// Gathers data from the list page currently held in <c>NowUrl</c> / <c>ListLinkContent</c>
        /// and then follows list pagination until no further list page is available.
        /// </summary>
        /// <remarks>
        /// For every list page that <c>IsListLinkEcho</c> does not report as already handled, the
        /// content links are extracted, every content link not rejected by <c>IsContLinkEcho</c> is
        /// downloaded, and one row per content page is appended to <c>gi.gdt</c>.
        /// The method returns immediately once <c>GatherCount</c> reaches <c>gi.ws.GatherMaxNum</c>;
        /// in that case the current list URL is not added to <c>NowUrlList</c>.
        /// Whenever <c>gi.gdt</c> holds more than 999 rows it is written to an Excel file and cleared.
        /// </remarks>
        public void GatherOper()
        {
            // Position of the next entry of CentUrlList to visit (only used when gi.ws.NextMode == 1).
            int cindex = 0;

            // The null guards make a missing URL/body end the loop instead of throwing on Trim().
            while (NowUrl != null && NowUrl.Trim() != "" && ListLinkContent != null && ListLinkContent.Trim() != "")
            {
                if (!IsListLinkEcho(NowUrl) && GatherCurrentListPage())
                {
                    // Gather limit reached: stop right here, before any bookkeeping for this list page.
                    return;
                }

                string listurl = NowUrl;
                NowUrlList.Add(NowUrl);
                NowUrl = "";
                if (gi.ws.IsNext == 1)
                {
                    NowUrl = ResolveNextListUrl(listurl, ref cindex);
                    if (!string.IsNullOrEmpty(NowUrl) && !IsListLinkEcho(NowUrl))
                    {
                        string errs = "";
                        ListLinkContent = wapp.GatherApp.GetHttp(det, NowUrl, gi.ws.CodingMode, ref PageEncoding, ref PageCoding, ref errs);
                        if (errs != "")
                        {
                            // An empty body terminates the while loop on the next condition check.
                            ListLinkContent = "";
                            lout.OT(1, "列表页采集失败,错误信息:" + errs + ",[Url:" + NowUrl + "]");
                        }
                    }
                    else
                    {
                        ListLinkContent = "";
                    }
                }
                NowUrlCount++;
            }
        }

        /// <summary>
        /// Extracts the content links from the current list page and gathers one row per content
        /// page that <c>IsContLinkEcho</c> does not reject.
        /// </summary>
        /// <returns>
        /// <c>true</c> when <c>GatherCount</c> has reached <c>gi.ws.GatherMaxNum</c> and the caller
        /// must stop gathering; otherwise <c>false</c>.
        /// </returns>
        private bool GatherCurrentListPage()
        {
            // Row count above which the in-memory table is written out to Excel and cleared.
            const int flushThreshold = 999;

            // Without a range rule the whole list page is searched for links.
            string linkArea = ListLinkContent;
            if (gi.ws.UrlRangeRule != "")
            {
                linkArea = wapp.GatherApp.GetRegTxtString(ListLinkContent, gi.ws.UrlRangeRule, gi.ws.UrlRangeRuleIndex);
                lout.OT(1, "获取内容页链接列表范围匹配内容。[" + linkArea.Length.ToString() + "]");
            }
            if (linkArea.Trim() == "")
            {
                return false;
            }

            List<string> linklist = wapp.GatherApp.GetRegTxtList(linkArea, gi.ws.UrlRule, gi.ws.UrlRuleIndex);
            lout.OT(1, "获取内容页链接匹配数组[" + linklist.Count.ToString() + "]");
            for (int x = 0; x < linklist.Count; x++)
            {
                StartDate = DateTime.Now;
                EndDate = StartDate;
                string actlink = wapp.GatherApp.FormatUrl(NowUrl, linklist[x].Trim());
                if (IsContLinkEcho(actlink))
                {
                    continue;
                }
                NowContLinkList.Add(actlink);

                string errs = "";
                string conts = wapp.GatherApp.GetHttp(det, actlink, gi.ws.CodingMode, ref PageEncoding, ref PageCoding, ref errs);
                if (errs != "")
                {
                    lout.OT(1, "获取内容页起始url内容失败,错误信息:" + errs + "。[Url:" + actlink + "]");
                }
                else
                {
                    lout.OT(1, "获取内容页起始url内容。[" + actlink + "][" + conts.Length.ToString() + "]");
                }

                // NOTE(review): a row is built and counted even when the download reported an error,
                // exactly as before — confirm whether failed pages should be skipped instead.
                DataRow nsdr = gi.gdt.NewRow();
                for (int si = 0; si < gi.wsilist.Count; si++)
                {
                    string fieldValue = ExtractFieldValue(conts, si);

                    // The previous code tested Often.IsNum / Often.IsDate here, but every branch
                    // stored the value regardless, so the checks had no effect and were removed.
                    // NOTE(review): if invalid numbers/dates were meant to be rejected, add that here.
                    nsdr[si] = fieldValue;
                    if (fieldValue != "")
                    {
                        lout.OT(1, "[" + gi.wsilist[si].ListTitle + "(" + gi.wsilist[si].ListCode + ")]内容:[" + fieldValue + "]采集成功!");
                    }
                }
                gi.gdt.Rows.Add(nsdr);
                GatherCount++;
                if (GatherCount >= gi.ws.GatherMaxNum)
                {
                    return true;
                }
                if (gi.gdt.Rows.Count > flushThreshold)
                {
                    FlushGatherTableToExcel();
                }
            }
            return false;
        }

        /// <summary>Applies the extraction rules of field <paramref name="si"/> to a content page.</summary>
        /// <param name="conts">Downloaded content page text.</param>
        /// <param name="si">Index into <c>gi.wsilist</c> of the field definition to apply.</param>
        /// <returns>The extracted value; for fields with <c>ContentType == 2</c> the value is additionally passed through <c>Often.OutTxt</c> and <c>FormatDate</c>.</returns>
        private string ExtractFieldValue(string conts, int si)
        {
            // Optionally narrow the page to the configured range before applying the content rule.
            string scope = conts;
            if (gi.wsilist[si].ContentRangeRule != "")
            {
                scope = wapp.GatherApp.GetRegTxtString(conts, gi.wsilist[si].ContentRangeRule, gi.wsilist[si].ContentRangeRuleIndex);
            }

            string decoded = wapp.GatherApp.GetUnCode(wapp.GatherApp.GetRegsTxt(scope, gi.wsilist[si].ContentRule, gi.wsilist[si].ContentRuleIndex), gi.wsilist[si].ContentCodeMode);
            if (gi.wsilist[si].ContentType == 2)
            {
                return wapp.GatherApp.FormatDate(Often.OutTxt(decoded));
            }
            return decoded;
        }

        /// <summary>Writes the rows currently held in <c>gi.gdt</c> to an Excel file and clears the table.</summary>
        private void FlushGatherTableToExcel()
        {
            string savefile = GetExcelSaveFileName();
            lout.OT(1, "保存数据到Excel文件操作开始[文件名:" + savefile + "]...");
            wapp.Excel ex = new wapp.Excel();
            ex.DataTableToExcel(gi.gdt);
            ex.WriteFile(savefile);
            gi.gdt.Clear();
            lout.OT(1, "保存数据到Excel文件操作结束[文件名:" + savefile + "]");
        }

        /// <summary>Determines the URL of the next list page.</summary>
        /// <param name="listurl">URL of the list page that was just processed; used as base for relative links.</param>
        /// <param name="cindex">Index of the next <c>CentUrlList</c> entry; advanced when an entry is consumed.</param>
        /// <returns>The next list page URL, or an empty string when there is none.</returns>
        private string ResolveNextListUrl(string listurl, ref int cindex)
        {
            if (gi.ws.NextMode == 1)
            {
                // Single-page paging mode: walk the pagination URLs already stored in CentUrlList.
                if (CentUrlList.Count > 0 && cindex < CentUrlList.Count)
                {
                    string next = CentUrlList[cindex].Trim();
                    cindex++;
                    return next;
                }
                return "";
            }

            // Multi-page paging mode: locate the "next" link inside the current list page,
            // optionally restricted to the configured range first.
            string candidate;
            if (gi.ws.NextRangeRule != "")
            {
                candidate = wapp.GatherApp.GetRegTxtString(ListLinkContent, gi.ws.NextRangeRule, gi.ws.NextRangeRuleIndex);
                if (candidate != "")
                {
                    candidate = wapp.GatherApp.GetRegsTxt(candidate, gi.ws.NextRule, gi.ws.NextRuleIndex);
                }
            }
            else
            {
                candidate = wapp.GatherApp.GetRegsTxt(ListLinkContent, gi.ws.NextRule, gi.ws.NextRuleIndex);
            }

            if (candidate != "")
            {
                return wapp.GatherApp.GetUrls(wapp.GatherApp.FormatUrl(listurl, candidate));
            }
            return "";
        }
Esempio n. 2
0
 /// <summary>
 /// Runs a complete gather pass over every start URL obtained from <c>gi.ws.UrlList</c>.
 /// </summary>
 /// <remarks>
 /// Each start URL is downloaded and handed to <c>GatherOper</c>. When
 /// <c>gi.ws.IsNext == 1</c> and <c>gi.ws.NextMode == 1</c> (single-page paging mode) the
 /// pagination URLs found on the start page are collected into <c>CentUrlList</c> first.
 /// Rows still held in <c>gi.gdt</c> afterwards are written to an Excel file, and a summary
 /// is logged.
 /// </remarks>
 public void StartGather()
 {
     ExcelSaveNum = 1000000;
     NowContLinkList.Clear();
     NowUrlList.Clear();
     UrlList = wapp.GatherApp.GetUrlList(gi.ws.UrlList);
     if (UrlList.Count == 0)
     {
         lout.OT(0, "Url列表无有效URL地址!");
     }
     for (int i = 0; i < UrlList.Count; i++)
     {
         // GatherOper returns as soon as the limit is hit; without this guard every remaining
         // start URL would still be downloaded and contribute one more row past the limit.
         // A non-positive limit keeps the previous behaviour unchanged.
         if (gi.ws.GatherMaxNum > 0 && GatherCount >= gi.ws.GatherMaxNum)
         {
             break;
         }

         CentUrlList     = new List <string>();
         NowUrlListIndex = i;
         string orders    = (i + 1).ToString();
         string gatherurl = UrlList[NowUrlListIndex];
         NowUrl = gatherurl;
         lout.OT(1, "第[" + orders + "]次执行采集URL[" + gatherurl + "]");
         string errs = "";
         ListLinkContent = wapp.GatherApp.GetHttp(det, gatherurl, gi.ws.CodingMode, ref PageEncoding, ref PageCoding, ref errs);
         if (errs != "")
         {
             lout.OT(1, "第[" + orders + "]次获取URL内容失败,错误信息:" + errs + "。[Url:" + gatherurl + "]");
             continue;
         }

         lout.OT(1, "第[" + orders + "]次获取URL内容。[" + ListLinkContent.Length.ToString() + "]");
         if (ListLinkContent.Trim() == "")
         {
             // Nothing to parse on an empty start page.
             continue;
         }

         if (gi.ws.IsNext == 1 && gi.ws.NextMode == 1)
         {
             // Single-page paging mode: collect the pagination URLs from the start page,
             // optionally restricted to the configured range.
             string linkSource = ListLinkContent;
             if (gi.ws.UrlRangeRule != "")
             {
                 linkSource = wapp.GatherApp.GetRegTxtString(ListLinkContent, gi.ws.UrlRangeRule, gi.ws.UrlRangeRuleIndex);
             }
             List <string> clist = wapp.GatherApp.GetRegTxtList(linkSource, gi.ws.UrlRule, gi.ws.UrlRuleIndex);
             for (int ii = 0; ii < clist.Count; ii++)
             {
                 string clink = wapp.GatherApp.GetUrls(wapp.GatherApp.FormatUrl(gatherurl, clist[ii].Trim()));
                 if (clink != "")
                 {
                     CentUrlList.Add(clink);
                 }
             }
         }
         GatherOper();
     }

     // Was "Rows.Count >= 0", which is always true and produced an Excel file even when
     // there was nothing left to save.
     if (gi.gdt.Rows.Count > 0)
     {
         string savefile = GetExcelSaveFileName();
         lout.OT(1, "保存数据到Excel文件操作开始[文件名:" + savefile + "]...");
         wapp.Excel ex = new wapp.Excel();
         ex.DataTableToExcel(gi.gdt);
         ex.WriteFile(savefile);
         gi.gdt.Clear();
         lout.OT(1, "保存数据到Excel文件操作结束[文件名:" + savefile + "]");
     }
     lout.OT(1, "全部操作完毕。共采集信息[" + GatherCount.ToString() + "]次,采集内容成功[" + ContSucceedCount.ToString() + "]次,采集内容失败[" + ContKaputCount.ToString() + "]次。");
     lout.OT(1, "本次采集信息耗时:[" + DateOften.DateDiff(OperStartDate, DateTime.Now, "s").ToString() + "]秒");
 }