Esempio n. 1
0
        /// <summary>
        /// Grabs a single detail page on the calling thread and reports the outcome
        /// and elapsed time to the run page. The download is skipped (and treated as
        /// a success) when the local file for the page already exists.
        /// </summary>
        /// <param name="listSheet">Sheet from which the list row of the page is read.</param>
        /// <param name="detailPageIndex">Zero-based index into the run page's detail URL/cookie lists.</param>
        /// <param name="detailPageInfo">Detail-page settings forwarded to the grab call.</param>
        /// <param name="sourceDir">Directory used to resolve the local file path of the page.</param>
        private void ThreadGrabDetailPage(IListSheet listSheet, int detailPageIndex, Proj_Detail_SingleLine detailPageInfo, string sourceDir)
        {
            DateTime startTime = DateTime.Now;

            string url = this.RunPage.DetailPageUrlList[detailPageIndex];
            string pageCookie = this.RunPage.DetailPageCookieList[detailPageIndex];
            string localFile = this.RunPage.GetFilePath(url, sourceDir);
            Dictionary<string, string> row = listSheet.GetRow(detailPageIndex);

            // Only hit the network when the page has not been saved locally yet.
            bool succeed = File.Exists(localFile)
                || this.GrabDetailPage(listSheet, url, row, localFile, detailPageIndex, detailPageInfo, pageCookie);

            this.RunPage.RefreshGrabCount(succeed);

            DateTime endTime = DateTime.Now;
            TimeSpan elapsed = endTime - startTime;

            string threadId = Thread.CurrentThread.ManagedThreadId.ToString();
            string pageNumber = (detailPageIndex + 1).ToString();
            string seconds = elapsed.TotalSeconds.ToString("0.00");
            string logText = "线程" + threadId + ": 抓取了第" + pageNumber + "个页面, 用时" + seconds + "秒";
            this.RunPage.InvokeAppendLogText(logText, LogLevelType.Normal, false);

            this.RunPage.RecordGrabDetailStatus(succeed, startTime, endTime);
        }
Esempio n. 2
0
        /// <summary>
        /// Worker-thread entry point: repeatedly asks the run page for the next detail
        /// page index and grabs that page until no index is left or no proxy is available.
        /// </summary>
        /// <param name="parameters">
        /// An <c>object[]</c> whose element 0 is the <see cref="IListSheet"/> and whose
        /// element 1 is the <see cref="Proj_Detail_SingleLine"/> describing the detail page.
        /// </param>
        private void ThreadGrabDetailPage(object parameters)
        {
            try
            {
                object[] parameterArray = (object[])parameters;
                IListSheet listSheet = (IListSheet)parameterArray[0];
                Proj_Detail_SingleLine detailPageInfo = (Proj_Detail_SingleLine)parameterArray[1];
                string sourceDir = this.RunPage.GetSourceFileDir(detailPageInfo);
                int? nextPageIndex = this.RunPage.GetNextGrabDetailPageIndex();

                while (nextPageIndex != null)
                {
                    try
                    {
                        this.ThreadGrabDetailPage(listSheet, nextPageIndex.Value, detailPageInfo, sourceDir);
                    }
                    catch (NoneProxyException ex)
                    {
                        // Without a usable proxy this worker cannot continue; stop it.
                        this.RunPage.InvokeAppendLogText("线程" + Thread.CurrentThread.ManagedThreadId.ToString() + ": 停止线程\r\n\r\n\r\n\r\n\r\n." + ex.Message, LogLevelType.System, true);
                        break;
                    }
                    catch (Exception ex)
                    {
                        // Any other failure is logged and the worker moves on to the next page.
                        this.RunPage.InvokeAppendLogText("线程" + Thread.CurrentThread.ManagedThreadId.ToString() + ": 出错!!!!!!!!!!!!" + ex.Message, LogLevelType.System, true);
                    }
                    nextPageIndex = this.RunPage.GetNextGrabDetailPageIndex();
                }
            }
            finally
            {
                // Every worker removes itself from the same list, and the list type is not
                // safe for concurrent mutation, so removals are serialized on the list
                // instance. The finally block ensures the thread is unregistered even when
                // an exception escapes the loop above.
                lock (this.RunPage.AllGrabDetailThreads)
                {
                    this.RunPage.AllGrabDetailThreads.Remove(Thread.CurrentThread);
                }
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Creates and starts the worker threads that grab detail pages in parallel.
        /// The number of threads comes from <c>detailPageInfo.ThreadCount</c>; each
        /// thread is registered in the run page's thread list before it is started.
        /// </summary>
        /// <param name="listSheet">Sheet handed to every worker thread.</param>
        /// <param name="detailPageInfo">Detail-page settings handed to every worker thread.</param>
        private void BeginGrabDetailPageInParallelThread(IListSheet listSheet, Proj_Detail_SingleLine detailPageInfo)
        {
            // The thread count is taken as configured; an earlier variant (left disabled
            // in the original) forced a single thread for the WebBrowserHtml access type.
            int workerCount = detailPageInfo.ThreadCount;

            this.RunPage.AllGrabDetailThreads = new List<Thread>();

            int started = 0;
            while (started < workerCount)
            {
                Thread worker = new Thread(new ParameterizedThreadStart(ThreadGrabDetailPage));
                this.RunPage.AllGrabDetailThreads.Add(worker);

                string startMessage = "线程" + worker.ManagedThreadId.ToString() + "开始抓取数据.";
                this.RunPage.InvokeAppendLogText(startMessage, LogLevelType.System, true);

                worker.Start(new object[] { listSheet, detailPageInfo });

                // Short pause between thread starts.
                Thread.Sleep(50);
                started++;
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Grabs all pending detail pages with parallel worker threads and blocks until
        /// either every needed page has been processed or no worker thread remains.
        /// </summary>
        /// <param name="listSheet">Sheet describing the pages to grab.</param>
        /// <param name="detailPageInfo">Detail-page settings.</param>
        /// <returns>
        /// <c>true</c> when nothing needs grabbing, or when the number of successful grabs
        /// equals the number of pages that needed grabbing; otherwise <c>false</c>.
        /// </returns>
        public override bool BeginGrabDetailPageInExternalProgram(IListSheet listSheet, Proj_Detail_SingleLine detailPageInfo)
        {
            string sourceDir = this.RunPage.GetSourceFileDir(detailPageInfo);
            this.RunPage.AllNeedGrabCount = this.InitGrabDetailPageIndexList(listSheet, sourceDir);

            // Nothing to do: report success immediately.
            if (this.RunPage.AllNeedGrabCount == 0)
            {
                return true;
            }

            this.BeginGrabDetailPageInParallelThread(listSheet, detailPageInfo);

            // Poll every five seconds until all pages are done or all workers have exited.
            while (!(this.RunPage.CompleteGrabCount >= this.RunPage.AllNeedGrabCount || this.RunPage.AllGrabDetailThreads.Count <= 0))
            {
                Thread.Sleep(5000);
            }

            return this.RunPage.SucceedGrabCount == this.RunPage.AllNeedGrabCount;
        }
Esempio n. 5
0
        /// <summary>
        /// Downloads every sub-page of a paginated detail page (<c>pageUrl?p=1</c>,
        /// <c>?p=2</c>, ...) until an empty or repeated response marks the end, saving each
        /// sub-page that is not already on disk. Finally writes the number of sub-pages
        /// to <paramref name="localPagePath"/>.
        /// </summary>
        /// <param name="listSheet">Sheet passed on to the give-up handler.</param>
        /// <param name="pageUrl">Base URL of the detail page; the page number is appended as <c>?p=N</c>.</param>
        /// <param name="listRow">List row forwarded to the request.</param>
        /// <param name="localPagePath">File that receives the sub-page count once all sub-pages are fetched.</param>
        /// <param name="pageIndex">Zero-based index of the detail page.</param>
        /// <param name="detailPageInfo">Request settings (encoding, proxy, timeout, ...).</param>
        /// <param name="cookie">Cookie sent with each request.</param>
        /// <param name="tabName">Not used by this implementation.</param>
        /// <returns>
        /// <c>true</c> when all sub-pages were fetched, or when the page failed and was
        /// given up on; <c>false</c> when the page failed and was not given up on.
        /// </returns>
        /// <exception cref="NoneProxyException">Rethrown unchanged so the caller can stop the worker.</exception>
        private bool GrabDetailPage(IListSheet listSheet, string pageUrl, Dictionary <string, string> listRow, string localPagePath, int pageIndex, Proj_Detail_SingleLine detailPageInfo, string cookie, string tabName)
        {
            // NOTE(review): pageName is not used below; the lookup is kept so that any
            // failure of the indexer for this pageIndex still surfaces here as before.
            // Confirm whether it can be dropped.
            string   pageName            = this.RunPage.DetailPageNameList[pageIndex];
            decimal  intervalAfterLoaded = detailPageInfo.IntervalAfterLoaded;
            Encoding encoding            = Encoding.GetEncoding(detailPageInfo.Encoding);
            string   lastWebPageText     = "";

            try
            {
                // The source directory does not depend on the sub-page, so resolve it once.
                string sourceDir        = this.RunPage.GetSourceFileDir(detailPageInfo);
                int    requestPageIndex = 1;

                // The loop ends via break when the last sub-page is detected, or via an exception.
                while (true)
                {
                    string indexPageUrl      = pageUrl + "?p=" + requestPageIndex.ToString();
                    string indexPageFilePath = this.RunPage.GetFilePath(indexPageUrl, sourceDir);
                    if (!File.Exists(indexPageFilePath))
                    {
                        string webPageText = GetTextByRequest(indexPageUrl, listRow, detailPageInfo.NeedProxy, intervalAfterLoaded, detailPageInfo.RequestTimeout, encoding, cookie, detailPageInfo.XRequestedWith, detailPageInfo.AutoAbandonDisableProxy, detailPageInfo.DataAccessType, detailPageInfo.CompleteChecks, detailPageInfo.IntervalProxyRequest);
                        if (!webPageText.Contains("\"msg\":\"ok\""))
                        {
                            throw new Exception("抓取出错: " + webPageText);
                        }

                        if (webPageText.Contains("\"data\":[]") || lastWebPageText == webPageText)
                        {
                            // Reached the last page: empty data, or the same content as the previous sub-page.
                            break;
                        }

                        this.RunPage.SaveFile(webPageText, indexPageFilePath, encoding);
                        lastWebPageText = webPageText;
                    }
                    requestPageIndex++;
                }

                // requestPageIndex points at the terminating sub-page, so the count is one less.
                this.RunPage.SaveFile((requestPageIndex - 1).ToString(), localPagePath, encoding);

                return true;
            }
            catch (NoneProxyException)
            {
                // Rethrow with "throw;" so the original stack trace is preserved.
                throw;
            }
            catch (Exception ex)
            {
                string innerMessage = ex.InnerException == null ? "" : ex.InnerException.Message;
                if (!detailPageInfo.AllowAutoGiveUp || !this.RunPage.GiveUpGrabPage(listSheet, pageUrl, pageIndex, ex))
                {
                    this.RunPage.InvokeAppendLogText("线程" + Thread.CurrentThread.ManagedThreadId.ToString() + ": PageUrl = " + pageUrl + ". " + ex.Message + innerMessage, LogLevelType.Error, true);
                    return false;
                }
                else
                {
                    this.RunPage.InvokeAppendLogText("线程" + Thread.CurrentThread.ManagedThreadId.ToString() + ": 放弃抓取. PageUrl = " + pageUrl + ". " + ex.Message + innerMessage, LogLevelType.Error, true);
                    return true;
                }
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Convenience overload that grabs a detail page using the current managed
        /// thread id (as a string) for the <c>tabName</c> argument.
        /// </summary>
        /// <returns>The result of the eight-parameter overload.</returns>
        private bool GrabDetailPage(IListSheet listSheet, string pageUrl, Dictionary <string, string> listRow, string localPagePath, int pageIndex, Proj_Detail_SingleLine detailPageInfo, string cookie)
        {
            return this.GrabDetailPage(
                listSheet,
                pageUrl,
                listRow,
                localPagePath,
                pageIndex,
                detailPageInfo,
                cookie,
                Thread.CurrentThread.ManagedThreadId.ToString());
        }
 /// <summary>
 /// Hook for grabbing detail pages one by one with a custom ProgramType.
 /// This base implementation does no work and reports success; derived
 /// classes override it to perform the actual grabbing.
 /// </summary>
 /// <param name="listSheet">Sheet describing the pages to grab.</param>
 /// <param name="detailPageInfo">Detail-page settings.</param>
 /// <returns>Always <c>true</c> in this base implementation.</returns>
 public virtual bool BeginGrabDetailPageInExternalProgram(IListSheet listSheet, Proj_Detail_SingleLine detailPageInfo)
 {
     bool succeeded = true;
     return succeeded;
 }