/// <summary>
/// Processes one detail page on the calling thread: grabs it unless its local
/// file already exists, then updates the counters, writes a timing log line
/// and records the grab status on <c>RunPage</c>.
/// </summary>
/// <param name="listSheet">Sheet that supplies the row belonging to the page.</param>
/// <param name="detailPageIndex">Zero-based index into the detail page URL/cookie lists.</param>
/// <param name="detailPageInfo">Detail page configuration, forwarded to the grabber.</param>
/// <param name="sourceDir">Directory used to resolve the local file path of the page.</param>
private void ThreadGrabDetailPage(IListSheet listSheet, int detailPageIndex, Proj_Detail_SingleLine detailPageInfo, string sourceDir)
{
    DateTime startedAt = DateTime.Now;

    string url = this.RunPage.DetailPageUrlList[detailPageIndex];
    string pageCookie = this.RunPage.DetailPageCookieList[detailPageIndex];
    string localFile = this.RunPage.GetFilePath(url, sourceDir);
    Dictionary<string, string> row = listSheet.GetRow(detailPageIndex);

    // When the local file is already present no request is made and the page
    // counts as succeeded; otherwise the outcome is whatever the grabber reports.
    bool ok = File.Exists(localFile)
        || this.GrabDetailPage(listSheet, url, row, localFile, detailPageIndex, detailPageInfo, pageCookie);

    this.RunPage.RefreshGrabCount(ok);

    DateTime finishedAt = DateTime.Now;
    TimeSpan elapsed = finishedAt - startedAt;

    string threadId = Thread.CurrentThread.ManagedThreadId.ToString();
    string pageNumber = (detailPageIndex + 1).ToString();
    string seconds = elapsed.TotalSeconds.ToString("0.00");
    this.RunPage.InvokeAppendLogText(
        "线程" + threadId + ": 抓取了第" + pageNumber + "个页面, 用时" + seconds + "秒",
        LogLevelType.Normal,
        false);

    this.RunPage.RecordGrabDetailStatus(ok, startedAt, finishedAt);
}
// Guards removal from RunPage.AllGrabDetailThreads. Every worker thread removes
// itself from that shared list when it finishes, and List<T> does not support
// concurrent writers, so the removals are serialized through this gate.
private readonly object grabThreadListGate = new object();

/// <summary>
/// Worker-thread entry point. Keeps asking <c>RunPage</c> for the next detail
/// page index and grabs that page until no index is left, then removes the
/// current thread from <c>RunPage.AllGrabDetailThreads</c>.
/// </summary>
/// <param name="parameters">
/// An <c>object[]</c> whose element 0 is the <c>IListSheet</c> and element 1 is
/// the <c>Proj_Detail_SingleLine</c> configuration.
/// </param>
private void ThreadGrabDetailPage(object parameters)
{
    object[] parameterArray = (object[])parameters;
    IListSheet listSheet = (IListSheet)parameterArray[0];
    Proj_Detail_SingleLine detailPageInfo = (Proj_Detail_SingleLine)parameterArray[1];
    string sourceDir = this.RunPage.GetSourceFileDir(detailPageInfo);
    string threadLabel = "线程" + Thread.CurrentThread.ManagedThreadId.ToString();

    Nullable<int> nextPageIndex = this.RunPage.GetNextGrabDetailPageIndex();
    while (nextPageIndex.HasValue)
    {
        try
        {
            this.ThreadGrabDetailPage(listSheet, nextPageIndex.Value, detailPageInfo, sourceDir);
        }
        catch (NoneProxyException ex)
        {
            // This exception ends the worker: log it and leave the loop.
            this.RunPage.InvokeAppendLogText(threadLabel + ": 停止线程\r\n\r\n\r\n\r\n\r\n." + ex.Message, LogLevelType.System, true);
            break;
        }
        catch (Exception ex)
        {
            // Any other failure is logged and the worker moves on to the next page.
            this.RunPage.InvokeAppendLogText(threadLabel + ": 出错!!!!!!!!!!!!" + ex.Message, LogLevelType.System, true);
        }

        nextPageIndex = this.RunPage.GetNextGrabDetailPageIndex();
    }

    // Several workers can reach this point at the same time; an unsynchronized
    // List<T>.Remove from multiple threads can corrupt the list and its Count.
    lock (this.grabThreadListGate)
    {
        this.RunPage.AllGrabDetailThreads.Remove(Thread.CurrentThread);
    }
}
/// <summary>
/// Creates and starts the worker threads that grab detail pages, registering
/// each one in <c>RunPage.AllGrabDetailThreads</c> before it is started.
/// </summary>
/// <param name="listSheet">Sheet handed to every worker thread.</param>
/// <param name="detailPageInfo">Detail page configuration; its <c>ThreadCount</c> decides how many workers are started.</param>
private void BeginGrabDetailPageInParallelThread(IListSheet listSheet, Proj_Detail_SingleLine detailPageInfo)
{
    // The configured ThreadCount is used as-is. (An earlier, disabled variant
    // forced a single thread when DataAccessType was WebBrowserHtml.)
    int workerTotal = detailPageInfo.ThreadCount;
    this.RunPage.AllGrabDetailThreads = new List<Thread>();

    int started = 0;
    while (started < workerTotal)
    {
        Thread worker = new Thread(new ParameterizedThreadStart(ThreadGrabDetailPage));
        this.RunPage.AllGrabDetailThreads.Add(worker);

        string startMessage = "线程" + worker.ManagedThreadId.ToString() + "开始抓取数据.";
        this.RunPage.InvokeAppendLogText(startMessage, LogLevelType.System, true);

        object[] workerArguments = new object[] { listSheet, detailPageInfo };
        worker.Start(workerArguments);

        // Short pause between consecutive thread starts.
        Thread.Sleep(50);
        started++;
    }
}
/// <summary>
/// Grabs all pending detail pages with parallel worker threads and blocks
/// until every page has been processed or no worker thread is left.
/// </summary>
/// <param name="listSheet">Sheet describing the detail pages.</param>
/// <param name="detailPageInfo">Detail page configuration.</param>
/// <returns>
/// <c>true</c> when there was nothing to grab, or when the number of succeeded
/// pages equals the number of pages that needed grabbing; otherwise <c>false</c>.
/// </returns>
public override bool BeginGrabDetailPageInExternalProgram(IListSheet listSheet, Proj_Detail_SingleLine detailPageInfo)
{
    string sourceDir = this.RunPage.GetSourceFileDir(detailPageInfo);
    this.RunPage.AllNeedGrabCount = this.InitGrabDetailPageIndexList(listSheet, sourceDir);

    if (this.RunPage.AllNeedGrabCount == 0)
    {
        return true;
    }

    this.BeginGrabDetailPageInParallelThread(listSheet, detailPageInfo);

    // Poll every five seconds until all pages are complete or all workers are gone.
    while (true)
    {
        bool stillPending = this.RunPage.CompleteGrabCount < this.RunPage.AllNeedGrabCount;
        if (!stillPending || this.RunPage.AllGrabDetailThreads.Count <= 0)
        {
            break;
        }

        Thread.Sleep(5000);
    }

    return this.RunPage.SucceedGrabCount == this.RunPage.AllNeedGrabCount;
}
/// <summary>
/// Downloads every numbered sub-page of a detail page (<c>pageUrl + "?p=N"</c>,
/// starting at N = 1) and saves each one to its own local file. When the last
/// sub-page is detected, the value <c>requestPageIndex - 1</c> is written to
/// <paramref name="localPagePath"/>.
/// </summary>
/// <param name="listSheet">Sheet passed to <c>RunPage.GiveUpGrabPage</c> when a failure occurs.</param>
/// <param name="pageUrl">Base URL of the detail page; the sub-page number is appended as <c>?p=N</c>.</param>
/// <param name="listRow">Row of the page, forwarded to the request helper.</param>
/// <param name="localPagePath">File that receives the final index value once the loop ends.</param>
/// <param name="pageIndex">Index of the page, passed to <c>RunPage.GiveUpGrabPage</c>.</param>
/// <param name="detailPageInfo">Detail page configuration (encoding, proxy, timeout and related request settings).</param>
/// <param name="cookie">Cookie forwarded to the request helper.</param>
/// <param name="tabName">Not used by this method; kept for signature compatibility.</param>
/// <returns>
/// <c>true</c> when all sub-pages were grabbed, or when the failure was given up
/// through <c>RunPage.GiveUpGrabPage</c>; <c>false</c> when grabbing failed.
/// </returns>
/// <exception cref="NoneProxyException">Rethrown unchanged to the caller.</exception>
private bool GrabDetailPage(IListSheet listSheet, string pageUrl, Dictionary<string, string> listRow, string localPagePath, int pageIndex, Proj_Detail_SingleLine detailPageInfo, string cookie, string tabName)
{
    decimal intervalAfterLoaded = detailPageInfo.IntervalAfterLoaded;
    Encoding encoding = Encoding.GetEncoding(detailPageInfo.Encoding);
    string lastWebPageText = "";
    try
    {
        // The source directory does not depend on the sub-page, so resolve it once.
        string sourceDir = this.RunPage.GetSourceFileDir(detailPageInfo);
        int requestPageIndex = 1;
        while (true)
        {
            string indexPageUrl = pageUrl + "?p=" + requestPageIndex.ToString();
            string indexPageFilePath = this.RunPage.GetFilePath(indexPageUrl, sourceDir);

            // Sub-pages that are already on disk are skipped without a request.
            if (!File.Exists(indexPageFilePath))
            {
                string webPageText = GetTextByRequest(indexPageUrl, listRow, detailPageInfo.NeedProxy, intervalAfterLoaded, detailPageInfo.RequestTimeout, encoding, cookie, detailPageInfo.XRequestedWith, detailPageInfo.AutoAbandonDisableProxy, detailPageInfo.DataAccessType, detailPageInfo.CompleteChecks, detailPageInfo.IntervalProxyRequest);

                if (!webPageText.Contains("\"msg\":\"ok\""))
                {
                    throw new Exception("抓取出错: " + webPageText);
                }

                // Reached the last page: the data array is empty, or the response
                // is identical to the previously downloaded sub-page.
                if (webPageText.Contains("\"data\":[]") || lastWebPageText == webPageText)
                {
                    break;
                }

                this.RunPage.SaveFile(webPageText, indexPageFilePath, encoding);
                lastWebPageText = webPageText;
            }

            requestPageIndex++;
        }

        this.RunPage.SaveFile((requestPageIndex - 1).ToString(), localPagePath, encoding);
        return true;
    }
    catch (NoneProxyException)
    {
        // Rethrow with "throw;" so the original stack trace is preserved.
        throw;
    }
    catch (Exception ex)
    {
        // Giving up is only attempted when the configuration allows it.
        bool gaveUp = detailPageInfo.AllowAutoGiveUp && this.RunPage.GiveUpGrabPage(listSheet, pageUrl, pageIndex, ex);

        string failureDetail = "PageUrl = " + pageUrl + ". " + ex.Message + (ex.InnerException == null ? "" : ex.InnerException.Message);
        string logPrefix = "线程" + Thread.CurrentThread.ManagedThreadId.ToString() + ": ";
        this.RunPage.InvokeAppendLogText(logPrefix + (gaveUp ? "放弃抓取. " : "") + failureDetail, LogLevelType.Error, true);

        // A page that was given up is reported as handled (true); a plain failure as false.
        return gaveUp;
    }
}
/// <summary>
/// Convenience overload that grabs a detail page, passing the current managed
/// thread id (as text) for the <c>tabName</c> argument of the full overload.
/// </summary>
/// <param name="listSheet">Sheet forwarded to the full overload.</param>
/// <param name="pageUrl">URL of the detail page.</param>
/// <param name="listRow">Row belonging to the page.</param>
/// <param name="localPagePath">Local file path of the page.</param>
/// <param name="pageIndex">Index of the page.</param>
/// <param name="detailPageInfo">Detail page configuration.</param>
/// <param name="cookie">Cookie for the request.</param>
/// <returns>The result of the full overload.</returns>
private bool GrabDetailPage(IListSheet listSheet, string pageUrl, Dictionary<string, string> listRow, string localPagePath, int pageIndex, Proj_Detail_SingleLine detailPageInfo, string cookie)
{
    return this.GrabDetailPage(
        listSheet,
        pageUrl,
        listRow,
        localPagePath,
        pageIndex,
        detailPageInfo,
        cookie,
        Thread.CurrentThread.ManagedThreadId.ToString());
}
/// <summary>
/// Grabs the detail pages one by one using a custom ProgramType implementation.
/// This default implementation does nothing and reports success.
/// </summary>
/// <param name="listSheet">Sheet describing the detail pages; not used by the default implementation.</param>
/// <param name="detailPageInfo">Detail page configuration; not used by the default implementation.</param>
/// <returns>Always <c>true</c> in the default implementation.</returns>
public virtual bool BeginGrabDetailPageInExternalProgram(IListSheet listSheet, Proj_Detail_SingleLine detailPageInfo)
{
    return true;
}