/// <summary>
/// Handles the error event raised by a split gather task.
/// </summary>
/// <param name="sender">The object that raised the event; it is cast to cGatherTaskSplit.</param>
/// <param name="e">Event data whose Error value is forwarded to e_TaskError subscribers.</param>
private void TaskThreadError(object sender, TaskThreadErrorEventArgs e)
{
    // When a gather error occurs, first check whether the machine is still
    // connected to the Internet. Without a connection the task is stopped,
    // marked as failed, and the failure event is raised.
    if (!cTool.IsLinkInternet())
    {
        Stop();
        m_State = cGlobalParas.TaskState.Failed;

        if (e_TaskFailed != null)
        {
            e_TaskFailed(this, new cTaskEventArgs(TaskID, TaskName, false));
        }

        return;
    }

    // Reaching this handler also means one URL has been processed (with an error).
    // An error in one worker does not stop the task here: the error is only
    // forwarded to the e_TaskError subscribers.
    cGatherTaskSplit failedSplit = (cGatherTaskSplit)sender;

    if (e_TaskError == null)
    {
        return;
    }

    e_TaskError(this, new TaskErrorEventArgs(failedSplit, e.Error));
}
/// <summary>
/// Handles the completed event of a single split gather task. When the sender's
/// URL count equals its failed plus gathered URL counts, onTaskCompleted() is
/// called so the task itself can decide whether it has finished.
/// </summary>
/// <param name="sender">The object that raised the event; it is cast to cGatherTaskSplit.</param>
/// <param name="e">Event data (not used by this handler).</param>
private void TaskWorkThreadCompleted(object sender, cTaskEventArgs e)
{
    cGatherTaskSplit finishedSplit = (cGatherTaskSplit)sender;

    // Some URLs of this split are still unprocessed: nothing to report yet.
    if (finishedSplit.UrlCount != finishedSplit.GatherErrUrlCount + finishedSplit.GatheredUrlCount)
    {
        return;
    }

    // Every URL of this split has been either gathered or failed.
    onTaskCompleted();
}
/// <summary>
/// Subscribes this task's handlers to the events of a split gather task.
/// Does nothing when the split task is already flagged as initialized.
/// </summary>
/// <param name="dtc">The split gather task whose events are bound.</param>
private void TaskEventInit(cGatherTaskSplit dtc)
{
    if (dtc.IsInitialized)
    {
        return;
    }

    // Bind the init, completed, URL-count, log, data and error events.
    dtc.TaskInit += this.TaskWorkThreadInit;
    dtc.Completed += this.TaskWorkThreadCompleted;
    dtc.GUrlCount += this.onGUrlCount;
    dtc.Log += this.onLog;
    dtc.GData += this.onGData;
    dtc.Error += this.TaskThreadError;

    // Mark the split task so the handlers are not bound a second time.
    dtc.IsInitialized = true;
}
/// <summary>
/// Handles the initialization event raised by a split gather task: stores the
/// reported task ID in m_TaskData and, if anyone subscribed, queues the
/// e_TaskThreadInitialized event through the task manager's event proxy.
/// </summary>
/// <param name="sender">The object that raised the event; it is cast to cGatherTaskSplit.</param>
/// <param name="e">Event data carrying the task ID.</param>
private void TaskWorkThreadInit(object sender, TaskInitializedEventArgs e)
{
    // The cast result is not used afterwards; the cast itself is kept so the
    // handler behaves as before for any sender.
    cGatherTaskSplit sourceSplit = (cGatherTaskSplit)sender;

    m_TaskData.TaskID = e.TaskID;

    if (e_TaskThreadInitialized == null)
    {
        return;
    }

    // Raise the task-initialized event through the event proxy.
    m_TaskManage.EventProxy.AddEvent(delegate()
    {
        e_TaskThreadInitialized(this, new TaskInitializedEventArgs(m_TaskData.TaskID));
    });
}
/// <summary>
/// Removes a split gather task from the waiting collection and restarts it.
/// </summary>
/// <param name="dtc">The split gather task to take out of m_WaitingWorkThread and restart.</param>
internal void ReStartWaitingWorkThread(cGatherTaskSplit dtc)
{
    // Take the worker out of the waiting collection first, then restart it.
    m_WaitingWorkThread.Remove(dtc);

    dtc.ReStart();
}
/// <summary>
/// Assigns the wait time from cGatherTaskList to a split gather task and adds
/// the task to the waiting collection.
/// </summary>
/// <param name="dtc">The split gather task to put into m_WaitingWorkThread.</param>
internal void AddWaitingWorkThread(cGatherTaskSplit dtc)
{
    // The wait time is copied from cGatherTaskList.Waittime before the worker is queued.
    dtc.Waittime = cGatherTaskList.Waittime;

    m_WaitingWorkThread.Add(dtc);
}
/// <summary>
/// Creates the event data for a task error.
/// </summary>
/// <param name="dtc">The split gather task associated with the error; stored in m_ErrorThread.</param>
/// <param name="error">The exception being reported; stored in m_Error.</param>
public TaskErrorEventArgs(cGatherTaskSplit dtc, Exception error)
{
    m_ErrorThread = dtc;
    m_Error = error;
}
/// <summary>
/// Initializes the split gather tasks (workers) of this task. Runs only while
/// m_IsDataInitialized is false and sets that flag when done.
/// </summary>
/// <remarks>
/// Two situations are covered: a task that already has split data (one worker
/// is created per cTaskSplitData entry) and a task without split data (a
/// single worker is created). If IsCompleted is true, no worker is created and
/// the state is set to Completed; no completion event is raised here.
/// </remarks>
private void TaskInit()
{
    string sPath = m_TaskData.SavePath + "\\" + m_TaskData.TaskName + "_file";

    if (m_IsDataInitialized)
    {
        return;
    }

    // Stop and discard any workers left over from an earlier initialization.
    if (m_list_GatherTaskSplit.Count > 0)
    {
        foreach (cGatherTaskSplit staleWorker in m_list_GatherTaskSplit)
        {
            staleWorker.Stop();
        }

        m_list_GatherTaskSplit.Clear();
    }

    if (IsCompleted)
    {
        // The task has already gathered everything: only record the state.
        m_State = cGlobalParas.TaskState.Completed;
    }
    else
    {
        if (m_TaskData.TaskSplitData.Count == 0)
        {
            // No split data is available: create a single worker.
            cGatherTaskSplit singleWorker = new cGatherTaskSplit();
            singleWorker.TaskManage = m_TaskManage;
            singleWorker.TaskID = m_TaskData.TaskID;
            singleWorker.WebCode = m_TaskData.WebCode;
            singleWorker.IsUrlEncode = m_TaskData.IsUrlEncode;
            singleWorker.UrlEncode = m_TaskData.UrlEncode;
            singleWorker.Cookie = m_TaskData.Cookie;
            singleWorker.StartPos = m_TaskData.StartPos;
            singleWorker.EndPos = m_TaskData.EndPos;
            singleWorker.SavePath = sPath;
            singleWorker.AgainNumber = m_TaskData.GatherAgainNumber;
            singleWorker.Ignore404 = m_TaskData.IsIgnore404;
            singleWorker.IsErrorLog = m_TaskData.IsErrorLog;

            m_list_GatherTaskSplit.Add(singleWorker);
        }
        else
        {
            // One worker per split-data entry.
            foreach (cTaskSplitData configData in m_TaskData.TaskSplitData)
            {
                cGatherTaskSplit worker = new cGatherTaskSplit();
                worker.TaskManage = m_TaskManage;
                worker.TaskID = m_TaskData.TaskID;
                worker.WebCode = m_TaskData.WebCode;
                worker.IsUrlEncode = m_TaskData.IsUrlEncode;
                worker.UrlEncode = m_TaskData.UrlEncode;
                worker.Cookie = m_TaskData.Cookie;
                worker.StartPos = m_TaskData.StartPos;
                worker.EndPos = m_TaskData.EndPos;
                worker.SavePath = sPath;
                worker.AgainNumber = m_TaskData.GatherAgainNumber;
                worker.Ignore404 = m_TaskData.IsIgnore404;
                worker.IsErrorLog = m_TaskData.IsErrorLog;
                worker.TaskSplitData = configData;

                m_list_GatherTaskSplit.Add(worker);
            }
        }

        // Bind this task's event handlers to every worker.
        foreach (cGatherTaskSplit worker in m_list_GatherTaskSplit)
        {
            TaskEventInit(worker);
        }
    }

    m_IsDataInitialized = true;
}
/// <summary>
/// Loads the task definition for m_TaskData.TaskID, copies its settings into
/// m_TaskData, expands parameterized URLs (those matching "{.*}") into
/// concrete URLs, and partitions the URL list into split-task data blocks
/// that are appended to m_TaskData.TaskSplitData.
/// </summary>
/// <remarks>
/// If the task definition cannot be loaded, m_TaskData is filled with blank
/// values and the method returns without creating any split data.
/// The URL list is split across threads only when UrlCount is greater than
/// ThreadCount and ThreadCount is greater than 1; otherwise a single block
/// is created.
/// </remarks>
private void SplitTask()
{
    Task.cTask t = new Task.cTask();

    // Load the task definition for the current task ID.
    try
    {
        t.LoadTask(Int64.Parse(m_TaskData.TaskID.ToString()));
    }
    catch (System.Exception)
    {
        // Loading the task definition failed, possibly because its file is
        // missing. Blank values are still stored so the UI can display the
        // lost task and the user can delete it from there.
        m_TaskData.SavePath = "";
        m_TaskData.TaskDemo = "";
        m_TaskData.StartPos = "";
        m_TaskData.EndPos = "";
        m_TaskData.Cookie = "";
        m_TaskData.WebCode = cGlobalParas.WebCode.auto;
        m_TaskData.IsLogin = false;
        m_TaskData.LoginUrl = "";
        m_TaskData.PublishType = cGlobalParas.PublishType.NoPublish;
        m_TaskData.IsUrlEncode = false;
        m_TaskData.UrlEncode = "";
        m_TaskData.Weblink = null;
        m_TaskData.CutFlag = null;

        return;
    }

    // Per the original author's note: the start/end positions, gather rules
    // and URLs live in the task's XML file rather than in taskrun, while
    // m_TaskData was loaded from taskrun, so they are copied over here.
    m_TaskData.SavePath = t.SavePath;
    m_TaskData.TaskDemo = t.TaskDemo;
    m_TaskData.StartPos = t.StartPos;
    m_TaskData.EndPos = t.EndPos;
    m_TaskData.Cookie = t.Cookie;
    m_TaskData.WebCode = (cGlobalParas.WebCode) int.Parse(t.WebCode);
    m_TaskData.IsLogin = t.IsLogin;
    m_TaskData.LoginUrl = t.LoginUrl;
    m_TaskData.PublishType = (cGlobalParas.PublishType) int.Parse(t.ExportType);
    m_TaskData.IsUrlEncode = t.IsUrlEncode;
    m_TaskData.UrlEncode = t.UrlEncode;
    m_TaskData.GatherAgainNumber = t.GatherAgainNumber;
    m_TaskData.IsIgnore404 = t.IsIgnore404;
    m_TaskData.IsErrorLog = t.IsErrorLog;
    m_TaskData.IsDelRepRow = t.IsDelRepRow;
    m_TaskData.IsTrigger = t.IsTrigger;

    if (t.IsTrigger == true)
    {
        m_TaskData.TriggerType = t.TriggerType;
        m_TaskData.TriggerTask = t.TriggerTask;
    }

    // Load the URLs. A URL containing a "{...}" parameter is expanded into
    // its concrete URLs, so the number of entries in m_TaskData.Weblink may
    // differ from the number of entries in the task definition.
    Task.cUrlAnalyze u = new Task.cUrlAnalyze();

    for (int i = 0; i < t.WebpageLink.Count; i++)
    {
        Task.cWebLink sourceLink = t.WebpageLink[i];

        if (!Regex.IsMatch(sourceLink.Weblink.ToString(), "{.*}"))
        {
            m_TaskData.Weblink.Add(sourceLink);
            continue;
        }

        // NOTE(review): the original code branched on m_TaskData.IsUrlEncode
        // here but made the identical SplitWebUrl call in both branches;
        // confirm whether the URL encoding was meant to be passed along.
        List<string> Urls = u.SplitWebUrl(sourceLink.Weblink.ToString());

        // Create one link entry per expanded URL, copying the source link's flags.
        for (int k = 0; k < Urls.Count; k++)
        {
            Task.cWebLink w = new Task.cWebLink();
            w.IsGathered = sourceLink.IsGathered;
            w.IsNavigation = sourceLink.IsNavigation;
            w.IsNextpage = sourceLink.IsNextpage;
            w.NextPageRule = sourceLink.NextPageRule;
            w.Weblink = Urls[k].ToString();

            // Navigation rules are copied only for navigation links.
            if (sourceLink.IsNavigation == true)
            {
                w.NavigRules = sourceLink.NavigRules;
            }

            m_TaskData.Weblink.Add(w);
        }
    }

    m_TaskData.CutFlag = t.WebpageCutFlag;

    string sPath = m_TaskData.SavePath + "\\" + m_TaskData.TaskName + "_file";

    // Split the task into blocks only when there are more URLs than threads
    // and more than one thread.
    if (m_TaskData.UrlCount > m_TaskData.ThreadCount && m_TaskData.ThreadCount > 1)
    {
        int SplitUrlCount = (int)Math.Ceiling((decimal)m_TaskData.UrlCount / (decimal)m_TaskData.ThreadCount);
        int totalLinks = m_TaskData.Weblink.Count;

        // Each block covers the half-open index range [StartIndex, EndIndex).
        int StartIndex = 0;
        int EndIndex = 0;

        for (int i = 1; i <= m_TaskData.ThreadCount; i++)
        {
            StartIndex = EndIndex;

            if (i == m_TaskData.ThreadCount)
            {
                // The last block takes whatever is left.
                EndIndex = totalLinks;
            }
            else
            {
                // Because the block size is rounded up, i * SplitUrlCount can
                // exceed the number of links (e.g. 5 URLs on 4 threads gives a
                // block size of 2, so the third block would end at 6). Clamp
                // to the list size so indexing below cannot go out of range;
                // trailing blocks may then be empty.
                EndIndex = Math.Min(i * SplitUrlCount, totalLinks);
            }

            cGatherTaskSplit dtc = CreateSplitTaskWorker(sPath);

            List<Task.cWebLink> tWeblink = new List<Task.cWebLink>();
            for (int j = StartIndex; j < EndIndex; j++)
            {
                tWeblink.Add(m_TaskData.Weblink[j]);
            }

            // Initialize the block's data; the end index passed on is inclusive.
            dtc.SetSplitData(StartIndex, EndIndex - 1, tWeblink, m_TaskData.CutFlag);

            m_TaskData.TaskSplitData.Add(dtc.TaskSplitData);
        }
    }
    else
    {
        // A single block covering the whole URL list.
        cGatherTaskSplit dtc = CreateSplitTaskWorker(sPath);

        // NOTE(review): the end index uses UrlCount while the list passed is
        // m_TaskData.Weblink; confirm UrlCount always equals Weblink.Count here.
        dtc.SetSplitData(0, m_TaskData.UrlCount - 1, m_TaskData.Weblink, m_TaskData.CutFlag);

        m_TaskData.TaskSplitData.Add(dtc.TaskSplitData);
    }
}

/// <summary>
/// Creates a split gather task and copies the task-wide settings from
/// m_TaskData and m_TaskManage into it.
/// </summary>
/// <param name="savePath">The value assigned to the new worker's SavePath.</param>
/// <returns>The configured split gather task; its split data is not set here.</returns>
private cGatherTaskSplit CreateSplitTaskWorker(string savePath)
{
    cGatherTaskSplit dtc = new cGatherTaskSplit();
    dtc.TaskManage = m_TaskManage;
    dtc.TaskID = m_TaskData.TaskID;
    dtc.WebCode = m_TaskData.WebCode;
    dtc.IsUrlEncode = m_TaskData.IsUrlEncode;
    dtc.UrlEncode = m_TaskData.UrlEncode;
    dtc.Cookie = m_TaskData.Cookie;
    dtc.StartPos = m_TaskData.StartPos;
    dtc.EndPos = m_TaskData.EndPos;
    dtc.SavePath = savePath;
    dtc.AgainNumber = m_TaskData.GatherAgainNumber;
    dtc.Ignore404 = m_TaskData.IsIgnore404;
    dtc.IsErrorLog = m_TaskData.IsErrorLog;
    return dtc;
}
/// <summary>
/// Creates the event data for a task error.
/// </summary>
/// <param name="dtc">The split block in which the error occurred; stored in m_ErrorThread.</param>
/// <param name="error">The caught exception; stored in m_Error.</param>
public TaskErrorEventArgs(cGatherTaskSplit dtc, Exception error)
{
    m_ErrorThread = dtc;
    m_Error = error;
}