//Splits the task identified by m_TaskData.TaskID into per-thread sub-tasks,
//expands parameterized URLs into concrete addresses, and initializes the key
//data of the task. Navigation pages are decomposed later during gathering.
private void SplitTask()
{
    Task.cTask t = new Task.cTask();

    //Load the task definition for the given TaskID.
    try
    {
        t.LoadTask(Int64.Parse(m_TaskData.TaskID.ToString()));
    }
    catch (System.Exception)
    {
        //The task entity file failed to load (most likely the file is
        //missing). Still populate m_TaskData with empty defaults so the UI
        //can display the broken task and the user can delete it from the
        //interface — this is the deliberate handling for lost files.
        ResetTaskDataForMissingFile();
        return;
    }

    //Copy the definition into m_TaskData. The gather start/end positions
    //(plus page rules and URLs) are stored in the task XML file, not in
    //taskrun; m_TaskData itself is loaded from taskrun because that loader
    //is shared with the UI listing, so those values must be filled in here.
    m_TaskData.SavePath = t.SavePath;
    m_TaskData.TaskDemo = t.TaskDemo;
    m_TaskData.StartPos = t.StartPos;
    m_TaskData.EndPos = t.EndPos;
    m_TaskData.Cookie = t.Cookie;
    m_TaskData.WebCode = (cGlobalParas.WebCode)int.Parse(t.WebCode);
    m_TaskData.IsLogin = t.IsLogin;
    m_TaskData.LoginUrl = t.LoginUrl;
    m_TaskData.PublishType = (cGlobalParas.PublishType)int.Parse(t.ExportType);
    m_TaskData.IsUrlEncode = t.IsUrlEncode;
    m_TaskData.UrlEncode = t.UrlEncode;
    m_TaskData.GatherAgainNumber = t.GatherAgainNumber;
    m_TaskData.IsIgnore404 = t.IsIgnore404;
    m_TaskData.IsErrorLog = t.IsErrorLog;
    m_TaskData.IsDelRepRow = t.IsDelRepRow;
    m_TaskData.IsTrigger = t.IsTrigger;
    if (t.IsTrigger == true)
    {
        m_TaskData.TriggerType = t.TriggerType;
        m_TaskData.TriggerTask = t.TriggerTask;
    }

    //Load the page addresses and cut flags. Addresses carrying a "{...}"
    //parameter block are expanded into concrete URLs, so the total URL count
    //can change here and must be reconciled afterwards.
    Task.cUrlAnalyze u = new Task.cUrlAnalyze();
    for (int i = 0; i < t.WebpageLink.Count; i++)
    {
        if (Regex.IsMatch(t.WebpageLink[i].Weblink.ToString(), "{.*}"))
        {
            //NOTE(review): the original code branched on IsUrlEncode here,
            //but both branches called SplitWebUrl identically — the dead
            //conditional has been collapsed. If encoded URLs ever need a
            //different expansion, it belongs inside SplitWebUrl.
            List<string> Urls = u.SplitWebUrl(t.WebpageLink[i].Weblink.ToString());

            //Append one cWebLink per expanded URL, inheriting the flags
            //(and navigation rules, if any) of the parameterized source.
            for (int j = 0; j < Urls.Count; j++)
            {
                Task.cWebLink w = new Task.cWebLink();
                w.IsGathered = t.WebpageLink[i].IsGathered;
                w.IsNavigation = t.WebpageLink[i].IsNavigation;
                w.IsNextpage = t.WebpageLink[i].IsNextpage;
                w.NextPageRule = t.WebpageLink[i].NextPageRule;
                w.Weblink = Urls[j].ToString();
                if (t.WebpageLink[i].IsNavigation == true)
                {
                    w.NavigRules = t.WebpageLink[i].NavigRules;
                }
                m_TaskData.Weblink.Add(w);
            }
        }
        else
        {
            m_TaskData.Weblink.Add(t.WebpageLink[i]);
        }
    }
    u = null;

    m_TaskData.CutFlag = t.WebpageCutFlag;
    string sPath = m_TaskData.SavePath + "\\" + m_TaskData.TaskName + "_file";

    //Split into one sub-task per thread, but only when the URL count exceeds
    //the thread count and more than one thread is configured; otherwise all
    //URLs go into a single sub-task.
    if (m_TaskData.UrlCount > m_TaskData.ThreadCount && m_TaskData.ThreadCount > 1)
    {
        //URLs per sub-task, rounded up so every URL lands in some chunk.
        int SplitUrlCount = (int)Math.Ceiling((decimal)m_TaskData.UrlCount / (decimal)m_TaskData.ThreadCount);
        int StartIndex;
        int EndIndex = 0;
        for (int i = 1; i <= m_TaskData.ThreadCount; i++)
        {
            StartIndex = EndIndex;
            //The last chunk absorbs any remainder.
            if (i == m_TaskData.ThreadCount)
            {
                EndIndex = m_TaskData.Weblink.Count;
            }
            else
            {
                EndIndex = i * SplitUrlCount;
            }

            cGatherTaskSplit dtc = CreateSplitTask(sPath);
            List<Task.cWebLink> tWeblink = new List<Task.cWebLink>();
            for (int j = StartIndex; j < EndIndex; j++)
            {
                tWeblink.Add(m_TaskData.Weblink[j]);
            }
            //Hand this chunk's index range, links and cut flags to the sub-task.
            dtc.SetSplitData(StartIndex, EndIndex - 1, tWeblink, m_TaskData.CutFlag);
            m_TaskData.TaskSplitData.Add(dtc.TaskSplitData);
        }
    }
    else
    {
        //Single sub-task covering every URL.
        cGatherTaskSplit dtc = CreateSplitTask(sPath);
        dtc.SetSplitData(0, m_TaskData.UrlCount - 1, m_TaskData.Weblink, m_TaskData.CutFlag);
        m_TaskData.TaskSplitData.Add(dtc.TaskSplitData);
    }
    t = null;
}

//Resets m_TaskData to empty defaults so a task whose entity file is missing
//can still be shown in — and deleted from — the UI.
private void ResetTaskDataForMissingFile()
{
    m_TaskData.SavePath = "";
    m_TaskData.TaskDemo = "";
    m_TaskData.StartPos = "";
    m_TaskData.EndPos = "";
    m_TaskData.Cookie = "";
    m_TaskData.WebCode = cGlobalParas.WebCode.auto;
    m_TaskData.IsLogin = false;
    m_TaskData.LoginUrl = "";
    m_TaskData.PublishType = cGlobalParas.PublishType.NoPublish;
    m_TaskData.IsUrlEncode = false;
    m_TaskData.UrlEncode = "";
    m_TaskData.Weblink = null;
    m_TaskData.CutFlag = null;
}

//Creates a gather sub-task pre-populated with the settings shared by every
//sub-task of this task; savePath is the per-task result directory.
private cGatherTaskSplit CreateSplitTask(string savePath)
{
    cGatherTaskSplit dtc = new cGatherTaskSplit();
    dtc.TaskManage = m_TaskManage;
    dtc.TaskID = m_TaskData.TaskID;
    dtc.WebCode = m_TaskData.WebCode;
    dtc.IsUrlEncode = m_TaskData.IsUrlEncode;
    dtc.UrlEncode = m_TaskData.UrlEncode;
    dtc.Cookie = m_TaskData.Cookie;
    dtc.StartPos = m_TaskData.StartPos;
    dtc.EndPos = m_TaskData.EndPos;
    dtc.SavePath = savePath;
    dtc.AgainNumber = m_TaskData.GatherAgainNumber;
    dtc.Ignore404 = m_TaskData.IsIgnore404;
    dtc.IsErrorLog = m_TaskData.IsErrorLog;
    return dtc;
}
//Gathers pages that require navigation: resolves the navigation rules for
//the given URL and gathers every page address the resolution yields.
//Returns false when navigation parsing fails or a stop request interrupts
//the loop; otherwise true.
//NOTE(review): the log-message string literals in this copy were
//encoding-corrupted (mojibake); they have been restored from the intact
//duplicate of this routine elsewhere in the project.
private bool ParseGatherNavigationUrl(string Url, List<Task.cNavigRule> nRules)
{
    Task.cUrlAnalyze u = new Task.cUrlAnalyze();
    List<string> gUrls;
    bool IsSucceed = false;

    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "开始根据导航规则获取网页地址,请等待......\n导航层级为:" + nRules.Count + " 层\n", this.IsErrorLog));

    gUrls = u.ParseUrlRule(Url, nRules, m_WebCode, m_Cookie);
    u = null;

    if (gUrls == null || gUrls.Count == 0)
    {
        //Navigation parsing produced nothing: either the navigation rules
        //are misconfigured or the page is junk data; junk data does not
        //affect the rest of the gathering run.
        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + Url + " 导航解析失败,有可能是由于导航规则配置错误,也有可能是由于垃圾数据造成,如果是垃圾数据,则不影响系统对数据的采集\n", this.IsErrorLog));
        return false;
    }

    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "成功根据导航规则获取" + gUrls.Count + "个网址\n", this.IsErrorLog));

    //Navigation expanded one URL into gUrls.Count URLs: update this
    //sub-task's actual-URL counter and raise the event that re-initializes
    //the whole task's gather count. Only the actual (true) URL count is
    //touched here — the configured URL total is a separate value maintained
    //by its own logic.
    m_TaskSplitData.TrueUrlCount += gUrls.Count - 1;
    e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ReIni, gUrls.Count));

    for (int j = 0; j < gUrls.Count; j++)
    {
        if (m_ThreadRunning == true)
        {
            try
            {
                //Make relative links absolute: root-relative links ("/...")
                //reuse the scheme+host of Url; other relative links reuse
                //Url up to its last "/".
                if (string.Compare(gUrls[j].Substring(0, 4), "http", true) != 0)
                {
                    string PreUrl = Url;
                    if (gUrls[j].Substring(0, 1) == "/")
                    {
                        //Assumes Url starts with "http://" (7 characters) —
                        //https URLs would be mis-sliced here; TODO confirm.
                        PreUrl = PreUrl.Substring(7, PreUrl.Length - 7);
                        PreUrl = PreUrl.Substring(0, PreUrl.IndexOf("/"));
                        PreUrl = "http://" + PreUrl;
                    }
                    else
                    {
                        Match aa = Regex.Match(PreUrl, ".*/");
                        PreUrl = aa.Groups[0].Value.ToString();
                    }
                    IsSucceed = GatherParsedUrl(PreUrl + gUrls[j].ToString());
                }
                else
                {
                    IsSucceed = GatherParsedUrl(gUrls[j].ToString());
                }

                //Raise the gathered-URL counter event.
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Gathered, 0));
                m_TaskSplitData.GatheredTrueUrlCount++;
            }
            catch (System.Exception ex)
            {
                e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
                m_TaskSplitData.GatheredTrueErrUrlCount++;
                onError(ex);
            }
        }
        else if (m_ThreadRunning == false)
        {
            //A stop was requested: leave the loop and end the task early.
            //NOTE(review): j < gUrls.Count always holds inside this loop, so
            //this condition can never be true — a stop request therefore
            //always returns false. Kept as-is to preserve behavior.
            if (j == gUrls.Count)
            {
                //Everything was gathered after all.
                return true;
            }
            else
            {
                return false;
            }
        }
    }
    return true;
}
//Handles gathering for pages that need navigation: applies the navigation
//rules to the page at Url and gathers each address the resolution yields.
//Returns false if the rules resolve to nothing or a stop request interrupts
//the loop; otherwise true.
private bool ParseGatherNavigationUrl(string Url, List<Task.cNavigRule> nRules)
{
    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "开始根据导航规则获取网页地址,请等待......\n导航层级为:" + nRules.Count + " 层\n", this.IsErrorLog));

    Task.cUrlAnalyze analyzer = new Task.cUrlAnalyze();
    List<string> pageUrls = analyzer.ParseUrlRule(Url, nRules, m_WebCode, m_Cookie);
    analyzer = null;

    //No result means either a misconfigured rule set or junk data; junk
    //data is harmless to the rest of the gathering run.
    if (pageUrls == null || pageUrls.Count == 0)
    {
        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + Url + " 导航解析失败,有可能是由于导航规则配置错误,也有可能是由于垃圾数据造成,如果是垃圾数据,则不影响系统对数据的采集\n", this.IsErrorLog));
        return false;
    }

    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "成功根据导航规则获取" + pageUrls.Count + "个网址\n", this.IsErrorLog));

    //One source URL became pageUrls.Count gather targets: bump this
    //sub-task's actual-URL counter and raise the event that re-initializes
    //the overall task count. Only the actual count is touched here — the
    //configured URL total is maintained by separate logic.
    m_TaskSplitData.TrueUrlCount += pageUrls.Count - 1;
    e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ReIni, pageUrls.Count));

    bool IsSucceed = false;
    for (int idx = 0; idx < pageUrls.Count; idx++)
    {
        //A stop request ends the task early; the result reflects whether the
        //whole list had already been processed.
        if (m_ThreadRunning == false)
        {
            return idx == pageUrls.Count;
        }

        try
        {
            string target = pageUrls[idx].ToString();
            if (string.Compare(target.Substring(0, 4), "http", true) == 0)
            {
                //Already absolute — gather it directly.
                IsSucceed = GatherParsedUrl(target);
            }
            else
            {
                //Make the relative link absolute: root-relative links reuse
                //the host of Url, others reuse Url up to its last "/".
                string basePart = Url;
                if (target.Substring(0, 1) == "/")
                {
                    basePart = basePart.Substring(7, basePart.Length - 7);
                    basePart = basePart.Substring(0, basePart.IndexOf("/"));
                    basePart = "http://" + basePart;
                }
                else
                {
                    Match slashMatch = Regex.Match(basePart, ".*/");
                    basePart = slashMatch.Groups[0].Value.ToString();
                }
                IsSucceed = GatherParsedUrl(basePart + target);
            }

            //Signal one more gathered address.
            e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Gathered, 0));
            m_TaskSplitData.GatheredTrueUrlCount++;
        }
        catch (System.Exception ex)
        {
            e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
            m_TaskSplitData.GatheredTrueErrUrlCount++;
            onError(ex);
        }
    }
    return true;
}