/// <summary>
/// Gathers the data of one web page and, when <paramref name="IsNext"/> is set,
/// keeps following the "next page" link until no link is found or the link
/// resolves to the page that was just processed.
/// </summary>
/// <param name="Url">Address of the (first) page to gather.</param>
/// <param name="IsNext">True to follow "next page" links after each page.</param>
/// <param name="NextRule">
/// Pattern fragment concatenated, unescaped, into the next-page regex: an href value
/// is taken as the next page when the text after it (up to the next '&lt;') ends with
/// this fragment.
/// </param>
/// <returns>
/// True when no exception occurred; false when an exception was caught, logged,
/// counted as an error and passed to <c>onError</c>.
/// </returns>
private bool GatherSingleUrl(string Url, bool IsNext, string NextRule)
{
    cGatherWeb gWeb = new cGatherWeb();
    DataTable tmpData;
    string NextUrl = Url;
    string Old_Url = NextUrl;

    bool IsAjax = (m_TaskType == cGlobalParas.TaskType.AjaxHtmlByUrl);

    try
    {
        if (IsNext)
        {
            do
            {
                Url = NextUrl;
                Old_Url = NextUrl;

                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "正在采集:" + Url + "\n", this.IsErrorLog));

                if (m_IsUrlEncode == true)
                {
                    Url = cTool.UrlEncode(Url, (cGlobalParas.WebCode)int.Parse(m_UrlEncode));
                }

                tmpData = GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);

                if (tmpData != null)
                {
                    m_GatherData.Merge(tmpData);

                    // Raise the data event only when rows were actually gathered.
                    if (tmpData.Rows.Count != 0)
                    {
                        e_GData(this, new cGatherDataEventArgs(m_TaskID, tmpData));
                    }
                }

                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "采集完成:" + Url + "\n", this.IsErrorLog));
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "开始根据下一页规则获取下一页网址\n", this.IsErrorLog));

                // Fetch the page source again and extract the href of the "next page" link.
                string webSource = gWeb.GetHtml(Url, m_WebCode, m_Cookie, "", "", true, IsAjax);
                string NRule = "((?<=href=[\'|\"])\\S[^#+$<>\\s]*(?=[\'|\"]))[^<]*(?<=" + NextRule + ")";
                Match charSetMatch = Regex.Match(webSource, NRule, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                string strNext = charSetMatch.Groups[1].Value;

                if (strNext != "")
                {
                    // Turn a relative link into an absolute address based on the current page.
                    if (strNext[0] == '/')
                    {
                        // Root-relative: keep "scheme://host" of the current page. Works for any
                        // scheme and for a current URL that has no path component.
                        int schemeEnd = Url.IndexOf("://", StringComparison.Ordinal);
                        int hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
                        int pathStart = Url.IndexOf('/', hostStart);
                        string siteRoot = (pathStart < 0) ? Url : Url.Substring(0, pathStart);
                        strNext = siteRoot + strNext;
                    }
                    else if (strNext.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                          || strNext.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                    {
                        // Already absolute: use as is.
                    }
                    else if (strNext[0] == '?')
                    {
                        // Query-only link: replace the query of the current page. When the current
                        // URL has no query the whole URL is the base.
                        int queryStart = Url.LastIndexOf('?');
                        string pageUrl = (queryStart < 0) ? Url : Url.Substring(0, queryStart);
                        strNext = pageUrl + strNext;
                    }
                    else
                    {
                        // Document-relative: prefix everything up to the last '/' of the current URL.
                        Match aa = Regex.Match(Url, ".*/");
                        string PreUrl = aa.Groups[0].Value.ToString();
                        strNext = PreUrl + strNext;
                    }

                    // Log the address that will be visited next (not the page just processed).
                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "下一页网址获取成功:" + strNext + "\n", this.IsErrorLog));
                }
                else
                {
                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "已经到最终页" + NextUrl + "\n", this.IsErrorLog));
                }

                NextUrl = strNext;
            }
            while (NextUrl != "" && Old_Url != NextUrl);
        }
        else
        {
            if (m_IsUrlEncode == true)
            {
                Url = cTool.UrlEncode(Url, (cGlobalParas.WebCode)int.Parse(m_UrlEncode));
            }

            tmpData = GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);

            if (tmpData != null)
            {
                m_GatherData.Merge(tmpData);

                // Raise the data event only when rows were actually gathered.
                if (tmpData.Rows.Count != 0)
                {
                    e_GData(this, new cGatherDataEventArgs(m_TaskID, tmpData));
                }
            }

            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "采集完成:" + Url + "\n", this.IsErrorLog));
        }

        // Report the URL as gathered and bump the counter.
        e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Gathered, 0));
        m_TaskSplitData.GatheredTrueUrlCount++;
    }
    catch (System.Exception ex)
    {
        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + "采集发生错误:" + ex.Message + "\n", this.IsErrorLog));
        e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
        e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ErrUrlCountAdd, 0));
        m_TaskSplitData.GatheredTrueErrUrlCount++;
        m_TaskSplitData.GatheredErrUrlCount++;
        onError(ex);
        return false;
    }

    return true;
}
/// <summary>
/// Entry point for gathering an address that carries navigation rules.
/// This method handles the "next page" rule itself and hands every page to
/// <c>ParseGatherNavigationUrl</c>, which processes the page-navigation rules.
/// </summary>
/// <param name="Url">Address of the (first) page to process.</param>
/// <param name="nRules">Navigation rules passed through to <c>ParseGatherNavigationUrl</c>.</param>
/// <param name="IsNext">True to follow "next page" links after each page.</param>
/// <param name="NextRule">
/// Pattern fragment concatenated, unescaped, into the next-page regex: an href value
/// is taken as the next page when the text after it (up to the next '&lt;') ends with
/// this fragment.
/// </param>
/// <returns>
/// The result of the last <c>ParseGatherNavigationUrl</c> call; false when an exception
/// was caught. When <c>m_ThreadRunning</c> is false at the top of a loop pass the method
/// returns early: true if there is no further page pending, false otherwise.
/// </returns>
private bool GatherNavigationUrl(string Url, List<Task.cNavigRule> nRules, bool IsNext, string NextRule)
{
    cGatherWeb gWeb = new cGatherWeb();
    string NextUrl = Url;
    string Old_Url = NextUrl;
    bool IsSucceed = false;

    try
    {
        if (IsNext)
        {
            // Does not change between pages, so it is evaluated once.
            bool IsAjax = (m_TaskType == cGlobalParas.TaskType.AjaxHtmlByUrl);

            do
            {
                if (m_ThreadRunning == true)
                {
                    Url = NextUrl;
                    Old_Url = NextUrl;

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "正在采集:" + Url + "\n", this.IsErrorLog));

                    IsSucceed = ParseGatherNavigationUrl(Url, nRules);

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "采集完成:" + Url + "\n", this.IsErrorLog));
                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "开始根据下一页规则获取下一页网址\n", this.IsErrorLog));

                    // Fetch the page source and extract the href of the "next page" link.
                    string webSource = gWeb.GetHtml(Url, m_WebCode, m_Cookie, "", "", true, IsAjax);
                    string NRule = "((?<=href=[\'|\"])\\S[^#+$<>\\s]*(?=[\'|\"]))[^<]*(?<=" + NextRule + ")";
                    Match charSetMatch = Regex.Match(webSource, NRule, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                    string strNext = charSetMatch.Groups[1].Value;

                    if (strNext != "")
                    {
                        // Turn a relative link into an absolute address based on the current page.
                        if (strNext[0] == '/')
                        {
                            // Root-relative: keep "scheme://host" of the current page. Works for any
                            // scheme and for a current URL that has no path component.
                            int schemeEnd = Url.IndexOf("://", StringComparison.Ordinal);
                            int hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
                            int pathStart = Url.IndexOf('/', hostStart);
                            string siteRoot = (pathStart < 0) ? Url : Url.Substring(0, pathStart);
                            strNext = siteRoot + strNext;
                        }
                        else if (strNext.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                              || strNext.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                        {
                            // Already absolute: use as is.
                        }
                        else if (strNext[0] == '?')
                        {
                            // Query-only link: replace the query of the current page. When the
                            // current URL has no query the whole URL is the base.
                            int queryStart = Url.LastIndexOf('?');
                            string pageUrl = (queryStart < 0) ? Url : Url.Substring(0, queryStart);
                            strNext = pageUrl + strNext;
                        }
                        else
                        {
                            // Document-relative: prefix everything up to the last '/' of the current URL.
                            Match aa = Regex.Match(Url, ".*/");
                            string PreUrl = aa.Groups[0].Value.ToString();
                            strNext = PreUrl + strNext;
                        }

                        // Log the address that will be visited next (not the page just processed).
                        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "下一页网址获取成功:" + strNext + "\n", this.IsErrorLog));
                    }
                    else
                    {
                        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "已经到最终页" + "\n", this.IsErrorLog));
                    }

                    NextUrl = strNext;
                }
                else if (m_ThreadRunning == false)
                {
                    // A stop was requested: leave the loop early and end the task.
                    return (NextUrl == "" || Old_Url == NextUrl);
                }
            }
            while (NextUrl != "" && Old_Url != NextUrl);
        }
        else
        {
            IsSucceed = ParseGatherNavigationUrl(Url, nRules);
        }
    }
    catch (System.Exception ex)
    {
        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + "采集发生错误:" + ex.Message + "\n", this.IsErrorLog));
        e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
        e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ErrUrlCountAdd, 0));
        m_TaskSplitData.GatheredTrueErrUrlCount++;
        m_TaskSplitData.GatheredErrUrlCount++;
        onError(ex);
        return false;
    }

    return IsSucceed;
}
/// <summary>
/// Entry point for gathering an address that carries navigation rules.
/// There are two kinds of navigation rules: the "next page" rule and page navigation.
/// This method handles the "next page" rule and calls <c>ParseGatherNavigationUrl</c>
/// for every page to deal with page navigation.
/// </summary>
/// <param name="Url">Address of the (first) page to process.</param>
/// <param name="nRules">Navigation rules passed through to <c>ParseGatherNavigationUrl</c>.</param>
/// <param name="IsNext">True to follow "next page" links after each page.</param>
/// <param name="NextRule">
/// Pattern fragment concatenated, unescaped, into the next-page regex: an href value
/// is taken as the next page when the text after it (up to the next '&lt;') ends with
/// this fragment.
/// </param>
/// <returns>
/// The result of the last <c>ParseGatherNavigationUrl</c> call; false when an exception
/// was caught. When <c>m_ThreadRunning</c> is false at the top of a loop pass the method
/// returns early: true if there is no further page pending, false otherwise.
/// </returns>
private bool GatherNavigationUrl(string Url, List<Task.cNavigRule> nRules, bool IsNext, string NextRule)
{
    cGatherWeb gWeb = new cGatherWeb();
    string NextUrl = Url;
    string Old_Url = NextUrl;
    bool IsSucceed = false;

    try
    {
        if (IsNext)
        {
            // Does not change between pages, so it is evaluated once.
            bool IsAjax = (m_TaskType == cGlobalParas.TaskType.AjaxHtmlByUrl);

            do
            {
                if (m_ThreadRunning == true)
                {
                    Url = NextUrl;
                    Old_Url = NextUrl;

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "正在采集:" + Url + "\n", this.IsErrorLog));

                    IsSucceed = ParseGatherNavigationUrl(Url, nRules);

                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "采集完成:" + Url + "\n", this.IsErrorLog));
                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "开始根据下一页规则获取下一页网址\n", this.IsErrorLog));

                    // Fetch the page source and extract the href of the "next page" link.
                    string webSource = gWeb.GetHtml(Url, m_WebCode, m_Cookie, "", "", true, IsAjax);
                    string NRule = "((?<=href=[\'|\"])\\S[^#+$<>\\s]*(?=[\'|\"]))[^<]*(?<=" + NextRule + ")";
                    Match charSetMatch = Regex.Match(webSource, NRule, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                    string strNext = charSetMatch.Groups[1].Value;

                    if (strNext != "")
                    {
                        // Decide whether the link is relative and, if so, make it absolute.
                        if (strNext[0] == '/')
                        {
                            // Root-relative: keep "scheme://host" of the current page. Works for any
                            // scheme and for a current URL that has no path component.
                            int schemeEnd = Url.IndexOf("://", StringComparison.Ordinal);
                            int hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
                            int pathStart = Url.IndexOf('/', hostStart);
                            string siteRoot = (pathStart < 0) ? Url : Url.Substring(0, pathStart);
                            strNext = siteRoot + strNext;
                        }
                        else if (strNext.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                              || strNext.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                        {
                            // Already absolute: use as is.
                        }
                        else if (strNext[0] == '?')
                        {
                            // Query-only link: replace the query of the current page. When the
                            // current URL has no query the whole URL is the base.
                            int queryStart = Url.LastIndexOf('?');
                            string pageUrl = (queryStart < 0) ? Url : Url.Substring(0, queryStart);
                            strNext = pageUrl + strNext;
                        }
                        else
                        {
                            // Document-relative: prefix everything up to the last '/' of the current URL.
                            Match aa = Regex.Match(Url, ".*/");
                            string PreUrl = aa.Groups[0].Value.ToString();
                            strNext = PreUrl + strNext;
                        }

                        // Log the address that will be visited next (not the page just processed).
                        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "下一页网址获取成功:" + strNext + "\n", this.IsErrorLog));
                    }
                    else
                    {
                        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "已经到最终页" + "\n", this.IsErrorLog));
                    }

                    NextUrl = strNext;
                }
                else if (m_ThreadRunning == false)
                {
                    // A stop was requested: leave the do-loop early and end the task.
                    return (NextUrl == "" || Old_Url == NextUrl);
                }
            }
            while (NextUrl != "" && Old_Url != NextUrl);
        }
        else
        {
            IsSucceed = ParseGatherNavigationUrl(Url, nRules);
        }
    }
    catch (System.Exception ex)
    {
        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + "采集发生错误:" + ex.Message + "\n", this.IsErrorLog));
        e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
        e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ErrUrlCountAdd, 0));
        m_TaskSplitData.GatheredTrueErrUrlCount++;
        m_TaskSplitData.GatheredErrUrlCount++;
        onError(ex);
        return false;
    }

    return IsSucceed;
}
/// <summary>
/// Downloads a page and returns the distinct link addresses in it that match a
/// navigation rule, each made absolute against <paramref name="Url"/>.
/// </summary>
/// <param name="Url">
/// Page address. When it contains a "{...}" parameter it is expanded with
/// <c>SplitWebUrl</c> and only the first expanded address is downloaded.
/// </param>
/// <param name="UrlRule">
/// Navigation rule. A rule starting with "&lt;Regex:" is assembled from its
/// "&lt;Regex:...&gt;", "&lt;Common:...&gt;" and optional "&lt;End:...&gt;" parts; any other
/// rule is passed through <c>cTool.RegexReplaceTrans</c> and used as a link prefix.
/// </param>
/// <param name="webCode">Page encoding passed to <c>cGatherWeb.GetHtml</c>.</param>
/// <param name="cookie">Cookie string passed to <c>cGatherWeb.GetHtml</c>.</param>
/// <returns>
/// A list holding only <paramref name="Url"/> when the rule is blank; null when no page
/// source could be obtained; otherwise the distinct matched addresses in match order.
/// </returns>
public List<string> GetUrlsByRule(string Url, string UrlRule, cGlobalParas.WebCode webCode, string cookie)
{
    List<string> Urls = new List<string>();

    if (UrlRule.Trim() == "")
    {
        Urls.Add(Url);
        return Urls;
    }

    // If the address contains parameters, download the first concrete address only.
    string Url1;
    if (Regex.IsMatch(Url, "{.*}"))
    {
        List<string> Urls1 = SplitWebUrl(Url);
        Url1 = Urls1[0].ToString();
    }
    else
    {
        Url1 = Url;
    }

    // Fetch the page source that the rule is applied to.
    cGatherWeb gW = new cGatherWeb();
    string UrlSource = gW.GetHtml(Url1, webCode, cookie, "", "", true, false);

    // A null source would make Regex.Matches throw; treat it like the empty source.
    if (string.IsNullOrEmpty(UrlSource))
    {
        return null;
    }

    // NOTE(review): "[href=|src=|open(]" is a character class, not an alternation. It is
    // kept verbatim because changing it would change which links existing rules match.
    string Rule = "";
    if (UrlRule.StartsWith("<Regex:"))
    {
        Rule = @"(?<=[href=|src=|open(][\W])";

        // Leading part: the text between "<Regex:" and the first ">".
        string strPre = UrlRule.Substring(UrlRule.IndexOf("<Regex:") + 7, UrlRule.IndexOf(">") - 7);
        Rule += strPre;

        // Middle parts: every "<Common:...>" segment, each preceded by a wildcard.
        string cma = @"(?<=<Common:)\S+?(?=>)";
        Regex cmas = new Regex(cma, RegexOptions.IgnoreCase | RegexOptions.Multiline);
        MatchCollection cs = cmas.Matches(UrlRule);
        foreach (Match ma in cs)
        {
            Rule += @"(\S*)" + ma.Value.ToString();
        }

        // Trailing part: the "<End:...>" segment, or the default "up to quote/space" tail.
        if (Regex.IsMatch(UrlRule, "<End:"))
        {
            string s = UrlRule.Substring(UrlRule.IndexOf("<End:") + 5, UrlRule.Length - UrlRule.IndexOf("<End:") - 6);
            Rule += @"(\S*)" + s;
        }
        else
        {
            Rule += @"(\S[^'"">]*)(?=[\s'""])";
        }
    }
    else
    {
        Rule = @"(?<=[href=|src=|open(][\W])" + cTool.RegexReplaceTrans(UrlRule) + @"(\S[^'"">]*)(?=[\s'""])";
    }

    Regex re = new Regex(Rule, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    MatchCollection aa = re.Matches(UrlSource);

    DataTable d1 = new DataTable();
    d1.Columns.Add("Name");
    foreach (Match ma in aa)
    {
        d1.Rows.Add(ma.Value.ToString());
    }

    // The same address is often matched more than once, so remove duplicate rows.
    string[] strComuns = new string[d1.Columns.Count];
    for (int m = 0; m < d1.Columns.Count; m++)
    {
        strComuns[m] = d1.Columns[m].ColumnName;
    }

    DataView dv = new DataView(d1);
    DataTable d2 = dv.ToTable(true, strComuns);

    // Bases for relative links; they depend only on Url, so compute them once.
    // "scheme://host" is located by searching instead of assuming a 7-character "http://"
    // prefix, and a Url without a path component is used whole.
    int schemeEnd = Url.IndexOf("://", StringComparison.Ordinal);
    int hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
    int pathStart = Url.IndexOf('/', hostStart);
    string siteRoot = (pathStart < 0) ? Url : Url.Substring(0, pathStart);
    string directory = Regex.Match(Url, ".*/").Groups[0].Value.ToString();

    for (int i = 0; i < d2.Rows.Count; i++)
    {
        string found = d2.Rows[i][0].ToString();

        // Nothing to resolve for an empty match.
        if (found.Length == 0)
        {
            continue;
        }

        if (found.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
            // Already absolute. StartsWith is safe for matches shorter than four characters.
            Urls.Add(found);
        }
        else if (found[0] == '/')
        {
            // Root-relative link.
            Urls.Add(siteRoot + found);
        }
        else
        {
            // Document-relative link.
            Urls.Add(directory + found);
        }
    }

    return Urls;
}
/// <summary>
/// Gathers the data of one web page and, when <paramref name="IsNext"/> is set,
/// keeps following the "next page" link until no link is found or the link
/// resolves to the page that was just processed.
/// </summary>
/// <param name="Url">Address of the (first) page to gather.</param>
/// <param name="IsNext">True to follow "next page" links after each page.</param>
/// <param name="NextRule">
/// Pattern fragment concatenated, unescaped, into the next-page regex: an href value
/// is taken as the next page when the text after it (up to the next '&lt;') ends with
/// this fragment.
/// </param>
/// <returns>
/// True when no exception occurred; false when an exception was caught, logged,
/// counted as an error and passed to <c>onError</c>.
/// </returns>
private bool GatherSingleUrl(string Url, bool IsNext, string NextRule)
{
    cGatherWeb gWeb = new cGatherWeb();
    DataTable tmpData;
    string NextUrl = Url;
    string Old_Url = NextUrl;

    bool IsAjax = (m_TaskType == cGlobalParas.TaskType.AjaxHtmlByUrl);

    try
    {
        if (IsNext)
        {
            do
            {
                Url = NextUrl;
                Old_Url = NextUrl;

                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "正在采集:" + Url + "\n", this.IsErrorLog));

                if (m_IsUrlEncode == true)
                {
                    Url = cTool.UrlEncode(Url, (cGlobalParas.WebCode)int.Parse(m_UrlEncode));
                }

                tmpData = GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);

                if (tmpData != null)
                {
                    m_GatherData.Merge(tmpData);

                    // Raise the log/data event only when rows were actually gathered.
                    if (tmpData.Rows.Count != 0)
                    {
                        e_GData(this, new cGatherDataEventArgs(m_TaskID, tmpData));
                    }
                }

                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "采集完成:" + Url + "\n", this.IsErrorLog));
                e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "开始根据下一页规则获取下一页网址\n", this.IsErrorLog));

                // Fetch the page source again and extract the href of the "next page" link.
                string webSource = gWeb.GetHtml(Url, m_WebCode, m_Cookie, "", "", true, IsAjax);
                string NRule = "((?<=href=[\'|\"])\\S[^#+$<>\\s]*(?=[\'|\"]))[^<]*(?<=" + NextRule + ")";
                Match charSetMatch = Regex.Match(webSource, NRule, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                string strNext = charSetMatch.Groups[1].Value;

                if (strNext != "")
                {
                    // Decide whether the link is relative and, if so, make it absolute.
                    if (strNext[0] == '/')
                    {
                        // Root-relative: keep "scheme://host" of the current page. Works for any
                        // scheme and for a current URL that has no path component.
                        int schemeEnd = Url.IndexOf("://", StringComparison.Ordinal);
                        int hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
                        int pathStart = Url.IndexOf('/', hostStart);
                        string siteRoot = (pathStart < 0) ? Url : Url.Substring(0, pathStart);
                        strNext = siteRoot + strNext;
                    }
                    else if (strNext.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                          || strNext.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                    {
                        // Already absolute: use as is.
                    }
                    else if (strNext[0] == '?')
                    {
                        // Query-only link: replace the query of the current page. When the current
                        // URL has no query the whole URL is the base.
                        int queryStart = Url.LastIndexOf('?');
                        string pageUrl = (queryStart < 0) ? Url : Url.Substring(0, queryStart);
                        strNext = pageUrl + strNext;
                    }
                    else
                    {
                        // Document-relative: prefix everything up to the last '/' of the current URL.
                        Match aa = Regex.Match(Url, ".*/");
                        string PreUrl = aa.Groups[0].Value.ToString();
                        strNext = PreUrl + strNext;
                    }

                    // Log the address that will be visited next (not the page just processed).
                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "下一页网址获取成功:" + strNext + "\n", this.IsErrorLog));
                }
                else
                {
                    e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "已经到最终页" + NextUrl + "\n", this.IsErrorLog));
                }

                NextUrl = strNext;
            }
            while (NextUrl != "" && Old_Url != NextUrl);
        }
        else
        {
            if (m_IsUrlEncode == true)
            {
                Url = cTool.UrlEncode(Url, (cGlobalParas.WebCode)int.Parse(m_UrlEncode));
            }

            tmpData = GetGatherData(Url, m_WebCode, m_Cookie, m_gStartPos, m_gEndPos, m_SavePath, IsAjax);

            if (tmpData != null)
            {
                m_GatherData.Merge(tmpData);

                // Raise the log/data event only when rows were actually gathered.
                if (tmpData.Rows.Count != 0)
                {
                    e_GData(this, new cGatherDataEventArgs(m_TaskID, tmpData));
                }
            }

            e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Info).ToString() + "采集完成:" + Url + "\n", this.IsErrorLog));
        }

        // Raise the gathered-URL count event and bump the counter.
        e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Gathered, 0));
        m_TaskSplitData.GatheredTrueUrlCount++;
    }
    catch (System.Exception ex)
    {
        e_Log(this, new cGatherTaskLogArgs(m_TaskID, ((int)cGlobalParas.LogType.Error).ToString() + Url + "采集发生错误:" + ex.Message + "\n", this.IsErrorLog));
        e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.Err, 0));
        e_GUrlCount(this, new cGatherUrlCountArgs(m_TaskID, cGlobalParas.UpdateUrlCountType.ErrUrlCountAdd, 0));
        m_TaskSplitData.GatheredTrueErrUrlCount++;
        m_TaskSplitData.GatheredErrUrlCount++;
        onError(ex);
        return false;
    }

    return true;
}