/// <summary>
/// Collects a single article page.
/// </summary>
/// <param name="pageUri">Address of the page to collect.</param>
/// <param name="func">Callback handed on to GetPageData together with the page address.</param>
public void InvokeSingle(string pageUri, DataPackFunc func)
{
    // This call processes exactly one page.
    this.State.TotalCount = 1;

    // GetPageData requires a progress counter; its final value is not used here.
    int processed = 0;
    this.GetPageData(pageUri, ref processed, func);
}
/// <summary>
/// Demo driver: looks up the "ifengmainland" project, enables multi-threaded collection and
/// collects list pages 1 through 100, printing one console line per collected article.
/// </summary>
private static void Download_Project1()
{
    DateTime dt = DateTime.Now;
    Project pro = director.GetProject("ifengmainland");

    // The null check must come before any member access on pro; the original set
    // UseMultiThread first and therefore threw NullReferenceException for a missing project.
    if (pro == null)
    {
        Console.WriteLine("项目不存在!");
        return;
    }

    pro.UseMultiThread = true;
    Console.WriteLine("项目:{0}开始下载数据!", pro.Name);

    int i = 0;
    DataPackFunc func = pk =>
    {
        // Sample persistence step, left disabled as in the original:
        // db.ExecuteNonQuery("INSERT INTO test([title],[content],[createDate]) VALUES(@Title,@Content,@SubmitDate)",
        //     db.NewParameter("@Title", pk["title"]),
        //     db.NewParameter("@Content", pk["content"]),
        //     db.NewParameter("@SubmitDate", string.Format("{0:yyyy-MM-dd HH:mm:ss}", DateTime.Now)));

        ++i;

        // TotalSeconds is the whole elapsed time; TimeSpan.Seconds is only the 0-59
        // seconds component and wraps around every minute.
        int elapsedSeconds = (int)(DateTime.Now - dt).TotalSeconds;
        Console.WriteLine("入库第{0}条->{1} [总用时:{2}s]", i, pk["title"], elapsedSeconds.ToString());
    };

    // The list URI parameter is 1-based: pages 1..100, in listed (non-reversed) order.
    for (var j = 0; j < 100; j++)
    {
        pro.InvokeList(j + 1, false, func);
    }
}
/// <summary>
/// Downloads one page, extracts every property configured in <c>Rules</c> from its HTML
/// and returns the result.
/// </summary>
/// <param name="pageUri">Address of the page to collect.</param>
/// <param name="number">
/// Progress counter maintained for the caller. It is incremented once when the address is
/// rejected, when the download fails, or when the page has been parsed, so a caller waiting
/// for the counter to reach its task count is never left waiting on a failed page.
/// </param>
/// <param name="func">Callback invoked with the collected data; skipped when null.</param>
/// <returns>
/// The collected data, or null when the address does not match the page URI rule or the
/// page could not be downloaded.
/// </returns>
private DataPack GetPageData(string pageUri, ref int number, DataPackFunc func)
{
    // The page address does not match the page address rule: count it as failed.
    if (!Regex.IsMatch(pageUri, this.FormatedPageUriRule))
    {
        ++number;
        state.FailCount++;
        return null;
    }

    // Original note: "if the page address is a relative path, prepend the domain".
    // NOTE(review): the regex is named absoluteUriRegex - confirm what it actually matches.
    if (absoluteUriRegex.IsMatch(pageUri))
    {
        pageUri = GetBasePath(pageUri) + pageUri;
    }

    // Download the page HTML.
    string html;
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(pageUri);
        req.Timeout = this.RequestTimeOut;

        // Dispose response, stream and reader deterministically so the connection is
        // released even if reading fails part-way.
        using (WebResponse response = req.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(stream, this.Encode))
        {
            html = sr.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Best effort: a page that cannot be downloaded is recorded as failed. The counter
        // must still advance here, otherwise AnalysisListPage waits forever for this page.
        ++number;
        state.FailCount++;
        return null;
    }

#if DEBUG
    Console.WriteLine("\r\n------------------------------\r\n得到匹配的列表数据为:{0}", html);
#endif

    DataPack dp = new DataPack(Rules, pageUri);
    foreach (string propertyName in this.Rules)
    {
        // Regex.Match never returns null; for a failed match Groups[1].Value is the empty
        // string, which is what gets stored (same result as the original null check).
        Match match = Regex.Match(html, this.Rules[propertyName]);
        dp[propertyName] = match.Groups[1].Value;
    }

#if DEBUG
    Console.WriteLine("\r\n-------------------------\r\n");
    foreach (KeyValuePair<string, string> pair in dp)
    {
        Console.WriteLine("{0}->{1}\r\n", pair.Key, pair.Value);
    }
#endif

    // Update the progress counter.
    ++number;

#if DEBUG
    Console.WriteLine("flish");
#endif

    // Hand the result to the callback.
    if (func != null)
    {
        func(dp);
    }

    // Record one successful page.
    state.SuccessCount++;
    return dp;
}
/// <summary>
/// Downloads a list page, extracts the page addresses found inside its list blocks and
/// collects every one of them through GetPageData, passing each result to the callback.
/// </summary>
/// <param name="pageUri">Address of the list page.</param>
/// <param name="func">Callback passed on to GetPageData for every collected page.</param>
/// <remarks>
/// Download errors for the list page itself are not caught here and propagate to the caller.
/// Returns without touching the state when the list page contains no list block.
/// </remarks>
private void AnalysisListPage(string pageUri, DataPackFunc func)
{
    string html;

#if DEBUG
    Console.WriteLine("开始从:{0}下载数据...", pageUri);
#endif

    // Download the list page; response, stream and reader are disposed deterministically.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(pageUri);
    request.Timeout = this.RequestTimeOut;
    using (WebResponse response = request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(stream, this.Encode))
    {
        html = sr.ReadToEnd();
    }

#if DEBUG
    Console.WriteLine("返回的数据为:{0}", html);
#endif

    // Locate the list blocks.
    MatchCollection listMatches = Regex.Matches(html, RuleFormat.Format(this.ListBlockRule));
    if (listMatches.Count == 0)
    {
#if DEBUG
        Console.WriteLine("没找到匹配!");
#endif
        return;
    }

    // NOTE(review): the symbol below is DEBUGS, not DEBUG, as in the original - confirm
    // whether that is a typo or a deliberate way to keep this line disabled.
#if DEBUGS
    Console.WriteLine("\r\n------------------------------\r\n得到匹配的列表数据为:\r\n");
#endif

    // Gather every page address found inside the list blocks.
    Regex pageUriRegex = new Regex(this.FormatedPageUriRule);
    IList<string> pageUrls = new List<string>();
    foreach (Match m in listMatches)
    {
#if DEBUG
        Console.WriteLine("\r\n------------------------------------------------\r\n{0}", m.Value);
#endif
        foreach (Match pm in pageUriRegex.Matches(m.Value))
        {
#if DEBUG
            Console.WriteLine(pm.Value);
#endif
            pageUrls.Add(pm.Value);
        }
    }

    int taskCount = pageUrls.Count;

    // Publish the total before any page is processed so that observers of the state see a
    // meaningful total while work is in progress (the original set it only at the end).
    state.TotalCount = taskCount;

    if (!this.UseMultiThread)
    {
        // Single-threaded: every call has returned when the loop ends, so there is nothing
        // to wait for. GetPageData still needs a counter argument; its value is unused.
        foreach (string pageUrl in pageUrls)
        {
            int ignored = 0;
            GetPageData(pageUrl, ref ignored, func);
        }
    }
    else
    {
        // Completions are counted here rather than through GetPageData's ref counter, so the
        // wait below terminates even when a page fails or the callback throws.
        int finished = 0;

        MultiThreadProcess mp = new MultiThreadProcess(5, taskCount);
        mp.Start<IList<string>>(urls =>
        {
            // The whole step runs under the lock, as in the original, so GetPageData and
            // the callback are never executed concurrently.
            lock (urls)
            {
                string url = urls[0];
                try
                {
                    int ignored = 0;
                    GetPageData(url, ref ignored, func);
                }
                finally
                {
                    pageUrls.Remove(url);
                    System.Threading.Interlocked.Increment(ref finished);
                }
            }
        }, pageUrls);

        // Presumably Start returns before the workers are done (TODO confirm), hence the
        // wait. Poll with a short sleep and an interlocked read instead of spinning on a
        // plain read of a variable written by other threads.
        while (System.Threading.Interlocked.CompareExchange(ref finished, 0, 0) != taskCount)
        {
            System.Threading.Thread.Sleep(10);
        }
    }

#if DEBUG
    Console.WriteLine("任务完成....!共采集到{0}条", taskCount);
#endif
}
/// <summary>
/// Collects a single article page.
/// </summary>
/// <param name="pageUri">Address of the page to collect.</param>
/// <param name="func">Callback handed on to GetPageData together with the page address.</param>
public void InvokeSingle(string pageUri, DataPackFunc func)
{
    // This call processes exactly one page.
    this.State.TotalCount = 1;

    // GetPageData requires a progress counter; its final value is not used here.
    int processed = 0;
    this.GetPageData(pageUri, ref processed, func);
}
/// <summary>
/// Collects a list page and applies the callback to the collected results.
/// </summary>
/// <param name="listUri">Address of the list page.</param>
/// <param name="func">Callback forwarded unchanged to AnalysisListPage.</param>
public void InvokeList(string listUri, DataPackFunc func)
{
    // Pure pass-through; all work happens in AnalysisListPage.
    this.AnalysisListPage(listUri, func);
}
/// <summary>
/// Collects a list page whose address is built from the list URI rule, and applies the
/// callback to the collected results.
/// </summary>
/// <param name="listUriParameter">Value substituted for "{0}" in <c>ListUriRule</c>.</param>
/// <param name="func">Callback forwarded unchanged to AnalysisListPage.</param>
public void InvokeList(object listUriParameter, DataPackFunc func)
{
    // Build the concrete list address from the rule, then analyse that page.
    this.AnalysisListPage(String.Format(this.ListUriRule, listUriParameter), func);
}
/// <summary>
/// Downloads one page, extracts every property configured in <c>Rules</c> from its HTML
/// and returns the result.
/// </summary>
/// <param name="pageUri">Address of the page to collect.</param>
/// <param name="number">
/// Progress counter maintained for the caller. It is incremented once when the address is
/// rejected, when the download fails, or when the page has been parsed, so a caller waiting
/// for the counter to reach its task count is never left waiting on a failed page.
/// </param>
/// <param name="func">Callback invoked with the collected data; skipped when null.</param>
/// <returns>
/// The collected data, or null when the address does not match the page URI rule or the
/// page could not be downloaded.
/// </returns>
private DataPack GetPageData(string pageUri, ref int number, DataPackFunc func)
{
    // The page address does not match the page address rule: count it as failed.
    if (!Regex.IsMatch(pageUri, this.FormatedPageUriRule))
    {
        ++number;
        state.FailCount++;
        return null;
    }

    // Original note: "if the page address is a relative path, prepend the domain".
    // NOTE(review): the regex is named absoluteUriRegex - confirm what it actually matches.
    if (absoluteUriRegex.IsMatch(pageUri))
    {
        pageUri = GetBasePath(pageUri) + pageUri;
    }

    // Download the page HTML.
    string html;
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(pageUri);
        req.Timeout = this.RequestTimeOut;

        // Dispose response, stream and reader deterministically so the connection is
        // released even if reading fails part-way.
        using (WebResponse response = req.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(stream, this.Encode))
        {
            html = sr.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Best effort: a page that cannot be downloaded is recorded as failed. The counter
        // must still advance here, otherwise AnalysisListPage waits forever for this page.
        ++number;
        state.FailCount++;
        return null;
    }

#if DEBUG
    Console.WriteLine("\r\n------------------------------\r\n得到匹配的列表数据为:{0}", html);
#endif

    DataPack dp = new DataPack(Rules, pageUri);
    foreach (string propertyName in this.Rules)
    {
        // Regex.Match never returns null; for a failed match Groups[1].Value is the empty
        // string, which is what gets stored (same result as the original null check).
        Match match = Regex.Match(html, this.Rules[propertyName]);
        dp[propertyName] = match.Groups[1].Value;
    }

#if DEBUG
    Console.WriteLine("\r\n-------------------------\r\n");
    foreach (KeyValuePair<string, string> pair in dp)
    {
        Console.WriteLine("{0}->{1}\r\n", pair.Key, pair.Value);
    }
#endif

    // Update the progress counter.
    ++number;

#if DEBUG
    Console.WriteLine("flish");
#endif

    // Hand the result to the callback.
    if (func != null)
    {
        func(dp);
    }

    // Record one successful page.
    state.SuccessCount++;
    return dp;
}
/// <summary>
/// Downloads a list page, extracts the page addresses found inside its list blocks and
/// collects every one of them through GetPageData, passing each result to the callback.
/// </summary>
/// <param name="pageUri">Address of the list page.</param>
/// <param name="reverse">When true, the pages are processed in the reverse of their listed order.</param>
/// <param name="func">Callback passed on to GetPageData for every collected page.</param>
/// <remarks>
/// Download errors for the list page itself are not caught here and propagate to the caller.
/// Returns without touching the state when the list page contains no list block.
/// </remarks>
private void AnalysisListPage(string pageUri, bool reverse, DataPackFunc func)
{
    string html;

#if DEBUG
    Console.WriteLine("开始从:{0}下载数据...", pageUri);
#endif

    // Download the list page; response, stream and reader are disposed deterministically.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(pageUri);
    request.Timeout = this.RequestTimeOut;
    using (WebResponse response = request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(stream, this.Encode))
    {
        html = sr.ReadToEnd();
    }

#if DEBUG
    Console.WriteLine("返回的数据为:{0}", html);
#endif

    // Locate the list blocks.
    MatchCollection listMatches = Regex.Matches(html, RuleFormat.Format(this.ListBlockRule));
    if (listMatches.Count == 0)
    {
#if DEBUG
        Console.WriteLine("没找到匹配!");
#endif
        return;
    }

    // NOTE(review): the symbol below is DEBUGS, not DEBUG, as in the original - confirm
    // whether that is a typo or a deliberate way to keep this line disabled.
#if DEBUGS
    Console.WriteLine("\r\n------------------------------\r\n得到匹配的列表数据为:\r\n");
#endif

    // Gather every page address found inside the list blocks.
    Regex pageUriRegex = new Regex(this.FormatedPageUriRule);
    IList<string> pageUrls = new List<string>();
    foreach (Match m in listMatches)
    {
#if DEBUG
        Console.WriteLine("\r\n------------------------------------------------\r\n{0}", m.Value);
#endif
        foreach (Match pm in pageUriRegex.Matches(m.Value))
        {
#if DEBUG
            Console.WriteLine(pm.Value);
#endif
            pageUrls.Add(pm.Value);
        }
    }

    int taskCount = pageUrls.Count;

    // Optionally process the pages in reverse order.
    if (reverse)
    {
        pageUrls = new List<string>(pageUrls.Reverse());
    }

    // Publish the total before any page is processed so that observers of the state see a
    // meaningful total while work is in progress (the original set it only at the end).
    state.TotalCount = taskCount;

    if (!this.UseMultiThread)
    {
        // Single-threaded: every call has returned when the loop ends, so there is nothing
        // to wait for. GetPageData still needs a counter argument; its value is unused.
        foreach (string pageUrl in pageUrls)
        {
            int ignored = 0;
            GetPageData(pageUrl, ref ignored, func);
        }
    }
    else
    {
        // Completions are counted here rather than through GetPageData's ref counter, so the
        // wait below terminates even when a page fails or the callback throws.
        int finished = 0;

        MultiThreadProcess mp = new MultiThreadProcess(5, taskCount);
        mp.Start<IList<string>>(urls =>
        {
            // The whole step runs under the lock, as in the original, so GetPageData and
            // the callback are never executed concurrently.
            lock (urls)
            {
                string url = urls[0];
                try
                {
                    int ignored = 0;
                    GetPageData(url, ref ignored, func);
                }
                finally
                {
                    pageUrls.Remove(url);
                    System.Threading.Interlocked.Increment(ref finished);
                }
            }
        }, pageUrls);

        // Presumably Start returns before the workers are done (TODO confirm), hence the
        // wait. Poll with a short sleep and an interlocked read instead of spinning on a
        // plain read of a variable written by other threads.
        while (System.Threading.Interlocked.CompareExchange(ref finished, 0, 0) != taskCount)
        {
            System.Threading.Thread.Sleep(10);
        }
    }

#if DEBUG
    Console.WriteLine("任务完成....!共采集到{0}条", taskCount);
#endif
}
/// <summary>
/// Collects a list page and applies the callback to the collected results.
/// </summary>
/// <param name="listUri">Address of the list page.</param>
/// <param name="reverse">Forwarded unchanged to AnalysisListPage.</param>
/// <param name="func">Callback forwarded unchanged to AnalysisListPage.</param>
public void InvokeList(string listUri, bool reverse, DataPackFunc func)
{
    // Pure pass-through; all work happens in AnalysisListPage.
    this.AnalysisListPage(listUri, reverse, func);
}
/// <summary>
/// Collects a list page whose address is built from the list URI rule, and applies the
/// callback to the collected results.
/// </summary>
/// <param name="listUriParameter">Value substituted for "{0}" in <c>ListUriRule</c>.</param>
/// <param name="reverse">Forwarded unchanged to AnalysisListPage.</param>
/// <param name="func">Callback forwarded unchanged to AnalysisListPage.</param>
public void InvokeList(object listUriParameter, bool reverse, DataPackFunc func)
{
    // Build the concrete list address from the rule, then analyse that page.
    this.AnalysisListPage(String.Format(this.ListUriRule, listUriParameter), reverse, func);
}
/// <summary>
/// Downloads one page, extracts every property configured in <c>Rules</c> from its HTML
/// and returns the result.
/// </summary>
/// <param name="pageUri">Address of the page to collect.</param>
/// <param name="number">
/// Progress counter maintained for the caller. It is incremented once when the address is
/// rejected, when the download fails, or when the page has been parsed, so a caller waiting
/// for the counter to reach its task count is never left waiting on a failed page.
/// </param>
/// <param name="func">Callback invoked with the collected data; skipped when null.</param>
/// <returns>
/// The collected data, or null when the address does not match the page URI rule or the
/// page could not be downloaded.
/// </returns>
private DataPack GetPageData(string pageUri, ref int number, DataPackFunc func)
{
    // The page address does not match the page address rule: count it as failed.
    if (!Regex.IsMatch(pageUri, this.FormatedPageUriRule))
    {
        ++number;
        state.FailCount++;
        return null;
    }

    // Original note: "if the page address is a relative path, prepend the domain".
    // NOTE(review): the regex is named absoluteUriRegex - confirm what it actually matches.
    if (absoluteUriRegex.IsMatch(pageUri))
    {
        pageUri = GetBasePath(pageUri) + pageUri;
    }

    // Download the page HTML.
    string html;
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(pageUri);
        req.Timeout = this.RequestTimeOut;

        // Dispose response, stream and reader deterministically so the connection is
        // released even if reading fails part-way.
        using (WebResponse response = req.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(stream, this.Encode))
        {
            html = sr.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Best effort: a page that cannot be downloaded is recorded as failed. The counter
        // must still advance here, otherwise AnalysisListPage waits forever for this page.
        ++number;
        state.FailCount++;
        return null;
    }

#if DEBUG
    Console.WriteLine("\r\n------------------------------\r\n得到匹配的列表数据为:{0}", html);
#endif

    DataPack dp = new DataPack(Rules, pageUri);
    foreach (string propertyName in this.Rules)
    {
        // Regex.Match never returns null; for a failed match Groups[1].Value is the empty
        // string, which is what gets stored (same result as the original null check).
        Match match = Regex.Match(html, this.Rules[propertyName]);
        dp[propertyName] = match.Groups[1].Value;
    }

#if DEBUG
    Console.WriteLine("\r\n-------------------------\r\n");
    foreach (KeyValuePair<string, string> pair in dp)
    {
        Console.WriteLine("{0}->{1}\r\n", pair.Key, pair.Value);
    }
#endif

    // Update the progress counter.
    ++number;

#if DEBUG
    Console.WriteLine("flish");
#endif

    // Hand the result to the callback.
    if (func != null)
    {
        func(dp);
    }

    // Record one successful page.
    state.SuccessCount++;
    return dp;
}
/// <summary>
/// Downloads a list page, extracts the page addresses found inside its list blocks and
/// collects every one of them through GetPageData, passing each result to the callback.
/// </summary>
/// <param name="pageUri">Address of the list page.</param>
/// <param name="reverse">When true, the pages are processed in the reverse of their listed order.</param>
/// <param name="func">Callback passed on to GetPageData for every collected page.</param>
/// <remarks>
/// Download errors for the list page itself are not caught here and propagate to the caller.
/// Returns without touching the state when the list page contains no list block.
/// </remarks>
private void AnalysisListPage(string pageUri, bool reverse, DataPackFunc func)
{
    string html;

#if DEBUG
    Console.WriteLine("开始从:{0}下载数据...", pageUri);
#endif

    // Download the list page; response, stream and reader are disposed deterministically.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(pageUri);
    request.Timeout = this.RequestTimeOut;
    using (WebResponse response = request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(stream, this.Encode))
    {
        html = sr.ReadToEnd();
    }

#if DEBUG
    Console.WriteLine("返回的数据为:{0}", html);
#endif

    // Locate the list blocks.
    MatchCollection listMatches = Regex.Matches(html, RuleFormat.Format(this.ListBlockRule));
    if (listMatches.Count == 0)
    {
#if DEBUG
        Console.WriteLine("没找到匹配!");
#endif
        return;
    }

    // NOTE(review): the symbol below is DEBUGS, not DEBUG, as in the original - confirm
    // whether that is a typo or a deliberate way to keep this line disabled.
#if DEBUGS
    Console.WriteLine("\r\n------------------------------\r\n得到匹配的列表数据为:\r\n");
#endif

    // Gather every page address found inside the list blocks.
    Regex pageUriRegex = new Regex(this.FormatedPageUriRule);
    IList<string> pageUrls = new List<string>();
    foreach (Match m in listMatches)
    {
#if DEBUG
        Console.WriteLine("\r\n------------------------------------------------\r\n{0}", m.Value);
#endif
        foreach (Match pm in pageUriRegex.Matches(m.Value))
        {
#if DEBUG
            Console.WriteLine(pm.Value);
#endif
            pageUrls.Add(pm.Value);
        }
    }

    int taskCount = pageUrls.Count;

    // Optionally process the pages in reverse order.
    if (reverse)
    {
        pageUrls = new List<string>(pageUrls.Reverse());
    }

    // Publish the total before any page is processed so that observers of the state see a
    // meaningful total while work is in progress (the original set it only at the end).
    state.TotalCount = taskCount;

    if (!this.UseMultiThread)
    {
        // Single-threaded: every call has returned when the loop ends, so there is nothing
        // to wait for. GetPageData still needs a counter argument; its value is unused.
        foreach (string pageUrl in pageUrls)
        {
            int ignored = 0;
            GetPageData(pageUrl, ref ignored, func);
        }
    }
    else
    {
        // Completions are counted here rather than through GetPageData's ref counter, so the
        // wait below terminates even when a page fails or the callback throws.
        int finished = 0;

        MultiThreadProcess mp = new MultiThreadProcess(5, taskCount);
        mp.Start<IList<string>>(urls =>
        {
            // The whole step runs under the lock, as in the original, so GetPageData and
            // the callback are never executed concurrently.
            lock (urls)
            {
                string url = urls[0];
                try
                {
                    int ignored = 0;
                    GetPageData(url, ref ignored, func);
                }
                finally
                {
                    pageUrls.Remove(url);
                    System.Threading.Interlocked.Increment(ref finished);
                }
            }
        }, pageUrls);

        // Presumably Start returns before the workers are done (TODO confirm), hence the
        // wait. Poll with a short sleep and an interlocked read instead of spinning on a
        // plain read of a variable written by other threads.
        while (System.Threading.Interlocked.CompareExchange(ref finished, 0, 0) != taskCount)
        {
            System.Threading.Thread.Sleep(10);
        }
    }

#if DEBUG
    Console.WriteLine("任务完成....!共采集到{0}条", taskCount);
#endif
}
/// <summary>
/// Collects a list page and applies the callback to the collected results.
/// </summary>
/// <param name="listUri">Address of the list page.</param>
/// <param name="func">Callback forwarded unchanged to AnalysisListPage.</param>
public void InvokeList(string listUri, DataPackFunc func)
{
    // Pure pass-through; all work happens in AnalysisListPage.
    this.AnalysisListPage(listUri, func);
}