/// <summary>
/// Builds the full index document (<c>fullindex.xml</c>) for the given items
/// and processes every item on a <c>MultiThreadProcess</c> worker pool.
/// </summary>
/// <remarks>
/// Any existing index file is deleted first. One <c>outer_id</c> element with
/// <c>action="upload"</c> is written per item. The item taken while it is the
/// only one left in the list is passed to <c>UploadItem</c>; every other item
/// is passed to <c>Generate</c>. The document is saved once the worker pool
/// reports that it is no longer alive.
/// </remarks>
/// <param name="items">
/// Items to index. The list is consumed: each worker removes the item it takes.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="items"/> is <c>null</c>.</exception>
public static void GenerateFullIndex(IList<Item> items)
{
    if (items == null)
    {
        throw new ArgumentNullException("items");
    }

    XmlDocument xd = new XmlDocument();
    FileInfo file = new FileInfo(String.Format("{0}{1}fullindex.xml", Config.PhypicPath, Config.SavePath));
    if (file.Exists)
    {
        file.Delete();
    }

    // Document header and the fixed metadata nodes under <root>.
    xd.AppendChild(xd.CreateXmlDeclaration("1.0", null, null));
    XmlNode root = xd.CreateElement("root");
    Core.AppendNode(xd, root, "version", Config.Version);
    Core.AppendNode(xd, root, "modified", String.Format("{0:yyyy-MM-dd HH:mm:ss}", DateTime.Now));
    Core.AppendNode(xd, root, "seller_id", Config.Seller);
    Core.AppendNode(xd, root, "cat_url", String.Format("{0}{1}SellerCats.xml", Config.Domain, Config.SavePath));
    Core.AppendNode(xd, root, "dir", String.Format("{0}{1}items/", Config.Domain, Config.SavePath));
    XmlNode ids = xd.CreateElement("item_ids");

    // XmlDocument instance members and IList<T> are not thread-safe, so the
    // "take next item + add its index node" step is done under one lock.
    object gate = new object();

    MultiThreadProcess mp = new MultiThreadProcess(Config.Threads, items.Count);
    mp.Start<IList<Item>>(pending =>
    {
        Item current;
        bool isLast;

        lock (gate)
        {
            if (pending.Count == 0)
            {
                // Nothing left to claim; avoids an out-of-range read on item 0.
                return;
            }

            current = pending[0];
            isLast = pending.Count == 1;
            // Claim the item immediately so no other worker can pick it up too.
            pending.RemoveAt(0);

            XmlNode idNode = xd.CreateElement("outer_id");
            XmlAttribute action = xd.CreateAttribute("action");
            action.Value = "upload";
            idNode.Attributes.Append(action);
            idNode.InnerText = current.outer_id;
            ids.AppendChild(idNode);
        }

        // The per-item work runs outside the lock so workers can overlap.
        // NOTE(review): the last item goes to UploadItem, all others to Generate;
        // with several workers UploadItem may start while Generate calls for
        // earlier items are still running (as in the original) - confirm this is OK.
        if (isLast)
        {
            UploadItem(current);
        }
        else
        {
            Generate(current);
        }
    }, items);

    // Poll with a short sleep instead of spinning a core while workers run.
    while (mp.IsAlive)
    {
        System.Threading.Thread.Sleep(50);
    }

    root.AppendChild(ids);
    xd.AppendChild(root);
    xd.Save(file.FullName);
}
/// <summary>
/// Downloads the list page at <paramref name="pageUri"/>, extracts the page
/// URLs found inside every list block, and calls <c>GetPageData</c> for each
/// URL with <paramref name="func"/> as the callback.
/// </summary>
/// <remarks>
/// Blocks until the completion counter equals the number of URLs found.
/// Returns early, without setting <c>state.TotalCount</c>, when the list page
/// contains no list block.
/// </remarks>
/// <param name="pageUri">Address of the list page to download.</param>
/// <param name="func">Callback forwarded to <c>GetPageData</c> for every page URL.</param>
private void AnalysisListPage(string pageUri, DataPackFunc func)
{
    int taskCount = 0,      // number of page URLs to process
        taskNumbers = 0;    // completion counter handed by ref to GetPageData
    string html;            // downloaded list-page HTML
    MatchCollection listMatches;

#if DEBUG
    Console.WriteLine("开始从:{0}下载数据...", pageUri);
#endif

    // Download the list page. The response, its stream and the reader are all
    // disposed, even if constructing the reader fails.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(pageUri);
    request.Timeout = this.RequestTimeOut;
    using (WebResponse response = request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(stream, this.Encode))
    {
        html = sr.ReadToEnd();
    }

#if DEBUG
    Console.WriteLine("返回的数据为:{0}", html);
#endif

    // Locate the list blocks in the page.
    listMatches = Regex.Matches(html, RuleFormat.Format(this.ListBlockRule));
    if (listMatches.Count == 0)
    {
#if DEBUG
        Console.WriteLine("没找到匹配!");
#endif
        return;
    }

    // NOTE(review): DEBUGS is not the DEBUG symbol, so this line is compiled
    // out unless DEBUGS is defined - looks deliberate; confirm.
#if DEBUGS
    Console.WriteLine("\r\n------------------------------\r\n得到匹配的列表数据为:\r\n");
#endif

    // Collect every page URL found inside the matched list blocks.
    Regex pageUriRegex = new Regex(this.FormatedPageUriRule);
    IList<string> pageUrls = new List<string>();
    foreach (Match m in listMatches)
    {
#if DEBUG
        Console.WriteLine("\r\n------------------------------------------------\r\n{0}", m.Value);
#endif
        foreach (Match pm in pageUriRegex.Matches(m.Value))
        {
#if DEBUG
            Console.WriteLine(pm.Value);
#endif
            pageUrls.Add(pm.Value);
        }
    }

    taskCount = pageUrls.Count;

    if (!this.UseMultiThread)
    {
        // Single-threaded: process the URLs in order on the calling thread.
        foreach (string pageUrl in pageUrls)
        {
            GetPageData(pageUrl, ref taskNumbers, func);
        }
    }
    else
    {
        MultiThreadProcess mp = new MultiThreadProcess(5, taskCount);
        mp.Start<IList<string>>(urls =>
        {
            // The lock guards the shared URL list and the by-ref counter.
            // NOTE(review): GetPageData runs inside the lock, so pages are
            // handled one at a time; confirm GetPageData updates the counter
            // atomically before narrowing the lock.
            lock (urls)
            {
                GetPageData(urls[0], ref taskNumbers, func);
                pageUrls.Remove(urls[0]);
            }
        }, pageUrls);
    }

    // Publish the total number of tasks.
    state.TotalCount = taskCount;

    // Wait until every task has been counted. The interlocked read forces a
    // fresh value of the captured counter on each pass, and the sleep avoids
    // burning a core while workers are still running.
    while (System.Threading.Interlocked.CompareExchange(ref taskNumbers, 0, 0) != taskCount)
    {
        System.Threading.Thread.Sleep(10);
    }

#if DEBUG
    Console.WriteLine("任务完成....!共采集到{0}条", taskCount);
#endif
}
/// <summary>
/// Downloads the list page at <paramref name="pageUri"/>, extracts the page
/// URLs found inside every list block, and calls <c>GetPageData</c> for each
/// URL with <paramref name="func"/> as the callback.
/// </summary>
/// <remarks>
/// Blocks until the completion counter equals the number of URLs found.
/// Returns early, without setting <c>state.TotalCount</c>, when the list page
/// contains no list block.
/// </remarks>
/// <param name="pageUri">Address of the list page to download.</param>
/// <param name="func">Callback forwarded to <c>GetPageData</c> for every page URL.</param>
private void AnalysisListPage(string pageUri, DataPackFunc func)
{
    int taskCount = 0,      // number of page URLs to process
        taskNumbers = 0;    // completion counter handed by ref to GetPageData
    string html;            // downloaded list-page HTML
    MatchCollection listMatches;

#if DEBUG
    Console.WriteLine("开始从:{0}下载数据...", pageUri);
#endif

    // Download the list page. The response, its stream and the reader are all
    // disposed, even if constructing the reader fails.
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(pageUri);
    request.Timeout = this.RequestTimeOut;
    using (WebResponse response = request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader sr = new StreamReader(stream, this.Encode))
    {
        html = sr.ReadToEnd();
    }

#if DEBUG
    Console.WriteLine("返回的数据为:{0}", html);
#endif

    // Locate the list blocks in the page.
    listMatches = Regex.Matches(html, RuleFormat.Format(this.ListBlockRule));
    if (listMatches.Count == 0)
    {
#if DEBUG
        Console.WriteLine("没找到匹配!");
#endif
        return;
    }

    // NOTE(review): DEBUGS is not the DEBUG symbol, so this line is compiled
    // out unless DEBUGS is defined - looks deliberate; confirm.
#if DEBUGS
    Console.WriteLine("\r\n------------------------------\r\n得到匹配的列表数据为:\r\n");
#endif

    // Collect every page URL found inside the matched list blocks.
    Regex pageUriRegex = new Regex(this.FormatedPageUriRule);
    IList<string> pageUrls = new List<string>();
    foreach (Match m in listMatches)
    {
#if DEBUG
        Console.WriteLine("\r\n------------------------------------------------\r\n{0}", m.Value);
#endif
        foreach (Match pm in pageUriRegex.Matches(m.Value))
        {
#if DEBUG
            Console.WriteLine(pm.Value);
#endif
            pageUrls.Add(pm.Value);
        }
    }

    taskCount = pageUrls.Count;

    if (!this.UseMultiThread)
    {
        // Single-threaded: process the URLs in order on the calling thread.
        foreach (string pageUrl in pageUrls)
        {
            GetPageData(pageUrl, ref taskNumbers, func);
        }
    }
    else
    {
        MultiThreadProcess mp = new MultiThreadProcess(5, taskCount);
        mp.Start<IList<string>>(urls =>
        {
            // The lock guards the shared URL list and the by-ref counter.
            // NOTE(review): GetPageData runs inside the lock, so pages are
            // handled one at a time; confirm GetPageData updates the counter
            // atomically before narrowing the lock.
            lock (urls)
            {
                GetPageData(urls[0], ref taskNumbers, func);
                pageUrls.Remove(urls[0]);
            }
        }, pageUrls);
    }

    // Publish the total number of tasks.
    state.TotalCount = taskCount;

    // Wait until every task has been counted. The interlocked read forces a
    // fresh value of the captured counter on each pass, and the sleep avoids
    // burning a core while workers are still running.
    while (System.Threading.Interlocked.CompareExchange(ref taskNumbers, 0, 0) != taskCount)
    {
        System.Threading.Thread.Sleep(10);
    }

#if DEBUG
    Console.WriteLine("任务完成....!共采集到{0}条", taskCount);
#endif
}