/// <summary>
/// Registers a URL that is now being crawled by handing it to the listener.
/// </summary>
/// <param name="url">Descriptor of the URL being crawled.</param>
/// <returns>The value returned by <c>iListener.OnAddUrl</c> for this URL.</returns>
public Boolean AddUrl(Url_info url)
{
    return iListener.OnAddUrl(url);
}
/// <summary>
/// Posts a new crawl task to the listener. The descriptor passed in is
/// forwarded unchanged, so it carries whatever source URL the caller set.
/// </summary>
/// <param name="url">Descriptor of the URL to be queued.</param>
/// <returns>The value returned by <c>iListener.OnPostWork</c> for this URL.</returns>
public Boolean postQueue(Url_info url)
{
    return iListener.OnPostWork(url);
}
/// <summary>
/// Removes and returns the URL at the head of the work queue.
/// </summary>
/// <returns>The dequeued URL descriptor, or <c>null</c> when the queue is empty.</returns>
public Url_info getNewWork()
{
    // The count check and the dequeue happen under one lock on the queue.
    lock (workQueue)
    {
        if (workQueue.Count == 0)
        {
            return null;
        }

        return (Url_info)workQueue.Dequeue();
    }
}
/// <summary>
/// Loads the initial work queue from a text file containing one URL per line.
/// </summary>
/// <remarks>
/// Each non-blank line is trimmed and queued as a root entry (no source URL,
/// level 0). If an <see cref="IOException"/> interrupts reading, the message
/// is written to the console, the lines read so far are still queued, and
/// <c>false</c> is returned.
/// </remarks>
/// <param name="filepath">Path of the seed file to read.</param>
/// <returns><c>true</c> if the file was read completely; <c>false</c> on an I/O error.</returns>
private Boolean LoadList(string filepath)
{
    ArrayList seeds = new ArrayList();
    Boolean loaded = true;

    try
    {
        // 'using' closes the reader on every exit path.
        using (StreamReader reader = File.OpenText(filepath))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // Skip blank lines so no work item is created with an empty URL.
                string candidate = line.Trim();
                if (candidate.Length != 0)
                {
                    seeds.Add(candidate);
                }
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine(e.Message);
        loaded = false;
    }

    // Enqueue outside of any finally block so a failure here cannot mask an
    // in-flight exception, and under the same lock getNewWork() dequeues with.
    lock (workQueue)
    {
        foreach (string seed in seeds)
        {
            Url_info url = new Url_info();
            url.meUrl = seed;
            url.fromUrl = null;
            url.sid = 0;
            workQueue.Enqueue(url);
        }
    }

    return loaded;
}
/// <summary>
/// Records a URL in the crawled-URL table unless an entry with the same key
/// is already present.
/// </summary>
/// <remarks>
/// The table key is the string form of the CRC value computed over the URL
/// bytes in <c>Encoding.Default</c>; two URLs producing the same value are
/// treated as the same entry.
/// </remarks>
/// <param name="url">Descriptor of the URL to record.</param>
/// <returns><c>true</c> if the URL was added; <c>false</c> if its key was already in the table.</returns>
public Boolean OnAddUrl(Url_info url)
{
    lock (lastUrlTable)
    {
        Crc32 checksum = new Crc32();
        checksum.Reset();
        checksum.Crc(Encoding.Default.GetBytes(url.meUrl));
        string key = checksum.Value.ToString();

        // Already crawled: leave the table untouched.
        if (lastUrlTable.Contains(key))
        {
            return false;
        }

        lastUrlTable.Add(key, url);
        return true;
    }
}
/// <summary>
/// Queues a new URL for crawling unless its key is already in the
/// crawled-URL table.
/// </summary>
/// <remarks>
/// The key is the string form of the CRC value computed over the URL bytes in
/// <c>Encoding.Default</c>. The table is only read here, never modified.
/// </remarks>
/// <param name="url">Descriptor of the URL to queue.</param>
/// <returns><c>true</c> if the URL was queued; <c>false</c> if it was already crawled.</returns>
public Boolean OnPostWork(Url_info url)
{
    Boolean alreadyCrawled;

    lock (lastUrlTable)
    {
        Crc32 checksum = new Crc32();
        checksum.Reset();
        checksum.Crc(Encoding.Default.GetBytes(url.meUrl));
        alreadyCrawled = lastUrlTable.Contains(checksum.Value.ToString());
    }

    if (alreadyCrawled)
    {
        return false;
    }

    // getNewWork() dequeues under lock (workQueue); enqueue under that same
    // lock so the queue is never mutated by two threads at once. The locks are
    // taken one after the other (not nested) to avoid any lock-order issue.
    lock (workQueue)
    {
        workQueue.Enqueue(url);
    }

    return true;
}
/// <summary>
/// Main loop of a worker thread (marked as unfinished by the original author).
/// </summary>
/// <remarks>
/// Loops until <c>IsWork</c> becomes <c>true</c>, which happens here once
/// <c>iListener.OnCloseWork()</c> reports <c>true</c>. Each pass pumps
/// pending UI messages, takes one URL from the queue, and, if the URL is
/// accepted by <c>AddUrl</c>, downloads the page, posts every URL the analyzer
/// extracted, and saves the page.
/// </remarks>
/// <param name="state">Not used by this method.</param>
public void startWork(object state)
{
    while (!IsWork)
    {
        System.Windows.Forms.Application.DoEvents();

        // Checked again after DoEvents: no task in hand, so ask the queue for one.
        if (!IsWork)
        {
            url_Info = getQueue();
            if (url_Info != null)
            {
                // Stamp the crawl time on the task before processing it.
                url_Info.lastTime = DateTime.Now.ToString();

                if (AddUrl(url_Info))
                {
                    GetHtml fetcher = new GetHtml();
                    string pageHtml = fetcher.GetPage(url_Info.meUrl);
                    HtmlAnalyzer analyzer = new HtmlAnalyzer(pageHtml, url_Info.meUrl);

                    // Only when the analyzer produced a URL list.
                    if (analyzer.NewUrl != null)
                    {
                        for (int i = 0; i < analyzer.NewUrl.Count; i++)
                        {
                            Url_info found = new Url_info();
                            found.meUrl = analyzer.NewUrl[i].ToString(); // URL extracted by the analyzer
                            found.fromUrl = url_Info.meUrl;               // source page
                            found.sid = url_Info.sid + 1;                 // one level deeper than the source
                            postQueue(found);                             // hand it to the task queue
                        }
                    }

                    fetcher.SavePage(pageHtml, url_Info.meUrl);
                }
            }
        }

        if (iListener.OnCloseWork() == true)
        {
            IsWork = true;
        }
    }
}
/// <summary>
/// Claims the next task by asking the listener for new work.
/// </summary>
/// <returns>The value returned by <c>iListener.getNewWork()</c>.</returns>
public Url_info getQueue()
{
    return iListener.getNewWork();
}