/// <summary>
/// Scans a downloaded page for links and queues every link that passes the
/// host filter and the file filter and is not yet recorded in DownloadedPages.
/// </summary>
/// <param name="html">Text of the downloaded page.</param>
/// <param name="pageUrl">URL of the page; handed to FixUrl together with each link.</param>
private void Parse(string html, string pageUrl)
{
    foreach (Match match in Regex.Matches(html, UrlDetectRegex))
    {
        string linkUrl = match.Groups["url"].Value;

        // URI schemes are case-insensitive, so "JavaScript:void(0)" must be
        // rejected just like "javascript:...". An ordinal comparison also keeps
        // the check independent of the current culture.
        if (string.IsNullOrEmpty(linkUrl)
            || linkUrl.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Convert a relative path into an absolute one.
        linkUrl = FixUrl(linkUrl, pageUrl);

        // Split the absolute URL into its host and file parts for filtering.
        Match linkUrlMatch = Regex.Match(linkUrl, urlParseRegex);
        string host = linkUrlMatch.Groups["host"].Value;
        string file = linkUrlMatch.Groups["file"].Value;

        bool accepted = Regex.IsMatch(host, HostFilter)
            && Regex.IsMatch(file, FileFilter)
            && !DownloadedPages.ContainsKey(linkUrl);
        if (!accepted)
        {
            continue;
        }

        // NOTE(review): the monitor stays `this` so that it keeps matching any
        // other code that synchronizes on this instance; that code is not
        // visible from here - confirm before switching to a private lock object.
        lock (this)
        {
            pending.Enqueue(linkUrl);
        }
    }
}
/// <summary>
/// Crawls from StartURL: repeatedly takes a URL from the pending queue,
/// downloads it, records it in DownloadedPages, raises PageDownloaded and
/// parses the page for further links. Stops when MaxPage pages are recorded
/// or the queue is empty, then raises CrawlerStopped.
/// </summary>
/// <remarks>
/// Each download is awaited before the next one starts, so at most one
/// download is in flight at any time and MaxParallel is not consulted here.
/// The earlier throttle compared the cumulative DownloadedPages.Count with
/// MaxParallel; because that count never decreases, the loop waited forever
/// once more than MaxParallel pages had been downloaded.
/// </remarks>
/// <returns>A task that completes when the crawl has stopped.</returns>
public async Task Start()
{
    DownloadedPages.Clear();

    // Drain whatever is left over from a previous run.
    while (pending.TryDequeue(out string _))
    {
    }

    pending.Enqueue(StartURL);

    // TryDequeue doubles as the "queue is not empty" test, so url is only
    // ever used when an item was actually removed.
    while (DownloadedPages.Count < MaxPage && pending.TryDequeue(out string url))
    {
        try
        {
            // A link can be queued several times before its first download
            // finishes; do not fetch and report it again. The check sits inside
            // the try so that a bad key is reported like any other failure.
            if (DownloadedPages.ContainsKey(url))
            {
                continue;
            }

            string html = await DownLoad(url); // download
            DownloadedPages[url] = true;
            PageDownloaded(this, url, "success");
            Parse(html, url); // parse the page and queue the new links
        }
        catch (Exception ex)
        {
            PageDownloaded(this, url, " Error:" + ex.Message);
        }
    }

    CrawlerStopped(this);
}
/// <summary>
/// Crawls from StartURL by handing every queued URL to RunTask on a
/// thread-pool task. The loop ends when DownloadedPages holds MaxPage entries,
/// or when the queue is empty and no task is running; CrawlerStopped is
/// raised afterwards.
/// </summary>
/// <remarks>
/// This method blocks the calling thread until the loop ends.
/// NOTE(review): RunTask is presumed to add links to <c>pending</c> and to
/// decrement <c>TaskRunning</c> while holding <c>lock (this)</c>; it is not
/// visible from here - confirm. The monitor is kept as <c>this</c> so it
/// stays the same one this method already used for TaskRunning.
/// </remarks>
public void Start()
{
    DownloadedPages.Clear();
    pending.Clear();
    pending.Enqueue(StartURL);
    TaskRunning = 0;

    while (DownloadedPages.Count < MaxPage)
    {
        string url = null;
        bool dequeued = false;
        bool finished;

        // The emptiness check, the dequeue and the running-task increment are
        // done under one lock: the queue is not thread-safe, and reading
        // pending.Count and TaskRunning separately could see "empty" and
        // "idle" at different moments and stop the crawl too early.
        lock (this)
        {
            finished = pending.Count == 0 && TaskRunning == 0;
            if (pending.Count > 0)
            {
                url = pending.Dequeue();
                TaskRunning++;
                dequeued = true;
            }
        }

        if (finished)
        {
            break;
        }

        if (!dequeued)
        {
            // Tasks are still running but nothing is queued yet: back off
            // briefly instead of spinning a CPU core at full speed.
            System.Threading.Thread.Sleep(10);
            continue;
        }

        Task.Run(() => RunTask(url));
    }

    CrawlerStopped(this);
}
/// <summary>
/// Scans a downloaded page for links and queues every link that passes the
/// host filter and the file filter and is not yet recorded in DownloadedPages.
/// </summary>
/// <param name="html">Text content of the page.</param>
/// <param name="current">URL the text was downloaded from; handed to TransToComplete together with each link.</param>
public void Parse(string html, string current)
{
    foreach (Match match in Regex.Matches(html, strRef))
    {
        string linkUrl = match.Groups["url"].Value;

        // Empty or script pseudo-links are not real page addresses: skip them.
        // URI schemes are case-insensitive, so "JavaScript:void(0)" must be
        // rejected just like "javascript:..."; the ordinal comparison also keeps
        // the check independent of the current culture.
        if (string.IsNullOrEmpty(linkUrl)
            || linkUrl.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Convert to an absolute path.
        linkUrl = TransToComplete(linkUrl, current);

        // Extract the host and file parts so they can be filtered.
        Match linkUrlMatch = Regex.Match(linkUrl, urlParseRegex);
        string host = linkUrlMatch.Groups["host"].Value;
        string file = linkUrlMatch.Groups["file"].Value;

        // Both filters are regular expressions; the link must match both and
        // must not have been downloaded already.
        bool accepted = Regex.IsMatch(host, HostFilter)
            && Regex.IsMatch(file, FileFilter)
            && !DownloadedPages.ContainsKey(linkUrl);
        if (accepted)
        {
            pending.Enqueue(linkUrl);
        }
    }
}
/// <summary>
/// Resets the crawl state, seeds a new pending queue with StartURL, runs Add
/// through Parallel.Invoke and then raises CrawlerStopped.
/// </summary>
public void Start()
{
    // Forget the pages recorded by any earlier run.
    DownloadedPages.Clear();

    // Replace the queue with a new one that holds only the start URL.
    pending = new Queue<string>();
    pending.Enqueue(StartURL);

    // Parallel.Invoke returns only after the supplied action has completed.
    Action crawl = () => Add();
    Parallel.Invoke(crawl);

    CrawlerStopped(this);
}
/// <summary>
/// Crawls from StartURL one page at a time: takes a URL from the pending
/// queue, downloads it, records it in DownloadedPages, raises PageDownloaded
/// and parses the page for further links. Stops when MaxPage pages are
/// recorded or the queue is empty, then raises CrawlerStopped.
/// </summary>
public void Start()
{
    DownloadedPages.Clear();
    pending.Clear();
    pending.Enqueue(StartURL);

    while (DownloadedPages.Count < MaxPage && pending.Count > 0)
    {
        string url = pending.Dequeue();
        try
        {
            // A link can be queued several times before its first download
            // finishes; do not fetch and report it again. The check sits inside
            // the try so that a bad key is reported like any other failure.
            if (DownloadedPages.ContainsKey(url))
            {
                continue;
            }

            string html = DownLoad(url); // download
            DownloadedPages[url] = true;
            PageDownloaded(this, url, "success");
            Parse(html, url); // parse the page and queue the new links
        }
        catch (Exception ex)
        {
            PageDownloaded(this, url, " Error:" + ex.Message);
        }
    }

    CrawlerStopped(this);
}
/// <summary>
/// Resets the crawl state: empties the pending queue, queues StartURL as its
/// only entry and clears DownloadedPages.
/// </summary>
public void init()
{
    // Rebuild the work queue so that it contains just the start URL.
    pending.Clear();
    pending.Enqueue(StartURL);

    // Drop the record of previously downloaded pages.
    DownloadedPages.Clear();
}