/// <summary>
/// Scans a page's HTML for links and queues every link that passes the host and
/// file filters and is not already a key in <c>DownloadedPages</c>.
/// </summary>
/// <param name="html">HTML text to scan for links.</param>
/// <param name="pageUrl">URL of the page; passed to <c>FixUrl</c> together with each link.</param>
private void Parse(string html, string pageUrl)
{
    // The static Regex API reuses cached parsed patterns instead of constructing
    // a new Regex instance on every call.
    foreach (Match match in Regex.Matches(html, UrlDetectRegex))
    {
        string linkUrl = match.Groups["url"].Value;

        // Skip empty links and script pseudo-links. URI schemes are case-insensitive,
        // so "JavaScript:" must be rejected as well; an ordinal comparison also keeps
        // the check independent of the current culture.
        if (string.IsNullOrEmpty(linkUrl)
            || linkUrl.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Convert a relative path to an absolute one.
        linkUrl = FixUrl(linkUrl, pageUrl);

        // Split the URL into its host and file parts so each can be filtered separately.
        Match linkUrlMatch = Regex.Match(linkUrl, urlParseRegex);
        string host = linkUrlMatch.Groups["host"].Value;
        string file = linkUrlMatch.Groups["file"].Value;

        if (Regex.IsMatch(host, HostFilter)
            && Regex.IsMatch(file, FileFilter)
            && !DownloadedPages.ContainsKey(linkUrl))
        {
            // NOTE(review): locking on `this` is generally discouraged, but it is kept
            // because code outside this method may synchronize on the same instance.
            lock (this)
            {
                pending.Enqueue(linkUrl);
            }
        }
    }
}
/// <summary>
/// Scans downloaded text for links and queues every link that passes the host and
/// file filters and is not already a key in <c>DownloadedPages</c>.
/// </summary>
/// <param name="html">The text content to scan.</param>
/// <param name="current">The URL the text was downloaded from; passed to <c>TransToComplete</c> together with each link.</param>
public void Parse(string html, string current)
{
    // The static Regex API reuses cached parsed patterns instead of constructing
    // a new Regex instance on every call.
    MatchCollection matches = Regex.Matches(html, strRef);

    foreach (Match match in matches)
    {
        string linkUrl = match.Groups["url"].Value;

        // Empty or script links are not real page addresses, so move on to the next one.
        // URI schemes are case-insensitive, so "JavaScript:" must be rejected as well;
        // an ordinal comparison also keeps the check independent of the current culture.
        if (string.IsNullOrEmpty(linkUrl)
            || linkUrl.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Convert to an absolute path.
        linkUrl = TransToComplete(linkUrl, current);

        // Parse out the host and file parts and filter on each of them.
        Match linkUrlMatch = Regex.Match(linkUrl, urlParseRegex);
        string host = linkUrlMatch.Groups["host"].Value;
        string file = linkUrlMatch.Groups["file"].Value;

        // Both parts must match their regular-expression filters, and the link must
        // not already be recorded in DownloadedPages.
        if (Regex.IsMatch(host, HostFilter)
            && Regex.IsMatch(file, FileFilter)
            && !DownloadedPages.ContainsKey(linkUrl))
        {
            // NOTE(review): this enqueue is not synchronized; confirm that `pending`
            // is only touched from one thread or is a thread-safe collection.
            pending.Enqueue(linkUrl);
        }
    }
}