Beispiel #1
0
        /// <summary>
        /// Scans <paramref name="html"/> for links matching <c>UrlDetectRegex</c> and enqueues
        /// onto <c>pending</c> every link whose host and file parts satisfy <c>HostFilter</c> and
        /// <c>FileFilter</c> and which is not already a key of <c>DownloadedPages</c>.
        /// </summary>
        /// <param name="html">Markup to scan for links.</param>
        /// <param name="pageUrl">
        /// Passed to <c>FixUrl</c> together with each link to turn relative paths into absolute ones.
        /// </param>
        private void Parse(string html, string pageUrl)
        {
            // The static Regex API reuses the framework's pattern cache instead of building
            // a new Regex instance on every call.
            foreach (Match match in Regex.Matches(html, UrlDetectRegex))
            {
                string linkUrl = match.Groups["url"].Value;

                // URI schemes are case-insensitive, so "JavaScript:void(0)" has to be skipped
                // just like "javascript:void(0)"; ordinal comparison keeps the check culture-independent.
                if (string.IsNullOrEmpty(linkUrl) ||
                    linkUrl.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                linkUrl = FixUrl(linkUrl, pageUrl); // relative path -> absolute path

                // Split the absolute URL into its host and file parts so each can be filtered.
                // If urlParseRegex does not match, both groups yield "" and the filters see that.
                Match  linkUrlMatch = Regex.Match(linkUrl, urlParseRegex);
                string host         = linkUrlMatch.Groups["host"].Value;
                string file         = linkUrlMatch.Groups["file"].Value;

                bool passesFilters = Regex.IsMatch(host, HostFilter) && Regex.IsMatch(file, FileFilter);
                if (passesFilters && !DownloadedPages.ContainsKey(linkUrl))
                {
                    // NOTE(review): locking on `this` is kept as-is because other members may
                    // synchronize on the same instance; a private lock object would be preferable.
                    lock (this)
                    {
                        pending.Enqueue(linkUrl);
                    }
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Scans <paramref name="html"/> for links matching <c>strRef</c> and enqueues onto
        /// <c>pending</c> every link whose host and file parts satisfy <c>HostFilter</c> and
        /// <c>FileFilter</c> and which is not already a key of <c>DownloadedPages</c>.
        /// </summary>
        /// <param name="html">Text content of the downloaded page.</param>
        /// <param name="current">
        /// URL the text was downloaded from; passed to <c>TransToComplete</c> together with each
        /// link to turn it into an absolute path.
        /// </param>
        public void Parse(string html, string current)
        {
            // The static Regex API reuses the framework's pattern cache instead of building
            // a new Regex instance on every call.
            foreach (Match match in Regex.Matches(html, strRef))
            {
                string linkUrl = match.Groups["url"].Value;

                // Empty or script pseudo-links are not real page addresses, so move on.
                // URI schemes are case-insensitive, hence the OrdinalIgnoreCase comparison:
                // "JavaScript:void(0)" must be skipped just like "javascript:void(0)".
                if (string.IsNullOrEmpty(linkUrl) ||
                    linkUrl.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                linkUrl = TransToComplete(linkUrl, current); // convert to an absolute path

                // Split the URL into its host and file parts and filter on each.
                // If urlParseRegex does not match, both groups yield "" and the filters see that.
                Match  linkUrlMatch = Regex.Match(linkUrl, urlParseRegex);
                string host         = linkUrlMatch.Groups["host"].Value;
                string file         = linkUrlMatch.Groups["file"].Value;

                bool passesFilters = Regex.IsMatch(host, HostFilter) && Regex.IsMatch(file, FileFilter);
                if (passesFilters && !DownloadedPages.ContainsKey(linkUrl))
                {
                    // NOTE(review): pending is mutated without synchronization here; confirm
                    // this method is never called from more than one thread at a time.
                    pending.Enqueue(linkUrl);
                }
            }
        }