Beispiel #1
0
        /// <summary>
        /// Scans a downloaded page for links and queues every link that passes the
        /// host and file filters and is not yet recorded in <c>DownloadedPages</c>.
        /// </summary>
        /// <param name="html">Markup of the downloaded page.</param>
        /// <param name="pageUrl">URL the markup came from; handed to <c>FixUrl</c> together with each link.</param>
        private void Parse(string html, string pageUrl)
        {
            // The static Regex API reuses the framework's pattern cache instead of
            // constructing a new Regex object on every call.
            foreach (Match match in Regex.Matches(html, UrlDetectRegex))
            {
                string linkUrl = match.Groups["url"].Value;

                // Skip empty captures and script pseudo-links. URI schemes are
                // case-insensitive, so "JavaScript:" must be rejected as well, and the
                // comparison must not depend on the current culture.
                if (string.IsNullOrEmpty(linkUrl) ||
                    linkUrl.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Convert a relative path into an absolute one.
                linkUrl = FixUrl(linkUrl, pageUrl);

                // Split the URL into the host and file parts used by the filters.
                Match  linkUrlMatch = Regex.Match(linkUrl, urlParseRegex);
                string host         = linkUrlMatch.Groups["host"].Value;
                string file         = linkUrlMatch.Groups["file"].Value;

                bool accepted = Regex.IsMatch(host, HostFilter)
                                && Regex.IsMatch(file, FileFilter)
                                && !DownloadedPages.ContainsKey(linkUrl);
                if (!accepted)
                {
                    continue;
                }

                // NOTE(review): the lock object is kept as `this` because other members
                // outside this block may synchronize on the same object - confirm before
                // switching to a private gate.
                lock (this)
                {
                    pending.Enqueue(linkUrl);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Resets the crawler state and crawls from <c>StartURL</c>, one page at a time,
        /// until <c>MaxPage</c> pages have been recorded or the pending queue is empty.
        /// Raises <c>PageDownloaded</c> after every attempt and <c>CrawlerStopped</c> at the end.
        /// </summary>
        /// <returns>A task that completes when the crawl loop has finished.</returns>
        public async Task Start()
        {
            DownloadedPages.Clear();

            // Drain whatever is left over from a previous run.
            while (pending.TryDequeue(out _))
            {
            }
            pending.Enqueue(StartURL);

            // The previous throttle (`DownloadedPages.Count > MaxParallel` -> delay and retry)
            // compared the total number of finished pages, which this loop never decreases,
            // so once it tripped the loop waited forever and CrawlerStopped was never raised.
            // Each download is awaited before the next URL is dequeued, so at most one
            // download is in flight here and no throttle is needed.
            while (DownloadedPages.Count < MaxPage && pending.Count > 0)
            {
                if (!pending.TryDequeue(out string url))
                {
                    // Nothing could be taken; let the loop condition decide what happens next.
                    continue;
                }

                try
                {
                    string html = await DownLoad(url); // download

                    DownloadedPages[url] = true;
                    PageDownloaded(this, url, "success");
                    Parse(html, url); // parse the page and queue newly found links
                }
                catch (Exception ex)
                {
                    // A failing page is reported and skipped; the crawl continues.
                    PageDownloaded(this, url, "  Error:" + ex.Message);
                }
            }
            CrawlerStopped(this);
        }
Beispiel #3
0
 /// <summary>
 /// Resets the crawler state, then dispatches one background task per pending URL
 /// until <c>MaxPage</c> pages are recorded or there is neither queued nor running work.
 /// Raises <c>CrawlerStopped</c> when the dispatch loop ends.
 /// </summary>
 public void Start()
 {
     DownloadedPages.Clear();
     pending.Clear();
     pending.Enqueue(StartURL);
     TaskRunning = 0;
     while (DownloadedPages.Count < MaxPage)
     {
         if (pending.Count == 0)
         {
             if (TaskRunning == 0)
             {
                 // Nothing queued and nothing in flight: the crawl is finished.
                 break;
             }

             // Tasks are still running and may produce new URLs. Back off briefly
             // instead of spinning at full speed on an empty queue.
             System.Threading.Thread.Sleep(10);
             continue;
         }

         string url = pending.Dequeue();
         lock (this)
         {
             TaskRunning++;
         }
         // NOTE(review): RunTask is not visible here; presumably it decrements
         // TaskRunning and enqueues discovered links - confirm it synchronizes
         // its access to `pending` with this loop.
         Task.Run(() => RunTask(url));
     }
     CrawlerStopped(this);
 }
Beispiel #4
0
        /// <summary>
        /// Scans page text for links and queues every link that passes the host and
        /// file filters and is not yet recorded in <c>DownloadedPages</c>.
        /// </summary>
        /// <param name="html">Text content of the downloaded page.</param>
        /// <param name="current">URL the text was downloaded from; handed to <c>TransToComplete</c> together with each link.</param>
        public void Parse(string html, string current)
        {
            // The static Regex API reuses the framework's pattern cache instead of
            // constructing a new Regex object on every call.
            foreach (Match match in Regex.Matches(html, strRef))
            {
                string linkUrl = match.Groups["url"].Value;

                // Empty captures and script pseudo-links are not real page addresses.
                // URI schemes are case-insensitive, so "JavaScript:" is rejected too,
                // and the comparison must not depend on the current culture.
                if (string.IsNullOrEmpty(linkUrl) ||
                    linkUrl.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                // Convert to an absolute path.
                linkUrl = TransToComplete(linkUrl, current);

                // Split into host and file parts, then filter on both.
                Match  linkUrlMatch = Regex.Match(linkUrl, urlParseRegex);
                string host         = linkUrlMatch.Groups["host"].Value;
                string file         = linkUrlMatch.Groups["file"].Value;

                bool accepted = Regex.IsMatch(host, HostFilter)
                                && Regex.IsMatch(file, FileFilter)
                                && !DownloadedPages.ContainsKey(linkUrl);
                if (accepted)
                {
                    pending.Enqueue(linkUrl);
                }
            }
        }
Beispiel #5
0
        /// <summary>
        /// Resets the crawler state, seeds a fresh queue with <c>StartURL</c>, runs
        /// <c>Add</c> through <c>Parallel.Invoke</c> and then raises <c>CrawlerStopped</c>.
        /// </summary>
        public void Start()
        {
            DownloadedPages.Clear();

            // Start from a brand-new queue holding only the start address.
            pending = new Queue<string>();
            pending.Enqueue(StartURL);

            // Parallel.Invoke returns only after the supplied action has completed.
            Parallel.Invoke(() => Add());

            CrawlerStopped(this);
        }
Beispiel #6
0
        /// <summary>
        /// Resets the crawler state and crawls sequentially from <c>StartURL</c> until
        /// <c>MaxPage</c> pages have been recorded or the pending queue is empty.
        /// Raises <c>PageDownloaded</c> after every attempt and <c>CrawlerStopped</c> at the end.
        /// </summary>
        public void Start()
        {
            DownloadedPages.Clear();
            pending.Clear();
            pending.Enqueue(StartURL);

            while (true)
            {
                bool moreWork = DownloadedPages.Count < MaxPage && pending.Count > 0;
                if (!moreWork)
                {
                    break;
                }

                string address = pending.Dequeue();
                try
                {
                    // Download the page.
                    string content = DownLoad(address);
                    DownloadedPages[address] = true;
                    PageDownloaded(this, address, "success");

                    // Parse it and queue the links it contains.
                    Parse(content, address);
                }
                catch (Exception failure)
                {
                    string report = "  Error:" + failure.Message;
                    PageDownloaded(this, address, report);
                }
            }

            CrawlerStopped(this);
        }
Beispiel #7
0
 /// <summary>
 /// Resets the crawler to its starting state: clears the record of downloaded
 /// pages, empties the pending queue and queues <c>StartURL</c> as the only entry.
 /// </summary>
 public void init()
 {
     DownloadedPages.Clear();
     pending.Clear();
     // Seed the queue only after it has been emptied.
     pending.Enqueue(StartURL);
 }