Example #1
        /// <summary>
        /// BFS-style crawl
        /// </summary>
        public void BfsCraw()
        {
            var urlQueue = new Queue<string>();

            urlQueue.Enqueue(firstUrl);
            while (!stopFlag && count < maxCount && urlQueue.Count > 0)
            {
                string url = urlQueue.Dequeue();
                if (hasCrawedUrl.Contains(url))
                {
                    continue; // the same URL can be enqueued more than once
                }
                CrawlPageStarted?.Invoke(url); // crawl started
                string filepath = folderpath + "/" + (count + 1) + ".html";
                string html;
                try
                {
                    html = Download(url, filepath);
                }
                catch (Exception e)
                {
                    CrawlPageFailed?.Invoke(url, e.Message); // crawl failed
                    continue; // skip this page rather than aborting the whole crawl
                }
                CrawlPageSucceeded?.Invoke(url, html); // crawl succeeded
                hasCrawedUrl.Add(url);
                count++;

                foreach (var nxtUrl in UrlsInHtml(html, url))
                {
                    if (!hasCrawedUrl.Contains(nxtUrl))
                    {
                        urlQueue.Enqueue(nxtUrl);
                    }
                }
            }
            CrawlTaskEnded?.Invoke();
        }
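
All three examples lean on fields, events, and helpers declared elsewhere in the crawler class. Below is a minimal sketch of that assumed context; the member names are taken from the snippets, but the field types, the class name, and the helper bodies are guesses, not the original implementation:

    using System;
    using System.Collections.Generic;

    public partial class Crawler // hypothetical class name; not given in the snippets
    {
        private string firstUrl;        // seed URL
        private string folderpath;      // directory the .html files are saved into
        private int count;              // pages crawled so far
        private int maxCount;           // upper bound on pages to crawl
        private volatile bool stopFlag; // set externally to stop the crawl
        private readonly HashSet<string> hasCrawedUrl = new HashSet<string>();

        public event Action<string> CrawlPageStarted;
        public event Action<string, string> CrawlPageFailed;
        public event Action<string, string> CrawlPageSucceeded;
        public event Action CrawlTaskEnded;

        // Assumed helper: downloads url, writes the HTML to filepath, returns the HTML.
        private string Download(string url, string filepath)
            => throw new NotImplementedException();

        // Assumed helper: extracts absolute links from html, resolved against baseUrl.
        private IEnumerable<string> UrlsInHtml(string html, string baseUrl)
            => throw new NotImplementedException();
    }

A HashSet<string> for hasCrawedUrl keeps the Contains checks O(1); a List<string> would also compile but degrades on large crawls.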
Example #2
        private void Dfs(string url)
        {
            if (stopFlag || count >= maxCount)
            {
                return;
            }

            CrawlPageStarted?.Invoke(url); // crawl started
            string filepath = folderpath + "/" + (count + 1) + ".html";
            string html;

            try
            {
                html = Download(url, filepath);
            }
            catch (Exception e)
            {
                CrawlPageFailed?.Invoke(url, e.Message); // crawl failed
                return;
            }
            CrawlPageSucceeded?.Invoke(url, html); // crawl succeeded
            hasCrawedUrl.Add(url);
            count++;

            foreach (var nxtUrl in UrlsInHtml(html, url))
            {
                if (!hasCrawedUrl.Contains(nxtUrl))
                {
                    Dfs(nxtUrl);
                }
            }
        }
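
Only the recursive helper is shown; a public entry point analogous to BfsCraw presumably seeds it and raises CrawlTaskEnded. A sketch of what that wrapper could look like (DfsCraw is a hypothetical name):

        /// <summary>
        /// DFS-style crawl
        /// </summary>
        public void DfsCraw()
        {
            Dfs(firstUrl);
            CrawlTaskEnded?.Invoke();
        }

Note that the recursion depth equals the length of the link chain being followed; on very deep sites an explicit Stack<string>, mirroring the Queue<string> in BfsCraw, avoids a possible StackOverflowException.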
Example #3
        private void MultiThMethod(string url)
        {
            if (stopFlag || count >= maxCount)
            {
                return;
            }

            CrawlPageStarted?.Invoke(url); // crawl started
            string html;

            try
            {
                html = GetHtml(url);
            }
            catch (Exception e)
            {
                CrawlPageFailed?.Invoke(url, e.Message); // crawl failed
                return;
            }
            mutex.WaitOne();
            // Re-check under the mutex: the unsynchronized test at the top of the method can race.
            if (!hasCrawedUrl.Contains(url) && count < maxCount)
            {
                count++;
                hasCrawedUrl.Add(url);
                int pageIndex = count; // snapshot while holding the mutex; count can change after release
                mutex.ReleaseMutex();
                string filepath = folderpath + "/" + pageIndex + ".html";
                try
                {
                    File.WriteAllText(filepath, html, Encoding.UTF8);
                }
                catch (Exception e)
                {
                    CrawlPageFailed?.Invoke(url, e.Message); // crawl failed
                    return;
                }
                CrawlPageSucceeded?.Invoke(url, html); // crawl succeeded
                foreach (var nxtUrl in UrlsInHtml(html, url))
                {
                    mutex.WaitOne();
                    bool canCrawl = !hasCrawedUrl.Contains(nxtUrl);
                    mutex.ReleaseMutex();
                    if (canCrawl)
                    {
                        Task.Run(() => MultiThMethod(nxtUrl));
                    }
                }
            }
            else
            {
                mutex.ReleaseMutex();
            }
        }
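
The multithreaded version additionally assumes a mutex field, a GetHtml helper, and a launcher that seeds the first task (plus using directives for System.IO, System.Text, System.Threading, and System.Threading.Tasks). A sketch under those assumptions; MultiThCraw and the helper body are hypothetical:

        private readonly Mutex mutex = new Mutex(); // guards count and hasCrawedUrl

        // Assumed helper: downloads url and returns the HTML without saving it.
        private string GetHtml(string url)
            => throw new NotImplementedException();

        // Hypothetical launcher. Task.Run is fire-and-forget here, so unlike the
        // BFS/DFS versions there is no obvious point at which to raise CrawlTaskEnded.
        public void MultiThCraw()
        {
            Task.Run(() => MultiThMethod(firstUrl));
        }

Since all the contention is within one process, a plain lock (Monitor) over a private object would be lighter than a kernel-level Mutex, which is mainly useful for cross-process synchronization.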