/// <summary>
/// Queues a URL for crawling unless it is empty, the pending queue already
/// holds more than <c>MaxCount</c> entries, or the URL is already present in
/// one of the pending / completed / downloading / failed collections.
/// </summary>
/// <param name="result">Candidate URL; null or empty values are ignored.</param>
private void AddPending(string result)
{
    // Nothing to queue, or the pending queue is already over its limit.
    if (string.IsNullOrEmpty(result) || pendingUrls.Count > MaxCount)
    {
        return;
    }

    // Skip URLs that are already pending, completed, in flight, or failed.
    // NOTE(review): DownloadingUrls is read here without taking urlLock, while
    // Crawl mutates it under that lock — confirm whether this read is safe.
    bool alreadyTracked = pendingUrls.Contains(result)
                          || CompletedUrls.Contains(result)
                          || DownloadingUrls.Contains(result)
                          || FailedUrls.Contains(result);
    if (alreadyTracked)
    {
        return;
    }

    pendingUrls.Enqueue(result);
}
// Example #2
// 0
        /// <summary>
        /// Worker-thread entry point. Repeatedly takes the first URL from
        /// <c>MainForm.UrlManager.Urls</c>, fetches and segments that page via
        /// <c>HtmlHelper.SetupSingleUrl</c>, merges the returned word counts into
        /// <c>NewWords</c>, and records the page as fetched or failed.
        /// Returns once the URL list is empty.
        /// </summary>
        void StartCrawlePage()
        {
            while (true)
            {
                // Take the next URL. The emptiness check and the removal happen
                // under the same lock so two workers can never claim the same item.
                UrlItem url;
                var urls = MainForm.UrlManager.Urls;
                lock (urls)
                {
                    if (urls.Count <= 0)
                    {
                        return;
                    }
                    url = urls[0];
                    // RemoveAt avoids the linear equality search Remove(item) performs.
                    urls.RemoveAt(0);
                }

                // Fetch a single page and segment it into words.
                Dictionary<string, ulong> words = null;
                bool fetched;
                try
                {
                    words = HtmlHelper.SetupSingleUrl(url.Url, false, Service.JWordSegmentor, Service.DbHelper.WebRuleCollections);
                    fetched = true;
                }
                catch (Exception)
                {
                    // Any failure is recorded as a failed page below; the worker keeps going.
                    fetched = false;
                }

                if (fetched)
                {
                    // Merge the words fetched by this thread into the shared totals.
                    if (words != null && words.Count > 0)
                    {
                        lock (NewWords)
                        {
                            foreach (KeyValuePair<string, ulong> word in words)
                            {
                                // TryGetValue leaves the count at 0 for an unseen word,
                                // so one lookup covers both the add and the update case.
                                ulong existingCount;
                                NewWords.TryGetValue(word.Key, out existingCount);
                                NewWords[word.Key] = existingCount + word.Value;
                            }
                        }
                    }

                    // One more page fetched successfully.
                    Interlocked.Increment(ref _getedPage);
                    // Presumably a plain collection shared by all workers — guarded
                    // the same way as NewWords so concurrent Adds cannot corrupt it.
                    lock (GetedUrls)
                    {
                        GetedUrls.Add(url.Url);
                    }
                    MainForm.BeginInvoke(new Action(SucceedCrawle));
                }
                else
                {
                    // One more page failed.
                    Interlocked.Increment(ref _failedPage);
                    // Guarded for the same reason as GetedUrls above.
                    lock (FailedUrls)
                    {
                        FailedUrls.Add(url.Url);
                    }
                    MainForm.BeginInvoke(new Action(FailCrawle));
                }

                // If every page has been processed, kick off post-processing on the UI thread.
                // NOTE(review): assuming GetedPage/FailedPage expose the counters
                // incremented above, two workers finishing at nearly the same time
                // can both observe equality and both schedule PostCrawlerProcess —
                // confirm that it tolerates being invoked more than once.
                if ((GetedPage + FailedPage) == TotalPages)
                {
                    MainForm.BeginInvoke(new Action(PostCrawlerProcess));
                }
            }
        }
        /// <summary>
        /// Crawls outward from <c>InitialUrl</c>. URLs are taken from the pending
        /// queue, downloaded via <c>DownLoad</c>, and each finished page is either
        /// parsed with <c>Parse</c> or, when a <c>Downloaded</c> handler sets
        /// <c>OverrideUrlParse</c>, expanded from the handler-supplied
        /// <c>NextUrls</c>. Raises <c>BeforeDownload</c>, <c>Downloading</c>,
        /// <c>Downloaded</c> and finally <c>CrawlerCompleted</c>.
        /// </summary>
        /// <returns>
        /// A task that completes after all started downloads have finished and
        /// <c>CrawlerCompleted</c> has been raised.
        /// </returns>
        public async Task Crawl()
        {
            List<Task> downloadTask = new List<Task>();

            pendingUrls.Enqueue(InitialUrl);

            while (pendingUrls.Count > 0)
            {
                // DownloadingUrls is mutated under urlLock, so read it under the same lock.
                int inFlight;
                lock (urlLock)
                {
                    inFlight = DownloadingUrls.Count;
                }

                // Stop scheduling once the completed + in-flight total exceeds MaxCount.
                if (CompletedUrls.Count + inFlight > MaxCount)
                {
                    break;
                }

                // Limit concurrency. This check runs BEFORE dequeuing: dequeuing first
                // and then backing off would silently drop the URL.
                if (MaxParallel > 0 && inFlight >= MaxParallel)
                {
                    await Task.Delay(100);

                    continue;
                }

                if (!pendingUrls.TryDequeue(out var current))
                {
                    continue;
                }

                // Give subscribers a chance to veto this download.
                var downEvent = new BeforeDownloadEventArgs()
                {
                    Url       = current,
                    Cancelled = false
                };
                BeforeDownload?.Invoke(this, downEvent);
                if (!downEvent.Cancelled)
                {
                    lock (urlLock)
                    {
                        DownloadingUrls.Add(current);
                    }
                    var down = DownLoad(current, nameIndex++.ToString())
                               .ContinueWith(finished =>
                    {
                        // A cancelled download has no Result either, so treat it as a failure.
                        bool failed = finished.IsFaulted || finished.IsCanceled;

                        // Record the outcome before leaving the downloading set so the
                        // URL is always present in at least one tracking collection.
                        if (failed)
                        {
                            FailedUrls.Add(current);
                        }
                        else
                        {
                            CompletedUrls.Add(current);
                        }
                        lock (urlLock)
                        {
                            DownloadingUrls.Remove(current);
                        }
                        if (failed)
                        {
                            return;
                        }

                        var ev = new DownloadedEventArgs()
                        {
                            Html             = finished.Result,
                            OverrideUrlParse = false,
                            Url = current
                        };
                        Downloaded?.Invoke(this, ev);
                        // A handler may override the default URL parsing.
                        if (ev.OverrideUrlParse && ev.NextUrls != null)
                        {
                            foreach (var next in ev.NextUrls)
                            {
                                AddPending(next);
                            }
                        }
                        else
                        {
                            // Parse the page and enqueue the links it contains.
                            Parse(finished.Result, current);
                        }
                    });
                    downloadTask.Add(down);
                    Downloading?.Invoke(this, new DownloadingEventArgs()
                    {
                        Url = current
                    });
                }

                // Queue drained: in-flight downloads may still enqueue new links, so
                // keep waiting until one does or nothing is left running. The Count
                // guard also avoids Task.WhenAny throwing on an empty list.
                while (pendingUrls.Count <= 0 && downloadTask.Count > 0)
                {
                    await Task.WhenAny(downloadTask);

                    downloadTask.RemoveAll(t => t.IsCompleted);
                }
            }

            await Task.WhenAll(downloadTask);

            this.CrawlerCompleted?.Invoke(this, new CrawlerCompletedEventArgs()
            {
                FailedCount  = FailedUrls.Count,
                SuccessCount = CompletedUrls.Count
            });
        }