예제 #1
0
        /// <summary>
        /// Moves every entry of <c>NewWords</c> that the segmentor dictionary already
        /// contains out of <c>NewWords</c> and accumulates it into <c>ExistWords</c>.
        /// </summary>
        /// <remarks>
        /// The amount accumulated for a word is the value returned by
        /// <c>JWordSegmentorService.GetFrequency</c> plus the count held in <c>NewWords</c>.
        /// </remarks>
        public void MergeExistWords()
        {
            // Pass 1: collect the matches. NewWords cannot be modified while it is being
            // enumerated, so the removals are deferred to the next pass.
            var alreadyKnown = new Dictionary<string, ulong>();
            foreach (KeyValuePair<string, ulong> candidate in NewWords)
            {
                if (!JWordSegmentorService.IsExist(candidate.Key))
                {
                    continue;
                }

                ulong combined = JWordSegmentorService.GetFrequency(candidate.Key) + candidate.Value;
                alreadyKnown.Add(candidate.Key, combined);
            }

            // Pass 2: drop the matched words from the new-word table.
            foreach (KeyValuePair<string, ulong> known in alreadyKnown)
            {
                NewWords.Remove(known.Key);
            }

            // Pass 3: fold the matched words into the existing-word table.
            foreach (KeyValuePair<string, ulong> known in alreadyKnown)
            {
                if (!ExistWords.ContainsKey(known.Key))
                {
                    ExistWords.Add(known.Key, known.Value);
                    continue;
                }

                ExistWords[known.Key] += known.Value;
            }
        }
예제 #2
0
 /// <summary>
 /// Determines whether a keyword is already known: as a newly collected word, as a
 /// recorded existing word, or in the segmentor dictionary.
 /// </summary>
 /// <param name="word">The keyword to look up.</param>
 /// <returns>
 /// <c>true</c> if <c>NewWords</c> or <c>ExistWords</c> contains the key, or if
 /// <c>JWordSegmentorService.IsExist</c> reports it; otherwise <c>false</c>.
 /// </returns>
 public bool IsWordExist(string word)
 {
     // The lookups run in this order and stop at the first hit, so the segmentor
     // service is only consulted when neither in-memory table has the word.
     if (NewWords.ContainsKey(word))
     {
         return true;
     }

     if (ExistWords.ContainsKey(word))
     {
         return true;
     }

     return JWordSegmentorService.IsExist(word, "");
 }
예제 #3
0
        /// <summary>
        /// Worker-thread entry point: repeatedly takes the first pending URL from
        /// <c>MainForm.UrlManager.Urls</c>, crawls and segments that page, and merges the
        /// resulting word counts into <c>NewWords</c> until no URLs remain.
        /// </summary>
        /// <remarks>
        /// A page whose crawl throws is counted as failed and recorded in <c>FailedUrls</c>;
        /// every other page is counted as fetched and recorded in <c>GetedUrls</c>.
        /// Progress and completion callbacks are posted through <c>MainForm.BeginInvoke</c>.
        /// </remarks>
        void StartCrawlePage()
        {
            // Keep pulling URLs until the shared list is drained.
            while (MainForm.UrlManager.Urls.Count > 0)
            {
                UrlItem url;
                lock (MainForm.UrlManager.Urls)
                {
                    // Re-check under the lock: another worker may have taken the last URL
                    // between the loop test above and acquiring the lock.
                    if (MainForm.UrlManager.Urls.Count == 0)
                    {
                        return;
                    }

                    url = MainForm.UrlManager.Urls[0];
                    MainForm.UrlManager.Urls.Remove(url);
                }

                // Crawl a single page and segment it into word counts.
                Dictionary<string, ulong> words = null;
                bool crawled;
                try
                {
                    words = HtmlHelper.SetupSingleUrl(url.Url, false, Service.JWordSegmentor, Service.DbHelper.WebRuleCollections);
                    crawled = true;
                }
                catch (Exception)
                {
                    // Deliberately best-effort: the failure is recorded below and the
                    // worker moves on to the next URL instead of terminating.
                    crawled = false;
                }

                if (crawled)
                {
                    // Merge this page's words into the table shared by all workers.
                    if (words != null && words.Count > 0)
                    {
                        lock (NewWords)
                        {
                            foreach (KeyValuePair<string, ulong> word in words)
                            {
                                ulong current;
                                if (NewWords.TryGetValue(word.Key, out current))
                                {
                                    NewWords[word.Key] = current + word.Value;
                                }
                                else
                                {
                                    NewWords.Add(word.Key, word.Value);
                                }
                            }
                        }
                    }

                    // One more page fetched.
                    Interlocked.Increment(ref _getedPage);
                    // Several workers append here concurrently, so serialize the Add.
                    lock (GetedUrls)
                    {
                        GetedUrls.Add(url.Url);
                    }
                    MainForm.BeginInvoke(new Action(SucceedCrawle));
                }
                else
                {
                    // One more page failed.
                    Interlocked.Increment(ref _failedPage);
                    // Several workers append here concurrently, so serialize the Add.
                    lock (FailedUrls)
                    {
                        FailedUrls.Add(url.Url);
                    }
                    MainForm.BeginInvoke(new Action(FailCrawle));
                }

                // Once every page is accounted for, kick off post-processing.
                // NOTE(review): two workers finishing at nearly the same time can both
                // observe this condition, so PostCrawlerProcess may be posted more than
                // once - confirm it tolerates that.
                if ((GetedPage + FailedPage) == TotalPages)
                {
                    MainForm.BeginInvoke(new Action(PostCrawlerProcess));
                }
            }
        }