/// <summary>
/// Moves every word of <c>NewWords</c> that the segmenter dictionary already knows
/// into <c>ExistWords</c>.
/// </summary>
/// <remarks>
/// For each such word the amount added to <c>ExistWords</c> is the value returned by
/// <c>JWordSegmentorService.GetFrequency</c> plus the count accumulated in <c>NewWords</c>.
/// </remarks>
public void MergeExistWords()
{
    // Phase 1: collect the known words. NewWords cannot be modified while it is
    // being enumerated, so the matches are gathered in a scratch dictionary first.
    Dictionary<string, ulong> knownWords = new Dictionary<string, ulong>();
    foreach (KeyValuePair<string, ulong> candidate in NewWords)
    {
        if (!JWordSegmentorService.IsExist(candidate.Key))
        {
            continue;
        }

        knownWords.Add(candidate.Key, JWordSegmentorService.GetFrequency(candidate.Key) + candidate.Value);
    }

    // Phase 2: drop the known words from the new-word set.
    // Remove is a no-op for a key that is not present, so no prior lookup is needed.
    foreach (string key in knownWords.Keys)
    {
        NewWords.Remove(key);
    }

    // Phase 3: accumulate the known words into ExistWords.
    foreach (KeyValuePair<string, ulong> known in knownWords)
    {
        ulong previous;
        if (ExistWords.TryGetValue(known.Key, out previous))
        {
            ExistWords[known.Key] = previous + known.Value;
        }
        else
        {
            ExistWords.Add(known.Key, known.Value);
        }
    }
}
/// <summary>
/// Determines whether a word is already known, either as a collected new word,
/// as a collected existing word, or to the segmenter dictionary.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <returns>
/// <c>true</c> if <paramref name="word"/> is a key of <c>NewWords</c> or <c>ExistWords</c>,
/// or if <c>JWordSegmentorService.IsExist(word, "")</c> reports it; otherwise <c>false</c>.
/// </returns>
public bool IsWordExist(string word)
{
    // The checks run in this order and stop at the first hit, so the
    // segmenter service is only consulted when both local lookups miss.
    if (NewWords.ContainsKey(word))
    {
        return true;
    }

    if (ExistWords.ContainsKey(word))
    {
        return true;
    }

    return JWordSegmentorService.IsExist(word, "");
}
/// <summary>
/// Worker-thread body. Repeatedly takes the first URL from the shared URL list, crawls and
/// segments that page, merges the returned word counts into <c>NewWords</c> and updates the
/// success/failure bookkeeping. Returns when the URL list is empty.
/// </summary>
/// <remarks>
/// Several threads are expected to run this method at once. Three pieces of shared state are
/// therefore guarded:
/// <list type="bullet">
/// <item>the URL list, locked on the list itself while a URL is taken;</item>
/// <item><c>NewWords</c>, locked on itself while word counts are merged;</item>
/// <item>the page counters and the <c>GetedUrls</c>/<c>FailedUrls</c> collections, updated
/// under the <c>NewWords</c> lock together with the "all pages done" check, so that exactly
/// one worker observes the final total and posts <c>PostCrawlerProcess</c>.</item>
/// </list>
/// UI callbacks are posted with <c>MainForm.BeginInvoke</c> outside of any lock.
/// </remarks>
void StartCrawlePage()
{
    while (true)
    {
        // Take the next URL. The emptiness test is done only inside the lock so that
        // the list is never read while another worker is removing from it.
        UrlItem url;
        var pendingUrls = MainForm.UrlManager.Urls;
        lock (pendingUrls)
        {
            if (pendingUrls.Count == 0)
            {
                return;
            }

            url = pendingUrls[0];
            pendingUrls.Remove(url);
        }

        bool allPagesDone;

        // Crawl a single page and segment it into words.
        Dictionary<string, ulong> words = null;
        try
        {
            words = HtmlHelper.SetupSingleUrl(url.Url, false, Service.JWordSegmentor, Service.DbHelper.WebRuleCollections);
        }
        catch (Exception)
        {
            // Any failure while processing the page is recorded as a failed page.
            // The increment and the completion check happen atomically so that two
            // workers finishing at the same time cannot both see the final total.
            lock (NewWords)
            {
                Interlocked.Increment(ref _failedPage);
                FailedUrls.Add(url.Url);
                allPagesDone = (GetedPage + FailedPage) == TotalPages;
            }

            MainForm.BeginInvoke(new Action(FailCrawle));
            if (allPagesDone)
            {
                MainForm.BeginInvoke(new Action(PostCrawlerProcess));
            }

            continue;
        }

        // Merge the words returned for this page into the shared result.
        // Words already known to the dictionary are not separated here;
        // MergeExistWords moves them out of NewWords.
        if (words != null && words.Count > 0)
        {
            lock (NewWords)
            {
                foreach (KeyValuePair<string, ulong> word in words)
                {
                    ulong current;
                    if (NewWords.TryGetValue(word.Key, out current))
                    {
                        NewWords[word.Key] = current + word.Value;
                    }
                    else
                    {
                        NewWords.Add(word.Key, word.Value);
                    }
                }
            }
        }

        // Record the successfully processed page; same atomic increment-and-check
        // as on the failure path.
        lock (NewWords)
        {
            Interlocked.Increment(ref _getedPage);
            GetedUrls.Add(url.Url);
            allPagesDone = (GetedPage + FailedPage) == TotalPages;
        }

        MainForm.BeginInvoke(new Action(SucceedCrawle));
        if (allPagesDone)
        {
            MainForm.BeginInvoke(new Action(PostCrawlerProcess));
        }
    }
}