/// <summary>
/// Runs the whole pipeline for one URL: fetch, extract the body text, segment it into
/// words and aggregate the keywords (term frequency).
/// </summary>
/// <remarks>
/// The original note states that fetching is attempted 5 times with a 50 ms wait by default;
/// that retry policy is not visible in this method.
/// </remarks>
/// <param name="url">URL to visit, e.g. "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="isFilterStopWords">Whether stop-word filtering is enabled. Client-side callers should leave it off so that every word is shown.</param>
/// <param name="jWordSegmentor">Word segmentor; normally a single global static instance.</param>
/// <param name="wrcDic">URL rules and body-extraction rules.</param>
/// <returns>The keyword-to-weight map returned by KwHelper.KeywordSegmentor.</returns>
public static Dictionary<string, ulong> SetupSingleUrl(
    string url,
    bool isFilterStopWords,
    JWordSegmentor jWordSegmentor,
    Dictionary<string, WebRuleCollection> wrcDic)
{
    Uri target = new Uri(url);
    string ruleName = WebRule.GetRuleFormUrl(target);
    string host = target.Host;

    // NOTE(review): the rule picked here is always replaced by the scan further down,
    // because that scan passes the same variable as an out argument. The two lookups
    // also disagree about what the keys are (host -> rule name here, host inside each
    // collection below). Confirm which one is intended before removing either.
    WebRule rule = null;
    WebRuleCollection hostRules;
    if (wrcDic.TryGetValue(host, out hostRules))
    {
        rule = hostRules[ruleName];
    }

    HtmlHelper page = new HtmlHelper(url);

    // Take the first collection that has an entry keyed by the host name;
    // rule ends up null when none of them has one.
    foreach (WebRuleCollection candidates in wrcDic.Values)
    {
        if (candidates.TryGetValue(host, out rule))
        {
            break;
        }
    }

    // Fetch -> extract body text.
    page.GetHtml(rule);

    return KwHelper.KeywordSegmentor(page, isFilterStopWords, jWordSegmentor);
}
/// <summary>
/// Sets up the word-segmentation environment and returns the shared segmentor,
/// building and caching it on the first call.
/// </summary>
/// <remarks>
/// Later calls return the cached instance and ignore their arguments.
/// The original author left an open question here: can this object be a singleton?
/// </remarks>
/// <param name="configPath">Directory that contains "Jeelu.WordSegmentor.segcfg".</param>
/// <param name="dataFullPath">Dictionary path; assigned to DictPath, and combined with "channel" and "sitestop" to locate the site dictionaries and the site stop-word dictionaries.</param>
/// <returns>The cached <see cref="JWordSegmentor"/> instance.</returns>
public static JWordSegmentor Creator(string configPath, string dataFullPath)
{
    if (_JWordSegmentor != null)
    {
        return _JWordSegmentor;
    }

    // Build into a local first. If any load step throws, the static field stays null,
    // so the next call retries instead of handing out a partially loaded segmentor.
    JWordSegmentor segmentor = new JWordSegmentor();
    segmentor.LoadConfig(Path.Combine(configPath, "Jeelu.WordSegmentor.segcfg"));
    segmentor.DictPath = dataFullPath;
    segmentor.DictionaryCollection = JDictionaryCollection.Creator(
        Path.Combine(dataFullPath, "channel"),
        JDictionaryTypeEnum.SiteDictionary);
    segmentor.DictionaryStopCollection = JDictionaryCollection.Creator(
        Path.Combine(dataFullPath, "sitestop"),
        JDictionaryTypeEnum.SiteStopDictionary);
    segmentor.LoadDict();

    // Publish only once the instance is fully initialised.
    _JWordSegmentor = segmentor;
    return segmentor;
}
/// <summary>
/// Builds a weighted keyword table from the parts of a page held by <paramref name="htmlhelper"/>.
/// </summary>
/// <remarks>
/// Each part is segmented and handed to SetKeywordDic with a fixed weight, in this order:
/// Content 1, Title 10, Keyword 3, Description 2, BCollection 8, H1Collection 7,
/// H2Collection 7, H3Collection 5, H4Collection 4, H5Collection 3, H6Collection 2,
/// ImageAltCollection 4, LinkAltCollection 6.
/// </remarks>
/// <param name="htmlhelper">Page whose text properties and collections are read.</param>
/// <param name="isFilterStopWords">Value assigned to FilterStopWords on <paramref name="jWordSegmentor"/> before segmenting; the setting stays on the passed segmentor afterwards.</param>
/// <param name="jWordSegmentor">Segmentor used for every part.</param>
/// <returns>The dictionary returned by the last SetKeywordDic call.</returns>
internal static Dictionary<string, ulong> KeywordSegmentor(HtmlHelper htmlhelper, bool isFilterStopWords, JWordSegmentor jWordSegmentor)
{
    Dictionary<string, ulong> weighted = new Dictionary<string, ulong>();
    jWordSegmentor.FilterStopWords = isFilterStopWords;

    // Plain text parts are segmented as they are.
    weighted = SetKeywordDic(jWordSegmentor.Segment(htmlhelper.Content).ToArray(), weighted, 1);
    weighted = SetKeywordDic(jWordSegmentor.Segment(htmlhelper.Title).ToArray(), weighted, 10);
    weighted = SetKeywordDic(jWordSegmentor.Segment(htmlhelper.Keyword).ToArray(), weighted, 3);
    weighted = SetKeywordDic(jWordSegmentor.Segment(htmlhelper.Description).ToArray(), weighted, 2);

    // Collection parts are joined one entry per line, then segmented as one text.
    weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.BCollection), weighted, 8);
    weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H1Collection), weighted, 7);
    weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H2Collection), weighted, 7);
    weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H3Collection), weighted, 5);
    weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H4Collection), weighted, 4);
    weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H5Collection), weighted, 3);
    weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H6Collection), weighted, 2);
    weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.ImageAltCollection), weighted, 4);
    weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.LinkAltCollection), weighted, 6);

    return weighted;
}

/// <summary>
/// Appends every entry of <paramref name="lines"/> to a buffer, one per line, and segments the joined text.
/// </summary>
/// <param name="segmentor">Segmentor used to split the joined text.</param>
/// <param name="lines">Strings to join; each one is followed by a line terminator.</param>
/// <returns>The segments of the joined text as an array.</returns>
private static string[] SegmentJoinedLines(JWordSegmentor segmentor, System.Collections.IEnumerable lines)
{
    StringBuilder joined = new StringBuilder();
    foreach (string line in lines)
    {
        joined.AppendLine(line);
    }

    return segmentor.Segment(joined.ToString()).ToArray();
}