Пример #1
0
        /// <summary>
        /// Runs the complete pipeline for a single URL: fetch the page, extract its content,
        /// segment the text into words, and aggregate the words into a weighted keyword table.
        /// </summary>
        /// <remarks>
        /// The original summary states that fetching is retried 5 times by default with a 50 ms
        /// wait between attempts. No retry logic is visible in this method, so that behavior
        /// presumably lives in <c>HtmlHelper.GetHtml</c> (TODO confirm).
        /// </remarks>
        /// <param name="url">Absolute URL to visit, e.g. "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="isFilterStopWords">Whether stop-word filtering is enabled. Per the original note, client-side callers should not filter stop words so that every word is shown.</param>
        /// <param name="jWordSegmentor">Word segmentation object; usually a single global static instance.</param>
        /// <param name="wrcDic">URL rules and content-extraction rules; searched using the host of <paramref name="url"/>.</param>
        /// <returns>The keyword-to-weight dictionary produced by <c>KwHelper.KeywordSegmentor</c>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="wrcDic"/> is null.</exception>
        public static Dictionary <string, ulong> SetupSingleUrl(
            string url,
            bool isFilterStopWords,
            JWordSegmentor jWordSegmentor,
            Dictionary <string, WebRuleCollection> wrcDic)
        {
            if (wrcDic == null)
            {
                throw new ArgumentNullException("wrcDic");
            }

            Uri uri = new Uri(url);

            // First lookup: the collection registered under this host, indexed by the
            // rule name derived from the URL.
            string            ruleName = WebRule.GetRuleFormUrl(uri);
            WebRuleCollection wrc;
            WebRule           webrule = null;

            if (wrcDic.TryGetValue(uri.Host, out wrc))
            {
                // NOTE(review): the indexer's behavior for an unknown rule name is not
                // visible from here (it may throw or return null) - confirm.
                webrule = wrc[ruleName];
            }

            HtmlHelper htmlhelper = new HtmlHelper(url);

            // Second lookup: the first collection that has an entry keyed by the host wins.
            // A separate out-local is used so that a miss does not wipe the rule found
            // above; previously "out webrule" reset it on every failed probe.
            foreach (var item in wrcDic)
            {
                WebRule hostRule;
                if (item.Value.TryGetValue(uri.Host, out hostRule))
                {
                    webrule = hostRule;
                    break;
                }
            }

            // Fetch the page and extract its content (webrule may still be null here).
            htmlhelper.GetHtml(webrule);
            return(KwHelper.KeywordSegmentor(htmlhelper, isFilterStopWords, jWordSegmentor));
        }
 /// <summary>
 /// Returns the shared <c>JWordSegmentor</c> instance, creating and initializing it on the
 /// first call. Once an instance exists, later calls return it and ignore their arguments.
 /// </summary>
 /// <remarks>
 /// No synchronization is performed here. The original comment asks whether this object
 /// could be built as a true singleton.
 /// </remarks>
 /// <param name="configPath">Directory that contains the "Jeelu.WordSegmentor.segcfg" configuration file.</param>
 /// <param name="dataFullPath">Dictionary data directory; assigned to <c>DictPath</c> and used as the parent of the "channel" and "sitestop" folders.</param>
 /// <returns>The shared <c>JWordSegmentor</c> (described in the original as Jeelu's derived WordSeg object).</returns>
 public static JWordSegmentor Creator(string configPath, string dataFullPath)
 {
     if (_JWordSegmentor != null)
     {
         return(_JWordSegmentor);
     }

     // Build into a local first: if loading the config or the dictionaries throws, the
     // static field stays null and the next call retries, instead of handing out a
     // half-initialized instance forever.
     JWordSegmentor segmentor = new JWordSegmentor();
     segmentor.LoadConfig(Path.Combine(configPath, "Jeelu.WordSegmentor.segcfg"));
     segmentor.DictPath             = dataFullPath;
     segmentor.DictionaryCollection = JDictionaryCollection.Creator(
         Path.Combine(dataFullPath, "channel"), JDictionaryTypeEnum.SiteDictionary);
     segmentor.DictionaryStopCollection = JDictionaryCollection.Creator(
         Path.Combine(dataFullPath, "sitestop"), JDictionaryTypeEnum.SiteStopDictionary);
     segmentor.LoadDict();

     // Publish only after initialization has completed successfully.
     _JWordSegmentor = segmentor;
     return(segmentor);
 }
Пример #3
0
        /// <summary>
        /// Builds the keyword table for a page: every text source exposed by
        /// <paramref name="htmlhelper"/> is segmented into words and merged into one dictionary
        /// through <c>SetKeywordDic</c>, each source with its own weight argument.
        /// </summary>
        /// <remarks>
        /// Weight passed per source: Content 1, Title 10, Keyword 3, Description 2,
        /// BCollection 8, H1Collection 7, H2Collection 7, H3Collection 5, H4Collection 4,
        /// H5Collection 3, H6Collection 2, ImageAltCollection 4, LinkAltCollection 6.
        /// As a side effect, <c>FilterStopWords</c> on <paramref name="jWordSegmentor"/> is
        /// set to <paramref name="isFilterStopWords"/>.
        /// </remarks>
        /// <param name="htmlhelper">Page helper whose text properties and collections are read.</param>
        /// <param name="isFilterStopWords">Value assigned to the segmentor's <c>FilterStopWords</c> before segmenting.</param>
        /// <param name="jWordSegmentor">Segmentor used for every source.</param>
        /// <returns>The dictionary returned by the last <c>SetKeywordDic</c> call.</returns>
        internal static Dictionary <string, ulong> KeywordSegmentor(HtmlHelper htmlhelper, bool isFilterStopWords, JWordSegmentor jWordSegmentor)
        {
            jWordSegmentor.FilterStopWords = isFilterStopWords;

            Dictionary <string, ulong> weighted = new Dictionary <string, ulong>();

            // Single-string sources.
            weighted = SetKeywordDic(jWordSegmentor.Segment(htmlhelper.Content).ToArray(), weighted, 1);
            weighted = SetKeywordDic(jWordSegmentor.Segment(htmlhelper.Title).ToArray(), weighted, 10);
            weighted = SetKeywordDic(jWordSegmentor.Segment(htmlhelper.Keyword).ToArray(), weighted, 3);
            weighted = SetKeywordDic(jWordSegmentor.Segment(htmlhelper.Description).ToArray(), weighted, 2);

            // Collection sources: each collection is joined line by line, then segmented once.
            weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.BCollection), weighted, 8);
            weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H1Collection), weighted, 7);
            weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H2Collection), weighted, 7);
            weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H3Collection), weighted, 5);
            weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H4Collection), weighted, 4);
            weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H5Collection), weighted, 3);
            weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.H6Collection), weighted, 2);
            weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.ImageAltCollection), weighted, 4);
            weighted = SetKeywordDic(SegmentJoinedLines(jWordSegmentor, htmlhelper.LinkAltCollection), weighted, 6);

            return(weighted);
        }

        /// <summary>
        /// Appends every entry of <paramref name="lines"/> to a buffer, one per line, and
        /// returns the segmentor's output for the joined text as an array.
        /// </summary>
        /// <param name="jWordSegmentor">Segmentor applied to the joined text.</param>
        /// <param name="lines">Entries to join. Typed as the non-generic enumerable because the helper's collection types are not visible here; entries are assumed to be strings.</param>
        /// <returns>The segmented words of the joined text.</returns>
        private static string[] SegmentJoinedLines(JWordSegmentor jWordSegmentor, System.Collections.IEnumerable lines)
        {
            StringBuilder joined = new StringBuilder();

            foreach (string line in lines)
            {
                joined.AppendLine(line);
            }

            return(jWordSegmentor.Segment(joined.ToString()).ToArray());
        }