コード例 #1
0
        /// <summary>
        /// Runs the whole pipeline for a single URL: fetch the page, extract its main
        /// content, segment the text into words and collect the keywords.
        /// </summary>
        /// <remarks>
        /// Fetching (including any retry behaviour) is delegated to
        /// <c>HtmlHelper.GetHtml</c>; this method only selects the rule to apply.
        /// When no rule can be found, <c>null</c> is passed to <c>GetHtml</c>.
        /// </remarks>
        /// <param name="url">Absolute URL to process, e.g. "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="isFilterStopWords">
        /// Whether stop-word filtering is enabled; forwarded unchanged to
        /// <c>KwHelper.KeywordSegmentor</c>. Per the original note, client-side callers
        /// are expected to pass <c>false</c> so that every word is shown.
        /// </param>
        /// <param name="jWordSegmentor">Word segmentor forwarded to <c>KwHelper.KeywordSegmentor</c>; usually a shared global instance.</param>
        /// <param name="wrcDic">URL rules and content-extraction rules, looked up by the host name of <paramref name="url"/>.</param>
        /// <returns>
        /// The dictionary produced by <c>KwHelper.KeywordSegmentor</c>. The original
        /// documentation describes it as the collected keywords ordered by weight.
        /// </returns>
        public static Dictionary<string, ulong> SetupSingleUrl(
            string url,
            bool isFilterStopWords,
            JWordSegmentor jWordSegmentor,
            Dictionary<string, WebRuleCollection> wrcDic)
        {
            Uri uri = new Uri(url);
            string ruleName = WebRule.GetRuleFormUrl(uri);

            // Primary lookup: the rule collection registered for this host, then the
            // rule whose name was derived from the URL.
            WebRule webrule = null;
            WebRuleCollection wrc;
            if (wrcDic.TryGetValue(uri.Host, out wrc))
            {
                webrule = wrc[ruleName];
            }

            HtmlHelper htmlhelper = new HtmlHelper(url);

            // Fallback scan, used only when the primary lookup produced nothing.
            // TryGetValue always overwrites its out argument (with null on a miss), so a
            // scratch local is used; writing straight into 'webrule' would throw away
            // the rule found above.
            // NOTE(review): the scan looks each collection up by host name rather than
            // by ruleName, exactly as the original code did - confirm this is intended.
            if (webrule == null)
            {
                foreach (var item in wrcDic)
                {
                    WebRule candidate;
                    if (item.Value.TryGetValue(uri.Host, out candidate))
                    {
                        webrule = candidate;
                        break;
                    }
                }
            }

            // Fetch the page and extract the main content, then segment it into keywords.
            htmlhelper.GetHtml(webrule);
            return KwHelper.KeywordSegmentor(htmlhelper, isFilterStopWords, jWordSegmentor);
        }
コード例 #2
0
        // Builds every host's WebRuleCollection from the host and rule tables and
        // registers each one in _WebRuleCollections under its site name.
        private void GetWebRuleCollection()
        {
            Dictionary<long, string> siteNameByHostId = new Dictionary<long, string>();

            // Parameter states and content-extraction rules, both looked up by rule ID below.
            Dictionary<long, List<ParamState>> paramsByRuleId = GetParamStateDic();
            Dictionary<long, List<ExtractRule>> extractRulesByRuleId = GetRuleExtractRuleDic();

            // Step 1: one (initially empty) rule collection per host, keyed by host ID.
            // Column 0 is read as the host ID and column 1 as the site name.
            Dictionary<long, WebRuleCollection> collectionByHostId = new Dictionary<long, WebRuleCollection>();
            foreach (DataRow hostRow in GetAllHosts().Rows)
            {
                WebRuleCollection collection = new WebRuleCollection(hostRow[1].ToString())
                {
                    ID = (long)hostRow[0],
                    SiteName = hostRow[1].ToString()
                };
                collectionByHostId.Add(collection.ID, collection);
                siteNameByHostId.Add(collection.ID, collection.SiteName);
            }

            // Step 2: create each URL rule and attach it to its owning host's collection.
            // Column 0 is read as the rule ID, column 1 as the host ID, column 3 as the rule name.
            foreach (DataRow ruleRow in GetAllRules().Rows)
            {
                long owningHostId = (long)ruleRow[1];

                WebRule rule = new WebRule
                {
                    ID = (long)ruleRow[0],
                    RuleName = ruleRow[3].ToString(),
                    RuleState = WebRuleState.None
                };

                // Step 3: attach the parameter states recorded for this rule, if any.
                List<ParamState> ruleParams;
                if (paramsByRuleId.TryGetValue(rule.ID, out ruleParams))
                {
                    foreach (ParamState paramState in ruleParams)
                    {
                        rule.AddParam(paramState);
                    }
                }

                // Step 4: attach the content-extraction rules recorded for this rule, if any.
                List<ExtractRule> ruleExtractors;
                if (extractRulesByRuleId.TryGetValue(rule.ID, out ruleExtractors))
                {
                    foreach (ExtractRule extractor in ruleExtractors)
                    {
                        rule.ExtractRuleCollection.Add(extractor);
                    }
                }

                collectionByHostId[owningHostId].Add(rule);
            }

            // Publish the finished collections under their site names.
            foreach (KeyValuePair<long, string> host in siteNameByHostId)
            {
                this._WebRuleCollections.Add(host.Value, collectionByHostId[host.Key]);
            }
        }
コード例 #3
0
        // Produces the page content for the given text.
        // With no rule, the text is returned unchanged; otherwise every extraction
        // rule of the rule set is applied to the text and the results are joined,
        // one per line.
        protected virtual string GetContent(string code, WebRule webrule)
        {
            if (webrule != null)
            {
                StringBuilder extracted = new StringBuilder();
                foreach (ExtractRule extractor in webrule.ExtractRuleCollection)
                {
                    string piece = extractor.Extract(code);
                    extracted.AppendLine(piece);
                }
                return extracted.ToString();
            }

            return code;
        }
コード例 #4
0
        /// <summary>
        /// Downloads the page at <c>this.Uri</c>, parses it and fills this instance with
        /// the extracted pieces: <c>Title</c>, <c>Keyword</c>, <c>Description</c>, link and
        /// image attribute collections, emphasised text, headings H1-H6 and <c>Content</c>.
        /// Per the original note, this approach runs relatively slowly but was quick to develop.
        /// </summary>
        /// <remarks>
        /// Pages without a <c>head</c> element leave <c>Title</c>, <c>Keyword</c> and
        /// <c>Description</c> untouched; pages without a <c>body</c> element are processed
        /// using the whole document as the body.
        /// </remarks>
        /// <param name="webrule">Site rule forwarded to <c>GetContent</c>; may be null.</param>
        /// <returns>The extracted content, identical to <c>this.Content</c> after the call.</returns>
        private string GetHtml(WebRule webrule)
        {
            // Fetch the page.
            // NOTE(review): 5 and 30 * 1000 are presumably the retry count and a timeout
            // in milliseconds - confirm against GetHtmlCodeFromUrl. A null result is not
            // handled here - confirm whether the helper can return one.
            WebPage webpage = Jeelu.Utility.Web.GetHtmlCodeFromUrl(this.Uri.AbsoluteUri, 5, 30 * 1000);

            // Remove style and script tags before parsing.
            webpage.HtmlCode = ClearTag(webpage.HtmlCode, "style");
            webpage.HtmlCode = ClearTag(webpage.HtmlCode, "script");

            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlDocument();
            htmlDoc.OptionOutputAsXml = true;
            htmlDoc.LoadHtml(webpage.HtmlCode);

            // SelectSingleNode yields null when the element is absent (fragments,
            // malformed pages), so neither node may be dereferenced unchecked.
            HtmlNode headElement = htmlDoc.DocumentNode.SelectSingleNode("//head");
            HtmlNode bodyElement = htmlDoc.DocumentNode.SelectSingleNode("//body");
            if (bodyElement == null)
            {
                // No explicit body: treat the whole document as the body so links,
                // headings and text are still collected.
                bodyElement = htmlDoc.DocumentNode;
            }

            if (headElement != null)
            {
                // Title.
                this.Title = GetTagElementString(headElement, "title");

                // Keywords and description from the meta elements directly under head.
                HtmlNodeCollection metaNodes = headElement.SelectNodes("meta");
                if (metaNodes != null)
                {
                    foreach (HtmlNode node in metaNodes)
                    {
                        string content = node.GetAttributeValue("content", "");
                        if (string.IsNullOrEmpty(content))
                        {
                            continue;
                        }

                        // Invariant lower-casing: culture-sensitive ToLower() maps 'I' to a
                        // dotless 'i' under e.g. tr-TR, so "DESCRIPTION" would not match.
                        switch (node.GetAttributeValue("name", "").ToLowerInvariant())
                        {
                            case "keywords":
                                this.Keyword += content + ',';
                                break;

                            case "description":
                                this.Description += content + ',';
                                break;

                            default:
                                break;
                        }
                    }
                }
            }

            // All link targets on the page (per the original note, of little use in the
            // Jeelu.Billboard project at present).
            this.HrefCollection = GetTagElementValue(bodyElement, "a", "href").ToArray();

            // Alt attributes of all images and of all links.
            this.ImageAltCollection = GetTagElementValue(bodyElement, "img", "alt").ToArray();
            this.LinkAltCollection = GetTagElementValue(bodyElement, "a", "alt").ToArray();

            // Emphasised text: strong elements followed by b elements.
            List<string> bList = GetTagElementValue(bodyElement, "strong", "");
            bList.AddRange(GetTagElementValue(bodyElement, "b", "").ToArray());
            this.BCollection = bList.ToArray();

            // Headings of every level.
            this.H1Collection = GetTagElementValue(bodyElement, "h1", "").ToArray();
            this.H2Collection = GetTagElementValue(bodyElement, "h2", "").ToArray();
            this.H3Collection = GetTagElementValue(bodyElement, "h3", "").ToArray();
            this.H4Collection = GetTagElementValue(bodyElement, "h4", "").ToArray();
            this.H5Collection = GetTagElementValue(bodyElement, "h5", "").ToArray();
            this.H6Collection = GetTagElementValue(bodyElement, "h6", "").ToArray();

            // Overall text content, optionally narrowed by the rule's extraction rules.
            this.Content = this.GetContent(bodyElement.InnerText, webrule);

            return this.Content;
        }