/// <summary>
/// Runs the full pipeline for a single URL: fetch the page, extract its body text,
/// segment it into words and collect the keywords (term-frequency statistics).
/// </summary>
/// <remarks>
/// The extraction rule is resolved by host name and then by the rule name derived from
/// the URL. Only when that lookup yields nothing are the collections scanned for an
/// entry keyed by the host name. When no rule is found, <c>null</c> is passed on and
/// the page is processed without site-specific extraction rules.
/// NOTE(review): the original comment claimed "5 fetch attempts, 50 ms wait each";
/// the actual retry policy lives in HtmlHelper.GetHtml - confirm there.
/// </remarks>
/// <param name="url">The URL to visit, e.g. "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="isFilterStopWords">Whether stop words are filtered out. Per the original
/// comment, client-side callers should not filter so that every word is shown.</param>
/// <param name="jWordSegmentor">The word segmentor; per the original comment this is
/// usually one shared static instance.</param>
/// <param name="wrcDic">URL rules and body-extraction rules, looked up by host name.</param>
/// <returns>The keyword dictionary produced by <c>KwHelper.KeywordSegmentor</c>
/// (the original comment describes it as keywords ordered by weight).</returns>
public static Dictionary<string, ulong> SetupSingleUrl(
    string url,
    bool isFilterStopWords,
    JWordSegmentor jWordSegmentor,
    Dictionary<string, WebRuleCollection> wrcDic)
{
    Uri uri = new Uri(url);
    string ruleName = WebRule.GetRuleFormUrl(uri);

    // Primary lookup: host name -> rule collection -> rule name.
    WebRule webrule = null;
    WebRuleCollection wrc;
    if (wrcDic.TryGetValue(uri.Host, out wrc))
    {
        webrule = wrc[ruleName];
    }

    // Fallback only. Previously this scan always ran and used webrule itself as the
    // out argument, so every miss reset it to null and the primary result was lost.
    if (webrule == null)
    {
        foreach (var item in wrcDic)
        {
            WebRule candidate;
            if (item.Value.TryGetValue(uri.Host, out candidate))
            {
                webrule = candidate;
                break;
            }
        }
    }

    // Fetch the page and extract its body text.
    HtmlHelper htmlhelper = new HtmlHelper(url);
    htmlhelper.GetHtml(webrule);

    return KwHelper.KeywordSegmentor(htmlhelper, isFilterStopWords, jWordSegmentor);
}
/// <summary>
/// Loads all hosts and URL rules and registers one <c>WebRuleCollection</c> per host
/// in <c>_WebRuleCollections</c>, keyed by the host's site name.
/// </summary>
private void GetWebRuleCollection()
{
    // Parameter states and body-extraction rules; both are looked up by rule ID below.
    Dictionary<long, List<ParamState>> paramsByRuleId = GetParamStateDic();
    Dictionary<long, List<ExtractRule>> extractRulesByRuleId = GetRuleExtractRuleDic();

    // 1. One rule collection per host, keyed by host ID, plus host ID -> site name.
    var collectionsByHostId = new Dictionary<long, WebRuleCollection>();
    var siteNamesByHostId = new Dictionary<long, string>();
    foreach (DataRow hostRow in GetAllHosts().Rows)
    {
        string siteName = hostRow[1].ToString();
        var collection = new WebRuleCollection(siteName)
        {
            ID = (long)hostRow[0],
            SiteName = siteName
        };
        collectionsByHostId.Add(collection.ID, collection);
        siteNamesByHostId.Add(collection.ID, collection.SiteName);
    }

    // 2. Attach every URL rule to the collection of its host.
    foreach (DataRow ruleRow in GetAllRules().Rows)
    {
        long hostId = (long)ruleRow[1];
        var rule = new WebRule
        {
            ID = (long)ruleRow[0],
            RuleName = ruleRow[3].ToString(),
            RuleState = WebRuleState.None
        };

        // 3. Parameters registered for this rule, if any.
        List<ParamState> ruleParams;
        if (paramsByRuleId.TryGetValue(rule.ID, out ruleParams))
        {
            foreach (ParamState param in ruleParams)
            {
                rule.AddParam(param);
            }
        }

        // 4. Body-extraction rules registered for this rule, if any.
        List<ExtractRule> ruleExtractors;
        if (extractRulesByRuleId.TryGetValue(rule.ID, out ruleExtractors))
        {
            foreach (ExtractRule extractor in ruleExtractors)
            {
                rule.ExtractRuleCollection.Add(extractor);
            }
        }

        collectionsByHostId[hostId].Add(rule);
    }

    // Publish the collections under their site names.
    foreach (var host in siteNamesByHostId)
    {
        this._WebRuleCollections.Add(host.Value, collectionsByHostId[host.Key]);
    }
}
/// <summary>
/// Applies every extract rule of <paramref name="webrule"/> to <paramref name="code"/>
/// and joins the results, one per line.
/// </summary>
/// <param name="code">The text to extract from.</param>
/// <param name="webrule">The rule whose <c>ExtractRuleCollection</c> is applied; may be null.</param>
/// <returns>
/// <paramref name="code"/> unchanged when <paramref name="webrule"/> is null; otherwise
/// the output of each extract rule, each followed by a line terminator.
/// </returns>
protected virtual string GetContent(string code, WebRule webrule)
{
    // Without a rule there is nothing to apply: hand the input back untouched.
    if (webrule == null)
    {
        return code;
    }

    StringBuilder extracted = new StringBuilder();
    foreach (ExtractRule rule in webrule.ExtractRuleCollection)
    {
        string fragment = rule.Extract(code);
        extracted.Append(fragment);
        extracted.AppendLine();
    }

    return extracted.ToString();
}
/// <summary>
/// Downloads the page at <c>this.Uri</c>, parses it and fills this instance's
/// <c>Title</c>, <c>Keyword</c>, <c>Description</c>, the link / image / emphasis /
/// heading collections and <c>Content</c>.
/// Per the original comment: slow to run, but quick to develop.
/// </summary>
/// <param name="webrule">Site rule handed to <c>GetContent</c> together with the page's
/// inner text; may be null.</param>
/// <returns>The extracted content, which is also stored in <c>Content</c>.</returns>
private string GetHtml(WebRule webrule)
{
    // Fetch the page.
    // NOTE(review): 5 and 30 * 1000 look like a retry count and a timeout in ms - confirm
    // against Jeelu.Utility.Web.GetHtmlCodeFromUrl. A null result is not handled here.
    WebPage webpage = Jeelu.Utility.Web.GetHtmlCodeFromUrl(this.Uri.AbsoluteUri, 5, 30 * 1000);

    // Drop the style and script tags before parsing.
    webpage.HtmlCode = ClearTag(webpage.HtmlCode, "style");
    webpage.HtmlCode = ClearTag(webpage.HtmlCode, "script");

    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.OptionOutputAsXml = true;
    htmlDoc.LoadHtml(webpage.HtmlCode);

    // SelectSingleNode returns null when the document has no <head> / <body>
    // (fragments, malformed pages). Fall back to the document node instead of
    // dereferencing null further down.
    HtmlNode documentNode = htmlDoc.DocumentNode;
    HtmlNode headElement = documentNode.SelectSingleNode("//head");
    HtmlNode bodyElement = documentNode.SelectSingleNode("//body") ?? documentNode;

    // Title
    this.Title = GetTagElementString(headElement ?? documentNode, "title");

    // Keyword / description meta tags: direct children of <head>, or anywhere in
    // the document when there is no <head>.
    HtmlNodeCollection metaNodes = headElement != null
        ? headElement.SelectNodes("meta")
        : documentNode.SelectNodes("//meta");
    if (metaNodes != null)
    {
        foreach (HtmlNode node in metaNodes)
        {
            string content = node.GetAttributeValue("content", "");
            if (string.IsNullOrEmpty(content))
            {
                continue;
            }

            // Invariant lower-casing: culture-sensitive ToLower() maps 'I' to a
            // dotless i under the Turkish culture, so "KEYWORDS" / "DESCRIPTION"
            // would never match.
            switch (node.GetAttributeValue("name", "").ToLowerInvariant())
            {
                case "keywords":
                    this.Keyword += content + ',';
                    break;
                case "description":
                    this.Description += content + ',';
                    break;
                default:
                    break;
            }
        }
    }

    // All links on the page (per the original comment, of little use in Jeelu.Billboard for now).
    this.HrefCollection = GetTagElementValue(bodyElement, "a", "href").ToArray();
    // Alt attribute of every image.
    this.ImageAltCollection = GetTagElementValue(bodyElement, "img", "alt").ToArray();
    // Alt attribute of every link.
    this.LinkAltCollection = GetTagElementValue(bodyElement, "a", "alt").ToArray();

    // Emphasised text: <strong> first, then <b>.
    List<string> bList = GetTagElementValue(bodyElement, "strong", "");
    bList.AddRange(GetTagElementValue(bodyElement, "b", ""));
    this.BCollection = bList.ToArray();

    // Headings, level 1 to 6.
    this.H1Collection = GetTagElementValue(bodyElement, "h1", "").ToArray();
    this.H2Collection = GetTagElementValue(bodyElement, "h2", "").ToArray();
    this.H3Collection = GetTagElementValue(bodyElement, "h3", "").ToArray();
    this.H4Collection = GetTagElementValue(bodyElement, "h4", "").ToArray();
    this.H5Collection = GetTagElementValue(bodyElement, "h5", "").ToArray();
    this.H6Collection = GetTagElementValue(bodyElement, "h6", "").ToArray();

    // The overall text, passed through the rule's extract rules by GetContent.
    this.Content = this.GetContent(bodyElement.InnerText, webrule);
    return this.Content;
}