// Matches the URL scheme text that is removed from the domain.
private static readonly Regex UrlSchemePrefixPattern = new Regex("http://|https://");

// Matches dates such as 2020-1-2, 2020/01/02 or 2020年1月2日.
private static readonly Regex PublishDatePattern = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

/// <summary>
/// Parses one search-result entry: follows its link to the landing page, derives the domain,
/// keyword-match and publish-date metadata, and saves the result through <c>SaveLink</c>.
/// Nothing is saved when no landing-page HTML could be fetched.
/// </summary>
/// <param name="tsk">Keyword information; its <c>Keyword</c> and <c>_id</c> are copied onto the saved link.</param>
/// <param name="excludedDomains">List of excluded domains (not read by this method).</param>
/// <param name="searchKeywords">Search keywords matched against the title and the description.</param>
/// <param name="title">Result title.</param>
/// <param name="href">Result link.</param>
/// <param name="description">Result description.</param>
/// <param name="domain">
/// Domain of the result. When empty on entry it is filled from the resolved URL; any
/// "http://" / "https://" text is stripped from it.
/// </param>
/// <param name="tag">HTML source of the search-result entry.</param>
/// <param name="isMarket">Whether the result is a promoted link.</param>
private void HanleTagData(Dnl_Keyword tsk, List<Dnl_IgnoreDomain> excludedDomains, string searchKeywords, string title, string href, string description, ref string domain, string tag, bool isMarket)
{
    string realUrl = null;    // resolved (real) URL of the landing page
    string detailHtml = null; // HTML source of the landing page

    // Baidu "blue V" verification level, if the entry carries one of the icons.
    int? baiduVStar = DetectBaiduVStar(tag);

    // Fetch the landing page to obtain the real URL, its HTML and (if missing) the domain.
    if (!string.IsNullOrWhiteSpace(href))
    {
        FollowResultLink(href, ref realUrl, ref detailHtml, ref domain);
    }

    // If the fetched page is itself a script-driven redirect page, follow its first link too.
    if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
    {
        var gourl = detailHtml.GetFirstHref2();
        if (!string.IsNullOrEmpty(gourl))
        {
            FollowResultLink(gourl, ref realUrl, ref detailHtml, ref domain);
        }
    }

    // Strip the scheme prefix and look up the domain's collection (indexed page) count.
    long collectionNum = 0;
    if (!string.IsNullOrEmpty(domain))
    {
        domain = UrlSchemePrefixPattern.Replace(domain, "");
        collectionNum = GetDomainCollectionNum(domain);
    }

    if (string.IsNullOrEmpty(realUrl))
    {
        realUrl = href;
    }

    // Without page HTML there is nothing to analyse or store.
    if (string.IsNullOrEmpty(detailHtml))
    {
        return;
    }

    string content = GetMainContentHelper.GetMainContent(detailHtml); // main text of the page
    bool isTitleMatched = title.IsContains2(searchKeywords);          // title contains the keywords
    bool isDescMatched = description.IsContains2(searchKeywords);     // description contains the keywords

    BaiduItemPart part;
    if (isTitleMatched && isDescMatched)
    {
        part = BaiduItemPart.TitleAbstract;
    }
    else if (isTitleMatched)
    {
        part = BaiduItemPart.Title;
    }
    else if (isDescMatched)
    {
        part = BaiduItemPart.Abstract;
    }
    else
    {
        part = BaiduItemPart.None;
    }

    // Publish time: prefer the time fragment of the search-result entry; only when the entry
    // has no such fragment is the landing page scanned for a date.
    string time;
    string timeStr = string.IsNullOrEmpty(tag)
        ? null
        : tag.SubAfter("newTimeFactor_before_abs").SubBefore("</span>");
    if (!string.IsNullOrEmpty(timeStr))
    {
        time = PublishDatePattern.Match(timeStr).Value;
    }
    else
    {
        time = FindPublishDateInBody(detailHtml);
    }

    // Build the link record.
    Dnl_Link_Baidu link = new Dnl_Link_Baidu
    {
        Domain = domain,
        TopDomain = GetLevel1Domain(domain),
        Keywords = tsk.Keyword,
        LinkUrl = realUrl,
        MatchAt = (byte)part,
        Html = detailHtml,
        SearchkeywordId = tsk._id.ToString(),
        CreatedAt = DateTime.UtcNow.AddHours(8), // UTC shifted by +8 hours
        Description = description,
        Title = title,
        IsPromotion = isMarket,
        PublishTime = time,
        Content = content,
        DCNum = collectionNum
    };
    if (baiduVStar.HasValue)
    {
        link.BaiduVStar = baiduVStar.Value;
    }

    SaveLink(link, tsk);
}

// Returns the Baidu V level (1-3) indicated by the icon class in the entry HTML, or null when absent.
private static int? DetectBaiduVStar(string tag)
{
    if (string.IsNullOrEmpty(tag))
    {
        return null;
    }
    if (tag.Contains("c-icon-v1"))
    {
        return 1;
    }
    if (tag.Contains("c-icon-v2"))
    {
        return 2;
    }
    if (tag.Contains("c-icon-v3"))
    {
        return 3;
    }
    return null;
}

// Fetches url via get_htmlUrl and overwrites realUrl / detailHtml with any non-empty values returned;
// fills domain from the resolved URL when domain is still empty.
private void FollowResultLink(string url, ref string realUrl, ref string detailHtml, ref string domain)
{
    var fetched = get_htmlUrl(url);
    if (fetched != null)
    {
        if (!string.IsNullOrEmpty(fetched.Item1))
        {
            realUrl = fetched.Item1;
        }
        if (!string.IsNullOrEmpty(fetched.Item2))
        {
            detailHtml = fetched.Item2;
        }
    }

    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
    {
        domain = GetDomain(realUrl);
    }
}

// Returns the first date in html that sits in text content rather than inside a tag, attribute
// or script: the next '<' after the date must come before the next '>' and the next '"'.
// Returns an empty string when no such date exists.
private static string FindPublishDateInBody(string html)
{
    foreach (Match candidate in PublishDatePattern.Matches(html))
    {
        // Inspect the text following THIS occurrence, not the first occurrence of the same date text.
        int after = candidate.Index + candidate.Length;
        int nextOpen = html.IndexOf('<', after);
        int nextClose = html.IndexOf('>', after);
        int nextQuote = html.IndexOf('"', after);

        // IndexOf returns -1 for "not found"; a missing '>' or '"' cannot precede the '<'.
        bool openBeforeClose = nextClose < 0 || nextOpen < nextClose;
        bool openBeforeQuote = nextQuote < 0 || nextOpen < nextQuote;
        if (nextOpen >= 0 && openBeforeClose && openBeforeQuote)
        {
            return candidate.Value;
        }
    }
    return "";
}
/// <summary>
/// Downloads the page at <paramref name="Url"/>, extracts its main content with
/// <c>GetMainContentHelper.GetMainContent</c> and writes that content as UTF-8 to
/// "&lt;BaseDir&gt;\&lt;title&gt;.html", where the file name is derived from the page title.
/// Returns without writing when the URL is empty, the page has no title, or fewer than
/// 300 characters of content were extracted. Any exception is logged, not propagated.
/// </summary>
/// <param name="Url">Address of the page; "http://" is prepended when it is not an absolute URI.</param>
private void GetBaiduUrlContent(string Url)
{
    try
    {
        if (string.IsNullOrEmpty(Url))
        {
            return;
        }
        Log4Helper.Write(LogLevel.Debug, Url);

        // Normalise scheme-less addresses once, so that every download uses the same address.
        string address = Url;
        Uri parsed;
        if (!Uri.TryCreate(Url, UriKind.Absolute, out parsed))
        {
            address = "http://" + Url;
        }

        // First pass: decode as GBK, then look for the charset the page declares.
        Encoding gbk = Encoding.GetEncoding("gbk");
        string input = DownloadPageText(address, gbk);

        Match charsetMatch = Regex.Match(input, "charset=\"?(.+?)\"");
        if (charsetMatch.Success)
        {
            // Re-download only when the declared charset is known and really differs from GBK;
            // a missing or unrecognised charset keeps the GBK-decoded text instead of failing.
            Encoding declared = TryResolveEncoding(charsetMatch.Groups[1].Value);
            if (declared != null && declared.CodePage != gbk.CodePage)
            {
                input = DownloadPageText(address, declared);
            }
        }

        Match titleMatch = Regex.Match(input, "<title>(.+?)</title>", RegexOptions.IgnoreCase);
        if (!titleMatch.Success)
        {
            return;
        }
        string fileName = BuildFileNameFromTitle(titleMatch.Groups[1].Value) + ".html";

        string content = GetMainContentHelper.GetMainContent(input);
        if (content == null || content.Length < 300)
        {
            Log4Helper.Write(LogLevel.Error, "未能提取到正文线程号");
            return;
        }

        string path = this.BaseDir;
        if (!Directory.Exists(path))
        {
            Directory.CreateDirectory(path);
        }

        using (FileStream stream = new FileStream(Path.Combine(path, fileName), FileMode.Create))
        using (StreamWriter writer = new StreamWriter(stream, Encoding.GetEncoding("utf-8")))
        {
            writer.Write(content);
        }
    }
    catch (Exception ex)
    {
        // Best effort: a failing page is logged and skipped.
        Log4Helper.Write(LogLevel.Error, ex);
    }
}

// Downloads address as text decoded with the given encoding; the client is always disposed.
private static string DownloadPageText(string address, Encoding encoding)
{
    using (WebClient client = new WebClient { Encoding = encoding })
    {
        client.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");
        return client.DownloadString(address);
    }
}

// Returns the encoding registered under name, or null when the name is blank or not recognised.
private static Encoding TryResolveEncoding(string name)
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return null;
    }
    try
    {
        return Encoding.GetEncoding(name.Trim());
    }
    catch (ArgumentException)
    {
        return null;
    }
}

// Builds a file name from a page title: keeps the part before the first '_' and then before the
// first '-', drops whitespace, slashes, '.', ',' and '、', and replaces '*' and any other
// character that is invalid in a file name with '_'.
private static string BuildFileNameFromTitle(string rawTitle)
{
    string head = rawTitle.Split(new char[] { '_' })[0].Split(new char[] { '-' })[0];
    char[] invalidChars = Path.GetInvalidFileNameChars();
    StringBuilder name = new StringBuilder(head.Length);
    foreach (char c in head)
    {
        if (char.IsWhiteSpace(c) || c == '\\' || c == '/' || c == '.' || c == ',' || c == '、')
        {
            continue;
        }
        if (c == '*' || Array.IndexOf(invalidChars, c) >= 0)
        {
            name.Append('_');
        }
        else
        {
            name.Append(c);
        }
    }
    return name.ToString();
}