/// <summary>
/// Stores a link unless a link with the same URL has already been stored for the same keyword.
/// </summary>
/// <remarks>
/// The duplicate check and the insert are two separate database calls, so they are not atomic.
/// </remarks>
/// <param name="link">Link to store; its <c>LinkUrl</c> is used for the duplicate check.</param>
/// <param name="task">Keyword the link belongs to; its <c>_id</c> is used for the duplicate check.</param>
public void SaveLink(Dnl_Link_Baidu link, Dnl_Keyword task)
{
    // A link counts as already saved when the keyword id and the URL both match.
    var builder = Builders<Dnl_Link_Baidu>.Filter;
    var filter = builder.Eq(x => x.SearchkeywordId, task._id.ToString())
                 & builder.Eq(x => x.LinkUrl, link.LinkUrl);
    var col = MongoDBHelper.Instance.GetDnl_Link_Baidu();

    // Only existence matters here, so ask for that instead of materialising the stored
    // document (which carries the page HTML and content) just to compare it with null.
    if (col.Find(filter).Any())
    {
        Console.WriteLine(DateTime.Now + " : " + "该链接已保存 - " + task.Keyword);
        return;
    }

    // Insert first and report afterwards, so a failed insert is never logged as a success.
    col.InsertOne(link);
    Console.WriteLine(DateTime.Now + " : " + "成功保存1条链接 - " + task.Keyword);
}
/// <summary>
/// Turns one search result into a <c>Dnl_Link_Baidu</c> record and hands it to <c>SaveLink</c>.
/// The target page is fetched (following one script-driven redirect page if present), the domain
/// and its collection count are determined, title/description are matched against the search
/// keywords, and a publish date is looked up. Nothing is saved when no page source was obtained.
/// </summary>
/// <param name="tsk">Keyword information; supplies the keyword text and id stored on the link.</param>
/// <param name="excludedDomains">Excluded-domain list. Not referenced by this method.</param>
/// <param name="searchKeywords">Search keywords matched against the title and description.</param>
/// <param name="title">Result title.</param>
/// <param name="href">Result link as found in the search page.</param>
/// <param name="description">Result description.</param>
/// <param name="domain">
/// Result domain. Filled in from the resolved URL when empty, and stripped of any
/// <c>http://</c> / <c>https://</c> text; the caller sees the updated value.
/// </param>
/// <param name="tag">Source markup of the search result.</param>
/// <param name="isMarket">Whether the result is a promoted (advertising) link.</param>
private void HanleTagData(Dnl_Keyword tsk, List<Dnl_IgnoreDomain> excludedDomains, string searchKeywords, string title, string href, string description, ref string domain, string tag, bool isMarket)
{
    string realUrl = null;    // resolved (real) URL of the result
    string detailHtml = null; // source of the target page

    // Baidu "blue V" level, taken from the icon class present in the result markup.
    int? baiduVStar = null;
    if (tag.Contains("c-icon-v1"))
    {
        baiduVStar = 1;
    }
    else if (tag.Contains("c-icon-v2"))
    {
        baiduVStar = 2;
    }
    else if (tag.Contains("c-icon-v3"))
    {
        baiduVStar = 3;
    }

    // Fetch the real URL and page source behind the result link.
    if (!string.IsNullOrWhiteSpace(href))
    {
        ResolvePage(href, ref realUrl, ref detailHtml, ref domain);
    }

    // If the fetched page is itself a redirect page that clicks a link via script,
    // follow that link once and use the page behind it instead.
    if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
    {
        var gourl = detailHtml.GetFirstHref2();
        if (!string.IsNullOrEmpty(gourl))
        {
            ResolvePage(gourl, ref realUrl, ref detailHtml, ref domain);
        }
    }

    // Strip the scheme text and look up the domain's collection count.
    // Static Regex methods are used so the pattern is not rebuilt on every call.
    long collectionNum = 0;
    if (!string.IsNullOrEmpty(domain))
    {
        domain = Regex.Replace(domain, "http://|https://", "");
        collectionNum = GetDomainCollectionNum(domain);
    }

    if (string.IsNullOrEmpty(realUrl))
    {
        realUrl = href;
    }

    // Without page source there is nothing to analyse or store.
    if (string.IsNullOrEmpty(detailHtml))
    {
        return;
    }

    string content = GetMainContentHelper.GetMainContent(detailHtml); // main text of the page

    bool isTitleMatched = title.IsContains2(searchKeywords);      // title matches the keywords
    bool isDescMatched = description.IsContains2(searchKeywords); // description matches the keywords
    BaiduItemPart part = isTitleMatched && isDescMatched ? BaiduItemPart.TitleAbstract
        : isTitleMatched ? BaiduItemPart.Title
        : isDescMatched ? BaiduItemPart.Abstract
        : BaiduItemPart.None;

    string time = ExtractPublishTime(tag, detailHtml);

    // Build the link record.
    Dnl_Link_Baidu link = new Dnl_Link_Baidu
    {
        Domain = domain,
        TopDomain = GetLevel1Domain(domain),
        Keywords = tsk.Keyword,
        LinkUrl = realUrl,
        MatchAt = (byte)part,
        Html = detailHtml,
        SearchkeywordId = tsk._id.ToString(),
        CreatedAt = DateTime.UtcNow.AddHours(8), // UTC shifted by +8 hours
        Description = description,
        Title = title,
        IsPromotion = isMarket,
        PublishTime = time,
        Content = content,
        DCNum = collectionNum
    };
    if (baiduVStar.HasValue)
    {
        link.BaiduVStar = baiduVStar.Value;
    }

    SaveLink(link, tsk);
}

/// <summary>
/// Fetches <paramref name="url"/> via <c>get_htmlUrl</c> and updates the resolved URL, the page
/// source and (only when still empty) the domain. Values that come back empty leave the
/// corresponding argument untouched.
/// </summary>
private void ResolvePage(string url, ref string realUrl, ref string detailHtml, ref string domain)
{
    var fetched = get_htmlUrl(url);
    if (fetched != null)
    {
        if (!string.IsNullOrEmpty(fetched.Item1))
        {
            realUrl = fetched.Item1;
        }
        if (!string.IsNullOrEmpty(fetched.Item2))
        {
            detailHtml = fetched.Item2;
        }
    }

    // Derive the domain from the resolved URL when the caller did not supply one.
    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
    {
        domain = GetDomain(realUrl);
    }
}

/// <summary>
/// Finds a publish date (<c>20yy-m-d</c>, <c>20yy/m/d</c> or <c>20yy年m月d日</c>). The time span of the
/// search result markup is used when present; otherwise the page source is scanned for the first
/// date that sits in text content rather than inside a tag or a quoted value.
/// </summary>
/// <returns>The matched date text, or an empty string when none was found.</returns>
private static string ExtractPublishTime(string tag, string detailHtml)
{
    const string datePattern = "(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)";

    // Prefer the date shown in the search result itself.
    string timeStr = tag.SubAfter("newTimeFactor_before_abs").SubBefore("</span>");
    if (!string.IsNullOrEmpty(timeStr))
    {
        return Regex.Match(timeStr, datePattern).Value;
    }

    // Otherwise scan the page source.
    foreach (Match candidate in Regex.Matches(detailHtml, datePattern))
    {
        // Inspect what follows THIS occurrence (by position, not by searching for the
        // matched text again), so repeated dates are each judged in their own context.
        int after = candidate.Index + candidate.Length;

        int nextOpen = detailHtml.IndexOf('<', after);
        if (nextOpen < 0)
        {
            continue; // no tag follows, so the text-content test cannot be satisfied
        }

        int nextClose = detailHtml.IndexOf('>', after);
        int nextQuote = detailHtml.IndexOf('"', after);

        // Text content: the next '<' comes before any '>' or '"'. A character that does
        // not occur at all (-1) is treated as "never", not as a position before everything.
        bool openBeforeClose = nextClose < 0 || nextOpen < nextClose;
        bool openBeforeQuote = nextQuote < 0 || nextOpen < nextQuote;
        if (openBeforeClose && openBeforeQuote)
        {
            return candidate.Value;
        }
    }

    return "";
}