/// <summary>
/// Runs a Baidu search for a single keyword.
/// </summary>
/// <param name="p">Keyword record to search for; its <c>Keyword</c> text seeds the search query object.</param>
/// <remarks>
/// Exceptions are not propagated: every failure is reported through <c>log</c>
/// using only the exception message.
/// </remarks>
public void query(Dnl_Keyword p)
{
    try
    {
        // Load every ignore-domain record (the empty filter places no restriction on the query).
        var ignoredDomains = MongoDBHelper.Instance
            .GetDnl_IgnoreDomain()
            .Find(Builders<Dnl_IgnoreDomain>.Filter.Empty)
            .ToList();

        try
        {
            // Search for the keyword, handing the ignore list to the query object.
            var searcher = new Queries.DnlBaiduSearchQuery(p.Keyword);
            searcher.Query(p, ignoredDomains);
        }
        catch (Exception ex)
        {
            log(ex.Message);
        }
    }
    catch (Exception ex)
    {
        log(ex.Message);
    }
}
/// <summary>
/// Builds the Baidu search URL for a keyword.
/// </summary>
/// <param name="tsk">Keyword record whose <c>Keyword</c> text is searched for.</param>
/// <returns>
/// The search URL with the trimmed, URL-encoded keyword substituted in, or
/// <see cref="string.Empty"/> when <paramref name="tsk"/> is null or its keyword is
/// null, empty or whitespace.
/// </returns>
string get_urls(Dnl_Keyword tsk)
{
    // A missing task or keyword yields "no URL" instead of a NullReferenceException.
    if (tsk == null || string.IsNullOrWhiteSpace(tsk.Keyword))
    {
        return string.Empty;
    }

    string searchKeywords = tsk.Keyword.Trim();
    const string baiduUrlFormat = "http://www.baidu.com/s?ie=utf-8&wd={0}";
    return baiduUrlFormat.FormatStr(searchKeywords.GetUrlEncodedString("utf-8"));
}
/// <summary>
/// Saves level-1 links to the level-1 link collection in batches.
/// </summary>
/// <param name="links">Links to save. Null or empty means there is nothing to do.</param>
/// <param name="tsk">Keyword record the links belong to; used only in the log message.</param>
/// <param name="excludedKeywords">Not used by this method; kept for interface compatibility.</param>
/// <remarks>
/// Each batch is de-duplicated by <c>BizId</c> before insertion. No check is made against
/// records already stored in the collection. Exceptions are logged, not propagated.
/// </remarks>
public void save_level1_links(List<IW2S_level1link> links, Dnl_Keyword tsk, List<Dnl_IgnoreDomain> excludedKeywords)
{
    try
    {
        // Nothing to save. (The original test was inverted and returned for every non-null list.)
        if (links == null || links.Count == 0)
        {
            return;
        }

        const int pagesize = 100;
        var col = MongoDBHelper.Instance.GetIW2S_level1links();

        for (int page = 0; page * pagesize < links.Count; page++)
        {
            // Take one batch and drop entries that share a BizId within it.
            var list = links.Skip(page * pagesize).Take(pagesize).ToList();
            list = ListDistinctBy(list, x => x.BizId);
            if (list == null || list.Count == 0)
            {
                continue;
            }

            // Insert only this batch; inserting the whole input per batch would duplicate records.
            col.InsertMany(list);
            log("SUCCESS saving " + list.Count + " Level 1 Links for " + tsk.Keyword);
        }
    }
    catch (Exception ex)
    {
        log(ex.Message);
        log("保存出错!");
    }
}
/// <summary>
/// Runs a Baidu search for the given keyword and processes the result pages.
/// </summary>
/// <param name="tsk">Keyword record to search for.</param>
/// <param name="excludedDomains">Ignore-domain list, forwarded unchanged to the page crawler.</param>
/// <remarks>
/// Exceptions raised while crawling are logged (message plus stack trace) and not propagated.
/// The search URL is built before the try block, so a failure there reaches the caller.
/// </remarks>
public void Query(Dnl_Keyword tsk, List<Dnl_IgnoreDomain> excludedDomains)
{
    string firstPageUrl = get_urls(tsk);

    try
    {
        GetLinks(firstPageUrl, tsk, excludedDomains);
    }
    catch (Exception ex)
    {
        log(ex.Message + ex.StackTrace);
    }
}
/// <summary>
/// Saves a link unless one with the same keyword id and URL is already stored.
/// </summary>
/// <param name="link">Link to save.</param>
/// <param name="task">Keyword record the link was found for.</param>
/// <remarks>
/// Writes a console line stating whether the link was saved or already present.
/// Exceptions from the database calls are not caught here.
/// </remarks>
public void SaveLink(Dnl_Link_Baidu link, Dnl_Keyword task)
{
    // Look for an existing record with the same keyword id and link URL.
    var builder = Builders<Dnl_Link_Baidu>.Filter;
    var filter = builder.Eq(x => x.SearchkeywordId, task._id.ToString())
                 & builder.Eq(x => x.LinkUrl, link.LinkUrl);
    var col = MongoDBHelper.Instance.GetDnl_Link_Baidu();

    bool alreadySaved = col.Find(filter).FirstOrDefault() != null;
    if (alreadySaved)
    {
        Console.WriteLine(DateTime.Now + " : " + "该链接已保存 - " + task.Keyword);
        return;
    }

    // Insert first, so success is only reported once the write has gone through.
    col.InsertOne(link);
    Console.WriteLine(DateTime.Now + " : " + "成功保存1条链接 - " + task.Keyword);
}
/// <summary>
/// Crawls Baidu result pages starting at <paramref name="link"/> and hands every
/// promoted and organic result to <c>HanleTagData</c>.
/// </summary>
/// <param name="link">URL of the first result page.</param>
/// <param name="tsk">Keyword record being searched.</param>
/// <param name="excludedDomains">Ignore-domain list, forwarded to <c>HanleTagData</c>.</param>
/// <remarks>
/// At most three pages are fetched. Crawling stops early when a page cannot be fetched,
/// when the page yields no result blocks, or when no next-page link is found.
/// </remarks>
void GetLinks(string link, Dnl_Keyword tsk, List<Dnl_IgnoreDomain> excludedDomains)
{
    string searchKeyword = tsk.Keyword.Trim();
    int nohist_pages = 0;   // consecutive pages without a usable result
    int quried_pages = 0;   // pages fetched so far

    // Fetch at most three pages.
    while (!string.IsNullOrEmpty(link) && quried_pages <= 2)
    {
        log(link);

        // Fetch the result page source.
        var html = get_html(link);
        if (html == null)
        {
            break;
        }

        // Everything after the "content_left" marker; both result kinds are parsed from it.
        var afterContentLeft = html.SubAfter("content_left");

        // Promoted results: blocks introduced by div id="400 (with or without the space).
        var propContents = new List<string>();
        if (!string.IsNullOrEmpty(afterContentLeft.SubAfter("div id=\"400")))
        {
            propContents = afterContentLeft.SubAfter("div id=\"400").SubBefore("c-container").SplitWith("div id=\"400").ToList();
        }
        else if (!string.IsNullOrEmpty(afterContentLeft.SubAfter("divid=\"400")))
        {
            propContents = afterContentLeft.SubAfter("divid=\"400").SubBefore("c-container").SplitWith("divid=\"400").ToList();
        }

        foreach (var tag in propContents)
        {
            var a = tag.SubAfter("h3").SubAfter("a");

            // Title
            string title = a.SubBefore("</h3>").GetTxtFromHtml2();
            if (!string.IsNullOrEmpty(title))
            {
                title = title.Trim();
            }

            string href = a.GetFirstHref2();

            // Description. Trim only when there is text; the original test was inverted
            // and dereferenced a null description.
            string abs = tag.SubAfter("</h3>").SubBefore("</a").GetTxtFromHtml2();
            if (!string.IsNullOrEmpty(abs))
            {
                abs = abs.Trim();
            }

            // Promoted blocks carry no domain text here; HanleTagData fills it in from the target URL.
            string domain = string.Empty;

            // Skip blocks that have neither a title nor a description.
            string txt = "{0}{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }

            HanleTagData(tsk, excludedDomains, searchKeyword, title, href, abs, ref domain, tag, true);
        }

        // Organic results: split the left column on the "c-container" marker.
        var tags = afterContentLeft.SplitWith("c-container");
        if (tags == null || tags.Length == 0)
        {
            log("BLOCKED " + tsk.Keyword);
            break;
        }

        bool nohit = true;
        foreach (string tag in tags)
        {
            var a = tag.SubAfter("h3").SubAfter("a");

            // Title
            string title = a.SubBefore("</h3>").GetTxtFromHtml2();
            if (!string.IsNullOrEmpty(title))
            {
                title = title.Trim();
            }

            // Link
            string href = a.GetFirstHref2();

            // Description (same inverted-test fix as for promoted results).
            string description = tag.SubAfter("abstract").SubBefore("</div").GetTxtFromHtml2();
            if (!string.IsNullOrEmpty(description))
            {
                description = description.Trim();
            }

            // Domain as displayed in the result block.
            string domain = tag.SubLastStringAfter("\"f13").SubBefore("</span").GetTxtFromHtml2();
            domain = GetDomain(domain);

            // Skip blocks that have neither a title nor a description.
            string txt = "{0}{1}".FormatStr(title, description);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }

            // Parse and store this result.
            HanleTagData(tsk, excludedDomains, searchKeyword, title, href, description, ref domain, tag, false);
            nohit = false;
            nohist_pages = 0;
        }

        if (nohit)
        {
            nohist_pages++;
        }

        // Stop after more than three consecutive pages without a hit.
        // With the three-page cap above this cannot trigger; kept as in the original.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;   // `pages` is declared outside this method

        // Locate the next-page link and make it absolute if necessary.
        link = html.SubAfter("fk fk_cur").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "http://www.baidu.com/".GetContact(link);
        }
    }
}
/// <summary>
/// Turns one search result into a <c>Dnl_Link_Baidu</c> record and saves it.
/// </summary>
/// <param name="tsk">Keyword record the result belongs to.</param>
/// <param name="excludedDomains">Ignore-domain list; not used by this method.</param>
/// <param name="searchKeywords">Search keyword matched against title and description.</param>
/// <param name="title">Result title.</param>
/// <param name="href">Result link as it appears on the search page.</param>
/// <param name="description">Result description.</param>
/// <param name="domain">
/// Result domain. When empty on entry it is derived from the resolved URL; any
/// <c>http://</c> / <c>https://</c> prefix is removed before the method returns.
/// </param>
/// <param name="tag">Source of the search result block.</param>
/// <param name="isMarket">True when the result is a promoted link.</param>
/// <remarks>
/// Nothing is saved when the target page source could not be fetched.
/// </remarks>
private void HanleTagData(Dnl_Keyword tsk, List<Dnl_IgnoreDomain> excludedDomains, string searchKeywords, string title, string href, string description, ref string domain, string tag, bool isMarket)
{
    string realUrl = null, detailHtml = null;   // resolved URL and page source

    // Baidu "V" badge level, taken from the icon class in the result block.
    int? baiduVStar = null;
    if (tag.Contains("c-icon-v1"))
    {
        baiduVStar = 1;
    }
    else if (tag.Contains("c-icon-v2"))
    {
        baiduVStar = 2;
    }
    else if (tag.Contains("c-icon-v3"))
    {
        baiduVStar = 3;
    }

    // Fetch the target page and its resolved URL.
    if (!string.IsNullOrWhiteSpace(href))
    {
        ResolveLinkTarget(href, ref realUrl, ref detailHtml, ref domain);
    }

    // If the fetched page is itself a script-driven jump page, follow its first link once more.
    if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
    {
        var gourl = detailHtml.GetFirstHref2();
        if (!string.IsNullOrEmpty(gourl))
        {
            ResolveLinkTarget(gourl, ref realUrl, ref detailHtml, ref domain);
        }
    }

    // Strip the scheme prefix and look up the domain's collection count.
    long collectionNum = 0;
    if (!string.IsNullOrEmpty(domain))
    {
        domain = Regex.Replace(domain, "http://|https://", "");
        collectionNum = GetDomainCollectionNum(domain);
    }

    if (string.IsNullOrEmpty(realUrl))
    {
        realUrl = href;
    }

    // Without page source there is nothing to store.
    if (string.IsNullOrEmpty(detailHtml))
    {
        return;
    }

    string content = GetMainContentHelper.GetMainContent(detailHtml);    // main text of the page
    bool is_title_matched = title.IsContains2(searchKeywords);           // keyword found in title
    bool is_desc_matched = description.IsContains2(searchKeywords);      // keyword found in description
    BaiduItemPart part = is_title_matched && is_desc_matched ? BaiduItemPart.TitleAbstract
        : is_title_matched ? BaiduItemPart.Title
        : is_desc_matched ? BaiduItemPart.Abstract
        : BaiduItemPart.None;

    // Publish date: yyyy-M-d, yyyy/M/d or the Chinese year/month/day form.
    Regex reg = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
    string time = "";

    // Prefer the date shown in the search result block itself.
    string timeStr = tag.SubAfter("newTimeFactor_before_abs").SubBefore("</span>");
    if (!string.IsNullOrEmpty(timeStr))
    {
        time = reg.Match(timeStr).Value;
    }
    else
    {
        // Otherwise take the first date in the page source that sits in text content:
        // the next delimiter after the date must be '<', not '>' (inside a tag) or '"'
        // (inside an attribute value).
        foreach (Match candidate in reg.Matches(detailHtml))
        {
            if (string.IsNullOrEmpty(candidate.Value))
            {
                continue;
            }

            // Search from this match's own end so repeated date strings are each judged in place.
            int after = candidate.Index + candidate.Length;
            int nextOpen = detailHtml.IndexOf('<', after);
            int nextClose = detailHtml.IndexOf('>', after);
            int nextQuote = detailHtml.IndexOf('"', after);

            // IndexOf returns -1 for "not found"; a missing delimiter must not count as "earlier".
            bool inBodyText = nextOpen >= 0
                && (nextClose < 0 || nextOpen < nextClose)
                && (nextQuote < 0 || nextOpen < nextQuote);
            if (inBodyText)
            {
                time = candidate.Value;
                break;
            }
        }
    }

    // Build the link record.
    Dnl_Link_Baidu link = new Dnl_Link_Baidu
    {
        Domain = domain,
        TopDomain = GetLevel1Domain(domain),
        Keywords = tsk.Keyword,
        LinkUrl = realUrl,
        MatchAt = (byte)part,
        Html = detailHtml,
        SearchkeywordId = tsk._id.ToString(),
        CreatedAt = DateTime.UtcNow.AddHours(8),
        Description = description,
        Title = title,
        IsPromotion = isMarket,
        PublishTime = time,
        Content = content,
        DCNum = collectionNum
    };
    if (baiduVStar.HasValue)
    {
        link.BaiduVStar = baiduVStar.Value;
    }

    SaveLink(link, tsk);
}

/// <summary>
/// Fetches <paramref name="url"/> and updates the resolved URL, page source and domain
/// with whatever non-empty values the fetch returned.
/// </summary>
/// <param name="url">URL to fetch.</param>
/// <param name="realUrl">Overwritten with the fetched URL when one is returned.</param>
/// <param name="detailHtml">Overwritten with the fetched page source when one is returned.</param>
/// <param name="domain">Filled from <paramref name="realUrl"/> only when still empty.</param>
private void ResolveLinkTarget(string url, ref string realUrl, ref string detailHtml, ref string domain)
{
    var fetched = get_htmlUrl(url);
    if (fetched != null)
    {
        if (!string.IsNullOrEmpty(fetched.Item1))
        {
            realUrl = fetched.Item1;
        }
        if (!string.IsNullOrEmpty(fetched.Item2))
        {
            detailHtml = fetched.Item2;
        }
    }

    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
    {
        domain = GetDomain(realUrl);
    }
}