/// <summary>
/// Persists a related-search keyword document. When no document with the same
/// <c>_id</c> exists it is inserted; otherwise the stored <c>Times</c> counter
/// of the existing document is incremented by one.
/// </summary>
/// <param name="baiduCommend">Document to save; a <c>null</c> value is ignored.</param>
public void saveBaiduKeyword(IW2S_BaiduCommend baiduCommend)
{
    if (baiduCommend == null)
    {
        return;
    }

    var col = MongoDBHelper.Instance.GetIW2S_BaiduCommends();
    var builder = Builders<IW2S_BaiduCommend>.Filter;

    // Only the id is needed to decide between insert and update.
    var exists_obj = col.Find(builder.Eq(x => x._id, baiduCommend._id))
                        .Project(x => new IDIntDto { Id = x._id })
                        .FirstOrDefault();

    if (exists_obj == null || exists_obj.Id == MongoDB.Bson.ObjectId.Empty)
    {
        col.InsertOne(baiduCommend);
        log("SUCCESS saving keywords {0} for {1}".FormatStr(baiduCommend.CommendKeyword, baiduCommend.Keyword));
        return;
    }

    // $inc is applied atomically by the server, so concurrent callers cannot
    // overwrite each other's increment the way a read-then-$set sequence can.
    var update = new UpdateDocument { { "$inc", new QueryDocument { { "Times", 1 } } } };
    col.UpdateOne(new QueryDocument { { "_id", baiduCommend._id } }, update);
}
/// <summary>
/// Builds the Baidu search URL for the task's <c>CommendKeyword</c>.
/// The keyword is passed through <c>RemoveSpace()</c> and <c>GetLower()</c>
/// and then URL-encoded as UTF-8.
/// </summary>
/// <param name="tsk">Task whose <c>CommendKeyword</c> is searched.</param>
/// <returns>The search URL, or <c>string.Empty</c> when the normalised keyword is null or empty.</returns>
string get_urls(IW2S_BaiduCommend tsk)
{
    const string baiduUrlFormat = "http://www.baidu.com/s?ie=utf-8&wd={0}";

    string normalized = tsk.CommendKeyword.RemoveSpace().GetLower();
    if (string.IsNullOrEmpty(normalized))
    {
        return string.Empty;
    }

    string encoded = normalized.GetUrlEncodedString("utf-8");
    return baiduUrlFormat.FormatStr(encoded);
}
/// <summary>
/// Downloads a Baidu result page, extracts the anchors between the
/// "相关搜索" marker and the <c>id="page"</c> marker, and saves every anchor
/// whose text contains (but is not equal to) the task keyword as an
/// <c>IW2S_BaiduCommend</c>.
/// </summary>
/// <param name="link">URL of the page to fetch; nothing is fetched when null or empty.</param>
/// <param name="tsk">Keyword task that the extracted suggestions belong to.</param>
/// <param name="height">Recursion depth. Currently unused: the recursive follow-up of suggestion links is disabled.</param>
void GetLinks(string link, IW2S_BaiduKeyword tsk, int height)
{
    // Titles below are lower-cased with spaces removed, so the keyword must be
    // normalised the same way or keywords containing spaces can never match.
    string searchKeyword = tsk.Keyword.GetLower().RemoveSpace();

    if (string.IsNullOrEmpty(link))
    {
        return;
    }

    // An empty keyword would match every title via Contains(""), so there is nothing meaningful to collect.
    if (string.IsNullOrEmpty(searchKeyword))
    {
        return;
    }

    log(link);
    var html = get_html(link);
    if (html == null)
    {
        return;
    }

    var tags = html.SubAfter("相关搜索</div>").SubBefore("id=\"page\"").SplitWith("<a");
    if (tags == null || tags.Length == 0)
    {
        log("BLOCKED " + tsk.Keyword);
        return;
    }

    foreach (var a in tags)
    {
        string title = a.GetTxtFromHtml2().RemoveSpace().GetLower();
        string href = a.GetFirstHref2();

        // Skip anchors without text or target, the keyword itself, and unrelated suggestions.
        if (string.IsNullOrEmpty(title)
            || string.IsNullOrWhiteSpace(href)
            || title == searchKeyword
            || !title.Contains(searchKeyword))
        {
            continue;
        }

        IW2S_BaiduCommend baiduCommend = new IW2S_BaiduCommend
        {
            // The id is derived from task id + title, so the same suggestion maps to the same document.
            _id = "{0}{1}".FormatStr(tsk._id, title).ToObjectId(),
            CommendKeyword = title,
            CreatedAt = DateTime.UtcNow.AddHours(8),
            Keyword = tsk.Keyword,
            KeywordId = tsk._id,
            UsrId = tsk.UsrId,
            BotIntervalHours = 7 * 24,
            ProjectId = tsk.ProjectId
        };
        saveBaiduKeyword(baiduCommend);

        // Recursive crawling of "https://www.baidu.com" + href (while height < 1) is intentionally disabled.
    }
}
/// <summary>
/// Runs a Baidu search for the task's keyword and processes the result pages.
/// Any exception raised while crawling is logged (message and stack trace) and swallowed.
/// </summary>
/// <param name="tsk">Task describing the keyword to search.</param>
/// <param name="excludedKeywords">Keywords handed on to the link-processing step.</param>
public void Query(IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords)
{
    var link = get_urls(tsk);

    try
    {
        GetLinks(link, tsk, excludedKeywords);
    }
    catch (Exception ex)
    {
        log(ex.Message + ex.StackTrace);
    }
}
/// <summary>
/// Pre-processes the given links via <c>prehandle_data</c> and inserts the remaining
/// ones into the level-1 link collection in pages of 100.
/// </summary>
/// <param name="links">Links to save; may be null or empty.</param>
/// <param name="tsk">Task the links were collected for (used for logging and pre-processing).</param>
/// <param name="excludedKeywords">Keywords passed on to <c>prehandle_data</c>.</param>
public void save_level1_links(List<IW2S_level1link> links, IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords)
{
    links = prehandle_data(links, tsk, excludedKeywords);
    if (links == null || links.Count == 0)
    {
        log("SUCCESS saving 0 Level 1 Links for " + tsk.CommendKeyword);
        return;
    }

    const int pagesize = 100;
    var col = MongoDBHelper.Instance.GetIW2S_level1links();

    for (int page = 0; page * pagesize < links.Count; page++)
    {
        var list = links.Skip(page * pagesize).Take(pagesize).ToList();

        // De-duplicate by BizId within the page.
        // NOTE(review): the lookup of BizIds already stored in the collection is disabled,
        // so links saved by an earlier call are not filtered out here.
        list = ListDistinctBy(list, x => x.BizId);
        if (list == null || list.Count == 0)
        {
            continue;
        }

        // Insert only the current page; inserting the whole list here would
        // write every link once per page.
        col.InsertMany(list);
        log("SUCCESS saving " + list.Count + " Level 1 Links for " + tsk.CommendKeyword);
    }
}
/// <summary>
/// Loads the exclude-keyword list plus the keyword filters of the task's user and project,
/// then runs the Baidu query for the task. Any exception is logged (message only) and swallowed.
/// </summary>
/// <param name="p">Task to process.</param>
private void query(IW2S_BaiduCommend p)
{
    try
    {
        var builder = Builders<IW2S_ExcludeKeyword>.Filter;

        // NOTE(review): every exclude keyword is loaded (builder.Empty). A per-user filter on
        // UsrId used to be built here but was never applied - confirm whether a global list is intended.
        var excludedKeywords = MongoDBHelper.Instance.GetIW2S_ExcludeKeywords().Find(builder.Empty).ToList();
        log("loaded {0} excluding keywords ".FormatStr(excludedKeywords == null ? 0 : excludedKeywords.Count));

        if (excludedKeywords.GetCount() > 0)
        {
            foreach (var excluded in excludedKeywords)
            {
                // A document without a keyword must not abort the whole task.
                if (excluded.Keyword != null)
                {
                    excluded.Keyword = excluded.Keyword.ToLower();
                }
            }
        }

        // Keyword filters defined for this user and project are treated as additional exclude keywords.
        var filterbuilder = Builders<IW2S_KeywordFilter>.Filter;
        var filterfilter = filterbuilder.Eq(x => x.UsrId, p.UsrId) & filterbuilder.Eq(x => x.ProjectId, p.ProjectId);
        var filterKeywords = MongoDBHelper.Instance.GetIW2S_KeywordFilters()
                                          .Find(filterfilter)
                                          .Project(x => new IW2S_ExcludeKeyword { Keyword = x.Keyword })
                                          .ToList();
        excludedKeywords.AddRange(filterKeywords);

        Queries.IW2SBaiduQuery baidu = new Queries.IW2SBaiduQuery(p.Keyword);
        baidu.Query(p, excludedKeywords);
    }
    catch (Exception ex)
    {
        log(ex.Message);
    }
}
/// <summary>
/// Crawls Baidu result pages starting at <paramref name="link"/>. For every promoted entry
/// and every organic result it extracts title, link, abstract and domain and hands them to
/// <c>HanleTagData</c>. Follows the "next page" link; at most 4 pages are fetched per call.
/// </summary>
/// <param name="link">URL of the first result page; the loop does not run when null or empty.</param>
/// <param name="tsk">Task whose <c>CommendKeyword</c> is being searched.</param>
/// <param name="excludedKeywords">Keywords passed through to <c>HanleTagData</c>.</param>
void GetLinks(string link, IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords)
{
    IW2SBotMng botmng = IW2SBotMng.Instance;
    string[] searchKeywords = tsk.CommendKeyword.GetLower().Trim().Split(';');
    List<KeywordScore> patterns = new List<KeywordScore>();
    patterns.Add(new KeywordScore { Keyword = tsk.CommendKeyword });

    int nohist_pages = 0;
    int quried_pages = 0;
    int rank = 1;

    // quried_pages runs from 0 to 3, i.e. at most 4 result pages are fetched.
    while (!string.IsNullOrEmpty(link) && quried_pages <= 3)
    {
        log(link);
        var html = get_html(link);
        if (html == null)
        {
            break;
        }

        var contentLeft = html.SubAfter("content_left");

        // Baidu promoted links: blocks introduced by div id="400 (with or without the space).
        var propContents = new List<string>();
        foreach (string marker in new[] { "div id=\"400", "divid=\"400" })
        {
            var afterMarker = contentLeft.SubAfter(marker);
            if (string.IsNullOrEmpty(afterMarker))
            {
                continue;
            }

            var promotedParts = afterMarker.SubBefore("c-container").SplitWith(marker);
            if (promotedParts != null)
            {
                propContents = promotedParts.ToList();
            }
            break;
        }

        foreach (var tag in propContents)
        {
            var a = tag.SubAfter("h3").SubAfter("a");
            string title = a.SubBefore("</h3>").GetTxtFromHtml2();
            if (!string.IsNullOrEmpty(title))
            {
                title = title.Trim();
            }
            string href = a.GetFirstHref2();
            string abs = tag.SubAfter("</h3>").SubBefore("</a").GetTxtFromHtml2();
            // Trim only when there is text; the previous inverted check dereferenced null.
            if (!string.IsNullOrEmpty(abs))
            {
                abs = abs.Trim();
            }
            string domain = string.Empty;

            // Skip entries that have neither a title nor an abstract.
            string txt = "{0}{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }
            HanleTagData(tsk, excludedKeywords, botmng, searchKeywords, patterns, title, href, abs, ref domain, tag, true, rank);
        }

        // Organic results.
        var tags = contentLeft.SplitWith("c-container");
        if (tags == null || tags.Length == 0)
        {
            log("BLOCKED " + tsk.CommendKeyword);
            break;
        }

        bool nohit = true;
        foreach (string tag in tags)
        {
            var a = tag.SubAfter("h3").SubAfter("a");
            string title = a.SubBefore("</h3>").GetTxtFromHtml2();
            if (!string.IsNullOrEmpty(title))
            {
                title = title.Trim();
            }
            string href = a.GetFirstHref2();
            string abs = tag.SubAfter("abstract").SubBefore("</div").GetTxtFromHtml2();
            if (!string.IsNullOrEmpty(abs))
            {
                abs = abs.Trim();
            }
            string domain = tag.SubLastStringAfter("\"f13").SubBefore("</span").GetTxtFromHtml2();
            domain = GetDomain(domain);

            // Skip entries that have neither a title nor an abstract.
            string txt = "{0}{1}".FormatStr(title, abs);
            if (string.IsNullOrEmpty(txt))
            {
                continue;
            }
            HanleTagData(tsk, excludedKeywords, botmng, searchKeywords, patterns, title, href, abs, ref domain, tag, false, rank);
            nohit = false;
            nohist_pages = 0;
            rank++;
        }

        if (nohit)
        {
            nohist_pages++;
        }
        // Stop once more than 3 consecutive pages produced no result.
        if (nohist_pages > 3)
        {
            break;
        }

        quried_pages++;
        pages++;

        // Follow the "next page" anchor; relative links are made absolute.
        link = html.SubAfter("fk fk_cur").SubBefore("下一页").GetLastHref2();
        if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
        {
            if (link.IsStartWith("/"))
            {
                link = link.SubAfter("/");
            }
            link = "http://www.baidu.com/".GetContact(link);
        }
    }
}
// Matches dates such as 2016-1-31, 2016/01/31 or 2016年1月31日.
private static readonly Regex PublishDateRegex = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

/// <summary>
/// Computes the score for a non-empty set of matched patterns: the highest score
/// (missing scores count as 50) plus a tenth of every score, minus the largest tenth.
/// </summary>
/// <param name="matches">Matched patterns; must contain at least one element.</param>
/// <param name="appType"><c>BizType</c> of the highest-scored pattern whose <c>BizType</c> is above 0, or 0 when none.</param>
private static int ScoreMatches(List<KeywordScore> matches, out byte appType)
{
    appType = matches.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();

    int score = matches.Max(x => x.Score ?? 50);
    score += matches.Sum(x => (x.Score ?? 50) / 10);
    score -= matches.Max(x => (x.Score ?? 50) / 10);
    return score;
}

/// <summary>
/// Processes one search-result entry: fetches the target page, builds keyword abstracts
/// from it, scores the entry, looks for a publish date, and saves the result as an
/// <c>IW2S_level1link</c> through <paramref name="botmng"/>. Nothing is saved when the
/// target page could not be downloaded.
/// </summary>
/// <param name="tsk">Task the result belongs to.</param>
/// <param name="excludedKeywords">Keywords handed on to <c>save_level1_links</c>.</param>
/// <param name="botmng">Manager used to save the link.</param>
/// <param name="searchKeywords">Lower-cased search keywords used for title/abstract matching.</param>
/// <param name="patterns">Keyword patterns searched for in the target page.</param>
/// <param name="title">Result title.</param>
/// <param name="href">Result link as found on the result page.</param>
/// <param name="abs">Result abstract as found on the result page.</param>
/// <param name="domain">Domain of the result; filled from the resolved URL when empty.</param>
/// <param name="tag">Raw HTML fragment of the result entry.</param>
/// <param name="isMarket">Value stored in <c>IsMarket</c> (true for promoted entries).</param>
/// <param name="rank">Value stored in <c>Rank</c>.</param>
private void HanleTagData(IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords, IW2SBotMng botmng, string[] searchKeywords, List<KeywordScore> patterns, string title, string href, string abs, ref string domain, string tag, bool isMarket, int rank)
{
    int maxScore = 0;
    string realUrl = null, detailHtml = null, abstracts = null;
    byte appType = 0;

    // The icon class in the fragment determines the stored BaiduVStar value (1-3).
    int? baiduVStar = null;
    if (tag.Contains("c-icon-v1"))
    {
        baiduVStar = 1;
    }
    else if (tag.Contains("c-icon-v2"))
    {
        baiduVStar = 2;
    }
    else if (tag.Contains("c-icon-v3"))
    {
        baiduVStar = 3;
    }

    if (!string.IsNullOrWhiteSpace(href))
    {
        // Item1 is used as the resolved URL, Item2 as the page HTML.
        var tuplehtml = get_htmlUrl(href);
        if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
        {
            realUrl = tuplehtml.Item1;
        }
        if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
        {
            detailHtml = tuplehtml.Item2;
        }
        if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
        {
            domain = GetDomain(realUrl);
        }
    }

    // Script-based redirect page: follow its first href once.
    if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
    {
        var gourl = detailHtml.GetFirstHref2();
        if (!string.IsNullOrEmpty(gourl))
        {
            var tuplehtml = get_htmlUrl(gourl);
            if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
            {
                realUrl = tuplehtml.Item1;
            }
            if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
            {
                detailHtml = tuplehtml.Item2;
            }
            if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
            {
                domain = GetDomain(realUrl);
            }
        }
    }

    if (string.IsNullOrEmpty(realUrl))
    {
        realUrl = href;
    }

    // Without page content there is nothing to analyse or save.
    if (string.IsNullOrEmpty(detailHtml))
    {
        return;
    }

    List<KeywordScore> matchpatterns = new List<KeywordScore>();
    StringBuilder sbabstracts = new StringBuilder();
    List<string> abstractlist = new List<string>();
    StringBuilder sbabstractlist = new StringBuilder();

    foreach (KeywordScore pattern in patterns)
    {
        string[] splitDetailHtmls = detailHtml.SplitWith(pattern.Keyword);
        if (splitDetailHtmls == null)
        {
            continue;
        }
        if (splitDetailHtmls.Length > 1)
        {
            matchpatterns.Add(pattern);
        }

        StringBuilder sbpatternStr = new StringBuilder();
        for (int i = 0; i < splitDetailHtmls.Length - 1; i++)
        {
            string splitDetailHtml1 = splitDetailHtmls[i];
            // NOTE(review): for the last occurrence the text after the keyword is left empty
            // (Length - 2 bound) - confirm whether Length - 1 was intended.
            string splitDetailHtml2 = i < splitDetailHtmls.Length - 2 ? splitDetailHtmls[i + 1] : "";

            // Walk backwards from the keyword to the preceding delimiter; a delimiter
            // preceded by a split_num_commas character does not end the walk.
            for (int j = splitDetailHtml1.Length - 1; j >= 0; j--)
            {
                if (split_bef_commas.Contains(splitDetailHtml1[j]) && j - 1 >= 0 && !split_num_commas.Contains(splitDetailHtml1[j - 1]))
                {
                    break;
                }
                sbpatternStr.Append(splitDetailHtml1[j]);
            }
            // The characters were collected in reverse order; restore reading order.
            for (int q = sbpatternStr.Length - 1; q >= 0; q--)
            {
                sbabstracts.Append(sbpatternStr[q]);
            }
            sbabstracts.Append(pattern.Keyword);
            sbpatternStr.Clear();

            // Walk forwards from the keyword to the following delimiter.
            for (int j = 0; j < splitDetailHtml2.Length; j++)
            {
                if (split_aft_commas.Contains(splitDetailHtml2[j]) && j + 1 < splitDetailHtml2.Length && !split_num_commas.Contains(splitDetailHtml2[j + 1]))
                {
                    break;
                }
                sbpatternStr.Append(splitDetailHtml2[j]);
            }
            sbabstracts.Append(sbpatternStr);
            sbpatternStr.Clear();

            string tmpsbabstracts = sbabstracts.ToString();
            tmpsbabstracts = IW2SBaiduQuery.RemoveInivalidChar(tmpsbabstracts.GetTxtFromHtml2().RemoveSpace().GetLower());
            if (!abstractlist.Contains(tmpsbabstracts))
            {
                abstractlist.Add(tmpsbabstracts);
                sbabstractlist.Append(tmpsbabstracts).Append(" ");
            }
            sbabstracts.Clear();
        }
    }

    // Abstract assembled from the page content.
    abstracts = sbabstractlist.ToString();
    if (!string.IsNullOrEmpty(abstracts) && matchpatterns.Count > 0)
    {
        maxScore = ScoreMatches(matchpatterns, out appType);
    }

    // Fall back to the result-page abstract when the page itself produced no abstract.
    if (string.IsNullOrEmpty(abstracts) && !string.IsNullOrEmpty(abs))
    {
        matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
        if (matchpatterns.Count > 0)
        {
            maxScore = ScoreMatches(matchpatterns, out appType);
        }
    }

    if (maxScore > 100)
    {
        maxScore = 100;
    }

    bool is_title_matched = title.GetLower().IsContains2(searchKeywords);
    bool is_abstr_matched = abs.IsContains2(searchKeywords);
    BaiduItemPart part = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract
                       : is_title_matched ? BaiduItemPart.Title
                       : is_abstr_matched ? BaiduItemPart.Abstract
                       : BaiduItemPart.None;

    // Use the first date that is followed by '<' before any '>' or '"'; this is meant to
    // pick dates in body text rather than inside markup or scripts.
    string time = "";
    foreach (Match x in PublishDateRegex.Matches(detailHtml))
    {
        if (string.IsNullOrEmpty(x.Value))
        {
            continue;
        }

        var txt = detailHtml.SubAfter(x.Value);
        var index1 = txt.IndexOf('<');
        var index2 = txt.IndexOf('>');
        var index3 = txt.IndexOf('\"');
        if (index1 < index2 && index1 < index3)
        {
            time = x.Value;
            break;
        }
    }

    IW2S_level1link l1 = new IW2S_level1link
    {
        UsrId = tsk.UsrId,
        Domain = domain,
        TopDomain = GetLevel1Domain(domain),
        Keywords = string.Format("{0}", tsk.CommendKeyword),
        LinkUrl = realUrl,
        MatchAt = (byte)part,
        Html = detailHtml,
        AppType = appType,
        BizId = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
        SearchkeywordId = tsk._id.ToString(),
        CreatedAt = DateTime.UtcNow.AddHours(8),
        Description = abs,
        Title = title,
        Score = maxScore,
        Abstract = abstracts,
        IsMarket = isMarket,
        ProjectId = tsk.ProjectId,
        PublishTime = time,
        AlternateFields = "0",
        Rank = rank
    };
    if (baiduVStar.HasValue)
    {
        l1.BaiduVStar = baiduVStar.Value;
    }

    botmng.save_level1_links(new List<IW2S_level1link> { l1 }, tsk, excludedKeywords);
}
/// <summary>
/// Prepares links for saving: removes duplicates by <c>LinkUrl</c>, runs <c>cleaning</c>
/// on every link, and drops the links whose <c>DataCleanStatus</c> is <c>Excluded</c>.
/// </summary>
/// <param name="links">Links to process; returned unchanged when null or empty.</param>
/// <param name="tsk">Task the links belong to (used for logging).</param>
/// <param name="excludedKeywords">Keywords passed to <c>cleaning</c>.</param>
/// <returns>The remaining links, or the input itself when it was null or empty.</returns>
public List<IW2S_level1link> prehandle_data(List<IW2S_level1link> links, IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords)
{
    bool nothingToHandle = links == null || links.Count == 0;
    if (nothingToHandle)
    {
        log("BLOCKED " + tsk.CommendKeyword);
        return links;
    }

    links = links.DistinctBy(x => x.LinkUrl);
    log(links.Count + " Level 1 Links for " + tsk.CommendKeyword);

    // A former custom-scoring step based on per-item fingerprints is disabled and was left out here.

    foreach (var item in links)
    {
        cleaning(item, excludedKeywords);
    }

    var kept = links.Where(x => x.DataCleanStatus != (byte)DataCleanStatus.Excluded);
    return kept.ToList();
}