/// <summary>
/// Flags a link as excluded when its combined description, abstract, title
/// and URL text contains one of the supplied exclude keywords.
/// </summary>
/// <param name="x">Link to inspect; its DataCleanStatus and Keywords are updated on a hit.</param>
/// <param name="excludedKeywords">Exclude keywords to test, in order; null or empty means nothing is checked.</param>
private void cleaning(IW2S_level1link x, List<IW2S_ExcludeKeyword> excludedKeywords)
{
    bool nothingToCheck = excludedKeywords == null || excludedKeywords.Count == 0;
    if (nothingToCheck)
    {
        return;
    }

    // NOTE(review): the haystack is lower-cased but each keyword is passed as-is;
    // whether that matters depends on how IsContains compares - confirm.
    string haystack = "{0}/{1}/{2}/{3}".FormatStr(x.Description, x.Abstract, x.Title, x.LinkUrl).ToLower();

    foreach (IW2S_ExcludeKeyword candidate in excludedKeywords)
    {
        if (!haystack.IsContains(candidate.Keyword))
        {
            continue;
        }

        // Only the first matching keyword is recorded.
        x.DataCleanStatus = (byte)DataCleanStatus.Excluded;
        x.Keywords = "{0} $ExcludedByExcludeKeyword:{1}".FormatStr(x.Keywords, candidate.Keyword);
        return;
    }
}
/// <summary>
/// Flags a link as excluded when its keywords/domain text contains one of the
/// excluding keywords given as a ';' or ',' separated list.
/// </summary>
/// <param name="x">Link to inspect; its DataCleanStatus and Keywords are updated on a hit.</param>
/// <param name="keywords">Separated list of excluding keywords; null or empty means nothing is checked.</param>
private void cleaning(IW2S_level1link x, string keywords)
{
    if (string.IsNullOrEmpty(keywords))
    {
        return;
    }

    string[] kwds = keywords.GetLower().SplitWith(";", ",");
    if (kwds == null || kwds.Length == 0)
    {
        return;
    }

    // The format string previously contained a third placeholder ("{2}") with only
    // two arguments supplied. Assuming FormatStr forwards to string.Format, that
    // raises FormatException; the placeholder count now matches the arguments.
    string txt = "{0}/{1}".FormatStr(x.Keywords, x.Domain).ToLower();

    // Only the first non-empty keyword found in the text is recorded.
    string matched = kwds.FirstOrDefault(y => !string.IsNullOrEmpty(y) && txt.IsContain(y));
    if (string.IsNullOrEmpty(matched))
    {
        return;
    }

    x.DataCleanStatus = (byte)DataCleanStatus.Excluded;
    x.Keywords = "{0} $ExcludedByExcludingKeyword:{1}".FormatStr(x.Keywords, matched);
}
/// <summary>
/// Processes one search-result entry: fetches the target page, builds keyword
/// abstracts from its HTML, scores the matched patterns, looks for a publish
/// date in the page and saves the result as a level-1 link.
/// </summary>
/// <param name="tsk">Task the result belongs to; supplies user, project, keyword and id values.</param>
/// <param name="excludedKeywords">Passed through unchanged to <c>save_level1_links</c>.</param>
/// <param name="botmng">Manager used to save the resulting link.</param>
/// <param name="searchKeywords">Keywords tested against the title and the abstract to compute MatchAt.</param>
/// <param name="patterns">Scored keyword patterns searched for in the page HTML (or in <paramref name="abs"/> as a fallback).</param>
/// <param name="title">Result title.</param>
/// <param name="href">Result link; fetched when not blank.</param>
/// <param name="abs">Result abstract text; stored as Description.</param>
/// <param name="domain">Domain of the result; filled from the resolved URL when empty on entry.</param>
/// <param name="tag">Result markup checked for the "c-icon-v1/2/3" markers.</param>
/// <param name="isMarket">Stored on the saved link as IsMarket.</param>
/// <param name="rank">Stored on the saved link as Rank.</param>
private void HanleTagData(IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords, IW2SBotMng botmng, string[] searchKeywords, List<KeywordScore> patterns, string title, string href, string abs, ref string domain, string tag, bool isMarket, int rank)
{
    int maxScore = 0;
    byte appType = 0;
    string realUrl = null;
    string detailHtml = null;

    // Map the c-icon-vN marker in the tag markup to a star level (first match wins).
    int? baiduVStar = null;
    if (!string.IsNullOrEmpty(tag))
    {
        if (tag.Contains("c-icon-v1"))
        {
            baiduVStar = 1;
        }
        else if (tag.Contains("c-icon-v2"))
        {
            baiduVStar = 2;
        }
        else if (tag.Contains("c-icon-v3"))
        {
            baiduVStar = 3;
        }
    }

    if (!string.IsNullOrWhiteSpace(href))
    {
        ResolveTagPage(href, ref realUrl, ref detailHtml, ref domain);
    }

    // The fetched page is a script-driven jump page: follow its first link once.
    if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
    {
        string gourl = detailHtml.GetFirstHref2();
        if (!string.IsNullOrEmpty(gourl))
        {
            ResolveTagPage(gourl, ref realUrl, ref detailHtml, ref domain);
        }
    }

    if (string.IsNullOrEmpty(realUrl))
    {
        realUrl = href;
    }

    // Without page HTML nothing is saved.
    if (string.IsNullOrEmpty(detailHtml))
    {
        return;
    }

    // Build the abstract from the page HTML and score the patterns found there.
    List<KeywordScore> matchpatterns = new List<KeywordScore>();
    string abstracts = BuildKeywordAbstracts(detailHtml, patterns, matchpatterns);
    if (!string.IsNullOrEmpty(abstracts) && matchpatterns.Count > 0)
    {
        ApplyPatternScores(matchpatterns, ref maxScore, ref appType);
    }

    // Fallback: no abstract from the page, so match the patterns against the result abstract.
    if (string.IsNullOrEmpty(abstracts) && !string.IsNullOrEmpty(abs))
    {
        matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
        if (matchpatterns.Count > 0)
        {
            ApplyPatternScores(matchpatterns, ref maxScore, ref appType);
        }
    }

    if (maxScore > 100)
    {
        maxScore = 100;
    }

    bool is_title_matched = title.GetLower().IsContains2(searchKeywords);
    bool is_abstr_matched = abs.IsContains2(searchKeywords);
    BaiduItemPart part = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract
        : is_title_matched ? BaiduItemPart.Title
        : is_abstr_matched ? BaiduItemPart.Abstract
        : BaiduItemPart.None;

    // Use the first date that sits in body text rather than in markup, code or comments.
    string time = "";
    foreach (Match dateMatch in s_publishDateRegex.Matches(detailHtml))
    {
        if (string.IsNullOrEmpty(dateMatch.Value))
        {
            continue;
        }

        if (IsDateInBodyText(detailHtml, dateMatch))
        {
            time = dateMatch.Value;
            break;
        }
    }

    IW2S_level1link l1 = new IW2S_level1link
    {
        UsrId = tsk.UsrId,
        Domain = domain,
        TopDomain = GetLevel1Domain(domain),
        Keywords = string.Format("{0}", tsk.CommendKeyword),
        LinkUrl = realUrl,
        MatchAt = (byte)part,
        Html = detailHtml,
        AppType = appType,
        BizId = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
        SearchkeywordId = tsk._id.ToString(),
        CreatedAt = DateTime.UtcNow.AddHours(8),
        Description = abs,
        Title = title,
        Score = maxScore,
        Abstract = abstracts,
        IsMarket = isMarket,
        ProjectId = tsk.ProjectId,
        PublishTime = time,
        AlternateFields = "0",
        Rank = rank
    };
    if (baiduVStar.HasValue)
    {
        l1.BaiduVStar = baiduVStar.Value;
    }

    botmng.save_level1_links(new List<IW2S_level1link> { l1 }, tsk, excludedKeywords);
}

/// <summary>Date pattern (20yy-m-d, 20yy/m/d or 20yy年m月d日) used to look for a publish time in page HTML.</summary>
private static readonly Regex s_publishDateRegex = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

/// <summary>
/// Fetches <paramref name="url"/> via get_htmlUrl and, for each non-empty item of the result,
/// overwrites the resolved URL (Item1) and page HTML (Item2); fills the domain from the
/// resolved URL when the domain is still empty.
/// </summary>
private void ResolveTagPage(string url, ref string realUrl, ref string detailHtml, ref string domain)
{
    var page = get_htmlUrl(url);
    if (page != null)
    {
        if (!string.IsNullOrEmpty(page.Item1))
        {
            realUrl = page.Item1;
        }

        if (!string.IsNullOrEmpty(page.Item2))
        {
            detailHtml = page.Item2;
        }
    }

    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
    {
        domain = GetDomain(realUrl);
    }
}

/// <summary>
/// For every occurrence of every pattern keyword in the HTML, cuts out the surrounding
/// text up to the nearest delimiter on each side, normalizes it and joins the distinct
/// snippets with spaces. Patterns that occur at least once are added to
/// <paramref name="matchpatterns"/>.
/// </summary>
private string BuildKeywordAbstracts(string detailHtml, List<KeywordScore> patterns, List<KeywordScore> matchpatterns)
{
    StringBuilder joined = new StringBuilder();
    StringBuilder snippet = new StringBuilder();
    HashSet<string> seen = new HashSet<string>();

    foreach (KeywordScore pattern in patterns)
    {
        string[] parts = detailHtml.SplitWith(pattern.Keyword);
        if (parts == null || parts.Length < 2)
        {
            continue;
        }

        matchpatterns.Add(pattern);

        for (int i = 0; i < parts.Length - 1; i++)
        {
            string before = parts[i];
            // parts[i + 1] is always in range here. The previous code passed an empty
            // string for the final occurrence, which dropped its trailing context.
            string after = parts[i + 1];

            // Walk back to the closest leading delimiter that is not preceded by a
            // character from split_num_commas; the snippet starts right after it.
            int start = 0;
            for (int j = before.Length - 1; j >= 1; j--)
            {
                if (split_bef_commas.Contains(before[j]) && !split_num_commas.Contains(before[j - 1]))
                {
                    start = j + 1;
                    break;
                }
            }

            // Walk forward to the closest trailing delimiter that is not followed by a
            // character from split_num_commas; the snippet ends just before it.
            int end = after.Length;
            for (int j = 0; j < after.Length - 1; j++)
            {
                if (split_aft_commas.Contains(after[j]) && !split_num_commas.Contains(after[j + 1]))
                {
                    end = j;
                    break;
                }
            }

            snippet.Append(before, start, before.Length - start);
            snippet.Append(pattern.Keyword);
            snippet.Append(after, 0, end);

            string normalized = IW2SBaiduQuery.RemoveInivalidChar(snippet.ToString().GetTxtFromHtml2().RemoveSpace().GetLower());
            snippet.Clear();

            // Keep each distinct snippet once, in first-seen order.
            if (seen.Add(normalized))
            {
                joined.Append(normalized).Append(" ");
            }
        }
    }

    return joined.ToString();
}

/// <summary>
/// Computes the score and application type from the matched patterns: the highest
/// pattern score (50 when unset) plus a tenth of every other pattern's score, and the
/// BizType of the highest-scoring pattern whose BizType is greater than zero.
/// </summary>
private static void ApplyPatternScores(List<KeywordScore> matches, ref int maxScore, ref byte appType)
{
    maxScore = matches.Max(x => x.Score ?? 50);
    appType = matches.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
    maxScore += matches.Sum(x => (x.Score ?? 50) / 10);
    maxScore -= matches.Max(x => (x.Score ?? 50) / 10);
}

/// <summary>
/// Returns true when the next '&lt;' after the matched date comes before the next '&gt;'
/// and the next '"', i.e. the date is followed by a tag opening rather than by the end
/// of a tag or attribute value. The check starts at the match's own position, and a
/// missing '&gt;' or '"' is treated as "not in the way".
/// </summary>
private static bool IsDateInBodyText(string html, Match dateMatch)
{
    int after = dateMatch.Index + dateMatch.Length;

    int nextOpen = html.IndexOf('<', after);
    if (nextOpen < 0)
    {
        return false;
    }

    int nextClose = html.IndexOf('>', after);
    int nextQuote = html.IndexOf('"', after);

    bool openBeforeClose = nextClose < 0 || nextOpen < nextClose;
    bool openBeforeQuote = nextQuote < 0 || nextOpen < nextQuote;
    return openBeforeClose && openBeforeQuote;
}