コード例 #1
0
        private void cleaning(IW2S_level1link x, List <IW2S_ExcludeKeyword> excludedKeywords)
        {
            if (excludedKeywords == null || excludedKeywords.Count == 0)
            {
                return;
            }
            string txt           = "{0}/{1}/{2}/{3}".FormatStr(x.Description, x.Abstract, x.Title, x.LinkUrl).ToLower();
            var    matched_ex_kw = excludedKeywords.FirstOrDefault(k => txt.IsContains(k.Keyword));

            if (matched_ex_kw != null)
            {
                x.DataCleanStatus = (byte)DataCleanStatus.Excluded;
                x.Keywords        = "{0} $ExcludedByExcludeKeyword:{1}".FormatStr(x.Keywords, matched_ex_kw.Keyword);
            }
        }
コード例 #2
0
        private void cleaning(IW2S_level1link x, string keywords)
        {
            if (string.IsNullOrEmpty(keywords))
            {
                return;
            }
            string[] kwds = keywords.GetLower().SplitWith(";", ",");
            if (kwds == null || kwds.Length == 0)
            {
                return;
            }
            string txt     = "{0}/{1}/{2}".FormatStr(x.Keywords, x.Domain).ToLower();
            string matched = kwds.FirstOrDefault(y => !string.IsNullOrEmpty(y) && txt.IsContain(y));

            if (!string.IsNullOrEmpty(matched))
            {
                x.DataCleanStatus = (byte)DataCleanStatus.Excluded;
                x.Keywords        = "{0} $ExcludedByExcludingKeyword:{1}".FormatStr(x.Keywords, matched);
            }
        }
コード例 #3
0
        private void HanleTagData(IW2S_BaiduCommend tsk, List <IW2S_ExcludeKeyword> excludedKeywords, IW2SBotMng botmng, string[] searchKeywords, List <KeywordScore> patterns, string title, string href, string abs, ref string domain, string tag, bool isMarket, int rank)
        {
            int    maxScore = 0;
            string realUrl = null, detailHtml = null, abstracts = null;
            byte   appType = 0;

            int?baiduVStar = null;

            if (tag.Contains("c-icon-v1"))
            {
                baiduVStar = 1;
            }
            else if (tag.Contains("c-icon-v2"))
            {
                baiduVStar = 2;
            }
            else if (tag.Contains("c-icon-v3"))
            {
                baiduVStar = 3;
            }

            if (!string.IsNullOrWhiteSpace(href))
            {
                //Encoding enc = Encoding.UTF8;
                //detailHtml = HtmlQueryHelper.GetContent(href, 8000, ref enc, out realUrl);
                var tuplehtml = get_htmlUrl(href);
                if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                {
                    realUrl = tuplehtml.Item1;
                }
                if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                {
                    detailHtml = tuplehtml.Item2;
                }
                if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                {
                    domain = GetDomain(realUrl);
                }
            }
            if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
            {
                var gourl = detailHtml.GetFirstHref2();
                if (!string.IsNullOrEmpty(gourl))
                {
                    var tuplehtml = get_htmlUrl(gourl);
                    if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                    {
                        realUrl = tuplehtml.Item1;
                    }
                    if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                    {
                        detailHtml = tuplehtml.Item2;
                    }
                    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                    {
                        domain = GetDomain(realUrl);
                    }
                }
            }
            if (string.IsNullOrEmpty(realUrl))
            {
                realUrl = href;
            }
            List <KeywordScore> matchpatterns = new List <KeywordScore>();

            if (string.IsNullOrEmpty(detailHtml))
            {
                return;
            }
            else
            {
                //if (!detailHtml.Contains(tsk.CommendKeyword))
                //{
                //    return;
                //}
                var           hrefs          = detailHtml.GetDescendents("a", "href");
                StringBuilder sbabstracts    = new StringBuilder();
                List <string> abstractlist   = new List <string>();
                StringBuilder sbabstractlist = new StringBuilder();

                foreach (KeywordScore pattern in patterns)
                {
                    string[] splitDetailHtmls = detailHtml.SplitWith(pattern.Keyword);
                    if (splitDetailHtmls.Length > 1)
                    {
                        matchpatterns.Add(pattern);
                    }
                    StringBuilder sbpatternStr = new StringBuilder();
                    for (int i = 0; i < splitDetailHtmls.Length - 1; i++)
                    {
                        string splitDetailHtml1 = splitDetailHtmls[i];
                        string splitDetailHtml2 = i < splitDetailHtmls.Length - 2 ? splitDetailHtmls[i + 1] : "";
                        for (int j = splitDetailHtml1.Length - 1; j >= 0; j--)
                        {
                            if (split_bef_commas.Contains(splitDetailHtml1[j]) && j - 1 >= 0 && !split_num_commas.Contains(splitDetailHtml1[j - 1]))
                            {
                                break;
                            }
                            sbpatternStr.Append(splitDetailHtml1[j]);
                        }
                        for (int q = sbpatternStr.Length - 1; q >= 0; q--)
                        {
                            sbabstracts.Append(sbpatternStr[q]);
                        }
                        sbabstracts.Append(pattern.Keyword);
                        sbpatternStr.Clear();
                        for (int j = 0; j < splitDetailHtml2.Length; j++)
                        {
                            if (split_aft_commas.Contains(splitDetailHtml2[j]) && j + 1 < splitDetailHtml2.Length && !split_num_commas.Contains(splitDetailHtml2[j + 1]))
                            {
                                break;
                            }
                            sbpatternStr.Append(splitDetailHtml2[j]);
                        }
                        sbabstracts.Append(sbpatternStr);
                        sbpatternStr.Clear();

                        string tmpsbabstracts = sbabstracts.ToString();
                        tmpsbabstracts = IW2SBaiduQuery.RemoveInivalidChar(tmpsbabstracts.GetTxtFromHtml2().RemoveSpace().GetLower());
                        if (!abstractlist.Contains(tmpsbabstracts))
                        {
                            abstractlist.Add(tmpsbabstracts);
                            sbabstractlist.Append(tmpsbabstracts).Append(" ");
                        }
                        sbabstracts.Clear();
                    }
                }
                //获取摘要
                abstracts = sbabstractlist.ToString();
                if (!string.IsNullOrEmpty(abstracts) && matchpatterns.Count > 0)
                {
                    maxScore  = matchpatterns.Max(x => x.Score ?? 50);
                    appType   = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
                    maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                    maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                }
            }
            if (string.IsNullOrEmpty(abstracts) && !string.IsNullOrEmpty(abs))
            {
                matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
                if (matchpatterns.Count > 0)
                {
                    maxScore = matchpatterns.Max(x => x.Score ?? 50);
                    appType  = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();

                    maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                    maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                }
            }
            if (maxScore > 100)
            {
                maxScore = 100;
            }


            bool          is_title_matched = title.GetLower().IsContains2(searchKeywords);
            bool          is_abstr_matched = abs.IsContains2(searchKeywords);
            BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                             is_title_matched ? BaiduItemPart.Title :
                                             is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;

            Regex           reg = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
            MatchCollection mc  = reg.Matches(detailHtml);
            //MatchCollection cols = reg.Matches(item.Html);
            string time = "";

            if (mc.Count > 0)
            {
                foreach (Match x in mc)
                {
                    //判断是正文中的还是代码和注释中的时间
                    if (!string.IsNullOrEmpty(x.Value))
                    {
                        var txt    = detailHtml.SubAfter(x.Value);
                        var index1 = txt.IndexOf('<');
                        var index2 = txt.IndexOf('>');
                        var index3 = txt.IndexOf('\"');
                        //只使用正文中的时间
                        if (index1 < index2 && index1 < index3)
                        {
                            time = x.Value;
                            break;
                        }
                    }
                }
            }


            IW2S_level1link l1 = new IW2S_level1link
            {
                UsrId     = tsk.UsrId,
                Domain    = domain,
                TopDomain = GetLevel1Domain(domain),
                Keywords  = string.Format("{0}", tsk.CommendKeyword),
                LinkUrl   = realUrl,
                MatchAt   = (byte)part,
                Html      = detailHtml,

                AppType         = appType,
                BizId           = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
                SearchkeywordId = tsk._id.ToString(),
                CreatedAt       = DateTime.UtcNow.AddHours(8),
                Description     = abs,
                Title           = title,
                Score           = maxScore,
                Abstract        = abstracts,
                IsMarket        = isMarket,
                ProjectId       = tsk.ProjectId,
                PublishTime     = time,
                AlternateFields = "0",
                Rank            = rank
            };

            if (baiduVStar.HasValue)
            {
                l1.BaiduVStar = baiduVStar.Value;
            }

            botmng.save_level1_links(new List <IW2S_level1link> {
                l1
            }, tsk, excludedKeywords);
        }