/// <summary>
        /// Crawls Bing result pages for a search task, starting at <paramref name="link"/>,
        /// fetches the landing page of every hit and persists one
        /// <c>IW2S_Bing_level1link</c> per hit through <c>SaveResult</c>.
        /// </summary>
        /// <remarks>
        /// The method blocks for a long time: it sleeps a random 2-5 s before each result
        /// page, 8-20 s before each landing-page download and 8-15 s between pages.
        /// Hits are saved page by page and the working list is cleared after each save,
        /// so the returned list no longer contains the saved items.
        /// </remarks>
        /// <param name="link">URL of the first result page; nothing is crawled when it is null or empty.</param>
        /// <param name="searchTsk">Task supplying Keyword, CommendKeyword, UsrId, ProjectId and _id for the stored links.</param>
        /// <returns>The working list (already cleared after the last save).</returns>
        public List <IW2S_Bing_level1link> GetLinks(string link, IW2S_Bing_BaiduCommend searchTsk)
        {
            // Search at most 10 result pages per task.
            const int maxPages = 10;
            // Stop after 3 consecutive pages that produced no hit.
            const int maxEmptyPages = 3;

            List <IW2S_Bing_level1link> result = new List <IW2S_Bing_level1link>();
            // One generator for all throttling delays of this crawl.
            Random rng          = new Random();
            int    nohist_pages = 0;
            int    quried_pages = 0;
            int    fanye        = 0;   // value of Bing's "first" paging parameter

            while (!string.IsNullOrEmpty(link) && quried_pages < maxPages)
            {
                log(link);
                CookieContainer  cc               = new CookieContainer();
                Encoding         enc              = null;
                CookieCollection cookiesColl      = new CookieCollection();
                CookieCollection cookieCollection = new CookieCollection();
                string           Rurl             = "http://cn.bing.com/";
                string           cookie           = "";

                // Request the Bing home page first; the body is discarded, only the
                // cookies it hands back are carried over to the search request.
                TaobaoWebHelper.GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                Thread.Sleep(rng.Next(2000, 5000));

                var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;

                // No response, or the "nothing found" marker: stop crawling.
                if (html == null || html.Contains("没有找到搜索内容!"))
                {
                    break;
                }

                var tags = html.SubAfter("body").SubBefore("/body").SplitWith("b_content");
                if (tags == null || tags.Length == 0)
                {
                    // Guard the index below: no "b_content" section means no result list.
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }

                // The result items are the <li> fragments of the last "b_content" section.
                var tagsD = tags[tags.Length - 1].SubAfter("搜索结果").SubBefore("</ol>").ToString().SplitWith("</li>");
                if (tagsD == null || tagsD.Length == 0)
                {
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }

                bool nohit = true;
                foreach (var tag in tagsD)
                {
                    // Only fragments carrying an <h2> heading are organic results.
                    if (!tag.Contains("h2"))
                    {
                        continue;
                    }

                    var    a     = tag.SubAfter("h2").SubAfter("a");
                    string title = RemoveInivalidChar(a.RemoveSpace().GetLower().SubBefore("</h2>").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string href  = a.GetFirstHref2();
                    if (string.IsNullOrEmpty(title) && string.IsNullOrEmpty(href))
                    {
                        continue;
                    }

                    // href may still be null when only the title was found; avoid a null dereference.
                    href = (href ?? string.Empty).Replace("amp;", "");

                    string abs    = RemoveInivalidChar(tag.SubAfter("<p>").SubBefore("</p").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string domain = GetDomain(href);

                    // Title and abstract combined, used for the keyword match flags below.
                    string txt = "{0},{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    // Throttle before hitting the landing page.
                    Thread.Sleep(rng.Next(8000, 20000));
                    var htmldetail = "";

                    try
                    {
                        htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: the hit is still stored, just without landing-page HTML.
                        log("DETAIL FAILED " + href + " " + ex.Message);
                        // Only a relative link is resolved against the Bing host; prefixing an
                        // absolute URL would corrupt LinkUrl and BizId.
                        if (!href.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                        {
                            href = "http://cn.bing.com" + href;
                        }
                    }

                    bool          is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
                    bool          is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
                    BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                                     is_title_matched ? BaiduItemPart.Title :
                                                     is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
                    bool is_bus_matched       = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

                    // 90 when both keywords match, 80 for the commend keyword only,
                    // 50 for the main keyword only, 0 otherwise.
                    int score = is_bus_matched
                                ? (is_itm_title_matched ? 80 + 10 : 80)
                                : (is_itm_title_matched ? 50 : 0);

                    IW2S_Bing_level1link l1 = new IW2S_Bing_level1link
                    {
                        UsrId           = searchTsk.UsrId,
                        Domain          = domain,
                        TopDomain       = GetLevel1Domain(domain),
                        Keywords        = string.Format("{0} + {1}", searchTsk.Keyword, searchTsk.CommendKeyword),
                        LinkUrl         = href,
                        MatchAt         = (byte)part,
                        Html            = htmldetail,
                        // Bit 0: commend keyword matched, bit 1: main keyword matched.
                        MatchType       = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                        AppType         = (byte)0,
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(href, searchTsk.UsrId, searchTsk.Keyword)),
                        SearchkeywordId = searchTsk._id.ToString(),
                        // Timestamp shifted to UTC+8.
                        CreatedAt       = DateTime.UtcNow.AddHours(8),
                        Description     = abs,
                        Title           = title,
                        Score           = score,
                        Abstract        = abs,
                        ProjectId       = searchTsk.ProjectId
                    };

                    result.Add(l1);
                    nohit        = false;
                    nohist_pages = 0;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                if (nohist_pages >= maxEmptyPages)
                {
                    break;
                }

                quried_pages++;
                pages++;

                // The next page is addressed by offset rather than by following the
                // page's own "next" link. The keyword is escaped so that spaces, '&',
                // '#' and similar characters cannot break the query string.
                // NOTE(review): only Keyword (not CommendKeyword) goes into the paging
                // URL - confirm this matches how the caller builds the first link.
                fanye = fanye + 10;
                link  = "http://cn.bing.com/search?q={0}&first={1}&FORM=PERE3".FormatStr(Uri.EscapeDataString(searchTsk.Keyword ?? string.Empty), fanye);

                // Persist this page's hits, then start the next page with an empty list.
                SaveResult(result);
                result.Clear();

                Thread.Sleep(rng.Next(8000, 15000));
            }
            return(result);
        }
        /// <summary>
        /// Turns one search-result item into an <c>IW2S_level1link</c>: resolves the real
        /// landing page, builds a keyword-centred abstract from its HTML, scores the
        /// matched patterns, looks for a publish date and saves the link through
        /// <paramref name="botmng"/>.
        /// </summary>
        /// <remarks>Nothing is saved when no landing-page HTML could be obtained.</remarks>
        /// <param name="tsk">Task supplying UsrId, CommendKeyword, ProjectId and _id.</param>
        /// <param name="excludedKeywords">Passed through unchanged to <c>save_level1_links</c>.</param>
        /// <param name="botmng">Manager that persists the link.</param>
        /// <param name="searchKeywords">Keywords tested against title and abstract to fill MatchAt.</param>
        /// <param name="patterns">Scored keywords searched for in the landing page (or in <paramref name="abs"/> as a fallback).</param>
        /// <param name="title">Title of the result item.</param>
        /// <param name="href">Link of the result item; may be a redirect link.</param>
        /// <param name="abs">Abstract shown in the result list.</param>
        /// <param name="domain">Domain of the result; filled from the resolved URL when it arrives empty.</param>
        /// <param name="tag">Raw HTML of the result item, inspected for the "c-icon-v1/2/3" markers.</param>
        /// <param name="isMarket">Stored as IsMarket on the link.</param>
        /// <param name="rank">Stored as Rank on the link.</param>
        private void HanleTagData(IW2S_BaiduCommend tsk, List <IW2S_ExcludeKeyword> excludedKeywords, IW2SBotMng botmng, string[] searchKeywords, List <KeywordScore> patterns, string title, string href, string abs, ref string domain, string tag, bool isMarket, int rank)
        {
            int    maxScore = 0;
            string realUrl = null, detailHtml = null, abstracts = null;
            byte   appType = 0;

            // V-level badge of the result item, when one of the icon classes is present.
            int? baiduVStar = null;
            if (tag.Contains("c-icon-v1"))
            {
                baiduVStar = 1;
            }
            else if (tag.Contains("c-icon-v2"))
            {
                baiduVStar = 2;
            }
            else if (tag.Contains("c-icon-v3"))
            {
                baiduVStar = 3;
            }

            // First hop: fetch the link; Item1 is used as the real URL, Item2 as the HTML.
            if (!string.IsNullOrWhiteSpace(href))
            {
                var fetched = get_htmlUrl(href);
                if (fetched != null && !string.IsNullOrEmpty(fetched.Item1))
                {
                    realUrl = fetched.Item1;
                }
                if (fetched != null && !string.IsNullOrEmpty(fetched.Item2))
                {
                    detailHtml = fetched.Item2;
                }
                if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                {
                    domain = GetDomain(realUrl);
                }
            }

            // Second hop: the page is a script-driven redirect stub, follow its first link.
            if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
            {
                var gourl = detailHtml.GetFirstHref2();
                if (!string.IsNullOrEmpty(gourl))
                {
                    var fetched = get_htmlUrl(gourl);
                    if (fetched != null && !string.IsNullOrEmpty(fetched.Item1))
                    {
                        realUrl = fetched.Item1;
                    }
                    if (fetched != null && !string.IsNullOrEmpty(fetched.Item2))
                    {
                        detailHtml = fetched.Item2;
                    }
                    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                    {
                        domain = GetDomain(realUrl);
                    }
                }
            }

            if (string.IsNullOrEmpty(realUrl))
            {
                realUrl = href;
            }

            // Without landing-page HTML there is nothing to analyse or store.
            if (string.IsNullOrEmpty(detailHtml))
            {
                return;
            }

            List <KeywordScore> matchpatterns = new List <KeywordScore>();
            StringBuilder       snippet       = new StringBuilder();
            HashSet <string>    seenSnippets  = new HashSet <string>();   // de-duplicates abstract pieces
            StringBuilder       joined        = new StringBuilder();

            foreach (KeywordScore pattern in patterns)
            {
                string[] pieces = detailHtml.SplitWith(pattern.Keyword);
                if (pieces.Length > 1)
                {
                    matchpatterns.Add(pattern);
                }

                // One snippet per keyword occurrence: text back to the previous delimiter,
                // the keyword itself, then text up to the next delimiter.
                for (int i = 0; i < pieces.Length - 1; i++)
                {
                    string before = pieces[i];
                    // NOTE(review): for the last occurrence the trailing context is taken as
                    // empty although pieces[i + 1] exists - confirm whether that is intended.
                    string after = i < pieces.Length - 2 ? pieces[i + 1] : "";

                    // Walk left until a delimiter that is not preceded by a character from
                    // split_num_commas.
                    int start = before.Length;
                    while (start > 0)
                    {
                        int j = start - 1;
                        if (split_bef_commas.Contains(before[j]) && j - 1 >= 0 && !split_num_commas.Contains(before[j - 1]))
                        {
                            break;
                        }
                        start = j;
                    }
                    snippet.Append(before, start, before.Length - start);
                    snippet.Append(pattern.Keyword);

                    // Walk right until a delimiter that is not followed by a character from
                    // split_num_commas.
                    int end = 0;
                    while (end < after.Length)
                    {
                        if (split_aft_commas.Contains(after[end]) && end + 1 < after.Length && !split_num_commas.Contains(after[end + 1]))
                        {
                            break;
                        }
                        end++;
                    }
                    snippet.Append(after, 0, end);

                    string cleaned = IW2SBaiduQuery.RemoveInivalidChar(snippet.ToString().GetTxtFromHtml2().RemoveSpace().GetLower());
                    if (seenSnippets.Add(cleaned))
                    {
                        joined.Append(cleaned).Append(" ");
                    }
                    snippet.Clear();
                }
            }

            abstracts = joined.ToString();

            // Score from the page-derived abstract when there is one; otherwise fall back
            // to patterns found in the result-list abstract.
            bool canScore = !string.IsNullOrEmpty(abstracts);
            if (!canScore && !string.IsNullOrEmpty(abs))
            {
                matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
                canScore      = true;
            }
            if (canScore && matchpatterns.Count > 0)
            {
                // Best pattern score plus a tenth of every other matched pattern's score.
                maxScore  = matchpatterns.Max(x => x.Score ?? 50);
                appType   = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
                maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
            }
            if (maxScore > 100)
            {
                maxScore = 100;
            }

            bool          is_title_matched = title.GetLower().IsContains2(searchKeywords);
            bool          is_abstr_matched = abs.IsContains2(searchKeywords);
            BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                             is_title_matched ? BaiduItemPart.Title :
                                             is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;

            // Publish date: first date in the page that sits in body text rather than
            // inside a tag, attribute, script or comment.
            string time = "";
            foreach (Match m in Regex.Matches(detailHtml, "(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)"))
            {
                // Look at the text following THIS match, not the first occurrence of the
                // same date string elsewhere in the page.
                int after    = m.Index + m.Length;
                int nextOpen = detailHtml.IndexOf('<', after);
                if (nextOpen < 0)
                {
                    continue;   // no markup follows, cannot tell where the date sits
                }
                int nextClose = detailHtml.IndexOf('>', after);
                int nextQuote = detailHtml.IndexOf('"', after);
                // A character that never occurs again (-1) cannot come first.
                bool beforeClose = nextClose < 0 || nextOpen < nextClose;
                bool beforeQuote = nextQuote < 0 || nextOpen < nextQuote;
                if (beforeClose && beforeQuote)
                {
                    time = m.Value;
                    break;
                }
            }

            IW2S_level1link l1 = new IW2S_level1link
            {
                UsrId     = tsk.UsrId,
                Domain    = domain,
                TopDomain = GetLevel1Domain(domain),
                Keywords  = string.Format("{0}", tsk.CommendKeyword),
                LinkUrl   = realUrl,
                MatchAt   = (byte)part,
                Html      = detailHtml,

                AppType         = appType,
                BizId           = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
                SearchkeywordId = tsk._id.ToString(),
                // Timestamp shifted to UTC+8.
                CreatedAt       = DateTime.UtcNow.AddHours(8),
                Description     = abs,
                Title           = title,
                Score           = maxScore,
                Abstract        = abstracts,
                IsMarket        = isMarket,
                ProjectId       = tsk.ProjectId,
                PublishTime     = time,
                AlternateFields = "0",
                Rank            = rank
            };

            if (baiduVStar.HasValue)
            {
                l1.BaiduVStar = baiduVStar.Value;
            }

            botmng.save_level1_links(new List <IW2S_level1link> {
                l1
            }, tsk, excludedKeywords);
        }
        /// <summary>
        /// Parses one search-result item: resolves the real landing page, extracts its
        /// main content and publish time, and saves the resulting <c>Dnl_Link_Baidu</c>.
        /// </summary>
        /// <remarks>Nothing is saved when no landing-page HTML could be obtained.</remarks>
        /// <param name="tsk">Keyword task; supplies Keyword and _id.</param>
        /// <param name="excludedDomains">List of excluded domains; not referenced in this method body.</param>
        /// <param name="searchKeywords">Search keywords tested against title and description.</param>
        /// <param name="title">Title of the result item.</param>
        /// <param name="href">Link of the result item; may be a redirect link.</param>
        /// <param name="description">Description shown in the result list.</param>
        /// <param name="domain">Domain of the result; filled from the resolved URL when empty, and stripped of its http/https prefix.</param>
        /// <param name="tag">Raw HTML of the result item.</param>
        /// <param name="isMarket">Whether the item is a promotion link.</param>
        private void HanleTagData(Dnl_Keyword tsk, List <Dnl_IgnoreDomain> excludedDomains, string searchKeywords, string title, string href, string description, ref string domain, string tag, bool isMarket)
        {
            string realUrl = null, detailHtml = null;     // real address and page source

            // Blue-V level of the result item, when one of the icon classes is present.
            int? baiduVStar = null;
            if (tag.Contains("c-icon-v1"))
            {
                baiduVStar = 1;
            }
            else if (tag.Contains("c-icon-v2"))
            {
                baiduVStar = 2;
            }
            else if (tag.Contains("c-icon-v3"))
            {
                baiduVStar = 3;
            }

            // First hop: fetch the link; Item1 is used as the real URL, Item2 as the HTML.
            if (!string.IsNullOrWhiteSpace(href))
            {
                var fetched = get_htmlUrl(href);
                if (fetched != null && !string.IsNullOrEmpty(fetched.Item1))
                {
                    realUrl = fetched.Item1;
                }
                if (fetched != null && !string.IsNullOrEmpty(fetched.Item2))
                {
                    detailHtml = fetched.Item2;
                }
                // Derive the domain from the real address when the caller gave none.
                if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                {
                    domain = GetDomain(realUrl);
                }
            }

            // Second hop: the page itself is a script-driven redirect, follow its first link.
            if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
            {
                var gourl = detailHtml.GetFirstHref2();
                if (!string.IsNullOrEmpty(gourl))
                {
                    var fetched = get_htmlUrl(gourl);
                    if (fetched != null && !string.IsNullOrEmpty(fetched.Item1))
                    {
                        realUrl = fetched.Item1;
                    }
                    if (fetched != null && !string.IsNullOrEmpty(fetched.Item2))
                    {
                        detailHtml = fetched.Item2;
                    }
                    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                    {
                        domain = GetDomain(realUrl);
                    }
                }
            }

            // Strip the scheme prefix and look up the domain's collection count.
            long collectionNum = 0;
            if (!string.IsNullOrEmpty(domain))
            {
                domain        = Regex.Replace(domain, "http://|https://", "");
                collectionNum = GetDomainCollectionNum(domain);
            }

            if (string.IsNullOrEmpty(realUrl))
            {
                realUrl = href;
            }

            // Without page source there is nothing to analyse or store.
            if (string.IsNullOrEmpty(detailHtml))
            {
                return;
            }

            string content = GetMainContentHelper.GetMainContent(detailHtml);         // main text of the page

            bool          is_title_matched = title.IsContains2(searchKeywords);       // title contains the keywords
            bool          is_desc_matched  = description.IsContains2(searchKeywords); // description contains the keywords
            BaiduItemPart part             = is_title_matched && is_desc_matched ? BaiduItemPart.TitleAbstract :
                                             is_title_matched ? BaiduItemPart.Title :
                                             is_desc_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;

            /* Publish time */
            const string datePattern = "(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)";
            string       time        = "";

            // Prefer the time element of the result item itself.
            string timeStr = tag.SubAfter("newTimeFactor_before_abs").SubBefore("</span>");
            if (!string.IsNullOrEmpty(timeStr))
            {
                time = Regex.Match(timeStr, datePattern).Value;
            }

            // Fall back to the page source when the item has no time element or the
            // element holds no absolute date.
            if (string.IsNullOrEmpty(time))
            {
                foreach (Match m in Regex.Matches(detailHtml, datePattern))
                {
                    // Decide whether the date is body text or sits inside markup, script or
                    // a comment. Look at the text following THIS match, not the first
                    // occurrence of the same date string elsewhere in the page.
                    int after    = m.Index + m.Length;
                    int nextOpen = detailHtml.IndexOf('<', after);
                    if (nextOpen < 0)
                    {
                        continue;   // no markup follows, cannot tell where the date sits
                    }
                    int nextClose = detailHtml.IndexOf('>', after);
                    int nextQuote = detailHtml.IndexOf('"', after);
                    // A character that never occurs again (-1) cannot come first.
                    bool beforeClose = nextClose < 0 || nextOpen < nextClose;
                    bool beforeQuote = nextQuote < 0 || nextOpen < nextQuote;
                    // Only a date in body text is used.
                    if (beforeClose && beforeQuote)
                    {
                        time = m.Value;
                        break;
                    }
                }
            }

            // Build the link record.
            Dnl_Link_Baidu link = new Dnl_Link_Baidu
            {
                Domain          = domain,
                TopDomain       = GetLevel1Domain(domain),
                Keywords        = tsk.Keyword,
                LinkUrl         = realUrl,
                MatchAt         = (byte)part,
                Html            = detailHtml,
                SearchkeywordId = tsk._id.ToString(),
                // Timestamp shifted to UTC+8.
                CreatedAt       = DateTime.UtcNow.AddHours(8),
                Description     = description,
                Title           = title,
                IsPromotion     = isMarket,
                PublishTime     = time,
                Content         = content,
                DCNum           = collectionNum
            };

            if (baiduVStar.HasValue)
            {
                link.BaiduVStar = baiduVStar.Value;
            }

            SaveLink(link, tsk);
        }
Пример #4
0
        public List <Dnl_Google_level1link> GetLinks(string link, Dnl_Google_BaiduCommend searchTsk)
        {
            List <Dnl_Google_level1link> result = new List <Dnl_Google_level1link>();
            int nohist_pages = 0;
            int quried_pages = 0;
            int fanye        = 0;

            //最多搜索10页
            while (!string.IsNullOrEmpty(link) && quried_pages <= 10)
            {
                log(link);
                CookieContainer  cc               = new CookieContainer();
                Encoding         enc              = null;
                CookieCollection cookiesColl      = new CookieCollection();
                CookieCollection cookieCollection = new CookieCollection();
                string           Rurl             = "https://www.google.com";
                string           cookie           = "";

                WebClient webClient = new WebClient();
                webClient.Credentials = CredentialCache.DefaultCredentials;
                Byte[] pageData = webClient.DownloadData(link);

                string pageHtml = Encoding.GetEncoding("Big5").GetString(pageData);

                pageHtml = Strings.StrConv(pageHtml, VbStrConv.SimplifiedChinese, 0);

                //string hhhtml = TaobaoWebHelper.GetContentByIndex(Rurl, 80000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
                //cookiesColl = cookieCollection;
                //int gg = new Random().Next(2000, 5000);
                //Thread.Sleep(gg);

                //Rurl = link;
                //var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);// GetContent(link, 8000, cc, ref enc, out Rurl);
                //cookiesColl = cookieCollection;

                var html = pageHtml;

                if (html == null)
                {
                    break;
                }
                //  html = Regex.Unescape(html);
                if (html.Contains("没有找到搜索内容!"))
                {
                    break;
                }

                var tags = html.SubAfter("<body").SubAfter("center_col").SubBefore("id=\"foot\"");


                var tagsD = tags.SplitWith("class=\"g\"");

                if (tagsD == null || tagsD.Length == 0)
                {
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }
                bool nohit = true;
                foreach (var tag in tagsD)
                {
                    if (!tag.Contains("h3"))
                    {
                        continue;
                    }

                    //if (!tag.Contains("sp_requery"))
                    //{
                    //    continue;
                    //}
                    var    a     = tag.SubAfter("h3").SubAfter("a");
                    string title = RemoveInivalidChar(a.RemoveSpace().GetLower().SubBefore("</h3>").GetTxtFromHtml2().RemoveSpace().GetLower()); // RemoveInivalidChar(tag.SubAfter("<h4").SubBefore("</h4>").GetTxtFromHtml2().RemoveSpace());
                    string href  = a.GetFirstHref2();                                                                                            //tag.SubAfter("<h4").SubBefore("</a>").GetFirstHref2();

                    if (href.Contains("/url?q="))
                    {
                        href = href.Replace("/url?q=", "");
                    }


                    if (!href.Contains("http"))
                    {
                        href = "https://www.google.com" + href;
                    }

                    if (string.IsNullOrEmpty(title) && string.IsNullOrEmpty(href))
                    {
                        continue;
                    }
                    href = href.Replace("amp;", "");
                    var    sdsfdsf = GetDomain(href);
                    string abs     = RemoveInivalidChar(tag.SubAfter("class=\"st\"").SubBefore("</span").GetTxtFromHtml2().RemoveSpace().GetLower()); //RemoveInivalidChar(tag.SubAfter("<h4>").SubBefore("\"s-p\"").SubBefore("<script>").GetTxtFromHtml2().RemoveSpace());
                    string timesp  = "";
                    string domain  = GetDomain(href);
                    //tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetTxtFromHtml2().SubAfter("(").SubAfter("(").SubBefore(",").Replace('"', ' ').Trim();
                    //domain = BaiduQuery.GetDomain(domain);

                    int maxScore = 0;

                    byte appType = 0;
                    //没有包含需要protect item信息的过滤掉
                    string txt = "{0},{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    int nn = new Random().Next(8000, 20000);
                    Thread.Sleep(nn);
                    var htmldetail = "";

                    try
                    {
                        // htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);// GetContent(href, 8000, cc, ref enc, out Rurl);

                        WebClient webClient2 = new WebClient();
                        webClient2.Credentials = CredentialCache.DefaultCredentials;
                        Byte[] pageData2 = webClient2.DownloadData(href);

                        htmldetail = Encoding.GetEncoding("Big5").GetString(pageData2);

                        htmldetail = Strings.StrConv(htmldetail, VbStrConv.SimplifiedChinese, 0);
                    }
                    catch (Exception)
                    {
                        //htmldetail = "";
                    }

                    Regex reg = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
                    Match m   = reg.Match(htmldetail);
                    //MatchCollection cols = reg.Matches(item.Html);

                    if (m.Groups.Count > 0)
                    {
                        timesp = m.Groups[0].Value;
                    }

                    bool          is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
                    bool          is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
                    BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                                     is_title_matched ? BaiduItemPart.Title :
                                                     is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
                    bool is_bus_matched       = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());



                    Dnl_Google_level1link l1 = new Dnl_Google_level1link
                    {
                        UsrId           = searchTsk.UsrId,
                        Domain          = domain.Replace("http://", "").Replace("https://", ""),
                        TopDomain       = GetLevel1Domain(domain),
                        Keywords        = string.Format("{0} + {1}", searchTsk.Keyword, searchTsk.CommendKeyword),
                        LinkUrl         = href,
                        MatchAt         = (byte)part,
                        Html            = htmldetail,
                        MatchType       = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                        AppType         = appType,
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(href, searchTsk.UsrId, searchTsk.Keyword)),
                        SearchkeywordId = searchTsk._id.ToString(),
                        CreatedAt       = DateTime.UtcNow.AddHours(8),
                        Description     = abs,
                        Title           = title,
                        Score           = maxScore,
                        Abstract        = abs,
                        ProjectId       = searchTsk.ProjectId,
                        PublishTime     = timesp
                    };
                    if (is_bus_matched)
                    {
                        l1.MatchType = l1.MatchType;
                    }
                    if (is_itm_title_matched)
                    {
                        l1.MatchType = l1.MatchType;
                    }
                    byte MatchType = (byte)((is_bus_matched ? 10 : 0) + (is_itm_title_matched ? 30 : 0));
                    if (is_bus_matched == true && is_itm_title_matched == true)
                    {
                        //l1.Score = busTsk.Score + 5;
                        l1.Score = 80 + 10;
                    }
                    if (is_bus_matched == true && is_itm_title_matched == false)
                    {
                        l1.Score = 80;
                    }
                    if (is_bus_matched == false && is_itm_title_matched == true)
                    {
                        l1.Score = 50;
                    }

                    result.Add(l1);
                    nohit        = false;
                    nohist_pages = 0;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                //如果连续3页都没有结果,就跳出
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++;

                //****** sougou 需要重写 *********************
                link = html.SubAfter("id=\"foot\"").SubAfter("text-align:left").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "https://www.google.com/".GetContact(link);

                    link = link.Replace("amp;", "");
                }
                fanye = fanye + 10;

                SaveResult(result);
                result.Clear();

                int n = new Random().Next(8000, 15000);
                Thread.Sleep(n);
            }
            return(result);
        }
Пример #5
0
        /// <summary>
        /// Walks the Sogou WeChat article search result pages starting at <paramref name="link"/>,
        /// builds one <c>IW2S_WX_level1link</c> per result entry and passes every page's entries to <c>SaveResult</c>.
        /// </summary>
        /// <param name="link">URL of the first result page; it is replaced by the "next page" link after each page.</param>
        /// <param name="searchTsk">Search task that supplies the keyword, commend keyword, user id, project id and task id.</param>
        /// <returns>
        /// The working list. It is cleared right after each page has been handed to <c>SaveResult</c>,
        /// so callers should not expect it to carry the collected links.
        /// </returns>
        public List <IW2S_WX_level1link> GetLinks(string link, IW2S_WX_BaiduCommend searchTsk)
        {
            List <IW2S_WX_level1link> result = new List <IW2S_WX_level1link>();
            int nohist_pages = 0;
            int quried_pages = 0;

            // One generator for all randomized delays of this call.
            Random rnd = new Random();
            // Built once instead of once per result entry; matches dates such as 2016-1-2, 2016/01/02 or 2016年1月2日.
            Regex dateRegex = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

            // The counter starts at 0 and the loop runs while it is <= 2, i.e. at most 3 result pages are fetched.
            while (!string.IsNullOrEmpty(link) && quried_pages <= 2)
            {
                log(link);
                CookieContainer  cc               = new CookieContainer();
                Encoding         enc              = null;
                CookieCollection cookiesColl      = new CookieCollection();
                CookieCollection cookieCollection = new CookieCollection();
                string           Rurl             = "http://weixin.sogou.com/";
                string           cookie           = "";

                // Request the home page first; its body is ignored, only the returned cookies are carried forward.
                TaobaoWebHelper.GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                Thread.Sleep(rnd.Next(5000, 8000));

                var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                if (html == null)
                {
                    break;
                }

                // The page text for "no matching WeChat articles found".
                if (html.Contains("没有找到相关的微信公众号文章"))
                {
                    break;
                }

                var tags = html.SplitWith("wx-rb wx-rb3");
                if (tags == null || tags.Length == 0 || tags.Length == 1)
                {
                    // Retry with the marker written without the space.
                    tags = html.SplitWith("wx-rbwx-rb3");
                }
                if (tags == null || tags.Length == 0)
                {
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }

                bool nohit = true;
                foreach (var tag in tags)
                {
                    if (!tag.Contains("txt-box"))
                    {
                        continue;
                    }

                    string title      = RemoveInivalidChar(tag.SubAfter("<h4").SubBefore("</h4>").GetTxtFromHtml2().RemoveSpace());
                    string href       = tag.SubAfter("<h4").SubBefore("</a>").GetFirstHref2();
                    string abs        = RemoveInivalidChar(tag.SubAfter("<h4>").SubBefore("\"s-p\"").SubBefore("<script>").GetTxtFromHtml2().RemoveSpace());
                    string domain     = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetTxtFromHtml2().SubAfter("(").SubAfter("(").SubBefore(",").Replace('"', ' ').Trim();
                    string SourceLink = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetFirstHref2();
                    string TitleImg   = tag.SubAfter("img_box2").SubBefore("</a").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();

                    // Filter out entries that yield no title/abstract text to match against.
                    string txt = "{0},{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    // Relative result links are made absolute; "amp;" is stripped to undo HTML-escaped ampersands.
                    if (href.IsStartWith("/websearch"))
                    {
                        href = "http://weixin.sogou.com" + href.Replace("amp;", "");
                    }
                    href = href.Replace("amp;", "");

                    Thread.Sleep(rnd.Next(8000, 20000));
                    var htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);

                    // Regex.Match throws on a null input, so a failed detail download is treated as empty text.
                    Match  m    = dateRegex.Match(htmldetail ?? "");
                    string time = m.Success ? m.Value : "";

                    // Continue with the URL reported back by the detail request (presumably the redirected address - TODO confirm).
                    href = Rurl;
                    var hrefNew          = href + "&f=json";
                    var htmldetailNewUrl = get_Detailehtml(hrefNew, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                    try
                    {
                        // The JSON variant of the page carries the article address in its "link" field.
                        href = htmldetailNewUrl.SubAfter("\"link\":").SubBefore(",\"source_url\":").Replace('"', ' ').Replace("\\", "").RemoveSpace();
                    }
                    catch (Exception ex)
                    {
                        // Best effort: keep the previous href, but leave a trace instead of failing silently.
                        log("resolve article link failed: " + ex.Message);
                    }

                    bool          is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
                    bool          is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
                    BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                                     is_title_matched ? BaiduItemPart.Title :
                                                     is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
                    bool is_bus_matched       = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

                    // Details scraped from the page behind SourceLink; they stay empty when that page lacks the "em_weixinhao" marker.
                    var no         = "";
                    var qrcode     = "";
                    var function   = "";
                    var NoIcon     = "";
                    var QrcodeIcon = "";
                    SourceLink = SourceLink.Replace("amp;", "");
                    Thread.Sleep(rnd.Next(8000, 15000));
                    var htmlNo = get_Nohtml(SourceLink, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                    if (!string.IsNullOrEmpty(htmlNo) && htmlNo.Contains("em_weixinhao"))
                    {
                        no         = htmlNo.SubAfter("em_weixinhao").SubBefore("/label").GetTxtFromHtml2().RemoveSpace();
                        qrcode     = htmlNo.SubAfter("v-box").SubBefore("<em").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();
                        function   = htmlNo.SubAfter("功能介绍:</").SubBefore("/span").GetTxtFromHtml2().RemoveSpace();
                        // From here on SourceLink holds the text following "微信认证:" instead of a URL.
                        SourceLink = htmlNo.SubAfter("微信认证:").SubBefore("/div").GetTxtFromHtml2().RemoveSpace();
                        NoIcon     = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("src=").SubBefore("onload").Replace(">", "").Replace('"', ' ').RemoveSpace();
                        QrcodeIcon = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("err:").SubBefore(">").Replace(">", "").Replace('"', ' ').Replace("'", "").RemoveSpace();
                    }

                    IW2S_WX_level1link l1 = new IW2S_WX_level1link
                    {
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(title, domain, searchTsk.UsrId)),
                        Description     = abs,
                        Domain          = domain,
                        UsrId           = searchTsk.UsrId,
                        LinkUrl         = href,
                        MatchAt         = (byte)part,
                        Title           = title,
                        CreatedAt       = DateTime.Now,
                        DataCleanStatus = 0,
                        Function        = function,
                        SearchkeywordId = searchTsk._id.ToString(),
                        Keywords        = searchTsk.Keyword,
                        PublicNo        = no,
                        QrCode          = qrcode,
                        SourceLink      = SourceLink,
                        TagType         = 0,
                        ImgIcon         = NoIcon,
                        QrCodeIcon      = QrcodeIcon,
                        ProjectId       = searchTsk.ProjectId,
                        TitleImg        = TitleImg,
                        PublishTime     = time,
                        Html            = htmldetail
                    };

                    // Score by which keywords occur in "title,abstract"; with neither match the score is left untouched.
                    if (is_bus_matched && is_itm_title_matched)
                    {
                        l1.Score = 80 + 10;
                    }
                    else if (is_bus_matched)
                    {
                        l1.Score = 80;
                    }
                    else if (is_itm_title_matched)
                    {
                        l1.Score = 50;
                    }

                    result.Add(l1);
                    nohit        = false;
                    nohist_pages = 0;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // Give up after more than 3 consecutive pages without a hit
                // (with the 3-page cap of the loop condition this threshold cannot be reached).
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++;

                // TODO (from the original author): the sogou paging extraction needs to be rewritten.
                link = html.SubAfter("sogou_next").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "http://weixin.sogou.com/weixin".GetContact(link);
                }

                // Persist this page's entries, then start the next page with an empty list.
                SaveResult(result);
                result.Clear();

                Thread.Sleep(rnd.Next(8000, 15000));
            }
            return(result);
        }
Пример #6
0
        /// <summary>
        /// Walks Baidu result pages starting at <paramref name="link"/>, downloads every result's target page,
        /// extracts keyword-centred abstracts from it, scores the match and saves one <c>level1link</c> per
        /// accepted result through <c>BotMng.save_level1_links</c>.
        /// </summary>
        /// <param name="link">URL of the first result page; replaced by the "next page" link after each page.</param>
        /// <param name="tsk">Search task; its <c>Keyword</c> may hold several ';'-separated search keywords.</param>
        /// <param name="businessKeyword">Business keyword whose text is matched against title + abstract and recorded in <c>Keywords</c>.</param>
        /// <param name="businessKeywords">Business keywords used as scoring patterns; the detail page must contain at least what <c>IsContains2</c> requires of them.</param>
        /// <param name="excludedKeywords">Passed through unchanged to <c>save_level1_links</c>.</param>
        void GetLinks(string link, searchkeyword tsk, keyword businessKeyword, List <keyword> businessKeywords, List <keyword> excludedKeywords)
        {
            BotMng botmng = BotMng.Instance;

            string[] searchKeywords = tsk.Keyword.GetLower().RemoveSpace().Split(';');

            // Scoring patterns: every business keyword plus the task keyword itself with a fixed score of 50.
            List <KeywordScore> patterns = businessKeywords.Select(x => new KeywordScore {
                Keyword = x.Txt, Score = x.Score, BizType = x.BizType
            }).ToList();

            string[] bizPatterns = businessKeywords.Select(x => x.Txt).ToArray();
            patterns.Add(new KeywordScore {
                Keyword = tsk.Keyword, Score = 50, BizType = 0
            });

            int nohist_pages = 0;
            int quried_pages = 0;

            // The loop runs while the page counter is <= 60 (counter starts at 0).
            while (!string.IsNullOrEmpty(link) && quried_pages <= 60)
            {
                log(link);
                var html = get_html(link);
                if (html == null)
                {
                    break;
                }
                var tags = html.SubAfter("content_left").SplitWith("c-container");

                if (tags == null || tags.Length == 0)
                {
                    log("BLOCKED " + tsk.Keyword);
                    break;
                }

                bool nohit = true;
                foreach (var tag in tags)
                {
                    var    a      = tag.SubAfter("h3").SubAfter("a");
                    string title  = RemoveInivalidChar(
                        a.RemoveSpace().GetLower().SubBefore("</h3>").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string href   = a.GetFirstHref2();
                    string abs    = RemoveInivalidChar(tag.SubAfter("abstract").SubBefore("</div").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string domain = tag.SubLastStringAfter("\"f13").SubBefore("</span").GetTxtFromHtml2();
                    domain = GetDomain(domain);

                    int maxScore = 0;
                    // Filter out entries that yield no title/abstract text to match against.
                    string txt = "{0}{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    string realUrl = null, detailHtml = null, abstracts = null;
                    byte   appType = 0;

                    // Download the target page; the tuple is read as (Item1 = resolved URL, Item2 = page HTML).
                    if (!string.IsNullOrWhiteSpace(href))
                    {
                        var tuplehtml = get_htmlUrl(href);
                        if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                        {
                            realUrl = tuplehtml.Item1;
                        }
                        if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                        {
                            detailHtml = tuplehtml.Item2;
                        }
                        if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                        {
                            domain = GetDomain(realUrl);
                        }
                    }

                    // The page is a script-driven redirect stub: follow its first link and download again.
                    if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
                    {
                        var gourl = detailHtml.GetFirstHref2();
                        if (!string.IsNullOrEmpty(gourl))
                        {
                            var tuplehtml = get_htmlUrl(gourl);
                            if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                            {
                                realUrl = tuplehtml.Item1;
                            }
                            if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                            {
                                detailHtml = tuplehtml.Item2;
                            }
                            if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                            {
                                domain = GetDomain(realUrl);
                            }
                        }
                    }
                    if (string.IsNullOrEmpty(realUrl))
                    {
                        realUrl = href;
                    }

                    // Without a detail page, or when it lacks the task keyword / business keywords, the result is skipped.
                    if (string.IsNullOrEmpty(detailHtml))
                    {
                        continue;
                    }
                    if (!detailHtml.Contains(tsk.Keyword) || !detailHtml.IsContains2(bizPatterns))
                    {
                        continue;
                    }

                    List <KeywordScore> matchpatterns  = new List <KeywordScore>();
                    StringBuilder       sbabstracts    = new StringBuilder();
                    List <string>       abstractlist   = new List <string>();
                    StringBuilder       sbabstractlist = new StringBuilder();

                    foreach (KeywordScore pattern in patterns)
                    {
                        string[] splitDetailHtmls = detailHtml.SplitWith(pattern.Keyword);
                        if (splitDetailHtmls.Length > 1)
                        {
                            matchpatterns.Add(pattern);
                        }
                        StringBuilder sbpatternStr = new StringBuilder();
                        for (int i = 0; i < splitDetailHtmls.Length - 1; i++)
                        {
                            string splitDetailHtml1 = splitDetailHtmls[i];
                            // NOTE(review): for the last occurrence (i == Length - 2) the text after the keyword is
                            // replaced by "", although splitDetailHtmls[i + 1] is in range - possibly an off-by-one; confirm intent.
                            string splitDetailHtml2 = i < splitDetailHtmls.Length - 2 ? splitDetailHtmls[i + 1] : "";

                            // Collect the text before the keyword, scanning backwards until a character of
                            // split_bef_commas is met whose predecessor is not in split_num_commas.
                            for (int j = splitDetailHtml1.Length - 1; j >= 0; j--)
                            {
                                if (split_bef_commas.Contains(splitDetailHtml1[j]) && j - 1 >= 0 && !split_num_commas.Contains(splitDetailHtml1[j - 1]))
                                {
                                    break;
                                }
                                sbpatternStr.Append(splitDetailHtml1[j]);
                            }
                            // The backward scan collected the characters reversed; restore reading order.
                            for (int q = sbpatternStr.Length - 1; q >= 0; q--)
                            {
                                sbabstracts.Append(sbpatternStr[q]);
                            }
                            sbabstracts.Append(pattern.Keyword);
                            sbpatternStr.Clear();

                            // Collect the text after the keyword until a character of split_aft_commas is met
                            // whose successor is not in split_num_commas.
                            for (int j = 0; j < splitDetailHtml2.Length; j++)
                            {
                                if (split_aft_commas.Contains(splitDetailHtml2[j]) && j + 1 < splitDetailHtml2.Length && !split_num_commas.Contains(splitDetailHtml2[j + 1]))
                                {
                                    break;
                                }
                                sbpatternStr.Append(splitDetailHtml2[j]);
                            }
                            sbabstracts.Append(sbpatternStr);
                            sbpatternStr.Clear();

                            // Normalize the snippet and keep each distinct snippet once.
                            string tmpsbabstracts = sbabstracts.ToString();
                            tmpsbabstracts = BaiduQuery.RemoveInivalidChar(tmpsbabstracts.GetTxtFromHtml2().RemoveSpace().GetLower());
                            if (!abstractlist.Contains(tmpsbabstracts))
                            {
                                abstractlist.Add(tmpsbabstracts);
                                sbabstractlist.Append(tmpsbabstracts).Append(" ");
                            }
                            sbabstracts.Clear();
                        }
                    }

                    abstracts = sbabstractlist.ToString();
                    if (!string.IsNullOrEmpty(abstracts) && matchpatterns.Count > 0)
                    {
                        // Best pattern score plus a tenth of every other matched pattern's score.
                        maxScore  = matchpatterns.Max(x => x.Score ?? 50);
                        appType   = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
                        maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                        maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                    }

                    // Fallback: no abstract could be built from the detail page, so score against the search-result abstract.
                    if (string.IsNullOrEmpty(abstracts) && !string.IsNullOrEmpty(abs))
                    {
                        matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
                        // Max() throws InvalidOperationException on an empty sequence; with no match the score stays 0.
                        if (matchpatterns.Count > 0)
                        {
                            maxScore  = matchpatterns.Max(x => x.Score ?? 50);
                            appType   = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
                            maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                            maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                        }
                    }
                    if (maxScore > 100)
                    {
                        maxScore = 100;
                    }

                    bool is_bus_matched = txt.IsContains2(businessKeyword.Txt);

                    bool          is_title_matched = title.GetLower().IsContains2(searchKeywords);
                    bool          is_abstr_matched = abs.IsContains2(searchKeywords);
                    BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                                     is_title_matched ? BaiduItemPart.Title :
                                                     is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txt.GetLower().IsContains2(searchKeywords);

                    level1link l1 = new level1link
                    {
                        UsrId           = tsk.UsrId,
                        Domain          = domain,
                        TopDomain       = GetLevel1Domain(domain),
                        Keywords        = string.Format("{0} + {1}", tsk.Keyword, businessKeyword.Txt),
                        LinkUrl         = realUrl,
                        MatchAt         = (byte)part,
                        Html            = detailHtml,
                        // Bit 0: business keyword found in title+abstract; bit 1: a search keyword found there.
                        MatchType       = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                        AppType         = appType,
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(realUrl, tsk.UsrId, tsk.Keyword)),
                        SearchkeywordId = tsk._id.ToString(),
                        CreatedAt       = DateTime.UtcNow.AddHours(8),
                        Description     = abs,
                        Title           = title,
                        Score           = maxScore,
                        Abstract        = abstracts
                    };

                    // Each link is saved immediately instead of being accumulated per page.
                    botmng.save_level1_links(new List <level1link> {
                        l1
                    }, tsk, excludedKeywords);
                    nohit        = false;
                    nohist_pages = 0;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // Give up once more than 3 consecutive pages produced no saved link.
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++;
                // The "next page" anchor is the last link between the current-page marker and the "下一页" label.
                link = html.SubAfter("fkfk_cur").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "http://www.baidu.com/".GetContact(link);
                }
            }
        }