/// <summary>
/// Returns the identifier of this bot process, creating it on first use.
/// </summary>
/// <remarks>
/// The identifier is produced by <c>IDHelper.GetGuid</c> from the host name, the current
/// process id and the value of <c>GetTimestamp()</c>, concatenated in that order. It is
/// cached in <c>_botId</c>, so later calls return the same value.
/// </remarks>
/// <returns>The cached or newly generated bot id.</returns>
public static Guid GenerateBotId()
        {
            // Guid.Empty marks "not generated yet".
            if (_botId == Guid.Empty)
            {
                string seed = Dns.GetHostName() + Process.GetCurrentProcess().Id + GetTimestamp();
                _botId = IDHelper.GetGuid(seed);
            }

            return _botId;
        }
        /// <summary>
        /// Builds one link from <paramref name="searchTsk"/> to every task in
        /// <paramref name="taskList"/>, saves the links through <c>WX_SaveResult</c> and returns them.
        /// </summary>
        /// <param name="searchTsk">Task whose keyword is the source end of every link.</param>
        /// <param name="taskList">Tasks used as link targets; list positions serve as node indexes.</param>
        /// <returns>One <c>WX_links</c> entry per element of <paramref name="taskList"/>.</returns>
        public List <WX_links> Query(IW2S_WX_BaiduCommend searchTsk, List <IW2S_WX_BaiduCommend> taskList)
        {
            List <WX_links>           links  = new List <WX_links>();
            List <IW2S_WX_level1link> titles = new WX_BotTaskService().GetLinkTitleList(searchTsk.ProjectId);

            // Source node = position of the last task carrying the same keyword (0 when none matches).
            int sourceIndex = 0;
            for (int pos = 0; pos < taskList.Count; pos++)
            {
                if (searchTsk.CommendKeyword == taskList[pos].CommendKeyword)
                {
                    sourceIndex = pos;
                }
            }

            for (int targetIndex = 0; targetIndex < taskList.Count; targetIndex++)
            {
                WX_links edge = new WX_links();
                edge.source    = sourceIndex;
                edge.target    = targetIndex;
                edge.KeywordId = searchTsk._id;
                edge.ProjectId = searchTsk.ProjectId;

                // Link weight = number of titles that mention both keywords.
                int coOccurrences = 0;
                foreach (var entry in titles)
                {
                    if (string.IsNullOrEmpty(entry.Title))
                    {
                        continue;
                    }
                    if (entry.Title.Contains(searchTsk.CommendKeyword) && entry.Title.Contains(taskList[targetIndex].CommendKeyword))
                    {
                        coOccurrences++;
                    }
                }

                edge.value = coOccurrences;
                edge.Gid   = IDHelper.GetGuid("{0}/{1}/{2}/{3}".FormatStr(edge.source, edge.target, edge.KeywordId, edge.ProjectId));
                links.Add(edge);
            }

            WX_SaveResult(links);
            return links;
        }
        /// <summary>
        /// Worker loop that never returns: fetches the next pending search task, marks it as
        /// running, executes it, then marks it as finished and refreshes its valid-link count.
        /// </summary>
        /// <remarks>
        /// When no task is available the bot is flagged ready and the thread sleeps 30-100 seconds.
        /// A failure of the query itself is written to the console and does not stop the loop; the
        /// completion update still runs afterwards. All timestamps are stored as UTC + 8 hours.
        /// </remarks>
        public void Run()
        {
            // One generator for the whole loop; it only randomizes the idle back-off.
            Random r = new Random();

            while (true)
            {
                var p = get_search_to_qry();
                if (p == null)
                {
                    SetReady();
                    Thread.Sleep(r.Next(30000, 100000));
                    continue;
                }

                try
                {
                    SetBusy();

                    var botId = IWSBot.Utility.Utility.GenerateBotId().ToString().Replace("-", "");

                    var    pro         = Process.GetCurrentProcess();
                    string processName = IDHelper.GetGuid(pro.MainModule.FileName).ToString();
                    // An interval of 0 hours falls back to one week.
                    int botInterval = p.BotIntervalHours == 0 ? 7 * 24 : p.BotIntervalHours;
                    var update      = new UpdateDocument {
                        { "$set", new QueryDocument {
                              { "BotStatus", 1 }, { "NextBotStartAt", DateTime.UtcNow.AddHours((double)botInterval + 8) }
                              , { "BotTag", string.Format("{0}#", processName) },
                              { "BotId", botId }
                          } }
                    };

                    MongoDBHelper.Instance.GetIW2S_WB_BaiduCommends().UpdateOne(new QueryDocument {
                        { "_id", p._id }
                    }, update);

                    query(p);
                }
                catch (Exception ex)
                {
                    // Print the whole exception chain, outermost first.
                    while (ex != null)
                    {
                        Console.WriteLine("baidu_query ERROR.Message:{0},Statck:{1}".FormatStr(ex.Message, ex.StackTrace));
                        ex = ex.InnerException;
                    }
                }

                try
                {
                    var update = new UpdateDocument {
                        { "$set", new QueryDocument {
                              { "LastBotEndAt", DateTime.UtcNow.AddHours(8) },
                              { "BotStatus", 2 }
                          } }
                    };
                    var commendCol = MongoDBHelper.Instance.GetIW2S_WB_BaiduCommends();
                    commendCol.UpdateOne(new QueryDocument {
                        { "_id", p._id }
                    }, update);

                    // The keyword is user data, not a pattern: escape it so regex metacharacters
                    // (e.g. "(", "+", ".") neither break the query nor change what is matched.
                    string keywordPattern = System.Text.RegularExpressions.Regex.Escape(p.Keyword ?? string.Empty);

                    var builder = Builders <IW2S_WB_level1link> .Filter;
                    var filter  = builder.Eq(x => x.UsrId, p.UsrId);
                    filter &= builder.Eq(x => x.SearchkeywordId, p._id);
                    filter &= builder.Ne(x => x.DataCleanStatus, (byte)2);
                    filter &= builder.Regex(x => x.Description, new BsonRegularExpression(".*" + keywordPattern + ".*", "i"));

                    // Grouping by _id yields one group per matching link, so the number of groups
                    // is the number of matching links.
                    var col        = MongoDBHelper.Instance.GetIW2S_WB_level1links();
                    var agreresult = col.Aggregate().Match(filter)
                                     .Group(new BsonDocument {
                        { "_id", "$_id" }, { "Count", new BsonDocument("$sum", 1) }
                    })
                                     .ToListAsync()
                                     .Result;

                    var vallinkCount = agreresult.Count;
                    update = new UpdateDocument {
                        { "$set", new QueryDocument {
                              { "ValLinkCount", vallinkCount }
                          } }
                    };

                    commendCol.UpdateOne(new QueryDocument {
                        { "_id", p._id }
                    }, update);
                }
                catch (Exception ex)
                {
                    Console.WriteLine("get_proj_to_qry ERROR ." + ex.Message);
                    Thread.Sleep(5000);
                }
            }
        }
示例#4
0
        /// <summary>
        /// Crawls Google result pages starting at <paramref name="link"/>, downloads each hit's
        /// target page, scores the hit against the task keywords and saves the hits page by page
        /// through <c>SaveResult</c>.
        /// </summary>
        /// <param name="link">URL of the first result page; crawling stops when no next-page link is found.</param>
        /// <param name="searchTsk">Search task supplying the keyword, commend keyword, user and project ids.</param>
        /// <returns>
        /// The working list. It is cleared after every page has been saved, so it is empty when the
        /// method returns normally; the collected links are delivered through <c>SaveResult</c>.
        /// </returns>
        /// <remarks>
        /// The page counter runs from 0 to 10 inclusive, i.e. up to 11 pages are fetched (the original
        /// comment said 10; the limit is left unchanged). The thread sleeps 8-20 seconds before every
        /// detail download and 8-15 seconds between result pages.
        /// </remarks>
        public List <Dnl_Google_level1link> GetLinks(string link, Dnl_Google_BaiduCommend searchTsk)
        {
            List <Dnl_Google_level1link> result = new List <Dnl_Google_level1link>();
            int nohist_pages = 0;
            int quried_pages = 0;

            // Built once: the first date-looking token (2000-2099) of a page is taken as its publish time.
            Regex  dateRegex = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
            Random pause     = new Random();

            while (!string.IsNullOrEmpty(link) && quried_pages <= 10)
            {
                log(link);

                var html = DownloadSimplifiedChinese(link);

                if (html == null)
                {
                    break;
                }
                // Page text meaning "no search results found".
                if (html.Contains("没有找到搜索内容!"))
                {
                    break;
                }

                var tags  = html.SubAfter("<body").SubAfter("center_col").SubBefore("id=\"foot\"");
                var tagsD = tags.SplitWith("class=\"g\"");

                if (tagsD == null || tagsD.Length == 0)
                {
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }

                bool nohit = true;
                foreach (var tag in tagsD)
                {
                    // Only fragments with a heading are treated as result entries.
                    if (!tag.Contains("h3"))
                    {
                        continue;
                    }

                    var    a     = tag.SubAfter("h3").SubAfter("a");
                    string title = RemoveInivalidChar(a.RemoveSpace().GetLower().SubBefore("</h3>").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string href  = a.GetFirstHref2();

                    // Strip Google's redirect prefix and make site-relative links absolute.
                    href = href.Replace("/url?q=", "");
                    if (!href.Contains("http"))
                    {
                        href = "https://www.google.com" + href;
                    }
                    href = href.Replace("amp;", "");

                    string abs    = RemoveInivalidChar(tag.SubAfter("class=\"st\"").SubBefore("</span").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string domain = GetDomain(href);

                    // Title and abstract together are the text the keywords are matched against.
                    string txt = "{0},{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    Thread.Sleep(pause.Next(8000, 20000));

                    // Best effort: a hit whose target page cannot be fetched is kept with empty Html.
                    var htmldetail = "";
                    try
                    {
                        htmldetail = DownloadSimplifiedChinese(href);
                    }
                    catch (Exception ex)
                    {
                        log("detail download failed " + href + " " + ex.Message);
                    }

                    // Match.Value is the empty string when nothing matched.
                    string timesp = dateRegex.Match(htmldetail).Value;

                    bool          is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
                    bool          is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
                    BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                                     is_title_matched ? BaiduItemPart.Title :
                                                     is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
                    bool is_bus_matched       = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

                    // Score: 90 when both keywords hit, 80 for the commend keyword only,
                    // 50 for the main keyword only, 0 otherwise.
                    int score = 0;
                    if (is_bus_matched && is_itm_title_matched)
                    {
                        score = 80 + 10;
                    }
                    else if (is_bus_matched)
                    {
                        score = 80;
                    }
                    else if (is_itm_title_matched)
                    {
                        score = 50;
                    }

                    Dnl_Google_level1link l1 = new Dnl_Google_level1link
                    {
                        UsrId           = searchTsk.UsrId,
                        Domain          = domain.Replace("http://", "").Replace("https://", ""),
                        TopDomain       = GetLevel1Domain(domain),
                        Keywords        = string.Format("{0} + {1}", searchTsk.Keyword, searchTsk.CommendKeyword),
                        LinkUrl         = href,
                        MatchAt         = (byte)part,
                        Html            = htmldetail,
                        MatchType       = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                        AppType         = (byte)0,
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(href, searchTsk.UsrId, searchTsk.Keyword)),
                        SearchkeywordId = searchTsk._id.ToString(),
                        CreatedAt       = DateTime.UtcNow.AddHours(8),
                        Description     = abs,
                        Title           = title,
                        Score           = score,
                        Abstract        = abs,
                        ProjectId       = searchTsk.ProjectId,
                        PublishTime     = timesp
                    };

                    result.Add(l1);
                    nohit        = false;
                    nohist_pages = 0;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // Give up after more than three consecutive pages without a hit.
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++;

                // Original author's note: this next-page extraction ("sougou") needs to be rewritten.
                link = html.SubAfter("id=\"foot\"").SubAfter("text-align:left").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "https://www.google.com/".GetContact(link);

                    link = link.Replace("amp;", "");
                }

                // Persist this page's hits, then reuse the list for the next page.
                SaveResult(result);
                result.Clear();

                Thread.Sleep(pause.Next(8000, 15000));
            }
            return result;
        }

        /// <summary>
        /// Downloads <paramref name="url"/>, decodes the bytes as Big5 and converts the text to
        /// Simplified Chinese. The <c>WebClient</c> is disposed before returning.
        /// </summary>
        private static string DownloadSimplifiedChinese(string url)
        {
            using (WebClient client = new WebClient())
            {
                client.Credentials = CredentialCache.DefaultCredentials;
                byte[] raw  = client.DownloadData(url);
                string text = Encoding.GetEncoding("Big5").GetString(raw);
                return Strings.StrConv(text, VbStrConv.SimplifiedChinese, 0);
            }
        }
示例#5
0
        /// <summary>
        /// Registers a new free-tier user after validating the input and checking that neither the
        /// login name nor the e-mail address is already taken.
        /// </summary>
        /// <param name="uName">Desired login name; must not be empty.</param>
        /// <param name="uPwd1">Password; must not be empty.</param>
        /// <param name="uPwd2">Password confirmation; must equal <paramref name="uPwd1"/>.</param>
        /// <param name="email">E-mail address; must not be empty and the whole value must be a well-formed address.</param>
        /// <returns>
        /// A DTO describing the inserted user, or a DTO whose only populated member is <c>Error</c>
        /// when validation fails or the name/e-mail already exists.
        /// </returns>
        public IW2SUserDto Regist(string uName, string uPwd1, string uPwd2, string email)
        {
            if (string.IsNullOrEmpty(uName) || string.IsNullOrEmpty(uPwd1))
            {
                return new IW2SUserDto { Error = "用户名和密码不能为空" };
            }
            if (!uPwd1.Equals(uPwd2))
            {
                return new IW2SUserDto { Error = "密码不一致!" };
            }
            if (string.IsNullOrEmpty(email))
            {
                return new IW2SUserDto { Error = "邮箱不能为空!" };
            }

            // Anchored with ^ ... \z so the whole value must be an address; without anchors any
            // string that merely contains an address would pass.
            bool emailIsValid = System.Text.RegularExpressions.Regex.IsMatch(email, @"^[\w!#$%&'*+/=?^_`{|}~-]+(?:\.[\w!#$%&'*+/=?^_`{|}~-]+)*@(?:[\w](?:[\w-]*[\w])?\.)+[\w](?:[\w-]*[\w])?\z");
            if (!emailIsValid)
            {
                return new IW2SUserDto { Error = "邮箱格式不正确!" };
            }

            var builder = Builders <IW2SUser> .Filter;
            var users   = MongoDBHelper.Instance.Get_IW2SUser();

            var sameName = users.Find(builder.Eq(x => x.LoginName, uName)).FirstOrDefault();
            if (sameName != null)
            {
                return new IW2SUserDto { Error = "用户名已经存在" };
            }

            var sameEmail = users.Find(builder.Eq(x => x.UsrEmail, email)).FirstOrDefault();
            if (sameEmail != null)
            {
                return new IW2SUserDto { Error = "该邮箱已经注册过,请换一个试试!" };
            }

            // NOTE(review): the password is lower-cased before it is encoded, which makes passwords
            // case-insensitive. Left as is because stored values (and presumably the login path)
            // depend on it — confirm before changing.
            var md5 = EncryptHelper.GetEncryPwd(uPwd1.ToLower());

            var newUser = new IW2SUser()
            {
                _id              = ObjectId.GenerateNewId(),
                LoginName        = uName,
                LoginPwd         = md5,
                UsrKey           = IDHelper.GetGuid(uName + usr_key),
                applicationState = false,
                IsEmailConfirmed = false,
                UsrEmail         = email,
                UsrRole          = UserTypes.Free,
                UsrNum           = 1,
                Gender           = "",
                MobileNo         = "",
                Remarks          = "",
                PictureSrc       = "",
                // UTC + 8 hours; based on UtcNow so the value does not depend on the server's time zone.
                CreatedAt        = DateTime.UtcNow.AddHours(8),
                ProjectNum       = 2,
                KeywordNum       = 20,
                ReportNum        = 2,
                LinkNum          = 2000
            };
            users.InsertOne(newUser);

            IW2SUserDto freDto = new IW2SUserDto();

            freDto._id              = newUser._id.ToString();
            freDto.LoginName        = newUser.LoginName;
            freDto.LoginPwd         = newUser.LoginPwd;
            freDto.UsrRole          = newUser.UsrRole;
            freDto.UsrKey           = newUser.UsrKey;
            freDto.UsrEmail         = newUser.UsrEmail;
            freDto.IsEmailConfirmed = newUser.IsEmailConfirmed;
            freDto.applicationState = newUser.applicationState;
            freDto.UsrNum           = newUser.UsrNum;
            return freDto;
        }
 /// <summary>
 /// Derives the stored form of a password.
 /// </summary>
 /// <param name="pwd">Password text; <c>pwd_key</c> is appended to it before encoding.</param>
 /// <returns>The value <c>IDHelper.GetGuid</c> produces for the combined text.</returns>
 public static Guid GetEncryPwd(string pwd)
 {
     string keyed = pwd + pwd_key;
     Guid derived = IDHelper.GetGuid(keyed);

     return derived;
 }
示例#7
0
        /// <summary>
        /// Copies task metadata onto every listing and fills in missing bot shop/item ids, then
        /// propagates the item id to the listing's comments and sales records.
        /// </summary>
        /// <param name="tsk">
        /// Task whose names, flags and ids are copied to each listing; when <c>null</c> only the
        /// id generation and propagation steps run.
        /// </param>
        public void SetID(XTask tsk)
        {
            foreach (var listing in Listings.ToList())
            {
                if (tsk != null)
                {
                    listing.CompanyName        = tsk.CompanyName;
                    listing.SiteName           = tsk.SiteName;
                    listing.DetailQueryName    = tsk.DetailQueryName;
                    listing.CommentQueryName   = tsk.CommentQueryName;
                    listing.BuyerListQueryName = tsk.BuyerListQueryName;
                    listing.BotComments        = tsk.BotComments;
                    listing.BotBuyerList       = tsk.BotBuyerList;
                    listing.RealBrandName      = tsk.BrandName;
                    listing.RealProductName    = tsk.ProductName;
                    // Ids supplied by the task take precedence over generated ones.
                    if (tsk.ItemID.HasValue)
                    {
                        listing.BotItemID = tsk.ItemID.Value;
                    }
                    if (tsk.ShopID.HasValue)
                    {
                        listing.BotShopID = tsk.ShopID.Value;
                    }
                }

                bool shopIdMissing = !listing.BotShopID.HasValue;
                if (shopIdMissing && !string.IsNullOrEmpty(listing.ShopName))
                {
                    listing.BotShopID = IDHelper.GetGuid(string.Format("{0},{1}", listing.ShopID ?? listing.ShopName, listing.SiteName));
                }

                bool itemIdMissing = !listing.BotItemID.HasValue;
                if (itemIdMissing && !string.IsNullOrEmpty(listing.ItemName))
                {
                    // Seed preference: site item id, then item name + shop id, then detail URL,
                    // then item name + site name.
                    string seed;
                    if (!string.IsNullOrEmpty(listing.ItemID))
                    {
                        seed = string.Format("{0},{1}", listing.ItemID, listing.SiteName);
                    }
                    else if (listing.BotShopID.HasValue)
                    {
                        seed = string.Format("{0},{1}", listing.ItemName, listing.BotShopID);
                    }
                    else if (!string.IsNullOrEmpty(listing.ItemDetailUrl))
                    {
                        seed = listing.ItemDetailUrl;
                    }
                    else
                    {
                        seed = string.Format("{0},{1}", listing.ItemName, listing.SiteName);
                    }
                    listing.BotItemID = IDHelper.GetGuid(seed);
                }

                if (listing.ItemBotStatus == BotStatus.Removed)
                {
                    listing.ClosedAt = DateTime.Now;
                }

                listing.ItemCommentList.ForEach(comment =>
                {
                    comment.EntityID = listing.BotItemID;
                    comment.ID       = IDHelper.GetGuid(string.Format("{0},{1},{2}", listing.BotItemID, comment.Poster, comment.PostAt));
                });
                listing.SalesRecords.ForEach(record =>
                {
                    record.PackageID = listing.BotItemID;
                });
            }
        }
示例#8
0
        /// <summary>
        /// Crawls Sogou result pages starting at <paramref name="link"/>, resolves each hit's real
        /// target URL, downloads the target page and saves the hits page by page through
        /// <c>SaveResult</c>.
        /// </summary>
        /// <param name="link">URL of the first result page; crawling stops when no next-page link is found.</param>
        /// <param name="tsk">Search task supplying the keyword, user id and project id.</param>
        /// <remarks>
        /// The page counter runs from 0 to 2 inclusive, i.e. up to three pages are fetched. A failure
        /// while processing one hit is logged and only that hit is skipped. The thread sleeps several
        /// seconds before every download.
        /// </remarks>
        void GetLinks(string link, IW2S_SG_BaiduCommend tsk)
        {
            List <IW2S_SG_level1link> result = new List <IW2S_SG_level1link>();
            int nohist_pages = 0;
            int quried_pages = 0;

            // Built once: the first date-looking token (2000-2099) of a page is taken as its publish time.
            Regex  dateRegex = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
            Random pause     = new Random();

            while (!string.IsNullOrEmpty(link) && quried_pages <= 2)
            {
                log(link);

                CookieContainer  cc               = new CookieContainer();
                Encoding         enc              = null;
                CookieCollection cookiesColl      = new CookieCollection();
                CookieCollection cookieCollection = new CookieCollection();
                string           Rurl             = "https://www.sogou.com/";
                string           cookie           = "";

                // Hit the home page first; its content is unused, only the returned cookies are kept.
                GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;

                var html = GetContent(link, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                if (html == null)
                {
                    break;
                }

                var tags = html.SubAfter("<body").SubAfter("results").SubBefore("hint_container").SplitWith("<h3");
                if (tags == null || tags.Length == 0)
                {
                    log("已被sogou屏蔽,请调试! " + tsk.Keyword);
                    break;
                }

                bool nohit = true;
                foreach (var tag in tags)
                {
                    try
                    {
                        if (!tag.Contains("<a"))
                        {
                            continue;
                        }
                        string title = RemoveInivalidChar(tag.SubAfter("<a").SubBefore("</a>").GetTxtFromHtml2().RemoveSpace());
                        string href  = tag.SubAfter("<a").SubBefore("</a>").GetFirstHref2();

                        // Summary text: later markers override earlier ones; the
                        // summary_beg/summary_end pair is only the fallback.
                        string Jianjie = "";
                        if (tag.Contains("简介:"))
                        {
                            Jianjie = tag.SubAfter("简介:").SubBefore("</").GetTxtFromHtml2().RemoveSpace();
                        }
                        if (tag.Contains("cacheresult_summary"))
                        {
                            Jianjie = tag.SubAfter("cacheresult_summary").SubBefore("</div>").GetTxtFromHtml2().RemoveSpace();
                        }
                        if (string.IsNullOrEmpty(Jianjie))
                        {
                            Jianjie = tag.SubAfter("summary_beg").SubBefore("summary_end").GetTxtFromHtml2().RemoveSpace();
                        }

                        Thread.Sleep(pause.Next(8000, 15000));

                        // Resolve the real target behind Sogou's redirect link. The variable is
                        // scoped to this hit: when the probe returns nothing we must fall back to
                        // this hit's own href, never to the URL resolved for a previous hit.
                        string realUrl   = "";
                        var    tuplehtml = GetContent(href, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                        if (!string.IsNullOrEmpty(tuplehtml))
                        {
                            if (tuplehtml.Contains("window.location.replace"))
                            {
                                realUrl = tuplehtml.SubAfter("window.location.replace").SubBefore("</script>").Replace('"', ' ').Replace("(", "").Replace(")", "").RemoveSpace();
                            }
                            else
                            {
                                realUrl = Rurl;
                            }
                        }
                        if (string.IsNullOrEmpty(realUrl))
                        {
                            realUrl = href;
                        }

                        string domain    = GetDomain(realUrl);
                        string topDomain = GetLevel1Domain(domain);
                        realUrl = realUrl.Replace("amp;", "");

                        Thread.Sleep(pause.Next(6000, 15000));

                        // If the detail page is null, Match throws and the catch below skips this hit.
                        var    htmldetail = GetContent(realUrl, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                        string time       = dateRegex.Match(htmldetail).Value; // empty when no date is found

                        IW2S_SG_level1link l1 = new IW2S_SG_level1link
                        {
                            UsrId           = tsk.UsrId,
                            Domain          = domain,
                            TopDomain       = topDomain,
                            Keywords        = string.Format("{0}", tsk.Keyword),
                            LinkUrl         = realUrl,
                            Html            = htmldetail,
                            BizId           = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
                            SearchkeywordId = tsk._id.ToString(),
                            CreatedAt       = DateTime.UtcNow.AddHours(8),
                            Description     = Jianjie,
                            Title           = title,
                            ProjectId       = tsk.ProjectId,
                            PublishTime     = time,
                            AlternateFields = "0",
                            DataCleanStatus = 0
                        };
                        result.Add(l1);
                        nohit        = false;
                        nohist_pages = 0;
                    }
                    catch (Exception ex)
                    {
                        log("有错误信息!" + ex.Message);
                    }
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // Give up after more than three consecutive pages without a hit.
                if (nohist_pages > 3)
                {
                    break;
                }
                quried_pages++;
                pages++;

                link = html.SubAfter("sogou_next").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "https://www.sogou.com/web".GetContact(link);
                }

                // Persist this page's hits, then reuse the list for the next page.
                SaveResult(result);
                result.Clear();

                Thread.Sleep(pause.Next(6000, 15000));
            }
        }
示例#9
0
        /// <summary>
        /// Endless worker loop: takes the next project from get_search_to_count(), writes
        /// BotStatus 1 plus bot identification to the projects collection, counts the project's
        /// links with count(), then writes BotStatus 2 together with the count.
        /// </summary>
        /// <remarks>
        /// The closing update runs even when counting threw, in which case the stored
        /// BaiduLinkCount is 0. Timestamps are UTC shifted by 8 hours (presumably UTC+8 local
        /// time - confirm).
        /// </remarks>
        public void Run()
        {
            for (;;)
            {
                Random rng     = new Random();
                var    project = get_search_to_count();
                if (project == null)
                {
                    // Nothing pending: report ready and wait 30-100 seconds before polling again.
                    SetReady();
                    Thread.Sleep(rng.Next(30000, 100000));
                    continue;
                }

                int linkTotal = 0;
                try
                {
                    SetBusy();

                    string botId   = Utility.GenerateBotId().ToString().Replace("-", "");
                    string exePath = Process.GetCurrentProcess().MainModule.FileName;
                    string exeTag  = IDHelper.GetGuid(exePath).ToString();

                    // An interval of 0 falls back to one week.
                    int intervalHours = project.BotIntervalHours;
                    if (intervalHours == 0)
                    {
                        intervalHours = 7 * 24;
                    }

                    var startFields = new QueryDocument {
                        { "BotStatus", 1 },
                        { "NextBotStartAt", DateTime.UtcNow.AddHours((double)intervalHours + 8) },
                        { "BotTag", string.Format("{0}#", exeTag) },
                        { "BotId", botId }
                    };
                    var markStarted = new UpdateDocument {
                        { "$set", startFields }
                    };

                    var projects = MongoDBHelper.Instance.GetIW2S_Projects();
                    projects.UpdateOne(new QueryDocument {
                        { "_id", project._id }
                    }, markStarted);

                    linkTotal = count(project);
                }
                catch (Exception ex)
                {
                    // Log the whole exception chain, outermost first; the loop keeps running.
                    for (Exception cur = ex; cur != null; cur = cur.InnerException)
                    {
                        log("Project_BaiduLinkCount_Count ERROR.Message:{0},Statck:{1}".FormatStr(cur.Message, cur.StackTrace));
                    }
                }

                try
                {
                    var endFields = new QueryDocument {
                        { "LastBotEndAt", DateTime.UtcNow.AddHours(8) },
                        { "BotStatus", 2 },
                        { "BaiduLinkCount", linkTotal }
                    };
                    var markEnded = new UpdateDocument {
                        { "$set", endFields }
                    };

                    var projects = MongoDBHelper.Instance.GetIW2S_Projects();
                    projects.UpdateOne(new QueryDocument {
                        { "_id", project._id }
                    }, markEnded);
                }
                catch (Exception ex)
                {
                    // Give the database a moment before the next round.
                    log("get_proj_to_count ERROR ." + ex.Message);
                    Thread.Sleep(5000);
                }
            }
        }
示例#10
0
        /// <summary>
        /// Endless worker loop: takes the next image search task from get_search_to_qry(), writes
        /// BotStatus 1 plus bot identification to the image search task collection, runs query()
        /// on the task, then writes BotStatus 2 and the end time.
        /// </summary>
        /// <remarks>
        /// Failures are written to the console and never stop the loop. Timestamps are UTC
        /// shifted by 8 hours (presumably UTC+8 local time - confirm).
        /// </remarks>
        public void Run()
        {
            for (;;)
            {
                Random rng  = new Random();
                var    task = get_search_to_qry();
                if (task == null)
                {
                    // Nothing pending: report ready and wait 30-100 seconds before polling again.
                    SetReady();
                    Thread.Sleep(rng.Next(30000, 100000));
                    continue;
                }

                try
                {
                    SetBusy();

                    string botId   = IWSBot.Utility.Utility.GenerateBotId().ToString().Replace("-", "");
                    string exePath = Process.GetCurrentProcess().MainModule.FileName;
                    string exeTag  = IDHelper.GetGuid(exePath).ToString();

                    // An interval of 0 falls back to one week.
                    int intervalHours = task.BotIntervalHours;
                    if (intervalHours == 0)
                    {
                        intervalHours = 7 * 24;
                    }

                    var startFields = new QueryDocument {
                        { "BotStatus", 1 },
                        { "NextBotStartAt", DateTime.UtcNow.AddHours((double)intervalHours + 8) },
                        { "BotTag", string.Format("{0}#", exeTag) },
                        { "BotId", botId }
                    };
                    var markStarted = new UpdateDocument {
                        { "$set", startFields }
                    };

                    var tasks = MongoDBHelper.Instance.GetIW2S_ImgSearchTasks();
                    tasks.UpdateOne(new QueryDocument {
                        { "_id", task._id }
                    }, markStarted);

                    query(task);
                }
                catch (Exception ex)
                {
                    // Print the whole exception chain, outermost first; the loop keeps running.
                    for (Exception cur = ex; cur != null; cur = cur.InnerException)
                    {
                        Console.WriteLine("baidu_query ERROR.Message:{0},Statck:{1}".FormatStr(cur.Message, cur.StackTrace));
                    }
                }

                try
                {
                    var endFields = new QueryDocument {
                        { "LastBotEndAt", DateTime.UtcNow.AddHours(8) },
                        { "BotStatus", 2 }
                    };
                    var markEnded = new UpdateDocument {
                        { "$set", endFields }
                    };

                    var tasks = MongoDBHelper.Instance.GetIW2S_ImgSearchTasks();
                    tasks.UpdateOne(new QueryDocument {
                        { "_id", task._id }
                    }, markEnded);
                }
                catch (Exception ex)
                {
                    // Give the database a moment before the next round.
                    Console.WriteLine("get_proj_to_qry ERROR ." + ex.Message);
                    Thread.Sleep(5000);
                }
            }
        }
        /// <summary>
        /// Endless worker loop: takes the next search keyword from get_search_to_qry(), sets IsBot
        /// and a BotTag built from this machine's address and executable id, runs query() on the
        /// keyword, then clears IsBot and records the end time.
        /// </summary>
        /// <remarks>
        /// Exceptions raised while tagging or querying are logged (whole inner-exception chain) and
        /// do not stop the loop; the closing update always runs afterwards. Timestamps are UTC
        /// shifted by 8 hours (presumably UTC+8 local time - confirm).
        /// </remarks>
        public void Run()
        {
            while (true)
            {
                Random r = new Random();
                var    p = get_search_to_qry();
                if (p == null)
                {
                    // Nothing pending: wait 30-100 seconds before polling again.
                    Thread.Sleep(r.Next(30000, 100000));
                    continue;
                }
                try
                {
                    var    ipaddrs = System.Net.Dns.GetHostEntry(System.Environment.MachineName).AddressList;
                    string ip      = string.Empty;

                    // Prefer the third address when at least three exist, otherwise the first one.
                    // An empty or missing list leaves ip empty; the former "Length >= 0" test was
                    // always true, so an empty list indexed out of range and query(p) was skipped
                    // for a task that had already been claimed.
                    if (ipaddrs != null && ipaddrs.Length >= 3)
                    {
                        ip = ipaddrs[2].ToString();
                    }
                    else if (ipaddrs != null && ipaddrs.Length > 0)
                    {
                        ip = ipaddrs[0].ToString();
                    }

                    var    pro         = Process.GetCurrentProcess();
                    string processName = IDHelper.GetGuid(pro.MainModule.FileName).ToString();

                    // NOTE(review): unlike the sibling workers in this file, a BotIntervalHours of 0
                    // is not replaced by a default here - confirm that is intended.
                    var update = new UpdateDocument {
                        { "$set", new QueryDocument {
                              { "IsBot", true }, { "NextBotStartAt", DateTime.UtcNow.AddHours((double)p.BotIntervalHours + 8) }
                              , { "BotTag", string.Format("{0}#{1}", ip, processName) }
                          } }
                    };

                    MongoDBHelper.Instance.Getiws_searchkeywords().UpdateOne(new QueryDocument {
                        { "_id", p._id }
                    }, update);

                    query(p);
                }
                catch (Exception ex)
                {
                    // Log the whole exception chain, outermost first; the loop keeps running.
                    while (ex != null)
                    {
                        log("baidu_query ERROR.Message:{0},Statck:{1}".FormatStr(ex.Message, ex.StackTrace));
                        ex = ex.InnerException;
                    }
                }

                try
                {
                    var update = new UpdateDocument {
                        { "$set", new QueryDocument {
                              { "LastBotEndAt", DateTime.UtcNow.AddHours(8) },
                              { "IsBot", false }
                          } }
                    };

                    MongoDBHelper.Instance.Getiws_searchkeywords().UpdateOne(new QueryDocument {
                        { "_id", p._id }
                    }, update);
                }
                catch (Exception ex)
                {
                    // Give the database a moment before the next round.
                    log("get_proj_to_qry ERROR ." + ex.Message);
                    Thread.Sleep(5000);
                }
            }
        }
        /// <summary>
        /// Endless worker loop: takes the next Baidu keyword from get_search_to_qry(), writes
        /// BotStatus 1 plus bot identification to the keyword collection, runs query() on the
        /// keyword, then writes BotStatus 2.
        /// </summary>
        /// <remarks>
        /// A disabled earlier variant looked up an already processed keyword (BotStatus 2) with the
        /// same text and, when it had more than three recommended keywords, copied them for the new
        /// keyword instead of querying again. Only the direct query path below is active.
        /// </remarks>
        public void Run()
        {
            for (;;)
            {
                Random rng     = new Random();
                var    keyword = get_search_to_qry();
                if (keyword == null)
                {
                    // Nothing pending: report ready and wait 30-100 seconds before polling again.
                    SetReady();
                    Thread.Sleep(rng.Next(30000, 100000));
                    continue;
                }

                try
                {
                    SetBusy();

                    string exePath = Process.GetCurrentProcess().MainModule.FileName;
                    string exeTag  = IDHelper.GetGuid(exePath).ToString();
                    string botId   = Utility.GenerateBotId().ToString().Replace("-", "");

                    var startFields = new QueryDocument {
                        { "BotStatus", 1 },
                        { "BotTag", string.Format("{0}#", exeTag) },
                        { "BotId", botId }
                    };
                    var markStarted = new UpdateDocument {
                        { "$set", startFields }
                    };

                    var keywords = MongoDBHelper.Instance.GetIW2S_BaiduKeywords();
                    keywords.UpdateOne(new QueryDocument {
                        { "_id", keyword._id }
                    }, markStarted);

                    query(keyword);
                }
                catch (Exception ex)
                {
                    // Log the whole exception chain, outermost first; the loop keeps running.
                    for (Exception cur = ex; cur != null; cur = cur.InnerException)
                    {
                        log("baidu_query ERROR.Message:{0},Statck:{1}".FormatStr(cur.Message, cur.StackTrace));
                    }
                }

                try
                {
                    var endFields = new QueryDocument {
                        { "BotStatus", 2 }
                    };
                    var markEnded = new UpdateDocument {
                        { "$set", endFields }
                    };

                    var keywords = MongoDBHelper.Instance.GetIW2S_BaiduKeywords();
                    keywords.UpdateOne(new QueryDocument {
                        { "_id", keyword._id }
                    }, markEnded);
                }
                catch (Exception ex)
                {
                    // Give the database a moment before the next round.
                    log("get_proj_to_qry ERROR ." + ex.Message);
                    Thread.Sleep(5000);
                }
            }
        }
示例#13
0
        /// <summary>
        /// Walks Sogou WeChat search result pages starting at <paramref name="link"/>, fetches the
        /// article page and the account page behind every hit, and saves one IW2S_WX_level1link
        /// per hit through SaveResult, page by page.
        /// </summary>
        /// <param name="link">URL of the first result page; nothing is fetched when it is null or empty.</param>
        /// <param name="searchTsk">Search task whose Keyword, CommendKeyword, UsrId, ProjectId and _id are matched against or copied into each record.</param>
        /// <returns>
        /// The working list. It is cleared right after every SaveResult call, so it is empty on
        /// return; the crawled records are only available through whatever SaveResult stores.
        /// </returns>
        public List <IW2S_WX_level1link> GetLinks(string link, IW2S_WX_BaiduCommend searchTsk)
        {
            List <IW2S_WX_level1link> result = new List <IW2S_WX_level1link>();
            int nohist_pages = 0;
            int quried_pages = 0;

            // One random source for all throttling delays and one date pattern for all hits,
            // instead of constructing both again for every request.
            Random rnd     = new Random();
            Regex  dateReg = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

            // quried_pages starts at 0 and the loop continues while it is <= 2: at most 3 pages.
            while (!string.IsNullOrEmpty(link) && quried_pages <= 2)
            {
                log(link);
                CookieContainer  cc               = new CookieContainer();
                Encoding         enc              = null;
                CookieCollection cookiesColl      = new CookieCollection();
                CookieCollection cookieCollection = new CookieCollection();
                string           Rurl             = "http://weixin.sogou.com/";
                string           cookie           = "";

                // Request the home page first; its content is ignored, only the cookies it hands
                // back are carried into the following requests.
                TaobaoWebHelper.GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                Thread.Sleep(rnd.Next(5000, 8000));

                var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                if (html == null)
                {
                    break;
                }

                // The page itself reports that no matching WeChat article was found.
                if (html.Contains("没有找到相关的微信公众号文章"))
                {
                    break;
                }

                // Split the page into result entries; retry with the marker written without a
                // space when the first marker yields at most one piece.
                var tags = html.SplitWith("wx-rb wx-rb3");
                if (tags == null || tags.Length == 0 || tags.Length == 1)
                {
                    tags = html.SplitWith("wx-rbwx-rb3");
                }
                if (tags == null || tags.Length == 0)
                {
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }

                bool nohit = true;
                foreach (var tag in tags)
                {
                    // Pieces without a text box are not result entries.
                    if (!tag.Contains("txt-box"))
                    {
                        continue;
                    }

                    string title      = RemoveInivalidChar(tag.SubAfter("<h4").SubBefore("</h4>").GetTxtFromHtml2().RemoveSpace());
                    string href       = tag.SubAfter("<h4").SubBefore("</a>").GetFirstHref2();
                    string abs        = RemoveInivalidChar(tag.SubAfter("<h4>").SubBefore("\"s-p\"").SubBefore("<script>").GetTxtFromHtml2().RemoveSpace());
                    string domain     = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetTxtFromHtml2().SubAfter("(").SubAfter("(").SubBefore(",").Replace('"', ' ').Trim();
                    string SourceLink = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetFirstHref2();
                    string TitleImg   = tag.SubAfter("img_box2").SubBefore("</a").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();

                    // Title and abstract joined; used below for keyword matching.
                    string txt = "{0},{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    // Make site-relative links absolute and undo the HTML-escaped ampersands.
                    if (href.IsStartWith("/websearch"))
                    {
                        href = "http://weixin.sogou.com" + href.Replace("amp;", "");
                    }
                    href = href.Replace("amp;", "");
                    Thread.Sleep(rnd.Next(8000, 20000));

                    var htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);

                    // The first date-looking text in the article page becomes the publish time;
                    // it stays empty when nothing matches. A null page is treated as empty text
                    // because Regex.Match rejects null input with an exception.
                    Match  m    = dateReg.Match(htmldetail ?? string.Empty);
                    string time = m.Value;

                    // Rurl now holds the URL reported by the detail request; ask that URL for its
                    // JSON form and take the "link" field as the article URL.
                    href = Rurl;
                    var hrefNew          = href + "&f=json";
                    var htmldetailNewUrl = get_Detailehtml(hrefNew, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                    try
                    {
                        href = htmldetailNewUrl.SubAfter("\"link\":").SubBefore(",\"source_url\":").Replace('"', ' ').Replace("\\", "").RemoveSpace();
                    }
                    catch (Exception ex)
                    {
                        // Best effort: keep the URL obtained above, but leave a trace of the failure.
                        log("GetLinks: could not read article link from JSON response. " + ex.Message);
                    }

                    bool          is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
                    bool          is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
                    BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                                     is_title_matched ? BaiduItemPart.Title :
                                                     is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
                    bool is_bus_matched       = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

                    var no         = "";
                    var qrcode     = "";
                    var function   = "";
                    var NoIcon     = "";
                    var QrcodeIcon = "";
                    SourceLink = SourceLink.Replace("amp;", "");
                    Thread.Sleep(rnd.Next(8000, 15000));

                    // Account page: account number, QR code, description and icons are only read
                    // when the page carries the account-number marker.
                    var htmlNo = get_Nohtml(SourceLink, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                    if (!string.IsNullOrEmpty(htmlNo) && htmlNo.Contains("em_weixinhao"))
                    {
                        no         = htmlNo.SubAfter("em_weixinhao").SubBefore("/label").GetTxtFromHtml2().RemoveSpace();
                        qrcode     = htmlNo.SubAfter("v-box").SubBefore("<em").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();
                        function   = htmlNo.SubAfter("功能介绍:</").SubBefore("/span").GetTxtFromHtml2().RemoveSpace();
                        SourceLink = htmlNo.SubAfter("微信认证:").SubBefore("/div").GetTxtFromHtml2().RemoveSpace();
                        NoIcon     = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("src=").SubBefore("onload").Replace(">", "").Replace('"', ' ').RemoveSpace();
                        QrcodeIcon = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("err:").SubBefore(">").Replace(">", "").Replace('"', ' ').Replace("'", "").RemoveSpace();
                    }

                    IW2S_WX_level1link l1 = new IW2S_WX_level1link
                    {
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(title, domain, searchTsk.UsrId)),
                        Description     = abs,
                        Domain          = domain,
                        UsrId           = searchTsk.UsrId,
                        LinkUrl         = href,
                        MatchAt         = (byte)part,
                        Title           = title,
                        CreatedAt       = DateTime.Now,
                        DataCleanStatus = 0,
                        Function        = function,
                        SearchkeywordId = searchTsk._id.ToString(),
                        Keywords        = searchTsk.Keyword,
                        PublicNo        = no,
                        QrCode          = qrcode,
                        SourceLink      = SourceLink,
                        TagType         = 0,
                        ImgIcon         = NoIcon,
                        QrCodeIcon      = QrcodeIcon,
                        ProjectId       = searchTsk.ProjectId,
                        TitleImg        = TitleImg,
                        PublishTime     = time,
                        Html            = htmldetail
                    };

                    // NOTE(review): MatchType is left at its default. Earlier code computed
                    // (is_bus_matched ? 10 : 0) + (is_itm_title_matched ? 30 : 0) but never stored
                    // it (it only assigned l1.MatchType to itself) - confirm whether it should be.

                    // Score: 90 when both the recommended keyword and the keyword match,
                    // 80 for the recommended keyword alone, 50 for the keyword alone,
                    // otherwise the record keeps its default score.
                    if (is_bus_matched && is_itm_title_matched)
                    {
                        l1.Score = 80 + 10;
                    }
                    else if (is_bus_matched)
                    {
                        l1.Score = 80;
                    }
                    else if (is_itm_title_matched)
                    {
                        l1.Score = 50;
                    }

                    result.Add(l1);
                    nohit        = false;
                    nohist_pages = 0;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // Stop after more than 3 consecutive pages without a hit.
                // NOTE(review): with the 3-page cap on the loop this counter cannot exceed 3, so
                // this break looks unreachable - confirm the intended limits.
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++;

                // Next-page link; the original author flagged this Sogou extraction as needing a rewrite.
                link = html.SubAfter("sogou_next").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "http://weixin.sogou.com/weixin".GetContact(link);
                }

                // Persist this page's records, then start the next page with an empty list.
                SaveResult(result);
                result.Clear();

                Thread.Sleep(rnd.Next(8000, 15000));
            }
            return(result);
        }
示例#14
0
 /// <summary>
 /// Builds this record's identifier from PackageID, Buyer and the date key of SettleDT,
 /// concatenated in that order and passed to IDHelper.GetGuid.
 /// </summary>
 /// <returns>The Guid that IDHelper.GetGuid produces for the concatenated text.</returns>
 public Guid GetID()
 {
     var idSeed = string.Format("{0}{1}{2}", PackageID, Buyer, SettleDT.ToDateKey2());

     return IDHelper.GetGuid(idSeed);
 }
        private void HanleTagData(IW2S_BaiduCommend tsk, List <IW2S_ExcludeKeyword> excludedKeywords, IW2SBotMng botmng, string[] searchKeywords, List <KeywordScore> patterns, string title, string href, string abs, ref string domain, string tag, bool isMarket, int rank)
        {
            int    maxScore = 0;
            string realUrl = null, detailHtml = null, abstracts = null;
            byte   appType = 0;

            int?baiduVStar = null;

            if (tag.Contains("c-icon-v1"))
            {
                baiduVStar = 1;
            }
            else if (tag.Contains("c-icon-v2"))
            {
                baiduVStar = 2;
            }
            else if (tag.Contains("c-icon-v3"))
            {
                baiduVStar = 3;
            }

            if (!string.IsNullOrWhiteSpace(href))
            {
                //Encoding enc = Encoding.UTF8;
                //detailHtml = HtmlQueryHelper.GetContent(href, 8000, ref enc, out realUrl);
                var tuplehtml = get_htmlUrl(href);
                if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                {
                    realUrl = tuplehtml.Item1;
                }
                if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                {
                    detailHtml = tuplehtml.Item2;
                }
                if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                {
                    domain = GetDomain(realUrl);
                }
            }
            if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
            {
                var gourl = detailHtml.GetFirstHref2();
                if (!string.IsNullOrEmpty(gourl))
                {
                    var tuplehtml = get_htmlUrl(gourl);
                    if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                    {
                        realUrl = tuplehtml.Item1;
                    }
                    if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                    {
                        detailHtml = tuplehtml.Item2;
                    }
                    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                    {
                        domain = GetDomain(realUrl);
                    }
                }
            }
            if (string.IsNullOrEmpty(realUrl))
            {
                realUrl = href;
            }
            List <KeywordScore> matchpatterns = new List <KeywordScore>();

            if (string.IsNullOrEmpty(detailHtml))
            {
                return;
            }
            else
            {
                //if (!detailHtml.Contains(tsk.CommendKeyword))
                //{
                //    return;
                //}
                var           hrefs          = detailHtml.GetDescendents("a", "href");
                StringBuilder sbabstracts    = new StringBuilder();
                List <string> abstractlist   = new List <string>();
                StringBuilder sbabstractlist = new StringBuilder();

                foreach (KeywordScore pattern in patterns)
                {
                    string[] splitDetailHtmls = detailHtml.SplitWith(pattern.Keyword);
                    if (splitDetailHtmls.Length > 1)
                    {
                        matchpatterns.Add(pattern);
                    }
                    StringBuilder sbpatternStr = new StringBuilder();
                    for (int i = 0; i < splitDetailHtmls.Length - 1; i++)
                    {
                        string splitDetailHtml1 = splitDetailHtmls[i];
                        string splitDetailHtml2 = i < splitDetailHtmls.Length - 2 ? splitDetailHtmls[i + 1] : "";
                        for (int j = splitDetailHtml1.Length - 1; j >= 0; j--)
                        {
                            if (split_bef_commas.Contains(splitDetailHtml1[j]) && j - 1 >= 0 && !split_num_commas.Contains(splitDetailHtml1[j - 1]))
                            {
                                break;
                            }
                            sbpatternStr.Append(splitDetailHtml1[j]);
                        }
                        for (int q = sbpatternStr.Length - 1; q >= 0; q--)
                        {
                            sbabstracts.Append(sbpatternStr[q]);
                        }
                        sbabstracts.Append(pattern.Keyword);
                        sbpatternStr.Clear();
                        for (int j = 0; j < splitDetailHtml2.Length; j++)
                        {
                            if (split_aft_commas.Contains(splitDetailHtml2[j]) && j + 1 < splitDetailHtml2.Length && !split_num_commas.Contains(splitDetailHtml2[j + 1]))
                            {
                                break;
                            }
                            sbpatternStr.Append(splitDetailHtml2[j]);
                        }
                        sbabstracts.Append(sbpatternStr);
                        sbpatternStr.Clear();

                        string tmpsbabstracts = sbabstracts.ToString();
                        tmpsbabstracts = IW2SBaiduQuery.RemoveInivalidChar(tmpsbabstracts.GetTxtFromHtml2().RemoveSpace().GetLower());
                        if (!abstractlist.Contains(tmpsbabstracts))
                        {
                            abstractlist.Add(tmpsbabstracts);
                            sbabstractlist.Append(tmpsbabstracts).Append(" ");
                        }
                        sbabstracts.Clear();
                    }
                }
                //获取摘要
                abstracts = sbabstractlist.ToString();
                if (!string.IsNullOrEmpty(abstracts) && matchpatterns.Count > 0)
                {
                    maxScore  = matchpatterns.Max(x => x.Score ?? 50);
                    appType   = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
                    maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                    maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                }
            }
            if (string.IsNullOrEmpty(abstracts) && !string.IsNullOrEmpty(abs))
            {
                matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
                if (matchpatterns.Count > 0)
                {
                    maxScore = matchpatterns.Max(x => x.Score ?? 50);
                    appType  = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();

                    maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                    maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                }
            }
            if (maxScore > 100)
            {
                maxScore = 100;
            }


            bool          is_title_matched = title.GetLower().IsContains2(searchKeywords);
            bool          is_abstr_matched = abs.IsContains2(searchKeywords);
            BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                             is_title_matched ? BaiduItemPart.Title :
                                             is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;

            Regex           reg = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
            MatchCollection mc  = reg.Matches(detailHtml);
            //MatchCollection cols = reg.Matches(item.Html);
            string time = "";

            if (mc.Count > 0)
            {
                foreach (Match x in mc)
                {
                    //判断是正文中的还是代码和注释中的时间
                    if (!string.IsNullOrEmpty(x.Value))
                    {
                        var txt    = detailHtml.SubAfter(x.Value);
                        var index1 = txt.IndexOf('<');
                        var index2 = txt.IndexOf('>');
                        var index3 = txt.IndexOf('\"');
                        //只使用正文中的时间
                        if (index1 < index2 && index1 < index3)
                        {
                            time = x.Value;
                            break;
                        }
                    }
                }
            }


            IW2S_level1link l1 = new IW2S_level1link
            {
                UsrId     = tsk.UsrId,
                Domain    = domain,
                TopDomain = GetLevel1Domain(domain),
                Keywords  = string.Format("{0}", tsk.CommendKeyword),
                LinkUrl   = realUrl,
                MatchAt   = (byte)part,
                Html      = detailHtml,

                AppType         = appType,
                BizId           = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
                SearchkeywordId = tsk._id.ToString(),
                CreatedAt       = DateTime.UtcNow.AddHours(8),
                Description     = abs,
                Title           = title,
                Score           = maxScore,
                Abstract        = abstracts,
                IsMarket        = isMarket,
                ProjectId       = tsk.ProjectId,
                PublishTime     = time,
                AlternateFields = "0",
                Rank            = rank
            };

            if (baiduVStar.HasValue)
            {
                l1.BaiduVStar = baiduVStar.Value;
            }

            botmng.save_level1_links(new List <IW2S_level1link> {
                l1
            }, tsk, excludedKeywords);
        }
示例#16
0
        /// <summary>
        /// Stores the shops and items carried by <paramref name="listings"/> and then adds the
        /// number of newly inserted items/shops to the LinksNum/ShopsNum counters of the task
        /// record and of the task.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Fill in missing BotShopID values on the listings (mutates the passed objects).
        /// 2. Insert the shops that are not yet in the FreeBotShop collection.
        /// 3. Fill in missing BotItemID values on the listings.
        /// 4. Pass the listings whose item already exists to update_level1_links.
        /// 5. Insert the remaining (new) items into the FreeBotItem collection.
        /// 6. Update the counters on the FreeTaskRecord and FreeTask documents.
        /// NOTE(review): the method returns right after step 2 when no new shop was found, so
        /// items belonging to already-known shops are never processed. This behaviour is kept
        /// as it was; confirm whether it is intended.
        /// </remarks>
        /// <param name="listings">Crawled listings; their BotShopID/BotItemID/ShopName may be set here.</param>
        /// <param name="botType">Forwarded unchanged to update_level1_links.</param>
        /// <param name="recordId">String form of the FreeTaskRecord ObjectId whose counters are updated.</param>
        /// <param name="tsk">Task whose _id is mixed into the generated shop and item ids.</param>
        void SaveResult(List <XListing> listings, BotTypes botType, string recordId, FreeTask tsk)
        {
            // ---- 1. shop ids --------------------------------------------------------------
            foreach (var x in listings)
            {
                // A listing without shop name falls back to its site, except for taobao/alibaba sites.
                if (string.IsNullOrEmpty(x.ShopName) && !string.IsNullOrEmpty(x.SiteName) && !x.BotShopID.HasValue &&
                    !x.SiteName.ToLower().Contains("taobao") && !x.SiteName.ToLower().Contains("alibaba"))
                {
                    x.ShopName  = x.SiteName;
                    x.BotShopID = x.SiteID;
                }
                if (!x.BotShopID.HasValue && !string.IsNullOrEmpty(x.ShopName))
                {
                    x.BotShopID = IDHelper.GetGuid(string.Format("{0},{1},{2},{3}", x.ShopName, x.SiteName, tsk._id, x.ShopID));
                }
            }

            // ---- 2. insert shops that are not stored yet ----------------------------------
            List<XListing> shopList = listings.DistinctBy(x => x.BotShopID);

            var shopcol = MongoDBHelper.Instance.Get_FreeBotShop();
            var builder = Builders<FreeBotShop>.Filter;

            List<Guid?> BotShopID       = shopList.Select(x => x.BotShopID).ToList();
            var         existsshop_objs = shopcol.Find(builder.In(x => x.Shop_id, BotShopID)).Project(x => x.Shop_id).ToList();

            // Hash set so the membership test below is O(1) per listing.
            HashSet<Guid?> exists_ids = new HashSet<Guid?>();
            foreach (var storedId in existsshop_objs)
            {
                exists_ids.Add(storedId);
            }
            if (exists_ids.Count > 0)
            {
                shopList = shopList.Where(x => !exists_ids.Contains(x.BotShopID)).ToList();
            }
            if (shopList == null || shopList.Count == 0)
            {
                return;
            }

            List<FreeBotShop> dt = GetShopList(shopList);
            shopcol.InsertMany(dt);
            log("to save bot_shops");
            log("Done");

            // ---- 3. item ids --------------------------------------------------------------
            foreach (var x in listings)
            {
                if (!x.BotItemID.HasValue && !string.IsNullOrEmpty(x.ItemName))
                {
                    x.BotItemID = IDHelper.GetGuid(string.Format("{0},{1},{2},{3}", x.ItemName, x.SiteName, tsk._id, x.ItemID));
                }
            }

            List<XListing> distinctItems = listings.DistinctBy(x => x.BotItemID);

            var         col         = MongoDBHelper.Instance.Get_FreeBotItem();
            var         itemBuilder = Builders<FreeBotItem>.Filter;
            List<Guid?> BotItemID   = distinctItems.Select(x => x.BotItemID).ToList();
            var         exists_objs = col.Find(itemBuilder.In(x => x.ItemId, BotItemID)).Project(x => x.ItemId).ToList();

            HashSet<Guid?> existsitem_ids = new HashSet<Guid?>();
            foreach (var storedId in exists_objs)
            {
                existsitem_ids.Add(storedId);
            }

            // ---- 4. split into "already stored" and "new" ---------------------------------
            // Both lists are derived from the de-duplicated listings. (Previously the update
            // list was produced by filtering an empty list, so it could never contain anything.)
            List<XListing> itemList    = distinctItems;
            List<XListing> updatelinks = new List<XListing>();
            if (existsitem_ids.Count > 0)
            {
                itemList    = distinctItems.Where(x => !existsitem_ids.Contains(x.BotItemID)).ToList();
                updatelinks = distinctItems.Where(x => existsitem_ids.Contains(x.BotItemID)).ToList();
            }
            update_level1_links(updatelinks, botType, recordId, tsk);
            if (itemList == null || itemList.Count == 0)
            {
                return;
            }

            // ---- 5. insert new items ------------------------------------------------------
            var itemdt = GetItemList(itemList);
            col.InsertMany(itemdt);

            // ---- 6. counters on the task record -------------------------------------------
            var            colRecord     = MongoDBHelper.Instance.Get_FreeTaskRecord();
            var            RecordBuilder = Builders<FreeTaskRecord>.Filter;
            FreeTaskRecord TaskList      = colRecord.Find(RecordBuilder.Eq(x => x._id, new ObjectId(recordId))).FirstOrDefault();

            if (TaskList == null)
            {
                // Nothing to add the counts to; do not crash after the inserts already happened.
                log("FreeTaskRecord not found, counters not updated: " + recordId);
            }
            else
            {
                int LinksNum = TaskList.LinksNum + itemdt.Count;
                int ShopsNum = TaskList.ShopsNum + dt.Count;
                var updateWebsiteCount = new UpdateDocument {
                    { "$set", new QueryDocument {
                          { "LinksNum", LinksNum }, { "ShopsNum", ShopsNum }
                      } }
                };

                MongoDBHelper.Instance.Get_FreeTaskRecord().UpdateOne(new QueryDocument {
                    { "_id", new ObjectId(recordId) }
                }, updateWebsiteCount);
            }

            // ---- 6b. counters on the task itself (looked up through the first saved item) --
            var      colTask     = MongoDBHelper.Instance.Get_FreeTask();
            var      TaskBuilder = Builders<FreeTask>.Filter;
            FreeTask Task2List   = colTask.Find(TaskBuilder.Eq(x => x._id, itemdt[0].taskId)).FirstOrDefault();

            if (Task2List == null)
            {
                log("FreeTask not found, counters not updated: " + itemdt[0].taskId);
            }
            else
            {
                int TaskLinksNum = Task2List.LinksNum + itemdt.Count;
                int TaskShopsNum = Task2List.ShopsNum + dt.Count;
                var TaskupdateWebsiteCount = new UpdateDocument {
                    { "$set", new QueryDocument {
                          { "LinksNum", TaskLinksNum }, { "ShopsNum", TaskShopsNum }
                      } }
                };

                MongoDBHelper.Instance.Get_FreeTask().UpdateOne(new QueryDocument {
                    { "_id", itemdt[0].taskId }
                }, TaskupdateWebsiteCount);
            }

            log("to save listings");
            log("Done");
        }
        /// <summary>
        /// Crawls Bing result pages starting at <paramref name="link"/>, downloads the page behind
        /// every result, builds an <c>IW2S_Bing_level1link</c> for it and hands each page's
        /// links to <c>SaveResult</c>.
        /// </summary>
        /// <remarks>
        /// The loop stops when the result page cannot be fetched, when it contains the
        /// "no results" text, when no result blocks can be extracted, when more than three
        /// pages in a row produced no link, or when <c>quried_pages</c> exceeds 10.
        /// Following pages are requested through a generated URL using only
        /// <c>searchTsk.Keyword</c> and a <c>first</c> offset that grows by 10 per page.
        /// The method sleeps for random intervals between requests.
        /// </remarks>
        /// <param name="link">URL of the first result page.</param>
        /// <param name="searchTsk">Search task supplying keyword, commend keyword, user and project ids.</param>
        /// <returns>
        /// The working list. Every page's links are saved and the list is cleared before the next
        /// page is requested, and every exit happens before new links are added, so the list is
        /// empty when it is returned.
        /// </returns>
        public List <IW2S_Bing_level1link> GetLinks(string link, IW2S_Bing_BaiduCommend searchTsk)
        {
            List<IW2S_Bing_level1link> result = new List<IW2S_Bing_level1link>();
            // One generator for the whole call instead of a fresh one for every delay.
            Random rnd          = new Random();
            int    nohist_pages = 0;
            int    quried_pages = 0;
            int    fanye        = 0; // "first" offset of the next result page

            // Page cap: the loop keeps going while quried_pages <= 10.
            while (!string.IsNullOrEmpty(link) && quried_pages <= 10)
            {
                log(link);
                CookieContainer  cc               = new CookieContainer();
                Encoding         enc              = null;
                CookieCollection cookiesColl      = new CookieCollection();
                CookieCollection cookieCollection = new CookieCollection();
                string           Rurl             = "http://cn.bing.com/";
                string           cookie           = "";

                // Request the Bing home page first; only the cookies it hands back are used,
                // the returned HTML is not needed.
                TaobaoWebHelper.GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                Thread.Sleep(rnd.Next(2000, 5000));

                var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                if (html == null)
                {
                    break;
                }

                if (html.Contains("没有找到搜索内容!"))
                {
                    break;
                }

                var tags = html.SubAfter("body").SubBefore("/body").SplitWith("b_content");
                if (tags == null || tags.Length == 0)
                {
                    // Without this guard the index expression below would throw.
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }

                // The result list lives in the last "b_content" section; one entry per </li>.
                var tagsD = tags[tags.Length - 1].SubAfter("搜索结果").SubBefore("</ol>").ToString().SplitWith("</li>");
                if (tagsD == null || tagsD.Length == 0)
                {
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }

                bool nohit = true;
                foreach (var tag in tagsD)
                {
                    if (!tag.Contains("h2"))
                    {
                        continue;
                    }

                    var    a     = tag.SubAfter("h2").SubAfter("a");
                    string title = RemoveInivalidChar(a.RemoveSpace().GetLower().SubBefore("</h2>").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string href  = a.GetFirstHref2();

                    // A result without a target URL cannot be fetched or stored; skipping it also
                    // avoids calling Replace on a null href.
                    if (string.IsNullOrEmpty(href))
                    {
                        continue;
                    }

                    href = href.Replace("amp;", "");

                    string abs    = RemoveInivalidChar(tag.SubAfter("<p>").SubBefore("</p").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string domain = GetDomain(href);

                    byte appType = 0;

                    // Text used for keyword matching: title and abstract joined by a comma.
                    string txt = "{0},{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    Thread.Sleep(rnd.Next(8000, 20000));
                    var htmldetail = "";

                    try
                    {
                        htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: keep the result without detail HTML, but leave a trace.
                        log("detail page failed " + href + " " + ex.Message);

                        // Only a relative href needs the Bing host in front of it; an absolute
                        // URL must stay as it is.
                        if (!href.IsStartWith("http"))
                        {
                            href = "http://cn.bing.com" + href;
                        }
                    }

                    bool          is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
                    bool          is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
                    BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                                     is_title_matched ? BaiduItemPart.Title :
                                                     is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
                    bool is_bus_matched       = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

                    // Score table: both matched 90, commend keyword only 80, keyword only 50, none 0.
                    int score;
                    if (is_bus_matched)
                    {
                        score = is_itm_title_matched ? 80 + 10 : 80;
                    }
                    else
                    {
                        score = is_itm_title_matched ? 50 : 0;
                    }

                    IW2S_Bing_level1link l1 = new IW2S_Bing_level1link
                    {
                        UsrId           = searchTsk.UsrId,
                        Domain          = domain,
                        TopDomain       = GetLevel1Domain(domain),
                        Keywords        = string.Format("{0} + {1}", searchTsk.Keyword, searchTsk.CommendKeyword),
                        LinkUrl         = href,
                        MatchAt         = (byte)part,
                        Html            = htmldetail,
                        MatchType       = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                        AppType         = appType,
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(href, searchTsk.UsrId, searchTsk.Keyword)),
                        SearchkeywordId = searchTsk._id.ToString(),
                        CreatedAt       = DateTime.UtcNow.AddHours(8),
                        Description     = abs,
                        Title           = title,
                        Score           = score,
                        Abstract        = abs,
                        ProjectId       = searchTsk.ProjectId
                    };

                    result.Add(l1);
                    nohit        = false;
                    nohist_pages = 0;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // Give up once more than three pages in a row produced no link.
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++;

                // The next page is always requested through a generated URL (the link scraped
                // from the page was overwritten by this value before, so it is no longer read).
                fanye = fanye + 10;
                link  = "http://cn.bing.com/search?q={0}&first={1}&FORM=PERE3".FormatStr(searchTsk.Keyword, fanye);
                SaveResult(result);
                result.Clear();

                Thread.Sleep(rnd.Next(8000, 15000));
            }
            return(result);
        }
示例#18
0
        /// <summary>
        /// Crawls Baidu result pages starting at <paramref name="link"/>. For every result whose
        /// target page contains the task keyword and at least one business keyword, an abstract
        /// and a score are derived from the page and the link is saved through
        /// <c>BotMng.save_level1_links</c>.
        /// </summary>
        /// <remarks>
        /// The loop ends when a result page cannot be fetched, when no result blocks are found,
        /// when more than three pages in a row yielded nothing, when no next-page link is found,
        /// or when <c>quried_pages</c> exceeds 60.
        /// </remarks>
        /// <param name="link">URL of the first result page.</param>
        /// <param name="tsk">Search task; its Keyword is split on ';' into the search keywords.</param>
        /// <param name="businessKeyword">Business keyword recorded on the link and used for the match type.</param>
        /// <param name="businessKeywords">All business keywords; each becomes a scoring pattern.</param>
        /// <param name="excludedKeywords">Forwarded unchanged to save_level1_links.</param>
        void GetLinks(string link, searchkeyword tsk, keyword businessKeyword, List <keyword> businessKeywords, List <keyword> excludedKeywords)
        {
            BotMng botmng = BotMng.Instance;

            string[] searchKeywords = tsk.Keyword.GetLower().RemoveSpace().Split(';');

            // Scoring patterns: every business keyword plus the task keyword itself (score 50).
            List<KeywordScore> patterns = businessKeywords.Select(x => new KeywordScore {
                Keyword = x.Txt, Score = x.Score, BizType = x.BizType
            }).ToList();

            string[] bizPatterns = businessKeywords.Select(x => x.Txt).ToArray();
            patterns.Add(new KeywordScore {
                Keyword = tsk.Keyword, Score = 50, BizType = 0
            });

            int nohist_pages = 0;
            int quried_pages = 0;

            // Page cap: the loop keeps going while quried_pages <= 60.
            while (!string.IsNullOrEmpty(link) && quried_pages <= 60)
            {
                log(link);
                var html = get_html(link);
                if (html == null)
                {
                    break;
                }
                var tags = html.SubAfter("content_left").SplitWith("c-container");

                if (tags == null || tags.Length == 0)
                {
                    log("BLOCKED " + tsk.Keyword);
                    break;
                }
                bool nohit = true;
                foreach (var tag in tags)
                {
                    var    a     = tag.SubAfter("h3").SubAfter("a");
                    string title = RemoveInivalidChar(
                        a.RemoveSpace().GetLower().SubBefore("</h3>").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string href = a.GetFirstHref2();

                    string abs    = RemoveInivalidChar(tag.SubAfter("abstract").SubBefore("</div").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string domain = tag.SubLastStringAfter("\"f13").SubBefore("</span").GetTxtFromHtml2();
                    domain = GetDomain(domain);

                    int maxScore = 0;
                    // Skip results that have neither title nor abstract text.
                    string txt = "{0}{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    string realUrl = null, detailHtml = null, abstracts = null;
                    byte   appType = 0;

                    // Fetch the result's target; Item1 is used as the final URL, Item2 as its HTML.
                    if (!string.IsNullOrWhiteSpace(href))
                    {
                        var tuplehtml = get_htmlUrl(href);
                        if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                        {
                            realUrl = tuplehtml.Item1;
                        }
                        if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                        {
                            detailHtml = tuplehtml.Item2;
                        }
                        if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                        {
                            domain = GetDomain(realUrl);
                        }
                    }

                    // The fetched page is a script-driven redirect stub: follow its first href once.
                    if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
                    {
                        var gourl = detailHtml.GetFirstHref2();
                        if (!string.IsNullOrEmpty(gourl))
                        {
                            var redirected = get_htmlUrl(gourl);
                            if (redirected != null && !string.IsNullOrEmpty(redirected.Item1))
                            {
                                realUrl = redirected.Item1;
                            }
                            if (redirected != null && !string.IsNullOrEmpty(redirected.Item2))
                            {
                                detailHtml = redirected.Item2;
                            }
                            if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                            {
                                domain = GetDomain(realUrl);
                            }
                        }
                    }
                    if (string.IsNullOrEmpty(realUrl))
                    {
                        realUrl = href;
                    }

                    // Only pages that mention the task keyword and a business keyword are kept.
                    if (string.IsNullOrEmpty(detailHtml))
                    {
                        continue;
                    }
                    if (!detailHtml.Contains(tsk.Keyword) || !detailHtml.IsContains2(bizPatterns))
                    {
                        continue;
                    }

                    List<KeywordScore> matchpatterns  = new List<KeywordScore>();
                    StringBuilder      sbabstracts    = new StringBuilder();
                    List<string>       abstractlist   = new List<string>();
                    StringBuilder      sbabstractlist = new StringBuilder();

                    // Build the abstract: for every occurrence of every pattern keyword, take the
                    // text around the keyword up to the nearest delimiter on each side.
                    foreach (KeywordScore pattern in patterns)
                    {
                        string[] splitDetailHtmls = detailHtml.SplitWith(pattern.Keyword);
                        if (splitDetailHtmls.Length > 1)
                        {
                            matchpatterns.Add(pattern);
                        }
                        StringBuilder sbpatternStr = new StringBuilder();
                        for (int i = 0; i < splitDetailHtmls.Length - 1; i++)
                        {
                            string splitDetailHtml1 = splitDetailHtmls[i];
                            // For the final occurrence no following text is used.
                            string splitDetailHtml2 = i < splitDetailHtmls.Length - 2 ? splitDetailHtmls[i + 1] : "";

                            // Walk backwards from the keyword until a "before" delimiter that is not
                            // preceded by a character from split_num_commas.
                            for (int j = splitDetailHtml1.Length - 1; j >= 0; j--)
                            {
                                if (split_bef_commas.Contains(splitDetailHtml1[j]) && j - 1 >= 0 && !split_num_commas.Contains(splitDetailHtml1[j - 1]))
                                {
                                    break;
                                }
                                sbpatternStr.Append(splitDetailHtml1[j]);
                            }
                            // The characters were collected in reverse; restore reading order.
                            for (int q = sbpatternStr.Length - 1; q >= 0; q--)
                            {
                                sbabstracts.Append(sbpatternStr[q]);
                            }
                            sbabstracts.Append(pattern.Keyword);
                            sbpatternStr.Clear();

                            // Walk forwards until an "after" delimiter that is not followed by a
                            // character from split_num_commas.
                            for (int j = 0; j < splitDetailHtml2.Length; j++)
                            {
                                if (split_aft_commas.Contains(splitDetailHtml2[j]) && j + 1 < splitDetailHtml2.Length && !split_num_commas.Contains(splitDetailHtml2[j + 1]))
                                {
                                    break;
                                }
                                sbpatternStr.Append(splitDetailHtml2[j]);
                            }
                            sbabstracts.Append(sbpatternStr);
                            sbpatternStr.Clear();

                            string tmpsbabstracts = sbabstracts.ToString();
                            tmpsbabstracts = BaiduQuery.RemoveInivalidChar(tmpsbabstracts.GetTxtFromHtml2().RemoveSpace().GetLower());
                            // Each distinct snippet is appended only once.
                            if (!abstractlist.Contains(tmpsbabstracts))
                            {
                                abstractlist.Add(tmpsbabstracts);
                                sbabstractlist.Append(tmpsbabstracts).Append(" ");
                            }
                            sbabstracts.Clear();
                        }
                    }

                    // Score = highest pattern score + a tenth of every other matched pattern's score.
                    abstracts = sbabstractlist.ToString();
                    if (!string.IsNullOrEmpty(abstracts) && matchpatterns.Count > 0)
                    {
                        maxScore  = matchpatterns.Max(x => x.Score ?? 50);
                        appType   = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
                        maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                        maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                    }

                    // Fallback: no abstract could be built from the page, score against the
                    // search-result abstract instead.
                    if (string.IsNullOrEmpty(abstracts) && !string.IsNullOrEmpty(abs))
                    {
                        matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
                        // Max() throws on an empty sequence, so only score when something matched.
                        if (matchpatterns.Count > 0)
                        {
                            maxScore = matchpatterns.Max(x => x.Score ?? 50);
                            appType  = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();

                            maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                            maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                        }
                    }
                    if (maxScore > 100)
                    {
                        maxScore = 100;
                    }

                    bool is_bus_matched = txt.IsContains2(businessKeyword.Txt);

                    bool          is_title_matched = title.GetLower().IsContains2(searchKeywords);
                    bool          is_abstr_matched = abs.IsContains2(searchKeywords);
                    BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                                     is_title_matched ? BaiduItemPart.Title :
                                                     is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txt.GetLower().IsContains2(searchKeywords);

                    level1link l1 = new level1link
                    {
                        UsrId           = tsk.UsrId,
                        Domain          = domain,
                        TopDomain       = GetLevel1Domain(domain),
                        Keywords        = string.Format("{0} + {1}", tsk.Keyword, businessKeyword.Txt),
                        LinkUrl         = realUrl,
                        MatchAt         = (byte)part,
                        Html            = detailHtml,
                        MatchType       = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                        AppType         = appType,
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(realUrl, tsk.UsrId, tsk.Keyword)),
                        SearchkeywordId = tsk._id.ToString(),
                        CreatedAt       = DateTime.UtcNow.AddHours(8),
                        Description     = abs,
                        Title           = title,
                        Score           = maxScore,
                        Abstract        = abstracts
                    };

                    botmng.save_level1_links(new List<level1link> {
                        l1
                    }, tsk, excludedKeywords);
                    nohit        = false;
                    nohist_pages = 0;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // Give up once more than three pages in a row produced no link.
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++;

                // Next page: last href between the current-page marker and the "next page" label;
                // relative links are resolved against the Baidu host.
                link = html.SubAfter("fkfk_cur").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "http://www.baidu.com/".GetContact(link);
                }
            }
        }