示例#1
0
        /// <summary>
        /// Runs one Bing query for the given task.
        /// </summary>
        /// <remarks>
        /// The query text handed to <c>BingQuery</c> is <c>Keyword</c> immediately followed by
        /// <c>CommendKeyword</c> (no separator). The list returned by <c>Query</c> is not used here.
        /// </remarks>
        /// <param name="searchTask">Task whose keywords are searched; must not be null.</param>
        void Snapshot(IW2S_Bing_BaiduCommend searchTask)
        {
            BingQuery query = new BingQuery(searchTask.Keyword + searchTask.CommendKeyword);
            query.Query(searchTask);
        }
示例#2
0
        /// <summary>
        /// Endless worker loop: fetches a task from <c>BotTaskService</c>, sets its
        /// <c>WXStatus</c>/<c>BotStatus</c> to 1, runs <see cref="Snapshot"/> for it and then sets
        /// both status fields to 2.
        /// </summary>
        /// <remarks>
        /// When no task is available, or the task has no <c>CommendKeyword</c>, the loop logs,
        /// sleeps for one second and polls again. This method never returns normally.
        /// </remarks>
        public void Run()
        {
            while (true)
            {
                BotTaskService         bt      = new BotTaskService();
                IW2S_Bing_BaiduCommend keyTask = bt.GetBotTask();
                if (keyTask == null || string.IsNullOrEmpty(keyTask.CommendKeyword))
                {
                    log("No search task ! start search Detail !!!");
                    Thread.Sleep(1000);
                    continue;
                }

                // Status 1 is written before the crawl starts.
                MarkTaskStatus(keyTask, 1);

                // NOTE(review): an exception thrown by the status update above or by Snapshot is
                // not caught here, so it ends this loop and leaves the task at status 1 - confirm
                // that this is the intended failure mode.
                Snapshot(keyTask);

                try
                {
                    // Status 2 is written once Snapshot has returned.
                    MarkTaskStatus(keyTask, 2);
                }
                catch (Exception ex)
                {
                    log(DateTime.Now + "ERROR ." + ex.Message);
                    Thread.Sleep(2000);
                }
            }
        }

        /// <summary>
        /// Sets both <c>WXStatus</c> and <c>BotStatus</c> of the task document identified by
        /// <c>task._id</c> to <paramref name="status"/>.
        /// </summary>
        /// <param name="task">Task whose stored document is updated.</param>
        /// <param name="status">Value written to both status fields.</param>
        private void MarkTaskStatus(IW2S_Bing_BaiduCommend task, int status)
        {
            var update = new UpdateDocument {
                { "$set", new QueryDocument {
                      { "WXStatus", status }, { "BotStatus", status }
                  } }
            };

            MongoDBHelper.Instance.Get_IW2S_Bing_BaiduCommend().UpdateOne(new QueryDocument {
                { "_id", task._id }
            }, update);
        }
        /// <summary>
        /// Builds the start URL for the task's keywords via <c>get_url</c> and passes it to
        /// <c>GetLinks</c>.
        /// </summary>
        /// <param name="searchTsk">Task supplying <c>Keyword</c> and <c>CommendKeyword</c>.</param>
        /// <returns>
        /// <c>null</c> when no start URL could be built; otherwise a new list holding whatever
        /// <c>GetLinks</c> returned (empty when it returned null or nothing).
        /// </returns>
        public List <IW2S_Bing_level1link> Query(IW2S_Bing_BaiduCommend searchTsk)
        {
            var startUrl = get_url(searchTsk.Keyword, searchTsk.CommendKeyword);
            if (string.IsNullOrEmpty(startUrl))
            {
                return null;
            }

            var found     = GetLinks(startUrl, searchTsk);
            var collected = new List <IW2S_Bing_level1link>();

            // Nothing to copy: hand back the empty list rather than null.
            if (found == null || found.Count == 0)
            {
                return collected;
            }

            collected.AddRange(found);
            return collected;
        }
        /// <summary>
        /// Crawls Bing result pages starting at <paramref name="link"/>, builds one
        /// <c>IW2S_Bing_level1link</c> per result entry that contains an <c>h2</c> heading, and
        /// passes each page's links to <c>SaveResult</c>.
        /// </summary>
        /// <remarks>
        /// Crawling stops after 10 pages, after 3 consecutive pages without any usable result,
        /// or as soon as a page cannot be fetched or parsed. The method sleeps for randomized
        /// intervals between requests.
        /// </remarks>
        /// <param name="link">URL of the first result page; nothing is crawled when null or empty.</param>
        /// <param name="searchTsk">Task supplying the keywords and the ids stamped on every link.</param>
        /// <returns>All links collected over the crawled pages; empty (never null) when none were found.</returns>
        public List <IW2S_Bing_level1link> GetLinks(string link, IW2S_Bing_BaiduCommend searchTsk)
        {
            const int maxPages      = 10; // search at most 10 pages
            const int maxEmptyPages = 3;  // give up after 3 consecutive pages without results

            var result           = new List <IW2S_Bing_level1link>();
            var random           = new Random(); // one generator for all randomized delays
            int emptyPagesInARow = 0;
            int queriedPages     = 0;
            int resultOffset     = 0;

            while (!string.IsNullOrEmpty(link) && queriedPages < maxPages)
            {
                log(link);

                // Each page uses a fresh cookie container; the Bing home page is requested first
                // and the cookies it hands back are passed on to the following requests.
                CookieContainer  cc          = new CookieContainer();
                Encoding         enc         = null;
                CookieCollection cookiesColl = new CookieCollection();
                CookieCollection cookieCollection;
                string           Rurl        = "http://cn.bing.com/";
                string           cookie      = "";
                TaobaoWebHelper.GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                Thread.Sleep(random.Next(2000, 5000));

                var html = get_html(link, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                if (html == null || html.Contains("没有找到搜索内容!"))
                {
                    break;
                }

                // The result entries are the "</li>"-separated pieces of the last "b_content" section.
                var tags = html.SubAfter("body").SubBefore("/body").SplitWith("b_content");
                if (tags == null || tags.Length == 0)
                {
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }

                var tagsD = tags[tags.Length - 1].SubAfter("搜索结果").SubBefore("</ol>").ToString().SplitWith("</li>");
                if (tagsD == null || tagsD.Length == 0)
                {
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }

                var pageLinks = new List <IW2S_Bing_level1link>();
                foreach (var tag in tagsD)
                {
                    if (!tag.Contains("h2"))
                    {
                        continue;
                    }

                    var    a     = tag.SubAfter("h2").SubAfter("a");
                    string title = RemoveInivalidChar(a.RemoveSpace().GetLower().SubBefore("</h2>").GetTxtFromHtml2().RemoveSpace().GetLower());
                    string href  = a.GetFirstHref2();
                    if (string.IsNullOrEmpty(title) && string.IsNullOrEmpty(href))
                    {
                        continue;
                    }

                    // A titled entry may still have no href; treat that as an empty URL instead
                    // of dereferencing null.
                    href = (href ?? "").Replace("amp;", "");

                    string domain = GetDomain(href);
                    string abs    = RemoveInivalidChar(tag.SubAfter("<p>").SubBefore("</p").GetTxtFromHtml2().RemoveSpace().GetLower());

                    // Entries without any text to match against are filtered out.
                    string txt = "{0},{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    // Randomized pause before every detail-page request.
                    Thread.Sleep(random.Next(8000, 20000));
                    var htmldetail = "";

                    try
                    {
                        htmldetail = get_Detailehtml(href, 8000, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                    }
                    catch (Exception ex)
                    {
                        log("Detail fetch failed for " + href + " : " + ex.Message);

                        // Only a relative href is resolved against the Bing host; an absolute URL
                        // that merely failed to load is kept unchanged.
                        if (!href.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                        {
                            href = "http://cn.bing.com" + href;
                        }
                    }

                    bool          is_title_matched = title.GetLower().IsContains2(searchTsk.Keyword.ToLower(), searchTsk.CommendKeyword.ToLower());
                    bool          is_abstr_matched = abs.GetLower().IsContains2(searchTsk.Keyword.GetLower(), searchTsk.CommendKeyword.GetLower());
                    BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                                     is_title_matched ? BaiduItemPart.Title :
                                                     is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txt.GetLower().IsContains(searchTsk.Keyword.GetLower());
                    bool is_bus_matched       = txt.GetLower().IsContains2(searchTsk.CommendKeyword.GetLower());

                    // Score: 90 when both keywords match, 80 for CommendKeyword only,
                    // 50 for Keyword only, 0 when neither matches.
                    int score = is_bus_matched
                                ? (is_itm_title_matched ? 80 + 10 : 80)
                                : (is_itm_title_matched ? 50 : 0);

                    pageLinks.Add(new IW2S_Bing_level1link
                    {
                        UsrId           = searchTsk.UsrId,
                        Domain          = domain,
                        TopDomain       = GetLevel1Domain(domain),
                        Keywords        = string.Format("{0} + {1}", searchTsk.Keyword, searchTsk.CommendKeyword),
                        LinkUrl         = href,
                        MatchAt         = (byte)part,
                        Html            = htmldetail,
                        MatchType       = (byte)((is_bus_matched ? 1 : 0) + (is_itm_title_matched ? 2 : 0)),
                        AppType         = (byte)0,
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(href, searchTsk.UsrId, searchTsk.Keyword)),
                        SearchkeywordId = searchTsk._id.ToString(),
                        CreatedAt       = DateTime.UtcNow.AddHours(8),
                        Description     = abs,
                        Title           = title,
                        Score           = score,
                        Abstract        = abs,
                        ProjectId       = searchTsk.ProjectId
                    });
                }

                // Count consecutive pages that produced nothing; any hit resets the counter.
                emptyPagesInARow = pageLinks.Count == 0 ? emptyPagesInARow + 1 : 0;
                if (emptyPagesInARow >= maxEmptyPages)
                {
                    break;
                }

                queriedPages++;
                pages++; // counter declared outside this method

                // The next page is addressed by result offset, in steps of 10.
                // NOTE(review): follow-up pages are requested with Keyword only (CommendKeyword
                // is dropped) and the keyword is not URL-encoded - confirm this is intended.
                resultOffset += 10;
                link          = "http://cn.bing.com/search?q={0}&first={1}&FORM=PERE3".FormatStr(searchTsk.Keyword, resultOffset);

                // Persist page by page, but keep every link so the caller gets them back too.
                SaveResult(pageLinks);
                result.AddRange(pageLinks);

                Thread.Sleep(random.Next(8000, 15000));
            }
            return result;
        }