Пример #1
0
        /// <summary>
        /// Runs a Sogou query for the given task on a freshly created <c>SogouQuery</c>.
        /// </summary>
        /// <param name="tsk">Task handed unchanged to <c>SogouQuery.Query</c>.</param>
        void Snapshot(IW2S_SG_BaiduCommend tsk)
        {
            new SogouQuery().Query(tsk);
        }
Пример #2
0
        /// <summary>
        /// Builds the first-page Sogou web-search URL for the task's keyword.
        /// </summary>
        /// <param name="tsk">Task whose <c>Keyword</c> is normalised (via the project's
        /// <c>RemoveSpace</c>/<c>GetLower</c> helpers) and URL-encoded as UTF-8.</param>
        /// <returns>The search URL, or <see cref="string.Empty"/> when the normalised keyword is null or empty.</returns>
        string get_urls(IW2S_SG_BaiduCommend tsk)
        {
            string normalized = tsk.Keyword.RemoveSpace().GetLower();
            if (string.IsNullOrEmpty(normalized))
            {
                return string.Empty;
            }

            string encodedKeyword = normalized.GetUrlEncodedString("utf-8");
            return "https://www.sogou.com/web?query={0}&cid=&page=1&ie=utf8&dr=1".FormatStr(encodedKeyword);
        }
Пример #3
0
        /// <summary>
        /// Builds the search URL for <paramref name="tsk"/> and crawls its result pages via <c>GetLinks</c>.
        /// </summary>
        /// <remarks>
        /// Any exception raised by <c>GetLinks</c> is logged (message plus stack trace) and not rethrown.
        /// An exception from <c>get_urls</c> is not caught here.
        /// </remarks>
        /// <param name="tsk">Task describing the keyword to search for.</param>
        public void Query(IW2S_SG_BaiduCommend tsk)
        {
            var link = get_urls(tsk);

            try
            {
                GetLinks(link, tsk);
            }
            catch (Exception ex)
            {
                log(ex.Message + ex.StackTrace);
            }
        }
        /// <summary>
        /// Builds one <c>SG_links</c> edge from <paramref name="searchTsk"/> to every entry of
        /// <paramref name="taskList"/>, weighted by how many stored link titles contain both commend keywords,
        /// saves the edges via <c>SG_SaveResult</c> and returns them.
        /// </summary>
        /// <param name="searchTsk">Source task; its project id selects the link titles that are counted.</param>
        /// <param name="taskList">All tasks of the graph; the list position is used as the node index.</param>
        /// <returns>One edge per entry of <paramref name="taskList"/>, in list order.</returns>
        public List <SG_links> Query(IW2S_SG_BaiduCommend searchTsk, List <IW2S_SG_BaiduCommend> taskList)
        {
            List <SG_links>           linkvaluelist = new List <SG_links>();
            SG_BotTaskService         SG_bt         = new SG_BotTaskService();
            List <IW2S_SG_level1link> linklist      = SG_bt.GetLinkTitleList(searchTsk.ProjectId);

            // The source-keyword test does not depend on the target task, so it is
            // evaluated once here instead of once per (task, title) pair.
            List <string> sourceTitles = new List <string>();
            foreach (var item in linklist)
            {
                if (!string.IsNullOrEmpty(item.Title) && item.Title.Contains(searchTsk.CommendKeyword))
                {
                    sourceTitles.Add(item.Title);
                }
            }

            // Node index of the source task: the LAST list entry with the same commend keyword
            // (0 when none matches).
            int index = 0;
            for (int i = 0; i < taskList.Count; i++)
            {
                if (searchTsk.CommendKeyword == taskList[i].CommendKeyword)
                {
                    index = i;
                }
            }

            for (int i = 0; i < taskList.Count; i++)
            {
                string targetKeyword = taskList[i].CommendKeyword;
                int    linkNum       = 0;
                foreach (string title in sourceTitles)
                {
                    if (title.Contains(targetKeyword))
                    {
                        linkNum++;
                    }
                }

                SG_links lk = new SG_links();
                lk.source    = index;
                lk.target    = i;
                lk.KeywordId = searchTsk._id;
                lk.ProjectId = searchTsk.ProjectId;
                lk.value     = linkNum;
                lk.Gid       = IDHelper.GetGuid("{0}/{1}/{2}/{3}".FormatStr(lk.source, lk.target, lk.KeywordId, lk.ProjectId));
                linkvaluelist.Add(lk);
            }

            SG_SaveResult(linkvaluelist);
            return(linkvaluelist);
        }
Пример #5
0
        /// <summary>
        /// Worker loop: repeatedly fetches the next bot task, marks it as in progress (status 1),
        /// runs <c>Snapshot</c> on it and then marks it as finished (status 2). Never returns normally.
        /// </summary>
        /// <remarks>
        /// When no task (or a task without a commend keyword) is available the worker reports ready
        /// and polls again after 3 seconds. Only the final status update is guarded by the try/catch;
        /// NOTE(review): an exception from fetching the task, the first status update or <c>Snapshot</c>
        /// still propagates out of this method - confirm that is intended.
        /// </remarks>
        public void Run()
        {
            while (true)
            {
                BotTaskService       bt      = new BotTaskService();
                IW2S_SG_BaiduCommend keyTask = bt.GetBotTask();
                if (keyTask == null || string.IsNullOrEmpty(keyTask.CommendKeyword))
                {
                    SetReady();
                    log("没有搜索任务 !!!");
                    Thread.Sleep(3000);
                    continue;
                }

                SetBusy();
                SetCommendTaskStatus(keyTask, 1);

                Snapshot(keyTask);
                try
                {
                    SetCommendTaskStatus(keyTask, 2);
                }
                catch (Exception ex)
                {
                    log(DateTime.Now + "ERROR ." + ex.Message);
                    Thread.Sleep(2000);
                }
            }
        }

        /// <summary>
        /// Sets both <c>WXStatus</c> and <c>BotStatus</c> of the task's document to <paramref name="status"/>.
        /// </summary>
        void SetCommendTaskStatus(IW2S_SG_BaiduCommend task, int status)
        {
            var update = new UpdateDocument {
                { "$set", new QueryDocument {
                      { "WXStatus", status }, { "BotStatus", status }
                  } }
            };

            MongoDBHelper.Instance.Get_IW2S_SG_BaiduCommend().UpdateOne(new QueryDocument {
                { "_id", task._id }
            }, update);
        }
Пример #6
0
 /// <summary>
 /// Creates an <c>SG_WeChatQuery</c> for the concatenated keyword and commend keyword of
 /// <paramref name="searchTask"/> and runs its two-argument <c>Query</c>. The query's return value is not used.
 /// </summary>
 /// <param name="searchTask">Task to query for.</param>
 /// <param name="listKey">Task list passed through to <c>SG_WeChatQuery.Query</c>.</param>
 void SG_Snapshot(IW2S_SG_BaiduCommend searchTask, List <IW2S_SG_BaiduCommend> listKey)
 {
     SG_WeChatQuery wc = new SG_WeChatQuery(searchTask.Keyword + searchTask.CommendKeyword);
     wc.Query(searchTask, listKey);
 }
Пример #7
0
        /// <summary>
        /// Walks Sogou result pages starting at <paramref name="link"/> (at most three pages per call),
        /// fetches the landing page of every hit and saves one <c>IW2S_SG_level1link</c> per hit
        /// through <c>SaveResult</c>, page by page.
        /// </summary>
        /// <remarks>
        /// The method sleeps for several random seconds before each outgoing request and after each page.
        /// Errors while processing a single hit are logged and that hit is skipped.
        /// </remarks>
        /// <param name="link">URL of the first result page; nothing is fetched when it is null or empty.</param>
        /// <param name="tsk">Task whose user id, project id, keyword and id are stamped onto every saved link.</param>
        void GetLinks(string link, IW2S_SG_BaiduCommend tsk)
        {
            List <IW2S_SG_level1link> result = new List <IW2S_SG_level1link>();
            // Created once per crawl instead of once per hit.
            Regex  dateRegex = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
            Random delay     = new Random();
            int nohist_pages = 0;
            int quried_pages = 0;

            while (!string.IsNullOrEmpty(link) && quried_pages <= 2)
            {
                log(link);

                CookieContainer  cc               = new CookieContainer();
                Encoding         enc              = null;
                CookieCollection cookiesColl      = new CookieCollection();
                CookieCollection cookieCollection = new CookieCollection();
                string           Rurl             = "https://www.sogou.com/";
                string           cookie           = "";

                // Request the Sogou home page first; only the returned cookies are kept,
                // presumably so the search request looks like a normal browsing session.
                GetContentByIndex(Rurl, 8000, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;

                var html = GetContent(link, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                if (html == null)
                {
                    break;
                }

                // Cut the result list out of the page and split it into one fragment per <h3 heading.
                var tags = html.SubAfter("<body").SubAfter("results").SubBefore("hint_container").SplitWith("<h3");
                if (tags == null || tags.Length == 0)
                {
                    log("已被sogou屏蔽,请调试! " + tsk.Keyword);
                    break;
                }

                bool nohit = true;
                foreach (var tag in tags)
                {
                    try
                    {
                        if (!tag.Contains("<a"))
                        {
                            continue;
                        }

                        string anchor = tag.SubAfter("<a").SubBefore("</a>");
                        string title  = RemoveInivalidChar(anchor.GetTxtFromHtml2().RemoveSpace());
                        string href   = anchor.GetFirstHref2();

                        // Summary text: later markers override earlier ones, the
                        // summary_beg/summary_end pair is only the fallback.
                        string Jianjie = "";
                        if (tag.Contains("简介:"))
                        {
                            Jianjie = tag.SubAfter("简介:").SubBefore("</").GetTxtFromHtml2().RemoveSpace();
                        }
                        if (tag.Contains("cacheresult_summary"))
                        {
                            Jianjie = tag.SubAfter("cacheresult_summary").SubBefore("</div>").GetTxtFromHtml2().RemoveSpace();
                        }
                        if (string.IsNullOrEmpty(Jianjie))
                        {
                            Jianjie = tag.SubAfter("summary_beg").SubBefore("summary_end").GetTxtFromHtml2().RemoveSpace();
                        }

                        Thread.Sleep(delay.Next(8000, 15000));
                        var tuplehtml = GetContent(href, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);

                        // Resolved per hit. This must start empty for every hit: otherwise a hit whose
                        // redirect page could not be fetched would inherit the previous hit's URL.
                        string realUrl = "";
                        if (!string.IsNullOrEmpty(tuplehtml))
                        {
                            if (tuplehtml.Contains("window.location.replace"))
                            {
                                // JavaScript redirect page: pull the target out of window.location.replace("...").
                                realUrl = tuplehtml.SubAfter("window.location.replace").SubBefore("</script>").Replace('"', ' ').Replace("(", "").Replace(")", "").RemoveSpace();
                            }
                            else
                            {
                                // Otherwise use the URL reported back by GetContent for this request.
                                realUrl = Rurl;
                            }
                        }
                        if (string.IsNullOrEmpty(realUrl))
                        {
                            realUrl = href;
                        }

                        string domain    = GetDomain(realUrl);
                        string topDomain = GetLevel1Domain(domain);
                        realUrl = realUrl.Replace("amp;", "");

                        Thread.Sleep(delay.Next(6000, 15000));
                        var htmldetail = GetContent(realUrl, 8000, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                        if (htmldetail == null)
                        {
                            // Without page content there is nothing to store; skip the hit explicitly
                            // instead of relying on Regex.Match throwing on a null input.
                            log("有错误信息!详情页无内容,已跳过 " + realUrl);
                            continue;
                        }

                        // First date-looking string on the page (2000-2099), empty when there is none.
                        Match  m    = dateRegex.Match(htmldetail);
                        string time = m.Success ? m.Value : "";

                        IW2S_SG_level1link l1 = new IW2S_SG_level1link
                        {
                            UsrId           = tsk.UsrId,
                            Domain          = domain,
                            TopDomain       = topDomain,
                            Keywords        = string.Format("{0}", tsk.Keyword),
                            LinkUrl         = realUrl,
                            Html            = htmldetail,
                            BizId           = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
                            SearchkeywordId = tsk._id.ToString(),
                            CreatedAt       = DateTime.UtcNow.AddHours(8),
                            Description     = Jianjie,
                            Title           = title,
                            ProjectId       = tsk.ProjectId,
                            PublishTime     = time,
                            AlternateFields = "0",
                            DataCleanStatus = 0
                        };
                        result.Add(l1);
                        nohit        = false;
                        nohist_pages = 0;
                    }
                    catch (Exception ex)
                    {
                        log("有错误信息!" + ex.Message);
                    }
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // Give up once more than three consecutive pages produced no hit.
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++;

                // Follow the "next page" link; relative links are resolved against the search endpoint.
                link = html.SubAfter("sogou_next").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "https://www.sogou.com/web".GetContact(link);
                }

                SaveResult(result);
                result.Clear();
                Thread.Sleep(delay.Next(6000, 15000));
            }
        }