Example #1
0
        //void SnapshotByAllBus(WX_SearchKeyword searchTask, GlobalBusKeyWords busTask, List<WX_ExcludeKeyword> ExcludeKeyword)
        //{


        //    List<WX_Data> xListings = new List<WX_Data>();

        //    // TaoBaoSnapshotQuery tbq = new TaoBaoSnapshotQuery();
        //    WeChatQueryByBus wc = new WeChatQueryByBus(searchTask.KeywordName + busTask.KeywordName);
        //    var links360 = wc.Query(searchTask, busTask, ExcludeKeyword);

        //    //for (int page = 0; page < links360.Count; page++)
        //    //{
        //    //    SaveKeyRecord(links360);
        //    //}

        //}
        /// <summary>
        /// Placeholder for a search keyed on a public-account result; performs no work.
        /// </summary>
        /// <param name="data">The level-1 link record; not read by the current body.</param>
        void PublicNoNameSearch(IW2S_WX_level1link data)
        {
            // Intentionally empty. An earlier body (since disabled) built a WeChatQuery from
            // `searchTask` and `busTask` -- names that are not parameters of this method --
            // and passed the query results to SaveResult in batches of 1000.
        }
Example #2
0
        /// <summary>
        /// Walks Sogou WeChat search result pages starting at <paramref name="link"/>, builds one
        /// <c>IW2S_WX_level1link</c> per result entry (fetching the article page and the
        /// public-account page for each entry) and hands every page's batch to <c>SaveResult</c>.
        /// </summary>
        /// <param name="link">URL of the first result page; the loop stops when it is null or empty.</param>
        /// <param name="searchTsk">
        /// Search task supplying <c>Keyword</c>, <c>CommendKeyword</c>, <c>UsrId</c>, <c>ProjectId</c>
        /// and <c>_id</c> used for matching and for populating each record.
        /// </param>
        /// <returns>
        /// The working list. It is cleared right after each page is passed to <c>SaveResult</c>, so it
        /// is empty on every normal exit path; results are delivered through <c>SaveResult</c>.
        /// NOTE(review): confirm no caller expects the saved items to be returned here.
        /// </returns>
        /// <remarks>
        /// The method blocks for a long time: it sleeps a random 5-20 s between requests.
        /// </remarks>
        public List<IW2S_WX_level1link> GetLinks(string link, IW2S_WX_BaiduCommend searchTsk)
        {
            const int timeoutMs    = 8000;
            const int maxPageIndex = 2; // loop runs for page indexes 0..2, i.e. at most 3 pages

            List<IW2S_WX_level1link> result = new List<IW2S_WX_level1link>();

            // One generator and one regex per call instead of re-creating them for every item.
            Random rnd       = new Random();
            Regex  dateRegex = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

            int nohist_pages = 0;
            int quried_pages = 0;

            while (!string.IsNullOrEmpty(link) && quried_pages <= maxPageIndex)
            {
                log(link);
                CookieContainer  cc          = new CookieContainer();
                Encoding         enc         = null;
                CookieCollection cookiesColl = new CookieCollection();
                CookieCollection cookieCollection;
                string           Rurl        = "http://weixin.sogou.com/";
                string           cookie      = "";

                // Request the home page first; only the cookies it yields are used afterwards.
                TaobaoWebHelper.GetContentByIndex(Rurl, timeoutMs, cc, ref enc, out Rurl, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;
                Thread.Sleep(rnd.Next(5000, 8000));

                var html = get_html(link, timeoutMs, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                cookiesColl = cookieCollection;

                // Stop on a failed fetch or on the "no related articles found" page.
                if (html == null || html.Contains("没有找到相关的微信公众号文章"))
                {
                    break;
                }

                // Split the page into result entries; retry with the marker variant without a space.
                var tags = html.SplitWith("wx-rb wx-rb3");
                if (tags == null || tags.Length <= 1)
                {
                    tags = html.SplitWith("wx-rbwx-rb3");
                }
                if (tags == null || tags.Length == 0)
                {
                    log("BLOCKED " + searchTsk.Keyword + " " + searchTsk.CommendKeyword);
                    break;
                }

                bool nohit = true;
                foreach (var tag in tags)
                {
                    // Fragments without a text box are not result entries.
                    if (!tag.Contains("txt-box"))
                    {
                        continue;
                    }

                    string title      = RemoveInivalidChar(tag.SubAfter("<h4").SubBefore("</h4>").GetTxtFromHtml2().RemoveSpace());
                    string href       = tag.SubAfter("<h4").SubBefore("</a>").GetFirstHref2();
                    string abs        = RemoveInivalidChar(tag.SubAfter("<h4>").SubBefore("\"s-p\"").SubBefore("<script>").GetTxtFromHtml2().RemoveSpace());
                    string domain     = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetTxtFromHtml2().SubAfter("(").SubAfter("(").SubBefore(",").Replace('"', ' ').Trim();
                    string SourceLink = tag.SubLastStringAfter("\"s-p\"").SubBefore("</a").GetFirstHref2();
                    string TitleImg   = tag.SubAfter("img_box2").SubBefore("</a").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();

                    // Title and abstract combined; this is the text the keyword checks run against.
                    string txt = "{0},{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    // Make site-relative result links absolute, then undo the "&amp;" HTML escaping.
                    if (href.IsStartWith("/websearch"))
                    {
                        href = "http://weixin.sogou.com" + href;
                    }
                    href = href.Replace("amp;", "");

                    Thread.Sleep(rnd.Next(8000, 20000));
                    var htmldetail = get_Detailehtml(href, timeoutMs, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);

                    // First date-looking token in the article page is taken as the publish time.
                    // Regex.Match throws on null input, so a failed fetch leaves the time empty.
                    string time = "";
                    if (!string.IsNullOrEmpty(htmldetail))
                    {
                        Match dateMatch = dateRegex.Match(htmldetail);
                        if (dateMatch.Success)
                        {
                            time = dateMatch.Value;
                        }
                    }

                    // Continue from the URL reported back by the detail request, then ask the
                    // JSON variant of that URL for the article's "link" field.
                    href = Rurl;
                    var hrefNew          = href + "&f=json";
                    var htmldetailNewUrl = get_Detailehtml(hrefNew, timeoutMs, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                    try
                    {
                        var jsonLink = htmldetailNewUrl.SubAfter("\"link\":").SubBefore(",\"source_url\":").Replace('"', ' ').Replace("\\", "").RemoveSpace();
                        // Keep the previous href when nothing usable was extracted.
                        if (!string.IsNullOrEmpty(jsonLink))
                        {
                            href = jsonLink;
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: fall back to the previous href, but leave a trace.
                        log("GetLinks: link lookup failed for " + hrefNew + ": " + ex.Message);
                    }

                    string keywordLower = searchTsk.Keyword.GetLower();
                    string commendLower = searchTsk.CommendKeyword.GetLower();
                    string txtLower     = txt.GetLower();

                    bool is_title_matched = title.GetLower().IsContains2(keywordLower, commendLower);
                    bool is_abstr_matched = abs.GetLower().IsContains2(keywordLower, commendLower);
                    BaiduItemPart part    = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                            is_title_matched ? BaiduItemPart.Title :
                                            is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;
                    bool is_itm_title_matched = txtLower.IsContains(keywordLower);
                    bool is_bus_matched       = txtLower.IsContains2(commendLower);

                    // Public-account details, filled in only when the account page is recognised.
                    var no         = "";
                    var qrcode     = "";
                    var function   = "";
                    var NoIcon     = "";
                    var QrcodeIcon = "";
                    SourceLink = SourceLink.Replace("amp;", "");
                    Thread.Sleep(rnd.Next(8000, 15000));
                    var htmlNo = get_Nohtml(SourceLink, timeoutMs, cc, ref enc, out Rurl, cookie, ref cookiesColl, out cookieCollection);
                    if (!string.IsNullOrEmpty(htmlNo) && htmlNo.Contains("em_weixinhao"))
                    {
                        no         = htmlNo.SubAfter("em_weixinhao").SubBefore("/label").GetTxtFromHtml2().RemoveSpace();
                        qrcode     = htmlNo.SubAfter("v-box").SubBefore("<em").SubAfter("src=").Replace(">", "").Replace('"', ' ').RemoveSpace();
                        function   = htmlNo.SubAfter("功能介绍:</").SubBefore("/span").GetTxtFromHtml2().RemoveSpace();
                        // From here on SourceLink holds the text after the "微信认证:" label, not a URL.
                        SourceLink = htmlNo.SubAfter("微信认证:").SubBefore("/div").GetTxtFromHtml2().RemoveSpace();
                        NoIcon     = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("src=").SubBefore("onload").Replace(">", "").Replace('"', ' ').RemoveSpace();
                        QrcodeIcon = htmlNo.SubAfter("img-box").SubBefore("</a").SubAfter("err:").SubBefore(">").Replace(">", "").Replace('"', ' ').Replace("'", "").RemoveSpace();
                    }

                    IW2S_WX_level1link l1 = new IW2S_WX_level1link
                    {
                        BizId           = IDHelper.GetGuid("{0}/{1}/{2}".FormatStr(title, domain, searchTsk.UsrId)),
                        Description     = abs,
                        Domain          = domain,
                        UsrId           = searchTsk.UsrId,
                        LinkUrl         = href,
                        MatchAt         = (byte)part,
                        Title           = title,
                        CreatedAt       = DateTime.Now,
                        DataCleanStatus = 0,
                        Function        = function,
                        SearchkeywordId = searchTsk._id.ToString(),
                        Keywords        = searchTsk.Keyword,
                        PublicNo        = no,
                        QrCode          = qrcode,
                        SourceLink      = SourceLink,
                        TagType         = 0,
                        ImgIcon         = NoIcon,
                        QrCodeIcon      = QrcodeIcon,
                        ProjectId       = searchTsk.ProjectId,
                        TitleImg        = TitleImg,
                        PublishTime     = time,
                        Html            = htmldetail
                    };

                    // 10 = commend keyword matched, 30 = keyword matched, 40 = both.
                    // This value used to be computed and then dropped (the record was only ever
                    // assigned its own MatchType back); it is now stored on the record.
                    // NOTE(review): confirm downstream readers expect this encoding.
                    byte matchType = (byte)((is_bus_matched ? 10 : 0) + (is_itm_title_matched ? 30 : 0));
                    l1.MatchType = matchType;

                    // Score is left at its default when neither keyword matched.
                    if (is_bus_matched && is_itm_title_matched)
                    {
                        l1.Score = 80 + 10;
                    }
                    else if (is_bus_matched)
                    {
                        l1.Score = 80;
                    }
                    else if (is_itm_title_matched)
                    {
                        l1.Score = 50;
                    }

                    result.Add(l1);
                    nohit        = false;
                    nohist_pages = 0;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // Give up after more than 3 consecutive pages without a hit.
                // NOTE(review): with the 3-page cap above this threshold cannot be reached.
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++;

                // Sogou-specific "next page" extraction (original author flagged it for a rewrite).
                link = html.SubAfter("sogou_next").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "http://weixin.sogou.com/weixin".GetContact(link);
                }

                // Persist this page's batch, then reuse the list for the next page.
                SaveResult(result);
                result.Clear();

                Thread.Sleep(rnd.Next(8000, 15000));
            }
            return result;
        }