/// <summary>
/// Persists a suggested keyword record: inserts it when no document with the same
/// <c>_id</c> is stored yet, otherwise writes the stored <c>Times</c> value plus one.
/// </summary>
/// <param name="baiduCommend">Record to store; a null argument is ignored.</param>
public void saveBaiduKeyword(IW2S_BaiduCommend baiduCommend)
        {
            if (baiduCommend == null)
            {
                return;
            }

            var collection = MongoDBHelper.Instance.GetIW2S_BaiduCommends();
            var idFilter   = Builders<IW2S_BaiduCommend>.Filter.Eq(x => x._id, baiduCommend._id);

            // Only the id and the counter of a possibly existing document are fetched.
            var stored = collection.Find(idFilter)
                                   .Project(x => new IDIntDto { Id = x._id, Times = x.Times })
                                   .FirstOrDefault();

            // A missing document, or one projected with the all-zero id, counts as "not stored yet".
            bool isNew = stored == null || stored.Id == new MongoDB.Bson.ObjectId("000000000000000000000000");

            if (!isNew)
            {
                // Read-modify-write of the counter: the new value is computed here, not by the server.
                var setTimes = new QueryDocument
                {
                    { "Times", stored.Times + 1 }
                };
                var update = new UpdateDocument
                {
                    { "$set", setTimes }
                };

                MongoDBHelper.Instance.GetIW2S_BaiduCommends().UpdateOne(new QueryDocument
                {
                    { "_id", baiduCommend._id }
                }, update);
                return;
            }

            collection.InsertOne(baiduCommend);
            log("SUCCESS saving keywords {0} for {1}".FormatStr(baiduCommend.CommendKeyword, baiduCommend.Keyword));
        }
        /// <summary>
        /// Builds the Baidu search URL for the task's <c>CommendKeyword</c>.
        /// </summary>
        /// <param name="tsk">Task whose commend keyword is searched.</param>
        /// <returns>
        /// The search URL with the URL-encoded keyword, or an empty string when the
        /// keyword is null or empty after <c>RemoveSpace()</c> and <c>GetLower()</c>.
        /// </returns>
        string get_urls(IW2S_BaiduCommend tsk)
        {
            string normalized = tsk.CommendKeyword.RemoveSpace().GetLower();

            if (string.IsNullOrEmpty(normalized))
            {
                return string.Empty;
            }

            string encoded = normalized.GetUrlEncodedString("utf-8");
            return "http://www.baidu.com/s?ie=utf-8&wd={0}".FormatStr(encoded);
        }
        /// <summary>
        /// Downloads <paramref name="link"/> and saves every related-search suggestion found on it
        /// whose text contains, but is not equal to, the task keyword.
        /// </summary>
        /// <param name="link">Page URL to download; nothing happens when it is null or empty.</param>
        /// <param name="tsk">Keyword task the saved suggestions are attributed to.</param>
        /// <param name="height">Recursion depth; currently unused because the recursive follow-up is disabled.</param>
        void GetLinks(string link, IW2S_BaiduKeyword tsk, int height)
        {
            // Normalize the keyword with the same helpers that are applied to every suggestion title
            // below. Comparing against a keyword that still contains spaces could never match a
            // title whose spaces were removed.
            string searchKeyword = tsk.Keyword.GetLower().RemoveSpace();

            if (string.IsNullOrEmpty(link))
            {
                return;
            }

            log(link);
            var html = get_html(link);
            if (html == null)
            {
                return;
            }

            // Anchor fragments between the related-search header and the pager element.
            var tags = html.SubAfter("相关搜索</div>").SubBefore("id=\"page\"").SplitWith("<a");

            if (tags == null || tags.Length == 0)
            {
                log("BLOCKED " + tsk.Keyword);
                return;
            }

            foreach (var a in tags)
            {
                string title = a.GetTxtFromHtml2().RemoveSpace().GetLower();
                string href  = a.GetFirstHref2();

                // Skip fragments without text or link, the keyword itself, and unrelated suggestions.
                if (string.IsNullOrEmpty(title) || string.IsNullOrWhiteSpace(href) || title == searchKeyword || !title.Contains(searchKeyword))
                {
                    continue;
                }

                IW2S_BaiduCommend baiduCommend = new IW2S_BaiduCommend
                {
                    // The id is derived from task id + suggestion text, so the same suggestion maps to the same document.
                    _id              = "{0}{1}".FormatStr(tsk._id, title).ToObjectId(),
                    CommendKeyword   = title,
                    CreatedAt        = DateTime.UtcNow.AddHours(8), // UTC shifted by +8 hours
                    Keyword          = tsk.Keyword,
                    KeywordId        = tsk._id,
                    UsrId            = tsk.UsrId,
                    BotIntervalHours = 7 * 24,
                    ProjectId        = tsk.ProjectId
                };

                saveBaiduKeyword(baiduCommend);

                // Recursive crawling of the suggestion's own result page is intentionally disabled:
                //if (!string.IsNullOrWhiteSpace(href) && height < 1)
                //{
                //    GetLinks("https://www.baidu.com" + href, tsk, height+1);
                //}
            }
        }
        /// <summary>
        /// Builds the search URL for <paramref name="tsk"/> and crawls it via <c>GetLinks</c>.
        /// Any exception raised while crawling is logged (message plus stack trace) and swallowed.
        /// </summary>
        /// <param name="tsk">Commend-keyword task to crawl.</param>
        /// <param name="excludedKeywords">Exclusion list passed through to <c>GetLinks</c>.</param>
        public void Query(IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords)
        {
            var firstPage = get_urls(tsk);

            try
            {
                GetLinks(firstPage, tsk, excludedKeywords);
            }
            catch (Exception failure)
            {
                log(failure.Message + failure.StackTrace);
            }
        }
        /// <summary>
        /// Runs <c>prehandle_data</c> over the given level-1 links and inserts the survivors into the
        /// level-1 link collection in batches of 100, de-duplicating each batch by <c>BizId</c>.
        /// </summary>
        /// <param name="links">Links to store; may be null or empty.</param>
        /// <param name="tsk">Task the links belong to (used for log messages and pre-handling).</param>
        /// <param name="excludedKeywords">Exclusion list forwarded to <c>prehandle_data</c>.</param>
        public void save_level1_links(List<IW2S_level1link> links,
                                      IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords)
        {
            links = prehandle_data(links, tsk, excludedKeywords);

            if (links == null || links.Count == 0)
            {
                log("SUCCESS saving 0 Level 1 Links for " + tsk.CommendKeyword);
                return;
            }

            const int pagesize = 100;
            var col = MongoDBHelper.Instance.GetIW2S_level1links();

            for (int page = 0; page * pagesize < links.Count; page++)
            {
                var list = links.Skip(page * pagesize).Take(pagesize).ToList();
                list = ListDistinctBy(list, x => x.BizId);

                // NOTE: the lookup of BizIds already present in the collection is disabled in this
                // code base, so only duplicates inside the current batch are removed here.
                if (list == null || list.Count == 0)
                {
                    continue;
                }

                // Insert only the current batch. Inserting the whole input on every page would
                // write the same documents once per page.
                col.InsertMany(list);
                log("SUCCESS saving " + list.Count + " Level 1 Links for " + tsk.CommendKeyword);
            }
        }
Exemple #6
0
        /// <summary>
        /// Loads the exclusion keywords, appends the keyword filters stored for the task's user and
        /// project, and runs an <c>IW2SBaiduQuery</c> for the task. Exceptions are logged by message
        /// and swallowed.
        /// </summary>
        /// <param name="p">Commend-keyword task to process.</param>
        private void query(IW2S_BaiduCommend p)
        {
            try
            {
                var excludeFilters = Builders<IW2S_ExcludeKeyword>.Filter;
                // NOTE(review): this per-user filter is built but never passed to Find below, which
                // uses the empty filter and therefore loads every exclusion keyword — confirm intent.
                var filter = excludeFilters.Eq(x => x.UsrId, p.UsrId);

                var excludedKeywords = MongoDBHelper.Instance.GetIW2S_ExcludeKeywords()
                                                    .Find(excludeFilters.Empty)
                                                    .ToList();

                int loaded = excludedKeywords == null ? 0 : excludedKeywords.Count;
                log("loaded {0} excluding keywords ".FormatStr(loaded));

                // Exclusion keywords are lower-cased in place before use.
                if (excludedKeywords.GetCount() > 0)
                {
                    excludedKeywords.ForEach(item => item.Keyword = item.Keyword.ToLower());
                }

                // Keyword filters of this user and project are appended as extra exclusions.
                var keywordFilters   = Builders<IW2S_KeywordFilter>.Filter;
                var byUserAndProject = keywordFilters.Eq(x => x.UsrId, p.UsrId)
                                       & keywordFilters.Eq(x => x.ProjectId, p.ProjectId);
                var filterKeywords = MongoDBHelper.Instance.GetIW2S_KeywordFilters()
                                                  .Find(byUserAndProject)
                                                  .Project(x => new IW2S_ExcludeKeyword { Keyword = x.Keyword })
                                                  .ToList();
                excludedKeywords.AddRange(filterKeywords);

                try
                {
                    var baidu = new Queries.IW2SBaiduQuery(p.Keyword);
                    baidu.Query(p, excludedKeywords);
                }
                catch (Exception crawlError)
                {
                    log(crawlError.Message);
                }
            }
            catch (Exception setupError)
            {
                log(setupError.Message);
            }
        }
        /// <summary>
        /// Walks Baidu result pages starting at <paramref name="link"/> and hands every promoted and
        /// regular result entry to <c>HanleTagData</c>.
        /// </summary>
        /// <param name="link">URL of the first result page; the loop ends when it is null or empty.</param>
        /// <param name="tsk">Commend-keyword task being crawled.</param>
        /// <param name="excludedKeywords">Exclusion list forwarded to <c>HanleTagData</c>.</param>
        void GetLinks(string link, IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords)
        {
            IW2SBotMng botmng = IW2SBotMng.Instance;

            string[] searchKeywords = tsk.CommendKeyword.GetLower().Trim().Split(';');

            List<KeywordScore> patterns = new List<KeywordScore>
            {
                new KeywordScore { Keyword = tsk.CommendKeyword }
            };

            int nohist_pages = 0;
            int quried_pages = 0;
            int rank         = 1;

            // The condition admits quried_pages 0..3, i.e. at most four result pages per call.
            while (!string.IsNullOrEmpty(link) && quried_pages <= 3)
            {
                log(link);
                var html = get_html(link);
                if (html == null)
                {
                    break;
                }

                // Promoted (paid) entries: blocks introduced by 'div id="400' (or the same marker
                // without the space) that precede the first "c-container".
                var propContents = new List<string>();
                if (!string.IsNullOrEmpty(html.SubAfter("content_left").SubAfter("div id=\"400")))
                {
                    propContents = html.SubAfter("content_left").SubAfter("div id=\"400").SubBefore("c-container").SplitWith("div id=\"400").ToList();
                }
                else if (!string.IsNullOrEmpty(html.SubAfter("content_left").SubAfter("divid=\"400")))
                {
                    propContents = html.SubAfter("content_left").SubAfter("divid=\"400").SubBefore("c-container").SplitWith("divid=\"400").ToList();
                }

                foreach (var tag in propContents)
                {
                    var    a     = tag.SubAfter("h3").SubAfter("a");
                    string title = a.SubBefore("</h3>").GetTxtFromHtml2();
                    if (!string.IsNullOrEmpty(title))
                    {
                        title = title.Trim();
                    }
                    string href = a.GetFirstHref2();

                    string abs = tag.SubAfter("</h3>").SubBefore("</a").GetTxtFromHtml2();
                    // Trim only when there is text; the previous inverted guard called Trim() on a
                    // null/empty value and never trimmed real text.
                    if (!string.IsNullOrEmpty(abs))
                    {
                        abs = abs.Trim();
                    }
                    string domain = string.Empty;

                    // Skip entries that have neither a title nor an abstract.
                    string txt = "{0}{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    // Promoted entries share the current rank and do not advance it.
                    HanleTagData(tsk, excludedKeywords, botmng, searchKeywords, patterns, title, href, abs, ref domain, tag, true, rank);
                }

                // Regular entries: one fragment per "c-container" block.
                var tags = html.SubAfter("content_left").SplitWith("c-container");

                if (tags == null || tags.Length == 0)
                {
                    log("BLOCKED " + tsk.CommendKeyword);
                    break;
                }

                bool nohit = true;
                foreach (string tag in tags)
                {
                    var    a     = tag.SubAfter("h3").SubAfter("a");
                    string title = a.SubBefore("</h3>").GetTxtFromHtml2();
                    if (!string.IsNullOrEmpty(title))
                    {
                        title = title.Trim();
                    }
                    string href = a.GetFirstHref2();

                    string abs = tag.SubAfter("abstract").SubBefore("</div").GetTxtFromHtml2();
                    // Same corrected guard as for promoted entries.
                    if (!string.IsNullOrEmpty(abs))
                    {
                        abs = abs.Trim();
                    }
                    string domain = tag.SubLastStringAfter("\"f13").SubBefore("</span").GetTxtFromHtml2();
                    domain = GetDomain(domain);

                    // Skip entries that have neither a title nor an abstract.
                    string txt = "{0}{1}".FormatStr(title, abs);
                    if (string.IsNullOrEmpty(txt))
                    {
                        continue;
                    }

                    HanleTagData(tsk, excludedKeywords, botmng, searchKeywords, patterns, title, href, abs, ref domain, tag, false, rank);
                    nohit        = false;
                    nohist_pages = 0;
                    rank++;
                }

                if (nohit)
                {
                    nohist_pages++;
                }
                // Stop once more than three consecutive pages produced no entry.
                if (nohist_pages > 3)
                {
                    break;
                }

                quried_pages++;
                pages++; // counter declared outside this method

                // Follow the "next page" link, turning a relative href into an absolute Baidu URL.
                link = html.SubAfter("fk fk_cur").SubBefore("下一页").GetLastHref2();
                if (!string.IsNullOrEmpty(link) && !link.IsStartWith("http"))
                {
                    if (link.IsStartWith("/"))
                    {
                        link = link.SubAfter("/");
                    }
                    link = "http://www.baidu.com/".GetContact(link);
                }
            }
        }
        /// <summary>
        /// Turns one search-result entry into an <c>IW2S_level1link</c> and passes it to
        /// <c>botmng.save_level1_links</c>. The target page is downloaded through
        /// <c>get_htmlUrl</c>; when no page HTML is obtained the entry is dropped.
        /// </summary>
        /// <param name="tsk">Task the entry belongs to; supplies user, project, keyword and id.</param>
        /// <param name="excludedKeywords">Exclusion list forwarded to <c>save_level1_links</c>.</param>
        /// <param name="botmng">Manager whose <c>save_level1_links</c> stores the result.</param>
        /// <param name="searchKeywords">Keywords tested against title and abstract to compute <c>MatchAt</c>.</param>
        /// <param name="patterns">Keyword patterns searched in the page HTML to build the abstract and the score.</param>
        /// <param name="title">Entry title taken from the result page.</param>
        /// <param name="href">Entry link taken from the result page.</param>
        /// <param name="abs">Entry abstract taken from the result page; stored as <c>Description</c>.</param>
        /// <param name="domain">Entry domain; filled from the resolved URL when passed in empty.</param>
        /// <param name="tag">Raw HTML fragment of the entry; inspected for the "c-icon-v1/2/3" markers.</param>
        /// <param name="isMarket">Stored as <c>IsMarket</c> on the saved link.</param>
        /// <param name="rank">Stored as <c>Rank</c> on the saved link.</param>
        private void HanleTagData(IW2S_BaiduCommend tsk, List <IW2S_ExcludeKeyword> excludedKeywords, IW2SBotMng botmng, string[] searchKeywords, List <KeywordScore> patterns, string title, string href, string abs, ref string domain, string tag, bool isMarket, int rank)
        {
            int    maxScore = 0;
            string realUrl = null, detailHtml = null, abstracts = null;
            byte   appType = 0;

            // Level 1..3 when the fragment carries the matching "c-icon-vN" marker, otherwise unset.
            int?baiduVStar = null;

            if (tag.Contains("c-icon-v1"))
            {
                baiduVStar = 1;
            }
            else if (tag.Contains("c-icon-v2"))
            {
                baiduVStar = 2;
            }
            else if (tag.Contains("c-icon-v3"))
            {
                baiduVStar = 3;
            }

            // Download the linked page: Item1 is used as the resolved URL, Item2 as the page HTML.
            if (!string.IsNullOrWhiteSpace(href))
            {
                //Encoding enc = Encoding.UTF8;
                //detailHtml = HtmlQueryHelper.GetContent(href, 8000, ref enc, out realUrl);
                var tuplehtml = get_htmlUrl(href);
                if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                {
                    realUrl = tuplehtml.Item1;
                }
                if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                {
                    detailHtml = tuplehtml.Item2;
                }
                if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                {
                    domain = GetDomain(realUrl);
                }
            }
            // The page contains a script that clicks an element with id "link" (presumably a
            // JavaScript redirect stub — confirm): follow the first href found in it and download again.
            if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
            {
                var gourl = detailHtml.GetFirstHref2();
                if (!string.IsNullOrEmpty(gourl))
                {
                    var tuplehtml = get_htmlUrl(gourl);
                    if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item1))
                    {
                        realUrl = tuplehtml.Item1;
                    }
                    if (tuplehtml != null && !string.IsNullOrEmpty(tuplehtml.Item2))
                    {
                        detailHtml = tuplehtml.Item2;
                    }
                    if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
                    {
                        domain = GetDomain(realUrl);
                    }
                }
            }
            // Fall back to the original link when no resolved URL was obtained.
            if (string.IsNullOrEmpty(realUrl))
            {
                realUrl = href;
            }
            List <KeywordScore> matchpatterns = new List <KeywordScore>();

            // Without page HTML nothing is saved for this entry.
            if (string.IsNullOrEmpty(detailHtml))
            {
                return;
            }
            else
            {
                //if (!detailHtml.Contains(tsk.CommendKeyword))
                //{
                //    return;
                //}
                // NOTE(review): hrefs is computed but not used anywhere below.
                var           hrefs          = detailHtml.GetDescendents("a", "href");
                StringBuilder sbabstracts    = new StringBuilder();
                List <string> abstractlist   = new List <string>();
                StringBuilder sbabstractlist = new StringBuilder();

                foreach (KeywordScore pattern in patterns)
                {
                    // More than one piece after splitting on the keyword means the pattern was found.
                    string[] splitDetailHtmls = detailHtml.SplitWith(pattern.Keyword);
                    if (splitDetailHtmls.Length > 1)
                    {
                        matchpatterns.Add(pattern);
                    }
                    StringBuilder sbpatternStr = new StringBuilder();
                    // Each iteration rebuilds the text around one split point:
                    // trailing part of piece i + keyword + leading part of piece i+1.
                    for (int i = 0; i < splitDetailHtmls.Length - 1; i++)
                    {
                        string splitDetailHtml1 = splitDetailHtmls[i];
                        // NOTE(review): for the last split point (i == Length - 2) the following
                        // piece is replaced by "", so no text after the keyword is collected — confirm intent.
                        string splitDetailHtml2 = i < splitDetailHtmls.Length - 2 ? splitDetailHtmls[i + 1] : "";
                        // Walk backwards from the end of the preceding piece until a character in
                        // split_bef_commas whose predecessor is not in split_num_commas; the
                        // characters are collected in reverse order.
                        for (int j = splitDetailHtml1.Length - 1; j >= 0; j--)
                        {
                            if (split_bef_commas.Contains(splitDetailHtml1[j]) && j - 1 >= 0 && !split_num_commas.Contains(splitDetailHtml1[j - 1]))
                            {
                                break;
                            }
                            sbpatternStr.Append(splitDetailHtml1[j]);
                        }
                        // Undo the reversal while copying into the abstract buffer.
                        for (int q = sbpatternStr.Length - 1; q >= 0; q--)
                        {
                            sbabstracts.Append(sbpatternStr[q]);
                        }
                        sbabstracts.Append(pattern.Keyword);
                        sbpatternStr.Clear();
                        // Walk forwards through the following piece until a character in
                        // split_aft_commas whose successor is not in split_num_commas.
                        for (int j = 0; j < splitDetailHtml2.Length; j++)
                        {
                            if (split_aft_commas.Contains(splitDetailHtml2[j]) && j + 1 < splitDetailHtml2.Length && !split_num_commas.Contains(splitDetailHtml2[j + 1]))
                            {
                                break;
                            }
                            sbpatternStr.Append(splitDetailHtml2[j]);
                        }
                        sbabstracts.Append(sbpatternStr);
                        sbpatternStr.Clear();

                        // Normalize the snippet and keep it only if the same text was not collected before.
                        string tmpsbabstracts = sbabstracts.ToString();
                        tmpsbabstracts = IW2SBaiduQuery.RemoveInivalidChar(tmpsbabstracts.GetTxtFromHtml2().RemoveSpace().GetLower());
                        if (!abstractlist.Contains(tmpsbabstracts))
                        {
                            abstractlist.Add(tmpsbabstracts);
                            sbabstractlist.Append(tmpsbabstracts).Append(" ");
                        }
                        sbabstracts.Clear();
                    }
                }
                // Assemble the abstract from the collected snippets.
                abstracts = sbabstractlist.ToString();
                if (!string.IsNullOrEmpty(abstracts) && matchpatterns.Count > 0)
                {
                    // Score = highest pattern score (50 when unset) plus a tenth of every matched
                    // pattern's score, minus the largest of those tenths.
                    maxScore  = matchpatterns.Max(x => x.Score ?? 50);
                    appType   = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();
                    maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                    maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                }
            }
            // No abstract could be built from the page: score against the result-page abstract instead.
            if (string.IsNullOrEmpty(abstracts) && !string.IsNullOrEmpty(abs))
            {
                matchpatterns = patterns.Where(x => abs.Contains(x.Keyword)).ToList();
                if (matchpatterns.Count > 0)
                {
                    maxScore = matchpatterns.Max(x => x.Score ?? 50);
                    appType  = matchpatterns.Where(x => x.BizType > 0).OrderByDescending(x => x.Score).Select(x => x.BizType).FirstOrDefault();

                    maxScore += matchpatterns.Sum(x => (x.Score ?? 50) / 10);
                    maxScore -= matchpatterns.Max(x => (x.Score ?? 50) / 10);
                }
            }
            // The score is capped at 100.
            if (maxScore > 100)
            {
                maxScore = 100;
            }


            // Record where the search keywords matched: title, abstract, both, or neither.
            bool          is_title_matched = title.GetLower().IsContains2(searchKeywords);
            bool          is_abstr_matched = abs.IsContains2(searchKeywords);
            BaiduItemPart part             = is_title_matched && is_abstr_matched ? BaiduItemPart.TitleAbstract :
                                             is_title_matched ? BaiduItemPart.Title :
                                             is_abstr_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;

            // Dates of the form 20YY-M-D, 20YY/M/D or 20YY年M月D日 found in the page HTML.
            Regex           reg = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");
            MatchCollection mc  = reg.Matches(detailHtml);
            //MatchCollection cols = reg.Matches(item.Html);
            string time = "";

            if (mc.Count > 0)
            {
                foreach (Match x in mc)
                {
                    // Decide whether the date sits in body text or inside markup, code or comments.
                    if (!string.IsNullOrEmpty(x.Value))
                    {
                        var txt    = detailHtml.SubAfter(x.Value);
                        var index1 = txt.IndexOf('<');
                        var index2 = txt.IndexOf('>');
                        var index3 = txt.IndexOf('\"');
                        // Only use a date from body text: the text after it must reach '<' before
                        // both '>' and '"'. IndexOf yields -1 for a character that does not occur.
                        if (index1 < index2 && index1 < index3)
                        {
                            time = x.Value;
                            break;
                        }
                    }
                }
            }


            IW2S_level1link l1 = new IW2S_level1link
            {
                UsrId     = tsk.UsrId,
                Domain    = domain,
                TopDomain = GetLevel1Domain(domain),
                Keywords  = string.Format("{0}", tsk.CommendKeyword),
                LinkUrl   = realUrl,
                MatchAt   = (byte)part,
                Html      = detailHtml,

                AppType         = appType,
                // BizId is derived from the resolved URL together with the task id.
                BizId           = IDHelper.GetGuid("{0}/{1}".FormatStr(realUrl, tsk._id.ToString())),
                SearchkeywordId = tsk._id.ToString(),
                // Current UTC time shifted by +8 hours.
                CreatedAt       = DateTime.UtcNow.AddHours(8),
                Description     = abs,
                Title           = title,
                Score           = maxScore,
                Abstract        = abstracts,
                IsMarket        = isMarket,
                ProjectId       = tsk.ProjectId,
                PublishTime     = time,
                AlternateFields = "0",
                Rank            = rank
            };

            if (baiduVStar.HasValue)
            {
                l1.BaiduVStar = baiduVStar.Value;
            }

            // Each entry is saved on its own as a single-element list.
            botmng.save_level1_links(new List <IW2S_level1link> {
                l1
            }, tsk, excludedKeywords);
        }
Exemple #9
0
        /// <summary>
        /// Prepares crawled links for storage: removes duplicates by <c>LinkUrl</c>, runs
        /// <c>cleaning</c> on every link and drops those marked <c>DataCleanStatus.Excluded</c>.
        /// </summary>
        /// <param name="links">Crawled links; returned unchanged when null or empty.</param>
        /// <param name="tsk">Task the links belong to (used in log messages).</param>
        /// <param name="excludedKeywords">Exclusion list handed to <c>cleaning</c>.</param>
        /// <returns>The links that remain after de-duplication and cleaning.</returns>
        public List<IW2S_level1link> prehandle_data(List<IW2S_level1link> links, IW2S_BaiduCommend tsk, List<IW2S_ExcludeKeyword> excludedKeywords)
        {
            bool nothingToProcess = links == null || links.Count == 0;
            if (nothingToProcess)
            {
                log("BLOCKED " + tsk.CommendKeyword);
                return links;
            }

            links = links.DistinctBy(x => x.LinkUrl);
            log(links.Count + " Level 1 Links for " + tsk.CommendKeyword);

            // A legacy, disabled re-scoring pass (custom keyword scores / fingerprint matching
            // read from a protect-item record) used to sit here; it is no longer executed.

            foreach (var candidate in links)
            {
                cleaning(candidate, excludedKeywords);
            }

            return links.Where(x => x.DataCleanStatus != (byte)DataCleanStatus.Excluded).ToList();
        }