/// <summary>
        /// Processes one search-result entry: detects the Baidu "blue V" level from the result markup,
        /// resolves the real URL and page source behind the result link (following one script-driven
        /// redirect page if present), extracts main content and publish time, and saves the link.
        /// Nothing is saved when no page source could be obtained.
        /// </summary>
        /// <param name="tsk">Keyword record; its Keyword and _id values are copied onto the saved link.</param>
        /// <param name="excludedDomains">Excluded-domain list (not used by this method).</param>
        /// <param name="searchKeywords">Search keywords matched against the title and description.</param>
        /// <param name="title">Result title.</param>
        /// <param name="href">Result link; may be a redirect link.</param>
        /// <param name="description">Result description.</param>
        /// <param name="domain">
        /// Domain of the result. When empty on entry it is derived from the resolved URL; any
        /// "http://" or "https://" text is removed from it before the method continues.
        /// </param>
        /// <param name="tag">Source markup of the search result.</param>
        /// <param name="isMarket">Whether the result is a promotion link.</param>
        private void HanleTagData(Dnl_Keyword tsk, List <Dnl_IgnoreDomain> excludedDomains, string searchKeywords, string title, string href, string description, ref string domain, string tag, bool isMarket)
        {
            string realUrl = null, detailHtml = null;     // resolved URL, page source

            // Baidu "blue V" level, taken from the icon class found in the result markup.
            int? baiduVStar = null;

            if (tag.Contains("c-icon-v1"))
            {
                baiduVStar = 1;
            }
            else if (tag.Contains("c-icon-v2"))
            {
                baiduVStar = 2;
            }
            else if (tag.Contains("c-icon-v3"))
            {
                baiduVStar = 3;
            }

            // Resolve the real URL, page source and domain from the result link.
            if (!string.IsNullOrWhiteSpace(href))
            {
                ResolveLinkTarget(href, ref realUrl, ref detailHtml, ref domain);
            }

            // The fetched page may itself be a redirect page that clicks a link via script;
            // if so, follow its first link once more.
            if (!string.IsNullOrEmpty(detailHtml) && detailHtml.Contains("document.getElementById(\"link\").click()"))
            {
                var gourl = detailHtml.GetFirstHref2();
                if (!string.IsNullOrEmpty(gourl))
                {
                    ResolveLinkTarget(gourl, ref realUrl, ref detailHtml, ref domain);
                }
            }

            // Strip the scheme prefix and look up the domain's collection number.
            long collectionNum = 0;

            if (!string.IsNullOrEmpty(domain))
            {
                domain        = SchemePrefixRegex.Replace(domain, "");
                collectionNum = GetDomainCollectionNum(domain);
            }

            // Fall back to the original link when no real URL was resolved.
            if (string.IsNullOrEmpty(realUrl))
            {
                realUrl = href;
            }

            // Without page source there is nothing to analyse or store.
            if (string.IsNullOrEmpty(detailHtml))
            {
                return;
            }

            string content = GetMainContentHelper.GetMainContent(detailHtml);         // main text of the page

            bool          is_title_matched = title.IsContains2(searchKeywords);       // title matches the keywords
            bool          is_desc_matched  = description.IsContains2(searchKeywords); // description matches the keywords
            BaiduItemPart part             = is_title_matched && is_desc_matched ? BaiduItemPart.TitleAbstract :
                                             is_title_matched ? BaiduItemPart.Title :
                                             is_desc_matched ? BaiduItemPart.Abstract : BaiduItemPart.None;

            string time = ExtractPublishTime(tag, detailHtml);

            // Build the link record.
            Dnl_Link_Baidu link = new Dnl_Link_Baidu
            {
                Domain          = domain,
                TopDomain       = GetLevel1Domain(domain),
                Keywords        = tsk.Keyword,
                LinkUrl         = realUrl,
                MatchAt         = (byte)part,
                Html            = detailHtml,
                SearchkeywordId = tsk._id.ToString(),
                CreatedAt       = DateTime.UtcNow.AddHours(8),
                Description     = description,
                Title           = title,
                IsPromotion     = isMarket,
                PublishTime     = time,
                Content         = content,
                DCNum           = collectionNum
            };

            if (baiduVStar.HasValue)
            {
                link.BaiduVStar = baiduVStar.Value;
            }

            SaveLink(link, tsk);
        }

        // Matches the "http://" / "https://" text removed from domains.
        private static readonly Regex SchemePrefixRegex = new Regex("http://|https://");

        // Matches dates such as 2019-1-2, 2019/01/02 or 2019年1月2日.
        private static readonly Regex PublishTimeRegex = new Regex("(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)");

        // Fetches url and overwrites realUrl / detailHtml with any non-empty values returned;
        // fills domain from the resolved URL when domain is still empty.
        private void ResolveLinkTarget(string url, ref string realUrl, ref string detailHtml, ref string domain)
        {
            var page = get_htmlUrl(url);
            if (page != null)
            {
                if (!string.IsNullOrEmpty(page.Item1))
                {
                    realUrl = page.Item1;
                }
                if (!string.IsNullOrEmpty(page.Item2))
                {
                    detailHtml = page.Item2;
                }
            }
            if (!string.IsNullOrEmpty(realUrl) && string.IsNullOrEmpty(domain))
            {
                domain = GetDomain(realUrl);
            }
        }

        // Returns the publish time: the date found in the search-result markup when that markup has a
        // time section, otherwise the first date in the page source that sits in body text.
        // Returns an empty string when no date qualifies.
        private static string ExtractPublishTime(string tag, string detailHtml)
        {
            // Prefer the time shown in the search result itself.
            string timeStr = tag.SubAfter("newTimeFactor_before_abs").SubBefore("</span>");
            if (!string.IsNullOrEmpty(timeStr))
            {
                return PublishTimeRegex.Match(timeStr).Value;
            }

            foreach (Match candidate in PublishTimeRegex.Matches(detailHtml))
            {
                // Scan from this match's own end so repeated dates are each judged at their own position.
                int start      = candidate.Index + candidate.Length;
                int nextOpen   = IndexOfOrMax(detailHtml, '<', start);
                int nextClose  = IndexOfOrMax(detailHtml, '>', start);
                int nextQuote  = IndexOfOrMax(detailHtml, '"', start);

                // A date is treated as body text when the next tag opening comes before both the next
                // tag close and the next double quote (i.e. it is not inside a tag or attribute value).
                if (nextOpen < nextClose && nextOpen < nextQuote)
                {
                    return candidate.Value;
                }
            }
            return "";
        }

        // IndexOf that reports "not found" as int.MaxValue, so a missing character never compares
        // as being located before a character that is present.
        private static int IndexOfOrMax(string text, char value, int startIndex)
        {
            int index = text.IndexOf(value, startIndex);
            return index < 0 ? int.MaxValue : index;
        }
Exemple #2
0
 /// <summary>
 /// Downloads the page at <paramref name="Url"/>, extracts its title and main content, and writes
 /// the content as UTF-8 to a ".html" file named after the cleaned-up title inside BaseDir.
 /// The page is first read as GBK; when the page declares a different charset it is downloaded
 /// again with that encoding. Nothing is written when the page has no title or when the extracted
 /// content is shorter than 300 characters. Any exception is logged and swallowed.
 /// </summary>
 /// <param name="Url">Page address. "http://" is prepended when it has no http/https scheme.</param>
 private void GetBaiduUrlContent(string Url)
 {
     try {
         if (string.IsNullOrEmpty(Url))
         {
             return;
         }
         Log4Helper.Write(LogLevel.Debug, Url);
         string path = this.BaseDir;

         // Normalise the address once and use it for every request. Only a missing scheme is
         // patched; an existing https:// address must not be prefixed again.
         string requestUrl = Url;
         if (!requestUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) &&
             !requestUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
         {
             requestUrl = "http://" + requestUrl;
         }

         string input = DownloadPageText(requestUrl, Encoding.GetEncoding("gbk"));

         // Re-download only when the page declares a charset other than GBK. A page without a
         // declared charset keeps the GBK-decoded text instead of failing on an empty name.
         // An unrecognised charset name still throws and is handled by the catch below.
         Match  charsetMatch = Regex.Match(input, "charset=\"?(.+?)\"");
         string EnCodeName   = charsetMatch.Success ? charsetMatch.Groups[1].Value.Trim() : string.Empty;
         if (EnCodeName.Length > 0 && !string.Equals(EnCodeName, "gbk", StringComparison.OrdinalIgnoreCase))
         {
             input = DownloadPageText(requestUrl, Encoding.GetEncoding(EnCodeName));
         }

         Match titleMatch = Regex.Match(input, "<title>(.+?)</title>", RegexOptions.IgnoreCase);
         if (!titleMatch.Success)
         {
             return;
         }
         // Keep only the part of the title before the first '_' and the first '-'.
         string Title = titleMatch.Groups[1].Value.Split(new char[] { '_' })[0].Split(new char[] { '-' })[0];

         string Content = GetMainContentHelper.GetMainContent(input);
         if (Content.Length < 300)
         {
             Log4Helper.Write(LogLevel.Error, "未能提取到正文线程号");
             return;
         }
         if (!Directory.Exists(path))
         {
             Directory.CreateDirectory(path);
         }

         string fileName = BuildFileNameFromTitle(Title) + ".html";
         using (FileStream stream = new FileStream(Path.Combine(path, fileName), FileMode.Create))
         using (StreamWriter writer = new StreamWriter(stream, Encoding.GetEncoding("utf-8")))
         {
             writer.Write(Content);
         }
     }
     catch (Exception ex) {
         Log4Helper.Write(LogLevel.Error, ex);
     }
 }

 // Downloads url as text using the given fallback encoding; the client is disposed on every path.
 private static string DownloadPageText(string url, Encoding encoding)
 {
     using (WebClient client = new WebClient())
     {
         client.Encoding = encoding;
         client.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");
         return client.DownloadString(url);
     }
 }

 // Turns a page title into a file name: drops slashes, backslashes, '.', ',', '、' and whitespace,
 // and replaces '*' and any other character the platform forbids in file names with '_'.
 private static string BuildFileNameFromTitle(string title)
 {
     char[]        invalidChars = Path.GetInvalidFileNameChars();
     StringBuilder builder      = new StringBuilder(title.Length);
     foreach (char c in title)
     {
         if (c == '\\' || c == '/' || c == '.' || c == ',' || c == '、' || char.IsWhiteSpace(c))
         {
             continue;
         }
         builder.Append(c == '*' || Array.IndexOf(invalidChars, c) >= 0 ? '_' : c);
     }
     return builder.ToString();
 }