Beispiel #1
0
        /// <summary>
        /// Calls <c>CreateIfNotExists</c> on the context's database and, when that call
        /// returns <c>false</c>, adds or updates one sample <c>BiliBiliHtml</c> row and saves it.
        /// </summary>
        /// <param name="context">Context whose database is checked and that receives the sample row.</param>
        public void Build(WebSpiderDbContext context)
        {
            bool createdNow = context.Database.CreateIfNotExists();
            if (createdNow)
            {
                // NOTE(review): assuming EF's Database.CreateIfNotExists (true only when the
                // database was just created), a freshly created database is left unseeded and
                // the sample row is written only when the database already existed.
                // Confirm this is the intended direction.
                return;
            }

            // Sample row; properties are assigned in the same order as before, and the two
            // timestamps are still read from the clock separately.
            BiliBiliHtml samplePage = new BiliBiliHtml();
            samplePage.Url = "www.baidu.com";
            samplePage.FetchingTime = DateTime.Now;
            samplePage.HtmlContent = "123";
            samplePage.IsWebStaticSite = true;
            samplePage.LastModifyTime = DateTime.Now;
            samplePage.Title = "百度";
            samplePage.ProtocolType = EnumProtocolType.HTTPS;

            context.BiliBiliHtmls.AddOrUpdate(samplePage);
            context.SaveChanges();
        }
        /// <summary>
        /// Adds main-site data: fetches the page at <paramref name="url"/>, queues one record for
        /// the page itself plus one record for each first-level navigation menu entry matched in
        /// its HTML, and saves them through <c>_context</c>.
        /// </summary>
        /// <param name="url">
        /// Address passed to <c>GetBiliBiliWebStaticSiteData</c> and stored as the main record's <c>Url</c>.
        /// </param>
        /// <returns>The value returned by <c>_context.SaveChanges()</c>.</returns>
        public int InsertBiliBiliWebSiteOfHtml(string url)
        {
            string html = GetBiliBiliWebStaticSiteData(url, Encoding.UTF8);

            // Main-site record. Title and HtmlContent use the HTML exactly as fetched,
            // before whitespace is collapsed below.
            BiliBiliHtml data = new BiliBiliHtml();

            data.FetchingTime   = DateTime.Now;
            data.Title          = html.GetPartHtmlString("<title>", "</title>");
            data.Url            = url;
            data.LastModifyTime = data.FetchingTime;
            data.HtmlContent    = html;
            data.ProtocolType   = Facility.Enums.EnumProtocolType.HTTPS;
            _context.BiliBiliHtmls.Add(data);

            // Collapse every whitespace run to a single space before running the menu patterns.
            html = Regex.Replace(html.Trim(), "\\s+", " ");

            // '<' is a special character in regular expressions: inside an explicit capture group it
            // encloses the group name. A leading '<' is not ambiguous in this context, so escaping it
            // or not makes no difference.
            // (?<GroupName>RegEx) defines a named group. HtmlTag stores the matched HTML tag name and
            // Quote stores the optional quote character so later parts of the pattern can refer to it.
            // The Nested / (?<-Nested>) pair is .NET balancing-group syntax, presumably meant to
            // track nested tags of the same name.
            Match m = Regex.Match(html, "<(?<HtmlTag>[\\w]+)[^>]*\\sclass=(?<Quote>[\"']?)nav-menu(?(Quote)\\k<Quote>)[\"']?[^>]*>(((?<Nested><\\k<HtmlTag>[^>]*>)|</\\k<HtmlTag>>(?<-Nested>)|.*?)*)</\\k<HtmlTag>>", RegexOptions.Singleline);

            // Whole text of the matched element; an empty string when nothing matched,
            // in which case the loop below has nothing to iterate over.
            string str1 = m.Groups[0].Value;

            // First-level menu entries: elements inside the matched block whose class attribute contains "m-i".
            MatchCollection mc = Regex.Matches(str1, "<(?<HtmlTag>[\\w]+)[^>]*\\sclass=(?<Quote>[\"'].*?)m-i.*?(?(Quote)\\k<Quote>)[\"']?[^>]*>(((?<Nested><\\k<HtmlTag>[^>]*>)|</\\k<HtmlTag>>(?<-Nested>)|.*?)*)</\\k<HtmlTag>>", RegexOptions.Singleline);

            foreach (Match item in mc)
            {
                BiliBiliHtml bilimenu = new BiliBiliHtml();
                bilimenu.HtmlContent = item.Value;
                bilimenu.Title       = item.Value.GetPartHtmlString("<em>", "</em>");

                // href of the first anchor in the entry; Groups["href"].Value is empty when no anchor matches.
                Match murl = Regex.Match(item.Value, @"<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*>");
                bilimenu.Url             = murl.Groups["href"].Value;
                bilimenu.FetchingTime    = DateTime.Now;
                bilimenu.LastModifyTime  = bilimenu.FetchingTime;
                bilimenu.IsWebStaticSite = true;
                bilimenu.ProtocolType    = Facility.Enums.EnumProtocolType.HTTPS;
                _context.BiliBiliHtmls.Add(bilimenu);

                // Second-level menus (the <ul> contents of each entry) are not extracted or stored here.
            }

            // Persist the main-site record and every menu record in one call.
            return _context.SaveChanges();
        }