/// <summary>
/// Adds a single placeholder <c>BiliBiliHtml</c> row and saves it, but only
/// when <c>context.Database.CreateIfNotExists()</c> returns <c>false</c>.
/// </summary>
/// <param name="context">The context whose database is checked and seeded.</param>
public void Build(WebSpiderDbContext context)
{
    bool reportedCreated = context.Database.CreateIfNotExists();
    if (reportedCreated)
    {
        // Nothing is seeded on this path.
        return;
    }

    // NOTE(review): per the EF6 documentation CreateIfNotExists() returns true
    // only when it had to create the database, so this seed runs when the
    // database already existed - confirm that this is the intended condition.
    BiliBiliHtml placeholder = new BiliBiliHtml
    {
        Url = "www.baidu.com",
        FetchingTime = DateTime.Now,
        HtmlContent = "123",
        IsWebStaticSite = true,
        LastModifyTime = DateTime.Now,
        Title = "百度",
        ProtocolType = EnumProtocolType.HTTPS
    };

    context.BiliBiliHtmls.AddOrUpdate(placeholder);
    context.SaveChanges();
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, stores it as a
/// <c>BiliBiliHtml</c> record, then extracts the first-level navigation menu
/// entries from the page and stores one record per entry.
/// </summary>
/// <param name="url">Address of the main site page to fetch and store.</param>
/// <returns>The value returned by <c>_context.SaveChanges()</c>.</returns>
public int InsertBiliBiliWebSiteOfHtml(string url)
{
    // (?<GroupName>RegEx) defines a named group. "HtmlTag" holds the name of the
    // matched opening tag so the closing tag can be matched with \k<HtmlTag>;
    // "Quote" is referenced later in the pattern. A leading '<' is not ambiguous
    // in this position, so escaping it or not makes no difference.
    // "Nested" is a balancing group: pushed on each nested opening tag of the
    // same name and popped on each matching closing tag.
    const string navMenuPattern =
        "<(?<HtmlTag>[\\w]+)[^>]*\\sclass=(?<Quote>[\"']?)nav-menu(?(Quote)\\k<Quote>)[\"']?[^>]*>(((?<Nested><\\k<HtmlTag>[^>]*>)|</\\k<HtmlTag>>(?<-Nested>)|.*?)*)</\\k<HtmlTag>>";

    // Same structure as above, but looks for elements whose class attribute
    // contains "m-i" (the first-level menu items).
    const string menuItemPattern =
        "<(?<HtmlTag>[\\w]+)[^>]*\\sclass=(?<Quote>[\"'].*?)m-i.*?(?(Quote)\\k<Quote>)[\"']?[^>]*>(((?<Nested><\\k<HtmlTag>[^>]*>)|</\\k<HtmlTag>>(?<-Nested>)|.*?)*)</\\k<HtmlTag>>";

    // Captures the href value of an anchor tag into the "href" group.
    const string anchorHrefPattern = @"<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*>";

    string html = GetBiliBiliWebStaticSiteData(url, Encoding.UTF8);

    // Record for the main site page itself; the raw (un-normalized) HTML is stored.
    // NOTE(review): IsWebStaticSite is not set here, unlike the menu records below - confirm.
    DateTime fetchedAt = DateTime.Now;
    BiliBiliHtml mainPage = new BiliBiliHtml();
    mainPage.FetchingTime = fetchedAt;
    mainPage.Title = html.GetPartHtmlString("<title>", "</title>");
    mainPage.Url = url;
    mainPage.LastModifyTime = fetchedAt;
    mainPage.HtmlContent = html;
    mainPage.ProtocolType = Facility.Enums.EnumProtocolType.HTTPS;
    _context.BiliBiliHtmls.Add(mainPage);

    // Collapse every whitespace run to a single space before pattern matching.
    string normalizedHtml = Regex.Replace(html.Trim(), "\\s+", " ");

    // Locate the navigation menu container. When nothing matches, Value is the
    // empty string and the loop below simply finds no items.
    Match navMenu = Regex.Match(normalizedHtml, navMenuPattern, RegexOptions.Singleline);
    string navMenuHtml = navMenu.Value;

    // First-level menu entries.
    MatchCollection menuItems = Regex.Matches(navMenuHtml, menuItemPattern, RegexOptions.Singleline);
    foreach (Match menuItem in menuItems)
    {
        string itemHtml = menuItem.Value;
        Match anchor = Regex.Match(itemHtml, anchorHrefPattern);
        DateTime itemFetchedAt = DateTime.Now;

        BiliBiliHtml menuRecord = new BiliBiliHtml();
        menuRecord.HtmlContent = itemHtml;
        menuRecord.Title = itemHtml.GetPartHtmlString("<em>", "</em>");
        // Groups["href"].Value is the empty string when no anchor was found.
        menuRecord.Url = anchor.Groups["href"].Value;
        menuRecord.FetchingTime = itemFetchedAt;
        menuRecord.LastModifyTime = itemFetchedAt;
        menuRecord.IsWebStaticSite = true;
        menuRecord.ProtocolType = Facility.Enums.EnumProtocolType.HTTPS;
        _context.BiliBiliHtmls.Add(menuRecord);

        // TODO: second-level menus (the <ul> content inside each item) are not
        // extracted or stored; the previous code computed a match collection
        // for them but never read it, so that dead local was removed.
    }

    return _context.SaveChanges();
}