/// <summary>
/// Fetches the detail page behind <paramref name="lnk"/>, stores it as a
/// <c>SpiderArticle</c>, and then adds a copy of it as a <c>BlogPost</c>
/// (SaveStatus 1, i.e. draft) through <c>Maticsoft.BLL.BlogPost</c>.
/// </summary>
/// <param name="lnk">Link to fetch; supplies the template, URL, title and abstract.</param>
/// <param name="sb">Log buffer; receives one line after the article has been inserted.</param>
/// <remarks>
/// Returns without saving anything when <c>isPageExist</c> returns true for the URL
/// or when <c>GetContent</c> returns null.
/// </remarks>
private static void savePageDetail( DetailLink lnk, StringBuilder sb ) {

    SpiderTemplate template = lnk.Template;
    string url = lnk.Url;
    string title = lnk.Title;
    string summary = lnk.Abstract;

    if (isPageExist( url, sb )) return;

    String pageBody = new PagedDetailSpider().GetContent( url, template, sb );
    if (pageBody == null) return;

    // 1) Persist the raw spider article.
    SpiderArticle pd = new SpiderArticle();
    pd.Title = title;
    pd.Url = strUtil.SubString( url, 250 );
    pd.Abstract = summary;
    pd.Body = pageBody;
    pd.SpiderTemplate = template;

    // The first image found in the body (capture group 1 of RegPattern.Img) becomes the article picture.
    Match firstImg = Regex.Match( pageBody, RegPattern.Img, RegexOptions.Singleline );
    if (firstImg.Success) {
        pd.IsPic = 1;
        pd.PicUrl = firstImg.Groups[1].Value;
    }

    pd.insert();
    sb.AppendLine( "保存成功..." + lnk.Title + "_" + lnk.Url );

    // 2) Build the blog copy: remove every "font-size" token and append a link back to the source.
    pageBody = Regex.Replace( pageBody, "font-size", "", RegexOptions.IgnoreCase );

    // Link to the full URL rather than pd.Url (which is cut to 250 characters), and quote the
    // attribute so URLs containing spaces or '>' cannot break the markup. A literal double
    // quote inside the URL is percent-encoded so it cannot close the attribute early.
    string sourceHref = (url ?? "").Replace( "\"", "%22" );
    string strArcitleLink = "<div class=\"ArcitleLink\"><a href=\"" + sourceHref + "\">原文链接</a></div>";
    pageBody = pageBody + strArcitleLink;

    // 3) Resolve the blog category; falls back to category 1 when the lookup yields no row.
    // NOTE(review): template.IsDelete is used as the app/owner/creator id throughout — confirm this is intentional.
    Maticsoft.BLL.BlogCategory bllBlogCategory = new Maticsoft.BLL.BlogCategory();
    DataSet ds = bllBlogCategory.GetList( "AppId = '" + template.IsDelete.ToString() + "'" );
    int nCateID = 1;
    if (ds != null && ds.Tables.Count > 0 && ds.Tables[0].Rows.Count > 0) {
        // Convert instead of unboxing: a direct (int) cast throws when the column is not exactly Int32.
        nCateID = Convert.ToInt32( ds.Tables[0].Rows[0]["Id"] );
    }

    // 4) Store the blog post.
    BlogPost data = new BlogPost();
    data.CategoryId = nCateID;
    data.Title = title;
    data.Abstract = summary;
    data.Content = pageBody;
    data.AccessStatus = 0;
    data.CommentCondition = 0;
    data.SaveStatus = 1; // draft
    data.Created = System.DateTime.Now.Date;
    data.IsTop = 0;
    data.IsPick = 0;
    data.IsPic = 0;
    data.Ip = "";
    data.OwnerId = template.IsDelete;
    data.OwnerUrl = template.SiteName;
    data.OwnerType = "wojilu.Members.Users.Domain.User";
    data.CreatorUrl = template.SiteName;
    data.AppId = template.IsDelete;
    data.CreatorId = template.IsDelete;

    Maticsoft.BLL.BlogPost bll = new Maticsoft.BLL.BlogPost();
    bll.Add( data );
}
/// <summary>
/// Builds a <c>DetailLink</c> from one regex match, or returns null when the match
/// should be skipped.
/// </summary>
/// <param name="match">Match whose group 1 is the URL, group 2 the title and (optional) group 3 the abstract.</param>
/// <param name="s">Template the link belongs to; supplies <c>ListPattern</c> and <c>SiteUrl</c>.</param>
/// <returns>
/// The populated link, or null when the URL does not match the template pattern,
/// contains "javascript:", starts with "#", or when the tag-stripped title is empty
/// or is one of the "more" placeholders.
/// </returns>
private static DetailLink getDetailLink( Match match, SpiderTemplate s ) {

    string url = match.Groups[1].Value;
    string title = match.Groups[2].Value;

    // Check whether the URL satisfies the user-defined wildcard-style pattern.
    if (!Regex.IsMatch( url, ParseUrl( s.ListPattern ), RegexOptions.Singleline )) return null;

    // Scheme checks are ordinal and case-insensitive so "JavaScript:" is rejected as well.
    if (url.IndexOf( "javascript:", StringComparison.OrdinalIgnoreCase ) >= 0) return null;
    if (url.StartsWith( "#", StringComparison.Ordinal )) return null;

    // Strip markup from the title, then drop empty titles and "more" placeholders.
    title = Regex.Replace( title, "<.+?>", "" );
    if (strUtil.IsNullOrEmpty( title )) return null;
    if (title == "更多" || title == "more" || title == "更多>>") return null;

    // Group 3 exists only when the pattern defines at least three capture groups
    // (Groups[0] is the whole match, hence Count > 3).
    string summary = match.Groups.Count > 3 ? match.Groups[3].Value : "";

    // Anything that does not start with "http" (any casing) is joined onto the site URL.
    if (!url.StartsWith( "http", StringComparison.OrdinalIgnoreCase )) {
        url = strUtil.Join( s.SiteUrl, url );
    }

    DetailLink lnk = new DetailLink();
    lnk.Template = s;
    lnk.Url = url;
    lnk.Title = title;
    lnk.Abstract = summary;

    return lnk;
}
/// <summary>
/// Fetches the detail page behind <paramref name="lnk"/> and inserts it as a
/// <c>SpiderArticle</c>.
/// </summary>
/// <param name="lnk">Link to fetch; supplies the template, URL, title and abstract.</param>
/// <param name="sb">Log buffer; receives one line after the article has been inserted.</param>
/// <remarks>
/// Nothing is saved when <c>isPageExist</c> returns true for the URL or when
/// <c>GetContent</c> returns null.
/// </remarks>
private static void savePageDetail( DetailLink lnk, StringBuilder sb ) {

    SpiderTemplate tpl = lnk.Template;
    string pageUrl = lnk.Url;
    string pageTitle = lnk.Title;
    string pageAbstract = lnk.Abstract;

    if (isPageExist( pageUrl, sb )) {
        return;
    }

    String content = new PagedDetailSpider().GetContent( pageUrl, tpl, sb );
    if (content == null) {
        return;
    }

    SpiderArticle article = new SpiderArticle {
        Title = pageTitle,
        Url = strUtil.SubString( pageUrl, 200 ),
        Abstract = pageAbstract,
        Body = content,
        SpiderTemplate = tpl
    };

    // The first image in the body (capture group 1 of RegPattern.Img) is used as the picture.
    Match firstImage = Regex.Match( content, RegPattern.Img, RegexOptions.Singleline );
    if (firstImage.Success) {
        article.IsPic = 1;
        article.PicUrl = firstImage.Groups[1].Value;
    }

    article.insert();

    sb.AppendLine( "保存成功..." + lnk.Title + "_" + lnk.Url );
}
/// <summary>
/// Collects the links of a downloaded page, keeps those whose normalized URL matches
/// the template's <c>ListPattern</c>, and appends one <c>DetailLink</c> per distinct
/// URL to <paramref name="list"/>.
/// </summary>
/// <param name="strReturnPage">Page content handed to <c>GetHtmlDocument</c>.</param>
/// <param name="s">Template supplying <c>ListPattern</c> (used as a regex) and <c>ListUrl</c> (used to derive the base URL).</param>
/// <param name="list">Receives the discovered links; existing entries are left untouched.</param>
protected static void SaveUrlToDB(string strReturnPage, SpiderTemplate s, List<DetailLink> list)
{
    string strUrlFilterRule = s.ListPattern;

    HtmlAgilityPack.HtmlDocument htmlDoc = GetHtmlDocument(strReturnPage);
    string baseUrl = GetUrlLeftPart(s.ListUrl);
    DocumentWithLinks links = htmlDoc.GetLinks();

    // Normalized URLs already emitted during this call; a set keeps the duplicate check O(1).
    HashSet<string> seenLinks = new HashSet<string>();

    foreach (string link in links.Links.Union(links.References))
    {
        if (string.IsNullOrEmpty(link))
        {
            continue;
        }

        string normalizedLink = GetNormalizedLink(baseUrl, link);
        if (string.IsNullOrEmpty(normalizedLink))
        {
            continue;
        }

        if (!Regex.IsMatch(normalizedLink, strUrlFilterRule, RegexOptions.Singleline))
        {
            continue;
        }

        // Skip repeats before doing the (linear) link-text lookup below.
        if (!seenLinks.Add(normalizedLink))
        {
            continue;
        }

        // Link text: first key that contains the normalized URL wins.
        string strLinkText = "";
        foreach (string strTemp in links.m_dicLink2Text.Keys)
        {
            if (strTemp.Contains(normalizedLink))
            {
                strLinkText = links.m_dicLink2Text[strTemp];
                break;
            }
        }

        // Fallback: look the raw link up directly, then its lower-cased form
        // (the lower-cased entry takes precedence when both exist).
        if (strLinkText == "")
        {
            if (links.m_dicLink2Text.Keys.Contains(link))
            {
                strLinkText = links.m_dicLink2Text[link].Trim();
            }

            string loweredLink = link.ToLower();
            if (links.m_dicLink2Text.Keys.Contains(loweredLink))
            {
                strLinkText = links.m_dicLink2Text[loweredLink].Trim();
            }
        }

        DetailLink lnk = new DetailLink();
        lnk.Template = s;
        lnk.Url = normalizedLink;
        lnk.Title = strLinkText;
        list.Add(lnk);
    }
}
/// <summary>
/// Builds a <c>DetailLink</c> from one regex match, or returns null when the match
/// should be skipped.
/// </summary>
/// <param name="match">Match whose group 1 is the URL, group 2 the title and (optional) group 3 the abstract.</param>
/// <param name="s">Template the link belongs to; supplies <c>SiteUrl</c> for relative URLs.</param>
/// <returns>
/// The populated link, or null when the URL contains "javascript:" or starts with "#",
/// or when the tag-stripped title is empty or is one of the "more" placeholders.
/// </returns>
private static DetailLink getDetailLink( Match match, SpiderTemplate s ) {

    string url = match.Groups[1].Value;
    string title = match.Groups[2].Value;

    // Scheme checks are ordinal and case-insensitive so "JavaScript:" is rejected as well.
    if (url.IndexOf( "javascript:", StringComparison.OrdinalIgnoreCase ) >= 0) return null;
    if (url.StartsWith( "#", StringComparison.Ordinal )) return null;

    // Strip markup from the title, then drop empty titles and "more" placeholders.
    title = Regex.Replace( title, "<.+?>", "" );
    if (strUtil.IsNullOrEmpty( title )) return null;
    if (title == "更多" || title == "more" || title == "更多>>") return null;

    // Group 3 exists only when the pattern defines at least three capture groups
    // (Groups[0] is the whole match, hence Count > 3).
    string summary = match.Groups.Count > 3 ? match.Groups[3].Value : "";

    // Anything that does not start with "http" (any casing) is joined onto the site URL.
    if (!url.StartsWith( "http", StringComparison.OrdinalIgnoreCase )) {
        url = strUtil.Join( s.SiteUrl, url );
    }

    DetailLink lnk = new DetailLink();
    lnk.Template = s;
    lnk.Url = url;
    lnk.Title = title;
    lnk.Abstract = summary;

    return lnk;
}