/// <summary>
/// Converts one list-page regex match into a <see cref="DetailLink"/>, or returns null
/// when the link should be skipped.
/// </summary>
/// <param name="match">Match whose group 1 holds the link URL, group 2 the link text and,
/// when the pattern defines it, group 3 a summary.</param>
/// <param name="s">Template supplying the list pattern the URL must satisfy and the site URL
/// used to resolve relative links.</param>
/// <returns>
/// The populated link, or null when the URL does not match the list pattern, is a
/// javascript: or fragment link, or the title is empty or a "more" placeholder.
/// </returns>
private static DetailLink getDetailLink(Match match, SpiderTemplate s)
{
    string url = match.Groups[1].Value;
    string title = match.Groups[2].Value;

    // Check that the URL satisfies the user-defined wildcard-style pattern.
    // IsMatch stops at the first hit instead of enumerating every match.
    if (!Regex.IsMatch(url, ParseUrl(s.ListPattern), RegexOptions.Singleline))
    {
        return null;
    }

    // URL schemes are case-insensitive, so "JavaScript:" must be rejected as well.
    if (url.IndexOf("javascript:", System.StringComparison.OrdinalIgnoreCase) >= 0)
    {
        return null;
    }

    if (url.StartsWith("#", System.StringComparison.Ordinal))
    {
        return null;
    }

    // Drop any markup nested inside the anchor text.
    title = Regex.Replace(title, "<.+?>", "");
    if (strUtil.IsNullOrEmpty(title))
    {
        return null;
    }

    // "More..." navigation links are not articles.
    if (title == "更多" || title == "more" || title == "更多>>")
    {
        return null;
    }

    // Groups also counts group 0 (the whole match), so group 3 only exists when Count > 3.
    string summary = "";
    if (match.Groups.Count > 3)
    {
        summary = match.Groups[3].Value;
    }

    // Anything that does not already start with "http" is resolved against the site URL.
    if (!url.StartsWith("http", System.StringComparison.OrdinalIgnoreCase))
    {
        url = strUtil.Join(s.SiteUrl, url);
    }

    DetailLink lnk = new DetailLink();
    lnk.Template = s;
    lnk.Url = url;
    lnk.Title = title;
    lnk.Abstract = summary;
    return lnk;
}
/// <summary>
/// Extracts detail links from a list page using the template's own list pattern.
/// </summary>
/// <param name="s">Template whose ListPattern is applied to the page.</param>
/// <param name="page">Page text to scan; a null or empty page yields an empty list.</param>
/// <param name="sb">Log buffer; receives one line with the raw number of pattern matches.</param>
/// <returns>Accepted links, collected from the last match to the first.</returns>
public static List<DetailLink> getListItem(SpiderTemplate s, string page, StringBuilder sb)
{
    List<DetailLink> accepted = new List<DetailLink>();
    if (strUtil.IsNullOrEmpty(page))
    {
        return accepted;
    }

    MatchCollection found = Regex.Matches(page, s.ListPattern, RegexOptions.Singleline);
    sb.AppendLine("共抓取到链接:" + found.Count);

    // Walk the matches backwards so the result is in reverse page order.
    int index = found.Count;
    while (--index >= 0)
    {
        DetailLink candidate = getDetailLink(found[index], s);

        // Keep only links that were accepted and whose URL is at most 100 characters.
        if (candidate != null && candidate.Url.Length <= 100)
        {
            accepted.Add(candidate);
        }
    }

    return accepted;
}
/// <summary>
/// Extracts detail links from a list page by scanning it with the global
/// SpiderConfig.ListLinkPattern and filtering each hit through getDetailLink.
/// </summary>
/// <param name="s">Template handed to getDetailLink for every matched link.</param>
/// <param name="page">Page text to scan; a null or empty page yields an empty list.</param>
/// <param name="sb">Log buffer; receives one line with the number of links kept.</param>
/// <returns>Accepted links, collected from the last match to the first.</returns>
public static List<DetailLink> getListItem(SpiderTemplate s, string page, StringBuilder sb)
{
    List<DetailLink> kept = new List<DetailLink>();
    if (strUtil.IsNullOrEmpty(page))
    {
        return kept;
    }

    // Grab every link on the page.
    MatchCollection anchors = Regex.Matches(page, SpiderConfig.ListLinkPattern, RegexOptions.Singleline);
    if (anchors.Count == 0)
    {
        logger.Error("list link match count=0");
    }

    // Iterate from the last match down to the first.
    for (int remaining = anchors.Count; remaining-- > 0;)
    {
        DetailLink candidate = getDetailLink(anchors[remaining], s);
        bool usable = candidate != null && candidate.Url.Length <= 100;
        if (usable)
        {
            kept.Add(candidate);
        }
    }

    sb.AppendLine("共抓取到链接:" + kept.Count);
    return kept;
}
/// <summary>
/// Converts one link regex match into a <see cref="DetailLink"/>, or returns null
/// when the link should be skipped.
/// </summary>
/// <param name="match">Match whose group 1 holds the link URL, group 2 the link text and,
/// when the pattern defines it, group 3 a summary.</param>
/// <param name="s">Template attached to the result; its SiteUrl is used to resolve relative links.</param>
/// <returns>
/// The populated link, or null for javascript: links, fragment links, and links whose
/// title is empty or a "more" placeholder.
/// </returns>
private static DetailLink getDetailLink(Match match, SpiderTemplate s)
{
    string url = match.Groups[1].Value;
    string title = match.Groups[2].Value;

    // URL schemes are case-insensitive, so "JavaScript:" must be rejected as well.
    if (url.IndexOf("javascript:", System.StringComparison.OrdinalIgnoreCase) >= 0)
    {
        return null;
    }

    if (url.StartsWith("#", System.StringComparison.Ordinal))
    {
        return null;
    }

    // Drop any markup nested inside the anchor text.
    title = Regex.Replace(title, "<.+?>", "");
    if (strUtil.IsNullOrEmpty(title))
    {
        return null;
    }

    // "More..." navigation links are not articles.
    if (title == "更多" || title == "more" || title == "更多>>")
    {
        return null;
    }

    // Groups also counts group 0 (the whole match), so group 3 only exists when Count > 3.
    string summary = "";
    if (match.Groups.Count > 3)
    {
        summary = match.Groups[3].Value;
    }

    // Anything that does not already start with "http" is resolved against the site URL.
    if (!url.StartsWith("http", System.StringComparison.OrdinalIgnoreCase))
    {
        url = strUtil.Join(s.SiteUrl, url);
    }

    DetailLink lnk = new DetailLink();
    lnk.Template = s;
    lnk.Url = url;
    lnk.Title = title;
    lnk.Abstract = summary;
    return lnk;
}
/// <summary>
/// Creates a clone of this instance: a memberwise copy that receives a new ID and
/// whose action buttons, items, links, buttons and layer style are themselves cloned.
/// </summary>
/// <returns>The cloned instance.</returns>
protected virtual object CloneOverride()
{
    var copy = (iLayer)MemberwiseClone();
    copy.ID = Guid.NewGuid().ToString();
    copy.ActionButtons = new List<Button>(ActionButtons.Select(button => button.Clone()));

    // Clone every item; while doing so, re-point the copy's FocusedItem at the clone
    // that corresponds to the item (or list element) focused on this instance.
    copy.Items = new List<iLayerItem>(Items.Select(source =>
    {
        var duplicate = source.Clone();

        if (source == FocusedItem)
        {
            copy.FocusedItem = duplicate;
            return duplicate;
        }

        // The focused item may live inside an item that is itself a list.
        var nested = source as IList;
        if (nested == null)
        {
            return duplicate;
        }

        int position = nested.IndexOf(FocusedItem);
        if (position >= 0)
        {
            copy.FocusedItem = ((IList)duplicate)[position];
        }

        return duplicate;
    }));

    if (BackButton != null)
    {
        copy.BackButton = BackButton.Clone();
    }

    if (CompositeLayerLink != null)
    {
        copy.CompositeLayerLink = CompositeLayerLink.Clone();
    }

    if (CompositeActionButton != null)
    {
        copy.CompositeActionButton = CompositeActionButton.Clone();
    }

    if (DetailLink != null)
    {
        copy.DetailLink = DetailLink.Clone();
    }

    // The handler reference is carried over as-is rather than cloned.
    if (FieldValuesRequested != null)
    {
        copy.FieldValuesRequested = FieldValuesRequested;
    }

    copy.LayerStyle = LayerStyle.Clone();
    return copy;
}
/// <summary>
/// Downloads the page behind a detail link and stores it as a <see cref="SpiderArticle"/>.
/// Nothing is stored when isPageExist reports the URL as present or when no content is returned.
/// </summary>
/// <param name="lnk">Link carrying the URL, title, abstract and template of the article.</param>
/// <param name="sb">Log buffer; passed to the existence check and the downloader, and
/// receives a success line after the article is inserted.</param>
private static void savePageDetail(DetailLink lnk, StringBuilder sb)
{
    SpiderTemplate sourceTemplate = lnk.Template;
    string address = lnk.Url;

    if (isPageExist(address, sb))
    {
        return;
    }

    String content = new PagedDetailSpider().GetContent(address, sourceTemplate, sb);
    if (content == null)
    {
        return;
    }

    SpiderArticle article = new SpiderArticle();
    article.Title = lnk.Title;
    // The stored URL is capped at 200 characters.
    article.Url = strUtil.SubString(address, 200);
    article.Abstract = lnk.Abstract;
    article.Body = content;
    article.SpiderTemplate = sourceTemplate;

    // Use the first image found in the body, if any, as the article picture.
    Match firstImage = Regex.Match(content, RegPattern.Img, RegexOptions.Singleline);
    if (firstImage.Success)
    {
        article.IsPic = 1;
        article.PicUrl = firstImage.Groups[1].Value;
    }

    article.insert();
    sb.AppendLine("保存成功..." + lnk.Title + "_" + lnk.Url);
}
/// <summary>
/// Downloads the page behind a detail link, stores it as a <see cref="SpiderArticle"/>,
/// and then adds a copy of it as a blog post with an appended "original link" footer.
/// Nothing is stored when isPageExist reports the URL as present or when no content is returned.
/// </summary>
/// <param name="lnk">Link carrying the URL, title, abstract and template of the article.</param>
/// <param name="sb">Log buffer; passed to the existence check and the downloader, and
/// receives a success line after the article is inserted.</param>
private static void savePageDetail(DetailLink lnk, StringBuilder sb)
{
    SpiderTemplate template = lnk.Template;
    string url = lnk.Url;
    string title = lnk.Title;
    string summary = lnk.Abstract;

    if (isPageExist(url, sb))
    {
        return;
    }

    String pageBody = new PagedDetailSpider().GetContent(url, template, sb);
    if (pageBody == null)
    {
        return;
    }

    SpiderArticle pd = new SpiderArticle();
    pd.Title = title;
    // The stored URL is capped at 250 characters.
    pd.Url = strUtil.SubString(url, 250);
    pd.Abstract = summary;
    pd.Body = pageBody;
    pd.SpiderTemplate = template;

    // Use the first image found in the body, if any, as the article picture.
    Match firstImage = Regex.Match(pageBody, RegPattern.Img, RegexOptions.Singleline);
    if (firstImage.Success)
    {
        pd.IsPic = 1;
        pd.PicUrl = firstImage.Groups[1].Value;
    }

    pd.insert();
    sb.AppendLine("保存成功..." + lnk.Title + "_" + lnk.Url);

    // Remove every "font-size" token from the body before it is stored as blog content.
    pageBody = Regex.Replace(pageBody, "font-size", "", RegexOptions.IgnoreCase);

    // The URL comes from a crawled page, so it is HTML-encoded and quoted; an unquoted
    // href breaks as soon as the URL contains a space, a quote or '>'.
    string strArcitleLink = "<div class=\"ArcitleLink\"><a href=\""
        + System.Net.WebUtility.HtmlEncode(pd.Url)
        + "\">原文链接</a></div>";
    pageBody = pageBody + strArcitleLink;

    // NOTE(review): template.IsDelete is used below as the app/owner/creator id —
    // presumably a repurposed column; confirm against the template schema.
    Maticsoft.BLL.BlogCategory bllBlogCategory = new Maticsoft.BLL.BlogCategory();
    DataSet ds = bllBlogCategory.GetList("AppId = '" + template.IsDelete.ToString() + "'");

    // Fall back to category 1 when the lookup yields no usable row.
    int nCateID = 1;
    if (ds != null && ds.Tables.Count > 0 && ds.Tables[0].Rows.Count > 0)
    {
        object rawId = ds.Tables[0].Rows[0]["Id"];
        if (rawId != null && rawId != System.DBNull.Value)
        {
            // Convert rather than unbox so a non-Int32 numeric column does not throw.
            nCateID = System.Convert.ToInt32(rawId, System.Globalization.CultureInfo.InvariantCulture);
        }
    }

    BlogPost data = new BlogPost();
    data.CategoryId = nCateID;
    data.Title = title;
    data.Abstract = summary;
    data.Content = pageBody;
    data.AccessStatus = 0;
    data.CommentCondition = 0;
    data.SaveStatus = 1; // draft
    data.Created = System.DateTime.Now.Date;
    data.IsTop = 0;
    data.IsPick = 0;
    data.IsPic = 0;
    data.Ip = "";
    data.OwnerId = template.IsDelete;
    data.OwnerUrl = template.SiteName;
    data.OwnerType = "wojilu.Members.Users.Domain.User";
    data.CreatorUrl = template.SiteName;
    data.AppId = template.IsDelete;
    data.CreatorId = template.IsDelete;

    Maticsoft.BLL.BlogPost bll = new Maticsoft.BLL.BlogPost();
    bll.Add(data);
}
/// <summary>
/// Collects the links of a fetched list page whose normalized URL matches the template's
/// list pattern and appends one <see cref="DetailLink"/> per distinct URL to
/// <paramref name="list"/>. Despite its name, this method only fills the list; it performs
/// no database write itself.
/// </summary>
/// <param name="strReturnPage">Content of the fetched page, parsed through GetHtmlDocument.</param>
/// <param name="s">Template supplying the URL filter regex (ListPattern) and the list URL
/// from which the base URL is derived.</param>
/// <param name="list">Destination list; accepted links are appended to it.</param>
protected static void SaveUrlToDB(string strReturnPage, SpiderTemplate s, List<DetailLink> list)
{
    string strUrlFilterRule = s.ListPattern;
    HtmlAgilityPack.HtmlDocument htmlDoc = GetHtmlDocument(strReturnPage);
    string baseUrl = GetUrlLeftPart(s.ListUrl);
    DocumentWithLinks links = htmlDoc.GetLinks();

    // Normalized URLs already emitted; a set keeps the duplicate check O(1) per link.
    HashSet<string> seenLinks = new HashSet<string>();

    foreach (string link in links.Links.Union(links.References))
    {
        if (string.IsNullOrEmpty(link))
        {
            continue;
        }

        string normalizedLink = GetNormalizedLink(baseUrl, link);
        if (string.IsNullOrEmpty(normalizedLink))
        {
            continue;
        }

        if (!Regex.IsMatch(normalizedLink, strUrlFilterRule, RegexOptions.Singleline))
        {
            continue;
        }

        // Skip duplicates before doing the (more expensive) link-text lookup.
        if (!seenLinks.Add(normalizedLink))
        {
            continue;
        }

        // First choice: the text of the first recorded link that contains the normalized URL.
        string strLinkText = "";
        foreach (string strTemp in links.m_dicLink2Text.Keys)
        {
            if (strTemp.Contains(normalizedLink))
            {
                strLinkText = links.m_dicLink2Text[strTemp];
                break;
            }
        }

        // Fallback: look the raw link up directly, then its lower-cased form
        // (the lower-cased entry wins when both exist).
        if (strLinkText == "")
        {
            string recordedText;
            if (links.m_dicLink2Text.TryGetValue(link, out recordedText))
            {
                strLinkText = recordedText.Trim();
            }

            if (links.m_dicLink2Text.TryGetValue(link.ToLower(), out recordedText))
            {
                strLinkText = recordedText.Trim();
            }
        }

        DetailLink lnk = new DetailLink();
        lnk.Template = s;
        lnk.Url = normalizedLink;
        lnk.Title = strLinkText;
        list.Add(lnk);
    }
}