/// <summary>
/// Builds a DetailLink from one regex match of a list page, or returns null when the
/// entry should be skipped.
/// </summary>
/// <param name="match">Match whose group 1 is read as the url, group 2 as the title and group 3 (when present) as the summary.</param>
/// <param name="s">Template supplying ListPattern (url filter) and SiteUrl (prefix for non-http urls).</param>
/// <returns>The populated link, or null for a filtered-out entry.</returns>
private static DetailLink getDetailLink(Match match, SpiderTemplate s)
{
    string url = match.Groups[1].Value;
    string title = match.Groups[2].Value;

    // The url must satisfy the user-defined wildcard pattern.
    if (!Regex.IsMatch(url, ParseUrl(s.ListPattern), RegexOptions.Singleline))
    {
        return null;
    }

    // Script pseudo-links and in-page anchors are skipped. Ordinal comparison:
    // urls are identifiers, not linguistic text.
    if (url.IndexOf("javascript:", StringComparison.Ordinal) >= 0)
    {
        return null;
    }
    if (url.StartsWith("#", StringComparison.Ordinal))
    {
        return null;
    }

    // Strip markup from the title, then drop empty titles and "more" links.
    title = Regex.Replace(title, "<.+?>", "");
    if (strUtil.IsNullOrEmpty(title))
    {
        return null;
    }
    if (title == "更多" || title == "more" || title == "更多>>")
    {
        return null;
    }

    // Groups[0] is the whole match, so group 3 only exists when Count > 3.
    string summary = match.Groups.Count > 3 ? match.Groups[3].Value : "";

    if (!url.StartsWith("http", StringComparison.Ordinal))
    {
        url = strUtil.Join(s.SiteUrl, url);
    }

    DetailLink lnk = new DetailLink();
    lnk.Template = s;
    lnk.Url = url;
    lnk.Title = title;
    lnk.Abstract = summary;
    return lnk;
}
/// <summary>
/// Downloads a detail page and parses it into an HtmlAgilityPack HtmlDocument.
/// </summary>
/// <param name="detailUrl">Address of the detail page to fetch.</param>
/// <param name="template">Template whose DetailEncoding, when it has text, is passed to the downloader.</param>
/// <param name="sb">Log buffer that receives the progress line and any error line.</param>
/// <returns>The parsed document, or null when an exception is raised while downloading or parsing.</returns>
protected HtmlDocument getDetailPageBodyHtmlDocument(string detailUrl, SpiderTemplate template, StringBuilder sb)
{
    try
    {
        sb.AppendLine("抓取详细页..." + detailUrl);

        HtmlDocument parsed = new HtmlDocument();
        parsed.OptionAddDebuggingAttributes = false;
        parsed.OptionAutoCloseOnEnd = true;
        parsed.OptionFixNestedTags = true;
        parsed.OptionReadEncoding = true;

        // An empty encoding string is passed when the template does not name one.
        string encoding = strUtil.HasText(template.DetailEncoding) ? template.DetailEncoding : "";
        String html = PageLoader.Download(detailUrl, SpiderConfig.UserAgent, encoding);

        parsed.LoadHtml(html);
        return parsed;
    }
    catch (Exception ex)
    {
        logInfo("error=抓取" + detailUrl + "发生错误:" + ex.Message, detailUrl, template, sb);
        return null;
    }
}
/// <summary>
/// Stores the request state in instance fields and returns the result of getPageContentEx.
/// </summary>
/// <param name="url">Page address to process.</param>
/// <param name="s">Spider template to apply.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>Whatever getPageContentEx returns.</returns>
public override String GetContent(String url, SpiderTemplate s, StringBuilder sb)
{
    // getPageContentEx takes no arguments; it reads these fields instead.
    _url = url;
    _template = s;
    _log = sb;

    return getPageContentEx();
}
/// <summary>
/// Downloads the raw html of a detail page and updates template.SiteUrl from the detail url.
/// </summary>
/// <param name="detailUrl">Address of the detail page.</param>
/// <param name="template">Template providing DetailEncoding; its SiteUrl is overwritten here.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The downloaded text (possibly empty, which is logged), or null when an exception occurs.</returns>
protected string getDetailPageBody(string detailUrl, SpiderTemplate template, StringBuilder sb)
{
    try
    {
        sb.AppendLine("抓取详细页..." + detailUrl);

        string encoding = strUtil.HasText(template.DetailEncoding) ? template.DetailEncoding : "";
        String html = PageLoader.Download(detailUrl, SpiderConfig.UserAgent, encoding);

        // Side effect: the template's site root now follows this detail url.
        template.SiteUrl = new UrlInfo(detailUrl).SiteUrl;

        bool hasContent = !strUtil.IsNullOrEmpty(html);
        if (!hasContent)
        {
            logInfo("error=原始页面没有内容:" + detailUrl, detailUrl, template, sb);
        }

        return html;
    }
    catch (Exception ex)
    {
        logInfo("error=抓取" + detailUrl + "发生错误:" + ex.Message, detailUrl, template, sb);
        return null;
    }
}
/// <summary>
/// Stores the request state in instance fields and returns the result of getPageContent.
/// </summary>
/// <param name="url">Page address to process.</param>
/// <param name="s">Spider template to apply.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>Whatever getPageContent returns.</returns>
public virtual String GetContent(String url, SpiderTemplate s, StringBuilder sb)
{
    // getPageContent takes no arguments; it reads these fields instead.
    _url = url;
    _template = s;
    _log = sb;

    return getPageContent();
}
/// <summary>
/// Fetches the raw html of a detail page and refreshes template.SiteUrl from the detail url.
/// </summary>
/// <param name="detailUrl">Address of the detail page.</param>
/// <param name="template">Template providing DetailEncoding; its SiteUrl is overwritten here.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The downloaded text (an empty result is logged as an error), or null when an exception occurs.</returns>
protected string getDetailPageBody(string detailUrl, SpiderTemplate template, StringBuilder sb)
{
    try
    {
        sb.AppendLine("抓取详细页..." + detailUrl);

        String body;
        if (!strUtil.HasText(template.DetailEncoding))
        {
            // No encoding configured: the downloader receives an empty string.
            body = PageLoader.Download(detailUrl, SpiderConfig.UserAgent, "");
        }
        else
        {
            body = PageLoader.Download(detailUrl, SpiderConfig.UserAgent, template.DetailEncoding);
        }

        template.SiteUrl = new UrlInfo(detailUrl).SiteUrl;

        if (strUtil.IsNullOrEmpty(body))
        {
            logInfo("error=原始页面没有内容:" + detailUrl, detailUrl, template, sb);
        }
        return body;
    }
    catch (Exception ex)
    {
        logInfo("error=抓取" + detailUrl + "发生错误:" + ex.Message, detailUrl, template, sb);
        return null;
    }
}
/// <summary>
/// Extracts the detail links of a list page by delegating to SaveUrlToDB.
/// </summary>
/// <param name="s">Template handed to SaveUrlToDB and to the logger.</param>
/// <param name="page">Html of the list page; null or empty yields an empty list.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The links that SaveUrlToDB appended (never null).</returns>
public static List<DetailLink> getListItem(SpiderTemplate s, string page, StringBuilder sb)
{
    List<DetailLink> links = new List<DetailLink>();

    if (!strUtil.IsNullOrEmpty(page))
    {
        // SaveUrlToDB fills the list in place.
        SaveUrlToDB(page, s, links);
        logInfo("共抓取到链接:" + links.Count, s, sb);
    }

    return links;
}
/// <summary>
/// Resolves the spider implementation named by the template, falling back to defaultSpider.
/// </summary>
/// <param name="s">Template whose SpiderType names the implementation.</param>
/// <returns>The resolved ISpiderTool, or defaultSpider when SpiderType is empty or does not resolve to an ISpiderTool.</returns>
private ISpiderTool getSpider(SpiderTemplate s)
{
    if (strUtil.IsNullOrEmpty(s.SpiderType))
    {
        return defaultSpider;
    }

    ISpiderTool resolved = ObjectContext.GetByType(s.SpiderType) as ISpiderTool;
    return resolved ?? defaultSpider;
}
/// <summary>
/// Creates a template for a list url whose body is delimited by two html fragments.
/// </summary>
/// <param name="listUrl">Address of the list page.</param>
/// <param name="beginCode">Text that starts the list body pattern.</param>
/// <param name="endCode">Text that ends the list body pattern.</param>
/// <returns>A new template using SpiderConfig.ListLinkPattern as its link pattern.</returns>
private static SpiderTemplate getTemplate(String listUrl, String beginCode, String endCode)
{
    // Lazy ".+?" between the two fragments.
    String bodyPattern = beginCode + ".+?" + endCode;

    SpiderTemplate template = new SpiderTemplate();
    template.ListUrl = listUrl;
    template.ListBodyPattern = bodyPattern;
    template.ListPattern = SpiderConfig.ListLinkPattern;
    return template;
}
/// <summary>
/// Builds a template for the given list url, with the list body pattern formed from a
/// begin fragment, a lazy ".+?" and an end fragment.
/// </summary>
/// <param name="listUrl">Address of the list page.</param>
/// <param name="beginCode">Text that starts the list body pattern.</param>
/// <param name="endCode">Text that ends the list body pattern.</param>
/// <returns>A new template using SpiderConfig.ListLinkPattern as its link pattern.</returns>
private static SpiderTemplate getTemplate(String listUrl, String beginCode, String endCode)
{
    SpiderTemplate result = new SpiderTemplate();

    result.ListUrl = listUrl;
    result.ListBodyPattern = beginCode + ".+?" + endCode;
    result.ListPattern = SpiderConfig.ListLinkPattern;

    return result;
}
/// <summary>
/// Downloads the list page and, when the template defines a list body pattern, narrows the
/// result to the outer html of the first node matching that CSS selector.
/// </summary>
/// <param name="s">Template providing ListUrl, ListEncoding and the list body pattern.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>
/// The trimmed page (or selected fragment); the raw empty/null download when nothing was
/// fetched; null when the selector matches no node.
/// </returns>
private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
{
    string encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
    String target = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

    if (strUtil.IsNullOrEmpty(target))
    {
        logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
        return target;
    }

    // Read the pattern once instead of once per use.
    string bodySelector = s.GetListBodyPattern();
    if (strUtil.IsNullOrEmpty(bodySelector))
    {
        return target.Trim();
    }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };
    htmlDoc.LoadHtml(target);

    // Only the first match is needed; FirstOrDefault avoids enumerating the selector
    // result twice (Count() followed by ToArray()).
    HtmlNode firstNode = htmlDoc.DocumentNode.QuerySelectorAll(bodySelector).FirstOrDefault();
    if (firstNode == null)
    {
        logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
        return null;
    }

    return firstNode.OuterHtml.Trim();
}
/// <summary>
/// Builds a DetailLink from one regex match of a list page, or returns null when the
/// entry should be skipped.
/// </summary>
/// <param name="match">Match whose group 1 is read as the url, group 2 as the title and group 3 (when present) as the summary.</param>
/// <param name="s">Template supplying SiteUrl, used as prefix for non-http urls.</param>
/// <returns>The populated link, or null for a filtered-out entry.</returns>
private static DetailLink getDetailLink(Match match, SpiderTemplate s)
{
    string url = match.Groups[1].Value;
    string title = match.Groups[2].Value;

    // Script pseudo-links and in-page anchors are skipped. Ordinal comparison:
    // urls are identifiers, not linguistic text.
    if (url.IndexOf("javascript:", StringComparison.Ordinal) >= 0)
    {
        return null;
    }
    if (url.StartsWith("#", StringComparison.Ordinal))
    {
        return null;
    }

    // Strip markup from the title, then drop empty titles and "more" links.
    title = Regex.Replace(title, "<.+?>", "");
    if (strUtil.IsNullOrEmpty(title))
    {
        return null;
    }
    if (title == "更多" || title == "more" || title == "更多>>")
    {
        return null;
    }

    // Groups[0] is the whole match, so group 3 only exists when Count > 3.
    string summary = match.Groups.Count > 3 ? match.Groups[3].Value : "";

    if (!url.StartsWith("http", StringComparison.Ordinal))
    {
        url = strUtil.Join(s.SiteUrl, url);
    }

    DetailLink lnk = new DetailLink();
    lnk.Template = s;
    lnk.Url = url;
    lnk.Title = title;
    lnk.Abstract = summary;
    return lnk;
}
/// <summary>
/// Extracts the detail body from a page using the template's detail pattern (regex).
/// </summary>
/// <param name="page">Html of the detail page.</param>
/// <param name="s">Template providing the detail pattern.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The value of capture group 1, or null (after logging) when nothing matched.</returns>
protected string getMatchedBody(string page, SpiderTemplate s, StringBuilder sb)
{
    Match hit = Regex.Match(page, s.GetDetailPattern(), RegexOptions.Singleline);

    bool found = hit != null && hit.Success && !string.IsNullOrEmpty(hit.Value);
    if (found)
    {
        return hit.Groups[1].Value;
    }

    logInfo("error=没有匹配的页面内容:" + _url, this._url, s, sb);
    return null;
}
/// <summary>
/// Downloads the template's list page and returns the detail links found on it.
/// </summary>
/// <param name="s">Template; its SiteUrl is derived from ListUrl when ListUrl has text.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The list produced by getListItem.</returns>
public static List<DetailLink> GetDataList(SpiderTemplate s, StringBuilder sb)
{
    if (strUtil.HasText(s.ListUrl))
    {
        s.SiteUrl = new UrlInfo(s.ListUrl).SiteUrl;
    }

    // Step 1: fetch the list page. Step 2: pull title/url pairs out of it.
    string listHtml = downloadListPage(s, sb);
    return getListItem(s, listHtml, sb);
}
/// <summary>
/// Runs the spider job once: builds a template from each user's profile fields, crawls it,
/// then stores the collected "error=" log lines as a SpiderLog row.
/// </summary>
public void Execute()
{
    DbContext.closeConnectionAll();

    StringBuilder log = new StringBuilder();

    IList userRanks = User.find("order by Hits desc, id desc").list(1000);
    logger.Info("begin SpiderJob=" + userRanks.Count);

    foreach (User user in userRanks)
    {
        // Users without an address have no list url to crawl.
        if (string.IsNullOrEmpty(user.Profile.Address))
        {
            continue;
        }

        // The template is populated from user/profile fields as the code maps them.
        SpiderTemplate s = new SpiderTemplate();
        s.ListUrl = user.Profile.Address;
        s.ListEncoding = user.QQ;
        s.ListBodyPattern = user.Profile.Tel;
        s.ListPattern = user.Profile.WebSite;
        s.DetailPattern = user.MSN;
        s.IsDelete = user.Id;
        s.SiteName = user.Url;

        int[] sleepRange = new int[] { SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo };
        getSpider(s).DownloadPage(s, log, sleepRange);

        DbContext.closeConnectionAll();

        // Random pause between SpiderConfig.SuspendFrom and SpiderConfig.SuspendTo.
        Thread.Sleep(rd.Next(SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo));
    }

    // Keep only the lines that start with "error=".
    StringBuilder errorLog = new StringBuilder();
    foreach (String line in log.ToString().Split('\n'))
    {
        String trimmed = line.Trim();
        if (trimmed.StartsWith("error="))
        {
            errorLog.AppendLine(trimmed);
        }
    }

    SpiderLog sg = new SpiderLog();
    sg.Msg = errorLog.ToString();
    sg.insert();

    DbContext.closeConnectionAll();
}
/// <summary>
/// Concatenates the content of every extra page of a multi-page article.
/// </summary>
/// <param name="page">Html of the first page, passed to getPagedUrl.</param>
/// <param name="url">Url of the first page, passed to getPagedUrl.</param>
/// <param name="s">Template used for each page.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>Each page's content preceded by an "&lt;hr&gt;" line; empty when there are no paged urls.</returns>
private string getPagedContent(string page, string url, SpiderTemplate s, StringBuilder sb)
{
    StringBuilder combined = new StringBuilder();

    foreach (String pagedUrl in getPagedUrl(page, url))
    {
        combined.AppendLine("<hr>");
        combined.Append(new DetailSpider().GetContent(pagedUrl, s, sb));
    }

    return combined.ToString();
}
/// <summary>
/// Placeholder for CSS-selector based extraction of the detail body; the selector logic
/// is disabled in this variant and an empty string is always returned.
/// </summary>
/// <param name="htmlDoc">Parsed detail page (unused here).</param>
/// <param name="s">Spider template (unused here).</param>
/// <param name="sb">Log buffer (unused here).</param>
/// <returns>Always the empty string.</returns>
protected string getMatchedBody(HtmlDocument htmlDoc, SpiderTemplate s, StringBuilder sb)
{
    return "";
}
/// <summary>
/// Crawls two list pages with SpiderTool.GetDataList and asserts that each yields
/// more than one link.
/// </summary>
public void testSpider()
{
    StringBuilder log = new StringBuilder();

    SpiderTemplate firstTemplate = getTemplate(
        "http://news.163.com",
        "<div class=\"content\" style=\"zoom:1;\">",
        "<h2>图片新闻</h2>");
    List<DetailLink> firstLinks = SpiderTool.GetDataList(firstTemplate, log);
    Assert.Greater(firstLinks.Count, 1);

    SpiderTemplate secondTemplate = getTemplate(
        "http://women.sohu.com/love-story/",
        "<div class=\"f14list\">",
        "<div class=\"pages\">");
    List<DetailLink> secondLinks = SpiderTool.GetDataList(secondTemplate, log);
    Assert.Greater(secondLinks.Count, 1);
}
/// <summary>
/// Extracts the detail body with the template's detail pattern (regex) and passes it
/// through HtmlFilter.Filter.
/// </summary>
/// <param name="page">Html of the detail page.</param>
/// <param name="s">Template providing the detail pattern.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The filtered value of capture group 1, or null (after logging) when nothing matched.</returns>
protected string getMatchedBody(string page, SpiderTemplate s, StringBuilder sb)
{
    Match hit = Regex.Match(page, s.GetDetailPattern(), RegexOptions.Singleline);

    bool found = hit != null && hit.Success && !string.IsNullOrEmpty(hit.Value);
    if (!found)
    {
        logInfo("error=没有匹配的页面内容:" + _url, this._url, s, sb);
        return null;
    }

    // Per the original comment, the filter removes advertisements.
    return HtmlFilter.Filter(hit.Groups[1].Value);
}
/// <summary>
/// Exception-safe wrapper around downloadListPageBody.
/// </summary>
/// <param name="s">Template describing the list page.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The list page body, or null when downloadListPageBody throws (the error is logged).</returns>
private static string downloadListPage(SpiderTemplate s, StringBuilder sb)
{
    try
    {
        return downloadListPageBody(s, sb);
    }
    catch (Exception ex)
    {
        logInfo("error=抓取" + s.ListUrl + "发生错误:" + ex.Message, s, sb);
        return null;
    }
}
/// <summary>
/// Downloads the template's list page and returns the detail links found on it.
/// </summary>
/// <param name="s">Template; its SiteUrl is derived from ListUrl when ListUrl has text.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The list produced by getListItem.</returns>
public static List<DetailLink> GetDataList(SpiderTemplate s, StringBuilder sb)
{
    if (strUtil.HasText(s.ListUrl))
    {
        s.SiteUrl = new UrlInfo(s.ListUrl).SiteUrl;
    }

    // Step 1: fetch the list page.
    string listHtml = downloadListPage(s, sb);
    bool pageMissing = strUtil.IsNullOrEmpty(listHtml);
    if (pageMissing)
    {
        logger.Error("list page is empty, url=" + s.SiteUrl);
    }

    // Step 2: pull title/url pairs out of it.
    return getListItem(s, listHtml, sb);
}
/// <summary>
/// Fetches the list page described by the template and extracts its detail links.
/// </summary>
/// <param name="s">Template; its SiteUrl is derived from ListUrl when ListUrl has text.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The list produced by getListItem.</returns>
public static List<DetailLink> GetDataList(SpiderTemplate s, StringBuilder sb)
{
    bool hasListUrl = strUtil.HasText(s.ListUrl);
    if (hasListUrl)
    {
        s.SiteUrl = new UrlInfo(s.ListUrl).SiteUrl;
    }

    // First the list page content, then every article's title and url.
    string listHtml = downloadListPage(s, sb);
    List<DetailLink> links = getListItem(s, listHtml, sb);
    return links;
}
/// <summary>
/// Calls downloadListPageBody and converts any exception into a logged error.
/// </summary>
/// <param name="s">Template describing the list page.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The list page body, or null when downloadListPageBody throws.</returns>
private static string downloadListPage(SpiderTemplate s, StringBuilder sb)
{
    string body = null;

    try
    {
        body = downloadListPageBody(s, sb);
    }
    catch (Exception ex)
    {
        // body is still null here: the assignment above never completed.
        logInfo("error=抓取" + s.ListUrl + "发生错误:" + ex.Message, s, sb);
    }

    return body;
}
/// <summary>
/// Picks the spider implementation for a template.
/// </summary>
/// <param name="s">Template whose SpiderType names the implementation.</param>
/// <returns>defaultSpider when SpiderType is empty or does not resolve to an ISpiderTool; otherwise the resolved instance.</returns>
private ISpiderTool getSpider(SpiderTemplate s)
{
    ISpiderTool chosen = null;

    if (!strUtil.IsNullOrEmpty(s.SpiderType))
    {
        chosen = ObjectContext.GetByType(s.SpiderType) as ISpiderTool;
    }

    return chosen == null ? defaultSpider : chosen;
}
/// <summary>
/// Removes the tags listed in the template's DetailClearTag (comma separated) from the html.
/// </summary>
/// <remarks>
/// font/span/a lose only the tag itself (content kept); every other listed tag is removed
/// together with its content. Tag names are trimmed and lower-cased culture-invariantly,
/// and empty entries (for example from a trailing comma) are ignored.
/// </remarks>
/// <param name="input">Html to clean.</param>
/// <param name="spiderTemplate">Template providing DetailClearTag.</param>
/// <returns>The cleaned html, or the input unchanged when no tag list is configured.</returns>
private string filterPage(string input, SpiderTemplate spiderTemplate)
{
    if (strUtil.IsNullOrEmpty(spiderTemplate.DetailClearTag))
    {
        return input;
    }

    // Invariant lower-casing: tag names are identifiers, so culture rules must not apply.
    String[] arrTag = spiderTemplate.DetailClearTag.ToLowerInvariant().Split(',');

    List<String> tagOnly = new List<String>();

    logger.Info("filterTag, input=" + input);

    // Pass 1: remove tags together with their content.
    foreach (String rawTag in arrTag)
    {
        // "font, span" must be treated like "font,span".
        String tag = rawTag.Trim();
        if (tag.Length == 0)
        {
            continue;
        }

        // font/span/a: strip only the tag, handled in pass 2.
        if (tag == "font" || tag == "span" || tag == "a")
        {
            tagOnly.Add(tag);
            continue;
        }

        logger.Info("tag=" + tag);
        input = RegPattern.ReplaceHtml(input, tag, true);
    }

    logger.Info("filterTag, clear tag1=" + input);

    // Pass 2: remove only the tags, keeping their content.
    foreach (String tag in tagOnly)
    {
        logger.Info("tag=" + tag);
        input = RegPattern.ReplaceHtml(input, tag, false);
    }

    logger.Info("filterTag, clear tag2=" + input);

    return input;
}
/// <summary>
/// Extracts detail links from a list page using the template's ListPattern (regex).
/// </summary>
/// <param name="s">Template providing ListPattern.</param>
/// <param name="page">Html of the list page; null or empty yields an empty list.</param>
/// <param name="sb">Log buffer; receives the raw match count.</param>
/// <returns>Accepted links, visited from the last match to the first; urls longer than 100 characters are skipped.</returns>
public static List<DetailLink> getListItem(SpiderTemplate s, string page, StringBuilder sb)
{
    List<DetailLink> links = new List<DetailLink>();
    if (strUtil.IsNullOrEmpty(page))
    {
        return links;
    }

    MatchCollection found = Regex.Matches(page, s.ListPattern, RegexOptions.Singleline);
    sb.AppendLine("共抓取到链接:" + found.Count);

    // Walk the matches backwards.
    int index = found.Count;
    while (--index >= 0)
    {
        DetailLink candidate = getDetailLink(found[index], s);
        if (candidate != null && candidate.Url.Length <= 100)
        {
            links.Add(candidate);
        }
    }

    return links;
}
/// <summary>
/// Crawls one template: fetches its list page, then saves every detail link, sleeping a
/// random time between saves.
/// </summary>
/// <param name="s">Template to crawl.</param>
/// <param name="log">Log buffer.</param>
/// <param name="arrSleep">Two-element array passed as the bounds of rd.Next; the result is the sleep time in milliseconds.</param>
public void DownloadPage(SpiderTemplate s, StringBuilder log, int[] arrSleep)
{
    String startMessage = "抓取列表页..." + s.SiteName + "_" + s.ListUrl;
    logger.Info(startMessage);
    log.AppendLine(startMessage);

    foreach (DetailLink link in GetDataList(s, log))
    {
        savePageDetail(link, log);

        // Pause a few moments between detail pages (TODO in original: make configurable).
        Thread.Sleep(rd.Next(arrSleep[0], arrSleep[1]));
    }

    log.AppendLine("抓取完毕。");
}
/// <summary>
/// Downloads the list page of a template and saves each detail page it links to,
/// pausing a random interval after every save.
/// </summary>
/// <param name="s">Template to crawl.</param>
/// <param name="log">Log buffer.</param>
/// <param name="arrSleep">Two-element array passed as the bounds of rd.Next; the result is the sleep time in milliseconds.</param>
public void DownloadPage(SpiderTemplate s, StringBuilder log, int[] arrSleep)
{
    logger.Info("抓取列表页..." + s.SiteName + "_" + s.ListUrl);
    log.AppendLine("抓取列表页..." + s.SiteName + "_" + s.ListUrl);

    List<DetailLink> links = GetDataList(s, log);
    for (int i = 0; i < links.Count; i++)
    {
        savePageDetail(links[i], log);

        // Short pause between pages (TODO in original: make configurable).
        int pauseMs = rd.Next(arrSleep[0], arrSleep[1]);
        Thread.Sleep(pauseMs);
    }

    log.AppendLine("抓取完毕。");
}
/// <summary>
/// Preview action: fetches one detail page with the posted pattern and encoding and echoes
/// either the extracted body or, when the log contains "error=", a diagnostic text.
/// </summary>
public void GetDetail()
{
    String newsUrl = ctx.Post("detailUrl");

    SpiderTemplate s = new SpiderTemplate();
    s.DetailPattern = ctx.PostHtmlAll("DetailPattern");
    logger.Info("DetailPattern=" + s.DetailPattern);
    s.DetailEncoding = ctx.Post("detailEncoding");
    s.IsSavePic = 0;

    StringBuilder log = new StringBuilder();
    string newsBody = new PagedDetailSpider().GetContent(newsUrl, s, log);

    bool hasError = log.ToString().IndexOf("error=") >= 0;
    if (!hasError)
    {
        echoText(newsBody);
        return;
    }

    // Show the inputs alongside the log so the failing pattern can be diagnosed.
    StringBuilder report = new StringBuilder();
    report.AppendLine("detailUrl=" + newsUrl);
    report.AppendLine("detailPattern=" + s.DetailPattern);
    report.Append(log);
    echoText(report.ToString());
}
/// <summary>
/// Preview action: crawls the posted list settings and renders the found links as json,
/// or an "IsValid = false" json object describing the inputs when nothing was found.
/// </summary>
public void GetList()
{
    SpiderTemplate s = ctx.PostValue<SpiderTemplate>();

    String listBodyPattern = ctx.PostHtmlAll("ListBodyPattern");
    String ListPattern = ctx.PostHtmlAll("ListPattern");

    s.ListBodyPattern = listBodyPattern;
    // Fall back to the configured default link pattern when none was posted.
    s.ListPattern = strUtil.IsNullOrEmpty(ListPattern) ? SpiderConfig.ListLinkPattern : ListPattern;
    s.ListEncoding = ctx.Post("listEncoding");

    StringBuilder log = new StringBuilder();
    List<DetailLink> links = SpiderTool.GetDataList(s, log);

    if (links.Count != 0)
    {
        renderJson(links);
        return;
    }

    Dictionary<String, Object> failure = new Dictionary<String, Object>
    {
        { "IsValid", false },
        { "listUrl", s.ListUrl },
        { "patternBody", s.ListBodyPattern },
        { "patternLinks", s.ListPattern }
    };
    echoJson(JsonString.Convert(failure));
}
/// <summary>
/// Prepares the template editing view: binds the form target, the template as json and
/// the related action links.
/// </summary>
/// <param name="id">Id of the template to edit; values &lt;= 0 produce an empty "{Id:0}" template.</param>
public void SetTemplate(int id)
{
    target(GetList);

    if (id > 0)
    {
        SpiderTemplate s = templateService.GetById(id);

        // Code contributed by sgzwiz (http://www.wojilu.com/sgzwiz).
        // Angle brackets are escaped as html entities. The previous replacements mapped
        // "<" to "<" and ">" to ">", which did nothing; the entity escapes had evidently
        // been lost.
        String json = JsonString.ConvertObject(s)
            .Replace("<", "&lt;")
            .Replace(">", "&gt;");
        set("objTemplate", json);
    }
    else
    {
        set("objTemplate", "{Id:0}");
    }

    set("detailAction", to(GetDetail));
    set("saveLink", to(Save));
    set("listPattern", SpiderConfig.ListLinkPattern);
    set("returnUrl", to(List));
}
/// <summary>
/// Starts a crawl of the given template on a new thread, with a fresh log keyed by the
/// current viewer's id.
/// </summary>
/// <param name="id">Template id; values &lt;= 0 are rejected with a redirect message.</param>
public void DoRefresh(int id)
{
    if (id <= 0)
    {
        echoRedirect("请先选择模板");
        return;
    }

    set("processLink", to(Process, id));

    SpiderTemplate template = templateService.GetById(id);
    StringBuilder viewerLog = LogCacher.GetNewSpiderLog("log" + ctx.viewer.Id);

    TemplateAndLog job = new TemplateAndLog();
    job.Template = template;
    job.log = viewerLog;

    // beginRefresh receives the template/log pair as its thread argument.
    Thread worker = new Thread(beginRefresh);
    worker.Start(job);
}
/// <summary>
/// Extracts detail links from a list page using SpiderConfig.ListLinkPattern (regex).
/// </summary>
/// <param name="s">Template handed to getDetailLink.</param>
/// <param name="page">Html of the list page; null or empty yields an empty list.</param>
/// <param name="sb">Log buffer; receives the number of accepted links.</param>
/// <returns>Accepted links, visited from the last match to the first; urls longer than 100 characters are skipped.</returns>
public static List<DetailLink> getListItem(SpiderTemplate s, string page, StringBuilder sb)
{
    List<DetailLink> links = new List<DetailLink>();
    if (strUtil.IsNullOrEmpty(page))
    {
        return links;
    }

    // Collect every url on the page.
    MatchCollection found = Regex.Matches(page, SpiderConfig.ListLinkPattern, RegexOptions.Singleline);
    if (found.Count == 0)
    {
        logger.Error("list link match count=0");
    }

    // Walk the matches backwards.
    int index = found.Count;
    while (--index >= 0)
    {
        DetailLink candidate = getDetailLink(found[index], s);
        if (candidate != null && candidate.Url.Length <= 100)
        {
            links.Add(candidate);
        }
    }

    sb.AppendLine("共抓取到链接:" + links.Count);
    return links;
}
/// <summary>
/// Downloads the detail page behind a link and stores it as a SpiderArticle, unless
/// isPageExist reports the url or no body could be extracted.
/// </summary>
/// <param name="lnk">Link carrying url, title, abstract and template.</param>
/// <param name="sb">Log buffer.</param>
private static void savePageDetail(DetailLink lnk, StringBuilder sb)
{
    SpiderTemplate template = lnk.Template;
    string url = lnk.Url;

    if (isPageExist(url, sb))
    {
        return;
    }

    String body = new PagedDetailSpider().GetContent(url, template, sb);
    if (body == null)
    {
        return;
    }

    SpiderArticle article = new SpiderArticle();
    article.Title = lnk.Title;
    article.Url = strUtil.SubString(url, 200);
    article.Abstract = lnk.Abstract;
    article.Body = body;
    article.SpiderTemplate = template;

    // The first image found in the body becomes the article picture.
    MatchCollection images = Regex.Matches(body, RegPattern.Img, RegexOptions.Singleline);
    if (images.Count > 0)
    {
        article.IsPic = 1;
        article.PicUrl = images[0].Groups[1].Value;
    }

    article.insert();
    sb.AppendLine("保存成功..." + lnk.Title + "_" + lnk.Url);
}
/// <summary>
/// Runs the spider job once: crawls every template found by "IsDelete=0", then stores
/// the collected "error=" log lines as a SpiderLog row.
/// </summary>
public void Execute()
{
    List<SpiderTemplate> templates = SpiderTemplate.find("IsDelete=0").list();
    DbContext.closeConnectionAll();

    logger.Info("begin SpiderJob=" + templates.Count);

    StringBuilder log = new StringBuilder();

    foreach (SpiderTemplate s in templates)
    {
        int[] sleepRange = new int[] { SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo };
        getSpider(s).DownloadPage(s, log, sleepRange);

        DbContext.closeConnectionAll();

        // Random pause between SpiderConfig.SuspendFrom and SpiderConfig.SuspendTo.
        Thread.Sleep(rd.Next(SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo));
    }

    // Keep only the lines that start with "error=".
    StringBuilder errorLog = new StringBuilder();
    foreach (String line in log.ToString().Split('\n'))
    {
        String trimmed = line.Trim();
        if (trimmed.StartsWith("error="))
        {
            errorLog.AppendLine(trimmed);
        }
    }

    SpiderLog sg = new SpiderLog();
    sg.Msg = errorLog.ToString();
    sg.insert();

    DbContext.closeConnectionAll();
}
/// <summary>
/// Moves the posted template one position up or down in the template list.
/// </summary>
/// <remarks>Reads "id" and "cmd" from the post; any cmd other than "up"/"down" is echoed as an error.</remarks>
public virtual void SaveSort()
{
    int id = ctx.PostInt("id");
    String cmd = ctx.Post("cmd");

    SpiderTemplate current = templateService.GetById(id);
    List<SpiderTemplate> all = templateService.GetAll();

    switch (cmd)
    {
        case "up":
            new SortUtil<SpiderTemplate>(current, all).MoveUp();
            echoRedirect("ok");
            break;

        case "down":
            new SortUtil<SpiderTemplate>(current, all).MoveDown();
            echoRedirect("ok");
            break;

        default:
            echoError(lang("exUnknowCmd"));
            break;
    }
}
/// <summary>
/// Removes the tags listed in the template's DetailClearTag (comma separated) from the html.
/// </summary>
/// <remarks>
/// font/span/a lose only the tag itself (content kept); every other listed tag is removed
/// together with its content. Tag names are trimmed and lower-cased culture-invariantly,
/// and empty entries (for example from a trailing comma) are ignored.
/// </remarks>
/// <param name="input">Html to clean.</param>
/// <param name="spiderTemplate">Template providing DetailClearTag.</param>
/// <returns>The cleaned html, or the input unchanged when no tag list is configured.</returns>
private string filterPage(string input, SpiderTemplate spiderTemplate)
{
    if (strUtil.IsNullOrEmpty(spiderTemplate.DetailClearTag))
    {
        return input;
    }

    // Invariant lower-casing: tag names are identifiers, so culture rules must not apply.
    String[] arrTag = spiderTemplate.DetailClearTag.ToLowerInvariant().Split(',');

    List<String> tagOnly = new List<String>();

    logger.Info("filterTag, input=" + input);

    // Pass 1: remove tags together with their content.
    foreach (String rawTag in arrTag)
    {
        // "font, span" must be treated like "font,span".
        String tag = rawTag.Trim();
        if (tag.Length == 0)
        {
            continue;
        }

        // font/span/a: strip only the tag, handled in pass 2.
        if (tag == "font" || tag == "span" || tag == "a")
        {
            tagOnly.Add(tag);
            continue;
        }

        logger.Info("tag=" + tag);
        input = RegPattern.ReplaceHtml(input, tag, true);
    }

    logger.Info("filterTag, clear tag1=" + input);

    // Pass 2: remove only the tags, keeping their content.
    foreach (String tag in tagOnly)
    {
        logger.Info("tag=" + tag);
        input = RegPattern.ReplaceHtml(input, tag, false);
    }

    logger.Info("filterTag, clear tag2=" + input);

    return input;
}
/// <summary>
/// Collects the detail links of a list page; the extraction itself is done by SaveUrlToDB.
/// </summary>
/// <param name="s">Template handed to SaveUrlToDB and to the logger.</param>
/// <param name="page">Html of the list page; null or empty yields an empty list.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>The links that SaveUrlToDB appended (never null).</returns>
public static List<DetailLink> getListItem(SpiderTemplate s, string page, StringBuilder sb)
{
    List<DetailLink> collected = new List<DetailLink>();

    bool nothingToParse = strUtil.IsNullOrEmpty(page);
    if (nothingToParse)
    {
        return collected;
    }

    // SaveUrlToDB appends its results to the list passed in.
    SaveUrlToDB(page, s, collected);

    logInfo("共抓取到链接:" + collected.Count, s, sb);
    return collected;
}
/// <summary>
/// Preview action for a detail page: applies the posted detail pattern and encoding to the
/// posted url and echoes the extracted body, or a diagnostic text when the crawl log
/// contains "error=".
/// </summary>
public void GetDetail()
{
    String newsUrl = ctx.Post("detailUrl");
    String DetailPattern = ctx.PostHtmlAll("DetailPattern");

    SpiderTemplate s = new SpiderTemplate();
    s.DetailPattern = DetailPattern;
    logger.Info("DetailPattern=" + s.DetailPattern);

    String detailEncoding = ctx.Post("detailEncoding");
    s.DetailEncoding = detailEncoding;
    s.IsSavePic = 0;

    StringBuilder log = new StringBuilder();
    string newsBody = new PagedDetailSpider().GetContent(newsUrl, s, log);

    String output;
    if (log.ToString().IndexOf("error=") < 0)
    {
        output = newsBody;
    }
    else
    {
        // Prefix the log with the inputs that produced the error.
        StringBuilder report = new StringBuilder();
        report.AppendLine("detailUrl=" + newsUrl);
        report.AppendLine("detailPattern=" + s.DetailPattern);
        report.Append(log);
        output = report.ToString();
    }

    echoText(output);
}
/// <summary>
/// Returns all spider templates ordered by OrderId descending, then Id ascending.
/// </summary>
/// <returns>The templates in that order.</returns>
public List<SpiderTemplate> GetAll()
{
    List<SpiderTemplate> templates = SpiderTemplate.find("order by OrderId desc, Id asc").list();
    return templates;
}
/// <summary>
/// Saves changes to an existing template by calling its update method.
/// </summary>
/// <param name="s">Template to update.</param>
public void Update(SpiderTemplate s)
{
    s.update();
}
/// <summary>
/// Stores a new template by calling its insert method.
/// </summary>
/// <param name="s">Template to insert.</param>
public void Insert(SpiderTemplate s)
{
    s.insert();
}
/// <summary>
/// Looks up a template by id via SpiderTemplate.findById.
/// </summary>
/// <param name="id">Template id.</param>
/// <returns>Whatever findById returns for that id.</returns>
public SpiderTemplate GetById(int id)
{
    SpiderTemplate found = SpiderTemplate.findById(id);
    return found;
}
/// <summary>
/// Downloads the list page and narrows it to the part matched by the template's list
/// body pattern (regex).
/// </summary>
/// <param name="s">Template providing ListUrl, ListEncoding and the list body pattern.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>
/// The trimmed matched text; an empty string (after logging) when the pattern does not
/// match; the raw empty/null download when nothing was fetched.
/// </returns>
private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
{
    string encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
    String html = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

    if (strUtil.IsNullOrEmpty(html))
    {
        logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
        return html;
    }

    Match bodyMatch = Regex.Match(html, s.GetListBodyPattern(), RegexOptions.Singleline);
    if (!bodyMatch.Success)
    {
        logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
        return "".Trim();
    }

    return bodyMatch.Value.Trim();
}
/// <summary>
/// Converts one regex match of a list page into a DetailLink, or returns null when the
/// entry should be skipped.
/// </summary>
/// <param name="match">Match whose group 1 is read as the url, group 2 as the title and group 3 (when present) as the summary.</param>
/// <param name="s">Template supplying ListPattern (url filter) and SiteUrl (prefix for non-http urls).</param>
/// <returns>The populated link, or null for a filtered-out entry.</returns>
private static DetailLink getDetailLink(Match match, SpiderTemplate s)
{
    string url = match.Groups[1].Value;
    string title = match.Groups[2].Value;

    // The url must satisfy the user-defined wildcard pattern.
    if (!Regex.IsMatch(url, ParseUrl(s.ListPattern), RegexOptions.Singleline))
    {
        return null;
    }

    // Script pseudo-links and in-page anchors are skipped. Ordinal comparison:
    // urls are identifiers, not linguistic text.
    bool isScriptLink = url.IndexOf("javascript:", StringComparison.Ordinal) >= 0;
    bool isAnchor = url.StartsWith("#", StringComparison.Ordinal);
    if (isScriptLink || isAnchor)
    {
        return null;
    }

    // Strip markup from the title, then drop empty titles and "more" links.
    title = Regex.Replace(title, "<.+?>", "");
    if (strUtil.IsNullOrEmpty(title))
    {
        return null;
    }
    if (title == "更多" || title == "more" || title == "更多>>")
    {
        return null;
    }

    // Groups[0] is the whole match, so group 3 only exists when Count > 3.
    string summary = "";
    if (match.Groups.Count > 3)
    {
        summary = match.Groups[3].Value;
    }

    if (!url.StartsWith("http", StringComparison.Ordinal))
    {
        url = strUtil.Join(s.SiteUrl, url);
    }

    DetailLink lnk = new DetailLink();
    lnk.Template = s;
    lnk.Url = url;
    lnk.Title = title;
    lnk.Abstract = summary;
    return lnk;
}
/// <summary>
/// Creates or updates a spider template from the posted form.
/// </summary>
/// <remarks>
/// A positive "tid" updates the existing template, otherwise a new one is inserted.
/// The list url is required; the other fields are stored as posted.
/// </remarks>
public void Save()
{
    int templateId = ctx.PostInt("tid");
    String listUrl = ctx.Post("listUrl");
    String listBodyPattern = ctx.PostHtmlAll("ListBodyPattern");
    String ListPattern = ctx.PostHtmlAll("ListPattern");
    String DetailPattern = ctx.PostHtmlAll("DetailPattern");

    if (strUtil.IsNullOrEmpty(listUrl))
    {
        errors.Add("请填写列表页的网址");
    }

    if (ctx.HasErrors)
    {
        echoError();
        return;
    }

    bool isExisting = templateId > 0;

    SpiderTemplate s = new SpiderTemplate();
    if (isExisting)
    {
        s = templateService.GetById(templateId);
    }
    if (s == null)
    {
        echoError("采集模板不存在");
        return;
    }

    s.ListUrl = listUrl;
    s.ListPattern = ListPattern;
    s.ListBodyPattern = listBodyPattern;
    s.DetailPattern = DetailPattern;

    s.SiteName = ctx.Post("siteName");
    s.ListEncoding = ctx.Post("listEncoding");
    s.DetailEncoding = ctx.Post("detailEncoding");

    // The checkbox is stored as 1/0.
    Boolean chkPic = cvt.ToBool(ctx.Post("checkPic"));
    s.IsSavePic = chkPic ? 1 : 0;

    if (isExisting)
    {
        templateService.Update(s);
    }
    else
    {
        templateService.Insert(s);
    }

    echoAjaxOk();
}
/// <summary>
/// Persists a new template by delegating to its insert method.
/// </summary>
/// <param name="s">Template to insert.</param>
public void Insert(SpiderTemplate s)
{
    s.insert();
}
/// <summary>
/// Downloads the detail page behind a link, stores it as a SpiderArticle and additionally
/// saves it as a BlogPost.
/// </summary>
/// <remarks>
/// Nothing is saved when isPageExist reports the url or no body could be extracted.
/// template.IsDelete is used as the blog post's app/owner/creator id and template.SiteName
/// as its owner/creator url.
/// </remarks>
/// <param name="lnk">Link carrying url, title, abstract and template.</param>
/// <param name="sb">Log buffer.</param>
private static void savePageDetail(DetailLink lnk, StringBuilder sb)
{
    SpiderTemplate template = lnk.Template;
    string url = lnk.Url;
    string title = lnk.Title;
    string summary = lnk.Abstract;

    if (isPageExist(url, sb))
    {
        return;
    }

    String pageBody = new PagedDetailSpider().GetContent(url, template, sb);
    if (pageBody == null)
    {
        return;
    }

    // --- 1. the spider article ---
    SpiderArticle article = new SpiderArticle();
    article.Title = title;
    article.Url = strUtil.SubString(url, 250);
    article.Abstract = summary;
    article.Body = pageBody;
    article.SpiderTemplate = template;

    // The first image found in the body becomes the article picture.
    MatchCollection images = Regex.Matches(pageBody, RegPattern.Img, RegexOptions.Singleline);
    if (images.Count > 0)
    {
        article.IsPic = 1;
        article.PicUrl = images[0].Groups[1].Value;
    }

    article.insert();
    sb.AppendLine("保存成功..." + lnk.Title + "_" + lnk.Url);

    // --- 2. the blog post ---
    // Remove every "font-size" occurrence and append a link back to the source.
    pageBody = Regex.Replace(pageBody, "font-size", "", RegexOptions.IgnoreCase);
    string sourceLink = "<div class=\"ArcitleLink\"><a href=" + article.Url + ">原文链接</a></div>";
    pageBody = pageBody + sourceLink;

    // Category lookup by AppId; category 1 is used when none is found.
    Maticsoft.BLL.BlogCategory categoryBll = new Maticsoft.BLL.BlogCategory();
    DataSet categories = categoryBll.GetList("AppId = '" + template.IsDelete.ToString() + "'");
    int categoryId = 1;
    if (categories.Tables[0].Rows.Count > 0)
    {
        categoryId = (int)categories.Tables[0].Rows[0]["Id"];
    }

    BlogPost post = new BlogPost();
    post.CategoryId = categoryId;
    post.Title = title;
    post.Abstract = summary;
    post.Content = pageBody;
    post.AccessStatus = 0;
    post.CommentCondition = 0;
    post.SaveStatus = 1; // draft, per the original comment
    post.Created = System.DateTime.Now.Date;
    post.IsTop = 0;
    post.IsPick = 0;
    post.IsPic = 0;
    post.Ip = "";
    post.OwnerId = template.IsDelete;
    post.OwnerUrl = template.SiteName;
    post.OwnerType = "wojilu.Members.Users.Domain.User";
    post.CreatorUrl = template.SiteName;
    post.AppId = template.IsDelete;
    post.CreatorId = template.IsDelete;

    Maticsoft.BLL.BlogPost postBll = new Maticsoft.BLL.BlogPost();
    postBll.Add(post);
}
/// <summary>
/// Downloads every additional page of a paged article and joins the results.
/// </summary>
/// <param name="page">Html of the first page, passed to getPagedUrl.</param>
/// <param name="url">Url of the first page, passed to getPagedUrl.</param>
/// <param name="s">Template used for each page.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>Each page's content preceded by an "&lt;hr&gt;" line; empty when there are no paged urls.</returns>
private string getPagedContent(string page, string url, SpiderTemplate s, StringBuilder sb)
{
    List<String> pagedUrls = getPagedUrl(page, url);
    StringBuilder joined = new StringBuilder();

    int index = 0;
    while (index < pagedUrls.Count)
    {
        joined.AppendLine("<hr>");

        String content = new DetailSpider().GetContent(pagedUrls[index], s, sb);
        joined.Append(content);

        index++;
    }

    return joined.ToString();
}
/// <summary>
/// Runs the spider job once: for each of the first 1000 users (by Hits, then id) that has
/// a profile address, builds a template from the user's fields and crawls it; afterwards
/// the "error=" log lines are stored as a SpiderLog row.
/// </summary>
public void Execute()
{
    DbContext.closeConnectionAll();

    StringBuilder log = new StringBuilder();

    IList userRanks = User.find("order by Hits desc, id desc").list(1000);
    logger.Info("begin SpiderJob=" + userRanks.Count);

    foreach (User user in userRanks)
    {
        bool hasListUrl = !string.IsNullOrEmpty(user.Profile.Address);
        if (hasListUrl)
        {
            // The template is populated from user/profile fields as the code maps them.
            SpiderTemplate s = new SpiderTemplate();
            s.ListUrl = user.Profile.Address;
            s.ListEncoding = user.QQ;
            s.ListBodyPattern = user.Profile.Tel;
            s.ListPattern = user.Profile.WebSite;
            s.DetailPattern = user.MSN;
            s.IsDelete = user.Id;
            s.SiteName = user.Url;

            ISpiderTool spider = getSpider(s);
            spider.DownloadPage(s, log, new int[] { SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo });

            DbContext.closeConnectionAll();

            // Random pause between SpiderConfig.SuspendFrom and SpiderConfig.SuspendTo.
            int pauseMs = rd.Next(SpiderConfig.SuspendFrom, SpiderConfig.SuspendTo);
            Thread.Sleep(pauseMs);
        }
    }

    // Keep only the lines that start with "error=".
    StringBuilder errorLog = new StringBuilder();
    String[] logLines = log.ToString().Split('\n');
    for (int i = 0; i < logLines.Length; i++)
    {
        String trimmed = logLines[i].Trim();
        if (trimmed.StartsWith("error="))
        {
            errorLog.AppendLine(trimmed);
        }
    }

    SpiderLog sg = new SpiderLog();
    sg.Msg = errorLog.ToString();
    sg.insert();

    DbContext.closeConnectionAll();
}
/// <summary>
/// Persists changes to a template by delegating to its update method.
/// </summary>
/// <param name="s">Template to update.</param>
public void Update(SpiderTemplate s)
{
    s.update();
}
/// <summary>
/// Collects every link of a list page whose normalized url matches the template's
/// ListPattern and appends one DetailLink per distinct url to <paramref name="list"/>.
/// </summary>
/// <param name="strReturnPage">Html of the list page.</param>
/// <param name="s">Template providing ListPattern (url filter regex) and ListUrl (base for normalization).</param>
/// <param name="list">Receives the accepted links; it is appended to in place.</param>
protected static void SaveUrlToDB(string strReturnPage, SpiderTemplate s, List<DetailLink> list)
{
    string strUrlFilterRule = s.ListPattern;

    HtmlAgilityPack.HtmlDocument htmlDoc = GetHtmlDocument(strReturnPage);
    string baseUrl = GetUrlLeftPart(s.ListUrl);
    DocumentWithLinks links = htmlDoc.GetLinks();

    // Normalized urls already emitted; a set keeps the duplicate check O(1).
    HashSet<string> seenLinks = new HashSet<string>();

    foreach (string link in links.Links.Union(links.References))
    {
        if (string.IsNullOrEmpty(link))
        {
            continue;
        }

        string normalizedLink = GetNormalizedLink(baseUrl, link);
        if (string.IsNullOrEmpty(normalizedLink))
        {
            continue;
        }

        if (!Regex.IsMatch(normalizedLink, strUrlFilterRule, RegexOptions.Singleline))
        {
            continue;
        }

        // Add returns false for a url that was already emitted.
        if (!seenLinks.Add(normalizedLink))
        {
            continue;
        }

        // Link text, first attempt: the first recorded key that contains the normalized url.
        string strLinkText = "";
        foreach (string strTemp in links.m_dicLink2Text.Keys)
        {
            if (strTemp.Contains(normalizedLink))
            {
                strLinkText = links.m_dicLink2Text[strTemp];
                break;
            }
        }

        // Second attempt: the raw link, then its lower-cased form (the latter wins when
        // both are present, as before).
        if (strLinkText == "")
        {
            if (links.m_dicLink2Text.Keys.Contains(link))
            {
                strLinkText = links.m_dicLink2Text[link].Trim();
            }
            if (links.m_dicLink2Text.Keys.Contains(link.ToLower()))
            {
                strLinkText = links.m_dicLink2Text[link.ToLower()].Trim();
            }
        }

        DetailLink lnk = new DetailLink();
        lnk.Template = s;
        lnk.Url = normalizedLink;
        lnk.Title = strLinkText;
        list.Add(lnk);
    }
}
/// <summary>
/// Downloads the list page and, when the template defines a list body pattern, narrows the
/// result to the outer html of the first node matching that CSS selector.
/// </summary>
/// <param name="s">Template providing ListUrl, ListEncoding and the list body pattern.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>
/// The trimmed page (or selected fragment); the raw empty/null download when nothing was
/// fetched; null when the selector matches no node or evaluating it throws.
/// </returns>
private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
{
    string encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
    String target = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

    if (strUtil.IsNullOrEmpty(target))
    {
        logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
        return target;
    }
    logInfo("抓取列表内容成功", s, sb);

    // Read the pattern once instead of once per use.
    string bodySelector = s.GetListBodyPattern();
    if (!strUtil.HasText(bodySelector))
    {
        return target.Trim();
    }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };
    htmlDoc.LoadHtml(target);

    try
    {
        // Only the first match is needed; FirstOrDefault avoids enumerating the selector
        // result twice (Count() followed by ToArray()).
        HtmlNode firstNode = htmlDoc.DocumentNode.QuerySelectorAll(bodySelector).FirstOrDefault();
        if (firstNode == null)
        {
            logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
            return null;
        }

        logInfo("匹配列表内容成功", s, sb);
        return firstNode.OuterHtml.Trim();
    }
    catch (Exception ex)
    {
        logInfo("htmlDoc QuerySelectorAll解析出错=" + ex.Message, s, sb);
        return null;
    }
}
/// <summary>
/// Converts one regex match of a list page into a DetailLink, or returns null when the
/// entry should be skipped.
/// </summary>
/// <param name="match">Match whose group 1 is read as the url, group 2 as the title and group 3 (when present) as the summary.</param>
/// <param name="s">Template supplying SiteUrl, used as prefix for non-http urls.</param>
/// <returns>The populated link, or null for a filtered-out entry.</returns>
private static DetailLink getDetailLink(Match match, SpiderTemplate s)
{
    string url = match.Groups[1].Value;
    string title = match.Groups[2].Value;

    // Script pseudo-links and in-page anchors are skipped. Ordinal comparison:
    // urls are identifiers, not linguistic text.
    bool isScriptLink = url.IndexOf("javascript:", StringComparison.Ordinal) >= 0;
    bool isAnchor = url.StartsWith("#", StringComparison.Ordinal);
    if (isScriptLink || isAnchor)
    {
        return null;
    }

    // Strip markup from the title, then drop empty titles and "more" links.
    title = Regex.Replace(title, "<.+?>", "");
    if (strUtil.IsNullOrEmpty(title))
    {
        return null;
    }
    if (title == "更多" || title == "more" || title == "更多>>")
    {
        return null;
    }

    // Groups[0] is the whole match, so group 3 only exists when Count > 3.
    string summary = "";
    if (match.Groups.Count > 3)
    {
        summary = match.Groups[3].Value;
    }

    if (!url.StartsWith("http", StringComparison.Ordinal))
    {
        url = strUtil.Join(s.SiteUrl, url);
    }

    DetailLink lnk = new DetailLink();
    lnk.Template = s;
    lnk.Url = url;
    lnk.Title = title;
    lnk.Abstract = summary;
    return lnk;
}
/// <summary>
/// Extracts the detail body from a parsed page using the template's detail pattern as a
/// CSS selector.
/// </summary>
/// <param name="htmlDoc">Parsed detail page.</param>
/// <param name="s">Template providing the detail pattern.</param>
/// <param name="sb">Log buffer.</param>
/// <returns>Outer html of the first matching node, or null (after logging) when no node matches.</returns>
protected string getMatchedBody(HtmlDocument htmlDoc, SpiderTemplate s, StringBuilder sb)
{
    // Only the first match is needed; FirstOrDefault avoids enumerating the selector
    // result twice (Count() followed by ToArray()).
    HtmlNode firstNode = htmlDoc.DocumentNode.QuerySelectorAll(s.GetDetailPattern()).FirstOrDefault();
    if (firstNode == null)
    {
        logInfo("error=没有匹配的页面内容:" + _url, this._url, s, sb);
        return null;
    }

    return firstNode.OuterHtml;
}
/// <summary>
/// Writes a message to the logger and appends it as a line to the crawl log buffer.
/// </summary>
/// <param name="msg">Message text.</param>
/// <param name="s">Spider template (not used by this overload).</param>
/// <param name="sb">Log buffer that receives the message.</param>
private static void logInfo(String msg, SpiderTemplate s, StringBuilder sb)
{
    logger.Info(msg);
    sb.AppendLine(msg);
}
/// <summary>
/// Records a message both in the logger and, as its own line, in the crawl log buffer.
/// </summary>
/// <param name="msg">Message text.</param>
/// <param name="s">Spider template (not used by this overload).</param>
/// <param name="sb">Log buffer that receives the message.</param>
private static void logInfo(String msg, SpiderTemplate s, StringBuilder sb)
{
    logger.Info(msg);
    sb.AppendLine(msg);
}