/// <summary>
/// Downloads the ifeng daily top-rank page with an explicit user agent and
/// the "utf-8" encoding, then asserts that the page is not empty and that
/// the expected heading text occurs at an index greater than zero.
/// </summary>
public void testPage()
{
    const string userAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";

    String html = PageLoader.Download("http://news.ifeng.com/toprank/day/", userAgent, "utf-8");
    Assert.IsNotEmpty(html);

    int headingIndex = html.IndexOf("点击排行榜");
    Assert.Greater(headingIndex, 0);
}
/// <summary>
/// Determines whether the given URL is recognised as a Ku6 video link.
/// The URL is formatted into <c>ku6XmlDataApiUrl</c>, the response is parsed
/// as XML, and the <c>type</c> attribute of <c>root/result</c> is inspected.
/// </summary>
/// <param name="url">The candidate video page URL.</param>
/// <returns>
/// <c>false</c> when the response is empty, when <c>root/result</c> or its
/// <c>type</c> attribute is missing, or when the type equals -1;
/// otherwise <c>true</c>.
/// </returns>
public static bool IsKu6VideoUrl(string url)
{
    string response = PageLoader.Download(string.Format(ku6XmlDataApiUrl, url));
    if (string.IsNullOrEmpty(response))
    {
        return false;
    }

    XmlDocument document = new XmlDocument();
    document.LoadXml(response);

    XmlNode resultNode = document.SelectSingleNode("root/result");
    if (resultNode == null)
    {
        return false;
    }

    XmlNode typeAttribute = resultNode.Attributes.GetNamedItem("type");
    if (typeAttribute == null)
    {
        return false;
    }

    // A type of -1 marks the URL as not being a Ku6 video.
    return Convert.ToInt32(typeAttribute.Value) != -1;
}
/// <summary>
/// Builds a <see cref="VideoInfo"/> for a play page by downloading it and
/// reading the "pic", "swfOutsideUrl" and "title" entries of the
/// <c>video : { ... }</c> object embedded in the page.
/// </summary>
/// <param name="playUrl">URL of the video play page.</param>
/// <returns>
/// The populated info. Keys that are absent yield empty strings. If any step
/// throws, the URL and exception message are logged and the object is
/// returned with whatever was filled in up to that point.
/// </returns>
public VideoInfo GetInfo(string playUrl)
{
    VideoInfo info = new VideoInfo();
    info.PlayUrl = playUrl;

    try
    {
        String html = PageLoader.Download(playUrl);

        // Capture the body of the "video : { ... }" object and re-wrap it in braces.
        Match videoObject = Regex.Match(html, "video : {(.+?)\\}[^,]", RegexOptions.Singleline);
        Dictionary<String, Object> fields = JSON.ToDictionary("{" + videoObject.Groups[1].Value + "}");

        Object value;
        info.PicUrl = fields.TryGetValue("pic", out value) ? value.ToString() : "";
        info.FlashUrl = fields.TryGetValue("swfOutsideUrl", out value) ? value.ToString() : "";
        info.Title = fields.TryGetValue("title", out value) ? value.ToString() : "";
    }
    catch (Exception ex)
    {
        logger.Error("getUrl=" + playUrl);
        logger.Error(ex.Message);
    }

    return info;
}
/// <summary>
/// Builds a <see cref="VideoInfo"/> for a Tudou programme URL. The video id
/// is what remains of the URL after the "programs/view/" prefix and any
/// trailing slashes are removed; title and thumbnail are scraped from the page.
/// </summary>
/// <param name="url">The Tudou play page URL.</param>
/// <returns>
/// The info with PlayUrl, FlashId and FlashUrl always set. Title and PicUrl
/// are set only when the page download and parsing succeed; failures are
/// logged and otherwise ignored.
/// </returns>
public VideoInfo GetInfo(String url)
{
    String videoId = strUtil.TrimStart(url, "http://www.tudou.com/programs/view/").TrimEnd('/');
    String playerUrl = string.Format("http://www.tudou.com/v/{0}/v.swf", videoId);

    VideoInfo info = new VideoInfo();
    info.PlayUrl = url;
    info.FlashId = videoId;
    info.FlashUrl = playerUrl;

    try
    {
        String html = PageLoader.Download(url);

        String rawTitle = Regex.Match(html, "<title>([^<]+?)</title>").Groups[1].Value;
        String cleanTitle = VideoHelper.GetTitle(rawTitle);

        // The thumbnail is the first single-quoted value following "thumbnail".
        String thumbnail = Regex.Match(html, "thumbnail[^']+?'([^']+?)'").Groups[1].Value;

        info.Title = cleanTitle;
        info.PicUrl = thumbnail;
    }
    catch (Exception ex)
    {
        logger.Error("getUrl=" + url);
        logger.Error(ex.Message);
    }

    return info;
}
/// <summary>
/// Builds a <see cref="VideoInfo"/> for an ifeng video page whose id is the
/// last path segment of the URL with the ".shtml" suffix removed. Picture and
/// title come from the "img" and "name" entries of the page's
/// <c>var videoinfo=</c> object.
/// </summary>
/// <param name="playUrl">URL of the ifeng play page.</param>
/// <returns>
/// The info with PlayUrl, FlashId and FlashUrl always set. PicUrl and Title
/// are filled from the page (empty string when a key is absent); on any
/// exception the error is logged and the partially filled object is returned.
/// </returns>
public VideoInfo GetInfo(string playUrl)
{
    String[] segments = strUtil.TrimEnd(playUrl, ".shtml").Split('/');
    String guid = segments[segments.Length - 1];

    VideoInfo info = new VideoInfo();
    info.PlayUrl = playUrl;
    info.FlashId = guid;
    info.FlashUrl = string.Format("http://v.ifeng.com/include/exterior.swf?guid={0}&AutoPlay=false", guid);

    try
    {
        String html = PageLoader.Download(playUrl, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", "utf-8");

        String json = Regex.Match(html, "var videoinfo=({.+?});").Groups[1].Value;
        Dictionary<String, Object> fields = JSON.ToDictionary(json);

        Object value;
        info.PicUrl = fields.TryGetValue("img", out value) ? value.ToString() : "";
        info.Title = fields.TryGetValue("name", out value) ? value.ToString() : "";
    }
    catch (Exception ex)
    {
        logger.Error("getUrl=" + playUrl);
        logger.Error(ex.Message);
    }

    return info;
}
/// <summary>
/// Downloads a detail page and parses it into an HtmlAgilityPack
/// <see cref="HtmlDocument"/>.
/// </summary>
/// <param name="detailUrl">URL of the detail page to fetch.</param>
/// <param name="template">Spider template; its DetailEncoding is used for the download when it has text.</param>
/// <param name="sb">Buffer that receives a progress line for this fetch.</param>
/// <returns>The parsed document, or <c>null</c> if any step throws (the error is logged).</returns>
protected HtmlDocument getDetailPageBodyHtmlDocument(string detailUrl, SpiderTemplate template, StringBuilder sb)
{
    try
    {
        sb.AppendLine("抓取详细页..." + detailUrl);

        HtmlDocument document = new HtmlDocument();
        document.OptionAddDebuggingAttributes = false;
        document.OptionAutoCloseOnEnd = true;
        document.OptionFixNestedTags = true;
        document.OptionReadEncoding = true;

        // An empty encoding string is passed when the template does not specify one.
        String encoding = strUtil.HasText(template.DetailEncoding) ? template.DetailEncoding : "";
        String html = PageLoader.Download(detailUrl, SpiderConfig.UserAgent, encoding);

        document.LoadHtml(html);
        return document;
    }
    catch (Exception ex)
    {
        logInfo("error=抓取" + detailUrl + "发生错误:" + ex.Message, detailUrl, template, sb);
        return null;
    }
}
/// <summary>
/// Downloads the raw body of a detail page and records the page's site URL
/// on the template.
/// </summary>
/// <param name="detailUrl">URL of the detail page to fetch.</param>
/// <param name="template">Spider template; supplies DetailEncoding and receives SiteUrl.</param>
/// <param name="sb">Buffer that receives a progress line for this fetch.</param>
/// <returns>
/// The downloaded text (an empty result is logged but still returned as-is),
/// or <c>null</c> if any step throws.
/// </returns>
protected string getDetailPageBody(string detailUrl, SpiderTemplate template, StringBuilder sb)
{
    try
    {
        sb.AppendLine("抓取详细页..." + detailUrl);

        // An empty encoding string is passed when the template does not specify one.
        String encoding = strUtil.HasText(template.DetailEncoding) ? template.DetailEncoding : "";
        String body = PageLoader.Download(detailUrl, SpiderConfig.UserAgent, encoding);

        template.SiteUrl = new UrlInfo(detailUrl).SiteUrl;

        if (strUtil.IsNullOrEmpty(body))
        {
            logInfo("error=原始页面没有内容:" + detailUrl, detailUrl, template, sb);
        }

        return body;
    }
    catch (Exception ex)
    {
        logInfo("error=抓取" + detailUrl + "发生错误:" + ex.Message, detailUrl, template, sb);
        return null;
    }
}
/// <summary>
/// Downloads the list page of a spider template and narrows it to the part
/// matched by the template's list-body regular expression.
/// </summary>
/// <param name="s">Template supplying ListUrl, ListEncoding and the list-body pattern.</param>
/// <param name="sb">Log buffer forwarded to <c>logInfo</c>.</param>
/// <returns>
/// The trimmed matched text; an empty string when the pattern does not
/// match; or the untouched (null or empty) download result when the page
/// has no content. Both failure cases are logged.
/// </returns>
private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
{
    // An empty encoding string is passed when the template does not specify one.
    String encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
    String html = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

    if (strUtil.IsNullOrEmpty(html))
    {
        logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
        return html;
    }

    Match listBody = Regex.Match(html, s.GetListBodyPattern(), RegexOptions.Singleline);
    if (!listBody.Success)
    {
        logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
        return "";
    }

    return listBody.Value.Trim();
}
/// <summary>
/// Builds a <see cref="VideoInfo"/> for a Youku show URL. The video id is
/// the URL with the "v_show/id_" prefix and ".html" suffix removed; title
/// and picture URL are scraped from the downloaded page.
/// </summary>
/// <param name="url">The Youku play page URL.</param>
/// <returns>
/// The info with PlayUrl, FlashUrl and FlashId always set. Title and PicUrl
/// are set only when download and parsing succeed; failures are logged.
/// </returns>
public VideoInfo GetInfo(String url)
{
    String videoId = strUtil.TrimEnd(strUtil.TrimStart(url, "http://v.youku.com/v_show/id_"), ".html");
    String playerUrl = string.Format("http://player.youku.com/player.php/sid/{0}/v.swf", videoId);

    VideoInfo info = new VideoInfo();
    info.PlayUrl = url;
    info.FlashUrl = playerUrl;
    info.FlashId = videoId;

    try
    {
        String html = PageLoader.Download(url);

        String rawTitle = Regex.Match(html, "<title>([^<]+?)</title>").Groups[1].Value;
        String cleanTitle = VideoHelper.GetTitle(rawTitle);

        // The picture URL is the value of a "pic=" parameter, up to the next double quote.
        String picture = Regex.Match(html, "pic=(http://[^:]+?.ykimg.com.+?)\"").Groups[1].Value;

        info.Title = cleanTitle;
        info.PicUrl = picture;
    }
    catch (Exception ex)
    {
        logger.Error("getUrl=" + url);
        logger.Error(ex.Message);
    }

    return info;
}
/// <summary>
/// Builds a <see cref="VideoInfo"/> for a Ku6 show URL. The video id is the
/// URL with the "show/" prefix and ".html" suffix removed; title and picture
/// URL are scraped from the downloaded page.
/// </summary>
/// <param name="url">The Ku6 play page URL.</param>
/// <returns>
/// The info with PlayUrl, FlashUrl and FlashId always set. Title and PicUrl
/// are set only when download and parsing succeed; failures are logged.
/// </returns>
public VideoInfo GetInfo(String url)
{
    String videoId = strUtil.TrimEnd(strUtil.TrimStart(url, "http://v.ku6.com/show/"), ".html");
    String playerUrl = string.Format("http://player.ku6.com/refer/{0}/v.swf", videoId);

    VideoInfo info = new VideoInfo();
    info.PlayUrl = url;
    info.FlashUrl = playerUrl;
    info.FlashId = videoId;

    try
    {
        String html = PageLoader.Download(url);

        String rawTitle = Regex.Match(html, "<title>([^<]+?)</title>").Groups[1].Value;
        String cleanTitle = VideoHelper.GetTitle(rawTitle);

        // The picture URL is the text content of the span with class "s_pic".
        String picture = Regex.Match(html, "<span class=\"s_pic\">([^<]+?)</span>").Groups[1].Value;

        info.Title = cleanTitle;
        info.PicUrl = picture;
    }
    catch (Exception ex)
    {
        logger.Error("getUrl=" + url);
        logger.Error(ex.Message);
    }

    return info;
}
/// <summary>
/// Downloads the list page of a spider template and, when the template
/// defines a list-body pattern, narrows the page to the first element
/// matched by that pattern used as a CSS selector.
/// </summary>
/// <param name="s">Template supplying ListUrl, ListEncoding and the list-body selector.</param>
/// <param name="sb">Log buffer forwarded to <c>logInfo</c>.</param>
/// <returns>
/// The trimmed outer HTML of the first matching element; the trimmed whole
/// page when no selector is configured; <c>null</c> when the selector matches
/// nothing; or the untouched (null or empty) download result when the page
/// has no content. Both failure cases are logged.
/// </returns>
private static string downloadListPageBody(SpiderTemplate s, StringBuilder sb)
{
    // An empty encoding string is passed when the template does not specify one.
    String encoding = strUtil.HasText(s.ListEncoding) ? s.ListEncoding : "";
    String target = PageLoader.Download(s.ListUrl, SpiderConfig.UserAgent, encoding);

    if (strUtil.IsNullOrEmpty(target))
    {
        logInfo("error=原始页面没有内容: " + s.ListUrl, s, sb);
        return target;
    }

    // Read the selector once instead of re-querying the template.
    String selector = s.GetListBodyPattern();
    if (strUtil.IsNullOrEmpty(selector))
    {
        return target.Trim();
    }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };
    htmlDoc.LoadHtml(target);

    // Only the first match is needed, so stop enumerating as soon as it is
    // found rather than counting all matches and copying them to an array.
    HtmlNode firstMatch = htmlDoc.DocumentNode.QuerySelectorAll(selector).FirstOrDefault();
    if (firstMatch == null)
    {
        logInfo("error=没有匹配的页面内容:" + s.ListUrl, s, sb);
        return null;
    }

    return firstMatch.OuterHtml.Trim();
}
/// <summary>
/// Downloads two pages without naming an encoding (once through the
/// single-argument overload, once with an empty encoding string) and asserts
/// that the expected Chinese text is found in each result.
/// </summary>
public void testEncoding()
{
    // Original author's note: an encoding may be supplied; when it is not,
    // it is set from the ContentType of the WebResponse.
    String baiduPage = PageLoader.Download("http://www.baidu.com/");
    Assert.IsNotEmpty(baiduPage);
    int baiduIndex = baiduPage.IndexOf("百度");
    Assert.Greater(baiduIndex, 0);

    const string userAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
    String neteasePage = PageLoader.Download("http://news.163.com/", userAgent, "");
    Assert.IsNotEmpty(neteasePage);
    int neteaseIndex = neteasePage.IndexOf("网易新闻");
    Assert.Greater(neteaseIndex, 0);
}
/// <summary>
/// Builds a <see cref="VideoInfo"/> for an ifeng page whose video id is the
/// part of the URL after the last '#'. The page is downloaded, the
/// <c>li</c> element whose name attribute equals the id is located, and
/// its <c>img</c> source and <c>h4</c> text become the picture URL and title.
/// </summary>
/// <param name="playUrl">URL of the play page, with the video id after '#'.</param>
/// <returns>
/// The info with PlayUrl, FlashId and FlashUrl always set. PicUrl and Title
/// are filled when the list item can be found and parsed; on any exception
/// the error is logged and the partially filled object is returned.
/// </returns>
public VideoInfo GetInfo(string playUrl)
{
    String[] arrItem = playUrl.Split('#');
    String flashId = arrItem[arrItem.Length - 1];

    VideoInfo vi = new VideoInfo();
    vi.PlayUrl = playUrl;
    vi.FlashId = flashId;
    vi.FlashUrl = string.Format("http://v.ifeng.com/include/exterior.swf?guid={0}&AutoPlay=false", flashId);

    try
    {
        String pageBody = PageLoader.Download(playUrl, "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", "utf-8");

        // The id comes from the URL, so escape it before embedding it in the
        // pattern; otherwise characters such as '.', '?' or '(' would be
        // interpreted as regex syntax.
        String itemPattern = "<li name=\"" + Regex.Escape(flashId) + "\">(.+?)</li>";
        Match mt = Regex.Match(pageBody, itemPattern, RegexOptions.Singleline);

        // Normalise single-quoted attributes before handing the fragment to the XML reader.
        String strHtml = mt.Value.Replace("'", "\"");

        using (System.IO.StringReader strRd = new System.IO.StringReader(strHtml))
        using (XmlTextReader reader = new XmlTextReader(strRd))
        {
            reader.MoveToContent();
            while (reader.Read())
            {
                if (equal(reader, "img"))
                {
                    vi.PicUrl = reader.GetAttribute("src");
                }
                else if (equal(reader, "h4"))
                {
                    vi.Title = reader.ReadString();
                }
            }
        }

        return vi;
    }
    catch (Exception ex)
    {
        logger.Error("getUrl=" + playUrl);
        logger.Error(ex.Message);
        return vi;
    }
}
/// <summary>
/// Builds a <see cref="VideoInfo"/> for a 56.com video URL. The video id is
/// taken from the second path segment (with a leading "v_" removed); title
/// and picture URL are scraped from the downloaded page.
/// </summary>
/// <param name="url">The 56.com play page URL.</param>
/// <returns>
/// The info with PlayUrl, FlashId and FlashUrl always set. Title and PicUrl
/// are set only when download and parsing succeed; failures are logged.
/// </returns>
public VideoInfo GetInfo(String url)
{
    // Reduce the URL to its path without scheme, host, extension or leading slash.
    String turl = strUtil.TrimStart(url, "http://");
    turl = strUtil.TrimStart(turl, "www.56.com");
    turl = strUtil.TrimEnd(turl, ".html");
    turl = turl.TrimStart('/');

    String[] arrItem = turl.Split('/');

    // The id normally sits in the second segment. Fall back to the only
    // segment when the path has just one, instead of throwing
    // IndexOutOfRangeException before the try block is even entered.
    String idSegment = arrItem.Length > 1 ? arrItem[1] : arrItem[0];
    String vid = strUtil.TrimStart(idSegment, "v_");
    String flashUrl = string.Format("http://player.56.com/v_{0}.swf", vid);

    VideoInfo vi = new VideoInfo();
    vi.PlayUrl = url;
    vi.FlashId = vid;
    vi.FlashUrl = flashUrl;

    try
    {
        String pageBody = PageLoader.Download(url);

        Match mt = Regex.Match(pageBody, "<title>([^<]+?)</title>");
        String title = VideoHelper.GetTitle(mt.Groups[1].Value);

        String pattern = "\"img\":\"([^\"]+?)\"";
        Match m = Regex.Match(pageBody, pattern);

        // The captured value is JSON-escaped, so drop the backslashes.
        String picUrl = m.Groups[1].Value.Replace("\\", "").Trim();

        vi.Title = title;
        vi.PicUrl = picUrl;

        return vi;
    }
    catch (Exception ex)
    {
        logger.Error("getUrl=" + url);
        logger.Error(ex.Message);
        return vi;
    }
}
/// <summary>
/// Gets Youku video information for a play page URL by resolving the video
/// id, downloading the JSON from <c>youKuJsonDataApiUrl</c>, stripping the
/// response wrapper and importing the remainder as a <see cref="YouKuInfo"/>.
/// </summary>
/// <param name="url">The Youku video page URL.</param>
/// <returns>
/// The imported info, or <c>null</c> when no video id can be derived from
/// the URL or the download returns nothing.
/// </returns>
public static YouKuInfo GetYouKuInfo(string url)
{
    string vid = GetYouKuVideoId(url);
    if (StringPlus.IsNullOrEmpty(vid))
    {
        return null;
    }

    string jsonData = PageLoader.Download(string.Format(youKuJsonDataApiUrl, vid));
    if (StringPlus.IsNullOrEmpty(jsonData))
    {
        return null;
    }

    // Remove only the leading wrapper. A plain Replace would also delete any
    // later occurrence of the same text inside the payload.
    const string strStart = "{\"data\":[";
    if (jsonData.StartsWith(strStart, System.StringComparison.Ordinal))
    {
        jsonData = jsonData.Substring(strStart.Length);
    }

    // Remove the final closing brace, unless it is the only character left.
    // NOTE(review): for a response shaped {"data":[{...}]} this leaves a
    // trailing "]" in front of the importer - confirm the real response
    // shape and whether the importer tolerates trailing text.
    const string strEnd = "}";
    if (jsonData.Length > strEnd.Length && jsonData.EndsWith(strEnd, System.StringComparison.Ordinal))
    {
        jsonData = jsonData.Substring(0, jsonData.Length - strEnd.Length);
    }

    YouKuInfo info = (YouKuInfo)YSWL.Json.Conversion.JsonConvert.Import(typeof(YouKuInfo), jsonData);
    return info;
}
/// <summary>
/// Gets Ku6 video information for a URL by formatting it into
/// <c>ku6XmlDataApiUrl</c>, parsing the XML response and copying the
/// <c>type</c> attribute and the vid, coverurl, flash, title and desc
/// child elements of <c>root/result</c> into a <see cref="Ku6Info"/>.
/// </summary>
/// <param name="url">The Ku6 video page URL.</param>
/// <returns>
/// The populated info, or <c>null</c> when the response is empty, when
/// <c>root/result</c> or its <c>type</c> attribute is missing, or when the
/// type equals -1. Child elements that are absent leave the corresponding
/// field at its default value.
/// </returns>
public static Ku6Info GetKu6Info(string url)
{
    string xmlData = PageLoader.Download(string.Format(ku6XmlDataApiUrl, url));

    // Same guard as IsKu6VideoUrl: an empty download would otherwise make
    // LoadXml throw instead of yielding the "no info" null result.
    if (string.IsNullOrEmpty(xmlData))
    {
        return null;
    }

    XmlDocument xmldoc = new XmlDocument();
    xmldoc.LoadXml(xmlData);

    XmlNode xmlNodeType = xmldoc.SelectSingleNode("root/result");
    if (null == xmlNodeType)
    {
        return null;
    }

    XmlNode typeAttribute = xmlNodeType.Attributes.GetNamedItem("type");
    if (null == typeAttribute)
    {
        return null;
    }

    // A type of -1 marks the URL as not being a Ku6 video.
    int type = Convert.ToInt32(typeAttribute.Value);
    if (type == -1)
    {
        return null;
    }

    Ku6Info info = new Ku6Info();
    info.type = type;

    XmlNode xmlNodeVid = xmldoc.SelectSingleNode("root/result/vid");
    if (null != xmlNodeVid)
    {
        info.vid = xmlNodeVid.InnerText;
    }

    XmlNode xmlNodeCoverurl = xmldoc.SelectSingleNode("root/result/coverurl");
    if (null != xmlNodeCoverurl)
    {
        info.coverurl = xmlNodeCoverurl.InnerText;
    }

    XmlNode xmlNodeFlash = xmldoc.SelectSingleNode("root/result/flash");
    if (null != xmlNodeFlash)
    {
        info.flash = xmlNodeFlash.InnerText;
    }

    XmlNode xmlNodeTitle = xmldoc.SelectSingleNode("root/result/title");
    if (null != xmlNodeTitle)
    {
        info.title = xmlNodeTitle.InnerText;
    }

    XmlNode xmlNodeDesc = xmldoc.SelectSingleNode("root/result/desc");
    if (null != xmlNodeDesc)
    {
        info.desc = xmlNodeDesc.InnerText;
    }

    return info;
}