/// <summary>
/// Downloads the document at <paramref name="url"/>, parses it through
/// <c>SgmlReader</c> and stores the result in <c>Html</c>, together with the
/// source address (<c>Uri</c>) and the text encoding that was used (<c>Encoding</c>).
/// </summary>
/// <param name="url">Address of the document to fetch.</param>
/// <param name="follow">
/// When <c>true</c>, the site's robots.txt rules are consulted before the request
/// is sent; when <c>false</c>, they are skipped entirely.
/// </param>
/// <param name="agent">
/// User agent sent with the request; when <c>null</c> the request's UserAgent is left unset.
/// </param>
/// <param name="encoding">
/// Encoding used to decode the response body. When <c>null</c>, the charset reported
/// by the response is used, falling back to UTF-8 if it is missing or unrecognised.
/// </param>
/// <exception cref="RobotsDisallowException">
/// Thrown when <paramref name="follow"/> is <c>true</c> and <c>robots.Parse(url)</c> returns <c>false</c>.
/// </exception>
public HtmlReader(string url, bool follow = true, UserAgent agent = null, Encoding encoding = null)
{
    // HTTP request (nothing is sent until GetResponse is called)
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

    // User agent
    if (agent != null)
    {
        req.UserAgent = agent.ToString();
    }

    // robots.txt
    Robots robots = (follow) ? Robots.Create(new Uri(url)) : null;
    if (robots != null)
    {
        if (!robots.Parse(url))
        {
            throw new RobotsDisallowException("Robots Disallow [" + url + "]");
        }

        // CrawlDelay is treated as seconds; Thread.Sleep expects milliseconds.
        if (robots.CrawlDelay != 0)
        {
            System.Threading.Thread.Sleep(robots.CrawlDelay * 1000);
        }
    }

    using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
    using (Stream stream = res.GetResponseStream())
    {
        Encoding enc = encoding;
        if (enc == null)
        {
            // The server-reported charset can be absent, empty, wrapped in quotes
            // or simply unknown to this runtime. Encoding.GetEncoding throws for
            // all of those, so normalise the name and fall back to UTF-8 rather
            // than failing the whole download.
            string charset = (res.CharacterSet ?? string.Empty).Trim().Trim('"', '\'');
            try
            {
                enc = (charset.Length != 0) ? Encoding.GetEncoding(charset) : Encoding.UTF8;
            }
            catch (ArgumentException)
            {
                enc = Encoding.UTF8;
            }
        }

        using (StreamReader reader = new StreamReader(stream, enc))
        using (SgmlReader sgml = new SgmlReader
        {
            DocType = "HTML",
            InputStream = reader,
            CaseFolding = CaseFolding.ToLower,
            IgnoreDtd = true
        })
        {
            Html = XDocument.Load(sgml, LoadOptions.None);
            Uri = url;
            Encoding = enc;
        }
    }
}
/// <summary>
/// Sample entry point: loads a video sitemap through <c>HtmlReader</c> and locates
/// the first <c>video:content_loc</c> element whose value contains a fixed id,
/// together with its enclosing <c>url</c> and <c>video:video</c> ancestors.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    string url = @"http://www.1pondo.tv/static-seo/sitemap-videos.xml";
    UserAgent agent = new UserAgent("hoge", 1, "*****@*****.**", DefaultAgents.Firefox14);

    var reader = new HtmlReader(url, true, agent);
    var xml = reader.Html;
    XNamespace ns = reader.Namespace;

    // The video namespace is taken from the xmlns:video declaration on the root
    // element; Attribute() returns null when that declaration is absent.
    XAttribute videoDeclaration = xml.Root.Attribute(XNamespace.Xmlns + "video");
    if (videoDeclaration == null)
    {
        System.Console.Error.WriteLine("xmlns:video declaration not found [" + url + "]");
        return;
    }
    XNamespace video = videoDeclaration.Value;

    // Resolve the match once: the query is deferred, so calling First() on it
    // repeatedly would re-scan the document each time and throw when empty.
    XElement match = xml.Descendants(video + "content_loc")
                        .FirstOrDefault(a => a.Value.Contains("092615_161"));
    if (match == null)
    {
        System.Console.Error.WriteLine("No content_loc entry contains 092615_161 [" + url + "]");
        return;
    }

    // NOTE(review): these results are not consumed here — presumably kept for
    // inspection under a debugger; confirm before removing.
    var item = match.Ancestors(ns + "url");
    var vitem = match.Ancestors(video + "video");
}