/// <summary>
/// Parses HTML content into an <see cref="XmlDocument"/> whose document element is a
/// synthetic <c>&lt;root&gt;</c> element wrapping the supplied markup.
/// </summary>
/// <param name="sContent">HTML markup to parse; it is wrapped in &lt;root&gt;...&lt;/root&gt; before being read.</param>
/// <returns>
/// A document loaded (with whitespace preserved and no XML resolver) from the XML
/// text produced by copying the SGML reader's output through an indenting writer.
/// </returns>
public static XmlDocument ParseHtml(string sContent)
{
    string xml;

    using (StringReader sr = new StringReader("<root>" + sContent + "</root>"))
    using (StringWriter sw = new StringWriter())
    {
        SgmlReader reader = new SgmlReader();
        try
        {
            reader.WhitespaceHandling = WhitespaceHandling.All;
            reader.CaseFolding = Sgml.CaseFolding.ToLower;
            reader.InputStream = sr;

            XmlTextWriter w = new XmlTextWriter(sw);
            try
            {
                w.Formatting = Formatting.Indented;
                w.WriteStartDocument();

                // WriteNode leaves the reader on the node that follows the copied
                // one, so the loop only tests EOF and never calls Read() itself.
                reader.Read();
                while (!reader.EOF)
                {
                    w.WriteNode(reader, true);
                }

                w.Flush();
            }
            finally
            {
                // Close even when copying fails so the writer is never left open.
                w.Close();
            }
        }
        finally
        {
            reader.Close();
        }

        // StringWriter keeps its buffer after being closed by the XmlTextWriter.
        xml = sw.ToString();
    }

    // create document
    XmlDocument doc = new XmlDocument();
    doc.PreserveWhitespace = true;
    doc.XmlResolver = null;
    doc.LoadXml(xml);
    return doc;
}
/// <summary>
/// Converts HTML text to well-formed XML and, optionally, extracts the values of the
/// nodes selected by an XPath expression.
/// </summary>
/// <param name="html">HTML markup to convert.</param>
/// <param name="xpathNavPath">
/// XPath expression evaluated against the converted markup; when <c>null</c> the whole
/// converted markup is returned.
/// </param>
/// <returns>
/// The converted markup, or the selected node values separated by "\n"; in both cases
/// "\r" is replaced by "\n" and "\n\n" pairs are collapsed in a single pass.
/// If any exception occurs its <see cref="Exception.Message"/> is returned instead.
/// </returns>
public static string GetWellFormedHTML(string html, string xpathNavPath)
{
    StringWriter sw = null;
    SgmlReader reader = null;
    XmlTextWriter writer = null;

    try
    {
        reader = new SgmlReader();
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(html);

        sw = new StringWriter();
        writer = new XmlTextWriter(sw);
        writer.Formatting = Formatting.Indented;

        // WriteNode already advances the reader past the node it copied, so Read()
        // is only called explicitly for the top-level whitespace nodes we skip.
        // Calling Read() after every WriteNode would step over the next node.
        reader.Read();
        while (!reader.EOF)
        {
            if (reader.NodeType == XmlNodeType.Whitespace)
            {
                reader.Read();
            }
            else
            {
                writer.WriteNode(reader, true);
            }
        }

        writer.Flush();

        string result;
        if (xpathNavPath == null)
        {
            result = sw.ToString();
        }
        else
        {
            // Filter out nodes from HTML
            StringBuilder sb = new StringBuilder();
            XPathDocument doc = new XPathDocument(new StringReader(sw.ToString()));
            XPathNavigator nav = doc.CreateNavigator();
            XPathNodeIterator nodes = nav.Select(xpathNavPath);
            while (nodes.MoveNext())
            {
                sb.Append(nodes.Current.Value + "\n");
            }

            result = sb.ToString();
        }

        result = result.Replace("\r", "\n");
        result = result.Replace("\n\n", "\n");
        return result;
    }
    catch (Exception exp)
    {
        // Callers receive the failure text as the return value.
        return exp.Message;
    }
    finally
    {
        // Any of these may still be null if construction failed part-way.
        if (writer != null)
        {
            writer.Close();
        }

        if (reader != null)
        {
            reader.Close();
        }

        if (sw != null)
        {
            sw.Close();
        }
    }
}
/// <summary>
/// Loads the content fetched from <paramref name="url"/> into an
/// <see cref="XmlDocument"/> by reading it through an SgmlReader.
/// </summary>
/// <param name="url">Address passed to FetchWebText to obtain the input text reader.</param>
/// <returns>The document loaded from the SGML reader.</returns>
XmlDocument FetchXmlDocument(Uri url)
{
    var sr = FetchWebText(url);
    try
    {
        var xr = new SgmlReader() { InputStream = sr };
        try
        {
            var doc = new XmlDocument();
            doc.Load(xr);
            return doc;
        }
        finally
        {
            // Release the reader even when Load throws on malformed input.
            xr.Close();
        }
    }
    finally
    {
        sr.Close();
    }
}
/// <summary>
/// Parses the command-line arguments, configures an SgmlReader from them and converts
/// either a single input or every file matching a wildcard pattern.
/// </summary>
/// <param name="args">
/// Options prefixed with '-' or '/', followed by an optional input URI and an optional
/// output file name. An unknown option, or an option missing its value, prints the
/// usage text and returns without processing anything.
/// </param>
public void Run(string[] args)
{
    SgmlReader reader = new SgmlReader();
    string inputUri = null;

    for (int i = 0; i < args.Length; i++)
    {
        string arg = args[i];
        if (string.IsNullOrEmpty(arg))
        {
            // An empty argument carries no information and arg[0] would throw.
            continue;
        }

        if (arg[0] == '-' || arg[0] == '/')
        {
            string option = arg.Substring(1);

            // Options that consume the following argument as their value.
            string value = null;
            if (option == "e" || option == "dtd" || option == "proxy" || option == "encoding")
            {
                if (i + 1 >= args.Length)
                {
                    Console.WriteLine("Missing value for option '" + arg + "'");
                    Console.WriteLine();
                    WriteUsage();
                    return;
                }

                value = args[++i];
            }

            // NOTE(review): the usage text advertises "-base" but no case handles it
            // here, so it falls through to the usage output - confirm intent.
            switch (option)
            {
                case "e":
                    if (string.Equals(value, "$stderr", StringComparison.OrdinalIgnoreCase))
                    {
                        reader.ErrorLog = Console.Error;
                    }
                    else
                    {
                        reader.ErrorLogFile = value;
                    }
                    break;
                case "html":
                    reader.DocType = "HTML";
                    break;
                case "dtd":
                    reader.SystemLiteral = value;
                    break;
                case "proxy":
                    proxy = value;
                    reader.WebProxy = proxy;
                    break;
                case "encoding":
                    encoding = Encoding.GetEncoding(value);
                    break;
                case "f":
                    formatted = true;
                    reader.WhitespaceHandling = WhitespaceHandling.None;
                    break;
                case "noxml":
                    noxmldecl = true;
                    break;
                case "doctype":
                    reader.StripDocType = false;
                    break;
                case "lower":
                    reader.CaseFolding = CaseFolding.ToLower;
                    break;
                case "upper":
                    reader.CaseFolding = CaseFolding.ToUpper;
                    break;
                default:
                    WriteUsage();
                    return;
            }
        }
        else if (inputUri == null)
        {
            inputUri = arg;

            // Ordinal comparison: the extension check must not depend on the current culture.
            string ext = Path.GetExtension(arg);
            if (string.Equals(ext, ".htm", StringComparison.OrdinalIgnoreCase) ||
                string.Equals(ext, ".html", StringComparison.OrdinalIgnoreCase))
            {
                reader.DocType = "HTML";
            }
        }
        else if (output == null)
        {
            output = arg;
        }
    }

    // A '?' or '*' in a web address is part of the URL, not a file wildcard.
    bool isWebAddress = inputUri != null &&
        (inputUri.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
         inputUri.StartsWith("https://", StringComparison.OrdinalIgnoreCase));

    if (inputUri != null && !isWebAddress && inputUri.IndexOfAny(new char[] { '*', '?' }) >= 0)
    {
        // wild card processing of a directory of files.
        string path = Path.GetDirectoryName(inputUri);
        if (string.IsNullOrEmpty(path))
        {
            path = ".\\";
        }

        string ext = ".xml";
        if (output != null)
        {
            ext = Path.GetExtension(output);
        }

        foreach (string uri in Directory.GetFiles(path, Path.GetFileName(inputUri)))
        {
            Console.WriteLine("Processing: " + uri);
            string file = Path.GetFileName(uri);
            output = Path.GetDirectoryName(uri) + Path.DirectorySeparatorChar + Path.GetFileNameWithoutExtension(file) + ext;
            try
            {
                Process(reader, uri);
            }
            finally
            {
                reader.Close();
            }
        }

        return;
    }

    try
    {
        Process(reader, inputUri);
    }
    finally
    {
        reader.Close();
    }
}

/// <summary>Writes the command-line usage text to standard output.</summary>
private static void WriteUsage()
{
    Console.WriteLine("Usage: SgmlReader <options> [InputUri] [OutputFile]");
    Console.WriteLine("-e log Optional log file name, name of '$STDERR' will write errors to stderr");
    Console.WriteLine("-f Whether to pretty print the output.");
    Console.WriteLine("-html Specify the built in HTML dtd");
    Console.WriteLine("-dtd url Specify other SGML dtd to use");
    Console.WriteLine("-base Add base tag to output HTML");
    Console.WriteLine("-noxml Do not add XML declaration to the output");
    Console.WriteLine("-proxy svr:80 Proxy server to use for http requests");
    Console.WriteLine("-encoding name Specify an encoding for the output file (default UTF-8)");
    Console.WriteLine("-lower Convert input tags to lower case");
    Console.WriteLine("-upper Convert input tags to upper case");
    Console.WriteLine();
    Console.WriteLine("InputUri The input file or http URL (default stdin). ");
    Console.WriteLine(" Supports wildcards for local file names.");
    Console.WriteLine("OutputFile Output file name (default stdout)");
    Console.WriteLine(" If input file contains wildcards then this just specifies the output file extension (default .xml)");
}
/// <summary>
/// Follows the <c>&lt;a href&gt;</c> links of <paramref name="doc"/>: every link not yet
/// recorded in <c>visited</c> is downloaded, parsed through an SgmlReader and crawled
/// recursively. Links are only followed while <c>depth</c> is below 5.
/// </summary>
/// <param name="dtd">DTD assigned to each SgmlReader created for a linked page.</param>
/// <param name="doc">
/// Document whose links are followed. Relative links are resolved against its BaseURI,
/// or against the href of <c>/html/head/base</c> when that element has one.
/// </param>
/// <param name="log">Writer that receives progress and error messages.</param>
/// <returns><c>true</c> unless a nested crawl returned <c>false</c>.</returns>
bool Crawl(SgmlDtd dtd, XmlDocument doc, TextWriter log)
{
    depth++;
    try
    {
        // One space of indentation per nesting level in the log output.
        string indent = new string(' ', depth);

        count++;
        Uri baseUri = new Uri(doc.BaseURI);
        XmlElement baseElmt = (XmlElement)doc.SelectSingleNode("/html/head/base");
        if (baseElmt != null)
        {
            string baseHref = baseElmt.GetAttribute("href");
            if (baseHref != "")
            {
                try
                {
                    baseUri = new Uri(baseHref);
                }
                catch (Exception)
                {
                    Console.WriteLine("### Error parsing BASE href '" + baseHref + "'");
                }
            }
        }

        foreach (XmlElement a in doc.SelectNodes("//a"))
        {
            string href = a.GetAttribute("href");
            if (string.IsNullOrEmpty(href) || depth >= 5)
            {
                continue;
            }

            Uri local;
            try
            {
                local = new Uri(baseUri, href);
            }
            catch (UriFormatException)
            {
                // One malformed link must not abort the crawl of the whole page.
                log.WriteLine(indent + "### Error parsing href '" + href + "'");
                log.Flush();
                continue;
            }

            if (domain && baseUri.Host != local.Host)
            {
                continue;
            }

            // Ordinal comparison so the media filter does not depend on the current culture.
            string ext = Path.GetExtension(local.AbsolutePath);
            if (string.Equals(ext, ".jpg", StringComparison.OrdinalIgnoreCase) ||
                string.Equals(ext, ".gif", StringComparison.OrdinalIgnoreCase) ||
                string.Equals(ext, ".mpg", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            string url = local.AbsoluteUri;
            if (visited.ContainsKey(url))
            {
                continue;
            }

            visited.Add(url, url);
            log.WriteLine(indent + "Loading '" + url + "'");
            log.Flush();

            StreamReader stm = null;
            WebResponse resp = null;
            try
            {
                HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
                wr.Timeout = 10000;
                if (proxy != null)
                {
                    wr.Proxy = new WebProxy(proxy);
                }

                wr.PreAuthenticate = false;
                // Pass the credentials of the process.
                wr.Credentials = CredentialCache.DefaultCredentials;

                resp = wr.GetResponse();
                Uri actual = resp.ResponseUri;
                if (actual.AbsoluteUri != url)
                {
                    local = new Uri(actual.AbsoluteUri);
                    log.WriteLine(indent + "Redirected to '" + actual.AbsoluteUri + "'");
                    log.Flush();
                }

                // Compare only the media type: servers commonly append parameters
                // such as "; charset=utf-8" to the Content-Type header.
                string contentType = resp.ContentType ?? "";
                string mediaType = contentType.Split(';')[0].Trim();
                if (!string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase))
                {
                    log.WriteLine(indent + "Skipping ContentType=" + resp.ContentType);
                    log.Flush();
                    resp.Close();
                }
                else
                {
                    stm = new StreamReader(resp.GetResponseStream());
                }
            }
            catch (Exception e)
            {
                log.WriteLine(indent + "### Error opening URL: " + e.Message);
                log.Flush();
                if (resp != null)
                {
                    // Do not leave the connection open when setup failed after the response arrived.
                    resp.Close();
                }
            }

            if (stm == null)
            {
                continue;
            }

            SgmlReader reader = new SgmlReader();
            reader.Dtd = dtd;
            reader.SetBaseUri(local.AbsoluteUri);
            reader.InputStream = stm;
            reader.WebProxy = proxy;

            XmlDocument d2 = new XmlDocument();
            d2.XmlResolver = null; // don't do any downloads!
            try
            {
                try
                {
                    d2.Load(reader);
                }
                finally
                {
                    // Release reader and response stream whether or not parsing succeeded.
                    reader.Close();
                    stm.Close();
                }

                if (!Crawl(dtd, d2, log))
                {
                    return false;
                }
            }
            catch (Exception e)
            {
                log.WriteLine(indent + "### Error parsing document '" + local.AbsoluteUri + "', " + e.Message);
                log.Flush();
            }
        }

        return true;
    }
    finally
    {
        // Restore the nesting level on every exit path, including early returns and exceptions.
        depth--;
    }
}
/***************************************************************************
 * Useful debugging code...
 * **************************************************************************/

/// <summary>
/// Loads the document exposed by <paramref name="reader"/>, optionally inserts a
/// <c>&lt;base&gt;</c> element pointing at the document's BaseURI, and crawls its links.
/// Load and crawl failures are reported on the console rather than thrown.
/// </summary>
/// <param name="reader">Reader positioned on the document to load; it is closed after loading.</param>
/// <param name="uri">Not used by this method.</param>
/// <param name="basify">When true, a base element is added if the document has none.</param>
void StartCrawl(SgmlReader reader, string uri, bool basify)
{
    Console.WriteLine("Loading '" + reader.BaseURI + "'");
    XmlDocument doc = new XmlDocument();
    try
    {
        doc.XmlResolver = null; // don't do any downloads!
        doc.Load(reader);
    }
    catch (Exception e)
    {
        Console.WriteLine("Error loading document\n" + e.Message);
    }
    finally
    {
        reader.Close();
    }

    if (basify && doc.SelectSingleNode("//base") == null)
    {
        // html and head are optional, if they are there use them otherwise not.
        XmlElement parent = (XmlElement)doc.SelectSingleNode("//head");
        if (parent == null)
        {
            parent = (XmlElement)doc.SelectSingleNode("//html");
        }

        if (parent == null)
        {
            parent = doc.DocumentElement;
        }

        // After a failed load the document may have no root element at all;
        // in that case there is nowhere to put the base element.
        if (parent != null)
        {
            XmlElement be = doc.CreateElement("base");
            be.SetAttribute("href", doc.BaseURI);
            parent.InsertBefore(be, parent.FirstChild);
        }
    }

    try
    {
        // NOTE(review): falls back to the console in case no error log was configured
        // on the reader - confirm whether ErrorLog can actually be null here.
        Crawl(reader.Dtd, doc, reader.ErrorLog ?? Console.Out);
    }
    catch (Exception e)
    {
        Console.WriteLine("Uncaught exception: " + e.Message);
    }
}
/// <summary>
/// Connects <paramref name="reader"/> to its input and then, depending on the instance
/// flags, runs the debug dump, starts a crawl, or converts the input to XML written to
/// the configured output file (or standard output).
/// </summary>
/// <param name="reader">Reader to configure and consume.</param>
/// <param name="uri">Input location; <c>null</c> reads from standard input.</param>
/// <param name="loadAsStream">
/// When true the input is opened here (file or web request) and handed to the reader
/// as a stream; otherwise the location is assigned to the reader's Href.
/// </param>
void Process(SgmlReader reader, string uri, bool loadAsStream)
{
    if (uri == null)
    {
        reader.InputStream = Console.In;
    }
    else if (loadAsStream)
    {
        Uri location = new Uri(uri);
        if (location.IsFile)
        {
            reader.InputStream = new StreamReader(uri);
        }
        else
        {
            WebRequest wr = WebRequest.Create(location);
            reader.InputStream = new StreamReader(wr.GetResponse().GetResponseStream());
        }
    }
    else
    {
        reader.Href = uri;
    }

    if (debug)
    {
        Debug(reader);
        reader.Close();
        return;
    }

    if (crawl)
    {
        StartCrawl(reader, uri, basify);
        return;
    }

    if (this.encoding == null)
    {
        this.encoding = reader.GetEncoding();
    }

    XmlTextWriter w;
    if (output != null)
    {
        w = new XmlTextWriter(output, this.encoding);
    }
    else
    {
        w = new XmlTextWriter(Console.Out);
    }

    try
    {
        if (formatted)
        {
            w.Formatting = Formatting.Indented;
        }

        if (!noxmldecl)
        {
            w.WriteStartDocument();
        }

        if (testdoc)
        {
            XmlDocument doc = new XmlDocument();
            try
            {
                doc.Load(reader);
                doc.WriteTo(w);
            }
            catch (XmlException e)
            {
                Console.WriteLine("Error:" + e.Message);
                Console.WriteLine("at line " + e.LineNumber + " column " + e.LinePosition);
            }
        }
        else
        {
            // WriteNode advances the reader itself, so only EOF is tested here.
            reader.Read();
            while (!reader.EOF)
            {
                w.WriteNode(reader, true);
            }
        }

        w.Flush();
    }
    finally
    {
        // Always release the writer so a failed conversion does not keep the output file open.
        w.Close();
    }
}
/// <summary>
/// Downloads the page at <paramref name="uri"/>, converts it to XML through an
/// SgmlReader and checks that the result loads as an <see cref="XPathDocument"/>.
/// </summary>
/// <param name="uri">Address to download; an empty string selects a built-in default address.</param>
/// <returns>
/// An empty string on success (no node extraction is performed by this method), or the
/// message of the exception that occurred.
/// </returns>
private string GetWellFormedHTML_Handle(string uri)
{
    HttpWebResponse res = null;
    StreamReader sReader = null;
    StringWriter sw = null;
    SgmlReader reader = null;
    XmlTextWriter writer = null;

    try
    {
        if (uri == String.Empty)
        {
            uri = "http://www.ypshop.net/list--91-940-940--search-1.html";
        }

        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
        res = (HttpWebResponse)req.GetResponse();
        sReader = new StreamReader(res.GetResponseStream());

        reader = new SgmlReader();
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(sReader.ReadToEnd());

        sw = new StringWriter();
        writer = new XmlTextWriter(sw);
        writer.Formatting = Formatting.Indented;

        // WriteNode already moves the reader past the copied node; Read() is only
        // needed to step over the top-level whitespace nodes that are skipped.
        reader.Read();
        while (!reader.EOF)
        {
            if (reader.NodeType == XmlNodeType.Whitespace)
            {
                reader.Read();
            }
            else
            {
                writer.WriteNode(reader, true);
            }
        }

        writer.Flush();

        // Loading the converted markup throws if it is not well-formed XML; that
        // failure is reported through the catch block below.
        new XPathDocument(new StringReader(sw.ToString()));

        return String.Empty;
    }
    catch (Exception exp)
    {
        return exp.Message;
    }
    finally
    {
        // Any of these may still be null if the failure happened early.
        if (writer != null)
        {
            writer.Close();
        }

        if (reader != null)
        {
            reader.Close();
        }

        if (sw != null)
        {
            sw.Close();
        }

        if (sReader != null)
        {
            sReader.Close();
        }

        if (res != null)
        {
            res.Close();
        }
    }
}
/// <summary>
/// Reads the content of an HTML page and converts it to well-formed XML.
/// </summary>
/// <param name="uri">Page address; an empty string selects a built-in default address.</param>
/// <param name="xpath">
/// XPath expression selecting the nodes whose values are returned; when <c>null</c> the
/// whole converted markup is returned.
/// </param>
/// <returns>
/// The converted markup, or the selected node values each followed by a space.
/// If any exception occurs its <see cref="Exception.Message"/> is returned instead.
/// </returns>
private string GetWellFormedHTML(string uri, string xpath)
{
    StringWriter sw = null;         // receives the generated XML text
    SgmlReader reader = null;       // reads the HTML as SGML
    XmlTextWriter writer = null;    // produces the XML stream

    try
    {
        if (uri == String.Empty)
        {
            uri = "http://www.ypshop.net/list--91-940-940--search-1.html";
        }

        // Page content
        string strWebContent;
        using (WebClient webclient = new WebClient())
        {
            webclient.Encoding = Encoding.UTF8;
            strWebContent = webclient.DownloadString(uri);
        }

        reader = new SgmlReader();
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(strWebContent);

        sw = new StringWriter();
        writer = new XmlTextWriter(sw);
        writer.Formatting = Formatting.Indented;

        // WriteNode already moves the reader past the copied node; Read() is only
        // needed to step over the top-level whitespace nodes that are skipped.
        reader.Read();
        while (!reader.EOF)
        {
            if (reader.NodeType == XmlNodeType.Whitespace)
            {
                reader.Read();
            }
            else
            {
                writer.WriteNode(reader, true);
            }
        }

        writer.Flush();

        if (xpath == null)
        {
            return sw.ToString();
        }

        // Filter out nodes from HTML
        StringBuilder sb = new StringBuilder();
        XPathDocument doc = new XPathDocument(new StringReader(sw.ToString()));
        XPathNavigator nav = doc.CreateNavigator();
        XPathNodeIterator nodes = nav.Select(xpath);
        while (nodes.MoveNext())
        {
            sb.Append(nodes.Current.Value + " ");
        }

        return sb.ToString();
    }
    catch (Exception exp)
    {
        // Callers receive the failure text as the return value.
        return exp.Message;
    }
    finally
    {
        // Any of these may still be null if the failure happened early
        // (for example when the download itself failed).
        if (writer != null)
        {
            writer.Close();
        }

        if (reader != null)
        {
            reader.Close();
        }

        if (sw != null)
        {
            sw.Close();
        }
    }
}
/// <summary>
/// Converts HTML markup to XHTML.
/// </summary>
/// <param name="html">HTML markup to convert.</param>
/// <returns>
/// The indented XML text produced by reading <paramref name="html"/> through an
/// SgmlReader with lower-case folding and whitespace handling set to None.
/// </returns>
public static string ToXhtml(string html)
{
    SgmlReader reader = new SgmlReader();
    try
    {
        reader.CaseFolding = CaseFolding.ToLower;
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(html);

        using (StringWriter sw = new StringWriter(CultureInfo.InvariantCulture))
        {
            XmlTextWriter writer = new XmlTextWriter(sw);
            try
            {
                writer.Formatting = Formatting.Indented;
                reader.WhitespaceHandling = WhitespaceHandling.None;
                while (!reader.EOF)
                {
                    writer.WriteNode(reader, true);
                }
            }
            finally
            {
                // Close the XML writer before reading the text: it writes into sw,
                // so it must be finished while sw is still open.
                writer.Close();
            }

            // StringWriter keeps its buffer after it has been closed.
            return sw.ToString();
        }
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Downloads the resource at <c>_uri</c>, parses it as HTML through an SgmlReader and
/// stores the resulting document in <c>_doc</c>.
/// </summary>
/// <returns>The loaded document (the same instance that is assigned to <c>_doc</c>).</returns>
public XmlDocument Proceed()
{
    HttpWebRequest req = (HttpWebRequest)HttpWebRequest.Create(_uri);

    // create document
    XmlDocument doc = new XmlDocument();
    doc.PreserveWhitespace = true;
    doc.XmlResolver = null;

    WebResponse response = req.GetResponse();
    try
    {
        var st = response.GetResponseStream();

        // The body is always decoded with code page 1251, regardless of what the
        // response declares.
        System.IO.TextReader tr = new System.IO.StreamReader(st, System.Text.Encoding.GetEncoding(1251));

        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        try
        {
            sgmlReader.DocType = "HTML";
            sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
            sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
            sgmlReader.InputStream = tr;

            doc.Load(sgmlReader);
        }
        finally
        {
            sgmlReader.Close();
        }
    }
    finally
    {
        // Release the connection even when parsing fails.
        response.Close();
    }

    _doc = doc;
    return doc;
}
/// <summary>
/// Parses HTML to XML and returns it as a string; a parsing error is propagated to the
/// caller as an exception.
/// </summary>
/// <remarks>Use this method when you want to catch a parsing error.</remarks>
/// <param name="html">HTML string to parse.</param>
/// <returns>
/// The XML text produced from the pre-processed input, passed through PostProcessHtml.
/// Top-level whitespace nodes are omitted, and document type nodes are omitted when
/// <c>ParserProperties.RemoveDocumentType</c> is set.
/// </returns>
public string GetParsableString(string html)
{
    html = PreProcessHtml(html);

    // set SgmlReader values
    SgmlReader reader = new SgmlReader();
    reader.DocType = "HTML";
    reader.InputStream = new StringReader(html);

    // write to xml
    StringWriter sw = new StringWriter();
    XmlTextWriter w = new XmlTextWriter(sw);
    w.Formatting = Formatting.Indented;

    try
    {
        bool removeDocType = this.ParserProperties.RemoveDocumentType;

        // WriteNode already moves the reader past the node it copied, so Read() is
        // only called for nodes that are skipped. Calling Read() after every
        // WriteNode would step over the node that follows.
        reader.Read();
        while (!reader.EOF)
        {
            XmlNodeType nodeType = reader.NodeType;

            // The document type is dropped only when the option asks for it; with
            // the option off every other node must still be written.
            bool skip = nodeType == XmlNodeType.Whitespace
                || (removeDocType && nodeType == XmlNodeType.DocumentType);

            if (skip)
            {
                reader.Read();
            }
            else
            {
                // Write entire node (and its subtree) to xml
                w.WriteNode(reader, true);
            }
        }

        w.Flush();
        return PostProcessHtml(sw.ToString());
    }
    finally
    {
        reader.Close();
        w.Close();
    }
}