/// <summary>
/// Parses HTML read from <paramref name="reader"/> into an <see cref="XDoc"/> using an SGML reader.
/// </summary>
/// <param name="reader">Text source handed to the SGML reader as its input stream.</param>
/// <returns>
/// A new <see cref="XDoc"/> wrapping the loaded document, or <c>XDoc.Empty</c> when the loaded
/// document has no root element or when loading throws.
/// </returns>
private static XDoc FromHtml(TextReader reader) {

    // set up the SGML reader; it starts from the DTD cached in _dtd (which may still be null)
    Sgml.SgmlReader htmlReader = new Sgml.SgmlReader(XDoc.XmlNameTable);
    htmlReader.Dtd = _dtd;
    htmlReader.DocType = "HTML";
    htmlReader.WhitespaceHandling = WhitespaceHandling.All;
    htmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
    htmlReader.InputStream = reader;

    XDoc result;
    try {
        XmlDocument parsed = XDoc.NewXmlDocument();
        parsed.Load(htmlReader);
        if(parsed.DocumentElement == null) {

            // nothing usable was produced
            result = XDoc.Empty;
        } else {

            // remember the DTD the reader ended up with so later calls can reuse it
            if(_dtd == null) {
                _dtd = htmlReader.Dtd;
            }
            result = new XDoc(parsed);
        }
    } catch(Exception) {

        // any failure while loading yields an empty document instead of propagating
        result = XDoc.Empty;
    }
    return result;
}
/// <summary>
/// Retrieves text through <c>WebText</c>, wraps it in an <c>html/body</c> envelope, parses it with an
/// SGML reader, and returns the parsed document after passing it through <c>CleanseHtmlDocument</c>.
/// </summary>
/// <param name="source">HTML source text or source uri; forwarded to <c>WebText</c>.</param>
/// <param name="xpath">XPath to value; forwarded to <c>WebText</c>.</param>
/// <param name="namespaces">Namespaces; forwarded to <c>WebText</c>.</param>
/// <param name="ttl">Caching duration in seconds; forwarded to <c>WebText</c>.</param>
/// <param name="nilIfMissing">Forwarded to <c>WebText</c>.</param>
/// <returns>
/// <c>null</c> when <c>WebText</c> returns <c>null</c>; otherwise the result of
/// <c>CleanseHtmlDocument</c> applied to the parsed document, or to <c>XDoc.Empty</c> when parsing
/// failed or produced no root element.
/// </returns>
public static XDoc WebHtml(
    [DekiScriptParam("HTML source text or source uri (default: none)", true)] string source,
    [DekiScriptParam("xpath to value (default: none)", true)] string xpath,
    [DekiScriptParam("namespaces (default: none)", true)] Hashtable namespaces,
    [DekiScriptParam("caching duration in seconds (range: 60 - 86400; default: 300)", true)] double? ttl,
    [DekiScriptParam("return nil if source could not be loaded (default: text with error message)", true)] bool? nilIfMissing
) {
    string text = WebText(source, xpath, namespaces, true, ttl, nilIfMissing);
    if(text == null) {
        return null;
    }

    // convert text to html without a converter; stays empty unless parsing succeeds
    XDoc parsedHtml = XDoc.Empty;
    using(TextReader wrapped = new StringReader(string.Concat("<html><body>", text, "</body></html>"))) {

        // NOTE (steveb): the sgml reader is created explicitly since we don't want the regular DTD
        //                to be associated with it; that DTD would force a potentially unwanted HTML structure

        // load the HTML entities DTD on first use
        // NOTE(review): this lazy initialization is not guarded by a lock in this method - confirm callers
        if(_htmlEntitiesDtd == null) {
            using(StreamReader dtdSource = new StreamReader(Plug.New("resource://mindtouch.deki.script/MindTouch.Deki.Script.Resources.HtmlEntities.dtd").Get().AsStream())) {
                _htmlEntitiesDtd = Sgml.SgmlDtd.Parse(null, "HTML", dtdSource, null, null, XDoc.XmlNameTable);
            }
        }
        Sgml.SgmlReader htmlReader = new Sgml.SgmlReader(XDoc.XmlNameTable) {
            Dtd = _htmlEntitiesDtd,
            DocType = "HTML",
            WhitespaceHandling = WhitespaceHandling.All,
            CaseFolding = Sgml.CaseFolding.ToLower,
            InputStream = wrapped
        };
        try {
            XmlDocument loaded = new XmlDocument(XDoc.XmlNameTable);
            loaded.PreserveWhitespace = true;
            loaded.XmlResolver = null;
            loaded.Load(htmlReader);

            // only keep the document when it has a root element
            if(loaded.DocumentElement != null) {
                parsedHtml = new XDoc(loaded);
            }
        } catch {

            // parsing exceptions are deliberately swallowed; the empty document is used instead
        }
    }
    return CleanseHtmlDocument(parsedHtml);
}
/// <summary>
/// Retrieves text through <c>WebText</c>, wraps it in an <c>html/body</c> envelope, parses it with an
/// SGML reader, and returns the parsed document after passing it through
/// <c>DekiScriptLibrary.CleanseHtmlDocument</c>.
/// </summary>
/// <param name="source">HTML source text or source uri; forwarded to <c>WebText</c>.</param>
/// <param name="xpath">XPath to value; forwarded to <c>WebText</c>.</param>
/// <param name="namespaces">Namespaces; forwarded to <c>WebText</c>.</param>
/// <param name="ttl">Caching duration in seconds; forwarded to <c>WebText</c>.</param>
/// <returns>
/// The result of <c>DekiScriptLibrary.CleanseHtmlDocument</c> applied to the parsed document, or to
/// <c>XDoc.Empty</c> when parsing failed or produced no root element.
/// </returns>
public XDoc WebHtml(
    [DekiExtParam("HTML source text or source uri (default: none)", true)] string source,
    [DekiExtParam("xpath to value (default: none)", true)] string xpath,
    [DekiExtParam("namespaces (default: none)", true)] Hashtable namespaces,
    [DekiExtParam("caching duration in seconds (range: 300+; default: 300)", true)] double? ttl
) {
    string text = WebText(source, xpath, namespaces, true, ttl);

    // convert text to html; stays empty unless parsing succeeds
    XDoc parsedHtml = XDoc.Empty;
    using(TextReader wrapped = new StringReader(string.Concat("<html><body>", text, "</body></html>"))) {

        // NOTE (steveb): the sgml reader is created explicitly since we don't want the regular DTD
        //                to be associated with it; that DTD would force a potentially unwanted HTML structure

        // load the HTML entities DTD on first use
        // NOTE(review): this lazy initialization is not guarded by a lock in this method - confirm callers
        if(_htmlEntitiesDtd == null) {
            using(StreamReader dtdSource = new StreamReader(Plug.New("resource://mindtouch.deki.script/MindTouch.Deki.Script.HtmlEntities.dtd").Get().AsStream())) {
                _htmlEntitiesDtd = Sgml.SgmlDtd.Parse(null, "HTML", dtdSource, null, null, XDoc.XmlNameTable);
            }
        }
        Sgml.SgmlReader htmlReader = new Sgml.SgmlReader(XDoc.XmlNameTable) {
            Dtd = _htmlEntitiesDtd,
            DocType = "HTML",
            WhitespaceHandling = WhitespaceHandling.All,
            CaseFolding = Sgml.CaseFolding.ToLower,
            InputStream = wrapped
        };
        try {
            XmlDocument loaded = new XmlDocument(XDoc.XmlNameTable) {
                PreserveWhitespace = true,
                XmlResolver = null
            };
            loaded.Load(htmlReader);

            // only keep the document when it has a root element
            if(loaded.DocumentElement != null) {
                parsedHtml = new XDoc(loaded);
            }
        } catch(Exception) {

            // parsing exceptions are deliberately swallowed; the empty document is used instead
        }
    }
    return DekiScriptLibrary.CleanseHtmlDocument(parsedHtml);
}
/// <summary>
/// Loads HTML from <paramref name="reader"/> through an SGML reader and wraps the outcome in an
/// <see cref="XDoc"/>.
/// </summary>
/// <param name="reader">Text source assigned to the SGML reader's input stream.</param>
/// <returns>
/// A new <see cref="XDoc"/> for the loaded document; <c>XDoc.Empty</c> if the document has no
/// root element or if an exception is thrown while loading.
/// </returns>
private static XDoc FromHtml(TextReader reader) {
    Sgml.SgmlReader parser = new Sgml.SgmlReader(XDoc.XmlNameTable) {
        Dtd = _dtd,
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = reader
    };
    try {
        XmlDocument document = XDoc.NewXmlDocument();
        document.Load(parser);
        bool hasRoot = document.DocumentElement != null;

        // after a successful parse, keep the reader's DTD in _dtd if none has been stored yet
        if(hasRoot && (_dtd == null)) {
            _dtd = parser.Dtd;
        }
        return hasRoot ? new XDoc(document) : XDoc.Empty;
    } catch(Exception) {

        // loading failures are reported as an empty document rather than an exception
        return XDoc.Empty;
    }
}