/// <summary>
/// Creates an <see cref="XmlReader"/> over <paramref name="html"/>, backed by an
/// <c>SgmlReader</c> that uses the "Html.dtd" resource embedded in the assembly
/// that declares <c>SgmlReader</c>.
/// </summary>
/// <param name="baseUri">Base URI passed to the DTD parser and set on the returned reader.</param>
/// <param name="html">The HTML markup the returned reader will read.</param>
/// <returns>
/// A reader configured with <c>WhitespaceHandling.All</c>, lower-case folding and
/// <c>IgnoreDtd = true</c>.
/// </returns>
/// <exception cref="InvalidOperationException">
/// The embedded "Html.dtd" resource could not be found.
/// </exception>
public static XmlReader Create(string baseUri, string html)
{
    var assembly = typeof(SgmlReader).Assembly;
    const string name = "Html.dtd";
    SgmlDtd dtd;

    using (var resource = assembly.GetManifestResourceStream(name))
    {
        // GetManifestResourceStream returns null (it does not throw) when the resource
        // is missing; fail with a message that names the resource instead of letting
        // StreamReader raise an anonymous ArgumentNullException.
        if (resource == null)
        {
            throw new InvalidOperationException(
                "Embedded resource '" + name + "' was not found in assembly '" + assembly.FullName + "'.");
        }

        using (var input = new StreamReader(resource))
        {
            dtd = SgmlDtd.Parse(new Uri(baseUri), "HTML", input, null, null, null);
        }
    }

    var reader = new SgmlReader
    {
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = CaseFolding.ToLower,
        Dtd = dtd,
        IgnoreDtd = true,
        InputStream = new StringReader(html),
    };
    reader.SetBaseUri(baseUri);
    return reader;
}
/// <summary>
/// Recursively follows the anchor (<c>a</c>) elements of <paramref name="doc"/>: every
/// link that has not been visited yet is downloaded, parsed with an <c>SgmlReader</c>
/// into a new <see cref="XmlDocument"/> and crawled in turn.
/// </summary>
/// <remarks>
/// Uses members declared outside this method: <c>depth</c> (incremented for the duration
/// of the call; links are only followed while it is below 5), <c>count</c> (incremented
/// once per call), <c>domain</c> (when true, links to a different host are skipped),
/// <c>proxy</c> and <c>visited</c>.
/// </remarks>
/// <param name="dtd">DTD assigned to every <c>SgmlReader</c> created for a downloaded page.</param>
/// <param name="doc">
/// Document whose anchors are crawled. Relative links are resolved against its
/// <c>BaseURI</c>, or against the <c>/html/head/base</c> href when that is present and parseable.
/// </param>
/// <param name="log">Writer that receives progress and error messages.</param>
/// <returns>
/// <c>true</c> when the crawl ran to completion; <c>false</c> only when a nested call
/// returned <c>false</c>.
/// </returns>
bool Crawl(SgmlDtd dtd, XmlDocument doc, TextWriter log)
{
    depth++;
    try
    {
        string indent = new string(' ', depth);
        count++;

        Uri baseUri = new Uri(doc.BaseURI);
        XmlElement baseElmt = (XmlElement)doc.SelectSingleNode("/html/head/base");
        if (baseElmt != null)
        {
            string baseHref = baseElmt.GetAttribute("href");
            if (baseHref != "")
            {
                try
                {
                    baseUri = new Uri(baseHref);
                }
                catch (Exception)
                {
                    Console.WriteLine("### Error parsing BASE href '" + baseHref + "'");
                }
            }
        }

        foreach (XmlElement a in doc.SelectNodes("//a"))
        {
            string href = a.GetAttribute("href");
            if (string.IsNullOrEmpty(href) || depth >= 5)
            {
                continue;
            }

            // A malformed href must not abort the crawl of the whole page: skip the link.
            Uri local;
            if (!Uri.TryCreate(baseUri, href, out local))
            {
                log.WriteLine(indent + "### Error parsing href '" + href + "'");
                log.Flush();
                continue;
            }

            if (domain && baseUri.Host != local.Host)
            {
                continue;
            }

            // Invariant lower-casing so the comparison does not depend on the current culture.
            string ext = Path.GetExtension(local.AbsolutePath).ToLowerInvariant();
            if (ext == ".jpg" || ext == ".gif" || ext == ".mpg")
            {
                continue;
            }

            string url = local.AbsoluteUri;
            if (visited.ContainsKey(url))
            {
                continue;
            }
            visited.Add(url, url);

            log.WriteLine(indent + "Loading '" + url + "'");
            log.Flush();

            StreamReader stm = null;
            WebResponse resp = null;
            try
            {
                HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
                wr.Timeout = 10000;
                if (proxy != null)
                {
                    wr.Proxy = new WebProxy(proxy);
                }
                wr.PreAuthenticate = false;
                // Pass the credentials of the process.
                wr.Credentials = CredentialCache.DefaultCredentials;

                resp = wr.GetResponse();
                Uri actual = resp.ResponseUri;
                if (actual.AbsoluteUri != url)
                {
                    local = new Uri(actual.AbsoluteUri);
                    log.WriteLine(indent + "Redirected to '" + actual.AbsoluteUri + "'");
                    log.Flush();
                }

                // Compare only the media type: the header commonly carries parameters,
                // e.g. "text/html; charset=utf-8", which must still count as HTML.
                string contentType = resp.ContentType ?? "";
                int paramStart = contentType.IndexOf(';');
                string mediaType = (paramStart >= 0 ? contentType.Substring(0, paramStart) : contentType).Trim();
                if (!string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase))
                {
                    log.WriteLine(indent + "Skipping ContentType=" + resp.ContentType);
                    log.Flush();
                    resp.Close();
                }
                else
                {
                    stm = new StreamReader(resp.GetResponseStream());
                }
            }
            catch (Exception e)
            {
                log.WriteLine(indent + "### Error opening URL: " + e.Message);
                log.Flush();
                // Release the connection if the failure happened after the response arrived.
                if (resp != null)
                {
                    resp.Close();
                }
                stm = null;
            }

            if (stm != null)
            {
                SgmlReader reader = new SgmlReader();
                reader.Dtd = dtd;
                reader.SetBaseUri(local.AbsoluteUri);
                reader.InputStream = stm;
                reader.WebProxy = proxy;

                XmlDocument d2 = new XmlDocument();
                d2.XmlResolver = null; // don't do any downloads!
                try
                {
                    d2.Load(reader);
                    // Close the page before recursing so its connection is not held
                    // open while the nested pages are crawled.
                    reader.Close();
                    stm.Close();
                    if (!Crawl(dtd, d2, log))
                    {
                        return false;
                    }
                }
                catch (Exception e)
                {
                    log.WriteLine(indent + "### Error parsing document '" + local.AbsoluteUri + "', " + e.Message);
                    log.Flush();
                    reader.Close();
                    stm.Close();
                }
            }
        }

        return true;
    }
    finally
    {
        // Always undo the increment, including when an exception escapes to the caller's
        // catch block; otherwise the depth limit and log indentation drift for the
        // remainder of the crawl.
        depth--;
    }
}
/**************************************************************************
 * Run a test suite. Test suites are organized into expected input/output
 * blocks separated by back quotes (`). It runs the input and compares it
 * with the expected output and reports any failures.
 *
 * A back quote in the first column of a line switches state:
 *   - the first one starts an input block (the rest of that line is
 *     collected as the test's arguments),
 *   - the next one switches from input to expected output,
 *   - the one after that runs the collected test and starts the next
 *     input block (again collecting arguments).
 * The remainder of a back quote line is never added to input/output.
 * A trailing test that is not terminated by a back quote is run at the
 * end of the file.
 *
 * reader: the SgmlReader handed to each individual test run.
 * file:   path of the test suite file, resolved against the current
 *         directory to form the reader's base URI.
 **************************************************************************/
void RunTest(SgmlReader reader, string file)
{
    Console.WriteLine(file);

    StringBuilder input = new StringBuilder();
    StringBuilder expectedOutput = new StringBuilder();
    StringBuilder current = null;
    StringBuilder args = new StringBuilder();

    int start = 1;
    int line = 1;
    int pos = 1;
    bool skipToEOL = false;
    bool readArgs = false;

    // The using block guarantees the file is closed, also when a test run throws.
    using (StreamReader sr = new StreamReader(file))
    {
        Uri baseUri = new Uri(new Uri(Directory.GetCurrentDirectory()+"\\"), file);
        reader.SetBaseUri(baseUri.AbsoluteUri);

        int i;
        // Stop before processing the end-of-file marker: casting -1 to char would
        // otherwise append U+FFFF to the buffer being collected.
        while ((i = sr.Read()) != -1)
        {
            char ch = (char)i;
            if (pos == 1 && ch == '`')
            {
                ++pos;
                if (current == null)
                {
                    current = input;
                    current.Length = 0;
                    readArgs = true;
                }
                else if (current == input)
                {
                    current = expectedOutput;
                }
                else
                {
                    RunTest(reader, start, args.ToString(), input.ToString(), expectedOutput.ToString());
                    start = line;
                    input.Length = 0;
                    args.Length = 0;
                    expectedOutput.Length = 0;
                    current = input;
                    readArgs = true;
                }
                skipToEOL = true;
            }
            else
            {
                ++pos;
                if (current != null)
                {
                    if (readArgs)
                    {
                        args.Append(ch);
                    }
                    else if (!skipToEOL)
                    {
                        current.Append(ch);
                    }
                }

                if (ch == '\r')
                {
                    line++;
                    pos = 1;
                    if (sr.Peek() == '\n')
                    {
                        i = sr.Read();
                        // current is still null for lines that precede the first back quote.
                        if (!skipToEOL && current != null)
                        {
                            current.Append((char)i);
                        }
                        // NOTE(review): this appends the '\r' a second time rather than
                        // the '\n' just consumed - kept as-is; confirm whether intended.
                        if (readArgs)
                        {
                            args.Append(ch);
                        }
                    }
                    skipToEOL = false;
                    readArgs = false;
                }
                else if (ch == '\n')
                {
                    skipToEOL = false;
                    readArgs = false;
                    line++;
                    pos = 1;
                }
            }
        }
    }

    // current stays null when the file contains no back quote at all.
    if (current != null && current.Length>0 && expectedOutput.Length>0)
    {
        RunTest(reader, start, args.ToString(), input.ToString(), expectedOutput.ToString());
    }
}
/// <summary>
/// Converts the entry body into XHTML compliant text and stores the result
/// back into <c>entry.Body</c>. The reader's base URI is the current blog's root URL.
/// </summary>
/// <param name="entry">Entry whose body is converted in place.</param>
/// <returns>
/// Always <c>true</c>; this method contains no code path that returns <c>false</c>.
/// </returns>
public static bool ConvertHtmlToXHtml(Entry entry)
{
    SgmlReader sgmlReader = new SgmlReader();

    string blogRootUrl = Config.CurrentBlog.RootUrl.ToString();
    sgmlReader.SetBaseUri(blogRootUrl);

    string xhtmlBody = ConvertHtmlToXHtml(sgmlReader, entry.Body, null);
    entry.Body = xhtmlBody;

    return true;
}
/// <summary>
/// Builds an SGML reader over the given HTML text.
/// </summary>
/// <param name="html">The HTML markup the reader will read.</param>
/// <param name="baseUri">
/// Base URI for the reader; not applied when <c>null</c> or when its string form is empty.
/// </param>
/// <returns>A reader with <c>DocType</c> "HTML" whose input is <paramref name="html"/>.</returns>
private static XmlReader GetDocReader(
    string html,
    Uri baseUri )
{
    SgmlReader sgmlReader = new SgmlReader();

    string baseUriText = baseUri != null ? baseUri.ToString() : null;
    if ( !string.IsNullOrEmpty( baseUriText ) )
    {
        sgmlReader.SetBaseUri( baseUriText );
    }

    sgmlReader.DocType = @"HTML";
    sgmlReader.InputStream = new StringReader( html );

    return sgmlReader;
}
/// <summary>
/// Creates an SGML-based <see cref="XmlReader"/> over the given HTML text.
/// </summary>
/// <param name="html">The HTML markup the reader will read.</param>
/// <param name="baseUrl">
/// Base URL for the reader; no base URI is set when this is <c>null</c> or empty.
/// </param>
/// <returns>A reader with <c>DocType</c> "HTML" whose input is <paramref name="html"/>.</returns>
private static XmlReader getDocReader(
    string html,
    string baseUrl )
{
    var r = new Sgml.SgmlReader();

    // Treat null like "": previously a null baseUrl raised NullReferenceException
    // on baseUrl.Length.
    if ( !string.IsNullOrEmpty( baseUrl ) )
    {
        r.SetBaseUri( baseUrl );
    }

    r.DocType = @"HTML";
    r.InputStream = new StringReader( html );

    return r;
}
// Disabled code (kept for reference): ExtractStyleUrls - detects URLs in styles.
// Parameters were: baseUri (the base URI), attributeName (name of the attribute),
// attributeValue (the attribute value).
//
//private List<UriResourceInformation> ExtractStyleUrls(
//    Uri baseUri,
//    string attributeName,
//    string attributeValue)
//{
//    List<UriResourceInformation> result =
//        new List<UriResourceInformation>();
//    if (string.Compare(attributeName, @"style", true) == 0)
//    {
//        if (attributeValue != null &&
//            attributeValue.Trim().Length > 0)
//        {
//            MatchCollection matchs = Regex.Matches(
//                attributeValue,
//                @"url\s*\(\s*([^\)\s]+)\s*\)",
//                RegexOptions.Singleline | RegexOptions.IgnoreCase);
//            if (matchs.Count > 0)
//            {
//                foreach (Match match in matchs)
//                {
//                    if (match != null && match.Success)
//                    {
//                        string url = match.Groups[1].Value;
//                        UriResourceInformation ui =
//                            new UriResourceInformation(
//                                _settings.Options,
//                                url,
//                                new Uri(url, UriKind.RelativeOrAbsolute),
//                                baseUri,
//                                UriType.Resource,
//                                _uriInfo.AbsoluteUri,
//                                );
//                        bool isOnSameSite =
//                            ui.IsOnSameSite(baseUri);
//                        if ((isOnSameSite ||
//                            !_settings.Options.StayOnSite) &&
//                            ui.IsProcessableUri)
//                        {
//                            result.Add(ui);
//                        }
//                    }
//                }
//            }
//        }
//    }
//    return result;
//}

/// <summary>
/// Builds an SGML reader over the given HTML text and advances it with one
/// <c>Read()</c> call before handing it back.
/// </summary>
/// <param name="html">The HTML markup the reader will read.</param>
/// <param name="baseUri">
/// Base URI for the reader; not applied when <c>null</c> or when its string form is empty.
/// </param>
/// <returns>
/// A reader with <c>DocType</c> "HTML", <c>WhitespaceHandling.All</c> and
/// <c>CaseFolding.None</c>, on which <c>Read()</c> has already been called once.
/// </returns>
private static XmlReader GetDocReader(
    string html,
    Uri baseUri)
{
    SgmlReader sgmlReader = new SgmlReader();

    string baseUriText = baseUri != null ? baseUri.ToString() : null;
    if (!string.IsNullOrEmpty(baseUriText))
    {
        sgmlReader.SetBaseUri(baseUriText);
    }

    sgmlReader.DocType = @"HTML";
    sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
    sgmlReader.CaseFolding = CaseFolding.None;
    sgmlReader.InputStream = new StringReader(html);

    sgmlReader.Read();
    return sgmlReader;
}