/// <summary>
/// Builds an XML reader that parses the supplied HTML text through an <see cref="SgmlReader"/>.
/// </summary>
/// <param name="html">The HTML text to read.</param>
/// <param name="baseUri">
/// Optional base URI; it is applied to the reader only when it is non-null and
/// its string form is non-empty.
/// </param>
/// <returns>The configured <see cref="SgmlReader"/>, typed as <see cref="XmlReader"/>.</returns>
private static XmlReader GetDocReader(string html, Uri baseUri)
{
    SgmlReader sgml = new SgmlReader();

    // Only hand a base URI to the reader when there is a usable one.
    string baseText = (baseUri == null) ? null : baseUri.ToString();
    if (!string.IsNullOrEmpty(baseText))
    {
        sgml.SetBaseUri(baseText);
    }

    sgml.DocType = "HTML";
    sgml.InputStream = new StringReader(html);
    return sgml;
}
/// <summary>
/// Converts an OFX document in SGML form into indented XML text.
/// </summary>
/// <remarks>
/// The input is first passed through <c>ParseHeader</c>, then read with an
/// <see cref="SgmlReader"/> configured for the "OFX" doc type. The reader's base URI is
/// set to the directory of the executing assembly and its system literal to
/// "Embeded\ofx160.dtd".
/// </remarks>
/// <param name="file">OFX File (SGML Format)</param>
/// <returns>OFX File in XML format</returns>
private string SGMLToXML(string file)
{
    SgmlReader reader = new SgmlReader
    {
        CaseFolding = CaseFolding.None,
        DocType = "OFX",
        InputStream = new StringReader(ParseHeader(file)),
        WhitespaceHandling = WhitespaceHandling.None,
        SystemLiteral = "Embeded\\ofx160.dtd"
    };

    try
    {
        // Use the executing assembly's directory as the reader's base URI.
        // NOTE(review): Assembly.CodeBase is flagged obsolete on newer runtimes — confirm target framework.
        string codeBase = Assembly.GetExecutingAssembly().CodeBase;
        UriBuilder uri = new UriBuilder(codeBase);
        string path = Uri.UnescapeDataString(uri.Path);
        reader.SetBaseUri(Path.GetDirectoryName(path));

        StringWriter stringWriter = new StringWriter();
        XmlTextWriter xmlTextWriter = new XmlTextWriter(stringWriter);
        try
        {
            xmlTextWriter.Formatting = Formatting.Indented;

            // Position on the first node, then copy every node until the reader is exhausted.
            reader.Read();
            while (!reader.EOF)
            {
                xmlTextWriter.WriteNode(reader, true);
            }
        }
        finally
        {
            // Closing flushes pending output into stringWriter, also when the copy fails.
            xmlTextWriter.Close();
        }

        return stringWriter.ToString();
    }
    finally
    {
        reader.Close();
    }
}
/// <summary>
/// Follows every <c>&lt;a href&gt;</c> link of <paramref name="doc"/>, downloads each
/// not-yet-visited HTML page, parses it with an <see cref="SgmlReader"/> and recurses into it.
/// </summary>
/// <remarks>
/// Uses the instance members <c>depth</c>, <c>count</c>, <c>visited</c>, <c>domain</c> and
/// <c>proxy</c>. Links are only followed while <c>depth</c> is below 5. When <c>domain</c> is
/// set, links to a different host than the base URI are skipped. Links whose path ends in
/// .jpg, .gif or .mpg are skipped. Download and parse failures are written to
/// <paramref name="log"/> and do not stop the crawl.
/// </remarks>
/// <param name="dtd">DTD handed to every <see cref="SgmlReader"/> created for a downloaded page.</param>
/// <param name="doc">The already loaded document whose links are followed.</param>
/// <param name="log">Receives progress and error messages.</param>
/// <returns><c>true</c>, unless a nested call returned <c>false</c>.</returns>
bool Crawl(SgmlDtd dtd, XmlDocument doc, TextWriter log)
{
    depth++;
    try
    {
        string indent = new string(' ', depth);
        count++;

        // A <base href> element in the document overrides the document's own URI.
        Uri baseUri = new Uri(doc.BaseURI);
        XmlElement baseElmt = (XmlElement)doc.SelectSingleNode("/html/head/base");
        if (baseElmt != null)
        {
            string baseHref = baseElmt.GetAttribute("href");
            if (baseHref != "")
            {
                try
                {
                    baseUri = new Uri(baseHref);
                }
                catch (Exception)
                {
                    Console.WriteLine("### Error parsing BASE href '" + baseHref + "'");
                }
            }
        }

        foreach (XmlElement a in doc.SelectNodes("//a"))
        {
            string href = a.GetAttribute("href");
            if (string.IsNullOrEmpty(href) || depth >= 5)
            {
                continue;
            }

            // A single malformed link must not abort the crawl of the whole page.
            Uri local;
            try
            {
                local = new Uri(baseUri, href);
            }
            catch (UriFormatException e)
            {
                log.WriteLine(indent + "### Error parsing href '" + href + "': " + e.Message);
                log.Flush();
                continue;
            }

            if (domain && baseUri.Host != local.Host)
            {
                continue;
            }

            // Ordinal, case-insensitive comparison: independent of the current culture.
            string ext = Path.GetExtension(local.AbsolutePath);
            if (string.Equals(ext, ".jpg", StringComparison.OrdinalIgnoreCase) ||
                string.Equals(ext, ".gif", StringComparison.OrdinalIgnoreCase) ||
                string.Equals(ext, ".mpg", StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            string url = local.AbsoluteUri;
            if (visited.ContainsKey(url))
            {
                continue;
            }
            visited.Add(url, url);
            log.WriteLine(indent + "Loading '" + url + "'");
            log.Flush();

            WebResponse resp = null;
            StreamReader stm = null;
            try
            {
                HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
                wr.Timeout = 10000;
                if (proxy != null)
                {
                    wr.Proxy = new WebProxy(proxy);
                }
                wr.PreAuthenticate = false;
                // Pass the credentials of the process.
                wr.Credentials = CredentialCache.DefaultCredentials;
                resp = wr.GetResponse();

                Uri actual = resp.ResponseUri;
                if (actual.AbsoluteUri != url)
                {
                    local = new Uri(actual.AbsoluteUri);
                    log.WriteLine(indent + "Redirected to '" + actual.AbsoluteUri + "'");
                    log.Flush();
                }

                // The header may carry parameters ("text/html; charset=utf-8"),
                // so compare only the media type in front of the first ';'.
                string contentType = resp.ContentType;
                string mediaType = (contentType == null) ? "" : contentType.Split(';')[0].Trim();
                if (string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase))
                {
                    stm = new StreamReader(resp.GetResponseStream());
                }
                else
                {
                    log.WriteLine(indent + "Skipping ContentType=" + contentType);
                    log.Flush();
                }
            }
            catch (Exception e)
            {
                log.WriteLine(indent + "### Error opening URL: " + e.Message);
                log.Flush();
            }
            finally
            {
                // No stream handed on: release the response (skipped type or failure).
                if (stm == null && resp != null)
                {
                    resp.Close();
                }
            }

            if (stm == null)
            {
                continue;
            }

            SgmlReader reader = new SgmlReader();
            reader.Dtd = dtd;
            reader.SetBaseUri(local.AbsoluteUri);
            reader.InputStream = stm;
            reader.WebProxy = proxy;
            XmlDocument d2 = new XmlDocument();
            d2.XmlResolver = null; // don't do any downloads!
            try
            {
                try
                {
                    d2.Load(reader);
                }
                finally
                {
                    // Release reader, stream and response whether or not parsing succeeded.
                    reader.Close();
                    stm.Close();
                    resp.Close();
                }

                if (!Crawl(dtd, d2, log))
                {
                    return false;
                }
            }
            catch (Exception e)
            {
                log.WriteLine(indent + "### Error parsing document '" + local.AbsoluteUri + "', " + e.Message);
                log.Flush();
            }
        }

        return true;
    }
    finally
    {
        // Restore the nesting level on every exit path, including exceptions
        // that the caller's parse-error handler swallows.
        depth--;
    }
}
/**************************************************************************
 * Run a test suite.  Test suites are organized into expected input/output
 * blocks separated by back quotes (`) at the start of a line.  The rest of
 * the line that opens an input block is collected as the test arguments;
 * the rest of the line that opens an expected-output block is ignored.
 * Each completed block pair is handed to the RunTest overload that takes
 * the start line, arguments, input and expected output.
 *
 * reader - reader whose base URI is set to the absolute URI of the file.
 * file   - test suite file, resolved against the current directory.
 **************************************************************************/
void RunTest(SgmlReader reader, string file)
{
    Console.WriteLine(file);

    StringBuilder input = new StringBuilder();
    StringBuilder expectedOutput = new StringBuilder();
    StringBuilder current = null;   // block being filled; null before the first back quote
    StringBuilder args = new StringBuilder();
    int start = 1;

    using (StreamReader sr = new StreamReader(file))
    {
        Uri baseUri = new Uri(new Uri(Directory.GetCurrentDirectory() + Path.DirectorySeparatorChar), file);
        reader.SetBaseUri(baseUri.AbsoluteUri);

        int line = 1;
        int pos = 1;                // 1-based column; a separator only counts at column 1
        bool skipToEOL = false;
        bool readArgs = false;
        int i;

        // Stop at end of file (-1) without treating the sentinel as a character.
        while ((i = sr.Read()) != -1)
        {
            char ch = (char)i;
            if (pos == 1 && ch == '`')
            {
                if (current == null)
                {
                    // First separator: start of the first input block.
                    current = input;
                    current.Length = 0;
                    readArgs = true;
                }
                else if (current == input)
                {
                    current = expectedOutput;
                }
                else
                {
                    // End of an expected-output block: run the test, then start the next input block.
                    RunTest(reader, start, args.ToString(), input.ToString(), expectedOutput.ToString());
                    start = line;
                    input.Length = 0;
                    args.Length = 0;
                    expectedOutput.Length = 0;
                    current = input;
                    readArgs = true;
                }
                skipToEOL = true;
                pos++;
            }
            else
            {
                if (current != null)
                {
                    if (readArgs)
                    {
                        args.Append(ch);
                    }
                    else if (!skipToEOL)
                    {
                        current.Append(ch);
                    }
                }

                if (ch == '\r')
                {
                    line++;
                    pos = 1;
                    if (sr.Peek() == '\n')
                    {
                        // Consume the '\n' of a CRLF pair and route it like the '\r' before it.
                        char lf = (char)sr.Read();
                        if (current != null)
                        {
                            if (readArgs)
                            {
                                args.Append(lf);
                            }
                            else if (!skipToEOL)
                            {
                                current.Append(lf);
                            }
                        }
                    }
                    skipToEOL = false;
                    readArgs = false;
                }
                else if (ch == '\n')
                {
                    skipToEOL = false;
                    readArgs = false;
                    line++;
                    pos = 1;
                }
                else
                {
                    pos++;
                }
            }
        }
    }

    // Run a trailing test that is not terminated by a final back quote.
    if (current != null && current.Length > 0 && expectedOutput.Length > 0)
    {
        RunTest(reader, start, args.ToString(), input.ToString(), expectedOutput.ToString());
    }
}