/// <summary>
        /// Creates an SGML-based XML reader over a piece of HTML text.
        /// </summary>
        /// <param name="html">The HTML markup the reader will consume.</param>
        /// <param name="baseUri">
        /// Optional base URI. It is handed to the reader only when it is
        /// non-null and its string form is non-empty.
        /// </param>
        /// <returns>
        /// An <see cref="SgmlReader"/> whose doc type is set to HTML and whose
        /// input is a <see cref="StringReader"/> over <paramref name="html"/>.
        /// </returns>
        private static XmlReader GetDocReader(
            string html,
            Uri baseUri)
        {
            SgmlReader sgml = new SgmlReader();

            // Resolve the textual form once; a missing URI simply yields null.
            string baseText = baseUri != null ? baseUri.ToString() : null;
            if (!string.IsNullOrEmpty(baseText))
            {
                sgml.SetBaseUri(baseText);
            }

            sgml.DocType = @"HTML";
            sgml.InputStream = new StringReader(html);

            return sgml;
        }
Beispiel #2
0
        /// <summary>
        /// Converts SGML to XML
        /// </summary>
        /// <remarks>
        /// The input is first passed through <c>ParseHeader</c>, then read with an
        /// <see cref="SgmlReader"/> configured for the OFX doc type. The DTD is
        /// referenced by the relative system literal <c>Embeded\ofx160.dtd</c>, and
        /// the reader's base URI is set to the directory of the executing assembly.
        /// Every node is copied into an indented XML text writer.
        /// </remarks>
        /// <param name="file">OFX File (SGML Format)</param>
        /// <returns>OFX File in XML format</returns>
        private string SGMLToXML(string file)
        {
            SgmlReader reader = new SgmlReader
            {
                CaseFolding        = CaseFolding.None,
                DocType            = "OFX",
                InputStream        = new StringReader(ParseHeader(file)),
                WhitespaceHandling = WhitespaceHandling.None,
                SystemLiteral      = "Embeded\\ofx160.dtd"
            };

            try
            {
                // Point the reader at the directory of the executing assembly.
                string     codeBase = Assembly.GetExecutingAssembly().CodeBase;
                UriBuilder uri      = new UriBuilder(codeBase);
                string     path     = Uri.UnescapeDataString(uri.Path);

                reader.SetBaseUri(Path.GetDirectoryName(path));

                var stringWriter = new StringWriter();

                // The using block closes (and thereby flushes) the writer even
                // when copying a node throws.
                using (var xmlTextWriter = new XmlTextWriter(stringWriter))
                {
                    xmlTextWriter.Formatting = Formatting.Indented;

                    // Position on the first node, then copy node by node;
                    // WriteNode advances the reader past what it wrote.
                    reader.Read();
                    while (!reader.EOF)
                    {
                        xmlTextWriter.WriteNode(reader, true);
                    }
                }

                // reproduce the parsed document
                return stringWriter.ToString();
            }
            finally
            {
                reader.Close();
            }
        }
Beispiel #3
0
        /// <summary>
        /// Follows the anchor links found in <paramref name="doc"/>, downloads each
        /// not-yet-visited HTML page, parses it with an <see cref="SgmlReader"/> and
        /// crawls the result recursively.
        /// </summary>
        /// <remarks>
        /// Links are only followed while the current nesting depth is below 5.
        /// Links ending in .jpg, .gif or .mpg are skipped, as are links to another
        /// host when the <c>domain</c> flag is set. Download and parse failures are
        /// written to <paramref name="log"/> and do not stop the crawl.
        /// </remarks>
        /// <param name="dtd">DTD assigned to every reader created for a downloaded page.</param>
        /// <param name="doc">Document whose links are followed; its BaseURI is parsed as an absolute URI.</param>
        /// <param name="log">Receives progress and error messages.</param>
        /// <returns><c>false</c> if a nested crawl returned <c>false</c>; otherwise <c>true</c>.</returns>
        bool Crawl(SgmlDtd dtd, XmlDocument doc, TextWriter log) {
            depth++;
            try {
                // One space of indentation per nesting level.
                string indent = new string(' ', Math.Max(depth, 0));

                count++;
                Uri baseUri = new Uri(doc.BaseURI);
                XmlElement baseElmt = (XmlElement)doc.SelectSingleNode("/html/head/base");
                if (baseElmt != null) {
                    string baseHref = baseElmt.GetAttribute("href");
                    if (baseHref != "") {
                        try {
                            baseUri = new Uri(baseHref);
                        }
                        catch (Exception ) {
                            Console.WriteLine("### Error parsing BASE href '"+baseHref+"'");
                        }
                    }
                }
                foreach (XmlElement a in doc.SelectNodes("//a")) {
                    string href = a.GetAttribute("href");
                    if (string.IsNullOrEmpty(href) || depth >= 5)
                        continue;

                    // A malformed href must not abort the crawl of the whole page.
                    Uri local;
                    if (!Uri.TryCreate(baseUri, href, out local)) {
                        log.WriteLine(indent+"### Error parsing href '"+href+"'");
                        log.Flush();
                        continue;
                    }
                    if (domain && baseUri.Host != local.Host)
                        continue;
                    // Invariant lower-casing keeps the extension filter culture-independent.
                    string ext = Path.GetExtension(local.AbsolutePath).ToLowerInvariant();
                    if (ext == ".jpg" || ext == ".gif" || ext==".mpg")
                        continue;
                    string url = local.AbsoluteUri;
                    if (visited.ContainsKey(url))
                        continue;

                    visited.Add(url, url);
                    log.WriteLine(indent+"Loading '"+url+"'");
                    log.Flush();
                    StreamReader stm = null;
                    WebResponse resp = null;
                    try {
                        HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
                        wr.Timeout = 10000; 
                        if (proxy != null) wr.Proxy = new WebProxy(proxy);
                        wr.PreAuthenticate = false; 
                        // Pass the credentials of the process. 
                        wr.Credentials = CredentialCache.DefaultCredentials; 

                        resp = wr.GetResponse();
                        Uri actual = resp.ResponseUri;
                        if (actual.AbsoluteUri != url) {
                            local = new Uri(actual.AbsoluteUri);
                            log.WriteLine(indent+"Redirected to '"+actual.AbsoluteUri+"'");
                            log.Flush();
                        }
                        // The header may carry parameters ("text/html; charset=utf-8"),
                        // so match on the media type prefix rather than the whole value.
                        string contentType = resp.ContentType ?? "";
                        if (!contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase)) {
                            log.WriteLine(indent+"Skipping ContentType="+resp.ContentType);
                            log.Flush();
                        } 
                        else {
                            stm = new StreamReader(resp.GetResponseStream());
                        }
                    } 
                    catch (Exception e) {
                        log.WriteLine(indent+"### Error opening URL: " + e.Message);
                        log.Flush();
                    }
                    finally {
                        // No stream to parse means nobody else will release the response.
                        if (stm == null && resp != null) resp.Close();
                    }
                    if (stm == null)
                        continue;

                    SgmlReader reader = new SgmlReader();
                    reader.Dtd = dtd;
                    reader.SetBaseUri(local.AbsoluteUri);
                    reader.InputStream = stm;
                    reader.WebProxy = proxy;

                    XmlDocument d2 = new XmlDocument();
                    d2.XmlResolver = null; // don't do any downloads!
                    try {
                        try {
                            d2.Load(reader);
                        }
                        finally {
                            // Release the reader and the stream before recursing,
                            // whether or not the load succeeded.
                            reader.Close();
                            stm.Close();
                        }
                        if (!Crawl(dtd, d2, log))
                            return false;
                    } 
                    catch (Exception e) {
                        log.WriteLine(indent+"### Error parsing document '"+local.AbsoluteUri+"', "+e.Message);
                        log.Flush();
                    }
                }
                return true;
            }
            finally {
                // Restore the nesting level on every exit path, including early
                // returns and exceptions.
                depth--;
            }
        }
Beispiel #4
0
        /**************************************************************************
         * Run a test suite.  Tests suites are organized into expected input/output
         * blocks separated by back quotes (`).  It runs the input and compares it
         * with the expected output and reports any failures.
         *
         * A back quote in the first column of a line is a separator.  The rest of
         * the first separator line of a block is collected as the block's
         * arguments; the rest of the second separator line (which starts the
         * expected output) is ignored.  Text before the first separator is
         * ignored.  Each completed block is handed to the
         * RunTest(reader, line, args, input, expectedOutput) overload.
         *
         * reader - reader that is given the suite's base URI and passed on to
         *          each individual test.
         * file   - path of the suite file, resolved against the current directory.
         **************************************************************************/
        void RunTest(SgmlReader reader, string file) {
            Console.WriteLine(file);
            StringBuilder input = new StringBuilder();
            StringBuilder expectedOutput = new StringBuilder();
            StringBuilder current = null;   // null until the first separator is seen
            StringBuilder args = new StringBuilder();

            int start = 1;
            int line = 1;
            int pos = 1;
            bool skipToEOL = false;
            bool readArgs = false;

            // The using block closes the suite file even when a test throws.
            using (StreamReader sr = new StreamReader(file)) {
                Uri baseUri = new Uri(new Uri(Directory.GetCurrentDirectory() + Path.DirectorySeparatorChar), file);
                reader.SetBaseUri(baseUri.AbsoluteUri);

                int i;
                // Stop at EOF before the -1 sentinel can be stored as a character.
                while ((i = sr.Read()) != -1) {
                    char ch = (char)i;
                    if (pos == 1 && ch == '`') {
                        if (current == null) {
                            current = input;
                            current.Length = 0;
                            readArgs = true;
                        } else if (current == input) {
                            current = expectedOutput;
                        }
                        else {
                            RunTest(reader, start, args.ToString(), input.ToString(), expectedOutput.ToString());
                            start = line;
                            input.Length = 0;
                            args.Length = 0;
                            expectedOutput.Length = 0;
                            current = input;
                            readArgs = true;
                        }
                        skipToEOL = true;
                        pos++;
                    } else {
                        if (current != null) {
                            if (readArgs){
                                args.Append(ch);
                            } else if (!skipToEOL){
                                current.Append(ch);
                            }
                        }
                        if (ch == '\r') {
                            line++; pos = 1;
                            if (sr.Peek() == '\n') {
                                // Consume the LF of a CRLF pair and route it to the
                                // same buffer that received the CR.
                                char lf = (char)sr.Read();
                                if (current != null) {
                                    if (readArgs) {
                                        args.Append(lf);
                                    } else if (!skipToEOL) {
                                        current.Append(lf);
                                    }
                                }
                            }
                            skipToEOL = false;
                            readArgs = false;
                        } else if (ch == '\n'){
                            skipToEOL = false;
                            readArgs = false;
                            line++; pos = 1;
                        } else {
                            // Track the column so that only a back quote at the
                            // start of a line is treated as a separator.
                            pos++;
                        }
                    }
                }
            }

            // Run a trailing block that was not terminated by a separator line.
            if (current != null && current.Length>0 && expectedOutput.Length>0) {
                RunTest(reader, start, args.ToString(), input.ToString(), expectedOutput.ToString());
            }
        }