Example #1
1
        /// <summary>
        /// Parses HTML content with SgmlReader and loads the resulting XML text into an
        /// <see cref="XmlDocument"/>. The content is wrapped in a <c>&lt;root&gt;</c> element
        /// before it is parsed.
        /// </summary>
        /// <param name="sContent">HTML markup to parse.</param>
        /// <returns>The loaded document (whitespace preserved, no XML resolver).</returns>
        public static XmlDocument ParseHtml(string sContent)
        {
            SgmlReader reader = new SgmlReader();
            StringWriter sw = new StringWriter();
            XmlTextWriter w = new XmlTextWriter(sw);
            try
            {
                reader.WhitespaceHandling = WhitespaceHandling.All;
                reader.CaseFolding = Sgml.CaseFolding.ToLower;
                reader.InputStream = new StringReader("<root>" + sContent + "</root>");

                w.Formatting = Formatting.Indented;
                w.WriteStartDocument();

                // WriteNode advances the reader itself, so no Read() is needed inside the loop.
                reader.Read();
                while (!reader.EOF)
                {
                    w.WriteNode(reader, true);
                }
                w.Flush();

                // create document
                XmlDocument doc = new XmlDocument();
                doc.PreserveWhitespace = true;
                doc.XmlResolver = null;
                doc.LoadXml(sw.ToString());
                return doc;
            }
            finally
            {
                // Release the writer and reader even when parsing or loading fails.
                w.Close();
                reader.Close();
            }
        }
Example #2
1
 /// <summary>
 /// Converts HTML to XML text using SgmlReader and optionally reduces the result to the
 /// values of the nodes selected by an XPath expression.
 /// </summary>
 /// <param name="html">HTML markup to convert.</param>
 /// <param name="xpathNavPath">
 /// XPath expression evaluated against the converted markup, or <c>null</c> to return the
 /// whole converted markup.
 /// </param>
 /// <returns>
 /// The converted markup, or the selected node values each followed by a line feed. In both
 /// cases carriage returns are replaced by line feeds and each pair of consecutive line feeds
 /// is then replaced by one. If an exception is thrown, its message is returned instead.
 /// </returns>
 public static string GetWellFormedHTML(string html, string xpathNavPath)
 {
     StringWriter sw = null;
     SgmlReader reader = null;
     XmlTextWriter writer = null;
     try
     {
         reader = new SgmlReader();
         reader.DocType = "HTML";
         reader.InputStream = new StringReader(html);
         sw = new StringWriter();
         writer = new XmlTextWriter(sw);
         writer.Formatting = Formatting.Indented;

         // WriteNode leaves the reader on the node after the one it copied, so Read() is
         // only used to step over skipped nodes. Calling Read() after every WriteNode
         // would silently drop the following sibling.
         reader.Read();
         while (!reader.EOF)
         {
             if (reader.NodeType == XmlNodeType.Whitespace)
             {
                 reader.Read();
             }
             else
             {
                 writer.WriteNode(reader, true);
             }
         }
         writer.Flush();

         string result;
         if (xpathNavPath == null)
         {
             result = sw.ToString();
         }
         else
         {
             // Filter out nodes from HTML
             StringBuilder sb = new StringBuilder();
             XPathDocument doc = new XPathDocument(new StringReader(sw.ToString()));
             XPathNavigator nav = doc.CreateNavigator();
             XPathNodeIterator nodes = nav.Select(xpathNavPath);
             while (nodes.MoveNext())
             {
                 sb.Append(nodes.Current.Value + "\n");
             }
             result = sb.ToString();
         }

         result = result.Replace("\r", "\n");
         result = result.Replace("\n\n", "\n");
         return result;
     }
     catch (Exception exp)
     {
         // Callers receive the failure as text rather than as an exception.
         return exp.Message;
     }
     finally
     {
         // Any of these may still be null if construction failed part-way.
         if (writer != null) writer.Close();
         if (reader != null) reader.Close();
         if (sw != null) sw.Close();
     }
 }
 /// <summary>
 /// Fetches the text at <paramref name="url"/> via FetchWebText, parses it with SgmlReader
 /// and loads the result into an <see cref="XmlDocument"/>.
 /// </summary>
 /// <param name="url">Address passed to FetchWebText.</param>
 /// <returns>The loaded document.</returns>
 XmlDocument FetchXmlDocument(Uri url)
 {
     var sr = FetchWebText (url);
     SgmlReader xr = null;
     try
     {
         xr = new SgmlReader () { InputStream = sr };
         var doc = new XmlDocument ();
         doc.Load (xr);
         return doc;
     }
     finally
     {
         // Close both readers even when loading throws.
         sr.Close ();
         if (xr != null)
             xr.Close ();
     }
 }
Example #4
0
        /// <summary>
        /// Interprets the command-line arguments, configures an SgmlReader from them and
        /// processes the input: either a single input URI (or none), or every local file
        /// matching a wildcard pattern.
        /// </summary>
        /// <param name="args">
        /// Options prefixed with '-' or '/', followed by an optional input URI and an optional
        /// output file. An unknown option, or a value-taking option without its value, prints
        /// the usage text and returns.
        /// </param>
        public void Run(string[] args)
        {
            SgmlReader reader = new SgmlReader();
            string inputUri = null;

            for (int i = 0; i < args.Length; i++) {
                string arg = args[i];
                if (arg.Length == 0) {
                    // Nothing to interpret; indexing arg[0] below would throw.
                    continue;
                }
                if (arg[0] == '-' || arg[0] == '/') {
                    string option = arg.Substring(1);
                    bool takesValue = option == "e" || option == "dtd" || option == "proxy" || option == "encoding";
                    bool showUsage = false;

                    if (takesValue && i + 1 >= args.Length) {
                        // The option is the last argument, so args[++i] would run off the array.
                        Console.WriteLine("Missing value for option '" + arg + "'");
                        showUsage = true;
                    }
                    else {
                        switch (option) {
                            case "e":
                                string errorlog = args[++i];
                                if (errorlog.ToLower() == "$stderr") {
                                    reader.ErrorLog = Console.Error;
                                }
                                else {
                                    reader.ErrorLogFile = errorlog;
                                }
                                break;
                            case "html":
                                reader.DocType = "HTML";
                                break;
                            case "dtd":
                                reader.SystemLiteral = args[++i];
                                break;
                            case "proxy":
                                proxy = args[++i];
                                reader.WebProxy = proxy;
                                break;
                            case "encoding":
                                encoding = Encoding.GetEncoding(args[++i]);
                                break;
                            case "f":
                                formatted = true;
                                reader.WhitespaceHandling = WhitespaceHandling.None;
                                break;
                            case "noxml":
                                noxmldecl = true;
                                break;
                            case "doctype":
                                reader.StripDocType = false;
                                break;
                            case "lower":
                                reader.CaseFolding = CaseFolding.ToLower;
                                break;
                            case "upper":
                                reader.CaseFolding = CaseFolding.ToUpper;
                                break;
                            default:
                                showUsage = true;
                                break;
                        }
                    }

                    if (showUsage) {
                        Console.WriteLine("Usage: SgmlReader <options> [InputUri] [OutputFile]");
                        Console.WriteLine("-e log         Optional log file name, name of '$STDERR' will write errors to stderr");
                        Console.WriteLine("-f             Whether to pretty print the output.");
                        Console.WriteLine("-html          Specify the built in HTML dtd");
                        Console.WriteLine("-dtd url       Specify other SGML dtd to use");
                        Console.WriteLine("-base          Add base tag to output HTML");
                        Console.WriteLine("-noxml         Do not add XML declaration to the output");
                        Console.WriteLine("-proxy svr:80  Proxy server to use for http requests");
                        Console.WriteLine("-encoding name Specify an encoding for the output file (default UTF-8)");
                        Console.WriteLine("-lower         Convert input tags to lower case");
                        Console.WriteLine("-upper         Convert input tags to upper case");
                        Console.WriteLine();
                        Console.WriteLine("InputUri       The input file or http URL (default stdin).  ");
                        Console.WriteLine("               Supports wildcards for local file names.");
                        Console.WriteLine("OutputFile     Output file name (default stdout)");
                        Console.WriteLine("               If input file contains wildcards then this just specifies the output file extension (default .xml)");
                        return;
                    }
                }
                else {
                    // First bare argument is the input, second is the output; any others are ignored.
                    if (inputUri == null) {
                        inputUri = arg;
                        string ext = Path.GetExtension(arg).ToLower();
                        if (ext == ".htm" || ext == ".html")
                            reader.DocType = "HTML";
                    }
                    else if (output == null) output = arg;
                }
            }
            if (inputUri != null && !inputUri.StartsWith("http://") && inputUri.IndexOfAny(new char[] { '*', '?' }) >= 0) {
                // wild card processing of a directory of files.
                string path = Path.GetDirectoryName(inputUri);
                if (path == "") path = ".\\";
                string ext = ".xml";
                if (output != null)
                    ext = Path.GetExtension(output);
                foreach (string uri in Directory.GetFiles(path, Path.GetFileName(inputUri))) {
                    Console.WriteLine("Processing: " + uri);
                    string file = Path.GetFileName(uri);
                    // Each output goes next to its input, with the chosen extension.
                    output = Path.GetDirectoryName(uri) + Path.DirectorySeparatorChar + Path.GetFileNameWithoutExtension(file) + ext;
                    Process(reader, uri);
                    reader.Close();
                }
                return;
            }
            Process(reader, inputUri);
            reader.Close();

            return ;
        }
Example #5
0
        /// <summary>
        /// Follows every <c>a</c> element's <c>href</c> in <paramref name="doc"/>, downloads each
        /// not-yet-visited HTML target, parses it with SgmlReader and recurses into it. Links are
        /// only followed while the current depth is below 5.
        /// </summary>
        /// <param name="dtd">DTD assigned to each SgmlReader created for a linked page.</param>
        /// <param name="doc">Document whose links are followed; its BaseURI (or a BASE href) resolves relative links.</param>
        /// <param name="log">Writer that receives progress and error lines.</param>
        /// <returns><c>false</c> if a nested crawl returned <c>false</c>; otherwise <c>true</c>.</returns>
        bool Crawl(SgmlDtd dtd, XmlDocument doc, TextWriter log)
        {
            depth++;
            try {
                StringBuilder indent = new StringBuilder();
                for (int i = 0; i < depth; i++)
                    indent.Append(" ");

                count++;
                Uri baseUri = new Uri(doc.BaseURI);
                XmlElement baseElmt = (XmlElement)doc.SelectSingleNode("/html/head/base");
                if (baseElmt != null) {
                    string href = baseElmt.GetAttribute("href");
                    if (href != "") {
                        try {
                            baseUri = new Uri(href);
                        }
                        catch (Exception ) {
                            Console.WriteLine("### Error parsing BASE href '"+href+"'");
                        }
                    }
                }
                foreach (XmlElement a in doc.SelectNodes("//a")) {
                    string href = a.GetAttribute("href");
                    if (href == "" || href == null || depth >= 5)
                        continue;

                    Uri local = null;
                    try {
                        local = new Uri(baseUri, href);
                    }
                    catch (UriFormatException) {
                        // One malformed link must not abort the crawl of the whole page.
                        log.WriteLine(indent+"### Error parsing href '"+href+"'");
                        log.Flush();
                        continue;
                    }
                    if (domain && baseUri.Host != local.Host)
                        continue;
                    string ext = Path.GetExtension(local.AbsolutePath).ToLower();
                    if (ext == ".jpg" || ext == ".gif" || ext==".mpg")
                        continue;
                    string url = local.AbsoluteUri;
                    if (visited.ContainsKey(url))
                        continue;

                    visited.Add(url, url);
                    log.WriteLine(indent+"Loading '"+url+"'");
                    log.Flush();
                    StreamReader stm = null;
                    try {
                        HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
                        wr.Timeout = 10000;
                        if (proxy != null) wr.Proxy = new WebProxy(proxy);
                        wr.PreAuthenticate = false;
                        // Pass the credentials of the process.
                        wr.Credentials = CredentialCache.DefaultCredentials;

                        WebResponse resp = wr.GetResponse();
                        Uri actual = resp.ResponseUri;
                        if (actual.AbsoluteUri != url) {
                            local = new Uri(actual.AbsoluteUri);
                            log.WriteLine(indent+"Redirected to '"+actual.AbsoluteUri+"'");
                            log.Flush();
                        }
                        // The header usually carries parameters ("text/html; charset=utf-8"),
                        // so compare the media-type prefix rather than the whole value.
                        string contentType = resp.ContentType;
                        if (contentType == null || !contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase)) {
                            log.WriteLine(indent+"Skipping ContentType="+contentType);
                            log.Flush();
                            resp.Close();
                        }
                        else {
                            stm = new StreamReader(resp.GetResponseStream());
                        }
                    }
                    catch (Exception e) {
                        log.WriteLine(indent+"### Error opening URL: " + e.Message);
                        log.Flush();
                    }
                    if (stm != null) {
                        SgmlReader reader = new SgmlReader();
                        reader.Dtd = dtd;
                        reader.SetBaseUri(local.AbsoluteUri);
                        reader.InputStream = stm;
                        reader.WebProxy = proxy;

                        XmlDocument d2 = new XmlDocument();
                        d2.XmlResolver = null; // don't do any downloads!
                        try {
                            d2.Load(reader);
                            reader.Close();
                            stm.Close();
                            if (!Crawl(dtd, d2, log))
                                return false;
                        }
                        catch (Exception e) {
                            log.WriteLine(indent+"### Error parsing document '"+local.AbsoluteUri+"', "+e.Message);
                            log.Flush();
                            reader.Close();
                            // Also release the response stream when loading failed part-way.
                            stm.Close();
                        }
                    }
                }
                return true;
            }
            finally {
                // Restore the depth on every exit path, including exceptions, so that a failed
                // nested crawl does not leave later links at the wrong depth.
                depth--;
            }
        }
Example #6
0
        /***************************************************************************
        * Useful debugging code...
        * **************************************************************************/
        /// <summary>
        /// Loads the reader's content into an XmlDocument, optionally inserts a <c>base</c>
        /// element carrying the document's BaseURI, and then crawls the document's links.
        /// </summary>
        /// <param name="reader">Reader to load from; it is closed after loading.</param>
        /// <param name="uri">Not used by this method.</param>
        /// <param name="basify">When true and the document has no <c>base</c> element, one is inserted.</param>
        void StartCrawl(SgmlReader reader, string uri, bool basify)
        {
            Console.WriteLine("Loading '"+reader.BaseURI+"'");

            XmlDocument doc = new XmlDocument();
            try {
                doc.XmlResolver = null; // don't do any downloads!
                doc.Load(reader);
            }
            catch (Exception e) {
                Console.WriteLine("Error loading document\n"+e.Message);
            }
            reader.Close();

            if (basify) {
                // html and head are optional; if they are there use them, otherwise not.
                XmlElement be = (XmlElement)doc.SelectSingleNode("//base");
                if (be == null) {
                    be = doc.CreateElement("base");
                    be.SetAttribute("href", doc.BaseURI);

                    XmlNode parent = doc.SelectSingleNode("//head");
                    if (parent == null) parent = doc.SelectSingleNode("//html");
                    if (parent == null) parent = doc.DocumentElement;
                    // After a failed load the document can be empty, leaving nowhere to put the element.
                    if (parent != null) parent.InsertBefore(be, parent.FirstChild);
                }
            }

            try {
                // NOTE(review): ErrorLog is presumably null when no log was configured; fall back
                // to stderr so Crawl always has a writer. Confirm against SgmlReader.
                TextWriter log = reader.ErrorLog != null ? reader.ErrorLog : Console.Error;
                Crawl(reader.Dtd, doc, log);
            }
            catch (Exception e) {
                Console.WriteLine("Uncaught exception: " + e.Message);
            }
        }
Example #7
0
        /// <summary>
        /// Points the reader at its input and then, depending on the instance's mode flags,
        /// debugs it, crawls it, or writes its content out as XML.
        /// </summary>
        /// <param name="reader">Reader to configure and consume.</param>
        /// <param name="uri">Input location, or <c>null</c> to read from <see cref="Console.In"/>.</param>
        /// <param name="loadAsStream">
        /// When true the input is opened here (local file or web request) and handed to the
        /// reader as a stream; otherwise the reader is given the location through its Href.
        /// </param>
        void Process(SgmlReader reader, string uri, bool loadAsStream)
        {
            if (uri == null) {
                reader.InputStream = Console.In;
            }
            else if (loadAsStream) {
                Uri location = new Uri(uri);
                if (location.IsFile) {
                    reader.InputStream = new StreamReader(uri);
                } else {
                    WebRequest wr = WebRequest.Create(location);
                    reader.InputStream = new StreamReader(wr.GetResponse().GetResponseStream());
                }
            } else {
                reader.Href = uri;
            }

            if (debug) {
                Debug(reader);
                reader.Close();
                return;
            }
            if (crawl) {
                StartCrawl(reader, uri, basify);
                return;
            }

            if (this.encoding == null) {
                this.encoding = reader.GetEncoding();
            }

            XmlTextWriter w = null;
            if (output != null) {
                w = new XmlTextWriter(output, this.encoding);
            }
            else {
                w = new XmlTextWriter(Console.Out);
            }
            try {
                if (formatted) w.Formatting = Formatting.Indented;
                if (!noxmldecl) {
                    w.WriteStartDocument();
                }
                if (testdoc) {
                    XmlDocument doc = new XmlDocument();
                    try {
                        doc.Load(reader);
                        doc.WriteTo(w);
                    } catch (XmlException e) {
                        Console.WriteLine("Error:" + e.Message);
                        Console.WriteLine("at line " + e.LineNumber + " column " + e.LinePosition);
                    }
                } else {
                    // WriteNode advances the reader itself, so no Read() is needed inside the loop.
                    reader.Read();
                    while (!reader.EOF) {
                        w.WriteNode(reader, true);
                    }
                }
                w.Flush();
            }
            finally {
                // Close the writer, and with it any output file, even when parsing or writing throws.
                w.Close();
            }
        }
        /// <summary>
        /// Downloads the page at <paramref name="uri"/>, converts it to XML text with SgmlReader
        /// and checks that the result loads as an <see cref="XPathDocument"/>.
        /// </summary>
        /// <param name="uri">Address to download; an empty string selects a built-in default address.</param>
        /// <returns>
        /// An empty string when download and conversion succeed (this method does not extract
        /// any nodes); otherwise the message of the exception that was thrown.
        /// </returns>
        private string GetWellFormedHTML_Handle(string uri)
        {
            HttpWebResponse res = null;
            StreamReader sReader = null;
            StringWriter sw = null;
            SgmlReader reader = null;
            XmlTextWriter writer = null;
            try
            {
                if (uri == String.Empty) uri = "http://www.ypshop.net/list--91-940-940--search-1.html";
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uri);
                res = (HttpWebResponse)req.GetResponse();
                sReader = new StreamReader(res.GetResponseStream());

                reader = new SgmlReader();
                reader.DocType = "HTML";
                reader.InputStream = new StringReader(sReader.ReadToEnd());

                sw = new StringWriter();
                writer = new XmlTextWriter(sw);
                writer.Formatting = Formatting.Indented;

                // WriteNode leaves the reader on the node after the one it copied, so Read() is
                // only used to step over skipped nodes. Calling Read() after every WriteNode
                // would silently drop the following sibling.
                reader.Read();
                while (reader.EOF == false)
                {
                    if (reader.NodeType == XmlNodeType.Whitespace)
                    {
                        reader.Read();
                    }
                    else
                    {
                        writer.WriteNode(reader, true);
                    }
                }
                writer.Flush();

                // Loading the text throws if it is not well-formed XML. No nodes are selected
                // from the document, so a successful run yields an empty result.
                XPathDocument doc = new XPathDocument(new StringReader(sw.ToString()));
                return String.Empty;
            }
            catch (Exception exp)
            {
                // Callers receive the failure as text rather than as an exception.
                return exp.Message;
            }
            finally
            {
                // Any of these may still be null if the failure happened early.
                if (writer != null) writer.Close();
                if (reader != null) reader.Close();
                if (sw != null) sw.Close();
                if (sReader != null) sReader.Close();
                if (res != null) res.Close();
            }
        }
        /// <summary>
        /// Downloads an HTML page, converts it to XML text with SgmlReader and optionally
        /// extracts the values of the nodes selected by an XPath expression.
        /// </summary>
        /// <param name="uri">Address of the page; an empty string selects a built-in default address.</param>
        /// <param name="xpath">XPath expression to select nodes, or <c>null</c> to return the whole converted markup.</param>
        /// <returns>
        /// The converted markup, or the selected node values each followed by a space; if an
        /// exception is thrown, its message instead.
        /// </returns>
        private string GetWellFormedHTML(string uri, string xpath)
        {
            StringWriter sw = null;      // receives the generated XML text
            SgmlReader reader = null;    // parses the downloaded HTML
            XmlTextWriter writer = null; // writes the XML into sw
            try
            {
                if (uri == String.Empty)
                    uri = "http://www.ypshop.net/list--91-940-940--search-1.html";

                // Page content
                string strWebContent;
                using (WebClient webclient = new WebClient())
                {
                    webclient.Encoding = Encoding.UTF8;
                    strWebContent = webclient.DownloadString(uri);
                }

                reader = new SgmlReader();
                reader.DocType = "HTML";
                reader.InputStream = new StringReader(strWebContent);

                sw = new StringWriter();
                writer = new XmlTextWriter(sw);
                writer.Formatting = Formatting.Indented;

                // WriteNode leaves the reader on the node after the one it copied, so Read() is
                // only used to step over skipped nodes. Calling Read() after every WriteNode
                // would silently drop the following sibling.
                reader.Read();
                while (!reader.EOF)
                {
                    if (reader.NodeType == XmlNodeType.Whitespace)
                    {
                        reader.Read();
                    }
                    else
                    {
                        writer.WriteNode(reader, true);
                    }
                }
                writer.Flush();

                if (xpath == null)
                {
                    return sw.ToString();
                }

                // Filter out nodes from HTML
                StringBuilder sb = new StringBuilder();
                XPathDocument doc = new XPathDocument(new StringReader(sw.ToString()));
                XPathNavigator nav = doc.CreateNavigator();
                XPathNodeIterator nodes = nav.Select(xpath);
                while (nodes.MoveNext())
                {
                    sb.Append(nodes.Current.Value + " ");
                }
                return sb.ToString();
            }
            catch (Exception exp)
            {
                // Callers receive the failure as text rather than as an exception.
                return exp.Message;
            }
            finally
            {
                // Any of these may still be null if the failure happened early.
                if (writer != null) writer.Close();
                if (reader != null) reader.Close();
                if (sw != null) sw.Close();
            }
        }
        /// <summary>
        /// Converts HTML markup to XHTML text by reading it with SgmlReader (names folded to
        /// lower case, whitespace handling set to None) and writing it out as indented XML.
        /// </summary>
        /// <param name="html">HTML markup to convert.</param>
        /// <returns>The converted markup.</returns>
        public static string ToXhtml(string html)
        {
            SgmlReader reader = new SgmlReader();
            StringWriter sw = new StringWriter(CultureInfo.InvariantCulture);
            XmlTextWriter writer = new XmlTextWriter(sw);
            try
            {
                reader.CaseFolding = CaseFolding.ToLower;
                reader.DocType = "HTML";
                reader.InputStream = new StringReader(html);

                writer.Formatting = Formatting.Indented;
                reader.WhitespaceHandling = WhitespaceHandling.None;
                while (!reader.EOF)
                {
                    writer.WriteNode(reader, true);
                }
                writer.Flush();
                return sw.ToString();
            }
            finally
            {
                // Close the XML writer (which also closes sw) rather than closing sw first,
                // and release both reader and writer even when conversion throws.
                reader.Close();
                writer.Close();
            }
        }
Example #11
0
        /// <summary>
        /// Downloads the resource at <c>_uri</c>, decodes it with code page 1251, parses it as
        /// HTML with SgmlReader and loads it into an <see cref="XmlDocument"/>, which is also
        /// stored in <c>_doc</c>.
        /// </summary>
        /// <returns>The loaded document (whitespace preserved, no XML resolver).</returns>
        public XmlDocument Proceed()
        {
            HttpWebRequest req = (HttpWebRequest) HttpWebRequest.Create(_uri);

            WebResponse response = req.GetResponse();
            Sgml.SgmlReader sgmlReader = null;
            try
            {
                var st = response.GetResponseStream();

                System.IO.TextReader tr = new System.IO.StreamReader(st,  System.Text.Encoding.GetEncoding(1251)) ;

                sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
                sgmlReader.InputStream = tr;

                // create document
                XmlDocument doc = new XmlDocument();
                doc.PreserveWhitespace = true;
                doc.XmlResolver = null;
                doc.Load(sgmlReader);

                _doc = doc;
                return doc;
            }
            finally
            {
                // Release the connection and the reader even when decoding or loading throws.
                response.Close();
                if (sgmlReader != null) sgmlReader.Close();
            }
        }
        /// <summary>
        /// Parses HTML to XML and returns it as a string. Any parsing error is thrown to the caller.
        /// </summary>
        /// <remarks>
        /// Use this method when you want to catch a parsing error. The input is passed through
        /// PreProcessHtml before parsing and the output through PostProcessHtml before it is
        /// returned. Top-level whitespace nodes are dropped; a DocumentType node is dropped only
        /// when <c>ParserProperties.RemoveDocumentType</c> is set.
        /// </remarks>
        /// <param name="html">HTML string to parse.</param>
        /// <returns>A string with the parsed value.</returns>
        public string GetParsableString(string html)
        {
            html = PreProcessHtml(html);
            SgmlReader reader = new SgmlReader();

            // set SgmlReader values
            reader.DocType = "HTML";
            reader.InputStream = new StringReader(html);

            // write to xml
            StringWriter sw = new StringWriter();
            XmlTextWriter w = new XmlTextWriter(sw);

            w.Formatting = Formatting.Indented;

            try
            {
                // WriteNode leaves the reader on the node after the one it copied, so Read() is
                // only used to step over skipped nodes. Calling Read() after every WriteNode
                // would silently drop the following sibling.
                reader.Read();
                while (!reader.EOF)
                {
                    bool skip = reader.NodeType == XmlNodeType.Whitespace
                        || (reader.NodeType == XmlNodeType.DocumentType && this.ParserProperties.RemoveDocumentType);

                    if (skip)
                    {
                        reader.Read();
                    }
                    else
                    {
                        // Write the current node, including its subtree, to xml
                        w.WriteNode(reader, true);
                    }
                }
                w.Flush();

                return PostProcessHtml(sw.ToString());
            }
            finally
            {
                reader.Close();
                w.Close();
            }
        }