Пример #1
0
        /// <summary>
        /// Selects the reader's input from <paramref name="uri"/> and then, depending on the
        /// option fields, dumps debug output, starts a crawl, or writes the parsed content as
        /// XML to <c>output</c> (or to the console when <c>output</c> is null).
        /// </summary>
        /// <param name="reader">The SGML reader to configure and read from.</param>
        /// <param name="uri">Input location; null means the input is read from <c>Console.In</c>.</param>
        /// <param name="loadAsStream">
        /// When true the input is opened here (file or web request) and handed to the reader as
        /// a stream; when false the location is assigned to <c>reader.Href</c>.
        /// </param>
        void Process(SgmlReader reader, string uri, bool loadAsStream) {
            if (uri == null) {
                reader.InputStream = Console.In;
            }
            else if (loadAsStream) {
                Uri location = new Uri(uri);
                if (location.IsFile) {
                    reader.InputStream = new StreamReader(uri);
                } else {
                    WebRequest wr = WebRequest.Create(location);
                    reader.InputStream = new StreamReader(wr.GetResponse().GetResponseStream());
                }
            } else {
                reader.Href = uri;
            }

            if (debug) {
                Debug(reader);
                reader.Close();
                return;
            }
            if (crawl) {
                StartCrawl(reader, uri, basify);
                return;
            }

            // Fall back to the encoding reported by the reader when none was configured.
            if (this.encoding == null) {
                this.encoding = reader.GetEncoding();
            }

            XmlTextWriter w;
            if (output != null) {
                w = new XmlTextWriter(output, this.encoding);
            }
            else {
                w = new XmlTextWriter(Console.Out);
            }

            // The writer may own an open output file, so it is closed even when
            // parsing or writing throws.
            try {
                if (formatted) w.Formatting = Formatting.Indented;
                if (!noxmldecl) {
                    w.WriteStartDocument();
                }
                if (testdoc) {
                    // Round-trip through XmlDocument; parse errors are reported, not rethrown.
                    XmlDocument doc = new XmlDocument();
                    try {
                        doc.Load(reader);
                        doc.WriteTo(w);
                    } catch (XmlException e) {
                        Console.WriteLine("Error:" + e.Message);
                        Console.WriteLine("at line " + e.LineNumber + " column " + e.LinePosition);
                    }
                } else {
                    // WriteNode leaves the reader on the next sibling, so no extra Read()
                    // is needed between iterations.
                    reader.Read();
                    while (!reader.EOF) {
                        w.WriteNode(reader, true);
                    }
                }
                w.Flush();
            }
            finally {
                w.Close();
            }
        }
Пример #2
0
        /// <summary>
        /// Wraps the HTML content in a <c>&lt;root&gt;</c> element, converts it to XML with
        /// SgmlReader (tag names folded to lower case, all whitespace kept) and loads the
        /// result into a new <see cref="XmlDocument"/>.
        /// </summary>
        /// <param name="sContent">HTML content to parse.</param>
        /// <returns>The loaded document, with whitespace preserved and no XmlResolver.</returns>
        public static XmlDocument ParseHtml(string sContent)
        {
            string wrapped = "<root>" + sContent + "</root>";

            SgmlReader sgml = new SgmlReader
            {
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding        = Sgml.CaseFolding.ToLower,
                InputStream        = new StringReader(wrapped)
            };

            StringWriter  buffer = new StringWriter();
            XmlTextWriter xml    = new XmlTextWriter(buffer);

            xml.Formatting = Formatting.Indented;
            xml.WriteStartDocument();

            // Copy every top-level node; WriteNode advances the reader itself.
            for (sgml.Read(); !sgml.EOF;)
            {
                xml.WriteNode(sgml, true);
            }
            xml.Flush();
            xml.Close();

            buffer.Flush();

            // Build the resulting document from the generated XML text.
            XmlDocument result = new XmlDocument();

            result.PreserveWhitespace = true;
            result.XmlResolver        = null;
            result.LoadXml(buffer.ToString());

            sgml.Close();

            return result;
        }
Пример #3
0
        /// <summary>
        /// Processes HTML markup by parsing it with SgmlReader and passing it through
        /// <c>WriteXml</c> into an indented XML writer.
        /// </summary>
        /// <param name="input">The string to process; null or empty input is returned unchanged.</param>
        /// <param name="skipHtmlNode">Whether to skip the html node.</param>
        /// <param name="clearTag">Whether to strip the HTML tags and output plain text only.</param>
        /// <param name="maxCount">Number of text characters to copy; if maxCount &lt;= 0 all text is copied.</param>
        /// <param name="endStr">Suffix appended when only part of the text was copied, e.g. "...".</param>
        /// <returns>The processed HTML.</returns>
        public static string ProcessHtml(string input, bool skipHtmlNode, bool clearTag, int maxCount, string endStr)
        {
            if (string.IsNullOrEmpty(input))
            {
                return input;
            }

            SgmlReader sgml = new SgmlReader();
            sgml.DocType     = "HTML";
            sgml.InputStream = new StringReader(input);

            StringWriter  buffer = new StringWriter();
            XmlTextWriter xml    = new XmlTextWriter(buffer);
            xml.Formatting = Formatting.Indented;

            WriteXml(xml, sgml, true, skipHtmlNode, clearTag, maxCount, endStr);

            xml.Flush();
            xml.Close();
            sgml.Close();

            return buffer.ToString();
        }
Пример #4
0
        /// <summary>
        /// Converts HTML into well-formed, indented XML with SgmlReader and optionally
        /// filters the result with an XPath expression.
        /// </summary>
        /// <param name="html">HTML markup to convert.</param>
        /// <param name="xpathNavPath">
        /// XPath expression to evaluate against the converted document; when null the whole
        /// converted document is returned.
        /// </param>
        /// <returns>
        /// The converted markup, or the string values of the selected nodes separated by
        /// newlines. Carriage returns are replaced by "\n" and pairs of "\n" are collapsed in
        /// a single pass. If any exception occurs, the exception message is returned instead.
        /// </returns>
        public static string GetWellFormedHTML(string html, string xpathNavPath)
        {
            StringWriter  sw     = null;
            SgmlReader    reader = null;
            XmlTextWriter writer = null;

            try
            {
                reader             = new SgmlReader();
                reader.DocType     = "HTML";
                reader.InputStream = new StringReader(html);
                sw                = new StringWriter();
                writer            = new XmlTextWriter(sw);
                writer.Formatting = Formatting.Indented;

                // WriteNode leaves the reader on the node that follows the one it copied,
                // so Read() is only called to step over nodes that are not copied.
                // Calling Read() after every WriteNode would skip every second node.
                reader.Read();
                while (!reader.EOF)
                {
                    if (reader.NodeType == XmlNodeType.Whitespace)
                    {
                        reader.Read();
                    }
                    else
                    {
                        writer.WriteNode(reader, true);
                    }
                }
                writer.Flush();

                string result;
                if (xpathNavPath == null)
                {
                    result = sw.ToString();
                }
                else
                {
                    // Filter out nodes from the converted HTML.
                    StringBuilder     sb    = new StringBuilder();
                    XPathDocument     doc   = new XPathDocument(new StringReader(sw.ToString()));
                    XPathNavigator    nav   = doc.CreateNavigator();
                    XPathNodeIterator nodes = nav.Select(xpathNavPath);
                    while (nodes.MoveNext())
                    {
                        sb.Append(nodes.Current.Value + "\n");
                    }
                    result = sb.ToString();
                }

                result = result.Replace("\r", "\n");
                result = result.Replace("\n\n", "\n");
                return result;
            }
            catch (Exception exp)
            {
                // Callers receive the error text as the result.
                return exp.Message;
            }
            finally
            {
                // Any of these may still be null if the failure happened during setup.
                if (writer != null)
                {
                    writer.Close();
                }
                if (reader != null)
                {
                    reader.Close();
                }
                if (sw != null)
                {
                    sw.Close();
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Converts HTML into an XML document with SgmlReader and returns either the whole
        /// document or the outer XML of the nodes selected by an XPath expression.
        /// </summary>
        /// <param name="htmlString">HTML markup to convert.</param>
        /// <param name="xpath">
        /// XPath expression to evaluate against the converted document; when null the whole
        /// converted document is returned.
        /// </param>
        /// <returns>
        /// The converted document, or the selected nodes' outer XML each followed by a space.
        /// An empty string is returned when the input is null, trims to fewer than 10
        /// characters, or any exception occurs.
        /// </returns>
        public static string GetWellFormedHTML(string htmlString, string xpath)
        {
            if (htmlString == null || htmlString.Trim().Length < 10)
            {
                return "";
            }

            // Every occurrence of "xmlns" is renamed before parsing.
            // NOTE(review): presumably so the output carries no namespace declarations
            // and XPath queries need no prefixes - confirm.
            htmlString = htmlString.Replace("xmlns", "buyao");

            StringWriter  sw     = null;
            SgmlReader    reader = null;
            XmlTextWriter writer = null;

            try
            {
                reader             = new SgmlReader();
                reader.DocType     = "HTML";
                reader.InputStream = new StringReader(htmlString);
                sw                = new StringWriter();
                writer            = new XmlTextWriter(sw);
                writer.Formatting = Formatting.Indented;
                writer.WriteStartDocument();

                // WriteNode leaves the reader on the node that follows the one it copied,
                // so Read() is only called to step over nodes that are not copied.
                reader.Read();
                while (!reader.EOF)
                {
                    if (reader.NodeType == XmlNodeType.Whitespace)
                    {
                        reader.Read();
                        continue;
                    }

                    try
                    {
                        writer.WriteNode(reader, true);
                    }
                    catch (Exception)
                    {
                        // Best effort: a node that cannot be written is discarded.
                        // Step forward so the loop always makes progress.
                        reader.Read();
                    }
                }
                writer.Flush();

                if (xpath == null)
                {
                    return sw.ToString();
                }

                StringBuilder     sb    = new StringBuilder();
                XPathDocument     doc   = new XPathDocument(new StringReader(sw.ToString()));
                XPathNavigator    nav   = doc.CreateNavigator();
                XPathNodeIterator nodes = nav.Select(xpath);
                while (nodes.MoveNext())
                {
                    sb.Append(nodes.Current.OuterXml + " ");
                }
                return sb.ToString();
            }
            catch (Exception)
            {
                // Any failure is reported to the caller as an empty result.
                return "";
            }
            finally
            {
                // Any of these may still be null if the failure happened during setup.
                if (writer != null)
                {
                    writer.Close();
                }
                if (reader != null)
                {
                    reader.Close();
                }
                if (sw != null)
                {
                    sw.Close();
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Converts the returned HTML page to XML with SgmlReader and extracts (time, state)
        /// pairs from the cells of the table nested inside the
        /// <c>ctl00_ContentPlaceHolder1_TrackDetail</c> table.
        /// </summary>
        /// <param name="backstring">HTML text of the response page.</param>
        /// <returns>A <c>ResultInfo</c> created for <c>querynum</c> holding the extracted entries.</returns>
        private ResultInfo getDetail(string backstring)
        {
            SgmlReader    reader = new SgmlReader();
            StringWriter  sw     = new StringWriter();
            XmlTextWriter writer = new XmlTextWriter(sw);

            // try/finally so the reader and writers are released even when parsing fails.
            try
            {
                reader.DocType            = "HTML";
                reader.InputStream        = new StringReader(backstring);
                reader.WhitespaceHandling = WhitespaceHandling.None;
                writer.Formatting         = Formatting.Indented;

                // WriteNode leaves the reader on the node that follows the one it copied,
                // so Read() is only called to step over nodes that are not copied.
                // Calling Read() after every WriteNode would skip every second node.
                reader.Read();
                while (!reader.EOF)
                {
                    if (reader.NodeType == XmlNodeType.Whitespace)
                    {
                        reader.Read();
                    }
                    else
                    {
                        writer.WriteNode(reader, true);
                    }
                }
                writer.Flush();

                XmlDocument doc = new XmlDocument();
                doc.Load(new StringReader(sw.ToString()));

                // The XPath below addresses elements through the XHTML namespace.
                XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
                xnm.AddNamespace("bottum", "http://www.w3.org/1999/xhtml");

                XPathNavigator nav = doc.CreateNavigator();

                // Analyse the result returned by the web page.
                string            xpath    = "//bottum:table[@id='ctl00_ContentPlaceHolder1_TrackDetail']/bottum:tr/bottum:td/bottum:div[8]/bottum:table/bottum:tr/bottum:td";
                XPathNodeIterator nodes    = nav.Select(xpath, xnm);
                ResultInfo        backinfo = new ResultInfo(querynum);

                // Count walks the whole node set, so it is evaluated once.
                int total = nodes.Count;

                // Skip the first three cells.
                // NOTE(review): presumably a header row - confirm.
                if (total > 3)
                {
                    nodes.MoveNext();
                    nodes.MoveNext();
                    nodes.MoveNext();
                }

                // Each following group of three cells yields the time (first cell)
                // and the state (third cell).
                for (int i = 1; i < total / 3; i++)
                {
                    nodes.MoveNext();
                    string time = nodes.Current.Value;
                    nodes.MoveNext();
                    nodes.MoveNext();
                    string state = nodes.Current.Value;
                    backinfo.add(time, state);
                }
                return backinfo;
            }
            finally
            {
                reader.Close();
                writer.Close();
                sw.Close();
            }
        }
Пример #7
0
        /// <summary>
        /// Converts the returned HTML page to XML with SgmlReader and extracts (time, state)
        /// pairs from the cells of the <c>GridView1</c> table.
        /// </summary>
        /// <param name="backstring">HTML text of the response page.</param>
        /// <returns>A <c>ResultInfo</c> created for <c>queryNumber</c> holding the extracted entries.</returns>
        public ResultInfo getDetail(string backstring)
        {
            SgmlReader    reader = new SgmlReader();
            StringWriter  sw     = new StringWriter();
            XmlTextWriter writer = new XmlTextWriter(sw);
            XmlDocument   doc    = new XmlDocument();

            // try/finally so the reader and writers are released even when parsing fails.
            try
            {
                reader.DocType            = "HTML";
                reader.InputStream        = new StringReader(backstring);
                reader.WhitespaceHandling = WhitespaceHandling.None;
                writer.Formatting         = Formatting.Indented;

                // WriteNode leaves the reader on the node that follows the one it copied,
                // so Read() is only called to step over nodes that are not copied.
                // Calling Read() after every WriteNode would skip every second node.
                reader.Read();
                while (!reader.EOF)
                {
                    if (reader.NodeType == XmlNodeType.Whitespace)
                    {
                        reader.Read();
                    }
                    else
                    {
                        writer.WriteNode(reader, true);
                    }
                }
                writer.Flush();

                doc.Load(new StringReader(sw.ToString()));
            }
            finally
            {
                reader.Close();
                writer.Close();
                sw.Close();
            }

            // The XPath below addresses elements through the XHTML namespace.
            XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
            xnm.AddNamespace("bottum", "http://www.w3.org/1999/xhtml");

            XPathNavigator    nav   = doc.CreateNavigator();
            string            xpath = "//bottum:table[@id='GridView1']/bottum:tr/bottum:td";
            XPathNodeIterator nodes = nav.Select(xpath, xnm);

            ResultInfo backinfo = new ResultInfo(queryNumber);
            if (nodes != null)
            {
                // Cells are consumed in groups of three: the first is ignored, the second
                // is the time and the third is the state.
                int rows = nodes.Count / 3;
                for (int i = 0; i < rows; i++)
                {
                    nodes.MoveNext();
                    nodes.MoveNext();
                    string time = nodes.Current.Value;
                    nodes.MoveNext();
                    string state = nodes.Current.Value;
                    backinfo.add(time, state);
                }
            }
            return backinfo;
        }
Пример #8
0
        /// <summary>
        /// Converts the returned HTML page to XML with SgmlReader and extracts (time, state)
        /// pairs from the cells selected by <c>//table[1]/tr/td</c>.
        /// </summary>
        /// <param name="backstring">HTML text of the response page.</param>
        /// <returns>A <c>ResultInfo</c> created for <c>querynum</c> holding the extracted entries.</returns>
        private ResultInfo getDetail(string backstring)
        {
            SgmlReader    reader = new SgmlReader();
            StringWriter  sw     = new StringWriter();
            XmlTextWriter writer = new XmlTextWriter(sw);

            // try/finally so the reader and writers are released even when parsing fails.
            try
            {
                reader.DocType            = "HTML";
                reader.InputStream        = new StringReader(backstring);
                reader.WhitespaceHandling = WhitespaceHandling.None;
                writer.Formatting         = Formatting.Indented;

                // WriteNode leaves the reader on the node that follows the one it copied,
                // so Read() is only called to step over nodes that are not copied.
                // Calling Read() after every WriteNode would skip every second node.
                reader.Read();
                while (!reader.EOF)
                {
                    if (reader.NodeType == XmlNodeType.Whitespace)
                    {
                        reader.Read();
                    }
                    else
                    {
                        writer.WriteNode(reader, true);
                    }
                }
                writer.Flush();

                XmlDocument doc = new XmlDocument();
                doc.Load(new StringReader(sw.ToString()));

                XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
                XPathNavigator      nav = doc.CreateNavigator();

                // Analyse the result returned by the web page.
                string            xpath = "//table[1]/tr/td";
                XPathNodeIterator nodes = nav.Select(xpath, xnm);

                ResultInfo backinfo = new ResultInfo(querynum);

                // Count walks the whole node set, so it is evaluated once.
                int total = nodes.Count;

                // Skip the first four cells.
                // NOTE(review): presumably header cells - confirm.
                if (total >= 4)
                {
                    nodes.MoveNext();
                    nodes.MoveNext();
                    nodes.MoveNext();
                    nodes.MoveNext();
                }

                // NOTE(review): every iteration consumes three cells while the bound is
                // total / 2 starting at 4, and MoveNext() results are ignored. Kept as
                // in the original because the intended page layout is not visible here.
                for (int i = 4; i < total / 2; i++)
                {
                    nodes.MoveNext();
                    string time = nodes.Current.Value;
                    nodes.MoveNext();
                    string state = nodes.Current.Value;
                    backinfo.add(time, state);
                    nodes.MoveNext();
                }
                return backinfo;
            }
            finally
            {
                reader.Close();
                writer.Close();
                sw.Close();
            }
        }
Пример #9
0
        /// <summary>
        /// Converts the returned HTML page to XML with SgmlReader and extracts (time, state)
        /// pairs from the cells selected by <c>/html/body/table[8]/tr/td</c>.
        /// </summary>
        /// <param name="backstring">HTML text of the response page.</param>
        /// <returns>A <c>ResultInfo</c> created for <c>querynum</c> holding the extracted entries.</returns>
        public ResultInfo getDetail(string backstring)
        {
            // Drop the XHTML default-namespace declaration so the un-prefixed XPath
            // below matches the elements.
            backstring = backstring.Replace("xmlns=\"http://www.w3.org/1999/xhtml\"", "");

            SgmlReader    reader = new SgmlReader();
            StringWriter  sw     = new StringWriter();
            XmlTextWriter writer = new XmlTextWriter(sw);

            // try/finally so the reader and writers are released even when parsing fails.
            try
            {
                reader.DocType            = "HTML";
                reader.InputStream        = new StringReader(backstring);
                reader.WhitespaceHandling = WhitespaceHandling.None;
                writer.Formatting         = Formatting.Indented;

                // WriteNode leaves the reader on the node that follows the one it copied,
                // so Read() is only called to step over nodes that are not copied.
                // Calling Read() after every WriteNode would skip every second node.
                reader.Read();
                while (!reader.EOF)
                {
                    if (reader.NodeType == XmlNodeType.Whitespace)
                    {
                        reader.Read();
                    }
                    else
                    {
                        writer.WriteNode(reader, true);
                    }
                }
                writer.Flush();

                XmlDocument doc = new XmlDocument();
                doc.Load(new StringReader(sw.ToString()));

                XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
                xnm.AddNamespace("bottum", "http://www.w3.org/1999/xhtml");

                XPathNavigator    nav   = doc.CreateNavigator();
                string            xpath = "/html/body/table[8]/tr/td";
                XPathNodeIterator nodes = nav.Select(xpath, xnm);

                // Count walks the whole node set, so it is evaluated once.
                int total = nodes.Count;

                // Skip the first two cells.
                // NOTE(review): presumably a header row - confirm.
                if (total >= 2)
                {
                    nodes.MoveNext();
                    nodes.MoveNext();
                }

                ResultInfo backinfo = new ResultInfo(querynum);

                // The remaining cells are read as (time, state) pairs.
                for (int i = 1; i < total / 2; i++)
                {
                    nodes.MoveNext();
                    string time = nodes.Current.Value;
                    nodes.MoveNext();
                    string state = nodes.Current.Value;
                    backinfo.add(time, state);
                }
                return backinfo;
            }
            finally
            {
                reader.Close();
                writer.Close();
                sw.Close();
            }
        }
Пример #10
0
        /***************************************************************************
         * Useful debugging code...
         * **************************************************************************/
        /// <summary>
        /// Loads the reader's content into an <see cref="XmlDocument"/>, optionally inserts a
        /// <c>base</c> element carrying the document's base URI, and passes the document to
        /// <c>Crawl</c>.
        /// </summary>
        /// <param name="reader">Reader positioned on the document to load; it is closed after loading.</param>
        /// <param name="uri">Not used by this method.</param>
        /// <param name="basify">When true, a <c>base</c> element is added if the document has none.</param>
        void StartCrawl(SgmlReader reader, string uri, bool basify)
        {
            Console.WriteLine("Loading '" + reader.BaseURI + "'");

            XmlDocument doc = new XmlDocument();

            try {
                doc.XmlResolver = null; // don't do any downloads!
                doc.Load(reader);
            }
            catch (Exception e) {
                Console.WriteLine("Error loading document\n" + e.Message);
            }
            reader.Close();

            if (basify)
            {
                // html and head are optional; if they are there use them, otherwise not.
                XmlElement be = (XmlElement)doc.SelectSingleNode("//base");
                if (be == null)
                {
                    be = doc.CreateElement("base");
                    be.SetAttribute("href", doc.BaseURI);

                    XmlElement head = (XmlElement)doc.SelectSingleNode("//head");
                    XmlElement html = (XmlElement)doc.SelectSingleNode("//html");
                    if (head != null)
                    {
                        head.InsertBefore(be, head.FirstChild);
                    }
                    else if (html != null)
                    {
                        html.InsertBefore(be, html.FirstChild);
                    }
                    else if (doc.DocumentElement != null)
                    {
                        doc.DocumentElement.InsertBefore(be, doc.DocumentElement.FirstChild);
                    }
                    // Otherwise the document has no root element (e.g. the load above
                    // failed), so there is nowhere to attach the base element.
                }
            }

            try {
                Crawl(reader.Dtd, doc, reader.ErrorLog);
            }
            catch (Exception e) {
                Console.WriteLine("Uncaught exception: " + e.Message);
            }
        }
Пример #11
0
    /// <summary>
    /// Fetches the text at <paramref name="url"/> via <c>FetchWebText</c>, parses it with
    /// SgmlReader and loads it into a new <see cref="XmlDocument"/>.
    /// </summary>
    /// <param name="url">Location passed to <c>FetchWebText</c>.</param>
    /// <returns>The loaded document.</returns>
    XmlDocument FetchXmlDocument(Uri url)
    {
        var sr = FetchWebText(url);
        try
        {
            var xr = new SgmlReader()
            {
                InputStream = sr
            };
            try
            {
                var doc = new XmlDocument();
                doc.Load(xr);
                return doc;
            }
            finally
            {
                xr.Close();
            }
        }
        finally
        {
            // Release the fetched text reader even when parsing throws.
            if (sr != null)
            {
                sr.Close();
            }
        }
    }
Пример #12
0
        /// <summary>
        /// Parses <c>pageText</c> into <c>document</c> using an SgmlReader configured with the
        /// DTD from <c>ParseDtd</c>, then calls <c>ParseForms</c>. The reader is always closed.
        /// </summary>
        /// <remarks>
        /// A <c>WebException</c> raised while loading is rethrown as <c>DoctypeDtdException</c>.
        /// An <c>XmlException</c> causes the page text to be written to standard output and a
        /// <c>ParseException</c> to be thrown.
        /// </remarks>
        private void ParsePageText()
        {
            SgmlReader sgml = new SgmlReader();

            try
            {
                string fixedHtml = FixHtmlToAvoidParseErrors(pageText);
                sgml.InputStream = new StringReader(fixedHtml);

                // note: this is last-found performance bottleneck; not yet fixed.  Retest before fixing.
                sgml.Dtd      = ParseDtd(sgml.NameTable);
                sgml.ErrorLog = Console.Error;
                sgml.DocType  = "HTML";

                document = new XhtmlDocument(sgml.NameTable);
                try
                {
                    document.Load(sgml);
                }
                catch (WebException webError)
                {
                    throw new DoctypeDtdException(webError);
                }

                ParseForms();
            }
            catch (XmlException parseError)
            {
                Console.WriteLine("vvvvvv The following HTML could not be parsed by NUnitAsp vvvvvv");
                Console.WriteLine(pageText);
                Console.WriteLine("^^^^^^ The preceding HTML could not be parsed by NUnitAsp ^^^^^^");

                string message = "Could not parse HTML.  See standard out for the HTML and use a validator (such as the one at validator.w3.org) to troubleshoot.  Parser error was: " + parseError.Message;
                throw new ParseException(message);
            }
            finally
            {
                sgml.Close();
            }
        }
Пример #13
0
        /// <summary>
        /// Parses the returned HTML page with the SGML library, converts it to XML and extracts
        /// (time, state) pairs from the <c>font_c</c> rows of the second table inside the
        /// <c>ess_ctr1579_TrackResult_DivBill</c> div.
        /// </summary>
        /// <param name="backstring">HTML text of the response page.</param>
        /// <returns>A <c>ResultInfo</c> created for <c>queryNumber</c> holding the extracted entries.</returns>
        private ResultInfo getDetail(string backstring)
        {
            SgmlReader    readern = new SgmlReader();
            StringWriter  sw      = new StringWriter();
            XmlTextWriter writer  = new XmlTextWriter(sw);

            // try/finally so the reader and writers are released even when parsing fails.
            try
            {
                readern.DocType            = "HTML";
                readern.InputStream        = new StringReader(backstring);
                readern.WhitespaceHandling = WhitespaceHandling.None;
                writer.Formatting          = Formatting.Indented;

                // WriteNode leaves the reader on the node that follows the one it copied,
                // so Read() is only called to step over nodes that are not copied.
                // Calling Read() before every WriteNode would skip every second node.
                readern.Read();
                while (!readern.EOF)
                {
                    if (readern.NodeType == XmlNodeType.Whitespace)
                    {
                        readern.Read();
                    }
                    else
                    {
                        writer.WriteNode(readern, true);
                    }
                }
                writer.Flush();

                XmlDocument doc = new XmlDocument();
                doc.Load(new StringReader(sw.ToString()));

                XmlNamespaceManager xnm      = new XmlNamespaceManager(doc.NameTable);
                XPathNavigator      nav      = doc.CreateNavigator();
                string              xpath    = "//div[@id='ess_ctr1579_TrackResult_DivBill']/table[2]/tr[@class='font_c']/td";
                XPathNodeIterator   nodes    = nav.Select(xpath, xnm);
                ResultInfo          backinfo = new ResultInfo(queryNumber);

                // Cells are read as (time, state) pairs. Count walks the whole node
                // set, so it is evaluated once.
                int pairs = nodes.Count / 2;
                for (int i = 0; i < pairs; i++)
                {
                    nodes.MoveNext();
                    string time = nodes.Current.Value;
                    nodes.MoveNext();
                    string state = nodes.Current.Value;
                    backinfo.add(time, state);
                }
                return backinfo;
            }
            finally
            {
                readern.Close();
                writer.Close();
                sw.Close();
            }
        }
Пример #14
0
        /// <summary>
        /// Converts HTML to XHTML.
        /// </summary>
        /// <param name="html">The HTML markup.</param>
        /// <returns>The XHTML markup, indented and with tag names folded to lower case.</returns>
        public static string ToXhtml(string html)
        {
            SgmlReader sgml = new SgmlReader();
            sgml.CaseFolding        = CaseFolding.ToLower;
            sgml.DocType            = "HTML";
            sgml.InputStream        = new StringReader(html);
            sgml.WhitespaceHandling = WhitespaceHandling.None;

            StringWriter  buffer = new StringWriter(CultureInfo.InvariantCulture);
            XmlTextWriter xml    = new XmlTextWriter(buffer);
            xml.Formatting = Formatting.Indented;

            // WriteNode advances the reader itself, so the loop only tests for EOF.
            for (; !sgml.EOF;)
            {
                xml.WriteNode(sgml, true);
            }

            sgml.Close();
            buffer.Close();
            xml.Close();

            // StringWriter.ToString() is still usable after Close().
            return buffer.ToString();
        }
Пример #15
0
        /// <summary>
        /// Parses <c>pageText</c> into <c>document</c> using an SgmlReader configured with the
        /// DTD from <c>ParseDtd</c>, then calls <c>ParseInitialFormValues</c>. The reader is
        /// always closed.
        /// </summary>
        /// <remarks>
        /// A <c>WebException</c> raised while loading is rethrown as <c>DoctypeDtdException</c>.
        /// An <c>XmlException</c> causes the page text to be written to standard output and a
        /// <c>ParseException</c> to be thrown.
        /// </remarks>
        private void ParsePageText()
        {
            SgmlReader sgml = new SgmlReader();

            try
            {
                string fixedHtml = FixHtmlToAvoidParseErrors(pageText);
                sgml.InputStream = new StringReader(fixedHtml);
                sgml.Dtd         = ParseDtd(sgml.NameTable);

                // EP 02/01/07 - The error log is intentionally not attached to
                // Console.Error: there are too many errors to output.
                sgml.DocType = "HTML";

                document = new XhtmlDocument(sgml.NameTable);
                try
                {
                    document.Load(sgml);
                }
                catch (WebException webError)
                {
                    throw new DoctypeDtdException(webError);
                }

                ParseInitialFormValues();
            }
            catch (XmlException parseError)
            {
                Console.WriteLine("vvvvvv The following HTML could not be parsed by NUnitAsp vvvvvv");
                Console.WriteLine(pageText);
                Console.WriteLine("^^^^^^ The preceding HTML could not be parsed by NUnitAsp ^^^^^^");

                string message = "Could not parse HTML.  See standard out for the HTML and use a validator (such as the one at validator.w3.org) to troubleshoot.  Parser error was: " + parseError.Message;
                throw new ParseException(message);
            }
            finally
            {
                sgml.Close();
            }
        }
Пример #16
0
        /// <summary>
        /// Run the SgmlReader command line tool with the given command line arguments.
        /// </summary>
        /// <param name="args">
        /// Options prefixed with '-' or '/', followed by an optional input URI and an optional
        /// output file. An unknown option, or an option whose value is missing, prints the
        /// usage text and returns without processing anything.
        /// </param>
        public void Run(string[] args)
        {
            SgmlReader reader   = new SgmlReader();
            string     inputUri = null;

            for (int i = 0; i < args.Length; i++)
            {
                string arg = args[i];
                if (string.IsNullOrEmpty(arg))
                {
                    // An empty argument carries no information and arg[0] would throw.
                    continue;
                }

                if (arg[0] == '-' || arg[0] == '/')
                {
                    string option = arg.Substring(1);

                    // These options consume the following argument; make sure it exists
                    // instead of failing with an IndexOutOfRangeException.
                    bool takesValue = option == "e" || option == "dtd" || option == "proxy" || option == "encoding";
                    if (takesValue && i + 1 >= args.Length)
                    {
                        Console.WriteLine("Missing value for option '{0}'", arg);
                        PrintUsage();
                        return;
                    }

                    switch (option)
                    {
                    case "e":
                        string errorlog = args[++i];
                        if ("$stderr".Equals(errorlog, StringComparison.OrdinalIgnoreCase))
                        {
                            reader.ErrorLog = Console.Error;
                        }
                        else
                        {
                            reader.ErrorLog = new StreamWriter(errorlog);
                        }
                        break;

                    case "html":
                        reader.DocType = "HTML";
                        break;

                    case "dtd":
                        reader.SystemLiteral = args[++i];
                        break;

                    case "proxy":
                        proxy           = args[++i];
                        reader.WebProxy = new WebProxy(proxy);
                        break;

                    case "encoding":
                        encoding = Encoding.GetEncoding(args[++i]);
                        break;

                    case "nobom":
                        noUtf8Bom = true;
                        break;

                    case "f":
                        formatted = true;
                        reader.WhitespaceHandling = WhitespaceHandling.None;
                        break;

                    case "trimtext":
                        reader.TextWhitespace = TextWhitespaceHandling.TrimBoth;
                        break;

                    case "noxml":
                        noxmldecl = true;
                        break;

                    case "doctype":
                        reader.StripDocType = false;
                        break;

                    case "lower":
                        reader.CaseFolding = CaseFolding.ToLower;
                        break;

                    case "upper":
                        reader.CaseFolding = CaseFolding.ToUpper;
                        break;

                    default:
                        // Includes "-help": anything unrecognised prints the usage text.
                        PrintUsage();
                        return;
                    }
                }
                else
                {
                    // Positional arguments: first the input, then the output.
                    if (inputUri == null)
                    {
                        inputUri = arg;
                        string ext = Path.GetExtension(arg);
                        if (string.Equals(ext, ".htm", StringComparison.OrdinalIgnoreCase) ||
                            string.Equals(ext, ".html", StringComparison.OrdinalIgnoreCase))
                        {
                            reader.DocType = "HTML";
                        }
                    }
                    else if (output == null)
                    {
                        output = arg;
                    }
                }
            }

            // A '?' in a web URL is a query-string separator, not a wildcard, so both
            // http and https inputs are excluded from wildcard expansion.
            bool isWebUri = inputUri != null &&
                (inputUri.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                 inputUri.StartsWith("https://", StringComparison.OrdinalIgnoreCase));

            if (inputUri != null && !isWebUri && inputUri.IndexOfAny(new char[] { '*', '?' }) >= 0)
            {
                // wild card processing of a directory of files.
                string path = Path.GetDirectoryName(inputUri);
                if (string.IsNullOrEmpty(path))
                {
                    path = "." + Path.DirectorySeparatorChar;
                }
                string ext = ".xml";
                if (output != null)
                {
                    // With wildcards the output argument only supplies the extension.
                    ext = Path.GetExtension(output);
                }

                foreach (string uri in Directory.GetFiles(path, Path.GetFileName(inputUri)))
                {
                    Console.WriteLine("Processing: " + uri);
                    string file = Path.GetFileName(uri);
                    output = Path.GetDirectoryName(uri) + Path.DirectorySeparatorChar + Path.GetFileNameWithoutExtension(file) + ext;
                    Process(reader, uri);
                    reader.Close();
                }
                return;
            }

            Process(reader, inputUri);
            reader.Close();
        }

        /// <summary>
        /// Writes the tool's name, version and command-line usage to the console.
        /// </summary>
        private static void PrintUsage()
        {
            // NOTE(review): the text lists "-base" and lists "-f" twice, but Run has no
            // "base" case - confirm whether the option was dropped intentionally.
            string exeName    = Environment.GetCommandLineArgs()[0];
            string exeVersion = typeof(CommandLine).Assembly.GetName().Version?.ToString();
            Console.WriteLine("{0} - version {1}", exeName, exeVersion);
            Console.WriteLine("  https://github.com/lovettchris/SgmlReader");
            Console.WriteLine();
            Console.WriteLine("Usage: {0} <options> [InputUri] [OutputFile]", exeName);
            Console.WriteLine();
            Console.WriteLine("<options>:");
            Console.WriteLine("  -help          Prints this list of command-line options");
            Console.WriteLine("  -e log         Optional log file name, name of '$STDERR' will write errors to stderr");
            Console.WriteLine("  -f             Whether to pretty print the output.");
            Console.WriteLine("  -html          Specify the built in HTML dtd");
            Console.WriteLine("  -dtd url       Specify other SGML dtd to use");
            Console.WriteLine("  -base          Add base tag to output HTML");
            Console.WriteLine("  -noxml         Do not add XML declaration to the output");
            Console.WriteLine("  -proxy svr:80  Proxy server to use for http requests");
            Console.WriteLine("  -encoding name Specify an encoding for the output file (default UTF-8)");
            Console.WriteLine("  -nobom         Prevents output of the BOM when using UTF-8");
            Console.WriteLine("  -f             Produce indented formatted output");
            Console.WriteLine("  -trimtext      SGML `#text` nodes will be trimmed of outer whitespace");
            Console.WriteLine("  -lower         Convert input tags to lower case");
            Console.WriteLine("  -upper         Convert input tags to UPPER CASE");
            Console.WriteLine();
            Console.WriteLine("  InputUri       The input file or http URL (defaults to stdin if not specified)");
            Console.WriteLine("                 Supports wildcards for local file names.");
            Console.WriteLine("  OutputFile     Output file name (defaults to stdout if not specified)");
            Console.WriteLine("                 If input file contains wildcards then this just specifies the output file extension (default .xml)");
        }
Пример #17
0
        /// <summary>
        /// Follows the anchor (<c>a href</c>) links found in <paramref name="doc"/>, loads each
        /// not-yet-visited HTML target through an <c>SgmlReader</c> and recurses into it.
        /// Progress and errors are written to <paramref name="log"/>.
        /// </summary>
        /// <remarks>
        /// Relies on members declared outside this method: <c>depth</c> (current recursion
        /// level), <c>count</c>, <c>visited</c> (URLs already requested), <c>domain</c>
        /// (restrict links to the host of the base URI) and <c>proxy</c>.
        /// Links are only followed while <c>depth</c> is below 5.
        /// </remarks>
        /// <param name="dtd">DTD assigned to every <c>SgmlReader</c> created for a linked page.</param>
        /// <param name="doc">Document whose links are crawled; its <c>BaseURI</c> must be an absolute URI.</param>
        /// <param name="log">Writer receiving progress and error messages.</param>
        /// <returns>
        /// <c>false</c> only when a nested call returned <c>false</c>; otherwise <c>true</c>.
        /// </returns>
        bool Crawl(SgmlDtd dtd, XmlDocument doc, TextWriter log) {
            depth++;
            try {
                // One space of log indentation per recursion level.
                string indent = new string(' ', depth);

                count++;
                Uri baseUri = new Uri(doc.BaseURI);
                XmlElement baseElmt = (XmlElement)doc.SelectSingleNode("/html/head/base");
                if (baseElmt != null) {
                    string baseHref = baseElmt.GetAttribute("href");
                    if (baseHref != "") {
                        try {
                            baseUri = new Uri(baseHref);
                        }
                        catch (Exception) {
                            Console.WriteLine("### Error parsing BASE href '" + baseHref + "'");
                        }
                    }
                }

                // The depth limit does not change while iterating, so test it once.
                if (depth >= 5) {
                    return true;
                }

                foreach (XmlElement a in doc.SelectNodes("//a")) {
                    string href = a.GetAttribute("href");
                    if (href == null || href == "") {
                        continue;
                    }

                    // A single malformed href must not abort the crawl of the whole page.
                    Uri local;
                    try {
                        local = new Uri(baseUri, href);
                    }
                    catch (UriFormatException) {
                        log.WriteLine(indent + "### Skipping malformed href '" + href + "'");
                        log.Flush();
                        continue;
                    }

                    if (domain && baseUri.Host != local.Host)
                        continue;
                    string ext = Path.GetExtension(local.AbsolutePath).ToLower();
                    if (ext == ".jpg" || ext == ".gif" || ext == ".mpg")
                        continue;

                    string url = local.AbsoluteUri;
                    if (visited.ContainsKey(url))
                        continue;
                    visited.Add(url, url);

                    log.WriteLine(indent + "Loading '" + url + "'");
                    log.Flush();

                    StreamReader stm = null;
                    WebResponse resp = null;
                    try {
                        HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
                        wr.Timeout = 10000;
                        if (proxy != null) wr.Proxy = new WebProxy(proxy);
                        wr.PreAuthenticate = false;
                        // Pass the credentials of the process.
                        wr.Credentials = CredentialCache.DefaultCredentials;

                        resp = wr.GetResponse();
                        Uri actual = resp.ResponseUri;
                        if (actual.AbsoluteUri != url) {
                            local = new Uri(actual.AbsoluteUri);
                            log.WriteLine(indent + "Redirected to '" + actual.AbsoluteUri + "'");
                            log.Flush();
                        }

                        // Content-Type commonly carries parameters ("text/html; charset=utf-8"),
                        // so compare only the media type prefix, case-insensitively.
                        string contentType = resp.ContentType;
                        bool isHtml = contentType != null &&
                            contentType.Trim().StartsWith("text/html", StringComparison.OrdinalIgnoreCase);
                        if (!isHtml) {
                            log.WriteLine(indent + "Skipping ContentType=" + contentType);
                            log.Flush();
                            resp.Close();
                        }
                        else {
                            stm = new StreamReader(resp.GetResponseStream());
                        }
                    }
                    catch (Exception e) {
                        log.WriteLine(indent + "### Error opening URL: " + e.Message);
                        log.Flush();
                        // Do not leak the connection when the failure happened after GetResponse().
                        if (resp != null) resp.Close();
                    }

                    if (stm == null)
                        continue;

                    SgmlReader reader = new SgmlReader();
                    reader.Dtd = dtd;
                    reader.SetBaseUri(local.AbsoluteUri);
                    reader.InputStream = stm;
                    reader.WebProxy = proxy;

                    XmlDocument d2 = new XmlDocument();
                    d2.XmlResolver = null; // don't do any downloads!
                    try {
                        d2.Load(reader);
                        // Release the connection before recursing so nested levels do not
                        // hold one open response per level.
                        reader.Close();
                        stm.Close();
                        if (!Crawl(dtd, d2, log))
                            return false;
                    }
                    catch (Exception e) {
                        log.WriteLine(indent + "### Error parsing document '" + local.AbsoluteUri + "', " + e.Message);
                        log.Flush();
                        reader.Close();
                        stm.Close(); // closing an already closed StreamReader is harmless
                    }
                }
                return true;
            }
            finally {
                // Always undo the increment, including on early return or when an exception
                // escapes to a caller that swallows it; otherwise depth drifts upward.
                depth--;
            }
        }
Пример #18
0
        /// <summary>
        /// Parses the command line, configures an <c>SgmlReader</c> accordingly and converts the
        /// input by calling <c>Process</c>: either once for a single input (or stdin when no
        /// input is given), or once per matching file when the input is a local wildcard pattern.
        /// </summary>
        /// <remarks>
        /// Writes to members declared outside this method: <c>proxy</c>, <c>encoding</c>,
        /// <c>formatted</c>, <c>noxmldecl</c> and <c>output</c>.
        /// An unknown option, an option that lacks its value, or an unknown encoding name
        /// prints the usage text and returns without processing anything.
        /// </remarks>
        /// <param name="args">
        /// Command line arguments: options prefixed with '-' or '/', followed by an optional
        /// input URI and an optional output file name.
        /// </param>
        public void Run(string[] args)
        {
            SgmlReader reader    = new SgmlReader();
            string     inputUri  = null;
            bool       showUsage = false;

            for (int i = 0; i < args.Length && !showUsage; i++)
            {
                string arg = args[i];
                if (arg.Length == 0)
                {
                    // An empty argument carries no information and arg[0] would throw.
                    continue;
                }

                if (arg[0] == '-' || arg[0] == '/')
                {
                    string option = arg.Substring(1);

                    // These options consume the following argument; make sure it exists
                    // instead of failing with an IndexOutOfRangeException.
                    bool needsValue = option == "e" || option == "dtd" ||
                                      option == "proxy" || option == "encoding";
                    if (needsValue && i + 1 >= args.Length)
                    {
                        Console.Error.WriteLine("Error: option '" + arg + "' requires a value.");
                        showUsage = true;
                        continue;
                    }

                    switch (option)
                    {
                    case "e":
                        string errorlog = args[++i];
                        if (string.Equals(errorlog, "$stderr", StringComparison.OrdinalIgnoreCase))
                        {
                            reader.ErrorLog = Console.Error;
                        }
                        else
                        {
                            reader.ErrorLogFile = errorlog;
                        }
                        break;

                    case "html":
                        reader.DocType = "HTML";
                        break;

                    case "dtd":
                        reader.SystemLiteral = args[++i];
                        break;

                    case "proxy":
                        proxy           = args[++i];
                        reader.WebProxy = proxy;
                        break;

                    case "encoding":
                        string encodingName = args[++i];
                        try
                        {
                            encoding = Encoding.GetEncoding(encodingName);
                        }
                        catch (ArgumentException)
                        {
                            Console.Error.WriteLine("Error: unknown encoding '" + encodingName + "'.");
                            showUsage = true;
                        }
                        break;

                    case "f":
                        formatted = true;
                        reader.WhitespaceHandling = WhitespaceHandling.None;
                        break;

                    case "noxml":
                        noxmldecl = true;
                        break;

                    case "doctype":
                        reader.StripDocType = false;
                        break;

                    case "lower":
                        reader.CaseFolding = CaseFolding.ToLower;
                        break;

                    case "upper":
                        reader.CaseFolding = CaseFolding.ToUpper;
                        break;

                    default:
                        showUsage = true;
                        break;
                    }
                }
                else if (inputUri == null)
                {
                    inputUri = arg;
                    // Ordinal comparison: culture-sensitive lowering breaks ".HTML" in some locales.
                    string inputExt = Path.GetExtension(arg);
                    if (string.Equals(inputExt, ".htm", StringComparison.OrdinalIgnoreCase) ||
                        string.Equals(inputExt, ".html", StringComparison.OrdinalIgnoreCase))
                    {
                        reader.DocType = "HTML";
                    }
                }
                else if (output == null)
                {
                    output = arg;
                }
            }

            if (showUsage)
            {
                Console.WriteLine("Usage: SgmlReader <options> [InputUri] [OutputFile]");
                Console.WriteLine("-e log         Optional log file name, name of '$STDERR' will write errors to stderr");
                Console.WriteLine("-f             Whether to pretty print the output.");
                Console.WriteLine("-html          Specify the built in HTML dtd");
                Console.WriteLine("-dtd url       Specify other SGML dtd to use");
                Console.WriteLine("-base          Add base tag to output HTML");
                Console.WriteLine("-noxml         Do not add XML declaration to the output");
                Console.WriteLine("-proxy svr:80  Proxy server to use for http requests");
                Console.WriteLine("-encoding name Specify an encoding for the output file (default UTF-8)");
                Console.WriteLine("-lower         Convert input tags to lower case");
                Console.WriteLine("-upper         Convert input tags to upper case");
                Console.WriteLine();
                Console.WriteLine("InputUri       The input file or http URL (default stdin).  ");
                Console.WriteLine("               Supports wildcards for local file names.");
                Console.WriteLine("OutputFile     Output file name (default stdout)");
                Console.WriteLine("               If input file contains wildcards then this just specifies the output file extension (default .xml)");
                return;
            }

            // A '?' in a web URL is a query string, not a wildcard; recognise https as well as http.
            bool isWebUri = inputUri != null &&
                (inputUri.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                 inputUri.StartsWith("https://", StringComparison.OrdinalIgnoreCase));

            if (inputUri != null && !isWebUri && inputUri.IndexOfAny(new char[] { '*', '?' }) >= 0)
            {
                // wild card processing of a directory of files.
                string path = Path.GetDirectoryName(inputUri);
                if (string.IsNullOrEmpty(path))
                {
                    // "." works on every platform, unlike the Windows-only ".\".
                    path = ".";
                }

                // In wildcard mode the output argument only supplies the extension.
                string ext = ".xml";
                if (output != null)
                {
                    ext = Path.GetExtension(output);
                }

                foreach (string uri in Directory.GetFiles(path, Path.GetFileName(inputUri)))
                {
                    Console.WriteLine("Processing: " + uri);
                    // Write each result next to its source file, with the chosen extension.
                    output = Path.Combine(Path.GetDirectoryName(uri), Path.GetFileNameWithoutExtension(uri) + ext);
                    Process(reader, uri);
                    reader.Close();
                }
                return;
            }

            Process(reader, inputUri);
            reader.Close();
        }