SgmlReader is an XmlReader API over any SGML document (including built-in support for HTML).
Inheritance: XmlReader
    /// <summary>
    /// Benchmark entry point: reads a file once, then parses its text as HTML with
    /// SgmlReader into an XmlDocument the requested number of times and prints the
    /// total elapsed time in seconds.
    /// </summary>
    /// <param name="args">args[0] is the input file name; args[1] is the iteration count.</param>
    static void Main(string[] args)
    {
        int n;
        // A missing or non-numeric iteration count prints the usage text instead of crashing.
        if (args.Length < 2 ||
            !int.TryParse(args[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out n)) {
            Console.WriteLine("Usage: BenchSgmlReader.exe filename iterations");
            return;
        }

        string text;
        using (var streamReader = new StreamReader(args[0])) {
            text = streamReader.ReadToEnd();
        }

        // Stopwatch is monotonic and higher resolution than DateTime.Now.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        for (int i = 0; i < n; i++) {
            SgmlReader sgmlReader = new SgmlReader();
            sgmlReader.DocType = "HTML";
            sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
            sgmlReader.InputStream = new StringReader(text);

            XmlDocument doc = new XmlDocument();
            doc.PreserveWhitespace = true;
            doc.XmlResolver = null;
            doc.Load(sgmlReader);
        }
        stopwatch.Stop();

        Console.WriteLine("{0} s", (stopwatch.Elapsed.TotalMilliseconds / 1000.0).ToString(CultureInfo.InvariantCulture));
    }
Exemple #2
1
 /// <summary>
 /// Creates the parser with an SgmlReader configured for HTML input,
 /// all whitespace reported, and names folded to lower case.
 /// </summary>
 public Parser()
 {
     _sgmlReader = new SgmlReader
     {
         DocType = "HTML",
         WhitespaceHandling = WhitespaceHandling.All,
         CaseFolding = CaseFolding.ToLower
     };
 }
Exemple #3
1
        /// <summary>
        /// Loads the documentation index page through SgmlReader, wraps the content of its
        /// "layout" div in a JSON document, follows the links found in that div via
        /// AddDocumentsFromLinks, and writes the resulting JSON array to the file named by args[0].
        /// </summary>
        /// <param name="args">args[0] is the output file path.</param>
        static void Main(string[] args)
        {
            if (args.Length == 0)
            {
                System.Console.Error.WriteLine("Usage: <program> <output-file>");
                return;
            }

            var array = new JArray();
            var crawled = new HashSet<string>();
            var doc = new XmlDocument();

            using (var sgmlReader = new SgmlReader
            {
                Href = "http://groups.google.com/group/ravendb/web/docs-http-api-index"
            })
            {
                crawled.Add(sgmlReader.Href);
                doc.Load(sgmlReader);
            }

            var layout = doc.SelectSingleNode("//div[@class='layout']");
            if (layout == null)
            {
                // Fail with a descriptive error instead of a NullReferenceException below.
                throw new System.InvalidOperationException("The index page does not contain a <div class='layout'> element.");
            }

            var index = new JObject(new JProperty("Html", FixLinks(layout.InnerXml)), new JProperty("Name", "Index"));

            array.Add(new JObject(
                          new JProperty("DocId", "raven_documentation/index"),
                          new JProperty("Document", index),
                          new JProperty("Type", "raven documentation"),
                          new JProperty("Metadata",
                                        new JObject(new JProperty("Raven-View-Template", "/raven/JSONTemplates/documentation.html")))
                          ));

            AddDocumentsFromLinks(array, crawled, layout.SelectNodes(".//a"));

            File.WriteAllText(args[0], array.ToString(Formatting.Indented));
        }
        /// <summary>
        /// Logs in to Hatena with the credentials from the application settings, downloads the
        /// counter page for the given counter id and date, and returns the number found in the
        /// first &lt;strong&gt; element under the table whose first attribute value is "totalcount".
        /// </summary>
        /// <param name="cid">Counter id substituted into CounterUrlBase.</param>
        /// <param name="date">Date substituted into CounterUrlBase as yyyy-MM-dd.</param>
        /// <returns>The parsed count.</returns>
        /// <exception cref="InvalidOperationException">No matching table/strong element exists (thrown by First()).</exception>
        public static int GetPv(int cid, DateTime date)
        {
            var invariant = System.Globalization.CultureInfo.InvariantCulture;
            var hatenaId = ConfigurationManager.AppSettings["hatenaId"];
            var hatenaPassword = ConfigurationManager.AppSettings["hatenaPassword"];

            var wc = new CustomWebClient() { Encoding = Encoding.UTF8 };
            wc.Headers.Add("Content-Type", "application/x-www-form-urlencoded");

            // NOTE(review): the credentials are inserted without URL-encoding; values containing
            // '&' or '=' would presumably corrupt the form body — confirm against LoginParamBase.
            var data = string.Format(LoginParamBase, hatenaId, hatenaPassword);
            wc.UploadString("https://www.hatena.ne.jp/login", "POST", data);

            // Format the date with the invariant culture so the URL does not depend on the
            // current culture's calendar or digits.
            var url = string.Format(CounterUrlBase, hatenaId, cid, date.ToString("yyyy-MM-dd", invariant));
            var res = wc.DownloadString(url);

            XDocument xml;
            using (var sgml = new SgmlReader() { IgnoreDtd = true })
            {
                sgml.InputStream = new StringReader(res);
                xml = XDocument.Load(sgml);
            }
            var ns = xml.Root.Name.Namespace;
            var count = xml.Descendants(ns + "table")
                // Tables without any attribute have a null FirstAttribute; skip them.
                .Where(x => x.FirstAttribute != null && x.FirstAttribute.Value == "totalcount")
                .Descendants(ns + "strong")
                .First().Value;
            return int.Parse(count, invariant);
        }
Exemple #5
1
        /// <summary>
        /// Creates an XmlDocument from HTML content. The content is wrapped in a synthetic
        /// &lt;root&gt; element, converted to XML text through SgmlReader (names folded to
        /// lower case), and then loaded into the document with whitespace preserved.
        /// </summary>
        /// <param name="sContent">HTML markup to convert.</param>
        /// <returns>The loaded document.</returns>
        public static XmlDocument ParseHtml(string sContent)
        {
            string xml;

            // using blocks release the readers/writers even when the conversion throws.
            using (StringReader sr = new StringReader("<root>" + sContent + "</root>"))
            using (SgmlReader reader = new SgmlReader())
            using (StringWriter sw = new StringWriter())
            {
                reader.WhitespaceHandling = WhitespaceHandling.All;
                reader.CaseFolding = Sgml.CaseFolding.ToLower;
                reader.InputStream = sr;

                using (XmlTextWriter w = new XmlTextWriter(sw))
                {
                    w.Formatting = Formatting.Indented;
                    w.WriteStartDocument();

                    // Position on the first node, then copy node by node; WriteNode advances
                    // the reader past each node it copies.
                    reader.Read();
                    while (!reader.EOF)
                    {
                        w.WriteNode(reader, true);
                    }
                    w.Flush();
                }

                xml = sw.ToString();
            }

            // create document
            XmlDocument doc = new XmlDocument();
            doc.PreserveWhitespace = true;
            doc.XmlResolver = null;
            doc.LoadXml(xml);

            return doc;
        }
        /// <summary>
        /// Builds an SgmlReader over the given HTML text, using the "Html.dtd" resource
        /// embedded in the SgmlReader assembly as its DTD.
        /// </summary>
        /// <param name="baseUri">Base URI passed to the DTD parser and set on the reader.</param>
        /// <param name="html">HTML markup to read.</param>
        /// <returns>The configured reader.</returns>
        public static XmlReader Create(string baseUri, string html)
        {
            const string resourceName = "Html.dtd";
            SgmlDtd htmlDtd;

            using (var dtdStream = typeof(SgmlReader).Assembly.GetManifestResourceStream(resourceName))
            {
                var dtdText = new StreamReader(dtdStream);
                htmlDtd = SgmlDtd.Parse(new Uri(baseUri), "HTML", dtdText, null, null, null);
            }

            var sgml = new SgmlReader();
            sgml.WhitespaceHandling = WhitespaceHandling.All;
            sgml.CaseFolding = CaseFolding.ToLower;
            sgml.Dtd = htmlDtd;
            sgml.IgnoreDtd = true;
            sgml.InputStream = new StringReader(html);
            sgml.SetBaseUri(baseUri);

            return sgml;
        }
Exemple #7
1
 /// <summary>
 /// Converts HTML to indented XML text via SgmlReader. When <paramref name="xpathNavPath"/>
 /// is given, returns instead the values of the nodes it selects, one per line.
 /// In both cases "\r" is replaced by "\n" and doubled newlines are collapsed once.
 /// </summary>
 /// <param name="html">HTML markup to convert.</param>
 /// <param name="xpathNavPath">Optional XPath expression; null returns the whole document.</param>
 /// <returns>The converted text, or the exception message if anything fails.</returns>
 public static string GetWellFormedHTML(string html, string xpathNavPath)
 {
     try
     {
         // using blocks replace the manual Close() calls, which previously ran only in the
         // catch handler and dereferenced objects that could still be null.
         using (SgmlReader reader = new SgmlReader())
         using (StringWriter sw = new StringWriter())
         using (XmlTextWriter writer = new XmlTextWriter(sw))
         {
             reader.DocType = "HTML";
             reader.InputStream = new StringReader(html);
             writer.Formatting = Formatting.Indented;

             // NOTE(review): WriteNode already advances the reader past the copied node, so the
             // following Read() presumably skips one node; loop kept as-is to preserve output.
             while (reader.Read())
             {
                 if (reader.NodeType != XmlNodeType.Whitespace)
                 {
                     writer.WriteNode(reader, true);
                 }
             }
             writer.Flush();

             string result;
             if (xpathNavPath == null)
             {
                 result = sw.ToString();
             }
             else
             {
                 // Filter out nodes from HTML
                 StringBuilder sb = new StringBuilder();
                 XPathDocument doc = new XPathDocument(new StringReader(sw.ToString()));
                 XPathNavigator nav = doc.CreateNavigator();
                 XPathNodeIterator nodes = nav.Select(xpathNavPath);
                 while (nodes.MoveNext())
                 {
                     sb.Append(nodes.Current.Value + "\n");
                 }
                 result = sb.ToString();
             }

             result = result.Replace("\r", "\n");
             result = result.Replace("\n\n", "\n");
             return result;
         }
     }
     catch (Exception exp)
     {
         // Callers receive the error text as the return value.
         return exp.Message;
     }
 }
        /// <summary>
        /// Constructor. Fetches the page at <paramref name="url"/> and parses it as HTML
        /// into the Html document.
        /// </summary>
        /// <param name="url">URL to fetch.</param>
        /// <param name="follow">Whether to consult robots.txt before fetching.</param>
        /// <param name="agent">User agent to send; null leaves the request's user agent unset.</param>
        /// <param name="encoding">Encoding used to decode the response; null uses the response's
        /// character set, falling back to UTF-8 when it is missing or unknown.</param>
        /// <exception cref="RobotsDisallowException">robots.txt disallows the URL.</exception>
        public HtmlReader(string url, bool follow = true, UserAgent agent = null, Encoding encoding = null)
        {
            // HTTP request
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            // User agent
            if (agent != null)
                req.UserAgent = agent.ToString();
            // robots.txt
            Robots robots = (follow) ? Robots.Create(new Uri(url)) : null;
            if (robots != null) {
                if (!robots.Parse(url))
                    throw new RobotsDisallowException("Robots Disallow [" + url + "]");
                if (robots.CrawlDelay != 0)
                    System.Threading.Thread.Sleep(robots.CrawlDelay * 1000);
            }

            using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
            using (Stream stream = res.GetResponseStream()) {
                Encoding enc = encoding;
                if (enc == null) {
                    // Servers may omit the charset or send a name .NET does not know;
                    // GetEncoding throws in both cases, so fall back to UTF-8.
                    try {
                        enc = string.IsNullOrEmpty(res.CharacterSet)
                            ? Encoding.UTF8
                            : Encoding.GetEncoding(res.CharacterSet);
                    }
                    catch (ArgumentException) {
                        enc = Encoding.UTF8;
                    }
                }

                using (StreamReader reader = new StreamReader(stream, enc))
                using (SgmlReader sgml = new SgmlReader {
                    DocType = "HTML",
                    InputStream = reader,
                    CaseFolding = CaseFolding.ToLower,
                    IgnoreDtd = true
                }) {
                    Html = XDocument.Load(sgml, LoadOptions.None);
                    Uri = url;
                    Encoding = enc;
                }
            }
        }
Exemple #9
0
        /// <summary>
        /// Reads a JavaDoc HTML file, detects which generator produced it from a marker
        /// in the first 500 characters, and returns the root element of the parsed document.
        /// </summary>
        /// <param name="path">Path handed to ReadAndSanitizeHtmlFile.</param>
        /// <param name="kind">Detected kind; stays DroidDoc when no marker matches.</param>
        /// <returns>Root element of the parsed HTML.</returns>
        XElement GetJavaDocFile(string path, out JavaDocKind kind)
        {
            kind = JavaDocKind.DroidDoc;
            string rawHTML = ReadAndSanitizeHtmlFile(path);

            // Clamp to the actual length: Substring(0, 500) throws on files shorter than 500 chars.
            string header = rawHTML.Substring(0, Math.Min(rawHTML.Length, 500));

            // The checks run in sequence, so a later match overrides an earlier one.
            if (header.IndexOf("Generated by javadoc (build 1.6", StringComparison.Ordinal) > 0)
            {
                kind = JavaDocKind.Java6;
            }
            if (header.IndexOf("Generated by javadoc (version 1.7", StringComparison.Ordinal) > 0)
            {
                kind = JavaDocKind.Java7;
            }
            if (header.IndexOf("Generated by javadoc (1.8", StringComparison.Ordinal) > 0)
            {
                kind = JavaDocKind.Java8;
            }

            var html = new Sgml.SgmlReader()
            {
                InputStream = new StringReader(rawHTML),
                CaseFolding = Sgml.CaseFolding.ToLower,
                Dtd         = HtmlDtd
            };
            var doc = XDocument.Load(html, LoadOptions.SetLineInfo | LoadOptions.SetBaseUri);

            return(doc.Root);
        }
Exemple #10
0
 /// <summary>
 /// Parses HTML from a text reader into an XDocument, folding names to lower case.
 /// </summary>
 /// <param name="reader">Source of the HTML markup.</param>
 /// <returns>The loaded document.</returns>
 static XDocument ParseHtml(TextReader reader)
 {
     using (var html = new SgmlReader
     {
         DocType = "HTML",
         CaseFolding = CaseFolding.ToLower,
         InputStream = reader
     })
     {
         XDocument document = XDocument.Load(html);
         return document;
     }
 }
Exemple #11
0
 /// <summary>
 /// Feeds an HTML string to SgmlReader and returns what its ReadOuterXml() produces.
 /// </summary>
 /// <param name="html">HTML markup to convert.</param>
 /// <returns>The string returned by ReadOuterXml().</returns>
 public static String GetXmlFromHtmlString (String html)
 {
     using (SgmlReader htmlReader = new SgmlReader { InputStream = new StringReader(html) })
     {
         String outerXml = htmlReader.ReadOuterXml();
         return outerXml;
     }
 }
 /// <summary>
 /// Fetches the text at the given URL via FetchWebText and loads it into an
 /// XmlDocument through SgmlReader.
 /// </summary>
 /// <param name="url">Address passed to FetchWebText.</param>
 /// <returns>The loaded document.</returns>
 XmlDocument FetchXmlDocument(Uri url)
 {
     var doc = new XmlDocument ();

     // using blocks close both readers even if Load throws.
     using (var sr = FetchWebText (url))
     using (var xr = new SgmlReader () { InputStream = sr }) {
         doc.Load (xr);
     }

     return doc;
 }
		/// <summary>
		/// Parses HTML from a text reader into an XDocument, folding names to lower case.
		/// </summary>
		/// <param name="_Reader">Source of the HTML markup.</param>
		/// <returns>The loaded document.</returns>
		private static XDocument ParseHtml( TextReader _Reader )
		{
			var html = new SgmlReader();
			html.DocType = "HTML";
			html.CaseFolding = CaseFolding.ToLower;
			html.InputStream = _Reader;

			using ( html )
			{
				XDocument document = XDocument.Load( html );
				return document;
			}
		}
 /// <summary>
 /// Creates the SgmlReader used by the tests: lower-case folding, HTML doc type,
 /// no whitespace nodes.
 /// </summary>
 public void SetUp()
 {
     var reader = new SgmlReader();
     reader.CaseFolding = CaseFolding.ToLower;
     reader.DocType = "HTML";
     reader.WhitespaceHandling = WhitespaceHandling.None;

     _sgmlReader = reader;
 }
Exemple #15
0
 /// <summary>
 /// Downloads the resource at <paramref name="url"/> and parses it as HTML into an XDocument,
 /// folding names to lower case.
 /// </summary>
 /// <param name="url">Address to request.</param>
 /// <returns>The loaded document.</returns>
 public static XDocument FetchHtmlFromUrlAsXDocument(string url)
 {
     var webRequest = WebRequest.Create(url);

     // Dispose the response as well as the readers so the connection is released.
     using (var response = webRequest.GetResponse())
     using (var reader = new StreamReader(response.GetResponseStream()))
     using (var sgml = new SgmlReader())
     {
         sgml.DocType = "HTML";
         sgml.CaseFolding = CaseFolding.ToLower;
         sgml.InputStream = reader;

         // The document is fully loaded here; no extra copy is needed before the reader closes.
         return XDocument.Load(sgml);
     }
 }
        /// <summary>
        /// Scans HTML for img elements and collects their src, width and height attributes.
        /// Images without a non-empty src are left out.
        /// </summary>
        /// <param name="htmlCode">HTML markup to scan.</param>
        /// <returns>One ImageInfo per img element that has a source.</returns>
        internal static ImageInfo[] FindImgs(
            string htmlCode)
        {
            var sgml = new SgmlReader();
            sgml.DocType = @"HTML";
            sgml.InputStream = new StringReader(htmlCode);

            var images = new List<ImageInfo>();

            while (sgml.Read())
            {
                // Only <img> elements that carry attributes are of interest.
                bool isImgElement =
                    sgml.NodeType == XmlNodeType.Element &&
                    string.Equals(sgml.Name, @"img", StringComparison.OrdinalIgnoreCase);
                if (!isImgElement || !sgml.HasAttributes)
                {
                    continue;
                }

                var info = new ImageInfo();

                while (sgml.MoveToNextAttribute())
                {
                    string attributeName = sgml.Name.ToLowerInvariant();
                    if (attributeName == @"src")
                    {
                        info.Source = sgml.Value;
                    }
                    else if (attributeName == @"width")
                    {
                        info.Width = ConvertHelper.ToInt32(sgml.Value);
                    }
                    else if (attributeName == @"height")
                    {
                        info.Height = ConvertHelper.ToInt32(sgml.Value);
                    }
                }

                if (!string.IsNullOrEmpty(info.Source))
                {
                    images.Add(info);
                }
            }

            return images.ToArray();
        }
Exemple #17
0
        /// <summary>
        /// Loads "fixture.txt" through SgmlReader into an XDocument and asserts that the
        /// document can be rendered to a string.
        /// </summary>
        public void CanParseResponseAsXML()
        {
            using(var inputReader = new StreamReader(new FileStream("fixture.txt", FileMode.Open)))
            {
                var sgml = new SgmlReader
                {
                    InputStream = inputReader,
                    CaseFolding = CaseFolding.ToLower,
                    DocType = "HTML"
                };

                XmlReader xmlReader = sgml;
                XDocument loaded = XDocument.Load(xmlReader);
                var document = new XDocument(loaded);

                Assert.IsNotNull(document.ToString());
            }
        }
        /// <summary>
        /// Requests <paramref name="uri"/> with the given HTTP method (sending
        /// <paramref name="postData"/> as a form body for POST), decodes the response as
        /// code page 1252, and loads it into an XmlDocument through SgmlReader.
        /// </summary>
        /// <param name="postData">Form body for POST requests; null is sent as an empty body.</param>
        /// <param name="uri">Address to request.</param>
        /// <param name="httpMethod">HTTP method; "POST" (any case) triggers the body upload.</param>
        /// <returns>The loaded document.</returns>
        /// <exception cref="NotSupportedException">The proxy option is UseIESettings.</exception>
        private XmlDocument LoadHtmlPageAsXMLInternal(string postData, string uri, string httpMethod)
        {
            // Prepare web request...
            HttpWebRequest webrequest = (HttpWebRequest)WebRequest.Create(uri);

            // Deal with proxy details if any.
            if (_proxySettings.Option == ProxySettingsDTO.ProxyOption.UseIESettings)
            {
                throw new NotSupportedException("IE proxy settings are not supported by this module!");
            }
            if (_proxySettings.Option == ProxySettingsDTO.ProxyOption.Custom)
            {
                webrequest.Proxy = new WebProxy(_proxySettings.ProxyHost, _proxySettings.ProxyPort);
            }
            webrequest.Method = httpMethod;

            if(String.Equals(httpMethod, "POST", StringComparison.OrdinalIgnoreCase))
            {
                ASCIIEncoding encoding = new ASCIIEncoding();
                // Treat a missing body as empty rather than failing in GetBytes.
                byte[] data = encoding.GetBytes(postData ?? String.Empty);

                webrequest.ContentType = "application/x-www-form-urlencoded";
                webrequest.ContentLength = data.Length;

                using (Stream newStream = webrequest.GetRequestStream())
                {
                    newStream.Write(data, 0, data.Length);
                }
            }

            // using blocks release the response and its stream even if reading fails.
            string responseText;
            using (HttpWebResponse webresponse = (HttpWebResponse)webrequest.GetResponse())
            using (StreamReader responseReader = new StreamReader(webresponse.GetResponseStream(), System.Text.Encoding.GetEncoding(1252)))
            {
                responseText = responseReader.ReadToEnd();
            }

            // Use the sgml reader to interpret the HTML as XML.
            XmlDocument doc = new XmlDocument();
            using (StringReader stringReader = new StringReader(responseText))
            using (SgmlReader sgmlReader = new SgmlReader())
            {
                sgmlReader.DocType = "HTML";
                sgmlReader.InputStream = stringReader;
                doc.Load(sgmlReader);
            }

            return doc;
        }
Exemple #19
0
 /// <summary>
 /// Gets (refreshes) the session ID: parses the stream as UTF-8 HTML and stores in
 /// session_id the value attribute of the first input element whose id is
 /// "com.sun.faces.VIEW". session_id becomes null when no such value is found.
 /// </summary>
 /// <param name="stream">Stream containing the HTML page.</param>
 void getSessionId(Stream stream)
 {
     var enc = System.Text.Encoding.UTF8;
     using (var reader = new StreamReader(stream, enc))
     using (var sgmlReader = new SgmlReader { InputStream = reader })
     {
         sgmlReader.DocType = "HTML";
         sgmlReader.CaseFolding = CaseFolding.ToLower;
         var doc = XDocument.Load(sgmlReader);
         var ns = doc.Root.Name.Namespace;

         // The explicit string conversion of XAttribute yields null for a missing attribute,
         // so an input without id or value no longer causes a NullReferenceException.
         var q = doc.Descendants(ns + "input")
             .Where(el => (string)el.Attribute("id") == "com.sun.faces.VIEW")
             .Select(el => (string)el.Attribute("value"))
             .FirstOrDefault();
         session_id = q;
     }
 }
Exemple #20
0
            /// <summary>
            /// Converts an HTML string to an XmlDocument through SgmlReader, folding names
            /// to lower case. The document is loaded with PreserveWhitespace disabled.
            /// </summary>
            /// <param name="html">HTML markup to convert.</param>
            /// <returns>The loaded document.</returns>
            public static XmlDocument ConvertHtmlToXml(string html)
            {
                var htmlReader = new Sgml.SgmlReader
                {
                    DocType = "HTML",
                    WhitespaceHandling = System.Xml.WhitespaceHandling.All,
                    CaseFolding = Sgml.CaseFolding.ToLower,
                    InputStream = new System.IO.StringReader(html)
                };

                var result = new XmlDocument
                {
                    PreserveWhitespace = false,
                    XmlResolver = null
                };
                result.Load(htmlReader);

                return result;
            }
Exemple #21
0
        /// <summary>
        /// Converts an HTML string to an XmlDocument through SgmlReader, folding names
        /// to lower case and preserving whitespace.
        /// </summary>
        /// <param name="html">HTML markup to convert.</param>
        /// <returns>The loaded document.</returns>
        public static XmlDocument XmlFromHtml(string html)
        {
            // Reader configuration: HTML doc type, every whitespace node reported.
            SgmlReader source = new Sgml.SgmlReader
            {
                DocType = "HTML",
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding = Sgml.CaseFolding.ToLower,
                InputStream = new StringReader(html)
            };

            // Target document: keeps whitespace, resolves no external resources.
            XmlDocument target = new XmlDocument();
            target.PreserveWhitespace = true;
            target.XmlResolver = null;
            target.Load(source);

            return target;
        }
        /// <summary>
        /// Constructs a DOM (System.Xml.Linq.XDocument) from HTML markup.
        /// </summary>
        /// <param name="htmlContent">HTML markup from which the DOM is to be constructed.</param>
        /// <returns>System.Xml.Linq.XDocument instance which is a DOM of the provided HTML markup;
        /// an empty document when the markup is empty or whitespace.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlContent"/> is null.</exception>
        public XDocument BuildDocument(string htmlContent)
        {
            if (htmlContent == null)
            {
                throw new ArgumentNullException("htmlContent");
            }

            if (htmlContent.Trim().Length == 0)
            {
                return new XDocument();
            }

            // "trim end" htmlContent to ...</html>$ (codinghorror.com puts some scripts after the </html> - sic!)
            // Ordinal, case-insensitive search: tag names are not linguistic text and may be
            // written in upper case (</HTML>).
            const string htmlEnd = "</html";
            int indexOfHtmlEnd = htmlContent.LastIndexOf(htmlEnd, StringComparison.OrdinalIgnoreCase);

            if (indexOfHtmlEnd != -1)
            {
                int indexOfHtmlEndBracket = htmlContent.IndexOf('>', indexOfHtmlEnd);

                if (indexOfHtmlEndBracket != -1)
                {
                    htmlContent = htmlContent.Substring(0, indexOfHtmlEndBracket + 1);
                }
            }

            // load the document using sgml reader
            using (var sgmlReader = new SgmlReader())
            {
                sgmlReader.CaseFolding = CaseFolding.ToLower;
                sgmlReader.DocType = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.None;

                // The UTF-8 round trip through StreamReader is kept from the original.
                using (var sr = new StreamReader(new MemoryStream(Encoding.UTF8.GetBytes(htmlContent))))
                {
                    sgmlReader.InputStream = sr;

                    var document = XDocument.Load(sgmlReader);

                    return document;
                }
            }
        }
 /// <summary>
 /// Parses markup into an XElement. Input that mentions the XHTML namespace within its
 /// first 200 characters is parsed directly as XML; anything else is parsed as HTML
 /// through SgmlReader with names folded to lower case.
 /// </summary>
 /// <param name="html">XHTML or HTML markup.</param>
 /// <returns>The root element of the parsed markup.</returns>
 public static XElement ReadHtmlAsXhtml(string html)
 {
   //detect if xhtml by looking for namespace near start
   int namespaceIndex = html.IndexOf("http://www.w3.org/1999/xhtml", System.StringComparison.Ordinal);

   // IndexOf returns -1 when the namespace is absent; without the >= 0 check every
   // plain-HTML input satisfied "< 200" and was wrongly sent to the XML parser.
   if (namespaceIndex >= 0 && namespaceIndex < 200)
   {
     //must be xhtml, so just parse as xml
     return XElement.Parse(html);
   }
   else
   {
     //probably html, so parse as sgml
     SgmlReader sgml = new SgmlReader();
     sgml.DocType = "HTML";
     sgml.WhitespaceHandling = WhitespaceHandling.All;
     sgml.CaseFolding = Sgml.CaseFolding.ToLower;
     sgml.InputStream = new StringReader(html);
     return XElement.Load(sgml);
   }
 }
Exemple #24
0
 /// <summary>
 /// Returns the DTD registered under the given version and DTD name, loading it through
 /// SgmlReader on first use when checkAvailableVersion accepts DTD+version.
 /// </summary>
 /// <param name="version">Outer cache key.</param>
 /// <param name="DTD">Inner cache key; DTD+version is the key into availableVersion.</param>
 /// <returns>The cached or newly loaded DTD.</returns>
 /// <exception cref="KeyNotFoundException">The DTD is neither cached nor loadable (raised by the final lookup).</exception>
 public SgmlDtd getDTD(String version, String DTD)
 {
     if (log.IsDebugEnabled) log.Debug("getDTD(version: " + version + ", DTD: " + DTD + ")");

     // Check the inner dictionary as well: previously a second DTD requested for an
     // already-cached version was never loaded and the final lookup threw.
     bool versionCached = this.version.ContainsKey(version);
     bool dtdCached = versionCached && this.version[version].ContainsKey(DTD);

     if (this.checkAvailableVersion(DTD+version) && !dtdCached) {
         SgmlReader reader = new SgmlReader();
         reader.CaseFolding = Sgml.CaseFolding.ToLower;
         String sgmlArticle = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, this.availableVersion[DTD+version]);
         if (log.IsDebugEnabled) log.Debug("sgmlArticle: " + sgmlArticle);
         reader.SystemLiteral = sgmlArticle;

         if (versionCached) {
             // Add to the existing per-version dictionary.
             this.version[version].Add(DTD, reader.Dtd);
             if (log.IsDebugEnabled) log.Debug("dtd.Add(DTD: " + DTD + ", reader.Dtd: " + reader.Dtd.ToString() + ")");
         }
         else {
             Dictionary<String, SgmlDtd> dtd = new Dictionary<String, SgmlDtd>();
             dtd.Add(DTD, reader.Dtd);
             if (log.IsDebugEnabled) log.Debug("dtd.Add(DTD: " + DTD + ", reader.Dtd: " + reader.Dtd.ToString() + ")");
             this.version.Add(version, dtd);
             if (log.IsDebugEnabled) log.Debug("this.version.Add(version: " + version + ", dtd: " + dtd.ToString() + ")");
         }
     }
     if (log.IsDebugEnabled) log.Debug("return this.version[version: " + version + "][DTD: " + DTD + "]");
     return this.version[version][DTD];
 }
Exemple #25
0
        /// <summary>
        /// Loads the document at <paramref name="uri"/> through SgmlReader, selects a single
        /// node with the XPath <paramref name="path"/>, and builds an XdmNode from it.
        /// Exceptions propagate to the caller unchanged.
        /// </summary>
        /// <param name="uri">Location assigned to the reader's Href.</param>
        /// <param name="path">XPath expression selecting the node to convert.</param>
        /// <returns>The node built by the document builder.</returns>
        private static XdmNode getXdmNode(String uri, String path) {
            XmlDocument htmlDoc = new XmlDocument();

            // The former try/catch blocks only rethrew; a using block adds the missing cleanup.
            using (SgmlReader sr = new SgmlReader()) {
                sr.Href = uri;
                htmlDoc.Load(sr);
            }

            // NOTE(review): SelectSingleNode returns null when nothing matches; how Build
            // handles a null node is not visible here — confirm.
            XmlNode html = htmlDoc.SelectSingleNode(path);
            Processor processor = new Processor();
            return processor.NewDocumentBuilder().Build(html);
        }
        /// <summary>
        /// Parses HTML content into an XmlDocument through SgmlReader and writes the
        /// elapsed time to the trace output under the "WebpageCleaner" category.
        /// </summary>
        /// <param name="content">HTML markup to parse.</param>
        /// <returns>The loaded document.</returns>
        public XmlDocument GetDocFromContent(string content)
        {
            // Stopwatch is monotonic and higher resolution than subtracting DateTime.Now values.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

            XmlDocument doc = new XmlDocument();
            doc.XmlResolver = null;

            using (var contentReader = new StringReader(content))
            using (var sgmlReader = new SgmlReader
            {
                DocType = "HTML",
                InputStream = contentReader,
            })
            {
                doc.Load(sgmlReader);
            }

            stopwatch.Stop();
            Trace.WriteLine(string.Format("Cleansed html in {0} milliseconds", stopwatch.Elapsed.TotalMilliseconds), "WebpageCleaner");

            return doc;
        }
 /// <summary>
 /// Best-effort HTML parse: returns the parsed document, or an empty XDocument
 /// when anything goes wrong.
 /// </summary>
 /// <param name="htmlContent">HTML markup to parse.</param>
 /// <returns>The loaded document, or an empty one on failure.</returns>
 private static XDocument LoadDocument(string htmlContent)
 {
     XDocument result;

     try
     {
         using (var html = new SgmlReader
         {
             CaseFolding = CaseFolding.ToLower,
             DocType = "HTML",
             WhitespaceHandling = WhitespaceHandling.None
         })
         {
             // The markup is fed to the reader as a UTF-8 byte stream.
             byte[] utf8Bytes = Encoding.UTF8.GetBytes(htmlContent);
             using (var textReader = new StreamReader(new MemoryStream(utf8Bytes)))
             {
                 html.InputStream = textReader;
                 result = XDocument.Load(html);
             }
         }
     }
     catch (Exception)
     {
         result = new XDocument();
     }

     return result;
 }
Exemple #28
0
        /// <summary>
        /// Runs <paramref name="callback"/> against an SgmlReader over <paramref name="source"/>
        /// and an XmlTextWriter, checks that the written output loads as XML, and returns it.
        /// </summary>
        /// <param name="caseFolding">Case folding applied by the reader.</param>
        /// <param name="doctype">DocType assigned to the reader.</param>
        /// <param name="format">True for indented output without whitespace nodes.</param>
        /// <param name="source">SGML/HTML input text.</param>
        /// <param name="callback">Test body that copies from the reader to the writer.</param>
        /// <returns>The written output, trimmed and with "\r" removed.</returns>
        private static string RunTest(CaseFolding caseFolding, string doctype, bool format, string source, XmlReaderTestCallback callback)
        {
            // set up the sgml reader
            var sgml = new SgmlReader();
            sgml.CaseFolding = caseFolding;
            sgml.DocType = doctype;
            sgml.InputStream = new StringReader(source);
            if(format) {
                sgml.WhitespaceHandling = WhitespaceHandling.None;
            } else {
                sgml.WhitespaceHandling = WhitespaceHandling.All;
            }

            // optionally wrap it so every read is logged to the console
            XmlReader reader = sgml;
            if(_debug) {
                reader = new LoggingXmlReader(reader, Console.Out);
            }

            // set up the xml writer and run the test body
            var output = new StringWriter();
            var writer = new XmlTextWriter(output);
            if(format) {
                writer.Formatting = Formatting.Indented;
            }
            callback(reader, writer);
            writer.Close();

            string actual = output.ToString();

            // the produced text must itself be loadable as XML
            try {
                var roundTrip = new XmlDocument();
                using(var roundTripReader = new StringReader(actual)) {
                    roundTrip.Load(roundTripReader);
                }
            } catch(Exception) {
                Assert.Fail("unable to parse sgml reader output:\n{0}", actual);
            }

            string trimmed = actual.Trim();
            return trimmed.Replace("\r", "");
        }
Exemple #29
0
        /// <summary>
        /// Reads a JavaDoc HTML file, detects which generator produced it from markers near
        /// the start of the file, and returns the root element of the parsed document.
        /// </summary>
        /// <param name="path">Path handed to ReadHtmlFile.</param>
        /// <param name="kind">Detected kind; later markers override earlier ones.</param>
        /// <returns>Root element of the parsed HTML.</returns>
        /// <exception cref="NotSupportedException">No marker matched, i.e. the kind is still DroidDoc.</exception>
        XElement GetJavaDocFile(string path, out JavaDocKind kind)
        {
            kind = JavaDocKind.DroidDoc;
            string rawHTML = ReadHtmlFile(path);

            // The DroidDoc2 marker is searched in the first 5000 characters,
            // the javadoc markers in the first 500.
            string first5000 = rawHTML.Substring(0, Math.Min(rawHTML.Length, 5000));
            if (first5000.IndexOf("<body class=\"gc-documentation develop reference api ", StringComparison.Ordinal) > 0)
            {
                kind = JavaDocKind.DroidDoc2;
            }

            string first500 = rawHTML.Substring(0, Math.Min(rawHTML.Length, 500));
            if (first500.IndexOf("Generated by javadoc (build 1.6", StringComparison.Ordinal) > 0)
            {
                kind = JavaDocKind.Java6;
            }
            if (first500.IndexOf("Generated by javadoc (version 1.7", StringComparison.Ordinal) > 0)
            {
                kind = JavaDocKind.Java7;
            }
            if (first500.IndexOf("Generated by javadoc (1.8", StringComparison.Ordinal) > 0)
            {
                kind = JavaDocKind.Java8;
            }

            if (kind == JavaDocKind.DroidDoc)
            {
                throw new NotSupportedException("Old DroidDoc is not supported anymore.");
            }

            var sgml = new Sgml.SgmlReader();
            sgml.InputStream = new StringReader(rawHTML);
            sgml.CaseFolding = Sgml.CaseFolding.ToLower;
            sgml.Dtd = HtmlDtd;

            XDocument parsed = XDocument.Load(sgml);
            return parsed.Root;
        }
Exemple #30
0
        /// <summary>
        /// Loads the input "&amp;test" as HTML and verifies that the resulting element has a
        /// value whose last character is not (char)65535.
        /// </summary>
        public void Test_for_illegal_char_value() 
        {
            const string source = "&test";

            var sgml = new SgmlReader();
            sgml.DocType = "HTML";
            sgml.WhitespaceHandling = WhitespaceHandling.All;
            sgml.StripDocType = true;
            sgml.InputStream = new StringReader(source);
            sgml.CaseFolding = CaseFolding.ToLower;

            // test
            System.Xml.Linq.XElement loaded = System.Xml.Linq.XElement.Load(sgml);
            string text = loaded.Value;
            Assert.IsFalse(string.IsNullOrEmpty(text), "element has no value");

            char lastChar = text[text.Length - 1];
            Assert.AreNotEqual((char)65535, lastChar, "unexpected -1 as last char");
        }
Exemple #31
0
        /// <summary>
        /// Makes sure MoveToElement can be called after iterating over several attributes;
        /// logs each attribute name and then the element's inner XML.
        /// </summary>
        public void Test_MoveToNextAttribute()
        {
            var sgml = new SgmlReader();
            sgml.InputStream = new StringReader("<test id='10' x='20'><a/><!--comment-->test</test>");

            bool positioned = sgml.Read();
            Assert.IsTrue(positioned);

            // Walk all attributes of the first node.
            while(sgml.MoveToNextAttribute()) {
                _log.Debug(sgml.Name);
            }

            // Return to the owning element and dump its content.
            bool backOnElement = sgml.MoveToElement();
            if(backOnElement) {
                _log.Debug(sgml.ReadInnerXml());
            }
        }
Exemple #32
0
        public void Run(string[] args)
        {
            SgmlReader reader = new SgmlReader();
            string inputUri = null;

            for (int i = 0; i < args.Length; i++) {
                string arg = args[i];
                if (arg[0] == '-' || arg[0] == '/') {
                    switch (arg.Substring(1)) {
                        case "e":
                            string errorlog = args[++i];
                            if (errorlog.ToLower() == "$stderr") {
                                reader.ErrorLog = Console.Error;
                            }
                            else {
                                reader.ErrorLogFile = errorlog;
                            }
                            break;
                        case "html":
                            reader.DocType = "HTML";
                            break;
                        case "dtd":
                            reader.SystemLiteral = args[++i];
                            break;
                        case "proxy":
                            proxy = args[++i];
                            reader.WebProxy = proxy;
                            break;
                        case "encoding":
                            encoding = Encoding.GetEncoding(args[++i]);
                            break;
                        case "f":
                            formatted = true;
                            reader.WhitespaceHandling = WhitespaceHandling.None;
                            break;
                        case "noxml":
                            noxmldecl = true;
                            break;
                        case "doctype":
                            reader.StripDocType = false;
                            break;
                        case "lower":
                            reader.CaseFolding = CaseFolding.ToLower;
                            break;
                        case "upper":
                            reader.CaseFolding = CaseFolding.ToUpper;
                            break;

                        default:
                            Console.WriteLine("Usage: SgmlReader <options> [InputUri] [OutputFile]");
                            Console.WriteLine("-e log         Optional log file name, name of '$STDERR' will write errors to stderr");
                            Console.WriteLine("-f             Whether to pretty print the output.");
                            Console.WriteLine("-html          Specify the built in HTML dtd");
                            Console.WriteLine("-dtd url       Specify other SGML dtd to use");
                            Console.WriteLine("-base          Add base tag to output HTML");
                            Console.WriteLine("-noxml         Do not add XML declaration to the output");
                            Console.WriteLine("-proxy svr:80  Proxy server to use for http requests");
                            Console.WriteLine("-encoding name Specify an encoding for the output file (default UTF-8)");
                            Console.WriteLine("-lower         Convert input tags to lower case");
                            Console.WriteLine("-upper         Convert input tags to upper case");
                            Console.WriteLine();
                            Console.WriteLine("InputUri       The input file or http URL (default stdin).  ");
                            Console.WriteLine("               Supports wildcards for local file names.");
                            Console.WriteLine("OutputFile     Output file name (default stdout)");
                            Console.WriteLine("               If input file contains wildcards then this just specifies the output file extension (default .xml)");
                            return;
                    }
                }
                else {
                    if (inputUri == null) {
                        inputUri = arg;
                        string ext = Path.GetExtension(arg).ToLower();
                        if (ext == ".htm" || ext == ".html")
                            reader.DocType = "HTML";
                    }
                    else if (output == null) output = arg;
                }
            }
            if (inputUri != null && !inputUri.StartsWith("http://") && inputUri.IndexOfAny(new char[] { '*', '?' }) >= 0) {
                // wild card processing of a directory of files.
                string path = Path.GetDirectoryName(inputUri);
                if (path == "") path = ".\\";
                string ext = ".xml";
                if (output != null)
                    ext = Path.GetExtension(output);
                foreach (string uri in Directory.GetFiles(path, Path.GetFileName(inputUri))) {
                    Console.WriteLine("Processing: " + uri);
                    string file = Path.GetFileName(uri);
                    output = Path.GetDirectoryName(uri) + Path.DirectorySeparatorChar + Path.GetFileNameWithoutExtension(file) + ext;
                    Process(reader, uri);
                    reader.Close();
                }
                return;
            }
            Process(reader, inputUri);
            reader.Close();

            return ;
        }
Exemple #33
0
        /// <summary>
        /// Reads one SGML/HTML input through <paramref name="reader"/> and writes it out as XML,
        /// either to the configured output file or, when none is set, to standard output.
        /// </summary>
        /// <param name="reader">Reader that is pointed at the input and then drained to end of file.</param>
        /// <param name="uri">Value assigned to <c>reader.Href</c>; <c>null</c> means read from standard input.</param>
        void Process(SgmlReader reader, string uri)
        {
            if (uri == null) {
                reader.InputStream = Console.In;
            } else {
                reader.Href = uri;
            }

            // The encoding is resolved once and cached in the field, so later calls reuse it.
            if (this.encoding == null) {
                this.encoding = reader.GetEncoding();
            }

            // 'using' guarantees the writer (and the output file behind it) is closed
            // even if the reader throws part-way through the document.
            using (XmlTextWriter w = output != null
                       ? new XmlTextWriter(output, this.encoding)
                       : new XmlTextWriter(Console.Out)) {
                if (formatted) {
                    w.Formatting = Formatting.Indented;
                }
                if (!noxmldecl) {
                    w.WriteStartDocument();
                }

                // Position on the first node, then copy node by node until the input is exhausted.
                reader.Read();
                while (!reader.EOF) {
                    w.WriteNode(reader, true);
                }
                w.Flush();
            }
        }
Exemple #34
0
        /// <summary>
        /// Imports aliases from an uploaded HTML file.
        /// Every <c>p</c> element under <c>html//body</c> that contains an <c>a//h1</c> is treated as
        /// one top-level alias: the heading text is its name and the text of its <c>b</c> child is its
        /// remark. Rows of a nested table (after the header row) become child aliases, taking the
        /// name from the first cell and the remark from the second.
        /// </summary>
        /// <param name="file">Uploaded file whose input stream is parsed as HTML.</param>
        /// <returns>
        /// JSON content with <c>success</c> (1 or 0) and <c>message</c> (the number of imported
        /// top-level aliases, or the exception message on failure).
        /// </returns>
        public ActionResult ImportAliases(HttpPostedFileBase file)
        {
            byte   result = 1;
            string msg    = null;

            try
            {
                // setup SgmlReader
                SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType            = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                sgmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;
                sgmlReader.IgnoreDtd          = true;
                sgmlReader.InputStream        = new StreamReader(file.InputStream);

                // create document
                XmlDocument xdoc = new XmlDocument();
                xdoc.PreserveWhitespace = true;
                xdoc.Load(sgmlReader);

                int id, n = 0;

                XPathNavigator    xnav = xdoc.CreateNavigator();
                XPathNodeIterator it_p = xnav.Select("html//body//p");
                while (it_p.MoveNext())
                {
                    // only paragraphs that carry a heading describe an alias
                    XPathNavigator nav_h1 = it_p.Current.SelectSingleNode("a//h1");
                    if (nav_h1 != null)
                    {
                        XPathNavigator nav_b = it_p.Current.SelectSingleNode("b");

                        Alias alias = new Alias();
                        alias.Name   = nav_h1.Value;
                        // SelectSingleNode returns null when the paragraph has no <b> child;
                        // import the alias without a remark instead of aborting the whole import.
                        alias.Remark = nav_b != null ? nav_b.Value : null;

                        System.Diagnostics.Debug.WriteLine(alias.Name);
                        if (db.Exists <Alias>("name = @0 AND parentid IS NULL", alias.Name))
                        {
                            // existing top-level alias: keep the row, drop its children so they can be re-imported
                            id = db.ExecuteScalar <int>("SELECT IFNULL(a.id, 0) FROM qb_aliases a WHERE a.name = @0 AND a.parentid IS NULL", alias.Name);
                            db.Delete <Alias>("WHERE parentid = @0", id);
                        }
                        else
                        {
                            // new top-level alias: insert it and read back the generated id
                            id = db.ExecuteScalar <int>("INSERT INTO qb_aliases(name, remark) VALUES(@0, @1);\nSELECT last_insert_rowid();", alias.Name, alias.Remark);
                        }


                        XPathNodeIterator it_tr = it_p.Current.Select("table//tr");
                        if (it_tr.Count != 0)
                        {
                            IList <Alias> fields = new List <Alias>();
                            // skip 1st tr - headers
                            it_tr.MoveNext();

                            while (it_tr.MoveNext())
                            {
                                Alias alias1 = new Alias();

                                XPathNavigator nav_td1 = it_tr.Current.SelectSingleNode("td[1]");
                                if (nav_td1 != null)
                                {
                                    alias1.Name = nav_td1.Value;
                                }

                                XPathNavigator nav_td2 = it_tr.Current.SelectSingleNode("td[2]");
                                if (nav_td2 != null)
                                {
                                    alias1.Remark = nav_td2.Value;
                                }

                                fields.Add(alias1);
                            }

                            if (fields.Count > 0)
                            {
                                // Build one multi-row INSERT: each child contributes a "@i, @i+1, @i+2"
                                // placeholder group and three matching values (name, remark, parent id).
                                int           ix   = 0;
                                List <string> keys = new List <string>();
                                List <object> vals = new List <object>();

                                foreach (Alias a in fields)
                                {
                                    keys.Add("@" + string.Join(", @", new int[] { ix, ix + 1, ix + 2 }));
                                    vals.AddRange(new object[] { a.Name, a.Remark, id });
                                    ix += 3;
                                }

                                string sql = "INSERT INTO qb_aliases(name, remark, parentid) VALUES (" + string.Join("), (", keys) + ")";
                                db.Execute(sql, vals.ToArray());

                                System.Diagnostics.Debug.WriteLine(db.LastSQL);
                            }
                        }

                        n++;
                        System.Diagnostics.Debug.WriteLine(n);
                    }
                }

                msg = n.ToString();

                // cached alias responses are stale after a successful import
                Response.RemoveOutputCacheItem("/Admin/GetAlias");
                Response.RemoveOutputCacheItem("/Admin/GetAliases");
            }
            catch (Exception e)
            {
                // any failure is reported to the caller through the JSON payload rather than rethrown
                msg    = e.Message;
                result = 0;
            }

            string json = Newtonsoft.Json.JsonConvert.SerializeObject(new { success = result, message = msg });

            return(Content(json));       // for IE
        }
Exemple #35
0
        /// <summary>
        /// Parses SGML input with SgmlReader and returns it as an XML string.
        /// </summary>
        /// <param name="sgmlInput">
        /// Either a <see cref="TextReader"/>, which is used as the reader's input stream, or a value
        /// that is assigned to the reader's <c>Href</c> property.
        /// </param>
        /// <param name="sgmlDTD">Optional DTD text; when given it is assigned to the reader's internal subset and DTD processing is enabled.</param>
        /// <param name="entities">
        /// Optional path to a file whose contents become the internal subset of the resulting
        /// document type (any surrounding <c>&lt;!DOCTYPE ... [</c> and <c>]&gt;</c> is stripped first).
        /// </param>
        /// <returns>The XML text; if any exception occurs, a message box is shown and the exception message is returned instead.</returns>
        public string ConvertSGMtoXML(dynamic sgmlInput, string sgmlDTD = null, string entities = null)
        {
            try
            {
                Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                sgmlReader.CaseFolding        = Sgml.CaseFolding.ToLower;

                //If dtd is provided, we can use it to parse the xml
                if (sgmlDTD != null)
                {
                    sgmlReader.IgnoreDtd      = false;
                    sgmlReader.InternalSubset = sgmlDTD;
                }

                //Check what type of sgmlInput is provided.
                //InputStream takes a reader, so the test must be for TextReader (not TextWriter).
                if (sgmlInput is TextReader)
                {
                    sgmlReader.InputStream = sgmlInput;
                }
                else
                {
                    sgmlReader.Href = sgmlInput;
                }


                XDocument xmlDoc = XDocument.Load(sgmlReader);

                //Check to see if there is a doctype declaration in the xmldoc
                if (xmlDoc.DocumentType == null)
                {
                    string rootName = xmlDoc.Root.Name.ToString();
                    xmlDoc.Root.AddBeforeSelf(new XDocumentType(rootName, "", "", ""));
                }

                //If entities file is provided, read it and add it to the xml doc
                if (entities != null)
                {
                    string docTypeEntities = File.ReadAllText(entities);

                    //Keep only the declarations: drop a leading "<!DOCTYPE ... [" and the closing "]>"
                    if (docTypeEntities.Contains("<!DOCTYPE") || docTypeEntities.Contains("]>"))
                    {
                        docTypeEntities = Regex.Replace(docTypeEntities, @"<!DOCTYPE(.+?)\[|\]\>", "");
                    }

                    xmlDoc.DocumentType.InternalSubset = docTypeEntities;
                }

                //Convert the xml to string in order to fix it
                string xmlString = xmlDoc.ToString();

                //Rewrite revst/revend/cocst/cocend as self-closing tags: the start tag becomes
                //"<name/>" and the end tag is removed. All four end tags are checked so that a
                //document containing only cocend tags is rewritten as well.
                if (xmlString.Contains("</revst>") ||
                    xmlString.Contains("</revend>") ||
                    xmlString.Contains("</cocst>") ||
                    xmlString.Contains("</cocend>"))
                {
                    xmlString = xmlString.Replace("<revst>", "<revst/>").Replace("</revst>", "")
                                .Replace("<revend>", "<revend/>").Replace("</revend>", "")
                                .Replace("<cocst>", "<cocst/>").Replace("</cocst>", "")
                                .Replace("<cocend>", "<cocend/>").Replace("</cocend>", "");
                }
                return(xmlString);
            }
            catch (Exception e)
            {
                //Failures are surfaced to the user and the message doubles as the return value
                MessageBox.Show(e.Message, "Unable to transform SGML to XML");
                return(e.Message);
            }
        }
Exemple #36
-1
        /// <summary>
        /// Downloads one page from groups.google.com, follows the links found inside its
        /// layout div, and appends a documentation entry for the page to <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Collection that receives one JSON object per exported page.</param>
        /// <param name="crawled">Set of hrefs already handled; a page whose href is already present is skipped.</param>
        /// <param name="href">Path portion of the page URL on groups.google.com.</param>
        private static void ExportDocument(JArray array, HashSet<string> crawled, string href)
        {
            // Add reports false when the href is already in the set, i.e. the page was handled before.
            if (!crawled.Add(href))
            {
                return;
            }

            Console.WriteLine("Reading {0}", href);

            Uri pageUri = new UriBuilder("http", "groups.google.com", 80, href).Uri;

            SgmlReader pageReader = new SgmlReader();
            pageReader.Href = pageUri.ToString();

            XmlDocument pageDoc = new XmlDocument();
            pageDoc.Load(pageReader);

            XmlNode layoutNode = pageDoc.SelectSingleNode("//div[@class='layout']");
            string heading = layoutNode.SelectSingleNode(".//h2").InnerText;
            string docName = Path.GetFileName(pageUri.LocalPath).Replace("-", "_");

            // Linked pages are processed before this page's own entry is appended.
            AddDocumentsFromLinks(array, crawled, layoutNode.SelectNodes(".//a"));

            Console.WriteLine("Writing {0}", heading);

            JObject content = new JObject(
                new JProperty("Html", FixLinks(layoutNode.InnerXml)),
                new JProperty("Name", heading));

            JObject metadata = new JObject(
                new JProperty("Raven-View-Template", "/raven/JSONTemplates/documentation.html"));

            JObject entry = new JObject(
                new JProperty("DocId", "raven_documentation/" + docName),
                new JProperty("Document", content),
                new JProperty("Type", "raven documentation"),
                new JProperty("Metadata", metadata));

            array.Add(entry);
        }