/// <summary>
/// Benchmark entry point: reads an HTML file once, then parses it with SgmlReader into an
/// XmlDocument the requested number of times and prints the total elapsed time in seconds.
/// </summary>
/// <param name="args">args[0] is the input file name; args[1] is the iteration count.</param>
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.WriteLine("Usage: BenchSgmlReader.exe filename iterations");
        return;
    }

    // Read the file up front so only parsing is measured; 'using' releases the
    // file handle even if reading fails.
    string text;
    using (var streamReader = new StreamReader(args[0]))
    {
        text = streamReader.ReadToEnd();
    }

    // The iteration count is machine input, so parse it culture-independently.
    int n = int.Parse(args[1], CultureInfo.InvariantCulture);

    // Stopwatch is monotonic; DateTime.Now can jump (clock adjustments, DST) and has coarse resolution.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();
    for (int i = 0; i < n; i++)
    {
        SgmlReader sgmlReader = new SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        //sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = new StringReader(text);

        XmlDocument doc = new XmlDocument();
        doc.PreserveWhitespace = true;
        doc.XmlResolver = null;
        doc.Load(sgmlReader);
    }
    stopwatch.Stop();

    var duration = stopwatch.Elapsed;
    Console.WriteLine("{0} s", (duration.TotalMilliseconds / 1000.0).ToString(CultureInfo.InvariantCulture));
}
/// <summary>
/// Creates the parser and configures its SgmlReader for the "HTML" doc type,
/// with all whitespace reported and case folding set to lower case.
/// </summary>
public Parser()
{
    _sgmlReader = new SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = CaseFolding.ToLower
    };
}
/// <summary>
/// Loads the documentation index page, stores the content of its "layout" div as the
/// "raven_documentation/index" document, hands the links found in that div to
/// AddDocumentsFromLinks, and writes the collected array as indented JSON to args[0].
/// </summary>
/// <param name="args">args[0] is the path of the JSON file to write.</param>
static void Main(string[] args)
{
    // Validate the output path before crawling: previously a missing argument
    // was only discovered after all pages had been downloaded.
    if (args.Length < 1)
    {
        Console.Error.WriteLine("Usage: <program> <output-file>");
        return;
    }

    var array = new JArray();
    var crawled = new HashSet<string>();
    var doc = new XmlDocument();

    // The reader fetches the page over HTTP; dispose it as soon as the document is loaded.
    using (var sgmlReader = new SgmlReader { Href = "http://groups.google.com/group/ravendb/web/docs-http-api-index" })
    {
        crawled.Add(sgmlReader.Href);
        doc.Load(sgmlReader);
    }

    var layout = doc.SelectSingleNode("//div[@class='layout']");
    if (layout == null)
    {
        // SelectSingleNode returns null when nothing matches; fail with a clear message.
        throw new InvalidOperationException("The index page does not contain a div with class 'layout'.");
    }

    var index = new JObject(
        new JProperty("Html", FixLinks(layout.InnerXml)),
        new JProperty("Name", "Index"));

    array.Add(new JObject(
        new JProperty("DocId", "raven_documentation/index"),
        new JProperty("Document", index),
        new JProperty("Type", "raven documentation"),
        new JProperty("Metadata", new JObject(new JProperty("Raven-View-Template", "/raven/JSONTemplates/documentation.html")))
        ));

    AddDocumentsFromLinks(array, crawled, layout.SelectNodes(".//a"));

    File.WriteAllText(args[0], array.ToString(Formatting.Indented));
}
/// <summary>
/// Posts the configured credentials to the Hatena login URL, downloads the counter page
/// built from CounterUrlBase for the given counter id and date, and returns the integer
/// found in the first "strong" element under a table whose first attribute is "totalcount".
/// </summary>
/// <param name="cid">Counter id substituted into CounterUrlBase.</param>
/// <param name="date">Date substituted into CounterUrlBase, formatted as yyyy-MM-dd.</param>
/// <returns>The parsed count.</returns>
public static int GetPv(int cid, DateTime date)
{
    var hatenaId = ConfigurationManager.AppSettings["hatenaId"];
    var hatenaPassword = ConfigurationManager.AppSettings["hatenaPassword"];

    var wc = new CustomWebClient() { Encoding = Encoding.UTF8 };
    wc.Headers.Add("Content-Type", "application/x-www-form-urlencoded");

    // NOTE(review): the id/password are inserted as-is; if LoginParamBase is a form-encoded
    // template, values containing '&' or '=' may need Uri.EscapeDataString — confirm.
    var data = string.Format(LoginParamBase, hatenaId, hatenaPassword);
    wc.UploadString("https://www.hatena.ne.jp/login", "POST", data);

    // Format the date with the invariant culture: under cultures with a non-Gregorian
    // default calendar the current-culture format would produce a different year.
    var formattedDate = date.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
    var url = string.Format(CounterUrlBase, hatenaId, cid, formattedDate);
    var res = wc.DownloadString(url);

    XDocument xml;
    using (var sgml = new SgmlReader() { IgnoreDtd = true })
    {
        sgml.InputStream = new StringReader(res);
        xml = XDocument.Load(sgml);
    }

    var ns = xml.Root.Name.Namespace;
    var count = xml.Descendants(ns + "table")
        // FirstAttribute is null for tables without attributes; skip those instead of throwing.
        .Where(x => x.FirstAttribute != null && x.FirstAttribute.Value == "totalcount")
        .Descendants(ns + "strong")
        .First().Value;

    return int.Parse(count, System.Globalization.CultureInfo.InvariantCulture);
}
// Builds an XmlDocument from HTML content. The content is wrapped in a "<root>" element
// before parsing, so the returned document has "<root>" as its root item.
public static XmlDocument ParseHtml(string sContent)
{
    StringReader input = new StringReader("<root>" + sContent + "</root>");

    SgmlReader sgml = new SgmlReader();
    sgml.WhitespaceHandling = WhitespaceHandling.All;
    sgml.CaseFolding = Sgml.CaseFolding.ToLower;
    sgml.InputStream = input;

    // Convert the SGML stream into indented XML text held in memory.
    StringWriter buffer = new StringWriter();
    XmlTextWriter xmlWriter = new XmlTextWriter(buffer);
    xmlWriter.Formatting = Formatting.Indented;
    xmlWriter.WriteStartDocument();

    for (sgml.Read(); !sgml.EOF; )
    {
        xmlWriter.WriteNode(sgml, true);
    }

    xmlWriter.Flush();
    xmlWriter.Close();
    buffer.Flush();

    // Load the generated XML text into the resulting document.
    XmlDocument result = new XmlDocument();
    result.PreserveWhitespace = true;
    result.XmlResolver = null;
    result.LoadXml(buffer.ToString());

    sgml.Close();
    return result;
}
/// <summary>
/// Creates an XmlReader over the given HTML text. The DTD is parsed from the "Html.dtd"
/// manifest resource of the assembly that defines SgmlReader, and the reader's base URI
/// is set to <paramref name="baseUri"/>.
/// </summary>
/// <param name="baseUri">Base URI used for DTD parsing and assigned to the reader.</param>
/// <param name="html">HTML markup to read.</param>
/// <returns>The configured reader.</returns>
public static XmlReader Create(string baseUri, string html)
{
    const string resourceName = "Html.dtd";

    SgmlDtd htmlDtd;
    using (var dtdStream = typeof(SgmlReader).Assembly.GetManifestResourceStream(resourceName))
    {
        var dtdText = new StreamReader(dtdStream);
        htmlDtd = SgmlDtd.Parse(new Uri(baseUri), "HTML", dtdText, null, null, null);
    }

    // Properties are assigned in the same order as before, in case the setters interact.
    var sgml = new SgmlReader();
    sgml.WhitespaceHandling = WhitespaceHandling.All;
    sgml.CaseFolding = CaseFolding.ToLower;
    sgml.Dtd = htmlDtd;
    sgml.IgnoreDtd = true;
    sgml.InputStream = new StringReader(html);

    sgml.SetBaseUri(baseUri);
    return sgml;
}
/// <summary>
/// Converts HTML to indented XML text using SgmlReader. When <paramref name="xpathNavPath"/>
/// is null the whole converted document is returned; otherwise the values of the nodes
/// selected by that XPath expression are returned, one per line.
/// In both cases "\r" is replaced by "\n" and each "\n\n" pair is collapsed to "\n".
/// </summary>
/// <param name="html">HTML markup to convert.</param>
/// <param name="xpathNavPath">Optional XPath expression used to filter the converted document.</param>
/// <returns>The converted text, or the exception message if the conversion fails.</returns>
public static string GetWellFormedHTML(string html, string xpathNavPath)
{
    StringWriter sw = null;
    SgmlReader reader = null;
    XmlTextWriter writer = null;
    try
    {
        reader = new SgmlReader();
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(html);

        sw = new StringWriter();
        writer = new XmlTextWriter(sw);
        writer.Formatting = Formatting.Indented;

        while (reader.Read())
        {
            if (reader.NodeType != XmlNodeType.Whitespace)
            {
                writer.WriteNode(reader, true);
            }
        }

        string result;
        if (xpathNavPath == null)
        {
            result = sw.ToString();
        }
        else
        {
            // Filter out nodes from the converted HTML.
            StringBuilder sb = new StringBuilder();
            XPathDocument doc = new XPathDocument(new StringReader(sw.ToString()));
            XPathNavigator nav = doc.CreateNavigator();
            XPathNodeIterator nodes = nav.Select(xpathNavPath);
            while (nodes.MoveNext())
            {
                sb.Append(nodes.Current.Value + "\n");
            }
            result = sb.ToString();
        }

        // Same newline normalization for both branches.
        result = result.Replace("\r", "\n");
        result = result.Replace("\n\n", "\n");
        return result;
    }
    catch (Exception exp)
    {
        // Callers receive the error text as the result; cleanup happens in 'finally'.
        return exp.Message;
    }
    finally
    {
        // Any of these may still be null if the failure happened early (e.g. html == null).
        // The old catch block dereferenced them unconditionally and could throw a
        // NullReferenceException that hid the original error.
        if (writer != null)
        {
            writer.Close();
        }
        if (reader != null)
        {
            reader.Close();
        }
        if (sw != null)
        {
            sw.Close();
        }
    }
}
/// <summary>
/// Constructor. Requests <paramref name="url"/>, parses the response as HTML with
/// SgmlReader and stores the result in Html, together with Uri and Encoding.
/// </summary>
/// <param name="url">Target URL.</param>
/// <param name="follow">Whether robots.txt is consulted before fetching.</param>
/// <param name="agent">User agent; when null the request's user agent is left untouched.</param>
/// <param name="encoding">
/// Encoding used to decode the response. When null, the character set declared by the
/// response is used, falling back to UTF-8 if it is missing or not a known encoding name.
/// </param>
/// <exception cref="RobotsDisallowException">Thrown when robots.Parse(url) returns false.</exception>
public HtmlReader(string url, bool follow = true, UserAgent agent = null, Encoding encoding = null)
{
    // HTTP request
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

    // User agent
    if (agent != null)
    {
        req.UserAgent = agent.ToString();
    }

    // robots.txt
    Robots robots = (follow) ? Robots.Create(new Uri(url)) : null;
    if (robots != null)
    {
        if (!robots.Parse(url))
        {
            throw new RobotsDisallowException("Robots Disallow [" + url + "]");
        }
        if (robots.CrawlDelay != 0)
        {
            // CrawlDelay is multiplied by 1000 to obtain milliseconds for Sleep.
            System.Threading.Thread.Sleep(robots.CrawlDelay * 1000);
        }
    }

    using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
    using (Stream stream = res.GetResponseStream())
    {
        Encoding enc = encoding;
        if (enc == null)
        {
            // The declared charset can be absent, quoted, or unknown; Encoding.GetEncoding
            // throws ArgumentException in those cases, so fall back to UTF-8 instead of failing.
            string charset = res.CharacterSet;
            try
            {
                enc = string.IsNullOrEmpty(charset)
                    ? Encoding.UTF8
                    : Encoding.GetEncoding(charset.Trim('"', '\'', ' '));
            }
            catch (ArgumentException)
            {
                enc = Encoding.UTF8;
            }
        }

        using (StreamReader reader = new StreamReader(stream, enc))
        using (SgmlReader sgml = new SgmlReader { DocType = "HTML", InputStream = reader, CaseFolding = CaseFolding.ToLower, IgnoreDtd = true })
        {
            Html = XDocument.Load(sgml, LoadOptions.None);
            Uri = url;
            Encoding = enc;
        }
    }
}
/// <summary>
/// Reads and sanitizes the HTML file at <paramref name="path"/>, determines which javadoc
/// generator produced it by looking for a marker string in the first 500 characters,
/// and returns the root element of the document parsed with SgmlReader.
/// </summary>
/// <param name="path">Path passed to ReadAndSanitizeHtmlFile.</param>
/// <param name="kind">
/// Receives Java6, Java7 or Java8 when the matching marker is found; otherwise DroidDoc.
/// </param>
/// <returns>The root element of the parsed document.</returns>
XElement GetJavaDocFile(string path, out JavaDocKind kind)
{
    kind = JavaDocKind.DroidDoc;
    string rawHTML = ReadAndSanitizeHtmlFile(path);

    // Only the first 500 characters are inspected. Clamp the length so that files
    // shorter than 500 characters no longer make Substring throw ArgumentOutOfRangeException.
    string header = rawHTML.Substring(0, Math.Min(rawHTML.Length, 500));

    if (header.IndexOf("Generated by javadoc (build 1.6", StringComparison.Ordinal) > 0)
    {
        kind = JavaDocKind.Java6;
    }
    if (header.IndexOf("Generated by javadoc (version 1.7", StringComparison.Ordinal) > 0)
    {
        kind = JavaDocKind.Java7;
    }
    if (header.IndexOf("Generated by javadoc (1.8", StringComparison.Ordinal) > 0)
    {
        kind = JavaDocKind.Java8;
    }

    var html = new Sgml.SgmlReader()
    {
        InputStream = new StringReader(rawHTML),
        CaseFolding = Sgml.CaseFolding.ToLower,
        Dtd = HtmlDtd
    };
    var doc = XDocument.Load(html, LoadOptions.SetLineInfo | LoadOptions.SetBaseUri);
    return(doc.Root);
}
/// <summary>
/// Parses HTML from <paramref name="reader"/> into an XDocument using an SgmlReader
/// configured for the "HTML" doc type with lower-case folding.
/// </summary>
static XDocument ParseHtml(TextReader reader)
{
    using (var html = new SgmlReader { DocType = "HTML", CaseFolding = CaseFolding.ToLower })
    {
        html.InputStream = reader;
        XDocument document = XDocument.Load(html);
        return document;
    }
}
/// <summary>
/// Feeds the given HTML string to an SgmlReader and returns the result of its ReadOuterXml call.
/// </summary>
/// <param name="html">HTML markup to convert.</param>
public static String GetXmlFromHtmlString (String html)
{
    using (SgmlReader sgml = new SgmlReader())
    {
        sgml.InputStream = new StringReader(html);
        String outerXml = sgml.ReadOuterXml();
        return outerXml;
    }
}
/// <summary>
/// Fetches the text at <paramref name="url"/> via FetchWebText and loads it into an
/// XmlDocument through an SgmlReader.
/// </summary>
/// <param name="url">Address passed to FetchWebText.</param>
/// <returns>The loaded document.</returns>
XmlDocument FetchXmlDocument(Uri url)
{
    // 'using' closes both readers even when Load throws; previously they were
    // only closed on the success path.
    using (var sr = FetchWebText (url))
    using (var xr = new SgmlReader () { InputStream = sr })
    {
        var doc = new XmlDocument ();
        doc.Load (xr);
        return doc;
    }
}
/// <summary>
/// Loads an XDocument from HTML supplied by <paramref name="_Reader"/>, using an SgmlReader
/// set to the "HTML" doc type with lower-case folding.
/// </summary>
private static XDocument ParseHtml( TextReader _Reader )
{
    var html = new SgmlReader
    {
        DocType = "HTML",
        CaseFolding = CaseFolding.ToLower,
        InputStream = _Reader,
    };

    using ( html )
    {
        XDocument parsed = XDocument.Load( html );
        return parsed;
    }
}
/// <summary>
/// Test set-up: creates a fresh SgmlReader (lower-case folding, "HTML" doc type,
/// no whitespace nodes) and stores it in the fixture field.
/// </summary>
public void SetUp()
{
    var reader = new SgmlReader();
    reader.CaseFolding = CaseFolding.ToLower;
    reader.DocType = "HTML";
    reader.WhitespaceHandling = WhitespaceHandling.None;

    _sgmlReader = reader;
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> and parses it as HTML into an XDocument.
/// </summary>
/// <param name="url">Address to request.</param>
/// <returns>A copy of the document loaded through SgmlReader.</returns>
public static XDocument FetchHtmlFromUrlAsXDocument(string url)
{
    var webRequest = WebRequest.Create(url);

    // Dispose the response as well as the reader: an undisposed WebResponse keeps
    // its connection occupied until finalization.
    using (var response = webRequest.GetResponse())
    using (var reader = new StreamReader(response.GetResponseStream()))
    using (var sgml = new SgmlReader())
    {
        sgml.DocType = "HTML";
        sgml.CaseFolding = CaseFolding.ToLower;
        sgml.InputStream = reader;
        return new XDocument(XDocument.Load(sgml));
    }
}
/// <summary>
/// Scans HTML for img elements and returns one ImageInfo per element that has a
/// non-empty src attribute, with width and height filled in when present.
/// </summary>
/// <param name="htmlCode">HTML markup to scan.</param>
/// <returns>The images found, in reading order.</returns>
internal static ImageInfo[] FindImgs(
    string htmlCode)
{
    var sgml = new SgmlReader
    {
        DocType = @"HTML",
        InputStream = new StringReader(htmlCode)
    };

    var images = new List<ImageInfo>();

    // Look for <img src="" ...> elements.
    while (sgml.Read())
    {
        if (sgml.NodeType != XmlNodeType.Element)
        {
            continue;
        }
        if (!string.Equals(sgml.Name, @"img", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }
        if (!sgml.HasAttributes)
        {
            continue;
        }

        var info = new ImageInfo();
        while (sgml.MoveToNextAttribute())
        {
            string attributeName = sgml.Name.ToLowerInvariant();
            if (attributeName == @"src")
            {
                info.Source = sgml.Value;
            }
            else if (attributeName == @"width")
            {
                info.Width = ConvertHelper.ToInt32(sgml.Value);
            }
            else if (attributeName == @"height")
            {
                info.Height = ConvertHelper.ToInt32(sgml.Value);
            }
        }

        // Images without a source are not reported.
        if (!string.IsNullOrEmpty(info.Source))
        {
            images.Add(info);
        }
    }

    return images.ToArray();
}
/// <summary>
/// Verifies that the contents of "fixture.txt" can be loaded through SgmlReader into an
/// XDocument whose string form is not null.
/// </summary>
public void CanParseResponseAsXML()
{
    using (var fixture = new StreamReader(new FileStream("fixture.txt", FileMode.Open)))
    {
        var sgml = new SgmlReader
        {
            InputStream = fixture,
            CaseFolding = CaseFolding.ToLower,
            DocType = "HTML"
        };

        XmlReader xmlReader = sgml;
        XDocument loaded = XDocument.Load(xmlReader);
        var document = new XDocument(loaded);

        Assert.IsNotNull(document.ToString());
    }
}
/// <summary>
/// Sends an HTTP request to <paramref name="uri"/>, reads the response body as
/// code page 1252 text and converts it to an XmlDocument using SgmlReader.
/// </summary>
/// <param name="postData">Form data sent as the request body when the method is POST.</param>
/// <param name="uri">Address to request.</param>
/// <param name="httpMethod">HTTP method; "POST" (any casing) sends <paramref name="postData"/>.</param>
/// <returns>The response parsed as an XML document.</returns>
/// <exception cref="NotSupportedException">The proxy settings ask for the IE proxy configuration.</exception>
private XmlDocument LoadHtmlPageAsXMLInternal(string postData, string uri, string httpMethod)
{
    // Prepare web request...
    HttpWebRequest webrequest = (HttpWebRequest)WebRequest.Create(uri);

    // Deal with proxy details if any.
    if (_proxySettings.Option == ProxySettingsDTO.ProxyOption.UseIESettings)
    {
        throw new NotSupportedException("IE proxy settings are not supported by this module!");
    }
    if (_proxySettings.Option == ProxySettingsDTO.ProxyOption.Custom)
    {
        webrequest.Proxy = new WebProxy(_proxySettings.ProxyHost, _proxySettings.ProxyPort);
    }

    webrequest.Method = httpMethod;
    if (String.Equals(httpMethod, "POST", StringComparison.OrdinalIgnoreCase))
    {
        // The body is encoded as ASCII, as before.
        byte[] data = new ASCIIEncoding().GetBytes(postData);
        webrequest.ContentType = "application/x-www-form-urlencoded";
        webrequest.ContentLength = data.Length;
        using (Stream newStream = webrequest.GetRequestStream())
        {
            newStream.Write(data, 0, data.Length);
        }
    }

    // Read the whole body first; 'using' releases the response and its stream even if
    // reading fails (previously they leaked on an exception).
    string html;
    Encoding enc = System.Text.Encoding.GetEncoding(1252);
    using (HttpWebResponse webresponse = (HttpWebResponse)webrequest.GetResponse())
    using (StreamReader responseReader = new StreamReader(webresponse.GetResponseStream(), enc))
    {
        html = responseReader.ReadToEnd();
    }

    // Use the SGML reader to interpret the HTML as XML.
    using (StringReader stringReader = new StringReader(html))
    using (SgmlReader sgmlReader = new SgmlReader())
    {
        sgmlReader.DocType = "HTML";
        sgmlReader.InputStream = stringReader;

        XmlDocument doc = new XmlDocument();
        doc.Load(sgmlReader);
        return doc;
    }
}
// Gets (refreshes) the session ID: parses the stream as UTF-8 HTML and stores in session_id
// the "value" attribute of the first input element whose id is "com.sun.faces.VIEW".
// session_id becomes null when no such element exists or it has no value attribute.
void getSessionId(Stream stream)
{
    var enc = System.Text.Encoding.UTF8;
    using (var reader = new StreamReader(stream, enc))
    using (var sgmlReader = new SgmlReader { InputStream = reader })
    {
        sgmlReader.DocType = "HTML";
        sgmlReader.CaseFolding = CaseFolding.ToLower;

        var doc = XDocument.Load(sgmlReader);
        var ns = doc.Root.Name.Namespace;

        // The explicit (string) conversion of XAttribute yields null for a missing attribute,
        // so an input without "value" no longer causes a NullReferenceException.
        var q = doc.Descendants(ns + "input")
            .Where(el => (string)el.Attribute("id") == "com.sun.faces.VIEW")
            .Select(el => (string)el.Attribute("value"))
            .FirstOrDefault();

        session_id = q;
    }
}
/// <summary>
/// Converts an HTML string to an XmlDocument via SgmlReader ("HTML" doc type, all
/// whitespace reported by the reader, lower-case folding). The document itself is
/// loaded with PreserveWhitespace = false and no XmlResolver.
/// </summary>
/// <param name="html">HTML markup to convert.</param>
public static XmlDocument ConvertHtmlToXml(string html)
{
    var sgmlReader = new Sgml.SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = System.Xml.WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = new System.IO.StringReader(html)
    };

    var xmlDoc = new XmlDocument
    {
        PreserveWhitespace = false,
        XmlResolver = null
    };
    xmlDoc.Load(sgmlReader);

    return xmlDoc;
}
/// <summary>
/// Parses an HTML string into an XmlDocument using SgmlReader; whitespace is preserved
/// and no XmlResolver is used.
/// </summary>
/// <param name="html">HTML markup to parse.</param>
public static XmlDocument XmlFromHtml(string html)
{
    // Reader that presents the HTML as XML.
    SgmlReader htmlReader = new Sgml.SgmlReader();
    htmlReader.DocType = "HTML";
    htmlReader.WhitespaceHandling = WhitespaceHandling.All;
    htmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
    htmlReader.InputStream = new StringReader(html);

    // Target document.
    XmlDocument result = new XmlDocument { PreserveWhitespace = true, XmlResolver = null };
    result.Load(htmlReader);
    return result;
}
/// <summary>
/// Constructs a DOM (System.Xml.Linq.XDocument) from HTML markup.
/// </summary>
/// <param name="htmlContent">HTML markup from which the DOM is to be constructed.</param>
/// <returns>
/// System.Xml.Linq.XDocument instance which is a DOM of the provided HTML markup;
/// an empty document when the markup is empty or whitespace only.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlContent"/> is null.</exception>
public XDocument BuildDocument(string htmlContent)
{
    if (htmlContent == null)
    {
        throw new ArgumentNullException("htmlContent");
    }

    if (htmlContent.Trim().Length == 0)
    {
        return new XDocument();
    }

    // "trim end" htmlContent to ...</html>$ (codinghorror.com puts some scripts after the </html> - sic!)
    // The search is ordinal (markup, not linguistic text) and ignores case so that
    // "</HTML>" is recognised as well.
    const string htmlEnd = "</html";
    int indexOfHtmlEnd = htmlContent.LastIndexOf(htmlEnd, StringComparison.OrdinalIgnoreCase);
    if (indexOfHtmlEnd != -1)
    {
        int indexOfHtmlEndBracket = htmlContent.IndexOf('>', indexOfHtmlEnd);
        if (indexOfHtmlEndBracket != -1)
        {
            htmlContent = htmlContent.Substring(0, indexOfHtmlEndBracket + 1);
        }
    }

    // load the document using sgml reader
    using (var sgmlReader = new SgmlReader())
    {
        sgmlReader.CaseFolding = CaseFolding.ToLower;
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.None;

        // The UTF-8 round trip through StreamReader is kept as is (it also strips a leading BOM).
        using (var sr = new StreamReader(new MemoryStream(Encoding.UTF8.GetBytes(htmlContent))))
        {
            sgmlReader.InputStream = sr;
            var document = XDocument.Load(sgmlReader);
            return document;
        }
    }
}
/// <summary>
/// Parses markup into an XElement. Input that mentions the XHTML namespace within its
/// first 200 characters is parsed as XML; everything else is parsed as HTML via SgmlReader.
/// </summary>
/// <param name="html">XHTML or HTML markup.</param>
/// <returns>The root element of the parsed markup.</returns>
public static XElement ReadHtmlAsXhtml(string html)
{
    // Detect XHTML by looking for the namespace near the start.
    // IndexOf returns -1 when the namespace is absent; the old test "IndexOf(...) < 200"
    // was true for -1, so plain HTML was wrongly sent to the strict XML parser.
    int namespaceIndex = html.IndexOf("http://www.w3.org/1999/xhtml", StringComparison.Ordinal);
    if (namespaceIndex >= 0 && namespaceIndex < 200)
    {
        // must be xhtml, so just parse as xml
        return XElement.Parse(html);
    }

    // probably html, so parse as sgml
    SgmlReader sgml = new SgmlReader();
    sgml.DocType = "HTML";
    sgml.WhitespaceHandling = WhitespaceHandling.All;
    sgml.CaseFolding = Sgml.CaseFolding.ToLower;
    sgml.InputStream = new StringReader(html);
    return XElement.Load(sgml);
}
/// <summary>
/// Returns the SgmlDtd registered under <paramref name="version"/> and <paramref name="DTD"/>.
/// When DTD+version passes checkAvailableVersion and nothing is cached for the version yet,
/// the DTD is first read through an SgmlReader whose SystemLiteral points at the file named
/// in availableVersion (relative to the application base directory) and then cached.
/// </summary>
/// <param name="version">Key of the outer cache dictionary.</param>
/// <param name="DTD">Key of the inner cache dictionary.</param>
public SgmlDtd getDTD(String version, String DTD)
{
    if (log.IsDebugEnabled)
    {
        log.Debug("getDTD(version: " + version + ", DTD: " + DTD + ")");
    }

    bool mustLoad = this.checkAvailableVersion(DTD+version) && !this.version.ContainsKey(version);
    if (mustLoad)
    {
        SgmlReader sgml = new SgmlReader();
        sgml.CaseFolding = Sgml.CaseFolding.ToLower;

        String dtdPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, this.availableVersion[DTD+version]);
        if (log.IsDebugEnabled)
        {
            log.Debug("sgmlArticle: " + dtdPath);
        }
        sgml.SystemLiteral = dtdPath;

        Dictionary<String, SgmlDtd> dtdByName = new Dictionary<String, SgmlDtd>();
        dtdByName.Add(DTD, sgml.Dtd);
        if (log.IsDebugEnabled)
        {
            log.Debug("dtd.Add(DTD: " + DTD + ", reader.Dtd: " + sgml.Dtd.ToString() + ")");
        }

        this.version.Add(version, dtdByName);
        if (log.IsDebugEnabled)
        {
            log.Debug("this.version.Add(version: " + version + ", dtd: " + dtdByName.ToString() + ")");
        }
    }

    if (log.IsDebugEnabled)
    {
        log.Debug("return this.version[version: " + version + "][DTD: " + DTD + "]");
    }
    return this.version[version][DTD];
}
/// <summary>
/// Loads the HTML at <paramref name="uri"/> into an XmlDocument via SgmlReader, selects
/// the node matching the XPath <paramref name="path"/> and builds an XdmNode from it.
/// </summary>
/// <param name="uri">Address assigned to the reader's Href.</param>
/// <param name="path">XPath expression passed to SelectSingleNode.</param>
/// <returns>The node built by the processor's document builder.</returns>
private static XdmNode getXdmNode(String uri, String path)
{
    // The former try/catch blocks only re-threw (with unused exception variables), so
    // removing them does not change which exceptions reach the caller.
    XmlDocument htmlDoc = new XmlDocument();

    // The reader loads from a URI; dispose it once the document is loaded.
    using (SgmlReader sr = new SgmlReader())
    {
        sr.Href = uri;
        htmlDoc.Load(sr);
    }

    XmlNode html = htmlDoc.SelectSingleNode(path);
    Processor processor = new Processor();
    return processor.NewDocumentBuilder().Build(html);
}
/// <summary>
/// Parses HTML content into an XmlDocument via SgmlReader and traces how long it took.
/// </summary>
/// <param name="content">HTML markup to parse.</param>
/// <returns>The loaded document.</returns>
public XmlDocument GetDocFromContent(string content)
{
    // Stopwatch is monotonic; DateTime.Now differences can be skewed by clock changes.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    XmlDocument doc = new XmlDocument();
    doc.XmlResolver = null;

    using (var streamReader = new StringReader(content))
    using (var sgmlReader = new SgmlReader { DocType = "HTML", InputStream = streamReader })
    {
        doc.Load(sgmlReader);
    }

    stopwatch.Stop();
    Trace.WriteLine(string.Format("Cleansed html in {0} milliseconds", stopwatch.Elapsed.TotalMilliseconds), "WebpageCleaner");
    return doc;
}
/// <summary>
/// Parses HTML markup into an XDocument via SgmlReader. Any exception raised while
/// parsing is swallowed and an empty document is returned instead.
/// </summary>
/// <param name="htmlContent">HTML markup to parse.</param>
private static XDocument LoadDocument(string htmlContent)
{
    try
    {
        using (var sgmlReader = new SgmlReader())
        {
            sgmlReader.CaseFolding = CaseFolding.ToLower;
            sgmlReader.DocType = "HTML";
            sgmlReader.WhitespaceHandling = WhitespaceHandling.None;

            // The markup is fed to the reader as a UTF-8 byte stream.
            byte[] utf8 = Encoding.UTF8.GetBytes(htmlContent);
            using (var input = new StreamReader(new MemoryStream(utf8)))
            {
                sgmlReader.InputStream = input;
                return XDocument.Load(sgmlReader);
            }
        }
    }
    catch (Exception)
    {
        return new XDocument();
    }
}
/// <summary>
/// Runs one reader test: feeds <paramref name="source"/> through an SgmlReader, lets
/// <paramref name="callback"/> copy it to an XmlTextWriter, checks that the produced
/// text loads as XML, and returns it trimmed with all "\r" removed.
/// </summary>
/// <param name="caseFolding">Case folding applied by the reader.</param>
/// <param name="doctype">Doc type assigned to the reader.</param>
/// <param name="format">
/// When true the reader drops whitespace and the writer indents; otherwise all whitespace is kept.
/// </param>
/// <param name="source">SGML/HTML input text.</param>
/// <param name="callback">Delegate that transfers content from the reader to the writer.</param>
private static string RunTest(CaseFolding caseFolding, string doctype, bool format, string source, XmlReaderTestCallback callback)
{
    // Set up the SGML reader.
    WhitespaceHandling whitespace;
    if (format)
    {
        whitespace = WhitespaceHandling.None;
    }
    else
    {
        whitespace = WhitespaceHandling.All;
    }

    XmlReader reader = new SgmlReader
    {
        CaseFolding = caseFolding,
        DocType = doctype,
        InputStream = new StringReader(source),
        WhitespaceHandling = whitespace
    };

    // Wrap the reader in a LoggingXmlReader when debugging.
    if (_debug)
    {
        reader = new LoggingXmlReader(reader, Console.Out);
    }

    // Set up the XML writer.
    var output = new StringWriter();
    var xmlWriter = new XmlTextWriter(output);
    if (format)
    {
        xmlWriter.Formatting = Formatting.Indented;
    }

    callback(reader, xmlWriter);
    xmlWriter.Close();

    // The text produced from the parsed document.
    string actual = output.ToString();

    // The output must itself be parseable XML.
    try
    {
        using (var check = new StringReader(actual))
        {
            new XmlDocument().Load(check);
        }
    }
    catch (Exception)
    {
        Assert.Fail("unable to parse sgml reader output:\n{0}", actual);
    }

    return actual.Trim().Replace("\r", "");
}
/// <summary>
/// Reads the HTML file at <paramref name="path"/>, classifies it by marker strings near the
/// start of the file (first 5000 characters for DroidDoc2, first 500 for the javadoc
/// generators) and returns the root element of the document parsed with SgmlReader.
/// </summary>
/// <param name="path">Path passed to ReadHtmlFile.</param>
/// <param name="kind">Receives the detected kind; later matches override earlier ones.</param>
/// <returns>The root element of the parsed document.</returns>
/// <exception cref="NotSupportedException">No marker matched, i.e. the kind stayed DroidDoc.</exception>
XElement GetJavaDocFile(string path, out JavaDocKind kind)
{
    kind = JavaDocKind.DroidDoc;
    string rawHTML = ReadHtmlFile(path);

    // Search windows at the start of the file, clamped to its length.
    string first5000 = rawHTML.Substring(0, Math.Min(rawHTML.Length, 5000));
    string first500 = rawHTML.Substring(0, Math.Min(rawHTML.Length, 500));

    bool isDroidDoc2 = first5000.IndexOf("<body class=\"gc-documentation develop reference api ", StringComparison.Ordinal) > 0;
    bool isJava6 = first500.IndexOf("Generated by javadoc (build 1.6", StringComparison.Ordinal) > 0;
    bool isJava7 = first500.IndexOf("Generated by javadoc (version 1.7", StringComparison.Ordinal) > 0;
    bool isJava8 = first500.IndexOf("Generated by javadoc (1.8", StringComparison.Ordinal) > 0;

    if (isDroidDoc2)
    {
        kind = JavaDocKind.DroidDoc2;
    }
    if (isJava6)
    {
        kind = JavaDocKind.Java6;
    }
    if (isJava7)
    {
        kind = JavaDocKind.Java7;
    }
    if (isJava8)
    {
        kind = JavaDocKind.Java8;
    }

    if (kind == JavaDocKind.DroidDoc)
    {
        throw new NotSupportedException("Old DroidDoc is not supported anymore.");
    }

    var sgml = new Sgml.SgmlReader()
    {
        InputStream = new StringReader(rawHTML),
        CaseFolding = Sgml.CaseFolding.ToLower,
        Dtd = HtmlDtd
    };
    XDocument parsed = XDocument.Load(sgml);
    return parsed.Root;
}
/// <summary>
/// Parses the input "&amp;test" (an entity reference without terminator) and checks that the
/// resulting element has a value whose last character is not char 65535.
/// </summary>
public void Test_for_illegal_char_value()
{
    const string source = "&test";

    var sgml = new SgmlReader();
    sgml.DocType = "HTML";
    sgml.WhitespaceHandling = WhitespaceHandling.All;
    sgml.StripDocType = true;
    sgml.InputStream = new StringReader(source);
    sgml.CaseFolding = CaseFolding.ToLower;

    // test
    System.Xml.Linq.XElement parsed = System.Xml.Linq.XElement.Load(sgml);
    string value = parsed.Value;

    Assert.IsFalse(string.IsNullOrEmpty(value), "element has no value");

    char lastChar = value[value.Length - 1];
    Assert.AreNotEqual(char.MaxValue, lastChar, "unexpected -1 as last char");
}
/// <summary>
/// Checks that MoveToElement can be called after iterating over all attributes of an
/// element, and that ReadInnerXml can then be read from that element.
/// </summary>
public void Test_MoveToNextAttribute()
{
    var sgml = new SgmlReader();
    sgml.InputStream = new StringReader("<test id='10' x='20'><a/><!--comment-->test</test>");

    Assert.IsTrue(sgml.Read());

    // Walk over every attribute of the first node, logging each name.
    while (sgml.MoveToNextAttribute())
    {
        _log.Debug(sgml.Name);
    }

    // Return to the owning element; nothing more to do if that is not possible.
    if (!sgml.MoveToElement())
    {
        return;
    }
    _log.Debug(sgml.ReadInnerXml());
}
/// <summary>
/// Command-line driver: configures an SgmlReader from the option arguments, then converts
/// either a single input (file, URL or stdin) or every file matching a wildcard pattern.
/// Prints the usage text and returns when an option is unknown or lacks its value.
/// </summary>
/// <param name="args">
/// Options start with '-' or '/'. The first non-option argument is the input URI and the
/// second is the output file.
/// </param>
public void Run(string[] args)
{
    SgmlReader reader = new SgmlReader();
    string inputUri = null;

    for (int i = 0; i < args.Length; i++)
    {
        string arg = args[i];

        // An empty argument is neither an option nor a usable name; indexing arg[0] used to throw.
        if (arg.Length == 0)
        {
            continue;
        }

        if (arg[0] == '-' || arg[0] == '/')
        {
            string option = arg.Substring(1);

            // These options consume the following argument. If it is missing, show the usage
            // text instead of running past the end of the array (IndexOutOfRangeException).
            bool needsValue = option == "e" || option == "dtd" || option == "proxy" || option == "encoding";
            bool showUsage = needsValue && i + 1 >= args.Length;

            if (!showUsage)
            {
                switch (option)
                {
                    case "e":
                        string errorlog = args[++i];
                        if (string.Equals(errorlog, "$stderr", StringComparison.OrdinalIgnoreCase))
                        {
                            reader.ErrorLog = Console.Error;
                        }
                        else
                        {
                            reader.ErrorLogFile = errorlog;
                        }
                        break;
                    case "html":
                        reader.DocType = "HTML";
                        break;
                    case "dtd":
                        reader.SystemLiteral = args[++i];
                        break;
                    case "proxy":
                        proxy = args[++i];
                        reader.WebProxy = proxy;
                        break;
                    case "encoding":
                        encoding = Encoding.GetEncoding(args[++i]);
                        break;
                    case "f":
                        formatted = true;
                        reader.WhitespaceHandling = WhitespaceHandling.None;
                        break;
                    case "noxml":
                        noxmldecl = true;
                        break;
                    case "doctype":
                        reader.StripDocType = false;
                        break;
                    case "lower":
                        reader.CaseFolding = CaseFolding.ToLower;
                        break;
                    case "upper":
                        reader.CaseFolding = CaseFolding.ToUpper;
                        break;
                    default:
                        showUsage = true;
                        break;
                }
            }

            if (showUsage)
            {
                // NOTE(review): "-base" is listed below but has no case in the switch above,
                // and "-doctype" is handled but not listed — confirm the intended option set.
                Console.WriteLine("Usage: SgmlReader <options> [InputUri] [OutputFile]");
                Console.WriteLine("-e log Optional log file name, name of '$STDERR' will write errors to stderr");
                Console.WriteLine("-f Whether to pretty print the output.");
                Console.WriteLine("-html Specify the built in HTML dtd");
                Console.WriteLine("-dtd url Specify other SGML dtd to use");
                Console.WriteLine("-base Add base tag to output HTML");
                Console.WriteLine("-noxml Do not add XML declaration to the output");
                Console.WriteLine("-proxy svr:80 Proxy server to use for http requests");
                Console.WriteLine("-encoding name Specify an encoding for the output file (default UTF-8)");
                Console.WriteLine("-lower Convert input tags to lower case");
                Console.WriteLine("-upper Convert input tags to upper case");
                Console.WriteLine();
                Console.WriteLine("InputUri The input file or http URL (default stdin). ");
                Console.WriteLine(" Supports wildcards for local file names.");
                Console.WriteLine("OutputFile Output file name (default stdout)");
                Console.WriteLine(" If input file contains wildcards then this just specifies the output file extension (default .xml)");
                return;
            }
        }
        else
        {
            if (inputUri == null)
            {
                inputUri = arg;
                // Inputs with an HTML extension select the HTML doc type automatically.
                string ext = Path.GetExtension(arg).ToLowerInvariant();
                if (ext == ".htm" || ext == ".html")
                {
                    reader.DocType = "HTML";
                }
            }
            else if (output == null)
            {
                output = arg;
            }
        }
    }

    if (inputUri != null && !inputUri.StartsWith("http://", StringComparison.Ordinal) && inputUri.IndexOfAny(new char[] { '*', '?' }) >= 0)
    {
        // wild card processing of a directory of files.
        string path = Path.GetDirectoryName(inputUri);
        if (path == "")
        {
            path = ".\\";
        }

        // With wildcards the output argument only supplies the extension of the generated files.
        string ext = ".xml";
        if (output != null)
        {
            ext = Path.GetExtension(output);
        }

        foreach (string uri in Directory.GetFiles(path, Path.GetFileName(inputUri)))
        {
            Console.WriteLine("Processing: " + uri);
            string file = Path.GetFileName(uri);
            output = Path.GetDirectoryName(uri) + Path.DirectorySeparatorChar + Path.GetFileNameWithoutExtension(file) + ext;
            Process(reader, uri);
            reader.Close();
        }
        return;
    }

    Process(reader, inputUri);
    reader.Close();
    return;
}
/// <summary>
/// Converts one input to XML: reads from <paramref name="uri"/> (or standard input when it
/// is null) through <paramref name="reader"/> and writes the nodes to the output file, or to
/// standard output when no output file is set.
/// </summary>
/// <param name="reader">Configured reader; its input is assigned here.</param>
/// <param name="uri">Input file or URL, or null to read standard input.</param>
void Process(SgmlReader reader, string uri)
{
    if (uri == null)
    {
        reader.InputStream = Console.In;
    }
    else
    {
        reader.Href = uri;
    }

    // The encoding is taken from the reader only while none has been set yet.
    if (this.encoding == null)
    {
        this.encoding = reader.GetEncoding();
    }

    XmlTextWriter w = output != null
        ? new XmlTextWriter(output, this.encoding)
        : new XmlTextWriter(Console.Out);

    try
    {
        if (formatted)
        {
            w.Formatting = Formatting.Indented;
        }
        if (!noxmldecl)
        {
            w.WriteStartDocument();
        }

        reader.Read();
        while (!reader.EOF)
        {
            w.WriteNode(reader, true);
        }
        w.Flush();
    }
    finally
    {
        // Close the writer even if parsing fails, so the output file handle is released
        // (this matters when many files are processed in one run).
        w.Close();
    }
}
/// <summary>
/// Imports aliases from an uploaded HTML file. Every paragraph ("html//body//p") that
/// contains an "a//h1" node is treated as one alias: the h1 text is its name and the "b"
/// text its remark. Rows of a table inside the paragraph (after the header row) become
/// child aliases linked through parentid.
/// </summary>
/// <param name="file">Uploaded file whose InputStream holds the HTML to import.</param>
/// <returns>
/// Content result carrying JSON with "success" (1, or 0 on error) and "message"
/// (the number of processed aliases, or the exception message).
/// </returns>
public ActionResult ImportAliases(HttpPostedFileBase file)
{
    byte result = 1;
    string msg = null;
    try
    {
        // setup SgmlReader
        SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.IgnoreDtd = true;
        sgmlReader.InputStream = new StreamReader(file.InputStream);
        // create document
        XmlDocument xdoc = new XmlDocument();
        xdoc.PreserveWhitespace = true;
        xdoc.Load(sgmlReader);
        // id: database id of the alias currently being processed; n: number of aliases processed
        int id, n = 0;
        XPathNavigator xnav = xdoc.CreateNavigator();
        XPathNodeIterator it_p = xnav.Select("html//body//p");
        while (it_p.MoveNext())
        {
            // if exists
            XPathNavigator nav_h1 = it_p.Current.SelectSingleNode("a//h1");
            if (nav_h1 != null)
            {
                // NOTE(review): nav_b is dereferenced without a null check; a paragraph with an
                // "a//h1" but no "b" child would throw and end the import through the catch
                // below — confirm this is intended.
                XPathNavigator nav_b = it_p.Current.SelectSingleNode("b");
                Alias alias = new Alias();
                alias.Name = nav_h1.Value;
                alias.Remark = nav_b.Value;
                System.Diagnostics.Debug.WriteLine(alias.Name);
                if (db.Exists <Alias>("name = @0 AND parentid IS NULL", alias.Name))
                {
                    // A top-level alias with this name already exists: look up its id and delete
                    // its children so they can be re-inserted below. The existing row itself is
                    // not updated here.
                    id = db.ExecuteScalar <int>("SELECT IFNULL(a.id, 0) FROM qb_aliases a WHERE a.name = @0 AND a.parentid IS NULL", alias.Name);
                    db.Delete <Alias>("WHERE parentid = @0", id);
                }
                else
                {
                    // New top-level alias: insert it and read back the generated row id.
                    id = db.ExecuteScalar <int>("INSERT INTO qb_aliases(name, remark) VALUES(@0, @1);\nSELECT last_insert_rowid();", alias.Name, alias.Remark);
                }
                XPathNodeIterator it_tr = it_p.Current.Select("table//tr");
                if (it_tr.Count != 0)
                {
                    IList <Alias> fields = new List <Alias>();
                    // skip 1st tr - headers
                    it_tr.MoveNext();
                    while (it_tr.MoveNext())
                    {
                        // First cell supplies the child's name, second cell its remark.
                        Alias alias1 = new Alias();
                        XPathNavigator nav_td1 = it_tr.Current.SelectSingleNode("td[1]");
                        if (nav_td1 != null)
                        {
                            alias1.Name = nav_td1.Value;
                        }
                        XPathNavigator nav_td2 = it_tr.Current.SelectSingleNode("td[2]");
                        if (nav_td2 != null)
                        {
                            alias1.Remark = nav_td2.Value;
                        }
                        fields.Add(alias1);
                    }
                    if (fields.Count > 0)
                    {
                        // Build one multi-row INSERT: each child contributes the placeholder
                        // group "@ix, @ix+1, @ix+2" and the values (name, remark, parent id).
                        int ix = 0;
                        List <string> keys = new List <string>();
                        List <object> vals = new List <object>();
                        foreach (Alias a in fields)
                        {
                            keys.Add("@" + string.Join(", @", new int[] { ix, ix + 1, ix + 2 }));
                            vals.AddRange(new object[] { a.Name, a.Remark, id });
                            ix += 3;
                        }
                        string sql = "INSERT INTO qb_aliases(name, remark, parentid) VALUES (" + string.Join("), (", keys) + ")";
                        db.Execute(sql, vals.ToArray());
                        System.Diagnostics.Debug.WriteLine(db.LastSQL);
                    }
                }
                n++;
                System.Diagnostics.Debug.WriteLine(n);
            }
        }
        msg = n.ToString();
        // Remove the cached output of the alias endpoints after the import.
        Response.RemoveOutputCacheItem("/Admin/GetAlias");
        Response.RemoveOutputCacheItem("/Admin/GetAliases");
    }
    catch (Exception e)
    {
        // Any failure is reported to the caller through the JSON payload.
        msg = e.Message;
        result = 0;
    }
    string json = Newtonsoft.Json.JsonConvert.SerializeObject(new { success = result, message = msg });
    return(Content(json)); // for IE
}
/// <summary>
/// Converts SGML to an XML string using SgmlReader. A DOCTYPE is added when the result has
/// none, an optional entities file is installed as the internal subset, and the
/// revst/revend/cocst/cocend elements are rewritten as empty elements.
/// </summary>
/// <param name="sgmlInput">Either a TextReader supplying the SGML, or a value assigned to the reader's Href.</param>
/// <param name="sgmlDTD">Optional DTD text used as the reader's internal subset.</param>
/// <param name="entities">Optional path of a file containing entity declarations.</param>
/// <returns>The XML text, or the exception message if the conversion fails (after showing a message box).</returns>
public string ConvertSGMtoXML(dynamic sgmlInput, string sgmlDTD = null, string entities = null)
{
    try
    {
        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;

        // If dtd is provided, we can use it to parse the xml
        if (sgmlDTD != null)
        {
            sgmlReader.IgnoreDtd = false;
            sgmlReader.InternalSubset = sgmlDTD;
        }

        // Check what type of sgmlInput is provided.
        // InputStream is fed from a reader, so the test must be for TextReader; the former
        // "is TextWriter" check never matched a usable input and sent readers to Href.
        if (sgmlInput is TextReader)
        {
            sgmlReader.InputStream = sgmlInput;
        }
        else
        {
            sgmlReader.Href = sgmlInput;
        }

        XDocument xmlDoc = XDocument.Load(sgmlReader);

        // Check to see if there is a doctype declaration in the xmldoc
        if (xmlDoc.DocumentType == null)
        {
            string rootName = xmlDoc.Root.Name.ToString();
            xmlDoc.Root.AddBeforeSelf(new XDocumentType(rootName, "", "", ""));
        }

        // If entities file is provided, read it and add it to the xml doc
        if (entities != null)
        {
            string docTypeEntities = File.ReadAllText(entities);
            if (docTypeEntities.Contains("<!DOCTYPE") || docTypeEntities.Contains("]>"))
            {
                // Strip the DOCTYPE wrapper so only the declarations remain.
                docTypeEntities = Regex.Replace(docTypeEntities, @"<!DOCTYPE(.+?)\[|\]\>", "");
            }
            xmlDoc.DocumentType.InternalSubset = docTypeEntities;
        }

        // Convert the xml to string in order to fix it
        string xmlString = xmlDoc.ToString();

        // The last test used to repeat "</revst>", so a document whose only marker was
        // "</cocend>" was never fixed up.
        if (xmlString.Contains("</revst>") || xmlString.Contains("</revend>") ||
            xmlString.Contains("</cocst>") || xmlString.Contains("</cocend>"))
        {
            xmlString = xmlString.Replace("<revst>", "<revst/>").Replace("</revst>", "")
                .Replace("<revend>", "<revend/>").Replace("</revend>", "")
                .Replace("<cocst>", "<cocst/>").Replace("</cocst>", "")
                .Replace("<cocend>", "<cocend/>").Replace("</cocend>", "");
        }

        return(xmlString);
    }
    catch (Exception e)
    {
        MessageBox.Show(e.Message, "Unable to transform SGML to XML");
        return(e.Message);
    }
}
/// <summary>
/// Exports one documentation page: loads http://groups.google.com:80 + <paramref name="href"/>,
/// passes the links inside its "layout" div to AddDocumentsFromLinks, and appends a JSON
/// document for the page to <paramref name="array"/>. Pages already present in
/// <paramref name="crawled"/> are skipped.
/// </summary>
/// <param name="array">Array receiving the exported documents.</param>
/// <param name="crawled">Set of hrefs already processed; <paramref name="href"/> is added to it.</param>
/// <param name="href">Path of the page on groups.google.com.</param>
private static void ExportDocument(JArray array, HashSet<string> crawled, string href)
{
    if (crawled.Add(href) == false)
    {
        return;
    }

    Console.WriteLine("Reading {0}", href);

    var uri = new UriBuilder("http", "groups.google.com", 80, href).Uri;
    var linkDoc = new XmlDocument();

    // The reader fetches the page over HTTP; dispose it as soon as the document is loaded.
    using (var linkReader = new SgmlReader { Href = uri.ToString() })
    {
        linkDoc.Load(linkReader);
    }

    // SelectSingleNode returns null when nothing matches; report which page is malformed
    // instead of failing with a bare NullReferenceException.
    var layout = linkDoc.SelectSingleNode("//div[@class='layout']");
    if (layout == null)
    {
        throw new InvalidOperationException("Page '" + uri + "' does not contain a div with class 'layout'.");
    }

    var titleNode = layout.SelectSingleNode(".//h2");
    if (titleNode == null)
    {
        throw new InvalidOperationException("Page '" + uri + "' does not contain an h2 title inside the layout div.");
    }

    var title = titleNode.InnerText;
    var name = Path.GetFileName(uri.LocalPath).Replace("-", "_");

    AddDocumentsFromLinks(array, crawled, layout.SelectNodes(".//a"));

    Console.WriteLine("Writing {0}", title);

    var index = new JObject(
        new JProperty("Html", FixLinks(layout.InnerXml)),
        new JProperty("Name", title));

    array.Add(new JObject(
        new JProperty("DocId", "raven_documentation/" + name),
        new JProperty("Document", index),
        new JProperty("Type", "raven documentation"),
        new JProperty("Metadata", new JObject(new JProperty("Raven-View-Template", "/raven/JSONTemplates/documentation.html")))
        ));
}