/// <summary>
/// Parses the snapshot text from <paramref name="searchStart"/> to the end of the
/// snapshot and looks for the closing tag of the element that starts there.
/// </summary>
/// <param name="snapshot">Text snapshot being searched.</param>
/// <param name="searchStart">Snapshot position at which parsing starts.</param>
/// <param name="searchFor">Text the located span must contain to be accepted.</param>
/// <returns>
/// A span of <c>searchFor.Length</c> characters at the computed closing-tag position, or
/// <c>null</c> when the element is empty, the text found there differs from
/// <paramref name="searchFor"/>, or parsing throws.
/// </returns>
private SnapshotSpan? FindClosingTag(ITextSnapshot snapshot, int searchStart, string searchFor)
{
    string remainder = snapshot.GetText(searchStart, snapshot.Length - searchStart);

    using (SgmlReader sgml = new SgmlReader())
    {
        sgml.InputStream = new StringReader(remainder);
        sgml.WhitespaceHandling = WhitespaceHandling.All;

        try
        {
            sgml.Read();
            if (sgml.IsEmptyElement)
            {
                return null;
            }

            // Skip all the nested nodes until the end element at depth 1 (or end of input).
            bool reachedEnd = false;
            while (!reachedEnd && sgml.Read())
            {
                reachedEnd = sgml.NodeType == XmlNodeType.EndElement && sgml.Depth == 1;
            }

            // Translate the reader's (line, column) back into a snapshot position.
            // If whitespace follows the closing tag the reader is positioned on it,
            // which is why the offsets below are subtracted.
            var firstLine = snapshot.GetLineFromPosition(searchStart);
            int columnOfStart = searchStart - firstLine.Start.Position;

            int tagPosition;
            if (sgml.LineNumber == 1)
            {
                // Closing tag sits on the same line as the opening one.
                tagPosition = firstLine.Start.Position + columnOfStart + sgml.LinePosition - 2;
            }
            else
            {
                int targetLineNumber = firstLine.LineNumber + sgml.LineNumber - 1;
                var targetLine = snapshot.GetLineFromLineNumber(targetLineNumber);
                tagPosition = targetLine.Start.Position + sgml.LinePosition - 1;
            }

            // Step back over "</" + element name + ">".
            tagPosition -= sgml.Name.Length + 3;

            var candidate = new SnapshotSpan(snapshot, tagPosition, searchFor.Length);
            string found = candidate.GetText();
            if (found == searchFor)
            {
                return candidate;
            }

            Trace.WriteLine(String.Format("Searching for '{0}', but found '{1}'.", searchFor, found));
            return null;
        }
        catch (Exception ex)
        {
            Trace.WriteLine(String.Format("Exception while parsing document: {0}.", ex.ToString()));
        }
    }

    return null;
}
/// <summary>
/// Creates an <see cref="XmlDocument"/> from HTML content, wrapped in a
/// <c>&lt;root&gt;</c> element.
/// </summary>
/// <param name="sContent">HTML fragment to convert.</param>
/// <returns>The parsed document; whitespace is preserved and no XmlResolver is used.</returns>
public static XmlDocument ParseHtml(string sContent)
{
    StringWriter xmlText = new StringWriter();

    // using blocks release the reader and writer even when parsing throws.
    using (SgmlReader reader = new SgmlReader())
    using (XmlTextWriter writer = new XmlTextWriter(xmlText))
    {
        reader.WhitespaceHandling = WhitespaceHandling.All;
        reader.CaseFolding = Sgml.CaseFolding.ToLower;
        reader.InputStream = new StringReader("<root>" + sContent + "</root>");

        writer.Formatting = Formatting.Indented;
        writer.WriteStartDocument();

        // WriteNode advances the reader to the next sibling, so Read() is only
        // needed once to position on the first node.
        reader.Read();
        while (!reader.EOF)
        {
            writer.WriteNode(reader, true);
        }

        writer.Flush();
    }

    // Create the document from the generated XML text.
    XmlDocument doc = new XmlDocument();
    doc.PreserveWhitespace = true;
    doc.XmlResolver = null;
    doc.LoadXml(xmlText.ToString());
    return doc;
}
/// <summary>
/// Reads SGML/HTML from <paramref name="uri"/> (or standard input when it is null)
/// and writes it as XML to the configured output, or to the console when no output is set.
/// </summary>
/// <param name="reader">Reader to configure and read from.</param>
/// <param name="uri">Location of the input, or null to read from <c>Console.In</c>.</param>
void Process(SgmlReader reader, string uri)
{
    if (uri == null)
    {
        reader.InputStream = Console.In;
    }
    else
    {
        reader.Href = uri;
    }

    this.encoding ??= reader.GetEncoding();

    XmlTextWriter w = output != null
        ? new XmlTextWriter(output, this.encoding)
        : new XmlTextWriter(Console.Out);

    // Dispose the writer even if the conversion throws, so the output is flushed and closed.
    using (w)
    {
        if (formatted)
        {
            w.Formatting = Formatting.Indented;
        }
        if (!noxmldecl)
        {
            w.WriteStartDocument();
        }

        // WriteNode advances the reader itself; loop until it reaches end of input.
        reader.Read();
        while (!reader.EOF)
        {
            w.WriteNode(reader, true);
        }
        w.Flush();
    }
}
/// <summary>
/// Converts HTML to indented XML with SgmlReader and optionally selects node values
/// from it with an XPath expression.
/// </summary>
/// <param name="html">HTML text to convert.</param>
/// <param name="xpathNavPath">
/// XPath expression to evaluate against the converted XML, or null to return the whole XML.
/// </param>
/// <returns>
/// The XML text, or the values of the selected nodes (one per line), with carriage returns
/// turned into line feeds and doubled line feeds collapsed. On any error the exception
/// message is returned instead.
/// </returns>
public static string GetWellFormedHTML(string html, string xpathNavPath)
{
    try
    {
        string xml;

        // using blocks replace the manual Close() calls, which dereferenced
        // variables that could still be null when the failure happened early.
        using (SgmlReader reader = new SgmlReader())
        using (StringWriter sw = new StringWriter())
        using (XmlTextWriter writer = new XmlTextWriter(sw))
        {
            reader.DocType = "HTML";
            reader.InputStream = new StringReader(html);
            writer.Formatting = Formatting.Indented;

            // WriteNode leaves the reader on the node after the one it copied, so
            // Read() is only called here for nodes that are skipped.
            reader.Read();
            while (!reader.EOF)
            {
                if (reader.NodeType == XmlNodeType.Whitespace)
                {
                    reader.Read();
                }
                else
                {
                    writer.WriteNode(reader, true);
                }
            }

            writer.Flush();
            xml = sw.ToString();
        }

        string result;
        if (xpathNavPath == null)
        {
            result = xml;
        }
        else
        {
            // Filter out nodes from the converted HTML.
            StringBuilder sb = new StringBuilder();
            XPathDocument doc = new XPathDocument(new StringReader(xml));
            XPathNavigator nav = doc.CreateNavigator();
            XPathNodeIterator nodes = nav.Select(xpathNavPath);
            while (nodes.MoveNext())
            {
                sb.Append(nodes.Current.Value + "\n");
            }
            result = sb.ToString();
        }

        result = result.Replace("\r", "\n");
        result = result.Replace("\n\n", "\n");
        return result;
    }
    catch (Exception exp)
    {
        return exp.Message;
    }
}
/// <summary>
/// Converts HTML markup to XML. Callers should invoke this inside a try-catch block.
/// </summary>
/// <param name="html">HTML text to convert.</param>
/// <returns>The XML text, or an empty string when the input is null or whitespace.</returns>
public static string HtmlToXml(string html)
{
    if (string.IsNullOrWhiteSpace(html))
    {
        return string.Empty;
    }

    html = StringUtils.ReplaceIgnoreCase(html, "<br>", "<br />");
    // NOTE(review): as written this replaces "&#" with itself — confirm the intended pattern.
    html = StringUtils.ReplaceIgnoreCase(html, "&#", "&#");
    html = html.Replace(" @", " hexadecimal-value-0x40"); // vuejs shorthand @click
    html = html.Replace(" :", " hexadecimal-value-0x3a"); // vuejs shorthand :href

    var sw = new System.IO.StringWriter();

    // Dispose the reader and writer even when the conversion throws.
    using (var reader = new SgmlReader { DocType = "HTML" })
    using (var w = new XmlTextWriter(sw))
    {
        reader.InputStream = new System.IO.StringReader(html);

        // WriteNode advances the reader itself; loop until it reaches end of input.
        reader.Read();
        while (!reader.EOF)
        {
            w.WriteNode(reader, true);
        }
        w.Flush();
    }

    return sw.ToString();
}
/// <summary>
/// Converts HTML markup to XML. Callers should invoke this inside a try-catch block.
/// </summary>
/// <param name="strInputHtml">HTML text to convert.</param>
/// <returns>The XML text produced from the input.</returns>
public static string HtmlToXml(string strInputHtml)
{
    strInputHtml = StringUtils.ReplaceIgnoreCase(strInputHtml, "<br>", "<br />");
    // NOTE(review): as written this replaces "&#" with itself — confirm the intended pattern.
    strInputHtml = StringUtils.ReplaceIgnoreCase(strInputHtml, "&#", "&#");

    var sw = new System.IO.StringWriter();

    // Dispose the reader and writer even when the conversion throws.
    using (var reader = new SgmlReader { DocType = "HTML" })
    using (var w = new XmlTextWriter(sw))
    {
        reader.InputStream = new System.IO.StringReader(strInputHtml);

        // WriteNode advances the reader itself; loop until it reaches end of input.
        reader.Read();
        while (!reader.EOF)
        {
            w.WriteNode(reader, true);
        }
        w.Flush();
    }

    return sw.ToString();
}
/// <summary>
/// Converts an HTML string to XHTML text on a best-effort basis: conversion stops
/// at the first node that cannot be written and whatever was produced so far is returned.
/// </summary>
/// <param name="strInputHtml">HTML text to convert.</param>
/// <returns>
/// The converted markup, or <c>&lt;html&gt;&lt;/html&gt;</c> when the input is null or empty.
/// </returns>
private string ProcessString(string strInputHtml)
{
    if (string.IsNullOrEmpty(strInputHtml))
    {
        return "<html></html>";
    }

    StringWriter sw = new StringWriter();

    // Dispose the reader and writer on every path.
    using (SgmlReader rd = new SgmlReader())
    using (XmlTextWriter xw = new XmlTextWriter(sw))
    {
        rd.DocType = "HTML";
        rd.InputStream = new StringReader(strInputHtml);

        rd.Read();
        while (!rd.EOF)
        {
            try
            {
                xw.WriteNode(rd, true);
            }
            catch (Exception)
            {
                // Deliberate best effort: keep what has been written so far.
                break;
            }
        }
        xw.Flush();
    }

    return sw.ToString();
}
/// <summary>
/// Configures the reader's input, then either dumps debug output, starts a crawl,
/// or converts the input to XML on the configured output (console when none is set).
/// </summary>
/// <param name="reader">Reader to configure and read from.</param>
/// <param name="uri">Location of the input, or null to read from <c>Console.In</c>.</param>
/// <param name="loadAsStream">
/// When true the input is opened here (local file or web request) and handed to the
/// reader as a stream; otherwise the reader resolves <paramref name="uri"/> itself.
/// </param>
void Process(SgmlReader reader, string uri, bool loadAsStream)
{
    if (uri == null)
    {
        reader.InputStream = Console.In;
    }
    else if (loadAsStream)
    {
        Uri location = new Uri(uri);
        if (location.IsFile)
        {
            reader.InputStream = new StreamReader(uri);
        }
        else
        {
            WebRequest wr = WebRequest.Create(location);
            reader.InputStream = new StreamReader(wr.GetResponse().GetResponseStream());
        }
    }
    else
    {
        reader.Href = uri;
    }

    if (debug)
    {
        Debug(reader);
        reader.Close();
        return;
    }
    if (crawl)
    {
        StartCrawl(reader, uri, basify);
        return;
    }

    if (this.encoding == null)
    {
        this.encoding = reader.GetEncoding();
    }

    XmlTextWriter w = output != null
        ? new XmlTextWriter(output, this.encoding)
        : new XmlTextWriter(Console.Out);

    // Dispose the writer even when the conversion throws something other than XmlException.
    using (w)
    {
        if (formatted)
        {
            w.Formatting = Formatting.Indented;
        }
        if (!noxmldecl)
        {
            w.WriteStartDocument();
        }

        if (testdoc)
        {
            // Round-trip through XmlDocument to check the reader's output loads as a DOM.
            XmlDocument doc = new XmlDocument();
            try
            {
                doc.Load(reader);
                doc.WriteTo(w);
            }
            catch (XmlException e)
            {
                Console.WriteLine("Error:" + e.Message);
                Console.WriteLine("at line " + e.LineNumber + " column " + e.LinePosition);
            }
        }
        else
        {
            // WriteNode advances the reader itself; loop until it reaches end of input.
            reader.Read();
            while (!reader.EOF)
            {
                w.WriteNode(reader, true);
            }
        }

        w.Flush();
    }
}
/// <summary>
/// Converts HTML to XML with SgmlReader and optionally extracts the nodes matched
/// by an XPath expression.
/// </summary>
/// <param name="htmlString">HTML text to convert.</param>
/// <param name="xpath">
/// XPath expression to evaluate, or null to return the whole converted XML.
/// </param>
/// <returns>
/// The XML text, or the outer XML of every matched node separated by spaces. An empty
/// string is returned for null input, input shorter than 10 characters after trimming,
/// or when any error occurs.
/// </returns>
public static string GetWellFormedHTML(string htmlString, string xpath)
{
    if (htmlString == null || htmlString.Trim().Length < 10)
    {
        return "";
    }

    // Rename every "xmlns" occurrence before parsing; presumably so the XPath can be
    // written without namespace prefixes — TODO confirm.
    htmlString = htmlString.Replace("xmlns", "buyao");

    try
    {
        string xml;

        // using blocks release everything on both the success and the failure path.
        using (SgmlReader reader = new SgmlReader())
        using (StringWriter sw = new StringWriter())
        using (XmlTextWriter writer = new XmlTextWriter(sw))
        {
            reader.DocType = "HTML";
            reader.InputStream = new StringReader(htmlString);
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument();

            // WriteNode leaves the reader on the node after the one it copied, so
            // Read() is only called for nodes that are skipped or that failed.
            bool more = reader.Read();
            while (more)
            {
                if (reader.NodeType == XmlNodeType.Whitespace)
                {
                    more = reader.Read();
                    continue;
                }

                try
                {
                    writer.WriteNode(reader, true);
                    more = !reader.EOF;
                }
                catch (Exception)
                {
                    // Best effort: discard the node that failed and move on.
                    more = reader.Read();
                }
            }

            xml = sw.ToString();
        }

        if (xpath == null)
        {
            return xml;
        }

        StringBuilder sb = new StringBuilder();
        XPathDocument doc = new XPathDocument(new StringReader(xml));
        XPathNavigator nav = doc.CreateNavigator();
        XPathNodeIterator nodes = nav.Select(xpath);
        while (nodes.MoveNext())
        {
            sb.Append(nodes.Current.OuterXml + " ");
        }
        return sb.ToString();
    }
    catch (Exception)
    {
        return "";
    }
}
/// <summary>
/// Diagnostic dump of the reader's node stream. Prints each node (type, name, first
/// attribute, value) to the console and reports nodes whose type is not allowed under
/// the current parent, or whose depth disagrees with the tracked element stack.
/// </summary>
/// <param name="sr">Reader to consume; it is read to the end.</param>
void Debug(SgmlReader sr)
{
    // Indexed by (int)XmlNodeType of the parent: the node types allowed as its content.
    NodeTypeFlags[] AllowedContentMap = new NodeTypeFlags[19] {
        NodeTypeFlags.None, // none
        NodeTypeFlags.Element | NodeTypeFlags.Attribute | NodeTypeFlags.Text | NodeTypeFlags.CDATA | NodeTypeFlags.EntityReference | NodeTypeFlags.ProcessingInstruction | NodeTypeFlags.Comment | NodeTypeFlags.Whitespace | NodeTypeFlags.SignificantWhitespace | NodeTypeFlags.EndElement, // element
        NodeTypeFlags.Text | NodeTypeFlags.EntityReference, // attribute
        NodeTypeFlags.None, // text
        NodeTypeFlags.None, // cdata
        NodeTypeFlags.None, // entity reference
        NodeTypeFlags.None, // entity
        NodeTypeFlags.None, // processing instruction
        NodeTypeFlags.None, // comment
        NodeTypeFlags.Comment | NodeTypeFlags.DocumentType | NodeTypeFlags.Element | NodeTypeFlags.EndElement | NodeTypeFlags.ProcessingInstruction | NodeTypeFlags.Whitespace | NodeTypeFlags.SignificantWhitespace | NodeTypeFlags.XmlDeclaration, // document
        NodeTypeFlags.None, // document type
        NodeTypeFlags.None, // document fragment (not expecting these)
        NodeTypeFlags.None, // notation
        NodeTypeFlags.None, // whitespace
        NodeTypeFlags.None, // significant whitespace
        NodeTypeFlags.None, // end element
        NodeTypeFlags.None, // end entity
        NodeTypeFlags.None, // filler
        NodeTypeFlags.None, // xml declaration
    };

    // Node types of the currently open containers.
    Stack s = new Stack();

    while (sr.Read())
    {
        if (sr.NodeType == XmlNodeType.EndElement)
        {
            s.Pop();
        }

        if (s.Count > 0)
        {
            XmlNodeType pt = (XmlNodeType)s.Peek();
            NodeTypeFlags f = NodeTypeMap[(int)sr.NodeType];
            if ((AllowedContentMap[(int)pt] & f) != f)
            {
                Console.WriteLine("Invalid content!!");
            }
        }

        if (s.Count != sr.Depth - 1)
        {
            Console.WriteLine("Depth is wrong!");
        }

        if ((sr.NodeType == XmlNodeType.Element && !sr.IsEmptyElement) ||
            sr.NodeType == XmlNodeType.Document)
        {
            s.Push(sr.NodeType);
        }

        // Indent by depth, then print the node itself.
        for (int i = 1; i < sr.Depth; i++)
        {
            Console.Write(" ");
        }
        Console.Write(sr.NodeType.ToString() + " " + sr.Name);

        if (sr.NodeType == XmlNodeType.Element && sr.AttributeCount > 0)
        {
            sr.MoveToAttribute(0);
            Console.Write(" (" + sr.Name + "=" + sr.Value + ")");
            sr.MoveToElement();
        }

        if (sr.Value != null)
        {
            Console.Write(" " + sr.Value.Replace("\n", " ").Replace("\r", ""));
        }
        Console.WriteLine();
    }
}
/// <summary>
/// Converts the returned page to XML with SgmlReader and collects (time, state)
/// pairs from the cells of the nested table inside the tracking-detail table.
/// </summary>
/// <param name="backstring">HTML text of the page to analyse.</param>
/// <returns>A <c>ResultInfo</c> for <c>querynum</c> with one entry per extracted row.</returns>
private ResultInfo getDetail(string backstring)
{
    XmlDocument doc = new XmlDocument();

    // Dispose the reader and writers even when conversion or loading throws.
    using (SgmlReader reader = new SgmlReader())
    using (StringWriter sw = new StringWriter())
    using (XmlTextWriter writer = new XmlTextWriter(sw))
    {
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(backstring);
        reader.WhitespaceHandling = WhitespaceHandling.None;
        writer.Formatting = Formatting.Indented;

        // WriteNode leaves the reader on the node after the one it copied, so
        // Read() is only called here for nodes that are skipped.
        reader.Read();
        while (!reader.EOF)
        {
            if (reader.NodeType == XmlNodeType.Whitespace)
            {
                reader.Read();
            }
            else
            {
                writer.WriteNode(reader, true);
            }
        }

        writer.Flush();
        doc.Load(new StringReader(sw.ToString()));
    }

    XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
    xnm.AddNamespace("bottum", "http://www.w3.org/1999/xhtml");
    XPathNavigator nav = doc.CreateNavigator();

    // The expression is tied to the layout of the returned page.
    string xpath = "//bottum:table[@id='ctl00_ContentPlaceHolder1_TrackDetail']/bottum:tr/bottum:td/bottum:div[8]/bottum:table/bottum:tr/bottum:td";
    XPathNodeIterator nodes = nav.Select(xpath, xnm);

    ResultInfo backinfo = new ResultInfo(querynum);
    int cellCount = nodes.Count;

    // Skip the first three cells (presumably a header row — TODO confirm).
    if (cellCount > 3)
    {
        nodes.MoveNext();
        nodes.MoveNext();
        nodes.MoveNext();
    }

    // Each following row is three cells: the first is the time, the third the state.
    for (int i = 1; i < cellCount / 3; i++)
    {
        nodes.MoveNext();
        string time = nodes.Current.Value;
        nodes.MoveNext();
        nodes.MoveNext();
        string state = nodes.Current.Value;
        backinfo.add(time, state);
    }

    return backinfo;
}
/// <summary>
/// Converts the returned page to XML with SgmlReader and collects (time, state)
/// pairs from the cells of the table with id <c>GridView1</c>.
/// </summary>
/// <param name="backstring">HTML text of the page to analyse.</param>
/// <returns>A <c>ResultInfo</c> for <c>queryNumber</c> with one entry per extracted row.</returns>
public ResultInfo getDetail(string backstring)
{
    XmlDocument doc = new XmlDocument();

    // Dispose the reader and writers even when conversion or loading throws.
    using (SgmlReader reader = new SgmlReader())
    using (StringWriter sw = new StringWriter())
    using (XmlTextWriter writer = new XmlTextWriter(sw))
    {
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(backstring);
        reader.WhitespaceHandling = WhitespaceHandling.None;
        writer.Formatting = Formatting.Indented;

        // WriteNode leaves the reader on the node after the one it copied, so
        // Read() is only called here for nodes that are skipped.
        reader.Read();
        while (!reader.EOF)
        {
            if (reader.NodeType == XmlNodeType.Whitespace)
            {
                reader.Read();
            }
            else
            {
                writer.WriteNode(reader, true);
            }
        }

        writer.Flush();
        doc.Load(new StringReader(sw.ToString()));
    }

    XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
    xnm.AddNamespace("bottum", "http://www.w3.org/1999/xhtml");
    XPathNavigator nav = doc.CreateNavigator();

    string xpath = "//bottum:table[@id='GridView1']/bottum:tr/bottum:td";
    XPathNodeIterator nodes = nav.Select(xpath, xnm);

    ResultInfo backinfo = new ResultInfo(queryNumber);
    if (nodes != null)
    {
        // Rows are consumed three cells at a time: the second cell is the time,
        // the third the state.
        int rowCount = nodes.Count / 3;
        for (int i = 0; i < rowCount; i++)
        {
            nodes.MoveNext();
            nodes.MoveNext();
            string time = nodes.Current.Value;
            nodes.MoveNext();
            string state = nodes.Current.Value;
            backinfo.add(time, state);
        }
    }

    return backinfo;
}
/// <summary>
/// Parses the snapshot from its start up to <paramref name="searchEnd"/> and tries to
/// find where the opening tag that matches the closing tag at that position begins.
/// </summary>
/// <param name="snapshot">Text snapshot being searched.</param>
/// <param name="searchEnd">Snapshot position at which parsing stops.</param>
/// <param name="searchFor">Local name of the element whose opening tag is wanted.</param>
/// <returns>
/// A span of <c>searchFor.Length + 2</c> characters starting at the opening tag, or
/// <c>null</c> when no unmatched opening tag is found, an end tag appears with no
/// opening tag recorded, or parsing throws.
/// </returns>
private SnapshotSpan? FindOpeningTag(ITextSnapshot snapshot, int searchEnd, string searchFor)
{
    String textToSearch = snapshot.GetText(0, searchEnd);

    using (SgmlReader reader = new SgmlReader())
    {
        reader.InputStream = new StringReader(textToSearch);
        reader.WhitespaceHandling = WhitespaceHandling.All;

        try
        {
            // Start positions of the still-open elements named searchFor.
            Stack<int> openingPositions = new Stack<int>();

            while (reader.Read())
            {
                if (reader.LocalName != searchFor)
                {
                    continue;
                }

                if (reader.NodeType == XmlNodeType.Element && !reader.IsEmptyElement)
                {
                    // Estimate where the tag starts, then back up to its '<'.
                    var line = snapshot.GetLineFromLineNumber(reader.LineNumber - 1);
                    int position = line.Start.Position + reader.LinePosition - searchFor.Length;
                    openingPositions.Push(BacktrackToLessThan(snapshot, position));
                }
                else if (reader.NodeType == XmlNodeType.EndElement)
                {
                    if (openingPositions.Count <= 0)
                    {
                        // Document is malformed, so give up.
                        return null;
                    }

                    var line = snapshot.GetLineFromLineNumber(reader.LineNumber - 1);
                    int position = line.Start.Position + reader.LinePosition;
                    if (position >= searchEnd)
                    {
                        break;
                    }
                    openingPositions.Pop();
                }
            }

            // Whatever is still open on top of the stack matches our closing tag.
            if (openingPositions.Count > 0)
            {
                int position = openingPositions.Pop();
                return new SnapshotSpan(snapshot, position, searchFor.Length + 2);
            }
        }
        catch (Exception ex)
        {
            Trace.WriteLine(String.Format("Exception while parsing document: {0}.", ex.ToString()));
        }
    }

    return null;
}
/// <summary>
/// Converts the returned page to XML with SgmlReader and collects (time, state)
/// pairs from the cells matched by <c>//table[1]/tr/td</c>.
/// </summary>
/// <param name="backstring">HTML text of the page to analyse.</param>
/// <returns>A <c>ResultInfo</c> for <c>querynum</c> with one entry per extracted row.</returns>
private ResultInfo getDetail(string backstring)
{
    XmlDocument doc = new XmlDocument();

    // Dispose the reader and writers even when conversion or loading throws.
    using (SgmlReader reader = new SgmlReader())
    using (StringWriter sw = new StringWriter())
    using (XmlTextWriter writer = new XmlTextWriter(sw))
    {
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(backstring);
        reader.WhitespaceHandling = WhitespaceHandling.None;
        writer.Formatting = Formatting.Indented;

        // WriteNode leaves the reader on the node after the one it copied, so
        // Read() is only called here for nodes that are skipped.
        reader.Read();
        while (!reader.EOF)
        {
            if (reader.NodeType == XmlNodeType.Whitespace)
            {
                reader.Read();
            }
            else
            {
                writer.WriteNode(reader, true);
            }
        }

        writer.Flush();
        doc.Load(new StringReader(sw.ToString()));
    }

    XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
    XPathNavigator nav = doc.CreateNavigator();

    // The expression is tied to the layout of the returned page.
    string xpath = "//table[1]/tr/td";
    XPathNodeIterator nodes = nav.Select(xpath, xnm);

    ResultInfo backinfo = new ResultInfo(querynum);
    int cellCount = nodes.Count;

    // Skip the first four cells (presumably a header row — TODO confirm).
    if (cellCount >= 4)
    {
        nodes.MoveNext();
        nodes.MoveNext();
        nodes.MoveNext();
        nodes.MoveNext();
    }

    // NOTE(review): the bound (cellCount / 2, starting at 4) and the three cells consumed
    // per iteration are kept exactly as they were; verify them against the page layout.
    for (int i = 4; i < cellCount / 2; i++)
    {
        nodes.MoveNext();
        string time = nodes.Current.Value;
        nodes.MoveNext();
        string state = nodes.Current.Value;
        backinfo.add(time, state);
        nodes.MoveNext();
    }

    return backinfo;
}
/// <summary>
/// Scans HTML for <c>img</c> elements and returns the source, width and height of
/// every image that has a non-empty <c>src</c> attribute.
/// </summary>
/// <param name="htmlCode">The HTML code to scan.</param>
/// <returns>One <c>ImageInfo</c> per qualifying image, in document order.</returns>
internal static ImageInfo[] FindImgs(string htmlCode)
{
    var images = new List<ImageInfo>();
    var sgml = new SgmlReader
    {
        DocType = @"HTML",
        InputStream = new StringReader(htmlCode)
    };

    while (sgml.Read())
    {
        bool isImage = sgml.NodeType == XmlNodeType.Element
            && String.Compare(sgml.Name, @"img", StringComparison.OrdinalIgnoreCase) == 0;
        if (!isImage || !sgml.HasAttributes)
        {
            continue;
        }

        var info = new ImageInfo();
        while (sgml.MoveToNextAttribute())
        {
            string attributeName = sgml.Name.ToLowerInvariant();
            if (attributeName == @"src")
            {
                info.Source = sgml.Value;
            }
            else if (attributeName == @"width")
            {
                info.Width = ConvertHelper.ToInt32(sgml.Value);
            }
            else if (attributeName == @"height")
            {
                info.Height = ConvertHelper.ToInt32(sgml.Value);
            }
        }

        // Images without a source are of no use to the caller.
        if (!String.IsNullOrEmpty(info.Source))
        {
            images.Add(info);
        }
    }

    return images.ToArray();
}
/// <summary>
/// Converts the returned page to XML with SgmlReader and collects (time, state)
/// pairs from the cells matched by <c>/html/body/table[8]/tr/td</c>.
/// </summary>
/// <param name="backstring">HTML text of the page to analyse.</param>
/// <returns>A <c>ResultInfo</c> for <c>querynum</c> with one entry per extracted row.</returns>
public ResultInfo getDetail(string backstring)
{
    // Drop the XHTML namespace declaration so the XPath below needs no prefixes.
    backstring = backstring.Replace("xmlns=\"http://www.w3.org/1999/xhtml\"", "");

    XmlDocument doc = new XmlDocument();

    // Dispose the reader and writers even when conversion or loading throws.
    using (SgmlReader reader = new SgmlReader())
    using (StringWriter sw = new StringWriter())
    using (XmlTextWriter writer = new XmlTextWriter(sw))
    {
        reader.DocType = "HTML";
        reader.InputStream = new StringReader(backstring);
        reader.WhitespaceHandling = WhitespaceHandling.None;
        writer.Formatting = Formatting.Indented;

        // WriteNode leaves the reader on the node after the one it copied, so
        // Read() is only called here for nodes that are skipped.
        reader.Read();
        while (!reader.EOF)
        {
            if (reader.NodeType == XmlNodeType.Whitespace)
            {
                reader.Read();
            }
            else
            {
                writer.WriteNode(reader, true);
            }
        }

        writer.Flush();
        doc.Load(new StringReader(sw.ToString()));
    }

    XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
    xnm.AddNamespace("bottum", "http://www.w3.org/1999/xhtml");
    XPathNavigator nav = doc.CreateNavigator();

    string xpath = "/html/body/table[8]/tr/td";
    XPathNodeIterator nodes = nav.Select(xpath, xnm);
    int cellCount = nodes.Count;

    // Skip the first two cells (presumably a header row — TODO confirm).
    if (cellCount >= 2)
    {
        nodes.MoveNext();
        nodes.MoveNext();
    }

    // Each following row is two cells: time, then state.
    ResultInfo backinfo = new ResultInfo(querynum);
    for (int i = 1; i < cellCount / 2; i++)
    {
        nodes.MoveNext();
        string time = nodes.Current.Value;
        nodes.MoveNext();
        string state = nodes.Current.Value;
        backinfo.add(time, state);
    }

    return backinfo;
}
/// <summary>
/// Copies the reader's nodes to the writer, ending an element right after its text
/// content unless that element is <c>MSGBODY</c>, and suppressing the later end tag
/// for elements that were already ended this way.
/// </summary>
/// <param name="reader">Source of nodes; it is read to the end.</param>
/// <param name="writer">Destination for the rewritten markup.</param>
protected internal static void AutoCloseTags(SgmlReader reader, XmlWriter writer)
{
    // Names are compared by reference below, relying on the reader's name table.
    object messageBodyName = reader.NameTable.Add("MSGBODY");
    object lastStartedElement = null;
    Stack endedEarly = new Stack();

    while (reader.Read())
    {
        XmlNodeType kind = reader.NodeType;

        if (kind == XmlNodeType.Element)
        {
            lastStartedElement = reader.LocalName;
            writer.WriteStartElement(reader.LocalName);
        }
        else if (kind == XmlNodeType.Text)
        {
            if (String.IsNullOrEmpty(reader.Value))
            {
                // NOTE(review): Assert(true, ...) can never fire — confirm whether
                // Assert(false, ...) was intended.
                Debug.Assert(true, "big problems?");
                continue;
            }

            writer.WriteString(reader.Value.Trim());

            bool closeNow = lastStartedElement != null && !lastStartedElement.Equals(messageBodyName);
            if (closeNow)
            {
                writer.WriteEndElement();
                endedEarly.Push(lastStartedElement);
            }
        }
        else if (kind == XmlNodeType.EndElement)
        {
            bool alreadyEnded = endedEarly.Count > 0
                && Object.ReferenceEquals(endedEarly.Peek(), reader.LocalName);
            if (alreadyEnded)
            {
                endedEarly.Pop();
            }
            else
            {
                writer.WriteEndElement();
            }
        }
        else
        {
            // NOTE(review): WriteNode moves the reader forward itself, so the next
            // Read() may step over a node — confirm this is acceptable for the input.
            writer.WriteNode(reader, false);
        }
    }
}
/// <summary>
/// Gets the title out of the HTML head section.
/// </summary>
/// <param name="url">The URL of the page</param>
/// <param name="defaultIfNoMatch">string to return, if no match was found</param>
/// <param name="proxy">Proxy server to direct the request through</param>
/// <param name="credentials">Credentials for authenticating the request</param>
/// <returns>
/// The content of the first <c>title</c> element, or <paramref name="defaultIfNoMatch"/>
/// when none is found or the page cannot be retrieved or parsed.
/// </returns>
//dup to FindTitle2() - which one we should use?
public static string FindTitle(string url, string defaultIfNoMatch, IWebProxy proxy, ICredentials credentials)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.AllowAutoRedirect = true;
    request.Proxy = proxy;
    request.Credentials = credentials;
    request.Timeout = 5 * 1000 /* 5 second timeout */;

    if (FeedSource.SetCookies)
    {
        HttpCookieManager.SetCookies(request);
    }

    /* use bogus user agent since some sites will bounce you to unsupported browser page otherwise */
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;)";

    string title = defaultIfNoMatch;

    try
    {
        // Dispose the response as well as the stream so the connection is released.
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (SgmlReader reader = new SgmlReader())
        {
            reader.InputStream = new StreamReader(stream);

            while (reader.Read())
            {
                // Ordinal comparison: ToLower() is culture-sensitive and can
                // fail to match "TITLE" under some cultures (e.g. Turkish).
                if (reader.NodeType == XmlNodeType.Element &&
                    string.Equals(reader.Name, "title", StringComparison.OrdinalIgnoreCase))
                {
                    title = reader.ReadElementContentAsString();
                    break;
                }
            }
        }
    }
    catch (Exception e)
    {
        _log.Debug("Error retrieving title from HTML page at " + url, e);
    }

    return title;
}
/// <summary>
/// Runs one conversion test: applies the option switches in <paramref name="args"/>,
/// converts <paramref name="input"/> to XML and compares the trimmed result with the
/// trimmed expected output, updating the test and pass counters.
/// </summary>
/// <param name="reader">Reader to configure and use for the conversion.</param>
/// <param name="line">Line number reported when the test fails.</param>
/// <param name="args">Space-separated switches: -html, -lower, -upper, -testdoc.</param>
/// <param name="input">Markup to convert.</param>
/// <param name="expectedOutput">XML the conversion is expected to produce.</param>
void RunTest(SgmlReader reader, int line, string args, string input, string expectedOutput)
{
    bool loadIntoDocument = false;

    foreach (string rawArg in args.Split(' '))
    {
        string option = rawArg.Trim();
        if (option.Length == 0 || option[0] != '-')
        {
            continue;
        }

        string name = option.Substring(1);
        if (name == "html")
        {
            reader.DocType = "html";
        }
        else if (name == "lower")
        {
            reader.CaseFolding = CaseFolding.ToLower;
        }
        else if (name == "upper")
        {
            reader.CaseFolding = CaseFolding.ToUpper;
        }
        else if (name == "testdoc")
        {
            loadIntoDocument = true;
        }
    }

    this.tests++;

    reader.InputStream = new StringReader(input);
    reader.WhitespaceHandling = WhitespaceHandling.None;

    StringWriter captured = new StringWriter();
    XmlTextWriter xmlOut = new XmlTextWriter(captured);
    xmlOut.Formatting = Formatting.Indented;

    if (loadIntoDocument)
    {
        // Round-trip through XmlDocument instead of streaming node by node.
        XmlDocument dom = new XmlDocument();
        dom.Load(reader);
        dom.WriteTo(xmlOut);
    }
    else
    {
        for (reader.Read(); !reader.EOF; )
        {
            xmlOut.WriteNode(reader, true);
        }
    }
    xmlOut.Close();

    string actualOutput = captured.ToString();
    if (actualOutput.Trim() == expectedOutput.Trim())
    {
        this.passed++;
        return;
    }

    Console.WriteLine("ERROR: Test failed on line {0}", line);
    Console.WriteLine("---- Expected output");
    Console.WriteLine(expectedOutput);
    Console.WriteLine("---- Actual output");
    Console.WriteLine(actualOutput);
}
/// <summary>
/// Lazily yields the value of <paramref name="attributeName"/> for every element
/// named <paramref name="tagName"/> in the HTML fragment.
/// </summary>
/// <param name="html">HTML fragment; it is wrapped in an <c>html</c> element before parsing.</param>
/// <param name="tagName">Local name of the elements to inspect.</param>
/// <param name="attributeName">Name of the attribute to read.</param>
/// <returns>One value per matching element, as returned by the reader's GetAttribute.</returns>
public static IEnumerable<string> GetAttributeValues(this string html, string tagName, string attributeName)
{
    var sgml = new SgmlReader
    {
        DocType = "html",
        WhitespaceHandling = WhitespaceHandling.All,
        InputStream = new StringReader(string.Format("<html>{0}</html>", html))
    };

    while (true)
    {
        if (!sgml.Read() || sgml.EOF)
        {
            yield break;
        }

        bool isWantedElement = sgml.NodeType == XmlNodeType.Element && sgml.LocalName == tagName;
        if (isWantedElement)
        {
            yield return sgml.GetAttribute(attributeName);
        }
    }
}
/// <summary>
/// Checks that MoveToElement still works after all attributes of the first
/// element have been visited with MoveToNextAttribute.
/// </summary>
public void Test_MoveToNextAttribute()
{
    var markup = new StringReader("<test id='10' x='20'><a/><!--comment-->test</test>");
    var sgml = new SgmlReader { InputStream = markup };

    Assert.IsTrue(sgml.Read());

    // Walk every attribute of the root element, logging its name.
    while (sgml.MoveToNextAttribute())
    {
        _log.Debug(sgml.Name);
    }

    if (!sgml.MoveToElement())
    {
        return;
    }

    _log.Debug(sgml.ReadInnerXml());
}
/// <summary>
/// Regression check: MoveToElement must still work after all attributes of the
/// first element have been visited. Attribute names and the inner XML are traced.
/// </summary>
void RegressionTest1()
{
    SgmlReader sgml = new SgmlReader();
    sgml.InputStream = new StringReader("<test id='10' x='20'><a/><!--comment-->test</test>");

    if (!sgml.Read())
    {
        return;
    }

    while (sgml.MoveToNextAttribute())
    {
        Trace.WriteLine(sgml.Name);
    }

    if (!sgml.MoveToElement())
    {
        return;
    }

    Trace.WriteLine(sgml.ReadInnerXml());
}
/// <summary>
/// Converts the returned page to XML with SgmlReader and collects (time, state)
/// pairs from the <c>font_c</c> rows of the second table inside the tracking-result div.
/// </summary>
/// <param name="backstring">HTML text of the page to analyse.</param>
/// <returns>A <c>ResultInfo</c> for <c>queryNumber</c> with one entry per extracted row.</returns>
private ResultInfo getDetail(string backstring)
{
    XmlDocument doc = new XmlDocument();

    // Analyse the page with the SGML library by converting it to XML first.
    // Dispose the reader and writers even when conversion or loading throws.
    using (SgmlReader readern = new SgmlReader())
    using (StringWriter sw = new StringWriter())
    using (XmlTextWriter writer = new XmlTextWriter(sw))
    {
        readern.DocType = "HTML";
        readern.InputStream = new StringReader(backstring);
        readern.WhitespaceHandling = WhitespaceHandling.None;
        writer.Formatting = Formatting.Indented;

        // WriteNode leaves the reader on the node after the one it copied, so
        // Read() is only called here for nodes that are skipped. Calling Read()
        // before every WriteNode would skip nodes and call WriteNode at end of input.
        readern.Read();
        while (!readern.EOF)
        {
            if (readern.NodeType == XmlNodeType.Whitespace)
            {
                readern.Read();
            }
            else
            {
                writer.WriteNode(readern, true);
            }
        }

        writer.Flush();
        doc.Load(new StringReader(sw.ToString()));
    }

    XmlNamespaceManager xnm = new XmlNamespaceManager(doc.NameTable);
    XPathNavigator nav = doc.CreateNavigator();

    string xpath = "//div[@id='ess_ctr1579_TrackResult_DivBill']/table[2]/tr[@class='font_c']/td";
    XPathNodeIterator nodes = nav.Select(xpath, xnm);

    // Each row is two cells: time, then state.
    ResultInfo backinfo = new ResultInfo(queryNumber);
    int rowCount = nodes.Count / 2;
    for (int i = 0; i < rowCount; i++)
    {
        nodes.MoveNext();
        string time = nodes.Current.Value;
        nodes.MoveNext();
        string state = nodes.Current.Value;
        backinfo.add(time, state);
    }

    return backinfo;
}
/// <summary>
/// Converts an HTML file to an indented, UTF-8 encoded XHTML file.
/// </summary>
/// <param name="htmlFile">Path of the HTML file to read.</param>
/// <param name="xhtmlFile">Path of the XHTML file to write.</param>
internal static void Convert(String htmlFile, String xhtmlFile)
{
    using (SgmlReader sgml = new SgmlReader())
    {
        sgml.DocType = "HTML";
        sgml.WhitespaceHandling = WhitespaceHandling.None;

        using (StreamReader source = new StreamReader(htmlFile))
        {
            sgml.InputStream = source;

            using (XmlTextWriter target = new XmlTextWriter(xhtmlFile, Encoding.UTF8))
            {
                target.Formatting = Formatting.Indented;

                // Position on the first node, then copy until the input is exhausted.
                for (sgml.Read(); !sgml.EOF; )
                {
                    target.WriteNode(sgml, true);
                }
            }
        }
    }
}
/// <summary>
/// Converts HTML to XHTML text with SgmlReader and passes the result through
/// <c>RemoveCopyOfImage</c>.
/// </summary>
/// <param name="inputHtml">HTML text to convert.</param>
/// <returns>The value returned by <c>RemoveCopyOfImage</c> for the converted markup.</returns>
public static string TransformHtmlToXHTML(string inputHtml)
{
    var sgml = new SgmlReader
    {
        DocType = "HTML",
    };
    sgml.InputStream = new StringReader(inputHtml);

    var xhtml = new StringWriter();
    using (var xmlOut = new XmlTextWriter(xhtml))
    {
        // Position on the first node, then copy until the input is exhausted.
        for (sgml.Read(); !sgml.EOF; )
        {
            xmlOut.WriteNode(sgml, true);
        }
    }

    string converted = xhtml.ToString();
    return RemoveCopyOfImage(converted);
}
/// <summary>
/// Downloads the document at <paramref name="url"/>, converts it to XML with
/// SgmlReader and saves it to <paramref name="fileToSave"/>. Does nothing when the
/// target file already exists.
/// </summary>
/// <param name="url">Address of the document to fetch.</param>
/// <param name="fileToSave">Path of the XML file to create.</param>
static void SaveAsXml(string url, string fileToSave)
{
    if (File.Exists(fileToSave))
    {
        return;
    }

    // Dispose the stream, reader and writer even when the download or conversion fails.
    using (var stream = (Stream)new XmlUrlResolver().GetEntity(new Uri(url), null, typeof(Stream)))
    using (var xr = new SgmlReader() { InputStream = new StreamReader(stream) })
    using (var xw = XmlWriter.Create(fileToSave))
    {
        xr.MoveToContent();

        // WriteNode leaves the reader on the node after the one it copied; calling
        // Read() between writes would skip that node, so loop on EOF instead.
        while (!xr.EOF)
        {
            xw.WriteNode(xr, false);
            xw.Flush();
        }
    }
}
/// <summary>
/// Reads SGML/HTML from <paramref name="uri"/> (or standard input when it is null)
/// and writes it as XML to the configured output, or to the console when no output is set.
/// </summary>
/// <param name="reader">Reader to configure and read from.</param>
/// <param name="uri">Location of the input, or null to read from <c>Console.In</c>.</param>
private void Process(SgmlReader reader, string uri)
{
    if (uri != null)
    {
        reader.Href = uri;
    }
    else
    {
        reader.InputStream = Console.In;
    }

    encoding ??= reader.GetEncoding();

    // Swap in a UTF-8 encoding that emits no byte-order mark when requested.
    bool suppressBom = noUtf8Bom && encoding.Equals(Encoding.UTF8);
    if (suppressBom)
    {
        encoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
    }

    using (XmlTextWriter xmlOut = output != null
        ? new XmlTextWriter(output, encoding)
        : new XmlTextWriter(Console.Out))
    {
        if (formatted)
        {
            xmlOut.Formatting = Formatting.Indented;
        }

        if (!noxmldecl)
        {
            xmlOut.WriteStartDocument();
        }

        // Position on the first node, then copy until the input is exhausted.
        for (reader.Read(); !reader.EOF; )
        {
            xmlOut.WriteNode(reader, true);
        }

        xmlOut.Flush();
    }
}
/// <summary>
/// Verifies that, with fragment conformance, an input holding an <c>html</c> element
/// followed by a <c>script</c> element is read as two separate documents.
/// </summary>
public void Test_fragment_parsing()
{
    var settings = new XmlReaderSettings
    {
        ConformanceLevel = ConformanceLevel.Fragment
    };
    var input = new StringReader("<html><head></head><body></body></html> <script></script>");

    SgmlReader sgml = new SgmlReader(settings);
    sgml.DocType = "html";
    sgml.InputStream = input;

    int documentCount = 0;
    while (sgml.Read())
    {
        if (sgml.NodeType != XmlNodeType.Element)
        {
            continue;
        }

        // Load each top-level element as its own document.
        XDocument fragment = XDocument.Load(sgml.ReadSubtree());
        Debug.WriteLine(fragment.ToString());
        documentCount++;
    }

    Assert.AreEqual(2, documentCount, "Expecing 2 XmlDocuments in the input stream");
}
/// <summary>
/// Finds the distinct <c>src</c> values of all <c>img</c> elements in the HTML.
/// </summary>
/// <param name="htmlCode">The HTML code.</param>
/// <returns>The distinct source values, in order of first appearance.</returns>
internal static string[] FindImgs(string htmlCode)
{
    var r = new SgmlReader
    {
        DocType = @"HTML",
        InputStream = new StringReader(htmlCode)
    };

    var sources = new List<string>();
    var seen = new HashSet<string>(); // constant-time duplicate check

    while (r.Read())
    {
        // Ordinal, case-insensitive comparison: tag and attribute names are
        // identifiers, and culture-sensitive casing can mis-compare them
        // (e.g. "IMG" vs "img" under Turkish culture).
        bool isImage = r.NodeType == XmlNodeType.Element
            && string.Equals(r.Name, @"img", StringComparison.OrdinalIgnoreCase);
        if (!isImage || !r.HasAttributes)
        {
            continue;
        }

        while (r.MoveToNextAttribute())
        {
            if (string.Equals(r.Name, @"src", StringComparison.OrdinalIgnoreCase) && seen.Add(r.Value))
            {
                sources.Add(r.Value);
            }
        }
    }

    return sources.ToArray();
}
/// <summary>
/// Walks the HTML of a comment body with SgmlReader and emits a Markdown-like
/// rendering: text nodes become lines, a handful of known elements are mapped to
/// Markdown markers, and unknown elements are echoed as <c>&lt;name</c> ... <c>&gt;</c>.
/// </summary>
/// <param name="body">HTML text of the comment.</param>
/// <returns>The text accumulated while walking the nodes.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// The reader produced a node type other than text, element, end element,
/// whitespace or CDATA.
/// </exception>
private static string ConvertCommentToMarkdown(string body)
{
    var markdown = new StringBuilder();
    var sgml = new SgmlReader
    {
        InputStream = new StringReader(body),
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.Significant,
        CaseFolding = CaseFolding.ToLower
    };

    bool closeUnknownElement = false; // set when an unknown element's "<name" was emitted
    int indent = 0;                   // 1 while inside pre/code/quote

    while (sgml.Read())
    {
        XmlNodeType kind = sgml.NodeType;

        if (kind == XmlNodeType.Text)
        {
            if (indent > 0)
            {
                markdown.Append("\t");
            }
            markdown.AppendLine(sgml.Value);
        }
        else if (kind == XmlNodeType.Element)
        {
            // NOTE(review): several cases below format sgml.Value while positioned on
            // the element itself — confirm that yields the intended text.
            switch (sgml.LocalName)
            {
                case "html":
                case "ul":
                case "ol":
                case "img":
                    // Structural elements produce no output of their own.
                    break;
                case "h1":
                    markdown.Append("## ");
                    break;
                case "br":
                    markdown.AppendLine(" ");
                    break;
                case "a":
                    if (sgml.MoveToAttribute("href"))
                    {
                        string url = sgml.Value;
                        sgml.Read(); // step to the node after the start tag for the link text
                        markdown.AppendFormat("[{0}]({1})", sgml.Value, url);
                    }
                    break;
                case "strong":
                case "b":
                    markdown.AppendFormat("**{0}**", sgml.Value);
                    break;
                case "i":
                case "em":
                    markdown.AppendFormat("_{0}_", sgml.Value);
                    break;
                case "li":
                    markdown.AppendFormat("- {0}", sgml.Value);
                    break;
                case "pre":
                case "code":
                case "quote":
                    indent = 1;
                    break;
                default:
                    closeUnknownElement = true;
                    markdown.Append("<").Append(sgml.LocalName);
                    break;
            }
        }
        else if (kind == XmlNodeType.EndElement)
        {
            indent = 0;
            if (closeUnknownElement)
            {
                markdown.Append(">");
            }
            closeUnknownElement = false;
        }
        else if (kind == XmlNodeType.SignificantWhitespace
            || kind == XmlNodeType.Whitespace
            || kind == XmlNodeType.CDATA)
        {
            // Ignored.
        }
        else
        {
            // NOTE(review): any other node type (e.g. a comment) aborts the conversion.
            throw new ArgumentOutOfRangeException();
        }
    }

    return markdown.ToString();
}