/// <summary>
/// Constructor whose body only declares two local helper functions
/// (<c>FromHtml</c> and <c>LoadWebText</c>); it executes no statements itself.
/// </summary>
public Webtext()
{
    // Parses HTML read from <reader> into an XmlDocument via SgmlReader.
    XmlDocument FromHtml(TextReader reader)
    {
        // Set up the SgmlReader.
        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.None;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = reader;

        // Create the document.
        XmlDocument doc = new XmlDocument();
        doc.PreserveWhitespace = true;
        doc.XmlResolver = null;
        doc.Load(sgmlReader);
        return doc;
    }

    // Downloads URL, decodes the bytes as UTF-8 and returns the text content
    // of the parsed page.
    string LoadWebText(string URL)
    {
        // WebClient is IDisposable; release it as soon as the download is done.
        using (WebClient objWebClient = new WebClient())
        {
            UTF8Encoding objUTF8 = new UTF8Encoding();
            string html = objUTF8.GetString(objWebClient.DownloadData(URL));
            using (StringReader reader = new StringReader(html))
            {
                // InnerText is a property, not a method.
                return FromHtml(reader).InnerText;
            }
        }
    }
}
/// <summary>
/// Parses HTML from <paramref name="reader"/> into an <c>XDoc</c>.
/// </summary>
/// <param name="reader">Source of the HTML text.</param>
/// <returns>
/// The parsed document, or <c>XDoc.Empty</c> when the result has no root
/// element or any exception is thrown while loading.
/// </returns>
private static XDoc FromHtml(TextReader reader)
{
    Sgml.SgmlReader html = new Sgml.SgmlReader(XDoc.XmlNameTable);
    html.Dtd = _dtd;
    html.DocType = "HTML";
    html.WhitespaceHandling = WhitespaceHandling.All;
    html.CaseFolding = Sgml.CaseFolding.ToLower;
    html.InputStream = reader;

    try
    {
        XmlDocument parsed = XDoc.NewXmlDocument();
        parsed.Load(html);
        if (parsed.DocumentElement != null)
        {
            // Remember the DTD the reader ended up with so later calls can reuse it.
            if (_dtd == null)
            {
                _dtd = html.Dtd;
            }
            return new XDoc(parsed);
        }
    }
    catch (Exception)
    {
        // Parsing is best effort: fall through to the empty document.
    }
    return XDoc.Empty;
}
/// <summary>
/// Downloads the Wall Street Journal RSS feed, converts it to XML text with
/// SgmlReader, echoes that XML to the console and passes every element of the
/// resulting document to <c>ParseItem</c>.
/// </summary>
public void WallStreeJournalRssTest()
{
    string feed;
    // WebClient is IDisposable; release it once the download has completed.
    using (WebClient client = new WebClient())
    {
        feed = client.DownloadString("http://online.wsj.com/xml/rss/3_7011.xml");
    }

    using (StringReader reader = new StringReader(feed))
    {
        // Set up the SgmlReader and render the feed as XML text.
        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = reader;
        string rawrssfeed = sgmlReader.ReadOuterXml();
        Console.WriteLine(rawrssfeed);

        // rawrssfeed holds XML content. XDocument.Load(string) interprets its
        // argument as a URI/file name, so the text must go through Parse.
        XDocument rss = XDocument.Parse(rawrssfeed);

        // Every element is handled the same way.
        foreach (XElement p in rss.Descendants())
        {
            ParseItem(p);
        }
    }
}
/// <summary>
/// Returns the <c>src</c> value of the first <c>img</c> element in
/// <paramref name="html"/> that has a <c>src</c> attribute.
/// </summary>
/// <param name="html">HTML markup to scan.</param>
/// <returns>The attribute value, or <c>null</c> when no such image exists.</returns>
static string GetImageFromHtml(string html)
{
    // Parse the markup into a DOM through SgmlReader.
    XmlDocument parsed = new XmlDocument();
    using (Sgml.SgmlReader htmlReader = new Sgml.SgmlReader())
    {
        htmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        htmlReader.DocType = "HTML";
        htmlReader.WhitespaceHandling = WhitespaceHandling.None;
        using (StringReader markup = new StringReader(html))
        {
            htmlReader.InputStream = markup;
            parsed.Load(htmlReader);
        }
    }

    // Walk the images in document order and stop at the first one with a src.
    XmlNodeList candidates = parsed.GetElementsByTagName("img");
    for (int index = 0; index < candidates.Count; index++)
    {
        XmlAttribute source = candidates[index].Attributes["src"];
        if (source != null)
        {
            return source.Value;
        }
    }
    return null;
}
/// <summary>
/// Creates the parser and its SgmlReader, configured for the HTML doc type,
/// full whitespace reporting and lower-case name folding.
/// </summary>
public Parser()
{
    _sgmlReader = new Sgml.SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = System.Xml.WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower
    };
}
/// <summary>
/// Parses <paramref name="html"/>, removes every <c>script</c> element, runs
/// <c>RemoveAttributeScript</c> on the document element and returns the
/// resulting markup.
/// </summary>
/// <param name="html">HTML markup to clean.</param>
/// <returns>The outer XML of the cleaned document.</returns>
public static string RemoveScriptFromHtml(string html)
{
    // Load the document using the SGML reader.
    var document = new XmlDocument();
    using (var sgmlReader = new Sgml.SgmlReader())
    {
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.None;
        using (var sr = new StringReader(html))
        {
            sgmlReader.InputStream = sr;
            document.Load(sgmlReader);
        }
    }

    // Remove <script> elements. GetElementsByTagName returns a live list that
    // shrinks as nodes are removed, so indexing forward while removing skips
    // every other element. Snapshot the nodes first, then remove them.
    var nodes = document.GetElementsByTagName("script");
    var scripts = new XmlNode[nodes.Count];
    for (int i = 0; i < scripts.Length; i++)
    {
        scripts[i] = nodes[i];
    }
    foreach (XmlNode script in scripts)
    {
        // A script nested inside an already removed script has a parent but
        // is no longer part of the document; removing it again is harmless.
        if (script.ParentNode != null)
        {
            script.ParentNode.RemoveChild(script);
        }
    }

    RemoveAttributeScript(document.DocumentElement);
    return document.OuterXml;
}
/// <summary>
/// Handles a finished URI: parses the downloaded content as HTML, extracts its
/// text and prints, for each entry in <c>Keywords</c>, how many times it occurs.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data; <c>e.Content</c> is parsed as HTML.</param>
private static void bot_UriProcessingFinished(object sender, UriProcessingFinishedEventArgs e)
{
    XmlDocument doc = new XmlDocument();
    doc.PreserveWhitespace = true;
    doc.XmlResolver = null;

    using (StringReader content = new StringReader(e.Content))
    {
        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = content;
        doc.Load(sgmlReader);
    }

    // An empty page yields no document element; treat it as empty text
    // instead of failing with a NullReferenceException.
    string textOnly = doc.DocumentElement != null ? doc.DocumentElement.InnerText : String.Empty;

    // Restore the caller's console colour when done, even on failure.
    ConsoleColor previousColor = Console.ForegroundColor;
    try
    {
        foreach (string keyword in Keywords)
        {
            // Spaces in the keyword match any run of whitespace (including none).
            // NOTE(review): the keyword is inserted into the pattern unescaped, so
            // regex metacharacters in a keyword are interpreted as regex syntax.
            MatchCollection matches = Regex.Matches(textOnly, "(?'found'" + keyword.Replace(" ", "[\\s]*") + ")", RegexOptions.IgnoreCase);

            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.Write("Found ");
            Console.ForegroundColor = ConsoleColor.Cyan;
            Console.Write(keyword);
            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.Write(" in ");
            Console.ForegroundColor = ConsoleColor.Cyan;
            Console.Write(matches.Count);
            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine(" different places.");
        }
    }
    finally
    {
        Console.ForegroundColor = previousColor;
    }
}
/* ----------------------------------------------------------------- */
///
/// ToXDocument
///
/// <summary>
/// Creates an XDocument object from the given stream, reading it as
/// UTF-8 text and parsing it as HTML through SgmlReader.
/// </summary>
///
/* ----------------------------------------------------------------- */
private static XDocument ToXDocument(System.IO.Stream src)
{
    using (var text = new System.IO.StreamReader(src, System.Text.Encoding.UTF8))
    {
        using (var html = new Sgml.SgmlReader())
        {
            html.CaseFolding = Sgml.CaseFolding.ToLower;
            html.DocType = "HTML";
            html.IgnoreDtd = true;
            html.InputStream = text;

            XDocument parsed = XDocument.Load(html);
            return parsed;
        }
    }
}
/// <summary>
/// Fetches text through <c>WebText</c>, wraps it in an html/body envelope,
/// parses it with SgmlReader and returns the cleansed document.
/// </summary>
/// <returns>
/// <c>null</c> when <c>WebText</c> returns <c>null</c>; otherwise the result of
/// <c>CleanseHtmlDocument</c> applied to the parsed document, or to
/// <c>XDoc.Empty</c> when parsing fails or yields no root element.
/// </returns>
public static XDoc WebHtml(
    [DekiScriptParam("HTML source text or source uri (default: none)", true)] string source,
    [DekiScriptParam("xpath to value (default: none)", true)] string xpath,
    [DekiScriptParam("namespaces (default: none)", true)] Hashtable namespaces,
    [DekiScriptParam("caching duration in seconds (range: 60 - 86400; default: 300)", true)] double? ttl,
    [DekiScriptParam("return nil if source could not be loaded (default: text with error message)", true)] bool? nilIfMissing
)
{
    string fragment = WebText(source, xpath, namespaces, true, ttl, nilIfMissing);
    if (fragment == null)
    {
        return null;
    }

    // Convert the text to HTML without a converter.
    XDoc html = XDoc.Empty;
    using (TextReader wrapped = new StringReader("<html><body>" + fragment + "</body></html>"))
    {
        // NOTE (steveb): the sgml reader is created explicitly because no full HTML DTD
        // should be associated with it; such a DTD would force a potentially unwanted
        // HTML structure.

        // Load the HTML entities DTD on first use.
        if (_htmlEntitiesDtd == null)
        {
            using (StreamReader dtdSource = new StreamReader(Plug.New("resource://mindtouch.deki.script/MindTouch.Deki.Script.Resources.HtmlEntities.dtd").Get().AsStream()))
            {
                _htmlEntitiesDtd = Sgml.SgmlDtd.Parse(null, "HTML", dtdSource, null, null, XDoc.XmlNameTable);
            }
        }

        Sgml.SgmlReader parser = new Sgml.SgmlReader(XDoc.XmlNameTable)
        {
            Dtd = _htmlEntitiesDtd,
            DocType = "HTML",
            WhitespaceHandling = WhitespaceHandling.All,
            CaseFolding = Sgml.CaseFolding.ToLower,
            InputStream = wrapped
        };

        try
        {
            XmlDocument dom = new XmlDocument(XDoc.XmlNameTable);
            dom.PreserveWhitespace = true;
            dom.XmlResolver = null;
            dom.Load(parser);

            // Only keep the result when a root element was produced.
            if (dom.DocumentElement != null)
            {
                html = new XDoc(dom);
            }
        }
        catch
        {
            // Parsing exceptions are swallowed on purpose; the empty document is used.
        }
    }
    return CleanseHtmlDocument(html);
}
/// <summary>
/// Parses HTML from <paramref name="reader"/> into an <see cref="XDocument"/>.
/// </summary>
/// <param name="reader">Source of the HTML text.</param>
/// <returns>The document loaded through SgmlReader.</returns>
public static XDocument FromHtml(TextReader reader)
{
    // Configure the SGML reader: HTML doc type, names folded to lower case.
    var html = new Sgml.SgmlReader
    {
        DocType = "HTML",
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = reader
    };

    // Build the document straight from the reader.
    return XDocument.Load(html);
}
/// <summary>
/// Builds an <see cref="XDocument"/> from HTML supplied by a <see cref="TextReader"/>.
/// </summary>
/// <param name="reader">The reader that provides the HTML.</param>
/// <returns>An XML representation of the HTML.</returns>
private XDocument FromHtml(TextReader reader)
{
    Sgml.SgmlReader html = new Sgml.SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = reader
    };
    return XDocument.Load(html);
}
/// <summary>
/// Loads the content of <paramref name="reader"/> into an <see cref="XmlDocument"/>
/// through SgmlReader. Note that the reader's doc type is set to "XML" here,
/// not "HTML".
/// </summary>
/// <param name="reader">Source text.</param>
/// <returns>The loaded document.</returns>
static XmlDocument FromHtml(TextReader reader)
{
    var source = new Sgml.SgmlReader
    {
        DocType = "XML",
        WhitespaceHandling = WhitespaceHandling.None,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = reader
    };

    var result = new XmlDocument
    {
        PreserveWhitespace = true,
        XmlResolver = null
    };
    result.Load(source);
    return result;
}
/// <summary>
/// Creates a <c>ParserNode</c> for the element the reader is positioned on,
/// copies a fixed set of attributes onto it and hands it to <c>AddNode</c>.
/// </summary>
/// <param name="reader">Reader positioned on the element.</param>
private void HandleElementStart(Sgml.SgmlReader reader)
{
    // Only this fixed list of attributes is copied, in this order. Each value
    // comes from GetAttribute and is passed through even when the attribute
    // is absent.
    string[] copied = { "style", "title", "class", "href", "src", "colspan", "rowspan" };

    ParserNode element = new ParserNode(reader.Name, System.Xml.XmlNodeType.Element);
    foreach (string attributeName in copied)
    {
        element.AddAttribute(attributeName, reader.GetAttribute(attributeName));
    }
    AddNode(element);
}
/// <summary>
/// Downloads the page at <paramref name="webAddress"/> and parses it as HTML
/// into an <see cref="XDocument"/>.
/// </summary>
/// <param name="webAddress">Address of the page to download.</param>
/// <returns>The document loaded through SgmlReader.</returns>
public XDocument FromHtmlToXDoc(string webAddress)
{
    string html;
    // WebClient is IDisposable; release it as soon as the download is done.
    using (WebClient webPage = new WebClient())
    {
        html = webPage.DownloadString(webAddress);
    }

    using (TextReader sr = new StringReader(html))
    {
        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = sr;
        return XDocument.Load(sgmlReader);
    }
}
/// <summary>
/// Creates an SgmlReader over <paramref name="html"/> with the HTML doc type.
/// </summary>
/// <param name="html">HTML markup to read.</param>
/// <param name="baseUrl">
/// Base URI passed to the reader's <c>SetBaseUri</c>; ignored when null or empty.
/// </param>
/// <returns>The configured reader; the caller is responsible for it.</returns>
private static XmlReader getDocReader(
    string html,
    string baseUrl)
{
    var r = new Sgml.SgmlReader();

    // IsNullOrEmpty instead of baseUrl.Length so a null base URL is treated
    // like an empty one rather than raising a NullReferenceException.
    if (!string.IsNullOrEmpty(baseUrl))
    {
        r.SetBaseUri(baseUrl);
    }

    r.DocType = @"HTML";
    r.InputStream = new StringReader(html);
    return r;
}
/// <summary>
/// Converts HTML to XML and returns the indented XML text.
/// </summary>
/// <param name="html">The HTML markup to convert.</param>
/// <returns>
/// The indented XML of the document element's content; an empty string when
/// <paramref name="html"/> is null, empty or whitespace; a fixed error message
/// when parsing produces no document element.
/// </returns>
public static string HTMLConvert(string html)
{
    // Null is treated like empty input instead of failing on html.Trim().
    if (html == null || html.Trim().Length == 0)
    {
        return string.Empty;
    }

    // Strip CDATA section markers ("<![CDATA[" and "]]>", with optional
    // whitespace between their characters) before parsing.
    html = System.Text.RegularExpressions.Regex.Replace(html, @"<!\s{0,}\[\s{0,}CDATA\s{0,}\[\s{0,}|\s{0,}\]\s{0,}\]\s{0,}>", "");

    using (System.IO.StringReader input = new System.IO.StringReader(html))
    using (Sgml.SgmlReader reader = new Sgml.SgmlReader())
    {
        reader.DocType = "HTML";
        reader.WhitespaceHandling = System.Xml.WhitespaceHandling.None;
        reader.InputStream = input;

        System.Xml.XmlDocument doc = new System.Xml.XmlDocument();
        doc.Load(reader);
        if (doc.DocumentElement == null)
        {
            return "Html to XML Error this programe can not Convert";
        }

        using (System.IO.StringWriter stringWriter = new System.IO.StringWriter())
        using (System.Xml.XmlTextWriter writer = new System.Xml.XmlTextWriter(stringWriter))
        {
            writer.Formatting = System.Xml.Formatting.Indented;

            // Writes the children of the root element, not the root itself.
            doc.DocumentElement.WriteContentTo(writer);

            // Make sure everything has reached the StringWriter before reading it.
            writer.Flush();
            return stringWriter.ToString();
        }
    }
}
/// <summary>
/// Parses HTML from <paramref name="reader"/> into an <see cref="XmlDocument"/>.
/// </summary>
/// <param name="reader">Source of the HTML text.</param>
/// <returns>The loaded document, with whitespace preserved and no resolver.</returns>
static XmlDocument FromHtml(TextReader reader)
{
    // SGML reader configured for HTML input.
    var html = new Sgml.SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = reader
    };

    // Target document.
    var result = new XmlDocument
    {
        PreserveWhitespace = true,
        XmlResolver = null
    };
    result.Load(html);
    return result;
}
/// <summary>
/// Parses the HTML markup in <paramref name="html"/> into an <see cref="XmlDocument"/>.
/// </summary>
/// <param name="html">HTML markup.</param>
/// <returns>The loaded document.</returns>
private static XmlDocument getXmlDocFromHtmlCode(string html)
{
    XmlDocument result = new XmlDocument();
    using (TextReader markup = new StringReader(html))
    {
        Sgml.SgmlReader parser = new Sgml.SgmlReader
        {
            DocType = "HTML",
            WhitespaceHandling = WhitespaceHandling.Significant,
            CaseFolding = Sgml.CaseFolding.ToLower,
            InputStream = markup
        };
        result.Load(parser);
    }
    return result;
}
/// <summary>
/// Reads HTML from <paramref name="reader"/> and returns it as an
/// <see cref="XmlDocument"/>.
/// </summary>
/// <param name="reader">Source of the HTML text.</param>
/// <returns>The loaded document.</returns>
private static XmlDocument FromHtml(TextReader reader)
{
    Sgml.SgmlReader parser = new Sgml.SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = reader
    };

    XmlDocument document = new XmlDocument
    {
        PreserveWhitespace = true,
        XmlResolver = null
    };
    document.Load(parser);

    return document;
}
/// <summary>
/// Loads an <see cref="XmlDocument"/> from an HTML string, then calls
/// <c>RemoveDeclaration</c> and <c>SetDocumentType("html")</c> on it.
/// </summary>
/// <param name="html">HTML markup.</param>
/// <returns>The loaded document.</returns>
public static XmlDocument ParseHTMLtext(string html)
{
    XmlDocument document = new XmlDocument();
    document.PreserveWhitespace = false;
    document.XmlResolver = null;

    using (Sgml.SgmlReader source = CreateSgmlReader(html))
    {
        document.Load(source);
    }

    document.RemoveDeclaration();
    document.SetDocumentType("html");
    return document;
}
/// <summary>
/// Parses HTML from <paramref name="textReader"/> into an <see cref="XmlDocument"/>.
/// </summary>
/// <param name="textReader">Source of the HTML text.</param>
/// <returns>The loaded document.</returns>
/// <exception cref="ArgumentNullException"><paramref name="textReader"/> is null.</exception>
public XmlDocument ParseSgml(TextReader textReader)
{
    if (textReader == null)
    {
        throw new ArgumentNullException("textReader");
    }

    var html = new Sgml.SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = textReader
    };

    var result = new XmlDocument
    {
        PreserveWhitespace = true,
        XmlResolver = null
    };
    result.Load(html);
    return result;
}
/// <summary>
/// Reads the HTML file at <paramref name="path"/> and builds a
/// <c>FlowDocument</c> from its <c>head</c> and <c>body</c> elements.
/// </summary>
/// <param name="path">Path of the HTML file.</param>
/// <returns>
/// A flow document containing one section per <c>head</c>/<c>body</c> element,
/// in document order.
/// </returns>
public FlowDocument Import(string path)
{
    // Create the flow document.
    FlowDocument fd = new FlowDocument();
    fd.IsHyphenationEnabled = true;
    fd.IsOptimalParagraphEnabled = true;
    fd.ColumnRuleWidth = 5;
    fd.FontSize = 12;
    fd.FontFamily = new FontFamily("Times New Roman");

    using (StreamReader reader = new StreamReader(path))
    {
        // Set up the SgmlReader and load it into an XDocument.
        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = reader;
        XDocument xd = XDocument.Load(sgmlReader);

        // Walk every element of the page.
        foreach (XElement p in xd.Descendants())
        {
            switch (p.Name.LocalName)
            {
                case "body":
                {
                    Section s = new Section();
                    ParseBody(s.Blocks, p);
                    // Attach the section; without this the parsed content is
                    // discarded and the returned document stays empty.
                    fd.Blocks.Add(s);
                    break;
                }
                case "head":
                {
                    Section s = new Section();
                    ParseHead(s.Blocks, p);
                    fd.Blocks.Add(s);
                    break;
                }
            }
        }
    }
    return fd;
}
/// <summary>
/// Parses <paramref name="html"/> and returns the document's inner text.
/// </summary>
/// <param name="html">HTML markup.</param>
/// <returns>The <c>InnerText</c> of the parsed document.</returns>
static string GetTextFromHtml(string html)
{
    // Parse the markup into a DOM through SgmlReader.
    XmlDocument parsed = new XmlDocument();
    using (Sgml.SgmlReader htmlReader = new Sgml.SgmlReader())
    {
        htmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        htmlReader.DocType = "HTML";
        htmlReader.WhitespaceHandling = WhitespaceHandling.None;
        using (StringReader markup = new StringReader(html))
        {
            htmlReader.InputStream = markup;
            parsed.Load(htmlReader);
        }
    }

    string text = parsed.InnerText;
    return text;
}
/// <summary>
/// Parses vehicle details out of <paramref name="markup"/>.
/// </summary>
/// <param name="markup">HTML markup of the page.</param>
/// <returns>
/// The parsed <c>Vehicle</c>, or <c>null</c> when no <c>div</c> with id
/// <c>pnlVehicleInfo</c> is present.
/// </returns>
/// <exception cref="ApplicationException">
/// The info panel does not contain exactly four <c>pairValue</c> child elements.
/// </exception>
public Vehicle Parse(string markup)
{
    var sgml = new Sgml.SgmlReader();
    sgml.InputStream = new StringReader(markup);
    XDocument page = XDocument.Load(sgml);

    // Locate the single info panel; more than one match is an error.
    var infoPanel = page.Root
        .Descendants("div")
        .Where(div => div.HasId("pnlVehicleInfo"))
        .SingleOrDefault();
    if (infoPanel == null)
    {
        return null;
    }

    // The panel's direct children carrying the value class, in document order:
    // make, model, VIN, registration number.
    var fields = infoPanel.Elements().Where(child => child.HasClass("pairValue")).ToArray();
    if (fields.Length != 4)
    {
        throw new ApplicationException("Cannot parse markup to Vehicle, wrong number of values found");
    }

    return new Vehicle
    {
        Make = fields[0].Value,
        Model = fields[1].Value,
        Vin = fields[2].Value,
        RegistrationNumber = fields[3].Value,
        Inspections = ParseInspections(page)
    };
}
/// <summary>
/// Parses HTML markup into an <see cref="XmlDocument"/> after stripping every
/// <c>xmlns="..."</c> declaration from the text.
/// </summary>
/// <param name="html">HTML markup.</param>
/// <returns>The loaded document.</returns>
private static XmlDocument getXmlDocFromHtmlCode(string html)
{
    // Namespace declarations get in the way of simple XPath queries, so drop them.
    string withoutNamespaces = Regex.Replace(html, "xmlns=\"[^\"]*\"", "");

    XmlDocument result = new XmlDocument();
    using (TextReader markup = new StringReader(withoutNamespaces))
    {
        Sgml.SgmlReader parser = new Sgml.SgmlReader
        {
            DocType = "HTML",
            WhitespaceHandling = WhitespaceHandling.Significant,
            CaseFolding = Sgml.CaseFolding.ToLower,
            InputStream = markup
        };
        result.Load(parser);
    }
    return result;
}
/// <summary>
/// Reads <paramref name="reader"/> to the end, dispatching element starts,
/// element ends and text nodes to their handlers. Other node types are ignored.
/// </summary>
/// <param name="reader">Reader to consume.</param>
private void LoadReader(Sgml.SgmlReader reader)
{
    while (reader.Read())
    {
        System.Xml.XmlNodeType kind = reader.NodeType;
        if (kind == System.Xml.XmlNodeType.Element)
        {
            HandleElementStart(reader);
        }
        else if (kind == System.Xml.XmlNodeType.EndElement)
        {
            HandleElementEnd(reader);
        }
        else if (kind == System.Xml.XmlNodeType.Text)
        {
            HandleText(reader);
        }
    }
}
/// <summary>
/// Pre-processes <paramref name="sgml"/> through the <c>XmlHtmlEntity</c>
/// helpers and returns an SgmlReader over the result.
/// </summary>
/// <param name="sgml">Markup text.</param>
/// <returns>A reader configured for HTML; the caller disposes it.</returns>
private static Sgml.SgmlReader CreateSgmlReader(string sgml)
{
    // Apply each base entity conversion in turn.
    // NOTE(review): the exact rewriting is defined by XmlHtmlEntity — not visible here.
    foreach (var entity in XmlHtmlEntity.HtmlBase)
    {
        sgml = entity.ParseXMLtoHTML(sgml);
    }

    var extendedEntities = XmlHtmlEntity.Html2.Concat(XmlHtmlEntity.Html3, XmlHtmlEntity.Html4);
    sgml = XmlHtmlEntity.ParseToCHAR(sgml, extendedEntities);

    StreamReader input = new StreamReader(new StreamString(sgml.RemoveDOCTYPE()));

    Sgml.SgmlReader result = new Sgml.SgmlReader();
    result.DocType = "HTML";
    result.WhitespaceHandling = WhitespaceHandling.All;
    result.CaseFolding = Sgml.CaseFolding.ToLower;
    result.InputStream = input;

    // Rewind the underlying stream so reading starts at the beginning.
    input.BaseStream.Position = 0;
    return result;
}
/// <summary>
/// Reads the HTML file at <paramref name="path"/> and returns it as an
/// <see cref="XmlDocument"/>.
/// </summary>
/// <param name="path">Path of the HTML file.</param>
/// <returns>The loaded document.</returns>
public XmlDocument ConvertHtmlToXml(string path)
{
    // Dispose the file reader once the document is loaded so the file handle
    // is released deterministically.
    using (StreamReader reader = new StreamReader(path))
    {
        // Set up the SgmlReader.
        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = reader;

        // Create the document.
        XmlDocument doc = new XmlDocument();
        doc.PreserveWhitespace = true;
        doc.XmlResolver = null;
        doc.Load(sgmlReader);
        return doc;
    }
}
/// <summary>
/// Parses <paramref name="content"/> into an <see cref="XmlDocument"/>.
/// Content containing <c>&lt;rss</c> is loaded as XML starting at that tag;
/// anything else is parsed as HTML through SgmlReader.
/// </summary>
/// <param name="content">Downloaded text (RSS or HTML).</param>
/// <returns>The loaded document.</returns>
public static XmlDocument DownloadDocument(string content)
{
    // The previous try/catch only rethrew every exception, so it is dropped;
    // exceptions propagate to the caller exactly as before.
    var doc = new XmlDocument { PreserveWhitespace = true, XmlResolver = null };

    var rssStart = content.IndexOf("<rss", System.StringComparison.Ordinal);
    if (rssStart == -1)
    {
        // Not a feed: treat the content as HTML.
        using (var input = new System.IO.StringReader(content))
        using (var xhtmlConverter = new Sgml.SgmlReader())
        {
            xhtmlConverter.DocType = "HTML";
            xhtmlConverter.WhitespaceHandling = WhitespaceHandling.All;
            xhtmlConverter.CaseFolding = Sgml.CaseFolding.ToLower;
            xhtmlConverter.InputStream = input;
            doc.Load(xhtmlConverter);
        }
    }
    else
    {
        // Skip anything that precedes the <rss> tag.
        doc.LoadXml(content.Substring(rssStart));
    }
    return doc;
}
/// <summary>
/// Parses the data from a server response page.
/// </summary>
/// <param name="reader">Source of the HTML text.</param>
/// <returns>The page loaded into an <see cref="XmlDocument"/>.</returns>
private XmlDocument FromHtml(TextReader reader)
{
    // Configure the SGML reader.
    Sgml.SgmlReader html = new Sgml.SgmlReader();
    html.DocType = "HTML";
    html.WhitespaceHandling = WhitespaceHandling.All;
    html.CaseFolding = Sgml.CaseFolding.ToLower;
    html.InputStream = reader;

    // Build the document.
    XmlDocument result = new XmlDocument();
    result.PreserveWhitespace = true;
    result.XmlResolver = null;
    result.Load(html);
    return result;
}
/// <summary>
/// Fetches the page at <paramref name="Url"/> through <c>FetchHtmlDoc</c> and
/// returns the text of its first <c>p</c> element.
/// </summary>
/// <param name="Url">Address of the product page.</param>
/// <returns>
/// The inner text of the first paragraph, or an empty string when the page
/// contains no <c>p</c> element.
/// </returns>
private String GetProductDescription(String Url)
{
    XmlDocument doc = new XmlDocument();
    doc.PreserveWhitespace = true;
    doc.XmlResolver = null;

    // Release the fetched reader once the document has been loaded.
    using (var html = FetchHtmlDoc(Url))
    {
        Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
        sgmlReader.InputStream = html;
        doc.Load(sgmlReader);
    }

    XmlNodeList Pnodes = doc.GetElementsByTagName("p");

    // The list indexer yields null when there is no such item; guard against
    // pages without paragraphs instead of failing with a NullReferenceException.
    if (Pnodes.Count == 0)
    {
        return String.Empty;
    }
    return Pnodes[0].InnerText;
}
/// <summary>
/// Loads HTML supplied by <paramref name="textReader"/> into an
/// <see cref="XmlDocument"/>.
/// </summary>
/// <param name="textReader">Source of the HTML text.</param>
/// <returns>The loaded document.</returns>
/// <exception cref="ArgumentNullException"><paramref name="textReader"/> is null.</exception>
public XmlDocument ParseSgml(TextReader textReader)
{
    if (textReader == null)
        throw new ArgumentNullException("textReader");

    Sgml.SgmlReader parser = new Sgml.SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = textReader
    };

    XmlDocument document = new XmlDocument
    {
        PreserveWhitespace = true,
        XmlResolver = null
    };
    document.Load(parser);

    return document;
}
/// <summary>
/// Handles a finished URI: parses the downloaded result page, walks its
/// <c>cite</c> entries and records every entry whose host contains
/// <c>LookingForDomain</c>, together with its rank.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data; <c>e.Content</c> is parsed as HTML.</param>
private static void bot_UriProcessingFinished(object sender, UriProcessingFinishedEventArgs e)
{
    Sgml.SgmlReader html = new Sgml.SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = new StringReader(e.Content)
    };

    XmlDocument page = new XmlDocument
    {
        PreserveWhitespace = true,
        XmlResolver = null
    };
    page.Load(html);

    XmlNodeList cites = page.SelectNodes(@"/html/body[@id='gsr']/div[@id='res']/div/ol/li/div/cite");

    int rank = 0;
    foreach (XmlNode cite in cites)
    {
        // Every cite counts towards the rank, whether or not it matches.
        rank++;

        string candidate = "http://" + GoogleUrlResultSize.Replace(cite.InnerText, String.Empty);

        Uri parsed;
        if (!Uri.TryCreate(candidate, UriKind.Absolute, out parsed))
        {
            continue;
        }
        if (parsed.Host.IndexOf(LookingForDomain) < 0)
        {
            continue;
        }

        string result = String.Format("Rank {0} for {1}", rank + CurrentlyProcessingStart, parsed);
        ResultsFound.Add(result);

        Console.ForegroundColor = ConsoleColor.Cyan;
        Console.WriteLine(result);
        Console.ResetColor();
        Console.Beep();
    }
}
/// <summary>
/// Loads a book file from <paramref name="stream"/> and updates the book's
/// annotation and cover from it.
/// </summary>
/// <param name="book">Book to update.</param>
/// <param name="stream">Stream containing the book file.</param>
/// <returns>Always <c>true</c> when loading itself does not throw.</returns>
public bool Read(Book book, Stream stream)
{
    // Buffer the whole input so its encoding can be detected before parsing.
    var mem = new MemoryStream();
    stream.CopyTo(mem);

    // NOTE(review): after CopyTo the buffer is positioned at its end; this
    // relies on Util.DetectXmlEncoding leaving it at the start — confirm.
    var encoding = Util.DetectXmlEncoding(mem);
    logger.Trace($"Book encoding detected, id:{book.Id}, enc:{encoding}");

    using (var reader = new StreamReader(mem, encoding))
    using (var sgmlReader = new Sgml.SgmlReader())
    {
        sgmlReader.InputStream = reader;
        var doc = XDocument.Load(sgmlReader);
        logger.Trace($"Book file loaded, id:{book.Id}");

        // Annotation and cover are independent and best effort: a failure in
        // one must not prevent the other, but it is logged rather than lost.
        try
        {
            UpdateAnnotation(book, doc);
        }
        catch (Exception ex)
        {
            logger.Trace($"Book annotation update failed, id:{book.Id}, error:{ex.Message}");
        }

        try
        {
            UpdateCover(book, doc);
        }
        catch (Exception ex)
        {
            logger.Trace($"Book cover update failed, id:{book.Id}, error:{ex.Message}");
        }
    }
    return true;
}
/// <summary>
/// Fetches text through <c>WebText</c>, wraps it in an html/body envelope,
/// parses it with SgmlReader and returns the cleansed document.
/// </summary>
/// <returns>
/// The result of <c>DekiScriptLibrary.CleanseHtmlDocument</c> applied to the
/// parsed document, or to <c>XDoc.Empty</c> when parsing fails or yields no
/// root element.
/// </returns>
public XDoc WebHtml(
    [DekiExtParam("HTML source text or source uri (default: none)", true)] string source,
    [DekiExtParam("xpath to value (default: none)", true)] string xpath,
    [DekiExtParam("namespaces (default: none)", true)] Hashtable namespaces,
    [DekiExtParam("caching duration in seconds (range: 300+; default: 300)", true)] double? ttl
)
{
    string fragment = WebText(source, xpath, namespaces, true, ttl);

    // Convert the text to HTML.
    XDoc html = XDoc.Empty;
    using (TextReader wrapped = new StringReader("<html><body>" + fragment + "</body></html>"))
    {
        // NOTE (steveb): the sgml reader is created explicitly because no full HTML DTD
        // should be associated with it; such a DTD would force a potentially unwanted
        // HTML structure.

        // Load the HTML entities DTD on first use.
        if (_htmlEntitiesDtd == null)
        {
            using (StreamReader dtdSource = new StreamReader(Plug.New("resource://mindtouch.deki.script/MindTouch.Deki.Script.HtmlEntities.dtd").Get().AsStream()))
            {
                _htmlEntitiesDtd = Sgml.SgmlDtd.Parse(null, "HTML", dtdSource, null, null, XDoc.XmlNameTable);
            }
        }

        Sgml.SgmlReader parser = new Sgml.SgmlReader(XDoc.XmlNameTable)
        {
            Dtd = _htmlEntitiesDtd,
            DocType = "HTML",
            WhitespaceHandling = WhitespaceHandling.All,
            CaseFolding = Sgml.CaseFolding.ToLower,
            InputStream = wrapped
        };

        try
        {
            XmlDocument dom = new XmlDocument(XDoc.XmlNameTable)
            {
                PreserveWhitespace = true,
                XmlResolver = null
            };
            dom.Load(parser);

            // Only keep the result when a root element was produced.
            if (dom.DocumentElement != null)
            {
                html = new XDoc(dom);
            }
        }
        catch (Exception)
        {
            // Parsing exceptions are swallowed on purpose; the empty document is used.
        }
    }
    return DekiScriptLibrary.CleanseHtmlDocument(html);
}
/// <summary>
/// Converts HTML read from <paramref name="reader"/> into an <c>XDoc</c>.
/// </summary>
/// <param name="reader">Source of the HTML text.</param>
/// <returns>
/// The parsed document; <c>XDoc.Empty</c> when no root element was produced or
/// loading threw an exception.
/// </returns>
private static XDoc FromHtml(TextReader reader)
{
    Sgml.SgmlReader parser = new Sgml.SgmlReader(XDoc.XmlNameTable);
    parser.Dtd = _dtd;
    parser.DocType = "HTML";
    parser.WhitespaceHandling = WhitespaceHandling.All;
    parser.CaseFolding = Sgml.CaseFolding.ToLower;
    parser.InputStream = reader;

    XDoc outcome = XDoc.Empty;
    try
    {
        XmlDocument dom = XDoc.NewXmlDocument();
        dom.Load(parser);

        bool hasRoot = dom.DocumentElement != null;
        if (hasRoot)
        {
            // Cache the DTD obtained by the reader for subsequent calls.
            if (_dtd == null)
            {
                _dtd = parser.Dtd;
            }
            outcome = new XDoc(dom);
        }
    }
    catch (Exception)
    {
        // Any failure results in the empty document.
        outcome = XDoc.Empty;
    }
    return outcome;
}
/// <summary>
/// Produces an <see cref="XDocument"/> from HTML read through a <see cref="TextReader"/>.
/// </summary>
/// <param name="reader">The reader that supplies the HTML.</param>
/// <returns>An XML representation of the HTML.</returns>
private XDocument FromHtml(TextReader reader)
{
    var parser = new Sgml.SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = reader
    };

    XDocument document = XDocument.Load(parser);
    return document;
}
/// <summary>
/// Converts the element that is the parent of the document body into an
/// <see cref="XmlDocument"/>, parsing its outer HTML with SgmlReader and the
/// <c>WeakHtml</c> DTD resource. The reader's error log is written to the console.
/// </summary>
/// <param name="htmlDoc">Document to convert.</param>
/// <returns>
/// The converted document; an empty <see cref="XmlDocument"/> when
/// <paramref name="htmlDoc"/>, its body or the body's parent is null.
/// </returns>
private static XmlDocument DOMTreeToXml(HtmlDocument htmlDoc)
{
    XmlDocument converted = new XmlDocument();

    // Nothing to convert without a body that has a parent element.
    if (htmlDoc == null || htmlDoc.Body == null || htmlDoc.Body.Parent == null)
    {
        return converted;
    }

    HtmlElement root = htmlDoc.Body.Parent;
    using (StringReader markup = new StringReader(root.OuterHtml))
    using (StringWriter parseErrors = new StringWriter())
    {
        Sgml.SgmlReader parser = new Sgml.SgmlReader();
        parser.ErrorLog = parseErrors;
        parser.InputStream = markup;

        using (StringReader dtdText = new StringReader(Properties.Resources.WeakHtml))
        {
            parser.Dtd = Sgml.SgmlDtd.Parse(null, "HTML", null, dtdText, null, null, parser.NameTable);
        }

        converted.Load(parser);

        parseErrors.Flush();
        Console.WriteLine(parseErrors.ToString());
    }
    return converted;
}
/// <summary>
/// Loads the entry page at <paramref name="entryUrl"/> and lazily yields one
/// <c>Comment</c> per element whose class contains <c>entry-comment</c> below
/// the element with id <c>content</c>.
/// </summary>
/// <remarks>
/// Comments are loaded dynamically by JavaScript, so they cannot be retrieved
/// with this approach.
/// </remarks>
private static IEnumerable<Comment> RetrieveComments(string entryUrl)
{
    XmlDocument page = new XmlDocument();

    Console.Error.Write("{0} のコメントを取得中 ... ", entryUrl);

    using (Sgml.SgmlReader html = new Sgml.SgmlReader())
    {
        html.Href = entryUrl;
        html.CaseFolding = Sgml.CaseFolding.ToLower;
        page.Load(html);

        // Pause for half a second after each page load.
        System.Threading.Thread.Sleep(500);
    }

    // Located by XPath instead of doc.GetElementById("content").
    XmlNode content = page.SelectSingleNode("//*[@id = 'content']");

    foreach (XmlElement commentRoot in content.SelectNodes(".//*[contains(@class, 'entry-comment')]"))
    {
        Comment parsed = new Comment();

        foreach (XmlNode child in commentRoot.ChildNodes)
        {
            XmlElement part = child as XmlElement;
            if (part == null)
            {
                continue;
            }

            string role = part.GetAttribute("class");
            if (role == "comment-user-name")
            {
                /*
                 * <!-- hatena user -->
                 * <e class="comment-user-name">
                 *   <a class="comment-user-id" href="http://blog.hatena.ne.jp/hatenaid/">
                 *     <span class="comment-nickname" data-user-name="hatenaid">
                 *       id:hatenaid
                 *     </span>
                 *   </a>
                 * </e>
                 * <!-- name with website -->
                 * <e class="comment-user-name">
                 *   name
                 *   <a class="icon-website" href="http://example.com/" />
                 * </e>
                 * <!-- name only -->
                 * <e class="comment-user-name">
                 *   name
                 * </e>
                 */
                parsed.Author = part.InnerText.Trim();
                parsed.Url = part.GetSingleNodeValueOf(".//@href");
            }
            else if (role == "comment-content")
            {
                /*
                 * <e class="comment-content">
                 *   <p>comment-html</p>
                 * </e>
                 */
                parsed.Content = part.FirstChild.InnerXml;
            }
            else if (role == "comment-metadata")
            {
                /*
                 * <e class="comment-metadata">
                 *   <time data-epoch="1387283661000" />
                 * </e>
                 */
                // data-epoch is divided by 1000 before conversion.
                parsed.Date = DateTimeOffset.FromUnixTime(part.GetSingleNodeValueOf("time/@data-epoch", long.Parse) / 1000).ToLocalTime();
            }
        }

        yield return parsed;
    }

    Console.Error.WriteLine("完了");
}
/// <summary>
/// Looks up <paramref name="manufacturerId"/> on the tweakers.net price watch
/// and collects the shops, prices and delivery status listed for the product.
/// </summary>
/// <param name="manufacturerId">Manufacturer id used as the search keyword.</param>
/// <returns>
/// A result for the product. It has no shops when the search finds nothing,
/// and is marked <c>ProductStatus.Obsolete</c> when the product page reports
/// that prices are no longer shown.
/// </returns>
public SlurpResult Process(string manufacturerId)
{
    using (WebClient client = new WebClient())
    {
        SlurpResult result = new SlurpResult(manufacturerId);
        result.SiteName = this.GetType().Name;

        var tweakerUrl = String.Format("http://tweakers.net/pricewatch/zoeken/?keyword={0}", manufacturerId);
        string searchResults = client.DownloadString(tweakerUrl);
        if (searchResults.Contains("Er werden geen producten gevonden."))
        {
            return result;
        }

        log.InfoFormat("Processing product {0}", manufacturerId);

        // The first row of the search result table links to the product page.
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(searchResults);
        var productUrl = doc.DocumentNode.SelectSingleNode("//table[@class=\"priceTable\"]/tbody/tr[1]/td[2]/p/a").Attributes["href"].Value;

        string productPage = client.DownloadString(productUrl);
        if (productPage.Contains("Van dit product worden geen prijzen meer getoond.") || productPage.Contains("Geen actuele prijzen bekend."))
        {
            result.ProductStatus = ProductStatus.Obsolete;
            return result;
        }

        XDocument xdoc;
        using (var reader = new StringReader(productPage))
        using (Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader())
        {
            sgmlReader.DocType = "HTML";
            sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
            sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
            sgmlReader.InputStream = reader;
            xdoc = XDocument.Load(sgmlReader);
        }

        // The (string) conversion yields null for a missing attribute, so tables
        // without a class attribute are skipped instead of raising a
        // NullReferenceException.
        var rows = xdoc.Root.Descendants("table")
            .Where(t => (string)t.Attribute("class") == "priceTable")
            .First()
            .Element("tbody")
            .Elements("tr");

        foreach (var row in rows)
        {
            // Materialize once; the cells are inspected several times below.
            var cells = row.Elements("td").ToList();
            if (cells.Count == 0)
            {
                // Rows without data cells carry no shop.
                continue;
            }

            // Rows whose first cell spans several columns are not shop rows.
            var firstCell = cells[0];
            var colspan = firstCell.Attribute("colspan");
            if (colspan != null && Int32.Parse(colspan.Value) > 1)
            {
                continue;
            }

            var shop = firstCell.Value;

            var priceCell = cells.FirstOrDefault(x => (string)x.Attribute("class") == "price");
            decimal? price = null;
            if (priceCell != null)
            {
                price = StripPrice2(priceCell.Value);
            }

            // The total price is the text of the link inside the cell's div, when present.
            string txt = null;
            var totalPriceCell = cells.FirstOrDefault(x => (string)x.Attribute("class") == "totalPrice");
            if (totalPriceCell != null && totalPriceCell.Element("div") != null)
            {
                txt = totalPriceCell.Element("div").Element("a").Value;
            }
            var total = StripPrice2(txt);

            // The delivery status is derived from the icon's src; an absent icon
            // or src is passed on as an empty string.
            var deliveryCell = cells.First(x => (string)x.Attribute("class") == "delivery");
            var img = deliveryCell.Element("img");
            string delivery = img != null ? ((string)img.Attribute("src") ?? String.Empty) : String.Empty;
            var deliveryStatus = ParseIcon(delivery);

            result.Shops.Add(new ShopResult() { Name = shop, Price = price, TotalPrice = total, Delivery = deliveryStatus });
        }

        return result;
    }
}