/// <summary>
/// Creates a loader: parses the HTML 4.01 DTD ("strict.dtd") through an
/// <see cref="EmbeddedResourceEntityResolver"/> and copies the source
/// documentation root from <paramref name="options"/>.
/// </summary>
/// <param name="options">Context supplying <c>SourceDocumentationRoot</c>.</param>
public HtmlLoader(ProcessingContext options)
{
    // NOTE(review): "urn:anything" is presumably just a placeholder base URI,
    // since the DTD entity is handed to the embedded-resource resolver - confirm.
    Uri dtdBaseUri = new Uri("urn:anything");
    NameTable nameTable = new NameTable();
    EmbeddedResourceEntityResolver resolver = new EmbeddedResourceEntityResolver();

    HtmlDtd = Sgml.SgmlDtd.Parse(
        dtdBaseUri,
        "HTML",
        "-//W3C//DTD HTML 4.01//EN",
        "strict.dtd",
        string.Empty,
        nameTable,
        resolver);

    SourceDocumentationRoot = options.SourceDocumentationRoot;
}
/// <summary>
/// Loads the DTD the first time it is needed and derives the root element
/// name and the "is HTML" flag from it.
/// </summary>
/// <remarks>
/// Does nothing when a DTD is already set. With no system literal, the
/// "Html.dtd" resource embedded in this assembly is used, but only when the
/// doctype is "html". Otherwise the system literal is resolved against
/// <paramref name="baseUri"/>, then the reader's own base URI, then the
/// current directory.
/// </remarks>
/// <param name="baseUri">Base URI used to resolve the DTD location; may be null.</param>
private void LazyLoadDtd(Uri baseUri)
{
    if (this.dtd != null)
    {
        return;
    }

    if (string.IsNullOrEmpty(this.syslit))
    {
        if (this.docType != null && StringUtilities.EqualsIgnoreCase(this.docType, "html"))
        {
            Assembly a = typeof(SgmlReader).Assembly;
            // Resource name is "<assembly simple name>.Html.dtd".
            string name = a.FullName.Split(',')[0] + ".Html.dtd";
            Stream stm = a.GetManifestResourceStream(name);
            if (stm != null)
            {
                StreamReader sr = new StreamReader(stm);
                this.dtd = SgmlDtd.Parse(baseUri, "HTML", null, sr, null, this.proxy, this.nametable);
            }
        }
    }
    else
    {
        if (baseUri != null)
        {
            baseUri = new Uri(baseUri, this.syslit);
        }
        else if (this.baseUri != null)
        {
            baseUri = new Uri(this.baseUri, this.syslit);
        }
        else
        {
            // Use the platform separator so the directory URI is also valid
            // on non-Windows systems (a hard-coded backslash is Windows-only).
            baseUri = new Uri(new Uri(Directory.GetCurrentDirectory() + Path.DirectorySeparatorChar), this.syslit);
        }
        this.dtd = SgmlDtd.Parse(baseUri, this.docType, this.pubid, baseUri.AbsoluteUri, this.subset, this.proxy, this.nametable);
    }

    if (this.dtd != null && this.dtd.Name != null)
    {
        // Element names are identifiers, not linguistic text: fold them with
        // the invariant culture so the result does not depend on the current
        // thread culture (e.g. Turkish dotted/dotless i).
        switch (this.CaseFolding)
        {
            case CaseFolding.ToUpper:
                this.rootElementName = this.dtd.Name.ToUpperInvariant();
                break;
            case CaseFolding.ToLower:
                this.rootElementName = this.dtd.Name.ToLowerInvariant();
                break;
            default:
                this.rootElementName = this.dtd.Name;
                break;
        }
        this.isHtml = StringUtilities.EqualsIgnoreCase(this.dtd.Name, "html");
    }
}
/// <summary>
/// Follows the <c>a</c>/<c>href</c> links of <paramref name="doc"/>, loads every
/// not-yet-visited HTML page through an <see cref="SgmlReader"/> and crawls
/// it recursively.
/// </summary>
/// <param name="dtd">DTD assigned to every reader created for a linked page.</param>
/// <param name="doc">Document whose links are followed; its <c>BaseURI</c> is used to resolve relative links.</param>
/// <param name="log">Receives progress and error messages.</param>
/// <returns>true, unless a nested crawl returned false.</returns>
bool Crawl(SgmlDtd dtd, XmlDocument doc, TextWriter log)
{
    depth++;
    try
    {
        // One space of log indentation per nesting level.
        string indent = new string(' ', depth);
        count++;

        Uri baseUri = new Uri(doc.BaseURI);
        XmlElement baseElmt = (XmlElement)doc.SelectSingleNode("/html/head/base");
        if (baseElmt != null)
        {
            string baseHref = baseElmt.GetAttribute("href");
            if (baseHref != "")
            {
                try
                {
                    baseUri = new Uri(baseHref);
                }
                catch (Exception)
                {
                    Console.WriteLine("### Error parsing BASE href '" + baseHref + "'");
                }
            }
        }

        foreach (XmlElement a in doc.SelectNodes("//a"))
        {
            string href = a.GetAttribute("href");
            if (string.IsNullOrEmpty(href) || depth >= 5)
            {
                continue;
            }

            // A single malformed link must not abort the remaining links.
            Uri local;
            try
            {
                local = new Uri(baseUri, href);
            }
            catch (UriFormatException e)
            {
                log.WriteLine(indent + "### Error parsing href '" + href + "': " + e.Message);
                log.Flush();
                continue;
            }

            if (domain && baseUri.Host != local.Host)
            {
                continue;
            }

            string ext = Path.GetExtension(local.AbsolutePath).ToLowerInvariant();
            if (ext == ".jpg" || ext == ".gif" || ext == ".mpg")
            {
                continue;
            }

            string url = local.AbsoluteUri;
            if (visited.ContainsKey(url))
            {
                continue;
            }
            visited.Add(url, url);

            log.WriteLine(indent + "Loading '" + url + "'");
            log.Flush();

            StreamReader stm = null;
            WebResponse resp = null;
            try
            {
                HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
                wr.Timeout = 10000;
                if (proxy != null)
                {
                    wr.Proxy = new WebProxy(proxy);
                }
                wr.PreAuthenticate = false;
                // Pass the credentials of the process.
                wr.Credentials = CredentialCache.DefaultCredentials;

                resp = wr.GetResponse();
                Uri actual = resp.ResponseUri;
                if (actual.AbsoluteUri != url)
                {
                    local = new Uri(actual.AbsoluteUri);
                    log.WriteLine(indent + "Redirected to '" + actual.AbsoluteUri + "'");
                    log.Flush();
                }

                // Content-Type usually carries parameters ("text/html; charset=utf-8"),
                // so compare only the media type, case-insensitively.
                string mediaType = (resp.ContentType ?? "").Split(';')[0].Trim();
                if (!string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase))
                {
                    log.WriteLine(indent + "Skipping ContentType=" + resp.ContentType);
                    log.Flush();
                    resp.Close();
                }
                else
                {
                    stm = new StreamReader(resp.GetResponseStream());
                }
            }
            catch (Exception e)
            {
                log.WriteLine(indent + "### Error opening URL: " + e.Message);
                log.Flush();
                // Do not leak the connection when the failure happened after
                // the response was obtained.
                if (resp != null)
                {
                    resp.Close();
                }
            }

            if (stm == null)
            {
                continue;
            }

            SgmlReader reader = new SgmlReader();
            reader.Dtd = dtd;
            reader.SetBaseUri(local.AbsoluteUri);
            reader.InputStream = stm;
            reader.WebProxy = proxy;

            XmlDocument d2 = new XmlDocument();
            d2.XmlResolver = null; // don't do any downloads!
            try
            {
                d2.Load(reader);
                reader.Close();
                stm.Close();
                if (!Crawl(dtd, d2, log))
                {
                    return false;
                }
            }
            catch (Exception e)
            {
                log.WriteLine(indent + "### Error parsing document '" + local.AbsoluteUri + "', " + e.Message);
                log.Flush();
                reader.Close();
                // Closing twice is harmless; this covers a failure inside Load.
                stm.Close();
            }
        }

        return true;
    }
    finally
    {
        // Always undo the increment, including on early return or when an
        // exception escapes, so later pages see the correct depth.
        depth--;
    }
}
/// <summary>
/// Builds a DTD named <paramref name="name"/> by parsing the text supplied
/// by <paramref name="input"/>, plus <paramref name="subset"/> when it is
/// not null or empty.
/// </summary>
/// <param name="baseUri">Base URI passed along with each pushed entity.</param>
/// <param name="name">Name of the DTD.</param>
/// <param name="input">Reader supplying the DTD text.</param>
/// <param name="subset">Optional subset text, pushed as a second entity.</param>
/// <param name="proxy">Proxy value handed to the entity created for <paramref name="input"/>.</param>
/// <param name="nt">Name table handed to the DTD.</param>
/// <returns>The parsed DTD.</returns>
/// <exception cref="SgmlParseException">
/// Parsing failed; the message is the original message followed by the
/// context of the DTD's current entity.
/// </exception>
public static SgmlDtd Parse(Uri baseUri, string name, TextReader input, string subset, string proxy, XmlNameTable nt)
{
    SgmlDtd result = new SgmlDtd(name, nt);

    Entity mainEntity = new Entity(result.Name, baseUri, input, proxy);
    result.PushEntity(baseUri, mainEntity);

    bool hasSubset = !string.IsNullOrEmpty(subset);
    if (hasSubset)
    {
        Entity subsetEntity = new Entity(name, subset);
        result.PushEntity(baseUri, subsetEntity);
    }

    try
    {
        result.Parse();
    }
    catch (Exception failure)
    {
        string message = failure.Message + result.m_current.Context();
        throw new SgmlParseException(message);
    }

    return result;
}
/// <summary>
/// Determines whether an element whose content is described by this group
/// may contain the named element.
/// </summary>
/// <param name="name">Name of the element to look for (compared ignoring case).</param>
/// <param name="dtd">DTD used to look up member element declarations.</param>
/// <returns>true when the element is allowed inside this group; otherwise false.</returns>
/// <exception cref="ArgumentNullException"><paramref name="dtd"/> is null.</exception>
/// <remarks>
/// Rough approximation - the group is treated as if it were an "Or" group.
/// </remarks>
public bool CanContain(string name, SgmlDtd dtd)
{
    if (dtd == null)
    {
        throw new ArgumentNullException("dtd");
    }

    // Pass 1: cheap check for a member with exactly this name.
    foreach (object member in Members)
    {
        string memberName = member as string;
        if (memberName != null && string.Equals(memberName, name, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    // Pass 2: more expensive search through nested groups and through member
    // elements whose start tag is optional.
    foreach (object member in Members)
    {
        string memberName = member as string;
        if (memberName == null)
        {
            Group nested = (Group)member;
            if (nested.CanContain(name, dtd))
            {
                return true;
            }
            continue;
        }

        ElementDecl decl = dtd.FindElement(memberName);
        // Tricky case: with an optional start tag the element may be implied,
        // so whatever it can contain is effectively allowed here too.
        if (decl != null && decl.StartTagOptional && decl.CanContain(name, dtd))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Determines whether this content model allows the named element.
/// </summary>
/// <param name="name">Name of the element to look for.</param>
/// <param name="dtd">DTD used during the check.</param>
/// <returns>
/// false when the declared content is anything other than
/// <c>DeclaredContent.Default</c> (empty or text-only node); otherwise the
/// answer of the underlying model group.
/// </returns>
public bool CanContain(string name, SgmlDtd dtd)
{
    bool allowsChildElements = m_declaredContent == DeclaredContent.Default;
    return allowsChildElements && m_model.CanContain(name, dtd);
}
/// <summary>
/// Determines whether this element is allowed to contain the named element.
/// </summary>
/// <param name="name">Name of the element to check for (compared ignoring case).</param>
/// <param name="dtd">DTD used when the content model has to be consulted.</param>
/// <returns>
/// false if the name is listed in the exclusions, true if it is listed in
/// the inclusions, otherwise whatever the content model answers.
/// </returns>
public bool CanContain(string name, SgmlDtd dtd)
{
    const StringComparison ignoreCase = StringComparison.OrdinalIgnoreCase;

    // Exclusions are checked first, so they win over inclusions.
    if (m_exclusions != null)
    {
        foreach (string excluded in m_exclusions)
        {
            if (string.Equals(excluded, name, ignoreCase))
            {
                return false;
            }
        }
    }

    if (m_inclusions != null)
    {
        foreach (string included in m_inclusions)
        {
            if (string.Equals(included, name, ignoreCase))
            {
                return true;
            }
        }
    }

    // Neither list mentions the name: defer to the content model.
    return m_contentModel.CanContain(name, dtd);
}
/// <summary>
/// Builds a DTD named <paramref name="name"/> from an external entity
/// (<paramref name="url"/>) and/or a <paramref name="subset"/> string.
/// </summary>
/// <param name="baseUri">Base URI passed along with each pushed entity.</param>
/// <param name="name">Name of the DTD.</param>
/// <param name="pubid">Public identifier handed to the external entity.</param>
/// <param name="url">Location of the external DTD; skipped when null or empty.</param>
/// <param name="subset">Subset text; skipped when null or empty.</param>
/// <param name="proxy">Proxy value handed to the external entity.</param>
/// <param name="nt">Name table handed to the DTD.</param>
/// <returns>The parsed DTD.</returns>
/// <exception cref="Exception">
/// Parsing failed. The message is the original message followed by the
/// context of the DTD's current entity; the original exception is available
/// as <see cref="Exception.InnerException"/>.
/// </exception>
public static SgmlDtd Parse(Uri baseUri, string name, string pubid, string url, string subset, string proxy, XmlNameTable nt)
{
    SgmlDtd dtd = new SgmlDtd(name, nt);

    if (!string.IsNullOrEmpty(url))
    {
        dtd.PushEntity(baseUri, new Entity(dtd.Name, pubid, url, proxy));
    }

    if (!string.IsNullOrEmpty(subset))
    {
        dtd.PushEntity(baseUri, new Entity(name, subset));
    }

    try
    {
        dtd.Parse();
    }
    catch (Exception e)
    {
        // Keep the type and message callers already see, but chain the
        // original exception so its type and stack trace are not lost.
        throw new Exception(e.Message + dtd.current.Context(), e);
    }

    return dtd;
}
/// <summary>
/// Determines whether this element is allowed to contain the named element.
/// </summary>
/// <param name="name">Name of the element to check for.</param>
/// <param name="dtd">DTD used when the content model has to be consulted.</param>
/// <returns>
/// false if the name is listed in the exclusions, true if it is listed in
/// the inclusions, otherwise whatever the content model answers.
/// </returns>
/// <remarks>
/// Names are matched by reference, not by value (XmlNameTable optimization).
/// NOTE(review): this presumably relies on all names being atomized through
/// the same XmlNameTable - confirm against callers.
/// </remarks>
public bool CanContain(string name, SgmlDtd dtd)
{
    // Exclusions are checked first, so they win over inclusions.
    if (Exclusions != null)
    {
        foreach (string excluded in Exclusions)
        {
            if (object.ReferenceEquals(excluded, name))
            {
                return false;
            }
        }
    }

    if (Inclusions != null)
    {
        foreach (string included in Inclusions)
        {
            if (object.ReferenceEquals(included, name))
            {
                return true;
            }
        }
    }

    return ContentModel.CanContain(name, dtd);
}
/// <summary>
/// Determines whether an element whose content is described by this group
/// may contain the named element.
/// </summary>
/// <param name="name">Name of the element to look for.</param>
/// <param name="dtd">DTD used to look up member element declarations.</param>
/// <returns>true when the element is allowed inside this group; otherwise false.</returns>
/// <remarks>
/// Rough approximation - the group is treated as if it were an "Or" group.
/// Direct members are matched by reference, not by value (XmlNameTable
/// optimization). NOTE(review): this presumably relies on all names being
/// atomized through the same XmlNameTable - confirm against callers.
/// </remarks>
public bool CanContain(string name, SgmlDtd dtd)
{
    // Pass 1: cheap check for a string member that is this very name instance.
    foreach (object member in Members)
    {
        if (member is string && object.ReferenceEquals(member, name))
        {
            return true;
        }
    }

    // Pass 2: more expensive search through nested groups and through member
    // elements whose start tag is optional.
    foreach (object member in Members)
    {
        string memberName = member as string;
        if (memberName == null)
        {
            Group nested = (Group)member;
            if (nested.CanContain(name, dtd))
            {
                return true;
            }
            continue;
        }

        ElementDecl decl = dtd.FindElement(memberName);
        // Tricky case: with an optional start tag the element may be implied,
        // so whatever it can contain is effectively allowed here too.
        if (decl != null && decl.StartTagOptional && decl.CanContain(name, dtd))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Loads the DTD if none is set yet (and DTDs are not being ignored), then
/// refreshes the root element name and the "is HTML" flag from the DTD.
/// </summary>
/// <remarks>
/// With no system literal, an embedded DTD is used for the "html" and "ofx"
/// doctypes only. Otherwise the system literal is resolved against
/// <paramref name="baseUri"/>, then the reader's own base URI, then the
/// current directory. The root-name update runs on every call, whether or
/// not a DTD was loaded by this call.
/// </remarks>
/// <param name="baseUri">Base URI used to resolve the DTD location; may be null.</param>
private void LazyLoadDtd(Uri baseUri)
{
    if (this.m_dtd == null && !this.m_ignoreDtd)
    {
        if (string.IsNullOrEmpty(this.m_syslit))
        {
            if (this.m_docType != null)
            {
                if (StringUtilities.EqualsIgnoreCase(this.m_docType, "html"))
                {
                    TryLoadEmbeddedDtd(baseUri, ".Html.dtd", "HTML");
                }
                else if (StringUtilities.EqualsIgnoreCase(this.m_docType, "ofx"))
                {
                    TryLoadEmbeddedDtd(baseUri, ".ofx160.dtd", "OFX");
                }
            }
        }
        else
        {
            // First non-null of: caller's base, reader's base, current directory.
            Uri root = baseUri ?? this.m_baseUri;
            if (root == null)
            {
                root = new Uri(Directory.GetCurrentDirectory() + "/");
            }

            baseUri = new Uri(root, this.m_syslit);
            this.m_dtd = SgmlDtd.Parse(baseUri, this.m_docType, this.m_pubid, baseUri.AbsoluteUri, this.m_subset, this.m_proxy, null);
        }
    }

    if (this.m_dtd == null || this.m_dtd.Name == null)
    {
        return;
    }

    CaseFolding folding = this.CaseFolding;
    if (folding == CaseFolding.ToUpper)
    {
        this.m_rootElementName = this.m_dtd.Name.ToUpperInvariant();
    }
    else if (folding == CaseFolding.ToLower)
    {
        this.m_rootElementName = this.m_dtd.Name.ToLowerInvariant();
    }
    else
    {
        this.m_rootElementName = this.m_dtd.Name;
    }

    this.m_isHtml = StringUtilities.EqualsIgnoreCase(this.m_dtd.Name, "html");
}

/// <summary>
/// Parses the DTD stored as an embedded resource of this assembly, leaving
/// the current DTD untouched when the resource does not exist.
/// </summary>
/// <param name="baseUri">Base URI handed to the DTD parser.</param>
/// <param name="resourceSuffix">Resource name without the assembly-name prefix, e.g. ".Html.dtd".</param>
/// <param name="dtdName">Name given to the parsed DTD.</param>
private void TryLoadEmbeddedDtd(Uri baseUri, string resourceSuffix, string dtdName)
{
    Assembly assembly = typeof(SgmlReader).Assembly;
    // Resource name is "<assembly simple name><suffix>".
    string resourceName = assembly.FullName.Split(',')[0] + resourceSuffix;
    Stream resource = assembly.GetManifestResourceStream(resourceName);
    if (resource != null)
    {
        StreamReader text = new StreamReader(resource);
        this.m_dtd = SgmlDtd.Parse(baseUri, dtdName, text, null, this.m_proxy, null);
    }
}
/// <summary>
/// Parses the HTML DTD held in the <c>htmldtd</c> string, then sets the root
/// element name and the "is HTML" flag from it.
/// </summary>
/// <remarks>
/// There is no "already loaded" check here: the DTD is parsed and assigned
/// on every call.
/// </remarks>
/// <param name="baseUri">Base URI handed to the DTD parser.</param>
private void LazyLoadDtd(Uri baseUri)
{
    var dtdText = new StringReader(htmldtd);
    this.m_dtd = SgmlDtd.Parse(baseUri, "HTML", dtdText, null, this.m_proxy, null);

    if (this.m_dtd == null || this.m_dtd.Name == null)
    {
        return;
    }

    // Fold the root element name the same way element names are folded.
    CaseFolding folding = this.CaseFolding;
    if (folding == CaseFolding.ToUpper)
    {
        this.m_rootElementName = this.m_dtd.Name.ToUpperInvariant();
    }
    else if (folding == CaseFolding.ToLower)
    {
        this.m_rootElementName = this.m_dtd.Name.ToLowerInvariant();
    }
    else
    {
        this.m_rootElementName = this.m_dtd.Name;
    }

    this.m_isHtml = StringUtilities.EqualsIgnoreCase(this.m_dtd.Name, "html");
}
/// <summary>
/// Loads the DTD the first time it is needed.
/// </summary>
/// <remarks>
/// Does nothing when a DTD is already set. With no system literal, the
/// "Html.dtd" resource embedded in this assembly is used, but only when the
/// doctype is "html" (ignoring case); if that resource is missing, no DTD is
/// loaded. Otherwise the system literal is used as the DTD location, either
/// as a URL (when it contains "://") or as a local file name.
/// </remarks>
/// <param name="baseUri">Base URI for the embedded DTD; replaced when a system literal is present.</param>
private void LazyLoadDtd(Uri baseUri)
{
    if (_dtd != null)
    {
        return;
    }

    if (string.IsNullOrEmpty(_syslit))
    {
        // Ordinal comparison: the doctype is an identifier, so the match must
        // not depend on the current culture. A null doctype never matches.
        if (string.Equals(_docType, "html", StringComparison.OrdinalIgnoreCase))
        {
            Assembly a = typeof(SgmlReader).Assembly;
            // Resource name is "<assembly simple name>.Html.dtd".
            string name = a.FullName.Split(',')[0] + ".Html.dtd";
            Stream stm = a.GetManifestResourceStream(name);
            // GetManifestResourceStream returns null for a missing resource;
            // skip loading instead of failing inside the StreamReader constructor.
            if (stm != null)
            {
                StreamReader sr = new StreamReader(stm);
                _dtd = SgmlDtd.Parse(baseUri, "HTML", null, sr, null, _proxy, _nametable);
            }
        }
    }
    else
    {
        if (_syslit.IndexOf("://") > 0)
        {
            baseUri = new Uri(_syslit);
        }
        else
        {
            // probably a local filename.
            baseUri = new Uri("file://" + _syslit.Replace("\\", "/"));
        }
        _dtd = SgmlDtd.Parse(baseUri, _docType, _pubid, _syslit, _subset, _proxy, _nametable);
    }
}
/// <summary>
/// Creates a loader and initializes <c>HtmlDtd</c> with the result of
/// <c>LoadHtmlDtd()</c>.
/// </summary>
public HtmlLoader()
{
    var loadedDtd = LoadHtmlDtd();
    HtmlDtd = loadedDtd;
}