Provides DTD parsing and support for the SgmlParser framework.
Exemple #1
0
 /// <summary>
 /// Creates a loader, parsing the HTML 4.01 DTD identified by "strict.dtd" and copying
 /// the documentation root from the supplied processing context.
 /// </summary>
 /// <param name="options">Context supplying <c>SourceDocumentationRoot</c>.</param>
 public HtmlLoader(ProcessingContext options)
 {
     // NOTE(review): "urn:anything" looks like a placeholder base URI - confirm it is never dereferenced.
     Uri placeholderBase = new Uri("urn:anything");
     NameTable names = new NameTable();
     EmbeddedResourceEntityResolver resolver = new EmbeddedResourceEntityResolver();

     HtmlDtd = Sgml.SgmlDtd.Parse(
         placeholderBase,
         "HTML",
         "-//W3C//DTD HTML 4.01//EN",
         "strict.dtd",
         string.Empty,
         names,
         resolver);

     SourceDocumentationRoot = options.SourceDocumentationRoot;
 }
Exemple #2
0
        /// <summary>
        /// Loads the DTD on first use and caches the root element name and HTML flag derived from it.
        /// </summary>
        /// <remarks>
        /// With no system literal, the HTML DTD embedded in this assembly is used when the
        /// document type is "html". Otherwise the system literal is resolved against
        /// <paramref name="baseUri"/>, then the reader's own base URI, then the current directory.
        /// </remarks>
        /// <param name="baseUri">Base URI for resolving the system literal; may be null.</param>
        private void LazyLoadDtd(Uri baseUri) {
            if (this.dtd != null) {
                return; // already loaded
            }

            if (string.IsNullOrEmpty(this.syslit)) {
                if (this.docType != null && StringUtilities.EqualsIgnoreCase(this.docType, "html")) {
                    Assembly a = typeof(SgmlReader).Assembly;
                    // The resource is named "<assembly simple name>.Html.dtd".
                    string name = a.FullName.Split(',')[0]+".Html.dtd";
                    Stream stm = a.GetManifestResourceStream(name);
                    if (stm != null){
                        StreamReader sr = new StreamReader(stm);
                        this.dtd = SgmlDtd.Parse(baseUri, "HTML", null, sr, null, this.proxy, this.nametable);
                    }
                }
            } else {
                if (baseUri != null) {
                    baseUri = new Uri(baseUri, this.syslit);
                } else if (this.baseUri != null) {
                    baseUri = new Uri(this.baseUri, this.syslit);
                } else {
                    // Use the platform separator: a hard-coded backslash is not a directory
                    // separator on non-Windows hosts, so the relative resolution would go wrong there.
                    baseUri = new Uri(new Uri(Directory.GetCurrentDirectory()+Path.DirectorySeparatorChar), this.syslit);
                }
                this.dtd = SgmlDtd.Parse(baseUri, this.docType, this.pubid, baseUri.AbsoluteUri, this.subset, this.proxy, this.nametable);
            }

            if (this.dtd != null && this.dtd.Name != null){
                // Invariant casing keeps element names stable regardless of the thread culture.
                switch (this.CaseFolding){
                    case CaseFolding.ToUpper:
                        this.rootElementName = this.dtd.Name.ToUpperInvariant();
                        break;
                    case CaseFolding.ToLower:
                        this.rootElementName = this.dtd.Name.ToLowerInvariant();
                        break;
                    default:
                        this.rootElementName = this.dtd.Name;
                        break;
                }
                this.isHtml = StringUtilities.EqualsIgnoreCase(this.dtd.Name, "html");
            }
        }
Exemple #3
0
        /// <summary>
        /// Follows the anchors of <paramref name="doc"/>, downloading and parsing each not-yet-visited
        /// HTML page with an <c>SgmlReader</c> and recursing into it.
        /// </summary>
        /// <param name="dtd">DTD assigned to every reader created for a downloaded page.</param>
        /// <param name="doc">Document whose anchors are followed; its BaseURI (or a BASE href) resolves relative links.</param>
        /// <param name="log">Writer receiving progress and error messages.</param>
        /// <returns>True normally; false only when a nested crawl returned false.</returns>
        bool Crawl(SgmlDtd dtd, XmlDocument doc, TextWriter log)
        {
            depth++;
            try {
                string indent = depth > 0 ? new string(' ', depth) : string.Empty;

                count++;
                Uri baseUri = new Uri(doc.BaseURI);
                XmlElement baseElmt = (XmlElement)doc.SelectSingleNode("/html/head/base");
                if (baseElmt != null) {
                    string baseHref = baseElmt.GetAttribute("href");
                    if (baseHref != "") {
                        try {
                            baseUri = new Uri(baseHref);
                        }
                        catch (Exception ) {
                            Console.WriteLine("### Error parsing BASE href '"+baseHref+"'");
                        }
                    }
                }

                // Links are only followed while the nesting depth is below 5.
                if (depth >= 5) {
                    return true;
                }

                foreach (XmlElement a in doc.SelectNodes("//a")) {
                    string href = a.GetAttribute("href");
                    if (string.IsNullOrEmpty(href))
                        continue;

                    // A single malformed href must not abort the remaining links on the page.
                    Uri local;
                    try {
                        local = new Uri(baseUri, href);
                    }
                    catch (UriFormatException e) {
                        log.WriteLine(indent+"### Error parsing href '"+href+"': " + e.Message);
                        log.Flush();
                        continue;
                    }

                    if (domain && baseUri.Host != local.Host)
                        continue;
                    string ext = Path.GetExtension(local.AbsolutePath).ToLowerInvariant();
                    if (ext == ".jpg" || ext == ".gif" || ext==".mpg")
                        continue;
                    string url = local.AbsoluteUri;
                    if (visited.ContainsKey(url))
                        continue;

                    visited.Add(url, url);
                    log.WriteLine(indent+"Loading '"+url+"'");
                    log.Flush();
                    StreamReader stm = null;
                    WebResponse resp = null;
                    try {
                        HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
                        wr.Timeout = 10000;
                        if (proxy != null) wr.Proxy = new WebProxy(proxy);
                        wr.PreAuthenticate = false;
                        // Pass the credentials of the process.
                        wr.Credentials = CredentialCache.DefaultCredentials;

                        resp = wr.GetResponse();
                        Uri actual = resp.ResponseUri;
                        if (actual.AbsoluteUri != url) {
                            local = new Uri(actual.AbsoluteUri);
                            log.WriteLine(indent+"Redirected to '"+actual.AbsoluteUri+"'");
                            log.Flush();
                        }
                        if (!IsHtmlContentType(resp.ContentType)) {
                            log.WriteLine(indent+"Skipping ContentType="+resp.ContentType);
                            log.Flush();
                            resp.Close();
                        }
                        else {
                            stm = new StreamReader(resp.GetResponseStream());
                        }
                    }
                    catch (Exception e) {
                        log.WriteLine(indent+"### Error opening URL: " + e.Message);
                        log.Flush();
                        // Do not leave the connection open when the failure happened after GetResponse.
                        if (resp != null) resp.Close();
                    }
                    if (stm != null) {
                        SgmlReader reader = new SgmlReader();
                        reader.Dtd = dtd;
                        reader.SetBaseUri(local.AbsoluteUri);
                        reader.InputStream = stm;
                        reader.WebProxy = proxy;

                        XmlDocument d2 = new XmlDocument();
                        d2.XmlResolver = null; // don't do any downloads!
                        try {
                            d2.Load(reader);
                            // Release the connection before recursing into the loaded page.
                            reader.Close();
                            stm.Close();
                            if (!Crawl(dtd, d2, log))
                                return false;
                        }
                        catch (Exception e) {
                            log.WriteLine(indent+"### Error parsing document '"+local.AbsoluteUri+"', "+e.Message);
                            log.Flush();
                            reader.Close();
                            stm.Close();
                        }
                    }
                }
                return true;
            }
            finally {
                // Always restore the depth, including when an exception or an early return
                // leaves this level; otherwise the depth limit and indentation drift.
                depth--;
            }
        }

        /// <summary>
        /// Returns true when the media type of a Content-Type header value is text/html,
        /// ignoring case and any parameters such as "; charset=utf-8".
        /// </summary>
        private static bool IsHtmlContentType(string contentType)
        {
            if (string.IsNullOrEmpty(contentType))
                return false;
            string mediaType = contentType.Split(';')[0].Trim();
            return string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase);
        }
        /// <summary>
        /// Builds an <see cref="SgmlDtd"/> from a text reader and an optional subset string.
        /// </summary>
        /// <param name="baseUri">Base URI handed to each pushed entity.</param>
        /// <param name="name">Name given to the DTD and to the subset entity.</param>
        /// <param name="input">Reader supplying the DTD text.</param>
        /// <param name="subset">Optional subset text; ignored when null or empty.</param>
        /// <param name="proxy">Proxy value forwarded to the reader-backed entity.</param>
        /// <param name="nt">Name table passed to the DTD constructor.</param>
        /// <returns>The parsed DTD.</returns>
        /// <exception cref="SgmlParseException">Parsing failed; the message carries the current entity context.</exception>
        public static SgmlDtd Parse(Uri baseUri, string name, TextReader input, string subset, string proxy, XmlNameTable nt)
        {
            SgmlDtd result = new SgmlDtd(name, nt);

            Entity readerEntity = new Entity(result.Name, baseUri, input, proxy);
            result.PushEntity(baseUri, readerEntity);

            bool hasSubset = !string.IsNullOrEmpty(subset);
            if (hasSubset)
            {
                Entity subsetEntity = new Entity(name, subset);
                result.PushEntity(baseUri, subsetEntity);
            }

            try
            {
                result.Parse();
                return result;
            }
            catch (Exception failure)
            {
                // Re-raise as a parse exception with the entity context appended to the message.
                throw new SgmlParseException(failure.Message + result.m_current.Context());
            }
        }
        /// <summary>
        /// Determines whether an element whose content is described by this group may contain
        /// the named element.
        /// </summary>
        /// <param name="name">The element name to search for (compared case-insensitively).</param>
        /// <param name="dtd">The DTD used to look up member element declarations.</param>
        /// <returns>true if the name is reachable through this group, otherwise false.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="dtd"/> is null.</exception>
        /// <remarks>
        /// Rough approximation - this is really assuming an "Or" group
        /// </remarks>
        public bool CanContain(string name, SgmlDtd dtd)
        {
            if (dtd == null)
                throw new ArgumentNullException("dtd");

            // Pass 1: cheap check for a member whose name matches directly.
            foreach (object member in Members)
            {
                string memberName = member as string;
                if (memberName != null && string.Equals(memberName, name, StringComparison.OrdinalIgnoreCase))
                    return true;
            }

            // Pass 2: the costlier search through nested groups and through member
            // elements whose start tag is optional.
            foreach (object member in Members)
            {
                string memberName = member as string;
                if (memberName == null)
                {
                    Group nested = (Group)member;
                    if (nested.CanContain(name, dtd))
                        return true;

                    continue;
                }

                ElementDecl decl = dtd.FindElement(memberName);
                // An optional start tag means the target may legally sit inside this
                // member even though the member's own tag never appears.
                if (decl != null && decl.StartTagOptional && decl.CanContain(name, dtd))
                    return true;
            }

            return false;
        }
        /// <summary>
        /// Checks whether this content model allows the named element as content.
        /// </summary>
        /// <param name="name">The name of the element to look for.</param>
        /// <param name="dtd">The DTD forwarded to the underlying model's check.</param>
        /// <returns>
        /// false when the declared content is anything other than <c>DeclaredContent.Default</c>;
        /// otherwise the result of the underlying model's check.
        /// </returns>
        public bool CanContain(string name, SgmlDtd dtd)
        {
            // Non-default declared content means an empty or text-only node.
            bool allowsChildElements = m_declaredContent == DeclaredContent.Default;
            return allowsChildElements && m_model.CanContain(name, dtd);
        }
        /// <summary>
        /// Determines whether this element is allowed to contain the named element.
        /// </summary>
        /// <param name="name">The candidate child element name (compared case-insensitively).</param>
        /// <param name="dtd">The DTD forwarded to the content model check.</param>
        /// <returns>
        /// False when the name is listed as an exclusion, true when it is listed as an
        /// inclusion, otherwise whatever the content model reports.
        /// </returns>
        public bool CanContain(string name, SgmlDtd dtd)
        {
            // Exclusions are examined before inclusions, so an excluded name is rejected
            // even if it also appears among the inclusions.
            bool excluded = false;
            if (m_exclusions != null)
            {
                foreach (string candidate in m_exclusions)
                {
                    if (string.Equals(candidate, name, StringComparison.OrdinalIgnoreCase))
                    {
                        excluded = true;
                        break;
                    }
                }
            }
            if (excluded)
                return false;

            bool included = false;
            if (m_inclusions != null)
            {
                foreach (string candidate in m_inclusions)
                {
                    if (string.Equals(candidate, name, StringComparison.OrdinalIgnoreCase))
                    {
                        included = true;
                        break;
                    }
                }
            }
            if (included)
                return true;

            return m_contentModel.CanContain(name, dtd);
        }
Exemple #8
0
 /// <summary>
 /// Builds an SgmlDtd from an external DTD location and/or a subset string.
 /// </summary>
 /// <param name="baseUri">Base URI handed to each pushed entity.</param>
 /// <param name="name">Name given to the DTD and to the subset entity.</param>
 /// <param name="pubid">Public identifier passed to the external entity.</param>
 /// <param name="url">Location of the external DTD; skipped when null or empty.</param>
 /// <param name="subset">Subset text; skipped when null or empty.</param>
 /// <param name="proxy">Proxy value forwarded to the external entity.</param>
 /// <param name="nt">Name table passed to the DTD constructor.</param>
 /// <returns>The parsed DTD.</returns>
 /// <exception cref="Exception">
 /// Parsing failed; the message is the original message plus the entity context, and
 /// the original exception is available as <c>InnerException</c>.
 /// </exception>
 public static SgmlDtd Parse(Uri baseUri, string name, string pubid, string url, string subset, string proxy, XmlNameTable nt)
 {
     SgmlDtd dtd = new SgmlDtd(name, nt);
     if (!string.IsNullOrEmpty(url))
     {
         dtd.PushEntity(baseUri, new Entity(dtd.Name, pubid, url, proxy));
     }
     if (!string.IsNullOrEmpty(subset))
     {
         dtd.PushEntity(baseUri, new Entity(name, subset));
     }
     try
     {
         dtd.Parse();
     }
     catch (Exception e)
     {
         // NOTE(review): current is presumably unset when neither entity was pushed - confirm.
         // The guard keeps a NullReferenceException from masking the real failure.
         string context = dtd.current != null ? dtd.current.Context() : string.Empty;
         // Keep the original exception as InnerException so its type and stack trace survive.
         throw new Exception(e.Message + context, e);
     }
     return dtd;
 }
Exemple #9
0
 /// <summary>
 /// Determines whether this element is allowed to contain the named element.
 /// </summary>
 /// <remarks>
 /// Names are compared by reference, not by value (XmlNameTable optimization).
 /// NOTE(review): this presumably requires <paramref name="name"/> to come from the same
 /// name table as the stored names - confirm against callers.
 /// </remarks>
 /// <param name="name">The candidate child element name.</param>
 /// <param name="dtd">The DTD forwarded to the content model check.</param>
 /// <returns>
 /// False when the name is an exclusion, true when it is an inclusion, otherwise the
 /// content model's answer.
 /// </returns>
 public bool CanContain(string name, SgmlDtd dtd)
 {
     // Exclusions are checked first, so they take precedence over inclusions.
     if (Exclusions != null)
     {
         foreach (string excluded in Exclusions)
         {
             if (object.ReferenceEquals(excluded, name))
                 return false;
         }
     }

     if (Inclusions != null)
     {
         foreach (string included in Inclusions)
         {
             if (object.ReferenceEquals(included, name))
                 return true;
         }
     }

     return ContentModel.CanContain(name, dtd);
 }
Exemple #10
0
 // Rough approximation - this is really assuming an "Or" group.
 // Reports whether the named element is reachable through this group: directly as a
 // member, through a member element whose start tag is optional, or through a nested group.
 // Direct matches compare by reference (XmlNameTable optimization).
 public bool CanContain(string name, SgmlDtd dtd)
 {
     // Pass 1: cheap scan for a string member that is the very same instance as name.
     foreach (object member in Members)
     {
         if (member is string && object.ReferenceEquals(member, name))
             return true;
     }

     // Pass 2: costlier search through nested groups and through member elements
     // with optional start tags.
     foreach (object member in Members)
     {
         string memberName = member as string;
         if (memberName == null)
         {
             Group nested = (Group)member;
             if (nested.CanContain(name, dtd))
                 return true;

             continue;
         }

         ElementDecl decl = dtd.FindElement(memberName);
         // With an optional start tag the target may sit inside this member even
         // though the member's own tag is absent.
         if (decl != null && decl.StartTagOptional && decl.CanContain(name, dtd))
             return true;
     }

     return false;
 }
Exemple #11
0
        /// <summary>
        /// Loads the DTD if none is set and DTDs are not being ignored, then refreshes the
        /// root element name and HTML flag from whatever DTD is present.
        /// </summary>
        /// <remarks>
        /// With no system literal, an embedded DTD is chosen by document type ("html" or "ofx").
        /// Otherwise the system literal is resolved against <paramref name="baseUri"/>, then the
        /// reader's own base URI, then the current directory.
        /// </remarks>
        /// <param name="baseUri">Base URI for resolving the system literal; may be null.</param>
        private void LazyLoadDtd(Uri baseUri)
        {
            bool mustLoad = this.m_dtd == null && !this.m_ignoreDtd;
            if (mustLoad)
            {
                if (!string.IsNullOrEmpty(this.m_syslit))
                {
                    Uri root;
                    if (baseUri != null)
                    {
                        root = baseUri;
                    }
                    else if (this.m_baseUri != null)
                    {
                        root = this.m_baseUri;
                    }
                    else
                    {
                        root = new Uri(Directory.GetCurrentDirectory() + "/");
                    }

                    Uri dtdUri = new Uri(root, this.m_syslit);
                    this.m_dtd = SgmlDtd.Parse(dtdUri, this.m_docType, this.m_pubid, dtdUri.AbsoluteUri, this.m_subset, this.m_proxy, null);
                }
                else
                {
                    // Pick the embedded resource suffix and DTD name from the document type.
                    string resourceSuffix = null;
                    string dtdName = null;
                    if (this.m_docType != null)
                    {
                        if (StringUtilities.EqualsIgnoreCase(this.m_docType, "html"))
                        {
                            resourceSuffix = ".Html.dtd";
                            dtdName = "HTML";
                        }
                        else if (StringUtilities.EqualsIgnoreCase(this.m_docType, "ofx"))
                        {
                            resourceSuffix = ".ofx160.dtd";
                            dtdName = "OFX";
                        }
                    }

                    if (resourceSuffix != null)
                    {
                        Assembly owner = typeof(SgmlReader).Assembly;
                        // Resource names are prefixed with the assembly's simple name.
                        string resourceName = owner.FullName.Split(',')[0] + resourceSuffix;
                        Stream resource = owner.GetManifestResourceStream(resourceName);
                        if (resource != null)
                        {
                            StreamReader text = new StreamReader(resource);
                            this.m_dtd = SgmlDtd.Parse(baseUri, dtdName, text, null, this.m_proxy, null);
                        }
                    }
                }
            }

            if (this.m_dtd == null || this.m_dtd.Name == null)
            {
                return;
            }

            // Apply the configured case folding to the DTD name to get the root element name.
            CaseFolding folding = this.CaseFolding;
            if (folding == CaseFolding.ToUpper)
            {
                this.m_rootElementName = this.m_dtd.Name.ToUpperInvariant();
            }
            else if (folding == CaseFolding.ToLower)
            {
                this.m_rootElementName = this.m_dtd.Name.ToLowerInvariant();
            }
            else
            {
                this.m_rootElementName = this.m_dtd.Name;
            }

            this.m_isHtml = StringUtilities.EqualsIgnoreCase(this.m_dtd.Name, "html");
        }
Exemple #12
0
        /// <summary>
        /// Parses the built-in HTML DTD text on first use, then refreshes the root element
        /// name and HTML flag from the current DTD.
        /// </summary>
        /// <param name="baseUri">Base URI passed to the DTD parser.</param>
        private void LazyLoadDtd(Uri baseUri)
        {
            // Parse only when no DTD is present. Re-parsing on every call wasted work
            // and overwrote any DTD that was already set.
            if (this.m_dtd == null)
            {
                var sr = new StringReader(htmldtd);
                this.m_dtd = SgmlDtd.Parse(baseUri, "HTML", sr, null, this.m_proxy, null);
            }

            if (this.m_dtd != null && this.m_dtd.Name != null)
            {
                // Apply the configured case folding to the DTD name to get the root element name.
                switch(this.CaseFolding)
                {
                case CaseFolding.ToUpper:
                    this.m_rootElementName = this.m_dtd.Name.ToUpperInvariant();
                    break;
                case CaseFolding.ToLower:
                    this.m_rootElementName = this.m_dtd.Name.ToLowerInvariant();
                    break;
                default:
                    this.m_rootElementName = this.m_dtd.Name;
                    break;
                }

                this.m_isHtml = StringUtilities.EqualsIgnoreCase(this.m_dtd.Name, "html");
            }
        }
Exemple #13
0
 /// <summary>
 /// Loads the DTD on first use: the embedded HTML DTD when there is no system literal and
 /// the document type is "html", otherwise the DTD named by the system literal.
 /// </summary>
 /// <param name="baseUri">
 /// Base URI passed to the parser for the embedded DTD; replaced by a URI built from the
 /// system literal when one is present.
 /// </param>
 private void LazyLoadDtd(Uri baseUri)
 {
     if (_dtd != null) {
         return; // already loaded
     }

     if (string.IsNullOrEmpty(_syslit)) {
         // Ordinal comparison: the doctype is an identifier, not culture-sensitive text.
         if (string.Equals(_docType, "html", StringComparison.OrdinalIgnoreCase)) {
             Assembly a = typeof(SgmlReader).Assembly;
             string name = a.FullName.Split(',')[0]+".Html.dtd";
             Stream stm = a.GetManifestResourceStream(name);
             // GetManifestResourceStream returns null when the resource is missing;
             // leave the DTD unset instead of failing inside the StreamReader constructor.
             if (stm != null) {
                 StreamReader sr = new StreamReader(stm);
                 _dtd = SgmlDtd.Parse(baseUri, "HTML", null, sr, null, _proxy, _nametable);
             }
         }
     } else  {
         if (_syslit.IndexOf("://", StringComparison.Ordinal)>0) {
             baseUri = new Uri(_syslit);
         }
         else {
             // probably a local filename.
             baseUri = new Uri("file://"+ _syslit.Replace("\\","/"));
         }
         _dtd = SgmlDtd.Parse(baseUri, _docType, _pubid, _syslit, _subset, _proxy, _nametable);
     }
 }
Exemple #14
0
 /// <summary>
 /// Creates a loader and assigns <c>HtmlDtd</c> from the result of <c>LoadHtmlDtd()</c>.
 /// </summary>
 public HtmlLoader()
 {
     HtmlDtd = LoadHtmlDtd();
 }