SetBaseUri() public method

The base URI is used to resolve relative URIs, such as those in the SystemLiteral and Href properties. This is a method rather than a property setter because BaseURI is a read-only property on the base XmlReader class.
public SetBaseUri ( string uri ) : void
uri string
return void
        /// <summary>
        /// Creates an <see cref="XmlReader"/> over an HTML string, using the
        /// "Html.dtd" resource embedded in the assembly that defines SgmlReader.
        /// </summary>
        /// <param name="baseUri">
        /// Absolute URI used as the base for parsing the DTD and as the base URI of the returned reader.
        /// </param>
        /// <param name="html">The HTML markup to read.</param>
        /// <returns>
        /// An SgmlReader that keeps all whitespace, folds names to lower case and reads from <paramref name="html"/>.
        /// </returns>
        /// <exception cref="System.Resources.MissingManifestResourceException">
        /// The embedded "Html.dtd" resource could not be found.
        /// </exception>
        public static XmlReader Create(string baseUri, string html)
        {
            const string name = "Html.dtd";
            var assembly = typeof(SgmlReader).Assembly;
            SgmlDtd dtd;

            using (var resource = assembly.GetManifestResourceStream(name))
            {
                // GetManifestResourceStream returns null (it does not throw) when the
                // resource is missing; fail with a message that names the resource
                // instead of an ArgumentNullException from the StreamReader constructor.
                if (resource == null)
                {
                    throw new System.Resources.MissingManifestResourceException(
                        "Embedded resource '" + name + "' was not found in assembly '" + assembly.FullName + "'.");
                }

                using (var input = new StreamReader(resource))
                {
                    dtd = SgmlDtd.Parse(new Uri(baseUri), "HTML", input, null, null, null);
                }
            }

            var reader = new SgmlReader
            {
                WhitespaceHandling = WhitespaceHandling.All,
                CaseFolding = CaseFolding.ToLower,
                Dtd = dtd,
                IgnoreDtd = true,
                InputStream = new StringReader(html),
            };

            reader.SetBaseUri(baseUri);

            return reader;
        }
Beispiel #2
0
        /// <summary>
        /// Recursively follows the anchor (<c>//a</c>) links of an already loaded
        /// HTML document, loading each not-yet-visited target through an SgmlReader
        /// and crawling it in turn. Links are only followed while the nesting depth
        /// is below 5.
        /// </summary>
        /// <param name="dtd">DTD assigned to every SgmlReader created for a linked page.</param>
        /// <param name="doc">
        /// Document whose links are followed. Relative links are resolved against its
        /// BaseURI, or against the href of <c>/html/head/base</c> when that parses as a URI.
        /// </param>
        /// <param name="log">Receives progress and error messages.</param>
        /// <returns>
        /// <c>false</c> as soon as a nested crawl returns <c>false</c>; otherwise <c>true</c>.
        /// </returns>
        bool Crawl(SgmlDtd dtd, XmlDocument doc, TextWriter log)
        {
            depth++;
            try {
                // One space of log indentation per nesting level.
                string indent = new string(' ', depth);

                count++;
                Uri baseUri = new Uri(doc.BaseURI);
                XmlElement baseElmt = (XmlElement)doc.SelectSingleNode("/html/head/base");
                if (baseElmt != null) {
                    string baseHref = baseElmt.GetAttribute("href");
                    if (baseHref != "") {
                        try {
                            baseUri = new Uri(baseHref);
                        }
                        catch (Exception) {
                            Console.WriteLine("### Error parsing BASE href '"+baseHref+"'");
                        }
                    }
                }

                foreach (XmlElement a in doc.SelectNodes("//a")) {
                    string href = a.GetAttribute("href");
                    if (string.IsNullOrEmpty(href) || depth >= 5)
                        continue;

                    // A single malformed href must not abort the whole crawl.
                    Uri local;
                    if (!Uri.TryCreate(baseUri, href, out local)) {
                        log.WriteLine(indent+"### Error parsing href '"+href+"'");
                        log.Flush();
                        continue;
                    }

                    if (domain && baseUri.Host != local.Host)
                        continue;

                    // Invariant lower-casing so the comparison does not depend on the
                    // current culture (e.g. Turkish dotless i in ".GIF").
                    string ext = Path.GetExtension(local.AbsolutePath).ToLowerInvariant();
                    if (ext == ".jpg" || ext == ".gif" || ext == ".mpg")
                        continue;

                    string url = local.AbsoluteUri;
                    if (visited.ContainsKey(url))
                        continue;

                    visited.Add(url, url);
                    log.WriteLine(indent+"Loading '"+url+"'");
                    log.Flush();

                    StreamReader stm = null;
                    WebResponse resp = null;
                    try {
                        HttpWebRequest wr = (HttpWebRequest)WebRequest.Create(url);
                        wr.Timeout = 10000;
                        if (proxy != null) wr.Proxy = new WebProxy(proxy);
                        wr.PreAuthenticate = false;
                        // Pass the credentials of the process.
                        wr.Credentials = CredentialCache.DefaultCredentials;

                        resp = wr.GetResponse();
                        Uri actual = resp.ResponseUri;
                        if (actual.AbsoluteUri != url) {
                            local = new Uri(actual.AbsoluteUri);
                            log.WriteLine(indent+"Redirected to '"+actual.AbsoluteUri+"'");
                            log.Flush();
                        }

                        // Compare only the media type: servers commonly send
                        // "text/html; charset=utf-8", which an exact string match rejects.
                        string contentType = resp.ContentType ?? "";
                        int semi = contentType.IndexOf(';');
                        string mediaType = (semi >= 0 ? contentType.Substring(0, semi) : contentType).Trim();
                        if (!string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase)) {
                            log.WriteLine(indent+"Skipping ContentType="+resp.ContentType);
                            log.Flush();
                            resp.Close();
                        }
                        else {
                            stm = new StreamReader(resp.GetResponseStream());
                        }
                    }
                    catch (Exception e) {
                        log.WriteLine(indent+"### Error opening URL: " + e.Message);
                        log.Flush();
                        // Release the connection if the failure happened after the response arrived.
                        if (resp != null) resp.Close();
                    }

                    if (stm == null)
                        continue;

                    SgmlReader reader = new SgmlReader();
                    reader.Dtd = dtd;
                    reader.SetBaseUri(local.AbsoluteUri);
                    reader.InputStream = stm;
                    reader.WebProxy = proxy;

                    XmlDocument d2 = new XmlDocument();
                    d2.XmlResolver = null; // don't do any downloads!
                    try {
                        d2.Load(reader);
                        // Close before recursing so the connection is not held open
                        // while deeper levels are crawled.
                        reader.Close();
                        stm.Close();
                        if (!Crawl(dtd, d2, log))
                            return false;
                    }
                    catch (Exception e) {
                        log.WriteLine(indent+"### Error parsing document '"+local.AbsoluteUri+"', "+e.Message);
                        log.Flush();
                    }
                    finally {
                        // Safety net for the failure path; repeated Close calls on the
                        // success path mirror what the original error path already did.
                        reader.Close();
                        stm.Close();
                    }
                }
                return true;
            }
            finally {
                // Restore the nesting level on every exit path, including the early
                // "return false" and any exception.
                depth--;
            }
        }
Beispiel #3
0
        /// <summary>
        /// Runs a test suite file. Test suites are organized into expected
        /// input/output blocks separated by back quotes (`) in the first column.
        /// The rest of the back-quote line that opens an input block is collected as
        /// the arguments for that test; the rest of any other back-quote line is
        /// skipped. Each input/expected-output pair is handed to the per-test
        /// RunTest overload, which runs the input and compares it with the expected output.
        /// </summary>
        /// <param name="reader">Reader under test; its base URI is set to the suite file.</param>
        /// <param name="file">Path of the suite file, resolved against the current directory.</param>
        void RunTest(SgmlReader reader, string file)
        {
            Console.WriteLine(file);
            using (StreamReader sr = new StreamReader(file)) {
                StringBuilder input = new StringBuilder();
                StringBuilder expectedOutput = new StringBuilder();
                StringBuilder current = null;
                StringBuilder args = new StringBuilder();

                // Use the platform separator so the directory URI is well-formed
                // on non-Windows systems too (identical to "\\" on Windows).
                Uri baseUri = new Uri(new Uri(Directory.GetCurrentDirectory() + Path.DirectorySeparatorChar), file);
                reader.SetBaseUri(baseUri.AbsoluteUri);

                int start = 1;
                int line = 1;
                int pos = 1;
                bool skipToEOL = false;
                bool readArgs = false;
                int i;
                // Stop at end of stream *before* processing, so that (char)-1
                // is never appended to the args or to the current block.
                while ((i = sr.Read()) != -1) {
                    char ch = (char)i;
                    if (pos == 1 && ch == '`') {
                        ++pos;
                        if (current == null) {
                            // First separator: start collecting the first input block.
                            current = input;
                            current.Length = 0;
                            readArgs = true;
                        } else if (current == input) {
                            current = expectedOutput;
                        }
                        else {
                            // Separator after an expected-output block: run the finished test.
                            RunTest(reader, start, args.ToString(), input.ToString(), expectedOutput.ToString());
                            start = line;
                            input.Length = 0;
                            args.Length = 0;
                            expectedOutput.Length = 0;
                            current = input;
                            readArgs = true;
                        }
                        skipToEOL = true;
                    } else {
                        ++pos;
                        if (current != null) {
                            if (readArgs) {
                                args.Append(ch);
                            } else if (!skipToEOL) {
                                current.Append(ch);
                            }
                        }
                        if (ch == '\r') {
                            line++; pos = 1;
                            if (sr.Peek() == '\n') {
                                // Consume the LF of a CR/LF pair and route it to the same
                                // buffer as the CR. Text before the first separator is
                                // ignored (current is still null there).
                                char lf = (char)sr.Read();
                                if (current != null) {
                                    if (readArgs) {
                                        args.Append(lf);
                                    } else if (!skipToEOL) {
                                        current.Append(lf);
                                    }
                                }
                            }
                            skipToEOL = false;
                            readArgs = false;
                        } else if (ch == '\n') {
                            skipToEOL = false;
                            readArgs = false;
                            line++; pos = 1;
                        }
                    }
                }

                // Run a trailing test that was not terminated by a separator.
                // current stays null when the file contains no separator at all.
                if (current != null && current.Length>0 && expectedOutput.Length>0) {
                    RunTest(reader, start, args.ToString(), input.ToString(), expectedOutput.ToString());
                }
            }
        }
Beispiel #4
0
 /// <summary>
 /// Converts the entry body into XHTML compliant text and stores the result
 /// back into <c>entry.Body</c>. The reader's base URI is the root URL of the
 /// current blog.
 /// </summary>
 /// <param name="entry">Entry whose body is converted in place.</param>
 /// <returns>
 /// Always <c>true</c>. This method has no failure return path: any problem in
 /// the underlying conversion surfaces as an exception, not as <c>false</c>.
 /// </returns>
 /// <exception cref="ArgumentNullException"><paramref name="entry"/> is null.</exception>
 public static bool ConvertHtmlToXHtml(Entry entry)
 {
     // Fail fast with a descriptive exception instead of a NullReferenceException
     // raised only after the reader and configuration have been touched.
     if (entry == null)
     {
         throw new ArgumentNullException("entry");
     }

     SgmlReader reader = new SgmlReader();
     reader.SetBaseUri(Config.CurrentBlog.RootUrl.ToString());
     entry.Body = ConvertHtmlToXHtml(reader, entry.Body, null);
     return true;
 }
		/// <summary>
		/// Builds an SGML-based <see cref="XmlReader"/> over an HTML string.
		/// </summary>
		/// <param name="html">The HTML markup the reader will consume.</param>
		/// <param name="baseUri">
		/// Optional base URI; it is applied only when it is non-null and its
		/// string form is non-empty.
		/// </param>
		/// <returns>The configured reader with its document type set to "HTML".</returns>
		private static XmlReader GetDocReader(
			string html,
			Uri baseUri )
		{
			SgmlReader reader = new SgmlReader();

			// Evaluate the string form once; a null Uri yields a null string.
			string baseText = baseUri == null ? null : baseUri.ToString();
			if ( !string.IsNullOrEmpty( baseText ) )
			{
				reader.SetBaseUri( baseText );
			}

			reader.DocType = @"HTML";
			reader.InputStream = new StringReader( html );

			return reader;
		}
	/// <summary>
	/// Builds an SGML-based <see cref="XmlReader"/> over an HTML string.
	/// </summary>
	/// <param name="html">The HTML markup the reader will consume.</param>
	/// <param name="baseUrl">
	/// Optional base URL; it is applied only when it is neither null nor empty.
	/// </param>
	/// <returns>The configured reader with its document type set to "HTML".</returns>
	private static XmlReader getDocReader(
		string html,
		string baseUrl )
	{
		var r = new Sgml.SgmlReader();

		// Treat a null base URL like an empty one instead of throwing a
		// NullReferenceException on baseUrl.Length.
		if ( !string.IsNullOrEmpty( baseUrl ) )
		{
			r.SetBaseUri( baseUrl );
		}
		r.DocType = @"HTML";
		r.InputStream = new StringReader( html );

		return r;
	}
Beispiel #7
0
        /// <summary>
        /// Detects URLs in styles.
        /// </summary>
        /// <param name="baseUri">The base URI.</param>
        /// <param name="attributeName">Name of the attribute.</param>
        /// <param name="attributeValue">The attribute value.</param>
        /// <returns></returns>
        //private List<UriResourceInformation> ExtractStyleUrls(
        //    Uri baseUri,
        //    string attributeName,
        //    string attributeValue)
        //{
        //    List<UriResourceInformation> result =
        //        new List<UriResourceInformation>();
        //    if (string.Compare(attributeName, @"style", true) == 0)
        //    {
        //        if (attributeValue != null &&
        //            attributeValue.Trim().Length > 0)
        //        {
        //            MatchCollection matchs = Regex.Matches(
        //                attributeValue,
        //                @"url\s*\(\s*([^\)\s]+)\s*\)",
        //                RegexOptions.Singleline | RegexOptions.IgnoreCase);
        //            if (matchs.Count > 0)
        //            {
        //                foreach (Match match in matchs)
        //                {
        //                    if (match != null && match.Success)
        //                    {
        //                        string url = match.Groups[1].Value;
        //                        UriResourceInformation ui =
        //                            new UriResourceInformation(
        //                            _settings.Options,
        //                            url,
        //                            new Uri(url, UriKind.RelativeOrAbsolute),
        //                            baseUri,
        //                            UriType.Resource,
        //                            _uriInfo.AbsoluteUri,
        //                            );
        //                        bool isOnSameSite =
        //                            ui.IsOnSameSite(baseUri);
        //                        if ((isOnSameSite ||
        //                            !_settings.Options.StayOnSite) &&
        //                            ui.IsProcessableUri)
        //                        {
        //                            result.Add(ui);
        //                        }
        //                    }
        //                }
        //            }
        //        }
        //    }
        //    return result;
        //}
        /// <summary>
        /// Builds an SGML-based <see cref="XmlReader"/> over an HTML string and
        /// performs the first <c>Read()</c> on it before handing it back.
        /// </summary>
        /// <param name="html">The HTML markup the reader will consume.</param>
        /// <param name="baseUri">
        /// Optional base URI; it is applied only when it is non-null and its
        /// string form is non-empty.
        /// </param>
        /// <returns>
        /// The configured reader (document type "HTML", all whitespace kept,
        /// no case folding), already advanced by one <c>Read()</c> call.
        /// </returns>
        private static XmlReader GetDocReader(
            string html,
            Uri baseUri)
        {
            SgmlReader reader = new SgmlReader();

            // Evaluate the string form once; a null Uri yields a null string.
            string baseText = baseUri == null ? null : baseUri.ToString();
            if (!string.IsNullOrEmpty(baseText))
            {
                reader.SetBaseUri(baseText);
            }

            reader.DocType = @"HTML";
            reader.WhitespaceHandling = WhitespaceHandling.All;
            reader.CaseFolding = CaseFolding.None;
            reader.InputStream = new StringReader(html);
            reader.Read();

            return reader;
        }