コード例 #1
0
        /// <summary>
        /// Copy constructor: builds a navigator that shares the document, current node,
        /// attribute index and name table of <paramref name="nav"/>.
        /// </summary>
        /// <param name="nav">The navigator whose state is copied. Must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="nav"/> is null.</exception>
        private HtmlNodeNavigator(HtmlNodeNavigator nav)
        {
            if (null == nav)
                throw new ArgumentNullException("nav");

            InternalTrace(null);

            // References are copied as-is; nothing is cloned.
            _nametable = nav._nametable; // REVIEW: should we do this?
            _doc = nav._doc;

            // Position state: which node, and which attribute index on it.
            _attindex = nav._attindex;
            _currentnode = nav._currentnode;
        }
コード例 #2
0
        /// <summary>
        /// Creates a navigator over <paramref name="doc"/> whose current node is
        /// <paramref name="currentNode"/>.
        /// </summary>
        /// <param name="doc">The document the navigator works on.</param>
        /// <param name="currentNode">The starting node; its owner document must be <paramref name="doc"/>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="currentNode"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="currentNode"/> is not owned by <paramref name="doc"/>.</exception>
        internal HtmlNodeNavigator(HtmlDocument doc, HtmlNode currentNode)
        {
            if (currentNode == null)
                throw new ArgumentNullException("currentNode");

            bool ownedByDoc = currentNode.OwnerDocument == doc;
            if (!ownedByDoc)
                throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);

            InternalTrace(null);

            _doc = doc;

            // Reset() runs first; the explicit assignment afterwards then fixes the starting node.
            Reset();
            _currentnode = currentNode;
        }
コード例 #3
0
        /// <summary>
        /// Begins the process of downloading an internet resource
        /// </summary>
        /// <param name="uri">Url to the html document</param>
        /// <param name="encoding">The encoding to use while downloading the document; when null the body is read with <c>ReadAsStringAsync</c></param>
        /// <param name="credentials">The credentials to use for authenticating the web request; when null the default credentials are used</param>
        /// <returns>The document loaded from the downloaded HTML.</returns>
        /// <exception cref="Exception">The response status code was not 200 (OK).</exception>
        public async Task<HtmlDocument> LoadFromWebAsync(Uri uri, Encoding encoding, NetworkCredential credentials)
        {
            // Handler, client and response all hold network resources, so each one is
            // disposed as soon as the HTML text has been read.
            using (var clientHandler = new HttpClientHandler())
            {
                if (credentials == null)
                    clientHandler.UseDefaultCredentials = true;
                else
                    clientHandler.Credentials = credentials;

                using (var client = new HttpClient(clientHandler))
                using (var response = await client.GetAsync(uri))
                {
                    if (response.StatusCode != HttpStatusCode.OK)
                        throw new Exception("Error downloading html");

                    string html;
                    if (encoding != null)
                    {
                        using (var sr = new StreamReader(await response.Content.ReadAsStreamAsync(), encoding))
                        {
                            // Read asynchronously so the calling thread is not blocked on the network stream.
                            html = await sr.ReadToEndAsync();
                        }
                    }
                    else
                    {
                        html = await response.Content.ReadAsStringAsync();
                    }

                    var doc = new HtmlDocument();
                    // The hook runs before the HTML is loaded into the document.
                    if (PreHandleDocument != null)
                        PreHandleDocument(doc);
                    doc.LoadHtml(html);
                    return doc;
                }
            }
        }
コード例 #4
0
 /// <summary>
 /// Requests <paramref name="uri"/> through <c>Get</c> and returns the resulting document.
 /// The status code of the request is stored in <c>_statusCode</c>.
 /// </summary>
 /// <param name="uri">The resource to request.</param>
 /// <param name="method">The HTTP method passed on to <c>Get</c>.</param>
 /// <param name="proxy">Proxy passed on to <c>Get</c>; may be null.</param>
 /// <param name="creds">Credentials passed on to <c>Get</c>; may be null.</param>
 /// <returns>The document that was handed to <c>Get</c>, loaded from the cache file when the status is NotModified.</returns>
 private HtmlDocument LoadUrl(Uri uri, string method, IWebProxy proxy, ICredentials creds)
 {
     var document = new HtmlDocument
     {
         OptionAutoCloseOnEnd = false,
         OptionFixNestedTags = true
     };

     // No target file path is supplied (third argument is null).
     _statusCode = Get(uri, method, null, document, proxy, creds);

     bool notModified = _statusCode == HttpStatusCode.NotModified;
     if (notModified)
     {
         // read cached encoding
         document.DetectEncodingAndLoad(GetCachePath(uri));
     }

     return document;
 }
コード例 #5
0
        /// <summary>
        /// Sends an HTTP request for <paramref name="uri"/> and, depending on the cache settings,
        /// saves the response to the cache, copies it to <paramref name="path"/> and/or loads it
        /// into <paramref name="doc"/>.
        /// </summary>
        /// <param name="uri">The resource to request.</param>
        /// <param name="method">The HTTP method name.</param>
        /// <param name="path">Optional file that receives a copy of the cached content; null to skip the copy.</param>
        /// <param name="doc">Optional document loaded from the response stream when the cache is not used and the content is HTML.</param>
        /// <param name="proxy">Optional proxy; when set, credentials are applied to both the proxy and the handler.</param>
        /// <param name="creds">Credentials used with the proxy; when null the default credentials are used.</param>
        /// <returns>
        /// The response status code; <c>NotModified</c> when the cached copy is used;
        /// <c>ResetContent</c> when the <c>PreRequest</c> callback returns false.
        /// </returns>
        /// <exception cref="HtmlWebException">
        /// Cache-only mode is on and the cache file is missing, or the server answered NotModified while the cache is disabled.
        /// </exception>
        private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc, IWebProxy proxy,
                                   ICredentials creds)
        {
            string cachePath = null;
            bool oldFile = false;
            HttpStatusCode status;
            using (var request = new HttpRequestMessage(new HttpMethod(method), uri))
            using (var handler = new HttpClientHandler())
            using (var client = new HttpClient(handler))
            {
                client.DefaultRequestHeaders.Add("User-Agent", UserAgent);

                if (proxy != null)
                {
                    if (creds != null)
                    {
                        proxy.Credentials = creds;
                        handler.Credentials = creds;
                    }
                    else
                    {
                        proxy.Credentials = CredentialCache.DefaultCredentials;
                        handler.Credentials = CredentialCache.DefaultCredentials;
                    }
                    handler.Proxy = proxy;
                    handler.UseProxy = true;
                }

                _fromCache = false;
                _requestDuration = 0;
                int tc = Environment.TickCount;
                if (UsingCache)
                {
                    cachePath = GetCachePath(request.RequestUri);
                    if (File.Exists(cachePath))
                    {
                        // Make the request conditional on the existing cache file.
                        client.DefaultRequestHeaders.IfModifiedSince = File.GetLastAccessTime(cachePath);
                        oldFile = true;
                    }
                }

                if (_cacheOnly)
                {
                    if (!File.Exists(cachePath))
                    {
                        throw new HtmlWebException("File was not found at cache path: '" + cachePath + "'");
                    }

                    if (path != null)
                    {
                        IOLibrary.CopyAlways(cachePath, path);
                        // touch the file
                        if (cachePath != null) File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                    }
                    _fromCache = true;
                    return HttpStatusCode.NotModified;
                }

                if (_useCookies)
                {
                    handler.CookieContainer = new CookieContainer();
                }

                if (PreRequest != null)
                {
                    // allow our user to change the request at will
                    if (!PreRequest(handler, request))
                    {
                        return HttpStatusCode.ResetContent;
                    }
                }

                HttpResponseMessage response;
                try
                {
                    response = client.SendAsync(request).Result;
                }
                catch (Exception ex)
                {
                    _requestDuration = Environment.TickCount - tc;

                    // Task.Result reports failures wrapped in an AggregateException, so a plain
                    // catch (HttpRequestException) would never see a failed request. Look at the
                    // inner exception too; the exception that propagates to callers is unchanged.
                    var aggregate = ex as AggregateException;
                    bool requestFailed = ex is HttpRequestException
                                         || (aggregate != null && aggregate.InnerException is HttpRequestException);
                    if (requestFailed && oldFile)
                    {
                        // The request failed but a cached copy exists: fall back to it.
                        if (path != null)
                        {
                            IOLibrary.CopyAlways(cachePath, path);
                            // touch the file
                            File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                        }
                        return HttpStatusCode.NotModified;
                    }
                    throw;
                }

                // Dispose the response (and its content stream) on every exit path.
                using (response)
                {
                    // allow our user to get some info from the response
                    if (PostResponse != null)
                    {
                        PostResponse(request, response);
                    }

                    _requestDuration = Environment.TickCount - tc;
                    _responseUri = response.RequestMessage.RequestUri;

                    // ContentType is null when the response has no Content-Type header; pass an
                    // empty media type in that case instead of dereferencing null.
                    var contentType = response.Content.Headers.ContentType;
                    bool html = IsHtmlContent(contentType != null ? contentType.MediaType : string.Empty);

                    // NOTE(review): Content-Encoding normally carries content codings such as gzip or
                    // deflate rather than a charset name, and Encoding.GetEncoding throws
                    // ArgumentException for names it does not know. Confirm whether the charset of
                    // the Content-Type header was intended here.
                    var encoding = response.Content.Headers.ContentEncoding.FirstOrDefault();
                    Encoding respenc = !string.IsNullOrEmpty(encoding)
                                           ? Encoding.GetEncoding(encoding)
                                           : null;

                    if (response.StatusCode == HttpStatusCode.NotModified)
                    {
                        if (UsingCache)
                        {
                            _fromCache = true;
                            if (path != null)
                            {
                                IOLibrary.CopyAlways(cachePath, path);
                                // touch the file
                                File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                            }
                            return response.StatusCode;
                        }
                        // this should *never* happen...
                        throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
                    }

                    Stream s = response.Content.ReadAsStreamAsync().Result;
                    if (s != null)
                    {
                        if (UsingCache)
                        {
                            // NOTE: LastModified does not contain milliseconds, so we remove them to the file
                            SaveStream(s, cachePath, RemoveMilliseconds(response.Content.Headers.LastModified), _streamBufferSize);

                            // save headers
                            SaveCacheHeaders(request.RequestUri, response);

                            if (path != null)
                            {
                                // copy and touch the file
                                IOLibrary.CopyAlways(cachePath, path);
                                File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                            }
                        }
                        else
                        {
                            // try to work in-memory
                            if ((doc != null) && (html))
                            {
                                if (respenc != null)
                                {
                                    doc.Load(s, respenc);
                                }
                                else
                                {
                                    doc.Load(s, true);
                                }
                            }
                        }
                    }
                    status = response.StatusCode;
                }
            }

            return status;
        }
コード例 #6
0
        /// <summary>
        /// Sends an HTTP request for <paramref name="uri"/> and, depending on the cache settings,
        /// saves the response to the cache, copies it to <paramref name="path"/> and/or loads it
        /// into <paramref name="doc"/>.
        /// </summary>
        /// <param name="uri">The resource to request.</param>
        /// <param name="method">The HTTP method name.</param>
        /// <param name="path">Optional file that receives a copy of the cached content; null to skip the copy.</param>
        /// <param name="doc">Optional document loaded from the response stream when the cache is not used and the content is HTML.</param>
        /// <param name="proxy">Optional proxy; when set, credentials are applied to both the proxy and the request.</param>
        /// <param name="creds">Credentials used with the proxy; when null the default credentials are used.</param>
        /// <returns>
        /// The response status code; <c>NotModified</c> when the cached copy is used;
        /// <c>ResetContent</c> when the <c>PreRequest</c> callback returns false.
        /// </returns>
        /// <exception cref="HtmlWebException">
        /// The URI does not produce an HTTP request, cache-only mode is on and the cache file is missing,
        /// or the server answered NotModified while the cache is disabled.
        /// </exception>
        private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc, IWebProxy proxy,
                                   ICredentials creds)
        {
            string cachePath = null;
            bool oldFile = false;

            // WebRequest.Create returns a non-HTTP request type for schemes such as file://, which
            // makes the cast yield null. Fail with a descriptive error rather than a
            // NullReferenceException on the next line.
            HttpWebRequest req = WebRequest.Create(uri) as HttpWebRequest;
            if (req == null)
            {
                throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
            }

            req.Method = method;
            req.UserAgent = UserAgent;
            if (proxy != null)
            {
                if (creds != null)
                {
                    proxy.Credentials = creds;
                    req.Credentials = creds;
                }
                else
                {
                    proxy.Credentials = CredentialCache.DefaultCredentials;
                    req.Credentials = CredentialCache.DefaultCredentials;
                }
                req.Proxy = proxy;
            }

            _fromCache = false;
            _requestDuration = 0;
            int tc = Environment.TickCount;
            if (UsingCache)
            {
                cachePath = GetCachePath(req.RequestUri);
                if (File.Exists(cachePath))
                {
                    // Make the request conditional on the existing cache file.
                    req.IfModifiedSince = File.GetLastAccessTime(cachePath);
                    oldFile = true;
                }
            }

            if (_cacheOnly)
            {
                if (!File.Exists(cachePath))
                {
                    throw new HtmlWebException("File was not found at cache path: '" + cachePath + "'");
                }

                if (path != null)
                {
                    IOLibrary.CopyAlways(cachePath, path);
                    // touch the file
                    if (cachePath != null) File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                }
                _fromCache = true;
                return HttpStatusCode.NotModified;
            }

            if (_useCookies)
            {
                req.CookieContainer = new CookieContainer();
            }

            if (PreRequest != null)
            {
                // allow our user to change the request at will
                if (!PreRequest(req))
                {
                    return HttpStatusCode.ResetContent;
                }
            }

            HttpWebResponse resp;

            try
            {
                resp = req.GetResponse() as HttpWebResponse;
            }
            catch (WebException we)
            {
                _requestDuration = Environment.TickCount - tc;
                // When the exception carries a response, processing continues with it below.
                resp = (HttpWebResponse)we.Response;
                if (resp == null)
                {
                    if (oldFile)
                    {
                        // No response at all, but a cached copy exists: fall back to it.
                        if (path != null)
                        {
                            IOLibrary.CopyAlways(cachePath, path);
                            // touch the file
                            File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                        }
                        return HttpStatusCode.NotModified;
                    }
                    throw;
                }
            }
            catch (Exception)
            {
                _requestDuration = Environment.TickCount - tc;
                throw;
            }

            // Release the response on every exit path (early return, exception, missing stream),
            // not only after the stream has been consumed.
            using (resp)
            {
                // allow our user to get some info from the response
                if (PostResponse != null)
                {
                    PostResponse(req, resp);
                }

                _requestDuration = Environment.TickCount - tc;
                _responseUri = resp.ResponseUri;

                bool html = IsHtmlContent(resp.ContentType);

                // NOTE(review): ContentEncoding reflects the Content-Encoding header, which normally
                // carries content codings such as gzip rather than a charset name, and
                // Encoding.GetEncoding throws ArgumentException for names it does not know.
                // Confirm whether the response charset was intended here.
                Encoding respenc = !string.IsNullOrEmpty(resp.ContentEncoding)
                                       ? Encoding.GetEncoding(resp.ContentEncoding)
                                       : null;
                if (OverrideEncoding != null)
                    respenc = OverrideEncoding;

                // Read the status while the response is still open; it is the value returned below.
                HttpStatusCode status = resp.StatusCode;

                if (status == HttpStatusCode.NotModified)
                {
                    if (UsingCache)
                    {
                        _fromCache = true;
                        if (path != null)
                        {
                            IOLibrary.CopyAlways(cachePath, path);
                            // touch the file
                            File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                        }
                        return status;
                    }
                    // this should *never* happen...
                    throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
                }

                Stream s = resp.GetResponseStream();
                if (s != null)
                {
                    if (UsingCache)
                    {
                        // NOTE: LastModified does not contain milliseconds, so we remove them to the file
                        SaveStream(s, cachePath, RemoveMilliseconds(resp.LastModified), _streamBufferSize);

                        // save headers
                        SaveCacheHeaders(req.RequestUri, resp);

                        if (path != null)
                        {
                            // copy and touch the file
                            IOLibrary.CopyAlways(cachePath, path);
                            File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                        }
                    }
                    else
                    {
                        // try to work in-memory
                        if (doc != null && html)
                        {
                            if (respenc == null)
                            {
                                doc.Load(s, true);
                            }
                            else
                            {
                                doc.Load(s, respenc);
                            }
                        }
                    }
                }

                return status;
            }
        }
コード例 #7
0
        /// <summary>
        /// Loads an HTML document from an Internet resource.
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
        /// <param name="proxy">Proxy to use with this request</param>
        /// <param name="credentials">Credentials to use when authenticating</param>
        /// <returns>A new HTML document.</returns>
        /// <exception cref="HtmlWebException">The URL scheme is not http, https or file.</exception>
        public HtmlDocument Load(string url, string method, IWebProxy proxy, ICredentials credentials)
        {
            Uri uri = new Uri(url);
            HtmlDocument doc;

            // http/https go through the web request path (with proxy and credentials);
            // file URLs are loaded directly in the else branch below.
#if NET20 || NET40 || NET451
            if ((uri.Scheme == Uri.UriSchemeHttps) ||
                (uri.Scheme == Uri.UriSchemeHttp))
#else
            // Uri.UriSchemeHttps is internal. Bug or not completed?
            if ((uri.Scheme == "https") ||
                (uri.Scheme == "http"))
#endif
            {
                doc = LoadUrl(uri, method, proxy, credentials);
            }
            else
            {
#if NET20 || NET40 || NET451
                if (uri.Scheme == Uri.UriSchemeFile)
#else
                // Uri.UriScheme* is internal. Bug or not completed?
                if (uri.Scheme == "file")
#endif
                {
                    doc = new HtmlDocument();
                    // NOTE(review): the same option is assigned twice, so only the second value (true)
                    // takes effect; confirm whether a different option was meant on one of these lines.
                    doc.OptionAutoCloseOnEnd = false;
                    doc.OptionAutoCloseOnEnd = true;
                    doc.DetectEncodingAndLoad(url, _autoDetectEncoding);
                }
                else
                {
                    throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
                }
            }
            if (PreHandleDocument != null)
            {
                PreHandleDocument(doc);
            }
            return doc;
        }
コード例 #8
0
        /// <summary>
        /// Loads an HTML document from an Internet resource or from a file URL.
        /// </summary>
        /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
        /// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
        /// <returns>A new HTML document.</returns>
        /// <exception cref="HtmlWebException">The URL scheme is not http, https or file.</exception>
        public HtmlDocument Load(string url, string method)
        {
            var target = new Uri(url);
            string scheme = target.Scheme;

#if NET20 || NET40 || NET451
            bool isWeb = (scheme == Uri.UriSchemeHttps) || (scheme == Uri.UriSchemeHttp);
            bool isFile = scheme == Uri.UriSchemeFile;
#else
            // Uri.UriScheme* is internal. Bug or not completed?
            bool isWeb = (scheme == "https") || (scheme == "http");
            bool isFile = scheme == "file";
#endif

            HtmlDocument result;
            if (isWeb)
            {
                // Web request without proxy or credentials.
                result = LoadUrl(target, method, null, null);
            }
            else if (isFile)
            {
                result = new HtmlDocument();
                // NOTE(review): the same option is assigned twice; only the second value (true) sticks.
                result.OptionAutoCloseOnEnd = false;
                result.OptionAutoCloseOnEnd = true;

                // An explicit override wins over encoding detection.
                if (OverrideEncoding == null)
                {
                    result.DetectEncodingAndLoad(url, _autoDetectEncoding);
                }
                else
                {
                    result.Load(url, OverrideEncoding);
                }
            }
            else
            {
                throw new HtmlWebException("Unsupported uri scheme: '" + scheme + "'.");
            }

            if (PreHandleDocument != null)
            {
                PreHandleDocument(result);
            }

            return result;
        }
コード例 #9
0
 /// <summary>
 /// Creates an attribute and records the document that owns it.
 /// </summary>
 /// <param name="ownerdocument">The document stored as this attribute's owner.</param>
 internal HtmlAttribute(HtmlDocument ownerdocument)
 {
     this._ownerdocument = ownerdocument;
 }
コード例 #10
0
        /// <summary>
        /// Helper function that parses text into an HTML document and HTML-encodes the
        /// contents of every <c>code</c> element.
        /// </summary>
        /// <param name="source">The HTML text; null is treated as an empty string.</param>
        /// <returns>The parsed document with its <c>code</c> elements processed.</returns>
        private static HtmlDocument GetHtml(string source)
        {
            var document = new HtmlDocument
            {
                OptionFixNestedTags = true,
                OptionAutoCloseOnEnd = true,
                OptionDefaultStreamEncoding = Encoding.UTF8
            };

            document.LoadHtml(source ?? "");

            // Encode any code blocks independently so they won't
            // be stripped out completely when we do a final cleanup
            foreach (var node in document.DocumentNode.DescendantsAndSelf())
            {
                if (node.Name != "code")
                {
                    continue;
                }

                //** Code tag attribute vulnerability fix 28-9-12 (thanks to Natd)
                // Iterate over a copy of the attributes, since entries are removed inside the loop.
                foreach (HtmlAttribute attribute in node.Attributes.ToArray())
                {
                    bool keep = attribute.Name == "style" || attribute.Name == "class";
                    if (!keep)
                    {
                        attribute.Remove();
                    }
                } //** End fix

                // Decode first, then encode, so the inner markup ends up encoded once.
                string decoded = System.Net.WebUtility.HtmlDecode(node.InnerHtml);
                node.InnerHtml = System.Net.WebUtility.HtmlEncode(decoded);
            }

            return document;
        }
コード例 #11
0
 /// <summary>
 /// Builds an HTML node by parsing a literal HTML string.
 /// </summary>
 /// <param name="html">The HTML text.</param>
 /// <returns>The first child of the document node produced by loading <paramref name="html"/>.</returns>
 public static HtmlNode CreateNode(string html)
 {
     // REVIEW: this is *not* optimum...
     // A throwaway document is created only to parse the fragment.
     var scratch = new HtmlDocument();
     scratch.LoadHtml(html);

     HtmlNode root = scratch.DocumentNode;
     return root.FirstChild;
 }
コード例 #12
0
        /// <summary>
        /// Initializes HtmlNode, providing type, owner and where it exists in a collection
        /// </summary>
        /// <param name="type">The node type; comment, document and text nodes get a fixed name and are their own end node.</param>
        /// <param name="ownerdocument">The document that owns the node.</param>
        /// <param name="index">The outer start index of the node; -1 is treated specially (see the comments in the body).</param>
        public HtmlNode(HtmlNodeType type, HtmlDocument ownerdocument, int index)
        {
            _nodetype = type;
            _ownerdocument = ownerdocument;
            _outerstartindex = index;

            bool isComment = type == HtmlNodeType.Comment;
            bool isText = type == HtmlNodeType.Text;

            // Comment, document and text nodes have a predefined name and act as their own end node.
            if (isComment)
            {
                Name = HtmlNodeTypeNameComment;
                _endnode = this;
            }
            else if (type == HtmlNodeType.Document)
            {
                Name = HtmlNodeTypeNameDocument;
                _endnode = this;
            }
            else if (isText)
            {
                Name = HtmlNodeTypeNameText;
                _endnode = this;
            }

            // Register nodes that are not closed in the owner document, keyed by index.
            // -1 means the node comes from public, so such nodes are not registered.
            if (_ownerdocument.Openednodes != null && !Closed && index != -1)
            {
                _ownerdocument.Openednodes.Add(index, this);
            }

            // innerhtml and outerhtml must be calculated, but only for nodes with index -1
            // that are neither comments nor text.
            bool mustRecalculate = index == -1 && !isComment && !isText;
            if (mustRecalculate)
            {
                SetChanged();
            }
        }