/// <summary>
/// Initializes HtmlNode, providing type, owner and where it exists in a collection
/// </summary>
/// <param name="type">The kind of node to create. Comment, Document and Text nodes get a fixed name and are their own end node.</param>
/// <param name="ownerdocument">The document that owns the node. When null, the node is simply not registered as an opened node.</param>
/// <param name="index">Stored as the outer start index of the node; -1 means the node comes from public code rather than from parsing.</param>
public HtmlNode(HtmlNodeType type, HtmlDocument ownerdocument, int index)
{
    _nodetype = type;
    _ownerdocument = ownerdocument;
    _outerstartindex = index;

    switch (type)
    {
        case HtmlNodeType.Comment:
            Name = HtmlNodeTypeNameComment;
            _endnode = this;
            break;

        case HtmlNodeType.Document:
            Name = HtmlNodeTypeNameDocument;
            _endnode = this;
            break;

        case HtmlNodeType.Text:
            Name = HtmlNodeTypeNameText;
            _endnode = this;
            break;
    }

    // Register the node as opened, using the index as the key.
    // The owner document is checked for null first so that a node created
    // without a document does not fail with a NullReferenceException.
    // -1 means the node comes from public code, so there is no index to register.
    if ((_ownerdocument != null)
        && (_ownerdocument._openednodes != null)
        && !Closed
        && (-1 != index))
    {
        _ownerdocument._openednodes.Add(index, this);
    }

    if ((-1 != index) || (type == HtmlNodeType.Comment) || (type == HtmlNodeType.Text))
    {
        return;
    }

    // innerhtml and outerhtml must be calculated
    _outerchanged = true;
    _innerchanged = true;
}
/// <summary>
/// Builds an HTML node by parsing a string of literal HTML.
/// </summary>
/// <param name="html">The HTML text.</param>
/// <returns>The first child of the document node obtained by loading <paramref name="html"/>.</returns>
public static HtmlNode CreateNode(string html)
{
    // REVIEW: a throw-away document is loaded on every call, which is *not* optimum...
    HtmlDocument scratch = new HtmlDocument();
    scratch.LoadHtml(html);

    return scratch.DocumentNode.FirstChild;
}
/// <summary>
/// Requests <paramref name="uri"/> through Get and returns the resulting document.
/// The status code of the request is stored in _statusCode.
/// </summary>
/// <param name="uri">The resource to request.</param>
/// <param name="method">The HTTP method passed on to Get.</param>
/// <param name="proxy">Proxy passed on to Get.</param>
/// <param name="creds">Credentials passed on to Get.</param>
/// <returns>The document handed to Get; when the status is NotModified it is loaded from the cache path instead.</returns>
private HtmlDocument LoadUrl(Uri uri, string method, WebProxy proxy, NetworkCredential creds)
{
    HtmlDocument document = new HtmlDocument();
    document.OptionFixNestedTags = true;
    document.OptionAutoCloseOnEnd = false;

    _statusCode = Get(uri, method, null, document, proxy, creds);
    if (_statusCode != HttpStatusCode.NotModified)
    {
        return document;
    }

    // read cached encoding
    document.DetectEncodingAndLoad(GetCachePath(uri));
    return document;
}
/// <summary>
/// Transforms Raw into Processed. ASP tags are replaced by placeholder comments,
/// the condition and foreach tags are stripped, the remaining HTML is loaded into
/// a document, ASP wrapper blocks are inserted around (or inside) the elements the
/// condition and foreach tags refer to, the gasp:id attributes are removed and
/// finally the original ASP code is put back in place of the placeholders.
/// </summary>
public void Parse()
{
    contentHolderId = -1;
    AspContent = new Dictionary<int, string>();

    // Extract tags
    AspTags = AspTagsStripRegex.Matches(Raw);
    GaspConditions = GaspXPConditionRegex.Matches(Raw);
    GaspForeaches = GaspXPForeachRegex.Matches(Raw);

    // Preprocess the HTML
    // Strip the <% asp code %> (replace with a placeholder)
    Processed = AspTagsStripRegex.Replace(Raw, me =>
    {
        AspContent.Add(++contentHolderId, me.Value);
        return "<!-- GaspXP[[" + contentHolderId + "]] -->";
    });

    // Strip the <condition></condition> tags
    Processed = GaspXPConditionRegex.Replace(Processed, me => "");

    // Strip the <foreach></foreach> tags
    Processed = GaspXPForeachRegex.Replace(Processed, me => "");

    var doc = new HtmlDocument();
    doc.OptionWriteEmptyNodes = true;
    doc.OptionOutputOriginalCase = true;
    doc.OptionAutoCloseOnEnd = true;
    // todo OptionOutputOriginalCase => doesnt seem to work for attributes! (not all?)
    doc.LoadHtml(Processed);

    // Loop through all conditions
    foreach (Match condition in GaspConditions)
    {
        foreach (Match tag in AttributesRegex.Matches(condition.Groups[0].Value))
        {
            if (tag.Groups[1].Value != "for")
            {
                continue;
            }

            var elementId = tag.Groups[2].Value;

            // find the element by its gasp:id attribute
            // (the result is null-checked: it is null when nothing matches)
            var gaspNodes = doc.DocumentNode.SelectNodes("//*", GaspNamespace);
            if (gaspNodes != null)
            {
                foreach (var n in gaspNodes)
                {
                    if (HasGaspId(n, elementId))
                    {
                        WrapInIfBlock(n, condition.Groups[2].Value);
                    }
                }
            }

            // find the element by its regular id
            var idNodes = doc.DocumentNode.SelectNodes("id('" + elementId + "')");
            if (idNodes != null)
            {
                foreach (var n in idNodes)
                {
                    WrapInIfBlock(n, condition.Groups[2].Value);
                }
            }
        }
    }

    // Loop through all foreaches
    foreach (Match condition in GaspForeaches)
    {
        foreach (Match tag in AttributesRegex.Matches(condition.Groups[0].Value))
        {
            if (tag.Groups[1].Value != "for")
            {
                continue;
            }

            var elementId = tag.Groups[2].Value;

            // name of the loop variable; "item" unless a 'key' attribute says otherwise
            string key = "item";
            foreach (Match keyTag in AttributesRegex.Matches(condition.Groups[0].Value))
            {
                if (keyTag.Groups[1].Value == "key")
                {
                    key = keyTag.Groups[2].Value;
                    break;
                }
            }

            // find the element (first search on 'gaspid')
            // allows to be applied to multiple elements at once!
            // NOTE(review): these matches are wrapped in an "if" block rather than a
            // "foreach" block - looks like a copy of the condition handling; confirm intent.
            var gaspNodes = doc.DocumentNode.SelectNodes("//*", GaspNamespace);
            if (gaspNodes != null)
            {
                foreach (var n in gaspNodes)
                {
                    if (HasGaspId(n, elementId))
                    {
                        WrapInIfBlock(n, condition.Groups[2].Value);
                    }
                }
            }

            var idNodes = doc.DocumentNode.SelectNodes("id('" + elementId + "')");
            if (idNodes != null)
            {
                foreach (var n in idNodes)
                {
                    // the loop is opened before the first child and closed after the last child
                    n.InsertBefore(HtmlNode.CreateNode("<% foreach( var " + key + " in (" + condition.Groups[2].Value + ")){%>"), n.FirstChild);
                    n.InsertAfter(HtmlNode.CreateNode("<% } %>"), n.LastChild);

                    // only the first matching element is handled
                    break;
                }
            }

            // only the first 'for' attribute of a foreach tag is handled
            break;
        }
    }

    // cleanup gaspid's (iterating over a copy of the node list)
    var taggedNodes = doc.DocumentNode.SelectNodes("//*", GaspNamespace);
    if (taggedNodes != null)
    {
        foreach (var n in new List<HtmlNode>(taggedNodes))
        {
            n.Attributes.Remove("gasp:id");
        }
    }

    /* return the asp code back into the doc */
    Processed = GaspXPContentRegex.Replace(doc.DocumentNode.OuterHtml, m => AspContent[int.Parse(m.Groups[1].Value)]);
}

/// <summary>
/// Tells whether the node carries a gasp:id attribute whose value equals <paramref name="elementId"/>.
/// </summary>
private static bool HasGaspId(HtmlNode node, string elementId)
{
    foreach (var a in node.Attributes)
    {
        if (a.OriginalName == "gasp:id" && a.Value == elementId)
        {
            return true;
        }
    }

    return false;
}

/// <summary>
/// Inserts an ASP "if" opening block before the node and the closing block after it, as siblings of the node.
/// </summary>
private static void WrapInIfBlock(HtmlNode node, string expression)
{
    node.ParentNode.InsertBefore(HtmlNode.CreateNode("<% if(" + expression + "){%>"), node);
    node.ParentNode.InsertAfter(HtmlNode.CreateNode("<% } %>"), node);
}
/// <summary>
/// Sends the request for <paramref name="uri"/> and delivers the result to the cache,
/// to a file and/or to a document, depending on the cache settings and the arguments.
/// </summary>
/// <param name="uri">The resource to request.</param>
/// <param name="method">The HTTP method assigned to the request.</param>
/// <param name="path">When not null, a file path that receives a copy of the cached file.</param>
/// <param name="doc">When not null, the cache is not used and the content is HTML, the document is loaded from the response stream.</param>
/// <param name="proxy">When not null, the proxy assigned to the request; credentials are then applied to both the proxy and the request.</param>
/// <param name="creds">Credentials used when a proxy is supplied; the default credentials are used when null.</param>
/// <returns>
/// The status code of the response. NotModified is returned when the content is served from the cache,
/// ResetContent when the PreRequest handler rejects the request.
/// </returns>
/// <exception cref="HtmlWebException">
/// The uri does not produce an HTTP request, the file is missing from the cache in cache-only mode,
/// or the server answered NotModified while the cache is not in use.
/// </exception>
private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc, IWebProxy proxy, ICredentials creds)
{
    string cachePath = null;
    bool oldFile = false;

    HttpWebRequest req = WebRequest.Create(uri) as HttpWebRequest;
    if (req == null)
    {
        // a non-HTTP request would otherwise fail below with a NullReferenceException
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
    }

    req.Method = method;
    req.UserAgent = UserAgent;

    if (proxy != null)
    {
        if (creds != null)
        {
            proxy.Credentials = creds;
            req.Credentials = creds;
        }
        else
        {
            proxy.Credentials = CredentialCache.DefaultCredentials;
            req.Credentials = CredentialCache.DefaultCredentials;
        }

        req.Proxy = proxy;
    }

    _fromCache = false;
    _requestDuration = 0;
    int tc = Environment.TickCount;

    if (UsingCache)
    {
        cachePath = GetCachePath(req.RequestUri);
        if (File.Exists(cachePath))
        {
            req.IfModifiedSince = File.GetLastAccessTime(cachePath);
            oldFile = true;
        }
    }

    if (_cacheOnly)
    {
        if (!File.Exists(cachePath))
        {
            throw new HtmlWebException("File was not found at cache path: '" + cachePath + "'");
        }

        if (path != null)
        {
            IOLibrary.CopyAlways(cachePath, path);
            // touch the file
            File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
        }

        _fromCache = true;
        return HttpStatusCode.NotModified;
    }

    if (_useCookies)
    {
        req.CookieContainer = new CookieContainer();
    }

    if (PreRequest != null)
    {
        // allow our user to change the request at will
        if (!PreRequest(req))
        {
            return HttpStatusCode.ResetContent;
        }
    }

    HttpWebResponse resp;
    try
    {
        resp = req.GetResponse() as HttpWebResponse;
    }
    catch (WebException we)
    {
        _requestDuration = Environment.TickCount - tc;
        resp = (HttpWebResponse)we.Response;
        if (resp == null)
        {
            // no response at all: fall back to the cached file when there is one
            if (oldFile)
            {
                if (path != null)
                {
                    IOLibrary.CopyAlways(cachePath, path);
                    // touch the file
                    File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                }

                return HttpStatusCode.NotModified;
            }

            throw;
        }
    }
    catch (Exception)
    {
        _requestDuration = Environment.TickCount - tc;
        throw;
    }

    // From here on the response is always closed, whichever way the method is left.
    try
    {
        // allow our user to get some info from the response
        if (PostResponse != null)
        {
            PostResponse(req, resp);
        }

        _requestDuration = Environment.TickCount - tc;
        _responseUri = resp.ResponseUri;

        bool html = IsHtmlContent(resp.ContentType);

        // NOTE(review): ContentEncoding is passed to Encoding.GetEncoding as if it were a
        // character set name - confirm this is intended for values such as "gzip".
        Encoding respenc = !string.IsNullOrEmpty(resp.ContentEncoding)
            ? Encoding.GetEncoding(resp.ContentEncoding)
            : null;

        // captured before the response is closed
        HttpStatusCode status = resp.StatusCode;

        if (status == HttpStatusCode.NotModified)
        {
            if (UsingCache)
            {
                _fromCache = true;
                if (path != null)
                {
                    IOLibrary.CopyAlways(cachePath, path);
                    // touch the file
                    File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                }

                return status;
            }

            // this should *never* happen...
            throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
        }

        Stream s = resp.GetResponseStream();
        if (s != null)
        {
            if (UsingCache)
            {
                // NOTE: LastModified does not contain milliseconds, so we remove them to the file
                SaveStream(s, cachePath, RemoveMilliseconds(resp.LastModified), _streamBufferSize);

                // save headers
                SaveCacheHeaders(req.RequestUri, resp);

                if (path != null)
                {
                    // copy and touch the file
                    IOLibrary.CopyAlways(cachePath, path);
                    File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
                }
            }
            else
            {
                // try to work in-memory
                if ((doc != null) && (html))
                {
                    if (respenc != null)
                    {
                        doc.Load(s, respenc);
                    }
                    else
                    {
                        doc.Load(s, true);
                    }
                }
            }
        }

        return status;
    }
    finally
    {
        resp.Close();
    }
}
/// <summary>
/// Loads an HTML document from an Internet resource.
/// </summary>
/// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
/// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
/// <param name="proxy">Proxy to use with this request</param>
/// <param name="credentials">Credentials to use when authenticating</param>
/// <returns>A new HTML document.</returns>
/// <exception cref="HtmlWebException">The scheme of <paramref name="url"/> is not http, https or file.</exception>
public HtmlDocument Load(string url, string method, WebProxy proxy, NetworkCredential credentials)
{
    Uri uri = new Uri(url);
    HtmlDocument doc;

    if ((uri.Scheme == Uri.UriSchemeHttps) || (uri.Scheme == Uri.UriSchemeHttp))
    {
        doc = LoadUrl(uri, method, proxy, credentials);
    }
    else if (uri.Scheme == Uri.UriSchemeFile)
    {
        doc = new HtmlDocument();

        // The option used to be set to false and then immediately to true; only the
        // effective value is kept.
        // NOTE(review): LoadUrl sets OptionAutoCloseOnEnd = false and OptionFixNestedTags = true;
        // confirm whether the same was intended for file documents.
        doc.OptionAutoCloseOnEnd = true;

        doc.DetectEncodingAndLoad(url, _autoDetectEncoding);
    }
    else
    {
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
    }

    // give the registered handler a chance to look at the document before it is returned
    if (PreHandleDocument != null)
    {
        PreHandleDocument(doc);
    }

    return doc;
}
/// <summary>
/// Creates a navigator that takes over the document, current node, attribute index
/// and name table of another navigator.
/// </summary>
/// <param name="nav">The navigator to copy from.</param>
/// <exception cref="ArgumentNullException"><paramref name="nav"/> is null.</exception>
private HtmlNodeNavigator(HtmlNodeNavigator nav)
{
    if (nav == null)
    {
        throw new ArgumentNullException("nav");
    }

    InternalTrace(null);

    // REVIEW: should we do this? (the name table instance is shared, not copied)
    _nametable = nav._nametable;

    _attindex = nav._attindex;
    _currentnode = nav._currentnode;
    _doc = nav._doc;
}
/// <summary>
/// Creates a navigator over <paramref name="doc"/> that starts at <paramref name="currentNode"/>.
/// </summary>
/// <param name="doc">The document to navigate.</param>
/// <param name="currentNode">The starting node; its OwnerDocument must be <paramref name="doc"/>.</param>
/// <exception cref="ArgumentNullException"><paramref name="currentNode"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="currentNode"/> is owned by another document.</exception>
internal HtmlNodeNavigator(HtmlDocument doc, HtmlNode currentNode)
{
    if (currentNode == null)
    {
        throw new ArgumentNullException("currentNode");
    }

    bool ownedByDocument = currentNode.OwnerDocument == doc;
    if (!ownedByDocument)
    {
        throw new ArgumentException(HtmlDocument.HtmlExceptionRefNotChild);
    }

    InternalTrace(null);

    _doc = doc;
    Reset();

    // the starting node is assigned last, after Reset() has run
    _currentnode = currentNode;
}
/// <summary>
/// Creates an attribute and records the document that owns it.
/// </summary>
/// <param name="ownerdocument">The owning document; stored as is.</param>
internal HtmlAttribute(HtmlDocument ownerdocument)
{
    this._ownerdocument = ownerdocument;
}