/// <summary>
/// Rewrites the <c>href</c> of selected anchors and the <c>src</c> of every image in
/// <paramref name="document"/> with the value returned by <c>UrlUtils.CanonicalizeUrl</c>.
/// </summary>
/// <param name="document">Parsed HTML document whose attributes are updated in place.</param>
/// <param name="url">Second argument handed to <c>UrlUtils.CanonicalizeUrl</c> for every rewritten value.</param>
private void FixAllRelativeHrefs(HtmlDocument document, string url)
{
    // Anchors: only those matched by the XPath filter on the href prefix.
    var anchors = document.DocumentNode.SelectNodes("//a[not(starts-with(@href,'http') or starts-with(@href,'https'))]");
    if (anchors != null)
    {
        foreach (var anchor in anchors)
        {
            var href = anchor.Attributes["href"];
            if (href == null)
            {
                // Nothing to rewrite on an anchor without an href attribute.
                continue;
            }

            href.Value = UrlUtils.CanonicalizeUrl(href.Value, url);
        }
    }

    // Images: every <img> is visited; those without a src attribute are left untouched.
    var pictures = document.DocumentNode.SelectNodes(".//img");
    if (pictures != null)
    {
        foreach (var picture in pictures)
        {
            var src = picture.Attributes["src"];
            if (src == null)
            {
                continue;
            }

            src.Value = UrlUtils.CanonicalizeUrl(src.Value, url);
        }
    }
}
/// <summary>
/// Adds a batch of URLs to fetch, all with the same priority.
/// </summary>
/// <remarks>
/// Entries that are null, empty, exactly "#", or that begin with the "javascript:" scheme
/// (in any letter case) are skipped. Each remaining entry is passed to
/// <c>UrlUtils.CanonicalizeUrl</c> together with <c>Url</c>; the resulting request is added to
/// <c>TargetRequests</c> only when its <c>IsAvailable</c> is true.
/// </remarks>
/// <param name="requests">URLs to enqueue; a null or empty list is a no-op.</param>
/// <param name="priority">Value assigned to <c>Priority</c> on every created request.</param>
public void AddTargetRequests(IList<string> requests, int priority)
{
    if (requests == null || requests.Count == 0)
    {
        return;
    }

    lock (this)
    {
        foreach (string s in requests)
        {
            // URI schemes are case-insensitive and are not linguistic text, so the
            // "javascript:" check is ordinal and ignores case ("JavaScript:void(0)" is skipped too).
            if (string.IsNullOrEmpty(s)
                || s.Equals("#")
                || s.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            string s1 = UrlUtils.CanonicalizeUrl(s, Url);
            Request request = new Request(s1, Request.Extras)
            {
                Priority = priority,
                Depth = Request.NextDepth
            };

            if (request.IsAvailable)
            {
                TargetRequests.Add(request);
            }
        }
    }
}
/// <summary>
/// Adds a single URL to fetch.
/// </summary>
/// <remarks>
/// A null, empty, or "#" value is ignored. Anything else is passed to
/// <c>UrlUtils.CanonicalizeUrl</c> together with the string form of <c>_url</c>, and a new
/// <c>Request</c> for the result is appended to <c>_targetRequests</c>.
/// </remarks>
/// <param name="requestString">URL to enqueue.</param>
public void AddTargetRequest(string requestString)
{
    bool isUsable = !string.IsNullOrEmpty(requestString) && !requestString.Equals("#");
    if (isUsable)
    {
        string canonicalUrl = UrlUtils.CanonicalizeUrl(requestString, _url.ToString());
        _targetRequests.Add(new Request(canonicalUrl));
    }
}
/// <summary>
/// Adds a batch of URLs to fetch.
/// </summary>
/// <remarks>
/// Entries that are null, empty, exactly "#", or that begin with the "javascript:" scheme
/// (in any letter case) are skipped. Each remaining entry is passed to
/// <c>UrlUtils.CanonicalizeUrl</c> together with the string form of <c>_url</c>, and the result
/// is wrapped in a <c>Request</c> that receives <c>_request?.Extras</c> (null when
/// <c>_request</c> is null).
/// </remarks>
/// <param name="requests">URLs to enqueue; a null list is a no-op.</param>
public void AddTargetRequests(IList<string> requests)
{
    // A null list is treated like an empty one instead of failing with a NullReferenceException.
    if (requests == null)
    {
        return;
    }

    foreach (string s in requests)
    {
        // URI schemes are case-insensitive and are not linguistic text, so the
        // "javascript:" check is ordinal and ignores case.
        if (string.IsNullOrEmpty(s)
            || s.Equals("#")
            || s.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        string s1 = UrlUtils.CanonicalizeUrl(s, _url.ToString());
        _targetRequests.Add(new Request(s1, _request?.Extras));
    }
}
/// <summary>
/// Adds a single URL to fetch.
/// </summary>
/// <remarks>
/// The whole operation runs inside <c>lock (this)</c>. A null, empty, or "#" value is ignored;
/// anything else is passed to <c>UrlUtils.CanonicalizeUrl</c> together with <c>Url</c> and added
/// to <c>TargetRequests</c> as a request built from <c>Request.NextDepth</c> and
/// <c>Request.Extras</c>.
/// </remarks>
/// <param name="requestString">URL to enqueue.</param>
public void AddTargetRequest(string requestString)
{
    lock (this)
    {
        if (!string.IsNullOrEmpty(requestString) && !requestString.Equals("#"))
        {
            string canonicalUrl = UrlUtils.CanonicalizeUrl(requestString, Url);
            TargetRequests.Add(new Request(canonicalUrl, Request.NextDepth, Request.Extras));
        }
    }
}
/// <summary>
/// Adds a batch of URLs to fetch, all with the same priority.
/// </summary>
/// <remarks>
/// Entries that are null, empty, exactly "#", or that begin with the "javascript:" scheme
/// (in any letter case) are skipped. Each remaining entry is passed to
/// <c>UrlUtils.CanonicalizeUrl</c> together with the string form of <c>_url</c>, and a
/// <c>Request</c> for the result, with <paramref name="priority"/> applied through
/// <c>SetPriority</c>, is appended to <c>_targetRequests</c>.
/// The caller's list is only read; it is never modified.
/// </remarks>
/// <param name="requests">URLs to enqueue; a null list is a no-op.</param>
/// <param name="priority">Priority applied to every created request.</param>
public void AddTargetRequests(List<string> requests, long priority)
{
    // A null list is treated like an empty one instead of failing with a NullReferenceException.
    if (requests == null)
    {
        return;
    }

    foreach (string s in requests)
    {
        // URI schemes are case-insensitive and are not linguistic text, so the
        // "javascript:" check is ordinal and ignores case.
        if (string.IsNullOrEmpty(s)
            || s.Equals("#")
            || s.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Keep the canonical URL in a local: writing it back into 'requests' would
        // silently rewrite the list owned by the caller.
        string canonicalUrl = UrlUtils.CanonicalizeUrl(s, _url.ToString());
        _targetRequests.Add(new Request(canonicalUrl).SetPriority(priority));
    }
}
/// <summary>
/// Adds a batch of URLs to fetch.
/// </summary>
/// <remarks>
/// Entries that are null, empty, exactly "#", or that begin with the "javascript:" scheme
/// (in any letter case) are skipped. Each remaining entry is passed to
/// <c>UrlUtils.CanonicalizeUrl</c> together with <c>Url</c> and added to <c>TargetRequests</c>
/// as a request built from <c>Request.NextDepth</c> and <c>Request.Extras</c>.
/// The additions run inside <c>lock (this)</c>.
/// </remarks>
/// <param name="requests">URLs to enqueue; a null list is a no-op.</param>
public void AddTargetRequests(IList<string> requests)
{
    // A null list is treated like an empty one instead of failing with a NullReferenceException.
    if (requests == null)
    {
        return;
    }

    lock (this)
    {
        foreach (string s in requests)
        {
            // URI schemes are case-insensitive and are not linguistic text, so the
            // "javascript:" check is ordinal and ignores case.
            if (string.IsNullOrEmpty(s)
                || s.Equals("#")
                || s.StartsWith("javascript:", System.StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            string s1 = UrlUtils.CanonicalizeUrl(s, Url);
            TargetRequests.Add(new Request(s1, Request.NextDepth, Request.Extras));
        }
    }
}
// Too many problems with this approach; if it is needed, move the handling into the
// entity class's Expression instead.
//
// Rewrites the href of every anchor matched by the XPath filter below with the value
// returned by UrlUtils.CanonicalizeUrl(href, url).
internal void FixAllRelativeHrefs(string url)
{
    var anchors = Document.SelectNodes("//a[not(starts-with(@href,'http') or starts-with(@href,'https'))]");
    if (anchors == null)
    {
        // SelectNodes found nothing to process.
        return;
    }

    foreach (var anchor in anchors)
    {
        var href = anchor.Attributes["href"];
        if (href == null)
        {
            // Anchor without an href attribute: nothing to rewrite.
            continue;
        }

        href.Value = UrlUtils.CanonicalizeUrl(href.Value, url);
    }
}
public void TestFixRelativeUrl()
{
    // Exercises UrlUtils.CanonicalizeUrl with relative references against a base URL.
    // Assert.Equal takes (expected, actual); the literal goes first so that a failure
    // report labels the two values correctly.

    // Query-only reference is appended to the base.
    string absoluteUrl = UrlUtils.CanonicalizeUrl("?aa", "http://www.dianping.com/sh/ss/com");
    Assert.Equal("http://www.dianping.com/sh/ss/com?aa", absoluteUrl);

    // A single "../" step.
    absoluteUrl = UrlUtils.CanonicalizeUrl("../aa", "http://www.dianping.com/sh/ss/com");
    Assert.Equal("http://www.dianping.com/sh/aa", absoluteUrl);

    // "..aa" is an ordinary segment name, not a parent-directory step.
    absoluteUrl = UrlUtils.CanonicalizeUrl("..aa", "http://www.dianping.com/sh/ss/com");
    Assert.Equal("http://www.dianping.com/sh/ss/..aa", absoluteUrl);

    // Two "../" steps from a base that ends with a slash.
    absoluteUrl = UrlUtils.CanonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com/");
    Assert.Equal("http://www.dianping.com/sh/aa", absoluteUrl);

    // Two "../" steps from a base without a trailing slash.
    absoluteUrl = UrlUtils.CanonicalizeUrl("../../aa", "http://www.dianping.com/sh/ss/com");
    Assert.Equal("http://www.dianping.com/aa", absoluteUrl);
}
/// <summary>
/// Adds a single URL to fetch.
/// </summary>
/// <remarks>
/// The whole operation runs inside <c>lock (this)</c>. A null, empty, or "#" value is ignored.
/// Anything else is passed to <c>UrlUtils.CanonicalizeUrl</c> together with <c>Url</c>; the
/// resulting request gets <c>Request.Extras</c> and <c>Request.NextDepth</c> and is added to
/// <c>TargetRequests</c> only when its <c>IsAvailable</c> is true.
/// </remarks>
/// <param name="requestString">URL to enqueue.</param>
public void AddTargetRequest(string requestString)
{
    lock (this)
    {
        bool isUsable = !string.IsNullOrEmpty(requestString) && !requestString.Equals("#");
        if (!isUsable)
        {
            return;
        }

        string canonicalUrl = UrlUtils.CanonicalizeUrl(requestString, Url);
        var candidate = new Request(canonicalUrl, Request.Extras)
        {
            Depth = Request.NextDepth
        };

        if (!candidate.IsAvailable)
        {
            return;
        }

        TargetRequests.Add(candidate);
    }
}