/// <summary>
/// Runs every link clean-up step on the supplied document, one after another.
/// </summary>
/// <param name="document">The HTML document that is handed to each step.</param>
/// <param name="queryCleanLevel">The query clean level; only the first step (CleanLinks) receives it.</param>
private static void CombineAndCleanLinks(HtmlDocument document, LinkQueryCleanLevel queryCleanLevel)
{
    // Step 1: rewrite the href values.
    CleanLinks(document, queryCleanLevel);

    // Steps 2-5: the remaining passes run in this fixed order on the same document.
    CombineLinks(document);
    RemoveEmptyLinks(document);
    RemoveLeadingAndTrailingSpacesFromLinks(document);
    CleanLinkContent(document);
}
/// <summary>
/// Replaces the href of every anchor that has one with the result of
/// <c>LinkChecker.CleanHref</c>.
/// </summary>
/// <param name="document">The HTML document whose anchors are updated in place.</param>
/// <param name="queryCleanLevel">The clean level passed through to <c>LinkChecker.CleanHref</c>.</param>
private static void CleanLinks(HtmlDocument document, LinkQueryCleanLevel queryCleanLevel)
{
    var anchors = document.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        // A null result means there is no anchor to process.
        return;
    }

    foreach (var anchor in anchors)
    {
        // The XPath above only selects anchors that carry an href attribute.
        var hrefAttribute = anchor.Attributes["href"];
        hrefAttribute.Value = LinkChecker.CleanHref(hrefAttribute.Value, queryCleanLevel);
    }
}
/// <summary>
/// Cleans the query part of the supplied URL.
/// </summary>
/// <param name="original">The original URL.</param>
/// <param name="cleanLevel">The requested clean level.</param>
/// <returns>
/// The unchanged URL when no cleaning is requested, the URL is blank, or it has no query;
/// the URL without its query for <c>LinkQueryCleanLevel.RemoveQuery</c>;
/// otherwise the URL with every query part whose text starts with "utm" removed.
/// </returns>
private static string CleanQueryString(string original, LinkQueryCleanLevel cleanLevel)
{
    if (cleanLevel == LinkQueryCleanLevel.None || string.IsNullOrWhiteSpace(original))
    {
        return original;
    }

    int queryStart = original.IndexOf('?');
    if (queryStart < 0)
    {
        // No query part at all, so there is nothing to clean.
        return original;
    }

    string baseUrl = original.Substring(0, queryStart);
    if (cleanLevel == LinkQueryCleanLevel.RemoveQuery)
    {
        return baseUrl;
    }

    string qry = original.Substring(queryStart + 1);

    // An href taken from HTML source may separate its parameters with "&amp;".
    // Decode that (ignoring any other encodings) so the split below sees real
    // parameter boundaries instead of parts that start with "amp;".
    bool encoded = qry.Contains("&amp;");
    if (encoded)
    {
        qry = qry.Replace("&amp;", "&");
    }

    // Ignore all "utm_whatever=value" parts. The check is a plain "utm" prefix,
    // so any parameter whose name starts with those letters is dropped as well.
    qry = string.Join(
        "&",
        qry.Split('&').Where(x => !x.StartsWith("utm", StringComparison.OrdinalIgnoreCase)));

    if (string.IsNullOrEmpty(qry))
    {
        // Every parameter was removed; do not leave a dangling "?".
        return baseUrl;
    }

    if (encoded)
    {
        // Re-encode so the result uses the same separator style as the input.
        qry = qry.Replace("&", "&amp;");
    }

    return baseUrl + "?" + qry;
}
/// <summary>
/// Cleans the href value.
/// </summary>
/// <remarks>
/// Links wrapped by Outlook Safe Links or by urldefense.com are first replaced by the
/// target URL they carry; the result is then passed to <c>CleanQueryString</c>.
/// A wrapper link that does not contain the expected target is left as it is.
/// </remarks>
/// <param name="original">The original URL.</param>
/// <param name="cleanLevel">The clean level.</param>
/// <returns>
/// The cleaned URL, or <paramref name="original"/> unchanged when it is blank, does not
/// start with "http", or cannot be parsed as an absolute URI.
/// </returns>
public static string CleanHref(string original, LinkQueryCleanLevel cleanLevel)
{
    if (string.IsNullOrWhiteSpace(original) || !original.StartsWith("http", StringComparison.Ordinal))
    {
        return original;
    }

    if (!Uri.TryCreate(original, UriKind.Absolute, out Uri uri))
    {
        return original;
    }

    // for example https://nam04.safelinks.protection.outlook.com/?url=http%3A%2F%2Fhealthclarity.wolterskluwer.com ...
    if (uri.Host.EndsWith("safelinks.protection.outlook.com", StringComparison.OrdinalIgnoreCase))
    {
        // Only the "url" parameter (decoded) is wanted; the rest can be ignored.
        // The href may use HTML-encoded "&amp;" separators, so normalise them first;
        // otherwise every parameter after the first would start with "amp;".
        var query = original.Substring(original.IndexOf('?') + 1).Replace("&amp;", "&");
        var urlParam = query
            .Split('&')
            .FirstOrDefault(part => part.StartsWith("url=", StringComparison.Ordinal));

        // Without a "url" parameter there is nothing to unwrap: keep the link as it is
        // instead of throwing.
        if (urlParam != null)
        {
            original = System.Net.WebUtility.UrlDecode(urlParam.Substring(4)); // skip the "url="
        }
    }
    else if (uri.Host.Equals("urldefense.com", StringComparison.OrdinalIgnoreCase))
    {
        // assume /v3/__http(s)://fullpath__;security
        var wrapped = uri.PathAndQuery;
        int markerIndex = wrapped.IndexOf("__", StringComparison.Ordinal);

        // Only unwrap when the "__" marker is present; otherwise the substring below
        // would merely strip the leading "/" and yield a broken relative path.
        if (markerIndex >= 0)
        {
            wrapped = wrapped.Substring(markerIndex + 2);

            int securityStart = wrapped.IndexOf(";", StringComparison.Ordinal);
            if (securityStart > 0)
            {
                // Drop the ";security" suffix and the "__" that closes the wrapped URL.
                wrapped = wrapped.Substring(0, securityStart).Trim('_');
            }

            // undo some conversions that were seen in two examples: *20 -> %20, /*/ -> /#/
            wrapped = Regex.Replace(wrapped, "\\*(?=[0-9a-zA-Z]{2})", "%").Replace("*", "#");
            original = wrapped;
        }
    }

    return CleanQueryString(original, cleanLevel);
}