Пример #1
0
 /// <summary>
 /// Runs the link clean-up passes over the document, one after another, in a fixed order.
 /// </summary>
 /// <param name="document">The HTML document whose links are processed; every pass receives this same instance.</param>
 /// <param name="queryCleanLevel">The query clean level; only forwarded to <c>CleanLinks</c>.</param>
 private static void CombineAndCleanLinks(HtmlDocument document, LinkQueryCleanLevel queryCleanLevel)
 {
     // NOTE(review): the passes below all operate on the same document, so their order is
     // presumably significant - confirm against the helpers before reordering.

     // First pass: the only one that takes the clean level.
     CleanLinks(document, queryCleanLevel);
     // The remaining helpers are not shown here; going by their names they merge links,
     // drop empty ones, trim whitespace around links and tidy link content - TODO confirm.
     CombineLinks(document);
     RemoveEmptyLinks(document);
     RemoveLeadingAndTrailingSpacesFromLinks(document);
     CleanLinkContent(document);
 }
Пример #2
0
        /// <summary>
        /// Passes the href of every anchor element that has one through
        /// <c>LinkChecker.CleanHref</c> and writes the result back to the attribute.
        /// </summary>
        /// <param name="document">The HTML document to process (modified in place).</param>
        /// <param name="queryCleanLevel">The clean level handed to <c>LinkChecker.CleanHref</c>.</param>
        private static void CleanLinks(HtmlDocument document, LinkQueryCleanLevel queryCleanLevel)
        {
            var anchors = document.DocumentNode.SelectNodes("//a[@href]");
            if (anchors == null)
            {
                // the XPath query gave back nothing to iterate over
                return;
            }

            foreach (var anchor in anchors)
            {
                // the XPath predicate guarantees the attribute exists
                var hrefAttribute = anchor.Attributes["href"];
                hrefAttribute.Value = LinkChecker.CleanHref(hrefAttribute.Value, queryCleanLevel);
            }
        }
Пример #3
0
        /// <summary>
        /// Cleans the query part of the supplied URL.
        /// </summary>
        /// <param name="original">The original URL.</param>
        /// <param name="cleanLevel">The requested clean level.</param>
        /// <returns>
        /// <paramref name="original"/> unchanged when the level is <c>None</c>, the value is null/blank
        /// or it contains no '?'; everything before the first '?' when the level is <c>RemoveQuery</c>;
        /// otherwise the URL with every query parameter starting with "utm" removed. A trailing
        /// "#fragment" is kept in that last case, and the '?' is dropped when no parameter remains.
        /// </returns>
        private static string CleanQueryString(string original, LinkQueryCleanLevel cleanLevel)
        {
            if (cleanLevel == LinkQueryCleanLevel.None || string.IsNullOrWhiteSpace(original))
            {
                return(original);
            }

            int queryStart = original.IndexOf('?');
            if (queryStart < 0)
            {
                return(original);
            }

            var basePart = original.Substring(0, queryStart);

            if (cleanLevel == LinkQueryCleanLevel.RemoveQuery)
            {
                // this level strips everything from the '?' onwards
                return(basePart);
            }

            var qry      = original.Substring(queryStart + 1);
            var fragment = string.Empty;

            // Split off a trailing "#fragment" so that it is not thrown away together with a
            // utm parameter that happens to be the last one. A '#' directly after '&' is taken
            // to be part of a character reference such as "&#38;" and is not a fragment start.
            int fragmentStart = qry.IndexOf('#');
            while (fragmentStart > 0 && qry[fragmentStart - 1] == '&')
            {
                fragmentStart = qry.IndexOf('#', fragmentStart + 1);
            }

            if (fragmentStart >= 0)
            {
                fragment = qry.Substring(fragmentStart);
                qry      = qry.Substring(0, fragmentStart);
            }

            bool encoded = qry.Contains("&amp;");

            if (encoded)
            {
                // decode (ignoring any other encodings)
                qry = qry.Replace("&amp;", "&");
            }

            // ignore all "utm_whatever=value" parts (the check is a case-insensitive "utm" prefix match)
            qry = string.Join("&",
                              qry.Split('&').Where(x => !x.StartsWith("utm", StringComparison.OrdinalIgnoreCase)));
            if (string.IsNullOrEmpty(qry))
            {
                // nothing left: no '?' needed
                return(basePart + fragment);
            }

            if (encoded)
            {
                // re-encode
                qry = qry.Replace("&", "&amp;");
            }

            return(basePart + "?" + qry + fragment);
        }
Пример #4
0
        /// <summary>
        /// Cleans the href value: unwraps Outlook "safelinks" and urldefense.com redirect links
        /// to the URL they wrap, then cleans the query string of the result.
        /// </summary>
        /// <param name="original">The original URL.</param>
        /// <param name="cleanLevel">The clean level.</param>
        /// <returns>
        /// <paramref name="original"/> unchanged when it is null/blank, does not start with "http"
        /// or is not an absolute URI; otherwise the (possibly unwrapped) URL after
        /// <c>CleanQueryString</c> has been applied. A wrapper link that does not have the
        /// expected shape is not unwrapped.
        /// </returns>
        public static string CleanHref(string original, LinkQueryCleanLevel cleanLevel)
        {
            if (string.IsNullOrWhiteSpace(original) || !original.StartsWith("http", StringComparison.Ordinal))
            {
                return(original);
            }

            if (!Uri.TryCreate(original, UriKind.Absolute, out Uri uri))
            {
                return(original);
            }

            // for example https://nam04.safelinks.protection.outlook.com/?url=http%3A%2F%2Fhealthclarity.wolterskluwer.com ...
            if (uri.Host.EndsWith("safelinks.protection.outlook.com", StringComparison.OrdinalIgnoreCase))
            {
                // I want the "url" parameter (decoded) and can ignore the rest
                var query    = original.Substring(original.IndexOf('?') + 1).Replace("&amp;", "&");
                var urlparam = query.Split('&').FirstOrDefault(part => part.StartsWith("url=", StringComparison.Ordinal));
                if (urlparam != null)
                {
                    original = System.Net.WebUtility.UrlDecode(urlparam.Substring(4)); // skip the "url="
                }

                // no "url" parameter: keep the link as it is instead of throwing
            }
            else if (uri.Host.Equals("urldefense.com", StringComparison.OrdinalIgnoreCase))
            {
                // assume /v3/__http(s)://fullpath__;security
                var wrapped = uri.PathAndQuery;
                var start   = wrapped.IndexOf("__", StringComparison.Ordinal);
                if (start >= 0)
                {
                    wrapped = wrapped.Substring(start + 2);

                    // The wrapped URL ends at the "__;" in front of the security part. Looking for
                    // that marker (rather than the first ';') keeps URLs intact that contain a ';'
                    // themselves, for example in "&amp;".
                    var end = wrapped.LastIndexOf("__;", StringComparison.Ordinal);
                    if (end >= 0)
                    {
                        wrapped = wrapped.Substring(0, end);
                    }
                    else
                    {
                        // no "__;" marker: fall back to cutting at the first ';'
                        var semicolon = wrapped.IndexOf(";", StringComparison.Ordinal);
                        if (semicolon > 0)
                        {
                            wrapped = wrapped.Substring(0, semicolon).Trim('_');
                        }
                    }

                    // undo some conversions that I saw in two examples: *20 -> %20, /*/ -> /#/
                    wrapped  = Regex.Replace(wrapped, "\\*(?=[0-9a-zA-Z]{2})", "%").Replace("*", "#");
                    original = wrapped;
                }

                // no "__" marker: not the expected format, keep the link as it is
            }

            return(CleanQueryString(original, cleanLevel));
        }