Пример #1
0
        /// <summary>
        /// Scans an HTML fragment for <c>&lt;a ... href="..."&gt;...&lt;/a&gt;</c> elements and returns
        /// the links that pass the configured filters, each mapped to <c>depth + 1</c>.
        /// </summary>
        /// <param name="htmlContent">
        /// HTML text to scan. A null or empty value produces an empty result instead of throwing.
        /// </param>
        /// <param name="depth">Depth of the page being parsed; every returned URL is stored with <c>depth + 1</c>.</param>
        /// <returns>
        /// A dictionary keyed by the URL produced by <c>contentStandarlize</c> followed by
        /// <c>URLSdantarlize</c>; the value is always <c>depth + 1</c>. Never null.
        /// </returns>
        /// <remarks>
        /// A link is dropped when its tag-stripped text contains any entry of <c>EscapeWords</c>,
        /// when <c>Keywords</c> is set and the text contains none of its entries, when the URL is
        /// empty or starts with <c>#</c>, <c>mailto:</c> or <c>javascript:</c>, or when
        /// <c>URLRegexFilter</c> is set and none of its patterns matches the URL.
        /// Text checks are case-sensitive; the URL regex filter is case-insensitive.
        /// </remarks>
        public Dictionary<string, int> ParseURLS(string htmlContent, int depth)
        {
            var res = new Dictionary<string, int>();

            // Regex.Match throws on null input; an absent page simply has no links.
            if (string.IsNullOrEmpty(htmlContent))
            {
                return res;
            }

            // href -> link text with nested tags removed. A repeated href keeps the text of
            // its last occurrence, so each distinct href is evaluated exactly once below.
            var urls = new Dictionary<string, string>();
            for (var match = Regex.Match(htmlContent, @"(?i)<a .*?href=\""([^\""]+)\""[^>]*>(.*?)</a>");
                 match.Success;
                 match = match.NextMatch())
            {
                urls[match.Groups[1].Value] = Regex.Replace(match.Groups[2].Value, "(?i)<.*?>", "");
            }

            foreach (var link in urls)
            {
                string href = link.Key;
                string linkDescription = link.Value;

                if (string.IsNullOrEmpty(href))
                {
                    continue;
                }

                // Blacklist: skip links whose visible text mentions any escape word.
                if (EscapeWords != null && EscapeWords.Any(linkDescription.Contains))
                {
                    continue;
                }

                // Whitelist: when keywords are configured, the text must mention at least one.
                if (Keywords != null && !Keywords.Any(linkDescription.Contains))
                {
                    continue;
                }

                string url = contentStandarlize(href);

                // In-page anchors and non-navigable schemes are never followed.
                if (string.IsNullOrEmpty(url) ||
                    url.StartsWith("#") ||
                    url.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase) ||
                    url.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                url = URLSdantarlize(url);

                if (URLRegexFilter != null &&
                    !URLRegexFilter.Any(f => Regex.IsMatch(url, f, RegexOptions.IgnoreCase)))
                {
                    continue;
                }

                // Different hrefs can normalize to the same URL; keep the first one seen.
                if (!res.ContainsKey(url))
                {
                    res.Add(url, depth + 1);
                }
            }

            return res;
        }
Пример #2
0
 /// <summary>
 /// Returns <paramref name="word"/> converted to lower case when it is contained in
 /// <c>EscapeWords</c>; otherwise returns it unchanged.
 /// </summary>
 /// <param name="word">The word to check. May be null, in which case null is returned.</param>
 /// <returns>
 /// The lower-cased word if <c>EscapeWords.Contains(word)</c> is true; otherwise the original
 /// reference (including null).
 /// </returns>
 public static string EscapeKeyword(this string word)
 {
     if (word == null || !EscapeWords.Contains(word))
     {
         return word;
     }

     // Invariant casing keeps the result independent of the thread's current culture
     // (e.g. under tr-TR, ToLower() maps 'I' to a dotless 'ı').
     return word.ToLowerInvariant();
 }