public void GetUrlFromCss(ISpiderContainer container, ref string html) { var items = new List <string>(); var matches = Regex.Matches(html, @"url\([""']?([^""'\s\<\>]*)[""']?\)", RegexOptions.IgnoreCase); foreach (Match item in matches) { if (item.Groups[1].Value.IndexOf("base64,") >= 0) { continue; } var uri = container.AddUri(item.Groups[1].Value, UriType.File); html = html.Replace(item.Value, item.Value.Replace(item.Groups[1].Value, uri)); } }
public void GetUrlFromHtml(ISpiderContainer container, ref string html) { var matches = Regex.Matches(html, @"(\<(a|img|link|script|embed|audio|object|video|param|source|iframe)[^\<\>]+(src|href|value|data)\s?=)\s?[""']?([^""'\s\<\>]*)[""']?", RegexOptions.IgnoreCase); foreach (Match item in matches) { var url = item.Groups[4].Value; if (string.IsNullOrEmpty(url) || url.IndexOf("javascript:", StringComparison.Ordinal) >= 0 || url.IndexOf("#", StringComparison.Ordinal) == 0 || url.IndexOf("data:", StringComparison.OrdinalIgnoreCase) >= 0 || url.IndexOf("ed2k://", StringComparison.OrdinalIgnoreCase) >= 0) { continue; } var uriType = UriType.File; switch (item.Groups[2].Value.ToLower()) { case "iframe": case "a": uriType = UriType.Html; break; case "link": uriType = UriType.Css; break; case "img": uriType = UriType.Image; break; case "script": uriType = UriType.Js; break; default: uriType = UriType.File; break; } var uri = container.AddUri(url, uriType); html = html.Replace(item.Value, item.Value.Replace(item.Groups[4].Value, uri)); // 需要相对路径 } }
public void GetUrlFromCustom(ISpiderContainer container, ref string html) { var items = new List <string>(); var regex = new Regex(pattern, RegexOptions.IgnoreCase); var isEmptyTag = string.IsNullOrWhiteSpace(tag); var matches = regex.Matches(html); foreach (Match item in matches) { var value = isEmptyTag ? item.Value : item.Groups[tag].Value; if (value.IndexOf("base64,") >= 0) { continue; } var uri = container.AddUri(value, UriType.File); html = html.Replace(item.Value, isEmptyTag ? uri : item.Value.Replace(item.Groups[tag].Value, uri)); } }