コード例 #1
0
        /// <summary>
        /// Scans <paramref name="html"/> for CSS <c>url(...)</c> references, registers each one
        /// with <paramref name="container"/> and substitutes the value returned by
        /// <c>AddUri</c> for the original reference.
        /// </summary>
        /// <param name="container">Receives every discovered reference through <c>AddUri</c>, typed as <c>UriType.File</c>.</param>
        /// <param name="html">Text to scan; on return it holds the rewritten text.</param>
        /// <remarks>
        /// The text is rewritten in a single pass and each reference is replaced at its own
        /// position inside the match, so a reference whose text also occurs elsewhere in the
        /// match (for example <c>url(url)</c>) no longer damages the surrounding syntax.
        /// </remarks>
        public void GetUrlFromCss(ISpiderContainer container, ref string html)
        {
            html = Regex.Replace(
                html,
                @"url\([""']?([^""'\s\<\>]*)[""']?\)",
                match =>
                {
                    var target = match.Groups[1];

                    // Leave empty references ("url()") and inline base64 payloads untouched.
                    // string.Contains is an ordinal comparison, unlike IndexOf(string).
                    if (target.Length == 0 || target.Value.Contains("base64,"))
                    {
                        return match.Value;
                    }

                    var uri = container.AddUri(target.Value, UriType.File);

                    // Splice the new value in at the captured position only.
                    var offset = target.Index - match.Index;
                    return match.Value.Substring(0, offset)
                           + uri
                           + match.Value.Substring(offset + target.Length);
                },
                RegexOptions.IgnoreCase);
        }
コード例 #2
0
        /// <summary>
        /// Scans <paramref name="html"/> for <c>src</c>/<c>href</c>/<c>value</c>/<c>data</c>
        /// attributes on a fixed set of tags, registers each referenced URL with
        /// <paramref name="container"/> and substitutes the value returned by <c>AddUri</c>.
        /// </summary>
        /// <param name="container">Receives every accepted URL through <c>AddUri</c>, together with a <c>UriType</c> derived from the tag name.</param>
        /// <param name="html">Markup to scan; on return it holds the rewritten markup.</param>
        /// <remarks>
        /// Empty values, fragment-only links (<c>#...</c>) and values containing
        /// <c>javascript:</c>, <c>data:</c> or <c>ed2k://</c> are left untouched. The markup is
        /// rewritten in a single pass and only the captured attribute value is replaced, so
        /// other text inside the tag that happens to equal the URL is preserved.
        /// </remarks>
        public void GetUrlFromHtml(ISpiderContainer container, ref string html)
        {
            html = Regex.Replace(
                html,
                @"(\<(a|img|link|script|embed|audio|object|video|param|source|iframe)[^\<\>]+(src|href|value|data)\s?=)\s?[""']?([^""'\s\<\>]*)[""']?",
                match =>
                {
                    var target = match.Groups[4];
                    var url    = target.Value;

                    // Scheme checks ignore case ("JavaScript:" is as unwanted as "javascript:").
                    if (string.IsNullOrEmpty(url) ||
                        url.StartsWith("#", StringComparison.Ordinal) ||
                        url.IndexOf("javascript:", StringComparison.OrdinalIgnoreCase) >= 0 ||
                        url.IndexOf("data:", StringComparison.OrdinalIgnoreCase) >= 0 ||
                        url.IndexOf("ed2k://", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        return match.Value;
                    }

                    UriType uriType;
                    // Invariant lowering: culture-sensitive ToLower() maps 'I' to a dotless 'ı'
                    // under Turkish cultures, which would break the "iframe"/"img"/"link"/"script" cases.
                    switch (match.Groups[2].Value.ToLowerInvariant())
                    {
                    case "iframe":
                    case "a":
                        uriType = UriType.Html;
                        break;

                    case "link":
                        uriType = UriType.Css;
                        break;

                    case "img":
                        uriType = UriType.Image;
                        break;

                    case "script":
                        uriType = UriType.Js;
                        break;

                    default:
                        uriType = UriType.File;
                        break;
                    }

                    // Original author's note: a relative path is required here.
                    var uri = container.AddUri(url, uriType);

                    // Splice the new value in at the captured position only.
                    var offset = target.Index - match.Index;
                    return match.Value.Substring(0, offset)
                           + uri
                           + match.Value.Substring(offset + target.Length);
                },
                RegexOptions.IgnoreCase);
        }
コード例 #3
0
ファイル: UrlRule.cs プロジェクト: zx648383079/ZoDream.Spider
        /// <summary>
        /// Scans <paramref name="html"/> with the rule's own <c>pattern</c>, registers each
        /// extracted value with <paramref name="container"/> and substitutes the value returned
        /// by <c>AddUri</c>.
        /// </summary>
        /// <param name="container">Receives every extracted value through <c>AddUri</c>, typed as <c>UriType.File</c>.</param>
        /// <param name="html">Text to scan; on return it holds the rewritten text.</param>
        /// <remarks>
        /// When <c>tag</c> is null or whitespace the whole match is treated as the URL and is
        /// replaced entirely; otherwise only the capture group named by <c>tag</c> is replaced.
        /// Matches whose extracted value is empty (including a group that did not participate
        /// in the match) or contains <c>base64,</c> are left untouched.
        /// </remarks>
        public void GetUrlFromCustom(ISpiderContainer container, ref string html)
        {
            var regex      = new Regex(pattern, RegexOptions.IgnoreCase);
            var isEmptyTag = string.IsNullOrWhiteSpace(tag);

            html = regex.Replace(html, match =>
            {
                var value = isEmptyTag ? match.Value : match.Groups[tag].Value;

                // An empty value cannot be replaced (String.Replace rejects an empty search
                // string) and carries no URL; base64 payloads are inline data, not links.
                if (value.Length == 0 || value.Contains("base64,"))
                {
                    return match.Value;
                }

                var uri = container.AddUri(value, UriType.File);
                if (isEmptyTag)
                {
                    return uri;
                }

                var target = match.Groups[tag];
                var offset = target.Index - match.Index;

                // A group captured inside a lookaround can lie outside the match span; in that
                // case fall back to plain substring replacement within the match text.
                if (offset < 0 || offset + target.Length > match.Length)
                {
                    return match.Value.Replace(value, uri);
                }

                // Splice the new value in at the captured position only.
                return match.Value.Substring(0, offset)
                       + uri
                       + match.Value.Substring(offset + target.Length);
            });
        }