Пример #1
0
        /// <summary>Parses the hyperlinks found in an HTML fragment.</summary>
        /// <param name="html">HTML text to scan.</param>
        /// <param name="baseurl">
        /// Base URL used to expand each hyperlink into a full URL.
        /// When null or empty, each link keeps the URL exactly as written in the HTML.
        /// </param>
        /// <param name="filter">
        /// Optional predicate; a link is dropped when it returns false.
        /// It runs before the URL is normalized, so it sees the raw URL.
        /// </param>
        /// <returns>The links that were not filtered out, in match order.</returns>
        public static Link[] Parse(String html, String baseurl = null, Func<Link, Boolean> filter = null)
        {
            var list = new List<Link>();

            // baseurl is optional (its default is null) and new Uri(null) throws,
            // so the base Uri is only built when a base was actually supplied.
            var buri = String.IsNullOrEmpty(baseurl) ? null : new Uri(baseurl);

            foreach (Match item in _regA.Matches(html))
            {
                var link = new Link();

                link.Html   = item.Value;
                link.Name   = item.Groups["名称"].Value.Trim();
                link.Url    = item.Groups["链接"].Value.Trim();
                link.RawUrl = link.Url;

                // Caller-supplied filter
                if (filter != null && !filter(link))
                {
                    continue;
                }

                // Skip links that are empty once a leading "#" is removed
                link.Url = link.Url.TrimStart("#");
                if (String.IsNullOrEmpty(link.Url))
                {
                    continue;
                }

                // Script pseudo-links are not real targets
                if (link.Url.StartsWithIgnoreCase("javascript:"))
                {
                    continue;
                }

                // Look for a title in the first attribute group, then fall back to the second
                var txt = item.Groups["其它1"].Value.Trim();
                if (txt.IsNullOrWhiteSpace() || !_regTitle.IsMatch(txt))
                {
                    txt = item.Groups["其它2"].Value.Trim();
                }
                var mc = _regTitle.Match(txt);
                if (mc.Success)
                {
                    link.Title = mc.Groups["标题"].Value.Trim();
                }

                // Expand to a full URL when a base is available;
                // otherwise fall back to the URL as written in the markup.
                link.Url = buri != null ? new Uri(buri, link.RawUrl).ToString() : link.RawUrl;

                // Split the name and work out the trailing time stamp (yyyyMMddHHmmss)
                link.ParseTime();

                // Split off the version suffix, e.g. _v1.0.0.0
                link.ParseVersion();

                list.Add(link);
            }

            return list.ToArray();
        }
Пример #2
0
        /// <summary>Builds links from an FTP listing, treating every line as one entry.</summary>
        /// <param name="html">Listing text; it is split on <see cref="Environment.NewLine"/>.</param>
        /// <param name="url">Base URL of the listing, used to build each entry's full URL.</param>
        /// <param name="filter">
        /// Optional predicate; an entry is dropped when it returns false.
        /// Only Name has been assigned on the link when the filter runs.
        /// </param>
        /// <returns>The entries that were not filtered out, in listing order.</returns>
        private static Link[] ParseFTP(String html, String url, Func<Link, Boolean> filter = null)
        {
            var list = new List<Link>();

            var ns = html.Split(Environment.NewLine);

            // Nothing to parse: return before the base URL is even inspected
            if (ns.Length == 0)
            {
                return list.ToArray();
            }

            var buri = new Uri(url);

            foreach (var item in ns)
            {
                var link = new Link
                {
                    Name = item
                };

                // Caller-supplied filter
                if (filter != null && !filter(link))
                {
                    continue;
                }

                // The title starts out as the entry name without its extension
                link.Title = Path.GetFileNameWithoutExtension(item);

                // Expand to a full URL relative to the listing
                link.Url = new Uri(buri, item).ToString();

                // Split the name and work out the trailing time stamp (yyyyMMddHHmmss).
                // The returned index is only applied when it fits inside the title,
                // so Substring cannot throw ArgumentOutOfRangeException.
                var idx = link.ParseTime();
                if (idx > 0 && idx <= link.Title.Length)
                {
                    link.Title = link.Title.Substring(0, idx);
                }

                // Split off the version suffix, e.g. _v1.0.0.0 (same bounds check as above)
                idx = link.ParseVersion();
                if (idx > 0 && idx <= link.Title.Length)
                {
                    link.Title = link.Title.Substring(0, idx);
                }

                list.Add(link);
            }

            return list.ToArray();
        }
Пример #3
0
        /// <summary>Parses the hyperlinks found in an HTML fragment.</summary>
        /// <param name="html">HTML text to scan.</param>
        /// <param name="baseurl">
        /// Base URL used to expand each hyperlink into a full URL.
        /// When null or empty, each link keeps the URL exactly as written in the HTML.
        /// </param>
        /// <param name="filter">
        /// Optional predicate; a link is dropped when it returns false.
        /// It runs before the URL is normalized, so it sees the raw URL.
        /// </param>
        /// <returns>The links that were not filtered out, in match order.</returns>
        public static Link[] Parse(String html, String baseurl = null, Func<Link, Boolean> filter = null)
        {
            var list = new List<Link>();

            // baseurl is optional (its default is null) and new Uri(null) throws,
            // so the base Uri is only built when a base was actually supplied.
            var buri = String.IsNullOrEmpty(baseurl) ? null : new Uri(baseurl);
            foreach (Match item in _regA.Matches(html))
            {
                var link = new Link();

                link.Html = item.Value;
                link.Name = item.Groups["名称"].Value.Trim();
                link.Url = item.Groups["链接"].Value.Trim();
                link.RawUrl = link.Url;

                // Caller-supplied filter
                if (filter != null && !filter(link)) continue;

                // Skip links that are empty once a leading "#" is removed
                link.Url = link.Url.TrimStart("#");
                if (String.IsNullOrEmpty(link.Url)) continue;

                // Script pseudo-links are not real targets
                if (link.Url.StartsWithIgnoreCase("javascript:")) continue;

                // Look for a title in the first attribute group, then fall back to the second
                var txt = item.Groups["其它1"].Value.Trim();
                if (txt.IsNullOrWhiteSpace() || !_regTitle.IsMatch(txt)) txt = item.Groups["其它2"].Value.Trim();
                var mc = _regTitle.Match(txt);
                if (mc.Success)
                {
                    link.Title = mc.Groups["标题"].Value.Trim();
                }

                // Expand to a full URL when a base is available;
                // otherwise fall back to the URL as written in the markup.
                link.Url = buri != null ? new Uri(buri, link.RawUrl).ToString() : link.RawUrl;

                // Split the name and work out the trailing time stamp (yyyyMMddHHmmss)
                link.ParseTime();

                // Split off the version suffix, e.g. _v1.0.0.0
                link.ParseVersion();

                list.Add(link);
            }

            return list.ToArray();
        }
Пример #4
0
        /// <summary>Parses the hyperlinks found in an HTML fragment, or in an FTP listing when the base URL is an ftp:// address.</summary>
        /// <param name="html">HTML text (or FTP listing text) to scan.</param>
        /// <param name="baseurl">
        /// Base URL used to expand each hyperlink into a full URL.
        /// When it starts with "ftp://" the work is delegated to ParseFTP.
        /// When null or empty, each link keeps the URL exactly as written in the HTML.
        /// </param>
        /// <param name="filter">
        /// Optional predicate; a link is dropped when it returns false.
        /// It runs before the URL is normalized, so it sees the raw URL.
        /// </param>
        /// <returns>The links that were not filtered out, in match order.</returns>
        public static Link[] Parse(String html, String baseurl = null, Func<Link, Boolean> filter = null)
        {
            // baseurl is optional (its default is null); guard explicitly rather than
            // rely on how the extension method treats a null receiver.
            if (baseurl != null && baseurl.StartsWithIgnoreCase("ftp://"))
            {
                return ParseFTP(html, baseurl, filter);
            }

            var list = new List<Link>();

            // new Uri(null) throws, so the base Uri is only built when a base was supplied.
            var buri = String.IsNullOrEmpty(baseurl) ? null : new Uri(baseurl);

            foreach (Match item in _regA.Matches(html))
            {
                var link = new Link
                {
                    Html     = item.Value,
                    FullName = item.Groups["名称"].Value.Trim(),
                    Url      = item.Groups["链接"].Value.Trim()
                };
                link.RawUrl = link.Url;
                link.Name   = link.FullName;

                // Caller-supplied filter
                if (filter != null && !filter(link))
                {
                    continue;
                }

                // Skip links that are empty once a leading "#" is removed
                link.Url = link.Url.TrimStart("#");
                if (String.IsNullOrEmpty(link.Url))
                {
                    continue;
                }

                // Script pseudo-links are not real targets
                if (link.Url.StartsWithIgnoreCase("javascript:"))
                {
                    continue;
                }

                // Look for a title in the first attribute group, then fall back to the second
                var txt = item.Groups["其它1"].Value.Trim();
                if (txt.IsNullOrWhiteSpace() || !_regTitle.IsMatch(txt))
                {
                    txt = item.Groups["其它2"].Value.Trim();
                }
                var mc = _regTitle.Match(txt);
                if (mc.Success)
                {
                    link.Title = mc.Groups["标题"].Value.Trim();
                }

                // Expand to a full URL when a base is available;
                // otherwise fall back to the URL as written in the markup.
                link.Url = buri != null ? new Uri(buri, link.RawUrl).ToString() : link.RawUrl;

                // Downloads from github.com need the URL adjusted: /blob/ becomes /raw/
                if (link.Url.Contains("github.com") && link.Url.Contains("/blob/"))
                {
                    link.Url = link.Url.Replace("/blob/", "/raw/");
                }

                // Split the name and work out the trailing time stamp (yyyyMMddHHmmss)
                link.ParseTime();

                // Split off the version suffix, e.g. _v1.0.0.0
                link.ParseVersion();

                // Remove the extension: everything from the last '.' onward
                var p = link.Name.LastIndexOf('.');
                if (p > 0)
                {
                    link.Name = link.Name.Substring(0, p);
                }

                list.Add(link);
            }

            return list.ToArray();
        }