/// <summary>Parses the hyperlinks found in a block of HTML.</summary>
/// <param name="html">HTML text to scan.</param>
/// <param name="baseurl">Base URL used to resolve each hyperlink into an absolute URL. When null or empty, the link is kept exactly as written in the HTML.</param>
/// <param name="filter">Optional predicate; links for which it returns false are skipped.</param>
/// <returns>The links that passed the filter and the built-in checks, in match order.</returns>
public static Link[] Parse(String html, String baseurl = null, Func<Link, Boolean> filter = null)
{
    var list = new List<Link>();

    // baseurl is optional (defaults to null) and new Uri(null) throws,
    // so the base is only built when one was actually supplied.
    var buri = String.IsNullOrEmpty(baseurl) ? null : new Uri(baseurl);

    foreach (Match item in _regA.Matches(html))
    {
        var link = new Link
        {
            Html = item.Value,
            Name = item.Groups["名称"].Value.Trim(),
            Url = item.Groups["链接"].Value.Trim()
        };
        link.RawUrl = link.Url;

        // Caller-supplied filter sees the link before any normalization.
        if (filter != null && !filter(link))
        {
            continue;
        }

        // Skip pure in-page anchors and script pseudo-links.
        link.Url = link.Url.TrimStart("#");
        if (String.IsNullOrEmpty(link.Url))
        {
            continue;
        }
        if (link.Url.StartsWithIgnoreCase("javascript:"))
        {
            continue;
        }

        // Title: try the first attribute group, fall back to the second one.
        var txt = item.Groups["其它1"].Value.Trim();
        if (txt.IsNullOrWhiteSpace() || !_regTitle.IsMatch(txt))
        {
            txt = item.Groups["其它2"].Value.Trim();
        }
        var mc = _regTitle.Match(txt);
        if (mc.Success)
        {
            link.Title = mc.Groups["标题"].Value.Trim();
        }

        // Resolve the download address against the base when there is one;
        // otherwise keep the link exactly as it appeared in the HTML.
        if (buri != null)
        {
            link.Url = new Uri(buri, link.RawUrl).ToString();
        }
        else
        {
            link.Url = link.RawUrl;
        }

        // Split the name and read a trailing timestamp (yyyyMMddHHmmss).
        link.ParseTime();

        // Split off the version suffix (_v1.0.0.0).
        link.ParseVersion();

        list.Add(link);
    }

    return list.ToArray();
}
/// <summary>Parses a plain-text FTP directory listing into links, one per line.</summary>
/// <param name="html">Listing text; each non-blank line is treated as one entry.</param>
/// <param name="url">Base URL of the listing, used to resolve each entry into an absolute URL.</param>
/// <param name="filter">Optional predicate; entries for which it returns false are skipped.</param>
/// <returns>The entries that passed the filter, in listing order.</returns>
private static Link[] ParseFTP(String html, String url, Func<Link, Boolean> filter = null)
{
    var list = new List<Link>();
    if (String.IsNullOrEmpty(html))
    {
        return list.ToArray();
    }

    var buri = new Uri(url);
    foreach (var raw in html.Split(Environment.NewLine))
    {
        // A CRLF listing split on a bare "\n" leaves a trailing '\r' on every
        // line, and blank lines are not entries; neither should become a link.
        var item = raw.TrimEnd('\r', '\n');
        if (item.Trim().Length == 0)
        {
            continue;
        }

        var link = new Link { Name = item };

        // Caller-supplied filter sees the entry with only its name set.
        if (filter != null && !filter(link))
        {
            continue;
        }

        // Title starts as the entry name without its extension.
        link.Title = Path.GetFileNameWithoutExtension(item);

        // Resolve the download address against the listing URL.
        link.Url = new Uri(buri, item).ToString();

        // Split the name and read a trailing timestamp (yyyyMMddHHmmss),
        // then cut the title at the reported position.
        // NOTE(review): the index comes from Link.ParseTime, which is not visible
        // here; the length guard keeps an index past the title from throwing.
        var idx = link.ParseTime();
        if (idx > 0 && idx < link.Title.Length)
        {
            link.Title = link.Title.Substring(0, idx);
        }

        // Split off the version suffix (_v1.0.0.0) and cut the title the same way.
        idx = link.ParseVersion();
        if (idx > 0 && idx < link.Title.Length)
        {
            link.Title = link.Title.Substring(0, idx);
        }

        list.Add(link);
    }

    return list.ToArray();
}
/// <summary>Extracts the hyperlinks contained in an HTML fragment.</summary>
/// <param name="html">HTML text to scan.</param>
/// <param name="baseurl">Base URL used to turn each hyperlink into an absolute URL. When null or empty, the link is left exactly as written in the HTML.</param>
/// <param name="filter">Optional predicate; links for which it returns false are dropped.</param>
/// <returns>The accepted links, in the order they were matched.</returns>
public static Link[] Parse(String html, String baseurl = null, Func<Link, Boolean> filter = null)
{
    var list = new List<Link>();

    // The parameter defaults to null, and new Uri(null) throws; build the
    // base only when the caller provided one.
    Uri buri = null;
    if (!String.IsNullOrEmpty(baseurl))
    {
        buri = new Uri(baseurl);
    }

    foreach (Match item in _regA.Matches(html))
    {
        var link = new Link
        {
            Html = item.Value,
            Name = item.Groups["名称"].Value.Trim(),
            Url = item.Groups["链接"].Value.Trim()
        };
        link.RawUrl = link.Url;

        // The filter runs first, on the untouched link.
        if (filter != null && !filter(link))
        {
            continue;
        }

        // Drop in-page anchors ("#...") that leave nothing behind, and script links.
        link.Url = link.Url.TrimStart("#");
        if (String.IsNullOrEmpty(link.Url))
        {
            continue;
        }
        if (link.Url.StartsWithIgnoreCase("javascript:"))
        {
            continue;
        }

        // Look for a title in the first attribute group, else in the second.
        var txt = item.Groups["其它1"].Value.Trim();
        if (txt.IsNullOrWhiteSpace() || !_regTitle.IsMatch(txt))
        {
            txt = item.Groups["其它2"].Value.Trim();
        }
        var mc = _regTitle.Match(txt);
        if (mc.Success)
        {
            link.Title = mc.Groups["标题"].Value.Trim();
        }

        // Complete the download address: resolve against the base if present,
        // otherwise fall back to the raw value from the HTML.
        link.Url = buri != null
            ? new Uri(buri, link.RawUrl).ToString()
            : link.RawUrl;

        // Split the name and read a trailing timestamp (yyyyMMddHHmmss).
        link.ParseTime();

        // Split off the version suffix (_v1.0.0.0).
        link.ParseVersion();

        list.Add(link);
    }

    return list.ToArray();
}
/// <summary>Parses the hyperlinks found in a block of HTML, or an FTP listing when the base URL is an ftp:// address.</summary>
/// <param name="html">HTML text (or FTP listing text) to scan.</param>
/// <param name="baseurl">Base URL used to resolve each hyperlink into an absolute URL. When null or empty, the link is kept exactly as written in the HTML.</param>
/// <param name="filter">Optional predicate; links for which it returns false are skipped.</param>
/// <returns>The links that passed the filter and the built-in checks, in match order.</returns>
public static Link[] Parse(String html, String baseurl = null, Func<Link, Boolean> filter = null)
{
    var hasBase = !String.IsNullOrEmpty(baseurl);

    // FTP listings are plain text, not HTML; hand them to the dedicated parser.
    if (hasBase && baseurl.StartsWithIgnoreCase("ftp://"))
    {
        return ParseFTP(html, baseurl, filter);
    }

    var list = new List<Link>();

    // baseurl is optional (defaults to null) and new Uri(null) throws,
    // so the base is only built when one was actually supplied.
    var buri = hasBase ? new Uri(baseurl) : null;

    foreach (Match item in _regA.Matches(html))
    {
        var link = new Link
        {
            Html = item.Value,
            FullName = item.Groups["名称"].Value.Trim(),
            Url = item.Groups["链接"].Value.Trim()
        };
        link.RawUrl = link.Url;
        link.Name = link.FullName;

        // Caller-supplied filter sees the link before any normalization.
        if (filter != null && !filter(link))
        {
            continue;
        }

        // Skip pure in-page anchors and script pseudo-links.
        link.Url = link.Url.TrimStart("#");
        if (String.IsNullOrEmpty(link.Url))
        {
            continue;
        }
        if (link.Url.StartsWithIgnoreCase("javascript:"))
        {
            continue;
        }

        // Title: try the first attribute group, fall back to the second one.
        var txt = item.Groups["其它1"].Value.Trim();
        if (txt.IsNullOrWhiteSpace() || !_regTitle.IsMatch(txt))
        {
            txt = item.Groups["其它2"].Value.Trim();
        }
        var mc = _regTitle.Match(txt);
        if (mc.Success)
        {
            link.Title = mc.Groups["标题"].Value.Trim();
        }

        // Resolve the download address against the base when there is one;
        // otherwise keep the link exactly as it appeared in the HTML.
        if (buri != null)
        {
            link.Url = new Uri(buri, link.RawUrl).ToString();
        }
        else
        {
            link.Url = link.RawUrl;
        }

        // github.com "blob" pages are HTML views; rewrite to the "raw" path for downloading.
        if (link.Url.Contains("github.com") && link.Url.Contains("/blob/"))
        {
            link.Url = link.Url.Replace("/blob/", "/raw/");
        }

        // Split the name and read a trailing timestamp (yyyyMMddHHmmss).
        link.ParseTime();

        // Split off the version suffix (_v1.0.0.0).
        link.ParseVersion();

        // Strip the file extension from the display name.
        var p = link.Name.LastIndexOf('.');
        if (p > 0)
        {
            link.Name = link.Name.Substring(0, p);
        }

        list.Add(link);
    }

    return list.ToArray();
}