/// <summary>
/// Extracts all link addresses (HREF and SRC attribute values) contained in an HTML page.
/// </summary>
/// <param name="Page">HTML source text to scan.</param>
/// <param name="HostAbsolutePath">Base path handed to GetPath to resolve each URL.</param>
/// <param name="MaxLen">Maximum accepted URL length; values below 1 disable the limit.</param>
/// <param name="FindUrlLevel">When 2, relative URLs are accepted too; otherwise only http(s) URLs are kept.</param>
/// <returns>Array of collected URLs (lower-cased), or null when none were found.</returns>
public static string[] GetHTMLUrls(string Page, string HostAbsolutePath, int MaxLen, int FindUrlLevel)
{
    ArrayList list = new ArrayList();
    ParseHTML ehtml = new ParseHTML();
    ehtml.Source = Page;

    while (!ehtml.Eof())
    {
        // Parse() returns '\0' when a complete tag has been read (vs. a text character).
        if (ehtml.Parse() != '\0')
        {
            continue;
        }

        Shove.HTML.HtmlParse.Attribute attribute = ehtml.GetTag()["HREF"];
        if (attribute != null)
        {
            string str = attribute.Value.Trim().ToLower();
            // Skip empty values, mailto links and in-page anchors; keep absolute
            // http(s) URLs, or any URL when FindUrlLevel == 2.
            // Ordinal comparison: URL schemes are not culture-sensitive text.
            if ((str != "") &&
                !str.StartsWith("mailto", StringComparison.Ordinal) &&
                !str.StartsWith("#", StringComparison.Ordinal) &&
                ((FindUrlLevel == 2) ||
                 str.StartsWith("http://", StringComparison.Ordinal) ||
                 str.StartsWith("https://", StringComparison.Ordinal)))
            {
                str = GetPath(str, HostAbsolutePath);
                if ((MaxLen < 1) || (str.Length <= MaxLen))
                {
                    list.Add(str);
                }
            }
        }

        attribute = ehtml.GetTag()["SRC"];
        if (attribute != null)
        {
            string str2 = attribute.Value.Trim().ToLower();
            // SRC values have no mailto/# exclusions; only the scheme filter applies.
            if ((str2 != "") &&
                ((FindUrlLevel == 2) ||
                 str2.StartsWith("http://", StringComparison.Ordinal) ||
                 str2.StartsWith("https://", StringComparison.Ordinal)))
            {
                str2 = GetPath(str2, HostAbsolutePath);
                if ((MaxLen < 1) || (str2.Length <= MaxLen))
                {
                    list.Add(str2);
                }
            }
        }
    }

    // Preserve the historical contract: null (not an empty array) when nothing matched.
    if (list.Count == 0)
    {
        return null;
    }

    string[] strArray = new string[list.Count];
    for (int i = 0; i < list.Count; i++)
    {
        strArray[i] = list[i].ToString();
    }
    return strArray;
}
/// <summary>
/// Collects every link address (HREF and SRC attribute values) found in an HTML page.
/// </summary>
/// <param name="page">HTML source text to scan.</param>
/// <param name="hostAbsolutePath">Base path handed to ReBuildUrl to resolve each URL.</param>
/// <param name="maxLen">Maximum accepted URL length; values below 1 disable the limit.</param>
/// <param name="findUrlLevel">When 2, relative URLs are accepted too; otherwise only http(s) URLs pass.</param>
/// <returns>Array of collected URLs (lower-cased), or null when none were found.</returns>
public static string[] GetHTMLUrls(string page, string hostAbsolutePath, int maxLen, int findUrlLevel)
{
    ArrayList collected = new ArrayList();
    HtmlParse.ParseHTML reader = new HtmlParse.ParseHTML();
    reader.Source = page;

    while (!reader.Eof())
    {
        // Parse() yields 0 when a complete tag has been read; skip plain characters.
        char token = reader.Parse();
        if (token != 0)
        {
            continue;
        }

        HtmlParse.Attribute hrefAttr = reader.GetTag()["HREF"];
        if (hrefAttr != null)
        {
            string href = hrefAttr.Value.Trim().ToLower();
            bool notExcluded = (href != "") &&
                !href.StartsWith("mailto", StringComparison.Ordinal) &&
                !href.StartsWith("#", StringComparison.Ordinal);
            bool schemeAccepted = (findUrlLevel == 2) ||
                href.StartsWith("http://", StringComparison.Ordinal) ||
                href.StartsWith("https://", StringComparison.Ordinal);
            if (notExcluded && schemeAccepted)
            {
                href = ReBuildUrl(href, hostAbsolutePath);
                if ((maxLen < 1) || (href.Length <= maxLen))
                {
                    collected.Add(href);
                }
            }
        }

        HtmlParse.Attribute srcAttr = reader.GetTag()["SRC"];
        if (srcAttr != null)
        {
            string src = srcAttr.Value.Trim().ToLower();
            bool srcAccepted = (src != "") &&
                ((findUrlLevel == 2) ||
                 src.StartsWith("http://", StringComparison.Ordinal) ||
                 src.StartsWith("https://", StringComparison.Ordinal));
            if (srcAccepted)
            {
                src = ReBuildUrl(src, hostAbsolutePath);
                if ((maxLen < 1) || (src.Length <= maxLen))
                {
                    collected.Add(src);
                }
            }
        }
    }

    // Historical contract: null (not an empty array) when nothing matched.
    if (collected.Count == 0)
    {
        return null;
    }

    string[] result = new string[collected.Count];
    for (int n = 0; n < collected.Count; n++)
    {
        result[n] = collected[n].ToString();
    }
    return result;
}