/// <summary>
/// Downloads <paramref name="url"/> and every follow-up page reported by
/// <c>FetchContent.GetNextPageUrl</c>, extracting the title with
/// <paramref name="treg"/> and the body with the patterns in <paramref name="creg"/>.
/// </summary>
/// <param name="url">Address of the first page; null or empty means nothing is fetched.</param>
/// <param name="tmp_title">
/// In/out title. If it is empty after placeholder expansion, it is filled from the
/// first page that is fetched using <paramref name="treg"/>.
/// </param>
/// <param name="tmp_content">In/out body text; matches from each page are appended to it.</param>
/// <param name="treg">Pattern passed to <c>RegexHelper.getMatch</c> (group 1) for the title.</param>
/// <param name="creg">
/// One content pattern per line. Each line is trimmed and applied to every page;
/// null or empty disables content extraction.
/// </param>
/// <remarks>
/// Resets the shared <c>nextPages</c> list and records every visited URL in it.
/// </remarks>
public static void GetContentFromUrl(string url, ref string tmp_title, ref string tmp_content, string treg, string creg)
{
    EchoHelper.EchoPickStart();
    try
    {
        // Null-safe placeholder expansion: a caller passing null no longer crashes here.
        // NOTE(review): the placeholders are expanded in the output values rather than in
        // treg/creg, exactly as the original did - confirm that this is intended.
        tmp_title = (tmp_title ?? string.Empty).Replace("[标题]", "(.*?)");
        tmp_content = (tmp_content ?? string.Empty).Replace("[正文]", "(.*?)");

        nextPages = new ArrayList();

        // The pattern list does not depend on the page, so split it once up front.
        string[] contentRegexs = string.IsNullOrEmpty(creg) ? new string[0] : creg.Split('\n');

        // Stop on a null/empty URL and on a URL that was already visited, so a
        // pagination cycle cannot turn into an endless fetch loop.
        while (!string.IsNullOrEmpty(url) && !nextPages.Contains(url))
        {
            string html = FetchContent.GetDataFromUrl(url);
            nextPages.Add(url);

            if (string.IsNullOrEmpty(tmp_title))
            {
                tmp_title = RegexHelper.getMatch(html, treg, 1);
            }

            // Content regex loop.
            if (contentRegexs.Length > 0)
            {
                // Line feeds are swapped for a back-tick before matching and restored
                // afterwards; the swap is computed once per page instead of once per pattern.
                string flattened = html.Replace("\n", "`");
                for (int i = 0; i < contentRegexs.Length; i++)
                {
                    string matched = RegexHelper.getMatchs(flattened, contentRegexs[i].Trim(), 1, "\r\n").Replace("`", "\n");
                    tmp_content += matched + Environment.NewLine;
                }
            }

            url = FetchContent.GetNextPageUrl(html, url);
        }
    }
    finally
    {
        // Always pair the start echo with an end echo, even when a fetch throws.
        EchoHelper.EchoPickEnd();
    }
}
/// <summary>
/// Downloads <paramref name="url"/> and every follow-up page reported by
/// <c>FetchContent.GetNextPageUrl</c>, deriving the title from the page's
/// <c>&lt;title&gt;</c> element and appending <c>FetchContent.GetMainContent</c>
/// of each page to <paramref name="content"/>.
/// </summary>
/// <param name="url">Address of the first page; it is URL-decoded before use.</param>
/// <param name="title">
/// In/out title. If it is null or empty, it is filled from the first page that is
/// fetched: everything from the first "_" and from the first "-" is cut off and
/// spaces are removed.
/// </param>
/// <param name="content">In/out body text; each page's main content is appended.</param>
/// <remarks>
/// Any exception is swallowed: <paramref name="title"/> and <paramref name="content"/>
/// are reset to empty strings and a skip message is echoed. Resets the shared
/// <c>nextPages</c> list and records every visited URL in it.
/// </remarks>
public static void GetContentFromUrl(string url, ref string title, ref string content)
{
    EchoHelper.EchoPickStart();
    try
    {
        url = HttpUtility.UrlDecode(url);
        nextPages = new ArrayList();

        // Stop on a null/empty URL and on a URL that was already visited, so a
        // pagination cycle cannot turn into an endless fetch loop.
        while (!string.IsNullOrEmpty(url) && !nextPages.Contains(url))
        {
            string html = FetchContent.GetDataFromUrl(url);
            nextPages.Add(url);

            if (string.IsNullOrEmpty(title))
            {
                title = RegexHelper.getHtmlRegexText(html, "{<title>(.*?)</title>}");
                title = RegexHelper.regReplace(title, "_.*", "");
                title = RegexHelper.regReplace(title, "-.*", "");
                title = title.Replace(" ", "");
            }

            content += FetchContent.GetMainContent(html);

            // string.Trim returns a new string; the original discarded the result.
            // A null next-page URL now ends the loop instead of throwing into the
            // catch block, which would wipe the content collected so far.
            string next = FetchContent.GetNextPageUrl(html, url);
            url = next == null ? string.Empty : next.Trim();
        }

        // NOTE(review): presumably the title still carries the pattern text when the
        // <title> lookup failed - confirm against RegexHelper.getHtmlRegexText.
        // In that case fall back to the leading part of the content.
        if (title.Contains("<title>(.*"))
        {
            title = StringHelper.SubString(content, 0, 50);
        }
    }
    catch
    {
        // Deliberate best effort: skip this article and report it instead of failing.
        title = "";
        content = "";
        EchoHelper.Echo("采集跳过,原因可能是:该文章设置了密码、被删除、乱码等。", "采集出错", EchoHelper.EchoType.普通信息);
    }
    EchoHelper.EchoPickEnd();
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the title that
/// <c>GetTitleFromHTML</c> extracts from it.
/// </summary>
/// <param name="url">Page address; it is URL-decoded before the download.</param>
/// <returns>The result of <c>GetTitleFromHTML</c> for the downloaded markup.</returns>
public static string GetTitleFromUrl(string url)
{
    url = HttpUtility.UrlDecode(url);
    string html = FetchContent.GetDataFromUrl(url);

    // The original passed an always-empty local instead of the downloaded markup,
    // so the fetched page was never used.
    return GetTitleFromHTML(html);
}