Пример #1
0
        /// <summary>
        /// Walks a paginated article starting at <paramref name="url"/> and collects its title
        /// and content using caller-supplied regular expressions.
        /// </summary>
        /// <param name="url">Address of the first page. Following pages are obtained from
        /// <c>FetchContent.GetNextPageUrl</c>; the walk stops when that yields null or "".</param>
        /// <param name="tmp_title">In/out title. The "[标题]" placeholder is replaced with "(.*?)"
        /// on entry; if the value is then null or empty it is filled from the first page whose
        /// <paramref name="treg"/> match is non-empty.</param>
        /// <param name="tmp_content">In/out content. The "[正文]" placeholder is replaced with
        /// "(.*?)" on entry; every regex result is appended, each followed by a line break.</param>
        /// <param name="treg">Regular expression handed to <c>RegexHelper.getMatch</c> for the title.</param>
        /// <param name="creg">Newline-separated list of content regular expressions; each entry
        /// is trimmed before use. Null or empty disables content extraction.</param>
        /// <remarks>
        /// Resets the <c>nextPages</c> list and records every visited URL in it.
        /// <c>EchoHelper.EchoPickEnd</c> is always called, even when a helper throws.
        /// </remarks>
        public static void GetContentFromUrl(string url, ref string tmp_title, ref string tmp_content, string treg, string creg)
        {
            EchoHelper.EchoPickStart();
            try
            {
                // Null ref arguments are treated as empty instead of raising NullReferenceException.
                tmp_title   = (tmp_title ?? string.Empty).Replace("[标题]", "(.*?)");
                tmp_content = (tmp_content ?? string.Empty).Replace("[正文]", "(.*?)");
                nextPages   = new ArrayList();

                // IsNullOrEmpty (rather than != "") so a null next-page URL ends the walk.
                while (!string.IsNullOrEmpty(url))
                {
                    string html = FetchContent.GetDataFromUrl(url);
                    nextPages.Add(url);

                    if (string.IsNullOrEmpty(tmp_title))
                    {
                        tmp_title = RegexHelper.getMatch(html, treg, 1);
                    }

                    // Apply every content regex to the current page.
                    if (!string.IsNullOrEmpty(creg))
                    {
                        // Line feeds are swapped for back-ticks before matching and restored
                        // afterwards (presumably so "." can span lines - confirm against
                        // RegexHelper). The page is flattened once, not once per regex.
                        string flattened = html.Replace("\n", "`");

                        foreach (string pattern in creg.Split('\n'))
                        {
                            string matched = RegexHelper.getMatchs(flattened, pattern.Trim(), 1, "\r\n").Replace("`", "\n");
                            tmp_content += matched;
                            tmp_content += Environment.NewLine;
                        }
                    }

                    url = FetchContent.GetNextPageUrl(html, url);
                }
            }
            finally
            {
                // Keep the start/end echo balanced even if fetching or matching throws.
                EchoHelper.EchoPickEnd();
            }
        }
Пример #2
0
 /// <summary>
 /// Walks a paginated article starting at <paramref name="url"/>, accumulating the main content
 /// of every page and deriving a title from the HTML &lt;title&gt; element.
 /// </summary>
 /// <param name="url">Address of the first page; it is URL-decoded before use. Following pages
 /// are obtained from <c>FetchContent.GetNextPageUrl</c>.</param>
 /// <param name="title">In/out title. If null or empty it is taken from the page's
 /// &lt;title&gt; element, with everything from the first "_" or "-" onward removed and
 /// "&amp;nbsp;" stripped.</param>
 /// <param name="content">In/out content; the result of <c>FetchContent.GetMainContent</c>
 /// for each page is appended.</param>
 /// <remarks>
 /// Resets the <c>nextPages</c> list and records every visited URL in it. Any exception is
 /// swallowed on purpose: <paramref name="title"/> and <paramref name="content"/> are reset to
 /// "" and a notice is echoed, so one bad article does not abort the run.
 /// </remarks>
 public static void GetContentFromUrl(string url, ref string title, ref string content)
 {
     EchoHelper.EchoPickStart();
     try {
         url       = HttpUtility.UrlDecode(url);
         nextPages = new ArrayList();
         // IsNullOrEmpty (rather than != "") so a null URL ends the walk.
         while (!string.IsNullOrEmpty(url))
         {
             string html = FetchContent.GetDataFromUrl(url);
             nextPages.Add(url);
             if (string.IsNullOrEmpty(title))
             {
                 title = RegexHelper.getHtmlRegexText(html, "{<title>(.*?)</title>}");
                 // Drop site-name suffixes such as "Article_Site" or "Article - Site".
                 title = RegexHelper.regReplace(title, "_.*", "");
                 title = RegexHelper.regReplace(title, "-.*", "");
                 title = title.Replace("&nbsp;", "");
             }
             content += FetchContent.GetMainContent(html);

             // Strings are immutable, so the trimmed value must be assigned back; the old
             // bare "url.Trim();" had no effect and whitespace-only URLs kept the loop going.
             string next = FetchContent.GetNextPageUrl(html, url);
             url = next == null ? string.Empty : next.Trim();
         }
         // Title still holds the raw pattern text (presumably what getHtmlRegexText yields
         // when nothing matched - confirm); fall back to the leading part of the content.
         if (title.Contains("<title>(.*"))
         {
             title = StringHelper.SubString(content, 0, 50);
         }
     } catch {
         title   = "";
         content = "";
         EchoHelper.Echo("采集跳过,原因可能是:该文章设置了密码、被删除、乱码等。", "采集出错", EchoHelper.EchoType.普通信息);
     }
     EchoHelper.EchoPickEnd();
 }
Пример #3
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns the title that
        /// <c>GetTitleFromHTML</c> extracts from it.
        /// </summary>
        /// <param name="url">Page address; it is URL-decoded before the request.</param>
        /// <returns>Whatever <c>GetTitleFromHTML</c> returns for the downloaded markup.</returns>
        public static string GetTitleFromUrl(string url)
        {
            url = HttpUtility.UrlDecode(url);
            string html = FetchContent.GetDataFromUrl(url);

            // Pass the fetched markup: the previous code passed an always-empty local,
            // so the downloaded page never reached the title parser.
            return GetTitleFromHTML(html);
        }