/// <summary>
/// Converts an HTML fragment to Markdown and writes the result, followed by
/// an empty line, to <c>this.TextWriter</c>.
/// </summary>
/// <param name="html">HTML markup passed to the Markdown converter.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>this.InTable</c> is set, i.e. the writer is in table mode.
/// </exception>
public void WriteHtml(string html)
{
    // Free-form content is rejected while a table is being written.
    if (this.InTable)
    {
        throw new InvalidOperationException("Attempted to write non-table content in table mode.");
    }

    // "md" here is the namespace/alias that hosts the converter type.
    var markdown = new md.Converter().Convert(html);

    this.TextWriter.WriteLine(markdown);
    this.TextWriter.WriteLine();
}
/// <summary>
/// Exports articles from a cnblogs.com blog to local Markdown files.
/// Each list page in the requested range is downloaded, every article linked
/// from it is fetched, converted to Markdown with a front-matter header
/// (title, date, categories/tags) and written to the "output" folder under
/// <c>Application.StartupPath</c>.
/// </summary>
/// <param name="pageStart">First list page to crawl, i.e. the {0} in http://www.cnblogs.com/parry/default.html?page={0}</param>
/// <param name="pageEnd">Last list page to crawl (inclusive), i.e. the {0} in http://www.cnblogs.com/parry/default.html?page={0}</param>
/// <param name="isSaveImage">Whether the article images are processed by <c>ProcessArticleImage</c> (per the original author's note, they are saved to an "images" folder next to the program).</param>
/// <param name="imagePrefixUrl">URL prefix of your own image host, passed to <c>ProcessArticleImage</c> to replace the image URLs in the article.</param>
/// <param name="isAddMoreSeparateLine">When true, a &lt;!--more--&gt; separator is inserted at <paramref name="separateLineLocation"/>; blogs use it to extract the excerpt and the "read more" link.</param>
/// <param name="separateLineLocation">Character index in the generated Markdown (front matter included) at which the separator is inserted. Positions outside the document are ignored.</param>
/// <returns>Always <c>true</c> once every page has been processed.</returns>
public static bool ExportToMarkdown(int pageStart, int pageEnd, bool isSaveImage, string imagePrefixUrl = "",
    bool isAddMoreSeparateLine = false, int separateLineLocation = 300)
{
    const RegexOptions options = RegexOptions.Singleline | RegexOptions.Multiline;

    // The patterns are loop-invariant, so they are built once instead of once per page/article.
    var regexPostLink = new Regex(@"class=""postTitle"">\s+<a.*?href=""(?<href>.*?)"">", options);
    var regexArticle = new Regex(
        @"<div\s+id=""topics"">.*?id=""cb_post_title_url"".*?>(?<title>.*?)</a>.*?<div\s+id=""cnblogs_post_body"">(?<articlecontent>.*?)</div><div\s+(?:id=""MySignature""></div>)?\s+<div\s+class=""clear""></div>.*?id=""post-date"">(?<date>.*?)</span>",
        options);
    var regexAppName = new Regex("currentBlogApp\\s+=\\s+'(?<appName>.*?)'", options);
    var regexId = new Regex(@"cb_blogId=(?<blogid>\d+),cb_entryId=(?<entryid>\d+)", options);

    var outputDirectory = Application.StartupPath + "\\output\\";

    for (var page = pageStart; page <= pageEnd; page++)
    {
        var pagesUrl = string.Format("http://www.cnblogs.com/parry/default.html?page={0}", page);

        // Collect every article link on the list page, then fetch and store each article.
        var matches = regexPostLink.Matches(NetworkHelper.GetHtmlFromGet(pagesUrl, Encoding.UTF8));
        foreach (Match match in matches)
        {
            var articleUrl = match.Groups["href"].Value;
            var content = NetworkHelper.GetHtmlFromGet(articleUrl, Encoding.UTF8);

            var appName = string.Empty;
            var matchAppName = regexAppName.Match(content);
            if (matchAppName.Success)
            {
                appName = matchAppName.Groups["appName"].Value;
            }

            var matchArticle = regexArticle.Match(content);
            if (!matchArticle.Success)
            {
                // Page layout not recognised: nothing to export for this link.
                continue;
            }

            var title = matchArticle.Groups["title"].Value.Trim();
            var date = matchArticle.Groups["date"].Value.Trim();
            var articleContent = matchArticle.Groups["articlecontent"].Value;

            if (isSaveImage)
            {
                // Save the images referenced by the article. Optional: with your own image
                // host, it is enough to replace the host prefix after saving.
                articleContent = ProcessArticleImage(articleContent, imagePrefixUrl);
            }

            articleContent = ProcessArticleCode(articleContent);

            // Drop the wrapper div and every remaining closing div tag.
            articleContent = articleContent.Replace("<div id=\"parrycontent\">", string.Empty)
                .Replace("</div>", string.Empty);

            // Ids stay 0 when the page does not expose them or they do not parse.
            int blogId = 0, postId = 0;
            var matchId = regexId.Match(content);
            if (matchId.Success)
            {
                int.TryParse(matchId.Groups["blogid"].Value, out blogId);
                int.TryParse(matchId.Groups["entryid"].Value, out postId);
            }

            var categoryTags = GetArticleCategory(appName, blogId, postId);
            var fileName = GetFileName(articleUrl);
            var filePath = outputDirectory + fileName;

            var mdContent = string.Format("---\r\ntitle: {0}\r\ndate: {1}\r\n{2}\r\n---\r\n{3}", title, date,
                categoryTags, articleContent);
            var converter = new Converter();
            var markdown = converter.Convert(mdContent);

            // Insert the <!--more--> separator at the requested position. Insert keeps every
            // original character (the previous Substring(n + 1) dropped the one at index n),
            // and documents shorter than the position are left untouched instead of throwing.
            if (isAddMoreSeparateLine && separateLineLocation >= 0 && separateLineLocation < markdown.Length)
            {
                markdown = markdown.Insert(separateLineLocation, "\r\n<!--more-->\r\n");
            }

            // Save the file; the output folder is created on demand and the writer is
            // always disposed, even when the write fails.
            Directory.CreateDirectory(outputDirectory);
            using (var streamWriter = new StreamWriter(filePath))
            {
                streamWriter.Write(markdown);
            }
        }
    }

    return true;
}
/// <summary>
/// Click handler that downloads every URL listed in urls.txt, keeps only the
/// pages whose content section passes the language/retirement filters,
/// records each kept page through <c>excelFileGenerator</c> and writes a
/// trimmed Markdown version of the page via <c>CreateFile</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // NOTE(review): presumably UrlGeneration() produces urls.txt - confirm.
    UrlGeneration();
    string[] lines = System.IO.File.ReadAllLines(@"urls.txt");

    const string contentStartMarker = "<!-- <content> -->";
    const string contentEndMarker = "<!-- </content> -->";

    // A page is skipped as soon as its content section contains one of these markers.
    // The last entry used to end in "</li>>", which did not follow the "</li>" form of
    // its siblings and so could only match a page containing that doubled ">".
    string[] skipMarkers =
    {
        "Chinese (Simplified)",
        "Chinese (Traditional)",
        "Japanese, English</li>",
        "German, English</li>",
        "French, English</li>",
        "Course retirement date:",
        "Portuguese(Brazil), English",
        "Spanish, English</li>",
    };

    // One client for the whole run, disposed when the loop ends.
    using (var client = new WebClient())
    {
        foreach (string url in lines)
        {
            // A blank line (e.g. a trailing newline in urls.txt) is not a URL.
            if (string.IsNullOrWhiteSpace(url))
            {
                continue;
            }

            string htmlContent = client.DownloadString(url);
            string title = TakeCourseTitle(htmlContent);
            string editedHtmlContent = "";
            try
            {
                // Cut out the text between the content markers, without the start marker itself.
                var contentStartIndex = htmlContent.IndexOf(contentStartMarker, StringComparison.Ordinal);
                var contentEndIndex = htmlContent.IndexOf(contentEndMarker, StringComparison.Ordinal);
                editedHtmlContent = htmlContent.Substring(contentStartIndex, contentEndIndex - contentStartIndex);
                editedHtmlContent = editedHtmlContent.Substring(contentStartMarker.Length);

                // Only English-only, non-retired courses are kept.
                if (!editedHtmlContent.Contains("<li>English</li>"))
                {
                    continue;
                }

                string section = editedHtmlContent;
                if (Array.Exists(skipMarkers, marker => section.Contains(marker)))
                {
                    continue;
                }
            }
            catch (Exception ex)
            {
                // Missing markers make Substring throw. The failure is logged and the page is
                // still processed with whatever was extracted so far (possibly nothing).
                Console.WriteLine(ex);
            }

            string filename = RemoveIllegalChars(title) + ".md";
            excelFileGenerator.CreateCourseListInfoFromHtml(editedHtmlContent, title, filename, url, prodDictionary);

            var converter = new Html2Markdown.Converter();

            // Scoped per page so a failure never reports indices left over from an earlier URL.
            int sectionStart = -1, sectionEnd = -1;
            try
            {
                var markdown = converter.Convert(editedHtmlContent);

                // Keep the part from the first "English" up to "Job role:".
                sectionStart = markdown.IndexOf("English", StringComparison.Ordinal);
                sectionEnd = markdown.IndexOf("Job role:", StringComparison.Ordinal);
                markdown = markdown.Substring(sectionStart, sectionEnd - sectionStart);

                markdown = markdown.Replace("####", "#").Replace("English", "# About this course");
                markdown = StripHTML(markdown);

                // Drop the last two characters of the stripped text.
                markdown = markdown.Substring(0, markdown.Length - 2);
                CreateFile(markdown, filename);
            }
            catch (Exception ex)
            {
                // Log the cause together with the indices and file involved, then move on.
                Console.WriteLine(ex);
                Console.WriteLine(sectionStart);
                Console.WriteLine(sectionEnd);
                Console.WriteLine(filename);
            }
        }
    }

    excelFileGenerator.dosmth();
    MessageBox.Show("All Finished");
}