Пример #1
0
        /// <summary>
        /// Runs the given HTML through <c>md.Converter</c> and writes the converted
        /// text to <c>TextWriter</c>, followed by one empty line.
        /// </summary>
        /// <param name="html">HTML markup handed to the converter.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when <c>InTable</c> is set, i.e. the writer is in table mode.
        /// </exception>
        public void WriteHtml(string html)
        {
            // Free-form content is rejected while a table is being written.
            if (this.InTable)
            {
                throw new InvalidOperationException("Attempted to write non-table content in table mode.");
            }

            var converter = new md.Converter();
            var convertedText = converter.Convert(html);

            // Converted content first, then a blank separator line.
            this.TextWriter.WriteLine(convertedText);
            this.TextWriter.WriteLine();
        }
        /// <summary>
        /// Exports cnblogs articles to local Markdown files.
        /// </summary>
        /// <remarks>
        /// For every list page in the requested range the article links are scraped, each
        /// article page is downloaded, its title, date and body are extracted with regular
        /// expressions, a front-matter header is prepended and the result is converted and
        /// written to the <c>output</c> folder below <c>Application.StartupPath</c>.
        /// Articles whose page does not match the expected layout are skipped.
        /// </remarks>
        /// <param name="pageStart">First blog list page, i.e. the {0} in http://www.cnblogs.com/parry/default.html?page={0}.</param>
        /// <param name="pageEnd">Last blog list page (inclusive), i.e. the {0} in http://www.cnblogs.com/parry/default.html?page={0}.</param>
        /// <param name="isSaveImage">
        /// Whether the article body is passed through <c>ProcessArticleImage</c>. According to the
        /// original notes this stores the article images in the <c>images</c> folder of the running
        /// program (behavior of that helper is not visible here - confirm).
        /// </param>
        /// <param name="imagePrefixUrl">URL prefix of your own image host used to replace image URLs; only used when <paramref name="isSaveImage"/> is true.</param>
        /// <param name="isAddMoreSeparateLine">
        /// Whether to insert a <c>&lt;!--more--&gt;</c> separator at <paramref name="separateLineLocation"/>,
        /// which blog engines use to extract the excerpt / "read more" cut.
        /// </param>
        /// <param name="separateLineLocation">
        /// Character index in the generated Markdown at which the separator is inserted. When the
        /// index is negative or not inside the generated text, no separator is inserted.
        /// </param>
        /// <returns>Always <c>true</c> once all pages have been processed.</returns>
        public static bool ExportToMarkdown(int pageStart, int pageEnd, bool isSaveImage, string imagePrefixUrl = "", bool isAddMoreSeparateLine = false, int separateLineLocation = 300)
        {
            const RegexOptions options = RegexOptions.Singleline | RegexOptions.Multiline;

            // The patterns never change, so they are built once instead of once per page/article.
            var regexArticleLink = new Regex(@"class=""postTitle"">\s+<a.*?href=""(?<href>.*?)"">", options);
            var regexArticle =
                new Regex(
                    @"<div\s+id=""topics"">.*?id=""cb_post_title_url"".*?>(?<title>.*?)</a>.*?<div\s+id=""cnblogs_post_body"">(?<articlecontent>.*?)</div><div\s+(?:id=""MySignature""></div>)?\s+<div\s+class=""clear""></div>.*?id=""post-date"">(?<date>.*?)</span>",
                    options);
            var regexAppName = new Regex("currentBlogApp\\s+=\\s+'(?<appName>.*?)'", options);
            var regexId = new Regex(@"cb_blogId=(?<blogid>\d+),cb_entryId=(?<entryid>\d+)", options);

            // Make sure the target folder exists; StreamWriter does not create missing directories.
            var outputDirectory = Application.StartupPath + "\\output\\";
            Directory.CreateDirectory(outputDirectory);

            for (var page = pageStart; page <= pageEnd; page++)
            {
                var pagesUrl = string.Format("http://www.cnblogs.com/parry/default.html?page={0}", page);

                // Collect every article link on the list page, then fetch and store each article.
                var matches = regexArticleLink.Matches(NetworkHelper.GetHtmlFromGet(pagesUrl, Encoding.UTF8));
                foreach (Match match in matches)
                {
                    var articleUrl = match.Groups["href"].ToString();
                    var content = NetworkHelper.GetHtmlFromGet(articleUrl, Encoding.UTF8);

                    // The blog application name stays empty when the page does not declare it.
                    var matchAppName = regexAppName.Match(content);
                    var appName = matchAppName.Success
                        ? matchAppName.Groups["appName"].ToString()
                        : string.Empty;

                    var matchArticle = regexArticle.Match(content);
                    if (!matchArticle.Success)
                    {
                        // Page layout not recognised: nothing to export for this link.
                        continue;
                    }

                    var title = matchArticle.Groups["title"].ToString().Trim();
                    var date = matchArticle.Groups["date"].ToString().Trim();
                    var articleContent = matchArticle.Groups["articlecontent"].ToString();
                    if (isSaveImage)
                    {
                        // Optional image handling; with your own image host only the URL prefix
                        // needs to be replaced after the images have been saved.
                        articleContent = ProcessArticleImage(articleContent, imagePrefixUrl);
                    }

                    articleContent = ProcessArticleCode(articleContent);
                    articleContent =
                        articleContent.Replace("<div id=\"parrycontent\">", string.Empty)
                            .Replace("</div>", string.Empty);

                    // Blog and post ids stay 0 when the page does not expose them.
                    int blogId = 0, postId = 0;
                    var matchId = regexId.Match(content);
                    if (matchId.Success)
                    {
                        int.TryParse(matchId.Groups["blogid"].ToString(), out blogId);
                        int.TryParse(matchId.Groups["entryid"].ToString(), out postId);
                    }

                    var categoryTags = GetArticleCategory(appName, blogId, postId);
                    var fileName = GetFileName(articleUrl);
                    var filePath = outputDirectory + fileName;
                    var mdContent = string.Format("---\r\ntitle: {0}\r\ndate: {1}\r\n{2}\r\n---\r\n{3}", title, date,
                        categoryTags, articleContent);
                    var converter = new Converter();
                    var markdown = converter.Convert(mdContent);

                    // Insert the <!--more--> separator at the requested character index. Insert keeps
                    // every character of the article (the previous Substring(location + 1) dropped one),
                    // and texts shorter than the requested index are left unchanged instead of throwing.
                    if (isAddMoreSeparateLine && separateLineLocation >= 0 && separateLineLocation < markdown.Length)
                    {
                        markdown = markdown.Insert(separateLineLocation, "\r\n<!--more-->\r\n");
                    }

                    // Save the file; the using block closes the writer even when the write fails.
                    using (var streamWriter = new StreamWriter(filePath))
                    {
                        streamWriter.Write(markdown);
                    }
                }
            }

            return true;
        }
Пример #3
0
        /// <summary>
        /// Click handler that processes every course page listed in <c>urls.txt</c>: the page is
        /// downloaded, the section between the content markers is extracted, pages that are not
        /// English-only (or are being retired) are skipped, and for the remaining pages an entry is
        /// handed to <c>excelFileGenerator</c> and a Markdown file is created.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button1_Click(object sender, EventArgs e)
        {
            const string contentStartMarker = "<!-- <content> -->";
            const string contentEndMarker = "<!-- </content> -->";

            // A page whose content section contains any of these fragments is skipped.
            string[] excludedFragments =
            {
                "Chinese (Simplified)",
                "Chinese (Traditional)",
                "Japanese, English</li>",
                "German, English</li>",
                "French, English</li>",
                "Course retirement date:",
                "Portuguese(Brazil), English",
                // Previously "Spanish, English</li>>": the stray '>' made this filter differ from
                // its siblings so it could not match a normal closing tag.
                "Spanish, English</li>",
            };

            // NOTE(review): presumably this call produces urls.txt - confirm.
            UrlGeneration();

            string[] lines = System.IO.File.ReadAllLines(@"urls.txt");

            // One client is enough for the sequential downloads and is disposed when the loop ends.
            using (var client = new WebClient())
            {
                foreach (string url in lines)
                {
                    // Blank lines (e.g. a trailing newline in the file) are not URLs.
                    if (string.IsNullOrWhiteSpace(url))
                    {
                        continue;
                    }

                    string htmlContent = client.DownloadString(url);
                    string title = TakeCourseTitle(htmlContent);
                    string editedHtmlContent = "";

                    // Extract the text between the two content markers, excluding the markers.
                    int markerStart = htmlContent.IndexOf(contentStartMarker, StringComparison.Ordinal);
                    int markerEnd = htmlContent.IndexOf(contentEndMarker, StringComparison.Ordinal);
                    int bodyStart = markerStart + contentStartMarker.Length;
                    if (markerStart >= 0 && markerEnd >= bodyStart)
                    {
                        string body = htmlContent.Substring(bodyStart, markerEnd - bodyStart);
                        editedHtmlContent = body;

                        bool isEnglish = body.Contains("<li>English</li>");
                        bool isExcluded = Array.Exists(excludedFragments, fragment => body.Contains(fragment));
                        if (!isEnglish || isExcluded)
                        {
                            continue;
                        }
                    }
                    else
                    {
                        // As before, a page without usable markers is still processed below,
                        // with empty content; only the failure is reported.
                        Console.WriteLine("Content markers not found in " + url);
                    }

                    string filename = RemoveIllegalChars(title) + ".md";

                    excelFileGenerator.CreateCourseListInfoFromHtml(editedHtmlContent, title, filename, url, prodDictionary);
                    var converter = new Html2Markdown.Converter();

                    // Reset per page so the diagnostics below never show values of an earlier page.
                    int englishIndex = -1;
                    int jobRoleIndex = -1;
                    try
                    {
                        var md = converter.Convert(editedHtmlContent);

                        // Keep only the part from "English" up to (not including) "Job role:".
                        englishIndex = md.IndexOf("English", StringComparison.Ordinal);
                        jobRoleIndex = md.IndexOf("Job role:", StringComparison.Ordinal);
                        md = md.Substring(englishIndex, jobRoleIndex - englishIndex);
                        md = md.Replace("####", "#").Replace("English", "# About this course");
                        md = StripHTML(md);

                        // Drop the last two characters of the stripped text.
                        md = md.Substring(0, md.Length - 2);
                        CreateFile(md, filename);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: report the page that failed and carry on with the next one.
                        Console.WriteLine(englishIndex);
                        Console.WriteLine(jobRoleIndex);

                        Console.WriteLine(filename);
                        Console.WriteLine(ex);
                    }
                }
            }

            excelFileGenerator.dosmth();

            MessageBox.Show("All Finished");
        }