/// <summary>
/// Converts a set of input files into a single CSV file of <see cref="Post"/> records.
/// </summary>
/// <param name="inputFiles">
/// Items handed one by one to <c>GetHtmlFromFile</c> (presumably file paths — confirm against that helper).
/// </param>
/// <param name="outputFilePath">Path of the CSV file that is created (or overwritten) by this call.</param>
static void ProcessFileList(IEnumerable<string> inputFiles, string outputFilePath)
{
    // Materialize all HTML up front so any read failure happens before the output file is created.
    string[] htmlFiles = inputFiles.Select(GetHtmlFromFile).ToArray();

    var postParser = new PostParser(new HtmlPostContentParser(new HtmlDocument()));
    postParser.SetPosts(htmlFiles);
    Post[] posts = postParser.ToPosts();

    using (TextWriter writer = new StreamWriter(outputFilePath))
    using (var csv = new CsvWriter(writer))
    {
        // Header row first, then one row per parsed post.
        csv.WriteHeader<Post>();
        csv.NextRecord();

        foreach (Post post in posts)
        {
            csv.WriteRecord(post);
            csv.NextRecord();
        }
    }
}
/// <summary>
/// Checks that <c>PostParser.ParseSettings</c>, fed the contents of
/// <c>series-sample-test-3.md</c>, exposes "post" under "layout" and "some title" under "title".
/// </summary>
public void Given_Header_That_Contains_Two_Properties_With_Values_Should_Map_To_Dictionary_Correctly()
{
    // Arrange
    var markdown = File.ReadAllText("SettingParsers/TestFiles/series-sample-test-3.md");

    // Act
    var settings = PostParser.ParseSettings(markdown);

    // Assert
    Assert.Equal("post", settings["layout"]);
    Assert.Equal("some title", settings["title"]);
}
/// <summary>
/// Checks that <c>PostParser.ParseSettings</c> returns an empty collection when the input
/// consists only of the two header delimiters.
/// </summary>
public void Given_Header_With_Empty_Settings_Should_Return_Empty_Dictionary()
{
    // Arrange: a header with delimiters but nothing between them.
    const string emptyHeader = @"--- ---";

    // Act
    var settings = PostParser.ParseSettings(emptyHeader);

    // Assert
    Assert.Empty(settings);
}
/// <summary>
/// Checks that <c>PostParser.ParseSettings</c>, fed the contents of
/// <c>series-sample-test-4.md</c>, yields exactly two entries: "layout" = "post" and
/// "title" = "some title". (Per the test name the fixture's header contains blank lines.)
/// </summary>
public void Given_Header_Which_Contains_Blank_Lines_Should_Only_Parse_Out_Valid_Lines_Correctly()
{
    // Arrange
    var markdown = File.ReadAllText("SettingParsers/TestFiles/series-sample-test-4.md");

    // Act
    var settings = PostParser.ParseSettings(markdown);

    // Assert
    Assert.Equal("post", settings["layout"]);
    Assert.Equal("some title", settings["title"]);
    Assert.Equal(2, settings.Count);
}
/// <summary>
/// Checks that <c>PostParser.ParseDataFromFile</c>, fed the contents of
/// <c>series-sample-test-2.md</c>, returns an empty string as <c>Item1</c> and the
/// expected post text as <c>Item2</c>.
/// </summary>
public void Given_A_File_With_No_Header_Should_Return_Raw_Post_And_Empty_Header()
{
    // Arrange
    const string expected = @"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec porttitor non velit nec feugiat.";
    var markdown = File.ReadAllText("SettingParsers/TestFiles/series-sample-test-2.md");

    // Act
    var parsed = PostParser.ParseDataFromFile(markdown);

    // Assert: header part first, then the post body.
    Assert.Equal(string.Empty, parsed.Item1);
    Assert.Equal(expected, parsed.Item2);
}
/// <summary>
/// Checks that <c>PostParser.ParseDataFromFile</c>, fed the contents of
/// <c>series-sample-test-1.md</c>, returns the expected post text as <c>Item2</c>.
/// </summary>
public void Given_File_Should_Return_Tuple_With_Item2_Containing_Post()
{
    // Arrange: note the expected body starts with leading whitespace.
    const string expected = @" Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec porttitor non velit nec feugiat.";
    var markdown = File.ReadAllText("SettingParsers/TestFiles/series-sample-test-1.md");

    // Act
    var parsed = PostParser.ParseDataFromFile(markdown);

    // Assert
    Assert.Equal(expected, parsed.Item2);
}
/// <summary>
/// Checks that <c>PostParser.ParseSettings</c>, fed the contents of
/// <c>series-sample-test-5.md</c>, stores a <see cref="Series"/> under the "series" key
/// with non-empty <c>Parts</c>, <c>Name</c> "123" and <c>Current</c> 2.
/// </summary>
public void Given_Header_Which_Contains_Series_Should_Return_Series_Key_With_Series_Object()
{
    // Arrange
    var markdown = File.ReadAllText("SettingParsers/TestFiles/series-sample-test-5.md");

    // Act
    var settings = PostParser.ParseSettings(markdown);

    // Assert: the key must exist before its value is cast and inspected.
    Assert.True(settings.ContainsKey("series"));

    var parsedSeries = (Series)settings["series"];
    Assert.NotEmpty(parsedSeries.Parts);
    Assert.Equal("123", parsedSeries.Name);
    Assert.Equal(2, parsedSeries.Current);
}
/// <summary>
/// Checks that <c>PostParser.ParseDataFromFile</c>, fed the contents of
/// <c>series-sample-test-1.md</c>, returns the expected header text as <c>Item1</c>.
/// </summary>
public void Given_File_Should_Return_Tuple_With_Item1_Containing_Header()
{
    // Arrange
    const string expected = @"--- layout: post series: name: 123 current: 1 part: test part 1 part: test part 2 part: test part 3 title: some title ---";
    var markdown = File.ReadAllText("SettingParsers/TestFiles/series-sample-test-1.md");

    // Act
    var parsed = PostParser.ParseDataFromFile(markdown);

    // Assert
    Assert.Equal(expected, parsed.Item1);
}
/// <summary>
/// Creates the parser described by <paramref name="setting"/>.
/// </summary>
/// <param name="setting">
/// Supplies the block-begin, block-continue, block-end and escape values for the
/// <see cref="DefaultParser"/>, plus the <c>Optimize</c> switch.
/// </param>
/// <returns>
/// The <see cref="DefaultParser"/> itself when <c>Optimize</c> is false; otherwise a
/// <see cref="PostParser"/> wrapping it together with the four optimizer singletons.
/// </returns>
public static IParser BuildParser(ISetting setting)
{
    IParser baseParser = new DefaultParser(
        setting.BlockBegin,
        setting.BlockContinue,
        setting.BlockEnd,
        setting.Escape);

    if (!setting.Optimize)
    {
        return baseParser;
    }

    // Optimizers are passed in this fixed order.
    var optimizers = new IOptimizer[]
    {
        ConstantInvokeOptimizer.Instance,
        ConstantMapOptimizer.Instance,
        IfOptimizer.Instance,
        ReturnOptimizer.Instance
    };

    return new PostParser(baseParser, optimizers);
}
/// <summary>
/// Creates the page: runs <c>InitializeComponent</c> and then assigns a fresh
/// <see cref="PostParser"/> to <c>parser</c>.
/// </summary>
public PostPage()
{
    InitializeComponent();

    parser = new PostParser();
}
/// <summary>
/// Walks the site's pages starting at <c>_start</c>, loads every post linked from each page,
/// parses it with <see cref="PostParser"/> and saves the result through <c>_storage</c>.
/// </summary>
/// <remarks>
/// Progress is reported via <c>RaiseReport</c>/<c>RaisePage</c>. Exceptions are caught at the
/// post, page and whole-run level and reported via <c>RaiseError</c> (message only), so a
/// failing post or page does not stop the remaining work. The walk stops as soon as a post
/// URL equal to the value returned by <c>_storage.GetLastPostUrl()</c> is encountered.
/// </remarks>
/// <returns>A task that completes when the walk has finished or has been aborted by an error.</returns>
public async Task ParseAsync()
{
    try
    {
        RaiseReport("START");

        // Total number of pages on the site.
        var pageCount = await ParsePagesCountAsync().ConfigureAwait(false);
        RaiseReport($"Pages: {pageCount}");

        // URL of the last post stored by a previous run (null when there is none).
        var lastSavedUrl = _storage.GetLastPostUrl();
        RaiseReport($"Last: {lastSavedUrl ?? "New session"}");

        var reachedLastSaved = false;
        for (int pageNumber = _start; pageNumber <= pageCount && !reachedLastSaved; pageNumber++)
        {
            try
            {
                if (pageNumber == 1)
                {
                    // The first page is skipped.
                    continue;
                }

                var pageUrl = string.Format(PagePattern, pageNumber);
                RaisePage(new Uri(pageUrl));

                // Collect the post URLs listed on this page.
                var postUrls = await GetPostUrlsFromPageAsync(pageUrl).ConfigureAwait(false);

                var timer = Stopwatch.StartNew();
                foreach (var postUrl in postUrls)
                {
                    try
                    {
                        RaisePage(new Uri(postUrl));

                        if (postUrl == lastSavedUrl)
                        {
                            // Everything from here on was saved earlier: stop the whole walk.
                            reachedLastSaved = true;
                            break;
                        }

                        if (_start != 0 && _storage.IsExists(postUrl))
                        {
                            // Resuming from a given page: skip posts that are already stored.
                            continue;
                        }

                        // Download the post's HTML.
                        timer.Restart();
                        var postHtml = await LoadPageAsync(postUrl).ConfigureAwait(false);
                        RaiseReport($"Page loaded: [{timer.Elapsed.TotalMilliseconds}]");

                        if (string.IsNullOrEmpty(postHtml))
                        {
                            RaiseError($"Can't load page: {postUrl}");
                            continue;
                        }

                        // Parse the post, its comments and (optionally) its files.
                        timer.Restart();
                        using var postParser = new PostParser(postHtml);
                        var post = await postParser.GetPostDtoAsync().ConfigureAwait(false);
                        post.Comments = await postParser.GetPostCommentsAsync().ConfigureAwait(false);
                        post.Files = _saveFiles
                            ? await postParser.GetPostFilesAsync().ConfigureAwait(false)
                            : new List<string>();
                        RaiseReport($"Post parsed: [{timer.Elapsed.TotalMilliseconds}] ms");

                        // Persist the parsed post.
                        timer.Restart();
                        await _storage.SavePostAsync(postUrl, post).ConfigureAwait(false);
                        RaiseReport($"Post saved: [{timer.Elapsed.TotalMilliseconds}] ms");
                    }
                    catch (Exception postError)
                    {
                        // A failing post is reported and the next one is tried.
                        RaiseError(postError.Message);
                    }
                }
            }
            catch (Exception pageError)
            {
                // A failing page is reported and the next one is tried.
                RaiseError(pageError.Message);
            }
        }

        RaiseReport("DONE!");
    }
    catch (Exception runError)
    {
        RaiseError(runError.Message);
    }
}
/// <summary>
/// Checks that <c>PostParser.ParseSettings</c> returns an empty collection for an empty input string.
/// </summary>
public void Given_Empty_RawSettings_Should_Return_Empty_Dictionary()
{
    // Arrange
    const string rawSettings = "";

    // Act
    var settings = PostParser.ParseSettings(rawSettings);

    // Assert
    Assert.Empty(settings);
}