// Builds an ArticleExtractor from a DoubleTitleExtractorPipelineConfigurer and a
// fresh ExtractorConfiguration, extracts from an empty input string, and checks
// that the resulting article carries the title "Title2".
public void DoubleTitleExtractorTest()
{
    var pipelineConfigurer = new DoubleTitleExtractorPipelineConfigurer();
    var configuration = new ExtractorConfiguration();
    var extractor = new ArticleExtractor(pipelineConfigurer, configuration);

    // The input document is deliberately empty.
    var extracted = extractor.Extract("");

    extracted.Should().NotBeNull();
    extracted.Title.Should().NotBeNull();
    extracted.Title.Should().Be("Title2");
}
// Processes every entry of the feed at `uri`:
//   1. retrieves the entry's page,
//   2. extracts its article content (the extractor is passed "type-post"),
//   3. retrieves every extracted image and writes it through the "nested" layout,
//   4. loads templates\post.html, replaces div#article-placeholder with the
//      extracted content, and writes the document through the layout selected
//      by `mode`.
//
// mode: key into _modeMap selecting the layout used for the written documents.
// uri:  address handed to FeedRetriever.
//
// Throws ArgumentException when `mode` is not a key of _modeMap.
private static async Task Handler(string mode, Uri uri)
{
    IFileLayout docLayout;
    if (!_modeMap.TryGetValue(mode, out docLayout))
    {
        throw new ArgumentException($"The value of {nameof(mode)} should be single or package.");
    }

    var mapper = new FeedRetriever(uri);
    await mapper.ReadAsync();
    Trace.TraceInformation($"Retrieved {mapper.Count} links.");

    var retriever = new HttpRetriever();
    var article = new ArticleExtractor();
    var images = new ImageExtractor();
    var context = BrowsingContext.New(Configuration.Default);
    var inserter = new ArticleInserter();
    var imgWriter = new FileWriter(_modeMap["nested"]);
    var docWriter = new HtmlWriter(docLayout);

    foreach (var item in mapper)
    {
        Trace.TraceInformation($"{item.Title}\t{item.Link}");

        var itemUri = new Uri(item.Link);
        var input = await retriever.StartAsync(itemUri);
        var content = await article.ExtractAsync(input, "type-post");
        var imgs = await images.ExtractAllAsync(input);

        foreach (var img in imgs)
        {
            var source = ((IHtmlImageElement)img)?.Source;
            if (string.IsNullOrWhiteSpace(source))
            {
                // There is nothing to download for an image without a source.
                // Skip it instead of requesting a placeholder address and
                // writing the response under the path "/".
                Trace.TraceWarning($"Skipping image without a source in {item.Link}");
                continue;
            }

            var imgUri = new Uri(source);
            var file = await retriever.StartAsync(imgUri);
            await imgWriter.WriteAsync(file, imgUri.LocalPath);
            Trace.TraceInformation($"Path: {imgUri.AbsoluteUri} Length: {file.Length}");
        }

        var document = await context.CreateDocumentAsync("templates\\post.html");
        var placeholder = document.QuerySelector("div#article-placeholder");
        inserter.Replace(placeholder, content);

        // Segments[0] is the leading "/", so Segments[1] is the first path segment.
        // This assumes every feed link has at least one path segment; a link that
        // points at the site root would throw here.
        var fileName = itemUri.Segments[1].Replace("/", "");
        await docWriter.WriteAsync(document, fileName);
    }
}
// Loads the HTML of a fixed external page via LoadHtml, runs it through an
// extractor created from ExtractorConfiguration.Default, asserts that an
// article comes back, and prints its title and authors to the console.
public void ExtractArticleFromExternalHtmlTest()
{
    const string urlAddress = "https://www.kinopoisk.ru/film/7107/";

    var html = LoadHtml(urlAddress);

    // The assertion only fails when html is null, so it needs no surrounding check.
    Assert.NotNull(html, "Html should be not null");

    var extractor = ArticleExtractor.Create(ExtractorConfiguration.Default);
    var extracted = extractor.Extract(html);
    extracted.Should().NotBeNull();

    // A missing author list is printed as an empty string.
    var summary = "Loaded article:"
                  + "\nTitle = " + extracted.Title
                  + "\nAuthors = " + string.Join(",", extracted.Authors ?? Enumerable.Empty<string>());
    Console.WriteLine(summary);
}