Example #1
0
        /// <summary>
        /// Builds an <c>ArticleExtractor</c> from a <c>DoubleTitleExtractorPipelineConfigurer</c> and a fresh
        /// <c>ExtractorConfiguration</c>, extracts from an empty string and expects the title "Title2".
        /// </summary>
        public void DoubleTitleExtractorTest()
        {
            // Arrange
            var configurer = new DoubleTitleExtractorPipelineConfigurer();
            var settings   = new ExtractorConfiguration();
            var extractor  = new ArticleExtractor(configurer, settings);

            // Act
            var result = extractor.Extract("");

            // Assert
            result.Should().NotBeNull();
            result.Title.Should().NotBeNull();
            result.Title.Should().Be("Title2");
        }
Example #2
0
        /// <summary>
        /// Reads the feed behind <paramref name="uri"/> and processes every item in it: downloads the linked
        /// page, stores each image found on the page, and writes a document built from
        /// <c>templates\post.html</c> with the extracted article put in place of the placeholder element.
        /// </summary>
        /// <param name="mode">Key looked up in <c>_modeMap</c> to pick the layout used for the written documents.</param>
        /// <param name="uri">Location handed to the <c>FeedRetriever</c>.</param>
        /// <returns>A task that completes once every feed item has been processed.</returns>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="mode"/> has no entry in <c>_modeMap</c>.
        /// </exception>
        private static async Task Handler(string mode, Uri uri)
        {
            IFileLayout docLayout;

            if (!_modeMap.TryGetValue(mode, out docLayout))
            {
                throw new ArgumentException($"The value of {nameof(mode)} should be single or package.");
            }

            var mapper = new FeedRetriever(uri);
            await mapper.ReadAsync();

            Trace.TraceInformation($"Retrieved {mapper.Count} links.");

            var retriever = new HttpRetriever();
            var article   = new ArticleExtractor();
            var images    = new ImageExtractor();
            var context   = BrowsingContext.New(Configuration.Default);
            var inserter  = new ArticleInserter();
            // Images always go through the "nested" layout, independent of the requested mode.
            var imgWriter = new FileWriter(_modeMap["nested"]);
            var docWriter = new HtmlWriter(docLayout);

            foreach (var i in mapper)
            {
                Trace.TraceInformation($"{i.Title}\t{i.Link}");

                var itemUri = new Uri(i.Link);
                var input   = await retriever.StartAsync(itemUri);

                var content = await article.ExtractAsync(input, "type-post");

                var imgs = await images.ExtractAllAsync(input);

                foreach (var j in imgs)
                {
                    // A null element or a null Source falls back to the loopback address.
                    var source = ((IHtmlImageElement)j)?.Source ?? "http://127.0.0.1";
                    var imgUri = new Uri(source);
                    var file   = await retriever.StartAsync(imgUri);

                    await imgWriter.WriteAsync(file, imgUri.LocalPath);

                    Trace.TraceInformation($"Path: {imgUri.AbsoluteUri} Length: {file.Length}");
                }

                var document = await context.CreateDocumentAsync("templates\\post.html");

                var placeholder = document.QuerySelector("div#article-placeholder");
                inserter.Replace(placeholder, content);

                // The output name is the first path segment after the root "/", with slashes removed.
                // NOTE(review): assumes the item link has at least one path segment — confirm against the feed.
                var fileName = itemUri.Segments[1].Replace("/", "");

                await docWriter.WriteAsync(document, fileName);
            }
        }
Example #3
0
        /// <summary>
        /// Loads the HTML of an external page, runs the default-configured extractor over it,
        /// expects a non-null article and prints its title and authors to the console.
        /// </summary>
        public void ExtractArticleFromExternalHtmlTest()
        {
            const string urlAddress = "https://www.kinopoisk.ru/film/7107/";

            var pageHtml = LoadHtml(urlAddress);
            Assert.NotNull(pageHtml, "Html should be not null");

            var extractor = ArticleExtractor.Create(ExtractorConfiguration.Default);
            var extracted = extractor.Extract(pageHtml);

            extracted.Should().NotBeNull();

            // A missing author list is printed as an empty string.
            var authorNames = extracted.Authors ?? Enumerable.Empty<string>();

            Console.WriteLine("Loaded article:" +
                              "\nTitle = " + extracted.Title +
                              "\nAuthors = " + string.Join(",", authorNames));
        }