Пример #1
0
        /// <summary>
        /// Reads article content from the given stream.
        /// </summary>
        /// <param name="stream">The stream to extract the content from.</param>
        /// <param name="encoding">The stream encoding; it is handed to the extraction step and stored on the returned article.</param>
        /// <param name="options">The transform options.</param>
        /// <param name="cancellationToken">The cancellation token; it is checked before the extraction starts and again before the plain-text conversion.</param>
        /// <returns>
        /// An article with extracted content and meta information. For stream input the page count
        /// is always 1 and the image list is always empty.
        /// </returns>
        /// <exception cref="ReadException">The content extraction failed; the message of the original exception is reused.</exception>
        /// <exception cref="OperationCanceledException">Cancellation was requested through <paramref name="cancellationToken"/>.</exception>
        public Article Read(Stream stream, Encoding encoding, ReadOptions options, CancellationToken cancellationToken)
        {
            // honor a cancellation request before doing any work
            cancellationToken.ThrowIfCancellationRequested();

            TranscodingResult transcodingResult;

            try
            {
                // transcode content (no URL is available for stream input, hence the null first argument)
                transcodingResult = ExtractReadableInformation(null, stream, options, encoding);
            }
            catch (Exception exc)
            {
                // NOTE(review): only the message survives the wrapping; if ReadException offers a
                // constructor taking an inner exception, passing exc along would keep the stack trace.
                throw new ReadException(exc.Message);
            }

            cancellationToken.ThrowIfCancellationRequested();

            // get word count and plain text
            string plainContent;
            int wordCount = 0;

            try
            {
                plainContent = HtmlUtilities.ConvertToPlainText(transcodingResult.ExtractedContent);
                wordCount = HtmlUtilities.CountWords(plainContent);
            }
            catch
            {
                // deliberate best effort: a failed plain-text conversion must not fail the whole read;
                // the article is then reported with no plain content and a word count of 0
                plainContent = null;
            }

            // A missing, relative or malformed next-page URL leaves nextPage null instead of
            // aborting the read with a UriFormatException.
            Uri nextPage;
            Uri.TryCreate(transcodingResult.NextPageUrl, UriKind.Absolute, out nextPage);

            // create article
            return new Article
            {
                Title = transcodingResult.ExtractedTitle,
                Description = transcodingResult.ExtractedDescription,
                Content = transcodingResult.ExtractedContent,
                // content only counts as extracted when it actually contains words
                ContentExtracted = transcodingResult.ContentExtracted && wordCount > 0,
                // _rawHTML is an instance field that is not assigned in this method;
                // presumably it is populated during extraction - TODO confirm
                Raw = _rawHTML,
                PlainContent = plainContent,
                WordCount = wordCount,
                PageCount = 1,
                FrontImage = transcodingResult.ExtractedImage,
                Images = new ArticleImage[0],
                Favicon = transcodingResult.ExtractedFavicon,
                NextPage = nextPage,
                Encoding = encoding
            };
        }
Пример #2
0
        /// <summary>
        /// Reads article content from the given URI.
        /// </summary>
        /// <param name="uri">An URI to extract the content from.</param>
        /// <param name="options">The transform options; when <c>null</c>, the defaults from <c>ReadOptions.CreateDefault()</c> are used.</param>
        /// <param name="cancellationToken">The cancellation token; it is forwarded to the underlying request.</param>
        /// <returns>
        /// An article with extracted content and meta information. The image list is built once,
        /// so image IDs stay stable no matter how often it is enumerated.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is <c>null</c>.</exception>
        /// <exception cref="ReadException"></exception>
        /// <exception cref="InvalidOperationException"></exception>
        /// <exception cref="OperationCanceledException"></exception>
        public async Task<Article> Read(Uri uri, ReadOptions options = null, CancellationToken cancellationToken = default(CancellationToken))
        {
            if (uri == null)
            {
                throw new ArgumentNullException("uri");
            }

            _currentPages = new List<string>();

            string uriString = uri.OriginalString;

            if (options == null)
            {
                options = ReadOptions.CreateDefault();
            }

            // replace domain when URI is marked as faulty
            // (every replacement is applied to the original string, so if several entries match, the last one wins)
            foreach (string faultyUri in _redirectFaultyMobileURIs.Keys)
            {
                if (uriString.Contains(faultyUri))
                {
                    uri = new Uri(uriString.Replace(faultyUri, _redirectFaultyMobileURIs[faultyUri]));
                }
            }

            // make async request
            Response response = await Request(uri, options, null, cancellationToken);
            var result = response.TranscodingResult;

            // get images from article
            // The projection mutates the captured id counter, so it is materialized right away:
            // a deferred query would hand out new objects with different IDs on every enumeration.
            int id = 1;
            ArticleImage[] images = result.Images
                .Select(image =>
                {
                    // an image whose src is missing or not an absolute URI keeps a null Uri
                    Uri imageUri;
                    Uri.TryCreate(image.GetAttributeValue("src", null), UriKind.Absolute, out imageUri);

                    return new ArticleImage
                    {
                        ID = (id++).ToString(),
                        Uri = imageUri,
                        Title = image.GetAttributeValue("title", null),
                        AlternativeText = image.GetAttributeValue("alt", null)
                    };
                })
                .ToArray();

            // get word count and plain text
            string plainContent;
            int wordCount = 0;

            try
            {
                plainContent = HtmlUtilities.ConvertToPlainText(result.ExtractedContent);
                wordCount = HtmlUtilities.CountWords(plainContent);
            }
            catch
            {
                // deliberate best effort: a failed plain-text conversion must not fail the whole read;
                // the article is then reported with no plain content and a word count of 0
                plainContent = null;
            }

            // A missing, relative or malformed next-page URL leaves nextPage null instead of
            // aborting the read with a UriFormatException.
            Uri nextPage;
            Uri.TryCreate(result.NextPageUrl, UriKind.Absolute, out nextPage);

            // create article
            return new Article
            {
                Title = result.ExtractedTitle,
                Description = result.ExtractedDescription,
                Content = result.ExtractedContent,
                // content only counts as extracted when it actually contains words
                ContentExtracted = result.ContentExtracted && wordCount > 0,
                // _rawHTML is an instance field that is not assigned in this method;
                // presumably it is populated while the request is processed - TODO confirm
                Raw = _rawHTML,
                PlainContent = plainContent,
                WordCount = wordCount,
                PageCount = response.PageCount,
                FrontImage = result.ExtractedImage,
                Images = images,
                Favicon = result.ExtractedFavicon,
                NextPage = nextPage,
                Encoding = response.Encoding
            };
        }