Beispiel #1
0
        /// <summary>
        /// Reads article content from the given URI.
        /// </summary>
        /// <param name="uri">The URI to extract the content from.</param>
        /// <param name="options">
        /// The transform options. When <c>null</c>, the result of
        /// <c>ReadOptions.CreateDefault()</c> is used.
        /// </param>
        /// <param name="cancellationToken">The cancellation token; it is passed on to the request.</param>
        /// <returns>
        /// An article with extracted content and meta information.
        /// </returns>
        /// <remarks>
        /// The image list is built once, so every image keeps a stable, 1-based ID no matter
        /// how often <c>Article.Images</c> is enumerated. A next-page URL that is not a
        /// well-formed absolute URI results in <c>NextPage</c> being <c>null</c>.
        /// </remarks>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is <c>null</c>.</exception>
        /// <exception cref="ReadException"></exception>
        /// <exception cref="InvalidOperationException"></exception>
        /// <exception cref="OperationCanceledException"></exception>
        public async Task <Article> Read(Uri uri, ReadOptions options = null, CancellationToken cancellationToken = default(CancellationToken))
        {
            if (uri == null)
            {
                throw new ArgumentNullException("uri");
            }

            // every read starts with a fresh page list
            _currentPages = new List<string>();

            if (options == null)
            {
                options = ReadOptions.CreateDefault();
            }

            // replace domain when URI is marked as faulty
            string uriString = uri.OriginalString;

            foreach (string faultyUri in _redirectFaultyMobileURIs.Keys)
            {
                if (uriString.Contains(faultyUri))
                {
                    uri = new Uri(uriString.Replace(faultyUri, _redirectFaultyMobileURIs[faultyUri]));
                }
            }

            // make async request
            Response response = await Request(uri, options, null, cancellationToken);

            // Get images from article. The query is materialized right away: a deferred
            // query would create new objects (and, with a captured counter, new IDs)
            // each time the caller enumerates the images.
            List<ArticleImage> images = response.TranscodingResult.Images
                .Select((image, index) =>
                {
                    // sources that are missing or not absolute leave the URI null
                    Uri imageUri;
                    Uri.TryCreate(image.GetAttributeValue("src", null), UriKind.Absolute, out imageUri);

                    return new ArticleImage()
                    {
                        ID = (index + 1).ToString(),
                        Uri = imageUri,
                        Title = image.GetAttributeValue("title", null),
                        AlternativeText = image.GetAttributeValue("alt", null)
                    };
                })
                .ToList();

            // get word count and plain text
            string plainContent;
            int wordCount = 0;

            try
            {
                plainContent = HtmlUtilities.ConvertToPlainText(response.TranscodingResult.ExtractedContent);
                wordCount = HtmlUtilities.CountWords(plainContent);
            }
            catch
            {
                // Best effort: a failing plain-text conversion must not fail the whole read.
                // The article is then returned without plain content and with a word count of 0.
                plainContent = null;
            }

            // A missing, relative or malformed next-page URL yields null instead of
            // throwing after the article itself was extracted successfully.
            Uri nextPage;
            Uri.TryCreate(response.TranscodingResult.NextPageUrl, UriKind.Absolute, out nextPage);

            // create article
            return new Article()
            {
                Title = response.TranscodingResult.ExtractedTitle,
                Description = response.TranscodingResult.ExtractedDescription,
                Content = response.TranscodingResult.ExtractedContent,
                // content only counts as extracted when it contains at least one word
                ContentExtracted = response.TranscodingResult.ContentExtracted && wordCount > 0,
                Raw = _rawHTML,
                PlainContent = plainContent,
                WordCount = wordCount,
                PageCount = response.PageCount,
                FrontImage = response.TranscodingResult.ExtractedImage,
                Images = images,
                Favicon = response.TranscodingResult.ExtractedFavicon,
                NextPage = nextPage,
                Encoding = response.Encoding
            };
        }
Beispiel #2
0
 /// <summary>
 /// Extracts an article from the supplied stream using the default read options.
 /// </summary>
 /// <param name="stream">The stream that provides the content to extract.</param>
 /// <param name="encoding">The encoding of the stream.</param>
 /// <param name="cancellationToken">The cancellation token, handed on unchanged.</param>
 /// <returns>
 /// The article produced by the overload that takes explicit read options.
 /// </returns>
 public Article Read(Stream stream, Encoding encoding, CancellationToken cancellationToken)
 {
     // no options supplied by the caller, so fall back to the defaults
     ReadOptions defaultOptions = ReadOptions.CreateDefault();

     return Read(stream, encoding, defaultOptions, cancellationToken);
 }