/// <summary>
/// Reads article content from the given stream.
/// </summary>
/// <param name="stream">The stream to extract the content from.</param>
/// <param name="encoding">The stream encoding.</param>
/// <param name="options">The transform options.</param>
/// <param name="cancellationToken">
/// The cancellation token. It is checked before the content is transcoded and
/// again before the plain-text conversion.
/// </param>
/// <returns>
/// An article with extracted content and meta information. The article always
/// reports a single page and an empty image list.
/// </returns>
/// <exception cref="ReadException">Extracting the readable information failed.</exception>
/// <exception cref="OperationCanceledException">Cancellation was requested.</exception>
public Article Read(Stream stream, Encoding encoding, ReadOptions options, CancellationToken cancellationToken)
{
    // Honor the token before the extraction step; it is deliberately checked
    // outside the try block so cancellation is not re-wrapped as ReadException.
    cancellationToken.ThrowIfCancellationRequested();

    // readability
    TranscodingResult transcodingResult;

    try
    {
        // transcode content
        transcodingResult = ExtractReadableInformation(null, stream, options, encoding);
    }
    catch (Exception exc)
    {
        // NOTE(review): only the message survives here; if ReadException offers an
        // (string, Exception) constructor, pass exc as the inner exception.
        throw new ReadException(exc.Message);
    }

    cancellationToken.ThrowIfCancellationRequested();

    // get word count and plain text
    string plainContent;
    int wordCount = 0;

    try
    {
        plainContent = HtmlUtilities.ConvertToPlainText(transcodingResult.ExtractedContent);
        wordCount = HtmlUtilities.CountWords(plainContent);
    }
    catch
    {
        // Best effort: a failure while deriving the plain text must not fail the
        // whole read. The article is then returned without plain content and with
        // a word count of zero, which also marks the content as not extracted.
        plainContent = null;
    }

    // create article
    return new Article
    {
        Title = transcodingResult.ExtractedTitle,
        Description = transcodingResult.ExtractedDescription,
        Content = transcodingResult.ExtractedContent,
        // Content only counts as extracted when it actually yields words.
        ContentExtracted = transcodingResult.ContentExtracted && wordCount > 0,
        Raw = _rawHTML,
        PlainContent = plainContent,
        WordCount = wordCount,
        PageCount = 1,
        FrontImage = transcodingResult.ExtractedImage,
        Images = new ArticleImage[0],
        Favicon = transcodingResult.ExtractedFavicon,
        NextPage = transcodingResult.NextPageUrl != null
            ? new Uri(transcodingResult.NextPageUrl, UriKind.Absolute)
            : null,
        Encoding = encoding
    };
}
/// <summary>
/// Reads article content from the given URI.
/// </summary>
/// <param name="uri">An URI to extract the content from.</param>
/// <param name="options">The transform options. When <c>null</c>, <c>ReadOptions.CreateDefault()</c> is used.</param>
/// <param name="cancellationToken">The cancellation token; it is passed on to the request.</param>
/// <returns>
/// An article with extracted content and meta information.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="uri"/> is <c>null</c>.</exception>
/// <exception cref="ReadException"></exception>
/// <exception cref="InvalidOperationException"></exception>
/// <exception cref="OperationCanceledException"></exception>
public async Task<Article> Read(Uri uri, ReadOptions options = null, CancellationToken cancellationToken = default(CancellationToken))
{
    if (uri == null)
    {
        throw new ArgumentNullException("uri");
    }

    _currentPages = new List<string>();

    if (options == null)
    {
        options = ReadOptions.CreateDefault();
    }

    // replace domain when URI is marked as faulty
    // Every replacement starts from the original URI string, so when several
    // entries match, the last matching entry determines the final URI.
    string uriString = uri.OriginalString;

    foreach (string faultyUri in _redirectFaultyMobileURIs.Keys)
    {
        if (uriString.Contains(faultyUri))
        {
            uri = new Uri(uriString.Replace(faultyUri, _redirectFaultyMobileURIs[faultyUri]));
        }
    }

    // make async request
    Response response = await Request(uri, options, null, cancellationToken);
    var result = response.TranscodingResult;

    // get images from article
    // The sequence is materialized once. A lazy query with a captured counter would
    // rebuild the images, and hand out ever-increasing IDs, each time the caller
    // enumerates Article.Images. Images whose "src" is not an absolute URI are kept
    // with a null Uri; no de-duplication or filtering is applied.
    ArticleImage[] images = result.Images
        .Select((image, index) =>
        {
            Uri imageUri = null;
            Uri.TryCreate(image.GetAttributeValue("src", null), UriKind.Absolute, out imageUri);

            return new ArticleImage()
            {
                // IDs are 1-based and follow the order of the extracted images.
                ID = (index + 1).ToString(),
                Uri = imageUri,
                Title = image.GetAttributeValue("title", null),
                AlternativeText = image.GetAttributeValue("alt", null)
            };
        })
        .ToArray();

    // get word count and plain text
    string plainContent;
    int wordCount = 0;

    try
    {
        plainContent = HtmlUtilities.ConvertToPlainText(result.ExtractedContent);
        wordCount = HtmlUtilities.CountWords(plainContent);
    }
    catch
    {
        // Best effort: a failure while deriving the plain text must not fail the
        // whole read. The article is then returned without plain content and with
        // a word count of zero, which also marks the content as not extracted.
        plainContent = null;
    }

    // create article
    return new Article()
    {
        Title = result.ExtractedTitle,
        Description = result.ExtractedDescription,
        Content = result.ExtractedContent,
        // Content only counts as extracted when it actually yields words.
        ContentExtracted = result.ContentExtracted && wordCount > 0,
        Raw = _rawHTML,
        PlainContent = plainContent,
        WordCount = wordCount,
        PageCount = response.PageCount,
        FrontImage = result.ExtractedImage,
        Images = images,
        Favicon = result.ExtractedFavicon,
        NextPage = result.NextPageUrl != null
            ? new Uri(result.NextPageUrl, UriKind.Absolute)
            : null,
        Encoding = response.Encoding
    };
}