Beispiel #1
0
    /// <summary>
    /// Fetches a single page, transcodes it into readable content and, when
    /// multi-page download is enabled, recursively fetches and merges the
    /// following pages.
    /// </summary>
    /// <param name="uri">The URI of the page to fetch.</param>
    /// <param name="options">The read options applied to this page.</param>
    /// <param name="previousResponse">
    /// The response accumulated from the pages fetched so far, or <c>null</c>
    /// when <paramref name="uri"/> is the first page.
    /// </param>
    /// <param name="cancellationToken">The cancellation token handed to the HTTP request.</param>
    /// <returns>
    /// The response for this page merged with <paramref name="previousResponse"/>
    /// (when given). <paramref name="previousResponse"/> itself is returned when the
    /// URI was already fetched or the page adds no new content.
    /// </returns>
    /// <exception cref="ReadException">
    /// The HTTP request failed, the status code was not 200 (OK), or the content
    /// could not be transcoded.
    /// </exception>
    private async Task<Response> Request(Uri uri, ReadOptions options, Response previousResponse, CancellationToken cancellationToken)
    {
      // URI already fetched
      if (previousResponse != null && _currentPages.Contains(uri.OriginalString))
      {
        return previousResponse;
      }
      _currentPages.Add(uri.OriginalString);

      HttpResponseMessage response = null;
      TranscodingResult transcodingResult;
      Encoding encoding;

      using (HttpRequestMessage request = new HttpRequestMessage(HttpMethod.Get, uri))
      {
        // make async request
        try
        {
          response = await _httpClient.SendAsync(request, cancellationToken);
        }
        catch (HttpRequestException exc)
        {
          throw new ReadException(exc.Message);
        }

        // validate HTTP response
        if (response.StatusCode != HttpStatusCode.OK)
        {
          string exceptionString = String.Format("Request error: {0} ({1})", response.ReasonPhrase, (int)response.StatusCode);

          // the response is not used any further, so release it before bailing out
          response.Dispose();

          throw new ReadException(exceptionString);
        }
      }

      // read response
      Stream responseStream;
      try
      {
        responseStream = await response.Content.ReadAsStreamAsync();
      }
      catch
      {
        // do not leak the response when its content cannot be read
        response.Dispose();
        throw;
      }

      // the Content-Type header is optional, so the charset may be null
      string charset = response.Content.Headers.ContentType?.CharSet;

      // handle deep links
      if (options.UseDeepLinks)
      {
        _transcoder.AnchorHrefTranformer = ReverseDeepLinks;
      }
      else
      {
        _transcoder.AnchorHrefTranformer = null;
      }

      // readability
      try
      {
        // charset found in HTTP headers
        encoding = _encoder.GetEncodingFromString(charset);

        // transcode content
        transcodingResult = ExtractReadableInformation(uri, responseStream, options, encoding);

        // get encoding found in HTML
        Encoding encodingFromHTML = _encoder.GetEncodingFromString(transcodingResult.Charset);

        // extract again if encoding didn't match or failed to retrieve
        // NOTE(review): the first clause requires a non-null encoding while the header
        // charset is empty; confirm against GetEncodingFromString whether
        // `encoding == null` was intended.
        if ((encoding != null && String.IsNullOrEmpty(charset))
          ||
          (options.PreferHTMLEncoding && !String.Equals(charset, transcodingResult.Charset, StringComparison.OrdinalIgnoreCase)))
        {
          transcodingResult = ExtractReadableInformation(uri, responseStream, options, encodingFromHTML);
          encoding = encodingFromHTML;
        }
      }
      catch (Exception exc)
      {
        throw new ReadException(exc.Message);
      }
      finally
      {
        response.Dispose();
        responseStream.Dispose();
      }

      Response newResponse = new Response()
      {
        TranscodingResult = transcodingResult,
        PageCount = 1,
        Encoding = encoding
      };

      // in some special cases there are multiple pages which are only comments or do not contain new content.
      // if this is the case we will break here and return the pages collected so far.
      if (previousResponse != null && previousResponse.TranscodingResult.ExtractedContent.Contains(transcodingResult.ExtractedContent))
      {
        previousResponse.TranscodingResult.NextPageUrl = null;
        return previousResponse;
      }

      // everything collected so far; this is what gets returned when no further
      // page is fetched or fetching a further page fails
      Response accumulated = newResponse;

      // multiple pages are available
      try
      {
        // evaluate the continuation condition before merging, so the merge
        // cannot influence the page count or the next-page URL being checked
        string nextPageUrl = transcodingResult.NextPageUrl;
        bool fetchNextPage = options.MultipageDownload
          && nextPageUrl != null
          && (previousResponse == null || previousResponse.PageCount < _options.MultipageLimit);

        // this is not the first page
        if (previousResponse != null)
        {
          accumulated = MergeResponses(previousResponse, newResponse);
        }

        if (fetchNextPage)
        {
          return await Request(new Uri(nextPageUrl), new ReadOptions()
          {
            PrettyPrint = options.PrettyPrint,
            UseDeepLinks = options.UseDeepLinks,
            MultipageDownload = true
          }, accumulated, cancellationToken);
        }
      }
      // silently fail when next pages fail to download; the pages fetched
      // successfully so far are still returned.
      // NOTE(review): this also swallows cancellation raised while fetching a later page.
      catch { }

      return accumulated;
    }
Beispiel #2
0
    /// <summary>
    /// Decodes the given stream from its beginning into text, stores that text
    /// as the raw HTML and runs it through the transcoder.
    /// </summary>
    /// <param name="uri">The URI the content was loaded from; passed on to the transcoder.</param>
    /// <param name="textStream">A seekable stream holding the content; it is rewound before reading.</param>
    /// <param name="options">The options controlling how the result is serialized.</param>
    /// <param name="encoding">The encoding used to decode the stream; UTF-8 when <c>null</c>.</param>
    /// <returns>The result produced by the transcoder.</returns>
    protected TranscodingResult ExtractReadableInformation(
      Uri uri,
      Stream textStream,
      ReadOptions options,
      Encoding encoding = null)
    {
      // always decode from the very first byte, regardless of earlier reads
      textStream.Position = 0;

      // the reader is intentionally not disposed: that would close the
      // underlying stream, which the caller may read again with another encoding
      Encoding effectiveEncoding = encoding ?? Encoding.UTF8;
      StreamReader reader = new StreamReader(textStream, effectiveEncoding);
      _rawHTML = reader.ReadToEnd();

      // describe the document to transcode
      TranscodingInput input = new TranscodingInput(_rawHTML);
      input.Url = uri.ToString();

      // describe how the resulting DOM is serialized
      DomSerializationParams serialization = new DomSerializationParams();
      serialization.BodyOnly = !options.HasHeaderTags;
      serialization.NoHeadline = !options.HasHeadline;
      serialization.PrettyPrint = options.PrettyPrint;
      serialization.DontIncludeContentTypeMetaElement = true;
      serialization.DontIncludeMobileSpecificMetaElements = true;
      serialization.DontIncludeDocTypeMetaElement = false;
      serialization.DontIncludeGeneratorMetaElement = true;
      serialization.ReplaceImagesWithPlaceholders = options.ReplaceImagesWithPlaceholders;
      input.DomSerializationParams = serialization;

      // process/transcode HTML
      return _transcoder.Transcode(input);
    }
Beispiel #3
0
        /// <summary>
        /// Fetches a single page, transcodes it into readable content and, when
        /// multi-page download is enabled, recursively fetches and merges the
        /// following pages.
        /// </summary>
        /// <param name="uri">The URI of the page to fetch.</param>
        /// <param name="options">The read options applied to this page.</param>
        /// <param name="previousResponse">
        /// The response accumulated from the pages fetched so far, or <c>null</c>
        /// when <paramref name="uri"/> is the first page.
        /// </param>
        /// <param name="cancellationToken">The cancellation token handed to the HTTP request.</param>
        /// <returns>
        /// The response for this page merged with <paramref name="previousResponse"/>
        /// (when given). <paramref name="previousResponse"/> itself is returned when the
        /// URI was already fetched or the page adds no new content.
        /// </returns>
        /// <exception cref="ReadException">
        /// The HTTP request failed, the status code was not 200 (OK), or the content
        /// could not be transcoded.
        /// </exception>
        private async Task <Response> Request(Uri uri, ReadOptions options, Response previousResponse, CancellationToken cancellationToken)
        {
            // URI already fetched
            if (previousResponse != null && _currentPages.Contains(uri.OriginalString))
            {
                return previousResponse;
            }
            _currentPages.Add(uri.OriginalString);

            HttpResponseMessage response = null;
            TranscodingResult   transcodingResult;
            Encoding            encoding;

            using (HttpRequestMessage request = new HttpRequestMessage(HttpMethod.Get, uri))
            {
                // make async request
                try
                {
                    response = await _httpClient.SendAsync(request, cancellationToken);
                }
                catch (HttpRequestException exc)
                {
                    throw new ReadException(exc.Message);
                }

                // validate HTTP response
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    string exceptionString = String.Format("Request error: {0} ({1})", response.ReasonPhrase, (int)response.StatusCode);

                    // the response is not used any further, so release it before bailing out
                    response.Dispose();

                    throw new ReadException(exceptionString);
                }
            }

            // read response
            Stream responseStream;
            try
            {
                responseStream = await response.Content.ReadAsStreamAsync();
            }
            catch
            {
                // do not leak the response when its content cannot be read
                response.Dispose();
                throw;
            }

            // the Content-Type header is optional; without the null-conditional
            // access a response lacking it would raise a NullReferenceException
            string charset = response.Content.Headers.ContentType?.CharSet;

            // handle deep links
            if (options.UseDeepLinks)
            {
                _transcoder.AnchorHrefTranformer = ReverseDeepLinks;
            }
            else
            {
                _transcoder.AnchorHrefTranformer = null;
            }

            // readability
            try
            {
                // charset found in HTTP headers
                encoding = _encoder.GetEncodingFromString(charset);

                // transcode content
                transcodingResult = ExtractReadableInformation(uri, responseStream, options, encoding);

                // get encoding found in HTML
                Encoding encodingFromHTML = _encoder.GetEncodingFromString(transcodingResult.Charset);

                // extract again if encoding didn't match or failed to retrieve
                // NOTE(review): the first clause requires a non-null encoding while the header
                // charset is empty; confirm against GetEncodingFromString whether
                // `encoding == null` was intended.
                if ((encoding != null && String.IsNullOrEmpty(charset))
                    ||
                    (options.PreferHTMLEncoding && !String.Equals(charset, transcodingResult.Charset, StringComparison.OrdinalIgnoreCase)))
                {
                    transcodingResult = ExtractReadableInformation(uri, responseStream, options, encodingFromHTML);
                    encoding          = encodingFromHTML;
                }
            }
            catch (Exception exc)
            {
                throw new ReadException(exc.Message);
            }
            finally
            {
                response.Dispose();
                responseStream.Dispose();
            }

            Response newResponse = new Response()
            {
                TranscodingResult = transcodingResult,
                PageCount         = 1,
                Encoding          = encoding
            };

            // in some special cases there are multiple pages which are only comments or do not contain new content.
            // if this is the case we will break here and return the pages collected so far.
            if (previousResponse != null && previousResponse.TranscodingResult.ExtractedContent.Contains(transcodingResult.ExtractedContent))
            {
                previousResponse.TranscodingResult.NextPageUrl = null;
                return previousResponse;
            }

            // everything collected so far; this is what gets returned when no further
            // page is fetched or fetching a further page fails
            Response accumulated = newResponse;

            // multiple pages are available
            try
            {
                // evaluate the continuation condition before merging, so the merge
                // cannot influence the page count or the next-page URL being checked
                string nextPageUrl   = transcodingResult.NextPageUrl;
                bool   fetchNextPage = options.MultipageDownload
                                       && nextPageUrl != null
                                       && (previousResponse == null || previousResponse.PageCount < _options.MultipageLimit);

                // this is not the first page
                if (previousResponse != null)
                {
                    accumulated = MergeResponses(previousResponse, newResponse);
                }

                if (fetchNextPage)
                {
                    return await Request(new Uri(nextPageUrl), new ReadOptions()
                    {
                        PrettyPrint = options.PrettyPrint,
                        UseDeepLinks = options.UseDeepLinks,
                        MultipageDownload = true
                    }, accumulated, cancellationToken);
                }
            }
            // silently fail when next pages fail to download; the pages fetched
            // successfully so far are still returned.
            // NOTE(review): this also swallows cancellation raised while fetching a later page.
            catch { }

            return accumulated;
        }
Beispiel #4
0
      /// <summary>
    /// Reads article content from the given URI.
    /// </summary>
    /// <param name="uri">An URI to extract the content from.</param>
    /// <param name="options">The transform options; defaults are used when <c>null</c>.</param>
    /// <param name="cancellationToken">The cancellation token.</param>
    /// <returns>
    /// An article with extracted content and meta information.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="uri"/> is <c>null</c>.</exception>
    /// <exception cref="ReadException">The page could not be fetched or transcoded.</exception>
    /// <exception cref="InvalidOperationException"></exception>
    /// <exception cref="OperationCanceledException">The request was cancelled through <paramref name="cancellationToken"/>.</exception>
    public async Task<Article> Read(Uri uri, ReadOptions options = null, CancellationToken cancellationToken = default(CancellationToken))
    {
      if (uri == null)
      {
        throw new ArgumentNullException(nameof(uri));
      }

      // start every read with an empty list of already fetched pages
      _currentPages = new List<string>();

      Response response;
      string uriString = uri.OriginalString;

      if (options == null)
      {
        options = ReadOptions.CreateDefault();
      }

      // replace domain when URI is marked as faulty
      foreach (string faultyUri in _redirectFaultyMobileURIs.Keys)
      {
        if (uriString.Contains(faultyUri))
        {
          uri = new Uri(uriString.Replace(faultyUri, _redirectFaultyMobileURIs[faultyUri]));
        }
      }

      // make async request
      response = await Request(uri, options, null, cancellationToken);

      // get images from article.
      // The projection increments a captured counter, so it is materialized
      // exactly once here; a deferred query would hand out new IDs on every
      // enumeration of the article's images.
      int id = 1;
      IEnumerable<ArticleImage> images = response.TranscodingResult.Images
        .Select(image =>
        {
          Uri imageUri = null;
          Uri.TryCreate(image.GetAttributeValue("src", null), UriKind.Absolute, out imageUri);

          return new ArticleImage()
          {
            ID = (id++).ToString(),
            Uri = imageUri,
            Title = image.GetAttributeValue("title", null),
            AlternativeText = image.GetAttributeValue("alt", null)
          };
        })
        .ToList();

      // get word count and plain text
      string plainContent;
      int wordCount = 0;

      try
      {
        plainContent = HtmlUtilities.ConvertToPlainText(response.TranscodingResult.ExtractedContent);
        wordCount = HtmlUtilities.CountWords(plainContent);
      }
      catch
      {
        // plain text is best effort; the article is still returned without it
        plainContent = null;
      }

      // create article
      return new Article()
      {
        Title = response.TranscodingResult.ExtractedTitle,
        Description = response.TranscodingResult.ExtractedDescription,
        Content = response.TranscodingResult.ExtractedContent,
        // content only counts as extracted when it actually contains words
        ContentExtracted = response.TranscodingResult.ContentExtracted && wordCount > 0,
        Raw = _rawHTML,
        PlainContent = plainContent,
        WordCount = wordCount,
        PageCount = response.PageCount,
        FrontImage = response.TranscodingResult.ExtractedImage,
        Images = images,
        Favicon = response.TranscodingResult.ExtractedFavicon,
        NextPage = response.TranscodingResult.NextPageUrl != null ? new Uri(response.TranscodingResult.NextPageUrl, UriKind.Absolute) : null,
        Encoding = response.Encoding
      };
    }
Beispiel #5
0
        /// <summary>
        /// Reads article content from the given URI.
        /// </summary>
        /// <param name="uri">An URI to extract the content from.</param>
        /// <param name="options">The transform options; defaults are used when <c>null</c>.</param>
        /// <param name="cancellationToken">The cancellation token.</param>
        /// <returns>
        /// An article with extracted content and meta information.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="uri"/> is <c>null</c>.</exception>
        /// <exception cref="ReadException">The page could not be fetched or transcoded.</exception>
        /// <exception cref="InvalidOperationException"></exception>
        /// <exception cref="OperationCanceledException">The request was cancelled through <paramref name="cancellationToken"/>.</exception>
        public async Task <Article> Read(Uri uri, ReadOptions options = null, CancellationToken cancellationToken = default(CancellationToken))
        {
            if (uri == null)
            {
                throw new ArgumentNullException(nameof(uri));
            }

            // start every read with an empty list of already fetched pages
            _currentPages = new List <string>();

            Response response;
            string   uriString = uri.OriginalString;

            if (options == null)
            {
                options = ReadOptions.CreateDefault();
            }

            // replace domain when URI is marked as faulty
            foreach (string faultyUri in _redirectFaultyMobileURIs.Keys)
            {
                if (uriString.Contains(faultyUri))
                {
                    uri = new Uri(uriString.Replace(faultyUri, _redirectFaultyMobileURIs[faultyUri]));
                }
            }

            // make async request
            response = await Request(uri, options, null, cancellationToken);

            // get images from article.
            // The projection increments a captured counter, so it is materialized
            // exactly once here; a deferred query would hand out new IDs on every
            // enumeration of the article's images.
            int id = 1;
            IEnumerable <ArticleImage> images = response.TranscodingResult.Images
                                                .Select(image =>
            {
                Uri imageUri = null;
                Uri.TryCreate(image.GetAttributeValue("src", null), UriKind.Absolute, out imageUri);

                return new ArticleImage()
                {
                    ID = (id++).ToString(),
                    Uri = imageUri,
                    Title = image.GetAttributeValue("title", null),
                    AlternativeText = image.GetAttributeValue("alt", null)
                };
            })
                                                .ToList();

            // get word count and plain text
            string plainContent;
            int    wordCount = 0;

            try
            {
                plainContent = HtmlUtilities.ConvertToPlainText(response.TranscodingResult.ExtractedContent);
                wordCount    = HtmlUtilities.CountWords(plainContent);
            }
            catch
            {
                // plain text is best effort; the article is still returned without it
                plainContent = null;
            }

            // create article
            return new Article()
            {
                Title = response.TranscodingResult.ExtractedTitle,
                Description = response.TranscodingResult.ExtractedDescription,
                Content = response.TranscodingResult.ExtractedContent,
                // content only counts as extracted when it actually contains words
                ContentExtracted = response.TranscodingResult.ContentExtracted && wordCount > 0,
                Raw = _rawHTML,
                PlainContent = plainContent,
                WordCount = wordCount,
                PageCount = response.PageCount,
                FrontImage = response.TranscodingResult.ExtractedImage,
                Images = images,
                Favicon = response.TranscodingResult.ExtractedFavicon,
                NextPage = response.TranscodingResult.NextPageUrl != null ? new Uri(response.TranscodingResult.NextPageUrl, UriKind.Absolute) : null,
                Encoding = response.Encoding
            };
        }