/// <summary>
        /// Extracts article content from an HTML page.
        /// </summary>
        /// <param name="transcodingInput">
        /// Input parameters: the HTML content to process, the URL of the page and the DOM serialization options.
        /// </param>
        /// <returns>
        /// The transcoding result: serialized content, title, meta description, image and favicon URIs,
        /// next page URL, charset and the list of <c>img</c> elements found in the transcoded document.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="transcodingInput"/> is null.</exception>
        public TranscodingResult Transcode(TranscodingInput transcodingInput)
        {
            if (transcodingInput == null)
            {
                throw new ArgumentNullException("transcodingInput");
            }

            bool contentExtracted;
            string extractedTitle;
            string nextPageUrl;

            XDocument transcodedXmlDocument =
                TranscodeToXml(
                    transcodingInput.HtmlContent,
                    transcodingInput.Url,
                    out contentExtracted,
                    out extractedTitle,
                    out nextPageUrl);

            // Snapshot the <img> elements exactly once. All later work goes through this list so the
            // document is never modified while a query over it is being enumerated, and the query
            // is not re-run for every step.
            List<XElement> realImages = transcodedXmlDocument.GetElementsByTagName("img").ToList();

            if (transcodingInput.DomSerializationParams.ReplaceImagesWithPlaceholders)
            {
                // Placeholders are numbered from 1 in the order the images were found.
                int placeholderNumber = 1;

                foreach (XElement imageElement in realImages)
                {
                    // Leave an <!--IMG_n--> comment where the image was, then detach the image.
                    // The detached element is still reported to the caller through realImages.
                    imageElement.AddAfterSelf(new XComment("IMG_" + placeholderNumber));
                    placeholderNumber++;
                    imageElement.Remove();
                }
            }

            string transcodedContent =
                _sgmlDomSerializer.SerializeDocument(
                    transcodedXmlDocument,
                    transcodingInput.DomSerializationParams);

            bool titleExtracted = !string.IsNullOrEmpty(extractedTitle);

            MetaExtractor metaExtractor = new MetaExtractor(transcodedXmlDocument);

            string charset = null;
            string description = null;
            Uri image = null;
            Uri favicon = null;

            // Meta information is only read when the extractor reports that it has a value;
            // otherwise all four results stay null.
            if (metaExtractor.HasValue)
            {
                charset = metaExtractor.GetCharset();
                description = metaExtractor.GetMetaDescription();
                string imageString = metaExtractor.GetMetaImage();
                string faviconString = metaExtractor.GetMetaFavicon();

                if (imageString != null)
                {
                    // TryCreate leaves 'image' null when the resolved string is not an absolute URI.
                    imageString = ResolveElementUrl(imageString, transcodingInput.Url);
                    Uri.TryCreate(imageString, UriKind.Absolute, out image);
                }

                if (faviconString != null)
                {
                    // TryCreate leaves 'favicon' null when the resolved string is not an absolute URI.
                    faviconString = ResolveElementUrl(faviconString, transcodingInput.Url);
                    Uri.TryCreate(faviconString, UriKind.Absolute, out favicon);
                }
            }

            return
                new TranscodingResult(contentExtracted, titleExtracted)
                {
                    ExtractedContent = transcodedContent,
                    ExtractedTitle = extractedTitle,
                    ExtractedDescription = description,
                    ExtractedFavicon = favicon,
                    ExtractedImage = image,
                    NextPageUrl = nextPageUrl,
                    Charset = charset,
                    Images = realImages
                };
        }
Example #2
0
    /// <summary>
    /// Reads the whole stream as text, stores it as the raw HTML and runs it through the transcoder.
    /// </summary>
    /// <param name="uri">The URI of the page; its string form is passed to the transcoder as the page URL.</param>
    /// <param name="textStream">The stream holding the page markup. It is rewound to position 0 before reading.</param>
    /// <param name="options">The read options that are mapped onto the DOM serialization parameters.</param>
    /// <param name="encoding">The encoding used to decode the stream; UTF-8 is used when null.</param>
    /// <returns>The result produced by the transcoder for the decoded markup.</returns>
    protected TranscodingResult ExtractReadableInformation(
      Uri uri,
      Stream textStream,
      ReadOptions options,
      Encoding encoding = null)
    {
      // Rewind and decode the complete stream. The reader is not disposed here,
      // so textStream is not closed by this method.
      textStream.Position = 0;
      Encoding effectiveEncoding = encoding ?? Encoding.UTF8;
      var reader = new StreamReader(textStream, effectiveEncoding);
      _rawHTML = reader.ReadToEnd();

      // Describe the page to the transcoder.
      var input = new TranscodingInput(_rawHTML);
      input.Url = uri.ToString();

      // Map the caller's read options onto the serializer settings; the remaining
      // switches are fixed for every call.
      var serialization = new DomSerializationParams();
      serialization.BodyOnly = !options.HasHeaderTags;
      serialization.NoHeadline = !options.HasHeadline;
      serialization.PrettyPrint = options.PrettyPrint;
      serialization.DontIncludeContentTypeMetaElement = true;
      serialization.DontIncludeMobileSpecificMetaElements = true;
      serialization.DontIncludeDocTypeMetaElement = false;
      serialization.DontIncludeGeneratorMetaElement = true;
      serialization.ReplaceImagesWithPlaceholders = options.ReplaceImagesWithPlaceholders;
      input.DomSerializationParams = serialization;

      // Hand over to the transcoder and return its result unchanged.
      return _transcoder.Transcode(input);
    }