/// <summary>
/// Extracts article content from an HTML page.
/// </summary>
/// <param name="transcodingInput">An object containing input parameters, i.a. html content to be processed.</param>
/// <returns>
/// An object containing transcoding result, i.a. extracted content and title. Its <c>Images</c> list holds the
/// <c>img</c> elements found in the transcoded document; when image placeholders are requested these elements
/// have been detached from the document and replaced there by <c>IMG_n</c> comments.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="transcodingInput"/> is <c>null</c>.</exception>
public TranscodingResult Transcode(TranscodingInput transcodingInput)
{
    if (transcodingInput == null)
    {
        throw new ArgumentNullException("transcodingInput");
    }

    bool contentExtracted;
    string extractedTitle;
    string nextPageUrl;

    XDocument transcodedXmlDocument =
        TranscodeToXml(
            transcodingInput.HtmlContent,
            transcodingInput.Url,
            out contentExtracted,
            out extractedTitle,
            out nextPageUrl);

    // Snapshot the <img> elements exactly once, before the tree is modified.
    // The placeholder pass below inserts and removes nodes, so it must not be
    // driven by a query that may still be walking the document.
    List<XElement> realImages = transcodedXmlDocument.GetElementsByTagName("img").ToList();

    if (transcodingInput.DomSerializationParams.ReplaceImagesWithPlaceholders)
    {
        // Placeholders are numbered from 1 in the order the images were found.
        int placeholderIndex = 1;

        foreach (XElement imageElement in realImages)
        {
            // Leave an "IMG_n" comment where the image was, then detach the image.
            imageElement.AddAfterSelf(new XComment("IMG_" + placeholderIndex));
            placeholderIndex++;
            imageElement.Remove();
        }
    }

    string transcodedContent =
        _sgmlDomSerializer.SerializeDocument(
            transcodedXmlDocument,
            transcodingInput.DomSerializationParams);

    bool titleExtracted = !string.IsNullOrEmpty(extractedTitle);

    MetaExtractor metaExtractor = new MetaExtractor(transcodedXmlDocument);

    string charset = null;
    string description = null;
    Uri image = null;
    Uri favicon = null;

    if (metaExtractor.HasValue)
    {
        charset = metaExtractor.GetCharset();
        description = metaExtractor.GetMetaDescription();

        string imageString = metaExtractor.GetMetaImage();
        string faviconString = metaExtractor.GetMetaFavicon();

        // The meta URLs are resolved against the page URL first. If the result
        // still is not a valid absolute URI, TryCreate leaves the output null.
        if (imageString != null)
        {
            imageString = ResolveElementUrl(imageString, transcodingInput.Url);
            Uri.TryCreate(imageString, UriKind.Absolute, out image);
        }

        if (faviconString != null)
        {
            faviconString = ResolveElementUrl(faviconString, transcodingInput.Url);
            Uri.TryCreate(faviconString, UriKind.Absolute, out favicon);
        }
    }

    return new TranscodingResult(contentExtracted, titleExtracted)
    {
        ExtractedContent = transcodedContent,
        ExtractedTitle = extractedTitle,
        ExtractedDescription = description,
        ExtractedFavicon = favicon,
        ExtractedImage = image,
        NextPageUrl = nextPageUrl,
        Charset = charset,
        Images = realImages
    };
}
/// <summary>
/// Reads the whole of <paramref name="textStream"/> as text, stores it as the raw HTML
/// and passes it to the transcoder to extract the readable information.
/// </summary>
/// <param name="uri">The URI of the page; handed to the transcoder as the input's <c>Url</c>.</param>
/// <param name="textStream">
/// The stream holding the page's HTML. Its position is reset to 0 before reading,
/// which requires a seekable stream.
/// </param>
/// <param name="options">The read options that are mapped onto the DOM serialization parameters.</param>
/// <param name="encoding">The encoding used to decode the stream; UTF-8 is used when <c>null</c>.</param>
/// <returns>The result produced by the transcoder for the given HTML.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="uri"/>, <paramref name="textStream"/> or <paramref name="options"/> is <c>null</c>.
/// </exception>
protected TranscodingResult ExtractReadableInformation(
    Uri uri,
    Stream textStream,
    ReadOptions options,
    Encoding encoding = null)
{
    // Validate up front so a bad argument fails before the stream is consumed
    // and before _rawHTML is overwritten.
    if (uri == null)
    {
        throw new ArgumentNullException("uri");
    }

    if (textStream == null)
    {
        throw new ArgumentNullException("textStream");
    }

    if (options == null)
    {
        throw new ArgumentNullException("options");
    }

    // response stream to text
    textStream.Position = 0;

    // The reader is not disposed here: disposing it would also close textStream,
    // which was supplied by the caller.
    StreamReader streamReader = new StreamReader(textStream, encoding ?? Encoding.UTF8);
    _rawHTML = streamReader.ReadToEnd();

    // set properties for processing
    TranscodingInput transcodingInput = new TranscodingInput(_rawHTML)
    {
        Url = uri.ToString(),
        DomSerializationParams = new DomSerializationParams()
        {
            // Note the inversion: the options say what to include,
            // the first two serialization flags say what to leave out.
            BodyOnly = !options.HasHeaderTags,
            NoHeadline = !options.HasHeadline,
            PrettyPrint = options.PrettyPrint,
            DontIncludeContentTypeMetaElement = true,
            DontIncludeMobileSpecificMetaElements = true,
            DontIncludeDocTypeMetaElement = false,
            DontIncludeGeneratorMetaElement = true,
            ReplaceImagesWithPlaceholders = options.ReplaceImagesWithPlaceholders
        }
    };

    // process/transcode HTML
    return _transcoder.Transcode(transcodingInput);
}