/// <summary>
/// Puts a UTF-8 "Content-Type" meta element at the front of the head element,
/// replacing the first existing one, unless this is disabled by the serialization parameters.
/// </summary>
/// <param name="headElement">Head element whose meta children are rewritten.</param>
/// <param name="domSerializationParams">Serialization parameters; nothing is done when <c>DontIncludeContentTypeMetaElement</c> is set.</param>
private static void ProcessMetaContentTypeElement(XElement headElement, DomSerializationParams domSerializationParams)
{
    if (domSerializationParams.DontIncludeContentTypeMetaElement)
    {
        return;
    }

    // Drop the first existing <meta http-equiv="content-type" ... /> (matched case-insensitively), if any.
    headElement
        .GetChildrenByTagName("meta")
        .Where(meta => "content-type".Equals(meta.GetAttributeValue("http-equiv", ""), StringComparison.OrdinalIgnoreCase))
        .FirstOrDefault()
        ?.Remove();

    // The new element lives in the same namespace as the head element.
    string namespaceName = headElement.Name?.NamespaceName ?? "";

    // Add <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> as the first child.
    var contentTypeMeta = new XElement(XName.Get("meta", namespaceName));
    contentTypeMeta.Add(
        new XAttribute("http-equiv", "Content-Type"),
        new XAttribute("content", "text/html; charset=utf-8"));

    headElement.AddFirst(contentTypeMeta);
}
/// <summary>
/// Removes the first "viewport" and the first "HandheldFriendly" meta elements from the head element and,
/// unless disabled by the serialization parameters, adds a fresh "HandheldFriendly" meta element at the front.
/// </summary>
/// <param name="headElement">Head element whose meta children are rewritten.</param>
/// <param name="domSerializationParams">Serialization parameters; no element is added when <c>DontIncludeMobileSpecificMetaElements</c> is set.</param>
private static void ProcessMobileSpecificMetaElements(XElement headElement, DomSerializationParams domSerializationParams)
{
    // The removals happen regardless of the flag; only the first match of each name is removed.
    foreach (string metaName in new[] { "viewport", "HandheldFriendly" })
    {
        XElement existingMeta = headElement
            .GetChildrenByTagName("meta")
            .Where(meta => metaName.Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase))
            .FirstOrDefault();

        if (existingMeta != null)
        {
            existingMeta.Remove();
        }
    }

    if (domSerializationParams.DontIncludeMobileSpecificMetaElements)
    {
        return;
    }

    // Add <meta name="HandheldFriendly" content="true" /> in the head element's namespace.
    XName metaName2 = XName.Get("meta", headElement.Name.NamespaceName);
    var handheldFriendlyMeta = new XElement(metaName2);
    handheldFriendlyMeta.Add(
        new XAttribute("name", "HandheldFriendly"),
        new XAttribute("content", "true"));

    headElement.AddFirst(handheldFriendlyMeta);
}
/// <summary>
/// Serializes given DOM (System.Xml.Linq.XDocument object) to a string.
/// </summary>
/// <param name="document">System.Xml.Linq.XDocument instance containing the DOM to be serialized.</param>
/// <param name="domSerializationParams">Contains parameters that modify the behaviour of the output serialization.</param>
/// <returns>Serialized representation of the DOM.</returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
/// <exception cref="ArgumentException">
/// At least one meta element is to be included and the document has no root, or its root is not an html element.
/// </exception>
public string SerializeDocument(XDocument document, DomSerializationParams domSerializationParams)
{
    if (document == null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    // The head element is only touched when at least one kind of meta element is to be included.
    bool anyMetaElementRequested =
        !domSerializationParams.DontIncludeContentTypeMetaElement
        || !domSerializationParams.DontIncludeMobileSpecificMetaElements
        || !domSerializationParams.DontIncludeGeneratorMetaElement;

    if (anyMetaElementRequested)
    {
        var documentRoot = document.Root;

        if (documentRoot == null)
        {
            throw new ArgumentException("The document must have a root.");
        }

        if (!"html".Equals(documentRoot.Name.LocalName, StringComparison.OrdinalIgnoreCase))
        {
            throw new ArgumentException("The document's root must be an html element.");
        }

        // add <head> element if not present
        var headElement = documentRoot.GetChildrenByTagName("head").FirstOrDefault();

        if (headElement == null)
        {
            // Create the element in the root's namespace. A namespace-less <head> under a namespaced
            // <html> would be serialized as <head xmlns=""> and the meta elements (which copy the
            // head's namespace) would end up outside the document's namespace as well.
            headElement = new XElement(XName.Get("head", documentRoot.Name.NamespaceName));
            documentRoot.AddFirst(headElement);
        }

        ProcessMetaElements(headElement, domSerializationParams);
    }

    var result = document.ToString(domSerializationParams.PrettyPrint ? SaveOptions.None : SaveOptions.DisableFormatting);

    if (!domSerializationParams.DontIncludeDocTypeMetaElement)
    {
        // The DOCTYPE is prepended as text; it is not part of the XDocument.
        result = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\r\n\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n" + result;
    }

    return result;
}
/// <summary>
/// Fetches the page at the given url, transcodes it, appends the next page when the transcoder reports one,
/// and serializes the resulting document.
/// </summary>
/// <param name="url">Url of the page to fetch.</param>
/// <param name="domSerializationParams">Parameters passed on to the DOM serializer.</param>
/// <param name="mainContentExtracted">Set by the transcoder; false when the page could not be fetched.</param>
/// <param name="extractedTitle">Set by the transcoder; null when the page could not be fetched.</param>
/// <returns>The serialized document, or null when the fetched content is null or empty.</returns>
private string DoTranscode(string url, DomSerializationParams domSerializationParams, out bool mainContentExtracted, out string extractedTitle)
{
    _curPageNum = 1;

    /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
    string urlWithoutTrailingSlash = Regex.Replace(url, @"\/$", "");
    _parsedPages = new List<string> { urlWithoutTrailingSlash };

    string fetchedHtml = _urlFetcher.Fetch(url);

    /* If we can't fetch the page, then exit. */
    if (string.IsNullOrEmpty(fetchedHtml))
    {
        mainContentExtracted = false;
        extractedTitle = null;

        return null;
    }

    /* Attempt to transcode the page */
    string nextPageUrl;
    var document = _transcoder.TranscodeToXml(fetchedHtml, url, out mainContentExtracted, out extractedTitle, out nextPageUrl);

    if (nextPageUrl != null)
    {
        AppendNextPage(document, nextPageUrl);
    }

    /* If there are multiple pages, rename the first content div */
    bool hasMultiplePages = _curPageNum > 1;

    if (hasMultiplePages)
    {
        var firstPageContainer = document.GetElementById("readInner").Element("div");

        firstPageContainer.SetId(_pageIdPrefix + "1");
        firstPageContainer.SetClass("page");
    }

    return _sgmlDomSerializer.SerializeDocument(document, domSerializationParams);
}
/// <summary>
/// Serializes given DOM (System.Xml.Linq.XDocument object) to a string.
/// </summary>
/// <param name="document">System.Xml.Linq.XDocument instance containing the DOM to be serialized.</param>
/// <param name="domSerializationParams">Contains parameters that modify the behaviour of the output serialization.</param>
/// <returns>Serialized representation of the DOM.</returns>
/// <exception cref="ArgumentException">
/// At least one meta element is to be included and the document has no root, or its root is not an html element.
/// </exception>
public string SerializeDocument(XDocument document, DomSerializationParams domSerializationParams)
{
    if (!domSerializationParams.DontIncludeContentTypeMetaElement
        || !domSerializationParams.DontIncludeMobileSpecificMetaElements
        || !domSerializationParams.DontIncludeGeneratorMetaElement)
    {
        var documentRoot = document.Root;

        if (documentRoot == null)
        {
            throw new ArgumentException("The document must have a root.");
        }

        // XElement.Name is never null, so only the local name needs to be checked.
        if (!"html".Equals(documentRoot.Name.LocalName, StringComparison.OrdinalIgnoreCase))
        {
            throw new ArgumentException("The document's root must be an html element.");
        }

        // add <head> element if not present
        var headElement = documentRoot.GetChildrenByTagName("head").FirstOrDefault();

        if (headElement == null)
        {
            // Use the root's namespace: a namespace-less <head> inside a namespaced <html> would be
            // written out as <head xmlns=""> and would drag the generated meta elements, which copy
            // the head's namespace, out of the document's namespace too.
            headElement = new XElement(XName.Get("head", documentRoot.Name.NamespaceName));
            documentRoot.AddFirst(headElement);
        }

        ProcessMetaElements(headElement, domSerializationParams);
    }

    string result = document.ToString(domSerializationParams.PrettyPrint ? SaveOptions.None : SaveOptions.DisableFormatting);

    if (!domSerializationParams.DontIncludeDocTypeMetaElement)
    {
        // The DOCTYPE is prepended as text; it is not part of the XDocument.
        result = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\r\n\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n" + result;
    }

    return result;
}
/// <summary>
/// Replaces the first "Generator" meta element of the head element with a fresh one placed at the front,
/// unless this is disabled by the serialization parameters.
/// </summary>
/// <param name="headElement">Head element whose meta children are rewritten.</param>
/// <param name="domSerializationParams">Serialization parameters; nothing is done when <c>DontIncludeGeneratorMetaElement</c> is set.</param>
private static void ProcessMetaGeneratorElement(XElement headElement, DomSerializationParams domSerializationParams)
{
    if (domSerializationParams.DontIncludeGeneratorMetaElement)
    {
        return;
    }

    // Remove the first existing <meta name="generator" ... /> (matched case-insensitively), if any.
    XElement existingGenerator = headElement
        .GetChildrenByTagName("meta")
        .Where(meta => "Generator".Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase))
        .FirstOrDefault();

    if (existingGenerator != null)
    {
        existingGenerator.Remove();
    }

    // Add <meta name="Generator" ... /> in the head element's namespace.
    XName metaName = XName.Get("meta", headElement.Name.NamespaceName);
    XAttribute nameAttribute = new XAttribute("name", "Generator");
    XAttribute contentAttribute = new XAttribute("content", Consts.NReadabilityFullName);

    headElement.AddFirst(new XElement(metaName, nameAttribute, contentAttribute));
}
/// <summary>
/// Strips the first "viewport" and the first "HandheldFriendly" meta elements from the head element and,
/// unless disabled by the serialization parameters, inserts a new "HandheldFriendly" meta element as the first child.
/// </summary>
/// <param name="headElement">Head element whose meta children are rewritten.</param>
/// <param name="domSerializationParams">Serialization parameters; no element is inserted when <c>DontIncludeMobileSpecificMetaElements</c> is set.</param>
private static void ProcessMobileSpecificMetaElements(XElement headElement, DomSerializationParams domSerializationParams)
{
    // remove meta 'viewport' element if present (it is not re-added)
    headElement
        .GetChildrenByTagName("meta")
        .Where(meta => "viewport".Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase))
        .FirstOrDefault()
        ?.Remove();

    // remove meta 'HandheldFriendly' element if present
    headElement
        .GetChildrenByTagName("meta")
        .Where(meta => "HandheldFriendly".Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase))
        .FirstOrDefault()
        ?.Remove();

    if (domSerializationParams.DontIncludeMobileSpecificMetaElements)
    {
        return;
    }

    // add <meta name="HandheldFriendly" ... /> element in the head element's namespace
    string namespaceName = headElement.Name?.NamespaceName ?? "";

    headElement.AddFirst(
        new XElement(
            XName.Get("meta", namespaceName),
            new XAttribute("name", "HandheldFriendly"),
            new XAttribute("content", "true")));
}
/// <summary>
/// Transcodes the page at the given url by forwarding all arguments to <c>DoTranscode</c>.
/// </summary>
/// <param name="url">Url of the page to transcode.</param>
/// <param name="domSerializationParams">Parameters passed on for serialization of the output.</param>
/// <param name="mainContentExtracted">Set by <c>DoTranscode</c>.</param>
/// <param name="extractedTitle">Set by <c>DoTranscode</c>.</param>
/// <returns>Whatever <c>DoTranscode</c> returns for these arguments.</returns>
public string Transcode(string url, DomSerializationParams domSerializationParams, out bool mainContentExtracted, out string extractedTitle)
    => DoTranscode(url, domSerializationParams, out mainContentExtracted, out extractedTitle);
/// <summary>
/// Transcodes the page at the given url using the default serialization parameters.
/// </summary>
/// <param name="url">Url of the page to transcode.</param>
/// <param name="mainContentExtracted">Set by the overload this call is forwarded to.</param>
/// <returns>The result of the overload that takes explicit serialization parameters.</returns>
public string Transcode(string url, out bool mainContentExtracted)
{
    var defaultSerializationParams = DomSerializationParams.CreateDefault();

    return Transcode(url, defaultSerializationParams, out mainContentExtracted);
}
/// <summary>
/// Fetches the page at the given url, transcodes it, appends the next page when the transcoder reports one,
/// and serializes the resulting document.
/// </summary>
/// <param name="url">Url of the page to fetch.</param>
/// <param name="domSerializationParams">Parameters passed on to the DOM serializer.</param>
/// <param name="mainContentExtracted">Set by the transcoder; false when the page could not be fetched.</param>
/// <param name="extractedTitle">Set by the transcoder; null when the page could not be fetched.</param>
/// <returns>The serialized document, or null when the fetched content is null or empty.</returns>
private string DoTranscode(string url, DomSerializationParams domSerializationParams, out bool mainContentExtracted, out string extractedTitle)
{
    _curPageNum = 1;
    _parsedPages = new List<string>();

    /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
    _parsedPages.Add(Regex.Replace(url, @"\/$", ""));

    var pageHtml = _urlFetcher.Fetch(url);

    /* If we can't fetch the page, then exit. */
    if (string.IsNullOrEmpty(pageHtml))
    {
        mainContentExtracted = false;
        extractedTitle = null;

        return null;
    }

    /* Attempt to transcode the page */
    XDocument transcodedDocument = _transcoder.TranscodeToXml(
        pageHtml,
        url,
        out mainContentExtracted,
        out extractedTitle,
        out string followingPageUrl);

    if (followingPageUrl != null)
    {
        AppendNextPage(transcodedDocument, followingPageUrl);
    }

    /* If there are multiple pages, rename the first content div */
    if (_curPageNum > 1)
    {
        var firstPageDiv = transcodedDocument.GetElementById("readInner").Element("div");

        firstPageDiv.SetId(_PageIdPrefix + "1");
        firstPageDiv.SetClass("page");
    }

    return _sgmlDomSerializer.SerializeDocument(transcodedDocument, domSerializationParams);
}
/// <summary>
/// Extracts the main article content from an HTML page and returns it as serialized markup.
/// </summary>
/// <param name="htmlContent">HTML markup to process.</param>
/// <param name="url">Url from which the content was downloaded. Used to resolve relative urls. Can be null.</param>
/// <param name="domSerializationParams">Parameters that modify the behaviour of the output serialization.</param>
/// <param name="mainContentExtracted">Determines whether the content has been extracted (if the article is not empty).</param>
/// <param name="nextPageUrl">If the content contains a link to a subsequent page, it is returned here.</param>
/// <returns>HTML markup containing extracted article content.</returns>
public string Transcode(string htmlContent, string url, DomSerializationParams domSerializationParams, out bool mainContentExtracted, out string nextPageUrl)
{
    // Transcode to a DOM first, then hand the DOM to the serializer.
    return _sgmlDomSerializer.SerializeDocument(
        TranscodeToXml(htmlContent, url, out mainContentExtracted, out nextPageUrl),
        domSerializationParams);
}
/// <summary>
/// Unless disabled by the serialization parameters, removes the first "Generator" meta element from the
/// head element and inserts a fresh one as the first child.
/// </summary>
/// <param name="headElement">Head element whose meta children are rewritten.</param>
/// <param name="domSerializationParams">Serialization parameters; nothing is done when <c>DontIncludeGeneratorMetaElement</c> is set.</param>
private static void ProcessMetaGeneratorElement(XElement headElement, DomSerializationParams domSerializationParams)
{
    if (domSerializationParams.DontIncludeGeneratorMetaElement)
    {
        return;
    }

    // remove meta 'generator' element if present (case-insensitive match on the name attribute)
    headElement
        .GetChildrenByTagName("meta")
        .Where(meta => "Generator".Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase))
        .FirstOrDefault()
        ?.Remove();

    // add <meta name="Generator" ... /> element in the head element's namespace
    string namespaceName = headElement.Name?.NamespaceName ?? "";
    var generatorMeta = new XElement(XName.Get("meta", namespaceName));

    generatorMeta.Add(
        new XAttribute("name", "Generator"),
        new XAttribute("content", Consts.NReadabilityFullName));

    headElement.AddFirst(generatorMeta);
}
/// <summary>
/// Serializes the given DOM (System.Xml.Linq.XDocument object) to a string using the default serialization parameters.
/// </summary>
/// <param name="document">System.Xml.Linq.XDocument instance containing the DOM to be serialized.</param>
/// <returns>Serialized representation of the DOM.</returns>
public string SerializeDocument(XDocument document)
{
    var defaultSerializationParams = DomSerializationParams.CreateDefault();

    return SerializeDocument(document, defaultSerializationParams);
}
/// <summary>
/// Replaces the first "Generator" meta element of the head element with a fresh one placed at the front,
/// unless this is disabled by the serialization parameters.
/// </summary>
/// <param name="headElement">Head element whose meta children are rewritten.</param>
/// <param name="domSerializationParams">Serialization parameters; nothing is done when <c>DontIncludeGeneratorMetaElement</c> is set.</param>
private static void ProcessMetaGeneratorElement(XElement headElement, DomSerializationParams domSerializationParams)
{
    if (!domSerializationParams.DontIncludeGeneratorMetaElement)
    {
        XElement metaGeneratorElement =
            (from metaElement in headElement.GetChildrenByTagName("meta")
             where "Generator".Equals(metaElement.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase)
             select metaElement).FirstOrDefault();

        // remove meta 'generator' element if present
        if (metaGeneratorElement != null)
        {
            metaGeneratorElement.Remove();
        }

        // add <meta name="Generator" ... /> element
        // Always build a new element: re-adding the removed one would keep a stale content value,
        // and when none was found AddFirst(null) adds nothing, so no generator element would be emitted.
        metaGeneratorElement =
            new XElement(
                XName.Get("meta", headElement.Name.NamespaceName),
                new XAttribute("name", "Generator"),
                new XAttribute("content", Consts.NReadabilityFullName));

        headElement.AddFirst(metaGeneratorElement);
    }
}
/// <summary>
/// Runs the content-type, mobile-specific and generator meta element processing on the head element, in that order.
/// </summary>
/// <param name="headElement">Head element passed to each processing step.</param>
/// <param name="domSerializationParams">Serialization parameters passed to each processing step.</param>
private static void ProcessMetaElements(XElement headElement, DomSerializationParams domSerializationParams)
{
    // The order is significant: the steps run strictly one after another, exactly as listed.
    var processingSteps = new Action<XElement, DomSerializationParams>[]
    {
        ProcessMetaContentTypeElement,
        ProcessMobileSpecificMetaElements,
        ProcessMetaGeneratorElement,
    };

    foreach (var processingStep in processingSteps)
    {
        processingStep(headElement, domSerializationParams);
    }
}
/// <summary>
/// Serializes given DOM (System.Xml.Linq.XDocument object) to a string.
/// </summary>
/// <param name="document">System.Xml.Linq.XDocument instance containing the DOM to be serialized.</param>
/// <param name="domSerializationParams">Contains parameters that modify the behaviour of the output serialization.</param>
/// <returns>Serialized representation of the DOM.</returns>
/// <exception cref="ArgumentException">
/// At least one meta element is to be included and the document has no root, or its root is not an html element.
/// </exception>
public string SerializeDocument(XDocument document, DomSerializationParams domSerializationParams)
{
    // Honor the DontInclude* flags instead of silently ignoring them: the head element is
    // processed whenever at least one kind of meta element is to be included.
    bool skipAllMetaElements =
        domSerializationParams.DontIncludeContentTypeMetaElement
        && domSerializationParams.DontIncludeMobileSpecificMetaElements
        && domSerializationParams.DontIncludeGeneratorMetaElement;

    if (!skipAllMetaElements)
    {
        var documentRoot = document.Root;

        if (documentRoot == null)
        {
            throw new ArgumentException("The document must have a root.");
        }

        if (!"html".Equals(documentRoot.Name.LocalName, StringComparison.OrdinalIgnoreCase))
        {
            throw new ArgumentException("The document's root must be an html element.");
        }

        // add <head> element if not present (in the root's namespace, so no xmlns="" is emitted)
        var headElement = documentRoot.GetChildrenByTagName("head").FirstOrDefault();

        if (headElement == null)
        {
            headElement = new XElement(XName.Get("head", documentRoot.Name.NamespaceName));
            documentRoot.AddFirst(headElement);
        }

        ProcessMetaElements(headElement, domSerializationParams);
    }

    var result = document.ToString(domSerializationParams.PrettyPrint ? SaveOptions.None : SaveOptions.DisableFormatting);

    if (!domSerializationParams.DontIncludeDocTypeMetaElement)
    {
        // The DOCTYPE is prepended as text; it is not part of the XDocument.
        result = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\r\n\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n" + result;
    }

    return result;
}
/// <summary>
/// Transcodes the page at the given url via <c>DoTranscode</c>, discarding the extracted title.
/// </summary>
/// <param name="url">Url of the page to transcode.</param>
/// <param name="domSerializationParams">Parameters passed on for serialization of the output.</param>
/// <param name="mainContentExtracted">Set by <c>DoTranscode</c>.</param>
/// <returns>Whatever <c>DoTranscode</c> returns for these arguments.</returns>
public string Transcode(string url, DomSerializationParams domSerializationParams, out bool mainContentExtracted)
    => DoTranscode(url, domSerializationParams, out mainContentExtracted, out _);