Exemplo n.º 1
0
        /// <summary>
        /// Replaces the first content-type meta element found under the given head element with a
        /// newly built one (http-equiv "Content-Type", content "text/html; charset=utf-8") that is
        /// inserted as the head's first child. Does nothing when
        /// <c>DontIncludeContentTypeMetaElement</c> is set.
        /// </summary>
        /// <param name="headElement">Head element whose meta children are updated in place.</param>
        /// <param name="domSerializationParams">Serialization switches; only the content-type flag is read here.</param>
        private static void ProcessMetaContentTypeElement(XElement headElement, DomSerializationParams domSerializationParams)
        {
            if (domSerializationParams.DontIncludeContentTypeMetaElement)
            {
                return;
            }

            // Only the first meta whose http-equiv equals "content-type" (ignoring case) is dropped.
            XElement existingMeta = headElement
                .GetChildrenByTagName("meta")
                .FirstOrDefault(meta => "content-type".Equals(meta.GetAttributeValue("http-equiv", ""), StringComparison.OrdinalIgnoreCase));

            if (existingMeta != null)
            {
                existingMeta.Remove();
            }

            // The replacement is created in the same namespace as the head element.
            string headNamespace = "";

            if (headElement.Name != null)
            {
                headNamespace = headElement.Name.NamespaceName ?? "";
            }

            XElement replacementMeta = new XElement(XName.Get("meta", headNamespace));

            replacementMeta.Add(new XAttribute("http-equiv", "Content-Type"));
            replacementMeta.Add(new XAttribute("content", "text/html; charset=utf-8"));

            headElement.AddFirst(replacementMeta);
        }
Exemplo n.º 2
0
    /// <summary>
    /// Unless <c>DontIncludeContentTypeMetaElement</c> is set, removes the first meta child of the
    /// head whose http-equiv attribute is "content-type" (case-insensitive) and prepends a new
    /// meta element declaring "text/html; charset=utf-8".
    /// </summary>
    /// <param name="headElement">Head element to modify.</param>
    /// <param name="domSerializationParams">Serialization switches controlling whether anything is done.</param>
    private static void ProcessMetaContentTypeElement(XElement headElement, DomSerializationParams domSerializationParams)
    {
      if (domSerializationParams.DontIncludeContentTypeMetaElement)
      {
        return;
      }

      XElement staleMeta = headElement
        .GetChildrenByTagName("meta")
        .Where(candidate => "content-type".Equals(candidate.GetAttributeValue("http-equiv", ""), StringComparison.OrdinalIgnoreCase))
        .FirstOrDefault();

      // drop the stale element, if there is one
      if (staleMeta != null)
      {
        staleMeta.Remove();
      }

      // the new element lives in the head's namespace (empty when the head has none)
      string namespaceName;

      if (headElement.Name != null)
      {
        namespaceName = headElement.Name.NamespaceName ?? "";
      }
      else
      {
        namespaceName = "";
      }

      XName metaName = XName.Get("meta", namespaceName);

      headElement.AddFirst(
        new XElement(
          metaName,
          new XAttribute("http-equiv", "Content-Type"),
          new XAttribute("content", "text/html; charset=utf-8")));
    }
Exemplo n.º 3
0
        /// <summary>
        /// Removes the first 'viewport' and the first 'HandheldFriendly' meta element from the head
        /// (matched on the name attribute, ignoring case). Unless
        /// <c>DontIncludeMobileSpecificMetaElements</c> is set, a new HandheldFriendly meta element
        /// with content "true" is then inserted as the head's first child.
        /// </summary>
        /// <param name="headElement">Head element to modify.</param>
        /// <param name="domSerializationParams">Serialization switches; only the mobile-specific flag is read here.</param>
        private static void ProcessMobileSpecificMetaElements(XElement headElement,
                                                              DomSerializationParams domSerializationParams)
        {
            // The removals happen regardless of the flag; viewport is handled before HandheldFriendly.
            foreach (var metaName in new[] { "viewport", "HandheldFriendly" })
            {
                headElement.GetChildrenByTagName("meta")
                           .FirstOrDefault(meta => metaName.Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase))
                           ?.Remove();
            }

            if (domSerializationParams.DontIncludeMobileSpecificMetaElements)
            {
                return;
            }

            // Build the replacement in the head's own namespace and put it first.
            var handheldFriendlyName = XName.Get("meta", headElement.Name.NamespaceName);

            headElement.AddFirst(
                new XElement(
                    handheldFriendlyName,
                    new XAttribute("name", "HandheldFriendly"),
                    new XAttribute("content", "true")));
        }
Exemplo n.º 4
0
        /// <summary>
        ///   Serializes given DOM (System.Xml.Linq.XDocument object) to a string.
        /// </summary>
        /// <remarks>
        ///   When any of the content-type, mobile-specific or generator meta elements is requested,
        ///   the document is modified in place first: a head element is added to the root if it is
        ///   missing and the meta elements are processed. The XHTML 1.0 Transitional DOCTYPE is
        ///   prepended to the output unless <c>DontIncludeDocTypeMetaElement</c> is set.
        /// </remarks>
        /// <param name="document">System.Xml.Linq.XDocument instance containing the DOM to be serialized.</param>
        /// <param name="domSerializationParams">Contains parameters that modify the behaviour of the output serialization.</param>
        /// <returns>Serialized representation of the DOM.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
        /// <exception cref="ArgumentException">
        ///   Meta elements are requested and the document has no root, or its root is not an html element.
        /// </exception>
        public string SerializeDocument(XDocument document, DomSerializationParams domSerializationParams)
        {
            if (document == null)
            {
                throw new ArgumentNullException(nameof(document));
            }

            if (!domSerializationParams.DontIncludeContentTypeMetaElement ||
                !domSerializationParams.DontIncludeMobileSpecificMetaElements ||
                !domSerializationParams.DontIncludeGeneratorMetaElement)
            {
                var documentRoot = document.Root;

                if (documentRoot == null)
                {
                    throw new ArgumentException("The document must have a root.");
                }

                if (!"html".Equals(documentRoot.Name.LocalName, StringComparison.OrdinalIgnoreCase))
                {
                    throw new ArgumentException("The document's root must be an html element.");
                }

                // add <head> element if not present
                var headElement = documentRoot.GetChildrenByTagName("head").FirstOrDefault();

                if (headElement == null)
                {
                    // Create the head in the root's namespace. A bare "head" name would place it in
                    // no namespace, which serializes as <head xmlns=""> under a namespaced html root.
                    headElement = new XElement(XName.Get("head", documentRoot.Name.NamespaceName));
                    documentRoot.AddFirst(headElement);
                }

                ProcessMetaElements(headElement, domSerializationParams);
            }

            var result =
                document.ToString(domSerializationParams.PrettyPrint ? SaveOptions.None : SaveOptions.DisableFormatting);

            if (!domSerializationParams.DontIncludeDocTypeMetaElement)
            {
                result =
                    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\r\n\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n" +
                    result;
            }

            return result;
        }
        /// <summary>
        /// Fetches the page at <paramref name="url"/>, transcodes it, appends the next page when the
        /// transcoder reports one, and returns the serialized result.
        /// </summary>
        /// <param name="url">Address of the page to fetch; also recorded (without one trailing slash) as already parsed.</param>
        /// <param name="domSerializationParams">Passed through to the serializer.</param>
        /// <param name="mainContentExtracted">Set by the transcoder; false when nothing could be fetched.</param>
        /// <param name="extractedTitle">Set by the transcoder; null when nothing could be fetched.</param>
        /// <returns>The serialized document, or null when the fetcher returned null or an empty string.</returns>
        private string DoTranscode(string url, DomSerializationParams domSerializationParams, out bool mainContentExtracted,
                                   out string extractedTitle)
        {
            _curPageNum = 1;

            /* Register this document as parsed up front, so we don't double up on the first page */
            var firstPageKey = Regex.Replace(url, @"\/$", "");

            _parsedPages = new List<string> { firstPageKey };

            var fetchedMarkup = _urlFetcher.Fetch(url);

            /* Nothing fetched: report failure through the out parameters and bail out. */
            if (string.IsNullOrEmpty(fetchedMarkup))
            {
                extractedTitle       = null;
                mainContentExtracted = false;

                return null;
            }

            /* Attempt to transcode the page */
            var document = _transcoder.TranscodeToXml(
                fetchedMarkup,
                url,
                out mainContentExtracted,
                out extractedTitle,
                out var nextPage);

            if (nextPage != null)
            {
                AppendNextPage(document, nextPage);
            }

            /* If there are multiple pages, rename the first content div */
            if (_curPageNum > 1)
            {
                var firstPageContainer = document.GetElementById("readInner").Element("div");

                firstPageContainer.SetId(_pageIdPrefix + "1");
                firstPageContainer.SetClass("page");
            }

            return _sgmlDomSerializer.SerializeDocument(document, domSerializationParams);
        }
Exemplo n.º 6
0
        /// <summary>
        /// Serializes given DOM (System.Xml.Linq.XDocument object) to a string.
        /// </summary>
        /// <remarks>
        /// When any of the content-type, mobile-specific or generator meta elements is requested,
        /// the document is modified in place first: a head element is added to the root if it is
        /// missing and the meta elements are processed. The XHTML 1.0 Transitional DOCTYPE is
        /// prepended to the output unless <c>DontIncludeDocTypeMetaElement</c> is set.
        /// </remarks>
        /// <param name="document">System.Xml.Linq.XDocument instance containing the DOM to be serialized.</param>
        /// <param name="domSerializationParams">Contains parameters that modify the behaviour of the output serialization.</param>
        /// <returns>Serialized representation of the DOM.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// Meta elements are requested and the document has no root, or its root is not an html element.
        /// </exception>
        public string SerializeDocument(XDocument document, DomSerializationParams domSerializationParams)
        {
            if (document == null)
            {
                throw new ArgumentNullException("document");
            }

            if (!domSerializationParams.DontIncludeContentTypeMetaElement
                || !domSerializationParams.DontIncludeMobileSpecificMetaElements
                || !domSerializationParams.DontIncludeGeneratorMetaElement)
            {
                var documentRoot = document.Root;

                if (documentRoot == null)
                {
                    throw new ArgumentException("The document must have a root.");
                }

                if (documentRoot.Name == null || !"html".Equals(documentRoot.Name.LocalName, StringComparison.OrdinalIgnoreCase))
                {
                    throw new ArgumentException("The document's root must be an html element.");
                }

                // add <head> element if not present
                var headElement = documentRoot.GetChildrenByTagName("head").FirstOrDefault();

                if (headElement == null)
                {
                    // Create the head in the root's namespace. A bare "head" name would place it in
                    // no namespace, which serializes as <head xmlns=""> under a namespaced html root.
                    headElement = new XElement(XName.Get("head", documentRoot.Name.NamespaceName));
                    documentRoot.AddFirst(headElement);
                }

                ProcessMetaElements(headElement, domSerializationParams);
            }

            string result = document.ToString(domSerializationParams.PrettyPrint ? SaveOptions.None : SaveOptions.DisableFormatting);

            if (!domSerializationParams.DontIncludeDocTypeMetaElement)
            {
                result = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\r\n\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n" + result;
            }

            return result;
        }
Exemplo n.º 7
0
        /// <summary>
        /// Unless <c>DontIncludeGeneratorMetaElement</c> is set, removes the first meta child of the
        /// head whose name attribute is "Generator" (ignoring case) and prepends a new Generator
        /// meta element whose content is <c>Consts.NReadabilityFullName</c>.
        /// </summary>
        /// <param name="headElement">Head element to modify.</param>
        /// <param name="domSerializationParams">Serialization switches; only the generator flag is read here.</param>
        private static void ProcessMetaGeneratorElement(XElement headElement, DomSerializationParams domSerializationParams)
        {
            if (domSerializationParams.DontIncludeGeneratorMetaElement)
            {
                return;
            }

            // Drop the first existing generator element, if any.
            headElement.GetChildrenByTagName("meta")
                       .FirstOrDefault(meta => "Generator".Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase))
                       ?.Remove();

            // Build the replacement in the head's own namespace and put it first.
            var generatorMeta = new XElement(XName.Get("meta", headElement.Name.NamespaceName));

            generatorMeta.Add(new XAttribute("name", "Generator"));
            generatorMeta.Add(new XAttribute("content", Consts.NReadabilityFullName));

            headElement.AddFirst(generatorMeta);
        }
Exemplo n.º 8
0
    /// <summary>
    /// Removes the first 'viewport' and the first 'HandheldFriendly' meta element from the head
    /// (matched on the name attribute, ignoring case). Unless
    /// <c>DontIncludeMobileSpecificMetaElements</c> is set, a new HandheldFriendly meta element
    /// with content "true" is then inserted as the head's first child.
    /// </summary>
    /// <param name="headElement">Head element to modify.</param>
    /// <param name="domSerializationParams">Serialization switches; only the mobile-specific flag is read here.</param>
    private static void ProcessMobileSpecificMetaElements(XElement headElement, DomSerializationParams domSerializationParams)
    {
      // The removals below happen regardless of the flag.
      XElement staleViewport = headElement
        .GetChildrenByTagName("meta")
        .FirstOrDefault(meta => "viewport".Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase));

      if (staleViewport != null)
      {
        staleViewport.Remove();
      }

      XElement staleHandheldFriendly = headElement
        .GetChildrenByTagName("meta")
        .FirstOrDefault(meta => "HandheldFriendly".Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase));

      if (staleHandheldFriendly != null)
      {
        staleHandheldFriendly.Remove();
      }

      if (domSerializationParams.DontIncludeMobileSpecificMetaElements)
      {
        return;
      }

      // the new element lives in the head's namespace (empty when the head has none)
      string namespaceName = "";

      if (headElement.Name != null)
      {
        namespaceName = headElement.Name.NamespaceName ?? "";
      }

      headElement.AddFirst(
        new XElement(
          XName.Get("meta", namespaceName),
          new XAttribute("name", "HandheldFriendly"),
          new XAttribute("content", "true")));
    }
 /// <summary>
 /// Transcodes the page at the given url by delegating to <c>DoTranscode</c> with the same arguments.
 /// </summary>
 /// <param name="url">Address of the page to transcode.</param>
 /// <param name="domSerializationParams">Parameters forwarded to <c>DoTranscode</c>.</param>
 /// <param name="mainContentExtracted">Forwarded out value from <c>DoTranscode</c>.</param>
 /// <param name="extractedTitle">Forwarded out value from <c>DoTranscode</c>.</param>
 /// <returns>Whatever <c>DoTranscode</c> returns.</returns>
 public string Transcode(string url, DomSerializationParams domSerializationParams, out bool mainContentExtracted, out string extractedTitle)
     => DoTranscode(url, domSerializationParams, out mainContentExtracted, out extractedTitle);
 /// <summary>
 /// Transcodes the page at the given url using the parameters returned by
 /// <c>DomSerializationParams.CreateDefault()</c>.
 /// </summary>
 /// <param name="url">Address of the page to transcode.</param>
 /// <param name="mainContentExtracted">Forwarded out value from the three-argument overload.</param>
 /// <returns>Whatever the three-argument overload returns.</returns>
 public string Transcode(string url, out bool mainContentExtracted)
 {
     var defaultParams = DomSerializationParams.CreateDefault();

     return Transcode(url, defaultParams, out mainContentExtracted);
 }
        /// <summary>
        /// Fetches the page at <paramref name="url"/>, transcodes it, appends the next page when the
        /// transcoder reports one, and returns the serialized result.
        /// </summary>
        /// <param name="url">Address of the page to fetch; also recorded (without one trailing slash) as already parsed.</param>
        /// <param name="domSerializationParams">Passed through to the serializer.</param>
        /// <param name="mainContentExtracted">Set by the transcoder; false when nothing could be fetched.</param>
        /// <param name="extractedTitle">Set by the transcoder; null when nothing could be fetched.</param>
        /// <returns>The serialized document, or null when the fetcher returned null or an empty string.</returns>
        private string DoTranscode(string url, DomSerializationParams domSerializationParams, out bool mainContentExtracted, out string extractedTitle)
        {
            _curPageNum = 1;

            /* Start a fresh list and record this document first, so we don't double up on the first page */
            _parsedPages = new List<string>();

            string firstPageKey = Regex.Replace(url, @"\/$", "");

            _parsedPages.Add(firstPageKey);

            string fetchedMarkup = _urlFetcher.Fetch(url);

            /* Nothing fetched: report failure through the out parameters and bail out. */
            if (string.IsNullOrEmpty(fetchedMarkup))
            {
                extractedTitle = null;
                mainContentExtracted = false;

                return null;
            }

            /* Attempt to transcode the page */
            string nextPage;
            XDocument document = _transcoder.TranscodeToXml(fetchedMarkup, url, out mainContentExtracted, out extractedTitle, out nextPage);

            if (nextPage != null)
            {
                AppendNextPage(document, nextPage);
            }

            /* If there are multiple pages, rename the first content div */
            if (_curPageNum > 1)
            {
                var firstPageContainer = document.GetElementById("readInner").Element("div");

                firstPageContainer.SetId(_PageIdPrefix + "1");
                firstPageContainer.SetClass("page");
            }

            return _sgmlDomSerializer.SerializeDocument(document, domSerializationParams);
        }
        /// <summary>
        /// Extracts main article content from a HTML page and returns it as serialized markup.
        /// </summary>
        /// <param name="htmlContent">HTML markup to process.</param>
        /// <param name="url">Url from which the content was downloaded. Used to resolve relative urls. Can be null.</param>
        /// <param name="domSerializationParams">Contains parameters that modify the behaviour of the output serialization.</param>
        /// <param name="mainContentExtracted">Determines whether the content has been extracted (if the article is not empty).</param>
        /// <param name="nextPageUrl">If the content contains a link to a subsequent page, it is returned here.</param>
        /// <returns>HTML markup containing extracted article content.</returns>
        public string Transcode(string htmlContent, string url, DomSerializationParams domSerializationParams, out bool mainContentExtracted, out string nextPageUrl)
        {
            // Transcode first (this fills both out parameters), then serialize the resulting DOM.
            var articleDocument = TranscodeToXml(htmlContent, url, out mainContentExtracted, out nextPageUrl);
            var serializedArticle = _sgmlDomSerializer.SerializeDocument(articleDocument, domSerializationParams);

            return serializedArticle;
        }
Exemplo n.º 13
0
        /// <summary>
        /// Unless <c>DontIncludeGeneratorMetaElement</c> is set, removes the first meta child of the
        /// head whose name attribute is "Generator" (ignoring case) and prepends a new Generator
        /// meta element whose content is <c>Consts.NReadabilityFullName</c>.
        /// </summary>
        /// <param name="headElement">Head element to modify.</param>
        /// <param name="domSerializationParams">Serialization switches; only the generator flag is read here.</param>
        private static void ProcessMetaGeneratorElement(XElement headElement, DomSerializationParams domSerializationParams)
        {
            if (domSerializationParams.DontIncludeGeneratorMetaElement)
            {
                return;
            }

            XElement staleGenerator = headElement
                .GetChildrenByTagName("meta")
                .FirstOrDefault(meta => "Generator".Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase));

            // drop the stale element, if there is one
            if (staleGenerator != null)
            {
                staleGenerator.Remove();
            }

            // the new element lives in the head's namespace (empty when the head has none)
            string namespaceName = "";

            if (headElement.Name != null)
            {
                namespaceName = headElement.Name.NamespaceName ?? "";
            }

            headElement.AddFirst(
                new XElement(
                    XName.Get("meta", namespaceName),
                    new XAttribute("name", "Generator"),
                    new XAttribute("content", Consts.NReadabilityFullName)));
        }
Exemplo n.º 14
0
 /// <summary>
 ///   Serializes given DOM (System.Xml.Linq.XDocument object) to a string, using the parameters
 ///   returned by <c>DomSerializationParams.CreateDefault()</c>.
 /// </summary>
 /// <param name="document">System.Xml.Linq.XDocument instance containing the DOM to be serialized.</param>
 /// <returns>Serialized representation of the DOM.</returns>
 public string SerializeDocument(XDocument document)
     => SerializeDocument(document, DomSerializationParams.CreateDefault());
Exemplo n.º 15
0
    /// <summary>
    /// Unless <c>DontIncludeGeneratorMetaElement</c> is set, detaches the first meta child of the
    /// head whose name attribute is "Generator" (ignoring case) and passes that same element
    /// (or null when none was found) to <c>headElement.AddFirst</c>.
    /// </summary>
    /// <param name="headElement">Head element to modify.</param>
    /// <param name="domSerializationParams">Serialization switches; only the generator flag is read here.</param>
    private static void ProcessMetaGeneratorElement(XElement headElement, DomSerializationParams domSerializationParams)
    {
      if (domSerializationParams.DontIncludeGeneratorMetaElement)
      {
        return;
      }

      XElement existingGenerator = headElement
        .GetChildrenByTagName("meta")
        .FirstOrDefault(meta => "Generator".Equals(meta.GetAttributeValue("name", ""), StringComparison.OrdinalIgnoreCase));

      // detach the existing generator element, if there is one
      if (existingGenerator != null)
      {
        existingGenerator.Remove();
      }

      // NOTE(review): no new element is built here, so this only re-inserts the element that was
      // just detached (or passes null). Confirm whether creating a fresh Generator element was intended.
      headElement.AddFirst(existingGenerator);
    }
Exemplo n.º 16
0
 /// <summary>
 /// Runs the three meta-element processing steps on the given head element, in a fixed order:
 /// content type, then mobile-specific, then generator. Both arguments are passed unchanged
 /// to every step.
 /// </summary>
 /// <param name="headElement">Head element handed to each step.</param>
 /// <param name="domSerializationParams">Serialization parameters handed to each step.</param>
 private static void ProcessMetaElements(XElement headElement, DomSerializationParams domSerializationParams)
 {
   // NOTE(review): the helpers presumably prepend to the head, in which case this call order
   // determines the final order of the meta elements — confirm before reordering.
   ProcessMetaContentTypeElement(headElement, domSerializationParams);
   ProcessMobileSpecificMetaElements(headElement, domSerializationParams);
   ProcessMetaGeneratorElement(headElement, domSerializationParams);
 }
Exemplo n.º 17
0
 /// <summary>
 /// Serializes given DOM (System.Xml.Linq.XDocument object) to a string.
 /// </summary>
 /// <param name="document">System.Xml.Linq.XDocument instance containing the DOM to be serialized.</param>
 /// <param name="domSerializationParams">Contains parameters that modify the behaviour of the output serialization; only <c>PrettyPrint</c> is read here.</param>
 /// <returns>Serialized representation of the DOM.</returns>
 public string SerializeDocument(XDocument document, DomSerializationParams domSerializationParams)
 {
   // Formatting is disabled unless pretty printing was requested.
   SaveOptions saveOptions = SaveOptions.DisableFormatting;

   if (domSerializationParams.PrettyPrint)
   {
     saveOptions = SaveOptions.None;
   }

   return document.ToString(saveOptions);
 }
Exemplo n.º 18
0
 /// <summary>
 /// Runs the three meta-element processing steps on the given head element, in a fixed order:
 /// content type, then mobile-specific, then generator. Both arguments are passed unchanged
 /// to every step.
 /// </summary>
 /// <param name="headElement">Head element handed to each step.</param>
 /// <param name="domSerializationParams">Serialization parameters handed to each step.</param>
 private static void ProcessMetaElements(XElement headElement, DomSerializationParams domSerializationParams)
 {
     // NOTE(review): the helpers presumably prepend to the head, in which case this call order
     // determines the final order of the meta elements — confirm before reordering.
     ProcessMetaContentTypeElement(headElement, domSerializationParams);
     ProcessMobileSpecificMetaElements(headElement, domSerializationParams);
     ProcessMetaGeneratorElement(headElement, domSerializationParams);
 }
        /// <summary>
        /// Transcodes the page at the given url by delegating to <c>DoTranscode</c>; the title that
        /// <c>DoTranscode</c> reports is not exposed by this overload.
        /// </summary>
        /// <param name="url">Address of the page to transcode.</param>
        /// <param name="domSerializationParams">Parameters forwarded to <c>DoTranscode</c>.</param>
        /// <param name="mainContentExtracted">Forwarded out value from <c>DoTranscode</c>.</param>
        /// <returns>Whatever <c>DoTranscode</c> returns.</returns>
        public string Transcode(string url, DomSerializationParams domSerializationParams, out bool mainContentExtracted)
        {
            // Receives the title out value, which is then dropped.
            string discardedTitle;
            string transcoded = DoTranscode(url, domSerializationParams, out mainContentExtracted, out discardedTitle);

            return transcoded;
        }