/// <summary>
        /// Determines whether the homepage looks like it is hosted on Blogger.
        /// </summary>
        /// <remarks>
        /// Three signals are tried in order: the homepage URL (a blogspot.com sub-domain or
        /// blogger.com itself, over http or https), the <c>BloggerGeneratorCriterion</c> matched
        /// against the homepage markup, and finally any atom "service.post" link whose href
        /// points at a blogger.com or blogspot.com host.
        /// </remarks>
        /// <returns><c>true</c> if any of the signals matches; otherwise <c>false</c>.</returns>
        public bool IsBlogger()
        {
            // blogspot blogs are served over https as well as http, so accept both schemes.
            if (Regex.IsMatch(homepageUrl, @"^https?://.+\.blogspot\.com($|/)", RegexOptions.IgnoreCase)
                || Regex.IsMatch(homepageUrl, @"^http(s)?://(www\.)?blogger\.com($|/)", RegexOptions.IgnoreCase)
                || new HtmlExtractor(html).Seek(new BloggerGeneratorCriterion()).Success)
            {
                return true;
            }

            HtmlExtractor ex = new HtmlExtractor(html);
            while (ex.Seek("<link href rel='service.post' type='application/atom+xml'>").Success)
            {
                BeginTag bt = (BeginTag)ex.Element;
                string atomHref = bt.GetAttributeValue("href");

                // An href attribute without a usable value gives us nothing to inspect;
                // move on to the next link instead of dereferencing null.
                if (String.IsNullOrEmpty(atomHref))
                    continue;

                // these obsolete Blogger atom links can't be used, but are
                // still a good indication that it's Blogger
                if (atomHref.StartsWith("https://www.blogger.com/atom/", StringComparison.OrdinalIgnoreCase))
                    return true;

                // any other blogger or blogspot atom link will be considered a match
                if (Regex.IsMatch(atomHref, @"^https?\:\/\/.+\.blog(ger|spot)\.com\/.*", RegexOptions.IgnoreCase))
                    return true;
            }

            return false;
        }
Esempio n. 2
0
 /// <summary>
 /// Downloads the homepage and applies <paramref name="regex"/> to the text element
 /// located by the extractor.
 /// </summary>
 /// <param name="regex">Pattern tested against the string form of each text element.</param>
 /// <returns>
 /// The match produced for the located text element, or <see cref="Match.Empty"/> when
 /// no text element satisfies the pattern.
 /// </returns>
 public Match MatchHomepageText(Regex regex)
 {
     HtmlExtractor extractor = new HtmlExtractor(DownloadHomepage());
     DelegatePredicate matchingText = new DelegatePredicate(e => e is Text && regex.IsMatch(e.ToString()));

     // Nothing on the page satisfies the pattern.
     if (!extractor.Seek(matchingText).Success)
         return Match.Empty;

     // Re-run the pattern on the located element to obtain the full Match object.
     return regex.Match(extractor.Element.ToString());
 }
        /// <summary>
        /// Looks through the script tags of a page for a script whose URL matches the
        /// pattern of one of the known image viewers.
        /// </summary>
        /// <param name="html">Markup to scan for script tags that carry a src attribute.</param>
        /// <param name="sourceUrl">URL used to resolve script sources that are not already URLs.</param>
        /// <returns>The first viewer whose pattern matches a script URL, or <c>null</c> if none does.</returns>
        public static ImageViewer DetectImageViewer(string html, string sourceUrl)
        {
            List<ImageViewer> knownViewers = imageViewers;

            // The pattern list is only requested (via Value) once a script src is actually tested.
            LazyLoader<List<Regex>> lazyPatterns = new LazyLoader<List<Regex>>(delegate
              {
                  List<Regex> compiled = new List<Regex>(knownViewers.Count);
                  foreach (ImageViewer candidate in knownViewers)
                      compiled.Add(new Regex(candidate.Pattern, RegexOptions.CultureInvariant));
                  return compiled;
              });

            HtmlExtractor scanner = new HtmlExtractor(html);
            while (scanner.Seek("<script src>").Success)
            {
                string scriptUrl = ((BeginTag)scanner.Element).GetAttributeValue("src");
                if (String.IsNullOrEmpty(scriptUrl))
                    continue;

                try
                {
                    // We need absolute URLs.
                    if (!UrlHelper.IsUrl(scriptUrl))
                        scriptUrl = UrlHelper.EscapeRelativeURL(sourceUrl, scriptUrl);

                    Uri parsed = new Uri(scriptUrl);
                    if (parsed.IsAbsoluteUri)
                    {
                        // WinLive 248276: We want just the path portion since there could be an additional query or
                        // fragment on the URL that our regexs can't handle.
                        scriptUrl = parsed.GetLeftPart(UriPartial.Path);
                    }
                }
                catch (UriFormatException)
                {
                    // Unparseable URL: test the patterns against whatever value we have so far.
                }

                // Patterns and viewers are parallel lists, so track the position while iterating.
                int position = 0;
                foreach (Regex pattern in lazyPatterns.Value)
                {
                    if (pattern.IsMatch(scriptUrl))
                        return knownViewers[position];
                    position++;
                }
            }

            return null;
        }
Esempio n. 4
0
        /// <summary>
        /// Converts an Atom entry element into a <c>BlogPost</c>.
        /// </summary>
        /// <param name="entryNode">The entry element to read.</param>
        /// <param name="includeCategories">Whether categories should be copied (only honoured when the options support categories).</param>
        /// <param name="documentUri">Passed through to the <c>AtomEntry</c> wrapper.</param>
        /// <returns>The populated post.</returns>
        public override BlogPost Parse(XmlElement entryNode, bool includeCategories, Uri documentUri)
        {
            BlogPost result = new BlogPost();
            AtomEntry entry = new AtomEntry(_atomVer, _atomNS, CategoryScheme, _nsMgr, documentUri, entryNode);

            result.Title = entry.Title;
            result.Excerpt = entry.Excerpt;
            result.Id = PostUriToPostId(entry.EditUri);
            result.Permalink = entry.Permalink;

            string body = entry.ContentHtml;
            if (body.Trim().Length > 0)
            {
                HtmlExtractor extractor = new HtmlExtractor(body);
                if (!Options.SupportsExtendedEntries || !extractor.Seek("<a name=\"more\">").Success)
                {
                    // No split marker (or splitting unsupported): the whole body is the post.
                    result.Contents = body;
                }
                else
                {
                    // Remember where the marker anchor sits before seeking past it.
                    int anchorStart = extractor.Element.Offset;
                    int anchorLength = extractor.Element.Length;

                    // The extended part starts after the closing </a> when there is one,
                    // otherwise directly after the opening anchor tag.
                    int extendedStart = extractor.Seek("</a>").Success
                        ? extractor.Element.Offset + extractor.Element.Length
                        : anchorStart + anchorLength;

                    result.SetContents(body.Substring(0, anchorStart), body.Substring(extendedStart));
                }
            }

            result.DatePublished = entry.PublishDate;
            if (Options.SupportsCategories && includeCategories)
                result.Categories = entry.Categories;

            return result;
        }
        /// <summary>
        /// Reads the width and height attributes of the embed or object tag located in
        /// <paramref name="input"/>.
        /// </summary>
        /// <param name="input">Markup to search; may be null or empty.</param>
        /// <returns>
        /// The size declared on the tag when both dimensions are plain integers; otherwise the
        /// default size built from <c>_width</c> and <c>_height</c>.
        /// </returns>
        Size FindSizeAttribute(string input)
        {
            Size size = new Size(_width, _height);

            if (string.IsNullOrEmpty(input))
                return size;

            try
            {
                // Only tags that carry BOTH attributes are considered.
                RequiredAttribute[] dimensionAttrs = new RequiredAttribute[] { new RequiredAttribute("width"), new RequiredAttribute("height") };
                IElementPredicate predicate = new OrPredicate(new BeginTagPredicate("embed", dimensionAttrs), new BeginTagPredicate("object", dimensionAttrs));
                HtmlExtractor extractor = new HtmlExtractor(input);
                if (extractor.Seek(predicate).Success)
                {
                    BeginTag tag = (BeginTag)extractor.Element;

                    // Values such as "100%" or "400px" are ordinary in markup; treat them as
                    // "no usable size" rather than raising and reporting an exception.
                    int width, height;
                    if (int.TryParse(tag.GetAttributeValue("width"), NumberStyles.Integer, CultureInfo.InvariantCulture, out width)
                        && int.TryParse(tag.GetAttributeValue("height"), NumberStyles.Integer, CultureInfo.InvariantCulture, out height))
                    {
                        size = new Size(width, height);
                    }
                }
            }
            catch (Exception ex)
            {
                // Anything else going wrong here is unexpected; report it but keep the default size.
                Trace.Fail("Exception thrown while trying to find video size: " + ex);
            }

            return size;
        }
 /// <summary>
 /// Extracts an id from the embed tag in <paramref name="input"/> using the configured
 /// embed patterns.
 /// </summary>
 /// <param name="input">Markup expected to contain an embed tag.</param>
 /// <returns>
 /// The value of the "id" group of the first pattern that matches its attribute, or
 /// <see cref="String.Empty"/> when no pattern yields an id.
 /// </returns>
 private string IdFromEmbed(string input)
 {
     foreach (EmbedPattern check in _embedPatterns)
     {
         IElementPredicate predicate = new BeginTagPredicate("embed", new RequiredAttribute[] { new RequiredAttribute(check.Attr) });
         HtmlExtractor ex = new HtmlExtractor(input);
         ex = ex.Seek(predicate);
         if (!ex.Success)
             continue;

         BeginTag bt = ex.Element as BeginTag;
         string srcRef = bt == null ? null : bt.GetAttributeValue(check.Attr);

         // The attribute may be present without a value; Regex.Match would throw on null,
         // so such a tag simply cannot supply an id for this pattern.
         if (srcRef == null)
             continue;

         Match m = Regex.Match(srcRef, check.Pattern, RegexOptions.IgnoreCase);
         if (m.Success && m.Groups["id"].Success)
         {
             return m.Groups["id"].Value;
         }
     }
     return String.Empty;
 }
 /// <summary>
 /// Checks whether the embed tag in <paramref name="input"/> satisfies every configured
 /// embed pattern.
 /// </summary>
 /// <param name="input">Markup expected to contain an embed tag.</param>
 /// <returns>
 /// <c>true</c> only if, for each pattern, an embed tag carrying the pattern's attribute is
 /// found and the attribute value matches the pattern; otherwise <c>false</c>.
 /// </returns>
 public bool MatchesEmbed(string input)
 {
     foreach (EmbedPattern check in _embedPatterns)
     {
         IElementPredicate predicate = new BeginTagPredicate("embed", new RequiredAttribute[] { new RequiredAttribute(check.Attr) });
         HtmlExtractor ex = new HtmlExtractor(input);
         ex = ex.Seek(predicate);
         if (!ex.Success)
         {
             return false; //didn't find embed tag with the attr
         }

         BeginTag bt = ex.Element as BeginTag;
         string srcRef = bt == null ? null : bt.GetAttributeValue(check.Attr);

         // An attribute without a value cannot satisfy the pattern, and Regex.IsMatch
         // would throw on null, so treat it as a non-match.
         if (srcRef == null || !Regex.IsMatch(srcRef, check.Pattern, RegexOptions.IgnoreCase))
         {
             return false;
         }
     }
     return true; //found all predicates
 }
 /// <summary>
 /// Returns the href of the document's base tag when it is a URL, falling back to
 /// <paramref name="defaultUrl"/> otherwise.
 /// </summary>
 /// <param name="html">Markup to search for a base tag with an href attribute.</param>
 /// <param name="defaultUrl">Value returned when no usable base href is present.</param>
 /// <returns>The base URL to use for the document.</returns>
 public static string GetBaseUrl(string html, string defaultUrl)
 {
     HtmlExtractor baseFinder = new HtmlExtractor(html);
     if (!baseFinder.Seek("<base href>").Success)
         return defaultUrl;

     // Only accept the declared base when UrlHelper recognises it as a URL.
     string declaredBase = ((BeginTag)baseFinder.Element).GetAttributeValue("href");
     return UrlHelper.IsUrl(declaredBase) ? declaredBase : defaultUrl;
 }
        /// <summary>
        /// Reads the post body from <paramref name="xmlNode"/> and fills in the title and
        /// contents of <paramref name="blogPost"/>.
        /// </summary>
        /// <param name="xmlNode">Node holding the content, either as text or inside a "base64" child.</param>
        /// <param name="blogPost">Post that receives the title and contents.</param>
        private void ParsePostContent(XmlNode xmlNode, BlogPost blogPost)
        {
            // A "base64" child means the payload is encoded; otherwise the node text is the content.
            XmlNode encodedNode = xmlNode.SelectSingleNode("base64");
            string body = encodedNode == null
                ? xmlNode.InnerText
                : _utf8EncodingNoBOM.GetString(Convert.FromBase64String(encodedNode.InnerText));

            // A leading title element supplies the post title; the body continues after it.
            HtmlExtractor titleFinder = new HtmlExtractor(body);
            if (titleFinder.Seek("<title>").Success)
            {
                SetPostTitleFromXmlValue(blogPost, titleFinder.CollectTextUntil("title"));
                body = body.Substring(titleFinder.Parser.Position).TrimStart('\r', '\n');
            }

            // Whitespace-only bodies leave the post contents untouched.
            if (body.Trim().Length == 0)
                return;

            HtmlExtractor cutFinder = new HtmlExtractor(body);
            bool hasCut = Options.SupportsExtendedEntries && cutFinder.Seek("<lj-cut>").Success;
            if (!hasCut)
            {
                blogPost.Contents = body;
                return;
            }

            // Split around the cut tag: text before it, and text after it.
            int cutStart = cutFinder.Element.Offset;
            int cutEnd = cutFinder.Element.Offset + cutFinder.Element.Length;
            blogPost.SetContents(body.Substring(0, cutStart), body.Substring(cutEnd));
        }
        /// <summary>
        /// Tries to configure the blog from an Atom service link found in the homepage markup.
        /// </summary>
        /// <param name="url">Not referenced by this method's body.</param>
        /// <param name="html">Homepage markup to inspect; a null value makes the method return false.</param>
        /// <param name="preferredOnly">
        /// When true, the link is accepted only if its class attribute contains the word "preferred".
        /// </param>
        /// <returns>
        /// true when a service link was found (and passed the preferred check, if requested) and the
        /// detection fields were filled in; false otherwise.
        /// </returns>
        private bool AttemptGenericAtomLinkDetection(string url, string html, bool preferredOnly)
        {
            const string GENERIC_ATOM_PROVIDER_ID = "D48F1B5A-06E6-4f0f-BD76-74F34F520792";

            if (html == null)
                return false;

            // Look for a link tag with href, rel='service' and type='application/atomsvc+xml'.
            // NOTE(review): SeekWithin presumably bounds each search by its second argument
            // (head before body, link before the end of head) - confirm against HtmlExtractor.
            HtmlExtractor ex = new HtmlExtractor(html);
            if (ex
                .SeekWithin("<head>", "<body>")
                .SeekWithin("<link href rel='service' type='application/atomsvc+xml'>", "</head>")
                .Success)
            {
                // NOTE(review): the provider lookup result is used below without a null check.
                IBlogProvider atomProvider = BlogProviderManager.FindProvider(GENERIC_ATOM_PROVIDER_ID);

                // NOTE(review): 'as' cast is dereferenced without a null check.
                BeginTag bt = ex.Element as BeginTag;

                if (preferredOnly)
                {
                    // Reject links that have no class attribute or whose classes lack "preferred".
                    string classes = bt.GetAttributeValue("class");
                    if (classes == null)
                        return false;
                    if (!Regex.IsMatch(classes, @"\bpreferred\b"))
                        return false;
                }

                string linkUrl = bt.GetAttributeValue("href");

                Debug.WriteLine("Atom service link detected in the blog homepage");

                // Record the provider details and use the link's href as the post API URL.
                _providerId = atomProvider.Id;
                _serviceName = atomProvider.Name;
                _clientType = atomProvider.ClientType;
                _blogName = string.Empty;
                _postApiUrl = linkUrl;

                // NOTE(review): the fields above are already assigned when the client calls below
                // run; an exception from those calls would leave the assignments in place.
                IBlogClient client = BlogClientManager.CreateClient(atomProvider.ClientType, _postApiUrl, _credentials);
                client.VerifyCredentials();
                _usersBlogs = client.GetUsersBlogs();
                // Only an unambiguous result (exactly one blog) selects the host blog id and name.
                if (_usersBlogs.Length == 1)
                {
                    _hostBlogId = _usersBlogs[0].Id;
                    _blogName = _usersBlogs[0].Name;
                    /*
                                        if (_usersBlogs[0].HomepageUrl != null && _usersBlogs[0].HomepageUrl.Length > 0)
                                            _homepageUrl = _usersBlogs[0].HomepageUrl;
                    */
                }

                // attempt to read the blog name from the homepage title
                if (_blogName == null || _blogName.Length == 0)
                {
                    // A fresh extractor is used so the title search starts from the top of the markup.
                    HtmlExtractor ex2 = new HtmlExtractor(html);
                    if (ex2.Seek("<title>").Success)
                    {
                        _blogName = ex2.CollectTextUntil("title");
                    }
                }

                return true;
            }
            return false;
        }
 /// <summary>
 /// Reports whether <paramref name="html"/> contains an embed, object or iframe begin tag.
 /// </summary>
 /// <param name="html">Markup to inspect.</param>
 /// <returns>
 /// <c>true</c> if one of the tags is found; <c>false</c> if none is found or if the
 /// search throws.
 /// </returns>
 private static bool ContainsEmbedOrObject(string html)
 {
     bool found;
     try
     {
         IElementPredicate embeddedContent = new OrPredicate(new BeginTagPredicate("embed"), new BeginTagPredicate("object"), new BeginTagPredicate("iframe"));
         HtmlExtractor scanner = new HtmlExtractor(html);
         found = scanner.Seek(embeddedContent).Success;
     }
     catch
     {
         // Best effort: any failure while scanning counts as "not found".
         found = false;
     }
     return found;
 }
        /// <summary>
        /// Decides whether content should be refreshed by comparing the title attribute of
        /// the img tag located in the old and in the new markup.
        /// </summary>
        /// <param name="oldHTML">Previously generated markup.</param>
        /// <param name="newHTML">Newly generated markup.</param>
        /// <returns>
        /// <c>false</c> only when both documents contain an img tag with a title attribute and
        /// the two titles are equal; <c>true</c> in every other case.
        /// </returns>
        public bool ShouldUpdateContent(string oldHTML, string newHTML)
        {
            HtmlExtractor previousImage = new HtmlExtractor(oldHTML).Seek("<img title>");
            HtmlExtractor currentImage = new HtmlExtractor(newHTML).Seek("<img title>");

            // Without a titled image on both sides there is nothing to compare: update.
            if (!previousImage.Success || !currentImage.Success)
                return true;

            string previousTitle = ((BeginTag)previousImage.Element).GetAttributeValue("title");
            string currentTitle = ((BeginTag)currentImage.Element).GetAttributeValue("title");
            return previousTitle != currentTitle;
        }
        // Warning: Does not deal with escaping properly. This is fine as long as
        // we're only using it for content we generate and there are no security
        // implications.
        /// <summary>
        /// Removes every div carrying the given CSS class, together with the markup collected
        /// up to the following div end tag, from <paramref name="html"/>.
        /// </summary>
        /// <param name="html">Markup to filter.</param>
        /// <param name="cssClass">Class value of the divs to strip.</param>
        /// <returns>The markup without the matching divs; the input itself if the class name never occurs.</returns>
        public static string StripDivsWithClass(string html, string cssClass)
        {
            // Cheap pre-check. The class name is an identifier, not linguistic text, so compare
            // ordinally instead of with the current culture's rules.
            if (html.IndexOf(cssClass, StringComparison.Ordinal) < 0)
                return html;

            StringBuilder sb = new StringBuilder();
            HtmlExtractor ex = new HtmlExtractor(html);
            int pos = 0;
            while (ex.Seek("<div class='" + cssClass + "'>").Success)
            {
                // Keep everything between the previous cut and the start of this div...
                sb.Append(html, pos, ex.Element.Offset - pos);
                // ...then skip ahead to wherever the parser stands after collecting the div.
                ex.Parser.CollectHtmlUntil("div");
                pos = ex.Parser.Position;
            }
            sb.Append(html, pos, html.Length - pos);
            return sb.ToString();
        }
        /// <summary>
        /// Any setting that is derived from the homepage html needs to be in this function.  This function is turned
        /// on and off when detecting blog settings through the IncludeHomePageSettings.  None of these checks will be run
        /// if the internet is not active.  As each check is made, it does not need to be applied back to the _content until the end
        /// at which time it will write the settings back to the registry.
        /// </summary>
        /// <remarks>
        /// The checks performed here are: character set, XHTML doctype, right-to-left direction on the
        /// html or body tag, and the DHTML image viewer in use. The collected values are handed to
        /// <c>_context.HomePageOverrides</c>. Nothing happens when no homepage document is available.
        /// </remarks>
        private void DetectHomePageSettings()
        {
            if (_homepageAccessor.HtmlDocument == null) return;

            IDictionary homepageSettings = new Hashtable();

            Debug.Assert(!UseManifestCache, "This code will not run correctly under the manifest cache, due to option overrides not being set");

            // A single metadata reader serves both the charset and the doctype checks.
            LightWeightHTMLMetaData metaData = new LightWeightHTMLMetaData(_homepageAccessor.HtmlDocument);
            if (metaData.Charset != null)
            {
                try
                {
                    homepageSettings.Add(BlogClientOptions.CHARACTER_SET, metaData.Charset);
                }
                catch (NotSupportedException)
                {
                    //not an actual encoding
                }

            }

            string docType = metaData.DocType;
            if (docType != null)
            {
                bool xhtml = docType.IndexOf("xhtml", StringComparison.OrdinalIgnoreCase) >= 0;
                if (xhtml)
                {
                    homepageSettings.Add(BlogClientOptions.REQUIRES_XHTML, true.ToString(CultureInfo.InvariantCulture));
                }
            }

            //checking whether blog is rtl
            HtmlExtractor extractor = new HtmlExtractor(_homepageAccessor.HtmlDocument.RawHtml);
            if (extractor.Seek(new OrPredicate(
                new SmartPredicate("<html dir>"),
                new SmartPredicate("<body dir>"))).Success)
            {
                BeginTag tag = (BeginTag)extractor.Element;
                string dir = tag.GetAttributeValue("dir");
                if (String.Compare(dir, "rtl", StringComparison.OrdinalIgnoreCase) == 0)
                {
                    homepageSettings.Add(BlogClientOptions.TEMPLATE_IS_RTL, true.ToString(CultureInfo.InvariantCulture));
                }
            }

            // The homepage document was checked on entry and already dereferenced above,
            // so image viewer detection runs without a second null check.
            string html = _homepageAccessor.OriginalHtml;
            ImageViewer viewer = DhtmlImageViewers.DetectImageViewer(html, _context.HomepageUrl);
            if (viewer != null)
            {
                homepageSettings.Add(BlogClientOptions.DHTML_IMAGE_VIEWER, viewer.Name);
            }

            _context.HomePageOverrides = homepageSettings;
        }
Esempio n. 15
0
 /// <summary>
 /// Replaces placeholder spans in <paramref name="html"/> with the content that was
 /// stored for them in <c>preserved</c>.
 /// </summary>
 /// <param name="html">Markup containing spans marked with the preserve class.</param>
 /// <returns>
 /// The markup with every recognised placeholder swapped for its stored value. Spans whose
 /// id does not have the form "preserve" + key, or whose key is unknown, are left as they are.
 /// </returns>
 public string RestorePreserved(string html)
 {
     string placeholderCriterion = "<span class='" + PRESERVE_CLASS + "'>";
     StringBuilder output = new StringBuilder();
     HtmlExtractor extractor = new HtmlExtractor(html);
     int copiedUpTo = 0;

     while (extractor.Seek(placeholderCriterion).Success)
     {
         // Copy everything up to the span; the span itself stays pending until we know
         // whether it will be replaced.
         int spanStart = extractor.Element.Offset;
         output.Append(html, copiedUpTo, spanStart - copiedUpTo);
         copiedUpTo = spanStart;

         string elementId = ((BeginTag)extractor.Element).GetAttributeValue("id");
         Match idMatch = Regex.Match(elementId ?? "", @"^preserve([a-zA-Z0-9]+)$");
         if (!idMatch.Success)
             continue;

         string preservedValue;
         if (!preserved.TryGetValue(idMatch.Groups[1].Value, out preservedValue))
             continue;

         // Emit the stored content and skip over the placeholder span.
         output.Append(preservedValue);
         extractor.CollectTextUntil("span");
         copiedUpTo = extractor.Element == null ? html.Length : extractor.Parser.Position;
     }

     output.Append(html, copiedUpTo, html.Length - copiedUpTo);
     return output.ToString();
 }
 /// <summary>
 /// Creates a predicate from a textual criterion by handing it to
 /// <c>HtmlExtractor.Parse</c> and keeping the parsed result as the wrapped predicate.
 /// </summary>
 /// <param name="criterion">Criterion text in the form accepted by <c>HtmlExtractor.Parse</c>.</param>
 public SmartPredicate(string criterion)
 {
     actualPredicate = HtmlExtractor.Parse(criterion);
 }
 /// <summary>
 /// Reports whether <paramref name="input"/> contains an embed or object begin tag.
 /// </summary>
 /// <param name="input">Markup to inspect.</param>
 /// <returns><c>true</c> if either tag is found; otherwise <c>false</c>.</returns>
 private static bool ContainsEmbed(string input)
 {
     BeginTagPredicate embedTag = new BeginTagPredicate("embed");
     BeginTagPredicate objectTag = new BeginTagPredicate("object");
     IElementPredicate eitherTag = new OrPredicate(embedTag, objectTag);

     return new HtmlExtractor(input).Seek(eitherTag).Success;
 }