C# (CSharp) IHtmlDocument.GetElementsByTagName Examples

Programming Language: C# (CSharp)

Class/Type: IHtmlDocument

Method/Function: GetElementsByTagName

Examples at hotexamples.com: 12

C# (CSharp) IHtmlDocument.GetElementsByTagName - 12 examples found. These are the top rated real world C# (CSharp) examples of IHtmlDocument.GetElementsByTagName extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

QuerySelectorAll(30)

GetElementsByClassName(30)

GetElementById(30)

CreateElement(25)

QuerySelector(18)

ToHtml(16)

Find(15)

GetElementsByTagName(12)

GetElementByPath(6)

Descendents(6)

Dispose(6)

GetFormAction(5)

GetFormInputs(5)

Render(4)

Nodes(3)

CreateTextNode(3)

ShouldContainErrors(2)

GetInputValue(2)

InnerHtml(2)

Elements(2)

DescendantNodes(2)

Require(2)

GetTableByPath(2)

GetElementsByName(2)

createTextNode(1)

createElement(1)

QuerySelectorOrThrow(1)

createShadowRootElement(1)

ThrowIfNull(1)

createDocumentFragment(1)

ResolveUriToAbsoluate(1)

Should(1)

AddElement(1)

Normalize(1)

GetNodes(1)

GetInputFieldValue(1)

AllElements(1)

GetElementByTestId(1)

FindFirstOrDefault(1)

FindFirst(1)

Exists(1)

Equals(1)

Descendants(1)

DataBind(1)

CreateTreeWalker(1)

CreateNavigator(1)

Clone(1)

getElementById(1)

Example #1

Show file

        /// <summary>
        /// Attempts to get metadata for the article.
        /// </summary>
        /// <remarks>
        /// Values are collected from the document's <c>meta</c> elements into a
        /// dictionary keyed by the normalized (lower-cased, whitespace-free)
        /// name/property, and the individual metadata fields are then filled from
        /// prioritized lists of those keys. The publication date additionally falls
        /// back to <c>time</c> elements and finally to a <c>/yyyy/MM/dd</c> pattern in
        /// the URI; URI segments that do not form a valid calendar date are ignored.
        /// </remarks>
        /// <param name="doc">The document</param>
        /// <param name="uri">The uri, possibly used to check for a date</param>
        /// <param name="language">The language that was possibly found in the headers of the response</param>
        /// <returns>The metadata object with all the info found</returns>
        internal static Metadata GetArticleMetadata(IHtmlDocument doc, Uri uri, string language)
        {
            Metadata metadata = new Metadata();
            Dictionary<string, string> values = new Dictionary<string, string>();
            var metaElements = doc.GetElementsByTagName("meta");

            // Match "description", or Twitter's "twitter:description" (Cards)
            // in name attribute.
            // name is a single value
            var namePattern = @"^\s*((?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)|name)\s*$";

            // Match Facebook's Open Graph title & description properties.
            // property is a space-separated list of values
            var propertyPattern = @"\s*(dc|dcterm|og|twitter|article)\s*:\s*(author|creator|description|title|published_time|image|site_name)(\s+|$)";

            var itemPropPattern = @"\s*datePublished\s*";

            // Collect the interesting meta tags into `values`.
            NodeUtility.ForEachNode(metaElements, (element) =>
            {
                // Anything that is not an element cannot carry attributes; skip it
                // instead of dereferencing a failed cast.
                var meta = element as IElement;
                if (meta == null)
                {
                    return;
                }

                var elementName     = meta.GetAttribute("name") ?? "";
                var elementProperty = meta.GetAttribute("property") ?? "";
                var itemProp        = meta.GetAttribute("itemprop") ?? "";
                var content         = meta.GetAttribute("content");

                // A meta tag without content carries no information for us.
                if (String.IsNullOrEmpty(content))
                {
                    return;
                }

                // A plain "author" tag wins immediately and sets both fields.
                if (elementName == "author" || elementProperty == "author" || itemProp == "author")
                {
                    metadata.Byline = content;
                    metadata.Author = content;
                    return;
                }

                MatchCollection matches = null;
                String name             = "";

                if (!String.IsNullOrEmpty(elementProperty))
                {
                    // Note: this match is case-sensitive, unlike the IsMatch calls below.
                    matches = Regex.Matches(elementProperty, propertyPattern);
                    for (int i = matches.Count - 1; i >= 0; i--)
                    {
                        // Convert to lowercase, and remove any whitespace
                        // so we can match below.
                        name         = Regex.Replace(matches[i].Value.ToLower(), @"\s+", "");
                        values[name] = content.Trim();
                    }
                }

                if ((matches == null || matches.Count == 0) &&
                    !String.IsNullOrEmpty(elementName) && Regex.IsMatch(elementName, namePattern, RegexOptions.IgnoreCase))
                {
                    // Convert to lowercase, remove any whitespace, and convert dots
                    // to colons so we can match below.
                    name         = Regex.Replace(Regex.Replace(elementName.ToLower(), @"\s+", ""), @"\.", ":");
                    values[name] = content.Trim();
                }
                else if (Regex.IsMatch(elementProperty, propertyPattern, RegexOptions.IgnoreCase))
                {
                    name = elementProperty;
                }
                else if (Regex.IsMatch(itemProp, itemPropPattern, RegexOptions.IgnoreCase))
                {
                    name = itemProp;
                }

                if (!String.IsNullOrEmpty(name))
                {
                    // Convert to lowercase and remove any whitespace
                    // so we can match below. Existing entries are never overwritten here.
                    name = Regex.Replace(name.ToLower(), @"\s", "", RegexOptions.IgnoreCase);
                    if (!values.ContainsKey(name))
                    {
                        values.Add(name, content.Trim());
                    }
                }
            });

            // Returns the collected value for a key, or null when the key is absent.
            string ValueOrNull(string key)
            {
                return values.TryGetValue(key, out var found) ? found : null;
            }

            // Returns the first non-empty collected value among the keys (in order), or "".
            string FirstValue(params string[] keys)
            {
                return keys.Select(k => ValueOrNull(k)).FirstOrDefault(v => !String.IsNullOrEmpty(v)) ?? "";
            }

            // Find the description of the article
            metadata.Excerpt = FirstValue(
                "description",
                "dc:description",
                "dcterm:description",
                "og:description",
                "weibo:article:description",
                "weibo:webpage:description",
                "twitter:description");

            // Get the name of the site
            if (values.TryGetValue("og:site_name", out var siteName))
            {
                metadata.SiteName = siteName;
            }

            // Find the title of the article
            metadata.Title = FirstValue(
                "dc:title",
                "dcterm:title",
                "og:title",
                "weibo:article:title",
                "weibo:webpage:title",
                "twitter:title",
                "title");

            // Let's try to eliminate the site name from the title
            metadata.Title = Readability.CleanTitle(metadata.Title, metadata.SiteName);

            // We did not find any title,
            // we try to get it from the title tag
            if (String.IsNullOrEmpty(metadata.Title))
            {
                metadata.Title = Readability.GetArticleTitle(doc);
            }

            // Language extraction. This is a lazy iterator, so the document is only
            // queried when the language passed in by the caller is empty.
            IEnumerable<string> LanguageHeuristics()
            {
                yield return language;

                // Do not assume an <html> element exists.
                var html = doc.GetElementsByTagName("html").FirstOrDefault();

                yield return html?.GetAttribute("lang");

                yield return html?.GetAttribute("xml:lang");

                yield return doc.QuerySelector("meta[http-equiv=\"Content-Language\"]")?.GetAttribute("content");

                // this is wrong, but it's used
                yield return doc.QuerySelector("meta[name=\"lang\"]")?.GetAttribute("value");
            }

            metadata.Language = LanguageHeuristics().FirstOrDefault(l => !String.IsNullOrEmpty(l)) ?? "";

            // Find the featured image of the article
            metadata.FeaturedImage = FirstValue(
                "og:image",
                "twitter:image",
                "weibo:article:image",
                "weibo:webpage:image");

            if (String.IsNullOrEmpty(metadata.Author))
            {
                // We try to find a meta tag for the author.
                // Note that there is an Open Graph tag for an author,
                // but it usually contains a profile URL of the author.
                // So we do not use it
                metadata.Author = FirstValue("dc:creator", "dcterm:creator", "author");
            }

            // Date extraction. A key that is missing or does not parse yields
            // DateTime.MinValue, which is treated as "not found" below.
            DateTime? DateOrMinValue(string key)
            {
                return values.TryGetValue(key, out var raw) && DateTime.TryParse(raw, out var parsedValue)
                    ? parsedValue
                    : DateTime.MinValue;
            }

            var dateKeys = new[]
            {
                "article:published_time",
                "date",
                "datepublished",
                "weibo:article:create_at",
                "weibo:webpage:create_at"
            };

            // FirstOrDefault on a sequence of DateTime? gives null when nothing qualifies.
            metadata.PublicationDate = dateKeys
                                       .Select(k => DateOrMinValue(k))
                                       .FirstOrDefault(d => d != DateTime.MinValue);

            if (metadata.PublicationDate == null)
            {
                var times = doc.GetElementsByTagName("time");

                foreach (var time in times)
                {
                    // NOTE(review): only a non-empty "pubDate" attribute value qualifies; a bare
                    // boolean attribute presumably yields "" and is skipped - confirm this is intended.
                    // There is no break, so the last qualifying <time> element wins.
                    if (!String.IsNullOrEmpty(time.GetAttribute("pubDate")) &&
                        DateTime.TryParse(time.GetAttribute("datetime"), out DateTime fromTimeElement))
                    {
                        metadata.PublicationDate = fromTimeElement;
                    }
                }
            }

            if (metadata.PublicationDate == null)
            {
                // as a last resort check the URL for a date
                Match maybeDate = Regex.Match(uri.PathAndQuery, "/(?<year>[0-9]{4})/(?<month>[0-9]{2})/(?<day>[0-9]{2})?");
                if (maybeDate.Success)
                {
                    int year  = int.Parse(maybeDate.Groups["year"].Value);
                    int month = int.Parse(maybeDate.Groups["month"].Value);
                    int day   = !String.IsNullOrEmpty(maybeDate.Groups["day"].Value) ? int.Parse(maybeDate.Groups["day"].Value) : 1;

                    // The regex only guarantees digits, not a real calendar date
                    // (e.g. "/2020/13/45/" or "/0000/00/"); constructing a DateTime from
                    // such values would throw, so validate the components first.
                    if (year >= 1 && month >= 1 && month <= 12 &&
                        day >= 1 && day <= DateTime.DaysInMonth(year, month))
                    {
                        metadata.PublicationDate = new DateTime(year, month, day);
                    }
                }
            }

            return metadata;
        }

Example #2

Show file

File: FeedDetector.cs Project: DeCarabas/onceandfuture

        /// <summary>
        /// Looks for feed URLs starting from <paramref name="originUrl"/>.
        /// </summary>
        /// <remarks>
        /// Candidates are examined in this order: the URL itself, <c>link</c> elements whose
        /// type is listed in <c>FeedMimeTypes</c>, anchors on the same host, other anchors,
        /// and finally URLs built from <c>FeedNames</c> relative to the base URL. Each
        /// candidate list is passed through <c>FilterUrlsByFeed</c> (presumably removing
        /// entries that are not feeds - confirm against that helper) and sorted with
        /// <c>UrlFeedComparison</c>.
        /// </remarks>
        /// <param name="originUrl">The URL to start from; it is normalized with <c>FixupUrl</c>.</param>
        /// <param name="findAll">
        /// When false (the default) the first non-empty group of candidates is returned;
        /// when true every group is accumulated and returned together.
        /// </param>
        /// <returns>The feed URLs found, possibly empty.</returns>
        public static async Task<IList<Uri>> GetFeedUrls(
            string originUrl,
            bool findAll = false)
        {
            var everything = new List<Uri>();
            Uri baseUri    = FixupUrl(originUrl);

            // The starting URL may already be a feed; if so there is nothing to search.
            Log.FindFeedCheckingBase(baseUri);
            string body = await GetFeedData(baseUri);

            if (LooksLikeFeed(body))
            {
                Log.FindFeedBaseWasFeed(baseUri);
                return new[] { baseUri };
            }

            // Otherwise treat the payload as HTML and inspect it.
            IHtmlDocument document = new HtmlParser().ParseDocument(body);

            // Step 1: <link> elements advertising a feed MIME type.
            Log.FindFeedCheckingLinkElements(baseUri);
            var fromLinks = new List<Uri>();

            foreach (IElement link in document.GetElementsByTagName("link"))
            {
                string mimeType = link.GetAttribute("type");
                if (mimeType == null || !FeedMimeTypes.Contains(mimeType))
                {
                    continue;
                }

                Uri target = SyndicationUtil.TryParseAbsoluteUrl(link.GetAttribute("href"), baseUri);
                if (target != null)
                {
                    fromLinks.Add(target);
                }
            }

            await FilterUrlsByFeed(fromLinks);

            if (fromLinks.Count > 0)
            {
                Log.FindFeedFoundLinkElements(baseUri, fromLinks);
                fromLinks.Sort(UrlFeedComparison);
                everything.AddRange(fromLinks);
                if (!findAll)
                {
                    return everything;
                }
            }

            // Step 2: <a> elements, split by whether they stay on the base host.
            Log.FindFeedCheckingAnchorElements(baseUri);
            var sameHost  = new List<Uri>();
            var otherHost = new List<Uri>();

            foreach (IElement anchor in document.GetElementsByTagName("a"))
            {
                Uri target = SyndicationUtil.TryParseAbsoluteUrl(anchor.GetAttribute("href"), baseUri);
                if (target == null)
                {
                    continue;
                }

                if (target.Host == baseUri.Host && IsFeedUrl(target))
                {
                    sameHost.Add(target);
                }
                else if (IsFeedishUrl(target))
                {
                    otherHost.Add(target);
                }
            }

            Log.FindFeedFoundSomeAnchors(baseUri, sameHost, otherHost);

            // Same-host candidates are considered before remote ones.
            await FilterUrlsByFeed(sameHost);

            if (sameHost.Count > 0)
            {
                Log.FindFeedsFoundLocalGuesses(baseUri, sameHost);
                sameHost.Sort(UrlFeedComparison);
                everything.AddRange(sameHost);
                if (!findAll)
                {
                    return sameHost;
                }
            }

            await FilterUrlsByFeed(otherHost);

            if (otherHost.Count > 0)
            {
                Log.FindFeedsFoundRemoteGuesses(baseUri, otherHost);
                otherHost.Sort(UrlFeedComparison);
                everything.AddRange(otherHost);
                if (!findAll)
                {
                    return otherHost;
                }
            }

            // Step 3: guess at well-known feed names relative to the base URL.
            List<Uri> guessed = (from feedName in FeedNames
                                 select new Uri(baseUri, feedName)).ToList();

            await FilterUrlsByFeed(guessed);

            if (guessed.Count > 0)
            {
                Log.FindFeedsFoundRandomGuesses(baseUri, guessed);
                guessed.Sort(UrlFeedComparison);
                everything.AddRange(guessed);
                if (!findAll)
                {
                    return guessed;
                }
            }

            // Either nothing was found, or findAll accumulated every group.
            Log.FindFeedFoundTotal(baseUri, everything);
            return everything;
        }

Example #3

Show file

        /// <summary>
        /// Get the article title
        /// </summary>
        /// <remarks>
        /// Starts from the document title and tries to strip site names or section
        /// prefixes around separators (<c>| - \ / &gt; »</c>) or a colon. If the result
        /// becomes too short, the original title is restored.
        /// </remarks>
        /// <param name="doc">The document</param>
        /// <returns>
        /// The clean title
        /// </returns>
        internal static string GetArticleTitle(IHtmlDocument doc)
        {
            var curTitle  = "";
            var origTitle = "";

            try
            {
                curTitle = origTitle = doc.Title.Trim();
            }
            catch (Exception)
            {
                // Best effort: if the title cannot be read we continue with an empty title.
            }

            var titleHadHierarchicalSeparators = false;

            // Number of whitespace-separated pieces; note that an empty string counts as 1.
            int wordCount(String str)
            {
                return Regex.Split(str, @"\s+").Length;
            }

            // If there's a separator in the title, first remove the final part
            if (curTitle.IndexOfAny(new char[] { '|', '-', '»', '/', '>' }) != -1)
            {
                titleHadHierarchicalSeparators = curTitle.IndexOfAny(new char[] { '\\', '»', '/', '>' }) != -1;
                curTitle = Regex.Replace(origTitle, @"(.*) [\|\-\\\/>»] .*", "$1", RegexOptions.IgnoreCase);

                // If the resulting title is too short (fewer than 3 words), remove
                // the first part instead:
                if (wordCount(curTitle) < 3)
                {
                    curTitle = Regex.Replace(origTitle, @"[^\|\-\\\/>»]* [\|\-\\\/>»](.*)", "$1", RegexOptions.IgnoreCase);
                }
            }
            else if (curTitle.IndexOf(": ", StringComparison.Ordinal) != -1)
            {
                // Check if we have a heading containing this exact string, so we
                // could assume it's the full title.
                var headings = NodeUtility.ConcatNodeLists(
                    doc.GetElementsByTagName("h1"),
                    doc.GetElementsByTagName("h2")
                    );
                var trimmedTitle = curTitle.Trim();
                var match        = NodeUtility.SomeNode(headings, (heading) =>
                {
                    return heading.TextContent.Trim() == trimmedTitle;
                });

                // If we don't, let's extract the title out of the original title string.
                if (!match)
                {
                    curTitle = origTitle.Substring(origTitle.LastIndexOf(':') + 1);

                    // If the title is now too short, try the first colon instead:
                    if (wordCount(curTitle) < 3)
                    {
                        curTitle = origTitle.Substring(origTitle.IndexOf(':') + 1);
                    }
                }
            }
            else if (curTitle.Length > 150 || curTitle.Length < 15)
            {
                // Implausibly long or short title: prefer the page's only <h1>, if there is exactly one.
                var hOnes = doc.GetElementsByTagName("h1");

                if (hOnes.Length == 1)
                {
                    curTitle = NodeUtility.GetInnerText(hOnes[0]);
                }
            }

            curTitle = curTitle.Trim();

            // If we now have 4 words or fewer as our title, and either no
            // 'hierarchical' separators (\, /, > or ») were found in the original
            // title or the word count is not exactly one less than that of the
            // original title (with separators replaced by spaces), use the
            // original title.
            var curTitleWordCount = wordCount(curTitle);

            if (curTitleWordCount <= 4 && (
                    !titleHadHierarchicalSeparators ||
                    curTitleWordCount != wordCount(Regex.Replace(origTitle, @"[\|\-\\\/>»: ]+", " ", RegexOptions.IgnoreCase)) - 1))
            {
                curTitle = origTitle;
            }

            return curTitle;
        }

Example #4

Show file

        /// <summary>
        /// Collects the local assets referenced by an HTML document: assets found in inline
        /// <c>style</c> elements (via <c>GetCssAssets</c>), images, stylesheet links and scripts.
        /// </summary>
        /// <param name="doc">The parsed document to inspect.</param>
        /// <returns>
        /// One <c>Asset</c> per distinct URI accepted by <c>IsLocalPath</c>, each with a
        /// file suffix and a freshly generated dash-less GUID as <c>NewUri</c>.
        /// </returns>
        public List<Asset> GetHtmlAssets(IHtmlDocument doc)
        {
            var assets = new List<Asset>();

            var images = doc.Images
                         .Where(x => x.HasAttribute("src"));

            // GetAttribute returns null for a missing attribute, so a <link> without
            // "rel" is simply skipped instead of causing a null dereference.
            var styles = doc.GetElementsByTagName("link")
                         .Where(l => l.HasAttribute("href") &&
                                     string.Equals(l.GetAttribute("rel")?.Trim(), "stylesheet", StringComparison.OrdinalIgnoreCase));

            var scripts = doc.GetElementsByTagName("script")
                          .Where(x => x.HasAttribute("src"));

            var inlineStyles = doc.GetElementsByTagName("style");

            // Text after the last '.', cut at the first '?' and then at the first '#'.
            // When the path contains no '.', this is the whole path up to '?'/'#'.
            string SuffixOf(string path)
            {
                return path.Split('.').Last().Split('?').First().Split('#').First();
            }

            // Registers a local, not yet known URI as a new asset.
            void AddAsset(string uri, string suffix)
            {
                if (!IsLocalPath(uri) || assets.Any(a => a.Uri == uri))
                {
                    return;
                }

                assets.Add(new Asset
                {
                    Uri    = uri,
                    Suffix = suffix,
                    // "N" formats the GUID as 32 digits without dashes.
                    NewUri = Guid.NewGuid().ToString("N")
                });
            }

            foreach (var inlineStyle in inlineStyles)
            {
                assets.AddRange(GetCssAssets(inlineStyle.InnerHtml));
            }

            foreach (var image in images)
            {
                var src = image.GetAttribute("src");
                AddAsset(src, SuffixOf(src));
            }

            foreach (var css in styles)
            {
                // Stylesheets always get the "css" suffix regardless of their URL.
                AddAsset(css.GetAttribute("href"), "css");
            }

            foreach (var script in scripts)
            {
                var src    = script.GetAttribute("src");
                var suffix = SuffixOf(src);

                // No '.' in the path (the "suffix" is the whole path before '?'): assume JavaScript.
                if (suffix == src.Split('?').First())
                {
                    suffix = "js";
                }

                AddAsset(src, suffix);
            }

            return assets;
        }

Example #5

Show file

File: MirrorResources.cs Project: erisonliang/Statiq.Framework

        /// <summary>
        /// Rewrites the <c>href</c> of &lt;link&gt; elements and the source of &lt;script&gt; elements in every
        /// input document with whatever <c>DownloadAndReplaceAsync</c> returns for them.
        /// </summary>
        /// <param name="context">The execution context that supplies the inputs and content streams.</param>
        /// <returns>
        /// One document per input: the input itself when nothing was replaced (or it could not be parsed
        /// as HTML), otherwise a clone carrying the re-serialized HTML.
        /// </returns>
        protected override async Task <IEnumerable <Common.IDocument> > ExecuteContextAsync(IExecutionContext context)
        {
#pragma warning disable RCS1163 // Unused parameter.
            // Accept every server certificate so hosts with invalid HTTPS certificates can still be fetched
            // (see http://stackoverflow.com/a/5670954/807064). This assigns a static callback.
            ServicePointManager.ServerCertificateValidationCallback = (s, cert, chain, ssl) => true;
#pragma warning restore RCS1163 // Unused parameter.

            // One parser and one resource cache are shared by all inputs
            HtmlParser htmlParser = new HtmlParser();
            Dictionary <string, string> downloaded = new Dictionary <string, string>();

            // Inputs are processed one after another so the same resource is not downloaded more than once
            return await context.Inputs
                   .ToAsyncEnumerable()
                   .SelectAwait(async x => await MirrorAsync(x))
                   .ToListAsync();

            async Task <Common.IDocument> MirrorAsync(Common.IDocument input)
            {
                IHtmlDocument html = await input.ParseHtmlAsync(context, htmlParser);
                if (html == null)
                {
                    return input;
                }

                bool changed = false;

                // <link href="..."> elements that have not opted out via data-no-mirror
                var links = html
                            .GetElementsByTagName("link")
                            .Where(x => x.HasAttribute("href") && !x.HasAttribute("data-no-mirror"));
                foreach (IElement link in links)
                {
                    string mirrored = await DownloadAndReplaceAsync(link.GetAttribute("href"), downloaded, context);
                    if (mirrored == null)
                    {
                        continue;
                    }

                    link.Attributes["href"].Value = mirrored;
                    changed = true;
                }

                // <script> elements with a non-empty source that have not opted out via data-no-mirror
                var scripts = html.Scripts
                              .Where(x => !string.IsNullOrEmpty(x.Source) && !x.HasAttribute("data-no-mirror"));
                foreach (IHtmlScriptElement script in scripts)
                {
                    string mirrored = await DownloadAndReplaceAsync(script.Source, downloaded, context);
                    if (mirrored == null)
                    {
                        continue;
                    }

                    script.Source = mirrored;
                    changed = true;
                }

                if (!changed)
                {
                    return input;
                }

                // Serialize the modified DOM into a fresh content stream and hand back a clone that uses it
                using (Stream contentStream = await context.GetContentStreamAsync())
                using (StreamWriter writer = contentStream.GetWriter())
                {
                    html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                    writer.Flush();
                    return input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html));
                }
            }
        }

Example #6

Show file

 /// <summary>
 /// Extracts a name from the text of the first &lt;h1&gt; element in the document.
 /// </summary>
 /// <param name="doc">The parsed HTML document to read the heading from.</param>
 /// <returns>
 /// The heading text with every tag-shaped <c>&lt;...&gt;</c> run and every "\n" removed, then trimmed;
 /// an empty string when the document contains no &lt;h1&gt; element.
 /// </returns>
 public static string ParseName(IHtmlDocument doc)
 {
     // FirstOrDefault() yields null when there is no <h1>; fall back to empty text instead of
     // dereferencing null.
     string headingText = doc.GetElementsByTagName("h1").FirstOrDefault()?.TextContent ?? string.Empty;

     return Regex.Replace(headingText, @"<[^>]*>", "").Replace("\n", "").Trim();
 }

Example #7

Show file

        /// <summary>
        /// Handles &lt;noscript&gt; elements whose content is a single &lt;img&gt; and whose previous
        /// sibling element is also a single image: the previous element is replaced with the
        /// &lt;noscript&gt; content, carrying over image-related attributes of the old image.
        /// This improves the quality of the images we use on some sites (e.g. Medium).
        /// </summary>
        /// <param name="doc">The document to operate on</param>
        internal static void UnwrapNoscriptImages(IHtmlDocument doc)
        {
            // First pass: drop every img that has neither a source attribute nor any attribute value
            // that looks like an image file, so such a placeholder is not what gets replaced below.
            ForEachElement(doc.GetElementsByTagName("img"), static image => {
                for (var index = 0; index < image.Attributes.Length; index++)
                {
                    var attribute = image.Attributes[index]!;

                    // Any of these makes the img worth keeping
                    if ((attribute.Name is "src" or "srcset" or "data-src" or "data-srcset") ||
                        Regex.IsMatch(attribute.Value, @"\.(jpg|jpeg|png|webp)"))
                    {
                        return;
                    }
                }

                image.Parent!.RemoveChild(image);
            });

            // Second pass: look at each noscript and try to extract its image
            ForEachElement(doc.GetElementsByTagName("noscript"), static noscript => {
                // Re-parse the noscript content inside a scratch <div> and require it to be a single image
                var owner     = (IHtmlDocument)noscript.GetRoot();
                var container = owner.CreateElement("div");
                container.InnerHtml = noscript.InnerHtml;

                if (!IsSingleImage(container))
                {
                    return;
                }

                // Only continue when the previous sibling element exists and is itself a single image
                if (noscript.PreviousElementSibling is not IElement previous || !IsSingleImage(previous))
                {
                    return;
                }

                // The previous element is either the img itself or a wrapper around it
                var oldImage = previous.TagName is "IMG"
                    ? previous
                    : previous.GetElementsByTagName("img")[0];
                var replacement = container.GetElementsByTagName("img")[0];

                // Copy over non-empty attributes of the old image that might reference an image
                for (var index = 0; index < oldImage.Attributes.Length; index++)
                {
                    var attribute = oldImage.Attributes[index]!;
                    if (attribute.Value is "")
                    {
                        continue;
                    }

                    var mayReferenceImage = (attribute.Name is "src" or "srcset") ||
                                            Regex.IsMatch(attribute.Value, @"\.(jpg|jpeg|png|webp)");
                    if (!mayReferenceImage)
                    {
                        continue;
                    }

                    // Nothing to do when the new image already carries the identical value
                    if (string.Equals(replacement.GetAttribute(attribute.Name), attribute.Value, StringComparison.Ordinal))
                    {
                        continue;
                    }

                    // Keep a conflicting old value under a "data-old-" prefixed name
                    var targetName = replacement.HasAttribute(attribute.Name)
                        ? "data-old-" + attribute.Name
                        : attribute.Name;

                    replacement.SetAttribute(targetName, attribute.Value);
                }

                noscript.Parent!.ReplaceChild(container.FirstElementChild!, previous);
            });
        }

Example #8

Show file

        /// <summary>
        /// Reads the route types listed in the second cell of the table row whose first cell
        /// contains "Type:".
        /// </summary>
        /// <param name="doc">The parsed HTML document to search.</param>
        /// <returns>
        /// The route types whose keywords occur (case-insensitively) in the type text, in the fixed order
        /// Boulder, Trad, TopRope, Aid, Sport, Mixed, Ice, Alpine, Snow. The list is empty when the document
        /// has no "Type:" row or that row has no second cell.
        /// </returns>
        public static List <Route.RouteType> ParseRouteTypes(IHtmlDocument doc)
        {
            List <Route.RouteType> result = new List <Route.RouteType>();

            // Rows without any <td> (for example header rows made of <th>) are skipped rather than
            // dereferenced, so they can no longer abort the search with a NullReferenceException.
            var typeRow = doc.GetElementsByTagName("tr")
                          .FirstOrDefault(row => row.GetElementsByTagName("td").FirstOrDefault()?.TextContent.Contains("Type:") == true);

            string rawTypes = typeRow?.GetElementsByTagName("td").ElementAtOrDefault(1)?.TextContent;
            if (rawTypes == null)
            {
                return result;
            }

            string typeString = HttpUtility.HtmlDecode(rawTypes).Trim();

            // Each matched keyword is stripped from typeString before the next check, so the order matters.
            if (TryRemoveRouteKeyword(ref typeString, "BOULDER"))
            {
                result.Add(Route.RouteType.Boulder);
            }

            // This has to go before the attempt to match "TR" so that "TRAD" is not mistaken for "TR"
            if (TryRemoveRouteKeyword(ref typeString, "TRAD"))
            {
                result.Add(Route.RouteType.Trad);
            }

            if (TryRemoveRouteKeyword(ref typeString, "TR|TOP ROPE"))
            {
                result.Add(Route.RouteType.TopRope);
            }

            if (TryRemoveRouteKeyword(ref typeString, "AID"))
            {
                result.Add(Route.RouteType.Aid);
            }

            if (TryRemoveRouteKeyword(ref typeString, "SPORT"))
            {
                result.Add(Route.RouteType.Sport);
            }

            if (TryRemoveRouteKeyword(ref typeString, "MIXED"))
            {
                result.Add(Route.RouteType.Mixed);
            }

            if (TryRemoveRouteKeyword(ref typeString, "ICE"))
            {
                result.Add(Route.RouteType.Ice);
            }

            if (TryRemoveRouteKeyword(ref typeString, "ALPINE"))
            {
                result.Add(Route.RouteType.Alpine);
            }

            if (TryRemoveRouteKeyword(ref typeString, "SNOW"))
            {
                result.Add(Route.RouteType.Snow);
            }

            return result;
        }

        /// <summary>
        /// Removes every case-insensitive match of <paramref name="pattern"/> from <paramref name="text"/>.
        /// </summary>
        /// <param name="text">The text to search; updated in place when a match is found.</param>
        /// <param name="pattern">The regular expression describing the keyword.</param>
        /// <returns><c>true</c> when at least one match was found and removed; otherwise <c>false</c>.</returns>
        private static bool TryRemoveRouteKeyword(ref string text, string pattern)
        {
            if (!Regex.IsMatch(text, pattern, RegexOptions.IgnoreCase))
            {
                return false;
            }

            text = Regex.Replace(text, pattern, "", RegexOptions.IgnoreCase);
            return true;
        }

Example #9

Show file

File: ResolveXrefs.cs Project: weedkiller/Statiq.Web

        /// <summary>
        /// Replaces "xref:" links in the anchors of an HTML document with the links resolved by the
        /// execution context.
        /// </summary>
        /// <param name="input">The document whose content is parsed as HTML.</param>
        /// <param name="context">The execution context used to parse, resolve xrefs and create content.</param>
        /// <param name="failures">
        /// Receives the error of every xref that could not be resolved, keyed by the full source path
        /// of <paramref name="input"/>.
        /// </param>
        /// <returns>
        /// <c>null</c> when at least one xref failed to resolve; a clone of <paramref name="input"/> with the
        /// re-serialized HTML when at least one "xref:" link was processed without failures.
        /// </returns>
        private static async Task <Common.IDocument> ResolveDocumentXrefsAsync(
            Common.IDocument input,
            IExecutionContext context,
            ConcurrentDictionary <string, ConcurrentBag <string> > failures)
        {
            IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);

            if (htmlDocument != null)
            {
                // Find and replace "xref:" in links
                bool modifiedDocument = false;
                bool errors           = false;
                foreach (IElement element in htmlDocument
                         .GetElementsByTagName("a")
                         .Where(x => x.HasAttribute("href")))
                {
                    string href = element.GetAttribute("href");
                    // "xref:" is 5 characters long, so this also requires a non-empty target after the prefix
                    if (href.StartsWith("xref:") && href.Length > 5)
                    {
                        string xref                  = href.Substring(5);
                        string queryAndFragment      = string.Empty;
                        int    queryAndFragmentIndex = xref.IndexOfAny(new[] { '#', '?' });
                        // Split off everything from the first '#' or '?' so it can be re-appended to the
                        // resolved link; a '#' or '?' at index 0 is left as part of the xref itself
                        if (queryAndFragmentIndex > 0)
                        {
                            queryAndFragment = xref.Substring(queryAndFragmentIndex);
                            xref             = xref.Substring(0, queryAndFragmentIndex);
                        }
                        if (context.TryGetXrefLink(xref, out string xrefLink, out string error))
                        {
                            element.Attributes["href"].Value = xrefLink + queryAndFragment;
                        }
                        else
                        {
                            // Continue processing so we can report all the failures in a given document
                            failures.AddOrUpdate(
                                input.Source.FullPath,
                                _ => new ConcurrentBag <string> {
                                error
                            },
                                (_, list) =>
                            {
                                list.Add(error);
                                return(list);
                            });
                            errors = true;
                        }
                        // Set for every "xref:" link encountered, whether or not it resolved
                        modifiedDocument = true;
                    }
                }

                // Exit if there were errors
                if (errors)
                {
                    return(null);
                }

                // Return a new document with the replacements if we performed any
                if (modifiedDocument)
                {
                    using (Stream contentStream = await context.GetContentStreamAsync())
                    {
                        using (StreamWriter writer = contentStream.GetWriter())
                        {
                            htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                            writer.Flush();
                            return(input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html)));
                        }
                    }
                }
            }
            // NOTE(review): this listing ends here; the fall-through return and the closing brace of the
            // method are not shown.

Example #10

Show file

        /// <summary>
        /// Handles &lt;noscript&gt; elements whose content is a single &lt;img&gt; and whose previous
        /// sibling element is also a single image: the previous element is replaced with the
        /// &lt;noscript&gt; content, carrying over image-related attributes of the old image.
        /// This improves the quality of the images we use on some sites (e.g. Medium).
        /// </summary>
        /// <param name="doc">The document to operate on</param>
        internal static void UnwrapNoscriptImages(IHtmlDocument doc)
        {
            // First pass: drop every img that has neither a source attribute nor any attribute value
            // that looks like an image file, so such a placeholder is not what gets replaced below.
            var images = doc.GetElementsByTagName("img");

            ForEachNode(images, (node) => {
                var image = node as IElement;
                if (image == null)
                {
                    return;
                }

                for (var index = 0; index < image.Attributes.Length; index++)
                {
                    var attribute = image.Attributes[index];
                    var name      = attribute.Name;

                    // Any source-like attribute means the img is worth keeping
                    if (name == "src" || name == "srcset" || name == "data-src" || name == "data-srcset")
                    {
                        return;
                    }

                    // So does any attribute value that looks like an image file name
                    if (Regex.IsMatch(attribute.Value, @"\.(jpg|jpeg|png|webp)"))
                    {
                        return;
                    }
                }

                node.Parent.RemoveChild(node);
            });

            // Second pass: look at each noscript and try to extract its image
            var noscriptNodes = doc.GetElementsByTagName("noscript");

            ForEachNode(noscriptNodes, (node) => {
                var noscript = node as IElement;
                if (noscript == null)
                {
                    return;
                }

                // Re-parse the noscript content inside a scratch <div> and require it to be a single image
                var container = doc.CreateElement("div");
                container.InnerHtml = noscript.InnerHtml;
                if (!IsSingleImage(container))
                {
                    return;
                }

                // Only continue when the previous sibling element exists and is itself a single image
                var previous = noscript.PreviousElementSibling;
                if (previous == null || !IsSingleImage(previous))
                {
                    return;
                }

                // The previous element is either the img itself or a wrapper around it
                var oldImage = previous.TagName != "IMG"
                    ? previous.GetElementsByTagName("img")[0]
                    : previous;
                var replacement = container.GetElementsByTagName("img")[0];

                // Copy over non-empty attributes of the old image that might reference an image
                for (var index = 0; index < oldImage.Attributes.Length; index++)
                {
                    var attribute = oldImage.Attributes[index];
                    if (attribute.Value == "")
                    {
                        continue;
                    }

                    var mayReferenceImage = attribute.Name == "src" || attribute.Name == "srcset" ||
                                            Regex.IsMatch(attribute.Value, @"\.(jpg|jpeg|png|webp)");
                    if (!mayReferenceImage)
                    {
                        continue;
                    }

                    // Only act when the new image does not already carry the identical value
                    if (replacement.GetAttribute(attribute.Name) != attribute.Value)
                    {
                        // Keep a conflicting old value under a "data-old-" prefixed name
                        var targetName = replacement.HasAttribute(attribute.Name)
                            ? "data-old-" + attribute.Name
                            : attribute.Name;

                        replacement.SetAttribute(targetName, attribute.Value);
                    }
                }

                node.Parent.ReplaceChild(container.FirstElementChild, previous);
            });
        }

Example #11

Show file

File: BBCodeFormatter.cs Project: LaughingLeader/SourceControlGenerator

        /// <summary>
        /// Rewrites HTML elements of the document into BBCode-style markup by assigning each matching
        /// element's <c>OuterHtml</c>. Every group of elements is skipped when its corresponding
        /// ignore flag or ignore-list entry is set.
        /// </summary>
        /// <param name="doc">The document to convert; it is modified in place.</param>
        /// <returns>The same <paramref name="doc"/> instance that was passed in.</returns>
        public IHtmlDocument BBCodeConversion(IHtmlDocument doc)
        {
            // NOTE(review): every loop below assigns OuterHtml while enumerating doc.All or
            // GetElementsByTagName; confirm these collections tolerate being modified during enumeration.

            // Anchors -> [url=href]content[/url]
            if (!BBIgnoringLinks)
            {
                foreach (var element in doc.All.OfType <IHtmlAnchorElement>())
                {
                    element.OuterHtml = $"[url={element.Href}]{element.InnerHtml}[/url]";
                }
            }


            // Images -> [img]source[/img]
            if (!BBIgnoringImages)
            {
                foreach (var element in doc.All.OfType <IHtmlImageElement>())
                {
                    element.OuterHtml = $"[img]{element.Source}[/img]";
                }
            }

            // Ordered lists -> [olist], unordered lists -> [list], list items -> [*]
            if (!BBIgnoringLists)
            {
                foreach (var element in doc.All.OfType <IHtmlOrderedListElement>())
                {
                    element.OuterHtml = $"[olist]{element.InnerHtml}[/olist]";
                }

                foreach (var element in doc.All.OfType <IHtmlUnorderedListElement>())
                {
                    element.OuterHtml = $"[list]{element.InnerHtml}[/list]";
                }

                foreach (var element in doc.All.OfType <IHtmlListItemElement>())
                {
                    element.OuterHtml = $"[*]{element.InnerHtml}";
                }
            }

            // Headings: <h1>..<h6> and their closing tags become [h1]..[h6] / [/h1]..[/h6]
            if (!BBIgnoringHeaders)
            {
                foreach (var element in doc.All.OfType <IHtmlHeadingElement>())
                {
                    //Ugly placeholder for now.
                    // Only the exact attribute-less forms such as "<h1>" are replaced (case-insensitively);
                    // an opening tag that carries attributes does not match these literals.
                    var comparer = StringComparison.OrdinalIgnoreCase;
                    var text     = element.OuterHtml;
                    text = text.Replace("<h1>", "[h1]", comparer).Replace("<h2>", "[h2]", comparer).Replace("<h3>", "[h3]", comparer).Replace("<h4>", "[h4]", comparer).Replace("<h5>", "[h5]", comparer).Replace("<h6>", "[h6]", comparer);
                    text = text.Replace("</h1>", "[/h1]", comparer).Replace("</h2>", "[/h2]", comparer).Replace("</h3>", "[/h3]", comparer).Replace("</h4>", "[/h4]", comparer).Replace("</h5>", "[/h5]", comparer).Replace("</h6>", "[/h6]", comparer);
                    element.OuterHtml = text;
                }
            }

            // <strong> -> [b]
            if (BBIgnoredTags.IndexOf(TagNames.Strong) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.Strong))
                {
                    element.OuterHtml = $"[b]{element.InnerHtml}[/b]";
                }
            }

            // <em> -> [i]
            if (BBIgnoredTags.IndexOf(TagNames.Em) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.Em))
                {
                    element.OuterHtml = $"[i]{element.InnerHtml}[/i]";
                }
            }

            // <u> -> [u]
            if (BBIgnoredTags.IndexOf(TagNames.U) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.U))
                {
                    element.OuterHtml = $"[u]{element.InnerHtml}[/u]";
                }
            }

            // Paragraphs are unwrapped to their content followed by a line break; skipped when either
            // the tag name or the element type is listed as ignored
            if (BBIgnoredTags.IndexOf(TagNames.P) < 0 && BBIgnoredElements.IndexOf(typeof(IHtmlParagraphElement)) < 0)
            {
                foreach (var element in doc.All.OfType <IHtmlParagraphElement>())
                {
                    element.OuterHtml = element.InnerHtml + Environment.NewLine;
                }
            }

            // <code> -> [code]
            if (BBIgnoredTags.IndexOf(TagNames.Code) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.Code))
                {
                    element.OuterHtml = $"[code]{element.InnerHtml}[/code]";
                }
            }

            // <pre> -> [noparse]
            if (BBIgnoredTags.IndexOf(TagNames.Pre) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.Pre))
                {
                    element.OuterHtml = $"[noparse]{element.InnerHtml}[/noparse]";
                }
            }

            // <strike> -> [strike]
            if (BBIgnoredTags.IndexOf(TagNames.Strike) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.Strike))
                {
                    element.OuterHtml = $"[strike]{element.InnerHtml}[/strike]";
                }
            }

            /*
             * Disabled debugging aid that logged the type and markup of every remaining element:
             * foreach (var element in doc.All)
             * {
             *      Log.Here().Activity($"Element | Type {element.GetType()} Content: {element.OuterHtml}");
             * }
             */

            return(doc);
        }

Example #12

Show file

File: FormulaParser.cs Project: githabb/parseformula1

        /// <summary>
        /// Builds a <c>RaceModel</c> from a race results page: the race name from the element with class
        /// "ResultsArchiveTitle", one result per &lt;tr&gt; of the first &lt;tbody&gt;, and the markup of
        /// the element with class "resultsarchive-col-right".
        /// </summary>
        /// <param name="document">The parsed results page.</param>
        /// <returns>The race name, the parsed result rows and the outer HTML of the results container.</returns>
        /// <exception cref="InvalidOperationException">
        /// An expected element (title, &lt;tbody&gt; or results container) is missing.
        /// </exception>
        /// <exception cref="FormatException">A numeric or time cell does not have the expected format.</exception>
        public RaceModel Parse(IHtmlDocument document)
        {
            var trace = document.GetElementsByClassName("ResultsArchiveTitle").First();

            // The name is the first line of the title; when the title is a single line, IndexOf returns -1,
            // so use the whole text instead of letting Substring throw.
            var    raceName  = trace.InnerHtml.Trim();
            int    lineBreak = raceName.IndexOf('\n');
            string name      = lineBreak >= 0 ? raceName.Substring(0, lineBreak) : raceName;

            var tbody = document.GetElementsByTagName("tbody").First();

            var trs = tbody.GetElementsByTagName("tr");

            var list = new List <RacingResult>();

            foreach (var item in trs)
            {
                var cells        = item.Children;
                var racingResult = new RacingResult();

                racingResult.Pos = cells[1].InnerHtml;

                // Page data is machine-formatted, so parse numbers culture-independently like the
                // time and seconds values below.
                racingResult.No = int.Parse(cells[2].InnerHtml, CultureInfo.InvariantCulture);

                // The driver cell holds three child elements: first name, last name, short name
                var driver = cells[3].Children;
                racingResult.DriverFirstName = driver[0].InnerHtml;
                racingResult.DriverLastName  = driver[1].InnerHtml;
                racingResult.DriverShortName = driver[2].InnerHtml;

                racingResult.Car = cells[4].InnerHtml;

                racingResult.Laps = int.Parse(cells[5].InnerHtml, CultureInfo.InvariantCulture);

                string timeStr = cells[6].InnerHtml;
                if (cells[6].ChildElementCount == 0)
                {
                    // Plain text cell: either a DNF/DNS marker (no time) or a full time value
                    if (timeStr == "DNF" || timeStr == "DNS")
                    {
                        racingResult.Time = null;
                    }
                    else
                    {
                        racingResult.Time = TimeSpan.Parse(timeStr, CultureInfo.InvariantCulture);
                    }
                }
                else
                {
                    // Cell with nested markup: the text before the first '<' is a number of seconds,
                    // stored as whole seconds plus milliseconds.
                    string  secondStr = timeStr.Substring(0, timeStr.IndexOf('<'));
                    decimal seconds   = decimal.Parse(secondStr, CultureInfo.InvariantCulture);

                    racingResult.Retired = new TimeSpan(0, 0, 0, (int)seconds, (int)((seconds - (int)seconds) * 1000));
                }

                racingResult.Pts = int.Parse(cells[7].InnerHtml, CultureInfo.InvariantCulture);

                list.Add(racingResult);
            }

            var table = document.GetElementsByClassName("resultsarchive-col-right").First().OuterHtml;

            return new RaceModel()
            {
                RaceName = name, RaceResults = list.ToArray(), ResultTable = table
            };
        }