Example #1
0
        /// <summary>
        /// Attempts to get metadata for the article.
        /// </summary>
        /// <remarks>
        /// Values are collected from the <c>content</c> attribute of the document's
        /// <c>meta</c> elements, keyed by their normalized <c>name</c>, <c>property</c>
        /// or <c>itemprop</c> attribute. Excerpt, title, featured image, author and
        /// publication date are then picked from the first non-empty candidate key.
        /// The publication date additionally falls back to <c>time</c> elements and,
        /// as a last resort, to a <c>/yyyy/MM/dd</c> pattern in the URL; a URL pattern
        /// that does not form a valid calendar date is ignored.
        /// </remarks>
        /// <param name="doc">The document</param>
        /// <param name="uri">The uri, possibly used to check for a date</param>
        /// <param name="language">The language that was possibly found in the headers of the response</param>
        /// <returns>The metadata object with all the info found</returns>
        internal static Metadata GetArticleMetadata(IHtmlDocument doc, Uri uri, string language)
        {
            Metadata metadata = new Metadata();
            Dictionary<string, string> values = new Dictionary<string, string>();
            var metaElements = doc.GetElementsByTagName("meta");

            // Match "description", or Twitter's "twitter:description" (Cards)
            // in name attribute.
            // name is a single value
            var namePattern = @"^\s*((?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)|name)\s*$";

            // Match Facebook's Open Graph title & description properties.
            // property is a space-separated list of values
            var propertyPattern = @"\s*(dc|dcterm|og|twitter|article)\s*:\s*(author|creator|description|title|published_time|image|site_name)(\s+|$)";

            var itemPropPattern = @"\s*datePublished\s*";

            // Collect the interesting meta tags into "values".
            NodeUtility.ForEachNode(metaElements, (element) =>
            {
                var metaElement = element as IElement;
                if (metaElement == null)
                {
                    return;
                }

                var elementName     = metaElement.GetAttribute("name") ?? "";
                var elementProperty = metaElement.GetAttribute("property") ?? "";
                var itemProp        = metaElement.GetAttribute("itemprop") ?? "";
                var content         = metaElement.GetAttribute("content");

                // A meta tag without content carries nothing we can use.
                if (String.IsNullOrEmpty(content))
                {
                    return;
                }

                // A plain "author" tag sets both byline and author directly.
                if (elementName == "author" || elementProperty == "author" || itemProp == "author")
                {
                    metadata.Byline = content;
                    metadata.Author = content;
                    return;
                }

                MatchCollection matches = null;
                String name             = "";

                if (!String.IsNullOrEmpty(elementProperty))
                {
                    // The property attribute may list several names; every match
                    // gets the same content. Walking backwards leaves "name" set
                    // to the first match.
                    matches = Regex.Matches(elementProperty, propertyPattern);
                    for (int i = matches.Count - 1; i >= 0; i--)
                    {
                        // Convert to lowercase, and remove any whitespace
                        // so we can match below.
                        name = Regex.Replace(matches[i].Value.ToLower(), @"\s+", "");
                        values[name] = content.Trim();
                    }
                }

                if ((matches == null || matches.Count == 0) &&
                    !String.IsNullOrEmpty(elementName) && Regex.IsMatch(elementName, namePattern, RegexOptions.IgnoreCase))
                {
                    // Convert to lowercase, remove any whitespace, and convert dots
                    // to colons so we can match below.
                    name         = Regex.Replace(Regex.Replace(elementName.ToLower(), @"\s+", ""), @"\.", ":");
                    values[name] = content.Trim();
                }
                else if (Regex.IsMatch(elementProperty, propertyPattern, RegexOptions.IgnoreCase))
                {
                    name = elementProperty;
                }
                else if (Regex.IsMatch(itemProp, itemPropPattern, RegexOptions.IgnoreCase))
                {
                    name = itemProp;
                }

                if (!String.IsNullOrEmpty(name))
                {
                    // Convert to lowercase and remove any whitespace
                    // so we can match below. Existing entries are never overwritten here.
                    name = Regex.Replace(name.ToLower(), @"\s", "", RegexOptions.IgnoreCase);
                    if (!values.ContainsKey(name))
                    {
                        values.Add(name, content.Trim());
                    }
                }
            });

            // Returns the first non-empty collected value among the given keys
            // (in the order given), or an empty string when none is present.
            string FirstValue(params string[] keys)
            {
                foreach (var key in keys)
                {
                    if (values.TryGetValue(key, out var value) && !String.IsNullOrEmpty(value))
                    {
                        return value;
                    }
                }

                return "";
            }

            // Find the description of the article
            metadata.Excerpt = FirstValue(
                "description",
                "dc:description",
                "dcterm:description",
                "og:description",
                "weibo:article:description",
                "weibo:webpage:description",
                "twitter:description");

            // Get the name of the site
            if (values.TryGetValue("og:site_name", out var siteName))
            {
                metadata.SiteName = siteName;
            }

            // Find the title of the article
            metadata.Title = FirstValue(
                "dc:title",
                "dcterm:title",
                "og:title",
                "weibo:article:title",
                "weibo:webpage:title",
                "twitter:title",
                "title");

            // Let's try to eliminate the site name from the title
            metadata.Title = Readability.CleanTitle(metadata.Title, metadata.SiteName);

            // We did not find any title,
            // we try to get it from the title tag
            if (String.IsNullOrEmpty(metadata.Title))
            {
                metadata.Title = Readability.GetArticleTitle(doc);
            }

            // Language extraction: candidates are evaluated lazily, in order of preference.
            IEnumerable<string> LanguageHeuristics()
            {
                yield return language;

                // Guarded lookup so a document without an <html> element does not throw.
                var htmlElement = doc.GetElementsByTagName("html").FirstOrDefault();

                yield return htmlElement?.GetAttribute("lang");

                yield return htmlElement?.GetAttribute("xml:lang");

                yield return doc.QuerySelector("meta[http-equiv=\"Content-Language\"]")?.GetAttribute("content");

                // this is wrong, but it's used
                yield return doc.QuerySelector("meta[name=\"lang\"]")?.GetAttribute("value");
            }

            metadata.Language = LanguageHeuristics().FirstOrDefault(l => !String.IsNullOrEmpty(l)) ?? "";

            // Find the featured image of the article
            metadata.FeaturedImage = FirstValue(
                "og:image",
                "twitter:image",
                "weibo:article:image",
                "weibo:webpage:image");

            if (String.IsNullOrEmpty(metadata.Author))
            {
                // We try to find a meta tag for the author.
                // Note that there is an Open Graph tag for an author,
                // but it usually contains a profile URL of the author.
                // So we do not use it
                metadata.Author = FirstValue("dc:creator", "dcterm:creator", "author");
            }

            // Date extraction: parses the collected value stored under the given key,
            // or returns null when the key is missing or the value is not a date.
            DateTime? ParsedDate(string key)
            {
                if (values.TryGetValue(key, out var raw) && DateTime.TryParse(raw, out var parsed))
                {
                    return parsed;
                }

                return null;
            }

            var dateKeys = new[]
            {
                "article:published_time",
                "date",
                "datepublished",
                "weibo:article:create_at",
                "weibo:webpage:create_at"
            };

            // DateTime.MinValue is treated as "no date", like an unparseable value.
            metadata.PublicationDate = dateKeys
                                       .Select(key => ParsedDate(key))
                                       .FirstOrDefault(d => d.HasValue && d.Value != DateTime.MinValue);

            if (metadata.PublicationDate == null)
            {
                // Fall back to <time> elements; when several qualify the last one wins.
                foreach (var time in doc.GetElementsByTagName("time"))
                {
                    if (!String.IsNullOrEmpty(time.GetAttribute("pubDate")) &&
                        DateTime.TryParse(time.GetAttribute("datetime"), out var parsed))
                    {
                        metadata.PublicationDate = parsed;
                    }
                }
            }

            // PathAndQuery is only available on absolute URIs.
            if (metadata.PublicationDate == null && uri != null && uri.IsAbsoluteUri)
            {
                // as a last resort check the URL for a date
                Match maybeDate = Regex.Match(uri.PathAndQuery, "/(?<year>[0-9]{4})/(?<month>[0-9]{2})/(?<day>[0-9]{2})?");
                if (maybeDate.Success)
                {
                    int year  = int.Parse(maybeDate.Groups["year"].Value);
                    int month = int.Parse(maybeDate.Groups["month"].Value);
                    int day   = !String.IsNullOrEmpty(maybeDate.Groups["day"].Value) ? int.Parse(maybeDate.Groups["day"].Value) : 1;

                    // The pattern only guarantees digits; reject combinations such as
                    // /1234/56/78 that would make the DateTime constructor throw.
                    if (year >= 1 && month >= 1 && month <= 12 &&
                        day >= 1 && day <= DateTime.DaysInMonth(year, month))
                    {
                        metadata.PublicationDate = new DateTime(year, month, day);
                    }
                }
            }

            return metadata;
        }
Example #2
0
        /// <summary>
        /// Looks for feed URLs starting from <paramref name="originUrl"/>.
        /// </summary>
        /// <remarks>
        /// Candidates are examined in stages: the URL itself, <c>link</c> elements whose
        /// <c>type</c> is in <c>FeedMimeTypes</c>, anchors on the same host, other
        /// feed-looking anchors, and finally the names in <c>FeedNames</c> resolved
        /// against the base URL. Each candidate list is passed through
        /// <c>FilterUrlsByFeed</c> (presumably removing entries that are not feeds;
        /// confirm against that helper) and sorted with <c>UrlFeedComparison</c>.
        /// </remarks>
        /// <param name="originUrl">The URL to start the search from.</param>
        /// <param name="findAll">
        /// When <c>false</c>, the first stage that yields any URL ends the search and
        /// only that stage's URLs are returned; when <c>true</c>, the URLs of all
        /// stages are accumulated and returned together.
        /// </param>
        /// <returns>The feed URLs that were found; possibly empty.</returns>
        public static async Task<IList<Uri>> GetFeedUrls(
            string originUrl,
            bool findAll = false)
        {
            var collected = new List<Uri>();
            Uri baseUri   = FixupUrl(originUrl);

            // Stage 0: the origin itself may already be a feed.
            Log.FindFeedCheckingBase(baseUri);
            string data = await GetFeedData(baseUri);
            if (LooksLikeFeed(data))
            {
                Log.FindFeedBaseWasFeed(baseUri);
                return new[] { baseUri };
            }

            // It is not a feed, so treat the payload as HTML.
            IHtmlDocument document = new HtmlParser().ParseDocument(data);

            // Stage 1: <link> elements advertising a feed MIME type.
            Log.FindFeedCheckingLinkElements(baseUri);
            List<Uri> linkUrls = document.GetElementsByTagName("link")
                                 .Where(link =>
                                 {
                                     string linkType = link.GetAttribute("type");
                                     return linkType != null && FeedMimeTypes.Contains(linkType);
                                 })
                                 .Select(link => SyndicationUtil.TryParseAbsoluteUrl(link.GetAttribute("href"), baseUri))
                                 .Where(url => url != null)
                                 .ToList();

            await FilterUrlsByFeed(linkUrls);

            if (linkUrls.Count > 0)
            {
                Log.FindFeedFoundLinkElements(baseUri, linkUrls);
                linkUrls.Sort(UrlFeedComparison);
                collected.AddRange(linkUrls);
                if (!findAll)
                {
                    return collected;
                }
            }

            // Stage 2: <a> elements, split into same-host and other candidates.
            Log.FindFeedCheckingAnchorElements(baseUri);
            var sameHostCandidates  = new List<Uri>();
            var otherHostCandidates = new List<Uri>();

            foreach (IElement anchor in document.GetElementsByTagName("a"))
            {
                Uri target = SyndicationUtil.TryParseAbsoluteUrl(anchor.GetAttribute("href"), baseUri);
                if (target == null)
                {
                    continue;
                }

                if (target.Host == baseUri.Host && IsFeedUrl(target))
                {
                    sameHostCandidates.Add(target);
                }
                else if (IsFeedishUrl(target))
                {
                    otherHostCandidates.Add(target);
                }
            }

            Log.FindFeedFoundSomeAnchors(baseUri, sameHostCandidates, otherHostCandidates);

            // Candidates on the same host are considered before the others.
            await FilterUrlsByFeed(sameHostCandidates);

            if (sameHostCandidates.Count > 0)
            {
                Log.FindFeedsFoundLocalGuesses(baseUri, sameHostCandidates);
                sameHostCandidates.Sort(UrlFeedComparison);
                collected.AddRange(sameHostCandidates);
                if (!findAll)
                {
                    return sameHostCandidates;
                }
            }

            await FilterUrlsByFeed(otherHostCandidates);

            if (otherHostCandidates.Count > 0)
            {
                Log.FindFeedsFoundRemoteGuesses(baseUri, otherHostCandidates);
                otherHostCandidates.Sort(UrlFeedComparison);
                collected.AddRange(otherHostCandidates);
                if (!findAll)
                {
                    return otherHostCandidates;
                }
            }

            // Stage 3: well-known feed names resolved against the base URL.
            List<Uri> wellKnownCandidates = FeedNames
                                            .Select(feedName => new Uri(baseUri, feedName))
                                            .ToList();

            await FilterUrlsByFeed(wellKnownCandidates);

            if (wellKnownCandidates.Count > 0)
            {
                Log.FindFeedsFoundRandomGuesses(baseUri, wellKnownCandidates);
                wellKnownCandidates.Sort(UrlFeedComparison);
                collected.AddRange(wellKnownCandidates);
                if (!findAll)
                {
                    return wellKnownCandidates;
                }
            }

            // Either nothing was found, or findAll accumulated every stage.
            Log.FindFeedFoundTotal(baseUri, collected);
            return collected;
        }
Example #3
0
        /// <summary>
        /// Get the article title
        /// </summary>
        /// <remarks>
        /// Starts from the document title and tries to strip site names or section
        /// prefixes that are separated by characters such as <c>|</c>, <c>-</c>,
        /// <c>»</c>, <c>/</c>, <c>&gt;</c> or <c>:</c>. When the cleaned result ends up
        /// very short, the original title is returned instead.
        /// </remarks>
        /// <param name="doc">The document</param>
        /// <returns>
        /// The clean title
        /// </returns>
        internal static string GetArticleTitle(IHtmlDocument doc)
        {
            var curTitle  = "";
            var origTitle = "";

            try
            {
                curTitle = origTitle = doc.Title.Trim();
            }
            catch (Exception)
            {
                // Best effort: if the title cannot be read, both stay empty and
                // the heuristics below work on an empty string.
            }

            var titleHadHierarchicalSeparators = false;

            // Number of whitespace-separated chunks; an empty string counts as 1.
            int wordCount(String str)
            {
                return Regex.Split(str, @"\s+").Length;
            }

            // If there's a separator in the title, first remove the final part
            if (curTitle.IndexOfAny(new char[] { '|', '-', '»', '/', '>' }) != -1)
            {
                titleHadHierarchicalSeparators = curTitle.IndexOfAny(new char[] { '\\', '»', '/', '>' }) != -1;
                curTitle = Regex.Replace(origTitle, @"(.*) [\|\-\\\/>»] .*", "$1", RegexOptions.IgnoreCase);

                // If the resulting title is too short (fewer than 3 words), remove
                // the first part instead:
                if (wordCount(curTitle) < 3)
                {
                    curTitle = Regex.Replace(origTitle, @"[^\|\-\\\/>»]* [\|\-\\\/>»](.*)", "$1", RegexOptions.IgnoreCase);
                }
            }
            else if (curTitle.IndexOf(": ") != -1)
            {
                // Check if we have a heading containing this exact string, so we
                // could assume it's the full title.
                var headings = NodeUtility.ConcatNodeLists(
                    doc.GetElementsByTagName("h1"),
                    doc.GetElementsByTagName("h2")
                    );
                var trimmedTitle = curTitle.Trim();
                var match        = NodeUtility.SomeNode(headings, (heading) =>
                {
                    return heading.TextContent.Trim() == trimmedTitle;
                });

                // If we don't, let's extract the title out of the original title string.
                if (!match)
                {
                    curTitle = origTitle.Substring(origTitle.LastIndexOf(':') + 1);

                    // If the title is now too short, try the first colon instead:
                    if (wordCount(curTitle) < 3)
                    {
                        curTitle = origTitle.Substring(origTitle.IndexOf(':') + 1);
                    }
                }
            }
            else if (curTitle.Length > 150 || curTitle.Length < 15)
            {
                // Suspiciously long or short title: prefer the single <h1>, if there is exactly one.
                var hOnes = doc.GetElementsByTagName("h1");

                if (hOnes.Length == 1)
                {
                    curTitle = NodeUtility.GetInnerText(hOnes[0]);
                }
            }

            curTitle = curTitle.Trim();

            // If we now have 4 words or fewer as our title, and either no
            // 'hierarchical' separators (\, /, > or ») were found in the original
            // title or the word count is not exactly one less than that of the
            // original title with its separators collapsed, use the original title.
            var curTitleWordCount = wordCount(curTitle);

            if (curTitleWordCount <= 4 && (
                    !titleHadHierarchicalSeparators ||
                    curTitleWordCount != wordCount(Regex.Replace(origTitle, @"[\|\-\\\/>»: ]+", " ", RegexOptions.IgnoreCase)) - 1))
            {
                curTitle = origTitle;
            }

            return curTitle;
        }
Example #4
0
        /// <summary>
        /// Collects the local assets referenced by an HTML document: assets found in
        /// inline <c>style</c> blocks (via <c>GetCssAssets</c>), images, stylesheets
        /// and scripts.
        /// </summary>
        /// <param name="doc">The parsed HTML document to scan.</param>
        /// <returns>
        /// One <c>Asset</c> per distinct local URI, each with a freshly generated
        /// <c>NewUri</c> (a GUID without dashes) and a file suffix.
        /// </returns>
        public List<Asset> GetHtmlAssets(IHtmlDocument doc)
        {
            var assets = new List<Asset>();
            var images = doc.Images
                         .Where(x => x.HasAttribute("src"));

            // GetAttribute returns null for a missing attribute, so <link> elements
            // without rel are skipped instead of causing a NullReferenceException.
            var styles = doc.GetElementsByTagName("link")
                         .Where(l => string.Equals(l.GetAttribute("rel")?.Trim(), "stylesheet", StringComparison.OrdinalIgnoreCase))
                         .Where(c => c.HasAttribute("href"));
            var scripts = doc.GetElementsByTagName("script")
                          .Where(x => x.HasAttribute("src"));
            var inlineStyles = doc.GetElementsByTagName("style");

            // Inline <style> blocks may reference assets of their own.
            foreach (var inlineStyle in inlineStyles)
            {
                var inlineStyleAssets = GetCssAssets(inlineStyle.InnerHtml);
                assets.AddRange(inlineStyleAssets);
            }

            foreach (var image in images)
            {
                var src = image.Attributes["src"].Value;
                if (IsLocalPath(src) && !assets.Any(a => a.Uri == src))
                {
                    // Text after the last '.', with any query string or fragment cut off.
                    var suffix = src.Split('.').Last().Split('?').First().Split('#').First();
                    var asset  = new Asset
                    {
                        Uri    = src,
                        Suffix = suffix,
                        NewUri = Guid.NewGuid().ToString("N")
                    };
                    assets.Add(asset);
                }
            }

            foreach (var css in styles)
            {
                var src = css.Attributes["href"].Value;
                if (IsLocalPath(src) && !assets.Any(a => a.Uri == src))
                {
                    var asset = new Asset
                    {
                        Uri    = src,
                        Suffix = "css",
                        NewUri = Guid.NewGuid().ToString("N")
                    };
                    assets.Add(asset);
                }
            }

            foreach (var script in scripts)
            {
                var src = script.Attributes["src"].Value;
                if (IsLocalPath(src) && !assets.Any(a => a.Uri == src))
                {
                    var suffix = src.Split('.').Last().Split('?').First().Split('#').First();

                    // No '.' in the path: the "suffix" is the whole path, so assume JavaScript.
                    if (suffix == src.Split('?').First())
                    {
                        suffix = "js";
                    }
                    var asset = new Asset
                    {
                        Uri    = src,
                        Suffix = suffix,
                        NewUri = Guid.NewGuid().ToString("N")
                    };
                    assets.Add(asset);
                }
            }

            return assets;
        }
        /// <summary>
        /// Processes every input document, replacing the <c>href</c> of <c>link</c>
        /// elements and the source of <c>script</c> elements with the value returned
        /// by <c>DownloadAndReplaceAsync</c>, unless the element carries a
        /// <c>data-no-mirror</c> attribute.
        /// </summary>
        /// <param name="context">The execution context providing the inputs and content streams.</param>
        /// <returns>
        /// The input documents, where every document that had at least one replacement
        /// is substituted by a clone holding the rewritten HTML.
        /// </returns>
        protected override async Task<IEnumerable<Common.IDocument>> ExecuteContextAsync(IExecutionContext context)
        {
#pragma warning disable RCS1163 // Unused parameter.
            // Handle invalid HTTPS certificates and allow alternate security protocols (see http://stackoverflow.com/a/5670954/807064)
            // NOTE(review): this assigns a process-wide callback that accepts every
            // server certificate; confirm this is acceptable for all callers.
            ServicePointManager.ServerCertificateValidationCallback = (s, cert, chain, ssl) => true;
#pragma warning restore RCS1163 // Unused parameter.

            // Shared across all inputs so each resource is only fetched once
            var downloadCache = new Dictionary<string, string>();
            var htmlParser    = new HtmlParser();

            // Inputs are handled one after another (not in parallel) so the cache
            // is populated before the next document asks for the same resource
            var outputs = await context.Inputs
                          .ToAsyncEnumerable()
                          .SelectAwait(async document => await MirrorDocumentAsync(document))
                          .ToListAsync();
            return outputs;

            // Rewrites the mirrored references of one document; returns the input
            // itself when it is not HTML or nothing was replaced.
            async Task<Common.IDocument> MirrorDocumentAsync(Common.IDocument input)
            {
                IHtmlDocument html = await input.ParseHtmlAsync(context, htmlParser);
                if (html == null)
                {
                    return input;
                }

                bool changed = false;

                // Link element
                var mirrorableLinks = html
                                      .GetElementsByTagName("link")
                                      .Where(x => x.HasAttribute("href") && !x.HasAttribute("data-no-mirror"));
                foreach (IElement link in mirrorableLinks)
                {
                    string mirroredHref = await DownloadAndReplaceAsync(link.GetAttribute("href"), downloadCache, context);
                    if (mirroredHref == null)
                    {
                        continue;
                    }

                    link.Attributes["href"].Value = mirroredHref;
                    changed = true;
                }

                // Scripts
                var mirrorableScripts = html.Scripts
                                        .Where(x => !string.IsNullOrEmpty(x.Source) && !x.HasAttribute("data-no-mirror"));
                foreach (IHtmlScriptElement script in mirrorableScripts)
                {
                    string mirroredSource = await DownloadAndReplaceAsync(script.Source, downloadCache, context);
                    if (mirroredSource == null)
                    {
                        continue;
                    }

                    script.Source = mirroredSource;
                    changed       = true;
                }

                if (!changed)
                {
                    return input;
                }

                // Serialize the modified DOM into a new content stream and clone the input around it
                using (Stream contentStream = await context.GetContentStreamAsync())
                using (StreamWriter writer = contentStream.GetWriter())
                {
                    html.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                    writer.Flush();
                    return input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html));
                }
            }
        }
Example #6
0
 /// <summary>
 /// Extracts the name from the text of the document's first <c>h1</c> element,
 /// with anything that looks like a tag and all newlines removed.
 /// </summary>
 /// <param name="doc">The parsed HTML document.</param>
 /// <returns>The trimmed name, or an empty string when the document has no <c>h1</c>.</returns>
 public static string ParseName(IHtmlDocument doc)
 {
     var heading = doc.GetElementsByTagName("h1").FirstOrDefault();

     // FirstOrDefault yields null for a page without <h1>; avoid dereferencing it.
     if (heading == null)
     {
         return "";
     }

     return Regex.Replace(heading.TextContent, @"<[^>]*>", "").Replace("\n", "").Trim();
 }
Example #7
0
        /// <summary>
        /// Find all &lt;noscript&gt; that are located after &lt;img&gt; nodes, and which contain
        /// only one single &lt;img&gt; element. Replace the first image with the one from inside the
        /// &lt;noscript&gt; tag. This improves the quality of the
        /// images we use on some sites (e.g. Medium).
        /// </summary>
        /// <remarks>
        /// Image file extensions (<c>.jpg</c>, <c>.jpeg</c>, <c>.png</c>, <c>.webp</c>) are
        /// matched case-insensitively, so attribute values such as <c>photo.JPG</c> are
        /// recognized as image references.
        /// </remarks>
        /// <param name="doc">The document to operate on</param>
        internal static void UnwrapNoscriptImages(IHtmlDocument doc)
        {
            // Find img without source or attributes that might contain an image, and remove it.
            // This is done to prevent a placeholder img being replaced by the img from noscript in the next step.
            var imgs = doc.GetElementsByTagName("img");

            ForEachElement(imgs, static img => {
                for (var i = 0; i < img.Attributes.Length; i++)
                {
                    var attr = img.Attributes[i]!;

                    // Any source-like attribute means the image is worth keeping.
                    if (attr.Name is "src" or "srcset" or "data-src" or "data-srcset")
                    {
                        return;
                    }

                    // So does any attribute whose value references an image file.
                    if (Regex.IsMatch(attr.Value, @"\.(jpg|jpeg|png|webp)", RegexOptions.IgnoreCase))
                    {
                        return;
                    }
                }

                img.Parent!.RemoveChild(img);
            });

            // Next find noscript and try to extract its image
            var noscripts = doc.GetElementsByTagName("noscript");

            ForEachElement(noscripts, static noscript => {
                // Parse content of noscript and make sure it only contains image
                var ownerDocument = (IHtmlDocument)noscript.GetRoot();

                var tmp       = ownerDocument.CreateElement("div");
                tmp.InnerHtml = noscript.InnerHtml;

                if (!IsSingleImage(tmp))
                {
                    return;
                }

                // If noscript has previous sibling and it only contains image,
                // replace it with noscript content. However we also keep old
                // attributes that might contain an image.
                if (noscript.PreviousElementSibling is IElement prevElement && IsSingleImage(prevElement))
                {
                    var prevImg = prevElement;
                    if (prevImg.TagName is not "IMG")
                    {
                        prevImg = prevElement.GetElementsByTagName("img")[0];
                    }

                    var newImg = tmp.GetElementsByTagName("img")[0];
                    for (var i = 0; i < prevImg.Attributes.Length; i++)
                    {
                        var attr = prevImg.Attributes[i]!;
                        if (attr.Value is "")
                        {
                            continue;
                        }

                        if (attr.Name is "src" or "srcset" ||
                            Regex.IsMatch(attr.Value, @"\.(jpg|jpeg|png|webp)", RegexOptions.IgnoreCase))
                        {
                            // Nothing to carry over when the new image already has the same value.
                            if (string.Equals(newImg.GetAttribute(attr.Name), attr.Value, StringComparison.Ordinal))
                            {
                                continue;
                            }

                            // Do not overwrite an attribute of the new image; keep the
                            // old value under a "data-old-" prefixed name instead.
                            var attrName = attr.Name;
                            if (newImg.HasAttribute(attrName))
                            {
                                attrName = "data-old-" + attrName;
                            }

                            newImg.SetAttribute(attrName, attr.Value);
                        }
                    }

                    noscript.Parent!.ReplaceChild(tmp.FirstElementChild!, prevElement);
                }
            });
        }
Example #8
0
        /// <summary>
        /// Parses the route types from the table row whose first cell contains "Type:".
        /// </summary>
        /// <remarks>
        /// The text of the row's second cell is HTML-decoded and scanned,
        /// case-insensitively, for each known type keyword. A matched keyword is
        /// removed from the text before the next keyword is tried, so that e.g. "TRAD"
        /// is not also counted as "TR".
        /// </remarks>
        /// <param name="doc">The parsed HTML document.</param>
        /// <returns>
        /// The route types found, in keyword order; an empty list when the document has
        /// no "Type:" row or that row has fewer than two cells.
        /// </returns>
        public static List<Route.RouteType> ParseRouteTypes(IHtmlDocument doc)
        {
            List<Route.RouteType> result = new List<Route.RouteType>();

            // Rows without any <td> (for example header rows) are skipped rather than dereferenced.
            var typeRow = doc.GetElementsByTagName("tr").FirstOrDefault(row =>
            {
                var firstCell = row.GetElementsByTagName("td").FirstOrDefault();
                return firstCell != null && firstCell.TextContent.Contains("Type:");
            });

            if (typeRow == null)
            {
                return result;
            }

            var cells = typeRow.GetElementsByTagName("td");
            if (cells.Length < 2)
            {
                return result;
            }

            string typeString = HttpUtility.HtmlDecode(cells[1].TextContent).Trim();

            // Order matters: "TRAD" has to be tried (and removed) before "TR" so that
            // we don't accidentally match "TR" instead of "TRAD".
            var knownTypes = new[]
            {
                new KeyValuePair<string, Route.RouteType>("BOULDER", Route.RouteType.Boulder),
                new KeyValuePair<string, Route.RouteType>("TRAD", Route.RouteType.Trad),
                new KeyValuePair<string, Route.RouteType>("TR|TOP ROPE", Route.RouteType.TopRope),
                new KeyValuePair<string, Route.RouteType>("AID", Route.RouteType.Aid),
                new KeyValuePair<string, Route.RouteType>("SPORT", Route.RouteType.Sport),
                new KeyValuePair<string, Route.RouteType>("MIXED", Route.RouteType.Mixed),
                new KeyValuePair<string, Route.RouteType>("ICE", Route.RouteType.Ice),
                new KeyValuePair<string, Route.RouteType>("ALPINE", Route.RouteType.Alpine),
                new KeyValuePair<string, Route.RouteType>("SNOW", Route.RouteType.Snow)
            };

            foreach (var knownType in knownTypes)
            {
                if (Regex.IsMatch(typeString, knownType.Key, RegexOptions.IgnoreCase))
                {
                    // Strip the keyword so later, shorter keywords cannot match inside it.
                    typeString = Regex.Replace(typeString, knownType.Key, "", RegexOptions.IgnoreCase);
                    result.Add(knownType.Value);
                }
            }

            return result;
        }
Example #9
0
        /// <summary>
        /// Parses <paramref name="input"/> as HTML and rewrites every anchor whose <c>href</c>
        /// starts with <c>xref:</c> to the link returned by <c>context.TryGetXrefLink</c>.
        /// </summary>
        /// <param name="input">The document whose links are resolved.</param>
        /// <param name="context">Execution context used for parsing, xref lookup and content streams.</param>
        /// <param name="failures">
        /// Collects the lookup error messages, keyed by <c>input.Source.FullPath</c>.
        /// </param>
        /// <returns>
        /// <c>null</c> when at least one xref could not be resolved; a clone of
        /// <paramref name="input"/> carrying the rewritten HTML when links were replaced.
        /// NOTE(review): the remainder of this method is not visible in this excerpt, so the
        /// result for the "nothing to do" paths is not documented here.
        /// </returns>
        private static async Task <Common.IDocument> ResolveDocumentXrefsAsync(
            Common.IDocument input,
            IExecutionContext context,
            ConcurrentDictionary <string, ConcurrentBag <string> > failures)
        {
            IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);

            if (htmlDocument != null)
            {
                // Find and replace "xref:" in links
                bool modifiedDocument = false;
                bool errors           = false;
                foreach (IElement element in htmlDocument
                         .GetElementsByTagName("a")
                         .Where(x => x.HasAttribute("href")))
                {
                    string href = element.GetAttribute("href");
                    // A bare "xref:" with nothing after the prefix is ignored.
                    if (href.StartsWith("xref:") && href.Length > 5)
                    {
                        string xref                  = href.Substring(5);
                        string queryAndFragment      = string.Empty;
                        int    queryAndFragmentIndex = xref.IndexOfAny(new[] { '#', '?' });
                        // Split off everything from the first '#' or '?' so it can be re-appended
                        // to the resolved link; an index of 0 (no xref name before it) is left alone.
                        if (queryAndFragmentIndex > 0)
                        {
                            queryAndFragment = xref.Substring(queryAndFragmentIndex);
                            xref             = xref.Substring(0, queryAndFragmentIndex);
                        }
                        if (context.TryGetXrefLink(xref, out string xrefLink, out string error))
                        {
                            element.Attributes["href"].Value = xrefLink + queryAndFragment;
                        }
                        else
                        {
                            // Continue processing so we can report all the failures in a given document
                            failures.AddOrUpdate(
                                input.Source.FullPath,
                                _ => new ConcurrentBag <string> {
                                error
                            },
                                (_, list) =>
                            {
                                list.Add(error);
                                return(list);
                            });
                            errors = true;
                        }
                        // Set for failed lookups too; the errors check below returns before
                        // this flag is consulted in that case.
                        modifiedDocument = true;
                    }
                }

                // Exit if there were errors
                if (errors)
                {
                    return(null);
                }

                // Return a new document with the replacements if we performed any
                if (modifiedDocument)
                {
                    using (Stream contentStream = await context.GetContentStreamAsync())
                    {
                        using (StreamWriter writer = contentStream.GetWriter())
                        {
                            // Serialize the modified DOM into the content stream and flush it
                            // before the stream is handed to the content provider.
                            htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                            writer.Flush();
                            return(input.Clone(context.GetContentProvider(contentStream, MediaTypes.Html)));
                        }
                    }
                }
            }
Example #10
0
        /// <summary>
        /// Find all &lt;noscript&gt; elements that directly follow an image (or an element wrapping
        /// a single image) and which themselves contain only one single &lt;img&gt; element. The
        /// preceding image is replaced by the image from inside the &lt;noscript&gt;; image-related
        /// attributes of the replaced image are carried over. The &lt;noscript&gt; element itself is
        /// not removed by this method. This improves the quality of the images we use on some
        /// sites (e.g. Medium).
        /// </summary>
        /// <param name="doc">The document to operate on</param>
        internal static void UnwrapNoscriptImages(IHtmlDocument doc)
        {
            // Matched case-insensitively so that values such as "photo.JPG" count as images too.
            const string imageExtensionPattern = @"\.(jpg|jpeg|png|webp)";

            // Find img without source or attributes that might contain an image, and remove it.
            // This prevents a placeholder img from being replaced by the noscript img in the next step.
            var imgs = doc.GetElementsByTagName("img");

            ForEachNode(imgs, (img) => {
                if (!(img is IElement imgElement))
                {
                    return;
                }

                for (var i = 0; i < imgElement.Attributes.Length; i++)
                {
                    var attr = imgElement.Attributes[i];
                    switch (attr.Name)
                    {
                    case "src":
                    case "srcset":
                    case "data-src":
                    case "data-srcset":
                        // The image has a usable source attribute: keep it.
                        return;
                    }

                    if (Regex.IsMatch(attr.Value, imageExtensionPattern, RegexOptions.IgnoreCase))
                    {
                        // Some other attribute points at an image file: keep it.
                        return;
                    }
                }

                img.Parent?.RemoveChild(img);
            });

            // Next find noscript and try to extract its image
            var noscripts = doc.GetElementsByTagName("noscript");

            ForEachNode(noscripts, (noscript) => {
                if (!(noscript is IElement noscriptElement))
                {
                    return;
                }

                // Parse content of noscript and make sure it only contains an image
                var tmp       = doc.CreateElement("div");
                tmp.InnerHtml = noscriptElement.InnerHtml;
                if (!IsSingleImage(tmp))
                {
                    return;
                }

                // If noscript has a previous sibling and it only contains an image,
                // replace it with the noscript content. However we also keep old
                // attributes that might contain an image.
                var prevElement = noscriptElement.PreviousElementSibling;
                if (prevElement == null || !IsSingleImage(prevElement))
                {
                    return;
                }

                var prevImg = prevElement;
                if (prevImg.TagName != "IMG")
                {
                    prevImg = prevElement.GetElementsByTagName("img")[0];
                }

                var newImg = tmp.GetElementsByTagName("img")[0];
                for (var i = 0; i < prevImg.Attributes.Length; i++)
                {
                    var attr = prevImg.Attributes[i];
                    if (attr.Value == "")
                    {
                        continue;
                    }

                    var isImageAttribute = attr.Name == "src" || attr.Name == "srcset" ||
                                           Regex.IsMatch(attr.Value, imageExtensionPattern, RegexOptions.IgnoreCase);
                    if (!isImageAttribute)
                    {
                        continue;
                    }

                    if (newImg.GetAttribute(attr.Name) == attr.Value)
                    {
                        // The new image already carries exactly this value.
                        continue;
                    }

                    // Do not overwrite an attribute the new image already has; keep the old
                    // value under a "data-old-" prefixed name instead.
                    var attrName = attr.Name;
                    if (newImg.HasAttribute(attrName))
                    {
                        attrName = "data-old-" + attrName;
                    }

                    newImg.SetAttribute(attrName, attr.Value);
                }

                noscript.Parent?.ReplaceChild(tmp.FirstElementChild, prevElement);
            });
        }
        /// <summary>
        /// Rewrites the supported HTML elements of <paramref name="doc"/> in place as BBCode
        /// markup by assigning each element's <c>OuterHtml</c>. Element kinds switched off through
        /// the <c>BBIgnoring*</c> flags, <c>BBIgnoredTags</c> or <c>BBIgnoredElements</c> are left
        /// untouched.
        /// </summary>
        /// <remarks>
        /// Every loop walks a reversed snapshot of the matching elements instead of the live
        /// collection: assigning <c>OuterHtml</c> replaces the element in the tree, so the live
        /// collection must not be enumerated while doing so, and reverse document order converts
        /// nested elements before the ancestors that contain them.
        /// </remarks>
        /// <param name="doc">The document to convert; it is modified in place.</param>
        /// <returns>The same <paramref name="doc"/> instance.</returns>
        public IHtmlDocument BBCodeConversion(IHtmlDocument doc)
        {
            if (!BBIgnoringLinks)
            {
                foreach (var element in doc.All.OfType<IHtmlAnchorElement>().Reverse().ToList())
                {
                    element.OuterHtml = $"[url={element.Href}]{element.InnerHtml}[/url]";
                }
            }

            if (!BBIgnoringImages)
            {
                foreach (var element in doc.All.OfType<IHtmlImageElement>().Reverse().ToList())
                {
                    element.OuterHtml = $"[img]{element.Source}[/img]";
                }
            }

            if (!BBIgnoringLists)
            {
                foreach (var element in doc.All.OfType<IHtmlOrderedListElement>().Reverse().ToList())
                {
                    element.OuterHtml = $"[olist]{element.InnerHtml}[/olist]";
                }

                foreach (var element in doc.All.OfType<IHtmlUnorderedListElement>().Reverse().ToList())
                {
                    element.OuterHtml = $"[list]{element.InnerHtml}[/list]";
                }

                foreach (var element in doc.All.OfType<IHtmlListItemElement>().Reverse().ToList())
                {
                    element.OuterHtml = $"[*]{element.InnerHtml}";
                }
            }

            if (!BBIgnoringHeaders)
            {
                foreach (var element in doc.All.OfType<IHtmlHeadingElement>().Reverse().ToList())
                {
                    // Build the tag from the element name rather than text-replacing "<h1>" etc.,
                    // so headings that carry attributes (e.g. <h2 class="x">) are converted too.
                    var tag = element.LocalName;
                    element.OuterHtml = $"[{tag}]{element.InnerHtml}[/{tag}]";
                }
            }

            if (BBIgnoredTags.IndexOf(TagNames.Strong) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.Strong).Reverse().ToList())
                {
                    element.OuterHtml = $"[b]{element.InnerHtml}[/b]";
                }
            }

            if (BBIgnoredTags.IndexOf(TagNames.Em) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.Em).Reverse().ToList())
                {
                    element.OuterHtml = $"[i]{element.InnerHtml}[/i]";
                }
            }

            if (BBIgnoredTags.IndexOf(TagNames.U) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.U).Reverse().ToList())
                {
                    element.OuterHtml = $"[u]{element.InnerHtml}[/u]";
                }
            }

            if (BBIgnoredTags.IndexOf(TagNames.P) < 0 && BBIgnoredElements.IndexOf(typeof(IHtmlParagraphElement)) < 0)
            {
                foreach (var element in doc.All.OfType<IHtmlParagraphElement>().Reverse().ToList())
                {
                    // Paragraphs have no BBCode tag; keep the content followed by a line break.
                    element.OuterHtml = element.InnerHtml + Environment.NewLine;
                }
            }

            if (BBIgnoredTags.IndexOf(TagNames.Code) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.Code).Reverse().ToList())
                {
                    element.OuterHtml = $"[code]{element.InnerHtml}[/code]";
                }
            }

            if (BBIgnoredTags.IndexOf(TagNames.Pre) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.Pre).Reverse().ToList())
                {
                    element.OuterHtml = $"[noparse]{element.InnerHtml}[/noparse]";
                }
            }

            if (BBIgnoredTags.IndexOf(TagNames.Strike) < 0)
            {
                foreach (var element in doc.GetElementsByTagName(TagNames.Strike).Reverse().ToList())
                {
                    element.OuterHtml = $"[strike]{element.InnerHtml}[/strike]";
                }
            }

            /*
             * foreach (var element in doc.All)
             * {
             *      Log.Here().Activity($"Element | Type {element.GetType()} Content: {element.OuterHtml}");
             * }
             */

            return doc;
        }
Example #12
0
        /// <summary>
        /// Builds a <see cref="RaceModel"/> from a race results page.
        /// </summary>
        /// <remarks>
        /// The race name is the first line of the first "ResultsArchiveTitle" element. One
        /// result is read from every row of the first &lt;tbody&gt;, using cells 1 to 7 of the row
        /// (position, number, driver names, car, laps, time, points).
        /// </remarks>
        /// <param name="document">The parsed results page.</param>
        /// <returns>The race name, the parsed rows and the raw HTML of the results table.</returns>
        /// <exception cref="InvalidOperationException">
        /// A required element ("ResultsArchiveTitle", &lt;tbody&gt; or "resultsarchive-col-right") is missing.
        /// </exception>
        /// <exception cref="FormatException">A numeric or time cell cannot be parsed.</exception>
        public RaceModel Parse(IHtmlDocument document)
        {
            var trace = document.GetElementsByClassName("ResultsArchiveTitle").First();

            // Only the first line of the title is the race name; a single-line title is used as is.
            var    raceName  = trace.InnerHtml.Trim();
            int    lineBreak = raceName.IndexOf('\n');
            string name      = lineBreak >= 0 ? raceName.Substring(0, lineBreak) : raceName;

            var tbody = document.GetElementsByTagName("tbody").First();

            var trs = tbody.GetElementsByTagName("tr");

            var list = new List<RacingResult>();

            foreach (var item in trs)
            {
                var cells      = item.Children;
                var driverCell = cells[3];

                var racingResult = new RacingResult();

                racingResult.Pos             = cells[1].InnerHtml;
                racingResult.No              = int.Parse(cells[2].InnerHtml, CultureInfo.InvariantCulture);
                racingResult.DriverFirstName = driverCell.Children[0].InnerHtml;
                racingResult.DriverLastName  = driverCell.Children[1].InnerHtml;
                racingResult.DriverShortName = driverCell.Children[2].InnerHtml;
                racingResult.Car             = cells[4].InnerHtml;
                racingResult.Laps            = int.Parse(cells[5].InnerHtml, CultureInfo.InvariantCulture);

                string timeStr = cells[6].InnerHtml;
                if (cells[6].ChildElementCount == 0)
                {
                    // Plain-text cell: either a DNF/DNS marker or a time span.
                    if (timeStr == "DNF" || timeStr == "DNS")
                    {
                        racingResult.Time = null;
                    }
                    else
                    {
                        racingResult.Time = TimeSpan.Parse(timeStr, CultureInfo.InvariantCulture);
                    }
                }
                else
                {
                    // The cell contains markup: the text before the first tag is a decimal
                    // number of seconds, stored as whole seconds plus milliseconds.
                    string  secondStr = timeStr.Substring(0, timeStr.IndexOf('<'));
                    decimal seconds   = decimal.Parse(secondStr, CultureInfo.InvariantCulture);

                    racingResult.Retired = new TimeSpan(0, 0, 0, (int)seconds, (int)((seconds - (int)seconds) * 1000));
                }

                racingResult.Pts = int.Parse(cells[7].InnerHtml, CultureInfo.InvariantCulture);

                list.Add(racingResult);
            }

            var table = document.GetElementsByClassName("resultsarchive-col-right").First().OuterHtml;

            return new RaceModel()
            {
                RaceName = name, RaceResults = list.ToArray(), ResultTable = table
            };
        }