/// <summary>
/// Builds a <see cref="User"/> from the account settings page and the profile settings page of that user.
/// </summary>
/// <param name="accountSettingsPage">The account settings page, from which the user name and the email address are read.</param>
/// <param name="profileSettingsPage">The profile settings page, from which the full name and the avatar are read.</param>
/// <returns>Returns the user populated with the parsed information.</returns>
/// <exception cref="NineGagException">Thrown when the values cannot be read from either of the two pages.</exception>
public static User FromHtml(IHtmlDocument accountSettingsPage, IHtmlDocument profileSettingsPage)
{
    User parsedUser = new User();

    // Account settings: the user name and the email address are the values of two inputs inside the "#setting" form
    try
    {
        IElement settingsForm = accountSettingsPage.QuerySelector("#setting");

        IElement loginNameInput = settingsForm
            .QuerySelectorAll("input")
            .FirstOrDefault(candidate => candidate.GetAttribute("name") == "login_name");
        parsedUser.UserName = loginNameInput.GetAttribute("value");

        IElement emailInput = settingsForm
            .QuerySelectorAll("input")
            .FirstOrDefault(candidate => candidate.GetAttribute("name") == "email");
        parsedUser.EmailAddress = emailInput.GetAttribute("value");
    }
    catch (Exception exception)
    {
        throw new NineGagException("The user name and the email address could not be parsed. This could be an indicator, that the 9GAG website is down or its content has changed. If this problem keeps coming, then please report this problem to 9GAG or the maintainer of the library.", exception);
    }

    // Profile settings: the full name is the value of an input, the avatar is the source of the avatar image
    try
    {
        IElement fullNameInput = profileSettingsPage
            .QuerySelectorAll("input")
            .FirstOrDefault(candidate => candidate.GetAttribute("name") == "fullName");
        parsedUser.FullName = fullNameInput.GetAttribute("value");

        string avatarSource = profileSettingsPage.QuerySelector("#jsid-profile-avatar").GetAttribute("src");
        parsedUser.AvatarUri = new Uri(avatarSource, UriKind.Absolute);
    }
    catch (Exception exception)
    {
        throw new NineGagException("The full name and the avatar image could not be parsed. This could be an indicator, that the 9GAG website is down or its content has changed. If this problem keeps coming, then please report this problem to 9GAG or the maintainer of the library.", exception);
    }

    return parsedUser;
}
/// <summary>
/// Parses the page source as HTML and starts collecting the series information from it.
/// (Original note, translated: "Analyze the page and retrieve the JSON.")
/// </summary>
/// <param name="source">The page content that is handed to <c>HtmlParser.ParseDocument</c>.</param>
/// <returns>
/// A <c>RemotePlaylistInfo</c>; when the video list query yields null, the freshly created instance is returned as-is.
/// </returns>
private RemotePlaylistInfo AnalyzePage(string source) {
    var series = new RemotePlaylistInfo();
    IHtmlDocument?document = HtmlParser.ParseDocument(source);
    // The video entries of the series are the elements with the "SeriesVideoListContainer-video" class
    IHtmlCollection <IElement>?videos = document?.QuerySelectorAll(".SeriesVideoListContainer-video");
    if (videos is null) { return(series); }
    // The owner name is the inner HTML of the owner element (empty string when the element is absent)
    IElement?ownerElm = document?.QuerySelector(".SeriesAdditionalContainer-ownerName");
    string ownerName = ownerElm?.InnerHtml ?? string.Empty;
    // The owner id is the last "/"-separated segment of the href; "0" is parsed when the element or attribute is missing
    int ownerID = int.Parse((ownerElm?.GetAttribute("href")?.Split("/")[^ 1]) ?? "0");
/// <summary>
/// Parses the detail information of the post and adds the image found on the details page to its content.
/// </summary>
/// <param name="htmlDocument">The HTML document, which contains the details page of the post.</param>
/// <exception cref="NineGagException">If anything goes wrong during the retrieval of the details, an <see cref="NineGagException"/> exception is thrown.</exception>
protected override void ParseDetailInformation(IHtmlDocument htmlDocument)
{
    // Lets the base class parse everything it knows about first
    base.ParseDetailInformation(htmlDocument);

    // The first image inside the article element is taken as the larger version of the post image
    try
    {
        var existingContent = this.Content;
        string largeImageSource = htmlDocument.QuerySelector("article img").GetAttribute("src");
        var largeImage = new Content
        {
            Uri = new Uri(largeImageSource, UriKind.Absolute),
            Kind = ContentKind.Jpeg
        };

        this.Content = existingContent.Union(new List<Content> { largeImage }).ToList();
    }
    catch (Exception exception)
    {
        throw new NineGagException("The larger version of the content of the image post could not be retrieved. Maybe there is no internet connection available.", exception);
    }
}
/// <summary>
/// Asserts that the error summary contains a link for the given input with the expected message, that the
/// link target exists in the document and, optionally, that the target shows the same message.
/// </summary>
/// <param name="document">The document to inspect.</param>
/// <param name="summaryInputName">The input name used in the id of the summary link ("error-summary-{name}").</param>
/// <param name="spanInputName">The input name passed on to <c>AssertErrorMessage</c>; null skips that check.</param>
/// <param name="expectedMessage">The message text that is expected.</param>
public static void AssertErrorSummaryMessage(this IHtmlDocument document, string summaryInputName, string spanInputName, string expectedMessage)
{
    // The summary must contain a link for this input carrying the expected text
    var summaryLink = (IHtmlAnchorElement)document?.QuerySelector(EscapeQuerySelector($"a#error-summary-{summaryInputName}"));
    Assert.NotNull(summaryLink);
    Assert.Equal(expectedMessage, summaryLink.TextContent);

    // The fragment of the link must point at an element that exists in the document
    var targetId = summaryLink.Href.Split("#").Last();
    var linkTarget = document.QuerySelector(EscapeQuerySelector($"#{targetId}"));
    Assert.NotNull(linkTarget);

    // Some errors relate to several fields together and therefore do not link to a particular field
    if (spanInputName == null)
    {
        return;
    }

    // The linked element must show the same message
    linkTarget.AssertErrorMessage(spanInputName, expectedMessage);
}
// todo: find a more concrete solution to this problem.
/// <summary>
/// Tells whether the page marks the profile as private, judged by the text of the permission level element.
/// </summary>
/// <param name="pageData">The profile page.</param>
/// <returns>True when the permission level element exists and its text is exactly "Private Profile".</returns>
private static bool IsPlayerProfilePrivate(IHtmlDocument pageData)
{
    var permissionLabel = pageData.QuerySelector(".masthead-permission-level-text");
    var permissionText = permissionLabel?.TextContent;
    return permissionText == "Private Profile";
}
/// <summary>
/// Reports whether the root node contains an "og:url" meta element.
/// </summary>
/// <returns>True when the meta element is present; otherwise false.</returns>
public bool IsVideoAvailable() =>
    _root.QuerySelector("meta[property=\"og:url\"]") != null;
/// <summary>
/// Reads the publication date from the full article and stores it on the article.
/// </summary>
/// <param name="_article">The article whose <c>DateTime</c> is set.</param>
/// <param name="reducedArticle">Not used by this implementation.</param>
/// <param name="fullArticle">The full article page containing the "datePublished" element.</param>
protected override void GetDateTime(Article _article, IElement reducedArticle, IHtmlDocument fullArticle)
{
    // The date is the "content" attribute of the datePublished element inside the container
    var container = fullArticle.QuerySelector("div.container");
    var publishedElement = container.QuerySelector("[itemprop = datePublished]");
    var publishedAttribute = publishedElement.Attributes["content"];

    _article.DateTime = DateTime.Parse(publishedAttribute.Value);
}
/// <summary>
/// Gets the source of the competitive rank image.
/// </summary>
/// <param name="doc">The profile page.</param>
/// <returns>The value of the image's "src" attribute, or an empty string when the image or the attribute is missing.</returns>
private static string CompetitiveRankImage(IHtmlDocument doc)
{
    // Read the attribute directly instead of stripping markup from the serialized element:
    // the string replacement only worked for an <img> whose sole attribute was "src".
    var imageSource = doc.QuerySelector("div.competitive-rank img")?.GetAttribute("src");
    return string.IsNullOrEmpty(imageSource) ? string.Empty : imageSource;
}
/// <summary>
/// Reads the endorsement level from the profile page.
/// </summary>
/// <param name="doc">The profile page.</param>
/// <returns>The parsed level, or 0 when the element is missing or its text is not a valid number.</returns>
private static ushort EndorsementLevel(IHtmlDocument doc)
{
    var levelText = doc.QuerySelector("div.endorsement-level div.u-center")?.TextContent;

    // TryParse leaves 0 in the out value on failure, which is the fallback result
    if (ushort.TryParse(levelText, out ushort level))
    {
        return level;
    }

    return 0;
}
/// <summary>
/// Looks for a schema.org Product element in the document and remembers it in <c>cell</c>.
/// </summary>
/// <param name="dom">The document to search.</param>
/// <returns>True when such an element was found; otherwise false.</returns>
public bool Check(IHtmlDocument dom)
{
    const string productSelector = "[itemtype=\"http://schema.org/Product\"]";

    cell = dom.QuerySelector(productSelector);

    bool productFound = cell != null;
    return productFound;
}
/// <summary>
/// Attempts to get metadata for the article.
/// </summary>
/// <remarks>
/// Values found in the meta elements are written into <paramref name="jsonLD"/> itself when it is not null,
/// so the caller's dictionary gains entries.
/// </remarks>
/// <param name="doc">The document</param>
/// <param name="uri">The uri, possibly used to check for a date</param>
/// <param name="language">The language that was possibly found in the headers of the response</param>
/// <param name="jsonLD">The dictionary containing metadata found in JSON LD; null is treated as empty</param>
/// <returns>The metadata object with all the info found</returns>
internal static Metadata GetArticleMetadata(IHtmlDocument doc, Uri uri, string language, Dictionary<string, string> jsonLD)
{
    Metadata metadata = new Metadata();

    // A missing JSON LD dictionary must not break the meta tag parsing
    Dictionary<string, string> values = jsonLD ?? new Dictionary<string, string>();
    var metaElements = doc.GetElementsByTagName("meta");

    // Match "description", or Twitter's "twitter:description" (Cards)
    // in name attribute.
    // name is a single value
    var namePattern = @"^\s*((?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)|name)\s*$";

    // Match Facebook's Open Graph title & description properties.
    // property is a space-separated list of values
    var propertyPattern = @"\s*(dc|dcterm|og|twitter|article)\s*:\s*(author|creator|description|title|published_time|image|site_name)(\s+|$)";

    var itemPropPattern = @"\s*datePublished\s*";

    // Find description tags.
    NodeUtility.ForEachNode(metaElements, (element) =>
    {
        var meta = element as IElement;
        if (meta == null)
        {
            return;
        }

        var elementName = meta.GetAttribute("name") ?? "";
        var elementProperty = meta.GetAttribute("property") ?? "";
        var itemProp = meta.GetAttribute("itemprop") ?? "";
        var content = meta.GetAttribute("content");

        // avoid issues with no meta tags
        if (string.IsNullOrEmpty(content))
        {
            return;
        }

        MatchCollection matches = null;
        string name = "";

        if (new string[] { elementName, elementProperty, itemProp }.ToList().IndexOf("author") != -1)
        {
            values["author"] = meta.GetAttribute("content");
        }

        if (!string.IsNullOrEmpty(elementProperty))
        {
            matches = Regex.Matches(elementProperty, propertyPattern);
            if (matches.Count > 0)
            {
                for (int i = matches.Count - 1; i >= 0; i--)
                {
                    // Convert to lowercase, and remove any whitespace
                    // so we can match below.
                    // Invariant lowercasing: the keys are compared with ASCII literals, which a
                    // culture-sensitive conversion (e.g. Turkish 'I') would break.
                    name = Regex.Replace(matches[i].Value.ToLowerInvariant(), @"\s+", "");

                    // multiple authors
                    values[name] = content.Trim();
                }
            }
        }

        if ((matches == null || matches.Count == 0) && !string.IsNullOrEmpty(elementName) && Regex.IsMatch(elementName, namePattern, RegexOptions.IgnoreCase))
        {
            name = elementName;
            if (!string.IsNullOrEmpty(content))
            {
                // Convert to lowercase, remove any whitespace, and convert dots
                // to colons so we can match below.
                name = Regex.Replace(Regex.Replace(name.ToLowerInvariant(), @"\s+", ""), @"\.", ":");
                values[name] = content.Trim();
            }
        }
        else if (Regex.IsMatch(elementProperty, propertyPattern, RegexOptions.IgnoreCase))
        {
            name = elementProperty;
        }
        else if (Regex.IsMatch(itemProp, itemPropPattern, RegexOptions.IgnoreCase))
        {
            name = itemProp;
        }

        if (!string.IsNullOrEmpty(name))
        {
            content = meta.GetAttribute("content");
            if (!string.IsNullOrEmpty(content))
            {
                // Convert to lowercase and remove any whitespace
                // so we can match below.
                name = Regex.Replace(name.ToLowerInvariant(), @"\s", "", RegexOptions.IgnoreCase);
                if (!values.ContainsKey(name))
                {
                    values.Add(name, content.Trim());
                }
            }
        }
    });

    // Returns the first non-empty value stored under the given keys (in priority order), or "" when there is none
    string FirstNonEmptyValue(params string[] keys)
    {
        foreach (string key in keys)
        {
            if (values.TryGetValue(key, out string candidate) && !string.IsNullOrEmpty(candidate))
            {
                return candidate;
            }
        }

        return "";
    }

    // Returns the first value under the given keys (in priority order) that parses as a date, or null
    DateTime? FirstParsedDate(params string[] keys)
    {
        foreach (string key in keys)
        {
            if (values.TryGetValue(key, out string raw) && DateTime.TryParse(raw, out DateTime parsed) && parsed != DateTime.MinValue)
            {
                return parsed;
            }
        }

        return null;
    }

    // Find the description of the article
    metadata.Excerpt = FirstNonEmptyValue(
        "jsonld:description",
        "description",
        "dc:description",
        "dcterm:description",
        "og:description",
        "weibo:article:description",
        "weibo:webpage:description",
        "twitter:description");

    // Get the name of the site
    metadata.SiteName = FirstNonEmptyValue("jsonld:siteName", "og:site_name");

    // Find the title of the article
    metadata.Title = FirstNonEmptyValue(
        "jsonld:title",
        "dc:title",
        "dcterm:title",
        "og:title",
        "weibo:article:title",
        "weibo:webpage:title",
        "twitter:title",
        "title");

    // Let's try to eliminate the site name from the title
    metadata.Title = CleanTitle(metadata.Title, metadata.SiteName);

    // We did not find any title,
    // we try to get it from the title tag
    if (string.IsNullOrEmpty(metadata.Title))
    {
        metadata.Title = GetArticleTitle(doc);
    }

    // added language extraction
    // (kept as an iterator so the document is only queried when the earlier candidates are empty)
    IEnumerable<string> LanguageHeuristics()
    {
        yield return language;
        yield return doc.GetElementsByTagName("html")[0].GetAttribute("lang");
        yield return doc.GetElementsByTagName("html")[0].GetAttribute("xml:lang");
        yield return doc.QuerySelector("meta[http-equiv=\"Content-Language\"]")?.GetAttribute("content");

        // this is wrong, but it's used
        yield return doc.QuerySelector("meta[name=\"lang\"]")?.GetAttribute("value");
    }

    metadata.Language = LanguageHeuristics().FirstOrDefault(l => !string.IsNullOrEmpty(l)) ?? "";

    // Find the featured image of the article
    metadata.FeaturedImage = FirstNonEmptyValue(
        "jsonld:image",
        "og:image",
        "twitter:image",
        "weibo:article:image",
        "weibo:webpage:image");

    // We try to find a meta tag for the author.
    // Note that there is Open Graph tag for an author,
    // but it usually contains a profile URL of the author.
    // So we do not use it
    metadata.Author = FirstNonEmptyValue("jsonld:author", "dc:creator", "dcterm:creator", "author");

    // added date extraction
    metadata.PublicationDate = FirstParsedDate(
        "jsonld:datePublished",
        "article:published_time",
        "date",
        "datepublished",
        "weibo:article:create_at",
        "weibo:webpage:create_at");

    if (metadata.PublicationDate == null)
    {
        var times = doc.GetElementsByTagName("time");

        foreach (var time in times)
        {
            if (!string.IsNullOrEmpty(time.GetAttribute("pubDate"))
                && DateTime.TryParse(time.GetAttribute("datetime"), out DateTime date))
            {
                metadata.PublicationDate = date;
            }
        }
    }

    // PathAndQuery is only available on absolute URIs, so anything else skips the URL heuristic
    if (metadata.PublicationDate == null && uri != null && uri.IsAbsoluteUri)
    {
        // as a last resort check the URL for a date
        Match maybeDate = Regex.Match(uri.PathAndQuery, "/(?<year>[0-9]{4})/(?<month>[0-9]{2})/(?<day>[0-9]{2})?");
        if (maybeDate.Success)
        {
            int year = int.Parse(maybeDate.Groups["year"].Value);
            int month = int.Parse(maybeDate.Groups["month"].Value);
            int day = !string.IsNullOrEmpty(maybeDate.Groups["day"].Value) ? int.Parse(maybeDate.Groups["day"].Value) : 1;

            // The pattern accepts any digits (e.g. "/2020/13/45/"), so only build the date when
            // the numbers form a real calendar date; otherwise the constructor would throw.
            bool isCalendarDate = year >= 1
                && month >= 1 && month <= 12
                && day >= 1 && day <= DateTime.DaysInMonth(year, month);

            if (isCalendarDate)
            {
                metadata.PublicationDate = new DateTime(year, month, day);
            }
        }
    }

    // in many sites the meta value is escaped with HTML entities,
    // so here we need to unescape it
    metadata.Title = UnescapeHtmlEntities(metadata.Title);
    metadata.Excerpt = UnescapeHtmlEntities(metadata.Excerpt);
    metadata.SiteName = UnescapeHtmlEntities(metadata.SiteName);

    return metadata;
}
/// <summary>
/// Creates one <typeparamref name="TModel"/> per element matched by the CSS selector of the model's
/// <c>BindAttribute</c> and fills the properties that carry a <c>BindAttribute</c> of their own.
/// </summary>
/// <remarks>
/// Every "{key}" placeholder in the model selector is replaced with the matching entry of
/// <c>CssSelectorParameters</c>. A property whose element cannot be found keeps its default value.
/// </remarks>
/// <typeparam name="TModel">The model type to create; must carry a <c>BindAttribute</c>.</typeparam>
/// <param name="htmlDocument">The document the selectors are applied to.</param>
/// <param name="postBindAction">Optional action invoked on each instance after its properties were bound.</param>
/// <returns>The bound model instances, in document order of the matched elements.</returns>
/// <exception cref="BindAttributeNotFoundException">The model type has no <c>BindAttribute</c>.</exception>
/// <exception cref="CssSelectorNotFoundException">The model's <c>BindAttribute</c> has no CSS selector.</exception>
private IEnumerable<TModel> BindModelWithHtmlDocument<TModel>(IHtmlDocument htmlDocument, Action<TModel> postBindAction = null)
    where TModel : class, new()
{
    Type modelType = typeof(TModel);

    BindAttribute bindAttribute = modelType.GetCustomAttributes<BindAttribute>(false).FirstOrDefault();
    if (bindAttribute == null)
    {
        throw new BindAttributeNotFoundException($"BindAttribute not found for model {modelType.FullName}");
    }

    string cssSelector = bindAttribute.CssSelector;
    if (string.IsNullOrEmpty(cssSelector))
    {
        // The message used to repeat the "BindAttribute not found" text, which hid the actual cause
        throw new CssSelectorNotFoundException($"CssSelector not found in BindAttribute of model {modelType.FullName}");
    }

    // TODO : @deniz this should be moved to a shared place.
    // Substitute every placeholder, so selectors with more than one parameter work as well.
    foreach (var parameter in CssSelectorParameters)
    {
        cssSelector = cssSelector.Replace($"{{{parameter.Key}}}", parameter.Value);
    }

    // The bindable properties do not depend on the element, so they are resolved once up front.
    // TODO : @deniz how each property gets set should be decided by Func<>s built from the property's
    // attribute; for performance those could be compiled to expressions and cached (or IL emitted).
    var boundProperties = modelType.GetProperties()
        .Select(property => new
        {
            Property = property,
            Binding = property.GetCustomAttributes<BindAttribute>(false).FirstOrDefault()
        })
        .Where(entry => entry.Binding != null)
        .ToList();

    List<TModel> models = new List<TModel>();

    foreach (IElement element in htmlDocument.QuerySelectorAll(cssSelector))
    {
        TModel instance = Activator.CreateInstance<TModel>();

        foreach (var entry in boundProperties)
        {
            BindAttribute binding = entry.Binding;

            // No selector: use the matched element itself; otherwise search the whole document or just the element
            IElement selectedElement = string.IsNullOrEmpty(binding.CssSelector)
                ? element
                : binding.ApplySelectorToHtmlDocument
                    ? htmlDocument.QuerySelector(binding.CssSelector)
                    : element.QuerySelector(binding.CssSelector);

            // Nothing to bind when the element is not in the page; leave the property at its default
            if (selectedElement == null)
            {
                continue;
            }

            string elementValue;
            if (!string.IsNullOrEmpty(binding.AttributeName))
            {
                // A missing attribute yields null instead of a NullReferenceException
                elementValue = selectedElement.Attributes[binding.AttributeName]?.Value;
            }
            else
            {
                elementValue = binding.ElementValueSelector == ElementValueSelector.InnerText
                    ? selectedElement.TextContent
                    : selectedElement.InnerHtml;
            }

            // TODO : @deniz a type conversion may be needed here.
            entry.Property.SetValue(instance, elementValue);
        }

        postBindAction?.Invoke(instance);

        // TODO : something like yield return could be used
        models.Add(instance);
    }

    return models;
}
/// <summary>
/// Rewrites the sources of all images of the document to files inside the output directory, fetches
/// those files, registers them in the OPF file and stores the resulting body markup in the chapter.
/// </summary>
/// <remarks>
/// Remote images are downloaded only when they are served as JPEG or PNG. Failures while downloading
/// an image are ignored on purpose, so a single broken image does not fail the whole chapter.
/// </remarks>
/// <param name="doc">The chapter document whose <c>img</c> elements are processed and rewritten.</param>
/// <param name="opfFile">The OPF file that receives one item per embedded image.</param>
/// <param name="chapter">The chapter whose <c>Content</c> is replaced with the rewritten body.</param>
/// <param name="outputDir">The directory the image files are written to.</param>
/// <returns>A task that completes when all images were processed.</returns>
protected virtual async Task EmbedImagesAsync(IHtmlDocument doc, OpfFile opfFile, Chapter chapter, string outputDir)
{
    var tasks = new List<Task>();
    var images = new Dictionary<Uri, string>();
    string outputDirName = new DirectoryInfo(outputDir).Name;

    foreach (var img in doc.QuerySelectorAll("img"))
    {
        string src = img.GetAttribute("src");

        // An image without a source has nothing to embed
        if (string.IsNullOrEmpty(src))
            continue;

        // Protocol-relative sources ("//host/path") are turned into absolute http URLs
        if (src.StartsWith("//"))
        {
            src = src.Substring(2);
            if (!(src.StartsWith("http://") || src.StartsWith("https://")))
                src = "http://" + src;
        }

        Uri uri;
        if (!Uri.TryCreate(src, UriKind.RelativeOrAbsolute, out uri))
            continue;

        // Drop the query string. UriBuilder only accepts absolute URIs, relative ones are kept as they are.
        if (uri.IsAbsoluteUri)
        {
            UriBuilder ub = new UriBuilder(uri) { Query = string.Empty };
            uri = ub.Uri;
        }

        // An image that occurs several times must point at the one file that is actually fetched for it
        string path;
        if (!images.TryGetValue(uri, out path))
        {
            string fileName = $"{Path.GetRandomFileName()}.{Path.GetExtension(uri.ToString())}".ToValidFilePath();

            // Skip only this image; the remaining images and the chapter content still have to be processed
            if (string.IsNullOrEmpty(fileName))
                continue;

            path = Path.Combine(outputDir, fileName);
            images.Add(uri, path);
        }

        string filePath = Path.Combine(outputDirName, Path.GetFileName(path)).Replace(@"\", "/");
        img.SetAttribute("src", filePath);
    }

    // One client is shared by all downloads instead of creating a new one per image
    using (HttpClient client = new HttpClient())
    {
        foreach (var img in images)
        {
            // NOTE(review): the tasks call opfFile.AddItem concurrently - confirm that OpfFile tolerates this
            tasks.Add(Task.Run(async () =>
            {
                Uri uri = img.Key;
                string path = img.Value;
                string outputPath = Path.Combine(outputDirName, Path.GetFileName(path)).Replace(@"\", "/");

                if (uri.IsAbsoluteUri && !uri.IsFile)
                {
                    try
                    {
                        using (HttpResponseMessage resp = await client.GetAsync(uri.ToString()))
                        {
                            resp.EnsureSuccessStatusCode();

                            string mediaType = resp.Content.Headers.ContentType?.MediaType?.ToLowerInvariant();
                            if (mediaType != MediaType.JpegType && mediaType != MediaType.PngType)
                                return;

                            if (File.Exists(path))
                                return;

                            using (FileStream fs = new FileStream(path, FileMode.CreateNew))
                                await resp.Content.CopyToAsync(fs);
                        }
                    }
                    catch (Exception)
                    {
                        // Best effort: an image that cannot be downloaded is left out
                        return;
                    }
                }
                else
                {
                    // Local image: a file URI has to be converted to a file system path before it can be copied
                    string localPath = uri.IsAbsoluteUri ? uri.LocalPath : uri.ToString();

                    // Do not register a manifest item for a file that was never written
                    if (!File.Exists(localPath))
                        return;

                    File.Copy(localPath, path);
                }

                MediaType mType = MediaType.FromExtension(Path.GetExtension(path));
                if (mType == null)
                    return;

                opfFile.AddItem(new OpfItem(outputPath, StringUtilities.GenerateRandomString(), mType), false);
            }));
        }

        await Task.WhenAll(tasks.ToArray());
    }

    chapter.Content = doc.QuerySelector("body").ChildNodes.ToHtml(new XmlMarkupFormatter());
}
/// <summary>
/// Reads the source of the main article image from the full article and stores it on the article.
/// </summary>
/// <param name="_article">The article whose <c>UrlMainImg</c> is set.</param>
/// <param name="reducedArticle">Not used by this implementation.</param>
/// <param name="fullArticle">The full article page containing the main image.</param>
protected override void GetUrlMainImg(Article _article, IElement reducedArticle, IHtmlDocument fullArticle)
{
    var mainImage = fullArticle.QuerySelector("img.article__main-image__image");
    var sourceAttribute = mainImage.Attributes["src"];

    _article.UrlMainImg = sourceAttribute.Value;
}
/// <summary>
/// Reports whether the root node contains an "og:url" meta element.
/// </summary>
/// <returns>True when the meta element is present; otherwise false.</returns>
public bool ParseIsAvailable()
{
    var urlMeta = _root.QuerySelector("meta[property=\"og:url\"]");
    return urlMeta != null;
}
/// <summary>
/// Fills the task held in <c>fl</c> with the details found on the task page.
/// </summary>
/// <remarks>
/// When the first paragraph of the page contains the "Получить доступ" text, the task is returned
/// without any details. Details that are missing from the page are set to null instead of failing.
/// </remarks>
/// <param name="document">The task page.</param>
/// <returns>The task held in <c>fl</c>.</returns>
public FreelanceTask Parse(IHtmlDocument document)
{
    // In this case the page shows no details, so there is nothing to parse
    var firstParagraph = document.QuerySelector("p");
    if (firstParagraph != null && firstParagraph.InnerHtml.Contains("Получить доступ"))
    {
        return fl;
    }

    fl.Description = document.QuerySelector("p.href_me")?.TextContent;

    // The details are a definition list: each <dt> label is followed by the node holding its value
    foreach (var dt in document.QuerySelectorAll("dt"))
    {
        string label = dt.InnerHtml;

        if (label.Contains("Срок выполнения:"))
        {
            fl.Deadline = dt.NextSibling?.TextContent;
        }

        if (label.Contains("Варианты оплаты:"))
        {
            fl.Prepayment = dt.NextSibling?.TextContent;
        }

        if (label.Contains("Способ оплаты:"))
        {
            // The payment methods are list items of the element following the label
            List<string> payment = new List<string>();
            var paymentItems = dt.NextElementSibling?.QuerySelectorAll("li");
            if (paymentItems != null)
            {
                foreach (var item in paymentItems)
                {
                    payment.Add(item.TextContent);
                }
            }

            fl.Payment = string.Join(", ", payment);
        }

        if (label.Contains("Дата публикации:"))
        {
            fl.PublishDate = dt.NextSibling?.TextContent;
        }
    }

    // The employer name is the title of the avatar link
    fl.Employer = document.QuerySelector("div.avatar>a")?.GetAttribute("title");

    return fl;
}
/// <summary>
/// Stores the text of the post text container of the full article as the body of the article.
/// </summary>
/// <param name="_article">The article whose <c>Body</c> is set.</param>
/// <param name="fullArticle">The full article page containing the "div.post__text" element.</param>
protected override void GetBody(Article _article, IHtmlDocument fullArticle)
{
    var textContainer = fullArticle.QuerySelector("div.post__text");
    _article.Body = textContainer.TextContent;
}
/// <summary>
/// Builds a product from the "itemprop" elements of the product cell found earlier (or of the whole
/// document when there is no such cell).
/// </summary>
/// <param name="productUrl">The URL of the product page; also used to complete root-relative image paths.</param>
/// <param name="dom">The product page.</param>
/// <returns>The parsed product.</returns>
/// <exception cref="ParserException">The price currency, the name or the price could not be found.</exception>
public Product Parse(string productUrl, IHtmlDocument dom)
{
    // Extracting item properties
    var itemPropCells = cell?.QuerySelectorAll("[itemprop]");
    if (cell == null)
    {
        itemPropCells = dom.QuerySelectorAll("[itemprop]");
    }

    // Parse all itemprops to dictionary; the first occurrence of a property wins
    var productProps = new Dictionary<string, string>();
    foreach (var itemProperty in itemPropCells)
    {
        string key = itemProperty.GetAttribute("itemprop");
        if (productProps.ContainsKey(key))
        {
            continue;
        }

        string value = itemProperty.TextContent;
        if (value == "")
        {
            value = itemProperty.GetAttribute("content");
        }

        // Image url parse
        if (key == "image" && (value == "" || value == null))
        {
            value = itemProperty.GetAttribute("src");
        }

        if (value != null)
        {
            value = value.RemoveExtraWS();
        }

        productProps.Add(key, value);
    }

    // TryGetValue leaves null for a missing property, so missing and empty properties are handled
    // alike and end in a ParserException rather than a KeyNotFoundException.
    productProps.TryGetValue("priceCurrency", out string currency);
    productProps.TryGetValue("name", out string name);
    productProps.TryGetValue("price", out string price);
    productProps.TryGetValue("brand", out string brand);
    productProps.TryGetValue("description", out string description);
    productProps.TryGetValue("image", out string image);

    if (currency == null)
    {
        var currencyCell = dom.QuerySelector("[itemprop=\"priceCurrency\"]");
        if (currencyCell == null)
        {
            throw new ParserException("Can't parse product, price currency not found");
        }

        // Read the currency from the element that was found, the same way the properties above are read
        string fallbackCurrency = currencyCell.TextContent;
        if (string.IsNullOrEmpty(fallbackCurrency))
        {
            fallbackCurrency = currencyCell.GetAttribute("content");
        }

        currency = fallbackCurrency?.RemoveExtraWS();
    }

    if (name == null || price == null)
    {
        throw new ParserException("Can't parse product, name or price not found");
    }

    // Root-relative image paths ("/img/x.png", but not protocol-relative "//host/x.png") get the site prefix
    if (image != null && image.StartsWith('/') && !image.StartsWith("//"))
    {
        image = productUrl.GetSecondDomain() + image;
    }

    Product product = new Product
    {
        Url = productUrl,
        Name = name,
        Brand = brand,
        Currency = currency,
        Price = double.Parse(price, CultureInfo.InvariantCulture),
        IsOnSale = false, // TO DO
        Description = description,
        Image = image,
    };

    return product;
}
/// <summary>
/// Gets the source of the player portrait image.
/// </summary>
/// <param name="doc">The profile page.</param>
/// <returns>The value of the "src" attribute of the ".player-portrait" element.</returns>
private static string PortraitImage(IHtmlDocument doc)
{
    var portrait = doc.QuerySelector(".player-portrait");
    return portrait.GetAttribute("src");
}
/// <summary>
/// Reads the competitive rank from the profile page.
/// </summary>
/// <param name="doc">The profile page.</param>
/// <returns>The parsed rank, or 0 when the element is missing or its text is not a valid number.</returns>
private static ushort CompetitiveRank(IHtmlDocument doc)
{
    var rankText = doc.QuerySelector("div.competitive-rank div")?.TextContent;

    // TryParse leaves 0 in the out value on failure, which is the fallback result
    if (ushort.TryParse(rankText, out ushort rank))
    {
        return rank;
    }

    return 0;
}
/// <summary>
/// Prepares the configuration for the setup: for form logins the login page is requested and, when it
/// contains a reCAPTCHA, a captcha item with its site key is added to the configuration.
/// </summary>
/// <returns>The configuration data, extended by a "Captcha" item when a reCAPTCHA was found.</returns>
public override async Task<ConfigurationData> GetConfigurationForSetup()
{
    var login = Definition.Login;

    // Only form logins have a landing page that may contain a captcha
    if (login == null || login.Method != "form")
        return configData;

    var loginUrl = SiteLink + login.Path;

    configData.CookieHeader.Value = null;
    landingResult = await RequestStringWithCookies(loginUrl, null, SiteLink);

    var htmlParser = new HtmlParser();
    landingResultDocument = htmlParser.Parse(landingResult.Content);

    var grecaptcha = landingResultDocument.QuerySelector(".g-recaptcha");
    if (grecaptcha != null)
    {
        // some sites don't store the sitekey in the .g-recaptcha div (e.g. cloudflare captcha challenge page),
        // so fall back to any element carrying it; the key stays null when the page has none at all
        var siteKey = grecaptcha.GetAttribute("data-sitekey")
            ?? landingResultDocument.QuerySelector("[data-sitekey]")?.GetAttribute("data-sitekey");

        var captchaItem = new RecaptchaItem();
        captchaItem.Name = "Captcha";
        captchaItem.Version = "2";
        captchaItem.SiteKey = siteKey;

        configData.AddDynamic("Captcha", captchaItem);
    }

    return configData;
}
/// <summary>
/// Parses the input as HTML, runs <paramref name="processElement"/> on the elements matched by the
/// query selector and yields the input document, a clone carrying the collected metadata and/or the
/// re-serialized HTML.
/// </summary>
/// <param name="input">The document to process.</param>
/// <param name="context">The execution context, used for parsing, content streams and logging.</param>
/// <param name="querySelector">The selector of the elements to process; blank yields the input unchanged.</param>
/// <param name="first">True to process only the first match, false to process all matches.</param>
/// <param name="processElement">The action run on every matched element; it may fill the metadata dictionary.</param>
/// <returns>The resulting document(s); the input itself when nothing matched or an exception was logged.</returns>
internal static async Task<IEnumerable<Common.IDocument>> ProcessElementsAsync(
    Common.IDocument input,
    IExecutionContext context,
    string querySelector,
    bool first,
    Action<Common.IDocument, IExecutionContext, IElement, Dictionary<string, object>> processElement)
{
    // Without parsed HTML there is nothing to query
    IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);
    if (htmlDocument == null)
    {
        return input.Yield();
    }

    try
    {
        if (string.IsNullOrWhiteSpace(querySelector))
        {
            return input.Yield();
        }

        // Either the single first match (which may be null) or all matches
        IElement[] matches;
        if (first)
        {
            matches = new[] { htmlDocument.QuerySelector(querySelector) };
        }
        else
        {
            matches = htmlDocument.QuerySelectorAll(querySelector).ToArray();
        }

        if (matches.Length == 0 || matches[0] == null)
        {
            return input.Yield();
        }

        // A clone taken before processing tells afterwards whether the document was edited
        INode snapshot = htmlDocument.Clone(true);
        Dictionary<string, object> metadata = new Dictionary<string, object>();
        foreach (IElement match in matches)
        {
            processElement(input, context, match, metadata);
        }

        bool hasMetadata = metadata.Count != 0;

        // Unedited: hand back the original document, or a clone of it carrying the new metadata
        if (htmlDocument.Equals(snapshot))
        {
            return hasMetadata ? input.Clone(metadata).Yield() : input.Yield();
        }

        // Edited: serialize the document into new content
        using (Stream contentStream = await context.GetContentStreamAsync())
        using (StreamWriter writer = contentStream.GetWriter())
        {
            htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
            writer.Flush();

            IContentProvider contentProvider = context.GetContentProvider(contentStream, MediaTypes.Html);
            return hasMetadata
                ? input.Clone(metadata, contentProvider).Yield()
                : input.Clone(contentProvider).Yield();
        }
    }
    catch (Exception ex)
    {
        context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
        return input.Yield();
    }
}