Example #1
0
        /// <summary>
        /// Builds a <see cref="User"/> from the account settings page and the profile settings page of a user.
        /// </summary>
        /// <param name="accountSettingsPage">The account settings page; the inputs of its <c>#setting</c> element supply the user name and the email address.</param>
        /// <param name="profileSettingsPage">The profile settings page; it supplies the full name and the avatar image of the user.</param>
        /// <returns>Returns a new user populated with the parsed values.</returns>
        /// <exception cref="NineGagException">Thrown when either page cannot be parsed; the original exception is attached as the inner exception.</exception>
        public static User FromHtml(IHtmlDocument accountSettingsPage, IHtmlDocument profileSettingsPage)
        {
            User parsedUser = new User();

            // Account settings page: user name and email address live in named inputs of the settings form
            try
            {
                IElement settingsForm = accountSettingsPage.QuerySelector("#setting");

                IElement loginNameInput = settingsForm
                    .QuerySelectorAll("input")
                    .FirstOrDefault(candidate => candidate.GetAttribute("name") == "login_name");
                parsedUser.UserName = loginNameInput.GetAttribute("value");

                IElement emailInput = settingsForm
                    .QuerySelectorAll("input")
                    .FirstOrDefault(candidate => candidate.GetAttribute("name") == "email");
                parsedUser.EmailAddress = emailInput.GetAttribute("value");
            }
            catch (Exception exception)
            {
                throw new NineGagException("The user name and the email address could not be parsed. This could be an indicator, that the 9GAG website is down or its content has changed. If this problem keeps coming, then please report this problem to 9GAG or the maintainer of the library.", exception);
            }

            // Profile settings page: full name comes from a named input, the avatar from the image with a fixed ID
            try
            {
                IElement fullNameInput = profileSettingsPage
                    .QuerySelectorAll("input")
                    .FirstOrDefault(candidate => candidate.GetAttribute("name") == "fullName");
                parsedUser.FullName = fullNameInput.GetAttribute("value");

                string avatarSource = profileSettingsPage.QuerySelector("#jsid-profile-avatar").GetAttribute("src");
                parsedUser.AvatarUri = new Uri(avatarSource, UriKind.Absolute);
            }
            catch (Exception exception)
            {
                throw new NineGagException("The full name and the avatar image could not be parsed. This could be an indicator, that the 9GAG website is down or its content has changed. If this problem keeps coming, then please report this problem to 9GAG or the maintainer of the library.", exception);
            }

            return parsedUser;
        }
Example #2
0
        /// <summary>
        /// ページを解析してJSONを取得する
        /// </summary>
        /// <param name="source"></param>
        /// <returns></returns>
        private RemotePlaylistInfo AnalyzePage(string source)
        {
            var series = new RemotePlaylistInfo();

            IHtmlDocument?document            = HtmlParser.ParseDocument(source);
            IHtmlCollection <IElement>?videos = document?.QuerySelectorAll(".SeriesVideoListContainer-video");

            if (videos is null)
            {
                return(series);
            }

            IElement?ownerElm  = document?.QuerySelector(".SeriesAdditionalContainer-ownerName");
            string   ownerName = ownerElm?.InnerHtml ?? string.Empty;
            int      ownerID   = int.Parse((ownerElm?.GetAttribute("href")?.Split("/")[^ 1]) ?? "0");
        /// <summary>
        /// Parses the detail information of the post and adds the image found on the details page to the content of the post.
        /// </summary>
        /// <param name="htmlDocument">The HTML document, which contains the details page of the post.</param>
        /// <exception cref="NineGagException">If the image cannot be read from the details page, a <see cref="NineGagException"/> wrapping the original exception is thrown.</exception>
        protected override void ParseDetailInformation(IHtmlDocument htmlDocument)
        {
            // Let the base class parse its part first
            base.ParseDetailInformation(htmlDocument);

            try
            {
                var existingContent = this.Content;

                // The first image inside the article element is taken as the larger version of the image
                string imageSource = htmlDocument.QuerySelector("article img").GetAttribute("src");
                var largeImage = new Content
                {
                    Uri = new Uri(imageSource, UriKind.Absolute),
                    Kind = ContentKind.Jpeg
                };

                // Union keeps the existing entries and appends the new one unless an equal entry is already present
                this.Content = existingContent.Union(new List<Content> { largeImage }).ToList();
            }
            catch (Exception exception)
            {
                throw new NineGagException("The larger version of the content of the image post could not be retrieved. Maybe there is no internet connection available.", exception);
            }
        }
Example #4
0
        /// <summary>
        /// Asserts that the error summary contains a link with the expected message, that the link
        /// points at an element which exists in the document and, optionally, that this element
        /// carries the same error message.
        /// </summary>
        /// <param name="document">The document under test.</param>
        /// <param name="summaryInputName">Suffix of the summary link ID (<c>error-summary-{summaryInputName}</c>).</param>
        /// <param name="spanInputName">Input name passed on to <c>AssertErrorMessage</c>; pass <c>null</c> to skip that check.</param>
        /// <param name="expectedMessage">The exact text the error is expected to show.</param>
        public static void AssertErrorSummaryMessage(this IHtmlDocument document,
                                                     string summaryInputName,
                                                     string spanInputName,
                                                     string expectedMessage)
        {
            // The error must be listed in the error summary with the expected text
            var summaryLink = (IHtmlAnchorElement)document?.QuerySelector(EscapeQuerySelector($"a#error-summary-{summaryInputName}"));

            Assert.NotNull(summaryLink);
            Assert.Equal(expectedMessage, summaryLink.TextContent);

            // The fragment of the summary link must identify an element that exists in the document
            var hrefParts   = summaryLink.Href.Split("#");
            var targetId    = hrefParts.Last();
            var linkedBlock = document.QuerySelector(EscapeQuerySelector($"#{targetId}"));

            Assert.NotNull(linkedBlock);

            // Some errors relate to several fields at once and therefore have no single field message to check
            if (spanInputName == null)
            {
                return;
            }

            // The linked element must show the same message for the given input
            linkedBlock.AssertErrorMessage(spanInputName, expectedMessage);
        }
Example #5
0
 // todo: find a more concrete solution to this problem.
 /// <summary>
 /// Tells whether the page marks the profile as private, judged by the text of the
 /// <c>.masthead-permission-level-text</c> element.
 /// </summary>
 /// <param name="pageData">The parsed profile page.</param>
 /// <returns><c>true</c> when the element exists and its text is exactly "Private Profile"; otherwise <c>false</c>.</returns>
 private static bool IsPlayerProfilePrivate(IHtmlDocument pageData)
 {
     // A missing element yields null, which never equals the marker text
     string permissionText = pageData.QuerySelector(".masthead-permission-level-text")?.TextContent;

     return permissionText == "Private Profile";
 }
Example #6
0
 /// <summary>
 /// Reports whether the root node contains a <c>meta</c> element whose <c>property</c> is <c>og:url</c>.
 /// </summary>
 /// <returns><c>true</c> when such an element is found; otherwise <c>false</c>.</returns>
 public bool IsVideoAvailable()
 {
     var openGraphUrlTag = _root.QuerySelector("meta[property=\"og:url\"]");

     return openGraphUrlTag != null;
 }
Example #7
0
        /// <summary>
        /// Reads the publication date from the <c>content</c> attribute of the <c>datePublished</c>
        /// element inside <c>div.container</c> and stores it on the article.
        /// </summary>
        /// <param name="_article">The article that receives the parsed date.</param>
        /// <param name="reducedArticle">Not used by this implementation.</param>
        /// <param name="fullArticle">The full article page that is searched for the date.</param>
        /// <remarks>
        /// No null checks are performed: a missing container, element or attribute surfaces as an exception,
        /// as does a value that <see cref="DateTime.Parse(string)"/> rejects.
        /// </remarks>
        protected override void GetDateTime(Article _article, IElement reducedArticle, IHtmlDocument fullArticle)
        {
            var container     = fullArticle.QuerySelector("div.container");
            var publishedNode = container.QuerySelector("[itemprop = datePublished]");
            string rawDate    = publishedNode.Attributes["content"].Value;

            // NOTE(review): parsed with the current culture — confirm the attribute always holds a culture-neutral format
            _article.DateTime = DateTime.Parse(rawDate);
        }
Example #8
0
        /// <summary>
        /// Returns the URL of the competitive rank image shown on the page.
        /// </summary>
        /// <param name="doc">The parsed profile page.</param>
        /// <returns>
        /// The value of the <c>src</c> attribute of the first <c>div.competitive-rank img</c> element,
        /// or an empty string when the element or the attribute is missing.
        /// </returns>
        private static string CompetitiveRankImage(IHtmlDocument doc)
        {
            // Read the attribute itself rather than stripping markup from OuterHtml: the result then stays
            // correct when the tag carries further attributes and is not entity-encoded.
            return doc.QuerySelector("div.competitive-rank img")?.GetAttribute("src") ?? string.Empty;
        }
Example #9
0
 /// <summary>
 /// Reads the endorsement level from the text of <c>div.endorsement-level div.u-center</c>.
 /// </summary>
 /// <param name="doc">The parsed profile page.</param>
 /// <returns>The parsed level, or 0 when the element is missing or its text is not a valid <see cref="ushort"/>.</returns>
 private static ushort EndorsementLevel(IHtmlDocument doc)
 {
     string levelText = doc.QuerySelector("div.endorsement-level div.u-center")?.TextContent;

     return ushort.TryParse(levelText, out ushort level) ? level : (ushort)0;
 }
Example #10
0
 /// <summary>
 /// Looks for an element whose <c>itemtype</c> is <c>http://schema.org/Product</c>, stores the
 /// result in <c>cell</c> and reports whether one was found.
 /// </summary>
 /// <param name="dom">The parsed page to inspect.</param>
 /// <returns><c>true</c> when a matching element exists; otherwise <c>false</c> (and <c>cell</c> is set to <c>null</c>).</returns>
 public bool Check(IHtmlDocument dom)
 {
     var productScope = dom.QuerySelector("[itemtype=\"http://schema.org/Product\"]");

     // Keep the match (or null) so later calls can work on the same element
     cell = productScope;

     return productScope != null;
 }
Example #11
0
        /// <summary>
        /// Attempts to get metadata for the article.
        /// </summary>
        /// <param name="doc">The document</param>
        /// <param name="uri">The uri, possibly used to check for a date; ignored for that purpose when it is <c>null</c> or relative</param>
        /// <param name="language">The language that was possibly found in the headers of the response</param>
        /// <param name="jsonLD">The dictionary containing metadata found in JSON LD. Values found in meta tags are written into this same dictionary. May be <c>null</c>, which is treated as empty</param>
        /// <returns>The metadata object with all the info found</returns>
        internal static Metadata GetArticleMetadata(IHtmlDocument doc, Uri uri, string language, Dictionary<string, string> jsonLD)
        {
            Metadata metadata = new Metadata();

            // Meta tag values are merged into the JSON-LD dictionary; a missing dictionary means "no JSON-LD data"
            Dictionary<string, string> values = jsonLD ?? new Dictionary<string, string>();
            var metaElements = doc.GetElementsByTagName("meta");

            // Match "description", or Twitter's "twitter:description" (Cards)
            // in name attribute.
            // name is a single value
            var namePattern = @"^\s*((?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)|name)\s*$";

            // Match Facebook's Open Graph title & description properties.
            // property is a space-separated list of values
            var propertyPattern = @"\s*(dc|dcterm|og|twitter|article)\s*:\s*(author|creator|description|title|published_time|image|site_name)(\s+|$)";

            var itemPropPattern = @"\s*datePublished\s*";

            // Collect the interesting meta tags into the dictionary.
            NodeUtility.ForEachNode(metaElements, (element) =>
            {
                var meta = element as IElement;
                if (meta == null)
                {
                    return;
                }

                var elementName     = meta.GetAttribute("name") ?? "";
                var elementProperty = meta.GetAttribute("property") ?? "";
                var itemProp        = meta.GetAttribute("itemprop") ?? "";
                var content         = meta.GetAttribute("content");

                // a meta tag without content carries nothing to record
                if (string.IsNullOrEmpty(content))
                {
                    return;
                }

                MatchCollection matches = null;
                string name             = "";

                if (elementName == "author" || elementProperty == "author" || itemProp == "author")
                {
                    values["author"] = content;
                }

                if (!string.IsNullOrEmpty(elementProperty))
                {
                    matches = Regex.Matches(elementProperty, propertyPattern);
                    for (int i = matches.Count - 1; i >= 0; i--)
                    {
                        // Convert to lowercase, and remove any whitespace
                        // so we can match below. Invariant casing keeps the keys
                        // stable regardless of the current culture.
                        name = Regex.Replace(matches[i].Value.ToLowerInvariant(), @"\s+", "");

                        // multiple authors
                        values[name] = content.Trim();
                    }
                }

                if ((matches == null || matches.Count == 0) &&
                    !string.IsNullOrEmpty(elementName) && Regex.IsMatch(elementName, namePattern, RegexOptions.IgnoreCase))
                {
                    // Convert to lowercase, remove any whitespace, and convert dots
                    // to colons so we can match below.
                    name         = Regex.Replace(Regex.Replace(elementName.ToLowerInvariant(), @"\s+", ""), @"\.", ":");
                    values[name] = content.Trim();
                }
                else if (Regex.IsMatch(elementProperty, propertyPattern, RegexOptions.IgnoreCase))
                {
                    name = elementProperty;
                }
                else if (Regex.IsMatch(itemProp, itemPropPattern, RegexOptions.IgnoreCase))
                {
                    name = itemProp;
                }

                if (!string.IsNullOrEmpty(name))
                {
                    // Convert to lowercase and remove any whitespace
                    // so we can match below. An existing entry is never overwritten here.
                    name = Regex.Replace(name.ToLowerInvariant(), @"\s", "");
                    if (!values.ContainsKey(name))
                    {
                        values.Add(name, content.Trim());
                    }
                }
            });

            // Returns the first non-empty value stored under the given keys (in order), or "" when there is none
            string FirstNonEmpty(params string[] keys)
            {
                foreach (string key in keys)
                {
                    if (values.TryGetValue(key, out string found) && !string.IsNullOrEmpty(found))
                    {
                        return found;
                    }
                }

                return "";
            }

            // Returns the first value under the given keys (in order) that parses to a date other than DateTime.MinValue
            DateTime? FirstParsableDate(params string[] keys)
            {
                foreach (string key in keys)
                {
                    if (values.TryGetValue(key, out string raw) &&
                        DateTime.TryParse(raw, out DateTime parsed) &&
                        parsed != DateTime.MinValue)
                    {
                        return parsed;
                    }
                }

                return null;
            }

            // Find the description of the article
            metadata.Excerpt = FirstNonEmpty(
                "jsonld:description",
                "description",
                "dc:description",
                "dcterm:description",
                "og:description",
                "weibo:article:description",
                "weibo:webpage:description",
                "twitter:description");

            // Get the name of the site
            metadata.SiteName = FirstNonEmpty("jsonld:siteName", "og:site_name");

            // Find the title of the article
            metadata.Title = FirstNonEmpty(
                "jsonld:title",
                "dc:title",
                "dcterm:title",
                "og:title",
                "weibo:article:title",
                "weibo:webpage:title",
                "twitter:title",
                "title");

            // Let's try to eliminate the site name from the title
            metadata.Title = CleanTitle(metadata.Title, metadata.SiteName);

            // We did not find any title,
            // we try to get it from the title tag
            if (string.IsNullOrEmpty(metadata.Title))
            {
                metadata.Title = GetArticleTitle(doc);
            }

            // added language extraction; evaluated lazily so the document is only
            // queried when the earlier candidates are empty
            IEnumerable<string> LanguageHeuristics()
            {
                yield return language;

                yield return doc.GetElementsByTagName("html")[0].GetAttribute("lang");

                yield return doc.GetElementsByTagName("html")[0].GetAttribute("xml:lang");

                yield return doc.QuerySelector("meta[http-equiv=\"Content-Language\"]")?.GetAttribute("content");

                // this is wrong, but it's used
                yield return doc.QuerySelector("meta[name=\"lang\"]")?.GetAttribute("value");
            }

            metadata.Language = LanguageHeuristics().FirstOrDefault(l => !string.IsNullOrEmpty(l)) ?? "";

            // Find the featured image of the article
            metadata.FeaturedImage = FirstNonEmpty(
                "jsonld:image",
                "og:image",
                "twitter:image",
                "weibo:article:image",
                "weibo:webpage:image");

            // We try to find a meta tag for the author.
            // Note that there is an Open Graph tag for an author,
            // but it usually contains a profile URL of the author.
            // So we do not use it
            metadata.Author = FirstNonEmpty("jsonld:author", "dc:creator", "dcterm:creator", "author");

            // added date extraction
            metadata.PublicationDate = FirstParsableDate(
                "jsonld:datePublished",
                "article:published_time",
                "date",
                "datepublished",
                "weibo:article:create_at",
                "weibo:webpage:create_at");

            if (metadata.PublicationDate == null)
            {
                var times = doc.GetElementsByTagName("time");

                // the last <time pubDate datetime="..."> with a parsable value wins
                foreach (var time in times)
                {
                    if (!string.IsNullOrEmpty(time.GetAttribute("pubDate")) &&
                        DateTime.TryParse(time.GetAttribute("datetime"), out DateTime date))
                    {
                        metadata.PublicationDate = date;
                    }
                }
            }

            // PathAndQuery is only available on absolute URIs
            if (metadata.PublicationDate == null && uri != null && uri.IsAbsoluteUri)
            {
                // as a last resort check the URL for a date
                Match maybeDate = Regex.Match(uri.PathAndQuery, "/(?<year>[0-9]{4})/(?<month>[0-9]{2})/(?<day>[0-9]{2})?");
                if (maybeDate.Success)
                {
                    int year  = int.Parse(maybeDate.Groups["year"].Value);
                    int month = int.Parse(maybeDate.Groups["month"].Value);
                    int day   = !string.IsNullOrEmpty(maybeDate.Groups["day"].Value) ? int.Parse(maybeDate.Groups["day"].Value) : 1;

                    // The pattern also matches digit runs that are not dates (e.g. "/1234/56/78");
                    // only build a DateTime from values it can actually represent.
                    if (year >= 1 && month >= 1 && month <= 12 &&
                        day >= 1 && day <= DateTime.DaysInMonth(year, month))
                    {
                        metadata.PublicationDate = new DateTime(year, month, day);
                    }
                }
            }

            // in many sites the meta value is escaped with HTML entities,
            // so here we need to unescape it
            metadata.Title    = UnescapeHtmlEntities(metadata.Title);
            metadata.Excerpt  = UnescapeHtmlEntities(metadata.Excerpt);
            metadata.SiteName = UnescapeHtmlEntities(metadata.SiteName);

            return metadata;
        }
Example #12
0
        /// <summary>
        /// Creates one <typeparamref name="TModel"/> per element matched by the CSS selector of the model's
        /// <see cref="BindAttribute"/> and fills every property that carries its own <see cref="BindAttribute"/>.
        /// </summary>
        /// <typeparam name="TModel">The model type; must be decorated with a <see cref="BindAttribute"/> that has a CSS selector.</typeparam>
        /// <param name="htmlDocument">The document to query.</param>
        /// <param name="postBindAction">Optional callback invoked with each instance after its properties were bound.</param>
        /// <returns>The bound models, in document order of the matched elements.</returns>
        /// <exception cref="BindAttributeNotFoundException">The model type has no <see cref="BindAttribute"/>.</exception>
        /// <exception cref="CssSelectorNotFoundException">The model's <see cref="BindAttribute"/> has no CSS selector.</exception>
        private IEnumerable<TModel> BindModelWithHtmlDocument<TModel>(IHtmlDocument htmlDocument, Action<TModel> postBindAction = null)
            where TModel : class, new()
        {
            Type modelType = typeof(TModel);

            BindAttribute bindAttribute = modelType.GetCustomAttributes<BindAttribute>(false).FirstOrDefault();

            if (bindAttribute == null)
            {
                throw new BindAttributeNotFoundException($"BindAttribute not found for model {modelType.FullName}");
            }

            // TODO : @deniz this should be moved to a shared place.
            string cssSelector = bindAttribute.CssSelector;

            if (string.IsNullOrEmpty(cssSelector))
            {
                throw new CssSelectorNotFoundException($"CssSelector not found on the BindAttribute of model {modelType.FullName}");
            }

            // Substitute every known "{key}" placeholder; placeholders without a matching parameter are left untouched
            foreach (var parameter in CssSelectorParameters)
            {
                cssSelector = cssSelector.Replace($"{{{parameter.Key}}}", parameter.Value);
            }

            IHtmlCollection<IElement> matchedElements = htmlDocument.QuerySelectorAll(cssSelector);

            // The property list does not depend on the element, so it is read once
            PropertyInfo[] propertyInfos = modelType.GetProperties();

            List<TModel> models = new List<TModel>();

            foreach (IElement element in matchedElements)
            {
                TModel instance = new TModel();

                // TODO : @deniz what is set on each property, and how, should be decided by Func<>s: build a
                // property/Func<> pair from the property's attribute and execute those funcs.
                // For performance the property infos and funcs could be turned into cached expressions, or IL could be emitted.

                foreach (PropertyInfo propertyInfo in propertyInfos)
                {
                    var propertyBindAttribute = propertyInfo.GetCustomAttributes<BindAttribute>(false).FirstOrDefault();

                    if (propertyBindAttribute == null)
                    {
                        continue;
                    }

                    // No selector: bind from the matched element itself; otherwise search the whole
                    // document or only below the matched element, as the attribute requests
                    IElement selectedElement = string.IsNullOrEmpty(propertyBindAttribute.CssSelector)
                        ? element
                        : propertyBindAttribute.ApplySelectorToHtmlDocument
                            ? htmlDocument.QuerySelector(propertyBindAttribute.CssSelector)
                            : element.QuerySelector(propertyBindAttribute.CssSelector);

                    // Nothing matched for this property: leave it at its default value
                    if (selectedElement == null)
                    {
                        continue;
                    }

                    string elementValue;

                    if (!string.IsNullOrEmpty(propertyBindAttribute.AttributeName))
                    {
                        // GetAttribute yields null for a missing attribute instead of failing
                        elementValue = selectedElement.GetAttribute(propertyBindAttribute.AttributeName);
                    }
                    else
                    {
                        elementValue = propertyBindAttribute.ElementValueSelector == ElementValueSelector.InnerText
                            ? selectedElement.TextContent
                            : selectedElement.InnerHtml;
                    }

                    // TODO : @deniz type conversion may be needed.
                    propertyInfo.SetValue(instance, elementValue);
                }

                postBindAction?.Invoke(instance);

                // TODO : something like yield return could be used
                models.Add(instance);
            }

            return models;
        }
Example #13
0
        /// <summary>
        /// Rewrites the <c>src</c> of every image in <paramref name="doc"/> to a file inside
        /// <paramref name="outputDir"/>, fetches the images (HTTP download or local file copy), registers each
        /// fetched image in <paramref name="opfFile"/> and finally stores the serialized body of the document
        /// in <paramref name="chapter"/>.
        /// </summary>
        /// <param name="doc">The chapter document whose images are embedded; its <c>img</c> elements are modified.</param>
        /// <param name="opfFile">The package file that receives one item per fetched image.</param>
        /// <param name="chapter">The chapter whose <c>Content</c> is set from the modified document.</param>
        /// <param name="outputDir">The directory the image files are written to.</param>
        /// <returns>A task that completes when all images were processed.</returns>
        /// <remarks>
        /// Downloads are best effort: an image that cannot be downloaded, or that is neither JPEG nor PNG,
        /// is skipped without an error.
        /// </remarks>
        protected virtual async Task EmbedImagesAsync(IHtmlDocument doc, OpfFile opfFile, Chapter chapter, string outputDir)
        {
            // Source URI -> target path on disk; one entry per distinct image
            var images = new Dictionary<Uri, string>();
            string outputDirName = new DirectoryInfo(outputDir).Name;

            foreach (var img in doc.QuerySelectorAll("img"))
            {
                string src = img.GetAttribute("src");

                // An image without a source has nothing to embed
                if (string.IsNullOrEmpty(src))
                {
                    continue;
                }

                // Protocol-relative URL ("//host/path"): give it a scheme
                if (src.StartsWith("//", StringComparison.Ordinal))
                {
                    src = src.Substring(2);

                    if (!(src.StartsWith("http://", StringComparison.Ordinal) || src.StartsWith("https://", StringComparison.Ordinal)))
                    {
                        src = "http://" + src;
                    }
                }

                Uri uri;
                if (!Uri.TryCreate(src, UriKind.RelativeOrAbsolute, out uri))
                {
                    continue;
                }

                // Drop the query string. UriBuilder rejects relative URIs, so those are kept as they are.
                if (uri.IsAbsoluteUri)
                {
                    uri = new UriBuilder(uri) { Query = string.Empty }.Uri;
                }

                // Reuse the target of an image that was already seen, so that every <img> pointing
                // at the same source references the one file that is actually written.
                string path;
                if (!images.TryGetValue(uri, out path))
                {
                    // Path.GetExtension already includes the leading dot
                    string fileName = (Path.GetFileNameWithoutExtension(Path.GetRandomFileName()) + Path.GetExtension(uri.ToString())).ToValidFilePath();

                    // Skip only this image; the remaining ones still have to be processed
                    if (string.IsNullOrEmpty(fileName))
                    {
                        continue;
                    }

                    path = Path.Combine(outputDir, fileName);
                    images.Add(uri, path);
                }

                img.SetAttribute("src", Path.Combine(outputDirName, Path.GetFileName(path)).Replace(@"\", "/"));
            }

            // One client is shared by all downloads and disposed once they are finished
            using (HttpClient client = new HttpClient())
            {
                var tasks = new List<Task>();

                foreach (var image in images)
                {
                    tasks.Add(Task.Run(async () =>
                    {
                        Uri uri = image.Key;
                        string path = image.Value;
                        string outputPath = Path.Combine(outputDirName, Path.GetFileName(path)).Replace(@"\", "/");
                        string src = uri.ToString();

                        if (uri.IsAbsoluteUri && !uri.IsFile)
                        {
                            try
                            {
                                using (HttpResponseMessage resp = await client.GetAsync(src))
                                {
                                    resp.EnsureSuccessStatusCode();

                                    var contentType = resp.Content.Headers.ContentType;
                                    if (contentType == null || contentType.MediaType == null)
                                    {
                                        return;
                                    }

                                    string mediaType = contentType.MediaType.ToLowerInvariant();

                                    if (mediaType != MediaType.JpegType && mediaType != MediaType.PngType)
                                    {
                                        return;
                                    }

                                    if (File.Exists(path))
                                    {
                                        return;
                                    }

                                    using (FileStream fs = new FileStream(path, FileMode.CreateNew))
                                    {
                                        await resp.Content.CopyToAsync(fs);
                                    }
                                }
                            }
                            catch (Exception)
                            {
                                // Best effort: an image that cannot be downloaded is left out of the book
                                return;
                            }
                        }
                        else
                        {
                            // file: URIs carry the path in LocalPath; relative URIs are used as given
                            string localSource = uri.IsAbsoluteUri ? uri.LocalPath : src;

                            // Do not register an image that was never copied
                            if (!File.Exists(localSource))
                            {
                                return;
                            }

                            File.Copy(localSource, path);
                        }

                        MediaType mType = MediaType.FromExtension(Path.GetExtension(path));

                        if (mType == null)
                        {
                            return;
                        }

                        // NOTE(review): AddItem is called from several tasks at once — confirm OpfFile.AddItem is thread-safe
                        opfFile.AddItem(new OpfItem(outputPath, StringUtilities.GenerateRandomString(),
                            mType), false);
                    }));
                }

                await Task.WhenAll(tasks);
            }

            chapter.Content = doc.QuerySelector("body").ChildNodes.ToHtml(new XmlMarkupFormatter());
        }
Example #14
0
        /// <summary>
        /// Stores the <c>src</c> of the <c>img.article__main-image__image</c> element as the main image URL of the article.
        /// </summary>
        /// <param name="_article">The article that receives the URL.</param>
        /// <param name="reducedArticle">Not used by this implementation.</param>
        /// <param name="fullArticle">The full article page that is searched for the image.</param>
        /// <remarks>No null checks are performed: a missing image or <c>src</c> attribute surfaces as an exception.</remarks>
        protected override void GetUrlMainImg(Article _article, IElement reducedArticle, IHtmlDocument fullArticle)
        {
            var mainImage = fullArticle.QuerySelector("img.article__main-image__image");

            _article.UrlMainImg = mainImage.Attributes["src"].Value;
        }
Example #15
0
 /// <summary>
 /// Reports whether the root node contains a <c>meta</c> element whose <c>property</c> is <c>og:url</c>.
 /// </summary>
 /// <returns><c>true</c> when such an element is found; otherwise <c>false</c>.</returns>
 public bool ParseIsAvailable()
 {
     var openGraphUrlTag = _root.QuerySelector("meta[property=\"og:url\"]");

     return openGraphUrlTag != null;
 }
        /// <summary>
        /// Fills <c>fl</c> with the details found on a task page and returns it.
        /// </summary>
        /// <param name="document">The parsed task page.</param>
        /// <returns>
        /// The <c>fl</c> instance. It is returned unchanged when the first paragraph of the page contains the
        /// access prompt; otherwise description, deadline, prepayment, payment, publish date and employer are
        /// set, each to <c>null</c> when the page lacks the corresponding node.
        /// </returns>
        public FreelanceTask Parse(IHtmlDocument document)
        {
            var firstParagraph = document.QuerySelector("p");

            // NOTE(review): presumably this prompt replaces the task details for visitors without access — confirm
            if (firstParagraph != null && firstParagraph.InnerHtml.Contains("Получить доступ"))
            {
                return fl;
            }

            var descriptionParagraph = document.QuerySelector("p.href_me");
            fl.Description = descriptionParagraph != null ? descriptionParagraph.TextContent : null;

            // Each detail is a <dt> label; its value is read from the node that follows the label
            foreach (var dt in document.QuerySelectorAll("dt"))
            {
                string label = dt.InnerHtml;
                var valueNode = dt.NextSibling;
                string valueText = valueNode != null ? valueNode.TextContent : null;

                if (label.Contains("Срок выполнения:"))
                {
                    fl.Deadline = valueText;
                }
                if (label.Contains("Варианты оплаты:"))
                {
                    fl.Prepayment = valueText;
                }
                if (label.Contains("Способ оплаты:"))
                {
                    // The payment methods are the <li> items of the element after the label
                    List<string> payment = new List<string>();
                    var paymentContainer = dt.NextElementSibling;
                    if (paymentContainer != null)
                    {
                        foreach (var item in paymentContainer.QuerySelectorAll("li"))
                        {
                            payment.Add(item.TextContent);
                        }
                    }
                    fl.Payment = string.Join(", ", payment);
                }
                if (label.Contains("Дата публикации:"))
                {
                    fl.PublishDate = valueText;
                }
            }

            // The employer name is the title of the avatar link
            var employerLink = document.QuerySelector("div.avatar>a");
            fl.Employer = employerLink != null ? employerLink.GetAttribute("title") : null;

            return fl;
        }
Example #17
0
 /// <summary>
 /// Stores the text content of the <c>div.post__text</c> element as the body of the article.
 /// </summary>
 /// <param name="_article">The article that receives the body text.</param>
 /// <param name="fullArticle">The full article page that is searched for the text container.</param>
 /// <remarks>No null check is performed: a page without the element surfaces as an exception.</remarks>
 protected override void GetBody(Article _article, IHtmlDocument fullArticle)
 {
     var postText = fullArticle.QuerySelector("div.post__text");

     _article.Body = postText.TextContent;
 }
Example #18
0
        /// <summary>
        /// Builds a <see cref="Product"/> from the <c>itemprop</c>-annotated elements of a product page.
        /// </summary>
        /// <param name="productUrl">URL of the product page; stored on the product and used to prefix a root-relative image path.</param>
        /// <param name="dom">Parsed HTML document of the product page.</param>
        /// <returns>The product populated from the collected itemprop values.</returns>
        /// <exception cref="ParserException">
        /// Thrown when no <c>priceCurrency</c> element exists in the document, or when the name or price is missing.
        /// </exception>
        public Product Parse(string productUrl, IHtmlDocument dom)
        {
            // Extracting item properties.
            // NOTE(review): `cell` is not declared in this method; presumably a member declared elsewhere - confirm.
            var itemPropCells = cell?.QuerySelectorAll("[itemprop]");

            if (cell == null)
            {
                itemPropCells = dom.QuerySelectorAll("[itemprop]");
            }

            // Parse all itemprops to dictionary; the first occurrence of each key wins
            var productProps = new Dictionary<string, string>();

            foreach (var itemProperty in itemPropCells)
            {
                string key = itemProperty.GetAttribute("itemprop");
                if (productProps.ContainsKey(key))
                {
                    continue;
                }

                string value = itemProperty.TextContent;

                // Elements without text (e.g. <meta>) carry the value in the "content" attribute
                if (value == "")
                {
                    value = itemProperty.GetAttribute("content");
                }

                // Image url parse
                if (key == "image" && string.IsNullOrEmpty(value))
                {
                    value = itemProperty.GetAttribute("src");
                }

                if (value != null)
                {
                    value = value.RemoveExtraWS();
                }

                productProps.Add(key, value);
            }

            // TryGetValue leaves the out variable null when the key is absent, so a missing
            // property is handled the same way as a property without a value instead of
            // raising KeyNotFoundException.
            productProps.TryGetValue("priceCurrency", out string currency);
            productProps.TryGetValue("name", out string name);
            productProps.TryGetValue("price", out string price);
            productProps.TryGetValue("brand", out string brand);
            productProps.TryGetValue("description", out string description);

            if (currency == null)
            {
                // The currency may live outside the scanned cells, so look in the whole document
                var currencyCell = dom.QuerySelector("[itemprop=\"priceCurrency\"]");
                if (currencyCell == null)
                {
                    throw new ParserException("Can't parse product, price currency not found");
                }

                // Read the value with the same text-then-content precedence as the loop above
                string fallbackCurrency = currencyCell.TextContent;
                if (string.IsNullOrEmpty(fallbackCurrency))
                {
                    fallbackCurrency = currencyCell.GetAttribute("content");
                }

                currency = fallbackCurrency?.RemoveExtraWS();
            }

            if (name == null || price == null)
            {
                throw new ParserException("Can't parse product, name or price not found");
            }

            // Prefix root-relative image paths ("/img.png") with the site domain;
            // protocol-relative urls ("//host/img.png") are left untouched.
            productProps.TryGetValue("image", out string image);
            if (image != null && image.StartsWith('/') && !image.StartsWith("//", StringComparison.Ordinal))
            {
                image = productUrl.GetSecondDomain() + image;
            }

            Product product = new Product
            {
                Url         = productUrl,
                Name        = name,
                Brand       = brand,
                Currency    = currency,
                Price       = double.Parse(price, CultureInfo.InvariantCulture),
                IsOnSale    = false, // TO DO
                Description = description,
                Image       = image,
            };

            return product;
        }
Example #19
0
 /// <summary>
 /// Reads the <c>src</c> attribute of the first element with the
 /// <c>player-portrait</c> class in the document.
 /// </summary>
 /// <param name="doc">Parsed HTML document to search.</param>
 /// <returns>The value of the element's <c>src</c> attribute.</returns>
 private static string PortraitImage(IHtmlDocument doc)
 {
     var portrait = doc.QuerySelector(".player-portrait");
     return portrait.GetAttribute("src");
 }
Example #20
0
 /// <summary>
 /// Parses the text of the first <c>div.competitive-rank div</c> element as a number.
 /// </summary>
 /// <param name="doc">Parsed HTML document to search.</param>
 /// <returns>The parsed rank, or 0 when the element is missing or its text is not a valid <see cref="ushort"/>.</returns>
 private static ushort CompetitiveRank(IHtmlDocument doc)
 {
     string rankText = doc.QuerySelector("div.competitive-rank div")?.TextContent;

     // TryParse reports failure through its return value; fall back to 0 in that case
     if (ushort.TryParse(rankText, out ushort rank))
     {
         return rank;
     }

     return 0;
 }
Example #21
0
        /// <summary>
        /// Returns the configuration data for setup. For form-based logins the login page is
        /// requested and parsed first, and a reCAPTCHA item is added to the configuration when
        /// the page contains a <c>.g-recaptcha</c> element.
        /// </summary>
        /// <returns>The configuration data, possibly extended with a dynamic "Captcha" item.</returns>
        public override async Task<ConfigurationData> GetConfigurationForSetup()
        {
            var login = Definition.Login;

            // Only form-based logins need the landing page inspected
            if (login == null || login.Method != "form")
                return configData;

            var loginUrl = SiteLink + login.Path;

            // Clear the stored cookie header before requesting the login page
            configData.CookieHeader.Value = null;
            landingResult = await RequestStringWithCookies(loginUrl, null, SiteLink);

            var htmlParser = new HtmlParser();
            landingResultDocument = htmlParser.Parse(landingResult.Content);

            var grecaptcha = landingResultDocument.QuerySelector(".g-recaptcha");
            if (grecaptcha != null)
            {
                var captchaItem = new RecaptchaItem();
                captchaItem.Name = "Captcha";
                captchaItem.Version = "2";
                captchaItem.SiteKey = grecaptcha.GetAttribute("data-sitekey");
                if (captchaItem.SiteKey == null) // some sites don't store the sitekey in the .g-recaptcha div (e.g. cloudflare captcha challenge page)
                {
                    // QuerySelector yields null when nothing matches, so guard the dereference;
                    // the site key then simply stays null instead of raising NullReferenceException.
                    captchaItem.SiteKey = landingResultDocument.QuerySelector("[data-sitekey]")?.GetAttribute("data-sitekey");
                }

                configData.AddDynamic("Captcha", captchaItem);
            }

            return configData;
        }
        /// <summary>
        /// Parses the input document as HTML, runs <paramref name="processElement"/> on the elements
        /// matched by <paramref name="querySelector"/> and yields the resulting document.
        /// </summary>
        /// <param name="input">The document whose content is parsed.</param>
        /// <param name="context">The execution context used for parsing, logging and creating content.</param>
        /// <param name="querySelector">The selector to evaluate; when null or whitespace the input is returned as is.</param>
        /// <param name="first">When true only the first match is processed, otherwise all matches.</param>
        /// <param name="processElement">Callback invoked per matched element with a shared metadata dictionary.</param>
        /// <returns>
        /// The input itself when nothing could be parsed or matched, or when an exception occurred (a warning
        /// is logged); otherwise the input, or a clone of it carrying the collected metadata and/or the
        /// re-serialized HTML content.
        /// </returns>
        internal static async Task <IEnumerable <Common.IDocument> > ProcessElementsAsync(
            Common.IDocument input,
            IExecutionContext context,
            string querySelector,
            bool first,
            Action <Common.IDocument, IExecutionContext, IElement, Dictionary <string, object> > processElement)
        {
            // Without parsable HTML there is nothing to process
            IHtmlDocument htmlDocument = await input.ParseHtmlAsync(context, HtmlParser);
            if (htmlDocument == null)
            {
                return input.Yield();
            }

            try
            {
                if (string.IsNullOrWhiteSpace(querySelector))
                {
                    return input.Yield();
                }

                // Collect either the single first match or every match
                IElement[] matches;
                if (first)
                {
                    matches = new[] { htmlDocument.QuerySelector(querySelector) };
                }
                else
                {
                    matches = htmlDocument.QuerySelectorAll(querySelector).ToArray();
                }

                // A "first" query without a hit produces a single null entry
                if (matches.Length == 0 || matches[0] == null)
                {
                    return input.Yield();
                }

                // Keep a deep copy around so that edits made by the callback can be detected
                INode snapshot = htmlDocument.Clone(true);
                Dictionary<string, object> metadata = new Dictionary<string, object>();
                foreach (IElement match in matches)
                {
                    processElement(input, context, match, metadata);
                }

                bool hasMetadata = metadata.Count != 0;

                if (htmlDocument.Equals(snapshot))
                {
                    // No edits: hand back the original document, or a clone carrying the new metadata
                    if (hasMetadata)
                    {
                        return input.Clone(metadata).Yield();
                    }

                    return input.Yield();
                }

                // The elements were edited, so serialize the document into fresh content
                using (Stream contentStream = await context.GetContentStreamAsync())
                {
                    using (StreamWriter writer = contentStream.GetWriter())
                    {
                        htmlDocument.ToHtml(writer, ProcessingInstructionFormatter.Instance);
                        writer.Flush();
                        IContentProvider contentProvider = context.GetContentProvider(contentStream, MediaTypes.Html);
                        if (hasMetadata)
                        {
                            return input.Clone(metadata, contentProvider).Yield();
                        }

                        return input.Clone(contentProvider).Yield();
                    }
                }
            }
            catch (Exception ex)
            {
                context.LogWarning("Exception while processing HTML for {0}: {1}", input.ToSafeDisplayString(), ex.Message);
                return input.Yield();
            }
        }