/// <summary>
/// Downloads the csgostash.com front page and gathers the URL lists that are later used for parsing.
/// </summary>
/// <returns>
/// A <see cref="SiteData"/> whose case, knife, collection, souvenir, sticker and
/// tournament sticker URL properties have been filled in.
/// </returns>
private static async Task<SiteData> GetSiteData()
{
    string frontPageHtml = await HtmlFetcher.RetrieveFromUrl("https://csgostash.com/");

    var frontPage = new HtmlDocument();
    frontPage.LoadHtml(frontPageHtml);

    // Select the <li> elements whose class attribute is exactly "dropdown".
    IEnumerable<HtmlNode> dropdownItems = frontPage.DocumentNode.SelectNodes("//li[@class='dropdown']");

    // URL data taken from the dropdowns.
    var caseUrls = GetDropdownOption(dropdownItems, "newest cases", "/case/");
    var knifeUrls = GetDropdownOption(dropdownItems, "newest knives", "/weapon/");
    var collectionUrls = GetDropdownOption(dropdownItems, "newest collections", "/collection/");

    // URLs for souvenirs, stickers and tournament stickers; awaited one after another,
    // in the same order in which the results are assigned below.
    var souvenirUrls = await GetPaginationUrls(dropdownItems, "newest cases", "souvenir-package");
    var stickerUrls = await GetPaginationUrls(dropdownItems, "tournament stickers", "/stickers/regular");
    var tournamentStickerUrls = await GetPaginationUrls(dropdownItems, "tournament stickers", "/stickers/tournament");

    return new SiteData
    {
        CaseUrLs = caseUrls,
        KnifeUrLs = knifeUrls,
        CollectionUrLs = collectionUrls,
        SouvenirUrLs = souvenirUrls,
        StickerUrLs = stickerUrls,
        TournamentStickerUrLs = tournamentStickerUrls
    };
}
/// <summary>
/// Fetches <paramref name="url"/> and returns the largest page number shown in its pagination control.
/// </summary>
/// <param name="url">Address of the page to inspect.</param>
/// <returns>
/// The highest integer found in the text of the <c>li</c> elements inside the first
/// <c>ul</c> whose class is <c>pagination</c>; 1 when the page has no such list or
/// none of its entries is a number.
/// </returns>
private static async Task<int> GetPaginationPagesCount(string url)
{
    string page = await HtmlFetcher.RetrieveFromUrl(url);

    var doc = new HtmlDocument();
    doc.LoadHtml(page);

    // SelectSingleNode/SelectNodes return null (not an empty result) when nothing matches,
    // so a page without pagination must be handled explicitly instead of dereferenced.
    HtmlNode pagination = doc.DocumentNode.SelectSingleNode("//ul[@class='pagination']");
    HtmlNodeCollection entries = pagination?.SelectNodes(".//li");

    // A page without a pagination list is treated as a single page.
    int highestVal = 1;
    if (entries == null)
    {
        return highestVal;
    }

    // Non-numeric entries fail TryParse and are ignored.
    foreach (HtmlNode entry in entries)
    {
        if (int.TryParse(entry.InnerText, out int val) && val > highestVal)
        {
            highestVal = val;
        }
    }

    return highestVal;
}
/// <summary>
/// Downloads every listing page and turns each result box found on it into a <see cref="DataCollection"/>.
/// </summary>
/// <param name="souvenirUrLs">URLs of the listing pages, processed in order.</param>
/// <param name="nameXpath">XPath passed to <c>ExtractStringFromTags</c> to read the item name from a result box.</param>
/// <param name="collectionXpath">XPath passed to <c>ExtractStringFromTags</c> to read the item's collection from a result box.</param>
/// <returns>One entry per result box, in the order the boxes were encountered.</returns>
private static async Task<List<DataCollection>> ParseGridItems(IEnumerable<string> souvenirUrLs, string nameXpath, string collectionXpath)
{
    Logger.Log("Fetching/parsing Souvenirs...");
    var souvenirCollections = new List<DataCollection>();

    foreach (string souvenirUrL in souvenirUrLs)
    {
        string page = await HtmlFetcher.RetrieveFromUrl(souvenirUrL);
        var doc = new HtmlDocument();
        doc.LoadHtml(page);

        // SelectNodes returns null when the path matches nothing; such a page has no items.
        HtmlNodeCollection candidateDivs = doc.DocumentNode.SelectNodes("html/body/div/div/div/div");
        if (candidateDivs == null)
        {
            continue;
        }

        // Keep only the divs whose class contains "well result-box nomargin".
        // Divs without a class attribute are skipped instead of causing a NullReferenceException.
        List<string> resultBoxes = candidateDivs
            .Where(n => n.Attributes["class"]?.Value?.Contains("well result-box nomargin") == true)
            .Select(n => n.InnerHtml)
            .ToList();

        foreach (string resultBox in resultBoxes)
        {
            // Each box is re-parsed as its own document so the XPaths are evaluated against the box only.
            var boxDoc = new HtmlDocument();
            boxDoc.LoadHtml(resultBox);

            string name = ExtractStringFromTags(boxDoc, nameXpath);
            string itemCollection = ExtractStringFromTags(boxDoc, collectionXpath).Replace("\n", "");

            // NOTE(review): the icon is looked up in the whole page document rather than in this
            // result box, so every item from one page may receive the same icon. Confirm against
            // ExtractImgSrc before switching this to boxDoc.
            string iconUrL = ExtractImgSrc(doc, "img-responsive center-block");

            souvenirCollections.Add(new DataCollection
            {
                Name = name,
                CaseCollection = itemCollection,
                IconUrL = iconUrL
            });
        }
    }

    return souvenirCollections;
}
/// <summary>
/// Visits each knife overview page, collects the skin links found there, then visits every
/// skin page to record the collection labels shown on it.
/// </summary>
/// <param name="masterKnifeUrLs">Overview page URLs, one per knife; the text after the last '/' is used as the knife name.</param>
/// <returns>
/// A dictionary keyed by the skin page title (with " - CS:GO Stash" removed) whose value is the
/// inner HTML of every <c>p</c> element carrying the class <c>collection-text-label</c>.
/// When two pages share a title, the first one visited is kept.
/// </returns>
private static async Task<Dictionary<string, List<string>>> ParseKnives(IEnumerable<string> masterKnifeUrLs)
{
    // Materialize once so a lazily evaluated sequence is not enumerated repeatedly.
    List<string> masterUrls = masterKnifeUrLs.ToList();

    Logger.Log("Extracting knife names from URL...");

    // One name per URL, always index-aligned with masterUrls: everything after the last '/',
    // with '+' replaced by '-' (e.g. "Navaja+Knife" becomes "Navaja-Knife").
    // LastIndexOf returns -1 when there is no '/', in which case the whole URL is used.
    List<string> knifeNames = masterUrls
        .Select(u => u.Substring(u.LastIndexOf('/') + 1).Replace("+", "-"))
        .ToList();

    Logger.Log("Fetching/extracting individual knife URLs from HTML...");

    // Skin page links in discovery order; the set prevents fetching the same link twice.
    var knifeUrLs = new List<string>();
    var seenUrLs = new HashSet<string>();

    for (int pageIndex = 0; pageIndex < masterUrls.Count; pageIndex++)
    {
        string page = await HtmlFetcher.RetrieveFromUrl(masterUrls[pageIndex]);
        var doc = new HtmlDocument();
        doc.LoadHtml(page);

        // SelectNodes returns null when the page body contains no <a> elements.
        HtmlNodeCollection aTags = doc.DocumentNode.SelectNodes("/html/body//a");
        if (aTags == null)
        {
            continue;
        }

        string knifeName = knifeNames[pageIndex];
        foreach (HtmlNode aTag in aTags)
        {
            string hrefVal = aTag.Attributes["href"]?.Value;
            if (string.IsNullOrWhiteSpace(hrefVal))
            {
                continue;
            }

            // Keep only links that contain "http", point at a skin page and mention this knife.
            bool isSkinLink = hrefVal.Contains("http")
                              && hrefVal.Contains("/skin/")
                              && hrefVal.Contains(knifeName);
            if (isSkinLink && seenUrLs.Add(hrefVal))
            {
                knifeUrLs.Add(hrefVal);
            }
        }
    }

    // Follow the links gathered above to read the knife names and their collection labels.
    Logger.Log("Fetching/extracting individual knife HTML...");
    var knifeCaseData = new Dictionary<string, List<string>>();

    foreach (string knifeUrL in knifeUrLs)
    {
        string page = await HtmlFetcher.RetrieveFromUrl(knifeUrL);
        var doc = new HtmlDocument();
        doc.LoadHtml(page);

        // Collection labels: every <p> with class "collection-text-label" (none if the page has no <p>).
        HtmlNodeCollection paragraphs = doc.DocumentNode.SelectNodes("html/body//p");
        List<string> knifeCases = paragraphs == null
            ? new List<string>()
            : paragraphs.Where(n => n.HasClass("collection-text-label"))
                        .Select(n => n.InnerHtml)
                        .ToList();

        // The knife name is the page title without the site suffix; pages without a title are skipped.
        string knifeName = doc.DocumentNode.SelectSingleNode("html/head/title")?.InnerHtml;
        knifeName = knifeName?.Replace(" - CS:GO Stash", "");
        if (knifeName == null)
        {
            continue;
        }

        // Dictionary.Add throws on a duplicate key, so keep the first entry for a given title.
        if (!knifeCaseData.ContainsKey(knifeName))
        {
            knifeCaseData.Add(knifeName, knifeCases);
        }
    }

    return knifeCaseData;
}
/// <summary>
/// Downloads each case page and reads the case name, collection, icon and item names from it.
/// </summary>
/// <param name="caseUrLs">URLs of the pages to parse, processed in order.</param>
/// <returns>
/// A dictionary keyed by case name. When several pages yield the same name only the first is kept;
/// pages on which no case name can be found are logged and skipped.
/// </returns>
private static async Task<Dictionary<string, DataCollection>> ParseGridBlocks(IEnumerable<string> caseUrLs)
{
    var csgoData = new Dictionary<string, DataCollection>();

    foreach (string caseUrL in caseUrLs)
    {
        string page = await HtmlFetcher.RetrieveFromUrl(caseUrL);

        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(page);

        // Header block holding the case name (h1) and collection (h4).
        // SelectSingleNode returns null when the page has no such div.
        HtmlNode caseNodeData = htmlDoc.DocumentNode.SelectSingleNode(
            "//div[@class='inline-middle collapsed-top-margin']");
        if (caseNodeData == null)
        {
            Logger.Log($"Skipping {caseUrL}: case header block not found");
            continue;
        }

        string caseName = ExtractStringFromTags(caseNodeData, "h1");
        if (string.IsNullOrWhiteSpace(caseName))
        {
            // A null key would make the dictionary throw, and a blank one identifies nothing.
            Logger.Log($"Skipping {caseUrL}: case name not found");
            continue;
        }

        // First page wins: a name that was already recorded is not parsed again.
        if (csgoData.ContainsKey(caseName))
        {
            continue;
        }

        var caseData = new DataCollection
        {
            Name = caseName,
            CaseCollection = ExtractStringFromTags(caseNodeData, "h4"),
            IconUrL = ExtractImgSrc(htmlDoc, "img-responsive center-block content-header-img-margin")
        };

        // Item names are found line by line: split the raw page into lines and keep
        // only the lines that contain " | ".
        string[] lines = page.Split(new[] { "\r\n", "\r", "\n" }, StringSplitOptions.None);

        foreach (string line in lines.Where(l => l.Contains(" | ")))
        {
            // Parse the single line as an HTML fragment and read the <a> elements directly under a root <h3>.
            var lineDoc = new HtmlDocument();
            lineDoc.LoadHtml(line);

            HtmlNodeCollection anchors = lineDoc.DocumentNode.SelectNodes("/h3/a");
            if (anchors == null || anchors.Count == 0)
            {
                continue;
            }

            // Weapon name | skin name
            caseData.Items.Add(string.Join(" | ", anchors.Select(a => a.InnerHtml)));
        }

        csgoData.Add(caseName, caseData);
    }

    return csgoData;
}