/// <summary>
/// Loads the page at <paramref name="str"/> and builds an HTML fragment listing the
/// description and price of each product item found on it.
/// </summary>
/// <param name="str">URL of the listing page to load.</param>
/// <returns>
/// For every product item, a "Description is: ..." line followed by a "Price is: ..."
/// line, each terminated with <c>&lt;br /&gt;</c>. When no product item is found, a fixed
/// placeholder message is returned instead.
/// </returns>
public static string ScrapePalmettoStateArmory(string str)
{
    const string itemXPath = "//li[contains(@class, 'item')]";
    const string descriptionXPath = ".//h2[contains(@class,'product-name')]//a";
    const string priceXPath = ".//span[@class='price']";

    var getHtmlWeb = new HtmlWeb();
    var doc = getHtmlWeb.Load(str);

    var output = new System.Text.StringBuilder();

    // SelectNodes yields null (not an empty collection) when nothing matches, so the
    // loop must be guarded for the "nothing found" fallback below to be reachable.
    var items = doc.DocumentNode.SelectNodes(itemXPath);
    if (items != null)
    {
        foreach (var item in items)
        {
            // The item XPath matches any <li> whose class merely contains "item";
            // entries without a product-name anchor are not products, so skip them
            // instead of dereferencing a null node.
            var descriptionNode = item.SelectSingleNode(descriptionXPath);
            if (descriptionNode == null)
            {
                continue;
            }

            string description = descriptionNode.OuterHtml;

            // A product without a price element gets a diagnostic message in place
            // of the price rather than aborting the whole scrape.
            var priceNode = item.SelectSingleNode(priceXPath);
            string price = priceNode != null
                ? priceNode.InnerText
                : "caught an error trying to locate price on " + description;

            output.Append("Description is: ").Append(description).Append("<br />");
            output.Append("Price is: ").Append(price).Append("<br />");
        }
    }

    if (output.Length == 0)
    {
        return "<br />baseline ScrapePalmettoStateArmory Method with return null";
    }

    return output.ToString();
}
/// <summary>
/// Loads the page at <paramref name="str"/> and builds an HTML fragment describing each
/// table row that links to a product whose anchor text contains "22".
/// </summary>
/// <param name="str">Absolute URL of the listing page to load.</param>
/// <returns>
/// For every matching row: the product anchor (with root-relative links made absolute
/// against the host of <paramref name="str"/>), the price, the number in stock, the
/// number of rounds and the response URI, each terminated with <c>&lt;br /&gt;</c>.
/// The row section is followed by the fixed "end rows" / "begin cols" / "end cols"
/// marker lines, which are emitted even when no row matched.
/// </returns>
public static string ScreenScrapeSGAmmo(string str)
{
    const string productPathFragment = "/product/";
    const string rowXPath = "//tr[td[a[contains(@href, '" + productPathFragment + "')]]]";
    const string descriptionXPath =
        ".//td//a[contains(@href,'" + productPathFragment + "') and contains(.,'22') and not (img)]";
    const string priceXPath = ".//td[@class='price-cell']//span";
    const string numInStockXPath = ".//td[3]";

    Uri baseURI = new Uri(str);

    var getHtmlWeb = new HtmlWeb();
    var doc = getHtmlWeb.Load(str);

    // Only root-relative links ("/product/...") are rewritten: the lookahead skips
    // already-absolute and protocol-relative ("//host/...") hrefs, which would be
    // corrupted by prefixing the host. Built once, outside the row loop.
    var rootRelativeHref = new Regex("href\\s*=\\s*\"(?=/(?!/))");
    // Scheme, host and (when non-default) port of the page being scraped.
    string hrefReplacement = "href=\"" + baseURI.GetLeftPart(UriPartial.Authority);

    var output = new System.Text.StringBuilder();

    // SelectNodes yields null (not an empty collection) when nothing matches.
    var rows = doc.DocumentNode.SelectNodes(rowXPath);
    if (rows != null)
    {
        foreach (var row in rows)
        {
            // The row XPath accepts every product row, but the description XPath
            // additionally requires "22" in the anchor text and no nested <img>;
            // rows that do not satisfy it are skipped rather than dereferenced.
            var descriptionNode = row.SelectSingleNode(descriptionXPath);
            if (descriptionNode == null)
            {
                continue;
            }

            string description = rootRelativeHref.Replace(descriptionNode.OuterHtml, hrefReplacement);

            var priceNode = row.SelectSingleNode(priceXPath);
            string price = priceNode != null ? priceNode.InnerText : string.Empty;

            var stockNode = row.SelectSingleNode(numInStockXPath);
            string numberInStock = stockNode != null ? stockNode.InnerText : string.Empty;

            // NOTE(review): this takes the first run of digits anywhere in the anchor
            // markup, including its href, so it relies on the round count being the
            // first number that appears there — confirm against real pages.
            string numberRounds = Regex.Match(description, @"\d+").Value;

            output.Append("<br />").Append("description is: ").Append(description).Append("<br />");
            output.Append("price is ").Append(price).Append("<br />");
            output.Append("number in stock is ").Append(numberInStock).Append("<br />");
            output.Append("number of rounds is ").Append(numberRounds).Append("<br />");
            output.Append("output uri is ").Append(getHtmlWeb.ResponseUri).Append("<br />");
        }
    }

    // Section markers retained so the output keeps the same framing as before.
    output.Append("****************** end rows ********* <br />");
    output.Append("****************** begin cols ********* <br />");
    output.Append("****************** end cols ********* <br />");

    return output.ToString();
}