/// <summary>
/// Extracts all product URLs listed on a single pagination (search-results) page.
/// </summary>
/// <param name="pagination_uri">Absolute URL of one search-results page.</param>
/// <returns>
/// Absolute product URLs, each built from <c>SigmaAldrichConstants.SigmaAldrichMain</c>
/// plus the relative link extracted from the page.
/// </returns>
public static List<string> GetProductsUri(string pagination_uri)
{
    List<string> productsLinks = new List<string>();

    HtmlDocument dataPage = DataGetter.GetHtmlpage(new Uri(pagination_uri));

    // Each product entry is an <li class='productNumberValue'> containing an anchor.
    List<string> elements = new List<string>();
    elements.Add("//li[@class='productNumberValue']");
    List<KeyValuePair<string, HtmlNodeCollection>> data = DataGetter.GetDataByXPATH(dataPage, elements);
    HtmlNodeCollection productsNodes = data[0].Value;

    for (int j = 0; j < productsNodes.Count; j++)
    {
        // Pull the relative href out of the node's markup and prepend the site root.
        string extractedLink = ExtractLinkFromHtml(productsNodes[j], "href=", "\">");
        productsLinks.Add(SigmaAldrichConstants.SigmaAldrichMain + "/" + extractedLink);
    }

    // Scrape throttling: pause a random 5-15 seconds so repeated page fetches
    // by the caller do not hammer the server.
    System.Threading.Thread.Sleep((int)DataGetter.GetRandomNumber(5.0, 15.0) * 1000);

    return productsLinks;
}
/// <summary>
/// Downloads a single product page and parses it into a <c>Product</c>
/// (description sections, detail properties, and product name).
/// </summary>
/// <param name="product_uri">Absolute URL of the product page.</param>
/// <returns>A populated <c>Product</c> instance.</returns>
public static Product GetProduct(string product_uri)
{
    Product p = new Product();

    // Section headers expected inside the description block; the parser
    // splits the description text on these headings.
    List<string> headers = new List<string>
    {
        "Components",
        "Application",
        "Features and Benefits",
        "General description",
        "Packaging",
        "Reconstitution",
        "Other Notes",
        "Legal Information",
        "Caution",
        "Biochem/physiol Actions",
        "Preparation Note",
    };

    List<string> elements = new List<string> { "//div[@class='descriptionContent']" };

    HtmlDocument dataPage = DataGetter.GetHtmlpage(new Uri(product_uri));
    List<KeyValuePair<string, HtmlNodeCollection>> dataDescription = DataGetter.GetDataByXPATH(dataPage, elements);

    p.Description = SigmaAldrichParser.ParseDescription(dataDescription, headers);
    p.Properties = SigmaAldrichParser.ParseDetailProperties(dataPage.DocumentNode);

    List<string> elements1 = new List<string> { "//p[@class='product-name']" };
    p.Name = GetProductName(dataPage, elements1);

    return p;
}
//string ProductName = GetProductName(uri, elements);
/// <summary>
/// Fetches the page at <paramref name="uri"/> and extracts specifications and
/// product URIs from it. Most of the original extraction steps are commented out.
/// </summary>
/// <param name="uri">Absolute URL of the page to parse.</param>
public static void Parse(Uri uri)
{
    HtmlDocument entirePage = DataGetter.GetHtmlpage(uri);
    //string productName = GetProductName(entirePage, "//h2[@class='product-name']");
    //string description = GetDescription(entirePage, "//meta[@name='description']");
    //specifications
    // NOTE(review): 'Specifications' is not declared here — presumably a field of the
    // enclosing class that this call populates; verify its declaration elsewhere in the file.
    Specifications = GetSpecifications(entirePage, "//td[@class='data']");
    // NOTE(review): productUris is computed but never used or returned — looks like
    // either dead code or an unfinished step; confirm intent before removing.
    List <string> productUris = GetProductUris(entirePage, "//li");
}
/// <summary>
/// Extracts product URLs from every pagination page in <paramref name="pagination_uri"/>.
/// </summary>
/// <param name="pagination_uri">Absolute URLs of the search-results pages to scrape.</param>
/// <param name="num_pages_to_get">
/// Currently unused; retained for interface compatibility. The commented-out logic
/// that once capped the number of products was disabled, so all pages are processed.
/// </param>
/// <returns>Absolute product URLs aggregated across all pages, in page order.</returns>
public static List<string> GetAllProductsUri(List<string> pagination_uri, int num_pages_to_get)
{
    List<string> productsLinks = new List<string>();

    // Delegate the per-page work (XPath query on 'productNumberValue' items,
    // link extraction, and the random 5-15 second anti-scraping delay) to
    // GetProductsUri, which performs the identical steps for a single page.
    for (int i = 0; i < pagination_uri.Count; i++)
    {
        productsLinks.AddRange(GetProductsUri(pagination_uri[i]));
    }

    return productsLinks;
}
/// <summary>
/// Builds the list of pagination URLs for a category by inferring the URL pattern
/// from the category page's "page 2" link and the last-page link.
/// </summary>
/// <param name="category_url">Absolute URL of the category (first results) page.</param>
/// <param name="pages_to_get">Number of pages to generate links for; 0 means all pages
/// up to the last page number found in the pagination widget.</param>
/// <returns>Absolute pagination URLs for pages 1..N.</returns>
public static List <string> GetCategoryPaginationUrls(string category_url, int pages_to_get)
{
    int numPagesToGet = pages_to_get;
    HtmlDocument doc = DataGetter.GetHtmlpage(new Uri(category_url));

    //infer page name from 2nd page link
    // Locate the pagination container <div id='searchResultsPagination'>.
    List <string> liElements = new List <string>();
    liElements.Add("searchResultsPagination");
    List <KeyValuePair <string, HtmlNode> > paginationElements = DataGetter.GetDataByID(doc, "div", liElements);

    // The <li id='pg2'> element holds the link to page 2, which carries the
    // full URL template including the "page=" query parameter.
    List <string> liElements1 = new List <string>();
    liElements1.Add("pg2");
    List <KeyValuePair <string, HtmlNode> > page2 = DataGetter.GetDataByID(doc, "li", liElements1);
    string page2Link = ExtractLinkFromHtml(page2[0].Value, "href", "'>");

    // Walk into the last pagination container's <ul>; the second-to-last child
    // is assumed to be the "last page" item — TODO confirm against live markup.
    HtmlNodeCollection divNodeChildren = paginationElements[paginationElements.Count - 1].Value.ChildNodes;
    HtmlNode divNodeChildren1 = divNodeChildren.First(x => x.Name == "ul");
    HtmlNode lastPage = divNodeChildren1.ChildNodes[divNodeChildren1.ChildNodes.Count - 2];
    string lastPageLink = ExtractLinkFromHtml(lastPage, "href", "'>");
    int lastPageNumber = ExtractNumberFromString(lastPageLink, "page=");

    //Now build the pagination links...
    // Split the page-2 link around "page=": pageParts[0] is everything before the
    // page number, pageParts[1] is the number plus the URL tail.
    string[] pageParts = page2Link.Split(new string[] { "page=" }, StringSplitOptions.None);
    //pageParts[0] = pageParts[0].Substring(1);
    // Strip the leading page number so pageParts[1] is only the URL tail.
    pageParts[1] = RemoveNumberFromStartOfString(pageParts[1]);
    //pageParts[1] = pageParts[1].Substring(0, pageParts[1].Length - 1);

    List <string> paginationLinks = new List <string>();
    if (numPagesToGet == 0)
    {
        //get all products
        numPagesToGet = lastPageNumber;
    }
    // Reassemble: root + prefix + "page=<n>" + tail, for pages 1..numPagesToGet.
    for (int i = 0; i < numPagesToGet; i++)
    {
        paginationLinks.Add(SigmaAldrichConstants.SigmaAldrichMain + pageParts[0] + "page=" + (i + 1) + pageParts[1]);
    }
    return(paginationLinks);
}