/// <summary>
/// Downloads a single listing (pagination) page and collects the URL of every product node found on it.
/// </summary>
/// <param name="pagination_uri">Absolute URI of the listing page to download.</param>
/// <returns>
/// One entry per node matched by <c>//li[@class='productNumberValue']</c>, each built as
/// <c>SigmaAldrichConstants.SigmaAldrichMain + "/" + extracted link</c>. Empty when nothing matched.
/// </returns>
/// <remarks>
/// Blocks the calling thread before returning (see the sleep at the end of the method).
/// </remarks>
public static List<string> GetProductsUri(string pagination_uri)
{
    List<string> productsLinks = new List<string>();
    HtmlDocument dataPage = DataGetter.GetHtmlpage(new Uri(pagination_uri));

    // Every product on the listing page is looked up through this single XPath.
    List<string> elements = new List<string> { "//li[@class='productNumberValue']" };
    List<KeyValuePair<string, HtmlNodeCollection>> data = DataGetter.GetDataByXPATH(dataPage, elements);

    // NOTE(review): presumably the node collection is null when the XPath matches nothing
    // (HtmlAgilityPack's SelectNodes convention) - confirm against DataGetter.GetDataByXPATH.
    // Either way, an empty or missing result now yields an empty list instead of an exception.
    HtmlNodeCollection productsNodes = (data != null && data.Count > 0) ? data[0].Value : null;
    if (productsNodes != null)
    {
        foreach (HtmlNode productNode in productsNodes)
        {
            string extractedLink = ExtractLinkFromHtml(productNode, "href=", "\">");
            productsLinks.Add(SigmaAldrichConstants.SigmaAldrichMain + "/" + extractedLink);
        }
    }

    // The (int) cast binds tighter than the multiplication, so the random value is truncated
    // to whole seconds first and only then converted to milliseconds.
    System.Threading.Thread.Sleep((int)DataGetter.GetRandomNumber(5.0, 15.0) * 1000);
    return productsLinks;
}
/// <summary>
/// Downloads a product page and fills a <c>Product</c> with its description, properties and name.
/// </summary>
/// <param name="product_uri">Absolute URI of the product page.</param>
/// <returns>
/// A new <c>Product</c> whose <c>Description</c>, <c>Properties</c> and <c>Name</c> were set from the page.
/// </returns>
public static Product GetProduct(string product_uri)
{
    Product product = new Product();

    // Header titles handed to SigmaAldrichParser.ParseDescription together with the matched nodes.
    List<string> sectionHeaders = new List<string>
    {
        "Components",
        "Application",
        "Features and Benefits",
        "General description",
        "Packaging",
        "Reconstitution",
        "Other Notes",
        "Legal Information",
        "Caution",
        "Biochem/physiol Actions",
        "Preparation Note"
    };
    List<string> descriptionXPaths = new List<string> { "//div[@class='descriptionContent']" };

    // The page is downloaded once and reused for all three lookups below.
    HtmlDocument productPage = DataGetter.GetHtmlpage(new Uri(product_uri));

    List<KeyValuePair<string, HtmlNodeCollection>> descriptionNodes =
        DataGetter.GetDataByXPATH(productPage, descriptionXPaths);
    product.Description = SigmaAldrichParser.ParseDescription(descriptionNodes, sectionHeaders);
    product.Properties = SigmaAldrichParser.ParseDetailProperties(productPage.DocumentNode);

    List<string> nameXPaths = new List<string> { "//p[@class='product-name']" };
    product.Name = GetProductName(productPage, nameXPaths);

    return product;
}
/// <summary>
/// Returns the inner text of the first node matched by a single XPath expression.
/// </summary>
/// <param name="page">Parsed page to query.</param>
/// <param name="element">XPath expression to evaluate.</param>
/// <returns>The <c>InnerText</c> of the first matched node, unmodified.</returns>
private static string GetProductName(HtmlDocument page, string element)
{
    // DataGetter.GetDataByXPATH takes a list of expressions, so wrap the single one.
    List<string> xpaths = new List<string> { element };

    // No guards here: the first result entry and its first node are indexed directly.
    HtmlNodeCollection matches = DataGetter.GetDataByXPATH(page, xpaths)[0].Value;
    return matches[0].InnerText;
}
/// <summary>
/// Returns the product name taken from the first <c>p</c> node in the first XPath result,
/// with tabs and newlines removed and surrounding whitespace trimmed.
/// </summary>
/// <param name="webpage">Parsed page to query.</param>
/// <param name="element_names">XPath expressions passed to <c>DataGetter.GetDataByXPATH</c>.</param>
/// <returns>
/// The cleaned name, or an empty string when there is no result, no node collection,
/// or no <c>p</c> node in it.
/// </returns>
private static string GetProductName(HtmlDocument webpage, List<string> element_names)
{
    List<KeyValuePair<string, HtmlNodeCollection>> dataName =
        DataGetter.GetDataByXPATH(webpage, element_names);

    // NOTE(review): presumably Value is null when the XPath matches nothing
    // (HtmlAgilityPack's SelectNodes convention) - confirm against DataGetter.GetDataByXPATH.
    if (dataName == null || dataName.Count == 0 || dataName[0].Value == null)
    {
        return "";
    }

    // FirstOrDefault rather than First: a page without a <p> match yields "" like the
    // other "nothing found" paths, instead of throwing InvalidOperationException.
    HtmlNode nameNode = dataName[0].Value.FirstOrDefault(x => x.Name == "p");
    if (nameNode == null)
    {
        return "";
    }

    return nameNode.InnerText.Replace("\t", "").Replace("\n", "").Trim();
}
/// <summary>
/// Collects the product URLs from every listing page in <paramref name="pagination_uri"/>.
/// </summary>
/// <param name="pagination_uri">Absolute URIs of the listing pages, processed in order.</param>
/// <param name="num_pages_to_get">
/// Currently not read by this method; kept so existing callers continue to compile.
/// </param>
/// <returns>All product URLs found, in page order.</returns>
/// <remarks>
/// Each page is handled by <c>GetProductsUri</c>, which downloads the page, extracts the links
/// and sleeps before returning, so this call blocks once per page.
/// </remarks>
public static List<string> GetAllProductsUri(List<string> pagination_uri, int num_pages_to_get)
{
    List<string> productsLinks = new List<string>();

    // The per-page work is identical to GetProductsUri, so delegate instead of duplicating it.
    foreach (string pageUri in pagination_uri)
    {
        productsLinks.AddRange(GetProductsUri(pageUri));
    }

    return productsLinks;
}
/// <summary>
/// Extracts a description string from the outer HTML of the first node matched by an XPath expression.
/// </summary>
/// <param name="page">Parsed page to query.</param>
/// <param name="element">XPath expression to evaluate.</param>
/// <returns>
/// The third <c>'='</c>-separated segment of the node's outer HTML (newlines removed),
/// with every <c>'&gt;'</c> stripped and surrounding whitespace trimmed.
/// </returns>
private static string GetDescription(HtmlDocument page, string element)
{
    List<string> xpaths = new List<string> { element };
    HtmlNode firstMatch = DataGetter.GetDataByXPATH(page, xpaths)[0].Value[0];

    // NOTE(review): segment [2] is presumably the value of the element's second attribute - confirm
    // against the real markup. Fewer than three segments makes the index below throw.
    string[] segments = firstMatch.OuterHtml.Replace("\n", "").Split('=');
    string rawDescription = segments[2];

    return rawDescription.Replace(">", "").Trim();
}
/// <summary>
/// Builds a <c>specifications</c> object from the first three nodes matched by an XPath expression.
/// </summary>
/// <param name="page">Parsed page to query.</param>
/// <param name="element">XPath expression to evaluate.</param>
/// <returns>
/// A <c>specifications</c> whose <c>form</c>, <c>storage_conditions</c> and <c>quality_control</c>
/// hold the inner text of matched nodes 0, 1 and 2 respectively.
/// </returns>
private static specifications GetSpecifications(HtmlDocument page, string element)
{
    specifications result = new specifications();

    List<string> xpaths = new List<string> { element };
    HtmlNodeCollection matches = DataGetter.GetDataByXPATH(page, xpaths)[0].Value;

    // The mapping is purely positional; all three nodes are fetched before any field is written.
    HtmlNode formNode = matches[0];
    HtmlNode storageNode = matches[1];
    HtmlNode qualityNode = matches[2];

    result.form = formNode.InnerText;
    result.storage_conditions = storageNode.InnerText;
    result.quality_control = qualityNode.InnerText;

    return result;
}
/// <summary>
/// Resolves a category to its listing pages, collects the product URLs on them and parses the
/// description of each product.
/// </summary>
/// <param name="category">Category passed to <c>GetCategoryUri</c>.</param>
/// <param name="num_products_to_get">
/// Maximum number of product descriptions to fetch; <c>0</c> means every product URL that was found.
/// Values larger than the number of URLs found are capped to that number.
/// </param>
/// <param name="pages_to_get">Passed through to <c>GetCategoryPaginationUrls</c>.</param>
/// <returns>One parsed <c>Description</c> per fetched product, in URL order.</returns>
/// <remarks>
/// Blocks the calling thread after every product request (see the sleep inside the loop).
/// </remarks>
public static List<Description> GetAllProductsForCategory(string category, int num_products_to_get, int pages_to_get)
{
    string categoryUri = GetCategoryUri(category);
    List<string> paginationUri = GetCategoryPaginationUrls(categoryUri, pages_to_get);
    List<string> productsUris = GetAllProductsUri(paginationUri, num_products_to_get);

    // Header titles handed to SigmaAldrichParser.ParseDescription together with the matched nodes.
    List<string> headers = new List<string>
    {
        "Components",
        "Application",
        "Features and Benefits",
        "General description",
        "Packaging",
        "Reconstitution",
        "Other Notes",
        "Legal Information",
        "Caution",
        "Biochem/physiol Actions",
        "Preparation Note"
    };
    List<string> elements = new List<string> { "//div[@class='descriptionContent']" };

    // Effective limit: 0 means "all", and a request can never exceed what was actually found.
    // The loop previously used the raw argument, so 0 fetched nothing and an oversized
    // value indexed past the end of productsUris.
    int numProductsToGet = num_products_to_get;
    if (numProductsToGet == 0 || numProductsToGet > productsUris.Count)
    {
        numProductsToGet = productsUris.Count;
    }

    List<Description> products = new List<Description>();
    for (int i = 0; i < numProductsToGet; i++)
    {
        List<KeyValuePair<string, HtmlNodeCollection>> dataDescription =
            DataGetter.GetDataByXPATH(new Uri(productsUris[i]), elements);
        products.Add(SigmaAldrichParser.ParseDescription(dataDescription, headers));

        // The (int) cast binds tighter than the multiplication: whole seconds, then milliseconds.
        System.Threading.Thread.Sleep((int)DataGetter.GetRandomNumber(5.0, 15.0) * 1000);
    }

    return products;
}
/// <summary>
/// Returns the inner HTML of every node matched by an XPath expression whose inner HTML
/// contains an anchor opening (<c>&lt;a href=</c>).
/// </summary>
/// <param name="page">Parsed page to query.</param>
/// <param name="element">XPath expression to evaluate.</param>
/// <returns>
/// The raw, untrimmed inner HTML of each qualifying node, in document order.
/// Empty when nothing matched.
/// </returns>
private static List<string> GetProductUris(HtmlDocument page, string element)
{
    List<string> productCategoryUris = new List<string>();

    List<string> elements = new List<string> { element };
    List<KeyValuePair<string, HtmlNodeCollection>> htmlProductUris = DataGetter.GetDataByXPATH(page, elements);

    // NOTE(review): presumably Value is null when the XPath matches nothing
    // (HtmlAgilityPack's SelectNodes convention) - confirm against DataGetter.GetDataByXPATH.
    if (htmlProductUris == null || htmlProductUris.Count == 0 || htmlProductUris[0].Value == null)
    {
        return productCategoryUris;
    }

    foreach (HtmlNode candidate in htmlProductUris[0].Value)
    {
        // Read InnerHtml once per node. The previous code also built a trimmed copy that was
        // never used; the value stored here stays untrimmed, exactly as before.
        string innerHtml = candidate.InnerHtml;
        if (innerHtml.Contains("<a href="))
        {
            productCategoryUris.Add(innerHtml);
        }
    }

    return productCategoryUris;
}