/// <summary>
/// Loads a Next product page and extracts its details into a <see cref="Product"/>.
/// </summary>
/// <remarks>
/// The page is loaded through <c>LoadWebPage</c> and the HTML reported by the driver is
/// parsed with <c>HtmlDocument</c>. Page elements that cannot be found leave the matching
/// product fields untouched, except for the description, the thumbnail link and the image
/// links, which fall back to placeholder strings. Load and parse timings are written to the
/// console, and a copy of the page source is written to <c>c:\temp\b.html</c> on a
/// best-effort basis.
/// </remarks>
/// <param name="url">Address of the product page; also stored in <c>Product.URL</c>.</param>
/// <returns>The product populated from the page.</returns>
public Product ParseNextProduct(string url)
{
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();
    this.LoadWebPage(url);
    stopwatch.Stop();
    Console.WriteLine("Time consumed in loading webpage = {0}", stopwatch.Elapsed);

    // Read the page source once so the dumped copy and the parsed copy are the same text.
    string pageSource = this.driver.PageSource;

    // Debug dump of the page. A missing or unwritable dump location must not abort parsing,
    // and the writer is disposed even when the write itself fails.
    try
    {
        using (StreamWriter sw = new StreamWriter("c:\\temp\\b.html"))
        {
            sw.Write(pageSource);
        }
    }
    catch (IOException ex)
    {
        Console.WriteLine("Could not write the page dump: {0}", ex.Message);
    }
    catch (UnauthorizedAccessException ex)
    {
        Console.WriteLine("Could not write the page dump: {0}", ex.Message);
    }

    // Prepare the product object.
    stopwatch.Restart();
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(pageSource);
    HtmlNode rootNode = doc.DocumentNode;

    Product product = new Product();
    product.URL = url;
    product.Brand = @"Next";

    // Parse the product name.
    HtmlNode currentNode = rootNode.SelectSingleNode(@"//div[@class=""Title""]");
    if (currentNode != null)
    {
        product.Title = currentNode.InnerText.Trim();
        product.TitleCN = Translator.Translate(product.Title);
    }

    // Parse the product id. The node is optional like every other element on the page.
    currentNode = rootNode.SelectSingleNode(@"//div[@class=""ItemNumber""]");
    if (currentNode != null)
    {
        product.ProductID = currentNode.InnerText;
    }

    // Parse the gender and the category, both derived from the joined breadcrumb trail.
    HtmlNodeCollection currentNodes = rootNode.SelectNodes(@"//li[contains(@class,""Breadcrumb"")]");
    if (currentNodes != null)
    {
        IEnumerable<string> breadCrumbs = currentNodes.Select(x => x.InnerText.Trim());
        string fullCrumbString = string.Join(@"\", breadCrumbs);
        product.Gender = ParseNextGenderString(fullCrumbString);
        product.Category = ParseNextCategoryString(fullCrumbString);
    }

    // Parse the description: prefer the 'ToneOfVoice' block, then any 'description' blocks.
    currentNode = rootNode.SelectSingleNode(@"//div[@id='ToneOfVoice']");
    if (currentNode != null)
    {
        product.Description = currentNode.InnerText.Trim();
    }
    else
    {
        currentNodes = rootNode.SelectNodes(@"//div[@class='description']");
        if (currentNodes != null)
        {
            product.Description = WebUtility.HtmlDecode(string.Join(@" ", currentNodes.Select(x => x.InnerText)));
        }
        else
        {
            // Placeholder text is reproduced exactly as before (including its spelling),
            // in case stored data or other code compares against it.
            product.Description = "Desciption";
        }
    }

    product.DescriptionCN = Translator.Translate(product.Description);

    // Parse the material.
    currentNode = rootNode.SelectSingleNode(@"//div[@id='Composition']");
    if (currentNode != null)
    {
        product.Material = currentNode.InnerText.Trim();
        product.MaterialCN = Translator.Translate(product.Material);
    }

    // Parse the price.
    currentNode = rootNode.SelectSingleNode(@"//div[contains(@class, 'SizeSelector')]");
    if (currentNode != null)
    {
        PriceInfo[] priceInfos = ParseNextPriceInfos(currentNode);

        // Parse the age info: minimum from the first size entry, maximum from the last.
        if (priceInfos.Length > 0)
        {
            Tuple<Single, Single> firstAge = ParseNextAgeInfo(priceInfos.First().Size);
            Tuple<Single, Single> lastAge = ParseNextAgeInfo(priceInfos.Last().Size);
            product.MinimumAge = firstAge.Item1;
            product.MaximumAge = lastAge.Item2;
        }

        product.SetPriceInfos(priceInfos);
    }

    // Parse the image links. The thumbnail is tracked in a local so that the "URL"
    // placeholder is applied only when no thumbnail was found on the page.
    IList<string> imgLinks = new List<string>();
    string thumbnailLink = null;
    HtmlNode imgLinksNode = rootNode.SelectSingleNode(@"//div[@class=""ThumbNailNavClip""]");
    HtmlNode listNode = imgLinksNode != null ? imgLinksNode.ChildNodes["ul"] : null;
    if (listNode != null)
    {
        foreach (HtmlNode node in listNode.ChildNodes)
        {
            // Children without an anchor (for example text nodes between list items)
            // carry no image link and are skipped.
            HtmlNode aNode = node.SelectSingleNode(@"a");
            if (aNode == null)
            {
                continue;
            }

            var relAttribute = aNode.Attributes["rel"];
            if (relAttribute != null)
            {
                imgLinks.Add(relAttribute.Value);
            }

            if (thumbnailLink == null)
            {
                // Use the first child of the anchor that actually has a 'src' attribute.
                HtmlNode thumbnailNode = aNode.ChildNodes.FirstOrDefault(child => child.Attributes["src"] != null);
                if (thumbnailNode != null)
                {
                    thumbnailLink = thumbnailNode.Attributes["src"].Value;
                }
            }
        }
    }

    product.ThumbnailLink = thumbnailLink ?? "URL";
    product.ImageLinks = imgLinks.Count > 0 ? string.Join(@";", imgLinks) : "URLs";

    // One timestamp for both fields so a freshly parsed product has identical times.
    DateTime now = DateTime.Now;
    product.InsertTime = now;
    product.UpdateTime = now;

    stopwatch.Stop();
    Console.WriteLine("Time consumed in parsing the content {0}", stopwatch.Elapsed);
    return product;
}
/// <summary>
/// Loads a Boden product page and extracts its details into a <see cref="Product"/>.
/// </summary>
/// <remarks>
/// The page is loaded through <c>LoadWebPage</c> and the HTML reported by the driver is
/// parsed with <c>HtmlDocument</c>. The product id is taken from the second-to-last
/// '/'-separated segment of <paramref name="url"/>. Page elements that cannot be found
/// leave the matching product fields untouched. Prices are obtained from
/// <c>ParseBodenPrices</c> using the driver and wait objects of this instance.
/// </remarks>
/// <param name="url">Address of the product page; also stored in <c>Product.URL</c>.</param>
/// <returns>The product populated from the page.</returns>
public Product ParseBodenProduct(string url)
{
    this.LoadWebPage(url);

    // Prepare the product object.
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(this.driver.PageSource);
    HtmlNode rootNode = doc.DocumentNode;

    Product product = new Product();
    product.URL = url;
    product.Brand = @"Boden";

    // Parse the product id from the url (second-to-last path segment).
    // A url with fewer than two segments has no such segment, so the id is left unset.
    string[] urlSegments = url.Split('/');
    if (urlSegments.Length >= 2)
    {
        product.ProductID = urlSegments[urlSegments.Length - 2];
    }

    // Parse the product name.
    HtmlNode currentNode = rootNode.SelectSingleNode(@"//h1[@class=""pdpProductTitle""]");
    if (currentNode != null)
    {
        string productTitle = currentNode.InnerText.Trim();
        product.Title = productTitle;
        product.TitleCN = Translator.Translate(productTitle);
    }

    // Parse the gender and the category from the breadcrumb list.
    HtmlNode breadNode = rootNode.SelectSingleNode(@"//div[@class=""breadcrumb""]");
    if (breadNode != null)
    {
        // SelectNodes yields null when nothing matches, so guard before indexing.
        HtmlNodeCollection categoryNodes = breadNode.SelectNodes(@".//li");
        int crumbCount = categoryNodes != null ? categoryNodes.Count : 0;

        // The gender is read from the second breadcrumb entry.
        if (crumbCount > 1)
        {
            product.Gender = ParseBodenGenderString(categoryNodes[1].InnerText);
        }

        // The category is read from the second-to-last breadcrumb entry.
        if (crumbCount > 2)
        {
            string categoryStr = categoryNodes[crumbCount - 2].InnerText.Trim();
            product.Category = ParseBodenCategoryString(categoryStr);
        }
    }

    // Parse the description and material: the first line of the panel text is the
    // description, the remaining lines are the material.
    currentNode = rootNode.SelectSingleNode(@"//div[@class=""tabContent pdpProductPnl a-slide""]");
    if (currentNode != null)
    {
        string productDescription = WebUtility.HtmlDecode(currentNode.InnerText.Trim());
        string[] desStrings = productDescription.Split('\n');
        product.Description = desStrings[0].Trim();
        product.DescriptionCN = Translator.Translate(product.Description);

        // Trim each remaining line (which also drops a trailing carriage return), skip
        // blank lines and join with single spaces so line breaks become spaces.
        string material = String.Join(
            " ",
            desStrings.Skip(1).Select(line => line.Trim()).Where(line => line.Length > 0));

        if (material.Length > 0)
        {
            product.Material = material;
            product.MaterialCN = Translator.Translate(material);
        }
        else
        {
            product.Material = @"";
            product.MaterialCN = @"";
        }
    }

    // Parse the image links; the first image doubles as the thumbnail.
    HtmlNode imgContainerNode = rootNode.SelectSingleNode(@"//div[@class=""imageryImagesContainer""]");
    if (imgContainerNode != null)
    {
        IList<string> imgLinks = new List<string>();
        HtmlNodeCollection imgNodes = imgContainerNode.SelectNodes(@".//img[@class=""cloudzoom""]");
        if (imgNodes != null)
        {
            foreach (HtmlNode imgNode in imgNodes)
            {
                var srcAttribute = imgNode.Attributes["src"];
                if (srcAttribute != null)
                {
                    imgLinks.Add(srcAttribute.Value);
                }
            }
        }

        if (imgLinks.Count > 0)
        {
            product.ImageLinks = string.Join(@";", imgLinks);
            product.ThumbnailLink = imgLinks[0];
        }
    }

    // One timestamp for both fields so a freshly parsed product has identical times.
    DateTime now = DateTime.Now;
    product.InsertTime = now;
    product.UpdateTime = now;

    PriceInfo[] priceInfos = ParseBodenPrices(this.driver, this.wait);
    product.SetPriceInfos(priceInfos);

    // Parse the age info: minimum from the first size entry, maximum from the last.
    if (priceInfos.Length > 0)
    {
        Tuple<Single, Single> firstAge = ParseBodenAgeInfo(priceInfos.First().Size);
        Tuple<Single, Single> lastAge = ParseBodenAgeInfo(priceInfos.Last().Size);
        product.MinimumAge = firstAge.Item1;
        product.MaximumAge = lastAge.Item2;
    }

    return product;
}