예제 #1
0
        /// <summary>
        /// Loads a Next product page in the browser driver and extracts its details
        /// into a new <see cref="Product"/>.
        /// </summary>
        /// <remarks>
        /// The page source is also dumped to <c>c:\temp\b.html</c>, and the time spent
        /// loading and parsing is written to the console. Fields whose markup cannot be
        /// found are left at their default value, except the description, thumbnail link
        /// and image links, which fall back to fixed placeholder strings.
        /// </remarks>
        /// <param name="url">Address of the product page to load and parse.</param>
        /// <returns>The product populated from the page.</returns>
        public Product ParseNextProduct(string url)
        {
            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();
            this.LoadWebPage(url);
            stopwatch.Stop();
            Console.WriteLine("Time consumed in loading webpage = {0}", stopwatch.Elapsed);

            // Read the page source once so the dump and the parsed document are the
            // same snapshot of the page.
            string pageSource = this.driver.PageSource;

            // Dump the raw page; the using block closes the file even if the write fails.
            using (StreamWriter sw = new StreamWriter("c:\\temp\\b.html"))
            {
                sw.Write(pageSource);
            }

            // Prepare the product object.
            stopwatch.Restart();
            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(pageSource);
            Product product = new Product();

            product.URL = url;
            HtmlNode rootNode = doc.DocumentNode;

            product.Brand = @"Next";

            // Parse the product name
            HtmlNode currentNode = rootNode.SelectSingleNode(@"//div[@class=""Title""]");

            if (currentNode != null)
            {
                product.Title   = currentNode.InnerText.Trim();
                product.TitleCN = Translator.Translate(product.Title);
            }

            // Parse the product id; like the other fields it is skipped when the
            // markup is missing instead of failing the whole parse.
            currentNode = rootNode.SelectSingleNode(@"//div[@class=""ItemNumber""]");
            if (currentNode != null)
            {
                product.ProductID = currentNode.InnerText;
            }

            // Parse the gender.
            HtmlNodeCollection currentNodes = rootNode.SelectNodes(@"//li[contains(@class,""Breadcrumb"")]");

            if (currentNodes != null)
            {
                IEnumerable <string> breadCrumbs = currentNodes.Select(x => x.InnerText.Trim());
                string fullCrumbString           = string.Join(@"\", breadCrumbs);
                product.Gender = ParseNextGenderString(fullCrumbString);

                // Parse the category
                product.Category = ParseNextCategoryString(fullCrumbString);
            }

            // Parse the description
            currentNode = rootNode.SelectSingleNode(@"//div[@id='ToneOfVoice']");
            if (currentNode != null)
            {
                product.Description = currentNode.InnerText.Trim();
            }
            else
            {
                currentNodes = rootNode.SelectNodes(@"//div[@class='description']");
                if (currentNodes != null)
                {
                    product.Description = WebUtility.HtmlDecode(string.Join(@" ", currentNodes.Select(x => x.InnerText)));
                }
                else
                {
                    // Placeholder kept byte-for-byte (including its spelling) because
                    // it is a stored value that other code may compare against.
                    product.Description = "Desciption";
                }
            }

            product.DescriptionCN = Translator.Translate(product.Description);

            // Parse the material.
            currentNode = rootNode.SelectSingleNode(@"//div[@id='Composition']");
            if (currentNode != null)
            {
                product.Material   = currentNode.InnerText.Trim();
                product.MaterialCN = Translator.Translate(product.Material);
            }

            // Parse the price
            currentNode = rootNode.SelectSingleNode(@"//div[contains(@class, 'SizeSelector')]");
            if (currentNode != null)
            {
                PriceInfo[] priceInfos = ParseNextPriceInfos(currentNode);
                // Parse the age info: lower bound from the first size, upper bound
                // from the last size.
                if (priceInfos.Length > 0)
                {
                    Tuple <Single, Single> firstAge = ParseNextAgeInfo(priceInfos.First().Size);
                    Tuple <Single, Single> lastAge  = ParseNextAgeInfo(priceInfos.Last().Size);
                    product.MinimumAge = firstAge.Item1;
                    product.MaximumAge = lastAge.Item2;
                }

                product.SetPriceInfos(priceInfos);
            }

            // Parse the image links
            product.ThumbnailLink = "URL";

            // Tracked separately: the placeholder above is never null, so a null
            // test on ThumbnailLink could not tell whether a real thumbnail was found.
            bool           thumbnailFound = false;
            HtmlNode       imgLinksNode   = rootNode.SelectSingleNode(@"//div[@class=""ThumbNailNavClip""]");
            IList <string> imgLinks       = new List <string>();

            if (imgLinksNode != null)
            {
                currentNode = imgLinksNode.ChildNodes["ul"];
                if (currentNode != null)
                {
                    foreach (HtmlNode node in currentNode.ChildNodes)
                    {
                        // Skip children that carry no link (e.g. whitespace text nodes).
                        HtmlNode aNode = node.SelectSingleNode(@"a");
                        if (aNode == null)
                        {
                            continue;
                        }

                        var relAttribute = aNode.Attributes["rel"];
                        if (relAttribute != null)
                        {
                            imgLinks.Add(relAttribute.Value);
                        }

                        if (!thumbnailFound)
                        {
                            HtmlNode thumbnailNode = aNode.ChildNodes.FirstOrDefault();
                            var      srcAttribute  = thumbnailNode != null ? thumbnailNode.Attributes["src"] : null;
                            if (srcAttribute != null)
                            {
                                product.ThumbnailLink = srcAttribute.Value;
                                thumbnailFound        = true;
                            }
                        }
                    }
                }
            }

            product.ImageLinks = imgLinks.Count > 0 ? string.Join(@";", imgLinks) : "URLs";
            product.InsertTime = DateTime.Now;
            product.UpdateTime = DateTime.Now;

            stopwatch.Stop();
            Console.WriteLine("Time consumed in parsing the content {0}", stopwatch.Elapsed);
            return(product);
        }
예제 #2
0
        /// <summary>
        /// Loads a Boden product page in the browser driver and extracts its details
        /// into a new <see cref="Product"/>.
        /// </summary>
        /// <remarks>
        /// The product id is taken from the second-to-last '/'-separated segment of
        /// <paramref name="url"/>. Fields whose markup cannot be found are left at
        /// their default value.
        /// </remarks>
        /// <param name="url">Address of the product page to load and parse.</param>
        /// <returns>The product populated from the page.</returns>
        public Product ParseBodenProduct(string url)
        {
            this.LoadWebPage(url);

            // Prepare the product object.
            HtmlDocument doc = new HtmlDocument();

            doc.LoadHtml(this.driver.PageSource);
            Product product = new Product();

            HtmlNode rootNode = doc.DocumentNode;

            product.Brand = @"Boden";
            product.URL   = url;

            // Parse the product id from the url; skipped when the url has too few
            // segments to contain one.
            string[] ss = url.Split('/');
            if (ss.Length >= 2)
            {
                product.ProductID = ss[ss.Length - 2];
            }

            // Parse the product name
            HtmlNode currentNode = rootNode.SelectSingleNode(@"//h1[@class=""pdpProductTitle""]");

            if (currentNode != null)
            {
                string productTitle = currentNode.InnerText.Trim();
                product.Title   = productTitle;
                product.TitleCN = Translator.Translate(productTitle);
            }

            // Parse the sex and category.
            HtmlNode breadNode = rootNode.SelectSingleNode(@"//div[@class=""breadcrumb""]");

            if (breadNode != null)
            {
                // SelectNodes yields null rather than an empty collection when
                // nothing matches, so guard before indexing.
                HtmlNodeCollection categoryNodes = breadNode.SelectNodes(@".//li");

                if (categoryNodes != null)
                {
                    int num = categoryNodes.Count;

                    // Parse the gender from the second breadcrumb entry.
                    if (num > 1)
                    {
                        product.Gender = ParseBodenGenderString(categoryNodes[1].InnerText);
                    }

                    // Parse the category from the second-to-last breadcrumb entry.
                    if (num > 2)
                    {
                        string categoryStr = categoryNodes[num - 2].InnerText.Trim();
                        product.Category = ParseBodenCategoryString(categoryStr);
                    }
                }
            }

            // Parse the description and material: the first line of the panel text is
            // the description, any remaining lines are the material.
            currentNode = rootNode.SelectSingleNode(@"//div[@class=""tabContent pdpProductPnl a-slide""]");
            if (currentNode != null)
            {
                string   productDescription = WebUtility.HtmlDecode(currentNode.InnerText.Trim());
                string[] desStrings         = productDescription.Split('\n');
                product.Description   = desStrings[0].Trim();
                product.DescriptionCN = Translator.Translate(product.Description);
                if (desStrings.Length > 1)
                {
                    // The text was split on '\n', so joining with a space turns each
                    // line break into a space; carriage returns are then dropped.
                    product.Material   = String.Join(@" ", desStrings.Skip(1)).Replace("\r", string.Empty).Trim();
                    product.MaterialCN = Translator.Translate(product.Material);
                }
                else
                {
                    product.Material   = @"";
                    product.MaterialCN = @"";
                }
            }

            // Parse the image links
            HtmlNode imgContainerNode = rootNode.SelectSingleNode(@"//div[@class=""imageryImagesContainer""]");

            if (imgContainerNode != null)
            {
                IList <string>     imgLinks = new List <string>();
                HtmlNodeCollection imgNodes = imgContainerNode.SelectNodes(@".//img[@class=""cloudzoom""]");

                if (imgNodes != null)
                {
                    foreach (HtmlNode imgNode in imgNodes)
                    {
                        var srcAttribute = imgNode.Attributes["src"];
                        if (srcAttribute != null)
                        {
                            imgLinks.Add(srcAttribute.Value);
                        }
                    }
                }

                // Only record links when at least one image was found; the first one
                // doubles as the thumbnail.
                if (imgLinks.Count > 0)
                {
                    product.ImageLinks    = string.Join(@";", imgLinks);
                    product.ThumbnailLink = imgLinks[0];
                }
            }

            product.InsertTime = DateTime.Now;
            product.UpdateTime = DateTime.Now;

            PriceInfo[] priceInfos = ParseBodenPrices(this.driver, this.wait);
            product.SetPriceInfos(priceInfos);

            // Parse the age info: lower bound from the first size, upper bound from
            // the last size.
            if (priceInfos.Length > 0)
            {
                Tuple <Single, Single> firstAge = ParseBodenAgeInfo(priceInfos.First().Size);
                Tuple <Single, Single> lastAge  = ParseBodenAgeInfo(priceInfos.Last().Size);
                product.MinimumAge = firstAge.Item1;
                product.MaximumAge = lastAge.Item2;
            }


            return(product);
        }