/// <summary>
/// Loads a product page in the shared <c>browser</c> and scrapes its data into a new
/// <see cref="Models.Product"/>.
/// </summary>
/// <remarks>
/// If the page contains an element matching <c>.bigEntrance.banner</c>, nothing is scraped and
/// an empty product is returned. Otherwise the following fields are filled:
/// SKU (optional), Name, SalePrice, RegularPrice (optional), Attribute1Value (sizes, optional),
/// Images, Attribute2Value (brand), Categories and Attribute3Value (price-range label).
/// Mandatory elements are looked up with <c>FindElement</c>, which throws when the element is absent.
/// </remarks>
/// <param name="productLink">URL of the product page to crawl.</param>
/// <returns>The product populated with whatever could be read from the page.</returns>
private Models.Product GetProductInformation(string productLink)
{
    // Both prices are converted with the same formula: source price * 1.2 + 50000.
    // NOTE(review): presumably a markup plus a flat fee in VND - confirm with the business owner.
    const double PriceMultiplier = 1.2;
    const double PriceSurcharge = 50000;

    // Product that receives the crawled data
    Models.Product product = new Models.Product();

    browser.Navigate().GoToUrl(productLink);

    // Pages that show the ".bigEntrance.banner" element are not scraped at all.
    if (browser.FindElements(By.CssSelector(".bigEntrance.banner")).Count > 0)
    {
        return product;
    }

    // SKU is optional: FindElements returns an empty collection instead of throwing.
    var skuElements = browser.FindElements(By.CssSelector(".product-short_description li:first-child"));
    if (skuElements.Count > 0)
    {
        string sku = skuElements[0].GetAttribute("innerHTML");
        product.SKU = sku.Replace("Mã sản phẩm: ", "");
    }

    product.Name = browser.FindElement(By.CssSelector("h1.mainbox-title")).GetAttribute("innerHTML");

    // NOTE(review): Double.Parse uses the current culture; the site's number format is not
    // visible from here, so the parsing call is left exactly as it was.
    string salePriceText = browser.FindElement(By.CssSelector(".price-num")).GetAttribute("innerHTML");
    double finalPriceInVnd = Double.Parse(salePriceText) * PriceMultiplier + PriceSurcharge;
    product.SalePrice = finalPriceInVnd;

    // The struck-through (regular) price only exists on discounted products.
    var strikeElements = browser.FindElements(By.CssSelector(".strike span:first-child"));
    if (strikeElements.Count > 0)
    {
        string regularPriceText = strikeElements[0].GetAttribute("innerHTML");
        product.RegularPrice = Double.Parse(regularPriceText) * PriceMultiplier + PriceSurcharge;
    }

    // Available sizes, joined into one comma-separated attribute.
    var sizeLabels = browser.FindElements(By.CssSelector("label.radio.float-left.dosi_get_change_option.dosi_option_size"));
    if (sizeLabels.Count > 0)
    {
        List<string> sizes = new List<string>();
        foreach (var sizeLabel in sizeLabels)
        {
            sizes.Add(sizeLabel.GetAttribute("innerHTML").Trim());
        }
        product.Attribute1Value = String.Join(", ", sizes);
    }

    // Image URLs with the ".webp" part stripped.
    List<string> images = new List<string>();
    foreach (var image in browser.FindElements(By.CssSelector("a.cm-image-previewer.cm-previewer.previewer img")))
    {
        string src = image.GetAttribute("src");
        // GetAttribute returns null when the attribute is missing; skip such images instead of crashing.
        if (!String.IsNullOrEmpty(src))
        {
            images.Add(src.Replace(".webp", ""));
        }
    }
    product.Images = String.Join(", ", images);

    string brand = browser.FindElement(By.CssSelector("a.product_company")).GetAttribute("innerHTML").Trim();
    product.Attribute2Value = brand;
    product.Categories = product.Categories + "Brand>" + brand;

    // Price-range label. The original independent "if" statements had overlapping bounds
    // (the later statement won on a shared boundary) and no "500.000 - 1.000.000" bucket,
    // so prices between 500.000 and 1.000.000 were labelled "0 - 95.000".
    // This ladder keeps every previous boundary result and adds the missing bucket.
    // The ">2000000" label is kept byte-for-byte because existing data may already use it.
    string priceRange;
    if (finalPriceInVnd > 2000000)
    {
        priceRange = ">2000000";
    }
    else if (finalPriceInVnd >= 1000000)
    {
        priceRange = "1.000.000 - 2.000.000";
    }
    else if (finalPriceInVnd > 500000)
    {
        priceRange = "500.000 - 1.000.000";
    }
    else if (finalPriceInVnd > 350000)
    {
        priceRange = "350.000 - 500.000";
    }
    else if (finalPriceInVnd > 150000)
    {
        priceRange = "150.000 - 350.000";
    }
    else if (finalPriceInVnd >= 95000)
    {
        priceRange = "95.000 - 150.000";
    }
    else
    {
        priceRange = "0 - 95.000";
    }
    product.Attribute3Value = priceRange;

    return product;
}
/// <summary>
/// Loads a product page in the shared <c>browser</c> and scrapes its data into a new
/// <see cref="Models.Product"/>.
/// </summary>
/// <remarks>
/// Every field group is read inside its own try/catch so that one missing element does not
/// abort the rest of the crawl; a failure is written to the console together with the
/// exception message. Field mapping: Type and Attribute1Value (brand) come from the breadcrumb,
/// then SKU, Name, Attribute2Value (colour), RegularPrice (current price),
/// Attribute3Value (price of a new device), Attribute4Value (savings),
/// Attribute5Value/6/7 (condition, accessories, warranty), Description (specifications),
/// Images, Attribute8Value (store address) and Attribute9Value (store link).
/// </remarks>
/// <param name="productLink">URL of the product page to crawl.</param>
/// <returns>The product populated with every field that could be read from the page.</returns>
private Models.Product GetProductInformation(string productLink)
{
    // Product that receives the crawled data
    Models.Product product = new Models.Product();

    browser.Navigate().GoToUrl(productLink);

    // Type and brand, taken from the breadcrumb. Each Replace strips the entry that was just
    // matched so the next match sees the following entry.
    // String.Replace throws on an empty search string, so a failed match ends up in the catch.
    try
    {
        var breadcrumbElement = browser.FindElement(By.CssSelector("[class=\"mc-brea\"]"));
        string breadcrumb = breadcrumbElement.GetAttribute("innerHTML");

        // Strip the first entry (home page)
        string entry = Regex.Match(breadcrumb, "<ul>.*?</li>", RegexOptions.Singleline).Value;
        breadcrumb = breadcrumb.Replace(entry, "").Trim();

        // Strip the second entry (the "returned/exchanged devices" section)
        entry = Regex.Match(breadcrumb, "<li>.*?</li>", RegexOptions.Singleline).Value;
        breadcrumb = breadcrumb.Replace(entry, "").Trim();

        // The third entry holds the product type
        entry = Regex.Match(breadcrumb, "<li>.*?</li>", RegexOptions.Singleline).Value;
        product.Type = Regex.Match(entry, "<li><a.*?>(.*?)</a></li>", RegexOptions.Singleline).Groups[1].Value;
        breadcrumb = breadcrumb.Replace(entry, "").Trim();

        // The next entry holds the brand (Attribute 1)
        product.Attribute1Value = Regex.Match(breadcrumb, "<li><a.*?>(.*?)</a></li>", RegexOptions.Singleline).Groups[1].Value;
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\nType & Brand went wrong.");
        Console.WriteLine(e.Message);
    }

    // SKU: the <span> inside the title element, reduced to word characters
    try
    {
        var titleElement = browser.FindElement(By.CssSelector("[class=\"mc-ctname\"]"));
        string sku = titleElement.GetAttribute("innerHTML");
        sku = Regex.Match(sku, "<span>(.*?)</span>", RegexOptions.Singleline).Groups[1].Value;
        product.SKU = Regex.Replace(sku, "\\W", "").Trim();
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\nSKU went wrong.");
        Console.WriteLine(e.Message);
    }

    // Name: the title element without its <span> (which carries the SKU)
    try
    {
        var titleElement = browser.FindElement(By.CssSelector("[class=\"mc-ctname\"]"));
        string name = titleElement.GetAttribute("innerHTML");
        product.Name = Regex.Replace(name, "<span>.*?</span>", "").Trim();
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\nName went wrong.");
        Console.WriteLine(e.Message);
    }

    // Colour (Attribute 2), with the leading <i> element removed
    try
    {
        var colorElement = browser.FindElement(By.CssSelector("[class=\"mc-ctclo\"]"));
        string color = colorElement.GetAttribute("innerHTML");
        product.Attribute2Value = Regex.Replace(color, "<i.*?</i>", "").Trim();
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\nColor went wrong.");
        Console.WriteLine(e.Message);
    }

    // Current price (Regular Price): keep what sits between "<span>" and "đ</span>",
    // then drop the "." separators before parsing
    try
    {
        var priceElement = browser.FindElement(By.CssSelector("[class=\"mc-ctpri1\"]"));
        string price = priceElement.GetAttribute("innerHTML");
        price = Regex.Replace(price, ".*?<span>", "").Trim();
        price = Regex.Replace(price, "đ</span>.*", "", RegexOptions.Singleline).Trim();
        price = price.Replace(".", "");
        product.RegularPrice = Double.Parse(price);
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\nPrice went wrong.");
        Console.WriteLine(e.Message);
    }

    // Price of a brand-new device (Attribute 3), stored as raw inner HTML
    try
    {
        var newPriceElement = browser.FindElement(By.CssSelector("[class=\"mc-ctpri2\"]"));
        product.Attribute3Value = newPriceElement.GetAttribute("innerHTML");
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\nNewPrice went wrong.");
        Console.WriteLine(e.Message);
    }

    // Savings (Attribute 4), without the surrounding <p> tags
    try
    {
        var savingsElement = browser.FindElement(By.CssSelector("[class=\"mc-ctpri3\"]"));
        string savings = savingsElement.GetAttribute("innerHTML");
        product.Attribute4Value = Regex.Replace(savings, "<p>|</p>", "").Trim();
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\nSavings went wrong.");
        Console.WriteLine(e.Message);
    }

    // Condition, accessories and warranty (Attributes 5-7) are consecutive <li> items.
    // After an item is read its text is blanked, leaving "<li></li>", which the next
    // pattern uses as an anchor to reach the following item.
    try
    {
        var statusElement = browser.FindElement(By.CssSelector("[class=\"mc-ctttm\"]"));
        string status = statusElement.GetAttribute("innerHTML");

        // Condition (Attribute 5)
        string item = Regex.Match(status, "<li>(.*?)</li>", RegexOptions.Singleline).Groups[1].Value;
        product.Attribute5Value = item;
        status = status.Replace(item, "");

        // Accessories (Attribute 6)
        item = Regex.Match(status, "<li></li>.*?<li>(.*?)</li>", RegexOptions.Singleline).Groups[1].Value;
        product.Attribute6Value = item;
        status = status.Replace(item, "");

        // Warranty (Attribute 7)
        item = Regex.Match(status, "<li></li>.*?<li></li>.*?<li>(.*?)</li>", RegexOptions.Singleline).Groups[1].Value;
        product.Attribute7Value = item;
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\nStatus, Acess & Grua went wrong.");
        Console.WriteLine(e.Message);
    }

    // Technical specifications (Description)
    try
    {
        var specElement = browser.FindElement(By.CssSelector("[class=\"modal-body tskt-popct\"]"));
        string infor = specElement.GetAttribute("innerHTML");
        // Strings are immutable: the original "infor.Trim();" threw its result away.
        infor = infor.Trim();
        infor = infor.Replace("@", "&");
        product.Description = infor;
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\nDecript went wrong.");
        Console.WriteLine(e.Message);
    }

    // Images: every https URL found inside the slider markup
    try
    {
        var sliderElement = browser.FindElement(By.CssSelector("[class=\"slick-list draggable\"]"));
        string img = sliderElement.GetAttribute("innerHTML");
        string imgLinks = "";
        Regex urlPattern = new Regex("https://.*?\"");
        foreach (Match m in urlPattern.Matches(img))
        {
            imgLinks += m.Value + ", ";
        }

        // Remove "cdn." (per the original note, the links are only reachable without it)
        // and the closing quote captured by the pattern.
        imgLinks = imgLinks.Replace("cdn.", "").Replace("\"", "");

        // The separator is ", " so both characters must be trimmed; TrimEnd(',') alone
        // never removed anything because the string ends with a space.
        imgLinks = imgLinks.TrimEnd(',', ' ');
        product.Images = imgLinks;
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\nImage went wrong.");
        Console.WriteLine(e.Message);
    }

    // Store location: address text -> Attribute 8, link -> Attribute 9
    try
    {
        var locationElement = browser.FindElement(By.CssSelector("[class=\"mc-ctlocit\"]"));
        string location = locationElement.GetAttribute("innerHTML");
        string address = Regex.Match(location, "(.*?)<a", RegexOptions.Singleline).Groups[1].Value;
        product.Attribute8Value = address;
        string locLink = Regex.Match(location, "href=\"(.*?)\"", RegexOptions.Singleline).Groups[1].Value;
        product.Attribute9Value = locLink;
    }
    catch (Exception e)
    {
        Console.WriteLine(productLink + "\n Address went wrong.");
        Console.WriteLine(e.Message);
    }

    return product;
}
/// <summary>
/// Loads a product page in the shared <c>browser</c>, waits until the document is fully
/// loaded and scrapes its data into a new <see cref="Models.Product"/>.
/// </summary>
/// <remarks>
/// Each field is read only when the <c>verify</c> helper reports that its selector is present.
/// Field mapping: SKU, Name, RegularPrice, SalePrice, Description, Attribute1Value (brand),
/// Attribute2Value (feature summary), Attribute3Value (price-range label), Categories and
/// Images (gallery URLs extracted from the page source).
/// </remarks>
/// <param name="productLink">URL of the product page to crawl.</param>
/// <returns>The product populated with every field that could be read from the page.</returns>
private Models.Product GetProductInformation(string productLink)
{
    // Value stored as SalePrice when the page shows "Liên hệ" instead of a number.
    const double ContactForPriceValue = 9999999;

    // Product that receives the crawled data
    Models.Product product = new Models.Product();

    // Pause 1-5 seconds before each request. The original Next(2) * 1000 only ever slept
    // 0 or 1 second, which contradicted its own "1-5 seconds" comment.
    System.Threading.Thread.Sleep(new Random().Next(1, 6) * 1000);

    WebDriverWait wait = new WebDriverWait(this.browser, TimeSpan.FromSeconds(120));
    browser.Navigate().GoToUrl(productLink);

    // Block until the browser reports that the document has finished loading.
    wait.Until(x => ((IJavaScriptExecutor)this.browser).ExecuteScript("return document.readyState").Equals("complete"));

    // SKU
    if (verify(browser, "[itemprop = \"sku\"]") == true)
    {
        var element = browser.FindElement(By.CssSelector("[itemprop = \"sku\"]"));
        product.SKU = element.GetAttribute("innerHTML");
    }

    // Name
    if (verify(browser, "[itemprop =\"url\"]") == true)
    {
        var element = browser.FindElement(By.CssSelector("[itemprop =\"url\"]"));
        product.Name = element.GetAttribute("innerHTML");
    }

    // Regular price: strip the currency suffix, then parse.
    // NOTE(review): Double.Parse uses the current culture; the site's number format is not
    // visible from here, so the parsing call is left unchanged.
    if (verify(browser, "[class=\"ngachngang\"]") == true)
    {
        var element = browser.FindElement(By.CssSelector("[class=\"ngachngang\"]"));
        string regularPriceText = element.GetAttribute("innerHTML").Replace(" VNĐ", "");
        product.RegularPrice = Double.Parse(regularPriceText);
    }

    // Sale price: either a number with a currency suffix or the text "Liên hệ".
    if (verify(browser, "[class=\"price_sale\"]") == true)
    {
        var element = browser.FindElement(By.CssSelector("[class=\"price_sale\"]"));
        string salePriceText = element.GetAttribute("innerHTML");
        if (salePriceText != "Liên hệ")
        {
            product.SalePrice = Double.Parse(salePriceText.Replace(" VNĐ", ""));
        }
        else
        {
            product.SalePrice = ContactForPriceValue;
        }
    }

    // Description: visible text with the shop name and line breaks removed.
    // NOTE(review): Replace("/s", "") removes the literal two characters "/s"; it looks like
    // the regex class \s may have been intended - confirm before changing.
    if (verify(browser, "[class=\"view-content\"]") == true)
    {
        var element = browser.FindElement(By.CssSelector("[class=\"view-content\"]"));
        string description = element.GetAttribute("innerText");
        product.Description = description.Replace("tại BinhMinhDigital", "").Replace("/s", "").Replace("\r", "").Replace("\n", "");
    }

    // Brand
    if (verify(browser, "[itemprop=\"brand\"]") == true)
    {
        var element = browser.FindElement(By.CssSelector("[itemprop=\"brand\"]"));
        product.Attribute1Value = element.GetAttribute("innerHTML");
    }

    // Feature summary. The guard now checks the selector that is actually read; previously
    // it checked "view-content", so FindElement could throw when "product-recap" was missing.
    if (verify(browser, "[class=\"product-recap\"]") == true)
    {
        var element = browser.FindElement(By.CssSelector("[class=\"product-recap\"]"));
        string advanceInfo = element.GetAttribute("innerText");
        product.Attribute2Value = advanceInfo.Replace("TÍNH NĂNG NỔI BẬT", "").Replace("/s", "").Replace("\r", "").Replace("\n", "");
    }

    // Price-range label. The last bucket used to test "> 2000000" (one zero short), so every
    // price above 2 million was overwritten with "> 20 000 000".
    if (product.SalePrice <= 1000000)
    {
        product.Attribute3Value = "<= 1 000 000";
    }
    else if (product.SalePrice > 1000000 && product.SalePrice <= 5000000)
    {
        product.Attribute3Value = "1 000 000 - 5 000 000";
    }
    else if (product.SalePrice > 5000000 && product.SalePrice <= 20000000)
    {
        product.Attribute3Value = "5 000 000 - 20 000 000";
    }
    else if (product.SalePrice > 20000000)
    {
        product.Attribute3Value = "> 20 000 000";
    }

    // Category path: fixed category followed by the brand
    product.Categories = "Chân Camera" + ">" + product.Attribute1Value;

    // Image gallery: collect every data-standard="...jpg" value from the page source and
    // turn the site-relative path into an absolute URL.
    string gallery = "";
    string pageSource = browser.PageSource;
    MatchCollection galleryMatches = Regex.Matches(pageSource, @"data-standard(.*?)jpg", RegexOptions.Singleline);
    foreach (Match galleryMatch in galleryMatches)
    {
        // The second, single-line match is kept from the original code so that the extracted
        // value stays exactly the same for matches that span several lines.
        string imageUrl = Regex.Match(galleryMatch.Value, @"data-standard(.*?)jpg").Value.Replace("data-standard=\"/", "https://binhminhdigital.com/");
        gallery += imageUrl + ",";
    }
    product.Images = gallery.TrimEnd(',');

    return product;
}