/// <summary>
/// Downloads every page yielded by the ceneo.pl link provider and appends the raw
/// page content to <c>pagesContent</c>.
/// </summary>
/// <param name="statistic">Notified about every link that is downloaded.</param>
/// <param name="product">Product being crawled; this method does not modify it.</param>
/// <returns>The same <paramref name="product"/> instance that was passed in.</returns>
public Product getPagesContent(IStatisctics statistic, Product product)
{
    // WebClient is IDisposable; the using block releases it even when a download throws.
    using (System.Net.WebClient client = new System.Net.WebClient())
    {
        client.Encoding = Encoding.UTF8;
        ILinkProvider provider = new CeneoLinkProvider(m_downloadString);

        string link;
        // The loop ends as soon as the provider hands back an empty string.
        while ((link = provider.getLink()) != "")
        {
            statistic.addDowlodedPage(link);
            string pageContent = client.DownloadString(link);
            pagesContent.Add(pageContent);
        }
    }

    return product;
}
/// <summary>
/// Creates the product for <paramref name="productId"/>, fills its basic properties from
/// the ceneo.pl product page and downloads the review pages from ceneo.pl and, when a
/// product link is found, from skapiec.pl.
/// </summary>
/// <param name="productId">Product identifier appended to the ceneo.pl address.</param>
/// <param name="statistic">Statistics object handed to both crawlers.</param>
public HttpCommentGeter(string productId, IStatisctics statistic)
{
    product = new Product();
    string pageName = "http://www.ceneo.pl/" + productId + "#tab=reviews";
    fillProductPropertis(product, pageName);

    m_webCrawlerCeneo = new CeneoWebCrawler(pageName);
    m_webCrawlerCeneo.getPagesContent(statistic, product);

    ILinkToProductFinder productFinder = new SkapiecLinkToProductFinder();
    string foundProduct = productFinder.getLinkToProduct(product);
    if (foundProduct != null)
    {
        // Reuse the link obtained above instead of running the lookup a second time.
        m_webCrawlerSkapiec = new SkapiecWebCrawler("http://www.skapiec.pl" + foundProduct + "#opinie");
        m_webCrawlerSkapiec.getPagesContent(statistic, product);
    }
}
/// <summary>
/// Searches skapiec.pl for the product's brand and model and returns the link of the
/// search result whose title is closest (by Levenshtein distance) to the query.
/// </summary>
/// <param name="product">Product whose <c>Brand</c> and <c>Model</c> build the search query.</param>
/// <returns>
/// The <c>href</c> value of the best matching result, or <c>null</c> when no result was
/// found or the best distance is not below the acceptance threshold.
/// </returns>
public string getLinkToProduct(Product product)
{
    // A candidate is accepted only when its distance to the query is below this value.
    const int maxAcceptedDistance = 20;

    string productName = product.Brand + " " + product.Model;
    productName = productName.TrimEnd();
    productName = productName.Replace(' ', '+');
    productName = productName.ToLower();

    string pageContent;
    // Every object below is IDisposable; the using blocks release the connection and
    // the streams even when reading the response fails.
    using (System.Net.WebClient client = new System.Net.WebClient())
    {
        client.Encoding = Encoding.UTF8;
        Uri uriAddres = new Uri("http://www.skapiec.pl/szukaj/w_calym_serwisie/" + productName);
        client.Headers.Add("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0");
        client.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        client.Headers.Add("Accept-Language", "pl,en-US;q=0.7,en;q=0.3");
        client.Headers.Add("Accept-Encoding", "gzip, deflate");
        // NOTE(review): hard-coded browser session cookie; consider moving it to configuration.
        client.Headers.Add("Cookie", "PHPSESSID=e82ovdm91g5vobf0700n6k6dk6; skapiec_track=MTE5MTc0NzIyNg%3D%3D; YII_CSRF_TOKEN=8cc33c83714d40df25451e3b10a93f8e675eeae4; _ga=GA1.2.288452357.1451829433; __utmx=197911341.0T-zQrfuTne0iqXdL--tYQ$73259467-63:.DHy8J0MdR82UaXE7-wwR2w$73259467-66:; __utmxx=197911341.0T-zQrfuTne0iqXdL--tYQ$73259467-63:1451829432:15552000.DHy8J0MdR82UaXE7-wwR2w$73259467-66:1451829719:15552000; __utma=197911341.288452357.1451829433.1451829435.1451829435.1; __utmb=197911341.6.9.1451829720200; __utmc=197911341; __utmz=197911341.1451829435.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __cktest=123; groki_uuid=f73327cb-113d-4372-8657-75d9eb124fd2; groki_usid=318888af-58b9-40b8-93ec-281d1d92dd73; __utmv=197911341.|2=UID=Brak=1; __gfp_64b=.XiwTpeFNPvRQK5GN82_ZmEnZo8ft2afgYYTqbCJuTT.07; ea_uuid=201601031457157309300828; SkaPaginationSearchPagination=20");

        // The response body is always decompressed as gzip, matching the
        // Accept-Encoding header sent above.
        using (System.IO.Stream rawStream = client.OpenRead(uriAddres))
        using (var responseStream = new System.IO.Compression.GZipStream(rawStream, System.IO.Compression.CompressionMode.Decompress))
        using (var reader = new System.IO.StreamReader(responseStream))
        {
            pageContent = reader.ReadToEnd();
        }
    }

    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(pageContent);
    HtmlAgilityPack.HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//div[@class=\"partial products js results\"]/div[@class=\"partial box-row js groki click\"]/div[@class =\"box\"] | //div[@class=\"partial products js results\"]/div[@class=\"partial box-row js groki click\"]/div[@class =\"box mono-offer\"]");

    string bestLink = "";
    int bestDistance = int.MaxValue;

    if (nodes != null)
    {
        foreach (HtmlAgilityPack.HtmlNode node in nodes)
        {
            // SelectNodes yields null when nothing matches, so a result box without
            // an anchor is skipped instead of causing a NullReferenceException.
            HtmlAgilityPack.HtmlNodeCollection anchors = node.SelectNodes(".//a[1]");
            if (anchors == null || anchors.Count == 0)
            {
                continue;
            }

            // Only the first anchor of each result box is considered.
            HtmlAgilityPack.HtmlNode firstAnchor = anchors[0];
            string title = firstAnchor.InnerHtml.ToLower();

            // Compute the distance once and reuse it for the comparison and the update.
            int candidateDistance = Levenshtein.CalculateDistance(productName, title, 1);
            if (bestDistance > candidateDistance)
            {
                bestLink = firstAnchor.GetAttributeValue("href", "");
                bestDistance = candidateDistance;
            }
        }
    }

    if (bestDistance < maxAcceptedDistance)
    {
        return bestLink;
    }

    return null;
}
/// <summary>
/// Loads <paramref name="pageContent"/> into <c>htmlDoc</c> and, when the document parsed
/// without errors, fills <paramref name="product"/> through <c>fillProductInfo</c>.
/// </summary>
/// <param name="pageContent">Raw HTML of a downloaded page.</param>
/// <param name="product">Product that receives the parsed data.</param>
/// <exception cref="System.ExecutionEngineException">The HTML parser reported at least one parse error.</exception>
public void getCommentsContentFromPage(string pageContent, Product product)
{
    htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(pageContent);

    bool hasParseErrors = htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Any();
    if (hasParseErrors)
    {
        // NOTE(review): ExecutionEngineException is an obsolete runtime exception type;
        // it is kept here so that existing callers observe the same exception.
        throw new System.ExecutionEngineException();
    }

    if (htmlDoc.DocumentNode == null)
    {
        return;
    }

    fillProductInfo(product);
}
/// <summary>
/// Reads the breadcrumb heading of the loaded document and splits it into the product's
/// brand (first word) and model (remaining words).
/// </summary>
/// <param name="product">Product whose <c>Brand</c> and <c>Model</c> are assigned.</param>
private void fillBrandAndModel(Product product)
{
    HtmlAgilityPack.HtmlNodeCollection headingNodes = htmlDoc.DocumentNode.SelectNodes("//nav[@class=\"breadcrumbs\"]//dl//strong");
    if (headingNodes == null)
    {
        return;
    }

    // When several nodes match, each one overwrites the values set by the previous one.
    foreach (HtmlAgilityPack.HtmlNode headingNode in headingNodes)
    {
        string[] words = headingNode.InnerHtml.Split(' ');
        product.Brand = words[0];

        // Every model word is followed by a single space, so a non-empty model
        // always ends with a trailing space.
        if (words.Length > 1)
        {
            product.Model = string.Join(" ", words, 1, words.Length - 1) + " ";
        }
        else
        {
            product.Model = "";
        }
    }
}
/// <summary>
/// Extracts the comment text from <paramref name="node"/> and checks whether the product
/// already holds a Skapiec comment with exactly the same text.
/// </summary>
/// <param name="product">Product whose <c>Comments</c> are searched.</param>
/// <param name="node">HTML node of a single opinion entry.</param>
/// <returns><c>true</c> when an identical Skapiec comment is already present; otherwise <c>false</c>.</returns>
private bool isCommentExistInProduct(Product product, HtmlNode node)
{
    HtmlAgilityPack.HtmlNodeCollection paragraphs = node.SelectNodes(".//div[@class=\"opinion-container\"]//p");

    // The comment text is the concatenation of all matching paragraphs.
    System.Text.StringBuilder textBuilder = new System.Text.StringBuilder();
    if (paragraphs != null)
    {
        foreach (HtmlAgilityPack.HtmlNode paragraph in paragraphs)
        {
            textBuilder.Append(paragraph.InnerText);
        }
    }
    string candidateText = textBuilder.ToString();

    foreach (CommentDb stored in product.Comments)
    {
        // Comments without text never count as duplicates.
        if (stored.Comment != null
            && stored.Comment.Equals(candidateText)
            && stored.PortalName.Contains("Skapiec"))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Sets the product type from the breadcrumb navigation of the loaded document.
/// </summary>
/// <param name="product">Product whose <c>Type</c> is assigned.</param>
private void fillType(Product product)
{
    HtmlAgilityPack.HtmlNodeCollection typeNodes = htmlDoc.DocumentNode.SelectNodes("//nav[@class=\"breadcrumbs\"]//dd//span[last()]//span");
    if (typeNodes == null || typeNodes.Count == 0)
    {
        return;
    }

    // When several nodes match, the text of the last one is the value that is kept.
    product.Type = typeNodes[typeNodes.Count - 1].InnerText;
}
/// <summary>
/// Fills the product with the information available in the loaded document;
/// this variant collects the comments only.
/// </summary>
/// <param name="product">Product that receives the comments.</param>
private void fillProductInfo(Product product)
{
    fillComments(product);
}
/// <summary>
/// Adds every opinion found in the loaded document to the product as a "Skapiec"
/// comment, skipping opinions the product already contains.
/// </summary>
/// <param name="product">Product whose <c>Comments</c> collection is extended.</param>
void fillComments(Product product)
{
    HtmlAgilityPack.HtmlNodeCollection opinionNodes = htmlDoc.DocumentNode.SelectNodes("//ul[@class=\"opinion-list\"]/li");
    if (opinionNodes == null)
    {
        return;
    }

    foreach (HtmlAgilityPack.HtmlNode opinionNode in opinionNodes)
    {
        if (isCommentExistInProduct(product, opinionNode))
        {
            continue;
        }

        CommentDb comment = new CommentDb();
        comment.PortalName = "Skapiec";
        fillComment(comment, opinionNode);
        product.Comments.Add(comment);
    }
}
/// <summary>
/// Passes every downloaded page to the parser so that its comments are added to the product.
/// </summary>
/// <param name="product">Product that receives the parsed comments.</param>
public void fillProduct(Product product)
{
    foreach (string downloadedPage in pagesContent)
    {
        ceneoParser.getCommentsContentFromPage(downloadedPage, product);
    }
}
/// <summary>
/// Extracts the review text from <paramref name="node"/> and checks whether the product
/// already holds a comment with exactly the same text.
/// </summary>
/// <param name="product">Product whose <c>Comments</c> are searched.</param>
/// <param name="node">HTML node of a single review entry.</param>
/// <returns><c>true</c> when an identical comment is already present; otherwise <c>false</c>.</returns>
private bool isCommentExistInProduct(Product product, HtmlNode node)
{
    HtmlAgilityPack.HtmlNodeCollection reviewBodies = node.SelectNodes(".//p[@class=\"product-review-body\"]");

    // The review text is the concatenation of all matching paragraphs.
    System.Text.StringBuilder textBuilder = new System.Text.StringBuilder();
    if (reviewBodies != null)
    {
        foreach (HtmlAgilityPack.HtmlNode reviewBody in reviewBodies)
        {
            textBuilder.Append(reviewBody.InnerText);
        }
    }
    string candidateText = textBuilder.ToString();

    foreach (CommentDb stored in product.Comments)
    {
        // Comments without text never count as duplicates.
        if (stored.Comment != null && stored.Comment.Equals(candidateText))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Fills the product with the information available in the loaded document:
/// first the comments, then the type, then the brand and model.
/// </summary>
/// <param name="product">Product that receives the parsed data.</param>
private void fillProductInfo(Product product)
{
    fillComments(product);
    fillType(product);
    fillBrandAndModel(product);
}
/// <summary>
/// Adds every review found in the loaded document to the product as a "Ceneo" comment,
/// skipping reviews the product already contains.
/// </summary>
/// <param name="product">Product whose <c>Comments</c> collection is extended.</param>
void fillComments(Product product)
{
    HtmlAgilityPack.HtmlNodeCollection reviewItems = htmlDoc.DocumentNode.SelectNodes("//ol[@class=\"product-reviews js_product-reviews js_reviews-hook\"]/li");
    if (reviewItems == null)
    {
        return;
    }

    foreach (HtmlAgilityPack.HtmlNode reviewItem in reviewItems)
    {
        if (isCommentExistInProduct(product, reviewItem))
        {
            continue;
        }

        CommentDb review = new CommentDb { PortalName = "Ceneo" };
        fillComment(review, reviewItem);
        product.Comments.Add(review);
    }
}
/// <summary>
/// Stores the product in the database. When products with the same brand, model and type
/// already exist, only the comments they do not contain yet are added to them; otherwise
/// the whole product is inserted together with all of its comments.
/// </summary>
/// <param name="product">Downloaded product with its comments.</param>
/// <param name="statistic">Notified about every comment that is newly added.</param>
/// <exception cref="System.Data.Entity.Validation.DbEntityValidationException">
/// Entity validation failed; the message lists the individual validation errors and the
/// original exception is available as the inner exception.
/// </exception>
private void addProductToDatabase(Product product, IStatisctics statistic)
{
    try
    {
        using (var db = new DatabaseContext())
        {
            // Materialise the query once: the result is both iterated and counted, and
            // the Comments navigation property is accessed while iterating.
            var matchingProducts = (from p in db.Product
                                    where p.Brand.Equals(product.Brand)
                                          && p.Model.Equals(product.Model)
                                          && p.Type.Equals(product.Type)
                                    select p).ToList();

            DateTime loadTime = DateTime.Now;

            foreach (Product productInDb in matchingProducts)
            {
                foreach (CommentDb downloadedComment in product.Comments)
                {
                    bool alreadyStored = productInDb.Comments.Any(x => isSameComment(downloadedComment, x));
                    if (alreadyStored)
                    {
                        continue;
                    }

                    downloadedComment.LoadDate = loadTime;
                    statistic.addAddedComment(downloadedComment.Comment);
                    productInDb.Comments.Add(downloadedComment);
                }
            }

            if (matchingProducts.Count == 0)
            {
                // No matching product yet: every downloaded comment is new.
                foreach (CommentDb comment in product.Comments)
                {
                    comment.LoadDate = loadTime;
                    statistic.addAddedComment(comment.Comment);
                }
                db.Product.Add(product);
            }

            db.SaveChanges();
        }
    }
    catch (System.Data.Entity.Validation.DbEntityValidationException ex)
    {
        // Collect the individual validation messages into a single string.
        var errorMessages = ex.EntityValidationErrors
            .SelectMany(x => x.ValidationErrors)
            .Select(x => x.ErrorMessage);
        var fullErrorMessage = string.Join("; ", errorMessages);
        var exceptionMessage = string.Concat(ex.Message, " The validation errors are: ", fullErrorMessage);

        // Rethrow with the detailed message, keeping the original exception (and its
        // stack trace) reachable through InnerException.
        throw new System.Data.Entity.Validation.DbEntityValidationException(exceptionMessage, ex.EntityValidationErrors, ex);
    }
}

/// <summary>
/// Decides whether a downloaded comment equals a stored one.
/// </summary>
/// <param name="downloaded">Comment that was just downloaded.</param>
/// <param name="stored">Comment already attached to the product in the database.</param>
/// <returns><c>true</c> when all compared fields match; otherwise <c>false</c>.</returns>
private static bool isSameComment(CommentDb downloaded, CommentDb stored)
{
    // Advantages, Disadvantages, Comment and Date are compared only when both sides
    // have a value; a missing value on either side does not count as a difference.
    if (downloaded.Advantages != null && stored.Advantages != null
        && !downloaded.Advantages.Equals(stored.Advantages))
    {
        return false;
    }

    if (downloaded.Disadvantages != null && stored.Disadvantages != null
        && !downloaded.Disadvantages.Equals(stored.Disadvantages))
    {
        return false;
    }

    if (downloaded.Comment != null && stored.Comment != null
        && !downloaded.Comment.Equals(stored.Comment))
    {
        return false;
    }

    if (downloaded.Date != null && stored.Date != null
        && !downloaded.Date.Equals(stored.Date))
    {
        return false;
    }

    // object.Equals tolerates null on either side, so a comment without e.g. an
    // author no longer causes a NullReferenceException.
    return object.Equals(downloaded.Recommend, stored.Recommend)
        && object.Equals(downloaded.Stars, stored.Stars)
        && object.Equals(downloaded.Usability, stored.Usability)
        && object.Equals(downloaded.UsabilityVotes, stored.UsabilityVotes)
        && object.Equals(downloaded.Author, stored.Author);
}
/// <summary>
/// Downloads the product page and fills the product's brand, model and type from it.
/// </summary>
/// <param name="product">Product whose properties are filled.</param>
/// <param name="pageName">Address of the product page to download.</param>
/// <exception cref="System.ExecutionEngineException">The HTML parser reported at least one parse error.</exception>
private void fillProductPropertis(Product product, string pageName)
{
    string pageContent;
    // WebClient is IDisposable; the using block releases it even when the download throws.
    using (System.Net.WebClient client = new System.Net.WebClient())
    {
        client.Encoding = Encoding.UTF8;
        pageContent = client.DownloadString(pageName);
    }

    htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(pageContent);

    if (htmlDoc.ParseErrors != null && htmlDoc.ParseErrors.Count() > 0)
    {
        // NOTE(review): ExecutionEngineException is an obsolete runtime exception type;
        // it is kept here so that existing callers observe the same exception.
        throw new System.ExecutionEngineException();
    }

    if (htmlDoc.DocumentNode != null)
    {
        fillBrandAndModel(product);
        fillType(product);
    }
}