/// <summary>
/// Returns the tipologia id to store for <paramref name="ad"/>, applying the keyword overrides
/// configured in <c>tip.disambiguation</c>.
/// </summary>
/// <remarks>
/// <c>tip.disambiguation</c> is read as a ';'-separated list of <c>keyword=id</c> entries. Every entry whose
/// keyword occurs in the lower-cased ad title overrides the result, so the last matching entry wins.
/// Entries without an '=' (for example the empty entry produced by a trailing ';') are ignored.
/// </remarks>
/// <param name="ad">Ad being classified; <c>ad.idtipologia</c> is the fallback and <c>ad.title</c> is the text searched.</param>
/// <param name="tip">Crawl parameters carrying the optional disambiguation rules.</param>
/// <returns>The id of the last matching rule, or <c>ad.idtipologia</c> when no rule matches.</returns>
/// <exception cref="FormatException">A matching entry has an id that is not a valid integer.</exception>
static int disambiguateTipologia(adResult ad, dbparams tip)
{
    int t = ad.idtipologia;

    // Nothing to apply without rules, and nothing to search without a title.
    if (string.IsNullOrEmpty(tip.disambiguation) || string.IsNullOrEmpty(ad.title))
    {
        return t;
    }

    // The title does not change inside the loop, so lower-case it once.
    // NOTE(review): only the title is lower-cased; a keyword containing upper-case letters can never match.
    string loweredTitle = ad.title.ToLower();

    foreach (string pair in tip.disambiguation.Split(';'))
    {
        string[] values = pair.Split('=');

        // Split(';') yields an empty entry for a trailing or doubled ';'. Such an entry has no id part,
        // and its empty keyword would match every title, so reading values[1] used to throw.
        if (values.Length < 2)
        {
            continue;
        }

        if (loweredTitle.Contains(values[0]))
        {
            t = Convert.ToInt32(values[1]);
        }
    }

    return t;
}
/// <summary>
/// Stores one crawled ad by executing the <c>Crawler_Inserimento</c> stored procedure.
/// </summary>
/// <param name="conn">Connection the command is executed on.</param>
/// <param name="ad">
/// Ad to store. It is skipped unless id, idcomune, url and contratto are set and idtipologia and
/// idcategoria are greater than zero.
/// </param>
/// <returns>
/// The value returned by <c>ExecuteNonQuery</c>, or 0 when the ad was skipped or a
/// <see cref="SqlException"/> was caught.
/// </returns>
static int saveAd(SqlConnection conn, adResult ad)
{
    int affected = 0;

    // Same required-field test as before, expressed as a guard clause.
    bool hasRequiredFields =
        ad.id != ""
        && ad.idcomune != ""
        && ad.idcomune != null
        && ad.url != ""
        && ad.idtipologia > 0
        && ad.idcategoria > 0
        && ad.contratto != "";
    if (!hasRequiredFields)
    {
        return affected;
    }

    try
    {
        using (SqlCommand cmd = new SqlCommand("Crawler_Inserimento", conn)
        {
            CommandType = CommandType.StoredProcedure,
            CommandTimeout = 600
        })
        {
            SqlParameterCollection args = cmd.Parameters;

            // The portal id comes from configuration, not from the ad itself.
            int idPortale = Convert.ToInt32(ConfigurationManager.AppSettings["idPortale"]);
            args.Add("@IDPortale", SqlDbType.TinyInt).Value = idPortale;
            args.Add("@IDImmobileImportato", SqlDbType.Int).Value = ad.id;
            args.Add("@Lingua", SqlDbType.Char).Value = "IT";
            args.Add("@CodiceNazione", SqlDbType.Char).Value = "IT";
            args.Add("@CodiceComune", SqlDbType.VarChar).Value = ad.idcomune;

            // A quartiere id that is not positive is sent as NULL.
            object codiceQuartiere = ad.idquartiere > 0 ? (object)ad.idquartiere : DBNull.Value;
            args.Add("@CodiceQuartiere", SqlDbType.Int).Value = codiceQuartiere;

            args.Add("@IDQuartierePortale", SqlDbType.Int).Value = ad.idquartiereportale;
            args.Add("@Zona", SqlDbType.NVarChar).Value = ad.zona;
            args.Add("@Categoria", SqlDbType.TinyInt).Value = ad.idcategoria;
            args.Add("@Contratto", SqlDbType.Char).Value = ad.contratto;
            args.Add("@IDTipologia", SqlDbType.Int).Value = ad.idtipologia;
            args.Add("@NrLocali", SqlDbType.TinyInt).Value = ad.locali;
            args.Add("@Prezzo", SqlDbType.Money).Value = ad.price;
            args.Add("@MQSuperficie", SqlDbType.Int).Value = ad.mq;
            args.Add("@URLFotoPrincipale", SqlDbType.VarChar).Value = ad.imgSrc;
            args.Add("@URLImmobile", SqlDbType.VarChar).Value = ad.url;
            args.Add("@Testo", SqlDbType.NVarChar).Value = ad.title;

            // Insert and modification timestamps are both taken from ad.date.
            args.Add("@DataInserimento", SqlDbType.DateTime).Value = ad.date;
            args.Add("@DataModifica", SqlDbType.DateTime).Value = ad.date;

            affected = cmd.ExecuteNonQuery();
            esitoAnnunciLavorati += 1;
        }
    }
    catch (SqlException ex)
    {
        // Only SQL failures are absorbed here; they are logged and added to the run counters.
        logger.Error("Errore esecuzione Stored Procedure ", ex);
        esitoErrors += 1;
        esitoErrorsNotes += "\r\nErrore esecuzione Stored Procedure. " + ex.Message;
    }

    return affected;
}
/// <summary>
/// Crawls the portal result lists for one contract type and one tipologia, one URL per province,
/// and passes every listing carrying the new-listing marker to saveAd.
/// </summary>
/// <param name="connImport01">Connection handed to getComuneByName, getQuartierePortaleByName and saveAd.</param>
/// <param name="contratto">Contract segment placed in the list URL; its first character, upper-cased, becomes ad.contratto.</param>
/// <param name="tip">Supplies idcategoria, idtipologia, the tipologia URL segment and the rules used by disambiguateTipologia.</param>
//
// Flow:
//  1. One url entry is built per item of province (declared outside this method). The URI is formatted from
//     contratto, a category segment chosen from tip.idcategoria (1 or 2, empty for any other value),
//     tip.tipologia and the lower-cased province code.
//  2. For each URL the page is logged, downloaded with Functions.GetHtmlDocumentByUrl and searched for the
//     element with id results. When that element is missing nothing more is done for that URL.
//  3. The total result count is read with Functions.extractFirstOccur. It must pass IsNumeric and differ
//     from 0; dots are then removed and the count is divided by resultsPerPage, rounding up, to get numPages.
//  4. Page 1 reuses the document already downloaded; page x > 1 is fetched from the base URL with
//     lista-1 replaced by lista-x. Result nodes are taken from the element with id searchResultsTbl.
//  5. The first result node WITHOUT the newIcon div ends the work on the current URL: u is incremented by
//     hand and control jumps to the ForUrls label at the top of the for body, or the method returns when
//     that was the last URL. Presumably listings are sorted newest first - TODO confirm.
//  6. A result node with the marker is turned into an adResult (see the notes between the lines below)
//     and handed to saveAd, whose return value is ignored.
//
// Error handling: there are three catch (Exception) levels - per listing, per page and around the whole
// method. Each one writes the message to the console and the logger, increments esitoErrors and appends
// the message to esitoErrorsNotes. The URL loop itself has no catch of its own, so an exception raised
// while loading or inspecting a base page lands in the outermost catch and the remaining URLs are not visited.
//
// NOTE(review): IsNumeric is tested BEFORE the dots are removed from the result count. Depending on how
// IsNumeric treats a dot, counts written with a thousands separator may be rejected - verify.
// NOTE(review): baseHtml is assigned but never read in this method.
static void crawl(SqlConnection connImport01, string contratto, dbparams tip) { try { ArrayList urls = new ArrayList(); foreach (string siglaProvincia in province) { string categoria = ""; switch (tip.idcategoria) { case 1: categoria = “xxxx”; break; case 2: categoria = “xxxx”; break; } url uIT = new url(); uIT.uri = string.Format(“xxxxxxxxxxxxxxxxxxx”, contratto, categoria, tip.tipologia, siglaProvincia.ToLower() ); uIT.siglaprovincia = siglaProvincia; urls.Add(uIT); } for (int u = 0; u < urls.Count; u++) { ForUrls: url uu = (url)urls[u]; logger.Info(uu.uri); string baseUrl = uu.uri; HtmlDocument basedom = new HtmlDocument(); basedom = Functions.GetHtmlDocumentByUrl(baseUrl, logger); string baseHtml = basedom.DocumentNode.InnerHtml; HtmlNode content = basedom.GetElementbyId("results"); if (content != null) { string pagination_results = Functions.extractFirstOccur(content.InnerHtml, " di * xxxx”); if (pagination_results.IsNumeric() && pagination_results.Trim() != "0") { string limit = pagination_results; limit = limit.Replace(".", ""); if (Functions.IsNumeric(limit)) { int numPages = Convert.ToInt32(limit) / resultsPerPage; if (numPages * resultsPerPage < Convert.ToInt32(limit)) { numPages += 1; } for (int x = 1; x <= numPages; x++) { try { Console.WriteLine("Pagina " + x.ToString()); HtmlDocument pagedom = new HtmlDocument(); if (x > 1) { pagedom = Functions.GetHtmlDocumentByUrl(baseUrl.Replace("lista-1", "lista-" + x.ToString()), logger); } else { pagedom = basedom; } HtmlNode searchResults = pagedom.GetElementbyId("searchResultsTbl"); if (searchResults != null) { ArrayList results = Functions.getElementsByClass(searchResults, "resultBody first tier1", "resultBody tier1"); foreach (HtmlNode hn in results) { try { bool nuovoAnnuncio = hn.InnerHtml.Contains("<div class=\"newIcon\"></div>"); if (!nuovoAnnuncio) { u++; if (u == urls.Count) { return; } else { goto ForUrls; } } if (nuovoAnnuncio) { adResult ad = new adResult(); ad.idtipologia = tip.idtipologia; 
// Listing fields taken from the result node:
//  - ad.url is the first href; the portal base URL from AppSettings urlPortale is prepended only after the
//    city has been parsed out of the relative URL.
//  - ad.id is the node id with every letter t removed.
//  - city is cut out of the URL after the tipologia-province- prefix, presumably up to the next dash
//    (depends on the Functions helpers - confirm), with + turned into a space.
//  - imgSrc is cleared when it points at placeholder.jpg; ad.date is the local time of the crawl.
//  - The comune is resolved by name. When it is not found ad.idcomune is not assigned here and the whole
//    block guarded by the idcomune test is skipped, so saveAd is not called for the listing.
//  - The quartiere lookup runs only when comuniConQuartieri contains the city parsed from the URL. If the
//    lookup returns null, idquartiere and idquartiereportale keep whatever adResult initialised them to.
// Fields taken from the detail page downloaded from ad.url:
//  - ad.title first comes from the page title and is printed to the console. It is then replaced by the
//    body paragraph when that is not empty, and cut to 2000 characters (1997 plus three dots).
//  - The price has its dots removed and becomes 0 when it is not numeric.
//  - disambiguateTipologia runs after the title was replaced, so it searches the description text.
ad.idcategoria = tip.idcategoria; ad.url = Functions.extractFirstOccur(hn.InnerHtml, "href=\"*\""); ad.id = hn.Id.Replace("t", ""); string city = Functions.retrieveStringAfterChars(ad.url, tip.tipologia + "-" + uu.siglaprovincia.ToLower() + "-"); city = Functions.retrieveStringBeforeChars(city, "-"); city = city.Replace("+", " "); if (ad.url != "") { ad.url = ConfigurationManager.AppSettings["urlPortale"] + ad.url; } string imgSrc = Functions.extractFirstOccur(hn.InnerHtml, "data-src='*'"); if (imgSrc.Contains("placeholder.jpg")) { imgSrc = ""; } ad.imgSrc = imgSrc; ad.date = DateTime.Now; comune comuneByName = getComuneByName(connImport01, city); if (comuneByName != null) { ad.idcomune = comuneByName.IDComune.ToString(); ad.city = comuneByName.Comune; } if (ad.idcomune != "" && ad.idcomune != null) { string zone = Functions.extractFirstOccur(hn.InnerHtml, "<p class=\"zone\">*</p>"); if (comuniConQuartieri.Contains(city)) { quartiereportale qp = getQuartierePortaleByName(connImport01, comuneByName.IDComune, zone); if (qp != null) { ad.idquartiere = qp.idquartiere; ad.idquartiereportale = qp.nrtavola; } } else { ad.idquartiereportale = 0; ad.idquartiere = 0; } ad.zona = zone; HtmlDocument basedomScheda = new HtmlDocument(); basedomScheda = Functions.GetHtmlDocumentByUrl(ad.url, logger); string htmlScheda = basedomScheda.DocumentNode.InnerHtml; ad.title = Functions.extractFirstOccur(htmlScheda, "<title>*-", "", true); Console.WriteLine(ad.id + " - " + ad.city + " - " + ad.title); string pr = Functions.extractFirstOccur(htmlScheda, "class=\"price\"><span class=\"hidden\">€ *</span>", "", true).Replace(".", ""); if (Functions.IsNumeric(pr)) { ad.price = Convert.ToDecimal(pr); } else { ad.price = 0; } string description = Functions.extractFirstOccur(htmlScheda, "<p class=\"body\">*</p>"); if (description != "") { ad.title = description; } if (ad.title.Length > 2000) { ad.title = ad.title.Substring(0, 1997) + "..."; } ad.idtipologia = disambiguateTipologia(ad, tip); 
// Square metres and room count fall back to 0 when the extracted text is not numeric. The listing is then
// saved. The catch blocks that follow close, in order, the per-listing try, the per-page try and the
// method-wide try.
string mq = Functions.extractFirstOccur(htmlScheda, "<li>Metri quadri:<span>* mq</span></li>"); string locali = Functions.extractFirstOccur(htmlScheda, "<li>Locali:<span>*</span></li>"); if (Functions.IsNumeric(locali)) { ad.locali = Convert.ToInt32(locali); } else { ad.locali = 0; } if (Functions.IsNumeric(mq)) { ad.mq = Convert.ToInt32(mq); } else { ad.mq = 0; } ad.contratto = contratto.Substring(0, 1).ToUpper(); saveAd(connImport01, ad); } } } catch (Exception ex) { Console.WriteLine(ex.Message); logger.Error("Errore", ex); esitoErrors += 1; esitoErrorsNotes += "\r\n" + ex.Message; } } } } catch (Exception ex) { Console.WriteLine(ex.Message); logger.Error("Errore", ex); esitoErrors += 1; esitoErrorsNotes += "\r\n" + ex.Message; } } } } } } } catch (Exception ex) { Console.WriteLine(ex.Message); logger.Error("Errore", ex); esitoErrors += 1; esitoErrorsNotes += "\r\n" + ex.Message; } }