コード例 #1
0
ファイル: Program.cs プロジェクト: maxross/Crawler
        /// <summary>
        /// Program entry point. Reads the portal id from configuration, fills the
        /// <c>comuniConQuartieri</c> and <c>province</c> lookup lists from two stored
        /// procedures, crawls every configured typology for both the "vendita" and
        /// "affitto" contracts, and finally reports the outcome through
        /// <c>sendStatus</c> (on success and on failure alike).
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // NOTE(review): the cast to byte does not range-check; a configured value
            // above 255 would be truncated unless the project compiles as checked.
            idPortale = (byte)Convert.ToInt32(ConfigurationManager.AppSettings["idPortale"]);
            try
            {
                logger = new Log(typeof(Program), Assembly.GetExecutingAssembly().GetName().Name);
                logger.Info("Crawler: Start crawling. " + DateTime.Now.ToLongDateString());

                // Typologies to crawl; all of them belong to category 1.
                dbparams[] tipologie = new dbparams[]
                {
                    new dbparams { tipologia = “xxxx”, idtipologia = 1, idcategoria = 1 },
                    new dbparams { tipologia = “xxxx”, idtipologia = 3, idcategoria = 1 },
                    new dbparams { tipologia = “xxxx”, idtipologia = 4, idcategoria = 1 },
                    new dbparams { tipologia = “xxxx”, idtipologia = 14, idcategoria = 1 },
                    new dbparams { tipologia = “xxxx”, idtipologia = 11, idcategoria = 1 },
                    new dbparams { tipologia = “xxxx”, idtipologia = 2, idcategoria = 1 },
                    new dbparams { tipologia = “xxxx”, idtipologia = 10, idcategoria = 1 },
                    new dbparams { tipologia = “xxxx”, idtipologia = 141, idcategoria = 1 }
                };

                // The using block guarantees the connection is closed and disposed even
                // when a lookup or a crawl throws; previously it leaked on every failure.
                using (SqlConnection connImport01 = new SqlConnection(ConfigurationManager.ConnectionStrings["DBIMPORT01ConnectionString"].ConnectionString))
                {
                    connImport01.Open();

                    // Lookup 1: names from the "Comune" column, stored lower-cased.
                    using (SqlCommand cmd = new SqlCommand())
                    {
                        cmd.CommandText = “xxxx”;
                        cmd.CommandType = CommandType.StoredProcedure;
                        cmd.Connection = connImport01;
                        cmd.CommandTimeout = 600;
                        cmd.Parameters.Add("@IdPortale", SqlDbType.TinyInt).Value = idPortale;
                        try
                        {
                            using (SqlDataAdapter adapt = new SqlDataAdapter())
                            using (DataTable dTable = new DataTable())
                            {
                                adapt.SelectCommand = cmd;
                                adapt.Fill(dTable);
                                foreach (DataRow dRow in dTable.Rows)
                                {
                                    comuniConQuartieri.Add(((string)dRow["Comune"]).ToLower());
                                }
                            }
                        }
                        catch (SqlException ex)
                        {
                            // Best effort: a failed lookup is logged and the run continues.
                            logger.Error("Errore nella select comuni con quartieri", ex);
                        }
                    }

                    // Lookup 2: province codes from the "SiglaProvincia" column.
                    using (SqlCommand cmd = new SqlCommand())
                    {
                        cmd.CommandText = “xxxx”;
                        cmd.CommandType = CommandType.StoredProcedure;
                        cmd.Connection = connImport01;
                        cmd.CommandTimeout = 600;
                        try
                        {
                            using (SqlDataAdapter adapt = new SqlDataAdapter())
                            using (DataTable dTable = new DataTable())
                            {
                                adapt.SelectCommand = cmd;
                                adapt.Fill(dTable);
                                foreach (DataRow dRow in dTable.Rows)
                                {
                                    province.Add((string)dRow["SiglaProvincia"]);
                                }
                            }
                        }
                        catch (SqlException ex)
                        {
                            // Best effort: a failed lookup is logged and the run continues.
                            logger.Error("Errore nella select province", ex);
                        }
                    }

                    // Every typology is crawled twice: once for sale, once for rent.
                    foreach (dbparams t in tipologie)
                    {
                        logger.Info("Crawling: " + t.tipologia + " vendita");
                        crawl(connImport01, "vendita", t);
                        logger.Info("Crawling: " + t.tipologia + " affitto");
                        crawl(connImport01, "affitto", t);
                    }
                }

                esitoSuccess = true;
                DateTime fineProcedura = DateTime.Now;
                TimeSpan span = fineProcedura.Subtract(inizioProcedura);
                double seconds = span.TotalSeconds;
                sendStatus(esitoSuccess, esitoAnnunciLavorati, esitoErrors, seconds, esitoErrorsNotes);
            }
            catch (Exception ex)
            {
                // logger is still null when the Log constructor itself failed; guarding
                // here keeps the failure status from being lost to a NullReferenceException.
                if (logger != null)
                {
                    logger.Error("Errore", ex);
                }
                esitoSuccess = false;
                DateTime fineProcedura = DateTime.Now;
                TimeSpan span = fineProcedura.Subtract(inizioProcedura);
                double seconds = span.TotalSeconds;
                sendStatus(esitoSuccess, esitoAnnunciLavorati, esitoErrors, seconds, esitoErrorsNotes);
            }
        }
コード例 #2
0
ファイル: Program.cs プロジェクト: maxross/Crawler
 /// <summary>
 /// Returns the typology id to store for an ad, starting from
 /// <c>ad.idtipologia</c> and overriding it with the id of any rule in
 /// <c>tip.disambiguation</c> whose keyword occurs in the ad title.
 /// </summary>
 /// <remarks>
 /// Rules are written as <c>keyword=id</c> and separated by <c>;</c>. The title
 /// is lower-cased before matching while keywords are used as written, so only
 /// lower-case keywords can match. When several rules match, the last one wins.
 /// Malformed rules (no <c>=</c>, empty keyword, non-numeric id) are skipped:
 /// a trailing <c>;</c> used to raise IndexOutOfRangeException because the
 /// resulting empty keyword matches every title.
 /// </remarks>
 /// <param name="ad">Ad whose title is inspected and whose current typology id is the fallback.</param>
 /// <param name="tip">Crawl parameters carrying the optional disambiguation rules.</param>
 /// <returns>The id of the last matching rule, or <c>ad.idtipologia</c> when none matches.</returns>
 static int disambiguateTipologia(adResult ad, dbparams tip)
 {
     int t = ad.idtipologia;
     if (string.IsNullOrEmpty(tip.disambiguation) || string.IsNullOrEmpty(ad.title))
     {
         // No rules, or nothing to match them against.
         return t;
     }

     // Lower-case once instead of once per rule.
     string title = ad.title.ToLower();
     foreach (string pair in tip.disambiguation.Split(';'))
     {
         string[] values = pair.Split('=');
         if (values.Length < 2 || values[0].Length == 0)
         {
             continue;
         }

         int parsed;
         if (title.Contains(values[0]) && int.TryParse(values[1], out parsed))
         {
             t = parsed;
         }
     }
     return t;
 }
コード例 #3
0
ファイル: Program.cs プロジェクト: maxross/Crawler
        /// <summary>
        /// Crawls the listing pages of one typology and contract for every province
        /// in <c>province</c>, and saves each ad flagged as new through <c>saveAd</c>.
        /// </summary>
        /// <remarks>
        /// For each province one listing URL is built. Its first page is downloaded,
        /// the total number of results is read from the "results" element and turned
        /// into a page count using <c>resultsPerPage</c>. Ads are read page by page;
        /// the first ad without the "newIcon" marker ends the work on the current URL.
        /// Exceptions are caught per ad, per page and for the whole method: each one
        /// is logged, counted in <c>esitoErrors</c> and appended to
        /// <c>esitoErrorsNotes</c>, so this method does not throw to its caller.
        /// </remarks>
        /// <param name="connImport01">Connection passed on to getComuneByName, getQuartierePortaleByName and saveAd.</param>
        /// <param name="contratto">Contract segment of the URL; its first letter, upper-cased, is stored on the ad.</param>
        /// <param name="tip">Typology parameters (tipologia, idtipologia, idcategoria, disambiguation rules).</param>
        static void crawl(SqlConnection connImport01, string contratto, dbparams tip)
        {
            try
            {

                // Step 1: build one listing URL per province.
                ArrayList urls = new ArrayList();
                foreach (string siglaProvincia in province)
                {
                
                    // Category segment of the URL; stays empty for ids other than 1 and 2.
                    string categoria = "";
                    switch (tip.idcategoria)
                    {
                        case 1:
                            categoria = “xxxx”;
                            break;
                        case 2:
                            categoria = “xxxx”;
                            break;
                    }

                    
                    url uIT = new url();
                    
                    uIT.uri = string.Format(“xxxxxxxxxxxxxxxxxxx”,
                        contratto,
                        categoria,
                        tip.tipologia,
                        siglaProvincia.ToLower()
                        );
                    uIT.siglaprovincia = siglaProvincia;
                    urls.Add(uIT);
                }
                    // Step 2: walk the URLs. The index u is also advanced manually
                    // further down, together with a jump back to the ForUrls label.
                    for (int u = 0; u < urls.Count; u++)
                    {
                    ForUrls:
                        url uu = (url)urls[u];
                        logger.Info(uu.uri);
                        string baseUrl = uu.uri;
                        // The instance created here is replaced by the next assignment.
                        HtmlDocument basedom = new HtmlDocument();
                        basedom = Functions.GetHtmlDocumentByUrl(baseUrl, logger);
                        string baseHtml = basedom.DocumentNode.InnerHtml;
                        HtmlNode content = basedom.GetElementbyId("results");
                        if (content != null)
                        {
                            // Total number of results as shown in the pagination text.
                            // NOTE(review): presumably '*' marks the part captured by
                            // extractFirstOccur - confirm against Functions.
                            string pagination_results = Functions.extractFirstOccur(content.InnerHtml,
                                " di * xxxx”);
                            // NOTE(review): this numeric check runs on the raw value, before
                            // the '.' separators are stripped below; whether a count such as
                            // "1.234" passes depends on the IsNumeric extension - confirm.
                            if (pagination_results.IsNumeric() && pagination_results.Trim() != "0")
                            {
                                string limit = pagination_results;
                                limit = limit.Replace(".", "");
                                if (Functions.IsNumeric(limit))
                                {
                                    // Page count = result count divided by page size, rounded up.
                                    int numPages = Convert.ToInt32(limit) / resultsPerPage;
                                    if (numPages * resultsPerPage < Convert.ToInt32(limit))
                                    {
                                        numPages += 1;
                                    }

                                    for (int x = 1; x <= numPages; x++)
                                    {
                                        try
                                        {
                                            Console.WriteLine("Pagina " + x.ToString());
                                            // Page 1 reuses the document already downloaded; later
                                            // pages are fetched by swapping "lista-1" for "lista-<x>".
                                            HtmlDocument pagedom = new HtmlDocument();
                                            if (x > 1)
                                            {
                                                pagedom = Functions.GetHtmlDocumentByUrl(baseUrl.Replace("lista-1", "lista-" + x.ToString()), logger);
                                            }
                                            else
                                            {
                                                pagedom = basedom;
                                            }

                                            HtmlNode searchResults = pagedom.GetElementbyId("searchResultsTbl");
                                            if (searchResults != null)
                                            {
                                                ArrayList results = Functions.getElementsByClass(searchResults, "resultBody first tier1", "resultBody tier1");
                                                foreach (HtmlNode hn in results)
                                                {
                                                    try
                                                    {
                                                        bool nuovoAnnuncio = hn.InnerHtml.Contains("<div class=\"newIcon\"></div>");
                                                        if (!nuovoAnnuncio)
                                                        {
                                                            // First ad without the "new" marker: stop working on
                                                            // this URL. The goto lands inside the for body and so
                                                            // skips the loop condition, hence the explicit bounds
                                                            // check before jumping.
                                                            u++;
                                                            if (u == urls.Count)
                                                            {
                                                                return;
                                                            }
                                                            else
                                                            {
                                                                goto ForUrls;
                                                            }

                                                        }
                                                        if (nuovoAnnuncio)
                                                        {
                                                            // Data available on the listing row itself.
                                                            adResult ad = new adResult();
                                                            ad.idtipologia = tip.idtipologia;
                                                            ad.idcategoria = tip.idcategoria;
                                                            ad.url = Functions.extractFirstOccur(hn.InnerHtml, "href=\"*\"");
                                                            // NOTE(review): removes every 't' from the node id, not
                                                            // only a leading one - confirm ids contain no other 't'.
                                                            ad.id = hn.Id.Replace("t", "");

                                                            // The city name is cut out of the ad URL: the text after
                                                            // "<tipologia>-<provincia>-" up to the next '-', with '+'
                                                            // turned into spaces.
                                                            string city = Functions.retrieveStringAfterChars(ad.url, tip.tipologia + "-" + uu.siglaprovincia.ToLower() + "-");
                                                            city = Functions.retrieveStringBeforeChars(city, "-");
                                                            city = city.Replace("+", " ");
                                                            if (ad.url != "")
                                                            {
                                                                // Prefix the configured portal base URL.
                                                                ad.url = ConfigurationManager.AppSettings["urlPortale"] + ad.url;
                                                            }
                                                            string imgSrc = Functions.extractFirstOccur(hn.InnerHtml, "data-src='*'");
                                                            if (imgSrc.Contains("placeholder.jpg"))
                                                            {
                                                                // A placeholder image counts as no image.
                                                                imgSrc = "";
                                                            }
                                                            ad.imgSrc = imgSrc;
                                                            ad.date = DateTime.Now;
                                                            comune comuneByName = getComuneByName(connImport01, city);
                                                            if (comuneByName != null)
                                                            {
                                                                ad.idcomune = comuneByName.IDComune.ToString();
                                                                ad.city = comuneByName.Comune;
                                                            }

                                                            // idcomune is assigned above only when the lookup found a
                                                            // comune; ads without it are not fetched or saved.
                                                            if (ad.idcomune != "" && ad.idcomune != null)
                                                            {
                                                                string zone = Functions.extractFirstOccur(hn.InnerHtml, "<p class=\"zone\">*</p>");
                                                                // District ids are looked up only for cities listed in
                                                                // comuniConQuartieri; otherwise both ids are set to 0.
                                                                // When the lookup returns null the ids are left untouched.
                                                                if (comuniConQuartieri.Contains(city))
                                                                {
                                                                    quartiereportale qp = getQuartierePortaleByName(connImport01, comuneByName.IDComune, zone);
                                                                    if (qp != null)
                                                                    {
                                                                        ad.idquartiere = qp.idquartiere;
                                                                        ad.idquartiereportale = qp.nrtavola;
                                                                    }
                                                                }
                                                                else
                                                                {
                                                                    ad.idquartiereportale = 0;
                                                                    ad.idquartiere = 0;
                                                                }
                                                                ad.zona = zone;

                                                                // Download the ad detail page for the remaining fields.
                                                                HtmlDocument basedomScheda = new HtmlDocument();
                                                                basedomScheda = Functions.GetHtmlDocumentByUrl(ad.url, logger);
                                                                string htmlScheda = basedomScheda.DocumentNode.InnerHtml;
                                                                ad.title = Functions.extractFirstOccur(htmlScheda, "<title>*-", "", true);
                                                                Console.WriteLine(ad.id + " - " + ad.city + " - " + ad.title);
                                                                // Price with the '.' separators removed; 0 when not numeric.
                                                                string pr = Functions.extractFirstOccur(htmlScheda, "class=\"price\"><span class=\"hidden\">€ *</span>", "", true).Replace(".", "");
                                                                if (Functions.IsNumeric(pr))
                                                                {
                                                                    ad.price = Convert.ToDecimal(pr);
                                                                }
                                                                else
                                                                {
                                                                    ad.price = 0;
                                                                }

                                                                // A non-empty description body replaces the page title
                                                                // as the ad title.
                                                                string description = Functions.extractFirstOccur(htmlScheda, "<p class=\"body\">*</p>");
                                                                if (description != "")
                                                                {
                                                                    ad.title = description;
                                                                }
                                                                // Cap the title at 2000 characters, ending in "...".
                                                                if (ad.title.Length > 2000)
                                                                {
                                                                    ad.title = ad.title.Substring(0, 1997) + "...";
                                                                }
                                                                // May override the typology id based on the title text.
                                                                ad.idtipologia = disambiguateTipologia(ad, tip);
                                                                string mq = Functions.extractFirstOccur(htmlScheda,
                                                                    "<li>Metri quadri:<span>* mq</span></li>");
                                                                string locali = Functions.extractFirstOccur(htmlScheda,
                                                                    "<li>Locali:<span>*</span></li>");
                                                                // Rooms and square metres default to 0 when not numeric.
                                                                if (Functions.IsNumeric(locali))
                                                                {
                                                                    ad.locali = Convert.ToInt32(locali);
                                                                }
                                                                else
                                                                {
                                                                    ad.locali = 0;
                                                                }
                                                                if (Functions.IsNumeric(mq))
                                                                {
                                                                    ad.mq = Convert.ToInt32(mq);
                                                                }
                                                                else
                                                                {
                                                                    ad.mq = 0;
                                                                }
                                                                // Contract is stored as its upper-cased first letter.
                                                                ad.contratto = contratto.Substring(0, 1).ToUpper();
                                                                saveAd(connImport01, ad);

                                                            }


                                                        }
                                                    }
                                                    catch (Exception ex)
                                                    {
                                                        // A failing ad is recorded and the next ad is processed.
                                                        Console.WriteLine(ex.Message);
                                                        logger.Error("Errore", ex);
                                                        esitoErrors += 1;
                                                        esitoErrorsNotes += "\r\n" + ex.Message;
                                                    }
                                                }

                                            }

                                        }
                                        catch (Exception ex)
                                        {
                                            // A failing page is recorded and the next page is processed.
                                            Console.WriteLine(ex.Message);
                                            logger.Error("Errore", ex);
                                            esitoErrors += 1;
                                            esitoErrorsNotes += "\r\n" + ex.Message;
                                        }



                                    }
                                }
                            }

                        }

                    }

              



            }
            catch (Exception ex)
            {
                // Anything not handled above ends this crawl; the error is recorded
                // and control returns to the caller.
                Console.WriteLine(ex.Message);
                logger.Error("Errore", ex);
                esitoErrors += 1;
                esitoErrorsNotes += "\r\n" + ex.Message;
            }
        }