Esempio n. 1
0
        /// <summary>
        /// Parses a listing page into ads, one per <c>div[@class='container']</c> element
        /// that contains a title link.
        /// </summary>
        /// <param name="doc">The downloaded listing page.</param>
        /// <param name="scanPage">
        /// Scan page definition; its <c>HostUrl</c> is prefixed to each ad link and its
        /// <c>Host</c>/<c>HostUrl</c> are stored on every created ad.
        /// </param>
        /// <returns>The parsed ads; an empty list when the page contains no matching elements.</returns>
        public List<Ad> ParseHomePage(HtmlDocument doc, ScanPageDto scanPage)
        {
            List<Ad> adsList = new List<Ad>();

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection containers = doc.DocumentNode.SelectNodes("//div[@class='container']");
            if (containers == null)
            {
                return adsList;
            }

            foreach (HtmlNode container in containers)
            {
                HtmlNode link = container.SelectSingleNode("div[@class='title']/a");
                string href = link?.Attributes["href"]?.Value;
                if (href == null)
                {
                    // A container without a title link is not an ad tile; skip it instead of crashing.
                    continue;
                }

                string title = link.InnerText.Trim();
                string url = scanPage.HostUrl + href;

                // The ad identifier is the last path segment of the ad URL.
                string idAds = url.Split('/').Last();

                HtmlNode priceNode = container.SelectSingleNode(
                    "div[@class='info']/div[@class='price']/span[@class='value']/span[@class='amount']");
                decimal price = ScrapExtensions.ConvertStringToDecimal(priceNode?.InnerText);

                adsList.Add(Ad.Create(Guid.NewGuid(), idAds, title, url, price, scanPage.Host, scanPage.HostUrl));
            }

            return adsList;
        }
Esempio n. 2
0
        /// <summary>
        /// Parses a listing page into ads, one per <c>article</c> element found under
        /// <c>div[@class='row'] / div</c>.
        /// </summary>
        /// <param name="doc">The downloaded listing page.</param>
        /// <param name="scanPage">Scan page definition; its <c>Host</c> and <c>HostUrl</c> are stored on every created ad.</param>
        /// <returns>
        /// The parsed ads; an empty list when the page contains no matching elements.
        /// Articles missing the link, the title or the <c>data-tracking-id</c> attribute are skipped.
        /// </returns>
        public List<Ad> ParseHomePage(HtmlDocument doc, ScanPageDto scanPage)
        {
            List<Ad> adsList = new List<Ad>();

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection articles = doc.DocumentNode.SelectNodes("//div[@class='row'] / div / article");
            if (articles == null)
            {
                return adsList;
            }

            foreach (HtmlNode article in articles)
            {
                string url = article
                    .SelectSingleNode("div[@class='offer-item-details'] / header / h3 / a")
                    ?.Attributes["href"]
                    ?.Value;
                string title = article
                    .SelectSingleNode("div[@class='offer-item-details'] / header / h3 / a / span / span")
                    ?.InnerText.Trim();
                string idAds = article.Attributes["data-tracking-id"]?.Value;

                if (url == null || title == null || idAds == null)
                {
                    // Incomplete tile: skip it rather than abort the whole page.
                    continue;
                }

                // A missing price node yields null, which is passed to the converter as-is.
                string priceText = article
                    .SelectSingleNode("div[@class='offer-item-details'] / ul / li[@class='offer-item-price']")
                    ?.InnerText.Trim();
                decimal price = ScrapExtensions.ConvertStringToDecimal(priceText);

                adsList.Add(Ad.Create(Guid.NewGuid(), idAds, title, url, price, scanPage.Host, scanPage.HostUrl));
            }

            return adsList;
        }
Esempio n. 3
0
        /// <summary>
        /// Parses a listing page into ads, one per <c>div</c> found under
        /// <c>tbody / tr[@class='wrap'] / td</c>. Each such div is expected to hold a table whose
        /// first row carries the link (cell 1), the title (cell 2) and the price (cell 3).
        /// </summary>
        /// <param name="doc">The downloaded listing page.</param>
        /// <param name="scanPage">Scan page definition; its <c>Host</c> and <c>HostUrl</c> are stored on every created ad.</param>
        /// <returns>
        /// The parsed ads; an empty list when the page contains no matching elements.
        /// Entries missing the link, the title or the table's <c>data-id</c> attribute are skipped.
        /// </returns>
        public List<Ad> ParseHomePage(HtmlDocument doc, ScanPageDto scanPage)
        {
            List<Ad> adsList = new List<Ad>();

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection wrappers = doc.DocumentNode.SelectNodes("// tbody / tr[@class='wrap'] / td / div");
            if (wrappers == null)
            {
                return adsList;
            }

            foreach (HtmlNode wrapper in wrappers)
            {
                HtmlNode row = wrapper.SelectSingleNode("table / tbody / tr[1]");
                if (row == null)
                {
                    continue;
                }

                string url = row.SelectSingleNode("td[1] / a")?.Attributes["href"]?.Value;
                string title = row.SelectSingleNode("td[2] / div / h3 / a / strong")?.InnerText.Trim();
                string idAds = wrapper.SelectSingleNode("table")?.Attributes["data-id"]?.Value;

                if (url == null || title == null || idAds == null)
                {
                    // Incomplete entry: skip it rather than abort the whole page.
                    continue;
                }

                HtmlNode priceNode = row.SelectSingleNode("td[3] / div / p / strong");
                decimal price = ScrapExtensions.ConvertStringToDecimal(priceNode?.InnerText);

                adsList.Add(Ad.Create(Guid.NewGuid(), idAds, title, url, price, scanPage.Host, scanPage.HostUrl));
            }

            return adsList;
        }
        /// <summary>
        /// Verifies that <c>ScrapExtensions.ConvertStringToInt</c> converts
        /// <paramref name="value"/> to <paramref name="expectedResult"/>.
        /// </summary>
        /// <param name="value">Raw text to convert.</param>
        /// <param name="expectedResult">Integer the conversion is expected to produce.</param>
        public void conver_string_to_int(string value, int expectedResult)
        {
            // Act
            int result = ScrapExtensions.ConvertStringToInt(value);

            // Assert
            // Expected value goes first so that failure messages label the values correctly.
            Assert.Equal(expectedResult, result);
        }
        /// <summary>
        /// Verifies that <c>ScrapExtensions.ConvertStringToDecimal</c> converts
        /// <paramref name="value"/> to <paramref name="expectedResult"/>.
        /// </summary>
        /// <param name="value">Raw text to convert.</param>
        /// <param name="expectedResult">Decimal the conversion is expected to produce.</param>
        public void convert_string_to_decimal(string value, decimal expectedResult)
        {
            // Act
            decimal result = ScrapExtensions.ConvertStringToDecimal(value);

            // Assert
            // Expected value goes first so that failure messages label the values correctly.
            Assert.Equal(expectedResult, result);
        }
Esempio n. 6
0
        /// <summary>
        /// Scrapes every active scan page: picks the scraper type whose name contains the page's
        /// host, parses the listing page and, for each ad whose <c>IdAds</c> is not yet known,
        /// downloads and parses the details page and stores the ad in the repository.
        /// </summary>
        /// <returns>A task that completes when all active scan pages have been processed.</returns>
        /// <exception cref="Exception">
        /// No scraper type matches a scan page's host, or the listing page could not be downloaded.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// The matched scraper type could not be used as an <c>IScraper</c>.
        /// </exception>
        public async Task ScrapAsync()
        {
            Logger.Information("Start ScrapAsync");
            IEnumerable<Type> scraperTypes = ScrapExtensions.GetScraperTypes();

            // Await instead of blocking on .Result inside an async method.
            IEnumerable<ScanPageDto> scanPages = (await _scanPageService.GetAllAsync()).Where(x => x.Active).ToList();
            IEnumerable<Ad> adsDb = await _adRepository.GetAllAsync();

            // Identifiers already stored. Ads added during this run are appended so the same
            // ad appearing on several pages is saved only once.
            HashSet<string> knownIds = new HashSet<string>(adsDb.Select(x => x.IdAds));

            foreach (ScanPageDto scanPage in scanPages)
            {
                Logger.Information($"Start scrap page, url = '{scanPage.UrlAddress}'");

                // Strip the "Scraper" suffix BEFORE lower-casing; once lower-cased, the
                // capitalised suffix can no longer be found.
                Type scrapClass = scraperTypes
                                  .FirstOrDefault(x => x.Name
                                                  .Replace("Scraper", "")
                                                  .ToLower()
                                                  .Contains(scanPage.Host.ToLower()));
                if (scrapClass == null)
                {
                    throw new Exception(
                              $"Invalid scan page, UrlAddress='{scanPage.UrlAddress}', Page='{scanPage.Host}'.");
                }

                scraperInstance = Activator.CreateInstance(scrapClass) as IScraper;
                if (scraperInstance == null)
                {
                    throw new InvalidOperationException(
                              $"Type '{scrapClass.Name}' cannot be used as IScraper.");
                }

                HtmlDocument scrapedDoc = ScrapExtensions.ScrapUrl(scanPage.UrlAddress);
                if (scrapedDoc == null)
                {
                    throw new Exception(
                              $"Problem with scrap page = '{scanPage.UrlAddress}', scrapClass='{scrapClass.Name}'.");
                }

                List<Ad> ads = scraperInstance.ParseHomePage(scrapedDoc, scanPage);

                foreach (Ad ad in ads)
                {
                    if (knownIds.Contains(ad.IdAds))
                    {
                        continue;
                    }

                    HtmlDocument scrapedSubPage = ScrapExtensions.ScrapUrl(ad.Url);
                    if (scrapedSubPage == null)
                    {
                        // The details page could not be downloaded; the ad is not stored,
                        // so it is picked up again on the next run.
                        Logger.Warning("Problem with scrap details page: {@ad}", ad);
                        continue;
                    }

                    ad.AdDetails = scraperInstance.ParseDetailsPage(scrapedSubPage, ad);

                    await _adRepository.AddAsync(ad);
                    knownIds.Add(ad.IdAds);
                }
                Logger.Information($"Complited page='{scanPage.UrlAddress}', scraped '{ads.Count}' pages.");
            }
            Logger.Information("End ScrapAsync");
        }
Esempio n. 7
0
        /// <summary>
        /// Builds the details of one ad from its details page. Attributes are read from the
        /// name/value rows under <c>ul[@class='selMenu'] / li / div</c>; the user name comes from
        /// the link inside <c>span[@class='username']</c>.
        /// </summary>
        /// <param name="doc">The downloaded details page.</param>
        /// <param name="ad">The ad the page belongs to; its <c>Price</c> is used to compute the price per square metre.</param>
        /// <returns>
        /// The parsed details. Attributes absent from the page keep their defaults
        /// (null text, zero numbers, <c>DateTime.UtcNow</c> as creation time).
        /// </returns>
        public AdDetails ParseDetailsPage(HtmlDocument doc, Ad ad)
        {
            DateTime createAt = DateTime.UtcNow;
            string district = null;
            string city = null;
            string typeOfProperty = null;
            bool agency = false;
            int numberOfRooms = 0;
            int numberOfBathrooms = 0;
            float size = 0;
            decimal priceM2 = 0;

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection docs = doc.DocumentNode.SelectNodes("//ul[@class='selMenu'] / li / div");
            if (docs != null)
            {
                foreach (HtmlNode docParameter in docs)
                {
                    string nameParam = docParameter.SelectSingleNode("span[@class='name']")?.InnerText.Trim();
                    string valueParam = docParameter.SelectSingleNode("span[@class='value']")?.InnerText.Trim();
                    if (nameParam.Empty() || valueParam.Empty())
                    {
                        // Skip only this incomplete row; the remaining rows are still parsed.
                        continue;
                    }

                    switch (nameParam)
                    {
                    case "Data dodania":
                        // Keep the default when the date cannot be parsed, instead of
                        // overwriting it with DateTime.MinValue.
                        if (DateTime.TryParse(valueParam, out DateTime parsedDate))
                        {
                            createAt = parsedDate;
                        }
                        break;

                    case "Lokalizacja":
                        // Expected form is "district, city"; the city part may be missing.
                        string[] location = valueParam.Split(",");
                        district = location[0].Trim();
                        if (location.Length > 1)
                        {
                            city = location[1].Trim();
                        }
                        break;

                    case "Na sprzedaż przez":
                        // Anything other than the owner is treated as an agency.
                        agency = valueParam != "Właściciel";
                        break;

                    case "Rodzaj nieruchomości":
                        typeOfProperty = valueParam;
                        break;

                    case "Liczba pokoi":
                        numberOfRooms = ScrapExtensions.ConvertStringToInt(valueParam);
                        break;

                    case "Liczba łazienek":
                        numberOfBathrooms = ScrapExtensions.ConvertStringToInt(valueParam);
                        break;

                    case "Wielkość (m2)":
                        size = ScrapExtensions.ConvertStringToFloat(valueParam);
                        break;

                    default:
                        break;
                    }
                }
            }

            if (size != 0)
            {
                priceM2 = decimal.Round(ad.Price / (decimal)size, 2, MidpointRounding.AwayFromZero);
            }

            HtmlNode usernameNode = doc.DocumentNode.SelectSingleNode("//span[@class='username'] / a /text()");
            string username = usernameNode?.InnerText.Trim();

            return AdDetails.Create(
                priceM2,
                district,
                city,
                agency,
                typeOfProperty,
                numberOfRooms,
                numberOfBathrooms,
                size,
                username,
                new List<string>(),
                createAt);
        }
Esempio n. 8
0
        /// <summary>
        /// Builds the details of one ad from its details page: image URLs, number of rooms and
        /// size from <c>ul[@class='main-list']</c>, building type from <c>ul[@class='sub-list']</c>,
        /// city/district from the address links, the advertiser name and whether the offer
        /// comes from an agency.
        /// </summary>
        /// <param name="doc">The downloaded details page.</param>
        /// <param name="ad">The ad the page belongs to; its <c>Price</c> is used to compute the price per square metre.</param>
        /// <returns>
        /// The parsed details, or <c>null</c> (after logging an error) when the page has no
        /// <c>main-list</c> items. Missing city, district or user name are reported as "-".
        /// </returns>
        public AdDetails ParseDetailsPage(HtmlDocument doc, Ad ad)
        {
            string typeOfProperty = null;
            int numberOfRooms = 0;
            int numberOfBathrooms = 0; // never read from this page layout
            float size = 0;
            decimal priceM2 = 0;
            List<string> images = new List<string>();

            HtmlNodeCollection docs = doc.DocumentNode.SelectNodes("//ul[@class='main-list'] / li");
            if (docs == null)
            {
                Logger.Error("Docs is null. Perhaps problem with scrap url: {@ad}", ad);
                return null;
            }

            // images (SelectNodes returns null when the page has none)
            HtmlNodeCollection imageNodes =
                doc.DocumentNode.SelectNodes("//figure[@itemprop='associatedMedia'] / a / img");
            if (imageNodes != null)
            {
                foreach (HtmlNode img in imageNodes)
                {
                    string src = img.Attributes["src"]?.Value;
                    if (!string.IsNullOrEmpty(src))
                    {
                        images.Add(src);
                    }
                }
            }

            foreach (HtmlNode docParameter in docs)
            {
                string nameParam = docParameter.SelectSingleNode("text()")?.InnerText.Trim();
                string valueParam = docParameter.SelectSingleNode("span / strong")?.InnerText.Trim();

                switch (nameParam)
                {
                case "Liczba pokoi":
                    numberOfRooms = ScrapExtensions.ConvertStringToInt(valueParam);
                    break;

                case "Powierzchnia":
                    if (valueParam != null)
                    {
                        // Cut everything from the first separator that follows the leading token.
                        // Cutting by index avoids string.Replace("", ...) which throws when
                        // the pattern does not match.
                        // NOTE(review): the pattern also cuts at a decimal comma
                        // ("45,5 m2" becomes "45") - confirm this is intended.
                        Match suffix = Regex.Match(valueParam, @"\b[,); +]+.*$");
                        string sizeText = suffix.Success ? valueParam.Substring(0, suffix.Index) : valueParam;
                        size = ScrapExtensions.ConvertStringToFloat(sizeText);
                    }
                    break;

                default:
                    // Other rows (price, floor, ...) are not used.
                    break;
                }
            }

            // Only the building type is taken from the sub-list; other entries are ignored.
            HtmlNodeCollection subDocs = doc.DocumentNode.SelectNodes("//ul[@class='sub-list'] / li");
            if (subDocs != null)
            {
                foreach (HtmlNode subDoc in subDocs)
                {
                    string nameParam = subDoc.SelectSingleNode("strong")?.InnerText.Trim().Replace(":", "");
                    if (nameParam == "Rodzaj zabudowy")
                    {
                        typeOfProperty = subDoc.SelectSingleNode("text()")?.InnerText.Trim();
                    }
                }
            }

            // location: the second address link is the city, the third (optional) the district
            HtmlNodeCollection location =
                doc.DocumentNode.SelectNodes("//address / p[@class='address-links'] / a");
            int locationCount = location?.Count ?? 0;
            string city = locationCount > 1 ? location[1].InnerText.Trim() : "-";
            string district = locationCount > 2 ? location[2].InnerText.Trim() : "-";

            // price m2
            if (size != 0)
            {
                priceM2 = decimal.Round(ad.Price / (decimal)size, 2, MidpointRounding.AwayFromZero);
            }

            // user: the name is either directly in the person box or wrapped in a link
            HtmlNode userNode =
                doc.DocumentNode.SelectSingleNode("//div[@class='box-person'] / span[@itemprop='name']")
                ?? doc.DocumentNode.SelectSingleNode("//div[@class='box-person'] / a / span[@itemprop='name']");
            string username = userNode?.InnerText?.Trim();
            username = username.Empty() ? "-" : username;

            // agency: an explicit "private offer" marker wins; without any box title the
            // offer is assumed to come from an agency
            HtmlNodeCollection boxTitles = doc.DocumentNode.SelectNodes("//h5[@class='box-title']");
            bool? agent = boxTitles?.Any(x => x?.InnerText?.Trim() == "Biuro nieruchomości");

            HtmlNodeCollection contactTypes = doc.DocumentNode.SelectNodes("//h6[@class='box-contact-info-type']");
            bool? priv = contactTypes?.Any(x => x?.InnerText?.Trim() == "Oferta prywatna");

            bool agency = !priv.GetValueOrDefault(false) && agent.GetValueOrDefault(true);

            return AdDetails.Create(
                priceM2,
                district,
                city,
                agency,
                typeOfProperty ?? "blok",
                numberOfRooms,
                numberOfBathrooms,
                size,
                username,
                images,
                DateTime.UtcNow);
        }
Esempio n. 9
0
        /// <summary>
        /// Builds the details of one ad from its details page: location and creation time from
        /// the <c>offer-titlebox__details</c> block, attributes from the <c>offerdescription</c>
        /// table and the user name from the <c>offer-user__details</c> block.
        /// When the title box is missing and the ad URL contains "otodom", the ad's host is
        /// switched to otodom and parsing is delegated to <c>OtodomScraper</c>.
        /// </summary>
        /// <param name="doc">The downloaded details page.</param>
        /// <param name="ad">The ad the page belongs to; its host may be changed as described above.</param>
        /// <returns>
        /// The parsed details, or <c>null</c> (after logging an error) when the title box is
        /// missing and the URL does not point to otodom.
        /// </returns>
        public AdDetails ParseDetailsPage(HtmlDocument doc, Ad ad)
        {
            DateTime createAt = DateTime.UtcNow;
            string district = null;
            string city = null;
            string typeOfProperty = null;
            bool agency = false;
            int numberOfRooms = 0;
            int numberOfBathrooms = 0; // never read from this page layout
            float size = 0;
            decimal priceM2 = 0;

            HtmlNode details = doc.DocumentNode.SelectSingleNode(
                "//div[@class='offer-titlebox'] / div[@class='offer-titlebox__details']");

            if (details == null)
            {
                if (ad.Url.Contains("otodom"))
                {
                    Logger.Warning("Start scrap otodom: {@ad}.", ad);
                    ad.SetHostUrl("https://www.otodom.pl");
                    ad.SetHost("otodom");
                    return new OtodomScraper().ParseDetailsPage(doc, ad);
                }
                Logger.Error("Docs is null and url not contains otodom: {@ad}.", ad);

                return null;
            }

            // Location is a comma-separated list: the first part is used as the city and the
            // third as the district; shorter lists leave the district as "-".
            string locationText = details.SelectSingleNode("a")?.InnerText;
            if (locationText != null)
            {
                string[] location = locationText.Split(",");
                city = location[0].Trim();
                district = location.Length > 2 ? location[2].Trim() : "-";
            }

            // Creation time: strip the leading text up to "o " and the trailing ", ID..." part,
            // then parse the remainder as a Polish date. The default (UtcNow) is kept when
            // the text is missing or has an unexpected format.
            string createAtText = details.SelectSingleNode("em")?.InnerText.Trim();
            if (createAtText != null)
            {
                string withoutPrefix = Regex.Replace(createAtText, "^[^_]*o ", "");
                string dateText = Regex.Replace(withoutPrefix, ", ID.*$", "");

                if (DateTime.TryParseExact(
                        dateText,
                        "HH:mm, d MMMM yyyy",
                        CultureInfo.CreateSpecificCulture("pl-PL"),
                        DateTimeStyles.None,
                        out DateTime parsedDate))
                {
                    createAt = parsedDate;
                }
                else
                {
                    Logger.Warning("Cannot parse creation date of ad: {@ad}.", ad);
                }
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection offerDescriptions = doc.DocumentNode.SelectNodes(
                "//div[@id='offerdescription'] / div[contains(@class, 'descriptioncontent')] / table / tr / td");

            if (offerDescriptions != null)
            {
                foreach (HtmlNode description in offerDescriptions)
                {
                    string name = description.SelectSingleNode("table / tr / th")?.InnerText.Trim();
                    string value = description.SelectSingleNode("table / tr / td / strong")?.InnerText?.Trim();

                    switch (name)
                    {
                    case "Oferta od":
                        // Anything other than a private person is treated as an agency.
                        agency = value != "Osoby prywatnej";
                        break;

                    case "Cena za m2":
                        priceM2 = ScrapExtensions.ConvertStringToDecimal(value);
                        break;

                    case "Rodzaj zabudowy":
                        typeOfProperty = value;
                        break;

                    case "Powierzchnia":
                        if (value != null)
                        {
                            size = ScrapExtensions.ConvertStringToFloat(value.Replace("m2", ""));
                        }
                        break;

                    case "Liczba pokoi":
                        numberOfRooms = ScrapExtensions.ConvertStringToInt(value);
                        break;

                    default:
                        // Other rows (level, furnishing, market, finances, ...) are not used.
                        break;
                    }
                }
            }

            HtmlNode usernameNode =
                doc.DocumentNode.SelectSingleNode("//div[@class='offer-user__details'] / h4 / a");
            string username = usernameNode?.InnerText?.Trim();

            return AdDetails.Create(
                priceM2,
                district,
                city,
                agency,
                typeOfProperty,
                numberOfRooms,
                numberOfBathrooms,
                size,
                username,
                new List<string>(),
                createAt);
        }