/// <summary>
/// Downloads and parses listing pages starting at <paramref name="url"/>, following the
/// next-page link of each parsed page, and hands every parsed page to the repository.
/// Afterwards downloads and parses each advertisement URL the repository reports as unfinished.
/// </summary>
/// <param name="url">Address of the first listing page to download.</param>
public async Task StartAsync(string url)
{
    ListAdvertisements currentPage = new ListAdvertisements();

    while (true)
    {
        // From the second iteration on, continue with the link found on the previously parsed page.
        url = currentPage.UrlNextPage ?? url;

        string listingHtml = await DownloadHtml(url);
        currentPage = ParsingListAdvertisement(listingHtml);
        repository.AddListAdvertisement(currentPage);

        if (!currentPage.ExistNextPage)
        {
            break;
        }
    }

    // Parse each advertisement separately.
    foreach (var advertisementUrl in repository.GetUnfinishedAdvertisementUrls())
    {
        string advertisementHtml = await DownloadHtml(advertisementUrl);
        Advertisement parsed = await ParsingAdvertisementAsync(advertisementHtml);
        parsed.Url = advertisementUrl;
        repository.AddAdvertisement(parsed);
    }
}
/// <summary>
/// Passes every advertisement of the given listing page to <c>AddAdvertisement</c>, one by one.
/// </summary>
/// <param name="listAdvertisements">Parsed listing page whose advertisements are added.</param>
public void AddListAdvertisement(ListAdvertisements listAdvertisements)
{
    var advertisements = listAdvertisements.Advertisements;

    foreach (var advertisement in advertisements)
    {
        AddAdvertisement(advertisement);
    }
}
/// <summary>
/// Parses the HTML of one listing page: builds an <c>Advertisement</c> (with its <c>House</c> and
/// <c>Owner</c>) for every <c>div[data-name='OfferCard']</c> element and resolves the link to the
/// next listing page from the pagination block.
/// </summary>
/// <remarks>
/// Optional pieces of a card (title parts, price, address parts) that are missing or malformed are
/// left at their default values instead of aborting the whole page. A card without a link is skipped.
/// </remarks>
/// <param name="html">Markup of a listing page.</param>
/// <returns>
/// The advertisements found on the page; <c>ExistNextPage</c> and <c>UrlNextPage</c> are set only
/// when a next-page link could actually be resolved.
/// </returns>
public ListAdvertisements ParsingListAdvertisement(string html)
{
    IHtmlDocument document = new HtmlParser().ParseDocument(html);
    ListAdvertisements result = new ListAdvertisements();

    foreach (var row in document.QuerySelectorAll("div[data-name='OfferCard']"))
    {
        // QuerySelector returns null when nothing matches. A card without a link has no URL to
        // identify it by, so skip it rather than failing the whole page.
        string url = row.QuerySelector(".c6e8ba5398--header--1fV2A")?.GetAttribute("href");
        if (string.IsNullOrEmpty(url))
        {
            continue;
        }

        Owner owner = new Owner();
        House house = new House();
        Advertisement advertisement = new Advertisement();
        advertisement.Url = url;

        // Title, e.g. "3-комн. кв., 97,4 м², 1/16 этаж"; fall back to the alternative title element.
        string title = row.QuerySelector("div[data-name='Title']")?.TextContent
            ?? row.QuerySelector("div[data-name='TopTitle']")?.TextContent;
        ApplyListTitle(title, advertisement, house);

        // Price: keep only the digits of the displayed text; leave the default when absent.
        string priceText = row.QuerySelector(".c6e8ba5398--header--1df-X")?.TextContent;
        if (priceText != null
            && decimal.TryParse(
                string.Concat(priceText.Where(c => char.IsDigit(c))),
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out decimal price))
        {
            advertisement.Price = price;
        }

        house.DistanceInfo = row.QuerySelector(".c6e8ba5398--underground-container--1exfN")?.TextContent;

        // Address, e.g. "Татарстан респ., Казань, р-н Советский, ул. Аделя Кутуя, 83".
        house.FullAdress = row.QuerySelector("div[data-name='AddressItem']")?.TextContent;
        ApplyListAddress(house.FullAdress, house);

        advertisement.Description = row.QuerySelector("div[data-name='Description']")?.TextContent;
        owner.Name = row.QuerySelector("div[data-name='AgentBrandMainInfoComponent']")?.TextContent;
        owner.PhoneNumbers = row.QuerySelector("div[data-name='PhoneButton']")?.TextContent;

        advertisement.DatePublishString =
            row.QuerySelector("div[data-name='TimeLabel'] .c6e8ba5398--absolute--9uFLj")?.TextContent;
        advertisement.DatePublish = ParsDateTime(advertisement.DatePublishString);
        advertisement.DateUpdate = DateTime.Now;
        // Only the listing card was parsed here, so the advertisement is flagged as not fully parsed.
        advertisement.FullParse = false;
        advertisement.House = house;
        advertisement.Owner = owner;

        result.Advertisements.Add(advertisement);
    }

    ApplyListPagination(document, result);
    return result;
}

// Fills RoomCount, TotalArea, Floor and FloorCount from a ", "-separated card title; parts that are absent or unparsable are ignored.
private static void ApplyListTitle(string title, Advertisement advertisement, House house)
{
    if (string.IsNullOrEmpty(title))
    {
        return;
    }

    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    foreach (string part in title.Split(", "))
    {
        if (part.Contains("-комн. кв."))
        {
            string roomDigits = string.Concat(part.Where(c => char.IsDigit(c)));
            if (int.TryParse(roomDigits, System.Globalization.NumberStyles.None, invariant, out int rooms))
            {
                advertisement.RoomCount = rooms;
            }
        }

        if (part.Contains("м²"))
        {
            // The page uses a decimal comma ("97,4"). Normalise it and parse with the invariant
            // culture so the result does not depend on the culture of the running process.
            string areaText = string.Concat(part.Where(c => char.IsDigit(c) || c == ',')).Replace(',', '.');
            if (double.TryParse(areaText, System.Globalization.NumberStyles.AllowDecimalPoint, invariant, out double area))
            {
                advertisement.TotalArea = area;
            }
        }

        if (part.Contains("этаж"))
        {
            // Expected shape: "<floor>/<floor count> этаж".
            int slash = part.IndexOf('/');
            if (slash > 0)
            {
                if (int.TryParse(part.Substring(0, slash), System.Globalization.NumberStyles.Integer, invariant, out int floor))
                {
                    advertisement.Floor = floor;
                }

                int end = part.IndexOf(' ', slash + 1);
                string countText = end < 0
                    ? part.Substring(slash + 1)
                    : part.Substring(slash + 1, end - slash - 1);
                if (int.TryParse(countText, System.Globalization.NumberStyles.Integer, invariant, out int floorCount))
                {
                    house.FloorCount = floorCount;
                }
            }
        }
    }
}

// Fills the address fields of the house from a ", "-separated address; missing trailing parts are left unset.
private static void ApplyListAddress(string fullAddress, House house)
{
    if (string.IsNullOrEmpty(fullAddress))
    {
        return;
    }

    string[] parts = fullAddress.Split(", ");

    // TODO: needs revisiting, because the order of the parts often differs.
    if (parts.Length > 0) { house.Region = parts[0]; }
    if (parts.Length > 1) { house.City = parts[1]; }
    if (parts.Length > 2) { house.District = parts[2]; }
    if (parts.Length > 3) { house.Street = parts[3]; }
    if (parts.Length > 4) { house.NumberHouse = parts[4]; }

    // Parts carrying an explicit marker override the positional guess above.
    foreach (string part in parts)
    {
        if (part.Contains("р-н"))
        {
            house.District = part;
        }

        if (part.Contains("мкр."))
        {
            house.MicroDistrict = part;
        }

        // TODO: consider other variants.
    }
}

// Sets ExistNextPage/UrlNextPage from the pagination block, but only when a next-page link is actually present.
private static void ApplyListPagination(IHtmlDocument document, ListAdvertisements result)
{
    const string siteRoot = "https://kazan.cian.ru";

    var pages = document.QuerySelectorAll("div[data-name='Pagination'] ._93444fe79c--list-item--2KxXr");

    // Stop one short of the end so that pages[i + 1] always exists.
    for (int i = 0; i < pages.Length - 1; i++)
    {
        // The first item whose markup has no "href" is used as the marker (presumably the current
        // page); the item right after it is taken as the next page.
        if (pages[i].OuterHtml.Contains("href"))
        {
            continue;
        }

        string nextUrl = pages[i + 1].QuerySelector("a")?.GetAttribute("href");
        if (!string.IsNullOrEmpty(nextUrl))
        {
            // Relative links are made absolute against the site root.
            if (!nextUrl.StartsWith(siteRoot, StringComparison.Ordinal))
            {
                nextUrl = siteRoot + nextUrl;
            }

            result.UrlNextPage = nextUrl;
            result.ExistNextPage = true;
        }

        break;
    }
}