/// <summary>
/// Logs in to airdna.co, queries the market-search endpoint for the term "Israel"
/// and writes the raw JSON response to <c>{state.RootPath}/cities.json</c>.
/// </summary>
/// <param name="state">Scraper state; only <c>RootPath</c> is read here.</param>
/// <returns>
/// Always an empty list: the search result is persisted to disk, not returned.
/// </returns>
/// <remarks>
/// On a successful login the received token is stored in <c>_token</c>.
/// When the login is rejected the method reports the error and returns without
/// calling the search endpoint (there is no usable token to call it with).
/// HTTP calls are waited on synchronously via <c>.Result</c>.
/// </remarks>
private List<string> ScrapePhase1_GenerateListCities_WebClient(ScraperAirdnaStateModel state)
{
    var list = new List<string>();

    // Search endpoint shape:
    // https://api.airdna.co/v1/market/search?access_token=<token>&term=Israel
    // (never paste a real token into source comments)
    var loginUrl = "https://www.airdna.co/api/v1/account/login";
    var loginResult = loginUrl
        .PostUrlEncodedAsync(new { username = "******", password = "******", remember_me = "true" })
        .ReceiveJson<ResponseLogin>()
        .Result;

    // Ordinal, case-insensitive comparison: the status is a protocol value, not
    // user-facing text, so it must not depend on the current culture.
    if (!string.Equals(loginResult?.Status, "success", StringComparison.OrdinalIgnoreCase))
    {
        Console.WriteLine($"Error authorizations on airdna.co site.");
        return list;
    }

    var token = loginResult.Token;
    _token = token;

    var searchUrl = $"https://api.airdna.co/v1/market/search?access_token={token}&term=Israel";
    var data = searchUrl.GetAsync().ReceiveJson().Result;

    var filename = $"{state.RootPath}/cities.json";
    File.WriteAllText(filename, JsonConvert.SerializeObject(data, Newtonsoft.Json.Formatting.Indented));

    return list;
}
/// <summary>
/// Runs the Airdna pipeline: optional scrape, Excel export of the scraper's
/// domain model, and archiving of the saved file.
/// </summary>
/// <param name="isNew">Copied into the <c>IsNew</c> flag of the state created for this run.</param>
/// <param name="needScrape">
/// When <c>false</c>, <c>Scrape()</c> is skipped and the domain model is requested
/// from the scraper straight away.
/// </param>
private void _airdnaScrapeThenSaveStore(bool isNew = false, bool needScrape = true)
{
    _log($"Start AirdnaScrapeThenSaveStore (isNew={isNew})");

    var state = new ScraperAirdnaStateModel { IsNew = isNew };
    var scraper = new ScraperAirdna(state);
    if (needScrape)
    {
        scraper.Scrape();
    }

    // Export: domain model -> Excel data -> file on disk.
    var domainModel = scraper.GetDomainModel();
    var excel = new ExcelAirdnaService(state);
    var savedPath = excel.SaveToFile(excel.CreateExcel(domainModel));

    new ArchiveRepository().Save(savedPath, state.TypeScraper);

    _log($"End AirdnaScrapeThenSaveStore (isNew={isNew}), Spent time {_calcSpentTime2String(state)}");
}
/// <summary>
/// Fetches the ad details, coordinates, phones and details for one ad and saves
/// them as a single <c>DetailsItemDtoModel</c> under the ad's id.
/// </summary>
/// <param name="item">The ad to download; its <c>Id</c> is used as the storage key.</param>
/// <param name="state">Scraper state forwarded to the coordinate lookup and the save call.</param>
/// <returns>
/// <c>true</c> when everything was fetched and saved; <c>false</c> when any step
/// threw (the exception is logged, not rethrown).
/// </returns>
private async Task<bool> _downloadItemAsync(AdDtoModel item, ScraperAirdnaStateModel state)
{
    var id = item.Id;
    try
    {
        // All four lookups are started before any of them is awaited.
        var adDetailsTask = _getAdDetailsFromService(item);
        var coordinatesTask = _getCoordinatesFromServiceAsync(item, state);
        var phonesTask = _getPhonesFromService(item);
        var detailsTask = _getDetailsFromService(item);

        var dto = new DetailsItemDtoModel
        {
            Coordinates = await coordinatesTask,
            Phones = await phonesTask,
            Details = await detailsTask,
            AdDetails = await adDetailsTask,
            RowDataFromPage = item,
        };

        await _saveItemDetailsAsync(id, dto, state);
        return true;
    }
    catch (Exception exception)
    {
        _log($"Error-f1. {exception.Message} / {exception.StackTrace}");
        return false;
    }
}
/// <summary>
/// Downloads every page that the stored page list still marks as not done.
/// </summary>
/// <param name="state">Scraper state forwarded to the loader, the fix-up step and each page download.</param>
/// <returns>A task that completes when all page downloads have finished.</returns>
/// <remarks>
/// Downloads run in parallel via <c>Task.Run</c>. Their results are written back
/// into the page list only after all of them completed, on the awaiting flow,
/// so the non-thread-safe dictionary is never mutated concurrently.
/// The updated list is not persisted here (the save call is disabled).
/// </remarks>
private async Task ScrapePhase1_DownloadPages_Scrapy(ScraperAirdnaStateModel state)
{
    SetWorkPhaseBase("DownloadPages", state);

    var listPages = _loadListCities(state);
    if (listPages == null)
    {
        // _loadListCities returns null when the list file does not exist.
        _log($"List of pages not found, nothing to download");
        return;
    }

    fixListPages(listPages, state);

    var needToDo = listPages.Where(x => x.Value == false).Select(x => x.Key).ToList();
    var tasks = needToDo
        .Select(page => Task.Run(async () =>
            new KeyValuePair<int, bool>(page, await GetPageAsync(page, state))))
        .ToList();

    // Await instead of blocking a thread with Task.WaitAll inside an async method.
    var outcomes = await Task.WhenAll(tasks);
    foreach (var outcome in outcomes)
    {
        listPages[outcome.Key] = outcome.Value;
    }
    //await _saveListCitiesAsync(listPages, state);

    _log($"Phase done");
}
/// <summary>
/// Serializes the ads of one page as indented JSON into
/// <c>{state.PagesPath}/page-{page}.json</c> and logs the file name.
/// </summary>
/// <param name="page">Page number used in the file name.</param>
/// <param name="listAdFromPage">Ads to serialize.</param>
/// <param name="state">Scraper state; <c>PagesPath</c> is the target directory.</param>
private void _savePage(int page, List<AdDtoModel> listAdFromPage, ScraperAirdnaStateModel state)
{
    var filename = $"{state.PagesPath}/page-{page}.json";
    var json = JsonConvert.SerializeObject(listAdFromPage, Newtonsoft.Json.Formatting.Indented);

    File.WriteAllText(filename, json);
    _log($"Save file {filename}");
}
/// <summary>
/// Counts the items collected from the stored pages.
/// </summary>
/// <param name="state">Scraper state forwarded to the item collector.</param>
/// <returns>Number of entries returned by <c>_statusWorkspace_AmountItemsFromPages_GetItems</c> (duplicates included).</returns>
private int _statusWorkspace_AmountItemsFromPages(ScraperAirdnaStateModel state)
{
    var itemsFromPages = _statusWorkspace_AmountItemsFromPages_GetItems(state);
    var amount = itemsFromPages.Count();
    return amount;
}
/// <summary>
/// Writes a raw response to <c>{state.ItemsPath}/{id}.xml</c> and logs the file name.
/// </summary>
/// <param name="id">Item id; used as the file name.</param>
/// <param name="response">Text written to the file as-is.</param>
/// <param name="state">Scraper state; <c>ItemsPath</c> is the target directory.</param>
private async Task _saveItemAsync(string id, string response, ScraperAirdnaStateModel state)
{
    var targetFile = $"{state.ItemsPath}/{id}.xml";

    await File.WriteAllTextAsync(targetFile, response);

    _log($"Save xml-data file {targetFile}");
}
/// <summary>
/// Serializes the item list as indented JSON into <c>state.ListItemsFilename</c>,
/// logging before and after the write.
/// </summary>
/// <param name="list">Items keyed by id.</param>
/// <param name="state">Scraper state; <c>ListItemsFilename</c> is the target file.</param>
private void _saveListItems(Dictionary<string, AdDtoModel> list, ScraperAirdnaStateModel state)
{
    var filename = state.ListItemsFilename;
    _log($"Saving list-items: {filename}");

    var json = JsonConvert.SerializeObject(list, Newtonsoft.Json.Formatting.Indented);
    File.WriteAllText($"{filename}", json);

    _log($"Save list-items done");
}
/// <summary>
/// Downloads page 1 and extracts the total number of pages from it.
/// </summary>
/// <param name="state">Scraper state forwarded to the page download.</param>
/// <returns>The value produced by <c>ScrapeAmountPages</c> for the first page.</returns>
private async Task<int> ScrapePhase1_GetAmountPages_WebClientAsync(ScraperAirdnaStateModel state)
{
    var firstPage = await GetPage_WebClientAsync(1, state);
    return ScrapeAmountPages(firstPage);
}
/// <summary>
/// Parses the ads of both item types (<c>Rent</c> and <c>RentTivuch</c>) from a page.
/// </summary>
/// <param name="html">Parsed page document.</param>
/// <param name="state">Not used by this method.</param>
/// <returns>The <c>Rent</c> ads followed by the <c>RentTivuch</c> ads.</returns>
private List<AdDtoModel> ParseAdsFromPage(HtmlDocument html, ScraperAirdnaStateModel state)
{
    var rentAds = ParseAdsFromPage_Rent(html, EnumTypeItems.Rent);
    var rentTivuchAds = ParseAdsFromPage_Rent(html, EnumTypeItems.RentTivuch);

    var combined = new List<AdDtoModel>();
    combined.AddRange(rentAds);
    combined.AddRange(rentTivuchAds);
    return combined;
}
/// <summary>
/// Loads the page list (page number to done flag) from <c>state.ListPagesFilename</c>.
/// </summary>
/// <param name="state">Scraper state; <c>ListPagesFilename</c> is the file to read.</param>
/// <returns>The deserialized dictionary, or <c>null</c> when the file does not exist.</returns>
private Dictionary<int, bool> _loadListCities(ScraperAirdnaStateModel state)
{
    var filename = state.ListPagesFilename;
    if (!File.Exists(filename))
    {
        return null;
    }

    var json = File.ReadAllText(filename);
    return JsonConvert.DeserializeObject<Dictionary<int, bool>>(json);
}
/// <summary>
/// Loads the ads stored for one page from <c>{state.PagesPath}/page-{page}.json</c>.
/// </summary>
/// <param name="page">Page number used in the file name.</param>
/// <param name="state">Scraper state; <c>PagesPath</c> is the directory to read from.</param>
/// <returns>The deserialized ads, or <c>null</c> when the file does not exist.</returns>
private List<AdDtoModel> _loadPage(int page, ScraperAirdnaStateModel state)
{
    var filename = $"{state.PagesPath}/page-{page}.json";
    if (!File.Exists(filename))
    {
        return null;
    }

    var json = File.ReadAllText(filename);
    return JsonConvert.DeserializeObject<List<AdDtoModel>>(json);
}
/// <summary>
/// Loads the item list (id to ad) from <c>state.ListItemsFilename</c>.
/// </summary>
/// <param name="state">Scraper state; <c>ListItemsFilename</c> is the file to read.</param>
/// <returns>The deserialized dictionary, or <c>null</c> when the file does not exist.</returns>
private Dictionary<string, AdDtoModel> _loadListItems(ScraperAirdnaStateModel state)
{
    var filename = state.ListItemsFilename;
    if (!File.Exists(filename))
    {
        return null;
    }

    var json = File.ReadAllText(filename);
    return JsonConvert.DeserializeObject<Dictionary<string, AdDtoModel>>(json);
}
/// <summary>
/// Creates the scraper, stores the state, loads the scraper configuration and
/// passes the state's root path to <c>_checkDirectory</c>.
/// </summary>
/// <param name="state">State to use; a fresh <c>ScraperAirdnaStateModel</c> is created when <c>null</c>.</param>
public ScraperAirdna(ScraperAirdnaStateModel state = null)
{
    var effectiveState = state ?? new ScraperAirdnaStateModel();

    _state = effectiveState;
    _loadScraperConfig(effectiveState);
    _checkDirectory(_state.RootPath);
}
/// <summary>
/// Populates <c>_config</c> from <c>state.ConfigFilename</c>; when that file does
/// not exist, a default configuration is created and saved under that name.
/// </summary>
/// <param name="state">Scraper state; <c>ConfigFilename</c> is the configuration file.</param>
private void _loadScraperConfig(ScraperAirdnaStateModel state)
{
    var configFilename = state.ConfigFilename;

    if (!File.Exists(configFilename))
    {
        // First run: persist defaults so the file exists next time.
        _config = new ScraperAirdnaConfigModel();
        _saveScraperConfig(_config, configFilename);
        return;
    }

    var json = File.ReadAllText(configFilename);
    _config = JsonConvert.DeserializeObject<ScraperAirdnaConfigModel>(json);
}
/// <summary>
/// Opens a page through Selenoid and scrapes the ads of both item types from it.
/// </summary>
/// <param name="page">Page number to open.</param>
/// <param name="state">Scraper state forwarded to the page load.</param>
/// <returns>The scraped ads; an empty list when the page could not be opened.</returns>
private List<AdDtoModel> _parsePage(int page, ScraperAirdnaStateModel state)
{
    var ads = new List<AdDtoModel>();

    if (!_getPage_Selenoid(page: page, state: state))
    {
        return ads;
    }

    _scrapeFromPage_Rent(ads, EnumTypeItems.Rent);
    _scrapeFromPage_Rent(ads, EnumTypeItems.RentTivuch);
    return ads;
}
/// <summary>
/// Builds the list of unique items from the stored pages and saves it, together
/// with the list of duplicate ids.
/// </summary>
/// <param name="state">
/// Scraper state. Generation runs when no item list is stored yet, the stored
/// list is empty, or <c>state.IsNew</c> is set; otherwise it is skipped.
/// </param>
/// <remarks>
/// When the page list cannot be loaded (<c>_loadListCities</c> returns <c>null</c>
/// if its file is missing) nothing is generated or overwritten.
/// </remarks>
private void ScrapePhase2_GenerateListItems(ScraperAirdnaStateModel state)
{
    SetWorkPhaseBase("GenerateListItems", state);

    var listItems = _loadListItems(state);
    var mustGenerate = listItems == null || listItems.Count == 0 || state.IsNew;
    if (!mustGenerate)
    {
        _log($"Generate list-item skipped (isNew:{state.IsNew})");
        return;
    }

    _log($"Start generate list-items");
    var listPages = _loadListCities(state);
    if (listPages == null)
    {
        _log($"List of pages not found, list-items were not generated");
        return;
    }

    listItems = new Dictionary<string, AdDtoModel>();
    var listItemDublicate = new List<string>();

    foreach (var page in listPages.Keys)
    {
        var itemsOnPage = _loadPage(page, state);
        if (itemsOnPage == null)
        {
            // No file stored for this page.
            continue;
        }

        foreach (var item in itemsOnPage)
        {
            // First occurrence wins; later ones are only recorded as duplicates.
            if (!listItems.TryAdd(item.Id, item))
            {
                listItemDublicate.Add(item.Id);
            }
        }
    }

    _log($"Scraped uniq items:{listItems.Count}, items-dublicates:{listItemDublicate.Count}, total:{listItems.Count+ listItemDublicate.Count}");
    _saveListItems(listItems, state);
    _saveListItemsDublicates(listItemDublicate, state);
}
/// <summary>
/// Creates the Selenoid state on first use (JavaScript and pictures enabled) and
/// hands it to <c>_initSelenoidBase</c>.
/// </summary>
/// <param name="state">Scraper state forwarded to <c>_initSelenoidBase</c>.</param>
private void _initSelenoid(ScraperAirdnaStateModel state)
{
    if (_selenoidState is null)
    {
        // The terminating ';' belongs to the assignment, inside the if-block.
        _selenoidState = new SelenoidStateModel()
        {
            JavaScriptEnable = true,
            ShowPictures = true,
        };
    }

    _initSelenoidBase(_selenoidState, state);
}
#endregion
}
/// <summary>
/// Opens page 1 through Selenoid and reads the number of pages from the text of
/// the last <c>span</c> inside the element with class <c>pagingdisplay</c>.
/// </summary>
/// <param name="state">Scraper state forwarded to Selenoid initialisation and the page load.</param>
/// <returns>
/// The detected number of pages, or <c>0</c> when it could not be detected within
/// the allowed number of attempts.
/// </returns>
/// <remarks>
/// An attempt fails when the page does not load or the pager text is missing or
/// not a number; failed attempts are repeated after a pause. The Selenoid session
/// is closed in a <c>finally</c> block so it is released even if an exception
/// escapes.
/// </remarks>
private int ScrapePhase1_GetAmountPages_Selenoid(ScraperAirdnaStateModel state)
{
    const int tryCountMax = 10;
    var pause = TimeSpan.FromSeconds(3);

    var result = 0;
    var tryCount = 1;
    bool doNeedTry;

    _log($"Detecting amount pages");
    _initSelenoid(state);
    try
    {
        do
        {
            doNeedTry = false;

            var detected = false;
            if (_getPage_Selenoid(page: 1, state: state))
            {
                var pageNavigate = _selenoidState.WindowMain.FindElementByClassName("pagingdisplay");
                var lastPageText = pageNavigate.FindElements(By.TagName("span")).LastOrDefault()?.Text;
                // TryParse returns false for null/non-numeric text and leaves result at 0.
                detected = int.TryParse(lastPageText, out result);
            }

            if (!detected)
            {
                tryCount++;
                if (tryCount <= tryCountMax)
                {
                    doNeedTry = true;
                    // The message is derived from the real pause so the two cannot drift apart.
                    _log($"Try count {tryCount}/{tryCountMax}. Pause {pause.TotalSeconds} sec.");
                    Thread.Sleep(pause);
                }
                else
                {
                    _log($"Tred {tryCount} detect amount pages. Stop scrap process.");
                }
            }
        } while (doNeedTry);

        _log($"Detected was {result} pages");
    }
    finally
    {
        _closeSelenoid();
    }

    return result;
}
/// <summary>
/// Marks list entries as downloaded when a file named after their id already
/// exists in <c>state.ItemsPath</c>, then logs how many entries were changed.
/// </summary>
/// <param name="list">Items keyed by id; matching entries get <c>DownloadedItem = true</c>.</param>
/// <param name="state">Scraper state; <c>ItemsPath</c> is the directory that is scanned.</param>
private void _fixListItems(Dictionary<string, AdDtoModel> list, ScraperAirdnaStateModel state)
{
    var amountFixed = 0;

    foreach (var storedFile in new DirectoryInfo(state.ItemsPath).GetFiles())
    {
        // The file name without extension is the item id.
        var id = Path.GetFileNameWithoutExtension(storedFile.Name);

        if (list.TryGetValue(id, out var entry) && !entry.DownloadedItem)
        {
            entry.DownloadedItem = true;
            amountFixed++;
        }
    }

    _log($"Fixed {amountFixed} files");
}
/// <summary>
/// Downloads every item of the stored list that is not marked as downloaded,
/// running at most five downloads at a time and saving the list periodically
/// and once more at the end.
/// </summary>
/// <param name="state">Scraper state forwarded to the loader, the fix-up step, each download and the save calls.</param>
/// <remarks>
/// An item is flagged as downloaded when its task is queued so it is not queued
/// twice; the task then overwrites the flag with the real outcome, which makes
/// a failed item eligible again.
/// NOTE(review): an item whose download always fails is therefore re-queued
/// for as long as the loop runs; confirm whether a retry limit is wanted.
/// </remarks>
private void ScrapePhase2_DownloadItems(ScraperAirdnaStateModel state)
{
    const int maxTasks = 5;
    const int saveEvery = 100;

    SetWorkPhaseBase("DownloadItems", state);

    var listItems = _loadListItems(state);
    if (listItems == null)
    {
        // _loadListItems returns null when the list file does not exist.
        _log($"List-items not found, nothing to download");
        return;
    }

    _fixListItems(listItems, state);

    Func<List<KeyValuePair<string, AdDtoModel>>> NeedToDo =
        () => listItems.Where(x => x.Value.DownloadedItem == false).ToList();

    var countDownloaded = 0;
    var nextSaveAt = saveEvery;
    var tasks = new List<Task<bool>>();
    var needToDoItems = NeedToDo();

    do
    {
        var freeTasks = maxTasks - tasks.Count;
        foreach (var item in needToDoItems.Take(freeTasks))
        {
            var id = item.Key;
            listItems[id].DownloadedItem = true;
            tasks.Add(Task.Run(async () => listItems[id].DownloadedItem = await _downloadItemAsync(item.Value, state)));
        }

        Task.WaitAny(tasks.ToArray());

        // RemoveAll reports how many tasks it removed, so counting and removing
        // cannot disagree when a task completes between two separate passes.
        countDownloaded += tasks.RemoveAll(x => x.IsCompleted);

        // Threshold instead of "% 100 == 0": several tasks can finish in one
        // iteration and step over an exact multiple.
        if (countDownloaded >= nextSaveAt)
        {
            _log($"CountDownloaded are {countDownloaded}");
            _saveListItems(listItems, state);
            nextSaveAt = (countDownloaded / saveEvery + 1) * saveEvery;
        }

        needToDoItems = NeedToDo();
    } while (needToDoItems.Count > 0);

    // Downloads still in flight when the queue ran dry are part of the total too.
    Task.WaitAll(tasks.ToArray());
    countDownloaded += tasks.Count;

    _log($"Scraped {countDownloaded} items");
    _saveListItems(listItems, state);
    _log($"Download items done");
}
/// <summary>
/// Selenoid variant of the city-list generation: initialises Selenoid and logs
/// on to the site. The navigation/extraction step is disabled, so no cities
/// are collected.
/// </summary>
/// <param name="state">Scraper state forwarded to Selenoid initialisation.</param>
/// <returns>Always an empty list.</returns>
private List<string> ScrapePhase1_GenerateListCities_Selenoid(ScraperAirdnaStateModel state)
{
    var cities = new List<string>();

    _initSelenoid(state);
    LogonToSite();

    // Target of the disabled navigation step below.
    var url = $"https://airdna.co";
    //var isOk = Selenoid_GoToUrl_Base(url, _selenoidState, state);

    _log($"Generate new list of pages done");
    return cities;
}
/// <summary>
/// Regenerates the city list through the web-client variant when
/// <c>state.IsNew</c> is set; otherwise only logs that nothing is needed.
/// </summary>
/// <param name="state">Scraper state; <c>IsNew</c> decides whether generation runs.</param>
/// <remarks>
/// The stored list is still loaded first, but its content does not influence
/// the decision (that part of the condition is disabled).
/// </remarks>
private void ScrapePhase1_GenerateListCities(ScraperAirdnaStateModel state)
{
    SetWorkPhaseBase("GenerateListcities", state);
    var listCities = _loadListCities(state);

    if (!state.IsNew)
    {
        _log($"Generate list city not need (missing, isNew:{state.IsNew})");
        return;
    }

    // Alternative (disabled): ScrapePhase1_GenerateListCities_Selenoid(state)
    var list = ScrapePhase1_GenerateListCities_WebClient(state);
    //_saveListCitiesAsync(list, state).Wait();
}
/// <summary>
/// Marks pages as done when a page file for them already exists in
/// <c>state.PagesPath</c>, then logs how many entries were changed.
/// </summary>
/// <param name="listPages">
/// Page number to done flag; matching entries are set to <c>true</c>.
/// A <c>null</c> list is treated as having nothing to fix.
/// </param>
/// <param name="state">Scraper state; <c>PagesPath</c> is the directory that is scanned.</param>
/// <remarks>
/// The page number is taken from the second dash-separated part of the file
/// name without extension. Files that do not follow that pattern are skipped
/// instead of aborting the whole scan.
/// </remarks>
private void fixListPages(Dictionary<int, bool> listPages, ScraperAirdnaStateModel state)
{
    var amountFixed = 0;

    if (listPages != null)
    {
        var path = $"{state.PagesPath}";
        foreach (var file in new DirectoryInfo(path).GetFiles())
        {
            var parts = Path.GetFileNameWithoutExtension(file.Name).Split('-');
            if (parts.Length < 2 || !int.TryParse(parts[1], out var n))
            {
                // Not a page file (stray or temporary file): ignore it.
                continue;
            }

            if (listPages.TryGetValue(n, out var done) && !done)
            {
                listPages[n] = true;
                amountFixed++;
            }
        }
    }

    _log($"Fixed {amountFixed} files");
}
/// <summary>
/// Reads every file in <c>state.ItemsPath</c> and converts each property it
/// contains into a domain item.
/// </summary>
/// <param name="state">Scraper state; <c>ItemsPath</c> is the directory that is read.</param>
/// <returns>
/// All converted items; each item's <c>Location</c> is set to the city name
/// found in the area info of the file it came from.
/// </returns>
private async Task<List<AdItemAirdnaDomainModel>> ScrapeAllItems(ScraperAirdnaStateModel state)
{
    var domainItems = new List<AdItemAirdnaDomainModel>();
    var itemsDirectory = new DirectoryInfo($"{state.ItemsPath}");

    foreach (var storedFile in itemsDirectory.GetFiles())
    {
        var scraped = await LoadItemDtoFromStoreAsync<AirdnaScrapeDataModel>(storedFile, state);

        foreach (var property in scraped.Properties)
        {
            var domainItem = new AdItemAirdnaDomainModel().FromDto(property);
            domainItem.Location = scraped.AreaInfo.Geom.Name.City;
            domainItems.Add(domainItem);
        }
    }

    return domainItems;
}
/// <summary>
/// For every city listed in <c>{state.RootPath}/cities.json</c>, downloads the
/// property list and stores it as indented JSON in
/// <c>{state.ItemsPath}/{cityId}.json</c>.
/// </summary>
/// <param name="state">Scraper state; nothing is done unless <c>IsNew</c> is set.</param>
/// <remarks>
/// Requests use the token held in <c>_token</c> and are waited on synchronously
/// via <c>.Result</c>.
/// </remarks>
private void ScrapePhase2_DownloadCities(ScraperAirdnaStateModel state)
{
    if (!state.IsNew)
    {
        return;
    }

    var filename = $"{state.RootPath}/cities.json";
    var cities = JsonConvert.DeserializeObject<ItemDto>(File.ReadAllText(filename));

    foreach (var city in cities.items)
    {
        // "&currency" must be spelled out: "&curren" had been decoded as the
        // HTML entity for the currency sign, which corrupted the query string.
        var url = $"https://api.airdna.co/v1/market/property_list?access_token={_token}&city_id={city.city.id}&currency=native";
        var dto = url.GetJsonAsync<AirdnaScrapeDataModel>().Result;

        var filename2 = $"{state.ItemsPath}/{city.city.id}.json";
        File.WriteAllText(filename2, JsonConvert.SerializeObject(dto, Newtonsoft.Json.Formatting.Indented));
    }
}
/// <summary>
/// Program entry point: sets the thread culture to en-US, enables the
/// NetTopologySuite type mapping for Npgsql, constructs a scraper with
/// <c>IsNew = true</c> and runs <c>UpdateRepository</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    Thread.CurrentThread.CurrentCulture = new CultureInfo("en-US");
    NpgsqlConnection.GlobalTypeMapper.UseNetTopologySuite();

    var scraper = new ScraperAirdna(new ScraperAirdnaStateModel { IsNew = true });

    UpdateRepository();

    // Other entry points, currently disabled:
    //Scrape(scraper);
    //GetExcelFile(scraper);
    //PrintSaveStatus(scraper);
}
/// <summary>
/// Downloads the HTML of one rent-listing page.
/// </summary>
/// <param name="page">Page number; it is the last segment of the requested URL.</param>
/// <param name="state">Not used by this method.</param>
/// <returns>
/// The page content, or an empty string when the request failed (the error is
/// logged, not rethrown).
/// </returns>
private async Task<string> GetPage_WebClientAsync(int page, ScraperAirdnaStateModel state)
{
    var result = "";
    try
    {
        // The page number used to be hard-coded to 1, so every call fetched page 1.
        var url = $"https://www.homeless.co.il/rent/{page}";
        result = await url
            .WithHeaders(new { User_Agent = "Windows", })
            .GetStringAsync();
    }
    catch (Exception exception)
    {
        _log($"Error x2. {exception.Message} / {exception.StackTrace}");
    }

    return result;
}
/// <summary>
/// Reads every page file and projects each stored ad to an <c>ItemTest</c>
/// carrying its id and downloaded flag.
/// </summary>
/// <param name="state">Scraper state forwarded to the page-file lookup.</param>
/// <returns>One entry per ad across all page files, in file order (duplicates kept).</returns>
private List<ItemTest> _statusWorkspace_AmountItemsFromPages_GetItems(ScraperAirdnaStateModel state)
{
    var pageFiles = _statusWorkspace_AmountPages_GetFilesBase(state);
    var collected = new List<ItemTest>();

    foreach (var pageFile in pageFiles)
    {
        var json = File.ReadAllText(pageFile.FullName);
        var adsOnPage = JsonConvert.DeserializeObject<List<AdDtoModel>>(json);

        var projected = adsOnPage
            .Select(ad => new ItemTest() { Id = ad.Id, Done = ad.DownloadedItem })
            .ToList();
        collected.AddRange(projected);
    }

    return collected;
}
/// <summary>
/// Reads every file in <c>state.PagesPath</c> and merges the ads they contain
/// into one dictionary keyed by ad id.
/// </summary>
/// <param name="state">Scraper state; <c>PagesPath</c> is the directory that is read.</param>
/// <returns>The merged ads; when an id occurs more than once the first occurrence is kept.</returns>
private async Task<Dictionary<string, AdDtoModel>> _loadPagesAsync(ScraperAirdnaStateModel state)
{
    var adsById = new Dictionary<string, AdDtoModel>();
    var pagesDirectory = new DirectoryInfo($"{state.PagesPath}");

    foreach (var pageFile in pagesDirectory.GetFiles())
    {
        var json = await File.ReadAllTextAsync(pageFile.FullName);
        var adsOnPage = JsonConvert.DeserializeObject<List<AdDtoModel>>(json);

        foreach (var ad in adsOnPage)
        {
            // TryAdd leaves an already stored ad untouched.
            adsById.TryAdd(ad.Id, ad);
        }
    }

    return adsById;
}