// Phase 1: downloads every listing page not yet marked done, running the
// downloads in parallel, then persists the updated page index.
//
// Bug fix: the original awaited _saveListPagesAsync BEFORE waiting for the
// download tasks, so completed downloads were never reflected in the saved
// index; the save now happens after all tasks finish. Task.WaitAll was also
// replaced with `await Task.WhenAll` so the async method does not block a
// thread-pool thread.
private async Task ScrapePhase1_DownloadPages_Scrapy(ScraperHomeLessStateModel state)
{
    SetWorkPhaseBase("DownloadPages", state);
    var listPages = _loadListPages(state);
    fixListPages(listPages, state);
    var needToDo = listPages.Where(x => x.Value == false).Select(x => x.Key).ToList();
    var tasks = new List<Task<bool>>();
    if (needToDo.Count > 0)
    {
        foreach (var page in needToDo)
        {
            // Each task records its own success flag back into the shared index.
            tasks.Add(Task.Run(async () => listPages[page] = await GetPageAsync(page, state)));
        }
        await Task.WhenAll(tasks);
        // Persist completion flags only after every download has finished.
        await _saveListPagesAsync(listPages, state);
    }
    _log($"Phase done");
}
/// <summary>
/// Fetches all detail data for one ad (ad details, coordinates, phones,
/// generic details) concurrently, combines them into a DTO, and stores it.
/// Returns false (after logging) when any step throws.
/// </summary>
private async Task<bool> _downloadItemAsync(AdDtoModel item, ScraperHomeLessStateModel state)
{
    var succeeded = true;
    try
    {
        // Start every service call first so they all run concurrently.
        var adDetailsTask = _getAdDetailsFromService(item);
        var coordinatesTask = _getCoordinatesFromServiceAsync(item, state);
        var phonesTask = _getPhonesFromService(item);
        var detailsTask = _getDetailsFromService(item);
        var dto = new DetailsItemDtoModel
        {
            Coordinates = await coordinatesTask,
            Phones = await phonesTask,
            Details = await detailsTask,
            AdDetails = await adDetailsTask,
            RowDataFromPage = item,
        };
        await _saveItemDetailsAsync(item.Id, dto, state);
    }
    catch (Exception exception)
    {
        _log($"Error-f1. {exception.Message} / {exception.StackTrace}");
        succeeded = false;
    }
    return succeeded;
}
// Runs the full pipeline: (optionally) scrape, build the Excel workbook from
// the scraped domain model, save it to disk, and archive the resulting file.
private void _homeLessScrapeThenSaveStore(bool isNew = false, bool needScrape = true)
{
    _log($"Start HomeLessScrapeThenSaveStore (isNew={isNew})");
    var runState = new ScraperHomeLessStateModel
    {
        IsNew = isNew,
    };
    var scraper = new ScraperHomeLess(runState);
    if (needScrape)
    {
        scraper.Scrape();
    }
    var domainData = scraper.GetDomainModel();
    var excelService = new ExcelHomeLessService(runState);
    var workbook = excelService.CreateExcel(domainData);
    var savedPath = excelService.SaveToFile(workbook);
    // Keep a copy of the produced file in the archive, keyed by scraper type.
    new ArchiveRepository().Save(savedPath, runState.TypeScraper);
    _log($"End HomeLessScrapeThenSaveStore (isNew={isNew}), Spent time {_calcSpentTime2String(runState)}");
}
// Returns the total number of item status rows gathered from all downloaded
// page files. (Removed dead commented-out duplicate-grouping experiment and
// replaced the LINQ Count() extension with the List Count property.)
private int _statusWorkspace_AmountItemsFromPages(ScraperHomeLessStateModel state)
{
    return _statusWorkspace_AmountItemsFromPages_GetItems(state).Count;
}
// Writes the raw XML response for one item to "<ItemsPath>/<id>.xml".
// Fix: the log message was an interpolation-free placeholder ("(unknown)");
// it now reports the file actually written.
private async Task _saveItemAsync(string id, string response, ScraperHomeLessStateModel state)
{
    var filename = $"{state.ItemsPath}/{id}.xml";
    await File.WriteAllTextAsync(filename, response);
    _log($"Save xml-data file {filename}");
}
// Serializes the ads scraped from one listing page to
// "<PagesPath>/page-<page>.json" as indented JSON.
// Fix: the log message was an interpolation-free placeholder ("(unknown)");
// it now reports the file actually written.
private void _savePage(int page, List<AdDtoModel> listAdFromPage, ScraperHomeLessStateModel state)
{
    var filename = $"{state.PagesPath}/page-{page}.json";
    File.WriteAllText(filename, JsonConvert.SerializeObject(listAdFromPage, Newtonsoft.Json.Formatting.Indented));
    _log($"Save file {filename}");
}
// Lazily creates the shared Selenoid state object, then (re)initializes the
// underlying browser session via the base helper.
private void _initSelenoid(ScraperHomeLessStateModel state)
{
    _selenoidState ??= new SelenoidStateModel();
    _initSelenoidBase(_selenoidState, state);
}
// Persists the id -> ad dictionary to the configured list-items JSON file.
//
// Bug fix: the serialized payload was written to the literal path "(unknown)"
// instead of the computed filename, so the list was never actually saved to
// its intended location; both log messages also printed the same placeholder.
private void _saveListItems(Dictionary<string, AdDtoModel> list, ScraperHomeLessStateModel state)
{
    var filename = state.ListItemsFilename;
    _log($"Saving list-items: {filename}");
    File.WriteAllText(filename, JsonConvert.SerializeObject(list, Newtonsoft.Json.Formatting.Indented));
    _log($"Save list-items done");
}
// Extracts all ads from a parsed listing page: both the private-rent section
// and the agency (tivuch) rent section, concatenated in that order.
private List<AdDtoModel> ParseAdsFromPage(HtmlDocument html, ScraperHomeLessStateModel state)
{
    var ads = new List<AdDtoModel>();
    ads.AddRange(ParseAdsFromPage_Rent(html, EnumTypeItems.Rent));
    ads.AddRange(ParseAdsFromPage_Rent(html, EnumTypeItems.RentTivuch));
    return ads;
}
// Downloads listing page 1 via the web client and parses the total page count
// out of it.
private async Task<int> ScrapePhase1_GetAmountPages_WebClientAsync(ScraperHomeLessStateModel state)
{
    var firstPageHtml = await GetPage_WebClientAsync(1, state);
    return ScrapeAmountPages(firstPageHtml);
}
// Loads the persisted id -> ad dictionary from the list-items JSON file.
// Returns null when the file does not exist yet.
private Dictionary<string, AdDtoModel> _loadListItems(ScraperHomeLessStateModel state)
{
    var filename = state.ListItemsFilename;
    if (!File.Exists(filename))
    {
        return null;
    }
    return JsonConvert.DeserializeObject<Dictionary<string, AdDtoModel>>(File.ReadAllText(filename));
}
// Creates the scraper with an optional externally-supplied state, loads the
// scraper configuration, and ensures the working directory exists.
// (Removed the redundant casts: the parameter already has the exact type.)
public ScraperHomeLess(ScraperHomeLessStateModel state = null)
{
    _state = state ?? new ScraperHomeLessStateModel();
    _loadScraperConfig(_state);
    _checkDirectory(_state.RootPath);
}
// Loads the cached ad list for one listing page from
// "<PagesPath>/page-<page>.json". Returns null when the file is absent.
private List<AdDtoModel> _loadPage(int page, ScraperHomeLessStateModel state)
{
    var filename = $"{state.PagesPath}/page-{page}.json";
    if (!File.Exists(filename))
    {
        return null;
    }
    return JsonConvert.DeserializeObject<List<AdDtoModel>>(File.ReadAllText(filename));
}
// Navigates Selenoid to the given listing page and scrapes both rent
// sections into a fresh list. Returns an empty list when navigation failed.
private List<AdDtoModel> _parsePage(int page, ScraperHomeLessStateModel state)
{
    var ads = new List<AdDtoModel>();
    if (_getPage_Selenoid(page: page, state: state))
    {
        _scrapeFromPage_Rent(ads, EnumTypeItems.Rent);
        _scrapeFromPage_Rent(ads, EnumTypeItems.RentTivuch);
    }
    return ads;
}
// Reads the scraper configuration from disk when present; otherwise creates a
// default configuration and writes it back so the file exists for next run.
private void _loadScraperConfig(ScraperHomeLessStateModel state)
{
    var configFilename = state.ConfigFilename;
    if (File.Exists(configFilename))
    {
        _config = JsonConvert.DeserializeObject<ScraperHomeLessConfigModel>(File.ReadAllText(configFilename));
        return;
    }
    _config = new ScraperHomeLessConfigModel();
    _saveScraperConfig(_config, configFilename);
}
// Phase 2: builds the unique id -> ad dictionary from all cached page files,
// tracking duplicate ids separately, then persists both. Skipped when a
// list-items file already exists and this is not a fresh run.
//
// Improvement: Dictionary.TryAdd replaces the ContainsKey + Add double lookup
// (the project already targets .NET Core — it uses File.WriteAllTextAsync).
private void ScrapePhase2_GenerateListItems(ScraperHomeLessStateModel state)
{
    SetWorkPhaseBase("GenerateListItems", state);
    var listItems = _loadListItems(state);
    if (listItems == null || listItems.Count == 0 || state.IsNew)
    {
        _log($"Start generate list-items");
        var listPages = _loadListPages(state);
        listItems = new Dictionary<string, AdDtoModel>();
        var listItemDublicate = new List<string>();
        var pages = listPages.Select(x => x.Key).ToList();
        foreach (var page in pages)
        {
            var itemsOnPage = _loadPage(page, state);
            if (itemsOnPage == null)
            {
                continue; // page file missing — nothing to merge
            }
            foreach (var item in itemsOnPage)
            {
                // First occurrence wins; later occurrences are recorded as duplicates.
                if (!listItems.TryAdd(item.Id, item))
                {
                    listItemDublicate.Add(item.Id);
                }
            }
        }
        _log($"Scraped uniq items:{listItems.Count}, items-dublicates:{listItemDublicate.Count}, total:{listItems.Count+ listItemDublicate.Count}");
        _saveListItems(listItems, state);
        _saveListItemsDublicates(listItemDublicate, state);
    }
    else
    {
        _log($"Generate list-item skipped (isNew:{state.IsNew})");
    }
}
// Reconciles the in-memory item list with the item files already on disk:
// any item whose XML file exists is marked as downloaded so it is not fetched
// again. (TryGetValue replaces the ContainsKey + indexer double lookup.)
private void _fixListItems(Dictionary<string, AdDtoModel> list, ScraperHomeLessStateModel state)
{
    var itemsFiles = new DirectoryInfo(state.ItemsPath).GetFiles();
    var amountFixed = 0;
    foreach (var itemFile in itemsFiles)
    {
        // File names are "<id>.xml" — the bare name is the item id.
        var id = Path.GetFileNameWithoutExtension(itemFile.Name);
        if (list.TryGetValue(id, out var item) && !item.DownloadedItem)
        {
            item.DownloadedItem = true;
            amountFixed++;
        }
    }
    _log($"Fixed {amountFixed} files");
}
// Phase 2: downloads detail data for every item not yet marked DownloadedItem,
// keeping at most `maxTasks` downloads in flight and checkpointing the list to
// disk roughly every 100 completions.
//
// NOTE(review): DownloadedItem is set to true optimistically when the task is
// queued (so the next NeedToDo() pass does not re-queue it) and is then
// overwritten with the real result when the task completes; a crash mid-flight
// could leave items falsely marked done until _fixListItems runs — confirm.
// NOTE(review): if the first NeedToDo() returns an empty list, Task.WaitAny is
// called with an empty array, which throws — verify callers guarantee there is
// work, or that this phase only runs when items remain.
private void ScrapePhase2_DownloadItems(ScraperHomeLessStateModel state)
{
    SetWorkPhaseBase("DownloadItems", state);
    var listItems = _loadListItems(state);
    _fixListItems(listItems, state);
    // Re-evaluated each pass: items whose download has not been claimed yet.
    Func <List <KeyValuePair <string, AdDtoModel> > > NeedToDo = () => listItems.Where(x => x.Value.DownloadedItem == false).ToList();
    var countDownloaded = 0;
    var maxTasks = 5;
    var tasks = new List <Task <bool> >();
    var needToDoItems = NeedToDo();
    do
    {
        // Fill the free slots in the download pool.
        var freeTasks = maxTasks - tasks.Count();
        foreach (var item in needToDoItems.Take(freeTasks))
        {
            var id = item.Key;
            // Optimistic claim — overwritten with the real result by the task below.
            listItems[id].DownloadedItem = true;
            tasks.Add(Task.Run(async() => listItems[id].DownloadedItem = await _downloadItemAsync(item.Value, state)));
        }
        //Thread.Sleep(1000 * 2);
        // Block until at least one in-flight download finishes, then reap completed tasks.
        Task.WaitAny(tasks.ToArray());
        countDownloaded += tasks.Where(x => x.IsCompleted).Count();
        tasks.RemoveAll(x => x.IsCompleted);
        // Periodic progress checkpoint (also fires while countDownloaded is 0).
        if (countDownloaded % 100 == 0)
        {
            _log($"CountDownloaded are {countDownloaded}");
            _saveListItems(listItems, state);
        }
        needToDoItems = NeedToDo();
    } while (needToDoItems.Count > 0);
    // Drain any tasks still running after the last batch was queued.
    Task.WaitAll(tasks.ToArray());
    _log($"Scraped {countDownloaded} items");
    _saveListItems(listItems, state);
    _log($"Download items done");
}
// Detects the total number of listing pages by loading page 1 in Selenoid and
// reading the last entry of the "pagingdisplay" pager. Retries up to
// tryCountMax times when the page fails to load; returns 0 when all tries fail.
//
// NOTE(review): the retry log says "Pause 10 sec." but the sleep is 3 seconds —
// confirm which value is intended.
private int ScrapePhase1_GetAmountPages_Selenoid(ScraperHomeLessStateModel state)
{
    var result = 0;
    var tryCount = 1;
    var tryCountMax = 10;
    var doNeedTry = false;
    _log($"Detecting amount pages");
    _initSelenoid(state);
    do
    {
        doNeedTry = false;
        var isDoneGetPage1 = _getPage_Selenoid(page: 1, state: state);
        if (isDoneGetPage1)
        {
            // The last <span> of the pager holds the highest page number.
            var pageNavigate = _selenoidState.WindowMain.FindElementByClassName("pagingdisplay");
            var lastPageText = pageNavigate.FindElements(By.TagName("span")).LastOrDefault().Text;
            result = int.Parse(lastPageText);
        }
        else
        {
            tryCount++;
            if (tryCount <= tryCountMax)
            {
                doNeedTry = true;
                _log($"Try count {tryCount}/{tryCountMax}. Pause 10 sec.");
                Thread.Sleep(TimeSpan.FromSeconds(3));
            }
            else
            {
                // Out of retries — give up and fall through with result == 0.
                _log($"Tred {tryCount} detect amount pages. Stop scrap process.");
            }
        }
    } while (doNeedTry);
    _log($"Detected was {result} pages");
    _closeSelenoid();
    return(result);
}
// Phase 1: (re)creates the page index when it is missing, empty, or a fresh
// run was requested; otherwise leaves the existing index untouched.
// Blocks on the async helpers because this entry point is synchronous.
private void ScrapePhase1_GenerateListPages(ScraperHomeLessStateModel state)
{
    SetWorkPhaseBase("GenerateListPages", state);
    var listPages = _loadListPages(state);
    var mustRegenerate = listPages == null || listPages.Count == 0 || state.IsNew;
    if (!mustRegenerate)
    {
        _log($"Generate list page not need (missing, isNew:{state.IsNew})");
        return;
    }
    //var amountPages = ScrapePhase1_GetAmountPages_Selenoid(state);
    var amountPages = ScrapePhase1_GetAmountPages_WebClientAsync(state).Result;
    var freshList = ScrapePhase1_GenerateListPages(amountPages);
    _saveListPagesAsync(freshList, state).Wait();
}
// Reconciles the in-memory page index with the page files already on disk:
// any page whose "page-<n>.json" file exists is marked done so it is not
// downloaded again.
//
// Robustness fix: filenames were parsed with int.Parse on Split("-")[1], which
// threw on any stray file in the pages directory; malformed names are now
// skipped. TryGetValue replaces the ContainsKey + indexer double lookup.
private void fixListPages(Dictionary<int, bool> listPages, ScraperHomeLessStateModel state)
{
    var listFiles = new DirectoryInfo(state.PagesPath).GetFiles();
    var amountFixed = 0;
    foreach (var file in listFiles)
    {
        var parts = Path.GetFileNameWithoutExtension(file.Name).Split("-");
        if (parts.Length < 2 || !int.TryParse(parts[1], out var n))
        {
            continue; // not a "page-<n>" file — ignore it
        }
        if (listPages.TryGetValue(n, out var done) && !done)
        {
            listPages[n] = true;
            amountFixed++;
        }
    }
    _log($"Fixed {amountFixed} files");
}
// Phase 4: loads every stored item DTO, reattaches its row data from the page
// cache, and maps each one to a domain model.
//
// Improvement: the page-cache task was awaited inside the loop on every
// iteration; it is still started early (so it overlaps GetListItemFiles) but
// is now awaited exactly once before the loop.
private async Task<List<AdItemHomeLessDomainModel>> ScrapePhase4Async(ScraperHomeLessStateModel state)
{
    var listDomainItems = new List<AdItemHomeLessDomainModel>();
    var listPagesTask = _loadPagesAsync(state);
    var files = GetListItemFiles(state);
    var listPages = await listPagesTask;
    foreach (var file in files)
    {
        var itemDto = await LoadItemDtoFromStoreAsync<DetailsItemDtoModel>(file, state);
        itemDto.RowDataFromPage = GetRowDataFromPage(listPages, file);
        listDomainItems.Add(new AdItemHomeLessDomainModel().FromDto(itemDto));
    }
    return listDomainItems;
}
// Entry point: pins the thread culture to en-US (number/date parsing), builds
// a fresh scraper over a new state, and runs the repository update.
// The remaining pipeline steps are currently disabled.
static void Main(string[] args)
{
    Thread.CurrentThread.CurrentCulture = new CultureInfo("en-US");
    var scraperState = new ScraperHomeLessStateModel
    {
        IsNew = true,
    };
    var scraper = new ScraperHomeLess(scraperState);
    UpdateRepository();
    //Scrape(scraper);
    //GetExcelFile(scraper);
    //PrintSaveStatus(scraper);
}
// Downloads one listing page over HTTP (Flurl) and returns its raw HTML, or
// "" after logging when the request fails.
//
// Bug fix: the URL was hard-coded to ".../rent/1", silently ignoring the
// `page` parameter, so every call fetched page 1. The page number is now
// interpolated into the URL.
private async Task<string> GetPage_WebClientAsync(int page, ScraperHomeLessStateModel state)
{
    var result = "";
    try
    {
        var url = $"https://www.homeless.co.il/rent/{page}";
        result = await url
            .WithHeaders(new
            {
                User_Agent = "Windows",
            })
            .GetStringAsync();
    }
    catch (Exception exception)
    {
        _log($"Error x2. {exception.Message} / {exception.StackTrace}");
    }
    return result;
}
// Reads every cached page file and projects each ad to a lightweight
// (Id, Done) status row. (Removed the dead `totalItems` accumulator, which
// was incremented but never read.)
private List<ItemTest> _statusWorkspace_AmountItemsFromPages_GetItems(ScraperHomeLessStateModel state)
{
    var listPages = _statusWorkspace_AmountPages_GetFilesBase(state);
    var list = new List<ItemTest>();
    foreach (var page in listPages)
    {
        var pageData = JsonConvert.DeserializeObject<List<AdDtoModel>>(File.ReadAllText(page.FullName));
        list.AddRange(pageData.Select(x => new ItemTest() { Id = x.Id, Done = x.DownloadedItem }));
    }
    return list;
}
// Loads every cached page file and merges all ads into a single id -> ad map,
// keeping the first occurrence when the same id appears on multiple pages.
// (TryAdd replaces the ContainsKey + Add double lookup; behavior is identical
// because TryAdd is a no-op on an existing key.)
private async Task<Dictionary<string, AdDtoModel>> _loadPagesAsync(ScraperHomeLessStateModel state)
{
    var result = new Dictionary<string, AdDtoModel>();
    var listPageFiles = new DirectoryInfo(state.PagesPath).GetFiles();
    foreach (var pageFile in listPageFiles)
    {
        var json = await File.ReadAllTextAsync(pageFile.FullName);
        var listAdDto = JsonConvert.DeserializeObject<List<AdDtoModel>>(json);
        foreach (var adDto in listAdDto)
        {
            result.TryAdd(adDto.Id, adDto);
        }
    }
    return result;
}
// Navigates the Selenoid browser to the given listing page and waits for the
// "pagingdisplay" pager to become visible. On failure the Selenoid session is
// re-initialized and the navigation retried, up to countMax attempts.
// Returns true when the page loaded, false when all retries were exhausted.
private bool _getPage_Selenoid(int page, ScraperHomeLessStateModel state)
{
    var result = false;
    var doNeedRepeatRequest = false;
    var count = 0;
    var countMax = 10;
    do
    {
        doNeedRepeatRequest = false;
        var url = $"https://homeless.co.il/rent/{page}";
        try
        {
            _selenoidState.WindowMain.Navigate().GoToUrl(url);
            // The pager becoming visible is the signal that the listing rendered.
            _selenoidState.WaitMain.Until(ExpectedConditions.ElementIsVisible(By.ClassName("pagingdisplay")));
            //Thread.Sleep(1000 * 5);
            result = true;
        }
        catch (Exception exception)
        {
            // Any failure (navigation or wait timeout) triggers a session re-init and retry.
            count++;
            if (count < countMax)
            {
                _log($"!!! Need Reinit Selenoid (try {count}) !!!");
                _initSelenoid(state);
                doNeedRepeatRequest = true;
            }
            else
            {
                _log($"Try is out. Error Selenoid");
            }
        }
    } while (doNeedRepeatRequest);
    return(result);
}
// Downloads one listing page via HtmlAgilityPack, parses its ads, and saves
// them. Returns true only when at least one ad was found and persisted;
// failures are logged and reported as false.
private async Task<bool> GetPageAsync(int page, ScraperHomeLessStateModel state)
{
    try
    {
        var html = await GetPage_ScrappyAsync(page, state);
        var ads = ParseAdsFromPage(html, state);
        if (ads.Count > 0)
        {
            await _savePageAsync(page, ads, state);
            return true;
        }
    }
    catch (Exception exception)
    {
        _log($"Error T1. {exception.Message} / {exception.StackTrace}");
    }
    return false;
}
// Loads one listing page with HtmlAgilityPack's HtmlWeb, retrying after a
// one-second pause until it succeeds.
//
// Fix: Thread.Sleep blocked a thread-pool thread inside an async method;
// replaced with `await Task.Delay`.
// NOTE(review): retries forever on a persistent failure — consider a retry cap.
private async Task<HtmlDocument> GetPage_ScrappyAsync(int pageNumber, ScraperHomeLessStateModel state)
{
    HtmlDocument result = null;
    var needReplay = false;
    var web = new HtmlWeb();
    do
    {
        needReplay = false;
        try
        {
            result = await web.LoadFromWebAsync($"https://www.homeless.co.il/rent/{pageNumber}");
        }
        catch (Exception exception)
        {
            _log($"Error-g1. Wait 1 sec. {exception.Message}");
            needReplay = true;
            await Task.Delay(TimeSpan.FromSeconds(1));
        }
    } while (needReplay);
    return result;
}
// Phase 1 (Selenoid variant): downloads every listing page not yet marked
// done, one at a time through the Selenoid browser. Each page parse is retried
// (with a Selenoid re-init and a 10-minute pause) up to countMaxTryRepeate
// times; exhausting the retries aborts the application. The page index is
// checkpointed roughly every countPagesPerSave pages and once at the end.
//
// NOTE(review): needSaveListPages is set AFTER the counter exceeds the batch
// size and only consumed on the NEXT successful page, so checkpoints land one
// page later than the counter suggests — confirm this is acceptable.
private void ScrapePhase1_DownloadPages_Selenoid(ScraperHomeLessStateModel state)
{
    SetWorkPhaseBase("DownloadPages", state);
    var listPages = _loadListPages(state);
    fixListPages(listPages, state);
    var needToDo = listPages.Where(x => x.Value == false).Select(x => x.Key).ToList();
    var needSaveListPages = false;
    var countPages = 0;
    var countPagesPerSave = 10;
    List <AdDtoModel> listAdFromPage = null;
    var needRepeate = false;
    var countTryRepeate = 0;
    var countMaxTryRepeate = 10;
    if (needToDo.Count > 0)
    {
        _initSelenoid(state);
        foreach (var page in needToDo)
        {
            // Retry loop: a failed parse re-initializes Selenoid, waits, and tries again.
            do
            {
                needRepeate = false;
                try
                {
                    listAdFromPage = _parsePage(page, state);
                }
                catch
                {
                    countTryRepeate++;
                    _log($"\t\t!!! Need reinit SelenoidService !!!");
                    _initSelenoid(state);
                    Thread.Sleep(TimeSpan.FromMinutes(10));
                    needRepeate = true;
                    if (countTryRepeate > countMaxTryRepeate)
                    {
                        // Retries exhausted — hard stop for the whole application.
                        _log($"SelenoidService not inited.");
                        _abortApplication();
                    }
                }
            } while (needRepeate);
            if (listAdFromPage.Count > 0)
            {
                _savePage(page, listAdFromPage, state);
                listPages[page] = true;
                // Deferred checkpoint requested by a previous batch boundary.
                if (needSaveListPages)
                {
                    _saveListPagesAsync(listPages, state).Wait();
                    needSaveListPages = false;
                }
                countPages++;
                if (countPages > countPagesPerSave)
                {
                    countPages = 0;
                    needSaveListPages = true;
                }
            }
        }
        // Final save so the last partial batch is not lost.
        _saveListPagesAsync(listPages, state).Wait();
        _closeSelenoid();
    }
    _log($"Phase-2 done");
}