/// <summary>
/// Reads the persisted list of pre-load pages from the file named by
/// <c>state.PathListPreLoads</c>.
/// </summary>
/// <param name="state">Scraper state that supplies the path of the list file.</param>
/// <returns>
/// The value produced by deserializing the file content, or <c>null</c> when the file
/// does not exist or reading/deserializing it throws.
/// </returns>
private async Task<Dictionary<int, bool>> _loadListPreLoadsAsync(ScraperYad2StateModel state)
{
    _log($"Load list-pre-loads starting");

    var path = $"{state.PathListPreLoads}";
    if (!File.Exists(path))
    {
        _log($"no file {path}");
        return null;
    }

    Dictionary<int, bool> pages = null;
    try
    {
        var json = await File.ReadAllTextAsync(path);
        pages = JsonConvert.DeserializeObject<Dictionary<int, bool>>(json);
        _log($"Load list-pre-loads is complete");
    }
    catch (Exception exception)
    {
        // Failures are logged only; the caller sees null (or whatever was read so far).
        _log($"Load list-pre-loads is fail");
        _log($"Error L2. {exception.Message}");
    }

    return pages;
}
/// <summary>
/// Runs the Yad2 pipeline end to end: optionally scrapes, obtains the domain model from the
/// scraper, builds an Excel document from it, saves that document to a file and passes the
/// file path to <c>ArchiveRepository.Save</c>.
/// </summary>
/// <param name="isNew">Value assigned to <c>IsNew</c> on the freshly created state.</param>
/// <param name="needScrape">When <c>false</c>, <c>Scrape()</c> is not called before the domain model is requested.</param>
private void _yad2ScrapeThenSaveStore(bool isNew = false, bool needScrape = true)
{
    _log($"Start Yad2ScrapeThenSaveStore (isNew={isNew})");

    var runState = new ScraperYad2StateModel { IsNew = isNew };
    var yad2 = new ScraperYad2(runState);
    if (needScrape)
    {
        yad2.Scrape();
    }

    var domainModel = yad2.GetDomainModel();

    var excel = new ExcelYad2Service(runState);
    var savedPath = excel.SaveToFile(excel.CreateExcel(domainModel));

    new ArchiveRepository().Save(savedPath, runState.TypeScraper);

    _log($"End Yad2ScrapeThenSaveStore (isNew={isNew}), Spent time {_calcSpentTime2String(runState)}");
}
/// <summary>
/// Reads the persisted list of items from the file named by <c>state.PathListItems</c>.
/// </summary>
/// <param name="state">Scraper state that supplies the path of the list file.</param>
/// <returns>
/// The value produced by deserializing the file content, or <c>null</c> when the file
/// does not exist or reading/deserializing it throws.
/// </returns>
private async Task<Dictionary<string, bool>> _loadListItemsAsync(ScraperYad2StateModel state)
{
    Dictionary<string, bool> result = null;
    _log($"Load list-items");

    try
    {
        var filename = state.PathListItems;
        if (File.Exists(filename))
        {
            result = JsonConvert.DeserializeObject<Dictionary<string, bool>>(await File.ReadAllTextAsync(filename));
            _log($"Load list-items is completed");
        }
        else
        {
            // A missing file is an expected condition: report it directly instead of
            // throwing a message-less Exception just to reach the catch block.
            _log($"Load list-items is fail");
            _log($"Error L1. no file {filename}");
        }
    }
    catch (Exception exception)
    {
        _log($"Load list-items is fail");
        _log($"Error L1. {exception.Message}");
    }

    return result;
}
/// <summary>
/// Builds the pre-load feed URL and the target file path for one page and passes both to
/// <c>_downloadFilenameAsync</c>.
/// </summary>
/// <param name="page">Page number placed in the URL query and in the file name.</param>
/// <param name="state">Scraper state that supplies <c>PathPreLoads</c>.</param>
/// <returns>The value returned by <c>_downloadFilenameAsync</c>.</returns>
private async Task<bool> _downloadPreLoadAsync(int page, ScraperYad2StateModel state)
{
    var sourceUrl = $"https://www.yad2.co.il/api/pre-load/getFeedIndex/realestate/rent?page={page}&compact-req=1";
    var targetFile = $"{state.PathPreLoads}/page-{page}.json";

    return await _downloadFilenameAsync(sourceUrl, targetFile);
}
/// <summary>
/// Builds the item API URL and the target file path for one item and passes both to
/// <c>_downloadFilenameAsync</c>.
/// </summary>
/// <param name="item">Item identifier placed in the URL and in the file name.</param>
/// <param name="state">Scraper state that supplies <c>ItemsPath</c>.</param>
/// <returns>The value returned by <c>_downloadFilenameAsync</c>.</returns>
private async Task<bool> _downloadItemAsync(string item, ScraperYad2StateModel state)
{
    var sourceUrl = $"https://www.yad2.co.il/api/item/{item}";
    var targetFile = $"{state.ItemsPath}/{item}.json";

    return await _downloadFilenameAsync(sourceUrl, targetFile);
}
/// <summary>
/// Counts the distinct item ids found in the downloaded pages.
/// </summary>
/// <remarks>
/// Ids are compared case-insensitively (ordinal) and items whose id is <c>null</c> are not
/// counted. Nulls are filtered explicitly rather than mapped to a <c>"NULL"</c> sentinel key,
/// so an item whose id really is "NULL" is counted and cannot collide with the null group.
/// </remarks>
/// <param name="state">Scraper state forwarded to the page-items helper.</param>
/// <returns>Number of distinct non-null ids.</returns>
private int _statusWorkspace_AmountItemUniquesFromPages(ScraperYad2StateModel state)
{
    var items = _statusWorkspace_AmountItemsFromPages_GetItems(state);

    return items
        .Select(x => x.Id)
        .Where(id => id != null)
        .Distinct(StringComparer.OrdinalIgnoreCase)
        .Count();
}
/// <summary>
/// Serializes <paramref name="config"/> as indented JSON and writes it to
/// <c>state.ConfigFilename</c>, logging before and after the write.
/// </summary>
/// <param name="config">Configuration object to serialize.</param>
/// <param name="state">Scraper state that supplies the config file name.</param>
private void _saveConfig(ScraperYad2ConfigModel config, ScraperYad2StateModel state)
{
    var path = state.ConfigFilename;
    _log($"Start saving config:{path}");

    var json = JsonConvert.SerializeObject(config, Newtonsoft.Json.Formatting.Indented);
    File.WriteAllText(path, json);

    _log($"Save config:{path} is done");
}
/// <summary>
/// Phase 1: creates the list of pre-load pages unless one can already be loaded.
/// </summary>
/// <remarks>
/// When no list is loaded, page 1 is downloaded (up to 10 attempts with a pause between
/// them), the last page number is read from its pagination data, and a list with one entry
/// per page is saved; page 1 is marked as done because it has just been downloaded.
/// If every attempt fails the method only logs and returns.
/// </remarks>
/// <param name="state">Scraper state passed through to the load/download/save helpers.</param>
private async Task _scrapePhase1_GenerateListPreLoadsAsync(ScraperYad2StateModel state)
{
    SetWorkPhaseBase($"GenerateListPreLoads", state);

    var listPreLoads = await _loadListPreLoadsAsync(state);
    if (listPreLoads != null)
    {
        // A list already exists, so generation is skipped.
        _log($"Generate list-pre-loads is missing");
        return;
    }

    _log($"Generate new list-pre-loads");

    const int tryCountMax = 10;
    // Single source of truth for the pause so the log message cannot drift from the real delay.
    var retryPause = TimeSpan.FromSeconds(3);

    for (var tryCount = 1; tryCount <= tryCountMax; tryCount++)
    {
        if (tryCount > 1)
        {
            _log($"Try count {tryCount}/{tryCountMax}. Pause {retryPause.TotalSeconds} sec.");
            // Non-blocking wait: this is an async method, do not park the thread.
            await Task.Delay(retryPause);
        }

        var isDoneGetPage1 = await _downloadPreLoadAsync(page: 1, state: state);
        if (!isDoneGetPage1)
        {
            continue;
        }

        var page1Dto = _getPreloadFromFilestore(page: 1, state: state);
        var lastPage = int.Parse(page1Dto.Pagination.LastPage);
        _log($"Detect {lastPage} pages");

        var list = new Dictionary<int, bool>();
        foreach (var page in Enumerable.Range(1, lastPage))
        {
            list.Add(page, false);
        }

        // Page 1 is already on disk.
        list[1] = true;

        await _saveListPreLoadsAsync(list, state);
        return;
    }

    // Reports the number of attempts actually made.
    _log($"Tred {tryCountMax} download PreLoad Page1. Stop scrap process.");
}
/// <summary>
/// Creates the scraper, using a new default <c>ScraperYad2StateModel</c> when
/// <paramref name="state"/> is <c>null</c>, and loads the configuration for that state.
/// </summary>
/// <param name="state">State to use, or <c>null</c> for a default instance.</param>
public ScraperYad2(ScraperYad2StateModel state = null)
{
    _state = state ?? new ScraperYad2StateModel();
    _config = _loadConfig((ScraperYad2StateModel)_state);
}
/// <summary>
/// Applies command-line switches to <paramref name="state"/>.
/// </summary>
/// <remarks>
/// The only recognised switch is <c>-new</c>, which sets <c>state.IsNew</c> to <c>true</c>.
/// Matching is ordinal and case-insensitive, so it does not depend on the current culture,
/// and <c>null</c> entries are ignored instead of throwing.
/// </remarks>
/// <param name="args">Command-line arguments to scan.</param>
/// <param name="state">State object updated in place.</param>
private static void _parseParams(string[] args, ScraperYad2StateModel state)
{
    foreach (var arg in args)
    {
        if (string.Equals(arg, "-new", System.StringComparison.OrdinalIgnoreCase))
        {
            state.IsNew = true;
        }
    }
}
/// <summary>
/// Phase 5: creates the list of item-contacts unless one can already be loaded.
/// </summary>
/// <remarks>
/// When no list is loaded, every file in <c>state.PathPreLoads</c> is deserialized as a
/// pre-load page and the non-empty ids of its feed items are collected. The first occurrence
/// of an id goes into the list (flag <c>false</c>); repeated ids are written to
/// <c>state.ListItemsContactsDublicatesFilename</c>. File access uses the async APIs so the
/// method does not block while awaiting I/O.
/// </remarks>
/// <param name="state">Scraper state supplying the paths and passed to the load/save helpers.</param>
private async Task _scrapePhase5_GenerateListItemsContactsAsync(ScraperYad2StateModel state)
{
    SetWorkPhaseBase($"GenerateListItemsContacts", state);

    var listItemsContacts = await _loadListItemsContactsAsync(state);
    if (listItemsContacts != null)
    {
        // A list already exists, so generation is skipped.
        _log($"Generate list items-contacts is missing");
        return;
    }

    _log($"Generate new list items-contacts");
    listItemsContacts = new Dictionary<string, bool>();
    var dublicates = new List<string>();

    foreach (var file in Directory.GetFiles(state.PathPreLoads))
    {
        var fileData = await File.ReadAllTextAsync(file);
        var data = JsonConvert.DeserializeObject<PreloadDtoModel>(fileData);
        var ids = data.Feed.feed_items
            .Select(x => x.id)
            .Where(id => !string.IsNullOrEmpty(id));

        foreach (var id in ids)
        {
            // TryAdd does the membership test and the insert in one lookup.
            if (!listItemsContacts.TryAdd(id, false))
            {
                dublicates.Add(id);
            }
        }
    }

    _log($"Generate new list items-contacts is completed");
    _log($"Total uniq:{listItemsContacts.Count}, Dublicate:{dublicates.Count}, Total:{listItemsContacts.Count + dublicates.Count}");

    await File.WriteAllTextAsync($"{state.ListItemsContactsDublicatesFilename}", JsonConvert.SerializeObject(dublicates, Formatting.Indented));
    await _saveListItemsContactsAsync(listItemsContacts, state);
}
/// <summary>
/// Loads the scraper configuration from <c>state.ConfigFilename</c>.
/// </summary>
/// <remarks>
/// If the file does not exist, a default configuration is created and saved to that path.
/// If the file exists but deserializes to <c>null</c> (for example, it is empty), a default
/// configuration is returned and the file is left untouched.
/// </remarks>
/// <param name="state">Scraper state that supplies the config file name.</param>
/// <returns>A configuration object; never <c>null</c>.</returns>
private ScraperYad2ConfigModel _loadConfig(ScraperYad2StateModel state)
{
    var filename = state.ConfigFilename;

    if (!File.Exists(filename))
    {
        var defaults = new ScraperYad2ConfigModel();
        _saveConfig(defaults, state);
        return defaults;
    }

    var loaded = JsonConvert.DeserializeObject<ScraperYad2ConfigModel>(File.ReadAllText(filename));
    if (loaded == null)
    {
        // Guard against handing a null config to the constructor.
        _log($"Config:{filename} is empty, default config is used");
        loaded = new ScraperYad2ConfigModel();
    }

    return loaded;
}
/// <summary>
/// Phase 2: loads the list of pre-load pages and, when it is available, runs the
/// pre-load download routine over it.
/// </summary>
/// <param name="state">Scraper state passed through to the helpers.</param>
/// <returns><c>false</c> when the list could not be loaded; otherwise <c>true</c>.</returns>
private async Task<bool> _scrapePhase2_GetPreLoads(ScraperYad2StateModel state)
{
    SetWorkPhaseBase($"GetPreLoads", state);

    var pages = await _loadListPreLoadsAsync(state);
    if (pages == null)
    {
        _log($"Fail load list-pre-loads");
        return false;
    }

    await _scrapePhase2_DownloadsPreLoadsAsync(pages, state);
    return true;
}
/// <summary>
/// Counts the item files for which reading the file or deserializing it as
/// <c>Phase3ObjectDto</c> throws an exception. Each failure is logged.
/// </summary>
/// <param name="state">Scraper state forwarded to the file-listing helper.</param>
/// <returns>Number of files that raised an exception.</returns>
private int _statusWorkspace_AmountItemsWithWrongDataFromPath(ScraperYad2StateModel state)
{
    var brokenCount = 0;

    foreach (var file in _statusWorkspace_AmountItemsFromPath_GetFilesBase(state))
    {
        try
        {
            // Only the success or failure of parsing matters; the parsed object is discarded.
            JsonConvert.DeserializeObject<Phase3ObjectDto>(File.ReadAllText(file.FullName));
        }
        catch (Exception exception)
        {
            brokenCount++;
            _log($"Error w1. {exception.Message}");
        }
    }

    return brokenCount;
}
/// <summary>
/// Phase 4: loads the list of items and, when it is available, runs the item download
/// routine over it.
/// </summary>
/// <param name="state">Scraper state passed through to the helpers.</param>
/// <returns><c>false</c> when the list could not be loaded; otherwise <c>true</c>.</returns>
private async Task<bool> _scrapePhase4_GetItems(ScraperYad2StateModel state)
{
    SetWorkPhaseBase($"GetItems", state);

    var items = await _loadListItemsAsync(state);
    if (items == null)
    {
        _log($"Fail load list-items");
        return false;
    }

    await _scrapePhase4_DownloadsItemsAsync(items, state);
    return true;
}
/// <summary>
/// Phase 6: loads the list of item-contacts and, when it is available, runs the
/// item-contacts download routine over it.
/// </summary>
/// <remarks>
/// A failed load is reported through <c>_log</c>, like the other phases, and the message
/// names the list this phase actually works on.
/// </remarks>
/// <param name="state">Scraper state passed through to the helpers.</param>
/// <returns><c>false</c> when the list could not be loaded; otherwise <c>true</c>.</returns>
private async Task<bool> _scrapePhase6_GetItemsContacts(ScraperYad2StateModel state)
{
    var result = true;
    SetWorkPhaseBase($"GetItemsContacts", state);

    var listItemsContacts = await _loadListItemsContactsAsync(state);
    if (listItemsContacts != null)
    {
        await _scrapePhase6_DownloadsItemsContactsAsync(listItemsContacts, state);
    }
    else
    {
        _log($"Fail load list items-contacts");
        result = false;
    }

    return result;
}
/// <summary>
/// Runs scrape phases 1 through 6 in order.
/// </summary>
/// <remarks>
/// All phases are still executed in sequence, exactly as before; only the return value
/// changed. It now reflects the outcome reported by phases 2, 4 and 6 instead of being
/// <c>true</c> whenever no exception escaped.
/// </remarks>
/// <param name="state">Scraper state passed to every phase.</param>
/// <returns>
/// <c>true</c> when no exception was thrown and phases 2, 4 and 6 all returned <c>true</c>;
/// otherwise <c>false</c>.
/// </returns>
private async Task<bool> _scrapingAsync(ScraperYad2StateModel state)
{
    try
    {
        await _scrapePhase1_GenerateListPreLoadsAsync(state);
        var preLoadsOk = await _scrapePhase2_GetPreLoads(state);
        await _scrapePhase3_GenerateListItems(state);
        var itemsOk = await _scrapePhase4_GetItems(state);
        await _scrapePhase5_GenerateListItemsContactsAsync(state);
        var contactsOk = await _scrapePhase6_GetItemsContacts(state);

        return preLoadsOk && itemsOk && contactsOk;
    }
    catch (Exception exception)
    {
        _log($"Erro-x1. {exception.Message} / {exception.StackTrace}");
        return false;
    }
}
/// <summary>
/// Collects one <c>ItemTest</c> (with <c>Done = false</c>) for every feed item found in the
/// page files returned by <c>_statusWorkspace_AmountPages_GetFilesBase</c>.
/// </summary>
/// <remarks>
/// A page file that deserializes to <c>null</c>, or that has no feed or feed items,
/// contributes nothing instead of throwing.
/// </remarks>
/// <param name="state">Scraper state forwarded to the file-listing helper.</param>
/// <returns>The collected items; empty when there are none.</returns>
private List<ItemTest> _statusWorkspace_AmountItemsFromPages_GetItems(ScraperYad2StateModel state)
{
    var list = new List<ItemTest>();

    foreach (var page in _statusWorkspace_AmountPages_GetFilesBase(state))
    {
        var pageData = JsonConvert.DeserializeObject<PreloadDtoModel>(File.ReadAllText(page.FullName));

        var feedItems = pageData?.Feed?.feed_items;
        if (feedItems == null)
        {
            continue;
        }

        list.AddRange(feedItems.Select(x => new ItemTest { Id = x.id, Done = false }));
    }

    return list;
}
/// <summary>
/// Program entry point: sets the current thread's culture to en-US, constructs a
/// <c>ScraperYad2</c> with <c>IsNew = true</c>, and calls <c>UpdateRepository()</c>.
/// </summary>
/// <param name="args">Command-line arguments (not read here).</param>
static void Main(string[] args)
{
    Thread.CurrentThread.CurrentCulture = new CultureInfo("en-US");

    // Only the disabled calls below reference this instance.
    var scraper = new ScraperYad2(new ScraperYad2StateModel { IsNew = true });

    UpdateRepository();

    // Disabled calls, left as in the original:
    //Scrape(scraper);
    //GetExcelFile(scraper);
    //PrintSaveStatus(scraper);
    //SaveDomainModel(scraper);
    //LoadDomainModelFromFile(scraper);
}
/// <summary>
/// Reads <c>{state.PathPreLoads}/page-{page}.json</c> and deserializes it as a
/// <c>PreloadDtoModel</c>.
/// </summary>
/// <param name="page">Page number used in the file name.</param>
/// <param name="state">Scraper state that supplies <c>PathPreLoads</c>.</param>
/// <returns>The deserialized page.</returns>
private PreloadDtoModel _getPreloadFromFilestore(int page, ScraperYad2StateModel state)
{
    var path = $"{state.PathPreLoads}/page-{page}.json";
    var json = File.ReadAllText(path);

    return JsonConvert.DeserializeObject<PreloadDtoModel>(json);
}
/// <summary>
/// Writes <paramref name="list"/> as indented JSON to the file named by
/// <c>state.PathListItems</c>, logging the path first.
/// </summary>
/// <param name="list">Item list to persist.</param>
/// <param name="state">Scraper state that supplies the target path.</param>
private async Task _saveListItemsAsync(Dictionary<string, bool> list, ScraperYad2StateModel state)
{
    _log($"Save {state.PathListItems}");

    var target = $"{state.PathListItems}";
    var payload = JsonConvert.SerializeObject(list, Formatting.Indented);

    await File.WriteAllTextAsync(target, payload);
}
/// <summary>
/// Returns how many items <c>_statusWorkspace_AmountItemsFromPages_GetItems</c> yields for
/// <paramref name="state"/> (repeated ids are counted each time).
/// </summary>
/// <param name="state">Scraper state forwarded to the helper.</param>
/// <returns>Total number of items.</returns>
private int _statusWorkspace_AmountItemsFromPages(ScraperYad2StateModel state)
    => _statusWorkspace_AmountItemsFromPages_GetItems(state).Count;
/// <summary>
/// Writes <paramref name="list"/> as indented JSON to the file named by
/// <c>state.PathListPreLoads</c>, logging the path first.
/// </summary>
/// <param name="list">Pre-load page list to persist.</param>
/// <param name="state">Scraper state that supplies the target path.</param>
private async Task _saveListPreLoadsAsync(Dictionary<int, bool> list, ScraperYad2StateModel state)
{
    _log($"Save list-pre-loads: {state.PathListPreLoads}");

    var target = $"{state.PathListPreLoads}";
    var payload = JsonConvert.SerializeObject(list, Formatting.Indented);

    await File.WriteAllTextAsync(target, payload);
}
/// <summary>
/// Phase 7: builds domain models from the stored item and item-contacts JSON files.
/// </summary>
/// <remarks>
/// For every key in the loaded item list, the item file and the contacts file are read and
/// deserialized, then combined through <c>AdItemYad2DomainModel.FromDto</c>. A failure for
/// one key is recorded in the summary log line and does not stop the loop. Progress is
/// logged every 3000 keys.
/// </remarks>
/// <param name="state">Scraper state supplying <c>ItemsPath</c> and <c>PathItemsContacts</c>.</param>
/// <returns>
/// The list of successfully built models, or <c>null</c> when the item list is missing or empty.
/// </returns>
private async Task<List<AdItemYad2DomainModel>> _scrapePhase7_GenerateDomainModelAsync(ScraperYad2StateModel state)
{
    var listItems = await _loadListItemsAsync(state) ?? new Dictionary<string, bool>();

    // Stays null when there is nothing to parse; the loop below then never runs.
    List<AdItemYad2DomainModel> domainItems = listItems.Count > 0
        ? new List<AdItemYad2DomainModel>()
        : null;

    var report = new StringBuilder(1000);
    var processed = 0;

    foreach (var entry in listItems)
    {
        var key = entry.Key;
        try
        {
            var itemJson = await File.ReadAllTextAsync($"{state.ItemsPath}/{key}.json");
            var itemDto = JsonConvert.DeserializeObject<Phase3ObjectDto>(itemJson);

            var contactsJson = await File.ReadAllTextAsync($"{state.PathItemsContacts}/item-contacts-{key}.json");
            var contactsDto = JsonConvert.DeserializeObject<Phase3ObjectContactsDto>(contactsJson);

            domainItems.Add(new AdItemYad2DomainModel().FromDto(itemDto, contactsDto));
            report.Append($",{key}=ok");
        }
        catch (Exception exception)
        {
            report.Append($",{key}=fail, {exception.Message}");
        }

        processed++;
        if (processed % 3000 == 0)
        {
            _log($"Parsed {processed} items");
        }
    }

    _log($"Parsed items are {report.ToString()}");
    return domainItems;
}
/// <summary>
/// Downloads every page of <paramref name="list"/> whose flag is <c>false</c>, in batches of
/// <c>state.CountScrapers</c> parallel downloads, until no page is pending.
/// </summary>
/// <remarks>
/// After each batch, pages whose download returned <c>true</c> are marked done, the list is
/// saved, and — if any page of the batch failed or threw — the failed page numbers are
/// logged and the loop pauses for <c>state.CountWaitSecondForFailRequest</c> seconds before
/// retrying. There is no retry limit: a page that never succeeds keeps the loop running.
/// <para>
/// The batch is awaited with <c>Task.WhenAll</c>. The earlier 5-minute <c>Task.WaitAll</c>
/// timeout had no effect, because the results were then read with a blocking
/// <c>.Result</c>, so the wait semantics are unchanged.
/// </para>
/// </remarks>
/// <param name="list">Page number → done flag; updated in place.</param>
/// <param name="state">Scraper state supplying batch size and pause, and passed to helpers.</param>
private async Task _scrapePhase2_DownloadsPreLoadsAsync(Dictionary<int, bool> list, ScraperYad2StateModel state)
{
    IEnumerable<int> Pending() => list.Where(x => !x.Value).Select(x => x.Key);
    void ShowStat() => _log($"Downloads: done: {list.Count(x => x.Value)}, balance: {Pending().Count()}, total: {list.Count}");

    ShowStat();

    // A batch size below 1 would take zero pages per round and never make progress.
    var amountGetters = Math.Max(1, state.CountScrapers);
    if (Pending().Any())
    {
        _log($"Use {amountGetters} threads");
    }

    while (Pending().Any())
    {
        var batch = Pending().Take(amountGetters).ToList();
        var tasks = batch
            .Select(page => Task.Run(() => _downloadPreLoadAsync(page, state)))
            .ToList();

        try
        {
            await Task.WhenAll(tasks);
        }
        catch (Exception exception)
        {
            // Individual task outcomes are inspected below; here we only record the error.
            _log($"Error D2. {exception.Message}");
        }

        // Evaluate each page on its own so one faulted download does not discard
        // the successful ones from the same batch.
        var failedPages = new List<int>();
        for (var i = 0; i < batch.Count; i++)
        {
            var task = tasks[i];
            if (task.Status == TaskStatus.RanToCompletion && task.Result)
            {
                list[batch[i]] = true;
            }
            else
            {
                failedPages.Add(batch[i]);
            }
        }

        // Persist progress right after it changes so an interrupted run can resume.
        await _saveListPreLoadsAsync(list, state);

        if (failedPages.Count > 0)
        {
            _log($"Error pass page {string.Join(", ", failedPages)}");
            _log($"Pause {state.CountWaitSecondForFailRequest} sec");
            await Task.Delay(TimeSpan.FromSeconds(state.CountWaitSecondForFailRequest));
        }

        ShowStat();
    }

    _log($"List pre-loads completed");
    await _saveListPreLoadsAsync(list, state);
}