/// <summary>
/// Downloads every list item whose <c>Done</c> flag is still false, keeping at most
/// <c>maxItems</c> downloads in flight at any time.
/// </summary>
/// <param name="state">Scraper state passed through to the load, fix and download helpers.</param>
/// <returns>A task that completes once all started downloads have finished.</returns>
private async Task _scrapePhase3Inner(ScraperWinWinStateModel state)
{
    const int maxItems = 50;

    var listItems = await _loadListItemsAsync(state);
    await _fixListItems(listItems, state);

    // Returns up to `count` items that are not yet marked as done.
    Func<int, List<ShortItemDtoModel>> takePending = count =>
        listItems.Where(x => x.Value.Done == false).Select(x => x.Value).Take(count).ToList();

    var running = new List<Task<bool>>();
    var pending = takePending(maxItems);

    // A non-empty batch always adds tasks, so WhenAny below never sees an empty collection.
    while (pending.Count > 0)
    {
        foreach (var shortItem in pending)
        {
            var item = shortItem;

            // Mark the item up front so the next takePending call skips it while it is in flight;
            // the flag is then overwritten with the real download result (false makes it eligible again).
            item.Done = true;
            running.Add(Task.Run(async () => item.Done = await _downloadItemAsync(item, state)));
        }

        // Pause between batches, then wait for at least one free slot - without blocking the thread.
        await Task.Delay(1000 * 3);
        await Task.WhenAny(running);

        running.RemoveAll(x => x.IsCompleted);
        pending = takePending(maxItems - running.Count);
    }

    await Task.WhenAll(running);
}
/// <summary>
/// Navigates the main browser window to the rental search-results page and waits until the
/// element with id <c>carAreasDV</c> is visible.
/// </summary>
/// <remarks>
/// Any exception triggers a re-initialization through <c>_initSelenoidBase</c>, a 5 second pause
/// and another attempt. After 20 failed attempts the method returns without reporting the failure.
/// </remarks>
/// <param name="selenoidState">Holder of the browser window and its wait object.</param>
/// <param name="state">Scraper state forwarded to <c>_initSelenoidBase</c>.</param>
private void _openMainPage(SelenoidStateModel selenoidState, ScraperWinWinStateModel state)
{
    const int maxAttempts = 20;
    const string url = "https://www.winwin.co.il/RealEstate/ForRent/Search/SearchResults/RealEstatePage.aspx?search=8bdb5277c594afddcf9414e7541fd518";

    var failedAttempts = 0;
    while (true)
    {
        try
        {
            selenoidState.WindowMain.Navigate().GoToUrl(url);
            selenoidState.WaitMain.Until(ExpectedConditions.ElementIsVisible(By.Id("carAreasDV")));
            return;
        }
        catch
        {
            failedAttempts++;
            if (failedAttempts >= maxAttempts)
            {
                // Attempts exhausted: give up quietly.
                return;
            }

            _initSelenoidBase(selenoidState, state);
            Thread.Sleep(1000 * 5);
        }
    }
}
/// <summary>
/// Merges the short items stored in every page file into the persisted list of items.
/// Items already present in the list are left untouched; new ones are added with
/// <c>Done = false</c>.
/// </summary>
/// <param name="state">Scraper state used to locate the page files and the item list.</param>
/// <returns>A task that completes after the merged list has been saved.</returns>
private async Task ScrapePhase2_GenerateListItemsAsync(ScraperWinWinStateModel state)
{
    SetWorkPhaseBase("GenerateListItems", state);

    var listFilePages = _loadListPages(state);
    var listItems = await _loadListItemsAsync(state);

    foreach (var filePage in listFilePages)
    {
        var filename = $"{filePage.FullName}";
        var listShortItems = JsonConvert.DeserializeObject<List<ShortItemDtoModel>>(await File.ReadAllTextAsync(filename));
        if (listShortItems == null)
        {
            // An empty file or a literal "null" deserializes to null; there is nothing to merge from it.
            continue;
        }

        foreach (var shortItem in listShortItems)
        {
            // TryAdd only succeeds for ids not seen before, so existing entries keep their Done flag.
            if (listItems.TryAdd(shortItem.ItemId, shortItem))
            {
                shortItem.Done = false;
            }
        }
    }

    await _saveListItemsAsync(listItems, state);
    LogDone(state);
}
/// <summary>
/// Optionally runs the scraper, converts its domain model to an Excel file and hands that file
/// to the archive repository. Start and end (with spent time) are logged.
/// </summary>
/// <param name="isNew">Value copied into the scraper state's <c>IsNew</c> property.</param>
/// <param name="needScrape">When false, scraping is skipped and only the export/archive steps run.</param>
private void _winWinScrapeThenSaveStore(bool isNew = false, bool needScrape = true)
{
    _log($"Start WinWinScrapeThenSaveStore (isNew={isNew})");

    var state = new ScraperWinWinStateModel { IsNew = isNew };
    var scraper = new ScraperWinWin(state);
    if (needScrape)
    {
        scraper.Scrape();
    }

    // Export: domain model -> Excel data -> file on disk.
    var domainModel = scraper.GetDomainModel();
    var excel = new ExcelWinWinService(state);
    var pathToFile = excel.SaveToFile(excel.CreateExcel(domainModel));

    new ArchiveRepository().Save(pathToFile, state.TypeScraper);

    _log($"End WinWinScrapeThenSaveStore (isNew={isNew}), Spent time {_calcSpentTime2String(state)}");
}
/// <summary>
/// Phase 3 entry point: registers the "DownloadItems" work phase, runs the item download
/// routine and logs completion.
/// </summary>
/// <param name="state">Scraper state shared by all phase helpers.</param>
private async Task ScrapePhase3Async(ScraperWinWinStateModel state)
{
    const string phaseName = "DownloadItems";
    SetWorkPhaseBase(phaseName, state);

    await _scrapePhase3Inner(state);

    LogDone(state);
}
/// <summary>
/// Tells whether the JSON file for the given item already exists in the items folder.
/// </summary>
/// <param name="shortItem">Item whose <c>ItemId</c> forms the file name.</param>
/// <param name="state">Scraper state providing <c>ItemsPath</c>.</param>
/// <returns>True when <c>{ItemsPath}/{ItemId}.json</c> exists.</returns>
private bool _isDownloadedItem(ShortItemDtoModel shortItem, ScraperWinWinStateModel state)
    => File.Exists($"{state.ItemsPath}/{shortItem.ItemId}.json");
/// <summary>
/// Serializes the item as indented JSON into <c>{ItemsPath}/{ItemId}.json</c> and logs the save.
/// </summary>
/// <param name="item">Item to persist.</param>
/// <param name="state">Scraper state providing <c>ItemsPath</c>.</param>
private async Task _saveItemDtoAsync(AdItemWinWinDtoModel item, ScraperWinWinStateModel state)
{
    var filename = $"{state.ItemsPath}/{item.ItemId}.json";
    var json = JsonConvert.SerializeObject(item, Newtonsoft.Json.Formatting.Indented);

    await File.WriteAllTextAsync(filename, json);

    _log($"Save item {item.ItemId}, filename:{filename}");
}
/// <summary>
/// Returns how many items <c>_statusWorkspace_AmountItemsFromPages_GetItems</c> yields for the
/// given state. No de-duplication by id is applied here.
/// </summary>
/// <param name="state">Scraper state forwarded to the item collector.</param>
private int _statusWorkspace_AmountItemsFromPages(ScraperWinWinStateModel state)
    => _statusWorkspace_AmountItemsFromPages_GetItems(state).Count;
/// <summary>
/// Writes the configuration as indented JSON to <c>state.ConfigFilename</c> and logs the save.
/// </summary>
/// <param name="config">Configuration to persist.</param>
/// <param name="state">Scraper state providing the target file name.</param>
private void _saveScraperConfig(ScraperWinWinConfigModel config, ScraperWinWinStateModel state)
{
    var target = state.ConfigFilename;
    var json = JsonConvert.SerializeObject(config, Newtonsoft.Json.Formatting.Indented);

    File.WriteAllText(target, json);

    _log($"Save config:{target} is done");
}
/// <summary>
/// Phase 1 entry point: registers the "Phase-1" work phase, runs the inner phase routine and
/// logs completion.
/// </summary>
/// <param name="state">Scraper state shared by all phase helpers.</param>
private async Task ScrapePhase1Async(ScraperWinWinStateModel state)
{
    const string phaseName = "Phase-1";
    SetWorkPhaseBase(phaseName, state);

    await _scrapePhase1InnerAsync(state);

    LogDone(state);
}
/// <summary>
/// Writes the page list as indented JSON to <c>state.ListPagesFilename</c> and logs the save.
/// </summary>
/// <param name="list">Page list to persist, keyed by page number.</param>
/// <param name="state">Scraper state providing the target file name.</param>
private void _saveListPage(Dictionary<int, bool> list, ScraperWinWinStateModel state)
{
    var target = $"{state.ListPagesFilename}";
    var json = JsonConvert.SerializeObject(list, Newtonsoft.Json.Formatting.Indented);

    File.WriteAllText(target, json);

    _log($"Saved list-page:{state.ListPagesFilename}");
}
/// <summary>
/// Writes the region list as indented JSON to <c>state.ListRegionsFilename</c> and logs the save.
/// </summary>
/// <param name="list">Regions to persist.</param>
/// <param name="state">Scraper state providing the target file name.</param>
private void _saveListRegions(List<RegionModel> list, ScraperWinWinStateModel state)
{
    var target = $"{state.ListRegionsFilename}";
    var json = JsonConvert.SerializeObject(list, Newtonsoft.Json.Formatting.Indented);

    File.WriteAllText(target, json);

    _log($"Save list-regions, filename:{target}");
}
/// <summary>
/// Reads <c>{ItemsPath}/{itemId}.json</c> and deserializes it into a short item.
/// </summary>
/// <param name="itemId">Id used to build the file name.</param>
/// <param name="state">Scraper state providing <c>ItemsPath</c>.</param>
/// <returns>The deserialized short item.</returns>
private ShortItemDtoModel _loadShortItems(string itemId, ScraperWinWinStateModel state)
{
    var path = $"{state.ItemsPath}/{itemId}.json";
    var json = File.ReadAllText(path);
    return JsonConvert.DeserializeObject<ShortItemDtoModel>(json);
}
/// <summary>
/// Returns the number of pages to process. The actual lookup is commented out, so the result
/// is currently always 0.
/// </summary>
/// <param name="state">Scraper state (unused while the lookup is disabled).</param>
private int _getAmountPages(ScraperWinWinStateModel state)
{
    // Disabled lookup, kept for reference:
    //   var page = _loadPageAsync(1, state).Result;
    //   result = _detectLastNumPage(page);
    const int amountPages = 0;
    return amountPages;
}
/// <summary>
/// Appends one line with the comma-separated failed item ids to <c>state.LogErrorFilename</c>
/// and logs that the errors were saved.
/// </summary>
/// <param name="list">Ids of the items that failed.</param>
/// <param name="state">Scraper state providing the error-log file name.</param>
private void _regErrorScrape_ListItemIds(List<string> list, ScraperWinWinStateModel state)
{
    var errorLog = state.LogErrorFilename;
    var joinedIds = string.Join(",", list);

    File.AppendAllText(errorLog, "Errors itemIds:" + joinedIds + "\r\n");

    _log($"Save errors");
}
/// <summary>
/// Loads the stored regions and downloads the pages of each region, one region after another.
/// </summary>
/// <param name="state">Scraper state forwarded to the load and download helpers.</param>
private async Task _scrapePhase2Inner(ScraperWinWinStateModel state)
{
    var regions = await _loadListRegionsAsync(state);
    regions.ForEach(region => _downloadRegionPages(region, state));
}
/// <summary>
/// Extracts short items from a page: first from nodes matching <c>.paid</c>, then from nodes
/// matching <c>.TitleData</c>.
/// </summary>
/// <param name="document">Parsed HTML page.</param>
/// <param name="state">Scraper state (not used by this method).</param>
/// <returns>All extracted short items, <c>.paid</c> results first.</returns>
private List<ShortItemDtoModel> _parsePage(HtmlDocument document, ScraperWinWinStateModel state)
{
    var items = new List<ShortItemDtoModel>(_getItemShortFromNodes(document.DocumentNode.CssSelect(".paid")));
    items.AddRange(_getItemShortFromNodes(document.DocumentNode.CssSelect(".TitleData")));
    return items;
}
/// <summary>
/// Quits the main browser window, if there is one, and logs the shutdown.
/// </summary>
/// <param name="selenoidState">Holder of the browser window.</param>
/// <param name="state">Scraper state (not used by this method).</param>
private void _closeSelenoidBase(SelenoidStateModel selenoidState, ScraperWinWinStateModel state)
{
    if (selenoidState.WindowMain == null)
    {
        // Nothing was opened, so there is nothing to quit or log.
        return;
    }

    selenoidState.WindowMain.Quit();
    _log($"Close Selenoid Service done");
}
/// <summary>
/// Starts one download task for every page number from 1 to <c>region.AmountPages</c> and
/// blocks until all of them have finished.
/// </summary>
/// <param name="region">Region whose pages are downloaded.</param>
/// <param name="state">Scraper state forwarded to the page download helper.</param>
private void _downloadRegionPages(RegionModel region, ScraperWinWinStateModel state)
{
    // ToArray forces every task to be started before the blocking wait begins.
    Task<bool>[] downloads = Enumerable
        .Range(1, region.AmountPages)
        .Select(page => Task.Run(() => _downloadRegionPageAsync(page, region, state)))
        .ToArray();

    Task.WaitAll(downloads);
}
/// <summary>
/// Creates the scraper around the given state, or around a fresh default state when none is
/// supplied.
/// </summary>
/// <param name="state">State to use; null means a new <c>ScraperWinWinStateModel</c> is created.</param>
public ScraperWinWin(ScraperWinWinStateModel state = null)
{
    _state = state ?? new ScraperWinWinStateModel();

    // Config loading is currently commented out:
    //   _config = _loadScraperConfig(state);
}
/// <summary>
/// Reads every file in <c>state.PagesPath</c>, deserializes each one as a list of short items
/// and returns all of them in a single list.
/// </summary>
/// <param name="state">Scraper state providing the pages folder.</param>
/// <returns>The combined items of all page files; empty when there are none.</returns>
private List<ShortItemDtoModel> _loadShortItemsFromPages(ScraperWinWinStateModel state)
{
    var result = new List<ShortItemDtoModel>();
    var pagesFolder = new DirectoryInfo(state.PagesPath);

    foreach (var file in pagesFolder.GetFiles())
    {
        var pageItems = JsonConvert.DeserializeObject<List<ShortItemDtoModel>>(File.ReadAllText(file.FullName));

        // An empty file or a literal "null" deserializes to null; AddRange(null) would throw.
        if (pageItems != null)
        {
            result.AddRange(pageItems);
        }
    }

    return result;
}
/// <summary>
/// Reads the given file, deserializes it into an item DTO and logs the short file name.
/// </summary>
/// <param name="file">Path of the JSON file to read.</param>
/// <param name="state">Scraper state (not used by this method).</param>
/// <returns>The deserialized item DTO.</returns>
private async Task<AdItemWinWinDtoModel> _loadItemDtoAsync(string file, ScraperWinWinStateModel state)
{
    // Same null-to-empty normalization that string interpolation of the path would perform.
    var path = file ?? string.Empty;

    var json = await File.ReadAllTextAsync(path);
    var dto = JsonConvert.DeserializeObject<AdItemWinWinDtoModel>(json);

    _log($"Load itemDto from filename:{Path.GetFileName(path)}");
    return dto;
}
/// <summary>
/// Loads the region list from <c>state.ListRegionsFilename</c>.
/// </summary>
/// <param name="state">Scraper state providing the file name.</param>
/// <returns>
/// The stored regions, or an empty list when the file is missing or its content deserializes
/// to null. Never null.
/// </returns>
private async Task<List<RegionModel>> _loadListRegionsAsync(ScraperWinWinStateModel state)
{
    var list = new List<RegionModel>();
    var filename = $"{state.ListRegionsFilename}";

    if (File.Exists(filename))
    {
        // An empty file or a literal "null" deserializes to null; keep the empty default then,
        // so callers can iterate the result without a null check.
        list = JsonConvert.DeserializeObject<List<RegionModel>>(await File.ReadAllTextAsync(filename)) ?? list;
    }

    _log($"Load list-regions is done (file:{filename})");
    return list;
}
/// <summary>
/// Builds a fresh page list with one entry per page number (1 up to the value returned by
/// <c>_getAmountPages</c>), each mapped to false, and saves it.
/// </summary>
/// <param name="state">Scraper state forwarded to the page-count and save helpers.</param>
private void _generateNewListPages(ScraperWinWinStateModel state)
{
    _log($"Generate new list-pages");

    var amountPages = _getAmountPages(state);
    var listPages = Enumerable
        .Range(1, amountPages)
        .ToDictionary(page => page, _ => false);

    _log($"Generated list-pages is done");
    _saveListPage(listPages, state);
}
/// <summary>
/// Loads the scraper configuration from <c>state.ConfigFilename</c>. When the file does not
/// exist, a default configuration is created, saved and returned.
/// </summary>
/// <param name="state">Scraper state providing the config file name.</param>
/// <returns>The deserialized configuration, or the newly saved default one.</returns>
private ScraperWinWinConfigModel _loadScraperConfig(ScraperWinWinStateModel state)
{
    var filename = state.ConfigFilename;
    if (File.Exists(filename))
    {
        return JsonConvert.DeserializeObject<ScraperWinWinConfigModel>(File.ReadAllText(filename));
    }

    // First run: persist the defaults so the file exists next time.
    var defaults = new ScraperWinWinConfigModel();
    _saveScraperConfig(defaults, state);
    return defaults;
}
/// <summary>
/// Reads every page file returned by <c>_statusWorkspace_AmountPages_GetFilesBase</c> and
/// produces one <c>ItemTest</c> (with <c>Done = false</c>) per short item found in them.
/// </summary>
/// <param name="state">Scraper state forwarded to the page-file lookup.</param>
/// <returns>One entry per short item across all page files, in file order.</returns>
private List<ItemTest> _statusWorkspace_AmountItemsFromPages_GetItems(ScraperWinWinStateModel state)
{
    var list = new List<ItemTest>();

    foreach (var page in _statusWorkspace_AmountPages_GetFilesBase(state))
    {
        var pageData = JsonConvert.DeserializeObject<List<ShortItemDtoModel>>(File.ReadAllText(page.FullName));
        if (pageData == null)
        {
            // An empty file or a literal "null" deserializes to null; it contributes no items.
            continue;
        }

        list.AddRange(pageData.Select(x => new ItemTest { Id = x.ItemId, Done = false }));
    }

    return list;
}
/// <summary>
/// Loads the item's page, parses it into a DTO and saves the DTO.
/// </summary>
/// <param name="shortItem">Item to download.</param>
/// <param name="state">Scraper state forwarded to the load and save helpers.</param>
/// <returns>
/// True when all three steps completed; false when any of them threw (the exception message
/// is logged and not rethrown).
/// </returns>
private async Task<bool> _downloadItemAsync(ShortItemDtoModel shortItem, ScraperWinWinStateModel state)
{
    try
    {
        var page = await _loadItemPageAsync(shortItem, state);
        var dto = await _parseItemPageAsync(page, shortItem);
        await _saveItemDtoAsync(dto, state);
        return true;
    }
    catch (Exception exception)
    {
        _log($"Error-z2. {exception.Message}");
        return false;
    }
}
/// <summary>
/// Console entry point. Forces the en-US culture on the current thread, builds a scraper,
/// calls <c>UpdateRepository</c> and then fetches a single hard-coded item.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Thread.CurrentThread.CurrentCulture = new CultureInfo("en-US");

    var scraper = new ScraperWinWin(new ScraperWinWinStateModel { IsNew = false });

    UpdateRepository();

    // Other entry points, currently switched off:
    //   Scrape(scraper);
    //   GetExcelFile(scraper);
    //   PrintSaveStatus(scraper);

    // Blocks until the item is loaded; the value itself is not used further.
    var itemDto = scraper.GetItemDtoAsync("4389448").Result;
}
/// <summary>
/// Downloads the ad page of the given item from winwin.co.il.
/// </summary>
/// <param name="shortItem">Item whose <c>ItemId</c> is placed into the page URL.</param>
/// <param name="state">Scraper state (not used by this method).</param>
/// <returns>The loaded document, or null when loading threw (the message is logged).</returns>
private async Task<HtmlDocument> _loadItemPageAsync(ShortItemDtoModel shortItem, ScraperWinWinStateModel state)
{
    var itemId = shortItem.ItemId;
    _log($"Loading item-id:{itemId}");

    var url = $"https://www.winwin.co.il/RealEstate/ForRent/Ads/RealEstateAds,{itemId}.aspx";
    var webGet = new HtmlWeb();

    try
    {
        return await webGet.LoadFromWebAsync(url);
    }
    catch (Exception exception)
    {
        _log($"Error-z3. {exception.Message}");
        return null;
    }
}
/// <summary>
/// Loads the item list from <c>state.ListItemsFilename</c>. When the file does not exist, an
/// empty list is saved (creating the file) and returned.
/// </summary>
/// <param name="state">Scraper state providing the file name.</param>
/// <returns>
/// The stored items keyed by item id, or an empty dictionary when the file is missing or its
/// content deserializes to null. Never null.
/// </returns>
private async Task<Dictionary<string, ShortItemDtoModel>> _loadListItemsAsync(ScraperWinWinStateModel state)
{
    var filename = $"{state.ListItemsFilename}";

    if (File.Exists(filename))
    {
        var stored = JsonConvert.DeserializeObject<Dictionary<string, ShortItemDtoModel>>(await File.ReadAllTextAsync(filename));

        // An empty file or a literal "null" deserializes to null; callers dereference the
        // result right away, so hand back an empty dictionary instead.
        return stored ?? new Dictionary<string, ShortItemDtoModel>();
    }

    var listItems = new Dictionary<string, ShortItemDtoModel>();
    await _saveListItemsAsync(listItems, state);
    return listItems;
}