Esempio n. 1
0
        /// <summary>
        /// Downloads every page that is still marked as not done in the list of pages,
        /// records the outcome of each download in that list and then saves the list.
        /// </summary>
        /// <param name="state">Scraper state handed to the load, download and save helpers.</param>
        private async Task ScrapePhase1_DownloadPages_Scrapy(ScraperHomeLessStateModel state)
        {
            SetWorkPhaseBase("DownloadPages", state);

            var listPages = _loadListPages(state);

            fixListPages(listPages, state);

            var needToDo = listPages.Where(x => x.Value == false).Select(x => x.Key).ToList();

            if (needToDo.Count > 0)
            {
                // Each task only returns its outcome; the dictionary is not touched from worker threads.
                var tasks = needToDo
                            .Select(page => Task.Run(async() => await GetPageAsync(page, state)))
                            .ToList();

                // Wait without blocking the calling thread; outcomes come back in the order of needToDo.
                var outcomes = await Task.WhenAll(tasks);

                for (var i = 0; i < needToDo.Count; i++)
                {
                    listPages[needToDo[i]] = outcomes[i];
                }

                // Save only after all downloads finished, so the stored list reflects their results.
                await _saveListPagesAsync(listPages, state);
            }

            _log($"Phase done");
        }
Esempio n. 2
0
        /// <summary>
        /// Collects the detail parts of one ad (coordinates, phones, details, ad details)
        /// and saves them, together with the ad itself, under the ad's id.
        /// </summary>
        /// <param name="item">Ad whose details are fetched; its Id is used as the storage key.</param>
        /// <param name="state">Scraper state handed to the coordinate and save helpers.</param>
        /// <returns><c>true</c> when the details were saved; <c>false</c> when any step threw (the error is logged).</returns>
        private async Task <bool> _downloadItemAsync(AdDtoModel item, ScraperHomeLessStateModel state)
        {
            // Read outside the try block, exactly like the original: a failure here propagates to the caller.
            var itemId = item.Id;

            try
            {
                // All four helper calls are made up front; their results are awaited afterwards.
                var adDetailsPending   = _getAdDetailsFromService(item);
                var coordinatesPending = _getCoordinatesFromServiceAsync(item, state);
                var phonesPending      = _getPhonesFromService(item);
                var detailsPending     = _getDetailsFromService(item);

                var dto = new DetailsItemDtoModel();
                dto.Coordinates     = await coordinatesPending;
                dto.Phones          = await phonesPending;
                dto.Details         = await detailsPending;
                dto.AdDetails       = await adDetailsPending;
                dto.RowDataFromPage = item;

                await _saveItemDetailsAsync(itemId, dto, state);

                return true;
            }
            catch (Exception exception)
            {
                _log($"Error-f1. {exception.Message} / {exception.StackTrace}");

                return false;
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Runs the whole pipeline: optionally scrape, build the domain model,
        /// create an Excel file from it, save the file and hand it to the archive repository.
        /// </summary>
        /// <param name="isNew">Value stored in the state's IsNew property.</param>
        /// <param name="needScrape">When <c>false</c> the scrape step is skipped and only the export runs.</param>
        private void _homeLessScrapeThenSaveStore(bool isNew = false, bool needScrape = true)
        {
            _log($"Start HomeLessScrapeThenSaveStore (isNew={isNew})");

            var state = new ScraperHomeLessStateModel();
            state.IsNew = isNew;

            var scraper = new ScraperHomeLess(state);
            if (needScrape)
            {
                scraper.Scrape();
            }

            var scrapedModel = scraper.GetDomainModel();

            // Export: build the workbook and write it to disk.
            var excel        = new ExcelHomeLessService(state);
            var workbook     = excel.CreateExcel(scrapedModel);
            var exportedPath = excel.SaveToFile(workbook);

            // Archive the exported file under the scraper type taken from the state.
            new ArchiveRepository().Save(exportedPath, state.TypeScraper);

            _log($"End HomeLessScrapeThenSaveStore (isNew={isNew}), Spent time {_calcSpentTime2String(state)}");
        }
Esempio n. 4
0
        /// <summary>
        /// Returns how many items the page-item helper yields for the given state
        /// (every entry is counted; no de-duplication is applied).
        /// </summary>
        /// <param name="state">Scraper state handed to the item-collecting helper.</param>
        private int _statusWorkspace_AmountItemsFromPages(ScraperHomeLessStateModel state)
        {
            var itemsFromPages = _statusWorkspace_AmountItemsFromPages_GetItems(state);

            return itemsFromPages.Count;
        }
Esempio n. 5
0
        /// <summary>
        /// Writes <paramref name="response"/> to the file "{state.ItemsPath}/{id}.xml"
        /// and logs the file name afterwards.
        /// </summary>
        /// <param name="id">Item id; used as the file name without extension.</param>
        /// <param name="response">Text content to write.</param>
        /// <param name="state">Scraper state providing ItemsPath.</param>
        private async Task _saveItemAsync(string id, string response, ScraperHomeLessStateModel state)
        {
            var targetFile = $"{state.ItemsPath}/{id}.xml";

            await File.WriteAllTextAsync(targetFile, response);
            _log($"Save xml-data file {targetFile}");
        }
Esempio n. 6
0
        /// <summary>
        /// Serializes the ads of one page as indented JSON into
        /// "{state.PagesPath}/page-{page}.json" and logs the file name.
        /// </summary>
        /// <param name="page">Page number used in the file name.</param>
        /// <param name="listAdFromPage">Ads to serialize.</param>
        /// <param name="state">Scraper state providing PagesPath.</param>
        private void _savePage(int page, List <AdDtoModel> listAdFromPage, ScraperHomeLessStateModel state)
        {
            var targetFile = $"{state.PagesPath}/page-{page}.json";
            var json       = JsonConvert.SerializeObject(listAdFromPage, Newtonsoft.Json.Formatting.Indented);

            File.WriteAllText(targetFile, json);

            _log($"Save file {targetFile}");
        }
Esempio n. 7
0
        /// <summary>
        /// Makes sure a Selenoid state object exists (creating one on first use)
        /// and passes it to the base initialisation routine.
        /// </summary>
        /// <param name="state">Scraper state forwarded to the base initialisation.</param>
        private void _initSelenoid(ScraperHomeLessStateModel state)
        {
            // Reuse the existing state object; create one only when none exists yet.
            _selenoidState = _selenoidState ?? new SelenoidStateModel();

            _initSelenoidBase(_selenoidState, state);
        }
Esempio n. 8
0
        /// <summary>
        /// Serializes the list of items as indented JSON into the file named by
        /// state.ListItemsFilename, logging before and after the write.
        /// </summary>
        /// <param name="list">Items keyed by id.</param>
        /// <param name="state">Scraper state providing ListItemsFilename.</param>
        private void _saveListItems(Dictionary <string, AdDtoModel> list, ScraperHomeLessStateModel state)
        {
            var targetFile = state.ListItemsFilename;
            _log($"Saving list-items: {targetFile}");

            var json = JsonConvert.SerializeObject(list, Newtonsoft.Json.Formatting.Indented);
            File.WriteAllText($"{targetFile}", json);

            _log($"Save list-items done");
        }
Esempio n. 9
0
        /// <summary>
        /// Parses the ads of both item types (Rent and RentTivuch) out of one HTML page
        /// and returns them in a single list, Rent entries first.
        /// </summary>
        /// <param name="html">Loaded HTML document of the page.</param>
        /// <param name="state">Not used by this method.</param>
        private List <AdDtoModel> ParseAdsFromPage(HtmlDocument html, ScraperHomeLessStateModel state)
        {
            var rentAds       = ParseAdsFromPage_Rent(html, EnumTypeItems.Rent);
            var rentTivuchAds = ParseAdsFromPage_Rent(html, EnumTypeItems.RentTivuch);

            var combined = new List <AdDtoModel>();
            combined.AddRange(rentAds);
            combined.AddRange(rentTivuchAds);

            return combined;
        }
Esempio n. 10
0
        /// <summary>
        /// Fetches page 1 and extracts the total amount of pages from it.
        /// </summary>
        /// <param name="state">Scraper state forwarded to the page download.</param>
        /// <returns>The amount of pages reported by the page-count parser.</returns>
        private async Task <int> ScrapePhase1_GetAmountPages_WebClientAsync(ScraperHomeLessStateModel state)
        {
            var firstPage = await GetPage_WebClientAsync(1, state);

            return ScrapeAmountPages(firstPage);
        }
Esempio n. 11
0
        /// <summary>
        /// Loads the list of items from the JSON file named by state.ListItemsFilename.
        /// </summary>
        /// <param name="state">Scraper state providing ListItemsFilename.</param>
        /// <returns>The deserialized items, or <c>null</c> when the file does not exist.</returns>
        private Dictionary <string, AdDtoModel> _loadListItems(ScraperHomeLessStateModel state)
        {
            var sourceFile = state.ListItemsFilename;

            if (!File.Exists(sourceFile))
            {
                return null;
            }

            var json = File.ReadAllText(sourceFile);

            return JsonConvert.DeserializeObject <Dictionary <string, AdDtoModel> >(json);
        }
Esempio n. 12
0
        /// <summary>
        /// Creates the scraper: stores the given state (or a fresh one when none is supplied),
        /// loads the scraper configuration and runs the directory check on the state's root path.
        /// </summary>
        /// <param name="state">Optional scraper state; a default instance is created when <c>null</c>.</param>
        public ScraperHomeLess(ScraperHomeLessStateModel state = null)
        {
            var effectiveState = state ?? new ScraperHomeLessStateModel();

            _state = effectiveState;

            _loadScraperConfig(effectiveState);
            _checkDirectory(_state.RootPath);
        }
Esempio n. 13
0
        /// <summary>
        /// Loads the ads stored for one page from "{state.PagesPath}/page-{page}.json".
        /// </summary>
        /// <param name="page">Page number used in the file name.</param>
        /// <param name="state">Scraper state providing PagesPath.</param>
        /// <returns>The deserialized ads, or <c>null</c> when the file does not exist.</returns>
        private List <AdDtoModel> _loadPage(int page, ScraperHomeLessStateModel state)
        {
            var sourceFile = $"{state.PagesPath}/page-{page}.json";

            if (!File.Exists(sourceFile))
            {
                return null;
            }

            var json = File.ReadAllText(sourceFile);

            return JsonConvert.DeserializeObject <List <AdDtoModel> >(json);
        }
Esempio n. 14
0
        /// <summary>
        /// Opens the given page through Selenoid and, when that succeeds, collects the ads
        /// of both item types (Rent, then RentTivuch) into one list.
        /// </summary>
        /// <param name="page">Page number to open.</param>
        /// <param name="state">Scraper state forwarded to the page navigation.</param>
        /// <returns>The collected ads; an empty list when the page could not be opened.</returns>
        private List <AdDtoModel> _parsePage(int page, ScraperHomeLessStateModel state)
        {
            var ads = new List <AdDtoModel>();

            if (!_getPage_Selenoid(page: page, state: state))
            {
                return ads;
            }

            _scrapeFromPage_Rent(ads, EnumTypeItems.Rent);
            _scrapeFromPage_Rent(ads, EnumTypeItems.RentTivuch);

            return ads;
        }
Esempio n. 15
0
        /// <summary>
        /// Loads the scraper configuration from the file named by state.ConfigFilename.
        /// When the file does not exist, a default configuration is created and saved there.
        /// </summary>
        /// <param name="state">Scraper state providing ConfigFilename.</param>
        private void _loadScraperConfig(ScraperHomeLessStateModel state)
        {
            var configPath = state.ConfigFilename;

            if (!File.Exists(configPath))
            {
                // First run: start from defaults and write them out.
                _config = new ScraperHomeLessConfigModel();
                _saveScraperConfig(_config, configPath);

                return;
            }

            var json = File.ReadAllText(configPath);
            _config = JsonConvert.DeserializeObject <ScraperHomeLessConfigModel>(json);
        }
Esempio n. 16
0
        /// <summary>
        /// Builds the list of unique items from all stored pages and saves it, together
        /// with the ids that occurred more than once. Generation runs only when no list
        /// exists yet, the existing list is empty, or state.IsNew is set.
        /// </summary>
        /// <param name="state">Scraper state handed to the load and save helpers.</param>
        private void ScrapePhase2_GenerateListItems(ScraperHomeLessStateModel state)
        {
            SetWorkPhaseBase("GenerateListItems", state);

            var existingItems = _loadListItems(state);
            var mustGenerate  = existingItems == null || existingItems.Count == 0 || state.IsNew;

            if (!mustGenerate)
            {
                _log($"Generate list-item skipped (isNew:{state.IsNew})");

                return;
            }

            _log($"Start generate list-items");

            var listPages    = _loadListPages(state);
            var uniqueItems  = new Dictionary <string, AdDtoModel>();
            var duplicateIds = new List <string>();

            foreach (var pageNumber in listPages.Keys.ToList())
            {
                var adsOnPage = _loadPage(pageNumber, state);

                if (adsOnPage == null)
                {
                    // No stored file for this page.
                    continue;
                }

                foreach (var ad in adsOnPage)
                {
                    var adId = ad.Id;

                    // The first occurrence wins; later ones are only recorded as duplicates.
                    if (uniqueItems.ContainsKey(adId))
                    {
                        duplicateIds.Add(adId);
                    }
                    else
                    {
                        uniqueItems.Add(adId, ad);
                    }
                }
            }

            _log($"Scraped uniq items:{uniqueItems.Count}, items-dublicates:{duplicateIds.Count}, total:{uniqueItems.Count + duplicateIds.Count}");

            _saveListItems(uniqueItems, state);
            _saveListItemsDublicates(duplicateIds, state);
        }
Esempio n. 17
0
        /// <summary>
        /// Marks list entries as downloaded when a file named after their id already
        /// exists in the items directory, and logs how many entries were changed.
        /// </summary>
        /// <param name="list">Items keyed by id; entries are updated in place.</param>
        /// <param name="state">Scraper state providing ItemsPath.</param>
        private void _fixListItems(Dictionary <string, AdDtoModel> list, ScraperHomeLessStateModel state)
        {
            var fixedCount = 0;

            foreach (var storedFile in new DirectoryInfo(state.ItemsPath).GetFiles())
            {
                var id = Path.GetFileNameWithoutExtension(storedFile.Name);

                // Skip files without a list entry and entries that are already flagged.
                if (!list.TryGetValue(id, out var ad) || ad.DownloadedItem)
                {
                    continue;
                }

                ad.DownloadedItem = true;
                fixedCount++;
            }

            _log($"Fixed {fixedCount} files");
        }
Esempio n. 18
0
        /// <summary>
        /// Downloads the details of every list item that is not yet flagged as downloaded,
        /// running at most <c>maxTasks</c> downloads at a time, and saves the list
        /// periodically and once more at the end.
        /// </summary>
        /// <param name="state">Scraper state handed to the load, download and save helpers.</param>
        private void ScrapePhase2_DownloadItems(ScraperHomeLessStateModel state)
        {
            SetWorkPhaseBase("DownloadItems", state);

            var listItems = _loadListItems(state);

            _fixListItems(listItems, state);

            Func <List <KeyValuePair <string, AdDtoModel> > > NeedToDo = () => listItems.Where(x => x.Value.DownloadedItem == false).ToList();
            var countDownloaded = 0;
            var maxTasks        = 5;
            var saveEvery       = 100;
            var nextSaveAt      = saveEvery;
            var tasks           = new List <Task <bool> >();
            var needToDoItems   = NeedToDo();

            do
            {
                var freeTasks = maxTasks - tasks.Count;
                foreach (var item in needToDoItems.Take(freeTasks))
                {
                    var id = item.Key;

                    // Flag the item up front so the next NeedToDo() call does not pick it again
                    // while its download is in flight; the task overwrites the flag with the real outcome.
                    listItems[id].DownloadedItem = true;
                    tasks.Add(Task.Run(async() => listItems[id].DownloadedItem = await _downloadItemAsync(item.Value, state)));
                }

                Task.WaitAny(tasks.ToArray());

                // Count exactly what is removed: a task finishing between a separate count
                // and the removal would otherwise be dropped without being counted.
                countDownloaded += tasks.RemoveAll(x => x.IsCompleted);

                // Threshold instead of "% 100 == 0": several tasks can finish at once and
                // make the counter jump over an exact multiple.
                if (countDownloaded >= nextSaveAt)
                {
                    _log($"CountDownloaded are {countDownloaded}");
                    _saveListItems(listItems, state);
                    nextSaveAt = ((countDownloaded / saveEvery) + 1) * saveEvery;
                }

                needToDoItems = NeedToDo();
            } while (needToDoItems.Count > 0);

            Task.WaitAll(tasks.ToArray());

            // Tasks still running when the loop ended have now finished as well.
            countDownloaded += tasks.Count;

            _log($"Scraped {countDownloaded} items");
            _saveListItems(listItems, state);

            _log($"Download items done");
        }
Esempio n. 19
0
        /// <summary>
        /// Opens page 1 through Selenoid and reads the amount of pages from the last
        /// <c>span</c> inside the element with class "pagingdisplay". Opening the page
        /// is attempted up to <c>tryCountMax</c> times with a pause between attempts.
        /// </summary>
        /// <param name="state">Scraper state forwarded to the Selenoid helpers.</param>
        /// <returns>The detected amount of pages, or 0 when page 1 could not be opened.</returns>
        private int ScrapePhase1_GetAmountPages_Selenoid(ScraperHomeLessStateModel state)
        {
            var result = 0;

            var tryCount    = 1;
            var tryCountMax = 10;
            var retryPause  = TimeSpan.FromSeconds(3);
            var doNeedTry   = false;

            _log($"Detecting amount pages");

            _initSelenoid(state);

            try
            {
                do
                {
                    doNeedTry = false;
                    var isDoneGetPage1 = _getPage_Selenoid(page: 1, state: state);

                    if (isDoneGetPage1)
                    {
                        var pageNavigate = _selenoidState.WindowMain.FindElementByClassName("pagingdisplay");
                        var lastPageText = pageNavigate.FindElements(By.TagName("span")).LastOrDefault().Text;

                        result = int.Parse(lastPageText);
                    }
                    else
                    {
                        tryCount++;
                        if (tryCount <= tryCountMax)
                        {
                            doNeedTry = true;

                            // The logged pause is derived from the actual delay so the two cannot drift apart.
                            _log($"Try count {tryCount}/{tryCountMax}. Pause {retryPause.TotalSeconds} sec.");
                            Thread.Sleep(retryPause);
                        }
                        else
                        {
                            _log($"Tred {tryCount} detect amount pages. Stop scrap process.");
                        }
                    }
                } while (doNeedTry);

                _log($"Detected was {result} pages");
            }
            finally
            {
                // Runs even when element lookup or parsing throws.
                _closeSelenoid();
            }

            return(result);
        }
Esempio n. 20
0
        /// <summary>
        /// Generates and saves the list of pages when none exists yet, the existing list
        /// is empty, or state.IsNew is set; otherwise only logs that nothing is needed.
        /// </summary>
        /// <param name="state">Scraper state handed to the load, detection and save helpers.</param>
        private void ScrapePhase1_GenerateListPages(ScraperHomeLessStateModel state)
        {
            SetWorkPhaseBase("GenerateListPages", state);

            var existingPages = _loadListPages(state);
            var mustGenerate  = existingPages == null || existingPages.Count == 0 || state.IsNew;

            if (!mustGenerate)
            {
                _log($"Generate list page not need (missing, isNew:{state.IsNew})");

                return;
            }

            // This method is synchronous, so the async helpers are waited on here.
            var amountPages    = ScrapePhase1_GetAmountPages_WebClientAsync(state).Result;
            var generatedPages = ScrapePhase1_GenerateListPages(amountPages);

            _saveListPagesAsync(generatedPages, state).Wait();
        }
Esempio n. 21
0
        /// <summary>
        /// Marks pages as done when a file for them already exists in the pages directory,
        /// and logs how many entries were changed. The page number is taken from the part
        /// of the file name after the first '-' (e.g. "page-12.json" gives 12).
        /// Files whose name does not carry a parsable number there are ignored.
        /// </summary>
        /// <param name="listPages">Page number to done-flag map; updated in place.</param>
        /// <param name="state">Scraper state providing PagesPath.</param>
        private void fixListPages(Dictionary <int, bool> listPages, ScraperHomeLessStateModel state)
        {
            var path        = $"{state.PagesPath}";
            var listFiles   = new DirectoryInfo(path).GetFiles();
            var amountFixed = 0;

            foreach (var file in listFiles)
            {
                var nameParts = Path.GetFileNameWithoutExtension(file.Name).Split('-');

                // Skip unrelated files instead of failing on them.
                if (nameParts.Length < 2 || !int.TryParse(nameParts[1], out var n))
                {
                    continue;
                }

                if (listPages.TryGetValue(n, out var isDone) && !isDone)
                {
                    listPages[n] = true;
                    amountFixed++;
                }
            }
            _log($"Fixed {amountFixed} files");
        }
Esempio n. 22
0
        /// <summary>
        /// Converts every stored item file into a domain model: the item DTO is loaded,
        /// its page data is filled in from the loaded pages, and the result is mapped
        /// to the domain type.
        /// </summary>
        /// <param name="state">Scraper state handed to the file and load helpers.</param>
        /// <returns>One domain item per item file, in file order.</returns>
        private async Task <List <AdItemHomeLessDomainModel> > ScrapePhase4Async(ScraperHomeLessStateModel state)
        {
            var domainItems = new List <AdItemHomeLessDomainModel>();
            var itemFiles   = GetListItemFiles(state);

            // Started once; the same task is awaited for every file.
            var pagesPending = _loadPagesAsync(state);

            foreach (var itemFile in itemFiles)
            {
                var dto       = await LoadItemDtoFromStoreAsync <DetailsItemDtoModel>(itemFile, state);
                var pagesById = await pagesPending;

                dto.RowDataFromPage = GetRowDataFromPage(pagesById, itemFile);

                domainItems.Add(new AdItemHomeLessDomainModel().FromDto(dto));
            }

            return domainItems;
        }
Esempio n. 23
0
        /// <summary>
        /// Program entry point: sets the thread culture to en-US, constructs a scraper
        /// with a state flagged as new, and runs the repository update.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Thread.CurrentThread.CurrentCulture = new CultureInfo("en-US");

            var state = new ScraperHomeLessStateModel();
            state.IsNew = true;

            // The instance is not used below, but it is still constructed as in the original flow.
            var scraper = new ScraperHomeLess(state);

            UpdateRepository();

            // Alternative entry actions, currently disabled:
            //Scrape(scraper);
            //GetExcelFile(scraper);
            //PrintSaveStatus(scraper);
        }
Esempio n. 24
0
        /// <summary>
        /// Downloads the HTML of the requested rent listing page as a string.
        /// </summary>
        /// <param name="page">Number of the page to download.</param>
        /// <param name="state">Not used by this method.</param>
        /// <returns>The page content, or an empty string when the request failed (the error is logged).</returns>
        private async Task <string> GetPage_WebClientAsync(int page, ScraperHomeLessStateModel state)
        {
            var result = "";

            try
            {
                // Use the requested page number; the URL used to be fixed to page 1.
                var url = $"https://www.homeless.co.il/rent/{page}";
                result = await url
                         .WithHeaders(new
                {
                    User_Agent = "Windows",
                })
                         .GetStringAsync();
            }
            catch (Exception exception)
            {
                _log($"Error x2. {exception.Message} / {exception.StackTrace}");
            }

            return(result);
        }
Esempio n. 25
0
        /// <summary>
        /// Reads every page file returned by the page-file helper and projects each stored
        /// ad to an <c>ItemTest</c> carrying its id and downloaded flag.
        /// </summary>
        /// <param name="state">Scraper state handed to the page-file helper.</param>
        /// <returns>All projected items, in file order; duplicates are kept.</returns>
        private List <ItemTest> _statusWorkspace_AmountItemsFromPages_GetItems(ScraperHomeLessStateModel state)
        {
            var pageFiles = _statusWorkspace_AmountPages_GetFilesBase(state);
            var collected = new List <ItemTest>();

            foreach (var pageFile in pageFiles)
            {
                var json = File.ReadAllText(pageFile.FullName);
                var ads  = JsonConvert.DeserializeObject <List <AdDtoModel> >(json);

                collected.AddRange(ads.Select(ad => new ItemTest { Id = ad.Id, Done = ad.DownloadedItem }));
            }

            return collected;
        }
Esempio n. 26
0
        /// <summary>
        /// Reads every file in the pages directory and indexes the ads they contain by id.
        /// When an id occurs more than once, the first occurrence is kept.
        /// </summary>
        /// <param name="state">Scraper state providing PagesPath.</param>
        /// <returns>Ads keyed by their id.</returns>
        private async Task <Dictionary <string, AdDtoModel> > _loadPagesAsync(ScraperHomeLessStateModel state)
        {
            var adsById   = new Dictionary <string, AdDtoModel>();
            var pagesDir  = new DirectoryInfo($"{state.PagesPath}");
            var pageFiles = pagesDir.GetFiles();

            foreach (var pageFile in pageFiles)
            {
                var json = await File.ReadAllTextAsync(pageFile.FullName);
                var ads  = JsonConvert.DeserializeObject <List <AdDtoModel> >(json);

                foreach (var ad in ads)
                {
                    // TryAdd leaves an existing entry untouched, so the first occurrence wins.
                    adsById.TryAdd(ad.Id, ad);
                }
            }

            return adsById;
        }
Esempio n. 27
0
        /// <summary>
        /// Navigates the Selenoid browser to the given rent page and waits until the element
        /// with class "pagingdisplay" is visible. On failure Selenoid is re-initialised and
        /// the request repeated, for at most <c>countMax</c> attempts in total.
        /// </summary>
        /// <param name="page">Page number to open.</param>
        /// <param name="state">Scraper state forwarded to the Selenoid initialisation.</param>
        /// <returns><c>true</c> when the page was opened; <c>false</c> when all attempts failed.</returns>
        private bool _getPage_Selenoid(int page, ScraperHomeLessStateModel state)
        {
            var result = false;
            var doNeedRepeatRequest = false;
            var count    = 0;
            var countMax = 10;

            do
            {
                doNeedRepeatRequest = false;
                var url = $"https://homeless.co.il/rent/{page}";

                try
                {
                    _selenoidState.WindowMain.Navigate().GoToUrl(url);

                    _selenoidState.WaitMain.Until(ExpectedConditions.ElementIsVisible(By.ClassName("pagingdisplay")));
                    result = true;
                }
                catch (Exception exception)
                {
                    count++;

                    // Record the cause; without it the retry messages below give no hint why the attempt failed.
                    _log($"Error-s1. {exception.Message}");

                    if (count < countMax)
                    {
                        _log($"!!! Need Reinit Selenoid (try {count}) !!!");
                        _initSelenoid(state);
                        doNeedRepeatRequest = true;
                    }
                    else
                    {
                        _log($"Try is out. Error Selenoid");
                    }
                }
            } while (doNeedRepeatRequest);

            return(result);
        }
Esempio n. 28
0
        /// <summary>
        /// Downloads one page, parses its ads and saves them when at least one ad was found.
        /// </summary>
        /// <param name="page">Page number to download.</param>
        /// <param name="state">Scraper state handed to the download, parse and save helpers.</param>
        /// <returns>
        /// <c>true</c> when ads were found and saved; <c>false</c> when the page had no ads
        /// or any step threw (the error is logged).
        /// </returns>
        private async Task <bool> GetPageAsync(int page, ScraperHomeLessStateModel state)
        {
            try
            {
                var document = await GetPage_ScrappyAsync(page, state);
                var ads      = ParseAdsFromPage(document, state);

                if (ads.Count <= 0)
                {
                    return false;
                }

                await _savePageAsync(page, ads, state);

                return true;
            }
            catch (Exception exception)
            {
                _log($"Error T1. {exception.Message} / {exception.StackTrace}");

                return false;
            }
        }
Esempio n. 29
0
        /// <summary>
        /// Loads the HTML document of the given rent page, retrying after a one-second
        /// pause whenever the load throws.
        /// </summary>
        /// <param name="pageNumber">Page number to load.</param>
        /// <param name="state">Not used by this method.</param>
        /// <returns>The loaded document.</returns>
        /// <remarks>The retry loop has no upper bound: it repeats until a load succeeds.</remarks>
        private async Task <HtmlDocument> GetPage_ScrappyAsync(int pageNumber, ScraperHomeLessStateModel state)
        {
            HtmlDocument result     = null;
            var          needReplay = false;
            var          web        = new HtmlWeb();

            do
            {
                needReplay = false;
                try
                {
                    result = await web.LoadFromWebAsync($"https://www.homeless.co.il/rent/{pageNumber}");
                }
                catch (Exception exception)
                {
                    _log($"Error-g1. Wait 1 sec. {exception.Message}");
                    needReplay = true;

                    // Asynchronous pause: does not block a thread while waiting.
                    await Task.Delay(1000 * 1);
                }
            } while (needReplay);

            return(result);
        }
Esempio n. 30
0
        /// <summary>
        /// Downloads, through Selenoid, every page still marked as not done in the list of
        /// pages. Each page with at least one ad is saved and flagged as done; the list is
        /// saved periodically and once more at the end.
        /// </summary>
        /// <param name="state">Scraper state handed to the load, Selenoid and save helpers.</param>
        private void ScrapePhase1_DownloadPages_Selenoid(ScraperHomeLessStateModel state)
        {
            SetWorkPhaseBase("DownloadPages", state);

            var listPages = _loadListPages(state);

            fixListPages(listPages, state);

            var needToDo                     = listPages.Where(x => x.Value == false).Select(x => x.Key).ToList();
            var needSaveListPages            = false;
            var countPages                   = 0;
            var countPagesPerSave            = 10;
            List <AdDtoModel> listAdFromPage = null;
            var needRepeate                  = false;
            var countTryRepeate              = 0;
            var countMaxTryRepeate           = 10;

            if (needToDo.Count > 0)
            {
                _initSelenoid(state);

                try
                {
                    foreach (var page in needToDo)
                    {
                        do
                        {
                            needRepeate = false;
                            try
                            {
                                listAdFromPage = _parsePage(page, state);
                            }
                            catch
                            {
                                // The retry counter is shared by all pages of this run.
                                countTryRepeate++;

                                // Check the budget first, so an exhausted budget is reported
                                // before another reinit and 10-minute wait.
                                if (countTryRepeate > countMaxTryRepeate)
                                {
                                    _log($"SelenoidService not inited.");
                                    _abortApplication();
                                }

                                _log($"\t\t!!! Need reinit SelenoidService !!!");
                                _initSelenoid(state);
                                Thread.Sleep(TimeSpan.FromMinutes(10));
                                needRepeate = true;
                            }
                        } while (needRepeate);

                        if (listAdFromPage.Count > 0)
                        {
                            _savePage(page, listAdFromPage, state);
                            listPages[page] = true;

                            // The flag is raised below once more than countPagesPerSave pages
                            // were saved; the list is then written on the next saved page.
                            if (needSaveListPages)
                            {
                                _saveListPagesAsync(listPages, state).Wait();
                                needSaveListPages = false;
                            }

                            countPages++;

                            if (countPages > countPagesPerSave)
                            {
                                countPages        = 0;
                                needSaveListPages = true;
                            }
                        }
                    }

                    _saveListPagesAsync(listPages, state).Wait();
                }
                finally
                {
                    // Runs even when saving a page or the list throws.
                    _closeSelenoid();
                }
            }

            _log($"Phase-2 done");
        }