コード例 #1
0
        /// <summary>
        /// Logs in to airdna.co, stores the received access token in <c>_token</c>, queries the
        /// market search API for the term "Israel" and writes the response, serialized as
        /// indented JSON, to <c>{state.RootPath}/cities.json</c>.
        /// </summary>
        /// <param name="state">Scraper state; only <c>RootPath</c> is read here.</param>
        /// <returns>
        /// Always an empty list: the search result is persisted to disk, not returned.
        /// </returns>
        private List<string> ScrapePhase1_GenerateListCities_WebClient(ScraperAirdnaStateModel state)
        {
            var list = new List<string>();

            var loginUrl    = "https://www.airdna.co/api/v1/account/login";
            var loginResult = loginUrl
                              .PostUrlEncodedAsync(new { username = "******", password = "******", remember_me = "true" })
                              .ReceiveJson<ResponseLogin>()
                              .Result;

            // Without a successful login there is no usable token, so the search request
            // below cannot succeed; stop here instead of calling the API with a bad token.
            if (loginResult == null || !string.Equals(loginResult.Status, "success", StringComparison.OrdinalIgnoreCase))
            {
                Console.WriteLine($"Error authorizations on airdna.co site.");
                return list;
            }

            var token = loginResult.Token;

            // Kept in a field because other members build their request URLs from it.
            _token = token;

            var searchUrl = $"https://api.airdna.co/v1/market/search?access_token={token}&term=Israel";
            var data      = searchUrl.GetAsync().ReceiveJson().Result;

            var filename = $"{state.RootPath}/cities.json";

            File.WriteAllText(filename, JsonConvert.SerializeObject(data, Newtonsoft.Json.Formatting.Indented));

            return list;
        }
コード例 #2
0
        /// <summary>
        /// Runs the Airdna pipeline: optionally scrapes, turns the scraper's domain model into
        /// an Excel document, saves it to a file and passes that file to the archive repository.
        /// </summary>
        /// <param name="isNew">Copied to <c>IsNew</c> on the state object created for this run.</param>
        /// <param name="needScrape">When <c>false</c>, the call to <c>Scrape()</c> is skipped.</param>
        private void _airdnaScrapeThenSaveStore(bool isNew = false, bool needScrape = true)
        {
            _log($"Start AirdnaScrapeThenSaveStore (isNew={isNew})");

            var runState = new ScraperAirdnaStateModel { IsNew = isNew };
            var airdna   = new ScraperAirdna(runState);

            if (needScrape)
            {
                airdna.Scrape();
            }

            var domainData = airdna.GetDomainModel();

            // Build the workbook and write it out; the resulting path is what gets archived.
            var exporter  = new ExcelAirdnaService(runState);
            var savedPath = exporter.SaveToFile(exporter.CreateExcel(domainData));

            new ArchiveRepository().Save(savedPath, runState.TypeScraper);

            _log($"End AirdnaScrapeThenSaveStore (isNew={isNew}), Spent time {_calcSpentTime2String(runState)}");
        }
コード例 #3
0
        /// <summary>
        /// Starts the four detail lookups for one ad, awaits them, bundles the results into a
        /// <c>DetailsItemDtoModel</c> and saves it under the ad's id.
        /// </summary>
        /// <param name="item">The ad to fetch details for; its <c>Id</c> is used as the save key.</param>
        /// <param name="state">Scraper state forwarded to the coordinate lookup and to the save call.</param>
        /// <returns>
        /// <c>true</c> when everything completed; <c>false</c> when any exception was raised
        /// inside the fetch/save section (the exception is logged, not rethrown).
        /// </returns>
        private async Task<bool> _downloadItemAsync(AdDtoModel item, ScraperAirdnaStateModel state)
        {
            // Read outside the try block on purpose: a failure here is not swallowed.
            var itemId = item.Id;

            try
            {
                // All four lookups are started before any of them is awaited.
                var adDetailsTask   = _getAdDetailsFromService(item);
                var coordinatesTask = _getCoordinatesFromServiceAsync(item, state);
                var phonesTask      = _getPhonesFromService(item);
                var detailsTask     = _getDetailsFromService(item);

                var bundle = new DetailsItemDtoModel();
                bundle.Coordinates     = await coordinatesTask;
                bundle.Phones          = await phonesTask;
                bundle.Details         = await detailsTask;
                bundle.AdDetails       = await adDetailsTask;
                bundle.RowDataFromPage = item;

                await _saveItemDetailsAsync(itemId, bundle, state);

                return true;
            }
            catch (Exception exception)
            {
                _log($"Error-f1. {exception.Message} / {exception.StackTrace}");

                return false;
            }
        }
コード例 #4
0
        /// <summary>
        /// Downloads every page that the stored page list still marks as not done.
        /// </summary>
        /// <remarks>
        /// The page list is loaded with <c>_loadListCities</c>, reconciled with the files
        /// already on disk via <c>fixListPages</c>, and each remaining page is fetched with
        /// <c>GetPageAsync</c> on its own task. The downloads are awaited asynchronously, and
        /// the per-page results are written back to the dictionary only after all tasks have
        /// finished, so the dictionary is never mutated from several threads at once.
        /// </remarks>
        /// <param name="state">Scraper state forwarded to the helpers.</param>
        private async Task ScrapePhase1_DownloadPages_Scrapy(ScraperAirdnaStateModel state)
        {
            SetWorkPhaseBase("DownloadPages", state);

            var listPages = _loadListCities(state);

            // _loadListCities returns null when the list file does not exist.
            if (listPages == null)
            {
                _log($"List of pages not found, nothing to download");
                return;
            }

            fixListPages(listPages, state);

            var needToDo = listPages.Where(x => !x.Value).Select(x => x.Key).ToList();

            var downloads = needToDo
                            .Select(page => Task.Run(async () => new KeyValuePair<int, bool>(page, await GetPageAsync(page, state))))
                            .ToList();

            var outcomes = await Task.WhenAll(downloads);

            foreach (var outcome in outcomes)
            {
                listPages[outcome.Key] = outcome.Value;
            }

            //await _saveListCitiesAsync(listPages, state);

            _log($"Phase done");
        }
コード例 #5
0
        /// <summary>
        /// Serializes the ads of one page as indented JSON and writes them to
        /// <c>{state.PagesPath}/page-{page}.json</c>, then logs the file name.
        /// </summary>
        /// <param name="page">Page number used in the file name.</param>
        /// <param name="listAdFromPage">Ads to serialize.</param>
        /// <param name="state">Scraper state providing <c>PagesPath</c>.</param>
        private void _savePage(int page, List<AdDtoModel> listAdFromPage, ScraperAirdnaStateModel state)
        {
            var targetPath = $"{state.PagesPath}/page-{page}.json";
            var json       = JsonConvert.SerializeObject(listAdFromPage, Newtonsoft.Json.Formatting.Indented);

            File.WriteAllText(targetPath, json);

            _log($"Save file {targetPath}");
        }
コード例 #6
0
        /// <summary>
        /// Returns how many items <c>_statusWorkspace_AmountItemsFromPages_GetItems</c> yields
        /// for the given state. Duplicate ids are not collapsed.
        /// </summary>
        /// <param name="state">Scraper state forwarded to the item collector.</param>
        /// <returns>The number of collected items.</returns>
        private int _statusWorkspace_AmountItemsFromPages(ScraperAirdnaStateModel state)
        {
            var collected = _statusWorkspace_AmountItemsFromPages_GetItems(state);

            return collected.Count;
        }
コード例 #7
0
        /// <summary>
        /// Writes a raw response to <c>{state.ItemsPath}/{id}.xml</c> and logs the file name.
        /// </summary>
        /// <param name="id">Item id used as the file name.</param>
        /// <param name="response">Text written to the file as-is.</param>
        /// <param name="state">Scraper state providing <c>ItemsPath</c>.</param>
        private async Task _saveItemAsync(string id, string response, ScraperAirdnaStateModel state)
        {
            var targetPath = $"{state.ItemsPath}/{id}.xml";

            await File.WriteAllTextAsync(targetPath, response);
            _log($"Save xml-data file {targetPath}");
        }
コード例 #8
0
        /// <summary>
        /// Serializes the list-items dictionary as indented JSON and writes it to
        /// <c>state.ListItemsFilename</c>, logging before and after the write.
        /// </summary>
        /// <param name="list">Items keyed by id.</param>
        /// <param name="state">Scraper state providing <c>ListItemsFilename</c>.</param>
        private void _saveListItems(Dictionary<string, AdDtoModel> list, ScraperAirdnaStateModel state)
        {
            var target = state.ListItemsFilename;

            _log($"Saving list-items: {target}");

            var json = JsonConvert.SerializeObject(list, Newtonsoft.Json.Formatting.Indented);
            File.WriteAllText($"{target}", json);

            _log($"Save list-items done");
        }
コード例 #9
0
        /// <summary>
        /// Fetches page 1 with <c>GetPage_WebClientAsync</c> and returns what
        /// <c>ScrapeAmountPages</c> extracts from it.
        /// </summary>
        /// <param name="state">Scraper state forwarded to the page fetch.</param>
        /// <returns>The page count reported by <c>ScrapeAmountPages</c>.</returns>
        private async Task<int> ScrapePhase1_GetAmountPages_WebClientAsync(ScraperAirdnaStateModel state)
        {
            var firstPage = await GetPage_WebClientAsync(1, state);

            return ScrapeAmountPages(firstPage);
        }
コード例 #10
0
        /// <summary>
        /// Parses the page twice with <c>ParseAdsFromPage_Rent</c>, once for
        /// <c>EnumTypeItems.Rent</c> and once for <c>EnumTypeItems.RentTivuch</c>, and returns
        /// both result sets in a single list (Rent entries first).
        /// </summary>
        /// <param name="html">The loaded page document.</param>
        /// <param name="state">Scraper state; not used by this method.</param>
        /// <returns>A new list holding the ads of both categories.</returns>
        private List<AdDtoModel> ParseAdsFromPage(HtmlDocument html, ScraperAirdnaStateModel state)
        {
            var rentAds       = ParseAdsFromPage_Rent(html, EnumTypeItems.Rent);
            var rentTivuchAds = ParseAdsFromPage_Rent(html, EnumTypeItems.RentTivuch);

            var combined = new List<AdDtoModel>();
            combined.AddRange(rentAds);
            combined.AddRange(rentTivuchAds);

            return combined;
        }
コード例 #11
0
        /// <summary>
        /// Reads the dictionary stored as JSON in <c>state.ListPagesFilename</c>.
        /// </summary>
        /// <param name="state">Scraper state providing <c>ListPagesFilename</c>.</param>
        /// <returns>The deserialized dictionary, or <c>null</c> when the file does not exist.</returns>
        private Dictionary<int, bool> _loadListCities(ScraperAirdnaStateModel state)
        {
            var path = state.ListPagesFilename;

            if (!File.Exists(path))
            {
                return null;
            }

            var json = File.ReadAllText(path);

            return JsonConvert.DeserializeObject<Dictionary<int, bool>>(json);
        }
コード例 #12
0
        /// <summary>
        /// Reads the ads stored as JSON in <c>{state.PagesPath}/page-{page}.json</c>.
        /// </summary>
        /// <param name="page">Page number used in the file name.</param>
        /// <param name="state">Scraper state providing <c>PagesPath</c>.</param>
        /// <returns>The deserialized list, or <c>null</c> when the file does not exist.</returns>
        private List<AdDtoModel> _loadPage(int page, ScraperAirdnaStateModel state)
        {
            var path = $"{state.PagesPath}/page-{page}.json";

            if (!File.Exists(path))
            {
                return null;
            }

            var json = File.ReadAllText(path);

            return JsonConvert.DeserializeObject<List<AdDtoModel>>(json);
        }
コード例 #13
0
        /// <summary>
        /// Reads the list-items dictionary stored as JSON in <c>state.ListItemsFilename</c>.
        /// </summary>
        /// <param name="state">Scraper state providing <c>ListItemsFilename</c>.</param>
        /// <returns>The deserialized dictionary, or <c>null</c> when the file does not exist.</returns>
        private Dictionary<string, AdDtoModel> _loadListItems(ScraperAirdnaStateModel state)
        {
            var path = state.ListItemsFilename;

            if (!File.Exists(path))
            {
                return null;
            }

            var json = File.ReadAllText(path);

            return JsonConvert.DeserializeObject<Dictionary<string, AdDtoModel>>(json);
        }
コード例 #14
0
        /// <summary>
        /// Creates the scraper around the given state (or a fresh default state when none is
        /// supplied), loads the scraper configuration and runs <c>_checkDirectory</c> on the
        /// state's root path.
        /// </summary>
        /// <param name="state">Optional state; a new <c>ScraperAirdnaStateModel</c> is used when <c>null</c>.</param>
        public ScraperAirdna(ScraperAirdnaStateModel state = null)
        {
            _state = state ?? new ScraperAirdnaStateModel();

            _loadScraperConfig((ScraperAirdnaStateModel)_state);
            _checkDirectory(_state.RootPath);
        }
コード例 #15
0
        /// <summary>
        /// Populates <c>_config</c> from the JSON file at <c>state.ConfigFilename</c>. When the
        /// file is missing, a default configuration is created and passed to
        /// <c>_saveScraperConfig</c> together with that file name.
        /// </summary>
        /// <param name="state">Scraper state providing <c>ConfigFilename</c>.</param>
        private void _loadScraperConfig(ScraperAirdnaStateModel state)
        {
            var path = state.ConfigFilename;

            if (!File.Exists(path))
            {
                _config = new ScraperAirdnaConfigModel();
                _saveScraperConfig(_config, path);
                return;
            }

            var json = File.ReadAllText(path);
            _config = JsonConvert.DeserializeObject<ScraperAirdnaConfigModel>(json);
        }
コード例 #16
0
        /// <summary>
        /// Opens the given page through <c>_getPage_Selenoid</c> and, when that succeeds, hands
        /// the result list to <c>_scrapeFromPage_Rent</c> for <c>EnumTypeItems.Rent</c> and
        /// then for <c>EnumTypeItems.RentTivuch</c>.
        /// </summary>
        /// <param name="page">Page number to open.</param>
        /// <param name="state">Scraper state forwarded to the page loader.</param>
        /// <returns>
        /// The list passed to the scrape helpers; it is returned untouched (empty) when the
        /// page could not be opened.
        /// </returns>
        private List<AdDtoModel> _parsePage(int page, ScraperAirdnaStateModel state)
        {
            var ads = new List<AdDtoModel>();

            if (!_getPage_Selenoid(page: page, state: state))
            {
                return ads;
            }

            _scrapeFromPage_Rent(ads, EnumTypeItems.Rent);
            _scrapeFromPage_Rent(ads, EnumTypeItems.RentTivuch);

            return ads;
        }
コード例 #17
0
        /// <summary>
        /// Builds the list-items file from the saved page files.
        /// </summary>
        /// <remarks>
        /// Generation runs when no list-items exist yet, the stored list is empty, or
        /// <c>state.IsNew</c> is set; otherwise it is skipped. Every page listed by
        /// <c>_loadListCities</c> is loaded with <c>_loadPage</c>; the first occurrence of an
        /// id is kept and any later occurrence is recorded as a duplicate. Both collections
        /// are then saved.
        /// </remarks>
        /// <param name="state">Scraper state forwarded to the load/save helpers.</param>
        private void ScrapePhase2_GenerateListItems(ScraperAirdnaStateModel state)
        {
            SetWorkPhaseBase("GenerateListItems", state);

            var listItems = _loadListItems(state);

            if (listItems != null && listItems.Count > 0 && !state.IsNew)
            {
                _log($"Generate list-item skipped (isNew:{state.IsNew})");
                return;
            }

            _log($"Start generate list-items");

            var listPages = _loadListCities(state);

            // _loadListCities returns null when the list file does not exist; bail out
            // instead of failing with a null reference (and leave existing files untouched).
            if (listPages == null)
            {
                _log($"List of pages not found, generate list-items skipped");
                return;
            }

            listItems = new Dictionary<string, AdDtoModel>();
            var listItemDublicate = new List<string>();

            foreach (var page in listPages.Keys.ToList())
            {
                var itemsOnPage = _loadPage(page, state);

                // A page without a saved file yields null and is simply skipped.
                if (itemsOnPage == null)
                {
                    continue;
                }

                foreach (var item in itemsOnPage)
                {
                    var key = item.Id;

                    if (listItems.ContainsKey(key))
                    {
                        listItemDublicate.Add(key);
                    }
                    else
                    {
                        listItems.Add(key, item);
                    }
                }
            }

            _log($"Scraped uniq items:{listItems.Count}, items-dublicates:{listItemDublicate.Count}, total:{listItems.Count+ listItemDublicate.Count}");

            _saveListItems(listItems, state);
            _saveListItemsDublicates(listItemDublicate, state);
        }
コード例 #18
0
        /// <summary>
        /// Creates the Selenoid state on first use (JavaScript and pictures enabled) and then
        /// delegates to <c>_initSelenoidBase</c>.
        /// </summary>
        /// <param name="state">Scraper state forwarded to <c>_initSelenoidBase</c>.</param>
        private void _initSelenoid(ScraperAirdnaStateModel state)
        {
            if (_selenoidState is null)
            {
                _selenoidState = new SelenoidStateModel()
                {
                    JavaScriptEnable = true,
                    ShowPictures     = true,
                };
            }

            _initSelenoidBase(_selenoidState, state);
        }

        #endregion
    }
コード例 #19
0
        /// <summary>
        /// Opens page 1 through Selenoid and reads the total number of pages from the text of
        /// the last <c>span</c> inside the element with class <c>pagingdisplay</c>.
        /// </summary>
        /// <remarks>
        /// Opening the page is attempted up to <c>tryCountMax</c> times with a short pause
        /// between attempts. The Selenoid session is closed even when an exception escapes.
        /// </remarks>
        /// <param name="state">Scraper state forwarded to the Selenoid helpers.</param>
        /// <returns>
        /// The detected page count, or 0 when the page could not be opened within the allowed
        /// attempts or the paging text could not be read as a number.
        /// </returns>
        private int ScrapePhase1_GetAmountPages_Selenoid(ScraperAirdnaStateModel state)
        {
            var result = 0;

            var tryCount    = 1;
            var tryCountMax = 10;
            var pause       = TimeSpan.FromSeconds(3);
            var doNeedTry   = false;

            _log($"Detecting amount pages");

            _initSelenoid(state);

            try
            {
                do
                {
                    doNeedTry = false;
                    var isDoneGetPage1 = _getPage_Selenoid(page: 1, state: state);

                    if (isDoneGetPage1)
                    {
                        var pageNavigate = _selenoidState.WindowMain.FindElementByClassName("pagingdisplay");
                        var lastPageText = pageNavigate.FindElements(By.TagName("span")).LastOrDefault()?.Text;

                        // No span, or non-numeric text: report 0 pages instead of crashing.
                        if (!int.TryParse(lastPageText, out result))
                        {
                            result = 0;
                            _log($"Can not parse amount pages from '{lastPageText}'");
                        }
                    }
                    else
                    {
                        tryCount++;
                        if (tryCount <= tryCountMax)
                        {
                            doNeedTry = true;
                            // The logged pause is derived from the real delay so the two cannot drift apart.
                            _log($"Try count {tryCount}/{tryCountMax}. Pause {pause.TotalSeconds} sec.");
                            Thread.Sleep(pause);
                        }
                        else
                        {
                            _log($"Tred {tryCount} detect amount pages. Stop scrap process.");
                        }
                    }
                } while (doNeedTry);

                _log($"Detected was {result} pages");
            }
            finally
            {
                _closeSelenoid();
            }

            return result;
        }
コード例 #20
0
        /// <summary>
        /// Marks list entries as downloaded when a file named after their id already exists in
        /// <c>state.ItemsPath</c>, and logs how many entries were switched.
        /// </summary>
        /// <param name="list">Items keyed by id; matching entries get <c>DownloadedItem = true</c>.</param>
        /// <param name="state">Scraper state providing <c>ItemsPath</c>.</param>
        private void _fixListItems(Dictionary<string, AdDtoModel> list, ScraperAirdnaStateModel state)
        {
            var corrected = 0;

            foreach (var file in new DirectoryInfo(state.ItemsPath).GetFiles())
            {
                // The file name without its extension is the item id.
                var itemId = Path.GetFileNameWithoutExtension(file.Name);

                if (!list.TryGetValue(itemId, out var entry) || entry.DownloadedItem)
                {
                    continue;
                }

                entry.DownloadedItem = true;
                corrected++;
            }

            _log($"Fixed {corrected} files");
        }
コード例 #21
0
        /// <summary>
        /// Downloads the details of every list item that is not yet marked as downloaded,
        /// keeping at most <c>maxTasks</c> downloads in flight.
        /// </summary>
        /// <remarks>
        /// An item is flagged as downloaded when it is scheduled, so it is not picked up twice
        /// while in flight; its task then overwrites the flag with the real outcome of
        /// <c>_downloadItemAsync</c>. The list is saved each time the completed count passes
        /// another multiple of 100, and once more at the end.
        /// NOTE(review): an item whose download keeps returning <c>false</c> is re-queued on
        /// every pass, so a permanently failing item keeps this loop running. Confirm that
        /// unlimited retry is intended.
        /// </remarks>
        /// <param name="state">Scraper state forwarded to the load/save/download helpers.</param>
        private void ScrapePhase2_DownloadItems(ScraperAirdnaStateModel state)
        {
            SetWorkPhaseBase("DownloadItems", state);

            var listItems = _loadListItems(state);

            // _loadListItems returns null when the list file does not exist.
            if (listItems == null)
            {
                _log($"List-items not found, nothing to download");
                return;
            }

            _fixListItems(listItems, state);

            Func<List<KeyValuePair<string, AdDtoModel>>> NeedToDo = () => listItems.Where(x => x.Value.DownloadedItem == false).ToList();
            const int saveEvery = 100;
            var countDownloaded = 0;
            var nextSaveAt      = saveEvery;
            var maxTasks        = 5;
            var tasks           = new List<Task<bool>>();
            var needToDoItems   = NeedToDo();

            do
            {
                var freeTasks = maxTasks - tasks.Count;
                foreach (var item in needToDoItems.Take(freeTasks))
                {
                    var id = item.Key;
                    listItems[id].DownloadedItem = true;
                    tasks.Add(Task.Run(async() => listItems[id].DownloadedItem = await _downloadItemAsync(item.Value, state)));
                }

                Task.WaitAny(tasks.ToArray());

                // RemoveAll reports how many tasks it removed, so counting and removing are a
                // single step and a task finishing in between cannot be dropped uncounted.
                countDownloaded += tasks.RemoveAll(x => x.IsCompleted);

                // Several tasks may finish in one pass, so test for crossing the checkpoint
                // rather than for landing exactly on a multiple of saveEvery.
                if (countDownloaded >= nextSaveAt)
                {
                    _log($"CountDownloaded are {countDownloaded}");
                    _saveListItems(listItems, state);
                    nextSaveAt = (countDownloaded / saveEvery + 1) * saveEvery;
                }

                needToDoItems = NeedToDo();
            } while (needToDoItems.Count > 0);

            // Downloads still in flight when the queue ran dry are part of the total too.
            Task.WaitAll(tasks.ToArray());
            countDownloaded += tasks.Count;

            _log($"Scraped {countDownloaded} items");
            _saveListItems(listItems, state);

            _log($"Download items done");
        }
コード例 #22
0
        /// <summary>
        /// Selenoid-based variant of the city-list generation. It currently only initializes
        /// Selenoid, calls <c>LogonToSite</c> and logs completion; no cities are collected.
        /// </summary>
        /// <param name="state">Scraper state forwarded to <c>_initSelenoid</c>.</param>
        /// <returns>Always an empty list.</returns>
        private List<string> ScrapePhase1_GenerateListCities_Selenoid(ScraperAirdnaStateModel state)
        {
            _initSelenoid(state);
            LogonToSite();

            // Navigation to https://airdna.co and the actual extraction are not implemented yet.

            _log($"Generate new list of pages done");

            return new List<string>();
        }
コード例 #23
0
        /// <summary>
        /// Phase 1 entry point for the city list: when <c>state.IsNew</c> is set, the list is
        /// regenerated through the web-client implementation; otherwise only a log line is
        /// written.
        /// </summary>
        /// <param name="state">Scraper state; <c>IsNew</c> decides whether generation runs.</param>
        private void ScrapePhase1_GenerateListCities(ScraperAirdnaStateModel state)
        {
            SetWorkPhaseBase("GenerateListcities", state);

            // The stored list is still loaded, but its content no longer influences the
            // decision below (the null/empty checks are disabled).
            _loadListCities(state);

            if (!state.IsNew)
            {
                _log($"Generate list city not need (missing, isNew:{state.IsNew})");
                return;
            }

            // The returned list is not used; the web-client variant writes its result to disk.
            ScrapePhase1_GenerateListCities_WebClient(state);
        }
コード例 #24
0
        /// <summary>
        /// Marks pages as done when a page file for them already exists in
        /// <c>state.PagesPath</c>, and logs how many entries were switched.
        /// </summary>
        /// <remarks>
        /// The page number is taken from the part after the first <c>-</c> in the file name
        /// (without extension), matching the <c>page-{n}</c> naming used when pages are saved.
        /// Files that do not follow this pattern are ignored rather than aborting the scan.
        /// </remarks>
        /// <param name="listPages">Page numbers mapped to their done flag; updated in place.</param>
        /// <param name="state">Scraper state providing <c>PagesPath</c>.</param>
        private void fixListPages(Dictionary<int, bool> listPages, ScraperAirdnaStateModel state)
        {
            var path        = $"{state.PagesPath}";
            var listFiles   = new DirectoryInfo(path).GetFiles();
            var amountFixed = 0;

            foreach (var file in listFiles)
            {
                var filename = Path.GetFileNameWithoutExtension(file.Name);
                var parts    = filename.Split('-');

                // Skip anything that is not "<prefix>-<number>" (stray or temporary files).
                if (parts.Length < 2 || !int.TryParse(parts[1], out var n))
                {
                    continue;
                }

                if (listPages.TryGetValue(n, out var done) && !done)
                {
                    listPages[n] = true;
                    amountFixed++;
                }
            }

            _log($"Fixed {amountFixed} files");
        }
コード例 #25
0
        /// <summary>
        /// Loads every file in <c>state.ItemsPath</c> as an <c>AirdnaScrapeDataModel</c> and
        /// converts each of its properties to a domain item, setting the item's
        /// <c>Location</c> to the city name found in the file's area info.
        /// </summary>
        /// <param name="state">Scraper state providing <c>ItemsPath</c>; also forwarded to the loader.</param>
        /// <returns>All converted domain items, in file order.</returns>
        private async Task<List<AdItemAirdnaDomainModel>> ScrapeAllItems(ScraperAirdnaStateModel state)
        {
            var domainItems = new List<AdItemAirdnaDomainModel>();
            var itemFiles   = new DirectoryInfo($"{state.ItemsPath}").GetFiles();

            for (var index = 0; index < itemFiles.Length; index++)
            {
                var scraped = await LoadItemDtoFromStoreAsync<AirdnaScrapeDataModel>(itemFiles[index], state);

                foreach (var property in scraped.Properties)
                {
                    var converted = new AdItemAirdnaDomainModel().FromDto(property);

                    // Every property of one file shares that file's city.
                    converted.Location = scraped.AreaInfo.Geom.Name.City;

                    domainItems.Add(converted);
                }
            }

            return domainItems;
        }
コード例 #26
0
        /// <summary>
        /// When <c>state.IsNew</c> is set, reads <c>{state.RootPath}/cities.json</c> and, for
        /// each city in it, requests the property list from the airdna API and stores the
        /// response as indented JSON in <c>{state.ItemsPath}/{cityId}.json</c>.
        /// </summary>
        /// <remarks>The request URL carries the access token currently held in <c>_token</c>.</remarks>
        /// <param name="state">Scraper state providing <c>IsNew</c>, <c>RootPath</c> and <c>ItemsPath</c>.</param>
        private void ScrapePhase2_DownloadCities(ScraperAirdnaStateModel state)
        {
            if (!state.IsNew)
            {
                return;
            }

            var citiesFile = $"{state.RootPath}/cities.json";
            var cityIndex  = JsonConvert.DeserializeObject<ItemDto>(File.ReadAllText(citiesFile));

            foreach (var entry in cityIndex.items)
            {
                var requestUrl = $"https://api.airdna.co/v1/market/property_list?access_token={_token}&city_id={entry.city.id}&currency=native";
                var payload    = requestUrl.GetJsonAsync<AirdnaScrapeDataModel>().Result;

                var targetFile = $"{state.ItemsPath}/{entry.city.id}.json";
                var json       = JsonConvert.SerializeObject(payload, Newtonsoft.Json.Formatting.Indented);

                File.WriteAllText(targetFile, json);
            }
        }
コード例 #27
0
ファイル: Program.cs プロジェクト: Ori021586060/Scrapers
        /// <summary>
        /// Program entry point: sets the thread culture to en-US, enables NetTopologySuite
        /// type mapping for Npgsql, constructs the Airdna scraper with a fresh
        /// (<c>IsNew = true</c>) state and runs <c>UpdateRepository</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var culture = new CultureInfo("en-US");
            Thread.CurrentThread.CurrentCulture = culture;

            NpgsqlConnection.GlobalTypeMapper.UseNetTopologySuite();

            var freshState = new ScraperAirdnaStateModel { IsNew = true };

            // The instance is only consumed by the optional steps listed below, but it is
            // still constructed here, exactly as before.
            var scraper = new ScraperAirdna(freshState);

            UpdateRepository();

            // Optional steps, currently disabled:
            //   Scrape(scraper);
            //   GetExcelFile(scraper);
            //   PrintSaveStatus(scraper);
        }
コード例 #28
0
        /// <summary>
        /// Downloads <c>https://www.homeless.co.il/rent/1</c> as text, sending a
        /// <c>User_Agent = "Windows"</c> header object with the request.
        /// </summary>
        /// <remarks>
        /// NOTE(review): <paramref name="page"/> is not used; the URL always ends in
        /// <c>/rent/1</c>. Confirm whether the trailing number was meant to be the page.
        /// </remarks>
        /// <param name="page">Requested page number (currently ignored).</param>
        /// <param name="state">Scraper state; not used by this method.</param>
        /// <returns>
        /// The response body, or an empty string when the request throws (the exception is
        /// logged, not rethrown).
        /// </returns>
        private async Task<string> GetPage_WebClientAsync(int page, ScraperAirdnaStateModel state)
        {
            try
            {
                var address = $"https://www.homeless.co.il/rent/1";
                var request = address.WithHeaders(new { User_Agent = "Windows" });

                return await request.GetStringAsync();
            }
            catch (Exception exception)
            {
                _log($"Error x2. {exception.Message} / {exception.StackTrace}");

                return "";
            }
        }
コード例 #29
0
        /// <summary>
        /// Reads every page file returned by <c>_statusWorkspace_AmountPages_GetFilesBase</c>
        /// and projects each stored ad to an <c>ItemTest</c> carrying its id and downloaded
        /// flag.
        /// </summary>
        /// <param name="state">Scraper state forwarded to the file lookup.</param>
        /// <returns>One <c>ItemTest</c> per ad across all page files, in file order.</returns>
        private List<ItemTest> _statusWorkspace_AmountItemsFromPages_GetItems(ScraperAirdnaStateModel state)
        {
            var collected = new List<ItemTest>();

            foreach (var pageFile in _statusWorkspace_AmountPages_GetFilesBase(state))
            {
                var json = File.ReadAllText(pageFile.FullName);
                var ads  = JsonConvert.DeserializeObject<List<AdDtoModel>>(json);

                var projected = ads
                                .Select(ad => new ItemTest { Id = ad.Id, Done = ad.DownloadedItem })
                                .ToList();

                collected.AddRange(projected);
            }

            return collected;
        }
コード例 #30
0
        /// <summary>
        /// Reads every file in <c>state.PagesPath</c> as a JSON list of ads and merges them
        /// into one dictionary keyed by ad id; the first occurrence of an id wins.
        /// </summary>
        /// <param name="state">Scraper state providing <c>PagesPath</c>.</param>
        /// <returns>All distinct ads found in the page files.</returns>
        private async Task<Dictionary<string, AdDtoModel>> _loadPagesAsync(ScraperAirdnaStateModel state)
        {
            var adsById   = new Dictionary<string, AdDtoModel>();
            var pagesDir  = new DirectoryInfo($"{state.PagesPath}");
            var pageFiles = pagesDir.GetFiles();

            foreach (var file in pageFiles)
            {
                var json    = await File.ReadAllTextAsync(file.FullName);
                var pageAds = JsonConvert.DeserializeObject<List<AdDtoModel>>(json);

                foreach (var ad in pageAds)
                {
                    // Later duplicates of an id are ignored.
                    if (adsById.ContainsKey(ad.Id))
                    {
                        continue;
                    }

                    adsById.Add(ad.Id, ad);
                }
            }

            return adsById;
        }