/// <summary>
/// Loads <paramref name="url"/> with AngleSharp and collects the page title, the
/// "keywords" and "description" meta values and the resolved targets of every
/// anchor whose <c>href</c> attribute starts with "h". Then opens the same address
/// in headless Chrome, saves a screenshot as "&lt;guid&gt;.png" and attaches it to the result.
/// </summary>
/// <param name="url">Address of the page to scrape.</param>
/// <returns>
/// The populated <see cref="WebScraperModel"/>. <c>keywords</c> and <c>description</c>
/// are <c>null</c> when the page has no matching meta tag.
/// </returns>
private async Task<WebScraperModel> GetPageData(string url)
{
    var config = Configuration.Default.WithDefaultLoader();
    var context = BrowsingContext.New(config);
    var document = await context.OpenAsync(url);

    // The head and the full markup are re-parsed into stand-alone documents,
    // exactly as before; only the null handling around them is new.
    var parser = new AngleSharp.Html.Parser.HtmlParser();
    var headDocument = parser.ParseDocument(document.Head?.InnerHtml ?? string.Empty);
    var fullDocument = parser.ParseDocument(document.DocumentElement?.InnerHtml ?? string.Empty);

    var metaTags = headDocument.All.Where(x => x.LocalName == "meta").ToList();

    // A page without the meta tag yields null instead of a NullReferenceException.
    var metaKeywords = metaTags
        .FirstOrDefault(x => x.GetAttribute("Name") == "keywords")
        ?.GetAttribute("Content");
    var metaDescription = metaTags
        .FirstOrDefault(x => x.GetAttribute("Name") == "description")
        ?.GetAttribute("Content");

    // Anchors without an href attribute (e.g. <a name="...">) are skipped
    // rather than crashing the whole scrape.
    var linksPath = fullDocument.QuerySelectorAll("a")
        .Where(x => x.GetAttribute("href")?.StartsWith("h", StringComparison.Ordinal) == true)
        .Cast<IHtmlAnchorElement>()
        .Select(m => m.Href)
        .ToList();

    var mymodel = new WebScraperModel
    {
        title = document.Title,
        description = metaDescription,
        keywords = metaKeywords,
        AllHyperLinks = linksPath,
    };

    var options = new ChromeOptions();
    options.AddArgument("headless"); // Remove this argument to see the browser window.

    var driver = new ChromeDriver(
        Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location),
        options);
    try
    {
        driver.Navigate().GoToUrl(url);

        var screenshot = ((ITakesScreenshot)driver).GetScreenshot();
        screenshot.SaveAsFile(Guid.NewGuid() + ".png");
        mymodel.screenShot = screenshot;
    }
    finally
    {
        // Always end the browser session, even when navigation or the
        // screenshot fails; otherwise the chromedriver process is left running.
        driver.Quit();
    }

    return mymodel;
}
/// <summary>
/// Walks the paged doctor lists at blsbg.eu region by region, adds one
/// <see cref="UserAcc"/> to the repository per table row and saves the
/// repository once after all regions have been processed.
/// </summary>
/// <returns>Always 1.</returns>
public async Task<int> AddAsyncDoctor()
{
    const int maxAttempts = 5;
    const int retryDelayMs = 500;
    const double doctorsPerPage = 25;

    // Index == region id used in the list URL (index 0 is unused).
    // "готово" is Bulgarian for "done"; ids 1, 2 and 23 are skipped below,
    // presumably because they were imported earlier.
    string[] cityNames =
    {
        string.Empty,
        "готово", "готово", "Бургас", "Варна", "Велико Търново", "Видин", "Враца",
        "Габрово", "Добрич", "Кърджали", "Кюстендил", "Ловеч", "Монтана", "Пазарджик",
        "Перник", "Плевен", "Пловдив", "Разград", "Русе", "Силистра", "Сливен",
        "Смолян", "София", "София-област", "Стара Загора", "Търговище", "Хасково",
        "Шумен", "Ямбол",
    };

    var parser = new AngleSharp.Html.Parser.HtmlParser();

    using (var client = new HttpClient())
    {
        // NOTE(review): the upper bound stops at 27, so ids 28 ("Шумен") and
        // 29 ("Ямбол") are never fetched. Kept as in the original; confirm
        // whether that is intentional.
        for (var cityNumber = 1; cityNumber < 28; cityNumber++)
        {
            if (cityNumber == 1 || cityNumber == 2 || cityNumber == 23)
            {
                continue;
            }

            var city = cityNames[cityNumber];

            // Starts at one page; the real page count is read from each response.
            var pageCount = 1.0;
            for (var page = 1; page <= pageCount; page++)
            {
                var url = $"https://blsbg.eu/bg/medics/unionlist/{cityNumber}?UIN_page={page}&ajax=yw0";

                string html = null;
                for (var attempt = 0; attempt < maxAttempts; attempt++)
                {
                    try
                    {
                        using (var response = await client.GetAsync(url))
                        {
                            html = await response.Content.ReadAsStringAsync();
                        }

                        break;
                    }
                    catch (Exception)
                    {
                        // Deliberately broad: the download is best effort and
                        // any failure is reported with '!' and retried.
                        Console.Write('!');
                        await Task.Delay(retryDelayMs);
                    }
                }

                if (string.IsNullOrWhiteSpace(html))
                {
                    continue;
                }

                var document = parser.ParseDocument(html);

                var summary = document.GetElementsByClassName("summary").FirstOrDefault();
                if (summary == null)
                {
                    // No result summary on the page: skip it instead of
                    // failing on an out-of-range index.
                    Console.Write('!');
                    continue;
                }

                // The last space-separated token of the summary is the total
                // number of doctors for the region.
                var summaryTokens = summary.TextContent.Split(' ');
                double totalDoctors;
                if (double.TryParse(summaryTokens[summaryTokens.Length - 1], out totalDoctors))
                {
                    pageCount = Math.Ceiling(totalDoctors / doctorsPerPage);
                }

                // The first <tr> is skipped, as in the original (header row).
                foreach (var row in document.GetElementsByTagName("tr").Skip(1))
                {
                    var cells = row.GetElementsByTagName("td");

                    // Rows that lack the expected cells or a numeric UIN are
                    // skipped so one malformed row cannot abort the import.
                    long uin;
                    if (cells.Length < 4 || !long.TryParse(cells[0].TextContent, out uin))
                    {
                        continue;
                    }

                    var addDoctor = new UserAcc
                    {
                        UIN = uin,
                        Name = cells[2].TextContent,
                        City = city,
                        Specialty = cells[3].TextContent,
                    };

                    await this.addRepository.AddAsync(addDoctor);
                }
            }
        }
    }

    await this.addRepository.SaveChangesAsync();
    return 1;
}
/// <summary>
/// Downloads the listing pages <paramref name="fromId"/>..<paramref name="toId"/>
/// (inclusive) from bestdoctors.bg, adds one <see cref="UserAcc"/> to the
/// repository per "docbox" element and saves the repository once at the end.
/// </summary>
/// <param name="fromId">First page number to download.</param>
/// <param name="toId">Last page number to download (inclusive).</param>
/// <returns>Always 1.</returns>
public async Task<int> AddDoctor(int fromId, int toId)
{
    const int maxAttempts = 10;
    const int retryDelayMs = 500;

    // Shared clean-up applied to every scraped text value.
    Func<string, string> clean = text => text.Replace("\n", string.Empty).Trim();

    var parser = new AngleSharp.Html.Parser.HtmlParser();

    using (var client = new HttpClient())
    {
        for (var page = fromId; page <= toId; page++)
        {
            Console.Write('^');
            var url = $"https://bestdoctors.bg/doctors/p/{page}";

            string html = null;
            for (var attempt = 0; attempt < maxAttempts; attempt++)
            {
                try
                {
                    using (var response = await client.GetAsync(url))
                    {
                        html = await response.Content.ReadAsStringAsync();
                    }

                    break;
                }
                catch (Exception)
                {
                    // Deliberately broad: the download is best effort and any
                    // failure is reported with '!' and retried.
                    Console.Write('!');
                    await Task.Delay(retryDelayMs);
                }
            }

            if (string.IsNullOrWhiteSpace(html))
            {
                continue;
            }

            var document = parser.ParseDocument(html);

            foreach (var card in document.GetElementsByClassName("docbox"))
            {
                var heading = card.GetElementsByTagName("h4").FirstOrDefault();
                var paragraphs = card.GetElementsByTagName("p");
                var specialtySpan = card.GetElementsByTagName("span").FirstOrDefault();
                var image = card.GetElementsByTagName("img").FirstOrDefault();

                // A card missing any expected element is reported with '?' and
                // skipped, so one malformed entry cannot abort the whole run.
                if (heading == null
                    || paragraphs.Length < 2
                    || specialtySpan == null
                    || image == null
                    || image.Attributes.Length == 0)
                {
                    Console.Write('?');
                    continue;
                }

                var addDoctor = new UserAcc
                {
                    Name = clean(heading.TextContent),
                    City = clean(paragraphs[1].TextContent.Replace(",", string.Empty)),
                    Specialty = clean(specialtySpan.TextContent),

                    // NOTE(review): this stores the value of the image's FIRST
                    // attribute, whatever its name. Kept as in the original;
                    // confirm it is always the image source on this site.
                    FilePath = image.Attributes[0].Value,
                };

                // The second link of the card, when present, is stored as SecondName.
                var links = card.GetElementsByTagName("a");
                if (links.Length > 1)
                {
                    addDoctor.SecondName = clean(links[1].TextContent);
                }

                await this.addRepository.AddAsync(addDoctor);
            }
        }
    }

    await this.addRepository.SaveChangesAsync();
    return 1;
}