/// <summary>
/// Crawls every configured Mako recipe category, paging through each category's
/// listing and collecting recipe links that have not been saved before.
/// </summary>
/// <returns>
/// Up to 400 previously-unseen recipe URLs. As a side effect, the returned URLs
/// are appended to <c>CrawlerProfile.SavedUrls</c>.
/// </returns>
public override async Task<List<string>> CrawlAsync()
{
    Console.WriteLine($"{nameof(MakoCrawler)} {nameof(CrawlAsync)} started");
    var urls = new List<string>();

    foreach (var recipeCategory in _recipesCategoriesList)
    {
        var web = new HtmlWeb();
        var htmlDoc = await web.LoadWithRetryAsync($"{_baseUrl}-{recipeCategory}");

        // The category page exposes its last page number in a <max> element.
        var lastPageString = htmlDoc
            .DocumentNode
            .SelectSingleNode("//div/max")
            .InnerText;
        var lastPage = int.Parse(lastPageString);

        for (var counterPages = 1; counterPages <= lastPage; counterPages++)
        {
            var singleUrl = $"{_baseUrl}-{recipeCategory}?page={counterPages}";

            // BUG FIX: the original used the synchronous web.Load(singleUrl) here,
            // blocking the thread inside an async method; use the retry-aware
            // async loader already used everywhere else in this method.
            var htmlInnerDoc = await web.LoadWithRetryAsync(singleUrl);

            // Mako intermittently serves a Tomcat error page instead of content;
            // keep re-fetching until real content arrives.
            while (htmlInnerDoc.DocumentNode.InnerHtml.Contains("Apache Tomcat/6.0.18 - Error report"))
            {
                htmlInnerDoc = await web.LoadWithRetryAsync(singleUrl);
            }

            var nodes = htmlInnerDoc
                .DocumentNode
                .SelectNodes("//li[@class='hover']/div[@class='line-clamp']/h5/a");
            if (nodes == null)
            {
                // SelectNodes returns null (not an empty collection) when nothing matches.
                Console.WriteLine($"WARNING: nodes is null [{nameof(singleUrl)}={singleUrl}]");
                continue;
            }

            // Listing hrefs are site-relative; prepend the host.
            nodes.ForEach(htmlNode => urls.Add("https://www.mako.co.il" + htmlNode.GetAttributeValue("href", "")));
            Console.WriteLine($"{nameof(MakoCrawler)} {nameof(CrawlAsync)} Crawled {urls.Count} {nameof(urls)}");
        }
    }

    // Keep only URLs we have not crawled before, capped at 400 per run.
    var nonExistingUrls = urls
        .Where(url => !CrawlerProfile.SavedUrls.Contains(url))
        .Distinct()
        .Take(400)
        .ToList();
    CrawlerProfile.SavedUrls.AddRange(nonExistingUrls);

    Console.WriteLine($"{nameof(MakoCrawler)} {nameof(CrawlAsync)} finished [{nameof(nonExistingUrls.Count)}={nonExistingUrls.Count}]");
    return nonExistingUrls;
}
/// <summary>
/// Scrapes a single Shef Lavan recipe page into a <see cref="Recipe"/>.
/// </summary>
/// <param name="url">Absolute URL of the recipe page to scrape.</param>
/// <returns>The populated recipe built from the page's metadata and content.</returns>
protected override async Task<Recipe> ScrapeInternalAsync(string url)
{
    Console.WriteLine($"{nameof(ShefLavanScraper)} {nameof(ScrapeInternalAsync)} started [{nameof(url)}={url}]");

    var web = new HtmlWeb();
    var document = await web.LoadWithRetryAsync(url);
    var root = document.DocumentNode;

    // Canonical title/image/link come from the page's Open Graph meta tags.
    var recipeTitle = root.SelectSingleNode("//meta[@property='og:title']").GetAttributeValue("content", "");
    var pictureUrl = root.SelectSingleNode("//meta[@property='og:image']").GetAttributeValue("content", "");
    var canonicalLink = root.SelectSingleNode("//meta[@property='og:url']").GetAttributeValue("content", "");

    // These elements may be absent on some pages, hence the null-conditional access.
    var cookingTime = root.SelectSingleNode("//div[@class='short-properties bold']/span[@class='cooking-time']")?.InnerText;
    var dishCountText = root.SelectSingleNode("//div[@class='properties']/span[@class='property']/strong")?.InnerText;
    var ingredientNodes = root.SelectNodes("//div[@class='ingredients col-lg-3 col-md-4 col-sm-6 ']/ul[@class='ingredients-list']/li");

    var recipe = new Recipe
    {
        Id = GetIdFromUrl(canonicalLink),
        PreparationTime = cookingTime,
        Link = canonicalLink,
        NumberOfDishes = GetNumberOfDishes(dishCountText),
        Picture = pictureUrl,
        RecipeTitle = recipeTitle,
        IngredientsList = CreateIngredientsList(ingredientNodes),
        ValuesToSearch = TokanizationHelper.Tokenaize(recipeTitle),
        NormalaizedIngredients = CreatenormalaizedIngredientsList(ingredientNodes)
    };

    Console.WriteLine($"{nameof(ShefLavanScraper)} {nameof(ScrapeInternalAsync)} finished.");
    return recipe;
}
/// <summary>
/// Scrapes a single Mako recipe page into a <see cref="Recipe"/>.
/// </summary>
/// <param name="url">Absolute URL of the recipe page to scrape.</param>
/// <returns>The populated recipe built from the page's metadata and content.</returns>
protected override async Task<Recipe> ScrapeInternalAsync(string url)
{
    Console.WriteLine($"{nameof(MakoScraper)} {nameof(ScrapeInternalAsync)} started [{nameof(url)}={url}]");

    var web = new HtmlWeb();
    var htmlDoc = await web.LoadWithRetryAsync(url);

    // Canonical title/image/link come from the page's Open Graph meta tags.
    var title = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:title']").GetAttributeValue("content", "");
    var image = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:image']").GetAttributeValue("content", "");
    var link = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:url']").GetAttributeValue("content", "");

    // These elements may be absent on some pages, hence the null-conditional access.
    var prepTime = htmlDoc.DocumentNode.SelectSingleNode("//ul[@class='table_container']/li[@class='titleContainer']/div/span[@itemprop='totalTime']")?.InnerText;

    // IMPROVED: the original used SelectNodes(...)?.First().InnerText; SelectSingleNode
    // returns the first match directly (or null), matching ShefLavanScraper's style.
    var numOfDishes = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='ingredients']/h3[@class='IngredientsTitle fontSize']")?.InnerText;

    var ingredients = htmlDoc.DocumentNode.SelectNodes("//div[@class='ingredients']/ul[@class='recipeIngredients']/li/span");

    var recipeToAdd = new Recipe
    {
        Id = GetIdFromUrl(link),
        PreparationTime = prepTime,
        Link = link,
        NumberOfDishes = GetNumberOfDishes(numOfDishes),
        Picture = image,
        RecipeTitle = title,
        IngredientsList = CreateIngredientsList(ingredients),
        ValuesToSearch = TokanizationHelper.Tokenaize(title),
        NormalaizedIngredients = CreatenormalaizedIngredientsList(ingredients)
    };

    Console.WriteLine($"{nameof(MakoScraper)} {nameof(ScrapeInternalAsync)} finished.");
    return recipeToAdd;
}
/// <summary>
/// Crawls the Shef Lavan recipe listing page by page and collects recipe links
/// that have not been saved before.
/// </summary>
/// <returns>
/// Up to 400 previously-unseen recipe URLs. As a side effect, the returned URLs
/// are appended to <c>CrawlerProfile.SavedUrls</c>.
/// </returns>
public override async Task<List<string>> CrawlAsync()
{
    Console.WriteLine($"{nameof(ShefLavanCrawler)} {nameof(CrawlAsync)} started");
    var urls = new List<string>();

    var web = new HtmlWeb();
    var htmlDoc = await web.LoadWithRetryAsync(_baseUrl);

    // The "go to last page" anchor carries the total page count.
    var lastPageString = htmlDoc.DocumentNode.SelectSingleNode("//a[@title='עבור לעמוד האחרון']")
        .GetAttributeValue("data-action-value", "");
    var lastPage = int.Parse(lastPageString);

    for (var counterPages = 1; counterPages <= lastPage; counterPages++)
    {
        var singleUrl = $"{_baseUrl}&page={counterPages}";
        var htmlInnerDoc = await web.LoadWithRetryAsync(singleUrl);

        var nodes = htmlInnerDoc
            .DocumentNode
            .SelectNodes("//div[@class='list-box-content-wrapper']/a[@class='card-link']");
        if (nodes == null)
        {
            // BUG FIX: SelectNodes returns null (not an empty collection) when nothing
            // matches; the original dereferenced it unconditionally and would throw on
            // a page with no recipe cards. Mirrors the guard in MakoCrawler.CrawlAsync.
            Console.WriteLine($"WARNING: nodes is null [{nameof(singleUrl)}={singleUrl}]");
            continue;
        }

        nodes.ForEach(htmlNode => urls.Add(htmlNode.GetAttributeValue("href", "")));
        Console.WriteLine($"{nameof(ShefLavanCrawler)} {nameof(CrawlAsync)} Crawled {urls.Count} {nameof(urls)}");
    }

    // Keep only URLs we have not crawled before, capped at 400 per run.
    var nonExistingUrls = urls
        .Where(url => !CrawlerProfile.SavedUrls.Contains(url))
        .Distinct()
        .Take(400)
        .ToList();
    CrawlerProfile.SavedUrls.AddRange(nonExistingUrls);

    Console.WriteLine($"{nameof(ShefLavanCrawler)} {nameof(CrawlAsync)} finished [{nameof(nonExistingUrls.Count)}={nonExistingUrls.Count}]");
    return nonExistingUrls;
}