Example #1
0
        /// <summary>
        /// Crawls every configured Mako recipe category, paging through each category's
        /// listing and collecting recipe URLs that have not been saved before.
        /// </summary>
        /// <returns>
        /// Up to 400 distinct, previously unsaved recipe URLs; these are also appended
        /// to <c>CrawlerProfile.SavedUrls</c> as a side effect.
        /// </returns>
        public override async Task <List <string> > CrawlAsync()
        {
            Console.WriteLine($"{nameof(MakoCrawler)} {nameof(CrawlAsync)} started");

            var urls = new List <string>();

            foreach (var recipeCategory in _recipesCategoriesList)
            {
                var web = new HtmlWeb();

                var htmlDoc = await web.LoadWithRetryAsync($"{_baseUrl}-{recipeCategory}");

                // The category page exposes its last page number in a <max> element.
                // Guard against a missing node instead of NRE-ing on .InnerText.
                var lastPageNode = htmlDoc.DocumentNode.SelectSingleNode("//div/max");

                if (lastPageNode == null)
                {
                    Console.WriteLine($"WARNING: last page node is null [{nameof(recipeCategory)}={recipeCategory}]");
                    continue;
                }

                var lastPage = int.Parse(lastPageNode.InnerText);

                for (var counterPages = 1; counterPages <= lastPage; counterPages++)
                {
                    var singleUrl = $"{_baseUrl}-{recipeCategory}?page={counterPages}";

                    // BUGFIX: was the blocking web.Load(singleUrl); use the async retry
                    // loader so the method stays async all the way, consistent with the
                    // other loads in this crawler.
                    var htmlInnerDoc = await web.LoadWithRetryAsync(singleUrl);

                    // Mako intermittently serves a Tomcat error page; keep reloading
                    // until real content arrives.
                    while (htmlInnerDoc.DocumentNode.InnerHtml.Contains("Apache Tomcat/6.0.18 - Error report"))
                    {
                        htmlInnerDoc = await web.LoadWithRetryAsync(singleUrl);
                    }

                    var nodes = htmlInnerDoc.
                                DocumentNode.
                                SelectNodes("//li[@class='hover']/div[@class='line-clamp']/h5/a");

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    if (nodes == null)
                    {
                        Console.WriteLine($"WARNING: nodes is null [{nameof(singleUrl)}={singleUrl}]");
                        continue;
                    }

                    // Listing links are site-relative; prefix with the site root.
                    nodes.
                    ForEach(htmlNode => urls.Add("https://www.mako.co.il" + htmlNode.GetAttributeValue("href", "")));

                    Console.WriteLine($"{nameof(MakoCrawler)} {nameof(CrawlAsync)} Crawled {urls.Count} {nameof(urls)}");
                }
            }

            // Keep only URLs not seen in previous crawls, capped at 400 per run.
            var nonExistingUrls =
                urls.
                Where(url => !CrawlerProfile.SavedUrls.Contains(url)).
                Distinct().
                Take(400).
                ToList();

            CrawlerProfile.SavedUrls.AddRange(nonExistingUrls);

            Console.WriteLine($"{nameof(MakoCrawler)} {nameof(CrawlAsync)} finished [{nameof(nonExistingUrls.Count)}={nonExistingUrls.Count}]");

            return(nonExistingUrls);
        }
Example #2
0
        /// <summary>
        /// Scrapes a single Shef Lavan recipe page and assembles a <c>Recipe</c>
        /// from its Open Graph metadata and ingredient list.
        /// </summary>
        /// <param name="url">Absolute URL of the recipe page to scrape.</param>
        /// <returns>The populated <c>Recipe</c>.</returns>
        protected override async Task <Recipe> ScrapeInternalAsync(string url)
        {
            Console.WriteLine($"{nameof(ShefLavanScraper)} {nameof(ScrapeInternalAsync)} started [{nameof(url)}={url}]");

            var web  = new HtmlWeb();
            var root = (await web.LoadWithRetryAsync(url)).DocumentNode;

            // Open Graph meta tags carry the canonical title, image and URL.
            var recipeTitle = root.SelectSingleNode("//meta[@property='og:title']").GetAttributeValue("content", "");
            var picture     = root.SelectSingleNode("//meta[@property='og:image']").GetAttributeValue("content", "");
            var canonical   = root.SelectSingleNode("//meta[@property='og:url']").GetAttributeValue("content", "");

            // These elements are absent on some pages, hence the null-conditional access.
            var cookingTime = root.SelectSingleNode("//div[@class='short-properties bold']/span[@class='cooking-time']")?.InnerText;
            var dishesText  = root.SelectSingleNode("//div[@class='properties']/span[@class='property']/strong")?.InnerText;

            var ingredientNodes = root.SelectNodes("//div[@class='ingredients col-lg-3 col-md-4 col-sm-6 ']/ul[@class='ingredients-list']/li");

            var recipe = new Recipe
            {
                Id = GetIdFromUrl(canonical),
                PreparationTime        = cookingTime,
                Link                   = canonical,
                NumberOfDishes         = GetNumberOfDishes(dishesText),
                Picture                = picture,
                RecipeTitle            = recipeTitle,
                IngredientsList        = CreateIngredientsList(ingredientNodes),
                ValuesToSearch         = TokanizationHelper.Tokenaize(recipeTitle),
                NormalaizedIngredients = CreatenormalaizedIngredientsList(ingredientNodes)
            };

            Console.WriteLine($"{nameof(ShefLavanScraper)} {nameof(ScrapeInternalAsync)} finished.");

            return recipe;
        }
Example #3
0
        /// <summary>
        /// Scrapes a single Mako recipe page and assembles a <c>Recipe</c>
        /// from its Open Graph metadata and ingredient list.
        /// </summary>
        /// <param name="url">Absolute URL of the recipe page to scrape.</param>
        /// <returns>The populated <c>Recipe</c>.</returns>
        protected override async Task <Recipe> ScrapeInternalAsync(string url)
        {
            Console.WriteLine($"{nameof(MakoScraper)} {nameof(ScrapeInternalAsync)} started [{nameof(url)}={url}]");

            var web     = new HtmlWeb();
            var htmlDoc = await web.LoadWithRetryAsync(url);

            // Open Graph meta tags carry the canonical title, image and URL.
            var title       = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:title']").GetAttributeValue("content", "");
            var image       = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:image']").GetAttributeValue("content", "");
            var prepTime    = htmlDoc.DocumentNode.SelectSingleNode("//ul[@class='table_container']/li[@class='titleContainer']/div/span[@itemprop='totalTime']")?.InnerText;
            var link        = htmlDoc.DocumentNode.SelectSingleNode("//meta[@property='og:url']").GetAttributeValue("content", "");
            // IDIOM: was SelectNodes(...)?.First() — SelectSingleNode already yields the
            // first match (or null), so this is equivalent but direct and allocation-free.
            var numOfDishes = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='ingredients']/h3[@class='IngredientsTitle fontSize']")?.InnerText;
            var ingredients = htmlDoc.DocumentNode.SelectNodes("//div[@class='ingredients']/ul[@class='recipeIngredients']/li/span");

            var recipeToAdd = new Recipe
            {
                Id = GetIdFromUrl(link),
                PreparationTime        = prepTime,
                Link                   = link,
                NumberOfDishes         = GetNumberOfDishes(numOfDishes),
                Picture                = image,
                RecipeTitle            = title,
                IngredientsList        = CreateIngredientsList(ingredients),
                ValuesToSearch         = TokanizationHelper.Tokenaize(title),
                NormalaizedIngredients = CreatenormalaizedIngredientsList(ingredients)
            };

            Console.WriteLine($"{nameof(MakoScraper)} {nameof(ScrapeInternalAsync)} finished.");

            return(recipeToAdd);
        }
Example #4
0
        /// <summary>
        /// Crawls the Shef Lavan recipe listing page by page, collecting recipe URLs
        /// that have not been saved before.
        /// </summary>
        /// <returns>
        /// Up to 400 distinct, previously unsaved recipe URLs; these are also appended
        /// to <c>CrawlerProfile.SavedUrls</c> as a side effect.
        /// </returns>
        public override async Task <List <string> > CrawlAsync()
        {
            Console.WriteLine($"{nameof(ShefLavanCrawler)} {nameof(CrawlAsync)} started");

            var urls    = new List <string>();
            var web     = new HtmlWeb();
            var htmlDoc = await web.LoadWithRetryAsync(_baseUrl);

            var lastPageString = htmlDoc.DocumentNode.SelectSingleNode("//a[@title='עבור לעמוד האחרון']")
                                 .GetAttributeValue("data-action-value", "");
            var lastPage = int.Parse(lastPageString);

            for (var counterPages = 1; counterPages <= lastPage; counterPages++)
            {
                var singleUrl    = $"{_baseUrl}&page={counterPages}";
                var htmlInnerDoc = await web.LoadWithRetryAsync(singleUrl);

                var nodes = htmlInnerDoc.
                            DocumentNode.
                            SelectNodes("//div[@class='list-box-content-wrapper']/a[@class='card-link']");

                // BUGFIX: SelectNodes returns null (not an empty collection) when nothing
                // matches, so calling ForEach directly NRE'd on empty pages. Guard the
                // same way MakoCrawler.CrawlAsync does.
                if (nodes == null)
                {
                    Console.WriteLine($"WARNING: nodes is null [{nameof(singleUrl)}={singleUrl}]");
                    continue;
                }

                nodes.
                ForEach(htmlNode => urls.Add(htmlNode.GetAttributeValue("href", "")));

                Console.WriteLine($"{nameof(ShefLavanCrawler)} {nameof(CrawlAsync)} Crawled {urls.Count} {nameof(urls)}");
            }

            // Keep only URLs not seen in previous crawls, capped at 400 per run.
            var nonExistingUrls =
                urls.
                Where(url => !CrawlerProfile.SavedUrls.Contains(url)).
                Distinct().
                Take(400).
                ToList();

            CrawlerProfile.
            SavedUrls.
            AddRange(nonExistingUrls);

            Console.WriteLine($"{nameof(ShefLavanCrawler)} {nameof(CrawlAsync)} finished [{nameof(nonExistingUrls.Count)}={nonExistingUrls.Count}]");

            return(nonExistingUrls);
        }