Beispiel #1
0
        /// <summary>
        /// Starts the web client, loads the Curlie home page, parses one root category
        /// and then attempts to write the collected data to CSV.
        /// </summary>
        /// <remarks>
        /// For demo purposes the first root category (Arts, which is extra big) is skipped
        /// and only the one following it is parsed.
        /// </remarks>
        /// <returns>A task that completes after parsing and the CSV export attempt have finished.</returns>
        /// <exception cref="InvalidOperationException">
        /// The home page does not contain the expected category section or category entries.
        /// </exception>
        public async Task Begin()
        {
            CurlieWebClient.Start();
            var curlieHomePageDoc = await CurlieWebClient.LoadPage(CurlieHomepage);

            var rootNode = curlieHomePageDoc.DocumentNode;

            var categorySectionNode = rootNode.SelectSingleNode("//section[@id='category-section']");
            if (categorySectionNode == null)
            {
                throw new InvalidOperationException(
                    "Home page does not contain a section with id 'category-section'.");
            }

            // HtmlAgilityPack's SelectNodes yields null (not an empty collection) when nothing matches.
            var categoryAsides = categorySectionNode.SelectNodes("aside");
            if (categoryAsides == null)
            {
                throw new InvalidOperationException(
                    "Category section does not contain any 'aside' category entries.");
            }

            // For demo purposes skip Arts (the first entry), which is extra big.
            bool skipped = false;

            foreach (var categoryAside in categoryAsides)
            {
                if (!skipped)
                {
                    skipped = true;
                    continue;
                }

                await ParseRootCategory(categoryAside);

                // Only a single root category is parsed in the demo.
                break;
            }

            try
            {
                WriteToCsv();
            }
            catch (Exception ex)
            {
                // Print the whole exception: the stack trace alone omits the type and message.
                Console.WriteLine(ex);
            }
        }
Beispiel #2
0
        /// <summary>
        /// Loads the page of <paramref name="rootCategory"/>, extracts its subcategories and
        /// recursively descends into every subcategory that has not been registered yet.
        /// </summary>
        /// <remarks>
        /// For demo purposes at most four subcategories are processed per category.
        /// Each new category is registered in <c>Categories</c> under its URL. When the URL is
        /// already registered, <paramref name="rootCategory"/> is added as an additional parent
        /// of the existing entry and no recursion takes place.
        /// </remarks>
        /// <param name="rootCategory">
        /// Category whose page is scanned; it becomes a parent of every subcategory found.
        /// </param>
        /// <returns>A task that completes once the (limited) set of subcategories has been processed.</returns>
        private async Task ParseSubCategories(Category rootCategory)
        {
            Console.WriteLine($"Extracting subcategories for {rootCategory.Url}");

            // Read the category page to find its subcategories.
            var categoryPage = await CurlieWebClient.LoadPage(GetCategoryFullUrl(rootCategory.Url));

            var categoryRoot = categoryPage.DocumentNode;

            var subcategoriesDiv = categoryRoot.SelectSingleNode("//div[@id='subcategories-div']");

            // If the category has no subcategories, there is nothing to do.
            if (subcategoriesDiv == null)
            {
                return;
            }

            // HtmlAgilityPack's SelectNodes yields null (not an empty collection) when nothing matches.
            var subcategoriesSections = subcategoriesDiv.SelectNodes("section[@class='children']");
            if (subcategoriesSections == null)
            {
                return;
            }

            var catItems = new List<HtmlNode>();

            foreach (var subcategoriesSection in subcategoriesSections)
            {
                var sectionItems = subcategoriesSection.SelectNodes("div/div[@class='cat-item']");
                if (sectionItems != null)
                {
                    catItems.AddRange(sectionItems);
                }
            }

            // For demo purposes, extract only 4 subcategories per category.
            const int maxSubcategories = 4;
            int categoriesExtracted = 0;

            foreach (var catItem in catItems)
            {
                if (categoriesExtracted >= maxSubcategories)
                {
                    break;
                }

                var aNode = catItem.SelectSingleNode("a");
                var url   = aNode?.Attributes["href"]?.Value;

                // The category name is the text node that follows the icon element inside the link.
                var name = aNode?.SelectSingleNode("div/i")?.NextSibling?.InnerText?.Trim();

                // Skip items whose markup does not have the expected link/name structure
                // instead of failing the whole crawl; they do not count toward the limit.
                if (url == null || name == null)
                {
                    Console.WriteLine($"Skipping malformed subcategory item in {rootCategory.Url}");
                    continue;
                }

                var category = new Category()
                {
                    Name = name,
                    Url  = url
                };

                category.Parents.Add(rootCategory);
                if (Categories.TryAdd(category.Url, category))
                {
                    // First time this URL is seen: descend into it.
                    await ParseSubCategories(category);
                }
                else
                {
                    // Already known: only record the additional parent on the existing entry.
                    Categories[category.Url].Parents.Add(rootCategory);
                }

                ++categoriesExtracted;
                Console.WriteLine($"Progress for {rootCategory.Url}: {categoriesExtracted}/{maxSubcategories}");
            }
        }