Ejemplo n.º 1
0
        /// <summary>
        /// Imports one top-level food-material category: loads the category page,
        /// walks every ".category_sub.clear" block (one per middle category), resolves
        /// each listed food material through <c>TryGetFoodMaterial</c>, then saves the
        /// result to <c>dataStore</c> and appends it to <c>rawData</c>.
        /// Middle categories that <c>dataStore.HasSave</c> reports as already stored are skipped.
        /// </summary>
        /// <param name="element">
        /// Element describing the top-level category; its inner HTML is used as the
        /// category name and its <c>href</c> attribute as the category page URL.
        /// </param>
        /// <returns>A task that completes when every middle category has been processed.</returns>
        private async Task ImportTopCategoryAsync(IElement element)
        {
            // Stopwatch is monotonic, so the reported duration is not distorted by
            // system clock adjustments the way DateTime.Now subtraction can be.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

            var topCatName = element.InnerHtml;
            var topCatUrl  = element.GetAttribute("href");

            Logger.Info($"导入食材顶级分类: {topCatName} {topCatUrl}");

            var topDoc = await CrawlerHelper.GetDocumentAddHttpPrefixAsync(topCatUrl);

            var middleDivs = topDoc.QuerySelectorAll(".category_sub.clear");

            Logger.Info($"   有{middleDivs.Length} 中类 ");

            foreach (var midDiv in middleDivs)
            {
                // Expected layout: first child is the <h2> heading, last child is the <ul> of materials.
                // (Nodes form the visual tree - use XPath; elements form the logical tree - use selectors.)
                var heading = midDiv.FirstElementChild;
                var ul      = midDiv.LastElementChild;
                if (heading == null || ul == null)
                {
                    // A block without child elements cannot be parsed; skip it instead of
                    // aborting the whole top-level import with a NullReferenceException.
                    Logger.Info($"    {topCatName} 中类结构异常,已跳过 ");
                    continue;
                }

                var middleCatName = heading.TextContent;
                if (dataStore.HasSave(topCatName, middleCatName))
                {
                    continue; // already imported, do not import it again
                }

                var foodMaterials = new FoodMaterialCollection();
                foreach (var li in ul.GetElementsByTagName("li"))
                {
                    var a = li.FirstElementChild;
                    if (a == null)
                    {
                        // An <li> without a child element carries no name/link to import.
                        Logger.Info($"    {middleCatName} 中有 li 缺少链接,已跳过 ");
                        continue;
                    }

                    var foodMaterialName = a.TextContent;
                    var foodMaterialHref = a.GetAttribute("href");

                    var foodMaterial = await TryGetFoodMaterial(foodMaterialName, foodMaterialHref);

                    if (foodMaterial != null)
                    {
                        foodMaterials.Add(foodMaterial);
                    }
                }
                Logger.Info($"    {middleCatName} 有{foodMaterials.Count}个食材 ");

                var rawItem = new FoodMaterialRawDataItem()
                {
                    Top           = topCatName,
                    Middle        = middleCatName,
                    FoodMaterials = foodMaterials
                };

                dataStore.SaveCategory(rawItem);

                rawData.Add(rawItem);

                // Elapsed time is cumulative since this method started, not per middle category.
                string msg = $"导入{topCatName} {middleCatName}耗时:{stopwatch.Elapsed.TotalSeconds}";
                Console.WriteLine(msg);
                Logger.Info(msg);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds a <see cref="FoodMaterialItem"/> for one food material: fills the basic
        /// fields from the arguments, loads the material's "/useful" page, optionally
        /// downloads its picture, and collects the text of every <c>li</c> in the
        /// nutrition table.
        /// </summary>
        /// <param name="foodMaterialName">Display name of the material; stored as <c>Description</c>.</param>
        /// <param name="foodMaterialHref">
        /// URL of the material page; stored as <c>SourceUrl</c>, and "/useful" is appended
        /// to obtain the page that is actually fetched.
        /// </param>
        /// <returns>
        /// The populated item. <c>Nutritions</c> is always a non-null list; it is empty
        /// when the nutrition table is not found on the page.
        /// </returns>
        private async Task<FoodMaterialItem> GetFoodMaterial(string foodMaterialName, string foodMaterialHref)
        {
            // Presumably the last segment of the URL; it doubles as the image file key below.
            string englishName = CrawlerHelper.GetUrlLast(foodMaterialHref);

            var foodMaterial = new FoodMaterialItem()
            {
                Description = foodMaterialName,
                EnglishName = englishName,
                SourceUrl   = foodMaterialHref,
                Photo       = FoodMaterial.GetImageUrlPath(englishName),
                Nutritions  = new List<string>(),
            };

            IHtmlDocument foodMaterialDoc = await CrawlerHelper.GetDocumentAddHttpPrefixAsync(foodMaterialHref + "/useful");

            var sourceImgUrl = foodMaterialDoc.QuerySelector("#category_pic")?.GetAttribute("data-src");

            if (sourceImgUrl == null)
            {
                Logger.Error($"{foodMaterialName} 找不到 category_pic");
            }
            else if (ChiMaConfig.NeedDownloadFoodMaterialImage)
            {
                string localImgPath = FoodMaterial.GetImageLocalPath(englishName);
                try
                {
                    // Await the download so failures are observed and the file is written
                    // before this method returns. The original call was fire-and-forget.
                    // NOTE(review): assumes DownloadImgAndSaveAsync returns an awaitable Task - confirm.
                    await CrawlerHelper.DownloadImgAndSaveAsync(sourceImgUrl, localImgPath);
                }
                catch (System.Exception ex)
                {
                    // Image download stays best-effort: a failed picture must not discard the material.
                    Logger.Error($"{foodMaterialName} 图片下载失败: {sourceImgUrl} {ex.Message}");
                }
            }

            // The first child of the nutrition table container is expected to be the <ul> of entries.
            var nutritionsUL = foodMaterialDoc.QuerySelector(".category_use_table.mt10.clear")?.FirstElementChild;

            if (nutritionsUL != null)
            {
                foreach (var li in nutritionsUL.GetElementsByTagName("li"))
                {
                    // The whole text of the <li> is stored as a single entry.
                    foodMaterial.Nutritions.Add(li.TextContent);
                }
            }

            return foodMaterial;
        }