/// <summary>
/// Downloads the page at <paramref name="url"/> (decoded as UTF-8), extracts the
/// child-category payload with the configured <c>categoryOption.ChildCategoriesRegex</c>
/// and deserializes it from JSON.
/// </summary>
/// <param name="url">Address of the page that embeds the child-category data.</param>
/// <returns>
/// The deserialized <see cref="ChildCategoryResult"/>; <c>null</c> when the page is empty,
/// the pattern does not match, or the captured group is blank.
/// </returns>
private ChildCategoryResult GetChildCategoryResult(string url)
{
    string childDataString = WebClientHelper.GetContent(url, Encoding.UTF8);
    if (string.IsNullOrWhiteSpace(childDataString))
    {
        // Regex.Match throws ArgumentNullException on null input; nothing to parse anyway.
        return null;
    }

    Regex reg = new Regex(categoryOption.ChildCategoriesRegex.Pattern, RegexOptions.IgnoreCase);
    Match m = reg.Match(childDataString);

    // Regex.Match never returns null; a failed match is reported through Success.
    if (!m.Success)
    {
        return null;
    }

    string data = m.Groups[categoryOption.ChildCategoriesRegex.GroupName].Value;
    if (string.IsNullOrWhiteSpace(data))
    {
        // The named group did not capture anything usable.
        return null;
    }

    return JsonConvert.DeserializeObject<ChildCategoryResult>(data);
}
/// <summary>
/// Scans the listing pages of every category in <c>needCollectCategories</c> and enqueues
/// each product found (name, link and category ids) into a freshly created
/// <c>collectorQueue</c>.
/// </summary>
/// <remarks>
/// For each category the scan starts at the category link and follows the "next page" link
/// extracted from each page until there is no next link, a page comes back empty, or
/// <c>searchOption.MaxSearchPages</c> pages have been read (a negative value means no limit).
/// Pages are followed with a loop rather than recursion so the call stack does not grow with
/// the number of pages. Progress and status messages are reported through <c>Writer</c>.
/// </remarks>
private void DataAnalyzing()
{
    collectorQueue = new Queue<SearchProductResult>();

    // The patterns come from configuration and do not change while scanning,
    // so the regexes are built once instead of once per page.
    Regex nextPageRegex = new Regex(searchOption.NextPageRegex.Pattern, RegexOptions.IgnoreCase);
    Regex productRegex = new Regex(searchOption.ItemRegex.Pattern, RegexOptions.IgnoreCase);

    // Evaluated once; the original re-counted the sequence for every category.
    int totalCategories = needCollectCategories.Count();
    int catIndex = 0;

    foreach (var cat in needCollectCategories)
    {
        Writer.writeInvoke(new MessageState { Text = $"分析“{cat.Name}”下的商品……" });

        // Number of pages already read for this category.
        int pageNum = 0;
        string url = cat.Link;

        while (!string.IsNullOrWhiteSpace(url)
               && (searchOption.MaxSearchPages < 0 || pageNum < searchOption.MaxSearchPages))
        {
            // Pause before every request to throttle the crawl.
            Thread.Sleep(1000);

            string searchHtml = WebClientHelper.GetContent(url, searchOption.Encoding, searchOption.CookieString);
            if (string.IsNullOrWhiteSpace(searchHtml))
            {
                // Nothing to match against (and Regex.Match would throw on null).
                break;
            }

            // A failed match yields an empty group value, which ends the loop below.
            string nextLink = nextPageRegex.Match(searchHtml).Groups[searchOption.NextPageRegex.GroupName].Value;
            if (nextLink.StartsWith("?", StringComparison.Ordinal))
            {
                // Query-only link: resolve it against the current page's path.
                nextLink = url.Split('?')[0] + nextLink;
            }

            // Queue every product found on the current page.
            foreach (Match m in productRegex.Matches(searchHtml))
            {
                collectorQueue.Enqueue(new SearchProductResult
                {
                    SiteCatId = cat.SiteCatId,
                    CategoryId = cat.CategoryId,
                    Name = Tools.IgnoreHtmlTag(m.Groups[searchOption.ItemRegex.TitleGroupName].Value),
                    Link = m.Groups[searchOption.ItemRegex.LinkGroupName].Value
                });
            }

            pageNum++;
            url = nextLink;
        }

        Writer.writeInvoke(new ProgressState { Max = totalCategories, Value = ++catIndex });
        Writer.writeInvoke(new MessageState { Text = $"“{cat.Name}”下的商品分析完成!" });
    }
}
/// <summary>
/// Collects a single product: downloads its detail page, extracts the embedded setup JSON,
/// the gallery images and the description, downloads the images through
/// <c>WebClientHelper.DownloadFile</c>, rewrites the description to point at the returned
/// addresses, and stores a <see cref="Product"/> with one <see cref="ProductSku"/>.
/// </summary>
/// <param name="searchRst">Queue entry describing the product to collect (link and category id).</param>
/// <returns>
/// <c>true</c> when <c>SaveChanges</c> reports at least one affected row; <c>false</c> when
/// <paramref name="searchRst"/> is null, the detail page is empty, or the setup data
/// (including its <c>ItemDO</c> / <c>Detail</c> parts) cannot be extracted.
/// </returns>
/// <remarks>
/// Numeric fields are parsed with the invariant culture because they come from page data, not
/// from user input. Malformed numbers still throw from the <c>Parse</c> calls, as before, but
/// now before any image is downloaded.
/// </remarks>
bool getProduct(SearchProductResult searchRst)
{
    if (searchRst == null)
    {
        return false;
    }

    #region Fetch product data

    var detailsHtml = WebClientHelper.GetContent(searchRst.Link, detailsOption.Encoding);
    if (string.IsNullOrWhiteSpace(detailsHtml))
    {
        return false;
    }

    // Setup data embedded in the detail page.
    Regex setupRegex = new Regex(detailsOption.SetupRegex.Pattern, RegexOptions.IgnoreCase);
    Match setupMatch = setupRegex.Match(detailsHtml);

    // Regex.Match never returns null; a failed match is reported through Success.
    if (!setupMatch.Success)
    {
        return false;
    }

    string setupData = setupMatch.Groups[detailsOption.SetupRegex.GroupName].Value;
    ProductSetupResult setup = string.IsNullOrWhiteSpace(setupData)
        ? null
        : JsonConvert.DeserializeObject<ProductSetupResult>(setupData);

    // Detail is needed for the SKU price below; treat its absence like a missing ItemDO
    // instead of failing with a NullReferenceException after the downloads.
    if (setup == null || setup.ItemDO == null || setup.Detail == null)
    {
        return false;
    }

    // Parse the numeric fields up front so bad data fails before any image is downloaded.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    long brandId = long.Parse(setup.ItemDO.BrandId, invariant);
    float weight = float.Parse(setup.ItemDO.Weight, invariant);
    long sourceProductId = long.Parse(setup.ItemDO.ItemId, invariant);
    decimal salePrice = decimal.Parse(setup.Detail.DefaultItemPrice, invariant);

    // Id of the product record to be created.
    long productId = Tools.NewId();

    // Gallery images: key is "<ItemId>_v<index>", value is the collected image address.
    Dictionary<string, string> proImgDic = new Dictionary<string, string>();
    Regex imagesRegex = new Regex(detailsOption.ImagesDataRegex.Pattern, RegexOptions.IgnoreCase);
    Match imagesMatch = imagesRegex.Match(detailsHtml);
    if (imagesMatch.Success)
    {
        string imagesData = imagesMatch.Groups[detailsOption.ImagesDataRegex.GroupName].Value;
        Regex singleImgRegex = new Regex(detailsOption.SingleImageRegex.Pattern, RegexOptions.IgnoreCase);
        Regex removeRegex = new Regex(detailsOption.ImageSrcRemoveRegex.Pattern, RegexOptions.IgnoreCase);
        int proIdx = 0;
        foreach (Match m in singleImgRegex.Matches(imagesData))
        {
            string src = m.Groups[detailsOption.SingleImageRegex.GroupName].Value.GetFullLink();
            // Strip the part matched by ImageSrcRemoveRegex (the thumbnail marker) to keep the original image address.
            src = removeRegex.Replace(src, "");
            proImgDic.Add($"{setup.ItemDO.ItemId}_v{++proIdx}", src);
        }
    }

    // Description: a missing URL or an empty response yields an empty description
    // (an empty page already produced an empty description before).
    string desc = string.Empty;
    string descUrl = setup.Api?.DescUrl;
    if (!string.IsNullOrWhiteSpace(descUrl))
    {
        var descHtml = WebClientHelper.GetContent(descUrl, detailsOption.Encoding);
        if (!string.IsNullOrEmpty(descHtml))
        {
            Regex descRegex = new Regex(detailsOption.DescRegex.Pattern, RegexOptions.IgnoreCase);
            Match descMatch = descRegex.Match(descHtml);
            if (descMatch.Success)
            {
                desc = descMatch.Groups[detailsOption.DescRegex.GroupName].Value;
            }
        }
    }

    // Description images: key is the placeholder "<ItemId>_d<index>", value is the collected address.
    Dictionary<string, string> descImgDic = new Dictionary<string, string>();
    HashSet<string> seenDescSources = new HashSet<string>();
    Regex descImgRegex = new Regex(detailsOption.DescImageRegex.Pattern, RegexOptions.IgnoreCase);
    int descIdx = 0;
    foreach (Match m in descImgRegex.Matches(desc))
    {
        string src = m.Groups[detailsOption.DescImageRegex.GroupName].Value;

        // string.Replace throws on an empty search string, and a repeated address was
        // already swapped for its placeholder the first time it was seen.
        if (string.IsNullOrEmpty(src) || !seenDescSources.Add(src))
        {
            continue;
        }

        string currentImgTag = $"{setup.ItemDO.ItemId}_d{++descIdx}";
        // Put a placeholder where the image address was; it is swapped for the new address after the download.
        desc = desc.Replace(src, currentImgTag);
        descImgDic.Add(currentImgTag, src.GetFullLink());
    }

    #endregion

    // Download gallery images and description images.
    proImgDic = WebClientHelper.DownloadFile(proImgDic, uploadOption.VisitAddress, uploadOption.SaveDirectory);
    descImgDic = WebClientHelper.DownloadFile(descImgDic, uploadOption.VisitAddress, uploadOption.SaveDirectory);

    // Swap every placeholder for its new address in ONE pass. Sequential string.Replace
    // calls let "<id>_d1" also rewrite the prefix of "<id>_d10".."<id>_d19", and could
    // re-match placeholder text inside an already inserted address.
    string tagPattern = Regex.Escape($"{setup.ItemDO.ItemId}_d") + @"\d+";
    desc = Regex.Replace(desc, tagPattern, tagMatch =>
    {
        string uploadedUrl;
        return descImgDic.TryGetValue(tagMatch.Value, out uploadedUrl)
            ? (uploadedUrl ?? string.Empty)
            : tagMatch.Value; // no entry returned for this placeholder: leave the text untouched
    });

    #region Map to product-library entities

    string title = setup.ItemDO.Title;
    if (detailsOption.ReplaceItems != null)
    {
        foreach (var rep in detailsOption.ReplaceItems)
        {
            // string.Replace throws ArgumentException for an empty search string.
            if (string.IsNullOrEmpty(rep.SourceText))
            {
                continue;
            }

            if (title != null)
            {
                title = title.Replace(rep.SourceText, rep.ReplaceTo);
            }

            desc = desc.Replace(rep.SourceText, rep.ReplaceTo);
        }
    }

    // One timestamp so the product and its SKU carry identical create/update times.
    DateTime now = DateTime.Now;

    Product product = new Product
    {
        BrandID = brandId,
        CategoryID = searchRst.CategoryId,
        CreateTime = now,
        Intro = desc,
        IsDelete = false,
        mainPic = proImgDic.Values.FirstOrDefault(),
        Path = searchRst.Link,
        Pics = string.Join(",", proImgDic.Values),
        ProductID = productId,
        Properties = string.Empty,
        Source = collectorType,
        Title = title,
        UpdateTime = now,
        Weight = weight,
        SourceProductID = sourceProductId
    };

    // Single default SKU mirroring the product.
    ProductSku sku = new ProductSku
    {
        CreateTime = product.CreateTime,
        IsDelete = false,
        Name = product.Title,
        ProductID = product.ProductID,
        SalePrice = salePrice,
        SkuID = Tools.NewId(),
        UpdateTime = product.UpdateTime,
        Weight = product.Weight
    };

    #endregion

    #region Save to database

    using (var db = new DataContext())
    {
        db.Product.Add(product);
        db.ProductSku.Add(sku);
        return db.SaveChanges() > 0;
    }

    #endregion
}