Beispiel #1
0
        /// <summary>
        /// Downloads the page at <paramref name="url"/> (decoded as UTF-8) and deserializes the JSON
        /// captured by the configured <c>ChildCategoriesRegex</c> into a <see cref="ChildCategoryResult"/>.
        /// </summary>
        /// <param name="url">Address of the page that embeds the child-category data.</param>
        /// <returns>
        /// The deserialized result, or <c>null</c> when the download yields no content, the pattern
        /// does not match, or the captured group is blank.
        /// </returns>
        private ChildCategoryResult GetChildCategoryResult(string url)
        {
            string childDataString = WebClientHelper.GetContent(url, Encoding.UTF8);

            // Regex.Match throws ArgumentNullException on null input, and there is nothing to
            // parse in a blank page anyway.
            if (string.IsNullOrWhiteSpace(childDataString))
            {
                return null;
            }

            Regex reg = new Regex(categoryOption.ChildCategoriesRegex.Pattern, RegexOptions.IgnoreCase);
            Match m = reg.Match(childDataString);

            // Regex.Match never returns null; a failed match is reported through Success.
            if (!m.Success)
            {
                return null;
            }

            string data = m.Groups[categoryOption.ChildCategoriesRegex.GroupName].Value;

            // The named group may be absent from the pattern or may have captured nothing.
            if (string.IsNullOrWhiteSpace(data))
            {
                return null;
            }

            return JsonConvert.DeserializeObject<ChildCategoryResult>(data);
        }
Beispiel #2
0
        /// <summary>
        /// Walks the product-list pages of every category in <c>needCollectCategories</c> and
        /// enqueues each product found on them into a freshly created <c>collectorQueue</c>.
        /// </summary>
        /// <remarks>
        /// For each category the crawl starts at the category link and follows the "next page" link
        /// captured by <c>searchOption.NextPageRegex</c>. It stops when
        /// <c>searchOption.MaxSearchPages</c> pages have been read (a negative value means no limit),
        /// when no next link is found, when a page comes back empty, or when the next link points
        /// to a page already visited for this category. Progress and status messages are reported
        /// through <c>Writer.writeInvoke</c>.
        /// </remarks>
        private void DataAnalyzing()
        {
            collectorQueue = new Queue<SearchProductResult>();

            // The patterns do not change during the run, so build both regexes once instead of
            // once per page.
            Regex nextPageRegex = new Regex(searchOption.NextPageRegex.Pattern, RegexOptions.IgnoreCase);
            Regex productRegex  = new Regex(searchOption.ItemRegex.Pattern, RegexOptions.IgnoreCase);

            // Evaluate the total once; it is only used for the progress report.
            int totalCategories = needCollectCategories.Count();
            int catIndex        = 0;

            foreach (var cat in needCollectCategories)
            {
                Writer.writeInvoke(new MessageState {
                    Text = $"分析“{cat.Name}”下的商品……"
                });

                // Number of pages already read for this category.
                int pageNum = 0;

                // Pages already read for this category; guards against pagination that links
                // back to an earlier page, which would otherwise never terminate when
                // MaxSearchPages is negative.
                var visitedUrls = new HashSet<string>(StringComparer.Ordinal);

                string url = cat.Link;

                // Plain loop instead of recursion: the stack depth no longer grows with the
                // number of pages.
                while ((searchOption.MaxSearchPages < 0 || pageNum < searchOption.MaxSearchPages)
                       && !string.IsNullOrWhiteSpace(url))
                {
                    if (!visitedUrls.Add(url))
                    {
                        break;
                    }

                    // Throttle: wait one second before every page request.
                    Thread.Sleep(1000);

                    // Fetch the product-list page.
                    string searchHtml = WebClientHelper.GetContent(url, searchOption.Encoding, searchOption.CookieString);

                    // Nothing to parse (and Regex would throw on null), so stop this category.
                    if (string.IsNullOrWhiteSpace(searchHtml))
                    {
                        break;
                    }

                    // Queue every product captured on the current page.
                    foreach (Match m in productRegex.Matches(searchHtml))
                    {
                        collectorQueue.Enqueue(new SearchProductResult
                        {
                            SiteCatId  = cat.SiteCatId,
                            CategoryId = cat.CategoryId,
                            Name       = Tools.IgnoreHtmlTag(m.Groups[searchOption.ItemRegex.TitleGroupName].Value),
                            Link       = m.Groups[searchOption.ItemRegex.LinkGroupName].Value
                        });
                    }

                    pageNum++;

                    // Extract the next-page link; the group value is empty when nothing matched,
                    // which ends the loop.
                    string nextLink = nextPageRegex.Match(searchHtml).Groups[searchOption.NextPageRegex.GroupName].Value;

                    // A link consisting only of a query string is relative to the current page's path.
                    if (nextLink.StartsWith("?", StringComparison.Ordinal))
                    {
                        nextLink = url.Split('?')[0] + nextLink;
                    }

                    url = nextLink;
                }

                Writer.writeInvoke(new ProgressState {
                    Max = totalCategories, Value = ++catIndex
                });

                Writer.writeInvoke(new MessageState {
                    Text = $"“{cat.Name}”下的商品分析完成!"
                });
            }
        }
Beispiel #3
0
        /// <summary>
        /// Scrapes a single product page, downloads its gallery and description images, and stores
        /// the product together with one default SKU.
        /// </summary>
        /// <param name="searchRst">
        /// Queue entry carrying the product link and target category; <c>null</c> is accepted and
        /// yields <c>false</c>.
        /// </param>
        /// <returns>
        /// <c>true</c> when <c>SaveChanges</c> reports at least one affected row; <c>false</c> when
        /// <paramref name="searchRst"/> is <c>null</c>, the detail page is empty, or the embedded
        /// setup data is missing or lacks its <c>ItemDO</c> or <c>Detail</c> section.
        /// </returns>
        /// <exception cref="FormatException">
        /// A numeric field of the setup data (brand id, item id, weight or price) is not a valid
        /// number. <see cref="ArgumentNullException"/> and <see cref="OverflowException"/> from the
        /// same parsing calls propagate as well.
        /// </exception>
        bool getProduct(SearchProductResult searchRst)
        {
            if (searchRst == null)
            {
                return false;
            }

            #region Scrape the product data
            var detailsHtml = WebClientHelper.GetContent(searchRst.Link, detailsOption.Encoding);

            if (string.IsNullOrWhiteSpace(detailsHtml))
            {
                return false;
            }

            // Setup data: a JSON payload embedded in the detail page.
            ProductSetupResult setup = null;
            Regex setupRegex         = new Regex(detailsOption.SetupRegex.Pattern, RegexOptions.IgnoreCase);
            Match setupMatch         = setupRegex.Match(detailsHtml);

            // Regex.Match never returns null; Success tells whether the pattern was found.
            if (setupMatch.Success)
            {
                string setupData = setupMatch.Groups[detailsOption.SetupRegex.GroupName].Value;
                if (!string.IsNullOrWhiteSpace(setupData))
                {
                    setup = JsonConvert.DeserializeObject<ProductSetupResult>(setupData);
                }
            }

            // ItemDO supplies ids/title/weight and Detail supplies the price; without either the
            // product cannot be built.
            if (setup == null || setup.ItemDO == null || setup.Detail == null)
            {
                return false;
            }

            // Parse the numeric fields up front so malformed data fails before any image is
            // downloaded. The values are scraped, machine-formatted text, so they are parsed with
            // the invariant culture rather than whatever culture the host happens to run under.
            var     invariant       = System.Globalization.CultureInfo.InvariantCulture;
            long    brandId         = long.Parse(setup.ItemDO.BrandId, invariant);
            long    sourceProductId = long.Parse(setup.ItemDO.ItemId, invariant);
            float   weight          = float.Parse(setup.ItemDO.Weight, invariant);
            decimal salePrice       = decimal.Parse(setup.Detail.DefaultItemPrice, invariant);

            // Id of the new product record.
            long productId = Tools.NewId();

            // Gallery images: key is "<source item id>_v<index>", value is the scraped image address.
            var   proImgDic   = new Dictionary<string, string>();
            Regex imagesRegex = new Regex(detailsOption.ImagesDataRegex.Pattern, RegexOptions.IgnoreCase);
            Match imagesMatch = imagesRegex.Match(detailsHtml);
            if (imagesMatch.Success)
            {
                string imagesData = imagesMatch.Groups[detailsOption.ImagesDataRegex.GroupName].Value;

                Regex singleImgRegex = new Regex(detailsOption.SingleImageRegex.Pattern, RegexOptions.IgnoreCase);
                Regex removeRegex    = new Regex(detailsOption.ImageSrcRemoveRegex.Pattern, RegexOptions.IgnoreCase);

                int proIdx = 0;
                foreach (Match m in singleImgRegex.Matches(imagesData))
                {
                    string src = m.Groups[detailsOption.SingleImageRegex.GroupName].Value.GetFullLink();

                    // Strip the thumbnail marker so the original-size image address remains.
                    src = removeRegex.Replace(src, "");

                    proImgDic.Add($"{setup.ItemDO.ItemId}_v{++proIdx}", src);
                }
            }

            // Product description: fetched from a separate URL named in the setup data. A missing
            // URL, an empty response or a non-matching pattern all leave the description empty.
            string desc    = string.Empty;
            string descUrl = setup.Api?.DescUrl;
            if (!string.IsNullOrWhiteSpace(descUrl))
            {
                var descHtml = WebClientHelper.GetContent(descUrl, detailsOption.Encoding);
                if (!string.IsNullOrEmpty(descHtml))
                {
                    Regex descRegex = new Regex(detailsOption.DescRegex.Pattern, RegexOptions.IgnoreCase);
                    Match descMatch = descRegex.Match(descHtml);
                    if (descMatch.Success)
                    {
                        desc = descMatch.Groups[detailsOption.DescRegex.GroupName].Value;
                    }
                }
            }

            // Description images: key is "<source item id>_d<index>", value is the scraped image address.
            var   descImgDic   = new Dictionary<string, string>();
            var   seenSources  = new HashSet<string>(StringComparer.Ordinal);
            Regex descImgRegex = new Regex(detailsOption.DescImageRegex.Pattern, RegexOptions.IgnoreCase);
            int   descIdx      = 0;

            // The match collection is bound to the string passed in here, so reassigning desc
            // inside the loop does not disturb the enumeration.
            foreach (Match m in descImgRegex.Matches(desc))
            {
                string src = m.Groups[detailsOption.DescImageRegex.GroupName].Value;

                // string.Replace throws on an empty search string, and an address that was already
                // handled has had all its occurrences replaced, so it needs no second placeholder
                // (or second download).
                if (src.Length == 0 || !seenSources.Add(src))
                {
                    continue;
                }

                // Placeholder for the current image.
                string currentImgTag = $"{setup.ItemDO.ItemId}_d{++descIdx}";

                // Put the placeholder where the address was; it is swapped for the new address
                // once the image has been downloaded.
                desc = desc.Replace(src, currentImgTag);

                descImgDic.Add(currentImgTag, src.GetFullLink());
            }
            #endregion

            // Download the gallery images.
            proImgDic = WebClientHelper.DownloadFile(proImgDic, uploadOption.VisitAddress, uploadOption.SaveDirectory);

            // Download the description images.
            descImgDic = WebClientHelper.DownloadFile(descImgDic, uploadOption.VisitAddress, uploadOption.SaveDirectory);

            // Swap every placeholder for its downloaded address in a single pass. Replacing key by
            // key would let "<id>_d1" also rewrite the beginning of "<id>_d10", and would rescan
            // text that had already been substituted.
            Regex placeholderRegex = new Regex(Regex.Escape(setup.ItemDO.ItemId) + "_d[0-9]+");
            desc = placeholderRegex.Replace(desc, m =>
            {
                string uploaded;
                return descImgDic.TryGetValue(m.Value, out uploaded)
                    ? (uploaded ?? string.Empty)
                    : m.Value; // no download result for this placeholder: leave it untouched
            });

            #region Map to product-library entities

            string title = setup.ItemDO.Title;

            // Apply the configured text substitutions to both title and description.
            if (detailsOption.ReplaceItems != null)
            {
                foreach (var rep in detailsOption.ReplaceItems)
                {
                    // string.Replace rejects a null or empty search string.
                    if (string.IsNullOrEmpty(rep.SourceText))
                    {
                        continue;
                    }

                    title = title?.Replace(rep.SourceText, rep.ReplaceTo);
                    desc  = desc.Replace(rep.SourceText, rep.ReplaceTo);
                }
            }

            // One timestamp so that CreateTime and UpdateTime are identical on insert.
            DateTime now = DateTime.Now;

            // Product
            Product product = new Product
            {
                BrandID         = brandId,
                CategoryID      = searchRst.CategoryId,
                CreateTime      = now,
                Intro           = desc,
                IsDelete        = false,
                mainPic         = proImgDic.Values.FirstOrDefault(),
                Path            = searchRst.Link,
                Pics            = string.Join(",", proImgDic.Values),
                ProductID       = productId,
                Properties      = string.Empty,
                Source          = collectorType,
                Title           = title,
                UpdateTime      = now,
                Weight          = weight,
                SourceProductID = sourceProductId
            };

            // SKU
            ProductSku sku = new ProductSku
            {
                CreateTime = product.CreateTime,
                IsDelete   = false,
                Name       = product.Title,
                ProductID  = product.ProductID,
                SalePrice  = salePrice,
                SkuID      = Tools.NewId(),
                UpdateTime = product.UpdateTime,
                Weight     = product.Weight
            };

            #endregion

            #region Persist to the database

            using (var db = new DataContext())
            {
                db.Product.Add(product);
                db.ProductSku.Add(sku);

                return db.SaveChanges() > 0;
            }

            #endregion
        }