예제 #1
0
        /// <summary>
        /// Crawls every category listing page by page and, for each product found on a page,
        /// downloads the product's detail page and passes it to <c>AnalyticalDetailHtml</c>.
        /// </summary>
        /// <remarks>
        /// Each entry returned by <c>GetURLList</c> is used as a format string whose first
        /// placeholder receives the page number (starting at 1). A category is considered
        /// finished as soon as a page's HTML does not contain the text "itemid".
        /// </remarks>
        public void GetWebSiteForYiYaoWang()
        {
            // User agent sent with every request (masquerades as the Baidu spider).
            const string userAgent = "Baiduspider+(+http://www.baidu.com/search/spider.htm)";

            // Category listing URL templates to crawl.
            IList<string> categoryURLList = GetURLList();

            // Helper used to download pages.
            HttpHelper helper = new HttpHelper();

            for (int i = 0; i < categoryURLList.Count; i++)
            {
                string categoryUrlTemplate = categoryURLList[i];

                // The page counter is scoped to the category, so it restarts at 1 for each one.
                for (int pageNumber = 1; ; pageNumber++)
                {
                    string   url  = string.Format(categoryUrlTemplate, pageNumber);
                    HttpItem item = new HttpItem();
                    item.URL        = url;
                    item.Timeout    = 10000;
                    item.ResultType = ResultType.String;
                    item.UserAgent  = userAgent;
                    HttpResult result = helper.GetHtml(item);

                    // No data on this page (or nothing downloaded at all): move on to the next category.
                    if (result == null || result.Html == null || result.Html.IndexOf("itemid") == -1)
                    {
                        break;
                    }

                    IList<SpiderProductInfo> productInfoList = AnalyticalHtml(result);

                    // Resolve the count once, null-safely, before it is used in the log line.
                    int productCount = productInfoList == null ? 0 : productInfoList.Count;

                    Log("---------------------------------------------------");
                    Log("===目标:" + url + " ===");
                    Log("===已完成抓取,总条数:" + productCount + "时间:" + System.DateTime.Now + "===");

                    // Fetch and process the detail page of every product listed on this page.
                    for (int j = 0; j < productCount; j++)
                    {
                        SpiderProductInfo spInfo     = productInfoList[j];
                        HttpItem          detailItem = new HttpItem();
                        detailItem.URL        = spInfo.Url;
                        detailItem.Timeout    = 10000;
                        detailItem.ResultType = ResultType.String;
                        detailItem.UserAgent  = userAgent;
                        HttpResult detailResult = helper.GetHtml(detailItem);
                        AnalyticalDetailHtml(detailResult, spInfo);
                    }

                    Log("===目标:" + url + "===");
                    Log("===已完成存储!===");
                }
            }
        }
        /// <summary>
        /// Collects the product detail by delegating to the spider registered in
        /// <c>spiderDictionary</c> under the product's <c>ECPlatformId</c>.
        /// </summary>
        /// <param name="spiderProduct">The product to collect; its <c>ECPlatformId</c> selects the spider.</param>
        /// <returns>
        /// The value returned by the registered spider's <c>SpiderProductDetail</c>,
        /// or <c>null</c> when no spider is registered for that platform id.
        /// </returns>
        public static ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
        {
            // Guard clause: nothing registered for this platform.
            if (!spiderDictionary.ContainsKey(spiderProduct.ECPlatformId))
            {
                return null;
            }

            var platformSpider = spiderDictionary[spiderProduct.ECPlatformId];
            return platformSpider.SpiderProductDetail(spiderProduct);
        }
예제 #3
0
        /// <summary>
        /// Collects the product detail through the spider that <c>spiderDictionary</c> holds
        /// for the product's <c>ECPlatformId</c>.
        /// </summary>
        /// <param name="spiderProduct">The product to collect; its <c>ECPlatformId</c> is the lookup key.</param>
        /// <returns>
        /// The registered spider's <c>SpiderProductDetail</c> result, or <c>null</c> when the
        /// dictionary has no entry for the platform id.
        /// </returns>
        public static ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
        {
            bool hasSpider = spiderDictionary.ContainsKey(spiderProduct.ECPlatformId);

            return hasSpider
                ? spiderDictionary[spiderProduct.ECPlatformId].SpiderProductDetail(spiderProduct)
                : null;
        }
예제 #4
0
        /// <summary>
        /// Builds a <c>ProductInfo</c> from the product page HTML carried in
        /// <paramref name="spiderProduct"/>.
        /// </summary>
        /// <remarks>
        /// The price is obtained by passing the <c>src</c> of the image under
        /// <c>div.p-price</c> to <c>ImageProcess.Recognize</c> and parsing the result as a
        /// decimal. It stays 0 when the image node or its <c>src</c> is missing, or when the
        /// recognized text is not a decimal.
        /// </remarks>
        /// <param name="spiderProduct">Supplies <c>HtmlSource</c>, <c>ProductId</c> and <c>Url</c>.</param>
        /// <returns>A new <c>ProductInfo</c> with <c>Source</c>, <c>ProductId</c>, <c>Url</c> and <c>Price</c> set.</returns>
        public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
        {
            // Marker that precedes the textual price inside the price script block.
            const string textPriceMarker = "京东价:¥";

            //var html=WebBrowerManager.Instance.Run(spiderProduct.Url);
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(spiderProduct.HtmlSource);

            // Title
            var title = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[6]/div[1]/div[1]/h1[1]");
            var price = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='p-price']/img");

            // Textual price
            var priceText = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[6]/div[1]/div[2]/ul[1]/li[2]/script[1]");

            // Product image
            //var defaultImage = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[5]/div[1]/div[2]/div[1]");
            decimal realPrice = 0;

            // The price image may be absent from the page, so check the node before touching its attributes.
            if (price != null)
            {
                var priceImageSrc = price.Attributes["src"];
                if (priceImageSrc != null && !string.IsNullOrEmpty(priceImageSrc.Value))
                {
                    decimal.TryParse(ImageProcess.Recognize(priceImageSrc.Value), out realPrice);
                }
            }

            // Promotion information is loaded via ajax
            if (title != null && price != null && priceText != null)
            {
                var scriptText = priceText.InnerText;
                var beginIndex = scriptText.IndexOf(textPriceMarker);

                // Only look for the terminator when the marker exists; a negative start index would throw.
                var endIndex   = beginIndex < 0 ? -1 : scriptText.IndexOf("。", beginIndex);
                var priceStart = beginIndex + textPriceMarker.Length;

                if (beginIndex >= 0 && endIndex >= priceStart)
                {
                    var     readPrice        = scriptText.Substring(priceStart, endIndex - priceStart);
                    decimal decimalRealPrice = 0;

                    // NOTE(review): the textual price is parsed but not used yet; the update hook below is disabled.
                    if (decimal.TryParse(readPrice, out decimalRealPrice))
                    {
                        //UpdateProduct(productId, title.InnerText, decimal.Parse(readPrice));
                    }
                }
            }

            return(new ProductInfo()
            {
                Source = spiderProduct.HtmlSource, ProductId = spiderProduct.ProductId, Url = spiderProduct.Url, Price = realPrice
            });
        }
예제 #5
0
        /// <summary>
        /// Splits a category listing page into a list of products.
        /// </summary>
        /// <remarks>
        /// Every <c>div</c> with class <c>itemSearchResultCon</c> is inspected: its <c>itemid</c>
        /// attribute becomes <c>SpiderPID</c>, the <c>href</c> of a direct <c>a</c> child becomes
        /// <c>Url</c>, and the <c>alt</c> of an <c>img</c> inside that anchor becomes
        /// <c>ProductName</c>. When a div has several usable anchors, the last one wins.
        /// Anchors are skipped when the div has no <c>itemid</c> or the anchor has no <c>href</c>.
        /// </remarks>
        /// <param name="result">Download result whose <c>Html</c> holds the listing page markup.</param>
        /// <returns>The products found; an empty list (never <c>null</c>) when the page has none.</returns>
        private IList <SpiderProductInfo> AnalyticalHtml(HttpResult result)
        {
            IList <SpiderProductInfo> productInfoList = new List <SpiderProductInfo>();

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(result.Html);
            var collection = doc.DocumentNode.SelectNodes("//div[@class='itemSearchResultCon']");

            // SelectNodes yields null (not an empty collection) when nothing matches.
            if (collection == null)
            {
                return(productInfoList);
            }

            foreach (HtmlAgilityPack.HtmlNode item in collection) //div
            {
                SpiderProductInfo spInfo          = null;
                var               itemIdAttribute = item.Attributes["itemid"];

                // Child nodes: href from the <a> tag, alt from the <img> inside it.
                foreach (var citem in item.ChildNodes)
                {
                    // Only <a> tags are of interest.
                    if (citem.Name.ToLower() != "a")
                    {
                        continue;
                    }

                    // Without a product id or a target URL the anchor cannot describe a product.
                    var hrefAttribute = citem.Attributes["href"];
                    if (itemIdAttribute == null || hrefAttribute == null)
                    {
                        continue;
                    }

                    spInfo           = new SpiderProductInfo();
                    spInfo.SpiderPID = Convert.ToInt32(itemIdAttribute.Value);
                    spInfo.Url       = hrefAttribute.Value;

                    foreach (var ccItem in citem.ChildNodes)
                    {
                        // <img> tag carrying the product name in its alt text.
                        if (ccItem.Name == "img")
                        {
                            var altAttribute = ccItem.Attributes["alt"];
                            if (altAttribute != null)
                            {
                                spInfo.ProductName = altAttribute.Value;
                            }
                        }
                    }
                }

                if (spInfo != null)
                {
                    productInfoList.Add(spInfo);
                }
            }
            return(productInfoList);
        }
        /// <summary>
        /// Builds a <c>ProductInfo</c> from the product page HTML carried in
        /// <paramref name="spiderProduct"/>.
        /// </summary>
        /// <remarks>
        /// The price is obtained by passing the <c>src</c> of the image under
        /// <c>div.p-price</c> to <c>ImageProcess.Recognize</c> and parsing the result as a
        /// decimal. It stays 0 when the image node or its <c>src</c> is missing, or when the
        /// recognized text is not a decimal.
        /// </remarks>
        /// <param name="spiderProduct">Supplies <c>HtmlSource</c>, <c>ProductId</c> and <c>Url</c>.</param>
        /// <returns>A new <c>ProductInfo</c> with <c>Source</c>, <c>ProductId</c>, <c>Url</c> and <c>Price</c> set.</returns>
        public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
        {
            // Marker that precedes the textual price inside the price script block.
            const string textPriceMarker = "京东价:¥";

            //var html=WebBrowerManager.Instance.Run(spiderProduct.Url);
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(spiderProduct.HtmlSource);

            // Title
            var title = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[6]/div[1]/div[1]/h1[1]");
            var price = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='p-price']/img");

            // Textual price
            var priceText = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[6]/div[1]/div[2]/ul[1]/li[2]/script[1]");

            // Product image
            //var defaultImage = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[5]/div[1]/div[2]/div[1]");
            decimal realPrice = 0;

            // The price image may be absent from the page, so check the node before touching its attributes.
            if (price != null)
            {
                var priceImageSrc = price.Attributes["src"];
                if (priceImageSrc != null && !string.IsNullOrEmpty(priceImageSrc.Value))
                {
                    decimal.TryParse(ImageProcess.Recognize(priceImageSrc.Value), out realPrice);
                }
            }

            // Promotion information is loaded via ajax
            if (title != null && price != null && priceText != null)
            {
                var scriptText = priceText.InnerText;
                var beginIndex = scriptText.IndexOf(textPriceMarker);

                // Only look for the terminator when the marker exists; a negative start index would throw.
                var endIndex   = beginIndex < 0 ? -1 : scriptText.IndexOf("。", beginIndex);
                var priceStart = beginIndex + textPriceMarker.Length;

                if (beginIndex >= 0 && endIndex >= priceStart)
                {
                    var readPrice = scriptText.Substring(priceStart, endIndex - priceStart);
                    decimal decimalRealPrice = 0;

                    // NOTE(review): the textual price is parsed but not used yet; the update hook below is disabled.
                    if (decimal.TryParse(readPrice, out decimalRealPrice))
                    {
                        //UpdateProduct(productId, title.InnerText, decimal.Parse(readPrice));
                    }
                }
            }

            return new ProductInfo() { Source = spiderProduct.HtmlSource, ProductId = spiderProduct.ProductId, Url = spiderProduct.Url, Price = realPrice };
        }
 /// <summary>
 /// Hands <paramref name="spiderProduct"/> to <c>SpiderUrlQueue.Enqueue</c>.
 /// </summary>
 /// <param name="spiderProduct">The product to enqueue.</param>
 public void Enqueue(SpiderProductInfo spiderProduct) => SpiderUrlQueue.Enqueue(spiderProduct);
예제 #8
0
 /// <summary>
 /// Forwards <paramref name="spiderProduct"/> to <c>SpiderUrlQueue.Enqueue</c>.
 /// </summary>
 /// <param name="spiderProduct">The product to add to the queue.</param>
 public void Enqueue(SpiderProductInfo spiderProduct) => SpiderUrlQueue.Enqueue(spiderProduct);
예제 #9
0
 /// <summary>
 /// Placeholder that is not implemented: every call throws.
 /// </summary>
 /// <param name="spiderProduct">Not read by this implementation.</param>
 /// <returns>Never returns normally.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
 {
     throw new NotImplementedException();
 }
예제 #10
0
 /// <summary>
 /// Returns a freshly constructed <c>ProductInfo</c> without reading the input.
 /// </summary>
 /// <param name="spiderProduct">Not read by this implementation.</param>
 /// <returns>A new default-constructed <c>ProductInfo</c>.</returns>
 public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
 {
     ProductInfo emptyProduct = new ProductInfo();
     return emptyProduct;
 }
예제 #11
0
        /// <summary>
        /// Parses a product detail page: reads the product name, extracts the key/value rows of
        /// every <c>table.specificationBox</c>, and stores them through
        /// <c>SpiderBLL.Instance.InsertSpider</c>.
        /// </summary>
        /// <remarks>
        /// Returns without storing anything when the page has neither a <c>middle_property</c>
        /// nor a <c>middle_property middle_propertyO2o</c> div, or when it has no specification
        /// table. Rows that lack a <c>th</c> or a <c>td</c> child are skipped.
        /// </remarks>
        /// <param name="detailResult">Download result whose <c>Html</c> holds the detail page markup.</param>
        /// <param name="spInfo">
        /// Product being processed; supplies <c>SpiderPID</c> and <c>Url</c>. Its
        /// <c>ProductName</c> is overwritten with the page heading when one is found.
        /// </param>
        private void AnalyticalDetailHtml(HttpResult detailResult, SpiderProductInfo spInfo)
        {
            IList <SpiderProductInfo> spInfoList = new List <SpiderProductInfo>();

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(detailResult.Html);

            // Locate the block that holds the product name; fall back to the O2O page layout.
            var nameCollection = doc.DocumentNode.SelectNodes("//div[@class='middle_property']");
            if (nameCollection == null)
            {
                nameCollection = doc.DocumentNode.SelectNodes("//div[@class='middle_property middle_propertyO2o']");
                if (nameCollection == null)
                {
                    return;
                }
            }

            // The h1 may be missing; in that case keep the name the caller already supplied.
            var heading = nameCollection[0].ChildNodes["h1"];
            if (heading != null)
            {
                spInfo.ProductName = heading.InnerHtml; // the h1 can embed other tags
                if (spInfo.ProductName.IndexOf("<") != -1)
                {
                    // Keep only the text that precedes the first embedded tag.
                    spInfo.ProductName = spInfo.ProductName.Split('<')[0];
                }
            }

            var collection = doc.DocumentNode.SelectNodes("//table[@class='specificationBox']");
            if (collection == null)
            {
                return;
            }

            foreach (HtmlAgilityPack.HtmlNode tableItem in collection) //table
            {
                // Markup without an explicit <tbody> has its rows directly under <table>.
                var                tbody   = tableItem.ChildNodes["tbody"];
                HtmlNodeCollection trNodes = (tbody ?? tableItem).ChildNodes; //tr nodes

                foreach (var trItem in trNodes)
                {
                    // Skip anything that is not a complete th/td row (text nodes between rows,
                    // single-cell rows, ...); this also covers the rows that have only one child.
                    var headerCell = trItem.ChildNodes["th"];
                    var valueCell  = trItem.ChildNodes["td"];
                    if (headerCell == null || valueCell == null)
                    {
                        continue;
                    }

                    SpiderProductInfo spInfoFlag = new SpiderProductInfo();
                    spInfoFlag.SpiderPID   = spInfo.SpiderPID;
                    spInfoFlag.ProductName = spInfo.ProductName;
                    spInfoFlag.Url         = spInfo.Url;
                    spInfoFlag.KeyName     = headerCell.InnerText.Trim();
                    spInfoFlag.KeyValue    = valueCell.InnerHtml.Trim();
                    spInfoList.Add(spInfoFlag);
                }
            }

            if (spInfoList.Count != 0)
            {
                var oResult = SpiderBLL.Instance.InsertSpider(spInfoList);
                if (oResult.ResultType != OperationResultType.Success)
                {
                    Log(string.Format("写入异常:数据ID {0},异常内容:{1}", spInfo.SpiderPID, oResult.Message));
                }
                else
                {
                    Console.WriteLine(string.Format("数据ID:{0},名称:{1}, 已完成添加", spInfo.SpiderPID, spInfo.ProductName));
                }
            }
        }
예제 #12
0
        /// <summary>
        /// Inserts every entry of <paramref name="spInfolist"/> into the <c>Spider</c> table
        /// inside a single transaction.
        /// </summary>
        /// <remarks>
        /// The transaction is committed only after all rows were inserted. On any failure it is
        /// rolled back while the connection is still open, and the failure is reported through
        /// the returned result instead of being thrown.
        /// </remarks>
        /// <param name="spInfolist">Rows to insert.</param>
        /// <returns>
        /// A result of type <c>OperationResultType.Success</c> carrying <c>true</c> when all rows
        /// were committed; otherwise a result of type <c>OperationResultType.Error</c> carrying
        /// the exception message and <c>false</c>.
        /// </returns>
        public OperationResult <bool> InsertSpider(IList <SpiderProductInfo> spInfolist)
        {
            try
            {
                using (DbConnection con = dbw.CreateConnection())
                {
                    con.Open();
                    using (DbTransaction transcation = con.BeginTransaction())
                    {
                        try
                        {
                            using (var command = con.CreateCommand())
                            {
                                command.Transaction = transcation;
                                var sql = string.Empty;
                                sql += " INSERT INTO Spider ";
                                sql += " (SpiderPID,ProductName,Url,KeyName,KeyValue ) ";
                                sql += " VALUES ";
                                sql += " (@SpiderPID,@ProductName,@Url,@KeyName,@KeyValue) ";
                                command.Connection  = con;
                                command.CommandText = sql;

                                for (int i = 0; i < spInfolist.Count; i++)
                                {
                                    SpiderProductInfo spInfo = spInfolist[i];

                                    // The command is reused for every row; only its parameters are rebuilt.
                                    command.Parameters.Clear();
                                    AddSpiderParameter(command, "SpiderPID", DbType.Int32, spInfo.SpiderPID);
                                    AddSpiderParameter(command, "ProductName", DbType.String, spInfo.ProductName);
                                    AddSpiderParameter(command, "Url", DbType.String, spInfo.Url);
                                    AddSpiderParameter(command, "KeyName", DbType.String, spInfo.KeyName);
                                    AddSpiderParameter(command, "KeyValue", DbType.String, spInfo.KeyValue);

                                    command.ExecuteNonQuery();
                                }
                            }
                            transcation.Commit();
                        }
                        catch
                        {
                            // Roll back here, while the connection is still open; once the using
                            // blocks have disposed it the transaction can no longer be rolled back.
                            try
                            {
                                transcation.Rollback();
                            }
                            catch (Exception)
                            {
                                // Deliberately ignored so the original failure is the one reported below.
                            }
                            throw;
                        }
                    }
                }
                return(new OperationResult <bool>(OperationResultType.Success, null, true));
            }
            catch (Exception e)
            {
                return(new OperationResult <bool>(OperationResultType.Error, e.Message, false));
            }
        }

        /// <summary>
        /// Creates a parameter on <paramref name="command"/> with the given name, type and value
        /// and adds it to the command's parameter collection.
        /// </summary>
        private static void AddSpiderParameter(DbCommand command, string name, DbType dbType, object value)
        {
            DbParameter parameter = command.CreateParameter();
            parameter.DbType        = dbType;
            parameter.Value         = value;
            parameter.ParameterName = name;
            command.Parameters.Add(parameter);
        }
 /// <summary>
 /// Placeholder that is not implemented: every call throws.
 /// </summary>
 /// <param name="spiderProduct">Not read by this implementation.</param>
 /// <returns>Never returns normally.</returns>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
 {
     throw new NotImplementedException();
 }
 /// <summary>
 /// Produces a new default-constructed <c>ProductInfo</c>; the input is not read.
 /// </summary>
 /// <param name="spiderProduct">Not read by this implementation.</param>
 /// <returns>A new <c>ProductInfo</c> instance.</returns>
 public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
 {
     var blankProduct = new ProductInfo();
     return blankProduct;
 }