예제 #1
0
        /// <summary>
        /// Crawls the Amazon China site directory and stores its first- and second-level
        /// product categories through <c>Insert</c>.
        /// </summary>
        /// <param name="url">
        /// Currently ignored: the value is overwritten with the fixed site-directory address.
        /// </param>
        public static void AmazonSpider(string url)
        {
            // NOTE(review): the argument is overwritten on purpose to keep the existing behaviour;
            // confirm with callers before honouring the passed-in value.
            url = "http://www.amazon.cn/gp/site-directory/ref=sa_menu_fullstore";

            WebBrowerManager.Instance.Setup(new cEXWB());
            var html = WebBrowerManager.Instance.Run(url);

            // The page source is HTML-decoded before it is parsed.
            html = HttpUtility.HtmlDecode(html);
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);

            var firstCategoryContainer = htmlDocument.DocumentNode.SelectNodes("//div[@class='popover-grouping']");

            if (firstCategoryContainer == null || !firstCategoryContainer.Any())
            {
                return;
            }

            foreach (HtmlNode htmlNode in firstCategoryContainer)
            {
                // ".//" keeps the search inside the current grouping. A leading "//" is evaluated
                // from the document root and would return the first heading of the page every time.
                var firstCategoryNode = htmlNode.SelectSingleNode(".//div[@class='popover-category-name']/h2");

                if (firstCategoryNode == null)
                {
                    continue;
                }

                // First-level category (parent "0").
                var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                Insert(firstKey.ToString(), firstCategoryNode.InnerText, string.Empty, "0", "8");

                // Second-level categories: direct child <div> elements without a class attribute.
                var secondCategoryNodes = htmlNode.SelectNodes("div");
                if (secondCategoryNodes == null || !secondCategoryNodes.Any())
                {
                    continue;
                }

                foreach (HtmlNode node in secondCategoryNodes)
                {
                    if (node.Attributes["class"] != null)
                    {
                        continue;
                    }

                    var secondCategoryNode = node.SelectSingleNode("a");
                    if (secondCategoryNode == null)
                    {
                        continue;
                    }

                    var hrefAttribute = secondCategoryNode.Attributes["href"];
                    if (hrefAttribute == null || string.IsNullOrEmpty(hrefAttribute.Value))
                    {
                        continue;
                    }

                    var categoryUrl  = "http://www.amazon.cn" + HttpUtility.UrlDecode(hrefAttribute.Value);
                    var categoryName = secondCategoryNode.InnerText;
                    var secondKey    = KeyGenerator.Instance.GetNextValue("ProductCategory2");

                    // Link the child to its first-level category, as the other spiders do;
                    // previously every second-level row was stored with parent "0".
                    Insert(secondKey.ToString(), categoryName, categoryUrl, firstKey.ToString(), "8");
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Walks every result page of the Amazon China listing for the category referenced by
        /// <paramref name="url"/>. The product nodes are located but nothing is stored yet.
        /// </summary>
        /// <param name="url">Category URL whose <c>node</c> query-string value is the category id.</param>
        public static void AmazonProductList(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return;
            }

            // The category id is taken from the "node" query-string value.
            Uri categoryUri           = new Uri(url);
            NameValueCollection query = UrlHelper.GetQueryString(categoryUri.Query);
            string categoryId         = query["node"];

            if (string.IsNullOrEmpty(categoryId))
            {
                return;
            }

            string pageUrlFormat = "http://www.amazon.cn/s/ref=?rh=n:{0}&page={1}";

            var firstRequest = new HttpClient(string.Format(pageUrlFormat, categoryId, 1));
            firstRequest.Timeout = 30000;
            var document = HtmlAgilityPackHelper.GetHtmlDocument(HttpUtility.HtmlDecode(firstRequest.Request()));

            // The text of the disabled pager element is read as the number of pages.
            var pageCount     = 0;
            var pageCountNode = document.DocumentNode.SelectSingleNode("//span[@class='pagnDisabled']");
            if (pageCountNode == null || !int.TryParse(pageCountNode.InnerText, out pageCount))
            {
                return;
            }

            for (int page = 1; page <= pageCount; page++)
            {
                // Page 1 is already loaded; every later page is fetched on demand.
                if (page > 1)
                {
                    var request = new HttpClient(string.Format(pageUrlFormat, categoryId, page));
                    request.Timeout = 30000;
                    document = HtmlAgilityPackHelper.GetHtmlDocument(HttpUtility.HtmlDecode(request.Request()));
                }

                var products = document.DocumentNode.SelectNodes(
                    "//div[@class='result product'] | //div[@class='result lastRow product']");
                if (products == null)
                {
                    // A page without products ends the whole crawl.
                    return;
                }

                foreach (HtmlNode product in products)
                {
                    // Both nodes are looked up but not consumed yet.
                    var imageLink = product.SelectSingleNode("div[@class='image']/a");
                    var titleLink = product.SelectSingleNode("div[@class='data']/h3[@class='title']/a");
                }
            }
        }
예제 #3
0
        /// <summary>
        /// Loads the category page in the embedded browser and parses it.
        /// No products are extracted yet.
        /// </summary>
        /// <param name="spiderCategory">Category whose <c>CategoryUrl</c> is loaded.</param>
        /// <returns>Always a new, empty list.</returns>
        public List <ProductInfo> SpiderProductList(SpiderCategoryInfo spiderCategory)
        {
            WebBrowerManager.Instance.Setup(new cEXWB());
            WebBrowerManager.Instance.TimeOut = 15;

            var pageSource   = WebBrowerManager.Instance.Run(spiderCategory.CategoryUrl);
            var pageDocument = HtmlAgilityPackHelper.GetHtmlDocument(pageSource);

            // The element is looked up but the result is discarded.
            pageDocument.GetElementbyId("search_result");

            var products = new List <ProductInfo>();
            return products;
        }
예제 #4
0
        /// <summary>
        /// Builds the list of paged index URLs of the SGE daily-quotes section by reading the
        /// highest page number from the "last page" link of the first index page.
        /// </summary>
        /// <returns>
        /// <c>index.htm</c> followed by <c>index1.htm</c> … <c>index{N}.htm</c>; an empty list when
        /// the first page cannot be loaded or no positive page number can be determined.
        /// </returns>
        public static List <string> GetUrls()
        {
            const string marker = "index";

            var indexUrls = new List <string>();
            var indexUrl  = "http://www.sge.sh/publish/sge/xqzx/jyxq/index.htm";

            var firstPage = WebBrowerManager.Instance.Brower(indexUrl);

            if (string.IsNullOrEmpty(firstPage.HtmlSource))
            {
                return indexUrls;
            }

            var firstDocument = HtmlAgilityPackHelper.GetHtmlDocument(firstPage.HtmlSource);
            var lastPageNode  = firstDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[1]/div[3]/div[1]/div[4]/a[4]");

            var maxPageNumber = 0;

            // SelectSingleNode yields null when the pager link is missing (layout change, error page).
            if (lastPageNode != null && lastPageNode.HasAttributes)
            {
                var attr = lastPageNode.Attributes["href"];
                if (attr != null && !string.IsNullOrEmpty(attr.Value))
                {
                    var href  = attr.Value;
                    var index = href.IndexOf(marker);
                    if (index >= 0)
                    {
                        // Look for the dot AFTER "index" so that dots earlier in the href
                        // ("./", "../", a host name) cannot produce a negative length.
                        var numberStart = index + marker.Length;
                        var pointIndex  = href.IndexOf('.', numberStart);
                        if (pointIndex > numberStart)
                        {
                            int.TryParse(href.Substring(numberStart, pointIndex - numberStart), out maxPageNumber);
                        }
                    }
                }
            }

            if (maxPageNumber > 0)
            {
                for (int i = 0; i <= maxPageNumber; i++)
                {
                    // Page 0 is the un-numbered index page; the others carry their number.
                    var url = i == 0
                        ? "http://www.sge.sh/publish/sge/xqzx/jyxq/index.htm"
                        : string.Format("http://www.sge.sh/publish/sge/xqzx/jyxq/index{0}.htm", i);
                    indexUrls.Add(url);
                }
            }

            return indexUrls;
        }
예제 #5
0
        /// <summary>
        /// Crawls the DangDang category page and stores first-level categories (anchors with
        /// class <c>title</c>) and the remaining anchors of the same list item as their children.
        /// </summary>
        /// <param name="url">Address of the category overview page (e.g. http://category.dangdang.com/).</param>
        public static void DangDangSpider(string url)
        {
            WebBrowerManager.Instance.Setup(new cEXWB());
            var html = WebBrowerManager.Instance.Run(url);

            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);

            var container =
                htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[2]/div[1]/div[2]/div[1]/div[4]/div");

            if (container == null || !container.Any())
            {
                return;
            }

            // NOTE(review): Trim(char[]) strips any of these characters from both ends, not just
            // the literal "&nbsp;" sequence; kept as-is to leave the stored names unchanged.
            var chars = new char[6] {
                '&', 'n', 'b', 's', 'p', ';'
            };

            foreach (HtmlNode htmlNode in container)
            {
                if (!htmlNode.HasAttributes || htmlNode.Attributes["Id"] == null)
                {
                    continue;
                }

                // SelectNodes returns null (not an empty collection) when nothing matches.
                var liNodes = htmlNode.SelectNodes("div[2]/ul[1]/li");
                if (liNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode liNode in liNodes)
                {
                    var aNode = liNode.SelectNodes("a");
                    if (aNode == null)
                    {
                        continue;
                    }

                    // Declared once per list item so that the key of the title anchor survives to
                    // the following anchors; it used to be reset to 0 for every anchor, which
                    // stored every sub-category with parent "0".
                    var firstKey = 0;

                    foreach (HtmlNode node in aNode)
                    {
                        var hrefAttribute  = node.Attributes["href"];
                        var href           = hrefAttribute != null ? hrefAttribute.Value : string.Empty;
                        var classAttribute = node.Attributes["class"];

                        if (classAttribute != null && classAttribute.Value == "title")
                        {
                            // First-level category.
                            firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                            Insert(firstKey.ToString(), node.InnerText.Trim(chars), href, "0", "5");
                        }
                        else
                        {
                            // Second-level category, child of the most recent title anchor.
                            var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                            Insert(secondKey.ToString(), node.InnerText, href, firstKey.ToString(), "5");
                        }
                    }
                }
            }
        }
예제 #6
0
        /// <summary>
        /// Crawls the Yihaodian "all categories" page and stores its three category levels.
        /// </summary>
        /// <param name="url">URL of the all-categories page (e.g. http://www.yihaodian.com/product/listAll.do).</param>
        public static void YiHaoDianSpider(string url)
        {
            HttpClient hc = new HttpClient(url);

            hc.Timeout = 30000;
            var allSortHtml            = hc.Request();
            var htmlDocument           = HtmlAgilityPackHelper.GetHtmlDocument(allSortHtml);
            var firstCategoryContainer = htmlDocument.DocumentNode.SelectNodes("//div[@class='alonesort']");

            // SelectNodes returns null when the page contains no matching block.
            if (firstCategoryContainer == null)
            {
                return;
            }

            foreach (HtmlNode firstCategoryNode in firstCategoryContainer)
            {
                // Resolve the heading link once instead of re-running the selector for every access.
                var headingLinks      = firstCategoryNode.CssSelect(".mt>h3>a");
                var firstCategoryLink = headingLinks != null ? headingLinks.FirstOrDefault() : null;
                if (firstCategoryLink == null)
                {
                    continue;
                }

                // First-level category.
                var firstHref = firstCategoryLink.Attributes["href"];
                var firstKey  = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                Insert(firstKey.ToString(), firstCategoryLink.InnerText, firstHref != null ? firstHref.Value : string.Empty, "0", "2");

                var secondCategoryContainer = firstCategoryNode.CssSelect(".mc>.fore");

                foreach (HtmlNode htmlNode in secondCategoryContainer)
                {
                    // Second-level category. The key is reserved even when the row is not
                    // inserted, because the third-level rows below reference it as parent.
                    var secondCategoryNode = htmlNode.CssSelect("dt>a").FirstOrDefault();
                    var secondKey          = KeyGenerator.Instance.GetNextValue("ProductCategory2");

                    if (secondCategoryNode != null && secondCategoryNode.Attributes["href"] != null)
                    {
                        Insert(secondKey.ToString(), secondCategoryNode.InnerText, secondCategoryNode.Attributes["href"].Value, firstKey.ToString(), "2");
                    }

                    // Third-level categories.
                    var threeCategoryNodes = htmlNode.CssSelect("dd>em>span>a");
                    foreach (HtmlNode threeCategoryNode in threeCategoryNodes)
                    {
                        var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                        if (threeCategoryNode.Attributes["href"] != null)
                        {
                            Insert(thirdKey.ToString(), threeCategoryNode.InnerText, threeCategoryNode.Attributes["href"].Value, secondKey.ToString(), "2");
                        }
                    }
                }
            }
        }
예제 #7
0
        /// <summary>
        /// Crawls the Suning category overview page and stores its three category levels.
        /// </summary>
        /// <param name="url">Address of the category overview page.</param>
        public static void SuNingSpider(string url)
        {
            WebBrowerManager.Instance.Setup(new cEXWB());
            var html = WebBrowerManager.Instance.Run(url);

            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);

            var container = htmlDocument.DocumentNode.CssSelect("div.sFloor.clearfix");

            foreach (HtmlNode htmlNode in container)
            {
                // A floor without a heading link cannot be stored; skip it instead of
                // dereferencing the null that FirstOrDefault returns.
                var firstNode = htmlNode.CssSelect("h3>a").FirstOrDefault();
                if (firstNode == null)
                {
                    continue;
                }

                // First-level category.
                var firstHref = firstNode.Attributes["href"];
                var firstKey  = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                Insert(firstKey.ToString(), firstNode.InnerText, firstHref != null ? firstHref.Value : string.Empty, "0", "4");

                // Collect the <dl> groups of both columns; each selector is evaluated only once.
                var temp      = new List <HtmlNode>();
                var leftLists = htmlNode.CssSelect(".listLeft>dl");
                if (leftLists != null)
                {
                    temp.AddRange(leftLists);
                }
                var rightLists = htmlNode.CssSelect(".listRight>dl");
                if (rightLists != null)
                {
                    temp.AddRange(rightLists);
                }

                foreach (HtmlNode node in temp)
                {
                    // Second-level category (stored without a URL). The key is reserved even
                    // when no heading exists, because the third-level rows reference it.
                    var secondNode = node.CssSelect("dt>a").FirstOrDefault();
                    var secondKey  = KeyGenerator.Instance.GetNextValue("ProductCategory2");

                    if (secondNode != null)
                    {
                        Insert(secondKey.ToString(), secondNode.InnerText, "", firstKey.ToString(), "4");
                    }

                    var thridNodes = node.CssSelect("dd>span>a");
                    if (thridNodes == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode thridNode in thridNodes)
                    {
                        // Third-level category; anchors without a link are skipped.
                        var thirdHref = thridNode.Attributes["href"];
                        if (thirdHref == null)
                        {
                            continue;
                        }

                        var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                        Insert(thirdKey.ToString(), thridNode.InnerText, thirdHref.Value, secondKey.ToString(), "4");
                    }
                }
            }
        }
예제 #8
0
        /// <summary>
        /// Extracts the price of a product from its already-downloaded detail page by running
        /// image recognition on the price image.
        /// </summary>
        /// <param name="spiderProduct">Product whose <c>HtmlSource</c> is parsed.</param>
        /// <returns>
        /// A <c>ProductInfo</c> carrying the source, product id and URL of
        /// <paramref name="spiderProduct"/>; <c>Price</c> is 0 when no price image is found or
        /// the recognised text is not a number.
        /// </returns>
        public ProductInfo SpiderProductDetail(SpiderProductInfo spiderProduct)
        {
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(spiderProduct.HtmlSource);

            // The price is rendered as an image inside the "p-price" block.
            var price = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='p-price']/img");

            decimal realPrice = 0;

            // SelectSingleNode returns null when the block is missing, so the node itself must be
            // checked before its attributes are read.
            if (price != null)
            {
                var source = price.Attributes["src"];
                if (source != null && !string.IsNullOrEmpty(source.Value))
                {
                    decimal.TryParse(ImageProcess.Recognize(source.Value), out realPrice);
                }
            }

            // The title lookup and the textual-price parsing that used to follow produced values
            // that were never used and could throw on pages without the expected marker text,
            // so they were removed.

            return new ProductInfo()
            {
                Source = spiderProduct.HtmlSource, ProductId = spiderProduct.ProductId, Url = spiderProduct.Url, Price = realPrice
            };
        }
예제 #9
0
        /// <summary>
        /// Collects the detail-page URLs linked from each of the given index pages.
        /// </summary>
        /// <param name="urls">Index page URLs to visit.</param>
        /// <returns>Absolute detail-page URLs in the order they were found.</returns>
        public static List <string> Spider(List <string> urls)
        {
            var pageUrls = new List <string>();

            foreach (string indexUrl in urls)
            {
                var source    = WebBrowerManager.Instance.Brower(indexUrl).HtmlSource;
                var indexPage = HtmlAgilityPackHelper.GetHtmlDocument(source);
                var items     = indexPage.DocumentNode.SelectNodes("/html[1]/body[1]/div[1]/div[3]/div[1]/div[3]/ul[1]/li");

                if (items == null || !items.Any())
                {
                    continue;
                }

                foreach (HtmlNode item in items)
                {
                    var link = item.SelectSingleNode("a");
                    if (link == null || !link.HasAttributes)
                    {
                        continue;
                    }

                    // Only anchors with a non-empty href contribute a URL.
                    if (link.Attributes["href"] == null || string.IsNullOrEmpty(link.Attributes["href"].Value))
                    {
                        continue;
                    }

                    pageUrls.Add(string.Format("http://www.sge.sh/publish/sge/xqzx/jyxq/{0}", link.Attributes["href"].Value));
                }
            }

            return pageUrls;
        }
예제 #10
0
        /// <summary>
        /// Handles the browser's document-complete event: records the page source, parses it,
        /// rebuilds the DOM tree view and logs how long the rebuild took.
        /// </summary>
        /// <param name="sender">The browser control that raised the event.</param>
        /// <param name="e">Event data carrying the URL and whether the top-level document finished.</param>
        private void WebBrower_DocumentComplete(object sender, DocumentCompleteEventArgs e)
        {
            // Ordinal, case-insensitive comparison; unlike e.url.ToLower() it tolerates a null URL.
            if (string.Equals(e.url, "about:blank", System.StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            cEXWB pWB = sender as cEXWB;

            if (e.istoplevel)
            {
                IsDocumentFinish = true;

                // The "as" cast yields null for any other sender; without a browser there is
                // no document to read.
                if (pWB == null)
                {
                    return;
                }

                // Read the source once and reuse it for both the raw text and the parsed document.
                var source = pWB.DocumentSource;
                Html         = source;
                htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(source);

                Stopwatch sw1 = Stopwatch.StartNew();
                BuildTree3();
                sw1.Stop();
                richTextBox.Text += string.Format("DOM树构建时间{0}毫秒", sw1.ElapsedMilliseconds) + "\n";
            }
            else if (pWB != null && pWB.MainDocumentFullyLoaded)
            {
                // A frame navigation within a frameset whose main document is already loaded.
                IsDocumentFinish = true;
            }
        }
예제 #11
0
        /// <summary>
        /// Reads the Au9995 / Au9999 price rows from each quotes page and writes the URLs that
        /// could not be processed to <c>errorUrls.json</c> in the current directory.
        /// </summary>
        /// <param name="urls">Quotes page URLs to visit.</param>
        /// <returns>One <c>Gold</c> entry per successfully parsed price row.</returns>
        public static List <Gold> SpiderPrice(List <string> urls)
        {
            var errorUrls = new List <string>();
            var data      = new List <Gold>();
            var chars     = new char[6] {
                '&', 'n', 'b', 's', 'p', ';'
            };

            foreach (string url in urls)
            {
                var html = WebBrowerManager.Instance.Brower(url).HtmlSource;
                if (string.IsNullOrEmpty(html))
                {
                    errorUrls.Add(url);
                    continue;
                }

                var document  = HtmlAgilityPackHelper.GetHtmlDocument(html);
                var tableNode = document.DocumentNode.SelectSingleNode("//div[@class='newscontent']//table");
                if (tableNode == null)
                {
                    continue;
                }

                // SelectNodes returns null when the table has no matching rows.
                var trNodes = tableNode.SelectNodes("tbody[1]/tr");
                if (trNodes == null)
                {
                    continue;
                }

                // The date paragraph is the same for every row of the page, so look it up once.
                var dateNode = document.DocumentNode.SelectSingleNode(
                    "/html[1]/body[1]/div[1]/div[3]/div[1]/div[3]/div[1]/p[1]");

                // Set when at least one matching row cannot be parsed, so that the URL is
                // recorded once rather than once per failing row.
                var hasBadRow = false;

                foreach (var trNode in trNodes)
                {
                    var tdNodes = trNode.SelectNodes("td");
                    if (tdNodes == null || !tdNodes.Any())
                    {
                        continue;
                    }

                    var contractName = tdNodes[0].InnerText;
                    if (!contractName.Contains("Au9995") && !contractName.Contains("Au9999"))
                    {
                        continue;
                    }

                    // Explicit checks replace the former catch-all around Parse and indexing.
                    decimal opening = 0, highest = 0, lowest = 0, closing = 0;
                    var parsed = tdNodes.Count >= 5 &&
                                 dateNode != null &&
                                 decimal.TryParse(tdNodes[1].InnerText, out opening) &&
                                 decimal.TryParse(tdNodes[2].InnerText, out highest) &&
                                 decimal.TryParse(tdNodes[3].InnerText, out lowest) &&
                                 decimal.TryParse(tdNodes[4].InnerText, out closing);
                    if (!parsed)
                    {
                        hasBadRow = true;
                        continue;
                    }

                    Gold gold = new Gold();
                    gold.Name         = contractName.TrimEnd(chars);
                    gold.OpeningPrice = opening;
                    gold.HighestPrice = highest;
                    gold.LowestPrice  = lowest;
                    gold.ClosingPrice = closing;
                    gold.DateString   = dateNode.InnerText;
                    data.Add(gold);
                }

                if (hasBadRow)
                {
                    errorUrls.Add(url);
                }
            }

            File.WriteAllText(Path.Combine(Environment.CurrentDirectory, "errorUrls.json"), JsonHelper.ToJson(errorUrls));

            return data;
        }
예제 #12
0
        /// <summary>
        /// Extracts the products of every page of a Suning listing and stores those whose URL
        /// is not known yet.
        /// </summary>
        /// <param name="obj">
        /// The listing URL as a string; its <c>ci</c> query-string value is the category id.
        /// </param>
        /// <returns>
        /// Boxed <c>true</c> after all pages were processed; boxed <c>false</c> when the URL is
        /// missing, the category id is absent or not numeric, or the number of pages cannot be
        /// read from the pager.
        /// </returns>
        public static object SuNingProductList(object obj)
        {
            var url = obj as string;

            if (string.IsNullOrEmpty(url))
            {
                return false;
            }

            // The category id is taken from the "ci" query-string value.
            Uri uri = new Uri(url);
            NameValueCollection nameValue = UrlHelper.GetQueryString(uri.Query);
            string cid = nameValue["ci"];

            // Parse the id once up front instead of calling int.Parse for every product.
            int categoryId = 0;
            if (string.IsNullOrEmpty(cid) || !int.TryParse(cid, out categoryId))
            {
                return false;
            }

            string urlTemplate = "http://search.suning.com/emall/strd.do?ci={0}&cityId=9017&cp={1}&il=0&si=5&st=14&iy=-1";

            // Page index 0 is the first page.
            var hcFirst = new HttpClient(string.Format(urlTemplate, cid, 0));
            hcFirst.Timeout = 30000;
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(hcFirst.Request());

            // The second-to-last pager anchor is read as the last page number, so at least
            // two anchors are required.
            var pageContainer = htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[1]/div[7]/div[2]/div[8]/a");
            if (pageContainer == null || pageContainer.Count < 2)
            {
                return false;
            }

            int lastPageNumber = 0;
            if (!int.TryParse(pageContainer[pageContainer.Count - 2].InnerText, out lastPageNumber))
            {
                return false;
            }

            for (int i = 0; i < lastPageNumber; i++)
            {
                // The first page is already loaded; later pages are fetched on demand.
                if (i != 0)
                {
                    HttpClient hc = new HttpClient(string.Format(urlTemplate, cid, i));
                    hc.Timeout   = 30000;
                    htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(hc.Request());
                }

                var productLis = htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[1]/div[7]/div[2]/div[6]/ul[1]/li");
                if (productLis == null)
                {
                    // Nothing to extract on this page; keep going with the next one.
                    continue;
                }

                foreach (var htmlNode in productLis)
                {
                    var aNode = htmlNode.SelectSingleNode("a");
                    if (aNode == null)
                    {
                        continue;
                    }

                    // Product link; a product without one cannot be stored or de-duplicated.
                    var hrefAttribute = aNode.Attributes["href"];
                    if (hrefAttribute == null || string.IsNullOrEmpty(hrefAttribute.Value))
                    {
                        continue;
                    }
                    var href = hrefAttribute.Value;

                    // Product name.
                    var titleAttribute = aNode.Attributes["title"];
                    var name           = titleAttribute != null ? titleAttribute.Value : string.Empty;

                    // Picture URL, read from the "src2" attribute.
                    var imageNode = aNode.SelectSingleNode("img");
                    var picUrl    = string.Empty;
                    if (imageNode != null && imageNode.Attributes["src2"] != null && !string.IsNullOrEmpty(imageNode.Attributes["src2"].Value))
                    {
                        picUrl = imageNode.Attributes["src2"].Value;
                    }

                    // Number of comments; stays 0 when absent or not numeric.
                    var commentNode = htmlNode.SelectSingleNode("div[1]/div[1]/p[1]/a[1]/i[1]");
                    int commentNum  = 0;
                    if (commentNode != null)
                    {
                        int.TryParse(commentNode.InnerText, out commentNum);
                    }

                    if (!DataAccess.IsExistUrl(href))
                    {
                        DataAccess.InsertProduct(name, href, categoryId, commentNum, picUrl);
                    }
                }
            }

            return true;
        }
예제 #13
0
        /// <summary>
        /// Walks every page of a DangDang category listing and stores the products whose URL
        /// is not known yet.
        /// </summary>
        /// <param name="categoryUrl">
        /// Category URL whose <c>cat</c> query-string value is the numeric category id.
        /// </param>
        public static void DangDangProductList(string categoryUrl)
        {
            if (string.IsNullOrEmpty(categoryUrl))
            {
                return;
            }

            // The category id is taken from the "cat" query-string value.
            Uri uri = new Uri(categoryUrl);
            NameValueCollection nameValue = UrlHelper.GetQueryString(uri.Query);
            string cid = nameValue["cat"];

            // Parse the id once up front instead of calling int.Parse for every product.
            int categoryId = 0;
            if (string.IsNullOrEmpty(cid) || !int.TryParse(cid, out categoryId))
            {
                return;
            }

            var hcFirst = new HttpClient(categoryUrl);

            hcFirst.Timeout = 30000;
            var htmlFirst    = hcFirst.Request();
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(htmlFirst);

            // The pager text of the first page looks like "1&nbsp;/&nbsp;19";
            // everything after the prefix is the number of pages.
            const string pagePrefix = "1&nbsp;/&nbsp;";
            var maxPageNode = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[2]/div[3]/div[5]/div[1]/div[2]/span[1]");

            if (maxPageNode == null || string.IsNullOrEmpty(maxPageNode.InnerText) ||
                maxPageNode.InnerText.Length <= pagePrefix.Length)
            {
                return;
            }

            var maxPageNumber = 0;
            if (!int.TryParse(maxPageNode.InnerText.Substring(pagePrefix.Length), out maxPageNumber))
            {
                return;
            }

            // URL template of the individual pages.
            var tempUrl = "http://category.dangdang.com/all/?category_id={0}&page_index={1}";

            for (int j = 1; j <= maxPageNumber; j++)
            {
                // Page 1 is already loaded; later pages are fetched on demand.
                if (j != 1)
                {
                    var hc = new HttpClient(string.Format(tempUrl, cid, j));
                    hc.Timeout = 30000;
                    var html = hc.Request();
                    if (string.IsNullOrEmpty(html))
                    {
                        // Skip a page that failed to load instead of re-processing the previous one.
                        continue;
                    }
                    htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);
                }

                var productList =
                    htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[2]/div[3]/div[7]/div");
                if (productList == null || !productList.Any())
                {
                    return;
                }

                foreach (HtmlNode htmlNode in productList)
                {
                    var classAttribute = htmlNode.Attributes["class"];
                    if (classAttribute == null || !classAttribute.Value.Contains("listitem"))
                    {
                        continue;
                    }

                    // Product link.
                    var a = htmlNode.SelectSingleNode("p[1]/a[1]");
                    if (a == null || a.Attributes["href"] == null || string.IsNullOrEmpty(a.Attributes["href"].Value))
                    {
                        continue;
                    }
                    var productUrl = a.Attributes["href"].Value;

                    // Listing image; left empty when the anchor carries no image.
                    var image           = a.SelectSingleNode("img[1]");
                    var productImageUrl = string.Empty;
                    if (image != null && image.Attributes["src"] != null)
                    {
                        productImageUrl = image.Attributes["src"].Value;
                    }

                    // Product name; an item without a title link is skipped.
                    var titleNode = htmlNode.SelectSingleNode("p[3]/a[1]");
                    if (titleNode == null)
                    {
                        continue;
                    }

                    // Comment count: the characters between the first one and "条".
                    var commentCount = 0;
                    var starLevels   = htmlNode.CssSelect("p.starlevel");
                    var commentNode  = starLevels != null ? starLevels.FirstOrDefault() : null;
                    if (commentNode != null && !string.IsNullOrEmpty(commentNode.InnerText))
                    {
                        var commentText = commentNode.InnerText;
                        var index       = commentText.IndexOf("条");

                        // index must be positive: Substring(1, index - 1) would throw for 0.
                        if (index > 0)
                        {
                            int.TryParse(commentText.Substring(1, index - 1), out commentCount);
                        }
                    }

                    if (!DataAccess.IsExistUrl(productUrl))
                    {
                        DataAccess.InsertProduct(titleNode.InnerText, productUrl, categoryId, commentCount, productImageUrl);
                    }
                }
            }
        }
예제 #14
0
        /// <summary>
        /// Collects data from a Yixun (51buy) product-list page and from its follow-up pages.
        /// </summary>
        /// <remarks>
        /// The first page is downloaded from <paramref name="url"/>. The link of the first pager
        /// entry is split on '-' and its seventh segment is replaced by a page-number placeholder,
        /// which yields the URL template used for pages 2..N. N is read from the text of the
        /// second-to-last pager link; when it cannot be read, only the first page is processed.
        /// </remarks>
        /// <param name="url">Address of the first product-list page.</param>
        public static void WuYiBuyProductList(string url)
        {
            // Position of the page number inside the '-' separated list URL.
            const int pageSegmentIndex = 6;
            const string pagerXPath    = "/html[1]/body[1]/div[4]/div[2]/div[6]/div[1]";
            const string productXPath  = "/html[1]/body[1]/div[4]/div[2]/div[5]/ul[1]/li";

            var hcFirst = new HttpClient(url);
            hcFirst.Timeout = 30000;

            var htmlFirst = hcFirst.Request();
            if (string.IsNullOrEmpty(htmlFirst))
            {
                // Nothing was downloaded, so there is nothing to parse.
                return;
            }

            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(htmlFirst);

            // Build the per-page URL template from the first pager link.
            var urlTemplate    = string.Empty;
            var secondPageNode = htmlDocument.DocumentNode.SelectSingleNode(pagerXPath + "/a[1]");

            if (secondPageNode != null)
            {
                var pagerHref = secondPageNode.Attributes["href"];
                if (pagerHref != null && !string.IsNullOrEmpty(pagerHref.Value))
                {
                    var segments = pagerHref.Value.Split('-');

                    // Links with fewer segments carry no page number; leave the template empty
                    // so that only the first page is processed instead of failing on the index.
                    if (segments.Length > pageSegmentIndex)
                    {
                        segments[pageSegmentIndex] = "{0}";
                        urlTemplate = string.Join("-", segments);
                    }
                }
            }

            // Highest page number; stays at 1 unless a template exists and the pager text parses.
            var maxPageNumber = 1;
            var maxPageNode   = htmlDocument.DocumentNode.SelectSingleNode(pagerXPath + "/a[last()-1]");

            if (maxPageNode != null && !string.IsNullOrEmpty(urlTemplate))
            {
                // int.TryParse writes 0 into its out argument on failure, so parse into a
                // temporary to keep the default of one page when the text is not a number.
                int parsedPageCount;
                if (int.TryParse(maxPageNode.InnerText, out parsedPageCount) && parsedPageCount > 1)
                {
                    maxPageNumber = parsedPageCount;
                }
            }

            for (int i = 1; i <= maxPageNumber; i++)
            {
                // Page 1 is already loaded in htmlDocument; later pages are fetched on demand.
                if (i != 1)
                {
                    var hc = new HttpClient(string.Format(urlTemplate, i));
                    hc.Timeout = 50000;

                    // Request through the client created for this page, not the first-page client.
                    var html = hc.Request();
                    if (string.IsNullOrEmpty(html))
                    {
                        // This page could not be downloaded; move on to the next one.
                        continue;
                    }

                    htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);
                }

                // Product entries of the current page.
                var productListNodes = htmlDocument.DocumentNode.SelectNodes(productXPath);
                if (productListNodes == null)
                {
                    return;
                }

                foreach (HtmlNode productNode in productListNodes)
                {
                    // Product name (the link also carries the product URL).
                    var productNameNode = productNode.SelectSingleNode("./div[1]/h4[1]/a[1]");
                    if (productNameNode == null)
                    {
                        continue;
                    }

                    // Product URL; entries without a link are skipped.
                    var productLink = productNameNode.Attributes["href"];
                    if (productLink == null)
                    {
                        continue;
                    }

                    var productHref = productLink.Value;

                    // Thumbnail shown in the list.
                    var productImageNode = productNode.SelectSingleNode("./a[1]/img[1]");

                    // Number of comments.
                    var commentNode = productNode.SelectSingleNode("./div[1]/p[2]/a[1]");

                    // Price.
                    var productPriceNode = productNode.SelectSingleNode("./div[2]/p[2]/strong[1]");

                    // TODO: the original product id is not extracted yet, and none of the values
                    // located above are stored anywhere by this method.
                }
            }
        }
예제 #15
0
        /// <summary>
        /// Crawls the Yixun (51buy) category page and inserts its three category levels.
        /// </summary>
        /// <remarks>
        /// The page is rendered through <c>WebBrowerManager</c>. Inside the <c>#protal_list</c>
        /// element every <c>.item</c> is a first-level category, every <c>dl</c> below it a
        /// second-level group (named by its <c>dt</c>), and every link inside a <c>dd</c> a
        /// third-level category. Each entry is written with <c>Insert</c> using a key obtained
        /// from <c>KeyGenerator</c> and the platform value "3".
        /// </remarks>
        /// <param name="url">Address of the category overview page.</param>
        public static void WuYiBuySpider(string url)
        {
            //CsQuery
            //Fizzler
            //http://code.google.com/p/sharp-query/downloads/list

            //http://www.51buy.com/portal.html
            WebBrowerManager.Instance.Setup(new cEXWB());
            var html = WebBrowerManager.Instance.Run(url);

            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);

            // Resolve the container once instead of re-running the selector for every check.
            var containerMatches = htmlDocument.DocumentNode.CssSelect("#protal_list");
            var container        = containerMatches == null ? null : containerMatches.FirstOrDefault();

            if (container == null)
            {
                return;
            }

            foreach (HtmlNode itemNode in container.CssSelect(".item"))
            {
                // First-level category: the heading link of the item.
                var firstCategoryNode = itemNode.CssSelect("div.item_hd>h3>a").FirstOrDefault();
                if (firstCategoryNode == null)
                {
                    // An item without a heading link has no name to store; skip it rather
                    // than aborting the whole crawl with a NullReferenceException.
                    continue;
                }

                var firstHref = firstCategoryNode.Attributes["href"];
                var firstUrl  = firstHref == null ? string.Empty : firstHref.Value;

                var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                Insert(firstKey.ToString(), firstCategoryNode.InnerText, firstUrl, "0", "3");

                foreach (HtmlNode groupNode in itemNode.CssSelect("dl"))
                {
                    // A key is taken for every group, even when it has no <dt>; the links
                    // below are still inserted with this key as their parent.
                    var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");

                    // Second-level category: the group title; it is stored without a URL.
                    var secondCategoryNode = groupNode.CssSelect("dt").FirstOrDefault();
                    if (secondCategoryNode != null)
                    {
                        Insert(secondKey.ToString(), secondCategoryNode.InnerText, "", firstKey.ToString(), "3");
                    }

                    foreach (HtmlNode linkContainer in groupNode.CssSelect("dd"))
                    {
                        foreach (HtmlNode threeCategoryNode in linkContainer.CssSelect("a"))
                        {
                            // Third-level category: one entry per link; a link without an
                            // href is stored with an empty URL.
                            var thirdHref = threeCategoryNode.Attributes["href"];
                            var thirdUrl  = thirdHref == null ? string.Empty : thirdHref.Value;

                            var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                            Insert(thirdKey.ToString(), threeCategoryNode.InnerText, thirdUrl, secondKey.ToString(), "3");
                        }
                    }
                }
            }
        }
예제 #16
0
        /// <summary>
        /// Creates a new <c>Main</c> instance by calling <c>InitializeComponent</c> followed by
        /// <c>InitControls</c>.
        /// </summary>
        public Main()
        {
            InitializeComponent();

            // Start-up steps that are currently switched off.
            //Init();
            //AutoUpdaterHelper.AutoUpdater();

            InitControls();

            // Nothing follows on purpose: the statements that used to sit behind an
            // unconditional return here could never execute.
        }