Пример #1
0
        /// <summary>
        /// Walks the paged product listing of an Amazon China category.
        /// </summary>
        /// <param name="url">
        /// URL whose query string carries the category id in its <c>node</c> parameter.
        /// Nothing happens when the URL is null/empty or has no <c>node</c> value.
        /// </param>
        public static void AmazonProductList(string url)
        {
            // Reference: http://www.amazon.cn/gp/site-directory/ref=sa_menu_fullstore
            if (string.IsNullOrEmpty(url))
            {
                return;
            }

            // The category id is read from the "node" query parameter.
            NameValueCollection query = UrlHelper.GetQueryString(new Uri(url).Query);
            string categoryId = query["node"];
            if (string.IsNullOrEmpty(categoryId))
            {
                return;
            }

            const string pageUrlFormat = "http://www.amazon.cn/s/ref=?rh=n:{0}&page={1}";

            // Page 1 is always fetched: its pager is what yields the page count.
            var firstPageClient = new HttpClient(string.Format(pageUrlFormat, categoryId, 1));
            firstPageClient.Timeout = 30000;
            var pageHtml = HttpUtility.HtmlDecode(firstPageClient.Request());
            var pageDocument = HtmlAgilityPackHelper.GetHtmlDocument(pageHtml);

            var lastPageSpan = pageDocument.DocumentNode.SelectSingleNode("//span[@class='pagnDisabled']");
            if (lastPageSpan == null)
            {
                return;
            }

            int pageCount;
            if (!int.TryParse(lastPageSpan.InnerText, out pageCount))
            {
                return;
            }

            int page = 1;
            while (page <= pageCount)
            {
                // The document for page 1 is already loaded; later pages are downloaded here.
                if (page > 1)
                {
                    var pageClient = new HttpClient(string.Format(pageUrlFormat, categoryId, page));
                    pageClient.Timeout = 30000;
                    pageHtml = HttpUtility.HtmlDecode(pageClient.Request());
                    pageDocument = HtmlAgilityPackHelper.GetHtmlDocument(pageHtml);
                }

                var resultNodes = pageDocument.DocumentNode.SelectNodes(
                    "//div[@class='result product'] | //div[@class='result lastRow product']");

                // A page without any result node ends the whole walk.
                if (resultNodes == null)
                {
                    return;
                }

                foreach (HtmlNode resultNode in resultNodes)
                {
                    // Image and title anchors are located but not consumed yet.
                    var imageLink = resultNode.SelectSingleNode("div[@class='image']/a");
                    var titleLink = resultNode.SelectSingleNode("div[@class='data']/h3[@class='title']/a");
                }

                page++;
            }
        }
Пример #2
0
        /// <summary>
        /// Scratch entry point holding a series of manual experiments.
        /// Only the first region executes: it requests http://www.soxuan.com and returns.
        /// Every statement after that first unconditional <c>return</c> is unreachable;
        /// the later sections are separated by further <c>return</c> statements.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            // The only code that runs: a single request whose result is not used.
            #region
            HttpClient hc3 = new HttpClient("http://www.soxuan.com");
            var html2=hc3.Request();
            return;
            #endregion

            // ---- Unreachable from here on (see the return above). ----

            // File-read experiment.
            #region

            FileRead.Read();
            FileRead.Read("GetPromotionRulesByIdsWithoutRuleExpands");
            return;
            #endregion

            // MongoDB experiment: look up a document by a hard-coded ObjectId.
            #region mongodb
            //var objectid= MongoDBOfficialTest.Insert(new ShoppingCartEntity(){CartId = "123",Ha = "ssss",Promotion = new PromotionEntity(){Date1="满赠新促销",Date2 = new List<string>(){"测试"}}});
            var objectid = new ObjectId("50e78f8c9e3eca2d6c538b9d");
            MongoDBOfficialTest.GetById(objectid);
            //MongoDBOfficialTest.GetById(objectid);

            //MongoDBTest.Insert(new ShoppingCartEntity(){CartId = "123456"});
            //MongoDBTest.GetById("123456");
            //MongoDBTest.Update(new ShoppingCartEntity(){CartId = "123456"});
            return;
            #endregion

            //var indexUrls= GoldSpider.GetUrls();
            //var urls= GoldSpider.Spider(indexUrls);
            //string json = JsonHelper.ToJson(urls);
            //File.WriteAllText(Environment.CurrentDirectory+"\\urls.json",json);

            // GoldSpider experiment: read urls.json from the current directory,
            // pass the list to SpiderPrice and write the result to data.json.
            var data= JsonHelper.FromJson<List<string>>(File.ReadAllText(Environment.CurrentDirectory + "\\urls.json"));
            var finalDatas= GoldSpider.SpiderPrice(data);
            File.WriteAllText(Environment.CurrentDirectory + "\\data.json", JsonHelper.ToJson(finalDatas));
            return;

            //SqliteTest.Test();
            //string connectionstring1 = "Data Source=e:\\sqlite.db3";
            //string connectionstring2 = "Data Source=e:\\sqlite.db3;PRAGMA cache_size=10000";
            //SqliteTest.Query("select id from test1 limit 0,10000", connectionstring2);
            //SqliteTest.Query("select id from test1 limit 0,10000", connectionstring1);
            //SqliteTest.Query("select id from test1 limit 0,10000", connectionstring1);

            // SQLite experiment.
            SqliteTest.Memory(100000);
            SqliteTest.MemoryQuery("select id from test1 limit 0,10000");
            return;

            // Embedded-browser experiment: the ".css" filter callback returns true
            // for sources that end with the key it was registered under.
            WebBrowerManager.Instance.Setup(new cEXWB());
            WebBrowerManager.Instance.TimeOut = 15;
            WebBrowerManager.Instance.FilterRequest = true;
            WebBrowerManager.Instance.FilterAction.Add(".css", (string key, string source) =>
                {
                    if(source.EndsWith(key))
                    {
                        return true;
                    }
                    return false;
                });
            string html1 = WebBrowerManager.Instance.Run("http://www.sge.sh/publish/sge/xqzx/jyxq/index.htm");

            Console.WriteLine(html1);
            Console.Read();
            return;

            // TaskManager experiment.
            TaskManager taskManager=new TaskManager();
            taskManager.Test02();
            Console.ReadKey();
            return;

            // Launches Internet Explorer with a fixed address argument.
            Process.Start("IExplore.exe", "www.northwindtraders.comTest");

            //
            //EncodingTest.Test();
            //return;
            // The response is read into a local that is not used afterwards.
            using(HttpClient hc1 = new HttpClient("http://www.cnblogs.com"))
            {
                string html = hc1.Request();
            }

            //WebPage page = new WebPage(html, "http://www.cnblogs.com", Encoding.UTF8);
            //page.SaveHtmlAndResource(@"1.html", false, new DirConfig(@"z:\1"));

            //return;

            HttpClientTest.Test();
            return;

            SqliteTest.Test();

            var uri = new Uri("http://misc.360buyimg.com/lib/js/2012/base-v1.js");

            //WebBrowerManager.Instance.ToVisitUrls = new List<string> { "http://www.360buy.com" };
            WebBrowerManager.Instance.Setup(new cEXWB());
            WebBrowerManager.Instance.Run(uri.ToString());

            // Dependency-injection experiment: resolve Test through the service locator.
            BootStrapperManager.Initialize(new NinjectBootstrapper());

            var add = CommonBootStrapper.ServiceLocator.GetInstance<Test>();
            //add.Alert("ceshi");
            add.Test1();

            // HttpClient experiment: SaveFile, Request and BeginRequest on the same URL.
            HttpClient hc = new HttpClient("http://misc.360buyimg.com/lib/js/2012/base-v1.js");

            hc.SaveFile("e:\\1.js");

            hc.Request();
            hc.BeginRequest((h) =>
                {
                    Console.Write(h);
                });
            Console.ReadKey();
            var s= hc.Request();

            // XML serialization experiment with two identical UnionOrderTransBFD entries.
            var list = new List<UnionOrderTransBFD>();
            list.Add(new UnionOrderTransBFD() { ActualPrice = 1, CommissionPrice = 1, Rate = 1, Source = ">123", SONumber = 111111111111, UpdateDate = DateTime.Now });
            list.Add(new UnionOrderTransBFD() { ActualPrice = 1, CommissionPrice = 1, Rate = 1, Source = ">123", SONumber = 111111111111, UpdateDate = DateTime.Now });

            var xml= ObjectXmlSerializer.ToXml(list,"ccc",true,false);

            // Equality-comparer experiment: two A instances with the same name value.
            var a=new A();
            a.name = 1;
            //a.ObjectB = new B() { ItemCode = "1", Qty = 1 };
            var b = new A();
            b.name =1;
            //b.ObjectB = new B() { ItemCode = "1", Qty = 1 };
            var isEqual = DotNet.Common.Utility.GenericEqualityComparer<A>.Equals(a, b);
            Console.WriteLine(isEqual);
        }
Пример #3
0
        /// <summary>
        /// Crawls the Yihaodian "all categories" page and stores its three-level
        /// category tree through <c>Insert</c>.
        /// </summary>
        /// <param name="url">URL of the page that lists every category.</param>
        /// <remarks>
        /// A key is drawn from <c>KeyGenerator</c> for every category node encountered,
        /// but a category is only inserted when its link carries an <c>href</c> attribute.
        /// Children are still processed when their parent could not be inserted.
        /// </remarks>
        public static void YiHaoDianSpider(string url)
        {
            // e.g. http://www.yihaodian.com/product/listAll.do
            HttpClient hc = new HttpClient(url);
            hc.Timeout = 30000;
            var allSortHtml = hc.Request();
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(allSortHtml);

            // SelectNodes yields null (not an empty collection) when nothing matches.
            var firstCategoryContainer = htmlDocument.DocumentNode.SelectNodes("//div[@class='alonesort']");
            if (firstCategoryContainer == null)
            {
                return;
            }

            foreach (HtmlNode firstCategoryNode in firstCategoryContainer)
            {
                // Resolve the heading link once instead of re-running the selector per access.
                var firstCategoryLinks = firstCategoryNode.CssSelect(".mt>h3>a");
                var firstCategoryLink = firstCategoryLinks == null ? null : firstCategoryLinks.FirstOrDefault();
                if (firstCategoryLink == null)
                {
                    continue;
                }

                // First-level category; "0" is passed in the parent-key position.
                var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                var firstHref = firstCategoryLink.Attributes["href"];
                if (firstHref != null)
                {
                    Insert(firstKey.ToString(), firstCategoryLink.InnerText, firstHref.Value, "0", "2");
                }

                var secondCategoryContainer = firstCategoryNode.CssSelect(".mc>.fore");

                foreach (HtmlNode htmlNode in secondCategoryContainer)
                {
                    // Second-level category; the link may be absent altogether.
                    var secondCategoryNode = htmlNode.CssSelect("dt>a").FirstOrDefault();
                    var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");

                    var secondHref = secondCategoryNode == null ? null : secondCategoryNode.Attributes["href"];
                    if (secondHref != null)
                    {
                        Insert(secondKey.ToString(), secondCategoryNode.InnerText, secondHref.Value, firstKey.ToString(), "2");
                    }

                    // Third-level categories hang below the second-level key.
                    var threeCategoryNodes = htmlNode.CssSelect("dd>em>span>a");
                    foreach (HtmlNode threeCategoryNode in threeCategoryNodes)
                    {
                        var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory2");
                        var thirdHref = threeCategoryNode.Attributes["href"];
                        if (thirdHref != null)
                        {
                            Insert(thirdKey.ToString(), threeCategoryNode.InnerText, thirdHref.Value, secondKey.ToString(), "2");
                        }
                    }
                }
            }
        }
Пример #4
0
 /// <summary>
 /// Requests a hard-coded Yihaodian product-list search URL.
 /// The response is downloaded but not processed.
 /// </summary>
 public static void YiHaoDianProductList()
 {
     const string searchPageUrl = "http://www.yihaodian.com/ctg/searchPage/c5484-%E5%A5%B6%E8%8C%B6/b0/a-s1-v0-p5-price-d0-f04-m1-rt0-pid-k/?callback=jsonp1352021900435";

     var client = new HttpClient(searchPageUrl);
     client.Timeout = 30000;

     // Result intentionally kept in a local only; nothing consumes it yet.
     var responseText = client.Request();
 }
Пример #5
0
        /// <summary>
        /// Collects product data from every page of a Yixun product listing.
        /// </summary>
        /// <param name="url">URL of the first listing page.</param>
        /// <remarks>
        /// The URL of the remaining pages is derived from the "second page" link found on
        /// the first page: its 7th dash-separated segment is replaced by the page number.
        /// When that link or the pager cannot be interpreted, only the first page is processed.
        /// </remarks>
        public static void WuYiBuyProductList(string url)
        {
            // Index of the dash-separated URL segment that is replaced by the page number.
            const int pageSegmentIndex = 6;

            var hcFirst = new HttpClient(url);
            hcFirst.Timeout = 30000;
            var htmlFirst = hcFirst.Request();
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(htmlFirst);

            // Derive the per-page URL template from the first pager link.
            var urlTemplate = string.Empty;
            var secondPageNode = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[4]/div[2]/div[6]/div[1]/a[1]");
            if (secondPageNode != null)
            {
                var secondPageHref = secondPageNode.Attributes["href"];
                if (secondPageHref != null && !string.IsNullOrEmpty(secondPageHref.Value))
                {
                    var segments = secondPageHref.Value.Split('-');

                    // A link with fewer segments than expected cannot be turned into a template.
                    if (segments.Length > pageSegmentIndex)
                    {
                        segments[pageSegmentIndex] = "{0}";
                        urlTemplate = string.Join("-", segments);
                    }
                }
            }

            var maxPageNode = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[4]/div[2]/div[6]/div[1]/a[last()-1]");

            // Default to a single page. int.TryParse zeroes its out argument on failure,
            // so parse into a temporary to keep the default intact.
            var maxPageNumber = 1;
            if (maxPageNode != null && !string.IsNullOrEmpty(urlTemplate))
            {
                int parsedPageNumber;
                if (int.TryParse(maxPageNode.InnerText, out parsedPageNumber) && parsedPageNumber > 1)
                {
                    maxPageNumber = parsedPageNumber;
                }
            }

            for (int i = 1; i <= maxPageNumber; i++)
            {
                if (i != 1)
                {
                    var hc = new HttpClient(string.Format(urlTemplate, i));
                    hc.Timeout = 50000;

                    // Request through the client built for THIS page (the first-page
                    // client would just return page 1 again).
                    var html = hc.Request();
                    htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);
                }

                // Product entries of the current page; a page without entries ends the walk.
                var productListNodes = htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[4]/div[2]/div[5]/ul[1]/li");
                if (productListNodes == null)
                {
                    return;
                }

                foreach (HtmlNode productNode in productListNodes)
                {
                    // Product name link
                    var productNameNode = productNode.SelectSingleNode("./div[1]/h4[1]/a[1]");
                    if (productNameNode == null)
                    {
                        continue;
                    }

                    // Product link; entries whose name link has no href are skipped.
                    var productHrefAttribute = productNameNode.Attributes["href"];
                    if (productHrefAttribute == null)
                    {
                        continue;
                    }
                    var productHref = productHrefAttribute.Value;

                    // Product list image
                    var productImageNode = productNode.SelectSingleNode("./a[1]/img[1]");

                    // Product comment count
                    var commentNode = productNode.SelectSingleNode("./div[1]/p[2]/a[1]");

                    // Product price
                    var productPriceNode = productNode.SelectSingleNode("./div[2]/p[2]/strong[1]");

                    // TODO: original product id and persistence are not implemented yet;
                    // the values located above are not consumed.
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Extracts the products of every page of a Suning category listing and stores
        /// those whose URL is not known yet through <c>DataAccess</c>.
        /// </summary>
        /// <param name="obj">
        /// Category URL passed as a <see cref="string"/>; its query string must carry a
        /// numeric <c>ci</c> (category id) parameter.
        /// </param>
        /// <returns>
        /// Boxed <c>true</c> when the listing was walked; boxed <c>false</c> when
        /// <paramref name="obj"/> is not a non-empty string, the <c>ci</c> parameter is
        /// missing or not numeric, or the pager on the first page cannot be interpreted.
        /// </returns>
        public static object SuNingProductList(object obj)
        {
            var url = obj as string;
            if (string.IsNullOrEmpty(url))
            {
                return false;
            }

            // The category id is read from the "ci" query parameter.
            NameValueCollection nameValue = UrlHelper.GetQueryString(new Uri(url).Query);
            string cid = nameValue["ci"];
            if (string.IsNullOrEmpty(cid))
            {
                return false;
            }

            // Validate once up front instead of parsing (and possibly throwing) per product.
            int categoryId;
            if (!int.TryParse(cid, out categoryId))
            {
                return false;
            }

            // {0} = category id, {1} = zero-based page index ("cp").
            string urlTemplate = "http://search.suning.com/emall/strd.do?ci={0}&cityId=9017&cp={1}&il=0&si=5&st=14&iy=-1";

            var hcFirst = new HttpClient(string.Format(urlTemplate, cid, 0));
            hcFirst.Timeout = 30000;
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(hcFirst.Request());

            // The page count is taken from the second-to-last pager anchor,
            // so at least two anchors are required.
            var pageContainer = htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[1]/div[7]/div[2]/div[8]/a");
            if (pageContainer == null || pageContainer.Count < 2)
            {
                return false;
            }

            int lastPageNumber;
            if (!int.TryParse(pageContainer[pageContainer.Count - 2].InnerText, out lastPageNumber))
            {
                return false;
            }

            for (int i = 0; i < lastPageNumber; i++)
            {
                // Page 0 is the document that is already loaded.
                if (i != 0)
                {
                    HttpClient hc = new HttpClient(string.Format(urlTemplate, cid, i));
                    hc.Timeout = 30000;
                    htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(hc.Request());
                }

                SuNingStoreProducts(htmlDocument.DocumentNode, categoryId);
            }

            return true;
        }

        /// <summary>
        /// Parses the product entries of one Suning listing page and inserts the ones
        /// whose URL is not stored yet.
        /// </summary>
        /// <param name="documentNode">Document node of the listing page.</param>
        /// <param name="categoryId">Category id stored with every product.</param>
        private static void SuNingStoreProducts(HtmlNode documentNode, int categoryId)
        {
            // SelectNodes yields null when the page has no product entries.
            var productLis = documentNode.SelectNodes("/html[1]/body[1]/div[1]/div[7]/div[2]/div[6]/ul[1]/li");
            if (productLis == null)
            {
                return;
            }

            foreach (var htmlNode in productLis)
            {
                var aNode = htmlNode.SelectSingleNode("a");
                if (aNode == null)
                {
                    continue;
                }

                // Product name and link; entries lacking either attribute are skipped.
                var titleAttribute = aNode.Attributes["title"];
                var hrefAttribute = aNode.Attributes["href"];
                if (titleAttribute == null || hrefAttribute == null)
                {
                    continue;
                }
                var name = titleAttribute.Value;
                var href = hrefAttribute.Value;

                // Image URL, read from the "src2" attribute; empty when unavailable.
                var picUrl = string.Empty;
                var imageNode = aNode.SelectSingleNode("img");
                var imageSource = imageNode == null ? null : imageNode.Attributes["src2"];
                if (imageSource != null && !string.IsNullOrEmpty(imageSource.Value))
                {
                    picUrl = imageSource.Value;
                }

                // Comment count; stays 0 when the node is missing or not numeric.
                int commentNum = 0;
                var commentNode = htmlNode.SelectSingleNode("div[1]/div[1]/p[1]/a[1]/i[1]");
                if (commentNode != null)
                {
                    int.TryParse(commentNode.InnerText, out commentNum);
                }

                if (!DataAccess.IsExistUrl(href))
                {
                    DataAccess.InsertProduct(name, href, categoryId, commentNum, picUrl);
                }
            }
        }
        /// <summary>
        /// Imports a fixed set of product pages from www.yihui-lighting.com: downloads the
        /// images each page references and stores the content of the page's
        /// <c>mitte2</c> element(s) as <c>ProductInfo</c> records.
        /// </summary>
        /// <remarks>
        /// Image downloads and product inserts are best-effort: a failure is reported via
        /// <see cref="System.Diagnostics.Trace"/> and processing continues.
        /// </remarks>
        public void HtmlAgilityPack_Demo01()
        {
            // Product ids to import (the sequence has no 1, 6 or 30).
            var productIds = new[]
            {
                "2", "3", "4", "5", "7", "8",
                "9", "10", "11", "12", "13", "14",
                "15", "16", "17", "18", "19",
                "20", "21", "22", "23", "24", "25",
                "26", "27", "28", "29", "31"
            };

            const string uploadDirectory = "z:\\upload\\images\\content";
            const string imageBaseUrl = "http://www.yihui-lighting.com/UpProduct";

            foreach (string productId in productIds)
            {
                HttpClient httpClient1 = new HttpClient("http://www.yihui-lighting.com/productshow.asp?id=" + productId);
                var document = new HtmlAgilityPack.HtmlDocument();
                document.LoadHtml(httpClient1.Request());

                // SelectNodes yields null when the page has no content element; skip such pages.
                var contentNodes = document.DocumentNode.SelectNodes("//*[@id=\"mitte2\"]");
                if (contentNodes == null)
                {
                    continue;
                }

                // Images are looked up document-wide, so download them once per page
                // rather than once per content node.
                var images = document.DocumentNode.SelectNodes("//img");
                if (images != null && images.Count > 0)
                {
                    if (!Directory.Exists(uploadDirectory))
                    {
                        Directory.CreateDirectory(uploadDirectory);
                    }

                    foreach (HtmlNode imageNode in images)
                    {
                        var sourceAttribute = imageNode.Attributes["src"];
                        if (sourceAttribute == null || sourceAttribute.Value == null)
                        {
                            continue;
                        }

                        // Only the file name part of the src is used, both remotely and locally.
                        var source = sourceAttribute.Value;
                        var fileName = source.Substring(source.LastIndexOf('/') + 1);
                        try
                        {
                            downloadfile(imageBaseUrl + "//" + fileName, fileName);
                        }
                        catch (Exception ex)
                        {
                            // Best-effort: one failed image must not abort the import.
                            System.Diagnostics.Trace.TraceWarning("Image download failed for {0}: {1}", fileName, ex.Message);
                        }
                    }
                }

                // The title is shared by every content node of the page; it may be absent.
                var titleNode = document.DocumentNode.SelectSingleNode(
                    "/html/body/div/div[3]/div[3]/div[2]/div/h2/font");
                var title = titleNode == null ? "" : (titleNode.InnerText ?? "");

                foreach (HtmlNode contentNode in contentNodes)
                {
                    if (string.IsNullOrEmpty(contentNode.InnerText))
                    {
                        continue;
                    }

                    var model = new ProductInfo();
                    model.Title = title;
                    model.CategoryId = 2;

                    // Point image references at the local upload folder.
                    model.Content = contentNode.InnerHtml.Replace("UpProduct/", "upload/images/content/");

                    model.InDate = DateTime.Now;
                    model.DisplayOrder = OrderGenerator.NewOrder();

                    IProductService productService = new ProductService();
                    try
                    {
                        productService.Add(model);
                    }
                    catch (Exception ex)
                    {
                        // Best-effort: keep importing the remaining pages.
                        System.Diagnostics.Trace.TraceWarning("Product insert failed for id {0}: {1}", productId, ex.Message);
                    }
                }
            }
        }
Пример #8
0
        /// <summary>
        /// Walks every page of a Dangdang category listing and stores the products whose
        /// URL is not known yet through <c>DataAccess</c>.
        /// </summary>
        /// <param name="categoryUrl">
        /// Category URL whose query string carries a numeric <c>cat</c> (category id) parameter.
        /// </param>
        /// <remarks>
        /// The method returns without doing anything when the category id is missing or not
        /// numeric, or when the pager of the first page cannot be interpreted.
        /// </remarks>
        public static void DangDangProductList(string categoryUrl)
        {
            // The category id is read from the "cat" query parameter.
            Uri uri = new Uri(categoryUrl);
            NameValueCollection nameValue = UrlHelper.GetQueryString(uri.Query);
            string cid = nameValue["cat"];
            if (string.IsNullOrEmpty(cid))
            {
                return;
            }

            // Validate once up front instead of parsing (and possibly throwing) per product.
            int categoryId;
            if (!int.TryParse(cid, out categoryId))
            {
                return;
            }

            var hcFirst = new HttpClient(categoryUrl);
            hcFirst.Timeout = 30000;
            var htmlFirst = hcFirst.Request();
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(htmlFirst);

            // Pager text of the first page, e.g. "1&nbsp;/&nbsp;19".
            var maxPageNode = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[2]/div[3]/div[5]/div[1]/div[2]/span[1]");
            if (maxPageNode == null || string.IsNullOrEmpty(maxPageNode.InnerText))
            {
                return;
            }

            // Take whatever follows the slash rather than cutting a fixed-length prefix,
            // which would throw on shorter text.
            var pagerText = maxPageNode.InnerText;
            var slashIndex = pagerText.LastIndexOf('/');
            if (slashIndex < 0)
            {
                return;
            }
            var maxPageString = pagerText.Substring(slashIndex + 1).Replace("&nbsp;", string.Empty).Trim();

            var maxPageNumber = 0;
            if (!int.TryParse(maxPageString, out maxPageNumber))
            {
                return;
            }

            // Page URL template: {0} = category id, {1} = 1-based page index.
            var tempUrl = "http://category.dangdang.com/all/?category_id={0}&page_index={1}";

            for (int j = 1; j <= maxPageNumber; j++)
            {
                if (j != 1)
                {
                    var hc = new HttpClient(string.Format(tempUrl, cid, j));
                    hc.Timeout = 30000;
                    var html = hc.Request();

                    // An empty response would otherwise re-process the previous page.
                    if (string.IsNullOrEmpty(html))
                    {
                        continue;
                    }
                    htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);
                }

                var productList =
                    htmlDocument.DocumentNode.SelectNodes("/html[1]/body[1]/div[2]/div[3]/div[7]/div");
                if (productList == null || !productList.Any())
                {
                    return;
                }

                foreach (HtmlNode htmlNode in productList)
                {
                    // Only divs whose class contains "listitem" are product entries.
                    var classAttribute = htmlNode.Attributes["class"];
                    if (classAttribute == null || !classAttribute.Value.Contains("listitem"))
                    {
                        continue;
                    }

                    // Product link
                    var a = htmlNode.SelectSingleNode("p[1]/a[1]");
                    var hrefAttribute = a == null ? null : a.Attributes["href"];
                    if (hrefAttribute == null || string.IsNullOrEmpty(hrefAttribute.Value))
                    {
                        continue;
                    }
                    var productUrl = hrefAttribute.Value;

                    // Product name; an entry without a name node is skipped.
                    var titleNode = htmlNode.SelectSingleNode("p[3]/a[1]");
                    if (titleNode == null)
                    {
                        continue;
                    }

                    // List-page image; empty when the image or its src is missing.
                    var image = a.SelectSingleNode("img[1]");
                    var sourceAttribute = image == null ? null : image.Attributes["src"];
                    var productImageUrl = sourceAttribute == null ? string.Empty : sourceAttribute.Value;

                    // Comment count: the characters between the first one and "条".
                    var commentCount = 0;
                    var commentNodes = htmlNode.CssSelect("p.starlevel");
                    var commentNode = commentNodes == null ? null : commentNodes.FirstOrDefault();
                    if (commentNode != null && !string.IsNullOrEmpty(commentNode.InnerText))
                    {
                        var commentText = commentNode.InnerText;
                        var index = commentText.IndexOf("条", StringComparison.Ordinal);

                        // index == 0 would ask Substring for a negative length.
                        if (index > 0)
                        {
                            int.TryParse(commentText.Substring(1, index - 1), out commentCount);
                        }
                    }

                    if (!DataAccess.IsExistUrl(productUrl))
                    {
                        DataAccess.InsertProduct(titleNode.InnerText, productUrl, categoryId, commentCount, productImageUrl);
                    }
                }
            }
        }
Пример #9
0
        /// <summary>
        /// Collects product links from every page of a hard-coded 360buy listing, stores
        /// the ones not known yet via the <c>InsertUrls</c> data command and shows a
        /// message box when finished.
        /// </summary>
        /// <remarks>
        /// The page count is read from the pager of the first page. When it cannot be
        /// determined, no page is processed and only the message box is shown.
        /// </remarks>
        public void Process()
        {
            // {0} = 1-based page number.
            string url = "http://www.360buy.com/products/670-671-672-0-0-0-0-0-0-0-1-1-{0}.html";

            HttpClient client = new HttpClient(string.Format(url, 1));
            var html = client.Request();

            // The first page tells how many pages there are.
            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);
            var maxPager = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='pagin fr']/a[4]");

            int maxPageNumber = 0;
            if (maxPager != null && int.TryParse(maxPager.InnerText, out maxPageNumber) && maxPageNumber > 0)
            {
                for (int i = 1; i <= maxPageNumber; i++)
                {
                    // Page 1 is already loaded; only later pages need a download.
                    var doc = htmlDocument;
                    if (i != 1)
                    {
                        HttpClient hc = new HttpClient(string.Format(url, i));
                        doc = HtmlAgilityPackHelper.GetHtmlDocument(hc.Request());
                    }

                    // Pages without the product container are skipped.
                    var productContainer = doc.GetElementbyId("plist");
                    if (productContainer == null)
                    {
                        continue;
                    }

                    // ".//a" keeps the search inside the container; a leading "//"
                    // would search the whole document instead.
                    var productNodes = productContainer.SelectNodes(".//a");
                    if (productNodes == null || productNodes.Count == 0)
                    {
                        continue;
                    }

                    foreach (var productNode in productNodes)
                    {
                        // Only links pointing at a product page are of interest.
                        var hrefAttribute = productNode.Attributes["href"];
                        if (hrefAttribute == null || hrefAttribute.Value == null || !hrefAttribute.Value.Contains("product/"))
                        {
                            continue;
                        }

                        var href = hrefAttribute.Value;
                        if (IsExistUrl(href))
                        {
                            continue;
                        }

                        using (DataCommand cmd = DataCommandManager.GetDataCommand("InsertUrls"))
                        {
                            cmd.SetParameterValue("@Guid", System.Guid.NewGuid().ToString());
                            cmd.SetParameterValue("@Url", href);
                            cmd.ExecuteNonQuery();
                        }
                    }

                    // Pause between pages.
                    Thread.Sleep(1000);
                }
            }

            MessageBox.Show("笔记本数据采集完了");
        }
Пример #10
0
        /// <summary>
        /// Walks every page of a paged category listing and stores the product links
        /// that are not known yet.
        /// </summary>
        /// <param name="url">Format string of the listing URL; <c>{0}</c> receives the 1-based page number.</param>
        /// <param name="categoryId">Category id in text form; converted with <c>int.Parse</c> whenever a product is stored.</param>
        public void ProcessCategory(string url,string categoryId)
        {
            // The first page is only used to read the page count from its pager.
            HttpClient firstPageClient = new HttpClient(string.Format(url, 1));
            firstPageClient.Timeout = 1000000;
            firstPageClient.StatusUpdate += new EventHandler<StatusUpdateEventArgs>(Message);
            var firstPageHtml = firstPageClient.Request();
            var firstPageDocument = HtmlAgilityPackHelper.GetHtmlDocument(firstPageHtml);

            var lastPageLink = firstPageDocument.DocumentNode.SelectSingleNode("//div[@class='pagin fr']/a[4]");
            if (lastPageLink == null)
            {
                return;
            }

            int pageCount;
            bool pageCountParsed = int.TryParse(lastPageLink.InnerText, out pageCount);
            if (!pageCountParsed || pageCount <= 0)
            {
                return;
            }

            int page = 1;
            while (page <= pageCount)
            {
                HttpClient pageClient = new HttpClient(string.Format(url, page));
                pageClient.Timeout = 1000000;
                pageClient.StatusUpdate += new EventHandler<StatusUpdateEventArgs>(Message);
                var pageDocument = HtmlAgilityPackHelper.GetHtmlDocument(pageClient.Request());
                page++;

                // Pages without the product container are skipped (no pause either).
                var listContainer = pageDocument.GetElementbyId("plist");
                if (listContainer == null)
                {
                    continue;
                }

                // Same for pages whose container holds no product-name links.
                var nameLinks = listContainer.CssSelect("div.p-name>a");
                if (nameLinks == null || !nameLinks.Any())
                {
                    continue;
                }

                foreach (var nameLink in nameLinks)
                {
                    var hrefAttribute = nameLink.Attributes["href"];
                    if (hrefAttribute == null)
                    {
                        continue;
                    }

                    // Only links pointing at a product page are stored.
                    var href = hrefAttribute.Value;
                    if (href == null || !href.Contains("product/"))
                    {
                        continue;
                    }

                    if (IsExistUrl(href))
                    {
                        continue;
                    }

                    InsertProduct(nameLink.InnerText, href, int.Parse(categoryId), 0);
                }

                // Pause between pages.
                Thread.Sleep(5000);
            }
        }
Пример #11
0
        /// <summary>
        /// Downloads a product page and extracts the price text that sits between the
        /// price marker and the following full stop inside an inline script.
        /// </summary>
        /// <param name="productId">Identifier of the product; not used by the current implementation.</param>
        /// <param name="url">Address of the product page to download.</param>
        /// <remarks>
        /// NOTE(review): the extracted values are only held in locals; nothing is stored or
        /// returned yet. Pages that lack the expected markers are now skipped instead of
        /// raising <c>ArgumentOutOfRangeException</c>.
        /// </remarks>
        public void GatherPrice(string productId,string url)
        {
            HttpClient client = new HttpClient(url);
            var html = client.Request();

            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(html);

            // Title
            var title = htmlDocument.DocumentNode.SelectSingleNode("/html/body/div[5]/div/div/h1");
            // Price rendered as an image
            var price = htmlDocument.DocumentNode.SelectSingleNode("//div[@class='p-price']/img");
            // Price as text
            var priceText = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[5]/div[1]/div[2]/ul[1]/script[1]");

            // Product image (located but not consumed yet)
            var defaultImage = htmlDocument.DocumentNode.SelectSingleNode("/html[1]/body[1]/div[5]/div[1]/div[2]/div[1]");

            // Promotion information is loaded via ajax, so it is not looked up here.
            if (title == null || price == null || priceText == null)
            {
                return;
            }

            const string priceMarker = "京东价:¥";
            var scriptText = priceText.InnerText;

            var beginIndex = scriptText.IndexOf(priceMarker, System.StringComparison.Ordinal);
            if (beginIndex < 0)
            {
                // Marker missing: there is no price to read on this page.
                return;
            }

            // The marker itself contains no full stop, so searching from its end finds the
            // same terminator as searching from its start.
            var priceStart = beginIndex + priceMarker.Length;
            var endIndex = scriptText.IndexOf("。", priceStart, System.StringComparison.Ordinal);
            if (endIndex < 0)
            {
                // Terminator missing: the price span cannot be delimited.
                return;
            }

            var readPrice = scriptText.Substring(priceStart, endIndex - priceStart);
        }
Пример #12
0
        /// <summary>
        /// Downloads the "all categories" page and stores its three-level category tree
        /// through <c>Insert</c>, taking every key from <c>KeyGenerator</c>.
        /// </summary>
        /// <remarks>
        /// First-level categories are inserted with parent "0". Second- and third-level
        /// categories are inserted with the key of the level above as parent. Only
        /// third-level entries carry a link, and entries without an <c>href</c> are skipped.
        /// </remarks>
        public void GatherAll()
        {
            var url = "http://www.360buy.com/allSort.aspx";
            HttpClient hc = new HttpClient(url);
            hc.Timeout = 30000;
            var allSortHtml = hc.Request();

            var htmlDocument = HtmlAgilityPackHelper.GetHtmlDocument(allSortHtml);

            // Without the "allsort" container the page is not the expected category index.
            if (htmlDocument.GetElementbyId("allsort") == null)
            {
                return;
            }

            var firstCategory = htmlDocument.DocumentNode.SelectNodes("/html/body/div[5]/div/div/div/h2");
            // HtmlAgilityPack returns null rather than an empty collection when nothing matches.
            if (firstCategory == null)
            {
                return;
            }

            foreach (var first in firstCategory)
            {
                // Insert the first-level category.
                var firstKey = KeyGenerator.Instance.GetNextValue("ProductCategory");
                Insert(firstKey.ToString(), first.InnerText, "", "0");

                // Second-level categories: every <dt> under the heading's grandparent node.
                var secondCategorys = first.ParentNode.ParentNode.CssSelect("dt");
                foreach (var secondCategory in secondCategorys)
                {
                    // Insert the second-level category.
                    var secondKey = KeyGenerator.Instance.GetNextValue("ProductCategory");
                    Insert(secondKey.ToString(), secondCategory.InnerText, "", firstKey.ToString());

                    // Third-level categories: the links next to this <dt>.
                    var thirdCategorys = secondCategory.ParentNode.CssSelect("dd>em>a");
                    foreach (var thirdCategory in thirdCategorys)
                    {
                        var hrefAttribute = thirdCategory.Attributes["href"];
                        if (hrefAttribute == null || string.IsNullOrEmpty(hrefAttribute.Value))
                        {
                            continue;
                        }

                        // Request the key only for entries that are actually inserted, so
                        // skipped links do not use up key values.
                        var thirdKey = KeyGenerator.Instance.GetNextValue("ProductCategory");
                        Insert(thirdKey.ToString(), thirdCategory.InnerText, hrefAttribute.Value, secondKey.ToString());
                    }
                }
            }
        }
Пример #13
0
        /// <summary>
        /// Writes the content held by an <c>HttpClient</c> to a file inside the given directory.
        /// </summary>
        /// <param name="httpClient">
        /// Client whose <c>MemoryStream</c> is written out and whose <c>GetFileName()</c>
        /// supplies the name when none is given.
        /// </param>
        /// <param name="dirPath">
        /// Directory that receives the file. It is joined to the file name with a backslash
        /// and the result is normalised with <c>Path.GetFullPath</c>.
        /// </param>
        /// <param name="fileName">
        /// Name of the file; when null or empty the name is taken from <paramref name="httpClient"/>.
        /// </param>
        /// <returns>
        /// The value returned by <c>FileUtility.SaveFile</c> for the resolved path.
        /// NOTE(review): the previous comment described this as the absolute file path on
        /// success and null on failure; confirm against <c>FileUtility.SaveFile</c>.
        /// </returns>
        internal static string SaveFile(HttpClient httpClient, string dirPath, string fileName)
        {
            // Fall back to the name reported by the client when the caller supplied none.
            string targetName = string.IsNullOrEmpty(fileName) ? httpClient.GetFileName() : fileName;

            string absolutePath = Path.GetFullPath(dirPath + "\\" + targetName);
            return FileUtility.SaveFile(absolutePath, httpClient.MemoryStream);
        }
Пример #14
0
        /// <summary>
        /// Synchronously saves the current page as an html file. Each stylesheet, script,
        /// image and flash reference found in the markup is passed to <c>MatchUrl</c> with
        /// its target directory, and the markup is rewritten with whatever that call returns.
        /// Stylesheets are additionally downloaded here so that background images referenced
        /// inside them can be processed before the css is written to disk.
        /// </summary>
        /// <param name="fileName">Name of the html file written into <c>dirConfig.HtmlDirPath</c>.</param>
        /// <param name="ignoreScript">When true, the markup returned by <c>FilterScript()</c> is used instead of the raw html.</param>
        /// <param name="dirConfig">Directory configuration for the download; it must pass <c>CheckLegal()</c>.</param>
        /// <returns>
        /// The value returned by <c>FileUtility.SaveText</c> for the html file.
        /// NOTE(review): the previous comment described this as the local file path on
        /// success and null on failure; confirm against <c>FileUtility.SaveText</c>.
        /// </returns>
        /// <exception cref="Exception">
        /// Thrown when <paramref name="dirConfig"/> fails <c>CheckLegal()</c> or when no current url is set.
        /// </exception>
        public string SaveHtmlAndResource(string fileName, bool ignoreScript, DirConfig dirConfig)
        {
            string pageMarkup = this.html;
            if (!dirConfig.CheckLegal())
            {
                throw new Exception("路径配置不在统一磁盘下");
            }
            dirConfig.CreateDir();
            if (string.IsNullOrEmpty(this.url))
            {
                throw new Exception("未指定当前url");
            }

            // Optionally continue with the script-filtered markup.
            if (ignoreScript)
            {
                pageMarkup = this.FilterScript();
            }

            #region Save every stylesheet and the images it references
            // A stylesheet may itself reference images, so a custom download callback is
            // supplied: it fetches the css, processes its background urls and saves the result.
            // NOTE(review): this pass hands "src" to MatchUrl exactly like the script, image
            // and flash passes below; confirm that this fits what RegCssLink captures.
            pageMarkup = RegexLibrary.RegCssLink.Replace(pageMarkup, (Match match) =>
                this.MatchUrl(match, dirConfig, this.url, dirConfig.HtmlDirPath, dirConfig.CssDirPath, "src", null, (string cssUrl) =>
                {
                    HttpClient cssClient = new HttpClient(cssUrl);
                    string cssText = cssClient.Request();

                    // Nothing came back: hand the original url back unchanged.
                    if (string.IsNullOrEmpty(cssText))
                    {
                        return cssUrl;
                    }

                    string cssFileName = cssClient.GetFileName();

                    // Work out the absolute path the stylesheet is written to.
                    string cssTarget;
                    if (dirConfig.UseWebSite)
                    {
                        cssTarget = Path.GetFullPath(PathUtility.GetSaveDir(this.Url, cssUrl, dirConfig.HtmlDirPath) + cssFileName);
                    }
                    else
                    {
                        cssTarget = Path.GetFullPath(dirConfig.CssDirPath + "\\" + cssFileName);
                    }

                    // Process the background images referenced by the stylesheet, then store it.
                    cssText = this.ReplaceBackgroundUrl(cssText, cssUrl, dirConfig, Path.GetDirectoryName(cssTarget), dirConfig.ImgDirPath, false, null);
                    return FileUtility.SaveText(cssTarget, cssText, cssClient.Encoding);
                }));
            #endregion

            #region Save every script
            pageMarkup = RegexLibrary.RegScriptLink.Replace(pageMarkup, (Match match) =>
                this.MatchUrl(match, dirConfig, this.url, dirConfig.HtmlDirPath, dirConfig.JsDirPath, "src", null, null));
            #endregion

            #region Save every image
            pageMarkup = RegexLibrary.RegImg.Replace(pageMarkup, (Match match) =>
                this.MatchUrl(match, dirConfig, this.url, dirConfig.HtmlDirPath, dirConfig.ImgDirPath, "src", null, null));
            #endregion

            #region Save every flash object
            pageMarkup = RegexLibrary.RegFlash.Replace(pageMarkup, (Match match) =>
                this.MatchUrl(match, dirConfig, this.url, dirConfig.HtmlDirPath, dirConfig.FlashDirPath, "src", null, null));
            #endregion

            #region Images referenced by css embedded in the page itself
            pageMarkup = this.ReplaceBackgroundUrl(pageMarkup, this.Url, dirConfig, dirConfig.HtmlDirPath, dirConfig.ImgDirPath, false, null);
            #endregion

            string htmlTarget = Path.GetFullPath(dirConfig.HtmlDirPath + "\\" + fileName);
            return FileUtility.SaveText(htmlTarget, pageMarkup, this.encode);
        }
Пример #15
0
 /// <summary>
 /// Synchronously downloads a resource (css, js, flash, image, ...) and stores it in a local directory.
 /// </summary>
 /// <param name="fileUrl">Public url of the resource.</param>
 /// <param name="dirPath">Local directory the resource is saved into.</param>
 /// <param name="fileName">File name; when null or empty the name is determined by <c>SaveFile</c>.</param>
 /// <returns>
 /// <c>string.Empty</c> when the request yields no content; otherwise the value returned by <c>SaveFile</c>.
 /// </returns>
 public static string SaveResource(string fileUrl, string dirPath, string fileName)
 {
     HttpClient downloader = new HttpClient(fileUrl);
     string body = downloader.Request();

     // An empty response means there is nothing to write to disk.
     return string.IsNullOrEmpty(body)
         ? string.Empty
         : SaveFile(downloader, dirPath, fileName);
 }