/*
 * [Function]
 * from amazon main category url extract sub category list
 * [Input]
 * http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dappliances
 * [Output]
 * sub/child categoryItem list
 * the list is empty (never null) when the expected nodes can not be found
 * [Note]
 * 1. expected html of the page:
 *    <div id="left">
 *     <div id="leftNav">
 *      <div id="leftNavContainer">
 *       <div id="refinements" data-baserh="n%3A2619525011" data-browseladder="n%3A2619525011">
 *        <h2 >Department</h2>
 *        <ul id="ref_2619526011" data-typeid="n" >
 *         <li style="margin-left: 0px">
 *          <strong>Appliances</strong>
 *         </li>
 *         <li style="margin-left: -2px">
 *          <a href="/s/ref=lp_2619525011_nr_n_0?rh=n%3A2619525011%2Cn%3A%212619526011%2Cn%3A3737671&bbn=2619526011&ie=UTF8&qid=1371180383&rnid=2619526011">
 *           <span class="refinementLink">Air Conditioners</span><span class="narrowValue"> (8,704)</span>
 *          </a>
 *         </li>
 *         ............
 *         <li style="margin-left: -2px">
 *          <a href="/s/ref=lp_2619525011_nr_n_29?rh=n%3A2619525011%2Cn%3A%212619526011%2Cn%3A3741521&bbn=2619526011&ie=UTF8&qid=1371180383&rnid=2619526011">
 *           <span class="refinementLink">Wine Cellars</span><span class="narrowValue"> (3,761)</span>
 *          </a>
 *         </li>
 *        </ul>
 * 2. each returned item has Key = "" (sub category has no key)
 * 3. an <a> without href, or without <span class="refinementLink">, is logged and skipped
 */
public List<categoryItem> extractSubCategoryList(string amazonMainCategoryUrl)
{
    List<categoryItem> subCategoryList = new List<categoryItem>();

    string respHtml = crl.getUrlRespHtml_multiTry(amazonMainCategoryUrl);
    HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(respHtml);

    HtmlNode refinementsNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='refinements' and @data-baserh and @data-browseladder]");
    if (refinementsNode == null)
    {
        // SelectSingleNode returns null when nothing matches; without this guard the
        // SelectNodes call below would throw a NullReferenceException
        gLogger.Debug("can not find refinementsNode for " + amazonMainCategoryUrl);
        return subCategoryList;
    }

    // the leading "." is required: in HtmlAgilityPack an XPath starting with "//" is evaluated
    // against the whole document even when called on a sub node, so without it
    // the search would not be limited to the refinements div
    HtmlNodeCollection subCategoryNodeList = refinementsNode.SelectNodes(".//ul[@id and @data-typeid]/li/a");
    if ((subCategoryNodeList == null) || (subCategoryNodeList.Count == 0))
    {
        //something wrong
        gLogger.Debug("can not find subCategoryNodeList");
        return subCategoryList;
    }

    foreach (HtmlNode subCatNode in subCategoryNodeList)
    {
        //"/s/ref=lp_2619525011_nr_n_0?rh=n%3A2619525011%2Cn%3A%212619526011%2Cn%3A3737671&bbn=2619526011&ie=UTF8&qid=1371183419&rnid=2619526011"
        string relativeUrl = subCatNode.GetAttributeValue("href", "");
        if (String.IsNullOrEmpty(relativeUrl))
        {
            //something wrong
            gLogger.Debug("can not find href of sub category node for " + amazonMainCategoryUrl);
            continue;
        }

        HtmlNode refinementLinkNode = subCatNode.SelectSingleNode("./span[@class='refinementLink']");
        if (refinementLinkNode == null)
        {
            //something wrong
            gLogger.Debug("can not find refinementLink node of sub category node for " + amazonMainCategoryUrl);
            continue;
        }

        // prepend the domain, then html decode
        //"http://www.amazon.com/s/ref=lp_2619525011_nr_n_0?rh=n%3A2619525011%2Cn%3A%212619526011%2Cn%3A3737671&bbn=2619526011&ie=UTF8&qid=1371183419&rnid=2619526011"
        string subCatUrl = HttpUtility.HtmlDecode(constAmazonDomainUrl + relativeUrl);

        //"Air Conditioners"
        string subCatName = HttpUtility.HtmlDecode(refinementLinkNode.InnerText.Trim());

        //store info
        categoryItem singleSubCatItem = new categoryItem();
        singleSubCatItem.Name = subCatName;
        singleSubCatItem.Key = ""; // sub category no key
        singleSubCatItem.Url = subCatUrl;

        subCategoryList.Add(singleSubCatItem);
    }

    return subCategoryList;
}
/*
 * [Function]
 * from amazon main url extract main category
 * [Input]
 * http://www.amazon.com/ref=nb_sb_noss_null
 * [Output]
 * categoryItem list, contains 36 main category (at the time the sample below was captured):
 *  Key "instant-video" string
 *  Name "Amazon Instant Video" string
 *  ...
 *  Key "watches" string
 *  Name "Watches" string
 * the list is empty (never null) when the expected nodes can not be found
 * [Note]
 * 1. expected html of the page:
 *    <span id='nav-search-in' class='nav-sprite'>
 *      <span id='nav-search-in-content' data-value="search-alias=aps">
 *        All
 *      </span>
 *      <span class='nav-down-arrow nav-sprite'></span>
 *      <select name="url" id="searchDropdownBox" class="searchSelect" title="Search in" >
 *        <option value="search-alias=aps" selected="selected">All Departments</option>
 *        <option value="search-alias=instant-video">Amazon Instant Video</option>
 *        <option value="search-alias=appliances">Appliances</option>
 *        <option value="search-alias=mobile-apps">Apps for Android</option>
 *        <option value="search-alias=arts-crafts">Arts, Crafts & Sewing</option>
 *        ...
 *        <option value="search-alias=videogames">Video Games</option>
 *        <option value="search-alias=watches">Watches</option>
 *      </select>
 *    </span>
 * 2. the first <option> (All Departments) is always skipped, by position
 * 3. Name is html decoded, same as the sub category and best seller extraction
 */
public List<categoryItem> extractMainCategoryList(string amazonMainUrl)
{
    List<categoryItem> mainCategoryList = new List<categoryItem>();

    string respHtml = crl.getUrlRespHtml_multiTry(amazonMainUrl);
    HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(respHtml);

    HtmlNode categorySelectNode = htmlDoc.DocumentNode.SelectSingleNode("//span[@id='nav-search-in' and @class='nav-sprite']/select[@name='url' and @id='searchDropdownBox' and @class='searchSelect']");
    if (categorySelectNode == null)
    {
        //something wrong
        gLogger.Debug("can not find categorySelectNode for " + amazonMainUrl);
        return mainCategoryList;
    }

    HtmlNodeCollection optionNodeList = categorySelectNode.SelectNodes(".//option[@value]");
    if (optionNodeList == null)
    {
        // SelectNodes returns null (not an empty collection) when nothing matches
        gLogger.Debug("can not find optionNodeList for " + amazonMainUrl);
        return mainCategoryList;
    }

    //omit first one:
    //<option value="search-alias=aps" selected="selected">All Departments</option>
    // start from index 1 instead of removing node 0, so the parsed document is not modified
    for (int optionIdx = 1; optionIdx < optionNodeList.Count; optionIdx++)
    {
        //<option value="search-alias=instant-video">Amazon Instant Video</option>
        //<option value="search-alias=appliances">Appliances</option>
        //...
        //<option value="search-alias=watches">Watches</option>
        HtmlNode singleOptionNode = optionNodeList[optionIdx];

        string searchValue = singleOptionNode.Attributes["value"].Value; //search-alias=instant-video
        string categoryKey = ""; //instant-video
        if (!crl.extractSingleStr(@"=([a-z\-]+)", searchValue, out categoryKey))
        {
            //something wrong
            gLogger.Debug(String.Format("can not extart main category key for html node {0} for {1}",singleOptionNode.ToString(),amazonMainUrl));
            continue;
        }

        //instant-video
        //appliances
        //mobile-apps

        // NOTE(review): this relies on InnerText of <option> holding the text; the original code had
        // a commented-out NextSibling.InnerText alternative -- presumably crl.htmlToHtmlDoc
        // configures the parser so that <option> keeps its content, TODO confirm
        string generalCategory = HttpUtility.HtmlDecode(singleOptionNode.InnerText); //Amazon Instant Video

        //store info
        categoryItem singleCategoryItem = new categoryItem();
        singleCategoryItem.Name = generalCategory;
        singleCategoryItem.Key = categoryKey;
        singleCategoryItem.Url = generateMainCategoryUrlFromCategoryKey(categoryKey);

        //add to list
        mainCategoryList.Add(singleCategoryItem);
    }

    return mainCategoryList;
}
/*
 * [Function]
 * from amazon Best Seller url extract category
 * [Input]
 * http://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_tab
 * http://www.amazon.com/Best-Sellers/zgbs
 * [Output]
 * categoryItem list, contains 35 main category (at the time the sample below was captured):
 *  Name "Appliances" string
 *  Key "appliances" string
 *  Url "http://www.amazon.com/Best-Sellers-Appliances/zgbs/appliances/ref=zg_bs_nav_0" string
 *  ...
 * the list is empty (never null) when the expected nodes can not be found
 * [Note]
 * 1. expected html of the page:
 *    <ul id="zg_browseRoot">
 *      <li>
 *        <span class="zg_selected"> Any Department</span>
 *      </li>
 *      <ul>
 *        <li><a href='http://www.amazon.com/Best-Sellers-Appliances/zgbs/appliances/ref=zg_bs_nav_0'>Appliances</a></li>
 *        <li><a href='http://www.amazon.com/Best-Sellers-Appstore-Android/zgbs/mobile-apps/ref=zg_bs_nav_0'>Appstore for Android</a></li>
 *        <li><a href='http://www.amazon.com/best-sellers-books-Amazon/zgbs/books/ref=zg_bs_nav_0'>Books</a></li>
 *        <li><a href='http://www.amazon.com/best-sellers-camera-photo/zgbs/photo/ref=zg_bs_nav_0'>Camera & Photo</a></li>
 *        ...
 *        <li><a href='http://www.amazon.com/best-sellers-video-games/zgbs/videogames/ref=zg_bs_nav_0'>Video Games</a></li>
 *        <li><a href='http://www.amazon.com/Best-Sellers-Watches/zgbs/watches/ref=zg_bs_nav_0'>Watches</a></li>
 *      </ul>
 *    </li></ul>
 * 2. the url path is sometimes "Best-Sellers-xxx" and sometimes "best-sellers-xxx",
 *    so links are matched only by the domain prefix
 * 3. a link whose category key can not be extracted is logged and skipped
 */
public List<categoryItem> extractBestSellerCategoryList(string amazonBestSellerUrl)
{
    List<categoryItem> bestSellerCategoryList = new List<categoryItem>();

    string bestSellerHtml = crl.getUrlRespHtml_multiTry(amazonBestSellerUrl);
    // fully qualified like the other extract methods in this file
    HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(bestSellerHtml);

    HtmlNode browseRootUlNode = htmlDoc.DocumentNode.SelectSingleNode("//ul[@id='zg_browseRoot']/ul");
    if (browseRootUlNode == null)
    {
        //something wrong
        gLogger.Debug("can not find browseRootUlNode for " + amazonBestSellerUrl);
        return bestSellerCategoryList;
    }

    HtmlNodeCollection categoryNodeList = browseRootUlNode.SelectNodes(".//li/a[contains(@href, 'http://www.amazon.com/')]");
    if (categoryNodeList == null)
    {
        // SelectNodes returns null (not an empty collection) when nothing matches;
        // without this guard the foreach below would throw a NullReferenceException
        gLogger.Debug("can not find categoryNodeList for " + amazonBestSellerUrl);
        return bestSellerCategoryList;
    }

    foreach (HtmlNode categoryNode in categoryNodeList)
    {
        //<li><a href='http://www.amazon.com/best-sellers-camera-photo/zgbs/photo/ref=zg_bs_nav_0'>Camera & Photo</a></li>
        // href is guaranteed to exist here: the xpath above only matches <a> having it
        string categoryUrl = categoryNode.Attributes["href"].Value; //"http://www.amazon.com/Best-Sellers-Appliances/zgbs/appliances/ref=zg_bs_nav_0"
        string categoryStr = HttpUtility.HtmlDecode(categoryNode.InnerText); //"Appliances"

        string categoryKey = "";
        if (!extractCatKeyFromBestSellerCatUrl(categoryUrl, out categoryKey))
        {
            //something wrong
            gLogger.Debug("can not extract best seller category key from " + categoryUrl);
            continue;
        }

        //store info
        categoryItem bestSellerCategoryItem = new categoryItem();
        bestSellerCategoryItem.Name = categoryStr; //"Appliances"
        bestSellerCategoryItem.Key = categoryKey; //"appliances"
        bestSellerCategoryItem.Url = categoryUrl; //"http://www.amazon.com/Best-Sellers-Appliances/zgbs/appliances/ref=zg_bs_nav_0"

        bestSellerCategoryList.Add(bestSellerCategoryItem);
    }

    return bestSellerCategoryList;
}