Ejemplo n.º 1
0
    /*
     * [Function]
     * from amazon main category url extract sub category list
     * [Input]
     * http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dappliances
     * [Output]
     * sub/child categoryItem list
     * [Note]
     */
    /// <summary>
    /// Downloads an Amazon main-category page and extracts the sub-category links found
    /// under its "refinements" (Department) panel.
    /// </summary>
    /// <param name="amazonMainCategoryUrl">
    /// Main category URL, e.g. http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dappliances
    /// </param>
    /// <returns>
    /// One categoryItem per sub-category link (decoded Name, absolute decoded Url, empty Key).
    /// The list is empty when the expected nodes are not present in the page.
    /// </returns>
    public List<categoryItem> extractSubCategoryList(string amazonMainCategoryUrl)
    {
        List<categoryItem> subCategoryList = new List<categoryItem>();

        string respHtml = crl.getUrlRespHtml_multiTry(amazonMainCategoryUrl);

        // Expected page structure (abridged), taken from
        // http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dappliances
        //<div id="left">
        //   <div id="leftNav">
        //       <div id="leftNavContainer">
        //            <div id="refinements" data-baserh="n%3A2619525011" data-browseladder="n%3A2619525011">
        //                <h2 >Department</h2>
        //                <ul id="ref_2619526011" data-typeid="n" >
        //                    <li style="margin-left: 0px">
        //                        <strong>Appliances</strong>
        //                    </li>
        //                    <li style="margin-left: -2px">
        //                        <a href="/s/ref=lp_2619525011_nr_n_0?rh=n%3A2619525011%2Cn%3A%212619526011%2Cn%3A3737671&amp;bbn=2619526011&amp;ie=UTF8&amp;qid=1371180383&amp;rnid=2619526011">
        //                            <span class="refinementLink">Air Conditioners</span><span class="narrowValue">&nbsp;(8,704)</span>
        //                        </a>
        //                    </li>
        //            ............
        //                    <li style="margin-left: -2px">
        //                        <a href="/s/ref=lp_2619525011_nr_n_28?rh=...">
        //                             <span class="refinementLink">Washers &amp; Dryers</span><span class="narrowValue">&nbsp;(1,390)</span>
        //                        </a>
        //                    </li>
        //                </ul>

        HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(respHtml);
        HtmlNode refinementsNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='refinements' and @data-baserh and @data-browseladder]");
        if (refinementsNode == null)
        {
            // SelectSingleNode returns null when nothing matches; bail out instead of dereferencing it
            gLogger.Debug("can not find refinementsNode for " + amazonMainCategoryUrl);
            return subCategoryList;
        }

        // The leading "." makes the query relative to refinementsNode; a bare "//" would
        // search the whole document regardless of the node it is invoked on.
        HtmlNodeCollection subCategoryNodeList = refinementsNode.SelectNodes(".//ul[@id and @data-typeid]/li/a");
        if ((subCategoryNodeList == null) || (subCategoryNodeList.Count == 0))
        {
            //something wrong
            gLogger.Debug("can not find subCategoryNodeList");
            return subCategoryList;
        }

        foreach (HtmlNode subCatNode in subCategoryNodeList)
        {
            // The XPath above does not require an href, so skip anchors that lack one
            if (!subCatNode.Attributes.Contains("href"))
            {
                gLogger.Debug("sub category link without href for " + amazonMainCategoryUrl);
                continue;
            }

            // href is site-relative and HTML-encoded:
            // "/s/ref=lp_2619525011_nr_n_0?rh=n%3A2619525011...&amp;bbn=2619526011&amp;ie=UTF8..."
            string subCatUrl = subCatNode.Attributes["href"].Value;
            // -> "http://www.amazon.com/s/ref=lp_2619525011_nr_n_0?rh=...&bbn=2619526011&ie=UTF8..."
            subCatUrl = HttpUtility.HtmlDecode(constAmazonDomainUrl + subCatUrl);

            HtmlNode refinementLinkNode = subCatNode.SelectSingleNode("./span[@class='refinementLink']");
            if (refinementLinkNode == null)
            {
                //something wrong: link without the expected name span
                gLogger.Debug("can not find refinementLink span for sub category url " + subCatUrl);
                continue;
            }

            string subCatName = refinementLinkNode.InnerText.Trim();
            subCatName = HttpUtility.HtmlDecode(subCatName); //"Air Conditioners"

            //store info
            categoryItem singleSubCatItem = new categoryItem();
            singleSubCatItem.Name = subCatName;
            singleSubCatItem.Key = ""; // sub category no key
            singleSubCatItem.Url = subCatUrl;

            subCategoryList.Add(singleSubCatItem);
        }

        return subCategoryList;
    }
Ejemplo n.º 2
0
    /*
     * [Function]
     * from amazon main url extract main category
     * [Input]
     * http://www.amazon.com/ref=nb_sb_noss_null
     * [Output]
     * categoryItem list, contains 36 main category:
     * Key	"instant-video"	string
     * Name	"Amazon Instant Video"	string
     * ...
     * Key	"watches"	string
     * Name	"Watches"	string
     * [Note]
     */
    /// <summary>
    /// Downloads the Amazon main page and extracts the main categories from the
    /// search drop-down box (select#searchDropdownBox).
    /// </summary>
    /// <param name="amazonMainUrl">Amazon main URL, e.g. http://www.amazon.com/ref=nb_sb_noss_null</param>
    /// <returns>
    /// One categoryItem per option after the first one ("All Departments" in the sample page),
    /// with Key taken from the option value, Name from the option text and Url built by
    /// generateMainCategoryUrlFromCategoryKey. Empty when the drop-down cannot be found.
    /// </returns>
    public List<categoryItem> extractMainCategoryList(string amazonMainUrl)
    {
        List<categoryItem> mainCategoryList = new List<categoryItem>();

        string respHtml = crl.getUrlRespHtml_multiTry(amazonMainUrl);

        /*
        Expected page structure (abridged):
        <span id='nav-search-in' class='nav-sprite'>
          <span id='nav-search-in-content' data-value="search-alias=aps">
            All
          </span>
          <span class='nav-down-arrow nav-sprite'></span>
          <select name="url" id="searchDropdownBox" class="searchSelect" title="Search in"   >
            <option value="search-alias=aps" selected="selected">All Departments</option>
            <option value="search-alias=instant-video">Amazon Instant Video</option>
            <option value="search-alias=appliances">Appliances</option>
            <option value="search-alias=mobile-apps">Apps for Android</option>
            <option value="search-alias=arts-crafts">Arts, Crafts & Sewing</option>
            ...
            <option value="search-alias=videogames">Video Games</option>
            <option value="search-alias=watches">Watches</option>
          </select>
        </span>
         */
        HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(respHtml);
        HtmlNode categorySelectNode = htmlDoc.DocumentNode.SelectSingleNode("//span[@id='nav-search-in' and @class='nav-sprite']/select[@name='url' and @id='searchDropdownBox' and @class='searchSelect']");
        if (categorySelectNode == null)
        {
            //something wrong
            gLogger.Debug("can not find categorySelectNode for " + amazonMainUrl);
            return mainCategoryList;
        }

        // SelectNodes returns null (not an empty collection) when nothing matches
        HtmlNodeCollection optionNodeList = categorySelectNode.SelectNodes(".//option[@value]");
        if ((optionNodeList == null) || (optionNodeList.Count == 0))
        {
            gLogger.Debug("can not find any category option node for " + amazonMainUrl);
            return mainCategoryList;
        }

        // Start at index 1 to omit the first option:
        //<option value="search-alias=aps" selected="selected">All Departments</option>
        // Skipping by index avoids mutating the parsed node collection.
        for (int optionIdx = 1; optionIdx < optionNodeList.Count; optionIdx++)
        {
            HtmlNode singleOptionNode = optionNodeList[optionIdx];

            //<option value="search-alias=instant-video">Amazon Instant Video</option>
            //<option value="search-alias=appliances">Appliances</option>
            //...
            //<option value="search-alias=watches">Watches</option>
            string searchValue = singleOptionNode.Attributes["value"].Value; //search-alias=instant-video
            string categoryKey = ""; //instant-video
            if (crl.extractSingleStr(@"=([a-z\-]+)", searchValue, out categoryKey))
            {
                // NOTE(review): the option text is stored as-is (not HTML-decoded); confirm
                // whether names such as "Arts, Crafts &amp; Sewing" need decoding here.
                string generalCategory = singleOptionNode.InnerText; //Amazon Instant Video

                //store info
                categoryItem singleCategoryItem = new categoryItem();
                singleCategoryItem.Name = generalCategory;
                singleCategoryItem.Key = categoryKey;
                singleCategoryItem.Url = generateMainCategoryUrlFromCategoryKey(categoryKey);

                //add to list
                mainCategoryList.Add(singleCategoryItem);
            }
            else
            {
                //something wrong
                // log the node markup so the offending option can be identified
                gLogger.Debug(String.Format("can not extart main category key for html node {0} for {1}", singleOptionNode.OuterHtml, amazonMainUrl));
            }
        }

        return mainCategoryList;
    }
Ejemplo n.º 3
0
    /*
     * [Function]
     * from amazon Best Seller url extract category
     * [Input]
     * http://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_tab
     * http://www.amazon.com/Best-Sellers/zgbs
     * [Output]
     * categoryItem list, contains 35 main category:
     * ...
     * [Note]
     */
    /// <summary>
    /// Downloads the Amazon Best Sellers page and extracts the department categories
    /// listed under ul#zg_browseRoot.
    /// </summary>
    /// <param name="amazonBestSellerUrl">
    /// Best Sellers URL, e.g. http://www.amazon.com/Best-Sellers/zgbs or
    /// http://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_tab
    /// </param>
    /// <returns>
    /// One categoryItem per category link whose key can be extracted by
    /// extractCatKeyFromBestSellerCatUrl (decoded Name, Key, Url as found in the page).
    /// Empty when the category list is not present in the page.
    /// </returns>
    public List<categoryItem> extractBestSellerCategoryList(string amazonBestSellerUrl)
    {
        List<categoryItem> bestSellerCategoryList = new List<categoryItem>();

        // Expected page structure (abridged), from http://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_tab
          //<ul id="zg_browseRoot">
          //  <li>
          //   <span class="zg_selected"> Any Department</span>
          //  </li>
          //  <ul>
          //    <li><a href='http://www.amazon.com/Best-Sellers-Appliances/zgbs/appliances/ref=zg_bs_nav_0'>Appliances</a></li>
          //    <li><a href='http://www.amazon.com/Best-Sellers-Arts-Crafts-Sewing/zgbs/arts-crafts/ref=zg_bs_nav_0'>Arts, Crafts & Sewing</a></li>
          //    <li><a href='http://www.amazon.com/best-sellers-books-Amazon/zgbs/books/ref=zg_bs_nav_0'>Books</a></li>
          //    <li><a href='http://www.amazon.com/best-sellers-camera-photo/zgbs/photo/ref=zg_bs_nav_0'>Camera &amp; Photo</a></li>
          //    ...
          //    <li><a href='http://www.amazon.com/best-sellers-video-games/zgbs/videogames/ref=zg_bs_nav_0'>Video Games</a></li>
          //    <li><a href='http://www.amazon.com/Best-Sellers-Watches/zgbs/watches/ref=zg_bs_nav_0'>Watches</a></li>
          //  </ul>
          //</li></ul>

        string bestSellerHtml = crl.getUrlRespHtml_multiTry(amazonBestSellerUrl);

        HtmlDocument htmlDoc = crl.htmlToHtmlDoc(bestSellerHtml);
        HtmlNode rootNode = htmlDoc.DocumentNode;

        HtmlNode browseRootUlNode = rootNode.SelectSingleNode("//ul[@id='zg_browseRoot']/ul");
        if (browseRootUlNode == null)
        {
            //something wrong: category list container not found
            return bestSellerCategoryList;
        }

        // Category hrefs use both "Best-Sellers-" and "best-sellers-" prefixes (see sample
        // above), so only the domain part is matched.
        HtmlNodeCollection categoryNodeList = browseRootUlNode.SelectNodes(".//li/a[contains(@href, 'http://www.amazon.com/')]");
        if (categoryNodeList == null)
        {
            // SelectNodes returns null when nothing matches; iterating it would throw
            return bestSellerCategoryList;
        }

        foreach (HtmlNode categoryNode in categoryNodeList)
        {
            //<li><a href='http://www.amazon.com/best-sellers-camera-photo/zgbs/photo/ref=zg_bs_nav_0'>Camera &amp; Photo</a></li>
            // href is guaranteed to exist here because the XPath filters on @href
            string categoryUrl = categoryNode.Attributes["href"].Value; //"http://www.amazon.com/Best-Sellers-Appliances/zgbs/appliances/ref=zg_bs_nav_0"

            string categoryStr = HttpUtility.HtmlDecode(categoryNode.InnerText); //"Appliances"

            string categoryKey = "";
            if (!extractCatKeyFromBestSellerCatUrl(categoryUrl, out categoryKey))
            {
                //something wrong: no key in this url, skip the entry
                continue;
            }

            //store info
            categoryItem bestSellerCategoryItem = new categoryItem();
            bestSellerCategoryItem.Name = categoryStr; //"Appliances"
            bestSellerCategoryItem.Key = categoryKey; //"appliances"
            bestSellerCategoryItem.Url = categoryUrl; //"http://www.amazon.com/Best-Sellers-Appliances/zgbs/appliances/ref=zg_bs_nav_0"

            bestSellerCategoryList.Add(bestSellerCategoryItem);
        }

        return bestSellerCategoryList;
    }