/// <summary>
/// Scrapes Amazon search result pages for <paramref name="keywords"/> and collects the
/// organic search results (title and description URL) in the order they are encountered.
/// </summary>
/// <param name="keywords">Search keywords; passed through <c>Common.JoinStringWithPlus</c> to build the URL.</param>
/// <param name="dataCount">
/// Maximum number of results to collect. A non-positive value imposes no limit, so every
/// page reported by <c>ProcessData.GetPageNo</c> is read.
/// </param>
/// <returns>
/// The collected results, numbered from 1 in <c>SearchResultPosition</c>. The list may hold
/// fewer than <paramref name="dataCount"/> entries when the pages run out or when a page
/// does not contain the main search container.
/// </returns>
public async Task<List<AmazonData>> GetAmazonData(string keywords, int dataCount)
        {
            int pageNo         = 1;
            int resultPosition = 1;
            int pageCount      = 0;
            var modelList      = new List<AmazonData>();
            var config         = Configuration.Default.WithDefaultLoader();
            var context        = BrowsingContext.New(config);

            do
            {
                var amazonURL = string.Format(Constant.AMAZON_URL, Common.JoinStringWithPlus(keywords), pageNo);
                var document  = await context.OpenAsync(amazonURL);

                var searchDivElement = document.QuerySelector(Constant.SEARCH_MAIN_DIV_CSS_SELECTOR);
                if (searchDivElement == null)
                {
                    // The page has no main search container, so there is nothing to read
                    // here; stop and return what has been collected so far instead of
                    // failing with a NullReferenceException.
                    break;
                }

                pageCount = ProcessData.GetPageNo(searchDivElement);

                // Keep only real search results, not Amazon's Choice, Editor's recommendations and so on.
                var searchResultElements = searchDivElement.Children
                                           .Where(x => x.LocalName == "div" && x.GetAttribute("data-component-type") == "s-search-result")
                                           .ToList();

                foreach (var item in searchResultElements)
                {
                    int dataIndex;
                    if (!int.TryParse(item.GetAttribute("data-index"), out dataIndex))
                    {
                        // Without a numeric data-index the 1-based div number cannot be derived; skip the entry.
                        continue;
                    }
                    var divNo = dataIndex + 1;

                    // A result is laid out either as a single row or as one of 4 columns.
                    // Only the single-row layout has a "div.sg-row" inside the medium-spacing
                    // section; a missing section is treated as the column layout.
                    var rowElement   = item.QuerySelector("div.a-section.a-spacing-medium")?.QuerySelector("div.sg-row");
                    bool displayInRow = rowElement != null;

                    var title          = ProcessData.GetTitle(divNo, displayInRow, item);
                    var descriptionURL = ProcessData.GetDescription(divNo, displayInRow, item);

                    modelList.Add(new AmazonData()
                    {
                        SearchResultPosition = resultPosition,
                        Title          = title,
                        DescriptionURL = descriptionURL
                    });
                    resultPosition++;

                    // The count grows by one per iteration, so this fires exactly when the
                    // requested amount is reached (and never for a non-positive dataCount).
                    if (modelList.Count == dataCount)
                    {
                        return modelList;
                    }
                }
                pageNo++;
            } while (pageNo <= pageCount);

            return modelList;
        }
 /// <summary>
 /// Runs the data-preparation step selected by <paramref name="num"/>.
 /// </summary>
 /// <param name="num">
 /// Step selector. Only the value 1 has work attached; any other value, including the
 /// default of 2, does nothing.
 /// </param>
 public static void Execute(int num = 2)
 {
     if (num != 1)
     {
         return;
     }

     // Step 1: invoke GetHighRatedUser with the two hard-coded dataset paths.
     AmazonData.GetHighRatedUser(@"D:\Data\Datasets\Amazon\amazon-books-ml-format.txt",
                                 @"D:\Data\Datasets\Amazon\amazon-books-r30.txt");
 }
Example #3
0
        /// <summary>
        /// Queries the page that follows <paramref name="page"/> and, when the query reports
        /// no error, stores the response in the memory cache under that next page number.
        /// </summary>
        /// <param name="category">Category name passed to the query.</param>
        /// <param name="searchString">Search keywords.</param>
        /// <param name="page">Page that was just requested; the page after it is fetched.</param>
        /// <param name="perPage">Items per page.</param>
        /// <param name="catID">Category ID used as part of the cache key.</param>
        private void PreemptiveCache(String category, String searchString, int page, int perPage, int catID)
        {
            // TODO: If cacher is already in progress when page is requested, wait for completion
            int nextPage = page + 1;

            Debug.WriteLine("Async cache started " + DateTime.Now.ToLongTimeString());

            ResponseContainer prefetched = AmazonData.AWSQuery(searchString, category, nextPage, perPage);
            bool querySucceeded = prefetched.error == null;

            // Only successful responses are worth caching.
            if (querySucceeded)
            {
                MemoryCache.Cache(prefetched, nextPage, perPage, searchString, catID);
            }

            Debug.WriteLine("Async cache completed " + DateTime.Now.ToLongTimeString());
        }
Example #4
0
        /// <summary>
        /// Controller action for product query. Serves the requested page from the memory
        /// cache when available, otherwise queries it and caches a successful response, then
        /// starts a background prefetch of the following page.
        /// </summary>
        /// <returns>Query results as json.</returns>
        /// <param name="catID">Category ID</param>
        /// <param name="searchString">Search Keywords; <c>null</c> is treated as an empty string.</param>
        /// <param name="page">Starting page of query; clamped to the range 1..10.</param>
        /// <param name="perPage">Items per page; clamped to the range 1..70.</param>
        public String Json(int catID, String searchString, int page = 1, int perPage = 10)
        {
            const int maxPage    = 10;
            const int maxPerPage = 70;

            // Sanitize user inputs
            String category = ProductCategories.ResolveCategory(catID);

            page    = Math.Min(maxPage, Math.Max(1, page));
            perPage = Math.Min(maxPerPage, Math.Max(1, perPage));

            // Regex.Replace throws on a null input, so a missing query parameter is
            // normalised to an empty string first.
            // NOTE(review): the character class also lets '|' through - confirm that is intended.
            searchString = Regex.Replace(searchString ?? String.Empty, "[^a-z A-Z|0-9]", "");

            // Check if is cached
            ResponseContainer products = MemoryCache.Acquire(page, perPage, searchString, catID);

            if (products == null)
            {
                products = AmazonData.AWSQuery(searchString, category, page, perPage);

                // Try to cache current page
                if (products.error == null)
                {
                    MemoryCache.Cache(products, page, perPage, searchString, catID);
                }
            }
            else
            {
                Debug.WriteLine("Retrieved page {0} from cache", page);
            }

            // Start new async thread to cache next page. Skipped on the last allowed page:
            // the next page could never be requested through this action, because page is
            // clamped to maxPage above.
            if (page < maxPage)
            {
                Action <String, String, int, int, int> cacheDelegate = PreemptiveCache;

                cacheDelegate.BeginInvoke(category, searchString, page, perPage, catID, cacheDelegate.EndInvoke, null);
            }

            // Return response
            JavaScriptSerializer jsSerializer = new JavaScriptSerializer();

            Response.AddHeader("Content-Type", "text/plain");
            Debug.WriteLine("query returned.");
            return(jsSerializer.Serialize(products));
        }
Example #5
0
        /// <summary>
        /// Reads <paramref name="filePath"/> line by line, converts each line with
        /// <c>ParseLine</c> and groups the records into arrays of <paramref name="batchSize"/>.
        /// </summary>
        /// <param name="filePath">Path of the input file; environment variables in it are expanded.</param>
        /// <param name="batchSize">Number of records per batch; must be at least 1.</param>
        /// <returns>
        /// The batches in file order. Every array has length <paramref name="batchSize"/>; when
        /// the record count is not a multiple of it, the unused trailing slots of the last
        /// array are left <c>null</c>.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is less than 1.</exception>
        public static List <AmazonData[]> Parse(string filePath, int batchSize = 1)
        {
            // A zero or negative size used to surface as an index or overflow error deep in
            // the read loop; reject it up front with a descriptive argument error instead.
            if (batchSize < 1)
            {
                throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize, "Batch size must be at least 1.");
            }

            var batchList       = new List <AmazonData[]>();
            var numTotalRecords = 0;

            using (StreamReader reader = new StreamReader(Environment.ExpandEnvironmentVariables(filePath)))
            {
                string line;
                int    idx = 0;

                var batch = new AmazonData[batchSize];
                while ((line = reader.ReadLine()) != null)
                {
                    batch[idx] = ParseLine(line);
                    numTotalRecords++;
                    idx++;

                    if (idx == batchSize)
                    {
                        // Batch is full: hand it over and start a fresh one.
                        batchList.Add(batch);
                        batch = new AmazonData[batchSize];
                        idx   = 0;
                    }
                }

                // idx is reset whenever a batch fills up, so a non-zero value here means a
                // partially filled final batch is still pending.
                if (idx > 0)
                {
                    batchList.Add(batch); // Add remaining batch
                }
            }

            System.Console.WriteLine("{0} records have been loaded into {1} batches", numTotalRecords, batchList.Count);
            return(batchList);
        }
Example #6
0
        /// <summary>
        /// Splits one comma-separated record into up to nine fields and builds an
        /// <c>AmazonData</c> from them. Runs of double-quote characters are tracked so that
        /// commas are only treated as field separators while the quote state allows it.
        /// </summary>
        /// <param name="line">The raw record text.</param>
        /// <param name="readAll">
        /// When true every field is extracted and fields 5 and 8 are parsed as integers.
        /// When false only field 4 is extracted (stored in <c>Text</c>) and the remaining
        /// members get empty-string / -1 placeholders.
        /// </param>
        /// <returns>The populated <c>AmazonData</c> instance.</returns>
        public static AmazonData ParseLine(string line, bool readAll = true)
        {
            // One slot per expected field (indices 0..8).
            string[] tokens = new string[9];

            // Index where the field currently being scanned starts; after the first
            // separator this is the index of the preceding comma.
            int fieldToAnalyzeStart = 0;
            // Index of the next slot to fill in tokens.
            int fId = 0;

            // Incremented per separator but never read inside this method.
            int  currentField = 0;
            bool skipFieldSep = false;
            int  i            = 0;
            int  countSeqLen  = 0;
            int  len          = line.Length;

            // Bit-level bookkeeping of the quote-run lengths seen so far.
            // NOTE(review): presumably models a stack of nested quoting levels - confirm
            // against the data format and Functions.IsContigBitmask.
            int stackTot  = 0;
            int stackTop  = 0;
            int stackBase = 0;

            // Count the run of '"' characters at the very start of the line.
            // NOTE(review): there is no bounds check here, so an empty line or a line made
            // only of quotes would index past the end - confirm callers never pass one.
            while (line[countSeqLen] == '"')
            {
                countSeqLen++;
            }
            i = countSeqLen;

            // Seed the quote state with the length of the leading run; stackBase is 1 when
            // the line starts with at least one quote, otherwise 0.
            stackTot  = stackTop = countSeqLen;
            stackBase = countSeqLen > 0 ? 1 : 0;

            while (i < len)
            {
                // Measure the run of consecutive quotes starting at i and step over it.
                countSeqLen = 0;
                while (i + countSeqLen < len && line[i + countSeqLen] == '"')
                {
                    countSeqLen++;
                }
                i += countSeqLen;

                // Remembered so the quotes around a field can be trimmed off below.
                int prevQuoteSeqLen = countSeqLen;
                if (countSeqLen > 0)
                {
                    // Fold the run length into the quote state; which of the three updates
                    // applies depends on how the run compares with the current state.
                    if (countSeqLen > stackTot)
                    {
                        stackTot |= countSeqLen;
                        stackTop  = countSeqLen & (~countSeqLen >> 1);
                    }
                    else if (countSeqLen == stackTop / 2)
                    {
                        stackTot -= stackTop;
                        stackTot -= countSeqLen;
                        stackTop  = countSeqLen / 2;
                    }
                    else
                    {
                        stackTot ^= countSeqLen;
                        stackTop  = stackTot & (~stackTot >> 1);
                    }
                    // Stop scanning as soon as the helper rejects the accumulated state.
                    if (!Functions.IsContigBitmask(stackTot))
                    {
                        break;
                    }
                }
                // A comma is ignored as a separator while stackTop is above the base level.
                skipFieldSep = stackTop > stackBase;

                // NOTE(review): i can equal len here when the line ends in a quote run; if
                // skipFieldSep is false at that point, line[i] would be out of range - confirm.
                if (!skipFieldSep && line[i] == ',')
                {
                    if (readAll || fId == 4)
                    {
                        // Take the text between the field start and this comma, dropping
                        // prevQuoteSeqLen characters from each side.
                        // NOTE(review): for the first field fieldToAnalyzeStart is 0 rather
                        // than a comma index, so the +1 offset appears to skip one extra
                        // character - confirm against the data format.
                        tokens[fId] = line.Substring(fieldToAnalyzeStart + prevQuoteSeqLen + 1,
                                                     i - fieldToAnalyzeStart - prevQuoteSeqLen * 2 - 1);
                    }
                    fId++;

                    currentField++;
                    fieldToAnalyzeStart = i;
                }
                i++;
            }

            // The last field has no trailing comma: if slot 8 is still empty, store the
            // remainder of the line (after the last separator) in the current slot.
            if (tokens[8] == null)
            {
                tokens[fId] = line.Substring(fieldToAnalyzeStart + 1, line.Length - fieldToAnalyzeStart - 1);
            }

            if (readAll)
            {
                // Full record: map each slot onto its member; slots 5 and 8 must be integers.
                var amazonData = new AmazonData
                {
                    ReviewerID     = tokens[0],
                    Asin           = tokens[1],
                    ReviewerName   = tokens[2],
                    Helpful        = tokens[3],
                    Text           = tokens[4],
                    Label          = int.Parse(tokens[5]),
                    Summary        = tokens[6],
                    UnixReviewTime = tokens[7],
                    ReviewTime     = int.Parse(tokens[8]),
                };
                return(amazonData);
            }
            else
            {
                // Text-only record: everything except Text gets a placeholder value.
                var amazonData = new AmazonData
                {
                    ReviewerID     = "",
                    Asin           = "",
                    ReviewerName   = "",
                    Helpful        = "",
                    Text           = tokens[4],
                    Label          = -1,
                    Summary        = "",
                    UnixReviewTime = "",
                    ReviewTime     = -1
                };
                return(amazonData);
            }
        }
Example #7
0
 /// <summary>
 /// Creates the step class and keeps the supplied <see cref="AmazonData"/> instance
 /// for use by the steps.
 /// </summary>
 /// <param name="amazonData">Data object stored in the <c>_amazonData</c> field.</param>
 public AmazonSearchSteps(AmazonData amazonData)
 {
     this._amazonData = amazonData;
 }
 /// <summary>
 /// Creates the step class and keeps the supplied <see cref="AmazonData"/> instance
 /// for use by the steps.
 /// </summary>
 /// <param name="amazonData">Data object stored in the <c>_amazonData</c> field.</param>
 public AmazonSearchStepsContextInjection(AmazonData amazonData)
 {
     this._amazonData = amazonData;
 }