/// <summary>
/// Scrapes Amazon search-result pages for the given keywords until
/// <paramref name="dataCount"/> results are collected or the last result page
/// (as reported by the first page's pagination) has been processed.
/// </summary>
/// <param name="keywords">Search keywords; joined with '+' to build the query URL.</param>
/// <param name="dataCount">Maximum number of results to collect.</param>
/// <returns>Scraped results in page order, at most <paramref name="dataCount"/> entries.</returns>
public async Task<List<AmazonData>> GetAmazonData(string keywords, int dataCount)
{
    var modelList = new List<AmazonData>();
    var config = Configuration.Default.WithDefaultLoader();
    var context = BrowsingContext.New(config);

    int pageNo = 1;
    int resultPosition = 1;
    int pageCount;

    do
    {
        var amazonURL = string.Format(Constant.AMAZON_URL, Common.JoinStringWithPlus(keywords), pageNo);
        var document = await context.OpenAsync(amazonURL);
        var searchDivElement = document.QuerySelector(Constant.SEARCH_MAIN_DIV_CSS_SELECTOR);
        pageCount = ProcessData.GetPageNo(searchDivElement);

        // Get only searched results, not "Amazon's Choice", editor's recommendations and so on.
        var searchResultElement = searchDivElement.Children
            .Where(x => x.LocalName == "div" && x.GetAttribute("data-component-type") == "s-search-result")
            .ToList();

        foreach (var item in searchResultElement)
        {
            var dataIndex = item.GetAttribute("data-index");
            var divNo = int.Parse(dataIndex) + 1;

            // A result is rendered either in a single row or in 4 columns;
            // the nested div.sg-row is only present in the single-row layout.
            bool displayInRow = item.QuerySelector("div.a-section.a-spacing-medium")
                                    .QuerySelector("div.sg-row") != null;

            var model = new AmazonData()
            {
                SearchResultPosition = resultPosition,
                Title = ProcessData.GetTitle(divNo, displayInRow, item),
                DescriptionURL = ProcessData.GetDescription(divNo, displayInRow, item)
            };
            modelList.Add(model);
            resultPosition++;

            // Direct return replaces the original `goto exit`; the removed
            // try/catch only rethrew unchanged, so dropping it preserves behavior.
            if (modelList.Count == dataCount)
            {
                return modelList;
            }
        }

        pageNo++;
    } while (pageNo <= pageCount);

    return modelList;
}
/// <summary>
/// Dispatches the task selected by <paramref name="num"/>. Only task 1
/// (high-rated-user extraction) is implemented; any other value — including
/// the default of 2 — performs no work.
/// </summary>
/// <param name="num">Task selector; 1 runs the extraction, anything else is a no-op.</param>
public static void Execute(int num = 2)
{
    if (num == 1)
    {
        AmazonData.GetHighRatedUser(
            @"D:\Data\Datasets\Amazon\amazon-books-ml-format.txt",
            @"D:\Data\Datasets\Amazon\amazon-books-r30.txt");
    }
}
/// <summary>
/// Pre-fetches the page after <paramref name="page"/> and stores it in the
/// memory cache so a subsequent request can be served without another AWS query.
/// </summary>
/// <param name="category">Resolved category name passed through to the query.</param>
/// <param name="searchString">Sanitized search keywords.</param>
/// <param name="page">Current page; the page cached is <c>page + 1</c>.</param>
/// <param name="perPage">Items per page.</param>
/// <param name="catID">Category ID used as part of the cache key.</param>
private void PreemptiveCache(String category, String searchString, int page, int perPage, int catID)
{
    // TODO: If cacher is already in progress when page is requested, wait for completion
    Debug.WriteLine("Async cache started " + DateTime.Now.ToLongTimeString());

    int nextPage = page + 1;
    ResponseContainer cacheThese = AmazonData.AWSQuery(searchString, category, nextPage, perPage);

    // Only successful responses are cached; error responses are dropped.
    if (cacheThese.error == null)
    {
        MemoryCache.Cache(cacheThese, nextPage, perPage, searchString, catID);
    }

    Debug.WriteLine("Async cache completed " + DateTime.Now.ToLongTimeString());
}
/// <summary>
/// Controller action for product query
/// </summary>
/// <returns>Query results as json.</returns>
/// <param name="catID">Category ID</param>
/// <param name="searchString">Search Keywords</param>
/// <param name="page">Starting page of query</param>
/// <param name="perPage">Items per page</param>
public String Json(int catID, String searchString, int page = 1, int perPage = 10)
{
    // Sanitize user inputs: clamp paging to the supported ranges and strip
    // everything except letters, digits, spaces, '|' from the keywords.
    String category = ProductCategories.ResolveCategory(catID);
    page = Math.Max(1, page);
    page = Math.Min(10, page);
    perPage = Math.Max(1, perPage);
    perPage = Math.Min(70, perPage);
    searchString = Regex.Replace(searchString, "[^a-z A-Z|0-9]", "");

    // Serve from cache when possible; otherwise query AWS and cache the result.
    ResponseContainer products = MemoryCache.Acquire(page, perPage, searchString, catID);
    if (products == null)
    {
        products = AmazonData.AWSQuery(searchString, category, page, perPage);
        // Try to cache current page
        if (products.error == null)
        {
            MemoryCache.Cache(products, page, perPage, searchString, catID);
        }
    }
    else
    {
        Debug.WriteLine("Retrieved page {0} from cache", page);
    }

    // Fire-and-forget pre-cache of the next page on a pool thread.
    // Delegate.BeginInvoke (used previously) is legacy APM and throws
    // PlatformNotSupportedException on .NET Core; Task.Run is the supported
    // equivalent. Fully qualified to avoid depending on an extra using.
    System.Threading.Tasks.Task.Run(() => PreemptiveCache(category, searchString, page, perPage, catID));

    // Return response
    JavaScriptSerializer jsSerializer = new JavaScriptSerializer();
    Response.AddHeader("Content-Type", "text/plain");
    Debug.WriteLine("query returned.");
    return jsSerializer.Serialize(products);
}
/// <summary>
/// Reads the file line by line, parses each line with <c>ParseLine</c>, and
/// groups the records into fixed-size batches.
/// </summary>
/// <param name="filePath">Input file path; environment variables are expanded.</param>
/// <param name="batchSize">Number of records per batch (default 1).</param>
/// <returns>
/// List of batches. The final batch is resized to the actual number of
/// remaining records (the previous version added it full-size, padded with
/// trailing nulls — a latent NullReferenceException for consumers).
/// </returns>
public static List<AmazonData[]> Parse(string filePath, int batchSize = 1)
{
    var batchList = new List<AmazonData[]>();
    var numTotalRecords = 0;

    using (StreamReader reader = new StreamReader(Environment.ExpandEnvironmentVariables(filePath)))
    {
        string line;
        int idx = 0;
        var batch = new AmazonData[batchSize];

        while ((line = reader.ReadLine()) != null)
        {
            batch[idx] = ParseLine(line);
            numTotalRecords++;
            idx++;
            if (idx == batchSize)
            {
                batchList.Add(batch);
                batch = new AmazonData[batchSize];
                idx = 0;
            }
        }

        if (idx > 0)
        {
            // Trim the final partial batch so it contains no null entries.
            Array.Resize(ref batch, idx);
            batchList.Add(batch); // Add remaining batch
        }
    }

    System.Console.WriteLine("{0} records have been loaded into {1} batches", numTotalRecords, batchList.Count);
    return batchList;
}
/// <summary>
/// Parses one CSV line of Amazon review data into an AmazonData record.
/// Fields may be wrapped in runs of double quotes (possibly nested); a
/// hand-rolled scanner encodes the quote-nesting depth in bit masks
/// (stackTot/stackTop) so commas inside quoted fields are not treated as
/// field separators.
/// </summary>
/// <param name="line">One CSV record with 9 fields in this order: reviewerID,
/// asin, reviewerName, helpful, text, label, summary, unixReviewTime,
/// reviewTime. Assumes the line is non-empty — TODO confirm callers guarantee
/// this (line[countSeqLen] would throw on "").</param>
/// <param name="readAll">When true, all 9 fields are extracted and parsed;
/// when false only the review text (field index 4) is kept and every other
/// property is defaulted ("" / -1).</param>
/// <returns>The populated AmazonData instance.</returns>
public static AmazonData ParseLine(string line, bool readAll = true)
{
    string[] tokens = new string[9];  // one slot per CSV field
    int fieldToAnalyzeStart = 0;      // index of the separator that opened the current field
    int fId = 0;                      // index of the field currently being scanned
    int currentField = 0;             // NOTE(review): incremented but never read — dead counter
    bool skipFieldSep = false;        // true while inside a quoted region
    int i = 0;
    int countSeqLen = 0;              // length of the current run of '"' characters
    int len = line.Length;
    // Quote-nesting state packed into bit masks; validity is checked with
    // Functions.IsContigBitmask below. Presumably each set bit corresponds to
    // an open quote-run length — TODO confirm against IsContigBitmask's contract.
    int stackTot = 0;
    int stackTop = 0;
    int stackBase = 0;

    // parse initial '"'
    while (line[countSeqLen] == '"')
    {
        countSeqLen++;
    }
    i = countSeqLen;
    stackTot = stackTop = countSeqLen;
    // If the line opens with quotes, the baseline nesting level is 1.
    stackBase = countSeqLen > 0 ? 1 : 0;

    while (i < len)
    {
        // Measure the next run of consecutive quotes (zero if current char isn't '"').
        countSeqLen = 0;
        while (i + countSeqLen < len && line[i + countSeqLen] == '"')
        {
            countSeqLen++;
        }
        i += countSeqLen;
        int prevQuoteSeqLen = countSeqLen;
        if (countSeqLen > 0)
        {
            // Update the packed quote stack for this run. The three branches
            // appear to handle: opening a deeper quote run, closing the top
            // run via its half-length form, and toggling an existing level —
            // NOTE(review): intent inferred from the arithmetic; not restyled
            // because exact bit behavior is load-bearing.
            if (countSeqLen > stackTot)
            {
                stackTot |= countSeqLen;
                stackTop = countSeqLen & (~countSeqLen >> 1);
            }
            else if (countSeqLen == stackTop / 2)
            {
                stackTot -= stackTop;
                stackTot -= countSeqLen;
                stackTop = countSeqLen / 2;
            }
            else
            {
                stackTot ^= countSeqLen;
                stackTop = stackTot & (~stackTot >> 1);
            }
            // A non-contiguous bit mask means the quote structure is
            // malformed; give up scanning and fall through to the tail logic.
            if (!Functions.IsContigBitmask(stackTot))
            {
                break;
            }
        }
        // A comma separates fields only when we are back at the baseline
        // nesting level (i.e. not inside quotes).
        skipFieldSep = stackTop > stackBase;
        if (!skipFieldSep && line[i] == ',')
        {
            // In text-only mode (readAll == false) only field 4 is extracted.
            if (readAll || fId == 4)
            {
                // Slice out the field body, trimming the surrounding quote run
                // on both ends plus the leading separator.
                tokens[fId] = line.Substring(fieldToAnalyzeStart + prevQuoteSeqLen + 1, i - fieldToAnalyzeStart - prevQuoteSeqLen * 2 - 1);
            }
            fId++;
            currentField++;
            fieldToAnalyzeStart = i;
        }
        i++;
    }

    // The last field has no trailing comma; grab the remainder of the line.
    if (tokens[8] == null)
    {
        tokens[fId] = line.Substring(fieldToAnalyzeStart + 1, line.Length - fieldToAnalyzeStart - 1);
    }

    if (readAll)
    {
        var amazonData = new AmazonData
        {
            ReviewerID = tokens[0],
            Asin = tokens[1],
            ReviewerName = tokens[2],
            Helpful = tokens[3],
            Text = tokens[4],
            Label = int.Parse(tokens[5]),
            Summary = tokens[6],
            UnixReviewTime = tokens[7],
            ReviewTime = int.Parse(tokens[8]),
        };
        return (amazonData);
    }
    else
    {
        // Text-only mode: everything except the review text is defaulted.
        var amazonData = new AmazonData
        {
            ReviewerID = "",
            Asin = "",
            ReviewerName = "",
            Helpful = "",
            Text = tokens[4],
            Label = -1,
            Summary = "",
            UnixReviewTime = "",
            ReviewTime = -1
        };
        return (amazonData);
    }
}
/// <summary>
/// Constructs the step class with its shared context object — presumably
/// supplied by the test framework's context injection; verify against the
/// test runner configuration.
/// </summary>
/// <param name="amazonData">Shared context instance; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="amazonData"/> is null.</exception>
public AmazonSearchSteps(AmazonData amazonData)
{
    // Fail fast on a misconfigured container instead of a later NullReferenceException.
    if (amazonData == null)
    {
        throw new ArgumentNullException(nameof(amazonData));
    }
    _amazonData = amazonData;
}
/// <summary>
/// Constructs the step class with its shared context object — presumably
/// supplied by the test framework's context injection; verify against the
/// test runner configuration.
/// </summary>
/// <param name="amazonData">Shared context instance; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="amazonData"/> is null.</exception>
public AmazonSearchStepsContextInjection(AmazonData amazonData)
{
    // Fail fast on a misconfigured container instead of a later NullReferenceException.
    if (amazonData == null)
    {
        throw new ArgumentNullException(nameof(amazonData));
    }
    _amazonData = amazonData;
}