/// <summary>
/// Reads the input CSV and buffers the record found at the requested position,
/// paired with its scraper output (looked up online unless offline mode is active).
/// The number of records enumerated is stored as the total record count.
/// </summary>
/// <param name="theRowIndex">Zero-based position of the record within the CSV data rows</param>
private void LoadRow(int theRowIndex)
{
    int rowsSeen = 0;

    using (var fileReader = new StreamReader(InputPath))
    using (var csvReader = new CsvReader(fileReader))
    {
        try
        {
            csvReader.Configuration.RegisterClassMap<InputMap>();

            // Every record is enumerated (not only the requested one) so that the
            // running tally can be published as the total once the loop ends.
            foreach (CSVRow current in csvReader.GetRecords<CSVRow>())
            {
                if (rowsSeen == theRowIndex)
                {
                    Console.WriteLine(string.Format("Scraping for index: {0}: {1}", theRowIndex.ToString(), current.full_upc));

                    // Blank placeholder; replaced by a live lookup when not offline
                    ScraperOutput scraped = new ScraperOutput();
                    scraped.desc = "";
                    scraped.upc = current.full_upc;

                    if (!isOffline)
                    {
                        scraped = ScrapeWeb(current.full_upc, current);
                    }

                    Console.WriteLine(string.Format("ADD {0} {1}", current.full_upc.ToString(), recordBuffer.Count));
                    recordBuffer.Add(Tuple.Create(current, scraped));
                }

                rowsSeen++;
            }
        }
        catch (HeaderValidationException headerError)
        {
            // Record the failure in errorState instead of letting it propagate
            Console.WriteLine("ERROR: " + headerError.Message);
            errorState = Tuple.Create(true, headerError.Message);
        }

        totalRecordsCount = rowsSeen;
    }
}
/// <summary>
/// Goes back one index, unused.
/// Fills the form fields from the row returned by the web worker's GetLastRow()
/// and then decrements the worker's row index.
/// </summary>
private void GetLastEntry()
{
    // Only the CSV half of the pair is shown on the form
    CSVRow previousRow = webWorker.GetLastRow().Item1;

    // Original values from the input file
    txtBrand.Text = previousRow.brand2;
    txtDesc.Text = previousRow.desc2;
    txtSignDesc.Text = previousRow.sign;
    txtUPC.Text = previousRow.full_upc;

    // Editable values
    txtNewBrand.Text = previousRow.brand;
    txtNewDescription.Text = previousRow.desc;
    txtNotes.Text = "oops";

    // Crucial to make sure we update this
    webWorker.rowIndex--;
}
/// <summary>
/// Builds the scraper output returned when a web lookup fails and updates the offline flag.
/// </summary>
/// <param name="upc">UPC the failed lookup was for</param>
/// <param name="switchOffline">
/// Value assigned to the offline flag; when true the fixed connection-failure description is used
/// </param>
/// <param name="msg">Detail appended to the description when <paramref name="switchOffline"/> is false</param>
/// <returns>Output carrying the UPC and an error description</returns>
private ScraperOutput HandleWebError(string upc, bool switchOffline = false, string msg = "")
{
    string description = switchOffline
        ? "Internet connection failed, switching to Offline mode"
        : "Web Error: " + msg;

    ScraperOutput failure = new ScraperOutput();
    failure.desc = description;
    failure.upc = upc;

    // Assigned unconditionally, so passing false also clears the offline flag
    isOffline = switchOffline;

    return failure;
}
/// <summary>
/// Loads the next item in the Record Buffer into the user application.
/// Stops early when the web worker has reported an error, and clears the form when
/// the buffer is empty. Otherwise shows the first buffered record: the UPC is
/// normalised, the original and suggested brand/description fields are filled in,
/// and the position label is refreshed.
/// </summary>
private void LoadNextEntry()
{
    // If the web worker ran into an error, stop the progress
    if (webWorker.errorState.Item1)
    {
        LogText("[ERROR] " + webWorker.errorState.Item2);
        return;
    }

    // Catch the end condition
    if (webWorker.recordBuffer.Count == 0)
    {
        LogText("No more records left to populate!");
        lblcsvIndex.Text = "All finished! Check the output file location.";
        txtBrand.Text = "";
        txtDesc.Text = "";
        txtSignDesc.Text = "";
        txtUPC.Text = "";
        txtNewBrand.Text = "";
        txtNewDescription.Text = "";
        txtNotes.Text = "";
        return;
    }

    // Get the next entry in the buffer
    Tuple<CSVRow, ScraperOutput> entry = webWorker.recordBuffer[0];
    CSVRow row = entry.Item1;
    ScraperOutput output = entry.Item2;

    // Number of digits the displayed UPC is left-padded to
    const int upcDisplayLength = 14;

    // Makes sure that we convert all scientific notation back to real numbers.
    // TryParse is used so a blank or malformed cell does not throw out of the UI handler.
    string upc;
    decimal numericUpc;
    if (decimal.TryParse(row.full_upc,
                         System.Globalization.NumberStyles.Any,
                         System.Globalization.CultureInfo.CurrentCulture,
                         out numericUpc))
    {
        // Make sure we didn't lose any leading zeros
        upc = numericUpc.ToString().PadLeft(upcDisplayLength, '0');
    }
    else
    {
        // Show the raw text so the user can see and correct the bad value
        upc = (row.full_upc ?? "").Trim();
        LogText("[WARN] Could not read UPC as a number: '" + upc + "'");
    }

    // Remember this entry's brand and UPC
    lastBrand = row.brand2;
    lastUPC = upc;

    txtBrand.Text = row.brand2;
    txtDesc.Text = row.desc2;
    // A sign value of exactly one space falls back to the pos value
    txtSignDesc.Text = row.sign != " " ? row.sign : row.pos;
    txtUPC.Text = upc;

    // Fill the new textboxes with the best data of the bunch
    txtNewBrand.Text = string.IsNullOrEmpty(row.brand) ? row.brand2 : row.brand;
    txtNewDescription.Text = string.IsNullOrEmpty(row.desc) ? output.desc : row.desc;
    txtNotes.Text = row.notes;

    // Run the formatting logic
    txtNewBrand.Text = toTitlecase(txtNewBrand.Text);
    txtNewDescription.Text = DescriptionFormat(txtNewDescription.Text);

    // The scraper reports a missing product as the literal text "null"
    if (string.Equals(txtNewDescription.Text, "null", StringComparison.OrdinalIgnoreCase))
    {
        txtNewDescription.Text = "";
    }

    btnNext.Enabled = true;
    btnSkip.Enabled = true;

    if (chkGoogleUPC.Checked)
    {
        // Escape the UPC so an unparsed raw value cannot break the query string
        System.Diagnostics.Process.Start("https://www.google.com/search?q=" + Uri.EscapeDataString(txtUPC.Text));
    }

    // Update the label letting the user know where they are
    // NOTE(review): the +2 presumably maps the index to a spreadsheet row (header + 1-based) - confirm
    var itemIndex = webWorker.GetCurrentRowIndex();
    lblcsvIndex.Text = "Item #" + itemIndex.ToString()
        + " of " + webWorker.totalRecordsCount.ToString()
        + " - Row #" + (itemIndex + 2).ToString();
}
/// <summary>
/// Scrapes the internet for product data for a given UPC and its matching CSV row
/// </summary>
/// <param name="upc">UPC to search for</param>
/// <param name="record">CSV record from the input file</param>
/// <returns>
/// Output whose description is the formatted product name; the literal text "null" when
/// the page has no usable product entry; or the result of HandleWebError when the request
/// fails (offline mode is only entered when no HTTP response was received at all).
/// </returns>
private ScraperOutput ScrapeWeb(string upc, CSVRow record)
{
    ScraperOutput output;

    // Go to the website for that UPC
    string urlAddress = "https://www.barcodelookup.com/" + upc;

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);

        // Dispose the response (and its stream) so the connection is released after every lookup
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                // If we are connected to the internet but the query fails, we try again next time
                string err = "Web error code: " + response.StatusCode.ToString();
                Console.WriteLine(err);
                return HandleWebError(upc, false, err);
            }

            string data;
            using (Stream receiveStream = response.GetResponseStream())
            {
                // An absent or empty charset falls back to the reader's default encoding
                StreamReader readStream = string.IsNullOrEmpty(response.CharacterSet)
                    ? new StreamReader(receiveStream)
                    : new StreamReader(receiveStream, Encoding.GetEncoding(response.CharacterSet));

                using (readStream)
                {
                    data = readStream.ReadToEnd();
                }
            }

            // The product name is in the page's meta tag
            string startStr = "<meta name=\"description\" content=\"Barcode Lookup provides info on EAN ";
            string endStr = "\">";
            int startIndex = data.IndexOf(startStr, StringComparison.Ordinal);

            // Check for invalid data
            int failIndex = data.IndexOf("<meta name=\"description\" content=\"This barcode doesn't exist in our database. Please search for another barcode in the search box", StringComparison.Ordinal);
            int badCodeIndex = data.IndexOf("\"This barcode number is not valid", StringComparison.Ordinal);

            // Skip past the opening marker and look for the end of the tag;
            // only meaningful when the opening marker was found
            int contentStart = startIndex + startStr.Length;
            int endIndex = startIndex == -1
                ? -1
                : data.IndexOf(endStr, contentStart, StringComparison.Ordinal);

            // Catch the scenarios where the website fails to provide a clear product entry,
            // including a truncated page where the meta tag is never closed
            if (failIndex != -1 || badCodeIndex != -1 || startIndex == -1 || endIndex == -1)
            {
                output = new ScraperOutput();
                output.desc = "null";
                output.upc = upc;
                return output;
            }

            // Grab the website's product name which consists of the EAN barcode and the name
            string eanAndName = data.Substring(contentStart, endIndex - contentStart);

            // Apply the formatting logic to turn the title string into something user friendly
            string name = formatProductDescription(eanAndName, record);

            // Format and return the output
            output = new ScraperOutput();
            output.desc = name;
            output.upc = upc;
            return output;
        }
    }
    catch (WebException e)
    {
        Console.WriteLine("Error code: " + e.Message);

        // GetResponse() throws for non-success HTTP statuses. When a response is attached
        // the server was reachable, so report a web error instead of going offline.
        using (HttpWebResponse errorResponse = e.Response as HttpWebResponse)
        {
            if (errorResponse != null)
            {
                string err = "Web error code: " + errorResponse.StatusCode.ToString();
                return HandleWebError(upc, false, err);
            }
        }

        // No response at all: unable to connect to the internet, go offline
        return HandleWebError(upc, true);
    }
    catch (Exception e)
    {
        // Any other failure is treated as a lost connection and switches to offline mode
        Console.WriteLine("Error code: " + e.Message);
        return HandleWebError(upc, true);
    }
}