/// <summary>
/// Click handler for the "open file" button. Clears the current input path and
/// the retained-columns list, checks that a delimiter and a quote character were
/// entered, asks the user to pick a CSV file, stores the chosen settings in a new
/// <c>Homer</c> and starts a background worker wired to the row-counting handlers.
/// </summary>
/// <param name="sender">The control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void OpenFileButton_Click(object sender, EventArgs e)
{
    // Reset whatever was loaded before.
    InputFileTextbox.Text = "";
    ColumnsToRetainCheckedListBox.Items.Clear();

    // Both the delimiter and the quote box must hold at least one character,
    // because their first character is read further down.
    bool delimiterEntered = DelimiterTextBox.TextLength >= 1;
    bool quoteEntered = QuoteTextBox.TextLength >= 1;
    if (!(delimiterEntered && quoteEntered))
    {
        MessageBox.Show("You must enter characters for your delimiter and quotes, respectively.", "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error);
        return;
    }

    using (var picker = new OpenFileDialog
    {
        Multiselect = false,
        CheckFileExists = true,
        CheckPathExists = true,
        ValidateNames = true,
        Title = "Please choose the CSV file that you would like to read",
        FileName = "Your Input File.csv",
        Filter = "Comma-Separated Values (CSV) File (*.csv)|*.csv"
    })
    {
        // Anything other than OK leaves the form with an empty input path.
        if (picker.ShowDialog() != DialogResult.OK)
        {
            InputFileTextbox.Text = "";
            return;
        }

        // Show the chosen path with the caret placed at its end.
        InputFileTextbox.Text = picker.FileName;
        InputFileTextbox.SelectionStart = InputFileTextbox.Text.Length;
        InputFileTextbox.SelectionLength = 0;
    }

    // Capture the file settings currently shown on the form.
    hoju = new Homer();
    hoju.InitializeFileDetails(
        fileIn: InputFileTextbox.Text,
        allowRepl: AllowReplacementsCheckbox.Checked,
        containsHead: ContainsHeaderCheckbox.Checked,
        fEncode: Encoding.GetEncoding(EncodingComboBox.SelectedItem.ToString()),
        quotechar: QuoteTextBox.Text[0],
        delimchar: DelimiterTextBox.Text[0]);

    var rowCounter = new BackgroundWorker { WorkerReportsProgress = true };
    rowCounter.DoWork += backgroundWorker_CountRows;
    rowCounter.ProgressChanged += backgroundWorker_CountRowsProgressChanged;
    rowCounter.RunWorkerCompleted += backgroundWorker_CountRowsRunWorkerCompleted;

    // Put the UI into its "busy" state before the worker starts.
    DisableControls();
    StartButton.Enabled = false;
    EnableProgBarNeverEnding();
    StatusLabel.Text = "Counting rows of data...";

    // Count on a background thread so the form stays responsive.
    rowCounter.RunWorkerAsync(hoju);
}
/// <summary>
/// Load handler for the form. Fills the subsampling-mode and encoding lists,
/// selects their initial entries, sets the window title from the executing
/// assembly's version, and puts the remaining controls and the two form-level
/// fields (<c>theDealer</c>, <c>hoju</c>) into their starting state.
/// </summary>
/// <param name="sender">The form raising the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void RoyalSamplerForm_Load(object sender, EventArgs e)
{
    DisableProgBar();

    // Subsampling modes; the first entry is the initial selection.
    string[] samplingModes =
    {
        "Split File into Chunks",
        "Sample by Range",
        "Randomized Subsampling"
    };
    foreach (string mode in samplingModes)
    {
        SubsamplingModeComboBox.Items.Add(mode);
    }
    SubsamplingModeComboBox.SelectedItem = samplingModes[0];

    // Offer every encoding the runtime reports.
    foreach (EncodingInfo info in Encoding.GetEncodings())
    {
        EncodingComboBox.Items.Add(info.Name);
    }

    // Select UTF-8; if anything in that lookup throws, fall back to the
    // body name of the default encoding.
    try
    {
        string utf8BodyName = Encoding.GetEncoding("utf-8").BodyName;
        EncodingComboBox.SelectedIndex = EncodingComboBox.FindStringExact(utf8BodyName);
    }
    catch
    {
        string defaultBodyName = Encoding.Default.BodyName;
        EncodingComboBox.SelectedIndex = EncodingComboBox.FindStringExact(defaultBodyName);
    }

    var assemblyVersion = System.Reflection.Assembly.GetExecutingAssembly().GetName().Version;
    this.Text = string.Concat("Royal Sampler v", assemblyVersion.ToString(), ", by Ryan L. Boyd");

    // Initial CSV settings.
    DelimiterTextBox.Text = ",";
    QuoteTextBox.Text = "\"";
    ContainsHeaderCheckbox.Checked = true;

    // Initial sampling settings.
    NumberOfFilesToCreateTextBox.Text = "5";
    NumberOfFilesToCreateTextBox.MaxLength = 10;
    NumberOfSamplesPerFileTextBox.Text = "";
    NumberOfSamplesPerFileTextBox.MaxLength = 10;

    InputFileTextbox.Select();
    InputFileTextbox.Enabled = false;

    // Progress bar starts at zero on a 0..100 scale and is disabled.
    MainProgressBar.Minimum = 0;
    MainProgressBar.Maximum = 100;
    MainProgressBar.Value = 0;
    MainProgressBar.Step = 1;
    MainProgressBar.Enabled = false;

    AllowReplacementsCheckbox.Checked = true;
    ChangeCancelToStartButton();

    theDealer = new BackgroundWorker();
    hoju = new Homer();
}
/// <summary>
/// DoWork handler that copies the input CSV into consecutive output files named
/// "subsampleNN.csv" in the output location, starting a new file every
/// <c>homer.rowsPerSample</c> rows. When the input has a header, the cleaned
/// header row is written at the top of every output file.
/// </summary>
/// <remarks>
/// Changes from the previous implementation:
/// the output writer is now released in a <c>finally</c> block, so an exception
/// in the middle of the copy no longer leaves the current output file open;
/// a failure while flushing the last file is reported through the error dialog
/// instead of being silently swallowed; the random generator (and the
/// <c>int.Parse</c> of the seed that could throw) was removed because this mode
/// never used it; and rows are cleaned with <c>homer.retainedIndices</c>, the
/// same object <c>numCols</c> is taken from, rather than the form-level field.
/// </remarks>
/// <param name="sender">The <see cref="BackgroundWorker"/> running this handler.</param>
/// <param name="e">
/// <c>e.Argument</c> must be the <c>Homer</c> describing the job.
/// <c>e.Result</c> is set to "Cancelled" on cancellation or on any error.
/// </param>
private void backgroundWorker_SplitIntoChunks(object sender, System.ComponentModel.DoWorkEventArgs e)
{
    Homer homer = (Homer)e.Argument;
    BackgroundWorker worker = (BackgroundWorker)sender;

    string quoteString = homer.GetQuote().ToString();
    string escapedQuoteString = quoteString + quoteString;
    int numCols = homer.retainedIndices.Count;

    // Single place where a parsed row is turned into its output text.
    Func<IEnumerable<string>, string> formatRow = cells => RowCleaner.CleanRow(
        cells.ToArray<string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);

    try
    {
        using (var fileStreamIn = File.OpenRead(homer.GetInputFile()))
        using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding()))
        {
            if (homer.HasHeader())
            {
                var csvDat = CsvParser.ParseHeadAndTail(streamReader, homer.GetDelim(), homer.GetQuote());
                string headerRowText = formatRow(csvDat.Item1);
                WriteChunkFiles(csvDat.Item2, headerRowText, formatRow, homer, worker, e);
            }
            else
            {
                var csvDat = CsvParser.Parse(streamReader, homer.GetDelim(), homer.GetQuote());
                WriteChunkFiles(csvDat, null, formatRow, homer, worker, e);
            }
        }
    }
    catch
    {
        MessageBox.Show("There was an error in writing your output file(s). This often occurs when your output file is already open in another application.", "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error);
        e.Result = "Cancelled";
    }
}

/// <summary>
/// Writes <paramref name="rows"/> into numbered chunk files, rotating to a new
/// file after every <c>homer.rowsPerSample</c> rows and always releasing the
/// file that is open when the method exits.
/// </summary>
/// <param name="rows">Data rows to copy, in input order.</param>
/// <param name="headerRowText">Text written first in every file, or null for no header.</param>
/// <param name="formatRow">Converts one parsed row into its output text.</param>
/// <param name="homer">Job settings (output location, encoding, chunk size, row count).</param>
/// <param name="worker">Worker used for progress reports and cancellation checks.</param>
/// <param name="e">Receives "Cancelled" in <c>Result</c> when the user cancels.</param>
private static void WriteChunkFiles<TRow>(
    IEnumerable<TRow> rows,
    string headerRowText,
    Func<IEnumerable<string>, string> formatRow,
    Homer homer,
    BackgroundWorker worker,
    DoWorkEventArgs e)
    where TRow : IEnumerable<string>
{
    // Zero-pad file numbers to the number of digits in numberOfSamples.
    string filenamePadding = "D" + homer.numberOfSamples.ToString().Length.ToString();

    ulong sampleNumber = 0;
    ulong rowsInCurrentFile = 0;
    ulong rowsWrittenTotal = 0;
    FileStream fileStreamOut = null;
    StreamWriter streamWriter = null;

    try
    {
        foreach (TRow line in rows)
        {
            // Open the next output file lazily, only once there is a row to put in it.
            if (rowsInCurrentFile == 0)
            {
                string filenameOut = Path.Combine(homer.GetOutputLocation(), "subsample" + (sampleNumber + 1).ToString(filenamePadding) + ".csv");
                fileStreamOut = new FileStream(filenameOut, FileMode.Create, FileAccess.Write, FileShare.None);
                streamWriter = new StreamWriter(fileStreamOut, homer.GetEncoding());
                if (headerRowText != null)
                {
                    streamWriter.Write(headerRowText);
                }
            }

            streamWriter.Write(formatRow(line));
            rowsInCurrentFile++;
            rowsWrittenTotal++;

            // Chunk is full: close it so the next row starts a new file.
            if (rowsInCurrentFile == homer.rowsPerSample)
            {
                rowsInCurrentFile = 0;
                sampleNumber++;
                streamWriter.Dispose(); // also closes the underlying file stream
                streamWriter = null;
                fileStreamOut = null;
            }

            if (rowsWrittenTotal % 1000 == 0)
            {
                // Scaled by 10000 exactly as before; presumably matches the range
                // the progress handler expects -- confirm against that handler.
                int pctDone = (int)Math.Round((((double)rowsWrittenTotal / homer.GetRowCount()) * 10000), 0, MidpointRounding.AwayFromZero);
                worker.ReportProgress(pctDone);

                if (worker.CancellationPending)
                {
                    e.Result = "Cancelled";
                    break;
                }
            }
        }
    }
    finally
    {
        // Release whatever is still open (partial last chunk, cancellation, or error).
        if (streamWriter != null)
        {
            streamWriter.Dispose();
        }
        else if (fileStreamOut != null)
        {
            fileStreamOut.Dispose();
        }
    }
}
/// <summary>
/// DoWork handler that draws up to <c>homer.numberOfSamples</c> disjoint random
/// samples of <c>homer.rowsPerSample</c> rows each. Row numbers 1..N (N being
/// <c>homer.GetRowCount()</c>) are put in random order once; each sample takes the
/// next slice of that order and is written to its own "subsampleNN.csv" file
/// (prefixed with the seed and an underscore when a seed string is set) by
/// re-reading the input and keeping the selected rows in input order.
/// </summary>
/// <remarks>
/// Changes from the previous implementation:
/// sampling now stops as soon as the shuffled row numbers are used up (the old
/// <c>&gt;</c> comparison let one extra, header-only file be produced after a
/// full scan of the input when the rows ran out exactly on a sample boundary);
/// a sample's scan ends once all of its selected rows are written, which also
/// covers a shorter final sample; the unused <c>actualSamplesToBeWritten</c>
/// computation was removed; and rows are cleaned with
/// <c>homer.retainedIndices</c>, the same object <c>numCols</c> is taken from.
/// </remarks>
/// <param name="sender">The <see cref="BackgroundWorker"/> running this handler.</param>
/// <param name="e">
/// <c>e.Argument</c> must be the <c>Homer</c> describing the job.
/// <c>e.Result</c> is set to "Cancelled" on cancellation or on any error.
/// </param>
private void backgroundWorker_SubSampleWithoutReplacement(object sender, System.ComponentModel.DoWorkEventArgs e)
{
    Homer homer = (Homer)e.Argument;
    BackgroundWorker worker = (BackgroundWorker)sender;

    // A seed string, when present, makes the draw repeatable.
    Random random = String.IsNullOrEmpty(homer.randSeedString)
        ? new Random()
        : new Random(int.Parse(homer.randSeedString));

    string filenamePadding = "D" + homer.numberOfSamples.ToString().Length.ToString();
    string filenamePrefix = String.IsNullOrEmpty(homer.randSeedString) ? "" : homer.randSeedString + "_";
    string quoteString = homer.GetQuote().ToString();
    string escapedQuoteString = quoteString + quoteString;
    int numCols = homer.retainedIndices.Count;
    int pctDone = 0;

    // Single place where a parsed row is turned into its output text.
    Func<IEnumerable<string>, string> formatRow = cells => RowCleaner.CleanRow(
        cells.ToArray<string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);

    // Row numbers 1..N ordered by a random sort key (same draw sequence as before,
    // so a given seed still produces the same samples).
    ulong[] rowsToSample = new ulong[homer.GetRowCount()];
    ulong totalRows = (ulong)rowsToSample.Length;
    for (ulong i = 0; i < totalRows; i++)
    {
        rowsToSample[i] = i + 1;
    }
    rowsToSample = rowsToSample.OrderBy(x => random.NextLong()).ToArray<ulong>();

    // The timer periodically pushes the latest pctDone to the worker's progress event.
    TimeSpan reportPeriod = TimeSpan.FromMinutes(0.01);
    using (new System.Threading.Timer(_ => worker.ReportProgress(pctDone), null, reportPeriod, reportPeriod))
    {
        for (ulong sampleNumber = 0; sampleNumber < homer.numberOfSamples; sampleNumber++)
        {
            if (worker.CancellationPending)
            {
                e.Result = "Cancelled";
                break;
            }

            ulong skipToVal = sampleNumber * homer.rowsPerSample;
            if (skipToVal >= totalRows)
            {
                // Every row has already been handed to an earlier sample.
                break;
            }

            // The last sample may be shorter than rowsPerSample.
            ulong takeVal = homer.rowsPerSample;
            if (skipToVal + takeVal > totalRows)
            {
                takeVal = totalRows - skipToVal;
            }

            var rowsToKeep = new HashSet<ulong>(rowsToSample.Skip((int)skipToVal).Take((int)takeVal));

            string filenameOut = Path.Combine(
                homer.GetOutputLocation(),
                filenamePrefix + "subsample" + (sampleNumber + 1).ToString(filenamePadding) + ".csv");

            try
            {
                using (FileStream fileStreamOut = new FileStream(filenameOut, FileMode.Create, FileAccess.Write, FileShare.None))
                using (StreamWriter streamWriter = new StreamWriter(fileStreamOut, homer.GetEncoding()))
                using (var fileStreamIn = File.OpenRead(homer.GetInputFile()))
                using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding()))
                {
                    if (homer.HasHeader())
                    {
                        var csvDat = CsvParser.ParseHeadAndTail(streamReader, homer.GetDelim(), homer.GetQuote());
                        streamWriter.Write(formatRow(csvDat.Item1));
                        WriteRowsWithoutReplacement(csvDat.Item2, rowsToKeep, streamWriter, formatRow, homer, sampleNumber, worker, e, ref pctDone);
                    }
                    else
                    {
                        var csvDat = CsvParser.Parse(streamReader, homer.GetDelim(), homer.GetQuote());
                        WriteRowsWithoutReplacement(csvDat, rowsToKeep, streamWriter, formatRow, homer, sampleNumber, worker, e, ref pctDone);
                    }
                }
            }
            catch
            {
                MessageBox.Show(genericProcessingError, "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error);
                e.Result = "Cancelled";
                return;
            }
        }
    }
}

/// <summary>
/// Scans <paramref name="rows"/> once, numbering them from 1, and writes every
/// row whose number is in <paramref name="rowsToKeep"/>. Stops early when all
/// selected rows have been written or when cancellation is requested.
/// </summary>
/// <param name="rows">Data rows of the input file, in file order.</param>
/// <param name="rowsToKeep">1-based row numbers that belong to this sample.</param>
/// <param name="streamWriter">Destination for the selected rows.</param>
/// <param name="formatRow">Converts one parsed row into its output text.</param>
/// <param name="homer">Job settings passed through to <c>calcPctDone</c>.</param>
/// <param name="sampleNumber">Zero-based index of the sample being written.</param>
/// <param name="worker">Worker polled for cancellation.</param>
/// <param name="e">Receives "Cancelled" in <c>Result</c> when the user cancels.</param>
/// <param name="pctDone">Progress value read by the caller's reporting timer.</param>
private void WriteRowsWithoutReplacement<TRow>(
    IEnumerable<TRow> rows,
    HashSet<ulong> rowsToKeep,
    StreamWriter streamWriter,
    Func<IEnumerable<string>, string> formatRow,
    Homer homer,
    ulong sampleNumber,
    BackgroundWorker worker,
    DoWorkEventArgs e,
    ref int pctDone)
    where TRow : IEnumerable<string>
{
    ulong rowsWanted = (ulong)rowsToKeep.Count;
    ulong rowsWritten = 0;
    ulong rowNumber = 0;

    foreach (TRow line in rows)
    {
        rowNumber++;

        // Refresh progress and poll for cancellation every 1000 input rows.
        if (rowNumber % 1000 == 0)
        {
            pctDone = calcPctDone(rowsWritten, homer.rowsPerSample, sampleNumber, homer.numberOfSamples);
            if (worker.CancellationPending)
            {
                e.Result = "Cancelled";
                break;
            }
        }

        if (rowsToKeep.Contains(rowNumber))
        {
            streamWriter.Write(formatRow(line));
            rowsWritten++;
            if (rowsWritten == rowsWanted)
            {
                // Nothing left to find in the rest of the file.
                break;
            }
        }
    }
}
/// <summary>
/// DoWork handler that copies the data rows numbered <c>homer.startRow</c> through
/// <c>homer.endRow</c> (1-based, inclusive) from the input CSV into the single
/// file at <c>homer.GetOutputLocation()</c>. When the input has a header, the
/// cleaned header row is written first.
/// </summary>
/// <remarks>
/// Changes from the previous implementation:
/// the two duplicated read loops were merged into one helper; the scan stops as
/// soon as the row number reaches or passes <c>endRow</c> (the old equality test
/// never fired for an <c>endRow</c> of 0, which led to a full scan and a division
/// by zero in the progress calculation); and rows are cleaned with
/// <c>homer.retainedIndices</c>, the same object <c>numCols</c> is taken from.
/// </remarks>
/// <param name="sender">The <see cref="BackgroundWorker"/> running this handler.</param>
/// <param name="e">
/// <c>e.Argument</c> must be the <c>Homer</c> describing the job.
/// <c>e.Result</c> is set to "Cancelled" on cancellation or on any error.
/// </param>
private void backgroundWorker_TargetedSubsampling(object sender, System.ComponentModel.DoWorkEventArgs e)
{
    Homer homer = (Homer)e.Argument;
    BackgroundWorker worker = (BackgroundWorker)sender;

    string quoteString = homer.GetQuote().ToString();
    string escapedQuoteString = quoteString + quoteString;
    int numCols = homer.retainedIndices.Count;

    // Single place where a parsed row is turned into its output text.
    Func<IEnumerable<string>, string> formatRow = cells => RowCleaner.CleanRow(
        cells.ToArray<string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);

    try
    {
        using (FileStream fileStreamOut = new FileStream(homer.GetOutputLocation(), FileMode.Create, FileAccess.Write, FileShare.None))
        using (StreamWriter streamWriter = new StreamWriter(fileStreamOut, homer.GetEncoding()))
        {
            bool hasHeader = homer.HasHeader();

            // NOTE(review): the header path has always opened the input with
            // FileShare.ReadWrite while the no-header path used File.OpenRead
            // (FileShare.Read). Both modes are kept as they were; confirm whether
            // the difference is intentional.
            FileShare inputShare = hasHeader ? FileShare.ReadWrite : FileShare.Read;

            using (var fileStreamIn = new FileStream(homer.GetInputFile(), FileMode.Open, FileAccess.Read, inputShare))
            using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding()))
            {
                if (hasHeader)
                {
                    var csvDat = CsvParser.ParseHeadAndTail(streamReader, homer.GetDelim(), homer.GetQuote());
                    streamWriter.Write(formatRow(csvDat.Item1));
                    WriteRowRange(csvDat.Item2, streamWriter, formatRow, homer, worker, e);
                }
                else
                {
                    var csvDat = CsvParser.Parse(streamReader, homer.GetDelim(), homer.GetQuote());
                    WriteRowRange(csvDat, streamWriter, formatRow, homer, worker, e);
                }
            }
        }
    }
    catch
    {
        MessageBox.Show(genericProcessingError, "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error);
        e.Result = "Cancelled";
    }
}

/// <summary>
/// Scans <paramref name="rows"/>, numbering them from 1, and writes those whose
/// number lies in [<c>homer.startRow</c>, <c>homer.endRow</c>]. Reading stops once
/// <c>endRow</c> has been reached or when cancellation is requested.
/// </summary>
/// <param name="rows">Data rows of the input file, in file order.</param>
/// <param name="streamWriter">Destination for the selected rows.</param>
/// <param name="formatRow">Converts one parsed row into its output text.</param>
/// <param name="homer">Job settings supplying <c>startRow</c> and <c>endRow</c>.</param>
/// <param name="worker">Worker used for progress reports and cancellation checks.</param>
/// <param name="e">Receives "Cancelled" in <c>Result</c> when the user cancels.</param>
private static void WriteRowRange<TRow>(
    IEnumerable<TRow> rows,
    StreamWriter streamWriter,
    Func<IEnumerable<string>, string> formatRow,
    Homer homer,
    BackgroundWorker worker,
    DoWorkEventArgs e)
    where TRow : IEnumerable<string>
{
    ulong rowNumber = 0;

    foreach (TRow line in rows)
    {
        rowNumber++;

        if (rowNumber % 1000 == 0)
        {
            if (worker.CancellationPending)
            {
                e.Result = "Cancelled";
                break;
            }

            // Progress is measured against endRow because nothing past it is read.
            // Scaled by 10000 exactly as before; presumably matches the range the
            // progress handler expects -- confirm against that handler.
            int pctDone = (int)Math.Round((((double)rowNumber / homer.endRow) * 10000), 0, MidpointRounding.AwayFromZero);
            worker.ReportProgress(pctDone);
        }

        if (rowNumber >= homer.startRow && rowNumber <= homer.endRow)
        {
            streamWriter.Write(formatRow(line));
        }

        if (rowNumber >= homer.endRow)
        {
            break;
        }
    }
}
/// <summary>
/// DoWork handler that writes <c>homer.numberOfSamples</c> files, each holding
/// <c>homer.rowsPerSample</c> rows drawn at random with replacement. For every
/// sample the draws are tallied per row number, then the input is read once and
/// each drawn row is written as many times as it was drawn, in input order.
/// Files are named "subsampleNN.csv" (prefixed with the seed and an underscore
/// when a seed string is set).
/// </summary>
/// <remarks>
/// Changes from the previous implementation:
/// the two duplicated read loops were merged into one helper; the draw tallies
/// use <c>TryGetValue</c> instead of a <c>ContainsKey</c> check followed by
/// indexer lookups; and rows are cleaned with <c>homer.retainedIndices</c>, the
/// same object <c>numCols</c> is taken from. The sequence of random draws is
/// unchanged, so a given seed still produces the same samples.
/// </remarks>
/// <param name="sender">The <see cref="BackgroundWorker"/> running this handler.</param>
/// <param name="e">
/// <c>e.Argument</c> must be the <c>Homer</c> describing the job.
/// <c>e.Result</c> is set to "Cancelled" on cancellation or on any error.
/// </param>
private void backgroundWorker_SubSampleWithReplacement(object sender, System.ComponentModel.DoWorkEventArgs e)
{
    Homer homer = (Homer)e.Argument;
    BackgroundWorker worker = (BackgroundWorker)sender;

    // A seed string, when present, makes the draw repeatable.
    Random random = String.IsNullOrEmpty(homer.randSeedString)
        ? new Random()
        : new Random(int.Parse(homer.randSeedString));

    string filenamePadding = "D" + homer.numberOfSamples.ToString().Length.ToString();
    string filenamePrefix = String.IsNullOrEmpty(homer.randSeedString) ? "" : homer.randSeedString + "_";
    string quoteString = homer.GetQuote().ToString();
    string escapedQuoteString = quoteString + quoteString;
    int numCols = homer.retainedIndices.Count;
    int pctDone = 0;

    // Single place where a parsed row is turned into its output text.
    Func<IEnumerable<string>, string> formatRow = cells => RowCleaner.CleanRow(
        cells.ToArray<string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);

    // The timer periodically pushes the latest pctDone to the worker's progress event.
    TimeSpan reportPeriod = TimeSpan.FromMinutes(0.01);
    using (new System.Threading.Timer(_ => worker.ReportProgress(pctDone), null, reportPeriod, reportPeriod))
    {
        for (ulong sampleNumber = 0; sampleNumber < homer.numberOfSamples; sampleNumber++)
        {
            // Stop producing samples if the user cancelled from the form.
            if (worker.CancellationPending)
            {
                e.Result = "Cancelled";
                break;
            }

            // Tally how many times each row number is drawn for this sample.
            // NOTE(review): NextLong is defined elsewhere; if its upper bound is
            // exclusive the last row can never be drawn -- confirm.
            var rowsToSample = new Dictionary<ulong, int>();
            for (ulong drawsMade = 0; drawsMade < homer.rowsPerSample; drawsMade++)
            {
                ulong randomDraw = random.NextLong(1, homer.GetRowCount());
                int timesDrawn;
                rowsToSample.TryGetValue(randomDraw, out timesDrawn); // leaves 0 when the row is new
                rowsToSample[randomDraw] = timesDrawn + 1;
            }

            string filenameOut = Path.Combine(
                homer.GetOutputLocation(),
                filenamePrefix + "subsample" + (sampleNumber + 1).ToString(filenamePadding) + ".csv");

            try
            {
                using (FileStream fileStreamOut = new FileStream(filenameOut, FileMode.Create, FileAccess.Write, FileShare.None))
                using (StreamWriter streamWriter = new StreamWriter(fileStreamOut, homer.GetEncoding()))
                using (var fileStreamIn = File.OpenRead(homer.GetInputFile()))
                using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding()))
                {
                    if (homer.HasHeader())
                    {
                        var csvDat = CsvParser.ParseHeadAndTail(streamReader, homer.GetDelim(), homer.GetQuote());
                        streamWriter.Write(formatRow(csvDat.Item1));
                        WriteRowsWithReplacement(csvDat.Item2, rowsToSample, streamWriter, formatRow, homer, sampleNumber, worker, e, ref pctDone);
                    }
                    else
                    {
                        var csvDat = CsvParser.Parse(streamReader, homer.GetDelim(), homer.GetQuote());
                        WriteRowsWithReplacement(csvDat, rowsToSample, streamWriter, formatRow, homer, sampleNumber, worker, e, ref pctDone);
                    }
                }
            }
            catch
            {
                MessageBox.Show(genericProcessingError, "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error);
                e.Result = "Cancelled";
                return;
            }
        }
    }
}

/// <summary>
/// Scans <paramref name="rows"/> once, numbering them from 1, and writes each row
/// found in <paramref name="rowsToSample"/> as many times as its tally says.
/// Stops early once <c>homer.rowsPerSample</c> rows have been written or when
/// cancellation is requested.
/// </summary>
/// <param name="rows">Data rows of the input file, in file order.</param>
/// <param name="rowsToSample">Maps a 1-based row number to the number of times it was drawn.</param>
/// <param name="streamWriter">Destination for the sampled rows.</param>
/// <param name="formatRow">Converts one parsed row into its output text.</param>
/// <param name="homer">Job settings supplying <c>rowsPerSample</c> and <c>numberOfSamples</c>.</param>
/// <param name="sampleNumber">Zero-based index of the sample being written.</param>
/// <param name="worker">Worker polled for cancellation.</param>
/// <param name="e">Receives "Cancelled" in <c>Result</c> when the user cancels.</param>
/// <param name="pctDone">Progress value read by the caller's reporting timer.</param>
private void WriteRowsWithReplacement<TRow>(
    IEnumerable<TRow> rows,
    Dictionary<ulong, int> rowsToSample,
    StreamWriter streamWriter,
    Func<IEnumerable<string>, string> formatRow,
    Homer homer,
    ulong sampleNumber,
    BackgroundWorker worker,
    DoWorkEventArgs e,
    ref int pctDone)
    where TRow : IEnumerable<string>
{
    ulong rowsWritten = 0;
    ulong rowNumber = 0;

    foreach (TRow line in rows)
    {
        rowNumber++;

        // Refresh progress and poll for cancellation every 1000 input rows.
        if (rowNumber % 1000 == 0)
        {
            pctDone = calcPctDone(rowsWritten, homer.rowsPerSample, sampleNumber, homer.numberOfSamples);
            if (worker.CancellationPending)
            {
                e.Result = "Cancelled";
                break;
            }
        }

        int drawCount;
        if (rowsToSample.TryGetValue(rowNumber, out drawCount))
        {
            // Clean the row once, then repeat it for every time it was drawn.
            string rowToWriteString = formatRow(line);
            for (int numDraws = 0; numDraws < drawCount; numDraws++)
            {
                streamWriter.Write(rowToWriteString);
            }

            rowsWritten += (ulong)drawCount;
            if (rowsWritten == homer.rowsPerSample)
            {
                break;
            }
        }
    }
}