private void backgroundWorker_SplitIntoChunks(object sender, System.ComponentModel.DoWorkEventArgs e) { Homer homer = (Homer)e.Argument; Random random = new Random(); if (!String.IsNullOrEmpty(homer.randSeedString)) { random = new Random(int.Parse(homer.randSeedString)); } string filenamePadding = "D" + homer.numberOfSamples.ToString().Length.ToString(); string quoteString = homer.GetQuote().ToString(); string escapedQuoteString = homer.GetQuote().ToString() + homer.GetQuote().ToString(); int numCols = homer.retainedIndices.Count; try { if (homer.HasHeader()) { using (var fileStreamIn = File.OpenRead(homer.GetInputFile())) using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding())) { var csvDat = CsvParser.ParseHeadAndTail(streamReader, homer.GetDelim(), homer.GetQuote()); string[] headerRow; headerRow = csvDat.Item1.ToArray <string>(); string headerRowToWriteString = RowCleaner.CleanRow(headerRow, homer.GetDelim(), quoteString, escapedQuoteString, numCols, hoju.retainedIndices); ulong sampleNumber = 0; ulong rowsWritten = 0; ulong rowsWrittenTotal = 0; FileStream fileStreamOut = null; StreamWriter streamWriter = null; foreach (var line in csvDat.Item2) { //open up a new file to write out if (rowsWritten == 0) { string filenameOut = Path.Combine(homer.GetOutputLocation(), "subsample" + (sampleNumber + 1).ToString(filenamePadding) + ".csv"); fileStreamOut = new FileStream(filenameOut, FileMode.Create, FileAccess.Write, FileShare.None); streamWriter = new StreamWriter(fileStreamOut, homer.GetEncoding()); streamWriter.Write(headerRowToWriteString); } string rowToWriteString = RowCleaner.CleanRow(line.ToArray <string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, hoju.retainedIndices); streamWriter.Write(rowToWriteString); rowsWritten++; rowsWrittenTotal++; if (rowsWritten == homer.rowsPerSample) { rowsWritten = 0; sampleNumber++; streamWriter.Close(); streamWriter.Dispose(); fileStreamOut.Close(); fileStreamOut.Dispose(); 
} ; if (rowsWrittenTotal % 1000 == 0) { int pctDone = (int)Math.Round((((double)rowsWrittenTotal / homer.GetRowCount()) * 10000), 0, MidpointRounding.AwayFromZero); (sender as BackgroundWorker).ReportProgress(pctDone); if ((sender as BackgroundWorker).CancellationPending) { streamWriter.Close(); streamWriter.Dispose(); fileStreamOut.Close(); fileStreamOut.Dispose(); e.Result = "Cancelled"; break; } } } //everything has been written, so now we just close up shop try { streamWriter.Close(); streamWriter.Dispose(); fileStreamOut.Close(); fileStreamOut.Dispose(); } catch { } } } else { using (var fileStreamIn = File.OpenRead(homer.GetInputFile())) using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding())) { var csvDat = CsvParser.Parse(streamReader, homer.GetDelim(), homer.GetQuote()); ulong sampleNumber = 0; ulong rowsWritten = 0; ulong rowsWrittenTotal = 0; FileStream fileStreamOut = null; StreamWriter streamWriter = null; foreach (var line in csvDat) { //open up a new file to write out if (rowsWritten == 0) { string filenameOut = Path.Combine(homer.GetOutputLocation(), "subsample" + (sampleNumber + 1).ToString(filenamePadding) + ".csv"); fileStreamOut = new FileStream(filenameOut, FileMode.Create, FileAccess.Write, FileShare.None); streamWriter = new StreamWriter(fileStreamOut, homer.GetEncoding()); } string rowToWriteString = RowCleaner.CleanRow(line.ToArray <string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, hoju.retainedIndices); streamWriter.Write(rowToWriteString); rowsWritten++; rowsWrittenTotal++; if (rowsWritten == homer.rowsPerSample) { rowsWritten = 0; sampleNumber++; streamWriter.Close(); streamWriter.Dispose(); fileStreamOut.Close(); fileStreamOut.Dispose(); } ; if (rowsWrittenTotal % 1000 == 0) { int pctDone = (int)Math.Round((((double)rowsWrittenTotal / homer.GetRowCount()) * 10000), 0, MidpointRounding.AwayFromZero); (sender as BackgroundWorker).ReportProgress(pctDone); if ((sender as 
BackgroundWorker).CancellationPending) { streamWriter.Close(); streamWriter.Dispose(); fileStreamOut.Close(); fileStreamOut.Dispose(); e.Result = "Cancelled"; break; } } } //everything has been written, so now we just close up shop try { streamWriter.Close(); streamWriter.Dispose(); fileStreamOut.Close(); fileStreamOut.Dispose(); } catch { } } } } catch { MessageBox.Show("There was an error in writing your output file(s). This often occurs when your output file is already open in another application.", "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error); e.Result = "Cancelled"; return; } return; }
/// <summary>
/// BackgroundWorker handler that draws homer.numberOfSamples random subsamples
/// *without replacement*: all 1-based row numbers are shuffled once, and each sample
/// takes the next disjoint slice of homer.rowsPerSample row numbers. Each sample is
/// written to its own "subsampleN.csv" (seed-prefixed when a seed was supplied).
/// Progress is pushed by a short-period timer reading pctDone; cancellation is polled
/// between samples and every 1000 input rows. On cancellation or a write failure,
/// e.Result is set to "Cancelled".
/// </summary>
private void backgroundWorker_SubSampleWithoutReplacement(object sender, System.ComponentModel.DoWorkEventArgs e)
{
    Homer homer = (Homer)e.Argument;

    Random random = new Random();
    if (!String.IsNullOrEmpty(homer.randSeedString))
    {
        random = new Random(int.Parse(homer.randSeedString));
    }

    // Zero-pad sample numbers to the width of the total sample count.
    string filenamePadding = "D" + homer.numberOfSamples.ToString().Length.ToString();
    string quoteString = homer.GetQuote().ToString();
    string escapedQuoteString = homer.GetQuote().ToString() + homer.GetQuote().ToString();
    int numCols = homer.retainedIndices.Count;
    int pctDone = 0;

    // NOTE(review): the original computed an `actualSamplesToBeWritten` value here
    // (with a suspicious "* 100" factor) that was never read anywhere; that dead
    // code has been removed.

    HashSet<ulong> rowsToKeep;
    ulong[] rowsToSample = new ulong[homer.GetRowCount()];

    #region Randomize order of sample
    // Fill with 1-based row numbers, then shuffle once; consecutive slices of this
    // array are the disjoint per-sample row sets (hence "without replacement").
    for (ulong i = 0; i < homer.GetRowCount(); i++)
    {
        rowsToSample[i] = i + 1;
    }
    rowsToSample = rowsToSample.OrderBy(x => random.NextLong()).ToArray<ulong>();
    #endregion

    //this is our outermost block within the bgworker: the timer that we use to report progress
    TimeSpan reportPeriod = TimeSpan.FromMinutes(0.01);
    using (new System.Threading.Timer(
        _ => (sender as BackgroundWorker).ReportProgress(pctDone),
        null, reportPeriod, reportPeriod))
    {
        for (ulong sampleNumber = 0; sampleNumber < homer.numberOfSamples; sampleNumber++)
        {
            // Break out of this method if the user cancels from the form.
            if ((sender as BackgroundWorker).CancellationPending)
            {
                e.Result = "Cancelled";
                break;
            }

            // Slice out this sample's share of the shuffled row numbers, clamping
            // the final (possibly short) sample to the end of the array.
            ulong skipToVal = (sampleNumber * homer.rowsPerSample);
            ulong takeVal = homer.rowsPerSample;
            if (skipToVal > homer.GetRowCount())
            {
                break;
            }
            if (skipToVal + takeVal > (ulong)rowsToSample.Length)
            {
                takeVal = (ulong)rowsToSample.Length - skipToVal;
            }
            // NOTE(review): the (int) casts cap this path at int.MaxValue rows —
            // TODO confirm that is acceptable for expected input sizes.
            ulong[] subsample = rowsToSample.Skip((int)skipToVal).Take((int)takeVal).ToArray();
            rowsToKeep = subsample.ToHashSet<ulong>();

            #region Get Busy Writin' or Get Busy Dyin'
            ulong rowsWritten = 0;

            //first we need to open up our output filename
            string filenameOut;
            if (String.IsNullOrEmpty(homer.randSeedString))
            {
                filenameOut = Path.Combine(homer.GetOutputLocation(), "subsample" + (sampleNumber + 1).ToString(filenamePadding) + ".csv");
            }
            else
            {
                // Seed-stamped filename so runs with different seeds don't collide.
                filenameOut = Path.Combine(homer.GetOutputLocation(), homer.randSeedString + "_subsample" + (sampleNumber + 1).ToString(filenamePadding) + ".csv");
            }

            try
            {
                using (FileStream fileStreamOut = new FileStream(filenameOut, FileMode.Create, FileAccess.Write, FileShare.None))
                using (StreamWriter streamWriter = new StreamWriter(fileStreamOut, homer.GetEncoding()))
                {
                    if (homer.HasHeader())
                    {
                        string[] headerRow;
                        using (var fileStreamIn = File.OpenRead(homer.GetInputFile()))
                        using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding()))
                        {
                            var csvDat = CsvParser.ParseHeadAndTail(streamReader, homer.GetDelim(), homer.GetQuote());
                            headerRow = csvDat.Item1.ToArray<string>();

                            //write the header row
                            // BUGFIX: was `hoju.retainedIndices` (unresolved
                            // identifier); `homer` is the configured argument.
                            streamWriter.Write(RowCleaner.CleanRow(headerRow, homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices));

                            ulong rowNumber = 0;
                            string rowToWriteString;
                            foreach (var line in csvDat.Item2)
                            {
                                rowNumber++;
                                // Every 1000 rows: refresh progress and poll cancel.
                                if (rowNumber % 1000 == 0)
                                {
                                    pctDone = calcPctDone(rowsWritten, homer.rowsPerSample, sampleNumber, homer.numberOfSamples);
                                    if ((sender as BackgroundWorker).CancellationPending)
                                    {
                                        e.Result = "Cancelled";
                                        break;
                                    }
                                }
                                if (rowsToKeep.Contains(rowNumber))
                                {
                                    // BUGFIX: was `hoju.retainedIndices`.
                                    rowToWriteString = RowCleaner.CleanRow(line.ToArray<string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);
                                    streamWriter.Write(rowToWriteString);
                                    rowsWritten++;
                                    // Sample complete — stop scanning the input early.
                                    if (rowsWritten == homer.rowsPerSample)
                                    {
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        using (var fileStreamIn = File.OpenRead(homer.GetInputFile()))
                        using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding()))
                        {
                            var csvDat = CsvParser.Parse(streamReader, homer.GetDelim(), homer.GetQuote());
                            ulong rowNumber = 0;
                            foreach (var line in csvDat)
                            {
                                rowNumber++;
                                if (rowNumber % 1000 == 0)
                                {
                                    pctDone = calcPctDone(rowsWritten, homer.rowsPerSample, sampleNumber, homer.numberOfSamples);
                                    if ((sender as BackgroundWorker).CancellationPending)
                                    {
                                        e.Result = "Cancelled";
                                        break;
                                    }
                                }
                                if (rowsToKeep.Contains(rowNumber))
                                {
                                    // BUGFIX: was `hoju.retainedIndices`.
                                    string rowToWriteString = RowCleaner.CleanRow(line.ToArray<string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);
                                    streamWriter.Write(rowToWriteString);
                                    rowsWritten++;
                                    if (rowsWritten == homer.rowsPerSample)
                                    {
                                        break;
                                    }
                                }
                            }
                        }
                    }
                }
            }
            catch
            {
                MessageBox.Show(genericProcessingError, "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error);
                e.Result = "Cancelled";
                return;
            }
            #endregion
        }
    }
    return;
}
/// <summary>
/// BackgroundWorker handler that draws homer.numberOfSamples random subsamples
/// *with replacement*: for each sample it draws homer.rowsPerSample row numbers
/// (duplicates allowed, tracked as draw counts), then writes each drawn row as many
/// times as it was drawn to "subsampleN.csv" (seed-prefixed when a seed was given).
/// Progress is pushed by a short-period timer reading pctDone; cancellation is
/// polled between samples and every 1000 input rows. On cancellation or a write
/// failure, e.Result is set to "Cancelled".
/// </summary>
private void backgroundWorker_SubSampleWithReplacement(object sender, System.ComponentModel.DoWorkEventArgs e)
{
    Homer homer = (Homer)e.Argument;

    Random random = new Random();
    if (!String.IsNullOrEmpty(homer.randSeedString))
    {
        random = new Random(int.Parse(homer.randSeedString));
    }

    // Zero-pad sample numbers to the width of the total sample count.
    string filenamePadding = "D" + homer.numberOfSamples.ToString().Length.ToString();
    string quoteString = homer.GetQuote().ToString();
    string escapedQuoteString = homer.GetQuote().ToString() + homer.GetQuote().ToString();
    int numCols = homer.retainedIndices.Count;
    int pctDone = 0;

    //this is our outermost block within the bgworker: the timer that we use to report progress
    TimeSpan reportPeriod = TimeSpan.FromMinutes(0.01);
    using (new System.Threading.Timer(
        _ => (sender as BackgroundWorker).ReportProgress(pctDone),
        null, reportPeriod, reportPeriod))
    {
        for (ulong sampleNumber = 0; sampleNumber < homer.numberOfSamples; sampleNumber++)
        {
            //break out of this method if the user cancels from the form
            if ((sender as BackgroundWorker).CancellationPending)
            {
                e.Result = "Cancelled";
                break;
            }

            // Map of row number -> how many times that row was drawn for this sample.
            Dictionary<ulong, int> rowsToSample = new Dictionary<ulong, int>();

            #region Determine Our Samples Needed
            ulong rowsSampledCount = 0;
            while (rowsSampledCount < homer.rowsPerSample)
            {
                // NOTE(review): assumes NextLong(min, max) covers the intended row
                // range; whether the upper bound is inclusive depends on the
                // project's NextLong extension — TODO confirm row GetRowCount()
                // can actually be drawn.
                ulong randomDraw = random.NextLong(1, homer.GetRowCount());
                if (rowsToSample.ContainsKey(randomDraw))
                {
                    rowsToSample[randomDraw]++;
                }
                else
                {
                    rowsToSample.Add(randomDraw, 1);
                }
                rowsSampledCount++;
            }
            #endregion

            #region Get Busy Writin' or Get Busy Dyin'
            ulong rowsWritten = 0;

            //first we need to open up our output filename
            string filenameOut;
            if (String.IsNullOrEmpty(homer.randSeedString))
            {
                filenameOut = Path.Combine(homer.GetOutputLocation(), "subsample" + (sampleNumber + 1).ToString(filenamePadding) + ".csv");
            }
            else
            {
                // Seed-stamped filename so runs with different seeds don't collide.
                filenameOut = Path.Combine(homer.GetOutputLocation(), homer.randSeedString + "_subsample" + (sampleNumber + 1).ToString(filenamePadding) + ".csv");
            }

            try
            {
                using (FileStream fileStreamOut = new FileStream(filenameOut, FileMode.Create, FileAccess.Write, FileShare.None))
                using (StreamWriter streamWriter = new StreamWriter(fileStreamOut, homer.GetEncoding()))
                {
                    if (homer.HasHeader())
                    {
                        string[] headerRow;
                        using (var fileStreamIn = File.OpenRead(homer.GetInputFile()))
                        using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding()))
                        {
                            var csvDat = CsvParser.ParseHeadAndTail(streamReader, homer.GetDelim(), homer.GetQuote());
                            headerRow = csvDat.Item1.ToArray<string>();
                            // BUGFIX: was `hoju.retainedIndices` (unresolved
                            // identifier); `homer` is the configured argument.
                            string rowToWriteString = RowCleaner.CleanRow(headerRow, homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);

                            //write the header row
                            streamWriter.Write(rowToWriteString);

                            ulong rowNumber = 0;
                            foreach (var line in csvDat.Item2)
                            {
                                rowNumber++;
                                // Every 1000 rows: refresh progress and poll cancel.
                                if (rowNumber % 1000 == 0)
                                {
                                    pctDone = calcPctDone(rowsWritten, homer.rowsPerSample, sampleNumber, homer.numberOfSamples);
                                    if ((sender as BackgroundWorker).CancellationPending)
                                    {
                                        e.Result = "Cancelled";
                                        break;
                                    }
                                }
                                if (rowsToSample.ContainsKey(rowNumber))
                                {
                                    // BUGFIX: was `hoju.retainedIndices`.
                                    rowToWriteString = RowCleaner.CleanRow(line.ToArray<string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);
                                    // Write the row once per time it was drawn.
                                    for (int numDraws = 0; numDraws < rowsToSample[rowNumber]; numDraws++)
                                    {
                                        streamWriter.Write(rowToWriteString);
                                    }
                                    rowsWritten += (ulong)rowsToSample[rowNumber];
                                    // All draws accounted for — stop scanning early.
                                    if (rowsWritten == homer.rowsPerSample)
                                    {
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    else
                    {
                        using (var fileStreamIn = File.OpenRead(homer.GetInputFile()))
                        using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding()))
                        {
                            var csvDat = CsvParser.Parse(streamReader, homer.GetDelim(), homer.GetQuote());
                            ulong rowNumber = 0;
                            foreach (var line in csvDat)
                            {
                                rowNumber++;
                                if (rowNumber % 1000 == 0)
                                {
                                    pctDone = calcPctDone(rowsWritten, homer.rowsPerSample, sampleNumber, homer.numberOfSamples);
                                    if ((sender as BackgroundWorker).CancellationPending)
                                    {
                                        e.Result = "Cancelled";
                                        break;
                                    }
                                }
                                if (rowsToSample.ContainsKey(rowNumber))
                                {
                                    // BUGFIX: was `hoju.retainedIndices`.
                                    string rowToWriteString = RowCleaner.CleanRow(line.ToArray<string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);
                                    for (int numDraws = 0; numDraws < rowsToSample[rowNumber]; numDraws++)
                                    {
                                        streamWriter.Write(rowToWriteString);
                                    }
                                    rowsWritten += (ulong)rowsToSample[rowNumber];
                                    if (rowsWritten == homer.rowsPerSample)
                                    {
                                        break;
                                    }
                                }
                            }
                        }
                    }
                }
            }
            catch
            {
                MessageBox.Show(genericProcessingError, "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error);
                e.Result = "Cancelled";
                return;
            }
            #endregion
        }
    }
    return;
}
/// <summary>
/// BackgroundWorker handler that extracts a contiguous range of rows
/// [homer.startRow, homer.endRow] (1-based) from the input CSV into a single output
/// file at homer.GetOutputLocation(). When the input has a header row, it is written
/// first. Progress (scaled 0..10000 against endRow) and cancellation are handled
/// every 1000 rows; on cancellation or a write failure, e.Result is "Cancelled".
/// </summary>
private void backgroundWorker_TargetedSubsampling(object sender, System.ComponentModel.DoWorkEventArgs e)
{
    Homer homer = (Homer)e.Argument;

    string quoteString = homer.GetQuote().ToString();
    string escapedQuoteString = homer.GetQuote().ToString() + homer.GetQuote().ToString();
    int numCols = homer.retainedIndices.Count;

    #region Get Busy Writin' or Get Busy Dyin'
    try
    {
        using (FileStream fileStreamOut = new FileStream(homer.GetOutputLocation(), FileMode.Create, FileAccess.Write, FileShare.None))
        using (StreamWriter streamWriter = new StreamWriter(fileStreamOut, homer.GetEncoding()))
        {
            if (homer.HasHeader())
            {
                string[] headerRow;
                using (var fileStreamIn = new FileStream(homer.GetInputFile(), FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding()))
                {
                    var csvDat = CsvParser.ParseHeadAndTail(streamReader, homer.GetDelim(), homer.GetQuote());
                    headerRow = csvDat.Item1.ToArray<string>();
                    // BUGFIX: was `hoju.retainedIndices` (unresolved identifier);
                    // `homer` is the argument this method configures from.
                    string rowToWriteString = RowCleaner.CleanRow(headerRow, homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);

                    //write the header row
                    streamWriter.Write(rowToWriteString);

                    ulong rowNumber = 0;
                    foreach (var line in csvDat.Item2)
                    {
                        rowNumber++;
                        // Every 1000 rows: poll cancellation, then report progress.
                        if (rowNumber % 1000 == 0)
                        {
                            if ((sender as BackgroundWorker).CancellationPending)
                            {
                                e.Result = "Cancelled";
                                break;
                            }
                            int pctDone = (int)Math.Round((((double)rowNumber / homer.endRow) * 10000), 0, MidpointRounding.AwayFromZero);
                            (sender as BackgroundWorker).ReportProgress(pctDone);
                        }
                        // Only rows inside the requested [startRow, endRow] window
                        // are written out.
                        if (rowNumber >= homer.startRow && rowNumber <= homer.endRow)
                        {
                            // BUGFIX: was `hoju.retainedIndices`.
                            rowToWriteString = RowCleaner.CleanRow(line.ToArray<string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);
                            streamWriter.Write(rowToWriteString);
                        }
                        // Past the end of the window — nothing left to do.
                        if (rowNumber == homer.endRow)
                        {
                            break;
                        }
                    }
                }
            }
            else
            {
                // Consistency: open the input with FileShare.ReadWrite like the
                // header branch above (was File.OpenRead, which shares read-only).
                using (var fileStreamIn = new FileStream(homer.GetInputFile(), FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                using (var streamReader = new StreamReader(fileStreamIn, encoding: homer.GetEncoding()))
                {
                    var csvDat = CsvParser.Parse(streamReader, homer.GetDelim(), homer.GetQuote());
                    ulong rowNumber = 0;
                    string rowToWriteString;
                    foreach (var line in csvDat)
                    {
                        rowNumber++;
                        if (rowNumber % 1000 == 0)
                        {
                            if ((sender as BackgroundWorker).CancellationPending)
                            {
                                e.Result = "Cancelled";
                                break;
                            }
                            int pctDone = (int)Math.Round((((double)rowNumber / homer.endRow) * 10000), 0, MidpointRounding.AwayFromZero);
                            (sender as BackgroundWorker).ReportProgress(pctDone);
                        }
                        if (rowNumber >= homer.startRow && rowNumber <= homer.endRow)
                        {
                            // BUGFIX: was `hoju.retainedIndices`.
                            rowToWriteString = RowCleaner.CleanRow(line.ToArray<string>(), homer.GetDelim(), quoteString, escapedQuoteString, numCols, homer.retainedIndices);
                            streamWriter.Write(rowToWriteString);
                        }
                        if (rowNumber == homer.endRow)
                        {
                            break;
                        }
                    }
                }
            }
        }
    }
    #endregion
    catch
    {
        MessageBox.Show(genericProcessingError, "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error);
        e.Result = "Cancelled";
        return;
    }
    return;
}
/// <summary>
/// Scans the input CSV once and records its row count in fileDetails.
/// With a header: the header's cells become fileDetails.colNames and the remaining
/// rows are counted. Without a header: every row is counted, the widest row sets the
/// column count, and synthetic names "V0".."V(n-1)" are generated. A parse failure
/// mid-scan shows an error dialog and leaves the counts collected so far.
/// </summary>
/// <returns>The updated fileDetails instance.</returns>
public FileDetails CountRows()
{
    this.fileDetails.totalNumberOfRows = 0;
    this.fileDetails.rowErrorCount = 0;

    // Share ReadWrite so the file stays readable even if open elsewhere.
    using (var inputStream = new FileStream(this.GetInputFile(), FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (var inputReader = new StreamReader(inputStream, encoding: this.fileDetails.fileEncoding))
    {
        if (fileDetails.containsHeader)
        {
            var parsed = CsvParser.ParseHeadAndTail(inputReader, fileDetails.delimiter, fileDetails.quote);
            // Header cells become the column names.
            fileDetails.colNames = parsed.Item1.ToList<string>();
            try
            {
                foreach (var _ in parsed.Item2)
                {
                    this.fileDetails.totalNumberOfRows++;
                }
            }
            catch
            {
                MessageBox.Show("There was an error parsing your CSV file.", "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
        else
        {
            var parsed = CsvParser.Parse(inputReader, fileDetails.delimiter, fileDetails.quote);
            // Track the widest row seen so far; that sets the column count.
            int widestRow = 0;
            try
            {
                foreach (var row in parsed)
                {
                    this.fileDetails.totalNumberOfRows++;
                    widestRow = Math.Max(widestRow, row.Count);
                }
            }
            catch
            {
                MessageBox.Show("There was an error parsing your CSV file.", "D'oh!", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
            // No header to name columns from — synthesize "V0", "V1", ...
            this.fileDetails.colNames = Enumerable.Range(0, widestRow)
                .Select(i => "V" + i.ToString())
                .ToList();
        }
    }
    return (this.fileDetails);
}