// Loads the contingency tables from the input file.
// Implements the sampling techniques.
private List<ContingencyTable> loadTables()
{
    try
    {
        StreamReader sr = m_fiInput.OpenText();
        ContingencyTable ctCurrent = null;
        if (m_bReportProgress)
        {
            m_bContinue = m_prReport.reportPhase("Loading data");
            m_bContinue = m_prReport.reportMessage("Loading data from file " + m_fiInput.Name, true);
        }
        string sLine = "";
        List<ContingencyTable> actTables = new List<ContingencyTable>();
        int cTables = 0;
        long cCharacters = 0;
        bool bUseTable = true;
        double dSampleProbability = 0.0, dProb = 0.0;
        Random rnd = new Random();
        int iLineNumber = 0;
        if (m_bHuge)
        {
            m_ctcTableCounts = new ContingencyTableCache();
        }
        else
        {
            m_ctcTableCounts = null;
        }
        // On the first call, scan the file once to count the number of rows (tables).
        if (m_cTables == -1)
        {
            m_cTables = 0;
            sLine = sr.ReadLine();
            initColumnHeaders(sLine);
            while (!sr.EndOfStream)
            {
                sLine = sr.ReadLine();
                m_cTables++;
                if (m_bReportProgress)
                {
                    if (m_cTables % MAX_REPROT_POINT == 0)
                    {
                        m_bContinue = m_prReport.reportMessage(".", false);
                    }
                }
            }
            if (m_bReportProgress)
            {
                m_bContinue = m_prReport.reportMessage("", true);
                m_bContinue = m_prReport.reportMessage("Found " + m_cTables + " data rows.", true);
            }
        }
        // Instead of enforcing a hard sample size, sample each table independently
        // with probability (desired sample size / total number of tables).
        dSampleProbability = m_iSampleSize / (double)m_cTables;
        sr.Close();
        sr = m_fiInput.OpenText();
        if (m_bReportProgress)
        {
            if (m_bSampling)
            {
                m_bContinue = m_prReport.reportPhase("Sampling tables");
                m_bContinue = m_prReport.reportMessage("Sampling " + m_iSampleSize + " tables.", true);
            }
        }
        if (m_bHasColumnHeaders)
        {
            sr.ReadLine();
        }
        while (!sr.EndOfStream && m_bContinue)
        {
            sLine = sr.ReadLine().Trim();
            iLineNumber++;
            if (sLine.Length > 0)
            {
                bUseTable = true; // general use flag - sampling, validation, ...
                if (m_bSampling)
                {
                    dProb = rnd.NextDouble();
                    if (dProb > dSampleProbability)
                    {
                        bUseTable = false;
                    }
                }
                if (bUseTable)
                {
                    ctCurrent = new ContingencyTable(sLine, m_cTableNamesColumns);
                    bUseTable = ctCurrent.validate();
                }
                if (bUseTable)
                {
                    // In huge mode, instead of maintaining all the tables, check whether
                    // a table with the same counts was already loaded and only track its
                    // multiplicity in the cache.
                    if (m_bHuge)
                    {
                        double dCount = m_ctcTableCounts.getCachedValue(ctCurrent);
                        if (double.IsNaN(dCount)) // first time this table has been observed
                        {
                            dCount = 0;
                            actTables.Add(ctCurrent);
                        }
                        m_ctcTableCounts.setCachedValue(ctCurrent, dCount + 1); // increment the table count
                    }
                    else // not huge - maintain all tables (including duplicates)
                    {
                        actTables.Add(ctCurrent);
                    }
                }
                cTables++;
            }
            if ((cTables > 0) && (cTables % MAX_REPROT_POINT == 0))
            {
                if (m_bReportProgress)
                {
                    m_bContinue = m_prReport.reportProcessedTables(cTables, m_cTables);
                    m_bContinue = m_prReport.reportMessage("Loaded " + cTables + " tables.", false);
                    if (m_bHuge)
                    {
                        m_bContinue = m_prReport.reportMessage(" Found " + actTables.Count + " distinct tables.", false);
                    }
                    m_bContinue = m_prReport.reportMessage("", true);
                }
            }
            cCharacters += sLine.Length + 2; // + 2 accounts for the line terminator (CRLF)
        }
        if (m_bReportProgress)
        {
            m_bContinue = m_prReport.reportMessage("Done loading data. Found " + actTables.Count + " distinct tables.", true);
        }
        sr.Close();
        return actTables;
    }
    catch (Exception e)
    {
        m_bContinue = m_prReport.reportError("Could not load data : " + e.Message);
    }
    return null;
}
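
// The sampling above is Bernoulli sampling: each row is kept independently with
// probability p = desired sample size / total number of rows, so the sample size
// equals m_iSampleSize only in expectation, not exactly. A minimal sketch of the
// same technique in isolation; this helper and its parameters are illustrative,
// not part of the original class:
private static List<string> bernoulliSample(IEnumerable<string> rows, int iDesiredSampleSize, int cTotalRows, Random rnd)
{
    double dSampleProbability = iDesiredSampleSize / (double)cTotalRows;
    List<string> lstSample = new List<string>();
    foreach (string sRow in rows)
    {
        // Each row is kept independently of all others; the expected
        // number of kept rows is iDesiredSampleSize.
        if (rnd.NextDouble() <= dSampleProbability)
        {
            lstSample.Add(sRow);
        }
    }
    return lstSample;
}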
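
// In huge mode, m_ctcTableCounts deduplicates the input: each distinct table is
// added to actTables once, and its multiplicity is tracked separately. A minimal
// sketch of such a count cache, keyed here by string for simplicity; the getKey
// parameter is hypothetical, and the real ContingencyTableCache may key tables
// differently:
private sealed class TableCountCacheSketch
{
    private readonly Dictionary<string, double> m_dictCounts = new Dictionary<string, double>();

    // Returns the cached count for the key, or NaN if the table has not been
    // seen yet (mirroring getCachedValue above).
    public double getCachedValue(string sKey)
    {
        double dCount;
        return m_dictCounts.TryGetValue(sKey, out dCount) ? dCount : double.NaN;
    }

    public void setCachedValue(string sKey, double dCount)
    {
        m_dictCounts[sKey] = dCount;
    }
}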