/// <summary>
/// Tabulates the result files matched by <paramref name="inputFilePatternCollection"/>, computes a
/// q-value for every retained real row (separately for each split), and writes the rows to
/// <paramref name="outputFileName"/> ordered by q-value, with ties ordered by the value that
/// AccessPValueFromPhylotreeRow returns for the row.
/// </summary>
/// <param name="dirinfo">Directory in which the input file patterns are resolved.</param>
/// <param name="inputFilePatternCollection">
/// File patterns to read. Each entry may join several patterns with '+'; the '+'-connected parts of
/// one entry must not cover the same null index.
/// </param>
/// <param name="outputFileName">
/// File that receives the report. It is created before any input is read. When the row audit fails,
/// the skip-range collection is written to it instead of a report.
/// </param>
/// <param name="globalKeepTest">Filter handed to TryCreateTabulateReportInternal; rows failing it are ignored.</param>
/// <param name="splitKeepTestList">Tests that assign a row to a split; one additional split is always allocated.</param>
/// <param name="maxPValue">Handed to TryCreateTabulateReportInternal, which only stores rows whose value is at most this.</param>
/// <param name="auditRowIndexValues">When true, the real-data null index -1 must be seen for every entry of <paramref name="inputFilePatternCollection"/>.</param>
/// <param name="useStoreyTibsharaniMethod">When true, q-values come from ComputeQValuesUseStoreyTibsharani; otherwise from ComputeQValuesUseNulls.</param>
/// <param name="numTestsStoreyTibsOverride">When not -1, replaces the observed per-split p-value count as the number of tests given to the Storey-Tibshirani computation.</param>
/// <param name="doLocalTabulation">Forwarded to TryCreateTabulateReportInternal and ComputeQValuesUseNulls.</param>
/// <returns>bool indicating successful tabulate. False indicates the audit failed, in which case the outputFileName will be used as the skip file.</returns>
public static bool CreateTabulateReport(DirectoryInfo dirinfo, ICollection<string> inputFilePatternCollection, string outputFileName,
    KeepTest<Dictionary<string, string>> globalKeepTest, List<KeepTest<Dictionary<string, string>>> splitKeepTestList, double maxPValue,
    bool auditRowIndexValues, bool useStoreyTibsharaniMethod, int numTestsStoreyTibsOverride, bool doLocalTabulation)
{
    using (TextWriter textWriter = File.CreateText(outputFileName)) // Do this early so that if it fails, we'll know
    {
        int splitCount = splitKeepTestList.Count + 1;

        List<KeyValuePair<Dictionary<string, string>, double>>[] realRowCollectionToSortArray = new List<KeyValuePair<Dictionary<string, string>, double>>[splitCount];
        Dictionary<int, List<double>>[] nullValueCollectionToBeSortedArray = new Dictionary<int, List<double>>[splitCount];
        int[] totalPValueCount = new int[splitCount];
        for (int i = 0; i < splitCount; i++)
        {
            realRowCollectionToSortArray[i] = new List<KeyValuePair<Dictionary<string, string>, double>>(10000);
            nullValueCollectionToBeSortedArray[i] = new Dictionary<int, List<double>>();
        }

        string headerSoFar = null;
        Set<int> broadRealAndNullIndexSetSoFar = null;

        foreach (string broadInputFilePattern in inputFilePatternCollection)
        {
            Set<int> narrowRealAndNullIndexSetSetSoFar = Set<int>.GetInstance();
            foreach (string narrowInputFilePattern in broadInputFilePattern.Split('+'))
            {
                Set<int> realAndNullIndexSet;
                RowIndexTabulator tabulator = TryCreateTabulateReportInternal(out realAndNullIndexSet, dirinfo, narrowInputFilePattern,
                    globalKeepTest, splitKeepTestList, maxPValue, auditRowIndexValues, useStoreyTibsharaniMethod,
                    ref realRowCollectionToSortArray, ref nullValueCollectionToBeSortedArray, ref totalPValueCount, ref headerSoFar,
                    doLocalTabulation);

                if (!tabulator.IsComplete())
                {
                    // The output file doubles as the skip file when rows are missing.
                    textWriter.WriteLine(tabulator.GetSkipRangeCollection());
                    Console.WriteLine("Not all needed rows were found in {0}.", narrowInputFilePattern);
                    Console.WriteLine("Found rows:\n{0}", tabulator.GetSkipRangeCollection());
                    Console.WriteLine("{0} created as skip file.", outputFileName);
                    return false;
                }

                //Instead of throwing an error, we could filter out the duplicated null indexes
                Helper.CheckCondition(narrowRealAndNullIndexSetSetSoFar.IntersectionIsEmpty(realAndNullIndexSet),
                    string.Format("Within inputFilePattern {0}, multiple '+'-connected parts cover the same nullIndex(s), {1}",
                        broadInputFilePattern, narrowRealAndNullIndexSetSetSoFar.Intersection(realAndNullIndexSet)));
                narrowRealAndNullIndexSetSetSoFar.AddNewRange(realAndNullIndexSet);
            }

            Helper.CheckCondition(!auditRowIndexValues || narrowRealAndNullIndexSetSetSoFar.Contains(-1),
                string.Format("The 'null' index -1 for the real data was not seen in {0}", broadInputFilePattern));

            // Only the first broad pattern's index set is remembered; later patterns are not
            // compared against it (that consistency check is intentionally disabled).
            if (broadRealAndNullIndexSetSoFar == null)
            {
                broadRealAndNullIndexSetSoFar = narrowRealAndNullIndexSetSetSoFar;
            }
        }

        // With no input patterns the index set is still null. Count that as zero randomized runs
        // so the check below reports the problem instead of a NullReferenceException.
        double numberOfRandomizationRuns = (useStoreyTibsharaniMethod || broadRealAndNullIndexSetSoFar == null)
            ? 0
            : broadRealAndNullIndexSetSoFar.Count - 1; // minus one for the real data's index
        Console.WriteLine("Detected {0} randomized runs relative to the number of real runs.", numberOfRandomizationRuns);
        Helper.CheckCondition<InvalidDataException>(useStoreyTibsharaniMethod || numberOfRandomizationRuns > 0,
            "No randomization runs detected. Did you mean to include a -{0} flag?", Tabulate.STOREY_METHOD_NAME);

        //Compute q-values from p-values (and p-values from test statistic)
        List<KeyValuePair<Dictionary<string, string>, double>> rowAndQValues = new List<KeyValuePair<Dictionary<string, string>, double>>(1000);

        // Accumulated over ALL splits. Keeping only the most recent split's lookup would lose the
        // entries for rows belonging to earlier splits when the report is written below.
        Dictionary<double, double> rowToPvalFromRandomizations = new Dictionary<double, double>();

        for (int i = 0; i < splitCount; i++)
        {
            int numTestsToUse;
            if (numTestsStoreyTibsOverride != -1)
            {
                Console.WriteLine("Using " + numTestsStoreyTibsOverride + " p-values for computation of q-values rather than the observed number (" + totalPValueCount[i] + ")");
                numTestsToUse = numTestsStoreyTibsOverride;
            }
            else
            {
                numTestsToUse = totalPValueCount[i];
            }

            Dictionary<Dictionary<string, string>, double> qValueList;
            if (useStoreyTibsharaniMethod)
            {
                qValueList = SpecialFunctions.ComputeQValuesUseStoreyTibsharani(ref realRowCollectionToSortArray[i], row => row.Value, numTestsToUse)
                    .ToDictionary(entry => entry.Key.Key, entry => entry.Value);
            }
            else
            {
                // Local and non-local tabulation make the same call; doLocalTabulation itself
                // tells ComputeQValuesUseNulls which mode to use.
                Dictionary<double, double> splitRowToPvalFromRandomizations;
                qValueList = SpecialFunctions.ComputeQValuesUseNulls(ref realRowCollectionToSortArray[i], row => row.Value,
                        row => int.Parse(((KeyValuePair<System.Collections.Generic.Dictionary<string, string>, double>)row).Key["groupId"]),
                        row => int.Parse(((KeyValuePair<System.Collections.Generic.Dictionary<string, string>, double>)row).Key["rowIndex"]),
                        ref nullValueCollectionToBeSortedArray[i], numberOfRandomizationRuns, out splitRowToPvalFromRandomizations, doLocalTabulation)
                    .ToDictionary(entry => entry.Key.Key, entry => entry.Value);

                if (splitRowToPvalFromRandomizations != null)
                {
                    foreach (KeyValuePair<double, double> rowAndPval in splitRowToPvalFromRandomizations)
                    {
                        rowToPvalFromRandomizations[rowAndPval.Key] = rowAndPval.Value;
                    }
                }
            }

            foreach (KeyValuePair<Dictionary<string, string>, double> rowAndQValue in qValueList)
            {
                rowAndQValues.Add(new KeyValuePair<Dictionary<string, string>, double>(rowAndQValue.Key, rowAndQValue.Value));
            }
        }

        // Order by q-value; equal q-values are ordered by the row's tabulated value.
        rowAndQValues.Sort((row1, row2) =>
            row1.Value == row2.Value
                ? AccessPValueFromPhylotreeRow(row1.Key).CompareTo(AccessPValueFromPhylotreeRow(row2.Key))
                : row1.Value.CompareTo(row2.Value));

        //!!!this code is repeated elsewhere
        bool tabulatingTestStatistic = COL_TO_TABULATE.TESTSTATISTIC == _columnToTabulate;
        if (tabulatingTestStatistic)
        {
            Helper.CheckCondition(!useStoreyTibsharaniMethod, "the way its set up now, cannot use TestStatistic column with useStoreyTibshirani");
            textWriter.WriteLine(Helper.CreateTabString(headerSoFar, "pValFromRandomizations", "qValue"));
        }
        else
        {
            textWriter.WriteLine(Helper.CreateTabString(headerSoFar, "qValue"));
        }

        foreach (KeyValuePair<Dictionary<string, string>, double> rowAndQValue in rowAndQValues)
        {
            // The "" key is the entry the table reader stores for the whole input line
            // (the reader is invoked with includeWholeLine = true).
            if (tabulatingTestStatistic)
            {
                double thisRow = double.Parse(rowAndQValue.Key["rowIndex"]);
                double thisPvalFromRandomization = rowToPvalFromRandomizations[thisRow];
                textWriter.WriteLine(Helper.CreateTabString(rowAndQValue.Key[""], thisPvalFromRandomization, rowAndQValue.Value));
            }
            else
            {
                textWriter.WriteLine(Helper.CreateTabString(rowAndQValue.Key[""], rowAndQValue.Value));
            }
        }
    }
    return true;
}
/// <summary>
/// Reads every file in <paramref name="dirinfo"/> that matches <paramref name="inputFilePattern"/> and
/// sorts its rows into the per-split accumulators. Rows with null index -1 are the real data; any other
/// null index marks a randomized run. For local tabulation, the group id of a null row is read from the
/// GroupIdColumnName column; otherwise all null values are stored under group 0.
/// </summary>
/// <param name="nullIndexSet">Receives every null index seen on a row that passed both the row-index tabulator and <paramref name="globalKeepTest"/>.</param>
/// <param name="dirinfo">Directory searched for input files.</param>
/// <param name="inputFilePattern">Pattern passed to <c>DirectoryInfo.GetFiles</c>.</param>
/// <param name="globalKeepTest">Rows failing this test are ignored.</param>
/// <param name="splitKeepTestList">Passed to GetSplitTabulateIndex to pick the split a row belongs to.</param>
/// <param name="maxPValue">Only rows whose value is at most this are stored.</param>
/// <param name="auditRowIndexValues">Passed to <c>RowIndexTabulator.GetInstance</c>.</param>
/// <param name="useStoreyTibsharaniMethod">
/// When true, only real rows (null index -1) are processed, each one is counted in
/// <paramref name="totalPValueCount"/>, and a missing null-index column is read as -1.
/// When false, the null-index column is required and <paramref name="totalPValueCount"/> is not touched.
/// </param>
/// <param name="realRowCollectionToSortArray">Per-split list that receives the retained real rows paired with their value.</param>
/// <param name="nullValueCollectionToBeSortedArrayDict">Per-split map from group id to the retained values of randomized rows.</param>
/// <param name="totalPValueCount">Per-split count of real rows seen (Storey-Tibshirani mode only), regardless of <paramref name="maxPValue"/>.</param>
/// <param name="headerSoFar">Set to the header of the first file read when null; later files with a different header produce a console warning.</param>
/// <param name="doLocalTabulationOfPermutationsToGetPvaluesFromRandomizations">Selects per-group (true) or single-group (false) storage of null values.</param>
/// <returns>The tabulator that every row was offered to; the caller inspects it for completeness.</returns>
private static RowIndexTabulator TryCreateTabulateReportInternal(out Set<int> nullIndexSet, DirectoryInfo dirinfo, string inputFilePattern,
    KeepTest<Dictionary<string, string>> globalKeepTest, List<KeepTest<Dictionary<string, string>>> splitKeepTestList, double maxPValue,
    bool auditRowIndexValues, bool useStoreyTibsharaniMethod,
    ref List<KeyValuePair<Dictionary<string, string>, double>>[] realRowCollectionToSortArray,
    ref Dictionary<int, List<double>>[] nullValueCollectionToBeSortedArrayDict,
    ref int[] totalPValueCount, ref string headerSoFar,
    bool doLocalTabulationOfPermutationsToGetPvaluesFromRandomizations
    )
{
    nullIndexSet = Set<int>.GetInstance();

    //!!!very similar code elsewhere
    RowIndexTabulator rowIndexTabulator = RowIndexTabulator.GetInstance(auditRowIndexValues);
    int lastWriteLineLength = 0;
    int nullValueCount = 0;

    foreach (FileInfo fileinfo in dirinfo.GetFiles(inputFilePattern))
    {
        try
        {
            // Single-line progress display: "\r" returns to column 0 and the padding
            // erases whatever the previous (possibly longer) message left behind.
            int sigLines = realRowCollectionToSortArray.Select(split => split.Count).Sum();
            int totalLines = sigLines + nullValueCount + totalPValueCount.Sum();
            string writeLine = string.Format("{0}/{1} lines have p<=1. Now reading {2}", sigLines, totalLines, fileinfo.FullName);
            Console.Write("\r{0,-" + lastWriteLineLength + "}", writeLine);
            lastWriteLineLength = writeLine.Length;

            string headerOnFile;
            using (TextReader reader = SpecialFunctions.GetTextReaderWithExternalReadWriteAccess(fileinfo.FullName))
            {
                headerOnFile = reader.ReadLine();
                if (headerSoFar == null)
                {
                    headerSoFar = headerOnFile;
                }
                else if (headerSoFar != headerOnFile)
                {
                    Console.WriteLine("Warning: The header for file {0} is different from the 1st file read in", fileinfo.Name);
                }
            }

            using (TextReader reader = SpecialFunctions.GetTextReaderWithExternalReadWriteAccess(fileinfo.FullName))
            {
                foreach (Dictionary<string, string> row in SpecialFunctions.TabFileTable(reader, headerOnFile, /*includeWholeLine*/ true))
                {
                    if (!(rowIndexTabulator.TryAdd(row, fileinfo.FullName) && globalKeepTest.Test(row)))
                    {
                        continue;
                    }

                    // The null-index column is optional only for Storey-Tibshirani, where every
                    // row is then taken to be real data.
                    int nullIndex;
                    string nullIndexText;
                    if (row.TryGetValue(NullIndexColumnName, out nullIndexText))
                    {
                        nullIndex = int.Parse(nullIndexText);
                    }
                    else if (useStoreyTibsharaniMethod)
                    {
                        nullIndex = -1;
                    }
                    else
                    {
                        throw new KeyNotFoundException(string.Format(@"When tabulating a ""{0}"" column is required. (File ""{1}"")", NullIndexColumnName, fileinfo.Name));
                    }
                    nullIndexSet.AddNewOrOld(nullIndex);

                    double pValue = AccessPValueFromPhylotreeRow(row);

                    if (useStoreyTibsharaniMethod)
                    {
                        if (nullIndex == -1)
                        {
                            int splitIdx = GetSplitTabulateIndex(row, splitKeepTestList);
                            if (pValue <= maxPValue)
                            {
                                realRowCollectionToSortArray[splitIdx].Add(new KeyValuePair<Dictionary<string, string>, double>(row, pValue));
                            }
                            // Counted even when the row is above maxPValue: this is the number of tests.
                            totalPValueCount[splitIdx]++;
                        }
                    }
                    else if (pValue <= maxPValue)
                    {
                        int splitIdx = GetSplitTabulateIndex(row, splitKeepTestList);
                        if (nullIndex == -1)
                        {
                            realRowCollectionToSortArray[splitIdx].Add(new KeyValuePair<Dictionary<string, string>, double>(row, pValue));
                        }
                        else
                        {
                            //always add it to the zero key if not doing local tabulations
                            int groupId = doLocalTabulationOfPermutationsToGetPvaluesFromRandomizations
                                ? int.Parse(row[GroupIdColumnName])
                                : 0;

                            // Explicit get-or-create: the per-split dictionaries start out empty, so
                            // the list for a group has to be created the first time it is seen.
                            Dictionary<int, List<double>> nullValuesByGroup = nullValueCollectionToBeSortedArrayDict[splitIdx];
                            List<double> groupNullValues;
                            if (!nullValuesByGroup.TryGetValue(groupId, out groupNullValues))
                            {
                                groupNullValues = new List<double>();
                                nullValuesByGroup.Add(groupId, groupNullValues);
                            }
                            groupNullValues.Add(pValue);
                            nullValueCount++;
                        }
                    }
                }
            }
        }
        catch
        {
            // Name the offending file, then let the original exception propagate untouched.
            Console.WriteLine("\nFailure parsing {0}.", fileinfo.Name);
            throw;
        }
    }
    Console.WriteLine("\r{0,-" + lastWriteLineLength + "}", "Read all files.");

    return rowIndexTabulator;
}