Exemple #1
0
        /// <summary>
        /// Tabulates the result files selected by <paramref name="inputFilePatternCollection"/>, computes a q-value for
        /// every retained real row, and writes the rows to <paramref name="outputFileName"/> ordered by q-value
        /// (ties broken by the row's p-value).
        /// </summary>
        /// <param name="dirinfo">Directory that is searched with each narrow file pattern.</param>
        /// <param name="inputFilePatternCollection">
        /// "Broad" file patterns. Each entry is split on '+' into "narrow" patterns; the narrow parts of one broad
        /// pattern must not cover the same null index twice.
        /// </param>
        /// <param name="outputFileName">
        /// File that is created up front. It receives the report, or the skip ranges when the method returns false.
        /// </param>
        /// <param name="globalKeepTest">Row filter forwarded to the per-pattern tabulation.</param>
        /// <param name="splitKeepTestList">
        /// Tests used to assign rows to splits; q-values are computed separately for each of the
        /// <c>splitKeepTestList.Count + 1</c> splits.
        /// </param>
        /// <param name="maxPValue">P-value threshold forwarded to the per-pattern tabulation.</param>
        /// <param name="auditRowIndexValues">
        /// When true, every broad pattern must contain the null index -1 (the real data); the flag is also forwarded
        /// to the row index tabulator.
        /// </param>
        /// <param name="useStoreyTibsharaniMethod">
        /// When true, q-values come from <c>ComputeQValuesUseStoreyTibsharani</c> and no randomization runs are required;
        /// otherwise they come from <c>ComputeQValuesUseNulls</c>.
        /// </param>
        /// <param name="numTestsStoreyTibsOverride">
        /// Number of tests to use for the Storey-Tibshirani computation, or -1 to use the observed count of each split.
        /// </param>
        /// <param name="doLocalTabulation">Forwarded to the per-pattern tabulation and to <c>ComputeQValuesUseNulls</c>.</param>
        /// <returns>
        /// True when the report was written. False when the tabulator of some narrow pattern reports that it is not
        /// complete; in that case <paramref name="outputFileName"/> contains the tabulator's skip range collection
        /// instead of a report.
        /// </returns>
        public static bool CreateTabulateReport(DirectoryInfo dirinfo, ICollection<string> inputFilePatternCollection, string outputFileName,
                                                KeepTest<Dictionary<string, string>> globalKeepTest, List<KeepTest<Dictionary<string, string>>> splitKeepTestList, double maxPValue,
                                                bool auditRowIndexValues, bool useStoreyTibsharaniMethod, int numTestsStoreyTibsOverride, bool doLocalTabulation)
        {
            // Create the output file first so that an unusable path fails before any input is read.
            using (TextWriter textWriter = File.CreateText(outputFileName))
            {
                int splitCount = splitKeepTestList.Count + 1;
                List<KeyValuePair<Dictionary<string, string>, double>>[] realRowCollectionToSortArray = new List<KeyValuePair<Dictionary<string, string>, double>>[splitCount];
                Dictionary<int, List<double>>[] nullValueCollectionToBeSortedArray = new Dictionary<int, List<double>>[splitCount];
                int[] totalPValueCount = new int[splitCount];

                for (int i = 0; i < splitCount; i++)
                {
                    realRowCollectionToSortArray[i] = new List<KeyValuePair<Dictionary<string, string>, double>>(10000);
                    nullValueCollectionToBeSortedArray[i] = new Dictionary<int, List<double>>();
                }

                string headerSoFar = null;

                // Null-index set of the first broad pattern; its size determines the number of randomization runs.
                Set<int> broadRealAndNullIndexSetSoFar = null;

                foreach (string broadInputFilePattern in inputFilePatternCollection)
                {
                    Set<int> narrowRealAndNullIndexSetSetSoFar = Set<int>.GetInstance();

                    foreach (string narrowInputFilePattern in broadInputFilePattern.Split('+'))
                    {
                        Set<int> realAndNullIndexSet;
                        RowIndexTabulator tabulator = TryCreateTabulateReportInternal(out realAndNullIndexSet, dirinfo, narrowInputFilePattern,
                                                                                      globalKeepTest, splitKeepTestList, maxPValue, auditRowIndexValues, useStoreyTibsharaniMethod,
                                                                                      ref realRowCollectionToSortArray, ref nullValueCollectionToBeSortedArray, ref totalPValueCount, ref headerSoFar, doLocalTabulation);
                        if (!tabulator.IsComplete())
                        {
                            // The output file doubles as a skip file when rows are missing.
                            textWriter.WriteLine(tabulator.GetSkipRangeCollection());
                            Console.WriteLine("Not all needed rows were found in {0}.", narrowInputFilePattern);
                            Console.WriteLine("Found rows:\n{0}", tabulator.GetSkipRangeCollection());
                            Console.WriteLine("{0} created as skip file.", outputFileName);
                            return false;
                        }

                        // Instead of throwing an error, we could filter out the duplicated null indexes.
                        // The message (which needs the intersection) is only built when the check actually fails.
                        if (!narrowRealAndNullIndexSetSetSoFar.IntersectionIsEmpty(realAndNullIndexSet))
                        {
                            Helper.CheckCondition(false,
                                                  string.Format("Within inputFilePattern {0}, multiple '+'-connected parts cover the same nullIndex(s), {1}",
                                                                broadInputFilePattern,
                                                                narrowRealAndNullIndexSetSetSoFar.Intersection(realAndNullIndexSet)));
                        }

                        narrowRealAndNullIndexSetSetSoFar.AddNewRange(realAndNullIndexSet);
                    }

                    if (auditRowIndexValues && !narrowRealAndNullIndexSetSetSoFar.Contains(-1))
                    {
                        Helper.CheckCondition(false,
                                              string.Format("The 'null' index -1 for the real data was not seen in {0}", broadInputFilePattern));
                    }

                    if (broadRealAndNullIndexSetSoFar == null)
                    {
                        broadRealAndNullIndexSetSoFar = narrowRealAndNullIndexSetSetSoFar;
                    }
                }

                // With no input patterns there is no index set at all; treat that as zero randomization runs so the
                // check below reports it instead of a NullReferenceException being thrown here.
                double numberOfRandomizationRuns = (useStoreyTibsharaniMethod || broadRealAndNullIndexSetSoFar == null)
                                                   ? 0
                                                   : broadRealAndNullIndexSetSoFar.Count - 1;
                Console.WriteLine("Detected {0} randomized runs relative to the number of real runs.", numberOfRandomizationRuns);
                Helper.CheckCondition<InvalidDataException>(useStoreyTibsharaniMethod || numberOfRandomizationRuns > 0, "No randomization runs detected. Did you mean to include a -{0} flag?", Tabulate.STOREY_METHOD_NAME);

                // Compute q-values from p-values (and p-values from test statistic).
                List<KeyValuePair<Dictionary<string, string>, double>> rowAndQValues = new List<KeyValuePair<Dictionary<string, string>, double>>(1000);

                // Accumulates the rowIndex -> p-value maps of ALL splits. Keeping only the map of the last split would
                // make the output loop fail for rows that belong to an earlier split.
                // NOTE(review): assumes each ComputeQValuesUseNulls call reports only the rows of its own split - confirm.
                Dictionary<double, double> rowToPvalFromRandomizations = new Dictionary<double, double>();

                for (int i = 0; i < splitCount; i++)
                {
                    int numTestsToUse;
                    if (numTestsStoreyTibsOverride != -1)
                    {
                        Console.WriteLine("Using " + numTestsStoreyTibsOverride + " p-values for computation of q-values rather than the observed number (" + totalPValueCount[i] + ")");
                        numTestsToUse = numTestsStoreyTibsOverride;
                    }
                    else
                    {
                        numTestsToUse = totalPValueCount[i];
                    }

                    Dictionary<Dictionary<string, string>, double> qValueList;
                    if (useStoreyTibsharaniMethod)
                    {
                        qValueList = SpecialFunctions.ComputeQValuesUseStoreyTibsharani(ref realRowCollectionToSortArray[i], row => row.Value, numTestsToUse)
                                     .ToDictionary(entry => entry.Key.Key, entry => entry.Value);
                    }
                    else
                    {
                        // Local and global tabulation make the same call; doLocalTabulation itself selects the mode.
                        Dictionary<double, double> splitRowToPvalFromRandomizations;
                        qValueList = SpecialFunctions.ComputeQValuesUseNulls(ref realRowCollectionToSortArray[i],
                                                                             row => row.Value,
                                                                             row => int.Parse(((KeyValuePair<System.Collections.Generic.Dictionary<string, string>, double>)row).Key["groupId"]),
                                                                             row => int.Parse(((KeyValuePair<System.Collections.Generic.Dictionary<string, string>, double>)row).Key["rowIndex"]),
                                                                             ref nullValueCollectionToBeSortedArray[i], numberOfRandomizationRuns, out splitRowToPvalFromRandomizations, doLocalTabulation)
                                     .ToDictionary(entry => entry.Key.Key, entry => entry.Value);

                        if (splitRowToPvalFromRandomizations != null)
                        {
                            foreach (KeyValuePair<double, double> rowAndPval in splitRowToPvalFromRandomizations)
                            {
                                rowToPvalFromRandomizations[rowAndPval.Key] = rowAndPval.Value;
                            }
                        }
                    }

                    rowAndQValues.AddRange(qValueList);
                }

                // Order by q-value; rows with equal q-values are ordered by their p-value.
                rowAndQValues.Sort((row1, row2) =>
                                   row1.Value == row2.Value ?
                                   AccessPValueFromPhylotreeRow(row1.Key).CompareTo(AccessPValueFromPhylotreeRow(row2.Key)) :
                                   row1.Value.CompareTo(row2.Value));

                //!!!this code is repeated elsewhere
                bool tabulatingTestStatistic = COL_TO_TABULATE.TESTSTATISTIC == _columnToTabulate;
                if (tabulatingTestStatistic)
                {
                    Helper.CheckCondition(!useStoreyTibsharaniMethod, "the way its set up now, cannot use TestStatistic column with useStoreyTibshirani");
                    textWriter.WriteLine(Helper.CreateTabString(headerSoFar, "pValFromRandomizations", "qValue"));
                }
                else
                {
                    textWriter.WriteLine(Helper.CreateTabString(headerSoFar, "qValue"));
                }

                foreach (KeyValuePair<Dictionary<string, string>, double> rowAndQValue in rowAndQValues)
                {
                    // The "" key holds the complete original input line of the row.
                    if (tabulatingTestStatistic)
                    {
                        double thisRow = double.Parse(rowAndQValue.Key["rowIndex"]);
                        double thisPvalFromRandomization = rowToPvalFromRandomizations[thisRow];
                        textWriter.WriteLine(Helper.CreateTabString(rowAndQValue.Key[""], thisPvalFromRandomization, rowAndQValue.Value));
                    }
                    else
                    {
                        textWriter.WriteLine(Helper.CreateTabString(rowAndQValue.Key[""], rowAndQValue.Value));
                    }
                }
            }
            return true;
        }
Exemple #2
0
        /// <summary>
        /// Reads every file in <paramref name="dirinfo"/> that matches <paramref name="inputFilePattern"/> and sorts the
        /// accepted rows into the per-split collections of real rows and null (randomized) p-values.
        /// NOTE(review): an earlier version of this comment said the group id used for local tabulation is hard-coded as
        /// rowId % numRealHypotheses; the code below reads it from the <c>GroupIdColumnName</c> column instead.
        /// </summary>
        /// <param name="nullIndexSet">Receives every null index seen in the accepted rows (-1 denotes the real data).</param>
        /// <param name="dirinfo">Directory whose files are matched against <paramref name="inputFilePattern"/>.</param>
        /// <param name="inputFilePattern">Search pattern handed to <c>DirectoryInfo.GetFiles</c>.</param>
        /// <param name="globalKeepTest">Rows failing this test are ignored (after they were offered to the row index tabulator).</param>
        /// <param name="splitKeepTestList">Tests handed to <c>GetSplitTabulateIndex</c> to pick the split of a row.</param>
        /// <param name="maxPValue">Rows with a p-value above this threshold are not stored.</param>
        /// <param name="auditRowIndexValues">Forwarded to <c>RowIndexTabulator.GetInstance</c>.</param>
        /// <param name="useStoreyTibsharaniMethod">
        /// When true, only rows with null index -1 are used, every such row is counted in
        /// <paramref name="totalPValueCount"/>, and a missing null-index column is treated as -1.
        /// </param>
        /// <param name="realRowCollectionToSortArray">Per split: the retained real rows paired with their p-value. Appended to.</param>
        /// <param name="nullValueCollectionToBeSortedArrayDict">Per split: group id -> p-values of retained null rows. Appended to.</param>
        /// <param name="totalPValueCount">Per split: number of real rows seen in Storey-Tibshirani mode. Incremented.</param>
        /// <param name="headerSoFar">Header of the first file read; set when still null, otherwise compared (warning only).</param>
        /// <param name="doLocalTabulationOfPermutationsToGetPvaluesFromRandomizations">
        /// When true, null p-values are grouped by the row's group id; otherwise all of them go to group 0.
        /// </param>
        /// <returns>The row index tabulator that was offered every row of every matching file.</returns>
        private static RowIndexTabulator TryCreateTabulateReportInternal(out Set<int> nullIndexSet, DirectoryInfo dirinfo,
                                                                         string inputFilePattern,
                                                                         KeepTest<Dictionary<string, string>> globalKeepTest,
                                                                         List<KeepTest<Dictionary<string, string>>> splitKeepTestList,
                                                                         double maxPValue,
                                                                         bool auditRowIndexValues,
                                                                         bool useStoreyTibsharaniMethod,
                                                                         ref List<KeyValuePair<Dictionary<string, string>, double>>[] realRowCollectionToSortArray,
                                                                         ref Dictionary<int, List<double>>[] nullValueCollectionToBeSortedArrayDict,
                                                                         ref int[] totalPValueCount,
                                                                         ref string headerSoFar,
                                                                         bool doLocalTabulationOfPermutationsToGetPvaluesFromRandomizations
                                                                         )
        {
            nullIndexSet = Set<int>.GetInstance();

            //!!!very similar code elsewhere
            RowIndexTabulator rowIndexTabulator = RowIndexTabulator.GetInstance(auditRowIndexValues);
            int lastWriteLineLength = 0;
            int nullValueCount      = 0; // null rows stored by THIS call; only used for the progress message

            foreach (FileInfo fileinfo in dirinfo.GetFiles(inputFilePattern))
            {
                try
                {
                    // Progress line, rewritten in place ('\r') and padded to blank out the previous, possibly longer, text.
                    int sigLines = realRowCollectionToSortArray.Select(split => split.Count).Sum();
                    int totalLines = sigLines + nullValueCount + totalPValueCount.Sum();

                    string writeLine = string.Format("{0}/{1} lines have p<=1. Now reading {2}", sigLines, totalLines, fileinfo.FullName);
                    Console.Write("\r{0,-" + lastWriteLineLength + "}", writeLine);
                    lastWriteLineLength = writeLine.Length;

                    // First pass: read only the header line.
                    string headerOnFile;
                    using (TextReader reader = SpecialFunctions.GetTextReaderWithExternalReadWriteAccess(fileinfo.FullName))
                    {
                        headerOnFile = reader.ReadLine();
                        if (headerSoFar == null)
                        {
                            headerSoFar = headerOnFile;
                        }
                        else if (headerSoFar != headerOnFile)
                        {
                            Console.WriteLine("Warning: The header for file {0} is different from the 1st file read in", fileinfo.Name);
                        }
                    }

                    // Second pass: read the table itself with a fresh reader.
                    using (TextReader reader = SpecialFunctions.GetTextReaderWithExternalReadWriteAccess(fileinfo.FullName))
                    {
                        foreach (Dictionary<string, string> row in SpecialFunctions.TabFileTable(reader, headerOnFile, /*includeWholeLine*/ true))
                        {
                            // Every row is offered to the tabulator; the keep test only runs on rows the tabulator accepted.
                            if (!rowIndexTabulator.TryAdd(row, fileinfo.FullName) || !globalKeepTest.Test(row))
                            {
                                continue;
                            }

                            int nullIndex;
                            string nullIndexText;
                            if (row.TryGetValue(NullIndexColumnName, out nullIndexText))
                            {
                                nullIndex = int.Parse(nullIndexText);
                            }
                            else if (useStoreyTibsharaniMethod)
                            {
                                // Without a null-index column every row is taken to be real data.
                                nullIndex = -1;
                            }
                            else
                            {
                                throw new KeyNotFoundException(
                                    string.Format(@"When tabulating a ""{0}"" column is required. (File ""{1}"")", NullIndexColumnName, fileinfo.Name));
                            }
                            nullIndexSet.AddNewOrOld(nullIndex);

                            double pValue = AccessPValueFromPhylotreeRow(row);
                            if (useStoreyTibsharaniMethod)
                            {
                                // Only real rows matter here; each one is counted even when it is above the threshold.
                                if (nullIndex == -1)
                                {
                                    int splitIdx = GetSplitTabulateIndex(row, splitKeepTestList);
                                    if (pValue <= maxPValue)
                                    {
                                        realRowCollectionToSortArray[splitIdx].Add(new KeyValuePair<Dictionary<string, string>, double>(row, pValue));
                                    }
                                    totalPValueCount[splitIdx]++;
                                }
                            }
                            else if (pValue <= maxPValue)
                            {
                                int splitIdx = GetSplitTabulateIndex(row, splitKeepTestList);
                                if (nullIndex == -1)
                                {
                                    realRowCollectionToSortArray[splitIdx].Add(new KeyValuePair<Dictionary<string, string>, double>(row, pValue));
                                }
                                else
                                {
                                    // Always use the zero key when not doing local tabulation.
                                    int groupId = doLocalTabulationOfPermutationsToGetPvaluesFromRandomizations
                                                  ? int.Parse(row[GroupIdColumnName])
                                                  : 0;

                                    // Explicit get-or-create: the list for a new group must be stored in the dictionary,
                                    // otherwise the value would be lost (or a null list dereferenced).
                                    Dictionary<int, List<double>> nullValuesByGroup = nullValueCollectionToBeSortedArrayDict[splitIdx];
                                    List<double> nullValuesOfGroup;
                                    if (!nullValuesByGroup.TryGetValue(groupId, out nullValuesOfGroup))
                                    {
                                        nullValuesOfGroup = new List<double>();
                                        nullValuesByGroup.Add(groupId, nullValuesOfGroup);
                                    }

                                    nullValuesOfGroup.Add(pValue);
                                    nullValueCount++;
                                }
                            }
                        }
                    }
                }
                catch
                {
                    // Name the offending file, then let the original exception propagate unchanged.
                    Console.WriteLine("\nFailure parsing {0}.", fileinfo.Name);
                    throw;
                }
            }

            Console.WriteLine("\r{0,-" + lastWriteLineLength + "}", "Read all files.");
            return rowIndexTabulator;
        }