/// <summary>
/// Merges the files in <paramref name="dirinfo"/> that match <paramref name="inputFilePattern"/> into
/// <paramref name="mergedFileName"/> and then tries to tabulate the merged file (always with auditing on).
/// When tabulation succeeds, a pre-existing <paramref name="skipFileName"/> is deleted.
/// When it fails, the file written to <paramref name="tabulateResultFileName"/> is moved over
/// <paramref name="skipFileName"/> so it can be used as a skip file.
/// </summary>
/// <param name="dirinfo">Location where all the files can be found.</param>
/// <param name="inputFilePattern">Pattern selecting the files to merge.</param>
/// <param name="mergedFileName">File that receives the merged content.</param>
/// <param name="tabulateResultFileName">File that the tabulate step writes to.</param>
/// <param name="skipFileName">Skip file to delete on success or to create on failure.</param>
/// <param name="keepTest">Global keep test forwarded to the tabulate step.</param>
/// <param name="splitKeepTestList">Split keep tests forwarded to the tabulate step.</param>
/// <param name="maxPForTabulate">Maximum p-value forwarded to the tabulate step.</param>
/// <param name="useStoreyMethod">Forwarded to the tabulate step.</param>
/// <returns>True if tabulate was successful, false otherwise.</returns>
public static bool MergeThenTabulateOrCreateSkipFile(DirectoryInfo dirinfo, string inputFilePattern, string mergedFileName, string tabulateResultFileName, string skipFileName, KeepTest<Dictionary<string, string>> keepTest, List<KeepTest<Dictionary<string, string>>> splitKeepTestList, double maxPForTabulate, bool useStoreyMethod)
{
    Console.Write("Merging files...");
    Tabulate.MergeFilesUsedToTabulate(dirinfo, inputFilePattern, mergedFileName, true);
    Console.WriteLine("done merging.");

    bool succeeded = Tabulate.CreateTabulateReport(
        dirinfo,
        mergedFileName,
        tabulateResultFileName,
        keepTest,
        splitKeepTestList,
        maxPForTabulate,
        true /* audit */,
        useStoreyMethod);

    if (!succeeded)
    {
        // The tabulate output doubles as the skip file when the audit fails.
        Console.WriteLine("Tabulation failed. Missing rows placed in {0}.", skipFileName);
        SpecialFunctions.MoveAndReplace(tabulateResultFileName, skipFileName);
        return false;
    }

    // Everything is done, so a skip file left over from an earlier attempt is no longer needed.
    if (File.Exists(skipFileName))
    {
        File.Delete(skipFileName);
    }
    return true;
}
/// <summary>
/// Convenience overload: forwards every argument to the overload that also takes
/// <c>numTestsStoreyTibsOverride</c>, passing -1 for that value.
/// </summary>
/// <returns>The result of the forwarded call.</returns>
public static bool CreateTabulateReport(DirectoryInfo dirinfo, ICollection<string> inputFilePatternCollection, string outputFileName, KeepTest<Dictionary<string, string>> globalKeepTest, List<KeepTest<Dictionary<string, string>>> splitKeepTestList, double maxPValue, bool auditRowIndexValues, bool useStoreyTibsharaniMethod, bool doLocalTabulation)
{
    // NOTE(review): -1 presumably means "no override"; confirm against the receiving overload.
    const int defaultNumTestsOverride = -1;

    return CreateTabulateReport(
        dirinfo,
        inputFilePatternCollection,
        outputFileName,
        globalKeepTest,
        splitKeepTestList,
        maxPValue,
        auditRowIndexValues,
        useStoreyTibsharaniMethod,
        defaultNumTestsOverride,
        doLocalTabulation);
}
/// <summary>
/// Factory for work lists. Returns the pair-enumerating variant
/// (<c>UniversalWorkListPredTargPairs</c>) when <paramref name="keepTest"/> is a
/// <c>KeepPredictorTargetPairs</c>, or is a <c>KeepCollection</c> that directly contains one;
/// otherwise returns a plain <c>UniversalWorkList</c>.
/// </summary>
/// <returns>The work list built from the given arguments.</returns>
public static UniversalWorkList GetInstance(
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> predictorNameAndCaseIdToNonMissingValueEnumeration,
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> targetNameAndCaseIdToNonMissingValueEnumeration,
    NullDataCollection nullDataCollection,
    RangeCollection nullIndexRange,
    KeepTest<Dictionary<string, string>> keepTest)
{
    bool needsPairEnumeration = keepTest is KeepPredictorTargetPairs;

    // Only the direct members of a collection are inspected; nested collections are not searched.
    KeepCollection<Dictionary<string, string>> keepCollection = keepTest as KeepCollection<Dictionary<string, string>>;
    if (keepCollection != null)
    {
        foreach (KeepTest<Dictionary<string, string>> member in keepCollection.KeepTestCollection)
        {
            needsPairEnumeration |= member is KeepPredictorTargetPairs;
        }
    }

    if (needsPairEnumeration)
    {
        return UniversalWorkListPredTargPairs.GetInstance(
            predictorNameAndCaseIdToNonMissingValueEnumeration,
            targetNameAndCaseIdToNonMissingValueEnumeration,
            nullDataCollection,
            nullIndexRange,
            keepTest);
    }

    return new UniversalWorkList(
        predictorNameAndCaseIdToNonMissingValueEnumeration,
        targetNameAndCaseIdToNonMissingValueEnumeration,
        nullDataCollection,
        nullIndexRange,
        keepTest);
}
/// <summary>
/// Stores the supplied enumerations, null-data collection, null-index range and keep test
/// in the corresponding instance fields. No other work is done here.
/// </summary>
protected UniversalWorkList(
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> predictorNameAndCaseIdToNonMissingValueEnumeration,
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> targetNameAndCaseIdToNonMissingValueEnumeration,
    NullDataCollection nullDataCollection,
    RangeCollection nullIndexRange,
    KeepTest<Dictionary<string, string>> keepTest)
{
    // Data sources.
    _predictorNameAndCaseIdToNonMissingValueEnumeration = predictorNameAndCaseIdToNonMissingValueEnumeration;
    _targetNameAndCaseIdToNonMissingValueEnumeration = targetNameAndCaseIdToNonMissingValueEnumeration;

    // Null-data configuration.
    _nullDataCollection = nullDataCollection;
    _nullIndexRange = nullIndexRange;

    // Row filter.
    _keepTest = keepTest;
}
/// <summary>
/// Passes every argument straight through to the base-class constructor; this class adds
/// no construction logic of its own.
/// </summary>
protected UniversalWorkListPredTargPairs(
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> predictorNameAndCaseIdToNonMissingValueEnumeration,
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> targetNameAndCaseIdToNonMissingValueEnumeration,
    NullDataCollection nullDataCollection,
    RangeCollection nullIndexRange,
    KeepTest<Dictionary<string, string>> keepTest)
    : base(
        predictorNameAndCaseIdToNonMissingValueEnumeration,
        targetNameAndCaseIdToNonMissingValueEnumeration,
        nullDataCollection,
        nullIndexRange,
        keepTest)
{
}
/// <summary>
/// Creates a <c>UniversalWorkListPredTargPairs</c> from the given arguments and returns it
/// typed as <c>UniversalWorkList</c>. Hides the base-class factory of the same name.
/// </summary>
/// <returns>A new <c>UniversalWorkListPredTargPairs</c>.</returns>
new public static UniversalWorkList GetInstance(
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> predictorNameAndCaseIdToNonMissingValueEnumeration,
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> targetNameAndCaseIdToNonMissingValueEnumeration,
    NullDataCollection nullDataCollection,
    RangeCollection nullIndexRange,
    KeepTest<Dictionary<string, string>> keepTest)
{
    return new UniversalWorkListPredTargPairs(
        predictorNameAndCaseIdToNonMissingValueEnumeration,
        targetNameAndCaseIdToNonMissingValueEnumeration,
        nullDataCollection,
        nullIndexRange,
        keepTest);
}
/// <summary>
/// Decides whether this test is compatible with <paramref name="keepTestNew"/>.
/// A <c>K1</c> is compatible when this instance's <c>k1</c> is less than or equal to the new one's.
/// An <c>And</c> is compatible when every one of its conjuncts is compatible (checked recursively).
/// Any other kind of test is reported as incompatible.
/// </summary>
/// <remarks>
/// It would be nicer if this class did not have to know every class it is compatible with;
/// the <c>And</c> handling is duplicated in several places.
/// </remarks>
public override bool IsCompatibleWithNewKeepTest(KeepTest<Dictionary<string, string>> keepTestNew)
{
    K1 newK1 = keepTestNew as K1;
    if (newK1 != null)
    {
        return k1 <= newK1.k1;
    }

    And<Dictionary<string, string>> conjunction = keepTestNew as And<Dictionary<string, string>>;
    if (conjunction == null)
    {
        return false;
    }

    foreach (KeepTest<Dictionary<string, string>> conjunct in conjunction.KeepTestCollection)
    {
        if (!IsCompatibleWithNewKeepTest(conjunct))
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Always reports incompatibility, whatever <paramref name="keepTestNew"/> is.
/// This is the conservative answer; the check could be made tighter.
/// </summary>
/// <returns>Always false.</returns>
public override bool IsCompatibleWithNewKeepTest(KeepTest<Dictionary<string, string>> keepTestNew)
{
    return false;
}
/// <summary>
/// Builds the work list from the predictor and target sparse files, then writes one report line
/// per selected work item to a text file in <paramref name="outputDirectoryName"/>.
/// Rows whose index is in <paramref name="skipRowIndexRangeCollectionOrNull"/> are skipped, and of the
/// remaining rows only those whose work index falls in <paramref name="pieceIndexRangeCollection"/> are processed.
/// The elapsed time is written to the console at the end.
/// </summary>
/// <param name="skipRowIndexRangeCollectionOrNull">Row indexes to skip, or null to skip nothing.</param>
/// <param name="outputDirectoryName">Directory for the output file; created if needed.</param>
/// <param name="optimizerName">Not read by this method.</param>
public void Run(
    ModelScorer modelScorer,
    PhyloTree phyloTree,
    string predictorSparseFileName,
    string targetSparseFileName,
    string leafDistributionName,
    string nullDataGeneratorName,
    KeepTest<Dictionary<string, string>> keepTest,
    RangeCollection skipRowIndexRangeCollectionOrNull,
    string shortName,
    string outputDirectoryName,
    RangeCollection pieceIndexRangeCollection,
    int pieceCount,
    RangeCollection nullIndexRangeCollection,
    string optimizerName)
{
    Stopwatch timer = Stopwatch.StartNew();

    Directory.CreateDirectory(outputDirectoryName);

    bool hasSkipList = skipRowIndexRangeCollectionOrNull != null;
    string skipSuffix = hasSkipList
        ? ".Skip" + skipRowIndexRangeCollectionOrNull.Count().ToString()
        : "";
    string outputFileName = string.Format(@"{0}\{1}.{2}.{3}.{4}.{5}.{6}{7}.txt",
        outputDirectoryName, shortName, leafDistributionName, nullDataGeneratorName,
        nullIndexRangeCollection, pieceCount, pieceIndexRangeCollection, skipSuffix);

    bool speedOverMemory = true;
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> predictorEnumeration =
        CreateNameAndCaseIdToNonMissingValueEnumeration(predictorSparseFileName, speedOverMemory);
    IEnumerable<Pair<string, Dictionary<string, SufficientStatistics>>> targetEnumeration =
        CreateNameAndCaseIdToNonMissingValueEnumeration(targetSparseFileName, speedOverMemory);

    NullDataCollection nullDataGenerator = CreateNullDataGenerator(
        nullDataGeneratorName, modelScorer, phyloTree, nullIndexRangeCollection,
        predictorEnumeration, targetEnumeration);

    UniversalWorkList workList = UniversalWorkList.GetInstance(
        predictorEnumeration, targetEnumeration, nullDataGenerator, nullIndexRangeCollection, keepTest);

    // First pass over the work list: count the rows, then subtract the ones that will be skipped.
    int workListCount = SpecialFunctions.Count(workList.List());
    int effectiveWorkListCount = workListCount;
    if (hasSkipList)
    {
        for (int candidate = 0; candidate < workListCount; candidate++)
        {
            if (skipRowIndexRangeCollectionOrNull.Contains(candidate))
            {
                effectiveWorkListCount--;
            }
        }
    }
    Console.WriteLine("{0} Total rows. Skipping {1} of them.", workListCount, workListCount - effectiveWorkListCount);

    using (TextWriter textWriter = File.CreateText(outputFileName))
    {
        textWriter.WriteLine(Header);

        int rowIndex = -1;          // index among all rows
        int effectiveRowIndex = -1; // index among the rows that are not skipped

        // Second pass over the work list: produce the report lines.
        foreach (RowData rowAndTargetData in workList.List())
        {
            ++rowIndex;
            Debug.Assert(rowIndex < workListCount); // real assert

            if (hasSkipList && skipRowIndexRangeCollectionOrNull.Contains(rowIndex))
            {
                continue;
            }

            ++effectiveRowIndex;
            int workIndex = ExtractWorkIndex(effectiveRowIndex, pieceCount, effectiveWorkListCount);
            if (!pieceIndexRangeCollection.Contains(workIndex))
            {
                continue;
            }

            Debug.WriteLine("WorkItemIndex " + rowIndex.ToString());

            string reportLine;
            try
            {
                reportLine = CreateReportLine(modelScorer, phyloTree, rowAndTargetData, workList, rowIndex, workListCount, workIndex);
            }
            catch (OutOfMemoryException)
            {
                // Retry the same row once after dropping the scorer's cache.
                Console.WriteLine("OUT OF MEMORY!! Clearing cache and trying to recover where we left off.");
                modelScorer.ClearCache();
                reportLine = CreateReportLine(modelScorer, phyloTree, rowAndTargetData, workList, rowIndex, workListCount, workIndex);
            }

            // Flush after every row so finished rows reach the file immediately.
            textWriter.WriteLine(reportLine);
            textWriter.Flush();
        }
    }

    timer.Stop();
    Console.WriteLine("Running time: " + timer.Elapsed);
}
/// <summary>
/// Not implemented for this keep test.
/// </summary>
/// <param name="keepTestNew">Ignored.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override bool IsCompatibleWithNewKeepTest(KeepTest<Dictionary<string, string>> keepTestNew)
{
    // Throw the specific exception type rather than bare System.Exception. The message is
    // unchanged, and NotImplementedException still derives from Exception, so existing
    // catch (Exception) handlers keep working.
    throw new NotImplementedException("The method or operation is not implemented.");
}
/// <summary>
/// Always reports incompatibility; <paramref name="keepTestNew"/> is not inspected.
/// </summary>
/// <returns>Always false.</returns>
public override bool IsCompatibleWithNewKeepTest(KeepTest<Dictionary<string, string>> keepTestNew)
{
    return false;
}
/// <summary>
/// No compatibility logic has been written for this keep test yet, so it always answers false
/// without looking at <paramref name="keepTestNew"/>.
/// </summary>
/// <returns>Always false.</returns>
public override bool IsCompatibleWithNewKeepTest(KeepTest<Dictionary<string, string>> keepTestNew)
{
    return false;
}
/// <summary>
/// Reads every file in the current directory that matches <paramref name="inputFilePattern"/> and,
/// for each row accepted by both the row-index tabulator and <paramref name="keepTest"/>, records its
/// null index. Rows with a p-value of at most <paramref name="maxPValue"/> are added to
/// <paramref name="realRowCollectionToSort"/> when the null index is -1, otherwise their p-value is
/// added to <paramref name="nullValueCollectionToBeSorted"/>.
/// </summary>
/// <param name="headerSoFar">
/// Set to the header of the first file that yields a row if it was null on entry; later files with a
/// different header only produce a console warning.
/// </param>
/// <returns>The set of null indexes seen in accepted rows.</returns>
/// <remarks>Very similar code exists elsewhere.</remarks>
private static Set<int> CreateTabulateReportInternal(
    string inputFilePattern,
    KeepTest<Dictionary<string, string>> keepTest,
    double maxPValue,
    bool auditRowIndexValues,
    ref List<Dictionary<string, string>> realRowCollectionToSort,
    ref List<double> nullValueCollectionToBeSorted,
    ref string headerSoFar)
{
    Set<int> seenNullIndexes = Set<int>.GetInstance();
    RowIndexTabulator rowIndexTabulator = RowIndexTabulator.GetInstance(auditRowIndexValues);

    string searchDirectory = Directory.GetCurrentDirectory();
    foreach (string fileName in Directory.GetFiles(searchDirectory, inputFilePattern))
    {
        Debug.WriteLine(fileName);

        string headerOnFile;
        bool headerChecked = false;
        foreach (Dictionary<string, string> row in SpecialFunctions.TabFileTable(fileName, /*includeWholeLine*/ true, out headerOnFile))
        {
            // The header is compared once per file, when its first row arrives.
            if (!headerChecked)
            {
                headerChecked = true;
                if (headerSoFar == null)
                {
                    headerSoFar = headerOnFile;
                }
                else if (headerSoFar != headerOnFile)
                {
                    Console.WriteLine("Warning: The header for file {0} is different from the 1st file read in", fileName);
                }
            }

            if (!rowIndexTabulator.TryAdd(row, fileName) || !keepTest.Test(row))
            {
                continue;
            }

            string missingColumnMessage = string.Format(@"When tabulating a ""{0}"" column is required. (File ""{1}"")", NullIndexColumnName, fileName);
            SpecialFunctions.CheckCondition(row.ContainsKey(NullIndexColumnName), missingColumnMessage);

            int nullIndex = int.Parse(row[NullIndexColumnName]);
            seenNullIndexes.AddNewOrOld(nullIndex);

            double pValue = AccessPValueFromPhylotreeRow(row);
            if (!(pValue <= maxPValue))
            {
                continue;
            }

            // Null index -1 marks a row from the real data; anything else is a null-data row.
            if (nullIndex == -1)
            {
                realRowCollectionToSort.Add(row);
            }
            else
            {
                nullValueCollectionToBeSorted.Add(pValue);
            }
        }
    }

    rowIndexTabulator.CheckIsComplete(inputFilePattern);
    return seenNullIndexes;
}
/// <summary>
/// Does the work: runs the PhyloD driver for this task inside a temporary sandbox directory,
/// captures everything written to the console while it runs, and stores the content of the
/// produced output file in <c>mOutputData</c>.
/// </summary>
/// <remarks>
/// If the expected output file is missing after the run, the task result is marked as failed with
/// <c>TaskFailureReason.MissingOutput</c> and no attempt is made to read the file.
/// The console streams are restored and the sandbox directory is removed even when the run throws.
/// </remarks>
public override void DoWork()
{
    // get our input data and null the field to make sure we don't serialize it back
    InputData inputData = mInputData;
    mInputData = null;

    // get the job-specific names of input files
    FileDefCollection fileDefs = Job.FileDefs;
    string treeFileName = Utility.GetNamedFileDef(fileDefs, Constants.TreeFileDefName).LocalName;
    string predictorFileName = Utility.GetNamedFileDef(fileDefs, Constants.PredictorFileDefName).LocalName;
    string targetFileName = Utility.GetNamedFileDef(fileDefs, Constants.TargetFileDefName).LocalName;
    string skipRowIndexFileName = Utility.GetNamedFileDef(fileDefs, Constants.SkipRowIndexFileDefName).LocalName;

    // construct RangeCollections
    RangeCollection pieceIndexRangeCollection = RangeCollection.Parse(inputData.PieceIndexRange);
    RangeCollection nullIndexRangeCollection = RangeCollection.Parse(inputData.NullIndexRange);
    RangeCollection skipRowIndexRangeCollection;
    FileInfo fileInfo = new FileInfo(skipRowIndexFileName);
    if (fileInfo.Length > 0)
    {
        skipRowIndexRangeCollection = RangeCollection.Parse(File.ReadAllText(skipRowIndexFileName));
    }
    else
    {
        // an empty skip file means "skip nothing"
        skipRowIndexRangeCollection = null;
    }

    // do the rest
    PhyloTree aPhyloTree = PhyloTree.GetInstance(treeFileName, null);
    ModelScorer modelScorer = ModelScorer.GetInstance(aPhyloTree, inputData.LeafDistributionName, inputData.OptimizerName);
    ModelEvaluator modelEvaluator = ModelEvaluator.GetInstance(inputData.LeafDistributionName, modelScorer);
    KeepTest<Dictionary<string, string>> keepTest = KeepTest<Dictionary<string, string>>.GetInstance(null, inputData.KeepTestName);
    PhyloDDriver driver = PhyloDDriver.GetInstance();

    // create a name for the temporary job sandbox. This directory gets created by driver.Run(...)
    string agentOutputDirectoryName = Path.Combine(
        Environment.CurrentDirectory,
        String.Format(CultureInfo.InvariantCulture, "{0}.{1}", Job.JobId, Task.TaskId));

    // remember the real console streams so they can be put back once the run is over
    TextWriter originalOut = Console.Out;
    TextWriter originalError = Console.Error;

    // save the standard out and standard error in memory streams
    using (MemoryStream streamOut = new MemoryStream(), streamError = new MemoryStream())
    {
        try
        {
            // redirect the outputs
            // NOTE(review): the writers use StreamWriter's default encoding while the text is decoded
            // with Encoding.Default below; these may differ for non-ASCII output - confirm.
            using (StreamWriter writerOut = new StreamWriter(streamOut), writerError = new StreamWriter(streamError))
            {
                Console.SetOut(writerOut);
                Console.SetError(writerError);
                try
                {
                    // run the model
                    string outputFileName = driver.Run(
                        modelEvaluator,
                        predictorFileName,
                        targetFileName,
                        inputData.LeafDistributionName,
                        inputData.NullDataGeneratorName,
                        keepTest,
                        skipRowIndexRangeCollection,
                        inputData.NiceName,
                        agentOutputDirectoryName,
                        pieceIndexRangeCollection,
                        inputData.PieceCount,
                        nullIndexRangeCollection,
                        inputData.OptimizerName);

                    // this is the expected output file name -- save this so it can be written on the master side with the same name.
                    mOutputFileName = Path.GetFileName(outputFileName);
                    mLocalOutputFileName = Path.Combine(inputData.LocalOutputDirectoryName, mOutputFileName);

                    // get the output data
                    string fullOutputPath = Path.Combine(agentOutputDirectoryName, mOutputFileName);
                    if (File.Exists(fullOutputPath))
                    {
                        using (StreamReader outputData = new StreamReader(fullOutputPath))
                        {
                            mOutputData = outputData.ReadToEnd();
                        }
                    }
                    else
                    {
                        // Report the failure through the task result instead of falling through to a
                        // FileNotFoundException, and name the file that is actually missing.
                        TaskResult.FailureReason = TaskFailureReason.MissingOutput;
                        TaskResult.FailureMessage = String.Format(CultureInfo.CurrentCulture, "Cannot find output file '{0}'", fullOutputPath);
                        TaskResult.Status = TaskAssignmentStatus.Failed;
                    }
                }
                finally
                {
                    // Put the console back before the redirect writers are disposed, so later
                    // Console writes do not hit a closed writer.
                    Console.SetOut(originalOut);
                    Console.SetError(originalError);

                    // get rid of the sandbox; it may not exist if the run failed before creating it,
                    // and deleting a missing directory would hide the original exception
                    if (Directory.Exists(agentOutputDirectoryName))
                    {
                        Directory.Delete(agentOutputDirectoryName, true);
                    }
                }
            }
        }
        finally
        {
            // this finally is to make sure we get console output.
            // ToArray returns only the bytes actually written (GetBuffer also returns the unused
            // tail of the internal buffer) and still works after the stream has been closed.
            Encoding encoding = Encoding.Default;
            TaskResult.StandardOutput = encoding.GetString(streamOut.ToArray());
            TaskResult.StandardError = encoding.GetString(streamError.ToArray());
        }
    }
}
/// <summary>
/// Reads every file in <paramref name="dirinfo"/> matching <paramref name="inputFilePattern"/> and sorts
/// the rows accepted by the row-index tabulator and <paramref name="globalKeepTest"/> into the per-split
/// collections supplied by the caller. Progress is written to the console on a single, overwritten line.
/// </summary>
/// <param name="nullIndexSet">Receives the set of null indexes seen in accepted rows.</param>
/// <param name="dirinfo">Directory that is searched for input files.</param>
/// <param name="inputFilePattern">Pattern selecting the input files.</param>
/// <param name="globalKeepTest">Rows failing this test are ignored.</param>
/// <param name="splitKeepTestList">Used to pick the split index of each row.</param>
/// <param name="maxPValue">Rows with a larger p-value are not stored.</param>
/// <param name="auditRowIndexValues">Passed to the row-index tabulator that is returned.</param>
/// <param name="useStoreyTibsharaniMethod">
/// When true, only rows with null index -1 are used, every such row is counted in
/// <paramref name="totalPValueCount"/>, and a missing null-index column is treated as -1.
/// </param>
/// <param name="realRowCollectionToSortArray">Per split: rows with null index -1 and their p-values.</param>
/// <param name="nullValueCollectionToBeSortedArrayDict">Per split and group id: p-values of null-data rows.</param>
/// <param name="totalPValueCount">Per split: number of real rows seen (Storey-Tibshirani mode only).</param>
/// <param name="headerSoFar">Set to the first file's header if null on entry; later mismatches only warn.</param>
/// <param name="doLocalTabulationOfPermutationsToGetPvaluesFromRandomizations">
/// When true the group id is read from the row's group-id column; otherwise group id 0 is used.
/// </param>
/// <returns>The row-index tabulator that saw every row; the caller decides whether it is complete.</returns>
/// <remarks>Very similar code exists elsewhere.</remarks>
private static RowIndexTabulator TryCreateTabulateReportInternal(
    out Set<int> nullIndexSet,
    DirectoryInfo dirinfo,
    string inputFilePattern,
    KeepTest<Dictionary<string, string>> globalKeepTest,
    List<KeepTest<Dictionary<string, string>>> splitKeepTestList,
    double maxPValue,
    bool auditRowIndexValues,
    bool useStoreyTibsharaniMethod,
    ref List<KeyValuePair<Dictionary<string, string>, double>>[] realRowCollectionToSortArray,
    ref Dictionary<int, List<double>>[] nullValueCollectionToBeSortedArrayDict,
    ref int[] totalPValueCount,
    ref string headerSoFar,
    bool doLocalTabulationOfPermutationsToGetPvaluesFromRandomizations)
{
    nullIndexSet = Set<int>.GetInstance();
    RowIndexTabulator rowIndexTabulator = RowIndexTabulator.GetInstance(auditRowIndexValues);

    int previousStatusLength = 0; // width used to blank out the previous status line
    int nullValueCount = 0;

    foreach (FileInfo fileinfo in dirinfo.GetFiles(inputFilePattern))
    {
        try
        {
            // Status line for the file about to be read.
            int sigLines = realRowCollectionToSortArray.Sum(split => split.Count);
            int totalLines = sigLines + nullValueCount + totalPValueCount.Sum();
            string statusLine = string.Format("{0}/{1} lines have p<=1. Now reading {2}", sigLines, totalLines, fileinfo.FullName);
            Console.Write("\r{0,-" + previousStatusLength + "}", statusLine);
            previousStatusLength = statusLine.Length;

            // First open: read just the header line.
            string headerOnFile;
            using (TextReader headerReader = SpecialFunctions.GetTextReaderWithExternalReadWriteAccess(fileinfo.FullName))
            {
                headerOnFile = headerReader.ReadLine();
            }
            if (headerSoFar == null)
            {
                headerSoFar = headerOnFile;
            }
            else if (headerSoFar != headerOnFile)
            {
                Console.WriteLine("Warning: The header for file {0} is different from the 1st file read in", fileinfo.Name);
            }

            // Second open: walk the rows.
            using (TextReader rowReader = SpecialFunctions.GetTextReaderWithExternalReadWriteAccess(fileinfo.FullName))
            {
                foreach (Dictionary<string, string> row in SpecialFunctions.TabFileTable(rowReader, headerOnFile, /*includeWholeLine*/ true))
                {
                    if (!rowIndexTabulator.TryAdd(row, fileinfo.FullName) || !globalKeepTest.Test(row))
                    {
                        continue;
                    }

                    // With Storey-Tibshirani a missing null-index column means "real data" (-1).
                    int nullIndex;
                    if (useStoreyTibsharaniMethod && !row.ContainsKey(NullIndexColumnName))
                    {
                        nullIndex = -1;
                    }
                    else
                    {
                        nullIndex = int.Parse(row[NullIndexColumnName]);
                    }
                    nullIndexSet.AddNewOrOld(nullIndex);

                    double pValue = AccessPValueFromPhylotreeRow(row);
                    bool isReal = nullIndex == -1;

                    if (useStoreyTibsharaniMethod)
                    {
                        // Null-data rows are not used in this mode.
                        if (!isReal)
                        {
                            continue;
                        }

                        int storeySplit = GetSplitTabulateIndex(row, splitKeepTestList);
                        if (pValue <= maxPValue)
                        {
                            realRowCollectionToSortArray[storeySplit].Add(new KeyValuePair<Dictionary<string, string>, double>(row, pValue));
                        }
                        // Every real row is counted, whether or not it was stored.
                        totalPValueCount[storeySplit]++;
                        continue;
                    }

                    if (!(pValue <= maxPValue))
                    {
                        continue;
                    }

                    int splitIdx = GetSplitTabulateIndex(row, splitKeepTestList);
                    if (isReal)
                    {
                        realRowCollectionToSortArray[splitIdx].Add(new KeyValuePair<Dictionary<string, string>, double>(row, pValue));
                        continue;
                    }

                    // Without local tabulation every null p-value goes under key 0.
                    int groupId = doLocalTabulationOfPermutationsToGetPvaluesFromRandomizations
                        ? int.Parse(row[GroupIdColumnName])
                        : 0;
                    nullValueCollectionToBeSortedArrayDict[splitIdx].GetValueOrDefault(groupId).Add(pValue);
                    nullValueCount++;
                }
            }
        }
        catch
        {
            // Name the offending file, then let the original exception continue.
            Console.WriteLine("\nFailure parsing {0}.", fileinfo.Name);
            throw;
        }
    }

    Console.WriteLine("\r{0,-" + previousStatusLength + "}", "Read all files.");
    return rowIndexTabulator;
}
/// <summary>
/// Command-line entry point: parses the optional and positional arguments, validates them,
/// builds the tree, scorer, evaluator and keep test, and hands everything to <c>PhyloDDriver.Run</c>.
/// With the "help" flag it prints the usage and help messages and returns.
/// On any exception it prints the message (and the inner exception's message, if any) followed by
/// the usage message, then rethrows.
/// </summary>
static void Main(string[] args)
{
    try
    {
        ArgCollection argCollection = ArgCollection.GetInstance(args);

        if (argCollection.ExtractOptionalFlag("help"))
        {
            Console.WriteLine("");
            Console.WriteLine(UsageMessage);
            Console.WriteLine(HelpMessage);
            return;
        }

        // Optional arguments.
        string optimizerName = argCollection.ExtractOptional<string>("optimizer", "BrentThenGrid");
        string keepTestName = argCollection.ExtractOptional<string>("keepTest", "AlwaysKeep");
        string skipRowIndexFileNameOrNull = argCollection.ExtractOptional<string>("skipRowIndexFile", null);
        argCollection.CheckNoMoreOptions();

        // Positional arguments, in the order they must appear on the command line.
        string treeFileName = argCollection.ExtractNext<string>("treeFile");
        string predictorFileName = argCollection.ExtractNext<string>("predictorFile");
        string targetFileName = argCollection.ExtractNext<string>("targetFile");
        string leafDistributionName = argCollection.ExtractNext<string>("leafDistribution");
        string nullDataGeneratorName = argCollection.ExtractNext<string>("nullDataGenerator");
        string niceName = argCollection.ExtractNext<string>("niceName");
        string outputDirectory = argCollection.ExtractNext<string>("outputDirectory");
        RangeCollection pieceIndexRangeCollection = argCollection.ExtractNext<RangeCollection>("pieceIndexRange");
        int pieceCount = argCollection.ExtractNext<int>("pieceCount");
        RangeCollection nullIndexRangeCollection = argCollection.ExtractNext<RangeCollection>("nullIndexRange");
        argCollection.CheckThatEmpty();

        bool distributionIsKnown = PhyloDDriver.ValidateDistribution(leafDistributionName);
        if (!distributionIsKnown)
        {
            Console.WriteLine("{0} is not a recognized distribution name. Please choose a name from the following list:", leafDistributionName);
            foreach (string name in PhyloDDriver.GetDistributionNames())
            {
                Console.WriteLine("\t{0}", name);
            }
            throw new ArgumentException("Invalid distribution name.");
        }

        // A missing option or the literal text "null" both mean "no skip file".
        RangeCollection skipRowIndexRangeCollectionOrNull;
        if (skipRowIndexFileNameOrNull == null || skipRowIndexFileNameOrNull == "null")
        {
            skipRowIndexRangeCollectionOrNull = null;
        }
        else
        {
            skipRowIndexRangeCollectionOrNull = RangeCollection.Parse(File.ReadAllText(skipRowIndexFileNameOrNull));
        }

        KeepTest<Dictionary<string, string>> keepTest = KeepTest<Dictionary<string, string>>.GetInstance(null, keepTestName);

        SpecialFunctions.CheckCondition(pieceIndexRangeCollection.IsBetween(0, pieceCount - 1), "pieceIndex must be at least 0 and less than pieceCount");
        SpecialFunctions.CheckCondition(nullIndexRangeCollection.IsBetween(-1, int.MaxValue), "nullIndex must be at least -1");

        PhyloTree aPhyloTree = PhyloTree.GetInstance(treeFileName, null);
        ModelScorer modelScorer = ModelScorer.GetInstance(aPhyloTree, leafDistributionName, optimizerName);
        ModelEvaluator modelEvaluator = ModelEvaluator.GetInstance(leafDistributionName, modelScorer);

        PhyloDDriver driver = PhyloDDriver.GetInstance();
        driver.Run(
            modelEvaluator,
            predictorFileName,
            targetFileName,
            leafDistributionName,
            nullDataGeneratorName,
            keepTest,
            skipRowIndexRangeCollectionOrNull,
            niceName,
            outputDirectory,
            pieceIndexRangeCollection,
            pieceCount,
            nullIndexRangeCollection,
            optimizerName);
    }
    catch (Exception exception)
    {
        Console.WriteLine("");
        Console.WriteLine(exception.Message);
        Exception inner = exception.InnerException;
        if (inner != null)
        {
            Console.WriteLine(inner.Message);
        }
        Console.WriteLine("");
        Console.WriteLine(UsageMessage);
        throw;
    }
}
/// <summary>
/// Command-line entry point for Tabulate. Strips the optional switches (-NoAudit, -MaxPValue value,
/// -KeepTest value) from the argument list, treats the last remaining argument as the output file name
/// and the others as input file patterns, and calls <c>Tabulate.CreateTabulateReport</c>.
/// On any exception it prints the message (and the inner exception's message, if any) and the usage
/// text, then rethrows.
/// </summary>
/// <param name="argsx">Raw command-line arguments.</param>
static void Main(string[] argsx)
{
    try
    {
        List<string> argumentCollection = new List<string>(argsx);

        // -NoAudit: turn off the rowIndex/rowCount audit (it is on by default).
        bool auditRowIndexValues = true;
        string noAuditFlag = "-NoAudit";
        if (argumentCollection.Contains(noAuditFlag))
        {
            argumentCollection.Remove(noAuditFlag);
            auditRowIndexValues = false;
        }

        // -MaxPValue value: ignore pValues greater than this.
        double maxPValue = 1.0;
        string maxPValueFlag = "-MaxPValue";
        int maxPValuePosition = argumentCollection.IndexOf(maxPValueFlag);
        if (maxPValuePosition >= 0)
        {
            // After removing the flag, its value sits at the same position.
            argumentCollection.RemoveAt(maxPValuePosition);
            SpecialFunctions.CheckCondition(maxPValuePosition < argumentCollection.Count, "pValue expected after -MaxPValue");
            maxPValue = double.Parse(argumentCollection[maxPValuePosition]);
            argumentCollection.RemoveAt(maxPValuePosition);
        }

        // -KeepTest value: only rows passing this test are used; every row is kept by default.
        KeepTest<Dictionary<string, string>> keepTest;
        string keepTestFlag = "-KeepTest";
        int keepTestPosition = argumentCollection.IndexOf(keepTestFlag);
        if (keepTestPosition >= 0)
        {
            argumentCollection.RemoveAt(keepTestPosition);
            // The message now names the switch that is actually missing its value.
            SpecialFunctions.CheckCondition(keepTestPosition < argumentCollection.Count, "KeepTest expected after -KeepTest");
            keepTest = KeepTest<Dictionary<string, string>>.GetInstance(null, argumentCollection[keepTestPosition]);
            argumentCollection.RemoveAt(keepTestPosition);
        }
        else
        {
            keepTest = new AlwaysKeep<Dictionary<string, string>>();
        }

        SpecialFunctions.CheckCondition(argumentCollection.Count > 1, "Expect 2 or more parameters");

        // The last argument is the output file; everything before it is an input pattern.
        string outputFileName = argumentCollection[argumentCollection.Count - 1];
        argumentCollection.RemoveAt(argumentCollection.Count - 1);

        Tabulate.CreateTabulateReport(argumentCollection, outputFileName, keepTest, maxPValue, auditRowIndexValues);
    }
    catch (Exception e)
    {
        Console.WriteLine("");
        Console.WriteLine(e.Message);
        if (e.InnerException != null)
        {
            Console.WriteLine(e.InnerException.Message);
        }
        Console.WriteLine(@" Usage: Tabulate {-NoAudit} {-MaxPValue maxPValue} {-KeepTest keeptest} broadInputFileNamePattern1 {broadInputFileNamePattern2 ...} outputFileName Each broadInputFileNamePattern1 is of the form narrowInputFileNamePattern1{+narrowInputFileNamePattern2...} Each broadInputFileNamePattern must cover the same range of nullIndexes (including -1, the real index). Each narrowInputFileNamePattern within a broadInputFileNamePattern must cover a disjoint set of nullIndexes. For example Tabulate -MaxPValue .05 raw\GagEscape0606*-1-19*.txt raw\GagReversion0606*-1-9*.txt+raw\GagReversion0606*10-19*.txt AllGag.qValue.txt Notice that broad pattern raw\GagEscape0606*-1-19*.txt has one narrow pattern and covers nullIndex's -1 to 19 While broad pattern raw\GagReversion0606*-1-9*.txt+raw\GagReversion0606*10-19*.txt has two narrow patterns: raw\GagReversion0606*-1-9*.txt, which covers nullIndexes -1 to 9 raw\GagReversion0606*10-19*.txt which covers nullIndexes 10 to 19 By default, ""Tabulate"" will audit the ""rowIndex"" and ""rowCount"" values in the input to remove duplicates and check that all rows are present. Use ""-NoAudit"" when this is not desired. Use ""-MaxPValue maxPValue"", where maxPValue is a double, to ignore rows with obviously bad rows ");
        throw;
    }
}
/// <summary>
/// Single-pattern convenience overload: wraps <paramref name="inputFilePattern"/> in a one-element
/// list and forwards to the overload that takes a collection of patterns.
/// </summary>
/// <returns>The result of the forwarded call.</returns>
public static bool CreateTabulateReport(DirectoryInfo dirinfo, string inputFilePattern, string outputFileName, KeepTest<Dictionary<string, string>> globalKeepTest, List<KeepTest<Dictionary<string, string>>> splitKeepTestList, double maxPValue, bool auditRowIndexValues, bool useStoreyTibsharaniMethod)
{
    return CreateTabulateReport(
        dirinfo,
        SpecialFunctions.CreateSingletonList(inputFilePattern),
        outputFileName,
        globalKeepTest,
        splitKeepTestList,
        maxPValue,
        auditRowIndexValues,
        useStoreyTibsharaniMethod);
}
/// <summary>
/// Creates the keep test named by <paramref name="keepTestName"/>. The name is matched against the
/// known prefixes in a fixed order; for tests that take an argument, the text after the prefix is
/// passed to that test's own factory. Names matching none of the prefixes are delegated to
/// <c>KeepTest&lt;TRow&gt;.GetInstance</c>.
/// </summary>
/// <remarks>
/// The order of the checks matters: the first matching prefix wins (for example the
/// <c>KeepSpecificRows</c> check comes before the <c>KeepSpecificRow</c> check).
/// It would be nice if the classes could parse themselves.
/// </remarks>
new public static KeepTest<Dictionary<string, string>> GetInstance(string inputDirectory, string binarySeqFileName, string hlaFileName, string keepTestName, int merSize, Dictionary<int, string> pidToCaseName)
{
    if (keepTestName.StartsWith(KeepEndOfGag.Prefix))
    {
        string keepItText = keepTestName.Substring(KeepEndOfGag.Prefix.Length);
        return KeepEndOfGag.GetInstance(bool.Parse(keepItText));
    }

    if (keepTestName.StartsWith(K1.Prefix))
    {
        string k1Text = keepTestName.Substring(K1.Prefix.Length);
        return K1.GetInstance(int.Parse(k1Text));
    }

    if (keepTestName.StartsWith(KeepNonOverlappingAA.Prefix))
    {
        return KeepNonOverlappingAA.GetInstance();
    }

    if (keepTestName.StartsWith(KeepGene.Prefix))
    {
        return KeepGene.GetInstance(keepTestName.Substring(KeepGene.Prefix.Length));
    }

    if (keepTestName.StartsWith(KeepSpecificRows.Prefix))
    {
        return KeepSpecificRows.GetInstance(keepTestName.Substring(KeepSpecificRows.Prefix.Length));
    }

    if (keepTestName.StartsWith(KeepSpecificRow.Prefix))
    {
        return KeepSpecificRow.GetInstance(keepTestName.Substring(KeepSpecificRow.Prefix.Length));
    }

    if (keepTestName.StartsWith(KeepSpecificGenes.Prefix))
    {
        return KeepSpecificGenes.GetInstance(keepTestName.Substring(KeepSpecificGenes.Prefix.Length));
    }

    if (keepTestName.StartsWith(KeepOneOfAAPair.Prefix))
    {
        return KeepOneOfAAPair.GetInstance();
    }

    if (keepTestName.StartsWith(KeepAllButSamePosition.Prefix))
    {
        return KeepAllButSamePosition.GetInstance();
    }

    if (keepTestName.StartsWith(KeepAllButSameDeletion.Prefix))
    {
        return KeepAllButSameDeletion.GetInstance();
    }

    if (keepTestName.StartsWith(KeepNonTrivialRows.Prefix))
    {
        return new KeepNonTrivialRows();
    }

    if (keepTestName.StartsWith(KeepTestTemp.Prefix))
    {
        return KeepTestTemp.GetInstance();
    }

    if (keepTestName.StartsWith(KeepNonRare.Prefix))
    {
        return KeepNonRare.GetInstance(keepTestName.Substring(KeepNonRare.Prefix.Length));
    }

    if (keepTestName.StartsWith(KeepPredictorTargetPairs.Prefix))
    {
        return KeepPredictorTargetPairs.GetInstance(keepTestName.Substring(KeepPredictorTargetPairs.Prefix.Length));
    }

    if (keepTestName.StartsWith("JointGagPolTest"))
    {
        // Composite test: all three parts must pass; the text after the prefix selects the genes.
        return And<Dictionary<string, string>>.GetInstance(
            KeepOneOfAAPair.GetInstance(),
            KeepNonOverlappingAA.GetInstance(),
            KeepSpecificGenes.GetInstance(keepTestName.Substring("JointGagPolTest".Length)));
    }

    // NOTE(review): null is passed where hlaFileName might be expected; the hlaFileName
    // parameter is never read in this method - confirm this is intentional.
    return KeepTest<TRow>.GetInstance(inputDirectory, binarySeqFileName, null, keepTestName, merSize, pidToCaseName);
}
/// <summary>
/// Reads the rows selected by every input file pattern, computes a q-value for each real row (separately for
/// each split), and writes the rows to <paramref name="outputFileName"/> ordered by q-value, with ties broken
/// by the value returned from <c>AccessPValueFromPhylotreeRow</c>.
/// </summary>
/// <param name="dirinfo">Forwarded to <c>TryCreateTabulateReportInternal</c>.</param>
/// <param name="inputFilePatternCollection">
/// The "broad" input patterns. Each one is split on '+' into "narrow" patterns that are tabulated one at a time;
/// the narrow parts of a single broad pattern must not cover the same null index twice.
/// </param>
/// <param name="outputFileName">
/// File that is created up front. It receives the report on success, or the skip range collection when a
/// tabulator reports that it is not complete.
/// </param>
/// <param name="globalKeepTest">Forwarded to <c>TryCreateTabulateReportInternal</c>.</param>
/// <param name="splitKeepTestList">Forwarded to <c>TryCreateTabulateReportInternal</c>; the number of splits is its count plus one.</param>
/// <param name="maxPValue">Forwarded to <c>TryCreateTabulateReportInternal</c>.</param>
/// <param name="auditRowIndexValues">
/// Forwarded to <c>TryCreateTabulateReportInternal</c>. When true, every broad pattern must also have covered index -1.
/// </param>
/// <param name="useStoreyTibsharaniMethod">
/// True to compute q-values with <c>ComputeQValuesUseStoreyTibsharani</c>; false to use <c>ComputeQValuesUseNulls</c>,
/// which requires at least one randomization run.
/// </param>
/// <param name="numTestsStoreyTibsOverride">
/// When not -1, replaces the observed p-value count of each split as the number of tests handed to the
/// Storey-Tibshirani computation.
/// </param>
/// <param name="doLocalTabulation">Forwarded to <c>TryCreateTabulateReportInternal</c> and <c>ComputeQValuesUseNulls</c>.</param>
/// <returns>
/// True when the report was written. False when a tabulator was not complete, in which case
/// <paramref name="outputFileName"/> holds the skip range collection instead of a report.
/// </returns>
public static bool CreateTabulateReport(DirectoryInfo dirinfo, ICollection<string> inputFilePatternCollection, string outputFileName,
    KeepTest<Dictionary<string, string>> globalKeepTest, List<KeepTest<Dictionary<string, string>>> splitKeepTestList,
    double maxPValue, bool auditRowIndexValues, bool useStoreyTibsharaniMethod, int numTestsStoreyTibsOverride, bool doLocalTabulation)
{
    using (TextWriter textWriter = File.CreateText(outputFileName)) // Do this early so that if it fails, we'll know
    {
        int splitCount = splitKeepTestList.Count + 1;
        List<KeyValuePair<Dictionary<string, string>, double>>[] realRowCollectionToSortArray = new List<KeyValuePair<Dictionary<string, string>, double>>[splitCount];
        Dictionary<int, List<double>>[] nullValueCollectionToBeSortedArray = new Dictionary<int, List<double>>[splitCount];
        int[] totalPValueCount = new int[splitCount];
        for (int i = 0; i < splitCount; i++)
        {
            realRowCollectionToSortArray[i] = new List<KeyValuePair<Dictionary<string, string>, double>>(10000);
            nullValueCollectionToBeSortedArray[i] = new Dictionary<int, List<double>>();
        }

        string headerSoFar = null;
        Set<int> broadRealAndNullIndexSetSoFar = null;
        foreach (string broadInputFilePattern in inputFilePatternCollection)
        {
            Set<int> narrowRealAndNullIndexSetSetSoFar = Set<int>.GetInstance();
            foreach (string narrowInputFilePattern in broadInputFilePattern.Split('+'))
            {
                Set<int> realAndNullIndexSet;
                RowIndexTabulator tabulator = TryCreateTabulateReportInternal(out realAndNullIndexSet, dirinfo, narrowInputFilePattern,
                    globalKeepTest, splitKeepTestList, maxPValue, auditRowIndexValues, useStoreyTibsharaniMethod,
                    ref realRowCollectionToSortArray, ref nullValueCollectionToBeSortedArray, ref totalPValueCount, ref headerSoFar,
                    doLocalTabulation);

                if (!tabulator.IsComplete())
                {
                    // Turn the output file into a skip file and report failure to the caller.
                    textWriter.WriteLine(tabulator.GetSkipRangeCollection());
                    Console.WriteLine("Not all needed rows were found in {0}.", narrowInputFilePattern);
                    Console.WriteLine("Found rows:\n{0}", tabulator.GetSkipRangeCollection());
                    Console.WriteLine("{0} created as skip file.", outputFileName);
                    return false;
                }

                //Instead of throwing an error, we could filter out the duplicated null indexes
                Helper.CheckCondition(narrowRealAndNullIndexSetSetSoFar.IntersectionIsEmpty(realAndNullIndexSet),
                    string.Format("Within inputFilePattern {0}, multiple '+'-connected parts cover the same nullIndex(s), {1}",
                        broadInputFilePattern, narrowRealAndNullIndexSetSetSoFar.Intersection(realAndNullIndexSet)));
                narrowRealAndNullIndexSetSetSoFar.AddNewRange(realAndNullIndexSet);
            }

            Helper.CheckCondition(!auditRowIndexValues || narrowRealAndNullIndexSetSetSoFar.Contains(-1),
                string.Format("The 'null' index -1 for the real data was not seen in {0}", broadInputFilePattern));

            // Only the first broad pattern's index set is remembered. The check that later broad patterns
            // cover the same set of null indexes is disabled in this overload.
            if (broadRealAndNullIndexSetSoFar == null)
            {
                broadRealAndNullIndexSetSoFar = narrowRealAndNullIndexSetSetSoFar;
            }
        }

        // The set is still null only when no input pattern was supplied. Fail with an explicit message
        // instead of a NullReferenceException on the path that needs the set's size.
        Helper.CheckCondition(useStoreyTibsharaniMethod || broadRealAndNullIndexSetSoFar != null,
            "No input file patterns were given, so the number of randomization runs cannot be determined.");

        double numberOfRandomizationRuns = useStoreyTibsharaniMethod ? 0 : broadRealAndNullIndexSetSoFar.Count - 1;
        Console.WriteLine("Detected {0} randomized runs relative to the number of real runs.", numberOfRandomizationRuns);
        Helper.CheckCondition<InvalidDataException>(useStoreyTibsharaniMethod || numberOfRandomizationRuns > 0,
            "No randomization runs detected. Did you mean to include a -{0} flag?", Tabulate.STOREY_METHOD_NAME);

        //Compute q-values from p-values (and p-values from test statistic)
        List<KeyValuePair<Dictionary<string, string>, double>> rowAndQValues = new List<KeyValuePair<Dictionary<string, string>, double>>(1000);

        // rowToPvalFromRandomizations is an 'out' argument, so it is reassigned by every split's call below.
        // Each row's value is therefore copied into this per-row map while its own split's dictionary is
        // still at hand; reading the variable after the loop would consult only the last split's dictionary.
        Dictionary<Dictionary<string, string>, double> pValFromRandomizationsByRow = new Dictionary<Dictionary<string, string>, double>();
        Dictionary<double, double> rowToPvalFromRandomizations = null;

        for (int i = 0; i < splitCount; i++)
        {
            int numTestsToUse;
            if (numTestsStoreyTibsOverride != -1)
            {
                Console.WriteLine("Using " + numTestsStoreyTibsOverride + " p-values for computation of q-values rather than the observed number (" + totalPValueCount[i] + ")");
                numTestsToUse = numTestsStoreyTibsOverride;
            }
            else
            {
                numTestsToUse = totalPValueCount[i];
            }

            Dictionary<Dictionary<string, string>, double> qValueList;
            if (useStoreyTibsharaniMethod)
            {
                qValueList = SpecialFunctions.ComputeQValuesUseStoreyTibsharani(ref realRowCollectionToSortArray[i], row => row.Value, numTestsToUse)
                    .ToDictionary(entry => entry.Key.Key, entry => entry.Value);
            }
            else
            {
                // Local and non-local tabulation issue the same call; doLocalTabulation itself is what tells
                // ComputeQValuesUseNulls which mode to use.
                qValueList = SpecialFunctions.ComputeQValuesUseNulls(ref realRowCollectionToSortArray[i], row => row.Value,
                        row => int.Parse(((KeyValuePair<System.Collections.Generic.Dictionary<string, string>, double>)row).Key["groupId"]),
                        row => int.Parse(((KeyValuePair<System.Collections.Generic.Dictionary<string, string>, double>)row).Key["rowIndex"]),
                        ref nullValueCollectionToBeSortedArray[i], numberOfRandomizationRuns, out rowToPvalFromRandomizations, doLocalTabulation)
                    .ToDictionary(entry => entry.Key.Key, entry => entry.Value);
            }

            // rowToPvalFromRandomizations stays null on the Storey-Tibshirani path; that combination is
            // rejected by the CheckCondition just before the header is written.
            bool capturePValFromRandomizations = COL_TO_TABULATE.TESTSTATISTIC == _columnToTabulate && rowToPvalFromRandomizations != null;

            foreach (KeyValuePair<Dictionary<string, string>, double> rowAndQValue in qValueList)
            {
                rowAndQValues.Add(new KeyValuePair<Dictionary<string, string>, double>(rowAndQValue.Key, rowAndQValue.Value));

                if (capturePValFromRandomizations)
                {
                    double thisRow = double.Parse(rowAndQValue.Key["rowIndex"]);
                    pValFromRandomizationsByRow[rowAndQValue.Key] = rowToPvalFromRandomizations[thisRow];
                }
            }
        }

        // Order by q-value; rows with equal q-values are ordered by AccessPValueFromPhylotreeRow.
        rowAndQValues.Sort((row1, row2) =>
            row1.Value == row2.Value
                ? AccessPValueFromPhylotreeRow(row1.Key).CompareTo(AccessPValueFromPhylotreeRow(row2.Key))
                : row1.Value.CompareTo(row2.Value));

        //!!!this code is repeated elsewhere
        if (COL_TO_TABULATE.TESTSTATISTIC == _columnToTabulate)
        {
            Helper.CheckCondition(!useStoreyTibsharaniMethod, "the way its set up now, cannot use TestStatistic column with useStoreyTibshirani");
            textWriter.WriteLine(Helper.CreateTabString(headerSoFar, "pValFromRandomizations", "qValue"));
        }
        else
        {
            textWriter.WriteLine(Helper.CreateTabString(headerSoFar, "qValue"));
        }

        foreach (KeyValuePair<Dictionary<string, string>, double> rowAndQValue in rowAndQValues)
        {
            // The "" key holds the text that is echoed as the leading part of the output line.
            if (COL_TO_TABULATE.TESTSTATISTIC == _columnToTabulate)
            {
                double thisPvalFromRandomization = pValFromRandomizationsByRow[rowAndQValue.Key];
                textWriter.WriteLine(Helper.CreateTabString(rowAndQValue.Key[""], thisPvalFromRandomization, rowAndQValue.Value));
            }
            else
            {
                textWriter.WriteLine(Helper.CreateTabString(rowAndQValue.Key[""], rowAndQValue.Value));
            }
        }
    }

    return true;
}
/// <summary>
/// Similar to the other tabulators, but can work with multiple sets of pValues files. Reads the rows selected
/// by every input file pattern, computes a q-value for each real row, and writes the header followed by one
/// line per real row to <paramref name="outputFileName"/>.
/// </summary>
/// <param name="inputFilePatternCollection">
/// The "broad" input patterns; at least one is required. Each one is split on '+' into "narrow" patterns that
/// are tabulated one at a time. The narrow parts of a single broad pattern must not cover the same null index
/// twice, and every broad pattern must cover the same set of null indexes as the first one.
/// </param>
/// <param name="outputFileName">File that is created up front and receives the report.</param>
/// <param name="keepTest">Forwarded to <c>CreateTabulateReportInternal</c>.</param>
/// <param name="maxPValue">Forwarded to <c>CreateTabulateReportInternal</c>.</param>
/// <param name="auditRowIndexValues">
/// Forwarded to <c>CreateTabulateReportInternal</c>. When true, every broad pattern must also have covered index -1.
/// </param>
//!!!would be better if could cut off really bad pValues to save memory
//!!! also would be nice to have filters
public static void CreateTabulateReport(ICollection<string> inputFilePatternCollection, string outputFileName,
    KeepTest<Dictionary<string, string>> keepTest, double maxPValue, bool auditRowIndexValues)
{
    using (TextWriter textWriter = File.CreateText(outputFileName)) // Do this early so that if it fails, we'll know
    {
        List<Dictionary<string, string>> realRowCollectionToSort = new List<Dictionary<string, string>>();
        List<double> nullValueCollectionToBeSorted = new List<double>();

        string headerSoFar = null;
        Set<int> broadRealAndNullIndexSetSoFar = null;
        foreach (string broadInputFilePattern in inputFilePatternCollection)
        {
            Set<int> narrowRealAndNullIndexSetSetSoFar = Set<int>.GetInstance();
            foreach (string narrowInputFilePattern in broadInputFilePattern.Split('+'))
            {
                Set<int> realAndNullIndexSet = CreateTabulateReportInternal(narrowInputFilePattern, keepTest, maxPValue, auditRowIndexValues,
                    ref realRowCollectionToSort, ref nullValueCollectionToBeSorted, ref headerSoFar);

                //Instead of throwing an error, we could filter out the duplicated null indexes
                SpecialFunctions.CheckCondition(narrowRealAndNullIndexSetSetSoFar.IntersectionIsEmpty(realAndNullIndexSet),
                    string.Format("Within inputFilePattern {0}, multiple '+'-connected parts cover the same nullIndex(s), {1}",
                        broadInputFilePattern, narrowRealAndNullIndexSetSetSoFar.Intersection(realAndNullIndexSet)));
                narrowRealAndNullIndexSetSetSoFar.AddNewRange(realAndNullIndexSet);
            }

            SpecialFunctions.CheckCondition(!auditRowIndexValues || narrowRealAndNullIndexSetSetSoFar.Contains(-1),
                string.Format("The 'null' index -1 for the real data was not seen in {0}", broadInputFilePattern));

            if (broadRealAndNullIndexSetSoFar == null)
            {
                // The first broad pattern defines the index set that all later ones must match.
                broadRealAndNullIndexSetSoFar = narrowRealAndNullIndexSetSetSoFar;
            }
            else
            {
                SpecialFunctions.CheckCondition(broadRealAndNullIndexSetSoFar.Equals(narrowRealAndNullIndexSetSetSoFar),
                    string.Format("The broad inputFilePattern {0} covers a different set of nullIndexes ({1}) than its predecessors ({2})",
                        broadInputFilePattern, narrowRealAndNullIndexSetSetSoFar, broadRealAndNullIndexSetSoFar));
            }
        }

        // The set is still null only when no input pattern was supplied. Fail with an explicit message
        // instead of a NullReferenceException on the next line.
        SpecialFunctions.CheckCondition(broadRealAndNullIndexSetSoFar != null,
            "No input file patterns were given, so the number of randomization runs cannot be determined.");

        double numberOfRandomizationRuns = broadRealAndNullIndexSetSoFar.Count - 1;
        Console.WriteLine("Detected {0} randomized runs relative to the number of real runs.", numberOfRandomizationRuns);

        Dictionary<Dictionary<string, string>, double> qValueList = SpecialFunctions.ComputeQValues(ref realRowCollectionToSort,
            AccessPValueFromPhylotreeRow, ref nullValueCollectionToBeSorted, numberOfRandomizationRuns);

        //!!!this code is repeated elsewhere
        textWriter.WriteLine(SpecialFunctions.CreateTabString(headerSoFar, "qValue"));
        foreach (Dictionary<string, string> row in realRowCollectionToSort)
        {
            // The "" key holds the text that is echoed as the leading part of the output line.
            double qValue = qValueList[row];
            textWriter.WriteLine(SpecialFunctions.CreateTabString(row[""], qValue));
        }
    }
}