static Dictionary<string, int> highRepSeqs;     // long seqs + counts for high depth reads

/// <summary>
/// Entry point for FilterReadsByDepth. Parses the command line, expands the reads file
/// names/patterns, loads the k-mer table from the .cbt file, then filters the reads files
/// two at a time (so paired files stay in step) using <c>noThreads</c> worker threads per
/// pair. A depth histogram merged from all worker threads is written to the stats file.
/// </summary>
/// <param name="args">
/// Options (-min, -max, -reduce, -histoOnly, -stats, -f, -t, -o) followed by the .cbt file
/// name and one or more reads file names or patterns.
/// </param>
static void Main(string[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("FilterReadsByDepth -min minDepth -max maxDepth [-reduce maxCopies] [-histoOnly] [-stats statsFN] [-f format] [-t #threads] cbtFN fileNames");
        return;
    }

    List<string> FNParams = new List<string>();    // the .cbt name and the set of file names or patterns
    int noThreads = 2;

    // find out who we are so we can track what program & args produced the result files
    Process myProcess = Process.GetCurrentProcess();
    myProcessNameAndArgs = myProcess.ProcessName;
    foreach (string a in args)
    {
        myProcessNameAndArgs = myProcessNameAndArgs + " " + a;
    }

    for (int p = 0; p < args.Length; p++)
    {
        // an empty argument can be neither an option nor a usable file name
        if (args[p].Length == 0)
        {
            continue;
        }

        if (args[p][0] == '-')
        {
            // options are matched case-insensitively; the invariant culture keeps this
            // independent of the user's locale
            string option = args[p].ToLowerInvariant();
            int parsedNumber;

            if (option == "-s" || option == "-stats")
            {
                if (!CheckForParamValue(p, args.Length, "stats file name string expected after -s|-stats"))
                {
                    return;
                }
                statsFN = args[p + 1];
                p++;
                continue;
            }

            if (option == "-min")
            {
                if (!CheckForParamValue(p, args.Length, "minDepth number expected after -min"))
                {
                    return;
                }
                if (!int.TryParse(args[p + 1], out parsedNumber))
                {
                    Console.WriteLine("expected a number for the -min parameter: " + args[p + 1]);
                    return;
                }
                minDepth = parsedNumber;
                p++;
                continue;
            }

            if (option == "-max")
            {
                if (!CheckForParamValue(p, args.Length, "maxDepth number expected after -max"))
                {
                    return;
                }
                if (!int.TryParse(args[p + 1], out parsedNumber))
                {
                    Console.WriteLine("expected a number for the -max parameter: " + args[p + 1]);
                    return;
                }
                maxDepth = parsedNumber;
                p++;
                continue;
            }

            if (option == "-r" || option == "-reduce")
            {
                if (!CheckForParamValue(p, args.Length, "reduced depth number expected after -reduce"))
                {
                    return;
                }
                if (!int.TryParse(args[p + 1], out parsedNumber))
                {
                    Console.WriteLine("expected a number for the -reduce parameter: " + args[p + 1]);
                    return;
                }
                reducedDepth = parsedNumber;
                reducingReads = true;
                p++;
                continue;
            }

            if (option == "-histoonly" || option == "-ho")
            {
                histoOnly = true;
                continue;
            }

            if (option == "-t" || option == "-threads")
            {
                if (!CheckForParamValue(p, args.Length, "number expected after -t|-threads"))
                {
                    return;
                }
                if (!int.TryParse(args[p + 1], out parsedNumber))
                {
                    Console.WriteLine("expected a number for the -t|-threads parameter: " + args[p + 1]);
                    return;
                }
                // the thread count sizes the worker arrays below, so it must be positive
                if (parsedNumber < 1)
                {
                    Console.WriteLine("number of threads must be at least 1: " + args[p + 1]);
                    return;
                }
                noThreads = parsedNumber;
                p++;
                continue;
            }

            if (option == "-f" || option == "-format")
            {
                if (!CheckForParamValue(p, args.Length, "reads format expected after -f|-format"))
                {
                    return;
                }
                switch (args[p + 1].ToLowerInvariant())
                {
                    case "fna":
                    case "fasta":
                    case "fa":
                        readsFormat = MerStrings.formatFNA;
                        break;
                    case "fastq":
                    case "fq":
                        readsFormat = MerStrings.formatFASTQ;
                        break;
                    default:
                        Console.WriteLine("reads format must be fasta or fastq: " + args[p + 1]);
                        return;
                }
                p++;
                continue;
            }

            if (option == "-o" || option == "-output")
            {
                if (!CheckForParamValue(p, args.Length, "directory name expected after -o|-output"))
                {
                    return;
                }
                outputDir = args[p + 1];
                p++;
                continue;
            }

            // anything else starting with '-' is a mistyped option, not a file name
            Console.WriteLine("unrecognised option: " + option);
            return;
        }

        FNParams.Add(args[p]);
    }

    if (FNParams.Count < 2)
    {
        Console.WriteLine("expected a cbt file name and at least one reads file name or pattern");
        return;
    }

    // validate the output directory & set the output prefix string
    string fnSeparator = Path.DirectorySeparatorChar.ToString();    // \ for Windows; / for Unix/Linux
    if (outputDir != null)
    {
        try
        {
            // add a trailing separator if the output directory name doesn't already have one
            if (!outputDir.EndsWith(fnSeparator, StringComparison.Ordinal))
            {
                outputDir += fnSeparator;
            }
            // prove the directory is writable by creating and deleting a uniquely named file
            string testOutputFN = outputDir + "43EDD23F-5F68-47f0-B7B9-66AE9EE3BF0B.txt";
            StreamWriter testTemp = new StreamWriter(testOutputFN);
            testTemp.Close();
            File.Delete(testOutputFN);
        }
        catch
        {
            // report the directory that was actually tested (not a fixed argument position)
            Console.WriteLine("Output directory: " + outputDir + " was invalid");
            return;
        }
    }

    // take the cbt file name from the start of the non-option list
    // (at least one reads name is guaranteed to remain because Count was >= 2)
    string cbtFN = FNParams[0];
    FNParams.RemoveAt(0);

    if (!File.Exists(cbtFN))
    {
        Console.WriteLine("k-mer consensus (.cbt) file not found: " + cbtFN);
        return;
    }

    // split each reads name/pattern into its directory and file-name parts
    List<string> readsFileNames = new List<string>(FNParams.Count);
    List<string> readsFilePaths = new List<string>(FNParams.Count);
    foreach (string readsFNP in FNParams)
    {
        string readsFileName;
        string readsFilePath;
        GetPathFN(readsFNP, out readsFilePath, out readsFileName);
        readsFilePaths.Add(readsFilePath);
        readsFileNames.Add(readsFileName);
    }

    // expand the patterns, dropping duplicates as we go
    // (seems to be a bug on the Cherax SGI HPC system and it returns each file name twice)
    HashSet<string> seenReadsFNs = new HashSet<string>();
    List<string> distinctReadsFNs = new List<string>();
    for (int f = 0; f < FNParams.Count; f++)
    {
        string[] matchedReadsFNs = Directory.GetFiles(readsFilePaths[f], readsFileNames[f], SearchOption.TopDirectoryOnly);
        foreach (string matchedReadsFN in matchedReadsFNs)
        {
            if (seenReadsFNs.Add(matchedReadsFN))
            {
                distinctReadsFNs.Add(matchedReadsFN);
            }
        }
    }

    // finally... the set of fully qualified, distinct reads files
    string[] readsFNs = distinctReadsFNs.ToArray();
    Array.Sort(readsFNs);

    int noOfReadsFiles = readsFNs.Length;
    if (noOfReadsFiles == 0)
    {
        Console.WriteLine("No matching reads files found");
        return;
    }

    // sniff the format from the first character of the first file; an empty file leaves
    // readsFormat as it was and is reported by the per-file check further down
    string firstLine;
    using (StreamReader formatTester = new StreamReader(readsFNs[0]))
    {
        firstLine = formatTester.ReadLine();
    }
    if (!string.IsNullOrEmpty(firstLine))
    {
        if (firstLine[0] == '>')
        {
            readsFormat = MerStrings.formatFNA;
        }
        if (firstLine[0] == '@')
        {
            readsFormat = MerStrings.formatFASTQ;
        }
    }

    if (statsFN == null)
    {
        // construct a stats file name from the first reads name/pattern (minus any suffix)
        string firstReadsName = readsFileNames[0];
        int suffixIdx = firstReadsName.LastIndexOf('.');
        statsFN = suffixIdx >= 0 ? firstReadsName.Substring(0, suffixIdx) : firstReadsName;
        statsFN = statsFN.Replace('?', '_');
        statsFN = statsFN.Replace('*', '_');
        statsFN = statsFN.Replace('/', '_');
        statsFN = statsFN.Replace('\\', '_');
        statsFN = statsFN.Replace("__", "_");
        statsFN = statsFN.Replace("__", "_");
        statsFN = statsFN + "_fstats.txt";
        statsFN = statsFN.Replace("__", "_");
    }

    // the min load depth is currently just the min depth
    int minLoadDepth = minDepth;

    long loadedUniqueMers = 0;
    long loadedTotalMers = 0;

    // load the .cbt file into a merTable (either a hash table (small) or a sorted array (large))
    MerStrings.LoadCBTFile(cbtFN, minLoadDepth, 0, 0, minDepth,
                           out uniqueMers, out merSize, out averageDepth, out loadedUniqueMers, out loadedTotalMers);

    if (merSize < 1 || merSize > 32)
    {
        Console.WriteLine("bad k-mer size found at start of .cbt file");
        return;
    }

    MerStrings.Initialise(merSize);

    // seed the high-rep table with the four 40-base homopolymers
    highRepSeqs = new Dictionary<string, int>(10000000);
    highRepSeqs.Add(new string('A', 40), 0);
    highRepSeqs.Add(new string('C', 40), 0);
    highRepSeqs.Add(new string('G', 40), 0);
    highRepSeqs.Add(new string('T', 40), 0);

    // resolve the FASTQ qual ambiguity by reading through quals until one is encountered
    // that can only come from either of the alternative sets
    if (readsFormat == MerStrings.formatFASTQ)
    {
        qualBase = MerStrings.ResolveFastqQualAmbiguity(readsFNs[0], out fullQualHeaders);
    }

    // and check whether we've got Unix data so we can write out the corrected files in the same format
    string lfConvention = MerStrings.LFConvention(readsFNs[0]);

    // start the monitor/synchronising thread
    Thread monitorProgress = new Thread(RateReporter);
    monitorProgress.Priority = ThreadPriority.AboveNormal;
    monitorProgress.Start();

    readsFiles = new StreamReader[2];
    filteredReads = new StreamWriter[2];
    Dictionary<int, int> readDepths = new Dictionary<int, int>(1000);

    // filter a pair of files at a time (allowing us to filter many files in a single run while keeping pairedness)
    for (int f = 0; f < noOfReadsFiles; f += 2)
    {
        bool formatsAsExpected = true;

        // for each file in the pair
        for (int p = 0; p < 2; p++)
        {
            // an odd number of files leaves the second slot of the last pair empty
            if (f + p >= noOfReadsFiles)
            {
                readsFiles[p] = null;
                filteredReads[p] = null;
                continue;
            }

            string fullReadsFN = readsFNs[f + p];
            string readsPath;
            string readsFN;
            GetPathFN(fullReadsFN, out readsPath, out readsFN);

            // a name without a '.' has an empty suffix
            int dotIdx = readsFN.LastIndexOf('.');
            string fileSuffix = dotIdx >= 0 ? readsFN.Substring(dotIdx) : "";
            string fileWithoutSuffix = dotIdx >= 0 ? readsFN.Substring(0, dotIdx) : readsFN;

            readsFiles[p] = new StreamReader(fullReadsFN, Encoding.ASCII, false, 1000000);
            Console.WriteLine("filtering " + readsFN);

            // check that the file appears to be in the expected format
            char firstChar = (char)readsFiles[p].Peek();
            if (readsFormat == MerStrings.formatFASTQ && firstChar != '@')
            {
                Console.WriteLine(readsFN + " does not appear to be in FASTQ format");
                formatsAsExpected = false;
                break;
            }
            if (readsFormat == MerStrings.formatFNA && firstChar != '>')
            {
                Console.WriteLine(readsFN + " does not appear to be in FASTA format");
                formatsAsExpected = false;
                break;
            }

            string outputPath = outputDir == null ? readsPath + fnSeparator : outputDir;
            if (!histoOnly)
            {
                string maxDepthString = maxDepth == int.MaxValue ? "max" : maxDepth.ToString();
                filteredReads[p] = new StreamWriter(outputPath + fileWithoutSuffix + "_" + minDepth + "_" + maxDepthString + fileSuffix,
                                                    false, readsFiles[p].CurrentEncoding, 1000000);
                filteredReads[p].NewLine = lfConvention;
            }
        }

        if (!formatsAsExpected)
        {
            // give up, but release the files and stop the monitor thread first so that it
            // is not left running after Main returns
            foreach (StreamReader openReader in readsFiles)
            {
                if (openReader != null)
                {
                    openReader.Close();
                }
            }
            foreach (StreamWriter openWriter in filteredReads)
            {
                if (openWriter != null)
                {
                    openWriter.Close();
                }
            }
            stopMonitor = true;
            monitorProgress.Join();
            return;
        }

        filterThreadParams[] filterParams = new filterThreadParams[noThreads];
        Thread[] filteringThreads = new Thread[noThreads];

        // ready and start a new thread for each parallel filter worker
        for (int b = 0; b < noThreads; b++)
        {
            filterParams[b] = new filterThreadParams();
            filterParams[b].threadNumber = b + 1;
            filterParams[b].readsFiles = readsFiles;
            filterParams[b].filteredReads = filteredReads;
            filteringThreads[b] = new Thread(new ParameterizedThreadStart(Program.FilteringThread));
            filteringThreads[b].Priority = ThreadPriority.BelowNormal;
            filteringThreads[b].Name = b.ToString();
            filteringThreads[b].Start(filterParams[b]);
        }

        // and wait for all threads to finish
        for (int b = 0; b < noThreads; b++)
        {
            filteringThreads[b].Join();
            filteringThreads[b] = null;
        }

        foreach (StreamWriter r in filteredReads)
        {
            if (r != null)
            {
                r.Close();
            }
        }

        // release the input files as well; Close() is harmless on an already-closed reader
        foreach (StreamReader finishedReader in readsFiles)
        {
            if (finishedReader != null)
            {
                finishedReader.Close();
            }
        }

        // merge the per-thread histograms
        for (int b = 0; b < noThreads; b++)
        {
            Dictionary<int, int> threadReadDepths = filterParams[b].depthHisto;
            foreach (KeyValuePair<int, int> kvp in threadReadDepths)
            {
                int countSoFar;
                readDepths.TryGetValue(kvp.Key, out countSoFar);    // leaves 0 when the depth is new
                readDepths[kvp.Key] = countSoFar + kvp.Value;
            }
        }
    } // for a pair of files

    // write the histogram (sorted by depth) and the summary line to the stats file
    int[] depths = readDepths.Keys.ToArray<int>();
    int[] counts = readDepths.Values.ToArray<int>();
    Array.Sort<int, int>(depths, counts);

    using (StreamWriter histo = new StreamWriter(statsFN))
    {
        histo.WriteLine(myProcessNameAndArgs);
        histo.WriteLine();
        histo.WriteLine("depth\tcount");

        for (int i = 0; i < depths.Length; i++)
        {
            histo.WriteLine(depths[i] + "\t" + counts[i]);
        }

        Console.WriteLine("discarded " + reducedReads + "/" + discardedReads + " of " + totalReads + " reads");
        histo.WriteLine("discarded " + reducedReads + "/" + discardedReads + " of " + totalReads + " reads");
    }

    // NOTE(review): stopMonitor is presumably polled by RateReporter - confirm
    stopMonitor = true;
    monitorProgress.Join();
}
/// <summary>
/// Entry point for GenerateMerPairs. Parses the command line, expands the reads file
/// names/patterns, loads the k-mer table from the .cbt file, runs <c>noThreads</c> pair
/// workers over each reads file in turn, then sorts, merges and writes the resulting
/// pairs to a .prs file named after the .cbt file.
/// </summary>
/// <param name="args">
/// Options (-m|-min, -t|-threads) followed by the .cbt file name and one or more reads
/// file names or patterns.
/// </param>
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.WriteLine("usage: GenerateMerPairs [-m min] [-t threads] cbtFN readsPattern or file names (" + version + ")");
        return;
    }

    List<string> FNParams = new List<string>();    // the .cbt name and the set of file names or patterns
    int noThreads = 1;                              // no. of worker threads to run in parallel (1 thread is default)
    int minLoadReps = 3;                            // min rep count needed before mer will be loaded into uniqueMers table or saved as a pair

    for (int p = 0; p < args.Length; p++)
    {
        // an empty argument can be neither an option nor a usable file name
        if (args[p].Length == 0)
        {
            continue;
        }

        if (args[p][0] == '-')
        {
            // options are matched case-insensitively, independent of the user's locale
            string option = args[p].ToLowerInvariant();
            int parsedNumber;

            if (option == "-m" || option == "-min")
            {
                if (!CheckForParamValue(p, args.Length, "minReps number expected after -m|-min"))
                {
                    return;
                }
                if (!int.TryParse(args[p + 1], out parsedNumber))
                {
                    Console.WriteLine("expected a number for the -m|-min parameter: " + args[p + 1]);
                    return;
                }
                minLoadReps = parsedNumber;
                p++;
                continue;
            }

            if (option == "-t" || option == "-threads")
            {
                if (!CheckForParamValue(p, args.Length, "number expected after -t|-threads"))
                {
                    return;
                }
                if (!int.TryParse(args[p + 1], out parsedNumber))
                {
                    Console.WriteLine("expected a number for the -t|-threads parameter: " + args[p + 1]);
                    return;
                }
                // the thread count sizes the wait-handle array below, so it must be positive
                if (parsedNumber < 1)
                {
                    Console.WriteLine("number of threads must be at least 1: " + args[p + 1]);
                    return;
                }
                noThreads = parsedNumber;
                p++;
                continue;
            }

            Console.WriteLine("unrecognised option: " + option);
            Console.WriteLine("usage: generateMerPairs [-m min] [-t threads] cbtFN readsPattern or file names (" + version + ")");
            return;
        }

        FNParams.Add(args[p]);
    }

    if (FNParams.Count < 2)
    {
        Console.WriteLine("expected a cbt file name and at least one reads file name or pattern");
        return;
    }

    // take the cbt file name from the start of the non-option list
    // (at least one reads name is guaranteed to remain because Count was >= 2)
    string cbtFN = FNParams[0];
    FNParams.RemoveAt(0);

    // the pairs file is named after the .cbt file; if the name contains no ".cbt" the
    // Replace is a no-op, and writing to that name would overwrite the .cbt file itself
    string pairsFN = cbtFN.Replace(".cbt", ".prs");
    if (pairsFN == cbtFN)
    {
        pairsFN = cbtFN + ".prs";
    }

    // split each reads name/pattern into its directory and file-name parts
    List<string> readsFileNames = new List<string>(FNParams.Count);
    List<string> readsFilePaths = new List<string>(FNParams.Count);
    foreach (string readsFNP in FNParams)
    {
        string readsFileName;
        string readsFilePath;
        GetPathFN(readsFNP, out readsFilePath, out readsFileName);
        readsFilePaths.Add(readsFilePath);
        readsFileNames.Add(readsFileName);
    }

    // expand the patterns, dropping duplicates but keeping first-seen order
    // (seems to be a bug on the Cherax SGI HPC system and it returns each file name twice)
    HashSet<string> seenReadsFNs = new HashSet<string>();
    List<string> distinctReadsFNs = new List<string>();
    for (int f = 0; f < FNParams.Count; f++)
    {
        string[] matchedReadsFNs = Directory.GetFiles(readsFilePaths[f], readsFileNames[f], SearchOption.TopDirectoryOnly);
        foreach (string matchedReadsFN in matchedReadsFNs)
        {
            if (seenReadsFNs.Add(matchedReadsFN))
            {
                distinctReadsFNs.Add(matchedReadsFN);
            }
        }
    }

    // finally... the set of fully qualified, distinct reads files
    string[] readsFNs = distinctReadsFNs.ToArray();
    if (readsFNs.Length == 0)
    {
        Console.WriteLine("No matching read files found");
        return;
    }

    // NOTE(review): these readers are not used again in this method; they are kept open in
    // case code outside this method reads the readsFiles field - confirm and remove if not
    int noOfReadsFiles = readsFNs.Length;
    readsFiles = new StreamReader[noOfReadsFiles];
    for (int f = 0; f < noOfReadsFiles; f++)
    {
        readsFiles[f] = new StreamReader(readsFNs[f]);
    }

    // look at the first file to determine the file format and possible read length
    // (the longest of up to the first 20 reads)
    int readLength = 0;
    using (StreamReader testReader = new StreamReader(readsFNs[0]))
    {
        char headerChar = (char)testReader.Peek();
        if (headerChar == '>')
        {
            readsFormat = MerStrings.formatFNA;
        }
        if (headerChar == '@')
        {
            readsFormat = MerStrings.formatFASTQ;
        }

        for (int i = 0; i < 20; i++)
        {
            string nextRead = MerStrings.ReadRead(testReader, readsFormat);
            if (nextRead == null)
            {
                break;
            }
            if (nextRead.Length > readLength)
            {
                readLength = nextRead.Length;
            }
        }
    }

    if (!File.Exists(cbtFN))
    {
        Console.WriteLine(".cbt file not found: " + cbtFN);
        return;
    }

    long loadedUniqueMers = 0;
    long loadedTotalMers = 0;

    // load the .cbt file into a merTable (either a hash table (small) or a sorted array (large))
    MerStrings.LoadCBTFile(cbtFN, minLoadReps, 0, 0, minLoadReps,
                           out uniqueMers, out merSize, out averageDepth, out loadedUniqueMers, out loadedTotalMers);

    if (merSize < merStubSize)
    {
        Console.WriteLine("mers in .cbt file are shorter than merStub size: " + merSize + " < " + merStubSize);
        return;
    }

    // have to able to fit at least two full mers into the read (no overlaps);
    // this can only be tested now because merSize is assigned by LoadCBTFile above
    if (readLength < 2 * merSize)
    {
        Console.WriteLine("reads too short to generate pairs: " + readLength);
        return;
    }

    uniquePairs = new MerCollections.MerTables(loadedUniqueMers, noThreads);

    // calculate a gap size based on the probed read length, clamped to [minGap, maxGap]
    gap = (readLength - endGuard) / 2 - (merStubSize * 2);
    if (gap < minGap)
    {
        gap = minGap;
    }
    if (gap > maxGap)
    {
        gap = maxGap;
    }
    pairStride = merStubSize + gap + merStubSize;

    // start the monitor/synchronising thread
    Thread monitorProgress = new Thread(RateReporter);
    monitorProgress.Priority = ThreadPriority.AboveNormal;
    monitorProgress.Start();

    DateTime pairingStart = DateTime.Now;

    foreach (string readsFN in readsFNs)
    {
        Console.WriteLine("Generating pairs from " + readsFN);
        StreamReader reads = new StreamReader(readsFN, Encoding.ASCII, false, 1000000);
        BufferedReader bufferedReads = new BufferedReader(readsFormat, reads, null);

        // one completion signal per worker
        threadFinished = new EventWaitHandle[noThreads];
        for (int i = 0; i < noThreads; i++)
        {
            threadFinished[i] = new EventWaitHandle(false, EventResetMode.AutoReset);
        }

        // queue the workers; they all pull from the same buffered reader
        for (int t = 0; t < noThreads; t++)
        {
            threadParams workerParam = new threadParams();
            workerParam.threadNo = t;
            workerParam.bufferedReadsFile = bufferedReads;
            ThreadPool.QueueUserWorkItem(new WaitCallback(PairWorker), workerParam);
        }

        // and wait for them all to finish
        for (int t = 0; t < noThreads; t++)
        {
            threadFinished[t].WaitOne();
        }

        // every worker has signalled, so the file can be released;
        // Close() is harmless if the reader has already been closed
        reads.Close();
    }

    using (BinaryWriter pairsFile = new BinaryWriter(File.Open(pairsFN, FileMode.Create, FileAccess.Write)))
    {
        // the gap is written first, ahead of the pairs
        pairsFile.Write(gap);

        // sort every partition and per-thread overflow table, adding the values Sort() returns
        for (int pi = 0; pi < uniquePairs.noOfPartitions; pi++)
        {
            totalPairsGenerated += uniquePairs.repeatedMers[pi].Sort();
        }
        for (int ti = 0; ti < noThreads; ti++)
        {
            if (uniquePairs.overflowMers[ti] != null)
            {
                totalPairsGenerated += uniquePairs.overflowMers[ti].Sort();
            }
        }

        MergeAndWrite(pairsFile, uniquePairs.repeatedMers, uniquePairs.overflowMers);
    }

    StopMonitorThread(monitorProgress);

    Console.WriteLine("wrote " + totalPairsWritten + " pairs from " + totalReadsRead + " reads in " +
                      (DateTime.Now - pairingStart).TotalSeconds.ToString("#.0") + "s");
}