/// <summary>
/// Body of one filtering worker thread. Repeatedly takes a batch of read sets from the
/// shared reads files (one read per file per set), calculates the depth of each read,
/// decides whether each set should be kept, and (unless only a histogram was requested)
/// writes the kept sets to the corresponding filtered-reads streams.
/// </summary>
/// <param name="threadParams">
/// A <c>filterThreadParams</c> instance passed as <c>object</c>. Its <c>readsFiles</c> and
/// <c>filteredReads</c> arrays are shared between threads and are used as lock objects here.
/// On return its <c>depthHisto</c> member holds this thread's depth histogram.
/// </param>
private static void FilteringThread(object threadParams)
{
    const int keyLength = 40;                                       // length of the substrings used as keys into highRepSeqs

    filterThreadParams theseParams = (filterThreadParams)threadParams;
    StreamReader[] readsFiles = theseParams.readsFiles;             // the (shared) read files to be processed
    StreamWriter[] filteredReads = theseParams.filteredReads;       // corresponding (shared) streams for filtered reads
    int noReadFNs = readsFiles.Length;

    bool[] fileActive = new bool[noReadFNs];                        // have not yet reached EOF on this reads file
    bool[,] readValid = new bool[batchSize, noReadFNs];             // did the last read from this file produce a read?
    int filesStillActive = 0;                                       // how many active reads files are still around

    string[,] readHeaderSet = new string[batchSize, noReadFNs];     // a batch of sets of read headers
    string[,] readSet = new string[batchSize, noReadFNs];           // a batch of sets of reads, possibly one from each file
    string[,] qualHeaderSet = new string[batchSize, noReadFNs];     // a batch of sets of quals headers
    string[,] qualsSet = new string[batchSize, noReadFNs];          // text form of the quals

    int[] merDepths = new int[maxReadLength];                       // scratch buffer handed to CalculateReadDepth
    int[] depths = new int[noReadFNs];                              // depths for each read in the set
    bool[] rightDepth = new bool[noReadFNs];                        // are depths within the requested bounds?
    bool[] keepThisReadSet = new bool[batchSize];                   // every member of the set passed, so keep the lot
    Dictionary<int, int> readDepths = new Dictionary<int, int>(1000);   // depth histogram for this thread

    for (int f = 0; f < noReadFNs; f++)
    {
        if (readsFiles[f] != null)
        {
            fileActive[f] = true;                                   // stays true until EOF
            filesStillActive++;
        }
    }

    // get the next batch of read sets and check their depths
    while (filesStillActive > 0)
    {
        // lock to ensure synchronised reading from all reads files
        lock (readsFiles)
        {
            for (int b = 0; b < batchSize; b++)
            {
                for (int f = 0; f < noReadFNs; f++)
                {
                    if (!fileActive[f])
                    {
                        // already reached EOF on this file
                        readValid[b, f] = false;
                        continue;
                    }

                    readSet[b, f] = MerStrings.ReadRead(readsFiles[f], null, readsFormat, out readHeaderSet[b, f], out qualHeaderSet[b, f], out qualsSet[b, f]);
                    if (readSet[b, f] == null)
                    {
                        // this read failed - now at EOF for the file
                        fileActive[f] = false;
                        readValid[b, f] = false;
                        filesStillActive--;
                    }
                    else
                    {
                        readValid[b, f] = true;
                        Interlocked.Increment(ref totalReads);
                        progressReads++;
                    }
                }
            }
        }

        // now have a batch of read sets (the n'th read from each file, if it exists), so filter each one in turn
        for (int b = 0; b < batchSize; b++)
        {
            keepThisReadSet[b] = true;

            for (int f = 0; f < noReadFNs; f++)
            {
                if (!readValid[b, f])
                {
                    // no read in this slot: the set cannot be kept, and the empty slot is
                    // not a read so it must not be added to the depth histogram
                    depths[f] = 0;
                    rightDepth[f] = false;
                    keepThisReadSet[b] = false;
                    continue;
                }

                string read = readSet[b, f];
                depths[f] = CalculateReadDepth(read, merDepths);

                if (reducingReads && !histoOnly)
                {
                    if (depths[f] < minDepth)
                    {
                        rightDepth[f] = false;                      // reducing, but below the requested min depth
                    }
                    else if (depths[f] < maxDepth)
                    {
                        rightDepth[f] = true;                       // reducing but read between min and max so let it through
                    }
                    else
                    {
                        // at or above the max level, so a candidate for thinning.
                        // Look for the first N-free key in the read that has been seen before.
                        int keyReps = 0;
                        for (int i = 0; i <= read.Length - keyLength; i++)
                        {
                            string readKey = read.Substring(i, keyLength);
                            if (readKey.Contains('N'))
                            {
                                continue;
                            }

                            // highRepSeqs is shared between threads and is added to under this same
                            // lock, so the lookup and increment have to hold it as well
                            lock (highRepSeqs)
                            {
                                if (highRepSeqs.ContainsKey(readKey))
                                {
                                    highRepSeqs[readKey]++;
                                    keyReps = highRepSeqs[readKey];
                                }
                            }

                            if (keyReps > 0)
                            {
                                break;
                            }
                        }

                        // always assign rightDepth here so that it never carries over from the previous read
                        bool seenTooOften = keyReps > reducedDepth;
                        if (seenTooOften)
                        {
                            Interlocked.Increment(ref reducedReads);    // already have enough of these reads
                        }
                        rightDepth[f] = !seenTooOften;

                        if (keyReps == 0)
                        {
                            // didn't find this read already, so remember it for the future
                            // (reads shorter than a key have nothing to remember)
                            if (read.Length >= keyLength)
                            {
                                string firstKey = read.Substring(0, keyLength);
                                if (!firstKey.Contains('N'))
                                {
                                    lock (highRepSeqs)
                                    {
                                        if (!highRepSeqs.ContainsKey(firstKey))
                                        {
                                            highRepSeqs.Add(firstKey, 1);
                                        }
                                    }
                                }
                            }
                            rightDepth[f] = true;                   // and let the read through
                        }
                    }
                }
                else
                {
                    // not reducing, so must be between min and max
                    rightDepth[f] = depths[f] >= minDepth && depths[f] <= maxDepth;
                }

                // keep the read only if all members of the set should be kept (if paired)
                keepThisReadSet[b] = keepThisReadSet[b] & rightDepth[f];

                int depthCount;
                readDepths.TryGetValue(depths[f], out depthCount);  // depthCount is 0 if this depth is new
                readDepths[depths[f]] = depthCount + 1;
            }
        }

        // Write out the batch. This must not depend on filesStillActive: the batch that
        // reaches EOF can still contain valid reads ahead of the EOF, and readValid already
        // excludes every slot that has no read.
        if (!histoOnly)
        {
            for (int b = 0; b < batchSize; b++)
            {
                // lock per read set so the members of a set stay aligned across the output files
                lock (filteredReads)
                {
                    for (int f = 0; f < noReadFNs; f++)
                    {
                        if (!readValid[b, f])
                        {
                            continue;
                        }

                        if (keepThisReadSet[b])
                        {
                            SaveFilteredReadAndQual(filteredReads[f], readHeaderSet[b, f], readSet[b, f], qualsSet[b, f]);
                            progressWantedReads++;
                        }
                        else
                        {
                            Interlocked.Increment(ref discardedReads);
                        }
                    }
                }
            }
        }
    } // end of file reading/filtering loop

    theseParams.depthHisto = readDepths;
}
/// <summary>
/// Program entry point. Parses the command line, expands the reads file names/patterns,
/// loads the mer table from the .cbt file, generates pairs from every reads file using
/// <c>noThreads</c> pool workers per file, and writes the merged pairs to a .prs file
/// named after the .cbt file.
/// </summary>
/// <param name="args">
/// <c>[-m min] [-t threads] cbtFN readsPattern-or-file-names...</c>. Problems with the
/// arguments or input files are reported on the console and the method returns early.
/// </param>
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.WriteLine("usage: GenerateMerPairs [-m min] [-t threads] cbtFN readsPattern or file names (" + version + ")");
        return;
    }

    List<string> FNParams = new List<string>();     // the .cbt name and the set of file names or patterns
    int noThreads = 1;                              // no. of worker threads to run in parallel (1 thread is default)
    int minLoadReps = 3;                            // min rep count needed before mer will be loaded into uniqueMers table or saved as a pair

    for (int p = 0; p < args.Length; p++)
    {
        // an empty argument (e.g. "" from a shell) is neither an option nor a file name
        if (args[p].Length == 0)
        {
            continue;
        }

        if (args[p][0] != '-')
        {
            FNParams.Add(args[p]);
            continue;
        }

        // invariant lower-casing so that option matching does not depend on the current culture
        args[p] = args[p].ToLowerInvariant();

        if (args[p] == "-m" || args[p] == "-min")
        {
            if (!CheckForParamValue(p, args.Length, "minReps number expected after -m|-min"))
            {
                return;
            }
            if (!int.TryParse(args[p + 1], out minLoadReps))
            {
                Console.WriteLine("expected a number for the -m|-min parameter: " + args[p + 1]);
                return;
            }
            p++;
            continue;
        }

        if (args[p] == "-t" || args[p] == "-threads")
        {
            if (!CheckForParamValue(p, args.Length, "number expected after -t|-threads"))
            {
                return;
            }
            if (!int.TryParse(args[p + 1], out noThreads))
            {
                Console.WriteLine("expected a number for the -t|-threads parameter: " + args[p + 1]);
                return;
            }
            if (noThreads < 1)
            {
                // zero workers would silently process nothing; a negative count cannot size the wait-handle array
                Console.WriteLine("number of threads must be at least 1: " + noThreads);
                return;
            }
            p++;
            continue;
        }

        Console.WriteLine("unrecognised option: " + args[p]);
        Console.WriteLine("usage: generateMerPairs [-m min] [-t threads] cbtFN readsPattern or file names (" + version + ")");
        return;
    }

    if (FNParams.Count < 2)
    {
        Console.WriteLine("expected a cbt file name and at least one reads file name or pattern");
        return;
    }

    // take the cbt file name from the start of the non-option list (at least one reads name remains)
    string cbtFN = FNParams[0];
    FNParams.RemoveAt(0);

    // Only the extension is swapped, so directory names containing ".cbt" are left alone and a
    // name without a lower-case ".cbt" suffix can no longer map the output onto the input file.
    string pairsFN = Path.ChangeExtension(cbtFN, ".prs");

    // split each reads name/pattern into a directory and a file name/pattern, then expand it
    List<string> expandedReadsFNs = new List<string>();
    foreach (string readsFNP in FNParams)
    {
        string readsFilePath;
        string readsFileName;
        GetPathFN(readsFNP, out readsFilePath, out readsFileName);
        expandedReadsFNs.AddRange(Directory.GetFiles(readsFilePath, readsFileName, SearchOption.TopDirectoryOnly));
    }

    // make sure there aren't any duplicates in the file list (seems to be a bug on the Cherax SGI HPC system and it returns each file name twice)
    List<string> distinctReadsFNs = new List<string>();
    foreach (string fn in expandedReadsFNs)
    {
        if (!distinctReadsFNs.Contains(fn))
        {
            distinctReadsFNs.Add(fn);
        }
    }

    // finally... the set of fully qualified, distinct reads files
    string[] readsFNs = distinctReadsFNs.ToArray();
    if (readsFNs.Length == 0)
    {
        Console.WriteLine("No matching read files found");
        return;
    }

    // NOTE(review): these readers are stored in the readsFiles field but are not used again
    // in this method - presumably they are consumed elsewhere; confirm before removing.
    int noOfReadsFiles = readsFNs.Length;
    readsFiles = new StreamReader[noOfReadsFiles];
    for (int f = 0; f < noOfReadsFiles; f++)
    {
        readsFiles[f] = new StreamReader(readsFNs[f]);
    }

    // look at the first file to determine the file format and possible read length
    int readLength = 0;
    using (StreamReader testReader = new StreamReader(readsFNs[0]))
    {
        char headerChar = (char)testReader.Peek();
        if (headerChar == '>')
        {
            readsFormat = MerStrings.formatFNA;
        }
        if (headerChar == '@')
        {
            readsFormat = MerStrings.formatFASTQ;
        }

        // the longest of the first 20 reads is taken as the read length
        for (int i = 0; i < 20; i++)
        {
            string nextRead = MerStrings.ReadRead(testReader, readsFormat);
            if (nextRead == null)
            {
                break;
            }
            if (nextRead.Length > readLength)
            {
                readLength = nextRead.Length;
            }
        }
    }

    if (!File.Exists(cbtFN))
    {
        Console.WriteLine(".cbt file not found: " + cbtFN);
        return;
    }

    long loadedUniqueMers = 0;
    long loadedTotalMers = 0;

    // load the .cbt file into a merTable (either a hash table (small) or a sorted array (large))
    MerStrings.LoadCBTFile(cbtFN, minLoadReps, 0, 0, minLoadReps, out uniqueMers, out merSize, out averageDepth, out loadedUniqueMers, out loadedTotalMers);

    if (merSize < merStubSize)
    {
        Console.WriteLine("mers in .cbt file are shorter than merStub size: " + merSize + " < " + merStubSize);
        return;
    }

    // have to be able to fit at least two full mers into the read (no overlaps).
    // merSize is only known once the .cbt file has been loaded, so the test has to come after the load.
    if (readLength < 2 * merSize)
    {
        Console.WriteLine("reads too short to generate pairs: " + readLength);
        return;
    }

    uniquePairs = new MerCollections.MerTables(loadedUniqueMers, noThreads);

    // calculate a gap size based on the sampled read length, clamped to [minGap, maxGap]
    gap = (readLength - endGuard) / 2 - (merStubSize * 2);
    if (gap < minGap)
    {
        gap = minGap;
    }
    if (gap > maxGap)
    {
        gap = maxGap;
    }
    pairStride = merStubSize + gap + merStubSize;

    // start the monitor/synchronising thread
    Thread monitorProgress = new Thread(RateReporter);
    monitorProgress.Priority = ThreadPriority.AboveNormal;
    monitorProgress.Start();

    DateTime pairingStart = DateTime.Now;

    foreach (string readsFN in readsFNs)
    {
        Console.WriteLine("Generating pairs from " + readsFN);

        // the reader is only released after every worker has signalled that it has finished
        using (StreamReader reads = new StreamReader(readsFN, Encoding.ASCII, false, 1000000))
        {
            BufferedReader bufferedReads = new BufferedReader(readsFormat, reads, null);

            threadFinished = new EventWaitHandle[noThreads];
            for (int i = 0; i < noThreads; i++)
            {
                threadFinished[i] = new EventWaitHandle(false, EventResetMode.AutoReset);
            }

            // all workers for this file share the one buffered reader
            for (int t = 0; t < noThreads; t++)
            {
                threadParams workerParam = new threadParams();
                workerParam.threadNo = t;
                workerParam.bufferedReadsFile = bufferedReads;
                ThreadPool.QueueUserWorkItem(new WaitCallback(PairWorker), workerParam);
            }

            // and wait for them all to finish
            for (int t = 0; t < noThreads; t++)
            {
                threadFinished[t].WaitOne();
            }
        }
    }

    // the pairs file starts with the gap, followed by whatever MergeAndWrite emits
    using (BinaryWriter pairsFile = new BinaryWriter(File.Open(pairsFN, FileMode.Create, FileAccess.Write)))
    {
        pairsFile.Write(gap);

        for (int pi = 0; pi < uniquePairs.noOfPartitions; pi++)
        {
            totalPairsGenerated += uniquePairs.repeatedMers[pi].Sort();
        }
        for (int ti = 0; ti < noThreads; ti++)
        {
            if (uniquePairs.overflowMers[ti] != null)
            {
                totalPairsGenerated += uniquePairs.overflowMers[ti].Sort();
            }
        }

        MergeAndWrite(pairsFile, uniquePairs.repeatedMers, uniquePairs.overflowMers);
    }

    StopMonitorThread(monitorProgress);

    Console.WriteLine("wrote " + totalPairsWritten + " pairs from " + totalReadsRead + " reads in " +
                        (DateTime.Now - pairingStart).TotalSeconds.ToString("#.0") + "s");
}