/// <summary>
/// Looks up a packed k-mer in the uniqueMers table using its canonical form
/// (the numerically smaller of the k-mer and its reverse complement).
/// </summary>
/// <param name="mer">Packed k-mer in the orientation it was read.</param>
/// <param name="plusCount">Count for the orientation passed in; 0 if the k-mer is not in the table.</param>
/// <param name="rcCount">Count for the opposite orientation; 0 if the k-mer is not in the table.</param>
/// <returns>true if the canonical k-mer was found in uniqueMers, false otherwise.</returns>
private static bool FindMerInUniqueMers(ulong mer, out int plusCount, out int rcCount)
{
    ulong reverseComplement = MerStrings.ReverseComplement(mer);
    bool lookedUpAsRC = reverseComplement < mer;
    ulong canonicalMer = lookedUpAsRC ? reverseComplement : mer;

    ulong packedCounts;
    bool present = uniqueMers.TryGetValue(canonicalMer, out packedCounts);
    if (!present)
    {
        packedCounts = 0;                       // not in the table, so report zero counts
    }

    // the table value packs two counts: high 32 bits and low 32 bits
    int highCount = (int)(packedCounts >> 32);
    int lowCount = (int)(packedCounts & 0xFFFFFFFF);

    // when the lookup used the reverse complement, the two halves swap roles for the caller
    plusCount = lookedUpAsRC ? lowCount : highCount;
    rcCount = lookedUpAsRC ? highCount : lowCount;

    return present;
}
/// <summary>
/// Tiles a read into overlapping k-mers of length merSize and records the depth of each one.
/// </summary>
/// <param name="read">The read to tile.</param>
/// <param name="merDepths">
/// Receives one depth per k-mer start position. The caller must supply an array with at
/// least (read.Length - merSize + 1) entries.
/// </param>
/// <returns>The number of k-mer positions in the read; 0 if the read is shorter than merSize.</returns>
/// <remarks>
/// A position gets depth 0 when its k-mer could not be packed, is not in the uniqueMers table,
/// or has counts that differ by more than a factor of 100 between strands.
/// Otherwise the depth is the sum of the two strand counts.
/// </remarks>
private static int GetMerDepths(string read, int[] merDepths)
{
    int readLength = read.Length;

    // read too short to tile for mers
    if (readLength < merSize)
    {
        return 0;
    }

    int mersInRead = readLength - merSize + 1;
    bool merIsValid = false;
    ulong lastMer = 0;

    for (int i = 0; i < mersInRead; i++)
    {
        // extend the previous packed mer by one base when possible; otherwise repack from scratch
        if (merIsValid)
        {
            merIsValid = MerStrings.CondenseMerIncremental(merSize, lastMer, read, i, out lastMer);
        }
        else
        {
            merIsValid = MerStrings.CondenseMer(read.Substring(i, merSize), out lastMer);
        }

        int depth = 0;
        if (merIsValid)
        {
            int plusCount;
            int rcCount;
            if (FindMerInUniqueMers(lastMer, out plusCount, out rcCount))
            {
                // don't count huge unbalanced counts
                // (compare in 64-bit: 100 * count overflows a 32-bit int for very deep k-mers
                // and would flip the result of the test)
                bool hugelyUnbalanced = plusCount > 100L * rcCount || rcCount > 100L * plusCount;
                if (!hugelyUnbalanced)
                {
                    depth = plusCount + rcCount;
                }
            }
        }

        merDepths[i] = depth;
    }

    return mersInRead;
}
/// <summary>
/// Writes one filtered read to the output stream and, for FASTQ data, follows it with the
/// quality header line and the quality string.
/// </summary>
/// <param name="filteredReads">Destination stream for the filtered reads.</param>
/// <param name="header">Header line of the read.</param>
/// <param name="filteredRead">Sequence text of the read.</param>
/// <param name="quals">Quality string; only written when the reads format is FASTQ.</param>
private static void SaveFilteredReadAndQual(StreamWriter filteredReads, string header, string filteredRead, string quals)
{
    MerStrings.WriteRead(filteredReads, header, filteredRead, readsFormat);

    // only FASTQ reads carry a quals section
    if (readsFormat != MerStrings.formatFASTQ)
    {
        return;
    }

    // the quals header either repeats the read header (minus its first character) or is a bare "+"
    string qualHeaderLine = fullQualHeaders ? "+" + header.Substring(1) : "+";
    filteredReads.WriteLine(qualHeaderLine);
    filteredReads.WriteLine(quals);
}
static Dictionary<string, int> highRepSeqs;     // long seqs + counts for high depth reads

/// <summary>
/// Entry point for FilterReadsByDepth. Parses the command line, loads the k-mer table from the
/// .cbt file, then filters the reads files two at a time (so paired files stay in step) using
/// a set of filtering threads. Finally writes a depth histogram to the stats file.
/// </summary>
/// <param name="args">
/// Options (-min, -max, -reduce, -histoOnly, -stats, -f, -t, -o) followed by the .cbt file name
/// and one or more reads file names or patterns.
/// </param>
static void Main(string[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("FilterReadsByDepth -min minDepth -max maxDepth [-reduce maxCopies] [-histoOnly] [-stats statsFN] [-f format] [-t #threads] cbtFN fileNames");
        return;
    }

    List<string> FNParams = new List<string>();     // the .cbt name and the set of file names or patterns
    int noThreads = 2;

    // find out who we are so we can track what program & args produced the result files
    Process myProcess = Process.GetCurrentProcess();
    myProcessNameAndArgs = myProcess.ProcessName;
    foreach (string a in args)
    {
        myProcessNameAndArgs = myProcessNameAndArgs + " " + a;
    }

    for (int p = 0; p < args.Length; p++)
    {
        if (args[p][0] == '-')
        {
            args[p] = args[p].ToLower();

            if (args[p] == "-s" || args[p] == "-stats")
            {
                if (!CheckForParamValue(p, args.Length, "stats file name string expected after -s|-stats"))
                {
                    return;
                }
                statsFN = args[p + 1];
                p++;
                continue;
            }

            if (args[p] == "-min")
            {
                if (!CheckForParamValue(p, args.Length, "minDepth number expected after -min"))
                {
                    return;
                }
                try
                {
                    minDepth = Convert.ToInt32(args[p + 1]);
                }
                catch
                {
                    Console.WriteLine("expected a number for the -min parameter: " + args[p + 1]);
                    return;
                }
                p++;
                continue;
            }

            if (args[p] == "-max")
            {
                if (!CheckForParamValue(p, args.Length, "maxDepth number expected after -max"))
                {
                    return;
                }
                try
                {
                    maxDepth = Convert.ToInt32(args[p + 1]);
                }
                catch
                {
                    Console.WriteLine("expected a number for the -max parameter: " + args[p + 1]);
                    return;
                }
                p++;
                continue;
            }

            if (args[p] == "-r" || args[p] == "-reduce")
            {
                if (!CheckForParamValue(p, args.Length, "reduced depth number expected after -reduce"))
                {
                    return;
                }
                try
                {
                    reducedDepth = Convert.ToInt32(args[p + 1]);
                    reducingReads = true;
                }
                catch
                {
                    Console.WriteLine("expected a number for the -reduce parameter: " + args[p + 1]);
                    return;
                }
                p++;
                continue;
            }

            if (args[p] == "-histoonly" || args[p] == "-ho")
            {
                histoOnly = true;
                continue;
            }

            if (args[p] == "-t" || args[p] == "-threads")
            {
                if (!CheckForParamValue(p, args.Length, "number expected after -t|-threads"))
                {
                    return;
                }
                try
                {
                    noThreads = Convert.ToInt32(args[p + 1]);
                }
                catch
                {
                    Console.WriteLine("expected a number for the -t|-threads parameter: " + args[p + 1]);
                    return;
                }
                p++;
                continue;
            }

            if (args[p] == "-f" || args[p] == "-format")
            {
                if (!CheckForParamValue(p, args.Length, "reads format expected after -f|-format"))
                {
                    return;
                }
                string readsFormatParam = args[p + 1].ToLower();
                switch (readsFormatParam)
                {
                    case "fna":
                    case "fasta":
                    case "fa":
                        readsFormat = MerStrings.formatFNA;
                        break;
                    case "fastq":
                    case "fq":
                        readsFormat = MerStrings.formatFASTQ;
                        break;
                    default:
                        Console.WriteLine("reads format must be fasta or fastq: " + args[p + 1]);
                        return;
                }
                p++;
                continue;
            }

            if (args[p] == "-o" || args[p] == "-output")
            {
                if (!CheckForParamValue(p, args.Length, "directory name expected after -o|-output"))
                {
                    return;
                }
                outputDir = args[p + 1];
                p++;
                continue;
            }
        }

        // anything that was not consumed as an option is treated as a file name or pattern
        FNParams.Add(args[p]);
    }

    if (FNParams.Count < 2)
    {
        Console.WriteLine("expected a cbt file name and at least one reads file name or pattern");
        return;
    }

    // validate the output directory & set the output prefix string
    string fnSeparator = Path.DirectorySeparatorChar.ToString();    // \ for Windows; / for Unix/Linux
    if (outputDir != null)
    {
        try
        {
            // add a trailing \ if the output directory name doesn't already have one
            if (!outputDir.EndsWith(fnSeparator))
            {
                outputDir += fnSeparator;
            }
            // prove the directory is writable by creating and deleting a scratch file
            string testOutputFN = outputDir + "43EDD23F-5F68-47f0-B7B9-66AE9EE3BF0B.txt";
            StreamWriter testTemp = new StreamWriter(testOutputFN);
            testTemp.Close();
            File.Delete(testOutputFN);
        }
        catch
        {
            // report the directory that failed (the old code indexed a fixed args[6], which
            // could itself throw or name an unrelated argument)
            Console.WriteLine("Output directory: " + outputDir + " was invalid");
            return;
        }
    }

    // take the cbt file name from the start of the non-option list
    string cbtFN = FNParams[0];
    FNParams.RemoveAt(0);

    if (FNParams.Count == 0)
    {
        Console.WriteLine("did not find any reads file names or patterns");
        return;
    }

    if (!File.Exists(cbtFN))
    {
        Console.WriteLine("k-mer consensus (.cbt) file not found: " + cbtFN);
        return;
    }

    // split each name/pattern into its directory and file-name parts
    List<string> readsFileNames = new List<string>(FNParams.Count);
    List<string> readsFilePaths = new List<string>(FNParams.Count);
    foreach (string readsFNP in FNParams)
    {
        string readsFileName;
        string readsFilePath;
        GetPathFN(readsFNP, out readsFilePath, out readsFileName);
        readsFilePaths.Add(readsFilePath);
        readsFileNames.Add(readsFileName);
    }

    // expand any patterns into actual file names
    List<string> expandedReadsFNs = new List<string>();
    for (int f = 0; f < FNParams.Count; f++)
    {
        string[] matchedReadsFNs = Directory.GetFiles(readsFilePaths[f], readsFileNames[f], SearchOption.TopDirectoryOnly);
        foreach (string matchedReadsFN in matchedReadsFNs)
        {
            expandedReadsFNs.Add(matchedReadsFN);
        }
    }

    // make sure there aren't any duplicates in the file list (seems to be a bug on the Cherax SGI HPC system and it returns each file name twice)
    List<string> distinctReadsFNs = new List<string>();
    foreach (string fn in expandedReadsFNs)
    {
        if (!distinctReadsFNs.Contains(fn))
        {
            distinctReadsFNs.Add(fn);
        }
    }

    // finally... the set of fully qualified, distinct reads files
    string[] readsFNs = distinctReadsFNs.ToArray();
    Array.Sort(readsFNs);

    int noOfReadsFiles = distinctReadsFNs.Count;
    if (noOfReadsFiles == 0)
    {
        Console.WriteLine("No matching reads files found");
        return;
    }

    // peek at the first file to work out the reads format
    string firstLine;
    using (StreamReader formatTester = new StreamReader(readsFNs[0]))
    {
        firstLine = formatTester.ReadLine();
    }
    if (string.IsNullOrEmpty(firstLine))
    {
        // an empty file (or blank first line) used to crash on firstLine[0]
        Console.WriteLine("could not determine the reads format, first line is empty: " + readsFNs[0]);
        return;
    }
    if (firstLine[0] == '>')
    {
        readsFormat = MerStrings.formatFNA;
    }
    if (firstLine[0] == '@')
    {
        readsFormat = MerStrings.formatFASTQ;
    }

    if (statsFN == null)
    {
        // construct a stats file name from the first reads name/pattern
        string firstReadsName = readsFileNames[0];
        int statsSuffixIdx = firstReadsName.LastIndexOf('.');
        // names without a '.' are used whole (Substring(0, -1) would throw)
        statsFN = statsSuffixIdx >= 0 ? firstReadsName.Substring(0, statsSuffixIdx) : firstReadsName;
        statsFN = statsFN.Replace('?', '_');
        statsFN = statsFN.Replace('*', '_');
        statsFN = statsFN.Replace('/', '_');
        statsFN = statsFN.Replace('\\', '_');
        statsFN = statsFN.Replace("__", "_");
        statsFN = statsFN.Replace("__", "_");
        statsFN = statsFN + "_fstats.txt";
        statsFN = statsFN.Replace("__", "_");
    }

    // the min load depth is simply the requested min depth - don't need to load all of the singletons and other errors into memory
    int minLoadDepth = minDepth;

    long loadedUniqueMers = 0;
    long loadedTotalMers = 0;

    // load the .cbt file into a merTable (either a hash table (small) or a sorted array (large))
    MerStrings.LoadCBTFile(cbtFN, minLoadDepth, 0, 0, minDepth,
                           out uniqueMers, out merSize, out averageDepth, out loadedUniqueMers, out loadedTotalMers);

    if (merSize < 1 || merSize > 32)
    {
        Console.WriteLine("bad k-mer size found at start of .cbt file");
        return;
    }

    MerStrings.Initialise(merSize);

    // seed the high-repetition table with the four homopolymer keys
    highRepSeqs = new Dictionary<string, int>(10000000);
    highRepSeqs.Add(new string('A', 40), 0);
    highRepSeqs.Add(new string('C', 40), 0);
    highRepSeqs.Add(new string('G', 40), 0);
    highRepSeqs.Add(new string('T', 40), 0);

    // resolve the FASTQ qual ambiguity by reading through quals until one is encountered that can only come from either of the alternative sets
    if (readsFormat == MerStrings.formatFASTQ)
    {
        qualBase = MerStrings.ResolveFastqQualAmbiguity(readsFNs[0], out fullQualHeaders);
    }

    // and check whether we've got Unix data so we can write out the corrected files in the same format
    string lfConvention = MerStrings.LFConvention(readsFNs[0]);

    // start the monitor/synchronising thread
    Thread monitorProgress = new Thread(RateReporter);
    monitorProgress.Priority = ThreadPriority.AboveNormal;
    monitorProgress.Start();

    readsFiles = new StreamReader[2];
    filteredReads = new StreamWriter[2];
    Dictionary<int, int> readDepths = new Dictionary<int, int>(1000);

    // filter a pair of files at a time (allowing us to filter many files in a single run while keeping pairedness)
    for (int f = 0; f < noOfReadsFiles; f += 2)
    {
        // for each file in the pair
        for (int p = 0; p < 2; p++)
        {
            if (f + p < noOfReadsFiles)
            {
                string fullReadsFN = readsFNs[f + p];
                string readsPath;
                string readsFN;
                GetPathFN(fullReadsFN, out readsPath, out readsFN);

                // split off the suffix; a name with no '.' has an empty suffix
                int suffixIdx = readsFN.LastIndexOf('.');
                string fileSuffix = suffixIdx >= 0 ? readsFN.Substring(suffixIdx) : "";
                string fileWithoutSuffix = suffixIdx >= 0 ? readsFN.Substring(0, suffixIdx) : readsFN;

                readsFiles[p] = new StreamReader(fullReadsFN, Encoding.ASCII, false, 1000000);
                Console.WriteLine("filtering " + readsFN);

                // check that the file appears to be in the expected format
                char firstChar = (char)readsFiles[p].Peek();
                string formatProblem = null;
                if (readsFormat == MerStrings.formatFASTQ && firstChar != '@')
                {
                    formatProblem = " does not appear to be in FASTQ format";
                }
                else if (readsFormat == MerStrings.formatFNA && firstChar != '>')
                {
                    formatProblem = " does not appear to be in FASTA format";
                }
                if (formatProblem != null)
                {
                    Console.WriteLine(readsFN + formatProblem);

                    // release any open streams and stop the monitor thread before leaving,
                    // otherwise the still-running monitor keeps the process alive
                    foreach (StreamReader openReads in readsFiles)
                    {
                        if (openReads != null)
                        {
                            openReads.Close();
                        }
                    }
                    foreach (StreamWriter openFiltered in filteredReads)
                    {
                        if (openFiltered != null)
                        {
                            openFiltered.Close();
                        }
                    }
                    stopMonitor = true;
                    monitorProgress.Join();
                    return;
                }

                string outputPath = outputDir == null ? readsPath + fnSeparator : outputDir;
                if (!histoOnly)
                {
                    string maxDepthString = maxDepth.ToString();
                    if (maxDepth == int.MaxValue)
                    {
                        maxDepthString = "max";
                    }
                    filteredReads[p] = new StreamWriter(outputPath + fileWithoutSuffix + "_" + minDepth + "_" + maxDepthString + fileSuffix,
                                                        false, readsFiles[p].CurrentEncoding, 1000000);
                    filteredReads[p].NewLine = lfConvention;
                }
            }
            else
            {
                // odd number of files: the second slot of the last 'pair' is unused
                readsFiles[p] = null;
                filteredReads[p] = null;
            }
        }

        filterThreadParams[] filterParams = new filterThreadParams[noThreads];
        Thread[] filteringThreads = new Thread[noThreads];

        // ready a new thread for each parallel filter
        for (int b = 0; b < noThreads; b++)
        {
            filterParams[b] = new filterThreadParams();
            filterParams[b].threadNumber = b + 1;
            filterParams[b].readsFiles = readsFiles;
            filterParams[b].filteredReads = filteredReads;
            filteringThreads[b] = new Thread(new ParameterizedThreadStart(Program.FilteringThread));
            filteringThreads[b].Priority = ThreadPriority.BelowNormal;
            filteringThreads[b].Name = b.ToString();
            filteringThreads[b].Start(filterParams[b]);
        }

        // and wait for all threads to finish
        for (int b = 0; b < noThreads; b++)
        {
            filteringThreads[b].Join();
            filteringThreads[b] = null;
        }

        // all threads are done with this pair, so release both the outputs and the inputs
        foreach (StreamWriter finishedWriter in filteredReads)
        {
            if (finishedWriter != null)
            {
                finishedWriter.Close();
            }
        }
        foreach (StreamReader finishedReader in readsFiles)
        {
            if (finishedReader != null)
            {
                finishedReader.Close();
            }
        }

        // merge the per-thread histograms
        for (int b = 0; b < noThreads; b++)
        {
            Dictionary<int, int> threadReadDepths = filterParams[b].depthHisto;
            foreach (KeyValuePair<int, int> kvp in threadReadDepths)
            {
                if (readDepths.ContainsKey(kvp.Key))
                {
                    readDepths[kvp.Key] += kvp.Value;
                }
                else
                {
                    readDepths.Add(kvp.Key, kvp.Value);
                }
            }
        }
    } // for a pair of files

    // write the depth histogram (sorted by depth) and the summary line
    using (StreamWriter histo = new StreamWriter(statsFN))
    {
        histo.WriteLine(myProcessNameAndArgs);
        histo.WriteLine();
        histo.WriteLine("depth\tcount");

        int[] depths = readDepths.Keys.ToArray<int>();
        int[] counts = readDepths.Values.ToArray<int>();
        Array.Sort<int, int>(depths, counts);

        for (int i = 0; i < readDepths.Count; i++)
        {
            histo.WriteLine(depths[i] + "\t" + counts[i]);
        }

        Console.WriteLine("discarded " + reducedReads + "/" + discardedReads + " of " + totalReads + " reads");
        histo.WriteLine("discarded " + reducedReads + "/" + discardedReads + " of " + totalReads + " reads");
    }

    stopMonitor = true;
    monitorProgress.Join();
}
/// <summary>
/// Worker body for one filtering thread. Repeatedly takes a batch of read sets (the n'th read
/// from each open reads file), works out the depth of every read, decides which sets to keep,
/// and writes the kept sets to the matching output files. The per-thread depth histogram is
/// handed back through the depthHisto field of the parameter object.
/// </summary>
/// <param name="threadParams">A filterThreadParams holding the shared reader and writer arrays.</param>
/// <remarks>
/// Reading is serialised with a lock on the shared readers array, and writing with a lock on the
/// shared writers array, so that the files of a pair stay in step.
/// </remarks>
private static void FilteringThread(object threadParams)
{
    filterThreadParams theseParams = (filterThreadParams)threadParams;
    StreamReader[] readsFiles = theseParams.readsFiles;             // the (shared) read files to be processed
    StreamWriter[] filteredReads = theseParams.filteredReads;       // corresponding (shared) streams for filtered reads
    int noReadFNs = readsFiles.Length;

    bool[] fileActive = new bool[noReadFNs];                        // have not yet reached EOF on this reads file
    bool[,] readValid = new bool[batchSize, noReadFNs];             // did the last read from this file produce a read?
    int filesStillActive = 0;                                       // how many active reads files are still around

    string[,] readHeaderSet = new string[batchSize, noReadFNs];     // a batch of sets of read headers
    string[,] readSet = new string[batchSize, noReadFNs];           // a batch of sets of reads, possibly one from each file
    string[,] qualHeaderSet = new string[batchSize, noReadFNs];     // quals header lines (read but not used here)
    string[,] qualsSet = new string[batchSize, noReadFNs];          // text form of the quals

    int[] merDepths = new int[maxReadLength];
    int[] depths = new int[noReadFNs];                              // depths for each read in the set
    bool[] rightDepth = new bool[noReadFNs];                        // are depths within the requested bounds?
    bool[] keepThisReadSet = new bool[batchSize];                   // every read present in the set passed, so keep the lot
    Dictionary<int, int> readDepths = new Dictionary<int, int>(1000);   // depth histogram for this thread

    for (int f = 0; f < noReadFNs; f++)
    {
        if (readsFiles[f] != null)
        {
            fileActive[f] = true;                                   // stays true until EOF
            filesStillActive++;
        }
    }

    // get the next set of reads and check their depths
    while (filesStillActive > 0)
    {
        lock (readsFiles)
        {
            // try getting the next batch of reads
            for (int b = 0; b < batchSize; b++)
            {
                for (int f = 0; f < noReadFNs; f++)
                {
                    if (!fileActive[f])                             // already at EOF (or no file in this slot)
                    {
                        readValid[b, f] = false;
                        continue;
                    }

                    readSet[b, f] = MerStrings.ReadRead(readsFiles[f], null, readsFormat,
                                                        out readHeaderSet[b, f], out qualHeaderSet[b, f], out qualsSet[b, f]);
                    if (readSet[b, f] == null)                      // this read failed - now at EOF for the file
                    {
                        fileActive[f] = false;
                        readValid[b, f] = false;
                        filesStillActive--;
                    }
                    else
                    {
                        readValid[b, f] = true;
                        Interlocked.Increment(ref totalReads);
                        progressReads++;
                    }
                }
            }
        } // lock to ensure synchronised reading from all reads files

        // now have a set of reads (the n'th read from each file, if they exist). So filter each one in turn.
        for (int b = 0; b < batchSize; b++)
        {
            keepThisReadSet[b] = true;

            for (int f = 0; f < noReadFNs; f++)
            {
                // a slot with no file at all (odd number of reads files) takes no part in the decision;
                // previously it vetoed every set, so a lone unpaired file had all its reads discarded
                if (readsFiles[f] == null)
                {
                    continue;
                }

                if (readValid[b, f])
                {
                    depths[f] = CalculateReadDepth(readSet[b, f], merDepths);

                    if (reducingReads && !histoOnly)
                    {
                        if (depths[f] < minDepth)
                        {
                            rightDepth[f] = false;                  // reducing, but below the requested min depth
                        }
                        else if (depths[f] < maxDepth)
                        {
                            rightDepth[f] = true;                   // reducing but read between min and max so let it through
                        }
                        else
                        {
                            rightDepth[f] = KeepHighDepthRead(readSet[b, f]);   // at or above max, so a candidate for thinning
                        }
                    }
                    else
                    {
                        rightDepth[f] = depths[f] >= minDepth && depths[f] <= maxDepth;     // not reducing, so must be between min and max
                    }

                    // only real reads contribute to the histogram
                    int seenAtDepth;
                    readDepths.TryGetValue(depths[f], out seenAtDepth);
                    readDepths[depths[f]] = seenAtDepth + 1;
                }
                else
                {
                    // this file ran out before its partner, so the set is incomplete and is dropped
                    depths[f] = 0;
                    rightDepth[f] = false;
                }

                // keep the read only if all members of the set should be kept (if paired)
                keepThisReadSet[b] = keepThisReadSet[b] & rightDepth[f];
            }
        } // end of checking a batch

        // write out the sets that passed; this must also run for the final batch, where
        // filesStillActive has already dropped to zero but earlier entries are still valid
        if (!histoOnly)
        {
            for (int b = 0; b < batchSize; b++)
            {
                lock (filteredReads)
                {
                    for (int f = 0; f < noReadFNs; f++)
                    {
                        if (!readValid[b, f])
                        {
                            continue;
                        }

                        if (keepThisReadSet[b])
                        {
                            SaveFilteredReadAndQual(filteredReads[f], readHeaderSet[b, f], readSet[b, f], qualsSet[b, f]);
                            progressWantedReads++;
                        }
                        else
                        {
                            Interlocked.Increment(ref discardedReads);
                        }
                    }
                } // writing out a set of filtered reads
            }
        }
    } // end of file reading/filtering loop

    theseParams.depthHisto = readDepths;
}

/// <summary>
/// Decides whether a read whose depth is at or above maxDepth should be kept when thinning.
/// The read is scanned for a 40-base key already in highRepSeqs; the first such key has its
/// count incremented. If no key is found, the read's first 40 bases are registered (count 1).
/// </summary>
/// <param name="read">Sequence text of the high-depth read.</param>
/// <returns>
/// false when the matched key has now been seen more than reducedDepth times (the read is
/// counted in reducedReads); true otherwise.
/// </returns>
private static bool KeepHighDepthRead(string read)
{
    const int keyLength = 40;
    int keyReps = 0;

    // highRepSeqs is shared by all filtering threads and Dictionary is not safe for
    // concurrent read/write, so the whole scan-and-register step runs under the lock
    lock (highRepSeqs)
    {
        // '<=' so that the final window (and a read of exactly keyLength bases) is tested too
        for (int i = 0; i <= read.Length - keyLength; i++)
        {
            string readKey = read.Substring(i, keyLength);

            // ignore them if they contain an N
            if (readKey.Contains('N'))
            {
                continue;
            }

            int seenSoFar;
            if (highRepSeqs.TryGetValue(readKey, out seenSoFar))
            {
                keyReps = seenSoFar + 1;
                highRepSeqs[readKey] = keyReps;
                break;                                              // stop at the first known key
            }
        }

        // didn't find this read already, so remember it for the future
        // (reads shorter than a key cannot be registered)
        if (keyReps == 0 && read.Length >= keyLength)
        {
            string firstKey = read.Substring(0, keyLength);
            if (!firstKey.Contains('N'))
            {
                highRepSeqs.Add(firstKey, 1);
            }
        }
    }

    if (keyReps > reducedDepth)
    {
        // we already have enough of these reads, so mark it to be discarded
        Interlocked.Increment(ref reducedReads);
        return false;
    }

    // either a new key or one still under the limit - the old code left the decision
    // for the 'under the limit' case at whatever the previous read in this slot had set
    return true;
}
/// <summary>
/// Records one occurrence of a k-mer. The k-mer is first reduced to its canonical form (the
/// smaller of the k-mer and its reverse complement). If it is already in a repeats table its
/// count is incremented; otherwise it is inserted into a singletons table, or - when it is
/// already there - promoted from the singletons table into a repeats table.
/// </summary>
/// <param name="mer">Packed k-mer in as-read orientation.</param>
/// <param name="threadNo">
/// Caller's thread number; passed to the repeat-update calls and used to index overflowMers.
/// </param>
/// <remarks>
/// Counts are kept as a pair packed into one 64-bit value: an as-read occurrence adds
/// 0x0000000100000000 (high half) and a reverse-complement occurrence adds 1 (low half).
/// The method is written to be called from several threads at once: table replacement is done
/// under lockSingletons, and promotion of a singleton is arbitrated with a compare-and-exchange
/// on the singleton entry. The order of statements in the promotion path matters.
/// </remarks>
public void AddOrIncrement(ulong mer, int threadNo)
{
    long addingIncrement = 0x0000000100000000;                  // assume the as-read form is the canonical form
    ulong rcFlagToBeSet = 0x0;                                  // and that we don't want to set the RC flag

    // generate canonical k-mer first
    ulong rcMer = MerStrings.ReverseComplement(mer);
    if (rcMer < mer)
    {
        mer = rcMer;
        addingIncrement = 0x0000000000000001;                   // increment the low part of the count pair
        rcFlagToBeSet = singletonRCFlagMask;                    // remember if the canonical k-mer was the RC form
    }

    // repeats partition is chosen from the hash of the canonical mer;
    // singletons partition is chosen from the top singletonPrefixBits bits of the mer
    int absMerHashCode = mer.GetHashCode() & int31Mask;
    int partitionNo = absMerHashCode % noOfPartitions;
    int singletonPartitionNo = singletonPrefixBits == 0 ? 0 : (int)(mer >> (64 - singletonPrefixBits));

    // this mer may have been seen before, so first try updating it in one of the repeated mer tables
    bool updatedRepeat = UpdateRepeatedMer(partitionNo, mer, threadNo, mer, addingIncrement);

    // note that this return happens before activeCount is incremented, so there is nothing to undo
    if (updatedRepeat)
    {
        return;
    }

    // handling a k-mer for the first time - try adding it to the singletons table
    // ----------------------------------------------------------------------------

    // get a stable pointer to the current singletons table (in case someone else fills it and initiates a flush while we're still busy with it)
    MerCollection thisSingletonPartition = singletonFilters[singletonPartitionNo];
    // register as an in-flight user of this table; the flushing thread waits until it is the only one left
    Interlocked.Increment(ref thisSingletonPartition.activeCount);

    // try to add this mer to this partition's singletons collection (and fetch the existing singleton+flag if it's already there)
    int filterIdx;
    ulong fMer = mer | rcFlagToBeSet | singletonActiveFlagMask;
    bool added = thisSingletonPartition.TryInsertKey(fMer, out filterIdx);

    if (added)
    {
        // successfully added this mer so we must be seeing it for the first time

        // if singleton table is already full enough, flush it out and empty the table
        if (thisSingletonPartition.Count >= maxSingletonCapacity[singletonPartitionNo])
        {
            bool flushNeeded = true;
            int flushNumberToUse = 0;

            // lock this section to avoid two threads trying to flush/replace the same singleton buffer concurrently
            lock (lockSingletons)
            {
                // test entry condition now that we have the lock (filter may have been reset while we were waiting)
                if (!thisSingletonPartition.flushed)
                {
                    // allocate a replacement table for the other threads to use while we're flushing this one
                    // (25% larger than the current one, capped at maxSingletonSize)
                    int newSingletonLength = thisSingletonPartition.length + thisSingletonPartition.length / 4;
                    if (newSingletonLength > maxSingletonSize)
                    {
                        newSingletonLength = maxSingletonSize;
                    }
                    MerCollection emptySingletonFilter = new MerCollection(newSingletonLength, singletonMerMask);   // allocate new local filter for the partition

                    singletonFilters[singletonPartitionNo] = emptySingletonFilter;                  // make it visible to the concurrent threads (single point assignment)
                    maxSingletonCapacity[singletonPartitionNo] = newSingletonLength * 8 / 10;       // next flush is triggered at 80% of the new length
                    thisSingletonPartition.flushed = true;
                    flushNumberToUse = flushSingletonNumber[singletonPartitionNo];
                    flushSingletonNumber[singletonPartitionNo]++;
                }
                else
                {
                    // another thread got here first and is already flushing this table
                    flushNeeded = false;
                }
            }

            if (flushNeeded)
            {
                // this thread itself accounts for one of the active users, hence '> 1'
                while (thisSingletonPartition.activeCount > 1)
                {
                    // pause briefly to let any inflight updates to this table to complete
                    Thread.Sleep(100);
                }
                FlushSingletons(thisSingletonPartition, singletonPartitionNo, flushNumberToUse);
            }
            //flushes++;
        }
    }
    else
    {
        // Insert failed, so must be seeing this k-mer for second (or rarely more) time. Mark as inactive in singletons and add to a repeats table with appropriate counts.
        // There can be a race here with two threads trying to concurrently promote the same singleton. This is resolved by atomically clearing the singleton
        // active flag - and only one of the threads will get the 'active' flag returned from the Exchange. This thread does the promotion - and then sets the
        // promotion-complete bit for the singleton. The other threads will spin until they find this bit has been set.

        // trace point 1: the singleton entry as first seen by this thread
        if (tracing)
        {
            lock (traceUpdates)
            {
                traceUpdates.Enqueue(new TraceEntry(threadNo, 1, singletonPartitionNo, filterIdx, (ulong)thisSingletonPartition.entries[filterIdx].key));
                if (traceUpdates.Count > maxTrace)
                {
                    traceUpdates.Dequeue();
                }
            }
        }

        // get the current value of this singleton entry (safe because the promotion changes are progressive)
        ulong merFromFilter = (ulong)thisSingletonPartition.entries[filterIdx].key;

        // and see if this singleton may have already been promoted
        bool activeSingleton = (merFromFilter & singletonActiveFlagMask) != 0;

        // if this singleton may be 'active', try to promote it
        if (activeSingleton)
        {
            ulong inactiveMer = mer & singletonMerMask;                      // build what the inactive-but-being-promoted entry should look like

            // if no-one else has altered the singleton entry, then set it to inactive-but-being-promoted
            long currentMerFromFilter = Interlocked.CompareExchange(ref thisSingletonPartition.entries[filterIdx].key, (long)inactiveMer, (long)merFromFilter);

            // trace point 2: the value returned by the compare-and-exchange
            if (tracing)
            {
                lock (traceUpdates)
                {
                    traceUpdates.Enqueue(new TraceEntry(threadNo, 2, singletonPartitionNo, filterIdx, (ulong)currentMerFromFilter));
                    if (traceUpdates.Count > maxTrace)
                    {
                        traceUpdates.Dequeue();
                    }
                }
            }

            // if this thread successfully set the singleton to 'inactive', it will take care of the promotion
            if (currentMerFromFilter == (long)merFromFilter)
            {
                ulong rcFlag = merFromFilter & singletonRCFlagMask;         // non-zero --> RC found in singletons

                // the promoted entry starts with a count of one for the orientation the singleton was first seen in
                long initialCount = 0;
                if (rcFlag != 0)                                            // singleton was seen in RC form
                {
                    initialCount = 0x0000000000000001;
                }
                else                                                        // singleton was seen in as-is form
                {
                    initialCount = 0x0000000100000000;
                }

                if (repeatedMersFull[partitionNo])
                {
                    // the shared repeats table for this partition is full, so use this thread's own overflow table
                    if (overflowMers[threadNo] == null)
                    {
                        overflowMers[threadNo] = new MerDictionary(repeatedMers[partitionNo].lengthEntries / 10, fullMerMask);
                        //Console.WriteLine("added overflow for thread " + threadNo + " for [" + partitionNo + "]");
                    }

                    bool full = overflowMers[threadNo].Add(mer, initialCount);
                    if (full)
                    {
                        overflowMers[threadNo].Resize();
                    }
                }
                else
                {
                    bool full = repeatedMers[partitionNo].Add(mer, initialCount);
                    if (full)
                    {
                        // later promotions for this partition go to the per-thread overflow tables
                        repeatedMersFull[partitionNo] = true;
                    }
                }

                // now that the mer has been promoted, set the 'promoted' flag
                inactiveMer = inactiveMer | (long)singletonPromotedFlagMask;
                thisSingletonPartition.entries[filterIdx].key = (long)inactiveMer;

                // trace point 3: the singleton entry after the promoted flag has been set
                if (tracing)
                {
                    lock (traceUpdates)
                    {
                        traceUpdates.Enqueue(new TraceEntry(threadNo, 3, singletonPartitionNo, filterIdx, (ulong)thisSingletonPartition.entries[filterIdx].key));
                        if (traceUpdates.Count > maxTrace)
                        {
                            traceUpdates.Dequeue();
                        }
                    }
                }
            }
        }

        // singleton is now known to be no longer active, so wait (if necessary) for the 'promoted' flag to be set and increment the repeat counter
        merFromFilter = (ulong)thisSingletonPartition.entries[filterIdx].key;
        // trace point 4: the singleton entry as seen just before waiting for promotion to complete
        if (tracing)
        {
            lock (traceUpdates)
            {
                traceUpdates.Enqueue(new TraceEntry(threadNo, 4, singletonPartitionNo, filterIdx, merFromFilter));
                if (traceUpdates.Count > maxTrace)
                {
                    traceUpdates.Dequeue();
                }
            }
        }
        bool promotionComplete = (merFromFilter & singletonPromotedFlagMask) != 0;
        bool alreadySlept = false;
        // poll the entry, sleeping 100ms between checks, until the promoting thread sets the flag
        while (!promotionComplete)
        {
            promotionComplete = (((ulong)thisSingletonPartition.entries[filterIdx].key & singletonPromotedFlagMask) != 0);
            if (alreadySlept && !promotionComplete)
            {
                // still waiting after at least one pass: when tracing, dump the trace queue to trace.txt for diagnosis
                if (tracing)
                {
                    lock (traceUpdates)
                    {
                        StreamWriter trace = new StreamWriter("trace.txt");
                        foreach (TraceEntry t in traceUpdates)
                        {
                            trace.WriteLine(t.place + "\t" + t.thread + "\t" + t.partition + "\t" + t.index + "\t" + t.value.ToString("x16"));
                        }
                        trace.Close();
                    }
                    Console.WriteLine("promotion still not complete after sleep");
                }
            }
            if (!promotionComplete)
            {
                Thread.Sleep(100);
            }
            alreadySlept = true;
        }

        // the mer is now in a repeats (or overflow) table, so count this occurrence there
        UpdateRepeatedMerAfterPromotion(partitionNo, mer, threadNo, mer, addingIncrement);
        //if (!updateSucceeded)
        //{
        //    lock (traceUpdates)
        //    {
        //        StreamWriter trace = new StreamWriter("trace.txt");
        //        foreach (TraceEntry t in traceUpdates)
        //            trace.WriteLine(t.thread + "\t" + t.place + "\t" + t.partition + "\t" + t.index + "\t" + t.value.ToString("x16"));
        //        trace.Close();
        //    }
        //    Console.WriteLine("UpdateRepeatedMerRetry failed after waiting for promotion to complete");
        //}
    }

    // finished with the singletons table captured above (which may no longer be the current one)
    Interlocked.Decrement(ref thisSingletonPartition.activeCount);
}
/// <summary>
/// Entry point for GenerateMerPairs. Parses the command line, loads the k-mer table from the
/// .cbt file, generates k-mer pairs from every reads file using a set of worker threads, and
/// writes the merged, sorted pairs to a .prs file named after the .cbt file.
/// </summary>
/// <param name="args">Options (-m|-min, -t|-threads) followed by the .cbt file name and one or more reads file names or patterns.</param>
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.WriteLine("usage: GenerateMerPairs [-m min] [-t threads] cbtFN readsPattern or file names (" + version + ")");
        return;
    }

    List<string> FNParams = new List<string>();     // the .cbt name and the set of file names or patterns
    int noThreads = 1;                              // no. of worker threads to run in parallel (1 thread is default)
    int minLoadReps = 3;                            // min rep count needed before mer will be loaded into uniqueMers table or saved as a pair

    for (int p = 0; p < args.Length; p++)
    {
        if (args[p][0] == '-')
        {
            args[p] = args[p].ToLower();

            if (args[p] == "-m" || args[p] == "-min")
            {
                if (!CheckForParamValue(p, args.Length, "minReps number expected after -m|-min"))
                {
                    return;
                }
                try
                {
                    minLoadReps = Convert.ToInt32(args[p + 1]);
                }
                catch
                {
                    Console.WriteLine("expected a number for the -m|-min parameter: " + args[p + 1]);
                    return;
                }
                p++;
                continue;
            }

            if (args[p] == "-t" || args[p] == "-threads")
            {
                if (!CheckForParamValue(p, args.Length, "number expected after -t|-threads"))
                {
                    return;
                }
                try
                {
                    noThreads = Convert.ToInt32(args[p + 1]);
                }
                catch
                {
                    Console.WriteLine("expected a number for the -t|-threads parameter: " + args[p + 1]);
                    return;
                }
                p++;
                continue;
            }

            Console.WriteLine("unrecognised option: " + args[p]);
            Console.WriteLine("usage: generateMerPairs [-m min] [-t threads] cbtFN readsPattern or file names (" + version + ")");
            return;
        }

        FNParams.Add(args[p]);
    }

    if (FNParams.Count < 2)
    {
        Console.WriteLine("expected a cbt file name and at least one reads file name or pattern");
        return;
    }

    // take the cbt file name from the start of the non-option list
    string cbtFN = FNParams[0];
    FNParams.RemoveAt(0);

    if (FNParams.Count == 0)
    {
        Console.WriteLine("did not find any reads file names or patterns");
        return;
    }

    string pairsFN = cbtFN.Replace(".cbt", ".prs");
    if (pairsFN == cbtFN)
    {
        // the name did not contain ".cbt", so the replace did nothing; without this the
        // pairs file would be created over the top of the input file
        pairsFN = cbtFN + ".prs";
    }

    // split each name/pattern into its directory and file-name parts
    List<string> readsFileNames = new List<string>(FNParams.Count);
    List<string> readsFilePaths = new List<string>(FNParams.Count);
    foreach (string readsFNP in FNParams)
    {
        string readsFileName;
        string readsFilePath;
        GetPathFN(readsFNP, out readsFilePath, out readsFileName);
        readsFilePaths.Add(readsFilePath);
        readsFileNames.Add(readsFileName);
    }

    // expand any patterns into actual file names
    List<string> expandedReadsFNs = new List<string>();
    for (int f = 0; f < FNParams.Count; f++)
    {
        string[] matchedReadsFNs = Directory.GetFiles(readsFilePaths[f], readsFileNames[f], SearchOption.TopDirectoryOnly);
        foreach (string matchedReadsFN in matchedReadsFNs)
        {
            expandedReadsFNs.Add(matchedReadsFN);
        }
    }

    // make sure there aren't any duplicates in the file list (seems to be a bug on the Cherax SGI HPC system and it returns each file name twice)
    List<string> distinctReadsFNs = new List<string>();
    foreach (string fn in expandedReadsFNs)
    {
        if (!distinctReadsFNs.Contains(fn))
        {
            distinctReadsFNs.Add(fn);
        }
    }

    // finally... the set of fully qualified, distinct reads files
    string[] readsFNs = distinctReadsFNs.ToArray();

    if (readsFNs.Length == 0)
    {
        Console.WriteLine("No matching read files found");
        return;
    }

    // NOTE(review): these readers are stored in the readsFiles field and are not closed in this
    // method - confirm whether anything else uses them
    int noOfReadsFiles = readsFNs.Length;
    readsFiles = new StreamReader[noOfReadsFiles];
    for (int f = 0; f < noOfReadsFiles; f++)
    {
        readsFiles[f] = new StreamReader(readsFNs[f]);
    }

    // look at the first file to determine the file format and possible read length
    // (the longest of the first 20 reads)
    int readLength = 0;
    using (StreamReader testReader = new StreamReader(readsFNs[0]))
    {
        char headerChar = (char)testReader.Peek();
        if (headerChar == '>')
        {
            readsFormat = MerStrings.formatFNA;
        }
        if (headerChar == '@')
        {
            readsFormat = MerStrings.formatFASTQ;
        }

        for (int i = 0; i < 20; i++)
        {
            string nextRead = MerStrings.ReadRead(testReader, readsFormat);
            if (nextRead == null)
            {
                break;
            }
            if (nextRead.Length > readLength)
            {
                readLength = nextRead.Length;
            }
        }
    }

    if (!File.Exists(cbtFN))
    {
        Console.WriteLine(".cbt file not found: " + cbtFN);
        return;
    }

    long loadedUniqueMers = 0;
    long loadedTotalMers = 0;

    // load the .cbt file into a merTable (either a hash table (small) or a sorted array (large))
    MerStrings.LoadCBTFile(cbtFN, minLoadReps, 0, 0, minLoadReps,
                           out uniqueMers, out merSize, out averageDepth, out loadedUniqueMers, out loadedTotalMers);

    if (merSize < merStubSize)
    {
        Console.WriteLine("mers in .cbt file are shorter than merStub size: " + merSize + " < " + merStubSize);
        return;
    }

    // have to able to fit at least two full mers into the read (no overlaps);
    // this can only be tested once merSize has been read from the .cbt file
    if (readLength < 2 * merSize)
    {
        Console.WriteLine("reads too short to generate pairs: " + readLength);
        return;
    }

    uniquePairs = new MerCollections.MerTables(loadedUniqueMers, noThreads);

    // calculate a gap size based on the sampled read length, clamped to [minGap, maxGap]
    gap = (readLength - endGuard) / 2 - (merStubSize * 2);
    if (gap < minGap)
    {
        gap = minGap;
    }
    if (gap > maxGap)
    {
        gap = maxGap;
    }

    pairStride = merStubSize + gap + merStubSize;

    // start the monitor/synchronising thread
    Thread monitorProgress = new Thread(RateReporter);
    monitorProgress.Priority = ThreadPriority.AboveNormal;
    monitorProgress.Start();

    DateTime pairingStart = DateTime.Now;

    foreach (string readsFN in readsFNs)
    {
        Console.WriteLine("Generating pairs from " + readsFN);
        StreamReader reads = new StreamReader(readsFN, Encoding.ASCII, false, 1000000);
        BufferedReader bufferedReads = new BufferedReader(readsFormat, reads, null);

        // one completion signal per worker
        threadFinished = new EventWaitHandle[noThreads];
        for (int i = 0; i < noThreads; i++)
        {
            threadFinished[i] = new EventWaitHandle(false, EventResetMode.AutoReset);
        }

        for (int t = 0; t < noThreads; t++)
        {
            threadParams workerParam = new threadParams();
            workerParam.threadNo = t;
            workerParam.bufferedReadsFile = bufferedReads;
            ThreadPool.QueueUserWorkItem(new WaitCallback(PairWorker), workerParam);
        }

        // and wait for them all to finish
        for (int t = 0; t < noThreads; t++)
        {
            threadFinished[t].WaitOne();
        }

        // every worker has finished with this file, so release it before moving to the next one
        reads.Close();
    }

    using (BinaryWriter pairsFile = new BinaryWriter(File.Open(pairsFN, FileMode.Create, FileAccess.Write)))
    {
        pairsFile.Write(gap);

        // sort each table before the merge, accumulating the counts returned by Sort()
        for (int pi = 0; pi < uniquePairs.noOfPartitions; pi++)
        {
            totalPairsGenerated += uniquePairs.repeatedMers[pi].Sort();
        }

        for (int ti = 0; ti < noThreads; ti++)
        {
            if (uniquePairs.overflowMers[ti] != null)
            {
                totalPairsGenerated += uniquePairs.overflowMers[ti].Sort();
            }
        }

        MergeAndWrite(pairsFile, uniquePairs.repeatedMers, uniquePairs.overflowMers);
    }

    StopMonitorThread(monitorProgress);

    Console.WriteLine("wrote " + totalPairsWritten + " pairs from " + totalReadsRead + " reads in " +
                        (DateTime.Now - pairingStart).TotalSeconds.ToString("#.0") + "s");
}
// Worker routine queued once per thread via ThreadPool.QueueUserWorkItem.
// Repeatedly pulls a batch of reads from the shared BufferedReader, tiles each read into mers,
// fetches the per-strand depths for those mers, and adds the canonical form of every pair
// generated from the read to the shared uniquePairs table.
//
// param: a threadParams instance carrying this worker's thread number and the shared reads file.
//
// On completion the per-thread counters are folded into the shared totals and this worker's
// entry in threadFinished is signalled so the caller waiting on it can continue.
static void PairWorker(object param)
{
    threadParams threadParam = (threadParams)param;
    int threadNo = (int)threadParam.threadNo;
    BufferedReader readsFile = threadParam.bufferedReadsFile;
    bool EOF = false;

    // per-thread batch buffers, allocated once and handed to ReadReads for every batch
    Sequence[] readHeaderBatch = new Sequence[batchSize];
    Sequence[] readBatch = new Sequence[batchSize];
    for (int i = 0; i < batchSize; i++)
    {
        readHeaderBatch[i] = new Sequence(defaultHeaderLength);
        readBatch[i] = new Sequence(defaultReadLength);
    }

    int readsInBatch = 0;
    long threadReadsRead = 0;
    long threadReadsProcessed = 0;

    // per-read working arrays, reused for every read and grown on demand (see below)
    const int initialMersPerRead = 1000;
    ulong[] mersFromRead = new ulong[initialMersPerRead];
    bool[] merValid = new bool[initialMersPerRead];
    ulong[] canonicalMersFromRead = new ulong[initialMersPerRead];
    int[] plusDepths = new int[initialMersPerRead];
    int[] rcDepths = new int[initialMersPerRead];

    bool deepUnbalanced = false;
    long threadDeepUnbalancedCount = 0;
    int minDepth = averageDepth / 20;

    while (!EOF)
    {
        // the reader is shared by all workers, so batches are fetched under a lock
        lock (readsFile)
        {
            readsInBatch = readsFile.ReadReads(batchSize, readHeaderBatch, readBatch, null, null);
            // a short batch means the file has been exhausted
            if (readsInBatch != batchSize)
            {
                EOF = true;
            }
            threadReadsRead += readsInBatch;
        }

        // shared progress counter: update atomically so concurrent workers don't lose increments
        Interlocked.Add(ref progressReadsProcessed, readsInBatch);

        for (int r = 0; r < readsInBatch; r++)
        {
            threadReadsProcessed++;

            Sequence read = readBatch[r];
            int readLength = read.Length;

            // skip reads shorter than two mers
            if (readLength < 2 * merSize)
            {
                continue;
            }

            // Stride between pair starting positions: every mer for reads under 200 bases,
            // every second mer otherwise. Kept in a local so that workers handling reads of
            // different lengths cannot change each other's stride part-way through a read.
            int readStepSize = readLength < 200 ? 1 : 2;

            int merCount = MerStrings.GenerateMersFromRead(read, merSize, ref mersFromRead, ref merValid);

            // mersFromRead/merValid are passed by ref, so presumably the callee can replace them
            // with larger arrays for long reads (TODO confirm). The arrays below are only ever
            // indexed here and in the depth/pair helpers, so grow them to match merCount rather
            // than overrunning the initial allocation.
            if (merCount > canonicalMersFromRead.Length)
            {
                Array.Resize(ref canonicalMersFromRead, merCount);
            }
            if (merCount > plusDepths.Length)
            {
                Array.Resize(ref plusDepths, merCount);
            }
            if (merCount > rcDepths.Length)
            {
                Array.Resize(ref rcDepths, merCount);
            }

            // canonical form of each valid mer = the smaller of the mer and its reverse complement
            for (int i = 0; i < merCount; i++)
            {
                if (merValid[i])
                {
                    ulong rcMer = MerStrings.ReverseComplement(mersFromRead[i], merSize);
                    if (rcMer < mersFromRead[i])
                    {
                        canonicalMersFromRead[i] = rcMer;
                    }
                    else
                    {
                        canonicalMersFromRead[i] = mersFromRead[i];
                    }
                }
            }

            GetDepthsForRead(merCount, mersFromRead, canonicalMersFromRead, merValid, plusDepths, rcDepths, minDepth, out deepUnbalanced);

            // reads flagged as deep-unbalanced are counted but contribute no pairs
            if (deepUnbalanced)
            {
                threadDeepUnbalancedCount++;
                continue;
            }

            ulong pair;
            int pairDepth;
            bool gotPair;
            int lastM = read.Length - pairStride;

            // generate pairs up to the end of the read (used to only generate from first part)
            for (int startingM = 0; startingM < lastM; startingM += readStepSize)
            {
                if (!merValid[startingM])
                {
                    continue;
                }

                gotPair = GeneratePairFromRead(mersFromRead, merValid, plusDepths, rcDepths, startingM, merCount, minDepth, out pair, out pairDepth);
                if (gotPair)
                {
                    // store the canonical (smaller) of the pair and its 32-base reverse complement
                    ulong rcPair = MerStrings.ReverseComplement(pair, 32);
                    if (rcPair < pair)
                    {
                        pair = rcPair;
                    }

                    uniquePairs.AddIfNotPresent(pair, pairDepth, threadNo);
                }
            }
        }
    }

    // fold this thread's counters into the shared totals
    Interlocked.Add(ref totalReadsProcessed, threadReadsProcessed);
    Interlocked.Add(ref totalReadsRead, threadReadsRead);
    Interlocked.Add(ref totalDeepUnbalancedReads, threadDeepUnbalancedCount);

    // tell the waiting caller that this worker is done
    threadFinished[threadNo].Set();
}