Ejemplo n.º 1
0
        // Low-rep mer buffers that have been filled and set aside; left null until culledBuffer first fills up.
        public List <LowRepMerBuffer> filledCulledBuffers = null;   // only used if culledBuffer becomes full

        // Constructor: works out how many shared partitions are needed for the expected number of k-mers,
        // allocates one MerDictionary per partition, and reserves (but does not allocate) a slot for each
        // thread's overflow table.
        //   dictionarySize - expected number of distinct k-mers (doubled internally)
        //   noThreads      - number of worker threads; sizes the per-thread overflow array
        public MerTables(long dictionarySize, int noThreads)
        {
            // double the requested size to compensate for the number of repeated error k-mers
            dictionarySize *= 2;

            // number of shared partitions needed to safely hold this many distinct k-mers (never fewer than one)
            int partitionsNeeded = (int)(dictionarySize / maxTableSize + 1);
            this.noOfPartitions = partitionsNeeded < 1 ? 1 : partitionsNeeded;

            // size of each partition, clamped so that it is never smaller than the minimum table size
            int partitionSize = (int)(dictionarySize / noOfPartitions);
            partitionSize = partitionSize < minTableSize ? minTableSize : partitionSize;

            // shared partitioned dictionaries and their 'full' flags (flags default to false)
            repeatedMers     = new MerDictionary[noOfPartitions];
            repeatedMersFull = new bool[noOfPartitions];

            // per-thread overflow tables: the slots are left null here and each table is
            // allocated when first used, to save on memory space
            overflowMers = new MerDictionary[noThreads];

            // build the dictionary for every shared partition
            for (int p = 0; p < repeatedMers.Length; p++)
            {
                repeatedMers[p] = new MerDictionary(partitionSize, fullMerMask);
            }
        }
Ejemplo n.º 2
0
        // Stores 'value' for 'mer'. If the mer is already in its shared partition, the stored value is only
        // replaced when 'value' is larger. A new mer goes into the shared partition while that partition is
        // not flagged full, otherwise into the calling thread's overflow table (created on first use).
        //   mer      - the k-mer key
        //   value    - the value to record for the k-mer
        //   threadNo - index of the calling thread's slot in overflowMers
        public void AddIfNotPresent(ulong mer, long value, int threadNo)
        {
            int partitionNo = (mer.GetHashCode() & int31Mask) % noOfPartitions;
            MerDictionary partition = repeatedMers[partitionNo];

            // the mer may have been seen before - if so, keep whichever value is larger
            int existingIdx = partition.FindEntry(mer);
            if (existingIdx >= 0)
            {
                if (value > partition.entries[existingIdx].value)
                {
                    partition.entries[existingIdx].value = value;
                }

                return;
            }

            // new mer and there is still space in the shared table, so add it there.
            // Could be a race here where the mer is added twice. Resolved during writing phase.
            if (!repeatedMersFull[partitionNo])
            {
                if (partition.Add(mer, value))
                {
                    repeatedMersFull[partitionNo] = true;
                }

                return;
            }

            // new mer but no space in the shared table, so it goes to this thread's overflow table
            MerDictionary overflow = overflowMers[threadNo];
            if (overflow == null)
            {
                overflow = new MerDictionary(partition.lengthEntries / 10, fullMerMask);
                overflowMers[threadNo] = overflow;
                Console.WriteLine("added overflow for thread " + threadNo + " for [" + partitionNo + "]");
            }

            // the add always works but may report 'no more please', in which case the table is grown
            if (overflow.Add(mer, value))
            {
                overflow.Resize();
            }
        }
        // Applies 'addingIncrement' to a k-mer whose singleton entry has just been promoted to a repeat.
        // Tries, in order: the shared partition, this thread's overflow table, a (possibly newly created)
        // overflow table for this thread, and finally a fresh entry in the shared partition.
        //   partitionNo     - index of the shared partition the k-mer hashes to
        //   pMer            - key used against the shared partition
        //   threadNo        - index of the calling thread's slot in overflowMers
        //   mer             - key used against the per-thread overflow table
        //   addingIncrement - amount to add to the k-mer's count
        private void UpdateRepeatedMerAfterPromotion(int partitionNo, ulong pMer, int threadNo, ulong mer, long addingIncrement)
        {
            // update the count for this mer if it's in the shared repeated set
            if (repeatedMers[partitionNo].UpdateIfPresent(pMer, addingIncrement))
            {
                return;
            }

            // mer could be in the thread-local overflow table (or it may have been promoted by another thread)
            if (overflowMers[threadNo] != null && overflowMers[threadNo].UpdateIfPresent(mer, addingIncrement))
            {
                return;
            }

            // if we get here, the promoted singleton must have gone to another thread's overflow table (and the partition must be full)
            // so create a new overflow table if we don't already have one
            if (repeatedMersFull[partitionNo] && overflowMers[threadNo] == null)
            {
                overflowMers[threadNo] = new MerDictionary(repeatedMers[partitionNo].lengthEntries / 10, fullMerMask); // set length properly *****
                Console.WriteLine("added overflow for thread " + threadNo);
            }

            // finally add this promoted singleton to the overflow table for this thread
            if (overflowMers[threadNo] != null)
            {
                // Add always succeeds but may report that the table is now full; grow it, as the other
                // overflow-table call sites do, rather than silently dropping that status
                bool overflowFull = overflowMers[threadNo].Add(mer, addingIncrement);
                if (overflowFull)
                {
                    overflowMers[threadNo].Resize();
                }

                return;
            }

            // the shared repeated table isn't full but we didn't find the singleton there, could have been a race on a bucket that resulted in an orphan
            bool partitionFull = repeatedMers[partitionNo].Add(pMer, addingIncrement);
            if (partitionFull)
            {
                // record the 'full' status so that later new mers are diverted to the overflow tables
                repeatedMersFull[partitionNo] = true;
            }
        }
Ejemplo n.º 4
0
        // Counts one occurrence of 'mer'. Returns true if an existing entry was incremented (in the shared
        // partition or in this thread's overflow table) and false if a new entry with a count of 1 was added.
        //   mer      - the k-mer (pair) key
        //   threadNo - index of the calling thread's slot in overflowMers
        public bool AddOrIncrement(ulong mer, int threadNo)
        {
            int partitionNo = (mer.GetHashCode() & int31Mask) % noOfPartitions;
            MerDictionary partition = repeatedMers[partitionNo];

            // shared table first - bump the count if the entry is already there
            int sharedIdx = partition.FindEntry(mer);
            if (sharedIdx >= 0)
            {
                // minor race here: the increment is not atomic so counts could occasionally be slightly low,
                // which doesn't matter for the purposes for which the count is used
                // (Interlocked.Increment could be used instead if this ever needs to be an accurate count)
                partition.values[sharedIdx]++;
                return true;
            }

            // not in the shared table - it may be in this thread's overflow table if the partition is full
            if (repeatedMersFull[partitionNo] && overflowMers[threadNo] != null)
            {
                int overflowIdx = overflowMers[threadNo].FindEntry(mer);
                if (overflowIdx >= 0)
                {
                    // already there so just add to its count
                    overflowMers[threadNo].values[overflowIdx]++;
                    return true;
                }
            }

            // entry not present anywhere, so add one
            if (!repeatedMersFull[partitionNo])
            {
                // space in main table - so add it. Could be a race here where the mer is added twice. Resolved during writing phase.
                if (partition.Add(mer, 1))
                {
                    repeatedMersFull[partitionNo] = true;
                }

                return false;
            }

            // main table is full, so the entry goes to this thread's overflow table (created on first use)
            if (overflowMers[threadNo] == null)
            {
                overflowMers[threadNo] = new MerDictionary(partition.lengthEntries / 10, merSize, 1);
            }

            // add will always work but could return 'no more please' status so we'll resize in this (unlikely) case
            if (overflowMers[threadNo].Add(mer, 1))
            {
                overflowMers[threadNo].Resize();
            }

            return false;
        }
        // Constructor: sizes and allocates the shared (partitioned) repeat tables, the partitioned singleton
        // filters and the bookkeeping lists used when flushing singletons and low-rep mers.
        // Per-thread overflow tables are only given a slot here; each is allocated when first used.
        //   dictionarySize - expected number of distinct k-mers (doubled internally)
        //   tempDir        - directory name stored in tempDirectory
        //   noThreads      - number of worker threads; sizes the per-thread overflow array
        public MerTables(long dictionarySize, string tempDir, int noThreads)
        {
            // scale genome size to compensate for the number of repeated error k-mers
            dictionarySize = dictionarySize * 2;

            // how many shared mer partitions are needed to safely hold this many distinct k-mers?
            this.noOfPartitions = (int)(dictionarySize / maxTableSize + 1);
            if (this.noOfPartitions < 1)
            {
                this.noOfPartitions = 1;
            }

            // and how big should the partitions be?
            int partitionSize = (int)(dictionarySize / noOfPartitions);
            if (partitionSize < minTableSize)
            {
                partitionSize = minTableSize;
            }

            // how many singleton partitions are desirable?
            // per-partition singleton tables can use a considerable amount of memory, but too few will increase
            // the number of concurrently open files during merge
            if (noOfPartitions > 64)
            {
                noSingletonPartitions = 32;
                singletonPrefixBits   = 5;
            }
            else if (noOfPartitions > 16)
            {
                noSingletonPartitions = 16;
                singletonPrefixBits   = 4;
            }
            else if (noOfPartitions > 4)
            {
                noSingletonPartitions = 4;
                singletonPrefixBits   = 2;
            }
            else
            {
                noSingletonPartitions = 1;
                singletonPrefixBits   = 0;
            }

            this.tempDirectory = tempDir;

            repeatedMers                 = new MerDictionary[noOfPartitions];        // create partitioned dictionaries
            repeatedMersFull             = new bool[noOfPartitions];                 // create full flags array (default is false)
            overflowMers                 = new MerDictionary[noThreads];             // per-thread overflow tables (allocated when first used)
            singletonFilters             = new MerCollection[noSingletonPartitions]; // create partitioned singleton filters
            singletonsWaitingFlush       = new List <MerCollection> [noSingletonPartitions];
            singletonsWaitingFlushCounts = new List <int> [noSingletonPartitions];
            flushedSingletonFNs          = new List <string> [noSingletonPartitions];
            firstFlushedSingletonMer     = new List <ulong> [noSingletonPartitions];
            lastFlushedSingletonMer      = new List <ulong> [noSingletonPartitions];
            flushSingletonNumber         = new int[noSingletonPartitions];
            maxSingletonCapacity         = new int[noSingletonPartitions];

            flushedLowRepsFNs      = new List <string>();
            firstFlushedLowRepsMer = new List <ulong>();
            lastFlushedLowRepsMer  = new List <ulong>();

            // initialise per-partition structures
            for (int i = 0; i < noOfPartitions; i++)
            {
                repeatedMers[i] = new MerDictionary(partitionSize, fullMerMask);
            }

            // initialise per-singleton-partition structures
            for (int i = 0; i < noSingletonPartitions; i++)
            {
                // Earlier partitions get proportionally larger filters. The product is computed in 64-bit
                // arithmetic: in 32-bit, 3 * partitionSize can wrap to a negative number for large partitions,
                // which the clamping below would then silently turn into the minimum size.
                long unclampedSize      = 3L * partitionSize / noSingletonPartitions * (noSingletonPartitions - i);
                int scaledSingletonSize = (int)Math.Min(unclampedSize, (long)maxSingletonSize);
                scaledSingletonSize         = Math.Max(scaledSingletonSize, minSingletonSize);
                singletonFilters[i]         = new MerCollection(scaledSingletonSize, singletonMerMask);
                maxSingletonCapacity[i]     = singletonFilters[i].length * 9 / 10;   // flush threshold: 90% of the filter length
                flushedSingletonFNs[i]      = new List <string>();
                flushSingletonNumber[i]     = 1;
                firstFlushedSingletonMer[i] = new List <ulong>();
                lastFlushedSingletonMer[i]  = new List <ulong>();
            }
        }
        // flush the low-rep mers from the repeat tables, condense the remaining repeated mers and fold in the per-thread repeats. Can only be called after all the
        // threads have finished for a seq data file. This code is *not* thread-safe.
        //   merTable - the table set handed to each per-partition flushing thread
        //   fileNo   - file number passed through to WriteLowRepMers for the buffers written here
        public void FlushLowRepMers(MerTables merTable, int fileNo)
        {
            // allocate a buffer to hold the flushed low-rep mers (sized from the first partition's capacity)
            int initialBufferLength = this.repeatedMers[0].Capacity;

            culledBuffer              = new LowRepMerBuffer();
            culledBuffer.keys         = new ulong[initialBufferLength + noOfPartitions];
            culledBuffer.values       = new long[initialBufferLength + noOfPartitions];
            culledBuffer.idx          = 0;
            culledBuffer.bufferActive = true;
            culledBuffer.bufferNo     = 1;
            culledBuffer.limit        = initialBufferLength;
            culledLock = new object();

            // start one below-normal-priority flushing thread per partition...
            FlushingThreadParams[] flushingParams = new FlushingThreadParams[noOfPartitions];
            Thread[] flushingThreads = new Thread[noOfPartitions];

            for (int p = 0; p < noOfPartitions; p++)
            {
                flushingParams[p]             = new FlushingThreadParams();
                flushingParams[p].merTable    = merTable;
                flushingParams[p].partitionNo = p;
                flushingThreads[p]            = new Thread(new ParameterizedThreadStart(MerTables.FlushLowRepMersInPartition));
                flushingThreads[p].Priority   = ThreadPriority.BelowNormal;
                flushingThreads[p].Start(flushingParams[p]);
            }

            // ...and wait for all of them to finish
            for (int p = 0; p < noOfPartitions; p++)
            {
                flushingThreads[p].Join();
                flushingThreads[p] = null;
            }

            // write out any filled culled buffers
            int bufferNo = 0;

            if (filledCulledBuffers != null)
            {
                for (int i = 0; i < filledCulledBuffers.Count; i++)
                {
                    // NOTE(review): this passes keys.Length rather than the buffer's idx - confirm that a filled buffer is always completely populated
                    WriteLowRepMers(fileNo, bufferNo, filledCulledBuffers[i], filledCulledBuffers[i].keys.Length);
                    bufferNo++;
                    filledCulledBuffers[i] = null;
                }
                filledCulledBuffers = null;
            }
            // finally write out the remaining culled low-rep mers
            WriteLowRepMers(fileNo, bufferNo, culledBuffer, culledBuffer.idx);

            // return the temporary buffers
            culledBuffer = null;

            // finally push the per-thread dictionaries to the shared dictionary
            for (int t = 0; t < overflowMers.Length; t++)
            {
                if (overflowMers[t] == null)
                {
                    continue;
                }

                MerDictionary currentOverflow     = overflowMers[t];
                MerDictionary replacementOverflow = new MerDictionary(currentOverflow.Capacity, fullMerMask);

                foreach (KeyValuePair <ulong, long> kvp in currentOverflow)
                {
                    int absMerHashCode = kvp.Key.GetHashCode() & int31Mask;
                    int partitionNo    = absMerHashCode % noOfPartitions;

                    if (repeatedMersFull[partitionNo])
                    {
                        // shared partition still has no room, so the entry stays in this thread's overflow table
                        replacementOverflow.Add(kvp.Key, kvp.Value);
                    }
                    else
                    {
                        // Add reports 'true' when the table has become full (as at every other call site),
                        // so the partition must only be flagged full in that case - not after every
                        // successful add, which would divert all later entries to the overflow table
                        bool full = repeatedMers[partitionNo].Add(kvp.Key, kvp.Value);
                        if (full)
                        {
                            repeatedMersFull[partitionNo] = true;
                        }
                    }
                }

                overflowMers[t] = replacementOverflow;
            }
        }
        // Counts one occurrence of a k-mer.
        //   mer      - the k-mer as read; it is replaced by its reverse complement when that form is numerically smaller
        //   threadNo - index of the calling thread (used for the per-thread overflow tables and for tracing)
        //
        // Outline:
        //   1. pick the canonical form and the matching increment (upper or lower half of the 64-bit count pair)
        //   2. try to update an existing entry via UpdateRepeatedMer - if that succeeds we're done
        //   3. otherwise try to insert the k-mer into its singleton filter
        //        - inserted: first sighting; flush/replace the filter if it has reached its capacity threshold
        //        - not inserted: already a singleton, so promote it to the repeat tables (exactly one thread
        //          wins the CompareExchange and does the promotion, the others wait for the 'promoted' flag)
        //          and then apply this occurrence's increment via UpdateRepeatedMerAfterPromotion
        // The singleton filter's activeCount is incremented on entry to step 3 and decremented on exit.
        public void AddOrIncrement(ulong mer, int threadNo)
        {
            long  addingIncrement = 0x0000000100000000;                 // assume the as-read form is the canonical form (increment the high part of the count pair)
            ulong rcFlagToBeSet   = 0x0;                                // and that we don't want to set the RC flag

            // generate canonical k-mer first
            ulong rcMer = MerStrings.ReverseComplement(mer);

            if (rcMer < mer)
            {
                mer             = rcMer;
                addingIncrement = 0x0000000000000001;                   // increment the low part of the count pair
                rcFlagToBeSet   = singletonRCFlagMask;                  // remember if the canonical k-mer was the RC form
            }

            // repeat-table partition comes from the hash; singleton partition comes from the top singletonPrefixBits bits of the mer
            int absMerHashCode       = mer.GetHashCode() & int31Mask;
            int partitionNo          = absMerHashCode % noOfPartitions;
            int singletonPartitionNo = singletonPrefixBits == 0 ? 0 : (int)(mer >> (64 - singletonPrefixBits));

            // this mer may have been seen before, so first try updating it in one of the repeated mer tables
            bool updatedRepeat = UpdateRepeatedMer(partitionNo, mer, threadNo, mer, addingIncrement);

            if (updatedRepeat)
            {
                return;
            }

            // handling a k-mer for the first time - try adding it to the singletons table
            // ----------------------------------------------------------------------------

            // get a stable pointer to the current singletons table (in case someone else fills it and initiates a flush while we're still busy with it)
            MerCollection thisSingletonPartition = singletonFilters[singletonPartitionNo];

            // register as an active user of this table; a flushing thread waits for this count to drop before flushing
            Interlocked.Increment(ref thisSingletonPartition.activeCount);

            // try to add this mer to this partition's singletons collection (and fetch the existing singleton+flag if it's already there)
            int   filterIdx;
            ulong fMer  = mer | rcFlagToBeSet | singletonActiveFlagMask;
            bool  added = thisSingletonPartition.TryInsertKey(fMer, out filterIdx);

            if (added)
            {
                // successfully added this mer so we must be seeing it for the first time

                // if singleton table is already full enough, flush it out and empty the table
                if (thisSingletonPartition.Count >= maxSingletonCapacity[singletonPartitionNo])
                {
                    bool flushNeeded      = true;
                    int  flushNumberToUse = 0;

                    // lock this section to avoid two threads trying to flush/replace the same singleton buffer concurrently
                    lock (lockSingletons)
                    {
                        // test entry condition now that we have the lock (filter may have been reset while we were waiting)
                        if (!thisSingletonPartition.flushed)
                        {
                            // allocate a replacement table for the other threads to use while we're flushing this one
                            // (25% longer than the current one, capped at maxSingletonSize)
                            int newSingletonLength = thisSingletonPartition.length + thisSingletonPartition.length / 4;
                            if (newSingletonLength > maxSingletonSize)
                            {
                                newSingletonLength = maxSingletonSize;
                            }
                            MerCollection emptySingletonFilter = new MerCollection(newSingletonLength, singletonMerMask); // allocate new local filter for the partition

                            singletonFilters[singletonPartitionNo]     = emptySingletonFilter;                            // make it visible to the concurrent threads (single point assignment)
                            maxSingletonCapacity[singletonPartitionNo] = newSingletonLength * 8 / 10;                     // flush threshold for the replacement: 80% of its length
                            thisSingletonPartition.flushed             = true;
                            flushNumberToUse = flushSingletonNumber[singletonPartitionNo];
                            flushSingletonNumber[singletonPartitionNo]++;
                        }
                        else
                        {
                            // another thread has already replaced this table and taken on the flush
                            flushNeeded = false;
                        }
                    }

                    if (flushNeeded)
                    {
                        // this thread still holds one count on the table itself, so wait until it is the only one left
                        while (thisSingletonPartition.activeCount > 1)
                        {
                            // pause briefly to let any inflight updates to this table to complete
                            Thread.Sleep(100);
                        }
                        FlushSingletons(thisSingletonPartition, singletonPartitionNo, flushNumberToUse);
                    }
                    //flushes++;
                }
            }
            else
            {
                // Insert failed, so must be seeing this k-mer for second (or rarely more) time. Mark as inactive in singletons and add to a repeats table with appropriate counts.
                // There can be a race here with two threads trying to concurrently promote the same singleton. This is resolved by atomically clearing the singleton
                // active flag - and only one of the threads will get the 'active' flag returned from the Exchange. This thread does the promotion - and then sets the
                // promotion-complete bit for the singleton. The other threads will spin until they find this bit has been set.

                // trace point 1: the singleton entry as found after the failed insert
                if (tracing)
                {
                    lock (traceUpdates)
                    {
                        traceUpdates.Enqueue(new TraceEntry(threadNo, 1, singletonPartitionNo, filterIdx, (ulong)thisSingletonPartition.entries[filterIdx].key));
                        if (traceUpdates.Count > maxTrace)
                        {
                            traceUpdates.Dequeue();
                        }
                    }
                }

                // get the current value of this singleton entry (safe because the promotion changes are progressive)
                ulong merFromFilter = (ulong)thisSingletonPartition.entries[filterIdx].key;
                // and see if this singleton may have already been promoted
                bool activeSingleton = (merFromFilter & singletonActiveFlagMask) != 0;

                // if this singleton may be 'active', try to promote it
                if (activeSingleton)
                {
                    ulong inactiveMer = mer & singletonMerMask;                      // build what the inactive-but-being-promoted entry should look like
                    // if no-one else has altered the singleton entry, then set it to inactive-but-being-promoted
                    long currentMerFromFilter = Interlocked.CompareExchange(ref thisSingletonPartition.entries[filterIdx].key, (long)inactiveMer, (long)merFromFilter);

                    // trace point 2: the value returned by the CompareExchange
                    if (tracing)
                    {
                        lock (traceUpdates)
                        {
                            traceUpdates.Enqueue(new TraceEntry(threadNo, 2, singletonPartitionNo, filterIdx, (ulong)currentMerFromFilter));
                            if (traceUpdates.Count > maxTrace)
                            {
                                traceUpdates.Dequeue();
                            }
                        }
                    }

                    // if this thread successfully set the singleton to 'inactive', it will take care of the promotion
                    if (currentMerFromFilter == (long)merFromFilter)
                    {
                        ulong rcFlag = merFromFilter & singletonRCFlagMask;          // non-zero --> RC found in singletons

                        // the promoted entry starts with a count of one for the form in which the singleton was first seen
                        long initialCount = 0;
                        if (rcFlag != 0)                                        // singleton was seen in RC form
                        {
                            initialCount = 0x0000000000000001;
                        }
                        else                                                // singleton was seen in as-is form
                        {
                            initialCount = 0x0000000100000000;
                        }

                        if (repeatedMersFull[partitionNo])
                        {
                            // shared partition is full, so promote into this thread's overflow table (created on first use)
                            if (overflowMers[threadNo] == null)
                            {
                                overflowMers[threadNo] = new MerDictionary(repeatedMers[partitionNo].lengthEntries / 10, fullMerMask);
                                //Console.WriteLine("added overflow for thread " + threadNo + " for [" + partitionNo + "]");
                            }

                            bool full = overflowMers[threadNo].Add(mer, initialCount);
                            if (full)
                            {
                                overflowMers[threadNo].Resize();
                            }
                        }
                        else
                        {
                            // space in the shared partition, so promote into it and note if that filled it
                            bool full = repeatedMers[partitionNo].Add(mer, initialCount);
                            if (full)
                            {
                                repeatedMersFull[partitionNo] = true;
                            }
                        }

                        // now that the mer has been promoted, set the 'promoted' flag
                        inactiveMer = inactiveMer | (long)singletonPromotedFlagMask;
                        thisSingletonPartition.entries[filterIdx].key = (long)inactiveMer;

                        // trace point 3: the singleton entry after the 'promoted' flag has been set
                        if (tracing)
                        {
                            lock (traceUpdates)
                            {
                                traceUpdates.Enqueue(new TraceEntry(threadNo, 3, singletonPartitionNo, filterIdx, (ulong)thisSingletonPartition.entries[filterIdx].key));
                                if (traceUpdates.Count > maxTrace)
                                {
                                    traceUpdates.Dequeue();
                                }
                            }
                        }
                    }
                }

                // singleton is now known to be no longer active, so wait (if necessary) for the 'promoted' flag to be set and increment the repeat counter

                merFromFilter = (ulong)thisSingletonPartition.entries[filterIdx].key;

                // trace point 4: the singleton entry as seen just before waiting for the promotion to complete
                if (tracing)
                {
                    lock (traceUpdates)
                    {
                        traceUpdates.Enqueue(new TraceEntry(threadNo, 4, singletonPartitionNo, filterIdx, merFromFilter));
                        if (traceUpdates.Count > maxTrace)
                        {
                            traceUpdates.Dequeue();
                        }
                    }
                }

                // poll the entry until the 'promoted' flag is seen, sleeping 100ms between polls (there is no timeout on this loop)
                bool promotionComplete = (merFromFilter & singletonPromotedFlagMask) != 0;
                bool alreadySlept      = false;
                while (!promotionComplete)
                {
                    promotionComplete = (((ulong)thisSingletonPartition.entries[filterIdx].key & singletonPromotedFlagMask) != 0);
                    if (alreadySlept && !promotionComplete)
                    {
                        // still not promoted after at least one sleep: when tracing, dump the recent trace entries to trace.txt
                        if (tracing)
                        {
                            lock (traceUpdates)
                            {
                                StreamWriter trace = new StreamWriter("trace.txt");
                                foreach (TraceEntry t in traceUpdates)
                                {
                                    trace.WriteLine(t.place + "\t" + t.thread + "\t" + t.partition + "\t" + t.index + "\t" + t.value.ToString("x16"));
                                }
                                trace.Close();
                            }
                            Console.WriteLine("promotion still not complete after sleep");
                        }
                    }
                    if (!promotionComplete)
                    {
                        Thread.Sleep(100);
                    }
                    alreadySlept = true;
                }

                // the promoted entry now exists in a repeat table, so apply the increment for this occurrence
                UpdateRepeatedMerAfterPromotion(partitionNo, mer, threadNo, mer, addingIncrement);
                //if (!updateSucceeded)
                //{
                //    lock (traceUpdates)
                //    {
                //        StreamWriter trace = new StreamWriter("trace.txt");
                //        foreach (TraceEntry t in traceUpdates)
                //            trace.WriteLine(t.thread + "\t" + t.place + "\t" + t.partition + "\t" + t.index + "\t" + t.value.ToString("x16"));
                //        trace.Close();
                //    }
                //    Console.WriteLine("UpdateRepeatedMerRetry failed after waiting for promotion to complete");
                //}
            }

            // finished with this singleton table (balances the Increment above)
            Interlocked.Decrement(ref thisSingletonPartition.activeCount);
        }