Пример #1
0
        /// <summary>
        /// Parses one tab-delimited alignment line into a SAMAlignedSequence.
        /// The first eleven fields are mandatory (QName, Flag, RName, Pos, MapQ, CIGAR, MRNM,
        /// MPos, ISize, sequence data, quality data); every following field is parsed as an
        /// optional field of the form TAG:VTYPE:VALUE.
        /// </summary>
        /// <param name="bioText">sequence alignment text.</param>
        /// <param name="isReadOnly">
        /// Flag to indicate whether the sequences in the resulting sequence alignment should be in readonly mode or not.
        /// If this flag is set to true then the resulting sequences's isReadOnly property
        /// will be set to true, otherwise it will be set to false.
        /// </param>
        /// <param name="alphabet">Alphabet of the sequences.</param>
        /// <param name="encoding">Required encoding.</param>
        /// <param name="referenceSequences">Reference sequences; may be null or empty.</param>
        /// <returns>The aligned sequence built from the line.</returns>
        /// <exception cref="ArgumentNullException">Thrown when bioText is null.</exception>
        /// <exception cref="FormatException">
        /// Thrown when the line has fewer than eleven fields or contains an invalid optional field.
        /// </exception>
        private static SAMAlignedSequence ParseSequence(string bioText, bool isReadOnly, IAlphabet alphabet, IEncoding encoding, IList <ISequence> referenceSequences)
        {
            const int optionalTokenStartingIndex = 11;
            const int optionalFieldPartCount     = 3;

            if (bioText == null)
            {
                throw new ArgumentNullException("bioText");
            }

            string[] tokens = bioText.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

            // Fail with a descriptive format error instead of an IndexOutOfRangeException
            // when the line is truncated.
            if (tokens.Length < optionalTokenStartingIndex)
            {
                throw new FormatException(
                          string.Format(
                              CultureInfo.CurrentCulture,
                              "Invalid alignment line: expected at least {0} tab-delimited fields but found {1}.",
                              optionalTokenStartingIndex,
                              tokens.Length));
            }

            SAMAlignedSequence alignedSeq = new SAMAlignedSequence();

            alignedSeq.QName = tokens[0];
            alignedSeq.Flag  = SAMAlignedSequenceHeader.GetFlag(tokens[1]);
            alignedSeq.RName = tokens[2];
            alignedSeq.Pos   = int.Parse(tokens[3], CultureInfo.InvariantCulture);
            alignedSeq.MapQ  = int.Parse(tokens[4], CultureInfo.InvariantCulture);
            alignedSeq.CIGAR = tokens[5];

            // "=" in the mate reference field means "same reference as this read".
            alignedSeq.MRNM  = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
            alignedSeq.MPos  = int.Parse(tokens[7], CultureInfo.InvariantCulture);
            alignedSeq.ISize = int.Parse(tokens[8], CultureInfo.InvariantCulture);

            // Look up the reference sequence by ID (case-insensitive); stays null when not found.
            ISequence refSeq = null;

            if (referenceSequences != null && referenceSequences.Count > 0)
            {
                refSeq = referenceSequences.FirstOrDefault(R => string.Compare(R.ID, alignedSeq.RName, StringComparison.OrdinalIgnoreCase) == 0);
            }

            ParseQualityNSequence(alignedSeq, alphabet, encoding, tokens[9], tokens[10], refSeq, isReadOnly);

            for (int i = optionalTokenStartingIndex; i < tokens.Length; i++)
            {
                if (!Helper.IsValidRegexValue(OptionalFieldRegex, tokens[i]))
                {
                    throw new FormatException(string.Format(CultureInfo.CurrentCulture, Resource.InvalidOptionalField, tokens[i]));
                }

                // Split into at most three parts so that a VALUE which itself contains ':'
                // is kept whole instead of being truncated at its first colon.
                string[] opttokens = tokens[i].Split(colonDelim, optionalFieldPartCount, StringSplitOptions.RemoveEmptyEntries);
                if (opttokens.Length < optionalFieldPartCount)
                {
                    throw new FormatException(string.Format(CultureInfo.CurrentCulture, Resource.InvalidOptionalField, tokens[i]));
                }

                SAMOptionalField optField = new SAMOptionalField();
                optField.Tag   = opttokens[0];
                optField.VType = opttokens[1];
                optField.Value = opttokens[2];

                alignedSeq.OptionalFields.Add(optField);
            }

            return(alignedSeq);
        }
Пример #2
0
        /// <summary>
        /// Resolves the named clone library and delegates to the overload that takes
        /// the library information to determine the paired read type.
        /// </summary>
        /// <param name="read1">First aligned sequence.</param>
        /// <param name="read2">Second aligned sequence.</param>
        /// <param name="libraryName">Name of the clone library to look up.</param>
        /// <returns>The paired read type reported by the delegated overload.</returns>
        /// <exception cref="ArgumentNullException">Thrown when libraryName is null or empty.</exception>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when no library information exists for libraryName.</exception>
        public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, string libraryName)
        {
            bool nameMissing = libraryName == null || libraryName.Length == 0;
            if (nameMissing)
            {
                throw new ArgumentNullException("libraryName");
            }

            CloneLibraryInformation info = CloneLibrary.Instance.GetLibraryInformation(libraryName);
            if (info != null)
            {
                return GetPairedReadType(read1, read2, info);
            }

            // The name was supplied but is not known to the clone library registry.
            throw new ArgumentOutOfRangeException("libraryName");
        }
Пример #3
0
        /// <summary>
        /// Computes the insert length of a read pair from the ISize values of the two reads.
        /// </summary>
        /// <param name="read1">First read; must not be null.</param>
        /// <param name="read2">Second read; a null value yields 0.</param>
        /// <param name="validate">
        /// When true, the pair type is determined first and 0 is returned unless the type is
        /// Normal or LengthAnomaly.
        /// </param>
        /// <returns>
        /// The absolute value of read1.ISize when read1.ISize equals the negated read2.ISize; otherwise 0.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when read1 is null.</exception>
        public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2, bool validate)
        {
            //                      reference chromosome
            //5'                         -->                      3'
            //----------------------------------------------------- F strand
            //
            //3'                         <--                       5'
            //----------------------------------------------------- R strand
            //        read1                         read2
            //    5'             3'             3'            5'
            //         -->                          <--
            //    |--------------               --------------|
            //    |<----------insert length------------------>|

            if (read1 == null)
            {
                throw new ArgumentNullException("read1");
            }

            if (read2 == null)
            {
                return 0;
            }

            if (validate)
            {
                // Mean and standard deviation of 0 are passed; only the resulting type category matters here.
                switch (GetPairedReadType(read1, read2, 0, 0))
                {
                    case PairedReadType.Normal:
                    case PairedReadType.LengthAnomaly:
                        break;

                    default:
                        return 0;
                }
            }

            // The two reads must report mirrored insert sizes for the value to be usable.
            int firstSize = read1.ISize;
            if (firstSize != -read2.ISize)
            {
                return 0;
            }

            if (firstSize < 0)
            {
                return -firstSize;
            }

            return firstSize;
        }
Пример #4
0
        /// <summary>
        /// Parses sequence data and quality values and stores the resulting sequence in the
        /// QuerySequence property of the given SAMAlignedSequence.
        /// Positions of "." and "=" symbols are recorded in DotSymbolIndexes and EqualSymbolIndexes;
        /// "." is replaced with N and "=" with the symbol taken from the reference sequence.
        /// Nothing is stored when the sequence data is "*".
        /// </summary>
        /// <param name="alignedSeq">SAM aligned Sequence.</param>
        /// <param name="alphabet">Alphabet of the sequence to be created.</param>
        /// <param name="Encoding">Encoding to use while creating sequence; may be null.</param>
        /// <param name="sequencedata">Sequence data.</param>
        /// <param name="qualitydata">Quality values, or "*" when no quality values are available.</param>
        /// <param name="refSeq">Reference sequence if known; required when the sequence data contains "=".</param>
        /// <param name="isReadOnly">Flag to indicate whether the new sequence is required to be readonly or not.</param>
        /// <exception cref="ArgumentNullException">
        /// Thrown when alignedSeq is null, or when sequencedata or qualitydata is null, empty or whitespace.
        /// </exception>
        /// <exception cref="FileFormatException">Thrown when sequence data and quality data differ in length.</exception>
        /// <exception cref="ArgumentException">Thrown when "=" symbols are present but refSeq is null.</exception>
        public static void ParseQualityNSequence(SAMAlignedSequence alignedSeq, IAlphabet alphabet, IEncoding Encoding, string sequencedata, string qualitydata, ISequence refSeq, bool isReadOnly)
        {
            if (alignedSeq == null)
            {
                throw new ArgumentNullException("alignedSeq");
            }

            if (string.IsNullOrWhiteSpace(sequencedata))
            {
                throw new ArgumentNullException("sequencedata");
            }

            if (string.IsNullOrWhiteSpace(qualitydata))
            {
                throw new ArgumentNullException("qualitydata");
            }

            FastQFormatType qualityFormat = QualityFormatType;

            // "*" means no sequence is stored; leave alignedSeq untouched.
            if (sequencedata.Equals("*"))
            {
                return;
            }

            // "*" in the quality column means the read carries no quality scores.
            bool   hasQualityScores = !qualitydata.Equals("*");
            byte[] scores           = null;

            if (hasQualityScores)
            {
                scores = ASCIIEncoding.ASCII.GetBytes(qualitydata);

                // One quality score is expected per sequence symbol.
                if (sequencedata.Length != qualitydata.Length)
                {
                    string detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoresLength, alignedSeq.QName);
                    string error  = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Resource.SAM_NAME, detail);
                    Trace.Report(error);
                    throw new FileFormatException(error);
                }
            }

            // Remember where "." occurs, then substitute N for it.
            for (int pos = 0; pos < sequencedata.Length; pos++)
            {
                if (sequencedata[pos] == '.')
                {
                    alignedSeq.DotSymbolIndexes.Add(pos);
                }
            }

            if (alignedSeq.DotSymbolIndexes.Count > 0)
            {
                sequencedata = sequencedata.Replace('.', 'N');
            }

            // Remember where "=" occurs, then substitute the reference symbol for it.
            for (int pos = 0; pos < sequencedata.Length; pos++)
            {
                if (sequencedata[pos] == '=')
                {
                    alignedSeq.EqualSymbolIndexes.Add(pos);
                }
            }

            if (alignedSeq.EqualSymbolIndexes.Count > 0)
            {
                if (refSeq == null)
                {
                    throw new ArgumentException(Resource.RefSequenceNofFound);
                }

                // NOTE(review): the reference is indexed with the position inside the read,
                // not with an alignment-adjusted reference position - confirm this is intended.
                foreach (int pos in alignedSeq.EqualSymbolIndexes)
                {
                    sequencedata = sequencedata.Remove(pos, 1).Insert(pos, refSeq[pos].Symbol.ToString());
                }
            }

            if (hasQualityScores)
            {
                QualitativeSequence withQuality = Encoding == null
                                                  ? new QualitativeSequence(alphabet, qualityFormat, sequencedata, scores)
                                                  : new QualitativeSequence(alphabet, qualityFormat, Encoding, sequencedata, scores);

                withQuality.ID           = alignedSeq.QName;
                withQuality.IsReadOnly   = isReadOnly;
                alignedSeq.QuerySequence = withQuality;
                return;
            }

            Sequence plain = Encoding == null
                             ? new Sequence(alphabet, sequencedata)
                             : new Sequence(alphabet, Encoding, sequencedata);

            plain.ID                 = alignedSeq.QName;
            plain.IsReadOnly         = isReadOnly;
            alignedSeq.QuerySequence = plain;
        }
Пример #5
0
        /// <summary>
        /// Gets the paired reads when DV is enabled.
        /// Reads flagged as paired are grouped by query name in one pass over QuerySequences;
        /// each group records the positions of its reads (Index1, Index2, ReadIndexes) in QuerySequences.
        /// </summary>
        /// <param name="meanLengthOfInsert">Mean of the insert length.</param>
        /// <param name="standardDeviationOfInsert">Standard deviation of insert length.</param>
        /// <param name="calculate">If this flag is set then the insert lengths of Normal and LengthAnomaly
        /// pairs are accumulated and, when at least one pair was counted, handed to UpdateType
        /// together with the list of paired reads.</param>
        /// <returns>List of paired read.</returns>
        private IList <PairedRead> GetDVAwarePairedReads(float meanLengthOfInsert, float standardDeviationOfInsert, bool calculate = false)
        {
            // Dictionary helps to get the information at one pass of aligned sequence list.
            Dictionary <string, DVEnabledPairedRead> pairedReads = new Dictionary <string, DVEnabledPairedRead>();
            double sum   = 0;
            int    count = 0;

            for (int i = 0; i < QuerySequences.Count; i++)
            {
                DVEnabledPairedRead pairedRead;
                SAMAlignedSequence  read = QuerySequences[i];
                if ((read.Flag & SAMFlags.PairedRead) != SAMFlags.PairedRead)
                {
                    continue;
                }

                if (!pairedReads.TryGetValue(read.QName, out pairedRead))
                {
                    // First read seen with this name: a read with a reference name goes to slot 1,
                    // a read without one ("*" or empty) goes to slot 2.
                    pairedRead = new DVEnabledPairedRead(QuerySequences);
                    if (!string.IsNullOrEmpty(read.RName) && !read.RName.Equals("*"))
                    {
                        pairedRead.Index1 = i;
                    }
                    else
                    {
                        pairedRead.Index2 = i;
                    }

                    pairedRead.PairedType   = PairedReadType.Orphan;
                    pairedRead.InsertLength = 0;
                    pairedReads.Add(read.QName, pairedRead);
                }
                else if (pairedRead.Index2 == -1 || pairedRead.Index1 == -1)
                {
                    // Second read with this name: fill the slot that is still empty.
                    if (pairedRead.Index2 == -1)
                    {
                        pairedRead.Index2 = i;
                    }
                    else
                    {
                        pairedRead.Index1 = i;
                    }

                    // For best performance,
                    // 1. BAM/SAM file should be sorted by reads name.
                    // 2. If sorted on mapping position then give unmapped read a coordinate (generally the coordinate of the mapped mate)
                    //    for sorting/indexing purposes only.
                    pairedRead.PairedType = PairedRead.GetPairedReadType(pairedRead.Read1, pairedRead.Read2, meanLengthOfInsert, standardDeviationOfInsert);

                    if (pairedRead.PairedType == PairedReadType.Normal || pairedRead.PairedType == PairedReadType.LengthAnomaly)
                    {
                        pairedRead.InsertLength = PairedRead.GetInsertLength(pairedRead.Read1, pairedRead.Read2);

                        if (calculate)
                        {
                            sum += pairedRead.InsertLength;
                            count++;
                        }
                    }
                }
                else
                {
                    // Third or later read with this name. Take the pair's earlier contribution out of
                    // the running totals BEFORE clearing InsertLength, and only when it had actually
                    // been counted (type Normal or LengthAnomaly); later hits find the type already
                    // set to MultipleHits and therefore subtract nothing.
                    bool wasCounted = pairedRead.PairedType == PairedReadType.Normal ||
                                      pairedRead.PairedType == PairedReadType.LengthAnomaly;
                    if (calculate && wasCounted)
                    {
                        sum -= pairedRead.InsertLength;
                        count--;
                    }

                    pairedRead.InsertLength = 0;
                    pairedRead.ReadIndexes.Add(i);
                    pairedRead.PairedType = PairedReadType.MultipleHits;
                }
            }

            List <PairedRead> allreads = pairedReads.Values.ToList <PairedRead>();

            if (calculate && count > 0)
            {
                UpdateType(allreads, sum, count);
            }

            return(allreads);
        }
Пример #6
0
        /// <summary>
        /// Gets the paired reads when SAMAligned sequences are in memory.
        /// Reads flagged as paired are grouped by query name in one pass over QuerySequences.
        /// </summary>
        /// <param name="meanLengthOfInsert">Mean of the insert length.</param>
        /// <param name="standardDeviationOfInsert">Standard deviation of insert length.</param>
        /// <param name="calculate">If this flag is set then the insert lengths of Normal and LengthAnomaly
        /// pairs are accumulated and, when at least one pair was counted, handed to UpdateType
        /// together with the list of paired reads.</param>
        /// <returns>List of paired read.</returns>
        private IList <PairedRead> GetInMemoryPairedReads(float meanLengthOfInsert, float standardDeviationOfInsert, bool calculate = false)
        {
            // Dictionary helps to get the information at one pass of aligned sequence list.
            Dictionary <string, PairedRead> pairedReads = new Dictionary <string, PairedRead>();
            double sum   = 0;
            int    count = 0;

            for (int i = 0; i < QuerySequences.Count; i++)
            {
                PairedRead         pairedRead;
                SAMAlignedSequence read = QuerySequences[i];
                if ((read.Flag & SAMFlags.PairedRead) != SAMFlags.PairedRead)
                {
                    continue;
                }

                if (!pairedReads.TryGetValue(read.QName, out pairedRead))
                {
                    // First read seen with this name: a read with a reference name goes to slot 1,
                    // a read without one ("*" or empty) goes to slot 2.
                    pairedRead = new PairedRead();
                    if (!string.IsNullOrEmpty(read.RName) && !read.RName.Equals("*"))
                    {
                        pairedRead.Read1 = read;
                    }
                    else
                    {
                        pairedRead.Read2 = read;
                    }

                    pairedRead.PairedType   = PairedReadType.Orphan;
                    pairedRead.InsertLength = 0;
                    pairedReads.Add(read.QName, pairedRead);
                }
                else if (pairedRead.Read2 == null || pairedRead.Read1 == null)
                {
                    // Second read with this name: fill the slot that is still empty.
                    if (pairedRead.Read2 == null)
                    {
                        pairedRead.Read2 = read;
                    }
                    else
                    {
                        pairedRead.Read1 = read;
                    }

                    pairedRead.PairedType = PairedRead.GetPairedReadType(pairedRead.Read1, pairedRead.Read2, meanLengthOfInsert, standardDeviationOfInsert);
                    if (pairedRead.PairedType == PairedReadType.Normal || pairedRead.PairedType == PairedReadType.LengthAnomaly)
                    {
                        pairedRead.InsertLength = PairedRead.GetInsertLength(pairedRead.Read1, pairedRead.Read2);
                        if (calculate)
                        {
                            sum += pairedRead.InsertLength;
                            count++;
                        }
                    }
                }
                else
                {
                    // Third or later read with this name. Take the pair's earlier contribution out of
                    // the running totals BEFORE clearing InsertLength, and only when it had actually
                    // been counted (type Normal or LengthAnomaly); later hits find the type already
                    // set to MultipleHits and therefore subtract nothing.
                    bool wasCounted = pairedRead.PairedType == PairedReadType.Normal ||
                                      pairedRead.PairedType == PairedReadType.LengthAnomaly;
                    if (calculate && wasCounted)
                    {
                        sum -= pairedRead.InsertLength;
                        count--;
                    }

                    pairedRead.InsertLength = 0;
                    pairedRead.Reads.Add(read);
                    pairedRead.PairedType = PairedReadType.MultipleHits;
                }
            }

            List <PairedRead> allreads = pairedReads.Values.ToList();

            if (calculate && count > 0)
            {
                UpdateType(allreads, sum, count);
            }

            return(allreads);
        }
Пример #7
0
 /// <summary>
 /// Computes the insert length of a read pair without validating the pair type first.
 /// Forwards to the three-argument overload with validation switched off.
 /// </summary>
 /// <param name="read1">First read.</param>
 /// <param name="read2">Second read.</param>
 /// <returns>The value returned by the three-argument overload.</returns>
 public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2)
 {
     int insertLength = GetInsertLength(read1, read2, validate: false);
     return insertLength;
 }
Пример #8
0
        /// <summary>
        /// Classifies a pair of aligned reads.
        /// The checks run in this order: missing or unmapped second read (Orphan), different
        /// reference names (Chimera), same orientation or forward read starting after the reverse
        /// read (StructuralAnomaly), insert length outside mean +/- 3 standard deviations
        /// (LengthAnomaly); anything else is Normal.
        /// </summary>
        /// <param name="read1">First aligned sequence; must not be null.</param>
        /// <param name="read2">Second aligned sequence; null yields Orphan.</param>
        /// <param name="meanLengthOfInsert">Mean of the insertion length.</param>
        /// <param name="standardDeviationOfInsert">Standard deviation of insertion length.</param>
        /// <returns>The paired read type of the two reads.</returns>
        /// <exception cref="ArgumentNullException">Thrown when read1 is null.</exception>
        public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, float meanLengthOfInsert, float standardDeviationOfInsert)
        {
            if (read1 == null)
            {
                throw new ArgumentNullException("read1");
            }

            if (read2 == null)
            {
                return PairedReadType.Orphan;
            }

            // The second read counts as unmapped when it has no reference name, the name is "*",
            // or its UnmappedQuery flag is set.
            bool secondUnmapped = string.IsNullOrEmpty(read2.RName)
                                  || read2.RName.Equals("*")
                                  || (read2.Flag & SAMFlags.UnmappedQuery) == SAMFlags.UnmappedQuery;
            if (secondUnmapped)
            {
                return PairedReadType.Orphan;
            }

            if (!read2.RName.Equals(read1.RName))
            {
                return PairedReadType.Chimera;
            }

            bool bothForward = IsForwardRead(read1) && IsForwardRead(read2);
            bool bothReverse = IsReverseRead(read1) && IsReverseRead(read2);
            if (bothForward || bothReverse)
            {
                return PairedReadType.StructuralAnomaly;
            }

            // Work out which read supplies the forward start position and which the reverse one.
            bool firstIsForward = IsForwardRead(read1);
            int  forwardStart   = firstIsForward ? read1.Pos : read2.Pos;
            int  reverseStart   = firstIsForward ? read2.Pos : read1.Pos;

            if (forwardStart > reverseStart)
            {
                return PairedReadType.StructuralAnomaly;
            }

            int insertLength = GetInsertLength(read1, read2);

            // mean + 3 * sigma
            float upperLimit = meanLengthOfInsert + (3 * standardDeviationOfInsert);
            // mean - 3 * sigma
            float lowerLimit = meanLengthOfInsert - (3 * standardDeviationOfInsert);

            bool outsideLimits = insertLength > upperLimit || insertLength < lowerLimit;
            return outsideLimits ? PairedReadType.LengthAnomaly : PairedReadType.Normal;
        }