Beispiel #1
0
        /// <summary>
        /// Determines whether the specified read is reverse oriented, i.e. whether
        /// IsForwardRead reports it as not being a forward read.
        /// </summary>
        /// <param name="read">Aligned sequence to inspect.</param>
        /// <returns>True when the read is not a forward read; otherwise false.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="read"/> is null.</exception>
        public static bool IsReverseRead(SAMAlignedSequence read)
        {
            if (read != null)
            {
                bool isForward = IsForwardRead(read);
                return !isForward;
            }

            throw new ArgumentNullException("read");
        }
Beispiel #2
0
        /// <summary>
        /// Gets the paired read type of two reads, taking the insert-length mean and
        /// standard deviation from the supplied library information.
        /// </summary>
        /// <param name="read1">First aligned sequence.</param>
        /// <param name="read2">Second aligned sequence.</param>
        /// <param name="libraryInfo">Library information providing MeanLengthOfInsert and StandardDeviationOfInsert.</param>
        /// <returns>The result of the overload that takes the mean and standard deviation explicitly.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="libraryInfo"/> is null.</exception>
        public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, CloneLibraryInformation libraryInfo)
        {
            if (libraryInfo != null)
            {
                var meanInsertLength = libraryInfo.MeanLengthOfInsert;
                var insertDeviation = libraryInfo.StandardDeviationOfInsert;

                return GetPairedReadType(read1, read2, meanInsertLength, insertDeviation);
            }

            throw new ArgumentNullException("libraryInfo");
        }
Beispiel #3
0
        /// <summary>
        /// Determines whether the specified read is forward oriented, i.e. whether the
        /// QueryOnReverseStrand bit of its flag is clear.
        /// </summary>
        /// <param name="read">Aligned sequence to inspect.</param>
        /// <returns>True when the QueryOnReverseStrand flag is not set; otherwise false.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="read"/> is null.</exception>
        public static bool IsForwardRead(SAMAlignedSequence read)
        {
            if (read != null)
            {
                // Mask out everything except the reverse-strand bit.
                var reverseStrandBit = read.Flag & SAMFlags.QueryOnReverseStrand;
                return reverseStrandBit == 0;
            }

            throw new ArgumentNullException("read");
        }
Beispiel #4
0
        /// <summary>
        /// Looks up the value of the first optional field whose tag and value type both
        /// equal the requested ones.
        /// </summary>
        /// <param name="sam">Read whose optional fields are searched.</param>
        /// <param name="tag">Tag to match.</param>
        /// <param name="vtype">Value type to match.</param>
        /// <param name="defaultValue">Value returned when no field matches.</param>
        /// <returns>The matching field's value, or <paramref name="defaultValue"/> when there is no match.</returns>
        public static string GetOptionValue(this SAMAlignedSequence sam, string tag, string vtype, string defaultValue)
        {
            // Tag is tested first and the value type only for fields whose tag matched.
            var match = sam.OptionalFields
                           .Where(option => option.Tag.Equals(tag))
                           .FirstOrDefault(option => option.VType.Equals(vtype));

            return match != null ? match.Value : defaultValue;
        }
Beispiel #5
0
 /// <summary>
 /// Builds a string from the read's encoded quality scores, one character per score.
 /// When the QueryOnReverseStrand flag is set the scores are emitted in reverse order.
 /// </summary>
 /// <param name="sam">Read whose quality scores are converted.</param>
 /// <returns>The quality scores as a string.</returns>
 public static string GetQualityScoresString(this SAMAlignedSequence sam)
 {
     bool onReverseStrand = sam.Flag.HasFlag(SAMFlags.QueryOnReverseStrand);

     var qualityChars = sam.GetEncodedQualityScores().Select(score => (char)score);
     if (onReverseStrand)
     {
         qualityChars = qualityChars.Reverse();
     }

     return new string(qualityChars.ToArray());
 }
Beispiel #6
0
 /// <summary>
 /// Gets the read's query sequence as a string. When the QueryOnReverseStrand flag
 /// is set, the reverse-complemented sequence is used instead of the stored one.
 /// </summary>
 /// <param name="sam">Read whose query sequence is converted.</param>
 /// <returns>The sequence string.</returns>
 public static string GetQuerySequenceString(this SAMAlignedSequence sam)
 {
     bool onReverseStrand = (sam.Flag & SAMFlags.QueryOnReverseStrand) == SAMFlags.QueryOnReverseStrand;
     if (!onReverseStrand)
     {
         return sam.QuerySequence.GetSequenceString();
     }

     return sam.QuerySequence.GetReverseComplementedSequence().GetSequenceString();
 }
Beispiel #7
0
        /// <summary>
        /// Parses sequence data and quality values and stores the resulting sequence in
        /// the QuerySequence property of the given SAMAlignedSequence instance.
        /// </summary>
        /// <remarks>
        /// When the sequence data is the single AsteriskAsByte value the instance is left
        /// untouched. When the quality data is the single AsteriskAsByte value a plain
        /// Sequence is created; otherwise a QualitativeSequence is created using
        /// QualityFormatType. In both cases the new sequence's ID is the read's QName.
        /// </remarks>
        /// <param name="alignedSeq">SAM aligned Sequence.</param>
        /// <param name="alphabet">Alphabet of the sequence to be created.</param>
        /// <param name="sequencedata">Sequence data.</param>
        /// <param name="qualitydata">Quality values.</param>
        /// <param name="validate">Validation flag passed on to the sequence constructors.</param>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="alignedSeq"/> is null, or when
        /// <paramref name="sequencedata"/> or <paramref name="qualitydata"/> is null or empty.
        /// </exception>
        /// <exception cref="FormatException">
        /// Thrown when quality scores are present but their count differs from the sequence length.
        /// </exception>
        public static void ParseQualityNSequence(SAMAlignedSequence alignedSeq, IAlphabet alphabet, byte[] sequencedata, byte[] qualitydata, bool validate = true)
        {
            if (alignedSeq == null)
            {
                throw new ArgumentNullException("alignedSeq");
            }

            if (sequencedata == null || sequencedata.Length == 0)
            {
                throw new ArgumentNullException("sequencedata");
            }

            if (qualitydata == null || qualitydata.Length == 0)
            {
                throw new ArgumentNullException("qualitydata");
            }

            FastQFormatType fastQType = QualityFormatType;

            // A lone asterisk in the sequence column: nothing to create.
            if (sequencedata.Length == 1 && sequencedata[0] == AsteriskAsByte)
            {
                return;
            }

            // A lone asterisk in the quality column: build a sequence without quality scores.
            bool hasQualityScores = !(qualitydata.Length == 1 && qualitydata[0] == AsteriskAsByte);
            if (!hasQualityScores)
            {
                alignedSeq.QuerySequence = new Sequence(alphabet, sequencedata, validate)
                {
                    ID = alignedSeq.QName
                };
                return;
            }

            // Every base needs exactly one quality score.
            if (sequencedata.Length != qualitydata.Length)
            {
                string detail = string.Format(CultureInfo.CurrentCulture, Properties.Resource.FastQ_InvalidQualityScoresLength, alignedSeq.QName);
                string message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, Properties.Resource.SAM_NAME, detail);
                Trace.Report(message);

                // FormatException derives from Exception, so existing catch blocks still apply,
                // while callers can now distinguish malformed input from other failures.
                throw new FormatException(message);
            }

            alignedSeq.QuerySequence = new QualitativeSequence(alphabet, fastQType, sequencedata, qualitydata, validate)
            {
                ID = alignedSeq.QName
            };
        }
Beispiel #8
0
        /// <summary>
        /// Lazily parses the alignment lines of a SAM file, yielding one
        /// SAMAlignedSequence per line.
        /// </summary>
        /// <remarks>
        /// This is an iterator: the file is opened when enumeration starts and the reader
        /// is disposed when enumeration completes or the enumerator is disposed.
        /// Leading lines starting with "@" are treated as header lines; enumeration stops
        /// at the end of the file or at the first "@" line after the alignments began.
        /// </remarks>
        /// <param name="fileName">Path of the SAM file to read.</param>
        /// <returns>The aligned sequences, parsed one line at a time.</returns>
        public IEnumerable<SAMAlignedSequence> ParseSequencesAsEnumerable(string fileName)
        {
            using (StreamReader reader = new StreamReader(fileName))
            {
                // Collect the header lines first; the first non-header line is kept in "line".
                List<string> headerStrings = new List<string>();
                string line = ReadNextLine(reader);
                while (line != null && line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
                {
                    headerStrings.Add(line);
                    line = ReadNextLine(reader);
                }

                // The parsed header is not needed while streaming reads, but the call is kept
                // so that any error ParseSamHeader raises for the header lines is still reported.
                ParseSamHeader(headerStrings);

                // Stream the aligned sequences.
                while (line != null && !line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
                {
                    yield return ParseSequence(line, this.Alphabet);

                    line = ReadNextLine(reader);
                }
            }
        }
Beispiel #9
0
        //public static string GetZezValue(this SAMAlignedSequence sam)
        //{
        //  return GetOptionValue(sam, "ZE", "Z");
        //}

        /// <summary>
        /// Writes the read to the given writer as four lines: a header starting with "@",
        /// the query sequence string, a "+" line and the quality score string.
        /// </summary>
        /// <param name="sam">Read to write.</param>
        /// <param name="sw">Writer that receives the four lines.</param>
        /// <param name="posAsPaired">
        /// When true the header is the query name followed by a space and the read's Pos;
        /// otherwise the header is the query name only.
        /// </param>
        public static void WriteFastq(this SAMAlignedSequence sam, StreamWriter sw, bool posAsPaired = false)
        {
            string headerLine = posAsPaired
                ? string.Format("@{0} {1}", sam.QName, sam.Pos)
                : "@" + sam.QName;

            sw.WriteLine(headerLine);
            sw.WriteLine(sam.GetQuerySequenceString());
            sw.WriteLine("+");
            sw.WriteLine(sam.GetQualityScoresString());
        }
Beispiel #10
0
        /// <summary>
        /// Parses a single tab-separated alignment line into a SAMAlignedSequence.
        /// </summary>
        /// <remarks>
        /// Columns 0-8 fill QName, Flag, RName, Pos, MapQ, CIGAR, MRNM, MPos and ISize
        /// (an MRNM of "=" is replaced by RName), columns 9 and 10 are handed to
        /// ParseQualityNSequence, and every further column is parsed as an optional field
        /// of the form TAG:TYPE:VALUE.
        /// </remarks>
        /// <param name="bioText">sequence alignment text.</param>
        /// <param name="alphabet">Alphabet of the sequences.</param>
        /// <param name="referenceSequences">
        /// Reference sequences; the one whose ID equals the read's RName (ignoring case),
        /// if any, is passed to ParseQualityNSequence. May be null or empty.
        /// </param>
        /// <returns>The parsed aligned sequence.</returns>
        /// <exception cref="FormatException">Thrown when an optional field does not match OptionalFieldRegex.</exception>
        private static SAMAlignedSequence ParseSequence(string bioText, IAlphabet alphabet, IList <ISequence> referenceSequences)
        {
            const int optionalTokenStartingIndex = 11;

            // An optional field has exactly three parts: TAG, TYPE and VALUE.
            const int optionalFieldPartCount = 3;

            string[] tokens = bioText.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

            SAMAlignedSequence alignedSeq = new SAMAlignedSequence();

            alignedSeq.QName = tokens[0];
            alignedSeq.Flag  = SAMAlignedSequenceHeader.GetFlag(tokens[1]);
            alignedSeq.RName = tokens[2];
            alignedSeq.Pos   = int.Parse(tokens[3], CultureInfo.InvariantCulture);
            alignedSeq.MapQ  = int.Parse(tokens[4], CultureInfo.InvariantCulture);
            alignedSeq.CIGAR = tokens[5];
            alignedSeq.MRNM  = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
            alignedSeq.MPos  = int.Parse(tokens[7], CultureInfo.InvariantCulture);
            alignedSeq.ISize = int.Parse(tokens[8], CultureInfo.InvariantCulture);

            ISequence refSeq = null;

            if (referenceSequences != null && referenceSequences.Count > 0)
            {
                refSeq = referenceSequences.FirstOrDefault(R => string.Compare(R.ID, alignedSeq.RName, StringComparison.OrdinalIgnoreCase) == 0);
            }

            ParseQualityNSequence(alignedSeq, alphabet, tokens[9], tokens[10], refSeq);

            for (int i = optionalTokenStartingIndex; i < tokens.Length; i++)
            {
                if (!Helper.IsValidRegexValue(OptionalFieldRegex, tokens[i]))
                {
                    string message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.InvalidOptionalField, tokens[i]);
                    throw new FormatException(message);
                }

                // Split into at most three parts so that a ':' inside the value stays part of
                // the value instead of silently truncating it at the third colon.
                string[] opttokens = tokens[i].Split(colonDelim, optionalFieldPartCount, StringSplitOptions.RemoveEmptyEntries);

                SAMOptionalField optField = new SAMOptionalField();
                optField.Tag   = opttokens[0];
                optField.VType = opttokens[1];
                optField.Value = opttokens[2];

                alignedSeq.OptionalFields.Add(optField);
            }

            return(alignedSeq);
        }
        /// <summary>
        /// Gets the paired read type of two reads, using the library information that
        /// CloneLibrary.Instance returns for the given library name.
        /// </summary>
        /// <param name="read1">First aligned sequence.</param>
        /// <param name="read2">Second aligned sequence.</param>
        /// <param name="libraryName">Name of the library to look up.</param>
        /// <returns>The result of the overload that takes the library information.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="libraryName"/> is null or empty.</exception>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when no library information is found for <paramref name="libraryName"/>.</exception>
        public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, string libraryName)
        {
            bool nameMissing = string.IsNullOrEmpty(libraryName);
            if (nameMissing)
            {
                throw new ArgumentNullException("libraryName");
            }

            CloneLibraryInformation resolvedLibrary = CloneLibrary.Instance.GetLibraryInformation(libraryName);
            if (resolvedLibrary != null)
            {
                return GetPairedReadType(read1, read2, resolvedLibrary);
            }

            throw new ArgumentOutOfRangeException("libraryName");
        }
Beispiel #12
0
        /// <summary>
        /// Gets the insert length of a pair of reads.
        /// </summary>
        /// <param name="read1">First read.</param>
        /// <param name="read2">Second read.</param>
        /// <param name="validate">
        /// When true, the pair type is determined first and 0 is returned unless it is
        /// Normal or LengthAnomaly.
        /// </param>
        /// <returns>
        /// The absolute value of read1's ISize when the two reads' ISize values are exact
        /// negatives of each other; otherwise 0. Also 0 when <paramref name="read2"/> is null.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="read1"/> is null.</exception>
        public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2, bool validate)
        {
            //                      reference chromosome
            //5'                         -->                      3'
            //----------------------------------------------------- F strand
            //
            //3'                         <--                       5'
            //----------------------------------------------------- R strand
            //        read1                         read2
            //    5'             3'             3'            5'
            //         -->                          <--
            //    |--------------               --------------|
            //    |<----------insert length------------------>|

            if (read1 == null)
            {
                throw new ArgumentNullException("read1");
            }

            if (read2 == null)
            {
                return 0;
            }

            if (validate)
            {
                PairedReadType pairType = GetPairedReadType(read1, read2, 0, 0);
                bool usablePair = pairType == PairedReadType.Normal || pairType == PairedReadType.LengthAnomaly;
                if (!usablePair)
                {
                    return 0;
                }
            }

            // The two mates must report mirrored insert sizes.
            int firstSize = read1.ISize;
            if (firstSize != -read2.ISize)
            {
                return 0;
            }

            return firstSize < 0 ? -firstSize : firstSize;
        }
Beispiel #13
0
        /// <summary>
        /// Looks up the value of the first optional field whose tag and value type both
        /// equal the requested ones.
        /// </summary>
        /// <param name="sam">Read whose optional fields are searched.</param>
        /// <param name="tag">Tag to match.</param>
        /// <param name="vtype">Value type to match.</param>
        /// <param name="throwException">When true a missing field raises an exception; when false null is returned instead.</param>
        /// <param name="parserName">Optional parser name appended to the exception message.</param>
        /// <returns>The matching field's value, or null when nothing matches and <paramref name="throwException"/> is false.</returns>
        /// <exception cref="Exception">Thrown when nothing matches and <paramref name="throwException"/> is true.</exception>
        public static string GetOptionValue(this SAMAlignedSequence sam, string tag, string vtype, bool throwException = true, string parserName = null)
        {
            // Tag is tested first and the value type only for fields whose tag matched.
            var match = sam.OptionalFields
                           .Where(option => option.Tag.Equals(tag))
                           .FirstOrDefault(option => option.VType.Equals(vtype));

            if (match != null)
            {
                return match.Value;
            }

            if (!throwException)
            {
                return null;
            }

            string parserSuffix = parserName == null ? "" : " by parser " + parserName;
            string errorText = string.Format("data error, cannot find {0}:{1}:XXX value in query {2}{3}.",
                                             tag,
                                             vtype,
                                             sam.QName,
                                             parserSuffix);
            throw new Exception(errorText);
        }
 /// <summary>
 /// Reads FASTQ records from the stream with a FastQParser and lazily wraps each one
 /// in a PacBioCCSRead.
 /// </summary>
 /// <remarks>
 /// The record ID is split on '/': the first part becomes the Movie and the second
 /// part is converted to the HoleNumber. Each read gets placeholder "sn", "rs" and
 /// "zs" optional fields and an AvgZscore of NaN.
 /// </remarks>
 /// <param name="stream">Stream containing the FASTQ data.</param>
 /// <returns>One PacBioCCSRead per FASTQ record.</returns>
 public IEnumerable<ISequence> Parse(Stream stream)
 {
     // Placeholder optional fields added to every read, as tag/value pairs.
     string[][] placeholderFields = {
         new [] { "sn", "f,0,0,0,0" },
         new [] { "rs", "f,0,0,0,0,0,0" },
         new [] { "zs", "f,0,0,0,0,0,0" }
     };

     var fastqParser = new FastQParser ();
     foreach (var record in fastqParser.Parse (stream)) {
         string[] idParts = record.ID.Split ('/');
         string movieName = idParts [0];
         string holeText = idParts [1];

         var alignment = new SAMAlignedSequence ();
         alignment.QuerySequence = record;
         foreach (string[] placeholder in placeholderFields) {
             alignment.OptionalFields.Add (new SAMOptionalField () { Tag = placeholder [0], Value = placeholder [1] });
         }

         var ccsRead = new PacBioCCSRead (alignment);
         ccsRead.AvgZscore = Single.NaN;
         ccsRead.HoleNumber = Convert.ToInt32 (holeText);
         ccsRead.Movie = movieName;

         yield return ccsRead;
     }
 }
Beispiel #15
0
        /// <summary>
        /// Parses a single tab-separated alignment line into a SAMAlignedSequence.
        /// </summary>
        /// <remarks>
        /// Columns 0-8 fill QName, Flag, RName, Pos, MapQ, CIGAR, MRNM, MPos and ISize
        /// (an MRNM of "=" is replaced by RName), columns 9 and 10 are handed to
        /// ParseQualityNSequence, and every further column is parsed as an optional field
        /// of the form TAG:TYPE:VALUE.
        /// </remarks>
        /// <param name="bioText">sequence alignment text.</param>
        /// <param name="alphabet">Alphabet of the sequences.</param>
        /// <returns>The parsed aligned sequence.</returns>
        /// <exception cref="FormatException">
        /// Thrown when an optional field does not match OptionalFieldRegex, or when a
        /// numeric column cannot be parsed as an integer.
        /// </exception>
        public static SAMAlignedSequence ParseSequence(string bioText, IAlphabet alphabet)
        {
            const int optionalTokenStartingIndex = 11;

            // An optional field has exactly three parts: TAG, TYPE and VALUE.
            const int optionalFieldPartCount = 3;

            // The numeric columns are machine-written, so they are parsed independently of
            // the current thread culture.
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            string[] tokens = bioText.Split(TabDelim, StringSplitOptions.RemoveEmptyEntries);

            SAMAlignedSequence alignedSeq = new SAMAlignedSequence
            {
                QName = tokens[0],
                Flag  = SAMAlignedSequenceHeader.GetFlag(tokens[1]),
                RName = tokens[2],
                Pos   = int.Parse(tokens[3], invariant),
                MapQ  = int.Parse(tokens[4], invariant),
                CIGAR = tokens[5]
            };

            alignedSeq.MRNM  = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
            alignedSeq.MPos  = int.Parse(tokens[7], invariant);
            alignedSeq.ISize = int.Parse(tokens[8], invariant);

            ParseQualityNSequence(alignedSeq, alphabet, tokens[9], tokens[10]);

            for (int i = optionalTokenStartingIndex; i < tokens.Length; i++)
            {
                if (!Helper.IsValidRegexValue(OptionalFieldRegex, tokens[i]))
                {
                    throw new FormatException(string.Format(Properties.Resource.InvalidOptionalField, tokens[i]));
                }

                // Split into at most three parts so that a ':' inside the value stays part of
                // the value instead of silently truncating it at the third colon.
                string[] opttokens = tokens[i].Split(ColonDelim, optionalFieldPartCount, StringSplitOptions.RemoveEmptyEntries);

                SAMOptionalField optField = new SAMOptionalField();
                optField.Tag   = opttokens[0];
                optField.VType = opttokens[1];
                optField.Value = opttokens[2];

                alignedSeq.OptionalFields.Add(optField);
            }

            return(alignedSeq);
        }
Beispiel #16
0
    /// <summary>
    /// Formats a read as a single tab-separated line: QName, numeric Flag, RName, Pos,
    /// MapQ, CIGAR, MRNM, MPos, ISize, query sequence string, quality score string and
    /// finally the optional fields, each written as Tag:VType:Value and merged with tabs.
    /// </summary>
    /// <param name="sam">Read to format.</param>
    /// <returns>The formatted line, or null when <paramref name="sam"/> is null.</returns>
    public string SAMToString(SAMAlignedSequence sam)
    {
      if (sam != null)
      {
        // Columns are listed in output order; they are evaluated in this order too.
        object[] columns =
        {
          sam.QName,
          (int)sam.Flag,
          sam.RName,
          sam.Pos,
          sam.MapQ,
          sam.CIGAR,
          sam.MRNM,
          sam.MPos,
          sam.ISize,
          sam.GetQuerySequenceString(),
          sam.GetQualityScoresString(),
          sam.OptionalFields
             .Select(of => string.Format("{0}:{1}:{2}", of.Tag, of.VType, of.Value))
             .Merge("\t")
        };

        return string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}", columns);
      }

      return null;
    }
Beispiel #17
0
        /// <summary>
        /// Packs the query sequence of the read into 4-bit codes, two symbols per byte.
        /// </summary>
        /// <remarks>
        /// Symbols are coded by their position in "=ACMGRSVTWYHKDBN" (0-15); any other
        /// symbol is coded as 15, the same code as 'N'. The earlier symbol of each pair
        /// occupies the high-order four bits of the byte.
        /// </remarks>
        /// <param name="alignedSeq">Read whose QuerySequence is encoded.</param>
        /// <returns>The packed bytes; an empty array when the read has no query sequence.</returns>
        /// <exception cref="ArgumentException">Thrown when the sequence alphabet is not a DnaAlphabet.</exception>
        private static byte[] GetEncodedSequence(SAMAlignedSequence alignedSeq)
        {
            ISequence seq = alignedSeq.QuerySequence;
            if (seq == null)
            {
                return new byte[0];
            }

            if (!(seq.Alphabet is DnaAlphabet))
            {
                throw new ArgumentException(Properties.Resource.BAMFormatterSupportsDNAOnly);
            }

            // The index of a symbol in this string is its 4-bit code.
            const string codedSymbols = "=ACMGRSVTWYHKDBN";
            const byte fallbackCode = 15;

            byte[] symbolMap = seq.Alphabet.GetSymbolValueMap();
            List<byte> packed = new List<byte>();

            for (int i = 0; i < seq.Count; i++)
            {
                char symbol = (char)symbolMap[seq[i]];

                // Symbols outside the coded set (for example '.') share the code of 'N'.
                int position = codedSymbols.IndexOf(symbol);
                byte code = position < 0 ? fallbackCode : (byte)position;

                bool startsNewByte = (i % 2) == 0;
                if (startsNewByte)
                {
                    // First symbol of a pair goes into the high nibble of a fresh byte.
                    packed.Add((byte)(code << 4));
                }
                else
                {
                    // Second symbol of a pair fills the low nibble of the last byte.
                    int last = packed.Count - 1;
                    packed[last] = (byte)(packed[last] | code);
                }
            }

            return packed.ToArray();
        }
Beispiel #18
0
        // Computes the block size for the specified SAMAlignedSequence object as the sum of:
        // a fixed 32, the read name length plus one, four times the value returned by
        // GetCIGARLength, half the read length rounded up, the read length itself, and the
        // value returned by GetAuxiliaryDataLength.
        private int GetBlockSize(SAMAlignedSequence alignedSeq)
        {
            const int fixedPart = 32;

            int namePart = alignedSeq.QName.Length + 1;
            int cigarPart = GetCIGARLength(alignedSeq.CIGAR) * 4;
            int baseCount = (int)alignedSeq.QuerySequence.Count;
            int packedBasesPart = (baseCount + 1) / 2;
            int auxiliaryPart = GetAuxiliaryDataLength(alignedSeq);

            return fixedPart + namePart + cigarPart + packedBasesPart + baseCount + auxiliaryPart;
        }
Beispiel #19
0
        /// <summary>
        /// Walks the CIGAR string of an aligned read and emits one
        /// BaseAndQualityAndPosition entry per aligned, inserted or deleted base.
        /// </summary>
        /// <remarks>
        /// M, = and X emit the query base at the current reference position and advance
        /// both query and reference. I emits query bases at the current reference position
        /// with the offset within the insertion as second argument and advances only the
        /// query. D emits '-' with a NaN quality and advances only the reference. S skips
        /// query bases, H is ignored, P and N are not supported.
        /// </remarks>
        /// <param name="seq">Aligned read; its QuerySequence must be a QualitativeSequence.</param>
        /// <returns>The emitted entries in CIGAR order.</returns>
        /// <exception cref="ArgumentException">Thrown when the read's QuerySequence is not a QualitativeSequence.</exception>
        /// <exception cref="NotSupportedException">Thrown for CIGAR operations P and N.</exception>
        /// <exception cref="FormatException">Thrown for any other unexpected CIGAR operation character.</exception>
        List<BaseAndQualityAndPosition> getBasesForSequence(SAMAlignedSequence seq)
        {
            // Quality scores are required below, so reject reads without them up front
            // instead of failing later with a NullReferenceException.
            var qseq = seq.QuerySequence as QualitativeSequence;
            if (qseq == null)
            {
                throw new ArgumentException("The read's QuerySequence must be a QualitativeSequence.", "seq");
            }

            List<BaseAndQualityAndPosition> toReturn = new List<BaseAndQualityAndPosition>(seq.RefEndPos - seq.Pos + 10);
            // Decode the cigar string into operations: record each operation character
            // together with its index in the string.
            // TODO: This code is duplicated in many places
            string CIGAR = seq.CIGAR;
            List<KeyValuePair<char, int>> charsAndPositions = new List<KeyValuePair<char, int>>();
            for (int i = 0; i < CIGAR.Length; i++)
            {
                char ch = CIGAR[i];
                if (Char.IsDigit(ch))
                {
                    continue;
                }
                charsAndPositions.Add(new KeyValuePair<char, int>(ch, i));
            }

            // Get sequence bases and error probabilities
            var seq_log10ErrorProb = qseq.GetPhredQualityScores().Select(Utils.GetLog10ErrorProbability).ToArray();
            var seq_bases = qseq.ToArray();
            // Use the cigar operations to emit bases.
            int curRef = seq.Pos;
            int curQuery = 0;
            for (int i = 0; i < charsAndPositions.Count; i++)
            {
                // Parse the current cigar operation: the digits between the previous
                // operation character and this one form the operation length.
                char ch = charsAndPositions[i].Key;
                int cig_start = i == 0 ? 0 : charsAndPositions[i - 1].Value + 1;
                int cig_digits = charsAndPositions[i].Value - cig_start;
                int cig_len = int.Parse(CIGAR.Substring(cig_start, cig_digits));
                // Emit or advance based on cigar operation.
                switch (ch)
                {
                    case 'P': // padding (silent deletions from padded reference)
                    case 'N': // skipped region from reference
                        throw new NotSupportedException("Pile up methods not built to handle reference clipping (Cigar P or N) yet.");
                    case 'M': // match or mismatch
                    case '=': // match
                    case 'X': // mismatch
                        for (int k = 0; k < cig_len; k++)
                        {
                            var bqp = new BaseAndQualityAndPosition(curRef, 0, new BaseAndQuality(seq_bases[curQuery], seq_log10ErrorProb[curQuery]));
                            toReturn.Add(bqp);
                            curQuery++;
                            curRef++;
                        }
                        break;
                    case 'I': // insertion to the reference
                        for (int k = 0; k < cig_len; k++)
                        {
                            var bqp = new BaseAndQualityAndPosition(curRef, k, new BaseAndQuality(seq_bases[curQuery], seq_log10ErrorProb[curQuery]));
                            toReturn.Add(bqp);
                            curQuery++;
                        }
                        break;
                    case 'D': // deletion from the reference
                        for (int k = 0; k < cig_len; k++)
                        {
                            var bqp = new BaseAndQualityAndPosition(curRef, k, new BaseAndQuality((byte)'-', Double.NaN));
                            toReturn.Add(bqp);
                            curRef++;
                        }
                        break;
                    case 'S': // soft clipped: the bases are in the query but are not emitted
                        curQuery += cig_len;
                        break;
                    case 'H': // hard clipped: nothing to advance
                        break;
                    default:
                        throw new FormatException("Unexpected SAM Cigar element found " + ch.ToString());
                }
            }
            return toReturn;
        }
Beispiel #20
0
        /// <summary>
        /// Gets the paired reads when DV is enabled.
        /// </summary>
        /// <remarks>
        /// QuerySequences is scanned once. Reads with the PairedRead flag are grouped by
        /// QName: the first read of a name creates an Orphan entry, the second completes
        /// the pair and determines its type and insert length, and any further read with
        /// the same name marks the entry as MultipleHits with an insert length of 0.
        /// </remarks>
        /// <param name="meanLengthOfInsert">Mean of the insert length.</param>
        /// <param name="standardDeviationOfInsert">Standard deviation of insert length.</param>
        /// <param name="calculate">If this flag is set, the insert lengths of Normal and
        /// LengthAnomaly pairs are summed and counted and, when at least one such pair
        /// exists, passed to UpdateType together with the paired reads.</param>
        /// <returns>List of paired read.</returns>
        private IList<PairedRead> GetDVAwarePairedReads(float meanLengthOfInsert, float standardDeviationOfInsert, bool calculate = false)
        {
            // Dictionary helps to get the information at one pass of aligned sequence list.
            Dictionary<string, DVEnabledPairedRead> pairedReads = new Dictionary<string, DVEnabledPairedRead>();
            double sum = 0;
            int count = 0;

            for (int i = 0; i < QuerySequences.Count; i++)
            {
                SAMAlignedSequence read = QuerySequences[i];
                if ((read.Flag & SAMFlags.PairedRead) != SAMFlags.PairedRead)
                {
                    continue;
                }

                DVEnabledPairedRead pairedRead;
                if (!pairedReads.TryGetValue(read.QName, out pairedRead))
                {
                    // First read seen for this name: reads with a reference name take the
                    // first slot, reads without one (empty or "*") take the second.
                    pairedRead = new DVEnabledPairedRead(QuerySequences);
                    if (!string.IsNullOrEmpty(read.RName) && !read.RName.Equals("*"))
                    {
                        pairedRead.Index1 = i;
                    }
                    else
                    {
                        pairedRead.Index2 = i;
                    }

                    pairedRead.PairedType   = PairedReadType.Orphan;
                    pairedRead.InsertLength = 0;
                    pairedReads.Add(read.QName, pairedRead);
                    continue;
                }

                if (pairedRead.Index2 == -1 || pairedRead.Index1 == -1)
                {
                    // Second read for this name: fill whichever slot is still empty.
                    if (pairedRead.Index2 == -1)
                    {
                        pairedRead.Index2 = i;
                    }
                    else
                    {
                        pairedRead.Index1 = i;
                    }

                    // For best performance,
                    // 1. BAM/SAM file should be sorted by reads name.
                    // 2. If sorted on mapping position then give unmapped read a coordinate (generally the coordinate of the mapped mate)
                    //    for sorting/indexing purposes only.

                    pairedRead.PairedType = PairedRead.GetPairedReadType(pairedRead.Read1, pairedRead.Read2, meanLengthOfInsert, standardDeviationOfInsert);

                    if (pairedRead.PairedType == PairedReadType.Normal || pairedRead.PairedType == PairedReadType.LengthAnomaly)
                    {
                        pairedRead.InsertLength = PairedRead.GetInsertLength(pairedRead.Read1, pairedRead.Read2);

                        if (calculate)
                        {
                            sum += pairedRead.InsertLength;
                            count++;
                        }
                    }
                }
                else
                {
                    // Third or later read for this name. If the pair contributed to the
                    // running totals (only Normal and LengthAnomaly pairs do), back that
                    // contribution out BEFORE the insert length is reset. Once the type is
                    // MultipleHits, later hits no longer touch the totals.
                    bool wasCounted = pairedRead.PairedType == PairedReadType.Normal
                                      || pairedRead.PairedType == PairedReadType.LengthAnomaly;
                    if (calculate && wasCounted)
                    {
                        sum -= pairedRead.InsertLength;
                        count--;
                    }

                    pairedRead.InsertLength = 0;
                    pairedRead.ReadIndexes.Add(i);
                    pairedRead.PairedType = PairedReadType.MultipleHits;
                }
            }

            List<PairedRead> allreads = pairedReads.Values.ToList<PairedRead>();

            if (calculate && count > 0)
            {
                UpdateType(allreads, sum, count);
            }

            return(allreads);
        }
Beispiel #21
0
        /// <summary>
        /// Parses sequence data and quality values and updates the SAMAlignedSequence instance.
        /// </summary>
        /// <remarks>
        /// Positions of "." symbols are appended to DotSymbolIndexes and the symbols are replaced
        /// with 'N'. Positions of "=" symbols are appended to EqualSymbolIndexes and the symbols are
        /// replaced with the symbol found at the same index of <paramref name="refSeq"/>.
        /// When <paramref name="sequencedata"/> is "*" the method returns without touching
        /// <paramref name="alignedSeq"/>; when <paramref name="qualitydata"/> is "*" a plain
        /// (non qualitative) sequence is created.
        /// </remarks>
        /// <param name="alignedSeq">SAM aligned Sequence.</param>
        /// <param name="alphabet">Alphabet of the sequence to be created.</param>
        /// <param name="Encoding">Encoding to use while creating sequence; may be null.</param>
        /// <param name="sequencedata">Sequence data.</param>
        /// <param name="qualitydata">Quality values.</param>
        /// <param name="refSeq">Reference sequence if known; required when the sequence data contains "=".</param>
        /// <param name="isReadOnly">Flag to indicate whether the new sequence is required to be readonly or not.</param>
        /// <exception cref="ArgumentNullException">
        /// alignedSeq is null, or sequencedata / qualitydata is null, empty or white space.
        /// </exception>
        /// <exception cref="ArgumentException">The sequence data contains "=" but refSeq is null.</exception>
        /// <exception cref="FileFormatException">Sequence and quality strings differ in length.</exception>
        public static void ParseQualityNSequence(SAMAlignedSequence alignedSeq, IAlphabet alphabet, IEncoding Encoding, string sequencedata, string qualitydata, ISequence refSeq, bool isReadOnly)
        {
            if (alignedSeq == null)
            {
                throw new ArgumentNullException("alignedSeq");
            }

            if (string.IsNullOrWhiteSpace(sequencedata))
            {
                throw new ArgumentNullException("sequencedata");
            }

            if (string.IsNullOrWhiteSpace(qualitydata))
            {
                throw new ArgumentNullException("qualitydata");
            }

            byte[]          qualScores = null;
            FastQFormatType fastQType  = QualityFormatType;

            // "*" in the sequence column: nothing to parse.
            if (sequencedata.Equals("*"))
            {
                return;
            }

            // "*" in the quality column: build a plain sequence without quality scores.
            bool isQualitativeSequence = !qualitydata.Equals("*");

            if (isQualitativeSequence)
            {
                qualScores = ASCIIEncoding.ASCII.GetBytes(qualitydata);

                // Check for sequence length and quality score length.
                if (sequencedata.Length != qualitydata.Length)
                {
                    string message1 = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoresLength, alignedSeq.QName);
                    string message = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Resource.SAM_NAME, message1);
                    Trace.Report(message);
                    throw new FileFormatException(message);
                }
            }

            // get "." symbol indexes.
            // The search must resume one past the previous hit; restarting at the hit itself
            // would find the same position again and never terminate.
            int index = sequencedata.IndexOf('.');
            while (index > -1)
            {
                alignedSeq.DotSymbolIndexes.Add(index);
                index = sequencedata.IndexOf('.', index + 1);
            }

            // replace "." with N
            if (alignedSeq.DotSymbolIndexes.Count > 0)
            {
                sequencedata = sequencedata.Replace('.', 'N');
            }

            // get "=" symbol indexes (same resume-after-hit rule as above).
            index = sequencedata.IndexOf('=');
            while (index > -1)
            {
                alignedSeq.EqualSymbolIndexes.Add(index);
                index = sequencedata.IndexOf('=', index + 1);
            }

            // replace "=" with corresponding symbol from refSeq.
            if (alignedSeq.EqualSymbolIndexes.Count > 0)
            {
                if (refSeq == null)
                {
                    throw new ArgumentException(Resource.RefSequenceNofFound);
                }

                for (int i = 0; i < alignedSeq.EqualSymbolIndexes.Count; i++)
                {
                    index        = alignedSeq.EqualSymbolIndexes[i];
                    sequencedata = sequencedata.Remove(index, 1);
                    sequencedata = sequencedata.Insert(index, refSeq[index].Symbol.ToString());
                }
            }

            ISequence sequence = null;

            if (isQualitativeSequence)
            {
                QualitativeSequence qualSeq = null;
                if (Encoding == null)
                {
                    qualSeq = new QualitativeSequence(alphabet, fastQType, sequencedata, qualScores);
                }
                else
                {
                    qualSeq = new QualitativeSequence(alphabet, fastQType, Encoding, sequencedata, qualScores);
                }

                qualSeq.ID         = alignedSeq.QName;
                qualSeq.IsReadOnly = isReadOnly;
                sequence           = qualSeq;
            }
            else
            {
                Sequence seq = null;
                if (Encoding == null)
                {
                    seq = new Sequence(alphabet, sequencedata);
                }
                else
                {
                    seq = new Sequence(alphabet, Encoding, sequencedata);
                }

                seq.ID         = alignedSeq.QName;
                seq.IsReadOnly = isReadOnly;
                sequence       = seq;
            }

            alignedSeq.QuerySequence = sequence;
        }
Beispiel #22
0
        /// <summary>
        /// Decides whether an aligned sequence passes every filter option configured by the user.
        /// </summary>
        /// <remarks>
        /// The checks are applied in order and the first failing one rejects the read:
        /// required flag bits, excluded flag bits, minimum mapping quality, library,
        /// read group and region. An option left at 0 / empty is not applied.
        /// </remarks>
        /// <param name="alignedSequence">Aligned Sequence.</param>
        /// <returns>True when the aligned sequence matches all user defined options; otherwise false.</returns>
        private bool Filter(SAMAlignedSequence alignedSequence)
        {
            // Every bit of FlagRequired has to be set on the read.
            if (FlagRequired != 0 && (((int)alignedSequence.Flag) & FlagRequired) != FlagRequired)
            {
                return false;
            }

            // No bit of FilteringFlag may be set on the read.
            if (FilteringFlag != 0 && (((int)alignedSequence.Flag) & FilteringFlag) != 0)
            {
                return false;
            }

            // The option is a lower bound: reads mapped with a higher quality must pass too.
            if (QualityMinimumMapping != 0 && alignedSequence.MapQ < QualityMinimumMapping)
            {
                return false;
            }

            if (!string.IsNullOrEmpty(Library) && rgRecFields.Count > 0)
            {
                // A read without an RG tag cannot be attributed to any library.
                var readGroupField = alignedSequence.OptionalFields.FirstOrDefault(c => c.Tag.Equals("RG"));
                if (readGroupField == null)
                {
                    return false;
                }

                string readGroupId = readGroupField.Value;

                // Take the first header record whose first ID tag equals the read's group and
                // compare the first LB tag of that record with the requested library.
                // Records lacking an ID or LB tag simply do not match.
                bool inLibrary = rgRecFields
                    .Where(rec => rec.Tags.Where(t => t.Tag.Equals("ID")).Take(1).Any(t => t.Value.Equals(readGroupId)))
                    .Take(1)
                    .Any(rec => rec.Tags.Where(t => t.Tag.Equals("LB")).Take(1).Any(t => t.Value.Equals(Library)));

                if (!inLibrary)
                {
                    return false;
                }
            }

            if (!string.IsNullOrEmpty(ReadGroup))
            {
                bool inReadGroup = alignedSequence.OptionalFields.Any(
                    o => o.Tag.ToUpperInvariant().Equals("RG") && o.Value.Equals(ReadGroup));

                if (!inReadGroup)
                {
                    return false;
                }
            }

            if (!string.IsNullOrEmpty(Region))
            {
                if (!alignedSequence.RName.Equals(region.Chromosome))
                {
                    return false;
                }

                // A negative start means "whole chromosome"; the end bound is only
                // consulted when a start bound is present.
                if (region.Start > -1)
                {
                    if (alignedSequence.Pos < region.Start)
                    {
                        return false;
                    }

                    if (region.End > -1 && alignedSequence.Pos > region.End)
                    {
                        return false;
                    }
                }
            }

            return true;
        }
Beispiel #23
0
        /// <summary>
        /// Classifies a pair of reads using the insert statistics registered for a clone library.
        /// </summary>
        /// <param name="read1">First aligned sequence.</param>
        /// <param name="read2">Second aligned sequence.</param>
        /// <param name="libraryName">Name of the clone library to look up.</param>
        /// <returns>The paired read type computed from the library information.</returns>
        /// <exception cref="ArgumentNullException">libraryName is null or empty.</exception>
        /// <exception cref="ArgumentOutOfRangeException">No library information is found for libraryName.</exception>
        public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, string libraryName)
        {
            if (string.IsNullOrEmpty(libraryName))
            {
                throw new ArgumentNullException("libraryName");
            }

            CloneLibraryInformation knownLibrary = CloneLibrary.Instance.GetLibraryInformation(libraryName);
            if (knownLibrary != null)
            {
                return GetPairedReadType(read1, read2, knownLibrary);
            }

            // The name was supplied but the clone library registry has no entry for it.
            throw new ArgumentOutOfRangeException("libraryName");
        }
Beispiel #24
0
 /// <summary>
 /// Gets the insert length of a pair of reads without validating the pair first.
 /// </summary>
 /// <param name="read1">First read.</param>
 /// <param name="read2">Second read.</param>
 /// <returns>The value returned by the three argument overload with validation switched off.</returns>
 public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2)
 {
     const bool validateReads = false;
     return GetInsertLength(read1, read2, validateReads);
 }
Beispiel #25
0
 /// <summary>
 /// Builds a SequenceRange describing where the specified aligned sequence (read)
 /// aligns on its reference sequence.
 /// </summary>
 /// <param name="alignedSequence">Aligned sequence.</param>
 /// <returns>A range named after RName that spans Pos to RefEndPos of the read.</returns>
 private static ISequenceRange GetRegion(SAMAlignedSequence alignedSequence)
 {
     return new SequenceRange(
         alignedSequence.RName,
         (long)alignedSequence.Pos,
         (long)alignedSequence.RefEndPos);
 }
Beispiel #26
0
        /// <summary>
        /// Classifies a pair of reads using the insert statistics of the given clone library.
        /// </summary>
        /// <param name="read1">First aligned sequence.</param>
        /// <param name="read2">Second aligned sequence.</param>
        /// <param name="libraryInfo">Library information supplying mean and standard deviation of the insert length.</param>
        /// <returns>The paired read type computed from the library's insert statistics.</returns>
        /// <exception cref="ArgumentNullException">libraryInfo is null.</exception>
        public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, CloneLibraryInformation libraryInfo)
        {
            if (libraryInfo != null)
            {
                var insertMean      = libraryInfo.MeanLengthOfInsert;
                var insertDeviation = libraryInfo.StandardDeviationOfInsert;
                return GetPairedReadType(read1, read2, insertMean, insertDeviation);
            }

            throw new ArgumentNullException("libraryInfo");
        }
Beispiel #27
0
        /// <summary>
        /// Tells whether the specified read is reverse oriented, i.e. not a forward read.
        /// </summary>
        /// <param name="read">Aligned Sequence.</param>
        /// <returns>The negation of IsForwardRead for the same read.</returns>
        /// <exception cref="ArgumentNullException">read is null.</exception>
        public static bool IsReverseRead(SAMAlignedSequence read)
        {
            if (read != null)
            {
                bool forward = IsForwardRead(read);
                return !forward;
            }

            throw new ArgumentNullException("read");
        }
Beispiel #28
0
        /// <summary>
        /// Tells whether the specified read is forward oriented.
        /// </summary>
        /// <param name="read">Aligned Sequence.</param>
        /// <returns>True when the QueryOnReverseStrand flag is not set on the read.</returns>
        /// <exception cref="ArgumentNullException">read is null.</exception>
        public static bool IsForwardRead(SAMAlignedSequence read)
        {
            if (read != null)
            {
                var reverseStrandBit = read.Flag & SAMFlags.QueryOnReverseStrand;
                return reverseStrandBit == 0;
            }

            throw new ArgumentNullException("read");
        }
Beispiel #29
0
        /// <summary>
        /// Gets the insert length of a pair of reads.
        /// </summary>
        /// <param name="read1">First read.</param>
        /// <param name="read2">Second read; when null the insert length is 0.</param>
        /// <param name="validate">Validates the reads before calculating the insert length.</param>
        /// <returns>
        /// The absolute value of read1.ISize when the ISize values of both reads are exact
        /// opposites; 0 otherwise, or when validation rejects the pair.
        /// </returns>
        /// <exception cref="ArgumentNullException">read1 is null.</exception>
        public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2, bool validate)
        {
            //                      reference chromosome
            //5'                         -->                      3'
            //----------------------------------------------------- F strand
            // 
            //3'                         <--                       5'
            //----------------------------------------------------- R strand
            //        read1                         read2
            //    5'             3'             3'            5'
            //         -->                          <--   
            //    |--------------               --------------|
            //    |<----------insert length------------------>|

            if (read1 == null)
            {
                throw new ArgumentNullException("read1");
            }

            if (read2 == null)
            {
                return 0;
            }

            if (validate)
            {
                // Only pairs classified as Normal or LengthAnomaly have a meaningful insert length.
                PairedReadType pairType = GetPairedReadType(read1, read2, 0, 0);
                bool usablePair = pairType == PairedReadType.Normal || pairType == PairedReadType.LengthAnomaly;
                if (!usablePair)
                {
                    return 0;
                }
            }

            // Both mates must report the same template length with opposite signs.
            if (read1.ISize != -read2.ISize)
            {
                return 0;
            }

            if (read1.ISize >= 0)
            {
                return read1.ISize;
            }

            return -read1.ISize;
        }
Beispiel #30
0
        // Sums up the size of all optional fields of a SAMAlignedSequence object.
        // Each field contributes 3 plus the magnitude of the value size reported by
        // GetOptionalFieldValueSize; a reported size of 0 is treated as an invalid value.
        private static int GetAuxiliaryDataLength(SAMAlignedSequence alignedSeq)
        {
            int total = 0;
            foreach (SAMOptionalField optionalField in alignedSeq.OptionalFields)
            {
                int fieldValueSize = GetOptionalFieldValueSize(optionalField);
                if (fieldValueSize == 0)
                {
                    throw new FormatException(
                        string.Format(
                            CultureInfo.InvariantCulture,
                            Properties.Resource.BAM_InvalidIntValueInOptFieldOfAlignedSeq,
                            optionalField.Value,
                            optionalField.Tag,
                            alignedSeq.QName));
                }

                // The helper may report a negative size; only its magnitude counts here.
                if (fieldValueSize < 0)
                {
                    fieldValueSize = -fieldValueSize;
                }

                total += 3 + fieldValueSize;
            }

            return total;
        }
        /// <summary>
        /// Adds the reference of the given sequence to the header of the filtered output file,
        /// both as a ReferenceSequenceInfo entry and as an "SQ" record field.
        /// </summary>
        /// <param name="seq">Aligned sequence whose RName and sequence length are recorded.</param>
        private void AddToHeader(SAMAlignedSequence seq)
        {
            var referenceInfo = new ReferenceSequenceInfo(seq.RName, GetSequence(seq).Length);
            newHeader.ReferenceSequences.Add(referenceInfo);

            // Matching "SQ" record carrying the name (SN) and length (LN) tags.
            var sequenceRecord = new SAMRecordField("SQ");
            var nameTag        = new SAMRecordFieldTag("SN", seq.RName);
            var lengthTag      = new SAMRecordFieldTag("LN", GetSequence(seq).Length.ToString(ci));
            sequenceRecord.Tags.Add(nameTag);
            sequenceRecord.Tags.Add(lengthTag);
            newHeader.RecordFields.Add(sequenceRecord);
        }
        /// <summary>
        /// Classifies a pair of reads from their reference names, orientation, start
        /// positions and insert length.
        /// </summary>
        /// <param name="read1">First aligned sequence.</param>
        /// <param name="read2">Second aligned sequence; null yields Orphan.</param>
        /// <param name="meanLengthOfInsert">Mean of the insertion length.</param>
        /// <param name="standardDeviationOfInsert">Standard deviation of insertion length.</param>
        /// <returns>
        /// Orphan when the second read is missing or unmapped, Chimera when the reads map to
        /// different references, StructuralAnomaly when the orientation or order of the reads is
        /// unexpected, LengthAnomaly when the insert length lies outside mean ± 3 standard
        /// deviations, and Normal otherwise.
        /// </returns>
        /// <exception cref="ArgumentNullException">read1 is null.</exception>
        public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, float meanLengthOfInsert, float standardDeviationOfInsert)
        {
            if (read1 == null)
            {
                throw new ArgumentNullException("read1");
            }

            if (read2 == null)
            {
                return PairedReadType.Orphan;
            }

            bool secondReadUnmapped = string.IsNullOrEmpty(read2.RName)
                                      || read2.RName.Equals("*")
                                      || (read2.Flag & SAMFlags.UnmappedQuery) == SAMFlags.UnmappedQuery;
            if (secondReadUnmapped)
            {
                return PairedReadType.Orphan;
            }

            if (!read2.RName.Equals(read1.RName))
            {
                return PairedReadType.Chimera;
            }

            // Both reads on the same strand (both forward or both reverse) is an anomaly.
            bool firstIsForward  = IsForwardRead(read1);
            bool secondIsForward = IsForwardRead(read2);
            if (firstIsForward == secondIsForward)
            {
                return PairedReadType.StructuralAnomaly;
            }

            // The forward read is expected to start at or before the reverse read.
            int forwardReadStartPos = firstIsForward ? read1.Pos : read2.Pos;
            int reverseReadStartPos = firstIsForward ? read2.Pos : read1.Pos;
            if (forwardReadStartPos > reverseReadStartPos)
            {
                return PairedReadType.StructuralAnomaly;
            }

            int insertLength = GetInsertLength(read1, read2);

            // µ + 3σ
            float upperLimit = meanLengthOfInsert + (3 * standardDeviationOfInsert);
            // µ - 3σ
            float lowerLimit = meanLengthOfInsert - (3 * standardDeviationOfInsert);
            if (insertLength > upperLimit || insertLength < lowerLimit)
            {
                return PairedReadType.LengthAnomaly;
            }

            return PairedReadType.Normal;
        }
Beispiel #33
0
        /// <summary>
        /// Determines the paired read type of two reads.
        /// </summary>
        /// <param name="read1">First aligned sequence.</param>
        /// <param name="read2">Second aligned sequence; a null value is reported as Orphan.</param>
        /// <param name="meanLengthOfInsert">Mean of the insertion length.</param>
        /// <param name="standardDeviationOfInsert">Standard deviation of insertion length.</param>
        /// <returns>
        /// Orphan, Chimera, StructuralAnomaly or LengthAnomaly when the corresponding check
        /// fails (evaluated in that order); Normal when every check passes.
        /// </returns>
        /// <exception cref="ArgumentNullException">read1 is null.</exception>
        public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, float meanLengthOfInsert, float standardDeviationOfInsert)
        {
            if (read1 == null)
            {
                throw new ArgumentNullException("read1");
            }

            if (read2 == null)
            {
                return PairedReadType.Orphan;
            }

            // Second read without a usable reference name or flagged as unmapped.
            if (string.IsNullOrEmpty(read2.RName))
            {
                return PairedReadType.Orphan;
            }

            if (read2.RName.Equals("*"))
            {
                return PairedReadType.Orphan;
            }

            if ((read2.Flag & SAMFlags.UnmappedQuery) == SAMFlags.UnmappedQuery)
            {
                return PairedReadType.Orphan;
            }

            // Reads mapped to different references.
            if (!read2.RName.Equals(read1.RName))
            {
                return PairedReadType.Chimera;
            }

            // Reads sharing one orientation.
            bool read1Forward = IsForwardRead(read1);
            bool read2Forward = IsForwardRead(read2);
            if (read1Forward == read2Forward)
            {
                return PairedReadType.StructuralAnomaly;
            }

            // Exactly one read is forward here; it must not start after the reverse one.
            SAMAlignedSequence forwardRead = read1Forward ? read1 : read2;
            SAMAlignedSequence reverseRead = read1Forward ? read2 : read1;
            if (forwardRead.Pos > reverseRead.Pos)
            {
                return PairedReadType.StructuralAnomaly;
            }

            int insertLength = GetInsertLength(read1, read2);

            // µ + 3σ
            float upperLimit = meanLengthOfInsert + (3*standardDeviationOfInsert);
            // µ - 3σ
            float lowerLimit = meanLengthOfInsert - (3*standardDeviationOfInsert);

            return (insertLength > upperLimit || insertLength < lowerLimit)
                ? PairedReadType.LengthAnomaly
                : PairedReadType.Normal;
        }
Beispiel #34
0
 /// <summary>
 /// Writes the read to the given writer as two lines: a ">" header line carrying
 /// QName, followed by the query sequence string.
 /// </summary>
 /// <param name="sam">Aligned sequence to write.</param>
 /// <param name="sw">Writer receiving the two lines.</param>
 public static void WriteFasta(this SAMAlignedSequence sam, StreamWriter sw)
 {
     string headerLine = ">" + sam.QName;
     sw.WriteLine(headerLine);

     var sequenceLine = sam.GetQuerySequenceString();
     sw.WriteLine(sequenceLine);
 }
Beispiel #35
0
        /// <summary>
        /// General method to validate that SAMParser.ParseQualityNSequence rejects invalid
        /// sequence / quality input by throwing.
        /// </summary>
        /// <param name="method">enum type to execute different overload</param>
        private static void ValidateQualitySeqLength(ParseOrFormatQualLength method)
        {
            SAMAlignedSequence align = new SAMAlignedSequence();

            try
            {
                // Every scenario passes a null sequence; they differ only in the read
                // name and in the quality string handed to the parser.
                bool invokeParser = true;
                string qualityData = null;

                switch (method)
                {
                    case ParseOrFormatQualLength.AlignedSeq:
                        qualityData = String.Empty;
                        break;
                    case ParseOrFormatQualLength.Sequencedata:
                        align.QName = "Quality Value";
                        qualityData = String.Empty;
                        break;
                    case ParseOrFormatQualLength.Qualitydata:
                    case ParseOrFormatQualLength.QualityLength:
                        align.QName = "Quality Value";
                        qualityData = Constants.QualitySequence;
                        break;
                    default:
                        invokeParser = false;
                        break;
                }

                if (invokeParser)
                {
                    SAMParser.ParseQualityNSequence(
                        align,
                        Alphabets.DNA,
                        null,
                        qualityData);
                }

                // Reaching this point means the expected exception was not thrown.
                Assert.Fail();
            }
            catch (ArgumentException)
            {
                ApplicationLog.WriteLine(
                    "SAM Parser P2 : Successfully validated the exception");
            }
            catch (FormatException)
            {
                ApplicationLog.WriteLine(
                    "SAM Parser P2 : Successfully validated the exception");
            }
        }
Beispiel #36
0
 /// <summary>
 /// Update the linear index array based on an aligned read and its current coordinates.
 /// Every 2^14 base window touched by the read receives the given offset when the
 /// window has no offset yet or currently holds a larger one.
 /// </summary>
 /// <param name="alignedSeq">Aligned read supplying Pos and RefEndPos.</param>
 /// <param name="offset">File offset to record for the windows covered by the read.</param>
 internal void UpdateLinearArrayIndex(SAMAlignedSequence alignedSeq, FileOffset offset)
 {
     // Convert the positions to zero based coordinates (clamped at 0), then to window numbers.
     int firstWindow = (alignedSeq.Pos > 0 ? alignedSeq.Pos - 1 : 0) >> 14;
     int lastWindow  = (alignedSeq.RefEndPos > 0 ? alignedSeq.RefEndPos - 1 : 0) >> 14;

     if (lastWindow > largestBinSeen)
     {
         largestBinSeen = lastWindow;
     }

     for (int window = firstWindow; window <= lastWindow; window++)
     {
         var existing = offSetArray[window];
         bool notSetYet = existing.BothDataElements == 0;

         //TODO: Is second check necessary?  Seems to always be true as we are doing things in order
         if (notSetYet || existing > offset)
         {
             offSetArray[window] = offset;
         }
     }
 }
Beispiel #37
0
        /// <summary>
        /// Returns an aligned sequence by parsing the next alignment block of the BAM file.
        /// </summary>
        /// <param name="start">
        /// Start of the region of interest; an alignment whose zero based reference end lies
        /// before it is skipped (unless the read has no reference).
        /// </param>
        /// <param name="end">
        /// End of the region of interest; an alignment whose one based position is greater
        /// than this value is skipped.
        /// </param>
        /// <returns>The parsed aligned sequence, or null when the alignment does not overlap the region.</returns>
        /// <exception cref="FileFormatException">The block contains an unknown CIGAR operation code.</exception>
        /// <exception cref="FormatException">An optional field has an unsupported value type.</exception>
        private SAMAlignedSequence GetAlignedSequence(int start, int end)
        {
            byte[] array = new byte[4];

            // The block is prefixed with its own length.
            ReadUnCompressedData(array, 0, 4);
            int blockLen = Helper.GetInt32(array, 0);
            byte[] alignmentBlock = new byte[blockLen];
            ReadUnCompressedData(alignmentBlock, 0, blockLen);
            SAMAlignedSequence alignedSeq = new SAMAlignedSequence();
            int value;
            UInt32 UnsignedValue;
            // 0-4 bytes
            int refSeqIndex = Helper.GetInt32(alignmentBlock, 0);

            if (refSeqIndex == -1)
            {
                alignedSeq.RName = "*";
            }
            else
            {
                alignedSeq.RName = refSeqNames[refSeqIndex];
            }

            // 4-8 bytes
            alignedSeq.Pos = Helper.GetInt32(alignmentBlock, 4) + 1;

            // if there is no overlap no need to parse further.
            //     BAMPos > closedEnd
            // => (alignedSeq.Pos - 1) > end -1
            if (alignedSeq.Pos > end)
            {
                return null;
            }

            // 8 - 12 bytes "bin<<16|mapQual<<8|read_name_len"
            UnsignedValue = Helper.GetUInt32(alignmentBlock, 8);

            // 10 -12 bytes
            // Shift while the value is still unsigned: casting to int first would turn a
            // bin with its top bit set (bin >= 32768) into a negative number.
            alignedSeq.Bin = (int)((UnsignedValue & 0xFFFF0000) >> 16);
            // 9th bytes
            alignedSeq.MapQ = (int)(UnsignedValue & 0x0000FF00) >> 8;
            // 8th bytes
            int queryNameLen = (int)(UnsignedValue & 0x000000FF);

            // 12 - 16 bytes
            UnsignedValue = Helper.GetUInt32(alignmentBlock, 12);
            // 14-16 bytes (same unsigned shift as for the bin above)
            int flagValue = (int)((UnsignedValue & 0xFFFF0000) >> 16);
            alignedSeq.Flag = (SAMFlags)flagValue;
            // 12-14 bytes
            int cigarLen = (int)(UnsignedValue & 0x0000FFFF);

            // 16-20 bytes
            int readLen = Helper.GetInt32(alignmentBlock, 16);

            // 20-24 bytes
            int mateRefSeqIndex = Helper.GetInt32(alignmentBlock, 20);
            if (mateRefSeqIndex != -1)
            {
                alignedSeq.MRNM = refSeqNames[mateRefSeqIndex];
            }
            else
            {
                alignedSeq.MRNM = "*";
            }

            // 24-28 bytes
            alignedSeq.MPos = Helper.GetInt32(alignmentBlock, 24) + 1;

            // 28-32 bytes
            alignedSeq.ISize = Helper.GetInt32(alignmentBlock, 28);

            // 32-(32+queryNameLen) bytes: read name; the last byte is dropped from the string.
            alignedSeq.QName = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, 32, queryNameLen - 1);
            StringBuilder strbuilder = new StringBuilder();
            int startIndex = 32 + queryNameLen;

            for (int i = startIndex; i < (startIndex + cigarLen * 4); i += 4)
            {
                // Get the CIGAR operation length stored in first 28 bits.
                UInt32 cigarValue = Helper.GetUInt32(alignmentBlock, i);
                strbuilder.Append(((cigarValue & 0xFFFFFFF0) >> 4).ToString(CultureInfo.InvariantCulture));

                // Get the CIGAR operation stored in last 4 bits.
                value = (int)cigarValue & 0x0000000F;

                // MIDNSHP=X => 012345678
                switch (value)
                {
                    case 0:
                        strbuilder.Append("M");
                        break;
                    case 1:
                        strbuilder.Append("I");
                        break;
                    case 2:
                        strbuilder.Append("D");
                        break;
                    case 3:
                        strbuilder.Append("N");
                        break;
                    case 4:
                        strbuilder.Append("S");
                        break;
                    case 5:
                        strbuilder.Append("H");
                        break;
                    case 6:
                        strbuilder.Append("P");
                        break;
                    case 7:
                        strbuilder.Append("=");
                        break;
                    case 8:
                        strbuilder.Append("X");
                        break;
                    default:
                        throw new FileFormatException(Properties.Resource.BAM_InvalidCIGAR);
                }
            }

            string cigar = strbuilder.ToString();
            if (string.IsNullOrWhiteSpace(cigar))
            {
                alignedSeq.CIGAR = "*";
            }
            else
            {
                alignedSeq.CIGAR = cigar;
            }

            // if there is no overlap no need to parse further.
            // ZeroBasedRefEnd < start
            // => (alignedSeq.RefEndPos -1) < start
            if (alignedSeq.RefEndPos - 1 < start && alignedSeq.RName!=Properties.Resource.SAM_NO_REFERENCE_DEFINED_INDICATOR)
            {
                return null;
            }

            startIndex += cigarLen * 4;
            int index = startIndex;

            // A record without bases stores neither sequence nor quality bytes; report both
            // as "*" instead of decoding bytes that belong to the optional fields.
            string strSequence = "*";
            string strQualValues = "*";

            if (readLen > 0)
            {
                // Bases are packed two per byte, high nibble first.
                strbuilder = new StringBuilder();
                for (; index < (startIndex + (readLen + 1) / 2) - 1; index++)
                {
                    // Get first 4 bit value
                    value = (alignmentBlock[index] & 0xF0) >> 4;
                    strbuilder.Append(GetSeqChar(value));
                    // Get last 4 bit value
                    value = alignmentBlock[index] & 0x0F;
                    strbuilder.Append(GetSeqChar(value));
                }

                // Last byte: the low nibble only carries a base when the read length is even.
                value = (alignmentBlock[index] & 0xF0) >> 4;
                strbuilder.Append(GetSeqChar(value));
                if (readLen % 2 == 0)
                {
                    value = alignmentBlock[index] & 0x0F;
                    strbuilder.Append(GetSeqChar(value));
                }

                startIndex = index + 1;
                strSequence = strbuilder.ToString();

                // A leading 0xFF marks absent qualities; otherwise shift each value by 33
                // to obtain the printable character.
                if (alignmentBlock[startIndex] != 0xFF)
                {
                    byte[] qualValues = new byte[readLen];
                    for (int i = startIndex; i < (startIndex + readLen); i++)
                    {
                        qualValues[i - startIndex] = (byte)(alignmentBlock[i] + 33);
                    }

                    strQualValues = System.Text.ASCIIEncoding.ASCII.GetString(qualValues);
                }
            }

            SAMParser.ParseQualityNSequence(alignedSeq, Alphabet, strSequence, strQualValues);

            startIndex += readLen;
            if (alignmentBlock.Length > startIndex + 4 && alignmentBlock[startIndex] != 0x0 && alignmentBlock[startIndex + 1] != 0x0)
            {
                // Remaining bytes: optional fields, each a two character tag, a one character
                // value type and the value itself.
                for (index = startIndex; index < alignmentBlock.Length; )
                {
                    SAMOptionalField optionalField = new SAMOptionalField();
                    optionalField.Tag = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, index, 2);
                    index += 2;
                    char vType = (char)alignmentBlock[index++];
                    string valueType = vType.ToString();

                    // SAM format supports [AifZH] for value type.
                    // In BAM, an integer may be stored as a signed 8-bit integer (c), unsigned 8-bit integer (C), signed short (s), unsigned
                    // short (S), signed 32-bit (i) or unsigned 32-bit integer (I), depending on the signed magnitude of the integer. However,
                    // in SAM, all types of integers are presented as type 'i'.
                    string message = Helper.IsValidPatternValue("VType", valueType, BAMOptionalFieldRegex);
                    if (!string.IsNullOrEmpty(message))
                    {
                        throw new FormatException(message);
                    }

                    optionalField.Value = GetOptionalValue(vType, alignmentBlock, ref index).ToString();

                    // Convert to SAM format.
                    if ("cCsSI".IndexOf(vType) >= 0)
                    {
                        valueType = "i";
                    }

                    optionalField.VType = valueType;

                    alignedSeq.OptionalFields.Add(optionalField);
                }
            }

            return alignedSeq;
        }
Beispiel #38
0
        /// <summary>
        /// Parses sequence data and quality values and updates the SAMAlignedSequence instance.
        /// </summary>
        /// <remarks>
        /// A sequence of "*" leaves <paramref name="alignedSeq"/> untouched. A quality string of
        /// "*" produces a plain Sequence; any other quality string produces a QualitativeSequence.
        /// </remarks>
        /// <param name="alignedSeq">SAM aligned Sequence.</param>
        /// <param name="alphabet">Alphabet of the sequence to be created.</param>
        /// <param name="sequencedata">Sequence data.</param>
        /// <param name="qualitydata">Quality values.</param>
        /// <exception cref="ArgumentNullException">
        /// alignedSeq is null, or sequencedata / qualitydata is null, empty or white space.
        /// </exception>
        /// <exception cref="FileFormatException">Sequence and quality strings differ in length.</exception>
        public static void ParseQualityNSequence(SAMAlignedSequence alignedSeq, IAlphabet alphabet, string sequencedata, string qualitydata)
        {
            if (alignedSeq == null)
            {
                throw new ArgumentNullException("alignedSeq");
            }

            if (string.IsNullOrWhiteSpace(sequencedata))
            {
                throw new ArgumentNullException("sequencedata");
            }

            if (string.IsNullOrWhiteSpace(qualitydata))
            {
                throw new ArgumentNullException("qualitydata");
            }

            // No sequence stored for this read.
            if (sequencedata.Equals("*"))
            {
                return;
            }

            ISequence parsedSequence;

            if (qualitydata.Equals("*"))
            {
                // No qualities stored: plain sequence.
                parsedSequence    = new Sequence(alphabet, sequencedata);
                parsedSequence.ID = alignedSeq.QName;
            }
            else
            {
                byte[] scoreBytes = ASCIIEncoding.ASCII.GetBytes(qualitydata);

                // One quality score is required per sequence symbol.
                if (sequencedata.Length != qualitydata.Length)
                {
                    string detail = string.Format(CultureInfo.CurrentCulture, Properties.Resource.FastQ_InvalidQualityScoresLength, alignedSeq.QName);
                    string errorMessage = string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, Properties.Resource.SAM_NAME, detail);
                    Trace.Report(errorMessage);
                    throw new FileFormatException(errorMessage);
                }

                string scoreText = ASCIIEncoding.ASCII.GetString(scoreBytes);
                QualitativeSequence withQuality = new QualitativeSequence(alphabet, QualityFormatType, sequencedata, scoreText);
                withQuality.ID = alignedSeq.QName;
                parsedSequence = withQuality;
            }

            alignedSeq.QuerySequence = parsedSequence;
        }
Beispiel #39
0
        // Validates the alignment and returns it as a SequenceAlignmentMap.
        // An input that already is a SequenceAlignmentMap is validated and returned as-is;
        // any other alignment is rebuilt from the SAM headers stored in its metadata.
        // In both cases this.refSequences is refreshed from the header (sorted when a
        // coordinate-sorted BAM file is requested).
        private SequenceAlignmentMap ValidateAlignment(ISequenceAlignment sequenceAlignment)
        {
            SequenceAlignmentMap existingMap = sequenceAlignment as SequenceAlignmentMap;
            if (existingMap == null)
            {
                // Generic alignment: the SAM header has to be present in the metadata.
                SAMAlignmentHeader header = sequenceAlignment.Metadata[Helper.SAMAlignmentHeaderKey] as SAMAlignmentHeader;
                if (header == null)
                {
                    throw new ArgumentException(Properties.Resource.SAMAlignmentHeaderNotFound);
                }

                ValidateAlignmentHeader(header);

                SequenceAlignmentMap rebuiltMap = new SequenceAlignmentMap(header);
                if (CreateSortedBAMFile && SortType == BAMSortByFields.ChromosomeNameAndCoordinates)
                {
                    this.refSequences = SortSequenceRanges(rebuiltMap.Header.GetReferenceSequenceRanges());
                }
                else
                {
                    this.refSequences = rebuiltMap.Header.GetReferenceSequenceRanges();
                }

                // Convert every aligned sequence; each one must carry its own SAM header.
                foreach (IAlignedSequence source in sequenceAlignment.AlignedSequences)
                {
                    SAMAlignedSequenceHeader sourceHeader = source.Metadata[Helper.SAMAlignedSequenceHeaderKey] as SAMAlignedSequenceHeader;
                    if (sourceHeader == null)
                    {
                        throw new ArgumentException(Properties.Resource.SAMAlignedSequenceHeaderNotFound);
                    }

                    SAMAlignedSequence converted = new SAMAlignedSequence(sourceHeader)
                    {
                        QuerySequence = source.Sequences[0]
                    };
                    rebuiltMap.QuerySequences.Add(converted);
                }

                return rebuiltMap;
            }

            // Already a SAM alignment map: validate its header and pick up the reference ranges.
            ValidateAlignmentHeader(existingMap.Header);
            if (CreateSortedBAMFile && SortType == BAMSortByFields.ChromosomeNameAndCoordinates)
            {
                this.refSequences = SortSequenceRanges(existingMap.Header.GetReferenceSequenceRanges());
            }
            else
            {
                this.refSequences = existingMap.Header.GetReferenceSequenceRanges();
            }

            return existingMap;
        }
Beispiel #40
0
 /// <summary>
 /// Writes one aligned sequence with the formatter selected by the output settings:
 /// the BAM formatter when UnCompressedBAM or BAMOutput is set, the SAM formatter otherwise.
 /// </summary>
 /// <param name="header">Alignment header.</param>
 /// <param name="alignedSequence">Aligned sequence to write.</param>
 private void WriteAlignedSequence(SAMAlignmentHeader header, SAMAlignedSequence alignedSequence)
 {
     bool writeAsBam = UnCompressedBAM || BAMOutput;
     if (!writeAsBam)
     {
         SAMFormatter.WriteSAMAlignedSequence(writer, alignedSequence);
         return;
     }

     // For compressed BAM output the uncompressed file is compressed later,
     // before it is sent to the output stream.
     bamformatter.WriteAlignedSequence(header, alignedSequence, bamUncompressedOutStream);
 }
Beispiel #41
0
        /// <summary>
        /// Writes a SAMAlignedSequence to the specified stream after validating the arguments.
        /// The reference sequence ranges are taken from the header the first time they are needed
        /// and reused afterwards.
        /// </summary>
        /// <param name="header">Header from SAM object.</param>
        /// <param name="alignedSeq">SAMAlignedSequence object.</param>
        /// <param name="writer">Stream to write.</param>
        public void WriteAlignedSequence(SAMAlignmentHeader header, SAMAlignedSequence alignedSeq, Stream writer)
        {
            if (null == header)
            {
                throw new ArgumentNullException("header");
            }

            if (null == alignedSeq)
            {
                throw new ArgumentNullException("alignedSeq");
            }

            if (null == writer)
            {
                throw new ArgumentNullException("writer");
            }

            // Keep ranges that are already known; otherwise read them from the header.
            this.refSequences = this.refSequences ?? header.GetReferenceSequenceRanges();

            WriteAlignedSequence(alignedSeq, writer);
        }
Beispiel #42
0
 /// <summary>
 /// Throws if the sequence violates any assumption made by this class, so that the
 /// individual methods do not need their own checks.
 /// A sequence is accepted only when it has a reference name, its reference end position
 /// lies after its start position, its CIGAR is present and not "*", and its query
 /// sequence is a QualitativeSequence.
 /// </summary>
 /// <param name="seq">Sequence to check.</param>
 private void validateSequence(SAMAlignedSequence seq)
 {
     if (seq == null)
     {
         throw new ArgumentNullException("seq");
     }

     bool usable = !String.IsNullOrEmpty(seq.RName)
         && !(seq.RefEndPos <= seq.Pos)
         && !String.IsNullOrEmpty(seq.CIGAR)
         && seq.CIGAR != "*"
         && (seq.QuerySequence is QualitativeSequence);
     if (usable)
     {
         return;
     }

     throw new ArgumentException("Tried to build a pileup with an invalid sequence.  Sequence was:\n" + seq.ToString());
 }
Beispiel #43
0
        /// <summary>
        /// Writes one SAMAlignedSequence to the specified stream as a binary alignment block:
        /// the fixed-size fields first (each written as 4 little-endian bytes), then the
        /// null-terminated read name, the encoded CIGAR, the encoded bases, the quality
        /// values and finally the optional fields.
        /// </summary>
        /// <param name="alignedSeq">SAMAlignedSequence object.</param>
        /// <param name="writer">Stream to write.</param>
        private void WriteAlignedSequence(SAMAlignedSequence alignedSeq, Stream writer)
        {
            // Total size of the block that follows the block-size field.
            int totalBlockSize = GetBlockSize(alignedSeq);

            // Index of the reference sequence this read is aligned to.
            int referenceIndex = GetRefSeqID(alignedSeq.RName);

            // bin<<16 | mapQual<<8 | read_name_len (the +1 accounts for the terminating NULL).
            uint binMapQualNameLength = ((uint)alignedSeq.Bin << 16)
                | ((uint)alignedSeq.MapQ << 8)
                | (uint)(alignedSeq.QName.Length + 1);

            // flag<<16 | cigar_len
            uint flagCigarLength = ((uint)alignedSeq.Flag << 16) | (uint)GetCIGARLength(alignedSeq.CIGAR);

            int queryLength = (int)alignedSeq.QuerySequence.Count;

            int mateReferenceIndex = GetRefSeqID(alignedSeq.MRNM);

            byte[] nameBytes = Encoding.UTF8.GetBytes(alignedSeq.QName);

            // Cigar: op_len<<4|op. Op: MIDNSHP=X => 012345678
            IList<uint> cigarWords = GetEncodedCIGAR(alignedSeq.CIGAR);

            // Block size.
            writer.Write(Helper.GetLittleEndianByteArray(totalBlockSize), 0, 4);

            // Reference sequence index.
            writer.Write(Helper.GetLittleEndianByteArray(referenceIndex), 0, 4);

            // Position: shifted down by one when positive, otherwise written as -1.
            int storedPosition = alignedSeq.Pos > 0 ? alignedSeq.Pos - 1 : -1;
            writer.Write(Helper.GetLittleEndianByteArray(storedPosition), 0, 4);

            // bin / mapping quality / read name length.
            writer.Write(Helper.GetLittleEndianByteArray(binMapQualNameLength), 0, 4);

            // flag / cigar length.
            writer.Write(Helper.GetLittleEndianByteArray(flagCigarLength), 0, 4);

            // Length of the read.
            writer.Write(Helper.GetLittleEndianByteArray(queryLength), 0, 4);

            // Mate reference sequence index.
            writer.Write(Helper.GetLittleEndianByteArray(mateReferenceIndex), 0, 4);

            // Leftmost coordinate of the mate.
            // SAM: MPos is 1-based and 0 means unpaired or pairing information unavailable.
            // BAM: MPos is 0-based and -1 means unpaired or pairing information unavailable.
            writer.Write(Helper.GetLittleEndianByteArray(alignedSeq.MPos - 1), 0, 4);

            // Insert size of the read pair (if paired).
            writer.Write(Helper.GetLittleEndianByteArray(alignedSeq.ISize), 0, 4);

            // Read name followed by its NULL terminator.
            writer.Write(nameBytes, 0, nameBytes.Length);
            writer.WriteByte((byte)'\0');

            // One 4-byte word per encoded CIGAR operation.
            foreach (uint cigarWord in cigarWords)
            {
                writer.Write(Helper.GetLittleEndianByteArray(cigarWord), 0, 4);
            }

            // 4-bit encoded read: =ACGTN=>0,1,2,4,8,15; the earlier base is stored in the high-order 4 bits of the byte.
            byte[] payload = GetEncodedSequence(alignedSeq);
            writer.Write(payload, 0, payload.Length);

            // Phred base quality (0xFF if absent).
            payload = GetQualityValue(alignedSeq.QuerySequence);
            writer.Write(payload, 0, payload.Length);

            // Optional fields, each serialized by GetOptioanField.
            foreach (SAMOptionalField optionalField in alignedSeq.OptionalFields)
            {
                byte[] fieldBytes = GetOptioanField(optionalField);
                writer.Write(fieldBytes, 0, fieldBytes.Length);
            }
        }
 /// <summary>
 /// Appends a sequence to the list stored in the dictionary under the given key,
 /// creating a new single-element list when the key is not present yet.
 /// </summary>
 /// <param name="dict">Dictionary mapping a key to its list of sequences.</param>
 /// <param name="key">Key of the list to add to; an ArgumentException is thrown when it is null.</param>
 /// <param name="seq">Sequence to append.</param>
 private static void AddToDict(Dictionary<String, List<SAMAlignedSequence>> dict, string key, SAMAlignedSequence seq)
 {
     if (key == null)
     {
         throw new ArgumentException(Properties.Resources.INVALID_KEY);
     }

     // Single lookup instead of ContainsKey + indexer read + indexer write.
     List<SAMAlignedSequence> bucket;
     if (dict.TryGetValue(key, out bucket))
     {
         // The list is a reference type, so the dictionary entry sees the new element
         // without being written back.
         bucket.Add(seq);
     }
     else
     {
         dict.Add(key, new List<SAMAlignedSequence> { seq });
     }
 }
        /// <summary>
        /// Adds a sequence to the handler. A sequence whose reference name matches the current
        /// cluster is stored with it; a sequence from a different cluster first triggers
        /// processing of the stored cluster and then starts a new one.
        /// </summary>
        /// <param name="sequence">A sequence. A null sequence is ignored and reported as accepted.</param>
        /// <returns>Returns true if the sequence could be added, false if the handler has been closed.</returns>
        public bool Add(SAMAlignedSequence sequence)
        {
            if (allSequences == null)
            {
                allSequences = new Collection<SAMAlignedSequence>();
            }

            if (sequence == null)
            {
                return true;
            }

            if (finished)
            {
                // Output to the filtered bam file is requested, but it is either still being
                // written or its header and body files have not been merged yet.
                bool bamOutputPending = writeToFilteredBam && (!canWriteToBam || !bamFilesMerged);
                if (!bamOutputPending)
                {
                    // Nothing left to wait for: tell the caller no more sequences are accepted.
                    return false;
                }

                // Block until the bam output has completed.
                while (!canWriteToBam || !bamFilesMerged)
                {
                    Thread.Sleep(20000); // sleep 20 seconds
                }

                return true;
            }

            // Cluster the incoming sequence belongs to.
            string incomingCluster = sequence.RName;

            if (currentClusterId == null)
            {
                // Very first sequence of the very first cluster.
                currentClusterId = incomingCluster;
            }
            else if (!currentClusterId.Equals(incomingCluster))
            {
                // A new cluster starts: process what is stored before accepting the new sequence.
                ++numberClustersParsed;
                ProcessSequences();

                currentClusterId = incomingCluster;
                allSequences = new Collection<SAMAlignedSequence>();
            }

            allSequences.Add(sequence);
            return true;
        }
 /// <summary>
 /// Gets the insert length of reads.
 /// Convenience overload that forwards to the three-argument GetInsertLength overload,
 /// passing false as its third argument.
 /// </summary>
 /// <param name="read1">First read.</param>
 /// <param name="read2">Second read.</param>
 /// <returns>The value returned by the three-argument overload.</returns>
 public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2)
 {
     // NOTE(review): the meaning of the third argument is defined by the overload, which is not visible here.
     return(GetInsertLength(read1, read2, false));
 }
 /// <summary>
 /// Given a sequence, returns its reference name (RName), or null when the sequence
 /// is null or flagged as an unmapped query.
 /// </summary>
 /// <param name="sequence">Aligned sequence to inspect; may be null.</param>
 /// <returns>The reference name, or null for a null or unmapped sequence.</returns>
 private static string GetId(SAMAlignedSequence sequence)
 {
     // The null test has to come before the Flag access; previously the flag was read
     // first, so a null sequence caused a NullReferenceException.
     if (sequence == null || sequence.Flag.HasFlag(SAMFlags.UnmappedQuery))
     {
         return null;
     }

     return sequence.RName;
 }
Beispiel #48
0
 /// <summary>
 /// Initializes a new instance of the <see cref="Bio.IO.PacBio.PacBioCCSRead"/> class from an initially parsed BAM record.
 /// The optional tags sn, zm, pq/rq, za, rs, np, RG and zs are copied into the matching members;
 /// the movie name is the part of the query sequence ID before the first '/'.
 /// </summary>
 /// <param name="s">Parsed aligned sequence to read the tags and the query sequence from.</param>
 public PacBioCCSRead (SAMAlignedSequence s)
 {
     if (s == null) {
         throw new ArgumentNullException ("s");
     }

     // Tag values are machine-written text, so they are parsed with the invariant culture;
     // the current culture could interpret '.' as something other than the decimal separator.
     var invariant = System.Globalization.CultureInfo.InvariantCulture;

     /* TODO: Converting from binary to string and back is beyond silly...
      * no performance hit worth worrying about at present, but in the future it might be worth
      * going directly from binary to the type rather than through string intermediates */
     foreach (var v in s.OptionalFields) {
         switch (v.Tag) {
         case "sn":
             // Comma-separated list; the first element is skipped
             // (presumably the array sub-type marker - TODO confirm).
             var snrs = v.Value.Split (',').Skip (1).Select (x => Convert.ToSingle (x, invariant)).ToArray ();
             SnrA = snrs [0];
             SnrC = snrs [1];
             SnrG = snrs [2];
             SnrT = snrs [3];
             break;
         case "zm":
             HoleNumber = Convert.ToInt32 (v.Value, invariant);
             break;
         case "pq": // This tag is now deprecated by the rq tag
         case "rq":
             ReadQuality = Convert.ToSingle (v.Value, invariant);
             break;
         case "za":
             AvgZscore = Convert.ToSingle (v.Value, invariant);
             break;
         case "rs":
             statusCounts = v.Value.Split (',').Skip (1).Select (x => Convert.ToInt32 (x, invariant)).ToArray ();
             break;
         case "np":
             NumPasses = Convert.ToInt32 (v.Value, invariant);
             break;
         case "RG":
             ReadGroup = v.Value;
             break;
         case "zs":
             ZScores = v.Value.Split (',').Skip (1).Select (x => Convert.ToSingle (x, invariant)).ToArray ();
             break;
         }
     }
     // TODO: We should use String.Intern here, but not available in PCL...
     // Movie = String.Intern(s.QuerySequence.ID.Split ('/') [0]);
     Movie = s.QuerySequence.ID.Split ('/') [0];
     Sequence = s.QuerySequence as QualitativeSequence;
 }
 /// <summary>
 /// Returns the value of the first optional field tagged "RG" on the given
 /// SAMAlignedSequence, or null when the read has no such field.
 /// </summary>
 private static string GetRgTag(SAMAlignedSequence seq)
 {
     // The fields are scanned on every call because their order is not assumed
     // to be consistent between reads.
     foreach (SAMOptionalField candidate in seq.OptionalFields)
     {
         if (candidate.Tag != "RG")
         {
             continue;
         }

         return candidate.Value;
     }

     return null;
 }
        /// <summary>
        /// Gets the paired reads when SAMAligned sequences are in memory.
        /// Reads flagged as paired are grouped by query name in a single pass over QuerySequences:
        /// the first read of a name starts an orphan entry, the second completes the pair and
        /// determines its type, and any further read marks the entry as having multiple hits.
        /// </summary>
        /// <param name="meanLengthOfInsert">Mean of the insert length.</param>
        /// <param name="standardDeviationOfInsert">Standard deviation of insert length.</param>
        /// <param name="calculate">If this flag is set then mean and standard deviation will
        /// be calculated from the paired reads instead of specified.</param>
        /// <returns>List of paired read.</returns>
        private IList<PairedRead> GetInMemoryPairedReads(float meanLengthOfInsert, float standardDeviationOfInsert, bool calculate = false)
        {
            // Dictionary helps to get the information at one pass of aligned sequence list.
            Dictionary<string, PairedRead> pairedReads = new Dictionary<string, PairedRead>();

            // Running total and number of insert lengths that take part in the statistics.
            double sum = 0;
            int count = 0;

            for (int i = 0; i < QuerySequences.Count; i++)
            {
                SAMAlignedSequence read = QuerySequences[i];
                if ((read.Flag & SAMFlags.PairedRead) != SAMFlags.PairedRead)
                {
                    continue;
                }

                PairedRead pairedRead;
                if (!pairedReads.TryGetValue(read.QName, out pairedRead))
                {
                    // First read seen for this query name: start an orphan entry.
                    // A read with a reference name becomes Read1, one without becomes Read2.
                    pairedRead = new PairedRead();
                    if (!string.IsNullOrEmpty(read.RName) && !read.RName.Equals("*"))
                    {
                        pairedRead.Read1 = read;
                    }
                    else
                    {
                        pairedRead.Read2 = read;
                    }

                    pairedRead.PairedType = PairedReadType.Orphan;
                    pairedRead.InsertLength = 0;
                    pairedReads.Add(read.QName, pairedRead);
                    continue;
                }

                if (pairedRead.Read2 == null || pairedRead.Read1 == null)
                {
                    // Second read for this name: fill the free slot and classify the pair.
                    if (pairedRead.Read2 == null)
                    {
                        pairedRead.Read2 = read;
                    }
                    else
                    {
                        pairedRead.Read1 = read;
                    }

                    pairedRead.PairedType = PairedRead.GetPairedReadType(pairedRead.Read1, pairedRead.Read2, meanLengthOfInsert, standardDeviationOfInsert);
                    if (pairedRead.PairedType == PairedReadType.Normal || pairedRead.PairedType == PairedReadType.LengthAnomaly)
                    {
                        pairedRead.InsertLength = PairedRead.GetInsertLength(pairedRead.Read1, pairedRead.Read2);
                        if (calculate)
                        {
                            sum += pairedRead.InsertLength;
                            count++;
                        }
                    }

                    continue;
                }

                // Third or later read for this name: the entry becomes a multiple-hit entry.
                // Only a pair that contributed to the statistics (Normal or LengthAnomaly) is
                // taken out again, and its insert length must be subtracted BEFORE it is reset;
                // once the type is MultipleHits later reads no longer touch the totals.
                bool wasCounted = pairedRead.PairedType == PairedReadType.Normal
                    || pairedRead.PairedType == PairedReadType.LengthAnomaly;
                if (calculate && wasCounted)
                {
                    sum -= pairedRead.InsertLength;
                    count--;
                }

                pairedRead.InsertLength = 0;
                pairedRead.Reads.Add(read);
                pairedRead.PairedType = PairedReadType.MultipleHits;
            }

            List<PairedRead> allreads = pairedReads.Values.ToList();

            // Re-evaluate the pair types from the collected insert lengths when requested.
            if (calculate && count > 0)
            {
                UpdateType(allreads, sum, count);
            }

            return allreads;
        }
 /// <summary>
 /// Given a SAMAlignedSequence, returns the string form of its query sequence
 /// up to (not including) the first "\r\n", or the whole string when there is none.
 /// </summary>
 private static string GetSequence(SAMAlignedSequence seq)
 {
     String seqStr = seq.QuerySequence.ToString();

     // Only the first segment is needed, so a plain ordinal search replaces the
     // regular-expression split on a literal separator.
     int lineBreak = seqStr.IndexOf("\r\n", StringComparison.Ordinal);
     return lineBreak < 0 ? seqStr : seqStr.Substring(0, lineBreak);
 }