/// <summary>
/// Reports whether the supplied read is oriented on the reverse strand.
/// </summary>
/// <param name="read">Aligned sequence to inspect.</param>
/// <returns>The negation of what IsForwardRead reports for the same read.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="read"/> is null.</exception>
public static bool IsReverseRead(SAMAlignedSequence read)
{
    if (read != null)
    {
        bool isForward = IsForwardRead(read);
        return !isForward;
    }

    throw new ArgumentNullException("read");
}
/// <summary>
/// Classifies a pair of reads using the insert statistics stored in a clone library description.
/// </summary>
/// <param name="read1">First aligned sequence.</param>
/// <param name="read2">Second aligned sequence.</param>
/// <param name="libraryInfo">Library whose mean insert length and standard deviation are used.</param>
/// <returns>The paired read type computed by the overload that takes the mean and deviation directly.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="libraryInfo"/> is null.</exception>
public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, CloneLibraryInformation libraryInfo)
{
    if (libraryInfo == null)
    {
        throw new ArgumentNullException("libraryInfo");
    }

    var meanInsert = libraryInfo.MeanLengthOfInsert;
    var insertDeviation = libraryInfo.StandardDeviationOfInsert;
    return GetPairedReadType(read1, read2, meanInsert, insertDeviation);
}
/// <summary>
/// Reports whether the supplied read is oriented on the forward strand.
/// </summary>
/// <param name="read">Aligned sequence to inspect.</param>
/// <returns>True when the QueryOnReverseStrand bit is not set in the read's flag.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="read"/> is null.</exception>
public static bool IsForwardRead(SAMAlignedSequence read)
{
    if (read == null)
    {
        throw new ArgumentNullException("read");
    }

    // Isolate the strand bit; the read is forward when that bit is clear.
    var strandBit = read.Flag & SAMFlags.QueryOnReverseStrand;
    return strandBit == 0;
}
/// <summary>
/// Looks up the value of the first optional field that matches both the tag and the value type.
/// </summary>
/// <param name="sam">Aligned sequence whose optional fields are searched.</param>
/// <param name="tag">Tag of the optional field to find.</param>
/// <param name="vtype">Value type of the optional field to find.</param>
/// <param name="defaultValue">Value returned when no field matches.</param>
/// <returns>The matching field's value, or <paramref name="defaultValue"/> when there is no match.</returns>
public static string GetOptionValue(this SAMAlignedSequence sam, string tag, string vtype, string defaultValue)
{
    // The static string.Equals is an ordinal comparison that tolerates null operands, so a field
    // created without a Tag or VType no longer raises NullReferenceException during the search.
    var field = sam.OptionalFields.FirstOrDefault(m => string.Equals(m.Tag, tag) && string.Equals(m.VType, vtype));
    return field == null ? defaultValue : field.Value;
}
/// <summary>
/// Renders the read's encoded quality scores as text, one character per score.
/// </summary>
/// <param name="sam">Aligned sequence whose quality scores are rendered.</param>
/// <returns>
/// The scores cast to characters; the order is reversed when the QueryOnReverseStrand flag is set.
/// </returns>
public static string GetQualityScoresString(this SAMAlignedSequence sam)
{
    bool onReverseStrand = sam.Flag.HasFlag(SAMFlags.QueryOnReverseStrand);

    var symbols = sam.GetEncodedQualityScores().Select(score => (char)score);
    if (onReverseStrand)
    {
        symbols = symbols.Reverse();
    }

    return new string(symbols.ToArray());
}
/// <summary>
/// Renders the read's query sequence as text.
/// </summary>
/// <param name="sam">Aligned sequence whose query sequence is rendered.</param>
/// <returns>
/// The sequence string; when the QueryOnReverseStrand flag is set, the string of the
/// reverse-complemented sequence is returned instead.
/// </returns>
public static string GetQuerySequenceString(this SAMAlignedSequence sam)
{
    if (!sam.Flag.HasFlag(SAMFlags.QueryOnReverseStrand))
    {
        return sam.QuerySequence.GetSequenceString();
    }

    var reverseComplement = sam.QuerySequence.GetReverseComplementedSequence();
    return reverseComplement.GetSequenceString();
}
/// <summary>
/// Builds the query sequence of an aligned read from raw sequence and quality bytes.
/// </summary>
/// <param name="alignedSeq">SAM aligned sequence to update.</param>
/// <param name="alphabet">Alphabet of the sequence to be created.</param>
/// <param name="sequencedata">Sequence symbols; a single asterisk leaves the read untouched.</param>
/// <param name="qualitydata">Quality values; a single asterisk produces a sequence without qualities.</param>
/// <param name="validate">Passed through to the sequence constructors.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="alignedSeq"/> is null, or when either byte array is null or empty.
/// </exception>
/// <exception cref="Exception">Thrown when sequence and quality lengths differ.</exception>
public static void ParseQualityNSequence(SAMAlignedSequence alignedSeq, IAlphabet alphabet, byte[] sequencedata, byte[] qualitydata, bool validate = true)
{
    if (alignedSeq == null)
    {
        throw new ArgumentNullException("alignedSeq");
    }

    if (sequencedata == null || sequencedata.Length == 0)
    {
        throw new ArgumentNullException("sequencedata");
    }

    if (qualitydata == null || qualitydata.Length == 0)
    {
        throw new ArgumentNullException("qualitydata");
    }

    FastQFormatType qualityFormat = QualityFormatType;

    // A lone asterisk in the sequence column: nothing to build.
    if (sequencedata.Length == 1 && sequencedata[0] == AsteriskAsByte)
    {
        return;
    }

    // A lone asterisk in the quality column: build a plain sequence.
    bool qualityMissing = qualitydata.Length == 1 && qualitydata[0] == AsteriskAsByte;
    if (qualityMissing)
    {
        alignedSeq.QuerySequence = new Sequence(alphabet, sequencedata, validate) { ID = alignedSeq.QName };
        return;
    }

    // Every symbol needs exactly one quality score.
    if (sequencedata.Length != qualitydata.Length)
    {
        string detail = string.Format(CultureInfo.CurrentCulture, Properties.Resource.FastQ_InvalidQualityScoresLength, alignedSeq.QName);
        string message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, Properties.Resource.SAM_NAME, detail);
        Trace.Report(message);
        throw new Exception(message);
    }

    alignedSeq.QuerySequence = new QualitativeSequence(alphabet, qualityFormat, sequencedata, qualitydata, validate) { ID = alignedSeq.QName };
}
/// <summary>
/// Lazily parses the alignment lines of a SAM file, yielding one aligned sequence per line.
/// </summary>
/// <param name="fileName">Path of the SAM file to read.</param>
/// <returns>
/// A deferred sequence of aligned reads. The file is opened when enumeration starts and is closed
/// when enumeration completes or the enumerator is disposed.
/// </returns>
public IEnumerable<SAMAlignedSequence> ParseSequencesAsEnumerable(string fileName)
{
    using (StreamReader reader = new StreamReader(fileName))
    {
        // Collect the leading '@' lines first; the first non-header line is kept in 'line'
        // so that no alignment is lost while the header is being read.
        List<string> headerStrings = new List<string>();
        string line = ReadNextLine(reader);
        while (line != null && line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
        {
            headerStrings.Add(line);
            line = ReadNextLine(reader);
        }

        // The parsed header is not part of the result. The call is kept so that whatever
        // checks ParseSamHeader performs still run (presumably it rejects malformed
        // headers - confirm against its implementation).
        ParseSamHeader(headerStrings);

        // Stream the alignments; stop at end of file or at the next line starting with '@'.
        while (line != null && !line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
        {
            yield return ParseSequence(line, this.Alphabet);
            line = ReadNextLine(reader);
        }
    }
}
//public static string GetZezValue(this SAMAlignedSequence sam)
//{
//  return GetOptionValue(sam, "ZE", "Z");
//}

/// <summary>
/// Writes the read to the writer as a four-line FASTQ record.
/// </summary>
/// <param name="sam">Aligned sequence to write.</param>
/// <param name="sw">Destination writer.</param>
/// <param name="posAsPaired">
/// When true, the header line is the read name followed by a space and the read position;
/// otherwise the header holds only the read name.
/// </param>
public static void WriteFastq(this SAMAlignedSequence sam, StreamWriter sw, bool posAsPaired = false)
{
    string header = posAsPaired
        ? string.Format("@{0} {1}", sam.QName, sam.Pos)
        : "@" + sam.QName;

    sw.WriteLine(header);
    sw.WriteLine(sam.GetQuerySequenceString());
    sw.WriteLine("+");
    sw.WriteLine(sam.GetQualityScoresString());
}
/// <summary>
/// Parses one SAM alignment line into an aligned sequence.
/// </summary>
/// <param name="bioText">Tab-delimited alignment line.</param>
/// <param name="alphabet">Alphabet of the sequences.</param>
/// <param name="referenceSequences">
/// Optional reference sequences; the one whose ID matches the read's reference name
/// (ignoring case) is handed to the sequence/quality parser.
/// </param>
/// <returns>The populated aligned sequence, including its optional fields.</returns>
/// <exception cref="FormatException">
/// Thrown when the line has fewer than the eleven mandatory fields, when a numeric field is
/// malformed, or when an optional field does not match the expected pattern.
/// </exception>
private static SAMAlignedSequence ParseSequence(string bioText, IAlphabet alphabet, IList<ISequence> referenceSequences)
{
    const int optionalTokenStartingIndex = 11;
    string[] tokens = bioText.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

    // Fail with a descriptive error instead of an IndexOutOfRangeException on truncated lines.
    if (tokens.Length < optionalTokenStartingIndex)
    {
        throw new FormatException(string.Format(
            CultureInfo.InvariantCulture,
            "SAM alignment line has {0} fields; at least {1} mandatory fields are required.",
            tokens.Length,
            optionalTokenStartingIndex));
    }

    SAMAlignedSequence alignedSeq = new SAMAlignedSequence();
    alignedSeq.QName = tokens[0];
    alignedSeq.Flag = SAMAlignedSequenceHeader.GetFlag(tokens[1]);
    alignedSeq.RName = tokens[2];
    alignedSeq.Pos = int.Parse(tokens[3], CultureInfo.InvariantCulture);
    alignedSeq.MapQ = int.Parse(tokens[4], CultureInfo.InvariantCulture);
    alignedSeq.CIGAR = tokens[5];

    // "=" in the mate reference column means "same reference as this read".
    alignedSeq.MRNM = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
    alignedSeq.MPos = int.Parse(tokens[7], CultureInfo.InvariantCulture);
    alignedSeq.ISize = int.Parse(tokens[8], CultureInfo.InvariantCulture);

    ISequence refSeq = null;
    if (referenceSequences != null && referenceSequences.Count > 0)
    {
        refSeq = referenceSequences.FirstOrDefault(
            R => string.Compare(R.ID, alignedSeq.RName, StringComparison.OrdinalIgnoreCase) == 0);
    }

    ParseQualityNSequence(alignedSeq, alphabet, tokens[9], tokens[10], refSeq);

    for (int i = optionalTokenStartingIndex; i < tokens.Length; i++)
    {
        if (!Helper.IsValidRegexValue(OptionalFieldRegex, tokens[i]))
        {
            string message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.InvalidOptionalField, tokens[i]);
            throw new FormatException(message);
        }

        // Split into at most three parts (tag, type, value) so that a value which itself
        // contains ':' is kept whole instead of being cut at its first colon.
        string[] opttokens = tokens[i].Split(colonDelim, 3, StringSplitOptions.RemoveEmptyEntries);

        SAMOptionalField optField = new SAMOptionalField();
        optField.Tag = opttokens[0];
        optField.VType = opttokens[1];
        optField.Value = opttokens[2];
        alignedSeq.OptionalFields.Add(optField);
    }

    return alignedSeq;
}
/// <summary>
/// Classifies a pair of reads using the insert statistics of a named clone library.
/// </summary>
/// <param name="read1">First aligned sequence.</param>
/// <param name="read2">Second aligned sequence.</param>
/// <param name="libraryName">Name used to look the library up in the CloneLibrary singleton.</param>
/// <returns>The paired read type computed by the overload that takes the library information.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="libraryName"/> is null or empty.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when no library information is found for the name.</exception>
public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, string libraryName)
{
    if (string.IsNullOrEmpty(libraryName))
    {
        throw new ArgumentNullException("libraryName");
    }

    CloneLibraryInformation library = CloneLibrary.Instance.GetLibraryInformation(libraryName);
    if (library != null)
    {
        return GetPairedReadType(read1, read2, library);
    }

    throw new ArgumentOutOfRangeException("libraryName");
}
/// <summary>
/// Computes the insert length of a read pair from the ISize values of the two reads.
/// </summary>
/// <param name="read1">First read.</param>
/// <param name="read2">Second read; when null the result is 0.</param>
/// <param name="validate">
/// When true, the pair is classified first and 0 is returned unless the type is
/// Normal or LengthAnomaly.
/// </param>
/// <returns>
/// The magnitude of <c>read1.ISize</c> when it is the exact negation of <c>read2.ISize</c>; otherwise 0.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="read1"/> is null.</exception>
public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2, bool validate)
{
    // reference chromosome
    // 5'                   -->                    3'
    // --------------------------------------------- F strand
    // 3'                   <--                    5'
    // --------------------------------------------- R strand
    //    read1                            read2
    //    5'        3'                3'        5'
    //       -->                          <--
    //    |--------------      --------------|
    //    |<---------- insert length ------->|
    if (read1 == null)
    {
        throw new ArgumentNullException("read1");
    }

    if (read2 == null)
    {
        return 0;
    }

    if (validate)
    {
        PairedReadType pairType = GetPairedReadType(read1, read2, 0, 0);
        bool usable = pairType == PairedReadType.Normal || pairType == PairedReadType.LengthAnomaly;
        if (!usable)
        {
            return 0;
        }
    }

    // The two reads must report mirrored sizes, otherwise no insert length is reported.
    if (read1.ISize != -read2.ISize)
    {
        return 0;
    }

    if (read1.ISize >= 0)
    {
        return read1.ISize;
    }

    return -read1.ISize;
}
/// <summary>
/// Looks up the value of the first optional field that matches both the tag and the value type.
/// </summary>
/// <param name="sam">Aligned sequence whose optional fields are searched.</param>
/// <param name="tag">Tag of the optional field to find.</param>
/// <param name="vtype">Value type of the optional field to find.</param>
/// <param name="throwException">When true, a missing field raises an exception; when false, null is returned.</param>
/// <param name="parserName">Optional parser name appended to the error message.</param>
/// <returns>The matching field's value, or null when nothing matches and <paramref name="throwException"/> is false.</returns>
/// <exception cref="Exception">Thrown when no field matches and <paramref name="throwException"/> is true.</exception>
public static string GetOptionValue(this SAMAlignedSequence sam, string tag, string vtype, bool throwException = true, string parserName = null)
{
    // The static string.Equals is an ordinal comparison that tolerates null operands, so a field
    // created without a Tag or VType no longer raises NullReferenceException during the search.
    var field = sam.OptionalFields.FirstOrDefault(m => string.Equals(m.Tag, tag) && string.Equals(m.VType, vtype));
    if (field != null)
    {
        return field.Value;
    }

    if (!throwException)
    {
        return null;
    }

    string parserSuffix = parserName == null ? "" : " by parser " + parserName;
    throw new Exception(string.Format("data error, cannot find {0}:{1}:XXX value in query {2}{3}.", tag, vtype, sam.QName, parserSuffix));
}
/// <summary>
/// Reads FASTQ records from a stream and wraps each one in a PacBioCCSRead.
/// </summary>
/// <param name="stream">Stream handed to the FASTQ parser.</param>
/// <returns>
/// A deferred sequence of reads. Each read's ID is split on '/'; the first part becomes the movie
/// name and the second part is converted to the hole number.
/// </returns>
public IEnumerable<ISequence> Parse(Stream stream)
{
    // Fixed tag/value pairs attached to every wrapped record before it is handed to PacBioCCSRead.
    string[][] placeholderFields =
    {
        new[] { "sn", "f,0,0,0,0" },
        new[] { "rs", "f,0,0,0,0,0,0" },
        new[] { "zs", "f,0,0,0,0,0,0" }
    };

    FastQParser fastqParser = new FastQParser();
    foreach (var seq in fastqParser.Parse(stream))
    {
        var nameParts = seq.ID.Split('/');
        var movieName = nameParts[0];
        var holeText = nameParts[1];

        SAMAlignedSequence sam = new SAMAlignedSequence();
        sam.QuerySequence = seq;
        foreach (string[] pair in placeholderFields)
        {
            sam.OptionalFields.Add(new SAMOptionalField() { Tag = pair[0], Value = pair[1] });
        }

        PacBioCCSRead read = new PacBioCCSRead(sam)
        {
            AvgZscore = Single.NaN,
            HoleNumber = Convert.ToInt32(holeText),
            Movie = movieName
        };

        yield return read;
    }
}
/// <summary>
/// Parses one SAM alignment line into an aligned sequence.
/// </summary>
/// <param name="bioText">Tab-delimited alignment line.</param>
/// <param name="alphabet">Alphabet of the sequences.</param>
/// <returns>The populated aligned sequence, including its optional fields.</returns>
/// <exception cref="FormatException">
/// Thrown when the line has fewer than the eleven mandatory fields, when a numeric field is
/// malformed, or when an optional field does not match the expected pattern.
/// </exception>
public static SAMAlignedSequence ParseSequence(string bioText, IAlphabet alphabet)
{
    const int optionalTokenStartingIndex = 11;
    string[] tokens = bioText.Split(TabDelim, StringSplitOptions.RemoveEmptyEntries);

    // SAM numbers are machine-readable text, so they are parsed independently of the
    // thread's current culture.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Fail with a descriptive error instead of an IndexOutOfRangeException on truncated lines.
    if (tokens.Length < optionalTokenStartingIndex)
    {
        throw new FormatException(string.Format(
            invariant,
            "SAM alignment line has {0} fields; at least {1} mandatory fields are required.",
            tokens.Length,
            optionalTokenStartingIndex));
    }

    SAMAlignedSequence alignedSeq = new SAMAlignedSequence
    {
        QName = tokens[0],
        Flag = SAMAlignedSequenceHeader.GetFlag(tokens[1]),
        RName = tokens[2],
        Pos = int.Parse(tokens[3], invariant),
        MapQ = int.Parse(tokens[4], invariant),
        CIGAR = tokens[5]
    };

    // "=" in the mate reference column means "same reference as this read".
    alignedSeq.MRNM = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
    alignedSeq.MPos = int.Parse(tokens[7], invariant);
    alignedSeq.ISize = int.Parse(tokens[8], invariant);

    ParseQualityNSequence(alignedSeq, alphabet, tokens[9], tokens[10]);

    for (int i = optionalTokenStartingIndex; i < tokens.Length; i++)
    {
        if (!Helper.IsValidRegexValue(OptionalFieldRegex, tokens[i]))
        {
            throw new FormatException(string.Format(Properties.Resource.InvalidOptionalField, tokens[i]));
        }

        // Split into at most three parts (tag, type, value) so that a value which itself
        // contains ':' is kept whole instead of being cut at its first colon.
        string[] opttokens = tokens[i].Split(ColonDelim, 3, StringSplitOptions.RemoveEmptyEntries);

        SAMOptionalField optField = new SAMOptionalField();
        optField.Tag = opttokens[0];
        optField.VType = opttokens[1];
        optField.Value = opttokens[2];
        alignedSeq.OptionalFields.Add(optField);
    }

    return alignedSeq;
}
/// <summary>
/// Formats an aligned sequence as a single tab-separated SAM text line.
/// </summary>
/// <param name="sam">Aligned sequence to format.</param>
/// <returns>
/// Null when <paramref name="sam"/> is null; otherwise the eleven leading columns followed by the
/// optional fields, each rendered as TAG:TYPE:VALUE and merged with tabs.
/// </returns>
public string SAMToString(SAMAlignedSequence sam)
{
    if (sam == null)
    {
        return null;
    }

    // Columns are evaluated in output order; the flag is written as its integer value.
    object[] columns = new object[]
    {
        sam.QName,
        (int)sam.Flag,
        sam.RName,
        sam.Pos,
        sam.MapQ,
        sam.CIGAR,
        sam.MRNM,
        sam.MPos,
        sam.ISize,
        sam.GetQuerySequenceString(),
        sam.GetQualityScoresString(),
        sam.OptionalFields.Select(of => string.Format("{0}:{1}:{2}", of.Tag, of.VType, of.Value)).Merge("\t")
    };

    return string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}", columns);
}
/// <summary>
/// Packs the read's query sequence into 4-bit codes, two symbols per byte.
/// </summary>
/// <param name="alignedSeq">Aligned sequence whose query sequence is encoded.</param>
/// <returns>
/// The packed bytes; empty when the read has no query sequence. The earlier symbol of each pair
/// occupies the high-order four bits.
/// </returns>
/// <exception cref="ArgumentException">Thrown when the sequence alphabet is not a DnaAlphabet.</exception>
private static byte[] GetEncodedSequence(SAMAlignedSequence alignedSeq)
{
    // A symbol's position in this string is its 4-bit code (=ACMGRSVTWYHKDBN -> 0-15).
    const string CodeTable = "=ACMGRSVTWYHKDBN";

    // Symbols outside the table (for example '.') are stored with the code of 'N'.
    const byte FallbackCode = 15;

    List<byte> packed = new List<byte>();
    ISequence seq = alignedSeq.QuerySequence;
    if (seq == null)
    {
        return packed.ToArray();
    }

    if (!(seq.Alphabet is DnaAlphabet))
    {
        throw new ArgumentException(Properties.Resource.BAMFormatterSupportsDNAOnly);
    }

    byte[] symbolMap = seq.Alphabet.GetSymbolValueMap();
    for (int i = 0; i < seq.Count; i++)
    {
        char symbol = (char)symbolMap[seq[i]];
        int tableIndex = CodeTable.IndexOf(symbol);
        byte code = tableIndex < 0 ? FallbackCode : (byte)tableIndex;

        bool startsNewByte = i % 2 == 0;
        if (startsNewByte)
        {
            // Even positions open a new byte and fill its high nibble.
            packed.Add((byte)(code << 4));
        }
        else
        {
            // Odd positions complete the byte opened by the previous symbol.
            int last = packed.Count - 1;
            packed[last] = (byte)(packed[last] | code);
        }
    }

    return packed.ToArray();
}
// Computes the block size needed for the given aligned sequence: a fixed 32 bytes, the read name
// plus one extra byte, four bytes per CIGAR length unit, half a byte per base (rounded up), one
// further byte per base, and the auxiliary data length.
private int GetBlockSize(SAMAlignedSequence alignedSeq)
{
    const int FixedPartLength = 32;

    int nameLength = alignedSeq.QName.Length + 1;
    int cigarBytes = GetCIGARLength(alignedSeq.CIGAR) * 4;
    int baseCount = (int)alignedSeq.QuerySequence.Count;
    int packedBaseBytes = (baseCount + 1) / 2;
    int auxiliaryBytes = GetAuxiliaryDataLength(alignedSeq);

    return FixedPartLength + nameLength + cigarBytes + packedBaseBytes + baseCount + auxiliaryBytes;
}
/// <summary>
/// Walks the read's CIGAR string and emits one entry per aligned, inserted or deleted base.
/// </summary>
/// <param name="seq">Aligned read; its query sequence is used as a QualitativeSequence.</param>
/// <returns>
/// Entries holding reference position, insertion offset, base and log10 error probability.
/// Deleted bases are emitted as '-' with a NaN probability.
/// </returns>
/// <exception cref="Exception">Thrown for CIGAR operations P and N, which are not handled.</exception>
/// <exception cref="FormatException">Thrown for an unrecognised CIGAR operation.</exception>
List<BaseAndQualityAndPosition> getBasesForSequence(SAMAlignedSequence seq)
{
    var emitted = new List<BaseAndQualityAndPosition>(seq.RefEndPos - seq.Pos + 10);

    // Locate every operation character in the CIGAR string; the digits between two
    // operation characters form the length of the later one.
    // TODO: This code is duplicated in many places
    string cigar = seq.CIGAR;
    var operations = new List<KeyValuePair<char, int>>();
    int scan = 0;
    while (scan < cigar.Length)
    {
        char candidate = cigar[scan];
        if (!Char.IsDigit(candidate))
        {
            operations.Add(new KeyValuePair<char, int>(candidate, scan));
        }

        scan++;
    }

    // Bases and per-base error probabilities of the read.
    var qualitative = seq.QuerySequence as QualitativeSequence;
    var log10ErrorProbs = qualitative.GetPhredQualityScores().Select(Utils.GetLog10ErrorProbability).ToArray();
    var bases = qualitative.ToArray();

    int refPos = seq.Pos;
    int queryPos = 0;
    int lengthStart = 0; // index of the first digit of the current operation's length
    foreach (KeyValuePair<char, int> operation in operations)
    {
        char op = operation.Key;
        int digitCount = operation.Value - lengthStart;
        int opLength = int.Parse(cigar.Substring(lengthStart, digitCount));
        lengthStart = operation.Value + 1;

        switch (op)
        {
            case 'P': // padding (silent deletions from padded reference)
            case 'N': // skipped region from reference
                throw new Exception("Pile up methods not built to handle reference clipping (Cigar P or N) yet.");

            case 'M': // match or mismatch
            case '=': // match
            case 'X': // mismatch
                for (int step = 0; step < opLength; step++)
                {
                    var aligned = new BaseAndQuality(bases[queryPos], log10ErrorProbs[queryPos]);
                    emitted.Add(new BaseAndQualityAndPosition(refPos, 0, aligned));
                    queryPos++;
                    refPos++;
                }

                break;

            case 'I': // insertion to the reference: consumes query bases only
                for (int step = 0; step < opLength; step++)
                {
                    var inserted = new BaseAndQuality(bases[queryPos], log10ErrorProbs[queryPos]);
                    emitted.Add(new BaseAndQualityAndPosition(refPos, step, inserted));
                    queryPos++;
                }

                break;

            case 'D': // deletion from the reference: consumes reference positions only
                for (int step = 0; step < opLength; step++)
                {
                    var gap = new BaseAndQuality((byte)'-', Double.NaN);
                    emitted.Add(new BaseAndQualityAndPosition(refPos, step, gap));
                    refPos++;
                }

                break;

            case 'S': // soft clipped: skip the clipped query bases
                queryPos += opLength;
                break;

            case 'H': // hard clipped: nothing to skip or emit
                break;

            default:
                throw new FormatException("Unexpected SAM Cigar element found " + op.ToString());
        }
    }

    return emitted;
}
/// <summary>
/// Gets the paired reads when DV is enabled.
/// </summary>
/// <param name="meanLengthOfInsert">Mean of the insert length.</param>
/// <param name="standardDeviationOfInsert">Standard deviation of insert length.</param>
/// <param name="calculate">
/// When set, the insert lengths of Normal and LengthAnomaly pairs are accumulated and, if at
/// least one pair was counted, passed to UpdateType together with their count.
/// </param>
/// <returns>List of paired reads, one entry per read name that carries the PairedRead flag.</returns>
private IList<PairedRead> GetDVAwarePairedReads(float meanLengthOfInsert, float standardDeviationOfInsert, bool calculate = false)
{
    // The dictionary lets all pairing information be gathered in one pass over the reads.
    Dictionary<string, DVEnabledPairedRead> pairedReads = new Dictionary<string, DVEnabledPairedRead>();
    double sum = 0;
    int count = 0;

    for (int i = 0; i < QuerySequences.Count; i++)
    {
        SAMAlignedSequence read = QuerySequences[i];
        if ((read.Flag & SAMFlags.PairedRead) != SAMFlags.PairedRead)
        {
            continue;
        }

        DVEnabledPairedRead pairedRead;
        if (!pairedReads.TryGetValue(read.QName, out pairedRead))
        {
            // First read seen with this name: a read with a reference name takes the first
            // slot, a read without one takes the second slot.
            pairedRead = new DVEnabledPairedRead(QuerySequences);
            if (!string.IsNullOrEmpty(read.RName) && !read.RName.Equals("*"))
            {
                pairedRead.Index1 = i;
            }
            else
            {
                pairedRead.Index2 = i;
            }

            pairedRead.PairedType = PairedReadType.Orphan;
            pairedRead.InsertLength = 0;
            pairedReads.Add(read.QName, pairedRead);
            continue;
        }

        if (pairedRead.Index2 == -1 || pairedRead.Index1 == -1)
        {
            // Second read of the pair: fill whichever slot is still empty.
            if (pairedRead.Index2 == -1)
            {
                pairedRead.Index2 = i;
            }
            else
            {
                pairedRead.Index1 = i;
            }

            // For best performance,
            // 1. BAM/SAM file should be sorted by reads name.
            // 2. If sorted on mapping position then give unmapped read a coordinate (generally
            //    the coordinate of the mapped mate) for sorting/indexing purposes only.
            pairedRead.PairedType = PairedRead.GetPairedReadType(pairedRead.Read1, pairedRead.Read2, meanLengthOfInsert, standardDeviationOfInsert);
            if (pairedRead.PairedType == PairedReadType.Normal || pairedRead.PairedType == PairedReadType.LengthAnomaly)
            {
                pairedRead.InsertLength = PairedRead.GetInsertLength(pairedRead.Read1, pairedRead.Read2);
                if (calculate)
                {
                    sum += pairedRead.InsertLength;
                    count++;
                }
            }

            continue;
        }

        // Third or later read with the same name. If this pair contributed to the running
        // totals, withdraw that contribution exactly once and BEFORE the length is cleared;
        // once the type is MultipleHits, further hits leave the totals alone.
        bool wasCounted = calculate
            && (pairedRead.PairedType == PairedReadType.Normal || pairedRead.PairedType == PairedReadType.LengthAnomaly);
        if (wasCounted)
        {
            sum -= pairedRead.InsertLength;
            count--;
        }

        pairedRead.InsertLength = 0;
        pairedRead.ReadIndexes.Add(i);
        pairedRead.PairedType = PairedReadType.MultipleHits;
    }

    List<PairedRead> allreads = pairedReads.Values.ToList<PairedRead>();
    if (calculate && count > 0)
    {
        UpdateType(allreads, sum, count);
    }

    return allreads;
}
/// <summary>
/// Parses sequence data and quality values and updates the SAMAlignedSequence instance.
/// </summary>
/// <param name="alignedSeq">SAM aligned sequence to update.</param>
/// <param name="alphabet">Alphabet of the sequence to be created.</param>
/// <param name="Encoding">Encoding to use while creating the sequence; may be null.</param>
/// <param name="sequencedata">Sequence data; "*" leaves the read untouched.</param>
/// <param name="qualitydata">Quality values; "*" produces a sequence without qualities.</param>
/// <param name="refSeq">Reference sequence; required when the sequence data contains "=".</param>
/// <param name="isReadOnly">Value assigned to the IsReadOnly property of the new sequence.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="alignedSeq"/> is null or when sequence or quality data is null or blank.
/// </exception>
/// <exception cref="FileFormatException">Thrown when sequence and quality lengths differ.</exception>
/// <exception cref="ArgumentException">Thrown when "=" symbols are present but no reference sequence was given.</exception>
public static void ParseQualityNSequence(SAMAlignedSequence alignedSeq, IAlphabet alphabet, IEncoding Encoding, string sequencedata, string qualitydata, ISequence refSeq, bool isReadOnly)
{
    if (alignedSeq == null)
    {
        throw new ArgumentNullException("alignedSeq");
    }

    if (string.IsNullOrWhiteSpace(sequencedata))
    {
        throw new ArgumentNullException("sequencedata");
    }

    if (string.IsNullOrWhiteSpace(qualitydata))
    {
        throw new ArgumentNullException("qualitydata");
    }

    FastQFormatType fastQType = QualityFormatType;

    if (sequencedata.Equals("*"))
    {
        return;
    }

    bool isQualitativeSequence = !qualitydata.Equals("*");
    byte[] qualScores = null;

    if (isQualitativeSequence)
    {
        // The parameter named Encoding hides the System.Text.Encoding type here, hence ASCIIEncoding.
        qualScores = ASCIIEncoding.ASCII.GetBytes(qualitydata);

        // Every symbol needs exactly one quality score.
        if (sequencedata.Length != qualitydata.Length)
        {
            string message1 = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoresLength, alignedSeq.QName);
            string message = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Resource.SAM_NAME, message1);
            Trace.Report(message);
            throw new FileFormatException(message);
        }
    }

    // Record the positions of "." symbols. Each search resumes one past the previous hit;
    // resuming AT the hit would find the same position again and never terminate.
    for (int index = sequencedata.IndexOf('.'); index > -1; index = sequencedata.IndexOf('.', index + 1))
    {
        alignedSeq.DotSymbolIndexes.Add(index);
    }

    // Replace "." with N.
    if (alignedSeq.DotSymbolIndexes.Count > 0)
    {
        sequencedata = sequencedata.Replace('.', 'N');
    }

    // Record the positions of "=" symbols, again resuming one past each hit.
    for (int index = sequencedata.IndexOf('='); index > -1; index = sequencedata.IndexOf('=', index + 1))
    {
        alignedSeq.EqualSymbolIndexes.Add(index);
    }

    // Replace "=" with the corresponding symbol from refSeq.
    if (alignedSeq.EqualSymbolIndexes.Count > 0)
    {
        if (refSeq == null)
        {
            throw new ArgumentException(Resource.RefSequenceNofFound);
        }

        // NOTE(review): refSeq is indexed with the offset inside the read, not with an
        // alignment position - confirm that callers pass a reference already aligned to the read.
        for (int i = 0; i < alignedSeq.EqualSymbolIndexes.Count; i++)
        {
            int index = alignedSeq.EqualSymbolIndexes[i];
            sequencedata = sequencedata.Remove(index, 1);
            sequencedata = sequencedata.Insert(index, refSeq[index].Symbol.ToString());
        }
    }

    ISequence sequence;
    if (isQualitativeSequence)
    {
        QualitativeSequence qualSeq = Encoding == null
            ? new QualitativeSequence(alphabet, fastQType, sequencedata, qualScores)
            : new QualitativeSequence(alphabet, fastQType, Encoding, sequencedata, qualScores);

        qualSeq.ID = alignedSeq.QName;
        qualSeq.IsReadOnly = isReadOnly;
        sequence = qualSeq;
    }
    else
    {
        Sequence seq = Encoding == null
            ? new Sequence(alphabet, sequencedata)
            : new Sequence(alphabet, Encoding, sequencedata);

        seq.ID = alignedSeq.QName;
        seq.IsReadOnly = isReadOnly;
        sequence = seq;
    }

    alignedSeq.QuerySequence = sequence;
}
/// <summary>
/// Decides whether an aligned sequence passes all user-defined filter options.
/// </summary>
/// <param name="alignedSequence">Aligned sequence to test.</param>
/// <returns>
/// True when the read satisfies every active option: required flag bits, excluded flag bits,
/// minimum mapping quality, library, read group and region. Options left at 0 or empty are skipped.
/// </returns>
private bool Filter(SAMAlignedSequence alignedSequence)
{
    int flag = (int)alignedSequence.Flag;

    // All required bits must be present.
    if (FlagRequired != 0 && (flag & FlagRequired) != FlagRequired)
    {
        return false;
    }

    // None of the excluded bits may be present.
    if (FilteringFlag != 0 && (flag & FilteringFlag) != 0)
    {
        return false;
    }

    // The option is a minimum: reads at or above the threshold pass, not only exact matches.
    if (QualityMinimumMapping != 0 && alignedSequence.MapQ < QualityMinimumMapping)
    {
        return false;
    }

    if (!string.IsNullOrEmpty(Library) && rgRecFields.Count > 0)
    {
        // A read without an RG tag, an RG value with no matching header record, or a record
        // without an LB tag cannot belong to the requested library; reject the read instead
        // of throwing from First().
        var readGroupField = alignedSequence.OptionalFields.FirstOrDefault(c => c.Tag.Equals("RG"));
        if (readGroupField == null)
        {
            return false;
        }

        var headerRecord = rgRecFields.FirstOrDefault(a =>
        {
            var idTag = a.Tags.FirstOrDefault(b => b.Tag.Equals("ID"));
            return idTag != null && idTag.Value.Equals(readGroupField.Value);
        });
        if (headerRecord == null)
        {
            return false;
        }

        var libraryTag = headerRecord.Tags.FirstOrDefault(d => d.Tag.Equals("LB"));
        if (libraryTag == null || !libraryTag.Value.Equals(Library))
        {
            return false;
        }
    }

    if (!string.IsNullOrEmpty(ReadGroup))
    {
        // The tag name is matched ignoring case, the value exactly.
        bool inReadGroup = alignedSequence.OptionalFields.Any(
            o => string.Equals(o.Tag, "RG", System.StringComparison.OrdinalIgnoreCase)
                 && string.Equals(o.Value, ReadGroup));
        if (!inReadGroup)
        {
            return false;
        }
    }

    if (!string.IsNullOrEmpty(Region))
    {
        if (!alignedSequence.RName.Equals(region.Chromosome))
        {
            return false;
        }

        // A negative start disables the positional check altogether (the end is then ignored
        // as well); a negative end leaves the region open-ended.
        if (region.Start > -1)
        {
            if (alignedSequence.Pos < region.Start)
            {
                return false;
            }

            if (region.End > -1 && alignedSequence.Pos > region.End)
            {
                return false;
            }
        }
    }

    return true;
}
/// <summary>
/// Determines the paired read type, taking insert statistics from the clone library with the given name.
/// </summary>
/// <param name="read1">First aligned sequence.</param>
/// <param name="read2">Second aligned sequence.</param>
/// <param name="libraryName">Library name resolved through CloneLibrary.Instance.</param>
/// <returns>The result of the overload that accepts the resolved library information.</returns>
/// <exception cref="ArgumentNullException">Thrown when the library name is null or empty.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when the name resolves to no library information.</exception>
public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, string libraryName)
{
    bool nameMissing = string.IsNullOrEmpty(libraryName);
    if (nameMissing)
    {
        throw new ArgumentNullException("libraryName");
    }

    CloneLibraryInformation resolved = CloneLibrary.Instance.GetLibraryInformation(libraryName);
    if (resolved == null)
    {
        throw new ArgumentOutOfRangeException("libraryName");
    }

    PairedReadType pairType = GetPairedReadType(read1, read2, resolved);
    return pairType;
}
/// <summary>
/// Computes the insert length of a read pair without validating the pair first.
/// </summary>
/// <param name="read1">First read.</param>
/// <param name="read2">Second read.</param>
/// <returns>The result of the three-argument overload called with validation switched off.</returns>
public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2)
{
    const bool validatePair = false;
    return GetInsertLength(read1, read2, validatePair);
}
/// <summary>
/// Builds a SequenceRange that describes where the given read aligns on its reference sequence.
/// </summary>
/// <param name="alignedSequence">Aligned sequence (read).</param>
/// <returns>A range named after the read's reference, spanning Pos to RefEndPos.</returns>
private static ISequenceRange GetRegion(SAMAlignedSequence alignedSequence)
{
    long rangeStart = alignedSequence.Pos;
    long rangeEnd = alignedSequence.RefEndPos;

    ISequenceRange region = new SequenceRange(alignedSequence.RName, rangeStart, rangeEnd);
    return region;
}
/// <summary>
/// Determines the paired read type from the insert statistics carried by a clone library description.
/// </summary>
/// <param name="read1">First aligned sequence.</param>
/// <param name="read2">Second aligned sequence.</param>
/// <param name="libraryInfo">Source of the mean insert length and its standard deviation.</param>
/// <returns>The result of the overload that accepts the two statistics directly.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="libraryInfo"/> is null.</exception>
public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, CloneLibraryInformation libraryInfo)
{
    if (libraryInfo != null)
    {
        return GetPairedReadType(
            read1,
            read2,
            libraryInfo.MeanLengthOfInsert,
            libraryInfo.StandardDeviationOfInsert);
    }

    throw new ArgumentNullException("libraryInfo");
}
/// <summary>
/// Tells whether the given read is reverse oriented.
/// </summary>
/// <param name="read">Aligned sequence to inspect.</param>
/// <returns>True exactly when IsForwardRead returns false for the read.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="read"/> is null.</exception>
public static bool IsReverseRead(SAMAlignedSequence read)
{
    if (read == null)
    {
        throw new ArgumentNullException("read");
    }

    if (IsForwardRead(read))
    {
        return false;
    }

    return true;
}
/// <summary>
/// Tells whether the given read is forward oriented.
/// </summary>
/// <param name="read">Aligned sequence to inspect.</param>
/// <returns>True when the read's flag does not contain the QueryOnReverseStrand bit.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="read"/> is null.</exception>
public static bool IsForwardRead(SAMAlignedSequence read)
{
    if (read != null)
    {
        // Mask out everything except the strand bit and test that nothing remains.
        bool strandBitClear = (read.Flag & SAMFlags.QueryOnReverseStrand) == 0;
        return strandBitClear;
    }

    throw new ArgumentNullException("read");
}
/// <summary>
/// Derives the insert length of a read pair from the two reads' ISize values.
/// </summary>
/// <param name="read1">First read.</param>
/// <param name="read2">Second read; a null value yields 0.</param>
/// <param name="validate">
/// When true, the pair type is determined first and 0 is returned for every type other than
/// Normal and LengthAnomaly.
/// </param>
/// <returns>
/// The magnitude of the first read's ISize when the second read's ISize is its exact negation; otherwise 0.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="read1"/> is null.</exception>
public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2, bool validate)
{
    // The insert spans from the 5' end of the forward read to the 5' end of the reverse read:
    //
    //    read1 (5' --> 3')                (3' <-- 5') read2
    //    |--------------                --------------|
    //    |<--------------- insert length ------------>|
    if (read1 == null)
    {
        throw new ArgumentNullException("read1");
    }

    if (read2 == null)
    {
        return 0;
    }

    if (validate)
    {
        switch (GetPairedReadType(read1, read2, 0, 0))
        {
            case PairedReadType.Normal:
            case PairedReadType.LengthAnomaly:
                break;
            default:
                return 0;
        }
    }

    bool sizesMirrorEachOther = read1.ISize == -read2.ISize;
    if (!sizesMirrorEachOther)
    {
        return 0;
    }

    return read1.ISize >= 0 ? read1.ISize : -read1.ISize;
}
// Sums the length of all optional fields of an aligned sequence: three bytes per field plus the
// magnitude of the size reported by GetOptionalFieldValueSize. A reported size of zero is
// rejected with a FormatException that names the value, the tag and the read.
private static int GetAuxiliaryDataLength(SAMAlignedSequence alignedSeq)
{
    const int PerFieldOverhead = 3;
    int total = 0;

    foreach (SAMOptionalField field in alignedSeq.OptionalFields)
    {
        int reportedSize = GetOptionalFieldValueSize(field);
        if (reportedSize == 0)
        {
            throw new FormatException(string.Format(
                CultureInfo.InvariantCulture,
                Properties.Resource.BAM_InvalidIntValueInOptFieldOfAlignedSeq,
                field.Value,
                field.Tag,
                alignedSeq.QName));
        }

        // The helper may report a negative size; only its magnitude counts towards the length.
        int magnitude = reportedSize < 0 ? -reportedSize : reportedSize;
        total += PerFieldOverhead;
        total += magnitude;
    }

    return total;
}
/// <summary>
/// Registers the read's reference in the filtered output header: once as reference sequence
/// information and once as an SQ record with SN (name) and LN (length) tags.
/// </summary>
/// <param name="seq">Read whose reference name and sequence length are recorded.</param>
private void AddToHeader(SAMAlignedSequence seq)
{
    newHeader.ReferenceSequences.Add(new ReferenceSequenceInfo(seq.RName, GetSequence(seq).Length));

    // for each good cluster
    SAMRecordField sequenceRecord = new SAMRecordField("SQ");
    SAMRecordFieldTag nameTag = new SAMRecordFieldTag("SN", seq.RName);
    sequenceRecord.Tags.Add(nameTag);

    string lengthText = GetSequence(seq).Length.ToString(ci);
    SAMRecordFieldTag lengthTag = new SAMRecordFieldTag("LN", lengthText);
    sequenceRecord.Tags.Add(lengthTag);

    newHeader.RecordFields.Add(sequenceRecord);
}
/// <summary>
/// Classifies a pair of reads from their mapping, orientation, positions and insert length.
/// </summary>
/// <param name="read1">First aligned sequence.</param>
/// <param name="read2">Second aligned sequence; null yields Orphan.</param>
/// <param name="meanLengthOfInsert">Mean of the insertion length.</param>
/// <param name="standardDeviationOfInsert">Standard deviation of insertion length.</param>
/// <returns>
/// Orphan when the second read has no reference or is flagged unmapped; Chimera when the reads map
/// to different references; StructuralAnomaly when both reads share an orientation or the forward
/// read starts after the reverse read; LengthAnomaly when the insert length lies outside
/// mean ± 3 standard deviations; Normal otherwise.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="read1"/> is null.</exception>
public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, float meanLengthOfInsert, float standardDeviationOfInsert)
{
    if (read1 == null)
    {
        throw new ArgumentNullException("read1");
    }

    if (read2 == null)
    {
        return PairedReadType.Orphan;
    }

    bool mateHasNoMapping = string.IsNullOrEmpty(read2.RName)
        || read2.RName.Equals("*")
        || (read2.Flag & SAMFlags.UnmappedQuery) == SAMFlags.UnmappedQuery;
    if (mateHasNoMapping)
    {
        return PairedReadType.Orphan;
    }

    if (!read2.RName.Equals(read1.RName))
    {
        return PairedReadType.Chimera;
    }

    // Both reads forward or both reverse.
    bool read1IsForward = IsForwardRead(read1);
    if (read1IsForward == IsForwardRead(read2))
    {
        return PairedReadType.StructuralAnomaly;
    }

    // Exactly one read is forward from here on.
    int forwardReadStartPos = read1IsForward ? read1.Pos : read2.Pos;
    int reverseReadStartPos = read1IsForward ? read2.Pos : read1.Pos;
    if (forwardReadStartPos > reverseReadStartPos)
    {
        return PairedReadType.StructuralAnomaly;
    }

    int insertLength = GetInsertLength(read1, read2);

    // µ + 3σ
    float upperLimit = meanLengthOfInsert + (3 * standardDeviationOfInsert);

    // µ - 3σ
    float lowerLimit = meanLengthOfInsert - (3 * standardDeviationOfInsert);

    if (insertLength > upperLimit || insertLength < lowerLimit)
    {
        return PairedReadType.LengthAnomaly;
    }

    return PairedReadType.Normal;
}
/// <summary>
/// Determines the type of a read pair by checking, in order, mapping, reference name,
/// orientation, start positions and insert length.
/// </summary>
/// <param name="read1">First aligned sequence.</param>
/// <param name="read2">Second aligned sequence; null yields Orphan.</param>
/// <param name="meanLengthOfInsert">Mean of the insertion length.</param>
/// <param name="standardDeviationOfInsert">Standard deviation of insertion length.</param>
/// <returns>
/// Orphan, Chimera, StructuralAnomaly or LengthAnomaly for the first check that fails;
/// Normal when every check passes.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="read1"/> is null.</exception>
public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, float meanLengthOfInsert, float standardDeviationOfInsert)
{
    if (read1 == null)
    {
        throw new ArgumentNullException("read1");
    }

    if (read2 == null)
    {
        return PairedReadType.Orphan;
    }

    PairedReadType result;
    if (string.IsNullOrEmpty(read2.RName)
        || read2.RName.Equals("*")
        || ((read2.Flag & SAMFlags.UnmappedQuery) == SAMFlags.UnmappedQuery))
    {
        // The mate has no usable mapping.
        result = PairedReadType.Orphan;
    }
    else if (!read2.RName.Equals(read1.RName))
    {
        // The reads map to different references.
        result = PairedReadType.Chimera;
    }
    else if (IsForwardRead(read1) == IsForwardRead(read2))
    {
        // Both reads point the same way.
        result = PairedReadType.StructuralAnomaly;
    }
    else
    {
        // Exactly one read is forward; it must not start after the reverse read.
        bool firstIsForward = IsForwardRead(read1);
        bool forwardStartsAfterReverse = firstIsForward
            ? read1.Pos > read2.Pos
            : read2.Pos > read1.Pos;

        if (forwardStartsAfterReverse)
        {
            result = PairedReadType.StructuralAnomaly;
        }
        else
        {
            int insertLength = GetInsertLength(read1, read2);

            // µ + 3σ
            float upperLimit = meanLengthOfInsert + (3 * standardDeviationOfInsert);

            // µ - 3σ
            float lowerLimit = meanLengthOfInsert - (3 * standardDeviationOfInsert);

            bool outsideLimits = insertLength > upperLimit || insertLength < lowerLimit;
            result = outsideLimits ? PairedReadType.LengthAnomaly : PairedReadType.Normal;
        }
    }

    return result;
}
/// <summary>
/// Writes the read as a two-line FASTA record: a header line made of ">" followed by the
/// query name, then the query sequence string.
/// </summary>
/// <param name="sam">Aligned sequence to write.</param>
/// <param name="sw">Writer that receives the record.</param>
public static void WriteFasta(this SAMAlignedSequence sam, StreamWriter sw)
{
    // Header line first; the sequence string is only computed after the header has been written.
    string headerLine = string.Concat(">", sam.QName);
    sw.WriteLine(headerLine);

    string bases = sam.GetQuerySequenceString();
    sw.WriteLine(bases);
}
/// <summary>
/// General method to validate that SAMParser.ParseQualityNSequence rejects invalid
/// sequence / quality input. The test fails unless an ArgumentException or a
/// FormatException is thrown.
/// </summary>
/// <param name="method">Enum value selecting which invalid input combination is exercised.</param>
private static void ValidateQualitySeqLength(ParseOrFormatQualLength method)
{
    SAMAlignedSequence align = new SAMAlignedSequence();

    try
    {
        switch (method)
        {
            case ParseOrFormatQualLength.AlignedSeq:
                // No query name, null sequence data, empty quality data.
                SAMParser.ParseQualityNSequence(align, Alphabets.DNA, null, String.Empty);
                break;

            case ParseOrFormatQualLength.Sequencedata:
                align.QName = "Quality Value";
                SAMParser.ParseQualityNSequence(align, Alphabets.DNA, null, String.Empty);
                break;

            // Both cases pass the same arguments: null sequence data with a quality string.
            case ParseOrFormatQualLength.Qualitydata:
            case ParseOrFormatQualLength.QualityLength:
                align.QName = "Quality Value";
                SAMParser.ParseQualityNSequence(align, Alphabets.DNA, null, Constants.QualitySequence);
                break;

            default:
                break;
        }

        // Reaching this point means no exception was raised.
        Assert.Fail();
    }
    catch (ArgumentException)
    {
        ApplicationLog.WriteLine("SAM Parser P2 : Successfully validated the exception");
    }
    catch (FormatException)
    {
        ApplicationLog.WriteLine("SAM Parser P2 : Successfully validated the exception");
    }
}
/// <summary>
/// Updates the linear index array from an aligned read and its file offset.
/// The read's start and end are converted to zero-based coordinates and shifted right by
/// 14 bits to obtain the range of index slots it covers; each slot that is still unset
/// (BothDataElements == 0) or holds a larger offset is set to <paramref name="offset"/>.
/// </summary>
/// <param name="alignedSeq">Aligned read whose Pos and RefEndPos define the covered slots.</param>
/// <param name="offset">File offset to record for the covered slots.</param>
internal void UpdateLinearArrayIndex(SAMAlignedSequence alignedSeq, FileOffset offset)
{
    // Convert to zero-based coordinates; non-positive values map to 0.
    int zeroBasedStart = 0;
    if (alignedSeq.Pos > 0)
    {
        zeroBasedStart = alignedSeq.Pos - 1;
    }

    int zeroBasedEnd = 0;
    if (alignedSeq.RefEndPos > 0)
    {
        zeroBasedEnd = alignedSeq.RefEndPos - 1;
    }

    // Each slot covers 2^14 positions.
    int firstSlot = zeroBasedStart >> 14;
    int lastSlot = zeroBasedEnd >> 14;

    if (lastSlot > largestBinSeen)
    {
        largestBinSeen = lastSlot;
    }

    for (int slot = firstSlot; slot <= lastSlot; slot++)
    {
        var existing = offSetArray[slot];

        // TODO: Is the second check necessary? Seems to always be true as reads are processed in order.
        if (existing.BothDataElements == 0 || existing > offset)
        {
            offSetArray[slot] = offset;
        }
    }
}
/// <summary>
/// Reads the next alignment record from the uncompressed BAM data and converts it into a
/// <see cref="SAMAlignedSequence"/>.
/// </summary>
/// <param name="start">Start of the region of interest; a record on a defined reference whose
/// zero-based reference end lies before this value is skipped.</param>
/// <param name="end">End of the region of interest; a record whose one-based position is greater
/// than this value is skipped.</param>
/// <returns>The parsed aligned sequence, or null when the record does not overlap the region.</returns>
private SAMAlignedSequence GetAlignedSequence(int start, int end)
{
    // Each record starts with a 4-byte block length followed by the block itself.
    byte[] array = new byte[4];
    ReadUnCompressedData(array, 0, 4);
    int blockLen = Helper.GetInt32(array, 0);
    byte[] alignmentBlock = new byte[blockLen];
    ReadUnCompressedData(alignmentBlock, 0, blockLen);

    SAMAlignedSequence alignedSeq = new SAMAlignedSequence();
    int value;
    UInt32 unsignedValue;

    // 0-4 bytes: reference sequence index, -1 when there is no reference.
    int refSeqIndex = Helper.GetInt32(alignmentBlock, 0);
    if (refSeqIndex == -1)
    {
        alignedSeq.RName = "*";
    }
    else
    {
        alignedSeq.RName = refSeqNames[refSeqIndex];
    }

    // 4-8 bytes: zero-based position in the file, exposed as one-based.
    alignedSeq.Pos = Helper.GetInt32(alignmentBlock, 4) + 1;

    // If there is no overlap there is no need to parse further.
    //     BAMPos > closedEnd
    // => (alignedSeq.Pos - 1) > end - 1
    if (alignedSeq.Pos > end)
    {
        return null;
    }

    // 8-12 bytes: "bin<<16|mapQual<<8|read_name_len"
    unsignedValue = Helper.GetUInt32(alignmentBlock, 8);

    // 10-12 bytes. Shift while the value is still unsigned: casting to int first would
    // sign-extend any bin >= 0x8000 into a negative number.
    alignedSeq.Bin = (int)((unsignedValue & 0xFFFF0000) >> 16);

    // 9th byte
    alignedSeq.MapQ = (int)((unsignedValue & 0x0000FF00) >> 8);

    // 8th byte: length of the read name including the terminating NULL.
    int queryNameLen = (int)(unsignedValue & 0x000000FF);

    // 12-16 bytes: "flag<<16|cigar_len"
    unsignedValue = Helper.GetUInt32(alignmentBlock, 12);

    // 14-16 bytes. Same unsigned shift as for the bin, to avoid sign extension.
    int flagValue = (int)((unsignedValue & 0xFFFF0000) >> 16);
    alignedSeq.Flag = (SAMFlags)flagValue;

    // 12-14 bytes: number of CIGAR operations.
    int cigarLen = (int)(unsignedValue & 0x0000FFFF);

    // 16-20 bytes: length of the read.
    int readLen = Helper.GetInt32(alignmentBlock, 16);

    // 20-24 bytes: mate reference sequence index, -1 when there is none.
    int mateRefSeqIndex = Helper.GetInt32(alignmentBlock, 20);
    if (mateRefSeqIndex != -1)
    {
        alignedSeq.MRNM = refSeqNames[mateRefSeqIndex];
    }
    else
    {
        alignedSeq.MRNM = "*";
    }

    // 24-28 bytes: mate position, converted to one-based.
    alignedSeq.MPos = Helper.GetInt32(alignmentBlock, 24) + 1;

    // 28-32 bytes: insert size.
    alignedSeq.ISize = Helper.GetInt32(alignmentBlock, 28);

    // 32-(32+queryNameLen) bytes: read name; the trailing NULL is dropped.
    alignedSeq.QName = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, 32, queryNameLen - 1);

    StringBuilder strbuilder = new StringBuilder();
    int startIndex = 32 + queryNameLen;

    for (int i = startIndex; i < (startIndex + cigarLen * 4); i += 4)
    {
        // Get the CIGAR operation length stored in the first 28 bits.
        UInt32 cigarValue = Helper.GetUInt32(alignmentBlock, i);
        strbuilder.Append(((cigarValue & 0xFFFFFFF0) >> 4).ToString(CultureInfo.InvariantCulture));

        // Get the CIGAR operation stored in the last 4 bits.
        value = (int)cigarValue & 0x0000000F;

        // MIDNSHP=X => 012345678
        switch (value)
        {
            case 0:
                strbuilder.Append("M");
                break;
            case 1:
                strbuilder.Append("I");
                break;
            case 2:
                strbuilder.Append("D");
                break;
            case 3:
                strbuilder.Append("N");
                break;
            case 4:
                strbuilder.Append("S");
                break;
            case 5:
                strbuilder.Append("H");
                break;
            case 6:
                strbuilder.Append("P");
                break;
            case 7:
                strbuilder.Append("=");
                break;
            case 8:
                strbuilder.Append("X");
                break;
            default:
                throw new FileFormatException(Properties.Resource.BAM_InvalidCIGAR);
        }
    }

    string cigar = strbuilder.ToString();
    if (string.IsNullOrWhiteSpace(cigar))
    {
        alignedSeq.CIGAR = "*";
    }
    else
    {
        alignedSeq.CIGAR = cigar;
    }

    // If there is no overlap there is no need to parse further.
    //     ZeroBasedRefEnd < start
    // => (alignedSeq.RefEndPos - 1) < start
    if (alignedSeq.RefEndPos - 1 < start && alignedSeq.RName != Properties.Resource.SAM_NO_REFERENCE_DEFINED_INDICATOR)
    {
        return null;
    }

    startIndex += cigarLen * 4;

    int index = startIndex;
    string strSequence = "*";
    string strQualValues = "*";

    // A record with a read length of zero stores no sequence and no quality bytes.
    // Decoding anyway would consume bytes that belong to the optional fields (or run past
    // the end of the block), so such a record is reported as "*" / "*".
    if (readLen != 0)
    {
        strbuilder = new StringBuilder();

        // The sequence is packed two bases per byte; the last byte is handled after the loop
        // because it holds only one base when the read length is odd.
        for (; index < (startIndex + (readLen + 1) / 2) - 1; index++)
        {
            // Get first 4 bit value
            value = (alignmentBlock[index] & 0xF0) >> 4;
            strbuilder.Append(GetSeqChar(value));

            // Get last 4 bit value
            value = alignmentBlock[index] & 0x0F;
            strbuilder.Append(GetSeqChar(value));
        }

        value = (alignmentBlock[index] & 0xF0) >> 4;
        strbuilder.Append(GetSeqChar(value));
        if (readLen % 2 == 0)
        {
            value = alignmentBlock[index] & 0x0F;
            strbuilder.Append(GetSeqChar(value));
        }

        startIndex = index + 1;
        strSequence = strbuilder.ToString();

        // A first quality byte of 0xFF means the quality values are absent.
        byte[] qualValues = new byte[readLen];
        if (alignmentBlock[startIndex] != 0xFF)
        {
            for (int i = startIndex; i < (startIndex + readLen); i++)
            {
                // Stored quality values are offset by 33 to get printable characters.
                qualValues[i - startIndex] = (byte)(alignmentBlock[i] + 33);
            }

            strQualValues = System.Text.ASCIIEncoding.ASCII.GetString(qualValues);
        }

        startIndex += readLen;
    }

    SAMParser.ParseQualityNSequence(alignedSeq, Alphabet, strSequence, strQualValues);

    // Whatever remains in the block is a list of optional fields: 2-byte tag, 1-byte type, value.
    if (alignmentBlock.Length > startIndex + 4 && alignmentBlock[startIndex] != 0x0 && alignmentBlock[startIndex + 1] != 0x0)
    {
        for (index = startIndex; index < alignmentBlock.Length; )
        {
            SAMOptionalField optionalField = new SAMOptionalField();
            optionalField.Tag = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, index, 2);
            index += 2;
            char vType = (char)alignmentBlock[index++];
            string valueType = vType.ToString();

            // SAM format supports [AifZH] for value type.
            // In BAM, an integer may be stored as a signed 8-bit integer (c), unsigned 8-bit integer (C),
            // signed short (s), unsigned short (S), signed 32-bit (i) or unsigned 32-bit integer (I),
            // depending on the signed magnitude of the integer. However, in SAM, all types of integers
            // are presented as type 'i'.
            string message = Helper.IsValidPatternValue("VType", valueType, BAMOptionalFieldRegex);
            if (!string.IsNullOrEmpty(message))
            {
                throw new FormatException(message);
            }

            optionalField.Value = GetOptionalValue(vType, alignmentBlock, ref index).ToString();

            // Convert to SAM format.
            if ("cCsSI".IndexOf(vType) >= 0)
            {
                valueType = "i";
            }

            optionalField.VType = valueType;
            alignedSeq.OptionalFields.Add(optionalField);
        }
    }

    return alignedSeq;
}
/// <summary>
/// Parses sequence data and quality values and stores the resulting sequence in the
/// QuerySequence property of the given SAMAlignedSequence.
/// Nothing is stored when the sequence data is "*". A plain sequence is created when the
/// quality data is "*"; otherwise a qualitative sequence is created.
/// </summary>
/// <param name="alignedSeq">SAM aligned sequence to update.</param>
/// <param name="alphabet">Alphabet of the sequence to be created.</param>
/// <param name="sequencedata">Sequence data.</param>
/// <param name="qualitydata">Quality values.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="alignedSeq"/> is null, or when
/// <paramref name="sequencedata"/> or <paramref name="qualitydata"/> is null, empty or white space.</exception>
/// <exception cref="FileFormatException">Thrown when quality values are present and their length
/// differs from the length of the sequence data.</exception>
public static void ParseQualityNSequence(SAMAlignedSequence alignedSeq, IAlphabet alphabet, string sequencedata, string qualitydata)
{
    if (alignedSeq == null)
    {
        throw new ArgumentNullException("alignedSeq");
    }

    if (string.IsNullOrWhiteSpace(sequencedata))
    {
        throw new ArgumentNullException("sequencedata");
    }

    if (string.IsNullOrWhiteSpace(qualitydata))
    {
        throw new ArgumentNullException("qualitydata");
    }

    FastQFormatType qualityFormat = QualityFormatType;

    // "*" means the sequence is not stored; the aligned sequence is left untouched.
    if (sequencedata.Equals("*"))
    {
        return;
    }

    ISequence parsedSequence = null;

    if (qualitydata.Equals("*"))
    {
        // No quality values available: build a plain sequence.
        parsedSequence = new Sequence(alphabet, sequencedata);
        parsedSequence.ID = alignedSeq.QName;
    }
    else
    {
        byte[] scoreBytes = ASCIIEncoding.ASCII.GetBytes(qualitydata);

        // One quality score is expected per sequence symbol.
        if (sequencedata.Length != qualitydata.Length)
        {
            string detail = string.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.FastQ_InvalidQualityScoresLength,
                alignedSeq.QName);

            string errorMessage = string.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.IOFormatErrorMessage,
                Properties.Resource.SAM_NAME,
                detail);

            Trace.Report(errorMessage);
            throw new FileFormatException(errorMessage);
        }

        QualitativeSequence qualitative = new QualitativeSequence(
            alphabet,
            qualityFormat,
            sequencedata,
            ASCIIEncoding.ASCII.GetString(scoreBytes));
        qualitative.ID = alignedSeq.QName;
        parsedSequence = qualitative;
    }

    alignedSeq.QuerySequence = parsedSequence;
}
/// <summary>
/// Validates the alignment and returns it as a SequenceAlignmentMap.
/// An input that already is a SequenceAlignmentMap is returned as is after its header has been
/// validated; otherwise a new map is built from the SAM header and the aligned sequence headers
/// found in the metadata. In both cases the reference sequence ranges of the header are cached,
/// sorted when a BAM file sorted by chromosome name and coordinates is requested.
/// </summary>
/// <param name="sequenceAlignment">Alignment to validate.</param>
/// <returns>The validated sequence alignment map.</returns>
/// <exception cref="ArgumentException">Thrown when the SAM alignment header or a SAM aligned
/// sequence header is missing from the metadata.</exception>
private SequenceAlignmentMap ValidateAlignment(ISequenceAlignment sequenceAlignment)
{
    SequenceAlignmentMap alignmentMap = sequenceAlignment as SequenceAlignmentMap;
    bool inputIsAlignmentMap = alignmentMap != null;

    if (inputIsAlignmentMap)
    {
        ValidateAlignmentHeader(alignmentMap.Header);
    }
    else
    {
        SAMAlignmentHeader samHeader = sequenceAlignment.Metadata[Helper.SAMAlignmentHeaderKey] as SAMAlignmentHeader;
        if (samHeader == null)
        {
            throw new ArgumentException(Properties.Resource.SAMAlignmentHeaderNotFound);
        }

        ValidateAlignmentHeader(samHeader);
        alignmentMap = new SequenceAlignmentMap(samHeader);
    }

    // Cache the reference ranges, sorted when coordinate-sorted output is requested.
    if (CreateSortedBAMFile && SortType == BAMSortByFields.ChromosomeNameAndCoordinates)
    {
        this.refSequences = SortSequenceRanges(alignmentMap.Header.GetReferenceSequenceRanges());
    }
    else
    {
        this.refSequences = alignmentMap.Header.GetReferenceSequenceRanges();
    }

    if (inputIsAlignmentMap)
    {
        return alignmentMap;
    }

    // Convert every aligned sequence of the generic alignment into a SAM aligned sequence.
    foreach (IAlignedSequence source in sequenceAlignment.AlignedSequences)
    {
        SAMAlignedSequenceHeader sourceHeader = source.Metadata[Helper.SAMAlignedSequenceHeaderKey] as SAMAlignedSequenceHeader;
        if (sourceHeader == null)
        {
            throw new ArgumentException(Properties.Resource.SAMAlignedSequenceHeaderNotFound);
        }

        SAMAlignedSequence converted = new SAMAlignedSequence(sourceHeader);
        converted.QuerySequence = source.Sequences[0];
        alignmentMap.QuerySequences.Add(converted);
    }

    return alignmentMap;
}
/// <summary>
/// Writes an aligned sequence to the output: through the BAM formatter into the uncompressed
/// BAM stream when BAM output (compressed or uncompressed) is requested, otherwise as SAM text.
/// </summary>
/// <param name="header">Alignment header.</param>
/// <param name="alignedSequence">Aligned sequence to write.</param>
private void WriteAlignedSequence(SAMAlignmentHeader header, SAMAlignedSequence alignedSequence)
{
    if (!UnCompressedBAM && !BAMOutput)
    {
        SAMFormatter.WriteSAMAlignedSequence(writer, alignedSequence);
        return;
    }

    // For compressed BAM output the uncompressed file is compressed before it is sent to the output stream.
    bamformatter.WriteAlignedSequence(header, alignedSequence, bamUncompressedOutStream);
}
/// <summary>
/// Writes a SAMAlignedSequence to the specified stream. The reference sequence ranges are
/// taken from the header the first time they are needed.
/// </summary>
/// <param name="header">Header from SAM object.</param>
/// <param name="alignedSeq">SAMAlignedSequence object.</param>
/// <param name="writer">Stream to write.</param>
/// <exception cref="ArgumentNullException">Thrown when any of the arguments is null.</exception>
public void WriteAlignedSequence(SAMAlignmentHeader header, SAMAlignedSequence alignedSeq, Stream writer)
{
    if (header == null)
    {
        throw new ArgumentNullException("header");
    }
    else if (alignedSeq == null)
    {
        throw new ArgumentNullException("alignedSeq");
    }
    else if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    // Only look up the reference ranges when none have been cached yet.
    bool rangesMissing = this.refSequences == null;
    if (rangesMissing)
    {
        this.refSequences = header.GetReferenceSequenceRanges();
    }

    WriteAlignedSequence(alignedSeq, writer);
}
/// <summary>
/// Throws an exception if the sequence violates any assumption made by this class.
/// Avoids separate checks within each method.
/// </summary>
/// <param name="seq">Sequence to check.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="seq"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when the sequence has no reference name, its reference
/// end is not greater than its position, its CIGAR is missing or "*", or its query sequence is not
/// a QualitativeSequence.</exception>
private void validateSequence(SAMAlignedSequence seq)
{
    if (seq == null)
    {
        throw new ArgumentNullException("seq");
    }

    // Checks run in order and stop at the first failure.
    bool invalid = String.IsNullOrEmpty(seq.RName);
    invalid = invalid || seq.RefEndPos <= seq.Pos;
    invalid = invalid || String.IsNullOrEmpty(seq.CIGAR);
    invalid = invalid || seq.CIGAR == "*";
    invalid = invalid || !(seq.QuerySequence is QualitativeSequence);

    if (invalid)
    {
        throw new ArgumentException("Tried to build a pileup with an invalid sequence. Sequence was:\n" + seq.ToString());
    }
}
/// <summary>
/// Writes a SAMAlignedSequence to the specified stream as a binary alignment record:
/// block size, fixed-size fields, read name, CIGAR, encoded sequence, quality values and
/// optional fields, in that order.
/// </summary>
/// <param name="alignedSeq">SAMAlignedSequence object.</param>
/// <param name="writer">Stream to write.</param>
private void WriteAlignedSequence(SAMAlignedSequence alignedSeq, Stream writer)
{
    // Everything is computed up front so that nothing is written if a lookup fails.

    // Total block size required.
    int totalBlockSize = GetBlockSize(alignedSeq);

    // Reference sequence index.
    int referenceIndex = GetRefSeqID(alignedSeq.RName);

    // bin<<16|mapQual<<8|read_name_len (including NULL)
    uint binMapQualNameLen = ((uint)alignedSeq.Bin << 16)
                             | ((uint)alignedSeq.MapQ << 8)
                             | (uint)(alignedSeq.QName.Length + 1);

    // flag<<16|cigar_len
    uint flagCigarLen = ((uint)alignedSeq.Flag << 16) | (uint)GetCIGARLength(alignedSeq.CIGAR);

    int readLength = (int)alignedSeq.QuerySequence.Count;
    int mateReferenceIndex = GetRefSeqID(alignedSeq.MRNM);
    byte[] readNameBytes = Encoding.UTF8.GetBytes(alignedSeq.QName);

    // Cigar: op_len<<4|op. Op: MIDNSHP=X => 012345678
    IList<uint> cigarOperations = GetEncodedCIGAR(alignedSeq.CIGAR);

    // Position is one-based in memory and zero-based in the file; -1 marks "no position".
    int zeroBasedPosition = alignedSeq.Pos > 0 ? alignedSeq.Pos - 1 : -1;

    // As per SAM format MPos is one-based and 0 indicates unpaired or unavailable pairing information.
    // In BAM format it is zero-based and -1 carries that meaning.
    int zeroBasedMatePosition = alignedSeq.MPos - 1;

    // Fixed-size part of the record, 4 bytes per field, in file order.
    byte[][] fixedFields =
    {
        Helper.GetLittleEndianByteArray(totalBlockSize),
        Helper.GetLittleEndianByteArray(referenceIndex),
        Helper.GetLittleEndianByteArray(zeroBasedPosition),
        Helper.GetLittleEndianByteArray(binMapQualNameLen),
        Helper.GetLittleEndianByteArray(flagCigarLen),
        Helper.GetLittleEndianByteArray(readLength),
        Helper.GetLittleEndianByteArray(mateReferenceIndex),
        Helper.GetLittleEndianByteArray(zeroBasedMatePosition),
        Helper.GetLittleEndianByteArray(alignedSeq.ISize)
    };

    foreach (byte[] fieldBytes in fixedFields)
    {
        writer.Write(fieldBytes, 0, 4);
    }

    // Read name, null terminated.
    writer.Write(readNameBytes, 0, readNameBytes.Length);
    writer.WriteByte((byte)'\0');

    // CIGAR operations, 4 bytes each.
    foreach (uint operation in cigarOperations)
    {
        writer.Write(Helper.GetLittleEndianByteArray(operation), 0, 4);
    }

    // 4-bit encoded read: =ACGTN => 0,1,2,4,8,15; the earlier base is stored in the high-order 4 bits of the byte.
    byte[] payload = GetEncodedSequence(alignedSeq);
    writer.Write(payload, 0, payload.Length);

    // Phred base quality (0xFF if absent).
    payload = GetQualityValue(alignedSeq.QuerySequence);
    writer.Write(payload, 0, payload.Length);

    // Optional fields.
    foreach (SAMOptionalField optionalField in alignedSeq.OptionalFields)
    {
        byte[] optionalBytes = GetOptioanField(optionalField);
        writer.Write(optionalBytes, 0, optionalBytes.Length);
    }
}
/// <summary>
/// Adds a sequence to the list stored in the dictionary under <paramref name="key"/>.
/// A new list is created when the key is not present yet.
/// </summary>
/// <param name="dict">Dictionary mapping a key to its list of sequences.</param>
/// <param name="key">Key identifying the list to add to.</param>
/// <param name="seq">Sequence to add.</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="key"/> is null.</exception>
private static void AddToDict(Dictionary<String, List<SAMAlignedSequence>> dict, string key, SAMAlignedSequence seq)
{
    if (key == null)
    {
        throw new ArgumentException(Properties.Resources.INVALID_KEY);
    }

    // Single lookup instead of ContainsKey + indexer + re-assignment of the same list.
    List<SAMAlignedSequence> sequences;
    if (!dict.TryGetValue(key, out sequences))
    {
        sequences = new List<SAMAlignedSequence>();
        dict.Add(key, sequences);
    }

    sequences.Add(seq);
}
/// <summary>
/// Adds a sequence. A sequence that belongs to the current cluster is stored. A sequence that
/// belongs to a new cluster causes the stored cluster to be processed first, after which the
/// sequence starts a new cluster. The cluster of a sequence is its reference name.
/// </summary>
/// <param name="sequence">A sequence. A null sequence is ignored.</param>
/// <returns>Returns true if the sequence could be added, false if the handler has been closed.</returns>
public bool Add(SAMAlignedSequence sequence)
{
    if (allSequences == null)
    {
        allSequences = new Collection<SAMAlignedSequence>();
    }

    if (sequence == null)
    {
        return true;
    }

    if (finished)
    {
        // Processing is finished, but the filtered bam file may still be written to, or the
        // header and body files may not have been merged yet.
        bool bamOutputPending = (!canWriteToBam && writeToFilteredBam) || (!bamFilesMerged && writeToFilteredBam);
        if (!bamOutputPending)
        {
            // The bam file is writable and merged (or was never written to):
            // false tells the caller that no more sequences will be accepted.
            return false;
        }

        // Wait for output to the bam file to complete.
        while (!canWriteToBam || !bamFilesMerged)
        {
            Thread.Sleep(20000); // sleep 20 seconds
        }

        return true;
    }

    // Cluster the incoming sequence belongs to.
    string incomingClusterId = sequence.RName;

    if (currentClusterId == null)
    {
        // First sequence of the first cluster.
        currentClusterId = incomingClusterId;
    }
    else if (!currentClusterId.Equals(incomingClusterId))
    {
        // The sequence opens a new cluster: process the stored sequences before adding it.
        ++numberClustersParsed; // mark off another cluster
        ProcessSequences();
        currentClusterId = incomingClusterId;
        allSequences = new Collection<SAMAlignedSequence>();
    }

    allSequences.Add(sequence);
    return true;
}
/// <summary>
/// Gets the insert length of a pair of reads by delegating to the three-argument overload
/// with its last argument set to false.
/// </summary>
/// <param name="read1">First read.</param>
/// <param name="read2">Second read.</param>
/// <returns>The insert length reported by the three-argument overload.</returns>
public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2)
{
    int insertLength = GetInsertLength(read1, read2, false);
    return insertLength;
}
/// <summary>
/// Given a sequence, returns its reference name, or null if the sequence is null or is
/// flagged as unmapped.
/// </summary>
/// <param name="sequence">Aligned sequence; may be null.</param>
/// <returns>The reference name (RName) of a mapped sequence, otherwise null.</returns>
private static string GetId(SAMAlignedSequence sequence)
{
    // The null check has to come before the flag is read; checking afterwards can never
    // be reached because reading the flag of a null sequence already throws.
    if (sequence == null)
    {
        return null;
    }

    if (sequence.Flag.HasFlag(SAMFlags.UnmappedQuery))
    {
        return null;
    }

    return sequence.RName;
}
/// <summary>
/// Initializes a new instance of the <see cref="Bio.IO.PacBio.PacBioCCSRead"/> class from an
/// initially parsed BAM record. The optional fields of the record are copied into the matching
/// properties; the movie name is the part of the query sequence ID before the first '/'.
/// </summary>
/// <param name="s">The aligned sequence parsed from the BAM file.</param>
public PacBioCCSRead (SAMAlignedSequence s)
{
    /* TODO: Converting from binary to string and back is beyond silly...
     * no performance hit worth worrying about at present, but in the future it might be worth
     * going directly from binary to the type rather than through string intermediates */
    foreach (var field in s.OptionalFields) {
        switch (field.Tag) {
        case "sn":
            {
                // The first comma separated item is skipped; the rest are the A, C, G, T values.
                var snrValues = field.Value.Split (',').Skip (1).Select (x => Convert.ToSingle (x)).ToArray ();
                SnrA = snrValues [0];
                SnrC = snrValues [1];
                SnrG = snrValues [2];
                SnrT = snrValues [3];
                break;
            }
        case "zm":
            HoleNumber = Convert.ToInt32 (field.Value);
            break;
        case "pq": // This tag is now deprecated by the rq tag
        case "rq":
            ReadQuality = Convert.ToSingle (field.Value);
            break;
        case "za":
            AvgZscore = Convert.ToSingle (field.Value);
            break;
        case "rs":
            statusCounts = field.Value.Split (',').Skip (1).Select (x => Convert.ToInt32 (x)).ToArray ();
            break;
        case "np":
            NumPasses = Convert.ToInt32 (field.Value);
            break;
        case "RG":
            ReadGroup = field.Value;
            break;
        case "zs":
            ZScores = field.Value.Split (',').Skip (1).Select (x => Convert.ToSingle (x)).ToArray ();
            break;
        }
    }

    // TODO: We should use String.Intern here, but not available in PCL...
    // Movie = String.Intern(s.QuerySequence.ID.Split ('/') [0]);
    string[] idParts = s.QuerySequence.ID.Split ('/');
    Movie = idParts [0];
    Sequence = s.QuerySequence as QualitativeSequence;
}
/// <summary>
/// Given a SAMAlignedSequence, gets the value of the first optional field tagged "RG".
/// </summary>
/// <param name="seq">Aligned sequence whose optional fields are searched.</param>
/// <returns>The value of the RG field, or null when the read has no RG field.</returns>
private static string GetRgTag(SAMAlignedSequence seq)
{
    // The fields are searched on every call in case the optional fields
    // do not have a consistent format from read to read.
    SAMOptionalField readGroupField = seq.OptionalFields.FirstOrDefault(field => field.Tag == "RG");
    if (readGroupField == null)
    {
        return null;
    }

    return readGroupField.Value;
}
/// <summary>
/// Gets the paired reads when the SAM aligned sequences are in memory.
/// Reads flagged as paired are grouped by query name. The first read of a name starts an
/// Orphan pair, the second read completes the pair and fixes its type, and any further read
/// with the same name turns the pair into MultipleHits.
/// </summary>
/// <param name="meanLengthOfInsert">Mean of the insert length.</param>
/// <param name="standardDeviationOfInsert">Standard deviation of insert length.</param>
/// <param name="calculate">If this flag is set then mean and standard deviation will
/// be calculated from the paired reads instead of specified.</param>
/// <returns>List of paired read.</returns>
private IList <PairedRead> GetInMemoryPairedReads(float meanLengthOfInsert, float standardDeviationOfInsert, bool calculate = false)
{
    // The dictionary allows the information to be gathered in one pass over the aligned sequences.
    Dictionary<string, PairedRead> pairedReads = new Dictionary<string, PairedRead>();

    // Running total and number of the insert lengths of Normal / LengthAnomaly pairs.
    double sum = 0;
    int count = 0;

    for (int i = 0; i < QuerySequences.Count; i++)
    {
        SAMAlignedSequence read = QuerySequences[i];
        if ((read.Flag & SAMFlags.PairedRead) != SAMFlags.PairedRead)
        {
            continue;
        }

        PairedRead pairedRead;
        if (!pairedReads.TryGetValue(read.QName, out pairedRead))
        {
            // First read seen for this name: a read with a reference name becomes Read1,
            // a read without one becomes Read2.
            pairedRead = new PairedRead();
            if (!string.IsNullOrEmpty(read.RName) && !read.RName.Equals("*"))
            {
                pairedRead.Read1 = read;
            }
            else
            {
                pairedRead.Read2 = read;
            }

            pairedRead.PairedType = PairedReadType.Orphan;
            pairedRead.InsertLength = 0;
            pairedReads.Add(read.QName, pairedRead);
            continue;
        }

        if (pairedRead.Read2 == null || pairedRead.Read1 == null)
        {
            // Second read for this name: fill the empty slot and classify the pair.
            if (pairedRead.Read2 == null)
            {
                pairedRead.Read2 = read;
            }
            else
            {
                pairedRead.Read1 = read;
            }

            pairedRead.PairedType = PairedRead.GetPairedReadType(pairedRead.Read1, pairedRead.Read2, meanLengthOfInsert, standardDeviationOfInsert);
            if (pairedRead.PairedType == PairedReadType.Normal || pairedRead.PairedType == PairedReadType.LengthAnomaly)
            {
                pairedRead.InsertLength = PairedRead.GetInsertLength(pairedRead.Read1, pairedRead.Read2);
                if (calculate)
                {
                    sum += pairedRead.InsertLength;
                    count++;
                }
            }
        }
        else
        {
            // Third or later read for this name. Take the pair out of the statistics only if it
            // was counted (it still has a Normal / LengthAnomaly type), and do so before the
            // insert length is reset; otherwise nothing would be subtracted from the sum and the
            // count would drop once per extra hit.
            bool wasCounted = pairedRead.PairedType == PairedReadType.Normal
                              || pairedRead.PairedType == PairedReadType.LengthAnomaly;
            if (calculate && wasCounted)
            {
                sum -= pairedRead.InsertLength;
                count--;
            }

            pairedRead.InsertLength = 0;
            pairedRead.Reads.Add(read);
            pairedRead.PairedType = PairedReadType.MultipleHits;
        }
    }

    List<PairedRead> allreads = pairedReads.Values.ToList();

    if (calculate && count > 0)
    {
        UpdateType(allreads, sum, count);
    }

    return allreads;
}
/// <summary>
/// Given a SAMAlignedSequence, returns the string form of its query sequence up to
/// (not including) the first carriage return / line feed pair.
/// </summary>
/// <param name="seq">Aligned sequence whose query sequence is converted.</param>
/// <returns>The text of the query sequence before the first "\r\n", or the whole text when
/// it contains none.</returns>
private static string GetSequence(SAMAlignedSequence seq)
{
    String fullText = seq.QuerySequence.ToString();

    // Keep only the part in front of the first line break.
    int lineBreakAt = fullText.IndexOf("\r\n", StringComparison.Ordinal);
    if (lineBreakAt < 0)
    {
        return fullText;
    }

    return fullText.Substring(0, lineBreakAt);
}