/// <summary>
/// Parses one tab-delimited SAM alignment line into a <see cref="SAMAlignedSequence"/>.
/// </summary>
/// <param name="bioText">Text of a single alignment line (tab-delimited fields).</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the query sequence created for the alignment should be
/// in read-only mode. The value is forwarded unchanged to ParseQualityNSequence.
/// </param>
/// <param name="alphabet">Alphabet of the sequences.</param>
/// <param name="encoding">Required encoding.</param>
/// <param name="referenceSequences">
/// Reference sequences. When supplied, the one whose ID matches the RNAME field
/// (case-insensitive, ordinal) is passed on as the reference for the read.
/// </param>
/// <returns>The aligned sequence populated from the mandatory and optional fields.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="bioText"/> is null.</exception>
/// <exception cref="FormatException">
/// If the line has fewer than the 11 mandatory fields, a numeric field cannot be parsed,
/// or an optional field does not match the optional-field pattern.
/// </exception>
private static SAMAlignedSequence ParseSequence(string bioText, bool isReadOnly, IAlphabet alphabet, IEncoding encoding, IList<ISequence> referenceSequences)
{
    // Index of the first optional field; it is also the number of mandatory fields.
    const int optionalTokenStartingIndex = 11;

    if (bioText == null)
    {
        throw new ArgumentNullException("bioText");
    }

    string[] tokens = bioText.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

    // Fail with a descriptive format error instead of an IndexOutOfRangeException
    // when the line is truncated.
    if (tokens.Length < optionalTokenStartingIndex)
    {
        throw new FormatException(
            string.Format(
                CultureInfo.CurrentCulture,
                "Invalid SAM alignment line: expected at least {0} tab-delimited fields but found {1}.",
                optionalTokenStartingIndex,
                tokens.Length));
    }

    SAMAlignedSequence alignedSeq = new SAMAlignedSequence();

    alignedSeq.QName = tokens[0];
    alignedSeq.Flag = SAMAlignedSequenceHeader.GetFlag(tokens[1]);
    alignedSeq.RName = tokens[2];
    alignedSeq.Pos = int.Parse(tokens[3], CultureInfo.InvariantCulture);
    alignedSeq.MapQ = int.Parse(tokens[4], CultureInfo.InvariantCulture);
    alignedSeq.CIGAR = tokens[5];

    // "=" in the mate reference field means "same reference as this read".
    alignedSeq.MRNM = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
    alignedSeq.MPos = int.Parse(tokens[7], CultureInfo.InvariantCulture);
    alignedSeq.ISize = int.Parse(tokens[8], CultureInfo.InvariantCulture);

    ISequence refSeq = null;
    if (referenceSequences != null && referenceSequences.Count > 0)
    {
        refSeq = referenceSequences.FirstOrDefault(
            R => string.Compare(R.ID, alignedSeq.RName, StringComparison.OrdinalIgnoreCase) == 0);
    }

    ParseQualityNSequence(alignedSeq, alphabet, encoding, tokens[9], tokens[10], refSeq, isReadOnly);

    // Everything after the mandatory fields is an optional TAG:VTYPE:VALUE field.
    for (int i = optionalTokenStartingIndex; i < tokens.Length; i++)
    {
        if (!Helper.IsValidRegexValue(OptionalFieldRegex, tokens[i]))
        {
            string message = string.Format(CultureInfo.CurrentCulture, Resource.InvalidOptionalField, tokens[i]);
            throw new FormatException(message);
        }

        string[] opttokens = tokens[i].Split(colonDelim, StringSplitOptions.RemoveEmptyEntries);

        SAMOptionalField optField = new SAMOptionalField();
        optField.Tag = opttokens[0];
        optField.VType = opttokens[1];
        optField.Value = opttokens[2];

        alignedSeq.OptionalFields.Add(optField);
    }

    return alignedSeq;
}
/// <summary>
/// Determines the paired read type of two reads using the insert-length
/// information registered for the named clone library.
/// </summary>
/// <param name="read1">First aligned sequence.</param>
/// <param name="read2">Second aligned sequence.</param>
/// <param name="libraryName">Name of the library to look up in <c>CloneLibrary.Instance</c>.</param>
/// <returns>The type computed by the overload that accepts library information.</returns>
/// <exception cref="ArgumentNullException">If <paramref name="libraryName"/> is null or empty.</exception>
/// <exception cref="ArgumentOutOfRangeException">If no library information is found for the name.</exception>
public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, string libraryName)
{
    if (string.IsNullOrEmpty(libraryName))
    {
        throw new ArgumentNullException("libraryName");
    }

    CloneLibraryInformation info = CloneLibrary.Instance.GetLibraryInformation(libraryName);
    if (info != null)
    {
        return GetPairedReadType(read1, read2, info);
    }

    // The name was given but is not known to the clone library.
    throw new ArgumentOutOfRangeException("libraryName");
}
/// <summary>
/// Gets the insert length of a pair of reads from their ISIZE values.
/// </summary>
/// <param name="read1">First read.</param>
/// <param name="read2">Second read; when null the result is 0.</param>
/// <param name="validate">
/// When true, the pair type is computed first and 0 is returned unless the
/// type is Normal or LengthAnomaly.
/// </param>
/// <returns>
/// The absolute value of <c>read1.ISize</c> when the two ISIZE values are exact
/// negatives of each other; otherwise 0.
/// </returns>
/// <exception cref="ArgumentNullException">If <paramref name="read1"/> is null.</exception>
public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2, bool validate)
{
    //                 reference chromosome
    // 5'                                             --> 3'
    // ----------------------------------------------------- F strand
    //
    // 3'                                             <-- 5'
    // ----------------------------------------------------- R strand
    //
    //        read1                         read2
    //     5'       3'                   3'       5'
    //        -->                           <--
    //     |--------------       --------------|
    //     |<----------insert length---------->|

    if (read1 == null)
    {
        throw new ArgumentNullException("read1");
    }

    if (read2 == null)
    {
        return 0;
    }

    if (validate)
    {
        PairedReadType pairType = GetPairedReadType(read1, read2, 0, 0);
        bool hasInsert = pairType == PairedReadType.Normal || pairType == PairedReadType.LengthAnomaly;
        if (!hasInsert)
        {
            return 0;
        }
    }

    // The two mates must report mirrored insert sizes.
    if (read1.ISize != -read2.ISize)
    {
        return 0;
    }

    int size = read1.ISize;
    return size < 0 ? -size : size;
}
/// <summary>
/// Parses sequence data and quality values and stores the resulting sequence
/// in the QuerySequence property of the given SAMAlignedSequence.
/// </summary>
/// <param name="alignedSeq">SAM aligned sequence to update.</param>
/// <param name="alphabet">Alphabet of the sequence to be created.</param>
/// <param name="Encoding">Encoding to use while creating the sequence; may be null.</param>
/// <param name="sequencedata">Sequence data. "*" means no sequence; nothing is created.</param>
/// <param name="qualitydata">Quality values. "*" means no qualities; a plain sequence is created.</param>
/// <param name="refSeq">Reference sequence if known; required when the data contains "=".</param>
/// <param name="isReadOnly">Flag to indicate whether the new sequence is required to be read-only.</param>
/// <exception cref="ArgumentNullException">
/// If <paramref name="alignedSeq"/> is null, or the sequence or quality text is null or white space.
/// </exception>
/// <exception cref="ArgumentException">If "=" symbols are recorded but no reference sequence is given.</exception>
public static void ParseQualityNSequence(SAMAlignedSequence alignedSeq, IAlphabet alphabet, IEncoding Encoding, string sequencedata, string qualitydata, ISequence refSeq, bool isReadOnly)
{
    if (alignedSeq == null)
    {
        throw new ArgumentNullException("alignedSeq");
    }

    if (string.IsNullOrWhiteSpace(sequencedata))
    {
        throw new ArgumentNullException("sequencedata");
    }

    if (string.IsNullOrWhiteSpace(qualitydata))
    {
        throw new ArgumentNullException("qualitydata");
    }

    FastQFormatType qualityFormat = QualityFormatType;

    // "*" in the sequence field: there is nothing to build.
    if (sequencedata == "*")
    {
        return;
    }

    // "*" in the quality field: build a plain sequence without scores.
    bool hasQualities = qualitydata != "*";
    byte[] scores = null;

    if (hasQualities)
    {
        // Quality characters are taken as their ASCII byte values.
        scores = ASCIIEncoding.ASCII.GetBytes(qualitydata);

        // Every symbol needs exactly one quality score.
        if (sequencedata.Length != qualitydata.Length)
        {
            string detail = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoresLength, alignedSeq.QName);
            string error = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Resource.SAM_NAME, detail);
            Trace.Report(error);
            throw new FileFormatException(error);
        }
    }

    // Record where "." symbols occur, then substitute them with N.
    for (int pos = 0; pos < sequencedata.Length; pos++)
    {
        if (sequencedata[pos] == '.')
        {
            alignedSeq.DotSymbolIndexes.Add(pos);
        }
    }

    if (alignedSeq.DotSymbolIndexes.Count > 0)
    {
        sequencedata = sequencedata.Replace('.', 'N');
    }

    // Record where "=" symbols occur.
    for (int pos = 0; pos < sequencedata.Length; pos++)
    {
        if (sequencedata[pos] == '=')
        {
            alignedSeq.EqualSymbolIndexes.Add(pos);
        }
    }

    // Substitute each recorded "=" position with the symbol found at the same
    // index of the reference sequence.
    if (alignedSeq.EqualSymbolIndexes.Count > 0)
    {
        if (refSeq == null)
        {
            throw new ArgumentException(Resource.RefSequenceNofFound);
        }

        foreach (int pos in alignedSeq.EqualSymbolIndexes)
        {
            sequencedata = sequencedata.Remove(pos, 1).Insert(pos, refSeq[pos].Symbol.ToString());
        }
    }

    ISequence created;
    if (hasQualities)
    {
        QualitativeSequence withScores = Encoding == null
            ? new QualitativeSequence(alphabet, qualityFormat, sequencedata, scores)
            : new QualitativeSequence(alphabet, qualityFormat, Encoding, sequencedata, scores);

        withScores.ID = alignedSeq.QName;
        withScores.IsReadOnly = isReadOnly;
        created = withScores;
    }
    else
    {
        Sequence plain = Encoding == null
            ? new Sequence(alphabet, sequencedata)
            : new Sequence(alphabet, Encoding, sequencedata);

        plain.ID = alignedSeq.QName;
        plain.IsReadOnly = isReadOnly;
        created = plain;
    }

    alignedSeq.QuerySequence = created;
}
/// <summary>
/// Gets the paired reads when DV is enabled. Reads are tracked by their index
/// in QuerySequences rather than by reference.
/// </summary>
/// <param name="meanLengthOfInsert">Mean of the insert length.</param>
/// <param name="standardDeviationOfInsert">Standard deviation of insert length.</param>
/// <param name="calculate">If this flag is set, the sum and count of insert lengths of
/// Normal and LengthAnomaly pairs are accumulated and handed to UpdateType, so that the
/// statistics come from the paired reads instead of the specified values.</param>
/// <returns>List of paired reads.</returns>
private IList<PairedRead> GetDVAwarePairedReads(float meanLengthOfInsert, float standardDeviationOfInsert, bool calculate = false)
{
    // A dictionary keyed by read name lets us pair the reads in a single pass
    // over the aligned sequence list.
    Dictionary<string, DVEnabledPairedRead> pairedReads = new Dictionary<string, DVEnabledPairedRead>();
    double sum = 0;
    int count = 0;

    for (int i = 0; i < QuerySequences.Count; i++)
    {
        SAMAlignedSequence read = QuerySequences[i];
        if ((read.Flag & SAMFlags.PairedRead) != SAMFlags.PairedRead)
        {
            continue;
        }

        DVEnabledPairedRead pairedRead;
        if (!pairedReads.TryGetValue(read.QName, out pairedRead))
        {
            // First read seen with this name: start as an orphan. A read with a
            // reference name goes into slot 1, otherwise into slot 2.
            pairedRead = new DVEnabledPairedRead(QuerySequences);
            if (!string.IsNullOrEmpty(read.RName) && !read.RName.Equals("*"))
            {
                pairedRead.Index1 = i;
            }
            else
            {
                pairedRead.Index2 = i;
            }

            pairedRead.PairedType = PairedReadType.Orphan;
            pairedRead.InsertLength = 0;
            pairedReads.Add(read.QName, pairedRead);
            continue;
        }

        if (pairedRead.Index2 == -1 || pairedRead.Index1 == -1)
        {
            // Second read of the pair: fill the empty slot.
            if (pairedRead.Index2 == -1)
            {
                pairedRead.Index2 = i;
            }
            else
            {
                pairedRead.Index1 = i;
            }

            // For best performance,
            // 1. BAM/SAM file should be sorted by read name.
            // 2. If sorted on mapping position then give the unmapped read a coordinate
            //    (generally the coordinate of the mapped mate) for sorting/indexing purposes only.
            pairedRead.PairedType = PairedRead.GetPairedReadType(pairedRead.Read1, pairedRead.Read2, meanLengthOfInsert, standardDeviationOfInsert);
            if (pairedRead.PairedType == PairedReadType.Normal || pairedRead.PairedType == PairedReadType.LengthAnomaly)
            {
                pairedRead.InsertLength = PairedRead.GetInsertLength(pairedRead.Read1, pairedRead.Read2);
                if (calculate)
                {
                    sum += pairedRead.InsertLength;
                    count++;
                }
            }

            continue;
        }

        // Third or later read with the same name: the pair becomes MultipleHits.
        // Remove its contribution from the statistics only if it was actually added,
        // i.e. its type is still Normal/LengthAnomaly. Subtract BEFORE zeroing the
        // insert length; the previous order subtracted 0 and decremented count for
        // every extra hit, skewing the mean handed to UpdateType.
        bool wasCounted = pairedRead.PairedType == PairedReadType.Normal
            || pairedRead.PairedType == PairedReadType.LengthAnomaly;
        if (calculate && wasCounted)
        {
            sum -= pairedRead.InsertLength;
            count--;
        }

        pairedRead.InsertLength = 0;
        pairedRead.ReadIndexes.Add(i);
        pairedRead.PairedType = PairedReadType.MultipleHits;
    }

    List<PairedRead> allreads = pairedReads.Values.ToList<PairedRead>();

    if (calculate && count > 0)
    {
        UpdateType(allreads, sum, count);
    }

    return allreads;
}
/// <summary>
/// Gets the paired reads when SAMAligned sequences are in memory.
/// </summary>
/// <param name="meanLengthOfInsert">Mean of the insert length.</param>
/// <param name="standardDeviationOfInsert">Standard deviation of insert length.</param>
/// <param name="calculate">If this flag is set, the sum and count of insert lengths of
/// Normal and LengthAnomaly pairs are accumulated and handed to UpdateType, so that the
/// statistics come from the paired reads instead of the specified values.</param>
/// <returns>List of paired reads.</returns>
private IList<PairedRead> GetInMemoryPairedReads(float meanLengthOfInsert, float standardDeviationOfInsert, bool calculate = false)
{
    // A dictionary keyed by read name lets us pair the reads in a single pass
    // over the aligned sequence list.
    Dictionary<string, PairedRead> pairedReads = new Dictionary<string, PairedRead>();
    double sum = 0;
    int count = 0;

    for (int i = 0; i < QuerySequences.Count; i++)
    {
        SAMAlignedSequence read = QuerySequences[i];
        if ((read.Flag & SAMFlags.PairedRead) != SAMFlags.PairedRead)
        {
            continue;
        }

        PairedRead pairedRead;
        if (!pairedReads.TryGetValue(read.QName, out pairedRead))
        {
            // First read seen with this name: start as an orphan. A read with a
            // reference name goes into slot 1, otherwise into slot 2.
            pairedRead = new PairedRead();
            if (!string.IsNullOrEmpty(read.RName) && !read.RName.Equals("*"))
            {
                pairedRead.Read1 = read;
            }
            else
            {
                pairedRead.Read2 = read;
            }

            pairedRead.PairedType = PairedReadType.Orphan;
            pairedRead.InsertLength = 0;
            pairedReads.Add(read.QName, pairedRead);
            continue;
        }

        if (pairedRead.Read2 == null || pairedRead.Read1 == null)
        {
            // Second read of the pair: fill the empty slot.
            if (pairedRead.Read2 == null)
            {
                pairedRead.Read2 = read;
            }
            else
            {
                pairedRead.Read1 = read;
            }

            pairedRead.PairedType = PairedRead.GetPairedReadType(pairedRead.Read1, pairedRead.Read2, meanLengthOfInsert, standardDeviationOfInsert);
            if (pairedRead.PairedType == PairedReadType.Normal || pairedRead.PairedType == PairedReadType.LengthAnomaly)
            {
                pairedRead.InsertLength = PairedRead.GetInsertLength(pairedRead.Read1, pairedRead.Read2);
                if (calculate)
                {
                    sum += pairedRead.InsertLength;
                    count++;
                }
            }

            continue;
        }

        // Third or later read with the same name: the pair becomes MultipleHits.
        // Remove its contribution from the statistics only if it was actually added,
        // i.e. its type is still Normal/LengthAnomaly. Subtract BEFORE zeroing the
        // insert length; the previous order subtracted 0 and decremented count for
        // every extra hit, skewing the mean handed to UpdateType.
        bool wasCounted = pairedRead.PairedType == PairedReadType.Normal
            || pairedRead.PairedType == PairedReadType.LengthAnomaly;
        if (calculate && wasCounted)
        {
            sum -= pairedRead.InsertLength;
            count--;
        }

        pairedRead.InsertLength = 0;
        pairedRead.Reads.Add(read);
        pairedRead.PairedType = PairedReadType.MultipleHits;
    }

    List<PairedRead> allreads = pairedReads.Values.ToList();

    if (calculate && count > 0)
    {
        UpdateType(allreads, sum, count);
    }

    return allreads;
}
/// <summary>
/// Gets the insert length of a pair of reads without validating the pair first.
/// </summary>
/// <param name="read1">First read.</param>
/// <param name="read2">Second read.</param>
/// <returns>The result of the three-argument overload called with validation turned off.</returns>
public static int GetInsertLength(SAMAlignedSequence read1, SAMAlignedSequence read2)
{
    int insertLength = GetInsertLength(read1, read2, validate: false);
    return insertLength;
}
/// <summary>
/// Classifies a pair of reads from their reference names, strands, start
/// positions and insert length.
/// </summary>
/// <param name="read1">First aligned sequence.</param>
/// <param name="read2">Second aligned sequence; null yields Orphan.</param>
/// <param name="meanLengthOfInsert">Mean of the insertion length.</param>
/// <param name="standardDeviationOfInsert">Standard deviation of insertion length.</param>
/// <returns>
/// Orphan when the second read has no reference name, has "*" as reference name or is flagged
/// unmapped; Chimera when the reference names differ; StructuralAnomaly when both reads are on
/// the same strand or the forward read starts after the reverse read; LengthAnomaly when the
/// insert length lies outside mean ± 3 standard deviations; otherwise Normal.
/// </returns>
/// <exception cref="ArgumentNullException">If <paramref name="read1"/> is null.</exception>
public static PairedReadType GetPairedReadType(SAMAlignedSequence read1, SAMAlignedSequence read2, float meanLengthOfInsert, float standardDeviationOfInsert)
{
    if (read1 == null)
    {
        throw new ArgumentNullException("read1");
    }

    if (read2 == null)
    {
        return PairedReadType.Orphan;
    }

    bool mateIsUnplaced = string.IsNullOrEmpty(read2.RName)
        || read2.RName.Equals("*")
        || ((read2.Flag & SAMFlags.UnmappedQuery) == SAMFlags.UnmappedQuery);
    if (mateIsUnplaced)
    {
        return PairedReadType.Orphan;
    }

    if (!read2.RName.Equals(read1.RName))
    {
        return PairedReadType.Chimera;
    }

    // Mates on the same strand cannot form a proper pair.
    bool bothForward = IsForwardRead(read1) && IsForwardRead(read2);
    bool bothReverse = IsReverseRead(read1) && IsReverseRead(read2);
    if (bothForward || bothReverse)
    {
        return PairedReadType.StructuralAnomaly;
    }

    int forwardStart;
    int reverseStart;
    if (IsForwardRead(read1))
    {
        forwardStart = read1.Pos;
        reverseStart = read2.Pos;
    }
    else
    {
        forwardStart = read2.Pos;
        reverseStart = read1.Pos;
    }

    // The forward read is expected to start at or before the reverse read.
    if (forwardStart > reverseStart)
    {
        return PairedReadType.StructuralAnomaly;
    }

    int insertLength = GetInsertLength(read1, read2);

    // µ + 3σ
    float upperLimit = meanLengthOfInsert + (3 * standardDeviationOfInsert);

    // µ - 3σ
    float lowerLimit = meanLengthOfInsert - (3 * standardDeviationOfInsert);

    bool outsideLimits = insertLength > upperLimit || insertLength < lowerLimit;
    return outsideLimits ? PairedReadType.LengthAnomaly : PairedReadType.Normal;
}