/// <summary>
/// Replaces the second (query) sequence of the first pairwise aligned pair with a
/// <see cref="QualitativeSequence"/> built from the aligned symbols and the supplied quality scores.
/// </summary>
/// <remarks>
/// Gap symbols ('-') in the aligned query receive a quality score of 0; every other position
/// consumes the next entry of <paramref name="qualScores"/>, in order. Surplus scores are ignored.
/// The alignment is only modified after the whole score array has been built.
/// </remarks>
/// <param name="aln">Alignment whose first aligned pair is updated in place.</param>
/// <param name="qualScores">Quality scores for the query, one per non-gap symbol.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="aln"/> or <paramref name="qualScores"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// The second aligned sequence is not a <see cref="Sequence"/>, or there are fewer quality
/// scores than non-gap symbols.
/// </exception>
public static void ConvertAlignedSequenceToQualSeq(IPairwiseSequenceAlignment aln, int[] qualScores)
{
    if (aln == null)
    {
        throw new ArgumentNullException("aln");
    }

    if (qualScores == null)
    {
        throw new ArgumentNullException("qualScores");
    }

    var alignedPair = aln.PairwiseAlignedSequences[0];
    var q = alignedPair.SecondSequence as Sequence;
    if (q == null)
    {
        // Previously this surfaced as a bare NullReferenceException on q.Count.
        throw new ArgumentException("The second aligned sequence must be of type Sequence.", "aln");
    }

    var qvs = new int[q.Count];
    int queryPos = 0;
    for (int i = 0; i < qvs.Length; i++)
    {
        if (q[i] == '-')
        {
            // Gaps have no underlying base call, so they carry no quality.
            qvs[i] = 0;
            continue;
        }

        if (queryPos >= qualScores.Length)
        {
            throw new ArgumentException(
                "Fewer quality scores were supplied than there are non-gap symbols in the aligned query.",
                "qualScores");
        }

        qvs[i] = qualScores[queryPos++];
    }

    var qseq = new QualitativeSequence(DnaAlphabet.Instance, FastQFormatType.Sanger, q.ToArray(), qvs, false);
    alignedPair.SecondSequence = qseq;
}
/// <summary>
/// Builds a QualitativeSequence from a symbol byte array and an encoded quality-score byte array
/// (Illumina v1.3) and checks the enumerated symbols, alphabet, count, indexer and encoded scores.
/// </summary>
public void TestConstructorWithByteArray()
{
    const string expectedSequence = "CAAGCT";
    byte[] sequenceData = { (byte)'C', (byte)'A', (byte)'A', (byte)'G', (byte)'C', (byte)'T' };
    byte[] qualityScores = { 65, 65, 65, 65, 110, 125 };

    QualitativeSequence qualitativeSequence = new QualitativeSequence(
        Alphabets.DNA,
        FastQFormatType.Illumina_v1_3,
        sequenceData,
        qualityScores);

    // Rebuild the sequence text by enumerating the symbols.
    string actual = "";
    foreach (byte symbol in qualitativeSequence)
    {
        actual += (char)symbol;
    }

    Assert.AreEqual(expectedSequence, actual);
    Assert.AreEqual(qualitativeSequence.Alphabet, Alphabets.DNA);
    Assert.AreEqual(qualitativeSequence.Count, 6);

    // Test for indexer
    for (int position = 0; position < expectedSequence.Length; position++)
    {
        Assert.AreEqual(qualitativeSequence[position], (byte)expectedSequence[position]);
    }

    // The encoded scores must come back exactly as they were supplied.
    var encodedScores = qualitativeSequence.GetEncodedQualityScores();
    for (int position = 0; position < encodedScores.Length; position++)
    {
        Assert.AreEqual(qualityScores[position], encodedScores[position]);
    }
}
/// <summary>
/// Lays out the encoded quality scores of a sequence as a two-dimensional string grid,
/// filling row by row with at most <paramref name="maxColumns"/> values per row.
/// </summary>
/// <param name="sequence">Sequence which has the quality values</param>
/// <param name="maxColumns">Max number of columns to write to; must be greater than zero</param>
/// <returns>
/// String array with quality values formatted using the invariant culture. Cells of the last
/// row beyond the final score are left null.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="sequence"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxColumns"/> is zero or negative.</exception>
public static string[,] FastQQualityValuesToRange(QualitativeSequence sequence, int maxColumns)
{
    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    if (maxColumns <= 0)
    {
        // Previously zero caused a DivideByZeroException and negatives failed on array allocation.
        throw new ArgumentOutOfRangeException("maxColumns", "The maximum number of columns must be greater than zero.");
    }

    var qualityScoreArray = sequence.GetEncodedQualityScores();

    // Integer ceiling division; done in long so a very large maxColumns cannot overflow.
    long rowCount = ((long)qualityScoreArray.Length + maxColumns - 1) / maxColumns;
    long columnCount = Math.Min(sequence.Count, maxColumns);

    string[,] rangeData = new string[rowCount, columnCount];
    int count = 0;
    for (int row = 0; row < rowCount; row++)
    {
        for (int col = 0; col < columnCount && count < qualityScoreArray.Length; col++, count++)
        {
            rangeData[row, col] = qualityScoreArray[count].ToString(CultureInfo.InvariantCulture);
        }
    }

    return rangeData;
}
/// <summary>
/// Initializes a new instance of the <see cref="Bio.IO.PacBio.PacBioCCSRead"/> class
/// from an initially parsed BAM record.
/// </summary>
/// <remarks>
/// Recognised optional tags: sn, zm, pq, rq, za, rs, np, RG and zs. Unknown tags are ignored.
/// Numeric tag values are parsed with the invariant culture, because they are machine-written
/// and must not depend on the current thread culture (e.g. one using ',' as decimal separator).
/// </remarks>
/// <param name="s">The aligned sequence whose optional fields and query sequence populate this read.</param>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
public PacBioCCSRead(SAMAlignedSequence s)
{
    if (s == null)
    {
        throw new ArgumentNullException("s");
    }

    /* TODO: Converting from binary to string and back is beyond silly...
     * no performance hit worth worrying about at present, but in the future it might be worth
     * going directly from binary to the type rather than through string intermediates */
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    foreach (var v in s.OptionalFields)
    {
        switch (v.Tag)
        {
            case "sn":
                // The first comma-separated element is skipped; presumably it is the
                // array type code of the BAM tag - TODO confirm.
                var snrs = v.Value.Split(',').Skip(1).Select(x => Convert.ToSingle(x, invariant)).ToArray();
                SnrA = snrs[0];
                SnrC = snrs[1];
                SnrG = snrs[2];
                SnrT = snrs[3];
                break;
            case "zm":
                HoleNumber = Convert.ToInt32(v.Value, invariant);
                break;
            case "pq": // This tag is now deprecated by the rq tag
            case "rq":
                ReadQuality = Convert.ToSingle(v.Value, invariant);
                break;
            case "za":
                AvgZscore = Convert.ToSingle(v.Value, invariant);
                break;
            case "rs":
                statusCounts = v.Value.Split(',').Skip(1).Select(x => Convert.ToInt32(x, invariant)).ToArray();
                break;
            case "np":
                NumPasses = Convert.ToInt32(v.Value, invariant);
                break;
            case "RG":
                ReadGroup = v.Value;
                break;
            case "zs":
                ZScores = v.Value.Split(',').Skip(1).Select(x => Convert.ToSingle(x, invariant)).ToArray();
                break;
        }
    }

    // TODO: We should use String.Intern here, but not available in PCL...
    // Movie = String.Intern(s.QuerySequence.ID.Split ('/') [0]);
    Movie = s.QuerySequence.ID.Split('/')[0];

    // Left null when the query sequence is not a QualitativeSequence.
    Sequence = s.QuerySequence as QualitativeSequence;
}
/// <summary>
/// For each FastQ format type, builds a QualitativeSequence from encoded quality scores and
/// checks that the per-position score read back equals the score that was encoded.
/// Phred scores are checked for Sanger and Illumina v1.3/v1.5/v1.8; Solexa scores for Illumina v1.0.
/// </summary>
public void TestGetPhredQualityScore()
{
    // Validate using SangerFormat.
    List<int> sangerScores = GetPharedQualityScoresForSanger();
    byte[] sangerEncoded = GetSangerEncodedQualityScores(sangerScores);
    byte[] sangerSymbols = GetSymbols(sangerEncoded.Length);
    QualitativeSequence sangerSeq = new QualitativeSequence(
        Alphabets.DNA, FastQFormatType.Sanger, sangerSymbols, sangerEncoded);
    for (int pos = 0; pos < sangerSeq.Count; pos++)
    {
        Assert.AreEqual(sangerScores[pos], sangerSeq.GetPhredQualityScore(pos));
    }

    // Validate using illumina v1.3 .
    List<int> v13Scores = GetPharedQualityScoresForIllumina_v1_3();
    byte[] v13Encoded = GetIllumina_v1_3_EncodedQualityScores(v13Scores);
    byte[] v13Symbols = GetSymbols(v13Encoded.Length);
    QualitativeSequence v13Seq = new QualitativeSequence(
        Alphabets.DNA, FastQFormatType.Illumina_v1_3, v13Symbols, v13Encoded);
    for (int pos = 0; pos < v13Seq.Count; pos++)
    {
        Assert.AreEqual(v13Scores[pos], v13Seq.GetPhredQualityScore(pos));
    }

    // Validate using illumina v1.5
    List<int> v15Scores = GetPharedQualityScoresForIllumina_v1_5();
    byte[] v15Encoded = GetIllumina_v1_5_EncodedQualityScores(v15Scores);
    byte[] v15Symbols = GetSymbols(v15Encoded.Length);
    QualitativeSequence v15Seq = new QualitativeSequence(
        Alphabets.DNA, FastQFormatType.Illumina_v1_5, v15Symbols, v15Encoded);
    for (int pos = 0; pos < v15Seq.Count; pos++)
    {
        Assert.AreEqual(v15Scores[pos], v15Seq.GetPhredQualityScore(pos));
    }

    // Validate using illumina v1.8
    List<int> v18Scores = GetPharedQualityScoresForIllumina_v1_8();
    byte[] v18Encoded = GetIllumina_v1_8_EncodedQualityScores(v18Scores);
    byte[] v18Symbols = GetSymbols(v18Encoded.Length);
    QualitativeSequence v18Seq = new QualitativeSequence(
        Alphabets.DNA, FastQFormatType.Illumina_v1_8, v18Symbols, v18Encoded);
    for (int pos = 0; pos < v18Seq.Count; pos++)
    {
        Assert.AreEqual(v18Scores[pos], v18Seq.GetPhredQualityScore(pos));
    }

    // Validate using illumina v1.0 . This format is checked against Solexa scores.
    List<int> solexaScores = GetSolexaQualityScoresForIllumina_v1_0();
    byte[] solexaEncoded = GetIllumina_v1_0_EncodedQualityScores(solexaScores);
    byte[] solexaSymbols = GetSymbols(solexaEncoded.Length);
    QualitativeSequence solexaSeq = new QualitativeSequence(
        Alphabets.DNA, FastQFormatType.Solexa_Illumina_v1_0, solexaSymbols, solexaEncoded);
    for (int pos = 0; pos < solexaSeq.Count; pos++)
    {
        Assert.AreEqual(solexaScores[pos], solexaSeq.GetSolexaQualityScore(pos));
    }
}
/// <summary>
/// Given a pairwise alignment, shift the aligned sequences so
/// that all deletions start as early as possible. For example:
///
/// <code>
/// TTTTAAAATTTT  -> Converts to ->  TTTTAAAATTTT
/// TTTTAA--TTTT                     TTTT--AATTTT
/// </code>
///
/// This function takes a IPairwiseSequenceAlignment and assumes that the first sequence is the reference and second
/// sequence is the query. It returns a new pairwise sequence alignment with all of the indels left aligned as well as a list of variants.
/// </summary>
/// <param name="aln">Aln. Must contain exactly one aligned pair. The second sequence should be of type QualitativeSequence or Sequence</param>
/// <param name="callVariants">callVariants. If true, it will call variants, otherwise the second half of tuple will be null. </param>
/// <returns>
/// A tuple of the new, left-aligned alignment and the called variants (null when
/// <paramref name="callVariants"/> is false).
/// </returns>
/// <exception cref="NullReferenceException">
/// <paramref name="aln"/> is null, or either sequence of the aligned pair is null.
/// </exception>
/// <exception cref="ArgumentException">
/// The alignment does not contain exactly one aligned pair, or the query is neither a
/// Sequence nor a QualitativeSequence.
/// </exception>
public static Tuple<IPairwiseSequenceAlignment, List<Variant>> LeftAlignIndelsAndCallVariants(IPairwiseSequenceAlignment aln, bool callVariants = true)
{
    // NOTE: the exception types thrown by these guards are kept unchanged so that
    // existing callers which catch them keep working.
    if (aln == null)
    {
        throw new NullReferenceException("aln");
    }

    if (aln.PairwiseAlignedSequences == null || aln.PairwiseAlignedSequences.Count != 1)
    {
        throw new ArgumentException("The pairwise aligned sequence should only have one alignment");
    }

    var frstAln = aln.PairwiseAlignedSequences.First();
    var seq1 = frstAln.FirstSequence;
    var seq2 = frstAln.SecondSequence;
    if (seq1 == null)
    {
        throw new NullReferenceException("seq1");
    }
    else if (seq2 == null)
    {
        throw new NullReferenceException("seq2");
    }

    //TODO: Might implement an ambiguity check later.
#if FALSE
    if (seq1.Alphabet.HasAmbiguity || seq2.Alphabet.HasAmbiguity)
    {
        throw new ArgumentException ("Cannot left align sequences with ambiguous symbols.");
    }
#endif

    // Note we have to copy unless we can guarantee the array will not be mutated.
    byte[] refseq = seq1.ToArray();
    ISequence newQuery;
    List<Variant> variants = null;

    var qualQuery = seq2 as QualitativeSequence;
    var plainQuery = seq2 as Sequence;

    if (qualQuery != null)
    {
        // Call variants for a qualitative sequence: pair every base with its quality value.
        var query = Enumerable.Zip(qualQuery, qualQuery.GetQualityScores(), (bp, qv) => new BPandQV(bp, (byte)qv, false)).ToArray();
        AlignmentUtils.LeftAlignIndels(refseq, query);
        AlignmentUtils.VerifyNoGapsOnEnds(refseq, query);
        if (callVariants)
        {
            variants = VariantCaller.CallVariants(refseq, query, seq2.IsMarkedAsReverseComplement());
        }

        var newQueryQS = new QualitativeSequence(
            qualQuery.Alphabet,
            qualQuery.FormatType,
            query.Select(z => z.BP).ToArray(),
            query.Select(p => p.QV).ToArray(),
            false);
        newQueryQS.Metadata = seq2.Metadata;
        newQuery = newQueryQS;
    }
    else if (plainQuery != null)
    {
        // For a sequence with no QV values, every base gets a quality of 0.
        var query = plainQuery.Select(v => new BPandQV(v, 0, false)).ToArray();
        AlignmentUtils.LeftAlignIndels(refseq, query);
        AlignmentUtils.VerifyNoGapsOnEnds(refseq, query);

        // ISequence does not have a setable metadata, so build the concrete type first.
        var newQueryS = new Sequence(plainQuery.Alphabet, query.Select(z => z.BP).ToArray(), false);
        newQueryS.Metadata = seq2.Metadata;
        if (callVariants)
        {
            variants = VariantCaller.CallVariants(refseq, query, seq2.IsMarkedAsReverseComplement());
        }

        newQuery = newQueryS;
    }
    else
    {
        throw new ArgumentException("Can only left align indels if the query sequence is of type Sequence or QualitativeSequence.");
    }

    // variants stays null when callVariants is false; without the null check the
    // loop below threw a NullReferenceException in that case.
    if (variants != null && aln.FirstSequence != null && aln.FirstSequence.ID != null)
    {
        foreach (var v in variants)
        {
            v.RefName = aln.FirstSequence.ID;
        }
    }

    var newRef = new Sequence(seq1.Alphabet, refseq, false);
    newRef.ID = seq1.ID;
    newRef.Metadata = seq1.Metadata;

    newQuery.ID = seq2.ID;

    var newaln = new PairwiseSequenceAlignment(aln.FirstSequence, aln.SecondSequence);
    var pas = new PairwiseAlignedSequence();
    pas.FirstSequence = newRef;
    pas.SecondSequence = newQuery;
    newaln.Add(pas);
    return new Tuple<IPairwiseSequenceAlignment, List<Variant>>(newaln, variants);
}
/// <summary>
/// Convert given range containing quality values to a QualitativeSequence object
/// (Sanger format) that carries the symbols, alphabet and ID of the given sequence.
/// </summary>
/// <param name="range">Range of cells holding one quality score per sequence symbol</param>
/// <param name="sequence">Sequence object</param>
/// <returns>QualitativeSequence Object</returns>
/// <exception cref="ArgumentNullException"><paramref name="sequence"/> is null.</exception>
/// <exception cref="FormatException">
/// There are fewer quality values than sequence symbols, or a value (including an empty cell)
/// cannot be parsed as a byte.
/// </exception>
public static QualitativeSequence RangeToQualitativeSequence(List<Range> range, ISequence sequence)
{
    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    string[] rangeData = FlattenToArray(range);

    // see if we have enough quality scores to map with the sequence
    if (rangeData.Length < sequence.Count)
    {
        throw new FormatException(Properties.Resources.ExportFastQ_SequenceAndScoresNotMapping);
    }

    byte[] sequenceSymbols = new byte[sequence.Count];
    byte[] sequencequalityValues = new byte[sequence.Count];

    long curIndex = 0;
    foreach (byte sequenceSymbol in sequence)
    {
        // TryParse returns false for a null entry, so an empty cell is reported as an
        // invalid quality score rather than surfacing as a NullReferenceException.
        byte currentQualityScore;
        if (!byte.TryParse(rangeData[curIndex], out currentQualityScore))
        {
            throw new FormatException(Properties.Resources.ExportFasQ_InvalidQualityScore);
        }

        sequenceSymbols[curIndex] = sequenceSymbol;
        sequencequalityValues[curIndex] = currentQualityScore;
        curIndex++;
    }

    QualitativeSequence qualitativeSequence = new QualitativeSequence(
        sequence.Alphabet,
        FastQFormatType.Sanger,
        sequenceSymbols,
        sequencequalityValues)
    {
        ID = sequence.ID
    };

    return qualitativeSequence;
}
/// <summary>
/// Converts the quality scores of a sequence into a string grid and writes that grid to the worksheet.
/// </summary>
/// <param name="sequence">Sequence whose quality scores are written</param>
/// <param name="worksheet">Worksheet that receives the values</param>
/// <param name="startingRow">Row at which writing starts</param>
/// <param name="startingColumn">Column at which writing starts</param>
/// <param name="dataRange">Receives the range returned by WriteToSheet for the written values</param>
/// <returns><paramref name="startingRow"/> plus the number of rows in the quality grid, plus one</returns>
private int WriteQualityValues(
    QualitativeSequence sequence,
    Worksheet worksheet,
    int startingRow,
    int startingColumn,
    out Range dataRange)
{
    string[,] scoreGrid = ExcelImportFormatter.FastQQualityValuesToRange(sequence, maxNumberOfCharacters);
    int gridRows = scoreGrid.GetLength(0);

    dataRange = this.WriteToSheet(worksheet, scoreGrid, startingRow, startingColumn);

    return startingRow + gridRows + 1;
}
/// <summary>
/// Builds a new QualitativeSequence in which every symbol is replaced by its complement
/// according to this sequence's alphabet. Quality scores, format type, ID and metadata
/// are carried over from this instance.
/// </summary>
/// <returns>The complemented sequence.</returns>
/// <exception cref="NotSupportedException">
/// The alphabet has no complement for one of the symbols.
/// </exception>
public ISequence GetComplementedSequence()
{
    long symbolCount = this.sequenceData.GetLongLength();
    byte[] complemented = new byte[symbolCount];

    for (long position = 0; position < symbolCount; position++)
    {
        byte original = this.sequenceData[position];
        byte complement;
        if (this.Alphabet.TryGetComplementSymbol(original, out complement))
        {
            complemented[position] = complement;
            continue;
        }

        throw new NotSupportedException(string.Format(
            CultureInfo.CurrentUICulture,
            Properties.Resource.ComplementNotSupportedByalphabet,
            (char)original,
            this.Alphabet.Name));
    }

    // Complementing does not reorder positions, so the scores are passed through as they are.
    sbyte[] scores = this.qualityScores;
    return new QualitativeSequence(this.Alphabet, this.FormatType, complemented, scores, false)
    {
        ID = this.ID,
        metadata = this.metadata
    };
}
/// <summary>
/// Builds a new QualitativeSequence whose symbols and quality scores are those of this
/// instance in reverse order. Alphabet, format type, ID and metadata are carried over.
/// </summary>
/// <returns>The reversed sequence.</returns>
public ISequence GetReversedSequence()
{
    long symbolCount = this.sequenceData.GetLongLength();
    byte[] reversedSymbols = new byte[symbolCount];

    long scoreCount = this.qualityScores.GetLongLength();
    sbyte[] reversedScores = new sbyte[scoreCount];

    // Walk the targets forwards while reading both sources from their last element backwards.
    long symbolSource = symbolCount - 1;
    long scoreSource = scoreCount - 1;
    for (long target = 0; target < symbolCount; target++, symbolSource--, scoreSource--)
    {
        reversedSymbols[target] = this.sequenceData[symbolSource];
        reversedScores[target] = this.qualityScores[scoreSource];
    }

    return new QualitativeSequence(this.Alphabet, this.FormatType, reversedSymbols, reversedScores, false)
    {
        ID = this.ID,
        metadata = this.metadata
    };
}
/// <summary>
/// Creates a new QualitativeSequence with the same symbols, ID and metadata as this instance,
/// whose quality scores have been converted from the current format type to the requested one.
/// </summary>
/// <param name="formatType">FastQ format type to convert to.</param>
/// <returns>A new QualitativeSequence in the requested format.</returns>
public QualitativeSequence ConvertTo(FastQFormatType formatType)
{
    sbyte[] targetScores = ConvertQualityScores(this.FormatType, formatType, this.qualityScores);

    return new QualitativeSequence(this.Alphabet, formatType, this.sequenceData, targetScores, false)
    {
        ID = this.ID,
        metadata = this.metadata
    };
}
/// <summary>
/// Makes a single pass over sequenceDict. For each list of reads it takes the first entry,
/// records its mapping quality and its mean read quality, and appends the list size to
/// the frequencies collection. Afterwards the overall alignment and read qualities are set to
/// the averages of those per-list values, rounded to two decimals (0 when there are none).
/// </summary>
private void IterateSequenceDict()
{
    // One slot per distinct read.
    double[] mapQualities = new double[CountDistinct];
    double[] meanReadQualities = new double[CountDistinct];

    int slot = 0;
    foreach (List<SAMAlignedSequence> readGroup in sequenceDict.Values)
    {
        SAMAlignedSequence representative = readGroup[0];

        // Alignment quality of the representative read.
        mapQualities[slot] = representative.MapQ;

        // Read quality: mean of the representative's quality scores.
        QualitativeSequence read = new QualitativeSequence(
            SAMDnaAlphabet.Instance,
            FastQFormatType.Sanger,
            GetSequence(representative),
            GetReadQuality(representative));
        meanReadQualities[slot] = read.GetQualityScores().Average();
        slot++;

        // Number of occurrences of this distinct read.
        frequencies.Add(readGroup.Count);
    }

    alignmentQuality = mapQualities.Length > 0
        ? Math.Round(mapQualities.Average(), 2)
        : 0;
    readQuality = meanReadQualities.Length > 0
        ? Math.Round(meanReadQualities.Average(), 2)
        : 0;
}
/* Cannot override, because base is not virtual
public override IEnumerable<QualitativeSequence> Parse(StreamReader reader)
{
    FastQFormatType formatType = this.FormatType;
    do
    {
        var seq = ParseOne(reader, formatType);
        if (seq != null)
            yield return ParseHeader(seq);
    }
    while (!reader.EndOfStream);
}*/

/// <summary>
/// Parses the ID of a sequence as an Illumina read identifier and stores the individual
/// fields in the sequence's Metadata. The pre-1.8 pattern is tried first, then the 1.8+ pattern.
/// </summary>
/// <remarks>
/// Fields are read from the regex capture groups (Groups[1] onwards; Groups[0] is the whole match).
/// Match.Captures only ever holds the single whole-match capture, so indexing it past 0 throws.
/// NOTE(review): this assumes pre18Regex declares 7 groups and post18Regex 11 groups in the
/// field order used below - confirm against the regex definitions.
/// </remarks>
/// <param name="seq">Sequence whose ID is parsed and whose Metadata is populated.</param>
/// <returns>The same sequence instance, with metadata filled in.</returns>
/// <exception cref="FileFormatException">The ID matches neither Illumina identifier pattern.</exception>
private QualitativeSequence ParseHeader(QualitativeSequence seq)
{
    Match m = pre18Regex.Match(seq.ID);
    if (m.Success)
    {
        seq.Metadata["Instrument"] = m.Groups[1].Value;
        seq.Metadata["Lane"] = Int32.Parse(m.Groups[2].Value);
        seq.Metadata["Tile"] = Int32.Parse(m.Groups[3].Value);
        seq.Metadata["X"] = Int32.Parse(m.Groups[4].Value);
        seq.Metadata["Y"] = Int32.Parse(m.Groups[5].Value);
        seq.Metadata["Index"] = Int32.Parse(m.Groups[6].Value);
        seq.Metadata["PairMember"] = Int32.Parse(m.Groups[7].Value);
        return seq;
    }

    m = post18Regex.Match(seq.ID);
    if (m.Success)
    {
        seq.Metadata["Instrument"] = m.Groups[1].Value;
        seq.Metadata["Run"] = Int32.Parse(m.Groups[2].Value);
        seq.Metadata["FlowCell"] = m.Groups[3].Value;
        seq.Metadata["Lane"] = Int32.Parse(m.Groups[4].Value);
        seq.Metadata["Tile"] = Int32.Parse(m.Groups[5].Value);
        seq.Metadata["X"] = Int32.Parse(m.Groups[6].Value);
        seq.Metadata["Y"] = Int32.Parse(m.Groups[7].Value);
        seq.Metadata["PairMember"] = Int32.Parse(m.Groups[8].Value);
        seq.Metadata["IsFiltered"] = m.Groups[9].Value == "Y";
        seq.Metadata["ControlBits"] = Int32.Parse(m.Groups[10].Value);
        seq.Metadata["IndexSequence"] = m.Groups[11].Value;
        return seq;
    }

    throw new FileFormatException("Sequence identifier not in Illumina format");
}