Exemple #1
0
        /// <summary>
        /// Replaces the second (query) sequence of the first aligned pair in <paramref name="aln"/>
        /// with a <see cref="QualitativeSequence"/> built over the DNA alphabet in Sanger format.
        /// Gap positions ('-') are given a quality value of 0; every other position takes the
        /// next unused value from <paramref name="qualScores"/>, in order.
        /// </summary>
        /// <param name="aln">Alignment whose first aligned pair is modified in place. Its second sequence must be a <see cref="Sequence"/>.</param>
        /// <param name="qualScores">Quality values for the non-gap positions of the query, in query order.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="aln"/> or <paramref name="qualScores"/> is null.</exception>
        /// <exception cref="System.ArgumentException">The second sequence is not a <see cref="Sequence"/>, or fewer quality values were supplied than there are non-gap positions.</exception>
        public static void ConvertAlignedSequenceToQualSeq(IPairwiseSequenceAlignment aln, int[] qualScores) {
            if (aln == null) {
                throw new System.ArgumentNullException ("aln");
            }
            if (qualScores == null) {
                throw new System.ArgumentNullException ("qualScores");
            }

            var alignedPair = aln.PairwiseAlignedSequences [0];
            var q = alignedPair.SecondSequence as Sequence;
            if (q == null) {
                // The 'as' cast yields null for any other ISequence implementation; fail with a clear message
                // instead of a NullReferenceException on the first member access.
                throw new System.ArgumentException ("The second sequence of the alignment must be of type Sequence.", "aln");
            }

            var qvs = new int[q.Count];
            int queryPos = 0;
            for (int i = 0; i < qvs.Length; i++) {
                if (q [i] == '-') {
                    // Gaps carry no base call, so they get the lowest quality value.
                    qvs [i] = 0;
                } else {
                    if (queryPos >= qualScores.Length) {
                        throw new System.ArgumentException ("Fewer quality scores were supplied than there are non-gap positions in the aligned query.", "qualScores");
                    }
                    qvs [i] = qualScores [queryPos++];
                }
            }
            var qseq = new QualitativeSequence (DnaAlphabet.Instance, FastQFormatType.Sanger, q.ToArray (), qvs, false);

            alignedPair.SecondSequence = qseq;
        }
        public void TestConstructorWithByteArray()
        {
            // Builds a QualitativeSequence from raw symbol bytes and encoded quality-score bytes, then checks
            // that enumeration, Alphabet, Count, the indexer and GetEncodedQualityScores() all reflect the inputs.
            byte[] sequenceData = { (byte)'C', (byte)'A', (byte)'A', (byte)'G', (byte)'C', (byte)'T' };
            byte[] qualityScores = { 65, 65, 65, 65, 110, 125 };

            string expectedSequence = "CAAGCT";
            QualitativeSequence qualitativeSequence = new QualitativeSequence(Alphabets.DNA, FastQFormatType.Illumina_v1_3, sequenceData, qualityScores);

            // Enumerating the sequence must yield the symbols in their original order.
            string actual = "";
            foreach (byte bt in qualitativeSequence)
            {
                actual += (char)bt;
            }
            Assert.AreEqual(expectedSequence, actual);

            // Expected value goes first so that failure messages report expected/actual the right way round.
            Assert.AreEqual(Alphabets.DNA, qualitativeSequence.Alphabet);
            Assert.AreEqual(6, qualitativeSequence.Count);

            // Test for indexer
            for (int position = 0; position < sequenceData.Length; position++)
            {
                Assert.AreEqual(sequenceData[position], qualitativeSequence[position]);
            }

            // The encoded quality scores must be returned unchanged.
            int index = 0;
            foreach (byte qualityScore in qualitativeSequence.GetEncodedQualityScores())
            {
                Assert.AreEqual(qualityScores[index++], qualityScore);
            }

            // Without this check the loop above would pass vacuously if fewer scores were returned.
            Assert.AreEqual(qualityScores.Length, index);
        }
Exemple #3
0
        /// <summary>
        /// Lays out the encoded quality scores of a sequence in a two-dimensional string array,
        /// filling it row by row with at most <paramref name="maxColumns"/> values per row.
        /// Each score is formatted with the invariant culture; cells past the last score are left null.
        /// </summary>
        /// <param name="sequence">Sequence which has the quality values</param>
        /// <param name="maxColumns">Maximum number of columns per row; must be greater than zero</param>
        /// <returns>Two-dimensional string array with the quality values</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sequence"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="maxColumns"/> is zero or negative.</exception>
        public static string[,] FastQQualityValuesToRange(QualitativeSequence sequence, int maxColumns)
        {
            if (sequence == null)
            {
                throw new ArgumentNullException("sequence");
            }

            if (maxColumns <= 0)
            {
                // Previously zero caused a DivideByZeroException and negative values an obscure array-size failure.
                throw new ArgumentOutOfRangeException("maxColumns", "The maximum number of columns must be greater than zero.");
            }

            var qualityScoreArray = sequence.GetEncodedQualityScores();

            // Integer ceiling division: number of rows needed to hold every score.
            long scoreCount = qualityScoreArray.Length;
            long rowCount = (scoreCount + maxColumns - 1) / maxColumns;
            long columnCount = sequence.Count > maxColumns ? maxColumns : sequence.Count;
            string[,] rangeData = new string[rowCount, columnCount];

            int count = 0;

            for (int row = 0; row < rowCount; row++)
            {
                // The last row may be only partially filled, hence the extra bound on count.
                for (int col = 0; col < columnCount && count < qualityScoreArray.Length; col++, count++)
                {
                    rangeData[row, col] = qualityScoreArray[count].ToString(CultureInfo.InvariantCulture);
                }
            }

            return rangeData;
        }
Exemple #4
0
 /// <summary>
 /// Initializes a new instance of the <see cref="Bio.IO.PacBio.PacBioCCSRead"/> class from an initially parsed BAM record.
 /// The optional fields of the record are scanned by tag and copied into the matching members;
 /// the movie name is taken from the part of the query sequence ID before the first '/'.
 /// </summary>
 /// <param name="s">Aligned sequence whose optional fields and query sequence describe the read.</param>
 /// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
 public PacBioCCSRead (SAMAlignedSequence s)
 {
     if (s == null) {
         throw new ArgumentNullException ("s");
     }

     // Tag values are machine-written text, so they must be parsed with the invariant culture;
     // the culture-sensitive overloads misread "0.85" under locales that use ',' as decimal separator.
     var invariant = System.Globalization.CultureInfo.InvariantCulture;

     /* TODO: Converting from binary to string and back is beyond silly...
      * no performance hit worth worrying about at present, but in the future it might be worth
      * going directly from binary to the type rather than through string intermediates */
     foreach (var v in s.OptionalFields) {
         if (v.Tag == "sn") {
             // Skip (1) drops the first comma-separated token (presumably the array element-type code - TODO confirm).
             var snrs = v.Value.Split (',').Skip (1).Select (x => Convert.ToSingle (x, invariant)).ToArray ();
             SnrA = snrs [0];
             SnrC = snrs [1];
             SnrG = snrs [2];
             SnrT = snrs [3];
         } else if (v.Tag == "zm") {
             HoleNumber = Convert.ToInt32 (v.Value, invariant);
         } else if (v.Tag == "pq") {
             // This tag is now deprecated by the rq tag
             ReadQuality = Convert.ToSingle (v.Value, invariant);
         } else if (v.Tag == "rq") {
             ReadQuality = Convert.ToSingle (v.Value, invariant);
         } else if (v.Tag == "za") {
             AvgZscore = Convert.ToSingle (v.Value, invariant);
         } else if (v.Tag == "rs") {
             statusCounts = v.Value.Split (',').Skip (1).Select (x => Convert.ToInt32 (x, invariant)).ToArray ();
         } else if (v.Tag == "np") {
             NumPasses = Convert.ToInt32 (v.Value, invariant);
         } else if (v.Tag == "RG") {
             ReadGroup = v.Value;
         } else if (v.Tag == "zs") {
             ZScores = v.Value.Split (',').Skip (1).Select (x => Convert.ToSingle (x, invariant)).ToArray ();
         }
     }
     // TODO: We should use String.Intern here, but not available in PCL...
     // Movie = String.Intern(s.QuerySequence.ID.Split ('/') [0]);
     Movie = s.QuerySequence.ID.Split ('/') [0];
     // Sequence is left null when the query sequence is not a QualitativeSequence.
     Sequence = s.QuerySequence as QualitativeSequence;
 }
        public void TestGetPhredQualityScore()
        {
            // For each FastQ format: encode a list of known scores, build a QualitativeSequence from the
            // encoded bytes, and check that every position decodes back to the score it was built from.
            QualitativeSequence sequenceUnderTest;
            byte[] bases;
            List<int> expectedPhred;

            // Sanger format.
            expectedPhred = GetPharedQualityScoresForSanger();
            byte[] sangerEncoded = GetSangerEncodedQualityScores(expectedPhred);
            bases = GetSymbols(sangerEncoded.Length);
            sequenceUnderTest = new QualitativeSequence(Alphabets.DNA, FastQFormatType.Sanger, bases, sangerEncoded);
            for (int pos = 0; pos < sequenceUnderTest.Count; pos++)
            {
                Assert.AreEqual(expectedPhred[pos], sequenceUnderTest.GetPhredQualityScore(pos));
            }

            // Illumina v1.3 format.
            expectedPhred = GetPharedQualityScoresForIllumina_v1_3();
            byte[] illumina13Encoded = GetIllumina_v1_3_EncodedQualityScores(expectedPhred);
            bases = GetSymbols(illumina13Encoded.Length);
            sequenceUnderTest = new QualitativeSequence(Alphabets.DNA, FastQFormatType.Illumina_v1_3, bases, illumina13Encoded);
            for (int pos = 0; pos < sequenceUnderTest.Count; pos++)
            {
                Assert.AreEqual(expectedPhred[pos], sequenceUnderTest.GetPhredQualityScore(pos));
            }

            // Illumina v1.5 format.
            expectedPhred = GetPharedQualityScoresForIllumina_v1_5();
            byte[] illumina15Encoded = GetIllumina_v1_5_EncodedQualityScores(expectedPhred);
            bases = GetSymbols(illumina15Encoded.Length);
            sequenceUnderTest = new QualitativeSequence(Alphabets.DNA, FastQFormatType.Illumina_v1_5, bases, illumina15Encoded);
            for (int pos = 0; pos < sequenceUnderTest.Count; pos++)
            {
                Assert.AreEqual(expectedPhred[pos], sequenceUnderTest.GetPhredQualityScore(pos));
            }

            // Illumina v1.8 format.
            expectedPhred = GetPharedQualityScoresForIllumina_v1_8();
            byte[] illumina18Encoded = GetIllumina_v1_8_EncodedQualityScores(expectedPhred);
            bases = GetSymbols(illumina18Encoded.Length);
            sequenceUnderTest = new QualitativeSequence(Alphabets.DNA, FastQFormatType.Illumina_v1_8, bases, illumina18Encoded);
            for (int pos = 0; pos < sequenceUnderTest.Count; pos++)
            {
                Assert.AreEqual(expectedPhred[pos], sequenceUnderTest.GetPhredQualityScore(pos));
            }

            // Solexa / Illumina v1.0 format: checked through the Solexa accessor rather than the Phred one.
            List<int> expectedSolexa = GetSolexaQualityScoresForIllumina_v1_0();
            byte[] illumina10Encoded = GetIllumina_v1_0_EncodedQualityScores(expectedSolexa);
            bases = GetSymbols(illumina10Encoded.Length);
            sequenceUnderTest = new QualitativeSequence(Alphabets.DNA, FastQFormatType.Solexa_Illumina_v1_0, bases, illumina10Encoded);
            for (int pos = 0; pos < sequenceUnderTest.Count; pos++)
            {
                Assert.AreEqual(expectedSolexa[pos], sequenceUnderTest.GetSolexaQualityScore(pos));
            }
        }
Exemple #6
0
        /// <summary>
        /// Given two byte arrays representing a pairwise alignment, shift them so 
        /// that all deletions start as early as possible.  For example:
        /// 
        /// <code>
        /// TTTTAAAATTTT  -> Converts to ->  TTTTAAAATTTT
        /// TTTTAA--TTTT                     TTTT--AATTTT
        /// </code>
        /// 
        /// This function takes a IPairwiseSequenceAlignment and assumes that the first sequence is the reference and second
        /// sequence is the query.  It returns a new pairwise sequence alignment with all of the indels left aligned as well as a list of variants.
        /// The input alignment is not modified; the reference bytes are copied before being shifted.
        /// </summary>
        /// <param name="aln">Aln. Must contain exactly one pairwise aligned sequence. The second sequence should be of type QualitativeSequence or Sequence</param>
        /// <param name="callVariants">callVariants.  If true, it will call variants, otherwise the second half of tuple will be null. </param>
        /// <returns>A tuple of the new left-aligned alignment and the called variants (null when <paramref name="callVariants"/> is false).</returns>
        /// <exception cref="NullReferenceException"><paramref name="aln"/> or either of its aligned sequences is null.</exception>
        /// <exception cref="ArgumentException">The alignment does not hold exactly one aligned pair, or the query is neither a Sequence nor a QualitativeSequence.</exception>
        public static Tuple<IPairwiseSequenceAlignment, List<Variant>> LeftAlignIndelsAndCallVariants(IPairwiseSequenceAlignment aln, bool callVariants = true) {

            // The NullReferenceException type is kept on purpose so existing callers' catch blocks keep working.
            if (aln == null) {
                throw new NullReferenceException ("aln");
            }
            if (aln.PairwiseAlignedSequences == null || aln.PairwiseAlignedSequences.Count != 1) {
                throw new ArgumentException ("The pairwise aligned sequence should only have one alignment");
            }
            var frstAln = aln.PairwiseAlignedSequences.First ();
            var seq1 = frstAln.FirstSequence;
            var seq2 = frstAln.SecondSequence;
            if (seq1 == null) {
                throw new NullReferenceException ("seq1");
            } else if (seq2 == null) {
                throw new NullReferenceException ("seq2");
            }

            //TODO: Might implement an ambiguity check later.
            #if FALSE
            if (seq1.Alphabet.HasAmbiguity || seq2.Alphabet.HasAmbiguity) {
                throw new ArgumentException ("Cannot left align sequences with ambiguous symbols.");
            }
            #endif

            // Note we have to copy unless we can guarantee the array will not be mutated.
            byte[] refseq = seq1.ToArray ();
            ISequence newQuery;
            List<Variant> variants = null;

            var qualQuery = seq2 as QualitativeSequence;
            var plainQuery = seq2 as Sequence;
            if (qualQuery != null) {
                // Query with quality values: pair every base with its QV so both move together when shifted.
                var query = Enumerable.Zip (qualQuery, qualQuery.GetQualityScores (), (bp, qv) => new BPandQV (bp, (byte)qv, false)).ToArray ();
                AlignmentUtils.LeftAlignIndels (refseq, query);
                AlignmentUtils.VerifyNoGapsOnEnds (refseq, query);
                if (callVariants) {
                    variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement());
                }
                var newQueryQS = new QualitativeSequence (qualQuery.Alphabet, 
                    qualQuery.FormatType,
                    query.Select (z => z.BP).ToArray (),
                    query.Select (p => p.QV).ToArray (),
                    false);
                newQueryQS.Metadata = seq2.Metadata;
                newQuery = newQueryQS;
            } else if (plainQuery != null) {
                // For a sequence with no QV values every base is paired with a QV of 0.
                var query = plainQuery.Select (v => new BPandQV (v, 0, false)).ToArray();
                AlignmentUtils.LeftAlignIndels (refseq, query);
                AlignmentUtils.VerifyNoGapsOnEnds (refseq, query);
                // ISequence does not have a setable metadata
                var newQueryS = new Sequence(plainQuery.Alphabet, query.Select(z=>z.BP).ToArray(), false);
                newQueryS.Metadata = seq2.Metadata;
                if (callVariants) {
                    variants = VariantCaller.CallVariants (refseq, query, seq2.IsMarkedAsReverseComplement());
                }
                newQuery = newQueryS;
            } else {
                throw new ArgumentException ("Can only left align indels if the query sequence is of type Sequence or QualitativeSequence.");
            }

            // variants stays null when callVariants is false; iterating it unguarded used to throw.
            if (variants != null && aln.FirstSequence != null && aln.FirstSequence.ID != null) {
                foreach (var v in variants) {
                    v.RefName = aln.FirstSequence.ID;
                }
            }

            var newRef = new Sequence (seq1.Alphabet, refseq, false);
            newRef.ID = seq1.ID;
            newRef.Metadata = seq1.Metadata;

            newQuery.ID = seq2.ID;

            var newaln = new PairwiseSequenceAlignment (aln.FirstSequence, aln.SecondSequence);
            var pas = new PairwiseAlignedSequence ();
            pas.FirstSequence = newRef;
            pas.SecondSequence = newQuery;
            newaln.Add (pas);
            return new Tuple<IPairwiseSequenceAlignment, List<Variant>> (newaln, variants);
        }
Exemple #7
0
        /// <summary>
        /// Builds a Sanger-format QualitativeSequence by pairing each symbol of <paramref name="sequence"/>
        /// with the quality value found at the same position in the flattened <paramref name="range"/>.
        /// The ID of the source sequence is copied to the result.
        /// </summary>
        /// <param name="range">Range of cells holding the quality values</param>
        /// <param name="sequence">Sequence object supplying the symbols, alphabet and ID</param>
        /// <returns>QualitativeSequence Object</returns>
        /// <exception cref="ArgumentNullException"><paramref name="sequence"/> is null.</exception>
        /// <exception cref="FormatException">
        /// There are fewer quality values than symbols, or a value is missing or cannot be parsed as a byte.
        /// </exception>
        public static QualitativeSequence RangeToQualitativeSequence(List<Range> range, ISequence sequence)
        {
            if (sequence == null)
            {
                throw new ArgumentNullException("sequence");
            }

            string[] rangeData = FlattenToArray(range);

            // see if we have enough quality scores to map with the sequence
            if (rangeData.Length < sequence.Count)
            {
                throw new FormatException(Properties.Resources.ExportFastQ_SequenceAndScoresNotMapping);
            }

            byte[] sequenceSymbols = new byte[sequence.Count];
            byte[] sequencequalityValues = new byte[sequence.Count];

            long curIndex = 0;
            foreach (byte sequenceSymbol in sequence)
            {
                byte currentQualityScore;

                // byte.TryParse returns false for a null entry, so an empty cell is reported as an
                // invalid quality score instead of surfacing as a NullReferenceException.
                if (!byte.TryParse(rangeData[curIndex], out currentQualityScore))
                {
                    throw new FormatException(Properties.Resources.ExportFasQ_InvalidQualityScore);
                }

                sequenceSymbols[curIndex] = sequenceSymbol;
                sequencequalityValues[curIndex] = currentQualityScore;
                curIndex++;
            }

            QualitativeSequence qualitativeSequence = new QualitativeSequence(sequence.Alphabet, FastQFormatType.Sanger, sequenceSymbols, sequencequalityValues)
            {
                ID = sequence.ID
            };

            return qualitativeSequence;
        }
Exemple #8
0
        /// <summary>
        /// Converts the quality scores of a sequence into a grid of strings and writes that grid to the worksheet.
        /// </summary>
        /// <param name="sequence">Sequence whose quality scores are written</param>
        /// <param name="worksheet">Worksheet that receives the values</param>
        /// <param name="startingRow">Row at which writing starts</param>
        /// <param name="startingColumn">Column at which writing starts</param>
        /// <param name="dataRange">Receives the range returned by the sheet writer</param>
        /// <returns>The starting row plus the number of rows in the quality grid, plus one</returns>
        private int WriteQualityValues(
            QualitativeSequence sequence,
            Worksheet worksheet,
            int startingRow,
            int startingColumn,
            out Range dataRange)
        {
            string[,] scoreGrid = ExcelImportFormatter.FastQQualityValuesToRange(sequence, maxNumberOfCharacters);
            int gridRowCount = scoreGrid.GetLength(0);

            dataRange = this.WriteToSheet(worksheet, scoreGrid, startingRow, startingColumn);

            int nextRow = startingRow + gridRowCount + 1;
            return nextRow;
        }
Exemple #9
0
        /// <summary>
        /// Creates a QualitativeSequence whose symbols are the complements of this sequence's symbols,
        /// as defined by its alphabet. Quality scores, format type, ID and metadata are carried over as they are.
        /// </summary>
        /// <returns>The complemented sequence.</returns>
        /// <exception cref="NotSupportedException">The alphabet has no complement for one of the symbols.</exception>
        public ISequence GetComplementedSequence()
        {
            long symbolCount = this.sequenceData.GetLongLength();
            byte[] complementedSymbols = new byte[symbolCount];

            for (long position = 0; position < symbolCount; position++)
            {
                byte sourceSymbol = this.sequenceData[position];
                byte complement;

                if (this.Alphabet.TryGetComplementSymbol(sourceSymbol, out complement))
                {
                    complementedSymbols[position] = complement;
                    continue;
                }

                throw new NotSupportedException(string.Format(CultureInfo.CurrentUICulture, Properties.Resource.ComplementNotSupportedByalphabet, (char)sourceSymbol, this.Alphabet.Name));
            }

            // Complementing does not reorder positions, so the existing quality-score array is handed over directly.
            QualitativeSequence complemented = new QualitativeSequence(this.Alphabet, this.FormatType, complementedSymbols, this.qualityScores, false);
            complemented.ID = this.ID;
            complemented.metadata = this.metadata;

            return complemented;
        }
Exemple #10
0
        /// <summary>
        /// Creates a QualitativeSequence holding this sequence's symbols and quality scores in reverse order.
        /// Alphabet, format type, ID and metadata are carried over as they are.
        /// </summary>
        /// <returns>The reversed sequence.</returns>
        public ISequence GetReversedSequence()
        {
            long symbolCount = this.sequenceData.GetLongLength();
            long scoreCount = this.qualityScores.GetLongLength();

            byte[] reversedSymbols = new byte[symbolCount];
            sbyte[] reversedScores = new sbyte[scoreCount];

            // Walk the targets forwards while both sources are read from their own last element backwards.
            long symbolSource = symbolCount - 1;
            long scoreSource = scoreCount - 1;
            for (long target = 0; target < symbolCount; target++, symbolSource--, scoreSource--)
            {
                reversedSymbols[target] = this.sequenceData[symbolSource];
                reversedScores[target] = this.qualityScores[scoreSource];
            }

            QualitativeSequence reversed = new QualitativeSequence(this.Alphabet, this.FormatType, reversedSymbols, reversedScores, false);
            reversed.ID = this.ID;
            reversed.metadata = this.metadata;

            return reversed;
        }
Exemple #11
0
        /// <summary>
        /// Produces a new QualitativeSequence with the same symbols, ID and metadata as this one,
        /// whose quality scores have been converted from the current format type to the requested one.
        /// </summary>
        /// <param name="formatType">FastQ format type to convert to.</param>
        /// <returns>The converted sequence.</returns>
        public QualitativeSequence ConvertTo(FastQFormatType formatType)
        {
            FastQFormatType sourceFormat = this.FormatType;
            sbyte[] targetScores = ConvertQualityScores(sourceFormat, formatType, this.qualityScores);

            QualitativeSequence converted = new QualitativeSequence(
                this.Alphabet,
                formatType,
                this.sequenceData,
                targetScores,
                false);

            converted.ID = this.ID;
            converted.metadata = this.metadata;

            return converted;
        }
        /// <summary>
        /// Iterates through sequenceDictionary once only and, using the first sequence of each list,
        /// sets the average alignment quality and the average read quality (both rounded to two decimals).
        /// Per-read frequencies are not recorded here at present.
        /// </summary>
        private void IterateSequenceDict()
        {
            double[] alignmentQualities = new double[CountDistinct]; // one for every sequence in the map, just get its qualities
            double[] readQualities = new double[CountDistinct];

            int i = 0;
            foreach (List<SAMAlignedSequence> seqList in sequenceDict.Values)
            {
                // The first entry of each list stands in for the whole list.
                SAMAlignedSequence representative = seqList[0];

                // Alignment qualities
                alignmentQualities[i] = representative.MapQ;

                // Read qualities: mean of the per-base quality scores of the read
                QualitativeSequence qSeq = new QualitativeSequence(SAMDnaAlphabet.Instance, FastQFormatType.Sanger, GetSequence(representative), GetReadQuality(representative));
                readQualities[i] = qSeq.GetQualityScores().Average();

                i++;
            }

            alignmentQuality = alignmentQualities.Length > 0 ? Math.Round(alignmentQualities.Average(), 2) : 0;
            readQuality = readQualities.Length > 0 ? Math.Round(readQualities.Average(), 2) : 0;

            // NOTE(review): a per-list frequency collection used to be built here but was never stored
            // (the frequencyDistributionSequences assignment was commented out), so it has been dropped.
        }
 /* Cannot override, because base is not virtual
 public override IEnumerable<QualitativeSequence> Parse(StreamReader reader)
 {
     FastQFormatType formatType = this.FormatType;
     do
     {
         var seq = ParseOne(reader, formatType);
         if (seq != null)
             yield return ParseHeader(seq);
     }
     while (!reader.EndOfStream);
 }*/
 /// <summary>
 /// Parses the ID of a sequence as an Illumina read identifier and stores the extracted fields in
 /// the sequence's Metadata. The older identifier pattern is tried first, then the newer one.
 /// </summary>
 /// <param name="seq">Sequence whose ID is parsed and whose Metadata is filled in.</param>
 /// <returns>The same sequence instance, with its Metadata populated.</returns>
 /// <exception cref="FileFormatException">The ID matches neither identifier pattern.</exception>
 private QualitativeSequence ParseHeader(QualitativeSequence seq)
 {
     // Header numbers are machine-written, so parse them independently of the current culture.
     var invariant = System.Globalization.CultureInfo.InvariantCulture;

     // Fields are read from Match.Groups: Groups[0] is the whole match and the capturing groups start at 1.
     // Match.Captures holds only the whole-match capture, so indexing it past 0 always threw.
     // NOTE(review): assumes each pattern declares its capturing groups in the field order used below - confirm.
     Match m = pre18Regex.Match(seq.ID);
     if (m.Success)
     {
         seq.Metadata["Instrument"]  = m.Groups[1].Value;
         seq.Metadata["Lane"]        = Int32.Parse(m.Groups[2].Value, invariant);
         seq.Metadata["Tile"]        = Int32.Parse(m.Groups[3].Value, invariant);
         seq.Metadata["X"]           = Int32.Parse(m.Groups[4].Value, invariant);
         seq.Metadata["Y"]           = Int32.Parse(m.Groups[5].Value, invariant);
         seq.Metadata["Index"]       = Int32.Parse(m.Groups[6].Value, invariant);
         seq.Metadata["PairMember"]  = Int32.Parse(m.Groups[7].Value, invariant);
         return seq;
     }

     m = post18Regex.Match(seq.ID);
     if (m.Success)
     {
         seq.Metadata["Instrument"]      = m.Groups[1].Value;
         seq.Metadata["Run"]             = Int32.Parse(m.Groups[2].Value, invariant);
         seq.Metadata["FlowCell"]        = m.Groups[3].Value;
         seq.Metadata["Lane"]            = Int32.Parse(m.Groups[4].Value, invariant);
         seq.Metadata["Tile"]            = Int32.Parse(m.Groups[5].Value, invariant);
         seq.Metadata["X"]               = Int32.Parse(m.Groups[6].Value, invariant);
         seq.Metadata["Y"]               = Int32.Parse(m.Groups[7].Value, invariant);
         seq.Metadata["PairMember"]      = Int32.Parse(m.Groups[8].Value, invariant);
         seq.Metadata["IsFiltered"]      = m.Groups[9].Value == "Y";
         seq.Metadata["ControlBits"]     = Int32.Parse(m.Groups[10].Value, invariant);
         seq.Metadata["IndexSequence"]   = m.Groups[11].Value;
         return seq;
     }

     throw new FileFormatException("Sequence identifier not in Illumina format");
 }