예제 #1
0
 /// <summary> Formats a sequence as FASTA text: a '>' header line carrying the sequence id
 /// (empty when the id is null), followed by the residues wrapped at LINE_WIDTH characters
 /// per line. Every line, including the last one, is terminated by "\n".</summary>
 /// <param name="sequence">sequence to format </param>
 /// <returns> the FASTA representation of <paramref name="sequence"/> </returns>
 /// <exception cref="System.ArgumentNullException"><paramref name="sequence"/> is null.</exception>
 public virtual string Format(Sequence sequence)
 {
     if (sequence == null)
     {
         throw new System.ArgumentNullException("sequence");
     }

     StringBuilder buffer = new StringBuilder(">");
     buffer.Append(sequence.Id ?? "");
     buffer.Append("\n");

     // Emit each output line with a single Subsequence(start, length) call instead of
     // one call (and one temporary) per residue; the text produced is the same.
     int total = sequence.Length;
     for (int start = 0; start < total; start += LINE_WIDTH)
     {
         int remaining = total - start;
         int count = remaining < LINE_WIDTH ? remaining : LINE_WIDTH;
         buffer.Append(sequence.Subsequence(start, count));
         buffer.Append("\n");
     }
     return buffer.ToString();
 }
        /// <summary> Aligns two sequences by Smith-Waterman algorithm, using separate
        /// penalties for opening and extending a gap.</summary>
        /// <param name="s1">sequence #1 </param>
        /// <param name="s2">sequence #2 </param>
        /// <param name="matrix">scoring matrix </param>
        /// <param name="o">open gap penalty </param>
        /// <param name="e">extend gap penalty </param>
        /// <returns> alignment object contains the two aligned sequences, 
        /// the alignment score and alignment statistics</returns>
        /// <exception cref="System.OverflowException">the traceback matrix,
        /// (s1.Length + 1) * (s2.Length + 1) cells, does not fit in an int index.</exception>
        /// <seealso cref="Sequence"/>
        /// <seealso cref="Matrix"/>
        public static Alignment Align(Sequence s1, Sequence s2, Matrix matrix, float o, float e)
        {
            float[,] scores = matrix.Scores;

            SmithWatermanGotoh sw = new SmithWatermanGotoh();

            int m = s1.Length + 1;
            int n = s2.Length + 1;

            // checked: an unchecked product can wrap around for long sequences and
            // silently allocate arrays that are too small for the indices used later.
            int cells = checked(m * n);

            byte[] pointers = new byte[cells];

            // Initializes the boundaries of the traceback matrix to STOP:
            // first column (every row start) ...
            for (int k = 0; k < cells; k += n)
            {
                pointers[k] = Directions.STOP;
            }
            // ... and the rest of the first row.
            for (int j = 1; j < n; j++)
            {
                pointers[j] = Directions.STOP;
            }

            // Every gap starts with length 1; Construct increases entries when a gap is extended.
            short[] sizesOfVerticalGaps = new short[cells];
            short[] sizesOfHorizontalGaps = new short[cells];
            for (int k = 0; k < cells; k++)
            {
                sizesOfVerticalGaps[k] = 1;
                sizesOfHorizontalGaps[k] = 1;
            }

            Cell cell = sw.Construct(s1, s2, scores, o, e, pointers,
                sizesOfVerticalGaps, sizesOfHorizontalGaps);

            Alignment alignment = sw.Traceback(s1, s2, matrix, pointers, cell,
                sizesOfVerticalGaps, sizesOfHorizontalGaps);

            alignment.Name1 = s1.Id;
            alignment.Name2 = s2.Id;
            alignment.Matrix = matrix;
            alignment.Open = o;
            alignment.Extend = e;

            return alignment;
        }
        /// <summary> Rebuilds the alignment of two sequences by following the traceback
        /// pointers backwards from <paramref name="cell"/> until a STOP direction is read.</summary>
        /// <param name="s1">sequence #1 </param>
        /// <param name="s2">sequence #2 </param>
        /// <param name="m">scoring matrix </param>
        /// <param name="pointers">traceback directions, indexed as row * (s2.Length + 1) + column </param>
        /// <param name="cell">The cell where the traceback starts. </param>
        /// <param name="sizesOfVerticalGaps">number of residues consumed by an UP step, indexed like <paramref name="pointers"/> </param>
        /// <param name="sizesOfHorizontalGaps">number of residues consumed by a LEFT step, indexed like <paramref name="pointers"/> </param>
        /// <returns> <see cref="Alignment"/> with the two aligned sequences, the markup line,
        /// the start offsets, the alignment score and the identity/similarity/gap counts. </returns>
        /// <seealso cref="Cell"/>
        /// <seealso cref="Alignment"/>
        private Alignment Traceback(Sequence s1, Sequence s2, Matrix m,
            byte[] pointers, Cell cell, short[] sizesOfVerticalGaps, short[] sizesOfHorizontalGaps)
        {
            char[] residues1 = s1.ToArray();
            char[] residues2 = s2.ToArray();
            float[,] substitutionScores = m.Scores;

            int stride = s2.Length + 1; // width of one row in the flattened matrices

            Alignment result = new Alignment();
            result.Score = cell.Score;

            // The alignment is collected back to front; it can never be longer than
            // both sequences laid end to end.
            int capacity = s1.Length + s2.Length;
            char[] line1 = new char[capacity];  // aligned sequence #1, reversed
            char[] line2 = new char[capacity];  // aligned sequence #2, reversed
            char[] markup = new char[capacity]; // markup line, reversed
            int written = 0;                    // the three lines always grow together

            int identical = 0;
            int similar = 0;
            int gapCount = 0;

            int row = cell.Row;
            int column = cell.Column;
            int rowOffset = row * stride;

            while (true)
            {
                byte direction = pointers[rowOffset + column];

                if (direction == Directions.STOP)
                {
                    break;
                }

                if (direction == Directions.UP)
                {
                    // Residues of sequence #1 aligned against a gap in sequence #2.
                    int gapLength = sizesOfVerticalGaps[rowOffset + column];
                    for (int remaining = gapLength; remaining > 0; remaining--)
                    {
                        row--;
                        line1[written] = residues1[row];
                        line2[written] = Alignment.GAP;
                        markup[written] = Markups.GAP;
                        written++;
                        rowOffset -= stride;
                        gapCount++;
                    }
                }
                else if (direction == Directions.DIAGONAL)
                {
                    // One residue of each sequence aligned against the other.
                    row--;
                    column--;
                    rowOffset -= stride;

                    char x = residues1[row];
                    char y = residues2[column];
                    line1[written] = x;
                    line2[written] = y;

                    char symbol;
                    if (x == y)
                    {
                        symbol = Markups.IDENTITY;
                        identical++;
                        similar++;
                    }
                    else if (substitutionScores[x, y] > 0)
                    {
                        symbol = Markups.SIMILARITY;
                        similar++;
                    }
                    else
                    {
                        symbol = Markups.MISMATCH;
                    }
                    markup[written] = symbol;
                    written++;
                }
                else if (direction == Directions.LEFT)
                {
                    // Residues of sequence #2 aligned against a gap in sequence #1.
                    int gapLength = sizesOfHorizontalGaps[rowOffset + column];
                    for (int remaining = gapLength; remaining > 0; remaining--)
                    {
                        column--;
                        line1[written] = Alignment.GAP;
                        line2[written] = residues2[column];
                        markup[written] = Markups.GAP;
                        written++;
                        gapCount++;
                    }
                }
            }

            result.Sequence1 = Reverse(line1, written);
            result.Start1 = row;
            result.Sequence2 = Reverse(line2, written);
            result.Start2 = column;
            result.MarkupLine = Reverse(markup, written);
            result.Identity = identical;
            result.Gaps = gapCount;
            result.Similarity = similar;

            return result;
        }
        /// <summary> Constructs directions matrix for the traceback. Scores are computed row by
        /// row; only the current row of best scores and vertical-gap scores is kept in memory,
        /// while the direction and gap-length tables passed in are filled for every cell
        /// with row and column of at least 1. </summary>
        /// <param name="s1">sequence #1 (one matrix row per residue) </param>
        /// <param name="s2">sequence #2 (one matrix column per residue) </param>
        /// <param name="matrix">scoring matrix, indexed by the two residue characters </param>
        /// <param name="o">open gap penalty </param>
        /// <param name="e">extend gap penalty </param>
        /// <param name="pointers">receives the traceback direction of each cell, indexed as
        /// row * (s2.Length + 1) + column; row 0 and column 0 are not written here </param>
        /// <param name="sizesOfVerticalGaps">same indexing; an entry becomes the entry one row up
        /// plus one whenever extending a vertical gap scores better than opening one </param>
        /// <param name="sizesOfHorizontalGaps">same indexing; an entry becomes the entry one column
        /// to the left plus one whenever extending a horizontal gap scores better than opening one </param>
        /// <returns> The cell where the traceback starts. </returns>
        private Cell Construct(Sequence s1, Sequence s2, float[,] matrix, float o,
			float e, byte[] pointers, short[] sizesOfVerticalGaps, short[] sizesOfHorizontalGaps)
        {
            char[] a1 = s1.ToArray();
            char[] a2 = s2.ToArray();

            int m = s1.Length + 1;
            int n = s2.Length + 1;

            float f; // score of alignment x1...xi to y1...yj if xi aligns to yj
            float[] g = new float[n]; // score if xi aligns to a gap after yj
            float h; // score if yj aligns to a gap after xi
            float[] v = new float[n]; // best score of alignment x1...xi to y1...yj
            float vDiagonal; // best score of the cell one row up and one column left

            // Row 0: no gap can end here, and every best score is 0.
            g[0] = float.NegativeInfinity;
            h = float.NegativeInfinity;
            v[0] = 0;

            for (int j = 1; j < n; j++)
            {
                g[j] = float.NegativeInfinity;
                v[j] = 0;
            }

            float similarityScore, g1, g2, h1, h2;

            // NOTE(review): the initial Score of a new Cell is defined elsewhere; the
            // comparison further down presumably starts from 0 - confirm.
            Cell cell = new Cell();

            // k is the flattened index of column 0 of row i, l the flattened index of cell (i, j).
            for (int i = 1, k = n; i < m; i++, k += n)
            {
                // A horizontal gap cannot continue across a row boundary.
                h = float.NegativeInfinity;
                // v[0] is never written inside the loops, so this is the column-0 score (0).
                vDiagonal = v[0];
                for (int j = 1, l = k + 1; j < n; j++, l++)
                {
                    similarityScore = matrix[a1[i - 1], a2[j - 1]];

                    // Fill the matrices
                    f = vDiagonal + similarityScore;

                    // At this point g[j] and v[j] still hold the values of the previous row:
                    // either extend the vertical gap ending one row up, or open a new one.
                    g1 = g[j] - e;
                    g2 = v[j] - o;
                    if (g1 > g2)
                    {
                        g[j] = g1;
                        sizesOfVerticalGaps[l] = (short) (sizesOfVerticalGaps[l - n] + 1);
                    }
                    else
                    {
                        g[j] = g2;
                    }

                    // h and v[j - 1] already belong to the current row (column j - 1):
                    // either extend the horizontal gap ending one column left, or open a new one.
                    h1 = h - e;
                    h2 = v[j - 1] - o;
                    if (h1 > h2)
                    {
                        h = h1;
                        sizesOfHorizontalGaps[l] = (short) (sizesOfHorizontalGaps[l - 1] + 1);
                    }
                    else
                    {
                        h = h2;
                    }

                    // Save the previous-row score before it is overwritten; it is the
                    // diagonal neighbour of the next column.
                    vDiagonal = v[j];
                    v[j] = Max(f, g[j], h, 0);

                    // Determine the traceback direction
                    // Ties are resolved in the order STOP, DIAGONAL, UP, LEFT.
                    // NOTE(review): the exact float comparisons assume Max (defined elsewhere)
                    // returns one of its arguments unchanged - confirm.
                    if (v[j] == 0)
                    {
                        pointers[l] = Directions.STOP;
                    }
                    else if (v[j] == f)
                    {
                        pointers[l] = Directions.DIAGONAL;
                    }
                    else if (v[j] == g[j])
                    {
                        pointers[l] = Directions.UP;
                    }
                    else
                    {
                        pointers[l] = Directions.LEFT;
                    }

                    // Set the traceback start at the current cell i, j and score
                    // (strict comparison: among equal scores the first cell visited is kept)
                    if (v[j] > cell.Score)
                    {
                        cell.Set(i, j, v[j]);
                    }
                }
            }

            return cell;
        }
예제 #5
0
        /// <summary> Returns a <see cref="Sequence"/> parsed and loaded from a file.
        /// If the first line starts with '>' it is treated as a FASTA header: the text up to the
        /// first space or tab becomes the sequence name and the rest the description.
        /// Otherwise the first line is already part of the sequence. All remaining lines are
        /// passed through PrepareAndValidate and concatenated.</summary>
        /// <param name="file">to parse </param>
        /// <returns> parsed sequence  </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="file"/> is null.</exception>
        public static Sequence Parse(FileInfo file)
        {
            if (file == null)
            {
                throw new System.ArgumentNullException("file");
            }

            string sequenceName = null;
            string sequenceDescription = null;
            StringBuilder buffer = new StringBuilder();

            // using: the reader is closed even when validation of a line throws.
            using (StreamReader reader = new StreamReader(file.FullName))
            {
                // ReadLine returns null for an empty file; handle it like a file whose
                // first line is blank instead of failing with a NullReferenceException.
                string line = reader.ReadLine() ?? "";

                // Ordinal check of the first character (StartsWith(string) is culture-sensitive).
                if (line.Length > 0 && line[0] == '>')
                {
                    // FASTA sequence: "name<space or tab>description"
                    line = line.Substring(1).Trim();
                    int separator = line.IndexOfAny(new char[] { ' ', '\t' });
                    if (separator < 0)
                    {
                        sequenceName = line;
                        sequenceDescription = "";
                    }
                    else
                    {
                        sequenceName = line.Substring(0, separator);
                        sequenceDescription = line.Substring(separator + 1);
                    }
                }
                else
                {
                    // Plain sequence
                    buffer.Append(PrepareAndValidate(line));
                }

                // Read the remainder of the file (the actual sequence)
                while ((line = reader.ReadLine()) != null)
                {
                    buffer.Append(PrepareAndValidate(line));
                }
            }

            return new Sequence(buffer.ToString(), sequenceName, sequenceDescription, SequenceType.Protein);
        }
예제 #6
0
        /// <summary> Returns a parsed Sequence from a FASTA string. If the string starts with
        /// '>' its first line is the header: the text up to the first space or tab becomes the
        /// sequence name and the rest the description. Otherwise the whole string is treated as
        /// a plain sequence. The sequence text is passed through PrepareAndValidate.</summary>
        /// <param name="stringToParse">FASTA string to parse</param>
        /// <returns> parsed sequence </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="stringToParse"/> is null.</exception>
        /// <exception cref="System.FormatException">the string starts with a header
        /// but contains no line break after it.</exception>
        public static Sequence Parse(string stringToParse)
        {
            if (stringToParse == null)
            {
                throw new System.ArgumentNullException("stringToParse");
            }

            stringToParse = stringToParse.Replace("\r\n", "\n");

            string sequenceName = null;
            string sequenceDescription = null;

            // Character-based checks are ordinal; the string overloads of
            // StartsWith/IndexOf compare using the current culture.
            if (stringToParse.Length > 0 && stringToParse[0] == '>')
            {
                // FASTA format
                int index = stringToParse.IndexOf('\n');

                if (index == -1)
                {
                    // FormatException derives from Exception, so existing catch blocks still apply.
                    throw new System.FormatException("Invalid sequence");
                }

                // Header without the leading '>'; trimmed like the header read by Parse(FileInfo),
                // so "> name description" does not produce an empty name.
                string first = stringToParse.Substring(1, index - 1).Trim();
                stringToParse = stringToParse.Substring(index);

                int separator = first.IndexOfAny(new char[] { ' ', '\t' });
                if (separator < 0)
                {
                    sequenceName = first;
                    sequenceDescription = "";
                }
                else
                {
                    sequenceName = first.Substring(0, separator);
                    sequenceDescription = first.Substring(separator + 1);
                }
            }

            return new Sequence(PrepareAndValidate(stringToParse), sequenceName, sequenceDescription, SequenceType.Protein);
        }