/// <summary>
/// Formats a sequence as FASTA-style text: a header line made of <c>&gt;</c> followed by the
/// sequence id (nothing if the id is null), then the sequence itself broken into lines of
/// at most <c>LINE_WIDTH</c> positions. Every line, including the last, ends with "\n".
/// </summary>
/// <param name="sequence">sequence to format</param>
/// <returns>the formatted text</returns>
public virtual string Format(Sequence sequence)
{
    StringBuilder output = new StringBuilder(">");
    output.Append(sequence.Id ?? "");
    output.Append("\n");

    int residueCount = sequence.Length;
    for (int lineStart = 0; lineStart < residueCount; lineStart += LINE_WIDTH)
    {
        // The last line may be shorter than LINE_WIDTH.
        int lineEnd = lineStart + LINE_WIDTH;
        if (lineEnd > residueCount)
        {
            lineEnd = residueCount;
        }

        // Positions are appended one at a time via Subsequence(position, 1).
        for (int position = lineStart; position < lineEnd; position++)
        {
            output.Append(sequence.Subsequence(position, 1));
        }

        output.Append("\n");
    }

    return output.ToString();
}
/// <summary>
/// Aligns two sequences with the Smith-Waterman algorithm.
/// </summary>
/// <param name="s1">sequence #1</param>
/// <param name="s2">sequence #2</param>
/// <param name="matrix">scoring matrix</param>
/// <param name="o">open gap penalty</param>
/// <param name="e">extend gap penalty</param>
/// <returns>
/// the alignment produced by the traceback, with its names, matrix and gap penalties
/// filled in from the arguments
/// </returns>
/// <seealso cref="Sequence"/>
/// <seealso cref="Matrix"/>
public static Alignment Align(Sequence s1, Sequence s2, Matrix matrix, float o, float e)
{
    float[,] scores = matrix.Scores;
    SmithWatermanGotoh aligner = new SmithWatermanGotoh();

    // The working matrices are stored flattened, row-major:
    // (s1.Length + 1) rows by (s2.Length + 1) columns.
    int rows = s1.Length + 1;
    int columns = s2.Length + 1;
    int cellCount = rows * columns;

    // The first column and the first row of the traceback matrix are STOP.
    byte[] pointers = new byte[cellCount];
    for (int row = 0; row < rows; row++)
    {
        pointers[row * columns] = Directions.STOP;
    }
    for (int column = 1; column < columns; column++)
    {
        pointers[column] = Directions.STOP;
    }

    // Every gap size starts at 1.
    short[] sizesOfVerticalGaps = new short[cellCount];
    short[] sizesOfHorizontalGaps = new short[cellCount];
    for (int index = 0; index < cellCount; index++)
    {
        sizesOfHorizontalGaps[index] = 1;
        sizesOfVerticalGaps[index] = 1;
    }

    Cell start = aligner.Construct(s1, s2, scores, o, e, pointers, sizesOfVerticalGaps, sizesOfHorizontalGaps);
    Alignment result = aligner.Traceback(s1, s2, matrix, pointers, start, sizesOfVerticalGaps, sizesOfHorizontalGaps);

    result.Name1 = s1.Id;
    result.Name2 = s2.Id;
    result.Matrix = matrix;
    result.Open = o;
    result.Extend = e;
    return result;
}
/// <summary>
/// Rebuilds the alignment of two sequences by following the traceback pointers from the
/// given cell until a <c>Directions.STOP</c> entry is reached.
/// </summary>
/// <param name="s1">sequence #1</param>
/// <param name="s2">sequence #2</param>
/// <param name="m">scoring matrix</param>
/// <param name="pointers">
/// traceback directions, flattened so that the entry for (row, column) is at
/// <c>row * (s2.Length + 1) + column</c>
/// </param>
/// <param name="cell">The cell where the traceback starts.</param>
/// <param name="sizesOfVerticalGaps">gap lengths consumed by UP moves; same layout as <paramref name="pointers"/></param>
/// <param name="sizesOfHorizontalGaps">gap lengths consumed by LEFT moves; same layout as <paramref name="pointers"/></param>
/// <returns>
/// <see cref="Alignment"/> holding the two aligned sequences, the markup line, the start
/// positions, the score taken from <paramref name="cell"/> and the identity / similarity /
/// gap counts.
/// </returns>
/// <seealso cref="Cell"/>
/// <seealso cref="Alignment"/>
private Alignment Traceback(Sequence s1, Sequence s2, Matrix m, byte[] pointers, Cell cell, short[] sizesOfVerticalGaps, short[] sizesOfHorizontalGaps)
{
    char[] residues1 = s1.ToArray();
    char[] residues2 = s2.ToArray();
    float[,] scores = m.Scores;
    int stride = s2.Length + 1; // number of columns in the flattened matrices

    Alignment alignment = new Alignment();
    alignment.Score = cell.Score;

    // The traceback emits characters back to front; an alignment can never be longer
    // than both sequences together.
    int capacity = s1.Length + s2.Length;
    char[] reversedFirst = new char[capacity];
    char[] reversedSecond = new char[capacity];
    char[] reversedMarkup = new char[capacity];
    int length = 0; // all three buffers always grow together

    int identity = 0;   // count of identical pairs
    int similarity = 0; // count of similar pairs (identical pairs included)
    int gaps = 0;       // count of gap positions

    int row = cell.Row;
    int column = cell.Column;
    int rowOffset = row * stride;

    while (true)
    {
        byte direction = pointers[rowOffset + column];
        if (direction == Directions.STOP)
        {
            break;
        }

        if (direction == Directions.UP)
        {
            // Consume the whole vertical gap: residues of sequence #1 against gaps.
            int gapLength = sizesOfVerticalGaps[rowOffset + column];
            for (int step = 0; step < gapLength; step++)
            {
                row--;
                reversedFirst[length] = residues1[row];
                reversedSecond[length] = Alignment.GAP;
                reversedMarkup[length] = Markups.GAP;
                length++;
                rowOffset -= stride;
                gaps++;
            }
        }
        else if (direction == Directions.DIAGONAL)
        {
            row--;
            column--;
            rowOffset -= stride;
            char first = residues1[row];
            char second = residues2[column];
            reversedFirst[length] = first;
            reversedSecond[length] = second;

            char markup;
            if (first == second)
            {
                markup = Markups.IDENTITY;
                identity++;
                similarity++;
            }
            else if (scores[first, second] > 0)
            {
                markup = Markups.SIMILARITY;
                similarity++;
            }
            else
            {
                markup = Markups.MISMATCH;
            }
            reversedMarkup[length] = markup;
            length++;
        }
        else if (direction == Directions.LEFT)
        {
            // Consume the whole horizontal gap: gaps against residues of sequence #2.
            int gapLength = sizesOfHorizontalGaps[rowOffset + column];
            for (int step = 0; step < gapLength; step++)
            {
                column--;
                reversedFirst[length] = Alignment.GAP;
                reversedSecond[length] = residues2[column];
                reversedMarkup[length] = Markups.GAP;
                length++;
                gaps++;
            }
        }
    }

    alignment.Sequence1 = Reverse(reversedFirst, length);
    alignment.Start1 = row;
    alignment.Sequence2 = Reverse(reversedSecond, length);
    alignment.Start2 = column;
    alignment.MarkupLine = Reverse(reversedMarkup, length);
    alignment.Identity = identity;
    alignment.Gaps = gaps;
    alignment.Similarity = similarity;
    return alignment;
}
/// <summary>
/// Fills the traceback direction matrix and the gap-size matrices for two sequences and
/// finds the cell from which the traceback should start.
/// </summary>
/// <param name="s1">sequence #1</param>
/// <param name="s2">sequence #2</param>
/// <param name="matrix">scoring matrix, indexed by the character pair being compared</param>
/// <param name="o">open gap penalty</param>
/// <param name="e">extend gap penalty</param>
/// <param name="pointers">
/// receives one direction per cell; the entry for (row, column) is at
/// <c>row * (s2.Length + 1) + column</c>. Row 0 and column 0 are not written here.
/// </param>
/// <param name="sizesOfVerticalGaps">updated in place whenever a vertical gap is extended; same layout as <paramref name="pointers"/></param>
/// <param name="sizesOfHorizontalGaps">updated in place whenever a horizontal gap is extended; same layout as <paramref name="pointers"/></param>
/// <returns>
/// The cell where the traceback starts: the first cell, in row-major order, whose score
/// is strictly greater than every score seen before it (starting from the score of a
/// freshly constructed <see cref="Cell"/>).
/// </returns>
private Cell Construct(Sequence s1, Sequence s2, float[,] matrix, float o, float e, byte[] pointers, short[] sizesOfVerticalGaps, short[] sizesOfHorizontalGaps)
{
    char[] residues1 = s1.ToArray();
    char[] residues2 = s2.ToArray();
    int rows = s1.Length + 1;
    int columns = s2.Length + 1;

    // Only one row of each score table is kept: on entry to a cell the arrays still hold
    // the previous row's value for the current column, and are then overwritten.
    float[] verticalGap = new float[columns]; // score when the s1 residue is aligned to a gap
    float[] best = new float[columns];        // best local score ending at the cell
    for (int column = 0; column < columns; column++)
    {
        verticalGap[column] = float.NegativeInfinity;
        best[column] = 0;
    }

    Cell cell = new Cell();
    for (int row = 1; row < rows; row++)
    {
        int rowOffset = row * columns;
        float horizontalGap = float.NegativeInfinity; // score when the s2 residue is aligned to a gap
        float bestDiagonal = best[0];                 // best score of the upper-left neighbour

        for (int column = 1; column < columns; column++)
        {
            int index = rowOffset + column;

            // Match / mismatch: extend the upper-left neighbour.
            float diagonal = bestDiagonal + matrix[residues1[row - 1], residues2[column - 1]];

            // Vertical gap: extend the gap from the cell above, or open a new one there.
            float extendVertical = verticalGap[column] - e;
            float openVertical = best[column] - o;
            if (extendVertical > openVertical)
            {
                verticalGap[column] = extendVertical;
                sizesOfVerticalGaps[index] = (short) (sizesOfVerticalGaps[index - columns] + 1);
            }
            else
            {
                verticalGap[column] = openVertical;
            }

            // Horizontal gap: extend the gap from the cell to the left, or open a new one there.
            float extendHorizontal = horizontalGap - e;
            float openHorizontal = best[column - 1] - o;
            if (extendHorizontal > openHorizontal)
            {
                horizontalGap = extendHorizontal;
                sizesOfHorizontalGaps[index] = (short) (sizesOfHorizontalGaps[index - 1] + 1);
            }
            else
            {
                horizontalGap = openHorizontal;
            }

            // Remember the previous row's value before it is overwritten; it is the
            // upper-left neighbour of the next column.
            bestDiagonal = best[column];
            best[column] = Max(diagonal, verticalGap[column], horizontalGap, 0);

            // Traceback direction; ties are resolved as STOP, DIAGONAL, UP, then LEFT.
            float current = best[column];
            if (current == 0)
            {
                pointers[index] = Directions.STOP;
            }
            else if (current == diagonal)
            {
                pointers[index] = Directions.DIAGONAL;
            }
            else if (current == verticalGap[column])
            {
                pointers[index] = Directions.UP;
            }
            else
            {
                pointers[index] = Directions.LEFT;
            }

            // Keep the highest-scoring cell as the traceback start.
            if (current > cell.Score)
            {
                cell.Set(row, column, current);
            }
        }
    }

    return cell;
}
/// <summary>
/// Returns a <see cref="Sequence"/> parsed and loaded from a file.
/// </summary>
/// <remarks>
/// If the first line starts with <c>&gt;</c> it is treated as a FASTA header: the text up
/// to the first space or tab is the sequence name and the rest is the description.
/// Otherwise the first line is part of the sequence. All remaining lines are sequence
/// data. An empty file yields a sequence with empty content and no name or description.
/// </remarks>
/// <param name="file">file to parse</param>
/// <returns>parsed sequence</returns>
public static Sequence Parse(FileInfo file)
{
    string sequenceName = null;
    string sequenceDescription = null;
    StringBuilder buffer = new StringBuilder();

    // The using block closes the reader even if reading or validation throws.
    using (StreamReader reader = new StreamReader(file.FullName))
    {
        // Read and parse the first line; ReadLine returns null for an empty file.
        string line = reader.ReadLine();
        if (line != null)
        {
            if (line.StartsWith(">", System.StringComparison.Ordinal))
            {
                // FASTA header: "<name>[<space or tab><description>]"
                line = line.Substring(1).Trim();
                int index = line.IndexOfAny(new char[] { ' ', '\t' });
                if (index < 0)
                {
                    index = line.Length; // no separator: the whole header is the name
                }
                sequenceName = line.Substring(0, index);
                sequenceDescription = index + 1 > line.Length ? "" : line.Substring(index + 1);
            }
            else
            {
                // Plain sequence: the first line is already sequence data.
                buffer.Append(PrepareAndValidate(line));
            }

            // Read the rest of the file (the actual sequence).
            while ((line = reader.ReadLine()) != null)
            {
                buffer.Append(PrepareAndValidate(line));
            }
        }
    }

    return new Sequence(buffer.ToString(), sequenceName, sequenceDescription, SequenceType.Protein);
}
/// <summary>
/// Returns a parsed <see cref="Sequence"/> from a FASTA or plain sequence string.
/// </summary>
/// <remarks>
/// If the string starts with <c>&gt;</c>, its first line is treated as a FASTA header:
/// after trimming surrounding white space (as the file-based overload does), the text up
/// to the first space or tab is the sequence name and the rest is the description.
/// Everything after the header line is the sequence. A string that does not start with
/// <c>&gt;</c> is treated as sequence data in its entirety.
/// </remarks>
/// <param name="stringToParse">FASTA string to parse</param>
/// <returns>parsed sequence</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="stringToParse"/> is null.</exception>
/// <exception cref="System.FormatException">The string has a FASTA header but no line break after it.</exception>
public static Sequence Parse(string stringToParse)
{
    if (stringToParse == null)
    {
        throw new System.ArgumentNullException("stringToParse");
    }

    stringToParse = stringToParse.Replace("\r\n", "\n");
    string sequenceName = null;
    string sequenceDescription = null;

    // Ordinal / char-based searches: '>' and '\n' are structural markers, not linguistic text.
    if (stringToParse.StartsWith(">", System.StringComparison.Ordinal))
    {
        // FASTA format
        int headerEnd = stringToParse.IndexOf('\n');
        if (headerEnd == -1)
        {
            throw new System.FormatException("Invalid sequence");
        }

        string first = stringToParse.Substring(1, headerEnd - 1).Trim();
        stringToParse = stringToParse.Substring(headerEnd);

        int index = first.IndexOfAny(new char[] { ' ', '\t' });
        if (index < 0)
        {
            index = first.Length; // no separator: the whole header is the name
        }
        sequenceName = first.Substring(0, index);
        sequenceDescription = index + 1 > first.Length ? "" : first.Substring(index + 1);
    }

    // Plain format needs no header handling.
    return new Sequence(PrepareAndValidate(stringToParse), sequenceName, sequenceDescription, SequenceType.Protein);
}