/// <summary>
/// Combine two profiles using the alignment arrays produced by the dynamic programming algorithm.
/// The dynamic programming algorithm returns two arrays containing the alignment operations
/// on the two profiles. Each array entry is either a column index into the corresponding
/// original profile or <paramref name="gapCode"/>. This method applies those operations to
/// the two original profiles and combines them into a new aligned profile whose entries are
/// (columnA * numberOfSequencesA * weights[0] + columnB * numberOfSequencesB * weights[1])
/// / (numberOfSequencesA + numberOfSequencesB), where a gapped side contributes a gap-only column.
/// </summary>
/// <param name="profileA">first profile</param>
/// <param name="profileB">second profile</param>
/// <param name="numberOfSequencesA">the number of sequences in the first profile</param>
/// <param name="numberOfSequencesB">the number of sequences in the second profile</param>
/// <param name="aAligned">aligned integer array generated by dynamic programming</param>
/// <param name="bAligned">aligned integer array generated by dynamic programming</param>
/// <param name="gapCode">the gap integer code defined in dynamic programming class</param>
/// <param name="weights">
/// the weights of the two profiles (index 0 for A, index 1 for B);
/// the array is passed to MsaUtils.Normalize before it is used
/// </param>
/// <returns>a new profile with one row per aligned position</returns>
/// <exception cref="ArgumentException">the two alignment arrays differ in length</exception>
/// <exception cref="Exception">both alignment arrays hold a gap at the same position</exception>
public static IProfiles GenerateProfiles(
    IProfiles profileA,
    IProfiles profileB,
    int numberOfSequencesA,
    int numberOfSequencesB,
    int[] aAligned,
    int[] bAligned,
    int gapCode,
    float[] weights)
{
    if (aAligned.Length != bAligned.Length)
    {
        throw new ArgumentException("not aligned sequences");
    }

    IProfiles profiles = new Profiles(aAligned.Length, profileA.ColumnSize);

    MsaUtils.Normalize(weights);

    // a profile with gap only: all the mass sits in the last entry
    float[] gapProfile = new float[profiles.ColumnSize];
    gapProfile[gapProfile.Length - 1] = 1;

    for (int i = 0; i < aAligned.Length; ++i)
    {
        if (aAligned[i] == gapCode && bAligned[i] == gapCode)
        {
            throw new Exception("Both positions are gap between two sets of sequences");
        }

        if (aAligned[i] == gapCode)
        {
            // Side A is gapped: the gap column stands in for A.
            for (int j = 0; j < profiles.ColumnSize; ++j)
            {
                profiles[i][j] = ((gapProfile[j] * numberOfSequencesA * weights[0])
                                  + (profileB[bAligned[i]][j] * numberOfSequencesB * weights[1]))
                                 / (numberOfSequencesA + numberOfSequencesB);
            }
        }
        else if (bAligned[i] == gapCode)
        {
            // Side B is gapped: A keeps its own sequence count and weight,
            // and the gap column stands in for B.
            for (int j = 0; j < profiles.ColumnSize; ++j)
            {
                profiles[i][j] = ((profileA[aAligned[i]][j] * numberOfSequencesA * weights[0])
                                  + (gapProfile[j] * numberOfSequencesB * weights[1]))
                                 / (numberOfSequencesA + numberOfSequencesB);
            }
        }
        else
        {
            for (int j = 0; j < profiles.ColumnSize; ++j)
            {
                profiles[i][j] = ((profileA[aAligned[i]][j] * numberOfSequencesA * weights[0])
                                  + (profileB[bAligned[i]][j] * numberOfSequencesB * weights[1]))
                                 / (numberOfSequencesA + numberOfSequencesB);
            }
        }
    }

    return profiles;
}
/// <summary>
/// Merge two profiles of identical dimensions into a single profile.
/// Every entry of the result is the average of the two input entries, weighted by
/// the number of sequences behind each profile:
/// (frequencyA * numberOfSequenceA + frequencyB * numberOfSequenceB) / (numberOfSequenceA + numberOfSequenceB)
/// </summary>
/// <param name="profileA">first profile alignment</param>
/// <param name="profileB">second profile alignment</param>
/// <param name="numberOfSequencesA">the number of sequences in the first profile</param>
/// <param name="numberOfSequencesB">the number of sequences in the second profile</param>
/// <returns>a new profile, constructed from profileA, holding the combined frequencies</returns>
/// <exception cref="Exception">the two profiles differ in row or column size</exception>
public static IProfiles GenerateProfiles(IProfiles profileA, IProfiles profileB, int numberOfSequencesA, int numberOfSequencesB)
{
    // Both dimensions have to agree before the profiles can be merged entry by entry.
    if (profileA.RowSize != profileB.RowSize)
    {
        throw new Exception("different profiles sizes");
    }

    if (profileA.ColumnSize != profileB.ColumnSize)
    {
        throw new Exception("different profiles sizes");
    }

    IProfiles combined = new Profiles(profileA);

    for (int row = 0; row < combined.RowSize; ++row)
    {
        for (int column = 0; column < combined.ColumnSize; ++column)
        {
            combined[row][column] =
                (profileA[row][column] * numberOfSequencesA + profileB[row][column] * numberOfSequencesB)
                / (numberOfSequencesA + numberOfSequencesB);
        }
    }

    return combined;
}
/// <summary>
/// Generate IProfiles from a subset of aligned sequences.
/// In the subset of sequences, those columns containing no residues,
/// i.e. indels only, are discarded and their positions are reported
/// through <paramref name="allIndelPositions"/>.
/// </summary>
/// <param name="sequences">a set of aligned sequences</param>
/// <param name="sequenceIndices">the subset indices of the aligned sequences</param>
/// <param name="allIndelPositions">the list of all-indel positions that have been removed when constructing</param>
/// <param name="weights">
/// sequence weights, indexed by the same indices as <paramref name="sequences"/>;
/// the array is passed to MsaUtils.Normalize before it is used
/// </param>
/// <returns>a profile with one row per retained column</returns>
/// <exception cref="ArgumentException">
/// the sequence list is empty, there are more indices than sequences,
/// or the selected sequences differ in length or alphabet
/// </exception>
/// <exception cref="Exception">an index was out of range; the original exception is attached as the inner exception</exception>
public static IProfiles GenerateProfiles(List<ISequence> sequences, List<int> sequenceIndices, out List<int> allIndelPositions, float[] weights)
{
    IProfiles profiles;

    if (sequences.Count <= 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    if (sequenceIndices.Count > sequences.Count)
    {
        throw new ArgumentException("Invalid subset indices");
    }

    MsaUtils.Normalize(weights);

    try
    {
        int sequenceLength = (int)sequences[sequenceIndices[0]].Count;
        IAlphabet alphabet = sequences[sequenceIndices[0]].Alphabet;

        // Every selected sequence must share the length and alphabet of the first one.
        foreach (int i in sequenceIndices)
        {
            if (sequences[i].Count != sequenceLength)
            {
                throw new ArgumentException("Input sequences are not aligned");
            }

            if (sequences[i].Alphabet != alphabet)
            {
                throw new ArgumentException("Input sequences use different alphabets");
            }
        }

        allIndelPositions = new List<int>();
        profiles = new Profiles();

        // NOTE(review): presumably ItemSet holds two entries per profile slot
        // (e.g. upper and lower case) plus the gap - confirm against ItemSet's definition.
        int colSize = (ItemSet.Count + 1) / 2;

        // Discard all indels columns.
        for (int col = 0; col < sequenceLength; ++col)
        {
            float[] vector = new float[colSize];
            bool isAllIndels = true;

            foreach (int i in sequenceIndices)
            {
                ISequence sequence = sequences[i];
                var symbol = sequence[col];

                if (!sequence.Alphabet.CheckIsGap(symbol))
                {
                    isAllIndels = false;
                }

                if (sequence.Alphabet.CheckIsAmbiguous(symbol))
                {
                    // An ambiguous symbol adds the sequence weight to every symbol it maps to.
                    var mappedSymbols = AmbiguousCharactersMap[symbol];
                    for (int b = 0; b < mappedSymbols.Count; ++b)
                    {
                        vector[ItemSet[mappedSymbols[b]]] += weights[i];
                    }
                }
                else
                {
                    vector[ItemSet[symbol]] += weights[i];
                }
            }

            if (!isAllIndels)
            {
                MsaUtils.Normalize(vector);
                profiles.ProfilesMatrix.Add(vector);
            }
            else
            {
                allIndelPositions.Add(col);
            }
        }

        profiles.ColumnSize = colSize;
        profiles.RowSize = profiles.ProfilesMatrix.Count;
    }
    catch (IndexOutOfRangeException ex)
    {
        // Attach the caught exception itself so the failing index and stack trace are kept.
        throw new Exception("Invalid index", ex);
    }

    return profiles;
}
/// <summary>
/// Generate IProfiles from a set of aligned sequences.
/// Each row of the result corresponds to one alignment column; every sequence adds
/// its weight to the entry of the symbol it holds in that column (or to every symbol
/// an ambiguous symbol maps to), and each row is then passed to MsaUtils.Normalize.
/// </summary>
/// <param name="sequences">a set of aligned sequences</param>
/// <param name="weights">
/// sequence weights, one per sequence in enumeration order;
/// the array is passed to MsaUtils.Normalize before it is used
/// </param>
/// <returns>a profile with one row per alignment column</returns>
/// <exception cref="ArgumentException">
/// the number of weights differs from the number of sequences, the collection is empty,
/// or the sequences differ in length or alphabet
/// </exception>
public static IProfiles GenerateProfiles(ICollection<ISequence> sequences, float[] weights)
{
    if (sequences.Count != weights.Length)
    {
        throw new ArgumentException("Invalid inputs");
    }

    if (sequences.Count == 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    MsaUtils.Normalize(weights);

    // The first sequence defines the expected length and alphabet for all others.
    int sequenceLength = 0;
    IAlphabet alphabet = null;
    bool isFirst = true;
    foreach (ISequence sequence in sequences)
    {
        if (isFirst)
        {
            sequenceLength = (int)sequence.Count;
            alphabet = sequence.Alphabet;
            isFirst = false;
            continue;
        }

        if (sequence.Count != sequenceLength)
        {
            throw new ArgumentException("Input sequences are not aligned");
        }

        if (sequence.Alphabet != alphabet)
        {
            throw new ArgumentException("Input sequences use different alphabets");
        }
    }

    // each row is a column; each column is a profile
    int colSize = (ItemSet.Count + 1) / 2;
    IProfiles profiles = new Profiles(sequenceLength, colSize);

    for (int col = 0; col < sequenceLength; ++col)
    {
        // foreach obtains a fresh enumerator per column, so no Reset() support is required.
        int sequenceIndex = 0;
        foreach (ISequence sequence in sequences)
        {
            // Weights are per sequence, so index them by the sequence's position
            // in the collection rather than by the alignment column.
            float weight = weights[sequenceIndex];
            ++sequenceIndex;

            var symbol = sequence[col];

            if (sequence.Alphabet.CheckIsAmbiguous(symbol))
            {
                // An ambiguous symbol adds the sequence weight to every symbol it maps to.
                var mappedSymbols = AmbiguousCharactersMap[symbol];
                for (int b = 0; b < mappedSymbols.Count; ++b)
                {
                    profiles[col][ItemSet[mappedSymbols[b]]] += weight;
                }
            }
            else
            {
                profiles[col][ItemSet[symbol]] += weight;
            }
        }

        MsaUtils.Normalize(profiles[col]);
    }

    profiles.ColumnSize = colSize;
    profiles.RowSize = sequenceLength;

    return profiles;
}
/// <summary>
/// Generate profiles from one single sequence.
/// The set of sequence items of the seq should be the same as
/// 'static ItemSet' of this class.
/// Each row corresponds to one sequence position; the entry of the symbol found there
/// (or of every symbol an ambiguous symbol maps to) is incremented by one.
/// </summary>
/// <param name="seq">an input sequence</param>
/// <param name="weight">
/// sequence weight; the current implementation counts symbols by one and
/// does not apply this value
/// </param>
/// <returns>a profile with one row per sequence position; rows are not normalized</returns>
/// <exception cref="Exception">an index was out of range; the original exception is attached as the inner exception</exception>
public static IProfiles GenerateProfiles(ISequence seq, float weight)
{
    int sequenceLength = (int)seq.Count;
    int colSize = (ItemSet.Count + 1) / 2;
    IProfiles profiles = new Profiles(sequenceLength, colSize);

    for (int i = 0; i < sequenceLength; ++i)
    {
        try
        {
            if (seq.Alphabet.CheckIsAmbiguous(seq[i]))
            {
                // An ambiguous symbol counts once for every symbol it maps to.
                for (int b = 0; b < AmbiguousCharactersMap[seq[i]].Count; ++b)
                {
                    ++(profiles[i][ItemSet[AmbiguousCharactersMap[seq[i]][b]]]);
                }
            }
            else
            {
                ++(profiles[i][ItemSet[seq[i]]]);
            }
        }
        catch (IndexOutOfRangeException ex)
        {
            // Attach the caught exception itself so the failing index and stack trace are kept.
            throw new Exception("Invalid alphabet", ex);
        }
    }

    profiles.ColumnSize = colSize;
    profiles.RowSize = sequenceLength;

    return profiles;
}
/// <summary>
/// Generate IProfiles from a subset of aligned sequences.
/// In the subset of sequences, those columns containing no residues,
/// i.e. indels only, are discarded; the discarded column positions are
/// returned in <paramref name="allIndelPositions"/>.
/// </summary>
/// <param name="sequences">a set of aligned sequences</param>
/// <param name="sequenceIndices">the subset indices of the aligned sequences</param>
/// <param name="allIndelPositions">the list of all-indel positions that have been removed when constructing</param>
/// <param name="weights">
/// sequence weights, looked up with the same indices used for <paramref name="sequences"/>;
/// the array is passed to MsaUtils.Normalize before it is used
/// </param>
/// <returns>a profile containing one row for every column that was kept</returns>
/// <exception cref="ArgumentException">
/// the sequence list is empty, there are more indices than sequences,
/// or the selected sequences differ in length or alphabet
/// </exception>
/// <exception cref="Exception">an index was out of range; the original exception is attached as the inner exception</exception>
public static IProfiles GenerateProfiles(List<ISequence> sequences, List<int> sequenceIndices, out List<int> allIndelPositions, float[] weights)
{
    IProfiles profiles;

    if (sequences.Count <= 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    if (sequenceIndices.Count > sequences.Count)
    {
        throw new ArgumentException("Invalid subset indices");
    }

    MsaUtils.Normalize(weights);

    try
    {
        int sequenceLength = (int)sequences[sequenceIndices[0]].Count;
        IAlphabet alphabet = sequences[sequenceIndices[0]].Alphabet;

        // The first selected sequence sets the length and alphabet all others must match.
        foreach (int i in sequenceIndices)
        {
            if (sequences[i].Count != sequenceLength)
            {
                throw new ArgumentException("Input sequences are not aligned");
            }

            if (sequences[i].Alphabet != alphabet)
            {
                throw new ArgumentException("Input sequences use different alphabets");
            }
        }

        allIndelPositions = new List<int>();
        profiles = new Profiles();
        int colSize = (ItemSet.Count + 1) / 2;

        // Discard all indels columns.
        for (int col = 0; col < sequenceLength; ++col)
        {
            float[] vector = new float[colSize];
            bool isAllIndels = true;

            foreach (int i in sequenceIndices)
            {
                if (!sequences[i].Alphabet.CheckIsGap(sequences[i][col]))
                {
                    isAllIndels = false;
                }

                if (sequences[i].Alphabet.CheckIsAmbiguous(sequences[i][col]))
                {
                    // Spread the sequence weight over every symbol the ambiguous symbol maps to.
                    for (int b = 0; b < AmbiguousCharactersMap[sequences[i][col]].Count; ++b)
                    {
                        vector[ItemSet[AmbiguousCharactersMap[sequences[i][col]][b]]] += weights[i];
                    }
                }
                else
                {
                    vector[ItemSet[sequences[i][col]]] += weights[i];
                }
            }

            if (isAllIndels)
            {
                // Nothing but gaps in this column: record its position and drop it.
                allIndelPositions.Add(col);
            }
            else
            {
                MsaUtils.Normalize(vector);
                profiles.ProfilesMatrix.Add(vector);
            }
        }

        profiles.ColumnSize = colSize;
        profiles.RowSize = profiles.ProfilesMatrix.Count;
    }
    catch (IndexOutOfRangeException ex)
    {
        // Pass the caught exception as the inner exception; its own InnerException
        // is normally null and would discard the real cause.
        throw new Exception("Invalid index", ex);
    }

    return profiles;
}