Esempio n. 1
0
        /// <summary>
        /// Generate a symmetric distance matrix from a set of unaligned sequences.
        /// A k-mer count dictionary is computed for every sequence and passed through
        /// MsaUtils.Normalize; the pairwise distance score of every pair of sequences
        /// is then written to both [row, col] and [col, row] of the distance matrix.
        /// </summary>
        /// <param name="sequences">a set of unaligned sequences</param>
        /// <exception cref="ArgumentNullException">sequences is null</exception>
        /// <exception cref="Exception">the k-mer counting step ran out of memory</exception>
        public void GenerateDistanceMatrix(IList <ISequence> sequences)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }

            // Generate k-mer counting dictionary for each sequence
            try
            {
                _allCountsDictionary = new Dictionary <string, float> [sequences.Count];

                Parallel.For(0, sequences.Count, i =>
                {
                    Dictionary <string, float> currentDictionary = KmerDistanceScoreCalculator.CalculateKmerCounting(sequences[i], _kmerLength);
                    MsaUtils.Normalize(currentDictionary);
                    _allCountsDictionary[i] = currentDictionary;
                });
            }
            catch (OutOfMemoryException ex)
            {
                // Chain the caught exception itself: ex.InnerException is normally null,
                // which would discard the original stack trace.
                throw new Exception("Out of memory when generating kmer counting", ex);
            }
            catch (AggregateException ex)
            {
                // Parallel.For reports exceptions thrown by the loop body wrapped in an
                // AggregateException, so an out-of-memory failure inside a worker has to
                // be looked for here.
                foreach (Exception inner in ex.Flatten().InnerExceptions)
                {
                    if (inner is OutOfMemoryException)
                    {
                        throw new Exception("Out of memory when generating kmer counting", inner);
                    }
                }

                // Any other failure is propagated unchanged.
                throw;
            }

            // Construct a SymmetricDistanceMatrix
            // with dimension equals to the number of sequences
            _distanceMatrix = new SymmetricDistanceMatrix(sequences.Count);

            // Fill in DistanceMatrix: only the lower triangle (col < row) is computed,
            // each score is mirrored into the upper triangle.
            Parallel.For(1, sequences.Count, PAMSAMMultipleSequenceAligner.ParallelOption, row =>
            {
                for (int col = 0; col < row; ++col)
                {
                    float distanceScore = _kmerScoreCalculator.CalculateDistanceScore
                                              (_allCountsDictionary[row], _allCountsDictionary[col]);
                    _distanceMatrix[row, col] = distanceScore;
                    _distanceMatrix[col, row] = distanceScore;
                }
            });
        }
Esempio n. 2
0
        /// <summary>
        /// Generate IProfiles from a set of aligned sequences.
        /// Every alignment column becomes one profile row; each sequence adds its weight
        /// to the item(s) its symbol maps to, and the row is then passed through
        /// MsaUtils.Normalize.
        /// </summary>
        /// <param name="sequences">a set of aligned sequences</param>
        /// <param name="weights">
        /// sequence weights, one per sequence in the enumeration order of
        /// <paramref name="sequences"/>. The array is passed to MsaUtils.Normalize before use.
        /// </param>
        /// <returns>the profiles built from the aligned sequences</returns>
        /// <exception cref="ArgumentNullException">sequences or weights is null</exception>
        /// <exception cref="ArgumentException">
        /// the inputs are empty, the number of weights differs from the number of sequences,
        /// the sequences differ in length, or the sequences use different alphabets
        /// </exception>
        public static IProfiles GenerateProfiles(ICollection <ISequence> sequences, float[] weights)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }
            if (weights == null)
            {
                throw new ArgumentNullException("weights");
            }
            if (sequences.Count != weights.Length)
            {
                throw new ArgumentException("Invalid inputs");
            }
            if (sequences.Count == 0)
            {
                // Without at least one sequence there is no length or alphabet to validate against.
                throw new ArgumentException("Empty input sequences");
            }

            MsaUtils.Normalize(weights);

            // The first sequence defines the expected length and alphabet;
            // every following sequence must agree with it.
            int       sequenceLength = 0;
            IAlphabet alphabet       = null;
            bool      isFirst        = true;

            foreach (ISequence sequence in sequences)
            {
                if (isFirst)
                {
                    sequenceLength = (int)sequence.Count;
                    alphabet       = sequence.Alphabet;
                    isFirst        = false;
                    continue;
                }
                if (sequence.Count != sequenceLength)
                {
                    throw new ArgumentException("Input sequences are not aligned");
                }
                if (sequence.Alphabet != alphabet)
                {
                    throw new ArgumentException("Input sequences use different alphabets");
                }
            }

            // each row is a column; each column is a profile
            int colSize = (ItemSet.Count + 1) / 2;

            IProfiles profiles = new Profiles(sequenceLength, colSize);

            for (int i = 0; i < sequenceLength; ++i)
            {
                // foreach restarts the enumeration for every column; this avoids
                // IEnumerator.Reset(), which not every enumerator supports.
                int sequenceIndex = 0;
                foreach (ISequence sequence in sequences)
                {
                    // The weight belongs to the sequence, not to the column position.
                    float weight = weights[sequenceIndex];
                    ++sequenceIndex;

                    var symbol = sequence[i];

                    if (sequence.Alphabet.CheckIsAmbiguous(symbol))
                    {
                        // An ambiguous symbol contributes its weight to every item it maps to.
                        var mappedSymbols = AmbiguousCharactersMap[symbol];
                        for (int b = 0; b < mappedSymbols.Count; ++b)
                        {
                            profiles[i][ItemSet[mappedSymbols[b]]] += weight;
                        }
                    }
                    else
                    {
                        profiles[i][ItemSet[symbol]] += weight;
                    }
                }
                MsaUtils.Normalize(profiles[i]);
            }
            profiles.ColumnSize = colSize;
            profiles.RowSize    = sequenceLength;
            return(profiles);
        }
Esempio n. 3
0
        /// <summary>
        /// Combine two profiles with alignment array results from dynamic programming algorithm.
        /// The dynamic programming algorithm returns two arrays containing the alignment operations
        /// on the two profiles. This method applies the operation information in the two arrays to
        /// the two original profiles, and combine them into a new aligned profile.
        /// Each output column is the sum of the column of profile A (scaled by numberOfSequencesA and
        /// weights[0]) and the column of profile B (scaled by numberOfSequencesB and weights[1]),
        /// divided by the total number of sequences. Where one side is a gap, a gap-only column
        /// is used for that side.
        /// </summary>
        /// <param name="profileA">first profile</param>
        /// <param name="profileB">second profile</param>
        /// <param name="numberOfSequencesA">the number of sequences in the first profile</param>
        /// <param name="numberOfSequencesB">the number of sequences in the second profile</param>
        /// <param name="aAligned">aligned integer array generated by dynamic programming</param>
        /// <param name="bAligned">aligned integer array generated by dynamic programming</param>
        /// <param name="gapCode">the gap integer code defined in dynamic programming class</param>
        /// <param name="weights">
        /// the weights of two profiles: weights[0] for profile A, weights[1] for profile B.
        /// The array is passed to MsaUtils.Normalize before use.
        /// </param>
        /// <returns>the combined, aligned profile</returns>
        /// <exception cref="ArgumentNullException">a profile, an aligned array or weights is null</exception>
        /// <exception cref="ArgumentException">
        /// the aligned arrays differ in length, or fewer than two weights are supplied
        /// </exception>
        /// <exception cref="Exception">both aligned arrays hold the gap code at the same position</exception>
        public static IProfiles GenerateProfiles(
            IProfiles profileA,
            IProfiles profileB,
            int numberOfSequencesA,
            int numberOfSequencesB,
            int[] aAligned,
            int[] bAligned,
            int gapCode,
            float[] weights)
        {
            if (profileA == null)
            {
                throw new ArgumentNullException("profileA");
            }
            if (profileB == null)
            {
                throw new ArgumentNullException("profileB");
            }
            if (aAligned == null)
            {
                throw new ArgumentNullException("aAligned");
            }
            if (bAligned == null)
            {
                throw new ArgumentNullException("bAligned");
            }
            if (weights == null)
            {
                throw new ArgumentNullException("weights");
            }
            if (aAligned.Length != bAligned.Length)
            {
                throw new ArgumentException("not aligned sequences");
            }
            if (weights.Length < 2)
            {
                // weights[0] and weights[1] are both read below.
                throw new ArgumentException("Two profile weights are required", "weights");
            }

            IProfiles profiles = new Profiles(aAligned.Length, profileA.ColumnSize);

            MsaUtils.Normalize(weights);

            // a profile with gap only: all mass on the last item
            float[] gapProfile = new float[profiles.ColumnSize];
            gapProfile[gapProfile.Length - 1] = 1;

            int totalSequences = numberOfSequencesA + numberOfSequencesB;

            for (int i = 0; i < aAligned.Length; ++i)
            {
                bool aIsGap = aAligned[i] == gapCode;
                bool bIsGap = bAligned[i] == gapCode;

                if (aIsGap && bIsGap)
                {
                    throw new Exception("Both positions are gap between two sets of sequences");
                }

                if (aIsGap)
                {
                    // Side A is a gap: the gap column stands in for profile A.
                    for (int j = 0; j < profiles.ColumnSize; ++j)
                    {
                        profiles[i][j] = ((gapProfile[j] * numberOfSequencesA * weights[0]) + (profileB[bAligned[i]][j] * numberOfSequencesB * weights[1]))
                                         / totalSequences;
                    }
                }
                else if (bIsGap)
                {
                    // Side B is a gap: profile A keeps its own count and weight, and the gap
                    // column stands in for profile B with B's count and weight.
                    for (int j = 0; j < profiles.ColumnSize; ++j)
                    {
                        profiles[i][j] = ((profileA[aAligned[i]][j] * numberOfSequencesA * weights[0]) + (gapProfile[j] * numberOfSequencesB * weights[1]))
                                         / totalSequences;
                    }
                }
                else
                {
                    for (int j = 0; j < profiles.ColumnSize; ++j)
                    {
                        profiles[i][j] = ((profileA[aAligned[i]][j] * numberOfSequencesA * weights[0]) + (profileB[bAligned[i]][j] * numberOfSequencesB * weights[1]))
                                         / totalSequences;
                    }
                }
            }
            return(profiles);
        }
Esempio n. 4
0
        /// <summary>
        /// Generate IProfiles from a subset of aligned sequences.
        /// In the subset of sequences, those columns containing no residues,
        /// i.e. indels only, are discarded.
        /// </summary>
        /// <param name="sequences">a set of aligned sequences</param>
        /// <param name="sequenceIndices">the subset indices of the aligned sequences</param>
        /// <param name="allIndelPositions">
        /// receives the column positions that contained indels only and were therefore
        /// left out of the returned profiles
        /// </param>
        /// <param name="weights">
        /// sequence weights, indexed by the same indices as <paramref name="sequences"/>.
        /// The array is passed to MsaUtils.Normalize before use.
        /// </param>
        /// <returns>the profiles built from the selected sequences, without the all-indel columns</returns>
        /// <exception cref="ArgumentNullException">sequences, sequenceIndices or weights is null</exception>
        /// <exception cref="ArgumentException">
        /// the sequences are empty, the subset indices are empty or more numerous than the sequences,
        /// the selected sequences differ in length, or they use different alphabets
        /// </exception>
        /// <exception cref="Exception">an IndexOutOfRangeException occurred while building the profiles</exception>
        public static IProfiles GenerateProfiles(List <ISequence> sequences, List <int> sequenceIndices, out List <int> allIndelPositions, float[] weights)
        {
            if (sequences == null)
            {
                throw new ArgumentNullException("sequences");
            }
            if (sequenceIndices == null)
            {
                throw new ArgumentNullException("sequenceIndices");
            }
            if (weights == null)
            {
                throw new ArgumentNullException("weights");
            }
            if (sequences.Count <= 0)
            {
                throw new ArgumentException("Empty input sequences");
            }
            if (sequenceIndices.Count == 0 || sequenceIndices.Count > sequences.Count)
            {
                // An empty subset has no reference sequence to take length and alphabet from.
                throw new ArgumentException("Invalid subset indices");
            }

            MsaUtils.Normalize(weights);

            allIndelPositions = new List <int>();

            IProfiles profiles = new Profiles();
            int       colSize  = (ItemSet.Count + 1) / 2;

            try
            {
                // The first selected sequence defines the expected length and alphabet.
                int       sequenceLength = (int)sequences[sequenceIndices[0]].Count;
                IAlphabet alphabet       = sequences[sequenceIndices[0]].Alphabet;

                foreach (int i in sequenceIndices)
                {
                    if (sequences[i].Count != sequenceLength)
                    {
                        throw new ArgumentException("Input sequences are not aligned");
                    }
                    if (sequences[i].Alphabet != alphabet)
                    {
                        throw new ArgumentException("Input sequences use different alphabets");
                    }
                }

                // Discard all indels columns.
                for (int col = 0; col < sequenceLength; ++col)
                {
                    float[] vector      = new float[colSize];
                    bool    isAllIndels = true;

                    foreach (int i in sequenceIndices)
                    {
                        ISequence sequence = sequences[i];
                        var       symbol   = sequence[col];

                        if (!sequence.Alphabet.CheckIsGap(symbol))
                        {
                            isAllIndels = false;
                        }

                        if (sequence.Alphabet.CheckIsAmbiguous(symbol))
                        {
                            // An ambiguous symbol contributes its weight to every item it maps to.
                            var mappedSymbols = AmbiguousCharactersMap[symbol];
                            for (int b = 0; b < mappedSymbols.Count; ++b)
                            {
                                vector[ItemSet[mappedSymbols[b]]] += weights[i];
                            }
                        }
                        else
                        {
                            vector[ItemSet[symbol]] += weights[i];
                        }
                    }

                    if (isAllIndels)
                    {
                        // Remember the dropped column so the caller knows which positions were removed.
                        allIndelPositions.Add(col);
                    }
                    else
                    {
                        MsaUtils.Normalize(vector);
                        profiles.ProfilesMatrix.Add(vector);
                    }
                }
                profiles.ColumnSize = colSize;
                profiles.RowSize    = profiles.ProfilesMatrix.Count;
            }
            catch (IndexOutOfRangeException ex)
            {
                // Chain the caught exception itself: ex.InnerException is normally null,
                // which would discard the original stack trace.
                throw new Exception("Invalid index", ex);
            }
            return(profiles);
        }