/// <summary>
        ///     Extracts the PDB ID part of every sequence ID in the list, optionally collapsing repeated PDB IDs.
        /// </summary>
        /// <param name="sequenceList">Sequences whose IDs are split with <c>SequenceIdSplit.SequenceIdToPdbIdAndChainId</c>.</param>
        /// <param name="distinct">When true (the default) each PDB ID appears once in the result; when false there is one entry per sequence.</param>
        /// <returns>A new list of PDB IDs.</returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequenceList"/> is null.</exception>
        public static List<string> SequenceListToPdbIdList(List<ISequence> sequenceList, bool distinct = true)
        {
            // Only null is rejected; an empty list simply yields an empty result.
            if (sequenceList == null)
            {
                throw new ArgumentOutOfRangeException(nameof(sequenceList));
            }

            IEnumerable<string> pdbIds = sequenceList.Select(sequence => SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequence.ID).PdbId);

            return (distinct ? pdbIds.Distinct() : pdbIds).ToList();
        }
        /// <summary>
        ///     This method loads a single sequence file.  FASTA is the preferred format.
        /// </summary>
        /// <param name="sequenceFilename">Path of the sequence file to load; must exist on disk.</param>
        /// <param name="molNames">
        ///     Optional filter. When non-null and non-empty, only sequences whose ID yields a <c>Mol</c> value
        ///     contained in this array are kept.
        /// </param>
        /// <param name="distinct">When true (the default) duplicate sequences are removed using the default equality comparer.</param>
        /// <returns>
        ///     The loaded (and optionally filtered) sequences, or null when no parser could be found for the file name.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequenceFilename"/> is null, empty or whitespace.</exception>
        /// <exception cref="FileNotFoundException">Thrown when <paramref name="sequenceFilename"/> does not exist.</exception>
        /// <exception cref="DirectoryNotFoundException">Propagated unchanged from the parser lookup.</exception>
        public static List<ISequence> LoadSequenceFile(string sequenceFilename, string[] molNames, bool distinct = true)
        {
            if (string.IsNullOrWhiteSpace(sequenceFilename))
            {
                throw new ArgumentOutOfRangeException(nameof(sequenceFilename));
            }

            if (!File.Exists(sequenceFilename))
            {
                throw new FileNotFoundException(sequenceFilename);
            }

            // A DirectoryNotFoundException raised by the lookup is allowed to propagate as-is so that
            // its original stack trace is preserved (previously it was re-created and the trace was lost).
            ISequenceParser sequenceParser = SequenceParsers.FindParserByFileName(sequenceFilename);

            if (sequenceParser == null)
            {
                // No parser recognises this file name; callers in this file treat null as "nothing loaded".
                return null;
            }

            List<ISequence> sequences;

            try
            {
                // Materialise inside the try block so the parser is still open while the file is read.
                sequences = sequenceParser.Parse().ToList();
            }
            finally
            {
                // Always release the parser, even when parsing throws.
                sequenceParser.Close();
            }

            if (distinct)
            {
                sequences = sequences.Distinct().ToList();
            }

            if (sequences.Count > 0 && molNames != null && molNames.Length > 0)
            {
                sequences = sequences.Where(a => molNames.Contains(SequenceIdSplit.SequenceIdToPdbIdAndChainId(a.ID).Mol)).ToList();
            }

            return sequences;
        }
        /// <summary>
        ///     Writes a list of sequences to disk as a three-column spreadsheet (PDB ID, chain, sequence text).
        /// </summary>
        /// <param name="sequences">Sequences to export; must contain at least one item.</param>
        /// <param name="filename">Target file name handed to <c>SpreadsheetFileHandler.SaveSpreadsheet</c>.</param>
        /// <param name="tsvFormat">Request TSV output (default false).</param>
        /// <param name="xlsxFormat">Request XLSX output (default true).</param>
        /// <returns>
        ///     The array returned by <c>SpreadsheetFileHandler.SaveSpreadsheet</c> — presumably the names of the files written; confirm against that helper.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        ///     Thrown when <paramref name="sequences"/> is null or empty, when <paramref name="filename"/> is blank,
        ///     or when neither output format is selected.
        /// </exception>
        public static string[] SaveSequencesAsSpreadsheet(List<ISequence> sequences, string filename, bool tsvFormat = false, bool xlsxFormat = true)
        {
            if (sequences == null || sequences.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequences));
            }

            if (string.IsNullOrWhiteSpace(filename))
            {
                throw new ArgumentOutOfRangeException(nameof(filename));
            }

            bool anyFormatSelected = tsvFormat || xlsxFormat;

            if (!anyFormatSelected)
            {
                throw new ArgumentOutOfRangeException(nameof(tsvFormat), tsvFormat, "No file formats were selected");
            }

            // First row is the column header; one data row per sequence follows in input order.
            var rows = new List<SpreadsheetCell[]>
            {
                new[] { new SpreadsheetCell("PDB ID"), new SpreadsheetCell("Chain"), new SpreadsheetCell("Sequence") },
            };

            rows.AddRange(sequences.Select(sequence =>
            {
                var idParts = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequence.ID);

                return new[]
                {
                    new SpreadsheetCell(idParts.PdbId),
                    new SpreadsheetCell(idParts.ChainId),
                    new SpreadsheetCell(sequence.ConvertToString()),
                };
            }));

            return SpreadsheetFileHandler.SaveSpreadsheet(filename, null, rows, null, tsvFormat, xlsxFormat);
        }
        // Example #4
        // 0
        /// <summary>
        ///     Groups the chain IDs found in a sequence list by their PDB ID.
        /// </summary>
        /// <param name="sequenceList">Sequences whose IDs are split with <c>SequenceIdSplit.SequenceIdToPdbIdAndChainId</c>.</param>
        /// <returns>
        ///     A dictionary keyed by PDB ID. Each value holds the chain IDs for that PDB ID in the order the
        ///     sequences were encountered; repeated chain IDs are not removed.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="sequenceList"/> is null.</exception>
        public static Dictionary<string, List<string>> PdbIdChainIdList(List<ISequence> sequenceList)
        {
            if (sequenceList == null)
            {
                throw new ArgumentNullException(nameof(sequenceList));
            }

            var result = new Dictionary<string, List<string>>();

            foreach (var seq in sequenceList)
            {
                var seqId = SequenceIdSplit.SequenceIdToPdbIdAndChainId(seq.ID);

                // Single lookup per sequence: fetch the existing chain list or create it on first sight.
                List<string> chainIds;

                if (!result.TryGetValue(seqId.PdbId, out chainIds))
                {
                    chainIds = new List<string>();
                    result.Add(seqId.PdbId, chainIds);
                }

                chainIds.Add(seqId.ChainId);
            }

            return result;
        }
        /// <summary>
        ///     This method removes sequences from the list which are not proteins (e.g. DNA, RNA, Hybrid).
        ///     The PDB ID of every sequence whose alphabet is not <c>Alphabets.Protein</c> is collected, and
        ///     <see cref="RemoveSequences"/> is then called with those IDs, so every sequence sharing such a
        ///     PDB ID is removed.
        /// </summary>
        /// <param name="cancellationToken">Checked before each sequence is examined; when cancelled, the scan stops early and only the IDs found so far are removed.</param>
        /// <param name="sequences">The list to filter; must contain at least one item. It is modified in place by <see cref="RemoveSequences"/>.</param>
        /// <param name="progressActionSet">Receives start, per-item progress and estimated-time notifications; must not be null.</param>
        /// <param name="totalThreads">Passed to <c>WorkDivision</c> to decide how the work is split (default -1).</param>
        /// <returns>The same <paramref name="sequences"/> instance after removal.</returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequences"/> is null or empty.</exception>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="progressActionSet"/> is null.</exception>
        public static List<ISequence> RemoveNonProteinAlphabetSequences(CancellationToken cancellationToken, List<ISequence> sequences, ProgressActionSet progressActionSet, int totalThreads = -1)
        {
            if (sequences == null || sequences.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequences));
            }

            if (progressActionSet == null)
            {
                throw new ArgumentNullException(nameof(progressActionSet));
            }

            var workDivision = new WorkDivision<List<string>>(sequences.Count, totalThreads);

            ProgressActionSet.StartAction(sequences.Count, progressActionSet);

            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so each task captures its own index.
                int localThreadIndex = threadIndex;

                Task<List<string>> task = Task.Run(() =>
                {
                    var taskResult = new List<string>();

                    for (int index = workDivision.ThreadFirstIndex[localThreadIndex]; index <= workDivision.ThreadLastIndex[localThreadIndex]; index++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        string proteinId = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[index].ID).PdbId;

                        if (sequences[index].Alphabet != Alphabets.Protein)
                        {
                            taskResult.Add(proteinId);
                        }

                        workDivision.IncrementItemsCompleted(1);
                        ProgressActionSet.ProgressAction(1, progressActionSet);
                        ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                    }

                    return taskResult;
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();

            var result = new List<string>();

            // Check the task status BEFORE touching Result: reading Result on a cancelled or faulted
            // task throws, so the status checks must short-circuit first.
            foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
            {
                result.AddRange(task.Result);
            }

            result = result.Distinct().ToList();

            return RemoveSequences(cancellationToken, sequences, result);
        }
        /// <summary>
        ///     This method removes sequences from the list according to a list of PDB IDs.
        ///     Matching is done on the PDB ID part of each sequence ID, so all chains of a matching PDB ID are affected.
        /// </summary>
        /// <param name="cancellationToken">Passed to <c>Task.Run</c> when the worker tasks are started; it is not polled inside the scan loop.</param>
        /// <param name="sequences">The list to filter; must contain at least one item. It is modified in place.</param>
        /// <param name="sequencesToKeepOrRemove">PDB IDs to remove or to keep, depending on <paramref name="options"/>. May be empty but not null.</param>
        /// <param name="options">
        ///     <c>RemoveSequencesInList</c> (default) removes sequences whose PDB ID is in the list;
        ///     <c>RemoveSequencesNotInList</c> removes those whose PDB ID is not in the list.
        /// </param>
        /// <param name="totalThreads">Passed to <c>WorkDivision</c> to decide how the work is split (default -1).</param>
        /// <returns>The same <paramref name="sequences"/> instance after removal.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        ///     Thrown when <paramref name="sequences"/> is null or empty, or when <paramref name="sequencesToKeepOrRemove"/> is null.
        /// </exception>
        public static List<ISequence> RemoveSequences(CancellationToken cancellationToken, List<ISequence> sequences, List<string> sequencesToKeepOrRemove, RemoveSequencesOptions options = RemoveSequencesOptions.RemoveSequencesInList, int totalThreads = -1)
        {
            if (sequences == null || sequences.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequences));
            }

            if (sequencesToKeepOrRemove == null)
            {
                throw new ArgumentOutOfRangeException(nameof(sequencesToKeepOrRemove));
            }

            // Hash-based membership test instead of a linear List.Contains per sequence.
            // The set is only read by the worker tasks, never modified.
            var idLookup = new HashSet<string>(sequencesToKeepOrRemove);

            var workDivision = new WorkDivision<List<int>>(sequences.Count, totalThreads);

            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so each task captures its own index.
                int localThreadIndex = threadIndex;

                Task<List<int>> task = Task.Run(() =>
                {
                    var taskResult = new List<int>();

                    for (int sequencesIndex = workDivision.ThreadFirstIndex[localThreadIndex]; sequencesIndex <= workDivision.ThreadLastIndex[localThreadIndex]; sequencesIndex++)
                    {
                        string proteinId = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndex].ID).PdbId;
                        bool isListed    = idLookup.Contains(proteinId);

                        if ((options == RemoveSequencesOptions.RemoveSequencesInList && isListed) ||
                            (options == RemoveSequencesOptions.RemoveSequencesNotInList && !isListed))
                        {
                            taskResult.Add(sequencesIndex);
                        }

                        workDivision.IncrementItemsCompleted(1);
                    }

                    return taskResult;
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();

            var sequenceIndexesToRemove = new List<int>();

            // Check the task status BEFORE touching Result: reading Result on a cancelled or faulted
            // task throws, so the status checks must short-circuit first.
            foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
            {
                sequenceIndexesToRemove.AddRange(task.Result);
            }

            sequenceIndexesToRemove = sequenceIndexesToRemove.Distinct().ToList();

            sequenceIndexesToRemove.Sort();

            // Remove from the highest index down so that earlier removals do not shift later indexes.
            for (int position = sequenceIndexesToRemove.Count - 1; position >= 0; position--)
            {
                sequences.RemoveAt(sequenceIndexesToRemove[position]);
            }

            return sequences;
        }
        /// <summary>
        ///     Intended to remove duplicate sequences from the list.
        ///     NOTE(review): the duplicate-detection logic is currently commented out and the nested comparison
        ///     loops have empty bodies, so nothing is ever added to the removal list. As written, the final call
        ///     to <see cref="RemoveSequences"/> receives an empty ID list with the default option and therefore
        ///     removes nothing.
        /// </summary>
        /// <param name="cancellationToken">Checked at the start of each work item and passed to <c>Task.Run</c> and <see cref="RemoveSequences"/>.</param>
        /// <param name="sequences">The sequences to examine; must contain at least one item.</param>
        /// <param name="progressActionSet">Receives start, per-item progress and estimated-time notifications; must not be null.</param>
        /// <param name="totalThreads">Passed to <c>WorkDivision</c> to decide how the work is split (default -1).</param>
        /// <returns>The value returned by <see cref="RemoveSequences"/>, which is the <paramref name="sequences"/> instance itself.</returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequences"/> is null or empty.</exception>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="progressActionSet"/> is null.</exception>
        public static List <ISequence> RemoveDuplicates(CancellationToken cancellationToken, List <ISequence> sequences, ProgressActionSet progressActionSet, int totalThreads = -1)
        {
            if (sequences == null || sequences.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequences));
            }

            if (progressActionSet == null)
            {
                throw new ArgumentNullException(nameof(progressActionSet));
            }

            // Distinct PDB IDs, and for each one the sequences (chains) that share it.
            // The two lists are parallel: pdbIdSequences[i] belongs to pdbIdList[i].
            var pdbIdList      = SequenceListToPdbIdList(sequences);
            var pdbIdSequences = pdbIdList.Select(a => sequences.Where(b => SequenceIdSplit.SequenceIdToPdbIdAndChainId(b.ID).PdbId == a).ToList()).ToList();

            // Work is divided per PDB ID, not per sequence.
            var workDivision = new WorkDivision(pdbIdList.Count, totalThreads);


            ProgressActionSet.StartAction(pdbIdList.Count, progressActionSet);

            // NOTE(review): 'done' and 'remove' are only populated by the commented-out code below,
            // and 'removeLock' is not used anywhere in this method.
            var done       = new List <ISequence>();
            var remove     = new List <ISequence>();
            var removeLock = new object();



            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so each task captures its own index.
                int localThreadIndex = threadIndex;

                var task = Task.Run(() =>
                {
                    for (int index = workDivision.ThreadFirstIndex[localThreadIndex]; index <= workDivision.ThreadLastIndex[localThreadIndex]; index++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        // NOTE(review): iterationPdbId is assigned but not read by the active code.
                        var iterationPdbId     = pdbIdList[index];
                        var iterationPdbIdSeqs = pdbIdSequences[index];// sequences.Where(a => SequenceIdSplit.SequenceIdToPdbIdAndChainId(a.ID).PdbId == pdbId).ToList();

                        //var seq = sequences[index];
                        //var seqid = SequenceIdSplit.SequenceIdToPdbIdAndChainId(seq.ID).PdbId.ToUpperInvariant();

                        // Skip PDB IDs whose sequences have all been handled already.
                        // Note that this 'continue' also skips the progress reporting at the end of the loop body.
                        lock (done)
                        {
                            if (iterationPdbIdSeqs.All(done.Contains))
                            {
                                continue;
                            }
                        }

                        // Visits every pair (sequence of another PDB ID, sequence of this PDB ID).
                        // NOTE(review): the innermost body is empty, so no comparison is performed yet.
                        foreach (var pdbIdSeqSet in pdbIdSequences)
                        {
                            if (pdbIdSeqSet == iterationPdbIdSeqs)
                            {
                                continue;
                            }

                            foreach (var pdbIdSeq in pdbIdSeqSet)
                            {
                                foreach (var iterationPdbIdSeq in iterationPdbIdSeqs)
                                {
                                }
                            }
                        }

                        // find sequences equal to the current iteration item
                        //var equalseq = sequences.Where(a => a.SequenceEqual(seq)).ToList();


                        /*
                         * var equalseq = sequences.Where(a => AlignedSequenceSimilarityPercentage(seq,a) >= 90).ToList();
                         *
                         *
                         *
                         * // get a list of pdbids, ordered alphabetically
                         * var equalseqids = equalseq.Select(p => SequenceIdSplit.SequenceIdToPdbIdAndChainId(p.ID).PdbId.ToUpperInvariant()).OrderBy(p => p).ToList();
                         *
                         * // one or more of the chains might have a difference sequence and so not in the list, update by the ids in the list
                         * //equalseq = sequences.Where(p => equalseqids.Contains(SequenceIdSplit.SequenceIdToPdbIdAndChainId(p.ID).PdbId.ToUpperInvariant())).ToList();
                         *
                         * // add this iteration item and all others with the same sequence to a list to skip in future
                         * lock (done)
                         * {
                         *  done.AddRange(equalseq);
                         * }
                         *
                         * // keep the very last item in the list and all with the same pdbid that it has
                         * var keepid = equalseqids.Last();
                         * var equalseqkeep = equalseq.Where(p => SequenceIdSplit.SequenceIdToPdbIdAndChainId(p.ID).PdbId.ToUpperInvariant() == keepid).ToList();
                         *
                         * // remove the sequences to keep from the removal list
                         * equalseq = equalseq.Where(a => !equalseqkeep.Contains(a)).ToList();
                         *
                         * lock (remove)
                         * {
                         *  remove.AddRange(equalseq);
                         * }
                         */
                        workDivision.IncrementItemsCompleted(1);
                        ProgressActionSet.ProgressAction(1, progressActionSet);
                        ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                    }
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();


            // 'remove' is still empty at this point (see the note above), so remove2 is empty as well.
            var remove2 = remove.Distinct().ToList();


            // The IDs passed on are upper-cased here, whereas RemoveSequences compares them against PDB IDs as parsed.
            // NOTE(review): confirm the casing matches once the detection logic is restored.
            return(RemoveSequences(cancellationToken, sequences, remove2.Select(p => SequenceIdSplit.SequenceIdToPdbIdAndChainId(p.ID).PdbId.ToUpperInvariant()).ToList()));
        }
        /// <summary>
        ///     Classifies each pair of sequences that share a PDB ID with <c>FindDimerType</c> and collects the
        ///     PDB IDs into hetero-, homo- and homology-dimer lists.
        /// </summary>
        /// <param name="cancellationToken">Checked before each outer and inner iteration; when cancelled, the scan stops early and partial results are returned.</param>
        /// <param name="sequences">The sequences to classify; must contain at least one item.</param>
        /// <param name="minimumHeterodimerSimilarityRequired">Forwarded to <c>FindDimerType</c> (default 30.0); presumably a percentage threshold — confirm against that method.</param>
        /// <param name="minimumHomodimerSimiliarityRequired">Forwarded to <c>FindDimerType</c> (default 90.0); presumably a percentage threshold — confirm against that method.</param>
        /// <param name="progressActionSet">Receives progress notifications. Although the parameter defaults to null, a null value is rejected.</param>
        /// <param name="totalThreads">Passed to <c>WorkDivision</c> to decide how the work is split (default -1).</param>
        /// <returns>
        ///     The three PDB ID lists, each de-duplicated. A PDB ID can appear in more than one list when
        ///     different chain pairs of the same entry are classified differently.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequences"/> is null or empty.</exception>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="progressActionSet"/> is null.</exception>
        public static DimerSequenceTypeCategories<string> SplitDimersHomoHetero(CancellationToken cancellationToken, List<ISequence> sequences, decimal minimumHeterodimerSimilarityRequired = 30.0m, decimal minimumHomodimerSimiliarityRequired = 90.0m, ProgressActionSet progressActionSet = null, int totalThreads = -1)
        {
            if (sequences == null || sequences.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequences));
            }

            if (progressActionSet == null)
            {
                throw new ArgumentNullException(nameof(progressActionSet));
            }

            var workDivision = new WorkDivision<DimerSequenceTypeCategories<string>>(sequences.Count, totalThreads);

            ProgressActionSet.StartAction(sequences.Count, progressActionSet);

            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so each task captures its own index.
                int localThreadIndex = threadIndex;

                Task<DimerSequenceTypeCategories<string>> task = Task.Run(() =>
                {
                    var taskResult = new DimerSequenceTypeCategories<string>();

                    for (int sequencesIndexA = workDivision.ThreadFirstIndex[localThreadIndex]; sequencesIndexA <= workDivision.ThreadLastIndex[localThreadIndex]; sequencesIndexA++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        string proteinIdA = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndexA].ID).PdbId;

                        // Each unordered pair is examined once: B starts just after A, so the same index and
                        // already-visited pairs are never reached and need no explicit skip.
                        for (int sequencesIndexB = sequencesIndexA + 1; sequencesIndexB < sequences.Count; sequencesIndexB++)
                        {
                            if (cancellationToken.IsCancellationRequested)
                            {
                                break;
                            }

                            // Only sequences belonging to the same PDB ID are compared.
                            var proteinIdB = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndexB].ID).PdbId;

                            if (proteinIdA != proteinIdB)
                            {
                                continue;
                            }

                            var dimerType = FindDimerType(sequences[sequencesIndexA], sequences[sequencesIndexB], minimumHeterodimerSimilarityRequired, minimumHomodimerSimiliarityRequired);

                            if (dimerType == DimerType.HeteroDimer)
                            {
                                taskResult.HeteroDimerPdbIdList.Add(proteinIdA);
                            }
                            else if (dimerType == DimerType.HomoDimer)
                            {
                                taskResult.HomoDimerPdbIdList.Add(proteinIdA);
                            }
                            else if (dimerType == DimerType.HomologyDimer)
                            {
                                taskResult.HomologyDimerPdbIdList.Add(proteinIdA);
                            }
                        }

                        workDivision.IncrementItemsCompleted(1);
                        ProgressActionSet.ProgressAction(1, progressActionSet);
                        ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                    }

                    return taskResult;
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();

            var dimerSequenceTypeCategories = new DimerSequenceTypeCategories<string>();

            // Status checks come before Result so that cancelled or faulted tasks are skipped without throwing.
            foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsFaulted && !t.IsCanceled && t.Result != null))
            {
                dimerSequenceTypeCategories.HeteroDimerPdbIdList.AddRange(task.Result.HeteroDimerPdbIdList);
                dimerSequenceTypeCategories.HomoDimerPdbIdList.AddRange(task.Result.HomoDimerPdbIdList);
                dimerSequenceTypeCategories.HomologyDimerPdbIdList.AddRange(task.Result.HomologyDimerPdbIdList);
            }

            dimerSequenceTypeCategories.HeteroDimerPdbIdList   = dimerSequenceTypeCategories.HeteroDimerPdbIdList.Distinct().ToList();
            dimerSequenceTypeCategories.HomoDimerPdbIdList     = dimerSequenceTypeCategories.HomoDimerPdbIdList.Distinct().ToList();
            dimerSequenceTypeCategories.HomologyDimerPdbIdList = dimerSequenceTypeCategories.HomologyDimerPdbIdList.Distinct().ToList();

            return dimerSequenceTypeCategories;
        }
        /// <summary>
        ///     Loads several sequence files and merges their contents into one list.  FASTA is the preferred format.
        ///     Sequences whose upper-cased PDB ID appears in <c>ProteinDataBankFileOperations.PdbIdBadList</c> are dropped.
        /// </summary>
        /// <param name="sequenceFilenames">Paths of the files to load; must contain at least one entry.</param>
        /// <param name="molNames">Optional filter forwarded to <see cref="LoadSequenceFile"/> for every file.</param>
        /// <param name="numberSequencesLoaded">
        ///     On return, holds one count per input file (same index as <paramref name="sequenceFilenames"/>):
        ///     the number of sequences kept from that file after the bad-list filter, before cross-file de-duplication.
        /// </param>
        /// <param name="distinct">When true (the default) duplicates are removed per file and again across the merged list.</param>
        /// <returns>The merged list of sequences from all files.</returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequenceFilenames"/> is null or empty.</exception>
        public static List<ISequence> LoadSequenceFileList(string[] sequenceFilenames, string[] molNames, out int[] numberSequencesLoaded, bool distinct = true)
        {
            if (sequenceFilenames == null || sequenceFilenames.Length == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequenceFilenames));
            }

            var merged = new List<ISequence>();

            numberSequencesLoaded = new int[sequenceFilenames.Length];

            for (int fileIndex = 0; fileIndex < sequenceFilenames.Length; fileIndex++)
            {
                List<ISequence> loaded = LoadSequenceFile(sequenceFilenames[fileIndex], molNames, distinct);

                // Nothing usable came back for this file (no parser, or no sequences).
                if (loaded == null || loaded.Count == 0)
                {
                    numberSequencesLoaded[fileIndex] = 0;
                    continue;
                }

                List<ISequence> accepted = loaded
                    .Where(a => !ProteinDataBankFileOperations.PdbIdBadList.Contains(SequenceIdSplit.SequenceIdToPdbIdAndChainId(a.ID).PdbId.ToUpperInvariant()))
                    .ToList();

                numberSequencesLoaded[fileIndex] = accepted.Count;
                merged.AddRange(accepted);
            }

            // The same sequence may occur in more than one file, so de-duplicate again across files.
            return distinct ? merged.Distinct().ToList() : merged;
        }