/// <summary>
/// Projects every sequence in the list onto the PDB ID part of its sequence ID.
/// </summary>
/// <param name="sequenceList">Sequences whose IDs are split into PDB ID and chain ID. An empty list is accepted and yields an empty result.</param>
/// <param name="distinct">When true, repeated PDB IDs are collapsed so that each appears once.</param>
/// <returns>A new list holding the PDB IDs.</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequenceList"/> is null.</exception>
public static List<string> SequenceListToPdbIdList(List<ISequence> sequenceList, bool distinct = true)
{
    if (sequenceList == null)
    {
        throw new ArgumentOutOfRangeException(nameof(sequenceList));
    }

    // Build the projection once; it is only enumerated by the ToList() call below.
    IEnumerable<string> pdbIds = sequenceList.Select(sequence => SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequence.ID).PdbId);

    if (distinct)
    {
        pdbIds = pdbIds.Distinct();
    }

    return pdbIds.ToList();
}
/// <summary>
/// Loads the sequences stored in a single sequence file. FASTA is the preferred format.
/// </summary>
/// <param name="sequenceFilename">Path of the file to load. The parser is chosen from this file name.</param>
/// <param name="molNames">Optional filter: when non-null and non-empty, only sequences whose ID has a "Mol" component contained in this array are kept.</param>
/// <param name="distinct">When true, duplicate sequence objects are removed from the loaded list.</param>
/// <returns>The loaded (and optionally filtered) sequences, or null when no parser could be found for the file name.</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequenceFilename"/> is null, empty or whitespace.</exception>
/// <exception cref="FileNotFoundException">Thrown when the file does not exist.</exception>
/// <exception cref="DirectoryNotFoundException">May propagate unchanged from the parser lookup.</exception>
public static List<ISequence> LoadSequenceFile(string sequenceFilename, string[] molNames, bool distinct = true)
{
    if (string.IsNullOrWhiteSpace(sequenceFilename))
    {
        throw new ArgumentOutOfRangeException(nameof(sequenceFilename));
    }

    if (!File.Exists(sequenceFilename))
    {
        throw new FileNotFoundException(sequenceFilename);
    }

    // Any DirectoryNotFoundException raised here is allowed to propagate as-is, so the
    // caller sees the original exception object together with its stack trace.
    ISequenceParser sequenceParser = SequenceParsers.FindParserByFileName(sequenceFilename);

    if (sequenceParser == null)
    {
        // No parser is available for this file name; callers test for null.
        return null;
    }

    List<ISequence> sequences;

    try
    {
        sequences = sequenceParser.Parse().ToList();
    }
    finally
    {
        // Always release the parser, even when parsing fails part-way through.
        sequenceParser.Close();
    }

    if (distinct)
    {
        sequences = sequences.Distinct().ToList();
    }

    if (sequences.Count > 0 && molNames != null && molNames.Length > 0)
    {
        sequences = sequences.Where(a => molNames.Contains(SequenceIdSplit.SequenceIdToPdbIdAndChainId(a.ID).Mol)).ToList();
    }

    return sequences;
}
/// <summary>
/// Writes a list of sequences to disk as a spreadsheet with the columns "PDB ID", "Chain" and "Sequence".
/// </summary>
/// <param name="sequences">Sequences to save; one row is written per sequence, after a header row.</param>
/// <param name="filename">Target file name handed to the spreadsheet writer.</param>
/// <param name="tsvFormat">Request TSV output.</param>
/// <param name="xlsxFormat">Request XLSX output.</param>
/// <returns>The value returned by the spreadsheet writer (presumably the names of the files saved - confirm against SpreadsheetFileHandler).</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="sequences"/> is null or empty, when <paramref name="filename"/> is null, empty or whitespace,
/// or when neither output format is selected.
/// </exception>
public static string[] SaveSequencesAsSpreadsheet(List<ISequence> sequences, string filename, bool tsvFormat = false, bool xlsxFormat = true)
{
    if (sequences == null || sequences.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequences));
    }

    if (string.IsNullOrWhiteSpace(filename))
    {
        throw new ArgumentOutOfRangeException(nameof(filename));
    }

    bool anyFormatSelected = tsvFormat || xlsxFormat;

    if (!anyFormatSelected)
    {
        throw new ArgumentOutOfRangeException(nameof(tsvFormat), tsvFormat, "No file formats were selected");
    }

    // The first row carries the column headers.
    var rowList = new List<SpreadsheetCell[]>
    {
        new[]
        {
            new SpreadsheetCell("PDB ID"),
            new SpreadsheetCell("Chain"),
            new SpreadsheetCell("Sequence"),
        },
    };

    // One data row per sequence: PDB ID, chain ID and the sequence text.
    rowList.AddRange(sequences.Select(sequence =>
    {
        SequenceIdSplit.SequenceIdToPdbIdAndChainIdResult id = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequence.ID);

        return new[]
        {
            new SpreadsheetCell(id.PdbId),
            new SpreadsheetCell(id.ChainId),
            new SpreadsheetCell(sequence.ConvertToString()),
        };
    }));

    return SpreadsheetFileHandler.SaveSpreadsheet(filename, null, rowList, null, tsvFormat, xlsxFormat);
}
/// <summary>
/// Groups the chain IDs found in a sequence list by the PDB ID they belong to.
/// </summary>
/// <param name="sequenceList">Sequences whose IDs are split into PDB ID and chain ID.</param>
/// <returns>
/// A dictionary keyed by PDB ID. Each value lists the chain IDs of that PDB ID in the order the
/// sequences were encountered; a chain ID is added once per sequence, so repeats are possible.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequenceList"/> is null.</exception>
public static Dictionary<string, List<string>> PdbIdChainIdList(List<ISequence> sequenceList)
{
    // Same guard (and exception type) as SequenceListToPdbIdList, instead of failing
    // later with a NullReferenceException inside the loop.
    if (sequenceList == null)
    {
        throw new ArgumentOutOfRangeException(nameof(sequenceList));
    }

    var result = new Dictionary<string, List<string>>();

    foreach (var seq in sequenceList)
    {
        var seqId = SequenceIdSplit.SequenceIdToPdbIdAndChainId(seq.ID);

        // Single lookup: fetch the existing chain list or create and register a new one.
        List<string> chainIds;

        if (!result.TryGetValue(seqId.PdbId, out chainIds))
        {
            chainIds = new List<string>();
            result.Add(seqId.PdbId, chainIds);
        }

        chainIds.Add(seqId.ChainId);
    }

    return result;
}
/// <summary>
/// Removes sequences which are not proteins (e.g. DNA, RNA, Hybrid) from the list.
/// Removal is by PDB ID: every sequence sharing a PDB ID with a non-protein sequence is removed.
/// </summary>
/// <param name="cancellationToken">Checked before each sequence; when cancellation is requested a worker stops scanning its range.</param>
/// <param name="sequences">The sequences to filter. The list is modified in place by <c>RemoveSequences</c>.</param>
/// <param name="progressActionSet">Receives start, progress and estimated-time-remaining notifications.</param>
/// <param name="totalThreads">Forwarded to the work division; the meaning of the -1 default is defined there.</param>
/// <returns>The list returned by <c>RemoveSequences</c>, i.e. the filtered <paramref name="sequences"/> list.</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequences"/> is null or empty.</exception>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="progressActionSet"/> is null.</exception>
public static List<ISequence> RemoveNonProteinAlphabetSequences(CancellationToken cancellationToken, List<ISequence> sequences, ProgressActionSet progressActionSet, int totalThreads = -1)
{
    if (sequences == null || sequences.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequences));
    }

    if (progressActionSet == null)
    {
        throw new ArgumentNullException(nameof(progressActionSet));
    }

    var workDivision = new WorkDivision<List<string>>(sequences.Count, totalThreads);

    ProgressActionSet.StartAction(sequences.Count, progressActionSet);

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable so each lambda captures its own value.
        int localThreadIndex = threadIndex;

        Task<List<string>> task = Task.Run(() =>
        {
            var taskResult = new List<string>();

            for (int index = workDivision.ThreadFirstIndex[localThreadIndex]; index <= workDivision.ThreadLastIndex[localThreadIndex]; index++)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    break;
                }

                // Only sequences that are going to be flagged need their ID parsed.
                if (sequences[index].Alphabet != Alphabets.Protein)
                {
                    taskResult.Add(SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[index].ID).PdbId);
                }

                workDivision.IncrementItemsCompleted(1);

                ProgressActionSet.ProgressAction(1, progressActionSet);
                ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
            }

            return taskResult;
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    var nonProteinPdbIds = new List<string>();

    // The status checks must come before Result: reading Result on a faulted or
    // canceled task throws instead of letting the filter skip that task.
    foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
    {
        nonProteinPdbIds.AddRange(task.Result);
    }

    nonProteinPdbIds = nonProteinPdbIds.Distinct().ToList();

    return RemoveSequences(cancellationToken, sequences, nonProteinPdbIds);
}
/// <summary>
/// Removes sequences from the list according to whether their PDB ID appears in a given ID list.
/// </summary>
/// <param name="cancellationToken">Passed to <c>Task.Run</c> when the worker tasks are started.</param>
/// <param name="sequences">The sequences to filter. The list is modified in place.</param>
/// <param name="sequencesToKeepOrRemove">PDB IDs to compare against. An empty list is allowed.</param>
/// <param name="options">
/// <c>RemoveSequencesInList</c> removes sequences whose PDB ID is in the list;
/// <c>RemoveSequencesNotInList</c> removes sequences whose PDB ID is not in the list.
/// Any other value removes nothing.
/// </param>
/// <param name="totalThreads">Forwarded to the work division; the meaning of the -1 default is defined there.</param>
/// <returns>The same <paramref name="sequences"/> instance, after removal.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="sequences"/> is null or empty, or when <paramref name="sequencesToKeepOrRemove"/> is null.
/// </exception>
public static List<ISequence> RemoveSequences(CancellationToken cancellationToken, List<ISequence> sequences, List<string> sequencesToKeepOrRemove, RemoveSequencesOptions options = RemoveSequencesOptions.RemoveSequencesInList, int totalThreads = -1)
{
    if (sequences == null || sequences.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequences));
    }

    if (sequencesToKeepOrRemove == null)
    {
        throw new ArgumentOutOfRangeException(nameof(sequencesToKeepOrRemove));
    }

    // Hash the IDs once so each membership test is O(1) rather than a linear scan.
    // The set is only read by the worker tasks, never modified.
    var listedIds = new HashSet<string>(sequencesToKeepOrRemove);

    bool removeListed = options == RemoveSequencesOptions.RemoveSequencesInList;
    bool removeUnlisted = options == RemoveSequencesOptions.RemoveSequencesNotInList;

    var workDivision = new WorkDivision<List<int>>(sequences.Count, totalThreads);

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable so each lambda captures its own value.
        int localThreadIndex = threadIndex;

        Task<List<int>> task = Task.Run(() =>
        {
            var taskResult = new List<int>();

            for (int sequencesIndex = workDivision.ThreadFirstIndex[localThreadIndex]; sequencesIndex <= workDivision.ThreadLastIndex[localThreadIndex]; sequencesIndex++)
            {
                string proteinId = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndex].ID).PdbId;

                bool isListed = listedIds.Contains(proteinId);

                if ((removeListed && isListed) || (removeUnlisted && !isListed))
                {
                    taskResult.Add(sequencesIndex);
                }

                workDivision.IncrementItemsCompleted(1);
            }

            return taskResult;
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    var sequenceIndexesToRemove = new List<int>();

    // The status checks must come before Result: reading Result on a faulted or
    // canceled task throws instead of letting the filter skip that task.
    foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
    {
        sequenceIndexesToRemove.AddRange(task.Result);
    }

    sequenceIndexesToRemove = sequenceIndexesToRemove.Distinct().ToList();
    sequenceIndexesToRemove.Sort();

    // Remove from the highest index down so the remaining indexes stay valid.
    for (int i = sequenceIndexesToRemove.Count - 1; i >= 0; i--)
    {
        sequences.RemoveAt(sequenceIndexesToRemove[i]);
    }

    return sequences;
}
/// <summary>
/// Intended to remove duplicate sequences from the list. The duplicate-detection step is not
/// implemented yet: nothing is ever selected for removal, so the list comes back unchanged.
/// The method still reports progress once per distinct PDB ID.
/// </summary>
/// <param name="cancellationToken">Checked before each PDB ID; when cancellation is requested a worker stops its range.</param>
/// <param name="sequences">The sequences to process.</param>
/// <param name="progressActionSet">Receives start, progress and estimated-time-remaining notifications.</param>
/// <param name="totalThreads">Forwarded to the work division; the meaning of the -1 default is defined there.</param>
/// <returns>The list returned by <c>RemoveSequences</c>.</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequences"/> is null or empty.</exception>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="progressActionSet"/> is null.</exception>
public static List<ISequence> RemoveDuplicates(CancellationToken cancellationToken, List<ISequence> sequences, ProgressActionSet progressActionSet, int totalThreads = -1)
{
    if (sequences == null || sequences.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequences));
    }

    if (progressActionSet == null)
    {
        throw new ArgumentNullException(nameof(progressActionSet));
    }

    var pdbIdList = SequenceListToPdbIdList(sequences);

    var workDivision = new WorkDivision(pdbIdList.Count, totalThreads);

    ProgressActionSet.StartAction(pdbIdList.Count, progressActionSet);

    // Sequences selected for removal. Nothing adds to this list until the TODO below is
    // implemented. Once workers do add to it, guard it with a dedicated lock object.
    var remove = new List<ISequence>();

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable so each lambda captures its own value.
        int localThreadIndex = threadIndex;

        var task = Task.Run(() =>
        {
            for (int index = workDivision.ThreadFirstIndex[localThreadIndex]; index <= workDivision.ThreadLastIndex[localThreadIndex]; index++)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    break;
                }

                // TODO: implement duplicate detection for pdbIdList[index]. The earlier sketch was:
                //   1. find the sequences whose aligned similarity to the current one is >= 90%;
                //   2. order the upper-cased PDB IDs of those sequences alphabetically;
                //   3. keep the sequences belonging to the last PDB ID in that order;
                //   4. add all the other matches to 'remove', and remember the processed
                //      sequences so later iterations can skip them.

                workDivision.IncrementItemsCompleted(1);

                ProgressActionSet.ProgressAction(1, progressActionSet);
                ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
            }
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    // NOTE(review): the IDs are upper-cased here, while RemoveSequences compares them with
    // PDB IDs that are not upper-cased - confirm the casing before 'remove' is populated.
    var pdbIdsToRemove = remove
        .Distinct()
        .Select(p => SequenceIdSplit.SequenceIdToPdbIdAndChainId(p.ID).PdbId.ToUpperInvariant())
        .ToList();

    return RemoveSequences(cancellationToken, sequences, pdbIdsToRemove);
}
/// <summary>
/// Compares every pair of sequences that share a PDB ID and sorts the PDB IDs into
/// hetero-dimer, homo-dimer and homology-dimer lists according to <c>FindDimerType</c>.
/// A PDB ID can appear in more than one list when its pairs are classified differently.
/// </summary>
/// <param name="cancellationToken">Checked before each outer and inner iteration; when cancellation is requested a worker stops early.</param>
/// <param name="sequences">The sequences to classify.</param>
/// <param name="minimumHeterodimerSimilarityRequired">Forwarded to <c>FindDimerType</c> (presumably a percentage threshold - confirm there).</param>
/// <param name="minimumHomodimerSimiliarityRequired">Forwarded to <c>FindDimerType</c> (presumably a percentage threshold - confirm there).</param>
/// <param name="progressActionSet">Receives start, progress and estimated-time-remaining notifications. Despite the null default, null is rejected.</param>
/// <param name="totalThreads">Forwarded to the work division; the meaning of the -1 default is defined there.</param>
/// <returns>The three PDB ID lists, each without repeated entries.</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequences"/> is null or empty.</exception>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="progressActionSet"/> is null.</exception>
public static DimerSequenceTypeCategories<string> SplitDimersHomoHetero(CancellationToken cancellationToken, List<ISequence> sequences, decimal minimumHeterodimerSimilarityRequired = 30.0m, decimal minimumHomodimerSimiliarityRequired = 90.0m, ProgressActionSet progressActionSet = null, int totalThreads = -1)
{
    if (sequences == null || sequences.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequences));
    }

    if (progressActionSet == null)
    {
        throw new ArgumentNullException(nameof(progressActionSet));
    }

    var workDivision = new WorkDivision<DimerSequenceTypeCategories<string>>(sequences.Count, totalThreads);

    ProgressActionSet.StartAction(sequences.Count, progressActionSet);

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable so each lambda captures its own value.
        int localThreadIndex = threadIndex;

        Task<DimerSequenceTypeCategories<string>> task = Task.Run(() =>
        {
            var taskResult = new DimerSequenceTypeCategories<string>();

            for (int sequencesIndexA = workDivision.ThreadFirstIndex[localThreadIndex]; sequencesIndexA <= workDivision.ThreadLastIndex[localThreadIndex]; sequencesIndexA++)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    break;
                }

                string proteinIdA = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndexA].ID).PdbId;

                // Start after A: a sequence is never compared with itself, and each unordered
                // pair is handled once, when the lower index is A.
                for (int sequencesIndexB = sequencesIndexA + 1; sequencesIndexB < sequences.Count; sequencesIndexB++)
                {
                    if (cancellationToken.IsCancellationRequested)
                    {
                        break;
                    }

                    var proteinIdB = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndexB].ID).PdbId;

                    // Only chains of the same protein are compared.
                    if (proteinIdA != proteinIdB)
                    {
                        continue;
                    }

                    var dimerType = FindDimerType(sequences[sequencesIndexA], sequences[sequencesIndexB], minimumHeterodimerSimilarityRequired, minimumHomodimerSimiliarityRequired);

                    switch (dimerType)
                    {
                        case DimerType.HeteroDimer:
                            taskResult.HeteroDimerPdbIdList.Add(proteinIdA);
                            break;

                        case DimerType.HomoDimer:
                            taskResult.HomoDimerPdbIdList.Add(proteinIdA);
                            break;

                        case DimerType.HomologyDimer:
                            taskResult.HomologyDimerPdbIdList.Add(proteinIdA);
                            break;
                    }
                }

                workDivision.IncrementItemsCompleted(1);

                ProgressActionSet.ProgressAction(1, progressActionSet);
                ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
            }

            return taskResult;
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    var dimerSequenceTypeCategories = new DimerSequenceTypeCategories<string>();

    // Status checks come before Result so faulted or canceled tasks are skipped, not dereferenced.
    foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsFaulted && !t.IsCanceled && t.Result != null))
    {
        dimerSequenceTypeCategories.HeteroDimerPdbIdList.AddRange(task.Result.HeteroDimerPdbIdList);
        dimerSequenceTypeCategories.HomoDimerPdbIdList.AddRange(task.Result.HomoDimerPdbIdList);
        dimerSequenceTypeCategories.HomologyDimerPdbIdList.AddRange(task.Result.HomologyDimerPdbIdList);
    }

    dimerSequenceTypeCategories.HeteroDimerPdbIdList = dimerSequenceTypeCategories.HeteroDimerPdbIdList.Distinct().ToList();
    dimerSequenceTypeCategories.HomoDimerPdbIdList = dimerSequenceTypeCategories.HomoDimerPdbIdList.Distinct().ToList();
    dimerSequenceTypeCategories.HomologyDimerPdbIdList = dimerSequenceTypeCategories.HomologyDimerPdbIdList.Distinct().ToList();

    return dimerSequenceTypeCategories;
}
/// <summary>
/// Loads several sequence files and concatenates their sequences. FASTA is the preferred format.
/// Sequences whose upper-cased PDB ID is in <c>ProteinDataBankFileOperations.PdbIdBadList</c> are dropped.
/// </summary>
/// <param name="sequenceFilenames">Paths of the files to load, processed in array order.</param>
/// <param name="molNames">Optional filter forwarded to <c>LoadSequenceFile</c> for every file.</param>
/// <param name="numberSequencesLoaded">
/// Receives, per input file, how many sequences were accepted from that file. The counts are taken
/// before the final de-duplication across files.
/// </param>
/// <param name="distinct">When true, duplicates are removed within each file and again across the combined list.</param>
/// <returns>The combined list of accepted sequences.</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequenceFilenames"/> is null or empty.</exception>
public static List<ISequence> LoadSequenceFileList(string[] sequenceFilenames, string[] molNames, out int[] numberSequencesLoaded, bool distinct = true)
{
    if (sequenceFilenames == null || sequenceFilenames.Length == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequenceFilenames));
    }

    numberSequencesLoaded = new int[sequenceFilenames.Length];

    var combined = new List<ISequence>();

    for (int fileIndex = 0; fileIndex < sequenceFilenames.Length; fileIndex++)
    {
        List<ISequence> loaded = LoadSequenceFile(sequenceFilenames[fileIndex], molNames, distinct);

        // Nothing usable came back for this file: record a zero count and move on.
        if (loaded == null || loaded.Count == 0)
        {
            numberSequencesLoaded[fileIndex] = 0;
            continue;
        }

        List<ISequence> accepted = loaded
            .Where(a => !ProteinDataBankFileOperations.PdbIdBadList.Contains(SequenceIdSplit.SequenceIdToPdbIdAndChainId(a.ID).PdbId.ToUpperInvariant()))
            .ToList();

        numberSequencesLoaded[fileIndex] = accepted.Count;
        combined.AddRange(accepted);
    }

    return distinct ? combined.Distinct().ToList() : combined;
}