/// <summary>
/// Removes the sequences whose PDB ID does not occur exactly <paramref name="numberOfChainsRequired"/> times
/// in <paramref name="sequenceList"/>.
/// </summary>
/// <param name="cancellationToken">Token handed to the worker tasks and to <c>RemoveSequences</c>.</param>
/// <param name="sequenceList">Sequences to filter. Must contain at least one item.</param>
/// <param name="numberOfChainsRequired">Number of times a PDB ID has to occur in the list for its sequences to be kept.</param>
/// <param name="progressActionSet">Progress callbacks. Required, even though the parameter declares a null default.</param>
/// <returns>The list returned by <c>RemoveSequences</c> once the offending PDB IDs have been removed.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="sequenceList"/> is null or empty.</exception>
/// <exception cref="ArgumentNullException"><paramref name="progressActionSet"/> is null.</exception>
public static List<ISequence> RemoveSequencesWithIncorrectNumberOfChains(CancellationToken cancellationToken, List<ISequence> sequenceList, int numberOfChainsRequired = 2, ProgressActionSet progressActionSet = null)
{
    if (sequenceList == null || sequenceList.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequenceList));
    }

    if (progressActionSet == null)
    {
        throw new ArgumentNullException(nameof(progressActionSet));
    }

    var pdbIdListNotDistinct = FilterProteins.SequenceListToPdbIdList(sequenceList, false);

    ProgressActionSet.StartAction(pdbIdListNotDistinct.Count, progressActionSet);

    // Group the ids once so that each occurrence count is a lookup instead of a full scan of the list per item.
    var occurrencesByPdbId = pdbIdListNotDistinct.ToLookup(pdbId => pdbId);

    var workDivision = new WorkDivision<List<string>>(pdbIdListNotDistinct.Count);

    for (var threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable: the lambda below runs later and must see this iteration's value.
        var localThreadIndex = threadIndex;

        Task<List<string>> task = Task.Run(() =>
        {
            var firstIndex = workDivision.ThreadFirstIndex[localThreadIndex];
            var lastIndex = workDivision.ThreadLastIndex[localThreadIndex];

            return pdbIdListNotDistinct.Where((pdbId, pdbIdIndex) =>
            {
                // Only the slice assigned to this worker is examined.
                if (pdbIdIndex < firstIndex || pdbIdIndex > lastIndex)
                {
                    return false;
                }

                workDivision.IncrementItemsCompleted(1);
                ProgressActionSet.ProgressAction(1, progressActionSet);
                ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);

                return occurrencesByPdbId[pdbId].Count() != numberOfChainsRequired;
            }).ToList();
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    var sequencesWithIncorrectNumberOfChains = new List<string>();

    // The status checks must come before t.Result: reading Result on a canceled or faulted task throws.
    foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
    {
        sequencesWithIncorrectNumberOfChains.AddRange(task.Result);
    }

    var result = RemoveSequences(cancellationToken, sequenceList, sequencesWithIncorrectNumberOfChains);

    ProgressActionSet.FinishAction(true, progressActionSet);

    return result;
}
/// <summary>
/// Scans the PDB files found in <paramref name="pdbFolders"/> and collects the PDB IDs whose chain count, as reported by
/// <c>ProteinDataBankFileOperations.PdbAtomicChainsCount</c>, differs from <paramref name="numberChainsRequired"/>.
/// </summary>
/// <param name="cancellationToken">Token handed to the worker tasks; workers stop at the next file once cancellation is requested.</param>
/// <param name="pdbFolders">Folders searched for PDB files. Must contain at least one item.</param>
/// <param name="pdbIdList">White list of PDB IDs to examine. Required (non-empty), even though the parameter declares a null default.</param>
/// <param name="pdbIdChainIdList">Optional map from PDB ID to the chain IDs passed to the chain counter; null passes no chain restriction.</param>
/// <param name="numberChainsRequired">The chain count a structure must have to be left out of the result.</param>
/// <param name="progressActionSet">Progress callbacks. Required, even though the parameter declares a null default.</param>
/// <param name="totalThreads">Thread count request forwarded to <c>WorkDivision</c>.</param>
/// <returns>The distinct PDB IDs whose chain count did not match.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="pdbFolders"/> or <paramref name="pdbIdList"/> is null or empty.</exception>
/// <exception cref="ArgumentNullException"><paramref name="progressActionSet"/> is null.</exception>
public static List<string> RemoveStructuresWithIncorrectNumberOfChains(CancellationToken cancellationToken, string[] pdbFolders, List<string> pdbIdList = null, Dictionary<string, List<string>> pdbIdChainIdList = null, int numberChainsRequired = 2, ProgressActionSet progressActionSet = null, int totalThreads = -1)
{
    if (pdbFolders == null || pdbFolders.Length == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(pdbFolders));
    }

    if (pdbIdList == null || pdbIdList.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(pdbIdList));
    }

    if (progressActionSet == null)
    {
        throw new ArgumentNullException(nameof(progressActionSet));
    }

    var pdbFilesArray = ProteinDataBankFileOperations.GetPdbFilesArray(pdbFolders);
    pdbFilesArray = ProteinDataBankFileOperations.RemoveNonWhiteListedPdbIdFromPdbFilesArray(pdbIdList, pdbFilesArray);

    ProteinDataBankFileOperations.ShowMissingPdbFiles(pdbFilesArray, pdbIdList, progressActionSet);

    // Set-based membership test; the white list is consulted once per file from every worker.
    var whiteList = new HashSet<string>(pdbIdList);

    var workDivision = new WorkDivision<List<string>>(pdbFilesArray.Length, totalThreads);

    ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet);

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable: the lambda below runs later and must see this iteration's value.
        int localThreadIndex = threadIndex;

        Task<List<string>> task = Task.Run(() =>
        {
            var taskResult = new List<string>();

            for (int pdbFileNumber = workDivision.ThreadFirstIndex[localThreadIndex]; pdbFileNumber <= workDivision.ThreadLastIndex[localThreadIndex]; pdbFileNumber++)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    break;
                }

                try
                {
                    string pdbFilename = pdbFilesArray[pdbFileNumber];
                    string proteinId = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename);

                    // Check if the file found is included in the white list.
                    if (!whiteList.Contains(proteinId))
                    {
                        continue;
                    }

                    // Restrict the count to the listed chains when the caller supplied some for this protein.
                    string[] sequenceChainIdList = null;
                    List<string> chainIds;
                    if (pdbIdChainIdList != null && pdbIdChainIdList.TryGetValue(proteinId, out chainIds))
                    {
                        sequenceChainIdList = chainIds.ToArray();
                    }

                    int chainCount = ProteinDataBankFileOperations.PdbAtomicChainsCount(pdbFilename, sequenceChainIdList, numberChainsRequired);

                    if (chainCount != numberChainsRequired && !taskResult.Contains(proteinId))
                    {
                        taskResult.Add(proteinId);
                    }
                }
                finally
                {
                    // Runs for skipped files too, so the progress total always adds up.
                    workDivision.IncrementItemsCompleted(1);
                    ProgressActionSet.ProgressAction(1, progressActionSet);
                    ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                }
            }

            return taskResult;
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    ProgressActionSet.FinishAction(true, progressActionSet);

    var result = new List<string>();

    // The status checks must come before t.Result: reading Result on a canceled or faulted task throws.
    foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
    {
        result.AddRange(task.Result);
    }

    // The same PDB ID can be reported by more than one worker.
    return result.Distinct().ToList();
}
/// <summary>
/// Removes the sequences whose alphabet is not <c>Alphabets.Protein</c> (e.g. DNA, RNA, Hybrid). Removal is by PDB ID,
/// so every sequence sharing a PDB ID with a non-protein sequence is removed as well.
/// </summary>
/// <param name="cancellationToken">Token handed to the worker tasks; workers stop at the next item once cancellation is requested.</param>
/// <param name="sequences">Sequences to filter. Must contain at least one item.</param>
/// <param name="progressActionSet">Progress callbacks. Must not be null.</param>
/// <param name="totalThreads">Thread count request forwarded to <c>WorkDivision</c>.</param>
/// <returns>The list returned by <c>RemoveSequences</c> once the non-protein PDB IDs have been removed.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="sequences"/> is null or empty.</exception>
/// <exception cref="ArgumentNullException"><paramref name="progressActionSet"/> is null.</exception>
public static List<ISequence> RemoveNonProteinAlphabetSequences(CancellationToken cancellationToken, List<ISequence> sequences, ProgressActionSet progressActionSet, int totalThreads = -1)
{
    if (sequences == null || sequences.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequences));
    }

    if (progressActionSet == null)
    {
        throw new ArgumentNullException(nameof(progressActionSet));
    }

    var workDivision = new WorkDivision<List<string>>(sequences.Count, totalThreads);

    ProgressActionSet.StartAction(sequences.Count, progressActionSet);

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable: the lambda below runs later and must see this iteration's value.
        int localThreadIndex = threadIndex;

        Task<List<string>> task = Task.Run(() =>
        {
            var taskResult = new List<string>();

            for (int index = workDivision.ThreadFirstIndex[localThreadIndex]; index <= workDivision.ThreadLastIndex[localThreadIndex]; index++)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    break;
                }

                string proteinId = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[index].ID).PdbId;

                if (sequences[index].Alphabet != Alphabets.Protein)
                {
                    taskResult.Add(proteinId);
                }

                workDivision.IncrementItemsCompleted(1);
                ProgressActionSet.ProgressAction(1, progressActionSet);
                ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
            }

            return taskResult;
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    var result = new List<string>();

    // The status checks must come before t.Result: reading Result on a canceled or faulted task throws.
    foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
    {
        result.AddRange(task.Result);
    }

    result = result.Distinct().ToList();

    return RemoveSequences(cancellationToken, sequences, result);
}
/// <summary>
/// Removes sequences from <paramref name="sequences"/> according to whether their PDB ID appears in
/// <paramref name="sequencesToKeepOrRemove"/>. The list is edited in place and the same instance is returned.
/// </summary>
/// <param name="cancellationToken">Token handed to the worker tasks.</param>
/// <param name="sequences">Sequences to filter. Must contain at least one item.</param>
/// <param name="sequencesToKeepOrRemove">PDB IDs that are removed or kept, depending on <paramref name="options"/>. May be empty.</param>
/// <param name="options">
/// <c>RemoveSequencesInList</c> removes the sequences whose PDB ID is listed; <c>RemoveSequencesNotInList</c> removes the
/// ones whose PDB ID is not listed. Any other value removes nothing.
/// </param>
/// <param name="totalThreads">Thread count request forwarded to <c>WorkDivision</c>.</param>
/// <returns><paramref name="sequences"/>, after the matching items were removed from it.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="sequences"/> is null or empty, or <paramref name="sequencesToKeepOrRemove"/> is null.
/// </exception>
public static List<ISequence> RemoveSequences(CancellationToken cancellationToken, List<ISequence> sequences, List<string> sequencesToKeepOrRemove, RemoveSequencesOptions options = RemoveSequencesOptions.RemoveSequencesInList, int totalThreads = -1)
{
    if (sequences == null || sequences.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequences));
    }

    if (sequencesToKeepOrRemove == null)
    {
        throw new ArgumentOutOfRangeException(nameof(sequencesToKeepOrRemove));
    }

    // Set-based membership test; the id list is consulted once per sequence from every worker.
    var listedPdbIds = new HashSet<string>(sequencesToKeepOrRemove);

    var workDivision = new WorkDivision<List<int>>(sequences.Count, totalThreads);

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable: the lambda below runs later and must see this iteration's value.
        int localThreadIndex = threadIndex;

        Task<List<int>> task = Task.Run(() =>
        {
            var taskResult = new List<int>();

            for (int sequencesIndex = workDivision.ThreadFirstIndex[localThreadIndex]; sequencesIndex <= workDivision.ThreadLastIndex[localThreadIndex]; sequencesIndex++)
            {
                string proteinId = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndex].ID).PdbId;
                bool isListed = listedPdbIds.Contains(proteinId);

                bool shouldRemove =
                    (options == RemoveSequencesOptions.RemoveSequencesInList && isListed) ||
                    (options == RemoveSequencesOptions.RemoveSequencesNotInList && !isListed);

                if (shouldRemove)
                {
                    taskResult.Add(sequencesIndex);
                }

                workDivision.IncrementItemsCompleted(1);
            }

            return taskResult;
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    var sequenceIndexesToRemove = new List<int>();

    // The status checks must come before t.Result: reading Result on a canceled or faulted task throws.
    foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
    {
        sequenceIndexesToRemove.AddRange(task.Result);
    }

    sequenceIndexesToRemove = sequenceIndexesToRemove.Distinct().ToList();
    sequenceIndexesToRemove.Sort();

    // Remove from the highest index down so the indexes still waiting to be removed stay valid.
    for (int position = sequenceIndexesToRemove.Count - 1; position >= 0; position--)
    {
        sequences.RemoveAt(sequenceIndexesToRemove[position]);
    }

    return sequences;
}
/// <summary>
/// Intended to remove duplicated sequences. The duplicate search itself is not implemented yet: nothing is ever queued
/// for removal, so <c>RemoveSequences</c> is called with an empty removal list.
/// </summary>
/// <param name="cancellationToken">Token handed to the worker tasks and to <c>RemoveSequences</c>.</param>
/// <param name="sequences">Sequences to filter. Must contain at least one item.</param>
/// <param name="progressActionSet">Progress callbacks. Must not be null.</param>
/// <param name="totalThreads">Thread count request forwarded to <c>WorkDivision</c> and <c>RemoveSequences</c> defaults.</param>
/// <returns>The list returned by <c>RemoveSequences</c>.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="sequences"/> is null or empty.</exception>
/// <exception cref="ArgumentNullException"><paramref name="progressActionSet"/> is null.</exception>
public static List<ISequence> RemoveDuplicates(CancellationToken cancellationToken, List<ISequence> sequences, ProgressActionSet progressActionSet, int totalThreads = -1)
{
    if (sequences == null || sequences.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequences));
    }

    if (progressActionSet == null)
    {
        throw new ArgumentNullException(nameof(progressActionSet));
    }

    var pdbIdList = SequenceListToPdbIdList(sequences);

    // Group the sequences by PDB ID once instead of re-parsing and re-scanning the whole list for every id.
    var sequencesByPdbId = sequences.ToLookup(sequence => SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequence.ID).PdbId);
    var pdbIdSequences = pdbIdList.Select(pdbId => sequencesByPdbId[pdbId].ToList()).ToList();

    var workDivision = new WorkDivision(pdbIdList.Count, totalThreads);

    ProgressActionSet.StartAction(pdbIdList.Count, progressActionSet);

    // Neither list is written to until the duplicate search (see TODO below) is implemented.
    var done = new List<ISequence>();
    var remove = new List<ISequence>();
    var doneLock = new object();

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable: the lambda below runs later and must see this iteration's value.
        int localThreadIndex = threadIndex;

        var task = Task.Run(() =>
        {
            for (int index = workDivision.ThreadFirstIndex[localThreadIndex]; index <= workDivision.ThreadLastIndex[localThreadIndex]; index++)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    break;
                }

                try
                {
                    var iterationPdbIdSeqs = pdbIdSequences[index];

                    // Skip groups whose sequences were all handled already.
                    lock (doneLock)
                    {
                        if (iterationPdbIdSeqs.All(done.Contains))
                        {
                            continue;
                        }
                    }

                    // TODO: duplicate search. The earlier draft of this step worked as follows:
                    //   1. find the sequences whose aligned similarity to the current one is >= 90;
                    //   2. take their upper-cased PDB IDs, ordered alphabetically;
                    //   3. add the matches to 'done' (under a lock) so later iterations skip them;
                    //   4. keep the sequences belonging to the last PDB ID of that ordering;
                    //   5. add the remaining matches to 'remove' (under a lock).
                }
                finally
                {
                    // Runs for skipped groups too, so the progress total always adds up.
                    workDivision.IncrementItemsCompleted(1);
                    ProgressActionSet.ProgressAction(1, progressActionSet);
                    ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                }
            }
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    var pdbIdsToRemove = remove
        .Distinct()
        .Select(p => SequenceIdSplit.SequenceIdToPdbIdAndChainId(p.ID).PdbId.ToUpperInvariant())
        .ToList();

    return RemoveSequences(cancellationToken, sequences, pdbIdsToRemove);
}
/// <summary>
/// Classifies proteins by comparing their chains: every pair of sequences that share a PDB ID is passed to
/// <c>FindDimerType</c> and the PDB ID is recorded under the dimer type that call returns.
/// </summary>
/// <param name="cancellationToken">Token handed to the worker tasks; workers stop at the next pair once cancellation is requested.</param>
/// <param name="sequences">Sequences to classify. Must contain at least one item.</param>
/// <param name="minimumHeterodimerSimilarityRequired">Threshold forwarded to <c>FindDimerType</c>.</param>
/// <param name="minimumHomodimerSimiliarityRequired">Threshold forwarded to <c>FindDimerType</c>.</param>
/// <param name="progressActionSet">Progress callbacks. Required, even though the parameter declares a null default.</param>
/// <param name="totalThreads">Thread count request forwarded to <c>WorkDivision</c>.</param>
/// <returns>The distinct PDB IDs found for each of the hetero, homo and homology dimer categories.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="sequences"/> is null or empty.</exception>
/// <exception cref="ArgumentNullException"><paramref name="progressActionSet"/> is null.</exception>
public static DimerSequenceTypeCategories<string> SplitDimersHomoHetero(CancellationToken cancellationToken, List<ISequence> sequences, decimal minimumHeterodimerSimilarityRequired = 30.0m, decimal minimumHomodimerSimiliarityRequired = 90.0m, ProgressActionSet progressActionSet = null, int totalThreads = -1)
{
    if (sequences == null || sequences.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequences));
    }

    if (progressActionSet == null)
    {
        throw new ArgumentNullException(nameof(progressActionSet));
    }

    var workDivision = new WorkDivision<DimerSequenceTypeCategories<string>>(sequences.Count, totalThreads);

    ProgressActionSet.StartAction(sequences.Count, progressActionSet);

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable: the lambda below runs later and must see this iteration's value.
        int localThreadIndex = threadIndex;

        Task<DimerSequenceTypeCategories<string>> task = Task.Run(() =>
        {
            var taskResult = new DimerSequenceTypeCategories<string>();

            for (int sequencesIndexA = workDivision.ThreadFirstIndex[localThreadIndex]; sequencesIndexA <= workDivision.ThreadLastIndex[localThreadIndex]; sequencesIndexA++)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    break;
                }

                string proteinIdA = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndexA].ID).PdbId;

                // Start after A: a sequence is never aligned with itself, and pairs with B before A are
                // covered by the iteration in which that earlier index plays the role of A.
                for (int sequencesIndexB = sequencesIndexA + 1; sequencesIndexB < sequences.Count; sequencesIndexB++)
                {
                    if (cancellationToken.IsCancellationRequested)
                    {
                        break;
                    }

                    // Only chains of the same protein are compared.
                    var proteinIdB = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndexB].ID).PdbId;

                    if (proteinIdA != proteinIdB)
                    {
                        continue;
                    }

                    var dimerType = FindDimerType(sequences[sequencesIndexA], sequences[sequencesIndexB], minimumHeterodimerSimilarityRequired, minimumHomodimerSimiliarityRequired);

                    if (dimerType == DimerType.HeteroDimer)
                    {
                        taskResult.HeteroDimerPdbIdList.Add(proteinIdA);
                    }
                    else if (dimerType == DimerType.HomoDimer)
                    {
                        taskResult.HomoDimerPdbIdList.Add(proteinIdA);
                    }
                    else if (dimerType == DimerType.HomologyDimer)
                    {
                        taskResult.HomologyDimerPdbIdList.Add(proteinIdA);
                    }
                }

                workDivision.IncrementItemsCompleted(1);
                ProgressActionSet.ProgressAction(1, progressActionSet);
                ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
            }

            return taskResult;
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    var dimerSequenceTypeCategories = new DimerSequenceTypeCategories<string>();

    // Status is checked before t.Result because reading Result on a canceled or faulted task throws.
    foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsFaulted && !t.IsCanceled && t.Result != null))
    {
        dimerSequenceTypeCategories.HeteroDimerPdbIdList.AddRange(task.Result.HeteroDimerPdbIdList);
        dimerSequenceTypeCategories.HomoDimerPdbIdList.AddRange(task.Result.HomoDimerPdbIdList);
        dimerSequenceTypeCategories.HomologyDimerPdbIdList.AddRange(task.Result.HomologyDimerPdbIdList);
    }

    // The same PDB ID can be reported more than once, within a worker or across workers.
    dimerSequenceTypeCategories.HeteroDimerPdbIdList = dimerSequenceTypeCategories.HeteroDimerPdbIdList.Distinct().ToList();
    dimerSequenceTypeCategories.HomoDimerPdbIdList = dimerSequenceTypeCategories.HomoDimerPdbIdList.Distinct().ToList();
    dimerSequenceTypeCategories.HomologyDimerPdbIdList = dimerSequenceTypeCategories.HomologyDimerPdbIdList.Distinct().ToList();

    return dimerSequenceTypeCategories;
}
/// <summary>
/// Computes, for every white-listed PDB file with exactly two non-empty atom chains, the average interaction match
/// percentage of the interactions found between the chains. Files that do not meet that chain requirement are skipped.
/// </summary>
/// <param name="cancellationToken">Token handed to the worker tasks and to <c>SearchInteractions.FindInteractions</c>.</param>
/// <param name="maxAtomInterationDistance">Maximum atom distance forwarded to <c>SearchInteractions.FindInteractions</c>.</param>
/// <param name="pdbFolders">Folders searched for PDB files. Must contain at least one item.</param>
/// <param name="pdbIdList">White list of PDB IDs to examine. Required (non-empty), even though the parameter declares a null default.</param>
/// <param name="pdbIdChainIdList">Optional map from PDB ID to the chain IDs to load; null passes no chain restriction.</param>
/// <param name="progressActionSet">Progress callbacks. Required, even though the parameter declares a null default.</param>
/// <param name="totalThreads">Thread count request forwarded to <c>WorkDivision</c>.</param>
/// <returns>A map from protein ID to its <c>InteractionMatchPercentageAverage</c>.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="pdbFolders"/> or <paramref name="pdbIdList"/> is null or empty, or <paramref name="progressActionSet"/> is null.
/// </exception>
public static Dictionary<string, decimal> CalculateStructureSymmetry(CancellationToken cancellationToken, decimal maxAtomInterationDistance, string[] pdbFolders, List<string> pdbIdList = null, Dictionary<string, List<string>> pdbIdChainIdList = null, ProgressActionSet progressActionSet = null, int totalThreads = -1)
{
    if (pdbFolders == null || pdbFolders.Length == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(pdbFolders));
    }

    if (pdbIdList == null || pdbIdList.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(pdbIdList));
    }

    if (progressActionSet == null)
    {
        // Exception type kept as-is so existing callers that catch it keep working.
        throw new ArgumentOutOfRangeException(nameof(progressActionSet));
    }

    const int requiredNumberOfChains = 2;

    string[] pdbFilesArray = ProteinDataBankFileOperations.RemoveNonWhiteListedPdbIdFromPdbFilesArray(pdbIdList, ProteinDataBankFileOperations.GetPdbFilesArray(pdbFolders));

    var workDivision = new WorkDivision<Dictionary<string, decimal>>(pdbFilesArray.Length, totalThreads);

    ProteinDataBankFileOperations.ShowMissingPdbFiles(pdbFilesArray, pdbIdList, progressActionSet);

    ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet);

    // Set-based membership test; the white list is consulted once per file from every worker.
    var whiteList = new HashSet<string>(pdbIdList);

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Copy the loop variable: the lambda below runs later and must see this iteration's value.
        int localThreadIndex = threadIndex;

        Task<Dictionary<string, decimal>> task = Task.Run(() =>
        {
            var taskResult = new Dictionary<string, decimal>();

            for (int pdbFileNumber = workDivision.ThreadFirstIndex[localThreadIndex]; pdbFileNumber <= workDivision.ThreadLastIndex[localThreadIndex]; pdbFileNumber++)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    break;
                }

                try
                {
                    string pdbFilename = pdbFilesArray[pdbFileNumber];
                    string proteinId = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename);

                    // Check if the file found is included in the white list.
                    if (!whiteList.Contains(proteinId))
                    {
                        continue;
                    }

                    // Restrict loading to the listed chains when the caller supplied some for this protein.
                    string[] chainIdList = null;
                    List<string> chainIds;
                    if (pdbIdChainIdList != null && proteinId != null && pdbIdChainIdList.TryGetValue(proteinId, out chainIds))
                    {
                        chainIdList = chainIds.ToArray();
                    }

                    // Get atom chains.
                    ProteinChainListContainer proteinFileChains = ProteinDataBankFileOperations.PdbAtomicChains(pdbFilename, chainIdList, requiredNumberOfChains, requiredNumberOfChains, true);

                    // Skip anything that is not exactly two chains with at least one atom each.
                    if (proteinFileChains == null ||
                        proteinFileChains.ChainList == null ||
                        proteinFileChains.ChainList.Count != requiredNumberOfChains ||
                        proteinFileChains.ChainList[StaticValues.ChainA] == null ||
                        proteinFileChains.ChainList[StaticValues.ChainA].AtomList == null ||
                        proteinFileChains.ChainList[StaticValues.ChainA].AtomList.Count == 0 ||
                        proteinFileChains.ChainList[StaticValues.ChainB] == null ||
                        proteinFileChains.ChainList[StaticValues.ChainB].AtomList == null ||
                        proteinFileChains.ChainList[StaticValues.ChainB].AtomList.Count == 0)
                    {
                        continue;
                    }

                    // Make a list to save interactions found.
                    var interactionMatchPercentage = new InteractionMatchPercentage(proteinId);

                    List<AtomPair> interactions = SearchInteractions.FindInteractions(cancellationToken, maxAtomInterationDistance, pdbFilename, pdbIdChainIdList);

                    interactionMatchPercentage.IncrementTotalInteractions(interactions.Count);

                    foreach (AtomPair interaction in interactions)
                    {
                        interactionMatchPercentage.AddResidueSequenceIndex(StaticValues.ChainA, interaction.Atom1.resSeq.FieldValue);
                        interactionMatchPercentage.AddResidueSequenceIndex(StaticValues.ChainB, interaction.Atom2.resSeq.FieldValue);
                    }

                    InteractionMatchPercentage.CalculatePercentageResult calculatedPercentage = interactionMatchPercentage.CalculatePercentage();

                    taskResult.Add(interactionMatchPercentage.ProteinId, calculatedPercentage.InteractionMatchPercentageAverage);
                }
                finally
                {
                    // Runs for skipped files too, so the progress total always adds up.
                    workDivision.IncrementItemsCompleted(1);
                    ProgressActionSet.ProgressAction(1, progressActionSet);
                    ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                }
            }

            return taskResult;
        }, cancellationToken);

        workDivision.TaskList.Add(task);
    }

    workDivision.WaitAllTasks();

    var result = new Dictionary<string, decimal>();

    // The status checks must come before t.Result: reading Result on a canceled or faulted task throws.
    foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
    {
        foreach (var kvp in task.Result)
        {
            // NOTE: Add throws ArgumentException if two workers produced the same protein ID.
            result.Add(kvp.Key, kvp.Value);
        }
    }

    return result;
}
/// <summary> /// Makes spreadsheets with scientific data outputs about given proteins. /// </summary> /// <param name="cancellationToken"></param> /// <param name="pdbFolders">The location of the PDB files</param> /// <param name="pdbIdList">The PDB files which should be used.</param> /// <param name="consoleTextBox"></param> /// <param name="progressBar">User proteinInterface progress bar for user feedback.</param> /// <param name="estimatedTimeRemainingLabel">User proteinInterface estimated time remaining label for user feedback.</param> /// <param name="requestedTotalThreads"></param> /// <returns>Returns the generated spreadsheets with scientific data.</returns> public static List <List <SpreadsheetCell[]> > MakeHomodimerStatisticsSpreadsheetsList(CancellationToken cancellationToken, decimal maxAtomInterationDistance, string[] pdbFolders, List <string> pdbIdList = null, Dictionary <string, List <string> > pdbIdChainIdList = null, ProgressActionSet progressActionSet = null, int requestedTotalThreads = -1) { if (pdbFolders == null || pdbFolders.Length == 0) { throw new ArgumentOutOfRangeException(nameof(pdbFolders)); } if (pdbIdList == null || pdbIdList.Count == 0) { throw new ArgumentOutOfRangeException(nameof(pdbIdList)); } if (progressActionSet == null) { throw new ArgumentNullException(nameof(progressActionSet)); } // this method creates // 1. a list of interactions // 2. a list of symmetry percentage // 3. an "expected" heatmap by combining every possible a/b amino acid combination // 4. an actual heatmap for the proteinInterfaces // 5. 
normalised versions of both of the heatmaps string[] pdbFilesArray = ProteinDataBankFileOperations.RemoveNonWhiteListedPdbIdFromPdbFilesArray(pdbIdList, ProteinDataBankFileOperations.GetPdbFilesArray(pdbFolders)); //var interactionRecordList = new List<ProteinInteractionRecord>(); //var interactionMatchPercentageList = new List<InteractionMatchPercentage>(); //var wholeProteinChainsAminoAcidCounter = new List<AminoAcidChainComposition>(); //var interactionChainsAminoAcidCounter = new List<AminoAcidChainComposition>(); //var interactionsAminoAcidToAminoAcidCounter = new AminoAcidPairCompositionMatrix(); ////var wholeProteinAminoAcidToAminoAcidCounter2x2 = new AminoAcidPairCompositionMatrix(); // composition of every amino acid paired in every possible combination var workDivision = new WorkDivision <HomodimersStatisticsMinerTaskResult>(pdbFilesArray.Length, requestedTotalThreads); ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet); int checkAllFilesProcessed = 0; var lockCheckAllFilesProcessed = new object(); var pdbFilesProcessed = new bool[pdbFilesArray.Length]; Array.Clear(pdbFilesProcessed, 0, pdbFilesProcessed.Length); for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++) { int localThreadIndex = threadIndex; Task <HomodimersStatisticsMinerTaskResult> task = Task.Run(() => { var result = new HomodimersStatisticsMinerTaskResult(); for (int pdbFileNumber = workDivision.ThreadFirstIndex[localThreadIndex]; pdbFileNumber <= workDivision.ThreadLastIndex[localThreadIndex]; pdbFileNumber++) { if (cancellationToken.IsCancellationRequested) { break; } lock (lockCheckAllFilesProcessed) { checkAllFilesProcessed++; pdbFilesProcessed[pdbFileNumber] = true; } try { string pdbFilename = pdbFilesArray[pdbFileNumber]; string proteinId = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename); // Check if the file found is included in the white list. 
if (/*pdbIdList != null && */ !pdbIdList.Contains(proteinId)) { ProgressActionSet.Report("Error: " + proteinId + " was not in the PDB ID white list.", progressActionSet); continue; } List <AtomPair> interactions = SearchInteractions.FindInteractions(cancellationToken, maxAtomInterationDistance, pdbFilename, pdbIdChainIdList); // Make a list to save interactions found. var interactionMatchPercentage = new InteractionMatchPercentage(proteinId); var chainAminoAcidCounterA1X1 = new AminoAcidChainComposition(proteinId, "A"); var chainAminoAcidCounterB1X1 = new AminoAcidChainComposition(proteinId, "B"); var chainInteractionAminoAcidCounterA = new AminoAcidChainComposition(proteinId, "A"); var chainInteractionAminoAcidCounterB = new AminoAcidChainComposition(proteinId, "B"); if (interactions != null && interactions.Count > 0) { interactionMatchPercentage.IncrementTotalInteractions(interactions.Count); for (int interactionsIndex = 0; interactionsIndex < interactions.Count; interactionsIndex++) { chainInteractionAminoAcidCounterA.IncrementAminoAcidCount(interactions[interactionsIndex].Atom1.resName.FieldValue); chainInteractionAminoAcidCounterB.IncrementAminoAcidCount(interactions[interactionsIndex].Atom2.resName.FieldValue); result.InteractionRecordList.Add(new ProteinInteractionRecord(proteinId, interactionsIndex + 1, interactions[interactionsIndex])); interactionMatchPercentage.AddResidueSequenceIndex(StaticValues.ChainA, interactions[interactionsIndex].Atom1.resSeq.FieldValue); interactionMatchPercentage.AddResidueSequenceIndex(StaticValues.ChainB, interactions[interactionsIndex].Atom2.resSeq.FieldValue); result.InteractionsAminoAcidToAminoAcidCounter.IncrementAminoAcidCount(interactions[interactionsIndex].Atom1.resName.FieldValue, interactions[interactionsIndex].Atom2.resName.FieldValue); } } var chainIdList = pdbIdChainIdList != null ? (pdbIdChainIdList.ContainsKey(proteinId) ? 
pdbIdChainIdList[proteinId].ToArray() : null) : null; ProteinChainListContainer proteinFileChains = ProteinDataBankFileOperations.PdbAtomicChains(pdbFilename, chainIdList, 2, 2, true); if (proteinFileChains == null || proteinFileChains.ChainList == null || proteinFileChains.ChainList.Count != 2 || proteinFileChains.ChainList[StaticValues.ChainA] == null || proteinFileChains.ChainList[StaticValues.ChainA].AtomList == null || proteinFileChains.ChainList[StaticValues.ChainA].AtomList.Count == 0 || proteinFileChains.ChainList[StaticValues.ChainB] == null || proteinFileChains.ChainList[StaticValues.ChainB].AtomList == null || proteinFileChains.ChainList[StaticValues.ChainB].AtomList.Count == 0) { if (!File.Exists(pdbFilename)) { ProgressActionSet.Report("Error: " + pdbFilename + " (" + proteinId + ") file not found", progressActionSet); } else { int proteinFileChainCount = ProteinDataBankFileOperations.PdbAtomicChainsCount(pdbFilename); ProgressActionSet.Report("Error: " + proteinId + " did not have exactly 2 chains (" + proteinFileChainCount + " chains found) - skipping.", progressActionSet); } continue; } // count total of how many of each type of amino acids are in Chain A. for (int atomIndexA = 0; atomIndexA < proteinFileChains.ChainList[StaticValues.ChainA].AtomList.Count; atomIndexA++) { chainAminoAcidCounterA1X1.IncrementAminoAcidCount(proteinFileChains.ChainList[StaticValues.ChainA].AtomList[atomIndexA].resName.FieldValue); } // count total of how many of each type of amino acids are in Chain B. 
for (int atomIndexB = 0; atomIndexB < proteinFileChains.ChainList[StaticValues.ChainB].AtomList.Count; atomIndexB++) { chainAminoAcidCounterB1X1.IncrementAminoAcidCount(proteinFileChains.ChainList[StaticValues.ChainB].AtomList[atomIndexB].resName.FieldValue); } interactionMatchPercentage.CalculatePercentage(); result.InteractionMatchPercentageList.Add(interactionMatchPercentage); result.WholeProteinChainsAminoAcidCounter.Add(chainAminoAcidCounterA1X1); result.WholeProteinChainsAminoAcidCounter.Add(chainAminoAcidCounterB1X1); result.InteractionChainsAminoAcidCounter.Add(chainInteractionAminoAcidCounterA); result.InteractionChainsAminoAcidCounter.Add(chainInteractionAminoAcidCounterB); } finally { workDivision.IncrementItemsCompleted(1); ProgressActionSet.ProgressAction(1, progressActionSet); ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet); } } return(result); }, cancellationToken); workDivision.TaskList.Add(task); } workDivision.WaitAllTasks(); ProgressActionSet.FinishAction(true, progressActionSet); // merge all instances of the results var spreadsheetTaskResult = new HomodimersStatisticsMinerTaskResult(); foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null)) { if (task.Result.InteractionChainsAminoAcidCounter != null && task.Result.InteractionChainsAminoAcidCounter.Count > 0) { spreadsheetTaskResult.InteractionChainsAminoAcidCounter.AddRange(task.Result.InteractionChainsAminoAcidCounter); } if (task.Result.InteractionMatchPercentageList != null && task.Result.InteractionMatchPercentageList.Count > 0) { spreadsheetTaskResult.InteractionMatchPercentageList.AddRange(task.Result.InteractionMatchPercentageList); } if (task.Result.InteractionRecordList != null && task.Result.InteractionRecordList.Count > 0) { 
spreadsheetTaskResult.InteractionRecordList.AddRange(task.Result.InteractionRecordList); } if (task.Result.InteractionsAminoAcidToAminoAcidCounter != null) { if (task.Result.InteractionsAminoAcidToAminoAcidCounter.AminoAcidToAminoAcid != null) { foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { var totalGroups = AminoAcidGroups.AminoAcidGroups.GetTotalSubgroups(enumAminoAcidGroups); for (int x = 0; x < totalGroups; x++) { for (int y = 0; y < totalGroups; y++) { spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter.AminoAcidToAminoAcid[(int)enumAminoAcidGroups][x, y] += task.Result.InteractionsAminoAcidToAminoAcidCounter.AminoAcidToAminoAcid[(int)enumAminoAcidGroups][x, y]; } } } } } if (task.Result.WholeProteinChainsAminoAcidCounter != null && task.Result.WholeProteinChainsAminoAcidCounter.Count > 0) { spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter.AddRange(task.Result.WholeProteinChainsAminoAcidCounter); } } if (pdbFilesProcessed.Count(file => file == false) > 0) { ProgressActionSet.Report("ERROR: " + pdbFilesProcessed.Count(file => file == false) + " PDB FILES WERE SKIPPED! 0x01", progressActionSet); } else { ProgressActionSet.Report("CHECK: NO PDB FILES WERE SKIPPED! 0x01", progressActionSet); } if (checkAllFilesProcessed != pdbFilesArray.Length) { ProgressActionSet.Report("ERROR: " + (pdbFilesArray.Length - checkAllFilesProcessed) + " PDB FILES WERE SKIPPED! 0x02", progressActionSet); } else { ProgressActionSet.Report("CHECK: NO PDB FILES WERE SKIPPED! 
0x02", progressActionSet); } spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter = spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter.OrderBy(a => a.ProteinId).ThenBy(b => b.ChainId).ToList(); spreadsheetTaskResult.InteractionChainsAminoAcidCounter = spreadsheetTaskResult.InteractionChainsAminoAcidCounter.OrderBy(a => a.ProteinId).ThenBy(b => b.ChainId).ToList(); AminoAcidChainComposition wholeProteinChainsTotals = AminoAcidChainComposition.TotalFromAminoAcidChainCompositionList(spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter); AminoAcidChainComposition interactionChainsTotals = AminoAcidChainComposition.TotalFromAminoAcidChainCompositionList(spreadsheetTaskResult.InteractionChainsAminoAcidCounter); AminoAcidPairCompositionMatrix wholeProteinAminoAcidToAminoAcidCounter1X1 = AminoAcidChainComposition.ConvertToMatrix(wholeProteinChainsTotals); var results = new List <List <SpreadsheetCell[]> >(); { /* start test */ var spreadsheet1 = new List <SpreadsheetCell[]>(); spreadsheet1.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% TEST SHEET 0"), }); spreadsheet1.Add(new[] { new SpreadsheetCell("TEST SHEET 0"), }); foreach (AminoAcidChainComposition item in spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter) { //spreadsheet1.Add(item.ProteinId); //spreadsheet1.Add(item.ChainId); spreadsheet1.Add(item.SpreadsheetDataRow()); } results.Add(spreadsheet1); spreadsheet1 = null; /* end test */ } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// { var spreadsheet2 = new List <SpreadsheetCell[]>(); spreadsheet2.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Interaction Symmetry"), }); spreadsheet2.Add(new[] { new SpreadsheetCell("Homodimers - List - Interaction Count And Interaction Match Percentage (Symmetry Measurement)") }); spreadsheet2.Add(InteractionMatchPercentage.SpreadsheetColumnHeadersRow()); var range2 = 
spreadsheetTaskResult.InteractionMatchPercentageList.Select(record => record.SpreadsheetDataRow()).ToList(); //range2.Sort(); range2 = range2 .OrderBy(a => a[0].CellData) .ThenBy(a => a[1].CellData) .ThenBy(a => a[2].CellData) .ThenBy(a => a[3].CellData) .ThenBy(a => a[4].CellData) .ThenBy(a => a[5].CellData) .ThenBy(a => a[6].CellData) .ThenBy(a => a[7].CellData) .ThenBy(a => a[8].CellData) .ToList(); spreadsheet2.AddRange(range2); range2 = null; results.Add(spreadsheet2); var spreadsheetHistogram2 = new List <SpreadsheetCell[]>(); spreadsheetHistogram2.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% HG Interaction Symmetry"), }); spreadsheetHistogram2.Add(new[] { new SpreadsheetCell("Homodimers - List - Interaction Count And Interaction Match Percentage (Symmetry Measurement) Histogram") }); spreadsheetHistogram2.AddRange(Histogram.MatrixToHistogram(spreadsheet2.ToArray(), Histogram.MakeBinDecimals(0, 100, 9, 10), new[] { 6, 7, 8 }, 2, -1, true)); results.Add(spreadsheetHistogram2); spreadsheet2 = null; spreadsheetHistogram2 = null; } // { var spreadsheet3 = new List <SpreadsheetCell[]>(); spreadsheet3.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Interaction Records"), }); spreadsheet3.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Interaction Record"), }); spreadsheet3.Add(ProteinInteractionRecord.TsvColumnHeadersRow()); var range3 = spreadsheetTaskResult.InteractionRecordList.Select(record => record.SpreadsheetDataRow()).ToList(); //range3.Sort(); range3 = range3 .OrderBy(a => a[0].CellData) .ThenBy(a => a[1].CellData) .ThenBy(a => a[3].CellData) .ThenBy(a => a[5].CellData) .ThenBy(a => a[13].CellData) .ThenBy(a => a[15].CellData) .ToList(); spreadsheet3.AddRange(range3); range3 = null; results.Add(spreadsheet3); var spreadsheetHistogram3 = new List <SpreadsheetCell[]>(); spreadsheetHistogram3.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Interaction Records Histogram"), }); 
spreadsheetHistogram3.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Interaction Record - Histogram"), }); spreadsheetHistogram3.AddRange(Histogram.MatrixToHistogram(spreadsheet3.ToArray(), Histogram.MakeBinDecimals(0m, 5m, 0m, 0.05m), new[] { 1 }, 2, -1, true)); results.Add(spreadsheetHistogram3); //spreadsheet3 = Histogram.InsertMatrixOverwrite(spreadsheet3.ToArray(), histogram3, 2, Histogram.MaxColumns(spreadsheet3.ToArray()) + 1).ToList(); spreadsheet3 = null; spreadsheetHistogram3 = null; } // { var spreadsheet4 = new List <SpreadsheetCell[]>(); spreadsheet4.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Interaction Count - A-Z"), }); spreadsheet4.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Amino Acid Count - Interactions - A to Z"), }); spreadsheet4.Add(AminoAcidChainComposition.SpreadsheetTitleRow()); var range4 = spreadsheetTaskResult.InteractionChainsAminoAcidCounter.Select(record => record.SpreadsheetDataRow()).ToList(); //range4.Sort(); range4 = range4 .OrderBy(a => a[0].CellData) .ThenBy(a => a[1].CellData) .ToList(); spreadsheet4.AddRange(range4); range4 = null; spreadsheet4.Add(interactionChainsTotals.SpreadsheetDataRow()); results.Add(spreadsheet4); spreadsheet4 = null; } // { foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { var spreadsheet5 = new List <SpreadsheetCell[]>(); spreadsheet5.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Interaction Count - Groups " + enumAminoAcidGroups), }); spreadsheet5.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Amino Acid Count - Interactions - Acid Groups " + enumAminoAcidGroups), }); spreadsheet5.Add(AminoAcidChainComposition.SpreadsheetGroupsTitleRow(enumAminoAcidGroups)); var range5 = spreadsheetTaskResult.InteractionChainsAminoAcidCounter.Select(record => record.SpreadsheetGroupsDataRow(enumAminoAcidGroups)).ToList(); 
//range4.Sort(); range5 = range5 .OrderBy(a => a[0].CellData) .ThenBy(a => a[1].CellData) .ToList(); spreadsheet5.AddRange(range5); range5 = null; spreadsheet5.Add(interactionChainsTotals.SpreadsheetGroupsDataRow(enumAminoAcidGroups)); results.Add(spreadsheet5); spreadsheet5 = null; } } // { var spreadsheet6 = new List <SpreadsheetCell[]>(); spreadsheet6.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Entire Count - A-Z"), }); spreadsheet6.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Amino Acid Count - All Atoms - A to Z"), }); spreadsheet6.Add(AminoAcidChainComposition.SpreadsheetTitleRow()); var range6 = spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter.Select(record => record.SpreadsheetDataRow()).ToList(); //range6.Sort(); range6 = range6 .OrderBy(a => a[0].CellData) .ThenBy(a => a[1].CellData) .ToList(); spreadsheet6.AddRange(range6); range6 = null; spreadsheet6.Add(wholeProteinChainsTotals.SpreadsheetDataRow()); results.Add(spreadsheet6); var spreadsheetHistogram6 = new List <SpreadsheetCell[]>(); spreadsheetHistogram6.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Entire Count - A-Z - Historgram"), }); spreadsheetHistogram6.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Amino Acid Count - All Atoms - A to Z - Histogram"), }); spreadsheetHistogram6.AddRange(Histogram.MatrixToHistogram(spreadsheet6.ToArray(), Histogram.MakeBinDecimals(0, 10500, 0, 500), new[] { 28 }, 2, -1, true)); spreadsheetHistogram6.Add(new [] { new SpreadsheetCell(""), }); spreadsheetHistogram6.AddRange(Histogram.MatrixToHistogram(spreadsheet6.ToArray(), Histogram.MakeBinDecimals(0, 1000, 0, 100), new[] { 28 }, 2, -1, true)); results.Add(spreadsheetHistogram6); spreadsheet6 = null; spreadsheetHistogram6 = null; } // { foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { var spreadsheet7 = new List 
<SpreadsheetCell[]>(); spreadsheet7.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Entire Count - Groups " + enumAminoAcidGroups), }); spreadsheet7.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Amino Acid Count - All Atoms - Acid Groups " + enumAminoAcidGroups), }); spreadsheet7.Add(AminoAcidChainComposition.SpreadsheetGroupsTitleRow(enumAminoAcidGroups)); var range7 = spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter.Select(record => record.SpreadsheetGroupsDataRow(enumAminoAcidGroups)).ToList(); //range7.Sort(); range7 = range7 .OrderBy(a => a[0].CellData) .ThenBy(a => a[1].CellData) .ToList(); spreadsheet7.AddRange(range7); range7 = null; spreadsheet7.Add(wholeProteinChainsTotals.SpreadsheetGroupsDataRow(enumAminoAcidGroups)); results.Add(spreadsheet7); spreadsheet7 = null; } } // convert to percentage for creating mean average protein composition var meanProteinComposition = new AminoAcidChainComposition("Mean Composition", "-"); foreach (AminoAcidChainComposition aminoAcidChainComposition in spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter) { // get percentage for row AminoAcidChainComposition percentage = AminoAcidChainComposition.ConvertToPercentage(aminoAcidChainComposition); // add percentage to overall tally foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { for (int x = 0; x < AminoAcidGroups.AminoAcidGroups.GetTotalSubgroups(enumAminoAcidGroups); x++) { meanProteinComposition.AminoAcidGroupsCount[(int)enumAminoAcidGroups][x] += (percentage.AminoAcidGroupsCount[(int)enumAminoAcidGroups][x] / spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter.Count); } } } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// { /* start test */ var spreadsheet8 = new List <SpreadsheetCell[]>(); spreadsheet8.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% TEST SHEET 1"), }); // Worksheet name. spreadsheet8.Add(new[] { new SpreadsheetCell("TEST SHEET 1"), }); // Spreadsheet title spreadsheet8.Add(new[] { new SpreadsheetCell(string.Empty), }); spreadsheet8.Add(meanProteinComposition.SpreadsheetDataRow()); foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { spreadsheet8.Add(meanProteinComposition.SpreadsheetGroupsDataRow(enumAminoAcidGroups)); } results.Add(spreadsheet8); spreadsheet8 = null; /* end test */ } AminoAcidPairCompositionMatrix meanProteinMatrix = AminoAcidChainComposition.ConvertToMatrix(meanProteinComposition); { var spreadsheet9 = new List <SpreadsheetCell[]>(); spreadsheet9.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% HM All Atoms 3x3"), }); // Worksheet name. spreadsheet9.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Average Chain Composition"), }); // Spreadsheet title. //spreadsheet9.Add(new[] { new SpreadsheetCell(string.Empty), }); //spreadsheet9.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Average Chain Composition - Percentage Composition - A to Z"), }); // Section title. 
//spreadsheet9.AddRange(meanProteinMatrix.SpreadsheetAminoAcidColorGroupsHeatMap()); foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { spreadsheet9.Add(new[] { new SpreadsheetCell(string.Empty), }); spreadsheet9.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Average Chain Composition - Percentage Composition - Acid Groups " + enumAminoAcidGroups), }); // Section title. spreadsheet9.AddRange(meanProteinMatrix.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups)); } results.Add(spreadsheet9); spreadsheet9 = null; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //if (outputAllAtoms1x1) //{ AminoAcidPairCompositionMatrix wholeProteinAminoAcidToAminoAcidCounterPercentage1X1 = AminoAcidPairCompositionMatrix.CalculatePercentageMatrix(wholeProteinAminoAcidToAminoAcidCounter1X1); { var spreadsheet10 = new List <SpreadsheetCell[]>(); spreadsheet10.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% HM All Atoms 1x1") }); // Worksheet name. spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall Composition") }); // Spreadsheet title. //spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty)}); //spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall Percentage Composition - A to Z")}); // Section title. 
//spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1.SpreadsheetAminoAcidColorGroupsHeatMap()); foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty) }); spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall Percentage Composition - Acid Groups " + enumAminoAcidGroups) }); // Section title. spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups)); } AminoAcidPairCompositionMatrix wholeProteinAminoAcidToAminoAcidCounterNormalised1X1 = AminoAcidPairCompositionMatrix.NormalizeWithCompositionMatrix(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1, UniProtProteinDatabaseComposition.AminoAcidCompositionAsMatrix()); //spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty)}); //spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall UniProt Normalised - A to Z ")}); // Section title. //spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterNormalised1X1.SpreadsheetAminoAcidColorGroupsHeatMap()); foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty) }); spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall UniProt Normalised - Acid Groups " + enumAminoAcidGroups) }); // Section title. 
spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterNormalised1X1.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups)); } AminoAcidPairCompositionMatrix wholeProteinAminoAcidToAminoAcidCounterDifference1X1 = AminoAcidPairCompositionMatrix.DifferenceWithCompositionMatrix(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1, UniProtProteinDatabaseComposition.AminoAcidCompositionAsMatrix()); //spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty)}); //spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall A to Z - UniProt Difference")}); // Section title. //spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterDifference1X1.SpreadsheetAminoAcidColorGroupsHeatMap()); foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty) }); spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall Acid Groups " + enumAminoAcidGroups + " - UniProt Difference") }); // Section title. spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterDifference1X1.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups)); } results.Add(spreadsheet10); spreadsheet10 = null; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// { AminoAcidPairCompositionMatrix interactionsAminoAcidToAminoAcidCounterPercentage = AminoAcidPairCompositionMatrix.CalculatePercentageMatrix(spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter); var spreadsheet11 = new List <SpreadsheetCell[]>(); spreadsheet11.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% HM Interactions Only") }); // Worksheet name. spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only") }); // Spreadsheet title. 
//spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty)}); //spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - A to Z")}); // Section title. //spreadsheet11.AddRange(spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter.SpreadsheetAminoAcidColorGroupsHeatMap()); foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty) }); spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - Acid Groups " + enumAminoAcidGroups) }); // Section title. spreadsheet11.AddRange(spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups)); } AminoAcidPairCompositionMatrix interactionsAminoAcidToAminoAcidCounterNormalised = AminoAcidPairCompositionMatrix.NormalizeWithCompositionMatrix(interactionsAminoAcidToAminoAcidCounterPercentage, UniProtProteinDatabaseComposition.AminoAcidCompositionAsMatrix()); //spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty)}); //spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - A to Z - UniProt Normalised")}); // Section title. //spreadsheet11.AddRange(interactionsAminoAcidToAminoAcidCounterNormalised.SpreadsheetAminoAcidColorGroupsHeatMap()); foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty) }); spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - Acid Groups " + enumAminoAcidGroups + " - UniProt Normalised") }); // Section title. 
spreadsheet11.AddRange(interactionsAminoAcidToAminoAcidCounterNormalised.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups)); } AminoAcidPairCompositionMatrix interactionsAminoAcidToAminoAcidCounterDifference = AminoAcidPairCompositionMatrix.DifferenceWithCompositionMatrix(interactionsAminoAcidToAminoAcidCounterPercentage, UniProtProteinDatabaseComposition.AminoAcidCompositionAsMatrix()); //spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty)}); //spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - A to Z - UniProt Difference")}); // Section title. //spreadsheet11.AddRange(interactionsAminoAcidToAminoAcidCounterDifference.SpreadsheetAminoAcidColorGroupsHeatMap()); foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty) }); spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - Acid Groups " + enumAminoAcidGroups + " - UniProt Difference") }); // Section title. spreadsheet11.AddRange(interactionsAminoAcidToAminoAcidCounterDifference.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups)); } results.Add(spreadsheet11); spreadsheet11 = null; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// { var spreadsheet12 = new List <SpreadsheetCell[]>(); spreadsheet12.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% HM Interactions v Homodimers") }); // Worksheet name. 
spreadsheet12.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Difference between homodimer composition and homodimer interactions") }); // Spreadsheet title spreadsheet12.Add(new[] { new SpreadsheetCell(string.Empty) }); //spreadsheet12.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Difference between homodimer composition and homodimer interactions - A to Z")}); // Section title //spreadsheet12.AddRange(AminoAcidPairCompositionMatrix.DifferenceWithCompositionMatrix(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1, spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter).SpreadsheetAminoAcidColorGroupsHeatMap()); //spreadsheet12.Add(new[] { new SpreadsheetCell(string.Empty)}); foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups))) { spreadsheet12.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Difference between homodimer composition and homodimer interactions - Acid Groups " + enumAminoAcidGroups) }); // Section title. spreadsheet12.AddRange(AminoAcidPairCompositionMatrix.DifferenceWithCompositionMatrix(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1, spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter).SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups)); spreadsheet12.Add(new[] { new SpreadsheetCell(string.Empty) }); } results.Add(spreadsheet12); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// return(results); }
/// <summary>
/// Entry point of the homolog finder. Every selected query sequence is compared against every target
/// sequence with each requested alignment type, and the comparisons that reach the minimum similarity
/// are written as CSV rows (one output per query chain).
/// </summary>
/// <remarks>
/// Arguments, in order:
/// 0 query sequence file; 1 query id as "pdbid:chain" ("*" matches any pdb id, a missing or empty chain
/// matches any chain); 2 target sequence file; 3 alignment types (any of NMW, SWM, SIM, NON separated by
/// ',', ';', space or tab, or "*" for all of them); 4 "Y" to filter on the evolutionary score instead of
/// the plain score; 5 minimum similarity; 6 maximum length difference (-1 disables the length filter);
/// 7 optional output folder. When the output folder is omitted or blank the rows go to standard output.
/// Example: FindHomologs.exe "c:\ds96ub\ds96ub.fasta" * "c:\pdb\pdb_seqres.fasta" NMW Y 0.3 75 c:\pdb\
/// Numbers are parsed with the invariant culture, so "0.3" means the same on every machine.
/// </remarks>
/// <param name="args">Command line arguments as described in the remarks.</param>
static void Main(string[] args)
{
    const int requiredArgumentCount = 7;
    const int minimumTargetLength = 50;

    if (args == null || args.Length < requiredArgumentCount)
    {
        Console.WriteLine("; usage: FindHomologs.exe <query_sequence_file> <query_pdbid[:chainid] | *> <target_sequence_file> <NMW,SWM,SIM,NON | *> <compare_physicochemically Y/N> <min_similarity> <max_len_difference | -1> [output_folder]");
        Environment.ExitCode = 1;
        return;
    }

    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    var querySequenceFile = args[0];  // query.fasta
    var queryIdChain = args[1];       // 1A2G:B
    var targetSequenceFile = args[2]; // targets.fasta
    var alignmentTypeText = args[3];  // NMW,SWM,SIM,NON

    if (alignmentTypeText == "*")
    {
        alignmentTypeText = "NMW,SWM,SIM,NON";
    }

    // Empty entries are dropped so that "NMW, SWM" is not reported as containing an unknown type.
    var alignmentTypeTokens = alignmentTypeText
        .ToUpperInvariant()
        .Split(new[] { ',', ';', ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);

    var comparePhysicochemically = args[4] == "Y";
    var minSimilarity = decimal.Parse(args[5], invariantCulture);
    var maxLengthDifference = int.Parse(args[6], invariantCulture);

    // The output folder is optional; without it the results are printed instead of saved.
    var outputFolder = args.Length > requiredArgumentCount ? args[requiredArgumentCount] : null;
    var outputFolderFullName = string.IsNullOrWhiteSpace(outputFolder)
        ? null
        : new DirectoryInfo(outputFolder).FullName;

    var alignmentTypes = new List<ProteinBioClass.AlignmentType>();

    if (alignmentTypeTokens.Contains("NMW"))
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.NMW);
    }

    if (alignmentTypeTokens.Contains("SWM"))
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.SWM);
    }

    if (alignmentTypeTokens.Contains("SIM"))
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.SIM);
    }

    // NON is also the fallback when nothing else was recognised.
    if (alignmentTypeTokens.Contains("NON") || alignmentTypes.Count == 0)
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.NON);
    }

    if (alignmentTypes.Count < alignmentTypeTokens.Length)
    {
        Console.WriteLine("; unknown alignment type");
        return;
    }

    // load list of query sequences
    var queryIdParts = queryIdChain.Split(':');
    var queryPdbid = queryIdParts[0];

    // "1A2G" and "1A2G:" both mean "any chain"; only the first character of the chain part is used.
    var queryChainid = queryIdParts.Length > 1 && queryIdParts[1].Length > 0 ? queryIdParts[1][0] : '*';

    var querySeq = Sequence.LoadSequenceFile(querySequenceFile, null);

    var queryResults = querySeq.Where(a =>
    {
        var id = new ProteinBioClass.SequenceId(a.Id);

        return (queryPdbid == "*" || string.Equals(id.PdbId, queryPdbid, StringComparison.OrdinalIgnoreCase)) &&
               (queryChainid == '*' || id.ChainId == queryChainid);
    }).ToList();

    if (queryResults.Count == 0)
    {
        Console.WriteLine("; the query pdbids/chainids were not found");
        return;
    }

    // load list of target sequences, ignoring the short ones
    var targetSeq = Sequence.LoadSequenceFile(targetSequenceFile, new string[] { null, "", "protein" });
    targetSeq = targetSeq.Where(a => a.Count() >= minimumTargetLength).ToList();

    Console.WriteLine("; aligning " + queryResults.Count + " query sequences to " + targetSeq.Count + " target sequences");

    // Number of selected chains per pdb id, for the query set and for the target set.
    var queryPdbIdCounts = new Dictionary<string, int>();

    foreach (var sequence in queryResults)
    {
        var pdbId = new ProteinBioClass.SequenceId(sequence.Id).PdbId;
        int chains;
        queryPdbIdCounts.TryGetValue(pdbId, out chains);
        queryPdbIdCounts[pdbId] = chains + 1;
    }

    var targetPdbIdCounts = new Dictionary<string, int>();

    foreach (var sequence in targetSeq)
    {
        var pdbId = new ProteinBioClass.SequenceId(sequence.Id).PdbId;
        int chains;
        targetPdbIdCounts.TryGetValue(pdbId, out chains);
        targetPdbIdCounts[pdbId] = chains + 1;
    }

    // perform alignment, one output per query chain
    foreach (var query in queryResults)
    {
        var queryId = new ProteinBioClass.SequenceId(query.Id);

        string filename = null;

        if (outputFolderFullName != null)
        {
            filename = Path.Combine(outputFolderFullName, "homologs_" + queryId.PdbId + queryId.ChainId + ".csv");

            // skip if already processed
            if (File.Exists(filename) && new FileInfo(filename).Length > 0)
            {
                continue;
            }
        }

        var totalQueryPdbIdChains = queryPdbIdCounts[queryId.PdbId];
        var queryLength = query.Count();

        WorkDivision wd = new WorkDivision(targetSeq.Count);

        for (var thread = 0; thread < wd.ThreadCount; thread++)
        {
            // Copy the loop counter: the lambda below runs after the counter has moved on.
            var lti = thread;

            wd.TaskList.Add(Task.Run(() =>
            {
                var result = new List<HomologChain>();

                for (var target = wd.ThreadFirstIndex[lti]; target <= wd.ThreadLastIndex[lti]; target++)
                {
                    var targetobj = targetSeq[target];

                    if (maxLengthDifference != -1 && Math.Abs(targetobj.Count() - queryLength) > maxLengthDifference)
                    {
                        continue;
                    }

                    var targetId = new ProteinBioClass.SequenceId(targetobj.Id);

                    foreach (var alignmentType in alignmentTypes)
                    {
                        var scores = ProteinBioClass.AlignedSequenceSimilarityPercentage(query, targetobj, alignmentType);

                        decimal percentSimilar = comparePhysicochemically ? scores.ScoreEvo : scores.Score;

                        if (percentSimilar >= minSimilarity)
                        {
                            result.Add(new HomologChain(
                                queryId.PdbId,
                                queryId.ChainId,
                                totalQueryPdbIdChains,
                                targetId.PdbId,
                                targetId.ChainId,
                                targetPdbIdCounts[targetId.PdbId],
                                alignmentType.ToString(),
                                scores.Score,
                                scores.ScoreEvo));
                        }
                    }
                }

                return result;
            }));
        }

        wd.WaitAllTasks();

        var mergedlist = new List<string>();

        mergedlist.Add("; " + queryId.PdbId + ":" + queryId.ChainId);
        mergedlist.Add(String.Join(",", new string[]
        {
            "query pdb id",
            "query chain id",
            "query chains",
            "target pdb id",
            "target chain id",
            "target chains",
            "alignment method",
            "sequence similarity",
            "sequence evo similarity"
        }));

        foreach (var t in wd.TaskList)
        {
            var tc = t as Task<List<HomologChain>>;

            if (tc == null)
            {
                throw new InvalidOperationException("task in tasklist was null");
            }

            mergedlist.AddRange(tc.Result.Select(a => a.ToString()));
        }

        if (filename == null)
        {
            Console.WriteLine(String.Join(Environment.NewLine, mergedlist));
        }
        else
        {
            File.WriteAllLines(filename, mergedlist);
        }
    }
}
/// <summary>
/// Collects the atom pairs, taken from two different chains of one protein, whose 3D distance is greater
/// than zero and not greater than <paramref name="maxAtomInterationDistance"/>.
/// </summary>
/// <param name="cancellationToken">Stops the search early; pairs found before the cancellation are still returned.</param>
/// <param name="maxAtomInterationDistance">Largest accepted distance between the two atoms of a pair (the distance helper's result is named in angstroms).</param>
/// <param name="proteinId">Identifier stored on both sides of every returned pair.</param>
/// <param name="pdbIdChainIdList">Not used by this method.</param>
/// <param name="proteinFileChains">Chains, with their atoms, to search.</param>
/// <param name="breakWhenFirstInteractionFound">When true, the search of a chain pair is abandoned as soon as one of its workers has found a pair.</param>
/// <param name="totalThreads">Passed to the work division that splits the atoms of the first chain between tasks.</param>
/// <param name="sort">When true, the result is ordered by the residue and serial numbers of atom 1, then of atom 2.</param>
/// <param name="requiredChains">Exact number of chains the protein must have; values below zero disable the check.</param>
/// <returns>
/// The atom pairs found (possibly an empty list), or null when the chains are missing, their number
/// differs from <paramref name="requiredChains"/>, or any chain has no atoms.
/// </returns>
public static List<AtomPair> FindInteractions(CancellationToken cancellationToken, decimal maxAtomInterationDistance /*= 8.0m*/, string proteinId, Dictionary<string, List<string>> pdbIdChainIdList, ProteinChainListContainer proteinFileChains, bool breakWhenFirstInteractionFound = false, int totalThreads = -1, bool sort = true, int requiredChains = -1)
{
    // Caching is switched off here; the load/save calls are kept so it can be turned back on in one place.
    bool useCache = false;

    if (useCache && !string.IsNullOrWhiteSpace(proteinId))
    {
        var cachedInteractions = InteractionsCache.LoadPdbInteractionCache(proteinId, requiredChains);

        if (cachedInteractions != null)
        {
            return cachedInteractions;
        }
    }

    // check required number of chains are found
    if (proteinFileChains == null || proteinFileChains.ChainList == null || (requiredChains > -1 && proteinFileChains.ChainList.Count != requiredChains))
    {
        return null;
    }

    // check that all chains have atoms
    if (proteinFileChains.ChainList.Any(chain => chain.AtomList == null || chain.AtomList.Count == 0))
    {
        return null;
    }

    var chainList = proteinFileChains.ChainList;
    var chainCount = chainList.Count;

    // Make list of 3D positions of atoms, one list per chain.
    var positions = new List<Point3D>[chainCount];

    for (int chainIndex = 0; chainIndex < chainCount; chainIndex++)
    {
        positions[chainIndex] = Clustering.AtomRecordListToPoint3DList(chainList[chainIndex]);
    }

    var tasks = new List<Task<List<AtomPair>>>();

    // Every unordered pair of distinct chains is searched once (A before B).
    for (int chainIndexA = 0; chainIndexA < chainCount; chainIndexA++)
    {
        for (int chainIndexB = chainIndexA + 1; chainIndexB < chainCount; chainIndexB++)
        {
            // Per-iteration copies: the lambdas below run after the loop counters have moved on.
            int localChainIndexA = chainIndexA;
            int localChainIndexB = chainIndexB;
            var atomsA = chainList[localChainIndexA].AtomList;
            var atomsB = chainList[localChainIndexB].AtomList;
            var positionsA = positions[localChainIndexA];
            var positionsB = positions[localChainIndexB];
            int atomCountB = atomsB.Count;

            WorkDivision<List<AtomPair>> workDivision = new WorkDivision<List<AtomPair>>(atomsA.Count, totalThreads);

            // Shared by the workers of this chain pair only. It is read and written through Volatile
            // so that a worker notices promptly when another one has asked to stop.
            bool breakOut = false;

            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                int localThreadIndex = threadIndex;

                Task<List<AtomPair>> task = Task.Run(() =>
                {
                    var taskResult = new List<AtomPair>();

                    for (int atomIndexA = workDivision.ThreadFirstIndex[localThreadIndex]; atomIndexA <= workDivision.ThreadLastIndex[localThreadIndex]; atomIndexA++)
                    {
                        if (Volatile.Read(ref breakOut) || cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        // An atom whose coordinates did not parse cannot pair with anything.
                        if (!positionsA[atomIndexA].ParseOK)
                        {
                            continue;
                        }

                        for (int atomIndexB = 0; atomIndexB < atomCountB; atomIndexB++)
                        {
                            if (Volatile.Read(ref breakOut))
                            {
                                break;
                            }

                            if (!positionsB[atomIndexB].ParseOK)
                            {
                                continue;
                            }

                            decimal atomicDistanceAngstroms3D = Point3D.Distance3D(positionsA[atomIndexA], positionsB[atomIndexB], true);

                            // Non-positive distances and distances beyond the limit are not interactions.
                            if (atomicDistanceAngstroms3D <= 0.0m || atomicDistanceAngstroms3D > maxAtomInterationDistance)
                            {
                                continue;
                            }

                            taskResult.Add(new AtomPair(
                                proteinId,
                                atomsA[atomIndexA],
                                localChainIndexA,
                                proteinId,
                                localChainIndexB,
                                atomsB[atomIndexB],
                                atomicDistanceAngstroms3D));

                            if (breakWhenFirstInteractionFound)
                            {
                                // One pair is enough: tell the other workers of this chain pair to stop too.
                                Volatile.Write(ref breakOut, true);
                                break;
                            }
                        }
                    }

                    if (taskResult.Count == 0)
                    {
                        return null;
                    }

                    return taskResult;
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            tasks.AddRange(workDivision.TaskList);
        }
    }

    try
    {
        Task[] tasksToWait = tasks.Where(task => task != null && !task.IsCompleted).ToArray<Task>();

        if (tasksToWait.Length > 0)
        {
            Task.WaitAll(tasksToWait);
        }
    }
    catch (AggregateException)
    {
        // Deliberately ignored: cancelled or faulted tasks are filtered out by the merge below,
        // so the caller receives whatever the successful tasks produced.
    }

    // merge all results
    var atomPairList = new List<AtomPair>();

    foreach (var task in tasks.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null && t.Result.Count > 0))
    {
        atomPairList.AddRange(task.Result);
    }

    if (sort && atomPairList.Count > 1)
    {
        atomPairList = atomPairList
            .OrderBy(i => ProteinDataBankFileOperations.NullableTryParseInt32(i.Atom1.resSeq.FieldValue))
            .ThenBy(i => ProteinDataBankFileOperations.NullableTryParseInt32(i.Atom1.serial.FieldValue))
            .ThenBy(j => ProteinDataBankFileOperations.NullableTryParseInt32(j.Atom2.resSeq.FieldValue))
            .ThenBy(j => ProteinDataBankFileOperations.NullableTryParseInt32(j.Atom2.serial.FieldValue))
            .ToList();
    }

    if (useCache)
    {
        InteractionsCache.SavePdbInteractionCache(proteinId, atomPairList, requiredChains);
    }

    return atomPairList;
}
/*
 * public static void ClusterVectorDistanceMatrixUpgma(List<VectorProteinInterfaceWhole> vectorProteinInterfaceWholeList, decimal[,] vectorDistanceMatrix, int minimumOutputTreeLeafs, out List<string> vectorNames, out List<List<UpgmaNode>> nodeList, out List<List<string>> treeList, ProgressActionSet progressActionSet)
 * {
 *     if (vectorProteinInterfaceWholeList == null) throw new ArgumentNullException(nameof(vectorProteinInterfaceWholeList));
 *     if (vectorDistanceMatrix == null) throw new ArgumentNullException(nameof(vectorDistanceMatrix));
 *
 *     vectorNames = vectorProteinInterfaceWholeList.Select(VectorProteinInterfaceWholeTreeHeader).ToList();
 *
 *     List<string> finalTreeLeafOrderList;
 *     UpgmaClustering.Upgma(vectorDistanceMatrix, vectorNames, minimumOutputTreeLeafs, out nodeList, out treeList, out finalTreeLeafOrderList, false, progressActionSet);
 * }
 */

/// <summary>
/// Builds a square, symmetric matrix holding the optimistic distance between every two entries of
/// <paramref name="vectorProteinInterfaceWholeList"/>, with the rows shared out between several tasks.
/// </summary>
/// <remarks>
/// A cell is the value produced by BestDistanceWithPartsAlignment plus the absolute difference of the
/// two interface lengths multiplied by DifferentLengthProteinInterface. The diagonal, and every pair
/// whose FullProteinInterfaceId values are equal, is left at zero. Rows not reached before cancellation
/// is noticed are left at zero as well. The pessimistic matrix is currently disabled.
/// </remarks>
/// <param name="cancellationToken">Checked before each row; also handed to the worker tasks.</param>
/// <param name="vectorProteinInterfaceWholeList">Interfaces to compare; the matrix has one row and one column per entry.</param>
/// <param name="vectorDistanceMeasurementValues">Distance settings, including the penalty factor for a length difference.</param>
/// <param name="optimisticDistanceMatrix">Receives the finished matrix once all tasks have been waited for.</param>
/// <param name="progressActionSet">Receives start, per-row progress, time-remaining and finish notifications.</param>
/// <exception cref="ArgumentNullException">The list or the measurement values are null.</exception>
public static void BestDistanceMatrixWithPartsAlignment(CancellationToken cancellationToken, List<VectorProteinInterfaceWhole> vectorProteinInterfaceWholeList, VectorDistanceMeasurementValues vectorDistanceMeasurementValues, out double[,] optimisticDistanceMatrix, /* out double[,] pessimisticDistanceMatrix,*/ ProgressActionSet progressActionSet)
{
    if (vectorProteinInterfaceWholeList == null)
    {
        throw new ArgumentNullException(nameof(vectorProteinInterfaceWholeList));
    }

    if (vectorDistanceMeasurementValues == null)
    {
        throw new ArgumentNullException(nameof(vectorDistanceMeasurementValues));
    }

    var interfaceCount = vectorProteinInterfaceWholeList.Count;

    // An out parameter cannot be captured by a lambda, so the tasks fill this local and it is handed over at the end.
    var distances = new double[interfaceCount, interfaceCount];

    var workDivision = new WorkDivision(vectorProteinInterfaceWholeList.Count, -1);

    ProgressActionSet.StartAction(vectorProteinInterfaceWholeList.Count, progressActionSet);

    for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
    {
        // Private copy of the counter for the lambda below.
        int ownThreadIndex = threadIndex;

        workDivision.TaskList.Add(Task.Run(() =>
        {
            for (int row = workDivision.ThreadFirstIndex[ownThreadIndex]; row <= workDivision.ThreadLastIndex[ownThreadIndex]; row++)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    break;
                }

                var rowInterface = vectorProteinInterfaceWholeList[row];

                // Only the cells above the diagonal are computed; each result is mirrored below it.
                for (int column = row + 1; column < vectorProteinInterfaceWholeList.Count; column++)
                {
                    var columnInterface = vectorProteinInterfaceWholeList[column];

                    if (rowInterface.FullProteinInterfaceId == columnInterface.FullProteinInterfaceId)
                    {
                        continue;
                    }

                    double optimisticDistance;

                    BestDistanceWithPartsAlignment(rowInterface, columnInterface, vectorDistanceMeasurementValues, out optimisticDistance);

                    // Interfaces of different length are pushed further apart.
                    optimisticDistance += Math.Abs(rowInterface.ProteinInterfaceLength - columnInterface.ProteinInterfaceLength) * vectorDistanceMeasurementValues.DifferentLengthProteinInterface;

                    distances[row, column] = optimisticDistance;
                    distances[column, row] = optimisticDistance;
                }

                workDivision.IncrementItemsCompleted(1);

                ProgressActionSet.ProgressAction(1, progressActionSet);
                ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
            }
        }, cancellationToken));
    }

    workDivision.WaitAllTasks();

    ProgressActionSet.FinishAction(true, progressActionSet);

    optimisticDistanceMatrix = distances;
}
/// <summary>
/// Entry point: loads the homolog CSV files, merges chains linked by sufficiently similar alignments into
/// clusters, and writes one CSV section per cluster to standard output.
/// </summary>
/// <remarks>
/// A record links its query chain and its target chain when both its alignment score and its evolutionary
/// alignment score reach the given minimums. Linked chains (and the clusters they already belong to) are merged.
/// For every chain of every cluster the output contains the number of chains of its complex, the sequence
/// length, and the minimum / maximum similarity scores against the other members of the same cluster.
/// </remarks>
/// <param name="args">
/// Four values: the folder holding the homolog CSV files, the sequence (FASTA) file, the minimum similarity
/// and the minimum evolutionary similarity. The two numbers are parsed with the invariant culture ("0.9").
/// </param>
private static void Main(string[] args)
{
    // this program will load the homolog list in csv format and for homologs of X sequence distance return a list of all partners
    // however, some partners may be duplicates, which cannot initially be removed, since they may bind differently in other instances
    // then, because of such cases, a unique id to describe each protein must be created... this is slightly problematic because
    // close target homologs of proteins are also considered to be the same protein as the query protein
    // which means that they could exist for more than one query protein

    // FindHomologsCluster.exe c:\pdb\ds96ub_homologs\ c:\pdb\pdb_seqres.fasta 0.9 0.9 > ds96ub_homologs.csv

    if (args == null || args.Length < 4)
    {
        // Standard output is normally redirected into the result csv, so the usage text goes to standard error.
        Console.Error.WriteLine("Usage: FindHomologsCluster.exe <homolog_csv_folder> <sequence_file> <min_similarity> <min_similarity_evo>");
        Environment.ExitCode = 1;
        return;
    }

    var homologCsvFolder = args[0];
    var sequenceFile = args[1];

    // Invariant culture: the thresholds are written with a '.' decimal separator whatever the machine locale is.
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;
    var minSimilarity = decimal.Parse(args[2], invariantCulture);
    var minSimilarityEvo = decimal.Parse(args[3], invariantCulture);

    var seqList = Sequence.LoadSequenceFile(sequenceFile, new[] { null, "", "protein" });

    var homologCsvFiles = Directory.GetFiles(homologCsvFolder, "homologs_?????.csv");

    var parsedData = FindHomologs.FindHomologs.HomologChain.Load(homologCsvFiles);

    var homologClusters = new List<List<Tuple<string, char>>>();

    // Returns the first cluster holding the given chain (PDB id compared case-insensitively), or null.
    Func<string, char, List<Tuple<string, char>>> findCluster = (pdbId, chainId) =>
    {
        var pdbIdUpper = pdbId.ToUpperInvariant();
        return homologClusters.FirstOrDefault(cluster => cluster.Any(member => member.Item1.ToUpperInvariant() == pdbIdUpper && member.Item2 == chainId));
    };

    foreach (var rec in parsedData)
    {
        if (!(rec.AlignmentScore >= minSimilarity && rec.AlignmentScoreEvo >= minSimilarityEvo))
        {
            continue;
        }

        var queryGroup = findCluster(rec.QueryPdbId, rec.QueryChainId);
        var targetGroup = findCluster(rec.TargetPdbId, rec.TargetChainId);

        var mergedGroup = new List<Tuple<string, char>>();

        if (queryGroup != null)
        {
            mergedGroup.AddRange(queryGroup);
            homologClusters.Remove(queryGroup);
            queryGroup.Clear();
        }
        else
        {
            mergedGroup.Add(new Tuple<string, char>(rec.QueryPdbId, rec.QueryChainId));
        }

        if (targetGroup != null)
        {
            // When both chains were already in the same cluster, that list has just been emptied and
            // removed above, so these three statements do nothing.
            mergedGroup.AddRange(targetGroup);
            homologClusters.Remove(targetGroup);
            targetGroup.Clear();
        }
        else
        {
            mergedGroup.Add(new Tuple<string, char>(rec.TargetPdbId, rec.TargetChainId));
        }

        // The merged cluster always goes to the end of the list; the output cluster numbers follow that order.
        mergedGroup = mergedGroup.Distinct().OrderBy(a => a.Item1).ThenBy(a => a.Item2).ToList();

        homologClusters.Add(mergedGroup);
    }

    var seqListIds = seqList.Select(a => new ProteinBioClass.SequenceId(a.Id)).ToList();

    // Upper-cased PDB id -> positions of its chains in seqList, in file order. Built once so that cluster
    // members do not rescan the whole sequence list; it is only read after this loop, including from tasks.
    var chainIndexesByPdbId = new Dictionary<string, List<int>>();
    for (var i = 0; i < seqListIds.Count; i++)
    {
        var pdbIdUpper = seqListIds[i].PdbId.ToUpperInvariant();

        List<int> chainIndexes;
        if (!chainIndexesByPdbId.TryGetValue(pdbIdUpper, out chainIndexes))
        {
            chainIndexes = new List<int>();
            chainIndexesByPdbId.Add(pdbIdUpper, chainIndexes);
        }

        chainIndexes.Add(i);
    }

    // Returns the first loaded sequence matching the chain, or null when there is none.
    Func<Tuple<string, char>, Sequence> findSequence = member =>
    {
        List<int> chainIndexes;
        if (!chainIndexesByPdbId.TryGetValue(member.Item1.ToUpperInvariant(), out chainIndexes))
        {
            return null;
        }

        foreach (var i in chainIndexes)
        {
            if (seqListIds[i].ChainId == member.Item2)
            {
                return seqList[i];
            }
        }

        return null;
    };

    // Number of loaded chains sharing the given PDB id.
    Func<string, int> countComplexChains = pdbId =>
    {
        List<int> chainIndexes;
        return chainIndexesByPdbId.TryGetValue(pdbId.ToUpperInvariant(), out chainIndexes) ? chainIndexes.Count : 0;
    };

    var wd2 = new WorkDivision(homologClusters.Count);

    for (var thread2 = 0; thread2 < wd2.ThreadCount; thread2++)
    {
        var lti2 = thread2;

        wd2.TaskList.Add(Task.Run(() =>
        {
            var clusterLines = new List<string>();

            for (var index2 = wd2.ThreadFirstIndex[lti2]; index2 <= wd2.ThreadLastIndex[lti2]; index2++)
            {
                var cluster = homologClusters[index2];
                var clusterIndex = index2;

                // Resolved once per cluster; position k holds the sequence of cluster[k] (null when not found).
                var clusterSequences = cluster.Select(findSequence).ToList();

                var wd3 = new WorkDivision(cluster.Count);

                for (var thread3 = 0; thread3 < wd3.ThreadCount; thread3++)
                {
                    var lti3 = thread3;

                    wd3.TaskList.Add(Task.Run(() =>
                    {
                        var rows = new List<HomologClusterData>();

                        for (var index3 = wd3.ThreadFirstIndex[lti3]; index3 <= wd3.ThreadLastIndex[lti3]; index3++)
                        {
                            var item = cluster[index3];
                            var s = clusterSequences[index3];

                            if (s == null)
                            {
                                throw new InvalidOperationException("sequence not found for " + item.Item1 + ":" + item.Item2);
                            }

                            var complexChains = countComplexChains(item.Item1);

                            // -1 marks "no score seen yet"; it is also what gets reported for a single-member cluster.
                            var minAlignmentScore = -1m;
                            var maxAlignmentScore = -1m;
                            var minAlignmentScoreEvo = -1m;
                            var maxAlignmentScoreEvo = -1m;

                            for (var otherIndex = 0; otherIndex < cluster.Count; otherIndex++)
                            {
                                // Members are distinct, so skipping the own position skips exactly the item itself.
                                if (otherIndex == index3)
                                {
                                    continue;
                                }

                                var s2 = clusterSequences[otherIndex];
                                if (s2 == null)
                                {
                                    continue;
                                }

                                var alignmentScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(s, s2, ProteinBioClass.AlignmentType.NMW);

                                if (alignmentScore.Score > maxAlignmentScore || maxAlignmentScore == -1m)
                                {
                                    maxAlignmentScore = alignmentScore.Score;
                                }

                                if (alignmentScore.Score < minAlignmentScore || minAlignmentScore == -1m)
                                {
                                    minAlignmentScore = alignmentScore.Score;
                                }

                                if (alignmentScore.ScoreEvo > maxAlignmentScoreEvo || maxAlignmentScoreEvo == -1m)
                                {
                                    maxAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                }

                                if (alignmentScore.ScoreEvo < minAlignmentScoreEvo || minAlignmentScoreEvo == -1m)
                                {
                                    minAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                }
                            }

                            rows.Add(new HomologClusterData(clusterIndex + 1, index3 + 1, item.Item1, item.Item2, complexChains, Convert.ToInt32(s.Count()), minAlignmentScore, maxAlignmentScore, minAlignmentScoreEvo, maxAlignmentScoreEvo, s.FullSequence));
                        }

                        return rows;
                    }));
                }

                wd3.WaitAllTasks();

                clusterLines.Add("; Cluster # " + (index2 + 1) + " with " + wd3.ItemsToProcess + " protein chains");
                clusterLines.Add("cluster index,item index,pdb id,chain id,complex chains,seq len,min clstr sid,max clstr sid,min evo clstr sid,max evo clstr sid,sequence");

                foreach (var task in wd3.TaskList)
                {
                    var tr = task as Task<List<HomologClusterData>>;
                    if (tr == null || tr.Result == null)
                    {
                        continue;
                    }

                    clusterLines.AddRange(tr.Result.Select(a => a.ToString()).ToList());
                }

                clusterLines.Add("");
            }

            return clusterLines;
        }));
    }

    wd2.WaitAllTasks();

    // Tasks are read back in creation order, so the lines come out in cluster order.
    var outputLines = new List<string>();

    foreach (var task in wd2.TaskList)
    {
        var tr = task as Task<List<string>>;
        if (tr == null || tr.Result == null)
        {
            continue;
        }

        outputLines.AddRange(tr.Result);
    }

    foreach (var line in outputLines)
    {
        Console.WriteLine(line);
    }

    // partners may have other interfaces, should those also be considered?
}