/// <summary>
/// Loads the given sequence files, clusters every PDB file found in the given directories whose
/// id appears in the loaded sequences, and builds two interaction vectors for each detected
/// protein interface: one in forward sequence order and one with the sequence reversed.
/// </summary>
/// <param name="cancellationToken">Forwarded to the clustering call.</param>
/// <param name="maxAtomInterationDistance">Forwarded to the clustering call.</param>
/// <param name="minimumProteinInterfaceDensity">Forwarded to the clustering call.</param>
/// <param name="sequenceListFileArray">Sequence files to load. Must not be null.</param>
/// <param name="pdbFileDirectoryLocationArray">Directories searched for PDB files. Must not be null.</param>
/// <param name="progressActionSet">Receives progress and estimated-time-remaining updates.</param>
/// <returns>
/// A flat list of vectors (forward and reversed for every interface), ordered by protein id,
/// then chain id, then protein interface id. <c>FullSequenceLength</c> of each vector is the
/// length of the matching loaded sequence, or -1 when no sequence matches.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="sequenceListFileArray"/> or <paramref name="pdbFileDirectoryLocationArray"/> is null.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">No PDB ids could be obtained from the sequence files.</exception>
public static List<VectorProteinInterfaceWhole> LoadProteinInterfaceVectorFromFiles(
    CancellationToken cancellationToken,
    decimal maxAtomInterationDistance,
    decimal minimumProteinInterfaceDensity,
    string[] sequenceListFileArray,
    string[] pdbFileDirectoryLocationArray,
    ProgressActionSet progressActionSet)
{
    if (sequenceListFileArray == null)
    {
        throw new ArgumentNullException(nameof(sequenceListFileArray));
    }

    if (pdbFileDirectoryLocationArray == null)
    {
        throw new ArgumentNullException(nameof(pdbFileDirectoryLocationArray));
    }

    var vectorProteinInterfaceWholeList = new List<VectorProteinInterfaceWhole>();

    // 1: Open list of sequences already cleaned to have only symmetrical homodimers (fasta file only
    //    contains 100% symmetrical homodimers with all other junk removed - but could have any number
    //    of proteinInterfaces per chain)
    List<ISequence> sequenceList = SequenceFileHandler.LoadSequenceFileList(sequenceListFileArray, StaticValues.MolNameProteinAcceptedValues);
    var pdbIdChainIdList = ProteinDataBankFileOperations.PdbIdChainIdList(sequenceList);

    // 2: Get a list of the unique ids for the sequences
    List<string> pdbIdList = FilterProteins.SequenceListToPdbIdList(sequenceList);

    if (pdbIdList == null || pdbIdList.Count == 0)
    {
        throw new ArgumentOutOfRangeException(nameof(sequenceListFileArray), "Error loading PDB ID list");
    }

    // Set-based membership test: the id list is consulted once per PDB file, so a linear
    // List.Contains scan would make the loop below quadratic. Same (ordinal) equality as before.
    var wantedPdbIds = new HashSet<string>(pdbIdList);

    // 3: Get a list of PDB files found in user specified directory
    string[] pdbFilesArray = ProteinDataBankFileOperations.GetPdbFilesArray(pdbFileDirectoryLocationArray);

    ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet);
    var startTicks = DateTime.Now.Ticks;

    // 4: Loop through each pdb file
    for (int pdbFileNumber = 0; pdbFileNumber < pdbFilesArray.Length; pdbFileNumber++)
    {
        // Progress is reported up front so that skipped files still advance the progress display.
        ProgressActionSet.ProgressAction(1, progressActionSet);
        ProgressActionSet.EstimatedTimeRemainingAction(startTicks, pdbFileNumber + 1, pdbFilesArray.Length, progressActionSet);

        // get unique id of pdb file
        string pdbFilename = pdbFilesArray[pdbFileNumber];
        string proteinId = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename);

        // check pdb unique id was in the loaded sequence list
        if (!wantedPdbIds.Contains(proteinId))
        {
            continue;
        }

        ClusterProteinDataBankFileResult clusterPdbFileResult = Clustering.ClusterProteinDataBankFile(
            cancellationToken,
            maxAtomInterationDistance,
            minimumProteinInterfaceDensity,
            pdbFilename,
            pdbIdChainIdList,
            ClusteringMethodOptions.ClusterWithResidueSequenceIndex,
            -1,
            -1,
            progressActionSet);

        if (clusterPdbFileResult == null)
        {
            continue;
        }

        List<ProteinInterfaceSequenceAndPositionData> orderedInterfaces = clusterPdbFileResult.ProteinInterfaceAnalysisResultData.ProteinInterfacesSequenceAndPositionDataList
            .OrderBy(a => a.FullProteinInterfaceId.ProteinId)
            .ThenBy(a => a.FullProteinInterfaceId.ChainId)
            .ThenBy(a => a.FullProteinInterfaceId.ProteinInterfaceId)
            .ToList();

        foreach (ProteinInterfaceSequenceAndPositionData proteinInterfaceSequenceAndPositionData in orderedInterfaces)
        {
            // Hoisted out of the search lambda: these do not change per candidate sequence.
            string interfaceProteinId = proteinInterfaceSequenceAndPositionData.FullProteinInterfaceId.ProteinId;
            string interfaceChainLetter = proteinInterfaceSequenceAndPositionData.ChainIdLetter;

            // Find the loaded sequence for this interface's PDB id and chain letter (case-insensitive).
            var seq = sequenceList.FirstOrDefault(a =>
            {
                var p = SequenceIdSplit.SequenceIdToPdbIdAndChainId(a.ID);

                return string.Equals(p.PdbId, interfaceProteinId, StringComparison.OrdinalIgnoreCase)
                       && string.Equals(p.ChainId, interfaceChainLetter, StringComparison.OrdinalIgnoreCase);
            });

            // -1 marks "no matching sequence was loaded".
            var seqLen = seq != null ? seq.Count : -1;

            // Forward-sequence vector.
            var vectorProteinInterfaceWholeFwd = MakeVectorProteinInterfaceWhole(pdbFilename, proteinInterfaceSequenceAndPositionData, false, false);
            vectorProteinInterfaceWholeFwd.FullSequenceLength = seqLen;
            vectorProteinInterfaceWholeList.Add(vectorProteinInterfaceWholeFwd);

            // Reversed-sequence vector for the same interface.
            var vectorProteinInterfaceWholeRev = MakeVectorProteinInterfaceWhole(pdbFilename, proteinInterfaceSequenceAndPositionData, true, false);
            vectorProteinInterfaceWholeRev.FullSequenceLength = seqLen;
            vectorProteinInterfaceWholeList.Add(vectorProteinInterfaceWholeRev);
        }
    }

    ProgressActionSet.FinishAction(true, progressActionSet);

    vectorProteinInterfaceWholeList = vectorProteinInterfaceWholeList
        .OrderBy(a => a.FullProteinInterfaceId.ProteinId)
        .ThenBy(a => a.FullProteinInterfaceId.ChainId)
        .ThenBy(a => a.FullProteinInterfaceId.ProteinInterfaceId)
        .ToList();

    return vectorProteinInterfaceWholeList;
}
/// <summary>
/// Groups the loaded proteins by their chain count and per-chain protein interface counts
/// (optionally also by the distinct interface lengths), then saves one fasta file per group.
/// </summary>
/// <param name="cancellationToken">Forwarded to the clustering and sequence-removal calls.</param>
/// <param name="maxAtomInterationDistance">Forwarded to the clustering call.</param>
/// <param name="minimumProteinInterfaceDensity">Forwarded to the clustering call.</param>
/// <param name="sequenceListFileArray">Sequence files to load. Must not be null; every non-blank entry must exist.</param>
/// <param name="pdbFileDirectoryLocationArray">Directories searched for PDB files. Must not be null; every non-blank entry must exist.</param>
/// <param name="filterProteinInterfacesLengthOutputFilename">
/// Output filename template. Must contain the text <c>%proteinInterfaces%</c>, which is replaced
/// with each group's key to form the saved filename.
/// </param>
/// <param name="filterProteinInterfaceCountsWithoutLengths">When true, group by chain count and interface counts.</param>
/// <param name="filterProteinInterfaceCountsWithLengths">When true, group by chain count, interface counts and distinct interface lengths.</param>
/// <param name="fileExistsOptions">Forwarded to the fasta save call.</param>
/// <param name="progressActionSet">Receives status messages and progress updates.</param>
/// <exception cref="ArgumentNullException">One of the array arguments or the output filename is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">The output filename is blank or lacks the <c>%proteinInterfaces%</c> placeholder.</exception>
public static void FilterProteinInterfaceLengths(
    CancellationToken cancellationToken,
    decimal maxAtomInterationDistance,
    decimal minimumProteinInterfaceDensity,
    string[] sequenceListFileArray,
    string[] pdbFileDirectoryLocationArray,
    string filterProteinInterfacesLengthOutputFilename,
    bool filterProteinInterfaceCountsWithoutLengths,
    bool filterProteinInterfaceCountsWithLengths,
    FileExistsHandler.FileExistsOptions fileExistsOptions,
    ProgressActionSet progressActionSet)
{
    if (sequenceListFileArray == null)
    {
        throw new ArgumentNullException(nameof(sequenceListFileArray));
    }

    if (pdbFileDirectoryLocationArray == null)
    {
        throw new ArgumentNullException(nameof(pdbFileDirectoryLocationArray));
    }

    if (filterProteinInterfacesLengthOutputFilename == null)
    {
        throw new ArgumentNullException(nameof(filterProteinInterfacesLengthOutputFilename));
    }

    if (!filterProteinInterfaceCountsWithoutLengths && !filterProteinInterfaceCountsWithLengths)
    {
        ProgressActionSet.Report("Cancelled: No filter options selected.", progressActionSet);
        return;
    }

    // Check all sequence files are found. Missing files are reported (not thrown) and cancel the run.
    var missingSequenceFiles = sequenceListFileArray.Where(sequenceFile => !string.IsNullOrWhiteSpace(sequenceFile) && !File.Exists(sequenceFile)).ToList();

    if (missingSequenceFiles.Count > 0)
    {
        foreach (string missingSequenceFile in missingSequenceFiles)
        {
            ProgressActionSet.Report("Warning: Sequence file missing: " + missingSequenceFile, progressActionSet);
        }

        ProgressActionSet.Report("Cancelled: missing sequence files.", progressActionSet);
        return;
    }

    // Check all pdb folders are found. Missing folders are reported (not thrown) and cancel the run.
    var missingDirectoryList = pdbFileDirectoryLocationArray.Where(pdbDirectory => !string.IsNullOrWhiteSpace(pdbDirectory) && !Directory.Exists(pdbDirectory)).ToList();

    if (missingDirectoryList.Count > 0)
    {
        foreach (string pdbDirectory in missingDirectoryList)
        {
            ProgressActionSet.Report("Warning: Structure file directory missing: " + pdbDirectory, progressActionSet);
        }

        ProgressActionSet.Report("Cancelled: missing structure file directory.", progressActionSet);
        return;
    }

    const string proteinInterfacesTemplateText = "%proteinInterfaces%";

    if (string.IsNullOrWhiteSpace(filterProteinInterfacesLengthOutputFilename) || !filterProteinInterfacesLengthOutputFilename.Contains(proteinInterfacesTemplateText))
    {
        throw new ArgumentOutOfRangeException(nameof(filterProteinInterfacesLengthOutputFilename));
    }

    // Load fasta sequence files
    List<ISequence> sequenceList = SequenceFileHandler.LoadSequenceFileList(sequenceListFileArray, StaticValues.MolNameProteinAcceptedValues);

    // Get a list of the PDB Unique IDs with unique chain IDs which are wanted, ignoring others which may be present e.g. dna
    var pdbIdChainIdList = ProteinDataBankFileOperations.PdbIdChainIdList(sequenceList);

    // Get list of PDB Unique IDs
    List<string> pdbIdList = FilterProteins.SequenceListToPdbIdList(sequenceList);

    // Check PDB Unique IDs were successfully loaded
    if (pdbIdList == null || pdbIdList.Count == 0)
    {
        ProgressActionSet.Report("Error: Sequence list could not be loaded", progressActionSet);
        return;
    }

    // Set-based membership test for the per-file check below (same ordinal equality as List.Contains).
    var wantedPdbIds = new HashSet<string>(pdbIdList);

    // Get a list of PDB files found in user specified directory, restricted to the wanted ids
    string[] pdbFilesArray = ProteinDataBankFileOperations.RemoveNonWhiteListedPdbIdFromPdbFilesArray(pdbIdList, ProteinDataBankFileOperations.GetPdbFilesArray(pdbFileDirectoryLocationArray));

    // Check all PDB files are found; missing files are only reported, processing continues
    List<string> missingPdbFilesList = ProteinDataBankFileOperations.CheckForMissingPdbFiles(pdbFilesArray, pdbIdList);

    if (missingPdbFilesList != null && missingPdbFilesList.Count > 0)
    {
        ProgressActionSet.Report("Missing PDB Files: " + string.Join(", ", missingPdbFilesList), progressActionSet);
    }

    ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet);

    // Group key -> ids of the proteins falling into that group (holds both kinds of key).
    var proteinInterfacesCountResultWithLengths = new Dictionary<string, List<string>>();

    var startTicks = DateTime.Now.Ticks;

    // Loop through each pdb file. A plain bounded loop is used so that an empty file array is
    // simply skipped; the previous "Length + 1" loop indexed element 0 of an empty array.
    for (int pdbFileNumber = 0; pdbFileNumber < pdbFilesArray.Length; pdbFileNumber++)
    {
        ProgressActionSet.EstimatedTimeRemainingAction(startTicks, pdbFileNumber, pdbFilesArray.Length, progressActionSet);

        // get unique id of pdb file
        string pdbFilename = pdbFilesArray[pdbFileNumber];
        string proteinId = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename);

        // check pdb unique id was in the loaded sequence list
        if (wantedPdbIds.Contains(proteinId))
        {
            // perform clustering to detect interaction proteinInterfaces
            ClusterProteinDataBankFileResult clusterPdbFileResult = Clustering.ClusterProteinDataBankFile(
                cancellationToken,
                maxAtomInterationDistance,
                minimumProteinInterfaceDensity,
                pdbFilename,
                pdbIdChainIdList,
                ClusteringMethodOptions.ClusterWithResidueSequenceIndex,
                -1,
                -1,
                null);

            if (clusterPdbFileResult != null)
            {
                int chainsCount = clusterPdbFileResult.ClusteringFullResultListContainer.ChainList.Count;

                List<ProteinInterfaceSequenceAndPositionData> proteinInterfaces = clusterPdbFileResult.ProteinInterfaceAnalysisResultData.ProteinInterfacesSequenceAndPositionDataList;

                // Number of interfaces found on each chain, indexed by chain index.
                int[] proteinInterfacesCount = new int[chainsCount];

                for (int chainIndex = 0; chainIndex < chainsCount; chainIndex++)
                {
                    proteinInterfacesCount[chainIndex] = proteinInterfaces.Count(proteinInterface => proteinInterface.FullProteinInterfaceId.ChainId == chainIndex);
                }

                // Sorted so that the key does not depend on chain order.
                var proteinInterfacesCountStr = string.Join(" ", proteinInterfacesCount.OrderBy(x => x));

                // Distinct interface lengths, sorted; "0" stands in when there are no interfaces.
                var proteinInterfacesLengthStr = proteinInterfaces.Count == 0
                    ? 0.ToString()
                    : string.Join(" ", proteinInterfaces.Select(proteinInterface => proteinInterface.ProteinInterfaceLength).Distinct().OrderBy(x => x));

                if (filterProteinInterfaceCountsWithoutLengths)
                {
                    var combinedKeyAll = "chains [" + chainsCount + "] proteinInterfaces [" + proteinInterfacesCountStr + "]";
                    AddProteinIdToInterfaceGroup(proteinInterfacesCountResultWithLengths, combinedKeyAll, proteinId);
                }

                if (filterProteinInterfaceCountsWithLengths)
                {
                    var combinedKeyWithLengths = "chains [" + chainsCount + "] proteinInterfaces [" + proteinInterfacesCountStr + "] lengths [" + proteinInterfacesLengthStr + "]";
                    AddProteinIdToInterfaceGroup(proteinInterfacesCountResultWithLengths, combinedKeyWithLengths, proteinId);
                }
            }
        }

        // One progress step per file, whether it was processed or skipped.
        ProgressActionSet.ProgressAction(1, progressActionSet);
    }

    var confirmSaveList = new List<string>();

    foreach (var kvp in proteinInterfacesCountResultWithLengths)
    {
        // Keep only the sequences belonging to the proteins of this group.
        var seq2 = new List<ISequence>(sequenceList);
        seq2 = FilterProteins.RemoveSequences(cancellationToken, seq2, kvp.Value, FilterProteins.RemoveSequencesOptions.RemoveSequencesNotInList);

        // The group key replaces the placeholder in the output filename template.
        var saveFilename = filterProteinInterfacesLengthOutputFilename.Replace(proteinInterfacesTemplateText, kvp.Key);

        var actualSavedFilename = SequenceFileHandler.SaveSequencesAsFasta(seq2, saveFilename, true, fileExistsOptions, progressActionSet);

        if (!string.IsNullOrWhiteSpace(actualSavedFilename))
        {
            confirmSaveList.Add(actualSavedFilename);
        }
    }

    // Confirm the total number of sequences saved is equal to original number loaded
    ConfirmSequencesSaved(pdbIdList, confirmSaveList, progressActionSet);

    ProgressActionSet.FinishAction(true, progressActionSet);
}

/// <summary>
/// Appends <paramref name="proteinId"/> to the group stored under <paramref name="groupKey"/>,
/// creating the group on first use.
/// </summary>
private static void AddProteinIdToInterfaceGroup(Dictionary<string, List<string>> groups, string groupKey, string proteinId)
{
    List<string> members;

    if (!groups.TryGetValue(groupKey, out members))
    {
        members = new List<string>();
        groups.Add(groupKey, members);
    }

    members.Add(proteinId);
}
/*
 * Disabled helper, retained for reference only (not compiled):
 *
 * private static int[] LastPdbChainResidueIndexes(string pdbFilename)
 * {
 *     //var result = new Dictionary<string,int>();
 *     var pdbFile = new ProteinDataBankFile(pdbFilename, new []{ ATOM_Record.ATOM_Field.FieldName });
 *
 *     //var x = ProteinDataBankFileOperations.PdbAtomAcidList();
 *
 *     var atomList = pdbFile.ProteinDataBankFileRecordList.Where(a => a.GetType() == typeof (ATOM_Record)).Select(a=>(ATOM_Record)a).ToList();
 *
 *     var chainIdList = atomList.Select(a=>a.chainID.FieldValue.ToUpperInvariant()).Distinct().ToList();
 *
 *     var result = new int[chainIdList.Count];
 *
 *     for (int index = 0; index < chainIdList.Count; index++)
 *     {
 *         var chainId = chainIdList[index];
 *         var maxResidueIndex = atomList.Where(a => a.chainID.FieldValue.ToUpperInvariant() == chainId).Select(a => int.Parse(a.resSeq.FieldValue)).Max();
 *
 *         result[index] = maxResidueIndex;
 *     }
 *
 *     return result;
 * }
 */

/// <summary>
/// Builds a <see cref="VectorProteinInterfaceWhole"/> for one protein interface, with one
/// <see cref="VectorProteinInterfacePart"/> per entry of the interface's amino acid metadata.
/// </summary>
/// <param name="pdbFilename">PDB file the interface came from; used to load the secondary structure. Must not be null.</param>
/// <param name="proteinInterfaceSequenceAndPositionData">Source data for the interface. Must not be null.</param>
/// <param name="reversedSequence">When true, the resulting part list is reversed.</param>
/// <param name="reversedInteractions">When true, each part's interaction flag array is reversed.</param>
/// <returns>The populated vector; its <c>FullSequenceLength</c> is not set here.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="pdbFilename"/> or <paramref name="proteinInterfaceSequenceAndPositionData"/> is null.
/// </exception>
private static VectorProteinInterfaceWhole MakeVectorProteinInterfaceWhole(string pdbFilename, ProteinInterfaceSequenceAndPositionData proteinInterfaceSequenceAndPositionData, bool reversedSequence, bool reversedInteractions)
{
    if (pdbFilename == null)
    {
        throw new ArgumentNullException(nameof(pdbFilename));
    }

    if (proteinInterfaceSequenceAndPositionData == null)
    {
        throw new ArgumentNullException(nameof(proteinInterfaceSequenceAndPositionData));
    }

    ProteinInterfaceAminoAcidMetaData[] residueMetaData = proteinInterfaceSequenceAndPositionData.AminoAcidSequenceAllResidueSequenceIndexes;

    // Interface-level fields are copied straight from the source data; the id is copied, not shared.
    var whole = new VectorProteinInterfaceWhole
    {
        FullProteinInterfaceId = new FullProteinInterfaceId(proteinInterfaceSequenceAndPositionData.FullProteinInterfaceId),
        ProteinInterfaceLength = proteinInterfaceSequenceAndPositionData.ProteinInterfaceLength,
        FirstResidueSequenceIndex = proteinInterfaceSequenceAndPositionData.StartPosition,
        LastResidueSequenceIndex = proteinInterfaceSequenceAndPositionData.EndPosition,
        ReversedInteractions = reversedInteractions,
        ReversedSequence = reversedSequence,
    };

    // FullSequenceLength was once derived here from the disabled LastPdbChainResidueIndexes helper:
    //whole.FullSequenceLength = LastPdbChainResidueIndexes(pdbFilename)[whole.FullProteinInterfaceId.ChainId];

    whole.SecondaryStructure = ProteinInterfaceSecondaryStructureLoader.ProteinInterfaceSecondaryStructure(
        pdbFilename,
        SpreadsheetFileHandler.AlphabetLetterRollOver(whole.FullProteinInterfaceId.ChainId),
        whole.FirstResidueSequenceIndex,
        whole.LastResidueSequenceIndex,
        whole.ReversedSequence);

    for (int residueIndex = 0; residueIndex < residueMetaData.Length; residueIndex++)
    {
        ProteinInterfaceAminoAcidMetaData residue = residueMetaData[residueIndex];

        var part = new VectorProteinInterfacePart(residue.OppoproteinInterfaceInteractions.Length)
        {
            FullProteinInterfaceId = new FullProteinInterfaceId(proteinInterfaceSequenceAndPositionData.FullProteinInterfaceId),
            ResidueId = residueIndex,
            SourceAminoAcid1L = residue.ResidueName1L,
            SourceAminoAcid3L = residue.ResidueName3L,
            InteractionAminoAcids1L = residue.ProteinInterfaceInteractionResidueNames1L,
            InteractionNonProteinInterfaceAminoAcids1L = residue.NonProteinInterfaceInteractionResidueNames1L,
            InteractionFlagBools = new bool[residue.OppoproteinInterfaceInteractions.Length]
        };

        whole.VectorProteinInterfacePartList.Add(part);

        // The flags are copied into the part's own array so that reversing them below
        // does not touch the source metadata.
        Array.Copy(residue.OppoproteinInterfaceInteractions, part.InteractionFlagBools, residue.OppoproteinInterfaceInteractions.Length);

        if (reversedInteractions)
        {
            Array.Reverse(part.InteractionFlagBools);
        }

        part.InteractionToNonProteinInterface = residue.ProteinInterfaceInteractionType.HasFlag(ProteinInterfaceInteractionType.InteractionWithNonProteinInterface);
    }

    // ResidueId keeps the original (forward) index even when the part order is reversed.
    if (whole.ReversedSequence)
    {
        whole.VectorProteinInterfacePartList.Reverse();
    }

    return whole;
}