/// <summary>
        ///     This method returns a dictionary entry for each protein id (pdb id), with a list of interaction vectors
        /// </summary>
        /// <returns></returns>
        public static List <VectorProteinInterfaceWhole> LoadProteinInterfaceVectorFromFiles(
            CancellationToken cancellationToken,
            decimal maxAtomInterationDistance,
            decimal minimumProteinInterfaceDensity,
            string[] sequenceListFileArray,
            string[] pdbFileDirectoryLocationArray,
            ProgressActionSet progressActionSet)
        {
            if (sequenceListFileArray == null)
            {
                throw new ArgumentNullException(nameof(sequenceListFileArray));
            }
            if (pdbFileDirectoryLocationArray == null)
            {
                throw new ArgumentNullException(nameof(pdbFileDirectoryLocationArray));
            }

            var vectorProteinInterfaceWholeList = new List <VectorProteinInterfaceWhole>();

            // 1: Open list of sequences already cleaned to have only symmetrical homodimers (fasta file only contains 100% symmetrical homodimers with all other junk removed - but could have any number of proteinInterfaces per chain)
            List <ISequence> sequenceList = SequenceFileHandler.LoadSequenceFileList(sequenceListFileArray, StaticValues.MolNameProteinAcceptedValues);

            var pdbIdChainIdList = ProteinDataBankFileOperations.PdbIdChainIdList(sequenceList);

            // 2: Get a list of the unique ids for the sequences
            List <string> pdbIdList = FilterProteins.SequenceListToPdbIdList(sequenceList);

            if (pdbIdList == null || pdbIdList.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequenceListFileArray), "Error loading PDB ID list");
            }

            // 3: Get a list of PDB files found in user specified directory
            string[] pdbFilesArray = ProteinDataBankFileOperations.GetPdbFilesArray(pdbFileDirectoryLocationArray);



            ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet);



            var startTicks = DateTime.Now.Ticks;

            // 4: Loop through each pdb file
            for (int pdbFileNumber = 0; pdbFileNumber < pdbFilesArray.Length; pdbFileNumber++) // +1 is for progress update
            {
                ProgressActionSet.ProgressAction(1, progressActionSet);

                ProgressActionSet.EstimatedTimeRemainingAction(startTicks, pdbFileNumber + 1, pdbFilesArray.Length, progressActionSet);

                // get unique id of pdb file
                string pdbFilename = pdbFilesArray[pdbFileNumber];
                string proteinId   = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename);

                // check pdb unique id was in the loaded sequence list
                if (!pdbIdList.Contains(proteinId))
                {
                    continue;
                }

                ClusterProteinDataBankFileResult clusterPdbFileResult = Clustering.ClusterProteinDataBankFile(cancellationToken, maxAtomInterationDistance, minimumProteinInterfaceDensity, pdbFilename, pdbIdChainIdList, ClusteringMethodOptions.ClusterWithResidueSequenceIndex, -1, -1, progressActionSet);

                if (clusterPdbFileResult == null)
                {
                    continue;
                }

                List <ProteinInterfaceSequenceAndPositionData> proteinInterfaceSequenceAndPositionDataList = clusterPdbFileResult.ProteinInterfaceAnalysisResultData.ProteinInterfacesSequenceAndPositionDataList;
                proteinInterfaceSequenceAndPositionDataList = proteinInterfaceSequenceAndPositionDataList.OrderBy(a => a.FullProteinInterfaceId.ProteinId).ThenBy(a => a.FullProteinInterfaceId.ChainId).ThenBy(a => a.FullProteinInterfaceId.ProteinInterfaceId).ToList();

                for (int proteinInterfaceSequenceAndPositionDataListIndex = 0; proteinInterfaceSequenceAndPositionDataListIndex < proteinInterfaceSequenceAndPositionDataList.Count; proteinInterfaceSequenceAndPositionDataListIndex++)
                {
                    ProteinInterfaceSequenceAndPositionData proteinInterfaceSequenceAndPositionData = proteinInterfaceSequenceAndPositionDataList[proteinInterfaceSequenceAndPositionDataListIndex];

                    var seq = sequenceList.FirstOrDefault(a =>
                    {
                        var p = SequenceIdSplit.SequenceIdToPdbIdAndChainId(a.ID);
                        return(p.PdbId.ToUpperInvariant() == proteinInterfaceSequenceAndPositionData.FullProteinInterfaceId.ProteinId.ToUpperInvariant() && p.ChainId.ToUpperInvariant() == proteinInterfaceSequenceAndPositionData.ChainIdLetter.ToUpperInvariant());
                    });

                    var seqLen = seq != null ? seq.Count : -1;

                    var vectorProteinInterfaceWholeFwd = MakeVectorProteinInterfaceWhole(pdbFilename, proteinInterfaceSequenceAndPositionData, false, false);
                    vectorProteinInterfaceWholeFwd.FullSequenceLength = seqLen;

                    vectorProteinInterfaceWholeList.Add(vectorProteinInterfaceWholeFwd);

                    var vectorProteinInterfaceWholeRev = MakeVectorProteinInterfaceWhole(pdbFilename, proteinInterfaceSequenceAndPositionData, true, false);
                    vectorProteinInterfaceWholeRev.FullSequenceLength = seqLen;

                    vectorProteinInterfaceWholeList.Add(vectorProteinInterfaceWholeRev);
                }
            }

            ProgressActionSet.FinishAction(true, progressActionSet);

            vectorProteinInterfaceWholeList = vectorProteinInterfaceWholeList.OrderBy(a => a.FullProteinInterfaceId.ProteinId).ThenBy(a => a.FullProteinInterfaceId.ChainId).ThenBy(a => a.FullProteinInterfaceId.ProteinInterfaceId).ToList();

            return(vectorProteinInterfaceWholeList);
        }
Exemplo n.º 2
0
        public static void FilterProteinInterfaceLengths(
            CancellationToken cancellationToken,
            decimal maxAtomInterationDistance,
            decimal minimumProteinInterfaceDensity,
            string[] sequenceListFileArray,
            string[] pdbFileDirectoryLocationArray,
            string filterProteinInterfacesLengthOutputFilename,
            bool filterProteinInterfaceCountsWithoutLengths,
            bool filterProteinInterfaceCountsWithLengths,
            FileExistsHandler.FileExistsOptions fileExistsOptions,
            ProgressActionSet progressActionSet)
        {
            if (sequenceListFileArray == null)
            {
                throw new ArgumentNullException(nameof(sequenceListFileArray));
            }
            if (pdbFileDirectoryLocationArray == null)
            {
                throw new ArgumentNullException(nameof(pdbFileDirectoryLocationArray));
            }
            if (filterProteinInterfacesLengthOutputFilename == null)
            {
                throw new ArgumentNullException(nameof(filterProteinInterfacesLengthOutputFilename));
            }
            if (!filterProteinInterfaceCountsWithoutLengths && !filterProteinInterfaceCountsWithLengths)
            {
                ProgressActionSet.Report("Cancelled: No filter options selected.", progressActionSet);
                return;
            }

            // Check all sequence files are found
            var missingSequenceFiles = sequenceListFileArray.Where(sequenceFile => !string.IsNullOrWhiteSpace(sequenceFile) && !File.Exists(sequenceFile)).ToList();

            if (missingSequenceFiles.Count > 0)
            {
                foreach (string missingSequenceFile in missingSequenceFiles)
                {
                    //throw new FileNotFoundException(sequenceFile);

                    ProgressActionSet.Report("Warning: Sequence file missing: " + missingSequenceFile, progressActionSet);
                }

                ProgressActionSet.Report("Cancelled: missing sequence files.", progressActionSet);
                return;
            }

            // Check all pdb folders are found
            var missingDirectoryList = pdbFileDirectoryLocationArray.Where(pdbDirectory => !string.IsNullOrWhiteSpace(pdbDirectory) && !Directory.Exists(pdbDirectory)).ToList();

            if (missingDirectoryList.Count > 0)
            {
                foreach (string pdbDirectory in missingDirectoryList)
                {
                    //throw new DirectoryNotFoundException(pdbDirectory);
                    ProgressActionSet.Report("Warning: Structure file directory missing: " + pdbDirectory, progressActionSet);
                }

                ProgressActionSet.Report("Cancelled: missing structure file directory.", progressActionSet);
                return;
            }

            const string proteinInterfacesTemplateText = "%proteinInterfaces%";

            if (string.IsNullOrWhiteSpace(filterProteinInterfacesLengthOutputFilename) || !filterProteinInterfacesLengthOutputFilename.Contains(proteinInterfacesTemplateText))
            {
                throw new ArgumentOutOfRangeException(nameof(filterProteinInterfacesLengthOutputFilename));
            }

            // Load fasta sequence files
            List <ISequence> sequenceList = SequenceFileHandler.LoadSequenceFileList(sequenceListFileArray, StaticValues.MolNameProteinAcceptedValues);

            // Get a list of the PDB Unique IDs with unique chain IDs which are wanted, ignoring others which may be present e.g. dna
            var pdbIdChainIdList = ProteinDataBankFileOperations.PdbIdChainIdList(sequenceList);

            // Get list of PDB Unique IDs
            List <string> pdbIdList = FilterProteins.SequenceListToPdbIdList(sequenceList);

            // Check PDB Unique IDs were successfully loaded
            if (pdbIdList == null || pdbIdList.Count == 0)
            {
                //throw new Exception("PDB ID List is empty or could not be loaded.");

                ProgressActionSet.Report("Error: Sequence list could not be loaded", progressActionSet);
                return;
            }

            // 3: Get a list of PDB files found in user specified directory

            string[] pdbFilesArray = ProteinDataBankFileOperations.RemoveNonWhiteListedPdbIdFromPdbFilesArray(pdbIdList, ProteinDataBankFileOperations.GetPdbFilesArray(pdbFileDirectoryLocationArray));

            // Check all PDB files are found
            List <string> missingPdbFilesList = ProteinDataBankFileOperations.CheckForMissingPdbFiles(pdbFilesArray, pdbIdList);

            if (missingPdbFilesList != null && missingPdbFilesList.Count > 0)
            {
                ProgressActionSet.Report("Missing PDB Files: " + string.Join(", ", missingPdbFilesList), progressActionSet);
            }



            ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet);


            int progressIncrement = 0;

            var proteinInterfacesCountResultWithLengths = new Dictionary <string, List <string> >();

            var startTicks = DateTime.Now.Ticks;

            // 4: Loop through each pdb file
            for (int pdbFileNumber = 0; pdbFileNumber < pdbFilesArray.Length + 1; pdbFileNumber++) // +1 is for progress update
            {
                if (progressIncrement > 0)
                {
                    ProgressActionSet.ProgressAction(progressIncrement, progressActionSet);
                    progressIncrement = 0;
                    if (pdbFileNumber >= pdbFilesArray.Length)
                    {
                        break;
                    }
                }
                ProgressActionSet.EstimatedTimeRemainingAction(startTicks, pdbFileNumber, pdbFilesArray.Length, progressActionSet);

                progressIncrement++;

                // get unique id of pdb file
                string pdbFilename = pdbFilesArray[pdbFileNumber];
                string proteinId   = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename);

                // check pdb unique id was in the loaded sequence list
                if (!pdbIdList.Contains(proteinId))
                {
                    continue;
                }

                // perform clustering to detect interaction proteinInterfaces
                ClusterProteinDataBankFileResult clusterPdbFileResult = Clustering.ClusterProteinDataBankFile(cancellationToken, maxAtomInterationDistance, minimumProteinInterfaceDensity, pdbFilename, pdbIdChainIdList, ClusteringMethodOptions.ClusterWithResidueSequenceIndex, -1, -1, null);

                if (clusterPdbFileResult == null)
                {
                    continue;
                }

                int[] proteinInterfacesCount = new int[clusterPdbFileResult.ClusteringFullResultListContainer.ChainList.Count];

                for (int chainIndex = 0; chainIndex < clusterPdbFileResult.ClusteringFullResultListContainer.ChainList.Count; chainIndex++)
                {
                    int totalProteinInterfaces = clusterPdbFileResult.ProteinInterfaceAnalysisResultData.ProteinInterfacesSequenceAndPositionDataList.Count(proteinInterface => proteinInterface.FullProteinInterfaceId.ChainId == chainIndex);

                    proteinInterfacesCount[chainIndex] = totalProteinInterfaces;
                }

                var proteinInterfacesCountStr = string.Join(" ", proteinInterfacesCount.OrderBy(x => x));

                List <ProteinInterfaceSequenceAndPositionData> proteinInterfaces = clusterPdbFileResult.ProteinInterfaceAnalysisResultData.ProteinInterfacesSequenceAndPositionDataList;
                int[] proteinInterfacesLength = new int[proteinInterfaces.Count];

                for (int index = 0; index < proteinInterfaces.Count; index++)
                {
                    ProteinInterfaceSequenceAndPositionData proteinInterface = proteinInterfaces[index];

                    proteinInterfacesLength[index] = proteinInterface.ProteinInterfaceLength;
                }

                var proteinInterfacesLengthStr = string.Join(" ", proteinInterfacesLength.Distinct().OrderBy(x => x));

                if (proteinInterfacesLength.Length == 0)
                {
                    proteinInterfacesLengthStr = 0.ToString();
                }

                var chainsCountStr = clusterPdbFileResult.ClusteringFullResultListContainer.ChainList.Count;

                if (filterProteinInterfaceCountsWithoutLengths)
                {
                    var combinedKeyAll = "chains [" + chainsCountStr + "] proteinInterfaces [" + proteinInterfacesCountStr + "]";

                    if (!proteinInterfacesCountResultWithLengths.ContainsKey(combinedKeyAll))
                    {
                        proteinInterfacesCountResultWithLengths.Add(combinedKeyAll, new List <string>());
                    }

                    proteinInterfacesCountResultWithLengths[combinedKeyAll].Add(proteinId);
                }

                if (filterProteinInterfaceCountsWithLengths)
                {
                    var combinedKeyWithLengths = "chains [" + chainsCountStr + "] proteinInterfaces [" + proteinInterfacesCountStr + "] lengths [" + proteinInterfacesLengthStr + "]";

                    if (!proteinInterfacesCountResultWithLengths.ContainsKey(combinedKeyWithLengths))
                    {
                        proteinInterfacesCountResultWithLengths.Add(combinedKeyWithLengths, new List <string>());
                    }

                    proteinInterfacesCountResultWithLengths[combinedKeyWithLengths].Add(proteinId);
                }
            }

            var confirmSaveList = new List <string>();

            foreach (var kvp in proteinInterfacesCountResultWithLengths)
            {
                var seq2 = new List <ISequence>(sequenceList);
                seq2 = FilterProteins.RemoveSequences(cancellationToken, seq2, kvp.Value, FilterProteins.RemoveSequencesOptions.RemoveSequencesNotInList);

                var saveFilename = filterProteinInterfacesLengthOutputFilename;
                saveFilename = saveFilename.Replace(proteinInterfacesTemplateText, kvp.Key);

                var actualSavedFilename = SequenceFileHandler.SaveSequencesAsFasta(seq2, saveFilename, true, fileExistsOptions, progressActionSet);

                if (!string.IsNullOrWhiteSpace(actualSavedFilename))
                {
                    confirmSaveList.Add(actualSavedFilename);
                }
            }

            // Confirm the total number of sequences saved is equal to original number loaded
            ConfirmSequencesSaved(pdbIdList, confirmSaveList, progressActionSet);

            ProgressActionSet.FinishAction(true, progressActionSet);
        }
        /*
         * private static int[] LastPdbChainResidueIndexes(string pdbFilename)
         * {
         *  //var result = new Dictionary<string,int>();
         *  var pdbFile = new ProteinDataBankFile(pdbFilename, new []{ ATOM_Record.ATOM_Field.FieldName });
         *
         *  //var x = ProteinDataBankFileOperations.PdbAtomAcidList();
         *
         *  var atomList = pdbFile.ProteinDataBankFileRecordList.Where(a => a.GetType() == typeof (ATOM_Record)).Select(a=>(ATOM_Record)a).ToList();
         *
         *  var chainIdList = atomList.Select(a=>a.chainID.FieldValue.ToUpperInvariant()).Distinct().ToList();
         *
         *  var result = new int[chainIdList.Count];
         *
         *  for (int index = 0; index < chainIdList.Count; index++)
         *  {
         *      var chainId = chainIdList[index];
         *      var maxResidueIndex = atomList.Where(a => a.chainID.FieldValue.ToUpperInvariant() == chainId).Select(a => int.Parse(a.resSeq.FieldValue)).Max();
         *
         *      result[index] = maxResidueIndex;
         *  }
         *
         *  return result;
         * }
         */

        private static VectorProteinInterfaceWhole MakeVectorProteinInterfaceWhole(string pdbFilename, ProteinInterfaceSequenceAndPositionData proteinInterfaceSequenceAndPositionData, bool reversedSequence, bool reversedInteractions)
        {
            if (pdbFilename == null)
            {
                throw new ArgumentNullException(nameof(pdbFilename));
            }
            if (proteinInterfaceSequenceAndPositionData == null)
            {
                throw new ArgumentNullException(nameof(proteinInterfaceSequenceAndPositionData));
            }

            ProteinInterfaceAminoAcidMetaData[] proteinInterfaceAminoAcidMetaDataArray = proteinInterfaceSequenceAndPositionData.AminoAcidSequenceAllResidueSequenceIndexes;

            var vectorProteinInterfaceWhole = new VectorProteinInterfaceWhole
            {
                FullProteinInterfaceId    = new FullProteinInterfaceId(proteinInterfaceSequenceAndPositionData.FullProteinInterfaceId),
                ProteinInterfaceLength    = proteinInterfaceSequenceAndPositionData.ProteinInterfaceLength,
                FirstResidueSequenceIndex = proteinInterfaceSequenceAndPositionData.StartPosition,
                LastResidueSequenceIndex  = proteinInterfaceSequenceAndPositionData.EndPosition,
                ReversedInteractions      = reversedInteractions,
                ReversedSequence          = reversedSequence,
            };

            //vectorProteinInterfaceWhole.FullSequenceLength = LastPdbChainResidueIndexes(pdbFilename)[vectorProteinInterfaceWhole.FullProteinInterfaceId.ChainId];

            vectorProteinInterfaceWhole.SecondaryStructure = ProteinInterfaceSecondaryStructureLoader.ProteinInterfaceSecondaryStructure(pdbFilename, SpreadsheetFileHandler.AlphabetLetterRollOver(vectorProteinInterfaceWhole.FullProteinInterfaceId.ChainId), vectorProteinInterfaceWhole.FirstResidueSequenceIndex, vectorProteinInterfaceWhole.LastResidueSequenceIndex, vectorProteinInterfaceWhole.ReversedSequence);

            for (int proteinInterfaceAminoAcidMetaDataArrayIndex = 0; proteinInterfaceAminoAcidMetaDataArrayIndex < proteinInterfaceAminoAcidMetaDataArray.Length; proteinInterfaceAminoAcidMetaDataArrayIndex++)
            {
                ProteinInterfaceAminoAcidMetaData proteinInterfaceAminoAcidMetaData = proteinInterfaceAminoAcidMetaDataArray[proteinInterfaceAminoAcidMetaDataArrayIndex];

                var vectorProteinInterfacePart = new VectorProteinInterfacePart(proteinInterfaceAminoAcidMetaData.OppoproteinInterfaceInteractions.Length)
                {
                    FullProteinInterfaceId = new FullProteinInterfaceId(proteinInterfaceSequenceAndPositionData.FullProteinInterfaceId),
                    ResidueId               = proteinInterfaceAminoAcidMetaDataArrayIndex,
                    SourceAminoAcid1L       = proteinInterfaceAminoAcidMetaData.ResidueName1L,
                    SourceAminoAcid3L       = proteinInterfaceAminoAcidMetaData.ResidueName3L,
                    InteractionAminoAcids1L = proteinInterfaceAminoAcidMetaData.ProteinInterfaceInteractionResidueNames1L,
                    InteractionNonProteinInterfaceAminoAcids1L = proteinInterfaceAminoAcidMetaData.NonProteinInterfaceInteractionResidueNames1L,
                    InteractionFlagBools = new bool[proteinInterfaceAminoAcidMetaData.OppoproteinInterfaceInteractions.Length]
                };

                vectorProteinInterfaceWhole.VectorProteinInterfacePartList.Add(vectorProteinInterfacePart);

                Array.Copy(proteinInterfaceAminoAcidMetaData.OppoproteinInterfaceInteractions, vectorProteinInterfacePart.InteractionFlagBools, proteinInterfaceAminoAcidMetaData.OppoproteinInterfaceInteractions.Length);

                if (reversedInteractions)
                {
                    Array.Reverse(vectorProteinInterfacePart.InteractionFlagBools);
                }

                vectorProteinInterfacePart.InteractionToNonProteinInterface = proteinInterfaceAminoAcidMetaData.ProteinInterfaceInteractionType.HasFlag(ProteinInterfaceInteractionType.InteractionWithNonProteinInterface);
            }

            if (vectorProteinInterfaceWhole.ReversedSequence)
            {
                vectorProteinInterfaceWhole.VectorProteinInterfacePartList.Reverse();
            }

            return(vectorProteinInterfaceWhole);
        }