Ejemplo n.º 1
0
        /// <summary>
        ///     Groups the given interface vectors by a pattern string (selected through <paramref name="vectorType"/>)
        ///     and produces four spreadsheet records per pattern, in this order: whole protein sequence ("Protein"),
        ///     interface ("ProteinInterface"), interacting interface positions ("ProteinInterface interactions") and
        ///     non-interacting interface positions ("ProteinInterface non interactions").
        ///     Each record carries the sample count, the total amino acid count and the distribution returned by
        ///     <c>AminoAcidChainComposition.ConvertToPercentage</c>.
        /// </summary>
        /// <param name="vectorProteinInterfaceWholeList">The interface vectors to aggregate.</param>
        /// <param name="pdbIdList">Validated for null only; not otherwise used by this method.</param>
        /// <param name="seqList">
        ///     Sequences searched for the chain each interface belongs to; every matching sequence contributes
        ///     its residues to the "Protein" record of the interface's pattern.
        /// </param>
        /// <param name="vectorType">
        ///     Pattern selector: 0 to 3 uses <c>VectorString(vectorType)</c>, 4 uses <c>SecondaryStructure</c>,
        ///     5 uses "ProteinInterface Length " followed by <c>ProteinInterfaceLength</c>.
        /// </param>
        /// <returns>Four records per distinct pattern; an empty list when no interface vectors were given.</returns>
        /// <exception cref="ArgumentNullException">When any of the three list arguments is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        ///     When <paramref name="vectorType"/> is outside 0 to 5. The check runs per interface vector, so it is
        ///     only raised when <paramref name="vectorProteinInterfaceWholeList"/> is not empty.
        /// </exception>
        public static List<AminoAcidDistributionSpreadsheetRecord> PatternDistributionSpreadsheetRecords(List<VectorProteinInterfaceWhole> vectorProteinInterfaceWholeList, List<string> pdbIdList, List<ISequence> seqList, int vectorType)
        {
            if (vectorProteinInterfaceWholeList == null)
            {
                throw new ArgumentNullException(nameof(vectorProteinInterfaceWholeList));
            }
            if (pdbIdList == null)
            {
                throw new ArgumentNullException(nameof(pdbIdList));
            }
            if (seqList == null)
            {
                throw new ArgumentNullException(nameof(seqList));
            }

            var result = new List<AminoAcidDistributionSpreadsheetRecord>();

            // One composition accumulator per pattern and per metric.
            var patternProteinDictionary                        = new Dictionary<string, AminoAcidChainComposition>();
            var patternProteinInterfaceDictionary               = new Dictionary<string, AminoAcidChainComposition>();
            var patternProteinInterfaceInteractionDictionary    = new Dictionary<string, AminoAcidChainComposition>();
            var patternProteinInterfaceNonInteractionDictionary = new Dictionary<string, AminoAcidChainComposition>();

            foreach (var vectorProteinInterfaceWhole in vectorProteinInterfaceWholeList)
            {
                var interactionBools = vectorProteinInterfaceWhole.InteractionBools();

                string pattern;
                if (vectorType >= 0 && vectorType <= 3)
                {
                    pattern = vectorProteinInterfaceWhole.VectorString(vectorType);
                }
                else if (vectorType == 4)
                {
                    pattern = vectorProteinInterfaceWhole.SecondaryStructure;
                }
                else if (vectorType == 5)
                {
                    pattern = "ProteinInterface Length " + vectorProteinInterfaceWhole.ProteinInterfaceLength;
                }
                else
                {
                    throw new ArgumentOutOfRangeException(nameof(vectorType));
                }

                // Resolve each accumulator once instead of re-indexing the dictionaries for every residue.
                var proteinComposition        = GetOrAddPatternComposition(patternProteinDictionary, pattern);
                var interfaceComposition      = GetOrAddPatternComposition(patternProteinInterfaceDictionary, pattern);
                var interactionComposition    = GetOrAddPatternComposition(patternProteinInterfaceInteractionDictionary, pattern);
                var nonInteractionComposition = GetOrAddPatternComposition(patternProteinInterfaceNonInteractionDictionary, pattern);

                var aminoAcids1L = vectorProteinInterfaceWhole.ProteinInterfaceAminoAcids1L();

                // NOTE(review): interactionBools is indexed with the same index as aminoAcids1L, so it is
                // assumed to hold at least as many entries - confirm against InteractionBools().
                for (int index = 0; index < aminoAcids1L.Length; index++)
                {
                    var c = aminoAcids1L[index];

                    interfaceComposition.IncrementAminoAcidCount(c);

                    if (interactionBools[index])
                    {
                        interactionComposition.IncrementAminoAcidCount(c);
                    }
                    else
                    {
                        nonInteractionComposition.IncrementAminoAcidCount(c);
                    }
                }

                proteinComposition.NumberSamples++;
                interfaceComposition.NumberSamples++;
                interactionComposition.NumberSamples++;
                nonInteractionComposition.NumberSamples++;

                // Sequences whose id resolves to the same protein id and chain as this interface.
                // The sequence id is parsed once per sequence; the chain letter is still only computed
                // when the protein id matches.
                var bsSeqList = seqList.Where(a =>
                {
                    var parsedId = SequenceIdSplit.SequenceIdToPdbIdAndChainId(a.ID);
                    return parsedId.PdbId == vectorProteinInterfaceWhole.FullProteinInterfaceId.ProteinId
                           && parsedId.ChainId == SpreadsheetFileHandler.AlphabetLetterRollOver(vectorProteinInterfaceWhole.FullProteinInterfaceId.ChainId);
                }).ToList();

                foreach (var chain in bsSeqList)
                {
                    var seq = chain.ConvertToString();
                    foreach (var c in seq)
                    {
                        proteinComposition.IncrementAminoAcidCount(c);
                    }
                }
            }

            // Nothing was aggregated: return before touching VectorStringDescription, which is only
            // needed (and was only ever called) when there is at least one pattern.
            if (patternProteinDictionary.Count == 0)
            {
                return result;
            }

            // Loop-invariant values for the record building below.
            var vectorTypeStr         = VectorProteinInterfaceWhole.VectorStringDescription(vectorType);
            int aminoAcidsGroupIndex  = (int)AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups.AminoAcids;
            var aminoAcidGroupsValues = Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups));

            foreach (var kvp in patternProteinDictionary)
            {
                var pattern = kvp.Key;

                var proteinComposition        = kvp.Value;
                var interfaceComposition      = patternProteinInterfaceDictionary[pattern];
                var interactionComposition    = patternProteinInterfaceInteractionDictionary[pattern];
                var nonInteractionComposition = patternProteinInterfaceNonInteractionDictionary[pattern];

                var recordAllComposition = new AminoAcidDistributionSpreadsheetRecord
                {
                    Pattern         = pattern,
                    Type            = vectorTypeStr,
                    Metric          = "Protein",
                    NumberOfSamples = proteinComposition.NumberSamples,
                    TotalAminoAcids = proteinComposition.AminoAcidGroupsCount[aminoAcidsGroupIndex].Sum()
                };

                var recordProteinInterfaceComposition = new AminoAcidDistributionSpreadsheetRecord
                {
                    Pattern         = pattern,
                    Type            = vectorTypeStr,
                    Metric          = "ProteinInterface",
                    NumberOfSamples = interfaceComposition.NumberSamples,
                    TotalAminoAcids = interfaceComposition.AminoAcidGroupsCount[aminoAcidsGroupIndex].Sum()
                };

                var recordProteinInterfaceInteractionComposition = new AminoAcidDistributionSpreadsheetRecord
                {
                    Pattern         = pattern,
                    Type            = vectorTypeStr,
                    Metric          = "ProteinInterface interactions",
                    NumberOfSamples = interactionComposition.NumberSamples,
                    TotalAminoAcids = interactionComposition.AminoAcidGroupsCount[aminoAcidsGroupIndex].Sum()
                };

                var recordProteinInterfaceNonInteractionComposition = new AminoAcidDistributionSpreadsheetRecord
                {
                    Pattern         = pattern,
                    Type            = vectorTypeStr,
                    Metric          = "ProteinInterface non interactions",
                    NumberOfSamples = nonInteractionComposition.NumberSamples,
                    TotalAminoAcids = nonInteractionComposition.AminoAcidGroupsCount[aminoAcidsGroupIndex].Sum()
                };

                var allPercentage                            = AminoAcidChainComposition.ConvertToPercentage(proteinComposition);
                var proteinInterfacePercentage               = AminoAcidChainComposition.ConvertToPercentage(interfaceComposition);
                var proteinInterfaceInteractionPercentage    = AminoAcidChainComposition.ConvertToPercentage(interactionComposition);
                var proteinInterfaceNonInteractionPercentage = AminoAcidChainComposition.ConvertToPercentage(nonInteractionComposition);

                // Copy every subgroup value of every amino acid grouping into the four records.
                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in aminoAcidGroupsValues)
                {
                    var groupIndex      = (int)enumAminoAcidGroups;
                    var groupItemsTotal = AminoAcidGroups.AminoAcidGroups.GetTotalSubgroups(enumAminoAcidGroups);

                    for (var groupItemIndex = 0; groupItemIndex < groupItemsTotal; groupItemIndex++)
                    {
                        recordAllComposition.Distribution[groupIndex][groupItemIndex]                            = allPercentage.AminoAcidGroupsCount[groupIndex][groupItemIndex];
                        recordProteinInterfaceComposition.Distribution[groupIndex][groupItemIndex]               = proteinInterfacePercentage.AminoAcidGroupsCount[groupIndex][groupItemIndex];
                        recordProteinInterfaceInteractionComposition.Distribution[groupIndex][groupItemIndex]    = proteinInterfaceInteractionPercentage.AminoAcidGroupsCount[groupIndex][groupItemIndex];
                        recordProteinInterfaceNonInteractionComposition.Distribution[groupIndex][groupItemIndex] = proteinInterfaceNonInteractionPercentage.AminoAcidGroupsCount[groupIndex][groupItemIndex];
                    }
                }

                result.Add(recordAllComposition);
                result.Add(recordProteinInterfaceComposition);
                result.Add(recordProteinInterfaceInteractionComposition);
                result.Add(recordProteinInterfaceNonInteractionComposition);
            }

            return result;
        }

        /// <summary>
        ///     Returns the composition stored under <paramref name="pattern"/>, first adding a new empty
        ///     composition when the dictionary has no entry for it.
        /// </summary>
        private static AminoAcidChainComposition GetOrAddPatternComposition(Dictionary<string, AminoAcidChainComposition> compositionsByPattern, string pattern)
        {
            AminoAcidChainComposition composition;
            if (!compositionsByPattern.TryGetValue(pattern, out composition))
            {
                composition = new AminoAcidChainComposition();
                compositionsByPattern.Add(pattern, composition);
            }

            return composition;
        }
        /// <summary>
        ///     Loads the sequences from the given sequence files, clusters every PDB file found in the given
        ///     directories whose PDB id occurs in those sequences, and builds two interface vectors
        ///     (one with the third <c>MakeVectorProteinInterfaceWhole</c> flag false, one with it true)
        ///     for each protein interface reported by the clustering.
        /// </summary>
        /// <param name="cancellationToken">Forwarded to <c>Clustering.ClusterProteinDataBankFile</c>; not inspected here.</param>
        /// <param name="maxAtomInterationDistance">Forwarded to the clustering call.</param>
        /// <param name="minimumProteinInterfaceDensity">Forwarded to the clustering call.</param>
        /// <param name="sequenceListFileArray">Sequence files passed to <c>SequenceFileHandler.LoadSequenceFileList</c>.</param>
        /// <param name="pdbFileDirectoryLocationArray">Directories passed to <c>ProteinDataBankFileOperations.GetPdbFilesArray</c>.</param>
        /// <param name="progressActionSet">Receives the start, per-file progress, estimated-time and finish notifications.</param>
        /// <returns>
        ///     A flat list of interface vectors ordered by protein id, then chain id, then protein interface id.
        /// </returns>
        /// <exception cref="ArgumentNullException">When either array argument is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">When no PDB ids could be derived from the loaded sequences.</exception>
        public static List<VectorProteinInterfaceWhole> LoadProteinInterfaceVectorFromFiles(
            CancellationToken cancellationToken,
            decimal maxAtomInterationDistance,
            decimal minimumProteinInterfaceDensity,
            string[] sequenceListFileArray,
            string[] pdbFileDirectoryLocationArray,
            ProgressActionSet progressActionSet)
        {
            if (sequenceListFileArray == null)
            {
                throw new ArgumentNullException(nameof(sequenceListFileArray));
            }
            if (pdbFileDirectoryLocationArray == null)
            {
                throw new ArgumentNullException(nameof(pdbFileDirectoryLocationArray));
            }

            var vectorProteinInterfaceWholeList = new List<VectorProteinInterfaceWhole>();

            // 1: Load the sequences (the input files are expected to have been cleaned already so that they
            //    only contain symmetrical homodimers, possibly with several protein interfaces per chain).
            List<ISequence> sequenceList = SequenceFileHandler.LoadSequenceFileList(sequenceListFileArray, StaticValues.MolNameProteinAcceptedValues);

            var pdbIdChainIdList = ProteinDataBankFileOperations.PdbIdChainIdList(sequenceList);

            // 2: Get a list of the unique ids for the sequences
            List<string> pdbIdList = FilterProteins.SequenceListToPdbIdList(sequenceList);

            if (pdbIdList == null || pdbIdList.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequenceListFileArray), "Error loading PDB ID list");
            }

            // Hash set with the default string comparer: same membership semantics as List.Contains,
            // without a linear scan for every PDB file.
            var pdbIdSet = new HashSet<string>(pdbIdList);

            // 3: Get a list of PDB files found in user specified directory
            string[] pdbFilesArray = ProteinDataBankFileOperations.GetPdbFilesArray(pdbFileDirectoryLocationArray);

            ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet);

            var startTicks = DateTime.Now.Ticks;

            // 4: Loop through each pdb file
            for (int pdbFileNumber = 0; pdbFileNumber < pdbFilesArray.Length; pdbFileNumber++)
            {
                ProgressActionSet.ProgressAction(1, progressActionSet);

                // +1 because the progress update counts the current file as well
                ProgressActionSet.EstimatedTimeRemainingAction(startTicks, pdbFileNumber + 1, pdbFilesArray.Length, progressActionSet);

                // get unique id of pdb file
                string pdbFilename = pdbFilesArray[pdbFileNumber];
                string proteinId   = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename);

                // skip pdb files whose unique id was not in the loaded sequence list
                if (!pdbIdSet.Contains(proteinId))
                {
                    continue;
                }

                ClusterProteinDataBankFileResult clusterPdbFileResult = Clustering.ClusterProteinDataBankFile(cancellationToken, maxAtomInterationDistance, minimumProteinInterfaceDensity, pdbFilename, pdbIdChainIdList, ClusteringMethodOptions.ClusterWithResidueSequenceIndex, -1, -1, progressActionSet);

                // no clustering result for this file: nothing to add
                if (clusterPdbFileResult == null)
                {
                    continue;
                }

                List<ProteinInterfaceSequenceAndPositionData> proteinInterfaceSequenceAndPositionDataList = clusterPdbFileResult.ProteinInterfaceAnalysisResultData.ProteinInterfacesSequenceAndPositionDataList
                    .OrderBy(a => a.FullProteinInterfaceId.ProteinId)
                    .ThenBy(a => a.FullProteinInterfaceId.ChainId)
                    .ThenBy(a => a.FullProteinInterfaceId.ProteinInterfaceId)
                    .ToList();

                foreach (ProteinInterfaceSequenceAndPositionData proteinInterfaceSequenceAndPositionData in proteinInterfaceSequenceAndPositionDataList)
                {
                    // First loaded sequence whose id matches this interface's protein id and chain letter,
                    // compared case-insensitively via invariant upper-casing.
                    var seq = sequenceList.FirstOrDefault(a =>
                    {
                        var p = SequenceIdSplit.SequenceIdToPdbIdAndChainId(a.ID);
                        return p.PdbId.ToUpperInvariant() == proteinInterfaceSequenceAndPositionData.FullProteinInterfaceId.ProteinId.ToUpperInvariant()
                               && p.ChainId.ToUpperInvariant() == proteinInterfaceSequenceAndPositionData.ChainIdLetter.ToUpperInvariant();
                    });

                    // -1 marks "no matching sequence found"
                    var seqLen = seq != null ? seq.Count : -1;

                    // NOTE(review): the third argument is presumably the "reversed" flag (false = forward,
                    // true = reverse), judging by the original variable names - confirm against
                    // MakeVectorProteinInterfaceWhole.
                    var vectorProteinInterfaceWholeFwd = MakeVectorProteinInterfaceWhole(pdbFilename, proteinInterfaceSequenceAndPositionData, false, false);
                    vectorProteinInterfaceWholeFwd.FullSequenceLength = seqLen;

                    vectorProteinInterfaceWholeList.Add(vectorProteinInterfaceWholeFwd);

                    var vectorProteinInterfaceWholeRev = MakeVectorProteinInterfaceWhole(pdbFilename, proteinInterfaceSequenceAndPositionData, true, false);
                    vectorProteinInterfaceWholeRev.FullSequenceLength = seqLen;

                    vectorProteinInterfaceWholeList.Add(vectorProteinInterfaceWholeRev);
                }
            }

            ProgressActionSet.FinishAction(true, progressActionSet);

            vectorProteinInterfaceWholeList = vectorProteinInterfaceWholeList
                .OrderBy(a => a.FullProteinInterfaceId.ProteinId)
                .ThenBy(a => a.FullProteinInterfaceId.ChainId)
                .ThenBy(a => a.FullProteinInterfaceId.ProteinInterfaceId)
                .ToList();

            return vectorProteinInterfaceWholeList;
        }