Exemple #1
0
        /// <summary>
        /// Loads sequences from a structure file: reads the file's atomic chains and its PDB id
        /// via <c>ProteinBioClass</c>, then delegates to the <c>LoadStructureFile</c> overload that
        /// works on already-parsed chains.
        /// </summary>
        /// <param name="atomsFilename">Path of the structure file to read.</param>
        /// <param name="chainIdWhiteList">Chain ids handed both to the chain reader and to the delegated overload; may be null.</param>
        /// <param name="padMissingBool">Forwarded unchanged to the delegated overload.</param>
        /// <param name="startResSeq">Forwarded unchanged to the delegated overload; may be null.</param>
        /// <param name="endResSeq">Forwarded unchanged to the delegated overload; may be null.</param>
        /// <param name="outsidePaddingChar">Forwarded unchanged to the delegated overload.</param>
        /// <param name="insidePaddingChar">Forwarded unchanged to the delegated overload.</param>
        /// <returns>Whatever the delegated overload returns for the parsed chains.</returns>
        public static List<Sequence> LoadStructureFile(string atomsFilename, char[] chainIdWhiteList = null, bool padMissingBool = true, int[] startResSeq = null, int[] endResSeq = null, char outsidePaddingChar = ' ', char insidePaddingChar = 'X')
        {
            // NOTE(review): the -1, -1, true arguments presumably mean "no residue range limit" plus a
            // boolean option of PdbAtomicChains - confirm against that method's signature.
            var atomicChains = ProteinBioClass.PdbAtomicChains(atomsFilename, chainIdWhiteList, -1, -1, true);
            var structureId = ProteinBioClass.PdbIdFromPdbFilename(atomsFilename);

            return LoadStructureFile(
                atomicChains,
                structureId,
                chainIdWhiteList,
                padMissingBool,
                startResSeq,
                endResSeq,
                outsidePaddingChar,
                insidePaddingChar);
        }
        /// <summary>
        /// Clusters the distinct (escaped) amino-acid sequences of <paramref name="seqList"/> by pairwise
        /// aligned similarity and returns one cluster member per input sequence.
        /// </summary>
        /// <remarks>
        /// Every distinct sequence starts in its own cluster. Two clusters are merged as soon as any one
        /// pair of sequences, one from each cluster, reaches both similarity thresholds. Each pair is
        /// scored with the NON alignment type first; only while a threshold is still missed are the SIM
        /// and then the NMW alignment types tried, keeping the best value seen so far for each of the two
        /// scores independently. Pairs whose shorter/longer length ratio is below
        /// <paramref name="mininumClusterPairwiseSimilarity"/> are skipped without aligning.
        /// Progress and per-pair scores are written to the console.
        /// </remarks>
        /// <param name="seqList">Sequences to cluster; their ids are parsed with <c>ProteinBioClass.SequenceId</c>.</param>
        /// <param name="alignmentIdentityOption">Passed through to <c>ProteinBioClass.AlignedSequenceSimilarityPercentage</c>.</param>
        /// <param name="mininumClusterPairwiseSimilarity">Minimum <c>Score</c> for a pair to join clusters; also used as the length-ratio pre-filter.</param>
        /// <param name="mininumEvoClusterPairwiseSimilarity">Minimum <c>ScoreEvo</c> for a pair to join clusters.</param>
        /// <returns>
        /// One <c>SequenceIdentityClusterMember</c> per input sequence. Cluster numbers are 1-based and
        /// assigned after ordering clusters by ascending count of distinct sequences.
        /// </returns>
        public static List<SequenceIdentityClusterMember> ClusterSequenceByAlignedSequenceIdentity(List<Sequence> seqList, ProteinBioClass.AlignmentIdentityOption alignmentIdentityOption, decimal mininumClusterPairwiseSimilarity = 0.3m, decimal mininumEvoClusterPairwiseSimilarity = 0.3m)
        {
            // One (pdb id, chain id, escaped sequence) tuple per input sequence; the id is parsed once.
            var allsequences = seqList.Select(a =>
            {
                var sequenceId = new ProteinBioClass.SequenceId(a.Id);
                return new Tuple<string, char, string>(sequenceId.PdbId, sequenceId.ChainId, Sequence.EscapeAminoAcidSequence(a.FullSequence));
            }).ToList();

            // Distinct sequence strings, and for each of them (same index) every input that carries it.
            var sequences = allsequences.Select(a => a.Item3).Distinct().ToList();
            var sequenceIds = sequences.Select(a => allsequences.Where(b => b.Item3 == a).ToList()).ToList();

            // Start with one singleton cluster per distinct sequence.
            var seqClusters = sequences.Select(a => new List<string> { a }).ToList();

            // Alignment types tried, in order, only while the pair still misses a threshold.
            var fallbackAlignmentTypes = new[] { ProteinBioClass.AlignmentType.SIM, ProteinBioClass.AlignmentType.NMW };

            for (int indexX = 0; indexX < sequences.Count; indexX++)
            {
                Console.WriteLine("Aligning sequence " + indexX);
                var seqX = sequences[indexX];

                // Only pairs with indexY > indexX are needed; each unordered pair is visited once.
                for (int indexY = indexX + 1; indexY < sequences.Count; indexY++)
                {
                    var seqY = sequences[indexY];

                    // Cheap pre-filter on the length ratio before running any alignment.
                    if ((decimal)Math.Min(seqX.Length, seqY.Length) / (decimal)Math.Max(seqX.Length, seqY.Length) < mininumClusterPairwiseSimilarity)
                    {
                        continue;
                    }

                    var cluster1 = seqClusters.FirstOrDefault(a => a.Contains(seqX));
                    var cluster2 = seqClusters.FirstOrDefault(a => a.Contains(seqY));

                    // Already merged through an earlier pair: nothing to gain from aligning.
                    if (cluster1 != null && cluster2 != null && cluster1 == cluster2)
                    {
                        continue;
                    }

                    var score = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.NON, alignmentIdentityOption);

                    Console.WriteLine("1: " + seqX);
                    Console.WriteLine("2: " + seqY);
                    Console.WriteLine("Score1: " + score.Score);
                    Console.WriteLine("Score2: " + score.ScoreEvo);

                    foreach (var fallbackAlignmentType in fallbackAlignmentTypes)
                    {
                        if (score.Score >= mininumClusterPairwiseSimilarity && score.ScoreEvo >= mininumEvoClusterPairwiseSimilarity)
                        {
                            break;
                        }

                        var fallbackScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, fallbackAlignmentType, alignmentIdentityOption);

                        // Keep the best of each component independently. Replacing the whole result
                        // (as the NMW step used to do) could discard a higher ScoreEvo found earlier.
                        if (fallbackScore.Score > score.Score)
                        {
                            score.Score = fallbackScore.Score;
                        }
                        if (fallbackScore.ScoreEvo > score.ScoreEvo)
                        {
                            score.ScoreEvo = fallbackScore.ScoreEvo;
                        }
                    }

                    if (score.Score >= mininumClusterPairwiseSimilarity && score.ScoreEvo >= mininumEvoClusterPairwiseSimilarity)
                    {
                        // Merge: members of cluster1 first, then cluster2; the merged cluster goes to the end.
                        var newCluster = new List<string>();

                        newCluster.AddRange(cluster1);
                        newCluster.AddRange(cluster2);

                        seqClusters.Remove(cluster1);
                        seqClusters.Remove(cluster2);

                        seqClusters.Add(newCluster);
                    }
                }
            }

            // Smallest clusters get the lowest cluster numbers.
            seqClusters = seqClusters.OrderBy(a => a.Count).ToList();

            var output = new List<SequenceIdentityClusterMember>();

            for (var index = 0; index < seqClusters.Count; index++)
            {
                foreach (var item in seqClusters[index])
                {
                    // Expand each distinct sequence back into every input sequence that carries it.
                    var ids = sequenceIds[sequences.IndexOf(item)];

                    foreach (var id in ids)
                    {
                        output.Add(new SequenceIdentityClusterMember(index + 1, ProteinBioClass.PdbIdFromPdbFilename(id.Item1), id.Item2, id.Item3));
                    }
                }
            }

            return output;
        }
        /// <summary>
        /// Reads the DSSP file belonging to <paramref name="pdbFilename"/> and returns its secondary
        /// structure as a string with one character per residue number in the requested range.
        /// </summary>
        /// <remarks>
        /// Positions with no usable DSSP record stay <c>'_'</c>. When <paramref name="chainId"/> is null
        /// records of all chains are written into the same positions, so a later record with the same
        /// residue number overwrites an earlier one.
        /// </remarks>
        /// <param name="pdbFilename">File name; <c>".dssp"</c> is appended unless the extension already is exactly <c>".dssp"</c>.</param>
        /// <param name="chainId">Chain to read (compared case-insensitively), or null for every chain.</param>
        /// <param name="startResidueSequenceIndex">First residue number of the range; -1 means the smallest parsable residue number of the selected records.</param>
        /// <param name="endResidueSequenceIndex">Last residue number of the range; -1 means the largest parsable residue number of the selected records.</param>
        /// <param name="reversedSequence">When true the resulting characters are returned in reverse order.</param>
        /// <returns>
        /// The secondary structure string, or an empty string when the file name is blank, the DSSP file
        /// does not exist, the requested chain is not present, or a range bound has to be derived but no
        /// selected record has a parsable residue number.
        /// </returns>
        public static string LoadDsspStructureSequence(string pdbFilename, string chainId = null, int startResidueSequenceIndex = -1, int endResidueSequenceIndex = -1, bool reversedSequence = false)
        {
            if (string.IsNullOrWhiteSpace(pdbFilename))
            {
                return "";
            }

            var dsspFilename = pdbFilename;

            if (Path.GetExtension(dsspFilename) != ".dssp")
            {
                dsspFilename += ".dssp";
            }

            if (!File.Exists(dsspFilename))
            {
                return "";
            }

            var secondaryStructure = DsspFormatFile.LoadDsspFile(dsspFilename);

            // Records of the requested chain (or of every chain when no chain was requested).
            var chainRecords = secondaryStructure
                .Where(a => chainId == null || string.Equals(a.FieldChain.FieldValue, chainId, StringComparison.OrdinalIgnoreCase))
                .ToList();

            if (chainId != null && chainRecords.Count == 0)
            {
                return "";
            }

            if (startResidueSequenceIndex == -1 || endResidueSequenceIndex == -1)
            {
                // Derive missing bounds only from residue numbers that actually parse: records with a
                // blank or non-numeric residue number must not abort the whole load.
                var residueNumbers = chainRecords
                    .Select(a => NullableTryParseInt32(a.FieldPdbResidueSequenceIndex.FieldValue))
                    .Where(a => a != null)
                    .Select(a => a.Value)
                    .ToList();

                if (residueNumbers.Count == 0)
                {
                    return "";
                }

                if (startResidueSequenceIndex == -1)
                {
                    startResidueSequenceIndex = residueNumbers.Min();
                }
                if (endResidueSequenceIndex == -1)
                {
                    endResidueSequenceIndex = residueNumbers.Max();
                }
            }

            // dssp specification says order may not be correct
            var orderedRecords = chainRecords
                .Where(a => !string.IsNullOrWhiteSpace(a.FieldChain.FieldValue) && !string.IsNullOrWhiteSpace(a.FieldPdbResidueSequenceIndex.FieldValue))
                .OrderBy(a => a.FieldChain.FieldValue)
                .ThenBy(a => NullableTryParseInt32(a.FieldPdbResidueSequenceIndex.FieldValue))
                .ToList();

            // NOTE(review): presumably the inclusive length end - start + 1 - confirm in CalculateProteinInterfaceLength.
            var proteinInterfaceLen = CalculateProteinInterfaceLength(startResidueSequenceIndex, endResidueSequenceIndex);

            // Every position starts as '_' and is overwritten where a record exists.
            char[] result = new char[proteinInterfaceLen];
            for (int index = 0; index < result.Length; index++)
            {
                result[index] = '_';
            }

            foreach (var record in orderedRecords)
            {
                var resSeq = NullableTryParseInt32(record.FieldPdbResidueSequenceIndex.FieldValue);

                if (resSeq == null || resSeq < startResidueSequenceIndex || resSeq > endResidueSequenceIndex)
                {
                    continue;
                }

                var structureCode = record.FieldSecondaryStructure.FieldValue;

                if (string.IsNullOrEmpty(structureCode))
                {
                    continue;
                }

                // The residue number relative to the range start is the position in the output.
                result[resSeq.Value - startResidueSequenceIndex] = structureCode[0];
            }

            if (reversedSequence)
            {
                Array.Reverse(result);
            }

            return new string(result);
        }