/// <summary>
/// Loads a structure file from disk and converts it to a list of sequences.
/// The atomic chains and the PDB id are both derived from <paramref name="atomsFilename"/>,
/// then the work is delegated to the <c>LoadStructureFile</c> overload that accepts
/// already-parsed chains.
/// </summary>
/// <param name="atomsFilename">Path of the structure file to read.</param>
/// <param name="chainIdWhiteList">Chain ids passed both to the chain parser and to the delegated overload; may be null.</param>
/// <param name="padMissingBool">Forwarded unchanged to the delegated overload.</param>
/// <param name="startResSeq">Forwarded unchanged to the delegated overload.</param>
/// <param name="endResSeq">Forwarded unchanged to the delegated overload.</param>
/// <param name="outsidePaddingChar">Forwarded unchanged to the delegated overload.</param>
/// <param name="insidePaddingChar">Forwarded unchanged to the delegated overload.</param>
/// <returns>Whatever the delegated overload returns for the parsed chains.</returns>
public static List<Sequence> LoadStructureFile(
    string atomsFilename,
    char[] chainIdWhiteList = null,
    bool padMissingBool = true,
    int[] startResSeq = null,
    int[] endResSeq = null,
    char outsidePaddingChar = ' ',
    char insidePaddingChar = 'X')
{
    // NOTE(review): the meaning of the -1, -1, true arguments is defined by
    // ProteinBioClass.PdbAtomicChains, which is not visible here - confirm before changing.
    var atomicChains = ProteinBioClass.PdbAtomicChains(atomsFilename, chainIdWhiteList, -1, -1, true);

    // The chains are parsed before the id is derived, matching the original evaluation order.
    return LoadStructureFile(
        atomicChains,
        ProteinBioClass.PdbIdFromPdbFilename(atomsFilename),
        chainIdWhiteList,
        padMissingBool,
        startResSeq,
        endResSeq,
        outsidePaddingChar,
        insidePaddingChar);
}
/// <summary>
/// Clusters sequences by pairwise aligned similarity.
/// </summary>
/// <remarks>
/// <para>
/// Every sequence is escaped with <c>Sequence.EscapeAminoAcidSequence</c> and identical escaped
/// sequences are collapsed, so each distinct sequence is aligned only once. Each distinct
/// sequence starts in its own cluster. For every pair (x, y) with x before y:
/// </para>
/// <para>
/// 1. The pair is skipped when the shorter/longer length ratio is below
/// <paramref name="mininumClusterPairwiseSimilarity"/>, or when both are already in the same cluster.
/// </para>
/// <para>
/// 2. The pair is scored with <c>AlignmentType.NON</c>; while either score is still below its
/// threshold it is re-scored with <c>SIM</c> and then <c>NMW</c>, keeping the best value seen
/// for each of the two scores independently.
/// </para>
/// <para>
/// 3. When both scores reach their thresholds the two clusters are merged.
/// </para>
/// <para>
/// Progress and scores are written to the console.
/// </para>
/// </remarks>
/// <param name="seqList">Sequences to cluster.</param>
/// <param name="alignmentIdentityOption">Passed through to <c>ProteinBioClass.AlignedSequenceSimilarityPercentage</c>.</param>
/// <param name="mininumClusterPairwiseSimilarity">Minimum <c>Score</c> for two sequences to share a cluster; also used as the length-ratio pre-filter.</param>
/// <param name="mininumEvoClusterPairwiseSimilarity">Minimum <c>ScoreEvo</c> for two sequences to share a cluster.</param>
/// <returns>
/// One member per input sequence. Cluster numbers start at 1 and are assigned after ordering the
/// clusters by ascending number of distinct sequences.
/// </returns>
public static List<SequenceIdentityClusterMember> ClusterSequenceByAlignedSequenceIdentity(List<Sequence> seqList, ProteinBioClass.AlignmentIdentityOption alignmentIdentityOption, decimal mininumClusterPairwiseSimilarity = 0.3m, decimal mininumEvoClusterPairwiseSimilarity = 0.3m)
{
    // (pdb id, chain id, escaped sequence) for every input sequence.
    var allsequences = seqList.Select(a =>
    {
        var sequenceId = new ProteinBioClass.SequenceId(a.Id);

        return new Tuple<string, char, string>(sequenceId.PdbId, sequenceId.ChainId, Sequence.EscapeAminoAcidSequence(a.FullSequence));
    }).ToList();

    var sequences = allsequences.Select(a => a.Item3).Distinct().ToList();

    // All ids that share a given escaped sequence, in input order.
    var idsBySequence = allsequences.ToLookup(a => a.Item3);

    // Start with one single-member cluster per distinct sequence.
    var seqClusters = sequences.Select(a => new List<string> { a }).ToList();

    // Alignment types tried, in order, when the first alignment misses a threshold.
    var fallbackAlignmentTypes = new[] { ProteinBioClass.AlignmentType.SIM, ProteinBioClass.AlignmentType.NMW };

    for (var indexX = 0; indexX < sequences.Count; indexX++)
    {
        Console.WriteLine("Aligning sequence " + indexX);

        var seqX = sequences[indexX];

        for (var indexY = indexX + 1; indexY < sequences.Count; indexY++)
        {
            var seqY = sequences[indexY];

            // Cheap pre-filter: skip pairs whose length ratio is below the similarity threshold.
            if ((decimal)Math.Min(seqX.Length, seqY.Length) / (decimal)Math.Max(seqX.Length, seqY.Length) < mininumClusterPairwiseSimilarity)
            {
                continue;
            }

            // Looked up on every iteration because an earlier merge may have moved seqX.
            var cluster1 = seqClusters.FirstOrDefault(a => a.Contains(seqX));
            var cluster2 = seqClusters.FirstOrDefault(a => a.Contains(seqY));

            if (cluster1 != null && cluster2 != null && cluster1 == cluster2)
            {
                continue;
            }

            var score = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.NON, alignmentIdentityOption);

            Console.WriteLine("1: " + seqX);
            Console.WriteLine("2: " + seqY);
            Console.WriteLine("Score1: " + score.Score);
            Console.WriteLine("Score2: " + score.ScoreEvo);

            foreach (var fallbackAlignmentType in fallbackAlignmentTypes)
            {
                if (score.Score >= mininumClusterPairwiseSimilarity && score.ScoreEvo >= mininumEvoClusterPairwiseSimilarity)
                {
                    break;
                }

                var candidate = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, fallbackAlignmentType, alignmentIdentityOption);

                // Keep the best value of each score independently, so a better ScoreEvo
                // from an earlier alignment is never replaced by a worse one.
                if (candidate.Score > score.Score)
                {
                    score.Score = candidate.Score;
                }

                if (candidate.ScoreEvo > score.ScoreEvo)
                {
                    score.ScoreEvo = candidate.ScoreEvo;
                }
            }

            if (score.Score >= mininumClusterPairwiseSimilarity && score.ScoreEvo >= mininumEvoClusterPairwiseSimilarity)
            {
                var mergedCluster = new List<string>();
                mergedCluster.AddRange(cluster1);
                mergedCluster.AddRange(cluster2);

                seqClusters.Remove(cluster1);
                seqClusters.Remove(cluster2);
                seqClusters.Add(mergedCluster);
            }
        }
    }

    seqClusters = seqClusters.OrderBy(a => a.Count).ToList();

    var output = new List<SequenceIdentityClusterMember>();

    for (var index = 0; index < seqClusters.Count; index++)
    {
        foreach (var item in seqClusters[index])
        {
            // Expand each distinct sequence back into every id that carried it.
            foreach (var id in idsBySequence[item])
            {
                output.Add(new SequenceIdentityClusterMember(index + 1, ProteinBioClass.PdbIdFromPdbFilename(id.Item1), id.Item2, id.Item3));
            }
        }
    }

    return output;
}
/// <summary>
/// Builds a secondary-structure string from the DSSP file that accompanies a structure file.
/// </summary>
/// <remarks>
/// The DSSP file name is <paramref name="pdbFilename"/> itself when it already ends in
/// <c>.dssp</c>, otherwise <paramref name="pdbFilename"/> with <c>.dssp</c> appended.
/// Position i of the result corresponds to residue sequence number
/// <c>startResidueSequenceIndex + i</c>. It holds the first character of that record's
/// secondary-structure field, or <c>'_'</c> when no usable record exists for that residue.
/// Records with a blank chain or blank residue number are ignored.
/// </remarks>
/// <param name="pdbFilename">Path of the structure file, or of the DSSP file itself.</param>
/// <param name="chainId">Chain to read, compared case-insensitively; null reads records of every chain.</param>
/// <param name="startResidueSequenceIndex">First residue number to include; -1 uses the lowest residue number found for the selected chain(s).</param>
/// <param name="endResidueSequenceIndex">Last residue number to include; -1 uses the highest residue number found for the selected chain(s).</param>
/// <param name="reversedSequence">When true the result is reversed before it is returned.</param>
/// <returns>
/// The secondary-structure string, or an empty string when the file name is blank, the DSSP file
/// does not exist, the requested chain is absent, or a range bound has to be derived and no
/// record carries a parsable residue number.
/// </returns>
public static string LoadDsspStructureSequence(string pdbFilename, string chainId = null, int startResidueSequenceIndex = -1, int endResidueSequenceIndex = -1, bool reversedSequence = false)
{
    if (string.IsNullOrWhiteSpace(pdbFilename))
    {
        return "";
    }

    var dsspFilename = pdbFilename;

    if (Path.GetExtension(dsspFilename) != ".dssp")
    {
        dsspFilename += ".dssp";
    }

    if (!File.Exists(dsspFilename))
    {
        return "";
    }

    // Records of the requested chain (or of every chain when no chain was requested).
    var chainRecords = DsspFormatFile.LoadDsspFile(dsspFilename)
        .Where(a => chainId == null || string.Equals(a.FieldChain.FieldValue, chainId, StringComparison.OrdinalIgnoreCase))
        .ToList();

    if (chainId != null && chainRecords.Count == 0)
    {
        return "";
    }

    if (startResidueSequenceIndex == -1 || endResidueSequenceIndex == -1)
    {
        // Use the same tolerant parsing as the fill loop below, so records with a blank or
        // non-numeric residue number are skipped instead of aborting the whole load.
        var residueNumbers = chainRecords
            .Where(a => !string.IsNullOrWhiteSpace(a.FieldPdbResidueSequenceIndex.FieldValue))
            .Select(a => NullableTryParseInt32(a.FieldPdbResidueSequenceIndex.FieldValue))
            .Where(a => a != null)
            .Select(a => a.Value)
            .ToList();

        if (residueNumbers.Count == 0)
        {
            // Nothing to derive the range from.
            return "";
        }

        if (startResidueSequenceIndex == -1)
        {
            startResidueSequenceIndex = residueNumbers.Min();
        }

        if (endResidueSequenceIndex == -1)
        {
            endResidueSequenceIndex = residueNumbers.Max();
        }
    }

    // dssp specification says order may not be correct
    var orderedRecords = chainRecords
        .Where(a => !string.IsNullOrWhiteSpace(a.FieldChain.FieldValue) && !string.IsNullOrWhiteSpace(a.FieldPdbResidueSequenceIndex.FieldValue))
        .OrderBy(a => a.FieldChain.FieldValue)
        .ThenBy(a => NullableTryParseInt32(a.FieldPdbResidueSequenceIndex.FieldValue));

    var proteinInterfaceLen = CalculateProteinInterfaceLength(startResidueSequenceIndex, endResidueSequenceIndex);

    var result = new char[proteinInterfaceLen];

    for (var index = 0; index < result.Length; index++)
    {
        result[index] = '_';
    }

    foreach (var record in orderedRecords)
    {
        var resSeq = NullableTryParseInt32(record.FieldPdbResidueSequenceIndex.FieldValue);

        if (resSeq == null || resSeq < startResidueSequenceIndex || resSeq > endResidueSequenceIndex)
        {
            continue;
        }

        var structureCode = record.FieldSecondaryStructure.FieldValue;

        if (structureCode.Length == 0)
        {
            continue;
        }

        // When several records map to the same position the last one in sort order wins.
        result[resSeq.Value - startResidueSequenceIndex] = structureCode[0];
    }

    if (reversedSequence)
    {
        Array.Reverse(result);
    }

    return new string(result);
}