/// <summary>
/// Reads a PDB file, takes the first chain of the first model and prints, for the two
/// amino acids with the given sequence numbers: their CA-CA distance, their measured
/// angles, the CA-C distance inside the first amino acid and the C-N distance from the
/// first to the second amino acid.
/// </summary>
/// <param name="pdbFilePath">Path of the PDB file to read.</param>
/// <param name="sequenceNumber1">Sequence number of the first amino acid.</param>
/// <param name="sequenceNumber2">Sequence number of the second amino acid.</param>
public void MeasureAminoAcidDistances(string pdbFilePath, int sequenceNumber1, int sequenceNumber2)
{
    var chain = PdbReader.ReadFile(pdbFilePath).Models.First().Chains.First();
    var anglesByAminoAcid = AminoAcidAngleMeasurer.MeasureAngles(chain);

    var first = chain.AminoAcids.Find(aminoAcid => aminoAcid.SequenceNumber == sequenceNumber1);
    var second = chain.AminoAcids.Find(aminoAcid => aminoAcid.SequenceNumber == sequenceNumber2);
    var firstAngles = anglesByAminoAcid[first];
    var secondAngles = anglesByAminoAcid[second];

    // Distance between the two alpha carbons.
    var firstCarbonAlphaPosition = first.GetAtomFromName("CA").Position;
    var secondCarbonAlphaPosition = second.GetAtomFromName("CA").Position;
    var carbonAlphaDistance = firstCarbonAlphaPosition.DistanceTo(secondCarbonAlphaPosition);
    Console.WriteLine("Carbon alpha distance: " + carbonAlphaDistance);
    Console.WriteLine($"Amino acid {sequenceNumber1} ({first.Name}) angles: {firstAngles}");
    Console.WriteLine($"Amino acid {sequenceNumber2} ({second.Name}) angles: {secondAngles}");

    // CA-C distance within the first amino acid.
    var alphaCarbonPosition = first.GetAtomFromName("CA").Position;
    var carbonPosition = first.GetAtomFromName("C").Position;
    var carbonAlphaCarbonDistance = alphaCarbonPosition.DistanceTo(carbonPosition);
    Console.WriteLine($"Carbon alpha-Carbon distance: {carbonAlphaCarbonDistance}");

    // C (first amino acid) to N (second amino acid) distance.
    var bondCarbonPosition = first.GetAtomFromName("C").Position;
    var bondNitrogenPosition = second.GetAtomFromName("N").Position;
    var carbonNitrogenDistance = bondCarbonPosition.DistanceTo(bondNitrogenPosition);
    Console.WriteLine($"Carbon-Nitrogen distance: {carbonNitrogenDistance}");
}
/// <summary>
/// Runs <c>AlphaHelixDetector</c> on the first chain of the first model and compares the
/// sequence numbers it reports with the sequence numbers referenced by the chain's own
/// annotations. Asserts more than 90% true positives and fewer than 10% false positives
/// and false negatives, all relative to the number of annotated sequence numbers.
/// </summary>
/// <param name="pdbFilePath">Path of the PDB file to read.</param>
public void DetectedAlphaHelixMatchesOriginalAnnotation(string pdbFilePath)
{
    var pdbResult = PdbReader.ReadFile(pdbFilePath);
    var detector = new AlphaHelixDetector();
    var chain = pdbResult.Models.First().Chains.First();
    var detectedAnnotations = detector.Detect(chain);

    var annotated =
        (from annotation in chain.Annotations
         from reference in annotation.AminoAcidReferences
         select reference.SequenceNumber).ToList();
    var detected =
        (from annotation in detectedAnnotations
         from reference in annotation.AminoAcidReferences
         select reference.SequenceNumber).ToList();

    var correctlyDetected = detected.Intersect(annotated).ToList();
    Assert.That(correctlyDetected.Count, Is.GreaterThan(0.9 * annotated.Count));

    var wronglyDetected = detected.Except(annotated).ToList();
    Assert.That(wronglyDetected.Count, Is.LessThan(0.1 * annotated.Count));

    var missed = annotated.Except(detected).ToList();
    Assert.That(missed.Count, Is.LessThan(0.1 * annotated.Count));
}
/// <summary>
/// Reads a PDB file and appends to <paramref name="output"/> a header line
/// ("#" + file name without extension) followed by one full-sequence line per chain
/// of the first model. Nothing is appended when the file has no model or the first
/// model has no chain. Any exception is written to the console and swallowed.
/// </summary>
/// <param name="pdbFile">Path of the PDB file to read.</param>
/// <param name="output">List that receives the header and sequence lines.</param>
public static void ExtractFullSequenceFromFile(string pdbFile, List<string> output)
{
    try
    {
        var pdbResult = PdbReader.ReadFile(pdbFile);
        var hasChains = pdbResult.Models.Any() && pdbResult.Models.First().Chains.Any();
        if (!hasChains)
        {
            return;
        }

        output.Add("#" + Path.GetFileNameWithoutExtension(pdbFile));
        foreach (var chain in pdbResult.Models.First().Chains)
        {
            // Only alpha-helix annotations are passed on to the sequence builder.
            var helixAnnotations = chain.Annotations
                .Where(annotation => annotation.Type == PeptideSecondaryStructure.AlphaHelix)
                .ToList();
            output.Add(GetFullSequence(chain, helixAnnotations));
        }
    }
    catch (Exception e)
    {
        Console.WriteLine($"Exception: {e.Message}");
    }
}
/// <summary>
/// Aligns a 40-residue subsequence of PDB 2bbo (from index 60) onto PDB 1xmj (from
/// index 48), applies the resulting transformation to every positioned atom of the
/// second peptide and writes both the untouched first file and the repositioned second
/// peptide to <c>C:\Temp</c>.
/// </summary>
public void AlignPdbSubsequences()
{
    string pdbCode1 = "1xmj";
    int startIndex1 = 48;
    string pdbCode2 = "2bbo";
    int startIndex2 = 60;
    int length = 40;
    string outputDirectory = @"C:\Temp";

    string pdbFile1 = $@"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\SingleChain\FullyPositioned\pdb{pdbCode1}.ent";
    string pdbFile2 = $@"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\SingleChain\FullyPositioned\pdb{pdbCode2}.ent";
    var referencePeptide = PdbReader.ReadFile(pdbFile1).Models.First().Chains.First();
    var movingPeptide = PdbReader.ReadFile(pdbFile2).Models.First().Chains.First();

    var aligner = new ProteinAligner();
    var alignment = aligner.AlignSubsequence(referencePeptide, startIndex1, movingPeptide, startIndex2, length);
    var transform = alignment.Transformation;

    // Move every positioned atom of the second peptide into the frame of the first one.
    movingPeptide.Molecule.Atoms
        .Where(atom => atom.IsPositioned)
        .ForEach(atom =>
        {
            atom.IsPositionFixed = false;
            var positionInPicometer = atom.Position.In(SIPrefix.Pico, Unit.Meter);
            atom.Position = transform.Apply(positionInPicometer).To(SIPrefix.Pico, Unit.Meter);
        });

    var repositionedPdb = PdbSerializer.Serialize(pdbCode2, movingPeptide);
    var referenceCopyPath = Path.Combine(outputDirectory, $@"pdb{pdbCode1}.ent");
    File.Copy(pdbFile1, referenceCopyPath, true);

    var repositionedPath = Path.Combine(
        outputDirectory,
        $@"pdb{pdbCode2}_repositioned_{pdbCode1}_sub{startIndex1}-{startIndex2}-{length}.ent");
    File.WriteAllText(repositionedPath, repositionedPdb);
}
/// <summary>
/// Debug helper: re-reads every <c>*.ent</c> file in the "Failed" directory and deletes
/// each file that <c>PdbReader.ReadFile</c> manages to read without throwing. A file that
/// still fails stops the run with the reader's exception and stays on disk.
/// </summary>
public void PdbReaderDebug()
{
    var inputDirectory = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\Failed";
    foreach (var pdbFile in Directory.EnumerateFiles(inputDirectory, "*.ent"))
    {
        // The read result is disposable; release it before the file is deleted
        // instead of leaking one result per processed file.
        using (PdbReader.ReadFile(pdbFile))
        {
        }

        File.Delete(pdbFile);
    }
}
/// <summary>
/// Asserts that the first and last amino acid of the first chain of the first model
/// carry the expected sequence numbers. The check is skipped via <c>Assume</c> when the
/// file contains no model.
/// </summary>
/// <param name="pdbFile">Path of the PDB file to read.</param>
/// <param name="sequenceStart">Expected sequence number of the first amino acid.</param>
/// <param name="sequenceStop">Expected sequence number of the last amino acid.</param>
public void AminoAcidSequenceNumberAsExpected(string pdbFile, int sequenceStart, int sequenceStop)
{
    var pdbResult = PdbReader.ReadFile(pdbFile);
    Assume.That(pdbResult.Models.Any());

    var chain = pdbResult.Models.First().Chains.First();
    var firstSequenceNumber = chain.AminoAcids.First().SequenceNumber;
    Assert.That(firstSequenceNumber, Is.EqualTo(sequenceStart));

    var lastSequenceNumber = chain.AminoAcids.Last().SequenceNumber;
    Assert.That(lastSequenceNumber, Is.EqualTo(sequenceStop));
}
/// <summary>
/// For each matching PDB file, writes one CSV per single-chain model whose alpha carbons
/// are all positioned. Each CSV line is "one-letter code;CA position in picometers".
/// Models with a chain count other than one, or with any missing/unpositioned alpha
/// carbon, are skipped. A file whose processing throws is moved to the failed directory.
/// </summary>
public void ExtractAminoAcidPositions()
{
    string sourceDirectory = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\SingleChain\FullyPositioned";
    string failedDirectory = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\Failed";
    string positionsDirectory = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\AminoAcidPositions";

    var pdbFiles = Directory.EnumerateFiles(sourceDirectory, "pdb5uak.ent");
    Parallel.ForEach(pdbFiles, pdbFile =>
    {
        try
        {
            using (var pdbResult = PdbReader.ReadFile(pdbFile))
            {
                for (var modelIndex = 0; modelIndex < pdbResult.Models.Count; modelIndex++)
                {
                    var model = pdbResult.Models[modelIndex];
                    if (model.Chains.Count != 1)
                    {
                        continue;
                    }

                    var chain = model.Chains.Single();
                    var carbonAlphaAtoms = chain.Molecule.Atoms
                        .Where(atom => atom.AminoAcidAtomName == "CA")
                        .ToList();
                    if (carbonAlphaAtoms.Any(atom => !atom.IsPositioned))
                    {
                        continue;
                    }

                    // Collect one line per amino acid; give up on the model as soon as
                    // one amino acid has no positioned alpha carbon.
                    var csvLines = new List<string>();
                    var foundUnpositionedAminoAcid = false;
                    foreach (var aminoAcid in chain.AminoAcids)
                    {
                        var carbonAlpha = aminoAcid.GetAtomFromName("CA");
                        if (carbonAlpha != null && carbonAlpha.IsPositioned)
                        {
                            csvLines.Add($"{aminoAcid.Name.ToOneLetterCode()};{carbonAlpha.Position.In(SIPrefix.Pico, Unit.Meter)}");
                            continue;
                        }

                        foundUnpositionedAminoAcid = true;
                        break;
                    }

                    if (foundUnpositionedAminoAcid)
                    {
                        continue;
                    }

                    var csvFileName = $"{Path.GetFileNameWithoutExtension(pdbFile)}_model{modelIndex:D3}.csv";
                    File.WriteAllLines(Path.Combine(positionsDirectory, csvFileName), csvLines);
                }
            }
        }
        catch
        {
            var failedPath = Path.Combine(failedDirectory, Path.GetFileName(pdbFile));
            File.Move(pdbFile, failedPath);
        }
    });
}
/// <summary>
/// Asserts that the file yields the expected number of models and that every model
/// holds exactly one chain with the expected number of amino acids.
/// </summary>
/// <param name="file">Path of the PDB file to read.</param>
/// <param name="expectedModelCount">Expected number of models in the file.</param>
/// <param name="expectedPeptideLength">Expected amino acid count of each model's chain.</param>
public void AllModelsRead(string file, int expectedModelCount, int expectedPeptideLength)
{
    var models = PdbReader.ReadFile(file).Models;
    Assert.That(models.Count, Is.EqualTo(expectedModelCount));

    foreach (var model in models)
    {
        Assert.That(model.Chains.Count, Is.EqualTo(1));

        var onlyChain = model.Chains.Single();
        Assert.That(onlyChain.AminoAcids.Count, Is.EqualTo(expectedPeptideLength));
    }
}
/// <summary>
/// Measures the amino acid angles of every chain in the first model of the given PDB
/// file and merges the per-chain results into a single dictionary.
/// </summary>
/// <param name="pdbFilename">Path of the PDB file to read.</param>
/// <returns>Angle measurements of all chains of the first model, keyed by amino acid.</returns>
private static Dictionary<AminoAcidReference, AminoAcidAngles> MeasureDihedralAngles(string pdbFilename)
{
    var firstModel = PdbReader.ReadFile(pdbFilename).Models.First();
    var mergedAngles = new Dictionary<AminoAcidReference, AminoAcidAngles>();
    foreach (var chain in firstModel.Chains)
    {
        var chainAngles = AminoAcidAngleMeasurer.MeasureAngles(chain);
        foreach (var entry in chainAngles)
        {
            // Add (not the indexer) so that a duplicate key still throws.
            mergedAngles.Add(entry.Key, entry.Value);
        }
    }

    return mergedAngles;
}
/// <summary>
/// Loads a peptide from a file, choosing the reader by file extension (case-insensitive).
/// PDB files (<c>.pdb</c>, <c>.ent</c>) yield the first chain of the first model;
/// <c>.aminoseq</c> files are read by <c>AminoseqReader</c>.
/// </summary>
/// <param name="filename">Path of the file to load.</param>
/// <returns>The loaded peptide.</returns>
/// <exception cref="ArgumentNullException"><paramref name="filename"/> is null.</exception>
/// <exception cref="ArgumentException">The file extension is not supported.</exception>
public static Peptide Load(string filename)
{
    // Path.GetExtension(null) returns null, which previously surfaced as a bare
    // NullReferenceException from ToLowerInvariant().
    if (filename == null)
    {
        throw new ArgumentNullException(nameof(filename));
    }

    var extension = Path.GetExtension(filename).ToLowerInvariant();
    switch (extension)
    {
        case ".pdb":
        case ".ent": // PDB files are also handed to PdbReader under the .ent extension.
            var result = PdbReader.ReadFile(filename);
            return result.Models.First().Chains.First();
        case ".aminoseq":
            return AminoseqReader.ReadFile(filename);
        default:
            throw new ArgumentException($"File extension '{extension}' is unsupported");
    }
}
/// <summary>
/// Reads every <c>*.ent</c> file in the input directory and moves it into a sub-directory
/// according to the largest chain count found among its models: "NoChain", "SingleChain"
/// or "MultiChain". Files whose processing throws are moved to "Failed".
/// </summary>
public void PdbReadTest()
{
    var inputDirectory = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins";
    Directory.CreateDirectory(Path.Combine(inputDirectory, "NoChain"));
    Directory.CreateDirectory(Path.Combine(inputDirectory, "SingleChain"));
    Directory.CreateDirectory(Path.Combine(inputDirectory, "MultiChain"));
    // The catch handler below moves files here; without this directory the move itself
    // would throw from inside the handler and abort the whole parallel loop.
    Directory.CreateDirectory(Path.Combine(inputDirectory, "Failed"));

    using (var cancellationTokenSource = new CancellationTokenSource())
    {
        var files = Directory.EnumerateFiles(inputDirectory, "*.ent");
        Parallel.ForEach(files, pdbFile =>
        {
            //cancellationTokenSource.Token.ThrowIfCancellationRequested();
            try
            {
                using (var pdbResult = PdbReader.ReadFile(pdbFile))
                {
                    // NOTE(review): Max presumably throws for a result without any model,
                    // which routes such files to "Failed" - confirm that this is intended.
                    var maxChainCount = pdbResult.Models.Max(model => model.Chains.Count);
                    if (maxChainCount == 0)
                    {
                        File.Move(pdbFile, Path.Combine(inputDirectory, "NoChain", Path.GetFileName(pdbFile)));
                    }
                    else if (maxChainCount == 1)
                    {
                        File.Move(pdbFile, Path.Combine(inputDirectory, "SingleChain", Path.GetFileName(pdbFile)));
                    }
                    else
                    {
                        File.Move(pdbFile, Path.Combine(inputDirectory, "MultiChain", Path.GetFileName(pdbFile)));
                    }
                }
            }
            catch
            {
                File.Move(pdbFile, Path.Combine(inputDirectory, "Failed", Path.GetFileName(pdbFile)));
                //cancellationTokenSource.Cancel();
            }
        });

        // Only relevant when the Cancel() call above is re-enabled.
        if (cancellationTokenSource.IsCancellationRequested)
        {
            Assert.Fail();
        }

        Assert.Pass();
    }
}
/// <summary>
/// For every protein index CSV (one PDB path per line), writes an output file of the same
/// name containing one line per PDB: the one-letter amino acid codes of the single chain
/// of the first model, placed at their sequence-number position (1-based) and padded with
/// spaces. Amino acids with a sequence number below 1 are ignored. CSV files whose
/// processing throws are collected and listed on the console at the end.
/// </summary>
public void ProteinPdbSequenceAlignment()
{
    string proteinIndexCsvFileDirectory = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\SingleChain\FullyPositioned\ByProtein";
    string outputDirectory = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\SingleChain\FullyPositioned\SequenceOutput";

    var csvFiles = Directory.EnumerateFiles(proteinIndexCsvFileDirectory, "*.csv");
    var failedCsvFiles = new ConcurrentBag<string>();
    Parallel.ForEach(csvFiles, csvFile =>
    {
        try
        {
            var alignedSequences = new List<string>();
            foreach (var pdbPath in File.ReadLines(csvFile))
            {
                var peptide = PdbReader.ReadFile(pdbPath).Models.First().Chains.Single();
                var sequenceLength = peptide.AminoAcids.Max(aminoAcid => aminoAcid.SequenceNumber);

                // Start with blanks so that gaps in the numbering remain visible.
                var letters = Enumerable.Repeat(' ', sequenceLength).ToList();
                foreach (var aminoAcid in peptide.AminoAcids)
                {
                    if (aminoAcid.SequenceNumber >= 1)
                    {
                        letters[aminoAcid.SequenceNumber - 1] = aminoAcid.Name.ToOneLetterCode();
                    }
                }

                alignedSequences.Add(new string(letters.ToArray()));
            }

            var outputFile = Path.Combine(outputDirectory, Path.GetFileName(csvFile));
            File.WriteAllLines(outputFile, alignedSequences);
        }
        catch
        {
            failedCsvFiles.Add(csvFile);
        }
    });

    Console.WriteLine("Failing proteins:");
    foreach (var failedCsvFile in failedCsvFiles)
    {
        Console.WriteLine(failedCsvFile);
    }
}
/// <summary>
/// Sorts the <c>*.ent</c> files of the single-chain directory into "FullyPositioned",
/// "PartiallyPositioned" or "NotPositioned", depending on whether all, some or none of
/// the alpha carbons of the first single-chain model are positioned. Files without a
/// single-chain model are left where they are.
/// </summary>
public void FilterByPositionedMolecules()
{
    string inputDirectory = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\SingleChain";
    foreach (var category in new[] { "FullyPositioned", "PartiallyPositioned", "NotPositioned" })
    {
        Directory.CreateDirectory(Path.Combine(inputDirectory, category));
    }

    var pdbFiles = Directory.EnumerateFiles(inputDirectory, "*.ent");
    Parallel.ForEach(pdbFiles, pdbFile =>
    {
        using (var pdbResult = PdbReader.ReadFile(pdbFile))
        {
            foreach (var model in pdbResult.Models)
            {
                if (model.Chains.Count != 1)
                {
                    continue;
                }

                var carbonAlphaAtoms = model.Chains.Single().Molecule.Atoms
                    .Where(atom => atom.AminoAcidAtomName == "CA")
                    .ToList();

                string targetCategory;
                if (carbonAlphaAtoms.All(atom => atom.IsPositioned))
                {
                    targetCategory = "FullyPositioned";
                }
                else if (carbonAlphaAtoms.Any(atom => atom.IsPositioned))
                {
                    targetCategory = "PartiallyPositioned";
                }
                else
                {
                    targetCategory = "NotPositioned";
                }

                File.Move(pdbFile, Path.Combine(inputDirectory, targetCategory, Path.GetFileName(pdbFile)));

                // Only the first single-chain model decides where the file goes.
                break;
            }
        }
    });
}
/// <summary>
/// Measures the alpha-carbon distance between each pair of consecutive amino acids in
/// the first chain of the first model and prints average, median, minimum and maximum
/// (in nanometers), followed by every individual distance.
/// </summary>
/// <param name="pdbFilePath">Path of the PDB file to read.</param>
public void MeasureAverageAminoAcidDistance(string pdbFilePath)
{
    // The read result is disposable; release it once the measurements are printed.
    using (var pdb = PdbReader.ReadFile(pdbFilePath))
    {
        var firstChain = pdb.Models.First().Chains.First();
        var lastAminoAcid = firstChain.AminoAcids.First();
        var aminoAcidDistances = new List<UnitValue>();
        foreach (var aminoAcid in firstChain.AminoAcids.Skip(1))
        {
            var p1 = lastAminoAcid.GetAtomFromName("CA").Position;
            var p2 = aminoAcid.GetAtomFromName("CA").Position;
            var distance = p1.DistanceTo(p2);
            aminoAcidDistances.Add(distance);

            // Advance the reference amino acid. Without this every distance was measured
            // from the first amino acid of the chain instead of from the direct predecessor.
            lastAminoAcid = aminoAcid;
        }

        var distancesInNanoMeter = aminoAcidDistances.Select(x => x.In(SIPrefix.Nano, Unit.Meter)).ToList();
        Console.WriteLine($"Average: {distancesInNanoMeter.Average():F3} nm");
        Console.WriteLine($"Median: {distancesInNanoMeter.Median():F3} nm");
        Console.WriteLine($"Minimum: {distancesInNanoMeter.Min():F3} nm");
        Console.WriteLine($"Maximum: {distancesInNanoMeter.Max():F3} nm");
        distancesInNanoMeter.ForEach(Console.WriteLine);
    }
}
/// <summary>
/// Builds an approximate peptide from the first chain of the first model, runs a folding
/// simulation (10 ps simulation time, 2 fs time step) and blocks until the simulation
/// wait handle is signalled, then passes.
/// </summary>
/// <param name="pdbFilePath">Path of the PDB file to read.</param>
public void ApproximatePeptideIsFoldedToKnownStableState(string pdbFilePath)
{
    var sourcePeptide = PdbReader.ReadFile(pdbFilePath).Models.First().Chains.First();
    var approximatePeptide = ApproximatePeptideBuilder.FromPeptide(sourcePeptide);

    var simulationTime = 10.To(SIPrefix.Pico, Unit.Second);
    var timeStep = 2.To(SIPrefix.Femto, Unit.Second);
    var settings = new ApproximatePeptideSimulationSettings
    {
        SimulationTime = simulationTime,
        TimeStep = timeStep
    };

    string ramachadranDataDirectory = @"G:\Projects\HumanGenome\ramachadranDistributions";
    var simulator = ApproximatePeptideFoldingSimulatorFactory.Create(
        approximatePeptide,
        settings,
        ramachadranDataDirectory);
    simulator.TimestepCompleted += Simulator_TimestepCompleted;
    simulator.SimulationCompleted += Simulator_SimulationCompleted;

    // Reset before starting so that the wait below cannot return on a stale signal.
    simulationWaitHandle.Reset();
    simulator.StartSimulation();
    simulationWaitHandle.WaitOne();

    Assert.Pass();
}
/// <summary>
/// For every protein lookup CSV (one PDB path per line), picks the PDB file with the most
/// amino acids in its first model among those that have at least one alpha-helix
/// annotation, and appends a header line ("#" + PDB file name) plus one full-sequence
/// line per chain to the shared output file. PDB files that fail to process are logged
/// to the console and to <c>C:\Temp\errors.txt</c>.
/// </summary>
public void ExtractAndAnnotateHelixSequences()
{
    var directory = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\SingleChain\FullyPositioned\ByProtein";
    var outputFilePath = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\humanFullyPositionedSingleChainUniqueProteinHelixMarked.txt";
    var pdbLookupFiles = Directory.EnumerateFiles(directory, "*.csv");
    var pdbReaderOptions = new PdbReaderOptions
    {
        MaximumModelCount = 1,
        BuildMolecule = false
    };
    var outputLock = new object();
    File.Delete(outputFilePath);

    Parallel.ForEach(pdbLookupFiles, pdbLookupFile =>
    {
        var pdbFilePaths = File.ReadAllLines(pdbLookupFile);
        var proteinOutput = new List<string>();
        var maxProteinAminoAcidCount = 0;
        foreach (var pdbFilePath in pdbFilePaths)
        {
            try
            {
                // Unsuitable PDB files are skipped with 'continue'. The previous 'return'
                // left the whole parallel body, discarding the output already collected
                // for this protein and ignoring its remaining PDB files.
                var pdbResult = PdbReader.ReadFile(pdbFilePath, pdbReaderOptions);
                if (!pdbResult.Models.Any())
                {
                    continue;
                }

                var firstModel = pdbResult.Models.First();
                if (!firstModel.Chains.Any())
                {
                    continue;
                }

                var hasHelixAnnotations = firstModel.Chains
                    .SelectMany(chain => chain.Annotations)
                    .Any(annotation => annotation.Type == PeptideSecondaryStructure.AlphaHelix);
                if (!hasHelixAnnotations)
                {
                    continue;
                }

                // Keep only the largest candidate seen so far.
                var aminoAcidCount = firstModel.Chains.Sum(chain => chain.AminoAcids.Count);
                if (aminoAcidCount <= maxProteinAminoAcidCount)
                {
                    continue;
                }

                proteinOutput.Clear();
                proteinOutput.Add("#" + Path.GetFileNameWithoutExtension(pdbFilePath));
                foreach (var chain in firstModel.Chains)
                {
                    var helixAnnotations = chain.Annotations
                        .Where(annot => annot.Type == PeptideSecondaryStructure.AlphaHelix)
                        .ToList();
                    var fullSequence = AlphaHelixAnnotationTool.GetFullSequence(chain, helixAnnotations);
                    proteinOutput.Add(fullSequence);
                }

                maxProteinAminoAcidCount = aminoAcidCount;
            }
            catch (Exception e)
            {
                var errorMessage = $"Exception: {e.Message}(Path: {pdbFilePath})";
                Console.WriteLine(errorMessage);

                // Several workers may fail at the same time; serialize access to the
                // shared error file just like access to the output file.
                lock (outputLock)
                {
                    File.AppendAllLines(@"C:\Temp\errors.txt", new[] { errorMessage });
                }
            }
        }

        lock (outputLock)
        {
            File.AppendAllLines(outputFilePath, proteinOutput);
        }
    });
}
/// <summary>
/// Aligns the first chain of every PDB file listed for a protein onto the first listed
/// PDB file, records each alignment's average position error (infinity when the
/// transformation is invalid) and writes a combined PDB of all models whose error is
/// below min(median + 2 * root-mean-square error, 1000) in picometers, plus a CSV of all
/// errors.
/// </summary>
/// <param name="proteinName">Name of the protein; selects "&lt;name&gt;.csv" and the output folder.</param>
/// <param name="storeIndividualAlignedPdb">When true, every aligned peptide is also written to its own file.</param>
public void AlignAllModelsOfProtein(string proteinName, bool storeIndividualAlignedPdb)
{
    var outputDirectory = Path.Combine(
        @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\SingleChain\FullyPositioned\AlignedProteins",
        proteinName);
    // CreateDirectory does nothing when the directory already exists.
    Directory.CreateDirectory(outputDirectory);

    string proteinListDirectory = @"G:\Projects\HumanGenome\Protein-PDBs\HumanProteins\SingleChain\FullyPositioned\ByProtein";
    var pdbListFile = Path.Combine(proteinListDirectory, proteinName + ".csv");
    var pdbFiles = File.ReadLines(pdbListFile).ToList();

    // The first listed file is the alignment reference and is copied unchanged.
    var referencePdbFile = pdbFiles.First();
    var referencePeptide = PdbReader.ReadFile(referencePdbFile).Models.First().Chains.First();
    File.Copy(referencePdbFile, Path.Combine(outputDirectory, Path.GetFileName(referencePdbFile)), true);

    var aligner = new ProteinAligner();
    var alignedPeptides = new List<Peptide> { referencePeptide };
    var modelErrors = new Dictionary<string, UnitValue>
    {
        { referencePdbFile, 0.To(Unit.Meter) }
    };

    foreach (var pdbFile in pdbFiles.Skip(1))
    {
        var peptide = PdbReader.ReadFile(pdbFile).Models.First().Chains.First();
        var alignment = aligner.Align(referencePeptide, peptide);
        var transform = alignment.Transformation;
        peptide.Molecule.Atoms
            .Where(atom => atom.IsPositioned)
            .ForEach(atom =>
            {
                atom.IsPositionFixed = false;
                var positionInPicometer = atom.Position.In(SIPrefix.Pico, Unit.Meter);
                atom.Position = transform.Apply(positionInPicometer).To(SIPrefix.Pico, Unit.Meter);
            });

        var modelError = alignment.IsTransformationValid
            ? alignment.AveragePositionError
            : double.PositiveInfinity.To(Unit.Meter);
        modelErrors.Add(pdbFile, modelError);
        alignedPeptides.Add(peptide);

        if (!storeIndividualAlignedPdb)
        {
            continue;
        }

        var pdbId = Path.GetFileNameWithoutExtension(pdbFile).Replace("pdb", "");
        var repositionedPdb = PdbSerializer.Serialize(pdbId, peptide);
        File.WriteAllText(Path.Combine(outputDirectory, $"pdb{pdbId}.ent"), repositionedPdb);
    }

    var errorsInPicometer = modelErrors.Values
        .Select(error => error.In(SIPrefix.Pico, Unit.Meter))
        .ToList();
    var medianError = errorsInPicometer.Median();
    var stdError = errorsInPicometer.Average(error => error.Square()).Sqrt();
    var errorLimit = Math.Min(medianError + 2 * stdError, 1000);

    // alignedPeptides and pdbFiles are index-aligned: one peptide was added per listed file.
    var validModels = alignedPeptides
        .Where((model, idx) => modelErrors[pdbFiles[idx]].In(SIPrefix.Pico, Unit.Meter) < errorLimit)
        .ToArray();

    var combinedPdb = PdbSerializer.Serialize("1234", validModels);
    File.WriteAllText(Path.Combine(outputDirectory, "pdb_combined.ent"), combinedPdb);

    var errorLines = modelErrors.Select(kvp => $"{kvp.Key};{kvp.Value.In(SIPrefix.Pico, Unit.Meter)}");
    File.WriteAllLines(Path.Combine(outputDirectory, "averageError.csv"), errorLines);
}