/// <summary>
/// Reads the atomic chains of a structure file and converts them to sequences by delegating
/// to the overload that works on already-loaded chains.
/// </summary>
/// <param name="atomsFilename">Path of the structure file; also used to derive the PDB id.</param>
/// <param name="chainIdWhiteList">Chain ids handed to the chain loader and to the delegate overload; may be null.</param>
/// <param name="padMissingBool">Forwarded unchanged to the delegate overload.</param>
/// <param name="startResSeq">Forwarded unchanged to the delegate overload.</param>
/// <param name="endResSeq">Forwarded unchanged to the delegate overload.</param>
/// <param name="outsidePaddingChar">Forwarded unchanged to the delegate overload.</param>
/// <param name="insidePaddingChar">Forwarded unchanged to the delegate overload.</param>
/// <returns>Whatever the delegate overload produces for the loaded chains.</returns>
public static List<Sequence> LoadStructureFile(
    string atomsFilename,
    char[] chainIdWhiteList = null,
    bool padMissingBool = true,
    int[] startResSeq = null,
    int[] endResSeq = null,
    char outsidePaddingChar = ' ',
    char insidePaddingChar = 'X')
{
    // Chains are loaded with the last flag set to true here, whereas the line-array overload
    // uses false. NOTE(review): the meaning of the -1, -1 and boolean arguments is defined by
    // ProteinBioClass.PdbAtomicChains - confirm there.
    var loadedChains = ProteinBioClass.PdbAtomicChains(atomsFilename, chainIdWhiteList, -1, -1, true);
    var structureId = ProteinBioClass.PdbIdFromPdbFilename(atomsFilename);

    return LoadStructureFile(
        loadedChains,
        structureId,
        chainIdWhiteList,
        padMissingBool,
        startResSeq,
        endResSeq,
        outsidePaddingChar,
        insidePaddingChar);
}
/// <summary>
/// Converts the lines of a structure file to sequences. Each chain's atom list is first
/// reduced to one atom per residue number, ordered by numeric residue number, and the result
/// is handed to the overload that works on already-loaded chains.
/// </summary>
/// <param name="structureFileLines">Lines of the structure file.</param>
/// <param name="pdbId">Identifier forwarded to the delegate overload.</param>
/// <param name="chainIdWhiteList">Chain ids handed to the chain loader and to the delegate overload; may be null.</param>
/// <param name="padMissingBool">Forwarded unchanged to the delegate overload.</param>
/// <param name="startResSeq">Forwarded unchanged to the delegate overload.</param>
/// <param name="endResSeq">Forwarded unchanged to the delegate overload.</param>
/// <param name="outsidePaddingChar">Forwarded unchanged to the delegate overload.</param>
/// <param name="insidePaddingChar">Forwarded unchanged to the delegate overload.</param>
/// <returns>Whatever the delegate overload produces for the loaded chains.</returns>
public static List<Sequence> LoadStructureFile(
    string[] structureFileLines,
    string pdbId = "",
    char[] chainIdWhiteList = null,
    bool padMissingBool = true,
    int[] startResSeq = null,
    int[] endResSeq = null,
    char outsidePaddingChar = ' ',
    char insidePaddingChar = 'X')
{
    var loadedChains = ProteinBioClass.PdbAtomicChains(structureFileLines, chainIdWhiteList, -1, -1, false);

    foreach (var chain in loadedChains.ChainList)
    {
        // Keep the first atom seen for every residue number; the residue number text is
        // parsed as an integer so that the ordering is numeric rather than lexical.
        var firstAtomPerResidue =
            from atom in chain.AtomList
            group atom by atom.resSeq.FieldValue into residue
            orderby int.Parse(residue.Key)
            select residue.First();

        chain.AtomList = firstAtomPerResidue.ToList();
    }

    return LoadStructureFile(
        loadedChains,
        pdbId,
        chainIdWhiteList,
        padMissingBool,
        startResSeq,
        endResSeq,
        outsidePaddingChar,
        insidePaddingChar);
}
/// <summary>
/// Aligns the atoms of one chain of a PDB entry with its sequence from the SEQRES fasta file
/// and with the sequence derived from the structure file itself.
/// </summary>
/// <param name="pdbId">PDB identifier; only its first four characters are used.</param>
/// <param name="chainId">Chain to align.</param>
/// <param name="first">Forwarded to <c>StructureToSequenceAlignment.Align</c>.</param>
/// <param name="last">Forwarded to <c>StructureToSequenceAlignment.Align</c>.</param>
/// <returns>The result of <c>StructureToSequenceAlignment.Align</c>.</returns>
public static StructureToSequenceAlignment.StructureToSequenceAlignmentResult GetSequence(string pdbId, char chainId, int first = -1, int last = -1)
{
    // NOTE(review): the chain id used to be taken from pdbId[4]; it is now a parameter, but
    // the id is still cut down to its first four characters - confirm that is still wanted.
    pdbId = pdbId.Substring(0, 4);

    var structurePath = @"c:\pdbe\" + pdbId + ".pdb";
    var seqresPath = @"c:\pdbe\pdb_seqres.fasta";

    // Sequence of the requested chain as listed in the fasta file.
    var seqresSequence = Sequence
        .LoadSequenceFile(seqresPath, new string[] { null, "", "protein" })
        .Where(a => a.IdSplit.PdbId.ToUpperInvariant() == pdbId.ToUpperInvariant() && a.IdSplit.ChainId == chainId)
        .First();

    // Sequence of the requested chain as derived from the structure file, padded with '-'.
    var structureSequence = Sequence
        .LoadStructureFile(structurePath, new[] { chainId }, true, null, null, '-', '-')
        .Where(a => a.IdSplit.PdbId.ToUpperInvariant() == pdbId.ToUpperInvariant() && a.IdSplit.ChainId == chainId)
        .First();

    var chainAtoms = ProteinBioClass
        .PdbAtomicChains(structurePath, new char[] { chainId })
        .ChainList
        .Where(a => a.ChainId == chainId)
        .First()
        .AtomList;

    return StructureToSequenceAlignment.Align(chainAtoms, seqresSequence.FullSequence, structureSequence.FullSequence, first, last);
}
/// <summary>
/// Command line entry point. Selects the ATOM records of a PDB file whose residue number lies
/// inside the receptor interface range listed for their chain in an interface-interface file,
/// and writes them to a single file, to one file per chain, or to the console.
/// </summary>
/// <param name="args">
/// pdb file, interface-interface file, optional chain ids ("*" for all) and optional output
/// file. When the output file name contains "?", one file per chain is written and the chain
/// id is appended to the file name (the "?" itself is removed).
/// </param>
static void Main(string[] args)
{
    var parameters = new string[, ]
    {
        { "[pdb_file]", "PDB ~v3.3 Protein Data Bank format file [*.pdb, *.ent]" },
        { "[interface-interface_file]", "interface-interface file" },
        { "[[chain_ids]]", "molecule chains to output [* for all]" },
        { "[[output_file]]", "optional output file. use ? for chain id. when ommitted, output to console" },
    };

    // Even flat indexes of the 2D array are the parameter names, odd ones the descriptions.
    var maxParamLength = parameters.Cast<string>().Where((a, i) => i % 2 == 0).Max(a => a.Length);
    var exeFilename = Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);

    if (args.Length == 0)
    {
        Console.WriteLine(exeFilename + @" is a program to extract ATOM records from a PDB file.");
        Console.WriteLine();
        Console.WriteLine(@"Usage:");
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" " + String.Join(" ", parameters.Cast<string>().Where((a, i) => i % 2 == 0)), maxParamLength + 2, 1));
        Console.WriteLine();
        Console.WriteLine(@"Example:");
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" ""c:\pdb_db\pdb1a12.pdb"" 8.0 ""c:\pdb_atoms\atoms1a12.pdb""", maxParamLength + 2, 1));
        Console.WriteLine();
        Console.WriteLine(@"Arguments:");

        for (var i = 0; i < parameters.GetLength(0); i++)
        {
            Console.WriteLine(@" " + parameters[i, 0].PadLeft(maxParamLength, ' ') + " " + ProteinBioClass.WrapConsoleText(parameters[i, 1], maxParamLength + 2, 1, false));
        }

        Console.WriteLine();
        return;
    }

    // load and echo arguments
    var p = 0;

    var pdbFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";
    pdbFilename = pdbFilename.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + pdbFilename);

    p++;
    // The path is used as given: upper-casing it would break lookups on case-sensitive file systems.
    var interfaceInterfaceFile = args.Length > p && args[p].Length > 0 ? args[p] : "";
    interfaceInterfaceFile = interfaceInterfaceFile.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + interfaceInterfaceFile);

    p++;
    var chainIds = args.Length > p && args[p].Length > 0 ? args[p] : "";
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + chainIds);

    p++;
    var outputFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";
    outputFilename = outputFilename.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + outputFilename);
    Console.WriteLine();

    // File.Exists is false for an empty name, so these checks also cover missing arguments.
    if (!File.Exists(pdbFilename))
    {
        Console.WriteLine("; File not found: " + pdbFilename);
        return;
    }

    if (!File.Exists(interfaceInterfaceFile))
    {
        Console.WriteLine("; File not found: " + interfaceInterfaceFile);
        return;
    }

    if (chainIds.Contains('*'))
    {
        chainIds = null;
    }

    var chainIdWhiteList = !string.IsNullOrEmpty(chainIds)
        ? chainIds.ToUpperInvariant().Split(new char[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries)
        : null;

    // One residue range per receptor chain: smallest interface start to largest interface end.
    var interfaceData = ComplexInterfaces.ComplexInterfaces.InterfaceInterfaceData.Load(interfaceInterfaceFile);
    var interfaceDataChains = interfaceData.Select(a => a.ReceptorChainId).Distinct().ToList();
    var interfaceDataStart = interfaceDataChains.Select(a => interfaceData.Where(b => b.ReceptorChainId == a).Min(b => b.ReceptorInterfaceResSeqStart)).ToList();
    var interfaceDataEnd = interfaceDataChains.Select(a => interfaceData.Where(b => b.ReceptorChainId == a).Max(b => b.ReceptorInterfaceResSeqEnd)).ToList();

    var terminatedChains = new HashSet<string>();
    var result = new List<Tuple<string, string>>();

    foreach (var line in File.ReadAllLines(pdbFilename))
    {
        // Column 22 (index 21) holds the chain id; shorter lines cannot be classified.
        if (line.Length < 22)
        {
            continue;
        }

        if (line.StartsWith("TER ", StringComparison.OrdinalIgnoreCase))
        {
            terminatedChains.Add(("" + line[21]).ToUpperInvariant());
        }

        if (!line.StartsWith("ATOM ", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        var chainId = ("" + line[21]).ToUpperInvariant();

        // Anything after a chain's TER record is ignored.
        if (terminatedChains.Contains(chainId))
        {
            continue;
        }

        if (chainIdWhiteList != null && chainIdWhiteList.Length > 0 && !chainIdWhiteList.Contains(chainId))
        {
            continue;
        }

        var interfaceDataChainIndex = interfaceDataChains.IndexOf(chainId[0]);

        if (interfaceDataChainIndex < 0)
        {
            continue;
        }

        // The residue number occupies columns 23-26; skip truncated or malformed records
        // instead of aborting the whole run.
        int resSeq;

        if (line.Length < 26 || !int.TryParse(line.Substring(22, 4).Trim(), out resSeq))
        {
            continue;
        }

        if (resSeq >= interfaceDataStart[interfaceDataChainIndex] && resSeq <= interfaceDataEnd[interfaceDataChainIndex])
        {
            result.Add(new Tuple<string, string>(chainId, line));
        }
    }

    if (string.IsNullOrWhiteSpace(outputFilename))
    {
        // Print the ATOM line itself, not the (chain, line) tuple.
        foreach (var entry in result)
        {
            Console.WriteLine(entry.Item2);
        }

        Console.WriteLine();
        return;
    }

    var outputTemplate = outputFilename.Replace("?", "");
    var outputDirectory = Path.GetDirectoryName(outputTemplate);

    // A bare file name has no directory part, and CreateDirectory rejects an empty path.
    if (!string.IsNullOrEmpty(outputDirectory))
    {
        Directory.CreateDirectory(outputDirectory);
    }

    if (!outputFilename.Contains("?"))
    {
        File.WriteAllLines(outputFilename, result.Select(a => a.Item2).ToList());
        return;
    }

    foreach (var chain in result.Select(a => a.Item1).Distinct().ToList())
    {
        // Path.Combine keeps a bare file name relative instead of turning it into "\name".
        var chainFilename = Path.Combine(
            outputDirectory ?? "",
            Path.GetFileNameWithoutExtension(outputTemplate) + chain + Path.GetExtension(outputTemplate));

        File.WriteAllLines(chainFilename, result.Where(a => a.Item1 == chain).Select(a => a.Item2).ToList());
    }
}
/// <summary>
/// Command line entry point. Converts a structure file to fasta text and either prints it or
/// writes it to a file, optionally appending to an existing file.
/// </summary>
/// <param name="args">
/// structure file, optional pad-missing flag (Y/N, default Y), optional output fasta file,
/// optional append (A) or overwrite (O) flag. Both flags are case-insensitive.
/// </param>
public static void Main(string[] args)
{
    var parameters = new string[, ]
    {
        { "[pdb_or_atoms_file]", "standard crystal pdb file or output from the ComplexAtoms program" },
        { "[[pad_missing]]", "Y or N (default: Y)" },
        { "[[output_fasta_file]]", "optional output fasta file. when ommitted, output to console" },
        { "[[append_or_overwrite]]", "optional (A) append or (O) overwrite (default: overwrite)" },
    };

    // Even flat indexes of the 2D array are the parameter names, odd ones the descriptions.
    var maxParamLength = parameters.Cast<string>().Where((a, i) => i % 2 == 0).Max(a => a.Length);
    var exeFilename = Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);

    if (args.Length < 1)
    {
        Console.WriteLine(exeFilename + @" is a program to extract the protein amino acid fasta sequence from protein structure pdb file.");
        Console.WriteLine();
        Console.WriteLine(@"Usage:");
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" " + String.Join(" ", parameters.Cast<string>().Where((a, i) => i % 2 == 0)), maxParamLength + 2, 1));
        Console.WriteLine();
        Console.WriteLine(@"Example:");
        // The example follows the declared argument order: the pad flag precedes the output file.
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" ""c:\pdb_db\atoms\atoms1a12.pdb"" Y ""c:\pdb_db\fasta_from_pdb\atoms1a12.pdb.fasta""", maxParamLength + 2, 1));
        Console.WriteLine();
        Console.WriteLine(@"Arguments:");

        for (var i = 0; i < parameters.GetLength(0); i++)
        {
            Console.WriteLine(@" " + parameters[i, 0].PadLeft(maxParamLength, ' ') + " " + ProteinBioClass.WrapConsoleText(parameters[i, 1], maxParamLength + 2, 1, false));
        }

        Console.WriteLine();
        return;
    }

    // load arguments
    var p = 0;

    var atomsFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";
    atomsFilename = atomsFilename.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + atomsFilename);

    p++;
    var padMissing = args.Length > p && args[p].Length > 0 ? args[p] : "Y";
    padMissing = padMissing.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + padMissing);

    // Compare case-insensitively so that "n" is not silently treated as the default "Y".
    // Anything other than N keeps the default (padding enabled).
    var padMissingBool = !string.Equals(padMissing, "N", StringComparison.OrdinalIgnoreCase);

    p++;
    var outputFastaFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";
    outputFastaFilename = outputFastaFilename.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + outputFastaFilename);

    p++;
    var appendOrOverwrite = args.Length > p && args[p].Length > 0 ? args[p] : "";
    appendOrOverwrite = appendOrOverwrite.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + appendOrOverwrite);

    appendOrOverwrite = appendOrOverwrite.ToUpperInvariant();

    if (!(string.IsNullOrWhiteSpace(appendOrOverwrite) || appendOrOverwrite == "O" || appendOrOverwrite == "A"))
    {
        // Say why nothing happens instead of exiting silently.
        Console.WriteLine("; Invalid append/overwrite option (expected A or O): " + appendOrOverwrite);
        return;
    }

    Console.WriteLine();

    var sequenceList = Sequence.LoadStructureFile(atomsFilename, null, padMissingBool);
    var output = Sequence.GetAsFasta(sequenceList);

    if (string.IsNullOrWhiteSpace(outputFastaFilename))
    {
        Console.WriteLine();
        Console.WriteLine(output);
        Console.WriteLine();
        return;
    }

    // A bare file name has no directory part, and CreateDirectory rejects an empty path.
    var outputDirectory = Path.GetDirectoryName(outputFastaFilename);

    if (!string.IsNullOrEmpty(outputDirectory))
    {
        Directory.CreateDirectory(outputDirectory);
    }

    if (appendOrOverwrite == "A" && File.Exists(outputFastaFilename))
    {
        var existing = File.ReadAllText(outputFastaFilename);

        // Make sure the new records start on their own line.
        if (!existing.EndsWith(Environment.NewLine))
        {
            existing = existing + Environment.NewLine;
        }

        output = existing + output;
    }

    File.WriteAllText(outputFastaFilename, output);
}
/// <summary>
/// Batch entry point. For every multibinding record it loads the contacts between the two
/// interacting chains of both PDB entries, derives for each of the four chains the full
/// interface (smallest to largest contacting residue number) and the residue window holding
/// the most contacts, stores the results on the record, and writes all records to
/// <c>c:\multibinding\MultiBinding_parsed_results.csv</c>.
/// </summary>
/// <param name="args">Not used; all input and output paths are fixed.</param>
static void Main(string[] args)
{
    // Width used for the "Best50" window: a window covers [start, start + 50].
    const int interfaceTargetLength = 50;

    // The indexes of data, contactsPartner1 and contactsPartner2 all match.
    var data = MultiBindingInterface.LoadAuthorData(@"c:\multibinding\multibinding.csv", @"c:\multibinding\multibinding_homolog_clusters.csv");

    // Contacts between the two interacting chains of the first PDB entry, oriented so that
    // Atom1 belongs to chain 1 and Atom2 to chain 2.
    var contactsPartner1 = data.Select(a =>
    {
        var x = ProteinBioClass.AtomPair.LoadAtomPairList(@"C:\multibinding\contacts\contacts_pdb" + a.InteractionChainsPdb1.ToUpperInvariant() + ".pdb")
            .Where(b =>
                (b.Atom1.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb1Chain1 && b.Atom2.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb1Chain2) ||
                (b.Atom1.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb1Chain2 && b.Atom2.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb1Chain1))
            .ToList();

        x = x.Select(c =>
        {
            if (c.Atom1.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb1Chain2)
            {
                c.SwapAtoms();
            }

            return c;
        }).ToList();

        return x;
    }).ToList();

    // Same for the second PDB entry.
    var contactsPartner2 = data.Select(a =>
    {
        var x = ProteinBioClass.AtomPair.LoadAtomPairList(@"C:\multibinding\contacts\contacts_pdb" + a.InteractionChainsPdb2.ToUpperInvariant() + ".pdb")
            .Where(b =>
                (b.Atom1.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb2Chain1 && b.Atom2.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb2Chain2) ||
                (b.Atom1.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb2Chain2 && b.Atom2.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb2Chain1))
            .ToList();

        x = x.Select(c =>
        {
            if (c.Atom1.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb2Chain2)
            {
                c.SwapAtoms();
            }

            return c;
        }).ToList();

        return x;
    }).ToList();

    for (var index = 0; index < data.Count; index++)
    {
        var d = data[index];
        var cp1 = contactsPartner1[index];
        var cp2 = contactsPartner2[index];

        // Records without contacts in either complex are left untouched.
        if (d == null || cp1 == null || cp2 == null || cp1.Count == 0 || cp2.Count == 0)
        {
            continue;
        }

        var pdb1Filename = @"c:\multibinding\pdb" + d.InteractionChainsPdb1 + ".pdb";
        var pdb2Filename = @"c:\multibinding\pdb" + d.InteractionChainsPdb2 + ".pdb";

        var p1c1_pdb = ProteinBioClass.PdbAtomicChains(pdb1Filename, new char[] { d.InteractionChainsPdb1Chain1 }, -1, -1, true);
        var p1c2_pdb = ProteinBioClass.PdbAtomicChains(pdb1Filename, new char[] { d.InteractionChainsPdb1Chain2 }, -1, -1, true);
        var p2c1_pdb = ProteinBioClass.PdbAtomicChains(pdb2Filename, new char[] { d.InteractionChainsPdb2Chain1 }, -1, -1, true);
        var p2c2_pdb = ProteinBioClass.PdbAtomicChains(pdb2Filename, new char[] { d.InteractionChainsPdb2Chain2 }, -1, -1, true);

        var cp1a1 = SummarizeChainInterface(
            p1c1_pdb.ChainList.First().AtomList,
            cp1.Select(a => int.Parse(a.Atom1.resSeq.FieldValue)).ToList(),
            interfaceTargetLength,
            a => int.Parse(a.resSeq.FieldValue),
            a => AminoAcidConversions.AminoAcidNameToCode1L(a.resName.FieldValue));

        var cp1a2 = SummarizeChainInterface(
            p1c2_pdb.ChainList.First().AtomList,
            cp1.Select(a => int.Parse(a.Atom2.resSeq.FieldValue)).ToList(),
            interfaceTargetLength,
            a => int.Parse(a.resSeq.FieldValue),
            a => AminoAcidConversions.AminoAcidNameToCode1L(a.resName.FieldValue));

        var cp2a1 = SummarizeChainInterface(
            p2c1_pdb.ChainList.First().AtomList,
            cp2.Select(a => int.Parse(a.Atom1.resSeq.FieldValue)).ToList(),
            interfaceTargetLength,
            a => int.Parse(a.resSeq.FieldValue),
            a => AminoAcidConversions.AminoAcidNameToCode1L(a.resName.FieldValue));

        var cp2a2 = SummarizeChainInterface(
            p2c2_pdb.ChainList.First().AtomList,
            cp2.Select(a => int.Parse(a.Atom2.resSeq.FieldValue)).ToList(),
            interfaceTargetLength,
            a => int.Parse(a.resSeq.FieldValue),
            a => AminoAcidConversions.AminoAcidNameToCode1L(a.resName.FieldValue));

        d.Pdb1Chain1InterfaceStart = cp1a1.Start;
        d.Pdb1Chain1InterfaceEnd = cp1a1.End;
        d.Pdb1Chain1TotalInteractions = cp1a1.TotalInteractions;
        d.Pdb1Chain1InterfaceSequence = cp1a1.Sequence;
        d.Pdb1Chain1InterfaceMask = cp1a1.Mask;
        d.Pdb1Chain1Best50InterfaceStart = cp1a1.BestStart;
        d.Pdb1Chain1Best50InterfaceEnd = cp1a1.BestEnd;
        d.Pdb1Chain1Best50TotalInteractions = cp1a1.BestTotalInteractions;
        d.Pdb1Chain1Best50InterfaceSequence = cp1a1.BestSequence;
        d.Pdb1Chain1Best50InterfaceMask = cp1a1.BestMask;

        d.Pdb1Chain2InterfaceStart = cp1a2.Start;
        d.Pdb1Chain2InterfaceEnd = cp1a2.End;
        d.Pdb1Chain2TotalInteractions = cp1a2.TotalInteractions;
        d.Pdb1Chain2InterfaceSequence = cp1a2.Sequence;
        d.Pdb1Chain2InterfaceMask = cp1a2.Mask;
        d.Pdb1Chain2Best50InterfaceStart = cp1a2.BestStart;
        d.Pdb1Chain2Best50InterfaceEnd = cp1a2.BestEnd;
        d.Pdb1Chain2Best50TotalInteractions = cp1a2.BestTotalInteractions;
        d.Pdb1Chain2Best50InterfaceSequence = cp1a2.BestSequence;
        d.Pdb1Chain2Best50InterfaceMask = cp1a2.BestMask;

        d.Pdb2Chain1InterfaceStart = cp2a1.Start;
        d.Pdb2Chain1InterfaceEnd = cp2a1.End;
        d.Pdb2Chain1TotalInteractions = cp2a1.TotalInteractions;
        d.Pdb2Chain1InterfaceSequence = cp2a1.Sequence;
        d.Pdb2Chain1InterfaceMask = cp2a1.Mask;
        d.Pdb2Chain1Best50InterfaceStart = cp2a1.BestStart;
        d.Pdb2Chain1Best50InterfaceEnd = cp2a1.BestEnd;
        d.Pdb2Chain1Best50TotalInteractions = cp2a1.BestTotalInteractions;
        d.Pdb2Chain1Best50InterfaceSequence = cp2a1.BestSequence;
        d.Pdb2Chain1Best50InterfaceMask = cp2a1.BestMask;

        d.Pdb2Chain2InterfaceStart = cp2a2.Start;
        d.Pdb2Chain2InterfaceEnd = cp2a2.End;
        d.Pdb2Chain2TotalInteractions = cp2a2.TotalInteractions;
        d.Pdb2Chain2InterfaceSequence = cp2a2.Sequence;
        d.Pdb2Chain2InterfaceMask = cp2a2.Mask;
        d.Pdb2Chain2Best50InterfaceStart = cp2a2.BestStart;
        d.Pdb2Chain2Best50InterfaceEnd = cp2a2.BestEnd;
        d.Pdb2Chain2Best50TotalInteractions = cp2a2.BestTotalInteractions;
        d.Pdb2Chain2Best50InterfaceSequence = cp2a2.BestSequence;
        d.Pdb2Chain2Best50InterfaceMask = cp2a2.BestMask;
    }

    var output = data.Select(a => a.ToString()).ToList();
    output.Insert(0, MultiBindingInterface.Header());
    File.WriteAllLines(@"c:\multibinding\MultiBinding_parsed_results.csv", output);
}

/// <summary>Interface description of one chain: the full contact range and the best window.</summary>
private sealed class ChainInterfaceSummary
{
    public int Start { get; set; }
    public int End { get; set; }
    public int TotalInteractions { get; set; }
    public string Sequence { get; set; }
    public string Mask { get; set; }
    public int BestStart { get; set; }
    public int BestEnd { get; set; }
    public int BestTotalInteractions { get; set; }
    public string BestSequence { get; set; }
    public string BestMask { get; set; }
}

/// <summary>Builds the full-range and best-window interface description of one chain.</summary>
/// <param name="chainAtoms">Atoms of the chain.</param>
/// <param name="contactResSeq">Residue number of the chain's atom in every contact; must not be empty.</param>
/// <param name="targetLength">Width of the best-window search.</param>
/// <param name="resSeqOf">Reads the residue number of an atom.</param>
/// <param name="codeOf">Reads the one-letter residue code of an atom.</param>
private static ChainInterfaceSummary SummarizeChainInterface<TAtom>(IEnumerable<TAtom> chainAtoms, List<int> contactResSeq, int targetLength, Func<TAtom, int> resSeqOf, Func<TAtom, object> codeOf)
{
    var start = contactResSeq.Min();
    var end = contactResSeq.Max();
    var best = FindBestInterfaceWindow(contactResSeq, targetLength);

    var sequence = InterfaceSequence(chainAtoms, start, end, resSeqOf, codeOf);
    var mask = InteractionMask(sequence.Length, start, contactResSeq);
    var bestSequence = InterfaceSequence(chainAtoms, best.Item1, best.Item2, resSeqOf, codeOf);

    return new ChainInterfaceSummary
    {
        Start = start,
        End = end,
        // The total is the number of marked mask positions, not the number of contacts.
        TotalInteractions = mask.Count(c => c == 'X'),
        Sequence = sequence,
        Mask = mask,
        BestStart = best.Item1,
        BestEnd = best.Item2,
        BestTotalInteractions = best.Item3,
        BestSequence = bestSequence,
        BestMask = InteractionMask(bestSequence.Length, best.Item1, contactResSeq),
    };
}

/// <summary>
/// Finds the residue window [start, start + targetLength] (clipped to the largest contact
/// residue) that contains the most contacts. Returns (start, end, contact count).
/// </summary>
private static Tuple<int, int, int> FindBestInterfaceWindow(List<int> contactResSeq, int targetLength)
{
    var lowest = contactResSeq.Min();
    var highest = contactResSeq.Max();

    // The whole interface already fits into one window.
    if (highest - lowest <= targetLength)
    {
        return new Tuple<int, int, int>(lowest, highest, contactResSeq.Count);
    }

    var bestCount = -1;
    var bestWindows = new List<Tuple<int, int, int>>();

    for (var start = lowest - targetLength; start <= highest; start++)
    {
        var end = Math.Min(start + targetLength, highest);

        // Count the contacts of the candidate window itself (not of the best window so far).
        var count = contactResSeq.Count(r => r >= start && r <= end);

        if (count > bestCount)
        {
            bestCount = count;
            bestWindows.Clear();
        }

        if (count == bestCount)
        {
            bestWindows.Add(new Tuple<int, int, int>(start, end, count));
        }

        if (start + targetLength >= highest)
        {
            break;
        }
    }

    // With more than two equally good windows take the middle one, otherwise the first.
    return bestWindows.Count > 2 ? bestWindows[bestWindows.Count / 2] : bestWindows[0];
}

/// <summary>Concatenates the residue codes of the atoms whose residue number lies in [first, last], ordered by residue number.</summary>
private static string InterfaceSequence<TAtom>(IEnumerable<TAtom> chainAtoms, int first, int last, Func<TAtom, int> resSeqOf, Func<TAtom, object> codeOf)
{
    return string.Join("", chainAtoms
        .Where(a =>
        {
            var l = resSeqOf(a);
            return l >= first && l <= last;
        })
        .OrderBy(resSeqOf)
        .Select(codeOf)
        .ToList());
}

/// <summary>
/// Builds a mask of the given length in which position i is 'X' when residue number
/// firstResSeq + i takes part in a contact and '_' otherwise.
/// </summary>
private static string InteractionMask(int length, int firstResSeq, List<int> contactResSeq)
{
    var contacting = new HashSet<int>(contactResSeq);
    var mask = new char[length];

    for (var i = 0; i < length; i++)
    {
        mask[i] = contacting.Contains(i + firstResSeq) ? 'X' : '_';
    }

    return new string(mask);
}
/// <summary>
/// Command line entry point. Reads the mutation description from the arguments, echoes it,
/// and passes it to <c>MutateFastaSequenceSave</c>.
/// </summary>
/// <param name="args">
/// input fasta file, chain ids, start positions, end positions, offsets, mutation sequences
/// (all comma separated lists) and an optional output fasta file.
/// </param>
static void Main(string[] args)
{
    // MutateSequence example.fasta start end mutation original (will find closest to start/end in case of sequence/structure index misalignment)
    var parameters = new string[, ]
    {
        { "[input_fasta_file]", "fasta file with sequence to mutate" },
        { "[chain_ids]", "chain ids to mutate" },
        { "[start_positions]", "mutation start position (one based)" },
        { "[end_positions]", "mutation end position (one based)" },
        { "[offsets]", "offsets (for where pdb index doesn't match fasta sequence index) (one based)" },
        { "[mutation_sequence]", "new amino acids to overwrite with" },
        { "[[output_fasta_file]]", "optional output fasta file. when ommitted, output to console" },
    };

    // Everything up to and including the mutation sequence is mandatory.
    const int requiredArgumentCount = 6;

    // Even flat indexes of the 2D array are the parameter names, odd ones the descriptions.
    var maxParamLength = parameters.Cast<string>().Where((a, i) => i % 2 == 0).Max(a => a.Length);
    var exeFilename = Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);

    if (args.Length < requiredArgumentCount)
    {
        Console.WriteLine(exeFilename + @" is a program to mutate (substitute) a subsequence of a protein amino acid sequence within a fasta file.");
        Console.WriteLine();
        Console.WriteLine(@"Usage:");
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" " + String.Join(" ", parameters.Cast<string>().Where((a, i) => i % 2 == 0)), maxParamLength + 2, 1));
        Console.WriteLine();
        Console.WriteLine(@"Example:");
        // The example lists every mandatory argument, including the offsets.
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" ""c:\pdb_db\fasta\fasta_pdb1a12.pdb.fasta"" A,B,C 10,76,100 15,77,102 0,0,0 GBVBGA,AA,GHG ""c:\pdb_db\fasta_mutated\mutated_pdb1a12.pdb.fasta""", maxParamLength + 2, 1));
        Console.WriteLine();
        Console.WriteLine(@"Arguments:");

        for (var i = 0; i < parameters.GetLength(0); i++)
        {
            Console.WriteLine(@" " + parameters[i, 0].PadLeft(maxParamLength, ' ') + " " + ProteinBioClass.WrapConsoleText(parameters[i, 1], maxParamLength + 2, 1, false));
        }

        Console.WriteLine();
        return;
    }

    var p = 0;

    // Reads the next argument (empty when absent), strips double quotes and echoes it.
    Func<string> readNextArgument = () =>
    {
        var value = args.Length > p && args[p].Length > 0 ? args[p] : "";
        value = value.Replace("\"", "");
        Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + value);
        p++;
        return value;
    };

    // Parses a comma separated list of integers; reports the problem and returns null when
    // an entry is not a number, instead of failing with an unhandled exception.
    Func<string, string, int[]> parseIntegerList = (name, text) =>
    {
        var parts = text.Split(',');
        var values = new int[parts.Length];

        for (var i = 0; i < parts.Length; i++)
        {
            if (!int.TryParse(parts[i], out values[i]))
            {
                Console.WriteLine("; Invalid " + name + ": " + text);
                return null;
            }
        }

        return values;
    };

    var input_fasta_file = readNextArgument();

    var chain_ids = readNextArgument();
    var chain_ids_split = chain_ids.ToUpperInvariant().Split(',');

    var start_position_split = parseIntegerList("[start_positions]", readNextArgument());

    if (start_position_split == null)
    {
        return;
    }

    var end_position_split = parseIntegerList("[end_positions]", readNextArgument());

    if (end_position_split == null)
    {
        return;
    }

    var offset_position_split = parseIntegerList("[offsets]", readNextArgument());

    if (offset_position_split == null)
    {
        return;
    }

    var mutation_sequence = readNextArgument();
    var mutation_sequence_split = mutation_sequence.Split(',');

    var output_fasta_file = readNextArgument();
    Console.WriteLine();

    MutateFastaSequenceSave(input_fasta_file, chain_ids_split, start_position_split, end_position_split, offset_position_split, mutation_sequence_split, output_fasta_file);
}
/// <summary>
/// Extracts fixed-length receptor/ligand interface fragments from PDB contact data and
/// clusters them by amino-acid and secondary-structure similarity.
/// Phases:
///   1. Load (or, when the cache is empty, rebuild in parallel) the interface fragment list.
///   2. Score every pair of distinct receptor (sequence, secondary structure) fragments
///      position-wise with a +/-1 shift tolerance and write one script per sufficiently
///      large cluster (the emitted commands look like PyMOL syntax - confirm).
///   3. Compute alignment-based identities via <c>Align</c>, count neighbours per identity
///      threshold, cluster transitively, and write the CSV reports under c:\r\.
/// </summary>
/// <param name="args">Not read; every input and output path is hard-coded.</param>
static void Main(string[] args)
{
    var requiredInterfaceLengths = new int[] { 9 };

    var sequenceFilename = @"C:\pdbe\pdb_seqres.fasta";
    var dsspFilename = @"C:\pdbe\ss.txt";

    var sequenceListFromFastaFile = Sequence.LoadSequenceFile(sequenceFilename, new string[] { null, "", "protein" });
    var dsspList = Sequence.LoadSequenceFile(dsspFilename, new string[] { null, "", "protein" });
    dsspList = dsspList.Where(a => a.IdSplit.Description == "secstr").ToList();

    // Non-empty contact files, smallest first.
    var pdbFiles = Directory.GetFiles(@"c:\pdbe\contacts_all\", "????.pdb", SearchOption.TopDirectoryOnly).ToList();
    var pdbFileLengths = pdbFiles.Select(a => new Tuple<string, long>(a, new FileInfo(a).Length)).ToList();
    pdbFileLengths = pdbFileLengths.Where(a => a.Item2 > 0).ToList();
    pdbFileLengths = pdbFileLengths.OrderBy(a => a.Item2).ToList();

    var ifList = InterfaceFragment.Load(@"c:\r\if.csv");
    ifList = ifList.Where(a => requiredInterfaceLengths.Contains(a.FragmentLength)).ToList();

    if (ifList.Count == 0)
    {
        // The global cache holds nothing usable: rebuild the fragments structure by structure.
        var pdbIdList = pdbFileLengths.Select(a => a.Item1).Select(Path.GetFileNameWithoutExtension).Select(a => a.ToUpperInvariant()).ToList();
        var taskList1 = new List<Task<List<InterfaceFragment>>>();

        for (var pdbIndex = 0; pdbIndex < pdbIdList.Count; pdbIndex++)
        {
            // Declared inside the loop body so that each task captures its own copy.
            var pdbId = pdbIdList[pdbIndex];
            Console.WriteLine(pdbId + " " + (pdbIndex + 1) + "/" + pdbIdList.Count);

            // Keep at most one running task per processor.
            while (taskList1.Count(a => !a.IsCompleted) >= Environment.ProcessorCount)
            {
                Task.WaitAny(taskList1.Where(a => !a.IsCompleted).ToArray<Task>());
            }

            taskList1.Add(Task.Run(() =>
            {
                var pdbResult = new List<InterfaceFragment>();
                var atomsFilename = @"c:\pdbe\atoms_all\" + pdbId + ".pdb";
                var contactsFilename = @"c:\pdbe\contacts_all\" + pdbId + ".pdb";
                var interfaceCsv = @"c:\r\interface_9\" + pdbId + @".csv";

                // Per-structure cache hit.
                if (File.Exists(interfaceCsv))
                {
                    return InterfaceFragment.Load(interfaceCsv);
                }

                if (!File.Exists(atomsFilename) || !File.Exists(contactsFilename))
                {
                    return null;
                }

                if (new FileInfo(atomsFilename).Length == 0 || new FileInfo(contactsFilename).Length == 0)
                {
                    return null;
                }

                var contacts = ProteinBioClass.AtomPair.LoadAtomPairList(contactsFilename);
                var contactChainIds = contacts.SelectMany(a => new char[] { a.Atom1.chainID.FieldValue[0], a.Atom2.chainID.FieldValue[0] }).Distinct().ToList();
                var proteinFileChains = ProteinBioClass.PdbAtomicChains(atomsFilename, contactChainIds.ToArray(), -1, -1, false);

                // The lists below are all indexed in parallel with proteinFileChains.ChainList.
                var sequenceListFromPdbFile = proteinFileChains.ChainList.Select(pdbChain => Sequence.LoadStructureFile(atomsFilename, new[] { pdbChain.ChainId }, true, null, null, '-', '-')).ToList();
                var dsspFromFastaFile = proteinFileChains.ChainList.Select(pdbChain => dsspList.FirstOrDefault(a => a.IdSplit.PdbId.ToUpperInvariant() == pdbId.Substring(0, a.IdSplit.PdbId.Length).ToUpperInvariant() && a.IdSplit.ChainId == pdbChain.ChainId)?.FullSequence).ToList();
                var sequenceFromFastaFile = proteinFileChains.ChainList.Select(pdbChain => sequenceListFromFastaFile.FirstOrDefault(a => a.IdSplit.PdbId.ToUpperInvariant() == pdbId.Substring(0, a.IdSplit.PdbId.Length).ToUpperInvariant() && a.IdSplit.ChainId == pdbChain.ChainId)?.FullSequence).ToList();
                var sequenceFromPdbFile = proteinFileChains.ChainList.Select((pdbChain, i) => sequenceListFromPdbFile[i].FirstOrDefault(a => (string.IsNullOrWhiteSpace(a.IdSplit.PdbId) || a.IdSplit.PdbId.ToUpperInvariant() == pdbId.Substring(0, a.IdSplit.PdbId.Length).ToUpperInvariant()) && a.IdSplit.ChainId == pdbChain.ChainId)?.FullSequence).ToList();
                var structureToSequenceAlignmentResult = proteinFileChains.ChainList.Select((pdbChain, i) => ProteinBioClass.StructureToSequenceAlignment.Align(pdbChain.AtomList, sequenceFromFastaFile[i], sequenceFromPdbFile[i])).ToList();

                if (structureToSequenceAlignmentResult.Any(a => a == null))
                {
                    return null;
                }

                // Mirror the gaps of the aligned FASTA sequence into the secondary-structure string
                // so that both can be sliced with the same offsets.
                for (var chainIndex3 = 0; chainIndex3 < proteinFileChains.ChainList.Count; chainIndex3++)
                {
                    var dssp = dsspFromFastaFile[chainIndex3];
                    if (dssp == null)
                    {
                        // No DSSP record for this chain; such chains are skipped further down.
                        continue;
                    }

                    var aligned = structureToSequenceAlignmentResult[chainIndex3].FastaSequenceAligned;

                    // Stop once the gap position lies beyond the end of the DSSP string:
                    // Insert would throw there and no later position can become valid again.
                    for (var i = 0; i < aligned.Length && i <= dssp.Length; i++)
                    {
                        if (aligned[i] == '-')
                        {
                            dssp = dssp.Insert(i, "-");
                        }
                    }

                    dsspFromFastaFile[chainIndex3] = dssp;
                }

                foreach (var contactChainId1 in contactChainIds)
                {
                    var chainIndex1 = proteinFileChains.ChainList.FindIndex(g => g.ChainId == contactChainId1);

                    // FindIndex yields -1 when the contact chain is absent from the atoms file.
                    if (chainIndex1 < 0 || structureToSequenceAlignmentResult[chainIndex1] == null)
                    {
                        continue;
                    }

                    if (string.IsNullOrWhiteSpace(sequenceFromPdbFile[chainIndex1]) || string.IsNullOrWhiteSpace(sequenceFromFastaFile[chainIndex1]) || string.IsNullOrWhiteSpace(dsspFromFastaFile[chainIndex1]))
                    {
                        continue;
                    }

                    var chainResult = new List<InterfaceFragment>();
                    var aligned1 = structureToSequenceAlignmentResult[chainIndex1].FastaSequenceAligned;
                    var dssp1 = dsspFromFastaFile[chainIndex1];

                    // Orient every contact so that Atom1 belongs to the current (receptor) chain.
                    var chainContacts = contacts.Where(a => a.Atom1.chainID.FieldValue[0] == contactChainId1 || a.Atom2.chainID.FieldValue[0] == contactChainId1).Select(a => a.Atom1.chainID.FieldValue[0] == contactChainId1 ? a : a.SwapAtoms()).ToList();
                    chainContacts = chainContacts.OrderBy(a => int.Parse(a.Atom1.resSeq.FieldValue)).ThenBy(a => int.Parse(a.Atom1.serial.FieldValue)).ToList();

                    foreach (var atomPair in chainContacts)
                    {
                        var contactChainId2 = atomPair.Atom2.chainID.FieldValue[0];
                        var chainIndex2 = proteinFileChains.ChainList.FindIndex(g => g.ChainId == contactChainId2);

                        if (chainIndex2 < 0 || structureToSequenceAlignmentResult[chainIndex2] == null)
                        {
                            continue;
                        }

                        if (string.IsNullOrWhiteSpace(sequenceFromPdbFile[chainIndex2]) || string.IsNullOrWhiteSpace(sequenceFromFastaFile[chainIndex2]) || string.IsNullOrWhiteSpace(dsspFromFastaFile[chainIndex2]))
                        {
                            continue;
                        }

                        var resSeq1 = int.Parse(atomPair.Atom1.resSeq.FieldValue);
                        var resSeq2 = int.Parse(atomPair.Atom2.resSeq.FieldValue);
                        var resSeqIndex1a = structureToSequenceAlignmentResult[chainIndex1].AlignmentMap.ToList().FindIndex(a => a == resSeq1);
                        var resSeqIndex2a = structureToSequenceAlignmentResult[chainIndex2].AlignmentMap.ToList().FindIndex(a => a == resSeq2);

                        // A residue that cannot be located in the alignment map has no meaningful
                        // window; previously the -1 was clamped to the start of the chain.
                        if (resSeqIndex1a < 0 || resSeqIndex2a < 0)
                        {
                            continue;
                        }

                        var aligned2 = structureToSequenceAlignmentResult[chainIndex2].FastaSequenceAligned;
                        var dssp2 = dsspFromFastaFile[chainIndex2];

                        foreach (var requiredInterfaceLength in requiredInterfaceLengths)
                        {
                            // Chains shorter than the window cannot yield a full-length fragment.
                            if (aligned1.Length < requiredInterfaceLength || aligned2.Length < requiredInterfaceLength)
                            {
                                continue;
                            }

                            // Centre the window on the contact residue, then clamp it into the sequence.
                            var resSeqIndex1 = Math.Max(0, resSeqIndex1a - (requiredInterfaceLength / 2));
                            var resSeqIndex2 = Math.Max(0, resSeqIndex2a - (requiredInterfaceLength / 2));

                            if (resSeqIndex1 + requiredInterfaceLength > aligned1.Length)
                            {
                                resSeqIndex1 = aligned1.Length - requiredInterfaceLength;
                            }

                            if (resSeqIndex2 + requiredInterfaceLength > aligned2.Length)
                            {
                                resSeqIndex2 = aligned2.Length - requiredInterfaceLength;
                            }

                            // The DSSP string may be shorter than the aligned sequence; skip rather
                            // than let Substring throw.
                            if (resSeqIndex1 + requiredInterfaceLength > dssp1.Length || resSeqIndex2 + requiredInterfaceLength > dssp2.Length)
                            {
                                continue;
                            }

                            var interfaceSuper1 = aligned1.Substring(resSeqIndex1, requiredInterfaceLength);
                            var interfaceSuper2 = aligned2.Substring(resSeqIndex2, requiredInterfaceLength);
                            var dsspSuper1 = dssp1.Substring(resSeqIndex1, requiredInterfaceLength);
                            var dsspSuper2 = dssp2.Substring(resSeqIndex2, requiredInterfaceLength);

                            chainResult.Add(new InterfaceFragment()
                            {
                                FragmentLength = requiredInterfaceLength,
                                PdbId = pdbId,
                                ReceptorChainId = contactChainId1,
                                ReceptorResSeq = resSeq1,
                                ReceptorIndex = resSeqIndex1a,
                                ReceptorAminoAcidSequence = interfaceSuper1,
                                ReceptorSecondaryStructure = dsspSuper1,
                                LigandChainId = contactChainId2,
                                LigandResSeq = resSeq2,
                                LigandIndex = resSeqIndex2a,
                                LigandAminoAcidSequence = interfaceSuper2,
                                LigandSecondaryStructure = dsspSuper2
                            });
                        }
                    }

                    pdbResult.AddRange(chainResult);
                }

                // Return what was just cached so that a first run yields the same fragment list
                // as a later run that hits the per-structure cache above.
                var distinctResult = pdbResult.Distinct().ToList();
                InterfaceFragment.Save(interfaceCsv, distinctResult);
                return distinctResult;
            }));
        }

        Task.WaitAll(taskList1.ToArray<Task>());

        ifList = taskList1.Where(a => a.Result != null).SelectMany(a => a.Result).ToList();
        ifList = ifList.Distinct().ToList();
        InterfaceFragment.Save(@"c:\r\if.csv", ifList);
    }

    var pairs1 = ifList.Select(a => new Tuple<string, string>(a.ReceptorAminoAcidSequence, a.ReceptorSecondaryStructure)).Distinct().ToList();

    // Drop the large inputs that are no longer needed. ifList is kept: the tasks below read it.
    sequenceFilename = null;
    dsspFilename = null;
    sequenceListFromFastaFile.Clear();
    sequenceListFromFastaFile = null;
    dsspList.Clear();
    dsspList = null;
    pdbFiles.Clear();
    pdbFiles = null;
    pdbFileLengths.Clear();
    pdbFileLengths = null;

    // Residue similarity groups; every residue maps to the group string that contains it.
    var similarAA = new string[] { "LAGVIP", "DE", "ST", "RKH", "FYW", "NQ", "CM", "BJOUXZ", "-" };
    var simAaList = similarAA.SelectMany(a => a.ToList()).ToList();
    var simAaDict = new Dictionary<char, string>();

    foreach (var s in simAaList)
    {
        simAaDict.Add(s, similarAA.First(a => a.Contains(s)));
    }

    DateTime startTime = DateTime.Now;

    // Item1..Item4 = aa, ss of the first fragment, aa, ss of the second;
    // Item5..Item7 = aa identity, aa group similarity, ss identity (fractions of the length).
    var simList = new List<Tuple<string, string, string, string, decimal, decimal, decimal>>();

    // Compare without alignments (alignments take too long); a position also counts as a
    // match when it matches the neighbouring position (-1/+1) in the other fragment.
    for (int i = 0; i < pairs1.Count; i++)
    {
        TimeSpan timeRemaining = TimeSpan.FromTicks(DateTime.Now.Subtract(startTime).Ticks * (pairs1.Count - (i + 1)) / (i + 1));
        Console.WriteLine((i + 1) + " / " + pairs1.Count + " " + timeRemaining.ToString("d'd 'h'h 'm'm 's's'"));

        var a = pairs1[i];

        // Each unordered pair is scored once.
        for (int j = i + 1; j < pairs1.Count; j++)
        {
            var b = pairs1[j];

            if (a.Item1.Length != b.Item1.Length)
            {
                continue;
            }

            var scoreAa = 0;
            var scoreSs = 0;
            var scoreAaEvo = 0;

            for (var x = 0; x < a.Item1.Length; x++)
            {
                if (a.Item1[x] == b.Item1[x])
                {
                    scoreAa++;
                }
                else if (x > 0 && (a.Item1[x - 1] == b.Item1[x] || a.Item1[x] == b.Item1[x - 1]))
                {
                    scoreAa++;
                }
                else if (x < a.Item1.Length - 1 && (a.Item1[x + 1] == b.Item1[x] || a.Item1[x] == b.Item1[x + 1]))
                {
                    scoreAa++;
                }

                if (simAaDict[a.Item1[x]].Contains(b.Item1[x]))
                {
                    scoreAaEvo++;
                }
                else if (x > 0 && (simAaDict[a.Item1[x - 1]].Contains(b.Item1[x]) || simAaDict[a.Item1[x]].Contains(b.Item1[x - 1])))
                {
                    scoreAaEvo++;
                }
                else if (x < a.Item1.Length - 1 && (simAaDict[a.Item1[x + 1]].Contains(b.Item1[x]) || simAaDict[a.Item1[x]].Contains(b.Item1[x + 1])))
                {
                    scoreAaEvo++;
                }

                if (a.Item2[x] == b.Item2[x])
                {
                    scoreSs++;
                }
                else if (x > 0 && (a.Item2[x - 1] == b.Item2[x] || a.Item2[x] == b.Item2[x - 1]))
                {
                    scoreSs++;
                }
                else if (x < a.Item2.Length - 1 && (a.Item2[x + 1] == b.Item2[x] || a.Item2[x] == b.Item2[x + 1]))
                {
                    scoreSs++;
                }
            }

            decimal scoreAaPct = (decimal)scoreAa / (decimal)a.Item1.Length;
            decimal scoreAaEvoPct = (decimal)scoreAaEvo / (decimal)a.Item1.Length;
            decimal scoreSsPct = (decimal)scoreSs / (decimal)a.Item2.Length;

            simList.Add(new Tuple<string, string, string, string, decimal, decimal, decimal>(a.Item1, a.Item2, b.Item1, b.Item2, scoreAaPct, scoreAaEvoPct, scoreSsPct));
        }
    }

    // Write one superposition script per fragment that has enough similar-but-not-identical partners.
    var taskList3 = new List<Task>();

    // Separate start time so the estimate covers this loop only, not the scoring loop above.
    DateTime clusterStartTime = DateTime.Now;

    for (int i = 0; i < pairs1.Count; i++)
    {
        TimeSpan timeRemaining = TimeSpan.FromTicks(DateTime.Now.Subtract(clusterStartTime).Ticks * (pairs1.Count - (i + 1)) / (i + 1));
        Console.WriteLine((i + 1) + " / " + pairs1.Count + " " + timeRemaining.ToString("d'd 'h'h 'm'm 's's'"));

        while (taskList3.Count(a => !a.IsCompleted) >= Environment.ProcessorCount)
        {
            Task.WaitAny(taskList3.Where(a => !a.IsCompleted).ToArray<Task>());
        }

        var i1 = i;

        var t = Task.Run(() =>
        {
            var a = pairs1[i1];

            // Find all scored pairs involving this fragment with a good simple alignment,
            // excluding pairs whose amino-acid score is a full match.
            var cluster = simList.Where(c => (c.Item1 == a.Item1 || c.Item3 == a.Item1) && (c.Item2 == a.Item2 || c.Item4 == a.Item2) && (c.Item5 >= 0.4m && c.Item6 >= 0.8m && c.Item7 >= 0.9m) && (c.Item5 < 1.0m)).ToList();

            if (cluster.Count < 5)
            {
                return;
            }

            var clusterPdbs = ifList.Where(c => cluster.Any(d => (d.Item1 == c.ReceptorAminoAcidSequence || d.Item3 == c.ReceptorAminoAcidSequence) && (d.Item2 == c.ReceptorSecondaryStructure || d.Item4 == c.ReceptorSecondaryStructure))).Select(e => new Tuple<string, char>(e.PdbId, e.ReceptorChainId)).ToList();

            if (clusterPdbs.Count < 5)
            {
                return;
            }

            List<string> o = new List<string>();
            o.Add("delete *");
            o.AddRange(clusterPdbs.Select(e => @"load c:\pdbe\" + e.Item1 + ".pdb").ToList());
            o.Add("hide all");
            o.Add("show cartoon");

            // Superpose every member onto the first one; the first entry itself yields an empty line.
            o.AddRange(clusterPdbs.Select((e, w) => w == 0 ? "" : @"super /" + e.Item1 + @"//" + e.Item2 + @", /" + clusterPdbs[0].Item1 + @"//" + clusterPdbs[0].Item2).ToList());

            File.WriteAllLines(@"c:\r\cluster_" + (i1 + 1) + ".txt", o);
        });

        taskList3.Add(t);
    }

    Task.WaitAll(taskList3.ToArray<Task>());

    Console.WriteLine("Calculating aa/ss sequence identities - part 1");

    var taskList2 = new List<Task<InterfaceFragmentData>>();
    var interfaceFragmentLengths = new int[] { 11 };

    for (int index = 0; index < interfaceFragmentLengths.Length; index++)
    {
        var index1 = index;
        var interfaceFragmentLength = interfaceFragmentLengths[index1];

        taskList2.Add(Task.Run(() =>
        {
            // NOTE(review): the Substring start offset is the index into interfaceFragmentLengths,
            // which is 0 for the single configured length - confirm the intent before adding lengths.
            // NOTE(review): ifList was filtered to requiredInterfaceLengths above, which does not
            // contain 11 - confirm the two settings are meant to differ.
            var pairs = ifList.Where(a => a.FragmentLength == interfaceFragmentLength).Select(a => new Tuple<string, string>(a.ReceptorAminoAcidSequence.Substring(index1, interfaceFragmentLength), a.ReceptorSecondaryStructure.Substring(index1, interfaceFragmentLength))).Distinct().ToList();

            var aassList = new List<Tuple<string, string, string, string, decimal, decimal, decimal>>();

            var aaSequenceList = pairs.Select(a => a.Item1).Distinct().ToList();
            var aaAlignmentList = Align(aaSequenceList);

            var ssSequenceList = pairs.Select(a => a.Item2).Distinct().ToList();
            var ssAlignmentList = Align(ssSequenceList);

            for (int i = 0; i < pairs.Count; i++)
            {
                Console.WriteLine((i + 1) + " / " + pairs.Count);

                var pair1 = pairs[i];

                // Starts at i, so each pair is also compared with itself.
                for (int j = i; j < pairs.Count; j++)
                {
                    var pair2 = pairs[j];

                    var aaSid = aaAlignmentList.First(alignment => alignment.Item1 == pair1.Item1 && alignment.Item2 == pair2.Item1).Item3;
                    var ssSid = ssAlignmentList.First(alignment => alignment.Item1 == pair1.Item2 && alignment.Item2 == pair2.Item2).Item3;
                    var weighted = (aaSid * 0.5m) + (ssSid * 0.5m);

                    // Stored in both directions so later lookups can use either ordering.
                    aassList.Add(new Tuple<string, string, string, string, decimal, decimal, decimal>(pair1.Item1, pair1.Item2, pair2.Item1, pair2.Item2, aaSid, ssSid, weighted));
                    aassList.Add(new Tuple<string, string, string, string, decimal, decimal, decimal>(pair2.Item1, pair2.Item2, pair1.Item1, pair1.Item2, aaSid, ssSid, weighted));
                }
            }

            var r = new InterfaceFragmentData();
            r.AaSsData = aassList;
            r.AaData = aaAlignmentList;
            r.SsData = ssAlignmentList;
            return r;
        }));
    }

    Task.WaitAll(taskList2.ToArray<Task>());

    Console.WriteLine("Calculating aa/ss sequence identities - part 2");

    var sids = new decimal[] { 1.0m, 0.9m, 0.8m, 0.7m, 0.6m, 0.5m, 0.4m, 0.3m, 0.2m, 0.1m, 0.0m };

    // aa: per sequence, count the partners at or above each identity threshold (no transitivity).
    var aaData = taskList2.SelectMany(a => a.Result.AaData).ToList();
    var aaDistinct = aaData.Select(a => a.Item1).Distinct().ToList();
    var aaNeighbours = new List<Tuple<string, decimal, decimal>>();

    foreach (var aa in aaDistinct)
    {
        var subset = aaData.Where(a => a.Item1 == aa).ToList();

        foreach (var sid in sids)
        {
            aaNeighbours.Add(new Tuple<string, decimal, decimal>(aa, sid, subset.Count(b => b.Item3 >= sid)));
        }
    }

    File.WriteAllLines(@"c:\r\aa.csv", aaData.Select(a => string.Join(",", new string[] { "" + a.Item1.Length, a.Item1, a.Item2, "" + a.Item3 })));
    File.WriteAllLines(@"c:\r\aa-clusters-1.csv", aaDistinct.Select(a => a + "," + string.Join(",", aaNeighbours.Where(d => d.Item1 == a).Select(d => "" + d.Item3).ToArray())));

    // ss: same neighbour counts for the secondary-structure strings.
    var ssData = taskList2.SelectMany(a => a.Result.SsData).ToList();
    var ssDistinct = ssData.Select(a => a.Item1).Distinct().ToList();
    var ssNeighbours = new List<Tuple<string, decimal, decimal>>();

    foreach (var ss in ssDistinct)
    {
        var subset = ssData.Where(a => a.Item1 == ss).ToList();

        foreach (var sid in sids)
        {
            ssNeighbours.Add(new Tuple<string, decimal, decimal>(ss, sid, subset.Count(b => b.Item3 >= sid)));
        }
    }

    File.WriteAllLines(@"c:\r\ss.csv", ssData.Select(a => string.Join(",", new string[] { "" + a.Item1.Length, a.Item1, a.Item2, "" + a.Item3 })));
    File.WriteAllLines(@"c:\r\ss-clusters-1.csv", ssDistinct.Select(a => a + "," + string.Join(",", ssNeighbours.Where(d => d.Item1 == a).Select(d => "" + d.Item3).ToList())));

    // aa-ss: neighbour counts for (sequence, secondary structure) pairs, thresholded on Item5.
    var aaSsData = taskList2.SelectMany(a => a.Result.AaSsData).ToList();
    var aaSsDistinct = aaSsData.Select(a => new Tuple<string, string>(a.Item1, a.Item2)).Distinct().ToList();
    var aaSsNeighbours = new List<Tuple<string, string, decimal, decimal>>();

    foreach (var aaSs in aaSsDistinct)
    {
        var subset = aaSsData.Where(a => a.Item1 == aaSs.Item1 && a.Item2 == aaSs.Item2).ToList();

        foreach (var sid in sids)
        {
            aaSsNeighbours.Add(new Tuple<string, string, decimal, decimal>(aaSs.Item1, aaSs.Item2, sid, subset.Count(b => b.Item5 >= sid)));
        }
    }

    File.WriteAllLines(@"c:\r\aa-ss.csv", aaSsData.Select(a => string.Join(",", new string[] { "" + a.Item1.Length, a.Item1, a.Item2, a.Item3, a.Item4, "" + a.Item5, "" + a.Item6, "" + a.Item7 })));
    File.WriteAllLines(@"c:\r\aa-ss-clusters-1.csv", aaSsDistinct.Select(a => a.Item1 + "," + a.Item2 + "," + string.Join(",", aaSsNeighbours.Where(d => d.Item1 == a.Item1 && d.Item2 == a.Item2).Select(d => "" + d.Item4).ToList())));

    // Cluster by transitivity: start with one singleton cluster per item and merge the clusters
    // of any two items whose score reaches the threshold.
    var aaSsPairClusters = aaSsData.Select(a => new Tuple<string, string>(a.Item1, a.Item2)).Distinct().Select(a => new List<Tuple<string, string>>() { a }).ToList();
    var aaClusters = aaSsData.Select(a => a.Item1).Distinct().Select(a => new List<string>() { a }).ToList();
    var ssClusters = aaSsData.Select(a => a.Item2).Distinct().Select(a => new List<string>() { a }).ToList();

    decimal minTransivitySid = 0.3m;

    foreach (var x in aaSsDistinct)
    {
        foreach (var y in aaSsDistinct)
        {
            // Reference comparison: skips pairing an entry of the list with itself.
            if (x == y)
            {
                continue;
            }

            if (x.Item1.Length != y.Item1.Length)
            {
                // Items have not been sequence aligned if not the same length.
                continue;
            }

            var z = aaSsData.First(a => a.Item1 == x.Item1 && a.Item2 == x.Item2 && a.Item3 == y.Item1 && a.Item4 == y.Item2);

            if (z.Item5 >= minTransivitySid)
            {
                var c1 = aaClusters.First(a => a.Any(b => b == x.Item1));
                var c2 = aaClusters.First(a => a.Any(b => b == y.Item1));

                if (c1 != c2)
                {
                    c1.AddRange(c2);
                    aaClusters.Remove(c2);
                }
            }

            if (z.Item6 >= minTransivitySid)
            {
                var c1 = ssClusters.First(a => a.Any(b => b == x.Item2));
                var c2 = ssClusters.First(a => a.Any(b => b == y.Item2));

                if (c1 != c2)
                {
                    c1.AddRange(c2);
                    ssClusters.Remove(c2);
                }
            }

            if (z.Item7 >= minTransivitySid)
            {
                var c1 = aaSsPairClusters.First(a => a.Any(b => b.Item1 == x.Item1 && b.Item2 == x.Item2));
                var c2 = aaSsPairClusters.First(a => a.Any(b => b.Item1 == y.Item1 && b.Item2 == y.Item2));

                if (c1 != c2)
                {
                    c1.AddRange(c2);
                    aaSsPairClusters.Remove(c2);
                }
            }
        }
    }

    // Flatten each clustering to (1-based cluster number, member) rows, largest clusters first.
    var aaSsPairClusters2 = aaSsPairClusters.SelectMany((a, i) => a.Select(b => new Tuple<int, string, string>(i + 1, b.Item1, b.Item2))).ToList();
    aaSsPairClusters2 = aaSsPairClusters2.OrderByDescending(a => aaSsPairClusters2.Count(b => b.Item1 == a.Item1)).ToList();
    File.WriteAllLines(@"c:\r\aa-ss-clusters-2.csv", aaSsPairClusters2.Select(a => string.Join(",", new string[] { "" + a.Item1, a.Item2, a.Item3 })));

    var aaClusters2 = aaClusters.SelectMany((a, i) => a.Select(b => new Tuple<int, string>(i + 1, b))).ToList();
    aaClusters2 = aaClusters2.OrderByDescending(a => aaClusters2.Count(b => b.Item1 == a.Item1)).ToList();
    File.WriteAllLines(@"c:\r\aa-clusters-2.csv", aaClusters2.Select(a => string.Join(",", new string[] { "" + a.Item1, a.Item2 })));

    var ssClusters2 = ssClusters.SelectMany((a, i) => a.Select(b => new Tuple<int, string>(i + 1, b))).ToList();
    ssClusters2 = ssClusters2.OrderByDescending(a => ssClusters2.Count(b => b.Item1 == a.Item1)).ToList();
    File.WriteAllLines(@"c:\r\ss-clusters-2.csv", ssClusters2.Select(a => string.Join(",", new string[] { "" + a.Item1, a.Item2 })));
}
/// <summary>
/// Loads the homolog list in csv format and, for homologs at or above the given similarity
/// thresholds, groups the chains into clusters and prints one csv section per cluster to the console.
/// </summary>
/// <remarks>
/// Some partners may be duplicates, which cannot initially be removed, since they may bind
/// differently in other instances. Close target homologs of a protein are also considered to be
/// the same protein as the query protein, so they could exist for more than one query protein.
/// Example: FindHomologsCluster.exe c:\pdb\ds96ub_homologs\ c:\pdb\pdb_seqres.fasta 0.9 0.9 &gt; ds96ub_homologs.csv
/// </remarks>
/// <param name="args">
/// [0] folder holding the homologs_?????.csv files, [1] sequence (fasta) file,
/// [2] minimum alignment score, [3] minimum evolutionary alignment score.
/// The two scores are parsed with the invariant culture ('.' as decimal separator).
/// </param>
private static void Main(string[] args)
{
    if (args == null || args.Length < 4)
    {
        // Standard output carries the csv result (it is redirected to a file in the example),
        // so the usage text goes to standard error.
        Console.Error.WriteLine("Usage: FindHomologsCluster.exe <homolog_csv_folder> <sequence_file> <min_similarity> <min_similarity_evo>");
        return;
    }

    var homolog_csv_folder = args[0];
    var sequence_file = args[1];
    var min_similarity_str = args[2];
    var min_similarity_evo_str = args[3];

    // Invariant culture: with the current culture "0.9" is read as 9 on comma-decimal locales.
    var min_similarity = decimal.Parse(min_similarity_str, System.Globalization.CultureInfo.InvariantCulture);
    var min_similarity_evo = decimal.Parse(min_similarity_evo_str, System.Globalization.CultureInfo.InvariantCulture);

    var seqList = Sequence.LoadSequenceFile(sequence_file, new[] { null, "", "protein" });

    var homologCsvFiles = Directory.GetFiles(homolog_csv_folder, "homologs_?????.csv");
    var parsedData = FindHomologs.FindHomologs.HomologChain.Load(homologCsvFiles);
    Array.Clear(homologCsvFiles, 0, homologCsvFiles.Length);

    // Each inner list is one cluster of (pdb id, chain id) entries.
    var homologs_clustered = new List<List<Tuple<string, char>>>();

    foreach (var rec in parsedData)
    {
        if (!(rec.AlignmentScore >= min_similarity && rec.AlignmentScoreEvo >= min_similarity_evo))
        {
            continue;
        }

        // First existing cluster that already contains the query chain / the target chain
        // (pdb id compared case-insensitively, chain id exactly).
        var query_group = homologs_clustered.FirstOrDefault(cluster => cluster.Any(b => b.Item1.ToUpperInvariant() == rec.QueryPdbId.ToUpperInvariant() && b.Item2 == rec.QueryChainId));
        var target_group = homologs_clustered.FirstOrDefault(cluster => cluster.Any(b => b.Item1.ToUpperInvariant() == rec.TargetPdbId.ToUpperInvariant() && b.Item2 == rec.TargetChainId));

        // Merge both clusters (or the bare chains) into one new cluster.
        var new_group = new List<Tuple<string, char>>();

        if (query_group != null)
        {
            new_group.AddRange(query_group);
            homologs_clustered.Remove(query_group);

            // When query and target share a cluster, clearing it here makes the
            // target branch below contribute nothing a second time.
            query_group.Clear();
        }
        else
        {
            new_group.Add(new Tuple<string, char>(rec.QueryPdbId, rec.QueryChainId));
        }

        if (target_group != null)
        {
            new_group.AddRange(target_group);
            homologs_clustered.Remove(target_group);
            target_group.Clear();
        }
        else
        {
            new_group.Add(new Tuple<string, char>(rec.TargetPdbId, rec.TargetChainId));
        }

        new_group = new_group.Distinct().ToList();
        new_group = new_group.OrderBy(a => a.Item1).ThenBy(a => a.Item2).ToList();

        homologs_clustered.Add(new_group);
    }

    var seq_list_ids = seqList.Select(a => new ProteinBioClass.SequenceId(a.Id)).ToList();

    // Lookups built once instead of scanning the whole sequence list for every cluster member:
    //   sequenceByChain - first sequence for each (upper-cased pdb id, chain id)
    //   chainCountByPdb - number of sequence entries per upper-cased pdb id
    // They are only read after this loop, so sharing them between the tasks below is safe.
    var sequenceByChain = new Dictionary<Tuple<string, char>, Sequence>();
    var chainCountByPdb = new Dictionary<string, int>();

    for (var j = 0; j < seqList.Count; j++)
    {
        if (seq_list_ids[j].PdbId == null)
        {
            continue;
        }

        var pdbKey = seq_list_ids[j].PdbId.ToUpperInvariant();
        var chainKey = Tuple.Create(pdbKey, seq_list_ids[j].ChainId);

        // Keep the first occurrence, matching the former first-match linear search.
        if (!sequenceByChain.ContainsKey(chainKey))
        {
            sequenceByChain.Add(chainKey, seqList[j]);
        }

        int chainCount;
        chainCountByPdb.TryGetValue(pdbKey, out chainCount);
        chainCountByPdb[pdbKey] = chainCount + 1;
    }

    // Outer level: the clusters are divided between tasks.
    var wd2 = new WorkDivision(homologs_clustered.Count);

    for (var thread2 = 0; thread2 < wd2.ThreadCount; thread2++)
    {
        var lti2 = thread2;

        wd2.TaskList.Add(Task.Run(() =>
        {
            var result2 = new List<string>();

            for (var index2 = wd2.ThreadFirstIndex[lti2]; index2 <= wd2.ThreadLastIndex[lti2]; index2++)
            {
                var cluster2 = homologs_clustered[index2];

                // Inner level: the members of one cluster are divided between tasks.
                var wd3 = new WorkDivision(cluster2.Count);

                for (var thread3 = 0; thread3 < wd3.ThreadCount; thread3++)
                {
                    // Copies for the closure; thread3 and index2 change while the tasks run.
                    var lti3 = thread3;
                    var cluster3 = cluster2;
                    var index4 = index2;

                    wd3.TaskList.Add(Task.Run(() =>
                    {
                        var result = new List<HomologClusterData>();

                        for (var index3 = wd3.ThreadFirstIndex[lti3]; index3 <= wd3.ThreadLastIndex[lti3]; index3++)
                        {
                            var item = cluster3[index3];
                            var itemPdbKey = item.Item1.ToUpperInvariant();

                            Sequence s;
                            sequenceByChain.TryGetValue(Tuple.Create(itemPdbKey, item.Item2), out s);

                            if (s == null)
                            {
                                throw new InvalidOperationException("sequence not found for " + item.Item1 + ":" + item.Item2);
                            }

                            int complexChains;
                            chainCountByPdb.TryGetValue(itemPdbKey, out complexChains);

                            // -1 marks "no other cluster member compared yet".
                            var minAlignmentScore = -1m;
                            var maxAlignmentScore = -1m;
                            var minAlignmentScoreEvo = -1m;
                            var maxAlignmentScoreEvo = -1m;

                            foreach (var item2 in cluster3)
                            {
                                if (ReferenceEquals(item, item2))
                                {
                                    continue;
                                }

                                Sequence s2;
                                sequenceByChain.TryGetValue(Tuple.Create(item2.Item1.ToUpperInvariant(), item2.Item2), out s2);

                                if (s2 == null)
                                {
                                    continue;
                                }

                                var alignmentScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(s, s2, ProteinBioClass.AlignmentType.NMW);

                                if (alignmentScore.Score > maxAlignmentScore || maxAlignmentScore == -1m)
                                {
                                    maxAlignmentScore = alignmentScore.Score;
                                }

                                if (alignmentScore.Score < minAlignmentScore || minAlignmentScore == -1m)
                                {
                                    minAlignmentScore = alignmentScore.Score;
                                }

                                if (alignmentScore.ScoreEvo > maxAlignmentScoreEvo || maxAlignmentScoreEvo == -1m)
                                {
                                    maxAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                }

                                if (alignmentScore.ScoreEvo < minAlignmentScoreEvo || minAlignmentScoreEvo == -1m)
                                {
                                    minAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                }
                            }

                            var r = new HomologClusterData(index4 + 1, index3 + 1, item.Item1, item.Item2, complexChains, Convert.ToInt32(s.Count()), minAlignmentScore, maxAlignmentScore, minAlignmentScoreEvo, maxAlignmentScoreEvo, s.FullSequence);
                            result.Add(r);
                        }

                        return result;
                    }));
                }

                wd3.WaitAllTasks();

                result2.Add("; Cluster # " + (index2 + 1) + " with " + wd3.ItemsToProcess + " protein chains");
                result2.Add("cluster index,item index,pdb id,chain id,complex chains,seq len,min clstr sid,max clstr sid,min evo clstr sid,max evo clstr sid,sequence");

                foreach (var task in wd3.TaskList)
                {
                    var tr = task as Task<List<HomologClusterData>>;

                    if (tr == null || tr.Result == null)
                    {
                        continue;
                    }

                    result2.AddRange(tr.Result.Select(a => a.ToString()).ToList());
                }

                result2.Add("");
            }

            return result2;
        }));
    }

    wd2.WaitAllTasks();

    // Collect the per-task output in task order and print it.
    var result1 = new List<string>();

    foreach (var task in wd2.TaskList)
    {
        var tr = task as Task<List<string>>;

        if (tr == null || tr.Result == null)
        {
            continue;
        }

        result1.AddRange(tr.Result);
    }

    foreach (var line in result1)
    {
        Console.WriteLine(line);
    }

    // Partners may have other interfaces; should those also be considered?
}
/// <summary>
/// Post-processes one results folder in two steps:
/// 1. For every sequence in <c>sequences.fasta</c>, and then every sequence in
///    <c>interfaces_fixed_length.fasta</c>, aligns it (Needleman-Wunsch) against every sequence of the
///    same file and writes a tab-separated score table to <c>score_all_&lt;id&gt;.txt</c>
///    (the interface table is appended to the file written for the sequence table).
/// 2. Parses every <c>modeller_monomer_assessment.log</c> below the folder and writes the
///    <c>ModellerDope</c> values, grouped by template, to <c>&lt;machine name&gt;_energy.csv</c>.
/// </summary>
/// <param name="args">Not read; the results folder is hard-coded below.</param>
public static void Main(string[] args)
{
    var logResultsFolder = @"C:\pdbe_split\manual\sw_3BX1C\";

    var seq = ProteinBioinformaticsSharedLibrary.Sequence.LoadSequenceFile(logResultsFolder + "sequences.fasta");
    var inf = ProteinBioinformaticsSharedLibrary.Sequence.LoadSequenceFile(logResultsFolder + "interfaces_fixed_length.fasta");

    // Scores are written to a machine-readable table, so they must not pick up the decimal
    // separator of the current culture.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Both tables are produced the same way; only the title and the write mode differ.
    var scoreTables = new[]
    {
        new { Sequences = seq, Title = "Sequence Alignment", Append = false },
        new { Sequences = inf, Title = "Interface Alignment", Append = true },
    };

    foreach (var table in scoreTables)
    {
        foreach (var s1 in table.Sequences)
        {
            var id1 = s1.Id.Substring(1, 5);
            var scored = new List<Tuple<string, ProteinBioClass.AlignmentScore>>();

            // Self-comparison is intentionally included.
            foreach (var s2 in table.Sequences)
            {
                var alignment = new NeedlemanWunsch(s1.FullSequence, s2.FullSequence).getAlignment();
                ProteinBioClass.AlignmentScore score = ProteinBioClass.SequenceSimilarityPercentage(alignment[0], alignment[1], ProteinBioClass.AlignmentIdentityOption.MinimumSequenceLength);
                scored.Add(new Tuple<string, ProteinBioClass.AlignmentScore>(id1 + "\t" + s2.Id.Substring(1, 5), score));
            }

            var lines = new List<string>();
            if (table.Append)
            {
                // Blank separator between the table already in the file and this one.
                lines.Add("");
            }
            lines.Add(table.Title);
            lines.Add("ID1\tID2\tMatch%\tPhysicochemical%");
            lines.AddRange(scored
                .OrderByDescending(p => p.Item2.Score)
                .ThenByDescending(p => p.Item2.ScoreEvo)
                .Select(p => p.Item1
                    + "\t" + string.Format(invariant, "{0:0.00}", Math.Round(p.Item2.Score, 2))
                    + "\t" + string.Format(invariant, "{0:0.00}", Math.Round(p.Item2.ScoreEvo, 2))));

            var scoreFile = logResultsFolder + "score_all_" + id1 + ".txt";
            if (table.Append)
            {
                File.AppendAllLines(scoreFile, lines);
            }
            else
            {
                File.WriteAllLines(scoreFile, lines);
            }
        }
    }

    var modellerLogFiles = Directory.GetFiles(logResultsFolder, "modeller_monomer_assessment.log", SearchOption.AllDirectories).ToList();
    var scores = modellerLogFiles.SelectMany(m => ParseModellerLog(m)).ToList();

    var data = new List<List<string>>();
    var nats = new List<List<string>>();

    // Group key: <first 3 chars of last path part>_<third-from-last part>_<second-from-last part>, e.g. for
    // \sw_...\1V5IB\1V5IB\all_0016_0026_1_1 the key is all_1V5IB_1V5IB.
    var scoreGroups = scores.GroupBy(a =>
    {
        var parts = a.StructureFolder.Split('\\');
        return parts[parts.Length - 1].Substring(0, 3) + '_' + parts[parts.Length - 3] + '_' + parts[parts.Length - 2];
    });

    foreach (var scoreGroup in scoreGroups)
    {
        var key = scoreGroup.Key;
        var group = scoreGroup.OrderBy(a => a.StructureFolder).ToList();

        foreach (var native in group.Where(a => a.StructureFolder.Contains("_native")))
        {
            nats.Add(new List<string> { "nat_" + key.Substring(4), native.ModellerDope });
        }

        // The index (column header) row is emitted once per template: only for the group whose two ids match.
        if (!key.StartsWith("nat", StringComparison.Ordinal) && key.Substring(4, 5) == key.Substring(10, 5))
        {
            var indexRow = group.Select(a => a.StructureFolder.Split('\\').Last().Substring(4)).ToList();
            indexRow.Insert(0, key + "_index");
            data.Add(indexRow);
        }

        var energyRow = group.Select(a => a.ModellerDope).ToList();
        energyRow.Insert(0, key + "_energy");
        data.Add(energyRow);
    }

    var output = new List<string>();

    // Distinct "label,value" rows ordered by label (the text before the first comma).
    var nativeRows = nats
        .Select(a => string.Join(",", a))
        .Distinct()
        .OrderBy(a => a.Split(',')[0])
        .ToList();

    foreach (var g in data.GroupBy(a => a[0].Substring(0, 3 + 1 + 5)))
    {
        var rows = g.ToList();
        var index = rows.First(a => a[0].Contains("_index"));
        var len = index.Count - 1;
        var main = rows.First(a => a != index && a[0].Substring(4, 5) == a[0].Substring(10, 5));
        var others = rows.Where(a => a != index && a != main).OrderBy(a => a[0]).ToList();

        // A native row holds a single value; repeat it once per index column so it lines up with the other rows.
        var natives = nativeRows
            .Where(a => a.Substring(4, 5) == index[0].Substring(4, 5))
            .Select(a =>
            {
                var fields = a.Split(',');
                var value = string.Join(",", fields.Skip(1));
                return string.Join(",", new[] { fields[0] }.Concat(Enumerable.Repeat(value, len)));
            })
            .ToList();

        var nativemain = natives.First(a => a.Substring(4, 5) == a.Substring(10, 5));
        natives.Remove(nativemain);

        output.Add(string.Join(",", index));
        output.Add(string.Join(",", main));
        output.AddRange(others.Select(a => string.Join(",", a)));
        output.Add(nativemain);
        output.AddRange(natives);
        output.Add("");
    }

    File.WriteAllLines(logResultsFolder + Environment.MachineName + "_energy.csv", output);
}
/// <summary>
/// Command-line tool that extracts ATOM records from a PDB file, optionally restricted to an atom
/// subset, to a set of chains and to a residue range per chain, and writes the matching lines to
/// a file (or one file per chain) or to the console.
/// </summary>
/// <param name="args">
/// <c>[pdb_file] [[subset]] [[chain_ids]] [[output_file]]</c>. With no arguments the usage text is printed.
/// An omitted subset selects all atoms (same as <c>-</c>). In the output file name a single <c>?</c>
/// produces one file per chain and <c>??</c> produces one file named after all (letter) chains; in both
/// cases the chain id(s) are appended to the file name in front of the extension.
/// </param>
static void Main(string[] args)
{
    // Zero-based offsets into a fixed-width PDB ATOM record.
    const int atom_chain = 21;
    const int atom_name = 13;
    const int atom_name_len = 3;
    const int atom_resseq = 22;
    const int atom_resseq_len = 4;
    const int atom_min_len = atom_resseq + atom_resseq_len;

    var parameters = new string[,]
    {
        { "[pdb_file]", "PDB ~v3.3 Protein Data Bank format file [*.pdb, *.ent]" },
        { "[[subset]]", "-, mc, sc, ca" },
        { "[[chain_ids]]", "molecule chains to output [2 formats: - for all, ABC, or A,1,50,B,2,40,C,5,200]" },
        { "[[output_file]]", "optional output file. use ? for chain id. when ommitted, output to console" },
    };

    // Cast<string>() walks the 2D array row by row, so even positions are the parameter names.
    var maxParamLength = parameters.Cast<string>().Where((a, i) => i % 2 == 0).Max(a => a.Length);
    var exeFilename = Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);

    if (args.Length == 0)
    {
        Console.WriteLine(exeFilename + @" is a program to extract ATOM records from a PDB file.");
        Console.WriteLine();
        Console.WriteLine(@"Usage:");
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" " + String.Join(" ", parameters.Cast<string>().Where((a, i) => i % 2 == 0)), maxParamLength + 2, 1));
        Console.WriteLine();
        Console.WriteLine(@"Example:");
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" ""c:\pdb_db\pdb1a12.pdb"" ca A ""c:\pdb_atoms\atoms1a12.pdb""", maxParamLength + 2, 1));
        Console.WriteLine();
        Console.WriteLine(@"Arguments:");
        for (var i = 0; i < parameters.GetLength(0); i++)
        {
            Console.WriteLine(@" " + parameters[i, 0].PadLeft(maxParamLength, ' ') + " " + ProteinBioClass.WrapConsoleText(parameters[i, 1], maxParamLength + 2, 1, false));
        }
        Console.WriteLine();
        return;
    }

    // load and echo arguments
    var p = 0;
    var pdbFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";
    pdbFilename = pdbFilename.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + pdbFilename);

    p++;
    var subset = args.Length > p && args[p].Length > 0 ? args[p].ToUpperInvariant() : "";
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + subset);

    p++;
    var chainIds = args.Length > p && args[p].Length > 0 ? args[p] : "";
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + chainIds);

    p++;
    var outputFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";
    outputFilename = outputFilename.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + outputFilename);
    Console.WriteLine();

    if (string.IsNullOrWhiteSpace(pdbFilename))
    {
        Console.Error.WriteLine("Error: no PDB file specified.");
        return;
    }

    if (!File.Exists(pdbFilename))
    {
        Console.Error.WriteLine("Error: PDB file not found: " + pdbFilename);
        return;
    }

    // The subset argument is optional; treat an omitted value as "all atoms".
    var selectedSubset = subset.Length == 0 ? "-" : subset;

    if (chainIds.Contains('-'))
    {
        chainIds = null;
    }

    // Two accepted formats: "ABC" (chain letters) or "A,1,50,B,2,40" (chain,first,last triples; blank = unbounded).
    var chainStartEnd = new List<Tuple<char, int, int>>();
    var chainIdsSplit = chainIds?.Split(',').ToList();
    char[] chainIdWhiteList;

    if (chainIdsSplit?.Count > 1)
    {
        if (chainIdsSplit.Count % 3 != 0)
        {
            Console.Error.WriteLine("Error: chain ranges must be given as chain,first,last triples.");
            return;
        }

        for (var i = 0; i < chainIdsSplit.Count; i += 3)
        {
            var idText = chainIdsSplit[i];
            var startText = string.IsNullOrWhiteSpace(chainIdsSplit[i + 1]) ? "-1" : chainIdsSplit[i + 1];
            var endText = string.IsNullOrWhiteSpace(chainIdsSplit[i + 2]) ? "-1" : chainIdsSplit[i + 2];

            int start;
            int end;
            if (idText.Length == 0 || !int.TryParse(startText, out start) || !int.TryParse(endText, out end))
            {
                Console.Error.WriteLine("Error: invalid chain range '" + string.Join(",", chainIdsSplit.Skip(i).Take(3)) + "'.");
                return;
            }

            chainStartEnd.Add(new Tuple<char, int, int>(idText[0], start, end));
        }

        chainIdWhiteList = chainStartEnd.Select(a => a.Item1).Distinct().ToArray();
    }
    else
    {
        chainIdWhiteList = chainIds?.Where(char.IsLetter).Distinct().ToArray();
    }

    var terminatedChains = new HashSet<char>();
    var lines = File.ReadAllLines(pdbFilename);
    var result = new List<Tuple<char, string>>();

    string[] ca = new string[] { "CA" };
    string[] bb = new[] { "N", "CA", "C", "O" };

    foreach (var line in lines)
    {
        if (line.Length <= atom_chain)
        {
            continue;
        }

        if (line.StartsWith("TER ", StringComparison.OrdinalIgnoreCase))
        {
            // ATOM records that appear for this chain after its TER record are ignored.
            terminatedChains.Add(line[atom_chain]);
            continue;
        }

        if (!line.StartsWith("ATOM ", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Too short to hold the residue sequence number field.
        if (line.Length < atom_min_len)
        {
            continue;
        }

        var chainId = line[atom_chain];

        if (terminatedChains.Contains(chainId))
        {
            continue;
        }

        if (chainIdWhiteList != null && chainIdWhiteList.Length > 0 && !chainIdWhiteList.Contains(chainId))
        {
            continue;
        }

        var atomName = line.Substring(atom_name, atom_name_len).Trim();

        // Only atoms whose name starts with C, N or O are kept.
        if (atomName.Length == 0 || (atomName[0] != 'C' && atomName[0] != 'N' && atomName[0] != 'O'))
        {
            continue;
        }

        var resId = int.Parse(line.Substring(atom_resseq, atom_resseq_len));
        var range = chainStartEnd.FirstOrDefault(a => a.Item1 == chainId);

        if (range != null)
        {
            var afterStart = range.Item2 == -1 || resId >= range.Item2;
            var beforeEnd = range.Item3 == -1 || resId <= range.Item3;

            if (!afterStart || !beforeEnd)
            {
                continue;
            }
        }

        bool add;
        switch (selectedSubset)
        {
            case "-":
                add = true;
                break;
            case "CA":
                add = ca.Contains(atomName);
                break;
            case "MC":
                add = bb.Contains(atomName);
                break;
            case "SC":
                add = !bb.Contains(atomName);
                break;
            default:
                add = false;
                break;
        }

        if (add)
        {
            result.Add(new Tuple<char, string>(chainId, line));
        }
    }

    if (string.IsNullOrWhiteSpace(outputFilename))
    {
        foreach (var entry in result)
        {
            Console.WriteLine(entry.Item2);
        }

        Console.WriteLine();
        return;
    }

    // The '?' placeholders are stripped; chain ids are appended in front of the extension instead.
    var baseFilename = outputFilename.Replace("?", "");
    var outputFolder = Path.GetDirectoryName(baseFilename) ?? "";
    var outputName = Path.GetFileNameWithoutExtension(baseFilename);
    var outputExtension = Path.GetExtension(baseFilename);

    // A bare file name has no directory part; CreateDirectory rejects an empty path.
    if (outputFolder.Length > 0)
    {
        Directory.CreateDirectory(outputFolder);
    }

    if (!outputFilename.Contains("?"))
    {
        File.WriteAllLines(outputFilename, result.Select(a => a.Item2).ToList());
    }
    else if (outputFilename.Contains("??"))
    {
        var chains = new string(result.Select(a => a.Item1).Where(char.IsLetter).Distinct().OrderBy(a => a).ToArray());
        File.WriteAllLines(Path.Combine(outputFolder, outputName + chains + outputExtension), result.Select(a => a.Item2).ToList());
    }
    else
    {
        foreach (var chain in result.Select(a => a.Item1).Distinct().ToList())
        {
            File.WriteAllLines(Path.Combine(outputFolder, outputName + chain + outputExtension), result.Where(a => a.Item1 == chain).Select(a => a.Item2).ToList());
        }
    }
}
/// <summary>
/// Builds the modelling work folders for a set of receptor/ligand complexes:
/// loads the receptor, ligand and ligand-interface sequences for every record, writes
/// <c>sequences.fasta</c> and <c>interfaces_pdbsum.fasta</c>, and for each template complex
/// (1RGI, 3BX1, 2SIC) copies the template structure into one folder per sibling and generates
/// the "mode 0" sliding-window alignment files, in which the template's own interface is replaced
/// by each same-length window of the template ligand sequence near the interface.
/// Finally the generated folders are split over batch files that run the modelling scripts.
/// </summary>
/// <param name="args">Not read.</param>
static void Main(string[] args)
{
    // One record per complex: <pdb id><receptor chain><ligand chain> <lig-inf-start> <lig-inf-end>
    // Alternative data set:
    //   1RGIAG 95 110 | 3JBIAV 990 1001 | 4PKHAB 114 137 | 1H1VAG 473 488 | 4EAHHA 641 653 | 1KXPAD 196 210
    var ligandSiblingInterfaceList = new[]
    {
        "4GI3AC 22 29",
        "2SICEI 65 74",
        "1SBNEI 35 53",
        "4LVNAP 207 216",
        "1OYVBI 54 64",
        "1V5IAB 68 76",
        "1R0REI 11 20",
        "3BX1AC 84 94",
    };

    var recLigInfoList = ligandSiblingInterfaceList
        .Select(a => a.Trim())
        .Where(a => a.Length > 0)
        .Select(a =>
        {
            var b = a.Split(new char[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);

            var pdbId = b[0].Substring(0, 4);

            // Receptor: only the full sequence is loaded, no interface is defined for it.
            var recChainId = b[0][4];
            var recSequenceAlignment = GetSequence(pdbId, recChainId);

            var ligChainId = b[0][5];
            var ligInfStart = int.Parse(b[1]);
            var ligInfEnd = int.Parse(b[2]);
            var ligSequenceAlignment = GetSequence(pdbId, ligChainId);
            var ligSequence = ligSequenceAlignment.SuperSequence;
            var ligInfSequenceAlignment = GetSequence(pdbId, ligChainId, ligInfStart, ligInfEnd);
            var ligInfSequence = ligInfSequenceAlignment.SuperSequence;

            return new RecLigInfo()
            {
                PdbId = pdbId,
                RecChainId = recChainId,
                RecInfStart = -1,
                RecInfEnd = -1,
                RecSequence = recSequenceAlignment.SuperSequence,
                RecInfSequence = "",
                RecInfPosInRecSeq = -1,
                RecSequenceAlignment = recSequenceAlignment,
                RecInfSequenceAlignment = null,
                LigChainId = ligChainId,
                LigInfStart = ligInfStart,
                LigInfEnd = ligInfEnd,
                LigSequence = ligSequence,
                LigInfSequence = ligInfSequence,
                // Ordinal search: the position is later used with Remove/Substring, which are index based.
                LigInfPosInLigSeq = ligSequence.IndexOf(ligInfSequence, StringComparison.Ordinal),
                LigSequenceAlignment = ligSequenceAlignment,
                LigInfSequenceAlignment = ligInfSequenceAlignment,
            };
        })
        .ToList();

    var allSequenceIds = recLigInfoList.Select(a => a.PdbId + a.RecChainId + a.LigChainId).Distinct().OrderBy(a => a).ToList();
    var rootFolder = @"C:\pdbe_split_4\sw_" + string.Join("_", allSequenceIds) + @"\";
    Directory.CreateDirectory(rootFolder);

    if (recLigInfoList.Any(a => string.IsNullOrWhiteSpace(a.LigInfSequence)))
    {
        Console.WriteLine("Error: empty interface");
        return;
    }

    if (recLigInfoList.Any(a => string.IsNullOrWhiteSpace(a.LigSequence) || a.LigSequence.Length < 50))
    {
        Console.WriteLine("Error: empty/short sequence");
        return;
    }

    var fastaSequences = recLigInfoList.Select(a => ">" + a.PdbId + a.LigChainId + "_pdb\r\n" + a.LigSequence).ToList();
    var fastaInfSequences = recLigInfoList.Select(a => ">" + a.PdbId + a.LigChainId + "_interface\r\n" + a.LigInfSequence).ToList();
    File.WriteAllLines(rootFolder + "sequences.fasta", fastaSequences);
    File.WriteAllLines(rootFolder + "interfaces_pdbsum.fasta", fastaInfSequences);

    // Without this check a missing interface (-1) would only surface later as an out-of-range exception.
    if (recLigInfoList.Any(a => a.LigInfPosInLigSeq < 0))
    {
        Console.WriteLine("Error: interface not found in the sequence");
        return;
    }

    if (recLigInfoList.Any(a => a.LigSequence.IndexOf(a.LigInfSequence, StringComparison.Ordinal) != a.LigSequence.LastIndexOf(a.LigInfSequence, StringComparison.Ordinal)))
    {
        Console.WriteLine("More than one interface match in the sequence");
        return;
    }

    var folders = new List<string>();
    var templatePdbIds = new List<string>() { "1RGI", "3BX1", "2SIC" };
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    foreach (var template in recLigInfoList.Where(a => templatePdbIds.Contains(a.PdbId)))
    {
        var templateId = template.PdbId + template.RecChainId + template.LigChainId;
        var templateSrc = @"c:\pdb_templates\repaired_crystal_and_repaired_model\" + templateId + ".pdb";

        foreach (var sibling in recLigInfoList)
        {
            if (template != sibling && (template.LigInfSequence.Contains(sibling.LigInfSequence) || sibling.LigInfSequence.Contains(template.LigInfSequence)))
            {
                Debug.WriteLine("template and sibling have matching interface");
                continue;
            }

            var siblingId = sibling.PdbId + sibling.RecChainId + sibling.LigChainId;
            var siblingFolder = rootFolder + templateId + @"\" + siblingId + @"\";
            Directory.CreateDirectory(Path.GetDirectoryName(siblingFolder));
            File.Copy(templateSrc, siblingFolder + "template_ligand_all.pdb", true);

            // Only mode 0 (template against itself) is generated. The sibling substitution modes 1-3
            // sat behind an unconditional 'continue', never ran, and have been removed.
            if (template != sibling)
            {
                continue;
            }

            // Mode 0: replace the template interface with every same-length window of the template
            // ligand sequence that starts within 50 positions of the interface.
            var infPos = template.LigInfPosInLigSeq;
            var infLen = template.LigInfSequence.Length;
            var seqLen = template.LigSequence.Length;

            var firstPosition = Math.Max(0, infPos - 50);

            // Last start position at which a full window still fits inside the sequence.
            var lastPosition = Math.Min(infPos + infLen + 50, seqLen - infLen);

            var substitutionMode = "0";

            for (var i = firstPosition; i <= lastPosition; i++)
            {
                var overlap = ProteinBioClass.InterfaceOverlapPercentage(i, (i + infLen) - 1, infPos, (infPos + infLen) - 1);
                overlap = Math.Round(overlap, 2);

                var window = template.LigSequence.Substring(i, infLen);
                var ligSeqMod = template.LigSequence.Remove(infPos, infLen).Insert(infPos, window);

                // Folder name: <mode>_<window start, 1-based>_<template interface length + 1>_<sibling interface length + 1>_<overlap>
                var currentFolder = rootFolder + templateId + @"\" + templateId + @"\"
                    + substitutionMode
                    + "_" + (i + 1).ToString().PadLeft(4, '0')
                    + "_" + (infLen + 1).ToString().PadLeft(4, '0')
                    + "_" + (sibling.LigInfSequence.Length + 1).ToString().PadLeft(4, '0')
                    + "_" + overlap.ToString(invariant)
                    + @"\";
                Directory.CreateDirectory(Path.GetDirectoryName(currentFolder));

                var file = currentFolder + "template_ligand_after_substitution.ali";
                var seqModPir = Pir(new List<string>() { template.RecSequence, ligSeqMod });

                // Folders that already hold an alignment file are neither rewritten nor scheduled again.
                if (!File.Exists(file))
                {
                    if (!folders.Contains(currentFolder))
                    {
                        folders.Add(currentFolder);
                    }

                    File.WriteAllText(file, seqModPir);
                }
            }
        }
    }

    if (folders.Count > 0)
    {
        var scripts = new List<string> { "modeller_monomer.bat", "foldx_dimer.bat" };
        var scriptTag = string.Concat(scripts.Select(s => Path.GetFileNameWithoutExtension(s) + "_"));

        // At least one folder per batch; a batch size of 0 would never consume the folder list.
        var div = Math.Max(1, folders.Count / Environment.ProcessorCount);
        var c = 0;

        while (folders.Count > 0)
        {
            c++;

            // The final batch takes whatever is left once fewer than two full batches remain.
            var t = folders.Count >= div * 2 ? div : folders.Count;

            var batch = new List<string>
            {
                @"@echo off",
                @"set HDF5_DISABLE_VERSION_CHECK=2",
                @"set THIS_DIR=%cd%",
                @"set PATH=%PATH%;c:\modeller_scripts;",
            };

            foreach (var folder in folders.Take(t))
            {
                batch.Add(@"echo " + folder);
                batch.Add(@"cd " + folder);

                foreach (var script in scripts)
                {
                    batch.Add(@"CALL " + script);
                }
            }

            folders = folders.Skip(t).ToList();
            batch.Add(@"pause");

            File.WriteAllLines(rootFolder + @"r_" + c + "_" + scriptTag + DateTime.Now.Ticks + @".bat", batch);
        }
    }
}
/// <summary>
/// Clusters the distinct (escaped) amino acid sequences of <paramref name="seqList"/> by pairwise
/// aligned similarity. Every distinct sequence starts in its own cluster; two clusters are merged
/// as soon as one pair of their sequences reaches both similarity thresholds.
/// </summary>
/// <param name="seqList">Sequences to cluster; the pdb id and chain id are taken from each sequence id.</param>
/// <param name="alignmentIdentityOption">Passed through to <c>ProteinBioClass.AlignedSequenceSimilarityPercentage</c>.</param>
/// <param name="mininumClusterPairwiseSimilarity">
/// Minimum <c>Score</c> required to merge two clusters. Also used as the minimum
/// shorter/longer length ratio below which a pair is not aligned at all.
/// </param>
/// <param name="mininumEvoClusterPairwiseSimilarity">Minimum <c>ScoreEvo</c> required to merge two clusters.</param>
/// <returns>
/// One member per input chain. The cluster number is 1-based and clusters are numbered in
/// ascending order of the number of distinct sequences they contain.
/// </returns>
public static List<SequenceIdentityClusterMember> ClusterSequenceByAlignedSequenceIdentity(List<Sequence> seqList, ProteinBioClass.AlignmentIdentityOption alignmentIdentityOption, decimal mininumClusterPairwiseSimilarity = 0.3m, decimal mininumEvoClusterPairwiseSimilarity = 0.3m)
{
    // (pdb id, chain id, escaped sequence) for every input chain
    var allsequences = seqList.Select(a =>
    {
        var id = new ProteinBioClass.SequenceId(a.Id);
        return new Tuple<string, char, string>(id.PdbId, id.ChainId, Sequence.EscapeAminoAcidSequence(a.FullSequence));
    }).ToList();

    // identical sequences are aligned only once; sequenceIds[i] lists the chains sharing sequences[i]
    var sequences = allsequences.Select(a => a.Item3).Distinct().ToList();
    var sequenceIds = sequences.Select(a => allsequences.Where(b => b.Item3 == a).ToList()).ToList();

    // every distinct sequence starts as a singleton cluster
    var seqClusters = sequences.Select(a => new List<string> { a }).ToList();

    for (int indexX = 0; indexX < sequences.Count; indexX++)
    {
        Console.WriteLine("Aligning sequence " + indexX);

        var seqX = sequences[indexX];

        for (int indexY = indexX + 1; indexY < sequences.Count; indexY++)
        {
            var seqY = sequences[indexY];

            // skip pairs whose length ratio is already below the similarity threshold
            if ((decimal)Math.Min(seqX.Length, seqY.Length) / (decimal)Math.Max(seqX.Length, seqY.Length) < mininumClusterPairwiseSimilarity)
            {
                continue;
            }

            // looked up on every pair because earlier merges replace the cluster lists
            var cluster1 = seqClusters.FirstOrDefault(a => a.Contains(seqX));
            var cluster2 = seqClusters.FirstOrDefault(a => a.Contains(seqY));

            if (cluster1 != null && cluster2 != null && cluster1 == cluster2)
            {
                continue;
            }

            var score = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.NON, alignmentIdentityOption);

            Console.WriteLine("1: " + seqX);
            Console.WriteLine("2: " + seqY);
            Console.WriteLine("Score1: " + score.Score);
            Console.WriteLine("Score2: " + score.ScoreEvo);

            // fall back to the SIM alignment, keeping the best value seen so far for each metric
            if (score.Score < mininumClusterPairwiseSimilarity || score.ScoreEvo < mininumEvoClusterPairwiseSimilarity)
            {
                var simScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.SIM, alignmentIdentityOption);

                if (simScore.Score > score.Score)
                {
                    score.Score = simScore.Score;
                }

                if (simScore.ScoreEvo > score.ScoreEvo)
                {
                    score.ScoreEvo = simScore.ScoreEvo;
                }
            }

            // fall back to the NMW alignment. Each metric is maximised independently, exactly like
            // the SIM fallback above; replacing the whole score object here would discard a
            // higher ScoreEvo found by an earlier alignment.
            if (score.Score < mininumClusterPairwiseSimilarity || score.ScoreEvo < mininumEvoClusterPairwiseSimilarity)
            {
                var nmwScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.NMW, alignmentIdentityOption);

                if (nmwScore.Score > score.Score)
                {
                    score.Score = nmwScore.Score;
                }

                if (nmwScore.ScoreEvo > score.ScoreEvo)
                {
                    score.ScoreEvo = nmwScore.ScoreEvo;
                }
            }

            if (score.Score >= mininumClusterPairwiseSimilarity && score.ScoreEvo >= mininumEvoClusterPairwiseSimilarity)
            {
                // merge the two clusters into a new one
                var newCluster = new List<string>();
                newCluster.AddRange(cluster1);
                newCluster.AddRange(cluster2);

                seqClusters.Remove(cluster1);
                seqClusters.Remove(cluster2);
                seqClusters.Add(newCluster);
            }
        }
    }

    seqClusters = seqClusters.OrderBy(a => a.Count).ToList();

    // expand every clustered sequence back into the chains that share it
    var output = new List<SequenceIdentityClusterMember>();

    for (var index = 0; index < seqClusters.Count; index++)
    {
        foreach (var item in seqClusters[index])
        {
            var ids = sequenceIds[sequences.IndexOf(item)];

            foreach (var id in ids)
            {
                output.Add(new SequenceIdentityClusterMember(index + 1, ProteinBioClass.PdbIdFromPdbFilename(id.Item1), id.Item2, id.Item3));
            }
        }
    }

    return(output);
}
/// <summary>
/// Entry point of the homolog finder. Aligns every selected query sequence against all target
/// sequences of at least 50 residues and writes one CSV of matching chains per query chain, or
/// prints the rows to the console when no output folder is given.
/// </summary>
/// <param name="args">
/// [0] query sequence file, [1] query id as "pdbid:chain" ("*" matches anything),
/// [2] target sequence file, [3] alignment types (NMW, SWM, SIM, NON or "*" for all),
/// [4] "Y" to filter on the evo score instead of the plain score, [5] minimum similarity,
/// [6] maximum length difference (-1 disables the filter), [7] optional output folder.
/// </param>
static void Main(string[] args)
{
    // this program takes a fasta or pdb file and finds all matching homologs
    // FindHomologs.exe "c:\ds96ub\ds96ub.fasta" * "c:\pdb\pdb_seqres.fasta" NMW Y 0.3 75 c:\pdb\
    // alignment_type = (n)one, (s)imple, NMW, SWM

    if (args == null || args.Length < 7)
    {
        Console.WriteLine("; usage: [query_sequence_file] [query_pdbid:chainid] [target_sequence_file] [alignment_types] [compare_physicochemically Y/N] [min_similarity] [max_len_difference] [[output_folder]]");
        return;
    }

    var query_sequence_file = args[0]; //query.fasta
    var query_id_chain = args[1]; //1A2G:B
    var target_sequence_file = args[2]; //targets.fasta
    var alignment_type_str = args[3]; //NMW,SWM,SIM,NON

    if (alignment_type_str == "*")
    {
        alignment_type_str = "NMW,SWM,SIM,NON";
    }

    var alignment_type_str_split = alignment_type_str.ToUpperInvariant().Split(new char[] { ',', ';', ' ', '\t' });

    var compare_physicochemically = args[4]; //Y/N
    var compare_physicochemically_bool = compare_physicochemically == "Y";

    // command line numbers are machine formatted ("0.3"), so they must not depend on the OS locale
    var min_similarity_str = args[5]; // 0.3
    var minSimilarity = decimal.Parse(min_similarity_str, System.Globalization.CultureInfo.InvariantCulture);

    var max_len_difference = args[6];
    var max_len_difference_int = int.Parse(max_len_difference, System.Globalization.CultureInfo.InvariantCulture);

    // the output folder is optional: when it is blank the results are printed to the console
    var output_folder = args.Length > 7 ? args[7] : "";
    var writeToFile = !string.IsNullOrWhiteSpace(output_folder);
    var outputDirectory = writeToFile ? new DirectoryInfo(output_folder).FullName : null;

    var alignmentTypes = new List<ProteinBioClass.AlignmentType>();

    if (alignment_type_str_split.Contains("NMW"))
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.NMW);
    }

    if (alignment_type_str_split.Contains("SWM"))
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.SWM);
    }

    if (alignment_type_str_split.Contains("SIM"))
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.SIM);
    }

    if (alignment_type_str_split.Contains("NON") || alignmentTypes.Count == 0)
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.NON);
    }

    if (alignmentTypes.Count < alignment_type_str_split.Length)
    {
        Console.WriteLine("; unknown alignment type");
        return;
    }

    // load list of query sequences
    // a missing or empty chain part ("1A2G" or "1A2G:") selects every chain
    var queryIdParts = query_id_chain.Split(new char[] { ':' });
    var queryPdbid = queryIdParts[0];
    var queryChainid = queryIdParts.Length > 1 && queryIdParts[1].Length > 0 ? queryIdParts[1][0] : '*';

    var querySeq = Sequence.LoadSequenceFile(query_sequence_file, null);

    var queryResults = querySeq.Where(a =>
    {
        var id = new ProteinBioClass.SequenceId(a.Id);
        return((queryPdbid == "*" || id.PdbId.ToUpperInvariant() == queryPdbid.ToUpperInvariant()) && (queryChainid == '*' || id.ChainId == queryChainid));
    }).ToList();

    if (queryResults.Count == 0)
    {
        Console.WriteLine("; the query pdbids/chainids were not found");
        return;
    }

    // load list of target sequences
    var targetSeq = Sequence.LoadSequenceFile(target_sequence_file, new string[] { null, "", "protein" });
    targetSeq = targetSeq.Where(a => a.Count() >= 50).ToList();

    Console.WriteLine("; aligning " + queryResults.Count + " query sequences to " + targetSeq.Count + " target sequences");

    // number of chains per pdb id, reported alongside every match
    var queryPdbIdCounts = queryResults
        .GroupBy(a => new ProteinBioClass.SequenceId(a.Id).PdbId)
        .ToDictionary(g => g.Key, g => g.Count());

    var targetPdbIdCounts = targetSeq
        .GroupBy(a => new ProteinBioClass.SequenceId(a.Id).PdbId)
        .ToDictionary(g => g.Key, g => g.Count());

    // perform alignment
    foreach (var _query in queryResults)
    {
        var _queryId = new ProteinBioClass.SequenceId(_query.Id);

        // the file name is only built when an output folder was given
        var filename = writeToFile ? Path.Combine(outputDirectory, "homologs_" + _queryId.PdbId + _queryId.ChainId + ".csv") : null;

        // skip if already processed
        if (filename != null && File.Exists(filename) && new FileInfo(filename).Length > 0)
        {
            continue;
        }

        var totalQueryPdbIdChains = queryPdbIdCounts[_queryId.PdbId];

        WorkDivision wd = new WorkDivision(targetSeq.Count);

        for (var thread = 0; thread < wd.ThreadCount; thread++)
        {
            // copies for the closure, so each task keeps its own query and index range
            var query = _query;
            var queryId = _queryId;
            var lti = thread;

            wd.TaskList.Add(Task.Run(() =>
            {
                var result = new List<HomologChain>();

                for (var target = wd.ThreadFirstIndex[lti]; target <= wd.ThreadLastIndex[lti]; target++)
                {
                    var targetobj = targetSeq[target];

                    if (max_len_difference_int != -1 && Math.Abs(targetobj.Count() - query.Count()) > max_len_difference_int)
                    {
                        continue;
                    }

                    var targetId = new ProteinBioClass.SequenceId(targetobj.Id);

                    foreach (var alignmentType in alignmentTypes)
                    {
                        var scores = ProteinBioClass.AlignedSequenceSimilarityPercentage(query, targetobj, alignmentType);

                        var percentSimilar = compare_physicochemically_bool ? scores.ScoreEvo : scores.Score;

                        if (percentSimilar >= minSimilarity)
                        {
                            result.Add(new HomologChain(
                                queryId.PdbId,
                                queryId.ChainId,
                                totalQueryPdbIdChains,
                                targetId.PdbId,
                                targetId.ChainId,
                                targetPdbIdCounts[targetId.PdbId],
                                alignmentType.ToString(),
                                scores.Score,
                                scores.ScoreEvo));
                        }
                    }
                }

                return(result);
            }));
        }

        wd.WaitAllTasks();

        var mergedlist = new List<string>();
        mergedlist.Add("; " + _queryId.PdbId + ":" + _queryId.ChainId);
        mergedlist.Add(String.Join(",", new string[] { "query pdb id", "query chain id", "query chains", "target pdb id", "target chain id", "target chains", "alignment method", "sequence similarity", "sequence evo similarity" }));

        foreach (var t in wd.TaskList)
        {
            var tc = t as Task<List<HomologChain>>;

            if (tc == null)
            {
                throw new Exception("task in tasklist was null");
            }

            mergedlist.AddRange(tc.Result.Select(a => a.ToString()).ToList());
        }

        if (!writeToFile)
        {
            Console.WriteLine(String.Join(Environment.NewLine, mergedlist));
        }
        else
        {
            File.WriteAllLines(filename, mergedlist);
        }
    }
}
/// <summary>
/// Entry point of the atomic contact finder. Lists the atom pairs of a PDB/atoms file that lie
/// within a maximum distance, writing them to an output file or to the console.
/// </summary>
/// <param name="args">
/// [0] pdb or atoms file, [1] maximum contact distance (invariant format, e.g. 8.0),
/// [2] optional output file, [3] optional overwrite flag.
/// </param>
static void Main(string[] args)
{
    var parameters = new string[, ]
    {
        { "[pdb_or_atoms_file]", "output from the ComplexAtoms program" },
        { "[max_distance]", "maximum allowed contact distance in angstroms [i.e. 5.0 or 8.0]" },
        { "[[output_file]]", "optional output file. when ommitted, output to console" },
        { "[[overwrite]]", "overwrite if output file exists" }
    };

    // even positions of the flattened table are the parameter names
    var maxParamLength = parameters.Cast<string>().Where((a, i) => i % 2 == 0).Max(a => a.Length);
    var exeFilename = Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);

    if (args.Length == 0)
    {
        Console.WriteLine(exeFilename + @" is a program to list atomic contacts for a PDB file ATOM records.");
        Console.WriteLine();
        Console.WriteLine(@"Usage:");
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" " + String.Join(" ", parameters.Cast<string>().Where((a, i) => i % 2 == 0)), maxParamLength + 2, 1));
        Console.WriteLine();
        Console.WriteLine(@"Example:");
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" ""c:\pdb_db\pdb1a12.pdb"" 8.0 ""c:\pdb_atoms\atoms1a12.pdb""", maxParamLength + 2, 1));
        Console.WriteLine();
        Console.WriteLine(@"Arguments:");

        for (var i = 0; i < parameters.GetLength(0); i++)
        {
            Console.WriteLine(@" " + parameters[i, 0].PadLeft(maxParamLength, ' ') + " " + ProteinBioClass.WrapConsoleText(parameters[i, 1], maxParamLength + 2, 1, false));
        }

        Console.WriteLine();
        // no return here: execution continues with empty arguments and stops at the
        // input-file existence check further down
    }

    // load arguments
    var p = 0;
    var atomsFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";
    atomsFilename = atomsFilename.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + atomsFilename);

    p++;
    // the distance is documented in invariant format ("8.0"); parsing it with the current culture
    // would misread it on locales that use ',' as the decimal separator
    var maxDistance = args.Length > p && args[p].Length > 0 ? Decimal.Parse(args[p], System.Globalization.CultureInfo.InvariantCulture) : 0.0m;
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + maxDistance);

    p++;
    var outputFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";
    outputFilename = outputFilename.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + outputFilename);

    p++;
    var overwrite = args.Length > p && args[p].Length > 0 ? args[p] : "";
    overwrite = overwrite.Replace("\"", "");
    Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + overwrite);

    // an existing output file is only protected when an overwrite value other than "Y" was given
    if (!string.IsNullOrWhiteSpace(overwrite) && overwrite.ToUpperInvariant() != "Y" && File.Exists(outputFilename))
    {
        Console.Write("; File exists, skipping.");
        return;
    }

    Console.WriteLine();

    if (!File.Exists(atomsFilename))
    {
        return;
    }

    var interactions = ProteinBioClass.FindInteractions(CancellationToken.None, maxDistance, atomsFilename, new Dictionary<string, List<char>>());

    if (!string.IsNullOrWhiteSpace(outputFilename))
    {
        ProteinBioClass.AtomPair.SaveAtomPairList(outputFilename, interactions);
    }
    else
    {
        foreach (var line in interactions.Select(pair => pair.ToString()).ToList())
        {
            Console.WriteLine(line);
        }
    }
}
/// <summary>
/// Entry point of the interface extraction tool. For every chain of every PDB file in the
/// (hard-coded) homolog folder it loads the pre-computed atom contacts, picks the residue window
/// of roughly <c>interface_target_length</c> residues with the most contacts per partner chain,
/// pairs each interface with its partner interfaces inside the same homolog cluster, groups
/// similar interface sequences and writes the result to a CSV file.
/// </summary>
/// <param name="args">Not used; all paths are hard-coded.</param>
static void Main(string[] args)
{
    var pdbFolder = @"C:\ds96ub_homologs\";
    var homologClusterData = FindHomologsCluster.FindHomologsCluster.HomologClusterData.Load(@"c:\ds96ub_homologs\ds96ub_homologs_0.7.csv");

    var pdbFiles = Directory.GetFiles(pdbFolder, "*.pdb", SearchOption.TopDirectoryOnly);
    var pdbIdList = pdbFiles.Select(ProteinBioClass.PdbIdFromPdbFilename).ToList();

    // only ca-atoms and endmdls
    // the length guard protects the column lookups on truncated ATOM lines; ENDMDL is matched
    // without a trailing blank so files with trimmed line ends are handled too
    var pdbAtomsText = pdbFiles.Select(
        a => File.ReadAllLines(a)
                 .Where(b => (b.StartsWith("ATOM ") && b.Length > 14 && b[13] == 'C' && b[14] == 'A') || b.StartsWith("ENDMDL"))
                 .ToList()).ToList();

    // only first nmr model: keep everything before the first ENDMDL record.
    // x is the index of that record, so exactly x lines precede it.
    pdbAtomsText = pdbAtomsText.Select(a =>
    {
        var x = a.FindIndex(b => b.StartsWith("ENDMDL"));
        return(x == -1 ? a : a.GetRange(0, x));
    }).ToList();

    var pdbAtoms = pdbAtomsText.Select(a => a.Select(b => new ATOM_Record(b)).ToList()).ToList();

    // get list of unique chain ids per pdb file (kept parallel to pdbIdList)
    var pdbChainIds = pdbAtoms.Select(a => a.Select(b => char.ToUpperInvariant(b.chainID.FieldValue[0])).Distinct().ToList()).ToList();

    var pdbIdChainIdList = new List<Tuple<string, char> >();

    for (var i = 0; i < pdbIdList.Count; i++)
    {
        pdbIdChainIdList.AddRange(pdbChainIds[i].Select(chainId => new Tuple<string, char>(pdbIdList[i], chainId)));
    }

    pdbIdChainIdList = pdbIdChainIdList.Distinct().ToList();

    // for each chain: the contacts touching it, oriented so that Atom1 always lies on that chain.
    // The contact file is loaded once per chain on purpose: SwapAtoms mutates the pair, so the
    // pair objects must not be shared between chains.
    var pdbContacts = pdbIdChainIdList.Select(a =>
    {
        var x = ProteinBioClass.AtomPair.LoadAtomPairList(@"C:\ds96ub_homologs\contacts\contacts_pdb" + a.Item1.ToUpperInvariant() + ".pdb")
                .Where(b => char.ToUpperInvariant(b.Atom1.chainID.FieldValue[0]) == a.Item2 || char.ToUpperInvariant(b.Atom2.chainID.FieldValue[0]) == a.Item2)
                .Select(c =>
                {
                    if (char.ToUpperInvariant(c.Atom1.chainID.FieldValue[0]) != a.Item2)
                    {
                        c.SwapAtoms();
                    }

                    return(c);
                }).ToList();

        return(x);
    }).ToList();

    // res min, res max, best min, best max, interface aa, interface mask
    var pdbInterfaces = new List<Ds93UbInterface>();

    var interface_target_length = 50;

    for (int index = 0; index < pdbContacts.Count; index++)
    {
        var pdbId = pdbIdChainIdList[index].Item1;
        var chainId = pdbIdChainIdList[index].Item2;
        var pdbContact = pdbContacts[index];

        if (pdbContact.Count == 0)
        {
            continue;
        }

        var contactChains = pdbContact.Where(a => char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0]) != chainId)
                                      .Select(a => char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0]))
                                      .Distinct()
                                      .ToList();

        foreach (var contactChain in contactChains)
        {
            // one entry per contact, so a residue with several contacts is counted several times
            var res_seq = pdbContact.Where(a => char.ToUpperInvariant(a.Atom1.chainID.FieldValue[0]) == chainId && char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0]) == contactChain)
                                    .Select(a => int.Parse(a.Atom1.resSeq.FieldValue))
                                    .ToList();

            var min_res_seq = res_seq.Min();
            var max_res_seq = res_seq.Max();

            var best50_min = int.MinValue;
            var best50_max = int.MinValue;
            var best50_interactions = int.MinValue;
            var best50_middle_finder = new List<Tuple<int, int, int> >();

            if (Math.Abs(max_res_seq - min_res_seq) <= interface_target_length)
            {
                // the whole contact range already fits into the target length
                best50_min = min_res_seq;
                best50_max = max_res_seq;
                best50_interactions = res_seq.Count;
            }
            else
            {
                // slide a window over the contact range and keep the one holding the most contacts
                for (var x = min_res_seq - interface_target_length; x <= max_res_seq; x++)
                {
                    var min = x;
                    var max = Math.Min(x + interface_target_length, max_res_seq);

                    // count the contacts inside the CURRENT window (not inside the best window so far)
                    var windowInteractions = res_seq.Count(a => a >= min && a <= max);

                    if (windowInteractions == best50_interactions)
                    {
                        best50_middle_finder.Add(new Tuple<int, int, int>(min, max, windowInteractions));
                    }

                    if (best50_interactions == int.MinValue || windowInteractions > best50_interactions)
                    {
                        best50_middle_finder.Clear();
                        best50_middle_finder.Add(new Tuple<int, int, int>(min, max, windowInteractions));

                        best50_min = min;
                        best50_max = max;
                        best50_interactions = windowInteractions;
                    }

                    // stop once the window has reached the last contact residue
                    if (x + interface_target_length >= max_res_seq)
                    {
                        break;
                    }
                }
            }

            // several equally good windows: use the middle one
            if (best50_middle_finder.Count > 2)
            {
                var middle = best50_middle_finder[best50_middle_finder.Count / 2];
                best50_min = middle.Item1;
                best50_max = middle.Item2;
                best50_interactions = middle.Item3;
            }

            var best50_interface_atoms = pdbAtoms[pdbIdList.IndexOf(pdbId)].Where(a =>
            {
                var l = int.Parse(a.resSeq.FieldValue);
                var c = char.ToUpperInvariant(a.chainID.FieldValue[0]);
                return(c == chainId && l >= best50_min && l <= best50_max);
            }).ToList();

            best50_interface_atoms = best50_interface_atoms.OrderBy(c => int.Parse(c.resSeq.FieldValue)).ToList();

            var best50_interface = string.Join("", best50_interface_atoms.Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

            // one mask character per interface atom, looked up by the atom's own residue number so
            // that the mask stays aligned when residue numbering has gaps or the window starts
            // before the first residue (assumes one code character per residue - TODO confirm)
            var contactResidues = new HashSet<int>(res_seq);
            var best50_mask = string.Join("", best50_interface_atoms.Select(a => contactResidues.Contains(int.Parse(a.resSeq.FieldValue)) ? "X" : "_").ToList());

            pdbInterfaces.Add(new Ds93UbInterface(pdbId, chainId, contactChain, min_res_seq, max_res_seq, best50_min, best50_max, best50_interactions, best50_interface, best50_mask, -1, "", "", 0, -1, "", "", 0));
        }
    }

    var homologClusterIndexes = homologClusterData.Select(a => a.ClusterIndex).Distinct().ToList();
    var homologClusters = homologClusterIndexes.Select(a => homologClusterData.Where(b => b.ClusterIndex == a).ToList()).ToList();

    // parallel to homologClusterIndexes: the interfaces belonging to each homolog cluster
    var pdbInterfacesSorted = homologClusters.Select(a => pdbInterfaces.Where(b => a.Any(c => c.PdbId == b.PdbId && (char.ToUpperInvariant(c.ChainId) == b.ChainId1 || char.ToUpperInvariant(c.ChainId) == b.ChainId2))).ToList()).ToList();

    var outputData = new List<string>();

    // iterate by position: pdbInterfacesSorted is ordered like homologClusterIndexes, which is not
    // guaranteed to be the contiguous sequence 1..N
    for (var clusterPosition = 0; clusterPosition < homologClusterIndexes.Count; clusterPosition++)
    {
        var clusterIndex = homologClusterIndexes[clusterPosition];
        var cluster = pdbInterfacesSorted[clusterPosition];

        // currently, cluster is a list of chain1-->chain2 interfaces ... so the 'chain2' interface needs adding to the record
        foreach (var inf1 in cluster)
        {
            var partner = cluster.Where(a => a != inf1 && a.PdbId == inf1.PdbId && inf1.ChainId2 == a.ChainId1)
                                 .OrderByDescending(a => InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, a.MinResSeq, a.MaxResSeq))
                                 .ToList();

            var first = partner.FirstOrDefault();

            if (first != null)
            {
                inf1.Partner1InterfaceAminoAcids = first.InterfaceAminoAcids;
                inf1.Partner1InterfaceInteractionsMask = first.InterfaceInteractionsMask;
                inf1.Partner1InterfaceOverlap = InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, first.MinResSeq, first.MaxResSeq);
            }

            var second = partner.ElementAtOrDefault(1);

            if (second != null)
            {
                inf1.Partner2InterfaceAminoAcids = second.InterfaceAminoAcids;
                inf1.Partner2InterfaceInteractionsMask = second.InterfaceInteractionsMask;
                inf1.Partner2InterfaceOverlap = InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, second.MinResSeq, second.MaxResSeq);
            }
        }

        cluster = cluster.Where(a => a.Partner1InterfaceOverlap > 0 || a.Partner2InterfaceOverlap > 0).ToList();
        cluster = cluster.Where(a => a.InterfaceAminoAcids.Length >= 5).ToList();

        var clusterInterfaces = cluster.Select(a => a.InterfaceAminoAcids).ToList();

        // group every interface sequence with its most similar other sequence
        var homologInterfaces = new List<List<string> >();

        foreach (var inf1 in clusterInterfaces)
        {
            var highest_score = decimal.MinValue;
            string highest_inf = null;

            foreach (var inf2 in clusterInterfaces)
            {
                if (inf1 == inf2)
                {
                    continue;
                }

                var score = ProteinBioClass.AlignedSequenceSimilarityPercentage(inf1, inf2, ProteinBioClass.AlignmentType.NMW);

                if (score.Score > highest_score)
                {
                    highest_score = score.Score;
                    highest_inf = inf2;
                }
            }

            // highest_inf stays null when no different sequence exists; never store that null
            var y = homologInterfaces.FirstOrDefault(a => a.Contains(inf1) || (highest_inf != null && a.Contains(highest_inf)));

            if (y == null)
            {
                y = new List<string>();
                homologInterfaces.Add(y);
            }

            if (!y.Contains(inf1))
            {
                y.Add(inf1);
            }

            if (highest_inf != null && !y.Contains(highest_inf))
            {
                y.Add(highest_inf);
            }
        }

        foreach (var c in cluster)
        {
            c.Partner1ClusterIndex = homologInterfaces.FindIndex(b => b.Contains(c.Partner1InterfaceAminoAcids));
            c.Partner2ClusterIndex = homologInterfaces.FindIndex(b => b.Contains(c.Partner2InterfaceAminoAcids));
        }

        for (int index = 0; index < homologInterfaces.Count; index++)
        {
            var homologInterface = homologInterfaces[index];

            var orderedMembers = cluster.Where(a => homologInterface.Contains(a.InterfaceAminoAcids))
                                        .OrderBy(a => a.Partner1ClusterIndex)
                                        .ThenBy(a => a.Partner2ClusterIndex)
                                        .ThenBy(a => a.InterfaceAminoAcids)
                                        .ThenBy(a => a.Partner1InterfaceAminoAcids)
                                        .ThenBy(a => a.Partner2InterfaceAminoAcids)
                                        .ToList();

            // keep one record per distinct (interface, partner 1, partner 2) combination
            var distinctKeys = orderedMembers.Select(a => new Tuple<string, string, string>(a.InterfaceAminoAcids, a.Partner1InterfaceAminoAcids, a.Partner2InterfaceAminoAcids))
                                             .Distinct()
                                             .ToList();

            var cluster2 = distinctKeys.Select(a => orderedMembers.FirstOrDefault(b => b.InterfaceAminoAcids == a.Item1 && b.Partner1InterfaceAminoAcids == a.Item2 && b.Partner2InterfaceAminoAcids == a.Item3))
                                       .ToList();

            outputData.Add("cluster " + clusterIndex + "." + index);
            outputData.AddRange(cluster2.Select(a => a.ToString()).ToList());
            outputData.Add("");
        }
    }

    File.WriteAllLines(@"c:\ds96ub_homologs\ds96ub_homologs_interfaces.csv", outputData);
}
/// <summary>
/// Entry point of the (unfinished) sequence/structure offset tool. Prints usage when started
/// without arguments; otherwise echoes the arguments and loads the amino acid sequence of the
/// structure file. The comparison against the fasta file is not implemented yet.
/// </summary>
/// <param name="args">[0] pdb or atoms file, [1] fasta file, [2] optional output file.</param>
static void Main(string[] args)
{
    var parameters = new string[, ]
    {
        { "[pdb_or_atoms_file]", "input structure for sequence" },
        { "[fasta_file]", "input sequence for structure" },
        { "[[output_file]]", "optional output file" },
    };

    // first column holds the parameter names, second column their descriptions
    var parameterCount = parameters.GetLength(0);
    var parameterNames = Enumerable.Range(0, parameterCount).Select(row => parameters[row, 0]).ToList();
    var maxParamLength = parameterNames.Max(name => name.Length);

    var exeFilename = Path.GetFileName(Process.GetCurrentProcess().MainModule.FileName);

    if (args.Length == 0)
    {
        Console.WriteLine(exeFilename + @" is a program to calculate offset between the sequence and structure.");
        Console.WriteLine();

        Console.WriteLine(@"Usage:");
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" " + String.Join(" ", parameterNames), maxParamLength + 2, 1));
        Console.WriteLine();

        Console.WriteLine(@"Example:");
        Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" ""c:\pdb_db\atoms\atoms1a12.pdb"" ""c:\pdb_db\fasta\atoms1a12.fasta""", maxParamLength + 2, 1));
        Console.WriteLine();

        Console.WriteLine(@"Arguments:");

        for (var row = 0; row < parameterCount; row++)
        {
            var paddedName = parameters[row, 0].PadLeft(maxParamLength, ' ');
            var description = ProteinBioClass.WrapConsoleText(parameters[row, 1], maxParamLength + 2, 1, false);
            Console.WriteLine(@" " + paddedName + " " + description);
        }

        Console.WriteLine();
        return;
    }

    // reads one positional argument (empty when missing), strips quotes and echoes it
    Func<int, string> readArgument = position =>
    {
        var raw = "";

        if (position < args.Length && args[position].Length > 0)
        {
            raw = args[position];
        }

        var value = raw.Replace("\"", "");
        Console.WriteLine("; " + parameters[position, 0].PadLeft(maxParamLength, ' ') + " = " + value);
        return value;
    };

    // load arguments
    var atomsFilename = readArgument(0);
    var inputFastaFilename = readArgument(1);
    var outputDataFilename = readArgument(2);

    Console.WriteLine();

    var struct_seq = ProteinBioinformaticsSharedLibrary.ProteinBioClass.StructureFileToAaSequence(atomsFilename, null, false);

    // not finished: the fasta file is not read yet and nothing is written to the output file
}
/// <summary>
/// Loads the DSSP secondary structure string for a residue range of a structure file.
/// </summary>
/// <param name="pdbFilename">
/// Structure file name; ".dssp" is appended unless the name already ends with that extension.
/// </param>
/// <param name="chainId">Chain to read (case-insensitive), or null to read the records of all chains.</param>
/// <param name="startResidueSequenceIndex">First PDB residue number; -1 uses the lowest residue number found.</param>
/// <param name="endResidueSequenceIndex">Last PDB residue number; -1 uses the highest residue number found.</param>
/// <param name="reversedSequence">When true the resulting string is reversed.</param>
/// <returns>
/// One character per residue position of the range: the first character of the DSSP secondary
/// structure field, or '_' where no record or no secondary structure is available. Returns an
/// empty string when the file name is blank, the DSSP file does not exist, the chain is not
/// present, or a default bound is needed but no residue number can be parsed.
/// </returns>
public static string LoadDsspStructureSequence(string pdbFilename, string chainId = null, int startResidueSequenceIndex = -1, int endResidueSequenceIndex = -1, bool reversedSequence = false)
{
    if (string.IsNullOrWhiteSpace(pdbFilename))
    {
        return("");
    }

    var dsspFilename = pdbFilename;

    if (Path.GetExtension(dsspFilename) != ".dssp")
    {
        dsspFilename += ".dssp";
    }

    if (!File.Exists(dsspFilename))
    {
        return("");
    }

    var secondaryStructure = DsspFormatFile.LoadDsspFile(dsspFilename);

    if (chainId != null && secondaryStructure.FirstOrDefault(a => a.FieldChain.FieldValue.ToUpperInvariant() == chainId.ToUpperInvariant()) == null)
    {
        return("");
    }

    if (startResidueSequenceIndex == -1 || endResidueSequenceIndex == -1)
    {
        // Default bounds come from the residue numbers of the requested chain. Records whose
        // residue number is blank or not numeric are skipped instead of failing the whole call
        // (assumes NullableTryParseInt32 yields null for such values - TODO confirm).
        var residueIndexes = secondaryStructure
            .Where(a => chainId == null || a.FieldChain.FieldValue.ToUpperInvariant() == chainId.ToUpperInvariant())
            .Select(a => NullableTryParseInt32(a.FieldPdbResidueSequenceIndex.FieldValue))
            .Where(a => a != null)
            .Select(a => a.Value)
            .ToList();

        if (residueIndexes.Count == 0)
        {
            return("");
        }

        if (startResidueSequenceIndex == -1)
        {
            startResidueSequenceIndex = residueIndexes.Min();
        }

        if (endResidueSequenceIndex == -1)
        {
            endResidueSequenceIndex = residueIndexes.Max();
        }
    }

    // dssp specification says order may not be correct
    secondaryStructure = secondaryStructure
        .Where(a => !string.IsNullOrWhiteSpace(a.FieldChain.FieldValue) && !string.IsNullOrWhiteSpace(a.FieldPdbResidueSequenceIndex.FieldValue))
        .OrderBy(a => a.FieldChain.FieldValue)
        .ThenBy(a => NullableTryParseInt32(a.FieldPdbResidueSequenceIndex.FieldValue))
        .ToList();

    var proteinInterfaceLen = CalculateProteinInterfaceLength(startResidueSequenceIndex, endResidueSequenceIndex);

    // positions without a usable record keep the '_' placeholder
    char[] result = new char[proteinInterfaceLen];

    for (int index = 0; index < result.Length; index++)
    {
        result[index] = '_';
    }

    foreach (var record in secondaryStructure.Where(a => chainId == null || a.FieldChain.FieldValue.ToUpperInvariant() == chainId.ToUpperInvariant()))
    {
        var resSeq = NullableTryParseInt32(record.FieldPdbResidueSequenceIndex.FieldValue);

        if (resSeq == null || resSeq < startResidueSequenceIndex || resSeq > endResidueSequenceIndex)
        {
            continue;
        }

        if (record.FieldSecondaryStructure.FieldValue.Length == 0)
        {
            continue;
        }

        // position of the residue relative to the start of the requested range
        var position = resSeq.Value - startResidueSequenceIndex;
        result[position] = record.FieldSecondaryStructure.FieldValue[0];
    }

    if (reversedSequence)
    {
        Array.Reverse(result);
    }

    return(new string(result));
}