Ejemplo n.º 1
0
        /// <summary>
        /// Loads the sequences of a structure file identified by its path.
        /// The file is parsed into atomic chains, the PDB id is derived from the file name, and both are
        /// handed to the <c>LoadStructureFile</c> overload that accepts the already-parsed structure.
        /// </summary>
        /// <param name="atomsFilename">Path of the structure file; also the source of the PDB id.</param>
        /// <param name="chainIdWhiteList">Chain filter, passed both to the parser and to the forwarded call (may be null).</param>
        /// <param name="padMissingBool">Forwarded unchanged.</param>
        /// <param name="startResSeq">Forwarded unchanged.</param>
        /// <param name="endResSeq">Forwarded unchanged.</param>
        /// <param name="outsidePaddingChar">Forwarded unchanged.</param>
        /// <param name="insidePaddingChar">Forwarded unchanged.</param>
        /// <returns>Whatever the forwarded overload returns for the parsed structure.</returns>
        public static List <Sequence> LoadStructureFile(string atomsFilename, char[] chainIdWhiteList = null, bool padMissingBool = true, int[] startResSeq = null, int[] endResSeq = null, char outsidePaddingChar = ' ', char insidePaddingChar = 'X')
        {
            // Parse first, then resolve the id (same order as the calls have always been made).
            var parsedStructure = ProteinBioClass.PdbAtomicChains(atomsFilename, chainIdWhiteList, -1, -1, true);
            var structureId     = ProteinBioClass.PdbIdFromPdbFilename(atomsFilename);

            var sequences = LoadStructureFile(
                parsedStructure,
                structureId,
                chainIdWhiteList,
                padMissingBool,
                startResSeq,
                endResSeq,
                outsidePaddingChar,
                insidePaddingChar);

            return sequences;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Loads the sequences of a structure supplied as in-memory text lines.
        /// After parsing, every chain's atom list is reduced to the first atom of each distinct resSeq value,
        /// ordered by the numeric value of resSeq, before the structure is handed to the overload that
        /// accepts the already-parsed structure.
        /// </summary>
        /// <param name="structureFileLines">Lines of the structure file.</param>
        /// <param name="pdbId">Identifier forwarded unchanged to the next overload.</param>
        /// <param name="chainIdWhiteList">Chain filter, passed both to the parser and to the forwarded call (may be null).</param>
        /// <param name="padMissingBool">Forwarded unchanged.</param>
        /// <param name="startResSeq">Forwarded unchanged.</param>
        /// <param name="endResSeq">Forwarded unchanged.</param>
        /// <param name="outsidePaddingChar">Forwarded unchanged.</param>
        /// <param name="insidePaddingChar">Forwarded unchanged.</param>
        /// <returns>Whatever the forwarded overload returns for the reduced structure.</returns>
        public static List <Sequence> LoadStructureFile(string[] structureFileLines, string pdbId = "", char[] chainIdWhiteList = null, bool padMissingBool = true, int[] startResSeq = null, int[] endResSeq = null, char outsidePaddingChar = ' ', char insidePaddingChar = 'X')
        {
            var parsedStructure = ProteinBioClass.PdbAtomicChains(structureFileLines, chainIdWhiteList, -1, -1, false);

            foreach (var chain in parsedStructure.ChainList)
            {
                // One representative atom per residue number, sorted numerically (resSeq is stored as text).
                var atomsByResSeq   = chain.AtomList.GroupBy(atom => atom.resSeq.FieldValue);
                var orderedResidues = atomsByResSeq.OrderBy(residue => int.Parse(residue.Key));

                chain.AtomList = orderedResidues.Select(residue => residue.First()).ToList();
            }

            var sequences = LoadStructureFile(
                parsedStructure,
                pdbId,
                chainIdWhiteList,
                padMissingBool,
                startResSeq,
                endResSeq,
                outsidePaddingChar,
                insidePaddingChar);

            return sequences;
        }
        /// <summary>
        /// Aligns the structure-derived sequence of one chain against its SEQRES (fasta) sequence.
        /// Only the first four characters of <paramref name="pdbId"/> are used. The structure is read from
        /// <c>c:\pdbe\&lt;id&gt;.pdb</c> and the fasta sequences from <c>c:\pdbe\pdb_seqres.fasta</c>.
        /// </summary>
        /// <param name="pdbId">PDB identifier; must contain at least four characters.</param>
        /// <param name="chainId">Chain whose sequences and atoms are selected.</param>
        /// <param name="first">Forwarded unchanged to <c>StructureToSequenceAlignment.Align</c>.</param>
        /// <param name="last">Forwarded unchanged to <c>StructureToSequenceAlignment.Align</c>.</param>
        /// <returns>The result of <c>StructureToSequenceAlignment.Align</c> for the selected chain.</returns>
        public static StructureToSequenceAlignment.StructureToSequenceAlignmentResult GetSequence(string pdbId, char chainId, int first = -1, int last = -1)
        {
            pdbId = pdbId.Substring(0, 4);

            var structurePath = @"c:\pdbe\" + pdbId + ".pdb";
            var seqresPath    = @"c:\pdbe\pdb_seqres.fasta";

            // The id comparison is case-insensitive; upper-case the wanted id once.
            var wantedId = pdbId.ToUpperInvariant();

            var seqresEntries = Sequence.LoadSequenceFile(seqresPath, new string[] { null, "", "protein" });
            var seqresEntry   = seqresEntries.First(entry => entry.IdSplit.PdbId.ToUpperInvariant() == wantedId && entry.IdSplit.ChainId == chainId);

            var structureEntries = Sequence.LoadStructureFile(structurePath, new[] { chainId }, true, null, null, '-', '-');
            var structureEntry   = structureEntries.First(entry => entry.IdSplit.PdbId.ToUpperInvariant() == wantedId && entry.IdSplit.ChainId == chainId);

            var parsedChains = ProteinBioClass.PdbAtomicChains(structurePath, new char[] { chainId }).ChainList;
            var chainAtoms   = parsedChains.First(chain => chain.ChainId == chainId).AtomList;

            return StructureToSequenceAlignment.Align(chainAtoms, seqresEntry.FullSequence, structureEntry.FullSequence, first, last);
        }
        /// <summary>
        /// Command-line entry point. Reads a PDB file and an interface-interface file and keeps the ATOM
        /// records whose chain appears as a receptor chain in the interface data and whose residue number
        /// lies between that chain's smallest interface start and largest interface end.
        /// The kept records are written to a file, to one file per chain, or to the console.
        /// </summary>
        /// <param name="args">
        /// [0] PDB file; [1] interface-interface file; [2] optional chain ids separated by space or comma
        /// (a value containing '*' or an empty value selects all chains); [3] optional output file. When the
        /// output file name contains '?', the '?' is removed and one file per chain is written, with the chain
        /// id appended to the file name just before the extension. Without an output file the records go to
        /// the console. With no arguments at all the usage text is printed.
        /// </param>
        static void Main(string[] args)
        {
            var parameters = new string[, ]
            {
                { "[pdb_file]", "PDB ~v3.3 Protein Data Bank format file [*.pdb, *.ent]" },
                { "[interface-interface_file]", "interface-interface file" },
                { "[[chain_ids]]", "molecule chains to output [* for all]" },
                { "[[output_file]]", "optional output file. use ? for chain id. when ommitted, output to console" },
            };

            // Cast<string>() flattens the 2-D array row by row, so the even positions are the parameter names.
            var maxParamLength = parameters.Cast <string>().Where((a, i) => i % 2 == 0).Max(a => a.Length);
            var exeFilename    = Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);

            if (args.Length == 0)
            {
                Console.WriteLine(exeFilename + @" is a program to extract ATOM records from a PDB file.");
                Console.WriteLine();
                Console.WriteLine(@"Usage:");
                Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" " + String.Join(" ", parameters.Cast <string>().Where((a, i) => i % 2 == 0)), maxParamLength + 2, 1));
                Console.WriteLine();
                Console.WriteLine(@"Example:");
                // NOTE(review): the example passes "8.0" where the parameter list expects the
                // interface-interface file; it looks copied from another tool - confirm and correct.
                Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" ""c:\pdb_db\pdb1a12.pdb"" 8.0 ""c:\pdb_atoms\atoms1a12.pdb""", maxParamLength + 2, 1));
                Console.WriteLine();
                Console.WriteLine(@"Arguments:");
                for (var i = 0; i < parameters.GetLength(0); i++)
                {
                    Console.WriteLine(@" " + parameters[i, 0].PadLeft(maxParamLength, ' ') + " " + ProteinBioClass.WrapConsoleText(parameters[i, 1], maxParamLength + 2, 1, false));
                }
                Console.WriteLine();
                return;
            }

            // load and echo arguments
            var p           = 0;
            var pdbFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";

            pdbFilename = pdbFilename.Replace("\"", "");
            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + pdbFilename);

            p++;
            var interfaceInterfaceFile = args.Length > p && args[p].Length > 0 ? args[p].ToUpperInvariant() : "";

            interfaceInterfaceFile = interfaceInterfaceFile.Replace("\"", "");
            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + interfaceInterfaceFile);

            p++;
            var chainIds = args.Length > p && args[p].Length > 0 ? args[p] : "";

            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + chainIds);

            p++;
            var outputFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";

            outputFilename = outputFilename.Replace("\"", "");
            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + outputFilename);

            Console.WriteLine();

            // A missing or blank file name is reported the same way as a file that does not exist.
            if (string.IsNullOrWhiteSpace(pdbFilename) || !File.Exists(pdbFilename))
            {
                Console.WriteLine("; File not found: " + pdbFilename);
                return;
            }

            if (!File.Exists(interfaceInterfaceFile))
            {
                Console.WriteLine("; File not found: " + interfaceInterfaceFile);
                return;
            }

            // '*' means "no chain filter".
            if (chainIds.Contains('*'))
            {
                chainIds = null;
            }

            var chainIdWhiteList = !string.IsNullOrEmpty(chainIds) ? chainIds.ToUpperInvariant().Split(new char[] { ' ', ',' }, StringSplitOptions.RemoveEmptyEntries) : null;

            var interfaceData = ComplexInterfaces.ComplexInterfaces.InterfaceInterfaceData.Load(interfaceInterfaceFile);

            // Per receptor chain: the overall residue range spanned by all of its interfaces.
            // The three lists share the same index.
            var interfaceDataChains = interfaceData.Select(a => a.ReceptorChainId).Distinct().ToList();
            var interfaceDataStart  = interfaceDataChains.Select(a => interfaceData.Where(b => b.ReceptorChainId == a).Min(b => b.ReceptorInterfaceResSeqStart)).ToList();
            var interfaceDataEnd    = interfaceDataChains.Select(a => interfaceData.Where(b => b.ReceptorChainId == a).Max(b => b.ReceptorInterfaceResSeqEnd)).ToList();

            var terminatedChains = new HashSet <string>();
            var result           = new List <Tuple <string, string> >();

            foreach (var line in File.ReadAllLines(pdbFilename))
            {
                // The chain id is read from index 21; shorter lines cannot carry one.
                if (line.Length < 22)
                {
                    continue;
                }

                var chainId = line[21].ToString().ToUpperInvariant();

                if (line.Substring(0, 4).ToUpperInvariant() == "TER ")
                {
                    // Once a chain has been terminated, later ATOM records for it are ignored.
                    terminatedChains.Add(chainId);
                    continue;
                }

                if (line.Substring(0, 5).ToUpperInvariant() != "ATOM ")
                {
                    continue;
                }

                if (terminatedChains.Contains(chainId))
                {
                    continue;
                }

                if (chainIdWhiteList != null && chainIdWhiteList.Length > 0 && !chainIdWhiteList.Contains(chainId))
                {
                    continue;
                }

                // Single lookup: a negative index means the chain has no interface data.
                var interfaceDataChainIndex = interfaceDataChains.IndexOf(chainId[0]);
                if (interfaceDataChainIndex < 0)
                {
                    continue;
                }

                // The residue number is read from indexes 22-25. Truncated or non-numeric records are
                // skipped with a warning instead of aborting the whole run; invariant culture keeps a
                // leading '-' parseable regardless of the machine's regional settings.
                int resSeq;
                if (line.Length < 26 || !int.TryParse(line.Substring(22, 4).Trim(), System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out resSeq))
                {
                    Console.Error.WriteLine("; Skipping malformed ATOM record: " + line);
                    continue;
                }

                if (resSeq >= interfaceDataStart[interfaceDataChainIndex] && resSeq <= interfaceDataEnd[interfaceDataChainIndex])
                {
                    result.Add(new Tuple <string, string>(chainId, line));
                }
            }

            if (string.IsNullOrWhiteSpace(outputFilename))
            {
                foreach (var entry in result)
                {
                    // Print the record text itself, not the (chain, record) tuple.
                    Console.WriteLine(entry.Item2);
                }
                Console.WriteLine();
                return;
            }

            var outputTemplate  = outputFilename.Replace("?", "");
            var outputDirectory = Path.GetDirectoryName(outputTemplate);

            // A bare file name has no directory part; CreateDirectory rejects an empty path.
            if (!string.IsNullOrEmpty(outputDirectory))
            {
                Directory.CreateDirectory(outputDirectory);
            }

            if (!outputFilename.Contains("?"))
            {
                File.WriteAllLines(outputFilename, result.Select(a => a.Item2).ToList());
                return;
            }

            // One file per chain. GroupBy keeps chains in order of first appearance and records in file order.
            // NOTE(review): the chain id is appended before the extension rather than substituted at the
            // position of '?'; kept as-is - confirm this is the intended naming.
            foreach (var chainRecords in result.GroupBy(a => a.Item1))
            {
                var chainFileName = Path.GetFileNameWithoutExtension(outputTemplate) + chainRecords.Key + Path.GetExtension(outputTemplate);
                var chainFilePath = Path.Combine(outputDirectory ?? "", chainFileName);

                File.WriteAllLines(chainFilePath, chainRecords.Select(a => a.Item2).ToList());
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Command-line entry point. Loads the sequences of a structure file through
        /// <c>Sequence.LoadStructureFile</c>, renders them with <c>Sequence.GetAsFasta</c>, and writes the
        /// result to the console or to a fasta file (overwriting it or appending to it).
        /// </summary>
        /// <param name="args">
        /// [0] pdb or atoms file; [1] optional pad-missing switch, Y or N in either case (anything else is
        /// treated as Y); [2] optional output fasta file (console when omitted); [3] optional A (append) or
        /// O (overwrite) in either case, overwrite when omitted. With no arguments the usage text is printed.
        /// </param>
        public static void Main(string[] args)
        {
            var parameters = new string[, ]
            {
                { "[pdb_or_atoms_file]", "standard crystal pdb file or output from the ComplexAtoms program" },
                { "[[pad_missing]]", "Y or N (default: Y)" },
                { "[[output_fasta_file]]", "optional output fasta file.  when ommitted, output to console" },
                { "[[append_or_overwrite]]", "optional (A) append or (O) overwrite (default: overwrite)" },
            };

            // Cast<string>() flattens the 2-D array row by row, so the even positions are the parameter names.
            var maxParamLength = parameters.Cast <string>().Where((a, i) => i % 2 == 0).Max(a => a.Length);
            var exeFilename    = Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);

            if (args.Length < 1)
            {
                Console.WriteLine(exeFilename + @" is a program to extract the protein amino acid fasta sequence from protein structure pdb file.");
                Console.WriteLine();
                Console.WriteLine(@"Usage:");
                Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" " + String.Join(" ", parameters.Cast <string>().Where((a, i) => i % 2 == 0)), maxParamLength + 2, 1));
                Console.WriteLine();
                Console.WriteLine(@"Example:");
                Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" ""c:\pdb_db\atoms\atoms1a12.pdb"" ""c:\pdb_db\fasta_from_pdb\atoms1a12.pdb.fasta""", maxParamLength + 2, 1));
                Console.WriteLine();
                Console.WriteLine(@"Arguments:");
                for (var i = 0; i < parameters.GetLength(0); i++)
                {
                    Console.WriteLine(@" " + parameters[i, 0].PadLeft(maxParamLength, ' ') + " " + ProteinBioClass.WrapConsoleText(parameters[i, 1], maxParamLength + 2, 1, false));
                }
                Console.WriteLine();

                return;
            }

            // load arguments
            var p             = 0;
            var atomsFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";

            atomsFilename = atomsFilename.Replace("\"", "");
            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + atomsFilename);

            p++;
            var padMissing = args.Length > p && args[p].Length > 0 ? args[p] : "Y";

            padMissing = padMissing.Replace("\"", "");
            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + padMissing);

            // Accept lower-case input: previously "n" was not recognised and silently enabled padding.
            // Only an explicit N disables padding; every other value means Y.
            var padMissingBool = padMissing.ToUpperInvariant() != "N";

            p++;
            var outputFastaFilename = args.Length > p && args[p].Length > 0 ? args[p] : "";

            outputFastaFilename = outputFastaFilename.Replace("\"", "");
            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + outputFastaFilename);

            p++;
            var appendOrOverwrite = args.Length > p && args[p].Length > 0 ? args[p] : "";

            appendOrOverwrite = appendOrOverwrite.Replace("\"", "");
            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + appendOrOverwrite);

            // Accept lower-case input and say why the program stops on an unknown value.
            appendOrOverwrite = appendOrOverwrite.ToUpperInvariant();

            if (!(string.IsNullOrWhiteSpace(appendOrOverwrite) || appendOrOverwrite == "O" || appendOrOverwrite == "A"))
            {
                Console.WriteLine("; Invalid value for " + parameters[p, 0] + ": " + appendOrOverwrite + " (expected A or O)");
                return;
            }

            Console.WriteLine();

            var sequenceList = Sequence.LoadStructureFile(atomsFilename, null, padMissingBool);

            var output = Sequence.GetAsFasta(sequenceList);

            if (string.IsNullOrWhiteSpace(outputFastaFilename))
            {
                Console.WriteLine();

                Console.WriteLine(output);

                Console.WriteLine();

                return;
            }

            // A bare file name has no directory part; CreateDirectory rejects an empty path.
            var outputDirectory = Path.GetDirectoryName(outputFastaFilename);
            if (!string.IsNullOrEmpty(outputDirectory))
            {
                Directory.CreateDirectory(outputDirectory);
            }

            if (appendOrOverwrite == "A" && File.Exists(outputFastaFilename))
            {
                // Keep the existing content and make sure the new records start on their own line.
                var data = File.ReadAllText(outputFastaFilename);
                if (!data.EndsWith(Environment.NewLine))
                {
                    data = data + Environment.NewLine;
                }
                output = data + output;
            }

            File.WriteAllText(outputFastaFilename, output);
        }
Ejemplo n.º 6
0
        static void Main(string[] args)
        {
            // the indexes of data, contacts1 and contacts2 all match

            var data = MultiBindingInterface.LoadAuthorData(@"c:\multibinding\multibinding.csv", @"c:\multibinding\multibinding_homolog_clusters.csv");

            var contactsPartner1 =
                data.Select(
                    a =>
            {
                var x = ProteinBioClass.AtomPair.LoadAtomPairList(@"C:\multibinding\contacts\contacts_pdb" +
                                                                  a.InteractionChainsPdb1.ToUpperInvariant() + ".pdb")

                        .Where(
                    b =>
                    (b.Atom1.chainID.FieldValue.ToUpperInvariant()[0] ==
                     a.InteractionChainsPdb1Chain1
                     &&
                     b.Atom2.chainID.FieldValue.ToUpperInvariant()[0] ==
                     a.InteractionChainsPdb1Chain2)
                    ||
                    (b.Atom1.chainID.FieldValue.ToUpperInvariant()[0] ==
                     a.InteractionChainsPdb1Chain2
                     &&
                     b.Atom2.chainID.FieldValue.ToUpperInvariant()[0] ==
                     a.InteractionChainsPdb1Chain1)).ToList();

                x = x.Select(c =>
                {
                    if (c.Atom1.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb1Chain2)
                    {
                        c.SwapAtoms();
                    }

                    return(c);
                }).ToList();

                return(x);
            }).ToList();

            var contactsPartner2 =
                data.Select(
                    a =>
            {
                var x = ProteinBioClass.AtomPair.LoadAtomPairList(@"C:\multibinding\contacts\contacts_pdb" +
                                                                  a.InteractionChainsPdb2.ToUpperInvariant() + ".pdb")

                        .Where(
                    b =>
                    (b.Atom1.chainID.FieldValue.ToUpperInvariant()[0] ==
                     a.InteractionChainsPdb2Chain1
                     &&
                     b.Atom2.chainID.FieldValue.ToUpperInvariant()[0] ==
                     a.InteractionChainsPdb2Chain2)
                    ||
                    (b.Atom1.chainID.FieldValue.ToUpperInvariant()[0] ==
                     a.InteractionChainsPdb2Chain2
                     &&
                     b.Atom2.chainID.FieldValue.ToUpperInvariant()[0] ==
                     a.InteractionChainsPdb2Chain1)).ToList();

                x = x.Select(c =>
                {
                    if (c.Atom1.chainID.FieldValue.ToUpperInvariant()[0] == a.InteractionChainsPdb2Chain2)
                    {
                        c.SwapAtoms();
                    }

                    return(c);
                }).ToList();

                return(x);
            }).ToList();

            var interfacePartner1 = contactsPartner1.Select(a =>
            {
                var resSeqChain1 = a.Select(b => int.Parse(b.Atom1.resSeq.FieldValue)).ToList();
                var resSeqChain2 = a.Select(b => int.Parse(b.Atom2.resSeq.FieldValue)).ToList();

                if (resSeqChain1.Count > 0 && resSeqChain2.Count > 0)
                {
                    return(new Tuple <int, int, int, int>(resSeqChain1.Min(), resSeqChain1.Max(), resSeqChain2.Min(),
                                                          resSeqChain2.Max()));
                }
                else
                {
                    return(null);
                }
            }).ToList();


            var interfacePartner2 = contactsPartner2.Select(a =>
            {
                var resSeqChain1 = a.Select(b => int.Parse(b.Atom1.resSeq.FieldValue)).ToList();
                var resSeqChain2 = a.Select(b => int.Parse(b.Atom2.resSeq.FieldValue)).ToList();

                if (resSeqChain1.Count > 0 && resSeqChain2.Count > 0)
                {
                    return(new Tuple <int, int, int, int>(resSeqChain1.Min(), resSeqChain1.Max(), resSeqChain2.Min(),
                                                          resSeqChain2.Max()));
                }
                else
                {
                    return(null);
                }
            }).ToList();

            // var resultData = new List<MultiBindingInterface>();

            for (int index = 0; index < data.Count; index++)
            {
                var d   = data[index];
                var cp1 = contactsPartner1[index];
                var cp2 = contactsPartner2[index];
                var ip1 = interfacePartner1[index];
                var ip2 = interfacePartner2[index];

                if (d == null || cp1 == null || cp2 == null || ip1 == null || ip2 == null)
                {
                    continue;
                }
                if (cp1.Count == 0 || cp2.Count == 0)
                {
                    continue;
                }

                var p1c1_pdb = ProteinBioClass.PdbAtomicChains(@"c:\multibinding\pdb" + d.InteractionChainsPdb1 + ".pdb", new char[] { d.InteractionChainsPdb1Chain1 }, -1, -1, true);
                var p1c2_pdb = ProteinBioClass.PdbAtomicChains(@"c:\multibinding\pdb" + d.InteractionChainsPdb1 + ".pdb", new char[] { d.InteractionChainsPdb1Chain2 }, -1, -1, true);
                var p2c1_pdb = ProteinBioClass.PdbAtomicChains(@"c:\multibinding\pdb" + d.InteractionChainsPdb2 + ".pdb", new char[] { d.InteractionChainsPdb2Chain1 }, -1, -1, true);
                var p2c2_pdb = ProteinBioClass.PdbAtomicChains(@"c:\multibinding\pdb" + d.InteractionChainsPdb2 + ".pdb", new char[] { d.InteractionChainsPdb2Chain2 }, -1, -1, true);

                var p1c1_res_seq = p1c1_pdb.ChainList.First().AtomList.Select(a => int.Parse(a.resSeq.FieldValue)).ToList();
                var p1c2_res_seq = p1c2_pdb.ChainList.First().AtomList.Select(a => int.Parse(a.resSeq.FieldValue)).ToList();
                var p2c1_res_seq = p2c1_pdb.ChainList.First().AtomList.Select(a => int.Parse(a.resSeq.FieldValue)).ToList();
                var p2c2_res_seq = p2c2_pdb.ChainList.First().AtomList.Select(a => int.Parse(a.resSeq.FieldValue)).ToList();

                var cp1a1_res_seq = cp1.Select(a => int.Parse(a.Atom1.resSeq.FieldValue)).ToList();
                var cp1a2_res_seq = cp1.Select(a => int.Parse(a.Atom2.resSeq.FieldValue)).ToList();
                var cp2a1_res_seq = cp2.Select(a => int.Parse(a.Atom1.resSeq.FieldValue)).ToList();
                var cp2a2_res_seq = cp2.Select(a => int.Parse(a.Atom2.resSeq.FieldValue)).ToList();


                var cp1a1_min = cp1a1_res_seq.Min();
                var cp1a1_max = cp1a1_res_seq.Max();
                var cp1a2_min = cp1a2_res_seq.Min();
                var cp1a2_max = cp1a2_res_seq.Max();

                var cp2a1_min = cp2a1_res_seq.Min();
                var cp2a1_max = cp2a1_res_seq.Max();
                var cp2a2_min = cp2a2_res_seq.Min();
                var cp2a2_max = cp2a2_res_seq.Max();

                var cp1a1_best50_min           = int.MinValue;
                var cp1a1_best50_max           = int.MinValue;
                var cp1a1_best50_interactions  = int.MinValue;
                var cp1a1_best50_middle_finder = new List <Tuple <int, int, int> >();

                var interface_target_length = 50;

                for (var x = cp1a1_min - interface_target_length; x <= cp1a1_max; x++)
                {
                    if (Math.Abs(cp1a1_max - cp1a1_min) <= interface_target_length)
                    {
                        cp1a1_best50_min          = cp1a1_min;
                        cp1a1_best50_max          = cp1a1_max;
                        cp1a1_best50_interactions = cp1a1_res_seq.Count;
                        break;
                    }

                    var min = x;
                    var max = x + interface_target_length > cp1a1_max ? cp1a1_max : x + interface_target_length;

                    var best50 = cp1a1_res_seq.Count(a => a >= cp1a1_best50_min && a <= cp1a1_best50_max);

                    if (best50 == cp1a1_best50_interactions)
                    {
                        cp1a1_best50_middle_finder.Add(new Tuple <int, int, int>(min, max, best50));
                    }

                    if (cp1a1_best50_interactions == int.MinValue || best50 > cp1a1_best50_interactions)
                    {
                        cp1a1_best50_middle_finder.Clear();
                        cp1a1_best50_middle_finder.Add(new Tuple <int, int, int>(min, max, best50));
                        cp1a1_best50_min          = min;
                        cp1a1_best50_max          = max;
                        cp1a1_best50_interactions = best50;
                    }

                    if (x + interface_target_length >= cp1a1_max)
                    {
                        break;
                    }
                }

                if (cp1a1_best50_middle_finder.Count > 2)
                {
                    var middle = cp1a1_best50_middle_finder[cp1a1_best50_middle_finder.Count / 2];
                    cp1a1_best50_min          = middle.Item1;
                    cp1a1_best50_max          = middle.Item2;
                    cp1a1_best50_interactions = middle.Item3;
                }

                var cp1a2_best50_min           = int.MinValue;
                var cp1a2_best50_max           = int.MinValue;
                var cp1a2_best50_interactions  = int.MinValue;
                var cp1a2_best50_middle_finder = new List <Tuple <int, int, int> >();
                for (var x = cp1a2_min - interface_target_length; x <= cp1a2_max; x++)
                {
                    if (Math.Abs(cp1a2_max - cp1a2_min) <= interface_target_length)
                    {
                        cp1a2_best50_min          = cp1a2_min;
                        cp1a2_best50_max          = cp1a2_max;
                        cp1a2_best50_interactions = cp1a2_res_seq.Count;
                        break;
                    }

                    var min = x;
                    var max = x + interface_target_length > cp1a2_max ? cp1a2_max : x + interface_target_length;

                    var best50 = cp1a2_res_seq.Count(a => a >= cp1a2_best50_min && a <= cp1a2_best50_max);

                    if (best50 == cp1a2_best50_interactions)
                    {
                        cp1a2_best50_middle_finder.Add(new Tuple <int, int, int>(min, max, best50));
                    }

                    if (cp1a2_best50_interactions == int.MinValue || best50 > cp1a2_best50_interactions)
                    {
                        cp1a2_best50_middle_finder.Clear();
                        cp1a2_best50_middle_finder.Add(new Tuple <int, int, int>(min, max, best50));
                        cp1a2_best50_min          = min;
                        cp1a2_best50_max          = max;
                        cp1a2_best50_interactions = best50;
                    }

                    if (x + interface_target_length >= cp1a2_max)
                    {
                        break;
                    }
                }

                if (cp1a2_best50_middle_finder.Count > 2)
                {
                    var middle = cp1a2_best50_middle_finder[cp1a2_best50_middle_finder.Count / 2];
                    cp1a2_best50_min          = middle.Item1;
                    cp1a2_best50_max          = middle.Item2;
                    cp1a2_best50_interactions = middle.Item3;
                }


                var cp2a1_best50_min           = int.MinValue;
                var cp2a1_best50_max           = int.MinValue;
                var cp2a1_best50_interactions  = int.MinValue;
                var cp2a1_best50_middle_finder = new List <Tuple <int, int, int> >();
                for (var x = cp2a1_min - interface_target_length; x <= cp2a1_max; x++)
                {
                    if (Math.Abs(cp2a1_max - cp2a1_min) <= interface_target_length)
                    {
                        cp2a1_best50_min          = cp2a1_min;
                        cp2a1_best50_max          = cp2a1_max;
                        cp2a1_best50_interactions = cp2a1_res_seq.Count;
                        break;
                    }
                    var min = x;
                    var max = x + interface_target_length > cp2a1_max ? cp2a1_max : x + interface_target_length;

                    var best50 = cp2a1_res_seq.Count(a => a >= cp2a1_best50_min && a <= cp2a1_best50_max);

                    if (best50 == cp2a1_best50_interactions)
                    {
                        cp2a1_best50_middle_finder.Add(new Tuple <int, int, int>(min, max, best50));
                    }

                    if (cp2a1_best50_interactions == int.MinValue || best50 > cp2a1_best50_interactions)
                    {
                        cp2a1_best50_middle_finder.Clear();
                        cp2a1_best50_middle_finder.Add(new Tuple <int, int, int>(min, max, best50));
                        cp2a1_best50_min          = min;
                        cp2a1_best50_max          = max;
                        cp2a1_best50_interactions = best50;
                    }

                    if (x + interface_target_length >= cp2a1_max)
                    {
                        break;
                    }
                }

                if (cp2a1_best50_middle_finder.Count > 2)
                {
                    var middle = cp2a1_best50_middle_finder[cp2a1_best50_middle_finder.Count / 2];
                    cp2a1_best50_min          = middle.Item1;
                    cp2a1_best50_max          = middle.Item2;
                    cp2a1_best50_interactions = middle.Item3;
                }

                var cp2a2_best50_min           = int.MinValue;
                var cp2a2_best50_max           = int.MinValue;
                var cp2a2_best50_interactions  = int.MinValue;
                var cp2a2_best50_middle_finder = new List <Tuple <int, int, int> >();
                for (var x = cp2a2_min - interface_target_length; x <= cp2a2_max; x++)
                {
                    if (Math.Abs(cp2a2_max - cp2a2_min) <= interface_target_length)
                    {
                        cp2a2_best50_min          = cp2a2_min;
                        cp2a2_best50_max          = cp2a2_max;
                        cp2a2_best50_interactions = cp2a2_res_seq.Count;
                        break;
                    }
                    var min = x;
                    var max = x + interface_target_length > cp2a2_max ? cp2a2_max : x + interface_target_length;

                    var best50 = cp2a2_res_seq.Count(a => a >= cp2a2_best50_min && a <= cp2a2_best50_max);

                    if (best50 == cp2a2_best50_interactions)
                    {
                        cp2a2_best50_middle_finder.Add(new Tuple <int, int, int>(min, max, best50));
                    }

                    if (cp2a2_best50_interactions == int.MinValue || best50 > cp2a2_best50_interactions)
                    {
                        cp2a2_best50_middle_finder.Clear();
                        cp2a2_best50_middle_finder.Add(new Tuple <int, int, int>(min, max, best50));
                        cp2a2_best50_min          = min;
                        cp2a2_best50_max          = max;
                        cp2a2_best50_interactions = best50;
                    }

                    if (x + interface_target_length >= cp2a2_max)
                    {
                        break;
                    }
                }

                if (cp2a2_best50_middle_finder.Count > 2)
                {
                    var middle = cp2a2_best50_middle_finder[cp2a2_best50_middle_finder.Count / 2];
                    cp2a2_best50_min          = middle.Item1;
                    cp2a2_best50_max          = middle.Item2;
                    cp2a2_best50_interactions = middle.Item3;
                }

                var cp1a1_interface = string.Join("", p1c1_pdb.ChainList.First().AtomList.Where(a =>
                {
                    var l = int.Parse(a.resSeq.FieldValue);
                    return(l >= cp1a1_min && l <= cp1a1_max);
                }).OrderBy(c => int.Parse(c.resSeq.FieldValue)).Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

                var cp1a2_interface = string.Join("", p1c2_pdb.ChainList.First().AtomList.Where(a =>
                {
                    var l = int.Parse(a.resSeq.FieldValue);
                    return(l >= cp1a2_min && l <= cp1a2_max);
                }).OrderBy(c => int.Parse(c.resSeq.FieldValue)).Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

                var cp2a1_interface = string.Join("", p2c1_pdb.ChainList.First().AtomList.Where(a =>
                {
                    var l = int.Parse(a.resSeq.FieldValue);
                    return(l >= cp2a1_min && l <= cp2a1_max);
                }).OrderBy(c => int.Parse(c.resSeq.FieldValue)).Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

                var cp2a2_interface = string.Join("", p2c2_pdb.ChainList.First().AtomList.Where(a =>
                {
                    var l = int.Parse(a.resSeq.FieldValue);
                    return(l >= cp2a2_min && l <= cp2a2_max);
                }).OrderBy(c => int.Parse(c.resSeq.FieldValue)).Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

                var cp1a1_interface_interactions = new string('_', cp1a1_interface.Length);
                cp1a1_interface_interactions = string.Join("", cp1a1_interface_interactions.Select((a, i) => cp1a1_res_seq.Contains(i + cp1a1_min) ? "X" : "_").ToList());

                var cp1a2_interface_interactions = new string('_', cp1a2_interface.Length);
                cp1a2_interface_interactions = string.Join("", cp1a2_interface_interactions.Select((a, i) => cp1a2_res_seq.Contains(i + cp1a2_min) ? "X" : "_").ToList());

                var cp2a1_interface_interactions = new string('_', cp2a1_interface.Length);
                cp2a1_interface_interactions = string.Join("", cp2a1_interface_interactions.Select((a, i) => cp2a1_res_seq.Contains(i + cp2a1_min) ? "X" : "_").ToList());

                var cp2a2_interface_interactions = new string('_', cp2a2_interface.Length);
                cp2a2_interface_interactions = string.Join("", cp2a2_interface_interactions.Select((a, i) => cp2a2_res_seq.Contains(i + cp2a2_min) ? "X" : "_").ToList());

                var cp1a1_interactions = cp1a1_interface_interactions.Count(a => a == 'X');
                var cp1a2_interactions = cp1a2_interface_interactions.Count(a => a == 'X');
                var cp2a1_interactions = cp2a1_interface_interactions.Count(a => a == 'X');
                var cp2a2_interactions = cp2a2_interface_interactions.Count(a => a == 'X');

                var cp1a1_best50_interface = string.Join("", p1c1_pdb.ChainList.First().AtomList.Where(a =>
                {
                    var l = int.Parse(a.resSeq.FieldValue);
                    return(l >= cp1a1_best50_min && l <= cp1a1_best50_max);
                }).OrderBy(c => int.Parse(c.resSeq.FieldValue)).Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

                var cp1a2_best50_interface = string.Join("", p1c2_pdb.ChainList.First().AtomList.Where(a =>
                {
                    var l = int.Parse(a.resSeq.FieldValue);
                    return(l >= cp1a2_best50_min && l <= cp1a2_best50_max);
                }).OrderBy(c => int.Parse(c.resSeq.FieldValue)).Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

                var cp2a1_best50_interface = string.Join("", p2c1_pdb.ChainList.First().AtomList.Where(a =>
                {
                    var l = int.Parse(a.resSeq.FieldValue);
                    return(l >= cp2a1_best50_min && l <= cp2a1_best50_max);
                }).OrderBy(c => int.Parse(c.resSeq.FieldValue)).Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

                var cp2a2_best50_interface = string.Join("", p2c2_pdb.ChainList.First().AtomList.Where(a =>
                {
                    var l = int.Parse(a.resSeq.FieldValue);
                    return(l >= cp2a2_best50_min && l <= cp2a2_best50_max);
                }).OrderBy(c => int.Parse(c.resSeq.FieldValue)).Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

                var cp1a1_best50_interface_interactions = new string('_', cp1a1_best50_interface.Length);
                cp1a1_best50_interface_interactions = string.Join("", cp1a1_best50_interface_interactions.Select((a, i) => cp1a1_res_seq.Contains(i + cp1a1_best50_min) ? "X" : "_").ToList());

                var cp1a2_best50_interface_interactions = new string('_', cp1a2_best50_interface.Length);
                cp1a2_best50_interface_interactions = string.Join("", cp1a2_best50_interface_interactions.Select((a, i) => cp1a2_res_seq.Contains(i + cp1a2_best50_min) ? "X" : "_").ToList());

                var cp2a1_best50_interface_interactions = new string('_', cp2a1_best50_interface.Length);
                cp2a1_best50_interface_interactions = string.Join("", cp2a1_best50_interface_interactions.Select((a, i) => cp2a1_res_seq.Contains(i + cp2a1_best50_min) ? "X" : "_").ToList());

                var cp2a2_best50_interface_interactions = new string('_', cp2a2_best50_interface.Length);
                cp2a2_best50_interface_interactions = string.Join("", cp2a2_best50_interface_interactions.Select((a, i) => cp2a2_res_seq.Contains(i + cp2a2_best50_min) ? "X" : "_").ToList());

                d.Pdb1Chain1InterfaceStart    = ip1.Item1;
                d.Pdb1Chain1InterfaceEnd      = ip1.Item2;
                d.Pdb1Chain1TotalInteractions = cp1a1_interactions;
                d.Pdb1Chain1InterfaceSequence = cp1a1_interface;
                d.Pdb1Chain1InterfaceMask     = cp1a1_interface_interactions;

                d.Pdb1Chain1Best50InterfaceStart    = cp1a1_best50_min;
                d.Pdb1Chain1Best50InterfaceEnd      = cp1a1_best50_max;
                d.Pdb1Chain1Best50TotalInteractions = cp1a1_best50_interactions;
                d.Pdb1Chain1Best50InterfaceSequence = cp1a1_best50_interface;
                d.Pdb1Chain1Best50InterfaceMask     = cp1a1_best50_interface_interactions;

                d.Pdb1Chain2InterfaceStart    = ip1.Item3;
                d.Pdb1Chain2InterfaceEnd      = ip1.Item4;
                d.Pdb1Chain2TotalInteractions = cp1a2_interactions;
                d.Pdb1Chain2InterfaceSequence = cp1a2_interface;
                d.Pdb1Chain2InterfaceMask     = cp1a2_interface_interactions;

                d.Pdb1Chain2Best50InterfaceStart    = cp1a2_best50_min;
                d.Pdb1Chain2Best50InterfaceEnd      = cp1a2_best50_max;
                d.Pdb1Chain2Best50TotalInteractions = cp1a2_best50_interactions;
                d.Pdb1Chain2Best50InterfaceSequence = cp1a2_best50_interface;
                d.Pdb1Chain2Best50InterfaceMask     = cp1a2_best50_interface_interactions;

                d.Pdb2Chain1InterfaceStart    = ip2.Item1;
                d.Pdb2Chain1InterfaceEnd      = ip2.Item2;
                d.Pdb2Chain1TotalInteractions = cp2a1_interactions;
                d.Pdb2Chain1InterfaceSequence = cp2a1_interface;
                d.Pdb2Chain1InterfaceMask     = cp2a1_interface_interactions;

                d.Pdb2Chain1Best50InterfaceStart    = cp2a1_best50_min;
                d.Pdb2Chain1Best50InterfaceEnd      = cp2a1_best50_max;
                d.Pdb2Chain1Best50TotalInteractions = cp2a1_best50_interactions;
                d.Pdb2Chain1Best50InterfaceSequence = cp2a1_best50_interface;
                d.Pdb2Chain1Best50InterfaceMask     = cp2a1_best50_interface_interactions;

                d.Pdb2Chain2InterfaceStart    = ip2.Item3;
                d.Pdb2Chain2InterfaceEnd      = ip2.Item4;
                d.Pdb2Chain2TotalInteractions = cp2a2_interactions;
                d.Pdb2Chain2InterfaceSequence = cp2a2_interface;
                d.Pdb2Chain2InterfaceMask     = cp2a2_interface_interactions;

                d.Pdb2Chain2Best50InterfaceStart    = cp2a2_best50_min;
                d.Pdb2Chain2Best50InterfaceEnd      = cp2a2_best50_max;
                d.Pdb2Chain2Best50TotalInteractions = cp2a2_best50_interactions;
                d.Pdb2Chain2Best50InterfaceSequence = cp2a2_best50_interface;
                d.Pdb2Chain2Best50InterfaceMask     = cp2a2_best50_interface_interactions;
            }

            var output = data.Select(a => a.ToString()).ToList();

            output.Insert(0, MultiBindingInterface.Header());
            File.WriteAllLines(@"c:\multibinding\MultiBinding_parsed_results.csv", output);
            return;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Command-line entry point: reads the mutation request from <paramref name="args"/> and forwards it
        /// to MutateFastaSequenceSave.
        /// </summary>
        /// <remarks>
        /// Arguments, in order: input fasta file, chain ids, start positions, end positions, offsets,
        /// mutation sequences and an optional output fasta file. List-valued arguments are comma separated.
        /// When a required argument is missing, the usage text is printed and nothing else is done.
        /// Otherwise each argument is echoed to the console (prefixed with "; ") before the call is made.
        /// If one of the integer lists cannot be parsed, an error is written to standard error, the process
        /// exit code is set to 1 and the method returns without calling MutateFastaSequenceSave.
        /// </remarks>
        /// <param name="args">Raw command-line arguments.</param>
        static void Main(string[] args)
        {
            // Original author's note:
            // MutateSequence example.fasta start end mutation original (will find closest to start/end in case of sequence/structure index misalignment)

            // Column 0 is the argument name shown in the usage text, column 1 its description.
            var parameters = new string[,]
            {
                { "[input_fasta_file]", "fasta file with sequence to mutate" },
                { "[chain_ids]", "chain ids to mutate" },
                { "[start_positions]", "mutation start position (one based)" },
                { "[end_positions]", "mutation end position (one based)" },
                { "[offsets]", "offsets (for where pdb index doesn't match fasta sequence index) (one based)" },
                { "[mutation_sequence]", "new amino acids to overwrite with" },
                { "[[output_fasta_file]]", "optional output fasta file.  when ommitted, output to console" },
            };

            var parameterNames = Enumerable.Range(0, parameters.GetLength(0)).Select(row => parameters[row, 0]).ToList();
            var maxParamLength = parameterNames.Max(name => name.Length);

            // Names wrapped in double brackets ("[[...]]") are optional; every other argument must be supplied.
            var requiredParamCount = parameterNames.Count(name => !name.StartsWith("[[", StringComparison.Ordinal));

            // Process is IDisposable; only the executable's file name is needed from it.
            string exeFilename;
            using (var currentProcess = System.Diagnostics.Process.GetCurrentProcess())
            {
                exeFilename = Path.GetFileName(currentProcess.MainModule.FileName);
            }

            if (args.Length < requiredParamCount)
            {
                Console.WriteLine(exeFilename +
                                  @" is a program to mutate (substitute) a subsequence of a protein amino acid sequence within a fasta file.");
                Console.WriteLine();
                Console.WriteLine(@"Usage:");
                Console.WriteLine(
                    ProteinBioClass.WrapConsoleText(
                        exeFilename + @" " + String.Join(" ", parameterNames),
                        maxParamLength + 2, 1));
                Console.WriteLine();
                Console.WriteLine(@"Example:");
                // The example lists every required argument, including the offsets.
                // NOTE(review): 0,0,0 is assumed to mean "no index shift" - confirm against MutateFastaSequenceSave.
                Console.WriteLine(
                    ProteinBioClass.WrapConsoleText(
                        exeFilename +
                        @" ""c:\pdb_db\fasta\fasta_pdb1a12.pdb.fasta"" A,B,C 10,76,100 15,77,102 0,0,0 GBVBGA,AA,GHG ""c:\pdb_db\fasta_mutated\mutated_pdb1a12.pdb.fasta""",
                        maxParamLength + 2, 1));
                Console.WriteLine();
                Console.WriteLine(@"Arguments:");
                for (var row = 0; row < parameters.GetLength(0); row++)
                {
                    Console.WriteLine(@" " + parameters[row, 0].PadLeft(maxParamLength, ' ') + " " +
                                      ProteinBioClass.WrapConsoleText(parameters[row, 1], maxParamLength + 2, 1, false));
                }
                Console.WriteLine();
                return;
            }

            // Returns the argument at the given position ("" when absent) with double quotes removed,
            // and echoes it next to its parameter name.
            Func<int, string> readArgument = index =>
            {
                var value = (args.Length > index ? args[index] : "").Replace("\"", "");
                Console.WriteLine("; " + parameters[index, 0].PadLeft(maxParamLength, ' ') + " = " + value);
                return value;
            };

            // Parses a comma separated integer list. On the first invalid token it reports the problem,
            // flags a failing exit code and returns null.
            Func<int, string, int[]> parseIntList = (index, text) =>
            {
                var tokens = text.Split(',');
                var values = new int[tokens.Length];

                for (var t = 0; t < tokens.Length; t++)
                {
                    if (!int.TryParse(tokens[t], System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out values[t]))
                    {
                        Console.Error.WriteLine("Error: " + parameters[index, 0] + " must be a comma separated list of integers, but contains \"" + tokens[t] + "\".");
                        Environment.ExitCode = 1;
                        return null;
                    }
                }

                return values;
            };

            var input_fasta_file = readArgument(0);

            var chain_ids_split = readArgument(1).ToUpperInvariant().Split(',');

            var start_position_split = parseIntList(2, readArgument(2));
            if (start_position_split == null)
            {
                return;
            }

            var end_position_split = parseIntList(3, readArgument(3));
            if (end_position_split == null)
            {
                return;
            }

            var offset_position_split = parseIntList(4, readArgument(4));
            if (offset_position_split == null)
            {
                return;
            }

            var mutation_sequence_split = readArgument(5).Split(',');

            // Optional: stays "" when no output file was given.
            var output_fasta_file = readArgument(6);

            Console.WriteLine();

            MutateFastaSequenceSave(input_fasta_file, chain_ids_split, start_position_split, end_position_split,
                                    offset_position_split, mutation_sequence_split, output_fasta_file);
        }
        static void Main(string[] args)
        {
            //var requiredInterfaceLengths = new int[] { 7, 9, 11, 13, 15 };
            var requiredInterfaceLengths = new int[] { 9 };

            var sequenceFilename = @"C:\pdbe\pdb_seqres.fasta";
            var dsspFilename     = @"C:\pdbe\ss.txt";

            var sequenceListFromFastaFile = Sequence.LoadSequenceFile(sequenceFilename, new string[] { null, "", "protein" });
            var dsspList = Sequence.LoadSequenceFile(dsspFilename, new string[] { null, "", "protein" });

            dsspList = dsspList.Where(a => a.IdSplit.Description == "secstr").ToList();

            //var pdbIdList = new List<string>(); {"1a00"};
            var pdbFiles       = Directory.GetFiles(@"c:\pdbe\contacts_all\", "????.pdb", SearchOption.TopDirectoryOnly).ToList();
            var pdbFileLengths = pdbFiles.Select(a => new Tuple <string, long>(a, new FileInfo(a).Length)).ToList();

            pdbFileLengths = pdbFileLengths.Where(a => a.Item2 > 0).ToList();
            pdbFileLengths = pdbFileLengths.OrderBy(a => a.Item2).ToList();

            var ifList = InterfaceFragment.Load(@"c:\r\if.csv");

            ifList = ifList.Where(a => requiredInterfaceLengths.Contains(a.FragmentLength)).ToList();

            if (ifList.Count == 0)
            {
                var pdbIdList = pdbFileLengths.Select(a => a.Item1).Select(Path.GetFileNameWithoutExtension).Select(a => a.ToUpperInvariant()).ToList();
                //pdbIdList = pdbIdList.GetRange(0, 100);
                //var result = new List<InterfaceFragment>();

                var taskList1 = new List <Task <List <InterfaceFragment> > >();


                foreach (var pdbId1 in pdbIdList)
                {
                    var pdbId = pdbId1;

                    Console.WriteLine(pdbId + " " + (pdbIdList.IndexOf(pdbId) + 1) + "/" + pdbIdList.Count);
                    while (taskList1.Count(a => !a.IsCompleted) >= Environment.ProcessorCount)
                    {
                        Task.WaitAny(taskList1.Where(a => !a.IsCompleted).ToArray <Task>());
                    }

                    taskList1.Add(Task.Run(() =>
                    {
                        var pdbResult = new List <InterfaceFragment>();

                        var atomsFilename    = @"c:\pdbe\atoms_all\" + pdbId + ".pdb";
                        var contactsFilename = @"c:\pdbe\contacts_all\" + pdbId + ".pdb";
                        var interfaceCsv     = @"c:\r\interface_9\" + pdbId + @".csv";

                        if (File.Exists(interfaceCsv))
                        {
                            return(InterfaceFragment.Load(interfaceCsv));
                        }
                        if (!File.Exists(atomsFilename) || !File.Exists(contactsFilename))
                        {
                            return(null);                                                                                    // new List<InterfaceFragment>(); // continue;
                        }
                        if (new FileInfo(atomsFilename).Length == 0 || new FileInfo(contactsFilename).Length == 0)
                        {
                            return(null);                                                                                                            //new List<InterfaceFragment>(); // continue;
                        }
                        var contacts        = ProteinBioClass.AtomPair.LoadAtomPairList(contactsFilename);
                        var contactChainIds = contacts.SelectMany(a => new char[] { a.Atom1.chainID.FieldValue[0], a.Atom2.chainID.FieldValue[0] }).Distinct().ToList();

                        var proteinFileChains = ProteinBioClass.PdbAtomicChains(atomsFilename, contactChainIds.ToArray(), -1, -1, false);


                        var sequenceListFromPdbFile = proteinFileChains.ChainList.Select(pdbChain => Sequence.LoadStructureFile(atomsFilename, new[] { pdbChain.ChainId }, true, null, null, '-', '-')).ToList();

                        var dsspFromFastaFile     = proteinFileChains.ChainList.Select(pdbChain => dsspList.FirstOrDefault(a => a.IdSplit.PdbId.ToUpperInvariant() == pdbId.Substring(0, a.IdSplit.PdbId.Length).ToUpperInvariant() && a.IdSplit.ChainId == pdbChain.ChainId)?.FullSequence).ToList();
                        var sequenceFromFastaFile = proteinFileChains.ChainList.Select(pdbChain => sequenceListFromFastaFile.FirstOrDefault(a => a.IdSplit.PdbId.ToUpperInvariant() == pdbId.Substring(0, a.IdSplit.PdbId.Length).ToUpperInvariant() && a.IdSplit.ChainId == pdbChain.ChainId)?.FullSequence).ToList();
                        var sequenceFromPdbFile   = proteinFileChains.ChainList.Select((pdbChain, i) => sequenceListFromPdbFile[i].FirstOrDefault(a => (string.IsNullOrWhiteSpace(a.IdSplit.PdbId) || a.IdSplit.PdbId.ToUpperInvariant() == pdbId.Substring(0, a.IdSplit.PdbId.Length).ToUpperInvariant()) && a.IdSplit.ChainId == pdbChain.ChainId)?.FullSequence).ToList();
                        var structureToSequenceAlignmentResult = proteinFileChains.ChainList.Select((pdbChain, i) => ProteinBioClass.StructureToSequenceAlignment.Align(pdbChain.AtomList, sequenceFromFastaFile[i], sequenceFromPdbFile[i])).ToList();

                        if (structureToSequenceAlignmentResult.Any(a => a == null))
                        {
                            return(null);
                        }

                        for (var chainIndex3 = 0; chainIndex3 < proteinFileChains.ChainList.Count; chainIndex3++)
                        {
                            for (var i = 0; i < structureToSequenceAlignmentResult[chainIndex3].FastaSequenceAligned.Length; i++)
                            {
                                if (structureToSequenceAlignmentResult[chainIndex3].FastaSequenceAligned[i] == '-')
                                {
                                    dsspFromFastaFile[chainIndex3] = dsspFromFastaFile[chainIndex3].Insert(i, "-");
                                }
                            }
                        }

                        foreach (var contactChainId1 in contactChainIds)
                        {
                            var chainIndex1 = proteinFileChains.ChainList.FindIndex(g => g.ChainId == contactChainId1);
                            if (structureToSequenceAlignmentResult[chainIndex1] == null)
                            {
                                continue;
                            }

                            if (string.IsNullOrWhiteSpace(sequenceFromPdbFile[chainIndex1]) || string.IsNullOrWhiteSpace(sequenceFromFastaFile[chainIndex1]) || string.IsNullOrWhiteSpace(dsspFromFastaFile[chainIndex1]))
                            {
                                continue;
                            }

                            var chainResult = new List <InterfaceFragment>();
                            //var pdbChain = proteinFileChains.ChainList.First(a => a.ChainId == contactChainId).AtomList;

                            var chainContacts = contacts.Where(a => a.Atom1.chainID.FieldValue[0] == contactChainId1 || a.Atom2.chainID.FieldValue[0] == contactChainId1).Select(a => a.Atom1.chainID.FieldValue[0] == contactChainId1 ? a : a.SwapAtoms()).ToList();
                            //chainContacts = chainContacts.GroupBy(a => int.Parse(a.resSeq.FieldValue)).ToList().Select(a => a.First()).ToList();
                            //chainContacts = chainContacts.Distinct().ToList();
                            chainContacts = chainContacts.OrderBy(a => int.Parse(a.Atom1.resSeq.FieldValue)).ThenBy(a => int.Parse(a.Atom1.serial.FieldValue)).ToList();



                            foreach (var atomPair in chainContacts)
                            {
                                foreach (var requiredInterfaceLength in requiredInterfaceLengths)
                                {
                                    var contactChainId2 = atomPair.Atom2.chainID.FieldValue[0];
                                    var chainIndex2     = proteinFileChains.ChainList.FindIndex(g => g.ChainId == contactChainId2);
                                    if (structureToSequenceAlignmentResult[chainIndex2] == null)
                                    {
                                        continue;
                                    }

                                    if (string.IsNullOrWhiteSpace(sequenceFromPdbFile[chainIndex2]) || string.IsNullOrWhiteSpace(sequenceFromFastaFile[chainIndex2]) || string.IsNullOrWhiteSpace(dsspFromFastaFile[chainIndex2]))
                                    {
                                        continue;
                                    }

                                    var interfaceLength1 = requiredInterfaceLength;
                                    var interfaceLength2 = requiredInterfaceLength;

                                    var resSeq1 = int.Parse(atomPair.Atom1.resSeq.FieldValue);
                                    var resSeq2 = int.Parse(atomPair.Atom2.resSeq.FieldValue);

                                    var resSeqIndex1a = structureToSequenceAlignmentResult[chainIndex1].AlignmentMap.ToList().FindIndex(a => a == resSeq1);
                                    var resSeqIndex2a = structureToSequenceAlignmentResult[chainIndex2].AlignmentMap.ToList().FindIndex(a => a == resSeq2);

                                    if (structureToSequenceAlignmentResult[chainIndex1].FastaSequenceAligned.Length < interfaceLength1)
                                    {
                                        interfaceLength1 = structureToSequenceAlignmentResult[chainIndex1].FastaSequenceAligned.Length;
                                    }
                                    if (structureToSequenceAlignmentResult[chainIndex2].FastaSequenceAligned.Length < interfaceLength2)
                                    {
                                        interfaceLength2 = structureToSequenceAlignmentResult[chainIndex2].FastaSequenceAligned.Length;
                                    }

                                    var resSeqIndex1 = resSeqIndex1a - (interfaceLength1 / 2);
                                    var resSeqIndex2 = resSeqIndex2a - (interfaceLength2 / 2);

                                    if (resSeqIndex1 < 0)
                                    {
                                        resSeqIndex1 = 0;
                                    }
                                    if (resSeqIndex2 < 0)
                                    {
                                        resSeqIndex2 = 0;
                                    }

                                    if (resSeqIndex1 + interfaceLength1 > structureToSequenceAlignmentResult[chainIndex1].FastaSequenceAligned.Length)
                                    {
                                        resSeqIndex1 = (structureToSequenceAlignmentResult[chainIndex1].FastaSequenceAligned.Length - interfaceLength1);
                                    }
                                    if (resSeqIndex2 + interfaceLength2 > structureToSequenceAlignmentResult[chainIndex2].FastaSequenceAligned.Length)
                                    {
                                        resSeqIndex2 = (structureToSequenceAlignmentResult[chainIndex2].FastaSequenceAligned.Length - interfaceLength2);
                                    }


                                    var interfaceSuper1 = structureToSequenceAlignmentResult[chainIndex1].FastaSequenceAligned.Substring(resSeqIndex1, interfaceLength1);
                                    var interfaceSuper2 = structureToSequenceAlignmentResult[chainIndex2].FastaSequenceAligned.Substring(resSeqIndex2, interfaceLength2);

                                    var dsspSuper1 = dsspFromFastaFile[chainIndex1].Substring(resSeqIndex1, interfaceLength1);
                                    var dsspSuper2 = dsspFromFastaFile[chainIndex2].Substring(resSeqIndex2, interfaceLength2);

                                    if (interfaceSuper1.Length != requiredInterfaceLength || dsspSuper1.Length != requiredInterfaceLength)
                                    {
                                        continue;
                                    }
                                    if (interfaceSuper2.Length != requiredInterfaceLength || dsspSuper2.Length != requiredInterfaceLength)
                                    {
                                        continue;
                                    }

                                    var interfaceFragment = new InterfaceFragment()
                                    {
                                        FragmentLength             = requiredInterfaceLength,
                                        PdbId                      = pdbId,
                                        ReceptorChainId            = contactChainId1,
                                        ReceptorResSeq             = resSeq1,
                                        ReceptorIndex              = resSeqIndex1a,
                                        ReceptorAminoAcidSequence  = interfaceSuper1,
                                        ReceptorSecondaryStructure = dsspSuper1,
                                        LigandChainId              = contactChainId2,
                                        LigandResSeq               = resSeq2,
                                        LigandIndex                = resSeqIndex2a,
                                        LigandAminoAcidSequence    = interfaceSuper2,
                                        LigandSecondaryStructure   = dsspSuper2
                                    };


                                    chainResult.Add(interfaceFragment);
                                }
                            }
                            pdbResult.AddRange(chainResult);
                        }

                        InterfaceFragment.Save(interfaceCsv, pdbResult.Distinct().ToList());
                        return(null);                     // return pdbResult;
                    }));
                    //result.AddRange(pdbResult);
                }
                //try
                //{
                Task.WaitAll(taskList1.ToArray <Task>());
                //}
                //catch (AggregateException ae)
                //{
                //    throw ae.Flatten();
                //}

                ifList = taskList1.Where(a => a.Result != null).SelectMany(a => a.Result).ToList();

                ifList = ifList.Distinct().ToList();
                InterfaceFragment.Save(@"c:\r\if.csv", ifList);
            }

            var pairs1 = ifList.Select(a => new Tuple <string, string>(a.ReceptorAminoAcidSequence, a.ReceptorSecondaryStructure)).Distinct().ToList();

            sequenceFilename = null;
            dsspFilename     = null;
            sequenceListFromFastaFile.Clear(); sequenceListFromFastaFile = null;
            dsspList.Clear(); dsspList             = null;
            pdbFiles.Clear(); pdbFiles             = null;
            pdbFileLengths.Clear(); pdbFileLengths = null;
            //ifList.Clear(); ifList = null;

            var similarAA = new string[] { "LAGVIP", "DE", "ST", "RKH", "FYW", "NQ", "CM", "BJOUXZ", "-" };
            var simAaList = similarAA.SelectMany(a => a.ToList()).ToList();
            var simAaDict = new Dictionary <char, string>();



            foreach (var s in simAaList)
            {
                simAaDict.Add(s, similarAA.First(a => a.Contains(s)));
            }


            DateTime startTime = DateTime.Now;

            var simList = new List <Tuple <string, string, string, string, decimal, decimal, decimal> >(); // aa, ss, aa-sim, aa-evo-sim, ss-sim

            // compare without alignments, as aligments takes too long, also allow for insertion/deletions with index pos -1/+1.
            for (int i = 0; i < pairs1.Count; i++)
            {
                TimeSpan timeRemaining = TimeSpan.FromTicks(DateTime.Now.Subtract(startTime).Ticks *(pairs1.Count - (i + 1)) / (i + 1));
                Console.WriteLine((i + 1) + " / " + pairs1.Count + " " + timeRemaining.ToString("d'd 'h'h 'm'm 's's'"));
                var a = pairs1[i];
                for (int j = 0; j < pairs1.Count; j++)
                {
                    if (j <= i)
                    {
                        continue;
                    }

                    var b = pairs1[j];

                    if (a.Item1.Length != b.Item1.Length)
                    {
                        continue;
                    }

                    var scoreAa = 0;
                    var scoreSs = 0;

                    var scoreAaEvo = 0;

                    for (var x = 0; x < a.Item1.Length; x++)
                    {
                        if (a.Item1[x] == b.Item1[x])
                        {
                            scoreAa++;
                        }
                        else if (x > 0 && (a.Item1[x - 1] == b.Item1[x] || a.Item1[x] == b.Item1[x - 1]))
                        {
                            scoreAa++;
                        }
                        else if (x < a.Item1.Length - 1 && (a.Item1[x + 1] == b.Item1[x] || a.Item1[x] == b.Item1[x + 1]))
                        {
                            scoreAa++;
                        }

                        if (simAaDict[a.Item1[x]].Contains(b.Item1[x]))
                        {
                            scoreAaEvo++;
                        }
                        else if (x > 0 && (simAaDict[a.Item1[x - 1]].Contains(b.Item1[x]) || simAaDict[a.Item1[x]].Contains(b.Item1[x - 1])))
                        {
                            scoreAaEvo++;
                        }
                        else if (x < a.Item1.Length - 1 && (simAaDict[a.Item1[x + 1]].Contains(b.Item1[x]) || simAaDict[a.Item1[x]].Contains(b.Item1[x + 1])))
                        {
                            scoreAaEvo++;
                        }


                        if (a.Item2[x] == b.Item2[x])
                        {
                            scoreSs++;
                        }
                        else if (x > 0 && (a.Item2[x - 1] == b.Item2[x] || a.Item2[x] == b.Item2[x - 1]))
                        {
                            scoreSs++;
                        }
                        else if (x < a.Item2.Length - 1 && (a.Item2[x + 1] == b.Item2[x] || a.Item2[x] == b.Item2[x + 1]))
                        {
                            scoreSs++;
                        }
                    }

                    decimal scoreAaPct    = (decimal)scoreAa / (decimal)a.Item1.Length;
                    decimal scoreAaEvoPct = (decimal)scoreAaEvo / (decimal)a.Item1.Length;
                    decimal scoreSsPct    = (decimal)scoreSs / (decimal)a.Item2.Length;

                    simList.Add(new Tuple <string, string, string, string, decimal, decimal, decimal>(a.Item1, a.Item2, b.Item1, b.Item2, scoreAaPct, scoreAaEvoPct, scoreSsPct));
                    //simList.Add(new Tuple<string, string, string, string, decimal, decimal, decimal>(b.AminoAcidSequence, b.SecondaryStructure, a.AminoAcidSequence, a.SecondaryStructure, scoreAaPct, scoreAaEvoPct, scoreSsPct));
                }
            }

            //var if2 = simList.SelectMany(a => new List<string>() { string.Join(",", new string[] { a.Item1, a.Item2, a.Item3, a.Item4, "" + a.Item5, "" + a.Item6, "" + a.Item7 }) /*,
            //    string.Join(",", new string[] { a.Item3, a.Item4, a.Item1, a.Item2, "" + a.Item5, "" + a.Item6, "" + a.Item7 })*/ }).ToList();

            var taskList3 = new List <Task>();

            for (int i = 0; i < pairs1.Count; i++)
            {
                TimeSpan timeRemaining = TimeSpan.FromTicks(DateTime.Now.Subtract(startTime).Ticks *(pairs1.Count - (i + 1)) / (i + 1));
                Console.WriteLine((i + 1) + " / " + pairs1.Count + " " + timeRemaining.ToString("d'd 'h'h 'm'm 's's'"));

                while (taskList3.Count(a => !a.IsCompleted) >= Environment.ProcessorCount)
                {
                    Task.WaitAny(taskList3.Where(a => !a.IsCompleted).ToArray <Task>());
                }
                var i1 = i;
                var t  = Task.Run(() =>
                {
                    var a = pairs1[i1];

                    // find all instances of this pair with good simple alignment
                    var cluster = simList.Where(c => (c.Item1 == a.Item1 || c.Item3 == a.Item1) && (c.Item2 == a.Item2 || c.Item4 == a.Item2) && (c.Item5 >= 0.4m && c.Item6 >= 0.8m && c.Item7 >= 0.9m) && (c.Item5 < 1.0m /*&& c.Item6 < 1.0m && c.Item7 < 1.0m*/)).ToList();

                    if (cluster.Count < 5)
                    {
                        return;
                    }
                    var clusterPdbs = ifList.Where(c => cluster.Any(d => (d.Item1 == c.ReceptorAminoAcidSequence || d.Item3 == c.ReceptorAminoAcidSequence) && (d.Item2 == c.ReceptorSecondaryStructure || d.Item4 == c.ReceptorSecondaryStructure))).Select(e => new Tuple <string, char>(e.PdbId, e.ReceptorChainId)).ToList();
                    if (clusterPdbs.Count < 5)
                    {
                        return;
                    }
                    List <string> o = new List <string>();
                    o.Add("delete *");
                    o.AddRange(clusterPdbs.Select(e => @"load c:\pdbe\" + e.Item1 + ".pdb").ToList());
                    o.Add("hide all");
                    o.Add("show cartoon");
                    o.AddRange(clusterPdbs.Select((e, w) => w == 0 ? "" : @"super /" + e.Item1 + @"//" + e.Item2 + @", /" + clusterPdbs[0].Item1 + @"//" + clusterPdbs[0].Item2).ToList());


                    File.WriteAllLines(@"c:\r\cluster_" + (i1 + 1) + ".txt", o);
                });
                taskList3.Add(t);
            }
            Task.WaitAll(taskList3.ToArray <Task>());

            //File.WriteAllLines(@"c:\r\if2.csv", if2);
            Console.WriteLine("Calculating aa/ss sequence identities - part 1");

            var taskList2 = new List <Task <InterfaceFragmentData> >();
            //var interfaceFragmentLengths = new int[] { 15, 13, 11, 9, 7, 5, 3 };
            //var interfaceFragmentLengths = new int[] { requiredInterfaceLengths[0] };
            var interfaceFragmentLengths = new int[] { 11 };

            for (int index = 0; index < interfaceFragmentLengths.Length; index++)
            {
                var index1 = index;
                var interfaceFragmentLength = interfaceFragmentLengths[index1];

                taskList2.Add(Task.Run(() =>
                {
                    var pairs = ifList.Where(a => a.FragmentLength == interfaceFragmentLength).Select(a => new Tuple <string, string>(a.ReceptorAminoAcidSequence.Substring(index1, interfaceFragmentLength), a.ReceptorSecondaryStructure.Substring(index1, interfaceFragmentLength))).Distinct().ToList();

                    var aassList = new List <Tuple <string, string, string, string, decimal, decimal, decimal> >();

                    var aaSequenceList  = pairs.Select(a => a.Item1).Distinct().ToList();
                    var aaAlignmentList = Align(aaSequenceList);

                    var ssSequenceList  = pairs.Select(a => a.Item2).Distinct().ToList();
                    var ssAlignmentList = Align(ssSequenceList);


                    for (int i = 0; i < pairs.Count; i++)
                    {
                        Console.WriteLine((i + 1) + " / " + pairs.Count);

                        var pair1 = pairs[i];
                        for (int j = 0; j < pairs.Count; j++)
                        {
                            if (j < i)
                            {
                                continue;
                            }

                            var pair2 = pairs[j];

                            var aaSid    = aaAlignmentList.First(alignment => alignment.Item1 == pair1.Item1 && alignment.Item2 == pair2.Item1).Item3;
                            var ssSid    = ssAlignmentList.First(alignment => alignment.Item1 == pair1.Item2 && alignment.Item2 == pair2.Item2).Item3;
                            var weighted = (aaSid * 0.5m) + (ssSid * 0.5m);
                            aassList.Add(new Tuple <string, string, string, string, decimal, decimal, decimal>(pair1.Item1, pair1.Item2, pair2.Item1, pair2.Item2, aaSid, ssSid, weighted));
                            aassList.Add(new Tuple <string, string, string, string, decimal, decimal, decimal>(pair2.Item1, pair2.Item2, pair1.Item1, pair1.Item2, aaSid, ssSid, weighted));
                        }
                    }


                    var r      = new InterfaceFragmentData();
                    r.AaSsData = aassList;
                    r.AaData   = aaAlignmentList;
                    r.SsData   = ssAlignmentList;
                    return(r);
                }));
            }



            Task.WaitAll(taskList2.ToArray <Task>());

            Console.WriteLine("Calculating aa/ss sequence identities - part 2");

            var sids = new decimal[] { 1.0m, 0.9m, 0.8m, 0.7m, 0.6m, 0.5m, 0.4m, 0.3m, 0.2m, 0.1m, 0.0m };

            //aa count non-transivity clusters
            var aaData       = taskList2.SelectMany(a => a.Result.AaData).ToList();
            var aaDistinct   = aaData.Select(a => a.Item1).Distinct().ToList();
            var aaNeighbours = new List <Tuple <string, decimal, decimal> >();

            foreach (var aa in aaDistinct)
            {
                var subset = aaData.Where(a => a.Item1 == aa).ToList();
                foreach (var sid in sids)
                {
                    aaNeighbours.Add(new Tuple <string, decimal, decimal>(aa, sid, subset.Count(b => b.Item3 >= sid)));
                }
            }
            File.WriteAllLines(@"c:\r\aa.csv", aaData.Select(a => string.Join(",", new string[] { "" + a.Item1.Length, a.Item1, a.Item2, "" + a.Item3 })));
            File.WriteAllLines(@"c:\r\aa-clusters-1.csv", aaDistinct.Select(a => a + "," + string.Join(",", aaNeighbours.Where(d => d.Item1 == a).Select(d => "" + d.Item3).ToArray())));

            //ss count non-transivity clusters
            var ssData       = taskList2.SelectMany(a => a.Result.SsData).ToList();
            var ssDistinct   = ssData.Select(a => a.Item1).Distinct().ToList();
            var ssNeighbours = new List <Tuple <string, decimal, decimal> >();

            foreach (var ss in ssDistinct)
            {
                var subset = ssData.Where(a => a.Item1 == ss).ToList();
                foreach (var sid in sids)
                {
                    ssNeighbours.Add(new Tuple <string, decimal, decimal>(ss, sid, subset.Count(b => b.Item3 >= sid)));
                }
            }
            File.WriteAllLines(@"c:\r\ss.csv", ssData.Select(a => string.Join(",", new string[] { "" + a.Item1.Length, a.Item1, a.Item2, "" + a.Item3 })));
            File.WriteAllLines(@"c:\r\ss-clusters-1.csv", ssDistinct.Select(a => a + "," + string.Join(",", ssNeighbours.Where(d => d.Item1 == a).Select(d => "" + d.Item3).ToList())));

            //aa-ss count non-transivity clusters
            var aaSsData       = taskList2.SelectMany(a => a.Result.AaSsData).ToList();
            var aaSsDistinct   = aaSsData.Select(a => new Tuple <string, string>(a.Item1, a.Item2)).Distinct().ToList();
            var aaSsNeighbours = new List <Tuple <string, string, decimal, decimal> >();

            foreach (var aaSs in aaSsDistinct)
            {
                var subset = aaSsData.Where(a => a.Item1 == aaSs.Item1 && a.Item2 == aaSs.Item2).ToList();
                foreach (var sid in sids)
                {
                    aaSsNeighbours.Add(new Tuple <string, string, decimal, decimal>(aaSs.Item1, aaSs.Item2, sid, subset.Count(b => b.Item5 >= sid)));
                }
            }
            File.WriteAllLines(@"c:\r\aa-ss.csv", aaSsData.Select(a => string.Join(",", new string[] { "" + a.Item1.Length, a.Item1, a.Item2, a.Item3, a.Item4, "" + a.Item5, "" + a.Item6, "" + a.Item7 })));
            File.WriteAllLines(@"c:\r\aa-ss-clusters-1.csv", aaSsDistinct.Select(a => a.Item1 + "," + a.Item2 + "," + string.Join(",", aaSsNeighbours.Where(d => d.Item1 == a.Item1 && d.Item2 == a.Item2).Select(d => "" + d.Item4).ToList())));


            // cluster by transivity
            //var aaSsPairList = aaSsData.Select(a => new Tuple<string, string>(a.Item1, a.Item2)).Distinct().ToList();

            var aaSsPairClusters = aaSsData.Select(a => new Tuple <string, string>(a.Item1, a.Item2)).Distinct().Select(a => new List <Tuple <string, string> >()
            {
                a
            }).ToList();
            var aaClusters = aaSsData.Select(a => a.Item1).Distinct().Select(a => new List <string>()
            {
                a
            }).ToList();
            var ssClusters = aaSsData.Select(a => a.Item2).Distinct().Select(a => new List <string>()
            {
                a
            }).ToList();

            decimal minTransivitySid = 0.3m;

            foreach (var x in aaSsDistinct)
            {
                foreach (var y in aaSsDistinct)
                {
                    if (x == y)
                    {
                        continue;
                    }
                    if (x.Item1.Length != y.Item1.Length)
                    {
                        continue;                                  //items have not been sequence aligned if not the same length
                    }
                    var z = aaSsData.First(a => a.Item1 == x.Item1 && a.Item2 == x.Item2 && a.Item3 == y.Item1 && a.Item4 == y.Item2);

                    if (z.Item5 >= minTransivitySid)
                    {
                        var c1 = aaClusters.First(a => a.Any(b => b == x.Item1));
                        var c2 = aaClusters.First(a => a.Any(b => b == y.Item1));

                        if (c1 != c2)
                        {
                            c1.AddRange(c2);
                            aaClusters.Remove(c2);
                        }
                    }

                    if (z.Item6 >= minTransivitySid)
                    {
                        var c1 = ssClusters.First(a => a.Any(b => b == x.Item2));
                        var c2 = ssClusters.First(a => a.Any(b => b == y.Item2));

                        if (c1 != c2)
                        {
                            c1.AddRange(c2);
                            ssClusters.Remove(c2);
                        }
                    }

                    if (z.Item7 >= minTransivitySid)
                    {
                        var c1 = aaSsPairClusters.First(a => a.Any(b => b.Item1 == x.Item1 && b.Item2 == x.Item2));
                        var c2 = aaSsPairClusters.First(a => a.Any(b => b.Item1 == y.Item1 && b.Item2 == y.Item2));

                        if (c1 != c2)
                        {
                            c1.AddRange(c2);
                            aaSsPairClusters.Remove(c2);
                        }
                    }
                }
            }

            var aaSsPairClusters2 = aaSsPairClusters.SelectMany((a, i) => a.Select(b => new Tuple <int, string, string>(i + 1, b.Item1, b.Item2))).ToList();

            aaSsPairClusters2 = aaSsPairClusters2.OrderByDescending(a => aaSsPairClusters2.Count(b => b.Item1 == a.Item1)).ToList();
            File.WriteAllLines(@"c:\r\aa-ss-clusters-2.csv", aaSsPairClusters2.Select(a => string.Join(",", new string[] { "" + a.Item1, a.Item2, a.Item3 })));

            var aaClusters2 = aaClusters.SelectMany((a, i) => a.Select(b => new Tuple <int, string>(i + 1, b))).ToList();

            aaClusters2 = aaClusters2.OrderByDescending(a => aaClusters2.Count(b => b.Item1 == a.Item1)).ToList();
            File.WriteAllLines(@"c:\r\aa-clusters-2.csv", aaClusters2.Select(a => string.Join(",", new string[] { "" + a.Item1, a.Item2 })));

            var ssClusters2 = ssClusters.SelectMany((a, i) => a.Select(b => new Tuple <int, string>(i + 1, b))).ToList();

            ssClusters2 = ssClusters2.OrderByDescending(a => ssClusters2.Count(b => b.Item1 == a.Item1)).ToList();
            File.WriteAllLines(@"c:\r\ss-clusters-2.csv", ssClusters2.Select(a => string.Join(",", new string[] { "" + a.Item1, a.Item2 })));

            //File.WriteAllLines(@"c:\r\clusters-ss.csv", ssClusters2.Select(a => string.Join(",", new string[] { "" + a.Item1, a.Item2 })));
            // clusters by aa sid, ss sid, aa-ss sid
        }
        /// <summary>
        /// Clusters homologous protein chains and prints one CSV section per cluster to standard output.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Load every sequence from the sequence file and every homolog record from the
        ///    "homologs_?????.csv" files in the given folder.
        /// 2. Merge query/target chains into clusters whenever a record's AlignmentScore and
        ///    AlignmentScoreEvo both reach the supplied thresholds.
        /// 3. For every chain in every cluster, align it against the other chains of the same
        ///    cluster and record the minimum / maximum Score and ScoreEvo (-1 when nothing was compared).
        /// 4. Write a header plus one line per chain for each cluster to the console.
        /// </remarks>
        /// <param name="args">
        /// [0] folder holding the homolog csv files, [1] sequence (fasta) file,
        /// [2] minimum alignment score, [3] minimum evolutionary alignment score.
        /// The two thresholds are parsed with the invariant culture (decimal point is '.').
        /// </param>
        private static void Main(string[] args)
        {
            // this program will load the homolog list in csv format and for homologs of X sequence distance return a list of all partners
            // however, some partners may be duplicates, which cannot initially be removed, since they may bind differently in other instances
            // then, because of such cases, unique id to describe each protein must be created... this is slightly problematic because
            // close target homologs of proteins are also considered to be the same protein as the query protein
            // which means that they could exist for more than one query protein

            // FindHomologsCluster.exe c:\pdb\ds96ub_homologs\ c:\pdb\pdb_seqres.fasta 0.9 0.9 > ds96ub_homologs.csv

            // Diagnostics go to stderr because stdout is normally redirected into the result csv.
            if (args == null || args.Length < 4)
            {
                Console.Error.WriteLine("Usage: FindHomologsCluster.exe <homolog_csv_folder> <sequence_file> <min_similarity> <min_similarity_evo>");
                Environment.ExitCode = 1;
                return;
            }

            var homologCsvFolder = args[0];
            var sequenceFile     = args[1];

            // Parse culture-independently: with the current culture a comma-decimal locale would read "0.9" as 9.
            // NumberStyles.Float disallows group separators, so "0,9" is rejected instead of being misread.
            decimal minSimilarity;
            decimal minSimilarityEvo;
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;
            var thresholdStyle   = System.Globalization.NumberStyles.Float;

            if (!decimal.TryParse(args[2], thresholdStyle, invariantCulture, out minSimilarity) ||
                !decimal.TryParse(args[3], thresholdStyle, invariantCulture, out minSimilarityEvo))
            {
                Console.Error.WriteLine("Could not parse the similarity thresholds '" + args[2] + "' and '" + args[3] + "'; expected decimal numbers such as 0.9");
                Environment.ExitCode = 1;
                return;
            }

            var seqList = Sequence.LoadSequenceFile(sequenceFile, new[] { null, "", "protein" });

            var homologCsvFiles = Directory.GetFiles(homologCsvFolder, "homologs_?????.csv");

            var parsedData = FindHomologs.FindHomologs.HomologChain.Load(homologCsvFiles);

            // NOTE(review): this nulls the file-name entries right after loading. If Load were lazily
            // evaluated the names would be gone before they are read - confirm Load is eager.
            Array.Clear(homologCsvFiles, 0, homologCsvFiles.Length);

            // Each inner list is one cluster of (pdb id, chain id) entries.
            var homologsClustered = new List<List<Tuple<string, char>>>();

            foreach (var rec in parsedData)
            {
                if (rec.AlignmentScore >= minSimilarity && rec.AlignmentScoreEvo >= minSimilarityEvo)
                {
                    // First existing cluster that already contains the query chain / the target chain (pdb id compared case-insensitively).
                    var queryGroup = homologsClustered.FirstOrDefault(candidate => candidate.Any(member =>
                        string.Equals(member.Item1, rec.QueryPdbId, StringComparison.OrdinalIgnoreCase) && member.Item2 == rec.QueryChainId));

                    var targetGroup = homologsClustered.FirstOrDefault(candidate => candidate.Any(member =>
                        string.Equals(member.Item1, rec.TargetPdbId, StringComparison.OrdinalIgnoreCase) && member.Item2 == rec.TargetChainId));

                    var mergedGroup = new List<Tuple<string, char>>();

                    if (queryGroup != null)
                    {
                        mergedGroup.AddRange(queryGroup);
                        homologsClustered.Remove(queryGroup);
                    }
                    else
                    {
                        mergedGroup.Add(new Tuple<string, char>(rec.QueryPdbId, rec.QueryChainId));
                    }

                    if (targetGroup == null)
                    {
                        mergedGroup.Add(new Tuple<string, char>(rec.TargetPdbId, rec.TargetChainId));
                    }
                    else if (!ReferenceEquals(targetGroup, queryGroup))
                    {
                        // Query and target lived in different clusters: fold the second one in as well.
                        mergedGroup.AddRange(targetGroup);
                        homologsClustered.Remove(targetGroup);
                    }

                    mergedGroup = mergedGroup.Distinct().ToList(); // try without distinct?
                    mergedGroup = mergedGroup.OrderBy(member => member.Item1).ThenBy(member => member.Item2).ToList();

                    // The merged cluster always goes to the end of the list; cluster numbers in the output follow this order.
                    homologsClustered.Add(mergedGroup);
                }
            }

            var seqListIds = seqList.Select(sequence => new ProteinBioClass.SequenceId(sequence.Id)).ToList();

            // Build the lookups once instead of rescanning the whole sequence list for every chain and chain pair.
            // Keys use the upper-cased pdb id so that matching stays case-insensitive.
            Func<string, string> normalizePdbId = pdbId => (pdbId ?? string.Empty).ToUpperInvariant();

            var sequenceByChain   = new Dictionary<Tuple<string, char>, Sequence>();
            var chainCountByPdbId = new Dictionary<string, int>();

            for (var seqIndex = 0; seqIndex < seqList.Count; seqIndex++)
            {
                var pdbIdKey = normalizePdbId(seqListIds[seqIndex].PdbId);
                var chainKey = new Tuple<string, char>(pdbIdKey, seqListIds[seqIndex].ChainId);

                // Keep only the first sequence for a chain, matching a front-to-back linear search.
                if (!sequenceByChain.ContainsKey(chainKey))
                {
                    sequenceByChain.Add(chainKey, seqList[seqIndex]);
                }

                int chainsSoFar;
                chainCountByPdbId.TryGetValue(pdbIdKey, out chainsSoFar);
                chainCountByPdbId[pdbIdKey] = chainsSoFar + 1;
            }

            // Outer work division: clusters are split across tasks; each task returns its output lines.
            // The dictionaries above are only read from here on, so sharing them between tasks is safe.
            var wd2 = new WorkDivision(homologsClustered.Count);

            for (var thread2 = 0; thread2 < wd2.ThreadCount; thread2++)
            {
                var lti2 = thread2;

                wd2.TaskList.Add(Task.Run(() =>
                {
                    var clusterLines = new List<string>();

                    for (var index2 = wd2.ThreadFirstIndex[lti2]; index2 <= wd2.ThreadLastIndex[lti2]; index2++)
                    {
                        var members = homologsClustered[index2];

                        // Inner work division: the chains of this cluster are split across tasks.
                        var wd3 = new WorkDivision(members.Count);

                        for (var thread3 = 0; thread3 < wd3.ThreadCount; thread3++)
                        {
                            var lti3         = thread3;
                            var clusterIndex = index2; // copy: the loop variable must not be captured directly

                            wd3.TaskList.Add(Task.Run(() =>
                            {
                                var rows = new List<HomologClusterData>();

                                for (var index3 = wd3.ThreadFirstIndex[lti3]; index3 <= wd3.ThreadLastIndex[lti3]; index3++)
                                {
                                    var item = members[index3];

                                    Sequence s;
                                    if (!sequenceByChain.TryGetValue(new Tuple<string, char>(normalizePdbId(item.Item1), item.Item2), out s) || s == null)
                                    {
                                        throw new InvalidOperationException("sequence not found for " + item.Item1 + ":" + item.Item2);
                                    }

                                    // Number of chains in the sequence file that share this chain's pdb id.
                                    int complexChains;
                                    chainCountByPdbId.TryGetValue(normalizePdbId(item.Item1), out complexChains);

                                    // -1 means "no comparison made yet"; it is also what gets reported for single-chain clusters.
                                    var minAlignmentScore = -1m;
                                    var maxAlignmentScore = -1m;

                                    var minAlignmentScoreEvo = -1m;
                                    var maxAlignmentScoreEvo = -1m;

                                    foreach (var other in members)
                                    {
                                        if (ReferenceEquals(item, other))
                                        {
                                            continue;
                                        }

                                        // Cluster members without a known sequence are skipped rather than treated as an error.
                                        Sequence s2;
                                        if (!sequenceByChain.TryGetValue(new Tuple<string, char>(normalizePdbId(other.Item1), other.Item2), out s2) || s2 == null)
                                        {
                                            continue;
                                        }

                                        var alignmentScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(s,
                                                                                                                 s2,
                                                                                                                 ProteinBioClass.AlignmentType.NMW);

                                        if (alignmentScore.Score > maxAlignmentScore || maxAlignmentScore == -1m)
                                        {
                                            maxAlignmentScore = alignmentScore.Score;
                                        }
                                        if (alignmentScore.Score < minAlignmentScore || minAlignmentScore == -1m)
                                        {
                                            minAlignmentScore = alignmentScore.Score;
                                        }

                                        if (alignmentScore.ScoreEvo > maxAlignmentScoreEvo || maxAlignmentScoreEvo == -1m)
                                        {
                                            maxAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                        }
                                        if (alignmentScore.ScoreEvo < minAlignmentScoreEvo || minAlignmentScoreEvo == -1m)
                                        {
                                            minAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                        }
                                    }

                                    // Cluster and item numbers are reported 1-based.
                                    rows.Add(new HomologClusterData(clusterIndex + 1, index3 + 1, item.Item1, item.Item2, complexChains, Convert.ToInt32(s.Count()), minAlignmentScore, maxAlignmentScore, minAlignmentScoreEvo, maxAlignmentScoreEvo, s.FullSequence));
                                }

                                return(rows);
                            }));
                        }
                        wd3.WaitAllTasks();

                        clusterLines.Add("; Cluster # " + (index2 + 1) + " with " + wd3.ItemsToProcess + " protein chains");
                        clusterLines.Add("cluster index,item index,pdb id,chain id,complex chains,seq len,min clstr sid,max clstr sid,min evo clstr sid,max evo clstr sid,sequence");

                        foreach (var chainTask in wd3.TaskList)
                        {
                            //if (task.IsFaulted || task.IsCanceled) continue;
                            var typedChainTask = chainTask as Task<List<HomologClusterData>>;
                            if (typedChainTask == null || typedChainTask.Result == null)
                            {
                                continue;
                            }
                            clusterLines.AddRange(typedChainTask.Result.Select(row => row.ToString()));
                        }

                        // Blank line separates consecutive cluster sections.
                        clusterLines.Add("");
                    }

                    return(clusterLines);
                }));
            }
            wd2.WaitAllTasks();

            // Collect the per-task output in task order and print it.
            var outputLines = new List<string>();

            foreach (var clusterTask in wd2.TaskList)
            {
                //if (task.IsFaulted || task.IsCanceled) continue;
                var typedClusterTask = clusterTask as Task<List<string>>;
                if (typedClusterTask == null || typedClusterTask.Result == null)
                {
                    continue;
                }
                outputLines.AddRange(typedClusterTask.Result);
            }

            foreach (var line in outputLines)
            {
                Console.WriteLine(line);
            }
            // partners may have other interfaces, should those also be considered?
        }
        public static void Main(string[] args)
        {
            //var logResultsFolder = @"c:\r\r\";
            //var logResultsFolder = @"c:\r-some modelled\";
            //var logResultsFolder = @"c:\r\" ; //args[0];
            //var saveFile = args[1];

            //var logResultsFolder = @"c:\pdbe_split\models\" ; //args[0];
            //var logResultsFolder = @"C:\pdbe_split\manual\sw_1SBNI_2SICI_4GI3C\"; //args[0];
            //var logResultsFolder = @"C:\pdbe_split\manual\sw_1OYVI_1R0RI_1SBNI_1V5IB_2SICI_3BX1C_4GI3C_4LVNP\"; //args[0];

            var logResultsFolder = @"C:\pdbe_split\manual\sw_3BX1C\"; //args[0];
            //C:\pdbe_split\manual\sw_1H1VG_1KXPD_1RGIG_1T44G_3JBIV_4EAHA_4PKHB_5AFUb

            var seq = ProteinBioinformaticsSharedLibrary.Sequence.LoadSequenceFile(logResultsFolder + "sequences.fasta");
            var inf = ProteinBioinformaticsSharedLibrary.Sequence.LoadSequenceFile(logResultsFolder + "interfaces_fixed_length.fasta");

            foreach (var s1 in seq)
            {
                var r = new List <Tuple <string, ProteinBioClass.AlignmentScore> >();

                foreach (var s2 in seq)
                {
                    //if (s1==s2) continue;

                    var nmw = new NeedlemanWunsch(s1.FullSequence, s2.FullSequence);

                    var a = nmw.getAlignment();

                    ProteinBioClass.AlignmentScore s = ProteinBioClass.SequenceSimilarityPercentage(a[0], a[1], ProteinBioClass.AlignmentIdentityOption.MinimumSequenceLength);

                    //r.Add(s1.Id.Substring(1, 5) + " " + s2.Id.Substring(1, 5) + " " + s.Score + " " + s.ScoreEvo);
                    r.Add(new Tuple <string, ProteinBioClass.AlignmentScore>(s1.Id.Substring(1, 5) + "," + s2.Id.Substring(1, 5), s));
                }
                r = r.OrderByDescending(a => a.Item2.Score).ThenByDescending(a => a.Item2.ScoreEvo).ToList();
                var e = r.Select(a => a.Item1 + "," + string.Format("{0:0.00}", Math.Round(a.Item2.Score, 2)) + "," + string.Format("{0:0.00}", Math.Round(a.Item2.ScoreEvo, 2))).ToList();
                e.Insert(0, "Sequence Alignment");
                e.Insert(1, "ID1,ID2,Match%,Physicochemical%");

                e = e.Select(a => a.Replace(",", "\t")).ToList();
                File.WriteAllLines(logResultsFolder + "score_all_" + s1.Id.Substring(1, 5) + ".txt", e);
            }

            foreach (var s1 in inf)
            {
                var r = new List <Tuple <string, ProteinBioClass.AlignmentScore> >();

                foreach (var s2 in inf)
                {
                    //if (s1==s2) continue;

                    var nmw = new NeedlemanWunsch(s1.FullSequence, s2.FullSequence);

                    var a = nmw.getAlignment();

                    ProteinBioClass.AlignmentScore s = ProteinBioClass.SequenceSimilarityPercentage(a[0], a[1], ProteinBioClass.AlignmentIdentityOption.MinimumSequenceLength);

                    //r.Add(s1.Id.Substring(1, 5) + " " + s2.Id.Substring(1, 5) + " " + s.Score + " " + s.ScoreEvo);
                    r.Add(new Tuple <string, ProteinBioClass.AlignmentScore>(s1.Id.Substring(1, 5) + "," + s2.Id.Substring(1, 5), s));
                }
                r = r.OrderByDescending(a => a.Item2.Score).ThenByDescending(a => a.Item2.ScoreEvo).ToList();
                var e = r.Select(a => a.Item1 + "," + string.Format("{0:0.00}", Math.Round(a.Item2.Score, 2)) + "," + string.Format("{0:0.00}", Math.Round(a.Item2.ScoreEvo, 2))).ToList();
                e.Insert(0, "Interface Alignment");
                e.Insert(1, "ID1,ID2,Match%,Physicochemical%");
                e.Insert(0, "");

                e = e.Select(a => a.Replace(",", "\t")).ToList();
                File.AppendAllLines(logResultsFolder + "score_all_" + s1.Id.Substring(1, 5) + ".txt", e);
            }
            //return;

            //r-some modelled

            //var pdbFileNames = Directory.GetFiles(logResultsFolder, "*.pdb", SearchOption.AllDirectories).Select(a=>Path.GetFileName(a).ToLowerInvariant()).Distinct().ToList();

            var modellerLogFiles = Directory.GetFiles(logResultsFolder, "modeller_monomer_assessment.log", SearchOption.AllDirectories).ToList();
            //modellerLogFiles = modellerLogFiles.Where(a => a.StartsWith(logResultsFolder + @"sw2\")).ToList();
            //var dimerModellerLogFiles = Directory.GetFiles(logResultsFolder, "modeller_dimer_assessment.log", SearchOption.AllDirectories).ToList();

            //var pisaLogFiles = Directory.GetFiles(logResultsFolder, "pisa_monomer_assessment.log", SearchOption.AllDirectories).ToList();

            var data = new List <List <string> >();
            var nats = new List <List <string> >();

            var rowlen = 0;

            var scores = modellerLogFiles.SelectMany(m => ParseModellerLog(m)).ToList();

            foreach (var scoreGroup in scores.GroupBy(a =>
            {
                var structureFolderSplit = a.StructureFolder.Split('\\');
                // \                                       -4        \ -3  \  -2 \    -1           \
                // \sw_1OYVI_1R0RI_1SBNI_1V5IB_2SICI_3BX1C_4GI3C_4LVNP\1V5IB\1V5IB\all_0016_0026_1_1\
                return(structureFolderSplit[structureFolderSplit.Length - 1].Substring(0, 3) + '_' + structureFolderSplit[structureFolderSplit.Length - 3] + '_' + structureFolderSplit[structureFolderSplit.Length - 2]);
            }))
            {
                var group = scoreGroup.ToList();
                group = group.OrderBy(a => a.StructureFolder).ToList();

                var natives1 = group.Where(a => a.StructureFolder.Contains("_native")).ToList();



                foreach (var n in natives1)
                {
                    nats.Add(new List <string>()
                    {
                        "nat_" + scoreGroup.Key.Substring(4), n.ModellerDope
                    });
                }



                //data.Add(group.Select(a => a.ModellerDope).ToList());

                if (!scoreGroup.Key.StartsWith("nat"))
                {
                    // make index line
                    if (scoreGroup.Key.Substring(4, 5) == scoreGroup.Key.Substring(10, 5))
                    {
                        //data.Add(new List<string>());

                        data.Add(group.Select(a => a.StructureFolder.Split('\\').Last().Substring(4)).ToList());

                        rowlen = data[data.Count - 1].Count;


                        data[data.Count - 1].Insert(0, scoreGroup.Key + "_index");
                    }
                }


                data.Add(group.Select(a => a.ModellerDope).ToList());
                data[data.Count - 1].Insert(0, scoreGroup.Key + "_energy");
            }

            var output = new List <string>();
            var nats2  = nats.Select(a => string.Join(",", a)).Distinct().OrderBy(a => a[0]).ToList();

            //nats = nats.Distinct().OrderBy(a => a[0]).ToList();

            foreach (var g in data.GroupBy(a => a[0].Substring(0, 3 + 1 + 5)))
            {
                var gi = g.ToList();

                var index  = gi.First(a => a[0].Contains("_index"));
                var len    = index.Count - 1;
                var main   = gi.First(a => a != index && a[0].Substring(4, 5) == a[0].Substring(10, 5));
                var others = gi.Where(a => a != index && a != main).OrderBy(a => a[0]).ToList();

                var natives = nats2.Where(a => a.Substring(4, 5) == index[0].Substring(4, 5)).OrderBy(a => a[0]).ToList();
                natives = natives.Select(a =>
                {
                    var b = a.Split(',');
                    var r = b[0];
                    for (var j = 0; j < len; j++)
                    {
                        r = r + ',' + string.Join(",", b.Skip(1).ToList());
                    }
                    return(r);
                }).ToList();

                var nativemain = natives.First(a => a.Substring(4, 5) == a.Substring(10, 5));
                natives.Remove(nativemain);

                output.Add(string.Join(",", index));
                output.Add(string.Join(",", main));
                others.ForEach(a => output.Add(string.Join(",", a)));
                output.Add(string.Join(",", nativemain));
                natives.ForEach(a => output.Add(string.Join(",", a)));
                output.Add("");
            }

            //var output = data.Select(a => string.Join(",", a))
            //    .Distinct()
            //    .OrderByDescending(a => a.Substring(4, 5))
            //    .ThenBy(a => a.Substring(0, 3))
            //    .ThenByDescending(a => a.Substring(4, 5) == a.Substring(10, 5))
            //    .ThenByDescending(a => a.Contains("_index"))
            //    .ToList();


            //for (var j = output.Count - 1; j >= 0; j--)
            //{
            //    if (output[j].Contains("_index"))
            //        output.Insert(j, "");
            //}
            File.WriteAllLines(logResultsFolder + Environment.MachineName + "_energy.csv", output);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Command-line entry point: extracts ATOM records from a PDB file, optionally filtered by
        /// atom subset, chain id and residue range, and writes them to a file or to the console.
        /// </summary>
        /// <param name="args">
        /// [0] PDB file path; [1] subset ("-" = every atom, "ca", "mc", "sc"; any other value selects
        /// nothing); [2] chain ids ("-" for all, "ABC", or "A,1,50,B,2,40"); [3] optional output file,
        /// where a single "?" requests one file per chain and "??" appends all chain ids to the name.
        /// When no arguments are given, usage help is printed instead.
        /// </param>
        static void Main(string[] args)
        {
            // Zero-based offsets into a fixed-width ATOM record.
            const int atom_chain      = 21;
            const int atom_name       = 13; // the code reads the last three columns of the atom-name field
            const int atom_name_len   = 3;
            const int atom_resseq     = 22;
            const int atom_resseq_len = 4;

            var parameters = new string[, ]
            {
                { "[pdb_file]", "PDB ~v3.3 Protein Data Bank format file [*.pdb, *.ent]" },
                { "[[subset]]", "-, mc, sc, ca" },
                { "[[chain_ids]]", "molecule chains to output [2 formats: - for all, ABC, or A,1,50,B,2,40,C,5,200]" },
                { "[[output_file]]", "optional output file. use ? for chain id. when ommitted, output to console" },
            };

            // Cast<string>() flattens the 2D array row by row, so even positions are the parameter names.
            var maxParamLength = parameters.Cast<string>().Where((a, i) => i % 2 == 0).Max(a => a.Length);
            var exeFilename    = Path.GetFileName(System.Diagnostics.Process.GetCurrentProcess().MainModule.FileName);

            if (args.Length == 0)
            {
                Console.WriteLine(exeFilename + @" is a program to extract ATOM records from a PDB file.");
                Console.WriteLine();
                Console.WriteLine(@"Usage:");
                Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" " + String.Join(" ", parameters.Cast<string>().Where((a, i) => i % 2 == 0)), maxParamLength + 2, 1));
                Console.WriteLine();
                Console.WriteLine(@"Example:");
                // The example now matches the declared parameter order (file, subset, chains, output).
                Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" ""c:\pdb_db\pdb1a12.pdb"" ca A ""c:\pdb_atoms\atoms1a12.pdb""", maxParamLength + 2, 1));
                Console.WriteLine();
                Console.WriteLine(@"Arguments:");
                for (var i = 0; i < parameters.GetLength(0); i++)
                {
                    Console.WriteLine(@" " + parameters[i, 0].PadLeft(maxParamLength, ' ') + " " + ProteinBioClass.WrapConsoleText(parameters[i, 1], maxParamLength + 2, 1, false));
                }
                Console.WriteLine();
                return;
            }

            // Load and echo the arguments; a missing or empty argument becomes "".
            Func<int, string> argumentOrEmpty = index => args.Length > index && args[index].Length > 0 ? args[index] : "";
            Action<int, string> echo = (index, value) => Console.WriteLine("; " + parameters[index, 0].PadLeft(maxParamLength, ' ') + " = " + value);

            var pdbFilename = argumentOrEmpty(0).Replace("\"", "");
            echo(0, pdbFilename);

            var subset = argumentOrEmpty(1).ToUpperInvariant();
            echo(1, subset);

            var chainIds = argumentOrEmpty(2);
            echo(2, chainIds);

            var outputFilename = argumentOrEmpty(3).Replace("\"", "");
            echo(3, outputFilename);

            Console.WriteLine();

            if (string.IsNullOrWhiteSpace(pdbFilename))
            {
                Console.Error.WriteLine("Error: no PDB file specified.");
                return;
            }

            if (!File.Exists(pdbFilename))
            {
                Console.Error.WriteLine("Error: PDB file not found: " + pdbFilename);
                return;
            }

            // Only a lone "-" means "all chains". Testing for any '-' character would also match a
            // negative residue bound such as "A,-5,50" and silently discard the requested filter.
            if (chainIds.Trim() == "-")
            {
                chainIds = null;
            }

            var chainStartEnd = new List<Tuple<char, int, int>>();
            var chainIdsSplit = chainIds?.Split(',').ToList();

            char[] chainIdWhiteList;

            if (chainIdsSplit?.Count > 1)
            {
                // Comma-separated form: repeated (chain id, first residue, last residue) triples.
                if (chainIdsSplit.Count % 3 != 0)
                {
                    Console.Error.WriteLine("Error: chain ranges must be given as id,start,end triples: " + chainIds);
                    return;
                }

                for (var i = 0; i < chainIdsSplit.Count; i += 3)
                {
                    var idField = chainIdsSplit[i];
                    int start;
                    int end;

                    if (idField.Length == 0 ||
                        !TryParseResidueBound(chainIdsSplit[i + 1], out start) ||
                        !TryParseResidueBound(chainIdsSplit[i + 2], out end))
                    {
                        Console.Error.WriteLine("Error: invalid chain range: " + idField + "," + chainIdsSplit[i + 1] + "," + chainIdsSplit[i + 2]);
                        return;
                    }

                    chainStartEnd.Add(Tuple.Create(idField[0], start, end));
                }

                chainIdWhiteList = chainStartEnd.Select(a => a.Item1).Distinct().ToArray();
            }
            else
            {
                // Compact form ("ABC"): every letter is a chain id. Null/empty means no chain filter.
                chainIdWhiteList = chainIds?.Where(char.IsLetter).Distinct().ToArray();
            }

            var terminatedChains = new HashSet<char>();
            var result           = new List<Tuple<char, string>>();

            string[] ca = new string[] { "CA" };
            string[] bb = new[] { "N", "CA", "C", "O" };

            foreach (var line in File.ReadAllLines(pdbFilename))
            {
                // Too short to carry a chain id.
                if (line.Length <= atom_chain)
                {
                    continue;
                }

                if (line.StartsWith("TER ", StringComparison.OrdinalIgnoreCase))
                {
                    // Once a chain is terminated, later ATOM records with the same chain id are ignored.
                    terminatedChains.Add(line[atom_chain]);
                }

                if (!line.StartsWith("ATOM ", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                var chainId = line[atom_chain];

                if (terminatedChains.Contains(chainId))
                {
                    continue;
                }

                if (chainIdWhiteList != null && chainIdWhiteList.Length > 0 && !chainIdWhiteList.Contains(chainId))
                {
                    continue;
                }

                var atom_name_s = line.Substring(atom_name, atom_name_len).Trim();

                // Keep only atoms whose name starts with C, N or O; a blank name cannot qualify.
                if (atom_name_s.Length == 0 || (atom_name_s[0] != 'C' && atom_name_s[0] != 'N' && atom_name_s[0] != 'O'))
                {
                    continue;
                }

                // The residue number is only needed when a range was requested for this chain.
                var chainStartEndItem = chainStartEnd.FirstOrDefault(a => a.Item1 == chainId);

                if (chainStartEndItem != null)
                {
                    int resId;

                    // A truncated or non-numeric residue field cannot be placed inside the range.
                    if (line.Length < atom_resseq + atom_resseq_len ||
                        !int.TryParse(line.Substring(atom_resseq, atom_resseq_len), System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out resId))
                    {
                        continue;
                    }

                    // A bound of -1 means "unbounded" on that side.
                    var afterStart = chainStartEndItem.Item2 == -1 || resId >= chainStartEndItem.Item2;
                    var beforeEnd  = chainStartEndItem.Item3 == -1 || resId <= chainStartEndItem.Item3;

                    if (!afterStart || !beforeEnd)
                    {
                        continue;
                    }
                }

                bool add;

                switch (subset)
                {
                    case "-":
                        add = true;
                        break;
                    case "CA":
                        add = ca.Contains(atom_name_s);
                        break;
                    case "MC":
                        add = bb.Contains(atom_name_s);
                        break;
                    case "SC":
                        add = !bb.Contains(atom_name_s);
                        break;
                    default:
                        add = false;
                        break;
                }

                if (add)
                {
                    result.Add(new Tuple<char, string>(chainId, line));
                }
            }

            if (string.IsNullOrWhiteSpace(outputFilename))
            {
                // Print the record text itself, not the (chain, line) tuple.
                foreach (var item in result)
                {
                    Console.WriteLine(item.Item2);
                }
                Console.WriteLine();
                return;
            }

            // "?" is only a placeholder; the real path is built from the name without it.
            var outputBase = outputFilename.Replace("?", "");

            if (string.IsNullOrWhiteSpace(outputBase))
            {
                Console.Error.WriteLine("Error: output file name is empty after removing '?' placeholders.");
                return;
            }

            // A bare file name has no directory part; creating "" would throw.
            var outputFolder = Path.GetDirectoryName(outputBase);
            if (!string.IsNullOrEmpty(outputFolder))
            {
                Directory.CreateDirectory(outputFolder);
            }

            if (!outputFilename.Contains("?"))
            {
                File.WriteAllLines(outputFilename, result.Select(a => a.Item2).ToList());
            }
            else if (outputFilename.Contains("??"))
            {
                // One file whose name carries every (letter) chain id found, in sorted order.
                var chains = new string(result.Select(a => a.Item1).Where(char.IsLetter).Distinct().OrderBy(a => a).ToArray());

                File.WriteAllLines(AppendToFileName(outputBase, chains), result.Select(a => a.Item2).ToList());
            }
            else
            {
                // One file per chain, with the chain id appended to the file name.
                foreach (var chain in result.Select(a => a.Item1).Distinct().ToList())
                {
                    File.WriteAllLines(AppendToFileName(outputBase, chain.ToString()), result.Where(a => a.Item1 == chain).Select(a => a.Item2).ToList());
                }
            }
        }

        /// <summary>
        /// Parses one residue bound of a chain range; a blank field yields -1 (unbounded).
        /// </summary>
        /// <returns>False when the field is neither blank nor a valid integer.</returns>
        private static bool TryParseResidueBound(string text, out int bound)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                bound = -1;
                return true;
            }

            return int.TryParse(text, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out bound);
        }

        /// <summary>
        /// Inserts <paramref name="suffix"/> between the file name and its extension, keeping the directory.
        /// </summary>
        private static string AppendToFileName(string path, string suffix)
        {
            // GetDirectoryName may return null (root) or "" (no directory); Path.Combine accepts "".
            var folder = Path.GetDirectoryName(path) ?? "";

            return Path.Combine(folder, Path.GetFileNameWithoutExtension(path) + suffix + Path.GetExtension(path));
        }
        static void Main(string[] args)
        {
            //var crystals = new List<Tuple<string, char, char>>();
            //crystals.Add(new Tuple<string, char, char>("2SIC", 'E', 'I'));
            //crystals.Add(new Tuple<string, char, char>("3BX1", 'A', 'C'));
            //crystals.Add(new Tuple<string, char, char>("1RGI", 'A', 'G'));


            //MakeCrystalTemplates(crystals);

            //return;
            // uncomment one of the options below or add a new one

            //var pdbSumInterfaceIdFirstLastList = @"
            //3JBIV 990 1001
            //1T44G 95 110
            //4PKHB 114 137
            //5AFUb 52 60
            //1RGIAG 95 110
            //1H1VG 473 488
            //4EAHA 641 653
            //1KXPD 196 210
            //1M8QA 529 544

            //3JBIAV 990 1001
            //1T44AG 95 110
            //4PKHAB 114 137
            //1RGIAG 95 110
            //1H1VAG 473 488
            //4EAHHA 641 653
            //1KXPAD 196 210

            //var pdbSumInterfaceIdFirstLastList = @"
            //4GI3AC 22 29
            //2SICEI 65 74
            //1SBNEI 35 53
            //4LVNAP 207 216
            //1OYVBI 54 64
            //1V5IAB 68 76
            //1R0REI 11 20
            //3BX1AC 84 94
            //";


            // receptor ligand lig-inf-start lig-inf-end

            /*
             * var ligandSiblingInterfaceList = @"
             * 1RGIAG 95 110
             * 3JBIAV 990 1001
             * 4PKHAB 114 137
             * 1H1VAG 473 488
             * 4EAHHA 641 653
             * 1KXPAD 196 210
             * ";
             */

            var ligandSiblingInterfaceList = @"
            4GI3AC 22 29
            2SICEI 65 74
            1SBNEI 35 53
            4LVNAP 207 216
            1OYVBI 54 64
            1V5IAB 68 76
            1R0REI 11 20
            3BX1AC 84 94
            ";


            /*
             * 2SICEI 65 74
             * 3BX1AC 84 94
             * 1RGIAG 95 110
             */

            var recLigInfoList = ligandSiblingInterfaceList.Trim().Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
                                 .Select(a =>
            {
                var b = a.Trim().Split(new char[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);

                var pdbId = b[0].Substring(0, 4);

                var recChainId           = b[0][4];
                var recInfStart          = -1;
                var recInfEnd            = -1;
                var recSequenceAlignment = GetSequence(pdbId, recChainId);
                var recSequence          = recSequenceAlignment.SuperSequence;
                var recInfSequence       = "";
                var recInfPosInRecSeq    = -1;


                var ligChainId              = b[0][5];
                var ligInfStart             = int.Parse(b[1]);
                var ligInfEnd               = int.Parse(b[2]);
                var ligSequenceAlignment    = GetSequence(pdbId, ligChainId);
                var ligSequence             = ligSequenceAlignment.SuperSequence;
                var ligInfSequenceAlignment = GetSequence(pdbId, ligChainId, ligInfStart, ligInfEnd);



                var ligInfSequence    = ligInfSequenceAlignment.SuperSequence;
                var ligInfPosInLigSeq = ligSequence.IndexOf(ligInfSequence);

                return(new RecLigInfo()
                {
                    PdbId = pdbId,
                    RecChainId = recChainId,
                    RecInfStart = recInfStart,
                    RecInfEnd = recInfEnd,
                    RecSequence = recSequence,
                    RecInfSequence = recInfSequence,
                    RecInfPosInRecSeq = recInfPosInRecSeq,
                    RecSequenceAlignment = recSequenceAlignment,
                    RecInfSequenceAlignment = null,

                    LigChainId = ligChainId,
                    LigInfStart = ligInfStart,
                    LigInfEnd = ligInfEnd,
                    LigSequence = ligSequence,
                    LigInfSequence = ligInfSequence,
                    LigInfPosInLigSeq = ligInfPosInLigSeq,
                    LigSequenceAlignment = ligSequenceAlignment,
                    LigInfSequenceAlignment = ligInfSequenceAlignment,
                });
            }).ToList();

            var allSequenceIds = recLigInfoList.Select(a => a.PdbId + a.RecChainId + a.LigChainId).Distinct().OrderBy(a => a).ToList();
            var rootFolder     = @"C:\pdbe_split_4\sw_" + string.Join("_", allSequenceIds) + @"\";

            Directory.CreateDirectory(rootFolder);
            //var rootFolderSubDirs = Directory.GetDirectories(rootFolder, "*", SearchOption.AllDirectories).ToList();
            //rootFolderSubDirs.Remove(Path.GetDirectoryName(rootFolder));

            if (recLigInfoList.Any(a => string.IsNullOrWhiteSpace(a.LigInfSequence)))
            {
                Console.WriteLine("Error: empty interface");
                return;
            }

            if (recLigInfoList.Any(a => string.IsNullOrWhiteSpace(a.LigSequence) || a.LigSequence.Length < 50))
            {
                Console.WriteLine("Error: empty/short sequence");
                return;
            }

            var fastaSequences    = recLigInfoList.Select(a => ">" + a.PdbId + a.LigChainId + "_pdb\r\n" + a.LigSequence).ToList();
            var fastaInfSequences = recLigInfoList.Select(a => ">" + a.PdbId + a.LigChainId + "_interface\r\n" + a.LigInfSequence).ToList();

            File.WriteAllLines(rootFolder + "sequences.fasta", fastaSequences);
            File.WriteAllLines(rootFolder + "interfaces_pdbsum.fasta", fastaInfSequences);


            if (recLigInfoList.Any(a => a.LigSequence.IndexOf(a.LigInfSequence) != a.LigSequence.LastIndexOf(a.LigInfSequence)))
            {
                Console.WriteLine("More than one interface match in the sequence");
                return;
            }



            var folders = new List <string>();

            foreach (var template in recLigInfoList.Where(a => (new List <string>()
            {
                "1RGI", "3BX1", "2SIC"
            }).Contains(a.PdbId)))
            {
                /*
                 * // replacement with direct overwrite of interface
                 * foreach (var templateForSiblingInterface in recLigInfoList)
                 * {
                 *  var superOffset = templateForSiblingInterface.LigSequenceAlignment.SuperSequenceStartIndex;
                 *
                 *  var siblingInterfaceToFit = InterfaceSubsequence(templateForSiblingInterface.LigSequence, templateForSiblingInterface.LigInfStart - superOffset, templateForSiblingInterface.LigInfEnd - superOffset, templateForModelling.LigInfSequence.Length).InterfaceSequence;
                 *
                 *  var substitutionDescription = "sibling_" + templateForSiblingInterface.PdbId + templateForSiblingInterface.RecChainId + templateForSiblingInterface.LigChainId;
                 *
                 *  var templateFolder = rootFolder + templateForModelling.PdbId + templateForModelling.RecChainId + templateForModelling.LigChainId + @"\" + templateForSiblingInterface.PdbId + templateForSiblingInterface.RecChainId + templateForSiblingInterface.LigChainId + @"\";
                 *  var currentFolder = rootFolder + templateForModelling.PdbId + templateForModelling.RecChainId + templateForModelling.LigChainId + @"\" + templateForSiblingInterface.PdbId + templateForSiblingInterface.RecChainId + templateForSiblingInterface.LigChainId + @"\" + substitutionDescription + @"\";
                 *  Directory.CreateDirectory(Path.GetDirectoryName(currentFolder));
                 *  rootFolderSubDirs.Remove(Path.GetDirectoryName(currentFolder));
                 *
                 *  var ligSeqModified = templateForModelling.LigSequence.Remove(templateForModelling.LigInfPosInLigSeq, siblingInterfaceToFit.Length).Insert(templateForModelling.LigInfPosInLigSeq, siblingInterfaceToFit);
                 *
                 *  var file = currentFolder + "template_ligand_after_substitution.ali";
                 *  var seqModPir = Pir(new List<string>() { templateForModelling.RecSequence, ligSeqModified });
                 *
                 *  File.WriteAllText(file, seqModPir);
                 *  folders.Add(currentFolder);
                 *
                 *  //DimerTemplate(, templateForModelling.RecChainId, templateForModelling.LigChainId);
                 *
                 *  var templateSrc = @"c:\pdb_templates\repaired_crystal_and_repaired_model\" + templateForModelling.PdbId + templateForModelling.RecChainId + templateForModelling.LigChainId + ".pdb"; //@"c:\pdbe\" + templateForModelling.PdbId + ".pdb";
                 *  var templateDest = templateFolder + "template_ligand_all.pdb";
                 *  File.Copy(templateSrc, templateDest, true);
                 *
                 *  if (templateForModelling.PdbId == templateForSiblingInterface.PdbId)
                 *  {
                 *      Debug.WriteLine("");
                 *  }
                 * }
                 */

                // replacements with sliding window method
                foreach (var sibling in recLigInfoList /*.Where(a => a == templateForModelling).ToList()*/)//.Where(a => a.PdbId == templateForModelling.PdbId && a.RecChainId == templateForModelling.RecChainId))
                {
                    if (template != sibling && (template.LigInfSequence.Contains(sibling.LigInfSequence) || sibling.LigInfSequence.Contains(template.LigInfSequence)))
                    {
                        Debug.WriteLine("template and sibling have matching interface");
                        continue;
                    }

                    var currentFolder = rootFolder + template.PdbId + template.RecChainId + template.LigChainId + @"\" + sibling.PdbId + sibling.RecChainId + sibling.LigChainId + @"\";
                    Directory.CreateDirectory(Path.GetDirectoryName(currentFolder));

                    var templateSrc  = @"c:\pdb_templates\repaired_crystal_and_repaired_model\" + template.PdbId + template.RecChainId + template.LigChainId + ".pdb"; //@"c:\pdbe\" + templateForModelling.PdbId + ".pdb";
                    var templateDest = currentFolder + "template_ligand_all.pdb";
                    File.Copy(templateSrc, templateDest, true);

                    // 1. slide sibling interface over template interface (if bigger, try each possible)
                    // 2. best alignments?
                    // 3.

                    // 1. ignore size difference (sibling can be same size, longer or shorter, still copied as-is)

                    //Debug.WriteLine("Template: " + template.LigInfSequence + " " + template.LigInfSequence.Length);
                    //Debug.WriteLine("Sibling: " + sibling.LigInfSequence + " " + sibling.LigInfSequence.Length);

                    //part 0 - random sequence tests
                    if (template == sibling)
                    {
                        var firstPosition = template.LigInfPosInLigSeq - 50;// (template.LigInfSequence.Length - 1);
                        if (firstPosition < 0)
                        {
                            firstPosition = 0;
                        }

                        var lastPosition = template.LigInfPosInLigSeq + template.LigInfSequence.Length + 50;

                        if (lastPosition + (template.LigInfSequence.Length - 1) > template.LigSequence.Length)
                        {
                            lastPosition = template.LigSequence.Length - sibling.LigInfSequence.Length;
                        }

                        var substitutionMode = "0";

                        for (var i = firstPosition; i <= lastPosition; i++)
                        {
                            var overlap = ProteinBioClass.InterfaceOverlapPercentage(i, (i + template.LigInfSequence.Length) - 1, template.LigInfPosInLigSeq, (template.LigInfPosInLigSeq + template.LigInfSequence.Length) - 1); overlap = Math.Round(overlap, 2);

                            var random = template.LigSequence.Substring(i, template.LigInfSequence.Length);

                            var ligSeqMod = template.LigSequence.Remove(template.LigInfPosInLigSeq, template.LigInfSequence.Length).Insert(template.LigInfPosInLigSeq, random);

                            currentFolder = rootFolder + template.PdbId + template.RecChainId + template.LigChainId + @"\" +
                                            template.PdbId + template.RecChainId + template.LigChainId + @"\" +
                                            substitutionMode + "_" + (i + 1).ToString().PadLeft(4, '0') + "_" + (template.LigInfSequence.Length + 1).ToString().PadLeft(4, '0') + "_" + (sibling.LigInfSequence.Length + 1).ToString().PadLeft(4, '0') + "_" + (overlap) + @"\";



                            Directory.CreateDirectory(Path.GetDirectoryName(currentFolder));
                            var file      = currentFolder + "template_ligand_after_substitution.ali";
                            var seqModPir = Pir(new List <string>()
                            {
                                template.RecSequence, ligSeqMod
                            });

                            if (!File.Exists(file))
                            {
                                if (!folders.Contains(currentFolder))
                                {
                                    folders.Add(currentFolder);
                                }
                                File.WriteAllText(file, seqModPir);
                            }
                        }
                    }
                    continue;

                    {
                        //Debug.WriteLine("Part 1");
                        var firstPosition = template.LigInfPosInLigSeq - (sibling.LigInfSequence.Length - 1);
                        if (firstPosition < 0)
                        {
                            firstPosition = 0;
                        }

                        var lastPosition = template.LigInfPosInLigSeq + (template.LigInfSequence.Length - 1);
                        if (lastPosition + (sibling.LigInfSequence.Length - 1) > template.LigSequence.Length)
                        {
                            lastPosition = template.LigSequence.Length - sibling.LigInfSequence.Length;
                        }

                        for (var i = firstPosition; i <= lastPosition; i++)
                        {
                            var substitutionMode = "1";

                            var overlap = ProteinBioClass.InterfaceOverlapPercentage(i, (i + sibling.LigInfSequence.Length) - 1, template.LigInfPosInLigSeq, (template.LigInfPosInLigSeq + template.LigInfSequence.Length) - 1); overlap = Math.Round(overlap, 2);

                            var ligSeqMod = template.LigSequence.Remove(i, sibling.LigInfSequence.Length).Insert(i, sibling.LigInfSequence);


                            currentFolder = rootFolder + template.PdbId + template.RecChainId + template.LigChainId + @"\" +
                                            sibling.PdbId + sibling.RecChainId + sibling.LigChainId + @"\" +
                                            substitutionMode + "_" + (i + 1).ToString().PadLeft(4, '0') + "_" + (template.LigInfSequence.Length + 1).ToString().PadLeft(4, '0') + "_" + (sibling.LigInfSequence.Length + 1).ToString().PadLeft(4, '0') + "_" + (overlap) + @"\";
                            if (!folders.Contains(currentFolder))
                            {
                                folders.Add(currentFolder);
                            }

                            Directory.CreateDirectory(Path.GetDirectoryName(currentFolder));
                            var file      = currentFolder + "template_ligand_after_substitution.ali";
                            var seqModPir = Pir(new List <string>()
                            {
                                template.RecSequence, ligSeqMod
                            });
                            File.WriteAllText(file, seqModPir);

                            //                        Debug.WriteLine(io + " " + ligSeqMod);
                            //Debug.WriteLine("");
                        }
                    }


                    // 2. keep longer sibling interface the same size as template interface
                    if (sibling.LigInfSequence.Length > template.LigInfSequence.Length)
                    {
                        //Debug.WriteLine("Part 2");
                        var firstPosition = template.LigInfPosInLigSeq - (template.LigInfSequence.Length - 1);
                        var lastPosition  = template.LigInfPosInLigSeq + (template.LigInfSequence.Length - 1);

                        for (var i = firstPosition; i <= lastPosition; i++)
                        {
                            for (var j = 0; j <= sibling.LigInfSequence.Length - template.LigInfSequence.Length; j++)
                            {
                                var ligSeqMod = template.LigSequence.Remove(i, template.LigInfSequence.Length).Insert(i, sibling.LigInfSequence.Substring(j, template.LigInfSequence.Length));


                                var substitutionMode = "2";
                                var overlap          = ProteinBioClass.InterfaceOverlapPercentage(i, (i + template.LigInfSequence.Length) - 1, template.LigInfPosInLigSeq, (template.LigInfPosInLigSeq + template.LigInfSequence.Length) - 1);
                                overlap = Math.Round(overlap, 2);

                                currentFolder = rootFolder + template.PdbId + template.RecChainId + template.LigChainId + @"\" +
                                                sibling.PdbId + sibling.RecChainId + sibling.LigChainId + @"\" +
                                                substitutionMode + "_" + (i + 1).ToString().PadLeft(4, '0') + "_" + (j + 1).ToString().PadLeft(4, '0') + "_" + (overlap) + @"\";
                                if (!folders.Contains(currentFolder))
                                {
                                    folders.Add(currentFolder);
                                }

                                Directory.CreateDirectory(Path.GetDirectoryName(currentFolder));
                                var file      = currentFolder + "template_ligand_after_substitution.ali";
                                var seqModPir = Pir(new List <string>()
                                {
                                    template.RecSequence, ligSeqMod
                                });
                                File.WriteAllText(file, seqModPir);
                            }
                        }

                        //Debug.WriteLine("");
                    }

                    // 3. delete original template interface, insert sibling interface (will already be in part 1 if size is equal)
                    if (sibling.LigInfSequence.Length != template.LigInfSequence.Length)
                    {
                        Debug.WriteLine("Part 3");

                        var ligSeqMod = template.LigSequence.Remove(template.LigInfPosInLigSeq, template.LigInfSequence.Length).Insert(template.LigInfPosInLigSeq, sibling.LigInfSequence);

                        var substitutionMode = "3";


                        currentFolder = rootFolder + template.PdbId + template.RecChainId + template.LigChainId + @"\" +
                                        sibling.PdbId + sibling.RecChainId + sibling.LigChainId + @"\" +
                                        substitutionMode + "_" + "delete-insert" + @"\";
                        if (!folders.Contains(currentFolder))
                        {
                            folders.Add(currentFolder);
                        }

                        Directory.CreateDirectory(Path.GetDirectoryName(currentFolder));
                        var file      = currentFolder + "template_ligand_after_substitution.ali";
                        var seqModPir = Pir(new List <string>()
                        {
                            template.RecSequence, ligSeqMod
                        });
                        File.WriteAllText(file, seqModPir);

                        Debug.WriteLine("");
                    }

                    /*
                     * continue;
                     *
                     * var firstBound = -1;
                     * var lastBound = -1;
                     * var resolution = -1;
                     * var substitutionDescription = "";
                     *
                     * var interfaceLengthDifference = template.LigInfSequence.Length - sibling.LigInfSequence.Length; // positive=replacement is shorter, negative=replacement is longer, zero=the same
                     *
                     * const int flank = 10;
                     * firstBound = sibling.LigInfPosInLigSeq - flank;
                     * lastBound = (firstBound + sibling.LigInfSequence.Length + (flank * 2)) - 1;
                     *
                     * resolution = 1;
                     * substitutionDescription = "inf";
                     *
                     *
                     * if (firstBound < 0) firstBound = 0;
                     * if (lastBound > sibling.LigSequence.Length - 1) lastBound = sibling.LigSequence.Length - 1;
                     *
                     * var totalPossibleInterfaceOverlapPositions = 1;
                     *
                     *
                     * if (interfaceLengthDifference < 0)
                     * {
                     *  totalPossibleInterfaceOverlapPositions = Math.Abs(interfaceLengthDifference) + 1;
                     * }
                     *
                     *
                     *
                     * var lastPossibleSubPos = (lastBound - (interfaceLengthDifference > 0 ? sibling.LigInfSequence.Length : template.LigInfSequence.Length)) + 1;
                     *
                     * for (var substitutionSourcePos = firstBound; substitutionSourcePos <= lastBound && substitutionSourcePos <= lastPossibleSubPos; substitutionSourcePos += resolution)
                     * {
                     *  for (var interfaceOverlapPosition = 0; interfaceOverlapPosition < totalPossibleInterfaceOverlapPositions; interfaceOverlapPosition++)
                     *  {
                     *      var replacementInterfaceSubsequence = sibling.LigSequence.Substring(substitutionSourcePos, sibling.LigInfSequence.Length);
                     *
                     *      if (sibling.LigInfSequence.Length > template.LigInfSequence.Length)
                     *      {
                     *          replacementInterfaceSubsequence = replacementInterfaceSubsequence.Substring(interfaceOverlapPosition, template.LigInfSequence.Length);
                     *      }
                     *
                     *      var ligSeqModified = template.LigSequence.Remove(template.LigInfPosInLigSeq, replacementInterfaceSubsequence.Length).Insert(template.LigInfPosInLigSeq, replacementInterfaceSubsequence);
                     *
                     *      if (ligSeqModified.Length != template.LigSequence.Length)
                     *      {
                     *          throw new Exception("Wrong sub pos or len");
                     *      }
                     *
                     *      var native = (substitutionSourcePos >= sibling.LigInfPosInLigSeq && substitutionSourcePos + replacementInterfaceSubsequence.Length <= sibling.LigInfPosInLigSeq + sibling.LigInfSequence.Length);
                     *      if (native)
                     *      {
                     *          Console.WriteLine("");
                     *      }
                     *      currentFolder = rootFolder + template.PdbId + template.RecChainId + template.LigChainId + @"\" + sibling.PdbId + sibling.RecChainId + sibling.LigChainId + @"\" + substitutionDescription + "_" + (substitutionSourcePos + 1).ToString().PadLeft(4, '0') + "_" + (substitutionSourcePos + replacementInterfaceSubsequence.Length).ToString().PadLeft(4, '0') + "_" + (interfaceOverlapPosition + 1) + "_" + totalPossibleInterfaceOverlapPositions + (native ? "_native" : "") + @"\";
                     *      Directory.CreateDirectory(Path.GetDirectoryName(currentFolder));
                     *      //rootFolderSubDirs.Remove(Path.GetDirectoryName(currentFolder));
                     *
                     *      var file = currentFolder + "template_ligand_after_substitution.ali";
                     *      var seqModPir = Pir(new List<string>() { template.RecSequence, ligSeqModified });
                     *      File.WriteAllText(file, seqModPir);
                     *  }
                     * }
                     */
                }
            }

            //File.WriteAllLines(rootFolder + "obsolete_dirs.txt", rootFolderSubDirs);

            if (folders.Count > 0)
            {
                var scripts = new List <string> {
                    "modeller_monomer.bat", "foldx_dimer.bat"
                };                                                                           //, "pisa_dimer.bat" };


                int div = folders.Count / Environment.ProcessorCount;


                var batch = new List <string>();
                var c     = 0;
                while (folders.Count > 0)
                {
                    c++;
                    batch.Add(@"@echo off");
                    batch.Add(@"set HDF5_DISABLE_VERSION_CHECK=2");
                    batch.Add(@"set THIS_DIR=%cd%");
                    batch.Add(@"set PATH=%PATH%;c:\modeller_scripts;");

                    var t = folders.Count >= div * 2 ? div : folders.Count;

                    folders.Take(t).ToList().ForEach(a =>
                    {
                        batch.Add(@"echo " + a);
                        batch.Add(@"cd " + a);
                        batch.Add(@"CALL %script1%");
                        batch.Add(@"CALL %script2%");
                    });
                    folders = folders.Skip(t).ToList();

                    batch.Add(@"pause");

                    var d = batch /*.Select(a => a.Replace("%script%", script))*/.ToList();
                    var n = "";
                    for (var i = 0; i < scripts.Count; i++)
                    {
                        var script = scripts[i];
                        d = d.Select(a => a.Replace("%script" + (i + 1).ToString() + "%", script)).ToList();
                        n = n + Path.GetFileNameWithoutExtension(script) + "_";
                    }
                    File.WriteAllLines(rootFolder + @"r_" + c + "_" + n + DateTime.Now.Ticks + @".bat", d);

                    batch.Clear();
                }
            }
        }
        /// <summary>
        /// Groups the distinct (escaped) sequences of <paramref name="seqList"/> into clusters by merging the
        /// clusters of any two sequences whose aligned similarity scores both reach the given thresholds.
        /// </summary>
        /// <param name="seqList">Sequences to cluster; each entry's <c>Id</c> is parsed with <c>ProteinBioClass.SequenceId</c>.</param>
        /// <param name="alignmentIdentityOption">Passed through unchanged to <c>ProteinBioClass.AlignedSequenceSimilarityPercentage</c>.</param>
        /// <param name="mininumClusterPairwiseSimilarity">
        /// Minimum <c>Score</c> for two sequences to be joined. It is also used as the minimum
        /// shorter/longer length ratio below which a pair is not aligned at all.
        /// </param>
        /// <param name="mininumEvoClusterPairwiseSimilarity">Minimum <c>ScoreEvo</c> for two sequences to be joined.</param>
        /// <returns>
        /// One member per input entry. Cluster numbers are 1-based and follow the clusters ordered by
        /// ascending number of distinct sequences.
        /// </returns>
        public static List <SequenceIdentityClusterMember> ClusterSequenceByAlignedSequenceIdentity(List <Sequence> seqList, ProteinBioClass.AlignmentIdentityOption alignmentIdentityOption, decimal mininumClusterPairwiseSimilarity = 0.3m, decimal mininumEvoClusterPairwiseSimilarity = 0.3m)
        {
            // One (pdb id, chain id, escaped sequence) tuple per input entry.
            var allsequences = seqList.Select(a =>
            {
                var sequenceId = new ProteinBioClass.SequenceId(a.Id);
                return new Tuple <string, char, string>(sequenceId.PdbId, sequenceId.ChainId, Sequence.EscapeAminoAcidSequence(a.FullSequence));
            }).ToList();

            // Identical sequences are clustered once; the lookup maps each distinct sequence back to
            // every entry that carries it (in input order).
            var sequences     = allsequences.Select(a => a.Item3).Distinct().ToList();
            var idsBySequence = allsequences.ToLookup(a => a.Item3);

            // Every distinct sequence starts in its own cluster.
            var seqClusters = sequences.Select(a => new List <string> { a }).ToList();

            // Alignment types tried, in this order, when the first (NON) comparison misses a threshold.
            var fallbackAlignmentTypes = new[] { ProteinBioClass.AlignmentType.SIM, ProteinBioClass.AlignmentType.NMW };

            for (int indexX = 0; indexX < sequences.Count; indexX++)
            {
                Console.WriteLine("Aligning sequence " + indexX);
                var seqX = sequences[indexX];

                // Only unordered pairs (indexX < indexY) are compared.
                for (int indexY = indexX + 1; indexY < sequences.Count; indexY++)
                {
                    var seqY = sequences[indexY];

                    // Skip pairs whose lengths are too different before doing any alignment.
                    if ((decimal)Math.Min(seqX.Length, seqY.Length) / (decimal)Math.Max(seqX.Length, seqY.Length) < mininumClusterPairwiseSimilarity)
                    {
                        continue;
                    }

                    // Looked up on every iteration because earlier merges replace the cluster lists.
                    var cluster1 = seqClusters.First(a => a.Contains(seqX));
                    var cluster2 = seqClusters.First(a => a.Contains(seqY));

                    if (cluster1 == cluster2)
                    {
                        continue;
                    }

                    var score = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.NON, alignmentIdentityOption);

                    Console.WriteLine("1: " + seqX);
                    Console.WriteLine("2: " + seqY);
                    Console.WriteLine("Score1: " + score.Score);
                    Console.WriteLine("Score2: " + score.ScoreEvo);

                    // Track the best value of each score independently across alignment types. Keeping them
                    // in locals makes both fallbacks behave the same way (a higher Score from a later
                    // alignment must not discard a better ScoreEvo found earlier) and leaves the returned
                    // score objects unmodified.
                    var bestScore    = score.Score;
                    var bestScoreEvo = score.ScoreEvo;

                    foreach (var fallbackAlignmentType in fallbackAlignmentTypes)
                    {
                        if (bestScore >= mininumClusterPairwiseSimilarity && bestScoreEvo >= mininumEvoClusterPairwiseSimilarity)
                        {
                            break;
                        }

                        var fallbackScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, fallbackAlignmentType, alignmentIdentityOption);

                        if (fallbackScore.Score > bestScore)
                        {
                            bestScore = fallbackScore.Score;
                        }
                        if (fallbackScore.ScoreEvo > bestScoreEvo)
                        {
                            bestScoreEvo = fallbackScore.ScoreEvo;
                        }
                    }

                    if (bestScore >= mininumClusterPairwiseSimilarity && bestScoreEvo >= mininumEvoClusterPairwiseSimilarity)
                    {
                        // Replace the two clusters with their union, appended at the end of the list.
                        var mergedCluster = new List <string>();
                        mergedCluster.AddRange(cluster1);
                        mergedCluster.AddRange(cluster2);

                        seqClusters.Remove(cluster1);
                        seqClusters.Remove(cluster2);
                        seqClusters.Add(mergedCluster);
                    }
                }
            }

            // Smallest clusters get the lowest cluster numbers (OrderBy is stable for equal sizes).
            seqClusters = seqClusters.OrderBy(a => a.Count).ToList();

            var output = new List <SequenceIdentityClusterMember>();

            for (var index = 0; index < seqClusters.Count; index++)
            {
                foreach (var sequence in seqClusters[index])
                {
                    foreach (var id in idsBySequence[sequence])
                    {
                        output.Add(new SequenceIdentityClusterMember(index + 1, ProteinBioClass.PdbIdFromPdbFilename(id.Item1), id.Item2, id.Item3));
                    }
                }
            }

            return(output);
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Aligns every selected query sequence against all target sequences and writes, per query chain,
        /// a CSV listing of the targets whose similarity reaches the requested minimum.
        /// </summary>
        /// <param name="args">
        /// [0] query sequence file, [1] query "pdbid:chainid" filter ("*" matches anything),
        /// [2] target sequence file, [3] alignment types (NMW, SWM, SIM, NON; "*" for all),
        /// [4] "Y" to filter on the evolutionary score instead of the plain score,
        /// [5] minimum similarity, [6] maximum length difference (-1 disables the check),
        /// [7] optional output folder; when omitted or blank the results are written to the console.
        /// Numbers are parsed with the invariant culture (e.g. "0.3").
        /// </param>
        static void Main(string[] args)
        {
            // this program takes a fasta or pdb file and finds all matching homologs

            // FindHomologs.exe "c:\ds96ub\ds96ub.fasta" * "c:\pdb\pdb_seqres.fasta" NMW Y 0.3 75 c:\pdb\

            const int requiredArgumentCount = 7;

            if (args == null || args.Length < requiredArgumentCount)
            {
                Console.WriteLine("; usage: FindHomologs.exe [query_sequence_file] [query_pdbid:chainid or *] [target_sequence_file] [alignment_types NMW,SWM,SIM,NON or *] [compare_physicochemically Y/N] [min_similarity] [max_len_difference or -1] [[output_folder]]");
                return;
            }

            // Command-line numbers are machine-readable input, so they must not depend on the OS locale.
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

            var querySequenceFile  = args[0]; //query.fasta
            var queryIdChain       = args[1]; //1A2G:B
            var targetSequenceFile = args[2]; //targets.fasta
            var alignmentTypeStr   = args[3]; //NMW,SWM,SIM,NON

            if (alignmentTypeStr == "*")
            {
                alignmentTypeStr = "NMW,SWM,SIM,NON";
            }

            // Empty tokens (e.g. from "NMW, SWM") are dropped so they are not reported as unknown types.
            var alignmentTypeTokens = alignmentTypeStr.ToUpperInvariant().Split(new char[] { ',', ';', ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);

            var comparePhysicochemically = args[4] == "Y"; //Y/N
            var minSimilarity            = decimal.Parse(args[5], invariantCulture); // 0.3
            var maxLenDifference         = int.Parse(args[6], invariantCulture);
            var outputFolder             = args.Length > requiredArgumentCount ? args[requiredArgumentCount] : "";
            var writeToConsole           = string.IsNullOrWhiteSpace(outputFolder);

            var alignmentTypes = new List <ProteinBioClass.AlignmentType>();

            if (alignmentTypeTokens.Contains("NMW"))
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.NMW);
            }
            if (alignmentTypeTokens.Contains("SWM"))
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.SWM);
            }
            if (alignmentTypeTokens.Contains("SIM"))
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.SIM);
            }
            // NON is also the fallback when no other type was recognised.
            // NOTE(review): because of this fallback a single unrecognised token passes the count check below.
            if (alignmentTypeTokens.Contains("NON") || alignmentTypes.Count == 0)
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.NON);
            }
            if (alignmentTypes.Count < alignmentTypeTokens.Length)
            {
                Console.WriteLine("; unknown alignment type");
                return;
            }

            // load list of query sequences
            var queryIdParts = queryIdChain.Split(new char[] { ':' });
            var queryPdbid   = queryIdParts[0];
            // A missing or empty chain part ("1A2G" or "1A2G:") matches every chain.
            var queryChainid = queryIdParts.Length > 1 && queryIdParts[1].Length > 0 ? queryIdParts[1][0] : '*';

            var querySeq     = Sequence.LoadSequenceFile(querySequenceFile, null);
            var queryResults = querySeq.Where(a =>
            {
                var id = new ProteinBioClass.SequenceId(a.Id);
                return((queryPdbid == "*" || id.PdbId.ToUpperInvariant() == queryPdbid.ToUpperInvariant()) &&
                       (queryChainid == '*' || id.ChainId == queryChainid));
            }).ToList();

            if (queryResults.Count == 0)
            {
                Console.WriteLine("; the query pdbids/chainids were not found");
                return;
            }

            // load list of target sequences (targets shorter than 50 items are ignored)
            var targetSeq = Sequence.LoadSequenceFile(targetSequenceFile, new string[] { null, "", "protein" });

            targetSeq = targetSeq.Where(a => a.Count() >= 50).ToList();

            Console.WriteLine("; aligning " + queryResults.Count + " query sequences to " + targetSeq.Count + " target sequences");

            // Number of chains per pdb id, reported alongside each hit.
            var queryPdbIdCounts = new Dictionary <string, int>();

            foreach (var pdbId in queryResults.Select(a => new ProteinBioClass.SequenceId(a.Id).PdbId))
            {
                int count;
                queryPdbIdCounts.TryGetValue(pdbId, out count);
                queryPdbIdCounts[pdbId] = count + 1;
            }

            var targetPdbIdCounts = new Dictionary <string, int>();

            foreach (var pdbId in targetSeq.Select(a => new ProteinBioClass.SequenceId(a.Id).PdbId))
            {
                int count;
                targetPdbIdCounts.TryGetValue(pdbId, out count);
                targetPdbIdCounts[pdbId] = count + 1;
            }

            // perform alignment
            foreach (var currentQuery in queryResults)
            {
                var currentQueryId = new ProteinBioClass.SequenceId(currentQuery.Id);

                // The output path is only built when a folder was given: DirectoryInfo rejects blank paths,
                // which previously made the console-output mode crash before producing anything.
                var filename = writeToConsole
                    ? null
                    : Path.Combine(new DirectoryInfo(outputFolder).FullName, "homologs_" + currentQueryId.PdbId + currentQueryId.ChainId + @".csv");

                // skip if already processed
                if (filename != null && File.Exists(filename) && new FileInfo(filename).Length > 0)
                {
                    continue;
                }

                var totalQueryPdbIdChains = queryPdbIdCounts[currentQueryId.PdbId];

                WorkDivision wd = new WorkDivision(targetSeq.Count);

                for (var thread = 0; thread < wd.ThreadCount; thread++)
                {
                    // Per-task copies so each closure sees its own values.
                    var query   = currentQuery;
                    var queryId = currentQueryId;
                    var lti     = thread;

                    wd.TaskList.Add(Task.Run(() =>
                    {
                        var result = new List <HomologChain>();

                        for (var target = wd.ThreadFirstIndex[lti]; target <= wd.ThreadLastIndex[lti]; target++)
                        {
                            var targetobj = targetSeq[target];

                            if (maxLenDifference != -1 && Math.Abs(targetobj.Count() - query.Count()) > maxLenDifference)
                            {
                                continue;
                            }

                            var targetId = new ProteinBioClass.SequenceId(targetobj.Id);

                            foreach (var alignmentType in alignmentTypes)
                            {
                                var scores         = ProteinBioClass.AlignedSequenceSimilarityPercentage(query, targetobj, alignmentType);
                                var percentSimilar = comparePhysicochemically ? scores.ScoreEvo : scores.Score;

                                if (percentSimilar >= minSimilarity)
                                {
                                    result.Add(new HomologChain(
                                                   queryId.PdbId, queryId.ChainId, totalQueryPdbIdChains,
                                                   targetId.PdbId, targetId.ChainId, targetPdbIdCounts[targetId.PdbId],

                                                   alignmentType.ToString(),
                                                   scores.Score,
                                                   scores.ScoreEvo));
                                }
                            }
                        }

                        return(result);
                    }));
                }

                wd.WaitAllTasks();

                var mergedlist = new List <string>();

                mergedlist.Add("; " + currentQueryId.PdbId + ":" + currentQueryId.ChainId);
                mergedlist.Add(String.Join(",",
                                           new string[]
                {
                    "query pdb id", "query chain id", "query chains",
                    "target pdb id", "target chain id", "target chains",

                    "alignment method", "sequence similarity", "sequence evo similarity"
                }));

                foreach (var t in wd.TaskList)
                {
                    var tc = t as Task <List <HomologChain> >;

                    if (tc == null)
                    {
                        throw new Exception("task in tasklist was null");
                    }

                    mergedlist.AddRange(tc.Result.Select(a => a.ToString()));
                }

                if (filename == null)
                {
                    Console.WriteLine(String.Join(Environment.NewLine, mergedlist));
                }
                else
                {
                    File.WriteAllLines(filename, mergedlist);
                }
            }
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Lists the atomic contacts found in a PDB/atoms file within a maximum distance, writing them to an
        /// output file when one is given and to the console otherwise.
        /// </summary>
        /// <param name="args">
        /// [0] pdb or atoms file, [1] maximum contact distance (invariant-culture decimal, e.g. "8.0"),
        /// [2] optional output file, [3] optional overwrite flag: when it is given and is not "Y",
        /// an existing output file is left untouched.
        /// </param>
        static void Main(string[] args)
        {
            var parameters = new string[, ]
            {
                { "[pdb_or_atoms_file]", "output from the ComplexAtoms program" },
                { "[max_distance]", "maximum allowed contact distance in angstroms [i.e. 5.0 or 8.0]" },
                { "[[output_file]]", "optional output file.  when ommitted, output to console" },
                { "[[overwrite]]", "overwrite if output file exists" }
            };

            // Column 0 of each row holds the parameter name.
            var parameterNames = Enumerable.Range(0, parameters.GetLength(0)).Select(row => parameters[row, 0]).ToList();
            var maxParamLength = parameterNames.Max(name => name.Length);

            string exeFilename;
            using (var currentProcess = System.Diagnostics.Process.GetCurrentProcess())
            {
                exeFilename = Path.GetFileName(currentProcess.MainModule.FileName);
            }

            if (args.Length == 0)
            {
                Console.WriteLine(exeFilename + @" is a program to list atomic contacts for a PDB file ATOM records.");
                Console.WriteLine();
                Console.WriteLine(@"Usage:");
                Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" " + String.Join(" ", parameterNames), maxParamLength + 2, 1));
                Console.WriteLine();
                Console.WriteLine(@"Example:");
                Console.WriteLine(ProteinBioClass.WrapConsoleText(exeFilename + @" ""c:\pdb_db\pdb1a12.pdb"" 8.0 ""c:\pdb_atoms\atoms1a12.pdb""", maxParamLength + 2, 1));
                Console.WriteLine();
                Console.WriteLine(@"Arguments:");
                for (var i = 0; i < parameters.GetLength(0); i++)
                {
                    Console.WriteLine(@" " + parameters[i, 0].PadLeft(maxParamLength, ' ') + " " + ProteinBioClass.WrapConsoleText(parameters[i, 1], maxParamLength + 2, 1, false));
                }
                Console.WriteLine();
                // Execution deliberately continues: the (empty) argument values are echoed below and the
                // missing input file ends the program.
            }

            // The distance is machine-readable input ("8.0"), so it must not depend on the OS locale.
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

            // Returns the argument at the given position, or "" when it is absent or empty.
            Func <int, string> argumentOrEmpty = position => args.Length > position && args[position].Length > 0 ? args[position] : "";

            // load arguments (each value is echoed as a "; name = value" line as soon as it is read)
            var p             = 0;
            var atomsFilename = argumentOrEmpty(p).Replace("\"", "");

            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + atomsFilename);

            p++;
            var maxDistanceText = argumentOrEmpty(p);
            var maxDistance     = maxDistanceText.Length > 0 ? Decimal.Parse(maxDistanceText, invariantCulture) : 0.0m;

            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + maxDistance.ToString(invariantCulture));

            p++;
            var outputFilename = argumentOrEmpty(p).Replace("\"", "");

            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + outputFilename);

            p++;
            var overwrite = argumentOrEmpty(p).Replace("\"", "");

            Console.WriteLine("; " + parameters[p, 0].PadLeft(maxParamLength, ' ') + " = " + overwrite);

            // An existing output file is only protected when an overwrite flag other than "Y" was supplied.
            if (!string.IsNullOrWhiteSpace(overwrite) && overwrite.ToUpperInvariant() != "Y" && File.Exists(outputFilename))
            {
                Console.Write("; File exists, skipping.");
                return;
            }

            Console.WriteLine();

            if (!File.Exists(atomsFilename))
            {
                // Report a wrong path instead of exiting silently; stay quiet when no file was named at all.
                if (atomsFilename.Length > 0)
                {
                    Console.WriteLine("; Input file not found: " + atomsFilename);
                }
                return;
            }

            var interactions = ProteinBioClass.FindInteractions(CancellationToken.None, maxDistance, atomsFilename, new Dictionary <string, List <char> >());

            if (!string.IsNullOrWhiteSpace(outputFilename))
            {
                ProteinBioClass.AtomPair.SaveAtomPairList(outputFilename, interactions);
            }
            else
            {
                foreach (var line in interactions.Select(pair => pair.ToString()).ToList())
                {
                    Console.WriteLine(line);
                }
            }
        }
Ejemplo n.º 16
0
        static void Main(string[] args)
        {
            var pdbFolder = @"C:\ds96ub_homologs\";

            var homologClusterData = FindHomologsCluster.FindHomologsCluster.HomologClusterData.Load(@"c:\ds96ub_homologs\ds96ub_homologs_0.7.csv");

            var pdbFiles = Directory.GetFiles(pdbFolder, "*.pdb", SearchOption.TopDirectoryOnly);

            var pdbIdList = pdbFiles.Select(ProteinBioClass.PdbIdFromPdbFilename).ToList();

            // only ca-atoms, ters and endmdls
            var pdbAtomsText =
                pdbFiles.Select(
                    a =>
                    File.ReadAllLines(a)
                    .Where(b => (b.StartsWith("ATOM ") && b[13] == 'C' && b[14] == 'A') || /*b.StartsWith("TER ") ||*/ b.StartsWith("ENDMDL "))
                    .ToList()).ToList();

            // only first nmr model
            pdbAtomsText = pdbAtomsText.Select(a =>
            {
                var x = a.FindIndex(b => b.StartsWith("ENDMDL "));
                return(x == -1 ? a : a.GetRange(0, x - 1));
            }).ToList();

            var pdbAtoms = pdbAtomsText.Select(a => a.Select(b => new ATOM_Record(b)).ToList()).ToList();

            // get list of unique chain ids
            var pdbChainIds = pdbAtoms.Select((a, i) => a.Select(b => char.ToUpperInvariant(b.chainID.FieldValue[0])).ToList()).Distinct().ToList();

            var pdbIdChainIdList = new List <Tuple <string, char> >();

            for (var i = 0; i < pdbIdList.Count; i++)
            {
                pdbIdChainIdList.AddRange(pdbChainIds[i].Select(chainId => new Tuple <string, char>(pdbIdList[i], chainId)));
            }
            pdbIdChainIdList = pdbIdChainIdList.Distinct().ToList();

            // for each chain
            var pdbContacts =
                pdbIdChainIdList.Select(a =>
            {
                var x =
                    ProteinBioClass.AtomPair.LoadAtomPairList(@"C:\ds96ub_homologs\contacts\contacts_pdb" + a.Item1.ToUpperInvariant() + ".pdb")
                    .Where(b => char.ToUpperInvariant(b.Atom1.chainID.FieldValue[0]) == a.Item2 || char.ToUpperInvariant(b.Atom2.chainID.FieldValue[0]) == a.Item2)
                    .Select(c =>
                {
                    if (char.ToUpperInvariant(c.Atom1.chainID.FieldValue[0]) != a.Item2)
                    {
                        c.SwapAtoms();
                    }

                    return(c);
                }).ToList();

                return(x);
            }).ToList();



            // res min, res max, best min, best max, interface aa, interface mask
            var pdbInterfaces = new List <Ds93UbInterface>();

            var interface_target_length = 50;


            for (int index = 0; index < pdbContacts.Count; index++)
            {
                var pdbId   = pdbIdChainIdList[index].Item1;
                var chainId = pdbIdChainIdList[index].Item2;

                var pdbContact = pdbContacts[index];

                if (pdbContact.Count == 0)
                {
                    continue;
                }

                var contactChains = pdbContact.Where(a => char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0]) != chainId).Select(a => char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0])).Distinct().ToList();

                foreach (var contactChain in contactChains)
                {
                    var pdbContactsResSeqIds =
                        pdbContact.Where(a => char.ToUpperInvariant(a.Atom1.chainID.FieldValue[0]) == chainId &&
                                         char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0]) == contactChain)
                        .Select(a => int.Parse(a.Atom1.resSeq.FieldValue))
                        .ToList();


                    var res_seq     = pdbContactsResSeqIds;
                    var min_res_seq = pdbContactsResSeqIds.Min();
                    var max_res_seq = pdbContactsResSeqIds.Max();

                    var best50_min           = int.MinValue;
                    var best50_max           = int.MinValue;
                    var best50_interactions  = int.MinValue;
                    var best50_middle_finder = new List <Tuple <int, int, int> >();
                    for (var x = min_res_seq - interface_target_length; x <= max_res_seq; x++)
                    {
                        if (Math.Abs(max_res_seq - min_res_seq) <= interface_target_length)
                        {
                            best50_min          = min_res_seq;
                            best50_max          = max_res_seq;
                            best50_interactions = res_seq.Count;
                            break;
                        }

                        var min = x;
                        var max = x + interface_target_length > max_res_seq ? max_res_seq : x + interface_target_length;

                        var best50 = res_seq.Count(a => a >= best50_min && a <= best50_max);

                        if (best50 == best50_interactions)
                        {
                            best50_middle_finder.Add(new Tuple <int, int, int>(min, max, best50));
                        }

                        if (best50_interactions == int.MinValue || best50 > best50_interactions)
                        {
                            best50_middle_finder.Clear();
                            best50_middle_finder.Add(new Tuple <int, int, int>(min, max, best50));
                            best50_min          = min;
                            best50_max          = max;
                            best50_interactions = best50;
                        }

                        if (x + interface_target_length >= max)
                        {
                            break;
                        }
                    }

                    if (best50_middle_finder.Count > 2)
                    {
                        var middle = best50_middle_finder[best50_middle_finder.Count / 2];
                        best50_min          = middle.Item1;
                        best50_max          = middle.Item2;
                        best50_interactions = middle.Item3;
                    }

                    var best50_interface_atoms = pdbAtoms[pdbIdList.IndexOf(pdbId)].Where(a =>
                    {
                        var l = int.Parse(a.resSeq.FieldValue);
                        var c = char.ToUpperInvariant(a.chainID.FieldValue[0]);
                        return(c == chainId && l >= best50_min && l <= best50_max);
                    }).ToList();

                    best50_interface_atoms = best50_interface_atoms.OrderBy(c => int.Parse(c.resSeq.FieldValue)).ToList();

                    var best50_interface = string.Join("", best50_interface_atoms.Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

                    var best50_mask = new string('_', best50_interface.Length);
                    best50_mask = string.Join("",
                                              best50_mask.Select((a, i) => res_seq.Contains(i + best50_min) ? "X" : "_").ToList());

                    pdbInterfaces.Add(new Ds93UbInterface(pdbId, chainId, contactChain, min_res_seq, max_res_seq, best50_min,
                                                          best50_max, best50_interactions, best50_interface, best50_mask, -1, "", "", 0, -1, "", "", 0));
                }
            }

            var homologClusterIndexes = homologClusterData.Select(a => a.ClusterIndex).Distinct().ToList();

            var homologClusters = homologClusterIndexes.Select(a => homologClusterData.Where(b => b.ClusterIndex == a).ToList()).ToList();

            var pdbInterfacesSorted = homologClusters.Select(a => pdbInterfaces.Where(b => a.Any(c => c.PdbId == b.PdbId && (char.ToUpperInvariant(c.ChainId) == b.ChainId1 || char.ToUpperInvariant(c.ChainId) == b.ChainId2))).ToList()).ToList();

            var outputData = new List <string>();


            foreach (var clusterIndex in homologClusterIndexes)
            {
                var cluster = pdbInterfacesSorted[clusterIndex - 1];

                // currently, cluster is a list of chain1-->chain2 interfaces ... so the 'chain2' interface needs adding to the record



                foreach (var inf1 in cluster)
                {
                    var partner =
                        cluster.Where(a => a != inf1 && a.PdbId == inf1.PdbId && inf1.ChainId2 == a.ChainId1)
                        .OrderByDescending(
                            a => InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, a.MinResSeq, a.MaxResSeq))
                        .ToList();

                    var first = partner.FirstOrDefault();
                    if (first != null)
                    {
                        inf1.Partner1InterfaceAminoAcids       = first.InterfaceAminoAcids;
                        inf1.Partner1InterfaceInteractionsMask = first.InterfaceInteractionsMask;
                        inf1.Partner1InterfaceOverlap          = InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, first.MinResSeq, first.MaxResSeq);
                    }

                    var second = partner.ElementAtOrDefault(1);
                    if (second != null)
                    {
                        inf1.Partner2InterfaceAminoAcids       = second.InterfaceAminoAcids;
                        inf1.Partner2InterfaceInteractionsMask = second.InterfaceInteractionsMask;
                        inf1.Partner2InterfaceOverlap          = InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, second.MinResSeq, second.MaxResSeq);
                    }
                }

                cluster = cluster.Where(a => a.Partner1InterfaceOverlap > 0 || a.Partner2InterfaceOverlap > 0).ToList();

                /*
                 * var partners =
                 *  foreach (var inf2 in cluster)
                 *  {
                 *      if (inf1.PdbId!=inf2.PdbId) continue;
                 *
                 *      if (inf1==inf2) continue;
                 *
                 *      if (!(inf1.ChainId1==inf2.ChainId2 || inf1.ChainId2==inf2.ChainId1)) continue;
                 *
                 *      //
                 *      var overlap = InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, inf2.MinResSeq, inf2.MaxResSeq);
                 *
                 *      if (overlap > 0)
                 *      {
                 *          if (overlap > inf1.Partner1InterfaceOverlap)
                 *          {
                 *              inf1.Partner1InterfaceOverlap = overlap;
                 *              inf1.Partner1InterfaceAminoAcids = inf2.InterfaceAminoAcids;
                 *              inf1.Partner1InterfaceInteractionsMask = inf2.InterfaceInteractionsMask;
                 *          }
                 *
                 *          if (overlap > inf2.Partner1InterfaceOverlap)
                 *          {
                 *              inf2.Partner1InterfaceOverlap = overlap;
                 *              inf2.Partner1InterfaceAminoAcids = inf1.InterfaceAminoAcids;
                 *              inf2.Partner1InterfaceInteractionsMask = inf1.InterfaceInteractionsMask;
                 *          }
                 *      }
                 *  }
                 * }
                 */

                //var interfaces = cluster.Select(a => a.InterfaceAminoAcids).ToList();
                //interfaces = interfaces.Where(a => interfaces.Count(b => b == a) > 1).ToList();

                //cluster = cluster.Where(a => a.InterfaceAminoAcids.Length >= 5 && cluster.Count(b => b.InterfaceAminoAcids == a.InterfaceAminoAcids) > 1).ToList();
                cluster = cluster.Where(a => a.InterfaceAminoAcids.Length >= 5).ToList();

                var clusterInterfaces = cluster.Select(a => a.InterfaceAminoAcids).ToList();

                var homologInterfaces = new List <List <string> >();
                foreach (var inf1 in clusterInterfaces)
                {
                    var    highest_score = decimal.MinValue;
                    string highest_inf   = null;

                    foreach (var inf2 in clusterInterfaces)
                    {
                        if (inf1 == inf2)
                        {
                            continue;
                        }

                        var score = ProteinBioClass.AlignedSequenceSimilarityPercentage(inf1, inf2, ProteinBioClass.AlignmentType.NMW);
                        if (score.Score > highest_score)
                        {
                            highest_score = score.Score;
                            highest_inf   = inf2;
                        }
                    }
                    var y = homologInterfaces.FirstOrDefault(a => a.Contains(inf1) || a.Contains(highest_inf));
                    if (y != null)
                    {
                        if (!y.Contains(inf1))
                        {
                            y.Add(inf1);
                        }
                        if (!y.Contains(highest_inf))
                        {
                            y.Add(highest_inf);
                        }
                    }
                    else
                    {
                        var z = new List <string>();
                        z.Add(inf1);
                        z.Add(highest_inf);
                        homologInterfaces.Add(z);
                    }
                }

                foreach (var c in cluster)
                {
                    c.Partner1ClusterIndex = homologInterfaces.FindIndex(b => b.Contains(c.Partner1InterfaceAminoAcids));
                    c.Partner2ClusterIndex = homologInterfaces.FindIndex(b => b.Contains(c.Partner2InterfaceAminoAcids));
                }

                for (int index = 0; index < homologInterfaces.Count; index++)
                {
                    var homologInterface = homologInterfaces[index];



                    var cluster2 =
                        cluster.Where(a => homologInterface.Contains(a.InterfaceAminoAcids)
                                      )
                        .OrderBy(a => a.Partner1ClusterIndex)
                        .ThenBy(a => a.Partner2ClusterIndex)
                        .ThenBy(a => a.InterfaceAminoAcids)
                        .ThenBy(a => a.Partner1InterfaceAminoAcids)
                        .ThenBy(a => a.Partner2InterfaceAminoAcids)
                        .ToList();

                    var partners =
                        cluster2.Select(
                            a =>
                            new Tuple <string, string, string>(a.InterfaceAminoAcids, a.Partner1InterfaceAminoAcids,
                                                               a.Partner2InterfaceAminoAcids)).Distinct();

                    cluster2 =
                        partners.Select(
                            a =>
                            cluster2.FirstOrDefault(
                                b =>
                                b.InterfaceAminoAcids == a.Item1 && b.Partner1InterfaceAminoAcids == a.Item2 &&
                                b.Partner2InterfaceAminoAcids == a.Item3)).ToList();

                    outputData.Add("cluster " + clusterIndex + "." + index);
                    outputData.AddRange(cluster2.Select(a => a.ToString()).ToList());
                    outputData.Add("");
                }
            }

            File.WriteAllLines(@"c:\ds96ub_homologs\ds96ub_homologs_interfaces.csv", outputData);//pdbInterfaces.Select(a=>a.ToString()).ToList());
        }
Ejemplo n.º 17
0
        /// <summary>
        /// Console entry point. With no arguments it prints usage help; otherwise it echoes the
        /// three file arguments and calls <c>StructureFileToAaSequence</c> on the structure file.
        /// The comparison against the FASTA sequence is not implemented yet.
        /// </summary>
        /// <param name="args">Structure (pdb/atoms) file, FASTA file and optional output file, in that order.</param>
        static void Main(string[] args)
        {
            // One row per command-line parameter: { display name, description }.
            var parameters = new string[, ]
            {
                { "[pdb_or_atoms_file]", "input structure for sequence" },
                { "[fasta_file]", "input sequence for structure" },
                { "[[output_file]]", "optional output file" },
            };

            var parameterCount  = parameters.GetLength(0);
            var parameterNames  = Enumerable.Range(0, parameterCount).Select(row => parameters[row, 0]).ToList();
            var nameColumnWidth = parameterNames.Max(name => name.Length);
            var programName     = Path.GetFileName(Process.GetCurrentProcess().MainModule.FileName);
            var wrapIndent      = nameColumnWidth + 2;

            if (args.Length == 0)
            {
                var usageLine   = programName + @" " + String.Join(" ", parameterNames);
                var exampleLine = programName + @" ""c:\pdb_db\atoms\atoms1a12.pdb"" ""c:\pdb_db\fasta\atoms1a12.fasta""";

                Console.WriteLine(programName + @" is a program to calculate offset between the sequence and structure.");
                Console.WriteLine();
                Console.WriteLine(@"Usage:");
                Console.WriteLine(ProteinBioClass.WrapConsoleText(usageLine, wrapIndent, 1));
                Console.WriteLine();
                Console.WriteLine(@"Example:");
                Console.WriteLine(ProteinBioClass.WrapConsoleText(exampleLine, wrapIndent, 1));
                Console.WriteLine();
                Console.WriteLine(@"Arguments:");

                var row = 0;
                while (row < parameterCount)
                {
                    var paddedName = parameters[row, 0].PadLeft(nameColumnWidth, ' ');
                    Console.WriteLine(@" " + paddedName + " " + ProteinBioClass.WrapConsoleText(parameters[row, 1], wrapIndent, 1, false));
                    row++;
                }

                Console.WriteLine();

                return;
            }

            // Reads the argument at the given position (empty string when absent), strips any
            // double quotes and echoes it next to its parameter name.
            Func<int, string> readAndEchoArgument = position =>
            {
                var value = "";
                if (args.Length > position && args[position].Length > 0)
                {
                    value = args[position];
                }

                value = value.Replace("\"", "");
                Console.WriteLine("; " + parameters[position, 0].PadLeft(nameColumnWidth, ' ') + " = " + value);

                return value;
            };

            // load arguments
            var atomsFilename      = readAndEchoArgument(0);
            var inputFastaFilename = readAndEchoArgument(1);
            var outputDataFilename = readAndEchoArgument(2);

            Console.WriteLine();

            var struct_seq = ProteinBioinformaticsSharedLibrary.ProteinBioClass.StructureFileToAaSequence(atomsFilename, null, false);

            // NOTE: not finished - the FASTA file is never read and nothing is written to the
            // output file; the loaded structure sequence is currently unused.
        }
        /// <summary>
        /// Builds a string with one character per residue position of the requested range, taken from
        /// the secondary-structure field of the DSSP file that accompanies <paramref name="pdbFilename"/>.
        /// Positions for which no usable DSSP record exists keep the placeholder '_'.
        /// </summary>
        /// <param name="pdbFilename">Path of the structure file; ".dssp" is appended unless the path already has that extension.</param>
        /// <param name="chainId">Chain to read (compared case-insensitively), or null to use the records of every chain.</param>
        /// <param name="startResidueSequenceIndex">First residue number of the range; -1 selects the smallest parseable residue number among the selected records.</param>
        /// <param name="endResidueSequenceIndex">Last residue number of the range; -1 selects the largest parseable residue number among the selected records.</param>
        /// <param name="reversedSequence">When true the characters are returned in reverse order.</param>
        /// <returns>
        /// The secondary-structure string, whose length is given by <c>CalculateProteinInterfaceLength</c>;
        /// or an empty string when the filename is blank, the DSSP file does not exist, the requested chain
        /// has no records, or a bound has to be derived but no record carries a parseable residue number.
        /// </returns>
        public static string LoadDsspStructureSequence(string pdbFilename, string chainId = null, int startResidueSequenceIndex = -1, int endResidueSequenceIndex = -1, bool reversedSequence = false)
        {
            if (string.IsNullOrWhiteSpace(pdbFilename))
            {
                return("");
            }

            var dsspFilename = pdbFilename;

            if (Path.GetExtension(dsspFilename) != ".dssp")
            {
                dsspFilename += ".dssp";
            }

            if (!File.Exists(dsspFilename))
            {
                return("");
            }

            var secondaryStructure = DsspFormatFile.LoadDsspFile(dsspFilename);

            // Records of the requested chain (all records when no chain was requested).
            // OrdinalIgnoreCase matches the previous upper-invariant comparison but tolerates a null chain field.
            var chainRecords = secondaryStructure
                               .Where(a => chainId == null || string.Equals(a.FieldChain.FieldValue, chainId, StringComparison.OrdinalIgnoreCase))
                               .ToList();

            if (chainId != null && chainRecords.Count == 0)
            {
                return("");
            }

            // Keep only records whose residue number can actually be parsed. Records with a blank
            // residue number previously made int.Parse throw while the default bounds were derived.
            var numberedRecords = chainRecords
                                  .Where(a => !string.IsNullOrWhiteSpace(a.FieldPdbResidueSequenceIndex.FieldValue))
                                  .Select(a => new { Record = a, ResSeq = NullableTryParseInt32(a.FieldPdbResidueSequenceIndex.FieldValue) })
                                  .Where(a => a.ResSeq != null)
                                  .ToList();

            if (startResidueSequenceIndex == -1 || endResidueSequenceIndex == -1)
            {
                if (numberedRecords.Count == 0)
                {
                    // Nothing to derive the missing bound(s) from.
                    return("");
                }

                if (startResidueSequenceIndex == -1)
                {
                    startResidueSequenceIndex = numberedRecords.Min(a => a.ResSeq.Value);
                }
                if (endResidueSequenceIndex == -1)
                {
                    endResidueSequenceIndex = numberedRecords.Max(a => a.ResSeq.Value);
                }
            }

            var proteinInterfaceLen = CalculateProteinInterfaceLength(startResidueSequenceIndex, endResidueSequenceIndex);

            // Every position starts as the "no data" placeholder.
            var result = Enumerable.Repeat('_', proteinInterfaceLen).ToArray();

            // dssp specification says order may not be correct, so sort by chain and residue number;
            // records without a chain id are not placed. When several records map to the same
            // position the last one in this order wins.
            var orderedRecords = numberedRecords
                                 .Where(a => !string.IsNullOrWhiteSpace(a.Record.FieldChain.FieldValue))
                                 .OrderBy(a => a.Record.FieldChain.FieldValue)
                                 .ThenBy(a => a.ResSeq);

            foreach (var entry in orderedRecords)
            {
                var resSeq = entry.ResSeq.Value;

                if (resSeq < startResidueSequenceIndex || resSeq > endResidueSequenceIndex)
                {
                    continue;
                }

                var structureCode = entry.Record.FieldSecondaryStructure.FieldValue;

                if (structureCode.Length == 0)
                {
                    continue;
                }

                result[resSeq - startResidueSequenceIndex] = structureCode[0];
            }

            if (reversedSequence)
            {
                Array.Reverse(result);
            }

            return(new string(result));
        }