Exemple #1
0
        /// <summary>
        /// Builds per-chain interaction interfaces from the PDB files and contact lists under
        /// C:\ds96ub_homologs\, groups them by homolog cluster and by interface similarity, and
        /// writes the result to c:\ds96ub_homologs\ds96ub_homologs_interfaces.csv.
        /// </summary>
        /// <param name="args">Command-line arguments; not read, every path is hard-coded.</param>
        static void Main(string[] args)
        {
            var pdbFolder = @"C:\ds96ub_homologs\";

            var homologClusterData = FindHomologsCluster.FindHomologsCluster.HomologClusterData.Load(@"c:\ds96ub_homologs\ds96ub_homologs_0.7.csv");

            var pdbFiles = Directory.GetFiles(pdbFolder, "*.pdb", SearchOption.TopDirectoryOnly);

            var pdbIdList = pdbFiles.Select(ProteinBioClass.PdbIdFromPdbFilename).ToList();

            // only ca-atoms and endmdls; the length check protects the column lookups on truncated lines
            var pdbAtomsText =
                pdbFiles.Select(
                    a =>
                    File.ReadAllLines(a)
                    .Where(b => (b.StartsWith("ATOM ") && b.Length > 14 && b[13] == 'C' && b[14] == 'A') || b.StartsWith("ENDMDL "))
                    .ToList()).ToList();

            // only first nmr model: keep every line that precedes the first ENDMDL record.
            // GetRange takes (index, count), so the count of lines before index x is x itself.
            pdbAtomsText = pdbAtomsText.Select(a =>
            {
                var x = a.FindIndex(b => b.StartsWith("ENDMDL "));
                return(x == -1 ? a : a.GetRange(0, x));
            }).ToList();

            var pdbAtoms = pdbAtomsText.Select(a => a.Select(b => new ATOM_Record(b)).ToList()).ToList();

            // unique chain ids per pdb file; the outer list stays index-aligned with pdbIdList
            var pdbChainIds = pdbAtoms.Select(a => a.Select(b => char.ToUpperInvariant(b.chainID.FieldValue[0])).Distinct().ToList()).ToList();

            var pdbIdChainIdList = new List<Tuple<string, char>>();

            for (var i = 0; i < pdbIdList.Count; i++)
            {
                pdbIdChainIdList.AddRange(pdbChainIds[i].Select(chainId => new Tuple<string, char>(pdbIdList[i], chainId)));
            }
            pdbIdChainIdList = pdbIdChainIdList.Distinct().ToList();

            // for each chain: load the contacts that involve it and orient every pair so that
            // Atom1 is the atom belonging to this chain
            var pdbContacts =
                pdbIdChainIdList.Select(a =>
            {
                var x =
                    ProteinBioClass.AtomPair.LoadAtomPairList(@"C:\ds96ub_homologs\contacts\contacts_pdb" + a.Item1.ToUpperInvariant() + ".pdb")
                    .Where(b => char.ToUpperInvariant(b.Atom1.chainID.FieldValue[0]) == a.Item2 || char.ToUpperInvariant(b.Atom2.chainID.FieldValue[0]) == a.Item2)
                    .Select(c =>
                {
                    if (char.ToUpperInvariant(c.Atom1.chainID.FieldValue[0]) != a.Item2)
                    {
                        c.SwapAtoms();
                    }

                    return(c);
                }).ToList();

                return(x);
            }).ToList();

            // res min, res max, best min, best max, interface aa, interface mask
            var pdbInterfaces = new List<Ds93UbInterface>();

            var interface_target_length = 50;

            for (int index = 0; index < pdbContacts.Count; index++)
            {
                var pdbId   = pdbIdChainIdList[index].Item1;
                var chainId = pdbIdChainIdList[index].Item2;

                var pdbContact = pdbContacts[index];

                if (pdbContact.Count == 0)
                {
                    continue;
                }

                var contactChains = pdbContact.Where(a => char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0]) != chainId).Select(a => char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0])).Distinct().ToList();

                foreach (var contactChain in contactChains)
                {
                    var pdbContactsResSeqIds =
                        pdbContact.Where(a => char.ToUpperInvariant(a.Atom1.chainID.FieldValue[0]) == chainId &&
                                         char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0]) == contactChain)
                        .Select(a => int.Parse(a.Atom1.resSeq.FieldValue))
                        .ToList();

                    // Min()/Max() throw on an empty sequence; nothing to report for this pair anyway
                    if (pdbContactsResSeqIds.Count == 0)
                    {
                        continue;
                    }

                    var min_res_seq = pdbContactsResSeqIds.Min();
                    var max_res_seq = pdbContactsResSeqIds.Max();

                    var best50_min           = int.MinValue;
                    var best50_max           = int.MinValue;
                    var best50_interactions  = int.MinValue;
                    var best50_middle_finder = new List<Tuple<int, int, int>>();

                    if (Math.Abs(max_res_seq - min_res_seq) <= interface_target_length)
                    {
                        // every contact residue already fits inside a single window
                        best50_min          = min_res_seq;
                        best50_max          = max_res_seq;
                        best50_interactions = pdbContactsResSeqIds.Count;
                    }
                    else
                    {
                        // slide a window of interface_target_length residues over the contact range
                        // and keep the window(s) that cover the most contacts
                        for (var x = min_res_seq - interface_target_length; x <= max_res_seq; x++)
                        {
                            var min = x;
                            var max = Math.Min(x + interface_target_length, max_res_seq);

                            // score the CURRENT window (not the best window found so far)
                            var best50 = pdbContactsResSeqIds.Count(a => a >= min && a <= max);

                            if (best50 == best50_interactions)
                            {
                                // tie with the current best: remember it for the middle pick below
                                best50_middle_finder.Add(new Tuple<int, int, int>(min, max, best50));
                            }

                            if (best50_interactions == int.MinValue || best50 > best50_interactions)
                            {
                                best50_middle_finder.Clear();
                                best50_middle_finder.Add(new Tuple<int, int, int>(min, max, best50));
                                best50_min          = min;
                                best50_max          = max;
                                best50_interactions = best50;
                            }

                            // stop once the window has reached the last contact residue
                            if (x + interface_target_length >= max_res_seq)
                            {
                                break;
                            }
                        }
                    }

                    // several equally good windows: take the one in the middle of the run
                    if (best50_middle_finder.Count > 2)
                    {
                        var middle = best50_middle_finder[best50_middle_finder.Count / 2];
                        best50_min          = middle.Item1;
                        best50_max          = middle.Item2;
                        best50_interactions = middle.Item3;
                    }

                    var best50_interface_atoms = pdbAtoms[pdbIdList.IndexOf(pdbId)].Where(a =>
                    {
                        var l = int.Parse(a.resSeq.FieldValue);
                        var c = char.ToUpperInvariant(a.chainID.FieldValue[0]);
                        return(c == chainId && l >= best50_min && l <= best50_max);
                    }).ToList();

                    best50_interface_atoms = best50_interface_atoms.OrderBy(c => int.Parse(c.resSeq.FieldValue)).ToList();

                    var best50_interface = string.Join("", best50_interface_atoms.Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

                    // one mask character per interface residue, keyed on the residue's own number so
                    // the mask stays aligned when the numbering has gaps or starts after best50_min
                    var contactResSeqSet = new HashSet<int>(pdbContactsResSeqIds);
                    var best50_mask = string.Join("",
                                                  best50_interface_atoms.Select(b => contactResSeqSet.Contains(int.Parse(b.resSeq.FieldValue)) ? "X" : "_").ToList());

                    pdbInterfaces.Add(new Ds93UbInterface(pdbId, chainId, contactChain, min_res_seq, max_res_seq, best50_min,
                                                          best50_max, best50_interactions, best50_interface, best50_mask, -1, "", "", 0, -1, "", "", 0));
                }
            }

            var homologClusterIndexes = homologClusterData.Select(a => a.ClusterIndex).Distinct().ToList();

            var homologClusters = homologClusterIndexes.Select(a => homologClusterData.Where(b => b.ClusterIndex == a).ToList()).ToList();

            // index-aligned with homologClusterIndexes / homologClusters
            var pdbInterfacesSorted = homologClusters.Select(a => pdbInterfaces.Where(b => a.Any(c => c.PdbId == b.PdbId && (char.ToUpperInvariant(c.ChainId) == b.ChainId1 || char.ToUpperInvariant(c.ChainId) == b.ChainId2))).ToList()).ToList();

            var outputData = new List<string>();

            // walk the clusters by list position; this does not rely on the cluster indexes
            // being 1..n and appearing in ascending order
            for (var clusterPosition = 0; clusterPosition < homologClusterIndexes.Count; clusterPosition++)
            {
                var clusterIndex = homologClusterIndexes[clusterPosition];
                var cluster      = pdbInterfacesSorted[clusterPosition];

                // currently, cluster is a list of chain1-->chain2 interfaces ... so the 'chain2' interface needs adding to the record
                foreach (var inf1 in cluster)
                {
                    var partner =
                        cluster.Where(a => a != inf1 && a.PdbId == inf1.PdbId && inf1.ChainId2 == a.ChainId1)
                        .OrderByDescending(
                            a => InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, a.MinResSeq, a.MaxResSeq))
                        .ToList();

                    var first = partner.FirstOrDefault();
                    if (first != null)
                    {
                        inf1.Partner1InterfaceAminoAcids       = first.InterfaceAminoAcids;
                        inf1.Partner1InterfaceInteractionsMask = first.InterfaceInteractionsMask;
                        inf1.Partner1InterfaceOverlap          = InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, first.MinResSeq, first.MaxResSeq);
                    }

                    var second = partner.ElementAtOrDefault(1);
                    if (second != null)
                    {
                        inf1.Partner2InterfaceAminoAcids       = second.InterfaceAminoAcids;
                        inf1.Partner2InterfaceInteractionsMask = second.InterfaceInteractionsMask;
                        inf1.Partner2InterfaceOverlap          = InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, second.MinResSeq, second.MaxResSeq);
                    }
                }

                // keep only interfaces that overlap at least one partner and are at least 5 residues long
                cluster = cluster.Where(a => a.Partner1InterfaceOverlap > 0 || a.Partner2InterfaceOverlap > 0).ToList();
                cluster = cluster.Where(a => a.InterfaceAminoAcids.Length >= 5).ToList();

                var clusterInterfaces = cluster.Select(a => a.InterfaceAminoAcids).ToList();

                // group every interface with its most similar other interface in the cluster
                var homologInterfaces = new List<List<string>>();
                foreach (var inf1 in clusterInterfaces)
                {
                    var    highest_score = decimal.MinValue;
                    string highest_inf   = null;

                    foreach (var inf2 in clusterInterfaces)
                    {
                        if (inf1 == inf2)
                        {
                            continue;
                        }

                        var score = ProteinBioClass.AlignedSequenceSimilarityPercentage(inf1, inf2, ProteinBioClass.AlignmentType.NMW);
                        if (score.Score > highest_score)
                        {
                            highest_score = score.Score;
                            highest_inf   = inf2;
                        }
                    }

                    // highest_inf stays null when the cluster holds no other distinct interface;
                    // a null must never become a group member
                    var y = homologInterfaces.FirstOrDefault(a => a.Contains(inf1) || (highest_inf != null && a.Contains(highest_inf)));
                    if (y != null)
                    {
                        if (!y.Contains(inf1))
                        {
                            y.Add(inf1);
                        }
                        if (highest_inf != null && !y.Contains(highest_inf))
                        {
                            y.Add(highest_inf);
                        }
                    }
                    else
                    {
                        var z = new List<string>();
                        z.Add(inf1);
                        if (highest_inf != null)
                        {
                            z.Add(highest_inf);
                        }
                        homologInterfaces.Add(z);
                    }
                }

                foreach (var c in cluster)
                {
                    c.Partner1ClusterIndex = homologInterfaces.FindIndex(b => b.Contains(c.Partner1InterfaceAminoAcids));
                    c.Partner2ClusterIndex = homologInterfaces.FindIndex(b => b.Contains(c.Partner2InterfaceAminoAcids));
                }

                for (int index = 0; index < homologInterfaces.Count; index++)
                {
                    var homologInterface = homologInterfaces[index];

                    var cluster2 =
                        cluster.Where(a => homologInterface.Contains(a.InterfaceAminoAcids))
                        .OrderBy(a => a.Partner1ClusterIndex)
                        .ThenBy(a => a.Partner2ClusterIndex)
                        .ThenBy(a => a.InterfaceAminoAcids)
                        .ThenBy(a => a.Partner1InterfaceAminoAcids)
                        .ThenBy(a => a.Partner2InterfaceAminoAcids)
                        .ToList();

                    // collapse records that share the same (interface, partner1, partner2) triple
                    var partners =
                        cluster2.Select(
                            a =>
                            new Tuple<string, string, string>(a.InterfaceAminoAcids, a.Partner1InterfaceAminoAcids,
                                                              a.Partner2InterfaceAminoAcids)).Distinct();

                    cluster2 =
                        partners.Select(
                            a =>
                            cluster2.FirstOrDefault(
                                b =>
                                b.InterfaceAminoAcids == a.Item1 && b.Partner1InterfaceAminoAcids == a.Item2 &&
                                b.Partner2InterfaceAminoAcids == a.Item3)).ToList();

                    outputData.Add("cluster " + clusterIndex + "." + index);
                    outputData.AddRange(cluster2.Select(a => a.ToString()).ToList());
                    outputData.Add("");
                }
            }

            File.WriteAllLines(@"c:\ds96ub_homologs\ds96ub_homologs_interfaces.csv", outputData);
        }
Exemple #2
0
        /// <summary>
        /// Aligns every selected query sequence against all target sequences and writes the
        /// targets whose similarity reaches the threshold, one CSV file per query chain
        /// (or to the console when no output folder is given).
        /// </summary>
        /// <param name="args">
        /// [0] query sequence file, [1] query id as PDBID[:CHAIN] or *, [2] target sequence file,
        /// [3] alignment types (NMW,SWM,SIM,NON or *), [4] Y/N use the evolutionary score,
        /// [5] minimum similarity, [6] maximum length difference (-1 disables the filter),
        /// [7] optional output folder.
        /// </param>
        static void Main(string[] args)
        {
            // this program takes a fasta or pdb file and finds all matching homologs

            // FindHomologs.exe "c:\ds96ub\ds96ub.fasta" * "c:\pdb\pdb_seqres.fasta" NMW Y 0.3 75 c:\pdb\

            // alignment_type = (n)one, (s)imple, NMW, SWM

            if (args == null || args.Length < 7)
            {
                Console.WriteLine("; usage: FindHomologs.exe <query sequence file> <pdbid[:chainid] or *> <target sequence file> <alignment types> <Y/N> <min similarity> <max length difference> [output folder]");
                return;
            }

            var query_sequence_file  = args[0]; //query.fasta
            var query_id_chain       = args[1]; //1A2G:B
            var target_sequence_file = args[2]; //targets.fasta
            var alignment_type_str   = args[3]; //NMW,SWM,SIM,NON

            if (alignment_type_str == "*")
            {
                alignment_type_str = "NMW,SWM,SIM,NON";
            }

            // drop empty and repeated tokens so "NMW, SWM" or "NMW,NMW" is not reported as unknown
            var alignment_type_str_split = alignment_type_str.ToUpperInvariant()
                                           .Split(new char[] { ',', ';', ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries)
                                           .Distinct()
                                           .ToArray();
            var compare_physicochemically      = args[4]; //Y/N
            var compare_physicochemically_bool = compare_physicochemically == "Y";
            var min_similarity_str             = args[5]; // 0.3
            var max_len_difference             = args[6];

            // the arguments are machine-readable ("0.3"), so parse them independently of the OS culture
            var max_len_difference_int = int.Parse(max_len_difference, System.Globalization.CultureInfo.InvariantCulture);
            var minSimilarity          = decimal.Parse(min_similarity_str, System.Globalization.CultureInfo.InvariantCulture);

            // a missing or blank output folder means "print to the console"
            var output_folder = args.Length > 7 ? args[7] : "";
            var writeToFile   = !string.IsNullOrWhiteSpace(output_folder);
            var outputFolderFullName = writeToFile ? new DirectoryInfo(output_folder).FullName : null;

            var alignmentTypes = new List<ProteinBioClass.AlignmentType>();

            if (alignment_type_str_split.Contains("NMW"))
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.NMW);
            }
            if (alignment_type_str_split.Contains("SWM"))
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.SWM);
            }
            if (alignment_type_str_split.Contains("SIM"))
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.SIM);
            }
            if (alignment_type_str_split.Contains("NON") || alignmentTypes.Count == 0)
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.NON);
            }
            if (alignmentTypes.Count < alignment_type_str_split.Length)
            {
                Console.WriteLine("; unknown alignment type");
                return;
            }

            // load list of query sequences
            var queryIdParts = query_id_chain.Split(new char[] { ':' });
            var queryPdbid   = queryIdParts[0];
            // no chain part, or an empty one ("1A2G:"), selects every chain
            var queryChainid = (queryIdParts.Length > 1 && queryIdParts[1].Length > 0) ? queryIdParts[1][0] : '*';

            var querySeq     = Sequence.LoadSequenceFile(query_sequence_file, null);
            var queryResults = querySeq.Where(a =>
            {
                var id = new ProteinBioClass.SequenceId(a.Id);
                return((queryPdbid == "*" || string.Equals(id.PdbId, queryPdbid, StringComparison.OrdinalIgnoreCase)) &&
                       (queryChainid == '*' || id.ChainId == queryChainid));
            }).ToList();

            if (queryResults.Count == 0)
            {
                Console.WriteLine("; the query pdbids/chainids were not found");
                return;
            }

            // load list of target sequences
            var targetSeq = Sequence.LoadSequenceFile(target_sequence_file, new string[] { null, "", "protein" });

            targetSeq = targetSeq.Where(a => a.Count() >= 50).ToList();

            Console.WriteLine("; aligning " + queryResults.Count + " query sequences to " + targetSeq.Count + " target sequences");

            // number of chains per pdb id, for the query set and for the target set
            var queryPdbIds  = queryResults.Select(a => new ProteinBioClass.SequenceId(a.Id).PdbId);
            var targetPdbIds = targetSeq.Select(a => new ProteinBioClass.SequenceId(a.Id).PdbId);

            var queryPdbIdCounts = new Dictionary<string, int>();

            foreach (var x in queryPdbIds)
            {
                int seen;
                queryPdbIdCounts.TryGetValue(x, out seen); // seen is 0 when the key is new
                queryPdbIdCounts[x] = seen + 1;
            }

            var targetPdbIdCounts = new Dictionary<string, int>();

            foreach (var x in targetPdbIds)
            {
                int seen;
                targetPdbIdCounts.TryGetValue(x, out seen);
                targetPdbIdCounts[x] = seen + 1;
            }

            // perform alignment
            foreach (var _query in queryResults)
            {
                var _queryId = new ProteinBioClass.SequenceId(_query.Id);

                // only build a file name when there is a folder to write into
                var filename = writeToFile
                    ? Path.Combine(outputFolderFullName, "homologs_" + _queryId.PdbId + _queryId.ChainId + ".csv")
                    : null;

                // skip if already processed
                if (writeToFile && File.Exists(filename) && new FileInfo(filename).Length > 0)
                {
                    continue;
                }

                var totalQueryPdbIdChains = queryPdbIdCounts[_queryId.PdbId];

                WorkDivision wd = new WorkDivision(targetSeq.Count);

                for (var thread = 0; thread < wd.ThreadCount; thread++)
                {
                    // copy the loop state into locals so each task captures its own values
                    var query   = _query;
                    var queryId = _queryId;
                    var lti     = thread;
                    wd.TaskList.Add(Task.Run(() =>
                    {
                        var result = new List<HomologChain>();

                        for (var target = wd.ThreadFirstIndex[lti]; target <= wd.ThreadLastIndex[lti]; target++)
                        {
                            var targetobj = targetSeq[target];

                            if (max_len_difference_int != -1 && Math.Abs(targetobj.Count() - query.Count()) > max_len_difference_int)
                            {
                                continue;
                            }

                            var targetId = new ProteinBioClass.SequenceId(targetobj.Id);

                            foreach (var alignmentType in alignmentTypes)
                            {
                                var scores = ProteinBioClass.AlignedSequenceSimilarityPercentage(query, targetobj, alignmentType);

                                var percentSimilar = compare_physicochemically_bool ? scores.ScoreEvo : scores.Score;

                                if (percentSimilar >= minSimilarity)
                                {
                                    result.Add(new HomologChain(
                                                   queryId.PdbId, queryId.ChainId, totalQueryPdbIdChains,
                                                   targetId.PdbId, targetId.ChainId, targetPdbIdCounts[targetId.PdbId],

                                                   alignmentType.ToString(),
                                                   scores.Score,
                                                   scores.ScoreEvo));
                                }
                            }
                        }

                        return(result);
                    }));
                }

                wd.WaitAllTasks();

                var mergedlist = new List<string>();

                mergedlist.Add("; " + _queryId.PdbId + ":" + _queryId.ChainId);
                mergedlist.Add(String.Join(",",
                                           new string[]
                {
                    "query pdb id", "query chain id", "query chains",
                    "target pdb id", "target chain id", "target chains",

                    "alignment method", "sequence similarity", "sequence evo similarity"
                }));

                foreach (var t in wd.TaskList)
                {
                    var tc = t as Task<List<HomologChain>>;

                    if (tc == null)
                    {
                        throw new InvalidOperationException("task in tasklist was null");
                    }

                    mergedlist.AddRange(tc.Result.Select(a => a.ToString()).ToList());
                }

                if (writeToFile)
                {
                    File.WriteAllLines(filename, mergedlist);
                }
                else
                {
                    Console.WriteLine(String.Join(Environment.NewLine, mergedlist));
                }
            }
        }
        /// <summary>
        /// Clusters the distinct amino-acid sequences of <paramref name="seqList"/>: two clusters are
        /// merged whenever a pair of their sequences reaches both similarity thresholds.
        /// </summary>
        /// <param name="seqList">Sequences to cluster; identical sequences share one cluster entry.</param>
        /// <param name="alignmentIdentityOption">Passed through to every similarity calculation.</param>
        /// <param name="mininumClusterPairwiseSimilarity">Minimum Score for two sequences to be merged; also used as the minimum shorter/longer length ratio for a pair to be aligned at all.</param>
        /// <param name="mininumEvoClusterPairwiseSimilarity">Minimum ScoreEvo for two sequences to be merged.</param>
        /// <returns>One member per input chain, with a 1-based cluster number; clusters are numbered in ascending order of size.</returns>
        public static List <SequenceIdentityClusterMember> ClusterSequenceByAlignedSequenceIdentity(List <Sequence> seqList, ProteinBioClass.AlignmentIdentityOption alignmentIdentityOption, decimal mininumClusterPairwiseSimilarity = 0.3m, decimal mininumEvoClusterPairwiseSimilarity = 0.3m)
        {
            // (pdb id, chain id, escaped sequence) for every input chain
            var allsequences = seqList.Select(a =>
            {
                var id = new ProteinBioClass.SequenceId(a.Id);
                return new Tuple<string, char, string>(id.PdbId, id.ChainId, Sequence.EscapeAminoAcidSequence(a.FullSequence));
            }).ToList();

            // group chains by sequence; GroupBy keeps first-occurrence order, so sequences[i]
            // and sequenceIds[i] describe the same distinct sequence
            var sequenceGroups = allsequences.GroupBy(a => a.Item3).ToList();
            var sequences      = sequenceGroups.Select(g => g.Key).ToList();
            var sequenceIds    = sequenceGroups.Select(g => g.ToList()).ToList();

            // clusters hold indexes into 'sequences'; clusterOf[i] is the cluster that currently
            // contains sequence i, which replaces a linear search through every cluster
            var seqClusters = new List<List<int>>();
            var clusterOf   = new List<int>[sequences.Count];

            for (int x = 0; x < sequences.Count; x++)
            {
                var newCluster = new List<int>();
                newCluster.Add(x);
                seqClusters.Add(newCluster);
                clusterOf[x] = newCluster;
            }

            for (int indexX = 0; indexX < sequences.Count; indexX++)
            {
                Console.WriteLine("Aligning sequence " + indexX);
                var seqX = sequences[indexX];

                for (int indexY = indexX + 1; indexY < sequences.Count; indexY++)
                {
                    var seqY = sequences[indexY];

                    // sequences of very different length cannot reach the threshold; skip the alignment
                    if ((decimal)Math.Min(seqX.Length, seqY.Length) / (decimal)Math.Max(seqX.Length, seqY.Length) < mininumClusterPairwiseSimilarity)
                    {
                        continue;
                    }

                    var cluster1 = clusterOf[indexX];
                    var cluster2 = clusterOf[indexY];

                    // already in the same cluster: nothing to gain from aligning them
                    if (ReferenceEquals(cluster1, cluster2))
                    {
                        continue;
                    }

                    var score = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.NON, alignmentIdentityOption);

                    Console.WriteLine("1: " + seqX);
                    Console.WriteLine("2: " + seqY);
                    Console.WriteLine("Score1: " + score.Score);
                    Console.WriteLine("Score2: " + score.ScoreEvo);

                    // fall back to progressively more expensive alignments, keeping the best
                    // value seen so far for each of the two scores independently
                    if (score.Score < mininumClusterPairwiseSimilarity || score.ScoreEvo < mininumEvoClusterPairwiseSimilarity)
                    {
                        var x = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.SIM, alignmentIdentityOption);
                        if (x.Score > score.Score)
                        {
                            score.Score = x.Score;
                        }
                        if (x.ScoreEvo > score.ScoreEvo)
                        {
                            score.ScoreEvo = x.ScoreEvo;
                        }
                    }

                    if (score.Score < mininumClusterPairwiseSimilarity || score.ScoreEvo < mininumEvoClusterPairwiseSimilarity)
                    {
                        var x = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.NMW, alignmentIdentityOption);
                        if (x.Score > score.Score)
                        {
                            // raise Score only; replacing the whole result here could lower a
                            // ScoreEvo that an earlier alignment had already pushed higher
                            score.Score = x.Score;
                        }
                        if (x.ScoreEvo > score.ScoreEvo)
                        {
                            score.ScoreEvo = x.ScoreEvo;
                        }
                    }

                    if (score.Score >= mininumClusterPairwiseSimilarity && score.ScoreEvo >= mininumEvoClusterPairwiseSimilarity)
                    {
                        // merge the two clusters into a new one appended at the end of the list
                        var newCluster = new List<int>(cluster1.Count + cluster2.Count);

                        newCluster.AddRange(cluster1);
                        newCluster.AddRange(cluster2);

                        seqClusters.Remove(cluster1);
                        seqClusters.Remove(cluster2);

                        seqClusters.Add(newCluster);

                        foreach (var member in newCluster)
                        {
                            clusterOf[member] = newCluster;
                        }
                    }
                }
            }

            // smallest clusters first; OrderBy is stable, so equal-sized clusters keep their order
            seqClusters = seqClusters.OrderBy(a => a.Count).ToList();

            var output = new List<SequenceIdentityClusterMember>();

            for (var index = 0; index < seqClusters.Count; index++)
            {
                var seqCluster = seqClusters[index];
                foreach (var sequenceIndex in seqCluster)
                {
                    // every chain that carries this sequence becomes a member of the cluster
                    foreach (var id in sequenceIds[sequenceIndex])
                    {
                        output.Add(new SequenceIdentityClusterMember(index + 1, ProteinBioClass.PdbIdFromPdbFilename(id.Item1), id.Item2, id.Item3));
                    }
                }
            }

            return(output);
        }
        /// <summary>
        /// Loads homolog alignment records, groups chains into clusters of mutual homologs and
        /// writes one CSV section per cluster to standard output.
        /// </summary>
        /// <remarks>
        /// Two chains end up in the same cluster when a record links them with both alignment
        /// scores at or above the given thresholds; clusters sharing a chain are merged.
        /// For every cluster member the minimum/maximum similarity against the other members of
        /// the same cluster is computed (reported as -1 when there is nothing to compare with).
        /// </remarks>
        /// <param name="args">
        /// [0] folder containing the <c>homologs_?????.csv</c> files,
        /// [1] sequence (FASTA) file,
        /// [2] minimum alignment score,
        /// [3] minimum evolutionary alignment score.
        /// Both scores are written with '.' as decimal separator.
        /// </param>
        private static void Main(string[] args)
        {
            // this program will load the homolog list in csv format and for homologs of X sequence distance return a list of all partners
            // however, some partners may be duplicates, which cannot initially be removed, since they may bind differently in other instances
            // then, because of such cases, unique id to describe each protein must be created... this is slightly problematic because
            // close target homologs of proteins are also considered to be the same protein as the query protein
            // which means that they could exist for more than one query protein

            // FindHomologsCluster.exe c:\pdb\ds96ub_homologs\ c:\pdb\pdb_seqres.fasta 0.9 0.9 > ds96ub_homologs.csv

            if (args == null || args.Length < 4)
            {
                Console.Error.WriteLine("usage: FindHomologsCluster.exe <homolog csv folder> <sequence file> <min similarity> <min evo similarity>");
                Environment.ExitCode = 1;
                return;
            }

            var homologCsvFolder = args[0];
            var sequenceFile     = args[1];

            // The thresholds are machine-readable input ("0.9"); parsing them with the current
            // culture would turn "0.9" into 9 on systems that use ',' as decimal separator.
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;
            var minSimilarity    = decimal.Parse(args[2], invariantCulture);
            var minSimilarityEvo = decimal.Parse(args[3], invariantCulture);

            var seqList = Sequence.LoadSequenceFile(sequenceFile, new[] { null, "", "protein" });

            var homologCsvFiles = Directory.GetFiles(homologCsvFolder, "homologs_?????.csv");

            var parsedData = FindHomologs.FindHomologs.HomologChain.Load(homologCsvFiles);

            var homologsClustered = new List<List<Tuple<string, char>>>();

            foreach (var rec in parsedData)
            {
                if (!(rec.AlignmentScore >= minSimilarity && rec.AlignmentScoreEvo >= minSimilarityEvo))
                {
                    continue;
                }

                // First existing cluster (if any) that already contains the query / target chain.
                var queryGroup = homologsClustered.FirstOrDefault(
                    cluster => cluster.Any(b => string.Equals(b.Item1, rec.QueryPdbId, StringComparison.OrdinalIgnoreCase) && b.Item2 == rec.QueryChainId));

                var targetGroup = homologsClustered.FirstOrDefault(
                    cluster => cluster.Any(b => string.Equals(b.Item1, rec.TargetPdbId, StringComparison.OrdinalIgnoreCase) && b.Item2 == rec.TargetChainId));

                var mergedGroup = new List<Tuple<string, char>>();

                if (queryGroup != null)
                {
                    mergedGroup.AddRange(queryGroup);
                    homologsClustered.Remove(queryGroup);
                }
                else
                {
                    mergedGroup.Add(new Tuple<string, char>(rec.QueryPdbId, rec.QueryChainId));
                }

                if (targetGroup == null)
                {
                    mergedGroup.Add(new Tuple<string, char>(rec.TargetPdbId, rec.TargetChainId));
                }
                else if (!ReferenceEquals(targetGroup, queryGroup))
                {
                    // When both chains already share one cluster its members were copied above.
                    mergedGroup.AddRange(targetGroup);
                    homologsClustered.Remove(targetGroup);
                }

                mergedGroup = mergedGroup.Distinct().OrderBy(a => a.Item1).ThenBy(a => a.Item2).ToList();

                homologsClustered.Add(mergedGroup);
            }

            var seqListIds = seqList.Select(a => new ProteinBioClass.SequenceId(a.Id)).ToList();

            // Index of sequence positions by PDB id (case-insensitive), built once so that cluster
            // members do not each rescan the complete sequence list. Positions are stored in file
            // order, so the first matching chain is still the one that gets picked.
            var seqIndicesByPdbId = new Dictionary<string, List<int>>(StringComparer.OrdinalIgnoreCase);
            for (var j = 0; j < seqListIds.Count; j++)
            {
                var pdbId = seqListIds[j].PdbId;
                if (pdbId == null)
                {
                    // an entry without a PDB id can never match a cluster member
                    continue;
                }

                List<int> indices;
                if (!seqIndicesByPdbId.TryGetValue(pdbId, out indices))
                {
                    indices = new List<int>();
                    seqIndicesByPdbId.Add(pdbId, indices);
                }
                indices.Add(j);
            }

            // Returns the first sequence whose id matches the given (pdb id, chain id), or null.
            // Only reads the index, so it is shared by all worker tasks below.
            Func<Tuple<string, char>, Sequence> findSequence = member =>
            {
                List<int> candidates;
                if (!seqIndicesByPdbId.TryGetValue(member.Item1, out candidates))
                {
                    return null;
                }

                foreach (var candidate in candidates)
                {
                    if (seqListIds[candidate].ChainId == member.Item2)
                    {
                        return seqList[candidate];
                    }
                }

                return null;
            };

            // Outer level: clusters are divided between tasks.
            var wd2 = new WorkDivision(homologsClustered.Count);

            for (var thread2 = 0; thread2 < wd2.ThreadCount; thread2++)
            {
                // copy of the loop variable for the closure
                var lti2 = thread2;

                wd2.TaskList.Add(Task.Run(() =>
                {
                    var result2 = new List<string>();

                    for (var index2 = wd2.ThreadFirstIndex[lti2]; index2 <= wd2.ThreadLastIndex[lti2]; index2++)
                    {
                        var cluster2 = homologsClustered[index2];

                        // Inner level: members of one cluster are divided between tasks.
                        var wd3 = new WorkDivision(cluster2.Count);

                        for (var thread3 = 0; thread3 < wd3.ThreadCount; thread3++)
                        {
                            // copies of the loop variables for the closure
                            var lti3     = thread3;
                            var cluster3 = cluster2;
                            var index4   = index2;

                            wd3.TaskList.Add(Task.Run(() =>
                            {
                                var result = new List<HomologClusterData>();

                                for (var index3 = wd3.ThreadFirstIndex[lti3]; index3 <= wd3.ThreadLastIndex[lti3]; index3++)
                                {
                                    var item = cluster3[index3];

                                    var s = findSequence(item);
                                    if (s == null)
                                    {
                                        throw new InvalidOperationException("sequence not found for " + item.Item1 + ":" + item.Item2);
                                    }

                                    // number of sequence entries sharing this member's PDB id
                                    List<int> chainIndices;
                                    var complexChains = seqIndicesByPdbId.TryGetValue(item.Item1, out chainIndices) ? chainIndices.Count : 0;

                                    // -1 marks "no value yet"; it is also what gets reported when
                                    // the member has no other cluster member to compare against
                                    var minAlignmentScore = -1m;
                                    var maxAlignmentScore = -1m;

                                    var minAlignmentScoreEvo = -1m;
                                    var maxAlignmentScoreEvo = -1m;

                                    foreach (var item2 in cluster3)
                                    {
                                        if (ReferenceEquals(item, item2))
                                        {
                                            continue;
                                        }

                                        var s2 = findSequence(item2);
                                        if (s2 == null)
                                        {
                                            continue;
                                        }

                                        var alignmentScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(s,
                                                                                                                 s2,
                                                                                                                 ProteinBioClass.AlignmentType.NMW);

                                        if (alignmentScore.Score > maxAlignmentScore || maxAlignmentScore == -1m)
                                        {
                                            maxAlignmentScore = alignmentScore.Score;
                                        }
                                        if (alignmentScore.Score < minAlignmentScore || minAlignmentScore == -1m)
                                        {
                                            minAlignmentScore = alignmentScore.Score;
                                        }

                                        if (alignmentScore.ScoreEvo > maxAlignmentScoreEvo || maxAlignmentScoreEvo == -1m)
                                        {
                                            maxAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                        }
                                        if (alignmentScore.ScoreEvo < minAlignmentScoreEvo || minAlignmentScoreEvo == -1m)
                                        {
                                            minAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                        }
                                    }

                                    var r = new HomologClusterData(index4 + 1, index3 + 1, item.Item1, item.Item2, complexChains, Convert.ToInt32(s.Count()), minAlignmentScore, maxAlignmentScore, minAlignmentScoreEvo, maxAlignmentScoreEvo, s.FullSequence);

                                    result.Add(r);
                                }

                                return result;
                            }));
                        }
                        wd3.WaitAllTasks();

                        result2.Add("; Cluster # " + (index2 + 1) + " with " + wd3.ItemsToProcess + " protein chains");
                        result2.Add("cluster index,item index,pdb id,chain id,complex chains,seq len,min clstr sid,max clstr sid,min evo clstr sid,max evo clstr sid,sequence");

                        // tasks are read back in the order they were added
                        foreach (var task in wd3.TaskList)
                        {
                            var tr = task as Task<List<HomologClusterData>>;
                            if (tr == null || tr.Result == null)
                            {
                                continue;
                            }
                            result2.AddRange(tr.Result.Select(a => a.ToString()).ToList());
                        }

                        result2.Add("");
                    }

                    return result2;
                }));
            }
            wd2.WaitAllTasks();

            var result1 = new List<string>();

            foreach (var task in wd2.TaskList)
            {
                var tr = task as Task<List<string>>;
                if (tr == null || tr.Result == null)
                {
                    continue;
                }
                result1.AddRange(tr.Result);
            }

            foreach (var line in result1)
            {
                Console.WriteLine(line);
            }
            // partners may have other interfaces, should those also be considered?
        }