// Builds one interface record per (pdb id, chain, contacting chain) from the C-alpha atoms of the
// pdb files in C:\ds96ub_homologs\ and the matching atom-pair contact files, attaches up to two
// partner interfaces to every record, groups the interfaces of each homolog cluster by best
// pairwise NMW similarity and writes the report to ds96ub_homologs_interfaces.csv.
// args is not used; all paths are hard-coded.
static void Main(string[] args)
{
    var pdbFolder = @"C:\ds96ub_homologs\";
    var homologClusterData = FindHomologsCluster.FindHomologsCluster.HomologClusterData.Load(@"c:\ds96ub_homologs\ds96ub_homologs_0.7.csv");

    var pdbFiles = Directory.GetFiles(pdbFolder, "*.pdb", SearchOption.TopDirectoryOnly);
    var pdbIdList = pdbFiles.Select(ProteinBioClass.PdbIdFromPdbFilename).ToList();

    // only ca-atoms and endmdls (TER records are not kept); lines too short to hold the
    // atom-name columns are skipped instead of raising an index error
    var pdbAtomsText = pdbFiles.Select(
        a => File.ReadAllLines(a)
             .Where(b => (b.StartsWith("ATOM ", StringComparison.Ordinal) && b.Length > 14 && b[13] == 'C' && b[14] == 'A') ||
                         b.StartsWith("ENDMDL ", StringComparison.Ordinal))
             .ToList()).ToList();

    // only first nmr model: keep every record that precedes the first ENDMDL marker
    pdbAtomsText = pdbAtomsText.Select(a =>
    {
        var endOfFirstModel = a.FindIndex(b => b.StartsWith("ENDMDL ", StringComparison.Ordinal));
        // endOfFirstModel equals the number of records before the marker, so it is the count to keep
        // (the previous "x - 1" dropped the last C-alpha and threw when the marker came first)
        return endOfFirstModel == -1 ? a : a.GetRange(0, endOfFirstModel);
    }).ToList();

    var pdbAtoms = pdbAtomsText.Select(a => a.Select(b => new ATOM_Record(b)).ToList()).ToList();

    // get list of unique chain ids per pdb file (Distinct is applied per file so the outer list
    // stays index-aligned with pdbIdList)
    var pdbChainIds = pdbAtoms.Select(a => a.Select(b => char.ToUpperInvariant(b.chainID.FieldValue[0])).Distinct().ToList()).ToList();

    var pdbIdChainIdList = new List<Tuple<string, char>>();
    for (var i = 0; i < pdbIdList.Count; i++)
    {
        var pdbIdForFile = pdbIdList[i];
        pdbIdChainIdList.AddRange(pdbChainIds[i].Select(chainId => new Tuple<string, char>(pdbIdForFile, chainId)));
    }
    pdbIdChainIdList = pdbIdChainIdList.Distinct().ToList();

    // for each chain: load the contacts that touch the chain and orient every pair so that
    // Atom1 is the atom on that chain. The contact file is loaded again for every chain of the
    // same pdb id because the pairs are modified in place by SwapAtoms.
    var pdbContacts = pdbIdChainIdList.Select(a =>
    {
        var contacts = ProteinBioClass.AtomPair.LoadAtomPairList(@"C:\ds96ub_homologs\contacts\contacts_pdb" + a.Item1.ToUpperInvariant() + ".pdb")
            .Where(b => char.ToUpperInvariant(b.Atom1.chainID.FieldValue[0]) == a.Item2 || char.ToUpperInvariant(b.Atom2.chainID.FieldValue[0]) == a.Item2)
            .Select(c =>
            {
                if (char.ToUpperInvariant(c.Atom1.chainID.FieldValue[0]) != a.Item2)
                {
                    c.SwapAtoms();
                }
                return c;
            }).ToList();
        return contacts;
    }).ToList();

    // res min, res max, best min, best max, interface aa, interface mask
    var pdbInterfaces = new List<Ds93UbInterface>();
    var interface_target_length = 50;

    for (int index = 0; index < pdbContacts.Count; index++)
    {
        var pdbId = pdbIdChainIdList[index].Item1;
        var chainId = pdbIdChainIdList[index].Item2;
        var pdbContact = pdbContacts[index];

        if (pdbContact.Count == 0)
        {
            continue;
        }

        var contactChains = pdbContact.Where(a => char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0]) != chainId)
                                      .Select(a => char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0]))
                                      .Distinct()
                                      .ToList();

        foreach (var contactChain in contactChains)
        {
            // residue numbers on this chain that are in contact with contactChain (one entry per contact)
            var res_seq = pdbContact.Where(a => char.ToUpperInvariant(a.Atom1.chainID.FieldValue[0]) == chainId &&
                                                char.ToUpperInvariant(a.Atom2.chainID.FieldValue[0]) == contactChain)
                                    .Select(a => int.Parse(a.Atom1.resSeq.FieldValue))
                                    .ToList();

            var min_res_seq = res_seq.Min();
            var max_res_seq = res_seq.Max();

            var best50_min = int.MinValue;
            var best50_max = int.MinValue;
            var best50_interactions = int.MinValue;

            if (Math.Abs(max_res_seq - min_res_seq) <= interface_target_length)
            {
                // the whole contact span already fits into the target length
                best50_min = min_res_seq;
                best50_max = max_res_seq;
                best50_interactions = res_seq.Count;
            }
            else
            {
                // slide a window of interface_target_length residues over the contact span and keep
                // the window holding the most contacts; windows tied for the best count are collected
                // so that the middle one can be chosen afterwards
                var best50_middle_finder = new List<Tuple<int, int, int>>();

                for (var x = min_res_seq - interface_target_length; x <= max_res_seq; x++)
                {
                    var min = x;
                    var max = Math.Min(x + interface_target_length, max_res_seq);

                    // count the contacts inside the candidate window (not inside the previous best one)
                    var best50 = res_seq.Count(a => a >= min && a <= max);

                    if (best50_interactions == int.MinValue || best50 > best50_interactions)
                    {
                        best50_middle_finder.Clear();
                        best50_middle_finder.Add(new Tuple<int, int, int>(min, max, best50));
                        best50_min = min;
                        best50_max = max;
                        best50_interactions = best50;
                    }
                    else if (best50 == best50_interactions)
                    {
                        best50_middle_finder.Add(new Tuple<int, int, int>(min, max, best50));
                    }

                    // stop once the window has reached the last contact residue; comparing against
                    // the clamped window end was always true and ended the search after one step
                    if (x + interface_target_length >= max_res_seq)
                    {
                        break;
                    }
                }

                if (best50_middle_finder.Count > 2)
                {
                    var middle = best50_middle_finder[best50_middle_finder.Count / 2];
                    best50_min = middle.Item1;
                    best50_max = middle.Item2;
                    best50_interactions = middle.Item3;
                }
            }

            // C-alpha atoms of this chain inside the chosen window, ordered by residue number
            var best50_interface_atoms = pdbAtoms[pdbIdList.IndexOf(pdbId)].Where(a =>
            {
                var l = int.Parse(a.resSeq.FieldValue);
                var c = char.ToUpperInvariant(a.chainID.FieldValue[0]);
                return c == chainId && l >= best50_min && l <= best50_max;
            }).OrderBy(c => int.Parse(c.resSeq.FieldValue)).ToList();

            var best50_interface = string.Join("", best50_interface_atoms.Select(b => AminoAcidConversions.AminoAcidNameToCode1L(b.resName.FieldValue)).ToList());

            // one mask character per interface atom, decided by that atom's own residue number, so
            // the mask stays aligned with the atoms even when residue numbering has gaps or the
            // window starts before the first residue of the chain
            // NOTE(review): assumes AminoAcidNameToCode1L yields exactly one character per residue - confirm
            var contactResSeq = new HashSet<int>(res_seq);
            var best50_mask = string.Join("", best50_interface_atoms.Select(b => contactResSeq.Contains(int.Parse(b.resSeq.FieldValue)) ? "X" : "_"));

            pdbInterfaces.Add(new Ds93UbInterface(pdbId, chainId, contactChain, min_res_seq, max_res_seq, best50_min, best50_max, best50_interactions, best50_interface, best50_mask, -1, "", "", 0, -1, "", "", 0));
        }
    }

    var homologClusterIndexes = homologClusterData.Select(a => a.ClusterIndex).Distinct().ToList();
    var homologClusters = homologClusterIndexes.Select(a => homologClusterData.Where(b => b.ClusterIndex == a).ToList()).ToList();

    // interfaces of every homolog cluster; this list is index-aligned with homologClusterIndexes
    var pdbInterfacesSorted = homologClusters.Select(a => pdbInterfaces.Where(b => a.Any(c => c.PdbId == b.PdbId && (char.ToUpperInvariant(c.ChainId) == b.ChainId1 || char.ToUpperInvariant(c.ChainId) == b.ChainId2))).ToList()).ToList();

    var outputData = new List<string>();

    for (var clusterPosition = 0; clusterPosition < homologClusterIndexes.Count; clusterPosition++)
    {
        var clusterIndex = homologClusterIndexes[clusterPosition];

        // look the cluster up by position rather than by "clusterIndex - 1", which is only
        // correct when the cluster indexes in the csv are exactly 1..N in order
        var cluster = pdbInterfacesSorted[clusterPosition];

        // currently, cluster is a list of chain1-->chain2 interfaces ... so the 'chain2' interface needs adding to the record
        foreach (var inf1 in cluster)
        {
            var partner = cluster.Where(a => a != inf1 && a.PdbId == inf1.PdbId && inf1.ChainId2 == a.ChainId1)
                                 .OrderByDescending(a => InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, a.MinResSeq, a.MaxResSeq))
                                 .ToList();

            var first = partner.FirstOrDefault();
            if (first != null)
            {
                inf1.Partner1InterfaceAminoAcids = first.InterfaceAminoAcids;
                inf1.Partner1InterfaceInteractionsMask = first.InterfaceInteractionsMask;
                inf1.Partner1InterfaceOverlap = InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, first.MinResSeq, first.MaxResSeq);
            }

            var second = partner.ElementAtOrDefault(1);
            if (second != null)
            {
                inf1.Partner2InterfaceAminoAcids = second.InterfaceAminoAcids;
                inf1.Partner2InterfaceInteractionsMask = second.InterfaceInteractionsMask;
                inf1.Partner2InterfaceOverlap = InterfaceOverlap(inf1.MinResSeq, inf1.MaxResSeq, second.MinResSeq, second.MaxResSeq);
            }
        }

        // keep only interfaces that overlap at least one partner and are at least 5 residues long
        cluster = cluster.Where(a => a.Partner1InterfaceOverlap > 0 || a.Partner2InterfaceOverlap > 0).ToList();
        cluster = cluster.Where(a => a.InterfaceAminoAcids.Length >= 5).ToList();

        var clusterInterfaces = cluster.Select(a => a.InterfaceAminoAcids).ToList();

        // group every interface sequence together with its most similar other sequence
        var homologInterfaces = new List<List<string>>();

        foreach (var inf1 in clusterInterfaces)
        {
            var highest_score = decimal.MinValue;
            string highest_inf = null;

            foreach (var inf2 in clusterInterfaces)
            {
                if (inf1 == inf2)
                {
                    continue;
                }

                var score = ProteinBioClass.AlignedSequenceSimilarityPercentage(inf1, inf2, ProteinBioClass.AlignmentType.NMW);

                if (score.Score > highest_score)
                {
                    highest_score = score.Score;
                    highest_inf = inf2;
                }
            }

            // highest_inf stays null when the cluster holds no second, different sequence;
            // a null entry is never stored in a group
            var y = homologInterfaces.FirstOrDefault(a => a.Contains(inf1) || (highest_inf != null && a.Contains(highest_inf)));

            if (y == null)
            {
                y = new List<string>();
                homologInterfaces.Add(y);
            }

            if (!y.Contains(inf1))
            {
                y.Add(inf1);
            }

            if (highest_inf != null && !y.Contains(highest_inf))
            {
                y.Add(highest_inf);
            }
        }

        foreach (var c in cluster)
        {
            c.Partner1ClusterIndex = homologInterfaces.FindIndex(b => b.Contains(c.Partner1InterfaceAminoAcids));
            c.Partner2ClusterIndex = homologInterfaces.FindIndex(b => b.Contains(c.Partner2InterfaceAminoAcids));
        }

        for (int index = 0; index < homologInterfaces.Count; index++)
        {
            var homologInterface = homologInterfaces[index];

            var cluster2 = cluster.Where(a => homologInterface.Contains(a.InterfaceAminoAcids))
                                  .OrderBy(a => a.Partner1ClusterIndex)
                                  .ThenBy(a => a.Partner2ClusterIndex)
                                  .ThenBy(a => a.InterfaceAminoAcids)
                                  .ThenBy(a => a.Partner1InterfaceAminoAcids)
                                  .ThenBy(a => a.Partner2InterfaceAminoAcids)
                                  .ToList();

            // keep one record per distinct (interface, partner 1, partner 2) sequence triple
            var partners = cluster2.Select(a => new Tuple<string, string, string>(a.InterfaceAminoAcids, a.Partner1InterfaceAminoAcids, a.Partner2InterfaceAminoAcids)).Distinct();

            cluster2 = partners.Select(a => cluster2.FirstOrDefault(b => b.InterfaceAminoAcids == a.Item1 &&
                                                                         b.Partner1InterfaceAminoAcids == a.Item2 &&
                                                                         b.Partner2InterfaceAminoAcids == a.Item3)).ToList();

            outputData.Add("cluster " + clusterIndex + "." + index);
            outputData.AddRange(cluster2.Select(a => a.ToString()).ToList());
            outputData.Add("");
        }
    }

    File.WriteAllLines(@"c:\ds96ub_homologs\ds96ub_homologs_interfaces.csv", outputData);
}
// this program takes a fasta or pdb file and finds all matching homologs
// FindHomologs.exe "c:\ds96ub\ds96ub.fasta" * "c:\pdb\pdb_seqres.fasta" NMW Y 0.3 75 c:\pdb\
//
// args[0] query sequence file
// args[1] query pdb id and optional chain id (1A2G:B); * matches everything
// args[2] target sequence file
// args[3] alignment types: any of NMW,SWM,SIM,NON separated by , ; space or tab; * selects all four
// args[4] Y to threshold on the evo score (ScoreEvo), anything else to threshold on Score
// args[5] minimum similarity (invariant culture, e.g. 0.3)
// args[6] maximum sequence length difference, -1 disables the length filter
// args[7] output folder (optional); when missing or blank the results are printed to the console
//
// One file homologs_<pdbid><chainid>.csv is written per query chain; a query whose file already
// exists and is non-empty is skipped.
static void Main(string[] args)
{
    if (args == null || args.Length < 7)
    {
        Console.WriteLine("; usage: FindHomologs.exe <query sequence file> <pdbid[:chainid] or *> <target sequence file> <NMW,SWM,SIM,NON or *> <Y/N> <min similarity> <max length difference or -1> [output folder]");
        return;
    }

    var query_sequence_file = args[0]; //query.fasta
    var query_id_chain = args[1]; //1A2G:B
    var target_sequence_file = args[2]; //targets.fasta
    var alignment_type_str = args[3]; //NMW,SWM,SIM,NON
    var compare_physicochemically_bool = string.Equals(args[4], "Y", StringComparison.OrdinalIgnoreCase); //Y/N
    var min_similarity_str = args[5]; // 0.3
    var max_len_difference = args[6];
    var output_folder = args.Length > 7 ? args[7] : "";

    // parse with the invariant culture so that "0.3" means the same on every machine
    decimal minSimilarity;
    if (!decimal.TryParse(min_similarity_str, System.Globalization.NumberStyles.Number, System.Globalization.CultureInfo.InvariantCulture, out minSimilarity))
    {
        Console.WriteLine("; invalid minimum similarity: " + min_similarity_str);
        return;
    }

    int max_len_difference_int;
    if (!int.TryParse(max_len_difference, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out max_len_difference_int))
    {
        Console.WriteLine("; invalid maximum length difference: " + max_len_difference);
        return;
    }

    if (alignment_type_str == "*")
    {
        alignment_type_str = "NMW,SWM,SIM,NON";
    }

    // blank tokens and repeated names are ignored so that they are not mistaken for unknown types
    var alignment_type_str_split = alignment_type_str.ToUpperInvariant()
                                                     .Split(new char[] { ',', ';', ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries)
                                                     .Distinct()
                                                     .ToList();

    var knownAlignmentTypes = new string[] { "NMW", "SWM", "SIM", "NON" };
    if (alignment_type_str_split.Any(a => !knownAlignmentTypes.Contains(a)))
    {
        Console.WriteLine("; unknown alignment type");
        return;
    }

    // the types are always run in this fixed order, whatever order they were given in
    var alignmentTypes = new List<ProteinBioClass.AlignmentType>();
    if (alignment_type_str_split.Contains("NMW"))
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.NMW);
    }
    if (alignment_type_str_split.Contains("SWM"))
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.SWM);
    }
    if (alignment_type_str_split.Contains("SIM"))
    {
        alignmentTypes.Add(ProteinBioClass.AlignmentType.SIM);
    }
    if (alignment_type_str_split.Contains("NON") || alignmentTypes.Count == 0)
    {
        // NON is also the default when no alignment type was named at all
        alignmentTypes.Add(ProteinBioClass.AlignmentType.NON);
    }

    // load list of query sequences
    var query_id_parts = query_id_chain.Split(new char[] { ':' });
    var queryPdbid = query_id_parts[0];
    // a missing or empty chain part ("1A2G" or "1A2G:") selects every chain
    var queryChainid = (query_id_parts.Length > 1 && query_id_parts[1].Length > 0) ? query_id_parts[1][0] : '*';
    var queryPdbidUpper = queryPdbid.ToUpperInvariant();

    var querySeq = Sequence.LoadSequenceFile(query_sequence_file, null);

    var queryResults = querySeq.Where(a =>
    {
        var id = new ProteinBioClass.SequenceId(a.Id);
        return (queryPdbid == "*" || id.PdbId.ToUpperInvariant() == queryPdbidUpper) &&
               (queryChainid == '*' || id.ChainId == queryChainid);
    }).ToList();

    if (queryResults.Count == 0)
    {
        Console.WriteLine("; the query pdbids/chainids were not found");
        return;
    }

    // load list of target sequences (targets shorter than 50 are ignored)
    var targetSeq = Sequence.LoadSequenceFile(target_sequence_file, new string[] { null, "", "protein" });
    targetSeq = targetSeq.Where(a => a.Count() >= 50).ToList();

    Console.WriteLine("; aligning " + queryResults.Count + " query sequences to " + targetSeq.Count + " target sequences");

    // number of sequences per pdb id, for the "query chains" / "target chains" columns
    var queryPdbIdCounts = new Dictionary<string, int>();
    foreach (var queryPdbId in queryResults.Select(a => new ProteinBioClass.SequenceId(a.Id).PdbId))
    {
        int queryChainCount;
        queryPdbIdCounts.TryGetValue(queryPdbId, out queryChainCount);
        queryPdbIdCounts[queryPdbId] = queryChainCount + 1;
    }

    var targetPdbIdCounts = new Dictionary<string, int>();
    foreach (var targetPdbId in targetSeq.Select(a => new ProteinBioClass.SequenceId(a.Id).PdbId))
    {
        int targetChainCount;
        targetPdbIdCounts.TryGetValue(targetPdbId, out targetChainCount);
        targetPdbIdCounts[targetPdbId] = targetChainCount + 1;
    }

    // resolve the output folder once; a blank folder means "print to the console", which must be
    // decided before DirectoryInfo is asked to resolve an empty path
    var writeToConsole = string.IsNullOrWhiteSpace(output_folder);
    string outputFolderFullName = null;
    if (!writeToConsole)
    {
        outputFolderFullName = new DirectoryInfo(output_folder).FullName;
        // create the folder up front so a missing folder is not discovered only after the alignments have run
        Directory.CreateDirectory(outputFolderFullName);
    }

    foreach (var _query in queryResults)
    {
        var _queryId = new ProteinBioClass.SequenceId(_query.Id);

        var filename = writeToConsole ? null : Path.Combine(outputFolderFullName, "homologs_" + _queryId.PdbId + _queryId.ChainId + ".csv");

        // skip if already processed
        if (filename != null && File.Exists(filename) && new FileInfo(filename).Length > 0)
        {
            continue;
        }

        var totalQueryPdbIdChains = queryPdbIdCounts[_queryId.PdbId];

        // perform alignment: the target list is split into index ranges, one task per range
        WorkDivision wd = new WorkDivision(targetSeq.Count);

        for (var thread = 0; thread < wd.ThreadCount; thread++)
        {
            var query = _query;
            var queryId = _queryId;
            var lti = thread;

            wd.TaskList.Add(Task.Run(() =>
            {
                var result = new List<HomologChain>();
                var queryLength = query.Count();

                for (var target = wd.ThreadFirstIndex[lti]; target <= wd.ThreadLastIndex[lti]; target++)
                {
                    var targetobj = targetSeq[target];

                    if (max_len_difference_int != -1 && Math.Abs(targetobj.Count() - queryLength) > max_len_difference_int)
                    {
                        continue;
                    }

                    var targetId = new ProteinBioClass.SequenceId(targetobj.Id);

                    foreach (var alignmentType in alignmentTypes)
                    {
                        var scores = ProteinBioClass.AlignedSequenceSimilarityPercentage(query, targetobj, alignmentType);

                        var percentSimilar = compare_physicochemically_bool ? scores.ScoreEvo : scores.Score;

                        if (percentSimilar >= minSimilarity)
                        {
                            result.Add(new HomologChain(
                                queryId.PdbId, queryId.ChainId, totalQueryPdbIdChains,
                                targetId.PdbId, targetId.ChainId, targetPdbIdCounts[targetId.PdbId],
                                alignmentType.ToString(), scores.Score, scores.ScoreEvo));
                        }
                    }
                }

                return result;
            }));
        }

        wd.WaitAllTasks();

        var mergedlist = new List<string>();
        mergedlist.Add("; " + _queryId.PdbId + ":" + _queryId.ChainId);
        mergedlist.Add(String.Join(",", new string[]
        {
            "query pdb id", "query chain id", "query chains",
            "target pdb id", "target chain id", "target chains",
            "alignment method", "sequence similarity", "sequence evo similarity"
        }));

        foreach (var t in wd.TaskList)
        {
            var tc = t as Task<List<HomologChain>>;

            if (tc == null)
            {
                throw new InvalidOperationException("task in tasklist was null");
            }

            mergedlist.AddRange(tc.Result.Select(a => a.ToString()).ToList());
        }

        if (writeToConsole)
        {
            Console.WriteLine(String.Join(Environment.NewLine, mergedlist));
        }
        else
        {
            File.WriteAllLines(filename, mergedlist);
        }
    }
}
// Clusters sequences by pairwise aligned similarity (single linkage: two clusters are merged as
// soon as one pair of their members passes both thresholds).
//
// seqList                               sequences to cluster; identical escaped sequences are aligned once
//                                       and share a cluster
// alignmentIdentityOption               passed through to AlignedSequenceSimilarityPercentage
// mininumClusterPairwiseSimilarity      minimum Score for a pair to be merged; also used as the minimum
//                                       shorter/longer length ratio before a pair is aligned at all
// mininumEvoClusterPairwiseSimilarity   minimum ScoreEvo for a pair to be merged
//
// Returns one member per input sequence id; cluster indexes are 1-based and clusters are numbered
// from the smallest to the largest. Progress and scores are written to the console.
// Throws ArgumentNullException when seqList is null.
public static List<SequenceIdentityClusterMember> ClusterSequenceByAlignedSequenceIdentity(List<Sequence> seqList, ProteinBioClass.AlignmentIdentityOption alignmentIdentityOption, decimal mininumClusterPairwiseSimilarity = 0.3m, decimal mininumEvoClusterPairwiseSimilarity = 0.3m)
{
    if (seqList == null)
    {
        throw new ArgumentNullException("seqList");
    }

    // (pdb id, chain id, escaped sequence) for every input sequence
    var allsequences = seqList.Select(a =>
    {
        var id = new ProteinBioClass.SequenceId(a.Id);
        return new Tuple<string, char, string>(id.PdbId, id.ChainId, Sequence.EscapeAminoAcidSequence(a.FullSequence));
    }).ToList();

    var sequences = allsequences.Select(a => a.Item3).Distinct().ToList();

    // all ids sharing a sequence, in input order
    var idsBySequence = allsequences.ToLookup(a => a.Item3);

    // start with one cluster per distinct sequence
    var seqClusters = sequences.Select(a => new List<string> { a }).ToList();

    for (int indexX = 0; indexX < sequences.Count; indexX++)
    {
        Console.WriteLine("Aligning sequence " + indexX);

        var seqX = sequences[indexX];

        for (int indexY = indexX + 1; indexY < sequences.Count; indexY++)
        {
            var seqY = sequences[indexY];

            // cheap pre-filter on the length ratio before any alignment is attempted
            if ((decimal)Math.Min(seqX.Length, seqY.Length) / (decimal)Math.Max(seqX.Length, seqY.Length) < mininumClusterPairwiseSimilarity)
            {
                continue;
            }

            // looked up on every iteration because earlier merges replace the cluster lists
            var cluster1 = seqClusters.FirstOrDefault(a => a.Contains(seqX));
            var cluster2 = seqClusters.FirstOrDefault(a => a.Contains(seqY));

            if (cluster1 != null && cluster2 != null && cluster1 == cluster2)
            {
                continue;
            }

            var score = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.NON, alignmentIdentityOption);

            Console.WriteLine("1: " + seqX);
            Console.WriteLine("2: " + seqY);
            Console.WriteLine("Score1: " + score.Score);
            Console.WriteLine("Score2: " + score.ScoreEvo);

            // fall back to SIM and then NMW while a threshold is still missed, keeping the best
            // value seen so far for each of the two scores independently
            if (score.Score < mininumClusterPairwiseSimilarity || score.ScoreEvo < mininumEvoClusterPairwiseSimilarity)
            {
                var simScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.SIM, alignmentIdentityOption);

                if (simScore.Score > score.Score)
                {
                    score.Score = simScore.Score;
                }
                if (simScore.ScoreEvo > score.ScoreEvo)
                {
                    score.ScoreEvo = simScore.ScoreEvo;
                }
            }

            if (score.Score < mininumClusterPairwiseSimilarity || score.ScoreEvo < mininumEvoClusterPairwiseSimilarity)
            {
                var nmwScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(seqX, seqY, ProteinBioClass.AlignmentType.NMW, alignmentIdentityOption);

                // same per-score maximum as the SIM fallback; replacing the whole score object here
                // could throw away a better ScoreEvo found by an earlier alignment
                if (nmwScore.Score > score.Score)
                {
                    score.Score = nmwScore.Score;
                }
                if (nmwScore.ScoreEvo > score.ScoreEvo)
                {
                    score.ScoreEvo = nmwScore.ScoreEvo;
                }
            }

            if (score.Score >= mininumClusterPairwiseSimilarity && score.ScoreEvo >= mininumEvoClusterPairwiseSimilarity)
            {
                var newCluster = new List<string>();
                newCluster.AddRange(cluster1);
                newCluster.AddRange(cluster2);

                seqClusters.Remove(cluster1);
                seqClusters.Remove(cluster2);
                seqClusters.Add(newCluster);
            }
        }
    }

    seqClusters = seqClusters.OrderBy(a => a.Count).ToList();

    var output = new List<SequenceIdentityClusterMember>();

    for (var index = 0; index < seqClusters.Count; index++)
    {
        foreach (var item in seqClusters[index])
        {
            foreach (var id in idsBySequence[item])
            {
                output.Add(new SequenceIdentityClusterMember(index + 1, ProteinBioClass.PdbIdFromPdbFilename(id.Item1), id.Item2, id.Item3));
            }
        }
    }

    return output;
}
// this program will load the homolog list in csv format and for homologs of X sequence distance return a list of all partners
// however, some partners may be duplicates, which cannot initially be removed, since they may bind differently in other instances
// then, because of such cases, unique id to describe each protein must be created... this is slightly problematic because
// close target homologs of proteins are also considered to be the same protein as the query protein
// which means that they could exist for more than one query protein
//
// FindHomologsCluster.exe c:\pdb\ds96ub_homologs\ c:\pdb\pdb_seqres.fasta 0.9 0.9 > ds96ub_homologs.csv
//
// args[0] folder holding the homologs_?????.csv files
// args[1] sequence file
// args[2] minimum alignment score for two chains to be linked (invariant culture, e.g. 0.9)
// args[3] minimum evo alignment score for two chains to be linked
//
// The clusters are written to standard output; usage and parse errors go to standard error so
// that they do not end up in a redirected csv.
private static void Main(string[] args)
{
    if (args == null || args.Length < 4)
    {
        Console.Error.WriteLine("usage: FindHomologsCluster.exe <homolog csv folder> <sequence file> <min similarity> <min evo similarity>");
        return;
    }

    var homolog_csv_folder = args[0];
    var sequence_file = args[1];
    var min_similarity_str = args[2];
    var min_similarity_evo_str = args[3];

    // parse with the invariant culture so that "0.9" means the same on every machine
    decimal min_similarity;
    decimal min_similarity_evo;
    if (!decimal.TryParse(min_similarity_str, System.Globalization.NumberStyles.Number, System.Globalization.CultureInfo.InvariantCulture, out min_similarity) ||
        !decimal.TryParse(min_similarity_evo_str, System.Globalization.NumberStyles.Number, System.Globalization.CultureInfo.InvariantCulture, out min_similarity_evo))
    {
        Console.Error.WriteLine("invalid minimum similarity: " + min_similarity_str + " / " + min_similarity_evo_str);
        return;
    }

    var seqList = Sequence.LoadSequenceFile(sequence_file, new[] { null, "", "protein" });

    var homologCsvFiles = Directory.GetFiles(homolog_csv_folder, "homologs_?????.csv");
    var parsedData = FindHomologs.FindHomologs.HomologChain.Load(homologCsvFiles);

    // every record that passes both thresholds links its query chain and its target chain;
    // linked chains end up in the same group
    var homologs_clustered = new List<List<Tuple<string, char>>>();

    foreach (var rec in parsedData)
    {
        if (!(rec.AlignmentScore >= min_similarity && rec.AlignmentScoreEvo >= min_similarity_evo))
        {
            continue;
        }

        var queryPdbIdUpper = rec.QueryPdbId.ToUpperInvariant();
        var targetPdbIdUpper = rec.TargetPdbId.ToUpperInvariant();

        var query_group = homologs_clustered.FirstOrDefault(c => c.Any(b => b.Item1.ToUpperInvariant() == queryPdbIdUpper && b.Item2 == rec.QueryChainId));
        var target_group = homologs_clustered.FirstOrDefault(c => c.Any(b => b.Item1.ToUpperInvariant() == targetPdbIdUpper && b.Item2 == rec.TargetChainId));

        var new_group = new List<Tuple<string, char>>();

        if (query_group != null)
        {
            new_group.AddRange(query_group);
            homologs_clustered.Remove(query_group);
            // emptied so that nothing is added twice when both chains were already in the same group
            query_group.Clear();
        }
        else
        {
            new_group.Add(new Tuple<string, char>(rec.QueryPdbId, rec.QueryChainId));
        }

        if (target_group != null)
        {
            new_group.AddRange(target_group);
            homologs_clustered.Remove(target_group);
            target_group.Clear();
        }
        else
        {
            new_group.Add(new Tuple<string, char>(rec.TargetPdbId, rec.TargetChainId));
        }

        new_group = new_group.Distinct().OrderBy(a => a.Item1).ThenBy(a => a.Item2).ToList();

        homologs_clustered.Add(new_group);
    }

    // index the sequence list once instead of scanning it for every cluster member and every pair:
    // first sequence per (upper-case pdb id, chain id) and number of sequences per upper-case pdb id.
    // Both dictionaries are only read once the tasks below start.
    var seq_list_ids = seqList.Select(a => new ProteinBioClass.SequenceId(a.Id)).ToList();
    var sequenceByChain = new Dictionary<Tuple<string, char>, Sequence>();
    var chainsPerPdbId = new Dictionary<string, int>();

    for (var j = 0; j < seqList.Count; j++)
    {
        if (seq_list_ids[j].PdbId == null)
        {
            continue;
        }

        var pdbIdUpper = seq_list_ids[j].PdbId.ToUpperInvariant();
        var chainKey = new Tuple<string, char>(pdbIdUpper, seq_list_ids[j].ChainId);

        // keep the first match, as a front-to-back scan would
        if (!sequenceByChain.ContainsKey(chainKey))
        {
            sequenceByChain.Add(chainKey, seqList[j]);
        }

        int chainCount;
        chainsPerPdbId.TryGetValue(pdbIdUpper, out chainCount);
        chainsPerPdbId[pdbIdUpper] = chainCount + 1;
    }

    // outer tasks: ranges of clusters; inner tasks: ranges of members of one cluster
    var wd2 = new WorkDivision(homologs_clustered.Count);

    for (var thread2 = 0; thread2 < wd2.ThreadCount; thread2++)
    {
        var lti2 = thread2;

        wd2.TaskList.Add(Task.Run(() =>
        {
            var result2 = new List<string>();

            for (var index2 = wd2.ThreadFirstIndex[lti2]; index2 <= wd2.ThreadLastIndex[lti2]; index2++)
            {
                var cluster2 = homologs_clustered[index2];

                var wd3 = new WorkDivision(cluster2.Count);

                for (var thread3 = 0; thread3 < wd3.ThreadCount; thread3++)
                {
                    var lti3 = thread3;
                    var cluster3 = cluster2;
                    var index4 = index2;

                    wd3.TaskList.Add(Task.Run(() =>
                    {
                        var result = new List<HomologClusterData>();

                        for (var index3 = wd3.ThreadFirstIndex[lti3]; index3 <= wd3.ThreadLastIndex[lti3]; index3++)
                        {
                            var item = cluster3[index3];
                            var itemPdbIdUpper = item.Item1.ToUpperInvariant();

                            Sequence s;
                            if (!sequenceByChain.TryGetValue(new Tuple<string, char>(itemPdbIdUpper, item.Item2), out s))
                            {
                                throw new Exception("sequence not found for " + item.Item1 + ":" + item.Item2);
                            }

                            int complexChains;
                            chainsPerPdbId.TryGetValue(itemPdbIdUpper, out complexChains);

                            // -1 marks "no other cluster member compared yet"
                            var minAlignmentScore = -1m;
                            var maxAlignmentScore = -1m;
                            var minAlignmentScoreEvo = -1m;
                            var maxAlignmentScoreEvo = -1m;

                            foreach (var item2 in cluster3)
                            {
                                if (ReferenceEquals(item, item2))
                                {
                                    continue;
                                }

                                Sequence s2;
                                if (!sequenceByChain.TryGetValue(new Tuple<string, char>(item2.Item1.ToUpperInvariant(), item2.Item2), out s2))
                                {
                                    // members without a sequence are left out of the min/max statistics
                                    continue;
                                }

                                var alignmentScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(s, s2, ProteinBioClass.AlignmentType.NMW);

                                if (alignmentScore.Score > maxAlignmentScore || maxAlignmentScore == -1m)
                                {
                                    maxAlignmentScore = alignmentScore.Score;
                                }
                                if (alignmentScore.Score < minAlignmentScore || minAlignmentScore == -1m)
                                {
                                    minAlignmentScore = alignmentScore.Score;
                                }
                                if (alignmentScore.ScoreEvo > maxAlignmentScoreEvo || maxAlignmentScoreEvo == -1m)
                                {
                                    maxAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                }
                                if (alignmentScore.ScoreEvo < minAlignmentScoreEvo || minAlignmentScoreEvo == -1m)
                                {
                                    minAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                }
                            }

                            result.Add(new HomologClusterData(index4 + 1, index3 + 1, item.Item1, item.Item2, complexChains, Convert.ToInt32(s.Count()), minAlignmentScore, maxAlignmentScore, minAlignmentScoreEvo, maxAlignmentScoreEvo, s.FullSequence));
                        }

                        return result;
                    }));
                }

                wd3.WaitAllTasks();

                result2.Add("; Cluster # " + (index2 + 1) + " with " + wd3.ItemsToProcess + " protein chains");
                result2.Add("cluster index,item index,pdb id,chain id,complex chains,seq len,min clstr sid,max clstr sid,min evo clstr sid,max evo clstr sid,sequence");

                foreach (var task in wd3.TaskList)
                {
                    var tr = task as Task<List<HomologClusterData>>;
                    if (tr == null || tr.Result == null)
                    {
                        continue;
                    }
                    result2.AddRange(tr.Result.Select(a => a.ToString()).ToList());
                }

                result2.Add("");
            }

            return result2;
        }));
    }

    wd2.WaitAllTasks();

    // print the per-task results in task order
    foreach (var task in wd2.TaskList)
    {
        var tr = task as Task<List<string>>;
        if (tr == null || tr.Result == null)
        {
            continue;
        }

        foreach (var line in tr.Result)
        {
            Console.WriteLine(line);
        }
    }

    // partners may have other interfaces, should those also be considered?
}