Пример #1
0
        /// <summary>
        ///     Removes the sequences whose PDB id does not occur exactly <paramref name="numberOfChainsRequired" /> times
        ///     in the (non-distinct) PDB id list derived from <paramref name="sequenceList" />.
        /// </summary>
        /// <param name="cancellationToken">Token observed by the worker tasks and forwarded to <c>RemoveSequences</c>.</param>
        /// <param name="sequenceList">The sequences to filter. Must not be null or empty.</param>
        /// <param name="numberOfChainsRequired">The number of occurrences a PDB id must have for its sequences to be kept.</param>
        /// <param name="progressActionSet">Progress callbacks. Must not be null even though the parameter declares a null default.</param>
        /// <returns>The list returned by <c>RemoveSequences</c> for the offending PDB ids.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="sequenceList" /> is null or empty.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="progressActionSet" /> is null.</exception>
        public static List<ISequence> RemoveSequencesWithIncorrectNumberOfChains(CancellationToken cancellationToken, List<ISequence> sequenceList, int numberOfChainsRequired = 2, ProgressActionSet progressActionSet = null)
        {
            if (sequenceList == null || sequenceList.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequenceList));
            }

            if (progressActionSet == null)
            {
                throw new ArgumentNullException(nameof(progressActionSet));
            }

            var pdbIdListNotDistinct = FilterProteins.SequenceListToPdbIdList(sequenceList, false);

            // Group the ids once so that each occurrence count is a lookup instead of a full scan of the list.
            // The lookup is built before any task starts and is only read afterwards.
            var occurrencesByPdbId = pdbIdListNotDistinct.ToLookup(pdbId => pdbId);

            ProgressActionSet.StartAction(pdbIdListNotDistinct.Count, progressActionSet);

            var workDivision = new WorkDivision<List<string>>(pdbIdListNotDistinct.Count);

            for (var threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so the closure does not observe later increments.
                var localThreadIndex = threadIndex;

                Task<List<string>> task = Task.Run(() =>
                {
                    var taskResult = new List<string>();

                    // Each worker handles only its own inclusive index range.
                    for (int pdbIdIndex = workDivision.ThreadFirstIndex[localThreadIndex]; pdbIdIndex <= workDivision.ThreadLastIndex[localThreadIndex]; pdbIdIndex++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        var pdbId = pdbIdListNotDistinct[pdbIdIndex];

                        workDivision.IncrementItemsCompleted(1);
                        ProgressActionSet.ProgressAction(1, progressActionSet);
                        ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);

                        if (occurrencesByPdbId[pdbId].Count() != numberOfChainsRequired)
                        {
                            taskResult.Add(pdbId);
                        }
                    }

                    return taskResult;
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();

            var sequencesWithIncorrectNumberOfChains = new List<string>();

            // Check the task status before touching Result: reading Result on a cancelled or faulted task throws.
            foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
            {
                sequencesWithIncorrectNumberOfChains.AddRange(task.Result);
            }

            var result = RemoveSequences(cancellationToken, sequenceList, sequencesWithIncorrectNumberOfChains);

            ProgressActionSet.FinishAction(true, progressActionSet);

            return result;
        }
Пример #2
0
        /// <summary>
        ///     Scans the white-listed PDB files and collects the PDB ids whose chain count, as reported by
        ///     <c>ProteinDataBankFileOperations.PdbAtomicChainsCount</c>, differs from <paramref name="numberChainsRequired" />.
        ///     The method itself removes nothing; it only returns the offending ids.
        /// </summary>
        /// <param name="cancellationToken">Token checked before each file is processed.</param>
        /// <param name="pdbFolders">Folders searched for PDB files. Must not be null or empty.</param>
        /// <param name="pdbIdList">White list of PDB ids to inspect. Must not be null or empty even though the parameter declares a null default.</param>
        /// <param name="pdbIdChainIdList">Optional map from PDB id to chain ids, forwarded to the chain counter.</param>
        /// <param name="numberChainsRequired">The chain count a structure must have to be left out of the result.</param>
        /// <param name="progressActionSet">Progress callbacks. Must not be null even though the parameter declares a null default.</param>
        /// <param name="totalThreads">Thread count passed to <c>WorkDivision</c>.</param>
        /// <returns>The distinct PDB ids whose chain count differs from <paramref name="numberChainsRequired" />.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="pdbFolders" /> or <paramref name="pdbIdList" /> is null or empty.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="progressActionSet" /> is null.</exception>
        public static List<string> RemoveStructuresWithIncorrectNumberOfChains(CancellationToken cancellationToken, string[] pdbFolders, List<string> pdbIdList = null, Dictionary<string, List<string>> pdbIdChainIdList = null, int numberChainsRequired = 2, ProgressActionSet progressActionSet = null, int totalThreads = -1)
        {
            if (pdbFolders == null || pdbFolders.Length == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(pdbFolders));
            }

            if (pdbIdList == null || pdbIdList.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(pdbIdList));
            }

            if (progressActionSet == null)
            {
                throw new ArgumentNullException(nameof(progressActionSet));
            }

            var pdbFilesArray = ProteinDataBankFileOperations.GetPdbFilesArray(pdbFolders);

            pdbFilesArray = ProteinDataBankFileOperations.RemoveNonWhiteListedPdbIdFromPdbFilesArray(pdbIdList, pdbFilesArray);

            ProteinDataBankFileOperations.ShowMissingPdbFiles(pdbFilesArray, pdbIdList, progressActionSet);

            // Hash the white list once; the workers only read it.
            var whiteListedPdbIds = new HashSet<string>(pdbIdList);

            var workDivision = new WorkDivision<List<string>>(pdbFilesArray.Length, totalThreads);

            ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet);

            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so the closure does not observe later increments.
                int localThreadIndex = threadIndex;

                Task<List<string>> task = Task.Run(() =>
                {
                    var taskResult = new List<string>();

                    for (int pdbFileNumber = workDivision.ThreadFirstIndex[localThreadIndex]; pdbFileNumber <= workDivision.ThreadLastIndex[localThreadIndex]; pdbFileNumber++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        try
                        {
                            string pdbFilename = pdbFilesArray[pdbFileNumber];
                            string proteinId = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename);

                            // Check if the file found is included in the white list.
                            if (!whiteListedPdbIds.Contains(proteinId))
                            {
                                continue;
                            }

                            // Restrict the count to the known chain ids of this protein when a map was supplied.
                            string[] sequenceChainIdList = null;
                            List<string> chainIds;

                            if (pdbIdChainIdList != null && proteinId != null && pdbIdChainIdList.TryGetValue(proteinId, out chainIds))
                            {
                                sequenceChainIdList = chainIds.ToArray();
                            }

                            int chainCount = ProteinDataBankFileOperations.PdbAtomicChainsCount(pdbFilename, sequenceChainIdList, numberChainsRequired);

                            if (chainCount != numberChainsRequired && !taskResult.Contains(proteinId))
                            {
                                taskResult.Add(proteinId);
                            }
                        }
                        finally
                        {
                            // Progress is reported for every file, including skipped ones and ones that threw.
                            workDivision.IncrementItemsCompleted(1);

                            ProgressActionSet.ProgressAction(1, progressActionSet);
                            ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                        }
                    }

                    return taskResult;
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();

            ProgressActionSet.FinishAction(true, progressActionSet);

            var result = new List<string>();

            // Check the task status before touching Result: reading Result on a cancelled or faulted task throws.
            foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
            {
                result.AddRange(task.Result);
            }

            // The same id can be reported by more than one worker.
            return result.Distinct().ToList();
        }
Пример #3
0
        /// <summary>
        ///     Removes the sequences whose alphabet is not <c>Alphabets.Protein</c> (e.g. DNA, RNA, hybrid).
        ///     The PDB ids of the offending sequences are collected and handed to <c>RemoveSequences</c>.
        /// </summary>
        /// <param name="cancellationToken">Token checked before each sequence is processed and forwarded to <c>RemoveSequences</c>.</param>
        /// <param name="sequences">The sequences to filter. Must not be null or empty.</param>
        /// <param name="progressActionSet">Progress callbacks. Must not be null.</param>
        /// <param name="totalThreads">Thread count passed to <c>WorkDivision</c>.</param>
        /// <returns>The list returned by <c>RemoveSequences</c> for the non-protein PDB ids.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="sequences" /> is null or empty.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="progressActionSet" /> is null.</exception>
        public static List<ISequence> RemoveNonProteinAlphabetSequences(CancellationToken cancellationToken, List<ISequence> sequences, ProgressActionSet progressActionSet, int totalThreads = -1)
        {
            if (sequences == null || sequences.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequences));
            }

            if (progressActionSet == null)
            {
                throw new ArgumentNullException(nameof(progressActionSet));
            }

            var workDivision = new WorkDivision<List<string>>(sequences.Count, totalThreads);

            ProgressActionSet.StartAction(sequences.Count, progressActionSet);

            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so the closure does not observe later increments.
                int localThreadIndex = threadIndex;

                Task<List<string>> task = Task.Run(() =>
                {
                    var taskResult = new List<string>();

                    for (int index = workDivision.ThreadFirstIndex[localThreadIndex]; index <= workDivision.ThreadLastIndex[localThreadIndex]; index++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        var sequence = sequences[index];
                        string proteinId = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequence.ID).PdbId;

                        if (sequence.Alphabet != Alphabets.Protein)
                        {
                            taskResult.Add(proteinId);
                        }

                        workDivision.IncrementItemsCompleted(1);
                        ProgressActionSet.ProgressAction(1, progressActionSet);
                        ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                    }

                    return taskResult;
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();

            var nonProteinPdbIds = new List<string>();

            // Check the task status before touching Result: reading Result on a cancelled or faulted task throws.
            foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
            {
                nonProteinPdbIds.AddRange(task.Result);
            }

            // Several chains of one structure report the same PDB id.
            nonProteinPdbIds = nonProteinPdbIds.Distinct().ToList();

            return RemoveSequences(cancellationToken, sequences, nonProteinPdbIds);
        }
Пример #4
0
        /// <summary>
        ///     Removes sequences from <paramref name="sequences" /> according to whether their PDB id appears in
        ///     <paramref name="sequencesToKeepOrRemove" />. The list is modified in place and the same instance is returned.
        /// </summary>
        /// <param name="cancellationToken">Token passed to the worker tasks when they are scheduled.</param>
        /// <param name="sequences">The sequences to filter. Must not be null or empty.</param>
        /// <param name="sequencesToKeepOrRemove">PDB ids to match against. Must not be null; may be empty.</param>
        /// <param name="options">
        ///     <c>RemoveSequencesInList</c> removes sequences whose PDB id is listed;
        ///     <c>RemoveSequencesNotInList</c> removes sequences whose PDB id is not listed.
        ///     Any other value removes nothing.
        /// </param>
        /// <param name="totalThreads">Thread count passed to <c>WorkDivision</c>.</param>
        /// <returns>The <paramref name="sequences" /> instance after the removals.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        ///     <paramref name="sequences" /> is null or empty, or <paramref name="sequencesToKeepOrRemove" /> is null.
        /// </exception>
        public static List<ISequence> RemoveSequences(CancellationToken cancellationToken, List<ISequence> sequences, List<string> sequencesToKeepOrRemove, RemoveSequencesOptions options = RemoveSequencesOptions.RemoveSequencesInList, int totalThreads = -1)
        {
            if (sequences == null || sequences.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequences));
            }

            if (sequencesToKeepOrRemove == null)
            {
                // Exception type kept as-is for callers that already catch it.
                throw new ArgumentOutOfRangeException(nameof(sequencesToKeepOrRemove));
            }

            // Hash the ids once so each membership test is a lookup instead of a list scan.
            var listedPdbIds = new HashSet<string>(sequencesToKeepOrRemove);

            bool removeListed = options == RemoveSequencesOptions.RemoveSequencesInList;
            bool removeUnlisted = options == RemoveSequencesOptions.RemoveSequencesNotInList;

            var workDivision = new WorkDivision<List<int>>(sequences.Count, totalThreads);

            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so the closure does not observe later increments.
                int localThreadIndex = threadIndex;

                Task<List<int>> task = Task.Run(() =>
                {
                    var taskResult = new List<int>();

                    for (int sequencesIndex = workDivision.ThreadFirstIndex[localThreadIndex]; sequencesIndex <= workDivision.ThreadLastIndex[localThreadIndex]; sequencesIndex++)
                    {
                        string proteinId = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndex].ID).PdbId;
                        bool isListed = listedPdbIds.Contains(proteinId);

                        if ((removeListed && isListed) || (removeUnlisted && !isListed))
                        {
                            taskResult.Add(sequencesIndex);
                        }

                        workDivision.IncrementItemsCompleted(1);
                    }

                    return taskResult;
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();

            var sequenceIndexesToRemove = new List<int>();

            // Check the task status before touching Result: reading Result on a cancelled or faulted task throws.
            foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
            {
                sequenceIndexesToRemove.AddRange(task.Result);
            }

            sequenceIndexesToRemove = sequenceIndexesToRemove.Distinct().ToList();
            sequenceIndexesToRemove.Sort();

            // Remove from the highest index down so the remaining indexes stay valid.
            for (int i = sequenceIndexesToRemove.Count - 1; i >= 0; i--)
            {
                sequences.RemoveAt(sequenceIndexesToRemove[i]);
            }

            return sequences;
        }
Пример #5
0
        /// <summary>
        ///     Intended to remove duplicate sequences. Groups the sequences by PDB id and walks every pair of groups
        ///     in parallel, but the comparison itself is not implemented.
        ///     NOTE(review): in the visible code nothing is ever added to <c>remove</c> or <c>done</c>, so
        ///     <c>RemoveSequences</c> is always called with an empty id list. The original algorithm is preserved
        ///     in the commented-out block inside the loop.
        /// </summary>
        /// <param name="cancellationToken">Token checked before each PDB id is processed and forwarded to <c>RemoveSequences</c>.</param>
        /// <param name="sequences">The sequences to filter. Must not be null or empty.</param>
        /// <param name="progressActionSet">Progress callbacks. Must not be null.</param>
        /// <param name="totalThreads">Thread count passed to <c>WorkDivision</c>.</param>
        /// <returns>The list returned by <c>RemoveSequences</c>.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="sequences" /> is null or empty.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="progressActionSet" /> is null.</exception>
        public static List <ISequence> RemoveDuplicates(CancellationToken cancellationToken, List <ISequence> sequences, ProgressActionSet progressActionSet, int totalThreads = -1)
        {
            if (sequences == null || sequences.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequences));
            }

            if (progressActionSet == null)
            {
                throw new ArgumentNullException(nameof(progressActionSet));
            }

            // pdbIdSequences[i] holds the sequences whose PDB id equals pdbIdList[i].
            var pdbIdList      = SequenceListToPdbIdList(sequences);
            var pdbIdSequences = pdbIdList.Select(a => sequences.Where(b => SequenceIdSplit.SequenceIdToPdbIdAndChainId(b.ID).PdbId == a).ToList()).ToList();

            var workDivision = new WorkDivision(pdbIdList.Count, totalThreads);


            ProgressActionSet.StartAction(pdbIdList.Count, progressActionSet);

            // NOTE(review): done and remove are only read in the active code; removeLock is never used.
            var done       = new List <ISequence>();
            var remove     = new List <ISequence>();
            var removeLock = new object();



            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so the closure does not observe later increments.
                int localThreadIndex = threadIndex;

                var task = Task.Run(() =>
                {
                    for (int index = workDivision.ThreadFirstIndex[localThreadIndex]; index <= workDivision.ThreadLastIndex[localThreadIndex]; index++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        // NOTE(review): iterationPdbId is assigned but never read.
                        var iterationPdbId     = pdbIdList[index];
                        var iterationPdbIdSeqs = pdbIdSequences[index];// sequences.Where(a => SequenceIdSplit.SequenceIdToPdbIdAndChainId(a.ID).PdbId == pdbId).ToList();

                        //var seq = sequences[index];
                        //var seqid = SequenceIdSplit.SequenceIdToPdbIdAndChainId(seq.ID).PdbId.ToUpperInvariant();

                        // Skip groups whose sequences are all marked as done.
                        // NOTE(review): this continue also skips the progress updates at the end of the loop body.
                        lock (done)
                        {
                            if (iterationPdbIdSeqs.All(done.Contains))
                            {
                                continue;
                            }
                        }

                        // Visit every sequence pair between this group and each other group.
                        // NOTE(review): the innermost loop body is empty, so no comparison is performed.
                        foreach (var pdbIdSeqSet in pdbIdSequences)
                        {
                            if (pdbIdSeqSet == iterationPdbIdSeqs)
                            {
                                continue;
                            }

                            foreach (var pdbIdSeq in pdbIdSeqSet)
                            {
                                foreach (var iterationPdbIdSeq in iterationPdbIdSeqs)
                                {
                                }
                            }
                        }

                        // find sequences equal to the current iteration item
                        //var equalseq = sequences.Where(a => a.SequenceEqual(seq)).ToList();


                        /*
                         * var equalseq = sequences.Where(a => AlignedSequenceSimilarityPercentage(seq,a) >= 90).ToList();
                         *
                         *
                         *
                         * // get a list of pdbids, ordered alphabetically
                         * var equalseqids = equalseq.Select(p => SequenceIdSplit.SequenceIdToPdbIdAndChainId(p.ID).PdbId.ToUpperInvariant()).OrderBy(p => p).ToList();
                         *
                         * // one or more of the chains might have a difference sequence and so not in the list, update by the ids in the list
                         * //equalseq = sequences.Where(p => equalseqids.Contains(SequenceIdSplit.SequenceIdToPdbIdAndChainId(p.ID).PdbId.ToUpperInvariant())).ToList();
                         *
                         * // add this iteration item and all others with the same sequence to a list to skip in future
                         * lock (done)
                         * {
                         *  done.AddRange(equalseq);
                         * }
                         *
                         * // keep the very last item in the list and all with the same pdbid that it has
                         * var keepid = equalseqids.Last();
                         * var equalseqkeep = equalseq.Where(p => SequenceIdSplit.SequenceIdToPdbIdAndChainId(p.ID).PdbId.ToUpperInvariant() == keepid).ToList();
                         *
                         * // remove the sequences to keep from the removal list
                         * equalseq = equalseq.Where(a => !equalseqkeep.Contains(a)).ToList();
                         *
                         * lock (remove)
                         * {
                         *  remove.AddRange(equalseq);
                         * }
                         */
                        workDivision.IncrementItemsCompleted(1);
                        ProgressActionSet.ProgressAction(1, progressActionSet);
                        ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                    }
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();


            // NOTE(review): remove is never populated above, so remove2 is always empty here.
            var remove2 = remove.Distinct().ToList();


            // The ids are upper-cased before being handed over; presumably to match the id casing used elsewhere — TODO confirm.
            return(RemoveSequences(cancellationToken, sequences, remove2.Select(p => SequenceIdSplit.SequenceIdToPdbIdAndChainId(p.ID).PdbId.ToUpperInvariant()).ToList()));
        }
Пример #6
0
        /// <summary>
        ///     Classifies proteins by comparing every pair of sequences that share a PDB id with <c>FindDimerType</c>,
        ///     and collects the PDB ids into hetero-, homo- and homology-dimer lists.
        /// </summary>
        /// <param name="cancellationToken">Token checked before each sequence and before each pair comparison.</param>
        /// <param name="sequences">The sequences to classify. Must not be null or empty.</param>
        /// <param name="minimumHeterodimerSimilarityRequired">Threshold forwarded to <c>FindDimerType</c>.</param>
        /// <param name="minimumHomodimerSimiliarityRequired">Threshold forwarded to <c>FindDimerType</c>.</param>
        /// <param name="progressActionSet">Progress callbacks. Must not be null even though the parameter declares a null default.</param>
        /// <param name="totalThreads">Thread count passed to <c>WorkDivision</c>.</param>
        /// <returns>The three PDB id lists, each without duplicates.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="sequences" /> is null or empty.</exception>
        /// <exception cref="ArgumentNullException"><paramref name="progressActionSet" /> is null.</exception>
        public static DimerSequenceTypeCategories<string> SplitDimersHomoHetero(CancellationToken cancellationToken, List<ISequence> sequences, decimal minimumHeterodimerSimilarityRequired = 30.0m, decimal minimumHomodimerSimiliarityRequired = 90.0m, ProgressActionSet progressActionSet = null, int totalThreads = -1)
        {
            if (sequences == null || sequences.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(sequences));
            }

            if (progressActionSet == null)
            {
                throw new ArgumentNullException(nameof(progressActionSet));
            }

            var workDivision = new WorkDivision<DimerSequenceTypeCategories<string>>(sequences.Count, totalThreads);

            ProgressActionSet.StartAction(sequences.Count, progressActionSet);

            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so the closure does not observe later increments.
                int localThreadIndex = threadIndex;

                Task<DimerSequenceTypeCategories<string>> task = Task.Run(() =>
                {
                    var taskResult = new DimerSequenceTypeCategories<string>();

                    for (int sequencesIndexA = workDivision.ThreadFirstIndex[localThreadIndex]; sequencesIndexA <= workDivision.ThreadLastIndex[localThreadIndex]; sequencesIndexA++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        string proteinIdA = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndexA].ID).PdbId;

                        // Each unordered pair is compared once, so only indexes after A are visited.
                        for (int sequencesIndexB = sequencesIndexA + 1; sequencesIndexB < sequences.Count; sequencesIndexB++)
                        {
                            if (cancellationToken.IsCancellationRequested)
                            {
                                break;
                            }

                            // Only chains of the same protein are aligned against each other.
                            var proteinIdB = SequenceIdSplit.SequenceIdToPdbIdAndChainId(sequences[sequencesIndexB].ID).PdbId;

                            if (proteinIdA != proteinIdB)
                            {
                                continue;
                            }

                            var dimerType = FindDimerType(sequences[sequencesIndexA], sequences[sequencesIndexB], minimumHeterodimerSimilarityRequired, minimumHomodimerSimiliarityRequired);

                            if (dimerType == DimerType.HeteroDimer)
                            {
                                taskResult.HeteroDimerPdbIdList.Add(proteinIdA);
                            }
                            else if (dimerType == DimerType.HomoDimer)
                            {
                                taskResult.HomoDimerPdbIdList.Add(proteinIdA);
                            }
                            else if (dimerType == DimerType.HomologyDimer)
                            {
                                taskResult.HomologyDimerPdbIdList.Add(proteinIdA);
                            }
                        }

                        workDivision.IncrementItemsCompleted(1);
                        ProgressActionSet.ProgressAction(1, progressActionSet);
                        ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                    }

                    return taskResult;
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();

            var dimerSequenceTypeCategories = new DimerSequenceTypeCategories<string>();

            // The status is checked before Result because reading Result on a cancelled or faulted task throws.
            foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsFaulted && !t.IsCanceled && t.Result != null))
            {
                dimerSequenceTypeCategories.HeteroDimerPdbIdList.AddRange(task.Result.HeteroDimerPdbIdList);
                dimerSequenceTypeCategories.HomoDimerPdbIdList.AddRange(task.Result.HomoDimerPdbIdList);
                dimerSequenceTypeCategories.HomologyDimerPdbIdList.AddRange(task.Result.HomologyDimerPdbIdList);
            }

            // A protein id can be reported once per compared pair and by more than one worker.
            dimerSequenceTypeCategories.HeteroDimerPdbIdList = dimerSequenceTypeCategories.HeteroDimerPdbIdList.Distinct().ToList();
            dimerSequenceTypeCategories.HomoDimerPdbIdList = dimerSequenceTypeCategories.HomoDimerPdbIdList.Distinct().ToList();
            dimerSequenceTypeCategories.HomologyDimerPdbIdList = dimerSequenceTypeCategories.HomologyDimerPdbIdList.Distinct().ToList();

            return dimerSequenceTypeCategories;
        }
Пример #7
0
        /// <summary>
        ///     Computes, for every white-listed PDB file that yields exactly two non-empty atom chains, the average
        ///     interaction match percentage of the interactions found between the chains.
        ///     Files that do not yield two populated chains are skipped.
        /// </summary>
        /// <param name="cancellationToken">Token checked before each file is processed and forwarded to the interaction search.</param>
        /// <param name="maxAtomInterationDistance">Distance limit forwarded to <c>SearchInteractions.FindInteractions</c>.</param>
        /// <param name="pdbFolders">Folders searched for PDB files. Must not be null or empty.</param>
        /// <param name="pdbIdList">White list of PDB ids to process. Must not be null or empty even though the parameter declares a null default.</param>
        /// <param name="pdbIdChainIdList">Optional map from PDB id to chain ids, used when loading chains and searching interactions.</param>
        /// <param name="progressActionSet">Progress callbacks. Must not be null even though the parameter declares a null default.</param>
        /// <param name="totalThreads">Thread count passed to <c>WorkDivision</c>.</param>
        /// <returns>A map from protein id to its average interaction match percentage. When one id is produced more than once, the first value encountered is kept.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        ///     <paramref name="pdbFolders" /> or <paramref name="pdbIdList" /> is null or empty, or <paramref name="progressActionSet" /> is null.
        /// </exception>
        public static Dictionary<string, decimal> CalculateStructureSymmetry(CancellationToken cancellationToken, decimal maxAtomInterationDistance, string[] pdbFolders, List<string> pdbIdList = null, Dictionary<string, List<string>> pdbIdChainIdList = null, ProgressActionSet progressActionSet = null, int totalThreads = -1)
        {
            if (pdbFolders == null || pdbFolders.Length == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(pdbFolders));
            }

            if (pdbIdList == null || pdbIdList.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(pdbIdList));
            }

            if (progressActionSet == null)
            {
                // Sibling methods throw ArgumentNullException here; the type is kept for callers that already catch it.
                throw new ArgumentOutOfRangeException(nameof(progressActionSet));
            }

            const int requiredNumberOfChains = 2;

            string[] pdbFilesArray = ProteinDataBankFileOperations.RemoveNonWhiteListedPdbIdFromPdbFilesArray(pdbIdList, ProteinDataBankFileOperations.GetPdbFilesArray(pdbFolders));

            var workDivision = new WorkDivision<Dictionary<string, decimal>>(pdbFilesArray.Length, totalThreads);

            ProteinDataBankFileOperations.ShowMissingPdbFiles(pdbFilesArray, pdbIdList, progressActionSet);

            // Hash the white list once; the workers only read it.
            var whiteListedPdbIds = new HashSet<string>(pdbIdList);

            ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet);

            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                // Copy the loop variable so the closure does not observe later increments.
                int localThreadIndex = threadIndex;

                Task<Dictionary<string, decimal>> task = Task.Run(() =>
                {
                    var taskResult = new Dictionary<string, decimal>();

                    for (int pdbFileNumber = workDivision.ThreadFirstIndex[localThreadIndex]; pdbFileNumber <= workDivision.ThreadLastIndex[localThreadIndex]; pdbFileNumber++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        try
                        {
                            string pdbFilename = pdbFilesArray[pdbFileNumber];
                            string proteinId = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename);

                            // Check if the file found is included in the white list.
                            if (!whiteListedPdbIds.Contains(proteinId))
                            {
                                continue;
                            }

                            // Restrict chain loading to the known chain ids of this protein when a map was supplied.
                            string[] chainIdList = null;
                            List<string> chainIds;

                            if (pdbIdChainIdList != null && proteinId != null && pdbIdChainIdList.TryGetValue(proteinId, out chainIds))
                            {
                                chainIdList = chainIds.ToArray();
                            }

                            // Get atom chains.
                            ProteinChainListContainer proteinFileChains = ProteinDataBankFileOperations.PdbAtomicChains(pdbFilename, chainIdList, requiredNumberOfChains, requiredNumberOfChains, true);

                            // Skip structures that do not provide exactly two chains with at least one atom each.
                            if (proteinFileChains == null || proteinFileChains.ChainList == null || proteinFileChains.ChainList.Count != requiredNumberOfChains ||
                                proteinFileChains.ChainList[StaticValues.ChainA] == null || proteinFileChains.ChainList[StaticValues.ChainA].AtomList == null || proteinFileChains.ChainList[StaticValues.ChainA].AtomList.Count == 0 ||
                                proteinFileChains.ChainList[StaticValues.ChainB] == null || proteinFileChains.ChainList[StaticValues.ChainB].AtomList == null || proteinFileChains.ChainList[StaticValues.ChainB].AtomList.Count == 0)
                            {
                                continue;
                            }

                            // Accumulate the residue indexes of both sides of every interaction found.
                            var interactionMatchPercentage = new InteractionMatchPercentage(proteinId);

                            List<AtomPair> interactions = SearchInteractions.FindInteractions(cancellationToken, maxAtomInterationDistance, pdbFilename, pdbIdChainIdList);

                            interactionMatchPercentage.IncrementTotalInteractions(interactions.Count);

                            foreach (AtomPair interaction in interactions)
                            {
                                interactionMatchPercentage.AddResidueSequenceIndex(StaticValues.ChainA, interaction.Atom1.resSeq.FieldValue);
                                interactionMatchPercentage.AddResidueSequenceIndex(StaticValues.ChainB, interaction.Atom2.resSeq.FieldValue);
                            }

                            InteractionMatchPercentage.CalculatePercentageResult calculatedPercentage = interactionMatchPercentage.CalculatePercentage();

                            // The same protein id can come from more than one file; keep the first value instead of
                            // letting Dictionary.Add throw and fault the whole task.
                            if (!taskResult.ContainsKey(interactionMatchPercentage.ProteinId))
                            {
                                taskResult.Add(interactionMatchPercentage.ProteinId, calculatedPercentage.InteractionMatchPercentageAverage);
                            }
                        }
                        finally
                        {
                            // Progress is reported for every file, including skipped ones and ones that threw.
                            workDivision.IncrementItemsCompleted(1);

                            ProgressActionSet.ProgressAction(1, progressActionSet);
                            ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                        }
                    }

                    return taskResult;
                }, cancellationToken);

                workDivision.TaskList.Add(task);
            }

            workDivision.WaitAllTasks();

            var result = new Dictionary<string, decimal>();

            // Check the task status before touching Result: reading Result on a cancelled or faulted task throws.
            foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
            {
                foreach (var kvp in task.Result)
                {
                    // Two workers can report the same protein id; keep the first value instead of throwing.
                    if (!result.ContainsKey(kvp.Key))
                    {
                        result.Add(kvp.Key, kvp.Value);
                    }
                }
            }

            return result;
        }
        /// <summary>
        ///     Makes spreadsheets with scientific data outputs about given proteins.
        /// </summary>
        /// <param name="cancellationToken"></param>
        /// <param name="pdbFolders">The location of the PDB files</param>
        /// <param name="pdbIdList">The PDB files which should be used.</param>
        /// <param name="consoleTextBox"></param>
        /// <param name="progressBar">User proteinInterface progress bar for user feedback.</param>
        /// <param name="estimatedTimeRemainingLabel">User proteinInterface estimated time remaining label for user feedback.</param>
        /// <param name="requestedTotalThreads"></param>
        /// <returns>Returns the generated spreadsheets with scientific data.</returns>
        public static List <List <SpreadsheetCell[]> > MakeHomodimerStatisticsSpreadsheetsList(CancellationToken cancellationToken, decimal maxAtomInterationDistance, string[] pdbFolders, List <string> pdbIdList = null, Dictionary <string, List <string> > pdbIdChainIdList = null, ProgressActionSet progressActionSet = null, int requestedTotalThreads = -1)
        {
            if (pdbFolders == null || pdbFolders.Length == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(pdbFolders));
            }

            if (pdbIdList == null || pdbIdList.Count == 0)
            {
                throw new ArgumentOutOfRangeException(nameof(pdbIdList));
            }

            if (progressActionSet == null)
            {
                throw new ArgumentNullException(nameof(progressActionSet));
            }

            // This method creates:
            // 1. a list of interactions
            // 2. a list of symmetry percentages
            // 3. an "expected" heatmap by combining every possible a/b amino acid combination
            // 4. an actual heatmap for the proteinInterfaces
            // 5. normalised versions of both of the heatmaps

            string[] pdbFilesArray = ProteinDataBankFileOperations.RemoveNonWhiteListedPdbIdFromPdbFilesArray(pdbIdList, ProteinDataBankFileOperations.GetPdbFilesArray(pdbFolders));

            //var interactionRecordList = new List<ProteinInteractionRecord>();
            //var interactionMatchPercentageList = new List<InteractionMatchPercentage>();
            //var wholeProteinChainsAminoAcidCounter = new List<AminoAcidChainComposition>();
            //var interactionChainsAminoAcidCounter = new List<AminoAcidChainComposition>();
            //var interactionsAminoAcidToAminoAcidCounter = new AminoAcidPairCompositionMatrix();

            ////var wholeProteinAminoAcidToAminoAcidCounter2x2 = new AminoAcidPairCompositionMatrix(); // composition of every amino acid paired in every possible combination

            var workDivision = new WorkDivision <HomodimersStatisticsMinerTaskResult>(pdbFilesArray.Length, requestedTotalThreads);


            ProgressActionSet.StartAction(pdbFilesArray.Length, progressActionSet);



            int checkAllFilesProcessed     = 0;
            var lockCheckAllFilesProcessed = new object();

            var pdbFilesProcessed = new bool[pdbFilesArray.Length];

            Array.Clear(pdbFilesProcessed, 0, pdbFilesProcessed.Length);

            for (int threadIndex = 0; threadIndex < workDivision.ThreadCount; threadIndex++)
            {
                int localThreadIndex = threadIndex;

                Task <HomodimersStatisticsMinerTaskResult> task = Task.Run(() =>
                {
                    var result = new HomodimersStatisticsMinerTaskResult();

                    for (int pdbFileNumber = workDivision.ThreadFirstIndex[localThreadIndex]; pdbFileNumber <= workDivision.ThreadLastIndex[localThreadIndex]; pdbFileNumber++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        lock (lockCheckAllFilesProcessed)
                        {
                            checkAllFilesProcessed++;
                            pdbFilesProcessed[pdbFileNumber] = true;
                        }

                        try
                        {
                            string pdbFilename = pdbFilesArray[pdbFileNumber];
                            string proteinId   = ProteinDataBankFileOperations.PdbIdFromPdbFilename(pdbFilename);

                            // Check if the file found is included in the white list.
                            if (/*pdbIdList != null && */ !pdbIdList.Contains(proteinId))
                            {
                                ProgressActionSet.Report("Error: " + proteinId + " was not in the PDB ID white list.", progressActionSet);
                                continue;
                            }

                            List <AtomPair> interactions = SearchInteractions.FindInteractions(cancellationToken, maxAtomInterationDistance, pdbFilename, pdbIdChainIdList);

                            // Make a list to save interactions found.
                            var interactionMatchPercentage = new InteractionMatchPercentage(proteinId);

                            var chainAminoAcidCounterA1X1 = new AminoAcidChainComposition(proteinId, "A");
                            var chainAminoAcidCounterB1X1 = new AminoAcidChainComposition(proteinId, "B");

                            var chainInteractionAminoAcidCounterA = new AminoAcidChainComposition(proteinId, "A");
                            var chainInteractionAminoAcidCounterB = new AminoAcidChainComposition(proteinId, "B");

                            if (interactions != null && interactions.Count > 0)
                            {
                                interactionMatchPercentage.IncrementTotalInteractions(interactions.Count);

                                for (int interactionsIndex = 0; interactionsIndex < interactions.Count; interactionsIndex++)
                                {
                                    chainInteractionAminoAcidCounterA.IncrementAminoAcidCount(interactions[interactionsIndex].Atom1.resName.FieldValue);

                                    chainInteractionAminoAcidCounterB.IncrementAminoAcidCount(interactions[interactionsIndex].Atom2.resName.FieldValue);

                                    result.InteractionRecordList.Add(new ProteinInteractionRecord(proteinId, interactionsIndex + 1, interactions[interactionsIndex]));
                                    interactionMatchPercentage.AddResidueSequenceIndex(StaticValues.ChainA, interactions[interactionsIndex].Atom1.resSeq.FieldValue);
                                    interactionMatchPercentage.AddResidueSequenceIndex(StaticValues.ChainB, interactions[interactionsIndex].Atom2.resSeq.FieldValue);
                                    result.InteractionsAminoAcidToAminoAcidCounter.IncrementAminoAcidCount(interactions[interactionsIndex].Atom1.resName.FieldValue, interactions[interactionsIndex].Atom2.resName.FieldValue);
                                }
                            }

                            var chainIdList = pdbIdChainIdList != null ? (pdbIdChainIdList.ContainsKey(proteinId) ? pdbIdChainIdList[proteinId].ToArray() : null) : null;

                            ProteinChainListContainer proteinFileChains = ProteinDataBankFileOperations.PdbAtomicChains(pdbFilename, chainIdList, 2, 2, true);

                            if (proteinFileChains == null || proteinFileChains.ChainList == null || proteinFileChains.ChainList.Count != 2 ||
                                proteinFileChains.ChainList[StaticValues.ChainA] == null || proteinFileChains.ChainList[StaticValues.ChainA].AtomList == null || proteinFileChains.ChainList[StaticValues.ChainA].AtomList.Count == 0 ||
                                proteinFileChains.ChainList[StaticValues.ChainB] == null || proteinFileChains.ChainList[StaticValues.ChainB].AtomList == null || proteinFileChains.ChainList[StaticValues.ChainB].AtomList.Count == 0)
                            {
                                if (!File.Exists(pdbFilename))
                                {
                                    ProgressActionSet.Report("Error: " + pdbFilename + " (" + proteinId + ") file not found", progressActionSet);
                                }
                                else
                                {
                                    int proteinFileChainCount = ProteinDataBankFileOperations.PdbAtomicChainsCount(pdbFilename);
                                    ProgressActionSet.Report("Error: " + proteinId + " did not have exactly 2 chains (" + proteinFileChainCount + " chains found) - skipping.", progressActionSet);
                                }

                                continue;
                            }

                            // count total of how many of each type of amino acids are in Chain A.
                            for (int atomIndexA = 0; atomIndexA < proteinFileChains.ChainList[StaticValues.ChainA].AtomList.Count; atomIndexA++)
                            {
                                chainAminoAcidCounterA1X1.IncrementAminoAcidCount(proteinFileChains.ChainList[StaticValues.ChainA].AtomList[atomIndexA].resName.FieldValue);
                            }

                            // count total of how many of each type of amino acids are in Chain B.
                            for (int atomIndexB = 0; atomIndexB < proteinFileChains.ChainList[StaticValues.ChainB].AtomList.Count; atomIndexB++)
                            {
                                chainAminoAcidCounterB1X1.IncrementAminoAcidCount(proteinFileChains.ChainList[StaticValues.ChainB].AtomList[atomIndexB].resName.FieldValue);
                            }

                            interactionMatchPercentage.CalculatePercentage();
                            result.InteractionMatchPercentageList.Add(interactionMatchPercentage);
                            result.WholeProteinChainsAminoAcidCounter.Add(chainAminoAcidCounterA1X1);
                            result.WholeProteinChainsAminoAcidCounter.Add(chainAminoAcidCounterB1X1);
                            result.InteractionChainsAminoAcidCounter.Add(chainInteractionAminoAcidCounterA);
                            result.InteractionChainsAminoAcidCounter.Add(chainInteractionAminoAcidCounterB);
                        }
                        finally
                        {
                            workDivision.IncrementItemsCompleted(1);

                            ProgressActionSet.ProgressAction(1, progressActionSet);
                            ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                        }
                    }

                    return(result);
                }, cancellationToken);
                workDivision.TaskList.Add(task);
            }


            workDivision.WaitAllTasks();

            ProgressActionSet.FinishAction(true, progressActionSet);

            // merge all instances of the results
            var spreadsheetTaskResult = new HomodimersStatisticsMinerTaskResult();

            foreach (var task in workDivision.TaskList.Where(t => t != null && t.IsCompleted && !t.IsCanceled && !t.IsFaulted && t.Result != null))
            {
                if (task.Result.InteractionChainsAminoAcidCounter != null && task.Result.InteractionChainsAminoAcidCounter.Count > 0)
                {
                    spreadsheetTaskResult.InteractionChainsAminoAcidCounter.AddRange(task.Result.InteractionChainsAminoAcidCounter);
                }

                if (task.Result.InteractionMatchPercentageList != null && task.Result.InteractionMatchPercentageList.Count > 0)
                {
                    spreadsheetTaskResult.InteractionMatchPercentageList.AddRange(task.Result.InteractionMatchPercentageList);
                }

                if (task.Result.InteractionRecordList != null && task.Result.InteractionRecordList.Count > 0)
                {
                    spreadsheetTaskResult.InteractionRecordList.AddRange(task.Result.InteractionRecordList);
                }

                if (task.Result.InteractionsAminoAcidToAminoAcidCounter != null)
                {
                    if (task.Result.InteractionsAminoAcidToAminoAcidCounter.AminoAcidToAminoAcid != null)
                    {
                        foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                        {
                            var totalGroups = AminoAcidGroups.AminoAcidGroups.GetTotalSubgroups(enumAminoAcidGroups);

                            for (int x = 0; x < totalGroups; x++)
                            {
                                for (int y = 0; y < totalGroups; y++)
                                {
                                    spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter.AminoAcidToAminoAcid[(int)enumAminoAcidGroups][x, y] +=
                                        task.Result.InteractionsAminoAcidToAminoAcidCounter.AminoAcidToAminoAcid[(int)enumAminoAcidGroups][x, y];
                                }
                            }
                        }
                    }
                }

                if (task.Result.WholeProteinChainsAminoAcidCounter != null && task.Result.WholeProteinChainsAminoAcidCounter.Count > 0)
                {
                    spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter.AddRange(task.Result.WholeProteinChainsAminoAcidCounter);
                }
            }


            if (pdbFilesProcessed.Count(file => file == false) > 0)
            {
                ProgressActionSet.Report("ERROR: " + pdbFilesProcessed.Count(file => file == false) + " PDB FILES WERE SKIPPED! 0x01", progressActionSet);
            }
            else
            {
                ProgressActionSet.Report("CHECK: NO PDB FILES WERE SKIPPED! 0x01", progressActionSet);
            }

            if (checkAllFilesProcessed != pdbFilesArray.Length)
            {
                ProgressActionSet.Report("ERROR: " + (pdbFilesArray.Length - checkAllFilesProcessed) + " PDB FILES WERE SKIPPED! 0x02", progressActionSet);
            }
            else
            {
                ProgressActionSet.Report("CHECK: NO PDB FILES WERE SKIPPED! 0x02", progressActionSet);
            }


            spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter = spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter.OrderBy(a => a.ProteinId).ThenBy(b => b.ChainId).ToList();
            spreadsheetTaskResult.InteractionChainsAminoAcidCounter  = spreadsheetTaskResult.InteractionChainsAminoAcidCounter.OrderBy(a => a.ProteinId).ThenBy(b => b.ChainId).ToList();

            AminoAcidChainComposition      wholeProteinChainsTotals = AminoAcidChainComposition.TotalFromAminoAcidChainCompositionList(spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter);
            AminoAcidChainComposition      interactionChainsTotals  = AminoAcidChainComposition.TotalFromAminoAcidChainCompositionList(spreadsheetTaskResult.InteractionChainsAminoAcidCounter);
            AminoAcidPairCompositionMatrix wholeProteinAminoAcidToAminoAcidCounter1X1 = AminoAcidChainComposition.ConvertToMatrix(wholeProteinChainsTotals);

            var results = new List <List <SpreadsheetCell[]> >();

            {
                /* start test */
                var spreadsheet1 = new List <SpreadsheetCell[]>();
                spreadsheet1.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% TEST SHEET 0"), });
                spreadsheet1.Add(new[] { new SpreadsheetCell("TEST SHEET 0"), });
                foreach (AminoAcidChainComposition item in spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter)
                {
                    //spreadsheet1.Add(item.ProteinId);
                    //spreadsheet1.Add(item.ChainId);
                    spreadsheet1.Add(item.SpreadsheetDataRow());
                }
                results.Add(spreadsheet1);
                spreadsheet1 = null;
                /* end test */
            }
            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

            {
                var spreadsheet2 = new List <SpreadsheetCell[]>();
                spreadsheet2.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Interaction Symmetry"), });
                spreadsheet2.Add(new[] { new SpreadsheetCell("Homodimers - List - Interaction Count And Interaction Match Percentage (Symmetry Measurement)") });
                spreadsheet2.Add(InteractionMatchPercentage.SpreadsheetColumnHeadersRow());
                var range2 = spreadsheetTaskResult.InteractionMatchPercentageList.Select(record => record.SpreadsheetDataRow()).ToList();
                //range2.Sort();
                range2 = range2
                         .OrderBy(a => a[0].CellData)
                         .ThenBy(a => a[1].CellData)
                         .ThenBy(a => a[2].CellData)
                         .ThenBy(a => a[3].CellData)
                         .ThenBy(a => a[4].CellData)
                         .ThenBy(a => a[5].CellData)
                         .ThenBy(a => a[6].CellData)
                         .ThenBy(a => a[7].CellData)
                         .ThenBy(a => a[8].CellData)
                         .ToList();
                spreadsheet2.AddRange(range2);
                range2 = null;
                results.Add(spreadsheet2);

                var spreadsheetHistogram2 = new List <SpreadsheetCell[]>();
                spreadsheetHistogram2.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% HG Interaction Symmetry"), });
                spreadsheetHistogram2.Add(new[] { new SpreadsheetCell("Homodimers - List - Interaction Count And Interaction Match Percentage (Symmetry Measurement) Histogram") });
                spreadsheetHistogram2.AddRange(Histogram.MatrixToHistogram(spreadsheet2.ToArray(), Histogram.MakeBinDecimals(0, 100, 9, 10), new[] { 6, 7, 8 }, 2, -1, true));
                results.Add(spreadsheetHistogram2);

                spreadsheet2          = null;
                spreadsheetHistogram2 = null;
            }

            //
            {
                var spreadsheet3 = new List <SpreadsheetCell[]>();
                spreadsheet3.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Interaction Records"), });
                spreadsheet3.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Interaction Record"), });
                spreadsheet3.Add(ProteinInteractionRecord.TsvColumnHeadersRow());
                var range3 = spreadsheetTaskResult.InteractionRecordList.Select(record => record.SpreadsheetDataRow()).ToList();
                //range3.Sort();
                range3 = range3
                         .OrderBy(a => a[0].CellData)
                         .ThenBy(a => a[1].CellData)
                         .ThenBy(a => a[3].CellData)
                         .ThenBy(a => a[5].CellData)
                         .ThenBy(a => a[13].CellData)
                         .ThenBy(a => a[15].CellData)
                         .ToList();
                spreadsheet3.AddRange(range3);
                range3 = null;

                results.Add(spreadsheet3);

                var spreadsheetHistogram3 = new List <SpreadsheetCell[]>();
                spreadsheetHistogram3.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Interaction Records Histogram"), });
                spreadsheetHistogram3.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Interaction Record - Histogram"), });
                spreadsheetHistogram3.AddRange(Histogram.MatrixToHistogram(spreadsheet3.ToArray(), Histogram.MakeBinDecimals(0m, 5m, 0m, 0.05m), new[] { 1 }, 2, -1, true));
                results.Add(spreadsheetHistogram3);

                //spreadsheet3 = Histogram.InsertMatrixOverwrite(spreadsheet3.ToArray(), histogram3, 2, Histogram.MaxColumns(spreadsheet3.ToArray()) + 1).ToList();
                spreadsheet3          = null;
                spreadsheetHistogram3 = null;
            }
            //

            {
                var spreadsheet4 = new List <SpreadsheetCell[]>();
                spreadsheet4.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Interaction Count - A-Z"), });
                spreadsheet4.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Amino Acid Count - Interactions - A to Z"), });
                spreadsheet4.Add(AminoAcidChainComposition.SpreadsheetTitleRow());
                var range4 = spreadsheetTaskResult.InteractionChainsAminoAcidCounter.Select(record => record.SpreadsheetDataRow()).ToList();
                //range4.Sort();
                range4 = range4
                         .OrderBy(a => a[0].CellData)
                         .ThenBy(a => a[1].CellData)
                         .ToList();
                spreadsheet4.AddRange(range4);
                range4 = null;
                spreadsheet4.Add(interactionChainsTotals.SpreadsheetDataRow());
                results.Add(spreadsheet4);
                spreadsheet4 = null;
            }
            //

            {
                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    var spreadsheet5 = new List <SpreadsheetCell[]>();
                    spreadsheet5.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Interaction Count - Groups " + enumAminoAcidGroups), });
                    spreadsheet5.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Amino Acid Count - Interactions - Acid Groups " + enumAminoAcidGroups), });
                    spreadsheet5.Add(AminoAcidChainComposition.SpreadsheetGroupsTitleRow(enumAminoAcidGroups));
                    var range5 = spreadsheetTaskResult.InteractionChainsAminoAcidCounter.Select(record => record.SpreadsheetGroupsDataRow(enumAminoAcidGroups)).ToList();
                    //range4.Sort();
                    range5 = range5
                             .OrderBy(a => a[0].CellData)
                             .ThenBy(a => a[1].CellData)
                             .ToList();
                    spreadsheet5.AddRange(range5);
                    range5 = null;
                    spreadsheet5.Add(interactionChainsTotals.SpreadsheetGroupsDataRow(enumAminoAcidGroups));

                    results.Add(spreadsheet5);
                    spreadsheet5 = null;
                }
            }
            //

            {
                var spreadsheet6 = new List <SpreadsheetCell[]>();
                spreadsheet6.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Entire Count - A-Z"), });
                spreadsheet6.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Amino Acid Count - All Atoms - A to Z"), });
                spreadsheet6.Add(AminoAcidChainComposition.SpreadsheetTitleRow());
                var range6 = spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter.Select(record => record.SpreadsheetDataRow()).ToList();
                //range6.Sort();
                range6 = range6
                         .OrderBy(a => a[0].CellData)
                         .ThenBy(a => a[1].CellData)
                         .ToList();
                spreadsheet6.AddRange(range6);
                range6 = null;
                spreadsheet6.Add(wholeProteinChainsTotals.SpreadsheetDataRow());
                results.Add(spreadsheet6);

                var spreadsheetHistogram6 = new List <SpreadsheetCell[]>();
                spreadsheetHistogram6.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Entire Count - A-Z - Historgram"), });
                spreadsheetHistogram6.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Amino Acid Count - All Atoms - A to Z - Histogram"), });
                spreadsheetHistogram6.AddRange(Histogram.MatrixToHistogram(spreadsheet6.ToArray(), Histogram.MakeBinDecimals(0, 10500, 0, 500), new[] { 28 }, 2, -1, true));
                spreadsheetHistogram6.Add(new [] { new SpreadsheetCell(""), });
                spreadsheetHistogram6.AddRange(Histogram.MatrixToHistogram(spreadsheet6.ToArray(), Histogram.MakeBinDecimals(0, 1000, 0, 100), new[] { 28 }, 2, -1, true));
                results.Add(spreadsheetHistogram6);

                spreadsheet6          = null;
                spreadsheetHistogram6 = null;
            }
            //

            {
                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    var spreadsheet7 = new List <SpreadsheetCell[]>();
                    spreadsheet7.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% L Entire Count - Groups " + enumAminoAcidGroups), });

                    spreadsheet7.Add(new[] { new SpreadsheetCell("Homodimers - List - Protein Amino Acid Count - All Atoms - Acid Groups " + enumAminoAcidGroups), });
                    spreadsheet7.Add(AminoAcidChainComposition.SpreadsheetGroupsTitleRow(enumAminoAcidGroups));
                    var range7 = spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter.Select(record => record.SpreadsheetGroupsDataRow(enumAminoAcidGroups)).ToList();
                    //range7.Sort();
                    range7 = range7
                             .OrderBy(a => a[0].CellData)
                             .ThenBy(a => a[1].CellData)
                             .ToList();
                    spreadsheet7.AddRange(range7);
                    range7 = null;
                    spreadsheet7.Add(wholeProteinChainsTotals.SpreadsheetGroupsDataRow(enumAminoAcidGroups));

                    results.Add(spreadsheet7);
                    spreadsheet7 = null;
                }
            }


            // convert to percentage for creating mean average protein composition
            var meanProteinComposition = new AminoAcidChainComposition("Mean Composition", "-");

            foreach (AminoAcidChainComposition aminoAcidChainComposition in spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter)
            {
                // get percentage for row
                AminoAcidChainComposition percentage = AminoAcidChainComposition.ConvertToPercentage(aminoAcidChainComposition);

                // add percentage to overall tally

                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    for (int x = 0; x < AminoAcidGroups.AminoAcidGroups.GetTotalSubgroups(enumAminoAcidGroups); x++)
                    {
                        meanProteinComposition.AminoAcidGroupsCount[(int)enumAminoAcidGroups][x] += (percentage.AminoAcidGroupsCount[(int)enumAminoAcidGroups][x] / spreadsheetTaskResult.WholeProteinChainsAminoAcidCounter.Count);
                    }
                }
            }

            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

            {
                /* start test */
                var spreadsheet8 = new List <SpreadsheetCell[]>();
                spreadsheet8.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% TEST SHEET 1"), }); // Worksheet name.
                spreadsheet8.Add(new[] { new SpreadsheetCell("TEST SHEET 1"), });                              // Spreadsheet title

                spreadsheet8.Add(new[] { new SpreadsheetCell(string.Empty), });
                spreadsheet8.Add(meanProteinComposition.SpreadsheetDataRow());
                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    spreadsheet8.Add(meanProteinComposition.SpreadsheetGroupsDataRow(enumAminoAcidGroups));
                }
                results.Add(spreadsheet8);
                spreadsheet8 = null;
                /* end test */
            }

            AminoAcidPairCompositionMatrix meanProteinMatrix = AminoAcidChainComposition.ConvertToMatrix(meanProteinComposition);

            {
                var spreadsheet9 = new List <SpreadsheetCell[]>();
                spreadsheet9.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% HM All Atoms 3x3"), });                            // Worksheet name.

                spreadsheet9.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Average Chain Composition"), }); // Spreadsheet title.

                //spreadsheet9.Add(new[] { new SpreadsheetCell(string.Empty), });
                //spreadsheet9.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Average Chain Composition - Percentage Composition - A to Z"), }); // Section title.
                //spreadsheet9.AddRange(meanProteinMatrix.SpreadsheetAminoAcidColorGroupsHeatMap());

                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    spreadsheet9.Add(new[] { new SpreadsheetCell(string.Empty), });
                    spreadsheet9.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Average Chain Composition - Percentage Composition - Acid Groups " + enumAminoAcidGroups), }); // Section title.
                    spreadsheet9.AddRange(meanProteinMatrix.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups));
                }
                results.Add(spreadsheet9);
                spreadsheet9 = null;
            }

            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

            //if (outputAllAtoms1x1)
            //{
            AminoAcidPairCompositionMatrix wholeProteinAminoAcidToAminoAcidCounterPercentage1X1 = AminoAcidPairCompositionMatrix.CalculatePercentageMatrix(wholeProteinAminoAcidToAminoAcidCounter1X1);

            {
                var spreadsheet10 = new List <SpreadsheetCell[]>();
                spreadsheet10.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% HM All Atoms 1x1") });                      // Worksheet name.

                spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall Composition") }); // Spreadsheet title.

                //spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty)});
                //spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall Percentage Composition - A to Z")}); // Section title.
                //spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1.SpreadsheetAminoAcidColorGroupsHeatMap());

                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty) });
                    spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall Percentage Composition - Acid Groups " + enumAminoAcidGroups) }); // Section title.
                    spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups));
                }

                AminoAcidPairCompositionMatrix wholeProteinAminoAcidToAminoAcidCounterNormalised1X1 = AminoAcidPairCompositionMatrix.NormalizeWithCompositionMatrix(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1, UniProtProteinDatabaseComposition.AminoAcidCompositionAsMatrix());

                //spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty)});
                //spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall UniProt Normalised - A to Z ")}); // Section title.
                //spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterNormalised1X1.SpreadsheetAminoAcidColorGroupsHeatMap());

                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty) });
                    spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall UniProt Normalised - Acid Groups " + enumAminoAcidGroups) }); // Section title.
                    spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterNormalised1X1.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups));
                }

                AminoAcidPairCompositionMatrix wholeProteinAminoAcidToAminoAcidCounterDifference1X1 = AminoAcidPairCompositionMatrix.DifferenceWithCompositionMatrix(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1, UniProtProteinDatabaseComposition.AminoAcidCompositionAsMatrix());

                //spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty)});
                //spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall A to Z - UniProt Difference")}); // Section title.
                //spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterDifference1X1.SpreadsheetAminoAcidColorGroupsHeatMap());

                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    spreadsheet10.Add(new[] { new SpreadsheetCell(string.Empty) });
                    spreadsheet10.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - All Atoms - Overall Acid Groups " + enumAminoAcidGroups + " - UniProt Difference") }); // Section title.
                    spreadsheet10.AddRange(wholeProteinAminoAcidToAminoAcidCounterDifference1X1.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups));
                }

                results.Add(spreadsheet10);
                spreadsheet10 = null;
            }
            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

            {
                AminoAcidPairCompositionMatrix interactionsAminoAcidToAminoAcidCounterPercentage = AminoAcidPairCompositionMatrix.CalculatePercentageMatrix(spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter);

                var spreadsheet11 = new List <SpreadsheetCell[]>();
                spreadsheet11.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% HM Interactions Only") });    // Worksheet name.

                spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only") }); // Spreadsheet title.

                //spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty)});
                //spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - A to Z")}); // Section title.
                //spreadsheet11.AddRange(spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter.SpreadsheetAminoAcidColorGroupsHeatMap());

                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty) });
                    spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - Acid Groups " + enumAminoAcidGroups) }); // Section title.
                    spreadsheet11.AddRange(spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups));
                }

                AminoAcidPairCompositionMatrix interactionsAminoAcidToAminoAcidCounterNormalised = AminoAcidPairCompositionMatrix.NormalizeWithCompositionMatrix(interactionsAminoAcidToAminoAcidCounterPercentage, UniProtProteinDatabaseComposition.AminoAcidCompositionAsMatrix());

                //spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty)});
                //spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - A to Z - UniProt Normalised")}); // Section title.
                //spreadsheet11.AddRange(interactionsAminoAcidToAminoAcidCounterNormalised.SpreadsheetAminoAcidColorGroupsHeatMap());

                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty) });
                    spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - Acid Groups " + enumAminoAcidGroups + " - UniProt Normalised") }); // Section title.
                    spreadsheet11.AddRange(interactionsAminoAcidToAminoAcidCounterNormalised.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups));
                }

                AminoAcidPairCompositionMatrix interactionsAminoAcidToAminoAcidCounterDifference = AminoAcidPairCompositionMatrix.DifferenceWithCompositionMatrix(interactionsAminoAcidToAminoAcidCounterPercentage, UniProtProteinDatabaseComposition.AminoAcidCompositionAsMatrix());

                //spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty)});
                //spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - A to Z - UniProt Difference")}); // Section title.
                //spreadsheet11.AddRange(interactionsAminoAcidToAminoAcidCounterDifference.SpreadsheetAminoAcidColorGroupsHeatMap());

                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    spreadsheet11.Add(new[] { new SpreadsheetCell(string.Empty) });
                    spreadsheet11.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Interactions Only - Acid Groups " + enumAminoAcidGroups + " - UniProt Difference") }); // Section title.
                    spreadsheet11.AddRange(interactionsAminoAcidToAminoAcidCounterDifference.SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups));
                }

                results.Add(spreadsheet11);

                spreadsheet11 = null;
            }
            ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

            {
                var spreadsheet12 = new List <SpreadsheetCell[]>();
                spreadsheet12.Add(new[] { new SpreadsheetCell("%batch_number%%batch_letter% HM Interactions v Homodimers") });                                              // Worksheet name.

                spreadsheet12.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Difference between homodimer composition and homodimer interactions") }); // Spreadsheet title
                spreadsheet12.Add(new[] { new SpreadsheetCell(string.Empty) });

                //spreadsheet12.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Difference between homodimer composition and homodimer interactions - A to Z")}); // Section title
                //spreadsheet12.AddRange(AminoAcidPairCompositionMatrix.DifferenceWithCompositionMatrix(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1, spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter).SpreadsheetAminoAcidColorGroupsHeatMap());
                //spreadsheet12.Add(new[] { new SpreadsheetCell(string.Empty)});

                foreach (AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups enumAminoAcidGroups in Enum.GetValues(typeof(AminoAcidGroups.AminoAcidGroups.EnumAminoAcidGroups)))
                {
                    spreadsheet12.Add(new[] { new SpreadsheetCell("Homodimers - Amino Acid Heat Map - Difference between homodimer composition and homodimer interactions - Acid Groups " + enumAminoAcidGroups) }); // Section title.
                    spreadsheet12.AddRange(AminoAcidPairCompositionMatrix.DifferenceWithCompositionMatrix(wholeProteinAminoAcidToAminoAcidCounterPercentage1X1, spreadsheetTaskResult.InteractionsAminoAcidToAminoAcidCounter).SpreadsheetAminoAcidColorGroupsHeatMap(enumAminoAcidGroups));
                    spreadsheet12.Add(new[] { new SpreadsheetCell(string.Empty) });
                }

                results.Add(spreadsheet12);
            }
            ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

            return(results);
        }
Пример #9
0
        /// <summary>
        ///     Entry point of the homolog finder: aligns every selected query chain against every target sequence and
        ///     records the targets whose similarity score reaches the requested minimum.
        /// </summary>
        /// <remarks>
        ///     Example:
        ///     FindHomologs.exe "c:\ds96ub\ds96ub.fasta" * "c:\pdb\pdb_seqres.fasta" NMW Y 0.3 75 c:\pdb\
        ///     The results of each query chain are written to "homologs_[pdb id][chain id].csv" inside the output folder
        ///     (a query whose non-empty result file already exists is skipped), or to the console when the output folder
        ///     argument is blank.
        /// </remarks>
        /// <param name="args">
        ///     [0] query sequence file;
        ///     [1] query id as "pdbid:chain" (e.g. 1A2G:B), where "*" or a missing chain matches everything;
        ///     [2] target sequence file;
        ///     [3] alignment types, any of NMW,SWM,SIM,NON separated by comma/semicolon/space/tab, or "*" for all;
        ///     [4] "Y" to filter on the evolutionary (physicochemical) score, anything else for the plain score;
        ///     [5] minimum similarity, e.g. 0.3;
        ///     [6] maximum allowed length difference between query and target, -1 to disable the check;
        ///     [7] output folder, or a blank string to print the results to the console.
        /// </param>
        static void Main(string[] args)
        {
            const int requiredArgumentCount = 8;
            const int minimumTargetLength   = 50;

            // Every argument is positional and mandatory, so refuse to start instead of failing on an index lookup.
            if (args == null || args.Length < requiredArgumentCount)
            {
                Console.WriteLine("; usage: FindHomologs.exe <query_sequence_file> <query_pdbid[:chainid]|*> <target_sequence_file> <NMW,SWM,SIM,NON|*> <Y|N> <min_similarity> <max_len_difference> <output_folder>");
                return;
            }

            var query_sequence_file  = args[0]; //query.fasta
            var query_id_chain       = args[1]; //1A2G:B
            var target_sequence_file = args[2]; //targets.fasta
            var alignment_type_str   = args[3]; //NMW,SWM,SIM,NON

            if (alignment_type_str == "*")
            {
                alignment_type_str = "NMW,SWM,SIM,NON";
            }

            // Empty tokens ("NMW, SWM") and repeated tokens ("NMW,NMW") must not be counted as unknown alignment types.
            var alignment_type_str_split = alignment_type_str.ToUpperInvariant()
                                           .Split(new[] { ',', ';', ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries)
                                           .Distinct()
                                           .ToArray();

            var compare_physicochemically_bool = args[4] == "Y"; //Y/N
            var min_similarity_str             = args[5];        // 0.3
            var max_len_difference             = args[6];
            var output_folder                  = args[7];

            int max_len_difference_int;

            if (!int.TryParse(max_len_difference, out max_len_difference_int))
            {
                Console.WriteLine("; the maximum length difference is not a valid integer");
                return;
            }

            decimal minSimilarity;

            if (!decimal.TryParse(min_similarity_str, out minSimilarity))
            {
                Console.WriteLine("; the minimum similarity is not a valid number");
                return;
            }

            var alignmentTypes = new List <ProteinBioClass.AlignmentType>();

            if (alignment_type_str_split.Contains("NMW"))
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.NMW);
            }
            if (alignment_type_str_split.Contains("SWM"))
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.SWM);
            }
            if (alignment_type_str_split.Contains("SIM"))
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.SIM);
            }
            // NON is also the fallback when nothing else was recognised.
            if (alignment_type_str_split.Contains("NON") || alignmentTypes.Count == 0)
            {
                alignmentTypes.Add(ProteinBioClass.AlignmentType.NON);
            }
            if (alignmentTypes.Count < alignment_type_str_split.Length)
            {
                Console.WriteLine("; unknown alignment type");
                return;
            }

            // load list of query sequences
            var queryIdParts = query_id_chain.Split(new[] { ':' });
            var queryPdbid   = queryIdParts[0];

            // A missing or empty chain part ("1A2G" or "1A2G:") selects every chain of the entry.
            var queryChainid = (queryIdParts.Length > 1 && queryIdParts[1].Length > 0) ? queryIdParts[1][0] : '*';

            var querySeq     = Sequence.LoadSequenceFile(query_sequence_file, null);
            var queryResults = querySeq.Where(a =>
            {
                var id = new ProteinBioClass.SequenceId(a.Id);
                return((queryPdbid == "*" || string.Equals(id.PdbId, queryPdbid, StringComparison.OrdinalIgnoreCase)) &&
                       (queryChainid == '*' || id.ChainId == queryChainid));
            }).ToList();

            if (queryResults.Count == 0)
            {
                Console.WriteLine("; the query pdbids/chainids were not found");
                return;
            }

            // load list of target sequences, ignoring the ones that are too short
            var targetSeq = Sequence.LoadSequenceFile(target_sequence_file, new string[] { null, "", "protein" });

            targetSeq = targetSeq.Where(a => a.Count() >= minimumTargetLength).ToList();

            Console.WriteLine("; aligning " + queryResults.Count + " query sequences to " + targetSeq.Count + " target sequences");

            // Number of loaded chains per pdb id; reported in the "query chains" / "target chains" columns.
            var queryPdbIdCounts = queryResults
                                   .GroupBy(a => new ProteinBioClass.SequenceId(a.Id).PdbId)
                                   .ToDictionary(g => g.Key, g => g.Count());

            var targetPdbIdCounts = targetSeq
                                    .GroupBy(a => new ProteinBioClass.SequenceId(a.Id).PdbId)
                                    .ToDictionary(g => g.Key, g => g.Count());

            // The folder is only resolved when one was given: DirectoryInfo rejects a blank path, which previously made
            // the console output mode unreachable.
            var writeToConsole       = string.IsNullOrWhiteSpace(output_folder);
            var outputFolderFullName = writeToConsole ? null : new DirectoryInfo(output_folder).FullName;

            foreach (var query in queryResults)
            {
                var queryId = new ProteinBioClass.SequenceId(query.Id);

                string filename = null;

                if (!writeToConsole)
                {
                    filename = Path.Combine(outputFolderFullName, "homologs_" + queryId.PdbId + queryId.ChainId + ".csv");

                    // skip if already processed
                    if (File.Exists(filename) && new FileInfo(filename).Length > 0)
                    {
                        continue;
                    }
                }

                var totalQueryPdbIdChains = queryPdbIdCounts[queryId.PdbId];

                // Measured once per query instead of once per target.
                var queryLength = query.Count();

                WorkDivision wd = new WorkDivision(targetSeq.Count);

                for (var thread = 0; thread < wd.ThreadCount; thread++)
                {
                    // Copy of the loop counter so every task keeps its own slice index.
                    var lti = thread;

                    wd.TaskList.Add(Task.Run(() =>
                    {
                        var result = new List <HomologChain>();

                        for (var target = wd.ThreadFirstIndex[lti]; target <= wd.ThreadLastIndex[lti]; target++)
                        {
                            var targetobj = targetSeq[target];

                            if (max_len_difference_int != -1 && Math.Abs(targetobj.Count() - queryLength) > max_len_difference_int)
                            {
                                continue;
                            }

                            var targetId = new ProteinBioClass.SequenceId(targetobj.Id);

                            foreach (var alignmentType in alignmentTypes)
                            {
                                var scores = ProteinBioClass.AlignedSequenceSimilarityPercentage(query, targetobj, alignmentType);

                                // Only the filter depends on the Y/N switch; both scores are always written out.
                                decimal percentSimilar = compare_physicochemically_bool ? scores.ScoreEvo : scores.Score;

                                if (percentSimilar >= minSimilarity)
                                {
                                    result.Add(new HomologChain(
                                                   queryId.PdbId, queryId.ChainId, totalQueryPdbIdChains,
                                                   targetId.PdbId, targetId.ChainId, targetPdbIdCounts[targetId.PdbId],

                                                   alignmentType.ToString(),
                                                   scores.Score,
                                                   scores.ScoreEvo));
                                }
                            }
                        }

                        return(result);
                    }));
                }

                wd.WaitAllTasks();

                var mergedlist = new List <string>();

                mergedlist.Add("; " + queryId.PdbId + ":" + queryId.ChainId);
                mergedlist.Add(String.Join(",",
                                           new string[]
                {
                    "query pdb id", "query chain id", "query chains",
                    "target pdb id", "target chain id", "target chains",

                    "alignment method", "sequence similarity", "sequence evo similarity"
                }));

                foreach (var t in wd.TaskList)
                {
                    var tc = t as Task <List <HomologChain> >;

                    if (tc == null)
                    {
                        throw new InvalidOperationException("task in tasklist was null");
                    }

                    mergedlist.AddRange(tc.Result.Select(a => a.ToString()));
                }

                if (writeToConsole)
                {
                    Console.WriteLine(String.Join(Environment.NewLine, mergedlist));
                }
                else
                {
                    File.WriteAllLines(filename, mergedlist);
                }
            }
        }
        /// <summary>
        ///     Collects every pair of atoms from two different chains whose 3D distance is greater than zero and not greater
        ///     than <paramref name="maxAtomInterationDistance" />. Each unordered pair of chains is examined once, with the
        ///     atoms of the first chain split over several tasks.
        /// </summary>
        /// <param name="cancellationToken">Token handed to every task that is started.</param>
        /// <param name="maxAtomInterationDistance">Largest distance at which two atoms still count as interacting.</param>
        /// <param name="proteinId">Identifier stored in each resulting pair (and used as the cache key when caching is on).</param>
        /// <param name="pdbIdChainIdList">Not used by this method.</param>
        /// <param name="proteinFileChains">The chains, with their atom lists, to search.</param>
        /// <param name="breakWhenFirstInteractionFound">When true, the search of a chain pair stops soon after a task has found a pair.</param>
        /// <param name="totalThreads">Forwarded to the work division.</param>
        /// <param name="sort">When true, the result is ordered by residue number and serial of atom 1, then of atom 2.</param>
        /// <param name="requiredChains">Exact chain count required, or a negative value to accept any count.</param>
        /// <returns>
        ///     The atom pairs found (possibly an empty list), or null when the chain list is missing, has the wrong number of
        ///     chains, or contains a chain without atoms.
        /// </returns>
        public static List <AtomPair> FindInteractions(CancellationToken cancellationToken, decimal maxAtomInterationDistance /*= 8.0m*/, string proteinId, Dictionary <string, List <string> > pdbIdChainIdList, ProteinChainListContainer proteinFileChains, bool breakWhenFirstInteractionFound = false, int totalThreads = -1, bool sort = true, int requiredChains = -1)
        {
            // The interaction cache is currently switched off; flip this flag to load/save cached results.
            bool cacheEnabled = false;

            if (cacheEnabled && !string.IsNullOrWhiteSpace(proteinId))
            {
                var cached = InteractionsCache.LoadPdbInteractionCache(proteinId, requiredChains);

                if (cached != null)
                {
                    return(cached);
                }
            }

            // Without a chain list there is nothing to search.
            if (proteinFileChains == null || proteinFileChains.ChainList == null)
            {
                return(null);
            }

            var chains     = proteinFileChains.ChainList;
            var chainTotal = chains.Count;

            // A non-negative requiredChains demands exactly that many chains.
            if (requiredChains > -1 && chainTotal != requiredChains)
            {
                return(null);
            }

            // Every chain must contribute at least one atom.
            if (chains.Any(c => c.AtomList == null || c.AtomList.Count == 0))
            {
                return(null);
            }

            // 3D coordinates of the atoms, one list per chain.
            var coordinates = new List <Point3D> [chainTotal];

            for (int i = 0; i < chainTotal; i++)
            {
                coordinates[i] = Clustering.AtomRecordListToPoint3DList(chains[i]);
            }

            var allTasks = new List <Task <List <AtomPair> > >();

            for (int first = 0; first < chainTotal; first++)
            {
                // Starting after "first" visits each unordered chain pair exactly once.
                for (int second = first + 1; second < chainTotal; second++)
                {
                    var division = new WorkDivision <List <AtomPair> >(chains[first].AtomList.Count, totalThreads);

                    // Stop signal shared by all tasks working on this chain pair.
                    bool stopRequested = false;
                    var  stopGate      = new object();

                    // Loop counters are copied so the lambdas below do not observe later iterations.
                    int chainA = first;
                    int chainB = second;

                    var atomsA  = chains[chainA].AtomList;
                    var atomsB  = chains[chainB].AtomList;
                    var pointsA = coordinates[chainA];
                    var pointsB = coordinates[chainB];

                    for (int slot = 0; slot < division.ThreadCount; slot++)
                    {
                        int mySlot = slot;

                        Task <List <AtomPair> > worker = Task.Run(() =>
                        {
                            var found = new List <AtomPair>();

                            for (int a = division.ThreadFirstIndex[mySlot]; a <= division.ThreadLastIndex[mySlot] && !stopRequested; a++)
                            {
                                for (int b = 0; b < atomsB.Count; b++)
                                {
                                    if (stopRequested || (breakWhenFirstInteractionFound && found.Count > 0))
                                    {
                                        lock (stopGate)
                                        {
                                            stopRequested = true;
                                        }

                                        break;
                                    }

                                    // Atoms whose coordinates could not be parsed are ignored.
                                    if (!pointsA[a].ParseOK || !pointsB[b].ParseOK)
                                    {
                                        continue;
                                    }

                                    decimal distance = Point3D.Distance3D(pointsA[a], pointsB[b], true);

                                    // Keep the pair only for a positive distance within the allowed maximum.
                                    if (distance > 0.0m && distance <= maxAtomInterationDistance)
                                    {
                                        found.Add(new AtomPair(
                                                      proteinId,
                                                      atomsA[a],
                                                      chainA,
                                                      proteinId,
                                                      chainB,
                                                      atomsB[b],
                                                      distance));
                                    }
                                }
                            }

                            // An empty slice is reported as null; the merge step below skips null results.
                            return(found.Count == 0 ? null : found);
                        }, cancellationToken);

                        division.TaskList.Add(worker);
                    }

                    allTasks.AddRange(division.TaskList);
                }
            }

            try
            {
                Task[] unfinished = allTasks.Where(t => t != null && !t.IsCompleted).ToArray <Task>();

                if (unfinished.Length > 0)
                {
                    Task.WaitAll(unfinished);
                }
            }
            catch (AggregateException)
            {
                // Cancelled or faulted tasks are tolerated here; they are filtered out when merging.
            }

            // Merge the results of the tasks that ran to completion.
            var atomPairList = new List <AtomPair>();

            foreach (var finished in allTasks)
            {
                if (finished == null || !finished.IsCompleted || finished.IsCanceled || finished.IsFaulted)
                {
                    continue;
                }

                var part = finished.Result;

                if (part != null && part.Count > 0)
                {
                    atomPairList.AddRange(part);
                }
            }

            if (sort && atomPairList.Count > 1)
            {
                atomPairList = (from pair in atomPairList
                                orderby ProteinDataBankFileOperations.NullableTryParseInt32(pair.Atom1.resSeq.FieldValue),
                                        ProteinDataBankFileOperations.NullableTryParseInt32(pair.Atom1.serial.FieldValue),
                                        ProteinDataBankFileOperations.NullableTryParseInt32(pair.Atom2.resSeq.FieldValue),
                                        ProteinDataBankFileOperations.NullableTryParseInt32(pair.Atom2.serial.FieldValue)
                                select pair).ToList();
            }

            if (cacheEnabled)
            {
                InteractionsCache.SavePdbInteractionCache(proteinId, atomPairList, requiredChains);
            }

            return(atomPairList);
        }
        /*
         * public static void ClusterVectorDistanceMatrixUpgma(List<VectorProteinInterfaceWhole> vectorProteinInterfaceWholeList, decimal[,] vectorDistanceMatrix, int minimumOutputTreeLeafs, out List<string> vectorNames, out List<List<UpgmaNode>> nodeList, out List<List<string>> treeList, ProgressActionSet progressActionSet)
         * {
         *  if (vectorProteinInterfaceWholeList == null) throw new ArgumentNullException(nameof(vectorProteinInterfaceWholeList));
         *  if (vectorDistanceMatrix == null) throw new ArgumentNullException(nameof(vectorDistanceMatrix));
         *
         *  vectorNames = vectorProteinInterfaceWholeList.Select(VectorProteinInterfaceWholeTreeHeader).ToList();
         *
         *  List<string> finalTreeLeafOrderList;
         *  UpgmaClustering.Upgma(vectorDistanceMatrix, vectorNames, minimumOutputTreeLeafs, out nodeList, out treeList, out finalTreeLeafOrderList, false, progressActionSet);
         * }
         */

        /// <summary>
        ///     Fills a symmetric matrix with the optimistic parts-alignment distance between every two interface vectors.
        ///     The rows are divided over several tasks; each distance is computed once and mirrored across the diagonal.
        /// </summary>
        /// <param name="cancellationToken">Checked before each row; a cancelled run leaves the remaining rows at zero.</param>
        /// <param name="vectorProteinInterfaceWholeList">The interface vectors to compare with one another.</param>
        /// <param name="vectorDistanceMeasurementValues">Distance weights, including the per-unit penalty for a length difference.</param>
        /// <param name="optimisticDistanceMatrix">Receives the resulting square matrix, indexed by list position.</param>
        /// <param name="progressActionSet">Progress callbacks that are notified after each completed row.</param>
        /// <exception cref="ArgumentNullException">The vector list or the measurement values are null.</exception>
        public static void BestDistanceMatrixWithPartsAlignment(CancellationToken cancellationToken, List <VectorProteinInterfaceWhole> vectorProteinInterfaceWholeList, VectorDistanceMeasurementValues vectorDistanceMeasurementValues, out double[,] optimisticDistanceMatrix, /* out double[,] pessimisticDistanceMatrix,*/ ProgressActionSet progressActionSet)
        {
            if (vectorProteinInterfaceWholeList == null)
            {
                throw new ArgumentNullException(nameof(vectorProteinInterfaceWholeList));
            }
            if (vectorDistanceMeasurementValues == null)
            {
                throw new ArgumentNullException(nameof(vectorDistanceMeasurementValues));
            }

            var vectorCount = vectorProteinInterfaceWholeList.Count;

            // An out parameter cannot be captured by a lambda, so the tasks write into this local matrix instead.
            var distances = new double[vectorCount, vectorCount];

            var workDivision = new WorkDivision(vectorCount, -1);

            ProgressActionSet.StartAction(vectorCount, progressActionSet);

            for (int slot = 0; slot < workDivision.ThreadCount; slot++)
            {
                // Copy of the loop counter so that each task keeps its own slice index.
                int mySlot = slot;

                workDivision.TaskList.Add(Task.Run(() =>
                {
                    for (int row = workDivision.ThreadFirstIndex[mySlot]; row <= workDivision.ThreadLastIndex[mySlot]; row++)
                    {
                        if (cancellationToken.IsCancellationRequested)
                        {
                            break;
                        }

                        var rowVector = vectorProteinInterfaceWholeList[row];

                        // Only the cells above the diagonal are computed; the mirror cell is filled in alongside.
                        for (int column = row + 1; column < vectorCount; column++)
                        {
                            var columnVector = vectorProteinInterfaceWholeList[column];

                            // Entries sharing the same full interface id keep a distance of zero.
                            if (rowVector.FullProteinInterfaceId != columnVector.FullProteinInterfaceId)
                            {
                                double distance;
                                BestDistanceWithPartsAlignment(rowVector, columnVector, vectorDistanceMeasurementValues, out distance);

                                // Interfaces of different length are penalised per unit of length difference.
                                var lengthGap     = Math.Abs(rowVector.ProteinInterfaceLength - columnVector.ProteinInterfaceLength);
                                var lengthPenalty = lengthGap * vectorDistanceMeasurementValues.DifferentLengthProteinInterface;

                                distance += lengthPenalty;

                                distances[row, column] = distance;
                                distances[column, row] = distance;
                            }
                        }

                        workDivision.IncrementItemsCompleted(1);
                        ProgressActionSet.ProgressAction(1, progressActionSet);
                        ProgressActionSet.EstimatedTimeRemainingAction(workDivision.StartTicks, workDivision.ItemsCompleted, workDivision.ItemsToProcess, progressActionSet);
                    }
                }, cancellationToken));
            }

            workDivision.WaitAllTasks();

            ProgressActionSet.FinishAction(true, progressActionSet);

            optimisticDistanceMatrix = distances;
        }
        /// <summary>
        ///     Entry point of the homolog clustering tool. Loads the homolog csv files and the sequence file, merges every
        ///     query/target pair that meets both similarity thresholds into clusters, and writes one csv section per cluster
        ///     to standard output.
        /// </summary>
        /// <param name="args">
        ///     args[0]: folder containing the "homologs_?????.csv" files;
        ///     args[1]: sequence (fasta) file;
        ///     args[2]: minimum alignment score, parsed with the invariant culture (e.g. 0.9);
        ///     args[3]: minimum evolutionary alignment score, parsed with the invariant culture (e.g. 0.9).
        /// </param>
        private static void Main(string[] args)
        {
            // This program loads the homolog list in csv format and, for homologs within the given sequence distance,
            // returns a list of all partners. Some partners may be duplicates, which cannot be removed up front because
            // they may bind differently in other instances. A unique id per protein is therefore needed, which is
            // slightly problematic: close target homologs are considered to be the same protein as the query protein,
            // so they can exist for more than one query protein.
            //
            // FindHomologsCluster.exe c:\pdb\ds96ub_homologs\ c:\pdb\pdb_seqres.fasta 0.9 0.9 > ds96ub_homologs.csv

            // Fail with a usage message rather than an IndexOutOfRangeException when arguments are missing.
            if (args == null || args.Length < 4)
            {
                Console.Error.WriteLine("Usage: FindHomologsCluster.exe <homolog_csv_folder> <sequence_file> <min_similarity> <min_similarity_evo>");
                Environment.ExitCode = 1;
                return;
            }

            var homologCsvFolder = args[0];
            var sequenceFile     = args[1];

            // The thresholds are machine-readable values ("0.9"); parse them culture-invariantly so the result does not
            // depend on the decimal separator of the machine's regional settings.
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;
            var minSimilarity    = decimal.Parse(args[2], invariantCulture);
            var minSimilarityEvo = decimal.Parse(args[3], invariantCulture);

            var seqList = Sequence.LoadSequenceFile(sequenceFile, new[] { null, "", "protein" });

            var homologCsvFiles = Directory.GetFiles(homologCsvFolder, "homologs_?????.csv");

            var parsedData = FindHomologs.FindHomologs.HomologChain.Load(homologCsvFiles);

            // NOTE(review): kept from the original; presumably meant to release the path strings early. This is only
            // safe if Load has fully consumed the array by now (i.e. it is not lazily enumerated) - confirm.
            Array.Clear(homologCsvFiles, 0, homologCsvFiles.Length);

            var homologClusters = new List<List<Tuple<string, char>>>();

            // Returns the first cluster that already contains the given chain (pdb id compared case-insensitively),
            // or null when the chain is not part of any cluster yet.
            Func<string, char, List<Tuple<string, char>>> findCluster = (pdbId, chainId) =>
                homologClusters.FirstOrDefault(candidate => candidate.Any(entry =>
                    string.Equals(entry.Item1, pdbId, StringComparison.OrdinalIgnoreCase) && entry.Item2 == chainId));

            foreach (var rec in parsedData)
            {
                var meetsThresholds = rec.AlignmentScore >= minSimilarity && rec.AlignmentScoreEvo >= minSimilarityEvo;
                if (!meetsThresholds)
                {
                    continue;
                }

                // Both lookups happen before the cluster list is modified.
                var queryGroup  = findCluster(rec.QueryPdbId, rec.QueryChainId);
                var targetGroup = findCluster(rec.TargetPdbId, rec.TargetChainId);

                var mergedGroup = new List<Tuple<string, char>>();

                if (queryGroup != null)
                {
                    mergedGroup.AddRange(queryGroup);
                    homologClusters.Remove(queryGroup);
                    // Emptied so that, when query and target already share one cluster, its members are not appended a
                    // second time by the target branch below.
                    queryGroup.Clear();
                }
                else
                {
                    mergedGroup.Add(new Tuple<string, char>(rec.QueryPdbId, rec.QueryChainId));
                }

                if (targetGroup != null)
                {
                    mergedGroup.AddRange(targetGroup);
                    homologClusters.Remove(targetGroup);
                    targetGroup.Clear();
                }
                else
                {
                    mergedGroup.Add(new Tuple<string, char>(rec.TargetPdbId, rec.TargetChainId));
                }

                // Distinct() guarantees that every member of a cluster is a different tuple (by value).
                homologClusters.Add(mergedGroup.Distinct().OrderBy(a => a.Item1).ThenBy(a => a.Item2).ToList());
            }

            var seqListIds = seqList.Select(a => new ProteinBioClass.SequenceId(a.Id)).ToList();

            // Returns the first loaded sequence whose id matches the given chain (pdb id compared case-insensitively),
            // or null when the sequence file has no such chain.
            Func<Tuple<string, char>, Sequence> findSequence = chain =>
            {
                for (var j = 0; j < seqList.Count; j++)
                {
                    if (string.Equals(seqListIds[j].PdbId, chain.Item1, StringComparison.OrdinalIgnoreCase) && seqListIds[j].ChainId == chain.Item2)
                    {
                        return seqList[j];
                    }
                }

                return null;
            };

            // Outer level: the clusters are split over the tasks of clusterWork; each task handles the inclusive index
            // range [ThreadFirstIndex, ThreadLastIndex] assigned to it.
            var clusterWork = new WorkDivision(homologClusters.Count);

            for (var clusterThread = 0; clusterThread < clusterWork.ThreadCount; clusterThread++)
            {
                var localClusterThread = clusterThread;

                clusterWork.TaskList.Add(Task.Run(() =>
                {
                    var clusterLines = new List<string>();

                    for (var clusterIndex = clusterWork.ThreadFirstIndex[localClusterThread]; clusterIndex <= clusterWork.ThreadLastIndex[localClusterThread]; clusterIndex++)
                    {
                        var clusterMembers = homologClusters[clusterIndex];

                        // Resolve each member's sequence once per cluster instead of re-scanning the whole sequence
                        // list for every (member, partner) pair. Entries are null for chains missing from the file.
                        var clusterSequences = clusterMembers.Select(findSequence).ToArray();

                        // Inner level: the members of this cluster are split over the tasks of memberWork.
                        var memberWork = new WorkDivision(clusterMembers.Count);

                        for (var memberThread = 0; memberThread < memberWork.ThreadCount; memberThread++)
                        {
                            var localMemberThread = memberThread;
                            var localClusterIndex = clusterIndex;

                            memberWork.TaskList.Add(Task.Run(() =>
                            {
                                var rows = new List<HomologClusterData>();

                                for (var memberIndex = memberWork.ThreadFirstIndex[localMemberThread]; memberIndex <= memberWork.ThreadLastIndex[localMemberThread]; memberIndex++)
                                {
                                    var member         = clusterMembers[memberIndex];
                                    var memberSequence = clusterSequences[memberIndex];

                                    if (memberSequence == null)
                                    {
                                        throw new InvalidOperationException("sequence not found for " + member.Item1 + ":" + member.Item2);
                                    }

                                    // Number of chains in the sequence file that belong to the same pdb entry.
                                    var complexChains = seqListIds.Count(id => string.Equals(id.PdbId, member.Item1, StringComparison.OrdinalIgnoreCase));

                                    // -1 is the "no partner compared yet" sentinel; it is also the value reported when
                                    // the member has no comparable partner in its cluster.
                                    var minAlignmentScore    = -1m;
                                    var maxAlignmentScore    = -1m;
                                    var minAlignmentScoreEvo = -1m;
                                    var maxAlignmentScoreEvo = -1m;

                                    for (var partnerIndex = 0; partnerIndex < clusterMembers.Count; partnerIndex++)
                                    {
                                        // Cluster members are distinct objects, so skipping the member's own index is
                                        // the same as skipping the member itself.
                                        if (partnerIndex == memberIndex)
                                        {
                                            continue;
                                        }

                                        var partnerSequence = clusterSequences[partnerIndex];

                                        // Partners without a sequence are ignored rather than treated as an error.
                                        if (partnerSequence == null)
                                        {
                                            continue;
                                        }

                                        var alignmentScore = ProteinBioClass.AlignedSequenceSimilarityPercentage(memberSequence,
                                                                                                                 partnerSequence,
                                                                                                                 ProteinBioClass.AlignmentType.NMW);

                                        if (alignmentScore.Score > maxAlignmentScore || maxAlignmentScore == -1m)
                                        {
                                            maxAlignmentScore = alignmentScore.Score;
                                        }

                                        if (alignmentScore.Score < minAlignmentScore || minAlignmentScore == -1m)
                                        {
                                            minAlignmentScore = alignmentScore.Score;
                                        }

                                        if (alignmentScore.ScoreEvo > maxAlignmentScoreEvo || maxAlignmentScoreEvo == -1m)
                                        {
                                            maxAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                        }

                                        if (alignmentScore.ScoreEvo < minAlignmentScoreEvo || minAlignmentScoreEvo == -1m)
                                        {
                                            minAlignmentScoreEvo = alignmentScore.ScoreEvo;
                                        }
                                    }

                                    // Cluster and item indexes are reported 1-based.
                                    rows.Add(new HomologClusterData(localClusterIndex + 1, memberIndex + 1, member.Item1, member.Item2, complexChains, Convert.ToInt32(memberSequence.Count()), minAlignmentScore, maxAlignmentScore, minAlignmentScoreEvo, maxAlignmentScoreEvo, memberSequence.FullSequence));
                                }

                                return(rows);
                            }));
                        }

                        memberWork.WaitAllTasks();

                        clusterLines.Add("; Cluster # " + (clusterIndex + 1) + " with " + memberWork.ItemsToProcess + " protein chains");
                        clusterLines.Add("cluster index,item index,pdb id,chain id,complex chains,seq len,min clstr sid,max clstr sid,min evo clstr sid,max evo clstr sid,sequence");

                        // Tasks are read back in creation order, so rows keep the order of the cluster members.
                        foreach (var task in memberWork.TaskList)
                        {
                            var rowsTask = task as Task<List<HomologClusterData>>;
                            if (rowsTask == null || rowsTask.Result == null)
                            {
                                continue;
                            }

                            clusterLines.AddRange(rowsTask.Result.Select(a => a.ToString()));
                        }

                        clusterLines.Add("");
                    }

                    return(clusterLines);
                }));
            }

            clusterWork.WaitAllTasks();

            // Collect every line before printing anything, so a failing task does not leave partial output behind.
            var outputLines = new List<string>();

            foreach (var task in clusterWork.TaskList)
            {
                var linesTask = task as Task<List<string>>;
                if (linesTask == null || linesTask.Result == null)
                {
                    continue;
                }

                outputLines.AddRange(linesTask.Result);
            }

            foreach (var line in outputLines)
            {
                Console.WriteLine(line);
            }
            // partners may have other interfaces, should those also be considered?
        }