/// <summary>
        /// Replaces sets of duplicate proteins (or peptides) in a dictionary with a single group node
        /// </summary>
        /// <param name="entities">Nodes to examine; duplicates are removed from this dictionary and the new group is added to it</param>
        /// <param name="entityNames">Keys to examine; names that are no longer present in <paramref name="entities"/> are skipped</param>
        /// <param name="entityType">Must be typeof(Protein) or typeof(Peptide); entities whose runtime type differs are skipped</param>
        /// <param name="globalIDTracker">Passed to the constructor of each new group</param>
        /// <exception cref="ArgumentException">
        /// Duplicates were found for an entity, but <paramref name="entityType"/> is neither Protein nor Peptide
        /// </exception>
        void GroupProteinsOrPeptides(
            IDictionary<string, Node> entities,
            IReadOnlyList<string> entityNames,
            Type entityType,
            GlobalIDContainer globalIDTracker)
        {
            foreach (var entityName in entityNames)
            {
                // Entries removed while forming an earlier group will not be found; skip them
                // (single lookup instead of ContainsKey followed by the indexer)
                if (!entities.TryGetValue(entityName, out var entity))
                {
                    continue;
                }

                // Only proceed if the correct type
                if (entity.GetType() != entityType)
                {
                    continue;
                }

                // Look for duplicates and add to a duplicate list
                var duplicates = new NodeChildren<Node>();
                duplicates.AddRange(FindDuplicates(entity));

                // A group is only created when there are at least two members
                if (duplicates.Count <= 1)
                {
                    continue;
                }

                // Create a protein or peptide group from the duplicates
                Group newGroup;

                if (entityType == typeof(Protein))
                {
                    newGroup = new ProteinGroup(duplicates, globalIDTracker);
                }
                else if (entityType == typeof(Peptide))
                {
                    newGroup = new PeptideGroup(duplicates, globalIDTracker);
                }
                else
                {
                    throw new ArgumentException("Invalid type: must be Protein or Peptide", nameof(entityType));
                }

                // Remove the grouped entities from the dictionary, then add the new group in their place
                foreach (var duplicateItem in duplicates)
                {
                    entities.Remove(duplicateItem.NodeName);
                }

                entities.Add(newGroup.NodeName, newGroup);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Returns the text to show for a node's protein name(s)
        /// </summary>
        /// <param name="node">Node whose NodeName is examined</param>
        /// <param name="globalIDTracker">Used to convert a separator-delimited node name via IDListToNameListString</param>
        /// <returns>
        /// The node name as-is when Group.LIST_SEP_CHAR is not found after the first character;
        /// otherwise the converted name list with each separator replaced by "; "
        /// </returns>
        private static string GetProteinList(Node node, GlobalIDContainer globalIDTracker)
        {
            // No separator (or a separator only at index 0): the name is used unchanged
            if (node.NodeName.IndexOf(Group.LIST_SEP_CHAR) <= 0)
            {
                return node.NodeName;
            }

            var nameList = globalIDTracker.IDListToNameListString(node.NodeName, Group.LIST_SEP_CHAR);
            var separator = Group.LIST_SEP_CHAR.ToString();

            return nameList.Replace(separator, "; ");
        }
        /// <summary>
        /// Group proteins having similar peptides, doing the work on a dedicated thread with a 50 MB stack
        /// </summary>
        /// <param name="peptideProteinMapList">List of protein to peptide mappings</param>
        /// <param name="clusteredProteins">Parsimonious list of protein</param>
        /// <param name="globalIDTracker">ID tracker created by the worker (mGlobalIDTracker)</param>
        /// <returns>The value of mProcessingSucceeded once the worker has finished or has been abandoned</returns>
        /// <remarks>
        /// Runs on a separate thread which allows for deeper recursion.
        /// An exception thrown by the worker is captured and rethrown on the calling thread
        /// (as happens with PerformParsimony) instead of going unhandled on the worker thread,
        /// which would terminate the process.
        /// If the worker exceeds MAX_RUNTIME_HOURS, an error event is raised and an abort is attempted.
        /// </remarks>
        // ReSharper disable once UnusedMember.Global
        public bool PerformParsimonyThreaded(
            List<RowEntry> peptideProteinMapList,
            out List<Node> clusteredProteins,
            out GlobalIDContainer globalIDTracker)
        {
            mPeptideProteinMapList = peptideProteinMapList;

            // Increase the stack size from 1 MB to 50 MB
            const int stackSizeInBytes = 50 * 1024 * 1024;

            // Since we're using a larger stack size, we can increase the maximum DFS recursion depth
            mMaxDFSRecursionDepth = 20000;

            // Set by the worker if PerformParsimonyWork throws; read only after the worker has ended
            Exception workerException = null;

            var thread = new Thread(() =>
            {
                try
                {
                    PerformParsimonyWork();
                }
                catch (Exception ex)
                {
                    workerException = ex;
                }
            }, stackSizeInBytes);

            thread.Start();

            var startTime = DateTime.UtcNow;
            var timedOut = false;

            while (true)
            {
                // Wait for 2 seconds
                thread.Join(2000);

                // Check whether the thread is still running
                if (!thread.IsAlive)
                {
                    break;
                }

                // Check whether the thread has been running too long
                if (!(DateTime.UtcNow.Subtract(startTime).TotalHours > MAX_RUNTIME_HOURS))
                {
                    continue;
                }

                timedOut = true;

                OnErrorEvent("Parsimony calculations have run for over " + MAX_RUNTIME_HOURS + " hours; aborting");

                try
                {
                    thread.Abort();
                }
                catch
                {
                    // Best effort only: Thread.Abort throws PlatformNotSupportedException on .NET Core / .NET 5+
                }

                break;
            }

            // Only surface worker failures when the worker ran to completion;
            // on the timeout path the worker may still be running (or be mid-abort), so the captured value is not reliable
            if (!timedOut && workerException != null)
            {
                // Rethrow on the calling thread, preserving the original stack trace
                System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(workerException).Throw();
            }

            clusteredProteins = mClusteredProteins;
            globalIDTracker = mGlobalIDTracker;

            return mProcessingSucceeded;
        }
        /// <summary>
        /// Collapses redundant proteins and peptides into single groups
        /// </summary>
        /// <param name="proteins">Protein nodes; processed first, using typeof(Protein)</param>
        /// <param name="peptides">Peptide nodes; processed second, using typeof(Peptide)</param>
        /// <param name="globalIDTracker">Passed through to GroupProteinsOrPeptides</param>
        public void CollapseNodes(
            Dictionary<string, Node> proteins,
            Dictionary<string, Node> peptides,
            GlobalIDContainer globalIDTracker)
        {
            // Each call receives a snapshot of the dictionary's keys, taken just before that call is made
            GroupProteinsOrPeptides(proteins, proteins.Keys.ToList(), typeof(Protein), globalIDTracker);

            GroupProteinsOrPeptides(peptides, peptides.Keys.ToList(), typeof(Peptide), globalIDTracker);
        }
        /// <summary>
        /// Group proteins having similar peptides, running the work on the calling thread
        /// </summary>
        /// <param name="peptideProteinMapList">List of protein to peptide mappings</param>
        /// <param name="clusteredProteins">Parsimonious list of protein (mClusteredProteins after the work completes)</param>
        /// <param name="globalIDTracker">ID tracker (mGlobalIDTracker after the work completes)</param>
        /// <returns>The value of mProcessingSucceeded after PerformParsimonyWork returns</returns>
        // ReSharper disable once UnusedMember.Global
        public bool PerformParsimony(
            List<RowEntry> peptideProteinMapList,
            out List<Node> clusteredProteins,
            out GlobalIDContainer globalIDTracker)
        {
            // Use the default recursion limit
            mMaxDFSRecursionDepth = DEFAULT_MAX_DFS_RECURSION_DEPTH;
            mPeptideProteinMapList = peptideProteinMapList;

            PerformParsimonyWork();

            globalIDTracker = mGlobalIDTracker;
            clusteredProteins = mClusteredProteins;

            return mProcessingSucceeded;
        }
        /// <summary>
        /// Runs the node builder, node collapser, DFS and cover steps on the given mappings,
        /// writes the resulting lines to the console, and optionally compares them to expected output
        /// </summary>
        /// <param name="peptideProteinMapList">Protein to peptide mappings to process</param>
        /// <param name="expectedOutput">Expected output lines; the comparison is skipped when null or empty</param>
        private void TestNodeBuilder(List<RowEntry> peptideProteinMapList, IReadOnlyList<string> expectedOutput)
        {
            var builder = new NodeBuilder();
            var collapser = new NodeCollapser();
            var depthFirstSearch = new DFS();
            var setCover = new Cover();

            var idTracker = new GlobalIDContainer();

            builder.RunAlgorithm(peptideProteinMapList, out var proteins, out var peptides);

            collapser.RunAlgorithm(proteins, peptides, idTracker);

            var clusteredProteins = setCover.RunAlgorithm(depthFirstSearch.RunAlgorithm(proteins.Values.ToList()));

            var actualLines = Utilities.ConvertNodesToStringList(clusteredProteins, idTracker);

            // Always show the output, even when there is nothing to compare against
            actualLines.ForEach(Console.WriteLine);

            if (expectedOutput == null || expectedOutput.Count == 0)
            {
                return;
            }

            var lineIndex = 0;

            foreach (var actualLine in actualLines)
            {
                if (lineIndex >= expectedOutput.Count)
                {
                    Assert.Fail("Extra, unexpected output line: " + actualLine);
                }

                Assert.AreEqual(expectedOutput[lineIndex], actualLine, "Mismatch on line {0}", lineIndex + 1);
                lineIndex++;
            }

            // Every actual line matched; make sure no expected lines are left over
            if (expectedOutput.Count > actualLines.Count)
            {
                Assert.Fail("Output did not include additional, expected lines, starting with " + expectedOutput[actualLines.Count]);
            }
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Converts protein nodes and their children into tab-delimited "Protein / Peptide" lines
        /// </summary>
        /// <param name="outData">Nodes to convert; one or more lines are produced for each child of each node</param>
        /// <param name="globalIDTracker">Passed to GetProteinList when building the protein column</param>
        /// <returns>A header line followed by one line per protein/peptide pair</returns>
        public static List<string> ConvertNodesToStringList(List<Node> outData, GlobalIDContainer globalIDTracker)
        {
            // The first line is the header
            var outLines = new List<string>
            {
                string.Join("\t", "Protein", "Peptide")
            };

            foreach (var node in outData)
            {
                foreach (var child in node.Children)
                {
                    if (child.GetType() == typeof(PeptideGroup))
                    {
                        // A peptide group contributes one line per member peptide
                        var proteinList = GetProteinList(node, globalIDTracker);

                        foreach (var groupedPeptide in ((Group)child).GetNodeGroup())
                        {
                            outLines.Add($"{proteinList}\t{groupedPeptide.NodeName}");
                        }

                        continue;
                    }

                    // Only protein groups go through GetProteinList; any other node uses its name directly
                    var proteinColumn = node is ProteinGroup
                        ? GetProteinList(node, globalIDTracker)
                        : node.NodeName;

                    outLines.Add($"{proteinColumn}\t{child.NodeName}");
                }
            }

            return outLines;
        }
 /// <summary>
 /// Constructor; passes NodeTypeName.PeptideGroup plus both arguments to the base class constructor
 /// </summary>
 /// <param name="groupedNodes">Nodes to group (forwarded unchanged to the base constructor)</param>
 /// <param name="globalIDTracker">ID tracker (forwarded unchanged to the base constructor)</param>
 public PeptideGroup(NodeChildren <Node> groupedNodes, GlobalIDContainer globalIDTracker)
     : base(NodeTypeName.PeptideGroup, groupedNodes, globalIDTracker)
 {
 }
        /// <summary>
        /// Group proteins having similar peptides
        /// </summary>
        /// <remarks>
        /// Reads mPeptideProteinMapList and mMaxDFSRecursionDepth.
        /// Resets mProcessingSucceeded, mClusteredProteins and mGlobalIDTracker on entry;
        /// mProcessingSucceeded is set to true only after every step has produced a non-empty result.
        /// Throws an Exception when any step yields a null or empty collection.
        /// </remarks>
        private void PerformParsimonyWork()
        {
            // Shared failure path for the empty-result checks below
            void ThrowIf(bool resultIsEmpty, string message)
            {
                if (resultIsEmpty)
                {
                    throw new Exception(message);
                }
            }

            mProcessingSucceeded = false;
            mClusteredProteins = new List<Node>();
            mGlobalIDTracker = new GlobalIDContainer();

            // Prepare objects and algorithms
            var builder = new NodeBuilder();
            var collapser = new NodeCollapser();
            var depthFirstSearch = new DFS { MaxRecursionDepth = mMaxDFSRecursionDepth };
            var setCover = new Cover();

            RegisterEvents(depthFirstSearch);
            RegisterEvents(setCover);

            if (ShowProgressAtConsole)
            {
                Console.WriteLine();
                OnStatusEvent("Finding parsimonious protein groups");
            }

            // Step 1: build the protein and peptide nodes
            builder.RunAlgorithm(mPeptideProteinMapList, out var proteins, out var peptides);

            ThrowIf(proteins == null || proteins.Count == 0,
                "Error in PerformParsimony: Protein list is empty");

            ThrowIf(peptides == null || peptides.Count == 0,
                "Error in PerformParsimony: Peptide list is empty");

            // Step 2: collapse the nodes, then re-check both dictionaries
            collapser.RunAlgorithm(proteins, peptides, mGlobalIDTracker);

            ThrowIf(proteins == null || proteins.Count == 0,
                "Error in PerformParsimony after nodeCollapser.RunAlgorithm: Protein list is empty");

            ThrowIf(peptides == null || peptides.Count == 0,
                "Error in PerformParsimony after nodeCollapser.RunAlgorithm: Peptide list is empty");

            // Step 3: run the DFS on the protein nodes
            var clusteredProteinSets = depthFirstSearch.RunAlgorithm(proteins.Values.ToList());

            ThrowIf(clusteredProteinSets == null || clusteredProteinSets.Count == 0,
                "Error in PerformParsimony: DFS returned an empty protein list");

            // Step 4: run the cover algorithm; the result is stored before it is validated
            mClusteredProteins = setCover.RunAlgorithm(clusteredProteinSets);

            ThrowIf(mClusteredProteins == null || mClusteredProteins.Count == 0,
                "Error in PerformParsimony: cover.RunAlgorithm returned an empty protein list");

            if (ShowProgressAtConsole)
            {
                OnStatusEvent("Iteration Complete, found {0} protein groups", mClusteredProteins.Count);
            }

            mProcessingSucceeded = true;
        }
Ejemplo n.º 10
0
        /// <summary>
        /// Creates tab-delimited text file parsimonyResultsFilePath with the protein groups and member peptides
        /// Creates tab-delimited text file proteinGroupMembersFilePath with the members of each protein group
        /// </summary>
        /// <param name="outData">Protein nodes to write; each node is assigned a 1-based GroupID in list order</param>
        /// <param name="parsimonyResultsFilePath">Path of the grouping file (created or overwritten)</param>
        /// <param name="proteinGroupMembersFilePath">Path of the group members file (created or overwritten)</param>
        /// <param name="globalIDTracker">Used to convert a protein group's node name into a list of protein names</param>
        public static void SaveResults(List<Node> outData, string parsimonyResultsFilePath, string proteinGroupMembersFilePath, GlobalIDContainer globalIDTracker)
        {
            // Yields the peptide names below a protein node, expanding peptide groups into their members
            IEnumerable<string> PeptideNamesFor(Node parent)
            {
                foreach (var childNode in parent.Children)
                {
                    if (childNode.GetType() != typeof(PeptideGroup))
                    {
                        yield return childNode.NodeName;
                        continue;
                    }

                    foreach (var memberPeptide in ((Group)childNode).GetNodeGroup())
                    {
                        yield return memberPeptide.NodeName;
                    }
                }
            }

            using var resultsWriter = new StreamWriter(new FileStream(parsimonyResultsFilePath, FileMode.Create, FileAccess.Write, FileShare.Read));
            using var proteinGroupsWriter = new StreamWriter(new FileStream(proteinGroupMembersFilePath, FileMode.Create, FileAccess.Write, FileShare.Read));

            // Header lines
            resultsWriter.WriteLine("GroupID\tProtein_First\tPeptide\tProtein_List\tProtein_Count\tGroup_Count");
            proteinGroupsWriter.WriteLine("GroupID\tProtein");

            // First pass: step through the data to determine the number of groups that each peptide is in
            var peptideToProteinGroupMap = new Dictionary<string, int>();

            foreach (var groupNode in outData)
            {
                foreach (var peptideName in PeptideNamesFor(groupNode))
                {
                    UpdatePeptideToProteinGroupMap(peptideToProteinGroupMap, peptideName);
                }
            }

            // Second pass: write out the results
            for (var nodeIndex = 0; nodeIndex < outData.Count; nodeIndex++)
            {
                var proteinNode = outData[nodeIndex];
                var groupID = nodeIndex + 1;

                string proteinFirst;
                string proteinNameOrList;
                int proteinsInGroupCount;

                // Append one or more lines to T_Parsimony_Group_Members.txt
                if (proteinNode.GetType() != typeof(ProteinGroup))
                {
                    // A single protein is a group with one member
                    proteinFirst = proteinNode.NodeName;
                    proteinNameOrList = proteinNode.NodeName;
                    proteinsInGroupCount = 1;

                    WriteOutputGroupMemberLine(proteinGroupsWriter, groupID, proteinNameOrList);
                }
                else
                {
                    var currentGroup = (ProteinGroup)proteinNode;
                    proteinFirst = currentGroup.NodeNameFirst;

                    if (currentGroup.NodeName.IndexOf(Group.LIST_SEP_CHAR) <= 0)
                    {
                        // Note: this code should never be reached
                        proteinNameOrList = currentGroup.NodeName;
                        proteinsInGroupCount = 1;
                        WriteOutputGroupMemberLine(proteinGroupsWriter, groupID, proteinNameOrList);
                    }
                    else
                    {
                        // The group's node name is a separator-delimited list; write one member line per protein
                        var proteinList = globalIDTracker.IDListToNameList(currentGroup.NodeName, Group.LIST_SEP_CHAR);
                        proteinNameOrList = string.Join("; ", proteinList);
                        proteinsInGroupCount = proteinList.Count;

                        foreach (var proteinMember in proteinList)
                        {
                            WriteOutputGroupMemberLine(proteinGroupsWriter, groupID, proteinMember);
                        }
                    }
                }

                // Append one or more lines to T_Parsimony_Grouping.txt
                foreach (var peptideName in PeptideNamesFor(proteinNode))
                {
                    WriteOutputGroupingLine(resultsWriter, peptideToProteinGroupMap, groupID, proteinFirst, peptideName, proteinNameOrList, proteinsInGroupCount);
                }
            }
        }
 /// <summary>
 /// Runs the node collapsing step by delegating to CollapseNodes
 /// </summary>
 /// <param name="protein">Protein nodes (passed as the first argument to CollapseNodes)</param>
 /// <param name="pep">Peptide nodes (passed as the second argument to CollapseNodes)</param>
 /// <param name="globalIDTracker">ID tracker passed through to CollapseNodes</param>
 public void RunAlgorithm(Dictionary<string, Node> protein, Dictionary<string, Node> pep, GlobalIDContainer globalIDTracker)
     => CollapseNodes(protein, pep, globalIDTracker);