Exemple #1
0
        /// <summary>
        /// Fills in coverage, length, molecular weight, description and sequence for every protein in
        /// <c>MyProteinList</c> by matching its locus against the identifiers of the supplied FASTA entries.
        /// </summary>
        /// <remarks>
        /// Each protein's <c>Locus</c> has trailing CR/LF characters trimmed (and written back) before the
        /// lookup. Proteins without a matching entry get zeroed numeric fields and a placeholder
        /// description; their <c>Sequence</c> is left untouched. Proteins are processed in parallel.
        /// </remarks>
        /// <param name="fastaItems">FASTA entries to search; matching is an exact, ordinal comparison on <c>SequenceIdentifier</c>.</param>
        public void CalculateProteinCoverage(List <FastaItem> fastaItems)
        {
            // Index the entries once so each protein costs one hash lookup instead of a linear scan.
            // The first entry for a given identifier wins, which is what List.Find would have returned.
            Dictionary <string, FastaItem> itemsByIdentifier =
                new Dictionary <string, FastaItem>(fastaItems.Count, StringComparer.Ordinal);

            foreach (FastaItem fastaItem in fastaItems)
            {
                string identifier = fastaItem.SequenceIdentifier;

                // Entries without an identifier can never match a locus, so they are skipped.
                if (identifier != null && !itemsByIdentifier.ContainsKey(identifier))
                {
                    itemsByIdentifier.Add(identifier, fastaItem);
                }
            }

            // The dictionary is only read from this point on, so sharing it across workers is safe.
            Parallel.ForEach(MyProteinList, myProtein =>
            {
                //Locate the corresponding fasta item
                myProtein.Locus = myProtein.Locus.TrimEnd('\r', '\n');

                FastaItem item;
                if (itemsByIdentifier.TryGetValue(myProtein.Locus, out item))
                {
                    myProtein.Coverage    = Math.Round(item.Coverage(myProtein.DistinctPeptides), 4);
                    myProtein.Length      = item.Sequence.Length;
                    myProtein.MolWt       = Math.Round(item.MonoisotopicMass, 1);
                    myProtein.Description = item.Description;
                    myProtein.Sequence    = item.Sequence;
                }
                else
                {
                    //The locus is not present in the supplied database; report it and fall back to defaults
                    Console.WriteLine("ERROR:: + Protein must be found in the database: " + myProtein.Locus);
                    myProtein.Coverage    = 0;
                    myProtein.Length      = 0;
                    myProtein.MolWt       = 0;
                    myProtein.Description = "Description not found in database";
                }
            });
        }
        /// <summary>
        /// Appends one scrambled copy of every entry currently in <c>myItems</c>.
        /// </summary>
        /// <remarks>
        /// Each copy's identifier is the source identifier prefixed with "Scrambled_", its sequence is
        /// the result of <c>ScrambleString</c> on the source sequence, and its description is empty.
        /// The copies are collected first and appended in one step, so only the entries that existed
        /// when the method was called are scrambled.
        /// </remarks>
        public void IncludeScrambled()
        {
            List <FastaItem> scrambledCopies = new List <FastaItem>(myItems.Count);

            foreach (FastaItem source in myItems)
            {
                FastaItem copy = new FastaItem
                {
                    SequenceIdentifier = "Scrambled_" + source.SequenceIdentifier,
                    Sequence           = ScrambleString(source.Sequence),
                    Description        = ""
                };

                scrambledCopies.Add(copy);
            }

            myItems.AddRange(scrambledCopies);
        }
        /// <summary>
        /// Parses FASTA-formatted text and appends one <c>FastaItem</c> to <c>myItems</c> for every
        /// header line (a line starting with '&gt;'), using the lines that follow as its sequence.
        /// </summary>
        /// <remarks>
        /// Lines that appear before the first header are ignored. Input without any header line adds
        /// nothing. Headers whose identifier cannot be extracted receive a generated
        /// "ProblematicID&lt;n&gt;" identifier and are reported on the console.
        /// </remarks>
        /// <param name="sr">Reader supplying the FASTA text. It is closed before this method returns, even when parsing fails.</param>
        /// <param name="removeReverse">When true, every entry in <c>myItems</c> whose identifier contains "Reverse_" is removed after parsing.</param>
        /// <param name="dbType">Header layout used to extract the identifier and description from each header line.</param>
        /// <param name="useRegexForTitleOrDescription">If you wish to only consider items that have certain words in their descriptions or names.  You should also set the Regex parameter</param>
        /// <param name="theRegex">Filter matched against each entry's identifier and description; required when <paramref name="useRegexForTitleOrDescription"/> is true.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="sr"/> is null, or <paramref name="useRegexForTitleOrDescription"/> is true and <paramref name="theRegex"/> is null.
        /// </exception>
        public void ParseFile(
            StreamReader sr,
            bool removeReverse,
            DBTypes dbType,
            bool useRegexForTitleOrDescription = false,
            Regex theRegex = null
            )
        {
            if (sr == null)
            {
                throw new ArgumentNullException("sr");
            }

            try
            {
                if (useRegexForTitleOrDescription && theRegex == null)
                {
                    throw new ArgumentNullException("theRegex", "A regex is required when useRegexForTitleOrDescription is true.");
                }

                //Header patterns, one set per supported database layout.
                //NOTE(review): inside [...] the '|' is a literal pipe, not alternation, so these
                //character classes also accept '|' - confirm whether that is intended.
                Regex nextProtId = new Regex(@">nxp:([A-Z|0-9|\-|_]+) ", RegexOptions.Compiled);

                Regex giId             = new Regex(@">gi\|([0-9]*)", RegexOptions.Compiled);
                Regex giDescription    = new Regex(@">[^ ]* (.*)", RegexOptions.Compiled);
                Regex ipiId            = new Regex(@"IPI:(IPI[0-9|\.]+)\|", RegexOptions.Compiled);
                Regex ipiDescription   = new Regex(@"Tax_Id=[0-9]+ Gene_Symbol=(.*)", RegexOptions.Compiled);
                Regex uniProtId        = new Regex(@"[a-z]+\|([A-Z|0-9|\-]+)\|", RegexOptions.Compiled);
                Regex uniProtDescription = new Regex(@" (.*)");

                Regex titleSpaceId          = new Regex(@">(\S+)", RegexOptions.Compiled);
                Regex titleSpaceDescription = new Regex(@" (.*)", RegexOptions.Compiled);

                //An entry is kept when filtering is off, or when its identifier or description matches.
                Predicate <FastaItem> passesFilter = candidate =>
                    !useRegexForTitleOrDescription
                    || theRegex.IsMatch(candidate.SequenceIdentifier)
                    || theRegex.IsMatch(candidate.Description);

                //Parse and load to Ram
                FastaItem item               = new FastaItem();
                bool      headerSeen         = false;
                int       lineCounter        = 0;
                int       problematicCounter = 0;
                string    read;

                while ((read = sr.ReadLine()) != null)
                {
                    lineCounter++;

                    if (!read.StartsWith(">", StringComparison.Ordinal))
                    {
                        //Sequence line: belongs to the entry opened by the most recent header.
                        //Text before the first header lands on a placeholder that is never stored.
                        item.Sequence += read;
                        continue;
                    }

                    //A new header closes the previous entry (if there was one).
                    if (headerSeen && passesFilter(item))
                    {
                        myItems.Add(item);
                    }
                    headerSeen = true;

                    //---Parse search engine friendly
                    item = new FastaItem();
                    Match id;

                    if (dbType.Equals(DBTypes.NCBInr))
                    {
                        id               = giId.Match(read);
                        item.Description = giDescription.Match(read).Groups[1].Value;
                    }
                    else if (dbType.Equals(DBTypes.IPI))
                    {
                        id               = ipiId.Match(read);
                        item.Description = ipiDescription.Match(read).Groups[1].Value;
                    }
                    else if (dbType.Equals(DBTypes.IDSpaceDescription))
                    {
                        id               = titleSpaceId.Match(read);
                        item.Description = titleSpaceDescription.Match(read).Groups[1].Value;
                    }
                    else if (dbType.Equals(DBTypes.UniProt))
                    {
                        id               = uniProtId.Match(read);
                        item.Description = uniProtDescription.Match(read).Groups[1].Value;
                    }
                    else if (dbType.Equals(DBTypes.NeXtProt))
                    {
                        id = nextProtId.Match(read);

                        //Fields are backslash separated; the description is the one tagged "Pname=".
                        string[] cols  = Regex.Split(read, Regex.Escape(@"\"));
                        string   pname = Array.Find(cols, a => a.StartsWith("Pname=", StringComparison.Ordinal));

                        //A header without a Pname field gets an empty description, like the other
                        //layouts do when their description pattern does not match.
                        item.Description = pname != null ? pname.Remove(0, 6) : string.Empty;
                    }
                    else if (dbType.Equals(DBTypes.Contaminant))
                    {
                        //The whole header is the identifier; no description is extracted
                        id               = Regex.Match(read, @">(.*)");
                        item.Description = string.Empty;
                    }
                    else
                    {
                        //Generic layout: identifier up to the first space, the rest is the description
                        string[] cols = Regex.Split(read, " ");
                        id = Regex.Match(cols[0], @">(.*)");

                        int descriptionStart = id.Length + 1;
                        item.Description = descriptionStart <= read.Length
                            ? read.Substring(descriptionStart)
                            : "No Description";
                    }

                    string parsedIdentifier = id.Groups[1].Value;

                    if (parsedIdentifier.Length == 0)
                    {
                        item.SequenceIdentifier = "ProblematicID" + problematicCounter++;
                        Console.WriteLine("Problem parsing identifier: " + lineCounter + " problem:" + problematicCounter);
                    }
                    else
                    {
                        item.SequenceIdentifier = parsedIdentifier;
                    }
                }

                Console.WriteLine(lineCounter + " lines parsed in the fasta DB.");

                //And now for the last item - only if the input contained at least one header
                if (headerSeen && passesFilter(item))
                {
                    myItems.Add(item);
                }

                if (removeReverse)
                {
                    myItems.RemoveAll(a => a.SequenceIdentifier.Contains("Reverse_"));
                }
            }
            finally
            {
                //The reader is closed on every path, including parse failures.
                sr.Close();
            }
        }