コード例 #1
0
 /// <summary>
 /// Creates an indexing engine by recording the supplied inputs on the instance.
 /// <paramref name="commonParams"/> and <paramref name="nestedIds"/> are handed to the base constructor.
 /// </summary>
 /// <param name="proteinList">Stored in <c>ProteinList</c>.</param>
 /// <param name="variableModifications">Stored in <c>VariableModifications</c>.</param>
 /// <param name="fixedModifications">Stored in <c>FixedModifications</c>.</param>
 /// <param name="productTypes">Stored in <c>ProductTypes</c>.</param>
 /// <param name="currentPartition">Partition number; the stored value is this number plus one.</param>
 /// <param name="decoyType">Stored in <c>DecoyType</c>.</param>
 /// <param name="collectionOfDigestionParams">Stored in <c>CollectionOfDigestionParams</c>.</param>
 /// <param name="commonParams">Forwarded to the base constructor.</param>
 /// <param name="maxFragmentSize">Stored in <c>MaxFragmentSize</c>.</param>
 /// <param name="nestedIds">Forwarded to the base constructor.</param>
 public IndexingEngine(
     List<Protein> proteinList,
     List<ModificationWithMass> variableModifications,
     List<ModificationWithMass> fixedModifications,
     List<ProductType> productTypes,
     int currentPartition,
     DecoyType decoyType,
     IEnumerable<DigestionParams> collectionOfDigestionParams,
     CommonParameters commonParams,
     double maxFragmentSize,
     List<string> nestedIds)
     : base(commonParams, nestedIds)
 {
     this.ProteinList = proteinList;
     this.VariableModifications = variableModifications;
     this.FixedModifications = fixedModifications;
     this.ProductTypes = productTypes;

     // NOTE(review): presumably converts a zero-based partition index to a one-based number - confirm with callers.
     this.CurrentPartition = currentPartition + 1;

     this.DecoyType = decoyType;
     this.CollectionOfDigestionParams = collectionOfDigestionParams;
     this.MaxFragmentSize = maxFragmentSize;
 }
コード例 #2
0
ファイル: IndexingEngine.cs プロジェクト: blfrey/MetaMorpheus
 /// <summary>
 /// Creates an indexing engine by copying every argument except <paramref name="nestedIds"/>
 /// into the matching instance member; <paramref name="nestedIds"/> goes to the base constructor.
 /// </summary>
 /// <param name="proteinList">Stored in <c>proteinList</c>.</param>
 /// <param name="variableModifications">Stored in <c>variableModifications</c>.</param>
 /// <param name="fixedModifications">Stored in <c>fixedModifications</c>.</param>
 /// <param name="lp">Product types; stored in <c>lp</c>.</param>
 /// <param name="currentPartition">Partition number; the stored value is this number plus one.</param>
 /// <param name="decoyType">Stored in <c>decoyType</c>.</param>
 /// <param name="CollectionOfDigestionParams">Stored in the member of the same name.</param>
 /// <param name="commonParams">Stored in <c>commonParams</c>.</param>
 /// <param name="maxFragmentSize">Stored in <c>maxFragmentSize</c>.</param>
 /// <param name="nestedIds">Forwarded to the base constructor.</param>
 public IndexingEngine(
     List<Protein> proteinList,
     List<ModificationWithMass> variableModifications,
     List<ModificationWithMass> fixedModifications,
     List<ProductType> lp,
     int currentPartition,
     DecoyType decoyType,
     IEnumerable<IDigestionParams> CollectionOfDigestionParams,
     ICommonParameters commonParams,
     double maxFragmentSize,
     List<string> nestedIds)
     : base(nestedIds)
 {
     // Protein and modification inputs.
     this.proteinList = proteinList;
     this.variableModifications = variableModifications;
     this.fixedModifications = fixedModifications;
     this.lp = lp;

     // NOTE(review): presumably converts a zero-based partition index to a one-based number - confirm with callers.
     this.currentPartition = currentPartition + 1;

     // Search configuration.
     this.decoyType = decoyType;
     this.CollectionOfDigestionParams = CollectionOfDigestionParams;
     this.commonParams = commonParams;
     this.maxFragmentSize = maxFragmentSize;
 }
コード例 #3
0
 /// <summary>
 /// Creates an indexing engine by recording the supplied inputs on the instance.
 /// <paramref name="commonParams"/> and <paramref name="nestedIds"/> are handed to the base constructor.
 /// </summary>
 /// <param name="proteinList">Stored in <c>ProteinList</c>.</param>
 /// <param name="variableModifications">Stored in <c>VariableModifications</c>.</param>
 /// <param name="fixedModifications">Stored in <c>FixedModifications</c>.</param>
 /// <param name="silacLabels">Stored in <c>SilacLabels</c>.</param>
 /// <param name="currentPartition">Partition number; the stored value is this number plus one.</param>
 /// <param name="decoyType">Stored in <c>DecoyType</c>.</param>
 /// <param name="commonParams">Forwarded to the base constructor.</param>
 /// <param name="maxFragmentSize">Stored in <c>MaxFragmentSize</c>.</param>
 /// <param name="generatePrecursorIndex">Stored in <c>GeneratePrecursorIndex</c>.</param>
 /// <param name="proteinDatabases">Stored in <c>ProteinDatabases</c>.</param>
 /// <param name="nestedIds">Forwarded to the base constructor.</param>
 public IndexingEngine(
     List<Protein> proteinList,
     List<Modification> variableModifications,
     List<Modification> fixedModifications,
     List<SilacLabel> silacLabels,
     int currentPartition,
     DecoyType decoyType,
     CommonParameters commonParams,
     double maxFragmentSize,
     bool generatePrecursorIndex,
     List<FileInfo> proteinDatabases,
     List<string> nestedIds)
     : base(commonParams, nestedIds)
 {
     // Protein, modification and label inputs.
     ProteinList = proteinList;
     VariableModifications = variableModifications;
     FixedModifications = fixedModifications;
     SilacLabels = silacLabels;

     // NOTE(review): presumably converts a zero-based partition index to a one-based number - confirm with callers.
     CurrentPartition = currentPartition + 1;

     // Indexing options.
     DecoyType = decoyType;
     MaxFragmentSize = maxFragmentSize;
     GeneratePrecursorIndex = generatePrecursorIndex;
     ProteinDatabases = proteinDatabases;
 }
コード例 #4
0
 /// <summary>
 /// Generates decoys for a list of proteins by dispatching to the generator that matches
 /// <paramref name="decoyType"/>.
 /// </summary>
 /// <param name="proteins">Proteins passed to the selected decoy generator.</param>
 /// <param name="decoyType">Selects the decoy strategy: none, reverse, or slide.</param>
 /// <param name="maxThreads">Passed unchanged to the selected decoy generator; defaults to -1.</param>
 /// <returns>
 /// A new empty list when <paramref name="decoyType"/> is <c>DecoyType.None</c>; otherwise the list
 /// returned by <c>GenerateReverseDecoys</c> or <c>GenerateSlideDecoys</c>.
 /// </returns>
 /// <exception cref="ArgumentException">
 /// Thrown when <paramref name="decoyType"/> is anything other than None, Reverse, or Slide.
 /// </exception>
 public static List <Protein> GenerateDecoys(List <Protein> proteins, DecoyType decoyType, int maxThreads = -1)
 {
     switch (decoyType)
     {
         case DecoyType.None:
             // No decoys requested: hand back a fresh empty list rather than null.
             return new List <Protein>();

         case DecoyType.Reverse:
             return GenerateReverseDecoys(proteins, maxThreads);

         case DecoyType.Slide:
             return GenerateSlideDecoys(proteins, maxThreads);

         default:
             throw new ArgumentException("Decoy type " + decoyType.ToString() + " is not implemented.");
     }
 }
コード例 #5
0
        /// <summary>
        /// Loads a protein XML database from the TestData folder, digests its second protein, builds a
        /// <c>TestDataFile</c> from the second-to-last resulting peptide, writes it as mzML, runs a
        /// <c>SearchTask</c> against it and reads the resulting AllPSMs.psmtsv file.
        /// The generated mzML file and output folder are removed even when a step throws.
        /// </summary>
        /// <param name="filename">Name of the protein XML file inside the TestData directory.</param>
        /// <param name="decoyType">Decoy setting used for loading and searching; targets are loaded and searched only when this is <c>DecoyType.None</c>.</param>
        public void MoreTests(string filename, DecoyType decoyType = DecoyType.None)
        {
            string xmlName  = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData", filename);
            var    proteins = ProteinDbLoader.LoadProteinXML(xmlName, decoyType == DecoyType.None, decoyType, null, false, null, out _);
            var    peps     = proteins[1].Digest(CommonParameters.DigestionParams, null, null).ToList();
            PeptideWithSetModifications pep = peps[peps.Count - 2];

            // Compute both artifact paths up front so the finally block can always clean them up.
            string mzmlName     = $"ajgdiv{filename}{decoyType.ToString()}.mzML";
            string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, $"TestSearchWithVariants{filename}{decoyType.ToString()}");

            try
            {
                MsDataFile myMsDataFile = new TestDataFile(new List <PeptideWithSetModifications> {
                    pep
                });

                IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

                Directory.CreateDirectory(outputFolder);

                SearchTask st = new SearchTask
                {
                    SearchParameters = new SearchParameters
                    {
                        DoParsimony             = true,
                        DecoyType               = decoyType,
                        SearchTarget            = decoyType == DecoyType.None,
                        ModPeptidesAreDifferent = false
                    },
                    CommonParameters = new CommonParameters(scoreCutoff: 1, digestionParams: new DigestionParams(minPeptideLength: 2), precursorMassTolerance: new PpmTolerance(20)),
                };

                st.RunTask(outputFolder, new List <DbForTask> {
                    new DbForTask(xmlName, false)
                }, new List <string> {
                    mzmlName
                }, "");

                // Reading the file fails the test if the search did not produce AllPSMs.psmtsv.
                File.ReadAllLines(Path.Combine(outputFolder, "AllPSMs.psmtsv"));

                //Assert.IsTrue(psms.Any(line => line.Contains($"\t{variantPsmShort}\t" + (containsVariant ? variantPsmShort : "\t"))));
            }
            finally
            {
                // Always remove generated artifacts so a failing run does not leave files behind for later runs.
                if (Directory.Exists(outputFolder))
                {
                    Directory.Delete(outputFolder, true);
                }
                File.Delete(mzmlName);
                //Directory.Delete(Path.Combine(TestContext.CurrentContext.TestDirectory, @"Task Settings"), true);
            }
        }
コード例 #6
0
    /// <summary>
    /// Copies the supplied configuration onto this decoy, collects opposing heroes when the decoy is a
    /// movement decoy, and applies the movement speed to the agent.
    /// </summary>
    /// <param name="pNumber">Player number stored on the decoy; heroes with this number are excluded from the enemy list.</param>
    /// <param name="decoyLifetime">Stored in <c>decoyLifetime</c>.</param>
    /// <param name="health">Stored in <c>health</c>.</param>
    /// <param name="movementSpeed">Stored in <c>movementSpeed</c> and assigned to <c>agent.speed</c>.</param>
    /// <param name="explosionRadius">Stored in <c>explosionRadius</c>.</param>
    /// <param name="explosionDamage">Stored in <c>explosionDamage</c>.</param>
    /// <param name="vfxManager">Stored in <c>vfxManager</c>.</param>
    /// <param name="type">Decoy type; stored in <c>type</c>.</param>
    /// <param name="secondsForTarget">Stored in <c>secondsForAction</c>; defaults to 0.</param>
    /// <param name="targetRadius">Stored in <c>targetRadius</c>; defaults to 0.</param>
    /// <param name="numberOfDecoys">Stored in <c>numberOfDecoys</c>; defaults to 0.</param>
    public void InitializeDecoy(int pNumber, float decoyLifetime, float health, float movementSpeed,
                                float explosionRadius, float explosionDamage, VFXManager vfxManager, DecoyType type,
                                float secondsForTarget = 0, float targetRadius = 0, int numberOfDecoys = 0)
    {
        // Identity and kind.
        this.pNumber = pNumber;
        this.type = type;

        // Lifetime, health and movement.
        this.decoyLifetime = decoyLifetime;
        this.health = health;
        this.movementSpeed = movementSpeed;

        // Targeting and explosion settings.
        this.targetRadius = targetRadius;
        this.explosionRadius = explosionRadius;
        this.explosionDamage = explosionDamage;
        this.secondsForAction = secondsForTarget;
        this.numberOfDecoys = numberOfDecoys;

        this.vfxManager = vfxManager;

        if (type == DecoyType.MovementDecoy)
        {
            // Every hero in the scene that does not carry this decoy's player number counts as an enemy.
            var allHeroes = FindObjectsOfType<Hero>();
            enemiesList = allHeroes.Where(hero => hero.PlayerNumber != pNumber).ToList();
        }

        agent.speed = movementSpeed;
    }
コード例 #7
0
 /// <summary>
 /// Creates a precursor indexing engine. Every argument is forwarded unchanged, in the same order,
 /// to the base constructor; this constructor adds no logic of its own.
 /// </summary>
 public PrecursorIndexingEngine(
     List<Protein> proteinList,
     List<ModificationWithMass> variableModifications,
     List<ModificationWithMass> fixedModifications,
     List<ProductType> lp,
     int currentPartition,
     DecoyType decoyType,
     IEnumerable<DigestionParams> CollectionOfDigestionParams,
     CommonParameters commonParams,
     double maxFragmentSize,
     List<string> nestedIds)
     : base(
         proteinList,
         variableModifications,
         fixedModifications,
         lp,
         currentPartition,
         decoyType,
         CollectionOfDigestionParams,
         commonParams,
         maxFragmentSize,
         nestedIds)
 {
 }
コード例 #8
0
        public static List <Protein> LoadProteinXML(string proteinDbLocation, bool generateTargetProteins, DecoyType decoyType, IEnumerable <Modification> allKnownModifications, bool IsContaminant, IEnumerable <string> modTypesToExclude, out Dictionary <string, Modification> unknownModifications)
        {
            List <Modification> prespecified = GetPtmListFromProteinXml(proteinDbLocation);

            Dictionary <string, IList <Modification> > mod_dict = new Dictionary <string, IList <Modification> >();

            if (prespecified.Count > 0 || allKnownModifications.Count() > 0)
            {
                mod_dict = GetModificationDict(new HashSet <Modification>(prespecified.Concat(allKnownModifications)));
            }

            List <Protein> result = new List <Protein>();

            unknownModifications = new Dictionary <string, Modification>();
            using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                Regex substituteWhitespace = new Regex(@"\s+");

                Stream uniprotXmlFileStream = proteinDbLocation.EndsWith(".gz") ?
                                              (Stream)(new GZipStream(stream, CompressionMode.Decompress)) :
                                              stream;

                string[] nodes = new string[6];

                string                         accession                = null;
                string                         name                     = null;
                string                         full_name                = null;
                string                         organism                 = null;
                string                         sequence                 = null;
                string                         feature_type             = null;
                string                         feature_description      = null;
                string                         original_value           = ""; // if no content is found, assume it is empty, not null (e.g. <original>A</original><variation/> for a deletion event)
                string                         variation_value          = "";
                string                         dbReference_type         = null;
                string                         dbReference_id           = null;
                List <string>                  property_types           = new List <string>();
                List <string>                  property_values          = new List <string>();
                int                            oneBasedfeature_position = -1;
                int?                           oneBasedbeginPosition    = null;
                int?                           oneBasedendPosition      = null;
                List <ProteolysisProduct>      proteolysisProducts      = new List <ProteolysisProduct>();
                List <SequenceVariation>       sequenceVariations       = new List <SequenceVariation>();
                List <DisulfideBond>           disulfideBonds           = new List <DisulfideBond>();
                var                            oneBasedModifications    = new Dictionary <int, List <Modification> >();
                List <Tuple <string, string> > gene_names               = new List <Tuple <string, string> >();
                bool                           reading_gene             = false;
                bool                           reading_organism         = false;
                List <DatabaseReference>       databaseReferences       = new List <DatabaseReference>();

                using (XmlReader xml = XmlReader.Create(uniprotXmlFileStream))
                {
                    while (xml.Read())
                    {
                        switch (xml.NodeType)
                        {
                        case XmlNodeType.Element:
                            nodes[xml.Depth] = xml.Name;
                            int outValue;
                            switch (xml.Name)
                            {
                            case "accession":
                                if (accession == null)
                                {
                                    accession = xml.ReadElementString();
                                }
                                break;

                            case "name":
                                if (xml.Depth == 2 && !reading_gene && !reading_organism)
                                {
                                    name = xml.ReadElementString();
                                }
                                if (reading_gene && !reading_organism)
                                {
                                    gene_names.Add(new Tuple <string, string>(xml.GetAttribute("type"), xml.ReadElementString()));
                                }
                                if (reading_organism)
                                {
                                    if (xml.GetAttribute("type").Equals("scientific"))
                                    {
                                        organism = xml.ReadElementString();
                                    }
                                }
                                break;

                            case "gene":
                                reading_gene = true;
                                break;

                            case "organism":
                                if (organism == null)
                                {
                                    reading_organism = true;
                                }
                                break;

                            case "fullName":
                                if (full_name == null)
                                {
                                    full_name = xml.ReadElementString();
                                }
                                break;

                            case "feature":
                                feature_type        = xml.GetAttribute("type");
                                feature_description = xml.GetAttribute("description");
                                break;

                            case "original":
                                original_value = xml.ReadElementString();
                                break;

                            case "variation":
                                variation_value = xml.ReadElementString();
                                break;

                            case "dbReference":
                                property_types.Clear();
                                property_values.Clear();
                                dbReference_type = xml.GetAttribute("type");
                                dbReference_id   = xml.GetAttribute("id");
                                break;

                            case "property":
                                property_types.Add(xml.GetAttribute("type"));
                                property_values.Add(xml.GetAttribute("value"));
                                break;

                            case "position":
                                oneBasedfeature_position = int.Parse(xml.GetAttribute("position"));
                                break;

                            case "begin":
                                oneBasedbeginPosition = int.TryParse(xml.GetAttribute("position"), out outValue) ? (int?)outValue : null;
                                break;

                            case "end":
                                oneBasedendPosition = int.TryParse(xml.GetAttribute("position"), out outValue) ? (int?)outValue : null;
                                break;

                            case "sequence":
                                sequence = substituteWhitespace.Replace(xml.ReadElementString(), "");
                                break;
                            }
                            break;

                        case XmlNodeType.EndElement:
                            switch (xml.Name)
                            {
                            case "feature":
                                if (feature_type == "modified residue")
                                {
                                    feature_description = feature_description.Split(';')[0];

                                    // Create new entry for this residue, if needed
                                    if (!oneBasedModifications.TryGetValue(oneBasedfeature_position, out List <Modification> residue_modifications))
                                    {
                                        residue_modifications = new List <Modification>();
                                        oneBasedModifications.Add(oneBasedfeature_position, residue_modifications);
                                    }
                                    if (mod_dict.ContainsKey(feature_description))
                                    {
                                        // Known and not of a type in the exclusion list
                                        List <Modification> mods = mod_dict[feature_description].Where(m => !modTypesToExclude.Contains(m.modificationType)).ToList();
                                        if (mods.Count == 0 && oneBasedModifications[oneBasedfeature_position].Count == 0)
                                        {
                                            oneBasedModifications.Remove(oneBasedfeature_position);
                                        }
                                        else
                                        {
                                            oneBasedModifications[oneBasedfeature_position].AddRange(mods);
                                        }
                                    }
                                    else if (unknownModifications.ContainsKey(feature_description))
                                    {
                                        // Not known but seen
                                        residue_modifications.Add(unknownModifications[feature_description]);
                                    }
                                    else
                                    {
                                        // Not known and not seen
                                        unknownModifications[feature_description] = new Modification(feature_description, "unknown");
                                        residue_modifications.Add(unknownModifications[feature_description]);
                                    }
                                }
                                else if (feature_type == "peptide" || feature_type == "propeptide" || feature_type == "chain" || feature_type == "signal peptide")
                                {
                                    proteolysisProducts.Add(new ProteolysisProduct(oneBasedbeginPosition, oneBasedendPosition, feature_type));
                                }
                                else if (feature_type == "sequence variant" &&      // Only keep if there is variant sequence information and position information
                                         variation_value != null &&
                                         variation_value != "")
                                {
                                    if (oneBasedbeginPosition != null && oneBasedendPosition != null)
                                    {
                                        sequenceVariations.Add(new SequenceVariation((int)oneBasedbeginPosition, (int)oneBasedendPosition, original_value, variation_value, feature_description));
                                    }
                                    else if (oneBasedfeature_position >= 1)
                                    {
                                        sequenceVariations.Add(new SequenceVariation(oneBasedfeature_position, original_value, variation_value, feature_description));
                                    }
                                }
                                else if (feature_type == "disulfide bond")
                                {
                                    if (oneBasedbeginPosition != null && oneBasedendPosition != null)
                                    {
                                        disulfideBonds.Add(new DisulfideBond((int)oneBasedbeginPosition, (int)oneBasedendPosition, feature_description));
                                    }
                                    else if (oneBasedfeature_position >= 1)
                                    {
                                        disulfideBonds.Add(new DisulfideBond(oneBasedfeature_position, feature_description));
                                    }
                                }
                                oneBasedbeginPosition    = null;
                                oneBasedendPosition      = null;
                                oneBasedfeature_position = -1;
                                original_value           = "";
                                variation_value          = "";
                                break;

                            case "dbReference":
                                databaseReferences.Add(new DatabaseReference(dbReference_type, dbReference_id, Enumerable.Range(0, property_types.Count).Select(i => new Tuple <string, string>(property_types[i], property_values[i])).ToList()));
                                property_types   = new List <string>();
                                property_values  = new List <string>();
                                dbReference_type = null;
                                dbReference_id   = null;
                                break;

                            case "gene":
                                reading_gene = false;
                                break;

                            case "organism":
                                reading_organism = false;
                                break;

                            case "entry":
                                if (accession != null && sequence != null)
                                {
                                    if (generateTargetProteins)
                                    {
                                        var protein = new Protein(sequence, accession, organism, gene_names, oneBasedModifications, proteolysisProducts, name, full_name, false, IsContaminant, databaseReferences, sequenceVariations, disulfideBonds, proteinDbLocation);
                                        result.Add(protein);
                                    }

                                    switch (decoyType)
                                    {
                                    case DecoyType.Reverse:
                                        char[] sequence_array = sequence.ToCharArray();
                                        Dictionary <int, List <Modification> > decoy_modifications = null;
                                        List <DisulfideBond> decoy_disulfides = new List <DisulfideBond>();
                                        if (sequence.StartsWith("M", StringComparison.Ordinal))
                                        {
                                            // Do not include the initiator methionine in reversal!!!
                                            Array.Reverse(sequence_array, 1, sequence.Length - 1);
                                            decoy_modifications = new Dictionary <int, List <Modification> >(oneBasedModifications.Count);
                                            foreach (var kvp in oneBasedModifications)
                                            {
                                                if (kvp.Key > 1)
                                                {
                                                    decoy_modifications.Add(sequence.Length - kvp.Key + 2, kvp.Value);
                                                }
                                                else if (kvp.Key == 1)
                                                {
                                                    decoy_modifications.Add(1, kvp.Value);
                                                }
                                            }
                                        }
                                        else
                                        {
                                            Array.Reverse(sequence_array);
                                            decoy_modifications = new Dictionary <int, List <Modification> >(oneBasedModifications.Count);
                                            foreach (var kvp in oneBasedModifications)
                                            {
                                                decoy_modifications.Add(sequence.Length - kvp.Key + 1, kvp.Value);
                                            }
                                        }
                                        var reversed_sequence = new string(sequence_array);

                                        List <ProteolysisProduct> decoyPP = new List <ProteolysisProduct>();
                                        foreach (ProteolysisProduct pp in proteolysisProducts)
                                        {
                                            decoyPP.Add(new ProteolysisProduct(sequence.Length - pp.OneBasedEndPosition + 1, sequence.Length - pp.OneBasedBeginPosition, pp.Type));
                                        }
                                        foreach (DisulfideBond disulfideBond in disulfideBonds)
                                        {
                                            decoy_disulfides.Add(new DisulfideBond(sequence.Length - disulfideBond.OneBasedBeginPosition + 2, sequence.Length - disulfideBond.OneBasedEndPosition + 2, "DECOY DISULFIDE BOND: " + disulfideBond.Description));
                                        }

                                        List <SequenceVariation> decoy_variations = new List <SequenceVariation>();
                                        foreach (SequenceVariation sv in sequenceVariations)
                                        {
                                            char[] original_array  = sv.OriginalSequence.ToArray();
                                            char[] variation_array = sv.VariantSequence.ToArray();
                                            if (sv.OneBasedBeginPosition == 1)
                                            {
                                                bool orig_init_m = sv.OriginalSequence.StartsWith("M", StringComparison.Ordinal);
                                                bool var_init_m  = sv.VariantSequence.StartsWith("M", StringComparison.Ordinal);
                                                if (orig_init_m && !var_init_m)
                                                {
                                                    decoy_variations.Add(new SequenceVariation(1, "M", "", "DECOY VARIANT: Initiator Methionine Change in " + sv.Description));
                                                }
                                                original_array  = sv.OriginalSequence.Substring(Convert.ToInt32(orig_init_m)).ToArray();
                                                variation_array = sv.VariantSequence.Substring(Convert.ToInt32(var_init_m)).ToArray();
                                            }
                                            int decoy_end   = sequence.Length - sv.OneBasedBeginPosition + 2 + Convert.ToInt32(sv.OneBasedEndPosition == reversed_sequence.Length) - Convert.ToInt32(sv.OneBasedBeginPosition == 1);
                                            int decoy_begin = decoy_end - original_array.Length + 1;
                                            Array.Reverse(original_array);
                                            Array.Reverse(variation_array);
                                            decoy_variations.Add(new SequenceVariation(decoy_begin, decoy_end, new string(original_array), new string(variation_array), "DECOY VARIANT: " + sv.Description));
                                        }
                                        var decoy_protein = new Protein(reversed_sequence, "DECOY_" + accession, organism, gene_names, decoy_modifications, decoyPP, name, full_name, true, IsContaminant, null, decoy_variations, decoy_disulfides, proteinDbLocation);

                                        result.Add(decoy_protein);
                                        break;

                                    case DecoyType.Slide:
                                        int    numSlides = 20;
                                        char[] sequence_array_unslided = sequence.ToCharArray();
                                        char[] sequence_array_slided   = sequence.ToCharArray();
                                        decoy_modifications = null;
                                        List <DisulfideBond> decoy_disulfides_slide = new List <DisulfideBond>();
                                        if (sequence.StartsWith("M", StringComparison.Ordinal))
                                        {
                                            // Do not include the initiator methionine in shuffle!!!
                                            if (numSlides % sequence_array_slided.Length - 1 == 0)
                                            {
                                                numSlides++;
                                            }
                                            for (int i = 1; i < sequence_array_slided.Length; i++)
                                            {
                                                sequence_array_slided[i] = sequence_array_unslided[GetOldShuffleIndex(i, numSlides, sequence.Length, true)];
                                            }

                                            decoy_modifications = new Dictionary <int, List <Modification> >(oneBasedModifications.Count);
                                            foreach (var kvp in oneBasedModifications)
                                            {
                                                if (kvp.Key > 1)
                                                {
                                                    decoy_modifications.Add(GetOldShuffleIndex(kvp.Key - 1, numSlides, sequence.Length, true) + 1, kvp.Value);
                                                }
                                                else if (kvp.Key == 1)
                                                {
                                                    decoy_modifications.Add(1, kvp.Value);
                                                }
                                            }
                                        }
                                        else
                                        {
                                            if (numSlides % sequence_array_slided.Length == 0)
                                            {
                                                numSlides++;
                                            }
                                            for (int i = 0; i < sequence_array_slided.Length; i++)
                                            {
                                                sequence_array_slided[i] = sequence_array_unslided[GetOldShuffleIndex(i, numSlides, sequence.Length, false)];
                                            }
                                            decoy_modifications = new Dictionary <int, List <Modification> >(oneBasedModifications.Count);
                                            foreach (var kvp in oneBasedModifications)
                                            {
                                                decoy_modifications.Add(GetOldShuffleIndex(kvp.Key - 1, numSlides, sequence.Length, false) + 1, kvp.Value);
                                            }
                                        }
                                        var slided_sequence = new string(sequence_array_slided);

                                        List <ProteolysisProduct> decoyPP_slide = new List <ProteolysisProduct>();
                                        foreach (ProteolysisProduct pp in proteolysisProducts)              //can't keep all aa like you can with reverse, just keep it the same length
                                        {
                                            decoyPP_slide.Add(pp);
                                        }
                                        foreach (DisulfideBond disulfideBond in disulfideBonds)             //these actually need the same cysteines...
                                        {
                                            decoy_disulfides_slide.Add(new DisulfideBond(GetOldShuffleIndex(disulfideBond.OneBasedBeginPosition - 1, numSlides, slided_sequence.Length, false) + 1, GetOldShuffleIndex(disulfideBond.OneBasedEndPosition - 1, numSlides, slided_sequence.Length, false) + 1, "DECOY DISULFIDE BOND: " + disulfideBond.Description));
                                        }
                                        List <SequenceVariation> decoy_variations_slide = new List <SequenceVariation>();
                                        foreach (SequenceVariation sv in sequenceVariations)             //No idea what's going on here. Review is appreciated.
                                        {
                                            char[] original_array_unshuffled = sv.OriginalSequence.ToArray();
                                            char[] variation_array_unslided  = sv.VariantSequence.ToArray();
                                            if (sv.OneBasedBeginPosition == 1)
                                            {
                                                bool orig_init_m = sv.OriginalSequence.StartsWith("M", StringComparison.Ordinal);
                                                bool var_init_m  = sv.VariantSequence.StartsWith("M", StringComparison.Ordinal);
                                                if (orig_init_m && !var_init_m)
                                                {
                                                    decoy_variations_slide.Add(new SequenceVariation(1, "M", "", "DECOY VARIANT: Initiator Methionine Change in " + sv.Description));
                                                }
                                                original_array_unshuffled = sv.OriginalSequence.Substring(Convert.ToInt32(orig_init_m)).ToArray();
                                                variation_array_unslided  = sv.VariantSequence.Substring(Convert.ToInt32(var_init_m)).ToArray();
                                            }
                                            int    decoy_end              = sequence.Length - sv.OneBasedBeginPosition + 2 + Convert.ToInt32(sv.OneBasedEndPosition == slided_sequence.Length) - Convert.ToInt32(sv.OneBasedBeginPosition == 1);
                                            int    decoy_begin            = decoy_end - original_array_unshuffled.Length + 1;
                                            char[] original_array_slided  = sv.OriginalSequence.ToArray();
                                            char[] variation_array_slided = sv.VariantSequence.ToArray();

                                            if (numSlides % original_array_slided.Length == 0)
                                            {
                                                numSlides++;
                                            }
                                            for (int i = 0; i < original_array_slided.Length; i++)
                                            {
                                                original_array_slided[i] = original_array_unshuffled[GetOldShuffleIndex(i, numSlides, original_array_unshuffled.Length, false)];
                                            }

                                            if (numSlides % variation_array_slided.Length == 0)
                                            {
                                                numSlides++;
                                            }
                                            for (int i = 0; i < variation_array_slided.Length; i++)
                                            {
                                                variation_array_slided[i] = variation_array_unslided[GetOldShuffleIndex(i, numSlides, variation_array_unslided.Length, false)];
                                            }

                                            decoy_variations_slide.Add(new SequenceVariation(decoy_begin, decoy_end, new string(original_array_slided), new string(variation_array_slided), "DECOY VARIANT: " + sv.Description));
                                        }
                                        var decoy_protein_slide = new Protein(slided_sequence, "DECOY_" + accession, organism, gene_names, decoy_modifications, decoyPP_slide, name, full_name, true, IsContaminant, null, decoy_variations_slide, decoy_disulfides_slide, proteinDbLocation);

                                        result.Add(decoy_protein_slide);
                                        break;

                                    default:
                                        break;
                                    }
                                }
                                accession                = null;
                                name                     = null;
                                full_name                = null;
                                sequence                 = null;
                                organism                 = null;
                                feature_type             = null;
                                feature_description      = null;
                                original_value           = "";
                                variation_value          = "";
                                dbReference_type         = null;
                                dbReference_id           = null;
                                property_types           = new List <string>();
                                property_values          = new List <string>();
                                oneBasedfeature_position = -1;
                                oneBasedModifications    = new Dictionary <int, List <Modification> >();
                                proteolysisProducts      = new List <ProteolysisProduct>();
                                sequenceVariations       = new List <SequenceVariation>();
                                disulfideBonds           = new List <DisulfideBond>();
                                databaseReferences       = new List <DatabaseReference>();
                                gene_names               = new List <Tuple <string, string> >();
                                reading_gene             = false;
                                reading_organism         = false;
                                break;
                            }
                            break;
                        }
                    }
                }
            }
            return(result);
        }
コード例 #9
0
        /// <summary>
        /// Load a protein fasta database, using regular expressions to get various aspects of the headers. The first regex capture group is used as each field.
        /// </summary>
        /// <param name="proteinDbLocation">Path of the fasta file. A path ending in ".gz" is read through a gzip decompression stream.</param>
        /// <param name="originalTarget">When true, the protein parsed from each entry is added to the result (zero-length sequences are reported in <paramref name="errors"/> instead).</param>
        /// <param name="onTheFlyDecoys">Kind of decoy to create for each entry: reversed, slided, or none.</param>
        /// <param name="IsContaminant">Contaminant flag passed to every created protein.</param>
        /// <param name="accession_expression">Header regex for the accession. If it yields null or an empty string, the header text after '>' is used instead.</param>
        /// <param name="full_name_expression">Header regex for the full name.</param>
        /// <param name="name_expression">Header regex for the name.</param>
        /// <param name="gene_expression">Header regex for the gene name, stored under the key "primary".</param>
        /// <param name="organism_expression">Header regex for the organism. May be null, in which case the organism is left null.</param>
        /// <param name="errors">Receives a message for every skipped zero-length target and a final message when no protein at all was produced.</param>
        /// <returns>The created proteins in file order; for each entry the target (if requested) precedes its decoy.</returns>
        public static List <Protein> LoadProteinFasta(string proteinDbLocation, bool originalTarget, DecoyType onTheFlyDecoys, bool IsContaminant, Regex accession_expression, Regex full_name_expression, Regex name_expression, Regex gene_expression, Regex organism_expression, out List <string> errors)
        {
            HashSet <string> unique_accessions = new HashSet <string>();
            int    unique_identifier           = 1;
            string accession = null;
            string name      = null;
            string full_name = null;
            string organism  = null;
            List <Tuple <string, string> > gene_name = new List <Tuple <string, string> >();

            errors = new List <string>();
            Regex substituteWhitespace = new Regex(@"\s+");

            List <Protein> result = new List <Protein>();

            // Disposing the reader and the (possibly gzip) wrapper releases the decompressor as well as the file handle.
            // When the file is not gzipped the wrapper IS the file stream; disposing it twice is harmless.
            using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (Stream fastaFileStream = proteinDbLocation.EndsWith(".gz", StringComparison.Ordinal) ?
                                            (Stream)(new GZipStream(stream, CompressionMode.Decompress)) :
                                            stream)
            using (StreamReader fasta = new StreamReader(fastaFileStream))
            {
                StringBuilder sb = null;

                while (true)
                {
                    string line = fasta.ReadLine();
                    if (line == null)
                    {
                        break;
                    }

                    if (line.StartsWith(">", StringComparison.Ordinal))
                    {
                        // Header line: run every regex once and reuse the match.
                        var accession_match = accession_expression.Match(line);
                        var full_name_match = full_name_expression.Match(line);
                        var name_match      = name_expression.Match(line);
                        var gene_name_match = gene_expression.Match(line);
                        if (organism_expression != null)
                        {
                            var organism_match = organism_expression.Match(line);
                            if (organism_match.Groups.Count > 1)
                            {
                                organism = organism_match.Groups[1].Value;
                            }
                        }

                        if (accession_match.Groups.Count > 1)
                        {
                            accession = accession_match.Groups[1].Value;
                        }
                        if (full_name_match.Groups.Count > 1)
                        {
                            full_name = full_name_match.Groups[1].Value;
                        }
                        if (name_match.Groups.Count > 1)
                        {
                            name = name_match.Groups[1].Value;
                        }
                        if (gene_name_match.Groups.Count > 1)
                        {
                            gene_name.Add(new Tuple <string, string>("primary", gene_name_match.Groups[1].Value));
                        }

                        // Fall back to the whole header (without '>') when no accession could be extracted.
                        if (string.IsNullOrEmpty(accession))
                        {
                            accession = line.Substring(1).TrimEnd();
                        }

                        sb = new StringBuilder();
                    }
                    else if (sb != null)
                    {
                        // Sequence line belonging to the current entry.
                        sb.Append(line.Trim());
                    }

                    // Look one character ahead: a '>' or the end of input closes the current entry.
                    int nextChar = fasta.Peek();

                    if ((nextChar == '>' || nextChar == -1) && accession != null && sb != null)
                    {
                        string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                        // Make the accession unique by appending "_<counter>" until it has not been seen before.
                        while (unique_accessions.Contains(accession))
                        {
                            accession += "_" + unique_identifier.ToString();
                            unique_identifier++;
                        }
                        unique_accessions.Add(accession);

                        if (originalTarget)
                        {
                            Protein protein = new Protein(sequence, accession, organism, gene_name, name: name, full_name: full_name, isContaminant: IsContaminant, databaseFilePath: proteinDbLocation);
                            if (protein.Length == 0)
                            {
                                errors.Add("Line" + line + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                            }
                            else
                            {
                                result.Add(protein);
                            }
                        }

                        switch (onTheFlyDecoys)
                        {
                        case DecoyType.Reverse:
                            char[] sequence_array  = sequence.ToCharArray();
                            int    starts_with_met = sequence.StartsWith("M", StringComparison.Ordinal) ? 1 : 0;
                            Array.Reverse(sequence_array, starts_with_met, sequence.Length - starts_with_met);     // Do not include the initiator methionine in reversal!!!
                            var     reversed_sequence = new string(sequence_array);
                            Protein decoy_protein     = new Protein(reversed_sequence, "DECOY_" + accession, organism, gene_name, name: name, full_name: full_name, isDecoy: true, isContaminant: IsContaminant, databaseFilePath: proteinDbLocation);
                            result.Add(decoy_protein);
                            break;

                        case DecoyType.Slide:
                            int    numSlides = 20;
                            char[] sequence_array_unslide = sequence.ToCharArray();
                            char[] sequence_array_slide   = sequence.ToCharArray();
                            bool   starts_with_met_slide  = sequence.StartsWith("M", StringComparison.Ordinal);
                            // A leading "M" stays at index 0; every other position is filled from the index GetOldShuffleIndex returns.
                            for (int i = starts_with_met_slide ? 1 : 0; i < sequence.Length; i++)
                            {
                                sequence_array_slide[i] = sequence_array_unslide[GetOldShuffleIndex(i, numSlides, sequence.Length, starts_with_met_slide)];
                            }
                            string  slide_sequence      = new string(sequence_array_slide);
                            Protein decoy_protein_slide = new Protein(slide_sequence, "DECOY_" + accession, organism, gene_name, name: name, full_name: full_name, isDecoy: true, isContaminant: IsContaminant, databaseFilePath: proteinDbLocation);
                            result.Add(decoy_protein_slide);
                            break;

                        default:
                            break;
                        }

                        // Reset the per-entry state before the next header.
                        accession = null;
                        name      = null;
                        full_name = null;
                        organism  = null;
                        gene_name = new List <Tuple <string, string> >();
                    }

                    // no input left
                    if (nextChar == -1)
                    {
                        break;
                    }
                }
            }
            if (!result.Any())
            {
                errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
            }
            return(result);
        }
コード例 #10
0
ファイル: ProteinDbLoader.cs プロジェクト: ianhirsch/mzLib
        /// <summary>
        /// Load the proteins of an XML protein database and, depending on <paramref name="decoyType"/>, decoys generated from them.
        /// </summary>
        /// <param name="proteinDbLocation">Path of the XML file. A path ending in "gz" is read through a gzip decompression stream.</param>
        /// <param name="generateTargets">When true, the proteins read from the file are part of the returned list.</param>
        /// <param name="decoyType">Passed to the decoy generator; decoys are part of the returned list unless this is <c>DecoyType.None</c>.</param>
        /// <param name="allKnownModifications">Modifications combined with those listed in the file itself; null is treated as empty.</param>
        /// <param name="isContaminant">Contaminant flag forwarded to the entry parser.</param>
        /// <param name="modTypesToExclude">Modification types forwarded to the entry parser; null is treated as empty.</param>
        /// <param name="unknownModifications">Dictionary handed to the entry parser, which may add entries to it.</param>
        /// <param name="maxThreads">Forwarded to the decoy generator.</param>
        /// <returns>Targets (if requested) followed by decoys (if requested).</returns>
        public static List <Protein> LoadProteinXML(string proteinDbLocation, bool generateTargets, DecoyType decoyType, IEnumerable <Modification> allKnownModifications,
                                                    bool isContaminant, IEnumerable <string> modTypesToExclude, out Dictionary <string, Modification> unknownModifications, int maxThreads = -1)
        {
            List <Modification> prespecified = GetPtmListFromProteinXml(proteinDbLocation);

            allKnownModifications = allKnownModifications ?? new List <Modification>();
            modTypesToExclude     = modTypesToExclude ?? new List <string>();

            // Materialize the combined modifications once so the caller's sequence is enumerated a single time.
            List <Modification> combinedModifications = prespecified.Concat(allKnownModifications).ToList();
            if (combinedModifications.Count > 0)
            {
                // Each lookup builder receives its own set, as before.
                IdToPossibleMods = GetModificationDict(new HashSet <Modification>(combinedModifications));
                IdWithMotifToMod = GetModificationDictWithMotifs(new HashSet <Modification>(combinedModifications));
            }

            List <Protein> targets = new List <Protein>();

            unknownModifications = new Dictionary <string, Modification>();

            // XmlReader.Create(Stream) leaves its input open by default, so the (possibly gzip) wrapper is disposed explicitly.
            // When the file is not gzipped the wrapper IS the file stream; disposing it twice is harmless.
            using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (Stream uniprotXmlFileStream = proteinDbLocation.EndsWith("gz") ? // allow for .bgz and .tgz, which are (rarely) used
                                                 (Stream)(new GZipStream(stream, CompressionMode.Decompress)) :
                                                 stream)
            using (XmlReader xml = XmlReader.Create(uniprotXmlFileStream))
            {
                ProteinXmlEntry block = new ProteinXmlEntry();

                while (xml.Read())
                {
                    if (xml.NodeType == XmlNodeType.Element)
                    {
                        block.ParseElement(xml.Name, xml);
                    }
                    // An empty element (<x/>) produces no separate EndElement node, so it is closed right away.
                    if (xml.NodeType == XmlNodeType.EndElement || xml.IsEmptyElement)
                    {
                        var newProteinEntries = block.ParseEndElement(xml, modTypesToExclude, unknownModifications,
                                                                      isContaminant, proteinDbLocation);
                        targets.AddRange(newProteinEntries);
                    }
                }
            }

            List <Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);

            return((generateTargets ? targets : new List <Protein>()).Concat(decoyType != DecoyType.None ? decoys : new List <Protein>()).ToList());
        }
コード例 #11
0
ファイル: ProteinDbLoader.cs プロジェクト: ianhirsch/mzLib
        /// <summary>
        /// Load a protein fasta database, using regular expressions to get various aspects of the headers. The first regex capture group is used as each field.
        /// </summary>
        /// <param name="proteinDbLocation">Path of the fasta file. A path ending in "gz" is read through a gzip decompression stream.</param>
        /// <param name="generateTargets">When true, the proteins read from the file are part of the returned list.</param>
        /// <param name="decoyType">Passed to the decoy generator; decoys are part of the returned list unless this is <c>DecoyType.None</c>.</param>
        /// <param name="isContaminant">Contaminant flag passed to every created protein.</param>
        /// <param name="accessionRegex">Header regex for the accession. If it yields null or an empty string, the header text after '>' is used instead.</param>
        /// <param name="fullNameRegex">Header regex for the full name.</param>
        /// <param name="nameRegex">Header regex for the name.</param>
        /// <param name="geneNameRegex">Header regex for the gene name, stored under the key "primary" when it yields a non-null value.</param>
        /// <param name="organismRegex">Header regex for the organism.</param>
        /// <param name="errors">Receives a message for every skipped zero-length protein and a final message when no protein was read.</param>
        /// <param name="maxThreads">Forwarded to the decoy generator.</param>
        /// <returns>Targets (if requested) followed by decoys (if requested).</returns>
        public static List <Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
                                                      FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
                                                      FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List <string> errors, int maxThreads = -1)
        {
            HashSet <string> unique_accessions = new HashSet <string>();
            int    unique_identifier           = 1;
            string accession = null;
            string name      = null;
            string fullName  = null;
            string organism  = null;
            List <Tuple <string, string> > geneName = new List <Tuple <string, string> >();

            errors = new List <string>();
            Regex substituteWhitespace = new Regex(@"\s+");

            List <Protein> targets = new List <Protein>();

            // Disposing the reader and the (possibly gzip) wrapper releases the decompressor as well as the file handle.
            // When the file is not gzipped the wrapper IS the file stream; disposing it twice is harmless.
            using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal) ? // allow for .bgz and .tgz, which are (rarely) used
                                            (Stream)(new GZipStream(stream, CompressionMode.Decompress)) :
                                            stream)
            using (StreamReader fasta = new StreamReader(fastaFileStream))
            {
                StringBuilder sb = null;

                while (true)
                {
                    string line = fasta.ReadLine();
                    if (line == null)
                    {
                        break;
                    }

                    if (line.StartsWith(">", StringComparison.Ordinal))
                    {
                        // Header line: extract every field of the new entry.
                        accession = ApplyRegex(accessionRegex, line);
                        fullName  = ApplyRegex(fullNameRegex, line);
                        name      = ApplyRegex(nameRegex, line);
                        organism  = ApplyRegex(organismRegex, line);
                        string geneNameString = ApplyRegex(geneNameRegex, line);
                        if (geneNameString != null)
                        {
                            geneName.Add(new Tuple <string, string>("primary", geneNameString));
                        }

                        // Fall back to the whole header (without '>') when no accession could be extracted.
                        if (string.IsNullOrEmpty(accession))
                        {
                            accession = line.Substring(1).TrimEnd();
                        }

                        sb = new StringBuilder();
                    }
                    else if (sb != null)
                    {
                        // Sequence line belonging to the current entry.
                        sb.Append(line.Trim());
                    }

                    // Look one character ahead: a '>' or the end of input closes the current entry.
                    int nextChar = fasta.Peek();

                    if ((nextChar == '>' || nextChar == -1) && accession != null && sb != null)
                    {
                        string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                        // Make the accession unique by appending "_<counter>" until it has not been seen before.
                        while (unique_accessions.Contains(accession))
                        {
                            accession += "_" + unique_identifier.ToString();
                            unique_identifier++;
                        }
                        unique_accessions.Add(accession);
                        Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName,
                                                      isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                        if (protein.Length == 0)
                        {
                            errors.Add("Line" + line + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                        }
                        else
                        {
                            targets.Add(protein);
                        }

                        // Reset the per-entry state before the next header.
                        accession = null;
                        name      = null;
                        fullName  = null;
                        organism  = null;
                        geneName  = new List <Tuple <string, string> >();
                    }

                    // no input left
                    if (nextChar == -1)
                    {
                        break;
                    }
                }
            }
            if (!targets.Any())
            {
                errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
            }

            // Decoys are always generated from the targets; whether each list is returned is decided below.
            List <Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);

            return((generateTargets ? targets : new List <Protein>()).Concat(decoyType != DecoyType.None ? decoys : new List <Protein>()).ToList());
        }
コード例 #12
0
        [TestCase(9, 0, true, "EDITPEPEDITP2PPP", DecoyType.Reverse)] // MPEPPP becomes MPPPEP with the variant beginning at position 2
        public static void SearchTests(int proteinIdx, int peptideIdx, bool containsVariant, string variantPsmShort, DecoyType decoyType = DecoyType.None)
        {
            // End-to-end check: write the chosen variant protein as an XML database, write an mzML built from one of
            // its variant peptides, run a search task over both, and look for the expected text in AllPSMs.psmtsv.
            Stopwatch stopwatch = Stopwatch.StartNew();

            // Make sure can run the complete search task when multiple compact peptides may correspond to a single PWSM
            SearchTask st = new SearchTask
            {
                SearchParameters = new SearchParameters
                {
                    DoParsimony             = true,
                    DecoyType               = decoyType,
                    SearchTarget            = decoyType == DecoyType.None,
                    ModPeptidesAreDifferent = false
                },
                CommonParameters = new CommonParameters(scoreCutoff: 1, digestionParams: new DigestionParams(minPeptideLength: 2), precursorMassTolerance: new PpmTolerance(20)),
            };

            ModificationMotif.TryGetMotif("V", out ModificationMotif motifV);
            Modification mv = new Modification("mod", null, "type", null, motifV, "Anywhere.", null, 42.01, new Dictionary <string, IList <string> >(), null, null, null, null, null);

            ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
            Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary <string, IList <string> >(), null, null, null, null, null);

            // Candidate proteins, selected by proteinIdx; each carries exactly one sequence variation.
            List <Protein> proteins = new List <Protein>
            {
                new Protein("MPEPTIDE", "protein1", sequenceVariations: new List <SequenceVariation> {
                    new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
                }),
                new Protein("MPEPTIDE", "protein2", sequenceVariations: new List <SequenceVariation> {
                    new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
                }),
                new Protein("MPEPTIDE", "protein3", sequenceVariations: new List <SequenceVariation> {
                    new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
                }),
                new Protein("MPEPPPTIDE", "protein3", sequenceVariations: new List <SequenceVariation> {
                    new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
                }),
                new Protein("MPEPKPKTIDE", "protein3", sequenceVariations: new List <SequenceVariation> {
                    new SequenceVariation(4, 7, "PKPK", "PK", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
                }),
                new Protein("MPEPTAIDE", "protein2", sequenceVariations: new List <SequenceVariation> {
                    new SequenceVariation(4, 5, "PTA", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
                }),
                new Protein("MPEKKAIDE", "protein2", sequenceVariations: new List <SequenceVariation> {
                    new SequenceVariation(4, 4, "KKA", "K", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
                }),
                new Protein("MPEPTIDE", "protein1", sequenceVariations: new List <SequenceVariation> {
                    new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary <int, List <Modification> > {
                        { 4, new[] { mv }.ToList() }
                    })
                }),
                new Protein("MPEPTIDE", "protein3", sequenceVariations: new List <SequenceVariation> {
                    new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary <int, List <Modification> > {
                        { 5, new[] { mp }.ToList() }
                    })
                }),
                new Protein("MPEPTIDEPEPTIDE", "protein3", sequenceVariations: new List <SequenceVariation> {
                    new SequenceVariation(4, 4, "PTIDEPEPTIDE", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
                }),
            };
            PeptideWithSetModifications pep = proteins[proteinIdx].GetVariantProteins().SelectMany(p => p.Digest(CommonParameters.DigestionParams, null, null)).ToList()[peptideIdx];

            // All on-disk artifact names are fixed up front so the cleanup below can run no matter where the test fails.
            string xmlName            = $"andguiaheov{proteinIdx}.xml";
            string mzmlName           = $"ajgdiv{proteinIdx}.mzML";
            string outputFolder       = Path.Combine(TestContext.CurrentContext.TestDirectory, $"TestSearchWithVariants{proteinIdx}");
            string taskSettingsFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"Task Settings");

            try
            {
                ProteinDbWriter.WriteXmlDatabase(null, new List <Protein> {
                    proteins[proteinIdx]
                }, xmlName);

                MsDataFile myMsDataFile = new TestDataFile(new List <PeptideWithSetModifications> {
                    pep
                });

                IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

                Directory.CreateDirectory(outputFolder);

                st.RunTask(outputFolder, new List <DbForTask> {
                    new DbForTask(xmlName, false)
                }, new List <string> {
                    mzmlName
                }, "");
                var psms = File.ReadAllLines(Path.Combine(outputFolder, "AllPSMs.psmtsv"));

                // When no variant is expected, any line containing a tab satisfies the check.
                Assert.IsTrue(psms.Any(line => line.Contains(containsVariant ? variantPsmShort : "\t")));
            }
            finally
            {
                // Clean up even when the search or the assertion fails, so a failed run leaves nothing behind for later runs.
                if (Directory.Exists(outputFolder))
                {
                    Directory.Delete(outputFolder, true);
                }
                File.Delete(mzmlName);   // File.Delete does not throw when the file is missing
                File.Delete(xmlName);
                if (Directory.Exists(taskSettingsFolder))
                {
                    Directory.Delete(taskSettingsFolder, true);
                }
            }

            Console.WriteLine($"Analysis time for VariantSearchTests.SearchTests({proteinIdx},{peptideIdx},{containsVariant},{variantPsmShort}): {stopwatch.Elapsed.Hours}h {stopwatch.Elapsed.Minutes}m {stopwatch.Elapsed.Seconds}s");
        }
コード例 #13
0
        /// <summary>
        /// Load a protein fasta database, using regular expressions to get various aspects of the headers. The first regex capture group is used as each field.
        /// </summary>
        /// <summary>
        /// Reads the proteins of a FASTA file (optionally gzip-compressed) and generates decoys for them.
        /// </summary>
        /// <param name="proteinDbLocation">Path of the FASTA file. A path ending in "gz" (any casing) is read through a <see cref="GZipStream"/>.</param>
        /// <param name="generateTargets">When true, the targets followed by the decoys are returned; when false, only the decoys are returned.</param>
        /// <param name="decoyType">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
        /// <param name="isContaminant">Forwarded to every created <c>Protein</c>.</param>
        /// <param name="accessionRegex">Applied to each header line to obtain the accession; when it yields null or an empty string, the header text after the leading '>' is used instead.</param>
        /// <param name="fullNameRegex">Applied to each header line to obtain the full name.</param>
        /// <param name="nameRegex">Applied to each header line to obtain the name.</param>
        /// <param name="geneNameRegex">Applied to each header line; a non-null result is stored as the "primary" gene name.</param>
        /// <param name="organismRegex">Applied to each header line to obtain the organism.</param>
        /// <param name="errors">Receives one message per skipped zero-length protein, plus one message when no protein at all could be read.</param>
        /// <param name="maxThreads">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
        /// <returns>Targets and decoys, or only decoys, depending on <paramref name="generateTargets"/>.</returns>
        public static List <Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
                                                      FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
                                                      FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List <string> errors, int maxThreads = -1)
        {
            HashSet <string> unique_accessions = new HashSet <string>();
            int    unique_identifier           = 2;//for isoforms. the first will be "accession", the next will be "accession_2"
            string accession = null;
            string name      = null;
            string fullName  = null;
            string organism  = null;
            List <Tuple <string, string> > geneName = new List <Tuple <string, string> >();

            errors = new List <string>();
            Regex substituteWhitespace = new Regex(@"\s+");

            List <Protein> targets = new List <Protein>();

            using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                // allow for .bgz and .tgz, which are (rarely) used; ordinal and case-insensitive so ".GZ" is handled too
                Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.OrdinalIgnoreCase) ?
                                         (Stream)(new GZipStream(stream, CompressionMode.Decompress)) :
                                         stream;

                // Disposing the reader also disposes the decompression stream it wraps.
                using (StreamReader fasta = new StreamReader(fastaFileStream))
                {
                    StringBuilder sb = null;

                    // Read one line ahead instead of calling StreamReader.Peek(): Peek() may return -1
                    // on non-seekable streams (such as GZipStream) even though more data is available,
                    // which would end a record, and the whole file, prematurely.
                    string line = fasta.ReadLine();
                    while (line != null)
                    {
                        if (line.StartsWith(">", StringComparison.Ordinal))
                        {
                            // header line: start a new record
                            accession = ApplyRegex(accessionRegex, line);
                            fullName  = ApplyRegex(fullNameRegex, line);
                            name      = ApplyRegex(nameRegex, line);
                            organism  = ApplyRegex(organismRegex, line);
                            string geneNameString = ApplyRegex(geneNameRegex, line);
                            if (geneNameString != null)
                            {
                                geneName.Add(new Tuple <string, string>("primary", geneNameString));
                            }

                            // fall back to the whole header (without '>') when no accession was extracted
                            if (string.IsNullOrEmpty(accession))
                            {
                                accession = line.Substring(1).TrimEnd();
                            }

                            sb = new StringBuilder();
                        }
                        else if (sb != null)
                        {
                            // sequence line; lines before the first header are ignored because sb is still null
                            sb.Append(line.Trim());
                        }

                        string nextLine   = fasta.ReadLine();
                        bool   recordEnds = nextLine == null || nextLine.StartsWith(">", StringComparison.Ordinal);

                        if (recordEnds && accession != null && sb != null)
                        {
                            string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                            // sanitize the sequence to replace unexpected characters with X (unknown amino acid)
                            // sometimes strange characters get added by RNA sequencing software, etc.
                            sequence = SanitizeAminoAcidSequence(sequence, 'X');

                            if (unique_accessions.Contains(accession))                                  //this will happen for isoforms
                            {
                                string originalAccession = accession;                                   //save the original
                                accession += "_" + unique_identifier.ToString();                        //add a number onto it
                                while (unique_accessions.Contains(accession))                           //if that number was already added
                                {
                                    unique_identifier++;                                                //keep increasing it
                                    accession = originalAccession + "_" + unique_identifier.ToString(); //try the new number
                                }
                                unique_identifier = 2;                                                  //reset
                            }
                            unique_accessions.Add(accession);
                            Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName,
                                                          isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                            if (protein.Length == 0)
                            {
                                // "line" is the last line that belonged to the skipped record
                                errors.Add("Line" + line + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                            }
                            else
                            {
                                targets.Add(protein);
                            }

                            // clear the per-record state; a fresh list is used because the protein keeps the old one
                            accession = null;
                            name      = null;
                            fullName  = null;
                            organism  = null;
                            geneName  = new List <Tuple <string, string> >();
                        }

                        line = nextLine;
                    }
                }
            }
            if (!targets.Any())
            {
                errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
            }
            List <Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);

            return(generateTargets ? targets.Concat(decoys).ToList() : decoys);
        }
コード例 #14
0
        /// <summary>
        /// Loads the proteins of a FASTA database (".fasta" or ".fa", optionally with a trailing
        /// compression extension ending in "gz") and drops entries with an empty base sequence.
        /// Any other extension yields an empty list.
        /// </summary>
        /// <param name="fileName">Path of the database file; its extension decides whether it is loaded.</param>
        /// <param name="generateTargets">Forwarded to <c>ProteinDbLoader.LoadProteinFasta</c>.</param>
        /// <param name="decoyType">Forwarded to <c>ProteinDbLoader.LoadProteinFasta</c>.</param>
        /// <param name="MaxThreadsToUse">Forwarded to <c>ProteinDbLoader.LoadProteinFasta</c>.</param>
        /// <returns>A new list holding the loaded proteins whose base sequence is not empty.</returns>
        public static List <Protein> LoadProteinDb(string fileName, bool generateTargets, DecoyType decoyType, int MaxThreadsToUse)
        {
            List <string>  loaderErrors = new List <string>();
            List <Protein> loaded       = new List <Protein>();

            string extension = Path.GetExtension(fileName).ToLowerInvariant();

            // allows for .bgz and .tgz, too which are used on occasion;
            // for a compressed file the extension in front of the compression suffix is the one that counts
            if (extension.EndsWith("gz"))
            {
                string innerName = Path.GetFileNameWithoutExtension(fileName);
                extension = Path.GetExtension(innerName).ToLowerInvariant();
            }

            bool isFasta = extension == ".fasta" || extension == ".fa";
            if (isFasta)
            {
                // NOTE(review): UniprotFullNameRegex is passed for both the full-name and the name regex,
                // and the reported loader errors are not surfaced to the caller - confirm both are intended.
                loaded = ProteinDbLoader.LoadProteinFasta(
                    fileName,
                    generateTargets,
                    decoyType,
                    false,
                    ProteinDbLoader.UniprotAccessionRegex,
                    ProteinDbLoader.UniprotFullNameRegex,
                    ProteinDbLoader.UniprotFullNameRegex,
                    ProteinDbLoader.UniprotGeneNameRegex,
                    ProteinDbLoader.UniprotOrganismRegex,
                    out loaderErrors,
                    MaxThreadsToUse);
            }

            // keep only proteins that actually have a sequence
            List <Protein> nonEmpty = new List <Protein>();
            foreach (Protein candidate in loaded)
            {
                if (candidate.BaseSequence.Length > 0)
                {
                    nonEmpty.Add(candidate);
                }
            }

            return(nonEmpty);
        }