/// <summary>
/// Creates an indexing engine from a protein list, modification lists, product types and digestion settings.
/// </summary>
/// <param name="proteinList">Proteins stored on <c>ProteinList</c>.</param>
/// <param name="variableModifications">Modifications stored on <c>VariableModifications</c>.</param>
/// <param name="fixedModifications">Modifications stored on <c>FixedModifications</c>.</param>
/// <param name="productTypes">Product types stored on <c>ProductTypes</c>.</param>
/// <param name="currentPartition">Partition index from the caller; it is stored incremented by one.</param>
/// <param name="decoyType">Decoy type stored on <c>DecoyType</c>.</param>
/// <param name="collectionOfDigestionParams">Digestion settings stored on <c>CollectionOfDigestionParams</c>.</param>
/// <param name="commonParams">Forwarded to the base constructor.</param>
/// <param name="maxFragmentSize">Value stored on <c>MaxFragmentSize</c>.</param>
/// <param name="nestedIds">Forwarded to the base constructor.</param>
public IndexingEngine(
    List<Protein> proteinList,
    List<ModificationWithMass> variableModifications,
    List<ModificationWithMass> fixedModifications,
    List<ProductType> productTypes,
    int currentPartition,
    DecoyType decoyType,
    IEnumerable<DigestionParams> collectionOfDigestionParams,
    CommonParameters commonParams,
    double maxFragmentSize,
    List<string> nestedIds)
    : base(commonParams, nestedIds)
{
    // Search space: what gets indexed.
    ProteinList = proteinList;
    FixedModifications = fixedModifications;
    VariableModifications = variableModifications;
    DecoyType = decoyType;

    // Fragmentation and digestion settings.
    ProductTypes = productTypes;
    CollectionOfDigestionParams = collectionOfDigestionParams;
    MaxFragmentSize = maxFragmentSize;

    // The caller passes the partition index; the stored value is one higher.
    CurrentPartition = currentPartition + 1;
}
/// <summary>
/// Creates an indexing engine, copying every argument except <paramref name="nestedIds"/> into the
/// instance member of the same name.
/// </summary>
/// <param name="proteinList">Proteins to index.</param>
/// <param name="variableModifications">Variable modifications.</param>
/// <param name="fixedModifications">Fixed modifications.</param>
/// <param name="lp">Product types.</param>
/// <param name="currentPartition">Partition index from the caller; it is stored incremented by one.</param>
/// <param name="decoyType">Decoy type.</param>
/// <param name="CollectionOfDigestionParams">Digestion settings.</param>
/// <param name="commonParams">Common parameters; stored on the instance, not passed to the base constructor.</param>
/// <param name="maxFragmentSize">Maximum fragment size.</param>
/// <param name="nestedIds">Forwarded to the base constructor.</param>
public IndexingEngine(
    List<Protein> proteinList,
    List<ModificationWithMass> variableModifications,
    List<ModificationWithMass> fixedModifications,
    List<ProductType> lp,
    int currentPartition,
    DecoyType decoyType,
    IEnumerable<IDigestionParams> CollectionOfDigestionParams,
    ICommonParameters commonParams,
    double maxFragmentSize,
    List<string> nestedIds)
    : base(nestedIds)
{
    // Shared parameters.
    this.commonParams = commonParams;
    this.CollectionOfDigestionParams = CollectionOfDigestionParams;

    // Search space.
    this.proteinList = proteinList;
    this.fixedModifications = fixedModifications;
    this.variableModifications = variableModifications;
    this.decoyType = decoyType;

    // Fragment settings.
    this.lp = lp;
    this.maxFragmentSize = maxFragmentSize;

    // The caller passes the partition index; the stored value is one higher.
    this.currentPartition = currentPartition + 1;
}
/// <summary>
/// Creates an indexing engine from a protein list, modification lists, SILAC labels and the
/// source database files.
/// </summary>
/// <param name="proteinList">Proteins stored on <c>ProteinList</c>.</param>
/// <param name="variableModifications">Modifications stored on <c>VariableModifications</c>.</param>
/// <param name="fixedModifications">Modifications stored on <c>FixedModifications</c>.</param>
/// <param name="silacLabels">Labels stored on <c>SilacLabels</c>.</param>
/// <param name="currentPartition">Partition index from the caller; it is stored incremented by one.</param>
/// <param name="decoyType">Decoy type stored on <c>DecoyType</c>.</param>
/// <param name="commonParams">Forwarded to the base constructor.</param>
/// <param name="maxFragmentSize">Value stored on <c>MaxFragmentSize</c>.</param>
/// <param name="generatePrecursorIndex">Flag stored on <c>GeneratePrecursorIndex</c>.</param>
/// <param name="proteinDatabases">Files stored on <c>ProteinDatabases</c>.</param>
/// <param name="nestedIds">Forwarded to the base constructor.</param>
public IndexingEngine(
    List<Protein> proteinList,
    List<Modification> variableModifications,
    List<Modification> fixedModifications,
    List<SilacLabel> silacLabels,
    int currentPartition,
    DecoyType decoyType,
    CommonParameters commonParams,
    double maxFragmentSize,
    bool generatePrecursorIndex,
    List<FileInfo> proteinDatabases,
    List<string> nestedIds)
    : base(commonParams, nestedIds)
{
    // Inputs that describe where the proteins came from and what they are.
    this.ProteinDatabases = proteinDatabases;
    ProteinList = proteinList;
    DecoyType = decoyType;

    // Modifications and labels.
    FixedModifications = fixedModifications;
    VariableModifications = variableModifications;
    SilacLabels = silacLabels;

    // Index options.
    GeneratePrecursorIndex = generatePrecursorIndex;
    MaxFragmentSize = maxFragmentSize;

    // The caller passes the partition index; the stored value is one higher.
    CurrentPartition = currentPartition + 1;
}
/// <summary>
/// Generates decoys for a list of proteins by dispatching on the requested decoy type.
/// </summary>
/// <param name="proteins">Proteins to generate decoys for.</param>
/// <param name="decoyType">
/// Kind of decoy to generate. <c>None</c> yields an empty list; <c>Reverse</c> and <c>Slide</c>
/// are delegated to <c>GenerateReverseDecoys</c> and <c>GenerateSlideDecoys</c>.
/// </param>
/// <param name="maxThreads">Passed through unchanged to the decoy generator; defaults to -1.</param>
/// <returns>The generated decoys, or a new empty list when <paramref name="decoyType"/> is <c>None</c>.</returns>
/// <exception cref="ArgumentException">The decoy type is not one of the handled values.</exception>
public static List<Protein> GenerateDecoys(List<Protein> proteins, DecoyType decoyType, int maxThreads = -1)
{
    switch (decoyType)
    {
        case DecoyType.None:
            return new List<Protein>();

        case DecoyType.Reverse:
            return GenerateReverseDecoys(proteins, maxThreads);

        case DecoyType.Slide:
            return GenerateSlideDecoys(proteins, maxThreads);

        default:
            throw new ArgumentException("Decoy type " + decoyType.ToString() + " is not implemented.");
    }
}
/// <summary>
/// Loads a protein XML file from the test data folder, builds a one-peptide mzML file from a
/// digested peptide of the second loaded protein, and runs a complete search task against it.
/// </summary>
/// <remarks>
/// The test makes no explicit assertion: it passes when the search task finishes and the
/// "AllPSMs.psmtsv" output can be read. Temporary artifacts are removed even when the task throws.
/// </remarks>
/// <param name="filename">Name of the protein XML file inside the "TestData" folder.</param>
/// <param name="decoyType">Decoy type used for loading and searching; targets are only loaded and searched when it is <c>None</c>.</param>
public void MoreTests(string filename, DecoyType decoyType = DecoyType.None)
{
    string xmlName = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData", filename);
    var proteins = ProteinDbLoader.LoadProteinXML(xmlName, decoyType == DecoyType.None, decoyType, null, false, null, out _);

    // Second protein in the database, second-to-last peptide of its digest.
    var peps = proteins[1].Digest(CommonParameters.DigestionParams, null, null).ToList();
    PeptideWithSetModifications pep = peps[peps.Count - 2];

    string mzmlName = $"ajgdiv{filename}{decoyType}.mzML";
    string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, $"TestSearchWithVariants{filename}{decoyType}");

    try
    {
        MsDataFile myMsDataFile = new TestDataFile(new List<PeptideWithSetModifications> { pep });
        IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

        Directory.CreateDirectory(outputFolder);

        SearchTask st = new SearchTask
        {
            SearchParameters = new SearchParameters
            {
                DoParsimony = true,
                DecoyType = decoyType,
                SearchTarget = decoyType == DecoyType.None,
                ModPeptidesAreDifferent = false
            },
            CommonParameters = new CommonParameters(scoreCutoff: 1, digestionParams: new DigestionParams(minPeptideLength: 2), precursorMassTolerance: new PpmTolerance(20)),
        };

        st.RunTask(outputFolder, new List<DbForTask> { new DbForTask(xmlName, false) }, new List<string> { mzmlName }, "");

        // Reading the result file doubles as the check that the task produced it:
        // File.ReadAllLines throws when the file is missing.
        File.ReadAllLines(Path.Combine(outputFolder, "AllPSMs.psmtsv"));
    }
    finally
    {
        // Clean up in finally so a failing run does not leave artifacts behind for later tests.
        if (Directory.Exists(outputFolder))
        {
            Directory.Delete(outputFolder, true);
        }

        File.Delete(mzmlName);
    }
}
/// <summary>
/// Copies the supplied configuration onto this decoy, collects opposing heroes when the decoy is a
/// movement decoy, and applies the movement speed to the agent.
/// </summary>
/// <param name="pNumber">Player number; heroes with this number are left out of the enemy list.</param>
/// <param name="decoyLifetime">Stored as the decoy lifetime.</param>
/// <param name="health">Stored as the decoy health.</param>
/// <param name="movementSpeed">Stored, and also assigned to <c>agent.speed</c>.</param>
/// <param name="explosionRadius">Stored as the explosion radius.</param>
/// <param name="explosionDamage">Stored as the explosion damage.</param>
/// <param name="vfxManager">Stored VFX manager reference.</param>
/// <param name="type">Decoy type; <c>MovementDecoy</c> triggers the enemy lookup.</param>
/// <param name="secondsForTarget">Stored in <c>secondsForAction</c>.</param>
/// <param name="targetRadius">Stored as the target radius.</param>
/// <param name="numberOfDecoys">Stored as the number of decoys.</param>
public void InitializeDecoy(int pNumber, float decoyLifetime, float health, float movementSpeed, float explosionRadius, float explosionDamage, VFXManager vfxManager, DecoyType type, float secondsForTarget = 0, float targetRadius = 0, int numberOfDecoys = 0)
{
    // Identity and collaborators.
    this.pNumber = pNumber;
    this.type = type;
    this.vfxManager = vfxManager;

    // Durability and timing.
    this.health = health;
    this.decoyLifetime = decoyLifetime;
    this.secondsForAction = secondsForTarget;
    this.numberOfDecoys = numberOfDecoys;

    // Movement and area values.
    this.movementSpeed = movementSpeed;
    this.targetRadius = targetRadius;
    this.explosionRadius = explosionRadius;
    this.explosionDamage = explosionDamage;

    bool isMovementDecoy = type == DecoyType.MovementDecoy;
    if (isMovementDecoy)
    {
        // Every hero in the scene that belongs to a different player.
        var allHeroes = FindObjectsOfType<Hero>();
        enemiesList = allHeroes.Where(hero => hero.PlayerNumber != pNumber).ToList();
    }

    agent.speed = movementSpeed;
}
/// <summary>
/// Creates a precursor indexing engine. Every argument is handed to the base constructor
/// unchanged and in the same order; the constructor does no work of its own.
/// </summary>
/// <param name="proteinList">Proteins to index.</param>
/// <param name="variableModifications">Variable modifications.</param>
/// <param name="fixedModifications">Fixed modifications.</param>
/// <param name="lp">Product types.</param>
/// <param name="currentPartition">Partition index.</param>
/// <param name="decoyType">Decoy type.</param>
/// <param name="CollectionOfDigestionParams">Digestion settings.</param>
/// <param name="commonParams">Common parameters.</param>
/// <param name="maxFragmentSize">Maximum fragment size.</param>
/// <param name="nestedIds">Nested identifiers.</param>
public PrecursorIndexingEngine(
    List<Protein> proteinList,
    List<ModificationWithMass> variableModifications,
    List<ModificationWithMass> fixedModifications,
    List<ProductType> lp,
    int currentPartition,
    DecoyType decoyType,
    IEnumerable<DigestionParams> CollectionOfDigestionParams,
    CommonParameters commonParams,
    double maxFragmentSize,
    List<string> nestedIds)
    : base(
        proteinList,
        variableModifications,
        fixedModifications,
        lp,
        currentPartition,
        decoyType,
        CollectionOfDigestionParams,
        commonParams,
        maxFragmentSize,
        nestedIds)
{
    // Intentionally empty: all state is set up by the base constructor.
}
/// <summary>
/// Loads proteins from a protein XML database (gzip-compressed when the path ends in ".gz"),
/// optionally emitting the target proteins and, per entry, a reversed or slid decoy protein.
/// </summary>
/// <param name="proteinDbLocation">Path of the XML database to read.</param>
/// <param name="generateTargetProteins">When true, a target protein is added for every complete entry.</param>
/// <param name="decoyType">
/// <c>Reverse</c> or <c>Slide</c> adds one decoy (accession prefixed with "DECOY_") per complete entry;
/// any other value adds no decoys.
/// </param>
/// <param name="allKnownModifications">Modifications used, together with those found in the file, to resolve "modified residue" features. May be null.</param>
/// <param name="IsContaminant">Passed to every created protein.</param>
/// <param name="modTypesToExclude">Modification types that are dropped when resolving known modifications. May be null.</param>
/// <param name="unknownModifications">Receives one placeholder modification (type "unknown") for every feature description that could not be resolved.</param>
/// <returns>Targets and/or decoys in file order; for each entry the target precedes its decoy.</returns>
public static List<Protein> LoadProteinXML(string proteinDbLocation, bool generateTargetProteins, DecoyType decoyType, IEnumerable<Modification> allKnownModifications, bool IsContaminant, IEnumerable<string> modTypesToExclude, out Dictionary<string, Modification> unknownModifications)
{
    List<Modification> prespecified = GetPtmListFromProteinXml(proteinDbLocation);

    // Callers pass null for "no modifications" / "no exclusions"; treat both as empty instead of throwing.
    allKnownModifications = allKnownModifications ?? new List<Modification>();
    modTypesToExclude = modTypesToExclude ?? new List<string>();

    Dictionary<string, IList<Modification>> mod_dict = new Dictionary<string, IList<Modification>>();
    if (prespecified.Count > 0 || allKnownModifications.Any())
    {
        mod_dict = GetModificationDict(new HashSet<Modification>(prespecified.Concat(allKnownModifications)));
    }

    List<Protein> result = new List<Protein>();
    unknownModifications = new Dictionary<string, Modification>();

    Regex substituteWhitespace = new Regex(@"\s+");

    // XmlReader does not close its input by default, so the (possibly gzip) stream is disposed explicitly.
    using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (Stream uniprotXmlFileStream = proteinDbLocation.EndsWith(".gz", StringComparison.Ordinal) ? (Stream)new GZipStream(stream, CompressionMode.Decompress) : stream)
    using (XmlReader xml = XmlReader.Create(uniprotXmlFileStream))
    {
        // State of the entry currently being read; reset at every </entry>.
        string accession = null;
        string name = null;
        string full_name = null;
        string organism = null;
        string sequence = null;
        string feature_type = null;
        string feature_description = null;
        string original_value = ""; // if no content is found, assume it is empty, not null (e.g. <original>A</original><variation/> for a deletion event)
        string variation_value = "";
        string dbReference_type = null;
        string dbReference_id = null;
        List<string> property_types = new List<string>();
        List<string> property_values = new List<string>();
        int oneBasedfeature_position = -1;
        int? oneBasedbeginPosition = null;
        int? oneBasedendPosition = null;
        List<ProteolysisProduct> proteolysisProducts = new List<ProteolysisProduct>();
        List<SequenceVariation> sequenceVariations = new List<SequenceVariation>();
        List<DisulfideBond> disulfideBonds = new List<DisulfideBond>();
        var oneBasedModifications = new Dictionary<int, List<Modification>>();
        List<Tuple<string, string>> gene_names = new List<Tuple<string, string>>();
        bool reading_gene = false;
        bool reading_organism = false;
        List<DatabaseReference> databaseReferences = new List<DatabaseReference>();

        while (xml.Read())
        {
            switch (xml.NodeType)
            {
                case XmlNodeType.Element:
                    switch (xml.Name)
                    {
                        case "accession":
                            // Only the first accession of an entry is kept.
                            if (accession == null)
                            {
                                accession = xml.ReadElementString();
                            }
                            break;

                        case "name":
                            // The three situations are mutually exclusive: entry name, gene name, organism name.
                            if (xml.Depth == 2 && !reading_gene && !reading_organism)
                            {
                                name = xml.ReadElementString();
                            }
                            else if (reading_gene && !reading_organism)
                            {
                                gene_names.Add(new Tuple<string, string>(xml.GetAttribute("type"), xml.ReadElementString()));
                            }
                            else if (reading_organism)
                            {
                                // '==' instead of '.Equals' so a name without a type attribute is skipped rather than throwing.
                                if (xml.GetAttribute("type") == "scientific")
                                {
                                    organism = xml.ReadElementString();
                                }
                            }
                            break;

                        case "gene":
                            reading_gene = true;
                            break;

                        case "organism":
                            if (organism == null)
                            {
                                reading_organism = true;
                            }
                            break;

                        case "fullName":
                            if (full_name == null)
                            {
                                full_name = xml.ReadElementString();
                            }
                            break;

                        case "feature":
                            feature_type = xml.GetAttribute("type");
                            feature_description = xml.GetAttribute("description");
                            break;

                        case "original":
                            original_value = xml.ReadElementString();
                            break;

                        case "variation":
                            variation_value = xml.ReadElementString();
                            break;

                        case "dbReference":
                            property_types.Clear();
                            property_values.Clear();
                            dbReference_type = xml.GetAttribute("type");
                            dbReference_id = xml.GetAttribute("id");
                            break;

                        case "property":
                            property_types.Add(xml.GetAttribute("type"));
                            property_values.Add(xml.GetAttribute("value"));
                            break;

                        case "position":
                            oneBasedfeature_position = int.Parse(xml.GetAttribute("position"));
                            break;

                        case "begin":
                            oneBasedbeginPosition = int.TryParse(xml.GetAttribute("position"), out int begin_value) ? (int?)begin_value : null;
                            break;

                        case "end":
                            oneBasedendPosition = int.TryParse(xml.GetAttribute("position"), out int end_value) ? (int?)end_value : null;
                            break;

                        case "sequence":
                            sequence = substituteWhitespace.Replace(xml.ReadElementString(), "");
                            break;
                    }
                    break;

                case XmlNodeType.EndElement:
                    switch (xml.Name)
                    {
                        case "feature":
                            if (feature_type == "modified residue")
                            {
                                // Only the part of the description before the first ';' identifies the modification.
                                feature_description = feature_description.Split(';')[0];

                                // Create new entry for this residue, if needed
                                if (!oneBasedModifications.TryGetValue(oneBasedfeature_position, out List<Modification> residue_modifications))
                                {
                                    residue_modifications = new List<Modification>();
                                    oneBasedModifications.Add(oneBasedfeature_position, residue_modifications);
                                }

                                if (mod_dict.TryGetValue(feature_description, out IList<Modification> known_mods))
                                {
                                    // Known and not of a type in the exclusion list
                                    List<Modification> mods = known_mods.Where(m => !modTypesToExclude.Contains(m.modificationType)).ToList();
                                    if (mods.Count == 0 && residue_modifications.Count == 0)
                                    {
                                        // Everything was excluded and nothing else sits on this residue: drop the empty entry.
                                        oneBasedModifications.Remove(oneBasedfeature_position);
                                    }
                                    else
                                    {
                                        residue_modifications.AddRange(mods);
                                    }
                                }
                                else if (unknownModifications.TryGetValue(feature_description, out Modification seen_unknown))
                                {
                                    // Not known but seen
                                    residue_modifications.Add(seen_unknown);
                                }
                                else
                                {
                                    // Not known and not seen
                                    Modification new_unknown = new Modification(feature_description, "unknown");
                                    unknownModifications[feature_description] = new_unknown;
                                    residue_modifications.Add(new_unknown);
                                }
                            }
                            else if (feature_type == "peptide" || feature_type == "propeptide" || feature_type == "chain" || feature_type == "signal peptide")
                            {
                                proteolysisProducts.Add(new ProteolysisProduct(oneBasedbeginPosition, oneBasedendPosition, feature_type));
                            }
                            else if (feature_type == "sequence variant" && !string.IsNullOrEmpty(variation_value))
                            {
                                // Only keep if there is variant sequence information and position information
                                if (oneBasedbeginPosition != null && oneBasedendPosition != null)
                                {
                                    sequenceVariations.Add(new SequenceVariation((int)oneBasedbeginPosition, (int)oneBasedendPosition, original_value, variation_value, feature_description));
                                }
                                else if (oneBasedfeature_position >= 1)
                                {
                                    sequenceVariations.Add(new SequenceVariation(oneBasedfeature_position, original_value, variation_value, feature_description));
                                }
                            }
                            else if (feature_type == "disulfide bond")
                            {
                                if (oneBasedbeginPosition != null && oneBasedendPosition != null)
                                {
                                    disulfideBonds.Add(new DisulfideBond((int)oneBasedbeginPosition, (int)oneBasedendPosition, feature_description));
                                }
                                else if (oneBasedfeature_position >= 1)
                                {
                                    disulfideBonds.Add(new DisulfideBond(oneBasedfeature_position, feature_description));
                                }
                            }

                            // Positions and variant text belong to a single feature.
                            oneBasedbeginPosition = null;
                            oneBasedendPosition = null;
                            oneBasedfeature_position = -1;
                            original_value = "";
                            variation_value = "";
                            break;

                        case "dbReference":
                            databaseReferences.Add(new DatabaseReference(dbReference_type, dbReference_id, Enumerable.Range(0, property_types.Count).Select(i => new Tuple<string, string>(property_types[i], property_values[i])).ToList()));
                            property_types = new List<string>();
                            property_values = new List<string>();
                            dbReference_type = null;
                            dbReference_id = null;
                            break;

                        case "gene":
                            reading_gene = false;
                            break;

                        case "organism":
                            reading_organism = false;
                            break;

                        case "entry":
                            if (accession != null && sequence != null)
                            {
                                if (generateTargetProteins)
                                {
                                    var protein = new Protein(sequence, accession, organism, gene_names, oneBasedModifications, proteolysisProducts, name, full_name, false, IsContaminant, databaseReferences, sequenceVariations, disulfideBonds, proteinDbLocation);
                                    result.Add(protein);
                                }

                                bool starts_with_met = sequence.StartsWith("M", StringComparison.Ordinal);

                                switch (decoyType)
                                {
                                    case DecoyType.Reverse:
                                    {
                                        char[] sequence_array = sequence.ToCharArray();
                                        var decoy_modifications = new Dictionary<int, List<Modification>>(oneBasedModifications.Count);
                                        if (starts_with_met)
                                        {
                                            // Do not include the initiator methionine in reversal!!!
                                            Array.Reverse(sequence_array, 1, sequence.Length - 1);
                                            foreach (var kvp in oneBasedModifications)
                                            {
                                                if (kvp.Key > 1)
                                                {
                                                    decoy_modifications.Add(sequence.Length - kvp.Key + 2, kvp.Value);
                                                }
                                                else if (kvp.Key == 1)
                                                {
                                                    decoy_modifications.Add(1, kvp.Value);
                                                }
                                            }
                                        }
                                        else
                                        {
                                            Array.Reverse(sequence_array);
                                            foreach (var kvp in oneBasedModifications)
                                            {
                                                decoy_modifications.Add(sequence.Length - kvp.Key + 1, kvp.Value);
                                            }
                                        }
                                        var reversed_sequence = new string(sequence_array);

                                        // NOTE(review): the begin/end arithmetic below is kept exactly as it was;
                                        // it does not distinguish the retained-methionine case - confirm intent.
                                        List<ProteolysisProduct> decoyPP = new List<ProteolysisProduct>();
                                        foreach (ProteolysisProduct pp in proteolysisProducts)
                                        {
                                            decoyPP.Add(new ProteolysisProduct(sequence.Length - pp.OneBasedEndPosition + 1, sequence.Length - pp.OneBasedBeginPosition, pp.Type));
                                        }

                                        List<DisulfideBond> decoy_disulfides = new List<DisulfideBond>();
                                        foreach (DisulfideBond disulfideBond in disulfideBonds)
                                        {
                                            decoy_disulfides.Add(new DisulfideBond(sequence.Length - disulfideBond.OneBasedBeginPosition + 2, sequence.Length - disulfideBond.OneBasedEndPosition + 2, "DECOY DISULFIDE BOND: " + disulfideBond.Description));
                                        }

                                        List<SequenceVariation> decoy_variations = new List<SequenceVariation>();
                                        foreach (SequenceVariation sv in sequenceVariations)
                                        {
                                            char[] original_array = sv.OriginalSequence.ToCharArray();
                                            char[] variation_array = sv.VariantSequence.ToCharArray();
                                            if (sv.OneBasedBeginPosition == 1)
                                            {
                                                bool orig_init_m = sv.OriginalSequence.StartsWith("M", StringComparison.Ordinal);
                                                bool var_init_m = sv.VariantSequence.StartsWith("M", StringComparison.Ordinal);
                                                if (orig_init_m && !var_init_m)
                                                {
                                                    decoy_variations.Add(new SequenceVariation(1, "M", "", "DECOY VARIANT: Initiator Methionine Change in " + sv.Description));
                                                }

                                                // The initiator methionine stays in place, so it is stripped before reversing.
                                                original_array = sv.OriginalSequence.Substring(Convert.ToInt32(orig_init_m)).ToCharArray();
                                                variation_array = sv.VariantSequence.Substring(Convert.ToInt32(var_init_m)).ToCharArray();
                                            }
                                            int decoy_end = sequence.Length - sv.OneBasedBeginPosition + 2 + Convert.ToInt32(sv.OneBasedEndPosition == reversed_sequence.Length) - Convert.ToInt32(sv.OneBasedBeginPosition == 1);
                                            int decoy_begin = decoy_end - original_array.Length + 1;
                                            Array.Reverse(original_array);
                                            Array.Reverse(variation_array);
                                            decoy_variations.Add(new SequenceVariation(decoy_begin, decoy_end, new string(original_array), new string(variation_array), "DECOY VARIANT: " + sv.Description));
                                        }

                                        var decoy_protein = new Protein(reversed_sequence, "DECOY_" + accession, organism, gene_names, decoy_modifications, decoyPP, name, full_name, true, IsContaminant, null, decoy_variations, decoy_disulfides, proteinDbLocation);
                                        result.Add(decoy_protein);
                                        break;
                                    }

                                    case DecoyType.Slide:
                                    {
                                        int numSlides = 20;
                                        char[] sequence_array_unslided = sequence.ToCharArray();
                                        char[] sequence_array_slided = sequence.ToCharArray();
                                        var decoy_modifications = new Dictionary<int, List<Modification>>(oneBasedModifications.Count);
                                        if (starts_with_met)
                                        {
                                            // Do not include the initiator methionine in shuffle!!!
                                            // Only the residues after the methionine slide, so that is the length the
                                            // slide count must not be a multiple of. (The earlier form
                                            // "numSlides % Length - 1 == 0" parsed as "(numSlides % Length) - 1 == 0".)
                                            int slidable_length = sequence_array_slided.Length - 1;
                                            if (slidable_length > 0 && numSlides % slidable_length == 0)
                                            {
                                                numSlides++;
                                            }
                                            for (int i = 1; i < sequence_array_slided.Length; i++)
                                            {
                                                sequence_array_slided[i] = sequence_array_unslided[GetOldShuffleIndex(i, numSlides, sequence.Length, true)];
                                            }
                                            foreach (var kvp in oneBasedModifications)
                                            {
                                                if (kvp.Key > 1)
                                                {
                                                    decoy_modifications.Add(GetOldShuffleIndex(kvp.Key - 1, numSlides, sequence.Length, true) + 1, kvp.Value);
                                                }
                                                else if (kvp.Key == 1)
                                                {
                                                    decoy_modifications.Add(1, kvp.Value);
                                                }
                                            }
                                        }
                                        else
                                        {
                                            // Length guard: an empty sequence must not reach the modulo.
                                            if (sequence_array_slided.Length > 0 && numSlides % sequence_array_slided.Length == 0)
                                            {
                                                numSlides++;
                                            }
                                            for (int i = 0; i < sequence_array_slided.Length; i++)
                                            {
                                                sequence_array_slided[i] = sequence_array_unslided[GetOldShuffleIndex(i, numSlides, sequence.Length, false)];
                                            }
                                            foreach (var kvp in oneBasedModifications)
                                            {
                                                decoy_modifications.Add(GetOldShuffleIndex(kvp.Key - 1, numSlides, sequence.Length, false) + 1, kvp.Value);
                                            }
                                        }
                                        var slided_sequence = new string(sequence_array_slided);

                                        //can't keep all aa like you can with reverse, just keep it the same length
                                        List<ProteolysisProduct> decoyPP_slide = new List<ProteolysisProduct>(proteolysisProducts);

                                        //these actually need the same cysteines...
                                        List<DisulfideBond> decoy_disulfides_slide = new List<DisulfideBond>();
                                        foreach (DisulfideBond disulfideBond in disulfideBonds)
                                        {
                                            decoy_disulfides_slide.Add(new DisulfideBond(GetOldShuffleIndex(disulfideBond.OneBasedBeginPosition - 1, numSlides, slided_sequence.Length, false) + 1, GetOldShuffleIndex(disulfideBond.OneBasedEndPosition - 1, numSlides, slided_sequence.Length, false) + 1, "DECOY DISULFIDE BOND: " + disulfideBond.Description));
                                        }

                                        List<SequenceVariation> decoy_variations_slide = new List<SequenceVariation>();
                                        foreach (SequenceVariation sv in sequenceVariations) //No idea what's going on here. Review is appreciated.
                                        {
                                            char[] original_array_unslided = sv.OriginalSequence.ToCharArray();
                                            char[] variation_array_unslided = sv.VariantSequence.ToCharArray();
                                            if (sv.OneBasedBeginPosition == 1)
                                            {
                                                bool orig_init_m = sv.OriginalSequence.StartsWith("M", StringComparison.Ordinal);
                                                bool var_init_m = sv.VariantSequence.StartsWith("M", StringComparison.Ordinal);
                                                if (orig_init_m && !var_init_m)
                                                {
                                                    decoy_variations_slide.Add(new SequenceVariation(1, "M", "", "DECOY VARIANT: Initiator Methionine Change in " + sv.Description));
                                                }
                                                original_array_unslided = sv.OriginalSequence.Substring(Convert.ToInt32(orig_init_m)).ToCharArray();
                                                variation_array_unslided = sv.VariantSequence.Substring(Convert.ToInt32(var_init_m)).ToCharArray();
                                            }
                                            int decoy_end = sequence.Length - sv.OneBasedBeginPosition + 2 + Convert.ToInt32(sv.OneBasedEndPosition == slided_sequence.Length) - Convert.ToInt32(sv.OneBasedBeginPosition == 1);
                                            int decoy_begin = decoy_end - original_array_unslided.Length + 1;

                                            // The slid arrays start as copies of the (methionine-stripped) source arrays so their
                                            // length agrees with decoy_begin/decoy_end above, mirroring the Reverse branch.
                                            char[] original_array_slided = (char[])original_array_unslided.Clone();
                                            char[] variation_array_slided = (char[])variation_array_unslided.Clone();

                                            // Length guards: an insertion has an empty original, a deletion of "M" an empty variant.
                                            if (original_array_slided.Length > 0 && numSlides % original_array_slided.Length == 0)
                                            {
                                                numSlides++;
                                            }
                                            for (int i = 0; i < original_array_slided.Length; i++)
                                            {
                                                original_array_slided[i] = original_array_unslided[GetOldShuffleIndex(i, numSlides, original_array_unslided.Length, false)];
                                            }

                                            if (variation_array_slided.Length > 0 && numSlides % variation_array_slided.Length == 0)
                                            {
                                                numSlides++;
                                            }
                                            for (int i = 0; i < variation_array_slided.Length; i++)
                                            {
                                                variation_array_slided[i] = variation_array_unslided[GetOldShuffleIndex(i, numSlides, variation_array_unslided.Length, false)];
                                            }

                                            decoy_variations_slide.Add(new SequenceVariation(decoy_begin, decoy_end, new string(original_array_slided), new string(variation_array_slided), "DECOY VARIANT: " + sv.Description));
                                        }

                                        var decoy_protein_slide = new Protein(slided_sequence, "DECOY_" + accession, organism, gene_names, decoy_modifications, decoyPP_slide, name, full_name, true, IsContaminant, null, decoy_variations_slide, decoy_disulfides_slide, proteinDbLocation);
                                        result.Add(decoy_protein_slide);
                                        break;
                                    }

                                    default:
                                        break;
                                }
                            }

                            // Start the next entry from a clean slate. Fresh collections are allocated because the
                            // proteins created above keep references to the old ones.
                            accession = null;
                            name = null;
                            full_name = null;
                            sequence = null;
                            organism = null;
                            feature_type = null;
                            feature_description = null;
                            original_value = "";
                            variation_value = "";
                            dbReference_type = null;
                            dbReference_id = null;
                            property_types = new List<string>();
                            property_values = new List<string>();
                            oneBasedfeature_position = -1;
                            oneBasedModifications = new Dictionary<int, List<Modification>>();
                            proteolysisProducts = new List<ProteolysisProduct>();
                            sequenceVariations = new List<SequenceVariation>();
                            disulfideBonds = new List<DisulfideBond>();
                            databaseReferences = new List<DatabaseReference>();
                            gene_names = new List<Tuple<string, string>>();
                            reading_gene = false;
                            reading_organism = false;
                            break;
                    }
                    break;
            }
        }
    }

    return result;
}
/// <summary>
/// Load a protein fasta database, using regular expressions to get various aspects of the headers.
/// The first regex capture group is used as each field.
/// </summary>
/// <param name="proteinDbLocation">Path of the FASTA file; read through a gzip decompressor when it ends in ".gz".</param>
/// <param name="originalTarget">When true, a target protein is added for every record with a non-empty sequence.</param>
/// <param name="onTheFlyDecoys"><c>Reverse</c> or <c>Slide</c> adds one decoy (accession prefixed with "DECOY_") per record; other values add none.</param>
/// <param name="IsContaminant">Passed to every created protein.</param>
/// <param name="accession_expression">Header regex for the accession. When it yields nothing, the header text after '>' is used.</param>
/// <param name="full_name_expression">Header regex for the full name.</param>
/// <param name="name_expression">Header regex for the name.</param>
/// <param name="gene_expression">Header regex for the gene name, recorded with the type "primary".</param>
/// <param name="organism_expression">Header regex for the organism; may be null.</param>
/// <param name="errors">Receives a message for every skipped zero-length target and one if nothing could be read.</param>
/// <returns>Targets and/or decoys in file order; for each record the target precedes its decoy.</returns>
public static List<Protein> LoadProteinFasta(string proteinDbLocation, bool originalTarget, DecoyType onTheFlyDecoys, bool IsContaminant, Regex accession_expression, Regex full_name_expression, Regex name_expression, Regex gene_expression, Regex organism_expression, out List<string> errors)
{
    HashSet<string> unique_accessions = new HashSet<string>();
    int unique_identifier = 1;
    string accession = null;
    string name = null;
    string full_name = null;
    string organism = null;
    List<Tuple<string, string>> gene_name = new List<Tuple<string, string>>();
    errors = new List<string>();
    Regex substituteWhitespace = new Regex(@"\s+");
    List<Protein> result = new List<Protein>();

    using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (Stream fastaFileStream = proteinDbLocation.EndsWith(".gz", StringComparison.Ordinal) ? (Stream)new GZipStream(stream, CompressionMode.Decompress) : stream)
    using (StreamReader fasta = new StreamReader(fastaFileStream))
    {
        StringBuilder sb = null;

        // Record boundaries are found by reading one line ahead. StreamReader.Peek() is documented to
        // return -1 on streams that cannot seek (such as GZipStream), which could end the file early.
        string line = fasta.ReadLine();
        while (line != null)
        {
            string next_line = fasta.ReadLine();

            if (line.StartsWith(">", StringComparison.Ordinal))
            {
                var accession_match = accession_expression.Match(line);
                var full_name_match = full_name_expression.Match(line);
                var name_match = name_expression.Match(line);
                var gene_name_match = gene_expression.Match(line);

                if (organism_expression != null)
                {
                    var organism_match = organism_expression.Match(line);
                    if (organism_match.Groups.Count > 1)
                    {
                        organism = organism_match.Groups[1].Value;
                    }
                }
                if (accession_match.Groups.Count > 1)
                {
                    accession = accession_match.Groups[1].Value;
                }
                if (full_name_match.Groups.Count > 1)
                {
                    full_name = full_name_match.Groups[1].Value;
                }
                if (name_match.Groups.Count > 1)
                {
                    name = name_match.Groups[1].Value;
                }
                if (gene_name_match.Groups.Count > 1)
                {
                    gene_name.Add(new Tuple<string, string>("primary", gene_name_match.Groups[1].Value));
                }

                // Fall back to the whole header when the accession regex found nothing.
                if (string.IsNullOrEmpty(accession))
                {
                    accession = line.Substring(1).TrimEnd();
                }

                sb = new StringBuilder();
            }
            else if (sb != null)
            {
                // Sequence lines before the first header are ignored (sb is still null then).
                sb.Append(line.Trim());
            }

            // The record is complete when the next line is a header or the input has ended.
            bool record_complete = next_line == null || (next_line.Length > 0 && next_line[0] == '>');
            if (record_complete && accession != null && sb != null)
            {
                string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                // Make repeated accessions unique by appending a running number.
                while (unique_accessions.Contains(accession))
                {
                    accession += "_" + unique_identifier.ToString();
                    unique_identifier++;
                }
                unique_accessions.Add(accession);

                if (originalTarget)
                {
                    Protein protein = new Protein(sequence, accession, organism, gene_name, name: name, full_name: full_name, isContaminant: IsContaminant, databaseFilePath: proteinDbLocation);
                    if (protein.Length == 0)
                    {
                        errors.Add("Line" + line + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                    }
                    else
                    {
                        result.Add(protein);
                    }
                }

                switch (onTheFlyDecoys)
                {
                    case DecoyType.Reverse:
                    {
                        char[] sequence_array = sequence.ToCharArray();
                        int starts_with_met = sequence.StartsWith("M", StringComparison.Ordinal) ? 1 : 0;
                        Array.Reverse(sequence_array, starts_with_met, sequence.Length - starts_with_met); // Do not include the initiator methionine in reversal!!!
                        var reversed_sequence = new string(sequence_array);
                        Protein decoy_protein = new Protein(reversed_sequence, "DECOY_" + accession, organism, gene_name, name: name, full_name: full_name, isDecoy: true, isContaminant: IsContaminant, databaseFilePath: proteinDbLocation);
                        result.Add(decoy_protein);
                        break;
                    }

                    case DecoyType.Slide:
                    {
                        int numSlides = 20;
                        char[] sequence_array_unslide = sequence.ToCharArray();
                        char[] sequence_array_slide = sequence.ToCharArray();
                        bool starts_with_met_slide = sequence.StartsWith("M", StringComparison.Ordinal);

                        // An initiator methionine at index 0 is left where it is.
                        for (int i = starts_with_met_slide ? 1 : 0; i < sequence.Length; i++)
                        {
                            sequence_array_slide[i] = sequence_array_unslide[GetOldShuffleIndex(i, numSlides, sequence.Length, starts_with_met_slide)];
                        }
                        string slide_sequence = new string(sequence_array_slide);
                        Protein decoy_protein_slide = new Protein(slide_sequence, "DECOY_" + accession, organism, gene_name, name: name, full_name: full_name, isDecoy: true, isContaminant: IsContaminant, databaseFilePath: proteinDbLocation);
                        result.Add(decoy_protein_slide);
                        break;
                    }

                    default:
                        break;
                }

                // Reset the header fields for the next record. A new gene list is allocated because the
                // proteins created above keep a reference to the old one.
                accession = null;
                name = null;
                full_name = null;
                organism = null;
                gene_name = new List<Tuple<string, string>>();
            }

            line = next_line;
        }
    }

    if (!result.Any())
    {
        errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
    }
    return result;
}
/// <summary>
/// Loads proteins from a protein XML database and returns the requested combination of targets
/// and generated decoys.
/// </summary>
/// <param name="proteinDbLocation">Path of the XML database; read through a gzip decompressor when it ends in "gz".</param>
/// <param name="generateTargets">When true, the parsed target proteins are part of the result.</param>
/// <param name="decoyType">Decoy type handed to <c>DecoyProteinGenerator.GenerateDecoys</c>; decoys are returned unless it is <c>None</c>.</param>
/// <param name="allKnownModifications">Modifications combined with those found in the file; null is treated as empty.</param>
/// <param name="isContaminant">Passed to the entry parser.</param>
/// <param name="modTypesToExclude">Modification types passed to the entry parser; null is treated as empty.</param>
/// <param name="unknownModifications">Dictionary created here and filled by the entry parser.</param>
/// <param name="maxThreads">Passed to the decoy generator; defaults to -1.</param>
/// <returns>Targets (if requested) followed by decoys (if requested).</returns>
public static List<Protein> LoadProteinXML(string proteinDbLocation, bool generateTargets, DecoyType decoyType, IEnumerable<Modification> allKnownModifications, bool isContaminant, IEnumerable<string> modTypesToExclude, out Dictionary<string, Modification> unknownModifications, int maxThreads = -1)
{
    List<Modification> prespecified = GetPtmListFromProteinXml(proteinDbLocation);
    allKnownModifications = allKnownModifications ?? new List<Modification>();
    modTypesToExclude = modTypesToExclude ?? new List<string>();

    // Materialize once so the caller's sequence is enumerated a single time.
    List<Modification> combinedModifications = prespecified.Concat(allKnownModifications).ToList();
    if (combinedModifications.Count > 0)
    {
        // NOTE(review): these members are only assigned when modifications exist, so values from an
        // earlier call stay in place otherwise - confirm that is intended.
        IdToPossibleMods = GetModificationDict(new HashSet<Modification>(combinedModifications));
        IdWithMotifToMod = GetModificationDictWithMotifs(new HashSet<Modification>(combinedModifications));
    }

    List<Protein> targets = new List<Protein>();
    unknownModifications = new Dictionary<string, Modification>();

    // "gz" rather than ".gz": allow for .bgz and .tgz, which are (rarely) used.
    // XmlReader does not close its input by default, so the (possibly gzip) stream is disposed explicitly.
    using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (Stream uniprotXmlFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal) ? (Stream)new GZipStream(stream, CompressionMode.Decompress) : stream)
    using (XmlReader xml = XmlReader.Create(uniprotXmlFileStream))
    {
        ProteinXmlEntry block = new ProteinXmlEntry();
        while (xml.Read())
        {
            if (xml.NodeType == XmlNodeType.Element)
            {
                block.ParseElement(xml.Name, xml);
            }

            // Self-closing elements never raise EndElement, so they are closed here as well.
            if (xml.NodeType == XmlNodeType.EndElement || xml.IsEmptyElement)
            {
                var newProteinEntries = block.ParseEndElement(xml, modTypesToExclude, unknownModifications, isContaminant, proteinDbLocation);
                targets.AddRange(newProteinEntries);
            }
        }
    }

    // Decoys are always derived from the parsed targets, even when the targets themselves are not returned.
    List<Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);

    IEnumerable<Protein> returnedTargets = generateTargets ? targets : Enumerable.Empty<Protein>();
    IEnumerable<Protein> returnedDecoys = decoyType != DecoyType.None ? decoys : Enumerable.Empty<Protein>();
    return returnedTargets.Concat(returnedDecoys).ToList();
}
/// <summary>
/// Load a protein fasta database, using regular expressions to get various aspects of the headers.
/// The first regex capture group is used as each field.
/// </summary>
/// <param name="proteinDbLocation">Path of the FASTA file; read through a gzip decompressor when it ends in "gz".</param>
/// <param name="generateTargets">When true, the parsed target proteins are part of the result.</param>
/// <param name="decoyType">Decoy type handed to <c>DecoyProteinGenerator.GenerateDecoys</c>; decoys are returned unless it is <c>None</c>.</param>
/// <param name="isContaminant">Passed to every created protein.</param>
/// <param name="accessionRegex">Header regex for the accession. When it yields nothing, the header text after '>' is used.</param>
/// <param name="fullNameRegex">Header regex for the full name.</param>
/// <param name="nameRegex">Header regex for the name.</param>
/// <param name="geneNameRegex">Header regex for the gene name, recorded with the type "primary".</param>
/// <param name="organismRegex">Header regex for the organism.</param>
/// <param name="errors">Receives a message for every skipped zero-length protein and one if nothing could be read.</param>
/// <param name="maxThreads">Passed to the decoy generator; defaults to -1.</param>
/// <returns>Targets (if requested) followed by decoys (if requested).</returns>
public static List<Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant, FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex, FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List<string> errors, int maxThreads = -1)
{
    HashSet<string> unique_accessions = new HashSet<string>();
    int unique_identifier = 1;
    string accession = null;
    string name = null;
    string fullName = null;
    string organism = null;
    List<Tuple<string, string>> geneName = new List<Tuple<string, string>>();
    errors = new List<string>();
    Regex substituteWhitespace = new Regex(@"\s+");
    List<Protein> targets = new List<Protein>();

    // "gz" rather than ".gz": allow for .bgz and .tgz, which are (rarely) used.
    using (var stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (Stream fastaFileStream = proteinDbLocation.EndsWith("gz", StringComparison.Ordinal) ? (Stream)new GZipStream(stream, CompressionMode.Decompress) : stream)
    using (StreamReader fasta = new StreamReader(fastaFileStream))
    {
        StringBuilder sb = null;

        // Record boundaries are found by reading one line ahead. StreamReader.Peek() is documented to
        // return -1 on streams that cannot seek (such as GZipStream), which could end the file early.
        string line = fasta.ReadLine();
        while (line != null)
        {
            string nextLine = fasta.ReadLine();

            if (line.StartsWith(">", StringComparison.Ordinal))
            {
                accession = ApplyRegex(accessionRegex, line);
                fullName = ApplyRegex(fullNameRegex, line);
                name = ApplyRegex(nameRegex, line);
                organism = ApplyRegex(organismRegex, line);
                string geneNameString = ApplyRegex(geneNameRegex, line);
                if (geneNameString != null)
                {
                    geneName.Add(new Tuple<string, string>("primary", geneNameString));
                }

                // Fall back to the whole header when the accession regex found nothing.
                if (string.IsNullOrEmpty(accession))
                {
                    accession = line.Substring(1).TrimEnd();
                }

                sb = new StringBuilder();
            }
            else if (sb != null)
            {
                // Sequence lines before the first header are ignored (sb is still null then).
                sb.Append(line.Trim());
            }

            // The record is complete when the next line is a header or the input has ended.
            bool recordComplete = nextLine == null || (nextLine.Length > 0 && nextLine[0] == '>');
            if (recordComplete && accession != null && sb != null)
            {
                string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                // Make repeated accessions unique by appending a running number.
                while (unique_accessions.Contains(accession))
                {
                    accession += "_" + unique_identifier.ToString();
                    unique_identifier++;
                }
                unique_accessions.Add(accession);

                Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName, isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                if (protein.Length == 0)
                {
                    errors.Add("Line" + line + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                }
                else
                {
                    targets.Add(protein);
                }

                // Reset the header fields for the next record. A new gene list is allocated because the
                // protein created above keeps a reference to the old one.
                accession = null;
                name = null;
                fullName = null;
                organism = null;
                geneName = new List<Tuple<string, string>>();
            }

            line = nextLine;
        }
    }

    if (!targets.Any())
    {
        errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
    }

    // Decoys are always derived from the parsed targets, even when the targets themselves are not returned.
    List<Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);

    IEnumerable<Protein> returnedTargets = generateTargets ? targets : Enumerable.Empty<Protein>();
    IEnumerable<Protein> returnedDecoys = decoyType != DecoyType.None ? decoys : Enumerable.Empty<Protein>();
    return returnedTargets.Concat(returnedDecoys).ToList();
}
/// <summary>
/// Runs a complete <c>SearchTask</c> against a one-protein XML database and a spectra file generated from
/// one peptide of that protein's variant forms, then checks the resulting AllPSMs.psmtsv.
/// </summary>
/// <param name="proteinIdx">Index into the list of test proteins built inside the method.</param>
/// <param name="peptideIdx">Index of the peptide (among all peptides digested from the variant proteins) used to create the spectra file.</param>
/// <param name="containsVariant">When true, some PSM line must contain <paramref name="variantPsmShort"/>; when false, only a tab is required.</param>
/// <param name="variantPsmShort">Text expected somewhere in a PSM line when a variant is expected.</param>
/// <param name="decoyType">Decoy setting for the search; targets are searched only when this is <c>DecoyType.None</c>.</param>
[TestCase(9, 0, true, "EDITPEPEDITP2PPP", DecoyType.Reverse)] // MPEPPP becomes MPPPEP with the variant beginning at position 2
public static void SearchTests(int proteinIdx, int peptideIdx, bool containsVariant, string variantPsmShort, DecoyType decoyType = DecoyType.None)
{
    Stopwatch stopwatch = Stopwatch.StartNew();

    // Make sure can run the complete search task when multiple compact peptides may correspond to a single PWSM
    SearchTask st = new SearchTask
    {
        SearchParameters = new SearchParameters
        {
            DoParsimony = true,
            DecoyType = decoyType,
            SearchTarget = decoyType == DecoyType.None,
            ModPeptidesAreDifferent = false
        },
        CommonParameters = new CommonParameters(scoreCutoff: 1, digestionParams: new DigestionParams(minPeptideLength: 2), precursorMassTolerance: new PpmTolerance(20)),
    };

    ModificationMotif.TryGetMotif("V", out ModificationMotif motifV);
    Modification mv = new Modification("mod", null, "type", null, motifV, "Anywhere.", null, 42.01, new Dictionary<string, IList<string>>(), null, null, null, null, null);
    ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
    Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary<string, IList<string>>(), null, null, null, null, null);

    // Every variation below carries the same description string, so it is written once here.
    const string vcf = @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30";

    List<Protein> proteins = new List<Protein>
    {
        new Protein("MPEPTIDE", "protein1", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "V", vcf, null) }),
        new Protein("MPEPTIDE", "protein2", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 5, "PT", "KT", vcf, null) }),
        new Protein("MPEPTIDE", "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", vcf, null) }),
        new Protein("MPEPPPTIDE", "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 6, "PPP", "P", vcf, null) }),
        new Protein("MPEPKPKTIDE", "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 7, "PKPK", "PK", vcf, null) }),
        new Protein("MPEPTAIDE", "protein2", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 5, "PTA", "KT", vcf, null) }),
        new Protein("MPEKKAIDE", "protein2", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "KKA", "K", vcf, null) }),
        new Protein("MPEPTIDE", "protein1", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "V", vcf, new Dictionary<int, List<Modification>> { { 4, new[] { mv }.ToList() } }) }),
        new Protein("MPEPTIDE", "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", vcf, new Dictionary<int, List<Modification>> { { 5, new[] { mp }.ToList() } }) }),
        new Protein("MPEPTIDEPEPTIDE", "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "PTIDEPEPTIDE", "PPP", vcf, null) }),
    };

    PeptideWithSetModifications pep = proteins[proteinIdx].GetVariantProteins().SelectMany(p => p.Digest(CommonParameters.DigestionParams, null, null)).ToList()[peptideIdx];

    // File and folder names include the protein index so that the individual test cases do not collide.
    string xmlName = $"andguiaheov{proteinIdx.ToString()}.xml";
    string mzmlName = $"ajgdiv{proteinIdx.ToString()}.mzML";
    string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, $"TestSearchWithVariants{proteinIdx.ToString()}");
    string taskSettingsFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"Task Settings");

    try
    {
        ProteinDbWriter.WriteXmlDatabase(null, new List<Protein> { proteins[proteinIdx] }, xmlName);

        MsDataFile myMsDataFile = new TestDataFile(new List<PeptideWithSetModifications> { pep });
        IO.MzML.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false);

        Directory.CreateDirectory(outputFolder);
        st.RunTask(outputFolder, new List<DbForTask> { new DbForTask(xmlName, false) }, new List<string> { mzmlName }, "");

        var psms = File.ReadAllLines(Path.Combine(outputFolder, "AllPSMs.psmtsv"));
        Assert.IsTrue(psms.Any(line => line.Contains(containsVariant ? variantPsmShort : "\t")));
    }
    finally
    {
        // Clean up even when the search or the assertion throws, so a failed case does not leave
        // files behind for later runs. Directories are guarded because Directory.Delete throws
        // when the path is missing; File.Delete does not.
        if (Directory.Exists(outputFolder))
        {
            Directory.Delete(outputFolder, true);
        }
        File.Delete(mzmlName);
        File.Delete(xmlName);
        // NOTE(review): "Task Settings" is presumably written by RunTask - confirm.
        if (Directory.Exists(taskSettingsFolder))
        {
            Directory.Delete(taskSettingsFolder, true);
        }
    }

    Console.WriteLine($"Analysis time for VariantSearchTests.SearchTests({proteinIdx.ToString()},{peptideIdx.ToString()},{containsVariant.ToString()},{variantPsmShort}): {stopwatch.Elapsed.Hours}h {stopwatch.Elapsed.Minutes}m {stopwatch.Elapsed.Seconds}s");
}
/// <summary>
/// Load a protein fasta database, using regular expressions to get various aspects of the headers. The first regex capture group is used as each field.
/// </summary>
/// <param name="proteinDbLocation">Path of the FASTA file. A path ending in "gz" (any letter case, so .gz, .bgz and .tgz qualify) is decompressed with GZip while reading.</param>
/// <param name="generateTargets">When true the result is targets followed by decoys; when false only the decoy list is returned.</param>
/// <param name="decoyType">Passed to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
/// <param name="isContaminant">Forwarded to every <c>Protein</c> that is created.</param>
/// <param name="accessionRegex">Header regex for the accession; when it yields null or an empty string the whole header (without the leading '&gt;') is used instead.</param>
/// <param name="fullNameRegex">Header regex for the full name.</param>
/// <param name="nameRegex">Header regex for the name.</param>
/// <param name="geneNameRegex">Header regex for the gene name; a match is stored as a ("primary", value) tuple.</param>
/// <param name="organismRegex">Header regex for the organism.</param>
/// <param name="errors">Receives one message per zero-length entry that was skipped, plus a message when no protein could be read at all.</param>
/// <param name="maxThreads">Forwarded to <c>DecoyProteinGenerator.GenerateDecoys</c>.</param>
/// <returns>Targets followed by decoys, or only the decoys when <paramref name="generateTargets"/> is false.</returns>
public static List<Protein> LoadProteinFasta(string proteinDbLocation, bool generateTargets, DecoyType decoyType, bool isContaminant,
    FastaHeaderFieldRegex accessionRegex, FastaHeaderFieldRegex fullNameRegex, FastaHeaderFieldRegex nameRegex,
    FastaHeaderFieldRegex geneNameRegex, FastaHeaderFieldRegex organismRegex, out List<string> errors, int maxThreads = -1)
{
    errors = new List<string>();

    HashSet<string> uniqueAccessions = new HashSet<string>();
    string accession = null;
    string name = null;
    string fullName = null;
    string organism = null;
    List<Tuple<string, string>> geneName = new List<Tuple<string, string>>();
    Regex substituteWhitespace = new Regex(@"\s+");
    List<Protein> targets = new List<Protein>();

    // allow for .bgz and .tgz, which are (rarely) used; ignore case so that ".GZ" is decompressed as well
    bool isCompressed = proteinDbLocation.EndsWith("gz", StringComparison.OrdinalIgnoreCase);

    using (FileStream stream = new FileStream(proteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (Stream fastaFileStream = isCompressed ? (Stream)new GZipStream(stream, CompressionMode.Decompress) : (Stream)stream)
    using (StreamReader fasta = new StreamReader(fastaFileStream))
    {
        StringBuilder sb = null;
        string previousLine = null;

        while (true)
        {
            // Record boundaries are found by reading ahead one line rather than with Peek():
            // Peek() may report -1 on non-seekable streams (e.g. GZipStream) before the data has ended.
            string line = fasta.ReadLine();
            bool endOfInput = line == null;
            bool isHeader = !endOfInput && line.StartsWith(">", StringComparison.Ordinal);

            // The next header, or the end of the input, closes the entry that is currently being collected.
            if ((endOfInput || isHeader) && accession != null && sb != null)
            {
                string sequence = substituteWhitespace.Replace(sb.ToString(), "");

                // sanitize the sequence to replace unexpected characters with X (unknown amino acid)
                // sometimes strange characters get added by RNA sequencing software, etc.
                sequence = SanitizeAminoAcidSequence(sequence, 'X');

                if (uniqueAccessions.Contains(accession)) // this will happen for isoforms
                {
                    // the first occurrence keeps "accession"; later ones become "accession_2", "accession_3", ...
                    string originalAccession = accession;
                    int suffix = 2;
                    do
                    {
                        accession = originalAccession + "_" + suffix.ToString();
                        suffix++;
                    }
                    while (uniqueAccessions.Contains(accession));
                }
                uniqueAccessions.Add(accession);

                Protein protein = new Protein(sequence, accession, organism, geneName, name: name, fullName: fullName, isContaminant: isContaminant, databaseFilePath: proteinDbLocation);
                if (protein.Length == 0)
                {
                    // previousLine is the last line that belonged to the entry being closed
                    errors.Add("Line" + previousLine + ", Protein Length of 0: " + protein.Name + " was skipped from database: " + proteinDbLocation);
                }
                else
                {
                    targets.Add(protein);
                }

                accession = null;
                name = null;
                fullName = null;
                organism = null;
                geneName = new List<Tuple<string, string>>();
            }

            // no input left
            if (endOfInput)
            {
                break;
            }

            if (isHeader)
            {
                accession = ApplyRegex(accessionRegex, line);
                fullName = ApplyRegex(fullNameRegex, line);
                name = ApplyRegex(nameRegex, line);
                organism = ApplyRegex(organismRegex, line);
                string geneNameString = ApplyRegex(geneNameRegex, line);
                if (geneNameString != null)
                {
                    geneName.Add(new Tuple<string, string>("primary", geneNameString));
                }
                if (string.IsNullOrEmpty(accession))
                {
                    accession = line.Substring(1).TrimEnd();
                }
                sb = new StringBuilder();
            }
            else if (sb != null)
            {
                // lines that appear before the first header are ignored (sb is still null then)
                sb.Append(line.Trim());
            }

            previousLine = line;
        }
    }

    if (!targets.Any())
    {
        errors.Add("Error: No proteins could be read from the database: " + proteinDbLocation);
    }

    List<Protein> decoys = DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads);
    return generateTargets ? targets.Concat(decoys).ToList() : decoys;
}
/// <summary>
/// Loads the proteins of a FASTA database (".fasta" or ".fa", optionally followed by an extension ending in "gz")
/// using the Uniprot header regexes of <c>ProteinDbLoader</c>. Any other file type yields an empty list.
/// </summary>
/// <param name="fileName">Path of the database file; only its extension(s) are inspected here.</param>
/// <param name="generateTargets">Forwarded to <c>ProteinDbLoader.LoadProteinFasta</c>.</param>
/// <param name="decoyType">Forwarded to <c>ProteinDbLoader.LoadProteinFasta</c>.</param>
/// <param name="MaxThreadsToUse">Forwarded to <c>ProteinDbLoader.LoadProteinFasta</c> as its thread limit.</param>
/// <returns>The loaded proteins whose base sequence is not empty.</returns>
public static List<Protein> LoadProteinDb(string fileName, bool generateTargets, DecoyType decoyType, int MaxThreadsToUse)
{
    string extension = Path.GetExtension(fileName).ToLowerInvariant();

    // allows for .bgz and .tgz, too which are used on occasion
    if (extension.EndsWith("gz"))
    {
        // the real format is given by the extension underneath the compression suffix
        string innerName = Path.GetFileNameWithoutExtension(fileName);
        extension = Path.GetExtension(innerName).ToLowerInvariant();
    }

    bool isFasta = extension.Equals(".fasta") || extension.Equals(".fa");
    if (!isFasta)
    {
        return new List<Protein>();
    }

    // The loader's error messages are collected but not reported by this method.
    // NOTE(review): UniprotFullNameRegex is supplied for both the full-name and the name slot - confirm this is intended.
    List<string> dbErrors;
    List<Protein> loaded = ProteinDbLoader.LoadProteinFasta(
        fileName,
        generateTargets,
        decoyType,
        false,
        ProteinDbLoader.UniprotAccessionRegex,
        ProteinDbLoader.UniprotFullNameRegex,
        ProteinDbLoader.UniprotFullNameRegex,
        ProteinDbLoader.UniprotGeneNameRegex,
        ProteinDbLoader.UniprotOrganismRegex,
        out dbErrors,
        MaxThreadsToUse);

    return loaded.Where(p => p.BaseSequence.Length > 0).ToList();
}