/// <summary>
/// Supplies test cases that pair a <see cref="ProteinSequence"/> built with a tag set
/// with that same tag set, once for the standard and once for the extended protein alphabet.
/// </summary>
/// <returns>A lazily produced sequence of named test cases.</returns>
public IEnumerable<ITestCaseData> Tag_TestCases() {
    // The same set instance is handed to the sequence and used as the second test argument.
    HashSet<string> sharedTags = new HashSet<string>();
    sharedTags.Add("Foo");
    sharedTags.Add("Bar");
    sharedTags.Add("Baz");

    yield return new TestCaseData(
        new ProteinSequence("ACTCTTCAGC", AlphabetType.StandardProtein, GeneticCode.Standard, sharedTags),
        sharedTags).SetName("Standard protein alphabet with tags");

    yield return new TestCaseData(
        new ProteinSequence("AUCUAGCGCGUA", AlphabetType.ExtendedProtein, GeneticCode.Standard, sharedTags),
        sharedTags).SetName("Extended protein alphabet with tags");
}
/// <summary>
/// Supplies pairs of <see cref="ProteinSequence"/> instances together with the result
/// expected from comparing them for equality.
/// </summary>
/// <returns>A lazily produced sequence of named test cases with expected return values.</returns>
public IEnumerable<ITestCaseData> Equality_TestCases() {
    const string standard = "ACTGC";
    const string extended = "ACUGC";

    // Shortened variants: the same text with its last two characters removed.
    string shortenedStandard = standard.Substring(0, standard.Length - 2);
    string shortenedExtended = extended.Substring(0, extended.Length - 2);

    // All sequences are created up front, before the first test case is produced.
    ProteinSequence standardTextStandardAlphabet =
        new ProteinSequence(standard, AlphabetType.StandardProtein, GeneticCode.Standard);
    ProteinSequence standardTextExtendedAlphabet =
        new ProteinSequence(standard, AlphabetType.ExtendedProtein, GeneticCode.Standard);
    ProteinSequence extendedTextExtendedAlphabet =
        new ProteinSequence(extended, AlphabetType.ExtendedProtein, GeneticCode.Standard);
    ProteinSequence shortStandardAlphabet =
        new ProteinSequence(shortenedStandard, AlphabetType.StandardProtein, GeneticCode.Standard);
    ProteinSequence shortExtendedAlphabet =
        new ProteinSequence(shortenedExtended, AlphabetType.ExtendedProtein, GeneticCode.Standard);

    // This case compares an instance with itself.
    TestCaseData sameInstance = new TestCaseData(standardTextStandardAlphabet, standardTextStandardAlphabet);
    yield return sameInstance.Returns(true).SetName("Standard with standard");

    TestCaseData differentAlphabet = new TestCaseData(standardTextStandardAlphabet, standardTextExtendedAlphabet);
    yield return differentAlphabet.Returns(false).SetName("Standard with extended protein returns false");

    TestCaseData differentStandardText = new TestCaseData(standardTextStandardAlphabet, shortStandardAlphabet);
    yield return differentStandardText.Returns(false).SetName("Two different standard protein sequences return false");

    TestCaseData differentExtendedText = new TestCaseData(extendedTextExtendedAlphabet, shortExtendedAlphabet);
    yield return differentExtendedText.Returns(false).SetName("Two different extended protein sequences return false");
}
/// <summary>
/// Reads a fasta file and stores every protein it contains in <c>entries</c>, keyed by accession.
/// </summary>
/// <remarks>
/// Lines before the first header are skipped. The accession is the first capture group of
/// <c>regexUniprotAccession</c> when the header matches; otherwise it is the trimmed header text
/// after the leading '&gt;'. Any exception raised while reading or storing is not rethrown; it is
/// reported by setting <c>processInfo.ErrString</c>.
/// </remarks>
/// <param name="path">Path of the fasta file to read.</param>
/// <param name="processInfo">Receives progress messages and, on failure, the error text.</param>
public void ParseFile(string path, ProcessInfo processInfo) {
    processInfo.Status("Parsing " + path);
    string accession = "";
    int sequenceCounter = 0;
    StringBuilder sequence = new StringBuilder();
    ProteinSequence protein = new ProteinSequence();
    try {
        // 'using' guarantees the reader is closed even when parsing throws half-way through.
        using (StreamReader file = new StreamReader(path)) {
            string line;
            while ((line = file.ReadLine()) != null) {
                if (sequenceCounter%500 == 0) {
                    processInfo.Status("Parsing " + path + ", " +
                        (int) ((float) file.BaseStream.Position/file.BaseStream.Length*100) + "%");
                }
                // Ordinal comparison: '>' is a format marker, not linguistic text.
                bool lineIsHeader = line.StartsWith(">", StringComparison.Ordinal);
                if (!lineIsHeader) {
                    // Before the first header the line is ignored; afterwards it is a piece of the sequence.
                    if (sequenceCounter > 0) {
                        sequence.Append(line.Trim());
                    }
                    continue;
                }
                // The line is a fasta header.
                if (sequenceCounter > 0) {
                    // Not the first header, so the previous sequence has been read completely.
                    protein.SetSequence(sequence.ToString());
                    entries.Add(accession, protein);
                }
                // Start a new protein and parse its header.
                protein = new ProteinSequence();
                sequenceCounter++;
                string header = line;
                Match m = regexUniprotAccession.Match(header);
                // Fallback when the pattern does not match: take the entire header after the '>' as accession.
                accession = m.Success ? m.Groups[1].Value : header.Substring(1).Trim();
                protein.Accession = accession;
                protein.Header = header;
                sequence = new StringBuilder();
            }
        }
        // Add the last protein, provided the file contained at least one header.
        if (sequenceCounter > 0) {
            protein.SetSequence(sequence.ToString());
            entries.Add(accession, protein);
        }
    } catch (Exception) {
        processInfo.ErrString =
            "Something went wrong while parsing the fasta file.\nMake sure the path is correct and the " +
            "file is not opened in another application.\nMake sure the fasta file is valid.";
    }
}
/// <summary>
/// Creates a pair holding a protein sequence together with a protein match.
/// </summary>
/// <param name="proteinSequence">Value stored in the <c>ProteinSequence</c> member.</param>
/// <param name="proteinMatch">Value stored in the <c>ProteinMatch</c> member.</param>
public ProteinSequenceMatchDuo(ProteinSequence proteinSequence, ProteinMatch proteinMatch) {
    this.ProteinMatch = proteinMatch;
    this.ProteinSequence = proteinSequence;
}
/// <summary>
/// Verifies that the tags exposed by a <see cref="ProteinSequence"/> are equivalent
/// (same elements, any order) to the tags it is expected to carry.
/// </summary>
/// <param name="incoming">The sequence whose <c>Tags</c> are checked.</param>
/// <param name="incomingTags">The tags the sequence is expected to expose.</param>
public void Tag_Tests(ProteinSequence incoming, IEnumerable<string> incomingTags) {
    var expected = new HashSet<string>(incomingTags);

    // AreEquivalent takes (expected, actual); this order keeps failure messages labelled correctly.
    CollectionAssert.AreEquivalent(expected, incoming.Tags);
}
/// <summary>
/// Builds a <see cref="ProteinSequence"/> from the given text and alphabet (with the standard
/// genetic code) and reports the length of the sequence it stores.
/// </summary>
/// <param name="sequence">Sequence text passed to the constructor.</param>
/// <param name="alphabet">Alphabet passed to the constructor.</param>
/// <returns>The length of the constructed protein's <c>Sequence</c>.</returns>
public int Constructor_Tests(string sequence, AlphabetType alphabet) {
    ProteinSequence constructed = new ProteinSequence(sequence, alphabet, GeneticCode.Standard);
    int storedLength = constructed.Sequence.Length;
    return storedLength;
}
/// <summary>
/// Compares two protein sequences using <c>a.Equals(b)</c>.
/// </summary>
/// <param name="a">The sequence whose <c>Equals</c> method is invoked.</param>
/// <param name="b">The sequence passed as the argument to <c>Equals</c>.</param>
/// <returns>The result of the equality call.</returns>
public bool EqualityTests(ProteinSequence a, ProteinSequence b) {
    bool areEqual = a.Equals(b);
    return areEqual;
}
/// <summary>
/// Parses the configured fasta file and appends annotation columns to <paramref name="mdata"/>:
/// fasta header annotations (text), numeric annotations, theoretical peptide counts per protease
/// and, when a regular expression is supplied, sequence feature counts.
/// </summary>
/// <remarks>
/// Processing stops early, leaving <c>processInfo.ErrString</c> set, when the fasta file could not
/// be parsed or when the supplied regular expression is invalid.
/// </remarks>
/// <param name="mdata">Matrix whose rows are annotated; new columns are added to it.</param>
/// <param name="param">Parameters selecting the id column, the fasta file and the annotations.</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives status messages and error text.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
    string[][] proteinIds = new string[mdata.RowCount][];
    string[][] leadingIds = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
        // The leading id is the first id of the semicolon-separated list.
        leadingIds[row] = new[]{proteinIds[row][0]};
    }
    string fastaFilePath = param.GetParam<string>("Fasta file").Value;
    Fasta fasta = new Fasta();
    fasta.ParseFile(fastaFilePath, processInfo);
    // ParseFile reports failures through ErrString; stop here, as is done for an invalid regex below,
    // instead of annotating against an incomplete set of fasta entries.
    if (!string.IsNullOrEmpty(processInfo.ErrString)) {
        return;
    }

    // Text annotations
    processInfo.Status("Adding fasta header annotations.");
    int[] selection =
        param.GetParamWithSubParams<int>("Fasta header annotations").GetSubParameters().GetParam<int[]>("Annotations").Value;
    string[][] idsToBeAnnotated = (param.GetParamWithSubParams<int>("Fasta header annotations").Value == 0)
        ? proteinIds
        : leadingIds;
    ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        List<ProteinSequence> rowEntries = new List<ProteinSequence>();
        foreach (string id in idsToBeAnnotated[row]) {
            ProteinSequence found = fasta.GetEntry(id);
            // Ids without a fasta entry are skipped.
            if (found != null) {
                rowEntries.Add(found);
            }
        }
        fastaEntries[row] = rowEntries.ToArray();
    }
    if (ArrayUtils.Contains(selection, 0)) {
        // Entry name
        mdata.AddStringColumn("Entry name", "",
            CollectDistinctAnnotations(fastaEntries, protein => protein.EntryName));
    }
    if (ArrayUtils.Contains(selection, 1)) {
        // Gene name
        mdata.AddStringColumn("Gene name", "",
            CollectDistinctAnnotations(fastaEntries, protein => protein.GeneName));
    }
    if (ArrayUtils.Contains(selection, 2)) {
        // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
        // 'Isoform x of...' prefixes and '(Fragment)' suffixes
        mdata.AddStringColumn("Protein name (verbose)", "",
            CollectDistinctAnnotations(fastaEntries, protein => protein.ProteinName));
    }
    if (ArrayUtils.Contains(selection, 3)) {
        // Consensus protein name
        mdata.AddStringColumn("Protein name", "",
            CollectDistinctAnnotations(fastaEntries, protein => protein.ConsensusProteinName));
    }
    if (ArrayUtils.Contains(selection, 4)) {
        // Species
        mdata.AddStringColumn("Species", "",
            CollectDistinctAnnotations(fastaEntries, protein => protein.Species));
    }

    // Numeric annotations
    processInfo.Status("Adding numeric annotations.");
    selection =
        param.GetParamWithSubParams<int>("Numeric annotations").GetSubParameters().GetParam<int[]>("Annotations").Value;
    bool annotateLeadingId = (param.GetParamWithSubParams<int>("Numeric annotations").Value == 1);
    if (ArrayUtils.Contains(selection, 0)) {
        // Sequence length
        mdata.AddNumericColumn("Sequence length", "",
            ComputeRowMedians(fastaEntries, annotateLeadingId, protein => protein.GetSequence().Length));
    }
    if (ArrayUtils.Contains(selection, 1)) {
        // Monoisotopic molecular mass
        mdata.AddNumericColumn("Monoisotopic molecular mass", "",
            ComputeRowMedians(fastaEntries, annotateLeadingId, protein => protein.GetMonoisotopicMolecularMass()));
    }
    if (ArrayUtils.Contains(selection, 2)) {
        // Average molecular mass
        mdata.AddNumericColumn("Average molecular mass", "",
            ComputeRowMedians(fastaEntries, annotateLeadingId, protein => protein.GetAverageMolecularMass()));
    }

    // Theoretical peptides
    processInfo.Status("Calculating theoretical peptides.");
    annotateLeadingId = (param.GetParamWithSubParams<int>("Calculate theoretical peptides").Value == 1);
    Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
        param.GetParamWithSubParams<int>("Calculate theoretical peptides").GetSubParameters().GetParam<int[]>("Proteases")
            .Value);
    double minLength =
        param.GetParamWithSubParams<int>("Calculate theoretical peptides").GetSubParameters().GetParam<double>(
            "Min. peptide length").Value;
    double maxLength =
        param.GetParamWithSubParams<int>("Calculate theoretical peptides").GetSubParameters().GetParam<double>(
            "Max. peptide length").Value;
    // Peptide sequences are only listed when a single (leading) entry per row is annotated.
    bool displayPeptideSequences = annotateLeadingId &&
        param.GetParamWithSubParams<int>("Calculate theoretical peptides").GetSubParameters().GetParam<bool>(
            "Show sequences").Value;
    foreach (Protease protease in proteases) {
        double[] annotationColumn = new double[mdata.RowCount];
        string[] peptideColumn = new string[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++) {
            List<double> rowAnnotations = new List<double>();
            List<string> rowPeptides = new List<string>();
            foreach (ProteinSequence entry in fastaEntries[row]) {
                double nTheoreticalPeptides =
                    entry.GetNumberOfTheoreticalPeptides(protease, (int) minLength, (int) maxLength);
                rowAnnotations.Add(nTheoreticalPeptides);
                if (displayPeptideSequences) {
                    rowPeptides.AddRange(
                        entry.GetTheoreticalPeptideSequences(protease, (int) minLength, (int) maxLength));
                }
                // Only the first available entry counts when annotating the leading id.
                if (annotateLeadingId) {
                    break;
                }
            }
            annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
            peptideColumn[row] = String.Join(";", rowPeptides);
        }
        mdata.AddNumericColumn(
            "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
            annotationColumn);
        if (displayPeptideSequences) {
            mdata.AddStringColumn(
                "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
                peptideColumn);
        }
    }

    // Sequence features
    processInfo.Status("Counting sequence features.");
    annotateLeadingId = (param.GetParamWithSubParams<int>("Count sequence features").Value == 1);
    bool normalizeBySequenceLength =
        param.GetParamWithSubParams<int>("Count sequence features").GetSubParameters().GetParam<bool>(
            "Normalize by sequence length").Value;
    string featurePattern =
        param.GetParamWithSubParams<int>("Count sequence features").GetSubParameters().GetParam<string>("Regex").Value;
    if (featurePattern != "") {
        Regex regex;
        try {
            regex = new Regex(featurePattern);
        } catch (ArgumentException) {
            processInfo.ErrString = "The regular expression you provided has invalid syntax.";
            return;
        }
        double[] sequenceFeatureColumn = ComputeRowMedians(fastaEntries, annotateLeadingId, protein => {
            double nFeatures = regex.Matches(protein.GetSequence()).Count;
            return normalizeBySequenceLength ? nFeatures/protein.GetLength() : nFeatures;
        });
        mdata.AddNumericColumn(
            (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")", "",
            sequenceFeatureColumn);
    }
    processInfo.Status("Done.");
}

/// <summary>
/// Builds one text cell per row: the distinct, non-null annotation values of that row's entries,
/// in order of first occurrence, joined with ';'.
/// </summary>
private static string[] CollectDistinctAnnotations(ProteinSequence[][] fastaEntries,
    Func<ProteinSequence, string> getAnnotation) {
    string[] annotationColumn = new string[fastaEntries.Length];
    for (int row = 0; row < fastaEntries.Length; row++) {
        List<string> rowAnnotations = new List<string>();
        foreach (ProteinSequence entry in fastaEntries[row]) {
            string annotation = getAnnotation(entry);
            if (annotation != null && !ArrayUtils.Contains(rowAnnotations, annotation)) {
                rowAnnotations.Add(annotation);
            }
        }
        annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
    }
    return annotationColumn;
}

/// <summary>
/// Builds one numeric cell per row: the median of the values computed for that row's entries,
/// or the value of the first entry alone when <paramref name="firstEntryOnly"/> is set.
/// </summary>
private static double[] ComputeRowMedians(ProteinSequence[][] fastaEntries, bool firstEntryOnly,
    Func<ProteinSequence, double> getValue) {
    double[] annotationColumn = new double[fastaEntries.Length];
    for (int row = 0; row < fastaEntries.Length; row++) {
        List<double> rowValues = new List<double>();
        foreach (ProteinSequence entry in fastaEntries[row]) {
            rowValues.Add(getValue(entry));
            if (firstEntryOnly) {
                break;
            }
        }
        annotationColumn[row] = ArrayUtils.Median(rowValues.ToArray());
    }
    return annotationColumn;
}