/// <summary>
/// Compares this protein with <paramref name="otherSequence"/> by accession and by sequence.
/// </summary>
/// <param name="otherSequence">The protein to compare against; may be null.</param>
/// <returns>
/// True when both <c>Accession</c> and the value returned by <c>GetSequence()</c> compare equal
/// with <c>==</c>; false otherwise, including when <paramref name="otherSequence"/> is null.
/// </returns>
public bool Equals(ProteinSequence otherSequence)
{
    // A null argument is never equal to this instance. ReferenceEquals is used so the check
    // does not depend on any == operator the type may define.
    if (ReferenceEquals(otherSequence, null))
    {
        return false;
    }
    return Accession == otherSequence.Accession && GetSequence() == otherSequence.GetSequence();
}
/// <summary>
/// Reads the fasta file at <paramref name="path"/> and adds one <c>ProteinSequence</c> per
/// header line to <c>entries</c>, keyed by accession.
/// </summary>
/// <remarks>
/// Lines before the first header (a line starting with '&gt;') are skipped. Sequence lines are
/// trimmed and concatenated. The accession is the first capture group of
/// <c>regexUniprotAccession</c> when the header matches it, otherwise the trimmed header text
/// after the leading '&gt;'. Any exception raised while reading or storing is caught and reported
/// through <c>processInfo.ErrString</c>; entries added before the failure stay in <c>entries</c>.
/// </remarks>
/// <param name="path">Path of the fasta file to read.</param>
/// <param name="processInfo">Receives progress messages and, on failure, the error text.</param>
public void ParseFile(string path, ProcessInfo processInfo)
{
    processInfo.Status("Parsing " + path);
    string accession = "";
    int sequenceCounter = 0;
    StringBuilder sequence = new StringBuilder();
    ProteinSequence protein = new ProteinSequence();
    try
    {
        // The using block releases the file handle even when an exception escapes the loop.
        using (StreamReader file = new StreamReader(path))
        {
            string line;
            while ((line = file.ReadLine()) != null)
            {
                // Note: this fires for every line read while the counter sits on a multiple of 500.
                if (sequenceCounter % 500 == 0)
                {
                    processInfo.Status("Parsing " + path + ", " +
                                       (int)((float)file.BaseStream.Position / file.BaseStream.Length * 100) + "%");
                }

                bool lineIsHeader = line.StartsWith(">");
                if (!lineIsHeader)
                {
                    // Anything before the first header is ignored; afterwards it is sequence data.
                    if (sequenceCounter > 0)
                    {
                        sequence.Append(line.Trim());
                    }
                    continue;
                }

                // A new header means the previous sequence (if any) has been read completely.
                if (sequenceCounter > 0)
                {
                    protein.SetSequence(sequence.ToString());
                    // NOTE(review): presumably entries is a dictionary, in which case a repeated
                    // accession makes Add throw and ends up in the catch below - confirm.
                    entries.Add(accession, protein);
                }

                // Start a new protein and parse its header.
                protein = new ProteinSequence();
                sequenceCounter++;
                string header = line;
                Match m = regexUniprotAccession.Match(header);
                // Uniprot header: use the captured accession.
                // Fallback: take the entire header after the '>' as accession.
                accession = m.Success ? m.Groups[1].Value : header.Substring(1).Trim();
                protein.Accession = accession;
                protein.Header = header;
                sequence = new StringBuilder();
            }
        }

        // Add the last protein, provided the file contained at least one header.
        if (sequenceCounter > 0)
        {
            protein.SetSequence(sequence.ToString());
            entries.Add(accession, protein);
        }
    }
    catch (Exception)
    {
        processInfo.ErrString = "Something went wrong while parsing the fasta file.\nMake sure the path is correct and the " +
                                "file is not opened in another application.\nMake sure the fasta file is valid.";
    }
}
/// <summary>
/// Adds annotation columns derived from a fasta file to <paramref name="mdata"/>.
/// </summary>
/// <remarks>
/// The protein IDs of each row are read from the string column selected by "Protein IDs" and split
/// on ';'. The fasta file named by "Fasta file" is parsed and the matching entries are looked up
/// per row. Depending on the parameters, the method then adds: text columns (entry name, gene name,
/// protein names, species), numeric columns (sequence length, molecular masses), the number of
/// theoretical peptides per selected protease (optionally with the peptide sequences), and a
/// regex-based feature count. Numeric values of a row are combined with <c>ArrayUtils.Median</c>.
/// If the feature regex cannot be constructed, <c>processInfo.ErrString</c> is set and the method
/// returns; columns added before that point remain in <paramref name="mdata"/>.
/// </remarks>
/// <param name="mdata">Matrix whose rows are annotated; new columns are appended to it.</param>
/// <param name="param">Parameter set holding the user's selections.</param>
/// <param name="supplTables">Not referenced by this method.</param>
/// <param name="documents">Not referenced by this method.</param>
/// <param name="processInfo">Receives status messages and error text.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo)
{
    int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
    string[][] proteinIds = new string[mdata.RowCount][];
    string[][] leadingIds = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++)
    {
        proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
        leadingIds[row] = new[] { proteinIds[row][0] };
    }
    string fastaFilePath = param.GetParam<string>("Fasta file").Value;
    Fasta fasta = new Fasta();
    fasta.ParseFile(fastaFilePath, processInfo);

    // Text annotations
    processInfo.Status("Adding fasta header annotations.");
    var headerParam = param.GetParamWithSubParams<int>("Fasta header annotations");
    int[] selection = headerParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
    string[][] idsToBeAnnotated = headerParam.Value == 0 ? proteinIds : leadingIds;
    ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++)
    {
        List<ProteinSequence> rowEntries = new List<ProteinSequence>();
        foreach (string id in idsToBeAnnotated[row])
        {
            ProteinSequence entry = fasta.GetEntry(id);
            // IDs without a fasta entry are skipped.
            if (entry != null)
            {
                rowEntries.Add(entry);
            }
        }
        fastaEntries[row] = rowEntries.ToArray();
    }
    if (ArrayUtils.Contains(selection, 0)) // Entry name
    {
        AddDistinctTextAnnotation(mdata, fastaEntries, "Entry name", entry => entry.EntryName);
    }
    if (ArrayUtils.Contains(selection, 1)) // Gene name
    {
        AddDistinctTextAnnotation(mdata, fastaEntries, "Gene name", entry => entry.GeneName);
    }
    if (ArrayUtils.Contains(selection, 2))
    {
        // Verbose protein name, i.e. all protein names annotated in all fasta headers,
        // including the 'Isoform x of...' prefixes and '(Fragment)' suffixes
        AddDistinctTextAnnotation(mdata, fastaEntries, "Protein name (verbose)", entry => entry.ProteinName);
    }
    if (ArrayUtils.Contains(selection, 3)) // Consensus protein name
    {
        AddDistinctTextAnnotation(mdata, fastaEntries, "Protein name", entry => entry.ConsensusProteinName);
    }
    if (ArrayUtils.Contains(selection, 4)) // Species
    {
        AddDistinctTextAnnotation(mdata, fastaEntries, "Species", entry => entry.Species);
    }

    // Numeric annotations
    processInfo.Status("Adding numeric annotations.");
    var numericParam = param.GetParamWithSubParams<int>("Numeric annotations");
    selection = numericParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
    bool annotateLeadingId = numericParam.Value == 1;
    if (ArrayUtils.Contains(selection, 0)) // Sequence length
    {
        AddMedianNumericAnnotation(mdata, fastaEntries, "Sequence length", annotateLeadingId,
            entry => entry.GetSequence().Length);
    }
    if (ArrayUtils.Contains(selection, 1)) // Monoisotopic molecular mass
    {
        AddMedianNumericAnnotation(mdata, fastaEntries, "Monoisotopic molecular mass", annotateLeadingId,
            entry => entry.GetMonoisotopicMolecularMass());
    }
    if (ArrayUtils.Contains(selection, 2)) // Average molecular mass
    {
        AddMedianNumericAnnotation(mdata, fastaEntries, "Average molecular mass", annotateLeadingId,
            entry => entry.GetAverageMolecularMass());
    }

    // Theoretical peptides
    processInfo.Status("Calculating theoretical peptides.");
    var peptideParam = param.GetParamWithSubParams<int>("Calculate theoretical peptides");
    var peptideSubParams = peptideParam.GetSubParameters();
    annotateLeadingId = peptideParam.Value == 1;
    Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
        peptideSubParams.GetParam<int[]>("Proteases").Value);
    double minLength = peptideSubParams.GetParam<double>("Min. peptide length").Value;
    double maxLength = peptideSubParams.GetParam<double>("Max. peptide length").Value;
    // "Show sequences" is only read when the leading-ID option is selected (short-circuit).
    bool displayPeptideSequences = annotateLeadingId && peptideSubParams.GetParam<bool>("Show sequences").Value;
    foreach (Protease protease in proteases)
    {
        double[] annotationColumn = new double[mdata.RowCount];
        string[] peptideColumn = new string[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++)
        {
            List<double> rowAnnotations = new List<double>();
            List<string> rowPeptides = new List<string>();
            foreach (ProteinSequence entry in fastaEntries[row])
            {
                double nTheoreticalPeptides =
                    entry.GetNumberOfTheoreticalPeptides(protease, (int)minLength, (int)maxLength);
                rowAnnotations.Add(nTheoreticalPeptides);
                if (displayPeptideSequences)
                {
                    rowPeptides.AddRange(
                        entry.GetTheoreticalPeptideSequences(protease, (int)minLength, (int)maxLength));
                }
                // Only the first available entry is used when annotating the leading ID.
                if (annotateLeadingId)
                {
                    break;
                }
            }
            annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
            peptideColumn[row] = String.Join(";", rowPeptides);
        }
        mdata.AddNumericColumn(
            "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
            annotationColumn);
        if (displayPeptideSequences)
        {
            mdata.AddStringColumn(
                "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
                peptideColumn);
        }
    }

    // Sequence features
    processInfo.Status("Counting sequence features.");
    var featureParam = param.GetParamWithSubParams<int>("Count sequence features");
    var featureSubParams = featureParam.GetSubParameters();
    annotateLeadingId = featureParam.Value == 1;
    bool normalizeBySequenceLength = featureSubParams.GetParam<bool>("Normalize by sequence length").Value;
    string pattern = featureSubParams.GetParam<string>("Regex").Value;
    if (pattern != "")
    {
        Regex regex;
        try
        {
            regex = new Regex(pattern);
        }
        catch (ArgumentException)
        {
            processInfo.ErrString = "The regular expression you provided has invalid syntax.";
            return;
        }
        string columnName = (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")";
        AddMedianNumericAnnotation(mdata, fastaEntries, columnName, annotateLeadingId, entry =>
        {
            double nFeatures = regex.Matches(entry.GetSequence()).Count;
            return normalizeBySequenceLength ? nFeatures / entry.GetLength() : nFeatures;
        });
    }
    processInfo.Status("Done.");
}

/// <summary>
/// Adds a string column in which each row holds the distinct non-null values that
/// <paramref name="getValue"/> yields for that row's entries, joined with ';' in encounter order.
/// </summary>
private static void AddDistinctTextAnnotation(IMatrixData mdata, ProteinSequence[][] fastaEntries,
    string columnName, Func<ProteinSequence, string> getValue)
{
    string[] annotationColumn = new string[mdata.RowCount];
    for (int row = 0; row < mdata.RowCount; row++)
    {
        List<string> rowAnnotations = new List<string>();
        foreach (ProteinSequence entry in fastaEntries[row])
        {
            string value = getValue(entry);
            if (value != null && !ArrayUtils.Contains(rowAnnotations, value))
            {
                rowAnnotations.Add(value);
            }
        }
        annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
    }
    mdata.AddStringColumn(columnName, "", annotationColumn);
}

/// <summary>
/// Adds a numeric column in which each row holds <c>ArrayUtils.Median</c> of the values that
/// <paramref name="getValue"/> yields for that row's entries. When <paramref name="firstEntryOnly"/>
/// is true, only the first entry of each row is evaluated.
/// </summary>
private static void AddMedianNumericAnnotation(IMatrixData mdata, ProteinSequence[][] fastaEntries,
    string columnName, bool firstEntryOnly, Func<ProteinSequence, double> getValue)
{
    double[] annotationColumn = new double[mdata.RowCount];
    for (int row = 0; row < mdata.RowCount; row++)
    {
        List<double> rowAnnotations = new List<double>();
        foreach (ProteinSequence entry in fastaEntries[row])
        {
            rowAnnotations.Add(getValue(entry));
            if (firstEntryOnly)
            {
                break;
            }
        }
        // Rows without any fasta entry pass an empty array; the result then depends on ArrayUtils.Median.
        annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
    }
    mdata.AddNumericColumn(columnName, "", annotationColumn);
}