Esempio n. 1
0
 /// <summary>
 /// Compares this protein with <paramref name="otherSequence"/>.
 /// </summary>
 /// <param name="otherSequence">The protein to compare against; may be null.</param>
 /// <returns>
 /// <c>true</c> when both the <c>Accession</c> and the value returned by <c>GetSequence()</c>
 /// compare equal; <c>false</c> otherwise, including when <paramref name="otherSequence"/> is null.
 /// </returns>
 public bool Equals(ProteinSequence otherSequence)
 {
     // ReferenceEquals is used so the null test cannot be routed through a user-defined == operator.
     if (ReferenceEquals(otherSequence, null))
     {
         return false;
     }
     // Same instance: no need to fetch and compare the sequences.
     if (ReferenceEquals(this, otherSequence))
     {
         return true;
     }
     return Accession == otherSequence.Accession && GetSequence() == otherSequence.GetSequence();
 }
Esempio n. 2
0
        /// <summary>
        /// Reads the FASTA file at <paramref name="path"/> and adds one <c>ProteinSequence</c> per
        /// header line to <c>entries</c>, keyed by accession.
        /// </summary>
        /// <remarks>
        /// Lines before the first header (a line starting with '&gt;') are ignored. All following
        /// non-header lines are trimmed and concatenated into the sequence of the current protein.
        /// The accession is group 1 of <c>regexUniprotAccession</c> when the header matches it;
        /// otherwise the whole header text after the '&gt;' (trimmed) is used.
        /// Any exception raised while reading or storing is caught and reported through
        /// <c>processInfo.ErrString</c>; proteins added before the failure remain in <c>entries</c>.
        /// </remarks>
        /// <param name="path">Path of the FASTA file to read.</param>
        /// <param name="processInfo">Receives progress messages and, on failure, the error text.</param>
        public void ParseFile(string path, ProcessInfo processInfo)
        {
            processInfo.Status("Parsing " + path);
            string          accession       = "";
            int             sequenceCounter = 0;
            StringBuilder   sequence        = new StringBuilder();
            ProteinSequence protein         = new ProteinSequence();

            try
            {
                // 'using' guarantees the file handle is released even when parsing throws.
                using (StreamReader file = new StreamReader(path))
                {
                    string line;
                    while ((line = file.ReadLine()) != null)
                    {
                        bool lineIsHeader = line.Length > 0 && line[0] == '>';

                        if (!lineIsHeader)
                        {
                            // Before the first header the line is skipped; afterwards it is a
                            // piece of the current sequence.
                            if (sequenceCounter > 0)
                            {
                                sequence.Append(line.Trim());
                            }
                            continue;
                        }

                        // Report progress once per 500 headers rather than on every line.
                        // Position is that of the underlying stream, so the percentage is approximate.
                        if (sequenceCounter % 500 == 0)
                        {
                            processInfo.Status("Parsing " + path + ", " +
                                               (int)((float)file.BaseStream.Position / file.BaseStream.Length * 100) + "%");
                        }

                        // A new header completes the previous protein (if there was one).
                        if (sequenceCounter > 0)
                        {
                            protein.SetSequence(sequence.ToString());
                            entries.Add(accession, protein);
                        }

                        // Start the next protein and derive its accession from the header.
                        protein = new ProteinSequence();
                        sequenceCounter++;
                        Match m = regexUniprotAccession.Match(line);
                        accession = m.Success
                            ? m.Groups[1].Value              // uniprot header
                            : line.Substring(1).Trim();      // fallback: everything after the '>'
                        protein.Accession = accession;
                        protein.Header    = line;
                        sequence          = new StringBuilder();
                    }
                }

                // Add the last protein; skipped when the file contained no header at all.
                if (sequenceCounter > 0)
                {
                    protein.SetSequence(sequence.ToString());
                    entries.Add(accession, protein);
                }
            }
            catch (Exception)
            {
                processInfo.ErrString =
                    "Something went wrong while parsing the fasta file.\nMake sure the path is correct and the " +
                    "file is not opened in another application.\nMake sure the fasta file is valid.";
            }
        }
        /// <summary>
        /// Adds annotation columns derived from a FASTA file to <paramref name="mdata"/>.
        /// </summary>
        /// <remarks>
        /// The protein IDs of each row are read from the string column selected by the
        /// "Protein IDs" parameter (values separated by ';') and looked up in the parsed FASTA file.
        /// Depending on the parameters, the following columns are appended, in this order:
        /// text annotations from the fasta headers, numeric annotations, theoretical peptide counts
        /// (optionally with sequences) per selected protease, and a regex feature count.
        /// An invalid regular expression sets <c>processInfo.ErrString</c> and returns; columns
        /// added up to that point are not removed.
        /// </remarks>
        /// <param name="mdata">Matrix whose rows are annotated; columns are added in place.</param>
        /// <param name="param">Parameter set holding the user's choices.</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Receives status messages and error text.</param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;

            string[][] proteinIds = new string[mdata.RowCount][];
            string[][] leadingIds = new string[mdata.RowCount][];
            for (int row = 0; row < mdata.RowCount; row++)
            {
                proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
                // The leading ID is the first ID listed for the row.
                leadingIds[row] = new[] { proteinIds[row][0] };
            }
            string fastaFilePath = param.GetParam<string>("Fasta file").Value;
            Fasta  fasta         = new Fasta();

            // NOTE(review): a parse failure is only reported through processInfo.ErrString;
            // processing continues with whatever entries were read - confirm this is intended.
            fasta.ParseFile(fastaFilePath, processInfo);

            // Text annotations
            processInfo.Status("Adding fasta header annotations.");
            var   headerParam = param.GetParamWithSubParams<int>("Fasta header annotations");
            int[] selection   = headerParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
            string[][] idsToBeAnnotated = headerParam.Value == 0 ? proteinIds : leadingIds;

            // Resolve the IDs of every row once; IDs without a fasta entry are dropped.
            ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
            for (int row = 0; row < mdata.RowCount; row++)
            {
                List<ProteinSequence> rowEntries = new List<ProteinSequence>();
                foreach (string id in idsToBeAnnotated[row])
                {
                    ProteinSequence entry = fasta.GetEntry(id);
                    if (entry != null)
                    {
                        rowEntries.Add(entry);
                    }
                }
                fastaEntries[row] = rowEntries.ToArray();
            }

            if (ArrayUtils.Contains(selection, 0))
            {
                AddTextAnnotationColumn(mdata, fastaEntries, "Entry name", entry => entry.EntryName);
            }
            if (ArrayUtils.Contains(selection, 1))
            {
                AddTextAnnotationColumn(mdata, fastaEntries, "Gene name", entry => entry.GeneName);
            }
            if (ArrayUtils.Contains(selection, 2))
            {
                // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
                // 'Isoform x of...' prefixes and '(Fragment)' suffixes
                AddTextAnnotationColumn(mdata, fastaEntries, "Protein name (verbose)", entry => entry.ProteinName);
            }
            if (ArrayUtils.Contains(selection, 3))
            {
                AddTextAnnotationColumn(mdata, fastaEntries, "Protein name", entry => entry.ConsensusProteinName);
            }
            if (ArrayUtils.Contains(selection, 4))
            {
                AddTextAnnotationColumn(mdata, fastaEntries, "Species", entry => entry.Species);
            }

            // Numeric annotations
            processInfo.Status("Adding numeric annotations.");
            var numericParam = param.GetParamWithSubParams<int>("Numeric annotations");
            selection = numericParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
            bool annotateLeadingId = numericParam.Value == 1;

            if (ArrayUtils.Contains(selection, 0))
            {
                AddNumericAnnotationColumn(mdata, fastaEntries, "Sequence length", annotateLeadingId,
                                           entry => entry.GetSequence().Length);
            }
            if (ArrayUtils.Contains(selection, 1))
            {
                AddNumericAnnotationColumn(mdata, fastaEntries, "Monoisotopic molecular mass", annotateLeadingId,
                                           entry => entry.GetMonoisotopicMolecularMass());
            }
            if (ArrayUtils.Contains(selection, 2))
            {
                AddNumericAnnotationColumn(mdata, fastaEntries, "Average molecular mass", annotateLeadingId,
                                           entry => entry.GetAverageMolecularMass());
            }

            // Theoretical peptides
            processInfo.Status("Calculating theoretical peptides.");
            var peptideParam    = param.GetParamWithSubParams<int>("Calculate theoretical peptides");
            var peptideSubParam = peptideParam.GetSubParameters();
            annotateLeadingId = peptideParam.Value == 1;
            Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
                                                       peptideSubParam.GetParam<int[]>("Proteases").Value);
            double minLength = peptideSubParam.GetParam<double>("Min. peptide length").Value;
            double maxLength = peptideSubParam.GetParam<double>("Max. peptide length").Value;
            // Peptide sequences are only listed when a single (leading) ID per row is annotated.
            bool displayPeptideSequences = annotateLeadingId &&
                                           peptideSubParam.GetParam<bool>("Show sequences").Value;

            foreach (Protease protease in proteases)
            {
                double[] annotationColumn = new double[mdata.RowCount];
                string[] peptideColumn    = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List<double> rowAnnotations = new List<double>();
                    List<string> rowPeptides    = new List<string>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        double nTheoreticalPeptides = entry.GetNumberOfTheoreticalPeptides(protease, (int)minLength, (int)maxLength);
                        rowAnnotations.Add(nTheoreticalPeptides);
                        if (displayPeptideSequences)
                        {
                            rowPeptides.AddRange(entry.GetTheoreticalPeptideSequences(protease, (int)minLength, (int)maxLength));
                        }
                        if (annotateLeadingId)
                        {
                            break;
                        }
                    }
                    annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
                    peptideColumn[row]    = String.Join(";", rowPeptides);
                }
                mdata.AddNumericColumn(
                    "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "", annotationColumn);
                if (displayPeptideSequences)
                {
                    mdata.AddStringColumn(
                        "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "", peptideColumn);
                }
            }

            // Sequence features
            processInfo.Status("Counting sequence features.");
            var featureParam    = param.GetParamWithSubParams<int>("Count sequence features");
            var featureSubParam = featureParam.GetSubParameters();
            annotateLeadingId = featureParam.Value == 1;
            bool   normalizeBySequenceLength = featureSubParam.GetParam<bool>("Normalize by sequence length").Value;
            string pattern                   = featureSubParam.GetParam<string>("Regex").Value;

            if (pattern != "")
            {
                Regex regex;
                try
                {
                    regex = new Regex(pattern);
                }
                catch (ArgumentException)
                {
                    processInfo.ErrString = "The regular expression you provided has invalid syntax.";
                    return;
                }
                double[] sequenceFeatureColumn = new double[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List<double> featureCount = new List<double>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        double nFeatures = regex.Matches(entry.GetSequence()).Count;
                        featureCount.Add(normalizeBySequenceLength ? nFeatures / entry.GetLength() : nFeatures);
                        if (annotateLeadingId)
                        {
                            break;
                        }
                    }
                    sequenceFeatureColumn[row] = ArrayUtils.Median(featureCount.ToArray());
                }
                mdata.AddNumericColumn(
                    (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")", "",
                    sequenceFeatureColumn);
            }
            processInfo.Status("Done.");
        }

        /// <summary>
        /// Adds a string column in which each row holds the distinct, non-null values that
        /// <paramref name="selector"/> yields for the row's fasta entries, joined with ';'
        /// in order of first occurrence.
        /// </summary>
        private static void AddTextAnnotationColumn(IMatrixData mdata, ProteinSequence[][] fastaEntries,
                                                    string columnName, Func<ProteinSequence, string> selector)
        {
            string[] annotationColumn = new string[mdata.RowCount];
            for (int row = 0; row < mdata.RowCount; row++)
            {
                List<string> rowAnnotations = new List<string>();
                foreach (ProteinSequence entry in fastaEntries[row])
                {
                    string annotation = selector(entry);
                    if (annotation != null && !ArrayUtils.Contains(rowAnnotations, annotation))
                    {
                        rowAnnotations.Add(annotation);
                    }
                }
                annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
            }
            mdata.AddStringColumn(columnName, "", annotationColumn);
        }

        /// <summary>
        /// Adds a numeric column in which each row holds <c>ArrayUtils.Median</c> of the values
        /// that <paramref name="selector"/> yields for the row's fasta entries. When
        /// <paramref name="leadingOnly"/> is set, only the first entry of each row is evaluated.
        /// </summary>
        private static void AddNumericAnnotationColumn(IMatrixData mdata, ProteinSequence[][] fastaEntries,
                                                       string columnName, bool leadingOnly,
                                                       Func<ProteinSequence, double> selector)
        {
            double[] annotationColumn = new double[mdata.RowCount];
            for (int row = 0; row < mdata.RowCount; row++)
            {
                List<double> rowAnnotations = new List<double>();
                foreach (ProteinSequence entry in fastaEntries[row])
                {
                    rowAnnotations.Add(selector(entry));
                    if (leadingOnly)
                    {
                        break;
                    }
                }
                // NOTE(review): rows without any fasta entry pass an empty array to Median;
                // the resulting value depends on ArrayUtils.Median - confirm.
                annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
            }
            mdata.AddNumericColumn(columnName, "", annotationColumn);
        }