Esempio n. 1
0
 /// <summary>
 /// Compares this sequence with another one by accession and by sequence.
 /// </summary>
 /// <param name="otherSequence">The sequence to compare with; may be null.</param>
 /// <returns>
 /// <c>true</c> when <paramref name="otherSequence"/> is not null and both its
 /// <c>Accession</c> and the value returned by <c>GetSequence()</c> compare equal
 /// (via <c>==</c>) to those of this instance; otherwise <c>false</c>.
 /// </returns>
 public bool Equals(ProteinSequence otherSequence)
 {
     // A null argument is never equal to an existing instance. ReferenceEquals is used so the
     // check cannot be redirected through a user-defined == operator.
     if (ReferenceEquals(otherSequence, null))
     {
         return false;
     }
     return Accession == otherSequence.Accession && GetSequence() == otherSequence.GetSequence();
 }
        /// <summary>
        /// Adds annotation columns derived from a fasta file to <paramref name="mdata"/>.
        /// Four groups of columns are produced, in this order: fasta header text annotations,
        /// numeric annotations, theoretical peptide counts (optionally with sequences) per protease,
        /// and regex-based sequence feature counts.
        /// </summary>
        /// <param name="mdata">Matrix whose rows are annotated; new string/numeric columns are appended to it.</param>
        /// <param name="param">
        /// Parameter set read by name: "Protein IDs", "Fasta file", "Fasta header annotations",
        /// "Numeric annotations", "Calculate theoretical peptides" and "Count sequence features".
        /// </param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">
        /// Receives status messages; its <c>ErrString</c> is set when the user-supplied regular
        /// expression cannot be constructed.
        /// </param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;

            // Per row: all ';'-separated ids, and a one-element array holding only the first id.
            string[][] proteinIds = new string[mdata.RowCount][];
            string[][] leadingIds = new string[mdata.RowCount][];
            for (int row = 0; row < mdata.RowCount; row++)
            {
                proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
                leadingIds[row] = new[] { proteinIds[row][0] };
            }
            string fastaFilePath = param.GetParam<string>("Fasta file").Value;
            Fasta fasta = new Fasta();
            fasta.ParseFile(fastaFilePath, processInfo);

            // Text annotations
            processInfo.Status("Adding fasta header annotations.");
            var headerParam = param.GetParamWithSubParams<int>("Fasta header annotations");
            int[] selection = headerParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
            // Choice 0 annotates every id of a row, any other choice only the first id.
            string[][] idsToBeAnnotated = headerParam.Value == 0 ? proteinIds : leadingIds;

            // Resolve the ids of each row to fasta entries once; ids without an entry are skipped.
            // NOTE(review): every later section works on these entries, i.e. on the id set chosen
            // by "Fasta header annotations" - confirm this coupling is intended.
            ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
            for (int row = 0; row < mdata.RowCount; row++)
            {
                List<ProteinSequence> rowEntries = new List<ProteinSequence>();
                foreach (string id in idsToBeAnnotated[row])
                {
                    ProteinSequence entry = fasta.GetEntry(id);
                    if (entry != null)
                    {
                        rowEntries.Add(entry);
                    }
                }
                fastaEntries[row] = rowEntries.ToArray();
            }
            if (ArrayUtils.Contains(selection, 0))              // Entry name
            {
                AddDistinctTextAnnotation(mdata, fastaEntries, "Entry name", entry => entry.EntryName);
            }
            if (ArrayUtils.Contains(selection, 1))              // Gene name
            {
                AddDistinctTextAnnotation(mdata, fastaEntries, "Gene name", entry => entry.GeneName);
            }
            if (ArrayUtils.Contains(selection, 2))
            {
                // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
                // 'Isoform x of...' prefixes and '(Fragment)' suffixes
                AddDistinctTextAnnotation(mdata, fastaEntries, "Protein name (verbose)", entry => entry.ProteinName);
            }
            if (ArrayUtils.Contains(selection, 3))              // Consensus protein name
            {
                AddDistinctTextAnnotation(mdata, fastaEntries, "Protein name", entry => entry.ConsensusProteinName);
            }
            if (ArrayUtils.Contains(selection, 4))              // Species
            {
                AddDistinctTextAnnotation(mdata, fastaEntries, "Species", entry => entry.Species);
            }

            // Numeric annotations
            processInfo.Status("Adding numeric annotations.");
            var numericParam = param.GetParamWithSubParams<int>("Numeric annotations");
            selection = numericParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
            bool annotateLeadingId = numericParam.Value == 1;

            if (ArrayUtils.Contains(selection, 0))              // Sequence length
            {
                AddMedianNumericAnnotation(mdata, fastaEntries, "Sequence length", annotateLeadingId,
                                           entry => entry.GetSequence().Length);
            }
            if (ArrayUtils.Contains(selection, 1))              // Monoisotopic molecular mass
            {
                AddMedianNumericAnnotation(mdata, fastaEntries, "Monoisotopic molecular mass", annotateLeadingId,
                                           entry => entry.GetMonoisotopicMolecularMass());
            }
            if (ArrayUtils.Contains(selection, 2))              // Average molecular mass
            {
                AddMedianNumericAnnotation(mdata, fastaEntries, "Average molecular mass", annotateLeadingId,
                                           entry => entry.GetAverageMolecularMass());
            }

            // Theoretical peptides
            processInfo.Status("Calculating theoretical peptides.");
            var peptideParam = param.GetParamWithSubParams<int>("Calculate theoretical peptides");
            annotateLeadingId = peptideParam.Value == 1;
            var peptideSubParams = peptideParam.GetSubParameters();
            Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
                                                       peptideSubParams.GetParam<int[]>("Proteases").Value);
            double minLength = peptideSubParams.GetParam<double>("Min. peptide length").Value;
            double maxLength = peptideSubParams.GetParam<double>("Max. peptide length").Value;
            // The short-circuit matters: "Show sequences" is only read when the leading id is annotated.
            bool displayPeptideSequences = annotateLeadingId &&
                                           peptideSubParams.GetParam<bool>("Show sequences").Value;
            int minPeptideLength = (int)minLength;
            int maxPeptideLength = (int)maxLength;

            foreach (Protease protease in proteases)
            {
                // The column titles show the lengths as entered (doubles), not the truncated ints.
                string columnSuffix = "(" + protease.name + ", " + minLength + "-" + maxLength + ")";
                double[] countColumn = new double[mdata.RowCount];
                string[] peptideColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List<double> rowCounts = new List<double>();
                    List<string> rowPeptides = new List<string>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        rowCounts.Add(entry.GetNumberOfTheoreticalPeptides(protease, minPeptideLength, maxPeptideLength));
                        if (displayPeptideSequences)
                        {
                            rowPeptides.AddRange(
                                entry.GetTheoreticalPeptideSequences(protease, minPeptideLength, maxPeptideLength));
                        }
                        if (annotateLeadingId)
                        {
                            // Only the first resolved entry of the row is used in this mode.
                            break;
                        }
                    }
                    countColumn[row] = ArrayUtils.Median(rowCounts.ToArray());
                    peptideColumn[row] = String.Join(";", rowPeptides);
                }
                mdata.AddNumericColumn("Number of theoretical peptides " + columnSuffix, "", countColumn);
                if (displayPeptideSequences)
                {
                    mdata.AddStringColumn("Theoretical peptide sequences " + columnSuffix, "", peptideColumn);
                }
            }

            // Sequence features
            processInfo.Status("Counting sequence features.");
            var featureParam = param.GetParamWithSubParams<int>("Count sequence features");
            annotateLeadingId = featureParam.Value == 1;
            var featureSubParams = featureParam.GetSubParameters();
            bool normalizeBySequenceLength = featureSubParams.GetParam<bool>("Normalize by sequence length").Value;
            string pattern = featureSubParams.GetParam<string>("Regex").Value;

            if (pattern != "")
            {
                Regex regex;
                try
                {
                    regex = new Regex(pattern);
                }
                catch (ArgumentException)
                {
                    // Columns added by the earlier sections remain on mdata when we bail out here.
                    processInfo.ErrString = "The regular expression you provided has invalid syntax.";
                    return;
                }
                string featureColumnName =
                    (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")";
                AddMedianNumericAnnotation(mdata, fastaEntries, featureColumnName, annotateLeadingId, entry =>
                {
                    double nFeatures = regex.Matches(entry.GetSequence()).Count;
                    return normalizeBySequenceLength ? nFeatures / entry.GetLength() : nFeatures;
                });
            }
            processInfo.Status("Done.");
        }

        /// <summary>
        /// Appends a string column in which each row holds the distinct, non-null values that
        /// <paramref name="getAnnotation"/> yields for the row's fasta entries, joined with ';'
        /// in order of first occurrence.
        /// </summary>
        private static void AddDistinctTextAnnotation(IMatrixData mdata, ProteinSequence[][] fastaEntries,
                                                      string columnName, Func<ProteinSequence, string> getAnnotation)
        {
            string[] annotationColumn = new string[mdata.RowCount];
            for (int row = 0; row < mdata.RowCount; row++)
            {
                List<string> rowAnnotations = new List<string>();
                foreach (ProteinSequence entry in fastaEntries[row])
                {
                    string annotation = getAnnotation(entry);
                    if (annotation != null && !ArrayUtils.Contains(rowAnnotations, annotation))
                    {
                        rowAnnotations.Add(annotation);
                    }
                }
                annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
            }
            mdata.AddStringColumn(columnName, "", annotationColumn);
        }

        /// <summary>
        /// Appends a numeric column in which each row holds <c>ArrayUtils.Median</c> of the values that
        /// <paramref name="getValue"/> yields for the row's fasta entries. When
        /// <paramref name="firstEntryOnly"/> is set, only the first entry of each row contributes.
        /// </summary>
        private static void AddMedianNumericAnnotation(IMatrixData mdata, ProteinSequence[][] fastaEntries,
                                                       string columnName, bool firstEntryOnly,
                                                       Func<ProteinSequence, double> getValue)
        {
            double[] annotationColumn = new double[mdata.RowCount];
            for (int row = 0; row < mdata.RowCount; row++)
            {
                List<double> rowValues = new List<double>();
                foreach (ProteinSequence entry in fastaEntries[row])
                {
                    rowValues.Add(getValue(entry));
                    if (firstEntryOnly)
                    {
                        break;
                    }
                }
                // Rows without any resolved entry pass an empty array to Median.
                annotationColumn[row] = ArrayUtils.Median(rowValues.ToArray());
            }
            mdata.AddNumericColumn(columnName, "", annotationColumn);
        }