Ejemplo n.º 1
0
        /// <summary>
        /// Supplies test cases that pair a tagged <c>ProteinSequence</c> with the tag set it was built from.
        /// </summary>
        /// <returns>
        /// A lazily evaluated sequence of two cases: one built with <c>AlphabetType.StandardProtein</c>,
        /// then one built with <c>AlphabetType.ExtendedProtein</c>.
        /// </returns>
        public IEnumerable <ITestCaseData> Tag_TestCases()
        {
            // The very same set instance is handed to the sequence constructor and to the test case.
            var sharedTags = new HashSet <string>();
            sharedTags.Add("Foo");
            sharedTags.Add("Bar");
            sharedTags.Add("Baz");

            var standardSequence = new ProteinSequence("ACTCTTCAGC", AlphabetType.StandardProtein, GeneticCode.Standard, sharedTags);
            var standardCase     = new TestCaseData(standardSequence, sharedTags);

            yield return standardCase.SetName("Standard protein alphabet with tags");

            // The second sequence is only constructed once the caller asks for the next case.
            var extendedSequence = new ProteinSequence("AUCUAGCGCGUA", AlphabetType.ExtendedProtein, GeneticCode.Standard, sharedTags);
            var extendedCase     = new TestCaseData(extendedSequence, sharedTags);

            yield return extendedCase.SetName("Extended protein alphabet with tags");
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Supplies pairs of <c>ProteinSequence</c> instances together with the result expected
        /// from comparing them for equality.
        /// </summary>
        /// <returns>
        /// Four cases: a sequence paired with itself (expects <c>true</c>), the same letters under
        /// different alphabets, and two pairs whose second member is two letters shorter
        /// (all expect <c>false</c>).
        /// </returns>
        public IEnumerable <ITestCaseData> Equality_TestCases()
        {
            const string standardLetters = "ACTGC";
            const string extendedLetters = "ACUGC";

            // Variants with the last two letters dropped, used for the "different sequence" cases.
            string shortenedStandard = standardLetters.Substring(0, standardLetters.Length - 2);
            string shortenedExtended = extendedLetters.Substring(0, extendedLetters.Length - 2);

            // All sequences are built before the first case is yielded.
            ProteinSequence standardInStandard  = new ProteinSequence(standardLetters, AlphabetType.StandardProtein, GeneticCode.Standard);
            ProteinSequence standardInExtended  = new ProteinSequence(standardLetters, AlphabetType.ExtendedProtein, GeneticCode.Standard);
            ProteinSequence extendedInExtended  = new ProteinSequence(extendedLetters, AlphabetType.ExtendedProtein, GeneticCode.Standard);
            ProteinSequence shorterInStandard   = new ProteinSequence(shortenedStandard, AlphabetType.StandardProtein, GeneticCode.Standard);
            ProteinSequence shorterInExtended   = new ProteinSequence(shortenedExtended, AlphabetType.ExtendedProtein, GeneticCode.Standard);

            var sameInstance = new TestCaseData(standardInStandard, standardInStandard);
            yield return sameInstance.Returns(true).SetName("Standard with standard");

            var differentAlphabet = new TestCaseData(standardInStandard, standardInExtended);
            yield return differentAlphabet.Returns(false).SetName("Standard with extended protein returns false");

            var differentStandard = new TestCaseData(standardInStandard, shorterInStandard);
            yield return differentStandard.Returns(false).SetName("Two different standard protein sequences return false");

            var differentExtended = new TestCaseData(extendedInExtended, shorterInExtended);
            yield return differentExtended.Returns(false).SetName("Two different extended protein sequences return false");
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads a fasta file and adds one <c>ProteinSequence</c> per header line to <c>entries</c>,
        /// keyed by the accession extracted from that header.
        /// </summary>
        /// <remarks>
        /// Lines before the first header (a line starting with '&gt;') are skipped. Every other
        /// non-header line is trimmed and appended to the sequence of the most recent header.
        /// Exceptions are not propagated: any failure is reported through
        /// <c>processInfo.ErrString</c>. Entries added before the failure remain in <c>entries</c>.
        /// </remarks>
        /// <param name="path">Path of the fasta file to read.</param>
        /// <param name="processInfo">Receives progress messages and, on failure, the error text.</param>
        public void ParseFile(string path, ProcessInfo processInfo)
        {
            processInfo.Status("Parsing " + path);
            string accession = "";
            int sequenceCounter = 0;
            StringBuilder sequence = new StringBuilder();
            ProteinSequence protein = new ProteinSequence();
            try{
                // The using block releases the file handle even when parsing throws part-way through
                // (for example when entries.Add rejects a key).
                using (StreamReader file = new StreamReader(path)){
                    string line;
                    while ((line = file.ReadLine()) != null){ // valid line
                        if (sequenceCounter%500 == 0){
                            processInfo.Status("Parsing " + path + ", " + (int) ((float) file.BaseStream.Position/file.BaseStream.Length*100) +
                                                "%");
                        }
                        // ordinal comparison: '>' is a format marker, not linguistic text
                        bool lineIsHeader = line.StartsWith(">", StringComparison.Ordinal);

                        if (!lineIsHeader){
                            // lines before the first header are skipped; later ones are sequence pieces
                            if (sequenceCounter > 0){
                                sequence.Append(line.Trim());
                            }
                            continue;
                        }

                        // line is a fasta header
                        if (sequenceCounter > 0){
                            // not the first header, i.e. the previous sequence is now completely read in
                            protein.SetSequence(sequence.ToString());
                            entries.Add(accession, protein);
                        }
                        // initialize a new protein
                        protein = new ProteinSequence();
                        sequenceCounter++;
                        // then parse the new header; fallback: take the entire header after the > as accession
                        Match m = regexUniprotAccession.Match(line);
                        accession = m.Success ? m.Groups[1].Value : line.Substring(1).Trim();
                        protein.Accession = accession;
                        protein.Header = line;
                        sequence = new StringBuilder();
                    } //end while
                }

                //add the last protein
                if (sequenceCounter > 0){ // make sure there is at least one sequence in the file
                    protein.SetSequence(sequence.ToString());
                    entries.Add(accession, protein);
                }
            } catch (Exception){
                processInfo.ErrString =
                    "Something went wrong while parsing the fasta file.\nMake sure the path is correct and the " +
                    "file is not opened in another application.\nMake sure the fasta file is valid.";
            }
        }
 /// <summary>
 /// Creates a pair that stores the given protein sequence together with the given protein match.
 /// </summary>
 /// <param name="proteinSequence">Value assigned to <c>ProteinSequence</c>.</param>
 /// <param name="proteinMatch">Value assigned to <c>ProteinMatch</c>.</param>
 public ProteinSequenceMatchDuo(ProteinSequence proteinSequence, ProteinMatch proteinMatch)
 {
     this.ProteinSequence = proteinSequence;
     this.ProteinMatch = proteinMatch;
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Checks that the tags exposed by a <c>ProteinSequence</c> are equivalent to the supplied tags.
        /// </summary>
        /// <param name="incoming">Sequence whose <c>Tags</c> are checked.</param>
        /// <param name="incomingTags">Tags the sequence is expected to carry; duplicates are collapsed.</param>
        public void Tag_Tests(ProteinSequence incoming, IEnumerable <string> incomingTags)
        {
            var expected = new HashSet <string>(incomingTags);

            // Expected goes first, actual second, so a failure message labels the collections correctly.
            CollectionAssert.AreEquivalent(expected, incoming.Tags);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds a <c>ProteinSequence</c> from the given text and alphabet using
        /// <c>GeneticCode.Standard</c>, and reports the length of the resulting sequence.
        /// </summary>
        /// <param name="sequence">Sequence text passed to the constructor.</param>
        /// <param name="alphabet">Alphabet passed to the constructor.</param>
        /// <returns>The <c>Length</c> of the constructed protein's <c>Sequence</c>.</returns>
        public int Constructor_Tests(string sequence, AlphabetType alphabet)
        {
            ProteinSequence constructed = new ProteinSequence(sequence, alphabet, GeneticCode.Standard);
            int length = constructed.Sequence.Length;

            return length;
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Reports whether <paramref name="a"/> considers itself equal to <paramref name="b"/>.
 /// </summary>
 /// <param name="a">Sequence on which <c>Equals</c> is invoked.</param>
 /// <param name="b">Sequence passed as the argument to <c>Equals</c>.</param>
 /// <returns>The result of <c>a.Equals(b)</c>.</returns>
 public bool EqualityTests(ProteinSequence a, ProteinSequence b)
 {
     bool areEqual = a.Equals(b);

     return areEqual;
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Annotates every row of <paramref name="mdata"/> with information looked up in a fasta file:
        /// header-derived text columns, numeric columns, theoretical peptide counts per protease and,
        /// if a regular expression is supplied, a sequence feature count.
        /// </summary>
        /// <remarks>
        /// The protein ids of a row are read from the configured string column and split on ';'.
        /// Errors are reported through <c>processInfo.ErrString</c> and end processing early: an invalid
        /// regular expression is detected before anything is parsed or added, and an error reported
        /// while parsing the fasta file stops processing before any column is added.
        /// </remarks>
        /// <param name="mdata">Matrix that is read for protein ids and receives the new columns.</param>
        /// <param name="param">User parameters selecting the id column, fasta file and annotations.</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Receives status messages and error text.</param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
            ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            // Validate the regular expression first so that an invalid pattern is rejected
            // before the fasta file is parsed and before mdata has been modified.
            var featureParam = param.GetParamWithSubParams<int>("Count sequence features");
            var featureSubParams = featureParam.GetSubParameters();
            string featurePattern = featureSubParams.GetParam<string>("Regex").Value;
            Regex regex = null;
            if (featurePattern != ""){
                try{
                    regex = new Regex(featurePattern);
                } catch (ArgumentException){
                    processInfo.ErrString = "The regular expression you provided has invalid syntax.";
                    return;
                }
            }
            int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
            string[][] proteinIds = new string[mdata.RowCount][];
            string[][] leadingIds = new string[mdata.RowCount][];
            for (int row = 0; row < mdata.RowCount; row++){
                proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
                // Split always yields at least one element, so index 0 is safe
                leadingIds[row] = new[]{proteinIds[row][0]};
            }
            string fastaFilePath = param.GetParam<string>("Fasta file").Value;
            Fasta fasta = new Fasta();
            fasta.ParseFile(fastaFilePath, processInfo);
            // NOTE(review): assumes ErrString is empty on entry, so a non-empty value means parsing failed
            if (!string.IsNullOrEmpty(processInfo.ErrString)){
                return;
            }
            // Text annotations
            processInfo.Status("Adding fasta header annotations.");
            var headerParam = param.GetParamWithSubParams<int>("Fasta header annotations");
            int[] selection = headerParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
            string[][] idsToBeAnnotated = headerParam.Value == 0 ? proteinIds : leadingIds;
            ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
            for (int row = 0; row < mdata.RowCount; row++){
                List<ProteinSequence> rowEntries = new List<ProteinSequence>();
                foreach (string id in idsToBeAnnotated[row]){
                    ProteinSequence entry = fasta.GetEntry(id);
                    if (entry != null){ // ids without a fasta entry are skipped
                        rowEntries.Add(entry);
                    }
                }
                fastaEntries[row] = rowEntries.ToArray();
            }
            if (ArrayUtils.Contains(selection, 0)){ // Entry name
                AddTextAnnotationColumn(mdata, fastaEntries, "Entry name", entry => entry.EntryName);
            }
            if (ArrayUtils.Contains(selection, 1)){ // Gene name
                AddTextAnnotationColumn(mdata, fastaEntries, "Gene name", entry => entry.GeneName);
            }
            if (ArrayUtils.Contains(selection, 2)){
                // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
                //'Isoform x of...' prefixes and '(Fragment)' suffixes
                AddTextAnnotationColumn(mdata, fastaEntries, "Protein name (verbose)", entry => entry.ProteinName);
            }
            if (ArrayUtils.Contains(selection, 3)){ // Consensus protein name
                AddTextAnnotationColumn(mdata, fastaEntries, "Protein name", entry => entry.ConsensusProteinName);
            }
            if (ArrayUtils.Contains(selection, 4)){ // Species
                AddTextAnnotationColumn(mdata, fastaEntries, "Species", entry => entry.Species);
            }
            // Numeric annotations
            processInfo.Status("Adding numeric annotations.");
            var numericParam = param.GetParamWithSubParams<int>("Numeric annotations");
            selection = numericParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
            bool annotateLeadingId = numericParam.Value == 1;
            if (ArrayUtils.Contains(selection, 0)){ // Sequence length
                AddNumericAnnotationColumn(mdata, fastaEntries, "Sequence length", annotateLeadingId,
                    entry => entry.GetSequence().Length);
            }
            if (ArrayUtils.Contains(selection, 1)){ // Monoisotopic molecular mass
                AddNumericAnnotationColumn(mdata, fastaEntries, "Monoisotopic molecular mass", annotateLeadingId,
                    entry => entry.GetMonoisotopicMolecularMass());
            }
            if (ArrayUtils.Contains(selection, 2)){ // Average molecular mass
                AddNumericAnnotationColumn(mdata, fastaEntries, "Average molecular mass", annotateLeadingId,
                    entry => entry.GetAverageMolecularMass());
            }
            // Theoretical peptides
            processInfo.Status("Calculating theoretical peptides.");
            var peptideParam = param.GetParamWithSubParams<int>("Calculate theoretical peptides");
            var peptideSubParams = peptideParam.GetSubParameters();
            annotateLeadingId = peptideParam.Value == 1;
            Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
                peptideSubParams.GetParam<int[]>("Proteases").Value);
            double minLength = peptideSubParams.GetParam<double>("Min. peptide length").Value;
            double maxLength = peptideSubParams.GetParam<double>("Max. peptide length").Value;
            // "Show sequences" is only looked up when a single (leading) id is annotated
            bool displayPeptideSequences = annotateLeadingId && peptideSubParams.GetParam<bool>("Show sequences").Value;
            int minPeptideLength = (int) minLength;
            int maxPeptideLength = (int) maxLength;
            foreach (Protease protease in proteases){
                double[] annotationColumn = new double[mdata.RowCount];
                string[] peptideColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<double> rowAnnotations = new List<double>();
                    List<string> rowPeptides = new List<string>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        double nTheoreticalPeptides = entry.GetNumberOfTheoreticalPeptides(protease, minPeptideLength,
                            maxPeptideLength);
                        rowAnnotations.Add(nTheoreticalPeptides);
                        if (displayPeptideSequences){
                            rowPeptides.AddRange(entry.GetTheoreticalPeptideSequences(protease, minPeptideLength,
                                maxPeptideLength));
                        }
                        if (annotateLeadingId){
                            break;
                        }
                    }
                    annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
                    peptideColumn[row] = String.Join(";", rowPeptides);
                }
                mdata.AddNumericColumn(
                    "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "", annotationColumn);
                if (displayPeptideSequences){
                    mdata.AddStringColumn(
                        "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "", peptideColumn);
                }
            }
            // Sequence features
            processInfo.Status("Counting sequence features.");
            annotateLeadingId = featureParam.Value == 1;
            bool normalizeBySequenceLength = featureSubParams.GetParam<bool>("Normalize by sequence length").Value;
            if (regex != null){ // null means no regular expression was supplied
                AddNumericAnnotationColumn(mdata, fastaEntries,
                    (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")",
                    annotateLeadingId,
                    entry => {
                        double nFeatures = regex.Matches(entry.GetSequence()).Count;
                        return normalizeBySequenceLength ? nFeatures/entry.GetLength() : nFeatures;
                    });
            }
            processInfo.Status("Done.");
        }

        /// <summary>
        /// Adds a string column whose value per row is the ';'-joined list of distinct, non-null
        /// annotations of that row's fasta entries, in order of first occurrence.
        /// </summary>
        private static void AddTextAnnotationColumn(IMatrixData mdata, ProteinSequence[][] fastaEntries, string columnName,
            Func<ProteinSequence, string> getAnnotation)
        {
            string[] annotationColumn = new string[mdata.RowCount];
            for (int row = 0; row < mdata.RowCount; row++){
                List<string> rowAnnotations = new List<string>();
                foreach (ProteinSequence entry in fastaEntries[row]){
                    string annotation = getAnnotation(entry);
                    if (annotation != null && !ArrayUtils.Contains(rowAnnotations, annotation)){
                        rowAnnotations.Add(annotation);
                    }
                }
                annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
            }
            mdata.AddStringColumn(columnName, "", annotationColumn);
        }

        /// <summary>
        /// Adds a numeric column whose value per row is the median of the values computed for that
        /// row's fasta entries; with <paramref name="firstEntryOnly"/> only the first entry is used.
        /// </summary>
        private static void AddNumericAnnotationColumn(IMatrixData mdata, ProteinSequence[][] fastaEntries, string columnName,
            bool firstEntryOnly, Func<ProteinSequence, double> getValue)
        {
            double[] annotationColumn = new double[mdata.RowCount];
            for (int row = 0; row < mdata.RowCount; row++){
                List<double> rowValues = new List<double>();
                foreach (ProteinSequence entry in fastaEntries[row]){
                    rowValues.Add(getValue(entry));
                    if (firstEntryOnly){
                        break;
                    }
                }
                annotationColumn[row] = ArrayUtils.Median(rowValues.ToArray());
            }
            mdata.AddNumericColumn(columnName, "", annotationColumn);
        }