コード例 #1
0
        // Collapses a four-row matrix on its first string column ("id": a, b, b, b) via UniqueRows
        // and verifies that two rows remain and that every column type was merged as expected.
        public void SmallTest()
        {
            double[,] mainValues =
            {
                { 0, 4 },
                { 1, 5 },
                { 2, 6 },
                { 3, 7 }
            };
            string[] ids = { "a", "b", "b", "b" };
            string[] texts = { "a;b", "b;c", "c;d", "d;e" };
            string[][] categories =
            {
                new[] { "a", "b" },
                new[] { "b", "c" },
                new[] { "c", "d" },
                new[] { "d", "e" }
            };
            // NOTE(review): five values are supplied for a four-row matrix - confirm this is intended.
            double[] numbers = { 0, 1, 2, 3, 4.0 };
            double[][] multiNumbers =
            {
                new[] { 0, 4d },
                new[] { 1, 5d },
                new[] { 2, 6d },
                new[] { 3, 7d }
            };

            IMatrixData mdata = PerseusFactory.CreateMatrixData(mainValues);
            mdata.AddStringColumn("id", "", ids);
            mdata.AddStringColumn("str", "", texts);
            mdata.AddCategoryColumn("cat", "", categories);
            mdata.AddNumericColumn("num", "", numbers);
            mdata.AddMultiNumericColumn("mnum", "", multiNumbers);

            mdata.UniqueRows(mdata.StringColumns[0], ArrayUtils.Median, UniqueRows.Union, UniqueRows.CatUnion, UniqueRows.MultiNumUnion);

            // Expected results; the integer-typed arrays are kept as in the original assertions.
            int[] expectedMain0 = { 0, 2 };
            int[] expectedMain1 = { 4, 6 };
            string[] expectedTexts = { "a;b", "b;c;d;e" };
            string[][] expectedCategories =
            {
                new[] { "a", "b" },
                new[] { "b", "c", "d", "e" }
            };
            int[] expectedNumbers = { 0, 2 };
            double[][] expectedMultiNumbers =
            {
                new[] { 0d, 4 },
                new[] { 1d, 5, 2, 6, 3, 7 }
            };

            Assert.AreEqual(2, mdata.RowCount);
            CollectionAssert.AreEqual(expectedMain0, mdata.Values.GetColumn(0));
            CollectionAssert.AreEqual(expectedMain1, mdata.Values.GetColumn(1));
            CollectionAssert.AreEqual(expectedTexts, mdata.GetStringColumn("str"));
            CollectionAssert.AreEqual(expectedCategories, mdata.GetCategoryColumnAt(0));
            CollectionAssert.AreEqual(expectedNumbers, mdata.NumericColumns[0]);
            CollectionAssert.AreEqual(expectedMultiNumbers, mdata.MultiNumericColumns[0]);
        }
コード例 #2
0
        /// <summary>
        /// Adds two text columns, "PhosphoSitePlus kinase" and "PhosphoSitePlus kinase uniprot",
        /// listing the kinases of the parsed kinase-substrate table that match each row.
        /// </summary>
        /// <remarks>
        /// A table entry matches a row when its substrate accession is one of the row's
        /// semicolon-separated "Uniprot column" identifiers and <c>AddKnownSites.Contains</c> accepts
        /// the entry's sequence window (upper-cased, first and last character removed) against the
        /// row's semicolon-separated "Sequence window" entries. Rows without a match receive empty
        /// strings. If the parser returns no sequence windows, <c>processInfo.ErrString</c> is set and
        /// the matrix is left unchanged.
        /// </remarks>
        /// <param name="mdata">Matrix that is read and that receives the two new text columns.</param>
        /// <param name="param">Supplies the "Uniprot column" and "Sequence window" column indices.</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Receives the error message when the table could not be parsed.</param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            // The species output is not needed here, so it is discarded.
            PhosphoSitePlusParser.ParseKinaseSubstrate(out string[] seqWins, out string[] subAccs, out string[] kinases, out string[] kinAccs, out string[] _);
            if (seqWins == null)
            {
                processInfo.ErrString = "File does not exist.";
                return;
            }
            string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
            string[][] uprot = new string[up.Length][];
            for (int i = 0; i < up.Length; i++)
            {
                uprot[i] = up[i].Length > 0 ? up[i].Split(';') : new string[0];
            }
            string[] win = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];

            // Index the table by substrate accession: (sequence window, kinase name, kinase accession).
            Dictionary<string, List<Tuple<string, string, string>>> substrateProperties =
                new Dictionary<string, List<Tuple<string, string, string>>>();
            for (int i = 0; i < seqWins.Length; i++)
            {
                string subAcc = subAccs[i];
                if (!substrateProperties.TryGetValue(subAcc, out List<Tuple<string, string, string>> entries))
                {
                    entries = new List<Tuple<string, string, string>>();
                    substrateProperties.Add(subAcc, entries);
                }
                entries.Add(new Tuple<string, string, string>(seqWins[i], kinases[i], kinAccs[i]));
            }

            string[] kinaseNameColumn = new string[uprot.Length];
            string[] kinaseUniprotColumn = new string[uprot.Length];
            for (int i = 0; i < kinaseNameColumn.Length; i++)
            {
                string[] win1 = AddKnownSites.TransformIl(win[i]).Split(';');
                HashSet<string> kinaseNamesHits = new HashSet<string>();
                HashSet<string> kinaseUniprotHits = new HashSet<string>();
                foreach (string ux in uprot[i])
                {
                    if (!substrateProperties.TryGetValue(ux, out List<Tuple<string, string, string>> properties))
                    {
                        continue;
                    }
                    foreach (Tuple<string, string, string> property in properties)
                    {
                        string w = property.Item1;
                        if (w.Length < 2)
                        {
                            // Too short to strip the first and last character; Substring would throw.
                            continue;
                        }
                        // Invariant upper-casing keeps the residue letters independent of the UI culture.
                        string core = w.ToUpperInvariant().Substring(1, w.Length - 2);
                        if (AddKnownSites.Contains(win1, AddKnownSites.TransformIl(core)))
                        {
                            kinaseNamesHits.Add(property.Item2);
                            kinaseUniprotHits.Add(property.Item3);
                        }
                    }
                }
                kinaseNameColumn[i] = kinaseNamesHits.Count > 0
                    ? StringUtils.Concat(";", ArrayUtils.ToArray(kinaseNamesHits))
                    : "";
                kinaseUniprotColumn[i] = kinaseUniprotHits.Count > 0
                    ? StringUtils.Concat(";", ArrayUtils.ToArray(kinaseUniprotHits))
                    : "";
            }
            mdata.AddStringColumn("PhosphoSitePlus kinase", "", kinaseNameColumn);
            mdata.AddStringColumn("PhosphoSitePlus kinase uniprot", "", kinaseUniprotColumn);
        }
コード例 #3
0
        // Builds the fixtures used by the tests: a one-row peptide table that is run through
        // ExpandMultiNumeric, a two-row protein table, a hand-written two-row "expand" table,
        // and the MatchingRowsByName parameters for (expand, proteinMain).
        public void TestInitialize()
        {
            // Peptide table: a single row whose protein-group entry is "13;21".
            float[,] peptideMain = { { 9.0f } };
            peptides = PerseusFactory.CreateMatrixData(peptideMain, new List<string> { "pep_MS/MS Count" });
            peptides.AddNumericColumn("pep_Intensity", "", new[] { 0.0 });
            peptides.AddStringColumn("pep_id", "", new[] { "35" });
            peptides.AddStringColumn("pep_Protein group IDs", "", new[] { "13;21" });
            peptides.Quality.Init(1, 1);
            peptides.Quality.Set(0, 0, 1);

            // Run ExpandMultiNumeric on the peptide table with text column index 1 selected.
            var expander = new ExpandMultiNumeric();
            string expandError = string.Empty;
            var expandParameters = expander.GetParameters(peptides, ref expandError);
            expandParameters.GetParam<int[]>("Text columns").Value = new[] { 1 };
            IMatrixData[] supplTables = null;
            IDocumentData[] documents = null;
            expander.ProcessData(peptides, expandParameters, ref supplTables, ref documents, CreateProcessInfo());

            // Protein table: two rows with ids "13" and "21".
            float[,] proteinValues =
            {
                { 166250000.0f },
                { 8346000.0f }
            };
            proteinMain = PerseusFactory.CreateMatrixData(proteinValues, new List<string> { "prot_LFQ intensity" });
            proteinMain.Name = "protein main";
            proteinMain.AddStringColumn("prot_id", "", new[] { "13", "21" });
            proteinMain.AddStringColumn("prot_gene name", "", new[] { "geneA", "geneB" });

            // Hand-written two-row table with one protein group id per row.
            float[,] expandedValues =
            {
                { 9.0f },
                { 9.0f }
            };
            expand = PerseusFactory.CreateMatrixData(expandedValues, new List<string> { "pep_MS/MS Count" });
            expand.Name = "expand";
            expand.AddNumericColumn("pep_Intensity", "", new[] { 0.0, 0.0 });
            expand.AddStringColumn("pep_id", "", new[] { "35", "35" });
            expand.AddStringColumn("pep_Protein group IDs", "", new[] { "13", "21" });

            matching = new MatchingRowsByName();
            string matchingError = string.Empty;
            parameters = matching.GetParameters(new[] { expand, proteinMain }, ref matchingError);
        }
コード例 #4
0
        // Writes a matrix with annotation rows and columns of every kind through
        // PerseusUtils.WriteMatrix, reads the text back with PerseusUtils.ReadMatrix,
        // and checks the dimensions, the column/row counts and one string cell.
        public void WriteMatrixTest()
        {
            // Main data: 2 rows x 3 columns.
            double[,] mainValues = { { 1, 2, 3 }, { 3, 4, 5 } };
            List<string> columnNames = new List<string> { "col1", "col2", "col3" };
            IMatrixData original = PerseusFactory.CreateMatrixData(mainValues, columnNames);

            // Annotation rows (one entry per main column).
            string[][] categoryRow = { new[] { "cat1" }, new[] { "cat1", "cat2" }, new[] { "cat2" } };
            original.AddCategoryRow("catrow", "this is catrow", categoryRow);
            original.AddNumericRow("numrow", "this is numrow", new[] { -1.0, 1, 2 });

            // Annotation columns (one entry per row).
            original.AddStringColumn("strcol1", "this is stringcol1", new[] { "1", "2" });
            original.AddStringColumn("strcol2", "", new[] { "", "hallo" });
            original.AddNumericColumn("numcol", "", new[] { 1.0, 2.0 });
            double[][] multiNumericColumn = { new[] { -2.0, 2.0 }, new double[] {} };
            original.AddMultiNumericColumn("multnumcol", "this is multnumcol", multiNumericColumn);
            string[][] categoryColumn = { new[] { "cat1", "cat1.1" }, new[] { "cat2", "cat1" } };
            original.AddCategoryColumn("catcol", "", categoryColumn);

            // Serialize into an in-memory buffer and keep the text.
            string serialized;
            using (MemoryStream buffer = new MemoryStream())
            {
                using (StreamWriter writer = new StreamWriter(buffer))
                {
                    PerseusUtils.WriteMatrix(original, writer);
                    writer.Flush();
                    serialized = Encoding.UTF8.GetString(buffer.ToArray());
                }
            }

            // Parse the text back into a fresh matrix.
            IMatrixData restored = PerseusFactory.CreateMatrixData();
            ProcessInfo readInfo = new ProcessInfo(new Settings(), status => { }, progress => { }, 1);
            PerseusUtils.ReadMatrix(restored, readInfo,
                                    () => new StreamReader(new MemoryStream(Encoding.UTF8.GetBytes(serialized))),
                                    "matrix1", '\t');

            Assert.AreEqual(2, restored.RowCount);
            Assert.AreEqual(3, restored.ColumnCount);

            Assert.AreEqual(2, restored.StringColumnCount);
            Assert.AreEqual(1, restored.NumericColumnCount);
            Assert.AreEqual(1, restored.CategoryColumnCount);
            Assert.AreEqual(1, restored.MultiNumericColumnCount);

            int strcol2Index = restored.StringColumnNames.FindIndex(col => col.Equals("strcol2"));
            Assert.AreEqual("hallo", restored.StringColumns[strcol2Index][1]);

            Assert.AreEqual(1, restored.CategoryRowCount);
            Assert.AreEqual(1, restored.NumericRowCount);
        }
コード例 #5
0
        /// <summary>
        /// Appends the category, text and numeric annotation columns produced by
        /// <c>ProcessDataAddAnnotation</c> to <paramref name="mdata"/>.
        /// </summary>
        /// <remarks>
        /// Each new column is named by looking up its index array entry in the returned name array.
        /// Nothing is added when <c>ProcessDataAddAnnotation</c> reports failure.
        /// </remarks>
        /// <param name="mdata">Matrix that supplies the base identifiers and receives the new columns.</param>
        /// <param name="para">Parameters forwarded to <c>GetBaseIds</c> and <c>ProcessDataAddAnnotation</c>.</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Forwarded to <c>ProcessDataAddAnnotation</c>.</param>
        public void ProcessData(IMatrixData mdata, Parameters para, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string[] baseIds = GetBaseIds(para, mdata);
            bool succeeded = ProcessDataAddAnnotation(
                mdata.RowCount, para, baseIds, processInfo,
                out string[] name,
                out int[] catColInds, out int[] textColInds, out int[] numColInds,
                out string[][][] catCols, out string[][] textCols, out double[][] numCols);
            if (!succeeded)
            {
                return;
            }

            for (int c = 0; c < catCols.Length; c++)
            {
                string columnName = name[catColInds[c]];
                mdata.AddCategoryColumn(columnName, "", catCols[c]);
            }
            for (int t = 0; t < textCols.Length; t++)
            {
                string columnName = name[textColInds[t]];
                mdata.AddStringColumn(columnName, "", textCols[t]);
            }
            for (int n = 0; n < numCols.Length; n++)
            {
                string columnName = name[numColInds[n]];
                mdata.AddNumericColumn(columnName, "", numCols[n]);
            }
        }
コード例 #6
0
 // Creates the shared fixture: a 3x3 matrix whose first and third rows are identical,
 // plus a "test" string column labelling the rows "a", "b", "a".
 public void Setup()
 {
     double[,] values =
     {
         { 0.0, 1.0, 0.0 },
         { 0.0, 0.0, 0.0 },
         { 0.0, 1.0, 0.0 }
     };
     string[] labels = { "a", "b", "a" };

     _mdata = PerseusFactory.CreateMatrixData(values);
     _mdata.AddStringColumn("test", "", labels);
 }
コード例 #7
0
        /// <summary>
        /// Adds a text column that maps the identifiers of the selected column through the mapping
        /// read from the selected annotation source.
        /// </summary>
        /// <remarks>
        /// Each cell is split on ';', every part is lower-cased and looked up in the mapping, and the
        /// unique mapped values are joined with ';'. The new column uses the source's base name as
        /// both name and description.
        /// </remarks>
        /// <param name="mdata">Matrix that supplies the identifiers and receives the new column.</param>
        /// <param name="para">Supplies "Source" and its sub-parameters "Identifiers" and "Identifier type".</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Not used by this method.</param>
        public void ProcessData(IMatrixData mdata, Parameters para, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            GetAvailableTextAnnots(out string[] baseNames, out int[][] inds, out string[] files);
            ParameterWithSubParams<int> source = para.GetParamWithSubParams<int>("Source");
            int sourceIndex = source.Value;
            Parameters sourceParams = source.GetSubParameters();
            int identifierColumn = sourceParams.GetParam<int>("Identifiers").Value;
            int identifierType = sourceParams.GetParam<int>("Identifier type").Value;
            Dictionary<string, string[]> mapping = ReadMapping(
                GetAllIds(mdata, identifierColumn), files[sourceIndex], inds[sourceIndex][identifierType]);

            string[] identifiers = mdata.StringColumns[identifierColumn];
            string[] mapped = new string[identifiers.Length];
            for (int row = 0; row < identifiers.Length; row++)
            {
                List<string> hits = new List<string>();
                string cell = identifiers[row];
                // An empty cell has no identifiers to look up.
                if (cell.Length > 0)
                {
                    foreach (string part in cell.Split(';'))
                    {
                        if (mapping.TryGetValue(part.ToLower(), out string[] targets))
                        {
                            hits.AddRange(targets);
                        }
                    }
                }
                string[] unique = ArrayUtils.UniqueValues(hits);
                mapped[row] = StringUtils.Concat(";", unique);
            }
            mdata.AddStringColumn(baseNames[sourceIndex], baseNames[sourceIndex], mapped);
        }
コード例 #8
0
        /// <summary>
        /// Adds a "Short sequence window" text column holding a fixed-length substring of every
        /// entry of the selected "Sequence window" column.
        /// </summary>
        /// <remarks>
        /// "Start" is read as a 1-based position and converted to a 0-based offset. All rows are
        /// validated before any substring is taken, so a negative length or a row that is too short
        /// sets <c>processInfo.ErrString</c> and leaves the matrix unchanged instead of throwing.
        /// A matrix without rows receives an empty column.
        /// </remarks>
        /// <param name="mdata">Matrix that supplies the sequence windows and receives the new column.</param>
        /// <param name="param">Supplies "Sequence window", "Start" and "Length".</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Receives the error message when the parameters do not fit the data.</param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int stringColumnIndx = param.GetParam<int>("Sequence window").Value;

            string[] win = mdata.StringColumns[stringColumnIndx];
            int start = param.GetParam<int>("Start").Value - 1;
            int length = param.GetParam<int>("Length").Value;

            if (start < 0)
            {
                processInfo.ErrString = "Start position cannot be smaller than 1.";
                return;
            }
            if (length < 0)
            {
                processInfo.ErrString = "Length cannot be negative.";
                return;
            }
            // Check every row, not only the first one: windows may differ in length.
            foreach (string window in win)
            {
                // Written as a subtraction so that start + length cannot overflow.
                if (window == null || length > window.Length - start)
                {
                    processInfo.ErrString = "Start + length cannot exceed the total length of the sequence.";
                    return;
                }
            }
            string[] shortenedMotifs = new string[win.Length];
            for (int i = 0; i < win.Length; ++i)
            {
                shortenedMotifs[i] = win[i].Substring(start, length);
            }
            mdata.AddStringColumn("Short sequence window", "", shortenedMotifs);
        }
コード例 #9
0
 /// <summary>
 /// Duplicates the selected main, numeric, multi-numeric, categorical and text columns of
 /// <paramref name="data"/>, giving every copy a name obtained from
 /// <c>StringUtils.GetNextAvailableName</c>.
 /// </summary>
 /// <remarks>
 /// Main columns are duplicated by re-extracting all existing columns followed by the selected
 /// ones and then renaming the appended copies. Annotation columns are appended with their
 /// original description. Numeric, multi-numeric and text copies get their own arrays, so later
 /// in-place edits of a copy do not affect the original column.
 /// </remarks>
 /// <param name="data">Matrix whose columns are duplicated in place.</param>
 /// <param name="param">Supplies the index selections "Main columns", "Numerical columns",
 /// "Multi-numerical columns", "Categorical columns" and "Text columns".</param>
 /// <param name="supplTables">Not used by this method.</param>
 /// <param name="documents">Not used by this method.</param>
 /// <param name="processInfo">Not used by this method.</param>
 public void ProcessData(IMatrixData data, Parameters param, ref IMatrixData[] supplTables,
                         ref IDocumentData[] documents, ProcessInfo processInfo)
 {
     int[] exColInds = param.GetParam<int[]>("Main columns").Value;
     int[] numColInds = param.GetParam<int[]>("Numerical columns").Value;
     int[] multiNumColInds = param.GetParam<int[]>("Multi-numerical columns").Value;
     int[] catColInds = param.GetParam<int[]>("Categorical columns").Value;
     int[] textColInds = param.GetParam<int[]>("Text columns").Value;
     if (exColInds.Length > 0)
     {
         int ncol = data.ColumnCount;
         // Keep all existing columns and append copies of the selected ones.
         data.ExtractColumns(ArrayUtils.Concat(ArrayUtils.ConsecutiveInts(data.ColumnCount), exColInds));
         HashSet<string> taken = new HashSet<string>(data.ColumnNames);
         for (int i = 0; i < exColInds.Length; i++)
         {
             string s = StringUtils.GetNextAvailableName(data.ColumnNames[ncol + i], taken);
             data.ColumnNames[ncol + i] = s;
             taken.Add(s);
         }
     }

     // Each name set is built once and extended as copies are added.
     HashSet<string> takenNumeric = new HashSet<string>(data.NumericColumnNames);
     foreach (int ind in numColInds)
     {
         string s = StringUtils.GetNextAvailableName(data.NumericColumnNames[ind], takenNumeric);
         data.AddNumericColumn(s, data.NumericColumnDescriptions[ind], (double[])data.NumericColumns[ind].Clone());
         takenNumeric.Add(s);
     }

     HashSet<string> takenMultiNumeric = new HashSet<string>(data.MultiNumericColumnNames);
     foreach (int ind in multiNumColInds)
     {
         string s = StringUtils.GetNextAvailableName(data.MultiNumericColumnNames[ind], takenMultiNumeric);
         // Array.Clone on a jagged array only copies the outer array, so copy each row as well.
         double[][] source = data.MultiNumericColumns[ind];
         double[][] copy = new double[source.Length][];
         for (int row = 0; row < source.Length; row++)
         {
             double[] cell = source[row];
             copy[row] = cell == null ? null : (double[])cell.Clone();
         }
         data.AddMultiNumericColumn(s, data.MultiNumericColumnDescriptions[ind], copy);
         takenMultiNumeric.Add(s);
     }

     HashSet<string> takenCategory = new HashSet<string>(data.CategoryColumnNames);
     foreach (int ind in catColInds)
     {
         string s = StringUtils.GetNextAvailableName(data.CategoryColumnNames[ind], takenCategory);
         data.AddCategoryColumn(s, data.CategoryColumnDescriptions[ind], data.GetCategoryColumnAt(ind));
         takenCategory.Add(s);
     }

     HashSet<string> takenText = new HashSet<string>(data.StringColumnNames);
     foreach (int ind in textColInds)
     {
         string s = StringUtils.GetNextAvailableName(data.StringColumnNames[ind], takenText);
         // The description must come from the text columns, not from the main columns.
         data.AddStringColumn(s, data.StringColumnDescriptions[ind], (string[])data.StringColumns[ind].Clone());
         takenText.Add(s);
     }
 }
コード例 #10
0
        /// <summary>
        /// Adds a text column obtained by passing the selected identifier column through the
        /// annotation provider's <c>MapToBaseIdentifiers</c>.
        /// </summary>
        /// <remarks>
        /// The second component of the selected entry of <c>TextSources()</c> is used as both the
        /// name and the description of the new column.
        /// </remarks>
        /// <param name="mdata">Matrix that supplies the identifiers and receives the new column.</param>
        /// <param name="para">Supplies "Source" and its sub-parameters "Identifiers" and "Identifier type".</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Not used by this method.</param>
        public void ProcessData(IMatrixData mdata, Parameters para, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            var provider = _annotationProvider;

            ParameterWithSubParams<int> source = para.GetParamWithSubParams<int>("Source");
            int sourceIndex = source.Value;
            Parameters sourceParams = source.GetSubParameters();
            int identifierColumn = sourceParams.GetParam<int>("Identifiers").Value;
            int identifierType = sourceParams.GetParam<int>("Identifier type").Value;

            var (_, columnName, _) = provider.TextSources()[sourceIndex];
            string[] identifiers = mdata.StringColumns[identifierColumn];
            var mapped = provider.MapToBaseIdentifiers(identifiers, sourceIndex, identifierType);

            mdata.AddStringColumn(columnName, columnName, mapped);
        }
コード例 #11
0
        /// <summary>
        /// Upper-cases every entry of the selected text columns.
        /// </summary>
        /// <remarks>
        /// When "Keep original columns" is set, the upper-cased values are appended as a new column
        /// carrying the original column's name and description; otherwise the original column is
        /// replaced in place.
        /// </remarks>
        /// <param name="mdata">Matrix whose text columns are converted.</param>
        /// <param name="param">Supplies "Columns" and "Keep original columns".</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Not used by this method.</param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int[] selected = param.GetParam<int[]>("Columns").Value;
            bool keepOriginal = param.GetParam<bool>("Keep original columns").Value;

            foreach (int column in selected)
            {
                string[] source = mdata.StringColumns[column];
                string[] upperCased = new string[source.Length];
                for (int row = 0; row < source.Length; row++)
                {
                    upperCased[row] = source[row].ToUpper();
                }

                if (!keepOriginal)
                {
                    mdata.StringColumns[column] = upperCased;
                    continue;
                }
                mdata.AddStringColumn(mdata.StringColumnNames[column], mdata.StringColumnDescriptions[column], upperCased);
            }
        }
コード例 #12
0
        /// <summary>
        /// Adds a "Short sequence window" text column holding a fixed-length substring of every
        /// entry of the selected "Sequence window" column.
        /// </summary>
        /// <remarks>
        /// "Start" is read as a 1-based position and converted to a 0-based offset. All rows are
        /// validated before any substring is taken, so a negative length or a row that is too short
        /// sets <c>processInfo.ErrString</c> and leaves the matrix unchanged instead of throwing.
        /// A matrix without rows receives an empty column.
        /// </remarks>
        /// <param name="mdata">Matrix that supplies the sequence windows and receives the new column.</param>
        /// <param name="param">Supplies "Sequence window", "Start" and "Length".</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Receives the error message when the parameters do not fit the data.</param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int stringColumnIndx = param.GetParam<int>("Sequence window").Value;
            string[] win = mdata.StringColumns[stringColumnIndx];
            int start = param.GetParam<int>("Start").Value - 1;
            int length = param.GetParam<int>("Length").Value;
            if (start < 0){
                processInfo.ErrString = "Start position cannot be smaller than 1.";
                return;
            }
            if (length < 0){
                processInfo.ErrString = "Length cannot be negative.";
                return;
            }
            // Check every row, not only the first one: windows may differ in length.
            foreach (string window in win){
                // Written as a subtraction so that start + length cannot overflow.
                if (window == null || length > window.Length - start){
                    processInfo.ErrString = "Start + length cannot exceed the total length of the sequence.";
                    return;
                }
            }
            string[] shortenedMotifs = new string[win.Length];
            for (int i = 0; i < win.Length; ++i){
                shortenedMotifs[i] = win[i].Substring(start, length);
            }
            mdata.AddStringColumn("Short sequence window", "", shortenedMotifs);
        }
コード例 #13
0
        // Runs ExpandSiteTable on a 2-row matrix with main columns "Col___1".."Col___3" and
        // "No expand", then checks the resulting main columns, descriptions, row count and the
        // repeated string / multi-numeric annotation columns.
        public void TestSmallExample()
        {
            double[,] mainValues =
            {
                { 0.0, 1.0, 0, 5 },
                { 2.0, 3.0, 0, 5 }
            };
            List<string> columnNames = new List<string> { "Col___1", "Col___2", "Col___3", "No expand" };
            IMatrixData mdata = PerseusFactory.CreateMatrixData(mainValues, columnNames);
            mdata.ColumnDescriptions = new List<string> { "Description Col", "Col", "Col", "Description No expand" };

            double[][] multiNum =
            {
                new[] { 0.0, 1.0 },
                new[] { 2.0 }
            };
            mdata.AddMultiNumericColumn("MultiNum", "", multiNum);
            string[] stringCol = { "row1", "row2" };
            mdata.AddStringColumn("String", "", stringCol);

            ExpandSiteTable expander = new ExpandSiteTable();
            IMatrixData[] supplTables = null;
            IDocumentData[] documents = null;
            expander.ProcessData(mdata, new Parameters(), ref supplTables, ref documents, CreateProcessInfo());

            // Main columns.
            Assert.AreEqual(2, mdata.ColumnCount);
            string[] expectedNames = { "No expand", "Col" };
            CollectionAssert.AreEqual(expectedNames, mdata.ColumnNames.ToArray());
            Assert.AreEqual(2, mdata.ColumnDescriptions.Count);
            string[] expectedDescriptions = { "Description No expand", "Description Col" };
            CollectionAssert.AreEqual(expectedDescriptions, mdata.ColumnDescriptions.ToArray());

            // Rows and annotation columns: the original annotations are expected three times over.
            Assert.AreEqual(6, mdata.RowCount);
            Assert.AreEqual(2, mdata.StringColumnCount);
            string[] expectedStringNames = { "String", "Unique identifier" };
            CollectionAssert.AreEqual(expectedStringNames, mdata.StringColumnNames);
            string[] expectedStrings = stringCol.Concat(stringCol).Concat(stringCol).ToArray();
            CollectionAssert.AreEqual(expectedStrings, mdata.StringColumns[0]);
            Assert.AreEqual(1, mdata.MultiNumericColumnCount);
            double[][] expectedMultiNum = multiNum.Concat(multiNum).Concat(multiNum).ToArray();
            CollectionAssert.AreEqual(expectedMultiNum, mdata.MultiNumericColumns[0]);
        }
コード例 #14
0
        /// <summary>
        /// Annotates every row with the known modification sites of the selected modification:
        /// a "PhosphoSitePlus window" text column plus the "Known site" and "Origin" category columns.
        /// </summary>
        /// <remarks>
        /// A table entry matches a row when its accession is one of the row's semicolon-separated
        /// "Uniprot column" identifiers and <c>Contains</c> accepts the entry's sequence window
        /// (upper-cased, first and last character removed) against the row's semicolon-separated
        /// "Sequence window" entries. Matching rows get the joined windows, "+" as known site, and
        /// the sorted origins "LTP", "HTP" and/or "CST" depending on which of the three evidence
        /// fields of a matching entry are non-empty. Other rows get empty values. If the parser
        /// returns no sequence windows, <c>processInfo.ErrString</c> is set and the matrix is left
        /// unchanged.
        /// </remarks>
        /// <param name="mdata">Matrix that is read and that receives the three new columns.</param>
        /// <param name="param">Supplies "Modification", "Uniprot column" and "Sequence window".</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Receives the error message when the table could not be parsed.</param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string mod = param.GetParam<int>("Modification").StringValue;

            // The species output is not needed here, so it is discarded.
            PhosphoSitePlusParser.ParseKnownMod(mod, out string[] seqWins, out string[] accs, out string[] pubmedLtp, out string[] pubmedMs2, out string[] cstMs2, out string[] _);
            if (seqWins == null)
            {
                processInfo.ErrString = "File does not exist.";
                return;
            }
            string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
            string[][] uprot = new string[up.Length][];
            for (int i = 0; i < up.Length; i++)
            {
                uprot[i] = up[i].Length > 0 ? up[i].Split(';') : new string[0];
            }
            string[] win = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];

            // Index the table: accession -> positions of its entries in the parsed arrays.
            Dictionary<string, List<int>> map = new Dictionary<string, List<int>>();
            for (int i = 0; i < seqWins.Length; i++)
            {
                string acc = accs[i];
                if (!map.TryGetValue(acc, out List<int> positions))
                {
                    positions = new List<int>();
                    map.Add(acc, positions);
                }
                positions.Add(i);
            }

            string[] newCol = new string[uprot.Length];
            string[][] newCatCol = new string[uprot.Length][];
            string[][] originCol = new string[uprot.Length][];
            for (int i = 0; i < newCol.Length; i++)
            {
                string[] win1 = TransformIl(win[i]).Split(';');
                HashSet<string> wins = new HashSet<string>();
                HashSet<string> origins = new HashSet<string>();
                foreach (string ux in uprot[i])
                {
                    if (!map.TryGetValue(ux, out List<int> n))
                    {
                        continue;
                    }
                    foreach (int ind in n)
                    {
                        string s = seqWins[ind];
                        if (s.Length < 2)
                        {
                            // Too short to strip the first and last character; Substring would throw.
                            continue;
                        }
                        // Invariant upper-casing keeps the residue letters independent of the UI culture.
                        string core = s.ToUpperInvariant().Substring(1, s.Length - 2);
                        if (!Contains(win1, TransformIl(core)))
                        {
                            continue;
                        }
                        wins.Add(s);
                        if (pubmedLtp[ind].Length > 0)
                        {
                            origins.Add("LTP");
                        }
                        if (pubmedMs2[ind].Length > 0)
                        {
                            origins.Add("HTP");
                        }
                        if (cstMs2[ind].Length > 0)
                        {
                            origins.Add("CST");
                        }
                    }
                }
                if (wins.Count > 0)
                {
                    newCol[i] = StringUtils.Concat(";", ArrayUtils.ToArray(wins));
                    newCatCol[i] = new[] { "+" };
                    string[] x = ArrayUtils.ToArray(origins);
                    Array.Sort(x);
                    originCol[i] = x;
                }
                else
                {
                    newCol[i] = "";
                    newCatCol[i] = new string[0];
                    originCol[i] = new string[0];
                }
            }
            mdata.AddStringColumn("PhosphoSitePlus window", "", newCol);
            mdata.AddCategoryColumn("Known site", "", newCatCol);
            mdata.AddCategoryColumn("Origin", "", originCol);
        }
コード例 #15
0
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
            string[][] proteinIds = new string[mdata.RowCount][];
            string[][] leadingIds = new string[mdata.RowCount][];
            List<string> allIds = new List<string>();
            for (int row = 0; row < mdata.RowCount; row++){
                proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
                leadingIds[row] = new[]{proteinIds[row][0]};
                allIds.AddRange(proteinIds[row]);
            }
            string fastaFilePath = param.GetParam<string>("Fasta file").Value;
            Fasta fasta = new Fasta();
            fasta.ParseFile(fastaFilePath, processInfo);
            // Text annotations
            processInfo.Status("Adding fasta header annotations.");
            int[] selection =
                param.GetParamWithSubParams<int>("Fasta header annotations").GetSubParameters().GetParam<int[]>("Annotations").Value;
            string[][] idsToBeAnnotated = (param.GetParamWithSubParams<int>("Fasta header annotations").Value == 0)
                ? proteinIds
                : leadingIds;
            ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
            for (int row = 0; row < mdata.RowCount; row++){
                List<ProteinSequence> rowEntries = new List<ProteinSequence>();
                foreach (string id in idsToBeAnnotated[row]){
                    ProteinSequence entry = fasta.GetEntry(id);
                    if (entry == null){
                        continue;
                    }
                    rowEntries.Add(entry);
                }
                fastaEntries[row] = rowEntries.ToArray();
            }
            if (ArrayUtils.Contains(selection, 0)){ // Entry name
                string[] annotationColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<string> rowAnnotations = new List<string>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        string entryName = entry.EntryName;
                        if (entryName != null && !ArrayUtils.Contains(rowAnnotations, entryName)){
                            rowAnnotations.Add(entryName);
                        }
                    }
                    annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
                }
                mdata.AddStringColumn("Entry name", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 1)){ // Gene name
                string[] annotationColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<string> rowAnnotations = new List<string>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        string geneName = entry.GeneName;
                        if (geneName != null && !ArrayUtils.Contains(rowAnnotations, geneName)){
                            rowAnnotations.Add(geneName);
                        }
                    }
                    annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
                }
                mdata.AddStringColumn("Gene name", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 2)){
                // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
                //'Isoform x of...' prefixes and '(Fragment)' suffixes
                string[] annotationColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<string> rowAnnotations = new List<string>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        string proteinName = entry.ProteinName;
                        if (proteinName != null && !ArrayUtils.Contains(rowAnnotations, proteinName)){
                            rowAnnotations.Add(proteinName);
                        }
                    }
                    annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
                }
                mdata.AddStringColumn("Protein name (verbose)", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 3)){ // Consensus protein name
                string[] annotationColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<string> rowAnnotations = new List<string>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        string proteinName = entry.ConsensusProteinName;
                        if (proteinName != null && !ArrayUtils.Contains(rowAnnotations, proteinName)){
                            rowAnnotations.Add(proteinName);
                        }
                    }
                    annotationColumn[row] = String.Join(";", rowAnnotations.ToArray());
                }
                mdata.AddStringColumn("Protein name", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 4)){ // Species
                string[] annotationColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<string> rowAnnotations = new List<string>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        string speciesName = entry.Species;
                        if (speciesName != null && !ArrayUtils.Contains(rowAnnotations, speciesName)){
                            rowAnnotations.Add(speciesName);
                        }
                    }
                    annotationColumn[row] = String.Join(";", rowAnnotations.ToArray());
                }
                mdata.AddStringColumn("Species", "", annotationColumn);
            }
            // Numeric annotations
            processInfo.Status("Adding numeric annotations.");
            selection =
                param.GetParamWithSubParams<int>("Numeric annotations").GetSubParameters().GetParam<int[]>("Annotations").Value;
            bool annotateLeadingId = (param.GetParamWithSubParams<int>("Numeric annotations").Value == 1);
            if (ArrayUtils.Contains(selection, 0)){ // Sequence length
                double[] annotationColumn = new double[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<double> rowAnnotations = new List<double>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        double sequenceLength = entry.GetSequence().Length;
                        rowAnnotations.Add(sequenceLength);
                        if (annotateLeadingId && rowAnnotations.Count > 0){
                            break;
                        }
                    }
                    annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
                }
                mdata.AddNumericColumn("Sequence length", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 1)){ // Monoisotopic molecular mass
                double[] annotationColumn = new double[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<double> rowAnnotations = new List<double>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        double monoisotopicMass = entry.GetMonoisotopicMolecularMass();
                        rowAnnotations.Add(monoisotopicMass);
                        if (annotateLeadingId && rowAnnotations.Count > 0){
                            break;
                        }
                    }
                    annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
                }
                mdata.AddNumericColumn("Monoisotopic molecular mass", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 2)){ // Average molecular mass
                double[] annotationColumn = new double[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<double> rowAnnotations = new List<double>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        double averageMass = entry.GetAverageMolecularMass();
                        rowAnnotations.Add(averageMass);
                        if (annotateLeadingId && rowAnnotations.Count > 0){
                            break;
                        }
                    }
                    annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
                }
                mdata.AddNumericColumn("Average molecular mass", "", annotationColumn);
            }
            // Theoretical peptides
            processInfo.Status("Calculating theoretical peptides.");
            annotateLeadingId = (param.GetParamWithSubParams<int>("Calculate theoretical peptides").Value == 1);
            Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
                param.GetParamWithSubParams<int>("Calculate theoretical peptides").GetSubParameters().GetParam<int[]>("Proteases")
                    .Value);
            double minLength =
                param.GetParamWithSubParams<int>("Calculate theoretical peptides").GetSubParameters().GetParam<double>(
                    "Min. peptide length").Value;
            double maxLength =
                param.GetParamWithSubParams<int>("Calculate theoretical peptides").GetSubParameters().GetParam<double>(
                    "Max. peptide length").Value;
            bool displayPeptideSequences = annotateLeadingId &&
                                            param.GetParamWithSubParams<int>("Calculate theoretical peptides").GetSubParameters().GetParam<bool>(
                                                "Show sequences").Value;
            foreach (Protease protease in proteases){
                double[] annotationColumn = new double[mdata.RowCount];
                string[] peptideColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<double> rowAnnotations = new List<double>();
                    List<string> rowPeptides = new List<string>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        double nTheoreticalPeptides = entry.GetNumberOfTheoreticalPeptides(protease, (int) minLength, (int) maxLength);
                        rowAnnotations.Add(nTheoreticalPeptides);
                        if (displayPeptideSequences){
                            rowPeptides.AddRange(entry.GetTheoreticalPeptideSequences(protease, (int) minLength, (int) maxLength));
                        }
                        if (annotateLeadingId && rowAnnotations.Count > 0){
                            break;
                        }
                    }
                    annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
                    peptideColumn[row] = String.Join(";", rowPeptides);
                }
                mdata.AddNumericColumn(
                    "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "", annotationColumn);
                if (displayPeptideSequences){
                    mdata.AddStringColumn(
                        "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "", peptideColumn);
                }
            }
            // Sequence features
            processInfo.Status("Counting sequence features.");
            annotateLeadingId = (param.GetParamWithSubParams<int>("Count sequence features").Value == 1);
            bool normalizeBySequenceLength =
                param.GetParamWithSubParams<int>("Count sequence features").GetSubParameters().GetParam<bool>(
                    "Normalize by sequence length").Value;
            if (param.GetParamWithSubParams<int>("Count sequence features").GetSubParameters().GetParam<string>("Regex").Value !=
                ""){
                Regex regex;
                try{
                    regex =
                        new Regex(
                            param.GetParamWithSubParams<int>("Count sequence features").GetSubParameters().GetParam<string>("Regex").Value);
                } catch (ArgumentException){
                    processInfo.ErrString = "The regular expression you provided has invalid syntax.";
                    return;
                }
                double[] sequenceFeatureColumn = new double[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++){
                    List<double> featureCount = new List<double>();
                    foreach (ProteinSequence entry in fastaEntries[row]){
                        double nFeatures = regex.Matches(entry.GetSequence()).Count;
                        featureCount.Add(normalizeBySequenceLength ? nFeatures/entry.GetLength() : nFeatures);
                        if (annotateLeadingId){
                            break;
                        }
                    }
                    sequenceFeatureColumn[row] = ArrayUtils.Median(featureCount.ToArray());
                }
                mdata.AddNumericColumn(
                    (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")", "",
                    sequenceFeatureColumn);
            }
            processInfo.Status("Done.");
        }
コード例 #16
0
        /// <summary>
        /// Adds two string columns, "PhosphoSitePlus kinase" and "PhosphoSitePlus kinase uniprot", listing for
        /// every row the kinase names and kinase accessions of all kinase-substrate records whose substrate
        /// accession occurs in the row's Uniprot column and whose sequence window matches the row's window.
        /// </summary>
        /// <param name="mdata">Matrix to annotate; the two new string columns are appended to it.</param>
        /// <param name="param">
        /// Supplies the string-column indices "Uniprot column" (semicolon-separated accessions) and
        /// "Sequence window" (semicolon-separated windows).
        /// </param>
        /// <param name="supplTables">Not modified.</param>
        /// <param name="documents">Not modified.</param>
        /// <param name="processInfo">
        /// Receives the error text "File does not exist." when the parser returns no sequence windows.
        /// </param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
            ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string[] seqWins;
            string[] subAccs;
            string[] kinases;
            string[] kinAccs;
            string[] species; // required by the parser signature; not used here
            PhosphoSitePlusParser.ParseKinaseSubstrate(out seqWins, out subAccs, out kinases, out kinAccs, out species);
            if (seqWins == null){
                processInfo.ErrString = "File does not exist.";
                return;
            }
            string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
            string[][] uprot = new string[up.Length][];
            for (int i = 0; i < up.Length; i++){
                uprot[i] = up[i].Length > 0 ? up[i].Split(';') : new string[0];
            }
            string[] win = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];
            // Index the records (window, kinase name, kinase accession) by substrate accession.
            Dictionary<string, List<Tuple<string, string, string>>> substrateProperties =
                new Dictionary<string, List<Tuple<string, string, string>>>();
            for (int i = 0; i < seqWins.Length; i++){
                List<Tuple<string, string, string>> records;
                if (!substrateProperties.TryGetValue(subAccs[i], out records)){
                    records = new List<Tuple<string, string, string>>();
                    substrateProperties.Add(subAccs[i], records);
                }
                records.Add(new Tuple<string, string, string>(seqWins[i], kinases[i], kinAccs[i]));
            }
            string[] kinaseNameColumn = new string[uprot.Length];
            string[] kinaseUniprotColumn = new string[uprot.Length];
            for (int i = 0; i < kinaseNameColumn.Length; i++){
                string[] win1 = AddKnownSites.TransformIl(win[i]).Split(';');
                HashSet<string> kinaseNamesHits = new HashSet<string>();
                HashSet<string> kinaseUniprotHits = new HashSet<string>();
                foreach (string ux in uprot[i]){
                    List<Tuple<string, string, string>> properties;
                    if (!substrateProperties.TryGetValue(ux, out properties)){
                        continue;
                    }
                    foreach (Tuple<string, string, string> property in properties){
                        string w = property.Item1;
                        // The first and last character of the database window are dropped before matching;
                        // a window shorter than two characters cannot be trimmed that way, so skip it
                        // instead of letting Substring throw.
                        if (w.Length < 2){
                            continue;
                        }
                        // Invariant upper-casing: sequence letters must not depend on the current culture.
                        string trimmed = AddKnownSites.TransformIl(w.ToUpperInvariant().Substring(1, w.Length - 2));
                        if (AddKnownSites.Contains(win1, trimmed)){
                            kinaseNamesHits.Add(property.Item2);
                            kinaseUniprotHits.Add(property.Item3);
                        }
                    }
                }
                kinaseNameColumn[i] = kinaseNamesHits.Count > 0 ? StringUtils.Concat(";", ArrayUtils.ToArray(kinaseNamesHits)) : "";
                kinaseUniprotColumn[i] = kinaseUniprotHits.Count > 0
                    ? StringUtils.Concat(";", ArrayUtils.ToArray(kinaseUniprotHits))
                    : "";
            }
            mdata.AddStringColumn("PhosphoSitePlus kinase", "", kinaseNameColumn);
            mdata.AddStringColumn("PhosphoSitePlus kinase uniprot", "", kinaseUniprotColumn);
        }
コード例 #17
0
        /// <summary>
        /// Combines two matrices by placing the rows of the second matrix below the rows of the first one.
        /// Columns are matched by name via <c>SpecialSort</c>. Where a column is present in only one input,
        /// the rows of the other input keep a filler value: NaN for main and numeric values, "" for string
        /// cells and empty arrays for category and multi-numeric cells. Finally a string column
        /// "Matrix Name" is appended that holds, for every row, the name of the matrix it came from.
        /// </summary>
        /// <param name="inputData">The matrices to combine; elements 0 and 1 are read.</param>
        /// <param name="param">Not read by this method.</param>
        /// <param name="supplTables">Not modified.</param>
        /// <param name="documents">Not modified.</param>
        /// <param name="processInfo">Not used by this method.</param>
        /// <returns>A newly created matrix holding the rows of both inputs.</returns>
        public IMatrixData ProcessData(IMatrixData[] inputData, Parameters param, ref IMatrixData[] supplTables,
                                       ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            IMatrixData mdata1 = inputData[0];
            IMatrixData mdata2 = inputData[1];
            int nrows1 = mdata1.RowCount;
            int nrows2 = mdata2.RowCount;
            int nrows  = nrows1 + nrows2;

            // Main values. dic1/dic2 are used as column name -> column index lookups into mdata1/mdata2.
            string[] expColNames = SpecialSort(mdata1.ColumnNames, mdata2.ColumnNames, out Dictionary <string, int> dic1, out Dictionary <string, int> dic2);
            double[,] ex = new double[nrows, expColNames.Length];
            for (int i = 0; i < nrows; i++)
            {
                for (int j = 0; j < expColNames.Length; j++)
                {
                    ex[i, j] = double.NaN;
                }
            }
            for (int i = 0; i < expColNames.Length; i++)
            {
                if (dic1.TryGetValue(expColNames[i], out int ind1))
                {
                    for (int j = 0; j < nrows1; j++)
                    {
                        ex[j, i] = mdata1.Values.Get(j, ind1);
                    }
                }
                if (dic2.TryGetValue(expColNames[i], out int ind2))
                {
                    for (int j = 0; j < nrows2; j++)
                    {
                        ex[nrows1 + j, i] = mdata2.Values.Get(j, ind2);
                    }
                }
            }

            // Annotation columns. dic1/dic2 are overwritten by every SpecialSort call, so each kind is
            // stacked immediately after its own call.
            string[] numColNames = SpecialSort(mdata1.NumericColumnNames, mdata2.NumericColumnNames, out dic1, out dic2);
            List <double[]> numCols = StackAnnotationColumns <double>(numColNames, dic1, dic2, nrows1, nrows2,
                () => double.NaN,
                (col, row) => mdata1.NumericColumns[col][row],
                (col, row) => mdata2.NumericColumns[col][row]);

            string[] stringColNames = SpecialSort(mdata1.StringColumnNames, mdata2.StringColumnNames, out dic1, out dic2);
            List <string[]> stringCols = StackAnnotationColumns <string>(stringColNames, dic1, dic2, nrows1, nrows2,
                () => "",
                (col, row) => mdata1.StringColumns[col][row],
                (col, row) => mdata2.StringColumns[col][row]);

            string[] catColNames = SpecialSort(mdata1.CategoryColumnNames, mdata2.CategoryColumnNames, out dic1, out dic2);
            List <string[][]> catCols = StackAnnotationColumns <string[]>(catColNames, dic1, dic2, nrows1, nrows2,
                () => new string[0],
                (col, row) => mdata1.GetCategoryColumnEntryAt(col, row),
                (col, row) => mdata2.GetCategoryColumnEntryAt(col, row));

            string[] multiNumColNames = SpecialSort(mdata1.MultiNumericColumnNames, mdata2.MultiNumericColumnNames, out dic1,
                                                    out dic2);
            List <double[][]> multiNumCols = StackAnnotationColumns <double[]>(multiNumColNames, dic1, dic2, nrows1, nrows2,
                () => new double[0],
                (col, row) => mdata1.MultiNumericColumns[col][row],
                (col, row) => mdata2.MultiNumericColumns[col][row]);

            // Per-row name of the source matrix: first the rows of mdata1, then those of mdata2.
            string   name1     = mdata1.Name;
            string   name2     = mdata2.Name;
            string[] listnames = new string[nrows];
            for (int i = 0; i < nrows; i++)
            {
                listnames[i] = i < nrows1 ? name1 : name2;
            }

            // TODO (flagged as important by the original author): check whether the name of the matrix is changed.
            IMatrixData result = PerseusFactory.CreateMatrixData(ex, new List <string>(expColNames));

            // Names and descriptions get separate list instances. If the property setters keep the
            // reference (not visible from here), a shared list would receive both the name and the
            // description whenever a column is added later.
            result.NumericColumnNames             = new List <string>(numColNames);
            result.NumericColumnDescriptions      = new List <string>(numColNames);
            result.NumericColumns                 = numCols;
            result.StringColumnNames              = new List <string>(stringColNames);
            // String columns get descriptions like the other annotation kinds, so the description
            // appended by AddStringColumn below lines up with its column name.
            result.StringColumnDescriptions       = new List <string>(stringColNames);
            result.StringColumns                  = stringCols;
            result.CategoryColumnNames            = new List <string>(catColNames);
            result.CategoryColumnDescriptions     = new List <string>(catColNames);
            result.CategoryColumns                = catCols;
            result.MultiNumericColumnNames        = new List <string>(multiNumColNames);
            result.MultiNumericColumnDescriptions = new List <string>(multiNumColNames);
            result.MultiNumericColumns            = multiNumCols;

            const string matrixName = "Matrix Name";
            result.AddStringColumn(matrixName, matrixName, listnames);

            return(result);
        }

        /// <summary>
        /// Builds one stacked column of length nrows1 + nrows2 per entry of <paramref name="names"/>:
        /// every cell starts as <paramref name="emptyCell"/>(); rows [0, nrows1) are taken from the first
        /// source if <paramref name="dic1"/> knows the name, rows [nrows1, nrows1 + nrows2) from the second
        /// source if <paramref name="dic2"/> knows it.
        /// </summary>
        /// <param name="names">Names of the columns to build, in output order.</param>
        /// <param name="dic1">Column name -> column index in the first source.</param>
        /// <param name="dic2">Column name -> column index in the second source.</param>
        /// <param name="nrows1">Number of rows of the first source.</param>
        /// <param name="nrows2">Number of rows of the second source.</param>
        /// <param name="emptyCell">Creates the filler value; invoked once per cell.</param>
        /// <param name="cell1">Reads cell (column index, row) of the first source.</param>
        /// <param name="cell2">Reads cell (column index, row) of the second source.</param>
        /// <returns>The stacked columns, in the order of <paramref name="names"/>.</returns>
        private static List <T[]> StackAnnotationColumns <T>(string[] names, Dictionary <string, int> dic1,
                                                             Dictionary <string, int> dic2, int nrows1, int nrows2,
                                                             System.Func <T> emptyCell, System.Func <int, int, T> cell1,
                                                             System.Func <int, int, T> cell2)
        {
            List <T[]> columns = new List <T[]>(names.Length);
            foreach (string name in names)
            {
                T[] column = new T[nrows1 + nrows2];
                for (int j = 0; j < column.Length; j++)
                {
                    column[j] = emptyCell();
                }
                if (dic1.TryGetValue(name, out int ind1))
                {
                    for (int j = 0; j < nrows1; j++)
                    {
                        column[j] = cell1(ind1, j);
                    }
                }
                if (dic2.TryGetValue(name, out int ind2))
                {
                    for (int j = 0; j < nrows2; j++)
                    {
                        column[nrows1 + j] = cell2(ind2, j);
                    }
                }
                columns.Add(column);
            }
            return columns;
        }
コード例 #18
0
        /// <summary>
        /// Annotates every row with the PhosphoSitePlus sites of the selected modification that match it.
        /// A database entry matches a row when its accession occurs in the row's Uniprot column and its
        /// sequence window (upper-cased, first and last character removed) is found among the row's windows.
        /// Three columns are appended: the string column "PhosphoSitePlus window" (matching database
        /// windows joined by ';'), the category column "Known site" ("+" when anything matched) and the
        /// category column "Origin" (sorted subset of "LTP", "HTP", "CST", added when the corresponding
        /// reference field of a matching entry is non-empty).
        /// </summary>
        /// <param name="mdata">Matrix to annotate; the three new columns are appended to it.</param>
        /// <param name="param">
        /// Supplies "Modification" and the string-column indices "Uniprot column" (semicolon-separated
        /// accessions) and "Sequence window" (semicolon-separated windows).
        /// </param>
        /// <param name="supplTables">Not modified.</param>
        /// <param name="documents">Not modified.</param>
        /// <param name="processInfo">
        /// Receives the error text "File does not exist." when the parser returns no sequence windows.
        /// </param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
            ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string mod = param.GetParam<int>("Modification").StringValue;
            string[] seqWins;
            string[] accs;
            string[] pubmedLtp;
            string[] pubmedMs2;
            string[] cstMs2;
            string[] species; // required by the parser signature; not used here
            PhosphoSitePlusParser.ParseKnownMod(mod, out seqWins, out accs, out pubmedLtp, out pubmedMs2, out cstMs2, out species);
            if (seqWins == null){
                processInfo.ErrString = "File does not exist.";
                return;
            }
            string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
            string[][] uprot = new string[up.Length][];
            for (int i = 0; i < up.Length; i++){
                uprot[i] = up[i].Length > 0 ? up[i].Split(';') : new string[0];
            }
            string[] win = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];
            // Accession -> indices of all database entries carrying that accession.
            Dictionary<string, List<int>> map = new Dictionary<string, List<int>>();
            for (int i = 0; i < seqWins.Length; i++){
                List<int> indices;
                if (!map.TryGetValue(accs[i], out indices)){
                    indices = new List<int>();
                    map.Add(accs[i], indices);
                }
                indices.Add(i);
            }
            string[] newCol = new string[uprot.Length];
            string[][] newCatCol = new string[uprot.Length][];
            string[][] originCol = new string[uprot.Length][];
            for (int i = 0; i < newCol.Length; i++){
                string[] win1 = TransformIl(win[i]).Split(';');
                HashSet<string> wins = new HashSet<string>();
                HashSet<string> origins = new HashSet<string>();
                foreach (string ux in uprot[i]){
                    List<int> entries;
                    if (!map.TryGetValue(ux, out entries)){
                        continue;
                    }
                    foreach (int ind in entries){
                        string s = seqWins[ind];
                        // A window shorter than two characters cannot lose its first and last character;
                        // skip it instead of letting Substring throw.
                        if (s.Length < 2){
                            continue;
                        }
                        // Invariant upper-casing: sequence letters must not depend on the current culture.
                        string trimmed = TransformIl(s.ToUpperInvariant().Substring(1, s.Length - 2));
                        if (!Contains(win1, trimmed)){
                            continue;
                        }
                        wins.Add(s);
                        if (pubmedLtp[ind].Length > 0){
                            origins.Add("LTP");
                        }
                        if (pubmedMs2[ind].Length > 0){
                            origins.Add("HTP");
                        }
                        if (cstMs2[ind].Length > 0){
                            origins.Add("CST");
                        }
                    }
                }
                if (wins.Count > 0){
                    newCol[i] = StringUtils.Concat(";", ArrayUtils.ToArray(wins));
                    newCatCol[i] = new[]{"+"};
                    string[] x = ArrayUtils.ToArray(origins);
                    Array.Sort(x);
                    originCol[i] = x;
                } else{
                    newCol[i] = "";
                    newCatCol[i] = new string[0];
                    originCol[i] = new string[0];
                }
            }
            mdata.AddStringColumn("PhosphoSitePlus window", "", newCol);
            mdata.AddCategoryColumn("Known site", "", newCatCol);
            mdata.AddCategoryColumn("Origin", "", originCol);
        }
コード例 #19
0
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int proteinIdColumnInd = param.GetParam <int>("Protein IDs").Value;

            string[][]    proteinIds = new string[mdata.RowCount][];
            string[][]    leadingIds = new string[mdata.RowCount][];
            List <string> allIds     = new List <string>();

            for (int row = 0; row < mdata.RowCount; row++)
            {
                proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
                leadingIds[row] = new[] { proteinIds[row][0] };
                allIds.AddRange(proteinIds[row]);
            }
            string fastaFilePath = param.GetParam <string>("Fasta file").Value;
            Fasta  fasta         = new Fasta();

            fasta.ParseFile(fastaFilePath, processInfo);
            // Text annotations
            processInfo.Status("Adding fasta header annotations.");
            int[] selection =
                param.GetParamWithSubParams <int>("Fasta header annotations").GetSubParameters().GetParam <int[]>("Annotations").Value;
            string[][] idsToBeAnnotated = param.GetParamWithSubParams <int>("Fasta header annotations").Value == 0
                                ? proteinIds
                                : leadingIds;
            ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
            for (int row = 0; row < mdata.RowCount; row++)
            {
                List <ProteinSequence> rowEntries = new List <ProteinSequence>();
                foreach (string id in idsToBeAnnotated[row])
                {
                    ProteinSequence entry = fasta.GetEntry(id);
                    if (entry == null)
                    {
                        continue;
                    }
                    rowEntries.Add(entry);
                }
                fastaEntries[row] = rowEntries.ToArray();
            }
            if (ArrayUtils.Contains(selection, 0))              // Entry name
            {
                string[] annotationColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List <string> rowAnnotations = new List <string>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        string entryName = entry.EntryName;
                        if (entryName != null && !ArrayUtils.Contains(rowAnnotations, entryName))
                        {
                            rowAnnotations.Add(entryName);
                        }
                    }
                    annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
                }
                mdata.AddStringColumn("Entry name", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 1))              // Gene name
            {
                string[] annotationColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List <string> rowAnnotations = new List <string>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        string geneName = entry.GeneName;
                        if (geneName != null && !ArrayUtils.Contains(rowAnnotations, geneName))
                        {
                            rowAnnotations.Add(geneName);
                        }
                    }
                    annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
                }
                mdata.AddStringColumn("Gene name", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 2))
            {
                // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
                //'Isoform x of...' prefixes and '(Fragment)' suffixes
                string[] annotationColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List <string> rowAnnotations = new List <string>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        string proteinName = entry.ProteinName;
                        if (proteinName != null && !ArrayUtils.Contains(rowAnnotations, proteinName))
                        {
                            rowAnnotations.Add(proteinName);
                        }
                    }
                    annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
                }
                mdata.AddStringColumn("Protein name (verbose)", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 3))              // Consensus protein name
            {
                string[] annotationColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List <string> rowAnnotations = new List <string>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        string proteinName = entry.ConsensusProteinName;
                        if (proteinName != null && !ArrayUtils.Contains(rowAnnotations, proteinName))
                        {
                            rowAnnotations.Add(proteinName);
                        }
                    }
                    annotationColumn[row] = String.Join(";", rowAnnotations.ToArray());
                }
                mdata.AddStringColumn("Protein name", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 4))              // Species
            {
                string[] annotationColumn = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List <string> rowAnnotations = new List <string>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        string speciesName = entry.Species;
                        if (speciesName != null && !ArrayUtils.Contains(rowAnnotations, speciesName))
                        {
                            rowAnnotations.Add(speciesName);
                        }
                    }
                    annotationColumn[row] = String.Join(";", rowAnnotations.ToArray());
                }
                mdata.AddStringColumn("Species", "", annotationColumn);
            }
            // Numeric annotations
            processInfo.Status("Adding numeric annotations.");
            selection =
                param.GetParamWithSubParams <int>("Numeric annotations").GetSubParameters().GetParam <int[]>("Annotations").Value;
            bool annotateLeadingId = param.GetParamWithSubParams <int>("Numeric annotations").Value == 1;

            if (ArrayUtils.Contains(selection, 0))              // Sequence length
            {
                double[] annotationColumn = new double[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List <double> rowAnnotations = new List <double>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        double sequenceLength = entry.GetSequence().Length;
                        rowAnnotations.Add(sequenceLength);
                        if (annotateLeadingId && rowAnnotations.Count > 0)
                        {
                            break;
                        }
                    }
                    annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
                }
                mdata.AddNumericColumn("Sequence length", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 1))              // Monoisotopic molecular mass
            {
                double[] annotationColumn = new double[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List <double> rowAnnotations = new List <double>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        double monoisotopicMass = entry.GetMonoisotopicMolecularMass();
                        rowAnnotations.Add(monoisotopicMass);
                        if (annotateLeadingId && rowAnnotations.Count > 0)
                        {
                            break;
                        }
                    }
                    annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
                }
                mdata.AddNumericColumn("Monoisotopic molecular mass", "", annotationColumn);
            }
            if (ArrayUtils.Contains(selection, 2))              // Average molecular mass
            {
                double[] annotationColumn = new double[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List <double> rowAnnotations = new List <double>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        double averageMass = entry.GetAverageMolecularMass();
                        rowAnnotations.Add(averageMass);
                        if (annotateLeadingId && rowAnnotations.Count > 0)
                        {
                            break;
                        }
                    }
                    annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
                }
                mdata.AddNumericColumn("Average molecular mass", "", annotationColumn);
            }
            // Theoretical peptides
            processInfo.Status("Calculating theoretical peptides.");
            annotateLeadingId = param.GetParamWithSubParams <int>("Calculate theoretical peptides").Value == 1;
            Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
                                                       param.GetParamWithSubParams <int>("Calculate theoretical peptides").GetSubParameters().GetParam <int[]>("Proteases")
                                                       .Value);
            double minLength =
                param.GetParamWithSubParams <int>("Calculate theoretical peptides").GetSubParameters().GetParam <double>(
                    "Min. peptide length").Value;
            double maxLength =
                param.GetParamWithSubParams <int>("Calculate theoretical peptides").GetSubParameters().GetParam <double>(
                    "Max. peptide length").Value;
            bool displayPeptideSequences = annotateLeadingId &&
                                           param.GetParamWithSubParams <int>("Calculate theoretical peptides").GetSubParameters().GetParam <bool>(
                "Show sequences").Value;

            foreach (Protease protease in proteases)
            {
                double[] annotationColumn = new double[mdata.RowCount];
                string[] peptideColumn    = new string[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List <double> rowAnnotations = new List <double>();
                    List <string> rowPeptides    = new List <string>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        double nTheoreticalPeptides = entry.GetNumberOfTheoreticalPeptides(protease, (int)minLength, (int)maxLength);
                        rowAnnotations.Add(nTheoreticalPeptides);
                        if (displayPeptideSequences)
                        {
                            rowPeptides.AddRange(entry.GetTheoreticalPeptideSequences(protease, (int)minLength, (int)maxLength));
                        }
                        if (annotateLeadingId && rowAnnotations.Count > 0)
                        {
                            break;
                        }
                    }
                    annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
                    peptideColumn[row]    = String.Join(";", rowPeptides);
                }
                mdata.AddNumericColumn(
                    "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "", annotationColumn);
                if (displayPeptideSequences)
                {
                    mdata.AddStringColumn(
                        "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "", peptideColumn);
                }
            }
            // Sequence features
            processInfo.Status("Counting sequence features.");
            annotateLeadingId = param.GetParamWithSubParams <int>("Count sequence features").Value == 1;
            bool normalizeBySequenceLength =
                param.GetParamWithSubParams <int>("Count sequence features").GetSubParameters().GetParam <bool>(
                    "Normalize by sequence length").Value;

            if (param.GetParamWithSubParams <int>("Count sequence features").GetSubParameters().GetParam <string>("Regex").Value !=
                "")
            {
                Regex regex;
                try{
                    regex =
                        new Regex(
                            param.GetParamWithSubParams <int>("Count sequence features").GetSubParameters().GetParam <string>("Regex").Value);
                } catch (ArgumentException) {
                    processInfo.ErrString = "The regular expression you provided has invalid syntax.";
                    return;
                }
                double[] sequenceFeatureColumn = new double[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    List <double> featureCount = new List <double>();
                    foreach (ProteinSequence entry in fastaEntries[row])
                    {
                        double nFeatures = regex.Matches(entry.GetSequence()).Count;
                        featureCount.Add(normalizeBySequenceLength ? nFeatures / entry.GetLength() : nFeatures);
                        if (annotateLeadingId)
                        {
                            break;
                        }
                    }
                    sequenceFeatureColumn[row] = ArrayUtils.Median(featureCount.ToArray());
                }
                mdata.AddNumericColumn(
                    (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")", "",
                    sequenceFeatureColumn);
            }
            processInfo.Status("Done.");
        }
コード例 #20
0
        /// <summary>
        /// Fills <paramref name="mdata"/> with a randomly generated matrix named "Random matrix".
        /// </summary>
        /// <remarks>
        /// The "Mode" parameter selects how the values are generated:
        /// <list type="bullet">
        /// <item><description>0: every cell is an independent draw from <c>Random2.NextGaussian()</c>.</description></item>
        /// <item><description>1: as mode 0, but each row is put into "Group1" or "Group2" by a coin flip
        /// and rows in "Group1" get "Distance" added to their first column.</description></item>
        /// <item><description>2: "How many" cluster centers are drawn with coordinates scaled by "Box size";
        /// each row is assigned to one center and its cells are the center plus a Gaussian draw.</description></item>
        /// </list>
        /// In every mode a cell becomes NaN when <c>NextDouble() * 100</c> falls below
        /// "Percentage of missing values". Modes 1 and 2 add a "Grouping" category column for the rows.
        /// Afterwards a "Name" string column ("Row 1", "Row 2", ...) and a "Grouping" category row that
        /// splits the columns into consecutive groups are added.
        /// NOTE(review): NextDouble is presumably uniform on [0, 1) and NextGaussian standard normal;
        /// confirm against Random2.
        /// </remarks>
        /// <param name="mdata">Matrix that receives the generated values and annotations.</param>
        /// <param name="param">Source of "Number of rows", "Number of columns", "Percentage of missing values",
        /// "Number of groups", "Set seed" (sub-parameter "Seed") and "Mode" (sub-parameters "Distance",
        /// "Box size", "How many").</param>
        /// <param name="supplTables">Not used by this method.</param>
        /// <param name="documents">Not used by this method.</param>
        /// <param name="processInfo">Receives an error message in <c>ErrString</c> when the matrix dimensions
        /// are negative or the cluster count in mode 2 is below one; <paramref name="mdata"/> is left
        /// untouched in that case.</param>
        public void LoadData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents,
                             ProcessInfo processInfo)
        {
            int nrows       = param.GetParam<int>("Number of rows").Value;
            int ncols       = param.GetParam<int>("Number of columns").Value;
            int missingPerc = param.GetParam<int>("Percentage of missing values").Value;
            int ngroups     = param.GetParam<int>("Number of groups").Value;

            // Report bad dimensions through processInfo instead of failing inside the array allocation.
            if (nrows < 0 || ncols < 0)
            {
                processInfo.ErrString = "The number of rows and the number of columns must not be negative.";
                return;
            }

            ParameterWithSubParams<bool> setSeed = param.GetParamWithSubParams<bool>("Set seed");
            Random2 randy = setSeed.Value
                ? new Random2(setSeed.GetSubParameters().GetParam<int>("Seed").Value)
                : new Random2();

            // There cannot be more column groups than columns.
            ngroups = Math.Min(ngroups, ncols);

            float[,] m = new float[nrows, ncols];
            ParameterWithSubParams<int> mode = param.GetParamWithSubParams<int>("Mode");
            Parameters subParams = mode.GetSubParameters();
            List<string> catColNames = new List<string>();
            List<string[][]> catCols = new List<string[][]>();

            // Draws a single cell. The missing-value draw always happens first and the Gaussian is only
            // drawn for non-missing cells, so the sequence of random numbers consumed for a given seed
            // is the same in all three modes as when the draw was written out inline.
            float NextCell()
            {
                if (randy.NextDouble() * 100 < missingPerc)
                {
                    return float.NaN;
                }
                return (float)randy.NextGaussian();
            }

            switch (mode.Value)
            {
            case 0:
            {
                for (int i = 0; i < nrows; i++)
                {
                    for (int j = 0; j < ncols; j++)
                    {
                        m[i, j] = NextCell();
                    }
                }
                break;
            }

            case 1:
            {
                float dist = (float)subParams.GetParam<double>("Distance").Value;
                string[][] rowGroups = new string[nrows][];
                for (int i = 0; i < nrows; i++)
                {
                    // The group is decided before the row's cells are drawn.
                    bool shifted = randy.NextDouble() < 0.5;
                    for (int j = 0; j < ncols; j++)
                    {
                        m[i, j] = NextCell();
                    }
                    // Only the first column carries the group separation; with zero columns there is
                    // nothing to shift (previously this indexed out of range).
                    if (shifted && ncols > 0)
                    {
                        m[i, 0] += dist;
                    }
                    rowGroups[i] = new[] { shifted ? "Group1" : "Group2" };
                }
                catColNames.Add("Grouping");
                catCols.Add(rowGroups);
                break;
            }

            case 2:
            {
                double boxLen = subParams.GetParam<double>("Box size").Value;
                int howMany = subParams.GetParam<int>("How many").Value;
                // Without at least one center no row can be assigned to a cluster.
                if (howMany < 1)
                {
                    processInfo.ErrString = "The number of clusters must be at least 1.";
                    return;
                }
                string[][] rowGroups = new string[nrows][];
                float[,] centers = new float[howMany, ncols];
                for (int c = 0; c < howMany; c++)
                {
                    for (int j = 0; j < ncols; j++)
                    {
                        centers[c, j] = (float)(randy.NextDouble() * boxLen);
                    }
                }
                for (int i = 0; i < nrows; i++)
                {
                    int cluster = (int)(randy.NextDouble() * howMany);
                    for (int j = 0; j < ncols; j++)
                    {
                        // NaN + center stays NaN, so missing cells are unaffected by the offset.
                        m[i, j] = NextCell() + centers[cluster, j];
                    }
                    rowGroups[i] = new[] { "Group" + (cluster + 1) };
                }
                catColNames.Add("Grouping");
                catCols.Add(rowGroups);
                break;
            }
            }

            List<string> exprColumnNames = new List<string>(ncols);
            for (int i = 0; i < ncols; i++)
            {
                exprColumnNames.Add("Column " + (i + 1));
            }

            mdata.Name        = "Random matrix";
            mdata.ColumnNames = exprColumnNames;
            mdata.Values.Set(m);
            mdata.Quality.Set(new float[nrows, ncols]);
            mdata.IsImputed.Set(new bool[nrows, ncols]);
            mdata.SetAnnotationColumns(new List<string>(), new List<string[]>(), catColNames, catCols, new List<string>(),
                                       new List<double[]>(), new List<string>(), new List<double[][]>());
            mdata.Origin = "Random matrix";

            string[] names = new string[mdata.RowCount];
            for (int i = 0; i < names.Length; i++)
            {
                names[i] = "Row " + (i + 1);
            }
            mdata.AddStringColumn("Name", "Name", names);

            string[][] grouping = new string[ncols][];
            for (int i = 0; i < ncols; i++)
            {
                // Multiply in 64 bits: i * ngroups exceeds Int32.MaxValue for very wide matrices and
                // would otherwise wrap around to wrong (even negative) group numbers.
                int ig = (int)((long)i * ngroups / ncols) + 1;
                grouping[i] = new[] { "Group" + ig };
            }
            mdata.AddCategoryRow("Grouping", "Grouping", grouping);
        }