/// <summary>
/// Builds a four-row matrix whose "id" text column holds "a", "b", "b", "b", collapses it with
/// <c>UniqueRows</c> on that column and checks the two remaining rows for every column kind.
/// </summary>
public void SmallTest()
{
    // Fixture data, one entry per row unless noted otherwise.
    double[,] mainValues = { { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 } };
    string[] ids = { "a", "b", "b", "b" };
    string[] texts = { "a;b", "b;c", "c;d", "d;e" };
    string[][] categories = { new[] { "a", "b" }, new[] { "b", "c" }, new[] { "c", "d" }, new[] { "d", "e" } };
    // NOTE(review): five values for a four-row matrix, exactly as in the original test - confirm intent.
    double[] numbers = { 0, 1, 2, 3, 4.0 };
    double[][] multiNumbers = { new[] { 0, 4d }, new[] { 1, 5d }, new[] { 2, 6d }, new[] { 3, 7d } };

    IMatrixData mdata = PerseusFactory.CreateMatrixData(mainValues);
    mdata.AddStringColumn("id", "", ids);
    mdata.AddStringColumn("str", "", texts);
    mdata.AddCategoryColumn("cat", "", categories);
    mdata.AddNumericColumn("num", "", numbers);
    mdata.AddMultiNumericColumn("mnum", "", multiNumbers);

    mdata.UniqueRows(mdata.StringColumns[0], ArrayUtils.Median, UniqueRows.Union, UniqueRows.CatUnion,
        UniqueRows.MultiNumUnion);

    Assert.AreEqual(2, mdata.RowCount);

    // Main columns.
    int[] expectedMain0 = { 0, 2 };
    int[] expectedMain1 = { 4, 6 };
    CollectionAssert.AreEqual(expectedMain0, mdata.Values.GetColumn(0));
    CollectionAssert.AreEqual(expectedMain1, mdata.Values.GetColumn(1));

    // Text and category columns.
    string[] expectedTexts = { "a;b", "b;c;d;e" };
    CollectionAssert.AreEqual(expectedTexts, mdata.GetStringColumn("str"));
    string[][] expectedCategories = { new[] { "a", "b" }, new[] { "b", "c", "d", "e" } };
    CollectionAssert.AreEqual(expectedCategories, mdata.GetCategoryColumnAt(0));

    // Numeric and multi-numeric columns.
    int[] expectedNumbers = { 0, 2 };
    CollectionAssert.AreEqual(expectedNumbers, mdata.NumericColumns[0]);
    double[][] expectedMultiNumbers = { new[] { 0d, 4 }, new[] { 1d, 5, 2, 6, 3, 7 } };
    CollectionAssert.AreEqual(expectedMultiNumbers, mdata.MultiNumericColumns[0]);
}
/// <summary>
/// Adds two text columns, "PhosphoSitePlus kinase" and "PhosphoSitePlus kinase uniprot", listing for each
/// row the kinases whose PhosphoSitePlus substrate entry matches one of the row's accessions and one of its
/// sequence windows.
/// </summary>
/// <param name="mdata">Matrix that is read and extended in place.</param>
/// <param name="param">Supplies the "Uniprot column" and "Sequence window" text column indices.</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives the error string when the parser yields no data.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo)
{
    PhosphoSitePlusParser.ParseKinaseSubstrate(out string[] seqWins, out string[] subAccs, out string[] kinases,
        out string[] kinAccs, out string[] species);
    if (seqWins == null)
    {
        processInfo.ErrString = "File does not exist.";
        return;
    }

    // Split the ';'-separated accession list of every row; empty cells give an empty list.
    string[] accessionColumn = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    string[][] rowAccessions = new string[accessionColumn.Length][];
    for (int row = 0; row < accessionColumn.Length; row++)
    {
        string cell = accessionColumn[row];
        if (cell.Length > 0)
        {
            rowAccessions[row] = cell.Split(';');
        }
        else
        {
            rowAccessions[row] = new string[0];
        }
    }
    string[] windowColumn = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];

    // Group (sequence window, kinase name, kinase accession) triples by substrate accession.
    var substratesByAccession = new Dictionary<string, List<Tuple<string, string, string>>>();
    for (int k = 0; k < seqWins.Length; k++)
    {
        if (!substratesByAccession.TryGetValue(subAccs[k], out List<Tuple<string, string, string>> bucket))
        {
            bucket = new List<Tuple<string, string, string>>();
            substratesByAccession.Add(subAccs[k], bucket);
        }
        bucket.Add(Tuple.Create(seqWins[k], kinases[k], kinAccs[k]));
    }

    string[] kinaseNameColumn = new string[rowAccessions.Length];
    string[] kinaseUniprotColumn = new string[rowAccessions.Length];
    for (int row = 0; row < kinaseNameColumn.Length; row++)
    {
        string[] rowWindows = AddKnownSites.TransformIl(windowColumn[row]).Split(';');
        HashSet<string> nameHits = new HashSet<string>();
        HashSet<string> accessionHits = new HashSet<string>();
        foreach (string accession in rowAccessions[row])
        {
            if (!substratesByAccession.TryGetValue(accession, out List<Tuple<string, string, string>> candidates))
            {
                continue;
            }
            foreach (Tuple<string, string, string> candidate in candidates)
            {
                string window = candidate.Item1;
                // Compare on the upper-cased window without its first and last character.
                string trimmed = AddKnownSites.TransformIl(window.ToUpper().Substring(1, window.Length - 2));
                if (!AddKnownSites.Contains(rowWindows, trimmed))
                {
                    continue;
                }
                nameHits.Add(candidate.Item2);
                accessionHits.Add(candidate.Item3);
            }
        }
        kinaseNameColumn[row] = nameHits.Count > 0 ? StringUtils.Concat(";", ArrayUtils.ToArray(nameHits)) : "";
        kinaseUniprotColumn[row] = accessionHits.Count > 0
            ? StringUtils.Concat(";", ArrayUtils.ToArray(accessionHits))
            : "";
    }
    mdata.AddStringColumn("PhosphoSitePlus kinase", "", kinaseNameColumn);
    mdata.AddStringColumn("PhosphoSitePlus kinase uniprot", "", kinaseUniprotColumn);
}
/// <summary>
/// Prepares the fixtures shared by the tests: a one-row peptide table that is passed through
/// <c>ExpandMultiNumeric</c>, a two-row protein table, a hand-built two-row "expand" table, and the
/// parameters of <c>MatchingRowsByName</c> for the latter two.
/// </summary>
public void TestInitialize()
{
    // Peptide table: one row referencing the protein groups "13;21".
    float[,] peptideMain = { { 9.0f } };
    peptides = PerseusFactory.CreateMatrixData(peptideMain, new List<string> { "pep_MS/MS Count" });
    peptides.AddNumericColumn("pep_Intensity", "", new[] { 0.0 });
    peptides.AddStringColumn("pep_id", "", new[] { "35" });
    peptides.AddStringColumn("pep_Protein group IDs", "", new[] { "13;21" });
    peptides.Quality.Init(1, 1);
    peptides.Quality.Set(0, 0, 1);

    // Run ExpandMultiNumeric on the peptide table with text column 1 selected.
    var expander = new ExpandMultiNumeric();
    var expanderError = string.Empty;
    var expanderParameters = expander.GetParameters(peptides, ref expanderError);
    expanderParameters.GetParam<int[]>("Text columns").Value = new[] { 1 };
    IMatrixData[] supplementary = null;
    IDocumentData[] documentData = null;
    expander.ProcessData(peptides, expanderParameters, ref supplementary, ref documentData, CreateProcessInfo());

    // Protein table with ids "13" and "21".
    float[,] proteinValues = { { 166250000.0f }, { 8346000.0f } };
    proteinMain = PerseusFactory.CreateMatrixData(proteinValues, new List<string> { "prot_LFQ intensity" });
    proteinMain.Name = "protein main";
    proteinMain.AddStringColumn("prot_id", "", new[] { "13", "21" });
    proteinMain.AddStringColumn("prot_gene name", "", new[] { "geneA", "geneB" });

    // Hand-built table with one row per protein group id.
    float[,] expandedValues = { { 9.0f }, { 9.0f } };
    expand = PerseusFactory.CreateMatrixData(expandedValues, new List<string> { "pep_MS/MS Count" });
    expand.Name = "expand";
    expand.AddNumericColumn("pep_Intensity", "", new[] { 0.0, 0.0 });
    expand.AddStringColumn("pep_id", "", new[] { "35", "35" });
    expand.AddStringColumn("pep_Protein group IDs", "", new[] { "13", "21" });

    matching = new MatchingRowsByName();
    var matchingError = string.Empty;
    parameters = matching.GetParameters(new[] { expand, proteinMain }, ref matchingError);
}
/// <summary>
/// Writes a small matrix with annotation rows and columns to an in-memory stream, reads it back with
/// <c>PerseusUtils.ReadMatrix</c> and checks the row/column counts and one text cell.
/// </summary>
public void WriteMatrixTest()
{
    // Main data: 2 rows x 3 columns.
    double[,] mainValues = { { 1, 2, 3 }, { 3, 4, 5 } };
    IMatrixData original = PerseusFactory.CreateMatrixData(mainValues, new List<string> { "col1", "col2", "col3" });

    // Annotation rows.
    original.AddCategoryRow("catrow", "this is catrow",
        new[] { new[] { "cat1" }, new[] { "cat1", "cat2" }, new[] { "cat2" } });
    original.AddNumericRow("numrow", "this is numrow", new[] { -1.0, 1, 2 });

    // Annotation columns.
    original.AddStringColumn("strcol1", "this is stringcol1", new[] { "1", "2" });
    original.AddStringColumn("strcol2", "", new[] { "", "hallo" });
    original.AddNumericColumn("numcol", "", new[] { 1.0, 2.0 });
    original.AddMultiNumericColumn("multnumcol", "this is multnumcol",
        new[] { new[] { -2.0, 2.0 }, new double[] { } });
    original.AddCategoryColumn("catcol", "", new[] { new[] { "cat1", "cat1.1" }, new[] { "cat2", "cat1" } });

    // Serialize to a string through a memory stream.
    string serialized;
    using (MemoryStream buffer = new MemoryStream())
    {
        using (StreamWriter output = new StreamWriter(buffer))
        {
            PerseusUtils.WriteMatrix(original, output);
            output.Flush();
            serialized = Encoding.UTF8.GetString(buffer.ToArray());
        }
    }

    // Read the text back into a fresh matrix.
    IMatrixData roundTripped = PerseusFactory.CreateMatrixData();
    ProcessInfo info = new ProcessInfo(new Settings(), status => { }, progress => { }, 1);
    PerseusUtils.ReadMatrix(roundTripped, info,
        () => new StreamReader(new MemoryStream(Encoding.UTF8.GetBytes(serialized))), "matrix1", '\t');

    Assert.AreEqual(2, roundTripped.RowCount);
    Assert.AreEqual(3, roundTripped.ColumnCount);
    Assert.AreEqual(2, roundTripped.StringColumnCount);
    Assert.AreEqual(1, roundTripped.NumericColumnCount);
    Assert.AreEqual(1, roundTripped.CategoryColumnCount);
    Assert.AreEqual(1, roundTripped.MultiNumericColumnCount);
    int strcol2Index = roundTripped.StringColumnNames.FindIndex(col => col.Equals("strcol2"));
    Assert.AreEqual("hallo", roundTripped.StringColumns[strcol2Index][1]);
    Assert.AreEqual(1, roundTripped.CategoryRowCount);
    Assert.AreEqual(1, roundTripped.NumericRowCount);
}
/// <summary>
/// Obtains annotation columns from <c>ProcessDataAddAnnotation</c> for the matrix's base identifiers and
/// appends them to the matrix as category, text and numeric columns (in that order).
/// </summary>
/// <param name="mdata">Matrix that is extended in place.</param>
/// <param name="para">Parameters forwarded to <c>GetBaseIds</c> and <c>ProcessDataAddAnnotation</c>.</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Forwarded to <c>ProcessDataAddAnnotation</c>.</param>
public void ProcessData(IMatrixData mdata, Parameters para, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo)
{
    string[] baseIds = GetBaseIds(para, mdata);

    string[] name;
    int[] catColInds, textColInds, numColInds;
    string[][][] catCols;
    string[][] textCols;
    double[][] numCols;
    if (!ProcessDataAddAnnotation(mdata.RowCount, para, baseIds, processInfo, out name, out catColInds,
        out textColInds, out numColInds, out catCols, out textCols, out numCols))
    {
        // Nothing is added when the helper reports failure.
        return;
    }

    // Each *ColInds array maps a produced column to its entry in the shared name array.
    for (int c = 0; c < catCols.Length; c++)
    {
        string columnName = name[catColInds[c]];
        mdata.AddCategoryColumn(columnName, "", catCols[c]);
    }
    for (int t = 0; t < textCols.Length; t++)
    {
        string columnName = name[textColInds[t]];
        mdata.AddStringColumn(columnName, "", textCols[t]);
    }
    for (int n = 0; n < numCols.Length; n++)
    {
        string columnName = name[numColInds[n]];
        mdata.AddNumericColumn(columnName, "", numCols[n]);
    }
}
/// <summary>
/// Creates the shared 3x3 test matrix and attaches a text column "test" with the values "a", "b", "a".
/// </summary>
public void Setup()
{
    double[,] values =
    {
        { 0.0, 1.0, 0.0 },
        { 0.0, 0.0, 0.0 },
        { 0.0, 1.0, 0.0 }
    };
    string[] labels = { "a", "b", "a" };

    _mdata = PerseusFactory.CreateMatrixData(values);
    _mdata.AddStringColumn("test", "", labels);
}
/// <summary>
/// Adds a text column holding, for every row, the unique mapped values of the row's ';'-separated
/// identifiers according to the selected annotation source.
/// </summary>
/// <param name="mdata">Matrix that is read and extended in place.</param>
/// <param name="para">Supplies "Source" with the sub-parameters "Identifiers" and "Identifier type".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Not used by this method.</param>
public void ProcessData(IMatrixData mdata, Parameters para, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo)
{
    GetAvailableTextAnnots(out string[] baseNames, out int[][] inds, out string[] files);

    ParameterWithSubParams<int> source = para.GetParamWithSubParams<int>("Source");
    int sourceIndex = source.Value;
    Parameters sourceSettings = source.GetSubParameters();
    int baseCol = sourceSettings.GetParam<int>("Identifiers").Value;
    int selection = sourceSettings.GetParam<int>("Identifier type").Value;

    HashSet<string> allIds = GetAllIds(mdata, baseCol);
    string file = files[sourceIndex];
    Dictionary<string, string[]> mapping = ReadMapping(allIds, file, inds[sourceIndex][selection]);

    string[] identifierColumn = mdata.StringColumns[baseCol];
    string[] mappedColumn = new string[identifierColumn.Length];
    for (int row = 0; row < identifierColumn.Length; row++)
    {
        string cell = identifierColumn[row];
        string[] identifiers;
        if (cell.Length > 0)
        {
            identifiers = cell.Split(';');
        }
        else
        {
            identifiers = new string[0];
        }

        List<string> mapped = new List<string>();
        foreach (string identifier in identifiers)
        {
            // The mapping is looked up with the lower-cased identifier.
            if (mapping.TryGetValue(identifier.ToLower(), out string[] targets))
            {
                mapped.AddRange(targets);
            }
        }
        mappedColumn[row] = StringUtils.Concat(";", ArrayUtils.UniqueValues(mapped));
    }

    string columnName = baseNames[sourceIndex];
    mdata.AddStringColumn(columnName, columnName, mappedColumn);
}
/// <summary>
/// Adds a text column "Short sequence window" containing, for every row, the substring of the selected
/// sequence window that starts at the 1-based "Start" position and spans "Length" characters.
/// </summary>
/// <remarks>
/// Every row is validated before anything is added, so a window that is too short in any row results in an
/// error string instead of an exception, and a matrix without rows simply receives an empty column.
/// </remarks>
/// <param name="mdata">Matrix that is read and extended in place.</param>
/// <param name="param">Supplies "Sequence window" (text column index), "Start" (1-based) and "Length".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives the error string when the parameters do not fit the data.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo)
{
    int stringColumnIndx = param.GetParam<int>("Sequence window").Value;
    string[] win = mdata.StringColumns[stringColumnIndx];
    // "Start" is 1-based in the parameters; convert to a 0-based offset.
    int start = param.GetParam<int>("Start").Value - 1;
    int length = param.GetParam<int>("Length").Value;
    if (start < 0)
    {
        processInfo.ErrString = "Start position cannot be smaller than 1.";
        return;
    }
    if (length < 0)
    {
        processInfo.ErrString = "Length cannot be negative.";
        return;
    }
    string[] shortenedMotifs = new string[win.Length];
    for (int i = 0; i < win.Length; ++i)
    {
        // Checked per row (not only for the first one); written as a subtraction so that
        // start + length cannot overflow.
        if (length > win[i].Length - start)
        {
            processInfo.ErrString = "Start + length cannot exceed the total length of the sequence.";
            return;
        }
        shortenedMotifs[i] = win[i].Substring(start, length);
    }
    mdata.AddStringColumn("Short sequence window", "", shortenedMotifs);
}
/// <summary>
/// Duplicates the selected main, numeric, multi-numeric, categorical and text columns of the matrix.
/// Every copy gets a name produced by <c>StringUtils.GetNextAvailableName</c> from the source column's name
/// and the names already present among columns of the same kind.
/// </summary>
/// <remarks>
/// Text column copies take their description from the text column descriptions (the previous code read
/// the main column descriptions at the same index). Multi-numeric copies are deep copies, so editing a
/// row of the duplicate no longer changes the source column.
/// </remarks>
/// <param name="data">Matrix that is extended in place.</param>
/// <param name="param">Supplies the index arrays "Main columns", "Numerical columns",
/// "Multi-numerical columns", "Categorical columns" and "Text columns".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Not used by this method.</param>
public void ProcessData(IMatrixData data, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo)
{
    int[] exColInds = param.GetParam<int[]>("Main columns").Value;
    int[] numColInds = param.GetParam<int[]>("Numerical columns").Value;
    int[] multiNumColInds = param.GetParam<int[]>("Multi-numerical columns").Value;
    int[] catColInds = param.GetParam<int[]>("Categorical columns").Value;
    int[] textColInds = param.GetParam<int[]>("Text columns").Value;

    if (exColInds.Length > 0)
    {
        // Re-extract all existing main columns followed by the selected ones; the appended copies
        // start at index ncol and are renamed afterwards.
        int ncol = data.ColumnCount;
        data.ExtractColumns(ArrayUtils.Concat(ArrayUtils.ConsecutiveInts(data.ColumnCount), exColInds));
        HashSet<string> taken = new HashSet<string>(data.ColumnNames);
        for (int i = 0; i < exColInds.Length; i++)
        {
            string s = StringUtils.GetNextAvailableName(data.ColumnNames[ncol + i], taken);
            data.ColumnNames[ncol + i] = s;
            taken.Add(s);
        }
    }

    // In the loops below the set of taken names is rebuilt from the matrix on every iteration, so it
    // already contains the names added by earlier iterations.
    foreach (int ind in numColInds)
    {
        HashSet<string> taken = new HashSet<string>(data.NumericColumnNames);
        string s = StringUtils.GetNextAvailableName(data.NumericColumnNames[ind], taken);
        data.AddNumericColumn(s, data.NumericColumnDescriptions[ind], (double[]) data.NumericColumns[ind].Clone());
    }

    foreach (int ind in multiNumColInds)
    {
        HashSet<string> taken = new HashSet<string>(data.MultiNumericColumnNames);
        string s = StringUtils.GetNextAvailableName(data.MultiNumericColumnNames[ind], taken);
        // Array.Clone on a jagged array is shallow; copy every row so the duplicate owns its values.
        double[][] source = data.MultiNumericColumns[ind];
        double[][] copy = new double[source.Length][];
        for (int row = 0; row < source.Length; row++)
        {
            copy[row] = source[row] == null ? null : (double[]) source[row].Clone();
        }
        data.AddMultiNumericColumn(s, data.MultiNumericColumnDescriptions[ind], copy);
    }

    foreach (int ind in catColInds)
    {
        HashSet<string> taken = new HashSet<string>(data.CategoryColumnNames);
        string s = StringUtils.GetNextAvailableName(data.CategoryColumnNames[ind], taken);
        data.AddCategoryColumn(s, data.CategoryColumnDescriptions[ind], data.GetCategoryColumnAt(ind));
    }

    foreach (int ind in textColInds)
    {
        HashSet<string> taken = new HashSet<string>(data.StringColumnNames);
        string s = StringUtils.GetNextAvailableName(data.StringColumnNames[ind], taken);
        // Use the text column's own description, not the main column description at the same index.
        data.AddStringColumn(s, data.StringColumnDescriptions[ind], (string[]) data.StringColumns[ind].Clone());
    }
}
/// <summary>
/// Maps the identifiers of the chosen text column through the annotation provider and appends the result
/// as a text column named (and described) after the selected text source's id.
/// </summary>
/// <param name="mdata">Matrix that is read and extended in place.</param>
/// <param name="para">Supplies "Source" with the sub-parameters "Identifiers" and "Identifier type".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Not used by this method.</param>
public void ProcessData(IMatrixData mdata, Parameters para, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo)
{
    var provider = _annotationProvider;

    ParameterWithSubParams<int> source = para.GetParamWithSubParams<int>("Source");
    int sourceIndex = source.Value;
    Parameters sourceSettings = source.GetSubParameters();
    int identifierColumn = sourceSettings.GetParam<int>("Identifiers").Value;
    int identifierType = sourceSettings.GetParam<int>("Identifier type").Value;

    // Only the middle element of the text source tuple is needed here.
    var (_, sourceId, _) = provider.TextSources()[sourceIndex];
    var mapped = provider.MapToBaseIdentifiers(mdata.StringColumns[identifierColumn], sourceIndex, identifierType);
    mdata.AddStringColumn(sourceId, sourceId, mapped);
}
/// <summary>
/// Upper-cases the values of the selected text columns. Depending on "Keep original columns" the result
/// is either appended as a new text column with the source column's name and description, or it
/// replaces the source column.
/// </summary>
/// <param name="mdata">Matrix that is modified in place.</param>
/// <param name="param">Supplies "Columns" (text column indices) and "Keep original columns".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Not used by this method.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo)
{
    int[] selectedColumns = param.GetParam<int[]>("Columns").Value;
    bool keepOriginals = param.GetParam<bool>("Keep original columns").Value;

    for (int k = 0; k < selectedColumns.Length; k++)
    {
        int column = selectedColumns[k];
        string[] upperCased = (from text in mdata.StringColumns[column] select text.ToUpper()).ToArray();

        if (!keepOriginals)
        {
            // Overwrite the source column.
            mdata.StringColumns[column] = upperCased;
            continue;
        }

        // Append a copy that reuses the source column's name and description.
        string columnName = mdata.StringColumnNames[column];
        string columnDescription = mdata.StringColumnDescriptions[column];
        mdata.AddStringColumn(columnName, columnDescription, upperCased);
    }
}
/// <summary>
/// Adds a text column "Short sequence window" containing, for every row, the substring of the selected
/// sequence window that starts at the 1-based "Start" position and spans "Length" characters.
/// </summary>
/// <remarks>
/// Every row is validated before anything is added, so a window that is too short in any row results in an
/// error string instead of an exception, and a matrix without rows simply receives an empty column.
/// </remarks>
/// <param name="mdata">Matrix that is read and extended in place.</param>
/// <param name="param">Supplies "Sequence window" (text column index), "Start" (1-based) and "Length".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives the error string when the parameters do not fit the data.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo){
    int stringColumnIndx = param.GetParam<int>("Sequence window").Value;
    string[] win = mdata.StringColumns[stringColumnIndx];
    // "Start" is 1-based in the parameters; convert to a 0-based offset.
    int start = param.GetParam<int>("Start").Value - 1;
    int length = param.GetParam<int>("Length").Value;
    if (start < 0){
        processInfo.ErrString = "Start position cannot be smaller than 1.";
        return;
    }
    if (length < 0){
        processInfo.ErrString = "Length cannot be negative.";
        return;
    }
    string[] shortenedMotifs = new string[win.Length];
    for (int i = 0; i < win.Length; ++i){
        // Checked per row (not only for the first one); written as a subtraction so that
        // start + length cannot overflow.
        if (length > win[i].Length - start){
            processInfo.ErrString = "Start + length cannot exceed the total length of the sequence.";
            return;
        }
        shortenedMotifs[i] = win[i].Substring(start, length);
    }
    mdata.AddStringColumn("Short sequence window", "", shortenedMotifs);
}
/// <summary>
/// Runs <c>ExpandSiteTable</c> on a two-row matrix with the main columns "Col___1".."Col___3" and
/// "No expand" and checks the resulting main columns, descriptions, row count and annotation columns.
/// </summary>
public void TestSmallExample()
{
    // Input matrix.
    double[,] values = { { 0.0, 1.0, 0, 5 }, { 2.0, 3.0, 0, 5 } };
    List<string> columnNames = new List<string> { "Col___1", "Col___2", "Col___3", "No expand" };
    IMatrixData mdata = PerseusFactory.CreateMatrixData(values, columnNames);
    mdata.ColumnDescriptions = new List<string> { "Description Col", "Col", "Col", "Description No expand" };

    double[][] multiNum = { new[] { 0.0, 1.0 }, new[] { 2.0 } };
    mdata.AddMultiNumericColumn("MultiNum", "", multiNum);
    string[] stringCol = { "row1", "row2" };
    mdata.AddStringColumn("String", "", stringCol);

    // Run the processing with default-constructed parameters.
    ExpandSiteTable expand = new ExpandSiteTable();
    IMatrixData[] supplData = null;
    IDocumentData[] docs = null;
    expand.ProcessData(mdata, new Parameters(), ref supplData, ref docs, CreateProcessInfo());

    // Main columns and their descriptions.
    Assert.AreEqual(2, mdata.ColumnCount);
    string[] expectedNames = { "No expand", "Col" };
    CollectionAssert.AreEqual(expectedNames, mdata.ColumnNames.ToArray());
    Assert.AreEqual(2, mdata.ColumnDescriptions.Count);
    string[] expectedDescriptions = { "Description No expand", "Description Col" };
    CollectionAssert.AreEqual(expectedDescriptions, mdata.ColumnDescriptions.ToArray());

    // Rows: the expected annotation values are the inputs repeated three times.
    Assert.AreEqual(6, mdata.RowCount);
    Assert.AreEqual(2, mdata.StringColumnCount);
    string[] expectedStringNames = { "String", "Unique identifier" };
    CollectionAssert.AreEqual(expectedStringNames, mdata.StringColumnNames);
    string[] expectedStrings = stringCol.Concat(stringCol).Concat(stringCol).ToArray();
    CollectionAssert.AreEqual(expectedStrings, mdata.StringColumns[0]);
    Assert.AreEqual(1, mdata.MultiNumericColumnCount);
    double[][] expectedMultiNum = multiNum.Concat(multiNum).Concat(multiNum).ToArray();
    CollectionAssert.AreEqual(expectedMultiNum, mdata.MultiNumericColumns[0]);
}
/// <summary>
/// Annotates each row with the PhosphoSitePlus sites of the chosen modification that match one of the
/// row's accessions and one of its sequence windows. Adds the text column "PhosphoSitePlus window" and
/// the category columns "Known site" ("+" on a hit) and "Origin" (sorted subset of "LTP", "HTP", "CST").
/// </summary>
/// <param name="mdata">Matrix that is read and extended in place.</param>
/// <param name="param">Supplies "Modification", "Uniprot column" and "Sequence window".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives the error string when the parser yields no data.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo)
{
    string mod = param.GetParam<int>("Modification").StringValue;
    PhosphoSitePlusParser.ParseKnownMod(mod, out string[] seqWins, out string[] accs, out string[] pubmedLtp,
        out string[] pubmedMs2, out string[] cstMs2, out string[] species);
    if (seqWins == null)
    {
        processInfo.ErrString = "File does not exist.";
        return;
    }

    // Split the ';'-separated accession list of every row; empty cells give an empty list.
    string[] accessionColumn = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    string[][] rowAccessions = new string[accessionColumn.Length][];
    for (int row = 0; row < accessionColumn.Length; row++)
    {
        string cell = accessionColumn[row];
        if (cell.Length > 0)
        {
            rowAccessions[row] = cell.Split(';');
        }
        else
        {
            rowAccessions[row] = new string[0];
        }
    }
    string[] windowColumn = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];

    // Accession -> indices into the parsed PhosphoSitePlus arrays.
    Dictionary<string, List<int>> siteIndicesByAccession = new Dictionary<string, List<int>>();
    for (int site = 0; site < seqWins.Length; site++)
    {
        if (!siteIndicesByAccession.TryGetValue(accs[site], out List<int> indices))
        {
            indices = new List<int>();
            siteIndicesByAccession.Add(accs[site], indices);
        }
        indices.Add(site);
    }

    string[] newCol = new string[rowAccessions.Length];
    string[][] newCatCol = new string[rowAccessions.Length][];
    string[][] originCol = new string[rowAccessions.Length][];
    for (int row = 0; row < newCol.Length; row++)
    {
        string[] rowWindows = TransformIl(windowColumn[row]).Split(';');
        HashSet<string> matchedWindows = new HashSet<string>();
        HashSet<string> origins = new HashSet<string>();
        foreach (string accession in rowAccessions[row])
        {
            if (!siteIndicesByAccession.TryGetValue(accession, out List<int> candidates))
            {
                continue;
            }
            foreach (int site in candidates)
            {
                string siteWindow = seqWins[site];
                // Compare on the upper-cased window without its first and last character.
                string trimmed = TransformIl(siteWindow.ToUpper().Substring(1, siteWindow.Length - 2));
                if (!Contains(rowWindows, trimmed))
                {
                    continue;
                }
                matchedWindows.Add(siteWindow);
                // A non-empty evidence field marks the corresponding origin.
                if (pubmedLtp[site].Length > 0)
                {
                    origins.Add("LTP");
                }
                if (pubmedMs2[site].Length > 0)
                {
                    origins.Add("HTP");
                }
                if (cstMs2[site].Length > 0)
                {
                    origins.Add("CST");
                }
            }
        }

        if (matchedWindows.Count == 0)
        {
            newCol[row] = "";
            newCatCol[row] = new string[0];
            originCol[row] = new string[0];
            continue;
        }
        newCol[row] = StringUtils.Concat(";", ArrayUtils.ToArray(matchedWindows));
        newCatCol[row] = new[] { "+" };
        string[] sortedOrigins = ArrayUtils.ToArray(origins);
        Array.Sort(sortedOrigins);
        originCol[row] = sortedOrigins;
    }

    mdata.AddStringColumn("PhosphoSitePlus window", "", newCol);
    mdata.AddCategoryColumn("Known site", "", newCatCol);
    mdata.AddCategoryColumn("Origin", "", originCol);
}
/// <summary>
/// Annotates the rows of the matrix from a fasta file: fasta header text annotations, numeric sequence
/// annotations, theoretical peptide counts per protease and, optionally, regex feature counts.
/// </summary>
/// <remarks>
/// The fasta entries of a row are looked up once, using either all protein IDs or only the leading ID
/// depending on the "Fasta header annotations" choice, and are reused by every later section.
/// NOTE(review): the later sections therefore only ever see the leading entry when the header
/// annotations are restricted to the leading ID - confirm that this coupling is intended.
/// </remarks>
/// <param name="mdata">Matrix that is read and extended in place.</param>
/// <param name="param">Supplies "Protein IDs", "Fasta file" and the parameter groups
/// "Fasta header annotations", "Numeric annotations", "Calculate theoretical peptides" and
/// "Count sequence features".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives status messages and the error string for an invalid regex.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo){
    int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
    string[][] proteinIds = new string[mdata.RowCount][];
    string[][] leadingIds = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++){
        proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
        leadingIds[row] = new[]{proteinIds[row][0]};
    }
    string fastaFilePath = param.GetParam<string>("Fasta file").Value;
    Fasta fasta = new Fasta();
    fasta.ParseFile(fastaFilePath, processInfo);

    // Text annotations
    processInfo.Status("Adding fasta header annotations.");
    var headerParam = param.GetParamWithSubParams<int>("Fasta header annotations");
    int[] selection = headerParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
    string[][] idsToBeAnnotated = (headerParam.Value == 0) ? proteinIds : leadingIds;
    ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++){
        List<ProteinSequence> rowEntries = new List<ProteinSequence>();
        foreach (string id in idsToBeAnnotated[row]){
            ProteinSequence entry = fasta.GetEntry(id);
            if (entry == null){
                // IDs without a fasta entry are skipped.
                continue;
            }
            rowEntries.Add(entry);
        }
        fastaEntries[row] = rowEntries.ToArray();
    }
    if (ArrayUtils.Contains(selection, 0)){
        AddTextAnnotationColumn(mdata, fastaEntries, "Entry name", entry => entry.EntryName);
    }
    if (ArrayUtils.Contains(selection, 1)){
        AddTextAnnotationColumn(mdata, fastaEntries, "Gene name", entry => entry.GeneName);
    }
    if (ArrayUtils.Contains(selection, 2)){
        // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
        // 'Isoform x of...' prefixes and '(Fragment)' suffixes
        AddTextAnnotationColumn(mdata, fastaEntries, "Protein name (verbose)", entry => entry.ProteinName);
    }
    if (ArrayUtils.Contains(selection, 3)){
        // Consensus protein name
        AddTextAnnotationColumn(mdata, fastaEntries, "Protein name", entry => entry.ConsensusProteinName);
    }
    if (ArrayUtils.Contains(selection, 4)){
        AddTextAnnotationColumn(mdata, fastaEntries, "Species", entry => entry.Species);
    }

    // Numeric annotations
    processInfo.Status("Adding numeric annotations.");
    var numericParam = param.GetParamWithSubParams<int>("Numeric annotations");
    selection = numericParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
    bool annotateLeadingId = (numericParam.Value == 1);
    if (ArrayUtils.Contains(selection, 0)){
        AddMedianAnnotationColumn(mdata, fastaEntries, "Sequence length", annotateLeadingId,
            entry => entry.GetSequence().Length);
    }
    if (ArrayUtils.Contains(selection, 1)){
        AddMedianAnnotationColumn(mdata, fastaEntries, "Monoisotopic molecular mass", annotateLeadingId,
            entry => entry.GetMonoisotopicMolecularMass());
    }
    if (ArrayUtils.Contains(selection, 2)){
        AddMedianAnnotationColumn(mdata, fastaEntries, "Average molecular mass", annotateLeadingId,
            entry => entry.GetAverageMolecularMass());
    }

    // Theoretical peptides
    processInfo.Status("Calculating theoretical peptides.");
    var peptideParam = param.GetParamWithSubParams<int>("Calculate theoretical peptides");
    var peptideSettings = peptideParam.GetSubParameters();
    annotateLeadingId = (peptideParam.Value == 1);
    Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
        peptideSettings.GetParam<int[]>("Proteases").Value);
    double minLength = peptideSettings.GetParam<double>("Min. peptide length").Value;
    double maxLength = peptideSettings.GetParam<double>("Max. peptide length").Value;
    // Peptide sequences are only listed when a single (leading) entry is annotated per row.
    bool displayPeptideSequences = annotateLeadingId && peptideSettings.GetParam<bool>("Show sequences").Value;
    foreach (Protease protease in proteases){
        double[] annotationColumn = new double[mdata.RowCount];
        string[] peptideColumn = new string[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++){
            List<double> rowAnnotations = new List<double>();
            List<string> rowPeptides = new List<string>();
            foreach (ProteinSequence entry in fastaEntries[row]){
                double nTheoreticalPeptides = entry.GetNumberOfTheoreticalPeptides(protease, (int) minLength,
                    (int) maxLength);
                rowAnnotations.Add(nTheoreticalPeptides);
                if (displayPeptideSequences){
                    rowPeptides.AddRange(entry.GetTheoreticalPeptideSequences(protease, (int) minLength,
                        (int) maxLength));
                }
                if (annotateLeadingId){
                    // Only the first available entry is used.
                    break;
                }
            }
            annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
            peptideColumn[row] = String.Join(";", rowPeptides);
        }
        mdata.AddNumericColumn(
            "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
            annotationColumn);
        if (displayPeptideSequences){
            mdata.AddStringColumn(
                "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
                peptideColumn);
        }
    }

    // Sequence features
    processInfo.Status("Counting sequence features.");
    var featureParam = param.GetParamWithSubParams<int>("Count sequence features");
    var featureSettings = featureParam.GetSubParameters();
    annotateLeadingId = (featureParam.Value == 1);
    bool normalizeBySequenceLength = featureSettings.GetParam<bool>("Normalize by sequence length").Value;
    string pattern = featureSettings.GetParam<string>("Regex").Value;
    if (pattern != ""){
        Regex regex;
        try{
            regex = new Regex(pattern);
        } catch (ArgumentException){
            processInfo.ErrString = "The regular expression you provided has invalid syntax.";
            return;
        }
        AddMedianAnnotationColumn(mdata, fastaEntries,
            (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")",
            annotateLeadingId, entry => {
                double nFeatures = regex.Matches(entry.GetSequence()).Count;
                return normalizeBySequenceLength ? nFeatures/entry.GetLength() : nFeatures;
            });
    }
    processInfo.Status("Done.");
}

/// <summary>
/// Adds a text column in which every row holds the distinct non-null values returned by
/// <paramref name="getAnnotation"/> for the row's fasta entries, joined with ';' in order of first
/// occurrence.
/// </summary>
private static void AddTextAnnotationColumn(IMatrixData mdata, ProteinSequence[][] fastaEntries, string columnName,
    Func<ProteinSequence, string> getAnnotation){
    string[] annotationColumn = new string[mdata.RowCount];
    for (int row = 0; row < mdata.RowCount; row++){
        List<string> rowAnnotations = new List<string>();
        foreach (ProteinSequence entry in fastaEntries[row]){
            string annotation = getAnnotation(entry);
            if (annotation != null && !ArrayUtils.Contains(rowAnnotations, annotation)){
                rowAnnotations.Add(annotation);
            }
        }
        annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
    }
    mdata.AddStringColumn(columnName, "", annotationColumn);
}

/// <summary>
/// Adds a numeric column in which every row holds <c>ArrayUtils.Median</c> of the values returned by
/// <paramref name="getValue"/> for the row's fasta entries; with <paramref name="firstEntryOnly"/> only
/// the first entry of each row is evaluated.
/// </summary>
private static void AddMedianAnnotationColumn(IMatrixData mdata, ProteinSequence[][] fastaEntries,
    string columnName, bool firstEntryOnly, Func<ProteinSequence, double> getValue){
    double[] annotationColumn = new double[mdata.RowCount];
    for (int row = 0; row < mdata.RowCount; row++){
        List<double> rowAnnotations = new List<double>();
        foreach (ProteinSequence entry in fastaEntries[row]){
            rowAnnotations.Add(getValue(entry));
            if (firstEntryOnly){
                break;
            }
        }
        annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
    }
    mdata.AddNumericColumn(columnName, "", annotationColumn);
}
/// <summary>
/// Adds two text columns, "PhosphoSitePlus kinase" and "PhosphoSitePlus kinase uniprot", listing for each
/// row the kinases whose PhosphoSitePlus substrate entry matches one of the row's accessions and one of its
/// sequence windows.
/// </summary>
/// <param name="mdata">Matrix that is read and extended in place.</param>
/// <param name="param">Supplies the "Uniprot column" and "Sequence window" text column indices.</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives the error string when the parser yields no data.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo){
    string[] seqWins, subAccs, kinases, kinAccs, species;
    PhosphoSitePlusParser.ParseKinaseSubstrate(out seqWins, out subAccs, out kinases, out kinAccs, out species);
    if (seqWins == null){
        processInfo.ErrString = "File does not exist.";
        return;
    }

    // Split the ';'-separated accession list of every row; empty cells give an empty list.
    string[] accessionColumn = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    string[][] rowAccessions = new string[accessionColumn.Length][];
    for (int row = 0; row < accessionColumn.Length; row++){
        string cell = accessionColumn[row];
        if (cell.Length > 0){
            rowAccessions[row] = cell.Split(';');
        } else{
            rowAccessions[row] = new string[0];
        }
    }
    string[] windowColumn = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];

    // Group (sequence window, kinase name, kinase accession) triples by substrate accession.
    Dictionary<string, List<Tuple<string, string, string>>> substratesByAccession =
        new Dictionary<string, List<Tuple<string, string, string>>>();
    for (int k = 0; k < seqWins.Length; k++){
        List<Tuple<string, string, string>> bucket;
        if (!substratesByAccession.TryGetValue(subAccs[k], out bucket)){
            bucket = new List<Tuple<string, string, string>>();
            substratesByAccession.Add(subAccs[k], bucket);
        }
        bucket.Add(Tuple.Create(seqWins[k], kinases[k], kinAccs[k]));
    }

    string[] kinaseNameColumn = new string[rowAccessions.Length];
    string[] kinaseUniprotColumn = new string[rowAccessions.Length];
    for (int row = 0; row < kinaseNameColumn.Length; row++){
        string[] rowWindows = AddKnownSites.TransformIl(windowColumn[row]).Split(';');
        HashSet<string> nameHits = new HashSet<string>();
        HashSet<string> accessionHits = new HashSet<string>();
        foreach (string accession in rowAccessions[row]){
            List<Tuple<string, string, string>> candidates;
            if (!substratesByAccession.TryGetValue(accession, out candidates)){
                continue;
            }
            foreach (Tuple<string, string, string> candidate in candidates){
                string window = candidate.Item1;
                // Compare on the upper-cased window without its first and last character.
                string trimmed = AddKnownSites.TransformIl(window.ToUpper().Substring(1, window.Length - 2));
                if (!AddKnownSites.Contains(rowWindows, trimmed)){
                    continue;
                }
                nameHits.Add(candidate.Item2);
                accessionHits.Add(candidate.Item3);
            }
        }
        kinaseNameColumn[row] = nameHits.Count > 0 ? StringUtils.Concat(";", ArrayUtils.ToArray(nameHits)) : "";
        kinaseUniprotColumn[row] = accessionHits.Count > 0
            ? StringUtils.Concat(";", ArrayUtils.ToArray(accessionHits))
            : "";
    }
    mdata.AddStringColumn("PhosphoSitePlus kinase", "", kinaseNameColumn);
    mdata.AddStringColumn("PhosphoSitePlus kinase uniprot", "", kinaseUniprotColumn);
}
/// <summary>
/// Stacks the rows of two matrices into one new matrix. Columns of every kind (main, numeric, string,
/// category, multi-numeric) are aligned by name via <c>SpecialSort</c>; where a column exists in only
/// one input, the rows coming from the other input keep a default value (NaN, "", or an empty array).
/// A string column "Matrix Name" holding the name of the source matrix of each row is appended.
/// </summary>
/// <param name="inputData">The two matrices to combine; rows of element 0 come first, then element 1.</param>
/// <param name="param">Not used by this method.</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Not used by this method.</param>
/// <returns>The newly created combined matrix.</returns>
public IMatrixData ProcessData(IMatrixData[] inputData, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    const string matrixNameColumn = "Matrix Name";
    IMatrixData mdata1 = inputData[0];
    IMatrixData mdata2 = inputData[1];
    int nrows1 = mdata1.RowCount;
    int nrows2 = mdata2.RowCount;
    int nrows = nrows1 + nrows2;

    // Main columns. dic1/dic2 map a combined column name to its index in the respective input.
    string[] expColNames = SpecialSort(mdata1.ColumnNames, mdata2.ColumnNames,
        out Dictionary<string, int> dic1, out Dictionary<string, int> dic2);
    double[,] ex = new double[nrows, expColNames.Length];
    for (int i = 0; i < nrows; i++) {
        for (int j = 0; j < expColNames.Length; j++) {
            ex[i, j] = double.NaN;
        }
    }
    for (int i = 0; i < expColNames.Length; i++) {
        if (dic1.TryGetValue(expColNames[i], out int ind1)) {
            for (int j = 0; j < nrows1; j++) {
                ex[j, i] = mdata1.Values.Get(j, ind1);
            }
        }
        if (dic2.TryGetValue(expColNames[i], out int ind2)) {
            for (int j = 0; j < nrows2; j++) {
                ex[nrows1 + j, i] = mdata2.Values.Get(j, ind2);
            }
        }
    }

    // Numeric columns, default NaN.
    string[] numColNames = SpecialSort(mdata1.NumericColumnNames, mdata2.NumericColumnNames, out dic1, out dic2);
    List<double[]> numCols = new List<double[]>(numColNames.Length);
    foreach (string name in numColNames) {
        double[] col = new double[nrows];
        for (int j = 0; j < nrows; j++) {
            col[j] = double.NaN;
        }
        if (dic1.TryGetValue(name, out int ind1)) {
            for (int j = 0; j < nrows1; j++) {
                col[j] = mdata1.NumericColumns[ind1][j];
            }
        }
        if (dic2.TryGetValue(name, out int ind2)) {
            for (int j = 0; j < nrows2; j++) {
                col[nrows1 + j] = mdata2.NumericColumns[ind2][j];
            }
        }
        numCols.Add(col);
    }

    // String columns, default "".
    string[] stringColNames = SpecialSort(mdata1.StringColumnNames, mdata2.StringColumnNames, out dic1, out dic2);
    List<string[]> stringCols = new List<string[]>(stringColNames.Length);
    foreach (string name in stringColNames) {
        string[] col = new string[nrows];
        for (int j = 0; j < nrows; j++) {
            col[j] = "";
        }
        if (dic1.TryGetValue(name, out int ind1)) {
            for (int j = 0; j < nrows1; j++) {
                col[j] = mdata1.StringColumns[ind1][j];
            }
        }
        if (dic2.TryGetValue(name, out int ind2)) {
            for (int j = 0; j < nrows2; j++) {
                col[nrows1 + j] = mdata2.StringColumns[ind2][j];
            }
        }
        stringCols.Add(col);
    }

    // Category columns, default: no category.
    string[] catColNames = SpecialSort(mdata1.CategoryColumnNames, mdata2.CategoryColumnNames, out dic1, out dic2);
    List<string[][]> catCols = new List<string[][]>(catColNames.Length);
    foreach (string name in catColNames) {
        string[][] col = new string[nrows][];
        for (int j = 0; j < nrows; j++) {
            col[j] = new string[0];
        }
        if (dic1.TryGetValue(name, out int ind1)) {
            for (int j = 0; j < nrows1; j++) {
                col[j] = mdata1.GetCategoryColumnEntryAt(ind1, j);
            }
        }
        if (dic2.TryGetValue(name, out int ind2)) {
            for (int j = 0; j < nrows2; j++) {
                col[nrows1 + j] = mdata2.GetCategoryColumnEntryAt(ind2, j);
            }
        }
        catCols.Add(col);
    }

    // Multi-numeric columns, default: no values.
    string[] multiNumColNames = SpecialSort(mdata1.MultiNumericColumnNames, mdata2.MultiNumericColumnNames,
        out dic1, out dic2);
    List<double[][]> multiNumCols = new List<double[][]>(multiNumColNames.Length);
    foreach (string name in multiNumColNames) {
        double[][] col = new double[nrows][];
        for (int j = 0; j < nrows; j++) {
            col[j] = new double[0];
        }
        if (dic1.TryGetValue(name, out int ind1)) {
            for (int j = 0; j < nrows1; j++) {
                col[j] = mdata1.MultiNumericColumns[ind1][j];
            }
        }
        if (dic2.TryGetValue(name, out int ind2)) {
            for (int j = 0; j < nrows2; j++) {
                col[nrows1 + j] = mdata2.MultiNumericColumns[ind2][j];
            }
        }
        multiNumCols.Add(col);
    }

    // Per-row name of the matrix the row originates from.
    string[] sourceNames = new string[nrows];
    string name1 = mdata1.Name;
    string name2 = mdata2.Name;
    for (int i = 0; i < nrows1; i++) {
        sourceNames[i] = name1;
    }
    for (int i = 0; i < nrows2; i++) {
        sourceNames[nrows1 + i] = name2;
    }

    // TODO: check whether the name of the matrix has been changed.
    IMatrixData result = PerseusFactory.CreateMatrixData(ex, expColNames.ToList());
    // Names double as descriptions, but each property gets its own list: sharing one instance would
    // make any later add-column call append to "both" lists at once and corrupt the bookkeeping.
    result.NumericColumnNames = new List<string>(numColNames);
    result.NumericColumnDescriptions = new List<string>(numColNames);
    result.NumericColumns = numCols;
    result.StringColumnNames = new List<string>(stringColNames);
    // Keep string descriptions in step with the names, as for the other column kinds, so the
    // description appended by AddStringColumn below lines up with its column.
    result.StringColumnDescriptions = new List<string>(stringColNames);
    result.StringColumns = stringCols;
    result.CategoryColumnNames = new List<string>(catColNames);
    result.CategoryColumnDescriptions = new List<string>(catColNames);
    result.CategoryColumns = catCols;
    result.MultiNumericColumnNames = new List<string>(multiNumColNames);
    result.MultiNumericColumnDescriptions = new List<string>(multiNumColNames);
    result.MultiNumericColumns = multiNumCols;
    result.AddStringColumn(matrixNameColumn, matrixNameColumn, sourceNames);
    return result;
}
/// <summary>
/// Marks the rows of <paramref name="mdata"/> whose sequence windows occur in the PhosphoSitePlus table
/// of known sites for the selected modification. Three columns are appended: the string column
/// "PhosphoSitePlus window" (matched reference windows, ';'-joined), the category column "Known site"
/// ("+" for rows with a match) and the category column "Origin" (sorted subset of "LTP", "HTP", "CST").
/// </summary>
/// <param name="mdata">Matrix that is annotated in place.</param>
/// <param name="param">Supplies "Modification" and the string-column indices "Uniprot column" and
/// "Sequence window".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives "File does not exist." when the parser returns no sequence windows.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string mod = param.GetParam<int>("Modification").StringValue;
    // The species array is returned by the parser but is not needed here.
    PhosphoSitePlusParser.ParseKnownMod(mod, out string[] seqWins, out string[] accs, out string[] pubmedLtp,
        out string[] pubmedMs2, out string[] cstMs2, out string[] _);
    if (seqWins == null) {
        processInfo.ErrString = "File does not exist.";
        return;
    }

    // Each cell of the accession column is a ';'-separated list; an empty cell yields no accessions.
    string[] accessionCells = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    int rowCount = accessionCells.Length;
    string[][] accessionsPerRow = new string[rowCount][];
    for (int row = 0; row < rowCount; row++) {
        string cell = accessionCells[row];
        accessionsPerRow[row] = cell.Length > 0 ? cell.Split(';') : new string[0];
    }
    string[] windowCells = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];

    // Accession -> indices of the reference entries that belong to it.
    Dictionary<string, List<int>> indicesByAccession = new Dictionary<string, List<int>>();
    for (int k = 0; k < seqWins.Length; k++) {
        if (!indicesByAccession.TryGetValue(accs[k], out List<int> indices)) {
            indices = new List<int>();
            indicesByAccession.Add(accs[k], indices);
        }
        indices.Add(k);
    }

    string[] newCol = new string[rowCount];
    string[][] newCatCol = new string[rowCount][];
    string[][] originCol = new string[rowCount][];
    for (int row = 0; row < rowCount; row++) {
        string[] rowWindows = TransformIl(windowCells[row]).Split(';');
        HashSet<string> matchedWindows = new HashSet<string>();
        HashSet<string> evidence = new HashSet<string>();
        foreach (string accession in accessionsPerRow[row]) {
            if (!indicesByAccession.TryGetValue(accession, out List<int> siteIndices)) {
                continue;
            }
            foreach (int site in siteIndices) {
                string reference = seqWins[site];
                // The reference window is upper-cased and compared without its first and last character.
                string core = reference.ToUpper().Substring(1, reference.Length - 2);
                if (!Contains(rowWindows, TransformIl(core))) {
                    continue;
                }
                matchedWindows.Add(reference);
                // A non-empty entry in the respective reference array marks the evidence type.
                if (pubmedLtp[site].Length > 0) {
                    evidence.Add("LTP");
                }
                if (pubmedMs2[site].Length > 0) {
                    evidence.Add("HTP");
                }
                if (cstMs2[site].Length > 0) {
                    evidence.Add("CST");
                }
            }
        }
        if (matchedWindows.Count == 0) {
            newCol[row] = "";
            newCatCol[row] = new string[0];
            originCol[row] = new string[0];
            continue;
        }
        newCol[row] = StringUtils.Concat(";", ArrayUtils.ToArray(matchedWindows));
        newCatCol[row] = new[] { "+" };
        string[] sortedEvidence = ArrayUtils.ToArray(evidence);
        Array.Sort(sortedEvidence);
        originCol[row] = sortedEvidence;
    }
    mdata.AddStringColumn("PhosphoSitePlus window", "", newCol);
    mdata.AddCategoryColumn("Known site", "", newCatCol);
    mdata.AddCategoryColumn("Origin", "", originCol);
}
/// <summary>
/// Adds annotation columns derived from a FASTA file to <paramref name="mdata"/>: text annotations
/// taken from the FASTA entries (entry name, gene name, protein names, species), numeric annotations
/// (sequence length, molecular masses), theoretical peptide counts per protease and, if a regular
/// expression is given, sequence feature counts. Numeric values of several entries in one row are
/// reduced with <c>ArrayUtils.Median</c>.
/// </summary>
/// <param name="mdata">Matrix that is annotated in place.</param>
/// <param name="param">Supplies "Protein IDs", "Fasta file" and the parameter groups "Fasta header
/// annotations", "Numeric annotations", "Calculate theoretical peptides" and "Count sequence features".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives status messages, and an error message when the regular expression
/// is invalid (columns added before that point remain in place).</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int rowCount = mdata.RowCount;
    int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
    string[][] proteinIds = new string[rowCount][];
    string[][] leadingIds = new string[rowCount][];
    for (int row = 0; row < rowCount; row++) {
        proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
        // Split always yields at least one element, so index 0 is safe even for an empty cell.
        leadingIds[row] = new[] { proteinIds[row][0] };
    }
    string fastaFilePath = param.GetParam<string>("Fasta file").Value;
    Fasta fasta = new Fasta();
    fasta.ParseFile(fastaFilePath, processInfo);

    // Text annotations
    processInfo.Status("Adding fasta header annotations.");
    ParameterWithSubParams<int> headerParam = param.GetParamWithSubParams<int>("Fasta header annotations");
    int[] headerSelection = headerParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
    string[][] idsToBeAnnotated = headerParam.Value == 0 ? proteinIds : leadingIds;

    // FASTA entries per row; IDs without an entry in the file are skipped. All later sections
    // work on these entries.
    ProteinSequence[][] fastaEntries = new ProteinSequence[rowCount][];
    for (int row = 0; row < rowCount; row++) {
        List<ProteinSequence> rowEntries = new List<ProteinSequence>();
        foreach (string id in idsToBeAnnotated[row]) {
            ProteinSequence entry = fasta.GetEntry(id);
            if (entry != null) {
                rowEntries.Add(entry);
            }
        }
        fastaEntries[row] = rowEntries.ToArray();
    }

    // Adds a string column with the distinct non-null values of all entries of a row, ';'-joined,
    // if the given option index is part of the header annotation selection.
    void AddTextAnnotation(int option, string columnName, Func<ProteinSequence, string> getValue) {
        if (!ArrayUtils.Contains(headerSelection, option)) {
            return;
        }
        string[] annotationColumn = new string[rowCount];
        for (int row = 0; row < rowCount; row++) {
            List<string> rowAnnotations = new List<string>();
            foreach (ProteinSequence entry in fastaEntries[row]) {
                string value = getValue(entry);
                if (value != null && !ArrayUtils.Contains(rowAnnotations, value)) {
                    rowAnnotations.Add(value);
                }
            }
            annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
        }
        mdata.AddStringColumn(columnName, "", annotationColumn);
    }

    // Computes one value per row: the median of getValue over the row's entries, or over the
    // first entry only when firstEntryOnly is set.
    double[] MedianPerRow(bool firstEntryOnly, Func<ProteinSequence, double> getValue) {
        double[] column = new double[rowCount];
        for (int row = 0; row < rowCount; row++) {
            List<double> rowValues = new List<double>();
            foreach (ProteinSequence entry in fastaEntries[row]) {
                rowValues.Add(getValue(entry));
                if (firstEntryOnly) {
                    break;
                }
            }
            column[row] = ArrayUtils.Median(rowValues.ToArray());
        }
        return column;
    }

    AddTextAnnotation(0, "Entry name", entry => entry.EntryName);
    AddTextAnnotation(1, "Gene name", entry => entry.GeneName);
    // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
    // 'Isoform x of...' prefixes and '(Fragment)' suffixes
    AddTextAnnotation(2, "Protein name (verbose)", entry => entry.ProteinName);
    // Consensus protein name
    AddTextAnnotation(3, "Protein name", entry => entry.ConsensusProteinName);
    AddTextAnnotation(4, "Species", entry => entry.Species);

    // Numeric annotations
    processInfo.Status("Adding numeric annotations.");
    ParameterWithSubParams<int> numericParam = param.GetParamWithSubParams<int>("Numeric annotations");
    int[] numericSelection = numericParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
    bool annotateLeadingId = numericParam.Value == 1;
    if (ArrayUtils.Contains(numericSelection, 0)) {
        mdata.AddNumericColumn("Sequence length", "",
            MedianPerRow(annotateLeadingId, entry => entry.GetSequence().Length));
    }
    if (ArrayUtils.Contains(numericSelection, 1)) {
        mdata.AddNumericColumn("Monoisotopic molecular mass", "",
            MedianPerRow(annotateLeadingId, entry => entry.GetMonoisotopicMolecularMass()));
    }
    if (ArrayUtils.Contains(numericSelection, 2)) {
        mdata.AddNumericColumn("Average molecular mass", "",
            MedianPerRow(annotateLeadingId, entry => entry.GetAverageMolecularMass()));
    }

    // Theoretical peptides
    processInfo.Status("Calculating theoretical peptides.");
    ParameterWithSubParams<int> peptideParam = param.GetParamWithSubParams<int>("Calculate theoretical peptides");
    annotateLeadingId = peptideParam.Value == 1;
    Parameters peptideSubParams = peptideParam.GetSubParameters();
    Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
        peptideSubParams.GetParam<int[]>("Proteases").Value);
    double minLength = peptideSubParams.GetParam<double>("Min. peptide length").Value;
    double maxLength = peptideSubParams.GetParam<double>("Max. peptide length").Value;
    // The short-circuit is deliberate: "Show sequences" is only looked up in leading-ID mode.
    bool displayPeptideSequences = annotateLeadingId && peptideSubParams.GetParam<bool>("Show sequences").Value;
    int minPeptideLength = (int)minLength;
    int maxPeptideLength = (int)maxLength;
    foreach (Protease protease in proteases) {
        double[] annotationColumn = new double[rowCount];
        string[] peptideColumn = displayPeptideSequences ? new string[rowCount] : null;
        for (int row = 0; row < rowCount; row++) {
            List<double> rowAnnotations = new List<double>();
            List<string> rowPeptides = new List<string>();
            foreach (ProteinSequence entry in fastaEntries[row]) {
                double nTheoreticalPeptides =
                    entry.GetNumberOfTheoreticalPeptides(protease, minPeptideLength, maxPeptideLength);
                rowAnnotations.Add(nTheoreticalPeptides);
                if (displayPeptideSequences) {
                    rowPeptides.AddRange(
                        entry.GetTheoreticalPeptideSequences(protease, minPeptideLength, maxPeptideLength));
                }
                if (annotateLeadingId) {
                    break;
                }
            }
            annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
            if (displayPeptideSequences) {
                peptideColumn[row] = String.Join(";", rowPeptides);
            }
        }
        mdata.AddNumericColumn(
            "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
            annotationColumn);
        if (displayPeptideSequences) {
            mdata.AddStringColumn(
                "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
                peptideColumn);
        }
    }

    // Sequence features
    processInfo.Status("Counting sequence features.");
    ParameterWithSubParams<int> featureParam = param.GetParamWithSubParams<int>("Count sequence features");
    annotateLeadingId = featureParam.Value == 1;
    Parameters featureSubParams = featureParam.GetSubParameters();
    bool normalizeBySequenceLength = featureSubParams.GetParam<bool>("Normalize by sequence length").Value;
    string pattern = featureSubParams.GetParam<string>("Regex").Value;
    if (pattern != "") {
        Regex regex;
        try {
            regex = new Regex(pattern);
        } catch (ArgumentException) {
            processInfo.ErrString = "The regular expression you provided has invalid syntax.";
            return;
        }
        double[] sequenceFeatureColumn = MedianPerRow(annotateLeadingId, entry => {
            double nFeatures = regex.Matches(entry.GetSequence()).Count;
            return normalizeBySequenceLength ? nFeatures / entry.GetLength() : nFeatures;
        });
        mdata.AddNumericColumn(
            (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")", "",
            sequenceFeatureColumn);
    }
    processInfo.Status("Done.");
}
/// <summary>
/// Fills <paramref name="mdata"/> with a randomly generated matrix. Depending on "Mode" the values are
/// plain Gaussian noise (0), two groups where the first column of one group is shifted by "Distance"
/// (1), or "How many" clusters around random centers inside a box of edge length "Box size" (2). Each
/// cell is replaced by NaN with a probability of "Percentage of missing values" percent. Row names, a
/// "Grouping" category column (modes 1 and 2) and a "Grouping" category row over the columns are added.
/// </summary>
/// <param name="mdata">Matrix that receives the generated data.</param>
/// <param name="param">Supplies matrix size, missing-value percentage, number of groups, optional seed
/// and the mode with its sub-parameters.</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Not used by this method.</param>
public void LoadData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int nrows = param.GetParam<int>("Number of rows").Value;
    int ncols = param.GetParam<int>("Number of columns").Value;
    int missingPerc = param.GetParam<int>("Percentage of missing values").Value;
    int ngroups = param.GetParam<int>("Number of groups").Value;
    ParameterWithSubParams<bool> setSeed = param.GetParamWithSubParams<bool>("Set seed");
    Random2 randy;
    if (setSeed.Value) {
        randy = new Random2(setSeed.GetSubParameters().GetParam<int>("Seed").Value);
    } else {
        randy = new Random2();
    }
    ngroups = Math.Min(ngroups, ncols);
    float[,] m = new float[nrows, ncols];
    ParameterWithSubParams<int> modeParam = param.GetParamWithSubParams<int>("Mode");
    Parameters subParams = modeParam.GetSubParameters();
    List<string> catColNames = new List<string>();
    List<string[][]> catCols = new List<string[][]>();

    // Every cell consumes one uniform draw (missing or not) and, if present, one Gaussian draw.
    // The sequence of draws must not change, otherwise seeded output would differ.
    bool NextIsMissing() {
        return randy.NextDouble() * 100 < missingPerc;
    }

    float NextCell() {
        return NextIsMissing() ? float.NaN : (float)randy.NextGaussian();
    }

    int mode = modeParam.Value;
    if (mode == 0) {
        for (int r = 0; r < nrows; r++) {
            for (int c = 0; c < ncols; c++) {
                m[r, c] = NextCell();
            }
        }
    } else if (mode == 1) {
        float dist = (float)subParams.GetParam<double>("Distance").Value;
        string[][] groupOfRow = new string[nrows][];
        for (int r = 0; r < nrows; r++) {
            // Group membership is drawn before the values of the row.
            bool inFirstGroup = randy.NextDouble() < 0.5;
            for (int c = 0; c < ncols; c++) {
                m[r, c] = NextCell();
            }
            if (inFirstGroup) {
                // Only the first column separates the two groups.
                m[r, 0] += dist;
            }
            groupOfRow[r] = new[] { inFirstGroup ? "Group1" : "Group2" };
        }
        catColNames.Add("Grouping");
        catCols.Add(groupOfRow);
    } else if (mode == 2) {
        double boxLen = subParams.GetParam<double>("Box size").Value;
        int howMany = subParams.GetParam<int>("How many").Value;
        string[][] groupOfRow = new string[nrows][];
        float[,] centers = new float[howMany, ncols];
        for (int g = 0; g < howMany; g++) {
            for (int c = 0; c < ncols; c++) {
                centers[g, c] = (float)(randy.NextDouble() * boxLen);
            }
        }
        for (int r = 0; r < nrows; r++) {
            int cluster = (int)(randy.NextDouble() * howMany);
            for (int c = 0; c < ncols; c++) {
                m[r, c] = NextIsMissing() ? float.NaN : (float)randy.NextGaussian() + centers[cluster, c];
            }
            groupOfRow[r] = new[] { "Group" + (cluster + 1) };
        }
        catColNames.Add("Grouping");
        catCols.Add(groupOfRow);
    }

    List<string> exprColumnNames = new List<string>();
    for (int c = 1; c <= ncols; c++) {
        exprColumnNames.Add("Column " + c);
    }
    mdata.Name = "Random matrix";
    mdata.ColumnNames = exprColumnNames;
    mdata.Values.Set(m);
    mdata.Quality.Set(new float[nrows, ncols]);
    mdata.IsImputed.Set(new bool[nrows, ncols]);
    mdata.SetAnnotationColumns(new List<string>(), new List<string[]>(), catColNames, catCols,
        new List<string>(), new List<double[]>(), new List<string>(), new List<double[][]>());
    mdata.Origin = "Random matrix";

    string[] names = new string[mdata.RowCount];
    for (int r = 0; r < names.Length; r++) {
        names[r] = "Row " + (r + 1);
    }
    mdata.AddStringColumn("Name", "Name", names);

    // Columns are assigned to groups in consecutive runs via integer division.
    string[][] grouping = new string[ncols][];
    for (int c = 0; c < ncols; c++) {
        int groupNumber = (c * ngroups) / ncols + 1;
        grouping[c] = new[] { "Group" + groupNumber };
    }
    mdata.AddCategoryRow("Grouping", "Grouping", grouping);
}