/// <summary>
/// Runs <c>Zscore</c> on the whole matrix or <c>ZscoreGroups</c> per group, and optionally
/// stores the means and standard deviations that were returned.
/// </summary>
/// <remarks>
/// "Matrix access" value 0 selects row-wise processing; only then is the "Grouping"
/// sub-parameter read. Its value minus one is used as the category row index, so a value
/// of 0 means "no grouping". When grouping is active, an item assigned to more than one
/// group aborts the run with <c>processInfo.ErrString</c> set.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) {
    ParameterWithSubParams<int> access = param.GetParamWithSubParams<int>("Matrix access");
    bool rows = access.Value == 0;
    int groupInd = rows ? access.GetSubParameters().GetParam<int>("Grouping").Value - 1 : -1;
    bool report = param.GetParam<bool>("Report mean and std. dev.").Value;
    bool median = param.GetParam<bool>("Use median").Value;

    if (groupInd >= 0) {
        // Grouped variant: each entry of the category row must name at most one group.
        string[][] catRow = mdata.GetCategoryRowAt(groupInd);
        foreach (string[] memberships in catRow) {
            if (memberships.Length > 1) {
                processInfo.ErrString = "The groups are overlapping.";
                return;
            }
        }
        string[] groupVals = ArrayUtils.UniqueValuesPreserveOrder(catRow);
        ZscoreGroups(mdata, catRow, processInfo.NumThreads, report, median, groupVals,
            out double[][] groupMeans, out double[][] groupStddevs);
        if (!report) {
            return;
        }
        for (int g = 0; g < groupVals.Length; g++) {
            mdata.AddNumericColumn("Mean " + groupVals[g], "Mean", groupMeans[g]);
            mdata.AddNumericColumn("Std. dev. " + groupVals[g], "Std. dev.", groupStddevs[g]);
        }
        return;
    }

    // Ungrouped variant: one mean / std. dev. per row or per column.
    Zscore(rows, mdata, processInfo.NumThreads, report, median, out double[] means, out double[] stddevs);
    if (!report) {
        return;
    }
    if (rows) {
        mdata.AddNumericColumn("Mean", "Mean", means);
        mdata.AddNumericColumn("Std. dev.", "Std. dev.", stddevs);
    } else {
        mdata.AddNumericRow("Mean", "Mean", means);
        mdata.AddNumericRow("Std. dev.", "Std. dev.", stddevs);
    }
}
/// <summary>
/// For every selected (x, y) column pair, adds two numeric columns holding the values returned by
/// <c>DensityEstimation.CalcDensitiesAtData</c>: a density and an excluded fraction per row.
/// </summary>
/// <remarks>
/// Sets <c>processInfo.ErrString</c> and returns without changes when no x column is selected
/// or when the numbers of x and y columns differ.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] colIndx = param.GetParam<int[]>("x").Value;
    int[] colIndy = param.GetParam<int[]>("y").Value;
    if (colIndx.Length == 0) {
        processInfo.ErrString = "Please select some columns";
        return;
    }
    if (colIndx.Length != colIndy.Length) {
        processInfo.ErrString = "Please select the same number of columns in the boxes for the first and second columns.";
        return;
    }
    int typeInd = param.GetParam<int>("Distribution type").Value;
    int points = param.GetParam<int>("Number of points").Value;

    // The estimation type depends only on the parameter, so it is resolved once up front.
    // Any index other than 1, 2 or 3 means the joint distribution.
    DensityEstimationType type;
    if (typeInd == 1) {
        type = DensityEstimationType.DivideByX;
    } else if (typeInd == 2) {
        type = DensityEstimationType.DivideByY;
    } else if (typeInd == 3) {
        type = DensityEstimationType.DivideByXY;
    } else {
        type = DensityEstimationType.JointDistribution;
    }

    for (int pair = 0; pair < colIndx.Length; pair++) {
        int xCol = colIndx[pair];
        int yCol = colIndy[pair];
        double[] xvals = GetColumn(mdata, xCol);
        double[] yvals = GetColumn(mdata, yCol);
        (double[] dvals, double[] pvals) = DensityEstimation.CalcDensitiesAtData(xvals, yvals, points, type);
        string xname = GetColumnName(mdata, xCol);
        string yname = GetColumnName(mdata, yCol);
        string densityName = "Density_" + xname + "_" + yname;
        string densityDescription = "Density of data points in the plane spanned by the columns " + xname + " and " + yname + ".";
        mdata.AddNumericColumn(densityName, densityDescription, dvals);
        string excludedName = "Excluded fraction_" + xname + "_" + yname;
        string excludedDescription = "Percentage of points with a point density smaller than at this point in the plane spanned by the columns " + xname + " and " + yname + ".";
        mdata.AddNumericColumn(excludedName, excludedDescription, pvals);
    }
}
/// <summary>
/// Appends the annotation columns produced by <c>ProcessDataAddAnnotation</c> to the matrix:
/// first the category columns, then the text columns, then the numeric columns.
/// </summary>
/// <remarks>
/// Nothing is added when <c>ProcessDataAddAnnotation</c> reports failure. Column names are
/// looked up in the returned name array through the per-type index arrays; descriptions are empty.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters para, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) {
    string[] baseIds = GetBaseIds(para, mdata);
    string[] name;
    int[] catColInds;
    int[] textColInds;
    int[] numColInds;
    string[][][] catCols;
    string[][] textCols;
    double[][] numCols;
    if (!ProcessDataAddAnnotation(mdata.RowCount, para, baseIds, processInfo, out name, out catColInds,
        out textColInds, out numColInds, out catCols, out textCols, out numCols)) {
        return;
    }

    int next = 0;
    foreach (string[][] catCol in catCols) {
        mdata.AddCategoryColumn(name[catColInds[next]], "", catCol);
        next++;
    }

    next = 0;
    foreach (string[] textCol in textCols) {
        mdata.AddStringColumn(name[textColInds[next]], "", textCol);
        next++;
    }

    next = 0;
    foreach (double[] numCol in numCols) {
        mdata.AddNumericColumn(name[numColInds[next]], "", numCol);
        next++;
    }
}
/// <summary>
/// Checks that <c>UniqueRows</c>, keyed on the "id" text column, collapses a four-row matrix
/// into two rows and combines main, text, category, numeric and multi-numeric values as expected.
/// </summary>
public void SmallTest() {
    // Four rows; rows 1-3 share the id "b".
    IMatrixData mdata = PerseusFactory.CreateMatrixData(new double[,] { { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 } });
    mdata.AddStringColumn("id", "", new[] { "a", "b", "b", "b" });
    mdata.AddStringColumn("str", "", new[] { "a;b", "b;c", "c;d", "d;e" });
    mdata.AddCategoryColumn("cat", "", new[] {
        new[] { "a", "b" }, new[] { "b", "c" }, new[] { "c", "d" }, new[] { "d", "e" }
    });
    // Exactly one value per row: the fixture previously carried a stray fifth value
    // that did not correspond to any row of the matrix.
    mdata.AddNumericColumn("num", "", new[] { 0, 1, 2, 3.0 });
    mdata.AddMultiNumericColumn("mnum", "", new[] {
        new[] { 0, 4d }, new[] { 1, 5d }, new[] { 2, 6d }, new[] { 3, 7d }
    });

    mdata.UniqueRows(mdata.StringColumns[0], ArrayUtils.Median, UniqueRows.Union, UniqueRows.CatUnion,
        UniqueRows.MultiNumUnion);

    Assert.AreEqual(2, mdata.RowCount);
    // Main and numeric values of the merged rows are combined with the median.
    CollectionAssert.AreEqual(new[] { 0, 2 }, mdata.Values.GetColumn(0));
    CollectionAssert.AreEqual(new[] { 4, 6 }, mdata.Values.GetColumn(1));
    CollectionAssert.AreEqual(new[] { "a;b", "b;c;d;e" }, mdata.GetStringColumn("str"));
    CollectionAssert.AreEqual(new[] { new[] { "a", "b" }, new[] { "b", "c", "d", "e" } },
        mdata.GetCategoryColumnAt(0));
    CollectionAssert.AreEqual(new[] { 0, 2 }, mdata.NumericColumns[0]);
    CollectionAssert.AreEqual(new[] { new[] { 0d, 4 }, new[] { 1d, 5, 2, 6, 3, 7 } },
        mdata.MultiNumericColumns[0]);
}
/// <summary>
/// Adds the columns of a pairwise comparison result (<paramref name="pair1"/> vs
/// <paramref name="pair2"/>) to <paramref name="mdata"/>.
/// </summary>
/// <param name="results">Result columns keyed by name; every value is a number in text form.</param>
/// <param name="mdata">Matrix that receives the new columns.</param>
/// <param name="pair1">First name used in the "pair1_vs_pair2_" column prefix.</param>
/// <param name="pair2">Second name used in the "pair1_vs_pair2_" column prefix.</param>
/// <param name="validCol">Category column added as "..._Valid" when <paramref name="method"/> is "DESeq2".</param>
/// <param name="sigCol">Category column always added as "..._Significant".</param>
/// <param name="method">"DESeq2" or "EdgeR"; selects which keys are treated as p-values.</param>
/// <param name="replicate">When false, the "LR" result column is not imported.</param>
/// <exception cref="FormatException">An imported value is not a valid number.</exception>
/// <remarks>
/// Numbers are parsed with the invariant culture: the values are machine-written, and parsing them
/// with the current culture would misread "0.05" wherever '.' is a group separator.
/// For p-value-like keys an additional "-log10" column is added; p = 0 is mapped to
/// -log10(1 / double.MaxValue) so the result stays finite, and a value that cannot be parsed
/// yields NaN instead of being mistaken for p = 0.
/// </remarks>
public void ImportResult(Dictionary<string, string[]> results, IMatrixData mdata, string pair1, string pair2, string[][] validCol, string[][] sigCol, string method, bool replicate) {
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    // Same number styles that the single-argument double.Parse / double.TryParse overloads use.
    const System.Globalization.NumberStyles styles =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    string prefix = pair1 + "_vs_" + pair2 + "_";
    double zeroPvalueScore = Math.Log10(1 / Double.MaxValue) * -1;

    foreach (KeyValuePair<string, string[]> entry in results) {
        string key = entry.Key;
        string[] raw = entry.Value;

        // The "LR" column is only imported for replicate designs.
        if (key != "LR" || replicate) {
            double[] parsed = Array.ConvertAll(raw, s => double.Parse(s, styles, invariant));
            mdata.AddNumericColumn(prefix + key, prefix + key, parsed);
        }

        bool isPvalueLike = (method == "DESeq2" && (key == "p-value" || key == "padj"))
            || (method == "EdgeR" && (key == "p-value" || key == "FDR"));
        if (!isPvalueLike) {
            continue;
        }

        double[] negLog = new double[raw.Length];
        for (int i = 0; i < raw.Length; i++) {
            if (!double.TryParse(raw[i], styles, invariant, out double p)) {
                negLog[i] = double.NaN;
            } else if (p == 0) {
                negLog[i] = zeroPvalueScore;
            } else {
                negLog[i] = Math.Log10(p) * -1;
            }
        }
        mdata.AddNumericColumn(prefix + "-log10" + key, prefix + "-log10" + key, negLog);
    }

    if (method == "DESeq2") {
        mdata.AddCategoryColumn(prefix + "Valid", prefix + "Valid", validCol);
    }
    mdata.AddCategoryColumn(prefix + "Significant", prefix + "Significant", sigCol);
}
/// <summary>
/// Moves the given main columns into numeric columns: each one is copied as a numeric column
/// under its own name and description, after which only the remaining main columns are kept.
/// </summary>
/// <param name="colInds">Indices of the main columns to convert.</param>
/// <param name="mdata">Matrix that is modified in place.</param>
private static void ExpressionToNumeric(IList<int> colInds, IMatrixData mdata) {
    // Determined before any column is added; used at the end to drop the converted main columns.
    int[] keptMainCols = ArrayUtils.Complement(colInds, mdata.ColumnCount);
    for (int k = 0; k < colInds.Count; k++) {
        int col = colInds[k];
        double[] asDoubles = ArrayUtils.ToDoubles(mdata.Values.GetColumn(col));
        mdata.AddNumericColumn(mdata.ColumnNames[col], mdata.ColumnDescriptions[col], asDoubles);
    }
    mdata.ExtractColumns(keptMainCols);
}
/// <summary>
/// Builds the fixtures shared by the tests: a one-row peptide matrix that is run through
/// <c>ExpandMultiNumeric</c>, a two-row protein matrix, a hand-built two-row "expand" matrix,
/// and the parameters of <c>MatchingRowsByName</c> for the (expand, protein main) pair.
/// </summary>
public void TestInitialize() {
    // Peptide table: a single peptide that belongs to protein groups 13 and 21.
    float[,] peptideCounts = { { 9.0f } };
    peptides = PerseusFactory.CreateMatrixData(peptideCounts, new List<string> { "pep_MS/MS Count" });
    peptides.AddNumericColumn("pep_Intensity", "", new[] { 0.0 });
    peptides.AddStringColumn("pep_id", "", new[] { "35" });
    peptides.AddStringColumn("pep_Protein group IDs", "", new[] { "13;21" });
    peptides.Quality.Init(1, 1);
    peptides.Quality.Set(0, 0, 1);

    // Run the expansion on text column 1 ("pep_Protein group IDs").
    ExpandMultiNumeric expander = new ExpandMultiNumeric();
    string expandError = string.Empty;
    var expandParameters = expander.GetParameters(peptides, ref expandError);
    expandParameters.GetParam<int[]>("Text columns").Value = new[] { 1 };
    IMatrixData[] suppl = null;
    IDocumentData[] docs = null;
    expander.ProcessData(peptides, expandParameters, ref suppl, ref docs, CreateProcessInfo());

    // Protein table: one row per protein group.
    float[,] lfqIntensities = { { 166250000.0f }, { 8346000.0f } };
    proteinMain = PerseusFactory.CreateMatrixData(lfqIntensities, new List<string> { "prot_LFQ intensity" });
    proteinMain.Name = "protein main";
    proteinMain.AddStringColumn("prot_id", "", new[] { "13", "21" });
    proteinMain.AddStringColumn("prot_gene name", "", new[] { "geneA", "geneB" });

    // Hand-built peptide table with one row per protein group id.
    float[,] expandedCounts = { { 9.0f }, { 9.0f } };
    expand = PerseusFactory.CreateMatrixData(expandedCounts, new List<string> { "pep_MS/MS Count" });
    expand.Name = "expand";
    expand.AddNumericColumn("pep_Intensity", "", new[] { 0.0, 0.0 });
    expand.AddStringColumn("pep_id", "", new[] { "35", "35" });
    expand.AddStringColumn("pep_Protein group IDs", "", new[] { "13", "21" });

    matching = new MatchingRowsByName();
    string matchingError = string.Empty;
    parameters = matching.GetParameters(new[] { expand, proteinMain }, ref matchingError);
}
/// <summary>
/// For each selected main column, adds a "Significance A" numeric column with the p-values from
/// <c>CalcSignificanceA</c> and an "A significant" category column derived from them.
/// </summary>
/// <remarks>
/// "Use for truncation" index 0 selects plain p-value thresholding and index 1 Benjamini-Hochberg.
/// Any other index maps to permutation-based truncation, which is not handled here and throws as
/// soon as the first column is processed. "Side" must be 0 (both), 1 (left) or 2 (right).
/// </remarks>
/// <exception cref="Exception">Unsupported "Side" value or truncation mode.</exception>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] cols = param.GetParam<int[]>("Columns").Value;
    int truncIndex = param.GetParam<int>("Use for truncation").Value;
    TestTruncation truncation;
    if (truncIndex == 0) {
        truncation = TestTruncation.Pvalue;
    } else if (truncIndex == 1) {
        truncation = TestTruncation.BenjaminiHochberg;
    } else {
        truncation = TestTruncation.PermutationBased;
    }
    double threshold = param.GetParam<double>("Threshold value").Value;
    int sideInd = param.GetParam<int>("Side").Value;
    TestSide side;
    if (sideInd == 0) {
        side = TestSide.Both;
    } else if (sideInd == 1) {
        side = TestSide.Left;
    } else if (sideInd == 2) {
        side = TestSide.Right;
    } else {
        throw new Exception("Never get here.");
    }

    foreach (int col in cols) {
        BaseVector columnValues = mdata.Values.GetColumn(col);
        double[] pvals = CalcSignificanceA(columnValues, side);
        string[][] significant;
        if (truncation == TestTruncation.Pvalue) {
            significant = PerseusPluginUtils.CalcPvalueSignificance(pvals, threshold);
        } else if (truncation == TestTruncation.BenjaminiHochberg) {
            // The FDR values themselves are not stored, only the significance flags.
            double[] ignoredFdrs;
            significant = PerseusPluginUtils.CalcBenjaminiHochbergFdr(pvals, threshold, pvals.Length, out ignoredFdrs);
        } else {
            throw new Exception("Never get here.");
        }
        string columnName = mdata.ColumnNames[col];
        mdata.AddNumericColumn(columnName + " Significance A", "", pvals);
        mdata.AddCategoryColumn(columnName + " A significant", "", significant);
    }
}
/// <summary>
/// Duplicates the selected main, numeric, multi-numeric, categorical and text columns.
/// Every copy is given a name from <c>StringUtils.GetNextAvailableName</c> so that it does not
/// clash with a name already in use for that column type.
/// </summary>
/// <remarks>
/// Copies of text columns carry the description of the text column they were made from
/// (previously the description was read from the main-column descriptions by mistake).
/// Multi-numeric copies are deep copies, so editing a copy cannot alter the source column.
/// </remarks>
public void ProcessData(IMatrixData data, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] exColInds = param.GetParam<int[]>("Main columns").Value;
    int[] numColInds = param.GetParam<int[]>("Numerical columns").Value;
    int[] multiNumColInds = param.GetParam<int[]>("Multi-numerical columns").Value;
    int[] catColInds = param.GetParam<int[]>("Categorical columns").Value;
    int[] textColInds = param.GetParam<int[]>("Text columns").Value;

    if (exColInds.Length > 0) {
        // Re-extract all main columns followed by the selected ones; the duplicates end up
        // at positions ncol, ncol + 1, ... and are renamed there.
        int ncol = data.ColumnCount;
        data.ExtractColumns(ArrayUtils.Concat(ArrayUtils.ConsecutiveInts(data.ColumnCount), exColInds));
        HashSet<string> takenMain = new HashSet<string>(data.ColumnNames);
        for (int i = 0; i < exColInds.Length; i++) {
            string s = StringUtils.GetNextAvailableName(data.ColumnNames[ncol + i], takenMain);
            data.ColumnNames[ncol + i] = s;
            takenMain.Add(s);
        }
    }

    // For the annotation columns the set of used names is built once per column type and
    // extended as copies are added, instead of being rebuilt for every copy.
    HashSet<string> takenNumeric = new HashSet<string>(data.NumericColumnNames);
    foreach (int ind in numColInds) {
        string s = StringUtils.GetNextAvailableName(data.NumericColumnNames[ind], takenNumeric);
        data.AddNumericColumn(s, data.NumericColumnDescriptions[ind], (double[]) data.NumericColumns[ind].Clone());
        takenNumeric.Add(s);
    }

    HashSet<string> takenMultiNumeric = new HashSet<string>(data.MultiNumericColumnNames);
    foreach (int ind in multiNumColInds) {
        string s = StringUtils.GetNextAvailableName(data.MultiNumericColumnNames[ind], takenMultiNumeric);
        // Clone() on a jagged array copies only the outer array, so the rows are cloned one by one.
        double[][] source = data.MultiNumericColumns[ind];
        double[][] copy = new double[source.Length][];
        for (int row = 0; row < source.Length; row++) {
            copy[row] = source[row] == null ? null : (double[]) source[row].Clone();
        }
        data.AddMultiNumericColumn(s, data.MultiNumericColumnDescriptions[ind], copy);
        takenMultiNumeric.Add(s);
    }

    HashSet<string> takenCategory = new HashSet<string>(data.CategoryColumnNames);
    foreach (int ind in catColInds) {
        string s = StringUtils.GetNextAvailableName(data.CategoryColumnNames[ind], takenCategory);
        data.AddCategoryColumn(s, data.CategoryColumnDescriptions[ind], data.GetCategoryColumnAt(ind));
        takenCategory.Add(s);
    }

    HashSet<string> takenText = new HashSet<string>(data.StringColumnNames);
    foreach (int ind in textColInds) {
        string s = StringUtils.GetNextAvailableName(data.StringColumnNames[ind], takenText);
        data.AddStringColumn(s, data.StringColumnDescriptions[ind], (string[]) data.StringColumns[ind].Clone());
        takenText.Add(s);
    }
}
/// <summary>
/// Round-trip test: writes a matrix carrying every kind of annotation with
/// <c>PerseusUtils.WriteMatrix</c>, reads the text back with <c>PerseusUtils.ReadMatrix</c>
/// and checks the row, column and annotation counts plus one string cell.
/// </summary>
public void WriteMatrixTest() {
    // main data
    List<string> mainColumnNames = new List<string> { "col1", "col2", "col3" };
    IMatrixData mdata = PerseusFactory.CreateMatrixData(new double[,] { { 1, 2, 3 }, { 3, 4, 5 } }, mainColumnNames);

    // annotation rows
    mdata.AddCategoryRow("catrow", "this is catrow",
        new[] { new[] { "cat1" }, new[] { "cat1", "cat2" }, new[] { "cat2" } });
    mdata.AddNumericRow("numrow", "this is numrow", new[] { -1.0, 1, 2 });

    // annotation columns
    mdata.AddStringColumn("strcol1", "this is stringcol1", new[] { "1", "2" });
    mdata.AddStringColumn("strcol2", "", new[] { "", "hallo" });
    mdata.AddNumericColumn("numcol", "", new[] { 1.0, 2.0 });
    mdata.AddMultiNumericColumn("multnumcol", "this is multnumcol",
        new[] { new[] { -2.0, 2.0 }, new double[] { } });
    mdata.AddCategoryColumn("catcol", "", new[] { new[] { "cat1", "cat1.1" }, new[] { "cat2", "cat1" } });

    // Serialize into memory and keep the text for the reader below.
    string mdataStr;
    using (MemoryStream buffer = new MemoryStream())
    using (StreamWriter writer = new StreamWriter(buffer)) {
        PerseusUtils.WriteMatrix(mdata, writer);
        writer.Flush();
        mdataStr = Encoding.UTF8.GetString(buffer.ToArray());
    }

    // Read the text back into a fresh matrix; each reader request gets a new stream over the same bytes.
    IMatrixData mdata2 = PerseusFactory.CreateMatrixData();
    ProcessInfo readInfo = new ProcessInfo(new Settings(), status => { }, progress => { }, 1);
    PerseusUtils.ReadMatrix(mdata2, readInfo,
        () => new StreamReader(new MemoryStream(Encoding.UTF8.GetBytes(mdataStr))), "matrix1", '\t');

    Assert.AreEqual(2, mdata2.RowCount);
    Assert.AreEqual(3, mdata2.ColumnCount);
    Assert.AreEqual(2, mdata2.StringColumnCount);
    Assert.AreEqual(1, mdata2.NumericColumnCount);
    Assert.AreEqual(1, mdata2.CategoryColumnCount);
    Assert.AreEqual(1, mdata2.MultiNumericColumnCount);
    int strcol2Index = mdata2.StringColumnNames.FindIndex(col => col.Equals("strcol2"));
    Assert.AreEqual("hallo", mdata2.StringColumns[strcol2Index][1]);
    Assert.AreEqual(1, mdata2.CategoryRowCount);
    Assert.AreEqual(1, mdata2.NumericRowCount);
}
/// <summary>
/// Adds one numeric column per group ("stddev " + group name) holding, for every row, the spread
/// of that row's finite values within the group's main columns.
/// </summary>
/// <param name="groupColInd">Index of the category row that defines the groups.</param>
/// <param name="validVals">Minimum number of finite values required; below it the cell is NaN.</param>
/// <param name="mdata">Matrix that is read and extended.</param>
/// <param name="varInd">
/// 0 stores the standard deviation; any other value stores the standard deviation divided by the
/// square root of the number of values.
/// </param>
private static void AddStandardDeviation(int groupColInd, int validVals, IMatrixData mdata, int varInd) {
    string[][] groupCol = mdata.GetCategoryRowAt(groupColInd);
    string[] groupNames = ArrayUtils.UniqueValuesPreserveOrder(groupCol);
    int[][] colInds = PerseusPluginUtils.GetMainColIndices(groupCol, groupNames);

    double[][] spreadPerGroup = new double[groupNames.Length][];
    for (int g = 0; g < spreadPerGroup.Length; g++) {
        spreadPerGroup[g] = new double[mdata.RowCount];
    }

    for (int row = 0; row < mdata.RowCount; row++) {
        for (int g = 0; g < groupNames.Length; g++) {
            // Collect the finite values of this row within the group.
            List<double> finite = new List<double>();
            foreach (int col in colInds[g]) {
                double value = mdata.Values.Get(row, col);
                if (double.IsNaN(value) || double.IsInfinity(value)) {
                    continue;
                }
                finite.Add(value);
            }

            double spread;
            if (finite.Count < validVals) {
                spread = double.NaN;
            } else if (varInd == 0) {
                spread = ArrayUtils.StandardDeviation(finite);
            } else {
                spread = ArrayUtils.StandardDeviation(finite) / Math.Sqrt(finite.Count);
            }
            spreadPerGroup[g][row] = spread;
        }
    }

    for (int g = 0; g < groupNames.Length; g++) {
        string columnName = "stddev " + groupNames[g];
        mdata.AddNumericColumn(columnName, columnName, spreadPerGroup[g]);
    }
}
/// <summary>
/// Applies every selected operation to every selected multi-numeric column and stores each
/// result as a numeric column named "&lt;source column&gt;_&lt;operation name&gt;".
/// </summary>
/// <remarks>
/// Operations and their names are taken from the <c>operations</c> and <c>names</c> members,
/// indexed by the values of the "Operation" parameter. For one operation, all result columns are
/// computed before any of them is added.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param1, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] cols = param1.GetParam<int[]>("Columns").Value;
    int[] ops = param1.GetParam<int[]>("Operation").Value;
    foreach (int op in ops) {
        double[][] results = new double[cols.Length][];
        int slot = 0;
        foreach (int col in cols) {
            double[][] cells = mdata.MultiNumericColumns[col];
            double[] reduced = new double[cells.Length];
            for (int row = 0; row < reduced.Length; row++) {
                // Each cell's list of values is reduced to a single number.
                reduced[row] = operations[op](cells[row]);
            }
            results[slot] = reduced;
            slot++;
        }
        for (int i = 0; i < cols.Length; i++) {
            string columnName = mdata.MultiNumericColumnNames[cols[i]] + "_" + names[op];
            mdata.AddNumericColumn(columnName, "", results[i]);
        }
    }
}
/// <summary>
/// Adds one numeric column per group, named after the group, holding for every row the result
/// of <paramref name="func"/> applied to that row's finite values in the group's main columns.
/// </summary>
/// <param name="groupColInd">Index of the category row that defines the groups.</param>
/// <param name="validVals">Minimum number of finite values required; below it the cell is NaN.</param>
/// <param name="mdata">Matrix that is read and extended; its main columns are left in place.</param>
/// <param name="func">Aggregate applied to the collected values.</param>
private static void FillMatrixKeep(int groupColInd, int validVals, IMatrixData mdata, Func<IList<double>, double> func) {
    string[][] groupCol = mdata.GetCategoryRowAt(groupColInd);
    string[] groupNames = ArrayUtils.UniqueValuesPreserveOrder(groupCol);
    int[][] colInds = PerseusPluginUtils.GetMainColIndices(groupCol, groupNames);

    double[][] aggregated = new double[groupNames.Length][];
    for (int g = 0; g < aggregated.Length; g++) {
        aggregated[g] = new double[mdata.RowCount];
    }

    for (int row = 0; row < mdata.RowCount; row++) {
        for (int g = 0; g < groupNames.Length; g++) {
            List<double> finite = new List<double>();
            foreach (int col in colInds[g]) {
                double value = mdata.Values.Get(row, col);
                bool usable = !double.IsNaN(value) && !double.IsInfinity(value);
                if (usable) {
                    finite.Add(value);
                }
            }
            // Too few usable values leave the cell undefined.
            aggregated[g][row] = finite.Count >= validVals ? func(finite) : double.NaN;
        }
    }

    for (int g = 0; g < groupNames.Length; g++) {
        mdata.AddNumericColumn(groupNames[g], groupNames[g], aggregated[g]);
    }
}
/// <summary>
/// Annotates every row with known modification sites: adds a "Known modifications" category
/// column listing the modifications with at least one counted site, and one
/// "&lt;modification&gt; count" numeric column per selected modification.
/// </summary>
/// <remarks>
/// The selected modifications are taken from the ';'-separated string value of the
/// "Modifications" parameter. For each one the known-sites file is parsed and its sequence windows
/// are collected per accession; <c>CountSites</c> then produces the per-row count from the row's
/// ';'-separated identifiers. If the file name for a modification cannot be resolved,
/// <c>processInfo.ErrString</c> is set and nothing is added.
/// Sequence windows are upper-cased with the invariant culture so the result does not depend on
/// the user's locale.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) {
    string[] mods = param.GetParam<int[]>("Modifications").StringValue.Split(new[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
    string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    string[][] uprot = new string[up.Length][];
    for (int i = 0; i < up.Length; i++) {
        uprot[i] = up[i].Length > 0 ? up[i].Split(';') : new string[0];
    }

    double[][] c = new double[mods.Length][];
    for (int index = 0; index < mods.Length; index++) {
        string mod = mods[index];
        string filename = PhosphoSitePlusParser.GetFilenameForMod(mod);
        if (filename == null) {
            processInfo.ErrString = "File does not exist.";
            return;
        }
        PhosphoSitePlusParser.ParseKnownMods(filename, out string[] seqWins, out string[] accs,
            out string[] pubmedLtp, out string[] pubmedMs2, out string[] cstMs2, out string[] species);
        for (int i = 0; i < seqWins.Length; i++) {
            seqWins[i] = seqWins[i].ToUpperInvariant();
        }

        // Distinct sequence windows per accession.
        Dictionary<string, HashSet<string>> counts = new Dictionary<string, HashSet<string>>();
        for (int i = 0; i < accs.Length; i++) {
            if (!counts.TryGetValue(accs[i], out HashSet<string> windows)) {
                windows = new HashSet<string>();
                counts.Add(accs[i], windows);
            }
            windows.Add(seqWins[i]);
        }

        c[index] = new double[up.Length];
        for (int i = 0; i < up.Length; i++) {
            c[index][i] = CountSites(uprot[i], counts);
        }
    }

    // Per row: the sorted names of all modifications with a positive count.
    string[][] catCol = new string[up.Length][];
    for (int i = 0; i < catCol.Length; i++) {
        List<string> x = new List<string>();
        for (int j = 0; j < mods.Length; j++) {
            if (c[j][i] > 0) {
                x.Add(mods[j]);
            }
        }
        x.Sort();
        catCol[i] = x.ToArray();
    }
    mdata.AddCategoryColumn("Known modifications", "Known modifications", catCol);
    for (int i = 0; i < mods.Length; i++) {
        mdata.AddNumericColumn(mods[i] + " count", mods[i] + " count", c[i]);
    }
}
/// <summary>
/// For each selected expression column, adds a "Significance A" numeric column with the p-values
/// from <c>CalcSignificanceA</c> and an "A significant" category column derived from them.
/// </summary>
/// <remarks>
/// "Use for truncation" index 0 selects plain p-value thresholding and index 1 Benjamini-Hochberg.
/// Any other index maps to permutation-based truncation, which is not handled here and throws as
/// soon as the first column is processed. "Side" must be 0 (both), 1 (left) or 2 (right).
/// </remarks>
/// <exception cref="Exception">Unsupported "Side" value or truncation mode.</exception>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] cols = param.GetMultiChoiceParam("Columns").Value;
    int truncIndex = param.GetSingleChoiceParam("Use for truncation").Value;
    TestTruncation truncation;
    switch (truncIndex) {
        case 0:
            truncation = TestTruncation.Pvalue;
            break;
        case 1:
            truncation = TestTruncation.BenjaminiHochberg;
            break;
        default:
            truncation = TestTruncation.PermutationBased;
            break;
    }
    double threshold = param.GetDoubleParam("Threshold value").Value;
    int sideInd = param.GetSingleChoiceParam("Side").Value;
    TestSide side;
    if (sideInd == 0) {
        side = TestSide.Both;
    } else if (sideInd == 1) {
        side = TestSide.Left;
    } else if (sideInd == 2) {
        side = TestSide.Right;
    } else {
        throw new Exception("Never get here.");
    }

    foreach (int col in cols) {
        float[] columnValues = mdata.GetExpressionColumn(col);
        double[] pvals = CalcSignificanceA(columnValues, side);
        string[][] significant;
        if (truncation == TestTruncation.Pvalue) {
            significant = PerseusPluginUtils.CalcPvalueSignificance(pvals, threshold);
        } else if (truncation == TestTruncation.BenjaminiHochberg) {
            significant = PerseusPluginUtils.CalcBenjaminiHochbergFdr(pvals, threshold);
        } else {
            throw new Exception("Never get here.");
        }
        string columnName = mdata.ExpressionColumnNames[col];
        mdata.AddNumericColumn(columnName + " Significance A", "", pvals);
        mdata.AddCategoryColumn(columnName + " A significant", "", significant);
    }
}
/// <summary>
/// Annotates every row with information looked up in a FASTA file for the row's protein ids:
/// header texts, numeric sequence properties, theoretical peptide counts per protease and,
/// optionally, the number of regular-expression matches in the sequence.
/// </summary>
/// <remarks>
/// Protein ids are read from the ';'-separated "Protein IDs" text column; the first id of a row
/// is its leading id. Ids without a FASTA entry are skipped. Text annotations of a row are the
/// distinct non-null values joined with ';'. Numeric annotations are the median over the row's
/// entries, or the value of the first entry when the respective option selects "leading id".
/// An invalid regular expression sets <c>processInfo.ErrString</c> and stops processing; columns
/// added up to that point remain in the matrix.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) {
    int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
    string[][] proteinIds = new string[mdata.RowCount][];
    string[][] leadingIds = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
        leadingIds[row] = new[] { proteinIds[row][0] };
    }
    string fastaFilePath = param.GetParam<string>("Fasta file").Value;
    Fasta fasta = new Fasta();
    fasta.ParseFile(fastaFilePath, processInfo);

    // Text annotations
    processInfo.Status("Adding fasta header annotations.");
    int[] selection = param.GetParamWithSubParams<int>("Fasta header annotations").GetSubParameters().GetParam<int[]>("Annotations").Value;
    string[][] idsToBeAnnotated = (param.GetParamWithSubParams<int>("Fasta header annotations").Value == 0)
        ? proteinIds
        : leadingIds;
    ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        List<ProteinSequence> rowEntries = new List<ProteinSequence>();
        foreach (string id in idsToBeAnnotated[row]) {
            ProteinSequence entry = fasta.GetEntry(id);
            if (entry == null) {
                continue;
            }
            rowEntries.Add(entry);
        }
        fastaEntries[row] = rowEntries.ToArray();
    }
    if (ArrayUtils.Contains(selection, 0)) {
        // Entry name
        AddJoinedTextAnnotationColumn(mdata, fastaEntries, "Entry name", entry => entry.EntryName);
    }
    if (ArrayUtils.Contains(selection, 1)) {
        // Gene name
        AddJoinedTextAnnotationColumn(mdata, fastaEntries, "Gene name", entry => entry.GeneName);
    }
    if (ArrayUtils.Contains(selection, 2)) {
        // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
        // 'Isoform x of...' prefixes and '(Fragment)' suffixes
        AddJoinedTextAnnotationColumn(mdata, fastaEntries, "Protein name (verbose)", entry => entry.ProteinName);
    }
    if (ArrayUtils.Contains(selection, 3)) {
        // Consensus protein name
        AddJoinedTextAnnotationColumn(mdata, fastaEntries, "Protein name", entry => entry.ConsensusProteinName);
    }
    if (ArrayUtils.Contains(selection, 4)) {
        // Species
        AddJoinedTextAnnotationColumn(mdata, fastaEntries, "Species", entry => entry.Species);
    }

    // Numeric annotations
    processInfo.Status("Adding numeric annotations.");
    selection = param.GetParamWithSubParams<int>("Numeric annotations").GetSubParameters().GetParam<int[]>("Annotations").Value;
    bool annotateLeadingId = (param.GetParamWithSubParams<int>("Numeric annotations").Value == 1);
    if (ArrayUtils.Contains(selection, 0)) {
        // Sequence length
        AddMedianNumericAnnotationColumn(mdata, fastaEntries, "Sequence length", annotateLeadingId,
            entry => entry.GetSequence().Length);
    }
    if (ArrayUtils.Contains(selection, 1)) {
        // Monoisotopic molecular mass
        AddMedianNumericAnnotationColumn(mdata, fastaEntries, "Monoisotopic molecular mass", annotateLeadingId,
            entry => entry.GetMonoisotopicMolecularMass());
    }
    if (ArrayUtils.Contains(selection, 2)) {
        // Average molecular mass
        AddMedianNumericAnnotationColumn(mdata, fastaEntries, "Average molecular mass", annotateLeadingId,
            entry => entry.GetAverageMolecularMass());
    }

    // Theoretical peptides
    processInfo.Status("Calculating theoretical peptides.");
    annotateLeadingId = (param.GetParamWithSubParams<int>("Calculate theoretical peptides").Value == 1);
    Parameters peptideParams = param.GetParamWithSubParams<int>("Calculate theoretical peptides").GetSubParameters();
    Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases, peptideParams.GetParam<int[]>("Proteases").Value);
    double minLength = peptideParams.GetParam<double>("Min. peptide length").Value;
    double maxLength = peptideParams.GetParam<double>("Max. peptide length").Value;
    // "Show sequences" is only consulted when annotating the leading id.
    bool displayPeptideSequences = annotateLeadingId && peptideParams.GetParam<bool>("Show sequences").Value;
    int minPeptideLength = (int) minLength;
    int maxPeptideLength = (int) maxLength;
    foreach (Protease protease in proteases) {
        double[] annotationColumn = new double[mdata.RowCount];
        string[] peptideColumn = new string[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++) {
            List<double> rowAnnotations = new List<double>();
            List<string> rowPeptides = new List<string>();
            foreach (ProteinSequence entry in fastaEntries[row]) {
                double nTheoreticalPeptides = entry.GetNumberOfTheoreticalPeptides(protease, minPeptideLength, maxPeptideLength);
                rowAnnotations.Add(nTheoreticalPeptides);
                if (displayPeptideSequences) {
                    rowPeptides.AddRange(entry.GetTheoreticalPeptideSequences(protease, minPeptideLength, maxPeptideLength));
                }
                if (annotateLeadingId) {
                    break;
                }
            }
            annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
            peptideColumn[row] = String.Join(";", rowPeptides);
        }
        mdata.AddNumericColumn(
            "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
            annotationColumn);
        if (displayPeptideSequences) {
            mdata.AddStringColumn(
                "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
                peptideColumn);
        }
    }

    // Sequence features
    processInfo.Status("Counting sequence features.");
    annotateLeadingId = (param.GetParamWithSubParams<int>("Count sequence features").Value == 1);
    Parameters featureParams = param.GetParamWithSubParams<int>("Count sequence features").GetSubParameters();
    bool normalizeBySequenceLength = featureParams.GetParam<bool>("Normalize by sequence length").Value;
    string pattern = featureParams.GetParam<string>("Regex").Value;
    if (pattern != "") {
        Regex regex;
        try {
            regex = new Regex(pattern);
        } catch (ArgumentException) {
            processInfo.ErrString = "The regular expression you provided has invalid syntax.";
            return;
        }
        AddMedianNumericAnnotationColumn(mdata, fastaEntries,
            (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")",
            annotateLeadingId,
            entry => {
                double nFeatures = regex.Matches(entry.GetSequence()).Count;
                return normalizeBySequenceLength ? nFeatures / entry.GetLength() : nFeatures;
            });
    }
    processInfo.Status("Done.");
}

/// <summary>
/// Adds a text column in which every row holds the distinct non-null values of
/// <paramref name="getValue"/> over that row's FASTA entries, joined with ';'.
/// </summary>
private static void AddJoinedTextAnnotationColumn(IMatrixData mdata, ProteinSequence[][] fastaEntries, string columnName, Func<ProteinSequence, string> getValue) {
    string[] annotationColumn = new string[mdata.RowCount];
    for (int row = 0; row < mdata.RowCount; row++) {
        List<string> rowAnnotations = new List<string>();
        foreach (ProteinSequence entry in fastaEntries[row]) {
            string value = getValue(entry);
            if (value != null && !ArrayUtils.Contains(rowAnnotations, value)) {
                rowAnnotations.Add(value);
            }
        }
        annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
    }
    mdata.AddStringColumn(columnName, "", annotationColumn);
}

/// <summary>
/// Adds a numeric column in which every row holds the median of <paramref name="getValue"/> over
/// that row's FASTA entries, or over the first entry only when <paramref name="firstEntryOnly"/> is set.
/// </summary>
private static void AddMedianNumericAnnotationColumn(IMatrixData mdata, ProteinSequence[][] fastaEntries, string columnName, bool firstEntryOnly, Func<ProteinSequence, double> getValue) {
    double[] annotationColumn = new double[mdata.RowCount];
    for (int row = 0; row < mdata.RowCount; row++) {
        List<double> rowAnnotations = new List<double>();
        foreach (ProteinSequence entry in fastaEntries[row]) {
            rowAnnotations.Add(getValue(entry));
            if (firstEntryOnly) {
                break;
            }
        }
        annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
    }
    mdata.AddNumericColumn(columnName, "", annotationColumn);
}
/// <summary>
/// Adds one numeric column per group, named after the group, holding for every row the result
/// of <paramref name="func"/> applied to that row's finite values in the group's expression columns.
/// </summary>
/// <param name="groupColInd">Index of the category row that defines the groups.</param>
/// <param name="validVals">Minimum number of finite values required; below it the cell is NaN.</param>
/// <param name="mdata">Matrix that is read and extended; its expression columns are left in place.</param>
/// <param name="func">Aggregate applied to the collected values.</param>
/// <remarks>
/// The aggregate is stored at full double precision. It used to be narrowed to <c>float</c>
/// before being written into the double-valued numeric column, which discarded precision.
/// </remarks>
private static void FillMatrixKeep(int groupColInd, int validVals, IMatrixData mdata, Func<IList<double>, double> func) {
    string[][] groupCol = mdata.GetCategoryRowAt(groupColInd);
    string[] groupNames = ArrayUtils.UniqueValuesPreserveOrder(groupCol);
    int[][] colInds = PerseusPluginUtils.GetExpressionColIndices(groupCol, groupNames);
    double[][] newNumCols = new double[groupNames.Length][];
    for (int i = 0; i < newNumCols.Length; i++) {
        newNumCols[i] = new double[mdata.RowCount];
    }
    for (int i = 0; i < mdata.RowCount; i++) {
        for (int j = 0; j < groupNames.Length; j++) {
            List<double> vals = new List<double>();
            foreach (int ind in colInds[j]) {
                double val = mdata[i, ind];
                if (!double.IsNaN(val) && !double.IsInfinity(val)) {
                    vals.Add(val);
                }
            }
            // Too few usable values leave the cell undefined.
            newNumCols[j][i] = vals.Count >= validVals ? func(vals) : double.NaN;
        }
    }
    for (int i = 0; i < groupNames.Length; i++) {
        mdata.AddNumericColumn(groupNames[i], groupNames[i], newNumCols[i]);
    }
}
/// <summary>
/// For each (ratio, intensity) column pair, adds a "Significance B" numeric column with the
/// p-values from <c>CalcSignificanceB</c> and a "B significant" category column derived from them.
/// </summary>
/// <remarks>
/// Sets <c>processInfo.ErrString</c> and returns when no ratio column is selected or when the
/// numbers of ratio and intensity columns differ. An intensity index below the main column count
/// refers to a main column; larger indices refer to numeric columns, offset by the main column count.
/// "Use for truncation" index 0 selects plain p-value thresholding and index 1 Benjamini-Hochberg;
/// any other index maps to permutation-based truncation, which is not handled here and throws as
/// soon as the first pair is processed. "Side" must be 0 (both), 1 (left) or 2 (right).
/// </remarks>
/// <exception cref="Exception">Unsupported "Side" value or truncation mode.</exception>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] rcols = param.GetParam<int[]>("Ratio columns").Value;
    int[] icols = param.GetParam<int[]>("Intensity columns").Value;
    if (rcols.Length == 0) {
        processInfo.ErrString = "Please specify some ratio columns.";
        return;
    }
    if (rcols.Length != icols.Length) {
        processInfo.ErrString = "The number of ratio and intensity columns have to be equal.";
        return;
    }
    int truncIndex = param.GetParam<int>("Use for truncation").Value;
    TestTruncation truncation;
    switch (truncIndex) {
        case 0:
            truncation = TestTruncation.Pvalue;
            break;
        case 1:
            truncation = TestTruncation.BenjaminiHochberg;
            break;
        default:
            truncation = TestTruncation.PermutationBased;
            break;
    }
    double threshold = param.GetParam<double>("Threshold value").Value;
    int sideInd = param.GetParam<int>("Side").Value;
    TestSide side;
    if (sideInd == 0) {
        side = TestSide.Both;
    } else if (sideInd == 1) {
        side = TestSide.Left;
    } else if (sideInd == 2) {
        side = TestSide.Right;
    } else {
        throw new Exception("Never get here.");
    }

    for (int pair = 0; pair < rcols.Length; pair++) {
        int ratioCol = rcols[pair];
        int intensityCol = icols[pair];
        BaseVector ratios = mdata.Values.GetColumn(ratioCol);
        BaseVector intensities;
        if (intensityCol < mdata.ColumnCount) {
            intensities = mdata.Values.GetColumn(intensityCol);
        } else {
            intensities = new DoubleArrayVector(mdata.NumericColumns[intensityCol - mdata.ColumnCount]);
        }
        double[] pvals = CalcSignificanceB(ratios, intensities, side);
        string[][] significant;
        if (truncation == TestTruncation.Pvalue) {
            significant = PerseusPluginUtils.CalcPvalueSignificance(pvals, threshold);
        } else if (truncation == TestTruncation.BenjaminiHochberg) {
            // The FDR values themselves are not stored, only the significance flags.
            significant = PerseusPluginUtils.CalcBenjaminiHochbergFdr(pvals, threshold, out double[] ignoredFdrs);
        } else {
            throw new Exception("Never get here.");
        }
        string ratioName = mdata.ColumnNames[ratioCol];
        mdata.AddNumericColumn(ratioName + " Significance B", "", pvals);
        mdata.AddCategoryColumn(ratioName + " B significant", "", significant);
    }
}
/// <summary>
/// For every selected (x, y) column pair, evaluates a gridded density estimate at each data
/// point and appends two numeric columns: the density and the excluded fraction at that point.
/// </summary>
/// <param name="mdata">Matrix that is read and extended with the result columns.</param>
/// <param name="param">Supplies "x", "y", "Distribution type" and "Number of points".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives an error message when the column selection is invalid.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] colIndx = param.GetParam<int[]>("x").Value;
    int[] colIndy = param.GetParam<int[]>("y").Value;
    if (colIndx.Length == 0) {
        processInfo.ErrString = "Please select some columns";
        return;
    }
    if (colIndx.Length != colIndy.Length) {
        processInfo.ErrString = "Please select the same number of columns in the boxes for the first and second columns.";
        return;
    }
    int typeInd = param.GetParam<int>("Distribution type").Value;
    int points = param.GetParam<int>("Number of points").Value;
    for (int pair = 0; pair < colIndx.Length; pair++) {
        int xCol = colIndx[pair];
        int yCol = colIndy[pair];
        float[] xvals = GetColumn(mdata, xCol);
        float[] yvals = GetColumn(mdata, yCol);
        NumUtils.GetValidPairs(xvals, yvals, out float[] xValid, out float[] yValid);
        DensityEstimation.CalcRanges(xValid, yValid, out double xmin, out double xmax, out double ymin,
            out double ymax);
        double xSpan = xmax - xmin;
        double ySpan = ymax - ymin;
        float[,] density = DensityEstimation.GetValuesOnGrid(xValid, xmin, xSpan / points, points, yValid, ymin,
            ySpan / points, points);
        switch (typeInd) {
            case 1:
                MakeConditional1(density);
                break;
            case 2:
                MakeConditional2(density);
                break;
            case 3:
                MakeConditional3(density);
                break;
        }
        DensityEstimation.DivideByMaximum(density);
        // Grid coordinates along both axes; the data points are snapped to the closest one.
        double[] xGrid = new double[points];
        double[] yGrid = new double[points];
        for (int g = 0; g < points; g++) {
            xGrid[g] = xmin + g * xSpan / points;
            yGrid[g] = ymin + g * ySpan / points;
        }
        float[,] excluded = CalcExcludedPercentage(density);
        double[] dvals = new double[xvals.Length];
        double[] pvals = new double[xvals.Length];
        for (int row = 0; row < dvals.Length; row++) {
            double px = xvals[row];
            double py = yvals[row];
            if (double.IsNaN(px) || double.IsNaN(py)) {
                dvals[row] = double.NaN;
                pvals[row] = double.NaN;
                continue;
            }
            int xi = ArrayUtils.ClosestIndex(xGrid, px);
            int yi = ArrayUtils.ClosestIndex(yGrid, py);
            dvals[row] = density[xi, yi];
            pvals[row] = excluded[xi, yi];
        }
        string xname = GetColumnName(mdata, xCol);
        string yname = GetColumnName(mdata, yCol);
        mdata.AddNumericColumn("Density_" + xname + "_" + yname,
            "Density of data points in the plane spanned by the columns " + xname + " and " + yname + ".", dvals);
        mdata.AddNumericColumn("Excluded fraction_" + xname + "_" + yname,
            "Percentage of points with a point density smaller than at this point in the plane spanned by the columns " +
            xname + " and " + yname + ".", pvals);
    }
}
/// <summary>
/// Reads the file "results.csv" from the current working directory and appends its contents to
/// <paramref name="mdata"/>: one numeric column per statistic (baseMean, log2FoldChange, lfcSE,
/// stat, p-value, padj), a -log10 column for p-value and padj, and the categorical columns
/// "Valid" and "Significant". All column names are prefixed with "pair1_vs_pair2_".
/// </summary>
/// <remarks>
/// The first non-blank line of the file is treated as a header. Numbers are parsed with the
/// invariant culture because the file is machine-written with '.' as decimal separator; parsing
/// with the current culture would misread them on systems that use ',' as decimal separator.
/// </remarks>
/// <param name="mdata">Matrix that receives the new columns; its row count sizes the result arrays.</param>
/// <param name="pair1">First group name, used in the column-name prefix.</param>
/// <param name="pair2">Second group name, used in the column-name prefix.</param>
/// <param name="fdrValid">Passed through to <c>CheckSignificant</c>.</param>
/// <param name="pValid">Passed through to <c>CheckSignificant</c>.</param>
/// <param name="lfcValid">Passed through to <c>CheckSignificant</c>.</param>
/// <exception cref="ArgumentNullException">A matrix row has no corresponding line in the file.</exception>
/// <exception cref="FormatException">A field is not a valid number.</exception>
public void ExtractDESeq2Results(IMatrixData mdata, string pair1, string pair2, ParameterWithSubParams<bool> fdrValid,
    ParameterWithSubParams<bool> pValid, ParameterWithSubParams<bool> lfcValid) {
    int rowCount = mdata.Values.RowCount;
    string[][] validCol = new string[rowCount][];
    string[][] sigCol = new string[rowCount][];
    Dictionary<string, string[]> results = new Dictionary<string, string[]> {
        { "baseMean", new string[rowCount] },
        { "log2FoldChange", new string[rowCount] },
        { "lfcSE", new string[rowCount] },
        { "stat", new string[rowCount] },
        { "p-value", new string[rowCount] },
        { "padj", new string[rowCount] }
    };
    int lineNum = 0;
    // The using block closes the file even if a malformed line makes the loop throw.
    using (StreamReader reader = new StreamReader(File.OpenRead("results.csv"))) {
        while (!reader.EndOfStream) {
            string line = reader.ReadLine();
            if (!String.IsNullOrWhiteSpace(line)) {
                line = line.Replace("\"", "");
                string[] info = line.Split(',');
                if (lineNum != 0) {
                    int row = lineNum - 1;
                    validCol[row] = new string[] { "+" };
                    sigCol[row] = new string[] { "Not Valid" };
                    for (int v = 0; v < info.Length; v++) {
                        if (info[v] != "NA") {
                            continue;
                        }
                        // Missing values get neutral stand-ins and mark the row as not valid.
                        if (v == 3 || v == 5 || v == 6) {
                            info[v] = "1";
                        } else if (v == 2 || v == 4) {
                            info[v] = "0";
                        }
                        validCol[row][0] = "-";
                    }
                    if (validCol[row][0] == "+") {
                        CheckSignificant(sigCol, info, fdrValid, pValid, lfcValid, lineNum);
                    }
                    results["baseMean"][row] = info[1];
                    results["log2FoldChange"][row] = info[2];
                    results["lfcSE"][row] = info[3];
                    results["stat"][row] = info[4];
                    results["p-value"][row] = info[5];
                    results["padj"][row] = info[6];
                }
            }
            // Blank lines are counted as well, exactly like data lines.
            lineNum++;
        }
    }
    string prefix = pair1 + "_vs_" + pair2 + "_";
    foreach (KeyValuePair<string, string[]> entry in results) {
        double[] parsed = Array.ConvertAll(entry.Value,
            s => double.Parse(s, System.Globalization.CultureInfo.InvariantCulture));
        mdata.AddNumericColumn(prefix + entry.Key, prefix + entry.Key, parsed);
        if (entry.Key == "p-value" || entry.Key == "padj") {
            double[] negLog = new double[parsed.Length];
            for (int i = 0; i < parsed.Length; i++) {
                double p = parsed[i];
                // log10(0) is -infinity, so a zero p-value is capped at -log10(1 / double.MaxValue).
                negLog[i] = p == 0 ? Math.Log10(1 / Double.MaxValue) * -1 : Math.Log10(p) * -1;
            }
            mdata.AddNumericColumn(prefix + "-log10" + entry.Key, prefix + "-log10" + entry.Key, negLog);
        }
    }
    mdata.AddCategoryColumn(prefix + "Valid", prefix + "Valid", validCol);
    mdata.AddCategoryColumn(prefix + "Significant", prefix + "Significant", sigCol);
}
/// <summary>
/// Converts the selected intensity columns into per-protein copy numbers, concentrations, mass and
/// mole fractions and ranks, and appends the requested outputs to <paramref name="mdata"/>.
/// </summary>
/// <remarks>
/// Steps: collect the intensity columns, optionally replace them by their row-wise median
/// ("Averaging mode" 3), optionally undo a logarithm, derive one scaling factor per column
/// ("Scaling mode" 0: total protein amount, 1: histones, otherwise 1), optionally average the
/// factors over all columns ("Averaging mode" 1) or per group ("Averaging mode" 2), then compute
/// and write the outputs. A "Histones" category column is always added.
/// </remarks>
/// <param name="mdata">Matrix that is read and extended with columns and annotation rows.</param>
/// <param name="param">Plugin parameters; see the string keys read below.</param>
/// <param name="supplTables">Set to a one-element array holding the summary matrix when output 7
/// is selected and the columns are not averaged; otherwise left untouched.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives error messages.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] outputColumns = param.GetParam<int[]>("Output").Value;
    int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
    string[] proteinIds = mdata.StringColumns[proteinIdColumnInd];
    int[] intensityCols = param.GetParam<int[]>("Intensities").Value;
    if (intensityCols.Length == 0) {
        processInfo.ErrString = "Please select at least one column containing protein intensities.";
        return;
    }
    // variable to hold all intensity values
    List<double[]> columns = new List<double[]>();
    string[] inputNames = new string[intensityCols.Length];
    string[] sampleNames = new string[intensityCols.Length];
    Regex sampleNamePattern = new Regex(@"^(?:(?:LFQ )?[Ii]ntensity )?(.*)$");
    for (int col = 0; col < intensityCols.Length; col++) {
        double[] values;
        // Indices past the main columns address the numeric annotation columns.
        if (intensityCols[col] < mdata.ColumnCount) {
            values = ArrayUtils.ToDoubles(mdata.Values.GetColumn(intensityCols[col]));
            inputNames[col] = mdata.ColumnNames[intensityCols[col]];
        } else {
            values = mdata.NumericColumns[intensityCols[col] - mdata.ColumnCount];
            inputNames[col] = mdata.NumericColumnNames[intensityCols[col] - mdata.ColumnCount];
        }
        // Strip a leading "Intensity " / "LFQ intensity " to obtain the sample name.
        sampleNames[col] = sampleNamePattern.Match(inputNames[col]).Groups[1].Value;
        columns.Add(values);
    }
    ParameterWithSubParams<int> averagingParam = param.GetParamWithSubParams<int>("Averaging mode");
    int averagingMode = averagingParam.Value;
    // average over columns if this option is selected
    if (averagingMode == 3) {
        double[] column = new double[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++) {
            double[] values = new double[intensityCols.Length];
            for (int col = 0; col < intensityCols.Length; col++) {
                values[col] = columns[col][row];
            }
            column[row] = ArrayUtils.Median(ExtractValidValues(values, false));
        }
        // replace the original list of columns by the single averaged column
        columns = new List<double[]> { column };
        sampleNames = new[] { "" };
    }
    // revert logarithm if necessary
    ParameterWithSubParams<bool> logParam = param.GetParamWithSubParams<bool>("Logarithmized");
    if (logParam.Value) {
        double[] logBases = { 2, Math.E, 10 };
        double logBase = logBases[logParam.GetSubParameters().GetParam<int>("log base").Value];
        foreach (double[] t in columns) {
            for (int row = 0; row < mdata.RowCount; row++) {
                if (t[row] == 0) {
                    // Only records the message; processing continues.
                    processInfo.ErrString = "Are the columns really logarithmized?\nThey contain zeroes!";
                }
                t[row] = Math.Pow(logBase, t[row]);
            }
        }
    }
    // NOTE(review): mw is the array obtained from mdata.NumericColumns, so the scaling below is
    // written into that array - confirm that modifying the matrix column is intended.
    double[] mw = mdata.NumericColumns[param.GetParam<int>("Molecular masses").Value];
    // define whether the molecular masses are given in Da or kDa
    if (ArrayUtils.Median(mw) < 250) // most likely kDa
    {
        for (int i = 0; i < mw.Length; i++) {
            mw[i] *= 1000;
        }
    }
    // Without a detectability correction this is the same array as mw.
    double[] detectabilityNormFactor = mw;
    ParameterWithSubParams<bool> detectabilityParam = param.GetParamWithSubParams<bool>("Detectability correction");
    if (detectabilityParam.Value) {
        detectabilityNormFactor =
            mdata.NumericColumns[detectabilityParam.GetSubParameters().GetParam<int>("Correction factor").Value];
    }
    // the normalization factor needs to be nonzero for all proteins
    // check and replace with 1 for all relevant cases
    for (int row = 0; row < mdata.RowCount; row++) {
        if (detectabilityNormFactor[row] == 0 || double.IsNaN(detectabilityNormFactor[row])) {
            detectabilityNormFactor[row] = 1;
        }
    }
    // detect the organism
    Organism organism = DetectOrganism(proteinIds);
    // c value the amount of DNA per haploid genome, see: http://en.wikipedia.org/wiki/C-value
    double cValue = organism.genomeSize * basePairWeight / avogadro;
    // find the histones
    int[] histoneRows = FindHistones(proteinIds, organism);
    // write a categorical column indicating the histones
    string[][] histoneCol = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        histoneCol[row] = ArrayUtils.Contains(histoneRows, row) ? new[] { "+" } : new string[0];
    }
    mdata.AddCategoryColumn("Histones", "", histoneCol);
    // initialize the variables for the annotation rows
    string[] sampleNameRow = new string[mdata.ColumnCount];
    string[] inputNameRow = new string[mdata.ColumnCount];
    double[] totalProteinRow = new double[mdata.ColumnCount];
    double[] totalMoleculesRow = new double[mdata.ColumnCount];
    string[][] organismRow = new string[mdata.ColumnCount][];
    // populate organismRow with a default entry (not null, which may cause errors when writing
    // the annotations in the end)
    for (int i = 0; i < organismRow.Length; i++) {
        organismRow[i] = new[] { "N/A" };
    }
    double[] histoneMassRow = new double[mdata.ColumnCount];
    double[] ploidyRow = new double[mdata.ColumnCount];
    double[] cellVolumeRow = new double[mdata.ColumnCount];
    double[] normalizationFactors = new double[columns.Count];
    // calculate normalization factors for each column
    ParameterWithSubParams<int> scalingParam = param.GetParamWithSubParams<int>("Scaling mode");
    for (int col = 0; col < columns.Count; col++) {
        double[] column = columns[col];
        // normalization factor to go from intensities to copies,
        // needs to be determined either using the total protein or the histone scaling approach
        double factor;
        switch (scalingParam.Value) {
            case 0: // total protein amount
            {
                double mwWeightedNormalizedSummedIntensities = 0;
                for (int row = 0; row < mdata.RowCount; row++) {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                        mwWeightedNormalizedSummedIntensities += column[row] / detectabilityNormFactor[row] * mw[row];
                    }
                }
                factor = scalingParam.GetSubParameters().GetParam<double>("Protein amount per cell [pg]").Value *
                         1e-12 * avogadro / mwWeightedNormalizedSummedIntensities;
                break;
            }
            case 1: // histone mode
            {
                double mwWeightedNormalizedSummedHistoneIntensities = 0;
                foreach (int row in histoneRows) {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                        mwWeightedNormalizedSummedHistoneIntensities +=
                            column[row] / detectabilityNormFactor[row] * mw[row];
                    }
                }
                double ploidy = scalingParam.GetSubParameters().GetParam<double>("Ploidy").Value;
                factor = cValue * ploidy * avogadro / mwWeightedNormalizedSummedHistoneIntensities;
                break;
            }
            default:
                factor = 1;
                break;
        }
        normalizationFactors[col] = factor;
    }
    // check averaging mode
    if (averagingMode == 1) // same factor for all
    {
        double factor = ArrayUtils.Mean(normalizationFactors);
        for (int i = 0; i < normalizationFactors.Length; i++) {
            normalizationFactors[i] = factor;
        }
    }
    if (averagingMode == 2) // same factor in each group
    {
        int groupingInd = averagingParam.GetSubParameters().GetParam<int>("Grouping").Value;
        if (groupingInd == -1) {
            processInfo.ErrString = "No grouping selected.";
            return;
        }
        string[][] groupNames = mdata.GetCategoryRowAt(groupingInd);
        string[] uniqueGroupNames = Unique(groupNames);
        int[] grouping = new int[columns.Count];
        for (int i = 0; i < columns.Count; i++) {
            // Key for a column that forms its own group: it lies beyond every real group index,
            // so it can never be merged with a group by accident.
            int ownKey = uniqueGroupNames.Length + i;
            int sourceCol = intensityCols[i];
            if (sourceCol >= mdata.ColumnCount) // Numeric annotation columns cannot be grouped
            {
                grouping[i] = ownKey;
                continue;
            }
            // The category row has one entry per main column, so it must be indexed with the
            // selected column, not with the position in the selection.
            string[] membership = groupNames[sourceCol];
            if (membership != null && membership.Length > 0 &&
                ArrayUtils.Contains(uniqueGroupNames, membership[0])) {
                grouping[i] = ArrayUtils.IndexOf(uniqueGroupNames, membership[0]);
                continue;
            }
            grouping[i] = ownKey;
        }
        Dictionary<int, List<double>> factors = new Dictionary<int, List<double>>();
        for (int i = 0; i < columns.Count; i++) {
            List<double> bucket;
            if (!factors.TryGetValue(grouping[i], out bucket)) {
                bucket = new List<double>();
                factors.Add(grouping[i], bucket);
            }
            bucket.Add(normalizationFactors[i]);
        }
        double[] averagedNormalizationFactors = new double[columns.Count];
        for (int i = 0; i < columns.Count; i++) {
            averagedNormalizationFactors[i] = ArrayUtils.Mean(factors[grouping[i]]);
        }
        normalizationFactors = averagedNormalizationFactors;
    }
    double proteinConcentration = param.GetParam<double>("Total cellular protein concentration [g/l]").Value;
    // loop over all selected columns and calculate copy numbers
    for (int col = 0; col < columns.Count; col++) {
        string sampleName = sampleNames[col];
        double[] column = columns[col];
        double factor = normalizationFactors[col];
        double[] copyNumbers = new double[mdata.RowCount];
        double[] concentrations = new double[mdata.RowCount]; // nanomolar
        double[] massFraction = new double[mdata.RowCount];
        double[] moleFraction = new double[mdata.RowCount];
        double totalProtein = 0; // picograms
        double histoneMass = 0; // picograms
        double totalMolecules = 0;
        for (int row = 0; row < mdata.RowCount; row++) {
            if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                copyNumbers[row] = column[row] / detectabilityNormFactor[row] * factor;
                totalMolecules += copyNumbers[row];
                totalProtein += copyNumbers[row] * mw[row] * 1e12 / avogadro; // picograms
                if (ArrayUtils.Contains(histoneRows, row)) {
                    histoneMass += copyNumbers[row] * mw[row] * 1e12 / avogadro; // picograms
                }
            }
        }
        double totalVolume = totalProtein / proteinConcentration * 1000; // femtoliters
        for (int row = 0; row < mdata.RowCount; row++) {
            if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                concentrations[row] = copyNumbers[row] / (totalVolume * 1e-15) / avogadro * 1e9; // nanomolar
                massFraction[row] = copyNumbers[row] * mw[row] * 1e12 / avogadro / totalProtein * 1e6; // ppm
                moleFraction[row] = copyNumbers[row] / totalMolecules * 1e6; // ppm
            }
        }
        string suffix = sampleName == "" ? "" : " " + sampleName;
        if (ArrayUtils.Contains(outputColumns, 0)) {
            mdata.AddNumericColumn("Copy number" + suffix, "", copyNumbers);
        }
        if (ArrayUtils.Contains(outputColumns, 1)) {
            mdata.AddNumericColumn("Concentration [nM]" + suffix, "", concentrations);
        }
        if (ArrayUtils.Contains(outputColumns, 2)) {
            mdata.AddNumericColumn("Abundance (mass/total mass) [*10^-6]" + suffix, "", massFraction);
        }
        if (ArrayUtils.Contains(outputColumns, 3)) {
            mdata.AddNumericColumn("Abundance (molecules/total molecules) [*10^-6]" + suffix, "", moleFraction);
        }
        double[] rank = ArrayUtils.Rank(copyNumbers);
        double[] relativeRank = new double[mdata.RowCount];
        double validRanks = mdata.RowCount;
        for (int row = 0; row < mdata.RowCount; row++) {
            // remove rank for protein with no copy number information
            if (double.IsNaN(copyNumbers[row]) || copyNumbers[row] == 0) {
                rank[row] = double.NaN;
                validRanks--; // do not consider as valid
            }
            // invert ranking, so that rank 0 is the most abundant protein (NaN stays NaN)
            rank[row] = mdata.RowCount - rank[row];
        }
        for (int row = 0; row < mdata.RowCount; row++) {
            relativeRank[row] = rank[row] / validRanks;
        }
        if (ArrayUtils.Contains(outputColumns, 4)) {
            mdata.AddNumericColumn("Copy number rank" + suffix, "", rank);
        }
        if (ArrayUtils.Contains(outputColumns, 5)) {
            mdata.AddNumericColumn("Relative copy number rank" + suffix, "", relativeRank);
        }
        // Per-sample summary values exist only for main columns that were not averaged away.
        if (intensityCols[col] < mdata.ColumnCount && averagingMode != 3) {
            int target = intensityCols[col];
            inputNameRow[target] = inputNames[col];
            sampleNameRow[target] = sampleNames[col];
            totalProteinRow[target] = Math.Round(totalProtein, 2);
            totalMoleculesRow[target] = Math.Round(totalMolecules, 0);
            organismRow[target] = new[] { organism.name };
            histoneMassRow[target] = Math.Round(histoneMass, 4);
            ploidyRow[target] = Math.Round(histoneMass * 1e-12 / cValue, 2);
            cellVolumeRow[target] = Math.Round(totalVolume, 2); // femtoliters
        }
    }
    // Summary annotation row
    if (averagingMode != 3 && ArrayUtils.Contains(outputColumns, 6)) {
        mdata.AddNumericRow("Total protein [pg/cell]", "", totalProteinRow);
        mdata.AddNumericRow("Total molecules per cell", "", totalMoleculesRow);
        mdata.AddCategoryRow("Organism", "", organismRow);
        mdata.AddNumericRow("Histone mass [pg/cell]", "", histoneMassRow);
        mdata.AddNumericRow("Ploidy", "", ploidyRow);
        mdata.AddNumericRow("Cell volume [fl]", "", cellVolumeRow);
    }
    // Summary matrix
    if (averagingMode != 3 && ArrayUtils.Contains(outputColumns, 7)) {
        supplTables = new IMatrixData[1];
        IMatrixData supplTab = PerseusFactory.CreateMatrixData();
        supplTab.ColumnNames = new List<string>();
        supplTab.Values.Init(totalProteinRow.Length, 0);
        supplTab.SetAnnotationColumns(new List<string> { "Sample", "Input Column" },
            new List<string[]>() { sampleNameRow, inputNameRow }, new List<string>() { "Organism" },
            new List<string[][]>() { organismRow },
            new List<string>() {
                "Total protein [pg/cell]", "Total molecules per cell", "Histone mass [pg/cell]", "Ploidy",
                "Cell volume [fl]"
            },
            new List<double[]>() { totalProteinRow, totalMoleculesRow, histoneMassRow, ploidyRow, cellVolumeRow },
            new List<string>(), new List<double[][]>());
        supplTables[0] = supplTab;
    }
}
/// <summary>
/// Computes the selected row-wise summary statistics from <c>procs</c> over the chosen expression
/// columns and appends one numeric column per statistic. When the column selection is "by group"
/// (choice 2), every statistic is additionally computed per group and appended as
/// "&lt;statistic&gt; &lt;group&gt;".
/// </summary>
/// <param name="mdata">Matrix that is read and extended with the result columns.</param>
/// <param name="param">Supplies "Expression column selection" (with "Group" or "Columns") and "Calculate".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Not used by this method.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    SingleChoiceWithSubParams selection = param.GetSingleChoiceWithSubParams("Expression column selection");
    bool groups = selection.Value == 2;
    string[] groupNames = null;
    int[][] groupCols = null;
    if (groups) {
        int groupRowInd = selection.GetSubParameters().GetSingleChoiceParam("Group").Value;
        string[][] categoryRow = mdata.GetCategoryRowAt(groupRowInd);
        groupNames = ArrayUtils.UniqueValuesPreserveOrder(categoryRow);
        groupCols = PerseusPluginUtils.GetExpressionColIndices(categoryRow, groupNames);
    }
    int[] useCols;
    if (selection.Value == 1) {
        useCols = selection.GetSubParameters().GetMultiChoiceParam("Columns").Value;
    } else {
        useCols = ArrayUtils.ConsecutiveInts(mdata.ExpressionColumnCount);
    }
    HashSet<int> wanted = ArrayUtils.ToHashSet(param.GetMultiChoiceParam("Calculate").Value);
    int nProcs = procs.Length;
    bool[] include = new bool[nProcs];
    double[][] overall = new double[nProcs][];
    double[][][] perGroup = null;
    if (groups) {
        perGroup = new double[nProcs][][];
        for (int p = 0; p < perGroup.Length; p++) {
            perGroup[p] = new double[groupNames.Length][];
        }
    }
    // Allocate result arrays only for the statistics that were actually requested.
    for (int p = 0; p < nProcs; p++) {
        include[p] = wanted.Contains(p);
        if (!include[p]) {
            continue;
        }
        overall[p] = new double[mdata.RowCount];
        if (!groups) {
            continue;
        }
        for (int g = 0; g < groupNames.Length; g++) {
            perGroup[p][g] = new double[mdata.RowCount];
        }
    }
    for (int row = 0; row < mdata.RowCount; row++) {
        List<double> finite = new List<double>();
        foreach (int c in useCols) {
            double cell = mdata[row, c];
            if (double.IsNaN(cell) || double.IsInfinity(cell)) {
                continue;
            }
            finite.Add(cell);
        }
        for (int p = 0; p < nProcs; p++) {
            if (include[p]) {
                overall[p][row] = procs[p].Item2(finite);
            }
        }
        if (!groups) {
            continue;
        }
        List<double>[] finiteByGroup = new List<double>[groupNames.Length];
        for (int g = 0; g < groupCols.Length; g++) {
            finiteByGroup[g] = new List<double>();
            foreach (int c in groupCols[g]) {
                double cell = mdata[row, c];
                if (double.IsNaN(cell) || double.IsInfinity(cell)) {
                    continue;
                }
                finiteByGroup[g].Add(cell);
            }
        }
        for (int p = 0; p < nProcs; p++) {
            if (!include[p]) {
                continue;
            }
            for (int g = 0; g < groupNames.Length; g++) {
                perGroup[p][g][row] = procs[p].Item2(finiteByGroup[g]);
            }
        }
    }
    for (int p = 0; p < nProcs; p++) {
        if (!include[p]) {
            continue;
        }
        mdata.AddNumericColumn(procs[p].Item1, procs[p].Item3, overall[p]);
        if (!groups) {
            continue;
        }
        for (int g = 0; g < groupNames.Length; g++) {
            mdata.AddNumericColumn(procs[p].Item1 + " " + groupNames[g], procs[p].Item3, perGroup[p][g]);
        }
    }
}
/// <summary>
/// Converts the selected intensity columns into per-protein copy numbers, concentrations, mass and
/// mole fractions and ranks, and appends the requested outputs to <paramref name="mdata"/>.
/// </summary>
/// <remarks>
/// Steps: collect the intensity columns, optionally replace them by their row-wise median
/// ("Averaging mode" 3), optionally undo a logarithm, derive one scaling factor per column
/// ("Scaling mode" 0: total protein amount, 1: histones, otherwise 1), optionally average the
/// factors over all columns ("Averaging mode" 1) or per group ("Averaging mode" 2), then compute
/// and write the outputs. A "Histones" category column is always added.
/// </remarks>
/// <param name="mdata">Matrix that is read and extended with columns and annotation rows.</param>
/// <param name="param">Plugin parameters; see the string keys read below.</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives error messages.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] outputColumns = param.GetMultiChoiceParam("Output").Value;
    int proteinIdColumnInd = param.GetSingleChoiceParam("Protein IDs").Value;
    string[] proteinIds = mdata.StringColumns[proteinIdColumnInd];
    int[] intensityCols = param.GetMultiChoiceParam("Intensities").Value;
    if (intensityCols.Length == 0) {
        processInfo.ErrString = "Please select at least one column containing protein intensities.";
        return;
    }
    // variable to hold all intensity values
    List<double[]> columns = new List<double[]>();
    string[] sampleNames = new string[intensityCols.Length];
    Regex sampleNamePattern = new Regex(@"^(?:(?:LFQ )?[Ii]ntensity )?(.*)$");
    for (int col = 0; col < intensityCols.Length; col++) {
        double[] values;
        string inputName;
        // Indices past the expression columns address the numeric annotation columns.
        if (intensityCols[col] < mdata.ExpressionColumnCount) {
            values = ArrayUtils.ToDoubles(mdata.GetExpressionColumn(intensityCols[col]));
            inputName = mdata.ExpressionColumnNames[intensityCols[col]];
        } else {
            values = mdata.NumericColumns[intensityCols[col] - mdata.ExpressionColumnCount];
            inputName = mdata.NumericColumnNames[intensityCols[col] - mdata.ExpressionColumnCount];
        }
        // Strip a leading "Intensity " / "LFQ intensity " to obtain the sample name.
        sampleNames[col] = sampleNamePattern.Match(inputName).Groups[1].Value;
        columns.Add(values);
    }
    int averagingMode = param.GetSingleChoiceWithSubParams("Averaging mode").Value;
    // average over columns if this option is selected
    if (averagingMode == 3) {
        double[] column = new double[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++) {
            double[] values = new double[intensityCols.Length];
            for (int col = 0; col < intensityCols.Length; col++) {
                values[col] = columns[col][row];
            }
            column[row] = ArrayUtils.Median(ExtractValidValues(values, false));
        }
        // replace the original list of columns by the single averaged column
        columns = new List<double[]> { column };
        sampleNames = new[] { "" };
    }
    // revert logarithm if necessary
    if (param.GetBoolWithSubParams("Logarithmized").Value) {
        double[] logBases = { 2, Math.E, 10 };
        double logBase =
            logBases[param.GetBoolWithSubParams("Logarithmized").GetSubParameters().GetSingleChoiceParam("log base").Value];
        foreach (double[] t in columns) {
            for (int row = 0; row < mdata.RowCount; row++) {
                if (t[row] == 0) {
                    // Only records the message; processing continues.
                    processInfo.ErrString = "Are the columns really logarithmized?\nThey contain zeroes!";
                }
                t[row] = Math.Pow(logBase, t[row]);
            }
        }
    }
    // NOTE(review): mw is the array obtained from mdata.NumericColumns, so the scaling below is
    // written into that array - confirm that modifying the matrix column is intended.
    double[] mw = mdata.NumericColumns[param.GetSingleChoiceParam("Molecular masses").Value];
    // detect whether the molecular masses are given in Da or kDa
    if (ArrayUtils.Median(mw) < 250) // likely kDa
    {
        for (int i = 0; i < mw.Length; i++) {
            mw[i] *= 1000;
        }
    }
    // Without a detectability correction this is the same array as mw.
    double[] detectabilityNormFactor = mw;
    if (param.GetBoolWithSubParams("Detectability correction").Value) {
        detectabilityNormFactor =
            mdata.NumericColumns[
                param.GetBoolWithSubParams("Detectability correction")
                    .GetSubParameters()
                    .GetSingleChoiceParam("Correction factor")
                    .Value];
    }
    // the normalization factor needs to be nonzero for all proteins
    // check and replace with 1 for all relevant cases
    for (int row = 0; row < mdata.RowCount; row++) {
        // NaN never compares equal to anything, so it has to be tested with double.IsNaN.
        if (detectabilityNormFactor[row] == 0 || double.IsNaN(detectabilityNormFactor[row])) {
            detectabilityNormFactor[row] = 1;
        }
    }
    // detect the organism
    Organism organism = DetectOrganism(proteinIds);
    // c value the amount of DNA per cell, see: http://en.wikipedia.org/wiki/C-value
    double cValue = (organism.genomeSize * basePairWeight) / avogadro;
    // find the histones
    int[] histoneRows = FindHistones(proteinIds, organism);
    // write a categorical column indicating the histones
    string[][] histoneCol = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        histoneCol[row] = ArrayUtils.Contains(histoneRows, row) ? new[] { "+" } : new[] { "" };
    }
    mdata.AddCategoryColumn("Histones", "", histoneCol);
    // initialize the variables for the annotation rows
    double[] totalProteinRow = new double[mdata.ExpressionColumnCount];
    double[] totalMoleculesRow = new double[mdata.ExpressionColumnCount];
    string[][] organismRow = new string[mdata.ExpressionColumnCount][];
    // Give every column a default entry so that columns which were not selected do not carry a
    // null category array into the annotation row.
    for (int i = 0; i < organismRow.Length; i++) {
        organismRow[i] = new[] { "N/A" };
    }
    double[] histoneMassRow = new double[mdata.ExpressionColumnCount];
    double[] ploidyRow = new double[mdata.ExpressionColumnCount];
    double[] cellVolumeRow = new double[mdata.ExpressionColumnCount];
    double[] normalizationFactors = new double[columns.Count];
    // calculate normalization factors for each column
    SingleChoiceWithSubParams scalingParam = param.GetSingleChoiceWithSubParams("Scaling mode");
    for (int col = 0; col < columns.Count; col++) {
        double[] column = columns[col];
        // normalization factor to go from intensities to copies,
        // needs to be determined either using the total protein or the histone scaling approach
        double factor;
        switch (scalingParam.Value) {
            case 0: // total protein amount
            {
                double mwWeightedNormalizedSummedIntensities = 0;
                for (int row = 0; row < mdata.RowCount; row++) {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                        mwWeightedNormalizedSummedIntensities += (column[row] / detectabilityNormFactor[row]) * mw[row];
                    }
                }
                factor = (scalingParam.GetSubParameters().GetDoubleParam("Protein amount per cell [pg]").Value *
                          1e-12 * avogadro) / mwWeightedNormalizedSummedIntensities;
                break;
            }
            case 1: // histone mode
            {
                double mwWeightedNormalizedSummedHistoneIntensities = 0;
                foreach (int row in histoneRows) {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                        mwWeightedNormalizedSummedHistoneIntensities +=
                            (column[row] / detectabilityNormFactor[row]) * mw[row];
                    }
                }
                double ploidy = scalingParam.GetSubParameters().GetDoubleParam("Ploidy").Value;
                factor = (cValue * ploidy * avogadro) / mwWeightedNormalizedSummedHistoneIntensities;
                break;
            }
            default:
                factor = 1;
                break;
        }
        normalizationFactors[col] = factor;
    }
    // check averaging mode
    if (averagingMode == 1) // same factor for all
    {
        double factor = ArrayUtils.Mean(normalizationFactors);
        for (int i = 0; i < normalizationFactors.Length; i++) {
            normalizationFactors[i] = factor;
        }
    }
    if (averagingMode == 2) // same factor in each group
    {
        int groupingInd =
            param.GetSingleChoiceWithSubParams("Averaging mode").GetSubParameters().GetSingleChoiceParam("Grouping").Value;
        if (groupingInd == -1) {
            processInfo.ErrString = "No grouping selected.";
            return;
        }
        string[][] groupNames = mdata.GetCategoryRowAt(groupingInd);
        string[] uniqueGroupNames = Unique(groupNames);
        int[] grouping = new int[columns.Count];
        for (int i = 0; i < columns.Count; i++) {
            // Key for a column that forms its own group: it lies beyond every real group index,
            // so it can never be merged with a group by accident.
            int ownKey = uniqueGroupNames.Length + i;
            int sourceCol = intensityCols[i];
            if (sourceCol >= mdata.ExpressionColumnCount) {
                // Numeric annotation columns cannot be grouped
                grouping[i] = ownKey;
                continue;
            }
            // The category row has one entry per expression column, so it must be indexed with
            // the selected column, not with the position in the selection.
            string[] membership = groupNames[sourceCol];
            if (membership != null && membership.Length > 0 &&
                ArrayUtils.Contains(uniqueGroupNames, membership[0])) {
                grouping[i] = ArrayUtils.IndexOf(uniqueGroupNames, membership[0]);
                continue;
            }
            grouping[i] = ownKey;
        }
        Dictionary<int, List<double>> factors = new Dictionary<int, List<double>>();
        for (int i = 0; i < columns.Count; i++) {
            List<double> bucket;
            if (!factors.TryGetValue(grouping[i], out bucket)) {
                bucket = new List<double>();
                factors.Add(grouping[i], bucket);
            }
            bucket.Add(normalizationFactors[i]);
        }
        double[] averagedNormalizationFactors = new double[columns.Count];
        for (int i = 0; i < columns.Count; i++) {
            averagedNormalizationFactors[i] = ArrayUtils.Mean(factors[grouping[i]]);
        }
        normalizationFactors = averagedNormalizationFactors;
    }
    double proteinConcentration = param.GetDoubleParam("Total cellular protein concentration [g/l]").Value;
    // loop over all selected columns and calculate copy numbers
    for (int col = 0; col < columns.Count; col++) {
        string sampleName = sampleNames[col];
        double[] column = columns[col];
        double factor = normalizationFactors[col];
        double[] copyNumbers = new double[mdata.RowCount];
        double[] concentrations = new double[mdata.RowCount]; // nanomolar
        double[] massFraction = new double[mdata.RowCount];
        double[] moleFraction = new double[mdata.RowCount];
        double totalProtein = 0; // picograms
        double histoneMass = 0; // picograms
        double totalMolecules = 0;
        for (int row = 0; row < mdata.RowCount; row++) {
            if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                copyNumbers[row] = (column[row] / detectabilityNormFactor[row]) * factor;
                totalMolecules += copyNumbers[row];
                totalProtein += (copyNumbers[row] * mw[row] * 1e12) / avogadro; // picograms
                if (ArrayUtils.Contains(histoneRows, row)) {
                    histoneMass += (copyNumbers[row] * mw[row] * 1e12) / avogadro; // picograms
                }
            }
        }
        double totalVolume = (totalProtein / proteinConcentration) * 1000; // femtoliters
        for (int row = 0; row < mdata.RowCount; row++) {
            if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                concentrations[row] = ((copyNumbers[row] / (totalVolume * 1e-15)) / avogadro) * 1e9; // nanomolar
                massFraction[row] = (((copyNumbers[row] * mw[row] * 1e12) / avogadro) / totalProtein) * 1e6; // ppm
                moleFraction[row] = (copyNumbers[row] / totalMolecules) * 1e6; // ppm
            }
        }
        string suffix = sampleName == "" ? "" : " " + sampleName;
        if (ArrayUtils.Contains(outputColumns, 0)) {
            mdata.AddNumericColumn("Copy number" + suffix, "", copyNumbers);
        }
        if (ArrayUtils.Contains(outputColumns, 1)) {
            mdata.AddNumericColumn("Concentration [nM]" + suffix, "", concentrations);
        }
        if (ArrayUtils.Contains(outputColumns, 2)) {
            mdata.AddNumericColumn("Abundance (mass/total mass) [*10^-6]" + suffix, "", massFraction);
        }
        if (ArrayUtils.Contains(outputColumns, 3)) {
            mdata.AddNumericColumn("Abundance (molecules/total molecules) [*10^-6]" + suffix, "", moleFraction);
        }
        double[] rank = ArrayUtils.Rank(copyNumbers);
        double[] relativeRank = new double[mdata.RowCount];
        double validRanks = mdata.RowCount;
        for (int row = 0; row < mdata.RowCount; row++) {
            // remove rank for protein with no copy number information
            if (double.IsNaN(copyNumbers[row]) || copyNumbers[row] == 0) {
                rank[row] = double.NaN;
                validRanks--; // do not consider as valid
            }
            // invert ranking, so that rank 0 is the most abundant protein (NaN stays NaN)
            rank[row] = mdata.RowCount - rank[row];
        }
        for (int row = 0; row < mdata.RowCount; row++) {
            relativeRank[row] = rank[row] / validRanks;
        }
        if (ArrayUtils.Contains(outputColumns, 4)) {
            mdata.AddNumericColumn("Copy number rank" + suffix, "", rank);
        }
        if (ArrayUtils.Contains(outputColumns, 5)) {
            mdata.AddNumericColumn("Relative copy number rank" + suffix, "", relativeRank);
        }
        // Per-sample summary values exist only for expression columns that were not averaged away.
        if (intensityCols[col] < mdata.ExpressionColumnCount && averagingMode != 3) {
            int target = intensityCols[col];
            totalProteinRow[target] = Math.Round(totalProtein, 2);
            totalMoleculesRow[target] = Math.Round(totalMolecules, 0);
            organismRow[target] = new string[] { organism.name };
            histoneMassRow[target] = Math.Round(histoneMass, 4);
            ploidyRow[target] = Math.Round((histoneMass * 1e-12) / cValue, 2);
            cellVolumeRow[target] = Math.Round(totalVolume, 2); // femtoliters
        }
    }
    if (averagingMode != 3 && ArrayUtils.Contains(outputColumns, 6)) {
        mdata.AddNumericRow("Total protein [pg/cell]", "", totalProteinRow);
        mdata.AddNumericRow("Total molecules per cell", "", totalMoleculesRow);
        mdata.AddCategoryRow("Organism", "", organismRow);
        mdata.AddNumericRow("Histone mass [pg/cell]", "", histoneMassRow);
        mdata.AddNumericRow("Ploidy", "", ploidyRow);
        mdata.AddNumericRow("Cell volume [fl]", "", cellVolumeRow);
    }
}
/// <summary>
/// For every selected (x, y) column pair, evaluates a density on a regular grid and appends two
/// numeric columns to <paramref name="mdata"/>: the grid value at each data point and the value
/// produced by CalcExcludedPercentage for the same grid cell.
/// </summary>
/// <remarks>
/// Sets <c>processInfo.ErrString</c> and returns without adding anything when no x columns are
/// selected or when the x and y selections differ in length. Rows in which either coordinate is
/// NaN receive NaN in both output columns.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] xColumns = param.GetParam<int[]>("x").Value;
    int[] yColumns = param.GetParam<int[]>("y").Value;
    if (xColumns.Length == 0) {
        processInfo.ErrString = "Please select some columns";
        return;
    }
    if (xColumns.Length != yColumns.Length) {
        processInfo.ErrString =
            "Please select the same number of columns in the boxes for the first and second columns.";
        return;
    }
    int distributionType = param.GetParam<int>("Distribution type").Value;
    int gridSize = param.GetParam<int>("Number of points").Value;
    for (int pair = 0; pair < xColumns.Length; pair++) {
        float[] xAll = GetColumn(mdata, xColumns[pair]);
        float[] yAll = GetColumn(mdata, yColumns[pair]);
        float[] xValid;
        float[] yValid;
        GetValidPairs(xAll, yAll, out xValid, out yValid);
        double xmin, xmax, ymin, ymax;
        DensityEstimation.CalcRanges(xValid, yValid, out xmin, out xmax, out ymin, out ymax);
        double xStep = (xmax - xmin)/gridSize;
        double yStep = (ymax - ymin)/gridSize;
        float[,] density = DensityEstimation.GetValuesOnGrid(xValid, xmin, xStep, gridSize, yValid, ymin, yStep,
            gridSize);
        // Distribution types 1-3 select one of the MakeConditional* transforms; any other value
        // leaves the grid untouched.
        switch (distributionType) {
            case 1:
                MakeConditional1(density);
                break;
            case 2:
                MakeConditional2(density);
                break;
            case 3:
                MakeConditional3(density);
                break;
        }
        DensityEstimation.DivideByMaximum(density);
        // Grid coordinates along both axes, used to look up the cell closest to each data point.
        double[] xGrid = new double[gridSize];
        double[] yGrid = new double[gridSize];
        for (int g = 0; g < gridSize; g++) {
            xGrid[g] = xmin + g*(xmax - xmin)/gridSize;
            yGrid[g] = ymin + g*(ymax - ymin)/gridSize;
        }
        float[,] excluded = CalcExcludedPercentage(density);
        double[] densityAtPoint = new double[xAll.Length];
        double[] excludedAtPoint = new double[xAll.Length];
        for (int row = 0; row < densityAtPoint.Length; row++) {
            double x = xAll[row];
            double y = yAll[row];
            if (double.IsNaN(x) || double.IsNaN(y)) {
                densityAtPoint[row] = double.NaN;
                excludedAtPoint[row] = double.NaN;
                continue;
            }
            int xCell = ArrayUtils.ClosestIndex(xGrid, x);
            int yCell = ArrayUtils.ClosestIndex(yGrid, y);
            densityAtPoint[row] = density[xCell, yCell];
            excludedAtPoint[row] = excluded[xCell, yCell];
        }
        string xname = GetColumnName(mdata, xColumns[pair]);
        string yname = GetColumnName(mdata, yColumns[pair]);
        string pairLabel = xname + "_" + yname;
        string planeLabel = xname + " and " + yname + ".";
        mdata.AddNumericColumn("Density_" + pairLabel,
            "Density of data points in the plane spanned by the columns " + planeLabel, densityAtPoint);
        mdata.AddNumericColumn("Excluded fraction_" + pairLabel,
            "Percentage of points with a point density smaller than at this point in the plane spanned by the columns " +
            planeLabel, excludedAtPoint);
    }
}
/// <summary>
/// Turns the selected expression columns into numeric columns: each selected column is appended as
/// a numeric column (same name and description, values converted with ArrayUtils.ToDoubles) and the
/// expression matrix is then restricted to the columns that were not selected.
/// </summary>
/// <param name="colInds">Indices of the expression columns to convert.</param>
/// <param name="mdata">Matrix that is modified in place.</param>
private static void ExpressionToNumeric(IList<int> colInds, IMatrixData mdata) {
    // colInds indexes expression columns and the complement is passed to ExtractExpressionColumns,
    // so it has to be taken over the expression column count (not the numeric column count).
    int[] remainingInds = ArrayUtils.Complement(colInds, mdata.ExpressionColumnCount);
    foreach (int colInd in colInds) {
        double[] d = ArrayUtils.ToDoubles(mdata.GetExpressionColumn(colInd));
        mdata.AddNumericColumn(mdata.ExpressionColumnNames[colInd], mdata.ExpressionColumnDescriptions[colInd], d);
    }
    mdata.ExtractExpressionColumns(remainingInds);
}
/// <summary>
/// For every selected pair of "Column 1" / "Column 2" entries, evaluates a density on a regular
/// grid and appends two numeric columns to <paramref name="mdata"/>: the grid value at each data
/// point and the value produced by CalcExcludedPercentage for the same grid cell.
/// </summary>
/// <remarks>
/// Sets <c>processInfo.ErrString</c> and returns without adding anything when the first selection
/// is empty or the two selections differ in length. Rows in which either coordinate is NaN receive
/// NaN in both output columns.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ProcessInfo processInfo) {
    int[] firstCols = param.GetMultiChoiceParam("Column 1").Value;
    int[] secondCols = param.GetMultiChoiceParam("Column 2").Value;
    if (firstCols.Length == 0) {
        processInfo.ErrString = "Please select some columns";
        return;
    }
    if (firstCols.Length != secondCols.Length) {
        processInfo.ErrString =
            "Please select the same number of columns in the boxes for the first and second columns.";
        return;
    }
    int nPoints = param.GetIntParam("Number of points").Value;
    for (int pairIndex = 0; pairIndex < firstCols.Length; pairIndex++) {
        float[] rawX = GetColumn(mdata, firstCols[pairIndex]);
        float[] rawY = GetColumn(mdata, secondCols[pairIndex]);
        float[] validX;
        float[] validY;
        NumUtils.GetValidPairs(rawX, rawY, out validX, out validY);
        double xmin, xmax, ymin, ymax;
        DensityEstimation.CalcRanges(validX, validY, out xmin, out xmax, out ymin, out ymax);
        double stepX = (xmax - xmin)/nPoints;
        double stepY = (ymax - ymin)/nPoints;
        float[,] grid = DensityEstimation.GetValuesOnGrid(validX, xmin, stepX, nPoints, validY, ymin, stepY,
            nPoints);
        DensityEstimation.DivideByMaximum(grid);
        // Axis coordinates of the grid cells, used for the nearest-cell lookup below.
        double[] axisX = new double[nPoints];
        double[] axisY = new double[nPoints];
        for (int cell = 0; cell < nPoints; cell++) {
            axisX[cell] = xmin + cell*(xmax - xmin)/nPoints;
            axisY[cell] = ymin + cell*(ymax - ymin)/nPoints;
        }
        float[,] excludedGrid = CalcExcludedPercentage(grid);
        int rowCount = rawX.Length;
        double[] densities = new double[rowCount];
        double[] excludedFractions = new double[rowCount];
        for (int row = 0; row < rowCount; row++) {
            double px = rawX[row];
            double py = rawY[row];
            bool usable = !double.IsNaN(px) && !double.IsNaN(py);
            if (usable) {
                int ix = ArrayUtils.ClosestIndex(axisX, px);
                int iy = ArrayUtils.ClosestIndex(axisY, py);
                densities[row] = grid[ix, iy];
                excludedFractions[row] = excludedGrid[ix, iy];
            } else {
                densities[row] = double.NaN;
                excludedFractions[row] = double.NaN;
            }
        }
        string xname = GetColumnName(mdata, firstCols[pairIndex]);
        string yname = GetColumnName(mdata, secondCols[pairIndex]);
        string nameSuffix = xname + "_" + yname;
        string descriptionSuffix = xname + " and " + yname + ".";
        mdata.AddNumericColumn("Density_" + nameSuffix,
            "Density of data points in the plane spanned by the columns " + descriptionSuffix, densities);
        mdata.AddNumericColumn("Excluded fraction_" + nameSuffix,
            "Percentage of points with a point density smaller than at this point in the plane spanned by the columns " +
            descriptionSuffix, excludedFractions);
    }
}
/// <summary>
/// Applies every selected operation to every selected multi-numeric column and appends the results
/// as numeric columns named "&lt;column name&gt;_&lt;operation name&gt;".
/// </summary>
/// <remarks>
/// For each operation, all result columns are computed first and only then added to
/// <paramref name="mdata"/>.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param1, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] selectedCols = param1.GetMultiChoiceParam("Columns").Value;
    int[] selectedOps = param1.GetMultiChoiceParam("Operation").Value;
    foreach (int opIndex in selectedOps) {
        // Phase 1: reduce each row's value list of every selected column to a single number.
        double[][] results = new double[selectedCols.Length][];
        for (int c = 0; c < selectedCols.Length; c++) {
            double[][] source = mdata.MultiNumericColumns[selectedCols[c]];
            double[] reduced = new double[source.Length];
            results[c] = reduced;
            for (int row = 0; row < reduced.Length; row++) {
                reduced[row] = operations[opIndex](source[row]);
            }
        }
        // Phase 2: append the reduced columns.
        for (int c = 0; c < selectedCols.Length; c++) {
            string title = mdata.MultiNumericColumnNames[selectedCols[c]] + "_" + names[opIndex];
            mdata.AddNumericColumn(title, "", results[c]);
        }
    }
}
/// <summary>
/// For every selected ratio column, computes p-values with CalcSignificanceB (using the intensity
/// column at the same position of the intensity selection) and appends a numeric
/// "... Significance B" column plus a category "... B significant" column.
/// </summary>
/// <remarks>
/// Sets <c>processInfo.ErrString</c> and returns when no ratio columns are selected or the two
/// selections differ in length. Intensity indices at or beyond the expression column count are
/// read from the numeric columns. A "Side" value outside 0-2, or a truncation choice other than the
/// first two, throws an <see cref="Exception"/>.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] ratioCols = param.GetMultiChoiceParam("Ratio columns").Value;
    int[] intensityCols = param.GetMultiChoiceParam("Intensity columns").Value;
    if (ratioCols.Length == 0) {
        processInfo.ErrString = "Please specify some ratio columns.";
        return;
    }
    if (ratioCols.Length != intensityCols.Length) {
        processInfo.ErrString = "The number of ratio and intensity columns have to be equal.";
        return;
    }
    int truncIndex = param.GetSingleChoiceParam("Use for truncation").Value;
    TestTruncation truncation;
    if (truncIndex == 0) {
        truncation = TestTruncation.Pvalue;
    } else if (truncIndex == 1) {
        truncation = TestTruncation.BenjaminiHochberg;
    } else {
        truncation = TestTruncation.PermutationBased;
    }
    double threshold = param.GetDoubleParam("Threshold value").Value;
    int sideInd = param.GetSingleChoiceParam("Side").Value;
    TestSide side;
    if (sideInd == 0) {
        side = TestSide.Both;
    } else if (sideInd == 1) {
        side = TestSide.Left;
    } else if (sideInd == 2) {
        side = TestSide.Right;
    } else {
        throw new Exception("Never get here.");
    }
    for (int n = 0; n < ratioCols.Length; n++) {
        float[] ratios = mdata.GetExpressionColumn(ratioCols[n]);
        float[] intensities;
        if (intensityCols[n] < mdata.ExpressionColumnCount) {
            intensities = mdata.GetExpressionColumn(intensityCols[n]);
        } else {
            // Indices past the expression columns address the numeric columns.
            intensities = ArrayUtils.ToFloats(mdata.NumericColumns[intensityCols[n] - mdata.ExpressionColumnCount]);
        }
        double[] pvals = CalcSignificanceB(ratios, intensities, side);
        string[][] significant;
        if (truncation == TestTruncation.Pvalue) {
            significant = PerseusPluginUtils.CalcPvalueSignificance(pvals, threshold);
        } else if (truncation == TestTruncation.BenjaminiHochberg) {
            significant = PerseusPluginUtils.CalcBenjaminiHochbergFdr(pvals, threshold);
        } else {
            throw new Exception("Never get here.");
        }
        string ratioName = mdata.ExpressionColumnNames[ratioCols[n]];
        mdata.AddNumericColumn(ratioName + " Significance B", "", pvals);
        mdata.AddCategoryColumn(ratioName + " B significant", "", significant);
    }
}
/// <summary>
/// Annotates the rows of <paramref name="mdata"/> with information from a fasta file. The protein
/// IDs of a row are read from the selected string column (split at ';') and looked up in the
/// parsed fasta file; IDs without a fasta entry are skipped.
/// </summary>
/// <remarks>
/// Depending on the parameter selections the method adds, in this order:
/// text columns (entry name, gene name, verbose protein name, protein name, species),
/// numeric columns (sequence length, monoisotopic and average molecular mass),
/// per-protease theoretical peptide counts (optionally with the peptide sequences), and a
/// regex-based sequence feature count. Numeric values of a row are combined with
/// ArrayUtils.Median; when the respective section's choice parameter has value 1 only the first
/// fasta entry of the row is used. An invalid regular expression sets
/// <c>processInfo.ErrString</c> and returns; columns added before that point remain.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
    string[][] proteinIds = new string[mdata.RowCount][];
    string[][] leadingIds = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
        leadingIds[row] = new[] { proteinIds[row][0] };
    }
    string fastaFilePath = param.GetParam<string>("Fasta file").Value;
    Fasta fasta = new Fasta();
    fasta.ParseFile(fastaFilePath, processInfo);

    // Text annotations
    processInfo.Status("Adding fasta header annotations.");
    var headerParam = param.GetParamWithSubParams<int>("Fasta header annotations");
    int[] selection = headerParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
    string[][] idsToBeAnnotated = headerParam.Value == 0 ? proteinIds : leadingIds;
    ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        List<ProteinSequence> rowEntries = new List<ProteinSequence>();
        foreach (string id in idsToBeAnnotated[row]) {
            ProteinSequence entry = fasta.GetEntry(id);
            if (entry == null) {
                continue;
            }
            rowEntries.Add(entry);
        }
        fastaEntries[row] = rowEntries.ToArray();
    }
    if (ArrayUtils.Contains(selection, 0)) {
        // Entry name
        mdata.AddStringColumn("Entry name", "", BuildDistinctTextColumn(fastaEntries, entry => entry.EntryName));
    }
    if (ArrayUtils.Contains(selection, 1)) {
        // Gene name
        mdata.AddStringColumn("Gene name", "", BuildDistinctTextColumn(fastaEntries, entry => entry.GeneName));
    }
    if (ArrayUtils.Contains(selection, 2)) {
        // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
        // 'Isoform x of...' prefixes and '(Fragment)' suffixes
        mdata.AddStringColumn("Protein name (verbose)", "",
            BuildDistinctTextColumn(fastaEntries, entry => entry.ProteinName));
    }
    if (ArrayUtils.Contains(selection, 3)) {
        // Consensus protein name
        mdata.AddStringColumn("Protein name", "",
            BuildDistinctTextColumn(fastaEntries, entry => entry.ConsensusProteinName));
    }
    if (ArrayUtils.Contains(selection, 4)) {
        // Species
        mdata.AddStringColumn("Species", "", BuildDistinctTextColumn(fastaEntries, entry => entry.Species));
    }

    // Numeric annotations
    processInfo.Status("Adding numeric annotations.");
    var numericParam = param.GetParamWithSubParams<int>("Numeric annotations");
    selection = numericParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
    bool annotateLeadingId = numericParam.Value == 1;
    if (ArrayUtils.Contains(selection, 0)) {
        // Sequence length
        mdata.AddNumericColumn("Sequence length", "",
            BuildMedianColumn(fastaEntries, annotateLeadingId, entry => entry.GetSequence().Length));
    }
    if (ArrayUtils.Contains(selection, 1)) {
        // Monoisotopic molecular mass
        mdata.AddNumericColumn("Monoisotopic molecular mass", "",
            BuildMedianColumn(fastaEntries, annotateLeadingId, entry => entry.GetMonoisotopicMolecularMass()));
    }
    if (ArrayUtils.Contains(selection, 2)) {
        // Average molecular mass
        mdata.AddNumericColumn("Average molecular mass", "",
            BuildMedianColumn(fastaEntries, annotateLeadingId, entry => entry.GetAverageMolecularMass()));
    }

    // Theoretical peptides
    processInfo.Status("Calculating theoretical peptides.");
    var peptideParam = param.GetParamWithSubParams<int>("Calculate theoretical peptides");
    var peptideSubParams = peptideParam.GetSubParameters();
    annotateLeadingId = peptideParam.Value == 1;
    Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
        peptideSubParams.GetParam<int[]>("Proteases").Value);
    double minLength = peptideSubParams.GetParam<double>("Min. peptide length").Value;
    double maxLength = peptideSubParams.GetParam<double>("Max. peptide length").Value;
    // "Show sequences" is only looked up in leading-ID mode (short-circuit), as before.
    bool displayPeptideSequences =
        annotateLeadingId && peptideSubParams.GetParam<bool>("Show sequences").Value;
    foreach (Protease protease in proteases) {
        double[] annotationColumn = new double[mdata.RowCount];
        string[] peptideColumn = new string[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++) {
            List<double> rowAnnotations = new List<double>();
            List<string> rowPeptides = new List<string>();
            foreach (ProteinSequence entry in fastaEntries[row]) {
                double nTheoreticalPeptides =
                    entry.GetNumberOfTheoreticalPeptides(protease, (int) minLength, (int) maxLength);
                rowAnnotations.Add(nTheoreticalPeptides);
                if (displayPeptideSequences) {
                    rowPeptides.AddRange(
                        entry.GetTheoreticalPeptideSequences(protease, (int) minLength, (int) maxLength));
                }
                if (annotateLeadingId) {
                    break;
                }
            }
            annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
            peptideColumn[row] = String.Join(";", rowPeptides);
        }
        mdata.AddNumericColumn(
            "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
            annotationColumn);
        if (displayPeptideSequences) {
            mdata.AddStringColumn(
                "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
                peptideColumn);
        }
    }

    // Sequence features
    processInfo.Status("Counting sequence features.");
    var featureParam = param.GetParamWithSubParams<int>("Count sequence features");
    var featureSubParams = featureParam.GetSubParameters();
    annotateLeadingId = featureParam.Value == 1;
    bool normalizeBySequenceLength = featureSubParams.GetParam<bool>("Normalize by sequence length").Value;
    string pattern = featureSubParams.GetParam<string>("Regex").Value;
    if (pattern != "") {
        Regex regex;
        try {
            regex = new Regex(pattern);
        } catch (ArgumentException) {
            processInfo.ErrString = "The regular expression you provided has invalid syntax.";
            return;
        }
        double[] sequenceFeatureColumn = BuildMedianColumn(fastaEntries, annotateLeadingId, entry => {
            double nFeatures = regex.Matches(entry.GetSequence()).Count;
            return normalizeBySequenceLength ? nFeatures / entry.GetLength() : nFeatures;
        });
        mdata.AddNumericColumn(
            (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")", "",
            sequenceFeatureColumn);
    }
    processInfo.Status("Done.");
}

/// <summary>
/// Builds one text value per row by collecting the non-null, not yet seen annotation of every
/// fasta entry of that row (first-occurrence order) and joining them with ';'.
/// </summary>
private static string[] BuildDistinctTextColumn(ProteinSequence[][] fastaEntries,
    Func<ProteinSequence, string> getAnnotation) {
    string[] column = new string[fastaEntries.Length];
    for (int row = 0; row < fastaEntries.Length; row++) {
        List<string> rowAnnotations = new List<string>();
        foreach (ProteinSequence entry in fastaEntries[row]) {
            string annotation = getAnnotation(entry);
            if (annotation != null && !ArrayUtils.Contains(rowAnnotations, annotation)) {
                rowAnnotations.Add(annotation);
            }
        }
        column[row] = string.Join(";", rowAnnotations.ToArray());
    }
    return column;
}

/// <summary>
/// Builds one numeric value per row as ArrayUtils.Median of the values of that row's fasta
/// entries; with <paramref name="firstEntryOnly"/> only the first entry of each row contributes.
/// </summary>
private static double[] BuildMedianColumn(ProteinSequence[][] fastaEntries, bool firstEntryOnly,
    Func<ProteinSequence, double> getValue) {
    double[] column = new double[fastaEntries.Length];
    for (int row = 0; row < fastaEntries.Length; row++) {
        List<double> rowValues = new List<double>();
        foreach (ProteinSequence entry in fastaEntries[row]) {
            rowValues.Add(getValue(entry));
            if (firstEntryOnly) {
                break;
            }
        }
        column[row] = ArrayUtils.Median(rowValues.ToArray());
    }
    return column;
}
/// <summary>
/// Computes the selected entries of <c>procs</c> for every row of the main matrix and appends each
/// result as a numeric column. NaN and infinite cells are left out of the per-row value lists.
/// </summary>
/// <remarks>
/// "Expression column selection" value 1 restricts the overall calculation to the chosen columns;
/// any other value uses all columns. Value 2 additionally computes every selected entry per group
/// of the chosen category row and appends those as "&lt;name&gt; &lt;group&gt;" columns directly after
/// the overall column.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    ParameterWithSubParams<int> selectionParam = param.GetParamWithSubParams<int>("Expression column selection");
    bool perGroup = selectionParam.Value == 2;
    string[] groupNames = null;
    int[][] groupColumns = null;
    if (perGroup) {
        int groupRow = selectionParam.GetSubParameters().GetParam<int>("Group").Value;
        string[][] groupAssignments = mdata.GetCategoryRowAt(groupRow);
        groupNames = ArrayUtils.UniqueValuesPreserveOrder(groupAssignments);
        groupColumns = PerseusPluginUtils.GetMainColIndices(groupAssignments, groupNames);
    }
    int[] selectedColumns;
    if (selectionParam.Value == 1) {
        selectedColumns = selectionParam.GetSubParameters().GetParam<int[]>("Columns").Value;
    } else {
        selectedColumns = ArrayUtils.ConsecutiveInts(mdata.ColumnCount);
    }
    HashSet<int> requested = ArrayUtils.ToHashSet(param.GetParam<int[]>("Calculate").Value);

    // Allocate result storage: one column per requested quantity, plus one per group if needed.
    int procCount = procs.Length;
    bool[] enabled = new bool[procCount];
    double[][] overall = new double[procCount][];
    double[][][] byGroup = perGroup ? new double[procCount][][] : null;
    for (int p = 0; p < procCount; p++) {
        if (perGroup) {
            byGroup[p] = new double[groupNames.Length][];
        }
        enabled[p] = requested.Contains(p);
        if (!enabled[p]) {
            continue;
        }
        overall[p] = new double[mdata.RowCount];
        if (perGroup) {
            for (int g = 0; g < groupNames.Length; g++) {
                byGroup[p][g] = new double[mdata.RowCount];
            }
        }
    }

    for (int row = 0; row < mdata.RowCount; row++) {
        List<double> rowValues = new List<double>();
        foreach (int col in selectedColumns) {
            double cell = mdata.Values.Get(row, col);
            if (double.IsNaN(cell) || double.IsInfinity(cell)) {
                continue;
            }
            rowValues.Add(cell);
        }
        for (int p = 0; p < enabled.Length; p++) {
            if (enabled[p]) {
                overall[p][row] = procs[p].Item2(rowValues);
            }
        }
        if (!perGroup) {
            continue;
        }
        List<double>[] groupValues = new List<double>[groupNames.Length];
        for (int g = 0; g < groupColumns.Length; g++) {
            groupValues[g] = new List<double>();
            for (int m = 0; m < groupColumns[g].Length; m++) {
                double cell = mdata.Values.Get(row, groupColumns[g][m]);
                if (double.IsNaN(cell) || double.IsInfinity(cell)) {
                    continue;
                }
                groupValues[g].Add(cell);
            }
        }
        for (int p = 0; p < enabled.Length; p++) {
            if (!enabled[p]) {
                continue;
            }
            for (int g = 0; g < groupNames.Length; g++) {
                byGroup[p][g][row] = procs[p].Item2(groupValues[g]);
            }
        }
    }

    for (int p = 0; p < enabled.Length; p++) {
        if (!enabled[p]) {
            continue;
        }
        mdata.AddNumericColumn(procs[p].Item1, procs[p].Item3, overall[p]);
        if (perGroup) {
            for (int g = 0; g < groupNames.Length; g++) {
                mdata.AddNumericColumn(procs[p].Item1 + " " + groupNames[g], procs[p].Item3, byGroup[p][g]);
            }
        }
    }
}
/// <summary>
/// For every selected modification, parses the known modification sites from the file returned by
/// PhosphoSitePlusParser.GetFilenameForMod, groups the distinct upper-cased sequence windows by
/// accession, and evaluates CountSites for each row's ';'-separated identifiers from the selected
/// "Uniprot column". Adds a "Known modifications" category column (sorted names of the
/// modifications whose count is positive) and one "&lt;modification&gt; count" numeric column per
/// modification.
/// </summary>
/// <remarks>
/// If no file name is available for a modification, <c>processInfo.ErrString</c> is set and the
/// method returns before any column has been added.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string[] mods = param.GetParam<int[]>("Modifications").StringValue.Split(new[]{';'},
        StringSplitOptions.RemoveEmptyEntries);
    string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    string[][] uprot = new string[up.Length][];
    for (int i = 0; i < up.Length; i++) {
        uprot[i] = up[i].Length > 0 ? up[i].Split(';') : new string[0];
    }
    double[][] c = new double[mods.Length][];
    for (int index = 0; index < mods.Length; index++) {
        string mod = mods[index];
        string filename = PhosphoSitePlusParser.GetFilenameForMod(mod);
        if (filename == null) {
            processInfo.ErrString = "File does not exist.";
            return;
        }
        string[] seqWins;
        string[] accs;
        string[] pubmedLtp;
        string[] pubmedMs2;
        string[] cstMs2;
        string[] species;
        PhosphoSitePlusParser.ParseKnownMods(filename, out seqWins, out accs, out pubmedLtp, out pubmedMs2,
            out cstMs2, out species);
        // Distinct sequence windows per accession. The windows are upper-cased with the invariant
        // culture so that the de-duplication does not depend on the machine's locale.
        Dictionary<string, HashSet<string>> counts = new Dictionary<string, HashSet<string>>();
        for (int i = 0; i < accs.Length; i++) {
            HashSet<string> windows;
            if (!counts.TryGetValue(accs[i], out windows)) {
                windows = new HashSet<string>();
                counts.Add(accs[i], windows);
            }
            windows.Add(seqWins[i].ToUpperInvariant());
        }
        c[index] = new double[up.Length];
        for (int i = 0; i < up.Length; i++) {
            c[index][i] = CountSites(uprot[i], counts);
        }
    }
    string[][] catCol = new string[up.Length][];
    for (int i = 0; i < catCol.Length; i++) {
        List<string> x = new List<string>();
        for (int j = 0; j < mods.Length; j++) {
            if (c[j][i] > 0) {
                x.Add(mods[j]);
            }
        }
        x.Sort();
        catCol[i] = x.ToArray();
    }
    mdata.AddCategoryColumn("Known modifications", "Known modifications", catCol);
    for (int i = 0; i < mods.Length; i++) {
        mdata.AddNumericColumn(mods[i] + " count", mods[i] + " count", c[i]);
    }
}