/// <summary>
/// Turns the selected numeric columns into category columns and removes them from the numeric columns.
/// NaN and infinite values become empty category entries; any other value becomes a single term
/// holding its string representation.
/// </summary>
/// <param name="colInds">Indices of the numeric columns to convert.</param>
/// <param name="mdata">Matrix that is modified in place.</param>
private static void NumericToCategorical(IList<int> colInds, IMatrixData mdata) {
    // Indices used below to select the numeric columns that are kept.
    int[] remaining = ArrayUtils.Complement(colInds, mdata.NumericColumnCount);
    string[] selectedNames = ArrayUtils.SubArray(mdata.NumericColumnNames, colInds);
    string[] selectedDescriptions = ArrayUtils.SubArray(mdata.NumericColumnDescriptions, colInds);
    double[][] selectedColumns = ArrayUtils.SubArray(mdata.NumericColumns, colInds);
    string[][][] converted = new string[selectedColumns.Length][][];
    for (int c = 0; c < selectedColumns.Length; c++) {
        double[] source = selectedColumns[c];
        string[][] target = new string[source.Length][];
        for (int r = 0; r < target.Length; r++) {
            double value = source[r];
            bool missing = double.IsNaN(value) || double.IsInfinity(value);
            target[r] = missing ? new string[0] : new[] { "" + value };
        }
        converted[c] = target;
    }
    for (int c = 0; c < selectedNames.Length; c++) {
        mdata.AddCategoryColumn(selectedNames[c], selectedDescriptions[c], converted[c]);
    }
    mdata.NumericColumns = ArrayUtils.SubList(mdata.NumericColumns, remaining);
    mdata.NumericColumnNames = ArrayUtils.SubList(mdata.NumericColumnNames, remaining);
    mdata.NumericColumnDescriptions = ArrayUtils.SubList(mdata.NumericColumnDescriptions, remaining);
}
/// <summary>
/// Turns the selected text columns into category columns and removes them from the text columns.
/// Each non-empty cell is split on ';' and the resulting terms are sorted; null or empty cells
/// become empty category entries.
/// </summary>
/// <param name="colInds">Indices of the text columns to convert.</param>
/// <param name="mdata">Matrix that is modified in place.</param>
private static void StringToCategorical(IList<int> colInds, IMatrixData mdata) {
    // Indices used below to select the text columns that are kept.
    int[] inds = ArrayUtils.Complement(colInds, mdata.StringColumnCount);
    string[] names = ArrayUtils.SubArray(mdata.StringColumnNames, colInds);
    string[] descriptions = ArrayUtils.SubArray(mdata.StringColumnDescriptions, colInds);
    string[][] str = ArrayUtils.SubArray(mdata.StringColumns, colInds);
    string[][][] newCat = new string[str.Length][][];
    for (int j = 0; j < str.Length; j++) {
        newCat[j] = new string[str[j].Length][];
        for (int i = 0; i < newCat[j].Length; i++) {
            if (string.IsNullOrEmpty(str[j][i])) {
                newCat[j][i] = new string[0];
            } else {
                string[] x = str[j][i].Split(';');
                Array.Sort(x);
                newCat[j][i] = x;
            }
        }
    }
    for (int i = 0; i < names.Length; i++) {
        mdata.AddCategoryColumn(names[i], descriptions[i], newCat[i]);
    }
    mdata.StringColumns = ArrayUtils.SubList(mdata.StringColumns, inds);
    mdata.StringColumnNames = ArrayUtils.SubList(mdata.StringColumnNames, inds);
    // The filtered descriptions belong to the text columns; writing them to ColumnDescriptions
    // would overwrite the main-column descriptions and leave the text descriptions stale.
    mdata.StringColumnDescriptions = ArrayUtils.SubList(mdata.StringColumnDescriptions, inds);
}
/// <summary>
/// Combines the selected category columns row by row, using the combiner obtained for the chosen
/// strategy, and adds the result as a new category column. The source columns are removed
/// unless "Keep original columns" is set.
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    var name = param.GetParam<string>("Name").Value;
    var keep = param.GetParam<bool>("Keep original columns").Value;
    var choice = param.GetParam<int[]>("Columns").Value;
    var strategy = Strategies[param.GetParam<int>("Strategy").Value];
    var combiner = GetCombiner(strategy);
    var columns = choice.Select(mdata.GetCategoryColumnAt).ToArray();
    var rowCount = mdata.RowCount;
    var combined = new string[rowCount][];
    for (int rowIndex = 0; rowIndex < mdata.RowCount; rowIndex++) {
        // Gather the entries of this row from every selected column.
        var cells = new string[columns.Length][];
        for (int c = 0; c < columns.Length; c++) {
            cells[c] = columns[c][rowIndex];
        }
        combined[rowIndex] = combiner(cells);
    }
    if (!keep) {
        // Remove from the highest index downwards; removing a column shifts the indices after it.
        foreach (var index in choice.OrderByDescending(c => c)) {
            mdata.RemoveCategoryColumnAt(index);
        }
    }
    mdata.AddCategoryColumn(name, "Combined column", combined);
}
/// <summary>
/// Adds the annotation columns produced by <c>ProcessDataAddAnnotation</c> to the matrix:
/// first the category columns, then the text columns, then the numeric columns.
/// Nothing is added when the annotation step reports failure.
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters para, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string[] baseIds = GetBaseIds(para, mdata);
    string[] name;
    int[] catColInds;
    int[] textColInds;
    int[] numColInds;
    string[][][] catCols;
    string[][] textCols;
    double[][] numCols;
    if (!ProcessDataAddAnnotation(mdata.RowCount, para, baseIds, processInfo, out name, out catColInds,
            out textColInds, out numColInds, out catCols, out textCols, out numCols)) {
        return;
    }
    // Column names are looked up through the per-type index arrays.
    for (int c = 0; c < catCols.Length; c++) {
        string colName = name[catColInds[c]];
        mdata.AddCategoryColumn(colName, "", catCols[c]);
    }
    for (int t = 0; t < textCols.Length; t++) {
        string colName = name[textColInds[t]];
        mdata.AddStringColumn(colName, "", textCols[t]);
    }
    for (int n = 0; n < numCols.Length; n++) {
        string colName = name[numColInds[n]];
        mdata.AddNumericColumn(colName, "", numCols[n]);
    }
}
/// <summary>
/// Builds a 4-row matrix with one annotation column of every kind, collapses the rows by the
/// "id" text column via <c>UniqueRows</c>, and checks the merged values of each column type.
/// </summary>
public void SmallTest() {
    IMatrixData mdata = PerseusFactory.CreateMatrixData(new double[,] { { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 } });
    mdata.AddStringColumn("id", "", new[] { "a", "b", "b", "b" });
    mdata.AddStringColumn("str", "", new[] { "a;b", "b;c", "c;d", "d;e" });
    mdata.AddCategoryColumn("cat", "",
        new[] { new[] { "a", "b" }, new[] { "b", "c" }, new[] { "c", "d" }, new[] { "d", "e" } });
    // One value per matrix row: the matrix has 4 rows, so the column must have 4 entries.
    mdata.AddNumericColumn("num", "", new[] { 0, 1, 2, 3.0 });
    mdata.AddMultiNumericColumn("mnum", "",
        new[] { new[] { 0, 4d }, new[] { 1, 5d }, new[] { 2, 6d }, new[] { 3, 7d } });
    mdata.UniqueRows(mdata.StringColumns[0], ArrayUtils.Median, UniqueRows.Union, UniqueRows.CatUnion,
        UniqueRows.MultiNumUnion);
    Assert.AreEqual(2, mdata.RowCount);
    CollectionAssert.AreEqual(new[] { 0, 2 }, mdata.Values.GetColumn(0));
    CollectionAssert.AreEqual(new[] { 4, 6 }, mdata.Values.GetColumn(1));
    CollectionAssert.AreEqual(new[] { "a;b", "b;c;d;e" }, mdata.GetStringColumn("str"));
    CollectionAssert.AreEqual(new[] { new[] { "a", "b" }, new[] { "b", "c", "d", "e" } },
        mdata.GetCategoryColumnAt(0));
    CollectionAssert.AreEqual(new[] { 0, 2 }, mdata.NumericColumns[0]);
    CollectionAssert.AreEqual(new[] { new[] { 0d, 4 }, new[] { 1d, 5, 2, 6, 3, 7 } },
        mdata.MultiNumericColumns[0]);
}
/// <summary>
/// Imports differential-expression results into the matrix. Every result entry is added as a
/// numeric column named "&lt;pair1&gt;_vs_&lt;pair2&gt;_&lt;key&gt;", except the "LR" entry when
/// <paramref name="replicate"/> is false. For the p-value-like entries of the given method an
/// additional "-log10" column is added. Finally the validity (DESeq2 only) and significance
/// category columns are added.
/// </summary>
/// <param name="results">Result columns keyed by statistic name; values are numeric strings.</param>
/// <param name="mdata">Matrix receiving the new columns.</param>
/// <param name="pair1">First group name, used in column names.</param>
/// <param name="pair2">Second group name, used in column names.</param>
/// <param name="validCol">Category column added as "_Valid" when method is "DESeq2".</param>
/// <param name="sigCol">Category column added as "_Significant".</param>
/// <param name="method">"DESeq2" or "EdgeR"; selects which keys get a -log10 column.</param>
/// <param name="replicate">When false, the "LR" entry is not imported.</param>
public void ImportResult(Dictionary<string, string[]> results, IMatrixData mdata, string pair1, string pair2,
    string[][] validCol, string[][] sigCol, string method, bool replicate) {
    string prefix = pair1 + "_vs_" + pair2 + "_";
    // The values are machine-written numbers; parse them independently of the UI culture so a
    // decimal point is never read as a group separator.
    System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;
    const System.Globalization.NumberStyles styles =
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
    // Value reported for p == 0, where -log10 would otherwise be infinite.
    double zeroCap = -Math.Log10(1 / double.MaxValue);
    foreach (KeyValuePair<string, string[]> entry in results) {
        string key = entry.Key;
        string[] raw = entry.Value;
        bool skipRawColumn = key == "LR" && !replicate;
        if (!skipRawColumn) {
            double[] parsed = Array.ConvertAll(raw, s => double.Parse(s, invariant));
            mdata.AddNumericColumn(prefix + key, prefix + key, parsed);
        }
        bool isPValueLike = ((key == "p-value" || key == "padj") && method == "DESeq2") ||
                            ((key == "p-value" || key == "FDR") && method == "EdgeR");
        if (!isPValueLike) {
            continue;
        }
        double[] negLog = new double[raw.Length];
        for (int i = 0; i < raw.Length; i++) {
            // An unparsable value leaves p at 0 and is therefore capped like a true zero.
            double.TryParse(raw[i], styles, invariant, out double p);
            negLog[i] = p == 0 ? zeroCap : -Math.Log10(p);
        }
        mdata.AddNumericColumn(prefix + "-log10" + key, prefix + "-log10" + key, negLog);
    }
    if (method == "DESeq2") {
        mdata.AddCategoryColumn(prefix + "Valid", prefix + "Valid", validCol);
    }
    mdata.AddCategoryColumn(prefix + "Significant", prefix + "Significant", sigCol);
}
/// <summary>
/// For every selected main column, computes p-values with <c>CalcSignificanceA</c> and adds
/// them as a numeric column ("... Significance A") together with a category column
/// ("... A significant") obtained by truncating on the p-value or on Benjamini-Hochberg FDR.
/// </summary>
/// <exception cref="Exception">
/// Thrown for an unknown "Side" index, or when the truncation mode is neither p-value nor
/// Benjamini-Hochberg.
/// </exception>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] cols = param.GetParam<int[]>("Columns").Value;
    int truncIndex = param.GetParam<int>("Use for truncation").Value;
    TestTruncation truncation;
    switch (truncIndex) {
        case 0:
            truncation = TestTruncation.Pvalue;
            break;
        case 1:
            truncation = TestTruncation.BenjaminiHochberg;
            break;
        default:
            truncation = TestTruncation.PermutationBased;
            break;
    }
    double threshold = param.GetParam<double>("Threshold value").Value;
    int sideInd = param.GetParam<int>("Side").Value;
    TestSide side;
    if (sideInd == 0) {
        side = TestSide.Both;
    } else if (sideInd == 1) {
        side = TestSide.Left;
    } else if (sideInd == 2) {
        side = TestSide.Right;
    } else {
        throw new Exception("Never get here.");
    }
    foreach (int col in cols) {
        BaseVector column = mdata.Values.GetColumn(col);
        double[] pvals = CalcSignificanceA(column, side);
        string[][] significant;
        if (truncation == TestTruncation.Pvalue) {
            significant = PerseusPluginUtils.CalcPvalueSignificance(pvals, threshold);
        } else if (truncation == TestTruncation.BenjaminiHochberg) {
            // The FDR values themselves are not used, only the significance flags.
            double[] fdrs;
            significant = PerseusPluginUtils.CalcBenjaminiHochbergFdr(pvals, threshold, pvals.Length, out fdrs);
        } else {
            throw new Exception("Never get here.");
        }
        mdata.AddNumericColumn(mdata.ColumnNames[col] + " Significance A", "", pvals);
        mdata.AddCategoryColumn(mdata.ColumnNames[col] + " A significant", "", significant);
    }
}
/// <summary>
/// Duplicates the selected main, numeric, multi-numeric, categorical and text columns.
/// Each copy gets a name from <c>StringUtils.GetNextAvailableName</c> so it does not clash
/// with the names already present for that column type, and keeps the description of its source.
/// </summary>
public void ProcessData(IMatrixData data, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] exColInds = param.GetParam<int[]>("Main columns").Value;
    int[] numColInds = param.GetParam<int[]>("Numerical columns").Value;
    int[] multiNumColInds = param.GetParam<int[]>("Multi-numerical columns").Value;
    int[] catColInds = param.GetParam<int[]>("Categorical columns").Value;
    int[] textColInds = param.GetParam<int[]>("Text columns").Value;
    if (exColInds.Length > 0) {
        int ncol = data.ColumnCount;
        // Keep all existing main columns and append the selected ones again at the end.
        data.ExtractColumns(ArrayUtils.Concat(ArrayUtils.ConsecutiveInts(data.ColumnCount), exColInds));
        HashSet<string> taken = new HashSet<string>(data.ColumnNames);
        for (int i = 0; i < exColInds.Length; i++) {
            string s = StringUtils.GetNextAvailableName(data.ColumnNames[ncol + i], taken);
            data.ColumnNames[ncol + i] = s;
            taken.Add(s);
        }
    }
    // In the loops below the set of taken names is rebuilt from the matrix on every iteration,
    // so it already contains the names added by earlier iterations.
    foreach (int ind in numColInds) {
        HashSet<string> taken = new HashSet<string>(data.NumericColumnNames);
        string s = StringUtils.GetNextAvailableName(data.NumericColumnNames[ind], taken);
        data.AddNumericColumn(s, data.NumericColumnDescriptions[ind], (double[])data.NumericColumns[ind].Clone());
    }
    foreach (int ind in multiNumColInds) {
        HashSet<string> taken = new HashSet<string>(data.MultiNumericColumnNames);
        string s = StringUtils.GetNextAvailableName(data.MultiNumericColumnNames[ind], taken);
        // NOTE(review): Clone() is shallow here, so the per-row arrays are shared with the source column.
        data.AddMultiNumericColumn(s, data.MultiNumericColumnDescriptions[ind],
            (double[][])data.MultiNumericColumns[ind].Clone());
    }
    foreach (int ind in catColInds) {
        HashSet<string> taken = new HashSet<string>(data.CategoryColumnNames);
        string s = StringUtils.GetNextAvailableName(data.CategoryColumnNames[ind], taken);
        data.AddCategoryColumn(s, data.CategoryColumnDescriptions[ind], data.GetCategoryColumnAt(ind));
    }
    foreach (int ind in textColInds) {
        HashSet<string> taken = new HashSet<string>(data.StringColumnNames);
        string s = StringUtils.GetNextAvailableName(data.StringColumnNames[ind], taken);
        // Use the text column's own description; ColumnDescriptions belongs to the main columns
        // and is indexed differently.
        data.AddStringColumn(s, data.StringColumnDescriptions[ind], (string[])data.StringColumns[ind].Clone());
    }
}
/// <summary>
/// Round-trip test: writes a matrix with annotation rows and columns of every kind through
/// <c>PerseusUtils.WriteMatrix</c>, reads the text back with <c>PerseusUtils.ReadMatrix</c>,
/// and checks the dimensions and one text cell of the restored matrix.
/// </summary>
public void WriteMatrixTest() {
    // main data
    IMatrixData original = PerseusFactory.CreateMatrixData(new double[,] { { 1, 2, 3 }, { 3, 4, 5 } },
        new List<string> { "col1", "col2", "col3" });
    // annotation rows
    original.AddCategoryRow("catrow", "this is catrow",
        new[] { new[] { "cat1" }, new[] { "cat1", "cat2" }, new[] { "cat2" } });
    original.AddNumericRow("numrow", "this is numrow", new[] { -1.0, 1, 2 });
    // annotation columns
    original.AddStringColumn("strcol1", "this is stringcol1", new[] { "1", "2" });
    original.AddStringColumn("strcol2", "", new[] { "", "hallo" });
    original.AddNumericColumn("numcol", "", new[] { 1.0, 2.0 });
    original.AddMultiNumericColumn("multnumcol", "this is multnumcol",
        new[] { new[] { -2.0, 2.0 }, new double[] { } });
    original.AddCategoryColumn("catcol", "", new[] { new[] { "cat1", "cat1.1" }, new[] { "cat2", "cat1" } });

    // Serialize into an in-memory buffer and keep the text.
    string serialized;
    using (MemoryStream buffer = new MemoryStream()) {
        using (StreamWriter writer = new StreamWriter(buffer)) {
            PerseusUtils.WriteMatrix(original, writer);
            writer.Flush();
            serialized = Encoding.UTF8.GetString(buffer.ToArray());
        }
    }

    // Read the text back into a fresh matrix.
    IMatrixData restored = PerseusFactory.CreateMatrixData();
    ProcessInfo info = new ProcessInfo(new Settings(), status => { }, progress => { }, 1);
    PerseusUtils.ReadMatrix(restored, info,
        () => new StreamReader(new MemoryStream(Encoding.UTF8.GetBytes(serialized))), "matrix1", '\t');

    Assert.AreEqual(2, restored.RowCount);
    Assert.AreEqual(3, restored.ColumnCount);
    Assert.AreEqual(2, restored.StringColumnCount);
    Assert.AreEqual(1, restored.NumericColumnCount);
    Assert.AreEqual(1, restored.CategoryColumnCount);
    Assert.AreEqual(1, restored.MultiNumericColumnCount);
    int strcol2Index = restored.StringColumnNames.FindIndex(col => col.Equals("strcol2"));
    Assert.AreEqual("hallo", restored.StringColumns[strcol2Index][1]);
    Assert.AreEqual(1, restored.CategoryRowCount);
    Assert.AreEqual(1, restored.NumericRowCount);
}
/// <summary>
/// Applies a row filter. When the parameters request a reduced matrix, only the given rows are
/// extracted; otherwise a "Filter" category column is added that marks every row as "Keep" or
/// "Discard".
/// </summary>
/// <param name="mdata">Matrix that is modified in place.</param>
/// <param name="parameters">Parameters passed to <c>GetReduceMatrix</c>.</param>
/// <param name="rows">Indices of the rows to keep; sorted in place when rows are only marked.</param>
public static void FilterRows(IMatrixData mdata, Parameters parameters, int[] rows) {
    if (GetReduceMatrix(parameters)) {
        mdata.ExtractExpressionRows(rows);
        return;
    }
    // Sorting enables the binary search below.
    Array.Sort(rows);
    string[][] marks = new string[mdata.RowCount][];
    for (int row = 0; row < marks.Length; row++) {
        if (Array.BinarySearch(rows, row) >= 0) {
            marks[row] = new[] { "Keep" };
        } else {
            marks[row] = new[] { "Discard" };
        }
    }
    mdata.AddCategoryColumn("Filter", "", marks);
}
/// <summary>
/// Searches one text column for a word and adds a category column named
/// "Search: &lt;column name&gt;" that contains "+" in every row where <c>Find</c> reports a match.
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string word = param.GetParam<string>("Find what").Value;
    int colInd = param.GetParam<int>("Look in").Value;
    bool matchCase = param.GetParam<bool>("Match case").Value;
    bool matchWholeWord = param.GetParam<bool>("Match whole word").Value;
    string searchedName = mdata.StringColumnNames[colInd];
    string[] searched = mdata.StringColumns[colInd];
    string[][] hits = new string[mdata.RowCount][];
    for (int row = 0; row < hits.Length; row++) {
        if (Find(searched[row], word, matchCase, matchWholeWord)) {
            hits[row] = new[] { "+" };
        } else {
            hits[row] = new string[0];
        }
    }
    // Name and description of the new column are the same text.
    string title = "Search: " + searchedName;
    mdata.AddCategoryColumn(title, title, hits);
}
/// <summary>
/// Looks for a word in the chosen text column and records the result in a new category column
/// ("Search: &lt;column name&gt;"): rows where <c>Find</c> succeeds get the term "+", all other
/// rows stay empty.
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string word = param.GetParam<string>("Find what").Value;
    int colInd = param.GetParam<int>("Look in").Value;
    bool matchCase = param.GetParam<bool>("Match case").Value;
    bool matchWholeWord = param.GetParam<bool>("Match whole word").Value;
    string sourceName = mdata.StringColumnNames[colInd];
    string[] sourceValues = mdata.StringColumns[colInd];
    int rowCount = mdata.RowCount;
    string[][] resultColumn = new string[rowCount][];
    int row = 0;
    while (row < resultColumn.Length) {
        bool matched = Find(sourceValues[row], word, matchCase, matchWholeWord);
        resultColumn[row] = matched ? new[] { "+" } : new string[0];
        row++;
    }
    string label = "Search: " + sourceName;
    mdata.AddCategoryColumn(label, label, resultColumn);
}
/// <summary>
/// Builds the combined matrix: a clone of <paramref name="mdata1"/> with its annotation rows set
/// through <c>SetAnnotationRows</c>. When the "Indicator" parameter is set, a category column
/// named after <paramref name="mdata2"/> is added that holds "+" for every row whose entry in
/// <paramref name="indexMap"/> is non-empty.
/// </summary>
/// <returns>The new matrix, with its origin set to "Combination".</returns>
private static IMatrixData GetResult(IMatrixData mdata1, IMatrixData mdata2, Parameters parameters,
    IList<int[]> indexMap) {
    IMatrixData combined = (IMatrixData)mdata1.Clone();
    SetAnnotationRows(combined, mdata1, mdata2);
    if (parameters.GetParam<bool>("Indicator").Value) {
        string[][] matchedFlags = new string[indexMap.Count][];
        for (int row = 0; row < indexMap.Count; row++) {
            if (indexMap[row].Length > 0) {
                matchedFlags[row] = new[] { "+" };
            } else {
                matchedFlags[row] = new string[0];
            }
        }
        combined.AddCategoryColumn(mdata2.Name, "", matchedFlags);
    }
    combined.Origin = "Combination";
    return combined;
}
/// <summary>
/// Applies a row filter. When the parameters request a reduced matrix, only the given rows are
/// extracted; otherwise a "Filter" category column is added that labels each row "Keep" or
/// "Discard".
/// </summary>
/// <param name="mdata">Matrix that is modified in place.</param>
/// <param name="parameters">Parameters passed to <c>GetReduceMatrix</c>.</param>
/// <param name="rows">Indices of the rows to keep; sorted in place when rows are only marked.</param>
public static void FilterRows(IMatrixData mdata, Parameters parameters, int[] rows) {
    if (GetReduceMatrix(parameters)) {
        mdata.ExtractRows(rows);
        return;
    }
    // Binary search needs the kept indices in ascending order.
    Array.Sort(rows);
    string[][] labels = new string[mdata.RowCount][];
    for (int row = 0; row < labels.Length; row++) {
        bool kept = Array.BinarySearch(rows, row) >= 0;
        if (kept) {
            labels[row] = new[] { "Keep" };
        } else {
            labels[row] = new[] { "Discard" };
        }
    }
    mdata.AddCategoryColumn("Filter", "", labels);
}
/// <summary>
/// Combines two category columns row by row with <c>CombineTerms</c> and adds the result as a
/// new category column named "&lt;first&gt;_&lt;second&gt;". Reports an error when the matrix
/// has fewer than two category columns.
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    if (mdata.CategoryColumnCount < 2) {
        processInfo.ErrString = "There are less than two categorical columns available.";
        return;
    }
    int firstIndex = param.GetParam<int>("First column").Value;
    int secondIndex = param.GetParam<int>("Second column").Value;
    string[][] first = mdata.GetCategoryColumnAt(firstIndex);
    string[][] second = mdata.GetCategoryColumnAt(secondIndex);
    string[][] combined = new string[first.Length][];
    int row = 0;
    while (row < combined.Length) {
        combined[row] = CombineTerms(first[row], second[row]);
        row++;
    }
    string firstName = mdata.CategoryColumnNames[firstIndex];
    string secondName = mdata.CategoryColumnNames[secondIndex];
    mdata.AddCategoryColumn(firstName + "_" + secondName, "", combined);
}
/// <summary>
/// For every selected column, ranks the non-NaN values and assigns each row to one of
/// "Number of quantiles" equally sized rank bins ("Q1", "Q2", ...). Rows with NaN are labelled
/// "missing". The labels are added as a category column named "&lt;column&gt;_q".
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int numQuantiles = param.GetParam<int>("Number of quantiles").Value;
    int[] colInds = param.GetParam<int[]>("Columns").Value;
    foreach (int colInd in colInds) {
        double[] allValues = GetValues(mdata, colInd);
        // Row indices that carry an actual value.
        List<int> validRowList = new List<int>();
        for (int row = 0; row < allValues.Length; row++) {
            if (double.IsNaN(allValues[row])) {
                continue;
            }
            validRowList.Add(row);
        }
        int[] validRows = validRowList.ToArray();
        double[] validValues = ArrayUtils.SubArray(allValues, validRows);
        int[] ordering = ArrayUtils.Order(validValues);
        // Row indices rearranged by the ordering of their values.
        int[] rowsByRank = ArrayUtils.SubArray(validRows, ordering);
        string[][] labels = new string[mdata.RowCount][];
        for (int row = 0; row < labels.Length; row++) {
            labels[row] = new[] { "missing" };
        }
        for (int rank = 0; rank < rowsByRank.Length; rank++) {
            // Integer arithmetic maps rank 0..n-1 onto bins 1..numQuantiles.
            int quantile = (rank * numQuantiles) / rowsByRank.Length + 1;
            labels[rowsByRank[rank]] = new[] { "Q" + quantile };
        }
        string name = GetName(mdata, colInd);
        string description = "The column " + name + " has been divided into " + numQuantiles + " quantiles.";
        mdata.AddCategoryColumn(name + "_q", description, labels);
    }
}
/// <summary>
/// Grades every row as "high", "medium" or "low" and adds the grades as the category column
/// "Absolute quantification accuracy". A row reaches a grade when its total peptide count, its
/// razor fraction (unique + razor peptides / total peptides) and its number of theoretical
/// peptides per 100 amino acids all meet that grade's minimum thresholds.
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    double[] totalPeptides = mdata.NumericColumns[param.GetParam<int>("Total number of peptides").Value];
    double[] uniqueRazorPeptides = mdata.NumericColumns[param.GetParam<int>("Unique + razor peptides").Value];
    double[] sequenceLength = mdata.NumericColumns[param.GetParam<int>("Sequence length").Value];
    double[] theoreticalPeptides =
        mdata.NumericColumns[param.GetParam<int>("Number of theoretical peptides").Value];
    double highMinPep = param.GetParam<double>("High: min. peptides").Value;
    double highMinRazorFraction = param.GetParam<double>("High: min. razor fraction").Value;
    double highMinTheorPep = param.GetParam<double>("High: min. theor.pep./100AA").Value;
    double mediumMinPep = param.GetParam<double>("Medium: min. peptides").Value;
    double mediumMinRazorFraction = param.GetParam<double>("Medium: min. razor fraction").Value;
    double mediumMinTheorPep = param.GetParam<double>("Medium: min. theor.pep./100AA").Value;
    double[] razorFraction = new double[mdata.RowCount];
    double[] theoreticalPepsPer100Aa = new double[mdata.RowCount];
    string[][] score = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        razorFraction[row] = uniqueRazorPeptides[row] / totalPeptides[row];
        theoreticalPepsPer100Aa[row] = theoreticalPeptides[row] / (sequenceLength[row] / 100);
        double peptides = totalPeptides[row];
        double fraction = razorFraction[row];
        double density = theoreticalPepsPer100Aa[row];
        string grade;
        if (peptides >= highMinPep && fraction >= highMinRazorFraction && density >= highMinTheorPep) {
            grade = "high";
        } else if (peptides >= mediumMinPep && fraction >= mediumMinRazorFraction &&
                   density >= mediumMinTheorPep) {
            grade = "medium";
        } else {
            grade = "low";
        }
        score[row] = new[] { grade };
    }
    mdata.AddCategoryColumn("Absolute quantification accuracy", "", score);
}
/// <summary>
/// Applies a row filter according to the integer "Filter mode" parameter: modes 0 and 2 extract
/// the given rows, mode 1 adds a "Filter" category column marking each row "Keep" or "Discard".
/// Any other mode leaves the matrix untouched.
/// </summary>
/// <param name="mdata">Matrix that is modified in place.</param>
/// <param name="parameters">Parameters holding "Filter mode".</param>
/// <param name="rows">Indices of the rows to keep; sorted in place in mode 1.</param>
public static void FilterRowsNew(IMatrixData mdata, Parameters parameters, int[] rows) {
    // NOTE(review): the result is never used; the call is kept in case unpacking has side effects.
    bool reduceMatrix = UnpackFilterModeParam(parameters) == FilterMode.Reduce;
    int mode = parameters.GetParam<int>("Filter mode").Value;
    switch (mode) {
        case 0:
        case 2:
            mdata.ExtractRows(rows);
            break;
        case 1:
            Array.Sort(rows);
            string[][] labels = new string[mdata.RowCount][];
            for (int row = 0; row < labels.Length; row++) {
                if (Array.BinarySearch(rows, row) >= 0) {
                    labels[row] = new[] { "Keep" };
                } else {
                    labels[row] = new[] { "Discard" };
                }
            }
            mdata.AddCategoryColumn("Filter", "", labels);
            break;
    }
}
/// <summary>
/// For every ratio/intensity column pair, computes p-values with <c>CalcSignificanceB</c> and
/// adds them as a numeric column ("... Significance B") together with a category column
/// ("... B significant") obtained by truncating on the p-value or on Benjamini-Hochberg FDR.
/// Intensity indices at or beyond the expression column count refer to numeric columns.
/// </summary>
/// <exception cref="Exception">
/// Thrown for an unknown "Side" index, or when the truncation mode is neither p-value nor
/// Benjamini-Hochberg.
/// </exception>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] rcols = param.GetMultiChoiceParam("Ratio columns").Value;
    int[] icols = param.GetMultiChoiceParam("Intensity columns").Value;
    if (rcols.Length == 0) {
        processInfo.ErrString = "Please specify some ratio columns.";
        return;
    }
    if (rcols.Length != icols.Length) {
        processInfo.ErrString = "The number of ratio and intensity columns have to be equal.";
        return;
    }
    int truncIndex = param.GetSingleChoiceParam("Use for truncation").Value;
    TestTruncation truncation;
    if (truncIndex == 0) {
        truncation = TestTruncation.Pvalue;
    } else if (truncIndex == 1) {
        truncation = TestTruncation.BenjaminiHochberg;
    } else {
        truncation = TestTruncation.PermutationBased;
    }
    double threshold = param.GetDoubleParam("Threshold value").Value;
    int sideInd = param.GetSingleChoiceParam("Side").Value;
    TestSide side;
    if (sideInd == 0) {
        side = TestSide.Both;
    } else if (sideInd == 1) {
        side = TestSide.Left;
    } else if (sideInd == 2) {
        side = TestSide.Right;
    } else {
        throw new Exception("Never get here.");
    }
    for (int pair = 0; pair < rcols.Length; pair++) {
        int ratioIndex = rcols[pair];
        int intensityIndex = icols[pair];
        float[] ratios = mdata.GetExpressionColumn(ratioIndex);
        float[] intensities;
        if (intensityIndex < mdata.ExpressionColumnCount) {
            intensities = mdata.GetExpressionColumn(intensityIndex);
        } else {
            intensities =
                ArrayUtils.ToFloats(mdata.NumericColumns[intensityIndex - mdata.ExpressionColumnCount]);
        }
        double[] pvals = CalcSignificanceB(ratios, intensities, side);
        string[][] significant;
        if (truncation == TestTruncation.Pvalue) {
            significant = PerseusPluginUtils.CalcPvalueSignificance(pvals, threshold);
        } else if (truncation == TestTruncation.BenjaminiHochberg) {
            significant = PerseusPluginUtils.CalcBenjaminiHochbergFdr(pvals, threshold);
        } else {
            throw new Exception("Never get here.");
        }
        mdata.AddNumericColumn(mdata.ExpressionColumnNames[ratioIndex] + " Significance B", "", pvals);
        mdata.AddCategoryColumn(mdata.ExpressionColumnNames[ratioIndex] + " B significant", "", significant);
    }
}
/// <summary>
/// Annotates each row with known modification sites parsed by
/// <c>PhosphoSitePlusParser.ParseKnownMod</c>. For every accession in the row's Uniprot column,
/// the known sequence windows of that accession are compared (after <c>TransformIl</c>) with the
/// row's sequence windows. Matching windows are written to the text column
/// "PhosphoSitePlus window"; "Known site" gets "+" for rows with a match; "Origin" lists which
/// of the sources "LTP", "HTP" and "CST" have a non-empty entry for a matching site.
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string mod = param.GetParam<int>("Modification").StringValue;
    string[] seqWins;
    string[] accs;
    string[] pubmedLtp;
    string[] pubmedMs2;
    string[] cstMs2;
    string[] species;
    PhosphoSitePlusParser.ParseKnownMod(mod, out seqWins, out accs, out pubmedLtp, out pubmedMs2, out cstMs2,
        out species);
    if (seqWins == null) {
        processInfo.ErrString = "File does not exist.";
        return;
    }
    string[] accessionCells = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    string[][] accessionsPerRow = new string[accessionCells.Length][];
    for (int row = 0; row < accessionCells.Length; row++) {
        string cell = accessionCells[row];
        accessionsPerRow[row] = cell.Length > 0 ? cell.Split(';') : new string[0];
    }
    string[] windowCells = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];
    // Accession -> indices of its known sites.
    Dictionary<string, List<int>> sitesByAccession = new Dictionary<string, List<int>>();
    for (int site = 0; site < seqWins.Length; site++) {
        string accession = accs[site];
        List<int> sites;
        if (!sitesByAccession.TryGetValue(accession, out sites)) {
            sites = new List<int>();
            sitesByAccession.Add(accession, sites);
        }
        sites.Add(site);
    }
    string[] windowColumn = new string[accessionsPerRow.Length];
    string[][] knownColumn = new string[accessionsPerRow.Length][];
    string[][] originColumn = new string[accessionsPerRow.Length][];
    for (int row = 0; row < windowColumn.Length; row++) {
        string[] rowWindows = TransformIl(windowCells[row]).Split(';');
        HashSet<string> matchedWindows = new HashSet<string>();
        HashSet<string> matchedOrigins = new HashSet<string>();
        foreach (string accession in accessionsPerRow[row]) {
            List<int> sites;
            if (!sitesByAccession.TryGetValue(accession, out sites)) {
                continue;
            }
            foreach (int site in sites) {
                string known = seqWins[site];
                // The known window is compared without its first and last character.
                string trimmed = TransformIl(known.ToUpper().Substring(1, known.Length - 2));
                if (!Contains(rowWindows, trimmed)) {
                    continue;
                }
                matchedWindows.Add(known);
                if (pubmedLtp[site].Length > 0) {
                    matchedOrigins.Add("LTP");
                }
                if (pubmedMs2[site].Length > 0) {
                    matchedOrigins.Add("HTP");
                }
                if (cstMs2[site].Length > 0) {
                    matchedOrigins.Add("CST");
                }
            }
        }
        if (matchedWindows.Count > 0) {
            windowColumn[row] = StringUtils.Concat(";", ArrayUtils.ToArray(matchedWindows));
            knownColumn[row] = new[] { "+" };
            string[] sortedOrigins = ArrayUtils.ToArray(matchedOrigins);
            Array.Sort(sortedOrigins);
            originColumn[row] = sortedOrigins;
        } else {
            windowColumn[row] = "";
            knownColumn[row] = new string[0];
            originColumn[row] = new string[0];
        }
    }
    mdata.AddStringColumn("PhosphoSitePlus window", "", windowColumn);
    mdata.AddCategoryColumn("Known site", "", knownColumn);
    mdata.AddCategoryColumn("Origin", "", originColumn);
}
/// <summary>
/// Adds a category column that marks with "+" every row in which any selected category or text
/// column contains one of the search terms (case-insensitive substring match on trimmed,
/// lower-cased terms). With "Inverse" set, the rows without a match are marked instead.
/// The column is named after "Name of new column", or after the first search term when that
/// name is empty.
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string colName = param.GetParam<string>("Name of new column").Value;
    int[] columns = param.GetParam<int[]>("Categories").Value;
    bool inverse = param.GetParam<bool>("Inverse").Value;
    Split(columns, out int[] catCols, out int[] stringCols, mdata.CategoryColumnCount);
    string[] rawTerms = param.GetParam<string[]>("Search terms").Value;
    if (rawTerms.Length == 0) {
        processInfo.ErrString = "Please specify one or more search terms.";
        return;
    }
    if (string.IsNullOrEmpty(colName)) {
        colName = rawTerms[0];
    }
    string[] terms = rawTerms.Select(term => term.ToLower().Trim()).ToArray();
    bool[] matched = new bool[mdata.RowCount];
    foreach (int col in catCols) {
        for (int row = 0; row < mdata.RowCount; row++) {
            foreach (string entry in mdata.GetCategoryColumnEntryAt(col, row)) {
                if (terms.Any(term => entry.ToLower().Contains(term))) {
                    matched[row] = true;
                }
            }
        }
    }
    foreach (int col in stringCols) {
        string[] cells = mdata.StringColumns[col];
        for (int row = 0; row < cells.Length; row++) {
            string cell = cells[row];
            if (terms.Any(term => cell.ToLower().Contains(term))) {
                matched[row] = true;
            }
        }
    }
    string[][] marks = new string[matched.Length][];
    for (int row = 0; row < marks.Length; row++) {
        // "Inverse" flips which rows receive the mark.
        bool mark = matched[row] != inverse;
        if (mark) {
            marks[row] = new[] { "+" };
        } else {
            marks[row] = new string[0];
        }
    }
    mdata.AddCategoryColumn(colName, "", marks);
}
/// <summary>
/// Annotates every row with the Pfam domains and UniProt features that cover the row's positions.
/// </summary>
/// <remarks>
/// The protein annotation is read with <c>MiniProteinAnnotation.ReadMapping</c> from
/// "conf\maxquantAnnot.txt.gz" below <c>FileUtils.executablePath</c>, restricted to the protein
/// ids that occur in the "Proteins" text column (cells are split on ';').
/// For each protein id of a row, the position at the same index of the row's
/// "Positions within proteins" cell (also split on ';') is parsed with <c>Parser.Int</c>.
/// Pfam names are collected where <c>Fits(pos, start, end)</c> holds. Features are collected per
/// <c>FeatureType</c> where <c>Fits(pos, begin, end)</c> holds; a begin that cannot be parsed is
/// replaced by <c>int.MaxValue</c> and an end that cannot be parsed by <c>int.MinValue</c>.
/// An empty feature description is recorded as "+". Non-empty feature statuses are collected
/// separately.
/// Output: the category column "Pfam domains", one category column per entry of
/// <c>FeatureType.allFeatureTypes</c> named by its <c>UniprotName</c>, and, when
/// "Add status column" is set, an additional "&lt;UniprotName&gt; status" column per feature type.
/// NOTE(review): positions are indexed by the protein-id index, so a row with fewer positions
/// than protein ids would fail on the array access; presumably callers guarantee matching
/// lengths, to be confirmed.
/// </remarks>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo) { string folder = FileUtils.executablePath + "\\conf"; string file = folder + "\\maxquantAnnot.txt.gz"; int protInd = param.GetParam <int>("Proteins").Value; int posInd = param.GetParam <int>("Positions within proteins").Value; bool addStatus = param.GetParam <bool>("Add status column").Value; string[] protCol = mdata.StringColumns[protInd]; HashSet <string> allProtIds = new HashSet <string>(); string[][] protIds = new string[protCol.Length][]; for (int i = 0; i < protCol.Length; i++) { protIds[i] = protCol[i].Length > 0 ? protCol[i].Split(';') : new string[0]; foreach (string s in protIds[i]) { if (!allProtIds.Contains(s)) { allProtIds.Add(s); } } } Dictionary <string, MiniProteinAnnotation> map = MiniProteinAnnotation.ReadMapping(file, allProtIds); string[] posCol = mdata.StringColumns[posInd]; int nrows = protCol.Length; string[][] pfamCol = new string[nrows][]; Dictionary <FeatureType, string[][]> cols = new Dictionary <FeatureType, string[][]>(); Dictionary <FeatureType, string[][]> statusCols = new Dictionary <FeatureType, string[][]>(); foreach (FeatureType t in FeatureType.allFeatureTypes) { cols.Add(t, new string[nrows][]); statusCols.Add(t, new string[nrows][]); } for (int i = 0; i < protCol.Length; i++) { string[] posString = posCol[i].Length > 0 ? 
posCol[i].Split(';') : new string[0]; HashSet <string> pfams = new HashSet <string>(); Dictionary <FeatureType, HashSet <string> > others = new Dictionary <FeatureType, HashSet <string> >(); Dictionary <FeatureType, HashSet <string> > othersStatus = new Dictionary <FeatureType, HashSet <string> >(); for (int j = 0; j < protIds[i].Length; j++) { string protId = protIds[i][j]; int pos = Parser.Int(posString[j]); if (map.ContainsKey(protId)) { MiniProteinAnnotation mpa = map[protId]; for (int k = 0; k < mpa.PfamIds.Length; k++) { if (Fits(pos, mpa.PfamStart[k], mpa.PfamEnd[k])) { pfams.Add(mpa.PfamNames[k]); } } foreach (FeatureType featureType in mpa.Features.Keys) { foreach (UniprotFeature uf in mpa.Features[featureType]) { if (!Parser.TryInt(uf.FeatureBegin, out int begin)) { begin = int.MaxValue; } if (!Parser.TryInt(uf.FeatureEnd, out int end)) { end = int.MinValue; } if (Fits(pos, begin, end)) { if (!others.ContainsKey(featureType)) { others.Add(featureType, new HashSet <string>()); othersStatus.Add(featureType, new HashSet <string>()); } string x = uf.FeatureDescription; if (string.IsNullOrEmpty(x)) { x = "+"; } others[featureType].Add(x); string y = uf.FeatureStatus; if (!string.IsNullOrEmpty(y)) { othersStatus[featureType].Add(y); } } } } } } pfamCol[i] = ToArray(pfams); foreach (FeatureType t in FeatureType.allFeatureTypes) { if (others.ContainsKey(t)) { cols[t][i] = ToArray(others[t]); } else { cols[t][i] = new string[0]; } if (othersStatus.ContainsKey(t)) { statusCols[t][i] = ToArray(othersStatus[t]); } else { statusCols[t][i] = new string[0]; } } } mdata.AddCategoryColumn("Pfam domains", "", pfamCol); foreach (FeatureType t in FeatureType.allFeatureTypes) { mdata.AddCategoryColumn(t.UniprotName, "", cols[t]); if (addStatus) { mdata.AddCategoryColumn(t.UniprotName + " status", "", statusCols[t]); } } }
/// <summary>
/// For every selected modification, loads the known sites through
/// <c>PhosphoSitePlusParser</c>, groups the upper-cased sequence windows by accession and lets
/// <c>CountSites</c> compute a per-row value from the row's accessions. Adds the category column
/// "Known modifications" (sorted names of the modifications with a value above zero) and one
/// numeric column "&lt;modification&gt; count" per modification.
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string[] mods = param.GetParam<int[]>("Modifications").StringValue
        .Split(new[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
    string[] accessionCells = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    int rowCount = accessionCells.Length;
    string[][] accessionsPerRow = new string[rowCount][];
    for (int row = 0; row < rowCount; row++) {
        string cell = accessionCells[row];
        accessionsPerRow[row] = cell.Length > 0 ? cell.Split(';') : new string[0];
    }
    double[][] countsPerMod = new double[mods.Length][];
    for (int m = 0; m < mods.Length; m++) {
        string filename = PhosphoSitePlusParser.GetFilenameForMod(mods[m]);
        if (filename == null) {
            processInfo.ErrString = "File does not exist.";
            return;
        }
        PhosphoSitePlusParser.ParseKnownMods(filename, out string[] seqWins, out string[] accs,
            out string[] pubmedLtp, out string[] pubmedMs2, out string[] cstMs2, out string[] species);
        for (int site = 0; site < seqWins.Length; site++) {
            seqWins[site] = seqWins[site].ToUpper();
        }
        // Accession -> distinct upper-cased sequence windows known for it.
        Dictionary<string, HashSet<string>> windowsByAccession = new Dictionary<string, HashSet<string>>();
        for (int site = 0; site < accs.Length; site++) {
            HashSet<string> windows;
            if (!windowsByAccession.TryGetValue(accs[site], out windows)) {
                windows = new HashSet<string>();
                windowsByAccession.Add(accs[site], windows);
            }
            windows.Add(seqWins[site]);
        }
        double[] rowCounts = new double[rowCount];
        for (int row = 0; row < rowCount; row++) {
            rowCounts[row] = CountSites(accessionsPerRow[row], windowsByAccession);
        }
        countsPerMod[m] = rowCounts;
    }
    string[][] knownMods = new string[rowCount][];
    for (int row = 0; row < knownMods.Length; row++) {
        List<string> present = new List<string>();
        for (int m = 0; m < mods.Length; m++) {
            if (countsPerMod[m][row] > 0) {
                present.Add(mods[m]);
            }
        }
        present.Sort();
        knownMods[row] = present.ToArray();
    }
    mdata.AddCategoryColumn("Known modifications", "Known modifications", knownMods);
    for (int m = 0; m < mods.Length; m++) {
        string title = mods[m] + " count";
        mdata.AddNumericColumn(title, title, countsPerMod[m]);
    }
}
/// <summary>
/// Merges two category columns row by row through <c>CombineTerms</c> and appends the result as
/// a category column called "&lt;first&gt;_&lt;second&gt;". An error is reported when fewer than
/// two category columns exist.
/// </summary>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    if (mdata.CategoryColumnCount < 2) {
        processInfo.ErrString = "There are less than two categorical columns available.";
        return;
    }
    int firstIndex = param.GetSingleChoiceParam("First column").Value;
    int secondIndex = param.GetSingleChoiceParam("Second column").Value;
    string[][] firstColumn = mdata.GetCategoryColumnAt(firstIndex);
    string[][] secondColumn = mdata.GetCategoryColumnAt(secondIndex);
    int rowCount = firstColumn.Length;
    string[][] merged = new string[rowCount][];
    for (int row = 0; row < rowCount; row++) {
        string[] left = firstColumn[row];
        string[] right = secondColumn[row];
        merged[row] = CombineTerms(left, right);
    }
    string mergedName = mdata.CategoryColumnNames[firstIndex] + "_" + mdata.CategoryColumnNames[secondIndex];
    mdata.AddCategoryColumn(mergedName, "", merged);
}
// Estimates per-protein copy numbers and derived quantities from intensity columns.
//
// Inputs are read from param: output selection, protein id column, intensity columns,
// averaging mode, logarithm settings, molecular mass column, optional detectability correction,
// scaling mode and the total cellular protein concentration.
// Outputs (depending on the "Output" selection): numeric columns for copy number,
// concentration [nM], mass and mole fractions [ppm], rank and relative rank; a "Histones"
// category column (always added); and per-column annotation rows with sample totals.
// Errors are reported through processInfo.ErrString.
//
// Fixes compared to the previous implementation:
//  * NaN normalization factors are now detected (comparison with double.NaN is always false).
//  * Group lookup for averaging mode 2 uses the selected column's index in the category row
//    instead of its position in the selection, tolerates empty entries, and gives ungrouped
//    columns ids that cannot collide with real group indices.
//  * Organism annotation entries default to "N/A" instead of null for unselected columns.
//  * The sample-name regex is built once instead of once per column.
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] outputColumns = param.GetMultiChoiceParam("Output").Value;
    int proteinIdColumnInd = param.GetSingleChoiceParam("Protein IDs").Value;
    string[] proteinIds = mdata.StringColumns[proteinIdColumnInd];
    int[] intensityCols = param.GetMultiChoiceParam("Intensities").Value;
    if (intensityCols.Length == 0) {
        processInfo.ErrString = "Please select at least one column containing protein intensities.";
        return;
    }

    // Collect all intensity values. Selection indices below ExpressionColumnCount refer to
    // expression columns, higher ones to numeric annotation columns.
    // The sample name is the column name with an optional "[LFQ ]Intensity " prefix removed.
    Regex sampleNamePattern = new Regex(@"^(?:(?:LFQ )?[Ii]ntensity )?(.*)$");
    List<double[]> columns = new List<double[]>();
    string[] sampleNames = new string[intensityCols.Length];
    for (int col = 0; col < intensityCols.Length; col++) {
        int sourceCol = intensityCols[col];
        double[] values;
        string columnName;
        if (sourceCol < mdata.ExpressionColumnCount) {
            values = ArrayUtils.ToDoubles(mdata.GetExpressionColumn(sourceCol));
            columnName = mdata.ExpressionColumnNames[sourceCol];
        } else {
            values = mdata.NumericColumns[sourceCol - mdata.ExpressionColumnCount];
            columnName = mdata.NumericColumnNames[sourceCol - mdata.ExpressionColumnCount];
        }
        sampleNames[col] = sampleNamePattern.Match(columnName).Groups[1].Value;
        columns.Add(values);
    }

    // Averaging mode 3: collapse all columns into one column of row-wise medians.
    int averagingMode = param.GetSingleChoiceWithSubParams("Averaging mode").Value;
    if (averagingMode == 3) {
        double[] column = new double[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++) {
            double[] values = new double[intensityCols.Length];
            for (int col = 0; col < intensityCols.Length; col++) {
                values[col] = columns[col][row];
            }
            column[row] = ArrayUtils.Median(ExtractValidValues(values, false));
        }
        // replace the original list of columns
        columns = new List<double[]> { column };
        sampleNames = new[] { "" };
    }

    // Revert the logarithm if necessary. The arrays are modified in place.
    if (param.GetBoolWithSubParams("Logarithmized").Value) {
        double[] logBases = new[] { 2, Math.E, 10 };
        double logBase =
            logBases[param.GetBoolWithSubParams("Logarithmized").GetSubParameters().GetSingleChoiceParam("log base").Value];
        foreach (double[] t in columns) {
            for (int row = 0; row < mdata.RowCount; row++) {
                if (t[row] == 0) {
                    // Only recorded as an error message; processing continues.
                    processInfo.ErrString = "Are the columns really logarithmized?\nThey contain zeroes!";
                }
                t[row] = Math.Pow(logBase, t[row]);
            }
        }
    }

    double[] mw = mdata.NumericColumns[param.GetSingleChoiceParam("Molecular masses").Value];
    // Detect whether the molecular masses are given in Da or kDa; a median below 250 is taken
    // as kDa and converted in place.
    if (ArrayUtils.Median(mw) < 250) {
        for (int i = 0; i < mw.Length; i++) {
            mw[i] *= 1000;
        }
    }

    // Without detectability correction this is the same array object as mw, so the replacement
    // below also applies to mw.
    double[] detectabilityNormFactor = mw;
    if (param.GetBoolWithSubParams("Detectability correction").Value) {
        detectabilityNormFactor =
            mdata.NumericColumns[
                param.GetBoolWithSubParams("Detectability correction")
                    .GetSubParameters()
                    .GetSingleChoiceParam("Correction factor")
                    .Value];
    }
    // The normalization factor must be a usable nonzero number for all proteins;
    // replace zero and NaN with 1.
    for (int row = 0; row < mdata.RowCount; row++) {
        if (detectabilityNormFactor[row] == 0 || double.IsNaN(detectabilityNormFactor[row])) {
            detectabilityNormFactor[row] = 1;
        }
    }

    // detect the organism
    Organism organism = DetectOrganism(proteinIds);
    // c value: the amount of DNA per cell, see: http://en.wikipedia.org/wiki/C-value
    double cValue = (organism.genomeSize*basePairWeight)/avogadro;

    // find the histones and write a categorical column indicating them
    int[] histoneRows = FindHistones(proteinIds, organism);
    string[][] histoneCol = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        histoneCol[row] = ArrayUtils.Contains(histoneRows, row) ? new[] { "+" } : new[] { "" };
    }
    mdata.AddCategoryColumn("Histones", "", histoneCol);

    // initialize the variables for the annotation rows
    double[] totalProteinRow = new double[mdata.ExpressionColumnCount];
    double[] totalMoleculesRow = new double[mdata.ExpressionColumnCount];
    string[][] organismRow = new string[mdata.ExpressionColumnCount][];
    // Default every entry so that columns that were not selected do not carry null.
    for (int i = 0; i < organismRow.Length; i++) {
        organismRow[i] = new[] { "N/A" };
    }
    double[] histoneMassRow = new double[mdata.ExpressionColumnCount];
    double[] ploidyRow = new double[mdata.ExpressionColumnCount];
    double[] cellVolumeRow = new double[mdata.ExpressionColumnCount];

    // Calculate the normalization factor (intensity -> copies) for each column, either from the
    // total protein amount or from the histone signal.
    int scalingMode = param.GetSingleChoiceWithSubParams("Scaling mode").Value;
    double[] normalizationFactors = new double[columns.Count];
    for (int col = 0; col < columns.Count; col++) {
        double[] column = columns[col];
        double factor;
        switch (scalingMode) {
            case 0: // total protein amount
                double mwWeightedNormalizedSummedIntensities = 0;
                for (int row = 0; row < mdata.RowCount; row++) {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                        mwWeightedNormalizedSummedIntensities += (column[row]/detectabilityNormFactor[row])*mw[row];
                    }
                }
                factor =
                    (param.GetSingleChoiceWithSubParams("Scaling mode")
                        .GetSubParameters()
                        .GetDoubleParam("Protein amount per cell [pg]")
                        .Value*1e-12*avogadro)/mwWeightedNormalizedSummedIntensities;
                break;
            case 1: // histone mode
                double mwWeightedNormalizedSummedHistoneIntensities = 0;
                foreach (int row in histoneRows) {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                        mwWeightedNormalizedSummedHistoneIntensities += (column[row]/detectabilityNormFactor[row])*mw[row];
                    }
                }
                double ploidy =
                    param.GetSingleChoiceWithSubParams("Scaling mode").GetSubParameters().GetDoubleParam("Ploidy").Value;
                factor = (cValue*ploidy*avogadro)/mwWeightedNormalizedSummedHistoneIntensities;
                break;
            default:
                factor = 1;
                break;
        }
        normalizationFactors[col] = factor;
    }

    // Averaging mode 1: same factor for all columns.
    if (averagingMode == 1) {
        double factor = ArrayUtils.Mean(normalizationFactors);
        for (int i = 0; i < normalizationFactors.Length; i++) {
            normalizationFactors[i] = factor;
        }
    }

    // Averaging mode 2: same factor within each group of the selected category row.
    if (averagingMode == 2) {
        int groupingInd =
            param.GetSingleChoiceWithSubParams("Averaging mode").GetSubParameters().GetSingleChoiceParam("Grouping").Value;
        if (groupingInd == -1) {
            processInfo.ErrString = "No grouping selected.";
            return;
        }
        string[][] groupNames = mdata.GetCategoryRowAt(groupingInd);
        string[] uniqueGroupNames = Unique(groupNames);
        int[] grouping = new int[columns.Count];
        for (int i = 0; i < columns.Count; i++) {
            // Default: the column keeps its own factor. Ids at or above uniqueGroupNames.Length
            // can never coincide with a real group index.
            grouping[i] = uniqueGroupNames.Length + i;
            int sourceCol = intensityCols[i];
            if (sourceCol >= mdata.ExpressionColumnCount) {
                // Numeric annotation columns cannot be grouped
                continue;
            }
            // The category row is indexed by expression column, not by selection position.
            string[] entry = groupNames[sourceCol];
            if (entry == null || entry.Length == 0) {
                continue;
            }
            if (ArrayUtils.Contains(uniqueGroupNames, entry[0])) {
                grouping[i] = ArrayUtils.IndexOf(uniqueGroupNames, entry[0]);
            }
        }
        Dictionary<int, List<double>> factors = new Dictionary<int, List<double>>();
        for (int i = 0; i < columns.Count; i++) {
            List<double> groupFactors;
            if (!factors.TryGetValue(grouping[i], out groupFactors)) {
                groupFactors = new List<double>();
                factors.Add(grouping[i], groupFactors);
            }
            groupFactors.Add(normalizationFactors[i]);
        }
        double[] averagedNormalizationFactors = new double[columns.Count];
        for (int i = 0; i < columns.Count; i++) {
            averagedNormalizationFactors[i] = ArrayUtils.Mean(factors[grouping[i]]);
        }
        normalizationFactors = averagedNormalizationFactors;
    }

    // Loop over all selected columns and calculate copy numbers.
    double proteinConcentration = param.GetDoubleParam("Total cellular protein concentration [g/l]").Value;
    for (int col = 0; col < columns.Count; col++) {
        string sampleName = sampleNames[col];
        double[] column = columns[col];
        double factor = normalizationFactors[col];
        double[] copyNumbers = new double[mdata.RowCount];
        double[] concentrations = new double[mdata.RowCount];
        double[] massFraction = new double[mdata.RowCount];
        double[] moleFraction = new double[mdata.RowCount];
        double totalProtein = 0; // picograms
        double histoneMass = 0; // picograms
        double totalMolecules = 0;
        for (int row = 0; row < mdata.RowCount; row++) {
            if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                copyNumbers[row] = (column[row]/detectabilityNormFactor[row])*factor;
                totalMolecules += copyNumbers[row];
                totalProtein += (copyNumbers[row]*mw[row]*1e12)/avogadro; // picograms
                if (ArrayUtils.Contains(histoneRows, row)) {
                    histoneMass += (copyNumbers[row]*mw[row]*1e12)/avogadro; // picograms
                }
            }
        }
        double totalVolume = (totalProtein/proteinConcentration)*1000; // femtoliters
        for (int row = 0; row < mdata.RowCount; row++) {
            if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                concentrations[row] = ((copyNumbers[row]/(totalVolume*1e-15))/avogadro)*1e9; // nanomolar
                massFraction[row] = (((copyNumbers[row]*mw[row]*1e12)/avogadro)/totalProtein)*1e6; // ppm
                moleFraction[row] = (copyNumbers[row]/totalMolecules)*1e6; // ppm
            }
        }

        string suffix = (sampleName == "") ? "" : " " + sampleName;
        if (ArrayUtils.Contains(outputColumns, 0)) {
            mdata.AddNumericColumn("Copy number" + suffix, "", copyNumbers);
        }
        if (ArrayUtils.Contains(outputColumns, 1)) {
            mdata.AddNumericColumn("Concentration [nM]" + suffix, "", concentrations);
        }
        if (ArrayUtils.Contains(outputColumns, 2)) {
            mdata.AddNumericColumn("Abundance (mass/total mass) [*10^-6]" + suffix, "", massFraction);
        }
        if (ArrayUtils.Contains(outputColumns, 3)) {
            mdata.AddNumericColumn("Abundance (molecules/total molecules) [*10^-6]" + suffix, "", moleFraction);
        }

        double[] rank = ArrayUtils.Rank(copyNumbers);
        double[] relativeRank = new double[mdata.RowCount];
        double validRanks = mdata.RowCount;
        for (int row = 0; row < mdata.RowCount; row++) {
            // remove rank for protein with no copy number information
            if (double.IsNaN(copyNumbers[row]) || copyNumbers[row] == 0) {
                rank[row] = double.NaN;
                validRanks--; // do not consider as valid
            }
            // invert ranking, so that rank 0 is the most abundant protein (NaN stays NaN)
            rank[row] = mdata.RowCount - rank[row];
        }
        for (int row = 0; row < mdata.RowCount; row++) {
            relativeRank[row] = rank[row]/validRanks;
        }
        if (ArrayUtils.Contains(outputColumns, 4)) {
            mdata.AddNumericColumn("Copy number rank" + suffix, "", rank);
        }
        if (ArrayUtils.Contains(outputColumns, 5)) {
            mdata.AddNumericColumn("Relative copy number rank" + suffix, "", relativeRank);
        }

        // Sample totals are only recorded for expression columns and not in median mode.
        if (intensityCols[col] < mdata.ExpressionColumnCount && averagingMode != 3) {
            int target = intensityCols[col];
            totalProteinRow[target] = Math.Round(totalProtein, 2);
            totalMoleculesRow[target] = Math.Round(totalMolecules, 0);
            organismRow[target] = new string[] { organism.name };
            histoneMassRow[target] = Math.Round(histoneMass, 4);
            ploidyRow[target] = Math.Round((histoneMass*1e-12)/cValue, 2);
            cellVolumeRow[target] = Math.Round(totalVolume, 2); // femtoliters
        }
    }

    // Summary annotation rows
    if (averagingMode != 3 && ArrayUtils.Contains(outputColumns, 6)) {
        mdata.AddNumericRow("Total protein [pg/cell]", "", totalProteinRow);
        mdata.AddNumericRow("Total molecules per cell", "", totalMoleculesRow);
        mdata.AddCategoryRow("Organism", "", organismRow);
        mdata.AddNumericRow("Histone mass [pg/cell]", "", histoneMassRow);
        mdata.AddNumericRow("Ploidy", "", ploidyRow);
        mdata.AddNumericRow("Cell volume [fl]", "", cellVolumeRow);
    }
}
// Adds a "Known modifications" category column plus one "<mod> count" numeric column for each
// selected modification. The counts come from CountSites, called with the row's UniProt
// identifiers and a per-accession set of upper-cased sequence windows parsed from the
// modification's annotation file.
// When that file cannot be resolved for any selected modification, an error is stored in
// processInfo and nothing is added.
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string modSelection = param.GetParam<int[]>("Modifications").StringValue;
    string[] selectedMods = modSelection.Split(new[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
    int uniprotInd = param.GetParam<int>("Uniprot column").Value;
    string[] uniprotCells = mdata.StringColumns[uniprotInd];

    // Identifiers per row; an empty cell yields an empty list.
    string[][] rowIds = new string[uniprotCells.Length][];
    for (int r = 0; r < uniprotCells.Length; r++) {
        if (uniprotCells[r].Length > 0) {
            rowIds[r] = uniprotCells[r].Split(';');
        } else {
            rowIds[r] = new string[0];
        }
    }

    double[][] siteCounts = new double[selectedMods.Length][];
    for (int modInd = 0; modInd < selectedMods.Length; modInd++) {
        string filename = PhosphoSitePlusParser.GetFilenameForMod(selectedMods[modInd]);
        if (filename == null) {
            processInfo.ErrString = "File does not exist.";
            return;
        }
        string[] seqWins;
        string[] accs;
        string[] pubmedLtp;
        string[] pubmedMs2;
        string[] cstMs2;
        string[] species;
        PhosphoSitePlusParser.ParseKnownMods(filename, out seqWins, out accs, out pubmedLtp, out pubmedMs2,
            out cstMs2, out species);

        // Normalize the case of all windows in place first.
        for (int w = 0; w < seqWins.Length; w++) {
            seqWins[w] = seqWins[w].ToUpper();
        }

        // Distinct windows grouped by accession.
        Dictionary<string, HashSet<string>> sitesPerAcc = new Dictionary<string, HashSet<string>>();
        for (int a = 0; a < accs.Length; a++) {
            HashSet<string> sites;
            if (!sitesPerAcc.TryGetValue(accs[a], out sites)) {
                sites = new HashSet<string>();
                sitesPerAcc.Add(accs[a], sites);
            }
            sites.Add(seqWins[a]);
        }

        double[] perRow = new double[uniprotCells.Length];
        for (int r = 0; r < perRow.Length; r++) {
            perRow[r] = CountSites(rowIds[r], sitesPerAcc);
        }
        siteCounts[modInd] = perRow;
    }

    // Category entries: every modification with a positive count, sorted.
    string[][] modsPerRow = new string[uniprotCells.Length][];
    for (int r = 0; r < modsPerRow.Length; r++) {
        List<string> present = new List<string>();
        for (int modInd = 0; modInd < selectedMods.Length; modInd++) {
            if (siteCounts[modInd][r] > 0) {
                present.Add(selectedMods[modInd]);
            }
        }
        present.Sort();
        modsPerRow[r] = present.ToArray();
    }
    mdata.AddCategoryColumn("Known modifications", "Known modifications", modsPerRow);

    for (int modInd = 0; modInd < selectedMods.Length; modInd++) {
        string label = selectedMods[modInd] + " count";
        mdata.AddNumericColumn(label, label, siteCounts[modInd]);
    }
}
// For every selected expression column, computes p-values with CalcSignificanceA and adds
//  * a numeric column "<name> Significance A" with the p-values and
//  * a category column "<name> A significant" with the result of the chosen truncation
//    (plain p-value threshold or Benjamini-Hochberg).
// Throws for a side index outside 0..2 and for any truncation other than those two.
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] selected = param.GetMultiChoiceParam("Columns").Value;

    int truncIndex = param.GetSingleChoiceParam("Use for truncation").Value;
    TestTruncation truncation;
    if (truncIndex == 0) {
        truncation = TestTruncation.Pvalue;
    } else if (truncIndex == 1) {
        truncation = TestTruncation.BenjaminiHochberg;
    } else {
        truncation = TestTruncation.PermutationBased;
    }

    double threshold = param.GetDoubleParam("Threshold value").Value;

    int sideInd = param.GetSingleChoiceParam("Side").Value;
    TestSide side;
    if (sideInd == 0) {
        side = TestSide.Both;
    } else if (sideInd == 1) {
        side = TestSide.Left;
    } else if (sideInd == 2) {
        side = TestSide.Right;
    } else {
        throw new Exception("Never get here.");
    }

    for (int k = 0; k < selected.Length; k++) {
        int col = selected[k];
        float[] values = mdata.GetExpressionColumn(col);
        double[] pvals = CalcSignificanceA(values, side);

        string[][] significant;
        if (truncation == TestTruncation.Pvalue) {
            significant = PerseusPluginUtils.CalcPvalueSignificance(pvals, threshold);
        } else if (truncation == TestTruncation.BenjaminiHochberg) {
            significant = PerseusPluginUtils.CalcBenjaminiHochbergFdr(pvals, threshold);
        } else {
            throw new Exception("Never get here.");
        }

        mdata.AddNumericColumn(mdata.ExpressionColumnNames[col] + " Significance A", "", pvals);
        mdata.AddCategoryColumn(mdata.ExpressionColumnNames[col] + " A significant", "", significant);
    }
}
// Adds a category column that marks rows ("+") in which any of the search terms occurs as a
// case-insensitive substring of the selected category or string columns. With "Inverse" set,
// the rows without a hit are marked instead. The column name falls back to the first search
// term when no name was given. Reports an error when no search term is specified.
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string colName = param.GetParam<string>("Name of new column").Value;
    int[] columns = param.GetParam<int[]>("Categories").Value;
    bool inverse = param.GetParam<bool>("Inverse").Value;
    int[] catCols;
    int[] stringCols;
    Split(columns, out catCols, out stringCols, mdata.CategoryColumnCount);

    string[] rawTerms = param.GetParam<string[]>("Search terms").Value;
    if (rawTerms.Length == 0) {
        processInfo.ErrString = "Please specify one or more search terms.";
        return;
    }
    if (string.IsNullOrEmpty(colName)) {
        colName = rawTerms[0];
    }
    // Lower-cased, trimmed search terms.
    string[] terms = rawTerms.Select(t => t.ToLower().Trim()).ToArray();

    bool[] matched = new bool[mdata.RowCount];

    // Category columns: a row matches when any of its entries contains any term.
    foreach (int col in catCols) {
        for (int row = 0; row < mdata.RowCount; row++) {
            foreach (string entry in mdata.GetCategoryColumnEntryAt(col, row)) {
                if (terms.Any(term => entry.ToLower().Contains(term))) {
                    matched[row] = true;
                }
            }
        }
    }

    // String columns: a row matches when its text contains any term.
    foreach (int col in stringCols) {
        string[] texts = mdata.StringColumns[col];
        for (int row = 0; row < texts.Length; row++) {
            string text = texts[row];
            if (terms.Any(term => text.ToLower().Contains(term))) {
                matched[row] = true;
            }
        }
    }

    string[][] marks = new string[matched.Length][];
    for (int row = 0; row < marks.Length; row++) {
        // "matched XOR inverse" decides whether the row gets the mark.
        if (matched[row] != inverse) {
            marks[row] = new[] { "+" };
        } else {
            marks[row] = new string[0];
        }
    }
    mdata.AddCategoryColumn(colName, "", marks);
}
// For every pair of ratio and intensity column, computes p-values with CalcSignificanceB and
// adds a numeric column "<ratio name> Significance B" and a category column
// "<ratio name> B significant" (p-value threshold or Benjamini-Hochberg truncation).
// Intensity indices at or above mdata.ColumnCount refer to numeric annotation columns.
// Reports an error when no ratio column is selected or the two selections differ in length;
// throws for a side index outside 0..2 and for any other truncation.
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] ratioCols = param.GetParam<int[]>("Ratio columns").Value;
    int[] intensCols = param.GetParam<int[]>("Intensity columns").Value;
    if (ratioCols.Length == 0) {
        processInfo.ErrString = "Please specify some ratio columns.";
        return;
    }
    if (ratioCols.Length != intensCols.Length) {
        processInfo.ErrString = "The number of ratio and intensity columns have to be equal.";
        return;
    }

    int truncIndex = param.GetParam<int>("Use for truncation").Value;
    TestTruncation truncation;
    if (truncIndex == 0) {
        truncation = TestTruncation.Pvalue;
    } else if (truncIndex == 1) {
        truncation = TestTruncation.BenjaminiHochberg;
    } else {
        truncation = TestTruncation.PermutationBased;
    }

    double threshold = param.GetParam<double>("Threshold value").Value;

    int sideInd = param.GetParam<int>("Side").Value;
    TestSide side;
    if (sideInd == 0) {
        side = TestSide.Both;
    } else if (sideInd == 1) {
        side = TestSide.Left;
    } else if (sideInd == 2) {
        side = TestSide.Right;
    } else {
        throw new Exception("Never get here.");
    }

    for (int pair = 0; pair < ratioCols.Length; pair++) {
        int ratioInd = ratioCols[pair];
        int intensInd = intensCols[pair];

        BaseVector ratios = mdata.Values.GetColumn(ratioInd);
        BaseVector intensities;
        if (intensInd < mdata.ColumnCount) {
            intensities = mdata.Values.GetColumn(intensInd);
        } else {
            intensities = new DoubleArrayVector(mdata.NumericColumns[intensInd - mdata.ColumnCount]);
        }
        double[] pvals = CalcSignificanceB(ratios, intensities, side);

        string[][] significant;
        if (truncation == TestTruncation.Pvalue) {
            significant = PerseusPluginUtils.CalcPvalueSignificance(pvals, threshold);
        } else if (truncation == TestTruncation.BenjaminiHochberg) {
            significant = PerseusPluginUtils.CalcBenjaminiHochbergFdr(pvals, threshold, out double[] fdrs);
        } else {
            throw new Exception("Never get here.");
        }

        mdata.AddNumericColumn(mdata.ColumnNames[ratioInd] + " Significance B", "", pvals);
        mdata.AddCategoryColumn(mdata.ColumnNames[ratioInd] + " B significant", "", significant);
    }
}
// For every selected expression column, adds a category column "<name>_q" that assigns each row
// a quantile label "Q1".."Q<n>" according to the position of its value in the ordering
// returned by ArrayUtils.Order over the non-NaN values. Rows whose value is NaN get "missing".
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ProcessInfo processInfo) {
    int numQuantiles = param.GetIntParam("Number of quantiles").Value;
    int[] colInds = param.GetMultiChoiceParam("Columns").Value;
    for (int c = 0; c < colInds.Length; c++) {
        int colInd = colInds[c];
        float[] allValues = mdata.GetExpressionColumn(colInd);

        // Row indices that carry a value.
        List<int> validRows = new List<int>();
        for (int row = 0; row < allValues.Length; row++) {
            if (float.IsNaN(allValues[row])) {
                continue;
            }
            validRows.Add(row);
        }
        int[] validInds = validRows.ToArray();

        // Row indices rearranged according to the ordering of their values.
        float[] validValues = ArrayUtils.SubArray(allValues, validInds);
        int[] order = ArrayUtils.Order(validValues);
        int[] rowsByValue = ArrayUtils.SubArray(validInds, order);

        string[][] labels = new string[mdata.RowCount][];
        for (int row = 0; row < labels.Length; row++) {
            labels[row] = new[] { "missing" };
        }
        for (int pos = 0; pos < rowsByValue.Length; pos++) {
            // Integer arithmetic: positions are split into numQuantiles consecutive bins.
            int quantile = (pos*numQuantiles)/rowsByValue.Length + 1;
            labels[rowsByValue[pos]] = new[] { "Q" + quantile };
        }

        string sourceName = mdata.ExpressionColumnNames[colInd];
        string name = sourceName + "_q";
        string desc = "The column " + sourceName + " has been divided into " + numQuantiles + " quantiles.";
        mdata.AddCategoryColumn(name, desc, labels);
    }
}
// Estimates per-protein copy numbers and derived quantities from intensity columns.
//
// Inputs are read from param: output selection, protein id column, intensity columns,
// averaging mode, logarithm settings, molecular mass column, optional detectability correction,
// scaling mode and the total cellular protein concentration.
// Outputs (depending on the "Output" selection): numeric columns for copy number,
// concentration [nM], mass and mole fractions [ppm], rank and relative rank; a "Histones"
// category column (always added); per-column annotation rows with sample totals; and a
// supplementary summary matrix returned through supplTables.
// Errors are reported through processInfo.ErrString.
//
// Fixes compared to the previous implementation:
//  * Group lookup for averaging mode 2 uses the selected column's index in the category row
//    instead of its position in the selection, tolerates empty entries, and gives ungrouped
//    columns ids that cannot collide with real group indices.
//  * The sample-name regex is built once instead of once per column.
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int[] outputColumns = param.GetParam<int[]>("Output").Value;
    int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
    string[] proteinIds = mdata.StringColumns[proteinIdColumnInd];
    int[] intensityCols = param.GetParam<int[]>("Intensities").Value;
    if (intensityCols.Length == 0) {
        processInfo.ErrString = "Please select at least one column containing protein intensities.";
        return;
    }

    // Collect all intensity values. Selection indices below ColumnCount refer to main columns,
    // higher ones to numeric annotation columns.
    // The sample name is the input column name with an optional "[LFQ ]Intensity " prefix removed.
    Regex sampleNamePattern = new Regex(@"^(?:(?:LFQ )?[Ii]ntensity )?(.*)$");
    List<double[]> columns = new List<double[]>();
    string[] inputNames = new string[intensityCols.Length];
    string[] sampleNames = new string[intensityCols.Length];
    for (int col = 0; col < intensityCols.Length; col++) {
        int sourceCol = intensityCols[col];
        double[] values;
        if (sourceCol < mdata.ColumnCount) {
            values = ArrayUtils.ToDoubles(mdata.Values.GetColumn(sourceCol));
            inputNames[col] = mdata.ColumnNames[sourceCol];
        } else {
            values = mdata.NumericColumns[sourceCol - mdata.ColumnCount];
            inputNames[col] = mdata.NumericColumnNames[sourceCol - mdata.ColumnCount];
        }
        sampleNames[col] = sampleNamePattern.Match(inputNames[col]).Groups[1].Value;
        columns.Add(values);
    }

    // Averaging mode 3: collapse all columns into one column of row-wise medians.
    int averagingMode = param.GetParamWithSubParams<int>("Averaging mode").Value;
    if (averagingMode == 3) {
        double[] column = new double[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++) {
            double[] values = new double[intensityCols.Length];
            for (int col = 0; col < intensityCols.Length; col++) {
                values[col] = columns[col][row];
            }
            column[row] = ArrayUtils.Median(ExtractValidValues(values, false));
        }
        // replace the original list of columns
        columns = new List<double[]> { column };
        sampleNames = new[] { "" };
    }

    // Revert the logarithm if necessary. The arrays are modified in place.
    if (param.GetParamWithSubParams<bool>("Logarithmized").Value) {
        double[] logBases = new[] { 2, Math.E, 10 };
        double logBase =
            logBases[param.GetParamWithSubParams<bool>("Logarithmized").GetSubParameters().GetParam<int>("log base").Value];
        foreach (double[] t in columns) {
            for (int row = 0; row < mdata.RowCount; row++) {
                if (t[row] == 0) {
                    // Only recorded as an error message; processing continues.
                    processInfo.ErrString = "Are the columns really logarithmized?\nThey contain zeroes!";
                }
                t[row] = Math.Pow(logBase, t[row]);
            }
        }
    }

    double[] mw = mdata.NumericColumns[param.GetParam<int>("Molecular masses").Value];
    // Decide whether the molecular masses are given in Da or kDa; a median below 250 is taken
    // as kDa and converted in place.
    if (ArrayUtils.Median(mw) < 250) {
        for (int i = 0; i < mw.Length; i++) {
            mw[i] *= 1000;
        }
    }

    // Without detectability correction this is the same array object as mw, so the replacement
    // below also applies to mw.
    double[] detectabilityNormFactor = mw;
    if (param.GetParamWithSubParams<bool>("Detectability correction").Value) {
        detectabilityNormFactor =
            mdata.NumericColumns[
                param.GetParamWithSubParams<bool>("Detectability correction").GetSubParameters()
                    .GetParam<int>("Correction factor").Value];
    }
    // The normalization factor must be a usable nonzero number for all proteins;
    // replace zero and NaN with 1.
    for (int row = 0; row < mdata.RowCount; row++) {
        if (detectabilityNormFactor[row] == 0 || double.IsNaN(detectabilityNormFactor[row])) {
            detectabilityNormFactor[row] = 1;
        }
    }

    // detect the organism
    Organism organism = DetectOrganism(proteinIds);
    // c value: the amount of DNA per haploid genome, see: http://en.wikipedia.org/wiki/C-value
    double cValue = organism.genomeSize * basePairWeight / avogadro;

    // find the histones and write a categorical column indicating them
    int[] histoneRows = FindHistones(proteinIds, organism);
    string[][] histoneCol = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        histoneCol[row] = ArrayUtils.Contains(histoneRows, row) ? new[] { "+" } : new string[0];
    }
    mdata.AddCategoryColumn("Histones", "", histoneCol);

    // initialize the variables for the annotation rows
    string[] sampleNameRow = new string[mdata.ColumnCount];
    string[] inputNameRow = new string[mdata.ColumnCount];
    double[] totalProteinRow = new double[mdata.ColumnCount];
    double[] totalMoleculesRow = new double[mdata.ColumnCount];
    string[][] organismRow = new string[mdata.ColumnCount][];
    // Default every entry to "N/A" (not null, which may cause errors when the annotations are
    // written in the end).
    for (int i = 0; i < organismRow.Length; i++) {
        organismRow[i] = new[] { "N/A" };
    }
    double[] histoneMassRow = new double[mdata.ColumnCount];
    double[] ploidyRow = new double[mdata.ColumnCount];
    double[] cellVolumeRow = new double[mdata.ColumnCount];

    // Calculate the normalization factor (intensity -> copies) for each column, either from the
    // total protein amount or from the histone signal.
    int scalingMode = param.GetParamWithSubParams<int>("Scaling mode").Value;
    double[] normalizationFactors = new double[columns.Count];
    for (int col = 0; col < columns.Count; col++) {
        double[] column = columns[col];
        double factor;
        switch (scalingMode) {
            case 0: // total protein amount
                double mwWeightedNormalizedSummedIntensities = 0;
                for (int row = 0; row < mdata.RowCount; row++) {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                        mwWeightedNormalizedSummedIntensities += column[row] / detectabilityNormFactor[row] * mw[row];
                    }
                }
                factor =
                    param.GetParamWithSubParams<int>("Scaling mode").GetSubParameters().GetParam<double>(
                        "Protein amount per cell [pg]").Value * 1e-12 * avogadro / mwWeightedNormalizedSummedIntensities;
                break;
            case 1: // histone mode
                double mwWeightedNormalizedSummedHistoneIntensities = 0;
                foreach (int row in histoneRows) {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                        mwWeightedNormalizedSummedHistoneIntensities += column[row] / detectabilityNormFactor[row] * mw[row];
                    }
                }
                double ploidy =
                    param.GetParamWithSubParams<int>("Scaling mode").GetSubParameters().GetParam<double>("Ploidy").Value;
                factor = cValue * ploidy * avogadro / mwWeightedNormalizedSummedHistoneIntensities;
                break;
            default:
                factor = 1;
                break;
        }
        normalizationFactors[col] = factor;
    }

    // Averaging mode 1: same factor for all columns.
    if (averagingMode == 1) {
        double factor = ArrayUtils.Mean(normalizationFactors);
        for (int i = 0; i < normalizationFactors.Length; i++) {
            normalizationFactors[i] = factor;
        }
    }

    // Averaging mode 2: same factor within each group of the selected category row.
    if (averagingMode == 2) {
        int groupingInd =
            param.GetParamWithSubParams<int>("Averaging mode").GetSubParameters().GetParam<int>("Grouping").Value;
        if (groupingInd == -1) {
            processInfo.ErrString = "No grouping selected.";
            return;
        }
        string[][] groupNames = mdata.GetCategoryRowAt(groupingInd);
        string[] uniqueGroupNames = Unique(groupNames);
        int[] grouping = new int[columns.Count];
        for (int i = 0; i < columns.Count; i++) {
            // Default: the column keeps its own factor. Ids at or above uniqueGroupNames.Length
            // can never coincide with a real group index.
            grouping[i] = uniqueGroupNames.Length + i;
            int sourceCol = intensityCols[i];
            if (sourceCol >= mdata.ColumnCount) {
                // Numeric annotation columns cannot be grouped
                continue;
            }
            // The category row is indexed by main column, not by selection position.
            string[] entry = groupNames[sourceCol];
            if (entry == null || entry.Length == 0) {
                continue;
            }
            if (ArrayUtils.Contains(uniqueGroupNames, entry[0])) {
                grouping[i] = ArrayUtils.IndexOf(uniqueGroupNames, entry[0]);
            }
        }
        Dictionary<int, List<double>> factors = new Dictionary<int, List<double>>();
        for (int i = 0; i < columns.Count; i++) {
            List<double> groupFactors;
            if (!factors.TryGetValue(grouping[i], out groupFactors)) {
                groupFactors = new List<double>();
                factors.Add(grouping[i], groupFactors);
            }
            groupFactors.Add(normalizationFactors[i]);
        }
        double[] averagedNormalizationFactors = new double[columns.Count];
        for (int i = 0; i < columns.Count; i++) {
            averagedNormalizationFactors[i] = ArrayUtils.Mean(factors[grouping[i]]);
        }
        normalizationFactors = averagedNormalizationFactors;
    }

    // Loop over all selected columns and calculate copy numbers.
    double proteinConcentration = param.GetParam<double>("Total cellular protein concentration [g/l]").Value;
    for (int col = 0; col < columns.Count; col++) {
        string sampleName = sampleNames[col];
        double[] column = columns[col];
        double factor = normalizationFactors[col];
        double[] copyNumbers = new double[mdata.RowCount];
        double[] concentrations = new double[mdata.RowCount];
        double[] massFraction = new double[mdata.RowCount];
        double[] moleFraction = new double[mdata.RowCount];
        double totalProtein = 0; // picograms
        double histoneMass = 0; // picograms
        double totalMolecules = 0;
        for (int row = 0; row < mdata.RowCount; row++) {
            if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                copyNumbers[row] = column[row] / detectabilityNormFactor[row] * factor;
                totalMolecules += copyNumbers[row];
                totalProtein += copyNumbers[row] * mw[row] * 1e12 / avogadro; // picograms
                if (ArrayUtils.Contains(histoneRows, row)) {
                    histoneMass += copyNumbers[row] * mw[row] * 1e12 / avogadro; // picograms
                }
            }
        }
        double totalVolume = totalProtein / proteinConcentration * 1000; // femtoliters
        for (int row = 0; row < mdata.RowCount; row++) {
            if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row])) {
                concentrations[row] = copyNumbers[row] / (totalVolume * 1e-15) / avogadro * 1e9; // nanomolar
                massFraction[row] = copyNumbers[row] * mw[row] * 1e12 / avogadro / totalProtein * 1e6; // ppm
                moleFraction[row] = copyNumbers[row] / totalMolecules * 1e6; // ppm
            }
        }

        string suffix = sampleName == "" ? "" : " " + sampleName;
        if (ArrayUtils.Contains(outputColumns, 0)) {
            mdata.AddNumericColumn("Copy number" + suffix, "", copyNumbers);
        }
        if (ArrayUtils.Contains(outputColumns, 1)) {
            mdata.AddNumericColumn("Concentration [nM]" + suffix, "", concentrations);
        }
        if (ArrayUtils.Contains(outputColumns, 2)) {
            mdata.AddNumericColumn("Abundance (mass/total mass) [*10^-6]" + suffix, "", massFraction);
        }
        if (ArrayUtils.Contains(outputColumns, 3)) {
            mdata.AddNumericColumn("Abundance (molecules/total molecules) [*10^-6]" + suffix, "", moleFraction);
        }

        double[] rank = ArrayUtils.Rank(copyNumbers);
        double[] relativeRank = new double[mdata.RowCount];
        double validRanks = mdata.RowCount;
        for (int row = 0; row < mdata.RowCount; row++) {
            // remove rank for protein with no copy number information
            if (double.IsNaN(copyNumbers[row]) || copyNumbers[row] == 0) {
                rank[row] = double.NaN;
                validRanks--; // do not consider as valid
            }
            // invert ranking, so that rank 0 is the most abundant protein (NaN stays NaN)
            rank[row] = mdata.RowCount - rank[row];
        }
        for (int row = 0; row < mdata.RowCount; row++) {
            relativeRank[row] = rank[row] / validRanks;
        }
        if (ArrayUtils.Contains(outputColumns, 4)) {
            mdata.AddNumericColumn("Copy number rank" + suffix, "", rank);
        }
        if (ArrayUtils.Contains(outputColumns, 5)) {
            mdata.AddNumericColumn("Relative copy number rank" + suffix, "", relativeRank);
        }

        // Sample totals are only recorded for main columns and not in median mode.
        if (intensityCols[col] < mdata.ColumnCount && averagingMode != 3) {
            int target = intensityCols[col];
            inputNameRow[target] = inputNames[col];
            sampleNameRow[target] = sampleNames[col];
            totalProteinRow[target] = Math.Round(totalProtein, 2);
            totalMoleculesRow[target] = Math.Round(totalMolecules, 0);
            organismRow[target] = new[] { organism.name };
            histoneMassRow[target] = Math.Round(histoneMass, 4);
            ploidyRow[target] = Math.Round(histoneMass * 1e-12 / cValue, 2);
            cellVolumeRow[target] = Math.Round(totalVolume, 2); // femtoliters
        }
    }

    // Summary annotation rows
    if (averagingMode != 3 && ArrayUtils.Contains(outputColumns, 6)) {
        mdata.AddNumericRow("Total protein [pg/cell]", "", totalProteinRow);
        mdata.AddNumericRow("Total molecules per cell", "", totalMoleculesRow);
        mdata.AddCategoryRow("Organism", "", organismRow);
        mdata.AddNumericRow("Histone mass [pg/cell]", "", histoneMassRow);
        mdata.AddNumericRow("Ploidy", "", ploidyRow);
        mdata.AddNumericRow("Cell volume [fl]", "", cellVolumeRow);
    }

    // Summary matrix: one row per main column, carrying the same totals as annotation columns.
    if (averagingMode != 3 && ArrayUtils.Contains(outputColumns, 7)) {
        supplTables = new IMatrixData[1];
        IMatrixData supplTab = PerseusFactory.CreateMatrixData();
        supplTab.ColumnNames = new List<string>();
        supplTab.Values.Init(totalProteinRow.Length, 0);
        supplTab.SetAnnotationColumns(
            new List<string> { "Sample", "Input Column" },
            new List<string[]>() { sampleNameRow, inputNameRow },
            new List<string>() { "Organism" },
            new List<string[][]>() { organismRow },
            new List<string>() {
                "Total protein [pg/cell]",
                "Total molecules per cell",
                "Histone mass [pg/cell]",
                "Ploidy",
                "Cell volume [fl]"
            },
            new List<double[]>() { totalProteinRow, totalMoleculesRow, histoneMassRow, ploidyRow, cellVolumeRow },
            new List<string>(),
            new List<double[][]>());
        supplTables[0] = supplTab;
    }
}
/// <summary>
/// Annotates every row of <paramref name="mdata"/> with the known PhosphoSitePlus sites of the selected
/// modification whose sequence window matches one of the row's own sequence windows.
/// </summary>
/// <remarks>
/// Three columns are appended:
/// the string column "PhosphoSitePlus window" (matching reference windows joined by ';'),
/// the category column "Known site" ("+" when at least one reference window matched), and
/// the category column "Origin" (sorted subset of "LTP", "HTP", "CST", one per non-empty evidence field
/// among the matching reference entries).
/// When the parser hands back no sequence windows, <c>processInfo.ErrString</c> is set and the matrix is
/// left untouched.
/// </remarks>
/// <param name="mdata">Matrix that is read (string columns) and extended with the new columns.</param>
/// <param name="param">Supplies "Modification", "Uniprot column" and "Sequence window".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives the error string when the reference data is unavailable.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string mod = param.GetParam<int>("Modification").StringValue;
    PhosphoSitePlusParser.ParseKnownMod(mod, out string[] seqWins, out string[] accs, out string[] pubmedLtp,
        out string[] pubmedMs2, out string[] cstMs2, out string[] species);
    if (seqWins == null) {
        processInfo.ErrString = "File does not exist.";
        return;
    }

    // Split the ';'-separated accession list of every row; null or empty cells yield no accessions.
    string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    string[][] uprot = new string[up.Length][];
    for (int i = 0; i < up.Length; i++) {
        uprot[i] = string.IsNullOrEmpty(up[i]) ? new string[0] : up[i].Split(';');
    }
    string[] win = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];

    // Accession -> indices of all reference entries that carry this accession.
    Dictionary<string, List<int>> map = new Dictionary<string, List<int>>();
    for (int i = 0; i < seqWins.Length; i++) {
        if (!map.TryGetValue(accs[i], out List<int> indices)) {
            indices = new List<int>();
            map.Add(accs[i], indices);
        }
        indices.Add(i);
    }

    string[] newCol = new string[uprot.Length];
    string[][] newCatCol = new string[uprot.Length][];
    string[][] originCol = new string[uprot.Length][];
    for (int i = 0; i < newCol.Length; i++) {
        string[] win1 = TransformIl(win[i]).Split(';');
        HashSet<string> wins = new HashSet<string>();
        HashSet<string> origins = new HashSet<string>();
        foreach (string ux in uprot[i]) {
            if (!map.TryGetValue(ux, out List<int> hits)) {
                continue;
            }
            foreach (int ind in hits) {
                string s = seqWins[ind];
                // The reference window is compared upper-cased and without its first and last character.
                // Invariant casing keeps the comparison independent of the machine's locale.
                string trimmed = TransformIl(s.ToUpperInvariant().Substring(1, s.Length - 2));
                if (!Contains(win1, trimmed)) {
                    continue;
                }
                wins.Add(s);
                if (pubmedLtp[ind].Length > 0) {
                    origins.Add("LTP");
                }
                if (pubmedMs2[ind].Length > 0) {
                    origins.Add("HTP");
                }
                if (cstMs2[ind].Length > 0) {
                    origins.Add("CST");
                }
            }
        }
        if (wins.Count > 0) {
            newCol[i] = StringUtils.Concat(";", ArrayUtils.ToArray(wins));
            newCatCol[i] = new[] { "+" };
            string[] x = ArrayUtils.ToArray(origins);
            Array.Sort(x);
            originCol[i] = x;
        } else {
            newCol[i] = "";
            newCatCol[i] = new string[0];
            originCol[i] = new string[0];
        }
    }
    mdata.AddStringColumn("PhosphoSitePlus window", "", newCol);
    mdata.AddCategoryColumn("Known site", "", newCatCol);
    mdata.AddCategoryColumn("Origin", "", originCol);
}
/// <summary>
/// Annotates every row of <paramref name="mdata"/> with PhosphoSitePlus regulatory-site information for
/// the reference entries whose sequence window matches one of the row's own sequence windows.
/// </summary>
/// <remarks>
/// Six category columns are appended: "Regulatory site" ("+" when at least one reference window matched)
/// and, collected from the matching reference entries with empty values skipped,
/// "Regulatory site function", "Regulatory site process", "Regulatory site protInteract",
/// "Regulatory site otherInteract" and "Regulatory site notes".
/// When the parser hands back no sequence windows, <c>processInfo.ErrString</c> is set and the matrix is
/// left untouched.
/// </remarks>
/// <param name="mdata">Matrix that is read (string columns) and extended with the new columns.</param>
/// <param name="param">Supplies "Uniprot column" and "Sequence window".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives the error string when the reference data is unavailable.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    PhosphoSitePlusParser.ParseRegulatorySites(out string[] seqWins, out string[] accs, out string[] function,
        out string[] process, out string[] protInteract, out string[] otherInteract, out string[] notes,
        out string[] species);
    if (seqWins == null) {
        processInfo.ErrString = "File does not exist.";
        return;
    }

    // Split the ';'-separated accession list of every row; null or empty cells yield no accessions.
    string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    string[][] uprot = new string[up.Length][];
    for (int i = 0; i < up.Length; i++) {
        uprot[i] = string.IsNullOrEmpty(up[i]) ? new string[0] : up[i].Split(';');
    }
    string[] win = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];

    // Accession -> indices of all reference entries that carry this accession.
    Dictionary<string, List<int>> map = new Dictionary<string, List<int>>();
    for (int i = 0; i < seqWins.Length; i++) {
        if (!map.TryGetValue(accs[i], out List<int> indices)) {
            indices = new List<int>();
            map.Add(accs[i], indices);
        }
        indices.Add(i);
    }

    int rowCount = uprot.Length;
    string[][] newCatCol = new string[rowCount][];
    string[][] function2 = new string[rowCount][];
    string[][] process2 = new string[rowCount][];
    string[][] protInteract2 = new string[rowCount][];
    string[][] otherInteract2 = new string[rowCount][];
    string[][] notes2 = new string[rowCount][];
    for (int i = 0; i < rowCount; i++) {
        string[] win1 = TransformIl(win[i]).Split(';');
        bool matched = false;
        HashSet<string> function1 = new HashSet<string>();
        HashSet<string> process1 = new HashSet<string>();
        HashSet<string> protInteract1 = new HashSet<string>();
        HashSet<string> otherInteract1 = new HashSet<string>();
        HashSet<string> notes1 = new HashSet<string>();
        foreach (string ux in uprot[i]) {
            if (!map.TryGetValue(ux, out List<int> hits)) {
                continue;
            }
            foreach (int ind in hits) {
                string s = seqWins[ind];
                // The reference window is compared upper-cased and without its first and last character.
                // Invariant casing keeps the comparison independent of the machine's locale.
                string trimmed = TransformIl(s.ToUpperInvariant().Substring(1, s.Length - 2));
                if (!Contains(win1, trimmed)) {
                    continue;
                }
                matched = true;
                if (function[ind].Length > 0) {
                    function1.Add(function[ind]);
                }
                if (process[ind].Length > 0) {
                    process1.Add(process[ind]);
                }
                if (protInteract[ind].Length > 0) {
                    protInteract1.Add(protInteract[ind]);
                }
                if (otherInteract[ind].Length > 0) {
                    otherInteract1.Add(otherInteract[ind]);
                }
                if (notes[ind].Length > 0) {
                    notes1.Add(notes[ind]);
                }
            }
        }
        if (matched) {
            newCatCol[i] = new[] { "+" };
            function2[i] = ArrayUtils.ToArray(function1);
            process2[i] = ArrayUtils.ToArray(process1);
            protInteract2[i] = ArrayUtils.ToArray(protInteract1);
            otherInteract2[i] = ArrayUtils.ToArray(otherInteract1);
            notes2[i] = ArrayUtils.ToArray(notes1);
        } else {
            newCatCol[i] = new string[0];
            function2[i] = new string[0];
            process2[i] = new string[0];
            protInteract2[i] = new string[0];
            otherInteract2[i] = new string[0];
            notes2[i] = new string[0];
        }
    }
    mdata.AddCategoryColumn("Regulatory site", "", newCatCol);
    mdata.AddCategoryColumn("Regulatory site function", "", function2);
    mdata.AddCategoryColumn("Regulatory site process", "", process2);
    mdata.AddCategoryColumn("Regulatory site protInteract", "", protInteract2);
    mdata.AddCategoryColumn("Regulatory site otherInteract", "", otherInteract2);
    mdata.AddCategoryColumn("Regulatory site notes", "", notes2);
}
/// <summary>
/// Reads the DESeq2 result table from "results.csv" (resolved against the current working directory) and
/// appends its statistics to <paramref name="mdata"/>.
/// </summary>
/// <remarks>
/// The first non-blank line is treated as the header and skipped; data line <c>k</c> fills matrix row
/// <c>k - 1</c>. CSV fields 1..6 are stored as the numeric columns baseMean, log2FoldChange, lfcSE, stat,
/// p-value and padj, each named <c>pair1_vs_pair2_&lt;statistic&gt;</c>. For p-value and padj an additional
/// <c>-log10</c> column is written. Two category columns are added: "Valid" ("+" unless any field of the
/// line was "NA") and "Significant" (starts as "Not Valid"; for valid lines it is filled in by
/// <c>CheckSignificant</c>).
/// "NA" in fields 3, 5, 6 is replaced by "1" and in fields 2, 4 by "0" before parsing.
/// Numbers are parsed with the invariant culture, so the '.' decimal separator of the CSV is read
/// correctly regardless of the machine's locale.
/// NOTE(review): rows of the matrix that have no corresponding data line keep null entries, and parsing
/// a null entry throws — confirm that the file always has one data line per matrix row.
/// </remarks>
/// <param name="mdata">Matrix whose row count sizes the new columns and which receives them.</param>
/// <param name="pair1">First group name, used in the column names.</param>
/// <param name="pair2">Second group name, used in the column names.</param>
/// <param name="fdrValid">Forwarded unchanged to <c>CheckSignificant</c>.</param>
/// <param name="pValid">Forwarded unchanged to <c>CheckSignificant</c>.</param>
/// <param name="lfcValid">Forwarded unchanged to <c>CheckSignificant</c>.</param>
public void ExtractDESeq2Results(IMatrixData mdata, string pair1, string pair2,
    ParameterWithSubParams<bool> fdrValid, ParameterWithSubParams<bool> pValid,
    ParameterWithSubParams<bool> lfcValid) {
    int rowCount = mdata.Values.RowCount;
    string[][] validCol = new string[rowCount][];
    string[][] sigCol = new string[rowCount][];

    // Statistics in output order; statNames[k] is taken from CSV field k + 1 (field 0 is not stored).
    string[] statNames = { "baseMean", "log2FoldChange", "lfcSE", "stat", "p-value", "padj" };
    string[][] statValues = new string[statNames.Length][];
    for (int k = 0; k < statNames.Length; k++) {
        statValues[k] = new string[rowCount];
    }

    // Counts non-blank lines only, so stray blank lines cannot shift the row assignment.
    int lineNum = 0;
    using (StreamReader reader = new StreamReader(File.OpenRead("results.csv"))) {
        string line;
        while ((line = reader.ReadLine()) != null) {
            if (String.IsNullOrWhiteSpace(line)) {
                continue;
            }
            string[] info = line.Replace("\"", "").Split(',');
            if (lineNum != 0) {
                int row = lineNum - 1;
                validCol[row] = new[] { "+" };
                sigCol[row] = new[] { "Not Valid" };
                for (int v = 0; v < info.Length; v++) {
                    if (info[v] != "NA") {
                        continue;
                    }
                    // Substitute a parseable placeholder for the statistics and flag the row as invalid.
                    if (v == 3 || v == 5 || v == 6) {
                        info[v] = "1";
                    } else if (v == 2 || v == 4) {
                        info[v] = "0";
                    }
                    validCol[row][0] = "-";
                }
                if (validCol[row][0] == "+") {
                    CheckSignificant(sigCol, info, fdrValid, pValid, lfcValid, lineNum);
                }
                for (int k = 0; k < statNames.Length; k++) {
                    statValues[k][row] = info[k + 1];
                }
            }
            lineNum++;
        }
    }

    string prefix = pair1 + "_vs_" + pair2 + "_";
    for (int k = 0; k < statNames.Length; k++) {
        string key = statNames[k];
        double[] values = Array.ConvertAll(statValues[k],
            s => double.Parse(s, System.Globalization.CultureInfo.InvariantCulture));
        mdata.AddNumericColumn(prefix + key, prefix + key, values);
        if (key == "p-value" || key == "padj") {
            double[] negLog = new double[values.Length];
            for (int i = 0; i < values.Length; i++) {
                double p = values[i];
                // A p-value of exactly 0 would give +infinity; use the log of 1 / Double.MaxValue instead.
                negLog[i] = p == 0 ? Math.Log10(1 / Double.MaxValue) * -1 : Math.Log10(p) * -1;
            }
            mdata.AddNumericColumn(prefix + "-log10" + key, prefix + "-log10" + key, negLog);
        }
    }
    mdata.AddCategoryColumn(prefix + "Valid", prefix + "Valid", validCol);
    mdata.AddCategoryColumn(prefix + "Significant", prefix + "Significant", sigCol);
}
/// <summary>
/// Adds PhosphoSitePlus regulatory-site annotation to <paramref name="mdata"/>: a row is annotated with
/// the reference entries that share one of its accessions and whose sequence window matches one of the
/// row's sequence windows.
/// </summary>
/// <remarks>
/// Appends the category columns "Regulatory site" ("+" for rows with at least one match),
/// "Regulatory site function", "Regulatory site process", "Regulatory site protInteract",
/// "Regulatory site otherInteract" and "Regulatory site notes"; the latter five hold the distinct
/// non-empty values of the matching reference entries.
/// If the parser returns no sequence windows, <c>processInfo.ErrString</c> is set and nothing is added.
/// </remarks>
/// <param name="mdata">Matrix that is read (string columns) and extended with the new columns.</param>
/// <param name="param">Supplies "Uniprot column" and "Sequence window".</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">Receives the error string when the reference data is unavailable.</param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    string[] seqWins;
    string[] accs;
    string[] function;
    string[] process;
    string[] protInteract;
    string[] otherInteract;
    string[] notes;
    string[] species;
    PhosphoSitePlusParser.ParseRegulatorySites(out seqWins, out accs, out function, out process,
        out protInteract, out otherInteract, out notes, out species);
    if (seqWins == null) {
        processInfo.ErrString = "File does not exist.";
        return;
    }

    // Per-row accession lists; a null or empty cell contributes no accessions.
    string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
    string[][] uprot = new string[up.Length][];
    for (int i = 0; i < up.Length; i++) {
        uprot[i] = string.IsNullOrEmpty(up[i]) ? new string[0] : up[i].Split(';');
    }
    string[] win = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];

    // Group the reference entry indices by accession for a single lookup per row accession.
    Dictionary<string, List<int>> map = new Dictionary<string, List<int>>();
    for (int i = 0; i < seqWins.Length; i++) {
        List<int> indices;
        if (!map.TryGetValue(accs[i], out indices)) {
            indices = new List<int>();
            map.Add(accs[i], indices);
        }
        indices.Add(i);
    }

    string[][] newCatCol = new string[uprot.Length][];
    string[][] function2 = new string[uprot.Length][];
    string[][] process2 = new string[uprot.Length][];
    string[][] protInteract2 = new string[uprot.Length][];
    string[][] otherInteract2 = new string[uprot.Length][];
    string[][] notes2 = new string[uprot.Length][];
    for (int i = 0; i < uprot.Length; i++) {
        string[] win1 = TransformIl(win[i]).Split(';');
        bool anyMatch = false;
        HashSet<string> function1 = new HashSet<string>();
        HashSet<string> process1 = new HashSet<string>();
        HashSet<string> protInteract1 = new HashSet<string>();
        HashSet<string> otherInteract1 = new HashSet<string>();
        HashSet<string> notes1 = new HashSet<string>();
        foreach (string ux in uprot[i]) {
            List<int> hits;
            if (!map.TryGetValue(ux, out hits)) {
                continue;
            }
            foreach (int ind in hits) {
                string s = seqWins[ind];
                // Compare the upper-cased reference window minus its first and last character.
                // ToUpperInvariant avoids locale-specific casing rules changing the match result.
                string candidate = TransformIl(s.ToUpperInvariant().Substring(1, s.Length - 2));
                if (!Contains(win1, candidate)) {
                    continue;
                }
                anyMatch = true;
                if (function[ind].Length > 0) {
                    function1.Add(function[ind]);
                }
                if (process[ind].Length > 0) {
                    process1.Add(process[ind]);
                }
                if (protInteract[ind].Length > 0) {
                    protInteract1.Add(protInteract[ind]);
                }
                if (otherInteract[ind].Length > 0) {
                    otherInteract1.Add(otherInteract[ind]);
                }
                if (notes[ind].Length > 0) {
                    notes1.Add(notes[ind]);
                }
            }
        }
        if (anyMatch) {
            newCatCol[i] = new[] { "+" };
            function2[i] = ArrayUtils.ToArray(function1);
            process2[i] = ArrayUtils.ToArray(process1);
            protInteract2[i] = ArrayUtils.ToArray(protInteract1);
            otherInteract2[i] = ArrayUtils.ToArray(otherInteract1);
            notes2[i] = ArrayUtils.ToArray(notes1);
        } else {
            newCatCol[i] = new string[0];
            function2[i] = new string[0];
            process2[i] = new string[0];
            protInteract2[i] = new string[0];
            otherInteract2[i] = new string[0];
            notes2[i] = new string[0];
        }
    }
    mdata.AddCategoryColumn("Regulatory site", "", newCatCol);
    mdata.AddCategoryColumn("Regulatory site function", "", function2);
    mdata.AddCategoryColumn("Regulatory site process", "", process2);
    mdata.AddCategoryColumn("Regulatory site protInteract", "", protInteract2);
    mdata.AddCategoryColumn("Regulatory site otherInteract", "", otherInteract2);
    mdata.AddCategoryColumn("Regulatory site notes", "", notes2);
}