// Gene enrichment analysis for both Basic PECA and PECA-N.
// Reads the tab-separated file "Goterms.txt" from workingDir into a fresh matrix
// created from mdata. Every column is first loaded as a string annotation column;
// afterwards all columns whose name contains none of "id", "name" or "members"
// are handed to StringToNumerical.
// option: 0 appends "_Degradation" to the matrix name, 1 appends "_Synthesis",
// any other value leaves the plain name "GSA".
public static IMatrixData GetGOEnr(IMatrixData mdata, string workingDir, int option)
{
    const char separator = '\t';
    string filename = Path.Combine(workingDir, @".\Goterms.txt");

    IMatrixData mNew = (IMatrixData)mdata.CreateNewInstance();

    string name = "GSA";
    switch (option)
    {
        case 0:
            name += "_Degradation";
            break;
        case 1:
            name += "_Synthesis";
            break;
    }

    mNew.Clear();
    mNew.Name = name;
    mNew.AltName = name;

    // Load the header and the column contents of the result file.
    string[] colNames = TabSep.GetColumnNames(filename, 0, PerseusUtils.commentPrefix,
        PerseusUtils.commentPrefixExceptions, null, separator);
    string[][] cols = TabSep.GetColumns(colNames, filename, 0, PerseusUtils.commentPrefix,
        PerseusUtils.commentPrefixExceptions, separator);
    int nrows = TabSep.GetRowCount(filename);

    // The matrix has no main columns; everything goes in as string annotation columns.
    mNew.Values.Init(nrows, 0);
    List<string> stringColumnNames = new List<string>(colNames);
    List<string> stringColumnDescriptions = new List<string>(colNames);
    List<string[]> stringColumns = new List<string[]>(cols);
    mNew.SetAnnotationColumns(
        stringColumnNames, stringColumnDescriptions, stringColumns,
        new List<string>(), new List<string>(), new List<string[][]>(),
        new List<string>(), new List<string>(), new List<double[]>(),
        new List<string>(), new List<string>(), new List<double[][]>());

    // Column names matching this pattern (i.e. not containing id/name/members)
    // are the ones converted to numeric.
    Regex numericReg = new Regex(@"^((?!id|name|members).)*$");
    List<int> numericList = new List<int>();
    for (int colInd = 0; colInd < colNames.Length; colInd++)
    {
        if (!numericReg.IsMatch(colNames[colInd]))
        {
            continue;
        }
        numericList.Add(colInd);
    }
    StringToNumerical(numericList, mNew);

    return mNew;
}
// Combines two matrices row-wise: the rows of inputData[0] come first, followed by
// the rows of inputData[1]. For every column kind (main, numeric, string, category,
// multi-numeric) the column names of both inputs are merged through SpecialSort,
// which also yields, per input, a dictionary used here as "column name -> column
// index in that input". Cells of a column that an input does not provide keep a
// default: NaN for main and numeric columns, "" for string columns and an empty
// array for category and multi-numeric columns.
// param, supplTables, documents and processInfo are not used by this method.
// Returns a new matrix created from inputData[0] holding the combined data.
public IMatrixData ProcessData(IMatrixData[] inputData, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo)
{
    IMatrixData mdata1 = inputData[0];
    IMatrixData mdata2 = inputData[1];
    int nrows1 = mdata1.RowCount;
    int nrows2 = mdata2.RowCount;
    int nrows = nrows1 + nrows2;
    Dictionary<string, int> dic1;
    Dictionary<string, int> dic2;
    int ind;

    // ---- main (expression) columns ----
    string[] expColNames = SpecialSort(mdata1.ColumnNames, mdata2.ColumnNames, out dic1, out dic2);
    float[,] ex = new float[nrows, expColNames.Length];
    for (int i = 0; i < ex.GetLength(0); i++)
    {
        for (int j = 0; j < ex.GetLength(1); j++)
        {
            ex[i, j] = float.NaN;
        }
    }
    for (int i = 0; i < expColNames.Length; i++)
    {
        if (dic1.TryGetValue(expColNames[i], out ind))
        {
            for (int j = 0; j < nrows1; j++)
            {
                ex[j, i] = mdata1.Values[j, ind];
            }
        }
        if (dic2.TryGetValue(expColNames[i], out ind))
        {
            for (int j = 0; j < nrows2; j++)
            {
                ex[nrows1 + j, i] = mdata2.Values[j, ind];
            }
        }
    }

    // ---- numeric columns ----
    string[] numColNames = SpecialSort(mdata1.NumericColumnNames, mdata2.NumericColumnNames, out dic1, out dic2);
    List<double[]> numCols = new List<double[]>();
    for (int i = 0; i < numColNames.Length; i++)
    {
        double[] numCol = new double[nrows];
        for (int j = 0; j < nrows; j++)
        {
            numCol[j] = double.NaN;
        }
        numCols.Add(numCol);
    }
    for (int i = 0; i < numColNames.Length; i++)
    {
        if (dic1.TryGetValue(numColNames[i], out ind))
        {
            for (int j = 0; j < nrows1; j++)
            {
                numCols[i][j] = mdata1.NumericColumns[ind][j];
            }
        }
        if (dic2.TryGetValue(numColNames[i], out ind))
        {
            for (int j = 0; j < nrows2; j++)
            {
                numCols[i][nrows1 + j] = mdata2.NumericColumns[ind][j];
            }
        }
    }

    // ---- string columns ----
    string[] stringColNames = SpecialSort(mdata1.StringColumnNames, mdata2.StringColumnNames, out dic1, out dic2);
    List<string[]> stringCols = new List<string[]>();
    for (int i = 0; i < stringColNames.Length; i++)
    {
        string[] stringCol = new string[nrows];
        for (int j = 0; j < nrows; j++)
        {
            stringCol[j] = "";
        }
        stringCols.Add(stringCol);
    }
    for (int i = 0; i < stringColNames.Length; i++)
    {
        if (dic1.TryGetValue(stringColNames[i], out ind))
        {
            for (int j = 0; j < nrows1; j++)
            {
                stringCols[i][j] = mdata1.StringColumns[ind][j];
            }
        }
        if (dic2.TryGetValue(stringColNames[i], out ind))
        {
            for (int j = 0; j < nrows2; j++)
            {
                stringCols[i][nrows1 + j] = mdata2.StringColumns[ind][j];
            }
        }
    }

    // ---- category columns ----
    string[] catColNames = SpecialSort(mdata1.CategoryColumnNames, mdata2.CategoryColumnNames, out dic1, out dic2);
    List<string[][]> catCols = new List<string[][]>();
    for (int i = 0; i < catColNames.Length; i++)
    {
        string[][] catCol = new string[nrows][];
        for (int j = 0; j < nrows; j++)
        {
            catCol[j] = new string[0];
        }
        catCols.Add(catCol);
    }
    for (int i = 0; i < catColNames.Length; i++)
    {
        // The lookup key must be the category column name: dic1/dic2 were just
        // rebuilt for the category columns, so string column names do not apply here.
        if (dic1.TryGetValue(catColNames[i], out ind))
        {
            for (int j = 0; j < nrows1; j++)
            {
                catCols[i][j] = mdata1.GetCategoryColumnEntryAt(ind, j);
            }
        }
        if (dic2.TryGetValue(catColNames[i], out ind))
        {
            for (int j = 0; j < nrows2; j++)
            {
                catCols[i][nrows1 + j] = mdata2.GetCategoryColumnEntryAt(ind, j);
            }
        }
    }

    // ---- multi-numeric columns ----
    string[] multiNumColNames = SpecialSort(mdata1.MultiNumericColumnNames, mdata2.MultiNumericColumnNames, out dic1, out dic2);
    List<double[][]> multiNumCols = new List<double[][]>();
    for (int i = 0; i < multiNumColNames.Length; i++)
    {
        double[][] multiNumCol = new double[nrows][];
        for (int j = 0; j < nrows; j++)
        {
            multiNumCol[j] = new double[0];
        }
        multiNumCols.Add(multiNumCol);
    }
    for (int i = 0; i < multiNumColNames.Length; i++)
    {
        if (dic1.TryGetValue(multiNumColNames[i], out ind))
        {
            for (int j = 0; j < nrows1; j++)
            {
                multiNumCols[i][j] = mdata1.MultiNumericColumns[ind][j];
            }
        }
        if (dic2.TryGetValue(multiNumColNames[i], out ind))
        {
            for (int j = 0; j < nrows2; j++)
            {
                multiNumCols[i][nrows1 + j] = mdata2.MultiNumericColumns[ind][j];
            }
        }
    }

    // ---- assemble the result; column names double as column descriptions ----
    IMatrixData result = (IMatrixData)mdata1.CreateNewInstance();
    result.ColumnNames = new List<string>(expColNames);
    result.ColumnDescriptions = result.ColumnNames;
    result.Values.Set(ex);
    result.NumericColumnNames = new List<string>(numColNames);
    result.NumericColumnDescriptions = result.NumericColumnNames;
    result.NumericColumns = numCols;
    result.StringColumnNames = new List<string>(stringColNames);
    // Was a self-assignment of StringColumnDescriptions; use the names like every other column kind.
    result.StringColumnDescriptions = result.StringColumnNames;
    result.StringColumns = stringCols;
    result.CategoryColumnNames = new List<string>(catColNames);
    result.CategoryColumnDescriptions = result.CategoryColumnNames;
    result.CategoryColumns = catCols;
    result.MultiNumericColumnNames = new List<string>(multiNumColNames);
    result.MultiNumericColumnDescriptions = result.MultiNumericColumnNames;
    result.MultiNumericColumns = multiNumCols;
    return result;
}
// Estimates protein copy numbers (and derived quantities) per cell from protein
// intensities and appends the results to mdata.
//
// Parameters read from param: "Output" (which outputs to write), "Protein IDs",
// "Intensities", "Averaging mode" (1 = same factor for all columns, 2 = same factor
// per group, 3 = median over all selected columns), "Logarithmized" (+ "log base"),
// "Molecular masses", "Detectability correction" (+ "Correction factor"),
// "Scaling mode" (0 = total protein amount, 1 = histone based, otherwise factor 1)
// and "Total cellular protein concentration [g/l]".
//
// Side effects on mdata: a "Histones" category column is always added; numeric
// columns, numeric/category annotation rows and one supplementary table are added
// depending on "Output". On invalid input processInfo.ErrString is set and the
// method returns early. documents is not used.
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables, ref IDocumentData[] documents, ProcessInfo processInfo)
{
    int[] outputColumns = param.GetParam<int[]>("Output").Value;
    int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
    string[] proteinIds = mdata.StringColumns[proteinIdColumnInd];
    int[] intensityCols = param.GetParam<int[]>("Intensities").Value;
    if (intensityCols.Length == 0)
    {
        processInfo.ErrString = "Please select at least one column containing protein intensities.";
        return;
    }

    int averagingMode = param.GetParamWithSubParams<int>("Averaging mode").Value;

    // Validate the grouping selection up front, before anything is added to mdata.
    int groupingInd = -1;
    if (averagingMode == 2)
    {
        groupingInd = param.GetParamWithSubParams<int>("Averaging mode").GetSubParameters().GetParam<int>("Grouping").Value;
        if (groupingInd == -1)
        {
            processInfo.ErrString = "No grouping selected.";
            return;
        }
    }

    // Collect all intensity values. Selected indices below ColumnCount refer to main
    // columns, indices at or above ColumnCount refer to numeric annotation columns.
    List<double[]> columns = new List<double[]>();
    string[] inputNames = new string[intensityCols.Length];
    string[] sampleNames = new string[intensityCols.Length];
    // Strips an optional "LFQ intensity " / "Intensity " / "intensity " prefix.
    Regex sampleNameRegex = new Regex(@"^(?:(?:LFQ )?[Ii]ntensity )?(.*)$");
    for (int col = 0; col < intensityCols.Length; col++)
    {
        double[] values;
        if (intensityCols[col] < mdata.ColumnCount)
        {
            values = ArrayUtils.ToDoubles(mdata.Values.GetColumn(intensityCols[col]));
            inputNames[col] = mdata.ColumnNames[intensityCols[col]];
        }
        else
        {
            int numericInd = intensityCols[col] - mdata.ColumnCount;
            // Copy: the values may be exponentiated in place below, which must not
            // write back into the matrix's own numeric column.
            values = (double[])mdata.NumericColumns[numericInd].Clone();
            inputNames[col] = mdata.NumericColumnNames[numericInd];
        }
        sampleNames[col] = sampleNameRegex.Match(inputNames[col]).Groups[1].Value;
        columns.Add(values);
    }

    // Average (median of the valid values) over columns if this option is selected.
    if (averagingMode == 3)
    {
        double[] column = new double[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++)
        {
            double[] values = new double[intensityCols.Length];
            for (int col = 0; col < intensityCols.Length; col++)
            {
                values[col] = columns[col][row];
            }
            column[row] = ArrayUtils.Median(ExtractValidValues(values, false));
        }
        // Replace the original list of columns by the single averaged one.
        columns = new List<double[]> { column };
        sampleNames = new[] { "" };
    }

    // Revert the logarithm if necessary.
    if (param.GetParamWithSubParams<bool>("Logarithmized").Value)
    {
        double[] logBases = new[] { 2, Math.E, 10 };
        double logBase = logBases[param.GetParamWithSubParams<bool>("Logarithmized").GetSubParameters().GetParam<int>("log base").Value];
        foreach (double[] t in columns)
        {
            for (int row = 0; row < mdata.RowCount; row++)
            {
                if (t[row] == 0)
                {
                    // Only records the message; processing continues.
                    processInfo.ErrString = "Are the columns really logarithmized?\nThey contain zeroes!";
                }
                t[row] = Math.Pow(logBase, t[row]);
            }
        }
    }

    // Molecular masses. Work on a copy so that the unit conversion and the
    // zero/NaN replacement below cannot alter the column stored in the matrix.
    int mwColumnInd = param.GetParam<int>("Molecular masses").Value;
    double[] mw = (double[])mdata.NumericColumns[mwColumnInd].Clone();
    // Decide whether the molecular masses are given in Da or kDa.
    if (ArrayUtils.Median(mw) < 250) // most likely kDa
    {
        for (int i = 0; i < mw.Length; i++)
        {
            mw[i] *= 1000;
        }
    }

    // Without detectability correction the normalization factor IS the mass array
    // (same object), so the replacement of 0/NaN by 1 below also applies to mw.
    double[] detectabilityNormFactor = mw;
    if (param.GetParamWithSubParams<bool>("Detectability correction").Value)
    {
        int correctionInd = param.GetParamWithSubParams<bool>("Detectability correction").GetSubParameters().GetParam<int>("Correction factor").Value;
        // If the mass column itself is chosen as correction factor, keep sharing the
        // (already converted) mass array; otherwise use a private copy of the column.
        if (correctionInd != mwColumnInd)
        {
            detectabilityNormFactor = (double[])mdata.NumericColumns[correctionInd].Clone();
        }
    }
    // The normalization factor needs to be nonzero for all proteins:
    // replace 0 and NaN with 1.
    for (int row = 0; row < mdata.RowCount; row++)
    {
        if (detectabilityNormFactor[row] == 0 || double.IsNaN(detectabilityNormFactor[row]))
        {
            detectabilityNormFactor[row] = 1;
        }
    }

    // Detect the organism.
    Organism organism = DetectOrganism(proteinIds);
    // C value: the amount of DNA per haploid genome, see http://en.wikipedia.org/wiki/C-value
    double cValue = organism.genomeSize * basePairWeight / avogadro;

    // Find the histones and build a per-row lookup table for them.
    int[] histoneRows = FindHistones(proteinIds, organism);
    bool[] isHistoneRow = new bool[mdata.RowCount];
    foreach (int histoneRow in histoneRows)
    {
        if (histoneRow >= 0 && histoneRow < isHistoneRow.Length)
        {
            isHistoneRow[histoneRow] = true;
        }
    }

    // Write a categorical column indicating the histones.
    string[][] histoneCol = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++)
    {
        histoneCol[row] = isHistoneRow[row] ? new[] { "+" } : new string[0];
    }
    mdata.AddCategoryColumn("Histones", "", histoneCol);

    // Initialize the variables for the annotation rows (one entry per main column).
    string[] sampleNameRow = new string[mdata.ColumnCount];
    string[] inputNameRow = new string[mdata.ColumnCount];
    double[] totalProteinRow = new double[mdata.ColumnCount];
    double[] totalMoleculesRow = new double[mdata.ColumnCount];
    string[][] organismRow = new string[mdata.ColumnCount][];
    // Use a non-null default, since null entries may cause errors when the
    // annotations are written in the end.
    for (int i = 0; i < organismRow.Length; i++)
    {
        organismRow[i] = new[] { "N/A" };
    }
    double[] histoneMassRow = new double[mdata.ColumnCount];
    double[] ploidyRow = new double[mdata.ColumnCount];
    double[] cellVolumeRow = new double[mdata.ColumnCount];

    // Calculate the normalization factor (intensity -> copies) for each column,
    // either via the total protein amount or via the histone scaling approach.
    double[] normalizationFactors = new double[columns.Count];
    int scalingMode = param.GetParamWithSubParams<int>("Scaling mode").Value;
    for (int col = 0; col < columns.Count; col++)
    {
        double[] column = columns[col];
        double factor;
        switch (scalingMode)
        {
            case 0: // total protein amount
                double mwWeightedNormalizedSummedIntensities = 0;
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row]))
                    {
                        mwWeightedNormalizedSummedIntensities += column[row] / detectabilityNormFactor[row] * mw[row];
                    }
                }
                factor = param.GetParamWithSubParams<int>("Scaling mode").GetSubParameters().GetParam<double>("Protein amount per cell [pg]").Value
                         * 1e-12 * avogadro / mwWeightedNormalizedSummedIntensities;
                break;
            case 1: // histone mode
                double mwWeightedNormalizedSummedHistoneIntensities = 0;
                foreach (int row in histoneRows)
                {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row]))
                    {
                        mwWeightedNormalizedSummedHistoneIntensities += column[row] / detectabilityNormFactor[row] * mw[row];
                    }
                }
                double ploidy = param.GetParamWithSubParams<int>("Scaling mode").GetSubParameters().GetParam<double>("Ploidy").Value;
                factor = cValue * ploidy * avogadro / mwWeightedNormalizedSummedHistoneIntensities;
                break;
            default:
                factor = 1;
                break;
        }
        normalizationFactors[col] = factor;
    }

    // Averaging mode 1: same factor for all columns.
    if (averagingMode == 1)
    {
        double factor = ArrayUtils.Mean(normalizationFactors);
        for (int i = 0; i < normalizationFactors.Length; i++)
        {
            normalizationFactors[i] = factor;
        }
    }

    // Averaging mode 2: same factor within each group.
    if (averagingMode == 2)
    {
        // One entry per main column of the matrix.
        string[][] groupNames = mdata.GetCategoryRowAt(groupingInd);
        string[] uniqueGroupNames = Unique(groupNames);
        int[] grouping = new int[columns.Count];
        for (int i = 0; i < columns.Count; i++)
        {
            // Default: the column forms a group of its own. The id is offset past
            // the named-group indices so it can never coincide with one of them.
            grouping[i] = uniqueGroupNames.Length + i;
            int matrixCol = intensityCols[i];
            if (matrixCol >= mdata.ColumnCount)
            {
                // Numeric annotation columns cannot be grouped.
                continue;
            }
            // The category row is indexed by matrix column, not by the position
            // of the column within the selection.
            string[] entry = groupNames[matrixCol];
            if (entry == null || entry.Length == 0)
            {
                continue;
            }
            if (ArrayUtils.Contains(uniqueGroupNames, entry[0]))
            {
                grouping[i] = ArrayUtils.IndexOf(uniqueGroupNames, entry[0]);
            }
        }

        // Collect the factors per group and replace each factor by its group mean.
        Dictionary<int, List<double>> factors = new Dictionary<int, List<double>>();
        for (int i = 0; i < columns.Count; i++)
        {
            List<double> groupFactors;
            if (!factors.TryGetValue(grouping[i], out groupFactors))
            {
                groupFactors = new List<double>();
                factors.Add(grouping[i], groupFactors);
            }
            groupFactors.Add(normalizationFactors[i]);
        }
        double[] averagedNormalizationFactors = new double[columns.Count];
        for (int i = 0; i < columns.Count; i++)
        {
            averagedNormalizationFactors[i] = ArrayUtils.Mean(factors[grouping[i]]);
        }
        normalizationFactors = averagedNormalizationFactors;
    }

    // Loop over all selected columns and calculate copy numbers.
    for (int col = 0; col < columns.Count; col++)
    {
        string sampleName = sampleNames[col];
        double[] column = columns[col];
        double factor = normalizationFactors[col];
        double[] copyNumbers = new double[mdata.RowCount];
        double[] concentrations = new double[mdata.RowCount]; // nanomolar
        double[] massFraction = new double[mdata.RowCount]; // ppm
        double[] moleFraction = new double[mdata.RowCount]; // ppm
        double totalProtein = 0; // picograms
        double histoneMass = 0; // picograms
        double totalMolecules = 0;
        for (int row = 0; row < mdata.RowCount; row++)
        {
            if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row]))
            {
                copyNumbers[row] = column[row] / detectabilityNormFactor[row] * factor;
                totalMolecules += copyNumbers[row];
                double proteinMass = copyNumbers[row] * mw[row] * 1e12 / avogadro; // picograms
                totalProtein += proteinMass;
                if (isHistoneRow[row])
                {
                    histoneMass += proteinMass;
                }
            }
        }
        double totalVolume = totalProtein / param.GetParam<double>("Total cellular protein concentration [g/l]").Value * 1000; // femtoliters
        for (int row = 0; row < mdata.RowCount; row++)
        {
            if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row]))
            {
                concentrations[row] = copyNumbers[row] / (totalVolume * 1e-15) / avogadro * 1e9; // nanomolar
                massFraction[row] = copyNumbers[row] * mw[row] * 1e12 / avogadro / totalProtein * 1e6; // ppm
                moleFraction[row] = copyNumbers[row] / totalMolecules * 1e6; // ppm
            }
        }

        string suffix = sampleName == "" ? "" : " " + sampleName;
        if (ArrayUtils.Contains(outputColumns, 0))
        {
            mdata.AddNumericColumn("Copy number" + suffix, "", copyNumbers);
        }
        if (ArrayUtils.Contains(outputColumns, 1))
        {
            mdata.AddNumericColumn("Concentration [nM]" + suffix, "", concentrations);
        }
        if (ArrayUtils.Contains(outputColumns, 2))
        {
            mdata.AddNumericColumn("Abundance (mass/total mass) [*10^-6]" + suffix, "", massFraction);
        }
        if (ArrayUtils.Contains(outputColumns, 3))
        {
            mdata.AddNumericColumn("Abundance (molecules/total molecules) [*10^-6]" + suffix, "", moleFraction);
        }

        double[] rank = ArrayUtils.Rank(copyNumbers);
        double[] relativeRank = new double[mdata.RowCount];
        double validRanks = mdata.RowCount;
        for (int row = 0; row < mdata.RowCount; row++)
        {
            // Remove the rank of proteins without copy number information.
            if (double.IsNaN(copyNumbers[row]) || copyNumbers[row] == 0)
            {
                rank[row] = double.NaN;
                validRanks--; // do not consider as valid
            }
            // Invert the ranking (RowCount - rank); a NaN rank stays NaN.
            rank[row] = mdata.RowCount - rank[row];
        }
        for (int row = 0; row < mdata.RowCount; row++)
        {
            relativeRank[row] = rank[row] / validRanks;
        }
        if (ArrayUtils.Contains(outputColumns, 4))
        {
            mdata.AddNumericColumn("Copy number rank" + suffix, "", rank);
        }
        if (ArrayUtils.Contains(outputColumns, 5))
        {
            mdata.AddNumericColumn("Relative copy number rank" + suffix, "", relativeRank);
        }

        // Per-sample summary values exist only for main columns and only when the
        // columns were not collapsed into a single median column.
        if (intensityCols[col] < mdata.ColumnCount && averagingMode != 3)
        {
            int target = intensityCols[col];
            inputNameRow[target] = inputNames[col];
            sampleNameRow[target] = sampleNames[col];
            totalProteinRow[target] = Math.Round(totalProtein, 2);
            totalMoleculesRow[target] = Math.Round(totalMolecules, 0);
            organismRow[target] = new[] { organism.name };
            histoneMassRow[target] = Math.Round(histoneMass, 4);
            ploidyRow[target] = Math.Round(histoneMass * 1e-12 / cValue, 2);
            cellVolumeRow[target] = Math.Round(totalVolume, 2); // femtoliters
        }
    }

    // Summary annotation rows.
    if (averagingMode != 3 && ArrayUtils.Contains(outputColumns, 6))
    {
        mdata.AddNumericRow("Total protein [pg/cell]", "", totalProteinRow);
        mdata.AddNumericRow("Total molecules per cell", "", totalMoleculesRow);
        mdata.AddCategoryRow("Organism", "", organismRow);
        mdata.AddNumericRow("Histone mass [pg/cell]", "", histoneMassRow);
        mdata.AddNumericRow("Ploidy", "", ploidyRow);
        mdata.AddNumericRow("Cell volume [fl]", "", cellVolumeRow);
    }

    // Summary matrix: one row per main column of mdata, no main columns of its own.
    if (averagingMode != 3 && ArrayUtils.Contains(outputColumns, 7))
    {
        supplTables = new IMatrixData[1];
        IMatrixData supplTab = (IMatrixData)mdata.CreateNewInstance(DataType.Matrix);
        supplTab.ColumnNames = new List<string>();
        supplTab.Values.Init(totalProteinRow.Length, 0);
        supplTab.SetAnnotationColumns(
            new List<string> { "Sample", "Input Column" },
            new List<string[]>() { sampleNameRow, inputNameRow },
            new List<string>() { "Organism" },
            new List<string[][]>() { organismRow },
            new List<string>() { "Total protein [pg/cell]", "Total molecules per cell", "Histone mass [pg/cell]", "Ploidy", "Cell volume [fl]" },
            new List<double[]>() { totalProteinRow, totalMoleculesRow, histoneMassRow, ploidyRow, cellVolumeRow },
            new List<string>(),
            new List<double[][]>());
        supplTables[0] = supplTab;
    }
}