Ejemplo n.º 1
0
        /// <summary>
        /// Gene enrichment analysis result loader for both Basic PECA and PECA-N: reads the tab-separated
        /// file "Goterms.txt" from <paramref name="workingDir"/> into a new matrix.
        /// </summary>
        /// <param name="mdata">Only used as a factory (<c>CreateNewInstance</c>) for the result matrix.</param>
        /// <param name="workingDir">Directory that contains "Goterms.txt".</param>
        /// <param name="option">
        /// 0 names the result "GSA_Degradation", 1 names it "GSA_Synthesis"; any other value leaves the name "GSA".
        /// </param>
        /// <returns>
        /// A matrix without main columns. Every file column is first loaded as a string column; the columns
        /// whose name contains none of "id", "name" or "members" are then handed to <c>StringToNumerical</c>.
        /// </returns>
        public static IMatrixData GetGOEnr(IMatrixData mdata, string workingDir, int option)
        {
            const char separator = '\t';

            // Plain file name instead of @".\Goterms.txt": the backslash is only a directory separator on
            // Windows, elsewhere it would become part of the file name. On Windows both forms resolve to
            // the same file.
            string filename = Path.Combine(workingDir, "Goterms.txt");

            string name = "GSA";
            if (option == 0)
            {
                name += "_Degradation";
            }
            else if (option == 1)
            {
                name += "_Synthesis";
            }

            IMatrixData mNew = (IMatrixData)mdata.CreateNewInstance();
            mNew.Clear();
            mNew.Name    = name;
            mNew.AltName = name;

            string[] colNames = TabSep.GetColumnNames(filename, 0, PerseusUtils.commentPrefix,
                                                      PerseusUtils.commentPrefixExceptions, null, separator);

            string[][] cols = TabSep.GetColumns(colNames, filename, 0, PerseusUtils.commentPrefix,
                                                PerseusUtils.commentPrefixExceptions, separator);

            int nrows = TabSep.GetRowCount(filename);

            // No main (expression) columns, only annotation columns.
            mNew.Values.Init(nrows, 0);

            // Everything is loaded as string columns first; the column names double as descriptions.
            mNew.SetAnnotationColumns(new List<string>(colNames), new List<string>(colNames), new List<string[]>(cols),
                                      new List<string>(), new List<string>(), new List<string[][]>(),
                                      new List<string>(), new List<string>(), new List<double[]>(),
                                      new List<string>(), new List<string>(), new List<double[][]>());

            // The pattern matches only names that contain none of "id", "name", "members";
            // those columns are converted to numeric.
            Regex     numericReg  = new Regex(@"^((?!id|name|members).)*$");
            List<int> numericList = new List<int>();
            for (int i = 0; i < colNames.Length; i++)
            {
                if (numericReg.IsMatch(colNames[i]))
                {
                    numericList.Add(i);
                }
            }
            StringToNumerical(numericList, mNew);
            return mNew;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Stacks the rows of two matrices: the rows of <c>inputData[0]</c> come first, followed by the rows of
        /// <c>inputData[1]</c>. For each column kind (main, numeric, string, category, multi-numeric) the
        /// column names of both inputs are merged via <c>SpecialSort</c>; cells of a column that one input
        /// does not have stay at the default (NaN, "" or an empty array).
        /// </summary>
        /// <param name="inputData">The two matrices to combine; index 0 and 1 are read.</param>
        /// <param name="param">Not used by this method.</param>
        /// <param name="supplTables">Not modified.</param>
        /// <param name="documents">Not modified.</param>
        /// <param name="processInfo">Not used by this method.</param>
        /// <returns>A new matrix instance created from the first input that holds the combined rows.</returns>
        public IMatrixData ProcessData(IMatrixData[] inputData, Parameters param, ref IMatrixData[] supplTables,
                                       ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            IMatrixData mdata1 = inputData[0];
            IMatrixData mdata2 = inputData[1];
            int         nrows1 = mdata1.RowCount;
            int         nrows2 = mdata2.RowCount;
            int         nrows  = nrows1 + nrows2;
            // Per column kind: merged column name -> column index in the first / second matrix.
            Dictionary<string, int> dic1;
            Dictionary<string, int> dic2;
            int ind;

            // Main (expression) columns.
            string[] expColNames = SpecialSort(mdata1.ColumnNames, mdata2.ColumnNames, out dic1, out dic2);
            float[,] ex = new float[nrows, expColNames.Length];
            for (int i = 0; i < ex.GetLength(0); i++)
            {
                for (int j = 0; j < ex.GetLength(1); j++)
                {
                    ex[i, j] = float.NaN;
                }
            }
            for (int i = 0; i < expColNames.Length; i++)
            {
                if (dic1.TryGetValue(expColNames[i], out ind))
                {
                    for (int j = 0; j < nrows1; j++)
                    {
                        ex[j, i] = mdata1.Values[j, ind];
                    }
                }
                if (dic2.TryGetValue(expColNames[i], out ind))
                {
                    for (int j = 0; j < nrows2; j++)
                    {
                        ex[nrows1 + j, i] = mdata2.Values[j, ind];
                    }
                }
            }

            // Numeric columns.
            string[]       numColNames = SpecialSort(mdata1.NumericColumnNames, mdata2.NumericColumnNames, out dic1, out dic2);
            List<double[]> numCols     = new List<double[]>();
            for (int i = 0; i < numColNames.Length; i++)
            {
                double[] col = new double[nrows];
                for (int j = 0; j < nrows; j++)
                {
                    col[j] = double.NaN;
                }
                numCols.Add(col);
            }
            for (int i = 0; i < numColNames.Length; i++)
            {
                if (dic1.TryGetValue(numColNames[i], out ind))
                {
                    for (int j = 0; j < nrows1; j++)
                    {
                        numCols[i][j] = mdata1.NumericColumns[ind][j];
                    }
                }
                if (dic2.TryGetValue(numColNames[i], out ind))
                {
                    for (int j = 0; j < nrows2; j++)
                    {
                        numCols[i][nrows1 + j] = mdata2.NumericColumns[ind][j];
                    }
                }
            }

            // String columns.
            string[]       stringColNames = SpecialSort(mdata1.StringColumnNames, mdata2.StringColumnNames, out dic1, out dic2);
            List<string[]> stringCols     = new List<string[]>();
            for (int i = 0; i < stringColNames.Length; i++)
            {
                string[] col = new string[nrows];
                for (int j = 0; j < nrows; j++)
                {
                    col[j] = "";
                }
                stringCols.Add(col);
            }
            for (int i = 0; i < stringColNames.Length; i++)
            {
                if (dic1.TryGetValue(stringColNames[i], out ind))
                {
                    for (int j = 0; j < nrows1; j++)
                    {
                        stringCols[i][j] = mdata1.StringColumns[ind][j];
                    }
                }
                if (dic2.TryGetValue(stringColNames[i], out ind))
                {
                    for (int j = 0; j < nrows2; j++)
                    {
                        stringCols[i][nrows1 + j] = mdata2.StringColumns[ind][j];
                    }
                }
            }

            // Category columns.
            string[]         catColNames = SpecialSort(mdata1.CategoryColumnNames, mdata2.CategoryColumnNames, out dic1, out dic2);
            List<string[][]> catCols     = new List<string[][]>();
            for (int i = 0; i < catColNames.Length; i++)
            {
                string[][] col = new string[nrows][];
                for (int j = 0; j < nrows; j++)
                {
                    col[j] = new string[0];
                }
                catCols.Add(col);
            }
            for (int i = 0; i < catColNames.Length; i++)
            {
                // The index must be looked up by the category column name. The previous code used
                // stringColNames[i] here, which addressed the wrong column or threw.
                if (dic1.TryGetValue(catColNames[i], out ind))
                {
                    for (int j = 0; j < nrows1; j++)
                    {
                        catCols[i][j] = mdata1.GetCategoryColumnEntryAt(ind, j);
                    }
                }
                if (dic2.TryGetValue(catColNames[i], out ind))
                {
                    for (int j = 0; j < nrows2; j++)
                    {
                        catCols[i][nrows1 + j] = mdata2.GetCategoryColumnEntryAt(ind, j);
                    }
                }
            }

            // Multi-numeric columns.
            string[] multiNumColNames = SpecialSort(mdata1.MultiNumericColumnNames, mdata2.MultiNumericColumnNames, out dic1,
                                                    out dic2);
            List<double[][]> multiNumCols = new List<double[][]>();
            for (int i = 0; i < multiNumColNames.Length; i++)
            {
                double[][] col = new double[nrows][];
                for (int j = 0; j < nrows; j++)
                {
                    col[j] = new double[0];
                }
                multiNumCols.Add(col);
            }
            for (int i = 0; i < multiNumColNames.Length; i++)
            {
                if (dic1.TryGetValue(multiNumColNames[i], out ind))
                {
                    for (int j = 0; j < nrows1; j++)
                    {
                        multiNumCols[i][j] = mdata1.MultiNumericColumns[ind][j];
                    }
                }
                if (dic2.TryGetValue(multiNumColNames[i], out ind))
                {
                    for (int j = 0; j < nrows2; j++)
                    {
                        multiNumCols[i][nrows1 + j] = mdata2.MultiNumericColumns[ind][j];
                    }
                }
            }

            // Assemble the result. The column names double as descriptions; each property gets its own
            // list so that a later change to the names cannot silently change the descriptions.
            IMatrixData result = (IMatrixData)mdata1.CreateNewInstance();

            result.ColumnNames        = new List<string>(expColNames);
            result.ColumnDescriptions = new List<string>(expColNames);
            result.Values.Set(ex);
            result.NumericColumnNames             = new List<string>(numColNames);
            result.NumericColumnDescriptions      = new List<string>(numColNames);
            result.NumericColumns                 = numCols;
            result.StringColumnNames              = new List<string>(stringColNames);
            // Previously assigned to itself, which left the string column descriptions unset.
            result.StringColumnDescriptions       = new List<string>(stringColNames);
            result.StringColumns                  = stringCols;
            result.CategoryColumnNames            = new List<string>(catColNames);
            result.CategoryColumnDescriptions     = new List<string>(catColNames);
            result.CategoryColumns                = catCols;
            result.MultiNumericColumnNames        = new List<string>(multiNumColNames);
            result.MultiNumericColumnDescriptions = new List<string>(multiNumColNames);
            result.MultiNumericColumns            = multiNumCols;
            return result;
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Estimates protein copy numbers per cell from the selected intensity columns and appends the
        /// requested outputs to <paramref name="mdata"/>: copy number, concentration, mass and mole
        /// fractions, ranks, plus optional per-sample summary rows and a summary table.
        /// </summary>
        /// <param name="mdata">Matrix to annotate. Columns and rows are added; a "Histones" category column is always added.</param>
        /// <param name="param">
        /// Parameters read here: "Output", "Protein IDs", "Intensities", "Averaging mode" (with "Grouping"),
        /// "Logarithmized" (with "log base"), "Molecular masses", "Detectability correction" (with
        /// "Correction factor"), "Scaling mode" (with "Protein amount per cell [pg]" or "Ploidy") and
        /// "Total cellular protein concentration [g/l]".
        /// </param>
        /// <param name="supplTables">Set to a one-element array holding the summary matrix when output 7 is selected and averaging mode is not 3.</param>
        /// <param name="documents">Not modified.</param>
        /// <param name="processInfo">Receives <c>ErrString</c> when no intensity column or no grouping is selected, or when "logarithmized" data contain zeroes.</param>
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
                                ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int[] outputColumns      = param.GetParam<int[]>("Output").Value;
            int   proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;

            string[] proteinIds    = mdata.StringColumns[proteinIdColumnInd];
            int[]    intensityCols = param.GetParam<int[]>("Intensities").Value;
            if (intensityCols.Length == 0)
            {
                processInfo.ErrString = "Please select at least one column containing protein intensities.";
                return;
            }
            // variable to hold all intensity values
            List<double[]> columns = new List<double[]>();

            string[] inputNames  = new string[intensityCols.Length];
            string[] sampleNames = new string[intensityCols.Length];
            // strips an optional leading "LFQ intensity " / "Intensity " from the column name
            Regex sampleNameRegex = new Regex(@"^(?:(?:LFQ )?[Ii]ntensity )?(.*)$");
            for (int col = 0; col < intensityCols.Length; col++)
            {
                int      colInd = intensityCols[col];
                double[] values;
                if (colInd < mdata.ColumnCount)
                {
                    // main column
                    values          = ArrayUtils.ToDoubles(mdata.Values.GetColumn(colInd));
                    inputNames[col] = mdata.ColumnNames[colInd];
                }
                else
                {
                    // numeric annotation column; work on a copy so that reverting the logarithm below
                    // does not overwrite the values stored in the matrix
                    values          = (double[])mdata.NumericColumns[colInd - mdata.ColumnCount].Clone();
                    inputNames[col] = mdata.NumericColumnNames[colInd - mdata.ColumnCount];
                }
                sampleNames[col] = sampleNameRegex.Match(inputNames[col]).Groups[1].Value;
                columns.Add(values);
            }
            int averagingMode = param.GetParamWithSubParams<int>("Averaging mode").Value;
            // average over columns if this option is selected
            if (averagingMode == 3)
            {
                double[] column = new double[mdata.RowCount];
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    double[] values = new double[intensityCols.Length];
                    for (int col = 0; col < intensityCols.Length; col++)
                    {
                        values[col] = columns[col][row];
                    }
                    column[row] = ArrayUtils.Median(ExtractValidValues(values, false));
                }
                // replace the original list of columns by the single median column
                columns = new List<double[]> {
                    column
                };
                sampleNames = new[] { "" };
            }
            // revert logarithm if necessary
            if (param.GetParamWithSubParams<bool>("Logarithmized").Value)
            {
                double[] logBases = new[] { 2, Math.E, 10 };
                double   logBase  =
                    logBases[param.GetParamWithSubParams<bool>("Logarithmized").GetSubParameters().GetParam<int>("log base").Value];
                foreach (double[] t in columns)
                {
                    for (int row = 0; row < mdata.RowCount; row++)
                    {
                        if (t[row] == 0)
                        {
                            // reported, but processing deliberately continues
                            processInfo.ErrString = "Are the columns really logarithmized?\nThey contain zeroes!";
                        }
                        t[row] = Math.Pow(logBase, t[row]);
                    }
                }
            }
            // Work on a copy of the molecular masses: they are rescaled and patched below, and the
            // column stored in the matrix must keep its original values.
            int      mwInd = param.GetParam<int>("Molecular masses").Value;
            double[] mw    = (double[])mdata.NumericColumns[mwInd].Clone();
            // define whether the molecular masses are given in Da or kDa
            if (ArrayUtils.Median(mw) < 250)             // most likely kDa
            {
                for (int i = 0; i < mw.Length; i++)
                {
                    mw[i] *= 1000;
                }
            }
            // Without detectability correction the masses themselves are the normalization factor.
            // NOTE: in that case both names refer to the same working array, so the 0/NaN replacement
            // below also applies to mw (this keeps the numeric results of the previous implementation).
            double[] detectabilityNormFactor = mw;
            if (param.GetParamWithSubParams<bool>("Detectability correction").Value)
            {
                int corrInd =
                    param.GetParamWithSubParams<bool>("Detectability correction").GetSubParameters().GetParam<int>("Correction factor")
                    .Value;
                // same column as the masses -> keep sharing the working copy, otherwise take a private copy
                detectabilityNormFactor = corrInd == mwInd ? mw : (double[])mdata.NumericColumns[corrInd].Clone();
            }
            // the normalization factor needs to be nonzero for all proteins
            // check and replace with 1 for all relevant cases
            for (int row = 0; row < mdata.RowCount; row++)
            {
                if (detectabilityNormFactor[row] == 0 || double.IsNaN(detectabilityNormFactor[row]))
                {
                    detectabilityNormFactor[row] = 1;
                }
            }
            // detect the organism
            Organism organism = DetectOrganism(proteinIds);
            // c value the amount of DNA per haploid genome, see: http://en.wikipedia.org/wiki/C-value
            double cValue = organism.genomeSize * basePairWeight / avogadro;

            // find the histones
            int[] histoneRows = FindHistones(proteinIds, organism);
            // set for constant-time membership tests inside the per-row loops
            HashSet<int> histoneRowSet = new HashSet<int>(histoneRows);
            // write a categorical column indicating the histones
            string[][] histoneCol = new string[mdata.RowCount][];
            for (int row = 0; row < mdata.RowCount; row++)
            {
                histoneCol[row] = histoneRowSet.Contains(row) ? new[] { "+" } : new string[0];
            }
            mdata.AddCategoryColumn("Histones", "", histoneCol);

            // initialize the variables for the annotation rows (one entry per main column)
            string[]   sampleNameRow     = new string[mdata.ColumnCount];
            string[]   inputNameRow      = new string[mdata.ColumnCount];
            double[]   totalProteinRow   = new double[mdata.ColumnCount];
            double[]   totalMoleculesRow = new double[mdata.ColumnCount];
            string[][] organismRow       = new string[mdata.ColumnCount][];
            // populate the organismRow variable with "N/A" as default (not null, which may cause errors when writing the annotations in the end.)
            for (int i = 0; i < organismRow.Length; i++)
            {
                organismRow[i] = new[] { "N/A" };
            }
            double[] histoneMassRow       = new double[mdata.ColumnCount];
            double[] ploidyRow            = new double[mdata.ColumnCount];
            double[] cellVolumeRow        = new double[mdata.ColumnCount];
            double[] normalizationFactors = new double[columns.Count];
            int      scalingMode          = param.GetParamWithSubParams<int>("Scaling mode").Value;
            // calculate normalization factors for each column
            for (int col = 0; col < columns.Count; col++)
            {
                double[] column = columns[col];
                // normalization factor to go from intensities to copies,
                // needs to be determined either using the total protein or the histone scaling approach
                double factor;
                switch (scalingMode)
                {
                case 0:                         // total protein amount
                    double mwWeightedNormalizedSummedIntensities = 0;
                    for (int row = 0; row < mdata.RowCount; row++)
                    {
                        if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row]))
                        {
                            mwWeightedNormalizedSummedIntensities += column[row] / detectabilityNormFactor[row] * mw[row];
                        }
                    }
                    factor =
                        param.GetParamWithSubParams<int>("Scaling mode").GetSubParameters().GetParam<double>(
                            "Protein amount per cell [pg]").Value * 1e-12 * avogadro / mwWeightedNormalizedSummedIntensities;
                    break;

                case 1:                         // histone mode
                    double mwWeightedNormalizedSummedHistoneIntensities = 0;
                    foreach (int row in histoneRows)
                    {
                        if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row]))
                        {
                            mwWeightedNormalizedSummedHistoneIntensities += column[row] / detectabilityNormFactor[row] * mw[row];
                        }
                    }
                    double ploidy =
                        param.GetParamWithSubParams<int>("Scaling mode").GetSubParameters().GetParam<double>("Ploidy").Value;
                    factor = cValue * ploidy * avogadro / mwWeightedNormalizedSummedHistoneIntensities;
                    break;

                default:
                    factor = 1;
                    break;
                }
                normalizationFactors[col] = factor;
            }
            // check averaging mode
            if (averagingMode == 1)            // same factor for all
            {
                double factor = ArrayUtils.Mean(normalizationFactors);
                for (int i = 0; i < normalizationFactors.Length; i++)
                {
                    normalizationFactors[i] = factor;
                }
            }
            if (averagingMode == 2)            // same factor in each group
            {
                int groupingInd =
                    param.GetParamWithSubParams<int>("Averaging mode").GetSubParameters().GetParam<int>("Grouping").Value;
                if (groupingInd == -1)
                {
                    processInfo.ErrString = "No grouping selected.";
                    return;
                }
                // one entry per main column
                string[][] groupNames       = mdata.GetCategoryRowAt(groupingInd);
                string[]   uniqueGroupNames = Unique(groupNames);
                int[]      grouping         = new int[columns.Count];
                for (int i = 0; i < columns.Count; i++)
                {
                    // Default: a private group for this column. The id is offset by the number of real
                    // groups so that it can never coincide with the index of a real group.
                    grouping[i] = uniqueGroupNames.Length + i;
                    int colInd = intensityCols[i];
                    if (colInd >= mdata.ColumnCount)                      // Numeric annotation columns cannot be grouped
                    {
                        continue;
                    }
                    // The category row is indexed by main column, not by position in the selection.
                    string[] names = groupNames[colInd];
                    if (names == null || names.Length == 0)               // column without group entry
                    {
                        continue;
                    }
                    if (ArrayUtils.Contains(uniqueGroupNames, names[0]))
                    {
                        grouping[i] = ArrayUtils.IndexOf(uniqueGroupNames, names[0]);
                    }
                }
                // collect the factors of each group ...
                Dictionary<int, List<double>> factors = new Dictionary<int, List<double>>();
                for (int i = 0; i < columns.Count; i++)
                {
                    List<double> groupFactors;
                    if (!factors.TryGetValue(grouping[i], out groupFactors))
                    {
                        groupFactors = new List<double>();
                        factors.Add(grouping[i], groupFactors);
                    }
                    groupFactors.Add(normalizationFactors[i]);
                }
                // ... and give every column the mean factor of its group
                double[] averagedNormalizationFactors = new double[columns.Count];
                for (int i = 0; i < columns.Count; i++)
                {
                    averagedNormalizationFactors[i] = ArrayUtils.Mean(factors[grouping[i]]);
                }
                normalizationFactors = averagedNormalizationFactors;
            }
            // loop over all selected columns and calculate copy numbers
            for (int col = 0; col < columns.Count; col++)
            {
                string   sampleName     = sampleNames[col];
                double[] column         = columns[col];
                double   factor         = normalizationFactors[col];
                double[] copyNumbers    = new double[mdata.RowCount];
                double[] concentrations = new double[mdata.RowCount];
                double[] massFraction   = new double[mdata.RowCount];
                double[] moleFraction   = new double[mdata.RowCount];
                double   totalProtein   = 0;            // picograms
                double   histoneMass    = 0;            // picograms
                double   totalMolecules = 0;
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row]))
                    {
                        copyNumbers[row] = column[row] / detectabilityNormFactor[row] * factor;
                        totalMolecules  += copyNumbers[row];
                        double mass = copyNumbers[row] * mw[row] * 1e12 / avogadro;                     // picograms
                        totalProtein += mass;
                        if (histoneRowSet.Contains(row))
                        {
                            histoneMass += mass;
                        }
                    }
                }
                // femtoliters
                double totalVolume = totalProtein / param.GetParam<double>("Total cellular protein concentration [g/l]").Value *
                                     1000;
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    if (!double.IsNaN(column[row]) && !double.IsNaN(mw[row]))
                    {
                        concentrations[row] = copyNumbers[row] / (totalVolume * 1e-15) / avogadro * 1e9;         // nanomolar
                        massFraction[row]   = copyNumbers[row] * mw[row] * 1e12 / avogadro / totalProtein * 1e6; // ppm
                        moleFraction[row]   = copyNumbers[row] / totalMolecules * 1e6;                           // ppm
                    }
                }
                string suffix = sampleName == "" ? "" : " " + sampleName;
                if (ArrayUtils.Contains(outputColumns, 0))
                {
                    mdata.AddNumericColumn("Copy number" + suffix, "", copyNumbers);
                }
                if (ArrayUtils.Contains(outputColumns, 1))
                {
                    mdata.AddNumericColumn("Concentration [nM]" + suffix, "", concentrations);
                }
                if (ArrayUtils.Contains(outputColumns, 2))
                {
                    mdata.AddNumericColumn("Abundance (mass/total mass) [*10^-6]" + suffix, "", massFraction);
                }
                if (ArrayUtils.Contains(outputColumns, 3))
                {
                    mdata.AddNumericColumn("Abundance (molecules/total molecules) [*10^-6]" + suffix, "", moleFraction);
                }
                double[] rank         = ArrayUtils.Rank(copyNumbers);
                double[] relativeRank = new double[mdata.RowCount];
                double   validRanks   = mdata.RowCount;
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    // remove rank for protein with no copy number information
                    if (double.IsNaN(copyNumbers[row]) || copyNumbers[row] == 0)
                    {
                        rank[row] = double.NaN;
                        validRanks--;                         // do not consider as valid
                    }
                    // invert ranking, so that rank 0 is the most abundant protein
                    // (a NaN rank stays NaN under the subtraction)
                    rank[row] = mdata.RowCount - rank[row];
                }
                for (int row = 0; row < mdata.RowCount; row++)
                {
                    relativeRank[row] = rank[row] / validRanks;
                }
                if (ArrayUtils.Contains(outputColumns, 4))
                {
                    mdata.AddNumericColumn("Copy number rank" + suffix, "", rank);
                }
                if (ArrayUtils.Contains(outputColumns, 5))
                {
                    mdata.AddNumericColumn("Relative copy number rank" + suffix, "", relativeRank);
                }
                // summary values exist only for main columns and only when the columns were not collapsed into one
                if (intensityCols[col] < mdata.ColumnCount && averagingMode != 3)
                {
                    int colInd = intensityCols[col];
                    inputNameRow[colInd]      = inputNames[col];
                    sampleNameRow[colInd]     = sampleNames[col];
                    totalProteinRow[colInd]   = Math.Round(totalProtein, 2);
                    totalMoleculesRow[colInd] = Math.Round(totalMolecules, 0);
                    organismRow[colInd]       = new[] { organism.name };
                    histoneMassRow[colInd]    = Math.Round(histoneMass, 4);
                    ploidyRow[colInd]         = Math.Round(histoneMass * 1e-12 / cValue, 2);
                    cellVolumeRow[colInd]     = Math.Round(totalVolume, 2);                 // femtoliters
                }
            }

            // Summary annotation row
            if (averagingMode != 3 && ArrayUtils.Contains(outputColumns, 6))
            {
                mdata.AddNumericRow("Total protein [pg/cell]", "", totalProteinRow);
                mdata.AddNumericRow("Total molecules per cell", "", totalMoleculesRow);
                mdata.AddCategoryRow("Organism", "", organismRow);
                mdata.AddNumericRow("Histone mass [pg/cell]", "", histoneMassRow);
                mdata.AddNumericRow("Ploidy", "", ploidyRow);
                mdata.AddNumericRow("Cell volume [fl]", "", cellVolumeRow);
            }

            // Summary matrix: one row per main column, no main columns of its own
            if (averagingMode != 3 && ArrayUtils.Contains(outputColumns, 7))
            {
                supplTables = new IMatrixData[1];
                IMatrixData supplTab = (IMatrixData)mdata.CreateNewInstance(DataType.Matrix);
                supplTab.ColumnNames = new List<string>();
                supplTab.Values.Init(totalProteinRow.Length, 0);
                supplTab.SetAnnotationColumns(
                    new List<string> { "Sample", "Input Column" },
                    new List<string[]> { sampleNameRow, inputNameRow },
                    new List<string> { "Organism" },
                    new List<string[][]> { organismRow },
                    new List<string>
                    {
                        "Total protein [pg/cell]",
                        "Total molecules per cell",
                        "Histone mass [pg/cell]",
                        "Ploidy",
                        "Cell volume [fl]"
                    },
                    new List<double[]> { totalProteinRow, totalMoleculesRow, histoneMassRow, ploidyRow, cellVolumeRow },
                    new List<string>(), new List<double[][]>());
                supplTables[0] = supplTab;
            }
        }