Esempio n. 1
0
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            Random2 rand = new Random2();
            double std = param.GetParam<double>("Standard deviation").Value;
            int[] inds = param.GetParam<int[]>("Columns").Value;
            List<int> mainInds = new List<int>();
            List<int> numInds = new List<int>();
            foreach (int ind in inds){
                if (ind < mdata.ColumnCount){
                    mainInds.Add(ind);
                } else{
                    numInds.Add(ind - mdata.ColumnCount);
                }
            }
            foreach (int j in mainInds){
                for (int i = 0; i < mdata.RowCount; i++){
                    mdata.Values.Set(i, j, mdata.Values.Get(i, j) + (float) rand.NextGaussian(0, std));
                }
            }
            foreach (int j in numInds){
                for (int i = 0; i < mdata.RowCount; i++){
                    mdata.NumericColumns[j][i] += (float) rand.NextGaussian(0, std);
                }
            }
        }
Esempio n. 2
0
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            Parameter<int> access = param.GetParam<int>("Matrix access");
            bool rows = access.Value == 0;
            UnitVectors(rows, mdata);
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            const bool rows = false;
            bool percentage;
            int minValids = PerseusPluginUtils.GetMinValids(param, out percentage);
            ParameterWithSubParams<int> modeParam = param.GetParamWithSubParams<int>("Mode");
            int modeInd = modeParam.Value;
            if (modeInd != 0 && mdata.CategoryRowNames.Count == 0){
                processInfo.ErrString = "No grouping is defined.";
                return;
            }
            if (modeInd != 0){
                processInfo.ErrString = "Group-wise filtering can only be appled to rows.";
                return;
            }
            FilteringMode filterMode;
            double threshold;
            double threshold2;
            PerseusPluginUtils.ReadValuesShouldBeParams(param, out filterMode, out threshold, out threshold2);
            if (modeInd != 0){
                //TODO
            } else{
                PerseusPluginUtils.NonzeroFilter1(rows, minValids, percentage, mdata, param, threshold, threshold2, filterMode);
            }
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            bool rows = param.GetParam<int>("Matrix access").Value == 0;
            double min = param.GetParam<double>("Minimum").Value;
            double max = param.GetParam<double>("Maximum").Value;
            MapToInterval1(rows, mdata, min, max, processInfo.NumThreads);
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string[] mods = param.GetParam<int[]>("Modifications").StringValue.Split(new[]{';'},
                StringSplitOptions.RemoveEmptyEntries);
            string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
            string[][] uprot = new string[up.Length][];
            for (int i = 0; i < up.Length; i++){
                uprot[i] = up[i].Length > 0 ? up[i].Split(';') : new string[0];
            }
            double[][] c = new double[mods.Length][];
            for (int index = 0; index < mods.Length; index++){
                string mod = mods[index];
                string filename = PhosphoSitePlusParser.GetFilenameForMod(mod);
                if (filename == null){
                    processInfo.ErrString = "File does not exist.";
                    return;
                }
                string[] seqWins;
                string[] accs;
                string[] pubmedLtp;
                string[] pubmedMs2;
                string[] cstMs2;
                string[] species;
                PhosphoSitePlusParser.ParseKnownMods(filename, out seqWins, out accs, out pubmedLtp, out pubmedMs2, out cstMs2, out species);
                for (int i = 0; i < seqWins.Length; i++){
                    seqWins[i] = seqWins[i].ToUpper();
                }
                Dictionary<string, HashSet<string>> counts = new Dictionary<string, HashSet<string>>();
                for (int i = 0; i < accs.Length; i++){
                    string acc = accs[i];
                    if (!counts.ContainsKey(acc)){
                        counts.Add(acc, new HashSet<string>());
                    }
                    counts[acc].Add(seqWins[i]);
                }
                c[index] = new double[up.Length];
                for (int i = 0; i < up.Length; i++){
                    c[index][i] = CountSites(uprot[i], counts);
                }
            }
            string[][] catCol = new string[up.Length][];
            for (int i = 0; i < catCol.Length; i++){
                List<string> x = new List<string>();
                for (int j = 0; j < mods.Length; j++){
                    if (c[j][i] > 0){
                        x.Add(mods[j]);
                    }
                }
                x.Sort();
                catCol[i] = x.ToArray();
            }
            mdata.AddCategoryColumn("Known modifications", "Known modifications", catCol);
            for (int i = 0; i < mods.Length; i++){
                mdata.AddNumericColumn(mods[i] + " count", mods[i] + " count", c[i]);
            }
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            var vals = param.GetParam<Tuple<Regex, string>>("Regex").Value;
            var pattern = vals.Item1;
            string replacementStr = vals.Item2;
            for (int i = 0; i < mdata.ColumnCount; i++){
                mdata.ColumnNames[i] = pattern.Replace(mdata.ColumnNames[i], replacementStr);
            }
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string[] seqWins;
            string[] subAccs;
            string[] kinases;
            string[] kinAccs;
            string[] species;
            PhosphoSitePlusParser.ParseKinaseSubstrate(out seqWins, out subAccs, out kinases, out kinAccs, out species);
            if (seqWins == null){
                processInfo.ErrString = "File does not exist.";
                return;
            }
            string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
            string[][] uprot = new string[up.Length][];
            for (int i = 0; i < up.Length; i++){
                uprot[i] = up[i].Length > 0 ? up[i].Split(';') : new string[0];
            }
            string[] win = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];
            Dictionary<string, List<Tuple<string, string, string>>> substrateProperties =
                new Dictionary<string, List<Tuple<string, string, string>>>();
            for (int i = 0; i < seqWins.Length; i++){
                string subAcc = subAccs[i];
                if (!substrateProperties.ContainsKey(subAcc)){
                    substrateProperties.Add(subAcc, new List<Tuple<string, string, string>>());
                }
                substrateProperties[subAcc].Add(new Tuple<string, string, string>(seqWins[i], kinases[i], kinAccs[i]));
            }
            string[] kinaseNameColumn = new string[uprot.Length];
            string[] kinaseUniprotColumn = new string[uprot.Length];
            for (int i = 0; i < kinaseNameColumn.Length; i++){
                string[] win1 = AddKnownSites.TransformIl(win[i]).Split(';');
                HashSet<string> kinaseNamesHits = new HashSet<string>();
                HashSet<string> kinaseUniprotHits = new HashSet<string>();
                foreach (string ux in uprot[i]){
                    if (substrateProperties.ContainsKey(ux)){
                        List<Tuple<string, string, string>> properties = substrateProperties[ux];
                        foreach (Tuple<string, string, string> property in properties){
                            string w = property.Item1;
                            if (AddKnownSites.Contains(win1, AddKnownSites.TransformIl(w.ToUpper().Substring(1, w.Length - 2)))){
                                kinaseNamesHits.Add(property.Item2);
                                kinaseUniprotHits.Add(property.Item3);
                            }
                        }
                    }
                }
                kinaseNameColumn[i] = kinaseNamesHits.Count > 0 ? StringUtils.Concat(";", ArrayUtils.ToArray(kinaseNamesHits)) : "";
                kinaseUniprotColumn[i] = kinaseUniprotHits.Count > 0
                    ? StringUtils.Concat(";", ArrayUtils.ToArray(kinaseUniprotHits))
                    : "";
            }
            mdata.AddStringColumn("PhosphoSitePlus kinase", "", kinaseNameColumn);
            mdata.AddStringColumn("PhosphoSitePlus kinase uniprot", "", kinaseUniprotColumn);
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string regexStr = param.GetParam<string>("Regular expression").Value;
            Regex regex = new Regex(regexStr);
            int[] inds = param.GetParam<int[]>("Columns").Value;
            bool keepColumns = param.GetParam<bool>("Keep original columns").Value;
            bool semicolons = param.GetParam<bool>("Strings separated by semicolons are independent").Value;
            foreach (int col in inds){
                ProcessCol(mdata, regex, col, keepColumns, semicolons);
            }
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string colName = param.GetParam<string>("Name of new column").Value;
            int[] columns = param.GetParam<int[]>("Categories").Value;
            bool inverse = param.GetParam<bool>("Inverse").Value;
            int[] catCols;
            int[] stringCols;
            Split(columns, out catCols, out stringCols, mdata.CategoryColumnCount);
            string[] word1 = param.GetParam<string[]>("Search terms").Value;
            if (word1.Length == 0){
                processInfo.ErrString = "Please specify one or more search terms.";
                return;
            }
            if (string.IsNullOrEmpty(colName)){
                colName = word1[0];
            }
            string[] word = new string[word1.Length];
            for (int i = 0; i < word.Length; i++){
                word[i] = word1[i].ToLower().Trim();
            }
            bool[] indicator = new bool[mdata.RowCount];
            foreach (int col in catCols){
                for (int i = 0; i < mdata.RowCount; i++){
                    foreach (string s in mdata.GetCategoryColumnEntryAt(col, i)){
                        foreach (string s1 in word){
                            if (s.ToLower().Contains(s1)){
                                indicator[i] = true;
                                break;
                            }
                        }
                    }
                }
            }
            foreach (string[] txt in stringCols.Select(col => mdata.StringColumns[col])){
                for (int i = 0; i < txt.Length; i++){
                    string s = txt[i];
                    foreach (string s1 in word){
                        if (s.ToLower().Contains(s1)){
                            indicator[i] = true;
                            break;
                        }
                    }
                }
            }
            string[][] newCol = new string[indicator.Length][];
            for (int i = 0; i < newCol.Length; i++){
                bool yes = inverse ? !indicator[i] : indicator[i];
                newCol[i] = yes ? new[]{"+"} : new string[0];
            }
            mdata.AddCategoryColumn(colName, "", newCol);
        }
Esempio n. 10
0
        public IMatrixData ProcessData(IMatrixData[] inputData, Parameters parameters, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            IMatrixData mdata1 = inputData[0];
            Dictionary<string, string> map = GetMap(inputData[1], parameters);
            IMatrixData result = (IMatrixData) mdata1.Clone();
            int ind = parameters.GetParam<int>("Column in matrix 1 to be edited").Value;
            string[] x = mdata1.StringColumns[ind];
            for (int i = 0; i < x.Length; i++){
                x[i] = Process(x[i], map);
            }
            return result;
        }
Esempio n. 11
0
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int nameCol = param.GetParam<int>("New column names").Value;
            List<string> colNames;
            if (nameCol >= 0){
                HashSet<string> taken = new HashSet<string>();
                colNames = new List<string>();
                foreach (string n in mdata.StringColumns[nameCol]){
                    string n1 = StringUtils.GetNextAvailableName(n, taken);
                    taken.Add(n1);
                    colNames.Add(n1);
                }
            } else{
                colNames = new List<string>();
                for (int i = 0; i < mdata.RowCount; i++){
                    colNames.Add("Column" + (i + 1));
                }
            }
            List<string> rowNames = mdata.ColumnNames;
            mdata.Values = mdata.Values.Transpose();
            if (mdata.IsImputed != null){
                mdata.IsImputed = mdata.IsImputed.Transpose();
            }
            if (mdata.Quality != null){
                mdata.Quality = mdata.Quality.Transpose();
            }
            List<string> stringColumnNames = mdata.StringColumnNames;
            List<string> categoryColumnNames = mdata.CategoryColumnNames;
            List<string> numericColumnNames = mdata.NumericColumnNames;
            List<string> multiNumericColumnNames = mdata.MultiNumericColumnNames;
            List<string> stringColumnDescriptions = mdata.StringColumnDescriptions;
            List<string> categoryColumnDescriptions = mdata.CategoryColumnDescriptions;
            List<string> numericColumnDescriptions = mdata.NumericColumnDescriptions;
            List<string> multiNumericColumnDescriptions = mdata.MultiNumericColumnDescriptions;
            List<string[]> stringColumns = mdata.StringColumns;
            List<string[][]> categoryColumns = GetCategoryColumns(mdata);
            List<double[]> numericColumns = mdata.NumericColumns;
            List<double[][]> multiNumericColumns = mdata.MultiNumericColumns;
            mdata.SetAnnotationColumns(new List<string>(new[]{"Name"}), new List<string>(new[]{"Name"}),
                new List<string[]>(new[]{rowNames.ToArray()}), mdata.CategoryRowNames, mdata.CategoryRowDescriptions,
                GetCategoryRows(mdata), mdata.NumericRowNames, mdata.NumericRowDescriptions, mdata.NumericRows, new List<string>(),
                new List<string>(), new List<double[][]>());
            mdata.ColumnNames = colNames;
            mdata.SetAnnotationRows(stringColumnNames, stringColumnDescriptions, stringColumns, categoryColumnNames,
                categoryColumnDescriptions, categoryColumns, numericColumnNames, numericColumnDescriptions, numericColumns,
                multiNumericColumnNames, multiNumericColumnDescriptions, multiNumericColumns);
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string word = param.GetParam<string>("Find what").Value;
            int colInd = param.GetParam<int>("Look in").Value;
            bool matchCase = param.GetParam<bool>("Match case").Value;
            bool matchWholeWord = param.GetParam<bool>("Match whole word").Value;
            string scolName = mdata.StringColumnNames[colInd];
            string[] scol = mdata.StringColumns[colInd];
            string[][] catCol = new string[mdata.RowCount][];
            for (int i = 0; i < catCol.Length; i++){
                bool found = Find(scol[i], word, matchCase, matchWholeWord);
                catCol[i] = found ? new[]{"+"} : new string[0];
            }
            mdata.AddCategoryColumn("Search: " + scolName, "Search: " + scolName, catCol);
        }
        public void ProcessData(IMatrixData data, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int[] numColInds = param.GetParam<int[]>("Numerical rows").Value;
            int[] multiNumColInds = param.GetParam<int[]>("Multi-numerical rows").Value;
            int[] catColInds = param.GetParam<int[]>("Categorical rows").Value;
            int[] textColInds = param.GetParam<int[]>("Text rows").Value;
            data.NumericRows = ArrayUtils.SubList(data.NumericRows, numColInds);
            data.NumericRowNames = ArrayUtils.SubList(data.NumericRowNames, numColInds);
            data.NumericRowDescriptions = ArrayUtils.SubList(data.NumericRowDescriptions, numColInds);
            data.MultiNumericRows = ArrayUtils.SubList(data.MultiNumericRows, multiNumColInds);
            data.MultiNumericRowNames = ArrayUtils.SubList(data.MultiNumericRowNames, multiNumColInds);
            data.MultiNumericRowDescriptions = ArrayUtils.SubList(data.MultiNumericRowDescriptions, multiNumColInds);
            data.CategoryRows = PerseusPluginUtils.GetCategoryRows(data, catColInds);
            data.CategoryRowNames = ArrayUtils.SubList(data.CategoryRowNames, catColInds);
            data.CategoryRowDescriptions = ArrayUtils.SubList(data.CategoryRowDescriptions, catColInds);
            data.StringRows = ArrayUtils.SubList(data.StringRows, textColInds);
            data.StringRowNames = ArrayUtils.SubList(data.StringRowNames, textColInds);
            data.StringRowDescriptions = ArrayUtils.SubList(data.StringRowDescriptions, textColInds);
        }
Esempio n. 14
0
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int stringColumnIndx = param.GetParam<int>("Sequence window").Value;
            string[] win = mdata.StringColumns[stringColumnIndx];
            int start = param.GetParam<int>("Start").Value - 1;
            int length = param.GetParam<int>("Length").Value;
            if (start < 0){
                processInfo.ErrString = "Start position cannot be smaller than 1.";
                return;
            }
            if (start + length > win[0].Length){
                processInfo.ErrString = "Start + length cannot exceed the total length of the sequence.";
                return;
            }
            string[] shortenedMotifs = new string[win.Length];
            for (int i = 0; i < mdata.RowCount; ++i){
                shortenedMotifs[i] = win[i].Substring(start, length);
            }
            mdata.AddStringColumn("Short sequence window", "", shortenedMotifs);
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            ParameterWithSubParams<int> p = param.GetParamWithSubParams<int>("Column");
            int colInd = p.Value;
            if (colInd < 0){
                processInfo.ErrString = "No categorical columns available.";
                return;
            }
            Parameter<int[]> mcp = p.GetSubParameters().GetParam<int[]>("Values");
            int[] inds = mcp.Value;
            if (inds.Length == 0){
                processInfo.ErrString = "Please select at least one term for filtering.";
                return;
            }
            string[] values = new string[inds.Length];
            string[] v = mdata.GetCategoryColumnValuesAt(colInd);
            for (int i = 0; i < values.Length; i++){
                values[i] = v[inds[i]];
            }
            HashSet<string> value = new HashSet<string>(values);
            bool remove = param.GetParam<int>("Mode").Value == 0;
            List<int> valids = new List<int>();
            for (int i = 0; i < mdata.RowCount; i++){
                bool valid = true;
                foreach (string w in mdata.GetCategoryColumnEntryAt(colInd, i)){
                    if (value.Contains(w)){
                        valid = false;
                        break;
                    }
                }
                if ((valid && remove) || (!valid && !remove)){
                    valids.Add(i);
                }
            }
            PerseusPluginUtils.FilterRows(mdata, param, valids.ToArray());
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string[][] col = mdata.GetCategoryColumnAt(param.GetParam<int>("Indicator column").Value);
            string term = param.GetParam<string>("Value").Value;
            List<int> inds = new List<int>();
            for (int i = 0; i < col.Length; i++){
                if (Contains(col[i], term)){
                    inds.Add(i);
                }
            }
            double[][] profiles = new double[inds.Count][];
            for (int i = 0; i < profiles.Length; i++){
                profiles[i] = ArrayUtils.ToDoubles(mdata.Values.GetRow(inds[i]));
                float mean = (float) ArrayUtils.Mean(profiles[i]);
                for (int j = 0; j < profiles[i].Length; j++){
                    profiles[i][j] -= mean;
                }
            }
            double[] totalProfile = new double[mdata.ColumnCount];
            for (int i = 0; i < totalProfile.Length; i++){
                List<double> vals = new List<double>();
                foreach (double[] t in profiles){
                    double val = t[i];
                    if (double.IsNaN(val) || double.IsInfinity(val)){
                        continue;
                    }
                    vals.Add(val);
                }
                totalProfile[i] = vals.Count > 0 ? ArrayUtils.Median(vals) : double.NaN;
            }
            for (int i = 0; i < mdata.RowCount; i++){
                for (int j = 0; j < mdata.ColumnCount; j++){
                    mdata.Values.Set(i, j, mdata.Values.Get(i, j)-(float) totalProfile[j]);
                }
            }
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            const bool rows = true;
            bool percentage;
            int minValids = PerseusPluginUtils.GetMinValids(param, out percentage);
            ParameterWithSubParams<int> modeParam = param.GetParamWithSubParams<int>("Mode");
            int modeInd = modeParam.Value;
            if (modeInd != 0 && mdata.CategoryRowNames.Count == 0){
                processInfo.ErrString = "No grouping is defined.";
                return;
            }
            FilteringMode filterMode;
            double threshold;
            double threshold2;
            PerseusPluginUtils.ReadValuesShouldBeParams(param, out filterMode, out threshold, out threshold2);
            if (modeInd != 0){
                int gind = modeParam.GetSubParameters().GetParam<int>("Grouping").Value;
                string[][] groupCol = mdata.GetCategoryRowAt(gind);
                NonzeroFilterGroup(minValids, percentage, mdata, param, modeInd == 2, threshold, threshold2, filterMode, groupCol);
            } else{
                PerseusPluginUtils.NonzeroFilter1(rows, minValids, percentage, mdata, param, threshold, threshold2, filterMode);
            }
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int colInd = param.GetParam<int>("Column").Value;
            string searchString = param.GetParam<string>("Search string").Value;
            if (string.IsNullOrEmpty(searchString)){
                processInfo.ErrString = "Please provide a search string";
                return;
            }
            bool remove = param.GetParam<int>("Mode").Value == 0;
            bool matchCase = param.GetParam<bool>("Match case").Value;
            bool matchWholeWord = param.GetParam<bool>("Match whole word").Value;
            string[] vals = mdata.StringColumns[colInd];
            List<int> valids = new List<int>();
            for (int i = 0; i < vals.Length; i++){
                bool matches = Matches(vals[i], searchString, matchCase, matchWholeWord);
                if (matches && !remove){
                    valids.Add(i);
                } else if (!matches && remove){
                    valids.Add(i);
                }
            }
            PerseusPluginUtils.FilterRows(mdata, param, valids.ToArray());
        }
 public IAnalysisResult AnalyzeData(IMatrixData mdata, Parameters param, ProcessInfo processInfo)
 {
     return new SelectRowsManuallyResult(mdata);
 }
Esempio n. 20
0
 public static void ReadMatrixFromFile(IMatrixData mdata, ProcessInfo processInfo, string filename, char separator)
 {
     var annotationRows = new Dictionary<string, string[]>();
     var colNames = TabSep.GetColumnNames(filename, commentPrefix,
         commentPrefixExceptions, annotationRows, separator);
     var typeRow = annotationRows["Type"];
     int[] eInds, nInds, cInds, tInds, mInds;
     ColumnIndices(typeRow, out eInds, out nInds, out cInds, out tInds, out mInds);
     var filters = new List<Tuple<Relation[], int[], bool>>();
     int nrows;
     using(StreamReader reader = FileUtils.GetReader(filename))
     using (StreamReader auxReader = FileUtils.GetReader(filename))
     {
         nrows = GetRowCount(reader, auxReader, eInds, filters, separator);
     }
     using (StreamReader reader = FileUtils.GetReader(filename))
     using (StreamReader auxReader = FileUtils.GetReader(filename))
     {
         LoadMatrixData(annotationRows, eInds, cInds, nInds, tInds, mInds, processInfo, colNames, mdata, reader,
             auxReader, nrows, filename, separator, false, filters);
     }
 }
Esempio n. 21
0
        public static void ReadMatrixFromFile(IMatrixData mdata, ProcessInfo processInfo, string filename,
	        int[] eInds, int[] nInds, int[] cInds, int[] tInds, int[] mInds,
	        Parameters[] mainFilterParameters, Parameters[] numericalFilterParameters, bool shortenExpressionColumnNames)
        {
            if (!File.Exists(filename))
            {
                processInfo.ErrString = "File '" + filename + "' does not exist.";
                return;
            }
            string ftl = filename.ToLower();
            bool csv = ftl.EndsWith(".csv") || ftl.EndsWith(".csv.gz");
            char separator = csv ? ',' : '\t';
            string[] colNames;
            Dictionary<string, string[]> annotationRows = new Dictionary<string, string[]>();
            try
            {
                colNames = TabSep.GetColumnNames(filename, commentPrefix, commentPrefixExceptions,
                    annotationRows, separator);
            }
            catch (Exception)
            {
                processInfo.ErrString = "Could not open the file '" + filename + "'. It is probably opened in another program.";
                return;
            }
            string origin = filename;
            List<Tuple<Relation[], int[], bool>> filters = new List<Tuple<Relation[], int[], bool>>();
            string errString;
            foreach (Parameters p in mainFilterParameters)
            {
                AddFilter(filters, p, eInds, out errString);
                if (errString != null)
                {
                    processInfo.ErrString = errString;
                    return;
                }
            }
            foreach (Parameters p in numericalFilterParameters)
            {
                AddFilter(filters, p, nInds, out errString);
                if (errString != null)
                {
                    processInfo.ErrString = errString;
                    return;
                }
            }
            int nrows;
            using(StreamReader reader = FileUtils.GetReader(filename))
            using (StreamReader auxReader = FileUtils.GetReader(filename))
            {
                nrows = GetRowCount(reader, auxReader, eInds, filters, separator);
            }
            using (StreamReader reader = FileUtils.GetReader(filename))
            using (StreamReader auxReader = FileUtils.GetReader(filename))
            {
                LoadMatrixData(annotationRows, eInds, cInds, nInds, tInds, mInds, processInfo, colNames, mdata, reader,
                    auxReader, nrows, origin, separator, shortenExpressionColumnNames, filters);
            }
            GC.Collect();
        }
Esempio n. 22
0
        public static void LoadMatrixData(IDictionary<string, string[]> annotationRows, int[] eInds, int[] cInds, int[] nInds,
			int[] tInds, int[] mInds, ProcessInfo processInfo, IList<string> colNames, IMatrixData mdata, StreamReader reader,
			StreamReader auxReader, int nrows, string origin, char separator, bool shortenExpressionNames,
			List<Tuple<Relation[], int[], bool>> filters)
        {
            string[] colDescriptions = null;
            if (annotationRows.ContainsKey("Description")){
                colDescriptions = annotationRows["Description"];
                annotationRows.Remove("Description");
            }
            int[] allInds = ArrayUtils.Concat(new[]{eInds, cInds, nInds, tInds, mInds});
            Array.Sort(allInds);
            for (int i = 0; i < allInds.Length - 1; i++){
                if (allInds[i + 1] == allInds[i]){
                    processInfo.ErrString = "Column '" + colNames[allInds[i]] + "' has been selected multiple times";
                    return;
                }
            }
            string[] allColNames = ArrayUtils.SubArray(colNames, allInds);
            Array.Sort(allColNames);
            for (int i = 0; i < allColNames.Length - 1; i++){
                if (allColNames[i + 1].Equals(allColNames[i])){
                    processInfo.ErrString = "Column name '" + allColNames[i] + "' occurs multiple times.";
                    return;
                }
            }
            LoadMatrixData(colNames, colDescriptions, eInds, cInds, nInds, tInds, mInds, origin, mdata, annotationRows,
                processInfo.Progress, processInfo.Status, separator, reader, auxReader, nrows, shortenExpressionNames, filters);
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            bool keepEmpty = param.GetBoolParam("Keep rows without ID").Value;
            AverageType atype = GetAverageType(param.GetSingleChoiceParam("Average type for expression columns").Value);
            string[] ids2 = mdata.StringColumns[param.GetSingleChoiceParam("ID column").Value];
            string[][] ids = SplitIds(ids2);
            int[] present;
            int[] absent;
            GetPresentAbsentIndices(ids, out present, out absent);
            ids = ArrayUtils.SubArray(ids, present);
            int[][] rowInds = new int[present.Length][];
            for (int i = 0; i < rowInds.Length; i++){
                rowInds[i] = new[]{present[i]};
            }
            ClusterRows(ref rowInds, ref ids);
            if (keepEmpty){
                rowInds = ProlongRowInds(rowInds, absent);
            }
            int nrows = rowInds.Length;
            int ncols = mdata.ExpressionColumnCount;
            float[,] expVals = new float[nrows,ncols];
            for (int j = 0; j < ncols; j++){
                float[] c = mdata.GetExpressionColumn(j);
                for (int i = 0; i < nrows; i++){
                    float[] d = ArrayUtils.SubArray(c, rowInds[i]);
                    expVals[i, j] = Average(d, atype);
                }
            }
            mdata.ExpressionValues = expVals;
            for (int i = 0; i < mdata.NumericColumnCount; i++){
                string name = mdata.NumericColumnNames[i];
                AverageType atype1 = GetAverageType(param.GetSingleChoiceParam("Average type for " + name).Value);
                double[] c = mdata.NumericColumns[i];
                double[] newCol = new double[nrows];
                for (int k = 0; k < nrows; k++){
                    double[] d = ArrayUtils.SubArray(c, rowInds[k]);
                    newCol[k] = Average(d, atype1);
                }
                mdata.NumericColumns[i] = newCol;
            }
            for (int i = 0; i < mdata.CategoryColumnCount; i++){
                string[][] c = mdata.GetCategoryColumnAt(i);
                string[][] newCol = new string[nrows][];
                for (int k = 0; k < nrows; k++){
                    string[][] d = ArrayUtils.SubArray(c, rowInds[k]);
                    newCol[k] = Average(d);
                }
                mdata.SetCategoryColumnAt(newCol,i);
            }
            for (int i = 0; i < mdata.StringColumnCount; i++){
                string[] c = mdata.StringColumns[i];
                string[] newCol = new string[nrows];
                for (int k = 0; k < nrows; k++){
                    string[] d = ArrayUtils.SubArray(c, rowInds[k]);
                    newCol[k] = Average(d);
                }
                mdata.StringColumns[i] = newCol;
            }
            for (int i = 0; i < mdata.MultiNumericColumnCount; i++){
                double[][] c = mdata.MultiNumericColumns[i];
                double[][] newCol = new double[nrows][];
                for (int k = 0; k < nrows; k++){
                    double[][] d = ArrayUtils.SubArray(c, rowInds[k]);
                    newCol[k] = Average(d);
                }
                mdata.MultiNumericColumns[i] = newCol;
            }
        }
        public void ProcessData(IMatrixData mdata, Parameters param1, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int[] multiNumCols = param1.GetParam<int[]>("Multi-numeric columns").Value;
            Array.Sort(multiNumCols);
            int[] stringCols = param1.GetParam<int[]>("Text columns").Value;
            Array.Sort(stringCols);
            HashSet<int> multinumCols2 = new HashSet<int>(multiNumCols);
            HashSet<int> stringCols2 = new HashSet<int>(stringCols);
            if (multiNumCols.Length + stringCols.Length == 0){
                processInfo.ErrString = "Please select some columns.";
                return;
            }
            int rowCount = GetNewRowCount(mdata, multiNumCols, stringCols);
            float[,] expVals = new float[rowCount, mdata.ColumnCount];
            List<string[]> stringC = new List<string[]>();
            for (int i = 0; i < mdata.StringColumnCount; i++){
                stringC.Add(new string[rowCount]);
            }
            List<double[]> numC = new List<double[]>();
            for (int i = 0; i < mdata.NumericColumnCount; i++){
                numC.Add(new double[rowCount]);
            }
            List<string[][]> catC = new List<string[][]>();
            for (int i = 0; i < mdata.CategoryColumnCount; i++){
                catC.Add(new string[rowCount][]);
            }
            List<double[][]> multiNumC = new List<double[][]>();
            for (int i = 0; i < mdata.MultiNumericColumnCount; i++){
                multiNumC.Add(new double[rowCount][]);
            }
            int count = 0;
            for (int i = 0; i < mdata.RowCount; i++){
                string err;
                int entryCount = GetEntryCount(i, mdata, multiNumCols, stringCols, out err);
                if (err != null){
                    processInfo.ErrString = err;
                    return;
                }
                bool empty = entryCount == 0;
                entryCount = Math.Max(entryCount, 1);
                for (int j = 0; j < entryCount; j++){
                    for (int k = 0; k < mdata.ColumnCount; k++){
                        expVals[count + j, k] = mdata.Values.Get(i, k);
                    }
                    for (int k = 0; k < mdata.NumericColumnCount; k++){
                        numC[k][count + j] = mdata.NumericColumns[k][i];
                    }
                    for (int k = 0; k < mdata.CategoryColumnCount; k++){
                        catC[k][count + j] = mdata.GetCategoryColumnEntryAt(k, i);
                    }
                }
                for (int k = 0; k < mdata.MultiNumericColumnCount; k++){
                    if (multinumCols2.Contains(k)){
                        if (empty){
                            multiNumC[k][count] = new double[0];
                        } else{
                            double[] vals = mdata.MultiNumericColumns[k][i];
                            for (int j = 0; j < entryCount; j++){
                                multiNumC[k][count + j] = new[]{vals[j]};
                            }
                        }
                    } else{
                        for (int j = 0; j < entryCount; j++){
                            multiNumC[k][count + j] = mdata.MultiNumericColumns[k][i];
                        }
                    }
                }
                for (int k = 0; k < mdata.StringColumnCount; k++){
                    if (stringCols2.Contains(k)){
                        if (empty){
                            stringC[k][count] = "";
                        } else{
                            string[] vals = mdata.StringColumns[k][i].Split(';');
                            for (int j = 0; j < entryCount; j++){
                                stringC[k][count + j] = vals[j];
                            }
                        }
                    } else{
                        for (int j = 0; j < entryCount; j++){
                            stringC[k][count + j] = mdata.StringColumns[k][i];
                        }
                    }
                }
                count += entryCount;
            }
            int[] multiNumComplement = ArrayUtils.Complement(multiNumCols, mdata.MultiNumericColumnCount);
            List<double[][]> toBeTransformed = ArrayUtils.SubList(multiNumC, multiNumCols);
            multiNumC = ArrayUtils.SubList(multiNumC, multiNumComplement);
            foreach (double[][] d in toBeTransformed){
                numC.Add(Transform(d));
            }
            mdata.ColumnNames = mdata.ColumnNames;
            mdata.Values.Set(expVals);
            mdata.SetAnnotationColumns(mdata.StringColumnNames, stringC, mdata.CategoryColumnNames, catC,
                new List<string>(ArrayUtils.Concat(mdata.NumericColumnNames,
                    ArrayUtils.SubList(mdata.MultiNumericColumnNames, multiNumCols))), numC,
                new List<string>(ArrayUtils.SubArray(mdata.MultiNumericColumnNames, multiNumComplement)), multiNumC);
        }
Esempio n. 25
0
        public void ParseFile(string path, ProcessInfo processInfo)
        {
            processInfo.Status("Parsing " + path);
            string accession = "";
            int sequenceCounter = 0;
            StringBuilder sequence = new StringBuilder();
            ProteinSequence protein = new ProteinSequence();
            try{
                StreamReader file = new StreamReader(path);
                string line;
                while ((line = file.ReadLine()) != null){ // valid line
                    if (sequenceCounter%500 == 0){
                        processInfo.Status("Parsing " + path + ", " + (int) ((float) file.BaseStream.Position/file.BaseStream.Length*100) +
                                            "%");
                    }
                    bool lineIsHeader = line.StartsWith(">");

                    // skip all lines until the first header is found
                    if (sequenceCounter == 0 && !lineIsHeader){
                        continue;
                    }

                    // line is a piece of a sequence
                    if (sequenceCounter > 0 && !lineIsHeader){
                        sequence.Append(line.Trim());
                        continue;
                    }

                    // line is a fasta header
                    if (lineIsHeader){
                        if (sequenceCounter > 0)
                            // this is not the first header, i.e. the previous sequence is now completely read in
                        {
                            // add the previous protein
                            protein.SetSequence(sequence.ToString());
                            entries.Add(accession, protein);
                        }
                        // initialize a new protein
                        protein = new ProteinSequence();
                        sequenceCounter++;
                        // then parse the new header
                        string header = line;
                        Match m = regexUniprotAccession.Match(header);
                        if (m.Success){ // uniprot header
                            accession = m.Groups[1].Value;
                            protein.Accession = accession;
                            protein.Header = header;
                        } else{ // fallback position: take entire header after the > as accession
                            accession = header.Substring(1).Trim();
                            protein.Accession = accession;
                            protein.Header = header;
                        }
                        sequence = new StringBuilder();
                    }
                } //end while
                file.Close();

                //add the last protein
                if (sequenceCounter > 0){ // make sure there is at least one sequence in the file
                    protein.SetSequence(sequence.ToString());
                    entries.Add(accession, protein);
                }
            } catch (Exception){
                processInfo.ErrString =
                    "Something went wrong while parsing the fasta file.\nMake sure the path is correct and the " +
                    "file is not opened in another application.\nMake sure the fasta file is valid.";
            }
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            double width = param.GetParam<double>("Width").Value;
            double shift = param.GetParam<double>("Down shift").Value;
            bool separateColumns = param.GetParam<int>("Mode").Value == 1;
            int[] cols = param.GetParam<int[]>("Columns").Value;
            if (cols.Length == 0){
                return;
            }
            if (separateColumns){
                ReplaceMissingsByGaussianByColumn(width, shift, mdata, cols);
            } else{
                string err = ReplaceMissingsByGaussianWholeMatrix(width, shift, mdata, cols);
                if (err != null){
                    processInfo.ErrString = err;
                }
            }
        }
        public void ProcessData(IMatrixData data, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            bool falseAreIndicated = param.GetSingleChoiceParam("Indicated are").Value == 0;
            int catCol = param.GetSingleChoiceParam("In column").Value;
            string word = param.GetStringParam("Indicator").Value;
            int[] scoreColumns = param.GetMultiChoiceParam("Scores").Value;
            if (scoreColumns.Length == 0){
                processInfo.ErrString = "Please specify at least one column with scores.";
                return;
            }
            bool largeIsGood = param.GetBoolParam("Large values are good").Value;
            int[] showColumns = param.GetMultiChoiceParam("Display quantity").Value;
            if (showColumns.Length == 0){
                processInfo.ErrString = "Please select at least one quantity to display";
                return;
            }
            bool[] indCol = GetIndicatorColumn(falseAreIndicated, catCol, word, data);
            List<string> expColNames = new List<string>();
            List<float[]> expCols = new List<float[]>();
            foreach (int scoreColumn in scoreColumns){
                double[] vals = scoreColumn < data.NumericColumnCount
                    ? data.NumericColumns[scoreColumn]
                    : ArrayUtils.ToDoubles(data.GetExpressionColumn(scoreColumn - data.NumericColumnCount));
                string name = scoreColumn < data.NumericColumnCount
                    ? data.NumericColumnNames[scoreColumn] : data.ExpressionColumnNames[scoreColumn - data.NumericColumnCount];
                int[] order = GetOrder(vals, largeIsGood);
                CalcCurve(ArrayUtils.SubArray(indCol, order), showColumns, name, expCols, expColNames);
            }
            float[,] expData = ToMatrix(expCols);
            data.SetData(data.Name, expColNames, expData, new List<string>(), new List<string[]>(), new List<string>(),
                new List<string[][]>(), new List<string>(), new List<double[]>(), new List<string>(), new List<double[][]>());
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            string[] seqWins;
            string[] accs;
            string[] function;
            string[] process;
            string[] protInteract;
            string[] otherInteract;
            string[] notes;
            string[] species;
            PhosphoSitePlusParser.ParseRegulatorySites(out seqWins, out accs, out function, out process, out protInteract,
                out otherInteract, out notes, out species);
            if (seqWins == null){
                processInfo.ErrString = "File  does not exist.";
                return;
            }
            string[] up = mdata.StringColumns[param.GetParam<int>("Uniprot column").Value];
            string[][] uprot = new string[up.Length][];
            for (int i = 0; i < up.Length; i++){
                uprot[i] = up[i].Length > 0 ? up[i].Split(';') : new string[0];
            }
            string[] win = mdata.StringColumns[param.GetParam<int>("Sequence window").Value];
            Dictionary<string, List<int>> map = new Dictionary<string, List<int>>();
            for (int i = 0; i < seqWins.Length; i++){
                string acc = accs[i];
                if (!map.ContainsKey(acc)){
                    map.Add(acc, new List<int>());
                }
                map[acc].Add(i);
            }
            string[][] newCatCol = new string[uprot.Length][];
            string[][] function2 = new string[uprot.Length][];
            string[][] process2 = new string[uprot.Length][];
            string[][] protInteract2 = new string[uprot.Length][];
            string[][] otherInteract2 = new string[uprot.Length][];
            string[][] notes2 = new string[uprot.Length][];
            for (int i = 0; i < uprot.Length; i++){
                string[] win1 = TransformIl(win[i]).Split(';');
                HashSet<string> wins = new HashSet<string>();
                HashSet<string> function1 = new HashSet<string>();
                HashSet<string> process1 = new HashSet<string>();
                HashSet<string> protInteract1 = new HashSet<string>();
                HashSet<string> otherInteract1 = new HashSet<string>();
                HashSet<string> notes1 = new HashSet<string>();
                foreach (string ux in uprot[i]){
                    if (map.ContainsKey(ux)){
                        List<int> n = map[ux];
                        foreach (int ind in n){
                            string s = seqWins[ind];
                            if (Contains(win1, TransformIl(s.ToUpper().Substring(1, s.Length - 2)))){
                                wins.Add(s);
                                if (function[ind].Length > 0){
                                    function1.Add(function[ind]);
                                }
                                if (process[ind].Length > 0){
                                    process1.Add(process[ind]);
                                }
                                if (protInteract[ind].Length > 0){
                                    protInteract1.Add(protInteract[ind]);
                                }
                                if (otherInteract[ind].Length > 0){
                                    otherInteract1.Add(otherInteract[ind]);
                                }
                                if (notes[ind].Length > 0){
                                    notes1.Add(notes[ind]);
                                }
                            }
                        }
                    }
                }
                if (wins.Count > 0){
                    newCatCol[i] = new[]{"+"};
                    function2[i] = ArrayUtils.ToArray(function1);
                    process2[i] = ArrayUtils.ToArray(process1);
                    protInteract2[i] = ArrayUtils.ToArray(protInteract1);
                    otherInteract2[i] = ArrayUtils.ToArray(otherInteract1);
                    notes2[i] = ArrayUtils.ToArray(notes1);
                } else{
                    newCatCol[i] = new string[0];
                    function2[i] = new string[0];
                    process2[i] = new string[0];
                    protInteract2[i] = new string[0];
                    otherInteract2[i] = new string[0];
                    notes2[i] = new string[0];
                }
            }
            mdata.AddCategoryColumn("Regulatory site", "", newCatCol);
            mdata.AddCategoryColumn("Regulatory site function", "", function2);
            mdata.AddCategoryColumn("Regulatory site process", "", process2);
            mdata.AddCategoryColumn("Regulatory site protInteract", "", protInteract2);
            mdata.AddCategoryColumn("Regulatory site otherInteract", "", otherInteract2);
            mdata.AddCategoryColumn("Regulatory site notes", "", notes2);
        }
        public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
			ref IDocumentData[] documents, ProcessInfo processInfo)
        {
            int[] colIndx = param.GetParam<int[]>("x").Value;
            int[] colIndy = param.GetParam<int[]>("y").Value;
            if (colIndx.Length == 0){
                processInfo.ErrString = "Please select some columns";
                return;
            }
            if (colIndx.Length != colIndy.Length){
                processInfo.ErrString = "Please select the same number of columns in the boxes for the first and second columns.";
                return;
            }
            int typeInd = param.GetParam<int>("Distribution type").Value;
            int points = param.GetParam<int>("Number of points").Value;
            for (int k = 0; k < colIndx.Length; k++){
                float[] xvals = GetColumn(mdata, colIndx[k]);
                float[] yvals = GetColumn(mdata, colIndy[k]);
                float[] xvals1;
                float[] yvals1;
                GetValidPairs(xvals, yvals, out xvals1, out yvals1);
                double xmin;
                double xmax;
                double ymin;
                double ymax;
                DensityEstimation.CalcRanges(xvals1, yvals1, out xmin, out xmax, out ymin, out ymax);
                float[,] values = DensityEstimation.GetValuesOnGrid(xvals1, xmin, (xmax - xmin)/points, points, yvals1, ymin,
                    (ymax - ymin)/points, points);
                if (typeInd == 1){
                    MakeConditional1(values);
                }
                if (typeInd == 2){
                    MakeConditional2(values);
                }
                if (typeInd == 3){
                    MakeConditional3(values);
                }
                DensityEstimation.DivideByMaximum(values);
                double[] xmat = new double[points];
                for (int i = 0; i < points; i++){
                    xmat[i] = xmin + i*(xmax - xmin)/points;
                }
                double[] ymat = new double[points];
                for (int i = 0; i < points; i++){
                    ymat[i] = ymin + i*(ymax - ymin)/points;
                }
                float[,] percvalues = CalcExcludedPercentage(values);
                double[] dvals = new double[xvals.Length];
                double[] pvals = new double[xvals.Length];
                for (int i = 0; i < dvals.Length; i++){
                    double xx = xvals[i];
                    double yy = yvals[i];
                    if (!double.IsNaN(xx) && !double.IsNaN(yy)){
                        int xind = ArrayUtils.ClosestIndex(xmat, xx);
                        int yind = ArrayUtils.ClosestIndex(ymat, yy);
                        dvals[i] = values[xind, yind];
                        pvals[i] = percvalues[xind, yind];
                    } else{
                        dvals[i] = double.NaN;
                        pvals[i] = double.NaN;
                    }
                }
                string xname = GetColumnName(mdata, colIndx[k]);
                string yname = GetColumnName(mdata, colIndy[k]);
                mdata.AddNumericColumn("Density_" + xname + "_" + yname,
                    "Density of data points in the plane spanned by the columns " + xname + " and " + yname + ".", dvals);
                mdata.AddNumericColumn("Excluded fraction_" + xname + "_" + yname,
                    "Percentage of points with a point density smaller than at this point in the plane spanned by the columns " + xname +
                    " and " + yname + ".", pvals);
            }
        }
Esempio n. 30
0
 public abstract IMatrixData ProcessData(IMatrixData[] inputData, Parameters param, ref IMatrixData[] supplTables,
                                         ref IDocumentData[] documents, ProcessInfo processInfo);