/// <summary>
/// Writes one tab-separated reference row per FASTA header line found in the given database files
/// into a temporary file ("databaseref.txt" in the temp folder) and loads that file into a new matrix.
/// </summary>
/// <param name="processInfo">Receives progress and status updates.</param>
/// <param name="nThreads">Thread count handed to the <c>ThreadDistributor</c>.</param>
/// <param name="databases">Databases to read; may be null or empty.</param>
/// <returns>The loaded matrix, or null when <paramref name="databases"/> is null or empty.</returns>
private IMatrixData ProcessDbFiles(ProcessInfo processInfo, int nThreads, IList<Database> databases) {
    string tempFile = Path.Combine(FileUtils.GetTempFolder(), "databaseref.txt");
    IMatrixData matrix;
    StreamWriter writer = null;
    // No catch block: exceptions propagate unchanged, so the original stack trace is kept
    // (the former "throw ex;" reset it).
    try {
        processInfo.Progress(0);
        processInfo.Status(string.Format("Read database files [{0}|{1}]", 0, "?"));
        Enum[] enums = new Enum[] {
            databaseRef.file, databaseRef.source, databaseRef.specie, databaseRef.taxid, databaseRef.version,
            databaseRef.identifier
        };
        IList<string> header = enums.Select(Constants.GetPattern).ToList();
        if (databases == null || databases.Count == 0) {
            return null;
        }
        writer = new StreamWriter(tempFile);
        int nTasks = databases.Count;
        // Column names followed by the type row (every column is marked "T").
        writer.WriteLine(StringUtils.Concat("\t", header));
        writer.WriteLine("#!{Type}" + StringUtils.Concat("\t", header.Select(x => "T")));
        // NOTE(review): assumes Start() returns only after all tasks finished - the writer is closed
        // right afterwards. Confirm against ThreadDistributor.
        ThreadDistributor distr = new ThreadDistributor(nThreads, nTasks,
            x => ParseDatabase(writer, databases[x], string.Format("Read database files [{0}|{1}]", x + 1, nTasks),
                (x + 1)*100/nTasks, processInfo));
        distr.Start();
        processInfo.Status("Close all files");
        // Close() also disposes the writer; it must be closed before the file is read back.
        writer.Close();
        writer = null;
        processInfo.Progress(0);
        processInfo.Status("Create DatabaseRef Matrix");
        matrix = new MatrixData();
        LoadData(matrix, tempFile, processInfo);
    } finally {
        try {
            if (writer != null) {
                writer.Close();
            }
        } finally {
            // Remove the temp file even if closing the writer failed.
            if (File.Exists(tempFile)) {
                File.Delete(tempFile);
            }
        }
    }
    return matrix;
}
/// <summary>
/// Reads a FASTA file and adds one <c>ProteinSequence</c> per header line to <c>entries</c>,
/// keyed by accession. Lines before the first header are skipped.
/// </summary>
/// <param name="path">Path of the FASTA file.</param>
/// <param name="processInfo">
/// Receives status updates; its <c>ErrString</c> is set when any exception occurs while parsing.
/// </param>
public void ParseFile(string path, ProcessInfo processInfo) {
    processInfo.Status("Parsing " + path);
    string accession = "";
    int sequenceCounter = 0;
    StringBuilder sequence = new StringBuilder();
    ProteinSequence protein = new ProteinSequence();
    try {
        // "using" closes the reader even when parsing throws half-way through the file.
        using (StreamReader file = new StreamReader(path)) {
            string line;
            while ((line = file.ReadLine()) != null) {
                // Ordinal comparison guarantees the first character really is '>', which the
                // Substring(1) fallback below relies on.
                bool lineIsHeader = line.StartsWith(">", StringComparison.Ordinal);
                if (!lineIsHeader) {
                    // A piece of a sequence; anything before the first header is ignored.
                    if (sequenceCounter > 0) {
                        sequence.Append(line.Trim());
                    }
                    continue;
                }
                // Report progress once per 500 headers (checked on header lines only, so the status
                // is not re-sent for every sequence line in between).
                if (sequenceCounter%500 == 0) {
                    processInfo.Status("Parsing " + path + ", " +
                        (int) ((float) file.BaseStream.Position/file.BaseStream.Length*100) + "%");
                }
                if (sequenceCounter > 0) {
                    // Not the first header: the previous sequence is now completely read in.
                    // NOTE(review): if entries rejects duplicate keys, a repeated accession aborts the
                    // whole parse via the catch below - confirm this is intended.
                    protein.SetSequence(sequence.ToString());
                    entries.Add(accession, protein);
                }
                // Start a new protein from this header.
                protein = new ProteinSequence();
                sequenceCounter++;
                Match m = regexUniprotAccession.Match(line);
                // Fallback when the accession pattern does not match: take the entire header after the '>'.
                accession = m.Success ? m.Groups[1].Value : line.Substring(1).Trim();
                protein.Accession = accession;
                protein.Header = line;
                sequence = new StringBuilder();
            }
        }
        // Add the last protein, provided the file contained at least one header.
        if (sequenceCounter > 0) {
            protein.SetSequence(sequence.ToString());
            entries.Add(accession, protein);
        }
    } catch (Exception) {
        processInfo.ErrString =
            "Something went wrong while parsing the fasta file.\nMake sure the path is correct and the " +
            "file is not opened in another application.\nMake sure the fasta file is valid.";
    }
}
/// <summary>
/// Writes one tab-separated spectrum reference row per matching entry of the given peak list files
/// into a temporary file ("spectraref.txt" in the temp folder) and loads that file into a new matrix.
/// </summary>
/// <param name="processInfo">Receives progress and status updates.</param>
/// <param name="nThreads">Thread count handed to the <c>ThreadDistributor</c>.</param>
/// <param name="aplfiles">Runs whose peak list files are read; may be null or empty.</param>
/// <returns>The loaded matrix, or null when <paramref name="aplfiles"/> is null or empty.</returns>
private IMatrixData ProcessAplFiles(ProcessInfo processInfo, int nThreads, IList<MsRunImpl> aplfiles) {
    string tempFile = Path.Combine(FileUtils.GetTempFolder(), "spectraref.txt");
    // Start from a clean slate in case an earlier run left the file behind.
    if (File.Exists(tempFile)) {
        File.Delete(tempFile);
    }
    IMatrixData matrix;
    StreamWriter writer = null;
    // No catch block: exceptions propagate unchanged, so the original stack trace is kept
    // (the former "throw ex;" reset it).
    try {
        Enum[] enums = new Enum[] {
            spectraRef.raw_file, spectraRef.charge, spectraRef.scan_number, spectraRef.location, spectraRef.format,
            spectraRef.id_format, spectraRef.fragmentation, spectraRef.mz, spectraRef.index
        };
        IList<string> header = enums.Select(Constants.GetPattern).ToList();
        if (aplfiles == null || aplfiles.Count == 0) {
            return null;
        }
        int nTasks = aplfiles.Count;
        processInfo.Progress(0);
        processInfo.Status(string.Format("Read Andromeda peaklist files [{0}|{1}]", 0, nTasks));
        writer = new StreamWriter(tempFile);
        // Column names followed by the type row (every column is marked "T").
        writer.WriteLine(StringUtils.Concat("\t", header));
        writer.WriteLine("#!{Type}" + StringUtils.Concat("\t", header.Select(x => "T")));
        // NOTE(review): assumes Start() returns only after all tasks finished - the writer is closed
        // right afterwards. Confirm against ThreadDistributor.
        ThreadDistributor distr = new ThreadDistributor(nThreads, nTasks,
            x => ParseAplFile(aplfiles[x], writer,
                string.Format("Read Andromeda peaklist files [{0}|{1}]", x + 1, nTasks), (x + 1)*100/nTasks,
                processInfo));
        distr.Start();
        processInfo.Status("Close all files");
        // Close() also disposes the writer; it must be closed before the file is read back.
        writer.Close();
        writer = null;
        processInfo.Progress(0);
        processInfo.Status("Create SpectraRef matrix");
        matrix = new MatrixData();
        LoadData(matrix, tempFile, processInfo);
    } finally {
        try {
            if (writer != null) {
                writer.Close();
            }
        } finally {
            // Remove the temp file even if closing the writer failed.
            if (File.Exists(tempFile)) {
                File.Delete(tempFile);
            }
        }
    }
    return matrix;
}
/// <summary>
/// Scans one database file and writes a tab-separated row (file, source, species, taxid, version,
/// identifier) to <paramref name="writer"/> for every line that starts with '>'.
/// Does nothing when the database has no file or the file does not exist.
/// </summary>
/// <param name="writer">Shared output writer; each write is done under a lock on the writer.</param>
/// <param name="db">Database description providing the file path, search expression and metadata.</param>
/// <param name="status">Status text reported once the file has been read.</param>
/// <param name="progress">Progress value reported once the file has been read.</param>
/// <param name="processInfo">Receives the progress and status update (under a lock on the object).</param>
private static void ParseDatabase(StreamWriter writer, Database db, string status, int progress,
    ProcessInfo processInfo) {
    if (db.File == null || !File.Exists(db.File)) {
        return;
    }
    // "using" releases the file handle even if the regex is invalid or a write fails.
    using (StreamReader reader = new StreamReader(db.File)) {
        Regex regex = new Regex(db.SearchExpression);
        string line;
        while ((line = reader.ReadLine()) != null) {
            if (!line.StartsWith(">", StringComparison.Ordinal)) {
                continue;
            }
            // Group 1 of the search expression is the identifier; it is empty when the header does not match.
            string identifier = regex.Match(line).Groups[1].Value;
            object[] items = new object[] {
                db.File, db.Source, db.Species, db.Taxid, db.Version,
                db.Prefix == null ? identifier : db.Prefix + identifier
            };
            lock (writer) {
                writer.WriteLine(StringUtils.Concat("\t", items));
            }
        }
    }
    lock (processInfo) {
        processInfo.Progress(progress);
        processInfo.Status(status);
    }
}
/// <summary>
/// Parses one peak list file and writes a tab-separated row to <paramref name="writer"/> for every
/// entry whose title contains "RawFile: ... Index: n" (or "Rawfile: ...").
/// </summary>
/// <param name="aplfile">Run whose <c>Location</c> points at the file to parse.</param>
/// <param name="writer">Shared output writer; each write is done under a lock on the writer.</param>
/// <param name="status">Status text reported before parsing starts.</param>
/// <param name="progress">Progress value reported before parsing starts.</param>
/// <param name="processInfo">Receives the progress and status update (under a lock on the object).</param>
private static void ParseAplFile(MsRunImpl aplfile, StreamWriter writer, string status, int progress,
    ProcessInfo processInfo) {
    lock (processInfo) {
        processInfo.Progress(progress);
        processInfo.Status(status);
    }
    string file = aplfile.Location.Value;
    string form = aplfile.Format == null ? "" : aplfile.Format.Name;
    string idform = aplfile.IdFormat == null ? "" : aplfile.IdFormat.Name;
    // Running number of the matching entries of this file, starting at 1.
    int index = 0;
    // "[fF]" instead of the former "[f|F]": inside a character class '|' is a literal, so the old
    // pattern also accepted "Raw|ile".
    Regex regex = new Regex("Raw[fF]ile: (.*) Index: ([0-9]+)");
    AplParser parser = new AplParser(delegate(AplEntry entry) {
        // Match once and reuse the result for both groups.
        Match match = regex.Match(entry.Title);
        if (!match.Success) {
            return;
        }
        string rawfile = match.Groups[1].Value;
        string scannumber = match.Groups[2].Value;
        index++;
        object[] items = new object[] {
            rawfile, entry.PrecursorCharge, scannumber, file, form, idform, entry.Fragmentation, entry.Mz,
            index.ToString(CultureInfo.InvariantCulture)
        };
        lock (writer) {
            writer.WriteLine(StringUtils.Concat("\t", items));
        }
    });
    parser.Parse(file);
    lock (writer) {
        writer.Flush();
    }
}
/// <summary>
/// Builds the experiment matrix (one row per assay with study variable, assay, MS run, sample and
/// instrument columns) from the parameters, and creates supplementary tables from the peak list
/// (".apl") files and the database files. While running, console output and error output are
/// redirected into the first document.
/// </summary>
/// <param name="inputData">Input matrices; only the first one is used, to create the output instance.</param>
/// <param name="param">Parameters holding the MS runs, study variables, samples, assays and databases.</param>
/// <param name="supplTables">Set to the supplementary tables on success.</param>
/// <param name="documents">Created when null; the first document receives the console output.</param>
/// <param name="processInfo">Receives progress and status updates.</param>
/// <returns>The experiment matrix, or null when any exception occurred (the error is reported).</returns>
public override IMatrixData ProcessData(IMatrixData[] inputData, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    TextWriter defaultOut = Console.Out;
    TextWriter defaultErr = Console.Error;
    // Declared outside the try block so the failure path can still flush and release it.
    TextWriter logger = null;
    try {
        if (documents == null) {
            documents = new IDocumentData[NumDocuments];
            for (int i = 0; i < NumDocuments; i++) {
                documents[i] = new DocumentData();
            }
        }
        if (documents.Length > 0) {
            logger = new StreamWriter(new DocumentStream(documents[0]));
            Console.SetOut(logger);
            Console.SetError(logger);
        }
        int nThreads = GetMaxThreads(param);

        // Collect the MS runs.
        IList<MsRunImpl> runs = new List<MsRunImpl>();
        SingleChoiceWithSubParams singleSub =
            param.GetParam(MetadataElement.MS_RUN.Name) as SingleChoiceWithSubParams;
        if (singleSub != null) {
            MsRunParam sub =
                singleSub.SubParams[singleSub.Value].GetAllParameters().FirstOrDefault() as MsRunParam;
            if (sub != null && sub.Value != null) {
                foreach (MsRunImpl run in sub.Value) {
                    runs.Add(run);
                }
            }
        }

        // Collect the study variables.
        IList<StudyVariable> studyVariables = new List<StudyVariable>();
        singleSub = param.GetParam(MetadataElement.STUDY_VARIABLE.Name) as SingleChoiceWithSubParams;
        if (singleSub != null) {
            StudyVariableParam sub =
                singleSub.SubParams[singleSub.Value].GetAllParameters().FirstOrDefault() as StudyVariableParam;
            if (sub != null && sub.Value != null) {
                foreach (StudyVariable variable in sub.Value) {
                    studyVariables.Add(variable);
                }
            }
        }

        // Collect the samples.
        IList<Sample> samples = new List<Sample>();
        singleSub = param.GetParam(MetadataElement.SAMPLE.Name) as SingleChoiceWithSubParams;
        if (singleSub != null) {
            SampleParam sub =
                singleSub.SubParams[singleSub.Value].GetAllParameters().FirstOrDefault() as SampleParam;
            if (sub != null && sub.Value != null) {
                foreach (Sample sample in sub.Value) {
                    samples.Add(sample);
                }
            }
        }

        // Collect the assays.
        IList<Assay> assays = new List<Assay>();
        singleSub = param.GetParam(MetadataElement.ASSAY.Name) as SingleChoiceWithSubParams;
        if (singleSub != null) {
            AssayParam sub =
                singleSub.SubParams[singleSub.Value].GetAllParameters().FirstOrDefault() as AssayParam;
            if (sub != null && sub.Value != null) {
                foreach (Assay assay in sub.Value) {
                    assays.Add(assay);
                }
            }
        }

        // Collect the databases.
        IList<Database> databases = new List<Database>();
        singleSub = param.GetParam("database") as SingleChoiceWithSubParams;
        if (singleSub != null) {
            DatabaseParam sub =
                singleSub.SubParams[singleSub.Value].GetAllParameters().FirstOrDefault() as DatabaseParam;
            if (sub != null && sub.Value != null) {
                foreach (Database db in sub.Value) {
                    databases.Add(db);
                }
            }
        }

        IMatrixData output = (IMatrixData) inputData[0].CreateNewInstance(DataType.Matrix);
        List<string> columnnames = new List<string> {
            MetadataElement.STUDY_VARIABLE.Name,
            MetadataElement.ASSAY.Name,
            MetadataElement.MS_RUN.Name,
            MetadataElement.SAMPLE.Name,
            MetadataElement.INSTRUMENT.Name
        };
        // One string array per column, one slot per assay.
        List<string[]> matrix = new List<string[]>();
        for (int i = 0; i < columnnames.Count; i++) {
            matrix.Add(new string[assays.Count]);
        }
        for (int i = 0; i < assays.Count; i++) {
            Assay assay = assays[i];
            MsRunImpl runImpl = runs.FirstOrDefault(x => x.Id.Equals(assay.MsRun.Id));
            // NOTE(review): the instrument is looked up by the MS run id - confirm that instrument ids
            // and run ids are meant to coincide.
            Instrument instrument = instruments.FirstOrDefault(x => x.Id.Equals(assay.MsRun.Id));
            if (runImpl == null) {
                // Assays without a matching run get no row; AddRow is simply not called for them.
                continue;
            }
            // Positional defaults, overridden below when a study variable references this assay.
            StudyVariable studyVariable = i < studyVariables.Count ? studyVariables[i] : null;
            Sample sample = i < samples.Count ? samples[i] : null;
            foreach (StudyVariable s in studyVariables) {
                if (!s.AssayMap.ContainsKey(assay.Id)) {
                    continue;
                }
                studyVariable = s;
                try {
                    int sampleId = studyVariable.SampleMap.FirstOrDefault().Key;
                    sample = samples.FirstOrDefault(x => x.Id.Equals(sampleId));
                } catch (Exception) {
                    Console.Error.WriteLine("Can not find sample");
                }
                break;
            }
            AddRow(matrix, columnnames, i, runImpl, assay, sample, studyVariable, instrument);
        }
        output.SetData(Matrix.Experiment, new List<string>(), new float[assays.Count,columnnames.Count],
            columnnames, matrix, new List<string>(), new List<string[][]>(), new List<string>(),
            new List<double[]>(), new List<string>(), new List<double[][]>(), new List<string>(),
            new List<string[][]>(), new List<string>(), new List<double[]>());

        IList<IMatrixData> supplement = new List<IMatrixData>();
        try {
            IList<MsRunImpl> aplfiles =
                runs.Where(x => x.Location != null && x.Location.Value.EndsWith(".apl")).ToList();
            IMatrixData temp = ProcessAplFiles(processInfo, nThreads, aplfiles);
            if (temp != null) {
                supplement.Add(temp);
            }
        } catch (Exception e) {
            // Message unchanged; the original exception is additionally kept as InnerException.
            throw new Exception("Could not parse spectra file(s)! " + e.Message + "\n" + e.StackTrace, e);
        }
        try {
            // With fewer databases than threads a single thread is used.
            IMatrixData temp = ProcessDbFiles(processInfo, databases.Count < nThreads ? 1 : nThreads, databases);
            if (temp != null) {
                supplement.Add(temp);
            }
        } catch (Exception e) {
            throw new Exception("Could not parse database file(s)! " + e.Message + "\n" + e.StackTrace, e);
        }
        if (logger != null) {
            logger.Dispose();
            logger = null;
        }
        supplTables = supplement.ToArray();
        processInfo.Status("Define Experiment: DONE!");
        processInfo.Progress(100);
        return output;
    } catch (Exception e) {
        string msg = "Process aborted! " + e.Message;
        MessageBox.Show(msg);
        Logger.Error(Name, msg);
        processInfo.Status(msg);
    } finally {
        Console.SetOut(defaultOut);
        Console.SetError(defaultErr);
        // Failure path: flush what was captured so far into the document and release the writer
        // (previously it was only disposed when everything succeeded).
        if (logger != null) {
            logger.Dispose();
        }
    }
    return null;
}
/// <summary>
/// Annotates every row of <paramref name="mdata"/> with information from a FASTA file: header
/// annotations (text columns), sequence-derived numbers, theoretical peptide counts per protease and,
/// optionally, the number of matches of a user-supplied regular expression in the sequence.
/// </summary>
/// <param name="mdata">Matrix to annotate; columns are appended to it.</param>
/// <param name="param">Parameters selecting the protein id column, FASTA file and annotations.</param>
/// <param name="supplTables">Not used by this method.</param>
/// <param name="documents">Not used by this method.</param>
/// <param name="processInfo">
/// Receives status updates; its <c>ErrString</c> is set when the regular expression is invalid.
/// </param>
public void ProcessData(IMatrixData mdata, Parameters param, ref IMatrixData[] supplTables,
    ref IDocumentData[] documents, ProcessInfo processInfo) {
    int proteinIdColumnInd = param.GetParam<int>("Protein IDs").Value;
    // Per row: all ';'-separated ids, and the first ("leading") id on its own.
    string[][] proteinIds = new string[mdata.RowCount][];
    string[][] leadingIds = new string[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        proteinIds[row] = mdata.StringColumns[proteinIdColumnInd][row].Split(';');
        leadingIds[row] = new[] {proteinIds[row][0]};
    }
    string fastaFilePath = param.GetParam<string>("Fasta file").Value;
    Fasta fasta = new Fasta();
    fasta.ParseFile(fastaFilePath, processInfo);

    // Text annotations
    processInfo.Status("Adding fasta header annotations.");
    var headerParam = param.GetParamWithSubParams<int>("Fasta header annotations");
    int[] selection = headerParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
    string[][] idsToBeAnnotated = headerParam.Value == 0 ? proteinIds : leadingIds;
    // Resolve the ids of every row to FASTA entries once; ids without an entry are dropped.
    ProteinSequence[][] fastaEntries = new ProteinSequence[mdata.RowCount][];
    for (int row = 0; row < mdata.RowCount; row++) {
        List<ProteinSequence> rowEntries = new List<ProteinSequence>();
        foreach (string id in idsToBeAnnotated[row]) {
            ProteinSequence entry = fasta.GetEntry(id);
            if (entry != null) {
                rowEntries.Add(entry);
            }
        }
        fastaEntries[row] = rowEntries.ToArray();
    }
    if (ArrayUtils.Contains(selection, 0)) {
        AddFastaTextColumn(mdata, fastaEntries, "Entry name", e => e.EntryName);
    }
    if (ArrayUtils.Contains(selection, 1)) {
        AddFastaTextColumn(mdata, fastaEntries, "Gene name", e => e.GeneName);
    }
    if (ArrayUtils.Contains(selection, 2)) {
        // Verbose protein name, i.e. all protein names annotated in all fasta headers, including the
        // 'Isoform x of...' prefixes and '(Fragment)' suffixes
        AddFastaTextColumn(mdata, fastaEntries, "Protein name (verbose)", e => e.ProteinName);
    }
    if (ArrayUtils.Contains(selection, 3)) {
        AddFastaTextColumn(mdata, fastaEntries, "Protein name", e => e.ConsensusProteinName);
    }
    if (ArrayUtils.Contains(selection, 4)) {
        AddFastaTextColumn(mdata, fastaEntries, "Species", e => e.Species);
    }

    // Numeric annotations
    processInfo.Status("Adding numeric annotations.");
    var numericParam = param.GetParamWithSubParams<int>("Numeric annotations");
    selection = numericParam.GetSubParameters().GetParam<int[]>("Annotations").Value;
    bool annotateLeadingId = numericParam.Value == 1;
    if (ArrayUtils.Contains(selection, 0)) {
        AddFastaNumericColumn(mdata, fastaEntries, "Sequence length", annotateLeadingId,
            e => e.GetSequence().Length);
    }
    if (ArrayUtils.Contains(selection, 1)) {
        AddFastaNumericColumn(mdata, fastaEntries, "Monoisotopic molecular mass", annotateLeadingId,
            e => e.GetMonoisotopicMolecularMass());
    }
    if (ArrayUtils.Contains(selection, 2)) {
        AddFastaNumericColumn(mdata, fastaEntries, "Average molecular mass", annotateLeadingId,
            e => e.GetAverageMolecularMass());
    }

    // Theoretical peptides
    processInfo.Status("Calculating theoretical peptides.");
    var peptideParam = param.GetParamWithSubParams<int>("Calculate theoretical peptides");
    annotateLeadingId = peptideParam.Value == 1;
    Protease[] proteases = ArrayUtils.SubArray(Constants.defaultProteases,
        peptideParam.GetSubParameters().GetParam<int[]>("Proteases").Value);
    double minLength = peptideParam.GetSubParameters().GetParam<double>("Min. peptide length").Value;
    double maxLength = peptideParam.GetSubParameters().GetParam<double>("Max. peptide length").Value;
    // "Show sequences" is only read when the leading id is annotated (short-circuit preserved).
    bool displayPeptideSequences = annotateLeadingId &&
        peptideParam.GetSubParameters().GetParam<bool>("Show sequences").Value;
    foreach (Protease protease in proteases) {
        double[] annotationColumn = new double[mdata.RowCount];
        string[] peptideColumn = new string[mdata.RowCount];
        for (int row = 0; row < mdata.RowCount; row++) {
            List<double> rowAnnotations = new List<double>();
            List<string> rowPeptides = new List<string>();
            foreach (ProteinSequence entry in fastaEntries[row]) {
                double nTheoreticalPeptides =
                    entry.GetNumberOfTheoreticalPeptides(protease, (int) minLength, (int) maxLength);
                rowAnnotations.Add(nTheoreticalPeptides);
                if (displayPeptideSequences) {
                    rowPeptides.AddRange(
                        entry.GetTheoreticalPeptideSequences(protease, (int) minLength, (int) maxLength));
                }
                if (annotateLeadingId) {
                    break;
                }
            }
            annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
            peptideColumn[row] = String.Join(";", rowPeptides);
        }
        mdata.AddNumericColumn(
            "Number of theoretical peptides (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
            annotationColumn);
        if (displayPeptideSequences) {
            mdata.AddStringColumn(
                "Theoretical peptide sequences (" + protease.name + ", " + minLength + "-" + maxLength + ")", "",
                peptideColumn);
        }
    }

    // Sequence features
    processInfo.Status("Counting sequence features.");
    var featureParam = param.GetParamWithSubParams<int>("Count sequence features");
    annotateLeadingId = featureParam.Value == 1;
    bool normalizeBySequenceLength =
        featureParam.GetSubParameters().GetParam<bool>("Normalize by sequence length").Value;
    string pattern = featureParam.GetSubParameters().GetParam<string>("Regex").Value;
    if (pattern != "") {
        Regex regex;
        try {
            regex = new Regex(pattern);
        } catch (ArgumentException) {
            processInfo.ErrString = "The regular expression you provided has invalid syntax.";
            return;
        }
        AddFastaNumericColumn(mdata, fastaEntries,
            (normalizeBySequenceLength ? "Normalized feature count (" : "Feature count (") + regex + ")",
            annotateLeadingId, e => {
                double nFeatures = regex.Matches(e.GetSequence()).Count;
                return normalizeBySequenceLength ? nFeatures/e.GetLength() : nFeatures;
            });
    }
    processInfo.Status("Done.");
}

/// <summary>
/// Appends a string column holding, per row, the distinct non-null values of
/// <paramref name="getValue"/> over the row's FASTA entries, joined with ';'.
/// </summary>
private static void AddFastaTextColumn(IMatrixData mdata, ProteinSequence[][] fastaEntries, string columnName,
    Func<ProteinSequence, string> getValue) {
    string[] annotationColumn = new string[mdata.RowCount];
    for (int row = 0; row < mdata.RowCount; row++) {
        List<string> rowAnnotations = new List<string>();
        foreach (ProteinSequence entry in fastaEntries[row]) {
            string value = getValue(entry);
            if (value != null && !ArrayUtils.Contains(rowAnnotations, value)) {
                rowAnnotations.Add(value);
            }
        }
        annotationColumn[row] = string.Join(";", rowAnnotations.ToArray());
    }
    mdata.AddStringColumn(columnName, "", annotationColumn);
}

/// <summary>
/// Appends a numeric column holding, per row, <c>ArrayUtils.Median</c> of <paramref name="getValue"/>
/// over the row's FASTA entries; with <paramref name="firstEntryOnly"/> only the first entry is used.
/// </summary>
private static void AddFastaNumericColumn(IMatrixData mdata, ProteinSequence[][] fastaEntries, string columnName,
    bool firstEntryOnly, Func<ProteinSequence, double> getValue) {
    double[] annotationColumn = new double[mdata.RowCount];
    for (int row = 0; row < mdata.RowCount; row++) {
        List<double> rowAnnotations = new List<double>();
        foreach (ProteinSequence entry in fastaEntries[row]) {
            rowAnnotations.Add(getValue(entry));
            if (firstEntryOnly) {
                break;
            }
        }
        // NOTE(review): rows without any FASTA entry pass an empty array to Median - confirm its result.
        annotationColumn[row] = ArrayUtils.Median(rowAnnotations.ToArray());
    }
    mdata.AddNumericColumn(columnName, "", annotationColumn);
}