/// <summary>
/// Retrieves a list of LC-MS features by running the tree-based feature finder
/// over the deisotoped features in <paramref name="featureFile"/>, using
/// <paramref name="rawFile"/> for raw spectra.
/// </summary>
/// <param name="rawFile">Path to the instrument raw data file.</param>
/// <param name="featureFile">Path to the deisotoped (MS feature) file.</param>
/// <returns>The LC-MS features produced by the feature finder.</returns>
public List<UMCLight> FindFeatures(string rawFile, string featureFile)
{
    // Register the feature file with a dataset so the loader can resolve it.
    var info = new DatasetInformation();
    info.InputFiles.Add(new InputFile { Path = featureFile, FileType = InputFileType.Features });

    var finder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);
    var tolerances = new FeatureTolerances
    {
        Mass = 8,    // ppm mass tolerance — presumably; confirm against FeatureTolerances docs
        Net = .005   // normalized elution time tolerance
    };
    var options = new LcmsFeatureFindingOptions(tolerances);

    // Load the MS features referenced by the dataset information.
    var msFeatures = UmcLoaderFactory.LoadMsFeatureData(info.Features.Path);

    // FIX: the original also opened rawFile with a second, unused
    // InformedProteomicsReader and never disposed this provider; the redundant
    // reader is removed and the provider is now disposed deterministically.
    using (var provider = RawLoaderFactory.CreateFileReader(rawFile))
    {
        provider.AddDataFile(rawFile, 0);
        return finder.FindFeatures(msFeatures, options, provider);
    }
}
/// <summary>
/// Smoke test: reads a Promex ms1ft feature file backed by a PBF raw file
/// and prints the number of features found.
/// </summary>
public void TestPromexFileReading()
{
    const int datasetId = 0;

    // Raw-data reader for the PBF file that backs the Promex results.
    var rawReader = new InformedProteomicsReader(datasetId, pbf1);
    var featureReader = new PromexFileReader(rawReader, datasetId);

    var loadedFeatures = featureReader.ReadFile(ms1ft1);
    Console.WriteLine(loadedFeatures.Count());
}
/// <summary>
/// Returns a spectra provider for the given file path, caching one provider
/// per path so repeated requests reuse the same reader.
/// </summary>
/// <param name="path">Path to the raw data file.</param>
/// <returns>The cached (or newly created) spectra provider.</returns>
private static ISpectraProvider GetProvider(string path)
{
    // Lazily create the path -> provider cache on first use.
    if (mRawDataProviders == null)
    {
        mRawDataProviders = new Dictionary<string, ISpectraProvider>();
    }

    ISpectraProvider cached;
    if (mRawDataProviders.TryGetValue(path, out cached))
    {
        return cached;
    }

    var created = new InformedProteomicsReader(0, path);
    mRawDataProviders.Add(path, created);
    return created;
}
/// <summary>
/// Runs pairwise spectral alignment comparisons across every pair of cached
/// datasets (*.mscache) found in <paramref name="directory"/> and writes a
/// figure report per comparison.
/// </summary>
/// <param name="directory">Directory containing *.mscache scan-summary caches.</param>
/// <param name="comparerType">Spectral comparison algorithm to use.</param>
/// <param name="mzBinSize">m/z bin size for spectral comparison.</param>
/// <param name="mzTolerance">m/z tolerance.</param>
/// <param name="netTolerance">Normalized elution time tolerance.</param>
/// <param name="similarityScoreCutoff">Minimum similarity score to keep a match.</param>
/// <param name="peptideScore">Peptide identification score cutoff.</param>
/// <param name="peptideFdr">Peptide FDR cutoff.</param>
/// <param name="ionPercent">Top-ion percent used when comparing spectra.</param>
/// <param name="numberOfRequiredPeaks">Minimum peak count required per spectrum.</param>
/// <param name="name">Base name for the per-comparison report files.</param>
public void GenerateFigure4_MetaMatches(string directory,
    SpectralComparison comparerType,
    double mzBinSize,
    double mzTolerance,
    double netTolerance,
    double similarityScoreCutoff,
    double peptideScore,
    double peptideFdr,
    double ionPercent,
    int numberOfRequiredPeaks,
    string name)
{
    AlignmentAnalysisWriterFactory.BasePath = @"M:\doc\papers\paperAlignment\Data\figure4";
    Console.WriteLine(@"Post-Pre Tests For {0}", directory);

    var cacheFiles = Directory.GetFiles(directory, "*.mscache");
    Console.WriteLine(@"Building data cache");
    var data = cacheFiles.Select(path => new FigureBase.PathCache { Cache = path }).ToList();

    // The options for the analysis
    var options = new SpectralOptions
    {
        MzBinSize = mzBinSize,
        MzTolerance = mzTolerance,
        NetTolerance = netTolerance,
        SimilarityCutoff = similarityScoreCutoff,
        TopIonPercent = ionPercent,
        IdScore = peptideScore,
        ComparerType = comparerType,
        Fdr = peptideFdr,
        RequiredPeakCount = numberOfRequiredPeaks
    };

    var comparison = 0;
    for (var i = 0; i < data.Count; i++)
    {
        var cachex = data[i];

        // Get the raw path stored in the cache file, then build the dataset object.
        var rawPathX = ScanSummaryCache.ReadPath(cachex.Cache);
        var datasetX = new AlignmentDataset(rawPathX, "", cachex.Msgf);

        // Create a raw file reader for the baseline dataset.
        using (var readerX = new InformedProteomicsReader())
        {
            // Wrap the reader in a cache so scan meta-data is served from the .mscache file.
            var cacheReaderX = new RawLoaderCache(readerX);
            var cacheDataX = ScanSummaryCache.ReadCache(cachex.Cache);
            readerX.AddDataFile(rawPathX, 0);
            cacheReaderX.AddCache(0, cacheDataX);

            for (var j = i + 1; j < data.Count; j++)
            {
                var cachey = data[j];

                // Get the raw path stored in the cache file, then build the dataset object.
                var rawPathY = ScanSummaryCache.ReadPath(cachey.Cache);
                var datasetY = new AlignmentDataset(rawPathY, "", cachey.Msgf);

                // Create a raw file reader for the alignee dataset.
                using (var readerY = new InformedProteomicsReader())
                {
                    // The writer for creating a per-comparison report.
                    var writer = AlignmentAnalysisWriterFactory.Create(AlignmentFigureType.Figure3, name + comparison);
                    comparison++;

                    // Wrap the reader in a cache so scan meta-data is served from the .mscache file.
                    var cacheReaderY = new RawLoaderCache(readerY);
                    var cacheDataY = ScanSummaryCache.ReadCache(cachey.Cache);
                    cacheReaderY.AddCache(0, cacheDataY);
                    readerY.AddDataFile(rawPathY, 0);

                    var names = new List<string> { data[i].Cache, data[j].Cache };

                    // FIX: pass the cached readers (not the bare readers) so the
                    // pre-loaded scan summaries are actually used; the original built
                    // the caches and then ignored them, unlike GenerateFigure3_Matches.
                    var analysis = MatchDatasets(comparerType, cacheReaderX, cacheReaderY, options, datasetX, datasetY, names);
                    AlignMatches(analysis, writer);
                    writer.Close();
                }
            }
        }
    }
}
/// <summary>
/// Initializes a new <see cref="PromexFileReader"/> bound to a raw-data reader
/// and the dataset the features belong to.
/// </summary>
/// <param name="reader">Reader supplying raw spectra for the dataset.</param>
/// <param name="datasetId">Identifier of the dataset being read.</param>
public PromexFileReader(InformedProteomicsReader reader, int datasetId)
{
    this.datasetId = datasetId;
    this.reader = reader;
}
/// <summary>
/// Loads a single spectrum from the raw file at the given path.
/// </summary>
/// <param name="path">Path to the raw data file.</param>
/// <param name="scan">Scan number of the spectrum to retrieve.</param>
/// <returns>The spectrum for the requested scan.</returns>
private MSSpectra GetSpectrum(string path, int scan)
{
    // FIX: dispose the reader (ISpectraProvider is IDisposable); the original
    // leaked it. Assumes the overload reads the spectrum eagerly before
    // returning — NOTE(review): confirm GetSpectrum(provider, scan, group)
    // does not hold the reader after it returns.
    using (ISpectraProvider reader = new InformedProteomicsReader(0, path))
    {
        return GetSpectrum(reader, scan, 0);
    }
}
/// <summary>
/// Rebuilds each UMC feature's MS features from an extracted ion chromatogram (XIC)
/// read from the raw data, then associates MS/MS (fragmentation) scans with the
/// rebuilt MS features.
/// </summary>
/// <param name="features">Features to re-profile; sorted in place by m/z.</param>
/// <param name="massError">Mass error used when building XIC targets.</param>
/// <param name="provider">Raw data reader exposing the underlying LcMsRun.</param>
/// <param name="refine">When true, trims each XIC with an XicRefiner before use.</param>
/// <param name="progress">Optional progress sink; reported roughly every 100 features.</param>
/// <returns>The features whose XICs had at least 3 points, with rebuilt MS features.</returns>
public IEnumerable<UMCLight> CreateXicNew(List<UMCLight> features, double massError, InformedProteomicsReader provider, bool refine = true, IProgress<ProgressData> progress = null)
{
    var progressData = new ProgressData(progress);
    int id = 0, count = 0;
    int msmsFeatureId = 0;
    var resultFeatures = new List<UMCLight> { Capacity = features.Count };
    var ipr = provider.LcMsRun;
    // Enlarge the precursor chromatogram cache since we stream many XICs in m/z order.
    ipr.HigherPrecursorChromatogramCacheSize = 2000;
    // Sorting by m/z keeps successive XIC reads close together (cache-friendly).
    features.Sort((x, y) => x.Mz.CompareTo(y.Mz));

    // Iterate over XIC targets.
    foreach (var xicTarget in CreateXicTargetsYield(features, massError))
    {
        count++;
        // Read the XIC centered on the midpoint of the target's scan range.
        var target = xicTarget.StartScan + ((xicTarget.EndScan - xicTarget.StartScan) / 2);
        var xic = ipr.GetPrecursorExtractedIonChromatogram(xicTarget.LowMz, xicTarget.HighMz, target);
        if (refine)
        {
            var xicRefiner = this.XicRefiner ?? new XicRefiner();
            xic = xicRefiner.RefineXic(xic);
        }
        // Too few points to form a meaningful elution profile — drop the feature.
        if (xic.Count < 3)
        {
            continue;
        }

        // Elution-time span of the whole run, used to normalize NET values below.
        var minEt = ipr.GetElutionTime(ipr.MinLcScan);
        var maxEt = ipr.GetElutionTime(ipr.MaxLcScan);
        var diffEt = maxEt - minEt;

        // Replace the feature's MS features with one per XIC point.
        xicTarget.Feature.MsFeatures.Clear();
        foreach (var point in xic)
        {
            xicTarget.Feature.AddChildFeature(new MSFeatureLight
            {
                ChargeState = xicTarget.ChargeState,
                Mz = xicTarget.Mz,
                MassMonoisotopic = xicTarget.Feature.MassMonoisotopic,
                Scan = point.ScanNum,
                Abundance = Convert.ToInt64(point.Intensity),
                Id = id++,
                DriftTime = xicTarget.Feature.DriftTime,
                // NET normalized to [0, 1] over the run's elution-time span.
                Net = (ipr.GetElutionTime(point.ScanNum) - minEt) / diffEt,
                GroupId = xicTarget.Feature.GroupId
            });
        }

        // Associate MS/MS information. A single cursor j walks the (presumably
        // scan-ordered) ms2 scan list once while i walks the MS features.
        var ms2Scans = ipr.GetFragmentationSpectraScanNums(xicTarget.Feature.Mz).ToArray();
        int j = 0;
        for (int i = 0; i < xicTarget.Feature.MsFeatures.Count; i++)
        {
            for (; j < ms2Scans.Length; j++)
            {
                // Scan below UMC feature scan range — stop and try the next MS feature.
                // NOTE(review): break (not continue) here means j is not advanced past
                // this scan; verify this is the intended cursor behavior.
                if (ms2Scans[j] < xicTarget.Feature.MsFeatures[i].Scan)
                {
                    break;
                }
                // Haven't reached the last ms2 scan and ms2 scan is larger than next feature,
                // could be associated with next feature
                if (i < xicTarget.Feature.MsFeatures.Count - 1 && ms2Scans[j] > xicTarget.Feature.MsFeatures[i + 1].Scan)
                {
                    break;
                }
                // We're on the last MSFeature - is the MS/MS scan actually for this feature?
                if (i == xicTarget.Feature.MsFeatures.Count - 1 && ipr.GetPrevScanNum(ms2Scans[j], 1) != xicTarget.Feature.MsFeatures[i].Scan)
                {
                    continue;
                }
                // Otherwise this is a MS/MS we want to add!
                var spectraData = new MSSpectra
                {
                    Id = msmsFeatureId++,
                    ScanMetaData = new ScanSummary
                    {
                        MsLevel = 2,
                        Scan = ms2Scans[j],
                        PrecursorMz = xicTarget.Feature.MsFeatures[i].Mz,
                    },
                    CollisionType = CollisionType.None,
                    Scan = ms2Scans[j],
                    PrecursorMz = xicTarget.Feature.MsFeatures[i].Mz
                };
                xicTarget.Feature.MsFeatures[i].MSnSpectra.Add(spectraData);
            }
        }

        resultFeatures.Add(xicTarget.Feature);
        // Throttled progress reporting (every 100 features and near the end).
        if (count % 100 == 0 || count == features.Count - 1)
        {
            progressData.Report(count, features.Count);
        }
    }

    return (resultFeatures);
}
/// <summary>
/// Walkthrough/demo: loads every cluster from a MultiAlign analysis database,
/// reconstructs their features and MS/MS metadata, and shows (in a disabled
/// branch) how to pull the raw MS/MS spectra for each one.
/// </summary>
/// <param name="databasePath">Path to the MultiAlign SQLite/NHibernate database.</param>
/// <param name="indexDatabase">When true, index the cluster/feature tables first.</param>
public void CreateUMCClusterLight(string databasePath, bool indexDatabase)
{
    // If the database is not indexed then do so... but before the session to the db is opened.
    if (indexDatabase)
    {
        DatabaseIndexer.IndexClusters(databasePath);
        DatabaseIndexer.IndexFeatures(databasePath);
    }

    // This is a factory based method that creates a set of data access providers used throughout MultiAlign
    var providers = DataAccessFactory.CreateDataAccessProviders(databasePath, false);

    // If you just wanted the clusters you could do this:
    // 1. Connect to the database
    //NHibernateUtil.ConnectToDatabase(databasePath, false);
    // 2. Then extract all of the clusters
    //IUmcClusterDAO clusterCache = new UmcClusterDAOHibernate();
    //List<UMCClusterLight> clusters = clusterCache.FindAll();
    var clusters = providers.ClusterCache.FindAll();

    // Flags controlling how much of each cluster is reconstructed.
    var shouldGetMsFeatures = true;
    var shouldGetMsMsFeatures = true;
    var shouldGetRawData = false;  // raw-spectra branch below is disabled by default

    // This gets all of the dataset information and maps to a dictionary... if you want the raw data
    // otherwise comment this out.
    var datasets = providers.DatasetCache.FindAll();
    var datasetMap = new Dictionary<int, DatasetInformation>();
    datasets.ForEach(x => datasetMap.Add(x.DatasetId, x));

    foreach (var cluster in clusters)
    {
        cluster.ReconstructUMCCluster(providers, true, false, shouldGetMsFeatures, shouldGetMsMsFeatures);
        foreach (var feature in cluster.Features)
        {
            foreach (var msFeature in feature.Features)
            {
                foreach (var spectrumMetaData in msFeature.MSnSpectra)
                {
                    // Then you can do stuff with the ms/ms spectra.
                    // If you had the path to the raw file, you could create a reader to extract the MS/MS spectra.
                    // This supports mzXML and .RAW Thermo files based on the file extension.
                    if (shouldGetRawData)
                    {
                        DatasetInformation info = null;
                        var hasKey = datasetMap.TryGetValue(spectrumMetaData.GroupId, out info);
                        if (hasKey)
                        {
                            if (info.RawFile != null)
                            {
                                // This might seem kind of klunky, but it's called a bridge, this way I can access
                                // MS/MS spectra from PNNLOmics without having to reference any of the Thermo DLL's
                                // Nor support file reading capability. This is also nice because I don't have to load
                                // several MS/MS spectra when analyzing large datasets for my spectral clustering work.
                                // NOTE(review): this reader is not disposed; fine for a demo, but wrap in
                                // using(...) if this branch is ever enabled in production code.
                                var rawReader = new InformedProteomicsReader(spectrumMetaData.GroupId, info.RawFile.Path);

                                // Then grab the actual spectrum...
                                var summary = new ScanSummary();
                                var spectrum = rawReader.GetRawSpectra(spectrumMetaData.Scan, 2, out summary);

                                // Then do what you want...
                                // Profit???
                            }
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Runs the MultiAlign analysis: finds features for the baseline dataset, then for
/// each alignee dataset finds features, clusters the combined set before alignment,
/// aligns alignee to baseline, re-clusters, and reports pre- vs. post-alignment
/// cluster agreement.
/// </summary>
/// <param name="baselineDataset">Dataset all others are aligned against.</param>
/// <param name="aligneeDatasets">Datasets to align to the baseline.</param>
/// <param name="featureFindingOptions">Options for LC-MS feature finding.</param>
/// <param name="msFilterOptions">Filters applied to raw MS features before feature finding.</param>
/// <param name="lcmsFilterOptions">LC-MS feature filter options (currently unused here).</param>
/// <param name="peptideOptions">FDR / ID-score cutoffs used when linking peptides.</param>
/// <param name="featureFinder">Feature finding algorithm.</param>
/// <param name="aligner">Dataset alignment algorithm.</param>
/// <param name="clusterer">Feature clustering algorithm.</param>
/// <param name="matchPath">Output path for match results.</param>
/// <param name="errorPath">Output path for alignment error results.</param>
public void PerformMultiAlignAnalysis(DatasetInformation baselineDataset,
    IEnumerable<DatasetInformation> aligneeDatasets,
    LcmsFeatureFindingOptions featureFindingOptions,
    MsFeatureFilteringOptions msFilterOptions,
    LcmsFeatureFilteringOptions lcmsFilterOptions,
    SpectralOptions peptideOptions,
    MultiAlignCore.Algorithms.FeatureFinding.IFeatureFinder featureFinder,
    IFeatureAligner<IEnumerable<UMCLight>, IEnumerable<UMCLight>, AlignmentData> aligner,
    IClusterer<UMCLight, UMCClusterLight> clusterer,
    string matchPath,
    string errorPath)
{
    UpdateStatus("Loading baseline features.");
    var msFeatures = UmcLoaderFactory.LoadMsFeatureData(baselineDataset.Features.Path);
    msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilterOptions);

    // Load the baseline reference set
    using (var rawProviderX = new InformedProteomicsReader())
    {
        rawProviderX.AddDataFile(baselineDataset.RawFile.Path, 0);
        UpdateStatus("Creating Baseline LCMS Features.");
        var baselineFeatures = featureFinder.FindFeatures(msFeatures, featureFindingOptions, rawProviderX);
        LinkPeptidesToFeatures(baselineDataset.Sequence.Path, baselineFeatures, peptideOptions.Fdr, peptideOptions.IdScore);
        var providerX = new CachedFeatureSpectraProvider(rawProviderX, baselineFeatures);

        // Then load each alignee dataset
        foreach (var dataset in aligneeDatasets)
        {
            var aligneeMsFeatures = UmcLoaderFactory.LoadMsFeatureData(dataset.Features.Path);
            aligneeMsFeatures = LcmsFeatureFilters.FilterMsFeatures(aligneeMsFeatures, msFilterOptions);
            using (var rawProviderY = new InformedProteomicsReader())
            {
                rawProviderY.AddDataFile(dataset.RawFile.Path, 0);
                UpdateStatus("Finding alignee features");
                var aligneeFeatures = featureFinder.FindFeatures(aligneeMsFeatures, featureFindingOptions, rawProviderY);
                LinkPeptidesToFeatures(dataset.Sequence.Path, aligneeFeatures, peptideOptions.Fdr, peptideOptions.IdScore);
                var providerY = new CachedFeatureSpectraProvider(rawProviderY, aligneeFeatures);

                // Cluster before we do anything else....
                var allFeatures = new List<UMCLight>();
                allFeatures.AddRange(baselineFeatures);
                allFeatures.AddRange(aligneeFeatures);
                foreach (var feature in allFeatures)
                {
                    // NOTE(review): self-assignment — likely meant to seed an aligned
                    // property (e.g. NetAligned = Net) like the mass line below; confirm.
                    feature.Net = feature.Net;
                    feature.MassMonoisotopicAligned = feature.MassMonoisotopic;
                }

                // This tells us the cluster agreement before we align.
                var clusters = clusterer.Cluster(allFeatures);
                var preAlignment = AnalyzeClusters(clusters);

                aligner.AligneeSpectraProvider = providerY;
                aligner.BaselineSpectraProvider = providerX;

                UpdateStatus("Aligning data");
                // Align alignee features to the baseline.
                var data = aligner.Align(baselineFeatures, aligneeFeatures);
                var matches = data.Matches;
                WriteErrors(errorPath, matches);

                // Create anchor points for LCMSWarp alignment.
                // NOTE(review): massPoints/netPoints are built but not consumed in this
                // method — presumably for downstream use; confirm or remove.
                var massPoints = new List<RegressionPoint>();
                var netPoints = new List<RegressionPoint>();
                foreach (var match in matches)
                {
                    var massError = FeatureLight.ComputeMassPPMDifference(match.AnchorPointX.Mz, match.AnchorPointY.Mz);
                    var netError = match.AnchorPointX.Net - match.AnchorPointY.Net;
                    var massPoint = new RegressionPoint(match.AnchorPointX.Mz, 0, massError, netError);
                    massPoints.Add(massPoint);
                    var netPoint = new RegressionPoint(match.AnchorPointX.Net, 0, massError, netError);
                    netPoints.Add(netPoint);
                }

                // Reset cluster membership so the post-alignment clustering starts fresh.
                foreach (var feature in allFeatures)
                {
                    feature.UmcCluster = null;
                    feature.ClusterId = -1;
                }

                // Then cluster after alignment!
                UpdateStatus("clustering data");
                clusters = clusterer.Cluster(allFeatures);
                var postAlignment = AnalyzeClusters(clusters);

                UpdateStatus("Note\tSame\tDifferent");
                UpdateStatus(string.Format("Pre\t{0}\t{1}", preAlignment.SameCluster, preAlignment.DifferentCluster));
                UpdateStatus(string.Format("Post\t{0}\t{1}", postAlignment.SameCluster, postAlignment.DifferentCluster));
                SaveMatches(matchPath, matches);
            }
        }
    }

    DeRegisterProgressNotifier(aligner);
    DeRegisterProgressNotifier(featureFinder);
    DeRegisterProgressNotifier(clusterer);
}
/// <summary>
/// Aligns each consecutive pair of datasets from an analysis database and writes
/// per-pair match and error statistics to the output directory.
/// </summary>
/// <param name="relativeDatabasePath">Database path, relative to the test data root.</param>
/// <param name="relativeName">Output directory name, relative to the output root.</param>
/// <param name="name">Base name for the per-pair match/error files.</param>
/// <param name="alignmentType">Alignment algorithm to use.</param>
/// <param name="clusterType">Clustering algorithm to use.</param>
public void GenerateClusterAlignmentStatistics(string relativeDatabasePath, string relativeName, string name, FeatureAlignmentType alignmentType, LcmsFeatureClusteringAlgorithmType clusterType)
{
    var databasePath = GetPath(relativeDatabasePath);
    var outputPath = GetOutputPath(relativeName);
    if (!Directory.Exists(outputPath))
    {
        Directory.CreateDirectory(outputPath);
    }

    // Connect to the NHibernate database
    var providers = DataAccessFactory.CreateDataAccessProviders(databasePath, false);

    // Setup our alignment options
    var alignmentOptions = new AlignmentOptions();
    var spectralOptions = new SpectralOptions
    {
        ComparerType = SpectralComparison.CosineDotProduct,
        Fdr = .01,
        IdScore = 1e-09,
        MzBinSize = .5,
        MzTolerance = .5,
        NetTolerance = .1,
        RequiredPeakCount = 32,
        SimilarityCutoff = .75,
        TopIonPercent = .8
    };

    // Instrument-derived tolerances for clustering.
    var instrumentOptions = InstrumentPresetFactory.Create(InstrumentPresets.LtqOrbitrap);
    var featureTolerances = new FeatureTolerances
    {
        Mass = instrumentOptions.Mass + 6,  // NOTE(review): +6 widens the preset mass tolerance — confirm intent
        Net = instrumentOptions.NetTolerance,
        DriftTime = instrumentOptions.DriftTimeTolerance
    };

    UpdateStatus("Retrieving all datasets for test.");
    var datasets = providers.DatasetCache.FindAll();

    // Create our algorithms
    var aligner = FeatureAlignerFactory.CreateDatasetAligner(alignmentType, alignmentOptions.LCMSWarpOptions, spectralOptions);
    var clusterer = ClusterFactory.Create(clusterType);
    clusterer.Parameters = new FeatureClusterParameters<UMCLight> { Tolerances = featureTolerances };

    // NOTE(review): notifiers are registered but never deregistered in this method
    // (compare PerformMultiAlignAnalysis, which calls DeRegisterProgressNotifier).
    RegisterProgressNotifier(aligner);
    RegisterProgressNotifier(clusterer);

    // Align dataset i+1 (alignee) to dataset i (baseline) for each consecutive pair.
    for (var i = 0; i < datasets.Count - 1; i++)
    {
        var matchPath = string.Format("{0}-{1}-matches.txt", name, i);
        var errorPath = string.Format("{0}-{1}-errors.txt", name, i);
        matchPath = Path.Combine(outputPath, matchPath);
        errorPath = Path.Combine(outputPath, errorPath);

        var aligneeDataset = datasets[i + 1];
        var baselineDataset = datasets[i];

        // Load the baseline reference set
        using (var rawProviderX = new InformedProteomicsReader())
        {
            rawProviderX.AddDataFile(baselineDataset.RawFile.Path, 0);
            // Load the alignee set
            using (var rawProviderY = new InformedProteomicsReader())
            {
                rawProviderY.AddDataFile(aligneeDataset.RawFile.Path, 0);

                var baselineFeatures = RetrieveFeatures(baselineDataset.DatasetId, providers);
                var aligneeFeatures = RetrieveFeatures(aligneeDataset.DatasetId, providers);
                var providerX = new CachedFeatureSpectraProvider(rawProviderX, baselineFeatures);
                var providerY = new CachedFeatureSpectraProvider(rawProviderY, aligneeFeatures);

                AlignDatasets(baselineFeatures, aligneeFeatures, providerX, providerY, aligner, clusterer, matchPath, errorPath);
            }
        }
    }
}
/// <summary>
/// Runs pairwise spectral alignment comparisons over every pair of datasets in
/// <paramref name="directory"/> that have both a scan cache (*.mscache) and an
/// MSGF+ results file (*_msgfdb_fht.txt), writing a figure report per pair.
/// </summary>
/// <param name="directory">Directory containing the cache and MSGF+ files.</param>
/// <param name="comparerType">Spectral comparison algorithm to use.</param>
/// <param name="mzBinSize">m/z bin size for spectral comparison.</param>
/// <param name="mzTolerance">m/z tolerance.</param>
/// <param name="netTolerance">Normalized elution time tolerance.</param>
/// <param name="similarityScoreCutoff">Minimum similarity score to keep a match.</param>
/// <param name="peptideScore">Peptide identification score cutoff.</param>
/// <param name="peptideFdr">Peptide FDR cutoff.</param>
/// <param name="ionPercent">Top-ion percent used when comparing spectra.</param>
/// <param name="numberOfRequiredPeaks">Minimum peak count required per spectrum.</param>
public void GenerateFigure3_Matches(string directory,
    SpectralComparison comparerType,
    double mzBinSize,
    double mzTolerance,
    double netTolerance,
    double similarityScoreCutoff,
    double peptideScore,
    double peptideFdr,
    double ionPercent,
    int numberOfRequiredPeaks)
{
    AlignmentAnalysisWriterFactory.BasePath = @"M:\doc\papers\paperAlignment\Data\figure4";
    Console.WriteLine(@"Post-Pre Tests For {0}", directory);

    var cacheFiles = Directory.GetFiles(directory, "*.mscache");
    var msgfFiles = Directory.GetFiles(directory, "*_msgfdb_fht.txt");
    Console.WriteLine(@"Building data cache");

    // Pair each MSGF+ results file with its matching scan cache file.
    var map = cacheFiles.ToDictionary<string, string, FigureBase.PathCache>(path => path.ToLower(), path => null);
    var data = (from path in msgfFiles
                let name = path.ToLower().Replace("_msgfdb_fht.txt", ".mscache")
                let newName = Path.Combine(directory, name)
                let features = Path.Combine(directory, name)
                where map.ContainsKey(newName)
                select new FigureBase.PathCache { Cache = newName, Msgf = path, Features = features }).ToList();

    // The options for the analysis
    var options = new SpectralOptions
    {
        MzBinSize = mzBinSize,
        MzTolerance = mzTolerance,
        NetTolerance = netTolerance,
        SimilarityCutoff = similarityScoreCutoff,
        TopIonPercent = ionPercent,
        IdScore = peptideScore,
        ComparerType = comparerType,
        Fdr = peptideFdr,
        RequiredPeakCount = numberOfRequiredPeaks
    };

    Console.WriteLine(@"{0}", data.Count);
    var comparison = 0;
    for (var i = 0; i < data.Count; i++)
    {
        var cachex = data[i];

        // Get the raw path stored in the cache file, then build the dataset object.
        var rawPathX = ScanSummaryCache.ReadPath(cachex.Cache);
        var datasetX = new AlignmentDataset(rawPathX, "", cachex.Msgf);

        // Create a raw file reader for the baseline dataset.
        using (var readerX = new InformedProteomicsReader())
        {
            // Wrap the reader in a cache so scan meta-data is served from the .mscache file.
            var cacheReaderX = new RawLoaderCache(readerX);
            var cacheDataX = ScanSummaryCache.ReadCache(cachex.Cache);
            readerX.AddDataFile(rawPathX, 0);
            cacheReaderX.AddCache(0, cacheDataX);

            for (var j = i + 1; j < data.Count; j++)
            {
                // The writer for creating a per-comparison report.
                var writer = AlignmentAnalysisWriterFactory.Create(AlignmentFigureType.Figure3, "results-figure3-largeScale" + comparison);
                comparison++;

                var cachey = data[j];

                // Get the raw path stored in the cache file, then build the dataset object.
                var rawPathY = ScanSummaryCache.ReadPath(cachey.Cache);
                var datasetY = new AlignmentDataset(rawPathY, "", cachey.Msgf);

                // Create a raw file reader for the alignee dataset.
                using (var readerY = new InformedProteomicsReader())
                {
                    // Wrap the reader in a cache so scan meta-data is served from the .mscache file.
                    var cacheReaderY = new RawLoaderCache(readerY);
                    var cacheDataY = ScanSummaryCache.ReadCache(cachey.Cache);
                    cacheReaderY.AddCache(0, cacheDataY);
                    readerY.AddDataFile(rawPathY, 0);

                    var names = new List<string> { data[i].Cache, data[j].Cache };

                    // Write the results
                    var analysis = MatchDatasets(comparerType, cacheReaderX, cacheReaderY, options, datasetX, datasetY, names);
                    AlignMatches(analysis, writer);
                    // FIX: the original never closed the writer, so per-comparison
                    // output could stay unflushed; GenerateFigure4_MetaMatches closes it.
                    writer.Close();
                }
            }
        }
    }
}
/// <summary>
/// Exports all MS/MS spectra belonging to a cluster's features into a single file,
/// loading each spectrum's peaks from the appropriate dataset's raw file.
/// </summary>
/// <param name="cluster">Cluster whose features' MS/MS spectra are exported.</param>
/// <param name="path">Output file path; the first write overwrites, later writes append.</param>
/// <param name="datasets">Dataset metadata used to locate each feature's raw file.</param>
/// <param name="writer">Writer that formats and persists the spectra.</param>
public static void ExportMsMs(this UMCClusterLight cluster, string path, List<DatasetInformation> datasets, IMsMsSpectraWriter writer)
{
    // Map dataset ids to metadata so raw file paths resolve in O(1).
    var readers = new Dictionary<int, ISpectraProvider>();
    var information = new Dictionary<int, DatasetInformation>();
    datasets.ForEach(x => information.Add(x.DatasetId, x));

    try
    {
        // Lazily open one raw-file reader per dataset actually referenced by the
        // cluster — each feature may come from a different raw data source.
        foreach (var feature in cluster.Features)
        {
            if (readers.ContainsKey(feature.GroupId))
            {
                continue;
            }
            if (!information.ContainsKey(feature.GroupId))
            {
                continue;
            }
            var singleInfo = information[feature.GroupId];
            if (singleInfo.Raw == null || singleInfo.RawPath == null)
            {
                continue;
            }
            // Skip datasets whose raw file is missing on disk.
            if (!File.Exists(singleInfo.RawPath))
            {
                continue;
            }
            var provider = new InformedProteomicsReader();
            provider.AddDataFile(singleInfo.RawPath, feature.GroupId);
            readers.Add(feature.GroupId, provider);
        }

        // The first write overwrites any existing file; subsequent writes append.
        var firstWrite = true;
        foreach (var feature in cluster.Features)
        {
            if (!readers.ContainsKey(feature.GroupId))
            {
                continue;
            }
            var provider = readers[feature.GroupId];
            foreach (var msFeature in feature.MsFeatures)
            {
                // Populate each spectrum's peaks and scan metadata from the raw file.
                foreach (var spectrum in msFeature.MSnSpectra)
                {
                    var summary = new ScanSummary();
                    var data = provider.GetRawSpectra(spectrum.Scan, spectrum.GroupId, out summary);
                    spectrum.Peaks = data;
                    spectrum.ScanMetaData = summary;
                }
                if (firstWrite)
                {
                    writer.Write(path, msFeature.MSnSpectra);
                    // FIX: the original never cleared this flag, so every MS feature
                    // re-overwrote the file; only the first write should truncate.
                    firstWrite = false;
                }
                else
                {
                    writer.Append(path, msFeature.MSnSpectra);
                }
            }
        }
    }
    finally
    {
        // FIX: dispose the raw-file readers (ISpectraProvider is IDisposable);
        // the original leaked every reader it opened.
        foreach (var provider in readers.Values)
        {
            provider.Dispose();
        }
    }
}