/// <summary>
/// Builds LC-MS features from the supplied MS features and then runs the
/// LC-MS feature filters over the result.
/// </summary>
/// <param name="information">Dataset the MS features belong to; passed to the scan-length validation.</param>
/// <param name="msFeatures">MS features to assemble into LC-MS features. Must contain at least one entry.</param>
/// <param name="options">Feature finding options.</param>
/// <param name="filterOptions">Options controlling which found features are kept.</param>
/// <param name="provider">Scan summary provider handed to both the finder and the filter.</param>
/// <param name="progress">Optional progress sink forwarded to the feature finder.</param>
/// <returns>The features that remain after filtering.</returns>
/// <exception cref="Exception">Thrown when <paramref name="msFeatures"/> is empty.</exception>
public List<UMCLight> CreateLcmsFeatures(
    DatasetInformation information,
    List<MSFeatureLight> msFeatures,
    LcmsFeatureFindingOptions options,
    LcmsFeatureFilteringOptions filterOptions,
    IScanSummaryProvider provider,
    IProgress<ProgressData> progress = null)
{
    // Nothing to build from: fail before any work is started.
    if (msFeatures.Count == 0)
    {
        throw new Exception("No features were found in the feature files provided.");
    }

    UpdateStatus("Finding features.");
    ValidateFeatureFinderMaxScanLength(information, options, filterOptions);

    // Relay the finder's own progress messages through this object's status channel.
    var featureFinder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);
    featureFinder.Progress += (source, eventArgs) => UpdateStatus(eventArgs.Message);
    var foundFeatures = featureFinder.FindFeatures(msFeatures, options, provider, progress);

    UpdateStatus("Filtering features.");
    var keptFeatures = LcmsFeatureFilters.FilterFeatures(foundFeatures, filterOptions, provider);

    var summaryMessage = string.Format(
        "Filtered features from: {0} to {1}.",
        foundFeatures.Count,
        keptFeatures.Count);
    UpdateStatus(summaryMessage);

    return keptFeatures;
}
/// <summary>
/// Filters the list of MS features down to those whose scan is reported as MS level 1
/// by the scan summary provider, and records the scan times seen along the way
/// on the dataset.
/// </summary>
/// <param name="msFeatures">MS features to filter.</param>
/// <param name="provider">
/// Source of scan summaries. When null, a warning is reported and no scan is
/// recognised as a full scan, so the returned list is empty.
/// </param>
/// <param name="dataset">
/// Dataset whose raw file path is inspected and whose <c>ScanTimes</c> are updated.
/// </param>
/// <returns>
/// The input list unchanged when the dataset has no raw file path; otherwise a new
/// list holding only the features from MS level 1 scans.
/// </returns>
public List<MSFeatureLight> Filter(List<MSFeatureLight> msFeatures, IScanSummaryProvider provider, ref DatasetInformation dataset)
{
    var rawFilePath = dataset.RawFile.Path;
    if (string.IsNullOrWhiteSpace(rawFilePath))
    {
        return msFeatures;
    }

    // Collect every distinct scan number referenced by the MS features.
    var uniqueScans = msFeatures.Select(msFeature => msFeature.Scan).Distinct().ToList();

    // Scans confirmed to be MS level 1 by the provider.
    var ms1Scans = new Dictionary<int, bool>();
    var knownScanTimes = dataset.ScanTimes;
    var rawFileName = System.IO.Path.GetFileName(rawFilePath);

    if (provider != null)
    {
        UpdateStatus(string.Format("Reading scan info from {0}", rawFileName));

        foreach (var scanNumber in uniqueScans)
        {
            var scanSummary = provider.GetScanSummary(scanNumber);
            if (scanSummary == null)
            {
                continue;
            }

            if (scanSummary.MsLevel == 1)
            {
                ms1Scans[scanNumber] = true;
            }

            // Insert or overwrite the elution time for this scan.
            knownScanTimes[scanNumber] = scanSummary.Time;
        }

        dataset.ScanTimes = knownScanTimes;
    }
    else
    {
        UpdateStatus(string.Format("Warning: Raw file not found ({0}); scan times are not available!", rawFileName));
    }

    return msFeatures.Where(msFeature => ms1Scans.ContainsKey(msFeature.Scan)).ToList();
}
/// <summary>
/// Creates a converter from MS features to LC-MS features and selects the first-
/// and second-pass clusterers according to the supplied options.
/// </summary>
/// <param name="provider">Scan summary provider; required.</param>
/// <param name="options">
/// Feature finding options. When null, a default-constructed
/// <see cref="LcmsFeatureFindingOptions"/> is used.
/// </param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="provider"/> is null.</exception>
public MsToLcmsFeatures(IScanSummaryProvider provider, LcmsFeatureFindingOptions options = null)
{
    if (provider == null)
    {
        // Name the offending argument so the failure is diagnosable from the message alone.
        throw new ArgumentNullException(nameof(provider));
    }

    // Ordering and distance functions used by the tree clusterers:
    // first pass works on m/z of MS features, second pass on monoisotopic mass of UMCs.
    Comparison<MSFeatureLight> mzSort = (x, y) => x.Mz.CompareTo(y.Mz);
    Comparison<UMCLight> monoSort = (x, y) => x.MassMonoisotopic.CompareTo(y.MassMonoisotopic);
    Func<MSFeatureLight, MSFeatureLight, double> mzDiff =
        (x, y) => FeatureLight.ComputeMassPPMDifference(x.Mz, y.Mz);
    Func<UMCLight, UMCLight, double> monoDiff =
        (x, y) => FeatureLight.ComputeMassPPMDifference(x.MassMonoisotopic, y.MassMonoisotopic);

    this.provider = provider;
    this.options = options ?? new LcmsFeatureFindingOptions();

    // First pass: the binary search tree clusterer is built here with the instrument
    // mass tolerance; any other algorithm type is delegated to the factory.
    if (this.options.FirstPassClusterer == MsFeatureClusteringAlgorithmType.BinarySearchTree)
    {
        this.firstPassClusterer = new MsFeatureTreeClusterer<MSFeatureLight, UMCLight>(
            mzSort,
            mzDiff,
            MassComparison.Mz,
            this.options.InstrumentTolerances.Mass);
    }
    else
    {
        this.firstPassClusterer = ClusterFactory.Create(this.options.FirstPassClusterer);
    }

    // Second pass: same split, but clustering UMCs by monoisotopic mass.
    if (this.options.SecondPassClusterer == GenericClusteringAlgorithmType.BinarySearchTree)
    {
        this.secondPassClusterer = new MsFeatureTreeClusterer<UMCLight, UMCLight>(
            monoSort,
            monoDiff,
            MassComparison.Monoisotopic,
            this.options.InstrumentTolerances.Mass);
    }
    else
    {
        var clusterFactory = new GenericClusterFactory<UMCLight, UMCLight>();
        this.secondPassClusterer = clusterFactory.Create(this.options.SecondPassClusterer);
    }
}
/// <summary>
/// Reads MS features from a comma-delimited file, runs the tree-based UMC feature
/// finder against the matching raw file, and checks the number of features found.
/// </summary>
/// <param name="relativePath">Path of the MS feature file, resolved through <c>GetPath</c>.</param>
/// <param name="expectedFeatureCount">Number of features the finder is expected to produce.</param>
/// <returns>The features produced by the finder.</returns>
public IEnumerable<UMCLight> TestUmcFeatures(string relativePath, int expectedFeatureCount)
{
    // Resolve the absolute location of the input file.
    var absolutePath = GetPath(relativePath);

    var msFeatureReader = new MsFeatureLightFileReader { Delimiter = ',' };
    var msFeatures = msFeatureReader.ReadFile(absolutePath);

    var featureFinder = new UmcTreeFeatureFinder
    {
        MaximumNet = .005,
        MaximumScan = 50
    };

    var findingOptions = new LcmsFeatureFindingOptions(
        new FeatureTolerances
        {
            Mass = 8,
            Net = .005
        });

    // The raw file is expected alongside the isos file, differing only in suffix.
    var rawFilePath = absolutePath.Replace("_isos.csv", ".raw");
    UpdateStatus("Using raw data to create better features.");

    var providerCache = new ScanSummaryProviderCache();
    IScanSummaryProvider provider = providerCache.GetScanSummaryProvider(rawFilePath, 1);

    var foundFeatures = featureFinder.FindFeatures(msFeatures.ToList(), findingOptions, provider);

    // Work on total feature count here.
    Assert.Greater(foundFeatures.Count, 0);
    Assert.AreEqual(expectedFeatureCount, foundFeatures.Count);

    return foundFeatures;
}
/// <summary>
/// Filters LC-MS features by elution length and abundance.
/// </summary>
/// <remarks>
/// When no scan summary provider is supplied, or <c>options.FilterOnMinutes</c> is false,
/// the length is measured in scans against <c>options.FeatureLengthRangeScans</c>.
/// Otherwise the length is measured as the difference between the provider's times for
/// the feature's end and start scans, against <c>options.FeatureLengthRangeMinutes</c>,
/// and the feature must also contain at least <c>options.MinimumDataPoints</c> members.
/// In both modes features whose abundance is not greater than zero are removed.
/// </remarks>
/// <typeparam name="T">Feature type, derived from <see cref="UMCLight"/>.</typeparam>
/// <param name="features">Features to filter.</param>
/// <param name="options">Filtering options.</param>
/// <param name="scanSummaryProvider">Optional source of scan times for time-based filtering.</param>
/// <returns>A new list holding the features that pass the filters.</returns>
/// <exception cref="IndexOutOfRangeException">
/// Thrown when the elution time of a feature cannot be determined; the original failure
/// is available through <see cref="Exception.InnerException"/>.
/// </exception>
public static List<T> FilterFeatures<T>(List<T> features, LcmsFeatureFilteringOptions options, IScanSummaryProvider scanSummaryProvider = null)
    where T : UMCLight
{
    IEnumerable<T> newFeatures;

    if (scanSummaryProvider == null || !options.FilterOnMinutes)
    {
        var minimumSize = options.FeatureLengthRangeScans.Minimum;
        var maximumSize = options.FeatureLengthRangeScans.Maximum;

        // Scan length
        newFeatures = features.Where(x =>
        {
            var size = Math.Abs(x.ScanStart - x.ScanEnd);
            return size >= minimumSize && size <= maximumSize;
        });
    }
    else
    {
        var minimumSize = options.FeatureLengthRangeMinutes.Minimum;
        var maximumSize = options.FeatureLengthRangeMinutes.Maximum;
        var minimumPoints = options.MinimumDataPoints;

        // Elution time length
        newFeatures = features.Where(x =>
        {
            try
            {
                double size;
                if (x.ScanStart == 0)
                {
                    // Scan 0 is not looked up; the feature length is taken to be
                    // the time of the feature's last scan.
                    size = scanSummaryProvider.GetScanSummary(x.ScanEnd).Time;
                }
                else
                {
                    size = Math.Abs(
                        scanSummaryProvider.GetScanSummary(x.ScanEnd).Time -
                        scanSummaryProvider.GetScanSummary(x.ScanStart).Time);
                }

                return size >= minimumSize && size <= maximumSize && x.Features.Count >= minimumPoints;
            }
            catch (Exception ex)
            {
                // Keep the exception type callers already see, but preserve the original
                // failure (and its stack trace) as the inner exception.
                throw new IndexOutOfRangeException(
                    String.Format(
                        "Exception determining the elution time for scans {0} and {1}: {2}",
                        x.ScanStart,
                        x.ScanEnd,
                        ex.Message),
                    ex);
            }
        });
    }

    // The length filter above is lazy; it is evaluated here while materialising the list.
    return newFeatures.Where(x => x.Abundance > 0).ToList();
}
/// <summary>
/// Clusters MS features into LC-MS features, assigns each non-empty feature a
/// normalized elution time and a sequential id, and narrows its scan range to the
/// region around the abundance apex.
/// </summary>
/// <param name="msFeatures">MS features to cluster.</param>
/// <param name="options">Feature finding options, forwarded to the clusterer.</param>
/// <param name="provider">Scan summary provider used for scan times; required.</param>
/// <param name="progress">Optional progress sink forwarded to the clusterer.</param>
/// <returns>The LC-MS features that contain at least one MS feature.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="provider"/> is null.</exception>
public List<UMCLight> FindFeatures(List<MSFeatureLight> msFeatures, LcmsFeatureFindingOptions options, IScanSummaryProvider provider, IProgress<ProgressData> progress = null)
{
    if (provider == null)
    {
        throw new ArgumentNullException(nameof(provider));
    }

    var clusterer = new MsToLcmsFeatures(provider, options);
    var features = clusterer.Convert(msFeatures, progress);

    // Determine the scan range covered by the input so NET can be normalized against it.
    var minScan = int.MaxValue;
    var maxScan = int.MinValue;
    foreach (var msFeature in msFeatures)
    {
        minScan = Math.Min(msFeature.Scan, minScan);
        maxScan = Math.Max(msFeature.Scan, maxScan);
    }

    var minScanTime = provider.GetScanSummary(minScan).Time;
    var maxScanTime = provider.GetScanSummary(maxScan).Time;
    var scanTimeRange = maxScanTime - minScanTime;

    // With a single scan time the range is zero; avoid producing NaN for NET.
    var hasScanTimeRange = Math.Abs(scanTimeRange) > double.Epsilon;

    var id = 0;
    var newFeatures = new List<UMCLight>();
    foreach (var feature in features)
    {
        // Clusters without any MS features are dropped from the result.
        if (feature.MsFeatures.Count < 1)
        {
            continue;
        }

        var elapsed = provider.GetScanSummary(feature.Scan).Time - minScanTime;
        feature.Net = hasScanTimeRange ? elapsed / scanTimeRange : 0;
        feature.CalculateStatistics();
        feature.Id = id++;
        newFeatures.Add(feature);

        // Set the width of the feature to be the width of the peak, not the width of the tails.
        // Locate the apex; every MS feature, including the last one, is considered.
        var maxAbundance = double.MinValue;
        var maxAbundanceIndex = 0;
        for (var msFeatureIndex = 0; msFeatureIndex < feature.MsFeatures.Count; msFeatureIndex++)
        {
            var msFeature = feature.MsFeatures[msFeatureIndex];
            if (msFeature.Abundance > maxAbundance)
            {
                maxAbundance = msFeature.Abundance;
                maxAbundanceIndex = msFeatureIndex;
            }
        }

        // Walk backwards from the apex to the first member at or below 5% of the apex.
        // NOTE(review): index 0 and the last index are never inspected by these two
        // walks, so ScanStart/ScanEnd keep the values left by CalculateStatistics when
        // no interior member falls below the threshold. This presumably relies on
        // MsFeatures being ordered by scan - confirm.
        for (var msFeatureIndex = maxAbundanceIndex; msFeatureIndex > 0; msFeatureIndex--)
        {
            if (feature.MsFeatures[msFeatureIndex].Abundance / maxAbundance <= 0.05)
            {
                feature.ScanStart = feature.MsFeatures[msFeatureIndex].Scan;
                break;
            }
        }

        // Walk forwards from the apex in the same way to find the trailing edge.
        for (var msFeatureIndex = maxAbundanceIndex; msFeatureIndex < feature.MsFeatures.Count - 1; msFeatureIndex++)
        {
            if (feature.MsFeatures[msFeatureIndex].Abundance / maxAbundance <= 0.05)
            {
                feature.ScanEnd = feature.MsFeatures[msFeatureIndex].Scan;
                break;
            }
        }
    }

    // Return the processed list, not the raw clusterer output, so that empty clusters
    // (which were never given a NET or id) are not handed to callers.
    return newFeatures;
}
/// <summary>
/// Loads LC-MS feature data for a dataset, choosing the reader from the extension of
/// the dataset's feature file.
/// </summary>
/// <remarks>
/// <list type="bullet">
/// <item><description><c>.txt</c>: files ending in <c>_LCMSFeatures.txt</c> go through <c>LcImsFeatureFileReader</c>; other text files go through <c>LCMSFeatureFileReader</c>.</description></item>
/// <item><description><c>.db3</c> and any unrecognised extension: features are read from <paramref name="featureCache"/>.</description></item>
/// <item><description><c>.ms1ft</c>: read with <c>PromexFileReader</c>, but only when <paramref name="provider"/> is an <c>InformedProteomicsReader</c>; otherwise no features are loaded.</description></item>
/// </list>
/// When the provider also implements <c>ISpectraProvider</c>, <c>LoadMsMs</c> is invoked on the loaded features.
/// </remarks>
/// <param name="dataset">Dataset whose feature file path and id are used.</param>
/// <param name="featureCache">Feature store queried by dataset id.</param>
/// <param name="provider">Optional scan summary provider.</param>
/// <returns>The loaded features; an empty list when nothing could be loaded (never null).</returns>
public static IList<UMCLight> LoadUmcFeatureData(DatasetInformation dataset, IUmcDAO featureCache, IScanSummaryProvider provider = null)
{
    var features = new List<UMCLight>();
    var featurePath = dataset.Features.Path;
    var extension = Path.GetExtension(featurePath);
    if (extension == null)
    {
        return features;
    }

    // File extensions are identifiers, not linguistic text: normalise with the
    // invariant culture so the match does not depend on the current thread culture.
    switch (extension.ToUpperInvariant())
    {
        case ".TXT":
            if (featurePath.EndsWith("_LCMSFeatures.txt", System.StringComparison.Ordinal))
            {
                var reader = new LcImsFeatureFileReader(provider, dataset.DatasetId);
                features = reader.ReadFile(featurePath).ToList();
            }
            else
            {
                var umcReader = new LCMSFeatureFileReader(featurePath);
                features = umcReader.GetUmcList();
            }
            break;

        case ".DB3":
            features = featureCache.FindByDatasetId(dataset.DatasetId);
            break;

        case ".MS1FT":
            var promexProvider = provider as InformedProteomicsReader;
            if (promexProvider != null)
            {
                var promexReader = new PromexFileReader(promexProvider, dataset.DatasetId);
                features = promexReader.ReadFile(featurePath).ToList();
            }
            break;

        default:
            // Features may already be cached even when the feature file is not a ".db3"
            // (e.g. ".csv"); read them from the cache instead of rebuilding from scratch.
            features = featureCache.FindByDatasetId(dataset.DatasetId);
            break;
    }

    // Hand callers an empty list rather than null when the cache lookup yields null.
    if (features == null)
    {
        return new List<UMCLight>();
    }

    var spectraProvider = provider as ISpectraProvider;
    if (spectraProvider != null)
    {
        LoadMsMs(features, spectraProvider);
    }

    return features;
}
/// <summary>
/// Creates a reader, storing the scan summary provider and dataset id it was given.
/// </summary>
/// <param name="provider">Scan summary provider to keep; may be null.</param>
/// <param name="datasetId">Dataset id to keep; defaults to 0.</param>
public LcImsFeatureFileReader(IScanSummaryProvider provider = null, int datasetId = 0)
{
    this.datasetId = datasetId;
    this.provider = provider;
}
/// <summary>
/// Links MS features to fragmentation spectra by forwarding to the two-argument overload.
/// </summary>
/// <param name="features">MS features to link.</param>
/// <param name="fragmentSpectra">Fragmentation spectra to link against.</param>
/// <param name="provider">Not used by this overload.</param>
/// <returns>The mapping produced by the two-argument overload.</returns>
public Dictionary<int, int> LinkMSFeaturesToMSn(List<MSFeatureLight> features, List<MSSpectra> fragmentSpectra, IScanSummaryProvider provider)
{
    var links = LinkMSFeaturesToMSn(features, fragmentSpectra);
    return links;
}
/// <summary>
/// Loads a single dataset: reuses existing LC-MS features when the loader finds any,
/// otherwise builds them from the MS features, then links MS/MS spectra to known
/// identifications when a sequence file is present.
/// </summary>
/// <param name="dataset">Dataset to load.</param>
/// <param name="msFilteringOptions">Options for filtering MS features before feature creation.</param>
/// <param name="lcmsFindingOptions">Options for LC-MS feature finding.</param>
/// <param name="lcmsFilteringOptions">Options for filtering the created LC-MS features.</param>
/// <param name="dataLoadOptions">Supplies the isos filter options used when reading MS features.</param>
/// <param name="providerCache">Cache used to obtain a scan summary provider for the raw file.</param>
/// <param name="identificationProviders">Cache used to obtain an identification provider for the sequence file.</param>
/// <param name="progress">Optional progress sink.</param>
/// <returns>The LC-MS features of the dataset; may be empty.</returns>
public IList<UMCLight> LoadDataset(
    DatasetInformation dataset,
    MsFeatureFilteringOptions msFilteringOptions,
    LcmsFeatureFindingOptions lcmsFindingOptions,
    LcmsFeatureFilteringOptions lcmsFilteringOptions,
    DataLoadingOptions dataLoadOptions,
    ScanSummaryProviderCache providerCache,
    IdentificationProviderCache identificationProviders,
    IProgress<ProgressData> progress = null)
{
    var progData = new ProgressData(progress);

    // A scan summary provider is only available when the dataset names a raw file.
    IScanSummaryProvider provider = null;
    if (!string.IsNullOrWhiteSpace(dataset.RawFile.Path))
    {
        UpdateStatus("Using raw data to create better features.");
        provider = providerCache.GetScanSummaryProvider(dataset.RawFile.Path, dataset.DatasetId);
    }

    progData.StepRange(1);
    progData.Status = "Looking for existing features in the database.";
    UpdateStatus(string.Format("[{0}] - Loading dataset [{0}] - {1}.", dataset.DatasetId, dataset.DatasetName));
    var datasetId = dataset.DatasetId;

    // Treat a null result from the loader as "no features" instead of failing below.
    var features = UmcLoaderFactory.LoadUmcFeatureData(dataset, Providers.FeatureCache, provider)
                   ?? new List<UMCLight>();
    var hasMsFeatures = features.Any(f => f.MsFeatures.Any());

    var msFeatures = new List<MSFeatureLight>();
    if (!hasMsFeatures)
    {
        progData.StepRange(2);
        progData.Status = "Loading MS Feature Data.";
        UpdateStatus(string.Format("[{0}] Loading MS Feature Data [{0}] - {1}.", dataset.DatasetId, dataset.DatasetName));
        var isosFilterOptions = dataLoadOptions.GetIsosFilterOptions();
        msFeatures = UmcLoaderFactory.LoadMsFeatureData(dataset.Features.Path, isosFilterOptions);
    }

    progData.StepRange(3);
    progData.Status = "Loading scan summaries.";
    progData.StepRange(100);

    var msnSpectra = new List<MSSpectra>();

    // If we don't have any features, then we have to create some from the MS features
    // provided to us.
    if (features.Count < 1)
    {
        msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilteringOptions);
        msFeatures = Filter(msFeatures, provider, ref dataset);

        progData.Status = "Creating LCMS features.";
        features = CreateLcmsFeatures(
            dataset,
            msFeatures,
            lcmsFindingOptions,
            lcmsFilteringOptions,
            provider,
            new Progress<ProgressData>(pd => progData.Report(pd.Percent)));

        // Filtering can legitimately leave no features; Max/Min would throw on an
        // empty sequence, so only normalize when there is something to normalize.
        if (features.Count > 0)
        {
            var maxScan = features.Max(feature => feature.Scan);
            var minScan = features.Min(feature => feature.Scan);
            var scanTimes = dataset.ScanTimes;

            // Loop-invariant bounds of the elution-time range used to normalize NET.
            var minScanTime = Convert.ToDouble(scanTimes[minScan]);
            var scanTimeRange = Convert.ToDouble(scanTimes[maxScan]) - minScanTime;

            // A zero range (all features at one scan time) would otherwise yield NaN.
            var hasScanTimeRange = Math.Abs(scanTimeRange) > double.Epsilon;

            var id = 0;
            foreach (var feature in features)
            {
                feature.Id = id++;
                feature.Net = hasScanTimeRange
                    ? (Convert.ToDouble(scanTimes[feature.Scan]) - minScanTime) / scanTimeRange
                    : 0;
                feature.MassMonoisotopicAligned = feature.MassMonoisotopic;
                feature.NetAligned = feature.Net;
                feature.GroupId = datasetId;
                feature.SpectralCount = feature.MsFeatures.Count;

                foreach (var msFeature in feature.MsFeatures.Where(msFeature => msFeature != null))
                {
                    msFeature.UmcId = feature.Id;
                    msFeature.GroupId = datasetId;
                    msFeature.MSnSpectra.ForEach(x => x.GroupId = datasetId);
                    msnSpectra.AddRange(msFeature.MSnSpectra);
                }
            }
        }
    }
    else
    {
        if (!UmcLoaderFactory.AreExistingFeatures(dataset.Features.Path))
        {
            var i = 0;
            foreach (var feature in features)
            {
                feature.GroupId = datasetId;
                feature.Id = i++;
            }
        }

        // Otherwise, we need to map the MS features to the LCMS Features provided.
        // This would mean that we extracted data from an existing database.
        if (msFeatures.Count > 0)
        {
            var map = FeatureDataConverters.MapFeature(features);
            foreach (var msFeature in msFeatures)
            {
                // MS features whose parent id is not in the map are skipped.
                if (map.ContainsKey(msFeature.UmcId))
                {
                    map[msFeature.UmcId].AddChildFeature(msFeature);
                }
            }
        }
    }

    // Process the MS/MS data with peptides
    UpdateStatus("Reading List of Peptides");
    if (dataset.SequenceFile != null && !string.IsNullOrEmpty(dataset.SequenceFile.Path))
    {
        UpdateStatus("Reading List of Peptides");
        var idProvider = identificationProviders.GetProvider(dataset.SequenceFile.Path, dataset.DatasetId);
        var peptideList = idProvider.GetAllIdentifications();

        UpdateStatus("Linking MS/MS to any known Peptide/Metabolite Sequences");
        var linker = new PeptideMsMsLinker();
        linker.LinkPeptidesToSpectra(msnSpectra, peptideList);
    }

    progData.Report(100);
    return features;
}