public FeatureDisplayCommand(DatasetInformation information)
    : base(null, AlwaysPass)
{
    m_information = information;
    m_window = null;
    m_name = "Features " + information.DatasetName;
}
public Factor(int factorId, DatasetInformation dataset, string factorName, string factorValue)
{
    m_id = factorId;
    m_dataset = dataset;
    m_factorName = factorName;
    m_factorValue = factorValue;
}
/// <summary>
/// Retrieves a list of features.
/// </summary>
/// <param name="rawFile">Path to the instrument raw data file.</param>
/// <param name="featureFile">Path to the deisotoped feature file.</param>
/// <returns>The list of LC-MS features that were found.</returns>
public List<UMCLight> FindFeatures(string rawFile, string featureFile)
{
    List<UMCLight> features;
    using (ISpectraProvider raw = new InformedProteomicsReader())
    {
        // Read the raw file summary data...
        raw.AddDataFile(rawFile, 0);

        var info = new DatasetInformation();
        info.InputFiles.Add(new InputFile { Path = featureFile, FileType = InputFileType.Features });

        var finder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);
        var tolerances = new FeatureTolerances
        {
            Mass = 8,
            Net = .005
        };
        var options = new LcmsFeatureFindingOptions(tolerances);

        // Load and create features
        var msFeatures = UmcLoaderFactory.LoadMsFeatureData(info.Features.Path);
        var provider = RawLoaderFactory.CreateFileReader(rawFile);
        provider.AddDataFile(rawFile, 0);
        features = finder.FindFeatures(msFeatures, options, provider);
    }

    return features;
}
/// <summary>
/// Creates LCMS Features
/// </summary>
public List<UMCLight> CreateLcmsFeatures(
    DatasetInformation information,
    List<MSFeatureLight> msFeatures,
    LcmsFeatureFindingOptions options,
    LcmsFeatureFilteringOptions filterOptions,
    IScanSummaryProvider provider,
    IProgress<ProgressData> progress = null)
{
    // Make features
    if (msFeatures.Count < 1)
    {
        throw new Exception("No features were found in the feature files provided.");
    }

    UpdateStatus("Finding features.");
    ValidateFeatureFinderMaxScanLength(information, options, filterOptions);

    var finder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);
    finder.Progress += (sender, args) => UpdateStatus(args.Message);
    var features = finder.FindFeatures(msFeatures, options, provider, progress);

    UpdateStatus("Filtering features.");
    List<UMCLight> filteredFeatures = LcmsFeatureFilters.FilterFeatures(features, filterOptions, provider);
    UpdateStatus(string.Format("Filtered features from: {0} to {1}.", features.Count, filteredFeatures.Count));
    return filteredFeatures;
}
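// Usage sketch (hypothetical): wires the tolerance and option types shown elsewhere in this
// file into CreateLcmsFeatures. The tolerance values mirror the FindFeatures snippets above;
// the parameterless LcmsFeatureFilteringOptions constructor is an assumption for illustration.
private List<UMCLight> CreateLcmsFeaturesExample(
    DatasetInformation dataset,
    List<MSFeatureLight> msFeatures,
    IScanSummaryProvider provider)
{
    var tolerances = new FeatureTolerances { Mass = 8, Net = .005 }; // values taken from the FindFeatures snippets
    var options = new LcmsFeatureFindingOptions(tolerances);
    var filterOptions = new LcmsFeatureFilteringOptions();           // assumed default construction
    return CreateLcmsFeatures(dataset, msFeatures, options, filterOptions, provider);
}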
/// <summary>
/// Retrieves a list of features.
/// </summary>
/// <param name="rawFile">Path to the instrument raw data file.</param>
/// <param name="featureFile">Path to the deisotoped feature file.</param>
/// <returns>The list of LC-MS features that were found.</returns>
public List<UMCLight> FindFeatures(string rawFile, string featureFile)
{
    List<UMCLight> features;
    using (ISpectraProvider raw = new ThermoRawDataFileReader())
    {
        // Read the raw file summary data...
        raw.AddDataFile(rawFile, 0);

        var info = new DatasetInformation();
        info.Features = new InputFile();
        info.Features.Path = featureFile;

        var finder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);
        var tolerances = new FeatureTolerances
        {
            Mass = 8,
            Net = .005
        };
        var options = new LcmsFeatureFindingOptions(tolerances);

        // Load and create features
        var msFeatures = UmcLoaderFactory.LoadMsFeatureData(info.Features.Path);
        var provider = RawLoaderFactory.CreateFileReader(rawFile);
        features = finder.FindFeatures(msFeatures, options, provider);
    }

    return features;
}
/// <summary>
/// Finds features given a dataset
/// </summary>
private IList<UMCLight> FindFeatures(DatasetInformation information,
    LcmsFeatureFindingOptions featureFindingOptions,
    MsFeatureFilteringOptions msFilterOptions,
    LcmsFeatureFilteringOptions lcmsFilterOptions,
    SpectralOptions peptideOptions,
    MultiAlignCore.Algorithms.FeatureFinding.IFeatureFinder featureFinder)
{
    UpdateStatus("Loading baseline features.");
    var msFeatures = UmcLoaderFactory.LoadMsFeatureData(information.Features.Path);
    msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilterOptions);

    // Load the baseline reference set
    using (var rawProviderX = RawLoaderFactory.CreateFileReader(information.RawFile.Path))
    {
        rawProviderX.AddDataFile(information.RawFile.Path, 0);
        UpdateStatus("Creating LCMS Features.");
        var features = featureFinder.FindFeatures(msFeatures, featureFindingOptions, rawProviderX);
        features = LcmsFeatureFilters.FilterFeatures(features, lcmsFilterOptions, information.ScanTimes);

        var datasetId = information.DatasetId;
        foreach (var feature in features)
        {
            var lightEntry = new List<MSFeatureLight>();
            feature.GroupId = datasetId;
            foreach (var msFeature in feature.MsFeatures)
            {
                msFeature.GroupId = datasetId;
                foreach (var msmsFeature in msFeature.MSnSpectra)
                {
                    msmsFeature.GroupId = datasetId;
                    foreach (var peptide in msmsFeature.Peptides)
                    {
                        peptide.GroupId = datasetId;
                    }
                }

                if (msFeature.MSnSpectra.Count > 0)
                {
                    lightEntry.Add(msFeature);
                }
            }

            // We are doing this so that we don't have a ton of MS features in the database
            feature.MsFeatures.Clear();
            feature.MsFeatures.AddRange(lightEntry);
        }

        LinkPeptidesToFeatures(information.SequenceFile.Path, features, peptideOptions.Fdr, peptideOptions.IdScore);
        DeRegisterProgressNotifier(featureFinder);
        return features;
    }
}
public DatasetInformationViewModel(DatasetInformation information)
{
    m_information = information;
    var data = information.PlotData;
    PlotData = new ObservableCollection<PlotViewModel>();

    RequestRemovalCommand = new BaseCommand(
        () =>
        {
            if (RemovalRequested != null)
            {
                RemovalRequested(this, EventArgs.Empty);
            }
        }, s => !this.DoingWork);

    if (data != null)
    {
        PlotData.Add(new PlotViewModel(data.Alignment, "Alignment",
            new PictureDisplayCommand(data.Alignment, "Alignment" + information.DatasetName)));
        PlotData.Add(new PlotViewModel(data.Features, "Features", new FeatureDisplayCommand(information)));
        PlotData.Add(new PlotViewModel(data.MassErrorHistogram, "Mass Error Histogram"));
        PlotData.Add(new PlotViewModel(data.NetErrorHistogram, "NET Error Histogram"));
        PlotData.Add(new PlotViewModel(data.MassScanResidual, "Mass vs Scan Residuals"));
        PlotData.Add(new PlotViewModel(data.MassMzResidual, "Mass vs m/z Residuals"));
        PlotData.Add(new PlotViewModel(data.NetResiduals, "NET Residuals"));
    }

    ModifyDatasetCommand = new ShowDatasetDetailCommand();
}
private void ExportAlignmentData(AlignmentData data,
    DatasetInformation baselineDatasetInformation,
    DatasetInformation alignDatasetInformation,
    IEnumerable<UMCLight> baselineFeatures,
    IEnumerable<UMCLight> aligneeFeatures)
{
    var netValues = new List<double>();
    var massValues = new List<double>();

    var anchorPoints = data.Matches;
    foreach (var match in anchorPoints)
    {
        netValues.Add(match.AnchorPointX.Net - match.AnchorPointY.Net);
        massValues.Add(match.AnchorPointX.Mass - match.AnchorPointY.Mass);
    }

    var netHist = MatchCountHistogramBuilder.CreateResidualHistogram(-.05, .05, .01, netValues);
    var netHistogram = new Dictionary<double, int>();

    Console.WriteLine();
    for (var i = 0; i < netHist.Bins.Count; i++)
    {
        netHistogram.Add(netHist.Bins[i], Convert.ToInt32(netHist.Data[i]));
        Console.WriteLine("{0}\t{1}", netHist.Bins[i], netHist.Data[i]);
    }
}
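// Illustration (self-contained, hypothetical): the binning that a residual histogram builder
// like MatchCountHistogramBuilder.CreateResidualHistogram performs can be sketched with plain
// collections. The bin edges, bin width handling, and the choice to drop out-of-range
// residuals are assumptions for illustration, not the library's exact semantics.
public static SortedDictionary<double, int> BuildResidualHistogram(
    double low, double high, double binWidth, IEnumerable<double> residuals)
{
    var histogram = new SortedDictionary<double, int>();
    for (var bin = low; bin <= high; bin += binWidth)
    {
        histogram[Math.Round(bin, 6)] = 0; // rounding stabilizes the keys against float drift
    }

    foreach (var residual in residuals)
    {
        if (residual < low || residual > high)
        {
            continue; // out-of-range residuals are ignored in this sketch
        }

        // Map the residual to the lower edge of its bin.
        var edge = Math.Round(low + (Math.Floor((residual - low) / binWidth) * binWidth), 6);
        histogram[edge]++;
    }

    return histogram;
}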
public static void AddDataset(DatasetInformation dataset)
{
    if (!m_datasets.ContainsKey(dataset.DatasetId))
    {
        m_datasets.Add(dataset.DatasetId, dataset);
    }
}
private IList<UMCLight> AlignDataset(
    IList<UMCLight> features,
    IEnumerable<UMCLight> baselineFeatures,
    MassTagDatabase database,
    DatasetInformation datasetInfo,
    DatasetInformation baselineInfo)
{
    AlignmentData alignmentData;
    if (baselineInfo == null && database == null)
    {
        throw new NullReferenceException("No reference was set for LC-MS alignment.");
    }

    // Align the data.
    if (baselineFeatures != null && baselineInfo != null && baselineInfo.IsBaseline)
    {
        // Align pairwise and cache results intermediately.
        var aligner = m_algorithms.DatasetAligner;
        RegisterProgressNotifier(aligner);
        UpdateStatus("Aligning " + datasetInfo.DatasetName + " to baseline.");
        alignmentData = aligner.Align(baselineFeatures, features);
        DeRegisterProgressNotifier(aligner);
    }
    else
    {
        // Align pairwise and cache results intermediately.
        var aligner = m_algorithms.DatabaseAligner;
        RegisterProgressNotifier(aligner);
        UpdateStatus("Aligning " + datasetInfo.DatasetName + " to mass tag database.");
        alignmentData = aligner.Align(database, features);
        DeRegisterProgressNotifier(aligner);
    }

    if (alignmentData != null)
    {
        alignmentData.AligneeDataset = datasetInfo.DatasetName;
        alignmentData.DatasetID = datasetInfo.DatasetId;
    }

    var args = new FeaturesAlignedEventArgs(datasetInfo, baselineFeatures, features, alignmentData);
    if (FeaturesAligned != null)
    {
        FeaturesAligned(this, args);
    }

    UpdateStatus("Updating cache with aligned features.");
    return features;
}
/// <summary>
/// Arguments that hold alignment information when a dataset is aligned.
/// </summary>
public FeaturesAlignedEventArgs(DatasetInformation datasetInfo,
    IEnumerable<UMCLight> baselineFeatures,
    IEnumerable<UMCLight> aligneeFeatures,
    classAlignmentData alignmentData)
{
    m_datasetInformation = datasetInfo;
    BaselineFeatures = baselineFeatures;
    AligneeFeatures = aligneeFeatures;
    AlignmentData = alignmentData;
}
/// <summary>
/// Arguments that hold alignment information when a dataset is aligned.
/// </summary>
public FeaturesAlignedEventArgs(DatasetInformation datasetInfo,
    IEnumerable<UMCLight> baselineFeatures,
    IEnumerable<UMCLight> aligneeFeatures,
    AlignmentData alignmentData)
{
    m_datasetInformation = datasetInfo;
    BaselineFeatures = baselineFeatures;
    AligneeFeatures = aligneeFeatures;
    AlignmentData = alignmentData;
}
/// <summary>
/// Converts the input files into datasets, grouping files that share a dataset name.
/// </summary>
/// <returns>A list of added datasets</returns>
private List<DatasetInformation> ConvertInputFilesIntoDatasets(List<InputFile> inputFiles)
{
    var addedSets = new List<DatasetInformation>();
    var datasetMap = new Dictionary<string, DatasetInformation>();
    var inputMap = new Dictionary<string, List<InputFile>>();

    foreach (var file in inputFiles)
    {
        var name = Path.GetFileName(file.Path);
        var datasetName = ExtractDatasetName(name);
        var isEntryMade = inputMap.ContainsKey(datasetName);
        if (!isEntryMade)
        {
            inputMap.Add(datasetName, new List<InputFile>());
        }

        inputMap[datasetName].Add(file);
    }

    var i = 0;
    foreach (var datasetName in inputMap.Keys)
    {
        var files = inputMap[datasetName];
        var datasetInformation = new DatasetInformation
        {
            DatasetId = i++,
            DatasetName = datasetName
        };

        var doesDatasetExist = datasetMap.ContainsKey(datasetName);

        // Here we map the old dataset if it existed already.
        if (datasetMap.ContainsKey(datasetName))
        {
            datasetInformation = datasetMap[datasetName];
        }

        datasetInformation.InputFiles.AddRange(files);

        // Add the dataset
        if (!doesDatasetExist)
        {
            addedSets.Add(datasetInformation);
        }
    }

    // Reformat their Id's
    var id = 0;
    foreach (var x in addedSets)
    {
        x.DatasetId = id++;
    }

    return addedSets;
}
public static void FillDatasetInformation(InstanceModel model, IEnumerable<Instance> instances)
{
    var datasetInformation = new DatasetInformation();
    int objWithIncompleteData = instances.Count(instance =>
        model.Features.Any(feature => FeatureValue.IsMissing(instance[feature])));

    datasetInformation.FeatureInformations = model.Features.Select(feature => feature.FeatureInformation).ToArray();
    datasetInformation.ObjectsWithIncompleteData = objWithIncompleteData;
    datasetInformation.GlobalAbscenseInformation = model.Features.Sum(feature => feature.FeatureInformation.MissingValueCount);

    model.DatasetInformation = datasetInformation;
}
private double GetNet(DatasetInformation dataset, int scan)
{
    var minScan = dataset.ScanTimes.Keys.Min();
    var minEt = dataset.ScanTimes[minScan];
    var maxScan = dataset.ScanTimes.Keys.Max();
    var maxEt = dataset.ScanTimes[maxScan];
    var et = dataset.ScanTimes[scan];
    return (et - minEt) / (maxEt - minEt);
}
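// Worked example (self-contained): NET here is elution time normalized to [0, 1] over the run,
// i.e. (et - minEt) / (maxEt - minEt). With scan times of 0.5, 30.0, and 60.5 minutes, the
// middle scan maps to (30.0 - 0.5) / (60.5 - 0.5) ≈ 0.492. The numbers are illustrative only.
public static double ComputeNet(IReadOnlyDictionary<int, double> scanTimes, int scan)
{
    var minEt = scanTimes.Values.Min();
    var maxEt = scanTimes.Values.Max();
    return (scanTimes[scan] - minEt) / (maxEt - minEt);
}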
/// <summary>
/// Loads feature data from the files provided.
/// </summary>
/// <returns></returns>
public static IList<UMCLight> LoadUmcFeatureData(DatasetInformation dataset,
    IUmcDAO featureCache,
    IScanSummaryProvider provider = null)
{
    var features = new List<UMCLight>();
    var extension = Path.GetExtension(dataset.Features.Path);
    if (extension == null)
    {
        return features;
    }

    extension = extension.ToUpper();
    switch (extension)
    {
        case ".TXT":
            if (dataset.Features.Path.EndsWith("_LCMSFeatures.txt"))
            {
                var reader = new LcImsFeatureFileReader(provider, dataset.DatasetId);
                features = reader.ReadFile(dataset.Features.Path).ToList();
            }
            else
            {
                var umcReader = new LCMSFeatureFileReader(dataset.Features.Path);
                features = umcReader.GetUmcList();
            }
            break;
        case ".DB3":
            features = featureCache.FindByDatasetId(dataset.DatasetId);
            break;
        case ".MS1FT":
            if (provider != null && provider is InformedProteomicsReader)
            {
                var promexReader = new PromexFileReader(provider as InformedProteomicsReader, dataset.DatasetId);
                features = promexReader.ReadFile(dataset.Features.Path).ToList();
            }
            break;
        default:
            // Was reconstructing features from scratch even when they were already cached,
            // because the file extension was ".csv" not ".db3"
            features = featureCache.FindByDatasetId(dataset.DatasetId);
            break;
    }

    if (features != null && provider is ISpectraProvider)
    {
        var spectraProvider = provider as ISpectraProvider;
        LoadMsMs(features, spectraProvider);
    }

    return features;
}
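// Usage sketch (hedged): how the extension-keyed loader above is typically driven; the
// FeatureDataAccessProviders instance and the optional IScanSummaryProvider mirror the
// LoadDataset snippet later in this file.
public static IList<UMCLight> LoadFeaturesForDataset(
    DatasetInformation dataset,
    FeatureDataAccessProviders providers,
    IScanSummaryProvider provider = null)
{
    // Dispatches internally on the feature file extension (.txt, .db3, .ms1ft, ...).
    return UmcLoaderFactory.LoadUmcFeatureData(dataset, providers.FeatureCache, provider);
}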
/// <summary>
/// Associates dataset plot files with the dataset whose name they contain.
/// </summary>
/// <param name="files">Candidate plot file paths.</param>
/// <param name="info">Dataset to attach the plot information to.</param>
/// <returns>The plot information attached to the dataset.</returns>
public DatasetPlotInformation LoadDatasetPlots(string[] files, DatasetInformation info)
{
    var plotInfo = new DatasetPlotInformation();
    var fileList = new List<string>();
    fileList.AddRange(files);

    var name = info.DatasetName.ToLower();
    foreach (var filename in fileList)
    {
        var file = filename.ToLower();
        if (file.Contains(name))
        {
            if (file.Contains("_features"))
            {
                plotInfo.Features = file;
            }
            else if (file.Contains("_heatmap"))
            {
                plotInfo.Alignment = file;
            }
            else if (file.Contains("_masshistogram"))
            {
                plotInfo.MassErrorHistogram = file;
            }
            else if (file.Contains("_nethistogram"))
            {
                plotInfo.NetErrorHistogram = file;
            }
            else if (file.Contains("_massmzresidual"))
            {
                plotInfo.MassMzResidual = file;
            }
            else if (file.Contains("_massscanresidual"))
            {
                plotInfo.MassScanResidual = file;
            }
            else if (file.Contains("_netresidual"))
            {
                plotInfo.NetResiduals = file;
            }
        }
    }

    info.PlotData = plotInfo;
    return plotInfo;
}
public DatasetInformationViewModel(DatasetInformation information)
{
    m_information = information;
    var data = information.PlotData;

    RequestRemovalCommand = new RelayCommand(
        () =>
        {
            if (RemovalRequested != null)
            {
                RemovalRequested(this, EventArgs.Empty);
            }
        }, () => !this.DoingWork);

    this.SetDatasetState();
}
private List<DatasetInformation> CreateDatasetsFromInputFile(List<InputFile> inputFiles, bool findAdditionalFiles = false)
{
    var datasets = new List<DatasetInformation>();
    var datasetMap = new Dictionary<string, List<InputFile>>();

    foreach (var file in inputFiles)
    {
        var name = System.IO.Path.GetFileName(file.Path);
        var datasetName = ExtractDatasetName(name);
        var isEntryMade = datasetMap.ContainsKey(datasetName);
        if (!isEntryMade)
        {
            datasetMap.Add(datasetName, new List<InputFile>());
        }

        datasetMap[datasetName].Add(file);
    }

    var i = 0;
    foreach (var datasetName in datasetMap.Keys)
    {
        var files = datasetMap[datasetName];
        var datasetInformation = new DatasetInformation
        {
            DatasetId = i++,
            DatasetName = datasetName
        };

        // Get additional files
        if (findAdditionalFiles)
        {
            // Try to use the location of the feature file first, otherwise just use first available file.
            var featureFile = files.FirstOrDefault(file => file.FileType == InputFileType.Features);
            var fileToUse = featureFile ?? files.FirstOrDefault();
            if (fileToUse != null)
            {
                files.AddRange(this.FindAdditionalDatasetFiles(fileToUse));
            }
        }

        datasetInformation.InputFiles.AddRange(files);
        datasets.Add(datasetInformation);
    }

    return datasets;
}
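// Sketch (self-contained): the grouping step above, in isolation. Files are keyed by the
// dataset name extracted from each file name; ExtractDatasetName is replaced here with a
// trivial stand-in (strip the extension), which is an assumption for illustration only.
public static Dictionary<string, List<string>> GroupFilesByDataset(IEnumerable<string> paths)
{
    var map = new Dictionary<string, List<string>>();
    foreach (var path in paths)
    {
        // Stand-in for ExtractDatasetName.
        var datasetName = Path.GetFileNameWithoutExtension(path);
        if (!map.ContainsKey(datasetName))
        {
            map.Add(datasetName, new List<string>());
        }

        map[datasetName].Add(path);
    }

    return map;
}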
/// <summary>
/// Make sure the value for options.MaximumScanRange, which is used by the Feature Finder,
/// is at least as large as the filterOptions.FeatureLengthRange.Maximum value,
/// which is used for filtering the features by length
/// </summary>
/// <param name="information"></param>
/// <param name="options"></param>
/// <param name="filterOptions"></param>
private static void ValidateFeatureFinderMaxScanLength(
    DatasetInformation information,
    LcmsFeatureFindingOptions options,
    LcmsFeatureFilteringOptions filterOptions)
{
    if (filterOptions.FilterOnMinutes)
    {
        if (options.MaximumScanRange < filterOptions.FeatureLengthRangeMinutes.Maximum)
        {
            // Bump up the scan range used by the LCMS Feature Finder to allow for longer features
            options.MaximumScanRange = (int)filterOptions.FeatureLengthRangeMinutes.Maximum;
        }
    }
    else
    {
        if (options.MaximumScanRange < filterOptions.FeatureLengthRangeScans.Maximum)
        {
            // Bump up the scan range used by the LCMS Feature Finder to allow for longer features
            options.MaximumScanRange = (int)filterOptions.FeatureLengthRangeScans.Maximum;
        }
    }
}
public AlignmentData AlignToDatabase(
    ref IList<UMCLight> features,
    DatasetInformation datasetInfo,
    MassTagDatabase mtdb,
    IProgress<ProgressData> progress = null)
{
    progress = progress ?? new Progress<ProgressData>();
    var aligner = this.m_algorithms.DatabaseAligner;

    // Subscribe before aligning so progress events raised during Align are reported.
    aligner.Progress += aligner_Progress;
    var alignmentData = aligner.Align(mtdb, features, progress);
    if (alignmentData != null)
    {
        alignmentData.AligneeDataset = datasetInfo.DatasetName;
        alignmentData.DatasetID = datasetInfo.DatasetId;
    }

    aligner.Progress -= aligner_Progress;
    return alignmentData;
}
public AlignmentData AlignToDataset(
    ref IList<UMCLight> features,
    DatasetInformation datasetInfo,
    IEnumerable<UMCLight> baselineFeatures,
    IProgress<ProgressData> progress = null)
{
    progress = progress ?? new Progress<ProgressData>();

    // Align pairwise and cache results intermediately.
    var aligner = this.m_algorithms.DatasetAligner;
    aligner.Progress += aligner_Progress;

    var alignmentData = aligner.Align(baselineFeatures, features, progress);
    if (alignmentData != null)
    {
        alignmentData.AligneeDataset = datasetInfo.DatasetName;
        alignmentData.DatasetID = datasetInfo.DatasetId;
    }

    aligner.Progress -= aligner_Progress;
    return alignmentData;
}
/// <summary>
/// Partition the current view into (numSectionsPerAxis)^2 sections and select the top
/// "featuresPerSection" in each section.
/// </summary>
/// <param name="dataset">Dataset to get feature points for.</param>
/// <param name="globalMax">The maximum mass in all datasets.</param>
/// <returns>Collection of feature datapoints selected from each section.</returns>
private IEnumerable<FeaturePoint> GetPartitionedPoints(DatasetInformation dataset, double globalMax)
{
    var netActMaximum = this.netAxis.ActualMaximum.Equals(0) ? 1.0 : this.netAxis.ActualMaximum;
    var massActMaximum = this.massAxis.ActualMaximum.Equals(0) ? globalMax : this.massAxis.ActualMaximum;

    var netStep = (netActMaximum - this.netAxis.ActualMinimum) / this.numSectionsPerAxis;
    var massStep = (massActMaximum - this.massAxis.ActualMinimum) / this.numSectionsPerAxis;

    var featureHash = new HashSet<FeaturePoint>();
    var featureTree = this.quadTrees[dataset];

    for (int i = 0; i < this.numSectionsPerAxis; i++)
    {
        var netMin = this.netAxis.ActualMinimum + (i * netStep);
        var netMax = this.netAxis.ActualMinimum + ((i + 1) * netStep);
        for (int j = 0; j < this.numSectionsPerAxis; j++)
        {
            var massMin = this.massAxis.ActualMinimum + (j * massStep);
            var massMax = this.massAxis.ActualMinimum + ((j + 1) * massStep);
            var treeFeatures = featureTree.Query(new RectangleF
            {
                X = (float)netMin,
                Y = (float)massMin,
                Height = (float)(massMax - massMin),
                Width = (float)(netMax - netMin)
            });

            var featureRange = treeFeatures.OrderByDescending(feat => feat.UMCLight.Abundance)
                .Take(this.featuresPerSection);
            featureHash.UnionWith(featureRange);
        }
    }

    return featureHash;
}
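// Sketch (self-contained): the partitioning strategy above, reduced to its core. The view is
// split into n x n sections and only the top "perSection" items by abundance are kept from
// each section. The GridFeature class is a stand-in for FeaturePoint/UMCLight, and the linear
// scan replaces the quad-tree query; both are assumptions for illustration.
public class GridFeature
{
    public double Net { get; set; }
    public double Mass { get; set; }
    public double Abundance { get; set; }
}

public static HashSet<GridFeature> TopFeaturesPerSection(
    List<GridFeature> features,
    double netMin, double netMax, double massMin, double massMax,
    int sectionsPerAxis, int perSection)
{
    var kept = new HashSet<GridFeature>();
    var netStep = (netMax - netMin) / sectionsPerAxis;
    var massStep = (massMax - massMin) / sectionsPerAxis;

    for (var i = 0; i < sectionsPerAxis; i++)
    {
        for (var j = 0; j < sectionsPerAxis; j++)
        {
            var nLo = netMin + (i * netStep);
            var nHi = nLo + netStep;
            var mLo = massMin + (j * massStep);
            var mHi = mLo + massStep;

            // Keep the most abundant features inside this grid cell.
            kept.UnionWith(features
                .Where(f => f.Net >= nLo && f.Net < nHi && f.Mass >= mLo && f.Mass < mHi)
                .OrderByDescending(f => f.Abundance)
                .Take(perSection));
        }
    }

    return kept;
}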
public classAlignmentData AlignToDataset(
    ref IList<UMCLight> features,
    IEnumerable<UMCLight> baselineFeatures,
    DatasetInformation datasetInfo,
    DatasetInformation baselineInfo)
{
    classAlignmentData alignmentData;
    if (baselineInfo == null)
    {
        throw new NullReferenceException("No reference was set for LC-MS alignment.");
    }

    // Align pairwise and cache results intermediately.
    var aligner = this.m_algorithms.DatasetAligner;
    alignmentData = aligner.Align(baselineFeatures, features);
    if (alignmentData != null)
    {
        alignmentData.aligneeDataset = datasetInfo.DatasetName;
        alignmentData.DatasetID = datasetInfo.DatasetId;
    }

    //var args = new FeaturesAlignedEventArgs(datasetInfo, baselineFeatures, features, alignmentData);
    return alignmentData;
}
/// <summary>
/// Loads baseline data for alignment.
/// </summary>
private IList<UMCLight> LoadBaselineData(DatasetInformation baselineInfo,
    MsFeatureFilteringOptions msFilterOptions,
    LcmsFeatureFindingOptions lcmsFindingOptions,
    LcmsFeatureFilteringOptions lcmsFilterOptions,
    FeatureDataAccessProviders dataProviders,
    MassTagDatabase database,
    bool shouldUseMassTagDbAsBaseline)
{
    IList<UMCLight> baselineFeatures = null;
    UpdateStatus("Loading baseline features.");
    if (!shouldUseMassTagDbAsBaseline)
    {
        if (baselineInfo == null)
        {
            throw new Exception("The baseline dataset was never set.");
        }

        var cache = new FeatureLoader
        {
            Providers = dataProviders
        };

        RegisterProgressNotifier(cache);
        UpdateStatus("Loading baseline features from " + baselineInfo.DatasetName + " for alignment.");
        baselineFeatures = cache.LoadDataset(baselineInfo, msFilterOptions, lcmsFindingOptions, lcmsFilterOptions);
        cache.CacheFeatures(baselineFeatures);
        if (BaselineFeaturesLoaded != null)
        {
            BaselineFeaturesLoaded(this, new BaselineFeaturesLoadedEventArgs(baselineInfo, baselineFeatures.ToList()));
        }

        DeRegisterProgressNotifier(cache);
    }
    else
    {
        if (database == null)
        {
            throw new NullReferenceException(
                "The mass tag database has to have data in it if it's being used for drift time alignment.");
        }

        UpdateStatus("Setting baseline features for post drift time alignment from mass tag database.");
        var tags = FeatureDataConverters.ConvertToUMC(database.MassTags);
        if (BaselineFeaturesLoaded == null)
        {
            return tags;
        }

        if (tags != null)
        {
            BaselineFeaturesLoaded(this, new BaselineFeaturesLoadedEventArgs(null, tags.ToList(), database));
        }
    }

    return baselineFeatures;
}
/// <summary>Get scatter points for MS features and a rectangle annotation for the LCMS feature.</summary>
/// <param name="feature">An LCMS feature.</param>
/// <param name="dataset">The dataset that the LCMS feature comes from.</param>
/// <returns>The tuple containing the LCMS feature annotation and the MS feature scatter points.</returns>
private Tuple<RectangleAnnotation, IEnumerable<ScatterPoint>> GetMsFeaturesAndAnnotations(FeaturePoint feature, DatasetInformation dataset)
{
    var msdataPoints = new List<ScatterPoint> { Capacity = feature.UMCLight.MsFeatures.Count };

    var minNet = double.PositiveInfinity;
    var maxNet = 0.0;
    var minMass = double.PositiveInfinity;
    var maxMass = 0.0;
    foreach (var msfeature in feature.UMCLight.MsFeatures)
    {
        var net = this.GetNet(dataset, msfeature.Scan);
        minNet = Math.Min(minNet, net);
        maxNet = Math.Max(maxNet, net);
        minMass = Math.Min(minMass, msfeature.MassMonoisotopic);
        maxMass = Math.Max(maxMass, msfeature.MassMonoisotopic);
        msdataPoints.Add(new ScatterPoint(net, msfeature.MassMonoisotopic, 0.8));
    }

    var netRange = maxNet - minNet;
    netRange = netRange.Equals(0.0) ? 0.01 : netRange;
    var massRange = maxMass - minMass;
    massRange = Math.Max(1.0, massRange);

    minNet = minNet - (0.25 * netRange);
    maxNet = maxNet + (0.25 * netRange);
    minMass = Math.Max(minMass - (massRange * 0.5), 0);
    maxMass = maxMass + (massRange * 0.5);

    var annotation = new RectangleAnnotation
    {
        MinimumX = minNet,
        MaximumX = maxNet,
        MinimumY = minMass,
        MaximumY = maxMass,
        Fill = OxyColors.Transparent,
        StrokeThickness = 1.0,
    };

    return new Tuple<RectangleAnnotation, IEnumerable<ScatterPoint>>(annotation, msdataPoints);
}
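// Worked example (illustrative, numbers assumed): the annotation box above pads the feature's
// extents so the rectangle does not hug the points. With minNet = 0.40 and maxNet = 0.44
// (range 0.04), the box spans 0.40 - 0.25*0.04 = 0.39 to 0.44 + 0.25*0.04 = 0.45. The mass
// range is padded by 50% on each side and clamped at 0, and degenerate ranges fall back to
// 0.01 NET / 1.0 Da so a single-point feature still gets a visible rectangle.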
/// <summary>
/// Finds features given a dataset
/// </summary>
private IList<UMCLight> FindFeatures(
    DatasetInformation information,
    LcmsFeatureFindingOptions featureFindingOptions,
    MsFeatureFilteringOptions msFilterOptions,
    LcmsFeatureFilteringOptions lcmsFilterOptions,
    SpectralOptions peptideOptions,
    IFeatureFinder featureFinder)
{
    UpdateStatus("Loading baseline features.");
    var msFeatures = UmcLoaderFactory.LoadMsFeatureData(information.Features.Path);
    msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilterOptions);

    // Load the baseline reference set
    using (var rawProviderX = RawLoaderFactory.CreateFileReader(information.RawPath))
    {
        rawProviderX.AddDataFile(information.RawPath, 0);
        UpdateStatus("Creating LCMS Features.");
        var features = featureFinder.FindFeatures(msFeatures, featureFindingOptions, rawProviderX);
        features = LcmsFeatureFilters.FilterFeatures(features, lcmsFilterOptions);

        var datasetId = information.DatasetId;
        foreach (var feature in features)
        {
            var lightEntry = new List<MSFeatureLight>();
            feature.GroupId = datasetId;
            foreach (var msFeature in feature.MsFeatures)
            {
                msFeature.GroupId = datasetId;
                foreach (var msmsFeature in msFeature.MSnSpectra)
                {
                    msmsFeature.GroupId = datasetId;
                    foreach (var peptide in msmsFeature.Peptides)
                    {
                        peptide.GroupId = datasetId;
                    }
                }

                if (msFeature.MSnSpectra.Count > 0)
                {
                    lightEntry.Add(msFeature);
                }
            }

            // We are doing this so that we don't have a ton of MS features in the database
            feature.MsFeatures.Clear();
            feature.MsFeatures.AddRange(lightEntry);
        }

        LinkPeptidesToFeatures(information.SequencePath, features, peptideOptions.Fdr, peptideOptions.IdScore);
        DeRegisterProgressNotifier(featureFinder);
        return features;
    }
}
/// <summary>
/// Adds a new dataset to the list.
/// </summary>
/// <param name="inputFiles"></param>
/// <returns>A list of added datasets</returns>
public List<DatasetInformation> AddInputFiles(List<InputFile> inputFiles)
{
    var addedSets = new List<DatasetInformation>();
    var datasetMap = new Dictionary<string, DatasetInformation>();
    foreach (var x in Datasets)
    {
        datasetMap.Add(x.DatasetName, x);
    }

    var inputMap = new Dictionary<string, List<InputFile>>();
    foreach (var file in inputFiles)
    {
        var name = Path.GetFileName(file.Path);
        var datasetName = DatasetInformation.ExtractDatasetName(name);
        var isEntryMade = inputMap.ContainsKey(datasetName);
        if (!isEntryMade)
        {
            inputMap.Add(datasetName, new List<InputFile>());
        }

        inputMap[datasetName].Add(file);
    }

    var i = 0;
    foreach (var datasetName in inputMap.Keys)
    {
        var files = inputMap[datasetName];
        var datasetInformation = new DatasetInformation();
        datasetInformation.DatasetId = i++;
        datasetInformation.DatasetName = datasetName;

        var doesDatasetExist = datasetMap.ContainsKey(datasetName);

        // Here we map the old dataset if it existed already.
        if (datasetMap.ContainsKey(datasetName))
        {
            datasetInformation = datasetMap[datasetName];
        }

        foreach (var file in files)
        {
            switch (file.FileType)
            {
                case InputFileType.Features:
                    datasetInformation.Features = file;
                    datasetInformation.Path = file.Path;
                    break;
                case InputFileType.Scans:
                    datasetInformation.Scans = file;
                    break;
                case InputFileType.Raw:
                    datasetInformation.Raw = file;
                    break;
                case InputFileType.Sequence:
                    datasetInformation.Sequence = file;
                    break;
            }
        }

        // Add the dataset
        if (!doesDatasetExist)
        {
            addedSets.Add(datasetInformation);
            Datasets.Add(datasetInformation);
        }
    }

    // Reformat their Id's
    var id = 0;
    foreach (var x in Datasets)
    {
        x.DatasetId = id++;
    }

    return addedSets;
}
/// <summary>
/// Runs the MultiAlign analysis
/// </summary>
public void PerformMultiAlignAnalysis(DatasetInformation baselineDataset,
    IEnumerable<DatasetInformation> aligneeDatasets,
    LcmsFeatureFindingOptions featureFindingOptions,
    MsFeatureFilteringOptions msFilterOptions,
    LcmsFeatureFilteringOptions lcmsFilterOptions,
    SpectralOptions peptideOptions,
    IFeatureFinder featureFinder,
    IFeatureAligner<IEnumerable<UMCLight>, IEnumerable<UMCLight>, classAlignmentData> aligner,
    IClusterer<UMCLight, UMCClusterLight> clusterer,
    string matchPath,
    string errorPath)
{
    UpdateStatus("Loading baseline features.");
    var msFeatures = UmcLoaderFactory.LoadMsFeatureData(baselineDataset.Features.Path);
    msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilterOptions);

    // Load the baseline reference set
    using (var rawProviderX = RawLoaderFactory.CreateFileReader(baselineDataset.RawPath))
    {
        rawProviderX.AddDataFile(baselineDataset.RawPath, 0);
        UpdateStatus("Creating Baseline LCMS Features.");
        var baselineFeatures = featureFinder.FindFeatures(msFeatures, featureFindingOptions, rawProviderX);
        LinkPeptidesToFeatures(baselineDataset.SequencePath, baselineFeatures, peptideOptions.Fdr, peptideOptions.IdScore);

        var providerX = new CachedFeatureSpectraProvider(rawProviderX, baselineFeatures);

        // Then load the alignee dataset
        foreach (var dataset in aligneeDatasets)
        {
            var aligneeMsFeatures = UmcLoaderFactory.LoadMsFeatureData(dataset.Features.Path);
            aligneeMsFeatures = LcmsFeatureFilters.FilterMsFeatures(aligneeMsFeatures, msFilterOptions);

            using (var rawProviderY = RawLoaderFactory.CreateFileReader(dataset.RawPath))
            {
                rawProviderY.AddDataFile(dataset.RawPath, 0);

                UpdateStatus("Finding alignee features");
                var aligneeFeatures = featureFinder.FindFeatures(aligneeMsFeatures, featureFindingOptions, rawProviderY);
                LinkPeptidesToFeatures(dataset.SequencePath, aligneeFeatures, peptideOptions.Fdr, peptideOptions.IdScore);

                var providerY = new CachedFeatureSpectraProvider(rawProviderY, aligneeFeatures);

                // Cluster before we do anything else....
                var allFeatures = new List<UMCLight>();
                allFeatures.AddRange(baselineFeatures);
                allFeatures.AddRange(aligneeFeatures);
                foreach (var feature in allFeatures)
                {
                    feature.NetAligned = feature.Net;
                    feature.MassMonoisotopicAligned = feature.MassMonoisotopic;
                }

                // This tells us the differences before we align.
                var clusters = clusterer.Cluster(allFeatures);
                var preAlignment = AnalyzeClusters(clusters);

                aligner.AligneeSpectraProvider = providerY;
                aligner.BaselineSpectraProvider = providerX;

                UpdateStatus("Aligning data");
                // Aligner data
                var data = aligner.Align(baselineFeatures, aligneeFeatures);
                var matches = data.Matches;

                WriteErrors(errorPath, matches);

                // Create anchor points for LCMSWarp alignment
                var massPoints = new List<RegressionPoint>();
                var netPoints = new List<RegressionPoint>();
                foreach (var match in matches)
                {
                    var massError = FeatureLight.ComputeMassPPMDifference(match.AnchorPointX.Mz, match.AnchorPointY.Mz);
                    var netError = match.AnchorPointX.Net - match.AnchorPointY.Net;
                    var massPoint = new RegressionPoint(match.AnchorPointX.Mz, 0, massError, netError);
                    massPoints.Add(massPoint);

                    var netPoint = new RegressionPoint(match.AnchorPointX.Net, 0, massError, netError);
                    netPoints.Add(netPoint);
                }

                foreach (var feature in allFeatures)
                {
                    feature.UmcCluster = null;
                    feature.ClusterId = -1;
                }

                // Then cluster after alignment!
                UpdateStatus("clustering data");
                clusters = clusterer.Cluster(allFeatures);
                var postAlignment = AnalyzeClusters(clusters);

                UpdateStatus("Note\tSame\tDifferent");
                UpdateStatus(string.Format("Pre\t{0}\t{1}", preAlignment.SameCluster, preAlignment.DifferentCluster));
                UpdateStatus(string.Format("Post\t{0}\t{1}", postAlignment.SameCluster, postAlignment.DifferentCluster));

                SaveMatches(matchPath, matches);
            }
        }
    }

    DeRegisterProgressNotifier(aligner);
    DeRegisterProgressNotifier(featureFinder);
    DeRegisterProgressNotifier(clusterer);
}
/// <summary>
/// Arguments that hold dataset information when features are loaded.
/// </summary>
/// <param name="info">Dataset information object</param>
public FeaturesLoadedEventArgs(DatasetInformation info, IList<UMCLight> features)
{
    DatasetInformation = info;
    Features = features;
}
/// <summary>
/// Filters the list of MS Features that may be from MS/MS deisotoped data.
/// </summary>
public List<MSFeatureLight> Filter(List<MSFeatureLight> msFeatures, IScanSummaryProvider provider, ref DatasetInformation dataset)
{
    string rawPath = dataset.RawFile.Path;
    if (string.IsNullOrWhiteSpace(rawPath))
    {
        return msFeatures;
    }

    // First find all unique scans
    var scanMap = new Dictionary<int, bool>();
    foreach (var feature in msFeatures)
    {
        if (!scanMap.ContainsKey(feature.Scan))
        {
            // Assume all scans are parents
            scanMap.Add(feature.Scan, true);
        }
    }

    // Then parse each to figure out if this is true.
    var fullScans = new Dictionary<int, bool>();
    var scanTimes = dataset.ScanTimes;

    if (provider == null)
    {
        UpdateStatus(string.Format("Warning: Raw file not found ({0}); scan times are not available!", System.IO.Path.GetFileName(rawPath)));
    }
    else
    {
        UpdateStatus(string.Format("Reading scan info from {0}", System.IO.Path.GetFileName(rawPath)));

        foreach (var scan in scanMap.Keys)
        {
            ScanSummary summary = provider.GetScanSummary(scan);
            if (summary == null)
            {
                continue;
            }

            if (summary.MsLevel == 1)
            {
                fullScans.Add(scan, true);
            }

            if (scanTimes.ContainsKey(scan))
            {
                scanTimes[scan] = summary.Time;
            }
            else
            {
                scanTimes.Add(scan, summary.Time);
            }
        }

        dataset.ScanTimes = scanTimes;
    }

    return msFeatures.Where(x => fullScans.ContainsKey(x.Scan)).ToList();
}
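// Sketch (hedged): the scan-summary walk above, in isolation. It uses only the calls the
// method already makes (GetScanSummary, MsLevel, Time) to collect the times of MS1 scans;
// the method name and shape are assumptions for illustration.
private static Dictionary<int, double> CollectMs1ScanTimes(IScanSummaryProvider provider, IEnumerable<int> scans)
{
    var scanTimes = new Dictionary<int, double>();
    foreach (var scan in scans)
    {
        var summary = provider.GetScanSummary(scan);
        if (summary == null || summary.MsLevel != 1)
        {
            continue; // skip missing summaries and fragmentation (MSn) scans
        }

        scanTimes[scan] = summary.Time;
    }

    return scanTimes;
}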
public void TestCreateDummyDatabase(string databasePath, int totalDatasets, int totalClusters)
{
    File.Delete(databasePath);
    NHibernateUtil.ConnectToDatabase(databasePath, true);

    IDatasetDAO datasetCache = new DatasetDAOHibernate();
    IUmcClusterDAO clusterCache = new UmcClusterDAOHibernate();
    IUmcDAO featureCache = new UmcDAOHibernate();

    // Creating a dataset
    Console.WriteLine("Creating dummy datasets");
    var datasets = new List<DatasetInformation>();
    var total = totalDatasets;
    for (var i = 0; i < total; i++)
    {
        var dataset = new DatasetInformation();
        dataset.DatasetId = i;
        dataset.DatasetName = "test" + i;
        datasets.Add(dataset);
    }

    datasetCache.AddAll(datasets);
    datasets.Clear();
    datasets = datasetCache.FindAll();

    // Create features
    Console.WriteLine("Creating features");
    var features = new List<UMCLight>();
    var clusters = new List<UMCClusterLight>();
    var x = new Random();
    var featureId = 0;
    for (var i = 0; i < totalClusters; i++)
    {
        var N = x.Next(1, total);
        var charge = x.Next(1, 10);
        var hash = new HashSet<int>();
        var net = x.NextDouble();
        var mass = 400 + (1600 * x.NextDouble());
        var dt = 60 * x.NextDouble();

        for (var j = 0; j < N; j++)
        {
            var did = -1;
            do
            {
                did = x.Next(0, total);
                if (!hash.Contains(did))
                {
                    hash.Add(did);
                    break;
                }
            } while (true);

            var feature = new UMCLight();
            feature.GroupId = did;
            feature.Id = featureId++;
            feature.ChargeState = charge;
            feature.MassMonoisotopic = FeatureLight.ComputeDaDifferenceFromPPM(mass, 3);
            feature.MassMonoisotopicAligned = feature.MassMonoisotopic;
            feature.Net = net + .03 * x.NextDouble();
            feature.NetAligned = feature.Net;
            feature.DriftTime = dt;
            feature.AbundanceSum = x.Next(100, 200);
            feature.ClusterId = -1;
            features.Add(feature);
        }
    }

    featureCache.AddAll(features);
}
/// <summary>
/// Make sure the value for options.MaximumScanRange, which is used by the Feature Finder,
/// is at least as large as the filterOptions.FeatureLengthRange.Maximum value,
/// which is used for filtering the features by length
/// </summary>
/// <param name="information"></param>
/// <param name="options"></param>
/// <param name="filterOptions"></param>
private static void ValidateFeatureFinderMaxScanLength(
    DatasetInformation information,
    LcmsFeatureFindingOptions options,
    LcmsFeatureFilteringOptions filterOptions)
{
    if (!filterOptions.TreatAsTimeNotScan)
    {
        if (options.MaximumScanRange < filterOptions.FeatureLengthRange.Maximum)
        {
            // Bump up the scan range used by the LCMS Feature Finder to allow for longer features
            options.MaximumScanRange = (int)filterOptions.FeatureLengthRange.Maximum;
        }

        return;
    }

    int maxScanLength;
    if (information.ScanTimes.Count == 0)
    {
        // FeatureLengthRange.Maximum is in minutes
        // Assume 3 scans/second (ballpark estimate)
        maxScanLength = (int)filterOptions.FeatureLengthRange.Maximum * 60 * 3;
    }
    else
    {
        // Find the average number of scans that spans FeatureLengthRange.Maximum minutes
        // Step through the dictionary to find the average number of scans per minute
        var minuteThreshold = 1;
        var scanCountCurrent = 0;
        var scanCountsPerMinute = new List<int>();

        foreach (var entry in information.ScanTimes)
        {
            if (entry.Value < minuteThreshold)
            {
                scanCountCurrent++;
            }
            else
            {
                if (scanCountCurrent > 0)
                {
                    scanCountsPerMinute.Add(scanCountCurrent);
                }

                scanCountCurrent = 0;
                minuteThreshold++;
            }
        }

        int averageScansPerMinute;
        if (scanCountsPerMinute.Count > 0)
        {
            averageScansPerMinute = (int)scanCountsPerMinute.Average();
        }
        else
        {
            averageScansPerMinute = 180;
        }

        maxScanLength = (int)(filterOptions.FeatureLengthRange.Maximum * averageScansPerMinute * 1.25);
    }

    if (options.MaximumScanRange < maxScanLength)
    {
        // Bump up the scan range used by the LCMS Feature Finder to allow for longer features
        options.MaximumScanRange = maxScanLength;
    }
}
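// Worked sketch (self-contained): the scans-per-minute estimate above, in isolation. Scan
// times (in minutes) are walked once; each time the next whole-minute threshold is crossed,
// the count for the finished minute is recorded, and the average is taken at the end. The
// fallback of 180 scans/minute matches the 3 scans/second ballpark used above.
public static int AverageScansPerMinute(IEnumerable<double> scanTimesInMinutes, int fallback = 180)
{
    var minuteThreshold = 1;
    var scanCountCurrent = 0;
    var counts = new List<int>();

    foreach (var time in scanTimesInMinutes)
    {
        if (time < minuteThreshold)
        {
            scanCountCurrent++;
        }
        else
        {
            if (scanCountCurrent > 0)
            {
                counts.Add(scanCountCurrent);
            }

            scanCountCurrent = 0;
            minuteThreshold++;
        }
    }

    return counts.Count > 0 ? (int)counts.Average() : fallback;
}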
public void TestClusterGeneration(string databasePath, string crossPath, int charge, int minimumClusterSize)
{
    File.Delete(databasePath);
    NHibernateUtil.ConnectToDatabase(databasePath, true);

    IDatasetDAO datasetCache = new DatasetDAOHibernate();
    IUmcClusterDAO clusterCache = new UmcClusterDAOHibernate();
    IUmcDAO featureCache = new UmcDAOHibernate();

    // Creating a dataset
    Console.WriteLine("Creating dummy datasets");
    var datasets = new List<DatasetInformation>();
    var total = 10;
    for (var i = 0; i < total; i++)
    {
        var dataset = new DatasetInformation();
        dataset.DatasetId = i;
        dataset.DatasetName = "test" + i;
        datasets.Add(dataset);
    }

    datasetCache.AddAll(datasets);
    datasets.Clear();
    datasets = datasetCache.FindAll();

    // Create features
    Console.WriteLine("Creating features");
    var features = new List<UMCLight>();
    var clusters = new List<UMCClusterLight>();
    var x = new Random();
    var featureId = 0;
    for (var i = 0; i < 100; i++)
    {
        var cluster = new UMCClusterLight();
        cluster.Id = i;
        cluster.AmbiguityScore = i;
        cluster.Tightness = i;
        cluster.ChargeState = charge;

        var N = x.Next(1, total);
        var hash = new HashSet<int>();
        for (var j = 0; j < N; j++)
        {
            var did = -1;
            do
            {
                did = x.Next(0, total);
                if (!hash.Contains(did))
                {
                    hash.Add(did);
                    break;
                }
            } while (true);

            var feature = new UMCLight();
            feature.GroupId = did;
            feature.Id = featureId++;
            feature.ChargeState = charge;
            feature.MassMonoisotopic = x.NextDouble();
            feature.Net = x.NextDouble();
            feature.AbundanceSum = x.Next(100, 200);
            feature.ClusterId = cluster.Id;

            cluster.AddChildFeature(feature);
            features.Add(feature);
        }

        cluster.CalculateStatistics(ClusterCentroidRepresentation.Mean);
        clusters.Add(cluster);
    }

    featureCache.AddAll(features);
    clusterCache.AddAll(clusters);
    clusters = clusterCache.FindAll();

    Console.WriteLine("Find all clusters");
    clusters = clusterCache.FindByCharge(charge);

    WriteClusters(datasets, clusters, minimumClusterSize, charge, crossPath, databasePath, 300000);
}
private IList<UMCLight> AlignDataset(
    IList<UMCLight> features,
    IEnumerable<UMCLight> baselineFeatures,
    MassTagDatabase database,
    DatasetInformation datasetInfo,
    DatasetInformation baselineInfo)
{
    classAlignmentData alignmentData;
    if (baselineInfo == null && database == null)
    {
        throw new NullReferenceException("No reference was set for LC-MS alignment.");
    }

    // Align the data.
    if (baselineFeatures != null && baselineInfo != null && baselineInfo.IsBaseline)
    {
        // Align pairwise and cache results intermediately.
        var aligner = m_algorithms.DatasetAligner;
        RegisterProgressNotifier(aligner);
        UpdateStatus("Aligning " + datasetInfo.DatasetName + " to baseline.");
        alignmentData = aligner.Align(baselineFeatures, features);
        DeRegisterProgressNotifier(aligner);
    }
    else
    {
        // Align pairwise and cache results intermediately.
        var aligner = m_algorithms.DatabaseAligner;
        RegisterProgressNotifier(aligner);
        UpdateStatus("Aligning " + datasetInfo.DatasetName + " to mass tag database.");
        alignmentData = aligner.Align(database, features);
        DeRegisterProgressNotifier(aligner);
    }

    if (alignmentData != null)
    {
        alignmentData.aligneeDataset = datasetInfo.DatasetName;
        alignmentData.DatasetID = datasetInfo.DatasetId;
    }

    var args = new FeaturesAlignedEventArgs(datasetInfo, baselineFeatures, features, alignmentData);
    if (FeaturesAligned != null)
    {
        FeaturesAligned(this, args);
    }

    UpdateStatus("Updating cache with aligned features.");
    return features;
}
private void ExportAlignmentData(classAlignmentData data,
    DatasetInformation baselineDatasetInformation,
    DatasetInformation alignDatasetInformation,
    IEnumerable<UMCLight> baselineFeatures,
    IEnumerable<UMCLight> aligneeFeatures)
{
    var netValues = new List<double>();
    var massValues = new List<double>();

    var anchorPoints = data.Matches;
    foreach (var match in anchorPoints)
    {
        netValues.Add(match.AnchorPointX.Net - match.AnchorPointY.Net);
        massValues.Add(match.AnchorPointX.Mass - match.AnchorPointY.Mass);
    }

    var netHist = MatchCountHistogramBuilder.CreateResidualHistogram(-.05, .05, .01, netValues);
    var netHistogram = new Dictionary<double, int>();

    Console.WriteLine();
    for (var i = 0; i < netHist.Bins.Count; i++)
    {
        netHistogram.Add(netHist.Bins[i], Convert.ToInt32(netHist.Data[i]));
        Console.WriteLine("{0}\t{1}", netHist.Bins[i], netHist.Data[i]);
    }
}
/// <summary>
/// Load a single dataset from the provider.
/// </summary>
/// <returns></returns>
public IList<UMCLight> LoadDataset(DatasetInformation dataset,
    MsFeatureFilteringOptions msFilteringOptions,
    LcmsFeatureFindingOptions lcmsFindingOptions,
    LcmsFeatureFilteringOptions lcmsFilteringOptions,
    DataLoadingOptions dataLoadOptions,
    ScanSummaryProviderCache providerCache,
    IdentificationProviderCache identificationProviders,
    IProgress<ProgressData> progress = null)
{
    var progData = new ProgressData(progress);

    IScanSummaryProvider provider = null;
    if (!string.IsNullOrWhiteSpace(dataset.RawFile.Path))
    {
        UpdateStatus("Using raw data to create better features.");
        provider = providerCache.GetScanSummaryProvider(dataset.RawFile.Path, dataset.DatasetId);
    }

    progData.StepRange(1);
    progData.Status = "Looking for existing features in the database.";
    UpdateStatus(string.Format("[{0}] - Loading dataset [{0}] - {1}.", dataset.DatasetId, dataset.DatasetName));
    var datasetId = dataset.DatasetId;
    var features = UmcLoaderFactory.LoadUmcFeatureData(dataset, Providers.FeatureCache, provider);
    var hasMsFeatures = features.Any(f => f.MsFeatures.Any());

    var msFeatures = new List<MSFeatureLight>();
    if (!hasMsFeatures)
    {
        progData.StepRange(2);
        progData.Status = "Loading MS Feature Data.";
        UpdateStatus(string.Format("[{0}] Loading MS Feature Data [{0}] - {1}.", dataset.DatasetId, dataset.DatasetName));
        var isosFilterOptions = dataLoadOptions.GetIsosFilterOptions();
        msFeatures = UmcLoaderFactory.LoadMsFeatureData(dataset.Features.Path, isosFilterOptions);
    }

    progData.StepRange(3);
    progData.Status = "Loading scan summaries.";
    ////var scansInfo = UmcLoaderFactory.LoadScanSummaries(dataset.Scans.Path);
    ////dataset.BuildScanTimes(scansInfo);

    progData.StepRange(100);

    var msnSpectra = new List<MSSpectra>();

    // If we don't have any features, then we have to create some from the MS features
    // provided to us.
    if (features.Count < 1)
    {
        msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilteringOptions);
        msFeatures = Filter(msFeatures, provider, ref dataset);

        progData.Status = "Creating LCMS features.";
        features = CreateLcmsFeatures(dataset,
            msFeatures,
            lcmsFindingOptions,
            lcmsFilteringOptions,
            provider,
            new Progress<ProgressData>(pd => progData.Report(pd.Percent)));

        //var maxScan = Convert.ToDouble(features.Max(feature => feature.Scan));
        //var minScan = Convert.ToDouble(features.Min(feature => feature.Scan));
        var maxScan = features.Max(feature => feature.Scan);
        var minScan = features.Min(feature => feature.Scan);
        var id = 0;
        var scanTimes = dataset.ScanTimes;

        foreach (var feature in features)
        {
            feature.Id = id++;
            //feature.Net = (Convert.ToDouble(feature.Scan) - minScan) / (maxScan - minScan);
            feature.Net = (Convert.ToDouble(scanTimes[feature.Scan]) - scanTimes[minScan]) / (scanTimes[maxScan] - scanTimes[minScan]);
            feature.MassMonoisotopicAligned = feature.MassMonoisotopic;
            feature.NetAligned = feature.Net;
            feature.GroupId = datasetId;
            feature.SpectralCount = feature.MsFeatures.Count;

            foreach (var msFeature in feature.MsFeatures.Where(msFeature => msFeature != null))
            {
                msFeature.UmcId = feature.Id;
                msFeature.GroupId = datasetId;
                msFeature.MSnSpectra.ForEach(x => x.GroupId = datasetId);
                msnSpectra.AddRange(msFeature.MSnSpectra);
            }
        }
    }
    else
    {
        if (!UmcLoaderFactory.AreExistingFeatures(dataset.Features.Path))
        {
            var i = 0;
            foreach (var feature in features)
            {
                feature.GroupId = datasetId;
                feature.Id = i++;
            }
        }

        // Otherwise, we need to map the MS features to the LCMS Features provided.
        // This would mean that we extracted data from an existing database.
        if (msFeatures.Count > 0)
        {
            var map = FeatureDataConverters.MapFeature(features);
            foreach (var feature in
                from feature in msFeatures
                let doesFeatureExists = map.ContainsKey(feature.UmcId)
                where doesFeatureExists
                select feature)
            {
                map[feature.UmcId].AddChildFeature(feature);
            }
        }
    }

    //if (provider is ISpectraProvider)
    //{
    //    var spectraProvider = provider as ISpectraProvider;
    //    UmcLoaderFactory.LoadMsMs(features.ToList(), spectraProvider);
    //}

    // Process the MS/MS data with peptides
    if (dataset.SequenceFile != null && !string.IsNullOrEmpty(dataset.SequenceFile.Path))
    {
        UpdateStatus("Reading List of Peptides");
        var idProvider = identificationProviders.GetProvider(dataset.SequenceFile.Path, dataset.DatasetId);
        var peptideList = idProvider.GetAllIdentifications();

        UpdateStatus("Linking MS/MS to any known Peptide/Metabolite Sequences");
        var linker = new PeptideMsMsLinker();
        linker.LinkPeptidesToSpectra(msnSpectra, peptideList);
    }

    progData.Report(100);

    return features;
}
private void ReportPeptideFeatures(DatasetInformation information, IEnumerable<UMCLight> features)
{
    if (!m_config.ShouldCreatePeptideScanFiles)
    {
        return;
    }

    var path = Path.Combine(m_config.AnalysisPath, information.DatasetName + "_peptide_scans.csv");
    var writer = new PeptideScanWriter();
    writer.Write(path, features);
}
public BaselineFeaturesLoadedEventArgs(DatasetInformation info, List<UMCLight> features, MassTagDatabase database)
    : base(info, features)
{
    Database = database;
}
/// <summary>
/// Partition the current view into (numSectionsPerAxis)^2 sections and select the top
/// "featuresPerSection" in each section.
/// </summary>
/// <param name="dataset">Dataset to get feature points for.</param>
/// <param name="globalMax">The maximum mass in all datasets.</param>
/// <param name="showMsFeatures">A value indicating whether points with MS features should be returned.</param>
/// <returns>
/// Collection of datapoints for features.
/// Item 1: LCMS feature datapoints. Item2: MS Feature datapoints.
/// </returns>
private Tuple<IEnumerable<DataPoint>, IEnumerable<ScatterPoint>> GetPartitionedPoints(DatasetInformation dataset, double globalMax, bool showMsFeatures = false)
{
    var netActMaximum = this.netAxis.ActualMaximum.Equals(0) ? 1.0 : this.netAxis.ActualMaximum;
    var massActMaximum = this.massAxis.ActualMaximum.Equals(0) ? globalMax : this.massAxis.ActualMaximum;

    var netStep = (netActMaximum - this.netAxis.ActualMinimum) / this.numSectionsPerAxis;
    var massStep = (massActMaximum - this.massAxis.ActualMinimum) / this.numSectionsPerAxis;

    var featureHash = new HashSet<FeaturePoint>();
    var featureTree = this.quadTrees[dataset];

    for (int i = 0; i < this.numSectionsPerAxis; i++)
    {
        var netMin = this.netAxis.ActualMinimum + (i * netStep);
        var netMax = this.netAxis.ActualMinimum + ((i + 1) * netStep);
        for (int j = 0; j < this.numSectionsPerAxis; j++)
        {
            var massMin = this.massAxis.ActualMinimum + (j * massStep);
            var massMax = this.massAxis.ActualMinimum + ((j + 1) * massStep);
            var treeFeatures = featureTree.Query(new RectangleF
            {
                X = (float)netMin,
                Y = (float)massMin,
                Height = (float)(massMax - massMin),
                Width = (float)(netMax - netMin)
            });

            var featureRange = treeFeatures.OrderByDescending(feat => feat.UMCLight.Abundance)
                .Take(this.featuresPerSection);
            featureHash.UnionWith(featureRange);
        }
    }

    return this.GetPoints(featureHash, showMsFeatures);
}
public void CreateUMCClusterLight(string databasePath, bool indexDatabase)
{
    // If the database is not indexed then do so...but before the session to the db is opened.
    if (indexDatabase)
    {
        DatabaseIndexer.IndexClusters(databasePath);
        DatabaseIndexer.IndexFeatures(databasePath);
    }

    // This is a factory based method that creates a set of data access providers used throughout MultiAlign
    var providers = DataAccessFactory.CreateDataAccessProviders(databasePath, false);

    // If you just wanted the clusters you could do this:
    // 1. Connect to the database
    //NHibernateUtil.ConnectToDatabase(databasePath, false);
    // 2. Then extract all of the clusters
    //IUmcClusterDAO clusterCache = new UmcClusterDAOHibernate();
    //List<UMCClusterLight> clusters = clusterCache.FindAll();

    var clusters = providers.ClusterCache.FindAll();

    var shouldGetMsFeatures = true;
    var shouldGetMsMsFeatures = true;
    var shouldGetRawData = false;

    // This gets all of the dataset information and maps it to a dictionary...if you want the raw data;
    // otherwise comment this out.
    var datasets = providers.DatasetCache.FindAll();
    var datasetMap = new Dictionary<int, DatasetInformation>();
    datasets.ForEach(x => datasetMap.Add(x.DatasetId, x));

    foreach (var cluster in clusters)
    {
        cluster.ReconstructUMCCluster(providers, true, false, shouldGetMsFeatures, shouldGetMsMsFeatures);
        foreach (var feature in cluster.Features)
        {
            foreach (var msFeature in feature.Features)
            {
                foreach (var spectrumMetaData in msFeature.MSnSpectra)
                {
                    // Then you can do stuff with the MS/MS spectra.
                    // If you had the path to the raw file, you could create a reader to extract the MS/MS spectra.
                    // This supports mzXML and .RAW Thermo files based on the file extension.
                    if (shouldGetRawData)
                    {
                        DatasetInformation info = null;
                        var hasKey = datasetMap.TryGetValue(spectrumMetaData.GroupId, out info);
                        if (hasKey)
                        {
                            if (info.RawFile != null)
                            {
                                // This might seem kind of klunky, but it's called a bridge: this way I can access
                                // MS/MS spectra from PNNLOmics without having to reference any of the Thermo DLLs
                                // or support file reading capability. This is also nice because I don't have to load
                                // several MS/MS spectra when analyzing large datasets for my spectral clustering work.
                                var rawReader = new InformedProteomicsReader(spectrumMetaData.GroupId, info.RawFile.Path);

                                // Then grab the actual spectrum...
                                var summary = new ScanSummary();
                                var spectrum = rawReader.GetRawSpectra(spectrumMetaData.Scan, 2, out summary);

                                // Then do what you want...
                                // Profit???
                            }
                        }
                    }
                }
            }
        }
    }
}
public BaselineFeaturesLoadedEventArgs(DatasetInformation info, List<UMCLight> features)
    : base(info, features)
{
    Database = null;
}
/// <summary>
/// Handles converting the rows to factor objects.
/// </summary>
/// <param name="sender"></param>
/// <param name="args"></param>
public void HandleDataRow(object sender, MageDataEventArgs args)
{
    if (args == null)
    {
        throw new NullReferenceException("The factors are invalid.");
    }

    if (args.Fields == null)
    {
        return;
    }

    if (args.Fields.Length < 4)
    {
        return;
    }

    var datasetName = "";
    if (m_columnMapping.ContainsKey("Dataset"))
    {
        datasetName = Convert.ToString(args.Fields[m_columnMapping["Dataset"]]).ToLower().Replace("\"", "");
    }
    else
    {
        return;
    }

    var datasetId = -1;
    if (m_columnMapping.ContainsKey("Dataset_ID"))
    {
        datasetId = Convert.ToInt32(args.Fields[m_columnMapping["Dataset_ID"]].ToString().Replace("\"", ""));
    }
    else
    {
        return;
    }

    var factor = "";
    if (m_columnMapping.ContainsKey("Factor"))
    {
        factor = Convert.ToString(args.Fields[m_columnMapping["Factor"]]).Replace("\"", "");
    }
    else
    {
        return;
    }

    var value = "";
    if (m_columnMapping.ContainsKey("Value"))
    {
        value = Convert.ToString(args.Fields[m_columnMapping["Value"]]).Replace("\"", "");
    }
    else
    {
        return;
    }

    var factorMap = new ExperimentalFactor();
    factorMap.Value = value;
    factorMap.Name = factor;

    DatasetInformation info = null;

    // Update the dataset ID.
    if (m_datasets.ContainsKey(datasetName))
    {
        info = m_datasets[datasetName];
        m_datasets[datasetName].DMSDatasetID = datasetId;
    }
    else
    {
        return;
    }

    // Make sure we haven't seen this factor map before.
    var shouldAdd = true;
    if (m_factorMaps.ContainsKey(factor))
    {
        if (m_factorMaps[factor].ContainsKey(value))
        {
            shouldAdd = false;
        }
    }
    else
    {
        m_factorMaps.Add(factor, new Dictionary<string, int>());
    }

    var factorID = 0;

    // Add it to the list and map of factors to dump into the database.
    if (shouldAdd)
    {
        factorMap.FactorID = m_factorCount++;
        m_factorMaps[factor].Add(value, factorMap.FactorID);
        factorID = factorMap.FactorID;
        m_factors.Add(factorMap);
    }
    else
    {
        factorID = m_factorMaps[factor][value];
    }

    var datasetFactorMap = new DatasetToExperimentalFactorMap();
    datasetFactorMap.DatasetID = info.DatasetId;
    datasetFactorMap.FactorID = factorID;
    m_factorAssignments.Add(datasetFactorMap);
}
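// Sketch (self-contained): the factor de-duplication above, reduced to a nested map. A
// (factor name, value) pair is assigned a new id only the first time it is seen; later rows
// reuse the stored id. The method name and ref-counter shape are assumptions for illustration.
private static int GetOrAddFactorId(
    Dictionary<string, Dictionary<string, int>> factorMaps,
    string factor, string value, ref int nextId)
{
    Dictionary<string, int> values;
    if (!factorMaps.TryGetValue(factor, out values))
    {
        values = new Dictionary<string, int>();
        factorMaps.Add(factor, values);
    }

    int id;
    if (!values.TryGetValue(value, out id))
    {
        id = nextId++;
        values.Add(value, id);
    }

    return id;
}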
public void ClusterMsMs(string name, string resultPath, string sequencePath, SequenceFileType type,
    string baseline, string features, double percent)
{
    var baselineRaw = baseline.Replace("_isos.csv", ".raw");
    var featuresRaw = features.Replace("_isos.csv", ".raw");

    Console.WriteLine("Create Baseline Information");
    var baselineInfo = new DatasetInformation
    {
        DatasetId = 0,
        Features = new InputFile { Path = baseline },
        Raw = new InputFile { Path = baselineRaw },
        Sequence = new InputFile { Path = sequencePath }
    };

    Console.WriteLine("Create Alignee Information");
    var aligneeInfo = new DatasetInformation
    {
        DatasetId = 1,
        Features = new InputFile { Path = features },
        Raw = new InputFile { Path = featuresRaw },
        Sequence = new InputFile { Path = sequencePath }
    };

    var reader = new MsFeatureLightFileReader();

    Console.WriteLine("Reading Baseline Features");
    var baselineMsFeatures = reader.ReadFile(baseline).ToList();
    baselineMsFeatures.ForEach(x => x.GroupId = baselineInfo.DatasetId);

    Console.WriteLine("Reading Alignee Features");
    var aligneeMsFeatures = reader.ReadFile(features).ToList();
    aligneeMsFeatures.ForEach(x => x.GroupId = aligneeInfo.DatasetId);

    var finder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);
    var tolerances = new FeatureTolerances
    {
        Mass = 8,
        Net = .005
    };
    var options = new LcmsFeatureFindingOptions(tolerances);

    Console.WriteLine("Detecting Baseline Features");
    var baselineFeatures = finder.FindFeatures(baselineMsFeatures, options, null);

    Console.WriteLine("Detecting Alignee Features");
    var aligneeFeatures = finder.FindFeatures(aligneeMsFeatures, options, null);

    Console.WriteLine("Managing baseline and alignee features");
    baselineFeatures.ForEach(x => x.GroupId = baselineInfo.DatasetId);
    aligneeFeatures.ForEach(x => x.GroupId = aligneeInfo.DatasetId);

    Console.WriteLine("Clustering MS/MS Spectra");
    var clusterer = new MSMSClusterer
    {
        MzTolerance = .5,
        MassTolerance = 6,
        SpectralComparer = new SpectralNormalizedDotProductComparer
        {
            TopPercent = percent
        },
        SimilarityTolerance = .5,
        ScanRange = 905
    };
    clusterer.Progress += clusterer_Progress;

    var allFeatures = new List<UMCLight>();
    allFeatures.AddRange(baselineFeatures);
    allFeatures.AddRange(aligneeFeatures);

    List<MsmsCluster> clusters = null;
    using (var rawReader = new ThermoRawDataFileReader())
    {
        rawReader.AddDataFile(baselineInfo.Raw.Path, baselineInfo.DatasetId);
        rawReader.AddDataFile(aligneeInfo.Raw.Path, aligneeInfo.DatasetId);

        clusters = clusterer.Cluster(allFeatures, rawReader);
        Console.WriteLine("Found {0} Total Clusters", clusters.Count);
    }

    if (clusters != null)
    {
        var now = DateTime.Now;
        var testResultPath = string.Format("{7}\\{0}-results-{1}-{2}-{3}-{4}-{5}-{6}_scans.txt",
            name, now.Year, now.Month, now.Day, now.Hour, now.Minute, now.Second, resultPath);
        using (TextWriter writer = File.CreateText(testResultPath))
        {
            writer.WriteLine("[Data]");
            writer.WriteLine("{0}", baseline);
            writer.WriteLine("{0}", features);
            writer.WriteLine("[Scans]");
            writer.WriteLine();
            foreach (var cluster in clusters)
            {
                // Only report pairwise clusters: one scan from each dataset plus the cluster score.
                if (cluster.Features.Count == 2)
                {
                    var scanData = "";
                    foreach (var feature in cluster.Features)
                    {
                        scanData += string.Format("{0},", feature.Scan);
                    }
                    scanData += string.Format("{0}", cluster.MeanScore);
                    writer.WriteLine(scanData);
                }
            }
        }

        testResultPath = string.Format("{7}\\{0}-results-{1}-{2}-{3}-{4}-{5}-{6}.txt",
            name, now.Year, now.Month, now.Day, now.Hour, now.Minute, now.Second, resultPath);
        using (TextWriter writer = File.CreateText(testResultPath))
        {
            writer.WriteLine("[Data]");
            writer.WriteLine("{0}", baseline);
            writer.WriteLine("{0}", features);
            writer.WriteLine("[Scans]");
            foreach (var cluster in clusters)
            {
                var scanData = "";
                var data = "";
                foreach (var feature in cluster.Features)
                {
                    scanData += string.Format("{0},", feature.Scan);
                    if (data.Length > 0)
                    {
                        // Separate successive features; the original ran them together with no delimiter.
                        data += ",";
                    }
                    data += string.Format("{0},{1},{2},{3},{4},{5}",
                        feature.GroupId, feature.Id, feature.MassMonoisotopic,
                        feature.Mz, feature.ChargeState, feature.Scan);
                    foreach (var spectrum in feature.MSnSpectra)
                    {
                        foreach (var peptide in spectrum.Peptides)
                        {
                            data += string.Format(",{0},{1}", peptide.Sequence, peptide.Score);
                        }
                    }
                }
                // scanData already carries a trailing comma from the loop above.
                writer.WriteLine(scanData + data);
            }
            writer.WriteLine("");
            writer.WriteLine("");
            writer.WriteLine("[Clusters]");
            foreach (var cluster in clusters)
            {
                writer.WriteLine("cluster id, cluster score");
                writer.WriteLine("{0}, {1}", cluster.Id, cluster.MeanScore);
                writer.WriteLine("feature dataset id, id, monoisotopic mass, mz, charge, scan, peptides");
                foreach (var feature in cluster.Features)
                {
                    var data = string.Format("{0},{1},{2},{3},{4},{5}",
                        feature.GroupId, feature.Id, feature.MassMonoisotopic,
                        feature.Mz, feature.ChargeState, feature.Scan);
                    foreach (var spectrum in feature.MSnSpectra)
                    {
                        foreach (var peptide in spectrum.Peptides)
                        {
                            data += string.Format(",{0},{1}", peptide.Sequence, peptide.Score);
                        }
                    }
                    writer.WriteLine(data);
                }
            }
        }
    }
}
/// <summary>
/// Arguments that hold dataset information when features are loaded.
/// </summary>
/// <param name="info">Dataset information object</param>
/// <param name="features">Features that were loaded for the dataset</param>
public FeaturesLoadedEventArgs(DatasetInformation info, IList<UMCLight> features)
{
    DatasetInformation = info;
    Features = features;
}
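A typical call site for these event args might look like the following sketch; the FeaturesLoaded event and OnFeaturesLoaded raiser are hypothetical names, shown only to illustrate how the constructor is consumed:

public event EventHandler<FeaturesLoadedEventArgs> FeaturesLoaded;

private void OnFeaturesLoaded(DatasetInformation info, IList<UMCLight> features)
{
    // Copy the handler locally to avoid a race between the null check and the invocation.
    var handler = FeaturesLoaded;
    if (handler != null)
        handler(this, new FeaturesLoadedEventArgs(info, features));
}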
public void TestCreateDummyDatabase(string databasePath, int totalDatasets, int totalClusters)
{
    File.Delete(databasePath);
    NHibernateUtil.ConnectToDatabase(databasePath, true);

    IDatasetDAO datasetCache = new DatasetDAOHibernate();
    IUmcDAO featureCache = new UmcDAOHibernate();

    // Creating a dataset
    Console.WriteLine("Creating dummy datasets");
    var datasets = new List<DatasetInformation>();
    var total = totalDatasets;
    for (var i = 0; i < total; i++)
    {
        var dataset = new DatasetInformation
        {
            DatasetId = i,
            DatasetName = "test" + i
        };
        datasets.Add(dataset);
    }
    datasetCache.AddAll(datasets);
    datasets.Clear();
    datasets = datasetCache.FindAll();

    // Create features
    Console.WriteLine("Creating features");
    var features = new List<UMCLight>();
    var x = new Random();
    var featureId = 0;
    for (var i = 0; i < totalClusters; i++)
    {
        var N = x.Next(1, total);
        var charge = x.Next(1, 10);
        var hash = new HashSet<int>();
        var net = x.NextDouble();
        var mass = 400 + (1600 * x.NextDouble());
        var dt = 60 * x.NextDouble();

        for (var j = 0; j < N; j++)
        {
            // Draw a dataset id that has not been used for this cluster yet.
            var did = -1;
            do
            {
                did = x.Next(0, total);
                if (!hash.Contains(did))
                {
                    hash.Add(did);
                    break;
                }
            } while (true);

            var feature = new UMCLight
            {
                GroupId = did,
                Id = featureId++,
                ChargeState = charge,
                MassMonoisotopic = FeatureLight.ComputeDaDifferenceFromPPM(mass, 3)
            };
            feature.MassMonoisotopicAligned = feature.MassMonoisotopic;
            feature.Net = net + 0.03 * x.NextDouble();
            feature.NetAligned = feature.Net;
            feature.DriftTime = dt;
            feature.AbundanceSum = x.Next(100, 200);
            // The original assigned Abundance to itself (a no-op); copying the
            // summed abundance is presumably what was intended.
            feature.Abundance = feature.AbundanceSum;
            feature.ClusterId = -1;
            features.Add(feature);
        }
    }
    featureCache.AddAll(features);
}
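The test above perturbs masses with FeatureLight.ComputeDaDifferenceFromPPM. Assuming that helper implements the standard relation between a part-per-million tolerance and an absolute mass difference (the source does not show its body), the computation amounts to this sketch:

/// <summary>
/// Converts a part-per-million tolerance into Daltons at the given monoisotopic mass.
/// E.g. 3 ppm at 1000 Da is 0.003 Da.
/// </summary>
public static double ComputeDaDifferenceFromPpm(double mass, double ppm)
{
    return mass * ppm / 1e6;
}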
/// <summary>
/// Runs the MultiAlign analysis
/// </summary>
public void PerformMultiAlignAnalysis(DatasetInformation baselineDataset,
    IEnumerable<DatasetInformation> aligneeDatasets,
    LcmsFeatureFindingOptions featureFindingOptions,
    MsFeatureFilteringOptions msFilterOptions,
    LcmsFeatureFilteringOptions lcmsFilterOptions,
    SpectralOptions peptideOptions,
    MultiAlignCore.Algorithms.FeatureFinding.IFeatureFinder featureFinder,
    IFeatureAligner<IEnumerable<UMCLight>, IEnumerable<UMCLight>, AlignmentData> aligner,
    IClusterer<UMCLight, UMCClusterLight> clusterer,
    string matchPath,
    string errorPath)
{
    UpdateStatus("Loading baseline features.");
    var msFeatures = UmcLoaderFactory.LoadMsFeatureData(baselineDataset.Features.Path);
    msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilterOptions);

    // Load the baseline reference set
    using (var rawProviderX = new InformedProteomicsReader())
    {
        rawProviderX.AddDataFile(baselineDataset.RawFile.Path, 0);
        UpdateStatus("Creating Baseline LCMS Features.");
        var baselineFeatures = featureFinder.FindFeatures(msFeatures, featureFindingOptions, rawProviderX);
        LinkPeptidesToFeatures(baselineDataset.Sequence.Path, baselineFeatures,
            peptideOptions.Fdr, peptideOptions.IdScore);

        var providerX = new CachedFeatureSpectraProvider(rawProviderX, baselineFeatures);

        // Then load each alignee dataset
        foreach (var dataset in aligneeDatasets)
        {
            var aligneeMsFeatures = UmcLoaderFactory.LoadMsFeatureData(dataset.Features.Path);
            aligneeMsFeatures = LcmsFeatureFilters.FilterMsFeatures(aligneeMsFeatures, msFilterOptions);

            using (var rawProviderY = new InformedProteomicsReader())
            {
                rawProviderY.AddDataFile(dataset.RawFile.Path, 0);

                UpdateStatus("Finding alignee features");
                var aligneeFeatures = featureFinder.FindFeatures(aligneeMsFeatures,
                    featureFindingOptions, rawProviderY);
                LinkPeptidesToFeatures(dataset.Sequence.Path, aligneeFeatures,
                    peptideOptions.Fdr, peptideOptions.IdScore);

                var providerY = new CachedFeatureSpectraProvider(rawProviderY, aligneeFeatures);

                // Cluster before we do anything else....
                var allFeatures = new List<UMCLight>();
                allFeatures.AddRange(baselineFeatures);
                allFeatures.AddRange(aligneeFeatures);
                foreach (var feature in allFeatures)
                {
                    // The original assigned Net to itself (a no-op); seeding the aligned
                    // values from the raw ones, as done for mass, is presumably the intent.
                    feature.NetAligned = feature.Net;
                    feature.MassMonoisotopicAligned = feature.MassMonoisotopic;
                }

                // This tells us the differences before we align.
                var clusters = clusterer.Cluster(allFeatures);
                var preAlignment = AnalyzeClusters(clusters);

                aligner.AligneeSpectraProvider = providerY;
                aligner.BaselineSpectraProvider = providerX;

                UpdateStatus("Aligning data");
                // Align the data
                var data = aligner.Align(baselineFeatures, aligneeFeatures);
                var matches = data.Matches;
                WriteErrors(errorPath, matches);

                // Create anchor points for the LCMSWarp alignment
                var massPoints = new List<RegressionPoint>();
                var netPoints = new List<RegressionPoint>();
                foreach (var match in matches)
                {
                    var massError = FeatureLight.ComputeMassPPMDifference(match.AnchorPointX.Mz,
                        match.AnchorPointY.Mz);
                    var netError = match.AnchorPointX.Net - match.AnchorPointY.Net;
                    var massPoint = new RegressionPoint(match.AnchorPointX.Mz, 0, massError, netError);
                    massPoints.Add(massPoint);
                    var netPoint = new RegressionPoint(match.AnchorPointX.Net, 0, massError, netError);
                    netPoints.Add(netPoint);
                }

                // Reset cluster assignments, then cluster again after alignment.
                foreach (var feature in allFeatures)
                {
                    feature.UmcCluster = null;
                    feature.ClusterId = -1;
                }

                UpdateStatus("Clustering data");
                clusters = clusterer.Cluster(allFeatures);
                var postAlignment = AnalyzeClusters(clusters);

                UpdateStatus("Note\tSame\tDifferent");
                UpdateStatus(string.Format("Pre\t{0}\t{1}", preAlignment.SameCluster,
                    preAlignment.DifferentCluster));
                UpdateStatus(string.Format("Post\t{0}\t{1}", postAlignment.SameCluster,
                    postAlignment.DifferentCluster));

                SaveMatches(matchPath, matches);
            }
        }
    }

    DeRegisterProgressNotifier(aligner);
    DeRegisterProgressNotifier(featureFinder);
    DeRegisterProgressNotifier(clusterer);
}
/// <summary>
/// Filters the list of MS Features that may be from MS/MS deisotoped data.
/// </summary>
public List<MSFeatureLight> Filter(List<MSFeatureLight> msFeatures, ref DatasetInformation dataset)
{
    var rawPath = dataset.RawPath;
    // IsNullOrWhiteSpace already covers the null case.
    if (string.IsNullOrWhiteSpace(rawPath))
        return msFeatures;

    // First find all unique scans
    var scanMap = new Dictionary<int, bool>();
    foreach (var feature in msFeatures)
    {
        if (!scanMap.ContainsKey(feature.Scan))
        {
            // Assume all scans are parents
            scanMap.Add(feature.Scan, true);
        }
    }

    // Then check each scan against the raw file to see whether that assumption holds.
    var fullScans = new Dictionary<int, bool>();
    var scanTimes = dataset.ScanTimes;

    using (var provider = RawLoaderFactory.CreateFileReader(rawPath))
    {
        if (provider == null)
        {
            UpdateStatus(string.Format("Warning: Raw file not found ({0}); scan times are not available!",
                System.IO.Path.GetFileName(rawPath)));
        }
        else
        {
            UpdateStatus(string.Format("Reading scan info from {0}", System.IO.Path.GetFileName(rawPath)));
            provider.AddDataFile(rawPath, 0);

            foreach (var scan in scanMap.Keys)
            {
                var summary = provider.GetScanSummary(scan, 0);
                if (summary == null)
                {
                    continue;
                }

                if (summary.MsLevel == 1)
                {
                    fullScans.Add(scan, true);
                }

                if (scanTimes.ContainsKey(scan))
                {
                    scanTimes[scan] = summary.Time;
                }
                else
                {
                    scanTimes.Add(scan, summary.Time);
                }
            }
            dataset.ScanTimes = scanTimes;
        }
    }

    return msFeatures.Where(x => fullScans.ContainsKey(x.Scan)).ToList();
}
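Stripped of the raw-file plumbing, the core of the filter above is a membership test against the set of MS1 (survey) scans. A sketch of that final step in isolation, with a hypothetical ms1Scans set standing in for the fullScans dictionary built above:

using System.Collections.Generic;
using System.Linq;

/// <summary>
/// Keeps only features whose scan appears in the set of MS1 scans,
/// dropping features that were deisotoped from MS/MS spectra.
/// </summary>
public static List<MSFeatureLight> KeepMs1Features(
    IEnumerable<MSFeatureLight> msFeatures,
    HashSet<int> ms1Scans)
{
    return msFeatures.Where(f => ms1Scans.Contains(f.Scan)).ToList();
}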
/// <summary>
/// Load a single dataset from the provider.
/// </summary>
/// <returns></returns>
public IList<UMCLight> LoadDataset(DatasetInformation dataset,
    MsFeatureFilteringOptions msFilteringOptions,
    LcmsFeatureFindingOptions lcmsFindingOptions,
    LcmsFeatureFilteringOptions lcmsFilteringOptions)
{
    UpdateStatus(string.Format("[{0}] Loading dataset - {1}.", dataset.DatasetId, dataset.DatasetName));
    var datasetId = dataset.DatasetId;
    var features = UmcLoaderFactory.LoadUmcFeatureData(dataset.Features.Path, dataset.DatasetId,
        Providers.FeatureCache);

    UpdateStatus(string.Format("[{0}] Loading MS Feature Data - {1}.", dataset.DatasetId, dataset.DatasetName));
    var msFeatures = UmcLoaderFactory.LoadMsFeatureData(dataset.Features.Path);
    var scansInfo = UmcLoaderFactory.LoadScanSummaries(dataset.Scans.Path);
    dataset.BuildScanTimes(scansInfo);

    var msnSpectra = new List<MSSpectra>();

    // If we don't have any features, then we have to create some from the MS features
    // provided to us.
    if (features.Count < 1)
    {
        msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilteringOptions);
        msFeatures = Filter(msFeatures, ref dataset);
        features = CreateLcmsFeatures(dataset, msFeatures, lcmsFindingOptions, lcmsFilteringOptions);

        var maxScan = features.Max(feature => feature.Scan);
        var minScan = features.Min(feature => feature.Scan);
        var id = 0;
        var scanTimes = dataset.ScanTimes;

        foreach (var feature in features)
        {
            feature.Id = id++;
            // NET is the feature's scan time normalized between the times of the
            // first and last feature scans in the dataset.
            feature.Net = (Convert.ToDouble(scanTimes[feature.Scan]) - scanTimes[minScan]) /
                          (scanTimes[maxScan] - scanTimes[minScan]);
            feature.MassMonoisotopicAligned = feature.MassMonoisotopic;
            feature.NetAligned = feature.Net;
            feature.GroupId = datasetId;
            feature.SpectralCount = feature.MsFeatures.Count;

            foreach (var msFeature in feature.MsFeatures.Where(msFeature => msFeature != null))
            {
                msFeature.UmcId = feature.Id;
                msFeature.GroupId = datasetId;
                msFeature.MSnSpectra.ForEach(x => x.GroupId = datasetId);
                msnSpectra.AddRange(msFeature.MSnSpectra);
            }
        }
    }
    else
    {
        if (!UmcLoaderFactory.AreExistingFeatures(dataset.Features.Path))
        {
            var i = 0;
            foreach (var feature in features)
            {
                feature.GroupId = datasetId;
                feature.Id = i++;
            }
        }

        // Otherwise, we need to map the MS features to the LCMS Features provided.
        // This would mean that we extracted data from an existing database.
        if (msFeatures.Count > 0)
        {
            var map = FeatureDataConverters.MapFeature(features);
            foreach (var feature in from feature in msFeatures
                     let doesFeatureExist = map.ContainsKey(feature.UmcId)
                     where doesFeatureExist
                     select feature)
            {
                map[feature.UmcId].AddChildFeature(feature);
            }
        }
    }

    // Process the MS/MS data with peptides
    var sequenceProvider = PeptideReaderFactory.CreateReader(dataset.SequencePath);
    if (sequenceProvider != null)
    {
        UpdateStatus("Reading List of Peptides");
        var peptides = sequenceProvider.Read(dataset.SequencePath);
        var count = 0;
        var peptideList = peptides.ToList();
        peptideList.ForEach(x => x.Id = count++);

        UpdateStatus("Linking MS/MS to any known Peptide/Metabolite Sequences");
        var linker = new PeptideMsMsLinker();
        linker.LinkPeptidesToSpectra(msnSpectra, peptideList);
    }
    return features;
}
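The NET assignment above rescales each feature's elution time onto [0, 1] between the first and last feature scans. A worked sketch of the same normalization, factored into a hypothetical helper:

/// <summary>
/// Normalized elution time: rescales a scan time onto [0, 1] between the
/// dataset's minimum and maximum observed times.
/// E.g. with minTime = 10.0 and maxTime = 110.0, a scan at 35.0 min gives 0.25.
/// </summary>
public static double ComputeNet(double scanTime, double minTime, double maxTime)
{
    return (scanTime - minTime) / (maxTime - minTime);
}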
public void TestClusterGeneration(string databasePath, string crossPath, int charge, int minimumClusterSize)
{
    File.Delete(databasePath);
    NHibernateUtil.ConnectToDatabase(databasePath, true);

    IDatasetDAO datasetCache = new DatasetDAOHibernate();
    IUmcClusterDAO clusterCache = new UmcClusterDAOHibernate();
    IUmcDAO featureCache = new UmcDAOHibernate();

    // Creating a dataset
    Console.WriteLine("Creating dummy datasets");
    var datasets = new List<DatasetInformation>();
    var total = 10;
    for (var i = 0; i < total; i++)
    {
        var dataset = new DatasetInformation
        {
            DatasetId = i,
            DatasetName = "test" + i
        };
        datasets.Add(dataset);
    }
    datasetCache.AddAll(datasets);
    datasets.Clear();
    datasets = datasetCache.FindAll();

    // Create features
    Console.WriteLine("Creating features");
    var features = new List<UMCLight>();
    var clusters = new List<UMCClusterLight>();
    var x = new Random();
    var featureId = 0;
    for (var i = 0; i < 100; i++)
    {
        var cluster = new UMCClusterLight
        {
            Id = i,
            AmbiguityScore = i,
            Tightness = i,
            ChargeState = charge
        };

        var N = x.Next(1, total);
        var hash = new HashSet<int>();
        for (var j = 0; j < N; j++)
        {
            // Draw a dataset id that has not been used for this cluster yet.
            var did = -1;
            do
            {
                did = x.Next(0, total);
                if (!hash.Contains(did))
                {
                    hash.Add(did);
                    break;
                }
            } while (true);

            var feature = new UMCLight
            {
                GroupId = did,
                Id = featureId++,
                ChargeState = charge,
                MassMonoisotopic = x.NextDouble(),
                Net = x.NextDouble(),
                AbundanceSum = x.Next(100, 200),
                ClusterId = cluster.Id
            };
            // The original assigned Abundance to itself (a no-op); copying the
            // summed abundance is presumably what was intended.
            feature.Abundance = feature.AbundanceSum;

            cluster.AddChildFeature(feature);
            features.Add(feature);
        }
        cluster.CalculateStatistics(ClusterCentroidRepresentation.Mean);
        clusters.Add(cluster);
    }
    featureCache.AddAll(features);
    clusterCache.AddAll(clusters);

    // The original also called FindAll here, but the result was immediately
    // overwritten by the charge query below.
    Console.WriteLine("Finding clusters by charge");
    clusters = clusterCache.FindByCharge(charge);

    WriteClusters(datasets, clusters, minimumClusterSize, charge, crossPath, databasePath, 300000);
}
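The do/while loop above (and its twin in TestCreateDummyDatabase) rejects duplicate dataset ids by retrying until an unseen one is drawn. An equivalent, slightly tighter form that exploits HashSet&lt;T&gt;.Add returning false for duplicates; a sketch, not a behavioral change to the tests:

using System;
using System.Collections.Generic;

/// <summary>
/// Draws a dataset id in [0, total) that has not yet been used for this cluster.
/// The caller must ensure hash.Count &lt; total, or the loop never terminates.
/// </summary>
public static int NextUniqueDatasetId(Random rng, int total, HashSet<int> hash)
{
    int did;
    do
    {
        did = rng.Next(0, total);
    } while (!hash.Add(did));
    return did;
}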
/// <summary>
/// Creates LCMS Features
/// </summary>
public List<UMCLight> CreateLcmsFeatures(
    DatasetInformation information,
    List<MSFeatureLight> msFeatures,
    LcmsFeatureFindingOptions options,
    LcmsFeatureFilteringOptions filterOptions)
{
    // Make features
    if (msFeatures.Count < 1)
        throw new Exception("No features were found in the feature files provided.");

    UpdateStatus("Finding features.");

    ISpectraProvider provider = null;
    // IsNullOrWhiteSpace already covers the null case.
    if (!string.IsNullOrWhiteSpace(information.RawPath))
    {
        UpdateStatus("Using raw data to create better features.");
        provider = RawLoaderFactory.CreateFileReader(information.RawPath);
        provider.AddDataFile(information.RawPath, 0);
    }

    ValidateFeatureFinderMaxScanLength(information, options, filterOptions);

    var finder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);
    finder.Progress += (sender, args) => UpdateStatus(args.Message);
    var features = finder.FindFeatures(msFeatures, options, provider);

    UpdateStatus("Filtering features.");
    List<UMCLight> filteredFeatures;
    if (filterOptions.TreatAsTimeNotScan)
    {
        // Feature length determined based on time (minutes)
        filteredFeatures = LcmsFeatureFilters.FilterFeatures(features, filterOptions, information.ScanTimes);
    }
    else
    {
        // Feature length determined based on scans
        filteredFeatures = LcmsFeatureFilters.FilterFeatures(features, filterOptions);
    }
    UpdateStatus(string.Format("Filtered features from: {0} to {1}.", features.Count, filteredFeatures.Count));
    return filteredFeatures;
}
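The TreatAsTimeNotScan branch above switches the definition of feature length between scan counts and minutes. A sketch of the two length measures being compared; this hypothetical FeatureLength helper only illustrates the distinction, the real logic lives in LcmsFeatureFilters:

using System.Collections.Generic;

/// <summary>
/// Feature length in scans versus in minutes. The scan form works when no raw
/// file is available; the time form is preferable once scan times are known,
/// since scan spacing varies across an LC run.
/// </summary>
public static class FeatureLength
{
    public static int InScans(int scanStart, int scanEnd)
    {
        return scanEnd - scanStart;
    }

    public static double InMinutes(int scanStart, int scanEnd, IDictionary<int, double> scanTimes)
    {
        return scanTimes[scanEnd] - scanTimes[scanStart];
    }
}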