///// <summary>
///// Matches two datasets based on spectral similarity.
///// </summary>
///// <param name="readerX"></param>
///// <param name="readerY"></param>
///// <param name="options"></param>
///// <returns></returns>
//public MatchDatasets(ISpectraProvider readerX,
//                     ISpectraProvider readerY,
//                     SpectralOptions options)
//{
//    // This helps us compare various comparison calculation methods
//    var comparer = SpectralComparerFactory.CreateSpectraComparer(options.ComparerType);
//    // This guy filters the spectra, so that we only keep the N most intense ions for comparison
//    var filter = SpectrumFilterFactory.CreateFilter(SpectraFilters.TopPercent);
//    // Here we find all the matches
//    var finder = new SpectralAnchorPointFinderOriginal();
//    return finder.FindAnchorPoints(readerX,
//                                   readerY,
//                                   comparer,
//                                   filter,
//                                   options);
//}

/// <summary>
/// Validates anchor point matches against two peptide lists.
/// Nothing is validated unless both peptide lists contain at least one entry.
/// </summary>
/// <param name="matches">Anchor point matches handed to the peptide matcher.</param>
/// <param name="peptidesA">Peptides used to build the X-side scan map.</param>
/// <param name="peptidesB">Peptides used to build the Y-side scan map.</param>
/// <param name="options">Supplies the IdScore and Fdr cutoffs; also forwarded to the matcher.</param>
public void ValidateMatches(IEnumerable<SpectralAnchorPointMatch> matches,
                            IEnumerable<Peptide> peptidesA,
                            IEnumerable<Peptide> peptidesB,
                            SpectralOptions options)
{
    // Materialize every input once so that none of them is enumerated twice.
    IEnumerable<SpectralAnchorPointMatch> anchorPoints = matches as SpectralAnchorPointMatch[] ?? matches.ToArray();
    var candidatesB = peptidesB as Peptide[] ?? peptidesB.ToArray();
    var candidatesA = peptidesA as Peptide[] ?? peptidesA.ToArray();

    // Without peptides on both sides there is nothing to validate against.
    if (!candidatesA.Any() || !candidatesB.Any())
    {
        return;
    }

    // Keep only the peptides that pass the score / FDR cutoff.
    peptidesA = candidatesA
        .Where(peptide => PeptideUtility.PassesCutoff(peptide, options.IdScore, options.Fdr))
        .ToList();
    peptidesB = candidatesB
        .Where(peptide => PeptideUtility.PassesCutoff(peptide, options.IdScore, options.Fdr))
        .ToList();

    var scanMapX = PeptideUtility.MapWithBestScan(peptidesA);
    var scanMapY = PeptideUtility.MapWithBestScan(peptidesB);

    // Then map the peptide sequences to identify True Positive and False Positives
    var peptideMatcher = new PeptideAnchorPointMatcher();
    peptideMatcher.Match(anchorPoints, scanMapX, scanMapY, options);
}
/// <summary>
/// Creates the aligner used to align features against a mass tag database.
/// </summary>
/// <param name="type">Alignment algorithm to instantiate.</param>
/// <param name="options">Options passed to the LCMS warp aligner.</param>
/// <param name="spectralOptions">Options assigned to the spectral aligner wrapper.</param>
/// <returns>The aligner for <paramref name="type"/>, or null when the type has no case here.</returns>
public static IFeatureAligner<MassTagDatabase, IEnumerable<UMCLight>, AlignmentData> CreateDatabaseAligner(
    FeatureAlignmentType type,
    LcmsWarpAlignmentOptions options,
    SpectralOptions spectralOptions)
{
    switch (type)
    {
        case FeatureAlignmentType.LCMS_WARP:
            return new LcmsWarpFeatureAligner(options);

        case FeatureAlignmentType.DIRECT_IMS_INFUSION:
            return new DummyAlignment();

        case FeatureAlignmentType.SPECTRAL_ALIGNMENT:
            var spectralAligner = new SpectralAlignerWrapper();
            spectralAligner.Options = spectralOptions;
            return spectralAligner;

        default:
            // Unhandled alignment types yield no aligner.
            return null;
    }
}
/// <summary>
/// Loads and filters the MS features of a dataset, builds LCMS features from them,
/// stamps everything with the dataset id, and links peptides to the features.
/// </summary>
/// <param name="information">Dataset whose feature, raw and sequence files are read.</param>
/// <param name="featureFindingOptions">Options passed to the feature finder.</param>
/// <param name="msFilterOptions">Filter applied to the loaded MS features.</param>
/// <param name="lcmsFilterOptions">Filter applied to the LCMS features that were found.</param>
/// <param name="peptideOptions">Supplies the Fdr and IdScore used when linking peptides.</param>
/// <param name="featureFinder">Feature finding algorithm; de-registered from progress notification before returning.</param>
/// <returns>The filtered LCMS features.</returns>
private IList<UMCLight> FindFeatures(DatasetInformation information,
                                     LcmsFeatureFindingOptions featureFindingOptions,
                                     MsFeatureFilteringOptions msFilterOptions,
                                     LcmsFeatureFilteringOptions lcmsFilterOptions,
                                     SpectralOptions peptideOptions,
                                     MultiAlignCore.Algorithms.FeatureFinding.IFeatureFinder featureFinder)
{
    UpdateStatus("Loading baseline features.");
    var loadedMsFeatures = UmcLoaderFactory.LoadMsFeatureData(information.Features.Path);
    loadedMsFeatures = LcmsFeatureFilters.FilterMsFeatures(loadedMsFeatures, msFilterOptions);

    // Load the baseline reference set
    using (var rawReader = RawLoaderFactory.CreateFileReader(information.RawFile.Path))
    {
        rawReader.AddDataFile(information.RawFile.Path, 0);

        UpdateStatus("Creating LCMS Features.");
        var lcmsFeatures = featureFinder.FindFeatures(loadedMsFeatures, featureFindingOptions, rawReader);
        lcmsFeatures = LcmsFeatureFilters.FilterFeatures(lcmsFeatures, lcmsFilterOptions, information.ScanTimes);

        var groupId = information.DatasetId;
        foreach (var lcmsFeature in lcmsFeatures)
        {
            lcmsFeature.GroupId = groupId;

            // Collect only the MS features that carry at least one MSn spectrum.
            var featuresWithSpectra = new List<MSFeatureLight>();
            foreach (var child in lcmsFeature.MsFeatures)
            {
                child.GroupId = groupId;

                foreach (var spectrum in child.MSnSpectra)
                {
                    spectrum.GroupId = groupId;
                    foreach (var identification in spectrum.Peptides)
                    {
                        identification.GroupId = groupId;
                    }
                }

                if (child.MSnSpectra.Count > 0)
                {
                    featuresWithSpectra.Add(child);
                }
            }

            // We are doing this so that we dont have a ton of MS features in the database
            lcmsFeature.MsFeatures.Clear();
            lcmsFeature.MsFeatures.AddRange(featuresWithSpectra);
        }

        LinkPeptidesToFeatures(information.SequenceFile.Path,
                               lcmsFeatures,
                               peptideOptions.Fdr,
                               peptideOptions.IdScore);

        DeRegisterProgressNotifier(featureFinder);
        return lcmsFeatures;
    }
}
/// <summary>
/// Builds anchor point matches by pairing peptides from two lists that share a sequence,
/// keeping a pair only when both its NET and m/z differences are inside the tolerances.
/// </summary>
/// <param name="peptidesA">Peptides for the X anchor points.</param>
/// <param name="peptidesB">Peptides for the Y anchor points.</param>
/// <param name="options">Supplies NetTolerance and MzTolerance.</param>
/// <returns>The anchor point matches that passed both tolerance checks.</returns>
public IEnumerable<SpectralAnchorPointMatch> FindAnchorPoints(IEnumerable<Peptide> peptidesA,
                                                              IEnumerable<Peptide> peptidesB,
                                                              SpectralOptions options)
{
    var anchorMatches = new List<SpectralAnchorPointMatch>();

    peptidesA = AssignNET(peptidesA);
    peptidesB = AssignNET(peptidesB);

    // Map sequences
    var bySequenceA = PeptideUtility.MapWithBestSequence(peptidesA);
    var bySequenceB = PeptideUtility.MapWithBestSequence(peptidesB);

    foreach (var sequence in bySequenceB.Keys)
    {
        // Only sequences present in both maps can form an anchor point.
        if (!bySequenceA.ContainsKey(sequence))
        {
            continue;
        }

        var candidate = new SpectralAnchorPointMatch();
        candidate.AnchorPointX.Peptide = bySequenceA[sequence];
        candidate.AnchorPointY.Peptide = bySequenceB[sequence];

        var netDifference = candidate.AnchorPointX.Net - candidate.AnchorPointY.Net;
        var mzDifference = candidate.AnchorPointX.Mz - candidate.AnchorPointY.Mz;

        if (Math.Abs(netDifference) < options.NetTolerance &&
            Math.Abs(mzDifference) < options.MzTolerance)
        {
            anchorMatches.Add(candidate);
        }
    }

    return anchorMatches;
}
/// <summary>
/// Creates the dataset and database aligners from the factory and stores them on the provider.
/// Both aligners use the algorithm type named by <paramref name="options"/>.
/// </summary>
/// <param name="options">LCMS warp options; also supplies the alignment algorithm type.</param>
/// <param name="spectralOptions">Options forwarded to the factory for spectral alignment.</param>
public void BuildAligner(LcmsWarpAlignmentOptions options, SpectralOptions spectralOptions)
{
    var datasetAligner = FeatureAlignerFactory.CreateDatasetAligner(
        options.AlignmentAlgorithmType,
        options,
        spectralOptions);
    m_provider.DatasetAligner = datasetAligner;

    var databaseAligner = FeatureAlignerFactory.CreateDatabaseAligner(
        options.AlignmentAlgorithmType,
        options,
        spectralOptions);
    m_provider.DatabaseAligner = databaseAligner;
}
/// <summary>
/// Creates an analysis with default options, empty measurement containers,
/// and empty match and dataset name lists.
/// </summary>
public SpectralAnalysis()
{
    // Configuration and bookkeeping
    Options = new SpectralOptions();
    DatasetNames = new List<string>();

    // Result containers
    Matches = new List<SpectralAnchorPointMatch>();
    MassData = new AlignmentMeasurement<double>();
    NetData = new AlignmentMeasurement<double>();
}
/// <summary>
/// Matches anchor points to peptide data and classifies each match.
/// A match is marked PeptideFailed when either scan has no (non-null) peptide,
/// TrueMatch when both cleaned sequences are equal and not blank, and FalseMatch otherwise.
/// </summary>
/// <remarks>
/// The Sequence of every peptide that is compared is overwritten with its cleaned form.
/// </remarks>
/// <param name="matches">Anchor point matches to classify; each one is updated in place.</param>
/// <param name="peptideMapX">Peptides keyed by scan for the X anchor points.</param>
/// <param name="peptideMapY">Peptides keyed by scan for the Y anchor points.</param>
/// <param name="options">Not read by this method.</param>
public void Match(IEnumerable<SpectralAnchorPointMatch> matches,
                  Dictionary<int, Peptide> peptideMapX,
                  Dictionary<int, Peptide> peptideMapY,
                  SpectralOptions options)
{
    foreach (var match in matches)
    {
        var scanX = match.AnchorPointX.Scan;
        var scanY = match.AnchorPointY.Scan;

        // Assume the spectrum was not identified first...then prove a false match later.
        // TryGetValue avoids looking each scan up twice.
        Peptide peptideX;
        Peptide peptideY;
        if (!peptideMapX.TryGetValue(scanX, out peptideX) ||
            !peptideMapY.TryGetValue(scanY, out peptideY) ||
            peptideX == null ||
            peptideY == null)
        {
            match.IsValidMatch = AnchorPointMatchType.PeptideFailed;
            continue;
        }

        peptideX.Sequence = PeptideUtility.CleanString(peptideX.Sequence);
        peptideY.Sequence = PeptideUtility.CleanString(peptideY.Sequence);

        // Make sure the peptides are equivalent. The static string.Equals is null-safe,
        // so a null cleaned sequence is reported as a false match instead of throwing.
        var sequencesAgree = !string.IsNullOrWhiteSpace(peptideY.Sequence) &&
                             string.Equals(peptideX.Sequence, peptideY.Sequence);

        // Then link the peptides and record the verdict.
        match.AnchorPointX.Peptide = peptideX;
        match.AnchorPointY.Peptide = peptideY;
        match.IsValidMatch = sequencesAgree
            ? AnchorPointMatchType.TrueMatch
            : AnchorPointMatchType.FalseMatch;
    }
}
/// <summary>
/// Counts the peptide pairs across two datasets that agree in sequence and fall inside the
/// NET and m/z tolerances, then prints the names and the count to the console.
/// </summary>
/// <param name="datasetX">Dataset whose peptide file is read for the X side.</param>
/// <param name="datasetY">Dataset whose peptide file is read for the Y side.</param>
/// <param name="scanDataX">Scan summaries used to find the X scan extrema.</param>
/// <param name="scanDataY">Scan summaries used to find the Y scan extrema.</param>
/// <param name="names">Names written to the console ahead of the count.</param>
/// <param name="options">Supplies IdScore, Fdr, NetTolerance and MzTolerance.</param>
private void MatchPeptides(AlignmentDataset datasetX,
                           AlignmentDataset datasetY,
                           Dictionary<int, ScanSummary> scanDataX,
                           Dictionary<int, ScanSummary> scanDataY,
                           IEnumerable<string> names,
                           SpectralOptions options)
{
    // Read data for peptides
    var peptideReader = PeptideReaderFactory.CreateReader(SequenceFileType.MSGF);
    var peptidesA = peptideReader.Read(datasetX.PeptideFile);
    var peptidesB = peptideReader.Read(datasetY.PeptideFile);

    peptidesA = peptidesA.Where(p => PeptideUtility.PassesCutoff(p, options.IdScore, options.Fdr)).ToList();
    peptidesB = peptidesB.Where(p => PeptideUtility.PassesCutoff(p, options.IdScore, options.Fdr)).ToList();

    var peptideMapX = PeptideUtility.MapWithBestScan(peptidesA);
    var peptideMapY = PeptideUtility.MapWithBestScan(peptidesB);

    // Determine the scan extrema: the key of the entry holding the largest / smallest Scan value.
    var maxX = scanDataX.Aggregate((best, next) => best.Value.Scan > next.Value.Scan ? best : next).Key;
    var minX = scanDataX.Aggregate((best, next) => best.Value.Scan < next.Value.Scan ? best : next).Key;
    var maxY = scanDataY.Aggregate((best, next) => best.Value.Scan > next.Value.Scan ? best : next).Key;
    var minY = scanDataY.Aggregate((best, next) => best.Value.Scan < next.Value.Scan ? best : next).Key;

    var scanRangeX = Convert.ToDouble(maxX - minX);
    var scanRangeY = Convert.ToDouble(maxY - minY);

    // Then compare every X peptide against every Y peptide.
    var count = 0;
    foreach (var scanX in peptideMapX.Keys)
    {
        var peptideX = peptideMapX[scanX];
        var netX = Convert.ToDouble(scanX - minX) / scanRangeX;

        foreach (var scanY in peptideMapY.Keys)
        {
            var peptideY = peptideMapY[scanY];
            var netY = Convert.ToDouble(scanY - minY) / scanRangeY;
            var netDifference = netX - netY;

            if (Math.Abs(netDifference) < options.NetTolerance &&
                Math.Abs(peptideX.Mz - peptideY.Mz) < options.MzTolerance &&
                PeptideUtility.PassesCutoff(peptideX, options.IdScore, options.Fdr) &&
                PeptideUtility.PassesCutoff(peptideY, options.IdScore, options.Fdr) &&
                peptideX.Sequence.Equals(peptideY.Sequence))
            {
                count++;
            }
        }
    }

    Console.WriteLine();
    foreach (var datasetName in names)
    {
        Console.WriteLine(datasetName);
    }
    Console.WriteLine(@"Matches - {0}", count);
}
/// <summary>
/// Creates the aligner used to align one dataset's features against another's.
/// </summary>
/// <param name="type">Alignment algorithm to instantiate.</param>
/// <param name="options">Options assigned to the LCMS warp aligner.</param>
/// <param name="spectralOptions">Options assigned to the spectral aligner wrapper.</param>
/// <returns>The aligner for <paramref name="type"/>, or null when the type has no case here.</returns>
public static IFeatureAligner<IEnumerable<UMCLight>, IEnumerable<UMCLight>, classAlignmentData> CreateDatasetAligner(
    FeatureAlignmentType type,
    LcmsWarpAlignmentOptions options,
    SpectralOptions spectralOptions)
{
    switch (type)
    {
        case FeatureAlignmentType.LCMS_WARP:
            return new LcmsWarpFeatureAligner { Options = options };

        case FeatureAlignmentType.DIRECT_IMS_INFUSION:
            return new DummyAlignment();

        case FeatureAlignmentType.SPECTRAL_ALIGNMENT:
            // The wrapper's bandwidth is taken from the Bandwidth member visible in this scope.
            return new SpectralAlignerWrapper
            {
                Options = spectralOptions,
                Bandwidth = Bandwidth
            };

        default:
            // Unhandled alignment types yield no aligner.
            return null;
    }
}
/// <summary>
/// Finds spectral anchor points between two spectra providers, validates them against the
/// peptides read from each dataset's peptide file, and packages the result.
/// </summary>
/// <param name="comparerType">Selects the spectral comparer.</param>
/// <param name="readerX">Spectra provider for the X dataset.</param>
/// <param name="readerY">Spectra provider for the Y dataset.</param>
/// <param name="options">Options passed to the anchor point finder and the validator.</param>
/// <param name="datasetX">Supplies the X peptide file.</param>
/// <param name="datasetY">Supplies the Y peptide file.</param>
/// <param name="names">Dataset names stored on the returned analysis.</param>
/// <returns>An analysis holding the names, the matches and the options.</returns>
protected static SpectralAnalysis MatchDatasets(SpectralComparison comparerType,
                                                ISpectraProvider readerX,
                                                ISpectraProvider readerY,
                                                SpectralOptions options,
                                                AlignmentDataset datasetX,
                                                AlignmentDataset datasetY,
                                                List<string> names)
{
    var msgfReader = PeptideReaderFactory.CreateReader(SequenceFileType.MSGF);
    var anchorFinder = new SpectralAnchorPointFinder();
    var anchorValidator = new SpectralAnchorPointValidator();
    var spectraComparer = SpectralComparerFactory.CreateSpectraComparer(comparerType);
    var spectraFilter = SpectrumFilterFactory.CreateFilter(SpectraFilters.TopPercent);

    // Step 1: find candidate anchor points from the spectra alone.
    var matches = anchorFinder.FindAnchorPoints(readerX, readerY, spectraComparer, spectraFilter, options);

    // Step 2: validate the candidates against the peptide identifications.
    var peptidesX = msgfReader.Read(datasetX.PeptideFile);
    var peptidesY = msgfReader.Read(datasetY.PeptideFile);
    anchorValidator.ValidateMatches(matches, peptidesX, peptidesY, options);

    return new SpectralAnalysis
    {
        DatasetNames = names,
        Matches = matches,
        Options = options
    };
}
/// <summary>
/// Runs a pairwise spectral match for every pair of "*.mscache" files found in a directory
/// and writes one alignment report per pair.
/// </summary>
/// <remarks>
/// The report writer is now closed in a finally block, so it is released even when matching
/// or alignment throws.
/// NOTE(review): the report base path is hard-coded, the reports are created with
/// AlignmentFigureType.Figure3, and the RawLoaderCache wrappers are populated but the
/// unwrapped readers are what get passed to MatchDatasets - confirm these are intended.
/// </remarks>
/// <param name="directory">Directory searched for cache files.</param>
/// <param name="comparerType">Spectral comparer to use.</param>
/// <param name="mzBinSize">Stored as SpectralOptions.MzBinSize.</param>
/// <param name="mzTolerance">Stored as SpectralOptions.MzTolerance.</param>
/// <param name="netTolerance">Stored as SpectralOptions.NetTolerance.</param>
/// <param name="similarityScoreCutoff">Stored as SpectralOptions.SimilarityCutoff.</param>
/// <param name="peptideScore">Stored as SpectralOptions.IdScore.</param>
/// <param name="peptideFdr">Stored as SpectralOptions.Fdr.</param>
/// <param name="ionPercent">Stored as SpectralOptions.TopIonPercent.</param>
/// <param name="numberOfRequiredPeaks">Stored as SpectralOptions.RequiredPeakCount.</param>
/// <param name="name">Report name prefix; the comparison index is appended to it.</param>
public void GenerateFigure4_MetaMatches(string directory,
                                        SpectralComparison comparerType,
                                        double mzBinSize,
                                        double mzTolerance,
                                        double netTolerance,
                                        double similarityScoreCutoff,
                                        double peptideScore,
                                        double peptideFdr,
                                        double ionPercent,
                                        int numberOfRequiredPeaks,
                                        string name)
{
    AlignmentAnalysisWriterFactory.BasePath = @"M:\doc\papers\paperAlignment\Data\figure4";

    Console.WriteLine(@"Post-Pre Tests For {0}", directory);

    var cacheFiles = Directory.GetFiles(directory, "*.mscache");

    Console.WriteLine(@"Building data cache");
    var data = cacheFiles.Select(path => new FigureBase.PathCache { Cache = path }).ToList();

    // The options for the analysis
    var options = new SpectralOptions
    {
        MzBinSize = mzBinSize,
        MzTolerance = mzTolerance,
        NetTolerance = netTolerance,
        SimilarityCutoff = similarityScoreCutoff,
        TopIonPercent = ionPercent,
        IdScore = peptideScore,
        ComparerType = comparerType,
        Fdr = peptideFdr,
        RequiredPeakCount = numberOfRequiredPeaks
    };

    var comparison = 0;
    for (var i = 0; i < data.Count; i++)
    {
        var cachex = data[i];

        // Get the raw path stored in the cache file...
        // then get the dataset object
        var rawPathX = ScanSummaryCache.ReadPath(cachex.Cache);
        var datasetX = new AlignmentDataset(rawPathX, "", cachex.Msgf);

        // create a raw file reader for the datasets
        using (var readerX = new InformedProteomicsReader())
        {
            // wrap it in the cached object so we can load scan meta-data
            var cacheReaderX = new RawLoaderCache(readerX);
            var cacheDataX = ScanSummaryCache.ReadCache(cachex.Cache);

            readerX.AddDataFile(rawPathX, 0);
            cacheReaderX.AddCache(0, cacheDataX);

            // Compare against every later dataset so each pair is visited once.
            for (var j = i + 1; j < data.Count; j++)
            {
                var cachey = data[j];

                // Get the raw path stored in the cache file...
                // then get the dataset object
                var rawPathY = ScanSummaryCache.ReadPath(cachey.Cache);
                var datasetY = new AlignmentDataset(rawPathY, "", cachey.Msgf);

                // create a raw file reader for the datasets
                using (var readerY = new InformedProteomicsReader())
                {
                    // Then the writer for creating a report
                    var writer = AlignmentAnalysisWriterFactory.Create(AlignmentFigureType.Figure3, name + comparison);
                    comparison++;

                    try
                    {
                        // wrap it in the cached object so we can load scan meta-data
                        var cacheReaderY = new RawLoaderCache(readerY);
                        var cacheDataY = ScanSummaryCache.ReadCache(cachey.Cache);
                        cacheReaderY.AddCache(0, cacheDataY);
                        readerY.AddDataFile(rawPathY, 0);

                        var names = new List<string> { data[i].Cache, data[j].Cache };

                        var analysis = MatchDatasets(comparerType, readerX, readerY, options, datasetX, datasetY, names);
                        AlignMatches(analysis, writer);
                    }
                    finally
                    {
                        // Always release the report, even when the comparison fails.
                        writer.Close();
                    }
                }
            }
        }
    }
}
/// <summary>
/// Runs a pairwise spectral match for every pair of "*.mscache" files found in a directory
/// and writes one alignment report per pair.
/// </summary>
/// <remarks>
/// The report writer is now closed in a finally block, so it is released even when matching
/// or alignment throws.
/// NOTE(review): the report base path is hard-coded, the reports are created with
/// AlignmentFigureType.Figure3, and the RawLoaderCache wrappers are populated but the
/// unwrapped readers are what get passed to MatchDatasets - confirm these are intended.
/// </remarks>
/// <param name="directory">Directory searched for cache files.</param>
/// <param name="comparerType">Spectral comparer to use.</param>
/// <param name="mzBinSize">Stored as SpectralOptions.MzBinSize.</param>
/// <param name="mzTolerance">Stored as SpectralOptions.MzTolerance.</param>
/// <param name="netTolerance">Stored as SpectralOptions.NetTolerance.</param>
/// <param name="similarityScoreCutoff">Stored as SpectralOptions.SimilarityCutoff.</param>
/// <param name="peptideScore">Stored as SpectralOptions.IdScore.</param>
/// <param name="peptideFdr">Stored as SpectralOptions.Fdr.</param>
/// <param name="ionPercent">Stored as SpectralOptions.TopIonPercent.</param>
/// <param name="numberOfRequiredPeaks">Stored as SpectralOptions.RequiredPeakCount.</param>
/// <param name="name">Report name prefix; the comparison index is appended to it.</param>
public void GenerateFigure4_MetaMatches(string directory,
                                        SpectralComparison comparerType,
                                        double mzBinSize,
                                        double mzTolerance,
                                        double netTolerance,
                                        double similarityScoreCutoff,
                                        double peptideScore,
                                        double peptideFdr,
                                        double ionPercent,
                                        int numberOfRequiredPeaks,
                                        string name)
{
    AlignmentAnalysisWriterFactory.BasePath = @"M:\doc\papers\paperAlignment\Data\figure4";

    Console.WriteLine(@"Post-Pre Tests For {0}", directory);

    var cacheFiles = Directory.GetFiles(directory, "*.mscache");

    Console.WriteLine(@"Building data cache");
    var data = cacheFiles.Select(path => new FigureBase.PathCache { Cache = path }).ToList();

    // The options for the analysis
    var options = new SpectralOptions
    {
        MzBinSize = mzBinSize,
        MzTolerance = mzTolerance,
        NetTolerance = netTolerance,
        SimilarityCutoff = similarityScoreCutoff,
        TopIonPercent = ionPercent,
        IdScore = peptideScore,
        ComparerType = comparerType,
        Fdr = peptideFdr,
        RequiredPeakCount = numberOfRequiredPeaks
    };

    var comparison = 0;
    for (var i = 0; i < data.Count; i++)
    {
        var cachex = data[i];

        // Get the raw path stored in the cache file...
        // then get the dataset object
        var rawPathX = ScanSummaryCache.ReadPath(cachex.Cache);
        var datasetX = new AlignmentDataset(rawPathX, "", cachex.Msgf);

        // create a raw file reader for the datasets
        using (var readerX = RawLoaderFactory.CreateFileReader(datasetX.RawFile))
        {
            // wrap it in the cached object so we can load scan meta-data
            var cacheReaderX = new RawLoaderCache(readerX);
            var cacheDataX = ScanSummaryCache.ReadCache(cachex.Cache);

            readerX.AddDataFile(rawPathX, 0);
            cacheReaderX.AddCache(0, cacheDataX);

            // Compare against every later dataset so each pair is visited once.
            for (var j = i + 1; j < data.Count; j++)
            {
                var cachey = data[j];

                // Get the raw path stored in the cache file...
                // then get the dataset object
                var rawPathY = ScanSummaryCache.ReadPath(cachey.Cache);
                var datasetY = new AlignmentDataset(rawPathY, "", cachey.Msgf);

                // create a raw file reader for the datasets
                using (var readerY = RawLoaderFactory.CreateFileReader(datasetY.RawFile))
                {
                    // Then the writer for creating a report
                    var writer = AlignmentAnalysisWriterFactory.Create(AlignmentFigureType.Figure3, name + comparison);
                    comparison++;

                    try
                    {
                        // wrap it in the cached object so we can load scan meta-data
                        var cacheReaderY = new RawLoaderCache(readerY);
                        var cacheDataY = ScanSummaryCache.ReadCache(cachey.Cache);
                        cacheReaderY.AddCache(0, cacheDataY);
                        readerY.AddDataFile(rawPathY, 0);

                        var names = new List<string> { data[i].Cache, data[j].Cache };

                        var analysis = MatchDatasets(comparerType, readerX, readerY, options, datasetX, datasetY, names);
                        AlignMatches(analysis, writer);
                    }
                    finally
                    {
                        // Always release the report, even when the comparison fails.
                        writer.Close();
                    }
                }
            }
        }
    }
}
/// <summary>
/// Pairs every "*_msgfdb_fht.txt" file in a directory with its "*.mscache" file, runs a
/// spectral match for every pair of datasets, and writes one alignment report per pair.
/// </summary>
/// <remarks>
/// The report writer was previously never closed; it is now closed in a finally block after
/// each comparison, matching how the Figure 4 generators release their writers.
/// NOTE(review): the report base path is hard-coded and points at a "figure4" folder, and
/// file pairing relies on lower-cased path strings - confirm both are intended.
/// </remarks>
/// <param name="directory">Directory searched for cache and MSGF result files.</param>
/// <param name="comparerType">Spectral comparer to use.</param>
/// <param name="mzBinSize">Stored as SpectralOptions.MzBinSize.</param>
/// <param name="mzTolerance">Stored as SpectralOptions.MzTolerance.</param>
/// <param name="netTolerance">Stored as SpectralOptions.NetTolerance.</param>
/// <param name="similarityScoreCutoff">Stored as SpectralOptions.SimilarityCutoff.</param>
/// <param name="peptideScore">Stored as SpectralOptions.IdScore.</param>
/// <param name="peptideFdr">Stored as SpectralOptions.Fdr.</param>
/// <param name="ionPercent">Stored as SpectralOptions.TopIonPercent.</param>
/// <param name="numberOfRequiredPeaks">Stored as SpectralOptions.RequiredPeakCount.</param>
public void GenerateFigure3_Matches(string directory,
                                    SpectralComparison comparerType,
                                    double mzBinSize,
                                    double mzTolerance,
                                    double netTolerance,
                                    double similarityScoreCutoff,
                                    double peptideScore,
                                    double peptideFdr,
                                    double ionPercent,
                                    int numberOfRequiredPeaks)
{
    AlignmentAnalysisWriterFactory.BasePath = @"M:\doc\papers\paperAlignment\Data\figure4";

    Console.WriteLine(@"Post-Pre Tests For {0}", directory);

    var cacheFiles = Directory.GetFiles(directory, "*.mscache");
    var msgfFiles = Directory.GetFiles(directory, "*_msgfdb_fht.txt");

    Console.WriteLine(@"Building data cache");

    // Lower-cased cache paths act as a lookup set; the mapped values are never read.
    var map = cacheFiles.ToDictionary<string, string, FigureBase.PathCache>(path => path.ToLower(), path => null);

    // Keep only the MSGF result files that have a cache file with the matching name.
    var data = (from path in msgfFiles
                let name = path.ToLower().Replace("_msgfdb_fht.txt", ".mscache")
                let newName = Path.Combine(directory, name)
                let features = Path.Combine(directory, name)
                where map.ContainsKey(newName)
                select new FigureBase.PathCache { Cache = newName, Msgf = path, Features = features }).ToList();

    // The options for the analysis
    var options = new SpectralOptions
    {
        MzBinSize = mzBinSize,
        MzTolerance = mzTolerance,
        NetTolerance = netTolerance,
        SimilarityCutoff = similarityScoreCutoff,
        TopIonPercent = ionPercent,
        IdScore = peptideScore,
        ComparerType = comparerType,
        Fdr = peptideFdr,
        RequiredPeakCount = numberOfRequiredPeaks
    };

    Console.WriteLine(@"{0}", data.Count);

    var comparison = 0;
    for (var i = 0; i < data.Count; i++)
    {
        var cachex = data[i];

        // Get the raw path stored in the cache file...
        // then get the dataset object
        var rawPathX = ScanSummaryCache.ReadPath(cachex.Cache);
        var datasetX = new AlignmentDataset(rawPathX, "", cachex.Msgf);

        // create a raw file reader for the datasets
        using (var readerX = RawLoaderFactory.CreateFileReader(datasetX.RawFile))
        {
            // wrap it in the cached object so we can load scan meta-data
            var cacheReaderX = new RawLoaderCache(readerX);
            var cacheDataX = ScanSummaryCache.ReadCache(cachex.Cache);

            readerX.AddDataFile(rawPathX, 0);
            cacheReaderX.AddCache(0, cacheDataX);

            // Compare against every later dataset so each pair is visited once.
            for (var j = i + 1; j < data.Count; j++)
            {
                // Then the writer for creating a report
                var writer = AlignmentAnalysisWriterFactory.Create(AlignmentFigureType.Figure3, "results-figure3-largeScale" + comparison);
                comparison++;

                try
                {
                    var cachey = data[j];

                    // Get the raw path stored in the cache file...
                    // then get the dataset object
                    var rawPathY = ScanSummaryCache.ReadPath(cachey.Cache);
                    var datasetY = new AlignmentDataset(rawPathY, "", cachey.Msgf);

                    // create a raw file reader for the datasets
                    using (var readerY = RawLoaderFactory.CreateFileReader(datasetY.RawFile))
                    {
                        // wrap it in the cached object so we can load scan meta-data
                        var cacheReaderY = new RawLoaderCache(readerY);
                        var cacheDataY = ScanSummaryCache.ReadCache(cachey.Cache);
                        cacheReaderY.AddCache(0, cacheDataY);
                        readerY.AddDataFile(rawPathY, 0);

                        var names = new List<string> { data[i].Cache, data[j].Cache };

                        // Write the results
                        var analysis = MatchDatasets(comparerType, cacheReaderX, cacheReaderY, options, datasetX, datasetY, names);
                        AlignMatches(analysis, writer);
                    }
                }
                finally
                {
                    // Release the report for this comparison whether or not it succeeded.
                    writer.Close();
                }
            }
        }
    }
}
/// <summary>
/// Finds spectral anchor points between two spectra providers, validates them against the
/// peptides read from each dataset's peptide file, and returns the result as an analysis.
/// </summary>
/// <param name="comparerType">Selects the spectral comparer.</param>
/// <param name="readerX">Spectra provider for the X dataset.</param>
/// <param name="readerY">Spectra provider for the Y dataset.</param>
/// <param name="options">Options passed to the anchor point finder and the validator.</param>
/// <param name="datasetX">Supplies the X peptide file.</param>
/// <param name="datasetY">Supplies the Y peptide file.</param>
/// <param name="names">Dataset names stored on the returned analysis.</param>
/// <returns>An analysis holding the names, the matches and the options.</returns>
protected static SpectralAnalysis MatchDatasets(
    SpectralComparison comparerType,
    ISpectraProvider readerX,
    ISpectraProvider readerY,
    SpectralOptions options,
    AlignmentDataset datasetX,
    AlignmentDataset datasetY,
    List<string> names)
{
    // Collaborators, created in the same order as before.
    var sequenceReader = PeptideReaderFactory.CreateReader(SequenceFileType.MSGF);
    var pointFinder = new SpectralAnchorPointFinder();
    var pointValidator = new SpectralAnchorPointValidator();
    var similarity = SpectralComparerFactory.CreateSpectraComparer(comparerType);
    var topPercentFilter = SpectrumFilterFactory.CreateFilter(SpectraFilters.TopPercent);

    // Spectral matching first, peptide based validation second.
    var matches = pointFinder.FindAnchorPoints(readerX, readerY, similarity, topPercentFilter, options);

    var identificationsX = sequenceReader.Read(datasetX.PeptideFile);
    var identificationsY = sequenceReader.Read(datasetY.PeptideFile);
    pointValidator.ValidateMatches(matches, identificationsX, identificationsY, options);

    var result = new SpectralAnalysis();
    result.DatasetNames = names;
    result.Matches = matches;
    result.Options = options;
    return result;
}
/// <summary>
/// Runs the MultiAlign analysis: finds features for the baseline and for each alignee dataset,
/// clusters the combined features before and after alignment, and reports and saves the results.
/// </summary>
/// <param name="baselineDataset">Dataset every alignee is aligned against.</param>
/// <param name="aligneeDatasets">Datasets aligned to the baseline, one at a time.</param>
/// <param name="featureFindingOptions">Options passed to the feature finder.</param>
/// <param name="msFilterOptions">Filter applied to loaded MS features.</param>
/// <param name="lcmsFilterOptions">Not read by this method.</param>
/// <param name="peptideOptions">Supplies the Fdr and IdScore used when linking peptides.</param>
/// <param name="featureFinder">Feature finding algorithm.</param>
/// <param name="aligner">Alignment algorithm; its spectra providers are set per alignee.</param>
/// <param name="clusterer">Clustering algorithm run before and after alignment.</param>
/// <param name="matchPath">Path handed to SaveMatches for every alignee.</param>
/// <param name="errorPath">Path handed to WriteErrors for every alignee.</param>
public void PerformMultiAlignAnalysis(DatasetInformation baselineDataset,
                                      IEnumerable<DatasetInformation> aligneeDatasets,
                                      LcmsFeatureFindingOptions featureFindingOptions,
                                      MsFeatureFilteringOptions msFilterOptions,
                                      LcmsFeatureFilteringOptions lcmsFilterOptions,
                                      SpectralOptions peptideOptions,
                                      MultiAlignCore.Algorithms.FeatureFinding.IFeatureFinder featureFinder,
                                      IFeatureAligner<IEnumerable<UMCLight>, IEnumerable<UMCLight>, AlignmentData> aligner,
                                      IClusterer<UMCLight, UMCClusterLight> clusterer,
                                      string matchPath,
                                      string errorPath)
{
    UpdateStatus("Loading baseline features.");
    var baselineMsFeatures = UmcLoaderFactory.LoadMsFeatureData(baselineDataset.Features.Path);
    baselineMsFeatures = LcmsFeatureFilters.FilterMsFeatures(baselineMsFeatures, msFilterOptions);

    // Load the baseline reference set
    using (var baselineReader = new InformedProteomicsReader())
    {
        baselineReader.AddDataFile(baselineDataset.RawFile.Path, 0);

        UpdateStatus("Creating Baseline LCMS Features.");
        var baselineFeatures = featureFinder.FindFeatures(baselineMsFeatures, featureFindingOptions, baselineReader);
        LinkPeptidesToFeatures(baselineDataset.Sequence.Path,
                               baselineFeatures,
                               peptideOptions.Fdr,
                               peptideOptions.IdScore);

        var baselineSpectra = new CachedFeatureSpectraProvider(baselineReader, baselineFeatures);

        // Then load the alignee dataset
        foreach (var aligneeDataset in aligneeDatasets)
        {
            var aligneeMsFeatures = UmcLoaderFactory.LoadMsFeatureData(aligneeDataset.Features.Path);
            aligneeMsFeatures = LcmsFeatureFilters.FilterMsFeatures(aligneeMsFeatures, msFilterOptions);

            using (var aligneeReader = new InformedProteomicsReader())
            {
                aligneeReader.AddDataFile(aligneeDataset.RawFile.Path, 0);

                UpdateStatus("Finding alignee features");
                var aligneeFeatures = featureFinder.FindFeatures(aligneeMsFeatures, featureFindingOptions, aligneeReader);
                LinkPeptidesToFeatures(aligneeDataset.Sequence.Path,
                                       aligneeFeatures,
                                       peptideOptions.Fdr,
                                       peptideOptions.IdScore);

                var aligneeSpectra = new CachedFeatureSpectraProvider(aligneeReader, aligneeFeatures);

                // cluster before we do anything else....
                var combinedFeatures = new List<UMCLight>();
                combinedFeatures.AddRange(baselineFeatures);
                combinedFeatures.AddRange(aligneeFeatures);

                foreach (var combined in combinedFeatures)
                {
                    // NOTE(review): this assigns Net to itself; an aligned-NET property may
                    // have been intended, mirroring the mass assignment below - confirm.
                    combined.Net = combined.Net;
                    combined.MassMonoisotopicAligned = combined.MassMonoisotopic;
                }

                // This tells us the differences before we align.
                var clusters = clusterer.Cluster(combinedFeatures);
                var beforeAlignment = AnalyzeClusters(clusters);

                aligner.AligneeSpectraProvider = aligneeSpectra;
                aligner.BaselineSpectraProvider = baselineSpectra;

                UpdateStatus("Aligning data");

                // Aligner data
                var alignment = aligner.Align(baselineFeatures, aligneeFeatures);
                var anchorMatches = alignment.Matches;

                WriteErrors(errorPath, anchorMatches);

                // create anchor points for LCMSWarp alignment
                // NOTE(review): both lists are filled but never read afterwards in this method.
                var massPoints = new List<RegressionPoint>();
                var netPoints = new List<RegressionPoint>();
                foreach (var anchor in anchorMatches)
                {
                    var massError = FeatureLight.ComputeMassPPMDifference(anchor.AnchorPointX.Mz, anchor.AnchorPointY.Mz);
                    var netError = anchor.AnchorPointX.Net - anchor.AnchorPointY.Net;

                    massPoints.Add(new RegressionPoint(anchor.AnchorPointX.Mz, 0, massError, netError));
                    netPoints.Add(new RegressionPoint(anchor.AnchorPointX.Net, 0, massError, netError));
                }

                // Detach every feature from its pre-alignment cluster.
                foreach (var combined in combinedFeatures)
                {
                    combined.UmcCluster = null;
                    combined.ClusterId = -1;
                }

                // Then cluster after alignment!
                UpdateStatus("clustering data");
                clusters = clusterer.Cluster(combinedFeatures);
                var afterAlignment = AnalyzeClusters(clusters);

                UpdateStatus("Note\tSame\tDifferent");
                UpdateStatus(string.Format("Pre\t{0}\t{1}", beforeAlignment.SameCluster, beforeAlignment.DifferentCluster));
                UpdateStatus(string.Format("Post\t{0}\t{1}", afterAlignment.SameCluster, afterAlignment.DifferentCluster));

                SaveMatches(matchPath, anchorMatches);
            }
        }
    }

    DeRegisterProgressNotifier(aligner);
    DeRegisterProgressNotifier(featureFinder);
    DeRegisterProgressNotifier(clusterer);
}
/// <summary>
/// Counts the peptide pairs across two datasets that share a sequence and lie within the
/// NET and m/z tolerances, then writes the names and the count to the console.
/// </summary>
/// <param name="datasetX">Dataset whose peptide file is read for the X side.</param>
/// <param name="datasetY">Dataset whose peptide file is read for the Y side.</param>
/// <param name="scanDataX">Scan summaries used to find the X scan extrema.</param>
/// <param name="scanDataY">Scan summaries used to find the Y scan extrema.</param>
/// <param name="names">Names written to the console ahead of the count.</param>
/// <param name="options">Supplies IdScore, Fdr, NetTolerance and MzTolerance.</param>
private void MatchPeptides(
    AlignmentDataset datasetX,
    AlignmentDataset datasetY,
    Dictionary<int, ScanSummary> scanDataX,
    Dictionary<int, ScanSummary> scanDataY,
    IEnumerable<string> names,
    SpectralOptions options)
{
    // Read data for peptides
    var msgfReader = PeptideReaderFactory.CreateReader(SequenceFileType.MSGF);
    var peptidesA = msgfReader.Read(datasetX.PeptideFile);
    var peptidesB = msgfReader.Read(datasetY.PeptideFile);

    peptidesA = peptidesA.Where(candidate => PeptideUtility.PassesCutoff(candidate, options.IdScore, options.Fdr)).ToList();
    peptidesB = peptidesB.Where(candidate => PeptideUtility.PassesCutoff(candidate, options.IdScore, options.Fdr)).ToList();

    var peptideMapX = PeptideUtility.MapWithBestScan(peptidesA);
    var peptideMapY = PeptideUtility.MapWithBestScan(peptidesB);

    // Determine the scan extrema: the key of the entry holding the largest / smallest Scan value.
    var maxX = scanDataX.Aggregate((kept, other) => kept.Value.Scan > other.Value.Scan ? kept : other).Key;
    var minX = scanDataX.Aggregate((kept, other) => kept.Value.Scan < other.Value.Scan ? kept : other).Key;
    var maxY = scanDataY.Aggregate((kept, other) => kept.Value.Scan > other.Value.Scan ? kept : other).Key;
    var minY = scanDataY.Aggregate((kept, other) => kept.Value.Scan < other.Value.Scan ? kept : other).Key;

    var spanX = Convert.ToDouble(maxX - minX);
    var spanY = Convert.ToDouble(maxY - minY);

    // Then walk every X / Y peptide pair and tally the ones that agree.
    var count = 0;
    foreach (var entryX in peptideMapX)
    {
        var peptideX = entryX.Value;
        var netX = Convert.ToDouble(entryX.Key - minX) / spanX;

        foreach (var entryY in peptideMapY)
        {
            var peptideY = entryY.Value;
            var netY = Convert.ToDouble(entryY.Key - minY) / spanY;

            var withinNet = Math.Abs(netX - netY) < options.NetTolerance;
            if (!withinNet)
            {
                continue;
            }

            var withinMz = Math.Abs(peptideX.Mz - peptideY.Mz) < options.MzTolerance;
            if (!withinMz)
            {
                continue;
            }

            if (PeptideUtility.PassesCutoff(peptideX, options.IdScore, options.Fdr) &&
                PeptideUtility.PassesCutoff(peptideY, options.IdScore, options.Fdr) &&
                peptideX.Sequence.Equals(peptideY.Sequence))
            {
                count++;
            }
        }
    }

    Console.WriteLine();
    foreach (var datasetName in names)
    {
        Console.WriteLine(datasetName);
    }
    Console.WriteLine(@"Matches - {0}", count);
}
/// <summary>
/// Aligns each dataset stored in an analysis database against the dataset that follows it
/// and writes the match and error files for every such pair.
/// </summary>
/// <param name="relativeDatabasePath">Database path, resolved through GetPath.</param>
/// <param name="relativeName">Output location, resolved through GetOutputPath; created when missing.</param>
/// <param name="name">Prefix of the generated match and error file names.</param>
/// <param name="alignmentType">Alignment algorithm to create.</param>
/// <param name="clusterType">Clustering algorithm to create.</param>
public void GenerateClusterAlignmentStatistics(string relativeDatabasePath,
                                               string relativeName,
                                               string name,
                                               FeatureAlignmentType alignmentType,
                                               LcmsFeatureClusteringAlgorithmType clusterType)
{
    var databasePath = GetPath(relativeDatabasePath);
    var outputDirectory = GetOutputPath(relativeName);

    if (!Directory.Exists(outputDirectory))
    {
        Directory.CreateDirectory(outputDirectory);
    }

    // Connect to the NHibernate database
    var providers = DataAccessFactory.CreateDataAccessProviders(databasePath, false);

    // Setup our alignment options
    var alignmentOptions = new AlignmentOptions();
    var spectralOptions = new SpectralOptions
    {
        ComparerType = SpectralComparison.CosineDotProduct,
        Fdr = .01,
        IdScore = 1e-09,
        MzBinSize = .5,
        MzTolerance = .5,
        NetTolerance = .1,
        RequiredPeakCount = 32,
        SimilarityCutoff = .75,
        TopIonPercent = .8
    };

    // Options setup
    var instrumentOptions = InstrumentPresetFactory.Create(InstrumentPresets.LtqOrbitrap);
    var featureTolerances = new FeatureTolerances
    {
        Mass = instrumentOptions.Mass + 6,
        Net = instrumentOptions.NetTolerance,
        DriftTime = instrumentOptions.DriftTimeTolerance
    };

    UpdateStatus("Retrieving all datasets for test.");
    var datasets = providers.DatasetCache.FindAll();

    // Create our algorithms
    var aligner = FeatureAlignerFactory.CreateDatasetAligner(alignmentType,
                                                             alignmentOptions.LCMSWarpOptions,
                                                             spectralOptions);
    var clusterer = ClusterFactory.Create(clusterType);
    clusterer.Parameters = new FeatureClusterParameters<UMCLight>
    {
        Tolerances = featureTolerances
    };

    RegisterProgressNotifier(aligner);
    RegisterProgressNotifier(clusterer);

    // Each dataset serves as the baseline for the one that follows it.
    for (var i = 0; i < datasets.Count - 1; i++)
    {
        var matchPath = Path.Combine(outputDirectory, string.Format("{0}-{1}-matches.txt", name, i));
        var errorPath = Path.Combine(outputDirectory, string.Format("{0}-{1}-errors.txt", name, i));

        var aligneeDataset = datasets[i + 1];
        var baselineDataset = datasets[i];

        // Load the baseline reference set
        using (var baselineReader = RawLoaderFactory.CreateFileReader(baselineDataset.RawPath))
        {
            baselineReader.AddDataFile(baselineDataset.RawPath, 0);

            // Load the alignee set
            using (var aligneeReader = RawLoaderFactory.CreateFileReader(aligneeDataset.RawPath))
            {
                aligneeReader.AddDataFile(aligneeDataset.RawPath, 0);

                var baselineFeatures = RetrieveFeatures(baselineDataset.DatasetId, providers);
                var aligneeFeatures = RetrieveFeatures(aligneeDataset.DatasetId, providers);

                var baselineSpectra = new CachedFeatureSpectraProvider(baselineReader, baselineFeatures);
                var aligneeSpectra = new CachedFeatureSpectraProvider(aligneeReader, aligneeFeatures);

                AlignDatasets(baselineFeatures,
                              aligneeFeatures,
                              baselineSpectra,
                              aligneeSpectra,
                              aligner,
                              clusterer,
                              matchPath,
                              errorPath);
            }
        }
    }
}
/// <summary>
/// Creates an aligner with default options, a top-percent spectrum filter,
/// and a cosine dot product spectral comparer.
/// </summary>
public SpectralAligner()
{
    Options = new SpectralOptions();

    var topPercentFilter = SpectrumFilterFactory.CreateFilter(SpectraFilters.TopPercent);
    Filter = topPercentFilter;

    var cosineComparer = SpectralComparerFactory.CreateSpectraComparer(SpectralComparison.CosineDotProduct);
    SpectralComparer = cosineComparer;
}
/// <summary>
/// Loads and filters the MS features of a dataset, builds LCMS features from them,
/// stamps everything with the dataset id, and links peptides to the features.
/// </summary>
/// <param name="information">Dataset whose feature, raw and sequence files are read.</param>
/// <param name="featureFindingOptions">Options passed to the feature finder.</param>
/// <param name="msFilterOptions">Filter applied to the loaded MS features.</param>
/// <param name="lcmsFilterOptions">Filter applied to the LCMS features that were found.</param>
/// <param name="peptideOptions">Supplies the Fdr and IdScore used when linking peptides.</param>
/// <param name="featureFinder">Feature finding algorithm; de-registered from progress notification before returning.</param>
/// <returns>The filtered LCMS features.</returns>
private IList<UMCLight> FindFeatures(
    DatasetInformation information,
    LcmsFeatureFindingOptions featureFindingOptions,
    MsFeatureFilteringOptions msFilterOptions,
    LcmsFeatureFilteringOptions lcmsFilterOptions,
    SpectralOptions peptideOptions,
    IFeatureFinder featureFinder)
{
    UpdateStatus("Loading baseline features.");
    var rawMsFeatures = UmcLoaderFactory.LoadMsFeatureData(information.Features.Path);
    rawMsFeatures = LcmsFeatureFilters.FilterMsFeatures(rawMsFeatures, msFilterOptions);

    // Load the baseline reference set
    using (var spectraReader = RawLoaderFactory.CreateFileReader(information.RawPath))
    {
        spectraReader.AddDataFile(information.RawPath, 0);

        UpdateStatus("Creating LCMS Features.");
        var foundFeatures = featureFinder.FindFeatures(rawMsFeatures, featureFindingOptions, spectraReader);
        foundFeatures = LcmsFeatureFilters.FilterFeatures(foundFeatures, lcmsFilterOptions);

        var ownerId = information.DatasetId;
        foreach (var found in foundFeatures)
        {
            found.GroupId = ownerId;

            // Only MS features that carry at least one MSn spectrum are retained.
            var retained = new List<MSFeatureLight>();
            foreach (var member in found.MsFeatures)
            {
                member.GroupId = ownerId;

                foreach (var fragmentation in member.MSnSpectra)
                {
                    fragmentation.GroupId = ownerId;
                    foreach (var identified in fragmentation.Peptides)
                    {
                        identified.GroupId = ownerId;
                    }
                }

                if (member.MSnSpectra.Count > 0)
                {
                    retained.Add(member);
                }
            }

            // We are doing this so that we dont have a ton of MS features in the database
            found.MsFeatures.Clear();
            found.MsFeatures.AddRange(retained);
        }

        LinkPeptidesToFeatures(information.SequencePath,
                               foundFeatures,
                               peptideOptions.Fdr,
                               peptideOptions.IdScore);

        DeRegisterProgressNotifier(featureFinder);
        return foundFeatures;
    }
}
/// <summary>
/// Discovers the datasets in a directory, configures feature finding, alignment and
/// clustering, and runs the MultiAlign analysis using the first dataset as the
/// baseline and all remaining datasets as alignees.
/// </summary>
/// <param name="directory">Directory searched (top level only) for supported dataset files.</param>
/// <param name="outputPath">Prefix of the output files: "{outputPath}.txt" is passed on as the match path and "{outputPath}-errors.txt" as the error path.</param>
/// <param name="alignmentType">Alignment algorithm to create.</param>
/// <param name="clusterType">Clustering algorithm to create.</param>
public void TestClustering(
    string directory,
    string outputPath,
    FeatureAlignmentType alignmentType,
    LcmsFeatureClusteringAlgorithmType clusterType)
{
    var matchPath = outputPath + ".txt";
    var errorPath = outputPath + "-errors.txt";

    // Build the search patterns from the supported MultiAlign file types.
    var extensions = DatasetInformation.SupportedFileTypes
        .Select(fileType => "*" + fileType.Extension)
        .ToList();

    // Find our datasets.
    var inputFiles = DatasetSearcher.FindDatasets(directory, extensions, SearchOption.TopDirectoryOnly);
    var datasets = DatasetInformation.ConvertInputFilesIntoDatasets(inputFiles);

    // Alignment and spectral matching options.
    var alignmentOptions = new AlignmentOptions();
    var spectralOptions = new SpectralOptions
    {
        ComparerType = SpectralComparison.CosineDotProduct,
        Fdr = .01,
        IdScore = 1e-09,
        MzBinSize = .5,
        MzTolerance = .5,
        NetTolerance = .1,
        RequiredPeakCount = 32,
        SimilarityCutoff = .75,
        TopIonPercent = .8
    };

    // Tolerances come from the instrument preset, with a widened mass tolerance.
    var preset = InstrumentPresetFactory.Create(InstrumentPresets.LtqOrbitrap);
    var featureTolerances = new FeatureTolerances
    {
        Mass = preset.Mass + 6,
        Net = preset.NetTolerance,
        DriftTime = preset.DriftTimeTolerance
    };
    var featureFindingOptions = new LcmsFeatureFindingOptions(featureTolerances)
    {
        MaximumNetRange = .002,
        MaximumScanRange = 50
    };

    // Create our algorithms.
    var finder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);
    var aligner = FeatureAlignerFactory.CreateDatasetAligner(
        alignmentType,
        alignmentOptions.LCMSWarpOptions,
        spectralOptions);
    var clusterer = ClusterFactory.Create(clusterType);
    clusterer.Parameters = new FeatureClusterParameters<UMCLight>
    {
        Tolerances = featureTolerances
    };

    RegisterProgressNotifier(aligner);
    RegisterProgressNotifier(finder);
    RegisterProgressNotifier(clusterer);

    var lcmsFilters = new LcmsFeatureFilteringOptions
    {
        FeatureLengthRange = new FilterRange(50, 300)
    };
    var msFilterOptions = new MsFeatureFilteringOptions
    {
        MinimumIntensity = 5000,
        ChargeRange = new FilterRange(1, 6),
        ShouldUseChargeFilter = true,
        ShouldUseDeisotopingFilter = true,
        ShouldUseIntensityFilter = true
    };

    // Only the first dataset is ever used as the baseline; everything else is aligned to it.
    const int baselineIndex = 0;
    var aligneeDatasets = datasets
        .Where((dataset, index) => index != baselineIndex)
        .ToList();

    PerformMultiAlignAnalysis(
        datasets[baselineIndex],
        aligneeDatasets,
        featureFindingOptions,
        msFilterOptions,
        lcmsFilters,
        spectralOptions,
        finder,
        aligner,
        clusterer,
        matchPath,
        errorPath);
}
/// <summary>
/// Runs a pairwise spectral comparison over every pair of cached datasets found in a
/// directory and writes one Figure 3 report per pair.
/// </summary>
/// <remarks>
/// A dataset is used only when an MSGF+ result file ("*_msgfdb_fht.txt") has a
/// matching scan cache file ("*.mscache"); names are compared in lower case.
/// The report base path is hard-coded in this method.
/// </remarks>
/// <param name="directory">Directory holding the cache and MSGF+ result files.</param>
/// <param name="comparerType">Spectral comparison method.</param>
/// <param name="mzBinSize">Stored in the options as the m/z bin size.</param>
/// <param name="mzTolerance">Stored in the options as the m/z tolerance.</param>
/// <param name="netTolerance">Stored in the options as the NET tolerance.</param>
/// <param name="similarityScoreCutoff">Stored in the options as the similarity cutoff.</param>
/// <param name="peptideScore">Stored in the options as the identification score.</param>
/// <param name="peptideFdr">Stored in the options as the FDR.</param>
/// <param name="ionPercent">Stored in the options as the top ion percent.</param>
/// <param name="numberOfRequiredPeaks">Stored in the options as the required peak count.</param>
public void GenerateFigure3_Matches(string directory,
    SpectralComparison comparerType,
    double mzBinSize,
    double mzTolerance,
    double netTolerance,
    double similarityScoreCutoff,
    double peptideScore,
    double peptideFdr,
    double ionPercent,
    int numberOfRequiredPeaks)
{
    AlignmentAnalysisWriterFactory.BasePath = @"M:\doc\papers\paperAlignment\Data\figure4";

    Console.WriteLine(@"Post-Pre Tests For {0}", directory);

    var cacheFiles = Directory.GetFiles(directory, "*.mscache");
    var msgfFiles = Directory.GetFiles(directory, "*_msgfdb_fht.txt");

    Console.WriteLine(@"Building data cache");

    // Lower-cased cache paths; only the keys are used, as a lookup set.
    var knownCaches = cacheFiles.ToDictionary<string, string, FigureBase.PathCache>(
        path => path.ToLower(),
        path => null);

    // Pair every MSGF+ result with its cache file, dropping results without one.
    var data = msgfFiles
        .Select(msgfPath => new
        {
            Msgf = msgfPath,
            CachePath = Path.Combine(
                directory,
                msgfPath.ToLower().Replace("_msgfdb_fht.txt", ".mscache"))
        })
        .Where(candidate => knownCaches.ContainsKey(candidate.CachePath))
        .Select(candidate => new FigureBase.PathCache
        {
            Cache = candidate.CachePath,
            Msgf = candidate.Msgf,
            Features = candidate.CachePath
        })
        .ToList();

    // The options for the analysis
    var options = new SpectralOptions
    {
        MzBinSize = mzBinSize,
        MzTolerance = mzTolerance,
        NetTolerance = netTolerance,
        SimilarityCutoff = similarityScoreCutoff,
        TopIonPercent = ionPercent,
        IdScore = peptideScore,
        ComparerType = comparerType,
        Fdr = peptideFdr,
        RequiredPeakCount = numberOfRequiredPeaks
    };

    Console.WriteLine(@"{0}", data.Count);

    // Running number that makes each report name unique.
    var reportIndex = 0;

    for (var first = 0; first < data.Count; first++)
    {
        var entryX = data[first];

        // The cache file stores the raw path of the dataset it was built from.
        var rawPathX = ScanSummaryCache.ReadPath(entryX.Cache);
        var datasetX = new AlignmentDataset(rawPathX, "", entryX.Msgf);

        using (var readerX = new InformedProteomicsReader())
        {
            // Wrap the raw reader so scan meta-data is served from the cache.
            var cacheReaderX = new RawLoaderCache(readerX);
            var cacheDataX = ScanSummaryCache.ReadCache(entryX.Cache);

            readerX.AddDataFile(rawPathX, 0);
            cacheReaderX.AddCache(0, cacheDataX);

            for (var second = first + 1; second < data.Count; second++)
            {
                // One report writer per compared pair.
                var writer = AlignmentAnalysisWriterFactory.Create(
                    AlignmentFigureType.Figure3,
                    "results-figure3-largeScale" + reportIndex);
                reportIndex++;

                var entryY = data[second];

                var rawPathY = ScanSummaryCache.ReadPath(entryY.Cache);
                var datasetY = new AlignmentDataset(rawPathY, "", entryY.Msgf);

                using (var readerY = new InformedProteomicsReader())
                {
                    var cacheReaderY = new RawLoaderCache(readerY);
                    var cacheDataY = ScanSummaryCache.ReadCache(entryY.Cache);
                    cacheReaderY.AddCache(0, cacheDataY);
                    readerY.AddDataFile(rawPathY, 0);

                    var names = new List<string>
                    {
                        entryX.Cache,
                        entryY.Cache
                    };

                    // Match the pair and write the results.
                    var analysis = MatchDatasets(
                        comparerType,
                        cacheReaderX,
                        cacheReaderY,
                        options,
                        datasetX,
                        datasetY,
                        names);
                    AlignMatches(analysis, writer);
                }
            }
        }
    }
}
/// <summary>
/// Finds the LC-MS features of the first dataset in a directory, links peptide
/// identifications to them, and writes the MS features that have at least one
/// identified MS/MS spectrum to a tab-delimited file ordered by m/z.
/// </summary>
/// <param name="directory">Directory searched (top level only) for supported dataset files.</param>
/// <param name="matchPath">Path of the text file to create.</param>
public void TestPeptideBands(string directory, string matchPath)
{
    // Build the search patterns from the supported MultiAlign file types.
    var extensions = DatasetInformation.SupportedFileTypes
        .Select(fileType => "*" + fileType.Extension)
        .ToList();

    // Find our datasets.
    var inputFiles = DatasetSearcher.FindDatasets(directory, extensions, SearchOption.TopDirectoryOnly);
    var datasets = DatasetInformation.ConvertInputFilesIntoDatasets(inputFiles);

    // Options setup.
    var preset = InstrumentPresetFactory.Create(InstrumentPresets.LtqOrbitrap);
    var featureTolerances = new FeatureTolerances
    {
        Mass = preset.Mass,
        Net = preset.NetTolerance,
        DriftTime = preset.DriftTimeTolerance
    };
    var msFilterOptions = new MsFeatureFilteringOptions
    {
        MinimumIntensity = 5000,
        ChargeRange = new FilterRange(1, 6),
        ShouldUseChargeFilter = true,
        ShouldUseDeisotopingFilter = true,
        ShouldUseIntensityFilter = true
    };
    var featureFindingOptions = new LcmsFeatureFindingOptions(featureTolerances)
    {
        MaximumNetRange = .002,
        MaximumScanRange = 50
    };

    var baselineDataset = datasets[0];

    UpdateStatus("Loading baseline features.");
    var msFeatures = UmcLoaderFactory.LoadMsFeatureData(baselineDataset.Features.Path);
    msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilterOptions);

    var finderFinder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);

    // Only the FDR and identification score are read from these options below.
    var peptideOptions = new SpectralOptions
    {
        ComparerType = SpectralComparison.CosineDotProduct,
        Fdr = .05,
        IdScore = 1e-09,
        MzBinSize = .5,
        MzTolerance = .5,
        NetTolerance = .1,
        RequiredPeakCount = 32,
        SimilarityCutoff = .75,
        TopIonPercent = .8
    };

    // Load the baseline reference set.
    using (var rawProviderX = RawLoaderFactory.CreateFileReader(baselineDataset.RawPath))
    {
        rawProviderX.AddDataFile(baselineDataset.RawPath, 0);

        UpdateStatus("Creating Baseline LCMS Features.");
        var baselineFeatures = finderFinder.FindFeatures(msFeatures, featureFindingOptions, rawProviderX);

        LinkPeptidesToFeatures(
            baselineDataset.SequencePath,
            baselineFeatures,
            peptideOptions.Fdr,
            peptideOptions.IdScore);

        // Flatten to MS features that carry MS/MS data, ordered by m/z.
        var fragmentedFeatures = baselineFeatures
            .SelectMany(lcmsFeature => lcmsFeature.MsFeatures)
            .Where(msFeature => msFeature.HasMsMs())
            .OrderBy(msFeature => msFeature.Mz)
            .ToList();

        // Keep each feature once if any of its spectra has a peptide attached.
        var identifiedFeatures = fragmentedFeatures
            .Where(msFeature => msFeature.MSnSpectra.Any(spectrum => spectrum.Peptides.Any()))
            .ToList();

        using (var writer = File.CreateText(matchPath))
        {
            writer.WriteLine("Charge\tpmz\tscan\tNET\t");
            foreach (var identified in identifiedFeatures)
            {
                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t",
                    identified.ChargeState,
                    identified.Mz,
                    identified.Scan,
                    identified.Net);
            }
        }
    }
}
/// <summary>
/// Creates a new database and fills it with the LC-MS features found in every
/// valid dataset of a directory.
/// </summary>
/// <param name="directoryPath">Path, resolved through GetPath, of the directory searched (top level only) for datasets.</param>
/// <param name="databasePath">Path, resolved through GetPath, of the database to create.</param>
public void CreateFeatureDatabase(string directoryPath, string databasePath)
{
    var directory = GetPath(directoryPath);
    databasePath = GetPath(databasePath);

    // Build the search patterns from the supported MultiAlign file types.
    var extensions = DatasetLoader.SupportedFileTypes
        .Select(fileType => "*" + fileType.Extension)
        .ToList();

    // Find our datasets.
    var datasetLoader = new DatasetLoader();
    var datasets = datasetLoader.GetValidDatasets(directory, extensions, SearchOption.TopDirectoryOnly);

    // Tolerances come from the instrument preset, with a widened mass tolerance.
    var preset = InstrumentPresetFactory.Create(InstrumentPresets.LtqOrbitrap);
    var featureTolerances = new FeatureTolerances
    {
        Mass = preset.Mass + 6,
        Net = preset.NetTolerance,
        DriftTime = preset.DriftTimeTolerance
    };
    var featureFindingOptions = new LcmsFeatureFindingOptions(featureTolerances)
    {
        MaximumNetRange = .002,
        MaximumScanRange = 50
    };
    var lcmsFilters = new LcmsFeatureFilteringOptions
    {
        FeatureLengthRangeScans = new FilterRange(50, 300)
    };
    var msFilterOptions = new MsFeatureFilteringOptions
    {
        MinimumIntensity = 5000,
        ChargeRange = new FilterRange(1, 6),
        ShouldUseChargeFilter = true,
        ShouldUseDeisotopingFilter = true,
        ShouldUseIntensityFilter = true
    };
    var spectralOptions = new SpectralOptions
    {
        ComparerType = SpectralComparison.CosineDotProduct,
        Fdr = .01,
        IdScore = 1e-09,
        MzBinSize = .5,
        MzTolerance = .5,
        NetTolerance = .1,
        RequiredPeakCount = 32,
        SimilarityCutoff = .75,
        TopIonPercent = .8
    };

    var finder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);

    NHibernateUtil.CreateDatabase(databasePath);

    // Synchronization and IO for serializing all data to the database.
    var providers = DataAccessFactory.CreateDataAccessProviders(databasePath, true);
    var cache = new FeatureLoader
    {
        Providers = providers
    };

    // Dataset ids are assigned sequentially, starting at zero, in enumeration order.
    var nextDatasetId = 0;
    foreach (var dataset in datasets)
    {
        dataset.DatasetId = nextDatasetId;
        nextDatasetId++;

        var features = FindFeatures(
            dataset,
            featureFindingOptions,
            msFilterOptions,
            lcmsFilters,
            spectralOptions,
            finder);
        cache.CacheFeatures(features);
    }

    providers.DatasetCache.AddAll(datasets);
}
/// <summary>
/// Computes all anchor point matches between two sets of spectra.
/// </summary>
/// <remarks>
/// The MS/MS scan summaries of both providers are sorted by precursor m/z and swept
/// with a sliding window. A pair of scans becomes a candidate when the precursor m/z
/// values differ by less than <c>options.MzTolerance</c> and the scan-based NET values
/// differ by less than <c>options.NetTolerance</c>. Spectra are only read for
/// candidates; Y spectra are cached until their precursor m/z falls below the window.
/// A candidate is reported when its similarity score is a finite number that is not
/// below <c>options.SimilarityCutoff</c>.
/// The peak lists of matched X spectra, and of matched Y spectra once they leave the
/// window, are cleared and set to null to limit memory use, so the spectra referenced
/// by the returned anchor points may have no peaks.
/// </remarks>
/// <param name="readerX">Provider of the first (X) dataset; scan data of group 0 is used.</param>
/// <param name="readerY">Provider of the second (Y) dataset; scan data of group 0 is used.</param>
/// <param name="comparer">Computes the similarity score of two spectra.</param>
/// <param name="filter">Filter passed on when reading spectra.</param>
/// <param name="options">Tolerances, similarity cutoff and spectrum reading parameters.</param>
/// <param name="skipComparison">
/// When true no spectra are read. A match requires both spectra, so the result is
/// always empty in that case.
/// </param>
/// <returns>The matches found; every match is flagged as <c>AnchorPointMatchType.FalseMatch</c>.</returns>
public IEnumerable<SpectralAnchorPointMatch> FindAnchorPoints(ISpectraProvider readerX,
    ISpectraProvider readerY,
    ISpectralComparer comparer,
    ISpectraFilter filter,
    SpectralOptions options,
    bool skipComparison = false)
{
    var matches = new List<SpectralAnchorPointMatch>();

    var scanDataX = readerX.GetScanData(0);
    var scanDataY = readerY.GetScanData(0);

    // Only fragmentation scans take part in the matching.
    var ySpectraSummary = scanDataY.Values.Where(summary => summary.MsLevel == 2).ToList();
    var xSpectraSummary = scanDataX.Values.Where(summary => summary.MsLevel == 2).ToList();

    // Nothing can match without MS/MS scans on both sides or without loading spectra.
    // Returning here also keeps the extrema search below from throwing on empty scan data.
    if (skipComparison || xSpectraSummary.Count == 0 || ySpectraSummary.Count == 0)
    {
        return matches;
    }

    // Determine the scan extrema, used to turn scan numbers into a 0-1 NET value.
    var maxX = scanDataX.Aggregate((l, r) => l.Value.Scan > r.Value.Scan ? l : r).Key;
    var minX = scanDataX.Aggregate((l, r) => l.Value.Scan < r.Value.Scan ? l : r).Key;
    var maxY = scanDataY.Aggregate((l, r) => l.Value.Scan > r.Value.Scan ? l : r).Key;
    var minY = scanDataY.Aggregate((l, r) => l.Value.Scan < r.Value.Scan ? l : r).Key;

    // Sorting by precursor m/z lets us sweep both lists once and evict cached
    // spectra as soon as they can no longer match.
    ySpectraSummary.Sort((x, y) => x.PrecursorMz.CompareTo(y.PrecursorMz));
    xSpectraSummary.Sort((x, y) => x.PrecursorMz.CompareTo(y.PrecursorMz));

    var netTolerance = options.NetTolerance;
    var mzTolerance = options.MzTolerance;

    var yTotal = ySpectraSummary.Count;
    var xTotal = xSpectraSummary.Count;

    // Y spectra that are still inside (or ahead of) the m/z window, keyed by scan.
    var cache = new Dictionary<int, MSSpectra>();
    // First anchor point created for each matched Y scan, keyed by scan.
    var pointsY = new Dictionary<int, SpectralAnchorPoint>();

    var i = 0;
    var j = 0;
    while (i < xTotal && j < yTotal)
    {
        var xsum = xSpectraSummary[i];
        var scanx = xsum.Scan;
        var precursorX = xsum.PrecursorMz;
        var netX = Convert.ToDouble(scanx - minX) / Convert.ToDouble(maxX - minX);

        // The X spectrum is read lazily, the first time a candidate needs it.
        MSSpectra spectrumX = null;
        var spectrumXMatched = false;

        // Advance the lower edge of the window, evicting Y spectra that fell out of it.
        while (j < yTotal && ySpectraSummary[j].PrecursorMz < (precursorX - mzTolerance))
        {
            var evictedScan = ySpectraSummary[j].Scan;
            if (cache.Remove(evictedScan))
            {
                SpectralAnchorPoint evictedPoint;
                if (pointsY.TryGetValue(evictedScan, out evictedPoint))
                {
                    ReleasePeaks(evictedPoint.Spectrum);
                }
            }
            j++;
        }

        var k = 0;
        while ((j + k) < yTotal &&
               Math.Abs(ySpectraSummary[j + k].PrecursorMz - precursorX) < mzTolerance)
        {
            var ysum = ySpectraSummary[j + k];
            k++;

            var scany = ysum.Scan;
            var netY = Convert.ToDouble(scany - minY) / Convert.ToDouble(maxY - minY);

            // Written as a negated "<" so that a NaN difference (single-scan dataset)
            // is rejected as well.
            if (!(Math.Abs(netX - netY) < netTolerance))
            {
                continue;
            }

            if (spectrumX == null)
            {
                spectrumX = SpectralUtilities.GetSpectra(options.MzBinSize,
                    options.TopIonPercent,
                    filter,
                    readerX,
                    scanx,
                    options.RequiredPeakCount);

                if (spectrumX == null)
                {
                    // This spectrum does not have enough peaks or did not pass our
                    // filters, so nothing can match it: move on to the next X scan.
                    break;
                }
                spectrumX.PrecursorMz = xsum.PrecursorMz;
            }

            MSSpectra spectrumY;
            if (!cache.TryGetValue(scany, out spectrumY))
            {
                spectrumY = SpectralUtilities.GetSpectra(options.MzBinSize,
                    options.TopIonPercent,
                    filter,
                    readerY,
                    scany,
                    options.RequiredPeakCount);

                if (spectrumY == null)
                {
                    // Not enough peaks or filtered out; it is not cached, so it is
                    // read again if another X scan reaches it.
                    continue;
                }
                spectrumY.PrecursorMz = ysum.PrecursorMz;
                cache.Add(scany, spectrumY);
            }

            var spectralSimilarity = comparer.CompareSpectra(spectrumX, spectrumY);
            if (double.IsNaN(spectralSimilarity) || double.IsInfinity(spectralSimilarity))
            {
                continue;
            }
            if (spectralSimilarity < options.SimilarityCutoff)
            {
                continue;
            }

            var pointX = new SpectralAnchorPoint
            {
                Net = netX,
                Mass = 0,
                Mz = xsum.PrecursorMz,
                Scan = scanx,
                Spectrum = spectrumX
            };
            var pointY = new SpectralAnchorPoint
            {
                Net = netY,
                Mass = 0,
                Mz = ysum.PrecursorMz,
                Scan = scany,
                Spectrum = spectrumY
            };

            matches.Add(new SpectralAnchorPointMatch
            {
                AnchorPointX = pointX,
                AnchorPointY = pointY,
                SimilarityScore = spectralSimilarity,
                IsValidMatch = AnchorPointMatchType.FalseMatch
            });
            spectrumXMatched = true;

            if (!pointsY.ContainsKey(scany))
            {
                pointsY.Add(scany, pointY);
            }
        }

        // Move to the next spectrum in the x-list. The matches keep a reference to
        // this X spectrum, so drop its peaks now that it will not be compared again.
        i++;
        if (spectrumXMatched)
        {
            ReleasePeaks(spectrumX);
        }
    }

    return matches;
}

/// <summary>
/// Clears the peak list of a spectrum and detaches it so the memory can be reclaimed.
/// </summary>
private static void ReleasePeaks(MSSpectra spectrum)
{
    if (spectrum.Peaks != null)
    {
        spectrum.Peaks.Clear();
        spectrum.Peaks = null;
    }
}
/// <summary>
/// Aligns every pair of consecutive datasets stored in a feature database and
/// produces one match file and one error file per pair.
/// </summary>
/// <param name="relativeDatabasePath">Database path, resolved through GetPath.</param>
/// <param name="relativeName">Output directory name, resolved through GetOutputPath; created when missing.</param>
/// <param name="name">Prefix of the output files: "{name}-{index}-matches.txt" and "{name}-{index}-errors.txt".</param>
/// <param name="alignmentType">Alignment algorithm to create.</param>
/// <param name="clusterType">Clustering algorithm to create.</param>
public void GenerateClusterAlignmentStatistics(string relativeDatabasePath,
    string relativeName,
    string name,
    FeatureAlignmentType alignmentType,
    LcmsFeatureClusteringAlgorithmType clusterType)
{
    var databasePath = GetPath(relativeDatabasePath);
    var outputPath = GetOutputPath(relativeName);

    if (!Directory.Exists(outputPath))
    {
        Directory.CreateDirectory(outputPath);
    }

    // Connect to the NHibernate database.
    var providers = DataAccessFactory.CreateDataAccessProviders(databasePath, false);

    // Alignment and spectral matching options.
    var alignmentOptions = new AlignmentOptions();
    var spectralOptions = new SpectralOptions
    {
        ComparerType = SpectralComparison.CosineDotProduct,
        Fdr = .01,
        IdScore = 1e-09,
        MzBinSize = .5,
        MzTolerance = .5,
        NetTolerance = .1,
        RequiredPeakCount = 32,
        SimilarityCutoff = .75,
        TopIonPercent = .8
    };

    // Tolerances come from the instrument preset, with a widened mass tolerance.
    var preset = InstrumentPresetFactory.Create(InstrumentPresets.LtqOrbitrap);
    var featureTolerances = new FeatureTolerances
    {
        Mass = preset.Mass + 6,
        Net = preset.NetTolerance,
        DriftTime = preset.DriftTimeTolerance
    };

    UpdateStatus("Retrieving all datasets for test.");
    var datasets = providers.DatasetCache.FindAll();

    // Create our algorithms.
    var aligner = FeatureAlignerFactory.CreateDatasetAligner(
        alignmentType,
        alignmentOptions.LCMSWarpOptions,
        spectralOptions);
    var clusterer = ClusterFactory.Create(clusterType);
    clusterer.Parameters = new FeatureClusterParameters<UMCLight>
    {
        Tolerances = featureTolerances
    };

    RegisterProgressNotifier(aligner);
    RegisterProgressNotifier(clusterer);

    // Dataset N is the baseline for dataset N + 1.
    for (var baselineIndex = 0; baselineIndex < datasets.Count - 1; baselineIndex++)
    {
        var matchPath = Path.Combine(
            outputPath,
            string.Format("{0}-{1}-matches.txt", name, baselineIndex));
        var errorPath = Path.Combine(
            outputPath,
            string.Format("{0}-{1}-errors.txt", name, baselineIndex));

        var aligneeDataset = datasets[baselineIndex + 1];
        var baselineDataset = datasets[baselineIndex];

        // Load the baseline reference set.
        using (var rawProviderX = new InformedProteomicsReader())
        {
            rawProviderX.AddDataFile(baselineDataset.RawFile.Path, 0);

            // Load the alignee set.
            using (var rawProviderY = new InformedProteomicsReader())
            {
                rawProviderY.AddDataFile(aligneeDataset.RawFile.Path, 0);

                var baselineFeatures = RetrieveFeatures(baselineDataset.DatasetId, providers);
                var aligneeFeatures = RetrieveFeatures(aligneeDataset.DatasetId, providers);

                var providerX = new CachedFeatureSpectraProvider(rawProviderX, baselineFeatures);
                var providerY = new CachedFeatureSpectraProvider(rawProviderY, aligneeFeatures);

                AlignDatasets(
                    baselineFeatures,
                    aligneeFeatures,
                    providerX,
                    providerY,
                    aligner,
                    clusterer,
                    matchPath,
                    errorPath);
            }
        }
    }
}
/// <summary>
/// Finds the LC-MS features of the first valid dataset in a directory, links peptide
/// identifications to them, and writes the MS features that have at least one
/// identified MS/MS spectrum to a tab-delimited file ordered by m/z.
/// </summary>
/// <param name="directory">Directory searched (top level only) for supported dataset files.</param>
/// <param name="matchPath">Path of the text file to create.</param>
public void TestPeptideBands(string directory, string matchPath)
{
    // Build the search patterns from the supported MultiAlign file types.
    var extensions = DatasetLoader.SupportedFileTypes
        .Select(fileType => "*" + fileType.Extension)
        .ToList();

    // Find our datasets.
    var datasetLoader = new DatasetLoader();
    var datasets = datasetLoader.GetValidDatasets(directory, extensions, SearchOption.TopDirectoryOnly);

    // Options setup.
    var preset = InstrumentPresetFactory.Create(InstrumentPresets.LtqOrbitrap);
    var featureTolerances = new FeatureTolerances
    {
        Mass = preset.Mass,
        Net = preset.NetTolerance,
        DriftTime = preset.DriftTimeTolerance
    };
    var msFilterOptions = new MsFeatureFilteringOptions
    {
        MinimumIntensity = 5000,
        ChargeRange = new FilterRange(1, 6),
        ShouldUseChargeFilter = true,
        ShouldUseDeisotopingFilter = true,
        ShouldUseIntensityFilter = true
    };
    var featureFindingOptions = new LcmsFeatureFindingOptions(featureTolerances)
    {
        MaximumNetRange = .002,
        MaximumScanRange = 50
    };

    var baselineDataset = datasets[0];

    UpdateStatus("Loading baseline features.");
    var msFeatures = UmcLoaderFactory.LoadMsFeatureData(baselineDataset.Features.Path);
    msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilterOptions);

    var finderFinder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);

    // Only the FDR and identification score are read from these options below.
    var peptideOptions = new SpectralOptions
    {
        ComparerType = SpectralComparison.CosineDotProduct,
        Fdr = .05,
        IdScore = 1e-09,
        MzBinSize = .5,
        MzTolerance = .5,
        NetTolerance = .1,
        RequiredPeakCount = 32,
        SimilarityCutoff = .75,
        TopIonPercent = .8
    };

    // Load the baseline reference set.
    using (var rawProviderX = RawLoaderFactory.CreateFileReader(baselineDataset.RawFile.Path))
    {
        rawProviderX.AddDataFile(baselineDataset.RawFile.Path, 0);

        UpdateStatus("Creating Baseline LCMS Features.");
        var baselineFeatures = finderFinder.FindFeatures(msFeatures, featureFindingOptions, rawProviderX);

        LinkPeptidesToFeatures(
            baselineDataset.Sequence.Path,
            baselineFeatures,
            peptideOptions.Fdr,
            peptideOptions.IdScore);

        // Flatten to MS features that carry MS/MS data, ordered by m/z.
        var fragmentedFeatures = baselineFeatures
            .SelectMany(lcmsFeature => lcmsFeature.MsFeatures)
            .Where(msFeature => msFeature.HasMsMs())
            .OrderBy(msFeature => msFeature.Mz)
            .ToList();

        // Keep each feature once if any of its spectra has a peptide attached.
        var identifiedFeatures = fragmentedFeatures
            .Where(msFeature => msFeature.MSnSpectra.Any(spectrum => spectrum.Peptides.Any()))
            .ToList();

        using (var writer = File.CreateText(matchPath))
        {
            writer.WriteLine("Charge\tpmz\tscan\tNET\t");
            foreach (var identified in identifiedFeatures)
            {
                writer.WriteLine("{0}\t{1}\t{2}\t{3}\t",
                    identified.ChargeState,
                    identified.Mz,
                    identified.Scan,
                    identified.Net);
            }
        }
    }
}
/// <summary>
/// Discovers the valid datasets in a directory, configures feature finding, alignment
/// and clustering, and runs the MultiAlign analysis using the first dataset as the
/// baseline and all remaining datasets as alignees.
/// </summary>
/// <param name="directory">Directory searched (top level only) for supported dataset files.</param>
/// <param name="outputPath">Prefix of the output files: "{outputPath}.txt" is passed on as the match path and "{outputPath}-errors.txt" as the error path.</param>
/// <param name="alignmentType">Alignment algorithm to create.</param>
/// <param name="clusterType">Clustering algorithm to create.</param>
public void TestClustering(
    string directory,
    string outputPath,
    FeatureAlignmentType alignmentType,
    LcmsFeatureClusteringAlgorithmType clusterType)
{
    var matchPath = outputPath + ".txt";
    var errorPath = outputPath + "-errors.txt";

    // Build the search patterns from the supported MultiAlign file types.
    var extensions = DatasetLoader.SupportedFileTypes
        .Select(fileType => "*" + fileType.Extension)
        .ToList();

    // Find our datasets.
    var datasetLoader = new DatasetLoader();
    var datasets = datasetLoader.GetValidDatasets(directory, extensions, SearchOption.TopDirectoryOnly);

    // Alignment and spectral matching options.
    var alignmentOptions = new AlignmentOptions();
    var spectralOptions = new SpectralOptions
    {
        ComparerType = SpectralComparison.CosineDotProduct,
        Fdr = .01,
        IdScore = 1e-09,
        MzBinSize = .5,
        MzTolerance = .5,
        NetTolerance = .1,
        RequiredPeakCount = 32,
        SimilarityCutoff = .75,
        TopIonPercent = .8
    };

    // Tolerances come from the instrument preset, with a widened mass tolerance.
    var preset = InstrumentPresetFactory.Create(InstrumentPresets.LtqOrbitrap);
    var featureTolerances = new FeatureTolerances
    {
        Mass = preset.Mass + 6,
        Net = preset.NetTolerance,
        DriftTime = preset.DriftTimeTolerance
    };
    var featureFindingOptions = new LcmsFeatureFindingOptions(featureTolerances)
    {
        MaximumNetRange = .002,
        MaximumScanRange = 50
    };

    // Create our algorithms.
    var finder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);
    var aligner = FeatureAlignerFactory.CreateDatasetAligner(
        alignmentType,
        alignmentOptions.LCMSWarpOptions,
        spectralOptions);
    var clusterer = ClusterFactory.Create(clusterType);
    clusterer.Parameters = new FeatureClusterParameters<UMCLight>
    {
        Tolerances = featureTolerances
    };

    RegisterProgressNotifier(aligner);
    RegisterProgressNotifier(finder);
    RegisterProgressNotifier(clusterer);

    var lcmsFilters = new LcmsFeatureFilteringOptions
    {
        FeatureLengthRangeScans = new FilterRange(50, 300)
    };
    var msFilterOptions = new MsFeatureFilteringOptions
    {
        MinimumIntensity = 5000,
        ChargeRange = new FilterRange(1, 6),
        ShouldUseChargeFilter = true,
        ShouldUseDeisotopingFilter = true,
        ShouldUseIntensityFilter = true
    };

    // Only the first dataset is ever used as the baseline; everything else is aligned to it.
    const int baselineIndex = 0;
    var aligneeDatasets = datasets
        .Where((dataset, index) => index != baselineIndex)
        .ToList();

    PerformMultiAlignAnalysis(
        datasets[baselineIndex],
        aligneeDatasets,
        featureFindingOptions,
        msFilterOptions,
        lcmsFilters,
        spectralOptions,
        finder,
        aligner,
        clusterer,
        matchPath,
        errorPath);
}
/// <summary>
/// Creates a new database and fills it with the LC-MS features found in every
/// dataset of a directory.
/// </summary>
/// <param name="directoryPath">Path, resolved through GetPath, of the directory searched (top level only) for datasets.</param>
/// <param name="databasePath">Path, resolved through GetPath, of the database to create.</param>
public void CreateFeatureDatabase(string directoryPath, string databasePath)
{
    var directory = GetPath(directoryPath);
    databasePath = GetPath(databasePath);

    // Build the search patterns from the supported MultiAlign file types.
    var extensions = DatasetInformation.SupportedFileTypes
        .Select(fileType => "*" + fileType.Extension)
        .ToList();

    // Find our datasets.
    var inputFiles = DatasetSearcher.FindDatasets(directory, extensions, SearchOption.TopDirectoryOnly);
    var datasets = DatasetInformation.ConvertInputFilesIntoDatasets(inputFiles);

    // Tolerances come from the instrument preset, with a widened mass tolerance.
    var preset = InstrumentPresetFactory.Create(InstrumentPresets.LtqOrbitrap);
    var featureTolerances = new FeatureTolerances
    {
        Mass = preset.Mass + 6,
        Net = preset.NetTolerance,
        DriftTime = preset.DriftTimeTolerance
    };
    var featureFindingOptions = new LcmsFeatureFindingOptions(featureTolerances)
    {
        MaximumNetRange = .002,
        MaximumScanRange = 50
    };
    var lcmsFilters = new LcmsFeatureFilteringOptions
    {
        FeatureLengthRange = new FilterRange(50, 300)
    };
    var msFilterOptions = new MsFeatureFilteringOptions
    {
        MinimumIntensity = 5000,
        ChargeRange = new FilterRange(1, 6),
        ShouldUseChargeFilter = true,
        ShouldUseDeisotopingFilter = true,
        ShouldUseIntensityFilter = true
    };
    var spectralOptions = new SpectralOptions
    {
        ComparerType = SpectralComparison.CosineDotProduct,
        Fdr = .01,
        IdScore = 1e-09,
        MzBinSize = .5,
        MzTolerance = .5,
        NetTolerance = .1,
        RequiredPeakCount = 32,
        SimilarityCutoff = .75,
        TopIonPercent = .8
    };

    var finder = FeatureFinderFactory.CreateFeatureFinder(FeatureFinderType.TreeBased);

    NHibernateUtil.CreateDatabase(databasePath);

    // Synchronization and IO for serializing all data to the database.
    var providers = DataAccessFactory.CreateDataAccessProviders(databasePath, true);
    var cache = new FeatureLoader
    {
        Providers = providers
    };

    // Dataset ids are assigned sequentially, starting at zero, in enumeration order.
    var nextDatasetId = 0;
    foreach (var dataset in datasets)
    {
        dataset.DatasetId = nextDatasetId;
        nextDatasetId++;

        var features = FindFeatures(
            dataset,
            featureFindingOptions,
            msFilterOptions,
            lcmsFilters,
            spectralOptions,
            finder);
        cache.CacheFeatures(features);
    }

    providers.DatasetCache.AddAll(datasets);
}
/// <summary>
/// Runs the MultiAlign analysis: finds LC-MS features in the baseline dataset, then for each
/// alignee dataset finds its features, clusters baseline and alignee features together,
/// aligns the alignee to the baseline, clusters again, and reports the pre/post cluster
/// statistics through <c>UpdateStatus</c>. Alignment matches are handed to
/// <c>WriteErrors</c> and <c>SaveMatches</c> for every alignee dataset.
/// </summary>
/// <param name="baselineDataset">Reference dataset; its feature, raw and sequence paths are read.</param>
/// <param name="aligneeDatasets">Datasets that are each aligned against the baseline.</param>
/// <param name="featureFindingOptions">Options passed to <paramref name="featureFinder"/>.</param>
/// <param name="msFilterOptions">Options used to filter the loaded MS features.</param>
/// <param name="lcmsFilterOptions">Not used by this method.</param>
/// <param name="peptideOptions">Supplies the FDR and identification score used when linking peptides to features.</param>
/// <param name="featureFinder">Finds LC-MS features from the filtered MS features.</param>
/// <param name="aligner">Aligns alignee features to the baseline features; its spectra providers are set here.</param>
/// <param name="clusterer">Clusters the combined baseline and alignee features.</param>
/// <param name="matchPath">Path passed to <c>SaveMatches</c> for each alignee dataset.</param>
/// <param name="errorPath">Path passed to <c>WriteErrors</c> for each alignee dataset.</param>
public void PerformMultiAlignAnalysis(DatasetInformation baselineDataset,
    IEnumerable<DatasetInformation> aligneeDatasets,
    LcmsFeatureFindingOptions featureFindingOptions,
    MsFeatureFilteringOptions msFilterOptions,
    LcmsFeatureFilteringOptions lcmsFilterOptions,
    SpectralOptions peptideOptions,
    IFeatureFinder featureFinder,
    IFeatureAligner<IEnumerable<UMCLight>, IEnumerable<UMCLight>, classAlignmentData> aligner,
    IClusterer<UMCLight, UMCClusterLight> clusterer,
    string matchPath,
    string errorPath)
{
    try
    {
        UpdateStatus("Loading baseline features.");
        var msFeatures = UmcLoaderFactory.LoadMsFeatureData(baselineDataset.Features.Path);
        msFeatures = LcmsFeatureFilters.FilterMsFeatures(msFeatures, msFilterOptions);

        // Load the baseline reference set
        using (var rawProviderX = RawLoaderFactory.CreateFileReader(baselineDataset.RawPath))
        {
            rawProviderX.AddDataFile(baselineDataset.RawPath, 0);

            UpdateStatus("Creating Baseline LCMS Features.");
            var baselineFeatures = featureFinder.FindFeatures(msFeatures,
                featureFindingOptions,
                rawProviderX);

            LinkPeptidesToFeatures(baselineDataset.SequencePath,
                baselineFeatures,
                peptideOptions.Fdr,
                peptideOptions.IdScore);

            var providerX = new CachedFeatureSpectraProvider(rawProviderX, baselineFeatures);

            // Then load the alignee dataset
            foreach (var dataset in aligneeDatasets)
            {
                var aligneeMsFeatures = UmcLoaderFactory.LoadMsFeatureData(dataset.Features.Path);
                aligneeMsFeatures = LcmsFeatureFilters.FilterMsFeatures(aligneeMsFeatures, msFilterOptions);

                using (var rawProviderY = RawLoaderFactory.CreateFileReader(dataset.RawPath))
                {
                    rawProviderY.AddDataFile(dataset.RawPath, 0);

                    UpdateStatus("Finding alignee features");
                    var aligneeFeatures = featureFinder.FindFeatures(aligneeMsFeatures,
                        featureFindingOptions,
                        rawProviderY);

                    LinkPeptidesToFeatures(dataset.SequencePath,
                        aligneeFeatures,
                        peptideOptions.Fdr,
                        peptideOptions.IdScore);

                    var providerY = new CachedFeatureSpectraProvider(rawProviderY, aligneeFeatures);

                    // Cluster before we do anything else....
                    var allFeatures = new List<UMCLight>();
                    allFeatures.AddRange(baselineFeatures);
                    allFeatures.AddRange(aligneeFeatures);

                    foreach (var feature in allFeatures)
                    {
                        // NOTE(review): self-assignment; this does nothing unless the Net setter
                        // has side effects. Possibly an aligned-NET property was intended, by
                        // analogy with the line below - confirm before changing.
                        feature.Net = feature.Net;
                        feature.MassMonoisotopicAligned = feature.MassMonoisotopic;
                    }

                    // This tells us the differences before we align.
                    var clusters = clusterer.Cluster(allFeatures);
                    var preAlignment = AnalyzeClusters(clusters);

                    aligner.AligneeSpectraProvider = providerY;
                    aligner.BaselineSpectraProvider = providerX;

                    UpdateStatus("Aligning data");
                    var data = aligner.Align(baselineFeatures, aligneeFeatures);
                    var matches = data.Matches;

                    WriteErrors(errorPath, matches);

                    // Drop the pre-alignment cluster assignments so the features can be re-clustered.
                    foreach (var feature in allFeatures)
                    {
                        feature.UmcCluster = null;
                        feature.ClusterId = -1;
                    }

                    // Then cluster after alignment!
                    UpdateStatus("clustering data");
                    clusters = clusterer.Cluster(allFeatures);
                    var postAlignment = AnalyzeClusters(clusters);

                    UpdateStatus("Note\tSame\tDifferent");
                    UpdateStatus(string.Format("Pre\t{0}\t{1}", preAlignment.SameCluster,
                        preAlignment.DifferentCluster));
                    UpdateStatus(string.Format("Post\t{0}\t{1}", postAlignment.SameCluster,
                        postAlignment.DifferentCluster));

                    SaveMatches(matchPath, matches);
                }
            }
        }
    }
    finally
    {
        // Always detach the progress notifiers, even when the analysis fails part-way,
        // so a failed run does not leave them registered.
        DeRegisterProgressNotifier(aligner);
        DeRegisterProgressNotifier(featureFinder);
        DeRegisterProgressNotifier(clusterer);
    }
}