/// <summary>
/// Opens one <c>PbfLcMsRun</c> per raw file, aligns the features of the given ms1ft files with a
/// tolerance of 10, and writes the alignment twice: once to <c>outFilePath + ".tmp"</c> before
/// <c>RefineAbundance</c> is called, and once to <paramref name="outFilePath"/> afterwards.
/// </summary>
/// <param name="ms1FtFiles">Feature files handed to the <c>LcMsFeatureAlignment</c> constructor.</param>
/// <param name="rawFiles">Raw file paths; each one is opened as a <c>PbfLcMsRun</c>, in order.</param>
/// <param name="outFilePath">Path of the final output; the pre-refinement output gets a ".tmp" suffix.</param>
private void RunFeatureAlignment(IList<string> ms1FtFiles, IReadOnlyList<string> rawFiles, string outFilePath)
{
    // Open the runs in the same order as the raw files were supplied.
    var runs = new List<LcMsRun>();
    for (var index = 0; index < rawFiles.Count; index++)
    {
        runs.Add(new PbfLcMsRun(rawFiles[index]));
    }

    var comparer = new LcMsFeatureAlignComparer(new Tolerance(10));
    var aligner = new LcMsFeatureAlignment(ms1FtFiles, runs, comparer);
    aligner.AlignFeatures();
    Console.WriteLine("# of aligned features = {0}", aligner.CountAlignedFeatures);

    // Snapshot of the alignment before abundance refinement (last argument is true here).
    var unrefinedOutPath = outFilePath + ".tmp";
    OutputAlignmentResult(aligner, unrefinedOutPath, rawFiles, true);

    // Final output after refinement (last argument is false here).
    aligner.RefineAbundance();
    OutputAlignmentResult(aligner, outFilePath, rawFiles, false);
}
// Aligns ProMex features across the 32 "Syn_utex2973_Top_NN" datasets.
//
// For every dataset whose .pbf, MSAlign result table and .ms1ft files all exist, the test loads the
// identifications, attaches to each feature the identifications whose scan number lies strictly
// inside the feature's scan range and whose mass is within tolerance, and registers the features
// with the aligner. After alignment and abundance refinement it writes one "<dataset>_aligned.ms1ft"
// file per dataset into promexOutFolder.
//
// The test is ignored when the raw folder is missing or when no dataset had all of its input files.
public void TestAlignFeatures()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string rawFolder = @"\\proto-11\MSXML_Cache\PBF_Gen_1_193\2015_2";
    const string promexOutFolder = @"D:\MassSpecFiles\UTEX\MSAlign";
    const string msAlignResultFolder = @"D:\MassSpecFiles\UTEX\MSAlign";

    if (!Directory.Exists(rawFolder))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, rawFolder);
    }

    const int nDataset = 32;
    var dataset = new string[nDataset];
    for (var i = 0; i < nDataset; i++)
    {
        dataset[i] = string.Format("Syn_utex2973_Top_{0,2:D2}_TopDown_7May15_Bane_14-09-01RZ", i + 1);
    }

    var tolerance = new Tolerance(10);
    var ftComparer = new UtexFeatureComparer(tolerance);
    var align = new LcMsFeatureAlignment(ftComparer);
    var prsmReader = new ProteinSpectrumMatchReader(0.01);
    var filesProcessed = 0;

    for (var i = 0; i < dataset.Length; i++)
    {
        var rawFile = string.Format(@"{0}\{1}.pbf", rawFolder, dataset[i]);
        if (!File.Exists(rawFile))
        {
            Console.WriteLine(@"Warning: Skipping file not found: {0}", rawFile);
            continue;
        }

        var path = string.Format(@"{0}\{1}_MSAlign_ResultTable.txt", msAlignResultFolder, dataset[i]);
        if (!File.Exists(path))
        {
            Console.WriteLine(@"Warning: Skipping file not found: {0}", path);
            continue;
        }

        var ms1ftPath = string.Format(@"{0}\{1}.ms1ft", promexOutFolder, dataset[i]);
        if (!File.Exists(ms1ftPath))
        {
            Console.WriteLine(@"Warning: Skipping file not found: {0}", ms1ftPath);
            continue;
        }

        // Open the run only once all three inputs are known to exist, so that no run is loaded
        // for a dataset that is going to be skipped anyway.
        var run = PbfLcMsRun.GetLcMsRun(rawFile);
        filesProcessed++;

        var prsmList = prsmReader.LoadIdentificationResult(path, ProteinSpectrumMatch.SearchTool.MsAlign);
        for (var j = 0; j < prsmList.Count; j++)
        {
            var match = prsmList[j];

            // The protein id is the five characters that follow ProteinNamePrefix.
            // NOTE(review): assumes every protein name contains the prefix followed by at least
            // five characters; this is not checked here.
            var idStart = match.ProteinName.IndexOf(ProteinNamePrefix, StringComparison.Ordinal) + ProteinNamePrefix.Length;
            match.ProteinId = match.ProteinName.Substring(idStart, 5);
        }

        var features = LcMsFeatureAlignment.LoadProMexResult(i, ms1ftPath, run);

        // Tag features by PrSMs.
        for (var j = 0; j < features.Count; j++)
        {
            var feature = features[j];
            var massTol = tolerance.GetToleranceAsMz(feature.Mass);
            foreach (var match in prsmList)
            {
                var insideScanRange = feature.MinScanNum < match.ScanNum && match.ScanNum < feature.MaxScanNum;
                if (insideScanRange && Math.Abs(feature.Mass - match.Mass) < massTol)
                {
                    feature.ProteinSpectrumMatches.Add(match);
                }
            }
        }

        align.AddDataSet(i, features, run);
    }

    if (filesProcessed == 0)
    {
        Assert.Ignore("Skipped since input files not found");
    }

    align.AlignFeatures();
    Console.WriteLine("{0} alignments ", align.CountAlignedFeatures);
    align.RefineAbundance();

    var alignedFeatureList = align.GetAlignedFeatures();
    for (var i = 0; i < nDataset; i++)
    {
        var alignedPath = string.Format(@"{0}\{1}_aligned.ms1ft", promexOutFolder, dataset[i]);

        // The using block guarantees the file is flushed and closed even if a write throws.
        using (var writer = new StreamWriter(alignedPath))
        {
            writer.Write(LcMsFeatureFinderLauncher.GetHeaderString());
            writer.WriteLine("\tIdedMs2ScanNums");

            for (var j = 0; j < alignedFeatureList.Count; j++)
            {
                writer.Write(j + 1);
                writer.Write("\t");

                var feature = alignedFeatureList[j][i];
                if (feature == null)
                {
                    // No feature for this dataset in this alignment: emit a row of fifteen zeros.
                    for (var k = 0; k < 14; k++)
                    {
                        writer.Write("0\t");
                    }

                    writer.Write("0\n");
                }
                else
                {
                    writer.Write(LcMsFeatureFinderLauncher.GetString(feature));
                    writer.Write("\t");

                    // The last column stays empty when the feature carries no match collection.
                    if (feature.ProteinSpectrumMatches != null)
                    {
                        var scanNums = string.Join(";", feature.ProteinSpectrumMatches.Select(prsm => prsm.ScanNum));
                        writer.Write(scanNums);
                    }

                    writer.Write("\n");
                }
            }
        }
    }
}
/// <summary>
/// Clusters features across datasets by handing them to an <c>LcMsFeatureAlignment</c> instance
/// (10 ppm tolerance) and converting each aligned group back into a <c>UMCClusterLight</c>.
/// </summary>
/// <param name="data">Features to cluster; they are grouped by their <c>GroupId</c>.</param>
/// <param name="progress">Optional progress reporter; a new <c>Progress</c> instance is used when null.</param>
/// <returns>
/// One cluster per aligned group that contains at least one non-null feature. An empty list is
/// returned when <paramref name="data"/> is empty.
/// </returns>
public List<UMCClusterLight> Cluster(List<UMCLight> data, IProgress<ProgressData> progress = null)
{
    progress = progress ?? new Progress<ProgressData>();

    if (data.Count == 0)
    {
        return new List<UMCClusterLight>();
    }

    this.maxFeatureId = data.Max(d => d.Id);

    // Index every input feature by (GroupId, Id).
    this.featureMap = new Dictionary<Tuple<int, int>, UMCLight>();
    foreach (var feature in data)
    {
        var key = new Tuple<int, int>(feature.GroupId, feature.Id);
        this.featureMap.Add(key, feature);
    }

    var lcmsFeatureAligner = new LcMsFeatureAlignment(new LcMsFeatureAlignComparer(new Tolerance(10, ToleranceUnit.Ppm)));

    // Group features by dataset.
    var idToFeatures = new Dictionary<int, List<UMCLight>>();
    foreach (var umcLight in data)
    {
        List<UMCLight> datasetFeatures;
        if (!idToFeatures.TryGetValue(umcLight.GroupId, out datasetFeatures))
        {
            datasetFeatures = new List<UMCLight>();
            idToFeatures.Add(umcLight.GroupId, datasetFeatures);
        }

        datasetFeatures.Add(umcLight);
    }

    // Convert UMCLights to InformedProteomics LcMsFeatures.
    foreach (var ds in idToFeatures)
    {
        var lcmsFeatures = new List<LcMsFeature>(ds.Value.Select(this.GetLcMsFeature));
        lcmsFeatureAligner.AddDataSet(ds.Key, lcmsFeatures, this.GetLcMsRun(ds.Key));
    }

    // Perform clustering.
    lcmsFeatureAligner.AlignFeatures();

    // Fill in missing features using noise.
    lcmsFeatureAligner.RefineAbundance(-30, progress);

    var clusteredFeatures = lcmsFeatureAligner.GetAlignedFeatures();

    // Convert InformedProteomics clusters to UMCClusterLight.
    int clustId = 0;
    var clusters = new List<UMCClusterLight>();
    foreach (var cluster in clusteredFeatures)
    {
        // Groups without a single feature do not produce a cluster (and do not consume an id).
        if (!cluster.Any(f => f != null))
        {
            continue;
        }

        var umcCluster = new UMCClusterLight { Id = clustId++ };

        // Promex doesn't keep track of which dataset noise features belong to, so we need to.
        // The counter advances for every slot, including empty ones, so that a feature's
        // dataset id always equals its position in the cluster; advancing it only for non-null
        // entries would shift the ids of every feature that follows an empty slot.
        // NOTE(review): this relies on the position in the cluster matching the dataset id
        // passed to AddDataSet - confirm against LcMsFeatureAlignment.
        int datasetId = 0;
        foreach (var feature in cluster)
        {
            var currentDatasetId = datasetId++;
            if (feature == null)
            {
                continue;
            }

            feature.DataSetId = currentDatasetId;
            var umc = this.GetUMC(feature);
            umcCluster.AddChildFeature(umc);
            umc.SetParentFeature(umcCluster);
        }

        umcCluster.CalculateStatistics(ClusterCentroidRepresentation.Median);
        clusters.Add(umcCluster);
    }

    return clusters;
}
/// <summary>
/// Builds a <c>PbfLcMsRun</c> for every raw file, runs feature alignment over the given ms1ft files
/// with a tolerance of 10, and emits two outputs: a ".tmp" file written before
/// <c>RefineAbundance</c> and the file at <paramref name="outFilePath"/> written after it.
/// </summary>
/// <param name="ms1FtFiles">Feature files passed to the <c>LcMsFeatureAlignment</c> constructor.</param>
/// <param name="rawFiles">Raw file paths, converted one by one (in list order) into runs.</param>
/// <param name="outFilePath">Destination of the final output; ".tmp" is appended for the first output.</param>
private void RunFeatureAlignment(List<string> ms1FtFiles, List<string> rawFiles, string outFilePath)
{
    // One run per raw file, created in list order.
    var runs = rawFiles.ConvertAll<LcMsRun>(rawFile => new PbfLcMsRun(rawFile));

    var featureComparer = new LcMsFeatureAlignComparer(new Tolerance(10));
    var alignment = new LcMsFeatureAlignment(ms1FtFiles, runs, featureComparer);
    alignment.AlignFeatures();
    Console.WriteLine("# of aligned features = {0}", alignment.CountAlignedFeatures);

    // Output before refinement goes to the ".tmp" path with the last argument set to true.
    OutputAlignmentResult(alignment, outFilePath + ".tmp", rawFiles, true);

    // Output after refinement goes to the requested path with the last argument set to false.
    alignment.RefineAbundance();
    OutputAlignmentResult(alignment, outFilePath, rawFiles, false);
}