/// <summary>
/// Creates a feature map for <paramref name="run"/> from a feature file on disk.
/// The features are loaded with <c>LcMsFeatureAlignment.LoadProMexResult</c> using dataset id 0, and the
/// file name of <paramref name="ms1FtPath"/> (without its extension) is handed to the main constructor
/// as its third argument.
/// </summary>
/// <param name="run">Run passed to both the feature loader and the main constructor.</param>
/// <param name="ms1FtPath">Path of the feature file to load.</param>
/// <param name="minMass">Forwarded unchanged to the feature loader and the main constructor.</param>
/// <param name="maxMass">Forwarded unchanged to the feature loader and the main constructor.</param>
public LcMsFeatureMap(LcMsRun run, string ms1FtPath, double minMass, double maxMass) : this(run, LcMsFeatureAlignment.LoadProMexResult(0, ms1FtPath, run, minMass, maxMass), Path.GetFileNameWithoutExtension(ms1FtPath), minMass, maxMass) { }
/// <summary>
/// Writes the aligned features as a tab-separated table: one row per aligned feature, with three
/// leading columns taken from <c>TestLcMsFeatureAlignment.GetMinMaxNet</c> followed by one abundance
/// column per dataset (0 when the dataset has no feature in that row).
/// </summary>
/// <param name="align">Alignment whose aligned features are written.</param>
/// <param name="outFilePath">Path of the file to create or overwrite.</param>
/// <param name="dataName">Column labels; must hold at least <c>align.CountDatasets</c> entries.</param>
private void OutputAlignmentResult(LcMsFeatureAlignment align, string outFilePath, string[] dataName)
{
    var alignedFeatureList = align.GetAlignedFeatures();

    // The using block closes the file even when a write or a feature lookup throws.
    using (var writer = new StreamWriter(outFilePath))
    {
        writer.Write("MonoMass\tMinElutionTime\tMaxElutionTime");
        for (var i = 0; i < align.CountDatasets; i++)
        {
            writer.Write("\t{0}", dataName[i]);
        }
        writer.Write("\n");

        for (var i = 0; i < align.CountAlignedFeatures; i++)
        {
            var features = alignedFeatureList[i];
            var minMaxNet = TestLcMsFeatureAlignment.GetMinMaxNet(features);

            // The leading columns are tab-separated so they line up with the three header columns.
            writer.Write("{0}\t{1:0.00000}\t{2:0.00000}", minMaxNet.Item1, minMaxNet.Item3, minMaxNet.Item4);

            for (var j = 0; j < align.CountDatasets; j++)
            {
                var feature = features[j];
                writer.Write("\t");
                writer.Write(feature != null ? feature.Abundance : 0d);
            }
            writer.Write("\n");
        }
    }
}
/// <summary>
/// Loads the features of every labelled CompRef run, attaches identifications from the matching
/// "_IcTda.tsv" file when that file exists, aligns the features across runs, fills missing features
/// and writes the resulting cross-tab.
/// </summary>
public void TestFeatureAlignment()
{
    const string outFilePath = @"\\protoapps\UserData\Jungkap\CompRef\aligned\promex_crosstab_temp.tsv";
    var runLabels = new[] { "32A", "32B", "32C", "32D", "32E", "32F", "32G", "33A", "33B", "33C", "33D", "33E", "33F", "33G" };
    var nDataset = runLabels.Length;

    var prsmReader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(10);
    var alignment = new LcMsFeatureAlignment(new CompRefFeatureComparer(tolerance));

    for (var dataIndex = 0; dataIndex < nDataset; dataIndex++)
    {
        var label = runLabels[dataIndex];
        var rawFile = string.Format(@"{0}\CPTAC_Intact_CR{1}_24Aug15_Bane_15-02-06-RZ.pbf", RawFolder, label);
        var mspFile = string.Format(@"{0}\CPTAC_Intact_CR{1}_24Aug15_Bane_15-02-06-RZ_IcTda.tsv", MsPfFolder, label);
        var ms1FtFile = string.Format(@"{0}\CPTAC_Intact_CR{1}_24Aug15_Bane_15-02-06-RZ.ms1ft", Ms1FtFolder, label);

        var run = PbfLcMsRun.GetLcMsRun(rawFile);
        var features = LcMsFeatureAlignment.LoadProMexResult(dataIndex, ms1FtFile, run);

        if (File.Exists(mspFile))
        {
            var prsmList = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);

            // The protein name doubles as the protein id.
            foreach (var prsm in prsmList)
            {
                prsm.ProteinId = prsm.ProteinName;
            }

            // Tag each feature with the PrSMs that fall strictly inside its scan range
            // and whose mass differs by less than the tolerance.
            foreach (var feature in features)
            {
                var massTol = tolerance.GetToleranceAsMz(feature.Mass);
                foreach (var prsm in prsmList)
                {
                    var scanInside = feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum;
                    if (scanInside && Math.Abs(feature.Mass - prsm.Mass) < massTol)
                    {
                        feature.ProteinSpectrumMatches.Add(prsm);
                    }
                }
            }
        }

        alignment.AddDataSet(dataIndex, features, run);
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    for (var dataIndex = 0; dataIndex < nDataset; dataIndex++)
    {
        alignment.FillMissingFeatures(dataIndex);
        Console.WriteLine("{0} has been processed", runLabels[dataIndex]);
    }

    OutputCrossTabWithId(outFilePath, alignment, runLabels);
}
/// <summary>
/// Aligns the features of all <c>NdataSet</c> datasets. For each dataset the identifications from two
/// result folders are combined and merged with <c>MergePrsm</c>, attached to the features they match,
/// and the aligned result is written as a cross-tab.
/// </summary>
public void TestFeatureAlignment()
{
    const string outFilePath = @"\\protoapps\UserData\Jungkap\Lewy\aligned\promex_crosstab_temp.tsv";
    //CPTAC_Intact_CR32A_24Aug15_Bane_15-02-06-RZ

    var prsmReader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(10);
    var alignment = new LcMsFeatureAlignment(new AnalysisCompRef.CompRefFeatureComparer(tolerance));

    for (var dataIndex = 0; dataIndex < NdataSet; dataIndex++)
    {
        var rawFile = string.Format(@"{0}\{1}.pbf", PbfPath, GetDataSetNames(dataIndex));
        var mspFile = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder, GetDataSetNames(dataIndex));
        var mspFile2 = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder2, GetDataSetNames(dataIndex));
        var ms1FtFile = string.Format(@"{0}\{1}.ms1ft", Ms1FtFolder, GetDataSetNames(dataIndex));
        Console.WriteLine(rawFile);

        var run = PbfLcMsRun.GetLcMsRun(rawFile);

        // Combine the identifications of both result folders before merging them.
        var combinedPrsms = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);
        combinedPrsms.AddRange(prsmReader.LoadIdentificationResult(mspFile2, ProteinSpectrumMatch.SearchTool.MsPathFinder));
        var prsmList = MergePrsm(combinedPrsms);

        var features = LcMsFeatureAlignment.LoadProMexResult(dataIndex, ms1FtFile, run);

        // The protein name doubles as the protein id.
        foreach (var prsm in prsmList)
        {
            prsm.ProteinId = prsm.ProteinName;
        }

        // Tag each feature with the PrSMs that fall strictly inside its scan range
        // and whose mass differs by less than the tolerance.
        foreach (var feature in features)
        {
            var massTol = tolerance.GetToleranceAsTh(feature.Mass);
            foreach (var prsm in prsmList)
            {
                var scanInside = feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum;
                if (scanInside && Math.Abs(feature.Mass - prsm.Mass) < massTol)
                {
                    feature.ProteinSpectrumMatches.Add(prsm);
                }
            }
        }

        alignment.AddDataSet(dataIndex, features, run);
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    for (var dataIndex = 0; dataIndex < NdataSet; dataIndex++)
    {
        alignment.FillMissingFeatures(dataIndex);
        Console.WriteLine("{0} has been processed", GetDataSetNames(dataIndex));
    }

    OutputCrossTabWithId(outFilePath, alignment);
}
/// <summary>
/// Aligns the features of the given datasets and writes the cross-tab to <paramref name="outFilePath"/>.
/// For every dataset the ".ms1ft" and ".seqtag.ms1ft" feature files are loaded and combined; when the
/// dataset's "_IcTda.tsv" file exists, its identifications are attached to the features they match.
/// </summary>
/// <param name="datasets">Dataset names, used to build all file names.</param>
/// <param name="mspfFolder">Folder holding the "_IcTda.tsv" identification files.</param>
/// <param name="ms1ftFolder">Folder holding the feature files.</param>
/// <param name="outFilePath">Path of the cross-tab file to write.</param>
private void AlignFeatures(List<string> datasets, string mspfFolder, string ms1ftFolder, string outFilePath)
{
    var datasetCount = datasets.Count;
    var prsmReader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(12);
    var alignment = new LcMsFeatureAlignment(new AnalysisCompRef.CompRefFeatureComparer(tolerance));

    for (var index = 0; index < datasetCount; index++)
    {
        var rawFile = string.Format(@"{0}\{1}.pbf", PbfPath, datasets[index]);
        var mspFile = string.Format(@"{0}\{1}_IcTda.tsv", mspfFolder, datasets[index]);
        var ms1FtFile = string.Format(@"{0}\{1}.ms1ft", ms1ftFolder, datasets[index]);
        var ms1FtFile2 = string.Format(@"{0}\{1}.seqtag.ms1ft", ms1ftFolder, datasets[index]);

        var run = PbfLcMsRun.GetLcMsRun(rawFile);

        // Both feature files are registered under the same dataset index.
        var features = LcMsFeatureAlignment.LoadProMexResult(index, ms1FtFile, run);
        var seqTagFeatures = LcMsFeatureAlignment.LoadProMexResult(index, ms1FtFile2, run);
        features.AddRange(seqTagFeatures);

        if (File.Exists(mspFile))
        {
            var prsmList = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);

            // The protein name doubles as the protein id.
            for (var p = 0; p < prsmList.Count; p++)
            {
                prsmList[p].ProteinId = prsmList[p].ProteinName;
            }

            // Tag features by PrSMs.
            foreach (var feature in features)
            {
                var massTol = tolerance.GetToleranceAsMz(feature.Mass);
                foreach (var prsm in prsmList)
                {
                    // Skip PrSMs outside the feature's (exclusive) scan range.
                    if (!(feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum))
                    {
                        continue;
                    }

                    // Skip PrSMs whose mass is not within the tolerance.
                    if (!(Math.Abs(feature.Mass - prsm.Mass) < massTol))
                    {
                        continue;
                    }

                    feature.ProteinSpectrumMatches.Add(prsm);
                }
            }
        }

        alignment.AddDataSet(index, features, run);
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    for (var index = 0; index < datasetCount; index++)
    {
        alignment.FillMissingFeatures(index);
        Console.WriteLine("{0} has been processed", datasets[index]);
    }

    AnalysisCompRef.OutputCrossTabWithId(outFilePath, alignment, datasets);
}
/// <summary>
/// Aligns the features of all <c>NdataSet</c> datasets. For each dataset the identifications from two
/// result folders are combined and merged with <c>MergePrsm</c>, attached to the features they match,
/// and the aligned result is written as a cross-tab.
/// </summary>
public void TestFeatureAlignment()
{
    const string outFilePath = @"\\protoapps\UserData\Jungkap\Lewy\aligned\promex_crosstab_temp.tsv";
    //CPTAC_Intact_CR32A_24Aug15_Bane_15-02-06-RZ

    var prsmReader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(10);
    var alignment = new LcMsFeatureAlignment(new AnalysisCompRef.CompRefFeatureComparer(tolerance));

    for (var dataIndex = 0; dataIndex < NdataSet; dataIndex++)
    {
        var rawFile = string.Format(@"{0}\{1}.pbf", PbfPath, GetDataSetNames(dataIndex));
        var mspFile = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder, GetDataSetNames(dataIndex));
        var mspFile2 = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder2, GetDataSetNames(dataIndex));
        var ms1FtFile = string.Format(@"{0}\{1}.ms1ft", Ms1FtFolder, GetDataSetNames(dataIndex));
        Console.WriteLine(rawFile);

        var run = PbfLcMsRun.GetLcMsRun(rawFile);

        // Combine the identifications of both result folders before merging them.
        var combinedPrsms = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);
        combinedPrsms.AddRange(prsmReader.LoadIdentificationResult(mspFile2, ProteinSpectrumMatch.SearchTool.MsPathFinder));
        var prsmList = MergePrsm(combinedPrsms);

        var features = LcMsFeatureAlignment.LoadProMexResult(dataIndex, ms1FtFile, run);

        // The protein name doubles as the protein id.
        foreach (var prsm in prsmList)
        {
            prsm.ProteinId = prsm.ProteinName;
        }

        // Tag each feature with the PrSMs that fall strictly inside its scan range
        // and whose mass differs by less than the tolerance.
        foreach (var feature in features)
        {
            var massTol = tolerance.GetToleranceAsTh(feature.Mass);
            foreach (var prsm in prsmList)
            {
                var scanInside = feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum;
                if (scanInside && Math.Abs(feature.Mass - prsm.Mass) < massTol)
                {
                    feature.ProteinSpectrumMatches.Add(prsm);
                }
            }
        }

        alignment.AddDataSet(dataIndex, features, run);
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    for (var dataIndex = 0; dataIndex < NdataSet; dataIndex++)
    {
        alignment.FillMissingFeatures(dataIndex);
        Console.WriteLine("{0} has been processed", GetDataSetNames(dataIndex));
    }

    OutputCrossTabWithId(outFilePath, alignment);
}
/// <summary>
/// Aligns the features of the spike-in datasets using a <c>SpikeInFeatureComparer</c>, attaches the
/// identifications of each dataset to the features they match, and writes the cross-tab.
/// Missing features are not filled in; that step is commented out below.
/// </summary>
public void TestFeatureAlignment()
{
    const string outFilePath = @"\\protoapps\UserData\Jungkap\Quant\aligned\promex_crosstab.tsv";
    //const string outFolder = @"\\protoapps\UserData\Jungkap\CompRef\aligned";

    var runLabels = new[]
    {
        "1x1", "1x2", "1x3", "1x4", "1x5",
        "5x1", "5x2", "5x3", "5x4", "5x5",
        "10x1", "10x2", "10x3", "10x4", "10x5",
    };
    var nDataset = runLabels.Length;

    var prsmReader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(10);
    var alignment = new LcMsFeatureAlignment(new SpikeInFeatureComparer(tolerance));

    for (var dataIndex = 0; dataIndex < nDataset; dataIndex++)
    {
        // File names come from the externally defined datasets collection, not from runLabels.
        var rawFile = string.Format(@"{0}\{1}.pbf", RawFolder, datasets[dataIndex]);
        var mspFile = string.Format(@"{0}\{1}_IcTda.tsv", MsPfFolder, datasets[dataIndex]);
        var ms1FtFile = string.Format(@"{0}\{1}.ms1ft", Ms1FtFolder, datasets[dataIndex]);

        var run = PbfLcMsRun.GetLcMsRun(rawFile);
        var prsmList = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);
        var features = LcMsFeatureAlignment.LoadProMexResult(dataIndex, ms1FtFile, run);

        // The protein name doubles as the protein id.
        foreach (var prsm in prsmList)
        {
            prsm.ProteinId = prsm.ProteinName;
        }

        // Tag each feature with the PrSMs that fall strictly inside its scan range
        // and whose mass differs by less than the tolerance.
        foreach (var feature in features)
        {
            var massTol = tolerance.GetToleranceAsTh(feature.Mass);
            foreach (var prsm in prsmList)
            {
                var scanInside = feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum;
                if (scanInside && Math.Abs(feature.Mass - prsm.Mass) < massTol)
                {
                    feature.ProteinSpectrumMatches.Add(prsm);
                }
            }
        }

        alignment.AddDataSet(dataIndex, features, run);
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    /*
     * for (var i = 0; i < nDataset; i++)
     * {
     *     alignment.FillMissingFeatures(i);
     *     Console.WriteLine("{0} has been processed", runLabels[i]);
     * }
     */

    OutputCrossTabWithId(outFilePath, alignment, runLabels);
}
/// <summary>
/// For every spike-in dataset whose raw and feature files both exist, sums the peak intensities of
/// all spectra returned by <c>GetMs1ScanVector()</c> and <c>GetScanNumbers(2)</c> and prints the
/// dataset index with that total. The alignment steps themselves are commented out.
/// </summary>
public void TestCptacSpikeIn()
{
    const string featureFolder = @"D:\MassSpecFiles\CPTAC_spike_in\promex";
    const string rawFolder = @"D:\MassSpecFiles\CPTAC_spike_in\raw";
    var outFilePath = string.Format(@"{0}\aligned_features.tsv", featureFolder);
    var align = new LcMsFeatureAlignment(new LcMsFeatureAlignComparer(new Tolerance(10)));

    for (var datasetIndex = 0; datasetIndex < spikeDatasets.Length; datasetIndex++)
    {
        var featureFilePath = string.Format(@"{0}\{1}.ms1ft", featureFolder, spikeDatasets[datasetIndex]);
        var rawFile = string.Format(@"{0}\{1}.pbf", rawFolder, spikeDatasets[datasetIndex]);

        // The raw file is checked first; the feature file is only checked when the raw file exists.
        string missingFile = null;
        if (!File.Exists(rawFile))
        {
            missingFile = rawFile;
        }
        else if (!File.Exists(featureFilePath))
        {
            missingFile = featureFilePath;
        }

        if (missingFile != null)
        {
            Console.WriteLine(@"Warning: Skipping file not found: {0}", missingFile);
            continue;
        }

        var run = PbfLcMsRun.GetLcMsRun(rawFile);
        var totalIntensity = 0d;

        foreach (var scanNum in run.GetMs1ScanVector())
        {
            totalIntensity += run.GetSpectrum(scanNum).Peaks.Sum(p => p.Intensity);
        }

        foreach (var scanNum in run.GetScanNumbers(2))
        {
            totalIntensity += run.GetSpectrum(scanNum).Peaks.Sum(p => p.Intensity);
        }

        Console.WriteLine("{0}\t{1}", datasetIndex, totalIntensity);

        //var features = LcMsFeatureAlignment.LoadProMexResult(i, featureFilePath, run);
        //align.AddDataSet(i, features, run);
    }

    //align.AlignFeatures();
    //Console.WriteLine("# of aligned features = {0}", align.CountAlignedFeatures);
    //align.RefineAbundance();
    //OutputAlignmentResult(align, outFilePath, spikeDatasets);
}
/// <summary>
/// For every spike-in dataset whose raw and feature files both exist, sums the peak intensities of
/// all spectra returned by <c>GetMs1ScanVector()</c> and <c>GetScanNumbers(2)</c> and prints the
/// dataset index with that total. The alignment steps themselves are commented out.
/// </summary>
public void TestCptacSpikeIn()
{
    const string featureFolder = @"D:\MassSpecFiles\CPTAC_spike_in\promex";
    const string rawFolder = @"D:\MassSpecFiles\CPTAC_spike_in\raw";
    var outFilePath = string.Format(@"{0}\aligned_features.tsv", featureFolder);
    var align = new LcMsFeatureAlignment(new LcMsFeatureAlignComparer(new Tolerance(10)));

    for (var datasetIndex = 0; datasetIndex < spikeDatasets.Length; datasetIndex++)
    {
        var featureFilePath = string.Format(@"{0}\{1}.ms1ft", featureFolder, spikeDatasets[datasetIndex]);
        var rawFile = string.Format(@"{0}\{1}.pbf", rawFolder, spikeDatasets[datasetIndex]);

        // The raw file is checked first; the feature file is only checked when the raw file exists.
        string missingFile = null;
        if (!File.Exists(rawFile))
        {
            missingFile = rawFile;
        }
        else if (!File.Exists(featureFilePath))
        {
            missingFile = featureFilePath;
        }

        if (missingFile != null)
        {
            Console.WriteLine(@"Warning: Skipping file not found: {0}", missingFile);
            continue;
        }

        var run = PbfLcMsRun.GetLcMsRun(rawFile);
        var totalIntensity = 0d;

        foreach (var scanNum in run.GetMs1ScanVector())
        {
            totalIntensity += run.GetSpectrum(scanNum).Peaks.Sum(p => p.Intensity);
        }

        foreach (var scanNum in run.GetScanNumbers(2))
        {
            totalIntensity += run.GetSpectrum(scanNum).Peaks.Sum(p => p.Intensity);
        }

        Console.WriteLine("{0}\t{1}", datasetIndex, totalIntensity);

        //var features = LcMsFeatureAlignment.LoadProMexResult(i, featureFilePath, run);
        //align.AddDataSet(i, features, run);
    }

    //align.AlignFeatures();
    //Console.WriteLine("# of aligned features = {0}", align.CountAlignedFeatures);
    //align.RefineAbundance();
    //OutputAlignmentResult(align, outFilePath, spikeDatasets);
}
/// <summary>
/// Clusters the features of two datasets twice - once through <c>PromexClusterer</c> and once directly
/// through <c>LcMsFeatureAlignment</c> - and asserts that both produce the same number of clusters and
/// the same number of clusters with more than one member.
/// </summary>
public void CompareClustering()
{
    // Path 1: cluster through the MultiAlign-to-Promex adapters.
    var provider = new ScanSummaryProviderCache();
    var reader1 = provider.GetScanSummaryProvider(pbf1, 0) as InformedProteomicsReader;
    var reader2 = provider.GetScanSummaryProvider(pbf2, 1) as InformedProteomicsReader;

    var features1 = new PromexFileReader(reader1, 0).ReadFile(ms1ft1);
    var features2 = new PromexFileReader(reader2, 1).ReadFile(ms1ft2);

    var features = new List<UMCLight>(features1);
    features.AddRange(features2);

    var clusterer = new PromexClusterer { Readers = provider };
    var clusters = clusterer.Cluster(features);
    var clusterCount = clusters.Count(c => c.UmcList.Count > 1);

    // Path 2: cluster using only ProMex.
    var lcmsRun1 = PbfLcMsRun.GetLcMsRun(pbf1);
    var lcmsRun2 = PbfLcMsRun.GetLcMsRun(pbf2);
    var comparer = new LcMsFeatureAlignComparer(new Tolerance(10, ToleranceUnit.Ppm));
    var aligner = new LcMsFeatureAlignment(comparer);

    var promexFeatures1 = LcMsFeatureAlignment.LoadProMexResult(0, ms1ft1, lcmsRun1);
    aligner.AddDataSet(0, promexFeatures1, lcmsRun1);

    var promexFeatures2 = LcMsFeatureAlignment.LoadProMexResult(1, ms1ft2, lcmsRun2);
    aligner.AddDataSet(1, promexFeatures2, lcmsRun2);

    aligner.AlignFeatures();
    var promexClusters = aligner.GetAlignedFeatures();

    // A cluster counts when at least two of its slots hold a feature.
    var promexClusterCount = promexClusters.Count(c => c.Where(f => f != null).Skip(1).Any());

    Assert.AreEqual(clusters.Count, promexClusters.Count);
    Assert.AreEqual(clusterCount, promexClusterCount);
}
/// <summary>
/// Opens a <c>PbfLcMsRun</c> for every raw file, aligns the features of the given feature files and
/// writes the result twice: first to "<paramref name="outFilePath"/>.tmp" before
/// <c>RefineAbundance()</c> is called, then to <paramref name="outFilePath"/> afterwards.
/// </summary>
/// <param name="ms1FtFiles">Feature files handed to the alignment.</param>
/// <param name="rawFiles">Raw files to open; also passed to the output routine.</param>
/// <param name="outFilePath">Path of the final output file.</param>
private void RunFeatureAlignment(IList<string> ms1FtFiles, IReadOnlyList<string> rawFiles, string outFilePath)
{
    var runList = new List<LcMsRun>();
    for (var fileIndex = 0; fileIndex < rawFiles.Count; fileIndex++)
    {
        runList.Add(new PbfLcMsRun(rawFiles[fileIndex]));
    }

    var comparer = new LcMsFeatureAlignComparer(new Tolerance(10));
    var align = new LcMsFeatureAlignment(ms1FtFiles, runList, comparer);
    align.AlignFeatures();

    Console.WriteLine("# of aligned features = {0}", align.CountAlignedFeatures);

    // Snapshot before abundance refinement (last argument true), final result after it (false).
    OutputAlignmentResult(align, outFilePath + ".tmp", rawFiles, true);

    align.RefineAbundance();
    OutputAlignmentResult(align, outFilePath, rawFiles, false);
}
/// <summary>
/// Aligns the "_isos.tsv" feature files of the ten CPTAC replicate datasets and writes the aligned
/// features to "aligned_features.tsv" in the feature folder. A dataset is skipped with a warning when
/// its raw file or its feature file does not exist.
/// </summary>
public void TestCptac10Replicates()
{
    const string featureFolder = @"D:\MassSpecFiles\CPTAC_rep10\icr2ls";
    const string rawFolder = @"\\proto-11\MSXML_Cache\PBF_Gen_1_193\2015_1";
    var outFilePath = string.Format(@"{0}\aligned_features.tsv", featureFolder);

    var align = new LcMsFeatureAlignment(new LcMsFeatureAlignComparer(new Tolerance(10)));
    var dataNames = new string[10];

    for (var replicate = 0; replicate < 10; replicate++)
    {
        // Replicates are numbered from 1 in the dataset names.
        var dataName = string.Format(@"CPTAC_Intact_rep{0}_15Jan15_Bane_C2-14-08-02RZ", replicate + 1);
        dataNames[replicate] = dataName;

        var featureFilePath = string.Format(@"{0}\{1}_isos.tsv", featureFolder, dataName);
        var rawFile = string.Format(@"{0}\{1}.pbf", rawFolder, dataName);

        // The raw file is checked first; the feature file is only checked when the raw file exists.
        string missingFile = null;
        if (!File.Exists(rawFile))
        {
            missingFile = rawFile;
        }
        else if (!File.Exists(featureFilePath))
        {
            missingFile = featureFilePath;
        }

        if (missingFile != null)
        {
            Console.WriteLine(@"Warning: Skipping file not found: {0}", missingFile);
            continue;
        }

        var run = PbfLcMsRun.GetLcMsRun(rawFile);
        var features = LcMsFeatureAlignment.LoadProMexResult(replicate, featureFilePath, run);
        align.AddDataSet(replicate, features, run);
    }

    align.AlignFeatures();
    Console.WriteLine("# of aligned features = {0}", align.CountAlignedFeatures);

    //var tempOutPath = outFilePath + ".tmp";
    //OutputAlignmentResult(align, tempOutPath, rawFiles, true);
    //align.RefineAbundance();
    OutputAlignmentResult(align, outFilePath, dataNames);
}
/// <summary>
/// Reads the same feature file through <c>PromexFileReader</c> and through
/// <c>LcMsFeatureAlignment.LoadProMexResult</c>, then asserts that both give the same number of
/// features and that mass, NET, scan range and abundance agree feature by feature.
/// </summary>
public void CompareFileReading()
{
    // Read using MultiAlign to Promex adapters
    var provider = new ScanSummaryProviderCache();
    var adapterReader = provider.GetScanSummaryProvider(pbf1, 0) as InformedProteomicsReader;
    var features = new PromexFileReader(adapterReader, 0).ReadFile(ms1ft1).ToList();

    // Read directly with ProMex
    var lcmsRun = PbfLcMsRun.GetLcMsRun(pbf1);
    var promexFeatures = LcMsFeatureAlignment.LoadProMexResult(0, ms1ft1, lcmsRun).ToList();

    Assert.AreEqual(features.Count, promexFeatures.Count);

    for (var index = 0; index < features.Count; index++)
    {
        var adapted = features[index];
        var direct = promexFeatures[index];

        Assert.AreEqual(adapted.MassMonoisotopic, direct.Mass);
        ////Assert.AreEqual(features[i].Mz, promexFeatures[i].RepresentativeMz);
        Assert.AreEqual(adapted.Net, direct.Net);
        Assert.AreEqual(adapted.ScanStart, direct.MinScanNum);
        Assert.AreEqual(adapted.ScanEnd, direct.MaxScanNum);
        Assert.AreEqual(adapted.Abundance, direct.Abundance);
    }
}
/// <summary>
/// For every Study3 dataset that has no ".seqtag.ms1ft" file yet, collects the identifications that
/// match none of the dataset's features and passes them to <c>FeatureFind</c>, which is given the
/// ".seqtag.ms1ft" path as its output path.
/// </summary>
public void FindMissingLcMsFeatures()
{
    var mspfFolder = @"D:\MassSpecFiles\CompRef_Kelleher\Study3";
    var ms1ftFolder = @"D:\MassSpecFiles\CompRef_Kelleher\Study3";

    const int Nfraction1 = 3;
    const int Nfraction2 = 5;

    for (var frac1 = 1; frac1 <= Nfraction1; frac1++)
    {
        for (var frac2 = 1; frac2 <= Nfraction2; frac2++)
        {
            var datasets = GetDataSetNamesStudy3(frac1, frac2);
            //var outFilePath = string.Format(@"D:\MassSpecFiles\CompRef_Kelleher\study3_GFrep{0}_Gfrac{1}.tsv", frac1.ToString("D2"), frac2.ToString("D2"));
            var nDataset = datasets.Count;
            var prsmReader = new ProteinSpectrumMatchReader();
            var tolerance = new Tolerance(12);

            for (var dataIndex = 0; dataIndex < nDataset; dataIndex++)
            {
                var rawFile = string.Format(@"{0}\{1}.pbf", PbfPath, datasets[dataIndex]);
                var mspFile = string.Format(@"{0}\{1}_IcTda.tsv", mspfFolder, datasets[dataIndex]);
                var ms1FtFile = string.Format(@"{0}\{1}.ms1ft", ms1ftFolder, datasets[dataIndex]);
                var outPath = string.Format(@"{0}\{1}.seqtag.ms1ft", ms1ftFolder, datasets[dataIndex]);

                // Datasets that already have an output file are left alone.
                if (File.Exists(outPath))
                {
                    continue;
                }

                var run = PbfLcMsRun.GetLcMsRun(rawFile);
                var features = LcMsFeatureAlignment.LoadProMexResult(dataIndex, ms1FtFile, run);
                var prsmList = prsmReader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);

                // matched[k] becomes true once PrSM k has been attached to at least one feature.
                var matched = new bool[prsmList.Count];

                foreach (var feature in features)
                {
                    var massTol = tolerance.GetToleranceAsTh(feature.Mass);
                    for (var k = 0; k < prsmList.Count; k++)
                    {
                        var prsm = prsmList[k];
                        var scanInside = feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum;
                        if (scanInside && Math.Abs(feature.Mass - prsm.Mass) < massTol)
                        {
                            feature.ProteinSpectrumMatches.Add(prsm);
                            matched[k] = true;
                        }
                    }
                }

                // Keep only the PrSMs no feature accounted for.
                var missingPrsm = new List<ProteinSpectrumMatch>();
                for (var k = 0; k < matched.Length; k++)
                {
                    if (matched[k])
                    {
                        continue;
                    }
                    missingPrsm.Add(prsmList[k]);
                }

                FeatureFind(missingPrsm, run, outPath);
                Console.WriteLine(outPath);
            }
        }
    }
}
/// <summary>
/// Clusters features across datasets with <c>LcMsFeatureAlignment</c> and converts the aligned
/// features back into <c>UMCClusterLight</c> objects.
/// </summary>
/// <param name="data">Features to cluster; they are grouped into datasets by <c>GroupId</c>.</param>
/// <param name="progress">Optional progress reporter, forwarded to <c>RefineAbundance</c>.</param>
/// <returns>
/// One cluster per aligned feature row that holds at least one feature; an empty list when
/// <paramref name="data"/> is empty.
/// </returns>
public List<UMCClusterLight> Cluster(List<UMCLight> data, IProgress<ProgressData> progress = null)
{
    progress = progress ?? new Progress<ProgressData>();

    if (data.Count == 0)
    {
        return new List<UMCClusterLight>();
    }

    this.maxFeatureId = data.Select(d => d.Id).Max();

    // Index every feature by (dataset, feature id).
    this.featureMap = new Dictionary<Tuple<int, int>, UMCLight>();
    foreach (var feature in data)
    {
        var key = new Tuple<int, int>(feature.GroupId, feature.Id);
        this.featureMap.Add(key, feature);
    }

    var lcmsFeatureAligner = new LcMsFeatureAlignment(new LcMsFeatureAlignComparer(new Tolerance(10, ToleranceUnit.Ppm)));

    // Group features by dataset
    var idToFeatures = new Dictionary<int, List<UMCLight>>();
    foreach (var umcLight in data)
    {
        List<UMCLight> datasetFeatures;
        if (!idToFeatures.TryGetValue(umcLight.GroupId, out datasetFeatures))
        {
            datasetFeatures = new List<UMCLight>();
            idToFeatures.Add(umcLight.GroupId, datasetFeatures);
        }

        datasetFeatures.Add(umcLight);
    }

    // Convert UMCLights to InformedProteomics LcMsFeatures
    foreach (var ds in idToFeatures)
    {
        var lcmsFeatures = new List<LcMsFeature>(ds.Value.Select(this.GetLcMsFeature));
        lcmsFeatureAligner.AddDataSet(ds.Key, lcmsFeatures, this.GetLcMsRun(ds.Key));
    }

    // Perform clustering
    lcmsFeatureAligner.AlignFeatures();

    // Fill in missing features using noise.
    lcmsFeatureAligner.RefineAbundance(-30, progress);

    var clusteredFeatures = lcmsFeatureAligner.GetAlignedFeatures();

    // Convert InformedProteomics clusters to UMCClusterLight
    int clustId = 0;
    var clusters = new List<UMCClusterLight>();
    foreach (var cluster in clusteredFeatures)
    {
        var firstFeature = cluster.FirstOrDefault(f => f != null);
        if (firstFeature == null)
        {
            continue;
        }

        var umcCluster = new UMCClusterLight { Id = clustId++, };

        // Promex doesn't keep track of which dataset noise features belong to, so we need to.
        // The counter advances for every slot, including empty ones, so a feature that follows
        // a null slot still receives the id of its own position.
        // NOTE(review): assumes slot order corresponds to dataset ids starting at 0 - confirm.
        int datasetId = 0;
        foreach (var feature in cluster)
        {
            var slotDatasetId = datasetId++;
            if (feature == null)
            {
                continue;
            }

            feature.DataSetId = slotDatasetId;
            var umc = this.GetUMC(feature);
            umcCluster.AddChildFeature(umc);
            umc.SetParentFeature(umcCluster);
        }

        umcCluster.CalculateStatistics(ClusterCentroidRepresentation.Median);
        clusters.Add(umcCluster);
    }

    return clusters;
}
/// <summary>
/// Writes the aligned features as a tab-separated cross-tab. Each row holds the median mass and the
/// median min/max elution time of the row's features, one abundance and one score column per dataset,
/// ten identification columns taken from the first feature that has a PrSM (a single space each when
/// there is none), and one spectral-count column per dataset.
/// </summary>
/// <param name="outputFilePath">Path of the file to create or overwrite.</param>
/// <param name="alignment">Alignment whose aligned features are written.</param>
/// <param name="runLabels">One label per dataset; used as column-name prefixes.</param>
public void OutputCrossTabWithId(string outputFilePath, LcMsFeatureAlignment alignment, string[] runLabels)
{
    var nDataset = runLabels.Length;

    // The using block closes the file even when a write or a feature lookup throws.
    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.Write("MonoMass");
        writer.Write("\t");
        writer.Write("MinElutionTime");
        writer.Write("\t");
        writer.Write("MaxElutionTime");

        foreach (var dataName in runLabels)
        {
            writer.Write("\t");
            writer.Write(dataName + "_Abundance");
        }

        foreach (var dataName in runLabels)
        {
            writer.Write("\t");
            writer.Write(dataName + "_Ms1Score");
        }

        var idColumns = new[]
        {
            "Pre", "Sequence", "Post", "Modifications", "SequenceText",
            "ProteinName", "ProteinDesc", "ProteinLength", "Start", "End"
        };
        foreach (var idColumn in idColumns)
        {
            writer.Write("\t");
            writer.Write(idColumn);
        }

        foreach (var dataName in runLabels)
        {
            writer.Write("\t");
            writer.Write(dataName + "_SpectraCount");
        }
        writer.Write("\n");

        var alignedFeatureList = alignment.GetAlignedFeatures();
        for (var j = 0; j < alignedFeatureList.Count; j++)
        {
            var features = alignedFeatureList[j];

            // Filter once; the three medians below all work on the same non-null features.
            var presentFeatures = features.Where(f => f != null).ToList();
            var mass = presentFeatures.Select(f => f.Mass).Median();
            var minElutionTime = presentFeatures.Select(f => f.MinElutionTime).Median();
            var maxElutionTime = presentFeatures.Select(f => f.MaxElutionTime).Median();

            writer.Write(mass);
            writer.Write("\t");
            writer.Write(minElutionTime);
            writer.Write("\t");
            writer.Write(maxElutionTime);

            for (var i = 0; i < nDataset; i++)
            {
                writer.Write("\t");
                writer.Write(features[i] == null ? 0 : features[i].Abundance);
            }

            for (var i = 0; i < nDataset; i++)
            {
                writer.Write("\t");
                writer.Write(features[i] == null ? 0 : features[i].Score);
            }

            var prsm = (from f in features
                        where f != null && f.ProteinSpectrumMatches != null && f.ProteinSpectrumMatches.Count > 0
                        select f.ProteinSpectrumMatches[0]).FirstOrDefault();

            if (prsm == null)
            {
                // Keep the column count stable when no identification is available.
                for (var k = 0; k < 10; k++)
                {
                    writer.Write("\t");
                    writer.Write(" ");
                }
            }
            else
            {
                writer.Write("\t");
                writer.Write(prsm.Pre);
                writer.Write("\t");
                writer.Write(prsm.Sequence);
                writer.Write("\t");
                writer.Write(prsm.Post);
                writer.Write("\t");
                writer.Write(prsm.Modifications);
                writer.Write("\t");
                writer.Write(prsm.SequenceText);
                writer.Write("\t");
                writer.Write(prsm.ProteinName);
                writer.Write("\t");
                writer.Write(prsm.ProteinDesc);
                writer.Write("\t");
                writer.Write(prsm.ProteinLength);
                writer.Write("\t");
                writer.Write(prsm.FirstResidue);
                writer.Write("\t");
                writer.Write(prsm.LastResidue);
            }

            // spectral count from ms2
            for (var i = 0; i < nDataset; i++)
            {
                var feature = features[i];

                // Same null guard as the PrSM query above: a feature without a match set counts as 0.
                var spectraCount = (feature == null || feature.ProteinSpectrumMatches == null)
                    ? 0
                    : feature.ProteinSpectrumMatches.Count;

                writer.Write("\t");
                writer.Write(spectraCount);
            }
            writer.Write("\n");
        }
    }
}
/// <summary>
/// Loads the features of <paramref name="fileLocation"/> and converts each one into a
/// <c>UMCLight</c>. Every UMC receives two child <c>MSFeatureLight</c> entries per charge state in
/// [MinCharge, MaxCharge]: one at the feature's first scan / minimum NET and one at its last scan /
/// maximum NET.
/// </summary>
/// <param name="fileLocation">Path of the feature file to read.</param>
/// <returns>The converted features, in file order.</returns>
public IEnumerable<UMCLight> ReadFile(string fileLocation)
{
    var features = LcMsFeatureAlignment.LoadProMexResult(this.datasetId, fileLocation, this.reader.LcMsRun);
    var umcLights = new List<UMCLight>(features.Count);

    int nextUmcId = 0;
    int nextMsFeatureId = 0;

    foreach (var feature in features)
    {
        // Integer midpoint of the charge range; the m/z below is derived from it.
        var chargeState = (feature.MinCharge + feature.MaxCharge) / 2;
        var mz = (feature.Mass + (chargeState * Constants.Proton)) / chargeState;

        // Parent feature
        var umcLight = new UMCLight
        {
            Id = nextUmcId++,
            GroupId = this.datasetId,
            ScanStart = feature.MinScanNum,
            ScanEnd = feature.MaxScanNum,
            Abundance = feature.Abundance,
            AbundanceSum = feature.Abundance,
            ChargeState = chargeState,
            MinCharge = feature.MinCharge,
            MaxCharge = feature.MaxCharge,
            Net = feature.Net,
            NetAligned = feature.Net,
            NetStart = feature.MinNet,
            NetEnd = feature.MaxNet,
            MassMonoisotopic = feature.Mass,
            MassMonoisotopicAligned = feature.Mass,
            Mz = mz
        };

        for (int charge = feature.MinCharge; charge <= feature.MaxCharge; charge++)
        {
            // Child at the start of the elution range.
            var minPoint = new MSFeatureLight
            {
                Id = nextMsFeatureId++,
                GroupId = this.datasetId,
                Scan = feature.MinScanNum,
                Abundance = feature.Abundance,
                ChargeState = charge,
                Net = feature.MinNet,
                MassMonoisotopic = feature.Mass,
                Mz = mz
            };
            umcLight.AddChildFeature(minPoint);

            // Child at the end of the elution range.
            var maxPoint = new MSFeatureLight
            {
                Id = nextMsFeatureId++,
                GroupId = this.datasetId,
                Scan = feature.MaxScanNum,
                Abundance = feature.Abundance,
                ChargeState = charge,
                Net = feature.MaxNet,
                MassMonoisotopic = feature.Mass,
                Mz = mz
            };
            umcLight.AddChildFeature(maxPoint);
        }

        //umcLight.CalculateStatistics(ClusterCentroidRepresentation.Median);
        umcLights.Add(umcLight);
    }

    return umcLights;
}
/// <summary>
/// Entry point. Reads the dataset info file named by the first argument, loads the features (and,
/// when available, the identifications) of every dataset whose files exist, aligns them and writes
/// "&lt;input name&gt;_crosstab.tsv" next to the input file.
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the dataset info file path.</param>
public static void Main(string[] args)
{
    if (args.Length == 0)
    {
        ShowSyntax();
        return;
    }

    // Parse file
    var inputFilePath = args[0];
    if (!File.Exists(inputFilePath))
    {
        ConsoleMsgUtils.ShowError("File not found: " + inputFilePath);
        return;
    }

    var datasets = DatasetInfo.ParseDatasetInfoFile(inputFilePath);
    if (datasets.Count == 0)
    {
        ConsoleMsgUtils.ShowError("No valid data found in the dataset info file");
        ShowSyntax();
        return;
    }

    var fileName = Path.GetFileNameWithoutExtension(inputFilePath);
    var directory = Path.GetDirectoryName(inputFilePath);
    var crosstabFilename = string.Format("{0}_crosstab.tsv", fileName);

    string outputfilePath;
    if (string.IsNullOrWhiteSpace(directory))
    {
        outputfilePath = crosstabFilename;
    }
    else
    {
        outputfilePath = Path.Combine(directory, crosstabFilename);
    }

    var nDataset = datasets.Count;
    var prsmReader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(100);
    var alignment = new LcMsFeatureAlignment(new CompRefFeatureComparer(tolerance));

    // isLoaded[i] records whether dataset i was actually added to the alignment, so that
    // warnings and output labels refer to the right datasets when some are skipped.
    var isLoaded = new bool[nDataset];
    var loadedCount = 0;

    for (var datasetIndex = 0; datasetIndex < nDataset; datasetIndex++)
    {
        var dataset = datasets[datasetIndex];

        if (!File.Exists(dataset.RawFilePath))
        {
            ConsoleMsgUtils.ShowError("Instrument file not found: " + dataset.RawFilePath);
            continue;
        }

        if (!File.Exists(dataset.Ms1FtFilePath))
        {
            ConsoleMsgUtils.ShowError("ProMex results file not found: " + dataset.Ms1FtFilePath);
            continue;
        }

        // Ids are assigned densely in load order, and the same id is used both for loading
        // the features and for registering the dataset.
        var dataId = loadedCount;

        Console.WriteLine("Opening " + dataset.RawFilePath);
        var run = PbfLcMsRun.GetLcMsRun(dataset.RawFilePath, 0, 0);

        Console.WriteLine("Opening " + dataset.Ms1FtFilePath);
        var features = LcMsFeatureAlignment.LoadProMexResult(dataId, dataset.Ms1FtFilePath, run);

        if (!string.IsNullOrWhiteSpace(dataset.MsPfIdFilePath) && File.Exists(dataset.MsPfIdFilePath))
        {
            Console.WriteLine("Opening " + dataset.MsPfIdFilePath);
            var prsmList = prsmReader.LoadIdentificationResult(dataset.MsPfIdFilePath, ProteinSpectrumMatch.SearchTool.MsPathFinder);

            foreach (var match in prsmList)
            {
                match.ProteinId = match.ProteinName;
            }

            // tag features by PrSMs
            foreach (var feature in features)
            {
                var massTol = tolerance.GetToleranceAsMz(feature.Mass);
                foreach (var match in prsmList)
                {
                    if (feature.MinScanNum < match.ScanNum && match.ScanNum < feature.MaxScanNum && Math.Abs(feature.Mass - match.Mass) < massTol)
                    {
                        feature.ProteinSpectrumMatches.Add(match);
                    }
                }
            }
        }

        alignment.AddDataSet(dataId, features, run);
        isLoaded[datasetIndex] = true;
        loadedCount++;
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    // Labels of the datasets that are part of the alignment, in alignment order.
    var alignedLabels = new string[loadedCount];
    var validResults = 0;

    for (var datasetIndex = 0; datasetIndex < nDataset; datasetIndex++)
    {
        if (!isLoaded[datasetIndex])
        {
            ConsoleMsgUtils.ShowWarning(string.Format("Could not align {0}; features not found", datasets[datasetIndex].Label));
            continue;
        }

        // validResults is also the alignment id of this dataset.
        alignment.FillMissingFeatures(validResults);
        Console.WriteLine("{0} has been processed", datasets[datasetIndex].Label);
        alignedLabels[validResults] = datasets[datasetIndex].Label;
        validResults++;
    }

    if (validResults > 0)
    {
        OutputCrossTabWithId(outputfilePath, alignment, alignedLabels);
    }
}
/// <summary>
/// Entry point. Reads the dataset info file named by the first argument, loads the features (and,
/// when the identification file exists, the identifications) of every dataset, aligns them and
/// writes "&lt;input name&gt;_crosstab.tsv" next to the input file.
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the dataset info file path.</param>
public static void Main(string[] args)
{
    // Without an input file there is nothing to do; report it instead of failing on args[0].
    if (args == null || args.Length == 0)
    {
        Console.Error.WriteLine("Missing argument: path to the dataset info file");
        return;
    }

    // Parse file
    var inputFilePath = args[0];
    var datasets = DatasetInfo.ParseDatasetInfoFile(inputFilePath);

    var fileName = Path.GetFileNameWithoutExtension(inputFilePath);
    var directory = Path.GetDirectoryName(inputFilePath);
    var crosstabFilename = string.Format("{0}_crosstab.tsv", fileName);

    // Path.Combine rejects a null directory, so fall back to the bare file name.
    var outputfilePath = string.IsNullOrEmpty(directory)
        ? crosstabFilename
        : Path.Combine(directory, crosstabFilename);

    int nDataset = datasets.Count;
    var prsmReader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(100);
    var alignment = new LcMsFeatureAlignment(new CompRefFeatureComparer(tolerance));

    // The same id is used for loading the features and for registering the dataset.
    for (var dataId = 0; dataId < nDataset; dataId++)
    {
        var dataset = datasets[dataId];
        var run = PbfLcMsRun.GetLcMsRun(dataset.RawFilePath, 0, 0);
        var features = LcMsFeatureAlignment.LoadProMexResult(dataId, dataset.Ms1FtFilePath, run);

        if (File.Exists(dataset.MsPfIdFilePath))
        {
            var prsmList = prsmReader.LoadIdentificationResult(dataset.MsPfIdFilePath, ProteinSpectrumMatch.SearchTool.MsPathFinder);

            foreach (var match in prsmList)
            {
                match.ProteinId = match.ProteinName;
            }

            // tag features by PrSMs
            foreach (LcMsFeature feature in features)
            {
                var massTol = tolerance.GetToleranceAsMz(feature.Mass);
                foreach (var match in prsmList)
                {
                    if (feature.MinScanNum < match.ScanNum && match.ScanNum < feature.MaxScanNum && Math.Abs(feature.Mass - match.Mass) < massTol)
                    {
                        feature.ProteinSpectrumMatches.Add(match);
                    }
                }
            }
        }

        alignment.AddDataSet(dataId, features, run);
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    for (var i = 0; i < nDataset; i++)
    {
        alignment.FillMissingFeatures(i);
        Console.WriteLine("{0} has been processed", datasets[i].Label);
    }

    OutputCrossTabWithId(outputfilePath, alignment, datasets.Select(ds => ds.Label).ToArray());
}
/// <summary>
/// Loads the ProMex features of every dataset (both the ".ms1ft" and the ".seqtag.ms1ft" file),
/// tags them with the MSPathFinder matches found in "{dataset}_IcTda.tsv" when that file exists,
/// aligns the features across all datasets and writes the crosstab.
/// </summary>
/// <param name="datasets">Dataset names (file names without extension).</param>
/// <param name="mspfFolder">Folder holding the "_IcTda.tsv" identification files.</param>
/// <param name="ms1ftFolder">Folder holding the ".ms1ft" and ".seqtag.ms1ft" feature files.</param>
/// <param name="outFilePath">Path of the crosstab file to write.</param>
public void AlignFeatures(List<string> datasets, string mspfFolder, string ms1ftFolder, string outFilePath)
{
    var datasetCount = datasets.Count;
    var reader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(12);
    var alignment = new LcMsFeatureAlignment(new AnalysisCompRef.CompRefFeatureComparer(tolerance));

    for (var dataIndex = 0; dataIndex < datasetCount; dataIndex++)
    {
        var datasetName = datasets[dataIndex];

        var rawFile = string.Format(@"{0}\{1}.pbf", PbfPath, datasetName);
        var mspFile = string.Format(@"{0}\{1}_IcTda.tsv", mspfFolder, datasetName);
        var ms1FtFile = string.Format(@"{0}\{1}.ms1ft", ms1ftFolder, datasetName);
        var ms1FtFile2 = string.Format(@"{0}\{1}.seqtag.ms1ft", ms1ftFolder, datasetName);

        var run = PbfLcMsRun.GetLcMsRun(rawFile);

        // Features of both files are pooled into a single list for this dataset
        var features = LcMsFeatureAlignment.LoadProMexResult(dataIndex, ms1FtFile, run);
        features.AddRange(LcMsFeatureAlignment.LoadProMexResult(dataIndex, ms1FtFile2, run));

        if (File.Exists(mspFile))
        {
            var prsmList = reader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsPathFinder);

            foreach (var prsm in prsmList)
            {
                prsm.ProteinId = prsm.ProteinName;
            }

            // tag features by PrSMs
            foreach (var feature in features)
            {
                var massTol = tolerance.GetToleranceAsTh(feature.Mass);

                foreach (var prsm in prsmList)
                {
                    var scanInsideFeature = feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum;
                    if (scanInsideFeature && Math.Abs(feature.Mass - prsm.Mass) < massTol)
                    {
                        feature.ProteinSpectrumMatches.Add(prsm);
                    }
                }
            }
        }

        alignment.AddDataSet(dataIndex, features, run);
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    for (var dataIndex = 0; dataIndex < datasetCount; dataIndex++)
    {
        alignment.FillMissingFeatures(dataIndex);
        Console.WriteLine("{0} has been processed", datasets[dataIndex]);
    }

    AnalysisCompRef.OutputCrossTabWithId(outFilePath, alignment, datasets.ToArray());
}
/// <summary>
/// Aligns the features of ten CPTAC replicate datasets and writes the aligned abundances to
/// "aligned_features.tsv" in the feature folder. A replicate whose .pbf file or "_isos.tsv"
/// file is missing is skipped with a console warning.
/// </summary>
public void TestCptac10Replicates()
{
    const string featureFolder = @"D:\MassSpecFiles\CPTAC_rep10\icr2ls";
    const string rawFolder = @"\\proto-11\MSXML_Cache\PBF_Gen_1_193\2015_1";
    const int replicateCount = 10;

    var outFilePath = string.Format(@"{0}\aligned_features.tsv", featureFolder);
    var align = new LcMsFeatureAlignment(new LcMsFeatureAlignComparer(new Tolerance(10)));
    var dataNames = new string[replicateCount];

    for (var rep = 0; rep < replicateCount; rep++)
    {
        // The name is recorded even when the replicate is skipped below
        dataNames[rep] = string.Format(@"CPTAC_Intact_rep{0}_15Jan15_Bane_C2-14-08-02RZ", rep + 1);

        var featureFilePath = string.Format(@"{0}\{1}_isos.tsv", featureFolder, dataNames[rep]);
        var rawFile = string.Format(@"{0}\{1}.pbf", rawFolder, dataNames[rep]);

        // The raw file is checked first; the feature file only when the raw file exists
        var missingFile = !File.Exists(rawFile)
            ? rawFile
            : (!File.Exists(featureFilePath) ? featureFilePath : null);

        if (missingFile != null)
        {
            Console.WriteLine(@"Warning: Skipping file not found: {0}", missingFile);
            continue;
        }

        var run = PbfLcMsRun.GetLcMsRun(rawFile);
        var features = LcMsFeatureAlignment.LoadProMexResult(rep, featureFilePath, run);
        align.AddDataSet(rep, features, run);
    }

    align.AlignFeatures();
    Console.WriteLine("# of aligned features = {0}", align.CountAlignedFeatures);

    OutputAlignmentResult(align, outFilePath, dataNames);
}
/// <summary>
/// Writes the aligned features as a tab-separated table: three leading columns
/// (MonoMass, MinElutionTime, MaxElutionTime) followed by one abundance column per dataset.
/// A dataset without a feature in an alignment row gets an abundance of 0.
/// </summary>
/// <param name="align">Alignment whose aligned features are written.</param>
/// <param name="outFilePath">Path of the file to create (an existing file is overwritten).</param>
/// <param name="dataName">Column header for each dataset, indexed like the alignment's datasets.</param>
private void OutputAlignmentResult(LcMsFeatureAlignment align, string outFilePath, string[] dataName)
{
    var alignedFeatureList = align.GetAlignedFeatures();

    // 'using' guarantees the file is flushed and closed even when a write throws
    using (var writer = new StreamWriter(outFilePath))
    {
        writer.Write("MonoMass\tMinElutionTime\tMaxElutionTime");
        for (var i = 0; i < align.CountDatasets; i++)
        {
            writer.Write("\t{0}", dataName[i]);
        }
        writer.Write("\n");

        for (var i = 0; i < align.CountAlignedFeatures; i++)
        {
            var features = alignedFeatureList[i];
            var minMaxNet = TestLcMsFeatureAlignment.GetMinMaxNet(features);

            // Tab-separated so the three leading values line up with the three header columns
            writer.Write("{0}\t{1:0.00000}\t{2:0.00000}", minMaxNet.Item1, minMaxNet.Item3, minMaxNet.Item4);

            for (var j = 0; j < align.CountDatasets; j++)
            {
                var feature = features[j];
                writer.Write("\t");
                writer.Write(feature != null ? feature.Abundance : 0d);
            }
            writer.Write("\n");
        }
    }
}
/// <summary>
/// Writes the aligned features as a tab-separated crosstab. Each row holds the median mass and
/// median min/max elution time of the aligned features, then per dataset the abundance and the
/// score, then the identification columns of the first protein-spectrum match found in the row
/// (a single space per column when there is none), and finally per dataset the number of
/// protein-spectrum matches.
/// </summary>
/// <param name="outputFilePath">Path of the file to create (an existing file is overwritten).</param>
/// <param name="alignment">Alignment whose aligned features are written.</param>
private void OutputCrossTabWithId(string outputFilePath, LcMsFeatureAlignment alignment)
{
    using (var writer = new StreamWriter(outputFilePath))
    {
        var headerLine = new List<string> { "MonoMass", "MinElutionTime", "MaxElutionTime" };

        for (var i = 0; i < DATASET_COUNT; i++)
        {
            headerLine.Add(GetDataSetNames(i) + "_Abundance");
        }

        for (var i = 0; i < DATASET_COUNT; i++)
        {
            headerLine.Add(GetDataSetNames(i) + "_Ms1Score");
        }

        headerLine.AddRange(new[]
        {
            "Pre", "Sequence", "Post", "Modifications", "ProteinName",
            "ProteinDesc", "ProteinLength", "Start", "End"
        });

        for (var i = 0; i < DATASET_COUNT; i++)
        {
            headerLine.Add(GetDataSetNames(i) + "_SpectraCount");
        }

        writer.WriteLine(string.Join("\t", headerLine));

        foreach (var features in alignment.GetAlignedFeatures())
        {
            // Datasets without a feature in this row are null entries; medians skip them
            var present = features.Where(f => f != null).ToList();

            var mass = present.Select(f => f.Mass).Median();
            var minElutionTime = present.Select(f => f.MinElutionTime).Median();
            var maxElutionTime = present.Select(f => f.MaxElutionTime).Median();

            var dataLine = new List<string>
            {
                PRISM.StringUtilities.DblToString(mass, 4),
                PRISM.StringUtilities.DblToString(minElutionTime, 3),
                PRISM.StringUtilities.DblToString(maxElutionTime, 3)
            };

            for (var i = 0; i < DATASET_COUNT; i++)
            {
                dataLine.Add(features[i] == null
                    ? "0"
                    : PRISM.StringUtilities.DblToString(features[i].Abundance, 2));
            }

            for (var i = 0; i < DATASET_COUNT; i++)
            {
                if (features[i] == null)
                {
                    dataLine.Add("0");
                }
                else if (features[i].Score <= float.MinValue)
                {
                    // Scores at or below float.MinValue are written in scientific notation
                    dataLine.Add(PRISM.StringUtilities.DblToStringScientific(float.MinValue, 2));
                }
                else
                {
                    dataLine.Add(PRISM.StringUtilities.DblToString(features[i].Score, 3));
                }
            }

            // First match of the first feature in the row that has any match
            var prsm = features
                .Where(f => f?.ProteinSpectrumMatches != null && f.ProteinSpectrumMatches.Count > 0)
                .Select(f => f.ProteinSpectrumMatches[0])
                .FirstOrDefault();

            if (prsm == null)
            {
                for (var k = 0; k < 9; k++)
                {
                    dataLine.Add(" ");
                }
            }
            else
            {
                dataLine.Add(prsm.Pre);
                dataLine.Add(prsm.Sequence);
                dataLine.Add(prsm.Post);
                dataLine.Add(prsm.Modifications);
                dataLine.Add(prsm.ProteinName);
                dataLine.Add(prsm.ProteinDesc);
                dataLine.Add(prsm.ProteinLength.ToString());
                dataLine.Add(prsm.FirstResidue.ToString());
                dataLine.Add(prsm.LastResidue.ToString());
            }

            // spectral count from ms2; a missing feature or a missing match list counts as 0,
            // matching the null guard used when picking the representative match above
            for (var i = 0; i < DATASET_COUNT; i++)
            {
                var spectraCount = features[i]?.ProteinSpectrumMatches?.Count ?? 0;
                dataLine.Add(spectraCount.ToString());
            }

            writer.WriteLine(string.Join("\t", dataLine));
        }
    }

    Console.WriteLine("Results written to " + outputFilePath);
}
/// <summary>
/// Writes the aligned features as a tab-separated table (three leading columns, then one
/// abundance column and one score column per dataset). Unless <paramref name="isTemp"/> is
/// true, it additionally writes one "{dataset}.aligned.ms1ft" file per dataset into the
/// directory of <paramref name="outFilePath"/>.
/// </summary>
/// <param name="align">Alignment whose aligned features are written.</param>
/// <param name="outFilePath">Path of the table file to create (an existing file is overwritten).</param>
/// <param name="rawFiles">Raw file paths, indexed like the alignment's datasets; the file name without extension is used as dataset name.</param>
/// <param name="isTemp">When true, only the table is written and the per-dataset files are skipped.</param>
private void OutputAlignmentResult(LcMsFeatureAlignment align, string outFilePath, List<string> rawFiles, bool isTemp = true)
{
    var alignedFeatureList = align.GetAlignedFeatures();

    // 'using' guarantees the file is flushed and closed even when a write throws
    using (var writer = new StreamWriter(outFilePath))
    {
        writer.Write("MonoMass\tMinElutionTime\tMaxElutionTime");

        for (var i = 0; i < align.CountDatasets; i++)
        {
            var dataSetName = Path.GetFileNameWithoutExtension(rawFiles[i]);
            writer.Write("\t{0}", dataSetName);
        }

        // Score columns are labelled with the dataset index, not the dataset name
        for (var i = 0; i < align.CountDatasets; i++)
        {
            writer.Write("\t{0}_Score", i);
        }

        writer.Write("\n");

        for (var i = 0; i < align.CountAlignedFeatures; i++)
        {
            var features = alignedFeatureList[i];
            var minMaxNet = GetMinMaxNet(features);

            // Tab-separated so the three leading values line up with the three header columns
            writer.Write("{0}\t{1:0.00000}\t{2:0.00000}", minMaxNet.Item1, minMaxNet.Item3, minMaxNet.Item4);

            for (var j = 0; j < align.CountDatasets; j++)
            {
                var feature = features[j];
                writer.Write("\t");
                writer.Write(feature != null ? feature.Abundance : 0d);
            }

            for (var j = 0; j < align.CountDatasets; j++)
            {
                var feature = features[j];
                writer.Write("\t");
                writer.Write(feature != null ? feature.Score : 0d);
            }

            writer.Write("\n");
        }
    }

    if (isTemp)
    {
        return;
    }

    var outDirectory = Path.GetDirectoryName(Path.GetFullPath(outFilePath));

    for (var i = 0; i < align.CountDatasets; i++)
    {
        var dataSetName = Path.GetFileNameWithoutExtension(rawFiles[i]);

        // now output results!!
        var ms1ftFilePath = String.Format(@"{0}\{1}.aligned.ms1ft", outDirectory, dataSetName);

        using (var writer2 = new StreamWriter(ms1ftFilePath))
        {
            writer2.WriteLine(LcMsFeatureFinderLauncher.GetHeaderString());

            for (var j = 0; j < align.CountAlignedFeatures; j++)
            {
                var f1 = alignedFeatureList[j][i];

                // Each line starts with the 1-based index of the alignment row
                writer2.Write("{0}\t", j + 1);
                writer2.WriteLine(LcMsFeatureFinderLauncher.GetString(f1));
            }
        }
    }
}
/// <summary>
/// Aligns the features of the given ms1ft files, writes the result before abundance refinement
/// to "{outFilePath}.tmp" and the result after refinement to <paramref name="outFilePath"/>.
/// </summary>
/// <param name="ms1FtFiles">Feature file paths passed to the alignment.</param>
/// <param name="rawFiles">Raw file paths; one PbfLcMsRun is opened per entry, in order.</param>
/// <param name="outFilePath">Path of the final output file.</param>
private void RunFeatureAlignment(List<string> ms1FtFiles, List<string> rawFiles, string outFilePath)
{
    // One run per raw file, in the same order as rawFiles
    var runs = rawFiles.ConvertAll<LcMsRun>(rawFilePath => new PbfLcMsRun(rawFilePath));

    var align = new LcMsFeatureAlignment(ms1FtFiles, runs, new LcMsFeatureAlignComparer(new Tolerance(10)));
    align.AlignFeatures();

    Console.WriteLine("# of aligned features = {0}", align.CountAlignedFeatures);

    // Snapshot before refinement goes to the .tmp file (table only)
    OutputAlignmentResult(align, outFilePath + ".tmp", rawFiles, true);

    align.RefineAbundance();

    // Final result: table plus the per-dataset files
    OutputAlignmentResult(align, outFilePath, rawFiles, false);
}
/// <summary>
/// Aligns the ProMex features of six IMER datasets, tagging the features with the MS-GF+
/// matches of the corresponding "_msgfdb_syn.txt" file when it exists, and writes the
/// crosstab to promex_crosstab.tsv.
/// </summary>
public void TestIMERFeatureAlignment()
{
    const string outFilePath = @"D:\MassSpecFiles\IMER\promex_crosstab.tsv";
    const string rawFolder = @"D:\MassSpecFiles\IMER";

    var runLabels = new[] { "1", "2", "3", "4", "5", "6" };
    var datasetCount = runLabels.Length;

    var reader = new ProteinSpectrumMatchReader();
    var tolerance = new Tolerance(10);
    var alignment = new LcMsFeatureAlignment(new CompRefFeatureComparer(tolerance));

    for (var dataIndex = 0; dataIndex < datasetCount; dataIndex++)
    {
        var label = runLabels[dataIndex];

        // Number that precedes "May14" in the file names: 14 for runs 2 and 3, otherwise 13
        var day = (label.Equals("2") || label.Equals("3")) ? 14 : 13;

        var rawFile = string.Format(@"{0}\Diabetes_iPSC_Beta_{1}_IMER_{2}May14_Alder_14-01-33.pbf", rawFolder, label, day);
        var mspFile = string.Format(@"{0}\Diabetes_iPSC_Beta_{1}_IMER_{2}May14_Alder_14-01-33_msgfdb_syn.txt", rawFolder, label, day);
        var ms1FtFile = string.Format(@"{0}\Diabetes_iPSC_Beta_{1}_IMER_{2}May14_Alder_14-01-33.ms1ft", rawFolder, label, day);

        Console.WriteLine(rawFile);
        Console.WriteLine(File.Exists(rawFile));

        var run = PbfLcMsRun.GetLcMsRun(rawFile);
        var features = LcMsFeatureAlignment.LoadProMexResult(dataIndex, ms1FtFile, run, 500, 15000);

        if (File.Exists(mspFile))
        {
            var prsmList = reader.LoadIdentificationResult(mspFile, ProteinSpectrumMatch.SearchTool.MsGfPlus);

            foreach (var prsm in prsmList)
            {
                prsm.ProteinId = prsm.ProteinName;
            }

            // tag features by PrSMs
            foreach (var feature in features)
            {
                var massTol = tolerance.GetToleranceAsTh(feature.Mass);

                foreach (var prsm in prsmList)
                {
                    var scanInsideFeature = feature.MinScanNum < prsm.ScanNum && prsm.ScanNum < feature.MaxScanNum;
                    if (scanInsideFeature && Math.Abs(feature.Mass - prsm.Mass) < massTol)
                    {
                        feature.ProteinSpectrumMatches.Add(prsm);
                    }
                }
            }
        }

        alignment.AddDataSet(dataIndex, features, run);
    }

    alignment.AlignFeatures();
    Console.WriteLine("{0} alignments ", alignment.CountAlignedFeatures);

    for (var dataIndex = 0; dataIndex < datasetCount; dataIndex++)
    {
        alignment.FillMissingFeatures(dataIndex);
        Console.WriteLine("{0} has been processed", runLabels[dataIndex]);
    }

    OutputCrossTabWithId(outFilePath, alignment, runLabels);
}
/// <summary>
/// Writes the aligned features as a tab-separated table (three leading columns, then one
/// abundance column and one score column per dataset). Unless <paramref name="isTemp"/> is
/// true, it additionally writes one "{dataset}.aligned.ms1ft" file per dataset into the
/// directory of <paramref name="outFilePath"/>.
/// </summary>
/// <param name="align">Alignment whose aligned features are written.</param>
/// <param name="outFilePath">Path of the table file to create (an existing file is overwritten).</param>
/// <param name="rawFiles">Raw file paths, indexed like the alignment's datasets; the file name without extension is used as dataset name.</param>
/// <param name="isTemp">When true, only the table is written and the per-dataset files are skipped.</param>
private void OutputAlignmentResult(LcMsFeatureAlignment align, string outFilePath, IReadOnlyList<string> rawFiles, bool isTemp = true)
{
    var alignedFeatureList = align.GetAlignedFeatures();

    // 'using' guarantees the file is flushed and closed even when a write throws
    using (var writer = new StreamWriter(outFilePath))
    {
        writer.Write("MonoMass\tMinElutionTime\tMaxElutionTime");

        for (var i = 0; i < align.CountDatasets; i++)
        {
            var dataSetName = Path.GetFileNameWithoutExtension(rawFiles[i]);
            writer.Write("\t{0}", dataSetName);
        }

        // Score columns are labelled with the dataset index, not the dataset name
        for (var i = 0; i < align.CountDatasets; i++)
        {
            writer.Write("\t{0}_Score", i);
        }

        writer.Write("\n");

        for (var i = 0; i < align.CountAlignedFeatures; i++)
        {
            var features = alignedFeatureList[i];
            var minMaxNet = GetMinMaxNet(features);

            // Regular (non-verbatim) string: in a verbatim literal "\t" is written out as a
            // backslash followed by 't' instead of a tab character
            writer.Write("{0}\t{1:0.00000}\t{2:0.00000}", minMaxNet.Item1, minMaxNet.Item3, minMaxNet.Item4);

            for (var j = 0; j < align.CountDatasets; j++)
            {
                var feature = features[j];
                writer.Write("\t");
                writer.Write(feature?.Abundance ?? 0d);
            }

            for (var j = 0; j < align.CountDatasets; j++)
            {
                var feature = features[j];
                writer.Write("\t");
                writer.Write(feature?.Score ?? 0d);
            }

            writer.Write("\n");
        }
    }

    if (isTemp)
    {
        return;
    }

    var outDirectory = Path.GetDirectoryName(Path.GetFullPath(outFilePath));

    for (var i = 0; i < align.CountDatasets; i++)
    {
        var dataSetName = Path.GetFileNameWithoutExtension(rawFiles[i]);

        // now output results!!
        var ms1ftFilePath = string.Format(@"{0}\{1}.aligned.ms1ft", outDirectory, dataSetName);

        using (var writer2 = new StreamWriter(ms1ftFilePath))
        {
            writer2.WriteLine(LcMsFeatureFinderLauncher.GetHeaderString());

            for (var j = 0; j < align.CountAlignedFeatures; j++)
            {
                var f1 = alignedFeatureList[j][i];

                // Each line starts with the 1-based index of the alignment row
                writer2.Write("{0}\t", j + 1);
                writer2.WriteLine(LcMsFeatureFinderLauncher.GetString(f1));
            }
        }
    }
}
/// <summary>
/// Writes the aligned features as a tab-separated crosstab. Each row holds the median mass and
/// median min/max elution time of the aligned features, then per dataset the abundance and the
/// score, then the identification columns of the first protein-spectrum match found in the row
/// (a single space per column when there is none), and finally per dataset the number of
/// protein-spectrum matches. Lines are terminated with "\n".
/// </summary>
/// <param name="outputFilePath">Path of the file to create (an existing file is overwritten).</param>
/// <param name="alignment">Alignment whose aligned features are written.</param>
public void OutputCrossTabWithId(string outputFilePath, LcMsFeatureAlignment alignment)
{
    // 'using' guarantees the file is flushed and closed even when a write throws
    using (var writer = new StreamWriter(outputFilePath))
    {
        writer.Write("MonoMass\tMinElutionTime\tMaxElutionTime");

        for (var i = 0; i < NdataSet; i++)
        {
            writer.Write("\t");
            writer.Write(GetDataSetNames(i) + "_Abundance");
        }

        for (var i = 0; i < NdataSet; i++)
        {
            writer.Write("\t");
            writer.Write(GetDataSetNames(i) + "_Ms1Score");
        }

        writer.Write("\tPre\tSequence\tPost\tModifications\tProteinName\tProteinDesc\tProteinLength\tStart\tEnd");

        for (var i = 0; i < NdataSet; i++)
        {
            writer.Write("\t");
            writer.Write(GetDataSetNames(i) + "_SpectraCount");
        }

        writer.Write("\n");

        var alignedFeatureList = alignment.GetAlignedFeatures();
        for (var j = 0; j < alignedFeatureList.Count; j++)
        {
            var features = alignedFeatureList[j];

            // Datasets without a feature in this row are null entries; medians skip them
            var mass = features.Where(f => f != null).Select(f => f.Mass).Median();
            var minElutionTime = features.Where(f => f != null).Select(f => f.MinElutionTime).Median();
            var maxElutionTime = features.Where(f => f != null).Select(f => f.MaxElutionTime).Median();

            writer.Write(mass);
            writer.Write("\t");
            writer.Write(minElutionTime);
            writer.Write("\t");
            writer.Write(maxElutionTime);

            for (var i = 0; i < NdataSet; i++)
            {
                writer.Write("\t");
                writer.Write(features[i] == null ? 0 : features[i].Abundance);
            }

            for (var i = 0; i < NdataSet; i++)
            {
                writer.Write("\t");
                writer.Write(features[i] == null ? 0 : features[i].Score);
            }

            // First match of the first feature in the row that has any match
            var prsm = (from f in features
                        where f != null && f.ProteinSpectrumMatches != null && f.ProteinSpectrumMatches.Count > 0
                        select f.ProteinSpectrumMatches[0]).FirstOrDefault();

            if (prsm == null)
            {
                for (var k = 0; k < 9; k++)
                {
                    writer.Write("\t");
                    writer.Write(" ");
                }
            }
            else
            {
                writer.Write("\t");
                writer.Write(prsm.Pre);
                writer.Write("\t");
                writer.Write(prsm.Sequence);
                writer.Write("\t");
                writer.Write(prsm.Post);
                writer.Write("\t");
                writer.Write(prsm.Modifications);
                writer.Write("\t");
                writer.Write(prsm.ProteinName);
                writer.Write("\t");
                writer.Write(prsm.ProteinDesc);
                writer.Write("\t");
                writer.Write(prsm.ProteinLength);
                writer.Write("\t");
                writer.Write(prsm.FirstResidue);
                writer.Write("\t");
                writer.Write(prsm.LastResidue);
            }

            // spectral count from ms2; a missing feature or a missing match list counts as 0,
            // matching the null guard used when picking the representative match above
            for (var i = 0; i < NdataSet; i++)
            {
                var feature = features[i];
                writer.Write("\t");
                writer.Write(feature == null || feature.ProteinSpectrumMatches == null ? 0 : feature.ProteinSpectrumMatches.Count);
            }

            writer.Write("\n");
        }
    }
}
/// <summary>
/// Aligns the ProMex features of 32 UTEX datasets, tagging the features with the MSAlign
/// matches of each dataset, refines the abundances and writes one "{dataset}_aligned.ms1ft"
/// file per dataset. The test is ignored when the raw folder does not exist or when no dataset
/// has all three input files (.pbf, MSAlign result table, .ms1ft).
/// </summary>
public void TestAlignFeatures()
{
    var methodName = MethodBase.GetCurrentMethod().Name;
    Utils.ShowStarting(methodName);

    const string rawFolder = @"\\proto-11\MSXML_Cache\PBF_Gen_1_193\2015_2";
    const string promexOutFolder = @"D:\MassSpecFiles\UTEX\MSAlign";
    const string msAlignResultFolder = @"D:\MassSpecFiles\UTEX\MSAlign";

    if (!Directory.Exists(rawFolder))
    {
        Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, rawFolder);
    }

    var nDataset = 32;
    var dataset = new string[nDataset];
    for (var i = 0; i < nDataset; i++)
    {
        dataset[i] = string.Format("Syn_utex2973_Top_{0,2:D2}_TopDown_7May15_Bane_14-09-01RZ", i + 1);
    }

    var tolerance = new Tolerance(10);
    var ftComparer = new UtexFeatureComparer(tolerance);
    var align = new LcMsFeatureAlignment(ftComparer);
    var prsmReader = new ProteinSpectrumMatchReader(0.01);
    var filesProcessed = 0;

    for (var i = 0; i < dataset.Length; i++)
    {
        var rawFile = string.Format(@"{0}\{1}.pbf", rawFolder, dataset[i]);
        if (!File.Exists(rawFile))
        {
            Console.WriteLine(@"Warning: Skipping file not found: {0}", rawFile);
            continue;
        }

        var run = PbfLcMsRun.GetLcMsRun(rawFile);

        var path = string.Format(@"{0}\{1}_MSAlign_ResultTable.txt", msAlignResultFolder, dataset[i]);
        if (!File.Exists(path))
        {
            Console.WriteLine(@"Warning: Skipping file not found: {0}", path);
            continue;
        }

        var ms1ftPath = string.Format(@"{0}\{1}.ms1ft", promexOutFolder, dataset[i]);
        if (!File.Exists(ms1ftPath))
        {
            Console.WriteLine(@"Warning: Skipping file not found: {0}", ms1ftPath);
            continue;
        }

        filesProcessed++;

        var prsmList = prsmReader.LoadIdentificationResult(path, ProteinSpectrumMatch.SearchTool.MsAlign);

        // The protein id is the 5 characters that follow the name prefix
        for (var j = 0; j < prsmList.Count; j++)
        {
            var match = prsmList[j];
            match.ProteinId = match.ProteinName.Substring(
                match.ProteinName.IndexOf(ProteinNamePrefix) + ProteinNamePrefix.Length, 5);
        }

        var features = LcMsFeatureAlignment.LoadProMexResult(i, ms1ftPath, run);

        // tag features by PrSMs
        for (var j = 0; j < features.Count; j++)
        {
            var massTol = tolerance.GetToleranceAsMz(features[j].Mass);
            foreach (var match in prsmList)
            {
                if (features[j].MinScanNum < match.ScanNum && match.ScanNum < features[j].MaxScanNum && Math.Abs(features[j].Mass - match.Mass) < massTol)
                {
                    features[j].ProteinSpectrumMatches.Add(match);
                }
            }
        }

        align.AddDataSet(i, features, run);
    }

    if (filesProcessed == 0)
    {
        Assert.Ignore("Skipped since input files not found");
    }

    align.AlignFeatures();
    Console.WriteLine("{0} alignments ", align.CountAlignedFeatures);
    align.RefineAbundance();

    var alignedFeatureList = align.GetAlignedFeatures();

    for (var i = 0; i < nDataset; i++)
    {
        var ms1ftPath = string.Format(@"{0}\{1}_aligned.ms1ft", promexOutFolder, dataset[i]);

        // 'using' guarantees the file is flushed and closed even when a write throws
        using (var writer = new StreamWriter(ms1ftPath))
        {
            writer.Write(LcMsFeatureFinderLauncher.GetHeaderString());
            writer.WriteLine("\tIdedMs2ScanNums");

            for (var j = 0; j < alignedFeatureList.Count; j++)
            {
                // Each line starts with the 1-based index of the alignment row
                writer.Write(j + 1);
                writer.Write("\t");

                var feature = alignedFeatureList[j][i];
                if (feature == null)
                {
                    // No feature for this dataset in this row: 15 zero-valued columns
                    for (var k = 0; k < 14; k++)
                    {
                        writer.Write("0\t");
                    }
                    writer.Write("0\n");
                    continue;
                }

                writer.Write(LcMsFeatureFinderLauncher.GetString(feature));
                writer.Write("\t");

                // The last column lists the scan numbers of the tagged matches, ';'-separated;
                // it stays empty when the feature has no match list
                if (feature.ProteinSpectrumMatches != null)
                {
                    var scanNums = string.Join(";", feature.ProteinSpectrumMatches.Select(prsm => prsm.ScanNum));
                    writer.Write(scanNums);
                }

                writer.Write("\n");
            }
        }
    }
}
/// <summary>
/// Array convenience overload: copies <paramref name="runLabels"/> into a list and forwards
/// to the list-based OutputCrossTabWithId overload.
/// </summary>
/// <param name="outputFilePath">Path of the crosstab file to write.</param>
/// <param name="alignment">Alignment whose aligned features are written.</param>
/// <param name="runLabels">Run labels, one per dataset.</param>
private void OutputCrossTabWithId(string outputFilePath, LcMsFeatureAlignment alignment, string[] runLabels)
{
    var labelList = runLabels.ToList();
    OutputCrossTabWithId(outputFilePath, alignment, labelList);
}