/// <summary>
/// Fills the fragmentation statistics graphs from a random sample of the filtered PSMs.
/// </summary>
/// <remarks>
/// Up to 1000 PSM ids are sampled from the current view filter. For every sampled spectrum the
/// theoretical fragments (z = 1) of each ion series are looked up in the peak list using the
/// fragment tolerance read from the UI. Per ion series, the matched share of the summed
/// intensity, the matched share of the peak count and the mean m/z error are appended as one
/// jittered point to curve 3 of pane i + 1 of the three graph controls. After all spectra are
/// processed, a six-number summary is added for every ion series with at least 5 values.
/// Spectra without peaks (or with zero summed intensity) contribute no points.
/// </remarks>
/// <exception cref="FileNotFoundException">A spectrum source could not be located.</exception>
private void getFragmentationStatistics()
{
    IList<object[]> queryRows;
    lock (session)
    {
        var randomIds = session.CreateQuery("SELECT psm.Id " + viewFilter.GetFilteredQueryString(DataFilter.FromPeptideSpectrumMatch))
                               .List<long>()
                               .Shuffle()
                               .Take(1000)
                               .OrderBy(o => o);
        string randomIdSet = String.Join(",", randomIds.Select(o => o.ToString()).ToArray());
        queryRows = session.CreateQuery("SELECT psm.Spectrum.Source.Name, psm.Spectrum, psm, DISTINCT_GROUP_CONCAT(pm.Offset || ':' || mod.MonoMassDelta), psm.Peptide.Sequence " +
                                        "FROM PeptideSpectrumMatch psm " +
                                        "LEFT JOIN psm.Modifications pm " +
                                        "LEFT JOIN pm.Modification mod " +
                                        "WHERE psm.Id IN (" + randomIdSet + ") " +
                                        "GROUP BY psm.Spectrum.id ")
                           .List<object[]>();
    }

    // ordering by source name lets each source file be opened only once
    var spectrumRows = queryRows.Select(o => new SpectrumRow(o)).OrderBy(o => o.SourceName);

    int seriesCount = (int)IonSeries.Count;

    var percentTicBySpectrumByFragmentType = new List<PointPairList>();
    var percentPeakCountBySpectrumByFragmentType = new List<PointPairList>();
    var meanMzErrorBySpectrumByFragmentType = new List<PointPairList>();
    var percentTicListByFragmentType = new List<List<double>>();
    var percentPeakCountListByFragmentType = new List<List<double>>();
    var meanMzErrorListByFragmentType = new List<List<double>>();

    foreach (var graphControl in graphControls)
    {
        graphControl.MasterPane.PaneList.ForEach(o => o.CurveList.ForEach(c => c.Clear()));
    }

    for (int i = 0; i < seriesCount; ++i)
    {
        percentTicBySpectrumByFragmentType.Add(percentTicGraphControl.MasterPane.PaneList[i + 1].CurveList[3].Points as PointPairList);
        percentPeakCountBySpectrumByFragmentType.Add(percentPeakCountGraphControl.MasterPane.PaneList[i + 1].CurveList[3].Points as PointPairList);
        meanMzErrorBySpectrumByFragmentType.Add(meanMzErrorGraphControl.MasterPane.PaneList[i + 1].CurveList[3].Points as PointPairList);
        percentTicListByFragmentType.Add(new List<double>());
        percentPeakCountListByFragmentType.Add(new List<double>());
        meanMzErrorListByFragmentType.Add(new List<double>());
    }

    int spectraCount = 0;
    maxPercentTic = 10;
    maxPercentPeakCount = 10;
    maxMeanMzError = 0.1;

    var tolerance = fragmentTolerance;
    string spectrumListFilters = String.Empty;

    // the tolerance and filter settings live in controls, so they are read on the UI thread
    Invoke(new MethodInvoker(() =>
    {
        tolerance.value = Convert.ToDouble(fragmentToleranceTextBox.Text);
        tolerance.units = (MZTolerance.Units)fragmentToleranceUnitsComboBox.SelectedIndex;
        meanMzErrorGraphControl.GraphPane.YAxis.Title.Text = "Mean m/z error (" + tolerance.units.ToString() + ")";
        spectrumListFilters = spectrumFiltersTextBox.Text;
        setAutomaticScales();
    }));

    // every ion series except the Count sentinel; identical for all spectra
    IonSeries[] ionSeries = Enum.GetValues(typeof(IonSeries)).Cast<IonSeries>().Where(o => o != IonSeries.Count).ToArray();

    // One generator for the whole run: generators created in quick succession can share a
    // time-based seed and would then give every spectrum the same jitter sequence.
    var rng = new Random();

    string currentSourceName = null;
    string currentSourcePath = null;
    pwiz.CLI.msdata.MSDataFile msd = null;

    lock (owner)
    {
        try
        {
            foreach (var row in spectrumRows)
            {
                if (row.SourceName != currentSourceName)
                {
                    currentSourceName = row.SourceName;
                    currentSourcePath = IDPickerForm.LocateSpectrumSource(currentSourceName, session.Connection.GetDataSource());
                    if (String.IsNullOrEmpty(currentSourcePath))
                    {
                        throw new FileNotFoundException("source file not found");
                    }

                    // release the previous source file before opening the next one
                    if (msd != null)
                    {
                        msd.Dispose();
                        msd = null;
                    }

                    msd = new pwiz.CLI.msdata.MSDataFile(currentSourcePath);
                    SpectrumListFactory.wrap(msd, spectrumListFilters.Split(";".ToCharArray(), StringSplitOptions.RemoveEmptyEntries));
                }

                string spectrumId = String.Format("{0}/{1}", row.SourceName, msdata.id.abbreviate(row.Spectrum.NativeID));
                var spectrumList = msd.run.spectrumList;
                ++spectraCount;

                var pwizPeptide = new proteome.Peptide(row.ModifiedSequence,
                                                       proteome.ModificationParsing.ModificationParsing_Auto,
                                                       proteome.ModificationDelimiter.ModificationDelimiter_Brackets);
                var fragmentation = pwizPeptide.fragmentation(true, true);
                var pwizSpectrum = spectrumList.spectrum(spectrumList.find(row.Spectrum.NativeID), true);
                var pointMap = new seems.PointMap(new ZedGraph.PointPairList(pwizSpectrum.getMZArray().data, pwizSpectrum.getIntensityArray().data));
                double tic = pointMap.Values.Sum();

                // Without peaks or intensity the percentages below would be 0/0 = NaN, and
                // Math.Max would then keep NaN in the running maxima for the rest of the run.
                if (pointMap.Count > 0 && tic > 0)
                {
                    var percentTicByFragmentType = new List<double>(Enumerable.Repeat(0.0, seriesCount));
                    var percentPeakCountByFragmentType = new List<double>(Enumerable.Repeat(0.0, seriesCount));
                    var matchCountByFragmentType = new List<int>(Enumerable.Repeat(0, seriesCount));
                    var meanMzErrorByFragmentType = new List<double>(Enumerable.Repeat(Double.NaN, seriesCount));

                    for (int z = 1; z <= 1; ++z)
                    {
                        for (int length = 1, end = pwizPeptide.sequence.Length; length <= end; ++length)
                        {
                            foreach (IonSeries series in ionSeries)
                            {
                                // c and x fragments are not looked up at full peptide length
                                if ((series == IonSeries.c || series == IonSeries.x) && length == end)
                                {
                                    continue;
                                }

                                double expected = fragmentMass(fragmentation, series, length, z);

                                // NOTE(review): expected - (expected - tolerance) presumably turns the
                                // tolerance into an absolute m/z window via MZTolerance's operators - confirm.
                                seems.PointMap.Enumerator itr = pointMap.FindNear(expected, expected - (expected - tolerance));
                                if (itr == null || !itr.IsValid)
                                {
                                    continue;
                                }

                                int seriesIndex = (int)series;
                                percentTicByFragmentType[seriesIndex] += itr.Current.Value;
                                ++percentPeakCountByFragmentType[seriesIndex];
                                ++matchCountByFragmentType[seriesIndex];

                                // NaN marks "no match yet"; switch to a running sum on the first match
                                if (Double.IsNaN(meanMzErrorByFragmentType[seriesIndex]))
                                {
                                    meanMzErrorByFragmentType[seriesIndex] = 0;
                                }
                                meanMzErrorByFragmentType[seriesIndex] += mzError(itr.Current.Key, expected);
                            }
                        }
                    }

                    for (int i = 0; i < percentTicBySpectrumByFragmentType.Count; ++i)
                    {
                        // convert sum to mean
                        if (matchCountByFragmentType[i] > 0)
                        {
                            meanMzErrorByFragmentType[i] /= matchCountByFragmentType[i];
                        }

                        // convert to percentages
                        percentTicByFragmentType[i] /= tic / 100;
                        percentPeakCountByFragmentType[i] /= pointMap.Count / 100.0;

                        maxPercentTic = Math.Max(maxPercentTic, percentTicByFragmentType[i]);
                        maxPercentPeakCount = Math.Max(maxPercentPeakCount, percentPeakCountByFragmentType[i]);

                        // horizontal jitter in [-0.5, 0.5), shared by the three graphs for this point
                        double jitter = (rng.NextDouble() - 0.5);

                        percentTicBySpectrumByFragmentType[i].Add(jitter,
                                                                  percentTicByFragmentType[i],
                                                                  String.Format("{0}: {1:G4}% ({2} matches)", spectrumId, percentTicByFragmentType[i], matchCountByFragmentType[i]));
                        percentPeakCountBySpectrumByFragmentType[i].Add(jitter,
                                                                        percentPeakCountByFragmentType[i],
                                                                        String.Format("{0}: {1:G4}% ({2} matches)", spectrumId, percentPeakCountByFragmentType[i], matchCountByFragmentType[i]));
                        percentTicListByFragmentType[i].Add(percentTicByFragmentType[i]);
                        percentPeakCountListByFragmentType[i].Add(percentPeakCountByFragmentType[i]);

                        if (!Double.IsNaN(meanMzErrorByFragmentType[i]))
                        {
                            maxMeanMzError = Math.Max(maxMeanMzError, Math.Abs(meanMzErrorByFragmentType[i]));

                            // NOTE(review): the value is an m/z error, yet the label ends in '%' - confirm.
                            meanMzErrorBySpectrumByFragmentType[i].Add(jitter,
                                                                       meanMzErrorByFragmentType[i],
                                                                       String.Format("{0}: {1:G4}%", spectrumId, meanMzErrorByFragmentType[i]));
                            meanMzErrorListByFragmentType[i].Add(meanMzErrorByFragmentType[i]);
                        }
                    }
                }

                // NOTE(review): unlike the initial call, this one is not wrapped in Invoke -
                // confirm that setAutomaticScales is safe to call from this thread.
                if ((spectraCount % 100) == 0)
                {
                    setAutomaticScales();
                }
            } // for each spectrum row
        }
        finally
        {
            if (msd != null)
            {
                msd.Dispose();
            }
        }
    }

    Invoke(new MethodInvoker(() =>
    {
        for (int i = 0; i < percentTicBySpectrumByFragmentType.Count; ++i)
        {
            if (percentTicListByFragmentType[i].Count < 5)
            {
                continue;
            }

            percentTicListByFragmentType[i].Sort();
            percentPeakCountListByFragmentType[i].Sort();
            addSixNumberSummary(percentTicGraphControl.MasterPane.PaneList[i + 1], percentTicListByFragmentType[i]);
            addSixNumberSummary(percentPeakCountGraphControl.MasterPane.PaneList[i + 1], percentPeakCountListByFragmentType[i]);

            // the m/z error list only holds spectra with at least one match, so it can be shorter
            if (meanMzErrorListByFragmentType[i].Count < 5)
            {
                continue;
            }

            meanMzErrorListByFragmentType[i].Sort();
            addSixNumberSummary(meanMzErrorGraphControl.MasterPane.PaneList[i + 1], meanMzErrorListByFragmentType[i]);
        }
    }));
}
/// <summary>
/// Fills the precursor and charge-reduced precursor scatter plots from a random sample of the
/// filtered PSMs.
/// </summary>
/// <remarks>
/// Up to 1000 PSM ids are sampled from the current view filter. For every sampled spectrum,
/// each peak that matches a theoretical fragment (z = 1, tolerance 0.03) is plotted with the ion
/// series index as third coordinate; afterwards every peak of the spectrum is plotted with 0 as
/// third coordinate. X is the peak m/z minus the precursor m/z (first plot) or minus the
/// precursor m/z multiplied by the PSM charge (second plot); Y is the peak intensity divided by
/// the summed intensity of the spectrum. Spectra with zero summed intensity add no points.
/// </remarks>
/// <returns>Always an empty list.</returns>
/// <exception cref="FileNotFoundException">A spectrum source could not be located.</exception>
private List<double> getPeakStatistics()
{
    IList<object[]> queryRows;
    lock (session)
    {
        var randomIds = session.CreateQuery("SELECT psm.Id " + viewFilter.GetFilteredQueryString(DataFilter.FromPeptideSpectrumMatch))
                               .List<long>()
                               .Shuffle()
                               .Take(1000)
                               .OrderBy(o => o);
        string randomIdSet = String.Join(",", randomIds.Select(o => o.ToString()).ToArray());
        queryRows = session.CreateQuery("SELECT psm.Spectrum.Source.Name, psm.Spectrum, psm, DISTINCT_GROUP_CONCAT(pm.Offset || ':' || mod.MonoMassDelta), psm.Peptide.Sequence " +
                                        "FROM PeptideSpectrumMatch psm " +
                                        "LEFT JOIN psm.Modifications pm " +
                                        "LEFT JOIN pm.Modification mod " +
                                        "WHERE psm.Id IN (" + randomIdSet + ") " +
                                        "GROUP BY psm.Spectrum.id ")
                           .List<object[]>();
    }

    // ordering by source name lets each source file be opened only once
    var spectrumRows = queryRows.Select(o => new SpectrumRow(o)).OrderBy(o => o.SourceName);

    precursorScatterPlot.Clear();
    chargeReducedScatterPlot.Clear();

    int spectraCount = 0;
    string spectrumListFilters = String.Empty;

    // the filter setting lives in a control, so it is read on the UI thread
    Invoke(new MethodInvoker(() =>
    {
        spectrumListFilters = spectrumFiltersTextBox.Text;
        zedGraphControl.MasterPane.AxisChange();
        zedGraphControl.Refresh();
    }));

    // fixed settings, identical for all spectra
    bool plotMatchedPeaks = true;
    bool removeMatchedPeaks = false;
    double tolerance = 0.03;

    // every ion series except the Count sentinel
    IonSeries[] ionSeries = Enum.GetValues(typeof(IonSeries)).Cast<IonSeries>().Where(o => o != IonSeries.Count).ToArray();

    string currentSourceName = null;
    string currentSourcePath = null;
    pwiz.CLI.msdata.MSDataFile msd = null;

    lock (owner)
    {
        try
        {
            foreach (var row in spectrumRows)
            {
                if (row.SourceName != currentSourceName)
                {
                    currentSourceName = row.SourceName;
                    currentSourcePath = IDPickerForm.LocateSpectrumSource(currentSourceName, session.Connection.GetDataSource());

                    // same check as getFragmentationStatistics: fail with a clear error
                    // instead of handing an empty path to the file reader
                    if (String.IsNullOrEmpty(currentSourcePath))
                    {
                        throw new FileNotFoundException("source file not found");
                    }

                    // release the previous source file before opening the next one
                    if (msd != null)
                    {
                        msd.Dispose();
                        msd = null;
                    }

                    msd = new pwiz.CLI.msdata.MSDataFile(currentSourcePath);
                    SpectrumListFactory.wrap(msd, spectrumListFilters.Split(";".ToCharArray(), StringSplitOptions.RemoveEmptyEntries));
                }

                string label = String.Format("{0}/{1}\n{2}", row.SourceName, msdata.id.abbreviate(row.Spectrum.NativeID), row.ModifiedSequence);
                var spectrumList = msd.run.spectrumList;
                ++spectraCount;

                var pwizPeptide = new proteome.Peptide(row.ModifiedSequence,
                                                       proteome.ModificationParsing.ModificationParsing_Auto,
                                                       proteome.ModificationDelimiter.ModificationDelimiter_Brackets);
                var fragmentation = pwizPeptide.fragmentation(true, true);
                var pwizSpectrum = spectrumList.spectrum(spectrumList.find(row.Spectrum.NativeID), true);
                var pointMap = new seems.PointMap(new ZedGraph.PointPairList(pwizSpectrum.getMZArray().data, pwizSpectrum.getIntensityArray().data));
                double tic = pointMap.Values.Sum();

                double precursorMz = row.Spectrum.PrecursorMZ;
                double chargeReducedPrecursorMz = precursorMz * row.PeptideSpectrumMatch.Charge;

                // every Y value is intensity / tic, so a zero total would only yield NaN points
                if (tic > 0)
                {
                    for (int z = 1; z <= 1; ++z)
                    {
                        for (int length = 1, end = pwizPeptide.sequence.Length; length <= end; ++length)
                        {
                            string NTermFragment = row.ModifiedSequence.Substring(0, length);

                            foreach (IonSeries series in ionSeries)
                            {
                                // c, c-1 and x fragments are not looked up at full peptide length
                                if ((series == IonSeries.c || series == IonSeries.cMinus1 || series == IonSeries.x) && length == end)
                                {
                                    continue;
                                }

                                seems.PointMap.Enumerator itr = pointMap.FindNear(fragmentMass(fragmentation, series, length, z), tolerance);
                                if (itr == null || !itr.IsValid)
                                {
                                    continue;
                                }

                                if (plotMatchedPeaks)
                                {
                                    precursorScatterPlot.AddPoint(new PointPair(itr.Current.Key - precursorMz,
                                                                                itr.Current.Value / tic,
                                                                                (int)series,
                                                                                String.Format("{0} {1}\n{2} {3} {4} {5}", label, precursorMz, NTermFragment, itr.Current.Key, IonSeriesLabels[(int)series], length)));
                                    chargeReducedScatterPlot.AddPoint(new PointPair(itr.Current.Key - chargeReducedPrecursorMz,
                                                                                    itr.Current.Value / tic,
                                                                                    (int)series,
                                                                                    String.Format("{0} {1}\n{2} {3} {4} {5}", label, chargeReducedPrecursorMz, NTermFragment, itr.Current.Key, IonSeriesLabels[(int)series], length)));
                                }

                                if (removeMatchedPeaks)
                                {
                                    pointMap.Remove(itr);
                                }
                            }
                        }
                    }

                    // plot every peak still in the map (matched ones included unless removed above)
                    foreach (var pair in pointMap)
                    {
                        precursorScatterPlot.AddPoint(new PointPair(pair.Key - precursorMz, pair.Value / tic, 0, label));
                        chargeReducedScatterPlot.AddPoint(new PointPair(pair.Key - chargeReducedPrecursorMz, pair.Value / tic, 0, label));
                    }
                }

                // refresh the graph every 100 spectra so progress is visible
                if ((spectraCount % 100) == 0)
                {
                    Invoke(new MethodInvoker(() =>
                    {
                        zedGraphControl.MasterPane.AxisChange();
                        zedGraphControl.Refresh();
                    }));
                }
            }
        }
        finally
        {
            if (msd != null)
            {
                msd.Dispose();
            }
        }
    }

    Invoke(new MethodInvoker(() =>
    {
        if (!lockZoomCheckBox.Checked)
        {
            zedGraphControl.ZoomOutAll(zedGraphControl.GraphPane);
        }
        zedGraphControl.MasterPane.AxisChange();
        zedGraphControl.Refresh();
    }));

    // no statistics are returned; the result is always an empty list
    return new List<double>();
}
/// <summary>
/// Loads the peaks for every phospho PSM of one spectrum source, builds the phosphoRS input
/// items and runs the phosphoRS site localization on them.
/// </summary>
/// <param name="sourceFilepath">Path of the spectrum source file to open.</param>
/// <param name="currentSource">Number of this source; used for status and progress reporting.</param>
/// <param name="totalSources">Total number of sources; used for status and progress reporting.</param>
/// <param name="config">phosphoRS settings. When spectrumType is None, the spectrum type is
/// taken from each spectrum's dissociation method instead.</param>
/// <param name="phosphoRows">Rows to attest; Peaks and SpectrumType of every value are filled in.</param>
/// <returns>The PTM result of the localization, or null when the attestation was cancelled.</returns>
/// <exception cref="InvalidDataException">
/// spectrumType is None and a spectrum has an unmapped dissociation method or none at all.
/// </exception>
private phosphoRS.PTMResultClass RunOnSource(string sourceFilepath, int currentSource, int totalSources, PhosphoRSConfig config, IDictionary<long, PhosphoPeptideAttestationRow> phosphoRows)
{
    // shared handling for all cancellation exits
    Func<phosphoRS.PTMResultClass> reportCancelled = () =>
    {
        this.progressBar.ProgressBar.Visible = false;
        _bgWorkerCancelled = true;
        setProgress(-1, "Cancelled.");
        return null;
    };

    var msd = new pwiz.CLI.msdata.MSDataFile(sourceFilepath);
    try
    {
        var spectrumList = msd.run.spectrumList;
        int rowNumber = 0;
        int totalRows = phosphoRows.Count;
        items.Clear();
        var spectrumTypes = new Set<CVID>();

        foreach (var row in phosphoRows)
        {
            // check for cancellation and report progress on the first row and every 100th row
            if ((rowNumber % 100) == 0)
            {
                if (cancelAttestation.IsCancellationRequested)
                {
                    return reportCancelled();
                }

                if (rowNumber == 0)
                {
                    setStatus(String.Format("Reading peaks and creating PhosphoRS objects for source {0} of {1} ({2}): {3} spectra\r\n", currentSource, totalSources, Path.GetFileName(sourceFilepath), totalRows));
                }

                // multiply before dividing: integer division would otherwise give 0 until the last row
                setProgress((int)((rowNumber + 1) * 100L / totalRows),
                            String.Format("Reading peaks and creating PhosphoRS objects for source {0} of {1} ({2}): {3}/{4} spectra", currentSource, totalSources, Path.GetFileName(sourceFilepath), rowNumber + 1, totalRows));
            }

            var pwizSpectrum = spectrumList.spectrum(spectrumList.find(row.Value.SpectrumNativeID), true); //may create indexoutofrange error if no spectrum nativeID

            var OriginalMZs = pwizSpectrum.getMZArray().data; //getMZArray().data returns IList<double>
            var OriginalIntensities = pwizSpectrum.getIntensityArray().data;
            row.Value.Peaks = new phosphoRS.Peak[OriginalMZs.Count];
            for (int i = 0; i < OriginalMZs.Count; ++i)
            {
                row.Value.Peaks[i] = new phosphoRS.Peak(OriginalMZs[i], OriginalIntensities[i]);
            }

            if (config.spectrumType == phosphoRS.SpectrumType.None)
            {
                row.Value.SpectrumType = phosphoRS.SpectrumType.None;
                foreach (var precursor in pwizSpectrum.precursors)
                {
                    foreach (var method in precursor.activation.cvParamChildren(CVID.MS_dissociation_method))
                    {
                        // if dissociation method is set to "Auto" but could not be determined from the file, alert the user
                        if (!spectrumTypeByDissociationMethod.Contains(method.cvid))
                        {
                            throw new InvalidDataException("cannot handle unmapped dissociation method \"" + CV.cvTermInfo(method.cvid).shortName() + "\" for spectrum \"" + row.Value.SourceName + "/" + row.Value.SpectrumNativeID + "\"; please override the method manually");
                        }

                        // don't override ETD (e.g. if there is also supplemental CID)
                        if (row.Value.SpectrumType != phosphoRS.SpectrumType.ECD_ETD)
                        {
                            row.Value.SpectrumType = spectrumTypeByDissociationMethod[method.cvid];
                            spectrumTypes.Add(method.cvid);
                        }
                    }
                }

                if (row.Value.SpectrumType == phosphoRS.SpectrumType.None)
                {
                    throw new InvalidDataException("cannot find a dissociation method for spectrum \"" + row.Value.SourceName + "/" + row.Value.SpectrumNativeID + "\"; please set the method manually");
                }
            }
            else
            {
                row.Value.SpectrumType = config.spectrumType;
            }

            var psm = getPhosphoRS_PSM(config, row.Value);

            // Init the mod map of original variant for this PSM.
            var id2ModMap = new List<System.Tuple<int, List<int>>>
            {
                new System.Tuple<int, List<int>>((int)row.Value.PSMId, row.Value.OriginalPhosphoSites.Keys.ToList<int>())
            };

            items.Add(new System.Tuple<phosphoRS.PeptideSpectrumMatch, List<System.Tuple<int, List<int>>>>(psm, id2ModMap));

            ++rowNumber;
        }

        // report automatically found fragmentation method
        if (config.spectrumType == phosphoRS.SpectrumType.None)
        {
            setStatus(String.Format("Found {0} fragmentation types: {1}\r\n", spectrumTypes.Count, String.Join(", ", spectrumTypes.Keys.Select(o => CV.cvTermInfo(o).shortName()))));
        }

        // multiply before dividing, see the per-row progress above
        setProgress((int)(currentSource * 100L / totalSources),
                    String.Format("Running PhosphoRS on source {0} of {1} ({2})...", currentSource, totalSources, Path.GetFileName(sourceFilepath)));

        // Initialize the localization.
        currentNr = 0;
        var phosphoRS_Context = new phosphoRS.ThreadManagement(this, cancelAttestation, config.maxIsoformCount, config.maxPTMCount, config.scoreNLToo, config.fragmentMassTolerance, config.scoredAA, items.Count);

        // Start the site localization (takes advantage of multi-threading)
        try
        {
            phosphoRS_Context.StartPTMLocalisation();

            // Safety if the attestation module doesn't throw the exception.
            if (cancelAttestation.IsCancellationRequested)
            {
                return reportCancelled();
            }

            return phosphoRS_Context.PTMResult;
        }
        catch (OperationCanceledException)
        {
            return reportCancelled();
        }
    }
    finally
    {
        // covers every exit, including cancellation and exceptions while reading the rows
        msd.Dispose();
    }
}
/// <summary>
/// Loads the peaks for every phospho PSM of one spectrum source, builds the phosphoRS input
/// items and runs the phosphoRS site localization on them.
/// </summary>
/// <param name="sourceFilepath">Path of the spectrum source file to open.</param>
/// <param name="currentSource">Number of this source; used for status and progress reporting.</param>
/// <param name="totalSources">Total number of sources; used for status and progress reporting.</param>
/// <param name="config">phosphoRS settings. When spectrumType is None, the spectrum type is
/// taken from each spectrum's dissociation method instead.</param>
/// <param name="phosphoRows">Rows to attest; Peaks and SpectrumType of every value are filled in.</param>
/// <returns>The PTM result of the localization, or null when the attestation was cancelled.</returns>
/// <exception cref="InvalidDataException">
/// spectrumType is None and a spectrum has an unmapped dissociation method or none at all.
/// </exception>
private phosphoRS.PTMResultClass RunOnSource(string sourceFilepath, int currentSource, int totalSources, PhosphoRSConfig config, IDictionary<long, PhosphoPeptideAttestationRow> phosphoRows)
{
    // shared handling for all cancellation exits
    Func<phosphoRS.PTMResultClass> reportCancelled = () =>
    {
        this.progressBar.ProgressBar.Visible = false;
        _bgWorkerCancelled = true;
        setProgress(-1, "Cancelled.");
        return null;
    };

    var msd = new pwiz.CLI.msdata.MSDataFile(sourceFilepath);
    try
    {
        var spectrumList = msd.run.spectrumList;
        int rowNumber = 0;
        int totalRows = phosphoRows.Count;
        items.Clear();
        var spectrumTypes = new Set<CVID>();

        foreach (var row in phosphoRows)
        {
            // check for cancellation and report progress on the first row and every 100th row
            if ((rowNumber % 100) == 0)
            {
                if (cancelAttestation.IsCancellationRequested)
                {
                    return reportCancelled();
                }

                if (rowNumber == 0)
                {
                    setStatus(String.Format("Reading peaks and creating PhosphoRS objects for source {0} of {1} ({2}): {3} spectra\r\n", currentSource, totalSources, Path.GetFileName(sourceFilepath), totalRows));
                }

                // multiply before dividing: integer division would otherwise give 0 until the last row
                setProgress((int)((rowNumber + 1) * 100L / totalRows),
                            String.Format("Reading peaks and creating PhosphoRS objects for source {0} of {1} ({2}): {3}/{4} spectra", currentSource, totalSources, Path.GetFileName(sourceFilepath), rowNumber + 1, totalRows));
            }

            var pwizSpectrum = spectrumList.spectrum(spectrumList.find(row.Value.SpectrumNativeID), true); //may create indexoutofrange error if no spectrum nativeID

            var OriginalMZs = pwizSpectrum.getMZArray().data; //getMZArray().data returns IList<double>
            var OriginalIntensities = pwizSpectrum.getIntensityArray().data;
            row.Value.Peaks = new phosphoRS.Peak[OriginalMZs.Count];
            for (int i = 0; i < OriginalMZs.Count; ++i)
            {
                row.Value.Peaks[i] = new phosphoRS.Peak(OriginalMZs[i], OriginalIntensities[i]);
            }

            if (config.spectrumType == phosphoRS.SpectrumType.None)
            {
                row.Value.SpectrumType = phosphoRS.SpectrumType.None;
                foreach (var precursor in pwizSpectrum.precursors)
                {
                    foreach (var method in precursor.activation.cvParamChildren(CVID.MS_dissociation_method))
                    {
                        // if dissociation method is set to "Auto" but could not be determined from the file, alert the user
                        if (!spectrumTypeByDissociationMethod.Contains(method.cvid))
                        {
                            throw new InvalidDataException("cannot handle unmapped dissociation method \"" + CV.cvTermInfo(method.cvid).shortName() + "\" for spectrum \"" + row.Value.SourceName + "/" + row.Value.SpectrumNativeID + "\"; please override the method manually");
                        }

                        // don't override ETD (e.g. if there is also supplemental CID)
                        if (row.Value.SpectrumType != phosphoRS.SpectrumType.ECD_ETD)
                        {
                            row.Value.SpectrumType = spectrumTypeByDissociationMethod[method.cvid];
                            spectrumTypes.Add(method.cvid);
                        }
                    }
                }

                if (row.Value.SpectrumType == phosphoRS.SpectrumType.None)
                {
                    throw new InvalidDataException("cannot find a dissociation method for spectrum \"" + row.Value.SourceName + "/" + row.Value.SpectrumNativeID + "\"; please set the method manually");
                }
            }
            else
            {
                row.Value.SpectrumType = config.spectrumType;
            }

            var psm = getPhosphoRS_PSM(config, row.Value);

            // Init the mod map of original variant for this PSM.
            var id2ModMap = new List<System.Tuple<int, List<int>>>
            {
                new System.Tuple<int, List<int>>((int)row.Value.PSMId, row.Value.OriginalPhosphoSites.Keys.ToList<int>())
            };

            items.Add(new System.Tuple<phosphoRS.PeptideSpectrumMatch, List<System.Tuple<int, List<int>>>>(psm, id2ModMap));

            ++rowNumber;
        }

        // report automatically found fragmentation method
        if (config.spectrumType == phosphoRS.SpectrumType.None)
        {
            setStatus(String.Format("Found {0} fragmentation types: {1}\r\n", spectrumTypes.Count, String.Join(", ", spectrumTypes.Keys.Select(o => CV.cvTermInfo(o).shortName()))));
        }

        // multiply before dividing, see the per-row progress above
        setProgress((int)(currentSource * 100L / totalSources),
                    String.Format("Running PhosphoRS on source {0} of {1} ({2})...", currentSource, totalSources, Path.GetFileName(sourceFilepath)));

        // Initialize the localization.
        currentNr = 0;
        var phosphoRS_Context = new phosphoRS.ThreadManagement(this, cancelAttestation, config.maxIsoformCount, config.maxPTMCount, config.scoreNLToo, config.fragmentMassTolerance, config.scoredAA, items.Count);

        // Start the site localization (takes advantage of multi-threading)
        try
        {
            phosphoRS_Context.StartPTMLocalisation();

            // Safety if the attestation module doesn't throw the exception.
            if (cancelAttestation.IsCancellationRequested)
            {
                return reportCancelled();
            }

            return phosphoRS_Context.PTMResult;
        }
        catch (OperationCanceledException)
        {
            return reportCancelled();
        }
    }
    finally
    {
        // covers every exit, including cancellation and exceptions while reading the rows
        msd.Dispose();
    }
}
/// <summary>
/// Fills the fragmentation statistics graphs from a random sample of the filtered PSMs.
/// </summary>
/// <remarks>
/// Up to 1000 PSM ids are sampled from the current view filter. For every sampled spectrum the
/// theoretical fragments (z = 1) of each ion series are looked up in the peak list using the
/// fragment tolerance read from the UI. Per ion series, the matched share of the summed
/// intensity, the matched share of the peak count and the mean m/z error are appended as one
/// jittered point to curve 3 of pane i + 1 of the three graph controls. After all spectra are
/// processed, a six-number summary is added for every ion series with at least 5 values.
/// Spectra without peaks (or with zero summed intensity) contribute no points.
/// </remarks>
/// <exception cref="FileNotFoundException">A spectrum source could not be located.</exception>
private void getFragmentationStatistics ()
{
    IList<object[]> queryRows;
    lock (session)
    {
        var randomIds = session.CreateQuery("SELECT psm.Id " + viewFilter.GetFilteredQueryString(DataFilter.FromPeptideSpectrumMatch))
                               .List<long>()
                               .Shuffle()
                               .Take(1000)
                               .OrderBy(o => o);
        string randomIdSet = String.Join(",", randomIds.Select(o => o.ToString()).ToArray());
        queryRows = session.CreateQuery("SELECT psm.Spectrum.Source.Name, psm.Spectrum, psm, DISTINCT_GROUP_CONCAT(pm.Offset || ':' || mod.MonoMassDelta), psm.Peptide.Sequence " +
                                        "FROM PeptideSpectrumMatch psm " +
                                        "LEFT JOIN psm.Modifications pm " +
                                        "LEFT JOIN pm.Modification mod " +
                                        "WHERE psm.Id IN (" + randomIdSet + ") " +
                                        "GROUP BY psm.Spectrum.id ")
                           .List<object[]>();
    }

    // ordering by source name lets each source file be opened only once
    var spectrumRows = queryRows.Select(o => new SpectrumRow(o)).OrderBy(o => o.SourceName);

    int seriesCount = (int) IonSeries.Count;

    var percentTicBySpectrumByFragmentType = new List<PointPairList>();
    var percentPeakCountBySpectrumByFragmentType = new List<PointPairList>();
    var meanMzErrorBySpectrumByFragmentType = new List<PointPairList>();
    var percentTicListByFragmentType = new List<List<double>>();
    var percentPeakCountListByFragmentType = new List<List<double>>();
    var meanMzErrorListByFragmentType = new List<List<double>>();

    foreach (var graphControl in graphControls)
        graphControl.MasterPane.PaneList.ForEach(o => o.CurveList.ForEach(c => c.Clear()));

    for (int i = 0; i < seriesCount; ++i)
    {
        percentTicBySpectrumByFragmentType.Add(percentTicGraphControl.MasterPane.PaneList[i + 1].CurveList[3].Points as PointPairList);
        percentPeakCountBySpectrumByFragmentType.Add(percentPeakCountGraphControl.MasterPane.PaneList[i + 1].CurveList[3].Points as PointPairList);
        meanMzErrorBySpectrumByFragmentType.Add(meanMzErrorGraphControl.MasterPane.PaneList[i + 1].CurveList[3].Points as PointPairList);
        percentTicListByFragmentType.Add(new List<double>());
        percentPeakCountListByFragmentType.Add(new List<double>());
        meanMzErrorListByFragmentType.Add(new List<double>());
    }

    int spectraCount = 0;
    maxPercentTic = 10;
    maxPercentPeakCount = 10;
    maxMeanMzError = 0.1;

    var tolerance = fragmentTolerance;
    string spectrumListFilters = String.Empty;

    // the tolerance and filter settings live in controls, so they are read on the UI thread
    Invoke(new MethodInvoker(() =>
    {
        tolerance.value = Convert.ToDouble(fragmentToleranceTextBox.Text);
        tolerance.units = (MZTolerance.Units) fragmentToleranceUnitsComboBox.SelectedIndex;
        meanMzErrorGraphControl.GraphPane.YAxis.Title.Text = "Mean m/z error (" + tolerance.units.ToString() + ")";
        spectrumListFilters = spectrumFiltersTextBox.Text;
        setAutomaticScales();
    }));

    // every ion series except the Count sentinel; identical for all spectra
    IonSeries[] ionSeries = Enum.GetValues(typeof(IonSeries)).Cast<IonSeries>().Where(o => o != IonSeries.Count).ToArray();

    // One generator for the whole run: generators created in quick succession can share a
    // time-based seed and would then give every spectrum the same jitter sequence.
    var rng = new Random();

    string currentSourceName = null;
    string currentSourcePath = null;
    pwiz.CLI.msdata.MSDataFile msd = null;

    lock (owner)
    {
        try
        {
            foreach (var row in spectrumRows)
            {
                if (row.SourceName != currentSourceName)
                {
                    currentSourceName = row.SourceName;
                    currentSourcePath = IDPickerForm.LocateSpectrumSource(currentSourceName, session.Connection.GetDataSource());
                    if (String.IsNullOrEmpty(currentSourcePath))
                        throw new FileNotFoundException("source file not found");

                    // release the previous source file before opening the next one
                    if (msd != null)
                    {
                        msd.Dispose();
                        msd = null;
                    }

                    msd = new pwiz.CLI.msdata.MSDataFile(currentSourcePath);
                    SpectrumListFactory.wrap(msd, spectrumListFilters.Split(";".ToCharArray(), StringSplitOptions.RemoveEmptyEntries));
                }

                string spectrumId = String.Format("{0}/{1}", row.SourceName, msdata.id.abbreviate(row.Spectrum.NativeID));
                var spectrumList = msd.run.spectrumList;
                ++spectraCount;

                var pwizPeptide = new proteome.Peptide(row.ModifiedSequence,
                                                       proteome.ModificationParsing.ModificationParsing_Auto,
                                                       proteome.ModificationDelimiter.ModificationDelimiter_Brackets);
                var fragmentation = pwizPeptide.fragmentation(true, true);
                var pwizSpectrum = spectrumList.spectrum(spectrumList.find(row.Spectrum.NativeID), true);
                var pointMap = new seems.PointMap(new ZedGraph.PointPairList(pwizSpectrum.getMZArray().data, pwizSpectrum.getIntensityArray().data));
                double tic = pointMap.Values.Sum();

                // Without peaks or intensity the percentages below would be 0/0 = NaN, and
                // Math.Max would then keep NaN in the running maxima for the rest of the run.
                if (pointMap.Count > 0 && tic > 0)
                {
                    var percentTicByFragmentType = new List<double>(Enumerable.Repeat(0.0, seriesCount));
                    var percentPeakCountByFragmentType = new List<double>(Enumerable.Repeat(0.0, seriesCount));
                    var matchCountByFragmentType = new List<int>(Enumerable.Repeat(0, seriesCount));
                    var meanMzErrorByFragmentType = new List<double>(Enumerable.Repeat(Double.NaN, seriesCount));

                    for (int z = 1; z <= 1; ++z)
                        for (int length = 1, end = pwizPeptide.sequence.Length; length <= end; ++length)
                            foreach (IonSeries series in ionSeries)
                            {
                                // c and x fragments are not looked up at full peptide length
                                if ((series == IonSeries.c || series == IonSeries.x) && length == end)
                                    continue;

                                double expected = fragmentMass(fragmentation, series, length, z);

                                // NOTE(review): expected - (expected - tolerance) presumably turns the
                                // tolerance into an absolute m/z window via MZTolerance's operators - confirm.
                                seems.PointMap.Enumerator itr = pointMap.FindNear(expected, expected - (expected - tolerance));
                                if (itr == null || !itr.IsValid)
                                    continue;

                                int seriesIndex = (int) series;
                                percentTicByFragmentType[seriesIndex] += itr.Current.Value;
                                ++percentPeakCountByFragmentType[seriesIndex];
                                ++matchCountByFragmentType[seriesIndex];

                                // NaN marks "no match yet"; switch to a running sum on the first match
                                if (Double.IsNaN(meanMzErrorByFragmentType[seriesIndex]))
                                    meanMzErrorByFragmentType[seriesIndex] = 0;
                                meanMzErrorByFragmentType[seriesIndex] += mzError(itr.Current.Key, expected);
                            }

                    for (int i = 0; i < percentTicBySpectrumByFragmentType.Count; ++i)
                    {
                        // convert sum to mean
                        if (matchCountByFragmentType[i] > 0)
                            meanMzErrorByFragmentType[i] /= matchCountByFragmentType[i];

                        // convert to percentages
                        percentTicByFragmentType[i] /= tic / 100;
                        percentPeakCountByFragmentType[i] /= pointMap.Count / 100.0;

                        maxPercentTic = Math.Max(maxPercentTic, percentTicByFragmentType[i]);
                        maxPercentPeakCount = Math.Max(maxPercentPeakCount, percentPeakCountByFragmentType[i]);

                        // horizontal jitter in [-0.5, 0.5), shared by the three graphs for this point
                        double jitter = (rng.NextDouble() - 0.5);

                        percentTicBySpectrumByFragmentType[i].Add(jitter,
                                                                  percentTicByFragmentType[i],
                                                                  String.Format("{0}: {1:G4}% ({2} matches)", spectrumId, percentTicByFragmentType[i], matchCountByFragmentType[i]));
                        percentPeakCountBySpectrumByFragmentType[i].Add(jitter,
                                                                        percentPeakCountByFragmentType[i],
                                                                        String.Format("{0}: {1:G4}% ({2} matches)", spectrumId, percentPeakCountByFragmentType[i], matchCountByFragmentType[i]));
                        percentTicListByFragmentType[i].Add(percentTicByFragmentType[i]);
                        percentPeakCountListByFragmentType[i].Add(percentPeakCountByFragmentType[i]);

                        if (!Double.IsNaN(meanMzErrorByFragmentType[i]))
                        {
                            maxMeanMzError = Math.Max(maxMeanMzError, Math.Abs(meanMzErrorByFragmentType[i]));

                            // NOTE(review): the value is an m/z error, yet the label ends in '%' - confirm.
                            meanMzErrorBySpectrumByFragmentType[i].Add(jitter,
                                                                       meanMzErrorByFragmentType[i],
                                                                       String.Format("{0}: {1:G4}%", spectrumId, meanMzErrorByFragmentType[i]));
                            meanMzErrorListByFragmentType[i].Add(meanMzErrorByFragmentType[i]);
                        }
                    }
                }

                // NOTE(review): unlike the initial call, this one is not wrapped in Invoke -
                // confirm that setAutomaticScales is safe to call from this thread.
                if ((spectraCount % 100) == 0)
                    setAutomaticScales();
            } // for each spectrum row
        }
        finally
        {
            if (msd != null)
                msd.Dispose();
        }
    }

    Invoke(new MethodInvoker(() =>
    {
        for (int i = 0; i < percentTicBySpectrumByFragmentType.Count; ++i)
        {
            if (percentTicListByFragmentType[i].Count < 5)
                continue;

            percentTicListByFragmentType[i].Sort();
            percentPeakCountListByFragmentType[i].Sort();
            addSixNumberSummary(percentTicGraphControl.MasterPane.PaneList[i + 1], percentTicListByFragmentType[i]);
            addSixNumberSummary(percentPeakCountGraphControl.MasterPane.PaneList[i + 1], percentPeakCountListByFragmentType[i]);

            // the m/z error list only holds spectra with at least one match, so it can be shorter
            if (meanMzErrorListByFragmentType[i].Count < 5)
                continue;

            meanMzErrorListByFragmentType[i].Sort();
            addSixNumberSummary(meanMzErrorGraphControl.MasterPane.PaneList[i + 1], meanMzErrorListByFragmentType[i]);
        }
    }));
}
/// <summary>
/// Runs spectral clustering, rescues PSMs and updates the idpDB.
/// </summary>
/// <remarks>
/// Steps, in order:
/// <list type="number">
/// <item>optionally copies the idpDB to a ".backup.idpDB" file;</item>
/// <item>queries the unfiltered spectra whose PSMs pass one of the three configured search score thresholds;</item>
/// <item>reads the 100 most intense peaks of each spectrum from its source file;</item>
/// <item>links spectra whose precursor m/z values are within <c>precursorMzTolerance</c> and whose dot product
/// similarity is at least <c>similarityThreshold</c>, then merges linked spectra into clusters;</item>
/// <item>within each cluster of at least <c>minClusterSize</c> spectra, marks unidentified PSMs for rescue when
/// another PSM with the same modified sequence in the cluster is already identified;</item>
/// <item>writes QValue = -1 and Rank = 1 for the rescued PSMs to UnfilteredPeptideSpectrumMatch.</item>
/// </list>
/// The method polls <c>_bgWorkerClustering.CancellationPending</c>; on cancellation it sets
/// <c>_bgWorkerCancelled</c> and returns without updating the database.
/// Q values are not recomputed here; reloading the idpDB is implemented in _bgWorkerClustering_RunWorkerCompleted.
/// </remarks>
/// <exception cref="FileNotFoundException">A spectrum source file could not be located.</exception>
private void RescuePSMsByClustering()
{
    DateTime startTime = DateTime.Now;
    reportProgressDelegate reportProgress = new reportProgressDelegate(setProgress);
    reportStatusDelegate reportStatus = new reportStatusDelegate(setStatus);

    string database = session.Connection.GetDataSource();
    logFile = Path.ChangeExtension(database, ".log.txt");

    string config = string.Format("Parameters:\r\n" +
                                  "PrecursorMZTol: {0} \r\n" +
                                  "FragmentMZTol: {1} \r\n" +
                                  "Similarity Threshold >= {2} \r\n" +
                                  "Rank <= {3} \r\n" +
                                  "Cluster Size >= {4} \r\n" +
                                  "Search Scores: {5}{6}{7};{8}{9}{10};{11}{12}{13} \r\n\r\n",
                                  precursorMzTolerance,
                                  fragmentMzTolerance,
                                  similarityThreshold,
                                  maxRank,
                                  minClusterSize,
                                  searchScore1Name, searchScore1Order, searchScore1Threshold,
                                  searchScore2Name, searchScore2Order, searchScore2Threshold,
                                  searchScore3Name, searchScore3Order, searchScore3Threshold);
    reportStatus(config);

    // back up the original idpDB
    if (backupDB)
    {
        string dbBackupFile = Path.ChangeExtension(database, ".backup.idpDB");
        reportStatus(string.Format("Backing up idpDB to {0} ... ", dbBackupFile));
        reportProgress(-1, "Backing up idpDB");
        File.Copy(database, dbBackupFile, true);
        reportStatus(reportSecondsElapsed((DateTime.Now - startTime).TotalSeconds));
    }

    reportStatus("Querying spectra...");
    reportProgress(-1, "Querying spectra...");

    // A PSM passes when any one of the three configured scores satisfies its threshold.
    // NOTE(review): score names, orders and thresholds are concatenated into the SQL text;
    // presumably they come from a fixed list of choices rather than free text - confirm.
    string scoreFilter =
        "(scoreName.Name = " + "'" + searchScore1Name + "'" + " AND psmScore.Value " + searchScore1Order + searchScore1Threshold.ToString() +
        ") OR (scoreName.Name = " + "'" + searchScore2Name + "'" + " AND psmScore.Value " + searchScore2Order + searchScore2Threshold.ToString() +
        ") OR (scoreName.Name = " + "'" + searchScore3Name + "'" + " AND psmScore.Value " + searchScore3Order + searchScore3Threshold.ToString() + ")";

    // spectrum info for unfiltered PSMs that map to an identified peptide, filtered by search score
    IList<object[]> queryRows;
    lock (session)
    {
        queryRows = session.CreateSQLQuery(
            "SELECT s.Id, source.Name, NativeID, PrecursorMZ " +
            "FROM UnfilteredSpectrum s " +
            "JOIN SpectrumSource source ON s.Source = source.Id " +
            "JOIN UnfilteredPeptideSpectrumMatch psm ON s.Id = psm.Spectrum " +
            "JOIN Peptide p ON p.Id = psm.Peptide " +
            "JOIN PeptideSpectrumMatchScore psmScore ON psm.Id = psmScore.PsmId " +
            "JOIN PeptideSpectrumMatchScoreName scoreName ON psmScore.ScoreNameId=scoreName.Id " +
            "WHERE " + scoreFilter +
            " GROUP BY s.Id"
            ).List<object[]>();
    }

    // spectra that already have a PSM in the filtered table; their PSMs are never rescued
    var foundSpectraList = session.CreateSQLQuery(@"SELECT distinct spectrum FROM PeptideSpectrumMatch").List<object>();
    var foundSpectra = new HashSet<long>();
    {
        long tempLong;
        foreach (var item in foundSpectraList)
            if (long.TryParse(item.ToString(), out tempLong))
                foundSpectra.Add(tempLong);
    }

    // Materialize to a List: re-enumerating the ordered query would invoke the row constructor again
    // and produce a fresh set of objects, losing the peak data assigned below.
    var spectrumRows = queryRows.Select(o => new SpectrumRow(o)).OrderBy(o => o.SourceName).ToList();
    int spectrumRowsCount = spectrumRows.Count;

    reportStatus(reportSecondsElapsed((DateTime.Now - startTime).TotalSeconds));
    reportStatus(string.Format("Extracting peaks for {0} spectra ... ", spectrumRowsCount));

    // Extract peaks for each spectrum; spectrumRows is sorted by SourceName so each source file is opened once.
    string currentSourceName = null;
    msdata.MSData msd = null;
    try
    {
        lock (owner)
        {
            for (int i = 0; i < spectrumRowsCount; ++i)
            {
                if (_bgWorkerClustering.CancellationPending)
                {
                    _bgWorkerCancelled = true;
                    return;
                }

                var row = spectrumRows[i];
                reportProgress((int)(((double)(i + 1) / (double)spectrumRowsCount) * 100), string.Format("Extracting peaks ({0}/{1}) from {2}", i + 1, spectrumRowsCount, row.SourceName));

                if (row.SourceName != currentSourceName)
                {
                    currentSourceName = row.SourceName;
                    string currentSourcePath = IDPickerForm.LocateSpectrumSource(currentSourceName, session.Connection.GetDataSource());
                    if (String.IsNullOrEmpty(currentSourcePath))
                        throw new FileNotFoundException("source file not found");

                    if (msd != null)
                    {
                        // clear the reference so the finally block cannot dispose it twice
                        // if opening the next file throws
                        msd.Dispose();
                        msd = null;
                    }

                    msd = new pwiz.CLI.msdata.MSDataFile(currentSourcePath);

                    // only keep the top 100 peaks; filter syntax:
                    // threshold <count|count-after-ties|absolute|bpi-relative|tic-relative|tic-cutoff> <threshold> <most-intense|least-intense> [int_set(MS levels)]
                    SpectrumListFactory.wrap(msd, "threshold count 100 most-intense");
                }

                var spectrumList = msd.run.spectrumList;
                // may fail with an index-out-of-range error if the native ID is not present in the file
                var pwizSpectrum = spectrumList.spectrum(spectrumList.find(row.SpectrumNativeID), true);
                row.OriginalMZs = pwizSpectrum.getMZArray().data;
                row.OriginalIntensities = pwizSpectrum.getIntensityArray().data;
            }
        }
    }
    finally
    {
        // also reached on cancellation or failure; msd is still null when there were no spectra
        if (msd != null)
            msd.Dispose();
    }

    /*
     * Re-sort spectrumRows by precursor m/z and walk through each spectrum, comparing it to all following
     * spectra within precursorMzTolerance (e.g. compare 1 to 2,3,4, then 2 to 3,4,5, then 3 to 4,5 etc).
     * If the similarity is above similarityThreshold, add a link edge to BOTH spectra.
     * Finally, merge all connected spectra into clusters.
     */
    reportStatus(reportSecondsElapsed((DateTime.Now - startTime).TotalSeconds));
    reportStatus("Computing similarities... ");
    var spectrumRowsOrderByPrecursorMZ = (from randomVar in spectrumRows
                                          orderby randomVar.PrecursorMZ
                                          select randomVar).ToList();
    LinkMap linkMap = new LinkMap(); // spectrum Id as key, directly linked spectra as value
    lock (owner)
    {
        for (int i = 0; i < spectrumRowsCount; ++i)
        {
            if (_bgWorkerClustering.CancellationPending)
            {
                _bgWorkerCancelled = true;
                return;
            }

            var row = spectrumRowsOrderByPrecursorMZ[i];
            reportProgress((int)(((double)(i + 1) / (double)spectrumRowsCount) * 100), "Computing similarities");

            for (int j = i + 1; j < spectrumRowsCount; ++j)
            {
                var nextRow = spectrumRowsOrderByPrecursorMZ[j];

                // rows are sorted by precursor m/z, so no later row can be within tolerance either
                if (Math.Abs(row.PrecursorMZ - nextRow.PrecursorMZ) > precursorMzTolerance)
                    break;

                // Compare pairwise similarity and link spectra passing the threshold to each other.
                // Converting peak intensities to sqrt here was measured 5-fold slower than doing it
                // in DotProductCompareTo, so the raw intensities are passed through.
                Peaks rowPeakList = new Peaks(row.OriginalMZs, row.OriginalIntensities);
                Peaks nextRowPeakList = new Peaks(nextRow.OriginalMZs, nextRow.OriginalIntensities);
                double similarityScore = ClusteringAnalysis.DotProductCompareTo(rowPeakList, nextRowPeakList, fragmentMzTolerance);

                if (similarityScore >= similarityThreshold)
                {
                    // if a -> b, then b -> a
                    linkMap[(long)row.SpectrumId].Add((long)nextRow.SpectrumId);
                    linkMap[(long)nextRow.SpectrumId].Add((long)row.SpectrumId);
                }
            }
        }
    }

    reportStatus(reportSecondsElapsed((DateTime.Now - startTime).TotalSeconds));
    reportStatus("Clustering spectra... ");
    reportProgress(-1, "Clustering spectra");
    linkMap.GetMergedLinkList();
    reportStatus(reportSecondsElapsed((DateTime.Now - startTime).TotalSeconds));

    // free some memory
    queryRows.Clear();
    queryRows = null;
    spectrumRows.Clear();
    spectrumRows = null;
    spectrumRowsOrderByPrecursorMZ.Clear();
    spectrumRowsOrderByPrecursorMZ = null;

    /*
     * Go through each cluster, rescue PSMs if spectra in the same cluster were identified as the same peptide
     */

    // each element is a set of clustered spectrum Ids; keep sets with at least minClusterSize elements
    List<Set<long>> clusterSetList = (from o in linkMap.MergedLinkList
                                      where o.Count >= minClusterSize
                                      select o).ToList();
    int clusterSetListCount = clusterSetList.Count;
    var allSpectrumIDs = (from o in clusterSetList
                          from j in o
                          select j).ToList();
    reportStatus(string.Format("Number of clusters: {0} \r\n", clusterSetListCount));
    reportStatus(string.Format("Number of spectra clustered: {0}/{1} ({2:0.0%}) \r\n", allSpectrumIDs.Count, spectrumRowsCount, (double)allSpectrumIDs.Count / spectrumRowsCount));

    IList<object> identPSMQueryRows;
    lock (session)
    {
        identPSMQueryRows = session.CreateSQLQuery(@"SELECT psm.Id FROM PeptideSpectrumMatch psm").List<object>();
    }
    var identPSMIdSet = new Set<long>(identPSMQueryRows.Select(o => (long)o));
    reportStatus(string.Format("Number of PSMs identified: {0} \r\n", identPSMIdSet.Count));

    // temp table holding the clustered spectrum IDs
    populateTempIdTable("TempSpecIds", allSpectrumIDs);

    // all PSM ids for clustered spectra with rank <= maxRank and a score passing its threshold
    IList<object> allPsmIdQueryRows;
    lock (session)
    {
        allPsmIdQueryRows = session.CreateSQLQuery(
            "SELECT GROUP_CONCAT(psm.Id) " +
            "FROM TempSpecIds " +
            "JOIN UnfilteredPeptideSpectrumMatch psm ON TempSpecIds.Id = psm.Spectrum " +
            "JOIN PeptideSpectrumMatchScore psmScore ON psm.Id = psmScore.PsmId " +
            "JOIN PeptideSpectrumMatchScoreName scoreName ON psmScore.ScoreNameId=scoreName.Id " +
            "WHERE psm.Rank <= " + maxRank.ToString() + " AND (" + scoreFilter + ")" +
            " GROUP BY TempSpecIds.Id, psm.Charge"
            ).List<object>();
    }
    var allPsmIdsRows = allPsmIdQueryRows.Select(o => new PsmIdRow(o)).ToList();
    Set<long> allPsmIds = new Set<long>();
    foreach (var row in allPsmIdsRows)
    {
        allPsmIds.Union(row.PsmIds);
    }
    session.CreateSQLQuery(@"DROP TABLE IF EXISTS TempSpecIds").ExecuteUpdate();

    reportStatus("Querying PSMs...");
    reportProgress(-1, "Querying PSMs");

    // temp table holding the PSM IDs to retrieve
    populateTempIdTable("TempPsmIds", allPsmIds);

    // Retrieve PSM details by PSM id. The peptide sequence is queried for target and decoy proteins
    // separately and then unioned, which fixes missing sequences for peptides shared by target and
    // decoy proteins.
    string queryCmd =
        "SELECT psm.Id as psmId, psm.Peptide,s.Id, source.Name, s.NativeID, psm.Charge, " +
        "IFNULL(GROUP_CONCAT(DISTINCT pm.Offset || ':' || mod.MonoMassDelta),''), " +
        "IFNULL(IFNULL(SUBSTR(pd.Sequence, pi.Offset+1, pi.Length),(SELECT DecoySequence FROM UnfilteredPeptide p WHERE p.Id = pi.Peptide)), " +
        "(SELECT SUBSTR(pd.Sequence, pi.Offset+1, pi.Length) " +
        "FROM UnfilteredPeptideInstance pi " +
        "JOIN UnfilteredProtein pro ON pi.Protein = pro.Id AND pro.IsDecoy = 0 " +
        "LEFT JOIN ProteinData pd ON pi.Protein=pd.Id " +
        "WHERE psm.Peptide = pi.Peptide " +
        "UNION " +
        "SELECT p.DecoySequence " +
        "FROM UnfilteredPeptide p " +
        "JOIN UnfilteredPeptideInstance pi ON p.Id = pi.Peptide " +
        "JOIN UnfilteredProtein pro ON pi.Protein = pro.Id AND pro.IsDecoy = 1 " +
        "WHERE psm.Peptide = pi.Peptide AND p.DecoySequence is not null)), " +
        "GROUP_CONCAT(pro.Accession), psm.QValue, psm.Rank, psmScore.Value, psm.Analysis " +
        "FROM TempPsmIds tempPsmIds " +
        "JOIN UnfilteredPeptideSpectrumMatch psm ON psm.Id = tempPsmIds.Id " +
        "JOIN UnfilteredSpectrum s ON s.Id = psm.Spectrum " +
        "JOIN SpectrumSource source ON s.Source = source.Id " +
        "JOIN UnfilteredPeptideInstance pi ON psm.Peptide = pi.Peptide " +
        "JOIN UnfilteredProtein pro ON pi.Protein = pro.Id " +
        "LEFT JOIN ProteinData pd ON pi.Protein=pd.Id " +
        "LEFT JOIN PeptideModification pm ON psm.Id = pm.PeptideSpectrumMatch " +
        "LEFT JOIN Modification mod ON pm.Modification = mod.Id " +
        "LEFT JOIN PeptideSpectrumMatchScore psmScore ON psm.Id = psmScore.PsmId " +
        "LEFT JOIN PeptideSpectrumMatchScoreName scoreName ON psmScore.ScoreNameId=scoreName.Id " +
        "WHERE scoreName.Name in ( " + "'" + searchScore1Name + "','" + searchScore2Name + "','" + searchScore3Name + "')" +
        " GROUP BY psm.Id";

    IList<object[]> allClusterQueryRows;
    lock (session)
    {
        allClusterQueryRows = session.CreateSQLQuery(queryCmd).List<object[]>();
    }
    var allClusterSpectrumRows = allClusterQueryRows.Select(o => new ClusterSpectrumRow(o)).ToList();
    session.CreateSQLQuery(@"DROP TABLE IF EXISTS TempPsmIds").ExecuteUpdate();

    reportStatus(reportSecondsElapsed((DateTime.Now - startTime).TotalSeconds));
    reportStatus(string.Format("Number of PSMs retrieved: {0} \r\n", allClusterSpectrumRows.Count));
    reportStatus("Rescuing PSMs... ");

    if (writeLog)
    {
        string logHeader = string.Join("\t", new string[] { "SourceName", "NativeID", "Charge", "RescuedSequence", "Protein", "ScoreName", "SearchScore", "BAScore", "QValue", "Rank", "Rank1Sequence", "Rank1Protein", "Rank1SearchScore", "Rank1BAScore", "Rank1Qvalue", "\r\n" });
        File.WriteAllText(logFile, logHeader);
    }

    // key: Id in the unfiltered PSM table, value: reassigned QValue and reassigned Rank
    Dictionary<long, UpdateValues> updateDict = new Dictionary<long, UpdateValues>();
    Set<long> rescuedDistinctSpectraIds = new Set<long>();

    // analysis Id and search score info from the QonverterSettings table
    IList<object[]> qonvertSettingsQueryRows;
    lock (session)
    {
        qonvertSettingsQueryRows = session.CreateSQLQuery("SELECT Id, ScoreInfoByName FROM QonverterSettings").List<object[]>();
    }
    var qonvertSettingRows = qonvertSettingsQueryRows.Select(o => new qonvertSettingRows(o)).ToList();
    Dictionary<long, string> analysisScoreOrder = new Dictionary<long, string>();
    Dictionary<long, string> analysisScoreName = new Dictionary<long, string>();
    foreach (var qonvertSettingRow in qonvertSettingRows)
    {
        analysisScoreOrder.Add(qonvertSettingRow.Id, qonvertSettingRow.ScoreOrder);
        analysisScoreName.Add(qonvertSettingRow.Id, qonvertSettingRow.ScoreName);
    }

    // walk through each cluster to rescue PSMs
    for (int i = 0; i < clusterSetListCount; ++i)
    {
        var clusterSet = clusterSetList[i];

        if (_bgWorkerClustering.CancellationPending)
        {
            _bgWorkerCancelled = true;
            return;
        }

        reportProgress((int)(((double)(i + 1) / (double)clusterSetListCount) * 100), "Rescuing PSMs");

        var clusterSpectrumRows = (from o in allClusterSpectrumRows
                                   where clusterSet.Contains(o.SpectrumId)
                                   select o).ToList();

        // entries are "spectrumId.charge.analysis"; each is resolved at most once per cluster
        Set<string> unprocessedSpecChargeAnalysisSet = new Set<string>();
        // key: modified peptide sequence, value: spectrumId.charge.analysis, score
        var pepSeqDict = new PepDictionary();
        foreach (var row in clusterSpectrumRows)
        {
            pepSeqDict.Add(row.ModifiedSequence, row.SpectrumId, row.Charge, row.Analysis, row.SearchScore, row.PSMId);
            unprocessedSpecChargeAnalysisSet.Add(row.SpectrumId.ToString() + "." + row.Charge.ToString() + "." + row.Analysis.ToString());
        }

        // replace score from sum of search scores to Bayesian average
        pepSeqDict.ComputeBayesianAverage(analysisScoreOrder);

        // sort by score; if tied, second sort by number of linked PSMs
        var sortedPepSeqDictKeys = from k in pepSeqDict.Keys
                                   orderby pepSeqDict[k].FinalScore descending, pepSeqDict[k].PsmIdSpecDict.Count() descending
                                   select k;

        foreach (var pepSeq in sortedPepSeqDictKeys)
        {
            if (unprocessedSpecChargeAnalysisSet.Count == 0)
                break;

            // only rescue when at least one PSM in this cluster is already identified as this peptide
            if (!pepSeqDict[pepSeq].PsmIdSpecDict.Keys.Any(pId => identPSMIdSet.Contains(pId)))
                continue;

            foreach (var psmId in pepSeqDict[pepSeq].PsmIdSpecDict.Keys)
            {
                var row = (from o in clusterSpectrumRows
                           where o.PSMId == psmId
                           select o).First();
                string spec = row.SpectrumId.ToString() + "." + row.Charge.ToString() + "." + row.Analysis.ToString();

                if (!unprocessedSpecChargeAnalysisSet.Contains(spec))
                    continue;

                if (identPSMIdSet.Contains(psmId) || foundSpectra.Contains(row.SpectrumId))
                {
                    // identified PSMs are not processed
                    unprocessedSpecChargeAnalysisSet.Remove(spec);
                }
                else
                {
                    updateDict.Add(psmId, new UpdateValues(-1, 1)); // update QValue = -1, Rank = 1
                    ++rescuedPSMsCount;
                    rescuedDistinctSpectraIds.Add(row.SpectrumId);
                    unprocessedSpecChargeAnalysisSet.Remove(spec);

                    if (writeLog)
                    {
                        string originalRank1Seq = "";
                        string originalRank1Protein = "";
                        string originalRank1Score = "";
                        string originalRank1BAScore = "";
                        string originalRank1Qvalue = "";

                        if (row.Rank != 1)
                        {
                            // there may be more than one rank 1 hit
                            var originalRank1Rows = (from o in clusterSpectrumRows
                                                     where o.SpectrumId == row.SpectrumId && o.Rank == 1 && o.Charge == row.Charge && o.Analysis == row.Analysis
                                                     select new { o.ModifiedSequence, o.Protein, o.SearchScore, o.QValue }).ToList();
                            foreach (var originalRank1Row in originalRank1Rows)
                            {
                                originalRank1Seq += originalRank1Row.ModifiedSequence + ";";
                                originalRank1Protein += originalRank1Row.Protein + ";";
                                originalRank1Score += originalRank1Row.SearchScore.ToString("0.0000") + ";";
                                originalRank1BAScore += pepSeqDict.ContainsKey(originalRank1Row.ModifiedSequence) ? pepSeqDict[originalRank1Row.ModifiedSequence].FinalScore.ToString("0.0000") + ";" : "";
                                originalRank1Qvalue += originalRank1Row.QValue.ToString("0.0000") + ";";
                            }
                        }

                        string logLine = string.Join("\t", new string[] { row.SourceName, row.SpectrumNativeID, row.Charge.ToString(), row.ModifiedSequence, row.Protein, analysisScoreName[row.Analysis], row.SearchScore.ToString("0.0000"), pepSeqDict[pepSeq].FinalScore.ToString("0.0000"), row.QValue.ToString("0.0000"), row.Rank.ToString(), originalRank1Seq, originalRank1Protein, originalRank1Score, originalRank1BAScore, originalRank1Qvalue });
                        using (StreamWriter sw = File.AppendText(logFile))
                        {
                            sw.WriteLine(logLine);
                        }
                    }
                }
            }
        } // end of foreach (var pepSeq in sortedPepSeqDictKeys)
    } // end of for (int i = 0; i < clusterSetListCount; ++i)

    reportStatus(string.Format("{0} seconds elapsed\r\n", (DateTime.Now - startTime).TotalSeconds));

    /*
     * update unfiltered psm table in idpDB
     */
    if (rescuedPSMsCount == 0)
        return;

    reportStatus("Updating idpDB... ");
    var transaction = session.Transaction;
    transaction.Begin();
    try
    {
        using (var updateCmd = session.Connection.CreateCommand())
        {
            updateCmd.CommandText = "UPDATE UnfilteredPeptideSpectrumMatch SET QValue = ?, Rank = ? WHERE Id = ?";
            var updateParameters = new List<System.Data.IDbDataParameter>();
            for (int i = 0; i < 3; ++i)
            {
                updateParameters.Add(updateCmd.CreateParameter());
                updateCmd.Parameters.Add(updateParameters[i]);
            }
            updateCmd.Prepare();

            int updateCount = 0;
            int allUpdateCount = updateDict.Count;
            foreach (KeyValuePair<long, UpdateValues> pair in updateDict)
            {
                updateParameters[0].Value = pair.Value.ReassignedQvalue; // QValue
                updateParameters[1].Value = pair.Value.ReassignedRank;   // Rank
                updateParameters[2].Value = pair.Key;                    // psm id
                updateCmd.ExecuteNonQuery();
                reportProgress((int)(((double)(updateCount + 1) / (double)allUpdateCount) * 100), "Updating idpDB");
                ++updateCount;
            }
        }
        transaction.Commit();
    }
    catch
    {
        // do not leave a half-applied update pending on the shared session
        if (transaction.IsActive)
            transaction.Rollback();
        throw;
    }

    reportStatus(reportSecondsElapsed((DateTime.Now - startTime).TotalSeconds));
    reportStatus(string.Format("Rescued {0} PSMs for {1} distinct spectra\r\n", rescuedPSMsCount, rescuedDistinctSpectraIds.Count));
    reportProgress(0, "Ready");
} // end of RescuePSMsByClustering

/// <summary>
/// (Re)creates a single-column temp table and fills it with the given ids, one prepared INSERT per id.
/// </summary>
/// <param name="tableName">Name of the temp table; dropped first if it already exists.</param>
/// <param name="ids">Ids to insert into the table's INTEGER PRIMARY KEY column.</param>
private void populateTempIdTable(string tableName, IEnumerable<long> ids)
{
    session.CreateSQLQuery("DROP TABLE IF EXISTS " + tableName + "; CREATE TEMP TABLE " + tableName + " (Id INTEGER PRIMARY KEY) ").ExecuteUpdate();

    using (var insertCommand = session.Connection.CreateCommand())
    {
        insertCommand.CommandText = "INSERT INTO " + tableName + " VALUES (?)";
        var idParameter = insertCommand.CreateParameter();
        insertCommand.Parameters.Add(idParameter);
        insertCommand.Prepare();

        foreach (long id in ids)
        {
            idParameter.Value = id;
            insertCommand.ExecuteNonQuery();
        }
    }
}
/// <summary>
/// Plots the peaks of a random sample of identified spectra relative to the precursor m/z.
/// </summary>
/// <remarks>
/// Up to 1000 PSM ids passing the current view filter are sampled at random. For each resulting spectrum,
/// the peaks are read from the source file (after applying the filters entered in the spectrum filters
/// text box) and added to <c>precursorScatterPlot</c> and <c>chargeReducedScatterPlot</c>: peaks within
/// 0.03 m/z of a singly charged theoretical fragment are added tagged with the ion series, and then
/// every peak of the spectrum is added with tag 0. Intensities are plotted as a fraction of the
/// spectrum's total ion current. The graph is refreshed every 100 spectra and once more at the end.
/// </remarks>
/// <returns>Always an empty list; the statistics are only plotted, not returned.</returns>
/// <exception cref="FileNotFoundException">A spectrum source file could not be located.</exception>
private List<double> getPeakStatistics()
{
    IList<object[]> queryRows;
    lock (session)
    {
        var randomIds = session.CreateQuery("SELECT psm.Id " + viewFilter.GetFilteredQueryString(DataFilter.FromPeptideSpectrumMatch))
                               .List<long>()
                               .Shuffle()
                               .Take(1000)
                               .OrderBy(o => o);
        string randomIdSet = String.Join(",", randomIds.Select(o => o.ToString()).ToArray());
        queryRows = session.CreateQuery("SELECT psm.Spectrum.Source.Name, psm.Spectrum, psm, DISTINCT_GROUP_CONCAT(pm.Offset || ':' || mod.MonoMassDelta), psm.Peptide.Sequence " +
                                        "FROM PeptideSpectrumMatch psm " +
                                        "LEFT JOIN psm.Modifications pm " +
                                        "LEFT JOIN pm.Modification mod " +
                                        "WHERE psm.Id IN (" + randomIdSet + ") " +
                                        "GROUP BY psm.Spectrum.id ")
                           .List<object[]>();
    }

    // sorted by source name so that each source file is opened only once
    var spectrumRows = queryRows.Select(o => new SpectrumRow(o)).OrderBy(o => o.SourceName);

    precursorScatterPlot.Clear();
    chargeReducedScatterPlot.Clear();

    int spectraCount = 0;

    // read the filter text on the UI thread
    string spectrumListFilters = String.Empty;
    Invoke(new MethodInvoker(() =>
    {
        spectrumListFilters = spectrumFiltersTextBox.Text;
        zedGraphControl.MasterPane.AxisChange();
        zedGraphControl.Refresh();
    }));

    // settings that do not change from spectrum to spectrum
    bool plotMatchedPeaks = true;
    bool removeMatchedPeaks = false;
    double tolerance = 0.03;
    const int maxFragmentCharge = 1;
    IonSeries[] ionSeries = Enum.GetValues(typeof(IonSeries)).Cast<IonSeries>().Where(o => o != IonSeries.Count).ToArray();

    string currentSourceName = null;
    msdata.MSData msd = null;
    try
    {
        lock (owner)
        {
            foreach (var row in spectrumRows)
            {
                if (row.SourceName != currentSourceName)
                {
                    currentSourceName = row.SourceName;
                    string currentSourcePath = IDPickerForm.LocateSpectrumSource(currentSourceName, session.Connection.GetDataSource());
                    if (String.IsNullOrEmpty(currentSourcePath))
                        throw new FileNotFoundException("source file not found");

                    if (msd != null)
                    {
                        // release the previous source file before opening the next one; clear the
                        // reference so the finally block cannot dispose it twice if opening fails
                        msd.Dispose();
                        msd = null;
                    }

                    msd = new pwiz.CLI.msdata.MSDataFile(currentSourcePath);
                    SpectrumListFactory.wrap(msd, spectrumListFilters.Split(";".ToCharArray(), StringSplitOptions.RemoveEmptyEntries));
                }

                string label = String.Format("{0}/{1}\n{2}",
                                             row.SourceName,
                                             msdata.id.abbreviate(row.Spectrum.NativeID),
                                             row.ModifiedSequence);

                var spectrumList = msd.run.spectrumList;

                ++spectraCount;

                var pwizPeptide = new proteome.Peptide(row.ModifiedSequence,
                                                       proteome.ModificationParsing.ModificationParsing_Auto,
                                                       proteome.ModificationDelimiter.ModificationDelimiter_Brackets);
                var fragmentation = pwizPeptide.fragmentation(true, true);

                var pwizSpectrum = spectrumList.spectrum(spectrumList.find(row.Spectrum.NativeID), true);
                var pointMap = new seems.PointMap(new ZedGraph.PointPairList(pwizSpectrum.getMZArray().data, pwizSpectrum.getIntensityArray().data));
                double tic = pointMap.Values.Sum();

                double precursorMz = row.Spectrum.PrecursorMZ;
                double chargeReducedPrecursorMz = precursorMz * row.PeptideSpectrumMatch.Charge;

                int sequenceLength = pwizPeptide.sequence.Length;
                for (int z = 1; z <= maxFragmentCharge; ++z)
                {
                    for (int length = 1; length <= sequenceLength; ++length)
                    {
                        string NTermFragment = row.ModifiedSequence.Substring(0, length);

                        foreach (IonSeries series in ionSeries)
                        {
                            // c, c-1 and x ions are not looked up at full peptide length
                            if ((series == IonSeries.c || series == IonSeries.cMinus1 || series == IonSeries.x) &&
                                length == sequenceLength)
                                continue;

                            seems.PointMap.Enumerator itr = pointMap.FindNear(fragmentMass(fragmentation, series, length, z), tolerance);
                            if (itr == null || !itr.IsValid)
                                continue;

                            if (plotMatchedPeaks)
                            {
                                precursorScatterPlot.AddPoint(new PointPair(itr.Current.Key - precursorMz,
                                                                            itr.Current.Value / tic,
                                                                            (int)series,
                                                                            String.Format("{0} {1}\n{2} {3} {4} {5}",
                                                                                          label,
                                                                                          precursorMz,
                                                                                          NTermFragment,
                                                                                          itr.Current.Key,
                                                                                          IonSeriesLabels[(int)series],
                                                                                          length)));

                                chargeReducedScatterPlot.AddPoint(new PointPair(itr.Current.Key - chargeReducedPrecursorMz,
                                                                                itr.Current.Value / tic,
                                                                                (int)series,
                                                                                String.Format("{0} {1}\n{2} {3} {4} {5}",
                                                                                              label,
                                                                                              chargeReducedPrecursorMz,
                                                                                              NTermFragment,
                                                                                              itr.Current.Key,
                                                                                              IonSeriesLabels[(int)series],
                                                                                              length)));
                            }

                            if (removeMatchedPeaks)
                                pointMap.Remove(itr);
                        }
                    }
                }

                // every peak still in the map is also plotted with tag 0
                foreach (var pair in pointMap)
                {
                    precursorScatterPlot.AddPoint(new PointPair(pair.Key - precursorMz, pair.Value / tic, 0, label));
                    chargeReducedScatterPlot.AddPoint(new PointPair(pair.Key - chargeReducedPrecursorMz, pair.Value / tic, 0, label));
                }

                // refresh the graph periodically so progress is visible
                if ((spectraCount % 100) == 0)
                {
                    Invoke(new MethodInvoker(() =>
                    {
                        zedGraphControl.MasterPane.AxisChange();
                        zedGraphControl.Refresh();
                    }));
                }
            }
        }
    }
    finally
    {
        // release the last opened source file, also when an exception interrupts the loop
        if (msd != null)
            msd.Dispose();
    }

    Invoke(new MethodInvoker(() =>
    {
        if (!lockZoomCheckBox.Checked)
        {
            zedGraphControl.ZoomOutAll(zedGraphControl.GraphPane);
        }
        zedGraphControl.MasterPane.AxisChange();
        zedGraphControl.Refresh();
    }));

    return new List<double>();
}