Ejemplo n.º 1
0
        /// <summary>
        /// Loads the search results, extracts the PSM data from them and hands that data to
        /// <c>ParsePSMs</c> so the search-derived fields of <paramref name="qcData"/> are filled in.
        /// </summary>
        /// <param name="qcData">QC container that receives the parsed search metrics.</param>
        /// <param name="rawData">Raw data collection forwarded to <c>LoadSearchResults</c>.</param>
        /// <param name="rawFile">Not read by this method.</param>
        /// <param name="qcParameters">
        /// QC settings; forwarded to the helpers, and the source of the search algorithm
        /// (<c>searchParameters.SearchAlgorithm</c>) used to interpret the results.
        /// </param>
        public static void ParseSearchResults(this QcDataContainer qcData, RawDataCollection rawData, IRawDataPlus rawFile, QcParameters qcParameters)
        {
            // Step 1: read the search output as XML.
            var searchResultsXml = LoadSearchResults(qcParameters, rawData);

            // Step 2: turn the XML into PSM records, interpreted according to the configured algorithm.
            var psmData = ExtractPsmData(searchResultsXml, qcParameters.searchParameters.SearchAlgorithm);

            // Step 3: derive the QC metrics from the PSMs.
            qcData.ParsePSMs(psmData, qcParameters);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Registers a newly processed raw file in the QC collection, rewrites the QC table and
        /// serializes the whole collection to its XML file.
        /// </summary>
        /// <param name="qcDataCollection">Collection to update and persist.</param>
        /// <param name="newQcData">QC results for the new raw file.</param>
        /// <param name="methodData">Supplies the creation date used as the key of the new entry.</param>
        /// <param name="rawFileName">Raw file path; only its file name is recorded as processed.</param>
        /// <remarks>
        /// If XML serialization throws, the error is logged and the process is terminated with exit code 1.
        /// </remarks>
        public static void UpdateQcCollection(QcDataCollection qcDataCollection, QcDataContainer newQcData, MethodDataContainer methodData, string rawFileName)
        {
            // Record the new entry, keyed by the creation date taken from the method data.
            qcDataCollection.QcData.Add(methodData.CreationDate, newQcData);

            // Remember the file (by name only) so it is recognised as already processed.
            var processedFiles = qcDataCollection.ProcessedRawFiles;
            processedFiles.Add(Path.GetFileName(rawFileName));

            qcDataCollection.WriteQcToTable();
            Console.WriteLine("QC data written to csv file.");

            try
            {
                XmlSerialization.WriteToXmlFile<QcDataCollection>(qcDataCollection.QcFile, qcDataCollection);

                const string savedMessage = "QC file saved successfully";
                Log.Information(savedMessage);
                Console.WriteLine(savedMessage);
            }
            catch (Exception serializationError)
            {
                Log.Error(serializationError, "Failed during serialization of QC data");
                Console.WriteLine("ERROR: failure during serialization of QC data.");

                // Serialization failure is treated as fatal for the whole process.
                Environment.Exit(1);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Computes chromatogram-intensity metrics from the MS1 scans and stores them on
        /// <paramref name="qcData"/>.
        /// </summary>
        /// <remarks>
        /// The threshold is one tenth of the largest summed MS1 scan intensity. The method finds the
        /// retention time of the first and of the last MS1 scan at which
        /// <c>MovingAverage(index, 20)</c> of the summed intensities exceeds that threshold, and writes:
        /// <list type="bullet">
        /// <item><description><c>TimeBeforeFirstScanToExceedPoint1MaxIntensity</c>: the first such retention time.</description></item>
        /// <item><description><c>TimeAfterLastScanToExceedPoint1MaxIntensity</c>: retention time of the last MS1 scan minus the last such retention time.</description></item>
        /// <item><description><c>FractionOfRunAbovePoint1MaxIntensity</c>: (last - first) divided by the retention time of the last MS1 scan.</description></item>
        /// </list>
        /// If no scan exceeds the threshold, both retention times stay at 0.
        /// <c>Enumerable.Max</c>/<c>Enumerable.Last</c> throw when there are no MS1 scans; that is not handled here.
        /// </remarks>
        /// <param name="qcData">QC container that receives the three metrics.</param>
        /// <param name="rawData">Source of the MS1 scan list, per-scan summed intensities and retention times.</param>
        /// <param name="metrics">Not read by this method.</param>
        public static void ChromIntMetrics(this QcDataContainer qcData, RawDataCollection rawData, MetricsData metrics)
        {
            // Second argument handed to MovingAverage (presumably the smoothing window size - confirm against MovingAverage).
            const int smoothingWindow = 20;

            var ms1Scans = rawData.scanIndex.ScanEnumerators[MSOrderType.Ms];
            var summedIntensities = (from x in ms1Scans select rawData.metaData[x].SummedIntensity).ToArray();

            // Threshold: 10% of the maximum summed intensity over all MS1 scans.
            double threshold = summedIntensities.Max() / 10;

            // Scan forward for the first MS1 scan whose smoothed intensity exceeds the threshold.
            double firstRtAboveThreshold = 0;
            for (int i = 0; i < ms1Scans.Length; i++)
            {
                if (summedIntensities.MovingAverage(i, smoothingWindow) > threshold)
                {
                    firstRtAboveThreshold = rawData.retentionTimes[ms1Scans[i]];
                    break;
                }
            }

            // Scan backward for the last MS1 scan whose smoothed intensity exceeds the threshold.
            double lastRtAboveThreshold = 0;
            for (int i = ms1Scans.Length - 1; i >= 0; i--)
            {
                if (summedIntensities.MovingAverage(i, smoothingWindow) > threshold)
                {
                    lastRtAboveThreshold = rawData.retentionTimes[ms1Scans[i]];
                    break;
                }
            }

            // The retention time of the final MS1 scan is used as the length of the run.
            double lastMs1Rt = rawData.retentionTimes[ms1Scans.Last()];

            qcData.TimeBeforeFirstScanToExceedPoint1MaxIntensity = firstRtAboveThreshold;
            qcData.TimeAfterLastScanToExceedPoint1MaxIntensity   = lastMs1Rt - lastRtAboveThreshold;
            qcData.FractionOfRunAbovePoint1MaxIntensity          = (lastRtAboveThreshold - firstRtAboveThreshold) / lastMs1Rt;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Counts, for every labeling site named by the NMod/KMod/XMod search parameters, how many
        /// potential sites occur in <paramref name="psms"/> and how many of them carry a modification.
        /// Prints the totals to the console and stores the resulting hit fractions on <paramref name="qcData"/>.
        /// </summary>
        /// <remarks>
        /// Each non-null modification string is split on '@' and the last piece is used as the site key;
        /// the code below addresses the N-terminus with the key "[" and lysine with the key "K", so NMod
        /// is expected to end in "@[" (otherwise the "[" lookups throw <see cref="KeyNotFoundException"/>).
        /// Two modification strings naming the same site make <c>Dictionary.Add</c> throw.
        /// PSMs with a null or empty sequence are skipped.
        /// When a site never occurs, its fraction is 0/0 in floating point (NaN).
        /// Results are written to <c>LabelingEfficiencyAtNTerm</c>, <c>LabelingEfficiencyAtK</c>, and,
        /// for any other site, <c>LabelingEfficiencyAtX</c> together with <c>LabelX</c>.
        /// </remarks>
        /// <param name="qcData">QC container that receives the labeling efficiencies.</param>
        /// <param name="psms">PSMs to tally; enumerated once.</param>
        /// <param name="searchParameters">Source of the NMod, KMod and XMod modification strings.</param>
        public static void GetModificationFrequency(this QcDataContainer qcData, IEnumerable <PsmData> psms, SearchParameters searchParameters)
        {
            const string nTermKey = "[";
            const string lysineKey = "K";

            string nmod = searchParameters.NMod;
            string kmod = searchParameters.KMod;
            string xmod = searchParameters.XMod;

            var totalSites      = new Dictionary<string, int>();
            var sitesHit        = new Dictionary<string, int>();
            var sitesOfInterest = new List<string>();

            // "Prime" the tallies: one zeroed entry per site named by a configured modification.
            foreach (string modSpec in new[] { nmod, kmod, xmod })
            {
                if (modSpec == null)
                {
                    continue;
                }

                // modification strings look like mass@AA; the part after the last '@' is the site key
                string site = modSpec.Split('@').Last();
                totalSites.Add(site, 0);
                sitesHit.Add(site, 0);
                sitesOfInterest.Add(site);
            }

            foreach (PsmData psm in psms)
            {
                // Nothing to count, and indexing position 0 below would throw.
                if (psm.Seq == null || psm.Seq.Length == 0)
                {
                    continue;
                }

                List<Modification> mods = psm.Mods;

                // Index at which the residue-by-residue pass starts. When an N-terminal modification is
                // configured, position 0 is handled separately first and the pass starts at 1.
                int firstResidueToScan = 0;

                if (nmod != null)
                {
                    firstResidueToScan = 1;

                    // modifications reported at position 0 (N-terminus and/or first residue)
                    List<Modification> modsAtStart = mods.Where(m => m.Loc == 0).ToList();

                    if (psm.Seq[0] == 'K')
                    {
                        int modCount = modsAtStart.Count;

                        if (modCount == 1)
                        {
                            // we can't know which reactive site is modified, so don't include this peptide
                            continue;
                        }

                        bool lysineOfInterest = sitesOfInterest.Contains(lysineKey);

                        if (modCount == 0)
                        {
                            // nothing is labeled
                            totalSites[nTermKey] += 1;
                            if (lysineOfInterest)
                            {
                                totalSites[lysineKey] += 1;
                            }
                        }
                        else if (modCount == 2)
                        {
                            // both the N-terminus and the lysine are labeled
                            totalSites[nTermKey] += 1;
                            sitesHit[nTermKey]   += 1;

                            if (lysineOfInterest)
                            {
                                totalSites[lysineKey] += 1;
                                sitesHit[lysineKey]   += 1;
                            }
                        }
                        // any other count leaves the position-0 tallies untouched
                    }
                    else
                    {
                        // add 1 to total n-termini, because it is always there
                        totalSites[nTermKey] += 1;

                        // the first residue is itself a potential site if it is of interest
                        string firstResidue = psm.Seq[0].ToString();
                        if (sitesOfInterest.Contains(firstResidue))
                        {
                            totalSites[firstResidue] += 1;
                        }

                        foreach (Modification mod in modsAtStart)
                        {
                            // A modification whose mass text appears in the NMod string is counted as an
                            // N-terminal hit; anything else is attributed to the amino acid it names.
                            // NOTE(review): ToString() is culture-sensitive if Mass is numeric - confirm
                            // the text matches the format used in NMod on non-English locales.
                            if (nmod.Contains(mod.Mass.ToString()))
                            {
                                sitesHit[nTermKey] += 1;
                            }
                            else if (sitesOfInterest.Contains(mod.AA))
                            {
                                sitesHit[mod.AA] += 1;
                            }
                        }
                    }
                }

                // now continue with the rest of the sequence
                for (int i = firstResidueToScan; i < psm.Seq.Length; i++)
                {
                    string aa = psm.Seq[i].ToString();
                    if (!sitesOfInterest.Contains(aa))
                    {
                        continue;
                    }

                    // add one to potential labeling sites
                    totalSites[aa] += 1;

                    // A residue counts as hit only when exactly one modification is reported at its position.
                    if (mods.Count(m => m.Loc == i) == 1)
                    {
                        sitesHit[aa] += 1;
                    }
                }
            }

            // spit out some metrics to the console
            foreach (string site in sitesOfInterest)
            {
                if (site == nTermKey)
                {
                    Console.WriteLine("Total N-term sites: {0}", totalSites[nTermKey]);
                }
                else
                {
                    Console.WriteLine("Total {0} sites: {1}", site, totalSites[site]);
                }
            }

            foreach (string site in sitesOfInterest)
            {
                int missed = totalSites[site] - sitesHit[site];
                if (site == nTermKey)
                {
                    Console.WriteLine("Missed modifications at N-term: {0}", missed);
                }
                else
                {
                    Console.WriteLine("Missed modifications at {0}: {1}", site, missed);
                }
            }

            // calculate labelling efficiency for each site and store it on the QC container
            foreach (string site in sitesOfInterest)
            {
                double efficiency = (double)sitesHit[site] / totalSites[site];

                if (site == nTermKey)
                {
                    Console.WriteLine("Modification frequency at N-term: {0}", efficiency);
                    qcData.LabelingEfficiencyAtNTerm = efficiency;
                }
                else
                {
                    Console.WriteLine("Modification frequency at {0}: {1}", site, efficiency);

                    if (site == lysineKey)
                    {
                        qcData.LabelingEfficiencyAtK = efficiency;
                    }
                    else
                    {
                        // anything that is neither N-terminus nor K goes to the "X" slot
                        qcData.LabelingEfficiencyAtX = efficiency;
                        qcData.LabelX = site;
                    }
                }
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Derives identification metrics from a set of PSMs and stores them on <paramref name="qcData"/>.
        /// </summary>
        /// <remarks>
        /// The score cutoff is the 95th percentile (as computed by <c>Percentile(95)</c>) of the decoy
        /// hyperscores; "good" PSMs are the non-decoy PSMs whose hyperscore is above it. From those the
        /// method computes the 3+/2+ and 4+/2+ charge ratios, the fraction of PSMs without missed
        /// cleavages, the mean number of missed cleavages per PSM, the identification rate
        /// (good PSMs / <c>NumSpectra</c>) and the median mass drift. When any of NMod/KMod/XMod is
        /// set, labeling efficiencies are computed as well via <c>GetModificationFrequency</c>.
        /// Ratios are plain floating-point divisions, so a zero denominator yields NaN or infinity
        /// instead of an exception. Progress figures are written to the console.
        /// </remarks>
        /// <param name="qcData">QC container that receives the metrics.</param>
        /// <param name="psmCollection">PSMs to evaluate; its <c>Values</c> are copied into a list.</param>
        /// <param name="qcParameters">Supplies the search parameters (number of searched spectra, modifications).</param>
        public static void ParsePSMs(this QcDataContainer qcData, PsmDataCollection psmCollection, QcParameters qcParameters)
        {
            SearchParameters searchParameters = qcParameters.searchParameters;
            int              numSearched      = searchParameters.NumSpectra;

            // convert the dictionary to a list for easy parsing
            List<PsmData> psms = psmCollection.Values.ToList();

            // score cutoff: 95th percentile of the decoy hyperscores
            double topDecoyScore = psms.Where(p => p.Decoy)
                                   .Select(p => p.Hyperscore)
                                   .ToArray().Percentile(95);

            // Materialise both subsets once; they are counted and iterated several times below.
            List<PsmData> nonDecoys = psms.Where(p => !p.Decoy).ToList();
            List<PsmData> goodPsms  = nonDecoys.Where(p => p.Hyperscore > topDecoyScore).ToList();
            int numGoodPSMs = goodPsms.Count;

            Console.WriteLine("Total hits: {0}", psms.Count);
            Console.WriteLine("Top decoy score: {0}", topDecoyScore);
            Console.WriteLine("Non-decoy hits: {0}", nonDecoys.Count);
            Console.WriteLine("Non-decoy hits above top decoy score: {0}", numGoodPSMs);

            // charge ratios relative to the number of 2+ PSMs
            int CountWithCharge(int charge) => goodPsms.Count(p => p.Charge == charge);

            int    numCharge2      = CountWithCharge(2);
            double chargeRatio3to2 = (double)CountWithCharge(3) / numCharge2;
            double chargeRatio4to2 = (double)CountWithCharge(4) / numCharge2;

            // fraction of good PSMs without any missed cleavage
            int    pepsWithNoMissedCleavages = goodPsms.Count(p => p.MissedCleavages == 0);
            double digestionEfficiency       = (double)pepsWithNoMissedCleavages / numGoodPSMs;
            Console.WriteLine("Digestion efficiency: {0}", digestionEfficiency);

            // get missed cleavage rate, i.e. number of missed cleavages per psm
            double missedCleavageRate = (double)goodPsms.Select(p => p.MissedCleavages).Sum() / numGoodPSMs;
            Console.WriteLine("Missed cleavage rate (/PSM): {0}", missedCleavageRate);

            // calculate ID rate
            double idRate = (double)numGoodPSMs / numSearched;
            Console.WriteLine("IDrate: {0}", idRate);

            // get labeling efficiency metrics, only when at least one labeling modification is configured
            if (searchParameters.NMod != null || searchParameters.KMod != null || searchParameters.XMod != null)
            {
                qcData.GetModificationFrequency(goodPsms, searchParameters);
            }

            // get median mass drift
            qcData.MedianMassDrift = goodPsms.Select(p => p.MassDrift)
                                     .ToArray().Percentile(50);

            qcData.IdentificationRate  = idRate;
            qcData.MissedCleavageRate  = missedCleavageRate;
            qcData.DigestionEfficiency = digestionEfficiency;
            qcData.ChargeRatio3to2     = chargeRatio3to2;
            qcData.ChargeRatio4to2     = chargeRatio4to2;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds the QC record for one raw file from its extracted raw data and computed metrics.
        /// </summary>
        /// <param name="Data">Not read by this method.</param>
        /// <param name="rawData">Extracted raw data: method data, scan index, centroid streams, trailer extras, peak data.</param>
        /// <param name="rawFile">Open raw file; supplies the creation date and is passed to the metrics calculation.</param>
        /// <param name="qcDirectory">Not read by this method.</param>
        /// <param name="fastaDB">Not read by this method.</param>
        /// <returns>A new <c>QcDataContainer</c>; it is not added to any collection here.</returns>
        public static QcDataContainer ProcessQcData(this QcDataCollection Data, RawDataCollection rawData, IRawDataPlus rawFile, string qcDirectory, string fastaDB = null)
        {
            DateTime acquiredOn = rawFile.CreationDate;

            var metrics = new MetricsData();
            metrics.GetMetricsData(metaData: rawData.metaData, rawData: rawData, rawFile: rawFile);

            var container = new QcDataContainer(rawData.rawFileName, acquiredOn);

            // --- instrument and method description ---
            container.Instrument        = rawData.instrument;
            container.ExperimentMsOrder = rawData.methodData.AnalysisOrder;

            var analyzers = rawData.methodData.MassAnalyzers;
            container.Ms1Analyzer = analyzers[MSOrderType.Ms].ToString();
            container.Ms2Analyzer = analyzers[MSOrderType.Ms2].ToString();

            // The MS3 analyzer is only looked up for MS3 experiments.
            container.Ms3Analyzer = container.ExperimentMsOrder == MSOrderType.Ms3
                ? analyzers[MSOrderType.Ms3].ToString()
                : "None";

            // --- values copied straight from the computed metrics ---
            container.TotalScans  = metrics.TotalScans;
            container.NumMs1Scans = metrics.MS1Scans;
            container.NumMs2Scans = metrics.MS2Scans;
            container.NumMs3Scans = metrics.MS3Scans;

            container.Ms1ScanRate = metrics.MS1ScanRate;
            container.Ms2ScanRate = metrics.MS2ScanRate;

            container.MeanDutyCycle = metrics.MeanDutyCycle;
            container.MeanTopN      = metrics.MeanTopN;

            container.MedianPrecursorIntensity       = metrics.MedianPrecursorIntensity;
            container.MedianSummedMs2Intensity       = metrics.MedianSummedMS2Intensity;
            container.MedianMs1IsolationInterference = metrics.MedianMs1IsolationInterference;
            container.MedianMs2FractionConsumingTop80PercentTotalIntensity = metrics.MedianMs2FractionConsumingTop80PercentTotalIntensity;

            container.NumEsiStabilityFlags = NumberOfEsiFlags(rawData);

            container.QuantMeta = metrics.QuantMeta;

            container.GradientTime       = metrics.Gradient;
            container.ColumnPeakCapacity = metrics.PeakCapacity;
            container.ChromIntMetrics(rawData, metrics);

            // --- peak shape medians; not copied for BoxCar data ---
            if (!rawData.isBoxCar)
            {
                var shapeMedians = rawData.peakData.PeakShapeMedians;

                container.PeakShape.Asymmetry.P10 = shapeMedians.Asymmetry.P10;
                container.PeakShape.Asymmetry.P50 = shapeMedians.Asymmetry.P50;

                container.PeakShape.Width.P10 = shapeMedians.Width.P10;
                container.PeakShape.Width.P50 = shapeMedians.Width.P50;
            }

            var scansByOrder = rawData.scanIndex.ScanEnumerators;

            // Median (50th percentile) over all MS1 scans of the summed centroid intensities of each scan.
            container.MedianSummedMs1Intensity = scansByOrder[MSOrderType.Ms]
                .Select(scan => rawData.centroidStreams[scan].Intensities.Sum())
                .ToArray()
                .Percentile(50);

            // Fill-time distributions: all injection times of a given MS order go into one Distribution.
            container.Ms1FillTime = new Distribution(
                scansByOrder[MSOrderType.Ms].Select(scan => rawData.trailerExtras[scan].InjectionTime).ToArray());
            container.Ms2FillTime = new Distribution(
                scansByOrder[MSOrderType.Ms2].Select(scan => rawData.trailerExtras[scan].InjectionTime).ToArray());
            container.Ms3FillTime = new Distribution(
                scansByOrder[MSOrderType.Ms3].Select(scan => rawData.trailerExtras[scan].InjectionTime).ToArray());

            return container;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Runs QC on every not-yet-processed raw file in the configured raw file directory and
        /// saves the accumulated results to "QC.xml" in the QC directory.
        /// </summary>
        /// <remarks>
        /// An existing "QC.xml" is loaded; otherwise the QC directory is created if needed and a new
        /// collection is started. Files that are not valid raw files, or whose creation date already
        /// exists in the QC data, are skipped. When search parameters are configured, a search is run
        /// for each file and its results are parsed into the QC record. The QC table is rewritten after
        /// every file; the XML file is written once at the end.
        /// The process exits with code 1 when the directory contains no raw files and with code 0 when
        /// there is nothing new to process. Exceptions from loading or saving the XML file are logged
        /// and rethrown.
        /// </remarks>
        /// <param name="qcParameters">Raw file directory, QC directory and optional search parameters.</param>
        public static void DoQc(QcParameters qcParameters)
        {
            QcDataCollection qcDataCollection;
            string           dataDirectory    = qcParameters.RawFileDirectory;
            string           qcDirectory      = qcParameters.QcDirectory;
            SearchParameters searchParameters = qcParameters.searchParameters;

            // our qc file
            string qcFile = Path.Combine(qcDirectory, "QC.xml");

            // see if the file exists
            if (File.Exists(qcFile))
            {
                // if so, open it
                try
                {
                    qcDataCollection = XmlSerialization.ReadFromXmlFile <QcDataCollection>(qcFile);
                    Log.Information("QC data file loaded successfully");
                }
                catch (Exception e)
                {
                    Log.Error(e, "Failed while loading QC data");
                    // rethrow without resetting the stack trace
                    throw;
                }
            }
            else
            {
                // if not, check if the directory exists
                if (!Directory.Exists(qcDirectory))
                {
                    Directory.CreateDirectory(qcDirectory);
                }

                qcDataCollection = new QcDataCollection(dataDirectory, qcDirectory);
                Log.Information("Appears to be a new QC directory. New QC data collection created.");
            }

            // get our list of new raw files. it is every raw file in the directory that is not listed in the qc data
            var fileList = Directory.GetFiles(dataDirectory, "*.*", SearchOption.TopDirectoryOnly)
                           .Where(s => s.EndsWith(".raw", StringComparison.OrdinalIgnoreCase)).ToList();

            if (fileList.Count == 0)
            {
                Log.Error("No raw files found in {Directory}", dataDirectory);
                Console.WriteLine("{0} contains no raw files!", dataDirectory);
                Environment.Exit(1);
            }

            fileList.RemoveAll(s => qcDataCollection.ProcessedRawFiles.Contains(Path.GetFileName(s)));

            Log.Information("Raw files in QC queue: {Files}", fileList);

            if (fileList.Count == 0)
            {
                Log.Information("No new files to QC");
                Console.WriteLine("No new files in the directory to QC!");
                Environment.Exit(0);
            }

            Console.WriteLine("{0} file(s) to process", fileList.Count);

            foreach (string fileName in fileList)
            {
                Console.WriteLine("Processing {0}", fileName);

                IFileHeader rawHeader;

                // try to open the raw file header; a file whose header cannot be read is skipped
                try
                {
                    rawHeader = FileHeaderReaderFactory.ReadFile(fileName);
                }
                catch (Exception)
                {
                    Log.Information("{File} is not a valid raw file", fileName);
                    Console.WriteLine("{0} is not a valid raw file, continuing to next file.", fileName);
                    continue;
                }

                // is it a real raw file?
                if (rawHeader.FileType == FileType.RawFile)
                {
                    Log.Information("{File} is a valid raw file", fileName);
                    Log.Information("Creation date: {Date}", rawHeader.CreationDate);
                    Log.Information("File description: {Description}", rawHeader.FileDescription);
                }
                else
                {
                    Log.Information("{File} is not a valid raw file", fileName);
                    Console.WriteLine("{0} is not a valid raw file, continuing to next file.", fileName);
                    continue;
                }
                // okay, it is probably a real raw file, let's do the QC

                // check if the raw file already exists in the QC data with a different name
                // (QC entries are keyed by creation date)
                if (qcDataCollection.QcData.Keys.Contains(rawHeader.CreationDate))
                {
                    Log.Information("A file with the same creation date and time as {File} already exists in the QC data", fileName);
                    Console.WriteLine("{0} appears to already exist in the QC data with the name {1}. Skipping to next file.",
                                      fileName, qcDataCollection.QcData[rawHeader.CreationDate].RawFile);
                    continue;
                }

                using (IRawDataPlus rawFile = RawFileReaderFactory.ReadFile(fileName))
                {
                    rawFile.SelectInstrument(Device.MS, 1);
                    RawDataCollection rawData = new RawDataCollection(rawFile);
                    rawData.ExtractAll(rawFile);

                    QcDataContainer newQcData = ProcessQcData(Data: qcDataCollection, rawData: rawData, rawFile: rawFile, qcDirectory: qcDirectory);

                    // the database search is optional: only run when search parameters were supplied
                    if (searchParameters != null)
                    {
                        Search.WriteSearchMGF(qcParameters, rawData, rawFile, searchParameters.FixedScans);
                        Search.RunSearch(qcParameters, rawData, rawFile);
                        newQcData.ParseSearchResults(rawData, rawFile, qcParameters);
                    }

                    qcDataCollection.QcData.Add(rawFile.CreationDate, newQcData);
                    qcDataCollection.ProcessedRawFiles.Add(Path.GetFileName(rawData.rawFileName));
                    qcDataCollection.WriteQcToTable();
                }

                Log.Information("QC finished: {File}", fileName);
            }

            Log.Information("QC of all files completed");
            Console.WriteLine("QC of all files completed!");

            try
            {
                XmlSerialization.WriteToXmlFile <QcDataCollection>(qcFile, qcDataCollection);
                Log.Information("QC file saved successfully");
                Console.WriteLine("QC file saved successfully");
            }
            catch (Exception e)
            {
                Log.Error(e, "Failed during serialization of QC data");
                // rethrow without resetting the stack trace
                throw;
            }
        }