Ejemplo n.º 1
0
        /// <summary>
        /// Builds a filtered training-set file for every dataset listed in <c>TrainSetFileLists</c>.
        /// For each dataset an identification file is looked up in the ID result folder
        /// (<c>_IcTda.tsv</c> first, then <c>_msgfdb_syn.txt</c>), the training set is collected via
        /// <c>LcMsFeatureTrain.CollectTrainSet</c>, and one tab-separated row per set is written to
        /// <c>{dataset}.trainset.tsv</c> in the output folder.
        /// </summary>
        /// <remarks>
        /// The test is ignored when the ID result folder does not exist. A dataset is skipped when its
        /// output file already exists or when neither identification file can be found.
        /// </remarks>
        public void CollectTrainingSet()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            TestUtils.ShowStarting(methodName);

            const string idFileFolder  = @"D:\MassSpecFiles\training\IdResult";
            const string outFileFolder = @"D:\MassSpecFiles\training\FilteredIdResult";

            if (!Directory.Exists(idFileFolder))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, idFileFolder);
            }

            foreach (var dataset in TrainSetFileLists)
            {
                var dataname    = Path.GetFileNameWithoutExtension(dataset);
                var idFile      = string.Format(@"{0}\{1}_IcTda.tsv", idFileFolder, dataname);
                var outFileName = string.Format(@"{0}\{1}.trainset.tsv", outFileFolder, dataname);

                // Results from a previous run are kept; only missing outputs are generated.
                if (File.Exists(outFileName))
                {
                    continue;
                }

                Console.WriteLine(dataset);

                if (!File.Exists(idFile))
                {
                    // Fall back to the alternative identification file name.
                    idFile = string.Format(@"{0}\{1}_msgfdb_syn.txt", idFileFolder, dataname);

                    if (!File.Exists(idFile))
                    {
                        Console.WriteLine(@"Skipping file since not found: " + idFile);
                        continue;
                    }
                }

                Console.WriteLine(idFile);

                var targetSets = LcMsFeatureTrain.CollectTrainSet(dataset, idFile);
                Console.WriteLine(targetSets.Count);

                // StreamWriter does not create missing directories; make sure the output folder exists
                // so the test does not fail with DirectoryNotFoundException on a fresh machine.
                Directory.CreateDirectory(outFileFolder);

                // The using block guarantees the file handle is released even if writing a row throws.
                using (var writer = new StreamWriter(outFileName))
                {
                    writer.WriteLine("MinScan\tMaxScan\tMinCharge\tMaxCharge\tMass\tSequence\tModifications");

                    foreach (var prsmSet in targetSets)
                    {
                        writer.Write(prsmSet.MinScanNum);
                        writer.Write("\t");
                        writer.Write(prsmSet.MaxScanNum);
                        writer.Write("\t");
                        writer.Write(prsmSet.MinCharge);
                        writer.Write("\t");
                        writer.Write(prsmSet.MaxCharge);
                        writer.Write("\t");
                        writer.Write(prsmSet.Mass);
                        writer.Write("\t");
                        // Sequence and modifications are taken from the first match of the set.
                        writer.Write(prsmSet[0].Sequence);
                        writer.Write("\t");
                        writer.Write(prsmSet[0].Modifications);
                        //writer.Write("\t")
                        //writer.Write(string.Join(";", prsmSet.Select(prsm => prsm.ScanNum)));
                        // Rows are terminated with a bare line feed (the header uses WriteLine).
                        writer.Write("\n");
                    }
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Generates match-statistics files for every dataset listed in <c>TrainSetFileLists</c>.
        /// For each dataset that has an <c>_IcTda.tsv</c> identification file, the target matches
        /// (from <c>LcMsFeatureTrain.CollectTrainSet</c>) are written to <c>{dataset}_target.tsv</c> and
        /// the decoy matches (read from <c>_IcDecoy.tsv</c>) to <c>{dataset}_decoy.tsv</c>, both through
        /// <c>GetMatchStatistics</c>.
        /// </summary>
        /// <remarks>
        /// The test is ignored when the ID folder does not exist. Datasets without an
        /// identification file are skipped silently.
        /// </remarks>
        public void TestGenerateFrequencyData()
        {
            var methodName = MethodBase.GetCurrentMethod().Name;

            Utils.ShowStarting(methodName);

            const string idFileFolder  = @"D:\MassSpecFiles\training\IdScoring\MSPF_trainset";
            const string outFileFolder = @"D:\MassSpecFiles\training\IdScoring";

            if (!Directory.Exists(idFileFolder))
            {
                Assert.Ignore(@"Skipping test {0} since folder not found: {1}", methodName, idFileFolder);
            }

            // Register the modifications before any identification results are loaded.
            Modification.RegisterAndGetModification(Modification.Cysteinyl.Name, Modification.Cysteinyl.Composition);
            Modification.RegisterAndGetModification(Modification.Phosphorylation.Name, Modification.Phosphorylation.Composition);
            Modification.RegisterAndGetModification(Modification.Methylation.Name, Modification.Methylation.Composition);
            Modification.RegisterAndGetModification(Modification.DiMethylation.Name, Modification.DiMethylation.Composition);
            Modification.RegisterAndGetModification(Modification.TriMethylation.Name, Modification.TriMethylation.Composition);
            Modification.RegisterAndGetModification("Trioxidation", new Composition(0, 0, 0, 3, 0));
            // var aaSet = new AminoAcidSet(@"D:\MassSpecFiles\training\Mods.txt");

            foreach (var dataset in TrainSetFileLists)
            {
                var dataname  = Path.GetFileNameWithoutExtension(dataset);
                var idFile    = string.Format(@"{0}\{1}_IcTda.tsv", idFileFolder, dataname);
                var decoyFile = string.Format(@"{0}\{1}_IcDecoy.tsv", idFileFolder, dataname);
                //  var targetFile = string.Format(@"{0}\{1}_IcTarget.tsv", idFileFolder, dataname);

                if (!File.Exists(idFile))
                {
                    continue;
                }

                var prsmReader = new ProteinSpectrumMatchReader(0.01);
                var prsmList   = prsmReader.LoadIdentificationResult(idFile);

                // NOTE(review): Last() throws InvalidOperationException when no identifications were
                // loaded; presumably the list is ordered so the last entry has the lowest score - confirm.
                var minScore     = prsmList.Last().Score;
                var decoyMatches = prsmReader.ReadMsPathFinderResult(decoyFile, int.MaxValue, 1, Math.Max(minScore - 5, 10));
                var run          = PbfLcMsRun.GetLcMsRun(dataset);

                var spectrumMatchSet = LcMsFeatureTrain.CollectTrainSet(dataset, idFile);
                Console.WriteLine(spectrumMatchSet.Count);

                // Each writer lives in its own using block so the file handle is released even when
                // GetSpectrum or GetMatchStatistics throws part-way through.
                using (var targetWriter = new StreamWriter(string.Format(@"{0}\{1}_target.tsv", outFileFolder, dataname)))
                {
                    foreach (var matches in spectrumMatchSet)
                    {
                        foreach (var match in matches)
                        {
                            // NOTE(review): 'as' yields null for a non-product spectrum; whether
                            // GetMatchStatistics tolerates a null spectrum is not visible here.
                            var spec = run.GetSpectrum(match.ScanNum) as ProductSpectrum;
                            GetMatchStatistics(spec, match.GetSequence(), match.Charge, targetWriter);
                        }
                    }
                }

                using (var decoyWriter = new StreamWriter(string.Format(@"{0}\{1}_decoy.tsv", outFileFolder, dataname)))
                {
                    foreach (var match in decoyMatches)
                    {
                        var sequence = match.GetSequence();
                        var spec     = run.GetSpectrum(match.ScanNum) as ProductSpectrum;
                        GetMatchStatistics(spec, sequence, match.Charge, decoyWriter);
                    }
                }
            }
        }