public virtual List <FeatureLocation> GetSequenceRegions()
        {
            // Load the raw regions from the coordinate file.
            var regions = SequenceRegionUtils.GetSequenceRegions(CoordinateFile);

            // Strip a leading "chr" so sequence names are stored without that prefix.
            foreach (var region in regions)
            {
                if (region.Seqname.StartsWith("chr"))
                {
                    region.Seqname = region.Seqname.StringAfter("chr");
                }
            }

            // Attach sequences from the FASTA file. Only entries whose name starts with the
            // miRNA or tRNA prefix are looked up; every other entry gets an empty sequence.
            if (!string.IsNullOrEmpty(this.FastaFile))
            {
                Console.WriteLine("Reading sequence from {0} ...", this.FastaFile);
                var sequenceByName = SequenceUtils.Read(new FastaFormat(), this.FastaFile).ToDictionary(m => m.Name);

                foreach (var region in regions)
                {
                    bool needsSequence = region.Name.StartsWith(SmallRNAConsts.miRNA) || region.Name.StartsWith(SmallRNAConsts.tRNA);
                    if (!needsSequence)
                    {
                        region.Sequence = string.Empty;
                        continue;
                    }

                    if (!sequenceByName.ContainsKey(region.Name))
                    {
                        // The region keeps whatever sequence it already had.
                        Console.WriteLine("Missing sequence: " + region.Name);
                        continue;
                    }

                    region.Sequence = sequenceByName[region.Name].SeqString;
                }

                sequenceByName.Clear();
            }

            var locations = regions.ConvertAll(m => new FeatureLocation(m)).ToList();

            // Tag each location with a biotype whose name it starts with. There is no early
            // exit, so when several biotypes match, the last one in Biotypes wins.
            foreach (var location in locations)
            {
                foreach (var biotype in SmallRNAConsts.Biotypes)
                {
                    if (location.Name.StartsWith(biotype))
                    {
                        location.Category = biotype;
                    }
                }
            }

            return locations;
        }
        /// <summary>
        /// Digests every protein of the origin FASTA file, computes precursor m/z values for the
        /// resulting peptides and builds a chromatograph processor for those peaks.
        /// </summary>
        /// <returns>A <c>ProteinChromatographProcessor</c> configured with all candidate peaks.</returns>
        protected override IFileProcessor GetFileProcessor()
        {
            proteins = SequenceUtils.Read(new FastaFormat(), base.GetOriginFile());

            var selectedProtease = ProteaseManager.GetProteaseByName(proteases.SelectedItem);
            var digest = new Digest();
            digest.DigestProtease     = selectedProtease;
            digest.MaxMissedCleavages = 2;

            var allPeaks = new List<SimplePeakChro>();

            foreach (var protein in proteins)
            {
                digest.ProteinSequence = protein;
                digest.AddDigestFeatures();

                // Peptides shorter than six residues are discarded before any mass calculation.
                var digested = protein.GetDigestPeptideInfo();
                digested.RemoveAll(m => m.PeptideSeq.Length < 6);

                foreach (var peptide in digested)
                {
                    double neutralMass = aas.MonoPeptideMass(peptide.PeptideSeq);
                    var peptidePeaks = new List<SimplePeakChro>();

                    // Charge states 2 and 3 only; a peak is kept unless its m/z is below 300 or above 2000.
                    for (int z = 2; z <= 3; z++)
                    {
                        double mz = (neutralMass + Atom.H.MonoMass * z) / z;
                        bool outsideWindow = mz < 300 || mz > 2000;
                        if (!outsideWindow)
                        {
                            var peak = new SimplePeakChro();
                            peak.Mz       = mz;
                            peak.Sequence = peptide.PeptideSeq;
                            peak.Charge   = z;
                            peptidePeaks.Add(peak);
                        }
                    }

                    if (peptidePeaks.Count == 0)
                    {
                        continue;
                    }

                    peptide.Annotations[CHRO_KEY] = peptidePeaks;
                    allPeaks.AddRange(peptidePeaks);
                }

                // Drop the peptides that produced no peak in the accepted m/z window.
                digested.RemoveAll(m => !m.Annotations.ContainsKey(CHRO_KEY));
            }

            var rawFiles = new List<string> { rawFile.FullName };
            return new ProteinChromatographProcessor(allPeaks, rawFiles, new RawFileImpl(), ppmTolerance.Value, 2.0, rebuildAll.Checked);
        }
        /// <summary>
        /// Reads the FASTA and GFF inputs and opens the output file.
        /// </summary>
        /// <returns>An array containing the output file name.</returns>
        public override IEnumerable <string> Process()
        {
            Progress.SetMessage("reading fasta file ...");
            var sequenceByName = SequenceUtils.Read(new FastaFormat(), options.FastaFile).ToDictionary(m => m.Name);
            Progress.SetMessage("{0} sequences read ...", sequenceByName.Count);

            // NOTE(review): the writer is opened but nothing is written to it, and the GFF
            // items are read and then discarded - this looks unfinished; confirm intent.
            using (var writer = new StreamWriter(options.OutputFile))
            {
                Progress.SetMessage("reading gff file ...");
                GtfItemFile.ReadFromFile(options.GffFile);
            }

            return new string[] { options.OutputFile };
        }
        /// <summary>
        /// Reads the input sequences, orders them by chromosome name and writes them out as FASTA.
        /// Names without a "_" suffix sort before names that have one.
        /// </summary>
        /// <returns>An array containing the output file name.</returns>
        public override IEnumerable <string> Process()
        {
            Progress.SetMessage("Reading sequences from: " + _options.InputFile + "...");
            var sequences = SequenceUtils.Read(_options.InputFile);

            sequences.Sort((left, right) =>
            {
                // Split each name into the chromosome part (before "_", after "chr")
                // and an optional suffix (after "_").
                var leftChr     = left.Name.StringBefore("_").StringAfter("chr");
                var leftSuffix  = left.Name.Contains("_") ? left.Name.StringAfter("_") : string.Empty;
                var rightChr    = right.Name.StringBefore("_").StringAfter("chr");
                var rightSuffix = right.Name.Contains("_") ? right.Name.StringAfter("_") : string.Empty;

                bool leftPlain  = string.IsNullOrWhiteSpace(leftSuffix);
                bool rightPlain = string.IsNullOrWhiteSpace(rightSuffix);

                // Exactly one side has a suffix: the suffix-free name comes first.
                if (leftPlain != rightPlain)
                {
                    return leftPlain ? -1 : 1;
                }

                // Same kind on both sides: order by chromosome; the suffix only breaks
                // ties when both names actually carry one.
                var order = GenomeUtils.CompareChromosome(leftChr, rightChr);
                if (order == 0 && !leftPlain)
                {
                    order = leftSuffix.CompareTo(rightSuffix);
                }
                return order;
            });

            Progress.SetMessage("Writing sequences to: " + _options.OutputFile + "...");
            SequenceUtils.Write(new FastaFormat(), _options.OutputFile, sequences);

            Progress.SetMessage("Finished.");

            return new[] { _options.OutputFile };
        }
Exemplo n.º 5
0
        /// <summary>
        /// Annotates each peptide of a peptide text file with the name of the first database
        /// sequence that contains its pure sequence, and writes the result to
        /// <paramref name="fileName"/> + ".mutdb".
        /// </summary>
        /// <param name="fileName">Peptide file to annotate.</param>
        /// <returns>An array containing the name of the annotated file.</returns>
        public override IEnumerable <string> Process(string fileName)
        {
            Progress.SetMessage("Reading sequences from " + database + " ...");
            var candidates = SequenceUtils.Read(new FastaFormat(), database);

            // Keep only entries that do not start with "rev_" and whose name contains "|#".
            candidates.RemoveAll(m => m.Name.StartsWith("rev_") || !m.Name.Contains("|#"));

            var format = new MascotPeptideTextFormat();

            Progress.SetMessage("Procesing peptides from " + Path.GetFileName(fileName) + " ...");
            var peptides = format.ReadFromFile(fileName);

            Progress.SetRange(0, peptides.Count);
            foreach (var peptide in peptides)
            {
                Progress.Increment(1);

                var pureSeq = peptide.Annotations["PureSequence"] as string;
                foreach (var candidate in candidates)
                {
                    if (!candidate.SeqString.Contains(pureSeq))
                    {
                        continue;
                    }

                    // First containing sequence wins.
                    peptide.Annotations["MutDB"] = candidate.Name;
                    break;
                }
            }

            var outputFile = fileName + ".mutdb";

            using (var writer = new StreamWriter(outputFile))
            {
                writer.WriteLine(format.PeptideFormat.GetHeader() + "\tMutDB");
                foreach (var peptide in peptides)
                {
                    writer.Write(format.PeptideFormat.GetString(peptide));

                    // The extra column is left empty when no database sequence matched.
                    var mutDbColumn = peptide.Annotations.ContainsKey("MutDB")
                        ? "\t" + peptide.Annotations["MutDB"]
                        : "\t";
                    writer.WriteLine(mutDbColumn);
                }
            }

            return new string[] { outputFile };
        }
        /// <summary>
        /// Reads the regions from the coordinate file, fills in their sequences (from the name
        /// itself and/or from the FASTA file) and converts them to feature locations.
        /// </summary>
        /// <returns>One <c>FeatureLocation</c> per region read.</returns>
        public virtual List <FeatureLocation> GetSequenceRegions()
        {
            var regions = SequenceRegionUtils.GetSequenceRegions(CoordinateFile, GtfFeatureName, BedAsGtf);

            // NOTE(review): applied to every name without checking that it starts with "chr";
            // the result for other names depends on StringAfter - confirm this is intended.
            foreach (var region in regions)
            {
                region.Seqname = region.Seqname.StringAfter("chr");
            }

            // If the first "name:suffix" entry has a suffix made only of MIRNA characters,
            // every name is treated as "name:sequence" and split accordingly.
            var probe = regions.FirstOrDefault(m => m.Name.Contains(":"));
            if (probe != null && probe.Name.StringAfter(":").All(m => MIRNA.Contains(m)))
            {
                foreach (var region in regions)
                {
                    region.Sequence = region.Name.StringAfter(":");
                }
                foreach (var region in regions)
                {
                    region.Name = region.Name.StringBefore(":");
                }
            }

            // Sequences found in the FASTA file replace whatever was set above.
            if (!string.IsNullOrEmpty(this.FastaFile))
            {
                Console.WriteLine("Reading sequence from {0} ...", this.FastaFile);
                var sequenceByName = SequenceUtils.Read(new FastaFormat(), this.FastaFile).ToDictionary(m => m.Name);

                foreach (var region in regions)
                {
                    if (!sequenceByName.ContainsKey(region.Name))
                    {
                        Console.WriteLine("Missing sequence: " + region.Name);
                        continue;
                    }

                    region.Sequence = sequenceByName[region.Name].SeqString;
                }

                sequenceByName.Clear();
            }

            var locations = regions.ConvertAll(m => new FeatureLocation(m)).ToList();
            return locations;
        }
        /// <summary>
        /// Converts every sequence of a FASTA file with <c>MiRnaToDna</c> and saves the result
        /// next to the input, inserting ".dna" in front of the original extension.
        /// </summary>
        /// <param name="fileName">FASTA file to convert.</param>
        /// <returns>An array containing the name of the converted file.</returns>
        public override IEnumerable <string> Process(string fileName)
        {
            Progress.SetMessage("Loading sequences from " + fileName + "...");
            var sequences = SequenceUtils.Read(new FastaFormat(), fileName);

            Progress.SetMessage("Converint {0} sequences ...", sequences.Count);
            foreach (var sequence in sequences)
            {
                sequence.SeqString = MiRnaToDna(sequence.SeqString);
            }

            // e.g. "x.fasta" becomes "x.dna.fasta"
            var originalExtension = Path.GetExtension(fileName);
            var outputFile = Path.ChangeExtension(fileName, ".dna" + originalExtension);

            Progress.SetMessage("Saving {0} sequences to {1}", sequences.Count, outputFile);
            SequenceUtils.Write(new FastaFormat(), outputFile, sequences);

            Progress.SetMessage("Finished!");
            return new string[] { outputFile };
        }
Exemplo n.º 8
0
        /// <summary>
        /// Builds one coverage region per sequence of a feature file. The regions carry no
        /// genomic coordinates ("Unknown" sequence name, start/end -1, strand '*').
        /// </summary>
        /// <param name="featureFile">Sequence file passed to <c>SequenceUtils.Read</c>.</param>
        /// <returns>The coverage regions, in the order the sequences were read.</returns>
        public static List <CoverageRegion> GetSmallRNACoverageRegionFromFasta(string featureFile)
        {
            var regions = new List<CoverageRegion>();

            foreach (var entry in SequenceUtils.Read(featureFile))
            {
                var region = new CoverageRegion
                {
                    Name     = entry.Name,
                    Seqname  = "Unknown",
                    Start    = -1,
                    End      = -1,
                    Strand   = '*',
                    Sequence = entry.SeqString
                };

                // One separate CoverageSite per base, each initialised with DEFAULT_COVERAGE.
                int length = entry.SeqString.Length;
                for (int position = 0; position < length; position++)
                {
                    region.Coverages.Add(new CoverageSite(DEFAULT_COVERAGE));
                }

                regions.Add(region);
            }

            return regions;
        }
        /// <summary>
        /// Predicts sequence variants by comparing candidate MS2 items against a spectral
        /// library, then either writes a validation report (when the matched file exists) or
        /// writes a prediction table plus a FASTA database extended with the variant sequences.
        /// </summary>
        /// <returns>
        /// An array holding <c>options.OutputFile</c> and <c>options.OutputTableFile</c>; both names
        /// are returned regardless of which branch actually wrote files.
        /// </returns>
        /// <exception cref="Exception">Thrown for a variant type this method does not handle.</exception>
        public override IEnumerable <string> Process()
        {
            // Raw files keyed by file name without extension.
            var expRawfileMap = options.RawFiles.ToDictionary(m => Path.GetFileNameWithoutExtension(m));

            Progress.SetMessage("Reading library file ...");
            var liblist = new MS2ItemXmlFormat().ReadFromFile(options.LibraryFile);

            PreprocessingMS2ItemList(liblist);

            // Library entries grouped by charge.
            var lib = liblist.GroupBy(m => m.Charge).ToDictionary(m => m.Key, m => m.ToList());

            Progress.SetMessage("Building library sequence amino acid composition ...");
            // Composition = distinct, sorted peptide characters that have an entry in the substitution map.
            lib.ForEach(m => m.Value.ForEach(l => l.AminoacidCompsition = (from a in l.Peptide
                                                                           where options.SubstitutionDeltaMassMap.ContainsKey(a)
                                                                           select a).Distinct().OrderBy(k => k).ToArray()));

            // Experiment name -> set of first-scan numbers taken from the library entries.
            var expScanMap = (from p in liblist
                              from sq in p.FileScans
                              select sq).ToList().GroupBy(m => m.Experimental).ToDictionary(m => m.Key, m => new HashSet <int>(from l in m select l.FirstScan));

            // Scans of the optional peptides file are merged into the same map.
            if (File.Exists(options.PeptidesFile))
            {
                Progress.SetMessage("Reading peptides file used for excluding scan ...");
                var peptides = new MascotPeptideTextFormat().ReadFromFile(options.PeptidesFile);
                foreach (var pep in peptides)
                {
                    HashSet <int> scans;
                    if (!expScanMap.TryGetValue(pep.Query.FileScan.Experimental, out scans))
                    {
                        scans = new HashSet <int>();
                        expScanMap[pep.Query.FileScan.Experimental] = scans;
                    }
                    scans.Add(pep.Query.FileScan.FirstScan);
                }
            }

            Progress.SetMessage("Reading MS2/MS3 data ...");
            // NOTE(review): presumably the scans in expScanMap are skipped here - confirm in GetCandidateMs2ItemList.
            var result = GetCandidateMs2ItemList(expRawfileMap, expScanMap);

            PreprocessingMS2ItemList(result);

            //new MS2ItemXmlFormat().WriteToFile(options.OutputFile + ".xml", result);

            Progress.SetMessage("Finding SAP ...");
            List <SapPredicted> predicted = new List <SapPredicted>();

            // Overall delta-mass range across every configured substitution.
            var minDeltaMass = options.SubstitutionDeltaMassMap.Values.Min(l => l.Min(k => k.DeltaMass));
            var maxDeltaMass = options.SubstitutionDeltaMassMap.Values.Max(l => l.Max(k => k.DeltaMass));

            Progress.SetRange(0, result.Count);
            Progress.Begin();

            FindCandidates(lib, result, predicted, minDeltaMass, maxDeltaMass);

            // Regroup the predictions by the file scans of their MS2 item, then rebuild
            // "predicted" with only the best prediction(s) of each group.
            var groups = predicted.ToGroupDictionary(m => m.Ms2.GetFileScans());

            predicted.Clear();
            foreach (var g in groups.Values)
            {
                // Within a scan group, sub-group by library entry and sort the sub-groups.
                var gg = g.ToGroupDictionary(m => m.LibMs2).Values.ToList();
                gg.Sort((m1, m2) =>
                {
                    return(CompareSapPrecitedList(m1, m2));
                });

                // An "expected" prediction in the top sub-group is kept alone.
                var expect = gg[0].FirstOrDefault(m => m.IsExpect);
                if (expect != null)
                {
                    predicted.Add(expect);
                }
                else
                {
                    // Otherwise keep the top sub-group and every following one that compares equal to it.
                    predicted.AddRange(gg[0]);
                    for (int i = 1; i < gg.Count; i++)
                    {
                        if (CompareSapPrecitedList(gg[0], gg[i]) == 0)
                        {
                            predicted.AddRange(gg[i]);
                        }
                        else
                        {
                            break;
                        }
                    }
                }
            }

            if (File.Exists(options.MatchedFile))
            {
                // Validation mode: only the validation report is written.
                new SapPredictedValidationWriter(options.MatchedFile).WriteToFile(options.OutputFile, predicted);
            }
            else
            {
                new SapPredictedWriter().WriteToFile(options.OutputTableFile, predicted);

                Progress.SetMessage("Generating SAP sequence ...");
                List <Sequence> predictedSeq = new List <Sequence>();
                foreach (var predict in predicted)
                {
                    var seq = PeptideUtils.GetPureSequence(predict.LibMs2.Peptide);
                    if (predict.Target.TargetType == VariantType.SingleAminoacidPolymorphism)
                    {
                        // Substitute at every position holding the source residue, once per target.
                        for (int i = 0; i < seq.Length; i++)
                        {
                            if (seq[i] == predict.Target.Source[0])
                            {
                                foreach (var t in predict.Target.Target)
                                {
                                    string targetSeq;
                                    if (i == 0)
                                    {
                                        targetSeq = t + seq.Substring(1);
                                    }
                                    else
                                    {
                                        targetSeq = seq.Substring(0, i) + t + seq.Substring(i + 1);
                                    }

                                    // The reference encodes variant sequence, type, original sequence, source, 1-based position and target.
                                    var reference = string.Format("sp|SAP_{0}_{1}|{2}_{3}_{4}_{5}", targetSeq, predict.Target.TargetType, seq, predict.Target.Source, i + 1, t);
                                    predictedSeq.Add(new Sequence(reference, targetSeq));
                                }
                            }
                        }
                    }
                    else
                    {
                        // Terminal loss/extension: each target is already the full variant sequence;
                        // the reference records the part of the sequence that was lost or added.
                        foreach (var tseq in predict.Target.Target)
                        {
                            string reference;
                            if (predict.Target.TargetType == VariantType.NTerminalLoss)
                            {
                                reference = string.Format("sp|SAP_{0}_{1}|{2}_loss_{3}", tseq, predict.Target.TargetType, seq, seq.Substring(0, seq.Length - tseq.Length));
                            }
                            else if (predict.Target.TargetType == VariantType.CTerminalLoss)
                            {
                                reference = string.Format("sp|SAP_{0}_{1}|{2}_loss_{3}", tseq, predict.Target.TargetType, seq, seq.Substring(tseq.Length));
                            }
                            else if (predict.Target.TargetType == VariantType.NTerminalExtension)
                            {
                                reference = string.Format("sp|SAP_{0}_{1}|{2}_ext_{3}", tseq, predict.Target.TargetType, seq, tseq.Substring(0, tseq.Length - seq.Length));
                            }
                            else if (predict.Target.TargetType == VariantType.CTerminalExtension)
                            {
                                reference = string.Format("sp|SAP_{0}_{1}|{2}_ext_{3}", tseq, predict.Target.TargetType, seq, tseq.Substring(seq.Length));
                            }
                            else
                            {
                                throw new Exception("I don't know how to deal with " + predict.Target.TargetType.ToString());
                            }

                            predictedSeq.Add(new Sequence(reference, tseq));
                        }
                    }
                }

                // Keep one entry per distinct variant sequence (the first one generated).
                predictedSeq = (from g in predictedSeq.GroupBy(m => m.SeqString)
                                select g.First()).ToList();

                Progress.SetMessage("Reading database {0} ...", options.DatabaseFastaFile);
                var databases = SequenceUtils.Read(options.DatabaseFastaFile);

                Progress.SetMessage("Removing variant sequences which are already existed in database ...");
                // Iterate backwards so RemoveAt does not disturb the indices still to be visited.
                for (int i = predictedSeq.Count - 1; i >= 0; i--)
                {
                    foreach (var db in databases)
                    {
                        if (db.SeqString.Contains(predictedSeq[i].SeqString))
                        {
                            predictedSeq.RemoveAt(i);
                            break;
                        }
                    }
                }
                // The output database is the original entries followed by the remaining variants.
                databases.AddRange(predictedSeq);

                Progress.SetMessage("Writing SAP sequence and original database to {0} ...", options.OutputFile);

                SequenceUtils.Write(new FastaFormat(), options.OutputFile, databases);
            }

            Progress.End();

            return(new string[] { options.OutputFile, options.OutputTableFile });
        }
Exemplo n.º 10
0
        /// <summary>
        /// Searches pNovo de novo results for peptides that look like mutated forms of the
        /// peptides digested from the target FASTA file. Writes a statistics file, a FASTA file
        /// containing the original database plus the mutation sequences, and a peptide file with
        /// the spectra supporting those mutations.
        /// </summary>
        /// <returns>An array containing the name of the generated mutation FASTA file.</returns>
        /// <exception cref="UserTerminatedException">Thrown when the user cancels while the worker threads run.</exception>
        /// <exception cref="Exception">Thrown when a key of <c>miss1</c> contains no cleavage site.</exception>
        public override IEnumerable <string> Process()
        {
            HashSet <string> pnovoseqs = new HashSet <string>();

            var pnovoParser = new PNovoPlusParser(options.TitleParser);

            pnovoParser.Progress = this.Progress;

            // Find an amino acid that is not involved in cleavage; it stands in for the
            // (unknown) residues flanking a de novo sequence.
            var anotheraa = 'A';

            for (int i = 0; i < 26; i++)
            {
                anotheraa = (char)('A' + i);
                if (options.Enzyme.CleaveageResidues.Contains(anotheraa) || options.Enzyme.NotCleaveResidues.Contains(anotheraa))
                {
                    continue;
                }
                break;
            }

            Progress.SetRange(0, options.PnovoFiles.Length);
            int totalSpectrumCount     = 0;
            int totalSpectrumPassScore = 0;

            // First pass over the pNovo files: collect the distinct de novo peptide sequences.
            foreach (var pnovoFile in options.PnovoFiles)
            {
                Progress.SetMessage("Reading " + pnovoFile + " ...");
                int spectrumCount = pnovoParser.GetSpectrumCount(pnovoFile);
                var curSpectra    = pnovoParser.ParsePeptides(pnovoFile, 10, options.MinScore);

                totalSpectrumCount     += spectrumCount;
                totalSpectrumPassScore += curSpectra.Count;

                RemoveMissCleavagePeptides(anotheraa, curSpectra);

                pnovoseqs.UnionWith(from c in curSpectra
                                    from p in c.Peptides
                                    select p.PureSequence);
                Progress.Increment(1);
            }


            var pNovoStat = Path.Combine(options.TargetDirectory, "pNovo.SAP.stat");

            using (StreamWriter sw = new StreamWriter(pNovoStat))
            {
                sw.WriteLine("Total Spectrum Count\t" + totalSpectrumCount.ToString());
                sw.WriteLine("Total Peptide-Spectrum-Match Passed Score Filter\t" + totalSpectrumPassScore.ToString());
            }

            Progress.SetPosition(0);
            Progress.SetMessage("Reading " + options.TargetFastaFile + " ...");
            var seqs = SequenceUtils.Read(new FastaFormat(), options.TargetFastaFile);

            Progress.SetMessage("Digesting sequences ...");

            // NOTE(review): presumably fills the miss0 / miss1 members used below - confirm in GetDigestPeptide.
            GetDigestPeptide(seqs);

            seqs.Clear();
            seqs.TrimExcess();
            GC.Collect();
            GC.WaitForFullGCComplete();

            // Remove every de novo peptide that is identical to a peptide of the theoretical library.
            Progress.SetMessage("Removing identical peptides ...");
            pnovoseqs.ExceptWith(miss0.Keys);

            var pnovoArray = pnovoseqs.ToArray();

            pnovoseqs.Clear();
            GC.Collect();
            GC.WaitForFullGCComplete();

            miss0group = miss0.Keys.ToGroupDictionary(m => m.Length);

            var type2seqs = new List <Type2Sequence>();
            var type2_2   = new List <string>();

            foreach (var m in miss1.Keys)
            {
                // Locate the first cleavage site inside the peptide; maxpos is the index of the
                // residue in front of that site.
                int maxpos = -1;
                for (int i = 1; i < m.Length; i++)
                {
                    if (options.Enzyme.IsCleavageSite(m[i - 1], m[i], anotheraa))
                    {
                        maxpos = i - 1;
                        break;
                    }
                }

                if (maxpos == -1)
                {
                    throw new Exception("There is no misscleavage in " + m);
                }

                if (maxpos == 0)
                {
                    // The site follows the very first residue: indexed below by the rest of the sequence.
                    type2_2.Add(m);
                }
                else
                {
                    type2seqs.Add(new Type2Sequence()
                    {
                        Sequence      = m,
                        PriorSequence = m.Substring(0, maxpos),
                        PostSequence  = m.Substring(maxpos + 1)
                    });
                }
            }
            miss1type2_1 = type2seqs.ToGroupDictionary(m => GetType2Key(m.Sequence));
            miss1type2_2 = type2_2.ToGroupDictionary(m => m.Substring(1));

            miss0type3 = miss0.Keys.ToGroupDictionary(m => GetType3Key(m));

            type2seqs.Clear();
            GC.Collect();
            GC.WaitForFullGCComplete();

            Progress.SetMessage("Finding mutation ...");
            Progress.SetRange(0, pnovoArray.Length);

            // Split the de novo sequences into one bin per worker thread; the last bin
            // takes whatever remains after the integer division.
            var totalCount           = pnovoArray.Length;
            var binSize              = totalCount / options.ThreadCount;
            List <FindParam> fparams = new List <FindParam>();
            List <Thread>    threads = new List <Thread>();
            var startPos             = 0;

            for (int i = 0; i < options.ThreadCount; i++)
            {
                int count;
                if (i == options.ThreadCount - 1)
                {
                    count = pnovoArray.Length - startPos;
                }
                else
                {
                    count = binSize;
                }
                List <string> binSeq = new List <string>();
                binSeq.AddRange(pnovoArray.Skip(startPos).Take(count));
                startPos = startPos + count;

                var aparam = new FindParam()
                {
                    PnovoSeqs = binSeq
                };
                fparams.Add(aparam);

                Thread at = new Thread(this.FindMutation);
                threads.Add(at);
                at.IsBackground = true;
                at.Start(aparam);
            }

            pnovoArray = null;
            GC.Collect();
            GC.WaitForFullGCComplete();

            var startTime = DateTime.Now;

            // Poll the workers once per second until all of them have finished.
            Progress.SetRange(0, totalCount);
            while (true)
            {
                Thread.Sleep(1000);

                if (Progress.IsCancellationPending())
                {
                    throw new UserTerminatedException();
                }

                int finishedCount = fparams.Sum(m => m.FinishedCount);
                Progress.SetPosition(finishedCount);

                if (finishedCount == 0)
                {
                    // No progress to report yet (and the estimate below would divide by zero).
                    // Still leave the loop when every worker has already exited, otherwise a
                    // run in which nothing gets counted would wait here forever.
                    if (threads.All(m => !m.IsAlive))
                    {
                        break;
                    }
                    continue;
                }

                // Linear extrapolation of the total duration from the work done so far.
                var curTime       = DateTime.Now;
                var costTime      = curTime - startTime;
                var totalCostTime = new TimeSpan(costTime.Ticks * totalCount / finishedCount);
                var finishTime    = curTime + new TimeSpan(costTime.Ticks * (totalCount - finishedCount) / finishedCount);

                StringBuilder costFormat = new StringBuilder();
                if (totalCostTime.TotalHours >= 2.0)
                {
                    costFormat.Append(Math.Truncate(totalCostTime.TotalHours).ToString() + " hours and ");
                }
                else if (totalCostTime.TotalHours >= 1.0)
                {
                    costFormat.Append("one hour and ");
                }
                costFormat.Append(totalCostTime.Minutes.ToString() + " minutes");

                Progress.SetMessage("Finding mutation {0} / {1}, will cost {2} and finish at {3} ...", finishedCount, totalCount, costFormat, finishTime);

                int finishedThreadCount = threads.Count(m => !m.IsAlive);
                if (finishedThreadCount == threads.Count)
                {
                    break;
                }
            }

            int type1 = fparams.Sum(m => m.Type1Count);
            int type2 = fparams.Sum(m => m.Type2Count);
            int type3 = fparams.Sum(m => m.Type3Count);

            // Append the per-type counts to the statistics file created earlier.
            using (StreamWriter sw = new StreamWriter(pNovoStat, true))
            {
                sw.WriteLine("Type1 Count\t" + type1.ToString());
                sw.WriteLine("Type2 Count\t" + type2.ToString());
                sw.WriteLine("Type3 Count\t" + type3.ToString());
            }

            var singleMutation = (from f in fparams
                                  from s in f.Sequences
                                  select s).ToList();

            string newFastaFile = new FileInfo(options.TargetDirectory + "/" + FileUtils.ChangeExtension(new FileInfo(options.DatabaseFastaFile).Name, "mutation.fasta")).FullName;

            // New database = the original database file copied as-is, followed by the mutation sequences.
            using (StreamWriter sw = new StreamWriter(newFastaFile))
            {
                using (StreamReader sr = new StreamReader(options.DatabaseFastaFile))
                {
                    string line = sr.ReadToEnd();
                    sw.WriteLine(line);

                    foreach (var seq in singleMutation)
                    {
                        sw.WriteLine(">" + seq.Reference);
                        sw.WriteLine(seq.SeqString);
                    }
                }
            }

            // Second pass over the pNovo files: keep only spectra with at least one peptide
            // whose sequence is one of the mutation sequences.
            Progress.SetRange(0, options.PnovoFiles.Length);
            var sapSequences = new HashSet <string>(singleMutation.ConvertAll(m => m.SeqString));
            List <IIdentifiedSpectrum> allSpectra = new List <IIdentifiedSpectrum>();

            foreach (var pnovoFile in options.PnovoFiles)
            {
                Progress.SetMessage("Reading " + pnovoFile + " ...");
                var curSpectra = pnovoParser.ParsePeptides(pnovoFile, 10, options.MinScore);

                RemoveMissCleavagePeptides(anotheraa, curSpectra);

                curSpectra.RemoveAll(m => !m.Peptides.Any(n => sapSequences.Contains(n.PureSequence)));
                allSpectra.AddRange(curSpectra);
                Progress.Increment(1);
            }

            var pNovoPeptides = Path.Combine(options.TargetDirectory, "pNovo.SAP.peptides");

            new MascotPeptideTextFormat("\tFileScan\tSequence\tCharge\tScore\tDeltaScore").WriteToFile(pNovoPeptides, allSpectra);

            Progress.SetMessage("Finished.");
            Progress.End();

            return(new string[] { newFastaFile });
        }
Exemplo n.º 11
0
        /// <summary>
        /// Builds the smallRNA coordinate database: collects miRNA, tRNA and other smallRNA
        /// locations from the configured miRBase, UCSC tRNA and Ensembl GTF files (plus optional
        /// rRNA sequences), normalizes chromosome names against the genome fasta and writes the
        /// BED-style outputs (<c>OutputFile</c>, <c>.miRNA.bed</c>, <c>.miss1</c>, <c>.miss0</c>),
        /// a biotype summary (<c>.info</c>), an optional combined <c>.fasta</c> and the extracted
        /// sequences (<c>.fa</c>).
        /// </summary>
        /// <returns>
        /// The coordinate file name, followed by the combined fasta file name when mature tRNA
        /// or rRNA sequences were appended to the genome.
        /// </returns>
        public override IEnumerable<string> Process()
        {
            var paramFile = options.OutputFile + ".param";
            options.SaveToFile(paramFile);

            var bedfile = new BedItemFile<BedItem>(6);

            Progress.SetMessage("building chromosome name map ...");

            // Short mitochondrial name ("M" or "MT"); updated while scanning the genome names.
            var mitoName = "M";
            var chrNameMap = new Dictionary<string, string>();

            // Prefer the fasta index when present: only the first column (sequence name) is needed.
            var faiFile = options.FastaFile + ".fai";
            if (File.Exists(faiFile))
            {
                using (var sr = new StreamReader(faiFile))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        RegisterChromosomeName(chrNameMap, line.Split('\t')[0], ref mitoName);
                    }
                }
            }
            else
            {
                var fastaFormat = new FastaFormat(int.MaxValue);
                using (var sr = new StreamReader(options.FastaFile))
                {
                    Sequence seq;
                    while ((seq = fastaFormat.ReadSequence(sr)) != null)
                    {
                        RegisterChromosomeName(chrNameMap, seq.Name, ref mitoName);
                    }
                }
            }

            // A genome without a mitochondrial sequence has no map entry; fall back to the short
            // name instead of failing with a KeyNotFoundException.
            string longMitoName;
            if (!chrNameMap.TryGetValue(mitoName, out longMitoName))
            {
                longMitoName = mitoName;
            }

            Progress.SetMessage("mitochondral chromosome name = {0}", longMitoName);

            var mirnas = new List<BedItem>();

            if (File.Exists(options.MiRBaseFile))
            {
                Progress.SetMessage("Processing {0} ...", options.MiRBaseFile);

                if (options.MiRBaseFile.EndsWith(".bed"))
                {
                    mirnas = bedfile.ReadFromFile(options.MiRBaseFile);
                    mirnas.ForEach(m =>
                    {
                        m.Seqname = m.Seqname.StringAfter("chr");
                        m.Name = options.MiRBaseKey + ":" + m.Name;
                    });
                }
                else
                {
                    using (var gf = new GtfItemFile(options.MiRBaseFile))
                    {
                        GtfItem item;
                        while ((item = gf.Next(options.MiRBaseKey)) != null)
                        {
                            // The start is shifted by one to produce the BED item start.
                            var loc = new BedItem();
                            loc.Seqname = item.Seqname.StringAfter("chr");
                            loc.Start = item.Start - 1;
                            loc.End = item.End;
                            loc.Name = options.MiRBaseKey + ":" + item.Attributes.StringAfter("Name=").StringBefore(";");
                            loc.Score = 1000;
                            loc.Strand = item.Strand;
                            mirnas.Add(loc);
                        }
                    }
                }

                Progress.SetMessage("{0} miRNA readed.", mirnas.Count);
            }

            var trnas = new List<BedItem>();

            // Evaluated once: it is also consulted for every gene record of the Ensembl GTF below.
            var hasUcscTrna = File.Exists(options.UcscTrnaFile);
            if (hasUcscTrna)
            {
                // Read tRNA from the UCSC table, excluding mitochondrial tRNA.
                Progress.SetMessage("Processing {0} ...", options.UcscTrnaFile);
                trnas = bedfile.ReadFromFile(options.UcscTrnaFile);
                trnas.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));

                // Filter only when at least one tRNA would survive; otherwise (every sequence
                // name is long and non-numeric) the list is kept untouched.
                if (trnas.Any(m => !IsOffPrimaryChromosome(m.Seqname)))
                {
                    // Remove the tRNA not from 1-22, X and Y.
                    trnas.RemoveAll(m => IsOffPrimaryChromosome(m.Seqname));

                    // Mitochondrial tRNA will be extracted from the Ensembl GTF file.
                    trnas.RemoveAll(m => m.Seqname.Equals("M") || m.Seqname.Equals("MT"));
                }

                trnas.ForEach(m => m.Name = GetTRNAName(m.Name));

                Progress.SetMessage("{0} tRNA from ucsc readed.", trnas.Count);

                if (File.Exists(options.UcscMatureTrnaFastaFile))
                {
                    // Each mature tRNA becomes its own pseudo-chromosome spanning the full sequence.
                    var seqs = SequenceUtils.Read(options.UcscMatureTrnaFastaFile);
                    foreach (var seq in seqs)
                    {
                        var tRNAName = GetTRNAName(seq.Name);
                        trnas.Add(new BedItem()
                        {
                            Seqname = seq.Name,
                            Start = 0,
                            End = seq.SeqString.Length,
                            Strand = '+',
                            Name = tRNAName,
                            Sequence = seq.SeqString
                        });
                    }
                }
            }

            var others = new List<BedItem>();

            if (File.Exists(options.EnsemblGtfFile))
            {
                // Read smallRNA/tRNA from the Ensembl GTF file.
                Progress.SetMessage("Processing {0} ...", options.EnsemblGtfFile);
                using (var gf = new GtfItemFile(options.EnsemblGtfFile))
                {
                    // miRNA is excluded from the set of biotypes accepted from this file.
                    var biotypes = new HashSet<string>(SmallRNAConsts.Biotypes);
                    biotypes.Remove(SmallRNAConsts.miRNA);

                    GtfItem item;
                    int count = 0;
                    while ((item = gf.Next("gene")) != null)
                    {
                        string biotype;
                        if (item.Attributes.Contains("gene_biotype"))
                        {
                            biotype = item.Attributes.StringAfter("gene_biotype \"").StringBefore("\"");
                        }
                        else if (item.Attributes.Contains("gene_type"))
                        {
                            biotype = item.Attributes.StringAfter("gene_type \"").StringBefore("\"");
                        }
                        else
                        {
                            continue;
                        }

                        // tRNA entries are skipped here whenever the UCSC tRNA file exists.
                        if (hasUcscTrna && biotype.Equals(SmallRNAConsts.tRNA))
                        {
                            continue;
                        }

                        if (biotype.Equals("Mt_tRNA"))
                        {
                            count++;
                            var gene_name = item.Attributes.Contains("gene_name") ? item.Attributes.StringAfter("gene_name \"").StringBefore("\"") : item.GeneId;
                            var loc = new BedItem();
                            loc.Seqname = mitoName;
                            loc.Start = item.Start - 1;
                            loc.End = item.End;
                            // The chromosome name is passed as an argument so that braces in a
                            // sequence name cannot corrupt the format string.
                            loc.Name = string.Format("{0}:{1}.tRNA{2}-{3}", SmallRNAConsts.mt_tRNA, longMitoName, count, gene_name.StringAfter("-"));
                            loc.Score = 1000;
                            loc.Strand = item.Strand;
                            trnas.Add(loc);
                        }
                        else if (biotypes.Contains(biotype))
                        {
                            string seqName;
                            if (item.Seqname.ToLower().StartsWith("chr"))
                            {
                                seqName = item.Seqname.Substring(3);
                            }
                            else
                            {
                                seqName = item.Seqname;
                            }
                            if (seqName.Equals("M") || seqName.Equals("MT"))
                            {
                                seqName = mitoName;
                            }

                            var gene_name = item.Attributes.StringAfter("gene_name \"").StringBefore("\"");
                            var lowGeneName = gene_name.ToLower();
                            // Genes named RNY* or Y_RNA are relabelled as yRNA.
                            if (lowGeneName.StartsWith("rny") || lowGeneName.Equals("y_rna"))
                            {
                                biotype = "yRNA";
                            }

                            var loc = new BedItem();
                            loc.Seqname = seqName;
                            loc.Start = item.Start - 1;
                            loc.End = item.End;
                            loc.Name = biotype + ":" + gene_name + ":" + item.GeneId;
                            loc.Score = 1000;
                            loc.Strand = item.Strand;

                            others.Add(loc);
                        }
                    }
                }
            }

            var all = new List<BedItem>();

            all.AddRange(mirnas);
            all.AddRange(trnas);
            all.AddRange(others);

            // Translate every known sequence name into the name used by the genome fasta.
            foreach (var bi in all)
            {
                string genomeName;
                if (chrNameMap.TryGetValue(bi.Seqname, out genomeName))
                {
                    bi.Seqname = genomeName;
                }
            }

            // rRNA database entries are added after the name translation, so their sequence
            // names are kept exactly as read.
            if (File.Exists(options.RRNAFile))
            {
                var seqs = SequenceUtils.Read(options.RRNAFile);
                foreach (var seq in seqs)
                {
                    all.Add(new BedItem()
                    {
                        Seqname = seq.Name,
                        Start = 0,
                        End = seq.SeqString.Length,
                        Strand = '+',
                        Name = "rRNA:" + SmallRNAConsts.rRNADB_KEY + seq.Name
                    });
                }
            }

            Progress.SetMessage("Saving smallRNA coordinates to " + options.OutputFile + "...");
            using (var sw = new StreamWriter(options.OutputFile))
            {
                foreach (var pir in SmallRNAConsts.Biotypes)
                {
                    var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
                    WriteBedItems(sw, bedfile, pir, locs);
                }
            }

            var miRNA_bed = FileUtils.ChangeExtension(options.OutputFile, ".miRNA.bed");

            Progress.SetMessage("Saving miRNA coordinates to " + miRNA_bed + "...");
            using (var sw = new StreamWriter(miRNA_bed))
            {
                var pir = SmallRNAConsts.miRNA;
                var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
                WriteBedItems(sw, bedfile, pir, locs);
            }

            // ".miss1": every biotype except lincRNA/lncRNA, without the rRNA database entries.
            Progress.SetMessage("Saving smallRNA miss1 coordinates to " + options.OutputFile + ".miss1 ...");
            using (var sw = new StreamWriter(options.OutputFile + ".miss1"))
            {
                foreach (var pir in SmallRNAConsts.Biotypes)
                {
                    if (pir == SmallRNABiotype.lincRNA.ToString() || pir == SmallRNABiotype.lncRNA.ToString())
                    {
                        continue;
                    }
                    var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
                    locs.RemoveAll(l => l.Name.Contains(SmallRNAConsts.rRNADB_KEY));

                    WriteBedItems(sw, bedfile, pir, locs);
                }
            }

            // ".miss0": only lincRNA, lncRNA and the rRNA database entries.
            Progress.SetMessage("Saving smallRNA miss0 coordinates to " + options.OutputFile + ".miss0 ...");
            using (var sw = new StreamWriter(options.OutputFile + ".miss0"))
            {
                foreach (var pir in SmallRNAConsts.Biotypes)
                {
                    if (pir != SmallRNABiotype.lincRNA.ToString() && pir != SmallRNABiotype.lncRNA.ToString() && pir != SmallRNABiotype.rRNA.ToString())
                    {
                        continue;
                    }
                    var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
                    if (pir == SmallRNABiotype.rRNA.ToString())
                    {
                        locs.RemoveAll(l => !l.Name.Contains(SmallRNAConsts.rRNADB_KEY));
                    }

                    WriteBedItems(sw, bedfile, pir, locs);
                }
            }

            var summaryFile = options.OutputFile + ".info";

            Progress.SetMessage("Writing summary to " + summaryFile + "...");
            using (var sw = new StreamWriter(summaryFile))
            {
                sw.WriteLine("Biotype\tCount");

                // Count distinct feature names per biotype (the text before the first ':').
                all.ConvertAll(m => m.Name)
                   .Distinct()
                   .GroupBy(m => m.StringBefore(":"))
                   .OrderByDescending(m => m.Count())
                   .ToList()
                   .ForEach(m => sw.WriteLine("{0}\t{1}", m.Key, m.Count()));
            }

            var result = new List<string>(new[] { options.OutputFile });

            var fasta = Path.ChangeExtension(options.OutputFile, ".fasta");

            // The combined fasta is the genome followed by the mature tRNA and/or rRNA sequences.
            var appendMatureTrna = File.Exists(options.UcscTrnaFile) && File.Exists(options.UcscMatureTrnaFastaFile);
            var appendRrna = File.Exists(options.RRNAFile);
            if (appendMatureTrna || appendRrna)
            {
                result.Add(fasta);
                using (var sw = new StreamWriter(fasta))
                {
                    AppendFileLines(sw, options.FastaFile);

                    if (appendMatureTrna)
                    {
                        AppendFileLines(sw, options.UcscMatureTrnaFastaFile);
                    }

                    if (appendRrna)
                    {
                        AppendFileLines(sw, options.RRNAFile);
                    }
                }
            }

            var faFile = options.OutputFile + ".fa";

            Progress.SetMessage("Extracting sequence from " + options.FastaFile + "...");
            var b2foptions = new Bed2FastaProcessorOptions()
            {
                GenomeFastaFile = options.FastaFile,
                InputFile = options.OutputFile,
                OutputFile = faFile,
                KeepChrInName = false,
            };

            // When a mature tRNA fasta exists, tRNA sequences come from that file (appended
            // below) rather than from the genome.
            var hasMatureTrnaFasta = File.Exists(options.UcscMatureTrnaFastaFile);
            if (!hasMatureTrnaFasta)
            {
                b2foptions.AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.mt_tRNA) || m.StartsWith(SmallRNAConsts.tRNA);
            }
            else
            {
                b2foptions.AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.mt_tRNA);
            }

            new Bed2FastaProcessor(b2foptions)
            {
                Progress = this.Progress
            }.Process();

            if (hasMatureTrnaFasta)
            {
                Progress.SetMessage("Extracting sequence from " + options.UcscMatureTrnaFastaFile + " ...");

                // Append (not overwrite) the tRNA entries that carry their own sequence.
                using (var sw = new StreamWriter(faFile, true))
                {
                    foreach (var tRNA in trnas)
                    {
                        if (!string.IsNullOrEmpty(tRNA.Sequence))
                        {
                            sw.WriteLine(">{0}", tRNA.Name);
                            sw.WriteLine("{0}", tRNA.Sequence);
                        }
                    }
                }
            }

            return (result);
        }

        /// <summary>
        /// Registers a genome sequence name under itself and under its "chr"-prefixed or
        /// "chr"-stripped alias, and records which short mitochondrial name the genome uses.
        /// </summary>
        private static void RegisterChromosomeName(Dictionary<string, string> chrNameMap, string name, ref string mitoName)
        {
            chrNameMap[name] = name;
            if (name.StartsWith("chr"))
            {
                chrNameMap[name.StringAfter("chr")] = name;
            }
            else
            {
                chrNameMap["chr" + name] = name;
            }

            if (name.Equals("chrMT") || name.Equals("MT"))
            {
                mitoName = "MT";
            }
            else if (name.Equals("chrM") || name.Equals("M"))
            {
                mitoName = "M";
            }
        }

        /// <summary>
        /// Returns true for sequence names longer than one character that are not purely numeric.
        /// </summary>
        private static bool IsOffPrimaryChromosome(string seqname)
        {
            return (seqname.Length > 1) && !seqname.All(n => char.IsDigit(n));
        }

        /// <summary>
        /// Reports the item count for a biotype, sorts the items by chromosome and start, and
        /// writes one formatted line per item.
        /// </summary>
        private void WriteBedItems(StreamWriter sw, BedItemFile<BedItem> bedfile, string biotype, List<BedItem> locs)
        {
            Progress.SetMessage("{0} : {1}", biotype, locs.Count);

            GenomeUtils.SortChromosome(locs, m => m.Seqname, m => (int)m.Start);

            foreach (var loc in locs)
            {
                sw.WriteLine(bedfile.GetValue(loc));
            }
        }

        /// <summary>
        /// Copies a text file line by line into the writer.
        /// </summary>
        private static void AppendFileLines(StreamWriter sw, string fileName)
        {
            using (var sr = new StreamReader(fileName))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    sw.WriteLine(line);
                }
            }
        }
        /// <summary>
        /// Splits a protein/peptide text file into a ".light" and a ".heavy" file. The two
        /// header lines are copied to both outputs; each peptide line is assigned to the light
        /// or heavy list according to which calculated mass lies within 0.1 of the experimental
        /// mass, and every protein group (lines starting with "$" followed by peptide lines) is
        /// handed to <c>WriteGroup</c>.
        /// </summary>
        /// <param name="filename">
        /// Input file. When "<paramref name="filename"/>.fasta" exists and contains sequences,
        /// ".light.fasta" and ".heavy.fasta" writers are also passed to <c>WriteGroup</c>.
        /// </param>
        /// <returns>The names of the light and heavy result files, in that order.</returns>
        /// <exception cref="Exception">
        /// A peptide matches neither the light nor the heavy calculated mass.
        /// </exception>
        public IEnumerable<string> Process(string filename)
        {
            var result = new List<string>();

            var proteins = new List<string>();
            var lightPeptides = new List<string>();
            var heavyPeptides = new List<string>();

            var seqMap = new Dictionary<string, Sequence>();

            var fastaFile = filename + ".fasta";
            if (File.Exists(fastaFile))
            {
                foreach (Sequence seq in SequenceUtils.Read(ff, fastaFile))
                {
                    seqMap[seq.Name] = seq;
                }
            }

            string lightResult = filename + ".light";
            string heavyResult = filename + ".heavy";

            StreamWriter swLightFasta = null;
            StreamWriter swHeavyFasta = null;

            try
            {
                // Created inside the try block so that a failure while opening the second writer
                // still releases the first one.
                if (seqMap.Count > 0)
                {
                    swLightFasta = new StreamWriter(lightResult + ".fasta");
                    swHeavyFasta = new StreamWriter(heavyResult + ".fasta");
                }

                using (var swLight = new StreamWriter(lightResult))
                using (var swHeavy = new StreamWriter(heavyResult))
                using (var sr = new StreamReader(filename))
                {
                    // First header line is copied verbatim.
                    string line = sr.ReadLine();
                    swLight.WriteLine(line);
                    swHeavy.WriteLine(line);

                    // Second header line also defines the peptide line format.
                    line = sr.ReadLine();
                    var format = new SequestPeptideTextFormat(line);
                    swLight.WriteLine(line);
                    swHeavy.WriteLine(line);

                    bool bIsProtein = true;
                    bool lastGroupWritten = false;
                    while ((line = sr.ReadLine()) != null)
                    {
                        // A blank line terminates the identification section.
                        if (line.Trim().Length == 0)
                        {
                            WriteGroup(proteins, lightPeptides, heavyPeptides, swLight, swHeavy, swLightFasta, swHeavyFasta, seqMap);
                            lastGroupWritten = true;
                            break;
                        }

                        if (line.StartsWith("$"))
                        {
                            // Consecutive "$" lines belong to the same protein group.
                            if (bIsProtein)
                            {
                                proteins.Add(line);
                                continue;
                            }

                            // First "$" line after peptides: flush the previous group and start a new one.
                            WriteGroup(proteins, lightPeptides, heavyPeptides, swLight, swHeavy, swLightFasta, swHeavyFasta, seqMap);

                            proteins.Clear();
                            lightPeptides.Clear();
                            heavyPeptides.Clear();

                            proteins.Add(line);
                            bIsProtein = true;
                            continue;
                        }

                        bIsProtein = false;

                        IIdentifiedSpectrum sph = format.PeptideFormat.ParseString(line);
                        string matchedSeq = PeptideUtils.GetMatchedSequence(sph.Sequence);
                        double lightMass = lightCalc.GetMass(matchedSeq);
                        double heavyMass = heavyCalc.GetMass(matchedSeq);

                        if (Math.Abs(lightMass - sph.ExperimentalMass) < 0.1)
                        {
                            lightPeptides.Add(line);
                            continue;
                        }

                        if (Math.Abs(heavyMass - sph.ExperimentalMass) < 0.1)
                        {
                            heavyPeptides.Add(line);
                            continue;
                        }

                        throw new Exception(MyConvert.Format("Mass={0:0.0000}; {1:0.0000}; {2:0.0000}", sph.ExperimentalMass,
                                                             lightMass, heavyMass));
                    }

                    // The file ended without the terminating blank line: the pending group
                    // would otherwise be silently dropped.
                    if (!lastGroupWritten && proteins.Count > 0)
                    {
                        WriteGroup(proteins, lightPeptides, heavyPeptides, swLight, swHeavy, swLightFasta, swHeavyFasta, seqMap);
                    }
                }
            }
            finally
            {
                // Close whichever optional fasta writers were actually opened.
                if (swLightFasta != null)
                {
                    swLightFasta.Close();
                }
                if (swHeavyFasta != null)
                {
                    swHeavyFasta.Close();
                }
            }

            result.Add(lightResult);
            result.Add(heavyResult);

            return (result);
        }
Exemplo n.º 13
0
        /// <summary>
        /// Pairs mutated peptides with candidate original peptides. Peptide-spectrum matches
        /// read from <paramref name="fileName"/> are filtered by charge, by amino acid validity
        /// and by agreement with the pNovo result; the surviving mutated peptides are grouped by
        /// sequence and matched against the non-mutated peptides according to the mutation type
        /// encoded at the end of their first protein name (type1, type2 or type3).
        /// </summary>
        /// <param name="fileName">Peptide-spectrum-match file to analyse.</param>
        /// <returns>
        /// The concatenated results of <c>DoStatistic</c> for type1, type2 and type3.
        /// </returns>
        /// <exception cref="Exception">
        /// The first protein name of a mutated peptide does not end with type1, type2 or type3.
        /// </exception>
        public override IEnumerable<string> Process(string fileName)
        {
            var aas = new Aminoacids();

            Progress.SetMessage("reading pNovo result from " + pNovoPeptideFile + " ...");
            var pNovoSpectra = new MascotPeptideTextFormat().ReadFromFile(pNovoPeptideFile);

            // Scan (long file name) -> pure sequences reported by pNovo for that scan.
            var pNovoMap = new Dictionary<string, HashSet<string>>();
            foreach (var pep in pNovoSpectra)
            {
                var key = pep.Query.FileScan.LongFileName;
                HashSet<string> sequences;
                if (!pNovoMap.TryGetValue(key, out sequences))
                {
                    sequences = new HashSet<string>();
                    pNovoMap[key] = sequences;
                }
                sequences.UnionWith(from p in pep.Peptides select p.PureSequence);
            }

            var format = new MascotPeptideTextFormat();

            Progress.SetMessage("reading peptide-spectra-matches from " + fileName + " ...");
            var spectra = format.ReadFromFile(fileName);

            // Keep only the accepted charge states.
            spectra.RemoveAll(m => !charges.Contains(m.Charge));

            // Drop peptides containing an amino acid without codes (presumably an ambiguous
            // residue); iterate backwards so removal does not disturb the remaining indices.
            spectra.ForEach(m =>
            {
                for (int i = m.Peptides.Count - 1; i >= 0; i--)
                {
                    if (m.Peptides[i].PureSequence.Any(n => aas[n].Codes.Length == 0))
                    {
                        m.RemovePeptideAt(i);
                    }
                }
            });
            spectra.RemoveAll(m => m.Peptides.Count == 0);

            Progress.SetMessage("comparing peptide-spectra-matches with pNovo result...");

            // Remove mutation spectra that pNovo does not confirm: either pNovo has no entry for
            // the scan, or none of the peptides (with I replaced by L) is among its sequences.
            spectra.RemoveAll(m =>
            {
                if (!IsMutationPeptide(m))
                {
                    return (false);
                }

                HashSet<string> set;
                if (!pNovoMap.TryGetValue(m.Query.FileScan.LongFileName, out set))
                {
                    return (true);
                }

                return (!m.Peptides.Any(n => set.Contains(n.PureSequence.Replace('I', 'L'))));
            });

            // Spectra identified as mutation peptides.
            var mutSpectra = spectra.FindAll(m => IsMutationPeptide(m));
            var mutPeptides = (from s in mutSpectra
                               from p in s.Peptides
                               select p).ToList();

            // Materialized once: the groups are both counted and iterated below.
            var mutGroup = mutPeptides.GroupBy(m => m.PureSequence).ToList();

            // Remaining spectra, excluding any that still reference a mutated protein.
            var fromSpectra = spectra.Except(mutSpectra).ToList();

            fromSpectra.RemoveAll(m => m.Proteins.Any(n => mutationReg.Match(n).Success));
            var fromPeptides = (from s in fromSpectra
                                from p in s.Peptides
                                select p).ToList();

            // Original peptides grouped by sequence, indexed by sequence length.
            var fromGroup = fromPeptides.GroupBy(m => m.PureSequence).ToGroupDictionary(n => n.Key.Length);
            var minLength = fromGroup.Count == 0 ? 6 : fromGroup.Min(m => m.Key);
            var maxLength = fromGroup.Count == 0 ? 30 : fromGroup.Max(m => m.Key);

            // Matches per mutation type; each entry starts with the mutated group, followed by
            // the matching original groups.
            var type1 = new List<List<IGrouping<string, IIdentifiedPeptide>>>();
            var type2 = new List<List<IGrouping<string, IIdentifiedPeptide>>>();
            var type3 = new List<List<IGrouping<string, IIdentifiedPeptide>>>();

            Progress.SetRange(0, mutGroup.Count);
            Progress.SetPosition(0);
            Progress.SetMessage("finding mutation-original pairs ...");

            foreach (var mut in mutGroup)
            {
                var matched = new List<IGrouping<string, IIdentifiedPeptide>>();
                matched.Add(mut);
                Progress.Increment(1);

                var protein = mut.First().Proteins[0];

                List<List<IGrouping<string, IIdentifiedPeptide>>> type;
                if (protein.EndsWith("type3"))
                {
                    type = type3;

                    // Longer original peptides that start with the mutated sequence minus its
                    // last residue.
                    var mutseq = mut.Key.Substring(0, mut.Key.Length - 1);
                    for (int i = mut.Key.Length + 1; i <= maxLength; i++)
                    {
                        if (fromGroup.ContainsKey(i))
                        {
                            var others = fromGroup[i];
                            foreach (var o in others)
                            {
                                if (o.Key.StartsWith(mutseq))
                                {
                                    matched.Add(o);
                                }
                            }
                        }
                    }
                }
                else if (protein.EndsWith("type2"))
                {
                    type = type2;

                    // Shorter original peptides whose sequence minus the last residue is a
                    // prefix of the mutated sequence.
                    for (int i = minLength; i < mut.Key.Length; i++)
                    {
                        if (fromGroup.ContainsKey(i))
                        {
                            var others = fromGroup[i];
                            foreach (var o in others)
                            {
                                var oseq = o.Key.Substring(0, o.Key.Length - 1);
                                if (mut.Key.StartsWith(oseq))
                                {
                                    matched.Add(o);
                                }
                            }
                        }
                    }
                }
                else if (protein.EndsWith("type1"))
                {
                    type = type1;

                    // Same-length original peptides accepted by MutationUtils.IsMutationOneIL2.
                    if (fromGroup.ContainsKey(mut.Key.Length))
                    {
                        var oLength = fromGroup[mut.Key.Length];
                        foreach (var o in oLength)
                        {
                            int mutationSite = -1;
                            if (MutationUtils.IsMutationOneIL2(o.Key, mut.Key, ref mutationSite, IgnoreNtermMutation, IgnoreDeamidatedMutation, IgnoreMultipleNucleotideMutation))
                            {
                                matched.Add(o);
                            }
                        }
                    }
                }
                else
                {
                    throw new Exception("There is no mutation type information at protein name: " + protein + "\nIt should be like MUL_NHLGQK_type1, MUL_NHLGQK_type2 or MUL_NHLGQK_type3");
                }

                type.Add(matched);
            }

            // Fewest matched groups first; ties broken by more mutated peptides first.
            type1.Sort((m1, m2) =>
            {
                var res = m1.Count.CompareTo(m2.Count);
                if (res == 0)
                {
                    res = m2[0].Count().CompareTo(m1[0].Count());
                }
                return (res);
            });

            Progress.SetMessage("reading protein sequences ...");
            var proteins = SequenceUtils.Read(new FastaFormat(), fastaFile);

            // Keyed by the parsed accession when available, otherwise by the full name.
            var proMap = proteins.ToDictionary(m =>
            {
                string ac;
                return (acParser.TryParse(m.Name, out ac) ? ac : m.Name);
            });

            var classification = GetClassification();
            string mutHeader = "FileScan\tMH+\tDiff(MH+)\tCharge\tRank\tScore\tExpectValue\tModification";
            var mutPepFormat = new MascotPeptideTextFormat(mutHeader);

            Progress.SetMessage("writing result ...");
            var result1 = DoStatistic(fileName, aas, format, proMap, classification, mutHeader, mutPepFormat, type1, ".type1");
            var result2 = DoStatistic(fileName, aas, format, proMap, classification, mutHeader, mutPepFormat, type2, ".type2");
            var result3 = DoStatistic(fileName, aas, format, proMap, classification, mutHeader, mutPepFormat, type3, ".type3");

            return (result1.Concat(result2).Concat(result3).ToArray());
        }