/// <summary>
        /// Fills the reference allele of SNP items from a genome fasta file.
        /// Every sequence read from the file is mapped to a chromosome key through
        /// HumanChromosomeToInt; each SNP item grouped under that key gets its RefChar
        /// set to the upper-cased character at index (Position - 1) of the sequence.
        /// </summary>
        /// <param name="snpItems">SNP items to update; they are grouped by their Chrom value.</param>
        /// <param name="fastaFile">Path of the genome fasta file to read.</param>
        /// <param name="progress">Receives status messages; a ConsoleProgressCallback is created when null.</param>
        public static void FillReferenceAlleleFromFasta(this IEnumerable <SNPItem> snpItems, string fastaFile, IProgressCallback progress = null)
        {
            progress = progress ?? new ConsoleProgressCallback();

            var snpsByChromosome = snpItems.ToGroupDictionary(m => m.Chrom);

            progress.SetMessage("Filling reference allele from {0} file ...", fastaFile);
            using (var reader = new StreamReader(fastaFile))
            {
                var fasta = new FastaFormat();
                for (Sequence seq = fasta.ReadSequence(reader); seq != null; seq = fasta.ReadSequence(reader))
                {
                    progress.SetMessage("chromosome " + seq.Name + " ...");

                    var chromosome = HumanChromosomeToInt(seq.Name);
                    if (!snpsByChromosome.ContainsKey(chromosome))
                    {
                        // No SNP item was grouped under this chromosome key.
                        continue;
                    }

                    foreach (var item in snpsByChromosome[chromosome])
                    {
                        // Position is used as a 1-based offset into the sequence string.
                        var referenceBase = seq.SeqString[item.Position - 1];
                        item.RefChar = char.ToUpper(referenceBase);
                    }
                }
            }

            progress.SetMessage("Filling reference allele finished.");
        }
示例#2
0
        /// <summary>
        /// Copies every sequence of a fasta file whose name matches nameRegex into a
        /// new fasta file. Progress is reported from the position of the input stream,
        /// and a pending cancellation aborts with UserTerminatedException.
        /// </summary>
        /// <param name="fileName">Path of the fasta file to filter.</param>
        /// <returns>A single-element array holding the path of the written file.</returns>
        public override IEnumerable <string> Process(string fileName)
        {
            // Target path: FileUtils.ChangeExtension(fileName, "") + "_" + name + extension of the input.
            var pathStem      = FileUtils.ChangeExtension(fileName, "");
            var extension     = new FileInfo(fileName).Extension;
            var filteredFile  = pathStem + "_" + name + extension;
            var fasta         = new FastaFormat();

            Progress.SetMessage("Processing " + fileName);
            using (var reader = new StreamReader(fileName))
            {
                Progress.SetRange(0, reader.BaseStream.Length);
                using (var writer = new StreamWriter(filteredFile))
                {
                    for (Sequence current = fasta.ReadSequence(reader); current != null; current = fasta.ReadSequence(reader))
                    {
                        if (Progress.IsCancellationPending())
                        {
                            throw new UserTerminatedException();
                        }

                        Progress.SetPosition(reader.BaseStream.Position);

                        var nameMatch = nameRegex.Match(current.Name);
                        if (!nameMatch.Success)
                        {
                            continue;
                        }

                        fasta.WriteSequence(writer, current);
                    }
                }
            }

            return new string[] { filteredFile };
        }
示例#3
0
        /// <summary>
        /// Loads target coverage regions (from an xml file when the target file name ends
        /// with ".xml", otherwise from a bed file), fills each region's sequence from the
        /// genome fasta file and assigns a gene symbol looked up in the refgene file.
        /// </summary>
        /// <param name="options">Supplies TargetFile, GenomeFastaFile and RefgeneFile.</param>
        /// <param name="progress">Receives status messages; it is used without a null check.</param>
        /// <param name="removeRegionWithoutSequence">When true, regions whose Sequence is null or empty after filling are dropped.</param>
        /// <returns>The list of coverage regions.</returns>
        public static List <CoverageRegion> GetTargetCoverageRegion(ITargetBuilderOptions options, IProgressCallback progress, bool removeRegionWithoutSequence = true)
        {
            List <CoverageRegion> regions;

            if (!options.TargetFile.EndsWith(".xml"))
            {
                regions = GetTargetCoverageRegionFromBed(options, progress);
            }
            else
            {
                regions = GetTargetCoverageRegionFromXml(options, progress);
            }

            var regionsBySeqname = regions.ToGroupDictionary(m => m.Seqname);

            progress.SetMessage("Filling sequence from {0}...", options.GenomeFastaFile);
            using (var reader = new StreamReader(options.GenomeFastaFile))
            {
                var fasta = new FastaFormat();
                for (Sequence seq = fasta.ReadSequence(reader); seq != null; seq = fasta.ReadSequence(reader))
                {
                    progress.SetMessage("Processing chromosome {0} ...", seq.Reference);

                    // Regions are keyed by the part of the sequence name after "chr".
                    var key = seq.Name.StringAfter("chr");

                    List <CoverageRegion> matched;
                    if (!regionsBySeqname.TryGetValue(key, out matched))
                    {
                        continue;
                    }

                    foreach (var region in matched)
                    {
                        // Start is treated as 1-based when slicing the chromosome string.
                        var offset = (int)(region.Start - 1);
                        var length = (int)region.Length;
                        region.Sequence = seq.SeqString.Substring(offset, length);

                        // NOTE(review): the reverse complement is only filled for '+' strand regions - confirm this is intended.
                        if (region.Strand == '+')
                        {
                            region.ReverseComplementedSequence = SequenceUtils.GetReverseComplementedSequence(region.Sequence);
                        }
                    }
                }
            }

            if (removeRegionWithoutSequence)
            {
                regions.RemoveAll(l => string.IsNullOrEmpty(l.Sequence));
            }

            progress.SetMessage("Filling sequence finished.");

            var symbolByGene = new MapReader(1, 12).ReadFromFile(options.RefgeneFile);

            foreach (var region in regions)
            {
                // The lookup key is the region name up to "_utr3".
                var gene = region.Name.StringBefore("_utr3");
                if (symbolByGene.ContainsKey(gene))
                {
                    region.GeneSymbol = symbolByGene[gene];
                }
                else
                {
                    region.GeneSymbol = string.Empty;
                }
            }

            return regions;
        }
示例#4
0
        /// <summary>
        /// Scans a fasta file and collects the access number of every entry whose
        /// reference line matches the given pattern (case-insensitive).
        /// </summary>
        /// <param name="acParser">Extracts the access number from a sequence name.</param>
        /// <param name="fastaFilename">Path of the fasta database to scan.</param>
        /// <param name="contaminationDescriptionPattern">Regular expression applied to each sequence's Reference.</param>
        /// <param name="progress">Progress sink; an EmptyProgressCallback is used when null.</param>
        /// <returns>The set of access numbers of matching entries.</returns>
        public static HashSet <string> GetContaminationAccessNumbers(IStringParser <string> acParser, string fastaFilename, string contaminationDescriptionPattern,
                                                                     IProgressCallback progress)
        {
            var contaminants = new HashSet <string>();

            progress = progress ?? new EmptyProgressCallback();

            var descriptionRegex = new Regex(contaminationDescriptionPattern, RegexOptions.IgnoreCase);

            progress.SetMessage("Get contamination map from database ...");
            var fasta = new FastaFormat();

            using (var reader = new StreamReader(fastaFilename))
            {
                progress.SetRange(1, reader.BaseStream.Length);

                for (Sequence entry = fasta.ReadSequence(reader); entry != null; entry = fasta.ReadSequence(reader))
                {
                    if (progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    progress.SetPosition(reader.GetCharpos());

                    // The access number is parsed for every entry, before the pattern test.
                    string accessNumber = acParser.GetValue(entry.Name);

                    if (!descriptionRegex.IsMatch(entry.Reference))
                    {
                        continue;
                    }

                    contaminants.Add(accessNumber);
                }
            }

            progress.SetMessage("Get contamination map from database finished.");

            return contaminants;
        }
示例#5
0
        /// <summary>
        /// Reads every sequence of a fasta file and writes it (unless options.ReversedOnly
        /// is set) followed by the sequence returned by GetReversedSequence to the writer.
        /// </summary>
        /// <param name="index">Running counter; incremented once per sequence before the reversed sequence is built.</param>
        /// <param name="sw">Destination writer.</param>
        /// <param name="fastaFile">Path of the fasta file to read.</param>
        /// <param name="isContaminant">When true, references not starting with "CON_" get that prefix.</param>
        private void ProcessFile(ref int index, StreamWriter sw, string fastaFile, bool isContaminant)
        {
            var fasta = new FastaFormat();

            using (var reader = new StreamReader(fastaFile))
            {
                Progress.SetRange(0, reader.BaseStream.Length);

                for (Sequence entry = fasta.ReadSequence(reader); entry != null; entry = fasta.ReadSequence(reader))
                {
                    Progress.SetPosition(reader.BaseStream.Position);

                    if (isContaminant && !entry.Reference.StartsWith("CON_"))
                    {
                        entry.Reference = "CON_" + entry.Reference;
                    }

                    if (!options.ReversedOnly)
                    {
                        fasta.WriteSequence(sw, entry);
                    }

                    // The pseudo amino acid builder runs after the forward entry has been written.
                    if (options.IsPseudoAminoacid)
                    {
                        options.PseudoAminoacidBuilder.Build(entry);
                    }

                    index++;
                    fasta.WriteSequence(sw, GetReversedSequence(index, entry));
                }
            }
        }
示例#6
0
        /// <summary>
        /// Writes a copy of a fasta file (extension changed to ".dM.fasta") in which a
        /// leading "M" is removed from every sequence that starts with it; the reference
        /// of such a sequence is rebuilt with an " N-terminal-M-Removed " marker.
        /// </summary>
        /// <param name="fileName">Path of the fasta file to convert.</param>
        /// <returns>A single-element array holding the path of the written file.</returns>
        public override IEnumerable <string> Process(string fileName)
        {
            var fasta      = new FastaFormat();
            var outputFile = Path.ChangeExtension(fileName, ".dM.fasta");

            using (var reader = new StreamReader(fileName))
            {
                using (var writer = new StreamWriter(outputFile))
                {
                    Progress.SetRange(1, reader.BaseStream.Length);

                    for (Sequence entry = fasta.ReadSequence(reader); entry != null; entry = fasta.ReadSequence(reader))
                    {
                        Progress.SetPosition(StreamUtils.GetCharpos(reader));

                        var residues = entry.SeqString;
                        if (residues.StartsWith("M"))
                        {
                            // The sequence is trimmed first, then the reference line is rewritten.
                            entry.SeqString = residues.Substring(1);
                            entry.Reference = entry.Name + " N-terminal-M-Removed " + entry.Description;
                        }

                        fasta.WriteSequence(writer, entry);
                    }
                }
            }

            return new string[] { outputFile };
        }
示例#7
0
        /// <summary>
        /// Builds the smallRNA coordinate files from the configured miRBase, UCSC tRNA,
        /// Ensembl gtf and rRNA inputs. Written files: the option snapshot (".param"),
        /// the main bed file (options.OutputFile), a miRNA-only bed file, the ".miss1" and
        /// ".miss0" bed files, a biotype summary (".info"), optionally a merged fasta file,
        /// and a ".fa" file with the extracted sequences.
        /// </summary>
        /// <returns>
        /// The path of the main output file, followed by the merged fasta path when that
        /// file was written.
        /// </returns>
        public override IEnumerable <string> Process()
        {
            var paramFile = options.OutputFile + ".param";

            options.SaveToFile(paramFile);

            var bedfile = new BedItemFile <BedItem>(6);

            Progress.SetMessage("building chromosome name map ...");

            var mitoName = "M";
            Dictionary <string, string> chrNameMap = new Dictionary <string, string>();
            var ff = new FastaFormat(int.MaxValue);

            var faiFile = options.FastaFile + ".fai";

            if (File.Exists(faiFile))
            {
                // The first tab-separated column of each index line is used as the sequence name.
                using (StreamReader sr = new StreamReader(faiFile))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        RegisterChromosomeName(chrNameMap, line.Split('\t')[0], ref mitoName);
                    }
                }
            }
            else
            {
                // No index file: take the names from the fasta file itself.
                using (StreamReader sr = new StreamReader(options.FastaFile))
                {
                    Sequence seq;
                    while ((seq = ff.ReadSequence(sr)) != null)
                    {
                        RegisterChromosomeName(chrNameMap, seq.Name, ref mitoName);
                    }
                }
            }

            // Throws KeyNotFoundException when the map has no entry for mitoName.
            var longMitoName = chrNameMap[mitoName];

            Progress.SetMessage("mitochondral chromosome name = {0}", longMitoName);

            var mirnas = new List <BedItem>();

            if (File.Exists(options.MiRBaseFile))
            {
                Progress.SetMessage("Processing {0} ...", options.MiRBaseFile);

                if (options.MiRBaseFile.EndsWith(".bed"))
                {
                    mirnas = bedfile.ReadFromFile(options.MiRBaseFile);
                    mirnas.ForEach(m =>
                    {
                        m.Seqname = m.Seqname.StringAfter("chr");
                        m.Name    = options.MiRBaseKey + ":" + m.Name;
                    });
                }
                else
                {
                    using (var gf = new GtfItemFile(options.MiRBaseFile))
                    {
                        GtfItem item;
                        while ((item = gf.Next(options.MiRBaseKey)) != null)
                        {
                            BedItem loc = new BedItem();
                            loc.Seqname = item.Seqname.StringAfter("chr");
                            loc.Start   = item.Start - 1;
                            loc.End     = item.End;
                            loc.Name    = options.MiRBaseKey + ":" + item.Attributes.StringAfter("Name=").StringBefore(";");
                            loc.Score   = 1000;
                            loc.Strand  = item.Strand;
                            mirnas.Add(loc);
                        }
                    }
                }

                Progress.SetMessage("{0} miRNA readed.", mirnas.Count);
            }

            List <BedItem> trnas = new List <BedItem>();

            if (File.Exists(options.UcscTrnaFile))
            {
                //reading tRNA from ucsc table without mitocondrom tRNA
                Progress.SetMessage("Processing {0} ...", options.UcscTrnaFile);
                trnas = bedfile.ReadFromFile(options.UcscTrnaFile);
                trnas.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));

                // Matches names longer than one character that are not purely numeric.
                Predicate <BedItem> isNonStandardChromosome = m => (m.Seqname.Length > 1) && !m.Seqname.All(n => char.IsDigit(n));

                // Filtering is skipped when it would remove every entry (or the list is empty).
                if (!trnas.TrueForAll(isNonStandardChromosome))
                {
                    //remove the tRNA not from 1-22, X and Y
                    trnas.RemoveAll(isNonStandardChromosome);

                    //mitocondrom tRNA will be extracted from ensembl gtf file
                    trnas.RemoveAll(m => m.Seqname.Equals("M") || m.Seqname.Equals("MT"));
                }

                trnas.ForEach(m => m.Name = GetTRNAName(m.Name));

                Progress.SetMessage("{0} tRNA from ucsc readed.", trnas.Count);

                if (File.Exists(options.UcscMatureTrnaFastaFile))
                {
                    // Each mature tRNA sequence becomes its own entry spanning the whole sequence.
                    var seqs = SequenceUtils.Read(options.UcscMatureTrnaFastaFile);
                    foreach (var seq in seqs)
                    {
                        var tRNAName = GetTRNAName(seq.Name);
                        trnas.Add(new BedItem()
                        {
                            Seqname  = seq.Name,
                            Start    = 0,
                            End      = seq.SeqString.Length,
                            Strand   = '+',
                            Name     = tRNAName,
                            Sequence = seq.SeqString
                        });
                    }
                }
            }

            var others = new List <BedItem>();

            if (File.Exists(options.EnsemblGtfFile))
            {
                //reading smallRNA/tRNA from ensembl gtf file
                Progress.SetMessage("Processing {0} ...", options.EnsemblGtfFile);

                // Checked once here instead of once per gtf record.
                bool hasUcscTrnaFile = File.Exists(options.UcscTrnaFile);

                using (var gf = new GtfItemFile(options.EnsemblGtfFile))
                {
                    var biotypes = new HashSet <string>(SmallRNAConsts.Biotypes);
                    biotypes.Remove(SmallRNAConsts.miRNA);

                    GtfItem item;
                    int     count = 0;
                    while ((item = gf.Next("gene")) != null)
                    {
                        string biotype;
                        if (item.Attributes.Contains("gene_biotype"))
                        {
                            biotype = item.Attributes.StringAfter("gene_biotype \"").StringBefore("\"");
                        }
                        else if (item.Attributes.Contains("gene_type"))
                        {
                            biotype = item.Attributes.StringAfter("gene_type \"").StringBefore("\"");
                        }
                        else
                        {
                            continue;
                        }

                        // tRNA records are skipped when the UCSC tRNA file exists.
                        if (hasUcscTrnaFile && biotype.Equals(SmallRNAConsts.tRNA))
                        {
                            continue;
                        }

                        if (biotype.Equals("Mt_tRNA"))
                        {
                            count++;
                            var     gene_name = item.Attributes.Contains("gene_name") ? item.Attributes.StringAfter("gene_name \"").StringBefore("\"") : item.GeneId;
                            BedItem loc       = new BedItem();
                            loc.Seqname = mitoName;
                            loc.Start   = item.Start - 1;
                            loc.End     = item.End;
                            // The chromosome name is kept out of the format string so that braces in it cannot break formatting.
                            loc.Name    = SmallRNAConsts.mt_tRNA + ":" + longMitoName + string.Format(".tRNA{0}-{1}", count, gene_name.StringAfter("-"));
                            loc.Score   = 1000;
                            loc.Strand  = item.Strand;
                            trnas.Add(loc);
                        }
                        else if (biotypes.Contains(biotype))
                        {
                            string seqName;
                            if (item.Seqname.ToLower().StartsWith("chr"))
                            {
                                seqName = item.Seqname.Substring(3);
                            }
                            else
                            {
                                seqName = item.Seqname;
                            }
                            if (seqName.Equals("M") || seqName.Equals("MT"))
                            {
                                seqName = mitoName;
                            }

                            var gene_name   = item.Attributes.StringAfter("gene_name \"").StringBefore("\"");
                            var lowGeneName = gene_name.ToLower();
                            if (lowGeneName.StartsWith("rny") || lowGeneName.Equals("y_rna"))
                            {
                                biotype = "yRNA";
                            }

                            BedItem loc = new BedItem();
                            loc.Seqname = seqName;
                            loc.Start   = item.Start - 1;
                            loc.End     = item.End;
                            loc.Name    = biotype + ":" + gene_name + ":" + item.GeneId;
                            loc.Score   = 1000;
                            loc.Strand  = item.Strand;

                            others.Add(loc);
                        }
                    }
                }
            }

            var all = new List <BedItem>();

            all.AddRange(mirnas);
            all.AddRange(trnas);
            all.AddRange(others);

            // Replace each sequence name by the genome's own name where the map knows it.
            foreach (var bi in all)
            {
                string genomeName;
                if (chrNameMap.TryGetValue(bi.Seqname, out genomeName))
                {
                    bi.Seqname = genomeName;
                }
            }

            // rRNA entries are added after the renaming above, so their names are left as read.
            if (File.Exists(options.RRNAFile))
            {
                var seqs = SequenceUtils.Read(options.RRNAFile);
                foreach (var seq in seqs)
                {
                    all.Add(new BedItem()
                    {
                        Seqname = seq.Name,
                        Start   = 0,
                        End     = seq.SeqString.Length,
                        Strand  = '+',
                        Name    = "rRNA:" + SmallRNAConsts.rRNADB_KEY + seq.Name
                    });
                }
            }

            Progress.SetMessage("Saving smallRNA coordinates to " + options.OutputFile + "...");
            using (var sw = new StreamWriter(options.OutputFile))
            {
                foreach (var pir in SmallRNAConsts.Biotypes)
                {
                    var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
                    Progress.SetMessage("{0} : {1}", pir, locs.Count);

                    WriteSortedLocations(sw, bedfile, locs);
                }
            }

            var miRNA_bed = FileUtils.ChangeExtension(options.OutputFile, ".miRNA.bed");

            Progress.SetMessage("Saving miRNA coordinates to " + miRNA_bed + "...");
            using (var sw = new StreamWriter(miRNA_bed))
            {
                var pir  = SmallRNAConsts.miRNA;
                var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
                Progress.SetMessage("{0} : {1}", pir, locs.Count);

                WriteSortedLocations(sw, bedfile, locs);
            }

            Progress.SetMessage("Saving smallRNA miss1 coordinates to " + options.OutputFile + ".miss1 ...");
            using (var sw = new StreamWriter(options.OutputFile + ".miss1"))
            {
                foreach (var pir in SmallRNAConsts.Biotypes)
                {
                    // lincRNA / lncRNA are written to the miss0 file only.
                    if (pir == SmallRNABiotype.lincRNA.ToString() || pir == SmallRNABiotype.lncRNA.ToString())
                    {
                        continue;
                    }
                    var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
                    locs.RemoveAll(l => l.Name.Contains(SmallRNAConsts.rRNADB_KEY));

                    Progress.SetMessage("{0} : {1}", pir, locs.Count);

                    WriteSortedLocations(sw, bedfile, locs);
                }
            }

            Progress.SetMessage("Saving smallRNA miss0 coordinates to " + options.OutputFile + ".miss0 ...");
            using (var sw = new StreamWriter(options.OutputFile + ".miss0"))
            {
                foreach (var pir in SmallRNAConsts.Biotypes)
                {
                    // Only lincRNA, lncRNA and rRNA go to the miss0 file.
                    if (pir != SmallRNABiotype.lincRNA.ToString() && pir != SmallRNABiotype.lncRNA.ToString() && pir != SmallRNABiotype.rRNA.ToString())
                    {
                        continue;
                    }
                    var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
                    if (pir == SmallRNABiotype.rRNA.ToString())
                    {
                        // Of the rRNA entries, keep only those whose name carries the rRNA database key.
                        locs.RemoveAll(l => !l.Name.Contains(SmallRNAConsts.rRNADB_KEY));
                    }

                    Progress.SetMessage("{0} : {1}", pir, locs.Count);

                    WriteSortedLocations(sw, bedfile, locs);
                }
            }

            var summaryFile = options.OutputFile + ".info";

            Progress.SetMessage("Writing summary to " + summaryFile + "...");
            using (var sw = new StreamWriter(summaryFile))
            {
                sw.WriteLine("Biotype\tCount");

                // Count distinct names per prefix (text before ':'), largest group first.
                var groups = all.ConvertAll(m => m.Name).Distinct().GroupBy(m => m.StringBefore(":")).OrderByDescending(m => m.Count()).ToList();
                foreach (var g in groups)
                {
                    sw.WriteLine("{0}\t{1}", g.Key, g.Count());
                }
            }

            var result = new List <string>(new[] { options.OutputFile });

            var fasta = Path.ChangeExtension(options.OutputFile, ".fasta");

            if ((File.Exists(options.UcscTrnaFile) && File.Exists(options.UcscMatureTrnaFastaFile)) || File.Exists(options.RRNAFile))
            {
                // Merged fasta: genome first, then mature tRNA and rRNA sequences when available.
                result.Add(fasta);
                using (var sw = new StreamWriter(fasta))
                {
                    AppendFileLines(sw, options.FastaFile);

                    if (File.Exists(options.UcscTrnaFile) && File.Exists(options.UcscMatureTrnaFastaFile))
                    {
                        AppendFileLines(sw, options.UcscMatureTrnaFastaFile);
                    }

                    if (File.Exists(options.RRNAFile))
                    {
                        AppendFileLines(sw, options.RRNAFile);
                    }
                }
            }

            var faFile = options.OutputFile + ".fa";

            Progress.SetMessage("Extracting sequence from " + options.FastaFile + "...");
            var b2foptions = new Bed2FastaProcessorOptions()
            {
                GenomeFastaFile = options.FastaFile,
                InputFile       = options.OutputFile,
                OutputFile      = faFile,
                KeepChrInName   = false,
            };

            if (!File.Exists(options.UcscMatureTrnaFastaFile))
            {
                b2foptions.AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.mt_tRNA) || m.StartsWith(SmallRNAConsts.tRNA);
            }
            else
            {
                // tRNA names are not accepted here; entries carrying a sequence are appended below.
                b2foptions.AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.mt_tRNA);
            }

            new Bed2FastaProcessor(b2foptions)
            {
                Progress = this.Progress
            }.Process();

            if (File.Exists(options.UcscMatureTrnaFastaFile))
            {
                Progress.SetMessage("Extracting sequence from " + options.UcscMatureTrnaFastaFile + " ...");

                using (var sw = new StreamWriter(faFile, true))
                {
                    foreach (var tRNA in trnas)
                    {
                        if (!string.IsNullOrEmpty(tRNA.Sequence))
                        {
                            sw.WriteLine(">{0}", tRNA.Name);
                            sw.WriteLine("{0}", tRNA.Sequence);
                        }
                    }
                }
            }

            return(result);
        }

        /// <summary>
        /// Records a sequence name in the chromosome name map under itself and under its
        /// counterpart with/without the "chr" prefix, and updates the mitochondrial name
        /// when the name is one of chrMT/MT/chrM/M.
        /// </summary>
        private static void RegisterChromosomeName(Dictionary <string, string> chrNameMap, string name, ref string mitoName)
        {
            chrNameMap[name] = name;
            if (name.StartsWith("chr"))
            {
                chrNameMap[name.StringAfter("chr")] = name;
            }
            else
            {
                chrNameMap["chr" + name] = name;
            }

            if (name.Equals("chrMT") || name.Equals("MT"))
            {
                mitoName = "MT";
            }
            if (name.Equals("chrM") || name.Equals("M"))
            {
                mitoName = "M";
            }
        }

        /// <summary>
        /// Sorts the locations with GenomeUtils.SortChromosome (by Seqname and Start) and
        /// writes one formatted line per location.
        /// </summary>
        private static void WriteSortedLocations(StreamWriter sw, BedItemFile <BedItem> bedfile, List <BedItem> locs)
        {
            GenomeUtils.SortChromosome(locs, m => m.Seqname, m => (int)m.Start);

            foreach (var loc in locs)
            {
                sw.WriteLine(bedfile.GetValue(loc));
            }
        }

        /// <summary>
        /// Copies a text file line by line to the writer.
        /// </summary>
        private static void AppendFileLines(StreamWriter sw, string fileName)
        {
            using (var sr = new StreamReader(fileName))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    sw.WriteLine(line);
                }
            }
        }
示例#8
0
        /// <summary>
        /// Digests every protein of a fasta file with Trypsin (one missed cleavage
        /// allowed, NGlycanFilter applied) and writes one line per distinct peptide to
        /// "&lt;filename&gt;.nglycan": peptide, a 0/1 string built from the filter's
        /// IsNglycan flags, and the " ! "-separated protein names.
        /// </summary>
        /// <param name="filename">Path of the fasta file to digest.</param>
        /// <returns>A single-element array holding the path of the written file.</returns>
        public override IEnumerable <string> Process(string filename)
        {
            var fasta      = new FastaFormat();
            var digest     = new Digest();
            digest.DigestProtease     = ProteaseManager.FindOrCreateProtease("Trypsin", true, "RK", "P");
            digest.MaxMissedCleavages = 1;

            var siteFilter = new NGlycanFilter();
            digest.Filter  = siteFilter;

            string outputFile = filename + ".nglycan";
            var peptideProteinMap = new Dictionary <string, NGlycanValue>();

            using (var reader = new StreamReader(filename))
            {
                for (Sequence protein = fasta.ReadSequence(reader); protein != null; protein = fasta.ReadSequence(reader))
                {
                    digest.ProteinSequence = protein;
                    digest.AddDigestFeatures();

                    if (!protein.Annotation.ContainsKey(Digest.PEPTIDE_FEATURE_TYPE))
                    {
                        continue;
                    }

                    // Flags are read from the filter after the digest of the current protein.
                    bool[] siteFlags = siteFilter.IsNglycan;

                    var digested = (List <DigestPeptideInfo>)protein.Annotation[Digest.PEPTIDE_FEATURE_TYPE];
                    foreach (DigestPeptideInfo peptide in digested)
                    {
                        NGlycanValue entry;
                        if (!peptideProteinMap.TryGetValue(peptide.PeptideSeq, out entry))
                        {
                            // First occurrence: build the per-residue 0/1 string for this peptide.
                            var sites = new StringBuilder();
                            int first = peptide.PeptideLoc.Min - 1;
                            for (int offset = 0; offset < peptide.PeptideSeq.Length; offset++)
                            {
                                sites.Append(siteFlags[first + offset] ? 1 : 0);
                            }

                            entry = new NGlycanValue();
                            entry.NGlycanSites = sites.ToString();

                            peptideProteinMap[peptide.PeptideSeq] = entry;
                        }

                        entry.Proteins.Add(parser.GetValue(peptide.ProteinName));
                    }
                }
            }

            var sortedPeptides = new List <string>(peptideProteinMap.Keys);
            sortedPeptides.Sort();

            using (var writer = new StreamWriter(outputFile))
            {
                foreach (string pep in sortedPeptides)
                {
                    NGlycanValue entry = peptideProteinMap[pep];
                    writer.Write(pep + "\t" + entry.NGlycanSites + "\t");

                    // An empty separator before the first protein, " ! " before each later one.
                    string separator = "";
                    foreach (string proteinName in entry.Proteins)
                    {
                        writer.Write(separator + proteinName);
                        separator = " ! ";
                    }

                    writer.WriteLine();
                }
            }

            return new[] { outputFile };
        }
示例#9
0
        /// <summary>
        /// Fills Name, Description and Sequence of the identified proteins from a fasta
        /// database. Proteins are indexed by their parsed access number and by their full
        /// name; each database entry whose parsed access number is indexed updates the
        /// corresponding protein.
        /// </summary>
        /// <param name="acParser">Extracts the access number from a protein or sequence name.</param>
        /// <param name="fastaFilename">Path of the fasta database.</param>
        /// <param name="t">Identified result whose proteins are filled.</param>
        /// <param name="progress">Progress sink; an EmptyProgressCallback is used when null.</param>
        /// <exception cref="Exception">
        /// Thrown on a duplicate access number / name, or when a protein without sequence
        /// remains whose name does not start with "XXX_".
        /// </exception>
        public static void FillSequenceFromFasta(IStringParser <string> acParser, string fastaFilename, IIdentifiedResult t,
                                                 IProgressCallback progress)
        {
            progress = progress ?? new EmptyProgressCallback();

            progress.SetMessage("Initializing accessNumber/protein map ...");

            var proteinByKey = new Dictionary <string, IIdentifiedProtein>();

            foreach (IIdentifiedProteinGroup proteinGroup in t)
            {
                foreach (IIdentifiedProtein member in proteinGroup)
                {
                    string accessNumber = acParser.GetValue(member.Name);

                    // Index under the access number, and under the full name when it differs.
                    string[] keys = accessNumber != member.Name
                        ? new[] { accessNumber, member.Name }
                        : new[] { accessNumber };

                    foreach (string key in keys)
                    {
                        if (proteinByKey.ContainsKey(key))
                        {
                            throw new Exception("Duplicate access number " + key);
                        }
                        proteinByKey[key] = member;
                    }
                }
            }

            progress.SetMessage("Filling sequence from database ...");
            var fasta = new FastaFormat();

            using (var reader = new StreamReader(fastaFilename))
            {
                progress.SetRange(1, reader.BaseStream.Length);

                for (Sequence entry = fasta.ReadSequence(reader); entry != null; entry = fasta.ReadSequence(reader))
                {
                    if (progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    progress.SetPosition(reader.BaseStream.Position);

                    IIdentifiedProtein target;
                    if (!proteinByKey.TryGetValue(acParser.GetValue(entry.Name), out target))
                    {
                        continue;
                    }

                    // '/' (and tab in the description) are replaced by a space.
                    target.Name        = entry.Name.Replace("/", " ");
                    target.Description = entry.Description.Replace("\t", " ").Replace("/", " ");
                    target.Sequence    = entry.SeqString;
                }
            }

            var unresolved = proteinByKey.Values.Where(l => l.Sequence == null).ToList();

            if (unresolved.Count > 0)
            {
                var unresolvedNames = unresolved.ConvertAll(l => l.Name).ToArray();

                // Missing sequences are tolerated only when every such name starts with "XXX_".
                if (unresolvedNames.Any(l => !l.StartsWith("XXX_")))
                {
                    throw new Exception(string.Format("Couldn't find sequence of following protein(s), change access number pattern or select another database\n{0}", unresolvedNames.Merge("/")));
                }
            }

            progress.SetMessage("Fill sequence from database finished.");
        }
示例#10
0
        /// <summary>
        /// Extracts from the fasta database the entries whose access number is listed in the given file.
        /// </summary>
        /// <param name="fileName">
        /// Text file with one entry per line. Each line is run through the parser to get the access
        /// number; when parsing fails the whole line is used. Blank lines are ignored.
        /// </param>
        /// <returns>
        /// The path of the extracted fasta file (fileName + ".fasta"), followed by the path of the
        /// miss file (fileName + ".miss") when at least one access number was not found in the database.
        /// </returns>
        public override IEnumerable <string> Process(string fileName)
        {
            var result = new List <string>();

            var acs = new HashSet <string>();

            foreach (var acline in File.ReadAllLines(fileName))
            {
                // A blank line carries no access number; keeping it would always report an
                // empty entry as missing and create a spurious ".miss" file.
                if (string.IsNullOrWhiteSpace(acline))
                {
                    continue;
                }

                string ac;
                if (!parser.TryParse(acline, out ac))
                {
                    ac = acline;
                }

                acs.Add(ac);
            }

            var findAcs = new HashSet <string>();

            var resultFile = fileName + ".fasta";

            result.Add(resultFile);

            var ff = new FastaFormat();

            using (StreamWriter sw = new StreamWriter(resultFile))
                using (StreamReader sr = new StreamReader(database))
                {
                    Progress.SetRange(0, sr.BaseStream.Length);

                    Sequence seq;
                    while ((seq = ff.ReadSequence(sr)) != null)
                    {
                        Progress.SetPosition(sr.BaseStream.Position);

                        // Same rule as for the list file: parsed access number, or the raw name on failure.
                        string curAc;
                        if (!parser.TryParse(seq.Name, out curAc))
                        {
                            curAc = seq.Name;
                        }

                        if (acs.Contains(curAc))
                        {
                            findAcs.Add(curAc);
                            if (this.replaceName)
                            {
                                seq.Reference = curAc;
                            }
                            ff.WriteSequence(sw, seq);
                        }
                    }
                }

            // What remains in acs are the requested access numbers that were not found.
            acs.ExceptWith(findAcs);

            var missFile = fileName + ".miss";

            if (acs.Count > 0)
            {
                using (StreamWriter sw = new StreamWriter(missFile))
                {
                    foreach (var ac in acs)
                    {
                        sw.WriteLine(ac);
                    }
                }
                result.Add(missFile);
            }
            else if (File.Exists(missFile))
            {
                // Remove a stale miss file left by a previous run.
                File.Delete(missFile);
            }

            return(result);
        }
示例#11
0
        /// <summary>
        /// Reads a fasta file, digests every sequence with the configured proteases and writes the
        /// distinct peptides that pass the filters to a ".pep" file.
        /// </summary>
        /// <param name="fastaFile">Path of the fasta file to digest.</param>
        /// <returns>An array holding the path of the peptide file (fastaFile + ".pep").</returns>
        /// <exception cref="UserTerminatedException">Thrown when the progress callback reports a pending cancellation.</exception>
        public override IEnumerable <string> Process(string fastaFile)
        {
            var uniquePeptides = new HashSet <string>();

            var format = new FastaFormat();

            using (var reader = new StreamReader(fastaFile))
            {
                Progress.SetRange(0, reader.BaseStream.Length);

                var aminoacids = new Aminoacids();

                // A peptide is kept when it contains none of the ignored amino acids, is long
                // enough and has a mono-isotopic mass inside [minMass, maxMass].
                Predicate <string> accept = pep =>
                {
                    foreach (var ignored in ignoreAminoacids)
                    {
                        if (pep.Contains(ignored))
                        {
                            return(false);
                        }
                    }

                    if (pep.Length >= minLength)
                    {
                        var mass = aminoacids.MonoPeptideMass(pep);
                        return(mass >= minMass && mass <= maxMass);
                    }

                    return(false);
                };

                // One digest per configured protease, all sharing the same missed-cleavage limit.
                var digests = new List <Digest>();
                foreach (var protease in proteases)
                {
                    digests.Add(new Digest
                    {
                        DigestProtease     = protease,
                        MaxMissedCleavages = maxMissCleavage
                    });
                }

                Progress.SetMessage("Digesting sequences ...");
                for (Sequence entry = format.ReadSequence(reader); entry != null; entry = format.ReadSequence(reader))
                {
                    Progress.SetPosition(reader.GetCharpos());

                    if (Progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    // Proteases are applied one after another: each digest works on the
                    // filtered peptides produced by the previous one.
                    var stage = new HashSet <string> { entry.SeqString };

                    foreach (var digest in digests)
                    {
                        var input  = stage;
                        var output = new HashSet <string>();

                        foreach (var parent in input)
                        {
                            var protein = new Sequence(parent, parent);
                            digest.ProteinSequence = protein;
                            digest.AddDigestFeatures();

                            protein.GetDigestPeptideInfo().ForEach(info =>
                            {
                                if (accept(info.PeptideSeq))
                                {
                                    output.Add(info.PeptideSeq);
                                }
                            });
                        }

                        stage = output;
                    }

                    uniquePeptides.UnionWith(stage);
                }
            }

            Progress.SetMessage("Sorting sequences ...");
            var ordered = new List <string>(uniquePeptides);

            // Shorter peptides first; peptides of equal length in string order.
            ordered.Sort((a, b) =>
            {
                var byLength = a.Length.CompareTo(b.Length);
                return(byLength != 0 ? byLength : a.CompareTo(b));
            });

            var resultFile = fastaFile + ".pep";

            using (var writer = new StreamWriter(resultFile))
            {
                foreach (var peptide in ordered)
                {
                    writer.WriteLine(peptide);
                }
            }

            return(new[] { resultFile });
        }
示例#12
0
        /// <summary>
        /// Extracts the sequence of every accepted region of the input file from the genome fasta
        /// file and writes the extracted sequences to the output file in fasta format.
        /// </summary>
        /// <remarks>
        /// Regions on the '-' strand are reverse complemented, and every extracted sequence is
        /// upper-cased. Each entry is named "name location strand".
        /// </remarks>
        /// <returns>An array holding the path of the output file.</returns>
        /// <exception cref="Exception">Thrown when a region extends past the end of its chromosome.</exception>
        public override IEnumerable <string> Process()
        {
            var srItems = SequenceRegionUtils.GetSequenceRegions(options.InputFile).Where(m => options.AcceptName(m.Name)).ToList();

            // Keep only the first region found for each name.
            srItems = srItems.GroupBy(m => m.Name)
                      .Select(g => g.First())
                      .ToList();

            // NOTE(review): this tests the region Name, not Seqname, for the "chr" prefix - confirm that is intended.
            var keepChrInName = options.KeepChrInName && srItems.Any(m => m.Name.StartsWith("chr"));

            if (!keepChrInName)
            {
                srItems.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));
            }

            var srMap = srItems.ToGroupDictionary(m => m.Seqname);

            var ff = new FastaFormat(int.MaxValue);

            using (StreamWriter sw = new StreamWriter(options.OutputFile))
            {
                using (StreamReader genomeReader = new StreamReader(options.GenomeFastaFile))
                {
                    Sequence seq;
                    while ((seq = ff.ReadSequence(genomeReader)) != null)
                    {
                        Progress.SetMessage("processing " + seq.Name + " ...");
                        var name = seq.Name;
                        if (!keepChrInName)
                        {
                            name = name.StringAfter("chr");
                        }

                        List <GtfItem> items;

                        if (!srMap.TryGetValue(name, out items))
                        {
                            // The mitochondrial chromosome is named M or MT depending on the source;
                            // retry the lookup with the alternative spelling.
                            string alias = null;
                            switch (name)
                            {
                                case "M":
                                    alias = "MT";
                                    break;
                                case "chrM":
                                    alias = "chrMT";
                                    break;
                                case "MT":
                                    alias = "M";
                                    break;
                                case "chrMT":
                                    alias = "chrM";
                                    break;
                            }

                            if (alias != null)
                            {
                                name = alias;
                                srMap.TryGetValue(name, out items);
                            }
                        }

                        if (items != null)
                        {
                            Progress.SetMessage("  there are {0} entries in {1} ...", items.Count, name);
                            foreach (var item in items)
                            {
                                // Substring(Start - 1, Length) is valid as long as the end offset does not
                                // pass the sequence length; a region ending exactly on the last base is legal.
                                if (item.Start - 1 + item.Length > seq.SeqString.Length)
                                {
                                    throw new Exception(string.Format("{0} exceed chromosome {1} length {2}", item, name, seq.SeqString.Length));
                                }

                                // Start is used as a 1-based position; Substring takes a 0-based offset.
                                var newseq = seq.SeqString.Substring((int)item.Start - 1, (int)item.Length);
                                if (item.Strand == '-')
                                {
                                    newseq = SequenceUtils.GetReverseComplementedSequence(newseq);
                                }
                                newseq = newseq.ToUpper();

                                var newname = string.Format("{0} {1} {2}", item.Name, item.GetLocationWithoutStrand(), item.Strand);
                                var entry   = new Sequence(newname, newseq);

                                ff.WriteSequence(sw, entry);
                            }
                        }
                    }
                }
            }
            return(new string[] { options.OutputFile });
        }