/// <summary>
/// Fills the reference allele of each SNP from a genome fasta file.
/// </summary>
/// <param name="snpItems">SNPs to update; they are grouped by <c>Chrom</c> before the fasta file is scanned.</param>
/// <param name="fastaFile">Path of the genome fasta file.</param>
/// <param name="progress">Receives status messages; a <c>ConsoleProgressCallback</c> is created when null.</param>
public static void FillReferenceAlleleFromFasta(this IEnumerable<SNPItem> snpItems, string fastaFile, IProgressCallback progress = null)
{
    progress = progress ?? new ConsoleProgressCallback();

    var snpsByChromosome = snpItems.ToGroupDictionary(m => m.Chrom);

    progress.SetMessage("Filling reference allele from {0} file ...", fastaFile);
    using (var reader = new StreamReader(fastaFile))
    {
        var fasta = new FastaFormat();
        Sequence chromosome;
        while ((chromosome = fasta.ReadSequence(reader)) != null)
        {
            progress.SetMessage("chromosome " + chromosome.Name + " ...");

            var chromosomeKey = HumanChromosomeToInt(chromosome.Name);
            if (!snpsByChromosome.ContainsKey(chromosomeKey))
            {
                // No SNP was grouped under this chromosome.
                continue;
            }

            foreach (var snp in snpsByChromosome[chromosomeKey])
            {
                // Position is used as a 1-based offset into the chromosome sequence.
                snp.RefChar = char.ToUpper(chromosome.SeqString[snp.Position - 1]);
            }
        }
    }

    progress.SetMessage("Filling reference allele finished.");
}
/// <summary>
/// Copies the sequences whose name matches <c>nameRegex</c> from a fasta file into a new fasta file.
/// </summary>
/// <param name="fileName">Path of the fasta file to filter.</param>
/// <returns>A single-element array holding the path of the written file.</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested through <c>Progress</c>.</exception>
public override IEnumerable<string> Process(string fileName)
{
    // Output path = FileUtils.ChangeExtension(fileName, "") + "_" + name + the input file's extension.
    string targetFile = FileUtils.ChangeExtension(fileName, "") + "_" + name + new FileInfo(fileName).Extension;

    var fasta = new FastaFormat();

    Progress.SetMessage("Processing " + fileName);
    using (var reader = new StreamReader(fileName))
    {
        // Progress is reported as the byte position within the input stream.
        Progress.SetRange(0, reader.BaseStream.Length);

        using (var writer = new StreamWriter(targetFile))
        {
            for (Sequence entry = fasta.ReadSequence(reader); entry != null; entry = fasta.ReadSequence(reader))
            {
                if (Progress.IsCancellationPending())
                {
                    throw new UserTerminatedException();
                }

                Progress.SetPosition(reader.BaseStream.Position);

                bool nameMatches = nameRegex.Match(entry.Name).Success;
                if (nameMatches)
                {
                    fasta.WriteSequence(writer, entry);
                }
            }
        }
    }

    return new string[] { targetFile };
}
/// <summary>
/// Loads the target coverage regions, fills their sequence from the genome fasta file and assigns
/// gene symbols read from the refgene file.
/// </summary>
/// <param name="options">Supplies <c>TargetFile</c>, <c>GenomeFastaFile</c> and <c>RefgeneFile</c>.</param>
/// <param name="progress">Receives status messages; it is used without a null check.</param>
/// <param name="removeRegionWithoutSequence">When true, regions whose sequence is still null or empty after the scan are dropped.</param>
/// <returns>The list of coverage regions.</returns>
public static List<CoverageRegion> GetTargetCoverageRegion(ITargetBuilderOptions options, IProgressCallback progress, bool removeRegionWithoutSequence = true)
{
    // The target file format is chosen by its ".xml" suffix; anything else is read as bed.
    List<CoverageRegion> regions = options.TargetFile.EndsWith(".xml")
        ? GetTargetCoverageRegionFromXml(options, progress)
        : GetTargetCoverageRegionFromBed(options, progress);

    var regionsBySeqname = regions.ToGroupDictionary(m => m.Seqname);

    progress.SetMessage("Filling sequence from {0}...", options.GenomeFastaFile);
    using (var reader = new StreamReader(options.GenomeFastaFile))
    {
        var fasta = new FastaFormat();
        Sequence chromosome;
        while ((chromosome = fasta.ReadSequence(reader)) != null)
        {
            progress.SetMessage("Processing chromosome {0} ...", chromosome.Reference);

            // Regions are looked up by the chromosome name with its "chr" prefix removed.
            var key = chromosome.Name.StringAfter("chr");

            List<CoverageRegion> chromosomeRegions;
            if (!regionsBySeqname.TryGetValue(key, out chromosomeRegions))
            {
                continue;
            }

            foreach (var region in chromosomeRegions)
            {
                // Start is treated as 1-based.
                region.Sequence = chromosome.SeqString.Substring((int)(region.Start - 1), (int)region.Length);

                // NOTE(review): the reverse complement is stored for '+' strand regions only - confirm this is intended.
                if (region.Strand == '+')
                {
                    region.ReverseComplementedSequence = SequenceUtils.GetReverseComplementedSequence(region.Sequence);
                }
            }
        }
    }

    if (removeRegionWithoutSequence)
    {
        regions.RemoveAll(l => string.IsNullOrEmpty(l.Sequence));
    }

    progress.SetMessage("Filling sequence finished.");

    var namemap = new MapReader(1, 12).ReadFromFile(options.RefgeneFile);
    foreach (var region in regions)
    {
        // The lookup key is the part of the region name before "_utr3".
        var gene = region.Name.StringBefore("_utr3");
        region.GeneSymbol = namemap.ContainsKey(gene) ? namemap[gene] : string.Empty;
    }

    return regions;
}
/// <summary>
/// Collects the access numbers of all fasta entries whose reference line matches a contamination pattern.
/// </summary>
/// <param name="acParser">Extracts the access number from a sequence name.</param>
/// <param name="fastaFilename">Path of the fasta database.</param>
/// <param name="contaminationDescriptionPattern">Regular expression, matched case-insensitively against <c>Sequence.Reference</c>.</param>
/// <param name="progress">Progress sink; an <c>EmptyProgressCallback</c> is used when null.</param>
/// <returns>The set of access numbers of the matching entries.</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested through <paramref name="progress"/>.</exception>
public static HashSet<string> GetContaminationAccessNumbers(IStringParser<string> acParser, string fastaFilename, string contaminationDescriptionPattern, IProgressCallback progress)
{
    var contaminants = new HashSet<string>();

    progress = progress ?? new EmptyProgressCallback();

    var pattern = new Regex(contaminationDescriptionPattern, RegexOptions.IgnoreCase);

    progress.SetMessage("Get contamination map from database ...");

    var fasta = new FastaFormat();
    using (var reader = new StreamReader(fastaFilename))
    {
        progress.SetRange(1, reader.BaseStream.Length);

        for (Sequence entry = fasta.ReadSequence(reader); entry != null; entry = fasta.ReadSequence(reader))
        {
            if (progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            progress.SetPosition(reader.GetCharpos());

            // The access number is parsed for every entry, before the pattern is tested.
            string accessNumber = acParser.GetValue(entry.Name);
            if (pattern.IsMatch(entry.Reference))
            {
                contaminants.Add(accessNumber);
            }
        }
    }

    progress.SetMessage("Get contamination map from database finished.");

    return contaminants;
}
/// <summary>
/// Reads every sequence of a fasta file and writes it (unless <c>options.ReversedOnly</c>) together
/// with its reversed counterpart to the given writer.
/// </summary>
/// <param name="index">Running counter; incremented once per sequence and passed to <c>GetReversedSequence</c>.</param>
/// <param name="sw">Writer receiving the output fasta entries.</param>
/// <param name="fastaFile">Path of the fasta file to read.</param>
/// <param name="isContaminant">When true, references not already starting with "CON_" are prefixed with it.</param>
private void ProcessFile(ref int index, StreamWriter sw, string fastaFile, bool isContaminant)
{
    var fasta = new FastaFormat();
    using (var reader = new StreamReader(fastaFile))
    {
        Progress.SetRange(0, reader.BaseStream.Length);

        for (Sequence entry = fasta.ReadSequence(reader); entry != null; entry = fasta.ReadSequence(reader))
        {
            Progress.SetPosition(reader.BaseStream.Position);

            if (isContaminant && !entry.Reference.StartsWith("CON_"))
            {
                entry.Reference = "CON_" + entry.Reference;
            }

            // The forward entry is written before the optional pseudo amino acid step.
            if (!options.ReversedOnly)
            {
                fasta.WriteSequence(sw, entry);
            }

            // The pseudo amino acid builder runs on the entry before the reversed entry is generated from it.
            if (options.IsPseudoAminoacid)
            {
                options.PseudoAminoacidBuilder.Build(entry);
            }

            index += 1;
            fasta.WriteSequence(sw, GetReversedSequence(index, entry));
        }
    }
}
/// <summary>
/// Writes a copy of a fasta file in which a leading "M" is removed from every sequence that starts with one.
/// </summary>
/// <param name="fileName">Path of the fasta file to read.</param>
/// <returns>A single-element array holding the output path (the input path with extension ".dM.fasta").</returns>
public override IEnumerable<string> Process(string fileName)
{
    var fasta = new FastaFormat();
    var outputFile = Path.ChangeExtension(fileName, ".dM.fasta");

    using (var reader = new StreamReader(fileName))
    using (var writer = new StreamWriter(outputFile))
    {
        Progress.SetRange(1, reader.BaseStream.Length);

        for (Sequence entry = fasta.ReadSequence(reader); entry != null; entry = fasta.ReadSequence(reader))
        {
            Progress.SetPosition(StreamUtils.GetCharpos(reader));

            bool startsWithM = entry.SeqString.StartsWith("M");
            if (startsWithM)
            {
                // Drop the first residue, then mark the change between name and description.
                entry.SeqString = entry.SeqString.Substring(1);
                entry.Reference = entry.Name + " N-terminal-M-Removed " + entry.Description;
            }

            // Every entry is written, modified or not.
            fasta.WriteSequence(writer, entry);
        }
    }

    return new string[] { outputFile };
}
/// <summary>
/// Builds the small RNA coordinate files from the configured miRBase, UCSC tRNA, Ensembl gtf and
/// rRNA inputs, and extracts the matching sequences.
/// </summary>
/// <remarks>
/// Files written, all derived from <c>options.OutputFile</c>: the option dump (".param"), the bed
/// file itself, a miRNA-only bed (".miRNA.bed"), the ".miss1" and ".miss0" subsets, a biotype
/// summary (".info"), optionally a combined ".fasta", and the extracted sequences (".fa").
/// </remarks>
/// <returns>
/// <c>options.OutputFile</c>, followed by the combined fasta path when mature tRNA or rRNA
/// sequences were appended to the genome.
/// </returns>
public override IEnumerable<string> Process()
{
    var paramFile = options.OutputFile + ".param";
    options.SaveToFile(paramFile);

    var bedfile = new BedItemFile<BedItem>(6);

    Progress.SetMessage("building chromosome name map ...");

    var mitoName = "M";
    Dictionary<string, string> chrNameMap = new Dictionary<string, string>();
    var ff = new FastaFormat(int.MaxValue);

    // Prefer the fasta index (first tab-separated column = sequence name); otherwise scan the fasta file.
    var faiFile = options.FastaFile + ".fai";
    if (File.Exists(faiFile))
    {
        using (StreamReader sr = new StreamReader(faiFile))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                RegisterChromosomeName(chrNameMap, line.Split('\t')[0], ref mitoName);
            }
        }
    }
    else
    {
        using (StreamReader sr = new StreamReader(options.FastaFile))
        {
            Sequence seq;
            while ((seq = ff.ReadSequence(sr)) != null)
            {
                RegisterChromosomeName(chrNameMap, seq.Name, ref mitoName);
            }
        }
    }

    // A genome without M/MT has no entry for mitoName; fall back to the short name
    // instead of failing with a KeyNotFoundException.
    string longMitoName;
    if (!chrNameMap.TryGetValue(mitoName, out longMitoName))
    {
        longMitoName = mitoName;
    }
    Progress.SetMessage("mitochondral chromosome name = {0}", longMitoName);

    var mirnas = new List<BedItem>();
    if (File.Exists(options.MiRBaseFile))
    {
        Progress.SetMessage("Processing {0} ...", options.MiRBaseFile);

        if (options.MiRBaseFile.EndsWith(".bed"))
        {
            mirnas = bedfile.ReadFromFile(options.MiRBaseFile);
            mirnas.ForEach(m =>
            {
                m.Seqname = m.Seqname.StringAfter("chr");
                m.Name = options.MiRBaseKey + ":" + m.Name;
            });
        }
        else
        {
            using (var gf = new GtfItemFile(options.MiRBaseFile))
            {
                GtfItem item;
                while ((item = gf.Next(options.MiRBaseKey)) != null)
                {
                    BedItem loc = new BedItem();
                    loc.Seqname = item.Seqname.StringAfter("chr");
                    // gtf start is converted to the 0-based bed start.
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    loc.Name = options.MiRBaseKey + ":" + item.Attributes.StringAfter("Name=").StringBefore(";");
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    mirnas.Add(loc);
                }
            }
        }

        Progress.SetMessage("{0} miRNA readed.", mirnas.Count);
    }

    List<BedItem> trnas = new List<BedItem>();
    if (File.Exists(options.UcscTrnaFile))
    {
        // reading tRNA from the ucsc table, without mitochondrial tRNA
        Progress.SetMessage("Processing {0} ...", options.UcscTrnaFile);

        trnas = bedfile.ReadFromFile(options.UcscTrnaFile);
        trnas.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));

        var removed = trnas.Where(m => (m.Seqname.Length > 1) && !m.Seqname.All(n => char.IsDigit(n))).ToList();
        // Filter only when that would not discard every entry.
        if (removed.Count != trnas.Count)
        {
            // remove the tRNA not from 1-22, X and Y
            trnas.RemoveAll(m => (m.Seqname.Length > 1) && !m.Seqname.All(n => char.IsDigit(n)));

            // mitochondrial tRNA will be extracted from the ensembl gtf file
            trnas.RemoveAll(m => m.Seqname.Equals("M") || m.Seqname.Equals("MT"));
        }

        trnas.ForEach(m => m.Name = GetTRNAName(m.Name));

        Progress.SetMessage("{0} tRNA from ucsc readed.", trnas.Count);

        if (File.Exists(options.UcscMatureTrnaFastaFile))
        {
            // Each mature tRNA becomes its own "chromosome": coordinates span the whole sequence.
            var seqs = SequenceUtils.Read(options.UcscMatureTrnaFastaFile);
            foreach (var seq in seqs)
            {
                var tRNAName = GetTRNAName(seq.Name);
                trnas.Add(new BedItem()
                {
                    Seqname = seq.Name,
                    Start = 0,
                    End = seq.SeqString.Length,
                    Strand = '+',
                    Name = tRNAName,
                    Sequence = seq.SeqString
                });
            }
        }
    }

    var others = new List<BedItem>();
    if (File.Exists(options.EnsemblGtfFile))
    {
        // reading smallRNA/tRNA from the ensembl gtf file
        Progress.SetMessage("Processing {0} ...", options.EnsemblGtfFile);

        using (var gf = new GtfItemFile(options.EnsemblGtfFile))
        {
            var biotypes = new HashSet<string>(SmallRNAConsts.Biotypes);
            biotypes.Remove(SmallRNAConsts.miRNA);

            GtfItem item;
            int count = 0;
            while ((item = gf.Next("gene")) != null)
            {
                string biotype;
                if (item.Attributes.Contains("gene_biotype"))
                {
                    biotype = item.Attributes.StringAfter("gene_biotype \"").StringBefore("\"");
                }
                else if (item.Attributes.Contains("gene_type"))
                {
                    biotype = item.Attributes.StringAfter("gene_type \"").StringBefore("\"");
                }
                else
                {
                    continue;
                }

                // tRNA comes from the ucsc table when that file is available.
                if (File.Exists(options.UcscTrnaFile) && biotype.Equals(SmallRNAConsts.tRNA))
                {
                    continue;
                }

                if (biotype.Equals("Mt_tRNA"))
                {
                    count++;
                    var gene_name = item.Attributes.Contains("gene_name") ? item.Attributes.StringAfter("gene_name \"").StringBefore("\"") : item.GeneId;
                    BedItem loc = new BedItem();
                    loc.Seqname = mitoName;
                    loc.Start = item.Start - 1;
                    loc.End = item.End;
                    // The chromosome name is passed as an argument so that it is never parsed as format text.
                    loc.Name = string.Format("{0}:{1}.tRNA{2}-{3}", SmallRNAConsts.mt_tRNA, longMitoName, count, gene_name.StringAfter("-"));
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    trnas.Add(loc);
                }
                else if (biotypes.Contains(biotype))
                {
                    string seqName;
                    if (item.Seqname.ToLower().StartsWith("chr"))
                    {
                        seqName = item.Seqname.Substring(3);
                    }
                    else
                    {
                        seqName = item.Seqname;
                    }
                    if (seqName.Equals("M") || seqName.Equals("MT"))
                    {
                        seqName = mitoName;
                    }

                    //ignore all smallRNA coordinates on scaffold or contig.
                    //if (seqName.Length > 5)
                    //{
                    //  continue;
                    //}

                    var gene_name = item.Attributes.StringAfter("gene_name \"").StringBefore("\"");
                    var lowGeneName = gene_name.ToLower();
                    if (lowGeneName.StartsWith("rny") || lowGeneName.Equals("y_rna"))
                    {
                        biotype = "yRNA";
                    }

                    BedItem loc = new BedItem();
                    loc.Seqname = seqName;
                    loc.Start = item.Start - 1;
                    loc.End = item.End;

                    //if (lowGeneName.EndsWith("_rrna") && loc.Length < 200)
                    //{
                    //  biotype = "rRNA";
                    //}

                    loc.Name = biotype + ":" + gene_name + ":" + item.GeneId;
                    loc.Score = 1000;
                    loc.Strand = item.Strand;
                    others.Add(loc);
                }
            }
        }
    }

    var all = new List<BedItem>();
    all.AddRange(mirnas);
    all.AddRange(trnas);
    all.AddRange(others);

    // Translate every sequence name to the spelling used by the genome fasta file.
    foreach (var bi in all)
    {
        string genomeName;
        if (chrNameMap.TryGetValue(bi.Seqname, out genomeName))
        {
            bi.Seqname = genomeName;
        }
    }

    if (File.Exists(options.RRNAFile))
    {
        var seqs = SequenceUtils.Read(options.RRNAFile);
        foreach (var seq in seqs)
        {
            all.Add(new BedItem()
            {
                Seqname = seq.Name,
                Start = 0,
                End = seq.SeqString.Length,
                Strand = '+',
                Name = "rRNA:" + SmallRNAConsts.rRNADB_KEY + seq.Name
            });
        }
    }

    Progress.SetMessage("Saving smallRNA coordinates to " + options.OutputFile + "...");
    using (var sw = new StreamWriter(options.OutputFile))
    {
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
            var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
            WriteSortedLocations(sw, bedfile, pir, locs);
        }
    }

    var miRNA_bed = FileUtils.ChangeExtension(options.OutputFile, ".miRNA.bed");
    Progress.SetMessage("Saving miRNA coordinates to " + miRNA_bed + "...");
    using (var sw = new StreamWriter(miRNA_bed))
    {
        var locs = all.Where(m => m.Name.StartsWith(SmallRNAConsts.miRNA)).ToList();
        WriteSortedLocations(sw, bedfile, SmallRNAConsts.miRNA, locs);
    }

    // ".miss1": every biotype except lincRNA/lncRNA, without the entries from the rRNA database file.
    Progress.SetMessage("Saving smallRNA miss1 coordinates to " + options.OutputFile + ".miss1 ...");
    using (var sw = new StreamWriter(options.OutputFile + ".miss1"))
    {
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
            if (pir == SmallRNABiotype.lincRNA.ToString() || pir == SmallRNABiotype.lncRNA.ToString())
            {
                continue;
            }
            var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
            locs.RemoveAll(l => l.Name.Contains(SmallRNAConsts.rRNADB_KEY));
            WriteSortedLocations(sw, bedfile, pir, locs);
        }
    }

    // ".miss0": lincRNA, lncRNA and only those rRNA entries that come from the rRNA database file.
    Progress.SetMessage("Saving smallRNA miss0 coordinates to " + options.OutputFile + ".miss0 ...");
    using (var sw = new StreamWriter(options.OutputFile + ".miss0"))
    {
        foreach (var pir in SmallRNAConsts.Biotypes)
        {
            if (pir != SmallRNABiotype.lincRNA.ToString() && pir != SmallRNABiotype.lncRNA.ToString() && pir != SmallRNABiotype.rRNA.ToString())
            {
                continue;
            }
            var locs = all.Where(m => m.Name.StartsWith(pir)).ToList();
            if (pir == SmallRNABiotype.rRNA.ToString())
            {
                locs.RemoveAll(l => !l.Name.Contains(SmallRNAConsts.rRNADB_KEY));
            }
            WriteSortedLocations(sw, bedfile, pir, locs);
        }
    }

    var summaryFile = options.OutputFile + ".info";
    Progress.SetMessage("Writing summary to " + summaryFile + "...");
    using (var sw = new StreamWriter(summaryFile))
    {
        sw.WriteLine("Biotype\tCount");

        // Count distinct names per biotype (the text before the first ":"), largest group first.
        all.ConvertAll(m => m.Name).Distinct().GroupBy(m => m.StringBefore(":")).OrderByDescending(m => m.Count()).ToList().ForEach(m => sw.WriteLine("{0}\t{1}", m.Key, m.Count()));
    }

    var result = new List<string>(new[] { options.OutputFile });

    // The combined fasta = genome + mature tRNA sequences (if used) + rRNA sequences (if present).
    var fasta = Path.ChangeExtension(options.OutputFile, ".fasta");
    if ((File.Exists(options.UcscTrnaFile) && File.Exists(options.UcscMatureTrnaFastaFile)) || File.Exists(options.RRNAFile))
    {
        result.Add(fasta);
        using (var sw = new StreamWriter(fasta))
        {
            AppendFileLines(sw, options.FastaFile);

            if (File.Exists(options.UcscTrnaFile) && File.Exists(options.UcscMatureTrnaFastaFile))
            {
                AppendFileLines(sw, options.UcscMatureTrnaFastaFile);
            }

            if (File.Exists(options.RRNAFile))
            {
                AppendFileLines(sw, options.RRNAFile);
            }
        }
    }

    var faFile = options.OutputFile + ".fa";
    Progress.SetMessage("Extracting sequence from " + options.FastaFile + "...");
    var b2foptions = new Bed2FastaProcessorOptions()
    {
        GenomeFastaFile = options.FastaFile,
        InputFile = options.OutputFile,
        OutputFile = faFile,
        KeepChrInName = false,
    };

    // When mature tRNA sequences are supplied they are appended below, so tRNA is not extracted from the genome.
    if (!File.Exists(options.UcscMatureTrnaFastaFile))
    {
        b2foptions.AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.mt_tRNA) || m.StartsWith(SmallRNAConsts.tRNA);
    }
    else
    {
        b2foptions.AcceptName = m => m.StartsWith(SmallRNAConsts.miRNA) || m.StartsWith(SmallRNAConsts.mt_tRNA);
    }

    new Bed2FastaProcessor(b2foptions)
    {
        Progress = this.Progress
    }.Process();

    if (File.Exists(options.UcscMatureTrnaFastaFile))
    {
        Progress.SetMessage("Extracting sequence from " + options.UcscMatureTrnaFastaFile + " ...");

        // Append (second constructor argument = true) the tRNA entries that carry their own sequence.
        using (var sw = new StreamWriter(faFile, true))
        {
            foreach (var tRNA in trnas)
            {
                if (!string.IsNullOrEmpty(tRNA.Sequence))
                {
                    sw.WriteLine(">{0}", tRNA.Name);
                    sw.WriteLine("{0}", tRNA.Sequence);
                }
            }
        }
    }

    return result;
}

/// <summary>
/// Registers a genome sequence name under itself and under its "chr"-prefixed / "chr"-stripped
/// alias, and updates the mitochondrial short name when the name is one of chrM/M/chrMT/MT.
/// </summary>
private static void RegisterChromosomeName(Dictionary<string, string> chrNameMap, string name, ref string mitoName)
{
    chrNameMap[name] = name;

    if (name.StartsWith("chr"))
    {
        chrNameMap[name.StringAfter("chr")] = name;
    }
    else
    {
        chrNameMap["chr" + name] = name;
    }

    if (name.Equals("chrMT") || name.Equals("MT"))
    {
        mitoName = "MT";
    }
    if (name.Equals("chrM") || name.Equals("M"))
    {
        mitoName = "M";
    }
}

/// <summary>
/// Reports the entry count for a biotype, sorts the entries by chromosome and start, and writes
/// them as bed lines.
/// </summary>
private void WriteSortedLocations(StreamWriter sw, BedItemFile<BedItem> bedfile, string biotype, List<BedItem> locs)
{
    Progress.SetMessage("{0} : {1}", biotype, locs.Count);

    GenomeUtils.SortChromosome(locs, m => m.Seqname, m => (int)m.Start);

    foreach (var loc in locs)
    {
        sw.WriteLine(bedfile.GetValue(loc));
    }
}

/// <summary>
/// Copies a text file line by line to the writer.
/// </summary>
private static void AppendFileLines(StreamWriter sw, string path)
{
    using (var sr = new StreamReader(path))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            sw.WriteLine(line);
        }
    }
}
/// <summary>
/// Digests every protein of a fasta file with trypsin (one missed cleavage allowed), keeps the
/// peptides that pass the <c>NGlycanFilter</c>, and writes one line per distinct peptide.
/// </summary>
/// <remarks>
/// Output line format: peptide, tab, a 0/1 string with one flag per residue taken from
/// <c>NGlycanFilter.IsNglycan</c>, tab, the parsed protein names joined by " ! ".
/// </remarks>
/// <param name="filename">Path of the fasta file to digest.</param>
/// <returns>A single-element array holding the output path (<c>filename + ".nglycan"</c>).</returns>
public override IEnumerable<string> Process(string filename)
{
    var fasta = new FastaFormat();

    var digest = new Digest();
    digest.DigestProtease = ProteaseManager.FindOrCreateProtease("Trypsin", true, "RK", "P");
    digest.MaxMissedCleavages = 1;

    var glycanFilter = new NGlycanFilter();
    digest.Filter = glycanFilter;

    string outputFile = filename + ".nglycan";

    var valuesByPeptide = new Dictionary<string, NGlycanValue>();
    using (var reader = new StreamReader(filename))
    {
        for (Sequence protein = fasta.ReadSequence(reader); protein != null; protein = fasta.ReadSequence(reader))
        {
            digest.ProteinSequence = protein;
            digest.AddDigestFeatures();

            if (!protein.Annotation.ContainsKey(Digest.PEPTIDE_FEATURE_TYPE))
            {
                continue;
            }

            // The per-residue flags are read from the filter after the digestion of this protein.
            bool[] siteFlags = glycanFilter.IsNglycan;
            var peptides = (List<DigestPeptideInfo>)protein.Annotation[Digest.PEPTIDE_FEATURE_TYPE];

            foreach (DigestPeptideInfo peptide in peptides)
            {
                NGlycanValue entry;
                if (!valuesByPeptide.TryGetValue(peptide.PeptideSeq, out entry))
                {
                    // First occurrence: record one 0/1 flag per residue (PeptideLoc.Min is used as a 1-based start).
                    var sites = new StringBuilder();
                    for (int i = 0; i < peptide.PeptideSeq.Length; i++)
                    {
                        sites.Append(siteFlags[peptide.PeptideLoc.Min - 1 + i] ? '1' : '0');
                    }

                    entry = new NGlycanValue();
                    entry.NGlycanSites = sites.ToString();
                    valuesByPeptide[peptide.PeptideSeq] = entry;
                }

                entry.Proteins.Add(parser.GetValue(peptide.ProteinName));
            }
        }
    }

    var sortedPeptides = new List<string>(valuesByPeptide.Keys);
    sortedPeptides.Sort();

    using (var writer = new StreamWriter(outputFile))
    {
        foreach (string peptide in sortedPeptides)
        {
            NGlycanValue entry = valuesByPeptide[peptide];
            writer.Write(peptide + "\t" + entry.NGlycanSites + "\t");

            // Proteins are separated by " ! "; no separator precedes the first one.
            string separator = "";
            foreach (string protein in entry.Proteins)
            {
                writer.Write(separator + protein);
                separator = " ! ";
            }

            writer.WriteLine();
        }
    }

    return new[] { outputFile };
}
/// <summary>
/// Fills name, description and sequence of every identified protein from a fasta database.
/// </summary>
/// <param name="acParser">Extracts the access number from a protein or sequence name.</param>
/// <param name="fastaFilename">Path of the fasta database.</param>
/// <param name="t">Identified result whose protein groups are updated in place.</param>
/// <param name="progress">Progress sink; an <c>EmptyProgressCallback</c> is used when null.</param>
/// <exception cref="Exception">
/// Two proteins share an access number or name, or a protein whose name does not start with
/// "XXX_" has no sequence in the database.
/// </exception>
/// <exception cref="UserTerminatedException">Cancellation was requested through <paramref name="progress"/>.</exception>
public static void FillSequenceFromFasta(IStringParser<string> acParser, string fastaFilename, IIdentifiedResult t, IProgressCallback progress)
{
    progress = progress ?? new EmptyProgressCallback();

    progress.SetMessage("Initializing accessNumber/protein map ...");

    // Each protein is indexed by its parsed access number and, when different, by its full name.
    var proteinsByKey = new Dictionary<string, IIdentifiedProtein>();
    foreach (IIdentifiedProteinGroup proteinGroup in t)
    {
        foreach (IIdentifiedProtein protein in proteinGroup)
        {
            string accessNumber = acParser.GetValue(protein.Name);

            if (proteinsByKey.ContainsKey(accessNumber))
            {
                throw new Exception("Duplicate access number " + accessNumber);
            }
            proteinsByKey[accessNumber] = protein;

            if (accessNumber == protein.Name)
            {
                continue;
            }

            if (proteinsByKey.ContainsKey(protein.Name))
            {
                throw new Exception("Duplicate access number " + protein.Name);
            }
            proteinsByKey[protein.Name] = protein;
        }
    }

    progress.SetMessage("Filling sequence from database ...");

    var fasta = new FastaFormat();
    using (var reader = new StreamReader(fastaFilename))
    {
        progress.SetRange(1, reader.BaseStream.Length);

        for (Sequence entry = fasta.ReadSequence(reader); entry != null; entry = fasta.ReadSequence(reader))
        {
            if (progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            progress.SetPosition(reader.BaseStream.Position);

            string accessNumber = acParser.GetValue(entry.Name);

            IIdentifiedProtein protein;
            if (!proteinsByKey.TryGetValue(accessNumber, out protein))
            {
                continue;
            }

            // "/" and tab are replaced by spaces in the stored name and description.
            protein.Name = entry.Name.Replace("/", " ");
            protein.Description = entry.Description.Replace("\t", " ").Replace("/", " ");
            protein.Sequence = entry.SeqString;
        }
    }

    // Proteins left without a sequence are tolerated only when all of them are named "XXX_...".
    var missing = proteinsByKey.Values.Where(l => l.Sequence == null).ToList();
    if (missing.Count > 0)
    {
        var missingNames = missing.ConvertAll(l => l.Name).ToArray();
        if (missingNames.Any(l => !l.StartsWith("XXX_")))
        {
            throw new Exception(string.Format("Couldn't find sequence of following protein(s), change access number pattern or select another database\n{0}", missingNames.Merge("/")));
        }
    }

    progress.SetMessage("Fill sequence from database finished.");
}
/// <summary>
/// Extracts from the fasta database the sequences whose access number is listed in a text file.
/// </summary>
/// <remarks>
/// Each non-blank line of the list is passed to <c>parser.TryParse</c>; when parsing fails the
/// line itself is used as the access number. Database entries are matched the same way through
/// their name.
/// </remarks>
/// <param name="fileName">Text file with one access number (or parsable name) per line.</param>
/// <returns>
/// The written fasta path (<c>fileName + ".fasta"</c>), followed by <c>fileName + ".miss"</c>
/// when at least one requested access number was not found in the database.
/// </returns>
public override IEnumerable<string> Process(string fileName)
{
    var result = new List<string>();

    var acs = new HashSet<string>();
    foreach (var acline in File.ReadAllLines(fileName))
    {
        // A blank line names no entry; without this guard it would be reported as a missing "" access number.
        if (acline.Trim().Length == 0)
        {
            continue;
        }

        string ac;
        if (!parser.TryParse(acline, out ac))
        {
            ac = acline;
        }
        acs.Add(ac);
    }

    var findAcs = new HashSet<string>();
    var resultFile = fileName + ".fasta";
    result.Add(resultFile);

    var ff = new FastaFormat();

    // The database is opened first so that a missing or unreadable database does not
    // create (or truncate) the result file.
    using (StreamReader sr = new StreamReader(database))
    using (StreamWriter sw = new StreamWriter(resultFile))
    {
        Progress.SetRange(0, sr.BaseStream.Length);

        Sequence seq;
        while ((seq = ff.ReadSequence(sr)) != null)
        {
            Progress.SetPosition(sr.BaseStream.Position);

            string curAc;
            if (!parser.TryParse(seq.Name, out curAc))
            {
                curAc = seq.Name;
            }

            if (acs.Contains(curAc))
            {
                findAcs.Add(curAc);
                if (this.replaceName)
                {
                    // Replace the whole reference line by the bare access number.
                    seq.Reference = curAc;
                }
                ff.WriteSequence(sw, seq);
            }
        }
    }

    // Whatever was requested but never seen in the database is reported in the ".miss" file.
    acs.ExceptWith(findAcs);

    var missFile = fileName + ".miss";
    if (acs.Count > 0)
    {
        using (StreamWriter sw = new StreamWriter(missFile))
        {
            foreach (var ac in acs)
            {
                sw.WriteLine(ac);
            }
        }
        result.Add(missFile);
    }
    else if (File.Exists(missFile))
    {
        // Remove a stale ".miss" file left by an earlier run.
        File.Delete(missFile);
    }

    return result;
}
/// <summary>
/// Reads a fasta file, digests every sequence with the configured proteases in turn, and writes
/// the distinct peptides that pass the amino acid, length and mass filters.
/// </summary>
/// <param name="fastaFile">Path of the fasta file to digest.</param>
/// <returns>A single-element array holding the output path (<c>fastaFile + ".pep"</c>).</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested through <c>Progress</c>.</exception>
public override IEnumerable<string> Process(string fastaFile)
{
    var allPeptides = new HashSet<string>();

    var fasta = new FastaFormat();
    using (var reader = new StreamReader(fastaFile))
    {
        Progress.SetRange(0, reader.BaseStream.Length);

        var aminoacids = new Aminoacids();

        // A peptide is accepted when it contains none of the ignored amino acids, is long enough,
        // and its mono-isotopic mass lies within [minMass, maxMass]; checks run in that order.
        Predicate<string> accept = peptide =>
        {
            foreach (var aa in ignoreAminoacids)
            {
                if (peptide.Contains(aa))
                {
                    return false;
                }
            }

            if (peptide.Length < minLength)
            {
                return false;
            }

            var mass = aminoacids.MonoPeptideMass(peptide);
            return mass >= minMass && mass <= maxMass;
        };

        // One digester per protease, applied in the order of the proteases collection.
        var digesters = new List<Digest>();
        foreach (var protease in proteases)
        {
            digesters.Add(new Digest { DigestProtease = protease, MaxMissedCleavages = maxMissCleavage });
        }

        Progress.SetMessage("Digesting sequences ...");

        for (Sequence entry = fasta.ReadSequence(reader); entry != null; entry = fasta.ReadSequence(reader))
        {
            Progress.SetPosition(reader.GetCharpos());
            if (Progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            // Start from the whole sequence; each protease digests the output of the previous one.
            var current = new HashSet<string>();
            current.Add(entry.SeqString);

            foreach (var digester in digesters)
            {
                var previous = current;
                current = new HashSet<string>();

                foreach (var fragment in previous)
                {
                    var fragmentSequence = new Sequence(fragment, fragment);
                    digester.ProteinSequence = fragmentSequence;
                    digester.AddDigestFeatures();

                    foreach (var info in fragmentSequence.GetDigestPeptideInfo())
                    {
                        if (accept(info.PeptideSeq))
                        {
                            current.Add(info.PeptideSeq);
                        }
                    }
                }
            }

            allPeptides.UnionWith(current);
        }
    }

    Progress.SetMessage("Sorting sequences ...");

    // Shorter peptides first; peptides of equal length are ordered by string comparison.
    var sorted = new List<string>(allPeptides);
    sorted.Sort((left, right) =>
    {
        int byLength = left.Length.CompareTo(right.Length);
        return byLength != 0 ? byLength : left.CompareTo(right);
    });

    var outputFile = fastaFile + ".pep";
    using (var writer = new StreamWriter(outputFile))
    {
        foreach (var peptide in sorted)
        {
            writer.WriteLine(peptide);
        }
    }

    return new[] { outputFile };
}
/// <summary>
/// Extracts the sequence of every accepted region of the input file from the genome fasta file
/// and writes each one as a fasta entry.
/// </summary>
/// <remarks>
/// Only the first region of each distinct name is kept. Regions on the '-' strand are reverse
/// complemented, and all extracted sequences are upper-cased.
/// </remarks>
/// <returns>A single-element array holding <c>options.OutputFile</c>.</returns>
/// <exception cref="Exception">A region extends past the end of its chromosome sequence.</exception>
public override IEnumerable<string> Process()
{
    var srItems = SequenceRegionUtils.GetSequenceRegions(options.InputFile).Where(m => options.AcceptName(m.Name)).ToList();

    // Keep only the first region for each distinct name.
    srItems = srItems.GroupBy(m => m.Name).Select(g => g.First()).ToList();

    // NOTE(review): this tests the region Name, not Seqname, for the "chr" prefix - confirm this is intended.
    var keepChrInName = options.KeepChrInName && srItems.Any(m => m.Name.StartsWith("chr"));
    if (!keepChrInName)
    {
        srItems.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));
    }

    var srMap = srItems.ToGroupDictionary(m => m.Seqname);

    var ff = new FastaFormat(int.MaxValue);

    using (StreamWriter sw = new StreamWriter(options.OutputFile))
    {
        using (StreamReader sr = new StreamReader(options.GenomeFastaFile))
        {
            Sequence seq;
            while ((seq = ff.ReadSequence(sr)) != null)
            {
                Progress.SetMessage("processing " + seq.Name + " ...");

                var name = seq.Name;
                if (!keepChrInName)
                {
                    name = name.StringAfter("chr");
                }

                List<GtfItem> items;
                if (!srMap.TryGetValue(name, out items))
                {
                    // The mitochondrial chromosome may be spelled M or MT; retry with the other spelling.
                    if (name.Equals("M"))
                    {
                        name = "MT";
                        srMap.TryGetValue(name, out items);
                    }
                    else if (name.Equals("chrM"))
                    {
                        name = "chrMT";
                        srMap.TryGetValue(name, out items);
                    }
                    else if (name.Equals("MT"))
                    {
                        name = "M";
                        srMap.TryGetValue(name, out items);
                    }
                    else if (name.Equals("chrMT"))
                    {
                        name = "chrM";
                        srMap.TryGetValue(name, out items);
                    }
                }

                if (items == null)
                {
                    continue;
                }

                Progress.SetMessage("  there are {0} entries in {1} ...", items.Count, name);
                foreach (var item in items)
                {
                    // Substring(start, length) is valid while start + length <= Length, so a region
                    // ending exactly on the last base is accepted; only a true overrun is rejected.
                    if (item.Start - 1 + item.Length > seq.SeqString.Length)
                    {
                        throw new Exception(string.Format("{0} exceed chromosome {1} length {2}", item, name, seq.SeqString.Length));
                    }

                    // Start is 1-based.
                    var newseq = seq.SeqString.Substring((int)item.Start - 1, (int)item.Length);
                    if (item.Strand == '-')
                    {
                        newseq = SequenceUtils.GetReverseComplementedSequence(newseq);
                    }
                    newseq = newseq.ToUpper();

                    var newname = string.Format("{0} {1} {2}", item.Name, item.GetLocationWithoutStrand(), item.Strand);
                    var entry = new Sequence(newname, newseq);

                    ff.WriteSequence(sw, entry);
                }
            }
        }
    }

    return new string[] { options.OutputFile };
}