/// <summary>
/// Writes a FASTA file that starts with the full text of the configured database file and is
/// followed by one entry for every single amino acid insertion into every distinct peptide
/// pure sequence read from the peptide file. When <c>options.GenerateReversedPeptide</c> is
/// set, each inserted entry is immediately followed by its reversed counterpart.
/// </summary>
/// <returns>A one-element array containing <c>options.OutputFile</c>.</returns>
public override IEnumerable<string> Process()
{
    var peptideFormat = new MascotPeptideTextFormat();

    Progress.SetMessage("reading peptide-spectra-matches from " + options.PeptideFile + " ...");
    var spectra = peptideFormat.ReadFromFile(options.PeptideFile);

    // One peptide per pure sequence; when a sequence occurs several times the last one read wins.
    var seqMap = new Dictionary<string, IIdentifiedPeptide>();
    foreach (var spectrum in spectra)
    {
        seqMap[spectrum.Peptide.PureSequence] = spectrum.Peptide;
    }

    // Candidate residues to insert: every visible amino acid except 'I'.
    var aas = new Aminoacids().GetVisibleAminoacids()
        .Where(c => c != 'I')
        .Select(c => c.ToString())
        .Merge("");

    var ff = new FastaFormat();

    Progress.SetMessage("inserting amino acid ...");
    using (var sw = new StreamWriter(options.OutputFile))
    {
        // The original database is copied verbatim in front of the generated entries.
        sw.WriteLine(File.ReadAllText(options.DatabaseFile));

        var orderedSequences = seqMap.Keys.OrderBy(m => m).ToArray();

        // Running counter used to name the reversed entries.
        var reversedIndex = 1000000;

        foreach (var peptideSequence in orderedSequences)
        {
            var peptide = seqMap[peptideSequence];

            // Insertion positions run from 0 to Length - 1, i.e. in front of each residue.
            for (int position = 0; position < peptideSequence.Length; position++)
            {
                foreach (var aminoacid in aas)
                {
                    var insertedSequence = peptideSequence.Insert(position, aminoacid.ToString());
                    var insertedReference = string.Format("INS_{0}_{1}{2} Insertion of {3}", peptideSequence, position, aminoacid, peptide.Proteins.Merge("/"));
                    ff.WriteSequence(sw, new Sequence(insertedReference, insertedSequence));

                    if (!options.GenerateReversedPeptide)
                    {
                        continue;
                    }

                    var reversedSequence = SequenceUtils.GetReversedSequence(insertedSequence);
                    var reversedReference = string.Format("REVERSED_{0}", reversedIndex++);
                    ff.WriteSequence(sw, new Sequence(reversedReference, reversedSequence));
                }
            }
        }
    }

    return new[] { options.OutputFile };
}
/// <summary>
/// Writes every protein of every accepted protein group to a FASTA file.
/// Nothing is written (and the file is not created) when any accepted, non-empty group
/// has a first protein whose sequence is null or empty.
/// </summary>
/// <param name="fastaFilename">Path of the FASTA file to create.</param>
/// <param name="t">Protein groups to export.</param>
/// <param name="validateGroup">Predicate deciding whether a group is exported.</param>
public static void WriteFastaFile(string fastaFilename, IList<IIdentifiedProteinGroup> t, Func<IIdentifiedProteinGroup, bool> validateGroup)
{
    // Pre-check: bail out before touching the file system if a sequence is missing.
    foreach (var candidate in t)
    {
        if (!validateGroup(candidate) || candidate.Count <= 0)
        {
            continue;
        }

        var firstSequence = candidate[0].Sequence;
        if (firstSequence == null || firstSequence.Length == 0)
        {
            return;
        }
    }

    var fastaFormat = new FastaFormat();
    using (var writer = new StreamWriter(fastaFilename))
    {
        foreach (IIdentifiedProteinGroup proteinGroup in t)
        {
            if (!validateGroup(proteinGroup))
            {
                continue;
            }

            foreach (IIdentifiedProtein protein in proteinGroup)
            {
                fastaFormat.WriteSequence(writer, protein.Reference, protein.Sequence);
            }
        }
    }
}
/// <summary>
/// Converts a file read through <c>DatFormat</c> into a FASTA file written next to it
/// with the extension changed to ".fasta".
/// </summary>
/// <param name="fileName">Path of the input file.</param>
/// <returns>A one-element array containing the path of the FASTA file.</returns>
/// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
public override IEnumerable<string> Process(string fileName)
{
    var datReader = new DatFormat();
    var fastaWriter = new FastaFormat();

    var fastaFile = FileUtils.ChangeExtension(fileName, ".fasta");
    var totalLength = new FileInfo(fileName).Length;

    using (var input = new StreamReader(fileName))
    {
        using (var output = new StreamWriter(fastaFile))
        {
            // Progress is reported against the size of the input file.
            Progress.SetRange(0, totalLength);

            for (Sequence entry = datReader.ReadSequence(input); entry != null; entry = datReader.ReadSequence(input))
            {
                if (Progress.IsCancellationPending())
                {
                    throw new UserTerminatedException();
                }

                Progress.SetPosition(input.GetCharpos());
                fastaWriter.WriteSequence(output, entry);
            }
        }
    }

    return new[] { fastaFile };
}
/// <summary>
/// Copies the FASTA entries whose name matches <c>nameRegex</c> into a new file named
/// after the input file, with "_" + <c>name</c> inserted in front of the original extension.
/// </summary>
/// <param name="fileName">Path of the FASTA file to filter.</param>
/// <returns>A one-element array containing the path of the filtered file.</returns>
/// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
public override IEnumerable<string> Process(string fileName)
{
    var targetFile = FileUtils.ChangeExtension(fileName, "") + "_" + name + new FileInfo(fileName).Extension;
    var fastaFormat = new FastaFormat();

    Progress.SetMessage("Processing " + fileName);

    using (var input = new StreamReader(fileName))
    {
        Progress.SetRange(0, input.BaseStream.Length);

        using (var output = new StreamWriter(targetFile))
        {
            for (Sequence entry = fastaFormat.ReadSequence(input); entry != null; entry = fastaFormat.ReadSequence(input))
            {
                if (Progress.IsCancellationPending())
                {
                    throw new UserTerminatedException();
                }

                Progress.SetPosition(input.BaseStream.Position);

                // Entries whose name does not match are dropped.
                if (!nameRegex.Match(entry.Name).Success)
                {
                    continue;
                }

                fastaFormat.WriteSequence(output, entry);
            }
        }
    }

    return new[] { targetFile };
}
/// <summary>
/// Writes the sequence identified by <paramref name="protein"/> to the FASTA writer.
/// Does nothing when <paramref name="swFasta"/> is null.
/// </summary>
/// <param name="swFasta">Destination writer; may be null, in which case the call is a no-op.</param>
/// <param name="seqMap">Lookup from sequence key to sequence.</param>
/// <param name="protein">
/// Whitespace-separated text; its second token is used as the key into <paramref name="seqMap"/>.
/// </param>
private void WriteFasta(StreamWriter swFasta, Dictionary<string, Sequence> seqMap, string protein)
{
    if (swFasta == null)
    {
        return;
    }

    // The key is the second whitespace-delimited token of the protein text.
    string[] tokens = Regex.Split(protein, @"\s+");
    string key = tokens[1].Trim();

    ff.WriteSequence(swFasta, seqMap[key]);
}
/// <summary>
/// Streams one FASTA file into <paramref name="sw"/>, writing each entry (unless
/// <c>options.ReversedOnly</c> is set) followed by the entry produced by
/// <c>GetReversedSequence</c>.
/// </summary>
/// <param name="index">Running entry counter; incremented once per entry read.</param>
/// <param name="sw">Destination writer shared across input files.</param>
/// <param name="fastaFile">Path of the FASTA file to read.</param>
/// <param name="isContaminant">
/// When true, references that do not already start with "CON_" get that prefix.
/// </param>
private void ProcessFile(ref int index, StreamWriter sw, string fastaFile, bool isContaminant)
{
    var fastaFormat = new FastaFormat();

    using (var reader = new StreamReader(fastaFile))
    {
        Progress.SetRange(0, reader.BaseStream.Length);

        while (true)
        {
            Sequence entry = fastaFormat.ReadSequence(reader);
            if (entry == null)
            {
                break;
            }

            Progress.SetPosition(reader.BaseStream.Position);

            if (isContaminant && !entry.Reference.StartsWith("CON_"))
            {
                entry.Reference = "CON_" + entry.Reference;
            }

            if (!options.ReversedOnly)
            {
                fastaFormat.WriteSequence(sw, entry);
            }

            // The builder runs after the forward entry has been written and before the
            // reversed entry is derived from it.
            if (options.IsPseudoAminoacid)
            {
                options.PseudoAminoacidBuilder.Build(entry);
            }

            index += 1;
            fastaFormat.WriteSequence(sw, GetReversedSequence(index, entry));
        }
    }
}
/// <summary>
/// Exports the protein sequences parsed from an MSF file to "&lt;fileName&gt;.fasta".
/// When the <c>peptides_decoy</c> table contains at least one row, every sequence is
/// followed by a reversed copy; otherwise only the parsed sequences are written.
/// </summary>
/// <param name="fileName">Path of the MSF (SQLite) file.</param>
/// <returns>A one-element array containing the path of the FASTA file.</returns>
public override IEnumerable<string> Process(string fileName)
{
    var parser = new MsfDatabaseParser(SearchEngineType.SEQUEST);
    var seqs = parser.ParseProteinSequences(fileName);

    SQLiteDBHelper sqlite = new SQLiteDBHelper(fileName);
    var result = new List<Sequence>();

    // The reader is disposed as soon as the count has been read so that it does not stay
    // open for the rest of the export.
    // NOTE(review): assumes ExecuteReader returns an IDisposable data reader - confirm.
    bool hasDecoy;
    using (var aaReader = sqlite.ExecuteReader("select count(*) from peptides_decoy", null))
    {
        hasDecoy = aaReader.Read() && aaReader.GetInt32(0) > 0;
    }

    if (hasDecoy)
    {
        // A decoy search was performed: emit each target sequence followed by its reversed copy.
        foreach (var seq in seqs)
        {
            result.Add(seq);

            var revseq = new Sequence(
                MsfDatabaseParser.GetReversedReference(seq.Reference),
                SequenceUtils.GetReversedSequence(seq.SeqString));
            result.Add(revseq);
        }
    }

    // No decoy rows (or no sequences at all): fall back to the parsed sequences as they are.
    if (result.Count == 0)
    {
        result = seqs;
    }

    var fastafile = fileName + ".fasta";
    using (var sw = new StreamWriter(fastafile))
    {
        var ff = new FastaFormat();
        foreach (var seq in result)
        {
            ff.WriteSequence(sw, seq);
        }
    }

    return new[] { fastafile };
}
/// <summary>
/// Copies a FASTA file to a sibling file with the extension ".dM.fasta", dropping the first
/// residue of every sequence that starts with "M" and rewriting the reference of those
/// entries to "&lt;name&gt; N-terminal-M-Removed &lt;description&gt;".
/// </summary>
/// <param name="fileName">Path of the FASTA file to process.</param>
/// <returns>A one-element array containing the path of the generated file.</returns>
public override IEnumerable<string> Process(string fileName)
{
    var fastaFormat = new FastaFormat();
    var targetFile = Path.ChangeExtension(fileName, ".dM.fasta");

    using (var input = new StreamReader(fileName))
    {
        using (var output = new StreamWriter(targetFile))
        {
            Progress.SetRange(1, input.BaseStream.Length);

            for (Sequence entry = fastaFormat.ReadSequence(input); entry != null; entry = fastaFormat.ReadSequence(input))
            {
                Progress.SetPosition(StreamUtils.GetCharpos(input));

                if (entry.SeqString.StartsWith("M"))
                {
                    // Sequence is trimmed first, then the reference is rebuilt from name and description.
                    entry.SeqString = entry.SeqString.Substring(1);
                    entry.Reference = entry.Name + " N-terminal-M-Removed " + entry.Description;
                }

                fastaFormat.WriteSequence(output, entry);
            }
        }
    }

    return new[] { targetFile };
}
/// <summary>
/// Extracts from <c>database</c> the FASTA entries whose accession appears in the given list
/// file and writes them to "&lt;fileName&gt;.fasta". Accessions that were not found are written
/// to "&lt;fileName&gt;.miss"; when none are missing, a stale ".miss" file is deleted.
/// </summary>
/// <param name="fileName">Path of a text file with one accession (or parsable line) per line.</param>
/// <returns>
/// The FASTA file path, followed by the ".miss" file path when at least one accession was not found.
/// </returns>
public override IEnumerable<string> Process(string fileName)
{
    var outputFiles = new List<string>();

    // Requested accessions: the parsed value when the parser accepts the line, else the raw line.
    var requested = new HashSet<string>();
    foreach (var line in File.ReadAllLines(fileName))
    {
        string parsed;
        requested.Add(parser.TryParse(line, out parsed) ? parsed : line);
    }

    var found = new HashSet<string>();

    var fastaFile = fileName + ".fasta";
    outputFiles.Add(fastaFile);

    var fastaFormat = new FastaFormat();
    using (var output = new StreamWriter(fastaFile))
    {
        using (var input = new StreamReader(database))
        {
            Progress.SetRange(0, input.BaseStream.Length);

            for (Sequence entry = fastaFormat.ReadSequence(input); entry != null; entry = fastaFormat.ReadSequence(input))
            {
                Progress.SetPosition(input.BaseStream.Position);

                string parsedName;
                string accession = parser.TryParse(entry.Name, out parsedName) ? parsedName : entry.Name;

                if (!requested.Contains(accession))
                {
                    continue;
                }

                found.Add(accession);

                if (this.replaceName)
                {
                    entry.Reference = accession;
                }

                fastaFormat.WriteSequence(output, entry);
            }
        }
    }

    // What is left in the requested set are the accessions missing from the database.
    requested.ExceptWith(found);

    var missFile = fileName + ".miss";
    if (requested.Count == 0)
    {
        if (File.Exists(missFile))
        {
            File.Delete(missFile);
        }

        return outputFiles;
    }

    using (var missWriter = new StreamWriter(missFile))
    {
        foreach (var missing in requested)
        {
            missWriter.WriteLine(missing);
        }
    }

    outputFiles.Add(missFile);
    return outputFiles;
}
/// <summary>
/// Extracts the sequence of every accepted region of the input file from the genome FASTA
/// file and writes each one as its own FASTA entry named
/// "&lt;name&gt; &lt;location&gt; &lt;strand&gt;". Regions on the '-' strand are reverse-complemented and
/// all extracted sequences are upper-cased.
/// </summary>
/// <returns>A one-element array containing <c>options.OutputFile</c>.</returns>
/// <exception cref="Exception">
/// Thrown when a region extends beyond the end of its chromosome sequence.
/// </exception>
public override IEnumerable<string> Process()
{
    var srItems = SequenceRegionUtils.GetSequenceRegions(options.InputFile).Where(m => options.AcceptName(m.Name)).ToList();

    // Keep only the first region for each distinct name.
    srItems = srItems.GroupBy(m => m.Name).Select(g => g.First()).ToList();

    // NOTE(review): the "chr" test below looks at the region Name while the prefix is stripped
    // from Seqname - confirm that Name (not Seqname) is the intended field.
    var keepChrInName = options.KeepChrInName && srItems.Any(m => m.Name.StartsWith("chr"));
    if (!keepChrInName)
    {
        srItems.ForEach(m => m.Seqname = m.Seqname.StringAfter("chr"));
    }

    var srMap = srItems.ToGroupDictionary(m => m.Seqname);

    var ff = new FastaFormat(int.MaxValue);
    using (StreamWriter sw = new StreamWriter(options.OutputFile))
    {
        using (StreamReader sr = new StreamReader(options.GenomeFastaFile))
        {
            Sequence seq;
            while ((seq = ff.ReadSequence(sr)) != null)
            {
                Progress.SetMessage("processing " + seq.Name + " ...");

                var name = seq.Name;
                if (!keepChrInName)
                {
                    name = name.StringAfter("chr");
                }

                List<GtfItem> items;
                if (!srMap.TryGetValue(name, out items))
                {
                    // The mitochondrial chromosome is looked up under its alternative
                    // spelling (M <-> MT, chrM <-> chrMT) when the first lookup fails.
                    if (name.Equals("M"))
                    {
                        name = "MT";
                        srMap.TryGetValue(name, out items);
                    }
                    else if (name.Equals("chrM"))
                    {
                        name = "chrMT";
                        srMap.TryGetValue(name, out items);
                    }
                    else if (name.Equals("MT"))
                    {
                        name = "M";
                        srMap.TryGetValue(name, out items);
                    }
                    else if (name.Equals("chrMT"))
                    {
                        name = "chrM";
                        srMap.TryGetValue(name, out items);
                    }
                }

                if (items == null)
                {
                    continue;
                }

                Progress.SetMessage(" there are {0} entries in {1} ...", items.Count, name);
                foreach (var item in items)
                {
                    // Start is used as a 1-based offset. A region is out of range only when it
                    // runs past the last base; one ending exactly on the last base is valid for
                    // Substring and must be accepted, hence '>' rather than '>='.
                    if (item.Start - 1 + item.Length > seq.SeqString.Length)
                    {
                        throw new Exception(string.Format("{0} exceed chromosome {1} length {2}", item, name, seq.SeqString.Length));
                    }

                    var newseq = seq.SeqString.Substring((int)item.Start - 1, (int)item.Length);
                    if (item.Strand == '-')
                    {
                        newseq = SequenceUtils.GetReverseComplementedSequence(newseq);
                    }

                    newseq = newseq.ToUpper();

                    var newname = string.Format("{0} {1} {2}", item.Name, item.GetLocationWithoutStrand(), item.Strand);
                    var entry = new Sequence(newname, newseq);
                    ff.WriteSequence(sw, entry);
                }
            }
        }
    }

    return new string[] { options.OutputFile };
}