/// <summary>
/// Tests whether the source allele pair (Allele1 and the first character of Allele2)
/// equals the G1000 allele pair, in either order, either directly or after complementing
/// both G1000 alleles.
/// </summary>
/// <returns>
/// false when G1000Allele1 is a blank character or when no combination matches; otherwise true.
/// </returns>
public bool IsSourceAllelesMatchedWithG1000()
{
    // A blank G1000Allele1 never matches.
    if (this.G1000Allele1 == ' ')
    {
        return false;
    }

    var complemented1 = SequenceUtils.GetComplementAllele(this.G1000Allele1);
    var complemented2 = SequenceUtils.GetComplementAllele(this.G1000Allele2);

    var source1 = Allele1;
    var source2 = Allele2[0];

    // Same strand: identical pair, in either order.
    bool matchesDirect =
        (source1 == this.G1000Allele1 && source2 == this.G1000Allele2) ||
        (source1 == this.G1000Allele2 && source2 == this.G1000Allele1);

    // Opposite strand: pair equals the complemented G1000 pair, in either order.
    bool matchesComplement =
        (source1 == complemented1 && source2 == complemented2) ||
        (source1 == complemented2 && source2 == complemented1);

    return matchesDirect || matchesComplement;
}
/// <summary>
/// Builds the decoy entry for <paramref name="seq"/> according to options.DecoyType.
/// </summary>
/// <param name="index">Index handed to SequenceUtils.GetReversedSequence when the decoy type is Index.</param>
/// <param name="seq">Sequence whose residues are reversed.</param>
/// <returns>
/// For DecoyType.Index, the result of SequenceUtils.GetReversedSequence(seq.SeqString, index).
/// Otherwise a new Sequence whose name carries options.DecoyKey and whose description is
/// options.DecoyKey followed by the original description.
/// </returns>
public Sequence GetReversedSequence(int index, Sequence seq)
{
    if (options.DecoyType == DecoyType.Index)
    {
        return SequenceUtils.GetReversedSequence(seq.SeqString, index);
    }

    var decoyDescription = options.DecoyKey + " " + seq.Description;
    var reversedResidues = SequenceUtils.GetReversedSequence(seq.SeqString);

    var prefix = string.Empty;
    var baseName = seq.Name;

    if (options.DecoyType == DecoyType.Middle)
    {
        // The decoy key is inserted after the first "|" if the name has one, otherwise after
        // the first ":". The code relies on StringAfter giving back a value equal to the name
        // when the delimiter is not found.
        var afterBar = seq.Name.StringAfter("|");
        if (!afterBar.Equals(seq.Name))
        {
            prefix = seq.Name.StringBefore("|") + "|";
            baseName = afterBar;
        }
        else
        {
            var afterColon = seq.Name.StringAfter(":");
            if (!afterColon.Equals(seq.Name))
            {
                prefix = seq.Name.StringBefore(":") + ":";
            }

            baseName = afterColon;
        }
    }

    var decoyName = prefix + options.DecoyKey + "_" + baseName;
    return new Sequence(decoyName + " " + decoyDescription, reversedResidues);
}
/// <summary>
/// Compares Allele1 and the first character of Allele2 with the dbSNP reference allele
/// and suggests the strand adjustment to apply.
/// </summary>
/// <returns>
/// None when Allele2[0] equals the reference; Switch when Allele1 does; Flip when the
/// complement of Allele2[0] does; FlipSwitch when the complement of Allele1 does;
/// otherwise Unknown. The checks are applied in that order.
/// </returns>
public StrandAction SuggestAction()
{
    if (this.Allele2[0] == this.DbsnpRefAllele)
    {
        return StrandAction.None;
    }

    if (this.Allele1 == this.DbsnpRefAllele)
    {
        return StrandAction.Switch;
    }

    if (SequenceUtils.GetComplementAllele(this.Allele2[0]) == this.DbsnpRefAllele)
    {
        return StrandAction.Flip;
    }

    if (SequenceUtils.GetComplementAllele(this.Allele1) == this.DbsnpRefAllele)
    {
        return StrandAction.FlipSwitch;
    }

    return StrandAction.Unknown;
}
/// <summary>
/// Reads identified spectra from <paramref name="fileName"/>, groups them (by peptide when
/// isSiteLevel is set, by unique peptide otherwise) and rewrites the description of the first
/// entry of every group using the access-number/reference map read from the fasta file.
/// </summary>
/// <param name="fileName">Peptide text file to read.</param>
/// <returns>The grouped result with rewritten descriptions.</returns>
protected override IIdentifiedResult GetIdentifiedResult(string fileName)
{
    format = new MascotPeptideTextFormat();
    var spectra = format.ReadFromFile(fileName);

    IIdentifiedResult result;
    if (!isSiteLevel)
    {
        result = IdentifiedSpectrumUtils.BuildGroupByUniquePeptide(spectra);
    }
    else
    {
        result = IdentifiedSpectrumUtils.BuildGroupByPeptide(spectra);
    }

    var referenceByAccess = SequenceUtils.ReadAccessNumberReferenceMap(new FastaFormat(), this.fastaFile, this.parser);

    foreach (var group in result)
    {
        // The description holds '/'-separated protein names; each one is converted to an
        // access number by the parser and looked up in the map (a missing key throws).
        var proteinNames = group[0].Description.Split('/');
        group[0].Description = proteinNames
            .Select(name => referenceByAccess[parser.GetValue(name)])
            .ToList()
            .Merge(" ! ");
    }

    return result;
}
/// <summary>
/// Builds a SingleNucleotidePolymorphism from the first match of the mismatch pattern in
/// MismatchPositions.
/// </summary>
/// <param name="querySequence">
/// Query sequence; it is reversed with SequenceUtils.GetReversedSequence when Strand is not '+'.
/// </param>
/// <returns>
/// null when NumberOfMismatch is 0 or the pattern does not match; otherwise a polymorphism made
/// of the parsed position, the (possibly complemented) character from the pattern and the
/// character found at that position in the (possibly reversed) query.
/// </returns>
public SingleNucleotidePolymorphism GetNotGsnapMismatch(string querySequence)
{
    if (this.NumberOfMismatch == 0)
    {
        return null;
    }

    var onPositiveStrand = this.Strand == '+';

    var match = mismatch.Match(this.MismatchPositions);
    if (!match.Success)
    {
        return null;
    }

    var oriented = querySequence;
    if (!onPositiveStrand)
    {
        oriented = SequenceUtils.GetReversedSequence(querySequence);
    }

    // Group 1 is the position, group 2 starts with the allele character.
    var position = int.Parse(match.Groups[1].Value);
    var detected = oriented[position];

    var allele = match.Groups[2].Value.First();
    if (!onPositiveStrand)
    {
        allele = SequenceUtils.GetComplementAllele(allele);
    }

    return new SingleNucleotidePolymorphism(position, allele, detected);
}
/// <summary>
/// Loads target coverage regions (from XML when options.TargetFile ends with ".xml", from BED
/// otherwise), fills each region's sequence from the genome fasta file and assigns gene symbols
/// from the refgene map.
/// </summary>
/// <param name="options">Supplies TargetFile, GenomeFastaFile and RefgeneFile.</param>
/// <param name="progress">Receives progress messages.</param>
/// <param name="removeRegionWithoutSequence">
/// When true, regions whose Sequence is null or empty after filling are removed.
/// </param>
/// <returns>The loaded regions.</returns>
public static List<CoverageRegion> GetTargetCoverageRegion(ITargetBuilderOptions options, IProgressCallback progress, bool removeRegionWithoutSequence = true)
{
    List<CoverageRegion> regions = options.TargetFile.EndsWith(".xml")
        ? GetTargetCoverageRegionFromXml(options, progress)
        : GetTargetCoverageRegionFromBed(options, progress);

    var regionsBySeqname = regions.ToGroupDictionary(m => m.Seqname);

    progress.SetMessage("Filling sequence from {0}...", options.GenomeFastaFile);
    using (var reader = new StreamReader(options.GenomeFastaFile))
    {
        var fasta = new FastaFormat();
        for (Sequence chromosome = fasta.ReadSequence(reader); chromosome != null; chromosome = fasta.ReadSequence(reader))
        {
            progress.SetMessage("Processing chromosome {0} ...", chromosome.Reference);

            // Regions are keyed by the text after "chr" in the fasta entry name.
            List<CoverageRegion> onChromosome;
            if (!regionsBySeqname.TryGetValue(chromosome.Name.StringAfter("chr"), out onChromosome))
            {
                continue;
            }

            foreach (var region in onChromosome)
            {
                // Start is shifted by one to obtain the zero-based substring offset.
                region.Sequence = chromosome.SeqString.Substring((int)(region.Start - 1), (int)region.Length);
                if (region.Strand == '+')
                {
                    region.ReverseComplementedSequence = SequenceUtils.GetReverseComplementedSequence(region.Sequence);
                }
            }
        }
    }

    if (removeRegionWithoutSequence)
    {
        regions.RemoveAll(l => string.IsNullOrEmpty(l.Sequence));
    }

    progress.SetMessage("Filling sequence finished.");

    var geneSymbols = new MapReader(1, 12).ReadFromFile(options.RefgeneFile);
    foreach (var region in regions)
    {
        // The lookup key is the region name up to "_utr3".
        var gene = region.Name.StringBefore("_utr3");
        region.GeneSymbol = geneSymbols.ContainsKey(gene) ? geneSymbols[gene] : string.Empty;
    }

    return regions;
}
/// <summary>
/// Evaluates the full probability of growing prefixes of the sequence under the model.
/// </summary>
/// <returns>
/// One value per prefix length step, 2*step, ... up to and including maxSize, each obtained
/// from Likelihood.FullProbability for the subsequence starting at 0 with that length argument.
/// </returns>
public List<double> compute()
{
    var probabilities = new List<double>();

    int length = step;
    while (length <= maxSize)
    {
        var prefix = SequenceUtils<int>.getSubsequence(sequence.Sequence, 0, length);
        var likelihood = new Likelihood(prefix, model.Model);
        double probability = likelihood.FullProbability(length);
        probabilities.Add(probability);

        length += step;
    }

    return probabilities;
}
/// <summary>
/// Reads the sequence regions from CoordinateFile, optionally fills miRNA/tRNA sequences from
/// FastaFile and converts the regions to FeatureLocation objects tagged with a biotype category.
/// </summary>
/// <returns>One FeatureLocation per region read from the coordinate file.</returns>
public virtual List<FeatureLocation> GetSequenceRegions()
{
    // Read the regions and strip a leading "chr" from the sequence name.
    var regions = SequenceRegionUtils.GetSequenceRegions(CoordinateFile);
    foreach (var region in regions)
    {
        if (region.Seqname.StartsWith("chr"))
        {
            region.Seqname = region.Seqname.StringAfter("chr");
        }
    }

    // Fill sequence information; only miRNA and tRNA entries are filled, all others are blanked.
    if (!string.IsNullOrEmpty(this.FastaFile))
    {
        Console.WriteLine("Reading sequence from {0} ...", this.FastaFile);
        var sequenceByName = SequenceUtils.Read(new FastaFormat(), this.FastaFile).ToDictionary(m => m.Name);

        foreach (var region in regions)
        {
            var isFilledType = region.Name.StartsWith(SmallRNAConsts.miRNA) || region.Name.StartsWith(SmallRNAConsts.tRNA);
            if (!isFilledType)
            {
                region.Sequence = string.Empty;
                continue;
            }

            if (sequenceByName.ContainsKey(region.Name))
            {
                region.Sequence = sequenceByName[region.Name].SeqString;
            }
            else
            {
                Console.WriteLine("Missing sequence: " + region.Name);
            }
        }

        sequenceByName.Clear();
    }

    var locations = regions.ConvertAll(m => new FeatureLocation(m)).ToList();
    foreach (var location in locations)
    {
        // No early exit: when several biotypes prefix the name, the last one in the list wins.
        foreach (var biotype in SmallRNAConsts.Biotypes)
        {
            if (location.Name.StartsWith(biotype))
            {
                location.Category = biotype;
            }
        }
    }

    return locations;
}
/// <summary>
/// Fills the sequence of every exon and the direct expected sequence of this item from
/// <paramref name="seq"/>.
/// </summary>
/// <param name="seq">Source sequence the substrings are taken from.</param>
public void FillSequence(Sequence seq)
{
    // Each matched exon fills its own sequence using this item's strand.
    foreach (var exon in exons)
    {
        exon.FillSequence(seq, this.Strand);
    }

    // Direct sequence: the inclusive range ExpectStart..ExpectEnd, upper-cased.
    var length = (int)(this.ExpectEnd - this.ExpectStart + 1);
    this.DirectExpectSequence = seq.SeqString.Substring((int)this.ExpectStart, length).ToUpper();

    if (this.Strand != '-')
    {
        return;
    }

    this.DirectExpectSequence = SequenceUtils.GetReverseComplementedSequence(this.DirectExpectSequence);
}
/// <summary>
/// Writes a fasta database consisting of the original database followed by every
/// single-amino-acid insertion variant of the identified peptide sequences, optionally each
/// followed by its reversed counterpart.
/// </summary>
/// <returns>An array containing options.OutputFile.</returns>
public override IEnumerable<string> Process()
{
    var peptideFormat = new MascotPeptideTextFormat();

    Progress.SetMessage("reading peptide-spectra-matches from " + options.PeptideFile + " ...");
    var spectra = peptideFormat.ReadFromFile(options.PeptideFile);

    // One peptide per pure sequence; a later spectrum overwrites an earlier one.
    var peptideBySequence = new Dictionary<string, IIdentifiedPeptide>();
    foreach (var spectrum in spectra)
    {
        peptideBySequence[spectrum.Peptide.PureSequence] = spectrum.Peptide;
    }

    // Candidate residues to insert: every visible amino acid except 'I'.
    var insertable = new Aminoacids().GetVisibleAminoacids()
        .Where(c => c != 'I')
        .Select(c => c.ToString())
        .Merge("");

    var fasta = new FastaFormat();

    Progress.SetMessage("inserting amino acid ...");
    using (var writer = new StreamWriter(options.OutputFile))
    {
        writer.WriteLine(File.ReadAllText(options.DatabaseFile));

        var orderedSequences = peptideBySequence.Keys.OrderBy(m => m).ToArray();
        var reversed_index = 1000000;

        foreach (var peptide in orderedSequences)
        {
            // Insertion positions run from 0 to Length - 1 (before each existing residue).
            for (int position = 0; position < peptide.Length; position++)
            {
                for (int k = 0; k < insertable.Length; k++)
                {
                    var residue = insertable[k];
                    var variant = peptide.Insert(position, residue.ToString());
                    var variantReference = string.Format("INS_{0}_{1}{2} Insertion of {3}", peptide, position, residue, peptideBySequence[peptide].Proteins.Merge("/"));
                    fasta.WriteSequence(writer, new Sequence(variantReference, variant));

                    if (!options.GenerateReversedPeptide)
                    {
                        continue;
                    }

                    var reversedVariant = SequenceUtils.GetReversedSequence(variant);
                    var reversedReference = string.Format("REVERSED_{0}", reversed_index++);
                    fasta.WriteSequence(writer, new Sequence(reversedReference, reversedVariant));
                }
            }
        }
    }

    return new[] { options.OutputFile };
}
/// <summary>
/// Digests every protein of the origin fasta file with the selected protease and creates a
/// chromatograph processor for the precursor m/z values of the resulting peptides.
/// </summary>
/// <returns>
/// A ProteinChromatographProcessor built from all generated peaks, the raw file name, the ppm
/// tolerance and the rebuild flag.
/// </returns>
protected override IFileProcessor GetFileProcessor()
{
    proteins = SequenceUtils.Read(new FastaFormat(), base.GetOriginFile());

    var protease = ProteaseManager.GetProteaseByName(proteases.SelectedItem);
    var digest = new Digest()
    {
        DigestProtease = protease,
        MaxMissedCleavages = 2
    };

    var allPeaks = new List<SimplePeakChro>();
    foreach (var protein in proteins)
    {
        digest.ProteinSequence = protein;
        digest.AddDigestFeatures();

        List<DigestPeptideInfo> peptides = protein.GetDigestPeptideInfo();

        // Peptides shorter than six residues are discarded.
        peptides.RemoveAll(m => m.PeptideSeq.Length < 6);

        foreach (var peptide in peptides)
        {
            double mass = aas.MonoPeptideMass(peptide.PeptideSeq);

            // Charge states 2 and 3; only precursors within 300..2000 m/z are kept.
            var peptidePeaks = new List<SimplePeakChro>();
            for (int charge = 2; charge <= 3; charge++)
            {
                double precursor = (mass + Atom.H.MonoMass * charge) / charge;
                if (!(precursor < 300 || precursor > 2000))
                {
                    peptidePeaks.Add(new SimplePeakChro()
                    {
                        Mz = precursor,
                        Sequence = peptide.PeptideSeq,
                        Charge = charge
                    });
                }
            }

            if (peptidePeaks.Count > 0)
            {
                peptide.Annotations[CHRO_KEY] = peptidePeaks;
                allPeaks.AddRange(peptidePeaks);
            }
        }

        // Drop peptides for which no precursor fell into the accepted range.
        peptides.RemoveAll(m => !m.Annotations.ContainsKey(CHRO_KEY));
    }

    return new ProteinChromatographProcessor(allPeaks, new string[] { rawFile.FullName }.ToList(), new RawFileImpl(), ppmTolerance.Value, 2.0, rebuildAll.Checked);
}
/// <summary>
/// Reads the fasta file into a name-keyed dictionary, opens the output file and reads the GFF file.
/// </summary>
/// <remarks>
/// NOTE(review): within this block nothing is written to the output writer and neither the
/// sequence dictionary nor the GFF items are used after being read, so the output file is
/// created empty. This looks unfinished; confirm the intent.
/// </remarks>
/// <returns>An array containing options.OutputFile.</returns>
public override IEnumerable<string> Process()
{
    Progress.SetMessage("reading fasta file ...");
    var sequenceByName = SequenceUtils.Read(new FastaFormat(), options.FastaFile).ToDictionary(m => m.Name);
    Progress.SetMessage("{0} sequences read ...", sequenceByName.Count);

    using (var writer = new StreamWriter(options.OutputFile))
    {
        Progress.SetMessage("reading gff file ...");
        var gffItems = GtfItemFile.ReadFromFile(options.GffFile);
    }

    return new string[] { options.OutputFile };
}
/// <summary>
/// Sets Sequence to the upper-cased concatenation of the substrings of <paramref name="seq"/>
/// described by each location in this collection; the result is reverse-complemented when
/// <paramref name="strand"/> is '-'.
/// </summary>
/// <param name="seq">Source sequence the substrings are taken from.</param>
/// <param name="strand">Strand character; only '-' triggers reverse complementing.</param>
public void FillSequence(Sequence seq, char strand)
{
    var residues = new StringBuilder();
    foreach (var location in this)
    {
        var start = (int)location.Start;
        var length = (int)location.Length;
        residues.Append(seq.SeqString.Substring(start, length));
    }

    this.Sequence = residues.ToString().ToUpper();

    if (strand != '-')
    {
        return;
    }

    this.Sequence = SequenceUtils.GetReverseComplementedSequence(this.Sequence);
}
/// <summary>
/// Annotates each peptide of <paramref name="fileName"/> with the name of the first database
/// sequence containing its pure sequence and writes the peptides, with an extra "MutDB"
/// column, to <paramref name="fileName"/> + ".mutdb".
/// </summary>
/// <param name="fileName">Peptide text file to annotate.</param>
/// <returns>An array containing the name of the written file.</returns>
public override IEnumerable<string> Process(string fileName)
{
    Progress.SetMessage("Reading sequences from " + database + " ...");
    var candidates = SequenceUtils.Read(new FastaFormat(), database);

    // Keep only entries that are not "rev_" prefixed and whose name contains "|#".
    candidates.RemoveAll(m => m.Name.StartsWith("rev_") || !m.Name.Contains("|#"));

    var format = new MascotPeptideTextFormat();

    Progress.SetMessage("Procesing peptides from " + Path.GetFileName(fileName) + " ...");
    var peptides = format.ReadFromFile(fileName);
    Progress.SetRange(0, peptides.Count);

    foreach (var peptide in peptides)
    {
        Progress.Increment(1);

        var pureSequence = peptide.Annotations["PureSequence"] as string;
        foreach (var candidate in candidates)
        {
            if (!candidate.SeqString.Contains(pureSequence))
            {
                continue;
            }

            // Only the first containing sequence is recorded.
            peptide.Annotations["MutDB"] = candidate.Name;
            break;
        }
    }

    var outputFile = fileName + ".mutdb";
    using (var writer = new StreamWriter(outputFile))
    {
        writer.WriteLine(format.PeptideFormat.GetHeader() + "\tMutDB");
        foreach (var peptide in peptides)
        {
            writer.Write(format.PeptideFormat.GetString(peptide));
            if (!peptide.Annotations.ContainsKey("MutDB"))
            {
                writer.WriteLine("\t");
            }
            else
            {
                writer.WriteLine("\t" + peptide.Annotations["MutDB"]);
            }
        }
    }

    return new string[] { outputFile };
}
/// <summary>
/// Reads the sequences of the input file, sorts them by chromosome and writes them to the
/// output file in fasta format.
/// </summary>
/// <remarks>
/// Names are split at the first "_": entries without a suffix come before entries with one.
/// Within each group entries are ordered by GenomeUtils.CompareChromosome on the text after
/// "chr"; suffixed entries with equal chromosomes are ordered by suffix.
/// </remarks>
/// <returns>An array containing the output file name.</returns>
public override IEnumerable<string> Process()
{
    Progress.SetMessage("Reading sequences from: " + _options.InputFile + "...");
    var sequences = SequenceUtils.Read(_options.InputFile);

    sequences.Sort((left, right) =>
    {
        var leftChromosome = left.Name.StringBefore("_").StringAfter("chr");
        var leftSuffix = left.Name.Contains("_") ? left.Name.StringAfter("_") : string.Empty;
        var rightChromosome = right.Name.StringBefore("_").StringAfter("chr");
        var rightSuffix = right.Name.Contains("_") ? right.Name.StringAfter("_") : string.Empty;

        var leftIsPlain = string.IsNullOrWhiteSpace(leftSuffix);
        var rightIsPlain = string.IsNullOrWhiteSpace(rightSuffix);

        if (leftIsPlain && rightIsPlain)
        {
            return GenomeUtils.CompareChromosome(leftChromosome, rightChromosome);
        }

        if (leftIsPlain)
        {
            return -1;
        }

        if (rightIsPlain)
        {
            return 1;
        }

        var order = GenomeUtils.CompareChromosome(leftChromosome, rightChromosome);
        return order != 0 ? order : leftSuffix.CompareTo(rightSuffix);
    });

    Progress.SetMessage("Writing sequences to: " + _options.OutputFile + "...");
    SequenceUtils.Write(new FastaFormat(), _options.OutputFile, sequences);

    Progress.SetMessage("Finished.");
    return new[] { _options.OutputFile };
}
/// <summary>
/// When background-music randomization is enabled, prefixes every sequence name with the music
/// directory, applies the "fix-music" and "inst24-swap-guitar" hacks and rebuilds the audio
/// sequence data. Does nothing otherwise.
/// </summary>
private void WriteAudioSeq()
{
    if (_settings.RandomizeBGM)
    {
        foreach (SequenceInfo info in RomData.SequenceList)
        {
            info.Name = Values.MusicDirectory + info.Name;
        }

        ResourceUtils.ApplyHack(Values.ModsDirectory + "fix-music");
        ResourceUtils.ApplyHack(Values.ModsDirectory + "inst24-swap-guitar");

        SequenceUtils.RebuildAudioSeq(RomData.SequenceList);
    }
}
/// <summary>
/// Converts every sequence of a fasta file with MiRnaToDna and saves the result next to the
/// input, inserting ".dna" before the original extension.
/// </summary>
/// <param name="fileName">Fasta file to convert.</param>
/// <returns>An array containing the name of the written file.</returns>
public override IEnumerable<string> Process(string fileName)
{
    Progress.SetMessage("Loading sequences from " + fileName + "...");
    var sequences = SequenceUtils.Read(new FastaFormat(), fileName);

    Progress.SetMessage("Converint {0} sequences ...", sequences.Count);
    foreach (var sequence in sequences)
    {
        sequence.SeqString = MiRnaToDna(sequence.SeqString);
    }

    // e.g. "x.fasta" becomes "x.dna.fasta"
    var newExtension = ".dna" + Path.GetExtension(fileName);
    var outputFile = Path.ChangeExtension(fileName, newExtension);

    Progress.SetMessage("Saving {0} sequences to {1}", sequences.Count, outputFile);
    SequenceUtils.Write(new FastaFormat(), outputFile, sequences);

    Progress.SetMessage("Finished!");
    return new string[] { outputFile };
}
/// <summary>
/// When the music setting is Random, reads the sequence info, shuffles the background music,
/// prefixes every sequence name with the music directory, applies the "fix-music" and
/// "inst24-swap-guitar" hacks and rebuilds the audio sequence data. Does nothing otherwise.
/// </summary>
/// <param name="random">Random source handed to BGMShuffle.</param>
private void WriteAudioSeq(Random random)
{
    if (_settings.Music == Music.Random)
    {
        SequenceUtils.ReadSequenceInfo();
        BGMShuffle(random);

        foreach (SequenceInfo info in RomData.SequenceList)
        {
            info.Name = Values.MusicDirectory + info.Name;
        }

        ResourceUtils.ApplyHack(Values.ModsDirectory + "fix-music");
        ResourceUtils.ApplyHack(Values.ModsDirectory + "inst24-swap-guitar");

        SequenceUtils.RebuildAudioSeq(RomData.SequenceList);
    }
}
/// <summary>
/// Downloads a record from the NCBI E-utilities efetch service (nuccore database, FASTA text)
/// and builds a sequence from every line after the first one.
/// </summary>
/// <remarks>
/// The server certificate is validated with the platform defaults. This method deliberately
/// does not install a process-wide ServicePointManager.ServerCertificateValidationCallback
/// that accepts every certificate, because doing so disables TLS validation for all requests
/// in the process.
/// </remarks>
/// <param name="id">Identifier of the record to fetch.</param>
/// <returns>The result of SequenceUtils.SequenceFromString for the downloaded residues.</returns>
/// <exception cref="ArgumentNullException"><paramref name="id"/> is null.</exception>
public ISequence Get(SeqId id)
{
    if (id == null)
    {
        throw new ArgumentNullException(nameof(id));
    }

    // Escape the identifier so characters such as '|', '&' or '#' cannot alter the query string.
    string escapedId = Uri.EscapeDataString(id.ToString() ?? string.Empty);
    string address = $"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={escapedId}&rettype=fasta&retmode=text";

    string fasta;
    using (var web = new WebClient())
    {
        fasta = web.DownloadString(address);
    }

    var lines = fasta.Split('\n');
    var sb = new StringBuilder();

    // Line 0 is skipped (the FASTA header); the remaining lines are concatenated.
    for (int i = 1; i < lines.Length; ++i)
    {
        // Drop a trailing carriage return so CRLF responses do not leak '\r' into the sequence.
        sb.Append(lines[i].TrimEnd('\r'));
    }

    return SequenceUtils.SequenceFromString(id, sb.ToString());
}
/// <summary>
/// Reads the sequence regions from CoordinateFile, fills their sequences either from the
/// "name:sequence" form of the region names or from FastaFile, and converts them to
/// FeatureLocation objects.
/// </summary>
/// <returns>One FeatureLocation per region read from the coordinate file.</returns>
public virtual List<FeatureLocation> GetSequenceRegions()
{
    // Read the regions and keep only the text after "chr" in the sequence name.
    var regions = SequenceRegionUtils.GetSequenceRegions(CoordinateFile, GtfFeatureName, BedAsGtf);
    foreach (var region in regions)
    {
        region.Seqname = region.Seqname.StringAfter("chr");
    }

    // If the first name containing ":" carries only characters found in MIRNA after the colon,
    // every name is treated as "name:sequence" and split accordingly.
    var probe = regions.FirstOrDefault(m => m.Name.Contains(":"));
    if (probe != null)
    {
        var embedded = probe.Name.StringAfter(":");
        if (embedded.All(m => MIRNA.Contains(m)))
        {
            foreach (var region in regions)
            {
                region.Sequence = region.Name.StringAfter(":");
            }

            foreach (var region in regions)
            {
                region.Name = region.Name.StringBefore(":");
            }
        }
    }

    // Sequences from the fasta file, when given, overwrite the ones set above.
    if (!string.IsNullOrEmpty(this.FastaFile))
    {
        Console.WriteLine("Reading sequence from {0} ...", this.FastaFile);
        var sequenceByName = SequenceUtils.Read(new FastaFormat(), this.FastaFile).ToDictionary(m => m.Name);

        foreach (var region in regions)
        {
            if (!sequenceByName.ContainsKey(region.Name))
            {
                Console.WriteLine("Missing sequence: " + region.Name);
                continue;
            }

            region.Sequence = sequenceByName[region.Name].SeqString;
        }

        sequenceByName.Clear();
    }

    return regions.ConvertAll(m => new FeatureLocation(m)).ToList();
}
/// <summary>
/// Tests whether the allele pair of every platform equals the G1000 allele pair, in either
/// order, either directly or after complementing both G1000 alleles.
/// </summary>
/// <returns>
/// false when G1000Allele1 is a blank character or any platform fails to match; otherwise true
/// (including when there are no platforms).
/// </returns>
public bool IsPlatformAllelesMatchedWithDatabase()
{
    if (this.G1000Allele1 == ' ')
    {
        return false;
    }

    var complemented1 = SequenceUtils.GetComplementAllele(this.G1000Allele1);
    var complemented2 = SequenceUtils.GetComplementAllele(this.G1000Allele2);

    foreach (var platform in Platforms.Values)
    {
        // Only the first character of each platform allele is compared.
        var first = platform.Allele1.First();
        var second = platform.Allele2.First();

        bool matchesDirect =
            (first == this.G1000Allele1 && second == this.G1000Allele2) ||
            (first == this.G1000Allele2 && second == this.G1000Allele1);

        bool matchesComplement =
            (first == complemented1 && second == complemented2) ||
            (first == complemented2 && second == complemented1);

        if (!(matchesDirect || matchesComplement))
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Reads the proteins (id, description, sequence, score, coverage) from the SQLite file
/// <paramref name="fileName"/> into a dictionary keyed by protein id.
/// </summary>
/// <param name="fileName">SQLite database file to query.</param>
/// <param name="isDecoy">
/// When true, scores are read from the "ProteinScores_decoy" table and every returned protein
/// gets a reversed sequence and a reference converted by GetReversedReference.
/// </param>
/// <returns>The proteins keyed by ProteinID.</returns>
public Dictionary<int, IIdentifiedProtein> ParseProteinMap(string fileName, bool isDecoy)
{
    // The suffix only ever takes one of two literal values, so formatting it into the SQL is safe.
    var suffix = isDecoy ? "_decoy" : "";
    SQLiteDBHelper sqlite = new SQLiteDBHelper(fileName);
    var result = new Dictionary<int, IIdentifiedProtein>();

    string sqlProtein = string.Format("select ps.ProteinID, pa.Description, pro.Sequence, ps.ProteinScore, ps.Coverage from ProteinAnnotations as pa, Proteins as pro, ProteinScores{0} as ps where pro.ProteinID=pa.ProteinID and pro.ProteinID=ps.ProteinID", suffix);

    // The reader is disposed as soon as all rows are consumed, so it (and whatever resources
    // it holds on the database) is released even when parsing a row throws.
    using (var proteinReader = sqlite.ExecuteReader(sqlProtein, null))
    {
        Progress.SetMessage("Parsing proteins ...");
        while (proteinReader.Read())
        {
            var protein = new IdentifiedProtein();
            var proid = proteinReader.GetInt32(0);

            // Strip a leading fasta '>' marker from the description.
            var des = proteinReader.GetString(1);
            if (des.Length > 0 && des[0] == '>')
            {
                des = des.Substring(1);
            }

            protein.Reference = des;
            protein.Sequence = proteinReader.GetString(2);
            protein.Score = proteinReader.GetDouble(3);
            protein.Coverage = proteinReader.GetDouble(4);
            result[proid] = protein;
        }
    }

    if (isDecoy)
    {
        foreach (var v in result.Values)
        {
            v.Sequence = SequenceUtils.GetReversedSequence(v.Sequence);
            v.Reference = GetReversedReference(v.Reference);
        }
    }

    return result;
}
/// <summary>
/// Extracts a seed of <paramref name="seedLength"/> characters starting at
/// <paramref name="offset"/> from a coverage region.
/// </summary>
/// <param name="cr">Region providing the sequence, coverages and coordinates.</param>
/// <param name="offset">Zero-based offset of the seed within the region.</param>
/// <param name="seedLength">Number of positions in the seed.</param>
/// <param name="minCoverage">Minimum average coverage the seed positions must reach.</param>
/// <returns>
/// null when the region sequence is too short or the average coverage is below
/// <paramref name="minCoverage"/>; otherwise the seed item. For '+' strand regions the seed
/// sequence is reverse-complemented.
/// </returns>
public static SeedItem GetSeed(CoverageRegion cr, int offset, int seedLength, double minCoverage)
{
    if (cr.Sequence.Length < offset + seedLength)
    {
        return null;
    }

    var seedCoverages = cr.Coverages.Skip(offset).Take(seedLength).ToList();
    var averageCoverage = seedCoverages.Average(l => l.Coverage);
    if (averageCoverage < minCoverage)
    {
        return null;
    }

    var seedSequence = cr.Sequence.Substring(offset, seedLength);
    if (cr.Strand == '+')
    {
        seedSequence = SequenceUtils.GetReverseComplementedSequence(seedSequence);
    }

    var seed = new SeedItem();
    seed.Seqname = cr.Seqname;
    seed.Start = cr.Start + offset;
    seed.End = cr.Start + offset + seedLength - 1;
    seed.Strand = cr.Strand;
    seed.Coverage = averageCoverage;
    seed.Name = cr.Name;
    seed.Sequence = seedSequence;
    seed.Source = cr;
    seed.SourceOffset = offset;
    seed.GeneSymbol = cr.GeneSymbol;
    return seed;
}
/// <summary>
/// Reads every sequence of <paramref name="fastaFile"/> and writes its reversed (decoy)
/// counterpart to <paramref name="sw"/>; the forward sequence is written as well unless
/// options.ReversedOnly is set.
/// </summary>
/// <param name="index">
/// Running counter shared between files; it is incremented once per sequence and passed to
/// SequenceUtils.GetReversedSequence.
/// </param>
/// <param name="sw">Destination writer.</param>
/// <param name="fastaFile">Fasta file to read.</param>
/// <param name="isContaminant">
/// When true, references that do not already start with "CON_" get that prefix.
/// </param>
private void ProcessFile(ref int index, StreamWriter sw, string fastaFile, bool isContaminant)
{
    FastaFormat ff = new FastaFormat();
    using (StreamReader sr = new StreamReader(fastaFile))
    {
        Progress.SetRange(0, sr.BaseStream.Length);

        Sequence seq;
        while ((seq = ff.ReadSequence(sr)) != null)
        {
            Progress.SetPosition(sr.BaseStream.Position);

            if (isContaminant && !seq.Reference.StartsWith("CON_"))
            {
                seq.Reference = "CON_" + seq.Reference;
            }

            // Fix: the condition was inverted. The forward sequence belongs in the output
            // only when the caller did NOT ask for a reversed-only database.
            if (!options.ReversedOnly)
            {
                ff.WriteSequence(sw, seq);
            }

            // The pseudo amino acid builder runs after the forward entry has been written,
            // so only the reversed entry is built from the modified sequence.
            if (options.IsPseudoAminoacid)
            {
                options.PseudoAminoacidBuilder.Build(seq);
            }

            index++;
            Sequence reversedSeq = SequenceUtils.GetReversedSequence(seq.SeqString, index);
            ff.WriteSequence(sw, reversedSeq);
        }
    }
}
/// <summary>
/// Creates one coverage region per sequence of a fasta file, with placeholder coordinates and
/// a constant coverage at every position.
/// </summary>
/// <param name="featureFile">Fasta file holding the small RNA sequences.</param>
/// <returns>
/// Regions named after the sequences, with Seqname "Unknown", Start and End -1, Strand '*' and
/// one DEFAULT_COVERAGE site per sequence character.
/// </returns>
public static List<CoverageRegion> GetSmallRNACoverageRegionFromFasta(string featureFile)
{
    var regions = new List<CoverageRegion>();

    foreach (var entry in SequenceUtils.Read(featureFile))
    {
        var region = new CoverageRegion();
        region.Name = entry.Name;
        region.Seqname = "Unknown";
        region.Start = -1;
        region.End = -1;
        region.Strand = '*';
        region.Sequence = entry.SeqString;

        // Every position receives the same default coverage value.
        var remaining = entry.SeqString.Length;
        while (remaining > 0)
        {
            region.Coverages.Add(new CoverageSite(DEFAULT_COVERAGE));
            remaining--;
        }

        regions.Add(region);
    }

    return regions;
}
/// <summary>
/// Runs FindCandidates over the spectral library and the candidate MS2 items, keeps the best
/// predictions per spectrum and writes them out. When options.MatchedFile does not exist, the
/// predictions are also turned into variant sequences that are appended to the database fasta.
/// </summary>
/// <returns>An array containing options.OutputFile and options.OutputTableFile.</returns>
public override IEnumerable<string> Process()
{
    var rawFileByName = options.RawFiles.ToDictionary(m => Path.GetFileNameWithoutExtension(m));

    Progress.SetMessage("Reading library file ...");
    var libraryItems = new MS2ItemXmlFormat().ReadFromFile(options.LibraryFile);
    PreprocessingMS2ItemList(libraryItems);
    var libraryByCharge = libraryItems.GroupBy(m => m.Charge).ToDictionary(m => m.Key, m => m.ToList());

    Progress.SetMessage("Building library sequence amino acid composition ...");
    foreach (var chargeGroup in libraryByCharge)
    {
        foreach (var item in chargeGroup.Value)
        {
            // Distinct, sorted peptide characters that have an entry in the substitution map.
            item.AminoacidCompsition = item.Peptide
                .Where(a => options.SubstitutionDeltaMassMap.ContainsKey(a))
                .Distinct()
                .OrderBy(k => k)
                .ToArray();
        }
    }

    // Scans already used by the library (and, below, by the peptides file), per experiment.
    var scansByExperiment = libraryItems
        .SelectMany(p => p.FileScans)
        .ToList()
        .GroupBy(m => m.Experimental)
        .ToDictionary(m => m.Key, m => new HashSet<int>(m.Select(l => l.FirstScan)));

    if (File.Exists(options.PeptidesFile))
    {
        Progress.SetMessage("Reading peptides file used for excluding scan ...");
        var identified = new MascotPeptideTextFormat().ReadFromFile(options.PeptidesFile);
        foreach (var peptide in identified)
        {
            var fileScan = peptide.Query.FileScan;

            HashSet<int> scans;
            if (!scansByExperiment.TryGetValue(fileScan.Experimental, out scans))
            {
                scans = new HashSet<int>();
                scansByExperiment[fileScan.Experimental] = scans;
            }

            scans.Add(fileScan.FirstScan);
        }
    }

    Progress.SetMessage("Reading MS2/MS3 data ...");
    var candidates = GetCandidateMs2ItemList(rawFileByName, scansByExperiment);
    PreprocessingMS2ItemList(candidates);

    Progress.SetMessage("Finding SAP ...");
    var predicted = new List<SapPredicted>();

    var minDeltaMass = options.SubstitutionDeltaMassMap.Values.Min(l => l.Min(k => k.DeltaMass));
    var maxDeltaMass = options.SubstitutionDeltaMassMap.Values.Max(l => l.Max(k => k.DeltaMass));

    Progress.SetRange(0, candidates.Count);
    Progress.Begin();

    FindCandidates(libraryByCharge, candidates, predicted, minDeltaMass, maxDeltaMass);

    // Regroup by spectrum, then keep for each spectrum either the expected prediction of the
    // best library match, or all predictions of every library match tied with the best one.
    var bySpectrum = predicted.ToGroupDictionary(m => m.Ms2.GetFileScans());
    predicted.Clear();
    foreach (var spectrumPredictions in bySpectrum.Values)
    {
        var byLibraryItem = spectrumPredictions.ToGroupDictionary(m => m.LibMs2).Values.ToList();
        byLibraryItem.Sort((m1, m2) => CompareSapPrecitedList(m1, m2));

        var best = byLibraryItem[0];
        var expected = best.FirstOrDefault(m => m.IsExpect);
        if (expected != null)
        {
            predicted.Add(expected);
            continue;
        }

        predicted.AddRange(best);
        for (int i = 1; i < byLibraryItem.Count && CompareSapPrecitedList(best, byLibraryItem[i]) == 0; i++)
        {
            predicted.AddRange(byLibraryItem[i]);
        }
    }

    if (File.Exists(options.MatchedFile))
    {
        new SapPredictedValidationWriter(options.MatchedFile).WriteToFile(options.OutputFile, predicted);
    }
    else
    {
        new SapPredictedWriter().WriteToFile(options.OutputTableFile, predicted);

        Progress.SetMessage("Generating SAP sequence ...");
        var variantSequences = new List<Sequence>();
        foreach (var prediction in predicted)
        {
            var target = prediction.Target;
            var pure = PeptideUtils.GetPureSequence(prediction.LibMs2.Peptide);

            if (target.TargetType == VariantType.SingleAminoacidPolymorphism)
            {
                // Substitute every occurrence of the source residue with every target residue.
                for (int i = 0; i < pure.Length; i++)
                {
                    if (pure[i] != target.Source[0])
                    {
                        continue;
                    }

                    foreach (var replacement in target.Target)
                    {
                        // For i == 0 the leading substring is empty, so one expression covers all positions.
                        string substituted = pure.Substring(0, i) + replacement + pure.Substring(i + 1);
                        var substitutedReference = string.Format("sp|SAP_{0}_{1}|{2}_{3}_{4}_{5}", substituted, target.TargetType, pure, target.Source, i + 1, replacement);
                        variantSequences.Add(new Sequence(substitutedReference, substituted));
                    }
                }

                continue;
            }

            foreach (var variant in target.Target)
            {
                string variantReference;
                if (target.TargetType == VariantType.NTerminalLoss)
                {
                    variantReference = string.Format("sp|SAP_{0}_{1}|{2}_loss_{3}", variant, target.TargetType, pure, pure.Substring(0, pure.Length - variant.Length));
                }
                else if (target.TargetType == VariantType.CTerminalLoss)
                {
                    variantReference = string.Format("sp|SAP_{0}_{1}|{2}_loss_{3}", variant, target.TargetType, pure, pure.Substring(variant.Length));
                }
                else if (target.TargetType == VariantType.NTerminalExtension)
                {
                    variantReference = string.Format("sp|SAP_{0}_{1}|{2}_ext_{3}", variant, target.TargetType, pure, variant.Substring(0, variant.Length - pure.Length));
                }
                else if (target.TargetType == VariantType.CTerminalExtension)
                {
                    variantReference = string.Format("sp|SAP_{0}_{1}|{2}_ext_{3}", variant, target.TargetType, pure, variant.Substring(pure.Length));
                }
                else
                {
                    throw new Exception("I don't know how to deal with " + target.TargetType.ToString());
                }

                variantSequences.Add(new Sequence(variantReference, variant));
            }
        }

        // Keep the first entry of each distinct residue string.
        variantSequences = variantSequences
            .GroupBy(m => m.SeqString)
            .Select(sameResidues => sameResidues.First())
            .ToList();

        Progress.SetMessage("Reading database {0} ...", options.DatabaseFastaFile);
        var database = SequenceUtils.Read(options.DatabaseFastaFile);

        Progress.SetMessage("Removing variant sequences which are already existed in database ...");
        for (int i = variantSequences.Count - 1; i >= 0; i--)
        {
            var residues = variantSequences[i].SeqString;
            foreach (var entry in database)
            {
                if (entry.SeqString.Contains(residues))
                {
                    variantSequences.RemoveAt(i);
                    break;
                }
            }
        }

        database.AddRange(variantSequences);

        Progress.SetMessage("Writing SAP sequence and original database to {0} ...", options.OutputFile);
        SequenceUtils.Write(new FastaFormat(), options.OutputFile, database);
    }

    Progress.End();

    return new string[] { options.OutputFile, options.OutputTableFile };
}
/// <summary>
/// Reads the pNovo result files and the target fasta file, searches the de novo sequences for
/// mutations against the digested database on several threads, and writes a statistics file,
/// a fasta database extended with the mutated sequences and the matching peptides.
/// </summary>
/// <returns>An array containing the name of the written "mutation.fasta" database.</returns>
public override IEnumerable<string> Process()
{
    HashSet<string> pnovoseqs = new HashSet<string>();

    var pnovoParser = new PNovoPlusParser(options.TitleParser);
    pnovoParser.Progress = this.Progress;

    // Find an amino acid that is not an enzyme cleavage residue; it stands in for the
    // residues before and after a de novo sequence.
    var anotheraa = 'A';
    for (int i = 0; i < 26; i++)
    {
        anotheraa = (char)('A' + i);
        if (options.Enzyme.CleaveageResidues.Contains(anotheraa) || options.Enzyme.NotCleaveResidues.Contains(anotheraa))
        {
            continue;
        }

        break;
    }

    Progress.SetRange(0, options.PnovoFiles.Length);
    int totalSpectrumCount = 0;
    int totalSpectrumPassScore = 0;
    foreach (var pnovoFile in options.PnovoFiles)
    {
        Progress.SetMessage("Reading " + pnovoFile + " ...");
        int spectrumCount = pnovoParser.GetSpectrumCount(pnovoFile);
        var curSpectra = pnovoParser.ParsePeptides(pnovoFile, 10, options.MinScore);

        totalSpectrumCount += spectrumCount;
        totalSpectrumPassScore += curSpectra.Count;

        RemoveMissCleavagePeptides(anotheraa, curSpectra);
        pnovoseqs.UnionWith(from c in curSpectra from p in c.Peptides select p.PureSequence);
        Progress.Increment(1);
    }

    var pNovoStat = Path.Combine(options.TargetDirectory, "pNovo.SAP.stat");
    using (StreamWriter sw = new StreamWriter(pNovoStat))
    {
        sw.WriteLine("Total Spectrum Count\t" + totalSpectrumCount.ToString());
        sw.WriteLine("Total Peptide-Spectrum-Match Passed Score Filter\t" + totalSpectrumPassScore.ToString());
    }

    Progress.SetPosition(0);
    Progress.SetMessage("Reading " + options.TargetFastaFile + " ...");
    var seqs = SequenceUtils.Read(new FastaFormat(), options.TargetFastaFile);

    Progress.SetMessage("Digesting sequences ...");
    GetDigestPeptide(seqs);

    // The explicit collections below are kept from the original implementation; they release
    // large intermediate structures before the next stage allocates.
    seqs.Clear();
    seqs.TrimExcess();
    GC.Collect();
    GC.WaitForFullGCComplete();

    // Remove every peptide that is identical to one in the theoretical library.
    Progress.SetMessage("Removing identical peptides ...");
    pnovoseqs.ExceptWith(miss0.Keys);

    var pnovoArray = pnovoseqs.ToArray();
    pnovoseqs.Clear();
    GC.Collect();
    GC.WaitForFullGCComplete();

    miss0group = miss0.Keys.ToGroupDictionary(m => m.Length);

    var type2seqs = new List<Type2Sequence>();
    var type2_2 = new List<string>();
    foreach (var m in miss1.Keys)
    {
        // Locate the first cleavage site inside the peptide.
        int maxpos = -1;
        for (int i = 1; i < m.Length; i++)
        {
            if (options.Enzyme.IsCleavageSite(m[i - 1], m[i], anotheraa))
            {
                maxpos = i - 1;
                break;
            }
        }

        if (maxpos == -1)
        {
            throw new Exception("There is no misscleavage in " + m);
        }

        if (maxpos == 0)
        {
            type2_2.Add(m);
        }
        else
        {
            type2seqs.Add(new Type2Sequence()
            {
                Sequence = m,
                PriorSequence = m.Substring(0, maxpos),
                PostSequence = m.Substring(maxpos + 1)
            });
        }
    }

    miss1type2_1 = type2seqs.ToGroupDictionary(m => GetType2Key(m.Sequence));
    miss1type2_2 = type2_2.ToGroupDictionary(m => m.Substring(1));
    miss0type3 = miss0.Keys.ToGroupDictionary(m => GetType3Key(m));

    type2seqs.Clear();
    GC.Collect();
    GC.WaitForFullGCComplete();

    Progress.SetMessage("Finding mutation ...");
    Progress.SetRange(0, pnovoArray.Length);

    // Split the sequences into one bin per thread; the last bin takes the remainder.
    var totalCount = pnovoArray.Length;
    var binSize = totalCount / options.ThreadCount;

    List<FindParam> fparams = new List<FindParam>();
    List<Thread> threads = new List<Thread>();
    var startPos = 0;
    for (int i = 0; i < options.ThreadCount; i++)
    {
        int count;
        if (i == options.ThreadCount - 1)
        {
            count = pnovoArray.Length - startPos;
        }
        else
        {
            count = binSize;
        }

        List<string> binSeq = new List<string>();
        binSeq.AddRange(pnovoArray.Skip(startPos).Take(count));
        startPos = startPos + count;

        var aparam = new FindParam()
        {
            PnovoSeqs = binSeq
        };
        fparams.Add(aparam);

        Thread at = new Thread(this.FindMutation);
        threads.Add(at);
        at.IsBackground = true;
        at.Start(aparam);
    }

    pnovoArray = null;
    GC.Collect();
    GC.WaitForFullGCComplete();

    var startTime = DateTime.Now;
    Progress.SetRange(0, totalCount);

    // Poll the worker threads once per second until all of them have ended.
    while (true)
    {
        Thread.Sleep(1000);

        if (Progress.IsCancellationPending())
        {
            throw new UserTerminatedException();
        }

        // Sample liveness before the counters so the last reported position is final.
        bool allThreadsFinished = threads.Count(m => !m.IsAlive) == threads.Count;

        int finishedCount = fparams.Sum(m => m.FinishedCount);
        Progress.SetPosition(finishedCount);

        // The time estimate divides by finishedCount, so it is only produced once something
        // has been counted. Fix: the exit test below is no longer skipped in that case —
        // previously the loop never ended if every thread finished with a total of zero.
        if (finishedCount > 0)
        {
            var curTime = DateTime.Now;
            var costTime = curTime - startTime;
            var totalCostTime = new TimeSpan(costTime.Ticks * totalCount / finishedCount);
            var finishTime = curTime + new TimeSpan(costTime.Ticks * (totalCount - finishedCount) / finishedCount);

            StringBuilder costFormat = new StringBuilder();
            if (totalCostTime.TotalHours >= 2.0)
            {
                costFormat.Append(Math.Truncate(totalCostTime.TotalHours).ToString() + " hours and ");
            }
            else if (totalCostTime.TotalHours >= 1.0)
            {
                costFormat.Append("one hour and ");
            }

            costFormat.Append(totalCostTime.Minutes.ToString() + " minutes");
            Progress.SetMessage("Finding mutation {0} / {1}, will cost {2} and finish at {3} ...", finishedCount, totalCount, costFormat, finishTime);
        }

        if (allThreadsFinished)
        {
            break;
        }
    }

    int type1 = fparams.Sum(m => m.Type1Count);
    int type2 = fparams.Sum(m => m.Type2Count);
    int type3 = fparams.Sum(m => m.Type3Count);

    // Append the per-type counts to the statistics file written earlier.
    using (StreamWriter sw = new StreamWriter(pNovoStat, true))
    {
        sw.WriteLine("Type1 Count\t" + type1.ToString());
        sw.WriteLine("Type2 Count\t" + type2.ToString());
        sw.WriteLine("Type3 Count\t" + type3.ToString());
    }

    var singleMutation = (from f in fparams from s in f.Sequences select s).ToList();

    // The new database is the original fasta followed by the mutated sequences.
    string newFastaFile = new FileInfo(options.TargetDirectory + "/" + FileUtils.ChangeExtension(new FileInfo(options.DatabaseFastaFile).Name, "mutation.fasta")).FullName;
    using (StreamWriter sw = new StreamWriter(newFastaFile))
    {
        using (StreamReader sr = new StreamReader(options.DatabaseFastaFile))
        {
            string line = sr.ReadToEnd();
            sw.WriteLine(line);

            foreach (var seq in singleMutation)
            {
                sw.WriteLine(">" + seq.Reference);
                sw.WriteLine(seq.SeqString);
            }
        }
    }

    // Second pass over the pNovo files: keep only spectra with a peptide among the mutated sequences.
    Progress.SetRange(0, options.PnovoFiles.Length);
    var sapSequences = new HashSet<string>(singleMutation.ConvertAll(m => m.SeqString));
    List<IIdentifiedSpectrum> allSpectra = new List<IIdentifiedSpectrum>();
    foreach (var pnovoFile in options.PnovoFiles)
    {
        Progress.SetMessage("Reading " + pnovoFile + " ...");
        var curSpectra = pnovoParser.ParsePeptides(pnovoFile, 10, options.MinScore);
        RemoveMissCleavagePeptides(anotheraa, curSpectra);
        curSpectra.RemoveAll(m => !m.Peptides.Any(n => sapSequences.Contains(n.PureSequence)));
        allSpectra.AddRange(curSpectra);
        Progress.Increment(1);
    }

    var pNovoPeptides = Path.Combine(options.TargetDirectory, "pNovo.SAP.peptides");
    new MascotPeptideTextFormat("\tFileScan\tSequence\tCharge\tScore\tDeltaScore").WriteToFile(pNovoPeptides, allSpectra);

    Progress.SetMessage("Finished.");
    Progress.End();

    return new string[] { newFastaFile };
}
/// <summary>
/// Validates the options for the selected data source and, in multi-threaded BAM mode, loads
/// the chromosome names from the genome fasta index or the fasta file itself.
/// </summary>
/// <returns>
/// false when the output directory cannot be prepared or when any validation error was
/// recorded in ParsingErrors by this method; otherwise true.
/// </returns>
public override bool PrepareOptions()
{
    if (!PrepareOutputDirectory())
    {
        return false;
    }

    if (!string.IsNullOrWhiteSpace(ExcludeBedFile) && !File.Exists(ExcludeBedFile))
    {
        ParsingErrors.Add("Exclude file not exists:" + ExcludeBedFile);
        return false;
    }

    switch (From)
    {
        case DataSourceType.Mpileup:
            if (null == MpileupFile)
            {
                ParsingErrors.Add("Mpileup file not defined.");
                return false;
            }

            if (!File.Exists(MpileupFile))
            {
                ParsingErrors.Add(string.Format("Mpileup file not exists {0}.", MpileupFile));
                return false;
            }

            Console.WriteLine("#mpileup file: " + MpileupFile);
            break;

        case DataSourceType.BAM:
        {
            // Both BAM files are checked before giving up so that both problems are reported.
            bool bamFilesValid = true;

            if (null == NormalBam)
            {
                ParsingErrors.Add("Bam file for normal sample not defined.");
                bamFilesValid = false;
            }
            else if (!File.Exists(NormalBam))
            {
                ParsingErrors.Add(string.Format("Bam file for normal sample not exists {0}", NormalBam));
                bamFilesValid = false;
            }

            if (null == TumorBam)
            {
                ParsingErrors.Add("Bam file for tumor sample not defined.");
                bamFilesValid = false;
            }
            else if (!File.Exists(TumorBam))
            {
                ParsingErrors.Add(string.Format("Bam file for tumor sample is not exists {0}", TumorBam));
                bamFilesValid = false;
            }

            // Fix: a recorded BAM error used to fall through to "return true", unlike every
            // other validation failure in this method.
            if (!bamFilesValid)
            {
                return false;
            }

            if (ThreadCount >= 2)
            {
                Console.WriteLine("Checking chromosome names for thread mode ...");

                // NOTE(review): the parentheses only make the existing operator precedence
                // explicit. A null list always enters this branch, while an empty list enters
                // only when the genome fasta file exists. Confirm whether
                // "(null or empty) and file exists" was intended.
                if (ChromosomeNames == null || (ChromosomeNames.Count == 0 && File.Exists(GenomeFastaFile)))
                {
                    var fai = GenomeFastaFile + ".fai";
                    if (File.Exists(fai))
                    {
                        // The chromosome name is the text before the first tab or space of each index line.
                        var lines = File.ReadAllLines(fai);
                        ChromosomeNames = lines.ToList().ConvertAll(m =>
                        {
                            var pos = m.IndexOfAny(new[] { '\t', ' ' });
                            if (pos == -1)
                            {
                                return m;
                            }

                            return m.Substring(0, pos);
                        });
                    }
                    else
                    {
                        Console.WriteLine("Reading chromosome names from fasta file ...");
                        ChromosomeNames = SequenceUtils.ReadFastaNames(GenomeFastaFile);
                        if (ChromosomeNames.Count == 0)
                        {
                            ParsingErrors.Add(string.Format("Genome fasta file doesn't contain chromosome names, {0}.", GenomeFastaFile));
                            return false;
                        }
                    }

                    foreach (var chr in ChromosomeNames)
                    {
                        Console.WriteLine(chr);
                    }
                }
            }
            else
            {
                if (ChromosomeNames != null && ChromosomeNames.Count > 0)
                {
                    Console.WriteLine("#mpileup chromosome names: " + ChromosomeNames.Merge(","));
                }
            }

            break;
        }

        case DataSourceType.Console:
            Console.WriteLine("#mpileup from console.");
            break;
    }

    return true;
}
/// <summary>
/// Delegates to SequenceUtils.CountIn with the characters configured in Options.
/// </summary>
/// <param name="sequence">Text to examine.</param>
/// <returns>The value returned by SequenceUtils.CountIn.</returns>
int IAnyOfModeCharMatcher.CountIn(string sequence)
{
    var sequenceChars = Options.GetSequenceChars();
    return SequenceUtils.CountIn(sequence, sequenceChars);
}
/// <summary>
/// Delegates to SequenceUtils.IndexIn with the characters configured in Options.
/// </summary>
/// <param name="sequence">Text to examine.</param>
/// <param name="startIndex">Start index forwarded to SequenceUtils.IndexIn.</param>
/// <returns>The value returned by SequenceUtils.IndexIn.</returns>
int IAnyOfModeCharMatcher.IndexIn(string sequence, int startIndex)
{
    var sequenceChars = Options.GetSequenceChars();
    return SequenceUtils.IndexIn(sequence, sequenceChars, startIndex);
}