/// <summary>
/// Reads every entry of a FASTA file and collects the access numbers of the entries
/// whose reference text matches the contamination pattern (case-insensitive).
/// </summary>
/// <param name="acParser">Parser applied to each entry name to obtain its access number.</param>
/// <param name="fastaFilename">Path of the FASTA file to scan.</param>
/// <param name="contaminationDescriptionPattern">Regular expression tested against each entry's reference text.</param>
/// <param name="progress">Progress sink; an <c>EmptyProgressCallback</c> is substituted when null.</param>
/// <returns>The set of access numbers of all matching entries.</returns>
/// <exception cref="UserTerminatedException">Thrown when the progress callback reports a pending cancellation.</exception>
public static HashSet<string> GetContaminationAccessNumbers(IStringParser<string> acParser, string fastaFilename, string contaminationDescriptionPattern, IProgressCallback progress)
{
    if (progress == null)
    {
        progress = new EmptyProgressCallback();
    }

    var accessNumbers = new HashSet<string>();
    var contaminationRegex = new Regex(contaminationDescriptionPattern, RegexOptions.IgnoreCase);

    progress.SetMessage("Get contamination map from database ...");

    var fastaFormat = new FastaFormat();
    using (var reader = new StreamReader(fastaFilename))
    {
        progress.SetRange(1, reader.BaseStream.Length);

        for (Sequence entry = fastaFormat.ReadSequence(reader); entry != null; entry = fastaFormat.ReadSequence(reader))
        {
            if (progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            progress.SetPosition(reader.GetCharpos());

            // The access number is parsed for every entry, before the pattern test,
            // so a parser failure surfaces regardless of whether the entry matches.
            string accessNumber = acParser.GetValue(entry.Name);
            if (contaminationRegex.IsMatch(entry.Reference))
            {
                accessNumbers.Add(accessNumber);
            }
        }
    }

    progress.SetMessage("Get contamination map from database finished.");
    return accessNumbers;
}
/// <summary>
/// Reads the isobaric labelling information of the given spectra from <paramref name="isobaricFile"/>.
/// Each spectrum is looked up by the experimental name and first scan of its query's FileScan;
/// spectra without an entry in the file get their isobaric item set to null.
/// </summary>
/// <param name="spectra">Spectra to annotate.</param>
/// <param name="isobaricFile">Path of the isobaric result file.</param>
/// <param name="readPeaks">Forwarded to the XML reader factory (presumably controls whether peak lists are loaded — confirm against IsobaricResultFileFormatFactory).</param>
/// <param name="progress">Optional progress sink; an <c>EmptyProgressCallback</c> is substituted when null.</param>
/// <exception cref="ArgumentNullException"><paramref name="spectra"/> is null.</exception>
/// <exception cref="UserTerminatedException">Thrown when the progress callback reports a pending cancellation.</exception>
public static void Load(List<IIdentifiedSpectrum> spectra, string isobaricFile, bool readPeaks = false, IProgressCallback progress = null)
{
    // Explicit guard: keeps a null list failing fast with ArgumentNullException,
    // before any reader is opened.
    if (spectra == null)
    {
        throw new ArgumentNullException("spectra");
    }

    if (progress == null)
    {
        progress = new EmptyProgressCallback();
    }

    using (var reader = IsobaricResultFileFormatFactory.GetXmlReader(true, readPeaks))
    {
        var usedChannels = IsobaricScanXmlUtils.GetUsedChannels(isobaricFile);
        reader.Open(isobaricFile);

        progress.SetMessage("Reading Isobaric from {0} ...", isobaricFile);
        progress.SetRange(1, spectra.Count);

        foreach (var spectrum in spectra)
        {
            if (progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            progress.Increment(1);

            var fs = spectrum.Query.FileScan;
            if (reader.Has(fs.Experimental, fs.FirstScan))
            {
                spectrum.SetIsobaricItem(reader.Read(fs.Experimental, fs.FirstScan, usedChannels));
            }
            else
            {
                // No record for this scan: clear any previously attached item.
                spectrum.SetIsobaricItem(null);
            }
        }
    }
}
/// <summary>
/// Fills dbSNP information into the given SNP items by matching chromosome and position against a dbSNP VCF file.
/// The name of every matched SNPItem is replaced by its dbSNP name, and the mapping between the
/// resulting name and the old SNPItem name is returned.
/// </summary>
/// <param name="snpItems">SNP items to annotate. The sequence is enumerated once and its elements are modified in place.</param>
/// <param name="dbSnpVcfFile">Path of the dbSNP VCF file.</param>
/// <param name="progress">Optional progress sink; an <c>EmptyProgressCallback</c> is substituted when null.</param>
/// <returns>
/// Dictionary whose key is the final name of an item (the dbSNP name when a record matched,
/// otherwise the unchanged original name) and whose value is the original name.
/// </returns>
public static Dictionary<string, string> FillDbsnpIdByPosition(this IEnumerable<SNPItem> snpItems, string dbSnpVcfFile, IProgressCallback progress = null)
{
    if (progress == null)
    {
        progress = new EmptyProgressCallback();
    }

    // Materialize once: the items are indexed three times below and mutated in place,
    // so a lazily evaluated source must not be re-enumerated.
    IEnumerable<SNPItem> items = snpItems.ToList();

    // original name -> name to assign (starts as identity, overwritten on a dbSNP match)
    var sourceDbsnpMap = items.ToDictionary(m => m.Name, m => m.Name);

    // chromosome -> position -> item
    var dic = items.ToDoubleDictionary(m => m.Chrom, m => m.Position);

    progress.SetMessage("Filling dbSNP id from {0} ...", dbSnpVcfFile);
    using (var sr = new StreamReader(dbSnpVcfFile))
    {
        progress.SetRange(0, sr.BaseStream.Length);

        // Skip the leading "##" lines. The first line that does not start with "##"
        // is left in `line` and goes through the filter below like any other line.
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            if (!line.StartsWith("##"))
            {
                break;
            }
        }

        int linecount = 0;
        Dictionary<int, SNPItem> chrMap = null;
        int lastChr = -1;
        for (; line != null; line = sr.ReadLine())
        {
            linecount++;
            if (linecount % 10000 == 0)
            {
                progress.SetPosition(sr.GetCharpos());
            }

            // Make sure it is SNV.
            if (!line.Contains("VC=SNV"))
            {
                continue;
            }

            // Even if it is marked as SNV, it still could be an insertion/deletion, e.g.
            // 2  179658175  rs11537855  C  CC,CT  .  .  RS=11537855;RSPOS=179658175;dbSNPBuildID=120;SSR=0;SAO=0;VP=0x050100001205000002000110;GENEINFO=TTN:7273;WGT=1;VC=SNV;SLO;NSF;REF;ASP;OTHERKG;NOC
            // so require every REF and ALT allele to be exactly one character long.
            var parts = line.Split('\t');
            if (parts[3].Split(',').Any(l => l.Length != 1))
            {
                continue;
            }

            if (parts[4].Split(',').Any(l => l.Length != 1))
            {
                continue;
            }

            var chr = HumanChromosomeToInt(parts[0]);
            var position = int.Parse(parts[1]);

            // Cache the per-chromosome map across consecutive records. A failed lookup
            // nulls chrMap (TryGetValue assigns the default to its out argument), so the
            // cache must be treated as invalid whenever chrMap is null; otherwise a later
            // record whose chromosome equals lastChr would dereference null.
            if (chrMap == null || lastChr != chr)
            {
                if (!dic.TryGetValue(chr, out chrMap))
                {
                    chrMap = null;
                    continue;
                }

                lastChr = chr;
            }

            SNPItem source;
            if (!chrMap.TryGetValue(position, out source))
            {
                continue;
            }

            if (!source.Name.Equals(parts[2]))
            {
                // The item itself is renamed only after the whole file has been read,
                // so source.Name is still the original name here.
                sourceDbsnpMap[source.Name] = parts[2];
            }

            source.DbsnpRefAllele = parts[3][0];
            source.DbsnpAltAllele = parts[4][0];

            // NOTE(review): only detects the RV flag when it is enclosed by ';' on both
            // sides, i.e. not when it is the first or last INFO entry — confirm this is intended.
            source.DbsnpIsReversed = parts[7].Contains(";RV;");
        }
    }

    // Apply the renames and build the (final name -> original name) result.
    var snpMap = items.ToDictionary(m => m.Name);
    var result = new Dictionary<string, string>();
    foreach (var r in sourceDbsnpMap)
    {
        result[r.Value] = r.Key;
        if (!r.Key.Equals(r.Value))
        {
            snpMap[r.Key].Name = r.Value;
        }
    }

    progress.SetMessage("Filling dbSNP id finished.");
    return result;
}
/// <summary>
/// Fills the name, description and sequence of every protein in <paramref name="t"/> from the
/// FASTA entry whose parsed access number matches the protein.
/// </summary>
/// <param name="acParser">Parser applied to protein names and FASTA entry names to obtain access numbers.</param>
/// <param name="fastaFilename">Path of the FASTA database file.</param>
/// <param name="t">Identified result whose protein groups are enumerated and updated in place.</param>
/// <param name="progress">Progress sink; an <c>EmptyProgressCallback</c> is substituted when null.</param>
/// <exception cref="UserTerminatedException">Thrown when the progress callback reports a pending cancellation.</exception>
/// <exception cref="Exception">
/// Thrown when two proteins resolve to the same access number or name, or when a protein whose name
/// does not start with "XXX_" still has no sequence after the whole file has been read.
/// </exception>
public static void FillSequenceFromFasta(IStringParser<string> acParser, string fastaFilename, IIdentifiedResult t, IProgressCallback progress)
{
    if (progress == null)
    {
        progress = new EmptyProgressCallback();
    }

    progress.SetMessage("Initializing accessNumber/protein map ...");

    // Every protein is registered under its parsed access number and, when different,
    // also under its full name, so a FASTA entry can hit it through either key.
    var acMap = new Dictionary<string, IIdentifiedProtein>();
    foreach (IIdentifiedProteinGroup group in t)
    {
        foreach (IIdentifiedProtein protein in group)
        {
            string ac = acParser.GetValue(protein.Name);
            if (acMap.ContainsKey(ac))
            {
                throw new Exception("Duplicate access number " + ac);
            }

            acMap[ac] = protein;

            if (ac != protein.Name)
            {
                if (acMap.ContainsKey(protein.Name))
                {
                    throw new Exception("Duplicate access number " + protein.Name);
                }

                acMap[protein.Name] = protein;
            }
        }
    }

    progress.SetMessage("Filling sequence from database ...");

    var ff = new FastaFormat();
    using (var sr = new StreamReader(fastaFilename))
    {
        progress.SetRange(1, sr.BaseStream.Length);

        Sequence seq;
        while ((seq = ff.ReadSequence(sr)) != null)
        {
            if (progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            // NOTE(review): this is the position of the underlying stream, which a buffering
            // StreamReader may already have read past the entry just parsed; progress is
            // therefore coarse. GetContaminationAccessNumbers reports sr.GetCharpos() instead.
            progress.SetPosition(sr.BaseStream.Position);

            string ac = acParser.GetValue(seq.Name);

            IIdentifiedProtein protein;
            if (acMap.TryGetValue(ac, out protein))
            {
                protein.Name = seq.Name.Replace("/", " ");
                protein.Description = seq.Description.Replace("\t", " ").Replace("/", " ");
                protein.Sequence = seq.SeqString;
            }
        }
    }

    // A protein registered under two keys appears twice in acMap.Values;
    // de-duplicate so each missing protein is reported only once.
    var proteinNames = acMap.Values
        .Where(l => l.Sequence == null)
        .Select(l => l.Name)
        .Distinct()
        .ToArray();

    // Missing sequences are tolerated only when every affected protein name starts with
    // "XXX_" (presumably a decoy prefix — confirm against the database conventions).
    if (proteinNames.Length > 0 && !proteinNames.All(l => l.StartsWith("XXX_")))
    {
        throw new Exception(string.Format("Couldn't find sequence of following protein(s), change access number pattern or select another database\n{0}", proteinNames.Merge("/")));
    }

    progress.SetMessage("Fill sequence from database finished.");
}