Ejemplo n.º 1
0
        /// <summary>
        /// Scans a FASTA file and collects the access number of every entry whose
        /// reference text matches the given pattern (matching ignores case).
        /// </summary>
        /// <param name="acParser">Parser applied to each sequence name to obtain its access number.</param>
        /// <param name="fastaFilename">Path of the FASTA file to scan.</param>
        /// <param name="contaminationDescriptionPattern">Regular expression tested against each sequence reference.</param>
        /// <param name="progress">Progress/cancellation callback; when null an EmptyProgressCallback is used.</param>
        /// <returns>The set of access numbers of all matching entries.</returns>
        /// <exception cref="UserTerminatedException">The callback reported a pending cancellation.</exception>
        public static HashSet<string> GetContaminationAccessNumbers(IStringParser<string> acParser, string fastaFilename, string contaminationDescriptionPattern,
                                                                    IProgressCallback progress)
        {
            var contaminants = new HashSet<string>();

            progress = progress ?? new EmptyProgressCallback();

            var pattern = new Regex(contaminationDescriptionPattern, RegexOptions.IgnoreCase);

            progress.SetMessage("Get contamination map from database ...");
            var fastaReader = new FastaFormat();

            using (var stream = new StreamReader(fastaFilename))
            {
                progress.SetRange(1, stream.BaseStream.Length);

                for (Sequence entry = fastaReader.ReadSequence(stream); entry != null; entry = fastaReader.ReadSequence(stream))
                {
                    if (progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    progress.SetPosition(stream.GetCharpos());

                    // The access number is parsed for every entry, before the pattern test,
                    // so a parser failure surfaces regardless of whether the entry matches.
                    string accessNumber = acParser.GetValue(entry.Name);

                    if (!pattern.IsMatch(entry.Reference))
                    {
                        continue;
                    }

                    contaminants.Add(accessNumber);
                }
            }

            progress.SetMessage("Get contamination map from database finished.");

            return contaminants;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Reads the isobaric labelling information for each spectrum from
        /// <paramref name="isobaricFile"/> and attaches it to that spectrum.
        /// Spectra that have no entry in the file get their isobaric item set to null.
        /// </summary>
        /// <param name="spectra">Spectra to annotate; each is looked up by the experimental name and first scan of its Query.FileScan.</param>
        /// <param name="isobaricFile">Isobaric result file to read.</param>
        /// <param name="readPeaks">Passed through to IsobaricResultFileFormatFactory.GetXmlReader.</param>
        /// <param name="progress">Progress/cancellation callback; when null an EmptyProgressCallback is used.</param>
        /// <exception cref="System.ArgumentNullException"><paramref name="spectra"/> is null.</exception>
        /// <exception cref="UserTerminatedException">The callback reported a pending cancellation.</exception>
        public static void Load(List<IIdentifiedSpectrum> spectra, string isobaricFile, bool readPeaks = false, IProgressCallback progress = null)
        {
            if (spectra == null)
            {
                // Fail up front instead of after the reader has been opened.
                throw new System.ArgumentNullException("spectra");
            }

            if (progress == null)
            {
                progress = new EmptyProgressCallback();
            }

            using (var reader = IsobaricResultFileFormatFactory.GetXmlReader(true, readPeaks))
            {
                var usedChannels = IsobaricScanXmlUtils.GetUsedChannels(isobaricFile);

                reader.Open(isobaricFile);

                progress.SetMessage("Reading Isobaric from {0} ...", isobaricFile);
                progress.SetRange(1, spectra.Count);

                foreach (var spectrum in spectra)
                {
                    if (progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    progress.Increment(1);

                    var fs = spectrum.Query.FileScan;
                    if (reader.Has(fs.Experimental, fs.FirstScan))
                    {
                        spectrum.SetIsobaricItem(reader.Read(fs.Experimental, fs.FirstScan, usedChannels));
                    }
                    else
                    {
                        // Explicitly clear any previously attached item.
                        spectrum.SetIsobaricItem(null);
                    }
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Fills dbSNP information into the given SNP items by matching chromosome and position
        /// against a dbSNP VCF file. For every match the reference/alternative alleles and the
        /// reversed flag are stored on the item, and the item is renamed to the dbSNP id.
        /// </summary>
        /// <param name="snpItems">Items to annotate. Their names must be unique (they are used as dictionary keys).</param>
        /// <param name="dbSnpVcfFile">Path of the dbSNP VCF file.</param>
        /// <param name="progress">Progress callback; when null an EmptyProgressCallback is used.</param>
        /// <returns>
        /// A map from each item's final name (the dbSNP id when one was found, otherwise the
        /// unchanged name) to the name the item had on entry.
        /// </returns>
        public static Dictionary<string, string> FillDbsnpIdByPosition(this IEnumerable<SNPItem> snpItems, string dbSnpVcfFile, IProgressCallback progress = null)
        {
            // original name -> name to use afterwards (identity until a dbSNP id is found)
            var sourceDbsnpMap = snpItems.ToDictionary(m => m.Name, m => m.Name);

            if (progress == null)
            {
                progress = new EmptyProgressCallback();
            }

            // chromosome -> position -> item
            var dic = snpItems.ToDoubleDictionary(m => m.Chrom, m => m.Position);

            progress.SetMessage("Filling dbSNP id from {0} ...", dbSnpVcfFile);
            using (var sr = new StreamReader(dbSnpVcfFile))
            {
                progress.SetRange(0, sr.BaseStream.Length);

                // Skip the leading "##" meta lines; 'line' ends up on the first other line (or null at EOF).
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    if (!line.StartsWith("##"))
                    {
                        break;
                    }
                }

                int linecount = 0;

                // Cache of the per-chromosome map for the chromosome seen on the previous data line.
                // 'chrMap' is null when that chromosome has no items. 'lastChr' is nullable so that
                // no real chromosome number can be mistaken for the "nothing cached yet" state.
                Dictionary<int, SNPItem> chrMap = null;
                int? lastChr = null;

                // The for-iterator advances the reader on every path, including 'continue'.
                for (; line != null; line = sr.ReadLine())
                {
                    linecount++;

                    if (linecount % 10000 == 0)
                    {
                        progress.SetPosition(sr.GetCharpos());
                    }

                    //make sure it is SNV
                    if (!line.Contains("VC=SNV"))
                    {
                        continue;
                    }

                    //Even it marked as SNV, it still could be insertion/deletion
                    //2       179658175       rs11537855      C       CC,CT   .       .       RS=11537855;RSPOS=179658175;dbSNPBuildID=120;SSR=0;SAO=0;VP=0x050100001205000002000110;GENEINFO=TTN:7273;WGT=1;VC=SNV;SLO;NSF;REF;ASP;OTHERKG;NOC
                    var parts = line.Split('\t');
                    if (parts[3].Split(',').Any(l => l.Length != 1))
                    {
                        continue;
                    }

                    if (parts[4].Split(',').Any(l => l.Length != 1))
                    {
                        continue;
                    }

                    var chr      = HumanChromosomeToInt(parts[0]);
                    var position = int.Parse(parts[1]);

                    if (lastChr != chr)
                    {
                        // Always remember the chromosome, even when it has no items. Otherwise a
                        // later line on the previously cached chromosome would skip the lookup
                        // while chrMap had been reset to null by the failed TryGetValue.
                        lastChr = chr;
                        if (!dic.TryGetValue(chr, out chrMap))
                        {
                            chrMap = null;
                        }
                    }

                    if (chrMap == null)
                    {
                        continue;
                    }

                    SNPItem source;
                    if (!chrMap.TryGetValue(position, out source))
                    {
                        continue;
                    }

                    if (!source.Name.Equals(parts[2]))
                    {
                        // The actual rename happens after the file is read, so source.Name is still the original key here.
                        sourceDbsnpMap[source.Name] = parts[2];
                    }

                    source.DbsnpRefAllele  = parts[3][0];
                    source.DbsnpAltAllele  = parts[4][0];

                    // Match the RV flag as a whole ';'-separated token so it is also found
                    // when it is the first or last entry of the INFO column.
                    source.DbsnpIsReversed = parts[7].Split(';').Any(flag => flag == "RV");
                }
            }

            var snpMap = snpItems.ToDictionary(m => m.Name);
            var result = new Dictionary<string, string>();

            foreach (var r in sourceDbsnpMap)
            {
                result[r.Value] = r.Key;
                if (!r.Key.Equals(r.Value))
                {
                    snpMap[r.Key].Name = r.Value;
                }
            }

            progress.SetMessage("Filling dbSNP id finished.");
            return result;
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Fills name, description and sequence of every protein in <paramref name="t"/> from the
        /// matching entry of a FASTA file. Proteins are matched by the access number that
        /// <paramref name="acParser"/> extracts from the protein name and from the FASTA sequence name;
        /// each protein is additionally registered under its full name.
        /// </summary>
        /// <param name="acParser">Parser that extracts the access number from a protein/sequence name.</param>
        /// <param name="fastaFilename">Path of the FASTA file to read.</param>
        /// <param name="t">Identified result whose protein groups are enumerated and updated in place.</param>
        /// <param name="progress">Progress/cancellation callback; when null an EmptyProgressCallback is used.</param>
        /// <exception cref="Exception">
        /// Two proteins share an access number or name, or at least one protein whose name does not
        /// start with "XXX_" has no sequence after the file was read.
        /// </exception>
        /// <exception cref="UserTerminatedException">The callback reported a pending cancellation.</exception>
        public static void FillSequenceFromFasta(IStringParser<string> acParser, string fastaFilename, IIdentifiedResult t,
                                                 IProgressCallback progress)
        {
            if (progress == null)
            {
                progress = new EmptyProgressCallback();
            }

            progress.SetMessage("Initializing accessNumber/protein map ...");

            var acMap = new Dictionary<string, IIdentifiedProtein>();

            foreach (IIdentifiedProteinGroup group in t)
            {
                foreach (IIdentifiedProtein protein in group)
                {
                    string ac = acParser.GetValue(protein.Name);
                    if (acMap.ContainsKey(ac))
                    {
                        throw new Exception("Duplicate access number " + ac);
                    }
                    acMap[ac] = protein;

                    // Also index by the full name so a FASTA entry can match either key.
                    if (ac != protein.Name)
                    {
                        if (acMap.ContainsKey(protein.Name))
                        {
                            throw new Exception("Duplicate access number " + protein.Name);
                        }
                        acMap[protein.Name] = protein;
                    }
                }
            }

            progress.SetMessage("Filling sequence from database ...");
            var ff = new FastaFormat();

            using (var sr = new StreamReader(fastaFilename))
            {
                progress.SetRange(1, sr.BaseStream.Length);

                Sequence seq;
                while ((seq = ff.ReadSequence(sr)) != null)
                {
                    if (progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    progress.SetPosition(sr.BaseStream.Position);

                    string ac = acParser.GetValue(seq.Name);

                    // Single lookup instead of ContainsKey followed by the indexer.
                    IIdentifiedProtein matched;
                    if (acMap.TryGetValue(ac, out matched))
                    {
                        matched.Name        = seq.Name.Replace("/", " ");
                        matched.Description = seq.Description.Replace("\t", " ").Replace("/", " ");
                        matched.Sequence    = seq.SeqString;
                    }
                }
            }

            var failed = acMap.Values.Where(l => l.Sequence == null).ToList();

            if (failed.Count > 0)
            {
                // A protein indexed under both access number and name occurs twice in acMap.Values;
                // de-duplicate so each missing protein is reported once.
                var proteinNames = failed.ConvertAll(l => l.Name).Distinct().ToArray();

                // Proteins whose names start with "XXX_" are allowed to have no sequence.
                if (!proteinNames.All(l => l.StartsWith("XXX_")))
                {
                    throw new Exception(string.Format("Couldn't find sequence of following protein(s), change access number pattern or select another database\n{0}", proteinNames.Merge("/")));
                }
            }

            progress.SetMessage("Fill sequence from database finished.");
        }