Пример #1
0
        /// <summary>
        /// Collects the distinct names of perfectly mapped reads listed in <paramref name="readNameFile"/>.
        /// </summary>
        /// <remarks>
        /// A file whose extension is neither ".bam" nor ".sam" is treated as plain text and every line
        /// becomes one entry. For a bam/sam file only the lines containing the text "NM:i:0" are kept,
        /// and the entry is whatever <c>StringBefore("\t")</c> yields for that line.
        /// </remarks>
        /// <param name="readNameFile">Path of the text, bam or sam file to read.</param>
        /// <returns>The set of read names that were found.</returns>
        private HashSet <string> ReadPerfectMappedReadNames(string readNameFile)
        {
            HashSet <string> reads;

            var ext = Path.GetExtension(readNameFile);

            // Ordinal, case-insensitive comparison keeps the extension check independent of the current culture.
            var isAlignmentFile = string.Equals(ext, ".bam", StringComparison.OrdinalIgnoreCase) ||
                                  string.Equals(ext, ".sam", StringComparison.OrdinalIgnoreCase);

            if (!isAlignmentFile)
            {
                Progress.SetMessage("Reading perfect mapped reads from text file {0} ...", readNameFile);

                // Stream the lines straight into the set instead of materialising an array first.
                reads = new HashSet <string>(File.ReadLines(readNameFile));
            }
            else
            {
                Progress.SetMessage("Reading perfect mapped reads from bam/sam file {0} ...", readNameFile);
                reads = new HashSet <string>();
                using (var sr = SAMFactory.GetReader(readNameFile, true))
                {
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        // Only alignments tagged with zero mismatches are kept.
                        if (line.Contains("NM:i:0"))
                        {
                            reads.Add(line.StringBefore("\t"));
                        }
                    }
                }
            }

            Progress.SetMessage("{0} perfect mapped reads.", reads.Count);
            return(reads);
        }
 /// <summary>
 /// Creates a builder that reads alignments from <paramref name="fileName"/>.
 /// </summary>
 /// <param name="options">Builder options; also the source of the SAM format object.</param>
 /// <param name="fileName">Alignment file handed to <c>SAMFactory.GetReader</c>.</param>
 public AlignedPositionMapBuilder(AlignedPositionMapBuilderOptions options, string fileName)
 {
     this._options = options;
     this._format = options.GetSAMFormat();

     // The reader is kept open in a field; this constructor does not dispose it.
     this._file = SAMFactory.GetReader(fileName, true);

     this._list = new AlignedPositionMapList();
     this._done = new List <AlignedPositionMap>();
 }
Пример #3
0
        /// <summary>
        /// Copies the headers of the bam file and every alignment line whose
        /// "sequence:reference:position" key also occurs in the mapped-read count file.
        /// </summary>
        /// <returns>An array holding the output file name.</returns>
        /// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
        public override IEnumerable <string> Process()
        {
            var xmlFormat = new MappedItemGroupXmlFileFormat();

            Progress.SetMessage("reading mapped reads from " + _options.CountFile + " ...");
            var mappedGroups = xmlFormat.ReadFromFile(_options.CountFile);

            // One key per aligned location found in the count file.
            var knownLoci = new HashSet <string>(
                mappedGroups.SelectMany(grp => grp)
                            .SelectMany(mi => mi.MappedRegions)
                            .SelectMany(mr => mr.AlignedLocations)
                            .Select(al => string.Format("{0}:{1}:{2}", al.Parent.Sequence, al.Seqname, al.Start)));

            Progress.SetMessage("There are {0} unique sequence:locus", knownLoci.Count);

            using (var writer = new StreamWriter(_options.OutputFile))
            using (var reader = SAMFactory.GetReader(_options.BamFile, false))
            {
                foreach (var header in reader.ReadHeaders())
                {
                    writer.WriteLine(header);
                }

                var total = 0;
                var kept  = 0;
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // Cancellation is polled on every 1000th line, starting with the very first one.
                    if (total % 1000 == 0 && Progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    if (total > 0 && total % 100000 == 0)
                    {
                        Progress.SetMessage("{0} candidates from {1} reads", kept, total);
                    }

                    total++;

                    var fields = line.Split('\t');
                    var key    = string.Format("{0}:{1}:{2}",
                                               fields[SAMFormatConst.SEQ_INDEX],
                                               fields[SAMFormatConst.RNAME_INDEX],
                                               fields[SAMFormatConst.POS_INDEX]);

                    if (knownLoci.Contains(key))
                    {
                        writer.WriteLine(line);
                        kept++;
                    }
                }
            }

            return(new[] { _options.OutputFile });
        }
Пример #4
0
 /// <summary>
 /// Fills <c>Chromosomes</c> from the "@SQ" header lines of <paramref name="normalFile"/>.
 /// </summary>
 /// <param name="normalFile">Alignment file whose headers are read.</param>
 private void GetChromosomes(string normalFile)
 {
     using (var reader = SAMFactory.GetReader(normalFile, false))
     {
         // For each @SQ line keep the text between "SN:" and the following tab.
         this.Chromosomes = reader.ReadHeaders()
                                  .Where(header => header.StartsWith("@SQ"))
                                  .Select(header => header.StringAfter("SN:").StringBefore("\t"))
                                  .ToList();
     }
 }
Пример #5
0
        /// <summary>
        /// Writes one FASTQ record per distinct query name found in the input alignment file.
        /// </summary>
        /// <remarks>
        /// Only the first alignment line seen for a given query name is written; later lines with the
        /// same name are skipped. Progress is reported, and cancellation polled, every 100000 lines.
        /// </remarks>
        /// <returns>An array holding the output file name.</returns>
        /// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
        private IEnumerable <string> DoSingleEndProcess()
        {
            using (var sw = new StreamWriter(options.OutputFile))
            using (var sr = SAMFactory.GetReader(options.InputFile, options.Samtools, true))
            {
                string line;
                var    count   = 0;
                var    written = new HashSet <string>();
                while ((line = sr.ReadLine()) != null)
                {
                    count++;

                    if (count % 100000 == 0)
                    {
                        Progress.SetMessage("{0} reads", count);
                        if (Progress.IsCancellationPending())
                        {
                            throw new UserTerminatedException();
                        }
                    }

                    try
                    {
                        var ss = LineToSamItem(line);

                        // HashSet.Add returns false for a name that was already emitted, so a single
                        // lookup replaces the separate Contains/Add pair. No per-read console output
                        // is produced here: the leftover debug print of every query name was removed.
                        if (!written.Add(ss.Qname))
                        {
                            continue;
                        }

                        ss.WriteFastq(sw);
                    }
                    catch (Exception ex)
                    {
                        // Log the whole exception (type, message and stack trace), not just the stack trace.
                        Console.Error.WriteLine("Error of line {0} : {1}", line, ex);
                        throw;
                    }
                }
            }
            return(new[] { options.OutputFile });
        }
        /// <summary>
        /// Validates the options: prepares the output directory, checks that both alignment files can
        /// be opened, and resolves or reports the chromosome names.
        /// </summary>
        /// <returns>
        /// <c>false</c> when the output directory could not be prepared or a reader could not be opened
        /// (the exception message is appended to <c>ParsingErrors</c>); otherwise <c>true</c>.
        /// </returns>
        public override bool PrepareOptions()
        {
            if (!PrepareOutputDirectory())
            {
                return(false);
            }

            try
            {
                // Open and immediately dispose each reader; only the ability to open the file matters here.
                using (SAMFactory.GetReader(this.NormalFile)) { }
                using (SAMFactory.GetReader(this.TumorFile)) { }
            }
            catch (Exception ex)
            {
                ParsingErrors.Add(ex.Message);
                return(false);
            }

            if (!(this.ThreadCount >= 2))
            {
                // Non-threaded mode: just report the names when some were supplied.
                var supplied = this.ChromosomeNames;
                if (supplied != null && supplied.Count > 0)
                {
                    Console.Out.WriteLine("#mpileup chromosome names: " + supplied.Merge(","));
                }

                return(true);
            }

            Console.WriteLine("Checking chromosome names for thread mode ...");

            var current = this.ChromosomeNames;
            if (current == null || current.Count == 0)
            {
                // Nothing supplied: fall back to the names obtained from the normal file.
                this.ChromosomeNames = SAMUtils.GetChromosomes(this.NormalFile);
            }

            foreach (var name in this.ChromosomeNames)
            {
                Console.WriteLine(name);
            }

            return(true);
        }
Пример #7
0
        /// <summary>
        /// Reads the alignments in <c>options.InputFile</c>, accumulates them in a
        /// <c>PileupCountList</c> and writes one VCF-style line to <c>options.OutputFile</c> for every
        /// finished pileup entry that passes the Fisher exact test and allele-frequency thresholds.
        /// </summary>
        /// <remarks>
        /// When <c>options.ExportIgvScript</c> is set, a script is also written to
        /// <c>options.OutputFile + ".igv"</c> containing a goto/sort/snapshot command triple for every
        /// reported position that lies inside at least one loaded region.
        /// NOTE(review): nothing visible here processes entries still held by the
        /// <c>PileupCountList</c> after the last line was read - confirm whether a final flush is needed.
        /// </remarks>
        /// <returns>An array holding the output file name.</returns>
        /// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
        public override IEnumerable <string> Process()
        {
            PileupCountList pc = new PileupCountList();

            var format = options.GetSAMFormat();

            // Per-query counts; looked up by query name when a read is added to the pileup.
            var cm = new SmallRNACountMap(options.CountFile);

            var srItems = SequenceRegionUtils.GetSequenceRegions(options.CoordinateFile, "miRNA", options.BedAsGtf);

            // Region names get the same StringAfter("chr") treatment as the read reference names below,
            // presumably so that both sides use names without a "chr" prefix.
            srItems.ForEach(m =>
            {
                m.Seqname = m.Seqname.StringAfter("chr");
            });
            // Regions grouped by sequence name for lookup while reporting.
            var srmap = srItems.GroupBy(m => m.Seqname).ToDictionary(m => m.Key, m => m.ToList());

            // Optional IGV script writer; closed in the finally block below.
            StreamWriter swScript = null;

            try
            {
                if (options.ExportIgvScript)
                {
                    swScript = new StreamWriter(options.OutputFile + ".igv");
                    swScript.WriteLine("snapshotDirectory {0}", Path.GetDirectoryName(options.OutputFile).Replace('\\', '/'));
                }

                using (StreamWriter sw = new StreamWriter(options.OutputFile))
                {
                    // VCF header; {0}..{4} are filled from the arguments that follow the literal.
                    sw.WriteLine(@"##fileformat=VCFv4.2
##fileDate={0:yyyyMMdd}
##source={1}
##phasing=partial
##INFO=<ID=NS,Number=1,Type=Integer,Description=""Number of Samples With Data"">
##INFO=<ID=DP,Number=1,Type=Integer,Description=""Total Depth"">
##INFO=<ID=AF,Number=A,Type=Float,Description=""Allele Frequency"">
##INFO=<ID=FP,Number=1,Type=Float,Description=""Fisher Exact Test P-Value"">
##INFO=<ID=MN,Number=.,Type=String,Description=""miRNA name contains this position"">
##FILTER=<ID=FisherET,Description=""Fisher exact test Pvalue less than {2}"">
##FILTER=<ID=AltAlleFreq,Description=""Alternative allele frequency less than {3}"">
##FILTER=<ID=notMiRNA,Description=""Position not located in miRNA locus"">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description=""Read Depth"">
##FORMAT=<ID=AD,Number=1,Type=Integer,Description=""Allelic Depth"">
#CHROM  POS ID  REF ALT QUAL  FILTER  INFO  FORMAT  {4}",
                                 DateTime.Now,
                                 "PileupCountBuilder",
                                 options.FisherPValue,
                                 options.MinimumAlternativeAlleleFrequency,
                                 Path.GetFileNameWithoutExtension(options.InputFile));

                    using (var sr = SAMFactory.GetReader(options.InputFile, true))
                    {
                        int    count = 0;
                        string line;
                        while ((line = sr.ReadLine()) != null)
                        {
                            count++;

                            // Poll for cancellation every 100 lines.
                            if (count % 100 == 0)
                            {
                                if (Progress.IsCancellationPending())
                                {
                                    throw new UserTerminatedException();
                                }
                            }

                            if (count % 100000 == 0)
                            {
                                Progress.SetMessage("{0} reads processed", count);
                            }

                            var parts = line.Split('\t');

                            var qname = parts[SAMFormatConst.QNAME_INDEX];
                            var seq   = parts[SAMFormatConst.SEQ_INDEX];

                            // Skip reads shorter than the configured minimum length.
                            if (seq.Length < options.MinimumReadLength)
                            {
                                continue;
                            }

                            SAMFlags flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);
                            // Skip reads flagged as unmapped.
                            if (flag.HasFlag(SAMFlags.UnmappedQuery))
                            {
                                continue;
                            }

                            var cigar = parts[SAMFormatConst.CIGAR_INDEX];
                            // Skip alignments whose CIGAR contains an insertion (I) or deletion (D).
                            if (cigar.Any(m => m == 'I' || m == 'D'))
                            {
                                continue;
                            }

                            var sam = new SAMAlignedItem()
                            {
                                Qname = qname,
                            };

                            // The item stores the reverse-complemented sequence for reverse-strand reads,
                            // while the location below always receives the sequence as found in the file.
                            bool isReversed = flag.HasFlag(SAMFlags.QueryOnReverseStrand);
                            char strand;
                            if (isReversed)
                            {
                                strand       = '-';
                                sam.Sequence = SequenceUtils.GetReverseComplementedSequence(seq);
                            }
                            else
                            {
                                strand       = '+';
                                sam.Sequence = seq;
                            }

                            var loc = new SAMAlignedLocation(sam)
                            {
                                Seqname           = parts[SAMFormatConst.RNAME_INDEX].StringAfter("chr"),
                                Start             = int.Parse(parts[SAMFormatConst.POS_INDEX]),
                                Strand            = strand,
                                Cigar             = parts[SAMFormatConst.CIGAR_INDEX],
                                MismatchPositions = format.GetMismatchPositions(parts),
                                NumberOfMismatch  = format.GetNumberOfMismatch(parts),
                                Sequence          = seq
                            };

                            loc.ParseEnd(sam.Sequence);
                            sam.AddLocation(loc);

                            if (format.HasAlternativeHits)
                            {
                                format.ParseAlternativeHits(parts, sam);
                            }

                            // Add the read with its count; the call hands back the entries that are
                            // finished (may be null or empty, in which case there is nothing to report yet).
                            var finished = pc.Add(sam, cm.GetCount(sam.Qname));
                            if (null == finished || 0 == finished.Count)
                            {
                                continue;
                            }

                            foreach (var fin in finished)
                            {
                                //if (fin.Chromosome.Equals("1") && fin.Position == 5160725)
                                //{
                                //  Console.WriteLine(fin);
                                //}
                                var ft = fin.FisherExactTest();
                                // Report only entries whose p-value is at or below the configured threshold ...
                                if (ft.PValue <= options.FisherPValue)
                                {
                                    var total     = fin.Sum(m => m.Value);
                                    var minallele = total * options.MinimumAlternativeAlleleFrequency;
                                    // ... and whose Sample2.Failed value reaches the minimum allele count.
                                    if (ft.Sample2.Failed >= minallele)
                                    {
                                        List <GtfItem> srs;
                                        List <string>  ranges = new List <string>();

                                        // Collect the names of all loaded regions containing this position.
                                        // NOTE(review): the lookup key is the sequence name of the read that
                                        // was just added, not fin.Chromosome - confirm the two always agree
                                        // for finished entries.
                                        if (srmap.TryGetValue(sam.Locations[0].Seqname, out srs))
                                        {
                                            foreach (var seqr in srs)
                                            {
                                                if (seqr.Contains(fin.Position))
                                                {
                                                    ranges.Add(seqr.GetNameLocation());
                                                }
                                            }
                                        }

                                        // Entries whose key differs from the reference, ordered by key.
                                        var alter = (from r in fin
                                                     where r.Key != fin.Reference
                                                     orderby r.Key
                                                     select r).ToList();

                                        // One output line: ID is ".", QUAL is 0, NS is 1; FILTER is "notMiRNA"
                                        // when no region contains the position and "PASS" otherwise.
                                        var str = string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\tNS={7};DP={8};AF={9};FP={10:0.##E0}{11}\tDP:AD\t{12}:{13},{14}",
                                                                fin.Chromosome,
                                                                fin.Position,
                                                                ".",
                                                                fin.Reference,
                                                                (from r in alter
                                                                 select r.Key.ToString()).Merge(","),
                                                                0,
                                                                ranges.Count == 0 ? "notMiRNA" : "PASS",
                                                                1,
                                                                total,
                                                                (from r in alter
                                                                 select string.Format("{0:0.###}", r.Value * 1.0 / total)).Merge(","),
                                                                ft.PValue,
                                                                ranges.Count == 0 ? "" : ";" + ranges.Merge(","),
                                                                total,
                                                                ft.Sample2.Succeed,
                                                                (from r in alter
                                                                 select r.Value.ToString()).Merge(","));

                                        sw.WriteLine(str);
                                        //Console.WriteLine(str);

                                        // IGV commands are only written for positions inside a loaded region;
                                        // the first region name (with '(', ')' and ':' replaced by '_') is
                                        // used in the snapshot file name.
                                        if (swScript != null && ranges.Count > 0)
                                        {
                                            swScript.WriteLine(@"goto {0}:{1}
sort position
snapshot {0}_{2}_{1}.png", fin.Chromosome, fin.Position, ranges[0].Replace('(', '_').Replace(')', '_').Replace(':', '_'));
                                        }
                                    }
                                }
                            }

                            finished.Clear();
                        }
                    }
                }
            }
            finally
            {
                // Always close the optional script writer, even when processing failed.
                if (swScript != null)
                {
                    swScript.Close();
                }
            }
            return(new string[] { options.OutputFile });
        }
Пример #8
0
        /// <summary>
        /// Parses the alignment file <paramref name="fileName"/> and builds one
        /// <c>ChromosomeCountSlimItem</c> per accepted reference name, each holding the queries
        /// aligned to that reference.
        /// </summary>
        /// <remarks>
        /// Unmapped reads and reads whose reference name does not match
        /// <c>options.ChromosomePattern</c> (when one is given) are ignored. Query objects are shared:
        /// the same <c>SAMChromosomeItem</c> instance is referenced by every item it aligned to.
        /// </remarks>
        /// <param name="fileName">Alignment file handed to <c>SAMFactory.GetReader</c>.</param>
        /// <returns>The items in order of first appearance of their reference name.</returns>
        /// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
        public List <ChromosomeCountSlimItem> Build(string fileName)
        {
            // Optional name map, loaded into the nameMap field (presumably consulted by GetName below).
            if (File.Exists(options.CategoryMapFile))
            {
                Progress.SetMessage("Reading name map file " + options.CategoryMapFile + " ...");
                nameMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile).ToDictionary(m => m.Key, m => m.Value.Value);
            }

            var result = new List <ChromosomeCountSlimItem>();

            // Lookup tables: one query object per query name, one item per reference name.
            var queries     = new Dictionary <string, SAMChromosomeItem>();
            var chromosomes = new Dictionary <string, ChromosomeCountSlimItem>();

            Regex chromosomeRegex = null;
            Func <string, bool> acceptChromosome;

            // Without a pattern every reference name is accepted.
            if (string.IsNullOrEmpty(options.ChromosomePattern))
            {
                acceptChromosome = m => true;
            }
            else
            {
                chromosomeRegex  = new Regex(options.ChromosomePattern);
                acceptChromosome = m => chromosomeRegex.Match(m).Success;
            }

            Progress.SetMessage("Parsing alignment file " + fileName + " ...");
            using (var sr = SAMFactory.GetReader(fileName, true))
            {
                int    count        = 0;
                int    waitingcount = 0;
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Cancellation is polled on every 1000th line, starting with the very first one.
                    if (count % 1000 == 0)
                    {
                        if (Progress.IsCancellationPending())
                        {
                            throw new UserTerminatedException();
                        }
                    }

                    if (count % 100000 == 0 && count > 0)
                    {
                        Progress.SetMessage("{0} candidates from {1} reads", waitingcount, count);
                    }

                    count++;

                    var parts = line.Split('\t');

                    SAMFlags flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);

                    // Skip reads flagged as unmapped.
                    if (flag.HasFlag(SAMFlags.UnmappedQuery))
                    {
                        continue;
                    }

                    var seqname = GetName(parts[SAMFormatConst.RNAME_INDEX]);
                    if (!acceptChromosome(seqname))
                    {
                        continue;
                    }

                    // Reuse the query object when this query name was seen before.
                    var qname = parts[SAMFormatConst.QNAME_INDEX];
                    SAMChromosomeItem query;
                    if (!queries.TryGetValue(qname, out query))
                    {
                        query          = new SAMChromosomeItem();
                        query.Qname    = qname;
                        queries[qname] = query;

                        // The sequence is taken from the first accepted alignment of the query only,
                        // reverse-complemented when that alignment is on the reverse strand.
                        if (options.KeepSequence)
                        {
                            query.Sequence = parts[SAMFormatConst.SEQ_INDEX];
                            if (flag.HasFlag(SAMFlags.QueryOnReverseStrand))
                            {
                                query.Sequence = SequenceUtils.GetReverseComplementedSequence(query.Sequence);
                            }
                        }
                    }

                    // May add duplicates; they are removed after the loop.
                    query.Chromosomes.Add(seqname);

                    ChromosomeCountSlimItem item;
                    if (!chromosomes.TryGetValue(seqname, out item))
                    {
                        item = new ChromosomeCountSlimItem();
                        item.Names.Add(seqname);
                        chromosomes[seqname] = item;
                        result.Add(item);
                    }
                    // May add duplicates; they are removed after the loop.
                    item.Queries.Add(query);

                    waitingcount++;
                }

                Progress.SetMessage("Finally, there are {0} candidates from {1} reads", waitingcount, count);
            }

            // De-duplicate and sort the reference names of every query.
            foreach (var query in queries.Values)
            {
                query.Chromosomes = query.Chromosomes.Distinct().OrderBy(m => m).ToList();
            }

            // De-duplicate the queries of every item and sort them by query name.
            foreach (var sam in chromosomes.Values)
            {
                sam.Queries = sam.Queries.Distinct().OrderBy(m => m.Qname).ToList();
            }

            if (!string.IsNullOrEmpty(options.PreferPrefix))
            {
                // As written, this detaches each query from the reference names that START WITH the
                // prefix, and afterwards drops items that are left without queries.
                // NOTE(review): the option is called PreferPrefix and the Any(...) guard is redundant with
                // the Where(...) below, which suggests the Where filter may have been meant to select the
                // names that do NOT start with the prefix - confirm the intended behaviour.
                foreach (var query in queries.Values)
                {
                    if (query.Chromosomes.Any(l => l.StartsWith(options.PreferPrefix)))
                    {
                        // Snapshot the matches first because query.Chromosomes is modified in the loop.
                        var chroms = query.Chromosomes.Where(l => l.StartsWith(options.PreferPrefix)).ToArray();
                        foreach (var chrom in chroms)
                        {
                            chromosomes[chrom].Queries.Remove(query);
                            query.Chromosomes.Remove(chrom);
                        }
                    }
                }

                result.RemoveAll(l => l.Queries.Count == 0);
            }
            return(result);
        }
        /// <summary>
        /// Writes one FASTQ record to <paramref name="targetFile"/> for every distinct query name in
        /// <paramref name="sourceFile"/> that is not excluded and passes the optional <c>Filter</c>.
        /// </summary>
        /// <remarks>
        /// Query names are compared after <c>StringBefore(SmallRNAConsts.NTA_TAG)</c> has been applied.
        /// When <paramref name="countFile"/> exists, a "name&lt;TAB&gt;count" line per written read is also
        /// written to <c>targetFile + ".dupcount"</c>.
        /// </remarks>
        /// <param name="sourceFile">Alignment file to read.</param>
        /// <param name="targetFile">FASTQ file to write; a name ending in ".gz" is passed to the writer factory as such.</param>
        /// <param name="exceptQueryNames">Query names that must not be written.</param>
        /// <param name="countFile">Optional count file; ignored when it does not exist.</param>
        /// <returns>The number of reads written.</returns>
        /// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
        public int Extract(string sourceFile, string targetFile, IEnumerable <string> exceptQueryNames, string countFile)
        {
            // Starts as the exclusion list and grows with every name that gets written.
            var seen = new HashSet <string>(exceptQueryNames);

            var counts = new SmallRNACountMap();
            StreamWriter countWriter = null;

            if (File.Exists(countFile))
            {
                // Re-key the loaded counts by the name part before the NTA tag.
                foreach (var entry in new SmallRNACountMap(countFile).Counts)
                {
                    counts.Counts[entry.Key.StringBefore(SmallRNAConsts.NTA_TAG)] = entry.Value;
                }
                countWriter = new StreamWriter(targetFile + ".dupcount");
            }

            var extracted = 0;
            try
            {
                var gzipped = targetFile.ToLower().EndsWith(".gz");
                using (var fastqWriter = StreamUtils.GetWriter(targetFile, gzipped))
                using (var reader = SAMFactory.GetReader(sourceFile, true))
                {
                    var lineNumber = 0;
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        lineNumber++;

                        if (lineNumber % 100000 == 0)
                        {
                            Progress.SetMessage("{0} reads", lineNumber);
                            if (Progress.IsCancellationPending())
                            {
                                throw new UserTerminatedException();
                            }
                        }

                        var item = SAMUtils.Parse <SAMItemSlim>(line);
                        item.Qname = item.Qname.StringBefore(SmallRNAConsts.NTA_TAG);

                        // A name rejected by the filter is NOT remembered, so a later line with the
                        // same name still gets its own chance to pass.
                        var skip = seen.Contains(item.Qname) || (Filter != null && !Filter.Accept(item));
                        if (skip)
                        {
                            continue;
                        }

                        seen.Add(item.Qname);
                        item.WriteFastq(fastqWriter);

                        if (countWriter != null)
                        {
                            countWriter.WriteLine("{0}\t{1}", item.Qname, counts.Counts[item.Qname]);
                        }

                        extracted++;
                    }
                }
            }
            finally
            {
                if (countWriter != null)
                {
                    countWriter.Close();
                }
            }

            return(extracted);
        }
Пример #10
0
        /// <summary>
        /// Reads the alignment file <paramref name="fileName"/> and returns one item per alignment
        /// line that passes every check of <c>_filter</c>.
        /// </summary>
        /// <param name="fileName">Alignment file handed to <c>SAMFactory.GetReader</c>.</param>
        /// <param name="totalQueries">
        /// Receives one <c>QueryInfo</c> per line read, accepted or not. Mismatch count and length are
        /// only filled in for lines that passed the flag check.
        /// </param>
        /// <returns>The accepted items, in file order.</returns>
        /// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
        protected override List <T> DoBuild <T>(string fileName, out List <QueryInfo> totalQueries)
        {
            var accepted = new List <T>();

            _format = _options.GetSAMFormat();

            totalQueries = new List <QueryInfo>();

            using (var reader = SAMFactory.GetReader(fileName, true))
            {
                var readCount    = 0;
                var featureCount = 0;
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    readCount++;

                    if (readCount % 1000 == 0 && Progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    var fields = line.Split('\t');

                    // Every line is recorded, even when it is rejected further down.
                    var qname = fields[SAMFormatConst.QNAME_INDEX];
                    var info  = new QueryInfo(qname);
                    totalQueries.Add(info);

                    var flag = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);
                    if (!_filter.AcceptFlags(flag))
                    {
                        continue;
                    }

                    var mismatches = _format.GetNumberOfMismatch(fields);
                    var seq        = fields[SAMFormatConst.SEQ_INDEX];

                    info.Mismatch = mismatches;
                    info.Length   = seq.Length;

                    // Checked in this order: mismatch count, query name, length (short-circuit).
                    if (!_filter.AcceptMismatch(mismatches) ||
                        !_filter.AcceptQueryName(qname) ||
                        !_filter.AcceptLength(seq.Length))
                    {
                        continue;
                    }

                    var cigar = fields[SAMFormatConst.CIGAR_INDEX];
                    if (!_filter.AcceptCigar(cigar))
                    {
                        continue;
                    }

                    var seqname = fields[SAMFormatConst.RNAME_INDEX].StringAfter("chr");
                    var start   = int.Parse(fields[SAMFormatConst.POS_INDEX]);
                    var end     = SAMUtils.ParseEnd(start, cigar);

                    var reversed = flag.HasFlag(SAMFlags.QueryOnReverseStrand);
                    var strand   = reversed ? '-' : '+';

                    var sam = new T();
                    var loc = new SAMAlignedLocation(sam)
                    {
                        Seqname = seqname,
                        Start   = start,
                        End     = end,
                        Strand  = strand,
                    };

                    if (!_filter.AcceptLocus(loc))
                    {
                        continue;
                    }

                    // Reverse-strand reads are stored reverse-complemented.
                    var storedSequence = reversed ? SequenceUtils.GetReverseComplementedSequence(seq) : seq;

                    sam.Qname    = qname;
                    sam.Sequence = storedSequence;

                    loc.AlignmentScore    = _format.GetAlignmentScore(fields);
                    loc.Cigar             = cigar;
                    loc.NumberOfMismatch  = mismatches;
                    loc.MismatchPositions = _format.GetMismatchPositions(fields);

                    if (_format.HasAlternativeHits)
                    {
                        _format.ParseAlternativeHits(fields, sam);
                    }

                    accepted.Add(sam);
                    featureCount++;

                    // Progress is reported per 100 accepted reads, not per 100 lines.
                    if (featureCount % 100 == 0)
                    {
                        Progress.SetMessage("{0} feature reads from {1} reads", featureCount, readCount);
                    }
                }
            }

            return(accepted);
        }
Пример #11
0
        /// <summary>
        /// Splits the paired reads of the input alignment file into two FASTQ files.
        /// </summary>
        /// <remarks>
        /// The outputs are named after <c>options.OutputFile</c> with ".1" / ".2" inserted before the
        /// extension. A read is held back until a line with the same query name and a different
        /// <c>Pos</c> shows up; both are then written, each to the writer indexed by its own <c>Pos</c>
        /// (1 or 2 - presumably the mate number set by <c>LineToPairedSamItem</c>). Reads still
        /// unpaired at the end go to an ".orphan" file, which is not part of the returned list.
        /// </remarks>
        /// <returns>The names of the two paired output files.</returns>
        /// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
        private IEnumerable <string> DoPairEndProcess()
        {
            var pending = new Dictionary <string, SAMItemSlim>();

            var extension = Path.GetExtension(options.OutputFile);
            var output1   = Path.ChangeExtension(options.OutputFile, ".1" + extension);
            var output2   = Path.ChangeExtension(options.OutputFile, ".2" + extension);

            // Query names whose pair has already been written.
            var completed = new HashSet <string>();

            using (var sw1 = new StreamWriter(output1))
            using (var sw2 = new StreamWriter(output2))
            {
                // Index 0 is unused so that Pos can be used directly as the index.
                var writers = new[] { null, sw1, sw2 };

                using (var reader = SAMFactory.GetReader(options.InputFile, options.Samtools, true))
                {
                    var lineNumber = 0;
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        lineNumber++;

                        if (lineNumber % 100000 == 0)
                        {
                            Progress.SetMessage("{0} reads", lineNumber);
                            if (Progress.IsCancellationPending())
                            {
                                throw new UserTerminatedException();
                            }
                        }

                        var current = LineToPairedSamItem(line);

                        if (completed.Contains(current.Qname))
                        {
                            continue;
                        }

                        SAMItemSlim mate;
                        if (!pending.TryGetValue(current.Qname, out mate))
                        {
                            // First sighting of this query name: wait for its mate.
                            pending[current.Qname] = current;
                            continue;
                        }

                        // Same Pos as the waiting read: not the mate, ignore this line.
                        if (mate.Pos == current.Pos)
                        {
                            continue;
                        }

                        current.WriteFastq(writers[current.Pos], true);
                        mate.WriteFastq(writers[mate.Pos], true);

                        completed.Add(current.Qname);
                        pending.Remove(current.Qname);
                    }
                }

                if (pending.Count > 0)
                {
                    var orphanFile = Path.ChangeExtension(options.OutputFile, ".orphan" + extension);
                    using (var orphanWriter = new StreamWriter(orphanFile))
                    {
                        foreach (var orphan in pending.Values)
                        {
                            orphan.WriteFastq(orphanWriter, true);
                        }
                    }
                }
            }

            return(new[] { output1, output2 });
        }
        /// <summary>
        /// Reads every line delivered by the SAM reader for <paramref name="fileName"/>, registers one
        /// <see cref="QueryInfo"/> per line and builds an item of type <typeparamref name="T"/> for each
        /// line that survives all filters.
        /// </summary>
        /// <param name="fileName">Alignment file passed to <c>SAMFactory.GetReader</c>.</param>
        /// <param name="totalQueries">
        /// Receives one entry per line read, including lines that are rejected afterwards.
        /// </param>
        /// <returns>The items built from the accepted lines, in the order they were read.</returns>
        /// <exception cref="UserTerminatedException">
        /// Thrown when cancellation is pending; this is polled once every 1000 lines.
        /// </exception>
        protected override List <T> DoBuild <T>(string fileName, out List <QueryInfo> totalQueries)
        {
            var accepted = new List <T>();

            _format      = _options.GetSAMFormat();
            totalQueries = new List <QueryInfo>();

            using (var reader = SAMFactory.GetReader(fileName, true))
            {
                int readCount      = 0;
                int candidateCount = 0;

                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    // Cancellation is polled before the counter is incremented, so the very first line is checked too.
                    if (readCount % 1000 == 0 && Progress.IsCancellationPending())
                    {
                        throw new UserTerminatedException();
                    }

                    if (readCount > 0 && readCount % 100000 == 0)
                    {
                        Progress.SetMessage("{0} candidates from {1} reads", candidateCount, readCount);
                    }

                    readCount++;

                    // Every line is recorded as a query, even when it is filtered out below.
                    var qname     = line.StringBefore("\t");
                    var queryInfo = new QueryInfo(qname);
                    totalQueries.Add(queryInfo);

                    var fields = line.Split('\t');
                    var flag   = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);

                    // Unmapped queries carry no alignment to evaluate.
                    if (flag.HasFlag(SAMFlags.UnmappedQuery))
                    {
                        continue;
                    }

                    var mismatches = _format.GetNumberOfMismatch(fields);
                    var sequence   = fields[SAMFormatConst.SEQ_INDEX];

                    queryInfo.Mismatch          = mismatches;
                    queryInfo.Length            = sequence.Length;
                    queryInfo.NoPenaltyMutation = 0;

                    if (_options.T2cAsNoPenaltyMutation)
                    {
                        // NOTE(review): this branch is empty in this implementation, so
                        // NoPenaltyMutation always stays 0 here — confirm whether that is intended.
                    }

                    // Filters are evaluated in this order: mismatch limit, query name, minimum length, maximum length.
                    bool rejected = mismatches > _options.MaximumMismatch ||
                                    !AcceptQueryName(qname) ||
                                    sequence.Length < _options.MinimumReadLength ||
                                    sequence.Length > _options.MaximumReadLength;
                    if (rejected)
                    {
                        continue;
                    }

                    var cigar = fields[SAMFormatConst.CIGAR_INDEX];

                    // For reverse-strand alignments the stored sequence is replaced by its reverse complement.
                    char strand = '+';
                    if (flag.HasFlag(SAMFlags.QueryOnReverseStrand))
                    {
                        strand   = '-';
                        sequence = SequenceUtils.GetReverseComplementedSequence(sequence);
                    }

                    var alignmentScore = _format.GetAlignmentScore(fields);

                    var item = new T();
                    item.Qname    = qname;
                    item.Sequence = sequence;

                    var referenceName = fields[SAMFormatConst.RNAME_INDEX];
                    var location      = new SAMAlignedLocation(item)
                    {
                        Seqname           = referenceName,
                        Start             = int.Parse(fields[SAMFormatConst.POS_INDEX]),
                        Strand            = strand,
                        Cigar             = cigar,
                        NumberOfMismatch  = mismatches,
                        AlignmentScore    = alignmentScore,
                        MismatchPositions = _format.GetMismatchPositions(fields)
                    };

                    location.ParseEnd(item.Sequence);
                    item.AddLocation(location);

                    if (_format.HasAlternativeHits)
                    {
                        _format.ParseAlternativeHits(fields, item);
                    }

                    accepted.Add(item);
                    candidateCount++;
                }

                Progress.SetMessage("Finally, there are {0} candidates from {1} reads", candidateCount, readCount);
            }

            return accepted;
        }
Пример #13
0
        /// <summary>
        /// Builds a table of counts per count file and writes it to <c>options.OutputFile</c>.
        /// For every count file, the names of all queries not flagged as unmapped are collected from
        /// its SAM file, each name is looked up in the file's additional file, and the parsed value is
        /// stored under the entry's <c>Information</c> text (written under the "Sequence" column).
        /// </summary>
        /// <returns>A single-element array holding the full path of the output file.</returns>
        /// <exception cref="UserTerminatedException">
        /// Thrown when cancellation is pending; this is polled once every 1000 lines.
        /// </exception>
        public override IEnumerable <string> Process()
        {
            var countFiles = options.GetCountFiles();

            countFiles.Sort((left, right) => left.Name.CompareTo(right.Name));

            var countsByFile = new Dictionary <string, Dictionary <string, int> >();
            int fileNumber   = 0;

            foreach (var file in countFiles)
            {
                fileNumber++;
                Progress.SetMessage("Reading {0}/{1}: {2} ...", fileNumber, countFiles.Count, file.File);

                // Collect the names of all queries that are not flagged as unmapped.
                var mappedQueries = new HashSet <string>();
                using (var reader = SAMFactory.GetReader(file.File, true))
                {
                    int lineNumber = 0;
                    for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                    {
                        lineNumber++;

                        if (lineNumber % 1000 == 0 && Progress.IsCancellationPending())
                        {
                            throw new UserTerminatedException();
                        }

                        var fields = line.Split('\t');
                        var flag   = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);

                        if (!flag.HasFlag(SAMFlags.UnmappedQuery))
                        {
                            mappedQueries.Add(fields[SAMFormatConst.QNAME_INDEX]);
                        }
                    }
                }

                var fileCounts = new Dictionary <string, int>();
                countsByFile[file.Name] = fileCounts;

                // NOTE(review): a query that is absent from the additional file is not handled here;
                // what happens then depends on the indexer of the map returned by ReadFromFile.
                var countLookup = new MapItemReader(0, 1, informationIndex: 2).ReadFromFile(file.AdditionalFile);
                foreach (var query in mappedQueries)
                {
                    var entry = countLookup[query];
                    fileCounts[entry.Information] = int.Parse(entry.Value);
                }

                Progress.SetMessage("{0} reads mapped.", mappedQueries.Count);
            }

            // Every distinct key seen in any file, then its total over all files, highest total first.
            var allSequences = countsByFile.Values
                               .SelectMany(perFile => perFile.Keys)
                               .Distinct()
                               .ToArray();
            var rankedTotals = allSequences
                               .Select(s => new
                               {
                                   Sequence = s,
                                   Count    = countsByFile.Values
                                              .Where(perFile => perFile.ContainsKey(s))
                                              .Select(perFile => perFile[s])
                                              .Sum()
                               })
                               .OrderByDescending(t => t.Count)
                               .ToArray();

            using (var writer = new StreamWriter(options.OutputFile))
            {
                writer.WriteLine("Sequence\t" + countFiles.Select(cf => cf.Name).Merge("\t"));

                foreach (var total in rankedTotals)
                {
                    var sequence = total.Sequence;
                    writer.Write(sequence);

                    // One column per count file, in the sorted file order; files without the key get 0.
                    foreach (var cf in countFiles)
                    {
                        var perFile = countsByFile[cf.Name];
                        int value;
                        if (!perFile.TryGetValue(sequence, out value))
                        {
                            writer.Write("\t0");
                        }
                        else
                        {
                            writer.Write("\t{0}", value);
                        }
                    }

                    writer.WriteLine();
                }
            }

            Progress.End();

            return new[] { Path.GetFullPath(options.OutputFile) };
        }
Пример #14
0
        /// <summary>
        /// Fills <c>rangeQueries</c> with the query names read from <paramref name="fileName"/> through
        /// the coordinate-restricted reader, then delegates the actual build to the base implementation.
        /// </summary>
        /// <remarks>
        /// When both <c>CoordinateFile + ".miss1"</c> and <c>CoordinateFile + ".miss0"</c> exist and
        /// <c>T2cAsNoPenaltyMutation</c> is off, the names are gathered in two passes: all names seen
        /// with the ".miss1" file, plus those names seen with the ".miss0" file whose line contains
        /// the text "NM:i:0". Otherwise a single pass with <c>CoordinateFile</c> itself is used.
        /// </remarks>
        /// <param name="fileName">Alignment file passed to <c>SAMFactory.GetReader</c> and to the base build.</param>
        /// <param name="totalQueries">Filled by the base implementation.</param>
        /// <returns>The result of <c>base.DoBuild</c>.</returns>
        protected override List <T> DoBuild <T>(string fileName, out List <QueryInfo> totalQueries)
        {
            Progress.SetMessage("Find queries overlapped with coordinates...");
            rangeQueries = new HashSet <string>();

            var miss1file = options.CoordinateFile + ".miss1";
            var miss0file = options.CoordinateFile + ".miss0";

            if (File.Exists(miss1file) && File.Exists(miss0file) && !options.T2cAsNoPenaltyMutation)
            {
                var miss1Queries = ReadCoordinateQueryNames(fileName, miss1file, false);
                Progress.SetMessage("Miss 1 queries : {0}", miss1Queries.Count);

                var miss0Queries = ReadCoordinateQueryNames(fileName, miss0file, true);
                Progress.SetMessage("Miss 0 queries : {0}", miss0Queries.Count);

                rangeQueries.UnionWith(miss1Queries);
                rangeQueries.UnionWith(miss0Queries);
            }
            else
            {
                rangeQueries.UnionWith(ReadCoordinateQueryNames(fileName, options.CoordinateFile, false));
            }

            Progress.SetMessage("{0} queries overlaped with coordinates.", rangeQueries.Count);

            return(base.DoBuild <T>(fileName, out totalQueries));
        }

        /// <summary>
        /// Collects the distinct query names (text before the first tab) of all non-header lines
        /// returned by <c>SAMFactory.GetReader(fileName, true, coordinateFile)</c>.
        /// </summary>
        /// <param name="fileName">Alignment file to read.</param>
        /// <param name="coordinateFile">Coordinate file handed to the reader factory as its third argument.</param>
        /// <param name="perfectMatchOnly">
        /// When true, only lines containing the text "NM:i:0" contribute a name.
        /// </param>
        /// <returns>The set of query names that were collected.</returns>
        private static HashSet <string> ReadCoordinateQueryNames(string fileName, string coordinateFile, bool perfectMatchOnly)
        {
            var names = new HashSet <string>();

            using (var sr = SAMFactory.GetReader(fileName, true, coordinateFile))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Lines starting with '@' are skipped; the check is ordinal so it does not depend on the current culture.
                    if (line.StartsWith("@", System.StringComparison.Ordinal))
                    {
                        continue;
                    }

                    if (perfectMatchOnly && !line.Contains("NM:i:0"))
                    {
                        continue;
                    }

                    names.Add(line.StringBefore("\t"));
                }
            }

            return names;
        }