/// <summary>
/// Reads an alignment file and groups the mapped queries by the reference name they hit.
/// </summary>
/// <remarks>
/// When <c>options.CategoryMapFile</c> exists it is loaded into <c>nameMap</c> first.
/// Every reference name is passed through <c>GetName</c> and, when
/// <c>options.ChromosomePattern</c> is set, must match that regular expression.
/// When <c>options.PreferPrefix</c> is set, a query that hits at least one reference
/// starting with the prefix keeps only those references.
/// </remarks>
/// <param name="fileName">Alignment file handed to <c>SAMFactory.GetReader</c>.</param>
/// <returns>
/// One item per accepted reference name, in order of first appearance. With
/// <c>options.PreferPrefix</c> set, items left without any query are removed.
/// </returns>
/// <exception cref="UserTerminatedException">Cancellation was requested while reading.</exception>
public List<ChromosomeCountSlimItem> Build(string fileName)
{
  if (File.Exists(options.CategoryMapFile))
  {
    Progress.SetMessage("Reading name map file " + options.CategoryMapFile + " ...");
    nameMap = new MapItemReader(0, 1).ReadFromFile(options.CategoryMapFile).ToDictionary(m => m.Key, m => m.Value.Value);
  }

  var result = new List<ChromosomeCountSlimItem>();
  var queries = new Dictionary<string, SAMChromosomeItem>();
  var chromosomes = new Dictionary<string, ChromosomeCountSlimItem>();

  Func<string, bool> acceptChromosome;
  if (string.IsNullOrEmpty(options.ChromosomePattern))
  {
    acceptChromosome = m => true;
  }
  else
  {
    var chromosomeRegex = new Regex(options.ChromosomePattern);
    acceptChromosome = m => chromosomeRegex.IsMatch(m);
  }

  Progress.SetMessage("Parsing alignment file " + fileName + " ...");
  using (var sr = SAMFactory.GetReader(fileName, true))
  {
    int count = 0;
    int waitingcount = 0;
    string line;
    while ((line = sr.ReadLine()) != null)
    {
      if (count % 1000 == 0 && Progress.IsCancellationPending())
      {
        throw new UserTerminatedException();
      }

      if (count % 100000 == 0 && count > 0)
      {
        Progress.SetMessage("{0} candidates from {1} reads", waitingcount, count);
      }

      count++;

      var parts = line.Split('\t');

      // Skip alignments flagged as unmapped.
      SAMFlags flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);
      if (flag.HasFlag(SAMFlags.UnmappedQuery))
      {
        continue;
      }

      var seqname = GetName(parts[SAMFormatConst.RNAME_INDEX]);
      if (!acceptChromosome(seqname))
      {
        continue;
      }

      var qname = parts[SAMFormatConst.QNAME_INDEX];
      SAMChromosomeItem query;
      if (!queries.TryGetValue(qname, out query))
      {
        query = new SAMChromosomeItem();
        query.Qname = qname;
        queries[qname] = query;

        // The sequence is captured only from the first accepted alignment of a query.
        if (options.KeepSequence)
        {
          query.Sequence = parts[SAMFormatConst.SEQ_INDEX];
          if (flag.HasFlag(SAMFlags.QueryOnReverseStrand))
          {
            query.Sequence = SequenceUtils.GetReverseComplementedSequence(query.Sequence);
          }
        }
      }
      query.Chromosomes.Add(seqname);

      ChromosomeCountSlimItem item;
      if (!chromosomes.TryGetValue(seqname, out item))
      {
        item = new ChromosomeCountSlimItem();
        item.Names.Add(seqname);
        chromosomes[seqname] = item;
        result.Add(item);
      }
      item.Queries.Add(query);

      waitingcount++;
    }

    Progress.SetMessage("Finally, there are {0} candidates from {1} reads", waitingcount, count);
  }

  // A query/reference pair is added once per alignment line, so de-duplicate and sort both sides.
  foreach (var query in queries.Values)
  {
    query.Chromosomes = query.Chromosomes.Distinct().OrderBy(m => m).ToList();
  }

  foreach (var sam in chromosomes.Values)
  {
    sam.Queries = sam.Queries.Distinct().OrderBy(m => m.Qname).ToList();
  }

  if (!string.IsNullOrEmpty(options.PreferPrefix))
  {
    foreach (var query in queries.Values)
    {
      if (!query.Chromosomes.Any(l => l.StartsWith(options.PreferPrefix)))
      {
        continue;
      }

      // The query has a preferred hit: drop every hit that does NOT carry the prefix.
      // (The previous predicate was not negated and removed the preferred hits instead.)
      var unpreferred = query.Chromosomes.Where(l => !l.StartsWith(options.PreferPrefix)).ToArray();
      foreach (var chrom in unpreferred)
      {
        chromosomes[chrom].Queries.Remove(query);
        query.Chromosomes.Remove(chrom);
      }
    }

    result.RemoveAll(l => l.Queries.Count == 0);
  }

  return result;
}
/// <summary>
/// Reads the input alignment file, feeds every usable read into a pileup list and writes
/// each finished position that passes the Fisher exact test and the minimum
/// alternative-allele threshold as a VCF record. Optionally writes an IGV batch script
/// for positions that fall inside a miRNA region.
/// </summary>
/// <remarks>
/// Reads are skipped when they are shorter than <c>options.MinimumReadLength</c>, flagged
/// as unmapped, or have an insertion/deletion in their CIGAR string.
/// </remarks>
/// <returns>A single-element array holding <c>options.OutputFile</c>.</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested while reading.</exception>
public override IEnumerable<string> Process()
{
  PileupCountList pc = new PileupCountList();

  var format = options.GetSAMFormat();

  var cm = new SmallRNACountMap(options.CountFile);

  // miRNA regions grouped by sequence name with any "chr" prefix stripped, the same way
  // read locations are normalised below.
  var srItems = SequenceRegionUtils.GetSequenceRegions(options.CoordinateFile, "miRNA", options.BedAsGtf);
  srItems.ForEach(m => { m.Seqname = m.Seqname.StringAfter("chr"); });
  var srmap = srItems.GroupBy(m => m.Seqname).ToDictionary(m => m.Key, m => m.ToList());

  StreamWriter swScript = null;
  try
  {
    if (options.ExportIgvScript)
    {
      swScript = new StreamWriter(options.OutputFile + ".igv");
      swScript.WriteLine("snapshotDirectory {0}", Path.GetDirectoryName(options.OutputFile).Replace('\\', '/'));
    }

    using (StreamWriter sw = new StreamWriter(options.OutputFile))
    {
      // VCF header: each meta-information record goes on its own line and the column
      // header is tab-delimited.
      sw.WriteLine("##fileformat=VCFv4.2");
      sw.WriteLine("##fileDate={0:yyyyMMdd}", DateTime.Now);
      sw.WriteLine("##source=PileupCountBuilder");
      sw.WriteLine("##phasing=partial");
      sw.WriteLine("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
      sw.WriteLine("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
      sw.WriteLine("##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency\">");
      sw.WriteLine("##INFO=<ID=FP,Number=1,Type=Float,Description=\"Fisher Exact Test P-Value\">");
      sw.WriteLine("##INFO=<ID=MN,Number=.,Type=String,Description=\"miRNA name contains this position\">");
      sw.WriteLine("##FILTER=<ID=FisherET,Description=\"Fisher exact test Pvalue less than {0}\">", options.FisherPValue);
      sw.WriteLine("##FILTER=<ID=AltAlleFreq,Description=\"Alternative allele frequency less than {0}\">", options.MinimumAlternativeAlleleFrequency);
      sw.WriteLine("##FILTER=<ID=notMiRNA,Description=\"Position not located in miRNA locus\">");
      sw.WriteLine("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">");
      sw.WriteLine("##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Allelic Depth\">");
      sw.WriteLine("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{0}", Path.GetFileNameWithoutExtension(options.InputFile));

      using (var sr = SAMFactory.GetReader(options.InputFile, true))
      {
        int count = 0;
        string line;
        while ((line = sr.ReadLine()) != null)
        {
          count++;

          if (count % 100 == 0 && Progress.IsCancellationPending())
          {
            throw new UserTerminatedException();
          }

          if (count % 100000 == 0)
          {
            Progress.SetMessage("{0} reads processed", count);
          }

          var parts = line.Split('\t');

          var qname = parts[SAMFormatConst.QNAME_INDEX];
          var seq = parts[SAMFormatConst.SEQ_INDEX];

          // too short
          if (seq.Length < options.MinimumReadLength)
          {
            continue;
          }

          // unmapped
          SAMFlags flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);
          if (flag.HasFlag(SAMFlags.UnmappedQuery))
          {
            continue;
          }

          // insertion/deletion
          var cigar = parts[SAMFormatConst.CIGAR_INDEX];
          if (cigar.Any(m => m == 'I' || m == 'D'))
          {
            continue;
          }

          var sam = new SAMAlignedItem()
          {
            Qname = qname,
          };

          // The item stores the reverse-complemented sequence for reverse-strand reads,
          // while the location below keeps the sequence exactly as it appears in the file.
          bool isReversed = flag.HasFlag(SAMFlags.QueryOnReverseStrand);
          char strand;
          if (isReversed)
          {
            strand = '-';
            sam.Sequence = SequenceUtils.GetReverseComplementedSequence(seq);
          }
          else
          {
            strand = '+';
            sam.Sequence = seq;
          }

          var loc = new SAMAlignedLocation(sam)
          {
            Seqname = parts[SAMFormatConst.RNAME_INDEX].StringAfter("chr"),
            Start = int.Parse(parts[SAMFormatConst.POS_INDEX]),
            Strand = strand,
            Cigar = cigar,
            MismatchPositions = format.GetMismatchPositions(parts),
            NumberOfMismatch = format.GetNumberOfMismatch(parts),
            Sequence = seq
          };

          loc.ParseEnd(sam.Sequence);
          sam.AddLocation(loc);

          if (format.HasAlternativeHits)
          {
            format.ParseAlternativeHits(parts, sam);
          }

          var finished = pc.Add(sam, cm.GetCount(sam.Qname));
          if (null == finished || 0 == finished.Count)
          {
            continue;
          }

          foreach (var fin in finished)
          {
            var ft = fin.FisherExactTest();
            if (ft.PValue > options.FisherPValue)
            {
              continue;
            }

            var total = fin.Sum(m => m.Value);
            var minallele = total * options.MinimumAlternativeAlleleFrequency;
            if (ft.Sample2.Failed < minallele)
            {
              continue;
            }

            // NOTE(review): the lookup uses the chromosome of the read that was just added,
            // not fin.Chromosome. If pc.Add can return positions from a previous chromosome,
            // this would be the wrong key -- confirm against PileupCountList.
            List<GtfItem> srs;
            List<string> ranges = new List<string>();
            if (srmap.TryGetValue(sam.Locations[0].Seqname, out srs))
            {
              foreach (var seqr in srs)
              {
                if (seqr.Contains(fin.Position))
                {
                  ranges.Add(seqr.GetNameLocation());
                }
              }
            }

            var alter = (from r in fin
                         where r.Key != fin.Reference
                         orderby r.Key
                         select r).ToList();

            // NOTE(review): the miRNA names are appended to INFO without an "MN=" key although
            // the header declares MN -- left unchanged, confirm the intended output.
            var str = string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\tNS={7};DP={8};AF={9};FP={10:0.##E0}{11}\tDP:AD\t{12}:{13},{14}",
              fin.Chromosome,
              fin.Position,
              ".",
              fin.Reference,
              (from r in alter select r.Key.ToString()).Merge(","),
              0,
              ranges.Count == 0 ? "notMiRNA" : "PASS",
              1,
              total,
              (from r in alter select string.Format("{0:0.###}", r.Value * 1.0 / total)).Merge(","),
              ft.PValue,
              ranges.Count == 0 ? "" : ";" + ranges.Merge(","),
              total,
              ft.Sample2.Succeed,
              (from r in alter select r.Value.ToString()).Merge(","));

            sw.WriteLine(str);

            if (swScript != null && ranges.Count > 0)
            {
              // IGV batch scripts take one command per line.
              var locusName = ranges[0].Replace('(', '_').Replace(')', '_').Replace(':', '_');
              swScript.WriteLine("goto {0}:{1}", fin.Chromosome, fin.Position);
              swScript.WriteLine("sort position");
              swScript.WriteLine("snapshot {0}_{1}_{2}.png", fin.Chromosome, locusName, fin.Position);
            }
          }

          finished.Clear();
        }
      }
    }
  }
  finally
  {
    if (swScript != null)
    {
      swScript.Close();
    }
  }

  return new string[] { options.OutputFile };
}
/// <summary>
/// Decides whether an alignment with the given SAM flags is accepted.
/// </summary>
/// <param name="flags">Flag set of the alignment.</param>
/// <returns><c>true</c> unless <c>SAMFlags.UnmappedQuery</c> is set.</returns>
public virtual bool AcceptFlags(SAMFlags flags)
{
  bool isUnmapped = flags.HasFlag(SAMFlags.UnmappedQuery);
  return !isUnmapped;
}
/// <summary>
/// Parses an alignment file line by line and creates one item of type
/// <typeparamref name="T"/> for every alignment that passes all checks of <c>_filter</c>.
/// </summary>
/// <param name="fileName">Alignment file handed to <c>SAMFactory.GetReader</c>.</param>
/// <param name="totalQueries">
/// Receives one <c>QueryInfo</c> per line read, whether or not the alignment is kept.
/// Mismatch count and length are filled in only for lines that pass the flag check.
/// </param>
/// <returns>The accepted alignments in file order.</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested while reading.</exception>
protected override List<T> DoBuild<T>(string fileName, out List<QueryInfo> totalQueries)
{
  var accepted = new List<T>();

  _format = _options.GetSAMFormat();

  totalQueries = new List<QueryInfo>();

  using (var reader = SAMFactory.GetReader(fileName, true))
  {
    int lineCount = 0;
    int keptCount = 0;
    for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
    {
      lineCount++;

      // Poll for cancellation once every 1000 lines.
      if (lineCount % 1000 == 0 && Progress.IsCancellationPending())
      {
        throw new UserTerminatedException();
      }

      var fields = line.Split('\t');

      var qname = fields[SAMFormatConst.QNAME_INDEX];
      var info = new QueryInfo(qname);
      totalQueries.Add(info);

      var flag = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);
      if (!_filter.AcceptFlags(flag))
      {
        continue;
      }

      var mismatchCount = _format.GetNumberOfMismatch(fields);
      var seq = fields[SAMFormatConst.SEQ_INDEX];

      info.Mismatch = mismatchCount;
      info.Length = seq.Length;

      // Mismatch, query-name and length filters, evaluated in this order.
      bool passesReadFilters = _filter.AcceptMismatch(mismatchCount)
        && _filter.AcceptQueryName(qname)
        && _filter.AcceptLength(seq.Length);
      if (!passesReadFilters)
      {
        continue;
      }

      var cigar = fields[SAMFormatConst.CIGAR_INDEX];
      if (!_filter.AcceptCigar(cigar))
      {
        continue;
      }

      var seqname = fields[SAMFormatConst.RNAME_INDEX].StringAfter("chr");
      var start = int.Parse(fields[SAMFormatConst.POS_INDEX]);
      var end = SAMUtils.ParseEnd(start, cigar);

      bool isReversed = flag.HasFlag(SAMFlags.QueryOnReverseStrand);

      // The item and its location are created before the locus filter runs, because the
      // filter inspects the location.
      var sam = new T();
      var loc = new SAMAlignedLocation(sam)
      {
        Seqname = seqname,
        Start = start,
        End = end,
        Strand = isReversed ? '-' : '+',
      };

      if (!_filter.AcceptLocus(loc))
      {
        continue;
      }

      if (isReversed)
      {
        seq = SequenceUtils.GetReverseComplementedSequence(seq);
      }

      sam.Qname = qname;
      sam.Sequence = seq;

      loc.AlignmentScore = _format.GetAlignmentScore(fields);
      loc.Cigar = cigar;
      loc.NumberOfMismatch = mismatchCount;
      loc.MismatchPositions = _format.GetMismatchPositions(fields);

      if (_format.HasAlternativeHits)
      {
        _format.ParseAlternativeHits(fields, sam);
      }

      accepted.Add(sam);

      keptCount++;
      if (keptCount % 100 == 0)
      {
        Progress.SetMessage("{0} feature reads from {1} reads", keptCount, lineCount);
      }
    }
  }

  return accepted;
}
/// <summary>
/// Tells whether an alignment carrying the given SAM flags should be kept.
/// </summary>
/// <param name="flags">Flag set of the alignment.</param>
/// <returns><c>false</c> when <c>SAMFlags.UnmappedQuery</c> is set; otherwise <c>true</c>.</returns>
public virtual bool AcceptFlags(SAMFlags flags)
{
  if (flags.HasFlag(SAMFlags.UnmappedQuery))
  {
    return false;
  }

  return true;
}
/// <summary>
/// Parses an alignment file line by line and creates one item of type
/// <typeparamref name="T"/> for every mapped alignment that satisfies the mismatch,
/// query-name and read-length limits taken from <c>_options</c>.
/// </summary>
/// <param name="fileName">Alignment file handed to <c>SAMFactory.GetReader</c>.</param>
/// <param name="totalQueries">
/// Receives one <c>QueryInfo</c> per line read, whether or not the alignment is kept.
/// Mismatch count and length are filled in only for mapped alignments.
/// </param>
/// <returns>The accepted alignments in file order.</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested while reading.</exception>
protected override List<T> DoBuild<T>(string fileName, out List<QueryInfo> totalQueries)
{
  var candidates = new List<T>();

  _format = _options.GetSAMFormat();

  totalQueries = new List<QueryInfo>();

  using (var reader = SAMFactory.GetReader(fileName, true))
  {
    int readCount = 0;
    int candidateCount = 0;
    for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
    {
      // Both checks use the number of lines processed so far, so the counter is
      // incremented only after them.
      if (readCount % 1000 == 0 && Progress.IsCancellationPending())
      {
        throw new UserTerminatedException();
      }

      if (readCount > 0 && readCount % 100000 == 0)
      {
        Progress.SetMessage("{0} candidates from {1} reads", candidateCount, readCount);
      }

      readCount++;

      var qname = line.StringBefore("\t");
      var info = new QueryInfo(qname);
      totalQueries.Add(info);

      var fields = line.Split('\t');

      // unmapped
      var flag = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);
      if (flag.HasFlag(SAMFlags.UnmappedQuery))
      {
        continue;
      }

      var mismatchCount = _format.GetNumberOfMismatch(fields);
      var seq = fields[SAMFormatConst.SEQ_INDEX];

      info.Mismatch = mismatchCount;
      info.Length = seq.Length;
      info.NoPenaltyMutation = 0;

      if (_options.T2cAsNoPenaltyMutation)
      {
        // Empty on purpose to match the existing behaviour: nothing is computed for this
        // option here, so NoPenaltyMutation stays 0.
      }

      // too many mismatches, or query name rejected
      if (mismatchCount > _options.MaximumMismatch || !AcceptQueryName(qname))
      {
        continue;
      }

      // too short or too long
      if (seq.Length < _options.MinimumReadLength || seq.Length > _options.MaximumReadLength)
      {
        continue;
      }

      // Alignments whose CIGAR contains insertions/deletions are not filtered out here.
      var cigar = fields[SAMFormatConst.CIGAR_INDEX];

      bool isReversed = flag.HasFlag(SAMFlags.QueryOnReverseStrand);
      char strand = isReversed ? '-' : '+';
      if (isReversed)
      {
        seq = SequenceUtils.GetReverseComplementedSequence(seq);
      }

      var score = _format.GetAlignmentScore(fields);

      var sam = new T();
      sam.Qname = qname;
      sam.Sequence = seq;

      var seqname = fields[SAMFormatConst.RNAME_INDEX];
      var loc = new SAMAlignedLocation(sam);
      loc.Seqname = seqname;
      loc.Start = int.Parse(fields[SAMFormatConst.POS_INDEX]);
      loc.Strand = strand;
      loc.Cigar = cigar;
      loc.NumberOfMismatch = mismatchCount;
      loc.AlignmentScore = score;
      loc.MismatchPositions = _format.GetMismatchPositions(fields);

      loc.ParseEnd(sam.Sequence);
      sam.AddLocation(loc);

      if (_format.HasAlternativeHits)
      {
        _format.ParseAlternativeHits(fields, sam);
      }

      candidates.Add(sam);
      candidateCount++;
    }

    Progress.SetMessage("Finally, there are {0} candidates from {1} reads", candidateCount, readCount);
  }

  return candidates;
}
/// <summary>
/// Builds a tab-delimited table of per-file counts. For every count file the names of the
/// mapped queries are collected from its alignment file and looked up in the map read from
/// its additional file; the resulting counts are written with one row per distinct
/// information key (column "Sequence") and one column per file.
/// </summary>
/// <remarks>
/// Files are processed and written in order of their <c>Name</c>. Rows are ordered by the
/// total count across files, largest first, and a file without an entry for a row gets 0.
/// </remarks>
/// <returns>A single-element array holding the full path of <c>options.OutputFile</c>.</returns>
/// <exception cref="UserTerminatedException">Cancellation was requested while reading.</exception>
public override IEnumerable<string> Process()
{
  var countFiles = options.GetCountFiles();
  countFiles.Sort((m1, m2) => m1.Name.CompareTo(m2.Name));

  // file name -> (information key -> count)
  var countMap = new Dictionary<string, Dictionary<string, int>>();

  int fileIndex = 0;
  foreach (var file in countFiles)
  {
    fileIndex++;
    Progress.SetMessage("Reading {0}/{1}: {2} ...", fileIndex, countFiles.Count, file.File);

    // Names of all queries that have at least one alignment not flagged as unmapped.
    var mappedQueries = new HashSet<string>();
    using (var reader = SAMFactory.GetReader(file.File, true))
    {
      int lineCount = 0;
      for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
      {
        lineCount++;

        if (lineCount % 1000 == 0 && Progress.IsCancellationPending())
        {
          throw new UserTerminatedException();
        }

        var fields = line.Split('\t');

        var flag = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);
        if (!flag.HasFlag(SAMFlags.UnmappedQuery))
        {
          mappedQueries.Add(fields[SAMFormatConst.QNAME_INDEX]);
        }
      }
    }

    var fileCounts = new Dictionary<string, int>();
    countMap[file.Name] = fileCounts;

    var queryMap = new MapItemReader(0, 1, informationIndex: 2).ReadFromFile(file.AdditionalFile);
    foreach (var query in mappedQueries)
    {
      var entry = queryMap[query];
      fileCounts[entry.Information] = int.Parse(entry.Value);
    }

    Progress.SetMessage("{0} reads mapped.", mappedQueries.Count);
  }

  var distinctSequences = countMap.Values
    .SelectMany(c => c.Keys)
    .Distinct()
    .ToArray();

  var rankedSequences = distinctSequences
    .Select(seq => new
    {
      Sequence = seq,
      Count = countMap.Values.Where(c => c.ContainsKey(seq)).Select(c => c[seq]).Sum()
    })
    .OrderByDescending(m => m.Count)
    .ToArray();

  using (var sw = new StreamWriter(options.OutputFile))
  {
    sw.WriteLine("Sequence\t" + countFiles.Select(cf => cf.Name).Merge("\t"));

    foreach (var ranked in rankedSequences)
    {
      var seq = ranked.Sequence;
      sw.Write(seq);

      foreach (var cf in countFiles)
      {
        int value;
        if (countMap[cf.Name].TryGetValue(seq, out value))
        {
          sw.Write("\t{0}", value);
        }
        else
        {
          sw.Write("\t0");
        }
      }

      sw.WriteLine();
    }
  }

  Progress.End();

  return new string[] { Path.GetFullPath(options.OutputFile) };
}