/// <summary>
/// Adds <paramref name="count"/> to the counter of every aligned base of <paramref name="item"/>,
/// skipping bases whose decoded quality is below <paramref name="minScore"/>.
/// </summary>
/// <param name="item">Aligned read supplying the aligned sequence, its score string and its position.</param>
/// <param name="count">Amount added to the counter of each accepted base.</param>
/// <param name="minScore">Minimum accepted quality; values of zero or less disable the quality filter.</param>
/// <returns>The number of bases that were skipped because of the quality filter.</returns>
public int AddByScore(SAMAlignedItem item, int count, int minScore)
{
    string aligned, qualities;
    item.GetSequenceScore(out aligned, out qualities);

    int skipped = 0;
    for (int offset = 0; offset < aligned.Length; offset++)
    {
        // The quality of a base is its score character code minus 33.
        if (minScore > 0 && (int)(qualities[offset]) - 33 < minScore)
        {
            skipped++;
            continue;
        }

        var baseChar = aligned[offset];

        // Counters are indexed by the read position plus the offset inside the alignment.
        var counter = Count[(int)(item.Pos) + offset];

        int existing;
        counter[baseChar] = counter.TryGetValue(baseChar, out existing) ? existing + count : count;
    }

    return skipped;
}
/// <summary>
/// Builds aligned items from the "query" elements found under the "queries" child of <paramref name="root"/>.
/// </summary>
/// <param name="root">Element that contains a "queries" child element.</param>
/// <returns>One <see cref="SAMAlignedItem"/> per "query" element, each carrying the locations described by its "location" children.</returns>
public static List<SAMAlignedItem> ToSAMAlignedItems(this XElement root)
{
    var items = new List<SAMAlignedItem>();

    foreach (var queryNode in root.Element("queries").Elements("query"))
    {
        var item = new SAMAlignedItem
        {
            Qname = queryNode.Attribute("name").Value,
            Sequence = queryNode.Attribute("sequence").Value,
            QueryCount = int.Parse(queryNode.Attribute("count").Value),
            // "sample" is optional; null is used when the attribute is absent.
            Sample = queryNode.GetAttributeValue("sample", null)
        };
        items.Add(item);

        foreach (var locationNode in queryNode.Elements("location"))
        {
            var location = new SAMAlignedLocation(item);
            location.ParseLocation(locationNode);
            location.Cigar = locationNode.Attribute("cigar").Value;
            location.AlignmentScore = int.Parse(locationNode.Attribute("score").Value);
            location.MismatchPositions = locationNode.Attribute("mdz").Value;
            location.NumberOfMismatch = int.Parse(locationNode.Attribute("nmi").Value);

            // "nnpm" is optional; the property keeps its default when the attribute is absent.
            var noPenaltyAttribute = locationNode.Attribute("nnpm");
            if (noPenaltyAttribute != null)
            {
                location.NumberOfNoPenaltyMutation = int.Parse(noPenaltyAttribute.Value);
            }
        }
    }

    return items;
}
/// <summary>
/// Checks that <see cref="BwaFormat"/> reports alternative-hit support and that parsing a
/// record carrying X0 and XA:Z tags completes without throwing.
/// </summary>
public void TestParseAlternativeHits()
{
    var format = new BwaFormat();
    Assert.IsTrue(format.HasAlternativeHits);

    // Tab-separated SAM record with X0:i:4 and three entries in the XA:Z tag.
    const string record = "HWI-ST508:275:D2A2JACXX:3:1105:21234:49676\t0\t1_hsa\t564952\t0\t36M\t*\t0\t0\tAGTAAGGTCAGCTAATTAAGCTATCGGGCCCATAAA\t@@?DDFF?FFFDBHIIJIJIJJGJJJJJJJJJJJJJ\tRG:Z:2570-KCV-01-19\tXT:A:R\tNM:i:3\tX0:i:4\tX1:i:0\tXM:i:3\tXO:i:0\tXG:i:0\tMD:Z:15A18C0C0\tXA:Z:M_hsa,+4403,36M,3;17_hsa,+19506660,36M,3;X_hsa,-55206629,36M,3;";
    var fields = record.Split('\t');

    var target = new SAMAlignedItem();
    format.ParseAlternativeHits(fields, target);
}
/// <summary>
/// Creates a location with an empty feature list and, when a parent is supplied,
/// adds the new location to that parent.
/// </summary>
/// <param name="parent">Item the location is added to; may be null, in which case nothing is registered.</param>
public SAMAlignedLocation(SAMAlignedItem parent)
{
    this.Features = new List<ISequenceRegion>();

    if (parent == null)
    {
        return;
    }

    parent.AddLocation(this);
}
/// <summary>
/// Initializes the feature list and attaches this location to <paramref name="parent"/> when one is given.
/// </summary>
/// <param name="parent">Owning item, or null to create a detached location.</param>
public SAMAlignedLocation(SAMAlignedItem parent)
{
    this.Features = new List<ISequenceRegion>();

    bool hasParent = parent != null;
    if (hasParent)
    {
        parent.AddLocation(this);
    }
}
/// <summary>
/// Adds the aligned bases of <paramref name="item"/> to the buffered per-position counters and
/// returns the positions that were removed from the buffer by this call.
/// </summary>
/// <param name="item">Aligned read; its first location supplies the chromosome name and start.</param>
/// <param name="count">Amount added to the counter of every aligned base.</param>
/// <returns>
/// The positions dropped from the buffer, or null when the read stays on the same chromosome
/// and no buffered position lies before it.
/// </returns>
public List<PileupCount> Add(SAMAlignedItem item, int count)
{
    List<PileupCount> finished = null;

    if (!item.Locations[0].Seqname.Equals(this.Chromosome))
    {
        // Different chromosome: hand over the whole buffer and start a new one.
        finished = Count;
        Count = new List<PileupCount>();
    }
    else if (this.Position != -1)
    {
        if (item.Pos > this.Count.Last().Position)
        {
            // The read starts after the last buffered position: nothing overlaps.
            finished = Count;
            Count = new List<PileupCount>();
        }
        else
        {
            // Buffered positions located before the start of the read cannot change anymore.
            int leading = (int)(item.Pos - this.Position);
            if (leading > 0)
            {
                finished = Count.Take(leading).ToList();
                Count.RemoveRange(0, leading);
            }
        }
    }

    string aligned, reference;
    item.GetSequences(out aligned, out reference);

    // Extend the buffer until it has one entry per aligned base.
    while (Count.Count < aligned.Length)
    {
        int offset = Count.Count;
        Count.Add(new PileupCount()
        {
            Chromosome = item.Locations[0].Seqname,
            Position = item.Locations[0].Start + offset,
            Reference = reference[offset]
        });
    }

    for (int offset = 0; offset < aligned.Length; offset++)
    {
        var baseChar = aligned[offset];
        var counter = Count[offset];

        int existing;
        counter[baseChar] = counter.TryGetValue(baseChar, out existing) ? existing + count : count;
    }

    return finished;
}
/// <summary>
/// Verifies the aligned and reference strings produced by GetSequences for a read with
/// soft-clipped ends, a two-base deletion and one mismatch.
/// </summary>
public void TestGetGetSequences()
{
    var item = new SAMAlignedItem();
    var location = new SAMAlignedLocation(item)
    {
        Cigar = "5S18M2D19M5S",
        Start = 39979942,
        MismatchPositions = "18^CA10T8",
        Sequence = "aaaaaGTAGTACCAACTGTAAGTCCTTATCTTCATACTTTGTaaaaa"
    };
    item.AddLocation(location);

    string aligned, reference;
    item.GetSequences(out aligned, out reference);

    Assert.AreEqual("GTAGTACCAACTGTAAGT CCTTATCTTCATACTTTGT", aligned);
    Assert.AreEqual("GTAGTACCAACTGTAAGTCACCTTATCTTCTTACTTTGT", reference);
}
/// <summary>
/// Reads aligned items from the "query" elements below the next "queries" element of <paramref name="source"/>.
/// </summary>
/// <param name="source">Reader positioned before the "queries" element.</param>
/// <returns>One <see cref="SAMAlignedItem"/> per "query" element, each with its "location" children attached.</returns>
public static List<SAMAlignedItem> ReadFrom(XmlReader source)
{
    var items = new List<SAMAlignedItem>();

    source.ReadToFollowing("queries");

    for (bool onQuery = source.ReadToDescendant("query"); onQuery; onQuery = source.ReadToNextSibling("query"))
    {
        var item = new SAMAlignedItem();
        items.Add(item);

        // Attributes are read while the reader is still positioned on the "query" element.
        item.Qname = source.GetAttribute("name");
        item.Sequence = source.GetAttribute("sequence");
        item.QueryCount = int.Parse(source.GetAttribute("count"));
        item.Sample = source.GetAttribute("sample");

        for (bool onLocation = source.ReadToDescendant("location"); onLocation; onLocation = source.ReadToNextSibling("location"))
        {
            var location = new SAMAlignedLocation(item);
            location.Seqname = source.GetAttribute("seqname");
            location.Start = long.Parse(source.GetAttribute("start"));
            location.End = long.Parse(source.GetAttribute("end"));
            location.Strand = source.GetAttribute("strand")[0];
            location.Cigar = source.GetAttribute("cigar");
            location.AlignmentScore = int.Parse(source.GetAttribute("score"));
            location.MismatchPositions = source.GetAttribute("mdz");
            location.NumberOfMismatch = int.Parse(source.GetAttribute("nmi"));

            // "nnpm" is optional; GetAttribute returns null when it is missing.
            var noPenalty = source.GetAttribute("nnpm");
            if (noPenalty != null)
            {
                location.NumberOfNoPenaltyMutation = int.Parse(noPenalty);
            }
        }
    }

    return items;
}
/// <summary>
/// Deserializes the "query" elements that follow the next "queries" element of <paramref name="source"/>.
/// </summary>
/// <param name="source">XML reader to consume.</param>
/// <returns>The parsed items in document order; empty when no "query" element is found.</returns>
public static List<SAMAlignedItem> ReadFrom(XmlReader source)
{
    var parsed = new List<SAMAlignedItem>();

    source.ReadToFollowing("queries");

    bool moreQueries = source.ReadToDescendant("query");
    while (moreQueries)
    {
        var query = new SAMAlignedItem();
        parsed.Add(query);

        // Query attributes have to be read before the reader descends into the children.
        query.Qname = source.GetAttribute("name");
        query.Sequence = source.GetAttribute("sequence");
        query.QueryCount = int.Parse(source.GetAttribute("count"));
        query.Sample = source.GetAttribute("sample");

        bool moreLocations = source.ReadToDescendant("location");
        while (moreLocations)
        {
            var loc = new SAMAlignedLocation(query)
            {
                Seqname = source.GetAttribute("seqname"),
                Start = long.Parse(source.GetAttribute("start")),
                End = long.Parse(source.GetAttribute("end")),
                Strand = source.GetAttribute("strand")[0],
                Cigar = source.GetAttribute("cigar"),
                AlignmentScore = int.Parse(source.GetAttribute("score")),
                MismatchPositions = source.GetAttribute("mdz"),
                NumberOfMismatch = int.Parse(source.GetAttribute("nmi"))
            };

            // The "nnpm" attribute is optional.
            var optionalNoPenalty = source.GetAttribute("nnpm");
            if (optionalNoPenalty != null)
            {
                loc.NumberOfNoPenaltyMutation = int.Parse(optionalNoPenalty);
            }

            moreLocations = source.ReadToNextSibling("location");
        }

        moreQueries = source.ReadToNextSibling("query");
    }

    return parsed;
}
/// <summary>
/// Adds <paramref name="count"/> to the counter of each aligned base of <paramref name="item"/>.
/// </summary>
/// <param name="item">Aligned read supplying the aligned sequence and its position.</param>
/// <param name="count">Amount added to the counter of every aligned base.</param>
public void Add(SAMAlignedItem item, int count)
{
    string aligned, reference;
    item.GetSequences(out aligned, out reference);

    for (int offset = 0; offset < aligned.Length; offset++)
    {
        var baseChar = aligned[offset];

        // Counters are indexed by the read position plus the offset inside the alignment.
        var counter = Count[(int)(item.Pos) + offset];

        int existing;
        if (!counter.TryGetValue(baseChar, out existing))
        {
            counter[baseChar] = count;
            continue;
        }

        counter[baseChar] = existing + count;
    }
}
/// <summary>
/// Reads the alternative hits listed in the "XA:Z:" tag of a record and adds one
/// location per hit to <paramref name="item"/>.
/// </summary>
/// <param name="parts">Tab-separated fields of the record, including the optional tags.</param>
/// <param name="item">Aligned item that receives the alternative locations.</param>
/// <remarks>
/// The value of the "X0:i:" tag minus one is used as the maximum number of alternative hits to read.
/// Nothing is added when either tag is missing or when that number is not positive.
/// </remarks>
public override void ParseAlternativeHits(string[] parts, SAMAlignedItem item)
{
    var countstr = GetOptionValue(parts, "X0:i:", false);
    if (string.IsNullOrEmpty(countstr))
    {
        return;
    }

    var count = int.Parse(countstr) - 1;
    if (count <= 0)
    {
        return;
    }

    var xaz = GetOptionValue(parts, "XA:Z:", false);
    if (string.IsNullOrEmpty(xaz))
    {
        return;
    }

    // Walk through the successive matches: each match describes one alternative hit.
    // Stopping on an unsuccessful match covers tags that list fewer hits than X0 announces.
    var match = _reg.Match(xaz);
    for (var i = 0; i < count && match.Success; i++, match = match.NextMatch())
    {
        var loc = new SAMAlignedLocation(item)
        {
            Seqname = match.Groups[1].Value,
            Strand = match.Groups[2].Value[0],
            Start = long.Parse(match.Groups[3].Value)
        };

        // The end is derived from the length of the item's first location.
        loc.End = loc.Start + item.Locations[0].Length - 1;
        loc.Cigar = match.Groups[4].Value;
        loc.NumberOfMismatch = int.Parse(match.Groups[5].Value);
        item.AddLocation(loc);
    }
}
/// <summary>
/// Extension point for parsing alternative hits of a record; this implementation does nothing.
/// </summary>
/// <param name="parts">Fields of the record.</param>
/// <param name="target">Item that an overriding implementation may add locations to.</param>
public virtual void ParseAlternativeHits(string[] parts, SAMAlignedItem target)
{
    // No-op in this implementation.
}
/// <summary>
/// Reads the alignment file line by line, accumulates per-position base counts and writes every
/// position that passes the Fisher exact test and the alternative allele frequency threshold to
/// the output file. When requested, an IGV script with one snapshot per reported miRNA position
/// is written next to the output file.
/// </summary>
/// <returns>An array holding the output file name.</returns>
/// <exception cref="UserTerminatedException">Thrown when cancellation is pending.</exception>
public override IEnumerable<string> Process()
{
    PileupCountList pc = new PileupCountList();

    var format = options.GetSAMFormat();

    var cm = new SmallRNACountMap(options.CountFile);

    // miRNA regions grouped by chromosome name, with any "chr" prefix removed so that the keys
    // match the chromosome names stored for the reads below.
    var srItems = SequenceRegionUtils.GetSequenceRegions(options.CoordinateFile, "miRNA", options.BedAsGtf);
    srItems.ForEach(m =>
    {
        m.Seqname = m.Seqname.StringAfter("chr");
    });
    var srmap = srItems.GroupBy(m => m.Seqname).ToDictionary(m => m.Key, m => m.ToList());

    StreamWriter swScript = null;
    try
    {
        if (options.ExportIgvScript)
        {
            swScript = new StreamWriter(options.OutputFile + ".igv");
            swScript.WriteLine("snapshotDirectory {0}", Path.GetDirectoryName(options.OutputFile).Replace('\\', '/'));
        }

        using (StreamWriter sw = new StreamWriter(options.OutputFile))
        {
            // NOTE(review): the header literal is kept unchanged; confirm its line breaks and column separators.
            sw.WriteLine(@"##fileformat=VCFv4.2 ##fileDate={0:yyyyMMdd} ##source={1} ##phasing=partial ##INFO=<ID=NS,Number=1,Type=Integer,Description=""Number of Samples With Data""> ##INFO=<ID=DP,Number=1,Type=Integer,Description=""Total Depth""> ##INFO=<ID=AF,Number=A,Type=Float,Description=""Allele Frequency""> ##INFO=<ID=FP,Number=1,Type=Float,Description=""Fisher Exact Test P-Value""> ##INFO=<ID=MN,Number=.,Type=String,Description=""miRNA name contains this position""> ##FILTER=<ID=FisherET,Description=""Fisher exact test Pvalue less than {2}""> ##FILTER=<ID=AltAlleFreq,Description=""Alternative allele frequency less than {3}""> ##FILTER=<ID=notMiRNA,Description=""Position not located in miRNA locus""> ##FORMAT=<ID=DP,Number=1,Type=Integer,Description=""Read Depth""> ##FORMAT=<ID=AD,Number=1,Type=Integer,Description=""Allelic Depth""> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT {4}", DateTime.Now, "PileupCountBuilder", options.FisherPValue, options.MinimumAlternativeAlleleFrequency, Path.GetFileNameWithoutExtension(options.InputFile));

            using (var sr = SAMFactory.GetReader(options.InputFile, true))
            {
                int count = 0;
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    count++;

                    if (count % 100 == 0)
                    {
                        if (Progress.IsCancellationPending())
                        {
                            throw new UserTerminatedException();
                        }
                    }

                    if (count % 100000 == 0)
                    {
                        Progress.SetMessage("{0} reads processed", count);
                    }

                    var parts = line.Split('\t');

                    var qname = parts[SAMFormatConst.QNAME_INDEX];
                    var seq = parts[SAMFormatConst.SEQ_INDEX];

                    //too short
                    if (seq.Length < options.MinimumReadLength)
                    {
                        continue;
                    }

                    SAMFlags flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);

                    //unmatched
                    if (flag.HasFlag(SAMFlags.UnmappedQuery))
                    {
                        continue;
                    }

                    var cigar = parts[SAMFormatConst.CIGAR_INDEX];

                    //insertion/deletion
                    if (cigar.Any(m => m == 'I' || m == 'D'))
                    {
                        continue;
                    }

                    var sam = new SAMAlignedItem()
                    {
                        Qname = qname,
                    };

                    bool isReversed = flag.HasFlag(SAMFlags.QueryOnReverseStrand);
                    char strand;
                    if (isReversed)
                    {
                        strand = '-';
                        sam.Sequence = SequenceUtils.GetReverseComplementedSequence(seq);
                    }
                    else
                    {
                        strand = '+';
                        sam.Sequence = seq;
                    }

                    var loc = new SAMAlignedLocation(sam)
                    {
                        Seqname = parts[SAMFormatConst.RNAME_INDEX].StringAfter("chr"),
                        Start = int.Parse(parts[SAMFormatConst.POS_INDEX]),
                        Strand = strand,
                        Cigar = cigar,
                        MismatchPositions = format.GetMismatchPositions(parts),
                        NumberOfMismatch = format.GetNumberOfMismatch(parts),
                        Sequence = seq
                    };

                    loc.ParseEnd(sam.Sequence);
                    sam.AddLocation(loc);

                    if (format.HasAlternativeHits)
                    {
                        format.ParseAlternativeHits(parts, sam);
                    }

                    var finished = pc.Add(sam, cm.GetCount(sam.Qname));
                    if (null == finished || 0 == finished.Count)
                    {
                        continue;
                    }

                    WriteFinishedPositions(finished, srmap, sw, swScript);

                    finished.Clear();
                }
            }

            // Positions still buffered after the last read were never returned by pc.Add;
            // evaluate them as well so the tail of the input is not silently dropped.
            if (pc.Count != null)
            {
                WriteFinishedPositions(pc.Count, srmap, sw, swScript);
            }
        }
    }
    finally
    {
        if (swScript != null)
        {
            swScript.Close();
        }
    }

    return new string[] { options.OutputFile };
}

/// <summary>
/// Tests each finished position and writes the ones that pass both thresholds to the output,
/// plus an IGV snapshot command when the position lies in at least one miRNA region.
/// </summary>
private void WriteFinishedPositions(IEnumerable<PileupCount> finished, Dictionary<string, List<GtfItem>> srmap, StreamWriter sw, StreamWriter swScript)
{
    foreach (var fin in finished)
    {
        var ft = fin.FisherExactTest();

        if (ft.PValue <= options.FisherPValue)
        {
            var total = fin.Sum(m => m.Value);
            var minallele = total * options.MinimumAlternativeAlleleFrequency;
            if (ft.Sample2.Failed >= minallele)
            {
                // Look the regions up by the chromosome of the finished position itself: when the
                // buffer was flushed by a chromosome change, the current read is already on
                // another chromosome.
                List<GtfItem> srs;
                List<string> ranges = new List<string>();
                if (srmap.TryGetValue(fin.Chromosome, out srs))
                {
                    foreach (var seqr in srs)
                    {
                        if (seqr.Contains(fin.Position))
                        {
                            ranges.Add(seqr.GetNameLocation());
                        }
                    }
                }

                // Every observed base except the reference base, ordered by base.
                var alter = (from r in fin
                             where r.Key != fin.Reference
                             orderby r.Key
                             select r).ToList();

                var str = string.Format("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\tNS={7};DP={8};AF={9};FP={10:0.##E0}{11}\tDP:AD\t{12}:{13},{14}",
                    fin.Chromosome,
                    fin.Position,
                    ".",
                    fin.Reference,
                    (from r in alter select r.Key.ToString()).Merge(","),
                    0,
                    ranges.Count == 0 ? "notMiRNA" : "PASS",
                    1,
                    total,
                    (from r in alter select string.Format("{0:0.###}", r.Value * 1.0 / total)).Merge(","),
                    ft.PValue,
                    ranges.Count == 0 ? "" : ";" + ranges.Merge(","),
                    total,
                    ft.Sample2.Succeed,
                    (from r in alter select r.Value.ToString()).Merge(","));

                sw.WriteLine(str);

                if (swScript != null && ranges.Count > 0)
                {
                    swScript.WriteLine(@"goto {0}:{1} sort position snapshot {0}_{2}_{1}.png", fin.Chromosome, fin.Position, ranges[0].Replace('(', '_').Replace(')', '_').Replace(':', '_'));
                }
            }
        }
    }
}
/// <summary>
/// Reads lines from the underlying file until one yields a mapped read whose mapping quality
/// reaches the configured minimum, and converts it into an aligned item.
/// </summary>
/// <returns>The next accepted aligned item, or null when the file has no more lines.</returns>
public SAMAlignedItem NextSAMAlignedItem()
{
    for (var line = _file.ReadLine(); line != null; line = _file.ReadLine())
    {
        var fields = line.Split('\t');

        var qname = fields[SAMFormatConst.QNAME_INDEX];
        var seq = fields[SAMFormatConst.SEQ_INDEX];
        var flags = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);

        //unmatched
        if (flags.HasFlag(SAMFlags.UnmappedQuery))
        {
            continue;
        }

        //check map quality
        var mappingQuality = int.Parse(fields[SAMFormatConst.MAPQ_INDEX]);
        if (mappingQuality < _options.MinimumReadQuality)
        {
            continue;
        }

        var aligned = new SAMAlignedItem
        {
            Qname = qname,
        };

        // Reads on the reverse strand store the reverse complement as the item sequence.
        bool onReverseStrand = flags.HasFlag(SAMFlags.QueryOnReverseStrand);
        char strand = onReverseStrand ? '-' : '+';
        aligned.Sequence = onReverseStrand ? SequenceUtils.GetReverseComplementedSequence(seq) : seq;

        var location = new SAMAlignedLocation(aligned)
        {
            Seqname = fields[SAMFormatConst.RNAME_INDEX],
            Start = int.Parse(fields[SAMFormatConst.POS_INDEX]),
            Strand = strand,
            Cigar = fields[SAMFormatConst.CIGAR_INDEX],
            MismatchPositions = _format.GetMismatchPositions(fields),
            NumberOfMismatch = _format.GetNumberOfMismatch(fields),
            Sequence = seq,
            Qual = fields[SAMFormatConst.QUAL_INDEX]
        };

        location.ParseEnd(aligned.Sequence);
        aligned.AddLocation(location);

        if (_format.HasAlternativeHits)
        {
            _format.ParseAlternativeHits(fields, aligned);
        }

        return aligned;
    }

    return null;
}
/// <summary>
/// Adds the aligned positions of an alignment result to the buffer and returns the
/// positions that are completed by it.
/// </summary>
/// <param name="item">alignment result</param>
/// <returns>
/// completed positions; null when the item is on the current chromosome and the
/// current position equals -1
/// </returns>
public List<AlignedPositionMap> Add(SAMAlignedItem item)
{
    List<AlignedPositionMap> completed = null;

    if (!item.Locations[0].Seqname.Equals(this.Chromosome))
    {
        // The alignment moved to another chromosome: every buffered position is completed.
        completed = Positions;
        Positions = new List<AlignedPositionMap>();
        PositionMap = new Dictionary<long, AlignedPositionMap>();
    }
    else if (this.Position != -1)
    {
        if (item.Pos > this.Positions.Last().Position)
        {
            // The alignment starts after the last buffered position: every buffered position is completed.
            completed = Positions;
            Positions = new List<AlignedPositionMap>();
            PositionMap = new Dictionary<long, AlignedPositionMap>();
        }
        else
        {
            // Only the leading positions located before the alignment start are completed.
            int leading = 0;
            while (Positions[leading].Position < item.Pos)
            {
                PositionMap.Remove(Positions[leading].Position);
                leading++;
            }

            completed = Positions.GetRange(0, leading);
            Positions.RemoveRange(0, leading);
        }
    }

    List<AlignedPosition> alignedPositions = item.GetAlignedPositions();
    foreach (var aligned in alignedPositions)
    {
        // Find or create the buffer entry of this position.
        AlignedPositionMap entry;
        if (!PositionMap.TryGetValue(aligned.Position, out entry))
        {
            entry = new AlignedPositionMap()
            {
                Chromosome = item.Locations[0].Seqname,
                Position = aligned.Position
            };
            Positions.Add(entry);
            PositionMap[entry.Position] = entry;
        }

        // Find or create the list that collects this aligned event at the position.
        List<AlignedPosition> sameEvent;
        if (!entry.TryGetValue(aligned.AlignedEvent, out sameEvent))
        {
            sameEvent = new List<AlignedPosition>();
            entry[aligned.AlignedEvent] = sameEvent;
        }

        sameEvent.Add(aligned);
    }

    return completed;
}
/// <summary>
/// Adds two overlapping reads on one chromosome and then a read on another chromosome, and
/// checks the positions returned by each call as well as the positions left in the buffer.
/// </summary>
public void TestAdd()
{
    const string firstSequence = "CTCTTAGATCGATGTGGTGCTCCGGAAAAAA";
    const string secondSequence = "GATGTAGTGCTCCGGATTTTT";

    var first = new SAMAlignedItem()
    {
        Sequence = firstSequence,
    };
    first.AddLocation(new SAMAlignedLocation(first)
    {
        Seqname = "chr13",
        Cigar = "5S21M5S",
        MismatchPositions = "10T10",
        Start = 39979942,
        Sequence = firstSequence
    });

    var second = new SAMAlignedItem()
    {
        Sequence = secondSequence
    };
    second.AddLocation(new SAMAlignedLocation(second)
    {
        Seqname = "chr13",
        Cigar = "21M",
        MismatchPositions = "5T15",
        Start = 39979947,
        Sequence = secondSequence
    });

    var collected = new List<PileupCount>();
    var pileup = new PileupCountList();

    // First read: an empty list comes back.
    var afterFirst = pileup.Add(first, 2);
    Assert.AreEqual(0, afterFirst.Count);

    // Second read starts five positions later, so five positions are returned.
    var afterSecond = pileup.Add(second, 3);
    Assert.AreEqual(5, afterSecond.Count);
    collected.AddRange(afterSecond);
    for (int offset = 0; offset < afterSecond.Count; offset++)
    {
        var position = afterSecond[offset];
        Assert.AreEqual(first.Sequence[5 + offset], position.Reference);
        Assert.AreEqual(first.Sequence[5 + offset], position.First().Key);
        Assert.AreEqual(2, position.First().Value);
        Assert.AreEqual(first.Locations[0].Seqname, position.Chromosome);
        Assert.AreEqual(first.Pos + offset, position.Position);
    }

    // Moving the first read to another chromosome returns everything that was buffered.
    first.Locations[0].Seqname = "chr14";
    var afterChromosomeChange = pileup.Add(first, 2);
    Assert.AreEqual(21, afterChromosomeChange.Count);
    collected.AddRange(afterChromosomeChange);

    for (int offset = 0; offset < 16; offset++)
    {
        var position = afterChromosomeChange[offset];
        if (offset == 5)
        {
            Assert.AreEqual('T', position.Reference);
            Assert.True(position.ContainsKey('G'));
            Assert.AreEqual(2, position['G']);
            Assert.True(position.ContainsKey('A'));
            Assert.AreEqual(3, position['A']);
        }
        else
        {
            Assert.AreEqual(second.Sequence[offset], position.Reference);
            Assert.AreEqual(second.Sequence[offset], position.First().Key);
            Assert.AreEqual(5, position.First().Value);
        }
    }

    for (int offset = 16; offset < 21; offset++)
    {
        var position = afterChromosomeChange[offset];
        Assert.AreEqual(second.Sequence[offset], position.First().Key);
        Assert.AreEqual(3, position.First().Value);
    }

    // The buffer now holds the positions of the read added on the new chromosome.
    var remaining = pileup.Count;
    Assert.AreEqual(21, remaining.Count);
    collected.AddRange(remaining);
    for (int offset = 0; offset < remaining.Count; offset++)
    {
        var position = remaining[offset];
        Assert.AreEqual(first.Sequence[5 + offset], position.First().Key);
        Assert.AreEqual(2, position.First().Value);
        Assert.AreEqual(first.Locations[0].Seqname, position.Chromosome);
        Assert.AreEqual(first.Pos + offset, position.Position);
    }
}
/// <summary>
/// Reads every "query" element below the "queries" child of <paramref name="root"/> and
/// converts it, together with its "location" children, into a <see cref="SAMAlignedItem"/>.
/// </summary>
/// <param name="root">Element that owns the "queries" element.</param>
/// <returns>The converted items in document order.</returns>
public static List<SAMAlignedItem> ToSAMAlignedItems(this XElement root)
{
    var converted = new List<SAMAlignedItem>();

    var queryElements = root.Element("queries").Elements("query");
    foreach (var queryElement in queryElements)
    {
        var aligned = new SAMAlignedItem();
        aligned.Qname = queryElement.Attribute("name").Value;
        aligned.Sequence = queryElement.Attribute("sequence").Value;
        aligned.QueryCount = int.Parse(queryElement.Attribute("count").Value);
        // A missing "sample" attribute yields null.
        aligned.Sample = queryElement.GetAttributeValue("sample", null);
        converted.Add(aligned);

        foreach (var locationElement in queryElement.Elements("location"))
        {
            var location = new SAMAlignedLocation(aligned);
            location.ParseLocation(locationElement);
            location.Cigar = locationElement.Attribute("cigar").Value;
            location.AlignmentScore = int.Parse(locationElement.Attribute("score").Value);
            location.MismatchPositions = locationElement.Attribute("mdz").Value;
            location.NumberOfMismatch = int.Parse(locationElement.Attribute("nmi").Value);

            // "nnpm" is only parsed when the attribute exists.
            var optionalNoPenalty = locationElement.Attribute("nnpm");
            if (optionalNoPenalty == null)
            {
                continue;
            }

            location.NumberOfNoPenaltyMutation = int.Parse(optionalNoPenalty.Value);
        }
    }

    return converted;
}