/// <summary>
/// Reads <paramref name="fileName"/> line by line through <c>SAMFactory.GetReader</c> and
/// returns one item per line that passes every check of <c>_filter</c>
/// (flags, mismatch count, query name, read length, CIGAR and locus).
/// </summary>
/// <typeparam name="T">Item type instantiated with <c>new T()</c> for each candidate line.</typeparam>
/// <param name="fileName">File handed to <c>SAMFactory.GetReader</c>.</param>
/// <param name="totalQueries">
/// Receives one <c>QueryInfo</c> for every line read, including lines that are rejected
/// afterwards. Mismatch and Length are only filled in when the flags were accepted.
/// </param>
/// <returns>The accepted items, in the order they were read.</returns>
/// <exception cref="UserTerminatedException">
/// Thrown when <c>Progress</c> reports a pending cancellation (polled every 1000 lines).
/// </exception>
protected override List<T> DoBuild<T>(string fileName, out List<QueryInfo> totalQueries)
{
    // The format object is stored in a field; it is used below to extract
    // mismatch, score and alternative-hit information from each line.
    _format = _options.GetSAMFormat();

    var acceptedItems = new List<T>();
    totalQueries = new List<QueryInfo>();

    using (var reader = SAMFactory.GetReader(fileName, true))
    {
        int linesRead = 0;
        int acceptedCount = 0;

        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            linesRead++;

            // Poll for cancellation only every 1000 lines.
            if (linesRead % 1000 == 0 && Progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            var fields = line.Split('\t');
            var qname = fields[SAMFormatConst.QNAME_INDEX];

            // Every line is recorded as a query, whether or not it survives filtering.
            var query = new QueryInfo(qname);
            totalQueries.Add(query);

            var flags = (SAMFlags)int.Parse(fields[SAMFormatConst.FLAG_INDEX]);
            if (!_filter.AcceptFlags(flags))
            {
                continue;
            }

            var mismatchCount = _format.GetNumberOfMismatch(fields);
            var sequence = fields[SAMFormatConst.SEQ_INDEX];
            query.Mismatch = mismatchCount;
            query.Length = sequence.Length;

            // Checks are evaluated left to right and stop at the first rejection:
            // mismatch count, then query name, then read length.
            if (!_filter.AcceptMismatch(mismatchCount)
                || !_filter.AcceptQueryName(qname)
                || !_filter.AcceptLength(sequence.Length))
            {
                continue;
            }

            var cigar = fields[SAMFormatConst.CIGAR_INDEX];
            if (!_filter.AcceptCigar(cigar))
            {
                continue;
            }

            // NOTE(review): presumably StringAfter("chr") drops a leading "chr" from the
            // reference name - confirm against the extension's implementation.
            var seqname = fields[SAMFormatConst.RNAME_INDEX].StringAfter("chr");
            var start = int.Parse(fields[SAMFormatConst.POS_INDEX]);
            var end = SAMUtils.ParseEnd(start, cigar);

            bool onReverseStrand = flags.HasFlag(SAMFlags.QueryOnReverseStrand);
            char strand = onReverseStrand ? '-' : '+';

            // NOTE(review): the location is never explicitly added to the item in this
            // method; presumably the SAMAlignedLocation constructor attaches itself to
            // the item it is given - confirm.
            var item = new T();
            var location = new SAMAlignedLocation(item)
            {
                Seqname = seqname,
                Start = start,
                End = end,
                Strand = strand,
            };

            if (!_filter.AcceptLocus(location))
            {
                continue;
            }

            // Reverse-strand reads are stored reverse-complemented.
            if (onReverseStrand)
            {
                sequence = SequenceUtils.GetReverseComplementedSequence(sequence);
            }

            item.Qname = qname;
            item.Sequence = sequence;

            location.AlignmentScore = _format.GetAlignmentScore(fields);
            location.Cigar = cigar;
            location.NumberOfMismatch = mismatchCount;
            location.MismatchPositions = _format.GetMismatchPositions(fields);

            if (_format.HasAlternativeHits)
            {
                _format.ParseAlternativeHits(fields, item);
            }

            acceptedItems.Add(item);
            acceptedCount++;

            // Progress is reported once per 100 accepted items.
            if (acceptedCount % 100 == 0)
            {
                Progress.SetMessage("{0} feature reads from {1} reads", acceptedCount, linesRead);
            }
        }
    }

    return acceptedItems;
}
/// <summary>
/// Reads <paramref name="fileName"/> line by line through <c>SAMFactory.GetReader</c> and
/// returns one item per mapped read whose mismatch count, query name and read length
/// satisfy the limits configured in <c>_options</c>.
/// </summary>
/// <typeparam name="T">Item type instantiated with <c>new T()</c> for each candidate line.</typeparam>
/// <param name="fileName">File handed to <c>SAMFactory.GetReader</c>.</param>
/// <param name="totalQueries">
/// Receives one <c>QueryInfo</c> for every line read, including lines that are rejected
/// afterwards. Mismatch, Length and NoPenaltyMutation are only filled in for lines that
/// are not flagged as unmapped.
/// </param>
/// <returns>The accepted candidates, in the order they were read.</returns>
/// <exception cref="UserTerminatedException">
/// Thrown when <c>Progress</c> reports a pending cancellation (polled every 1000 lines).
/// </exception>
protected override List<T> DoBuild<T>(string fileName, out List<QueryInfo> totalQueries)
{
    var result = new List<T>();

    // The format object is stored in a field; it is used below to extract
    // mismatch, score and alternative-hit information from each line.
    _format = _options.GetSAMFormat();

    totalQueries = new List<QueryInfo>();

    using (var sr = SAMFactory.GetReader(fileName, true))
    {
        int count = 0;
        int candidateCount = 0;
        string line;

        while ((line = sr.ReadLine()) != null)
        {
            // Poll for cancellation only every 1000 lines (including before the first).
            if (count % 1000 == 0 && Progress.IsCancellationPending())
            {
                throw new UserTerminatedException();
            }

            // Report progress after every 100000 lines already processed.
            if (count % 100000 == 0 && count > 0)
            {
                Progress.SetMessage("{0} candidates from {1} reads", candidateCount, count);
            }

            count++;

            // Every line is recorded as a query, whether or not it survives filtering.
            var qname = line.StringBefore("\t");
            var qi = new QueryInfo(qname);
            totalQueries.Add(qi);

            var parts = line.Split('\t');
            var flag = (SAMFlags)int.Parse(parts[SAMFormatConst.FLAG_INDEX]);

            // Unmapped reads carry no alignment to report.
            if (flag.HasFlag(SAMFlags.UnmappedQuery))
            {
                continue;
            }

            var mismatchCount = _format.GetNumberOfMismatch(parts);
            var seq = parts[SAMFormatConst.SEQ_INDEX];
            qi.Mismatch = mismatchCount;
            qi.Length = seq.Length;

            // NOTE: _options.T2cAsNoPenaltyMutation is not applied by this method; the
            // no-penalty mutation count is always recorded as 0.
            qi.NoPenaltyMutation = 0;

            // Too many mismatches.
            if (mismatchCount > _options.MaximumMismatch)
            {
                continue;
            }

            if (!AcceptQueryName(qname))
            {
                continue;
            }

            // Too short or too long.
            if (seq.Length < _options.MinimumReadLength || seq.Length > _options.MaximumReadLength)
            {
                continue;
            }

            // Reads with insertions/deletions in the CIGAR are not filtered out here.
            var cigar = parts[SAMFormatConst.CIGAR_INDEX];

            // Reverse-strand reads are stored reverse-complemented.
            bool isReversed = flag.HasFlag(SAMFlags.QueryOnReverseStrand);
            char strand;
            if (isReversed)
            {
                strand = '-';
                seq = SequenceUtils.GetReverseComplementedSequence(seq);
            }
            else
            {
                strand = '+';
            }

            var score = _format.GetAlignmentScore(parts);

            var sam = new T()
            {
                Qname = qname,
                Sequence = seq
            };

            var seqname = parts[SAMFormatConst.RNAME_INDEX];
            var loc = new SAMAlignedLocation(sam)
            {
                Seqname = seqname,
                Start = int.Parse(parts[SAMFormatConst.POS_INDEX]),
                Strand = strand,
                Cigar = cigar,
                NumberOfMismatch = mismatchCount,
                AlignmentScore = score,
                MismatchPositions = _format.GetMismatchPositions(parts)
            };

            // The end coordinate is derived by the location itself from the stored sequence.
            loc.ParseEnd(sam.Sequence);
            sam.AddLocation(loc);

            if (_format.HasAlternativeHits)
            {
                _format.ParseAlternativeHits(parts, sam);
            }

            result.Add(sam);
            candidateCount++;
        }

        Progress.SetMessage("Finally, there are {0} candidates from {1} reads", candidateCount, count);
    }

    return result;
}