/// <summary>
/// Asserts that the header of a parsed BAM alignment matches the expected
/// header types, tag keys and tag values stored in the XML configuration.
/// </summary>
/// <param name="nodeName">XML node name that identifies the test case.</param>
/// <param name="seqAlignment">Alignment map whose header is checked.</param>
void ValidateBAMHeaderRecords(string nodeName, SequenceAlignmentMap seqAlignment)
{
    // Expected data is stored as comma separated lists in the XML file.
    string[] wantedValues = _utilityObj._xmlUtil.GetTextValue(
        nodeName, Constants.RecordTagValuesNode).Split(',');
    string[] wantedKeys = _utilityObj._xmlUtil.GetTextValue(
        nodeName, Constants.RecordTagKeysNode).Split(',');
    string[] wantedTypes = _utilityObj._xmlUtil.GetTextValue(
        nodeName, Constants.HeaderTyepsNodes).Split(',');

    int typeCursor = 0;

    // Keys and values are flattened across all record fields, so a single
    // running cursor addresses both expected arrays.
    int tagCursor = 0;

    foreach (SAMRecordField field in seqAlignment.Header.RecordFields)
    {
        Assert.AreEqual(
            wantedTypes[typeCursor].Replace("/", ""),
            field.Typecode.ToString((IFormatProvider)null).Replace("/", ""));
        typeCursor++;

        foreach (var tag in field.Tags)
        {
            Assert.AreEqual(
                wantedKeys[tagCursor].Replace("/", ""),
                tag.Tag.ToString((IFormatProvider)null).Replace("/", ""));

            // Line breaks are stripped from the actual value before comparing.
            Assert.AreEqual(
                wantedValues[tagCursor].Replace("/", ""),
                tag.Value.ToString((IFormatProvider)null)
                    .Replace("/", "").Replace("\r", "").Replace("\n", ""));

            tagCursor++;
        }
    }
}
/// <summary>
/// Replaces the reference sequences of the header with the name/length pairs
/// read from the ReferenceNamesAndLength file (one tab separated pair per line).
/// Reading stops at the first empty line or at the end of the file.
/// </summary>
/// <param name="header">SAM alignment header to update.</param>
private void UpdateReferenceInformationFromFile(SAMAlignmentHeader header)
{
    header.ReferenceSequences.Clear();

    using (StreamReader reader = new StreamReader(ReferenceNamesAndLength))
    {
        string[] separators = { "\t" };

        for (string line = reader.ReadLine(); !string.IsNullOrEmpty(line); line = reader.ReadLine())
        {
            string[] columns = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);

            // A usable line needs at least a name and a length column.
            if (columns.Length <= 1)
            {
                throw new InvalidOperationException("Invalid file for reference name and length");
            }

            string referenceName = columns[0];
            long referenceLength = long.Parse(columns[1], CultureInfo.InvariantCulture);
            header.ReferenceSequences.Add(new ReferenceSequenceInfo(referenceName, referenceLength));
        }
    }
}
/// <summary>
/// Displays the header records and comments of the alignment map on the console.
/// </summary>
/// <param name="seqAlignmentMap">Alignment map whose header is printed.</param>
private void DisplayHeader(SequenceAlignmentMap seqAlignmentMap)
{
    SAMAlignmentHeader header = seqAlignmentMap.Header;
    IList<SAMRecordField> fields = header.RecordFields;
    IList<string> comments = header.Comments;

    // Header lines: one "@<type>" line per record field followed by its tags.
    if (fields.Count > 0)
    {
        Console.WriteLine("MetaData:");

        foreach (SAMRecordField field in fields)
        {
            Console.Write("\n@{0}", field.Typecode);

            foreach (var tag in field.Tags)
            {
                Console.Write("\t{0}:{1}", tag.Tag, tag.Value);
            }
        }
    }

    // Comment lines, if any.
    foreach (string comment in comments)
    {
        Console.Write("\n@CO\t{0}\n", comment.ToString());
    }
}
/// <summary>
/// Parses the header of the small SAM test file and asserts that its header
/// types, tag keys and tag values match the expected values from the XML
/// configuration.
/// </summary>
public void ValidateSAMParserHeader()
{
    string samPath = utilityObj.xmlUtil.GetTextValue(
        Constants.SmallSAMFileNode, Constants.FilePathNode);
    string[] wantedValues = utilityObj.xmlUtil.GetTextValue(
        Constants.SmallSAMFileNode, Constants.RecordTagValuesNode).Split(',');
    string[] wantedKeys = utilityObj.xmlUtil.GetTextValue(
        Constants.SmallSAMFileNode, Constants.RecordTagKeysNode).Split(',');
    string[] wantedTypes = utilityObj.xmlUtil.GetTextValue(
        Constants.SmallSAMFileNode, Constants.HeaderTyepsNodes).Split(',');

    SAMAlignmentHeader parsedHeader = SAMParser.ParseSAMHeader(samPath);

    int typeCursor = 0;

    // Expected keys and values are flattened across all record fields.
    int tagCursor = 0;

    foreach (var field in parsedHeader.RecordFields)
    {
        Assert.AreEqual(
            wantedTypes[typeCursor].Replace("/", ""),
            field.Typecode.ToString((IFormatProvider)null).Replace("/", ""));
        typeCursor++;

        foreach (var tag in field.Tags)
        {
            Assert.AreEqual(
                wantedKeys[tagCursor].Replace("/", ""),
                tag.Tag.ToString((IFormatProvider)null).Replace("/", ""));

            // Line breaks are stripped from the actual value before comparing.
            Assert.AreEqual(
                wantedValues[tagCursor].Replace("/", ""),
                tag.Value.ToString((IFormatProvider)null)
                    .Replace("/", "").Replace("\r", "").Replace("\n", ""));

            tagCursor++;
        }
    }
}
/// <summary>
/// Writes the header and the aligned sequences of the specified alignment map
/// to the stream using the BAM structure.
/// </summary>
/// <param name="sequenceAlignmentMap">SequenceAlignmentMap object.</param>
/// <param name="writer">Stream to write.</param>
/// <param name="createSortedFile">If this flag is true output file will be sorted.</param>
private void WriteUncompressed(SequenceAlignmentMap sequenceAlignmentMap, Stream writer, bool createSortedFile)
{
    SAMAlignmentHeader headerToWrite = sequenceAlignmentMap.Header;

    // Sorting by chromosome name requires the SQ fields of the header to be
    // sorted as well; the reference ranges are then taken from that header.
    if (createSortedFile && SortType == BAMSortByFields.ChromosomeNameAndCoordinates)
    {
        headerToWrite = GetHeaderWithSortedSQFields(headerToWrite, true);
        this.refSequences = headerToWrite.GetReferenceSequenceRanges();
    }

    if (this.refSequences == null)
    {
        this.refSequences = headerToWrite.GetReferenceSequenceRanges();
    }

    WriteHeader(headerToWrite, writer);
    writer.Flush();

    if (!createSortedFile)
    {
        // Unsorted output: write the sequences in their existing order.
        foreach (SAMAlignedSequence alignedSeq in sequenceAlignmentMap.QuerySequences)
        {
            this.ValidateSQHeader(alignedSeq.RName);
            this.WriteAlignedSequence(alignedSeq, writer);
            writer.Flush();
        }
    }
    else
    {
        WriteUncompressedSortedBAM(sequenceAlignmentMap, writer);
    }

    writer.Flush();
}
/// <summary>
/// Reads the BAM file at InputFilePath, writes its header and, unless only the
/// header was requested, every aligned sequence it contains.
/// </summary>
private void ConvertFromBAMToSAM()
{
    using (Stream input = new FileStream(InputFilePath, FileMode.Open, FileAccess.Read))
    {
        SAMAlignmentHeader bamHeader;
        try
        {
            bamHeader = bamparser.GetHeader(input);
        }
        catch (Exception ex)
        {
            // Any failure while reading the header is reported as an invalid BAM file.
            throw new InvalidOperationException(Resources.InvalidBAMFile, ex);
        }

        WriteHeader(bamHeader);

        if (HeaderOnly)
        {
            return;
        }

        // The read group records are only collected when a library was specified.
        if (!string.IsNullOrEmpty(Library))
        {
            rgRecFields = bamHeader.RecordFields
                .Where(field => field.Typecode.ToUpper().Equals("RG"))
                .ToList();
        }

        foreach (SAMAlignedSequence sequence in GetAlignedSequence(input))
        {
            WriteAlignedSequence(bamHeader, sequence);
        }
    }
}
// Validates the alignment header.
// Throws an ArgumentException carrying the validation message when
// header.IsValid() reports a problem (a non-empty message).
private static void ValidateAlignmentHeader(SAMAlignmentHeader header)
{
    string validationError = header.IsValid();
    if (string.IsNullOrEmpty(validationError))
    {
        return;
    }

    throw new ArgumentException(validationError);
}
/// <summary>
/// Reads the header section from the start of the BAM stream and returns it.
/// The reference names and lengths found in the stream are stored in
/// RefSeqNames / _refSeqLengths and become the reference sequences of the
/// returned header.
/// </summary>
/// <returns>The parsed header; an empty header when the stream carries no header text.</returns>
private SAMAlignmentHeader GetHeader()
{
    var result = new SAMAlignmentHeader();
    RefSeqNames = new RegexValidatedStringList(SAMAlignedSequenceHeader.RNameRegxExprPattern);
    _refSeqLengths = new List<int>();

    // Start over from the beginning of the stream with no cached block.
    ReadStream.Seek(0, SeekOrigin.Begin);
    _deCompressedStream = null;

    // First 8 bytes: the length of the header text is the int at offset 4.
    var scratch = new byte[8];
    ReadUnCompressedData(scratch, 0, 8);
    var headerTextLength = Helper.GetInt32(scratch, 4);

    var headerText = new byte[headerTextLength];
    if (headerTextLength != 0)
    {
        ReadUnCompressedData(headerText, 0, headerTextLength);
    }

    // Reference table: a count followed by (name length, name, sequence length) entries.
    ReadUnCompressedData(scratch, 0, 4);
    var referenceCount = Helper.GetInt32(scratch, 0);

    for (var refIndex = 0; refIndex < referenceCount; refIndex++)
    {
        ReadUnCompressedData(scratch, 0, 4);
        var nameLength = Helper.GetInt32(scratch, 0);

        var nameBytes = new byte[nameLength];
        ReadUnCompressedData(nameBytes, 0, nameLength);

        ReadUnCompressedData(scratch, 0, 4);
        var sequenceLength = Helper.GetInt32(scratch, 0);

        // The last byte of the stored name is not part of the name itself.
        RefSeqNames.Add(Encoding.ASCII.GetString(nameBytes, 0, nameBytes.Length - 1));
        _refSeqLengths.Add(sequenceLength);
    }

    if (headerText.Length != 0)
    {
        using (var textReader = new StringReader(Encoding.ASCII.GetString(headerText)))
        {
            result = SAMParser.ParseSAMHeader(textReader);
        }
    }

    // The reference table from the stream replaces whatever the text header declared.
    result.ReferenceSequences.Clear();
    for (var refIndex = 0; refIndex < RefSeqNames.Count; refIndex++)
    {
        result.ReferenceSequences.Add(
            new ReferenceSequenceInfo(RefSeqNames[refIndex], _refSeqLengths[refIndex]));
    }

    return result;
}
/// <summary>
/// Writes the BAM header (magic number, SAM header text and reference
/// sequence table) to the specified stream.
/// </summary>
/// <param name="header">SAMAlignmentHeader object</param>
/// <param name="writer">Stream to write.</param>
public void WriteHeader(SAMAlignmentHeader header, Stream writer)
{
    string missingArgument = header == null ? "header" : (writer == null ? "writer" : null);
    if (missingArgument != null)
    {
        throw new ArgumentNullException(missingArgument);
    }

    // Reuse the reference ranges when they were already prepared.
    if (_refSequences == null)
    {
        _refSequences = SortSequenceRanges(header.GetReferenceSequenceRanges());
    }

    string headerText;
    using (StringWriter textWriter = new StringWriter(CultureInfo.InvariantCulture))
    {
        SAMFormatter.WriteHeader(header, textWriter);
        headerText = textWriter.ToString();
    }

    int headerTextLength = headerText.Length;
    byte[] headerBytes = System.Text.Encoding.ASCII.GetBytes(headerText);

    // BAM magic number: 'B', 'A', 'M', 1.
    byte[] magic = { (byte)'B', (byte)'A', (byte)'M', 1 };
    writer.Write(magic, 0, 4);

    // Length of the header text followed by the plain SAM header text.
    writer.Write(Helper.GetLittleEndianByteArray(headerTextLength), 0, 4);
    writer.Write(headerBytes, 0, headerBytes.Length);

    // Number of reference sequences followed by one entry per reference.
    writer.Write(Helper.GetLittleEndianByteArray(_refSequences.Count), 0, 4);

    for (int refIndex = 0; refIndex < _refSequences.Count; refIndex++)
    {
        var range = _refSequences[refIndex];
        int nameLength = range.ID.Length;
        byte[] nameBytes = System.Text.Encoding.ASCII.GetBytes(range.ID);

        // The stored name length counts the terminating zero byte.
        writer.Write(Helper.GetLittleEndianByteArray(nameLength + 1), 0, 4);
        writer.Write(nameBytes, 0, nameLength);
        writer.WriteByte((byte)'\0');
        writer.Write(Helper.GetLittleEndianByteArray((int)range.End), 0, 4);
    }
}
// Validates the alignment and returns it as a SequenceAlignmentMap.
// A SequenceAlignmentMap is returned as is after its header has been validated;
// any other alignment is converted using the SAM header and the aligned
// sequence headers stored in its metadata. In both cases this.refSequences is
// refreshed from the header (sorted when a chromosome-sorted BAM is requested).
private SequenceAlignmentMap ValidateAlignment(ISequenceAlignment sequenceAlignment)
{
    SequenceAlignmentMap result = sequenceAlignment as SequenceAlignmentMap;
    bool alreadyMap = result != null;

    if (alreadyMap)
    {
        ValidateAlignmentHeader(result.Header);
    }
    else
    {
        SAMAlignmentHeader metadataHeader =
            sequenceAlignment.Metadata[Helper.SAMAlignmentHeaderKey] as SAMAlignmentHeader;
        if (metadataHeader == null)
        {
            throw new ArgumentException(Properties.Resource.SAMAlignmentHeaderNotFound);
        }

        ValidateAlignmentHeader(metadataHeader);
        result = new SequenceAlignmentMap(metadataHeader);
    }

    if (CreateSortedBAMFile && SortType == BAMSortByFields.ChromosomeNameAndCoordinates)
    {
        this.refSequences = SortSequenceRanges(result.Header.GetReferenceSequenceRanges());
    }
    else
    {
        this.refSequences = result.Header.GetReferenceSequenceRanges();
    }

    if (alreadyMap)
    {
        return result;
    }

    // Convert every aligned sequence using the SAM header kept in its metadata.
    foreach (IAlignedSequence source in sequenceAlignment.AlignedSequences)
    {
        SAMAlignedSequenceHeader sequenceHeader =
            source.Metadata[Helper.SAMAlignedSequenceHeaderKey] as SAMAlignedSequenceHeader;
        if (sequenceHeader == null)
        {
            throw new ArgumentException(Properties.Resource.SAMAlignedSequenceHeaderNotFound);
        }

        SAMAlignedSequence converted = new SAMAlignedSequence(sequenceHeader);
        converted.QuerySequence = source.Sequences[0];
        result.QuerySequences.Add(converted);
    }

    return result;
}
/// <summary>
/// Writes one aligned sequence either in SAM text form or, when BAM output is
/// requested, to the uncompressed BAM stream.
/// </summary>
/// <param name="header">Alignment header.</param>
/// <param name="alignedSequence">Aligned sequence to write.</param>
private void WriteAlignedSequence(SAMAlignmentHeader header, SAMAlignedSequence alignedSequence)
{
    if (!UnCompressedBAM && !BAMOutput)
    {
        SAMFormatter.WriteSAMAlignedSequence(alignedSequence, writer);
        return;
    }

    // For compressed BAM output the uncompressed stream is compressed later,
    // before it is sent to the output stream.
    bamformatter.WriteAlignedSequence(header, alignedSequence, bamUncompressedOutStream);
}
/// <summary>
/// Writes the BAM header (magic number, SAM header text and reference
/// sequence table) to the specified stream.
/// </summary>
/// <remarks>
/// The length fields are taken from the encoded byte arrays rather than from
/// the character counts of the strings. The text is UTF-8 encoded, so a
/// non-ASCII character occupies more than one byte; using the character count
/// would announce a length shorter than the data written (header text) or
/// truncate the data (reference names).
/// </remarks>
/// <param name="header">SAMAlignmentHeader object</param>
/// <param name="writer">Stream to write.</param>
/// <exception cref="ArgumentNullException">header or writer is null.</exception>
public void WriteHeader(SAMAlignmentHeader header, Stream writer)
{
    if (header == null)
    {
        throw new ArgumentNullException("header");
    }

    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    // Reuse the reference ranges when they were already prepared.
    if (this.refSequences == null)
    {
        this.refSequences = header.GetReferenceSequenceRanges();
    }

    string samHeader;
    using (StringWriter strwriter = new StringWriter(CultureInfo.InvariantCulture))
    {
        SAMFormatter.WriteHeader(strwriter, header);
        samHeader = strwriter.ToString();
    }

    byte[] headerBytes = Encoding.UTF8.GetBytes(samHeader);
    byte[] bamMagicNumber = { 66, 65, 77, 1 };

    // write BAM magic number
    writer.Write(bamMagicNumber, 0, 4);

    // Length of the header text, in bytes as actually written.
    writer.Write(Helper.GetLittleEndianByteArray(headerBytes.Length), 0, 4);

    // Plain header text in SAM
    writer.Write(headerBytes, 0, headerBytes.Length);

    // number of reference sequences
    writer.Write(Helper.GetLittleEndianByteArray(this.refSequences.Count), 0, 4);

    foreach (SequenceRange range in this.refSequences)
    {
        byte[] nameBytes = Encoding.UTF8.GetBytes(range.ID);

        // The stored name length counts the terminating zero byte.
        writer.Write(Helper.GetLittleEndianByteArray(nameBytes.Length + 1), 0, 4);
        writer.Write(nameBytes, 0, nameBytes.Length);
        writer.WriteByte((byte)'\0');
        writer.Write(Helper.GetLittleEndianByteArray((int)range.End), 0, 4);
    }
}
// Validates the alignment and returns it as a SequenceAlignmentMap.
// A SequenceAlignmentMap is validated in place (header, every aligned sequence
// header and its reference name) and returned as is. Any other alignment is
// converted using the SAM header and the aligned sequence headers stored in its
// metadata. In both cases _refSequences is refreshed from the header.
// Throws ArgumentException when a required header is missing or invalid.
private SequenceAlignmentMap ValidateAlignment(ISequenceAlignment sequenceAlignment)
{
    SequenceAlignmentMap seqAlignmentMap = sequenceAlignment as SequenceAlignmentMap;
    if (seqAlignmentMap != null)
    {
        ValidateAlignmentHeader(seqAlignmentMap.Header);
        _refSequences = SortSequenceRanges(seqAlignmentMap.Header.GetReferenceSequenceRanges());

        foreach (SAMAlignedSequence alignedSequence in seqAlignmentMap.QuerySequences)
        {
            string message = alignedSequence.IsValidHeader();
            if (!string.IsNullOrEmpty(message))
            {
                throw new ArgumentException(message);
            }

            ValidateSQHeader(alignedSequence.RName);
        }

        return seqAlignmentMap;
    }

    SAMAlignmentHeader header = sequenceAlignment.Metadata[Helper.SAMAlignmentHeaderKey] as SAMAlignmentHeader;
    if (header == null)
    {
        throw new ArgumentException(Resource.SAMAlignmentHeaderNotFound);
    }

    ValidateAlignmentHeader(header);

    seqAlignmentMap = new SequenceAlignmentMap(header);
    _refSequences = SortSequenceRanges(seqAlignmentMap.Header.GetReferenceSequenceRanges());

    foreach (IAlignedSequence alignedSeq in sequenceAlignment.AlignedSequences)
    {
        SAMAlignedSequenceHeader alignedHeader = alignedSeq.Metadata[Helper.SAMAlignedSequenceHeaderKey] as SAMAlignedSequenceHeader;
        if (alignedHeader == null)
        {
            throw new ArgumentException(Resource.SAMAlignedSequenceHeaderNotFound);
        }

        ValidateAlignedSequenceHeader(alignedHeader);
        ValidateSQHeader(alignedHeader.RName);

        SAMAlignedSequence samAlignedSeq = new SAMAlignedSequence(alignedHeader);
        samAlignedSeq.QuerySequence = alignedSeq.Sequences[0];

        // The converted sequence has to be stored in the map; without this the
        // returned map would contain the header but none of the sequences.
        seqAlignmentMap.QuerySequences.Add(samAlignedSeq);
    }

    return seqAlignmentMap;
}
/// <summary>
/// Verifies that SAMFormatter.WriteHeader throws an ArgumentNullException when
/// null is passed as its first argument.
/// </summary>
public void InvalidateSAMWriteTextWriter()
{
    SAMAlignmentHeader emptyHeader = new SAMAlignmentHeader();

    try
    {
        SAMFormatter.WriteHeader(null, emptyHeader);
    }
    catch (ArgumentNullException)
    {
        ApplicationLog.WriteLine(
            "SAM Formatter P2 : Successfully validated the exception");
        return;
    }

    // Reaching this point means the expected exception was not thrown.
    Assert.Fail();
}
/// <summary>
/// Writes the header to the output when header output was requested
/// (Header or HeaderOnly); otherwise does nothing.
/// </summary>
/// <param name="header">Alignment header to write.</param>
private void WriteHeader(SAMAlignmentHeader header)
{
    bool headerRequested = Header || HeaderOnly;
    if (!headerRequested)
    {
        return;
    }

    bool writeAsBam = UnCompressedBAM || BAMOutput;
    if (!writeAsBam)
    {
        SAMFormatter.WriteHeader(header, writer);
        return;
    }

    // For compressed BAM output the uncompressed stream is compressed later,
    // before it is sent to the output stream.
    bamformatter.WriteHeader(header, bamUncompressedOutStream);
}
/// <summary>
/// Checks that the input BAM file exists, reads its header and records the
/// read group IDs (as sample names), the number of read groups and the number
/// of reference sequences.
/// </summary>
void validateInputFileAndLoadSampleNames()
{
    bool inputExists = File.Exists(InputFilename);
    if (!inputExists)
    {
        throw new FileNotFoundException("Could not find file: " + InputFilename);
    }

    using (Stream input = new FileStream(InputFilename, FileMode.Open, FileAccess.Read))
    {
        BAMParser headerParser = new BAMParser();
        header = headerParser.GetHeader(input);

        // Every "RG" record contributes the value of its "ID" tag.
        sampleNames = header.RecordFields
            .Where(field => field.Typecode == "RG")
            .Select(readGroup => readGroup.Tags.First(tag => tag.Tag == "ID").Value)
            .ToList();

        NumberOfReadGroups = sampleNames.Count;
        NumerOfSequences = header.ReferenceSequences.Count;
    }

    string summary = "Processing file with " + sampleNames.Count.ToString()
        + " samples and " + NumerOfSequences.ToString() + " reference sequences.";
    Console.WriteLine(summary);
}
/// <summary>
/// Writes the aligned sequence to the specified stream after refreshing the
/// reference sequence ranges from the given header.
/// </summary>
/// <param name="header">Header from SAM object.</param>
/// <param name="alignedSeq">SAMAlignedSequence object.</param>
/// <param name="writer">Stream to write.</param>
public void WriteAlignedSequence(SAMAlignmentHeader header, SAMAlignedSequence alignedSeq, Stream writer)
{
    // Report the first missing argument, checked in declaration order.
    string missingArgument =
        header == null ? "header" :
        alignedSeq == null ? "alignedSeq" :
        writer == null ? "writer" :
        null;

    if (missingArgument != null)
    {
        throw new ArgumentNullException(missingArgument);
    }

    var ranges = header.GetReferenceSequenceRanges();
    _refSequences = SortSequenceRanges(ranges);

    WriteAlignedSequence(alignedSeq, writer);
}
/// <summary>
/// Lazily enumerates the sequences of a BAM file that belong to the given
/// reference sequence and lie in the requested range, using the BAM index file.
/// Sequences whose reference name is "*" are returned as well.
/// </summary>
/// <param name="fileName">Path of the BAM file.</param>
/// <param name="refSeqName">Name of the reference sequence.</param>
/// <param name="start">Start of the range.</param>
/// <param name="end">End of the range.</param>
/// <returns>The matching sequences; because this is an iterator, nothing is
/// validated or opened until enumeration starts.</returns>
public IEnumerable<CompactSAMSequence> ParseRangeAsEnumerableSequences(string fileName, string refSeqName, int start = 0, int end = Int32.MaxValue)
{
    if (refSeqName == null)
    {
        throw new ArgumentNullException("refSeqName");
    }

    using (FileStream bamStream = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        string indexPath = getBAMIndexFileName(fileName);

        using (BAMIndexFile indexFile = new BAMIndexFile(indexPath, FileMode.Open, FileAccess.Read))
        {
            readStream = bamStream;
            if (readStream == null || readStream.Length == 0)
            {
                throw new FileFormatException(Properties.Resource.BAM_InvalidBAMFile);
            }

            ValidateReader();

            // The returned header is not needed here; the call is kept because
            // the reference name list is consulted right after it.
            GetHeader();

            // Verify whether there are any reads related to the chromosome.
            int refSeqIndex = refSeqNames.IndexOf(refSeqName);
            if (refSeqIndex < 0)
            {
                throw new ArgumentException(
                    string.Format(CultureInfo.InvariantCulture, Properties.Resource.BAM_RefSeqNotFound, refSeqName),
                    "refSeqName");
            }

            BAMIndex indexInfo = indexFile.Read();
            BAMReferenceIndexes referenceIndex = indexInfo.RefIndexes[refSeqIndex];
            IList<Chunk> chunks = GetChunks(referenceIndex, start, end);

            foreach (var candidate in EnumerateAlignedSequences(chunks))
            {
                if (candidate == null)
                {
                    continue;
                }

                bool hasNoReference = candidate.RName == "*";
                if (hasNoReference || (candidate.Pos >= (start - 1) && candidate.RefEndPos < end))
                {
                    yield return candidate;
                }
            }

            readStream = null;
        }
    }
}
// Validates the alignment and returns it as a SequenceAlignmentMap.
// A SequenceAlignmentMap is returned as is after its header has been validated;
// any other alignment is converted using the SAM header and the aligned
// sequence headers stored in its metadata. In both cases _refSequences is
// refreshed with the sorted reference ranges of the header.
private SequenceAlignmentMap ValidateAlignment(ISequenceAlignment sequenceAlignment)
{
    SequenceAlignmentMap result = sequenceAlignment as SequenceAlignmentMap;
    bool alreadyMap = result != null;

    if (alreadyMap)
    {
        ValidateAlignmentHeader(result.Header);
    }
    else
    {
        SAMAlignmentHeader metadataHeader =
            sequenceAlignment.Metadata[Helper.SAMAlignmentHeaderKey] as SAMAlignmentHeader;
        if (metadataHeader == null)
        {
            throw new ArgumentException(Resource.SAMAlignmentHeaderNotFound);
        }

        ValidateAlignmentHeader(metadataHeader);
        result = new SequenceAlignmentMap(metadataHeader);
    }

    _refSequences = SortSequenceRanges(result.Header.GetReferenceSequenceRanges());

    if (alreadyMap)
    {
        return result;
    }

    // Convert every aligned sequence using the SAM header kept in its metadata.
    foreach (IAlignedSequence source in sequenceAlignment.AlignedSequences)
    {
        SAMAlignedSequenceHeader sequenceHeader =
            source.Metadata[Helper.SAMAlignedSequenceHeaderKey] as SAMAlignedSequenceHeader;
        if (sequenceHeader == null)
        {
            throw new ArgumentException(Resource.SAMAlignedSequenceHeaderNotFound);
        }

        SAMAlignedSequence converted = new SAMAlignedSequence(sequenceHeader);
        converted.QuerySequence = source.Sequences[0];
        result.QuerySequences.Add(converted);
    }

    return result;
}
/// <summary>
/// Updates the header with the reference names found in the reads of the input file.
/// Used when no ReferenceNamesAndLength file is specified and the header has no
/// reference sequences: each distinct reference name (compared case-insensitively,
/// "*" excluded) is added in order of first appearance with a length of 0.
/// </summary>
/// <param name="header">SAM alignment header.</param>
private void UpdateReferenceInformationFromReads(SAMAlignmentHeader header)
{
    // The list keeps the order of first appearance; the set makes the
    // "already seen" check constant time instead of a linear scan per read.
    List<string> refSeqNames = new List<string>();
    HashSet<string> seenNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    using (StreamReader textReader = new StreamReader(InputFilePath))
    {
        foreach (SAMAlignedSequence alignedSeq in GetAlignedSequence(textReader))
        {
            string referenceName = alignedSeq.RName;

            // "*" marks a read without a reference; Add returns false for duplicates.
            if (!referenceName.Equals("*", StringComparison.OrdinalIgnoreCase)
                && seenNames.Add(referenceName))
            {
                refSeqNames.Add(referenceName);
            }
        }
    }

    foreach (string refname in refSeqNames)
    {
        header.ReferenceSequences.Add(new ReferenceSequenceInfo(refname, 0));
    }
}
/// <summary>
/// Compares the header record fields of two sequence alignments.
/// Every record field of the expected header must be present at the same
/// position in the actual header with the same type code, and every expected
/// tag must be matched by an actual tag with the same key and value.
/// Strings are compared ordinally. A mismatch is logged to the console and
/// the application log.
/// </summary>
/// <param name="actualAlignment">Actual sequence alignment object</param>
/// <param name="expectedAlignment">Expected sequence alignment object</param>
/// <returns>
/// True when the actual header contains everything the expected header
/// declares; false otherwise, including when the actual header has fewer
/// record fields or tags than expected.
/// </returns>
bool CompareSequencedAlignmentHeader(SequenceAlignmentMap actualAlignment,
    SequenceAlignmentMap expectedAlignment)
{
    const string mismatchMessage = "SAM Parser BVT : Sequence alignment header does not match";

    IList<SAMRecordField> actualFields = actualAlignment.Header.RecordFields;
    IList<SAMRecordField> expectedFields = expectedAlignment.Header.RecordFields;

    // A shorter actual header can never match; checking the counts up front
    // avoids indexing past the end of the actual lists.
    bool matches = actualFields.Count >= expectedFields.Count;

    for (int index = 0; matches && index < expectedFields.Count; index++)
    {
        SAMRecordField expectedField = expectedFields[index];
        SAMRecordField actualField = actualFields[index];

        matches = string.Equals(expectedField.Typecode, actualField.Typecode, StringComparison.Ordinal)
            && actualField.Tags.Count >= expectedField.Tags.Count;

        for (int tags = 0; matches && tags < expectedField.Tags.Count; tags++)
        {
            matches = string.Equals(expectedField.Tags[tags].Tag, actualField.Tags[tags].Tag, StringComparison.Ordinal)
                && string.Equals(expectedField.Tags[tags].Value, actualField.Tags[tags].Value, StringComparison.Ordinal);
        }
    }

    if (!matches)
    {
        Console.WriteLine(mismatchMessage);
        ApplicationLog.WriteLine(mismatchMessage);
    }

    return matches;
}
/// <summary>
/// Merge multiple sorted alignments.
/// SAMUtil.exe out.bam in1.bam in2.bam
/// </summary>
/// <remarks>
/// FilePaths[0] is the output file; FilePaths[1..] are the input BAM files,
/// which are parsed and sorted in parallel. The header of the merged file is
/// taken from HeaderFile when one is specified, otherwise from the first
/// input file. The merged data is written uncompressed to a temporary file,
/// compressed into the output file, and the temporary file is always removed.
/// Failures raised while processing the inputs are thrown from inside
/// Parallel.For, which reports exceptions from loop bodies wrapped in an
/// AggregateException.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// FilePaths is null or contains fewer than three entries.
/// </exception>
public void DoMerge()
{
    if (FilePaths == null)
    {
        throw new InvalidOperationException("FilePath");
    }

    if (FilePaths.Length < 3)
    {
        throw new InvalidOperationException(Resources.MergeHelp);
    }

    IList<IList<BAMSortedIndex>> sortedIndexes = new List<IList<BAMSortedIndex>>();
    IList<SequenceAlignmentMap> sequenceAlignmentMaps = new List<SequenceAlignmentMap>();

    Parallel.For(1, FilePaths.Length, (int index) =>
    {
        BAMParser parser = new BAMParser();
        SequenceAlignmentMap map;
        try
        {
            map = parser.Parse(FilePaths[index]);
        }
        catch (Exception ex)
        {
            // Keep the original failure as inner exception for diagnosis.
            throw new InvalidOperationException(Resources.InvalidBAMFile, ex);
        }

        if (map == null)
        {
            throw new InvalidOperationException(Resources.EmptyFile);
        }

        // The first input file decides which header the merged file gets.
        if (index == 1)
        {
            if (string.IsNullOrEmpty(HeaderFile))
            {
                if (map.Header.RecordFields.Count == 0)
                {
                    throw new InvalidOperationException(Resources.HeaderMissing);
                }

                _header = map.Header;
            }
            else
            {
                SAMParser headerParser = new SAMParser();
                SequenceAlignmentMap head;
                try
                {
                    head = headerParser.Parse(HeaderFile);
                }
                catch (Exception ex)
                {
                    throw new InvalidOperationException(Resources.IncorrectHeaderFile, ex);
                }

                if (head == null)
                {
                    throw new InvalidOperationException(Resources.EmptyFile);
                }

                _header = head.Header;
            }
        }

        IList<BAMSortedIndex> sortedIndex = Sort(
            map,
            SortByReadName ? BAMSortByFields.ReadNames : BAMSortByFields.ChromosomeCoordinates);

        // Both lists are filled under the same lock so that the entry at a
        // given position of one list belongs to the same input file as the
        // entry at that position of the other.
        lock (sortedIndexes)
        {
            sortedIndexes.Add(sortedIndex);
            sequenceAlignmentMaps.Add(map);
        }
    });

    string filePath = Path.GetTempFileName();
    try
    {
        using (FileStream fstemp = new FileStream(filePath, FileMode.Create, FileAccess.ReadWrite))
        {
            BAMFormatter formatter = new BAMFormatter();
            formatter.WriteHeader(_header, fstemp);

            if (SortByReadName)
            {
                IList<BAMSortedIndex> sortedIndex = sortedIndexes.Select(a => a.First()).ToList();
                WriteMergeFileSortedByReadName(sortedIndex, fstemp, formatter, sequenceAlignmentMaps);
            }
            else
            {
                WriteMergeFile(sortedIndexes, fstemp, formatter, sequenceAlignmentMaps);
            }

            using (FileStream fsoutput = new FileStream(FilePaths[0], FileMode.Create, FileAccess.Write))
            {
                fstemp.Seek(0, SeekOrigin.Begin);
                formatter.CompressBAMFile(fstemp, fsoutput);
            }
        }
    }
    finally
    {
        // Remove the temporary file even when writing or compressing failed.
        File.Delete(filePath);
    }
}
// Gets new header with sorted SQ Fields.
// If SQ fields are already sorted then returns the same header.
// When canChangeOtherTagPos is true, all non-SQ fields are written first (in
// their original order) followed by the sorted SQ fields. When it is false,
// every non-SQ field keeps its position and the SQ fields are redistributed,
// in sorted order, over the positions the SQ fields originally occupied.
// Comments are copied to the new header once.
private SAMAlignmentHeader GetHeaderWithSortedSQFields(SAMAlignmentHeader header, bool canChangeOtherTagPos)
{
    if (IsSortedByChromosomeNames(GetSQHeaders(header.RecordFields)))
    {
        return header;
    }

    SAMAlignmentHeader newHeader = new SAMAlignmentHeader();

    if (canChangeOtherTagPos)
    {
        List<SAMRecordField> sqHeaders = new List<SAMRecordField>();

        foreach (SAMRecordField field in header.RecordFields)
        {
            if (field.Typecode.Equals("SQ"))
            {
                sqHeaders.Add(field);
            }
            else
            {
                newHeader.RecordFields.Add(field);
            }
        }

        // Sort and append only after ALL fields have been collected; doing
        // this per field would append the SQ fields again on every iteration.
        sqHeaders.Sort(CompareByChromosomeName);
        foreach (SAMRecordField sqfield in sqHeaders)
        {
            newHeader.RecordFields.Add(sqfield);
        }
    }
    else
    {
        // Maps each SQ field (ordered by chromosome name) to its original position.
        Bio.Util.SortedList<SAMRecordField, int> map =
            new Bio.Util.SortedList<SAMRecordField, int>(
                new ComparisonWrapper<SAMRecordField>(CompareByChromosomeName));

        for (int i = 0; i < header.RecordFields.Count; i++)
        {
            SAMRecordField field = header.RecordFields[i];
            if (field.Typecode.Equals("SQ"))
            {
                map.Add(field, i);
            }

            newHeader.RecordFields.Add(field);
        }

        // Fill the original SQ positions, in ascending order, with the sorted SQ fields.
        int sortedPosition = 0;
        foreach (int index in map.Values.OrderBy(I => I))
        {
            newHeader.RecordFields[index] = map.Keys[sortedPosition++];
        }
    }

    foreach (string str in header.Comments)
    {
        newHeader.Comments.Add(str);
    }

    return newHeader;
}
/// <summary>
/// Run ConPADE on each contig of the input BAM file.
/// For every contig the method walks over all positions, accumulates a
/// log-likelihood per tested ploidy and then writes the most likely ploidy,
/// the SNPs that pass the quality threshold and read statistics. Output goes
/// either to one set of files per contig (splitContigs) or to one global set
/// of files named after the input file.
/// </summary>
/// <param name="bamName">Name of the input BAM file.</param>
public void RunFile(string bamName)
{
    // Current implementation requires that minimum ploidy be 1
    int min_ploidy = 1;
    int number_of_ploidies = max_ploidy - min_ploidy + 1;

    // Set nucleotide proportions (genotypes)
    double[][][] nuc_props = Nuc_Props(min_ploidy, number_of_ploidies);

    // Set dosage probabilities
    double SNP_density = (double)1 / snpDens;
    double no_SNP_prob = Math.Log((1 - SNP_density) / 2);
    double[][] dose_probs = Dose_Probs(min_ploidy, number_of_ploidies, SNP_density, no_SNP_prob);

    // Set HiSeq error model
    double[, , , ,] log_probs = Error_Probs();

    // Set substitution model
    double[, ,] log_subst_probs = Subst_Probs();

    // Set SNP calling probability
    // (SNPthres is converted from a Phred-style score to a natural-log probability;
    // the inverse conversion is applied when SNPs are written below.)
    double log_SNP_thres = SNPthres * Math.Log(10) / -10;

    Stopwatch clock = new Stopwatch();
    Console.WriteLine("Program started at {0}\n", DateTime.Now);

    // NOTE(review): bam_stream is never closed or disposed directly in this method;
    // presumably parser.Dispose() at the end releases it - confirm. Nothing is
    // released if an exception is thrown before that point.
    Stream bam_stream = new FileStream(bamName, FileMode.Open, FileAccess.Read);
    BAMParser parser = new BAMParser();
    SAMAlignmentHeader header = parser.GetHeader(bam_stream);

    // Base name used for all output files.
    string temp = Path.GetFileNameWithoutExtension(bamName);

    // Find first valid alignment in BAM file
    // NOTE(review): this loop only ends once a valid alignment is returned; if
    // GetAlignedSequence keeps returning null at the end of the file, it would
    // never terminate for a file without valid alignments - confirm.
    SAMAlignedSequence next_alignment = parser.GetAlignedSequence(true);
    while (next_alignment == null || next_alignment.RName == "*" || next_alignment.IsDummyRead)
    {
        next_alignment = parser.GetAlignedSequence(true);
    }

    TextWriter writer_log_like = null;
    TextWriter writer_SNP = null;
    TextWriter writer_ploidy = null;
    TextWriter writer_reads = null;

    // Create global output files and write headers.
    // (No log-likelihood file is created in this mode.)
    if (!splitContigs)
    {
        string SNP_file = temp + "_SNP.txt";
        writer_SNP = new StreamWriter(SNP_file);
        writer_SNP.WriteLine("Contig\tPosition\tAlleles\tCounts\tDosage\tPhredQuality");

        string ploidy_file = temp + "_ploidy.txt";
        writer_ploidy = new StreamWriter(ploidy_file);
        writer_ploidy.Write("Contig\tBestPloidy");
        for (int i = 0; i < number_of_ploidies; i++)
        {
            writer_ploidy.Write("\tlogLike_M{0}", i + min_ploidy);
        }
        writer_ploidy.WriteLine("");

        string reads_file = temp + "_readStats.txt";
        writer_reads = new StreamWriter(reads_file);
        writer_reads.WriteLine("Contig\tAlignedReads\tAlignedBases\tUsedReads\tUsedBases");
    }

    // Run over each contig in input BAM file.
    // contig_ind is only ever advanced, so contigs are looked up in the header
    // in the order in which they are encountered in the reads.
    int contig_ind = -1;
    while (next_alignment != null && next_alignment.RName != "*" && !next_alignment.IsDummyRead)
    {
        string contig_name = next_alignment.RName;
        Console.WriteLine("Started contig {0} at {1}", contig_name, DateTime.Now);
        clock.Restart();

        #region Variables and file handles for current contig
        long number_of_aligned_reads = 0;
        long number_of_aligned_base_pairs = 0;
        long number_of_used_reads = 0;
        long number_of_used_base_pairs = 0;

        // Create individual output files for the current contig.
        if (splitContigs)
        {
            string name = temp + "_" + contig_name;
            string log_like_file = name + "_log_likelihoods.txt";
            writer_log_like = new StreamWriter(log_like_file);
            string SNP_file = name + "_SNP.txt";
            writer_SNP = new StreamWriter(SNP_file);
            string ploidy_file = name + "_ploidy.txt";
            writer_ploidy = new StreamWriter(ploidy_file);
            string reads_file = name + "_readStats.txt";
            writer_reads = new StreamWriter(reads_file);
        }

        // One accumulated log-likelihood per tested ploidy.
        double[] global_log_like = new double[number_of_ploidies];

        // Advance to the header entry of the current contig.
        // NOTE(review): there is no bounds check here; a contig name that does not
        // appear at or after the current header position would run past the end
        // of ReferenceSequences - confirm that the input guarantees the order.
        while (header.ReferenceSequences[++contig_ind].Name != contig_name)
        {
            ;
        }
        long contig_length = header.ReferenceSequences[contig_ind].Length;

        // Create a queue to include all reads that overlap with a given position.
        Queue <Padded_Read> read_queue = new Queue <Padded_Read>();

        // Create a queue to include best doses for each tested position.
        // (Its initial capacity is the contig length.)
        Queue <Best_Dose> dose_queue = new Queue <Best_Dose>((int)contig_length);
        #endregion Variables and file handles for current contig

        // positions_to_compute is incremented below but not read anywhere in this method.
        int positions_to_compute = 0;
        long current_position = 0;

        #region Run over every position in contig
        while (current_position < contig_length)
        {
            // Progress message every 1,000,000 positions.
            if ((current_position % 1000000) == 0 && current_position != 0)
            {
                Console.WriteLine("At position {0} of {1}", current_position + 1, contig_length);
            }

            // Search for reads starting at current position.
            // (next_alignment and the read counters are passed by reference and
            // may be updated by the call.)
            Search_Reads(parser, ref next_alignment, contig_name, ref number_of_aligned_reads, ref number_of_aligned_base_pairs, ref number_of_used_reads, ref number_of_used_base_pairs, read_queue, current_position);

            if (read_queue.Count > 0)
            {
                positions_to_compute++;

                // Extract information from each read in queue.
                byte[] obs_nucs;
                byte[] is_GG;
                bool[] reverse;
                int[] quality_scores;
                int[] neigh_quality_scores;
                int[] scores;
                int[] counts;
                int k;
                Extract_Read_Info(read_queue, current_position, out obs_nucs, out is_GG, out reverse, out quality_scores, out neigh_quality_scores, out scores, out counts, out k);

                // Find two most abundant nucleotides for this position.
                byte nuc_one;
                byte nuc_two;
                Get_Two_Nucs(scores, out nuc_one, out nuc_two);

                // Calculate Pr(obs|allele1) and Pr(obs|allele2).
                double[][] log_nuc_probs = Obs_Probs(log_probs, log_subst_probs, obs_nucs, is_GG, reverse, quality_scores, neigh_quality_scores, counts, k, nuc_one, nuc_two);

                // Calculate log_likelihoods of genotypes for current position.
                double[][] log_likelihoods = Log_Likelihoods(min_ploidy, max_ploidy, log_nuc_probs, nuc_props);

                // Calculate log_likelihood of each ploidy and keep most likely allele dosage.
                Global_Likelihood_Keep_Dose(min_ploidy, number_of_ploidies, dose_probs, global_log_like, dose_queue, current_position, counts, nuc_one, nuc_two, log_likelihoods);
            }

            // Remove finished reads from queue. Finished reads no longer overlap with current position.
            // Only the head of the queue is inspected, so removal stops at the
            // first read that still overlaps.
            Padded_Read read_to_remove;
            if (read_queue.Count > 0)
            {
                read_to_remove = read_queue.First();
            }
            else
            {
                read_to_remove = null;
            }
            while (read_to_remove != null && (read_to_remove.alignment.Pos + read_to_remove.alignment_length - 2) < current_position)
            {
                read_queue.Dequeue();
                if (read_queue.Count > 0)
                {
                    read_to_remove = read_queue.First();
                }
                else
                {
                    read_to_remove = null;
                }
            }

            ++current_position;
        }
        #endregion Run over every position in contig

        // Output log_likelihoods.
        // best_log_like ends up as the index of the largest accumulated
        // log-likelihood (the first one on ties).
        int best_log_like = 0;
        for (int i = 0; i < number_of_ploidies; i++)
        {
            if (global_log_like[i] > global_log_like[best_log_like])
            {
                best_log_like = i;
            }
            if (splitContigs)
            {
                writer_log_like.WriteLine("Ploidy {0} - log_likelihood {1}", i + min_ploidy, global_log_like[i]);
            }
        }

        // Output most likely ploidy.
        int best_ploidy = best_log_like + min_ploidy;
        if (splitContigs)
        {
            writer_ploidy.WriteLine(best_ploidy);
        }
        else
        {
            writer_ploidy.Write("{0}\t{1}", contig_name, best_ploidy);
            for (int i = 0; i < number_of_ploidies; i++)
            {
                writer_ploidy.Write("\t{0}", global_log_like[i]);
            }
            writer_ploidy.WriteLine("");
        }

        // Output SNPs.
        if (splitContigs)
        {
            writer_SNP.WriteLine("Position\tAlleles\tCounts\tDosage\tPhredQuality");
        }
        char[] nuc_chars = new char[4] { 'A', 'C', 'G', 'T' };
        foreach (Best_Dose cur_doses in dose_queue)
        {
            // Only the values stored for the best ploidy are considered.
            double cur_SNP_posterior = cur_doses.SNP_posterior[best_log_like];
            if (cur_SNP_posterior <= log_SNP_thres)
            {
                int cur_best_dose = cur_doses.best_dose[best_log_like];

                // A dose of 0 or of the full ploidy is not reported as a SNP.
                if (cur_best_dose != best_ploidy && cur_best_dose != 0)
                {
                    if (splitContigs)
                    {
                        writer_SNP.WriteLine("{0}\t{1}|{2}\t{3}|{4}\t{5}\t{6}", cur_doses.position + 1, nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two], cur_doses.count_one, cur_doses.count_two, cur_best_dose, -10 * cur_SNP_posterior / Math.Log(10));
                    }
                    else
                    {
                        writer_SNP.WriteLine("{0}\t{1}\t{2}|{3}\t{4}|{5}\t{6}\t{7}", contig_name, cur_doses.position + 1, nuc_chars[cur_doses.nuc_one], nuc_chars[cur_doses.nuc_two], cur_doses.count_one, cur_doses.count_two, cur_best_dose, -10 * cur_SNP_posterior / Math.Log(10));
                    }
                }
            }
        }

        // Output read statistics.
        if (splitContigs)
        {
            writer_reads.WriteLine("\nNumber of aligned reads: {0}", number_of_aligned_reads);
            writer_reads.WriteLine("Number of aligned base pairs: {0}", number_of_aligned_base_pairs);
            writer_reads.WriteLine("\nNumber of used reads: {0}", number_of_used_reads);
            writer_reads.WriteLine("Number of used base pairs: {0}", number_of_used_base_pairs);
        }
        else
        {
            writer_reads.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", contig_name, number_of_aligned_reads, number_of_aligned_base_pairs, number_of_used_reads, number_of_used_base_pairs);
        }

        // Per-contig files are closed as soon as the contig is finished.
        if (splitContigs)
        {
            writer_log_like.Close();
            writer_SNP.Close();
            writer_ploidy.Close();
            writer_reads.Close();
        }

        clock.Stop();
        Console.WriteLine("Time to run contig: {0} s\n", (double)clock.ElapsedMilliseconds / 1000);
    }

    // Global files are closed once all contigs have been processed.
    if (!splitContigs)
    {
        writer_SNP.Close();
        writer_ploidy.Close();
        writer_reads.Close();
    }

    parser.Dispose();
    Console.WriteLine("Finished at {0}\n", DateTime.Now);
}
/// <summary>
/// Opens the file named by _fileName and lazily yields aligned sequences from it.
/// When ChromosomeToGet is set, the non-null results of
/// ParseRangeAsEnumerableSequences(_fileName, ChromosomeToGet) are yielded.
/// Otherwise ValidateReader() and GetHeader() are called and the non-null results of
/// GetAlignedSequence() are yielded until IsEOF() returns true.
/// </summary>
/// <returns>A lazily evaluated sequence of the non-null aligned sequences.</returns>
/// <exception cref="ArgumentNullException">_fileName is null, empty or white space.</exception>
/// <exception cref="FileFormatException">The opened file has a length of zero.</exception>
public IEnumerable<CompactSAMSequence> Parse()
{
    // This is an iterator method, so the checks below run on first enumeration,
    // not when Parse() itself is called.
    if (string.IsNullOrWhiteSpace(_fileName))
    {
        throw new ArgumentNullException("fileName");
    }

    // The stream is stored in the readStream field and is disposed when the
    // using block is left.
    using (readStream = new FileStream(_fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        Stream bamStream = readStream;
        bool hasContent = bamStream != null && bamStream.Length != 0;
        if (!hasContent)
        {
            throw new FileFormatException(Properties.Resource.BAM_InvalidBAMFile);
        }

        if (String.IsNullOrEmpty(ChromosomeToGet))
        {
            // No range restriction requested: walk the whole file.
            readStream = bamStream;
            ValidateReader();
            SAMAlignmentHeader fileHeader = GetHeader();

            // Constructed exactly as before; the instance is not used further in this method.
            SequenceAlignmentMap alignmentMap = null;
            if (alignmentMap == null)
            {
                alignmentMap = new SequenceAlignmentMap(fileHeader);
            }

            while (!IsEOF())
            {
#if WANT_OLD_VERSION
                SAMAlignedSequence alignment = GetAlignedSequence(0, int.MaxValue);
#else
                var alignment = GetAlignedSequence();
#endif
                if (alignment != null)
                {
#if WANT_OLD_VERSION
                    // Make a new Sequence.
                    ISequence strippedOfInfo = null;
                    try
                    {
                        var syms = alignment.QuerySequence.ToArray();
                        var alpha = Alphabets.AutoDetectAlphabet(syms, 0, syms.Length, null);
                        strippedOfInfo = new Sequence(alpha, alignment.QuerySequence.ToArray());
                        strippedOfInfo = alignment;
                    }
                    catch (ArgumentOutOfRangeException exception)
                    {
                        Debug.Write("Could not convert sequence: " + exception.Message);
                    }

                    if (strippedOfInfo != null)
                    {
                        yield return strippedOfInfo;
                    }
#else
                    yield return alignment;
#endif
                }

                alignment = null;
            }
        }
        else
        {
            // Only the requested chromosome: delegate to the range parser and
            // skip any null entries it produces.
            // TODO (carried over from the original): this path is inefficient
            // because the sequence is parsed multiple times.
            foreach (var rangeSequence in ParseRangeAsEnumerableSequences(_fileName, ChromosomeToGet))
            {
                if (rangeSequence == null)
                {
                    continue;
                }

                yield return rangeSequence;
            }
        }
    }
}
/// <summary>
/// Reads the SAM file at InputFilePath and writes its header and, unless
/// HeaderOnly is set, its aligned sequences through WriteHeader and
/// WriteAlignedSequence. Afterwards the BAM output streams are flushed and,
/// when compressed BAM output is requested, the uncompressed stream is
/// compressed into bamCompressedOutStream.
/// </summary>
/// <exception cref="InvalidOperationException">
/// SAMParser.ParseSAMHeader threw while reading the header; the original
/// exception is attached as the inner exception.
/// </exception>
private void ConvertFromSAMTOBAM()
{
    SAMAlignmentHeader samHeader;
    try
    {
        samHeader = SAMParser.ParseSAMHeader(InputFilePath);
    }
    catch (Exception parseFailure)
    {
        throw new InvalidOperationException(Resources.InvalidSAMFile, parseFailure);
    }

    bool headerMissing = samHeader == null;
    if (headerMissing)
    {
        Console.Error.WriteLine("Warning: SAM file doesn't contian header");
    }

    if (!HeaderOnly)
    {
        // A missing header is replaced by an empty one so the records can still be written.
        if (headerMissing)
        {
            samHeader = new SAMAlignmentHeader();
        }

        if (!string.IsNullOrEmpty(Library))
        {
            // Keep the header records whose type code is "RG" (compared after upper-casing).
            rgRecFields = samHeader.RecordFields
                .Where(field => field.Typecode.ToUpper().Equals("RG"))
                .ToList();
        }

        // Reference names/lengths come from the external file when one is given;
        // otherwise they are derived from the reads only if the header has none.
        if (!string.IsNullOrEmpty(ReferenceNamesAndLength))
        {
            this.UpdateReferenceInformationFromFile(samHeader);
        }
        else if (samHeader.ReferenceSequences.Count == 0)
        {
            this.UpdateReferenceInformationFromReads(samHeader);
        }

        WriteHeader(samHeader);

        using (StreamReader samReader = new StreamReader(InputFilePath))
        {
            foreach (SAMAlignedSequence record in GetAlignedSequence(samReader))
            {
                WriteAlignedSequence(samHeader, record);
            }
        }
    }
    else if (!headerMissing)
    {
        // Header-only mode: nothing is written when the file had no header.
        WriteHeader(samHeader);
    }

    if (UnCompressedBAM)
    {
        bamUncompressedOutStream.Flush();
        if (writer != null)
        {
            DisplayBAMContent(bamUncompressedOutStream);
        }
    }

    if (BAMOutput && !UnCompressedBAM)
    {
        // Rewind the uncompressed stream before compressing it.
        bamUncompressedOutStream.Flush();
        bamUncompressedOutStream.Seek(0, SeekOrigin.Begin);
        bamformatter.CompressBAMFile(bamUncompressedOutStream, bamCompressedOutStream);
        bamCompressedOutStream.Flush();
        if (writer != null)
        {
            DisplayBAMContent(bamCompressedOutStream);
        }
    }
}
/// <summary>
/// Extract/print all or sub alignments in SAM or BAM format.
/// By default, this command assumes the file on the command line is in
/// BAM format and it prints the alignments in SAM.
/// SAMUtil.exe view in.bam
/// </summary>
/// <remarks>
/// Close() is always called when the method exits, whether or not it succeeds.
/// The BAM output streams are flushed/compressed only on the SAM-input path.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// InputFilePath is null or empty, the BAM or SAM header could not be read
/// (the underlying exception is attached as the inner exception), or a SAM
/// input file has no header.
/// </exception>
public void ViewResult()
{
    try
    {
        if (string.IsNullOrEmpty(InputFilePath))
        {
            throw new InvalidOperationException("Input File Not specified");
        }

        if (!string.IsNullOrEmpty(Region))
        {
            StringToRegionConverter();
        }

        Initialize();
        SAMAlignmentHeader header = null;

        if (!SAMInput)
        {
            // The input stream is created here, so it is released here as well;
            // previously the file handle was never disposed, including on failure.
            using (Stream stream = new FileStream(InputFilePath, FileMode.Open, FileAccess.Read))
            {
                try
                {
                    header = bamparser.GetHeader(stream);
                }
                catch (Exception ex)
                {
                    // Keep the root cause available to callers, as ConvertFromSAMTOBAM does.
                    throw new InvalidOperationException(Resources.InvalidBAMFile, ex);
                }

                WriteHeader(header);

                if (!HeaderOnly)
                {
                    if (!string.IsNullOrEmpty(Library))
                    {
                        rgRecFields = header.RecordFields.Where(R => R.Typecode.ToUpper().Equals("RG")).ToList();
                    }

                    // The sequences are consumed inside the using block, while the stream is still open.
                    foreach (SAMAlignedSequence alignedSequence in GetAlignedSequence(stream))
                    {
                        WriteAlignedSequence(header, alignedSequence);
                    }
                }
            }
        }
        else
        {
            try
            {
                header = SAMParser.ParseSAMHeader(InputFilePath);
            }
            catch (Exception ex)
            {
                // Keep the root cause available to callers, as ConvertFromSAMTOBAM does.
                throw new InvalidOperationException(Resources.InvalidSAMFile, ex);
            }

            if (header == null)
            {
                throw new InvalidOperationException("SAM file doesn't contian header");
            }

            WriteHeader(header);

            if (!HeaderOnly)
            {
                if (!string.IsNullOrEmpty(Library))
                {
                    rgRecFields = header.RecordFields.Where(R => R.Typecode.ToUpper().Equals("RG")).ToList();
                }

                using (StreamReader textReader = new StreamReader(InputFilePath))
                {
                    foreach (SAMAlignedSequence alignedSeq in GetAlignedSequence(textReader))
                    {
                        WriteAlignedSequence(header, alignedSeq);
                    }
                }
            }

            if (UnCompressedBAM)
            {
                bamUncompressedOutStream.Flush();
                if (writer != null)
                {
                    DisplayBAMContent(bamUncompressedOutStream);
                }
            }

            if (BAMOutput && !UnCompressedBAM)
            {
                // Rewind the uncompressed stream before compressing it.
                bamUncompressedOutStream.Flush();
                bamUncompressedOutStream.Seek(0, SeekOrigin.Begin);
                bamformatter.CompressBAMFile(bamUncompressedOutStream, bamCompressedOutStream);
                bamCompressedOutStream.Flush();
                if (writer != null)
                {
                    DisplayBAMContent(bamCompressedOutStream);
                }
            }
        }
    }
    finally
    {
        Close();
    }
}