/// <summary>
/// Prints the regions covered by orphan reads found in an alignment file,
/// then prints the "hot spots" obtained by merging the overlapping regions.
/// </summary>
/// <param name="filename">
/// Path of the alignment file. It is parsed with SAMParser when SAMInput is
/// set, otherwise with BAMParser.
/// </param>
private void DisplayOrphans(string filename)
{
    SequenceAlignmentMap alignmentMapobj = null;
    if (SAMInput)
    {
        SAMParser samParser = new SAMParser();
        alignmentMapobj = samParser.Parse(filename);
    }
    else
    {
        BAMParser bamParser = new BAMParser();
        alignmentMapobj = bamParser.Parse(filename);
    }

    // NOTE(review): mean and deviation are passed as 0, presumably because
    // only the orphan classification is consumed here - confirm.
    IList<PairedRead> pairedReads = alignmentMapobj.GetPairedReads(0, 0);

    // Materialize the filter once; it is both counted and iterated below,
    // and a deferred query would re-scan pairedReads every time.
    List<PairedRead> orphans = pairedReads
        .Where(PR => PR.PairedType == PairedReadType.Orphan)
        .ToList();

    if (orphans.Count == 0)
    {
        Console.WriteLine("No Orphans to display");
    }

    // One region per orphan pair, taken from the first read of the pair.
    List<ISequenceRange> orphanRegions = new List<ISequenceRange>(orphans.Count);
    foreach (PairedRead orphanRead in orphans)
    {
        orphanRegions.Add(GetRegion(orphanRead.Read1));
    }

    // Group the regions and display them.
    SequenceRangeGrouping rangeGroup = new SequenceRangeGrouping(orphanRegions);
    if (!rangeGroup.GroupIDs.Any())
    {
        Console.Write("\r\nNo Orphan reads to display");
    }
    else
    {
        Console.Write("Region of Orphan reads:");
        DisplaySequenceRange(rangeGroup);
    }

    // Overlapping orphan regions merged together form the hot spots.
    SequenceRangeGrouping mergedRegions = rangeGroup.MergeOverlaps();
    if (!mergedRegions.GroupIDs.Any())
    {
        Console.Write("\r\nNo hot spots to display");
    }
    else
    {
        Console.Write("\r\nChromosomal hot spot:");
        DisplaySequenceRange(mergedRegions);
    }
}
/// <summary>
/// Prints the regions of length-anomaly reads and of orphan reads found in an
/// alignment file, then prints the intersection of the two groupings as
/// chromosomal hot spots.
/// </summary>
/// <param name="filename">
/// Path of the alignment file. It is parsed with SAMParser when SAMInput is
/// set, otherwise with BAMParser.
/// </param>
/// <param name="mean">
/// Mean value passed to GetPairedReads. When this or <paramref name="deviation"/>
/// is -1 (the default), the parameterless GetPairedReads overload is used instead.
/// </param>
/// <param name="deviation">
/// Standard deviation passed to GetPairedReads; -1 has the same meaning as for
/// <paramref name="mean"/>.
/// </param>
private void IdentifyLentghAnamolies(string filename, float mean = -1, float deviation = -1)
{
    // -1 is the "not supplied" sentinel for either statistic.
    bool calculateMeanNdeviation = mean == -1 || deviation == -1;

    SequenceAlignmentMap alignmentMapobj = null;
    if (SAMInput)
    {
        SAMParser samParser = new SAMParser();
        alignmentMapobj = samParser.Parse(filename);
    }
    else
    {
        BAMParser bamParser = new BAMParser();
        alignmentMapobj = bamParser.Parse(filename);
    }

    // Get reads from the sequence alignment map object.
    IList<PairedRead> pairedReads = calculateMeanNdeviation
        ? alignmentMapobj.GetPairedReads()
        : alignmentMapobj.GetPairedReads(mean, deviation);

    // Orphan reads. Materialized once because the result is counted and
    // iterated; a deferred query would re-scan pairedReads each time.
    List<PairedRead> orphans = pairedReads
        .Where(PR => PR.PairedType == PairedReadType.Orphan)
        .ToList();

    if (orphans.Count == 0)
    {
        Console.WriteLine("No Orphans to display");
    }

    List<ISequenceRange> orphanRegions = new List<ISequenceRange>(orphans.Count);
    foreach (PairedRead orphanRead in orphans)
    {
        orphanRegions.Add(GetRegion(orphanRead.Read1));
    }

    SequenceRangeGrouping orphanRangegroup = new SequenceRangeGrouping(orphanRegions);

    // Length-anomaly reads, materialized once for the same reason.
    List<PairedRead> lengthAnomalies = pairedReads
        .Where(PE => PE.PairedType == PairedReadType.LengthAnomaly)
        .ToList();

    if (lengthAnomalies.Count == 0)
    {
        Console.WriteLine("No Anomalies to display");
    }

    // Each anomaly region starts at the first read's position and extends by
    // the pair's insert length.
    List<ISequenceRange> lengthAnomalyRegions = new List<ISequenceRange>(lengthAnomalies.Count);
    foreach (PairedRead laRead in lengthAnomalies)
    {
        SequenceRange range = new SequenceRange();
        range.ID = laRead.Read1.RName;
        range.Start = laRead.Read1.Pos;
        range.End = laRead.Read1.Pos + laRead.InsertLength;
        lengthAnomalyRegions.Add(range);
    }

    SequenceRangeGrouping lengthAnomalyRangegroup = new SequenceRangeGrouping(lengthAnomalyRegions);
    if (!lengthAnomalyRangegroup.GroupIDs.Any())
    {
        Console.Write("\r\nNo Length anomalies reads to display");
    }
    else
    {
        Console.Write("Region of length anomaly:");
        DisplaySequenceRange(lengthAnomalyRangegroup);
    }

    if (!orphanRangegroup.GroupIDs.Any())
    {
        Console.Write("\r\nNo Orphan reads to display");
    }
    else
    {
        Console.Write("\r\nRegion of Orphan reads:");
        DisplaySequenceRange(orphanRangegroup);
    }

    // Hot spots are where length-anomaly regions and orphan regions intersect.
    SequenceRangeGrouping intersectedRegions = lengthAnomalyRangegroup.Intersect(orphanRangegroup);
    if (!intersectedRegions.GroupIDs.Any())
    {
        Console.Write("\r\nNo Hot spots found");
    }
    else
    {
        Console.Write("\r\nChromosomal Hot spot of length anomaly and Orphan region:");
        DisplaySequenceRange(intersectedRegions);
    }
}
/// <summary>
/// Initializes the parser, the formatter and the output targets (text writer
/// or BAM streams) according to the user options, and sets up the in-memory or
/// temp-file backed intermediate BAM streams.
/// </summary>
private void Initialize()
{
    bamparser = new BAMParser();
    bamformatter = new BAMFormatter();
    bamUncompressedOutStream = null;
    bamCompressedOutStream = null;

    if (string.IsNullOrEmpty(OutputFilename))
    {
        // No output file requested: write text to the console.
        writer = Console.Out;
    }
    else if (UnCompressedBAM || BAMOutput)
    {
        // BAM output goes straight to the requested file; no text writer is used.
        writer = null;
        if (UnCompressedBAM)
        {
            bamUncompressedOutStream = new FileStream(OutputFilename, FileMode.Create, FileAccess.ReadWrite);
        }
        else
        {
            bamCompressedOutStream = new FileStream(OutputFilename, FileMode.Create, FileAccess.ReadWrite);
        }
    }
    else
    {
        writer = new StreamWriter(OutputFilename);
    }

    #region Initialize temp files

    long inputfileSize = (new FileInfo(InputFilePath)).Length;
    long unCompressedSize = inputfileSize;
    if (!SAMInput)
    {
        // An uncompressed BAM file is approximately 4 times the size of the compressed file.
        unCompressedSize = inputfileSize * 4;
    }

    long compressedSize = unCompressedSize / 4;

    // The uncompressed stream is required for both uncompressed and compressed outputs.
    if ((UnCompressedBAM || BAMOutput) && bamUncompressedOutStream == null)
    {
        if (HeaderOnly || (MemStreamLimit >= unCompressedSize))
        {
            bamUncompressedOutStream = new MemoryStream();
        }
        else
        {
            // GetTempFileName creates the file, hence FileMode.Open.
            uncompressedTempfile = Path.GetTempFileName();
            bamUncompressedOutStream = new FileStream(uncompressedTempfile, FileMode.Open, FileAccess.ReadWrite);
        }
    }

    if (BAMOutput && !UnCompressedBAM && bamCompressedOutStream == null)
    {
        if (HeaderOnly || (MemStreamLimit >= compressedSize))
        {
            // The input size is only a capacity hint. It is a long and can exceed
            // int.MaxValue here (HeaderOnly bypasses the size limit entirely), in
            // which case a blind cast would go negative and make the MemoryStream
            // constructor throw. Fall back to a growable default stream instead.
            if (inputfileSize <= int.MaxValue)
            {
                bamCompressedOutStream = new MemoryStream((int)inputfileSize);
            }
            else
            {
                bamCompressedOutStream = new MemoryStream();
            }
        }
        else
        {
            compressedTempfile = Path.GetTempFileName();
            bamCompressedOutStream = new FileStream(compressedTempfile, FileMode.Open, FileAccess.ReadWrite);
        }
    }

    #endregion Initialize temp files
}
/// <summary>
/// Merges multiple alignments into a single compressed BAM file.
/// SAMUtil.exe out.bam in1.bam in2.bam
/// FilePaths[0] is the output file; FilePaths[1..] are the input BAM files.
/// The header is taken from HeaderFile when it is set, otherwise from the
/// first input file.
/// </summary>
/// <exception cref="InvalidOperationException">
/// FilePaths is null or has fewer than three entries.
/// </exception>
/// <exception cref="AggregateException">
/// An input (or the header file) could not be parsed, was empty, or had no
/// header. These failures are raised as InvalidOperationException inside the
/// parallel loop, which Parallel.For surfaces wrapped in an AggregateException.
/// </exception>
public void DoMerge()
{
    if (FilePaths == null)
    {
        throw new InvalidOperationException("FilePath");
    }

    if (FilePaths.Length < 3)
    {
        throw new InvalidOperationException(Resources.MergeHelp);
    }

    // One slot per input file, indexed by its position on the command line.
    // Each parallel iteration writes only its own slot, so no lock is needed
    // and the merge inputs keep a deterministic order regardless of which
    // iteration finishes first.
    int inputCount = FilePaths.Length - 1;
    IList<BAMSortedIndex>[] indexSlots = new IList<BAMSortedIndex>[inputCount];
    SequenceAlignmentMap[] mapSlots = new SequenceAlignmentMap[inputCount];

    Parallel.For(1, FilePaths.Length, (int index) =>
    {
        BAMParser parser = new BAMParser();
        SequenceAlignmentMap map;
        try
        {
            map = parser.Parse(FilePaths[index]);
        }
        catch (Exception ex)
        {
            // Keep the parser failure as the inner exception for diagnostics.
            throw new InvalidOperationException(Resources.InvalidBAMFile, ex);
        }

        if (map == null)
        {
            throw new InvalidOperationException(Resources.EmptyFile);
        }

        // Only the first input takes part in choosing the output header.
        if (index == 1)
        {
            if (string.IsNullOrEmpty(HeaderFile))
            {
                if (map.Header.RecordFields.Count == 0)
                {
                    throw new InvalidOperationException(Resources.HeaderMissing);
                }

                header = map.Header;
            }
            else
            {
                SAMParser parse = new SAMParser();
                SequenceAlignmentMap head;
                try
                {
                    head = parse.Parse(HeaderFile);
                }
                catch (Exception ex)
                {
                    throw new InvalidOperationException(Resources.IncorrectHeaderFile, ex);
                }

                if (head == null)
                {
                    throw new InvalidOperationException(Resources.EmptyFile);
                }

                header = head.Header;
            }
        }

        indexSlots[index - 1] = Sort(
            map,
            SortByReadName ? BAMSortByFields.ReadNames : BAMSortByFields.ChromosomeCoordinates);
        mapSlots[index - 1] = map;
    });

    // Hand mutable lists to the writers, as the original implementation did.
    IList<IList<BAMSortedIndex>> sortedIndexes = new List<IList<BAMSortedIndex>>(indexSlots);
    IList<SequenceAlignmentMap> sequenceAlignmentMaps = new List<SequenceAlignmentMap>(mapSlots);

    // The merged records are first written uncompressed to a temp file, which
    // is then compressed into the requested output file.
    string filePath = Path.GetTempFileName();
    try
    {
        using (FileStream fstemp = new FileStream(filePath, FileMode.Create, FileAccess.ReadWrite))
        {
            BAMFormatter formatter = new BAMFormatter();
            formatter.WriteHeader(header, fstemp);

            if (SortByReadName)
            {
                IList<BAMSortedIndex> sortedIndex = sortedIndexes.Select(a => a.First()).ToList();
                WriteMergeFileSortedByReadName(sortedIndex, fstemp, formatter, sequenceAlignmentMaps);
            }
            else
            {
                WriteMergeFile(sortedIndexes, fstemp, formatter, sequenceAlignmentMaps);
            }

            using (FileStream fsoutput = new FileStream(FilePaths[0], FileMode.Create, FileAccess.Write))
            {
                fstemp.Seek(0, SeekOrigin.Begin);
                formatter.CompressBAMFile(fstemp, fsoutput);
            }
        }
    }
    finally
    {
        // Remove the temp file even when writing or compressing fails.
        File.Delete(filePath);
    }
}
} = new Dictionary <string, int>(); // key: mappingStrand + strandFromGene

/// <summary>
/// Given a BAM file, try to guess the RNA-Seq experiment:
/// 1) single-end or pair-end
/// 2) strand_specific or not
/// 3) if it is strand-specific, what's the strand_ness of the protocol
/// The outcome is stored in Protocol, Strandedness, FractionForwardStranded,
/// FractionReverseStranded and FractionUndetermined; the raw tallies are kept
/// in PairedStrandedness and SingleStrandedness.
/// </summary>
/// <param name="bamPath">Path of the BAM file to read.</param>
/// <param name="geneModelPath">Path of the gene model, passed to the GeneModel constructor.</param>
/// <param name="genome">Genome passed to the GeneModel constructor.</param>
/// <param name="minFractionStrandSpecific">
/// Minimum forward (or reverse) fraction for the protocol to be called
/// forward (or reverse) stranded.
/// </param>
/// <exception cref="ArgumentException">
/// More than half of the tallied reads could not be explained as forward or reverse stranded.
/// </exception>
private void CheckProperties(string bamPath, string geneModelPath, Genome genome, double minFractionStrandSpecific)
{
    GeneModel gm = new GeneModel(genome, geneModelPath);
    using (var reader = File.OpenRead(bamPath))
    {
        Console.WriteLine("Reading BAM file.");

        // Read the BAM file, and filter out reads that are QC failures, unmapped, duplicates, or secondary.
        BAMParser bam = new BAMParser();
        var unfilteredReads = bam.Parse(reader).ToList();
        var reads = unfilteredReads.Where(read =>
            !read.Flag.HasFlag(SAMFlags.QualityCheckFailure) &&
            !read.Flag.HasFlag(SAMFlags.UnmappedQuery) &&
            !read.Flag.HasFlag(SAMFlags.Duplicate) &&
            !read.Flag.HasFlag(SAMFlags.NonPrimeAlignment)).ToList();

        Console.WriteLine("Evaluating reads.");
        Parallel.ForEach(reads, read =>
        {
            // Set the interval contained by this read, and get the gene regions nearby.
            bool isReversed = read.Flag.HasFlag(SAMFlags.QueryOnReverseStrand);
            string mapStrand = isReversed ? "-" : "+";
            Interval readInterval = new Interval(null, read.RName, "source", mapStrand, read.Pos, read.RefEndPos, null);

            if (!gm.GenomeForest.Forest.TryGetValue(readInterval.ChromosomeID, out IntervalTree nearbyGeneTree))
            {
                return;
            }

            List<Interval> nearbyGeneRegions = nearbyGeneTree.Query(readInterval);
            if (nearbyGeneRegions.Count == 0)
            {
                return;
            }

            // Count up paired-end or single-end read properties.
            bool isPaired = read.Flag.HasFlag(SAMFlags.PairedRead);
            bool isRead1 = read.Flag.HasFlag(SAMFlags.FirstReadInPair);
            bool isRead2 = read.Flag.HasFlag(SAMFlags.SecondReadInPair);
            string readId = isRead1 ? "1" : isRead2 ? "2" : null;
            Dictionary<string, int> dict = isPaired ? PairedStrandedness : SingleStrandedness;

            HashSet<string> strandFromGene = new HashSet<string>(nearbyGeneRegions.Select(x => x.Strand));
            foreach (string strand in strandFromGene)
            {
                string key = isPaired ? readId + mapStrand + strand : mapStrand + strand;
                lock (dict)
                {
                    // TryGetValue leaves count at 0 for a new key. The updated value
                    // must be stored back: incrementing the out local alone would
                    // leave every tally stuck at 1.
                    dict.TryGetValue(key, out int count);
                    dict[key] = count + 1;
                }
            }
        });

        // From RSeQC:
        // Not strand specific:
        //   This is PairEnd Data
        //   Fraction of reads failed to determine: 0.0172
        //   Fraction of reads explained by "1++,1--,2+-,2-+": 0.4903
        //   Fraction of reads explained by "1+-,1-+,2++,2--": 0.4925
        // Strand specific:
        //   This is PairEnd Data
        //   Fraction of reads failed to determine: 0.0072
        //   Fraction of reads explained by "1++,1--,2+-,2-+": 0.9441
        //   Fraction of reads explained by "1+-,1-+,2++,2--": 0.0487
        SingleStrandedness.TryGetValue("++", out int sForward1);
        SingleStrandedness.TryGetValue("--", out int sForward2);
        SingleStrandedness.TryGetValue("+-", out int sReverse1);
        SingleStrandedness.TryGetValue("-+", out int sReverse2);
        PairedStrandedness.TryGetValue("1++", out int pForward1);
        PairedStrandedness.TryGetValue("1--", out int pForward2);
        PairedStrandedness.TryGetValue("2+-", out int pForward3);
        PairedStrandedness.TryGetValue("2-+", out int pForward4);
        PairedStrandedness.TryGetValue("1+-", out int pReverse1);
        PairedStrandedness.TryGetValue("1-+", out int pReverse2);
        PairedStrandedness.TryGetValue("2++", out int pReverse3);
        PairedStrandedness.TryGetValue("2--", out int pReverse4);

        int singleForward = sForward1 + sForward2;
        int singleReverse = sReverse1 + sReverse2;
        int pairedForward = pForward1 + pForward2 + pForward3 + pForward4;
        int pairedReverse = pReverse1 + pReverse2 + pReverse3 + pReverse4;
        int singleTotal = SingleStrandedness.Values.Sum();
        int pairedTotal = PairedStrandedness.Values.Sum();

        int forwardCount;
        int reverseCount;
        int totalCount;
        if (PairedStrandedness.Count > 0 && SingleStrandedness.Count == 0)
        {
            Protocol = RnaSeqProtocol.PairedEnd;
            forwardCount = pairedForward;
            reverseCount = pairedReverse;
            totalCount = pairedTotal;
        }
        else if (SingleStrandedness.Count > 0 && PairedStrandedness.Count == 0)
        {
            Protocol = RnaSeqProtocol.SingleEnd;
            forwardCount = singleForward;
            reverseCount = singleReverse;
            totalCount = singleTotal;
        }
        else
        {
            // Both kinds of reads were tallied (or none at all). The numerators
            // cover single and paired reads, so the denominator must as well.
            Protocol = RnaSeqProtocol.Mixture;
            forwardCount = singleForward + pairedForward;
            reverseCount = singleReverse + pairedReverse;
            totalCount = singleTotal + pairedTotal;
        }

        // When nothing was tallied this is 0/0 = NaN; every comparison below is
        // then false, so no exception is thrown and Strandedness ends up None.
        FractionForwardStranded = (double)forwardCount / (double)totalCount;
        FractionReverseStranded = (double)reverseCount / (double)totalCount;
        FractionUndetermined = 1 - FractionForwardStranded - FractionReverseStranded;
        if (FractionUndetermined > 0.5)
        {
            throw new ArgumentException("A large number of reads failed to determine the standedness of the protocol within " + bamPath);
        }

        // NOTE(review): the original mixture branch first stored Strandedness.None
        // and then overwrote it with this same threshold rule; the effective
        // (last-write) behavior is kept. Confirm whether mixtures should stay None.
        Strandedness = FractionForwardStranded >= minFractionStrandSpecific
            ? Strandedness.Forward
            : FractionReverseStranded >= minFractionStrandSpecific
                ? Strandedness.Reverse
                : Strandedness.None;
    }
}