/// <summary>
/// Creates the readers for every interim input, gathers the headers and reference names they
/// expose, prints the data sources and checks the supplementary annotation headers for
/// assembly consistency.
/// </summary>
/// <param name="annotationFiles">Files passed to <c>ReaderUtilities.GetSaTsvReaders</c>.</param>
/// <param name="intervalFiles">Files passed to <c>ReaderUtilities.GetIntervalReaders</c>.</param>
/// <param name="miscFile">File passed to <c>ReaderUtilities.GetMiscTsvReader</c>; the resulting reader may be null.</param>
/// <param name="geneFiles">Files passed to <c>ReaderUtilities.GetGeneReaders</c>.</param>
/// <param name="compressedReference">Path of the compressed reference used to obtain the genome assembly and the reference-name lookup.</param>
/// <param name="outputDirectory">Directory in which the merged output files are created.</param>
public InterimTsvsMerger(IEnumerable<string> annotationFiles, IEnumerable<string> intervalFiles, string miscFile, IEnumerable<string> geneFiles, string compressedReference, string outputDirectory)
{
    _outputDirectory = outputDirectory;

    // The sequence provider is only needed for the two values copied below, so the reference
    // stream is released as soon as they have been read instead of being left open.
    using (var referenceStream = FileUtilities.GetReadStream(compressedReference))
    {
        var refSequenceProvider = new ReferenceSequenceProvider(referenceStream);
        _genomeAssembly         = refSequenceProvider.GenomeAssembly;
        _refNameToChromosome    = refSequenceProvider.RefNameToChromosome;
    }

    _tsvReaders      = ReaderUtilities.GetSaTsvReaders(annotationFiles);
    _miscReader      = ReaderUtilities.GetMiscTsvReader(miscFile);
    _geneReaders     = ReaderUtilities.GetGeneReaders(geneFiles);
    _intervalReaders = ReaderUtilities.GetIntervalReaders(intervalFiles);

    // NOTE(review): the gene headers are null-checked below, but the results of GetTsvHeaders /
    // GetRefNames for the annotation and interval readers are not - confirm those calls never
    // return null, otherwise AddRange / UnionWith will throw.
    _saHeaders = new List<SaHeader>();
    _saHeaders.AddRange(ReaderUtilities.GetTsvHeaders(_tsvReaders));
    _saHeaders.AddRange(ReaderUtilities.GetTsvHeaders(_intervalReaders));

    // stays null when GetTsvHeaders returns null for the gene readers
    _geneHeaders = ReaderUtilities.GetTsvHeaders(_geneReaders)?.ToList();

    _refNames = new HashSet<string>();
    _refNames.UnionWith(ReaderUtilities.GetRefNames(_tsvReaders));
    _refNames.UnionWith(ReaderUtilities.GetRefNames(_intervalReaders));

    // the misc reader is optional
    if (_miscReader != null)
    {
        _refNames.UnionWith(_miscReader.RefNames);
    }

    DisplayDataSources(_saHeaders, _geneHeaders);

    MergeUtilities.CheckAssemblyConsistancy(_saHeaders);
}
/// <summary>
/// Merges the interim annotations of one reference sequence into a <c>.nsa</c> file and its
/// <c>.idx</c> companion inside the output directory, then prints a one-line summary
/// (name, annotation count, interval count, ref-minor count, elapsed time).
/// </summary>
/// <param name="refName">Reference name; used to select the data and as key into <c>_refNameToChromosome</c>.</param>
private void MergeChrom(string refName)
{
    var benchmark       = new Benchmark();
    var annotationCount = 0;
    int refMinorCount;

    // NOTE(review): these enumerators are not disposed in this method - confirm that is intended.
    var enumerators     = GetSaEnumerators(refName);
    var refMinorAlleles = GetGlobalMajorAlleleForRefMinors(refName);
    var sourceVersions  = MergeUtilities.GetDataSourceVersions(_saHeaders);
    var ucscRefName     = _refNameToChromosome[refName].UcscName;

    // NOTE(review): the creation time is taken from the local clock (DateTime.Now) - confirm
    // whether UTC was intended.
    var header = new SupplementaryAnnotationHeader(
        ucscRefName,
        DateTime.Now.Ticks,
        SaDataBaseCommon.DataVersion,
        sourceVersions,
        _genomeAssembly);

    // Materialised as a list because it is enumerated several times below.
    var unsortedIntervals = MergeUtilities.GetIntervals(_intervalReaders, refName);
    var intervals = unsortedIntervals
        .OrderBy(interval => interval.Start)
        .ThenBy(interval => interval.End)
        .ToList();

    var svIntervals           = MergeUtilities.GetSpecificIntervals(ReportFor.StructuralVariants, intervals);
    var allVariantsIntervals  = MergeUtilities.GetSpecificIntervals(ReportFor.AllVariants, intervals);
    var smallVariantIntervals = MergeUtilities.GetSpecificIntervals(ReportFor.SmallVariants, intervals);

    var saPath = Path.Combine(_outputDirectory, $"{ucscRefName}.nsa");

    using (var saStream = FileUtilities.GetCreateStream(saPath))
    using (var indexStream = FileUtilities.GetCreateStream(saPath + ".idx"))
    using (var writer = new SaWriter(saStream, indexStream, header, smallVariantIntervals, svIntervals, allVariantsIntervals, refMinorAlleles))
    {
        // Pull positions until the merge reports that nothing is left (null position object).
        while (true)
        {
            var (position, saPosition) = GetNextInterimPosition(enumerators);
            if (saPosition == null)
            {
                break;
            }

            writer.Write(saPosition, position);
            annotationCount++;
        }

        // read before the writer is disposed
        refMinorCount = writer.RefMinorCount;
    }

    Console.WriteLine($"{ucscRefName,-23} {annotationCount,10:n0} {intervals.Count,6:n0} {refMinorCount,6:n0} {benchmark.GetElapsedIterationTime(annotationCount, "variants", out double _)}");
}
/// <summary>
/// Fetches the next merged position: asks <c>MergeUtilities.GetMinItems</c> for the minimal
/// items across the supplied enumerators and converts them with
/// <c>MergeUtilities.GetSaPosition</c>.
/// </summary>
/// <param name="iSaEnumerators">Enumerators over the interim items being merged.</param>
/// <returns>
/// The (position, ISaPosition) tuple produced by <c>MergeUtilities.GetSaPosition</c>;
/// <c>MergeChrom</c> stops writing once the ISaPosition component is null.
/// </returns>
private static (int, ISaPosition) GetNextInterimPosition(List<IEnumerator<IInterimSaItem>> iSaEnumerators)
    => MergeUtilities.GetSaPosition(MergeUtilities.GetMinItems(iSaEnumerators));