/// <summary>
/// Builds an <see cref="ISample"/> from the raw text of a single VCF sample column.
/// </summary>
/// <param name="sampleColumn">The colon-delimited sample column text.</param>
/// <param name="formatIndices">Supplies the column index of each field (AD, AQ, CN, ...) and the total column count.</param>
/// <param name="simplePosition">Provides the chromosome, start and alt alleles used for variant frequencies and heteroplasmy lookups.</param>
/// <param name="mitoHeteroplasmyProvider">Optional source of heteroplasmy percentiles; when null no percentiles are produced.</param>
/// <param name="legacyExtractor">When non-null, extraction of a non-empty sample is delegated to it.</param>
/// <param name="enableDq">When false the DQ field is not read and the de novo quality stays null.</param>
/// <returns>
/// <see cref="Sample.EmptySample"/> if the column is null, empty or a lone ".";
/// otherwise the sample produced by the legacy extractor or by the field-by-field extraction below.
/// </returns>
internal static ISample ExtractSample(string sampleColumn, FormatIndices formatIndices, ISimplePosition simplePosition, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, LegacySampleFieldExtractor legacyExtractor = null, bool enableDq = false)
{
    // sanity check: make sure we have a format column
    if (string.IsNullOrEmpty(sampleColumn))
    {
        return Sample.EmptySample;
    }

    string[] sampleColumns = sampleColumn.OptimizedSplit(':', formatIndices.NumColumns);

    // a single "." means that the sample carries no data at all
    if (sampleColumns.Length == 1 && sampleColumns[0] == ".")
    {
        return Sample.EmptySample;
    }

    sampleColumns.NormalizeNulls();

    // the legacy extractor works from the raw column text rather than the split fields
    if (legacyExtractor != null)
    {
        return legacyExtractor.ExtractSample(sampleColumn);
    }

    int[] alleleDepths                   = sampleColumns.GetString(formatIndices.AD).GetIntegers();
    float? artifactAdjustedQualityScore  = sampleColumns.GetString(formatIndices.AQ).GetFloat();
    int? copyNumber                      = sampleColumns.GetString(formatIndices.CN).GetInteger();
    string[] diseaseAffectedStatuses     = sampleColumns.GetString(formatIndices.DST).GetStrings();
    bool failedFilter                    = sampleColumns.GetString(formatIndices.FT).GetFailedFilter();
    string genotype                      = sampleColumns.GetString(formatIndices.GT);
    int? genotypeQuality                 = sampleColumns.GetString(formatIndices.GQ).GetInteger();
    bool isDeNovo                        = sampleColumns.GetString(formatIndices.DN).IsDeNovo();
    double? deNovoQuality                = enableDq ? sampleColumns.GetString(formatIndices.DQ).GetDouble() : null;
    float? likelihoodRatioQualityScore   = sampleColumns.GetString(formatIndices.LQ).GetFloat();
    int[] pairedEndReadCounts            = sampleColumns.GetString(formatIndices.PR).GetIntegers();
    int[] repeatUnitCounts               = sampleColumns.GetString(formatIndices.REPCN).GetIntegers('/');
    int[] splitReadCounts                = sampleColumns.GetString(formatIndices.SR).GetIntegers();
    int? totalDepth                      = sampleColumns.GetString(formatIndices.DP).GetInteger();
    double? variantFrequency             = sampleColumns.GetString(formatIndices.VF).GetDouble();
    int? minorHaplotypeCopyNumber        = sampleColumns.GetString(formatIndices.MCN).GetInteger();
    double? somaticQuality               = sampleColumns.GetString(formatIndices.SQ).GetDouble();
    int? binCount                        = sampleColumns.GetString(formatIndices.BC).GetInteger();

    double[] variantFrequencies = VariantFrequency.GetVariantFrequencies(variantFrequency, alleleDepths, simplePosition.AltAlleles.Length);

    // Format with the invariant culture so the decimal separator is always '.',
    // regardless of the culture of the thread that runs the annotation.
    string[] mitoHeteroplasmyPercentiles = mitoHeteroplasmyProvider
        ?.GetVrfPercentiles(simplePosition.Chromosome, simplePosition.Start, simplePosition.AltAlleles, variantFrequencies)
        ?.Select(x => x?.ToString("0.##", System.Globalization.CultureInfo.InvariantCulture) ?? "null")
        .ToArray();

    var isLoh = GetLoh(copyNumber, minorHaplotypeCopyNumber, genotype);

    return new Sample(alleleDepths, artifactAdjustedQualityScore, copyNumber, diseaseAffectedStatuses, failedFilter,
        genotype, genotypeQuality, isDeNovo, deNovoQuality, likelihoodRatioQualityScore, pairedEndReadCounts,
        repeatUnitCounts, splitReadCounts, totalDepth, variantFrequencies, minorHaplotypeCopyNumber, somaticQuality,
        isLoh, mitoHeteroplasmyPercentiles, binCount);
}
/// <summary>
/// Stores the supplied readers and providers in the reader's fields and creates the variant factory.
/// </summary>
/// <param name="headerReader">Reader assigned to the header-reader field.</param>
/// <param name="vcfLineReader">Reader assigned to the VCF line-reader field.</param>
/// <param name="sequenceProvider">Supplies the sequence for the variant factory and the reference-name lookup.</param>
/// <param name="refMinorProvider">Reference-minor provider kept for later use.</param>
/// <param name="vcfFilter">VCF filter kept for later use.</param>
/// <param name="vidCreator">Variant ID creator handed to the variant factory.</param>
/// <param name="mitoHeteroplasmyProvider">Heteroplasmy provider kept for later use.</param>
private VcfReader(StreamReader headerReader, StreamReader vcfLineReader, ISequenceProvider sequenceProvider, IRefMinorProvider refMinorProvider, IVcfFilter vcfFilter, IVariantIdCreator vidCreator, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider)
{
    // input readers
    _reader       = vcfLineReader;
    _headerReader = headerReader;

    // everything that hangs off the sequence provider
    _sequenceProvider    = sequenceProvider;
    _variantFactory      = new VariantFactory(sequenceProvider.Sequence, vidCreator);
    _refNameToChromosome = sequenceProvider.RefNameToChromosome;

    // remaining collaborators
    _mitoHeteroplasmyProvider = mitoHeteroplasmyProvider;
    _refMinorProvider         = refMinorProvider;
    _vcfFilter                = vcfFilter;
}
/// <summary>
/// Wraps the input streams in readers and creates a <see cref="VcfReader"/> from them and the annotation resources.
/// </summary>
/// <param name="headerStream">Separate header stream, or null when the header is read from <paramref name="vcfStream"/>.</param>
/// <param name="vcfStream">Stream containing the VCF lines.</param>
/// <param name="annotationResources">Supplies the providers, recomposer, VID creator and the input start virtual position.</param>
/// <param name="vcfFilter">Filter passed through to the reader.</param>
/// <param name="mitoHeteroplasmyProvider">Heteroplasmy provider passed through to the reader.</param>
/// <returns>The reader returned by <c>VcfReader.Create</c>.</returns>
private static VcfReader GetVcfReader(Stream headerStream, Stream vcfStream, IAnnotationResources annotationResources, IVcfFilter vcfFilter, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider)
{
    StreamReader vcfLineReader = FileUtilities.GetStreamReader(vcfStream);

    // by default the header comes from the same reader as the VCF lines
    StreamReader headerReader = vcfLineReader;

    if (headerStream != null)
    {
        headerReader = FileUtilities.GetStreamReader(headerStream);

        // with a separate header, reposition the VCF stream at the block offset of the requested start position
        vcfStream.Position = Tabix.VirtualPosition.From(annotationResources.InputStartVirtualPosition).BlockOffset;
    }

    return VcfReader.Create(
        headerReader,
        vcfLineReader,
        annotationResources.SequenceProvider,
        annotationResources.RefMinorProvider,
        annotationResources.Recomposer,
        vcfFilter,
        annotationResources.VidCreator,
        mitoHeteroplasmyProvider);
}
/// <summary>
/// Converts the sample columns of a split VCF line into an array of samples.
/// </summary>
/// <param name="vcfColumns">All columns of the VCF line.</param>
/// <param name="formatIndices">Re-initialized from the FORMAT column and then used for every sample.</param>
/// <param name="simplePosition">Position forwarded to the per-sample extraction.</param>
/// <param name="mitoHeteroplasmyProvider">Heteroplasmy provider forwarded to the per-sample extraction.</param>
/// <param name="enableDq">Forwarded to the per-sample extraction.</param>
/// <returns>
/// Null when the line has fewer than <c>VcfCommon.MinNumColumnsSampleGenotypes</c> columns;
/// otherwise one entry per column starting at <c>VcfCommon.GenotypeIndex</c>.
/// </returns>
internal static ISample[] ToSamples(this string[] vcfColumns, FormatIndices formatIndices, ISimplePosition simplePosition, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, bool enableDq = false)
{
    int columnCount = vcfColumns.Length;
    if (columnCount < VcfCommon.MinNumColumnsSampleGenotypes)
    {
        return null;
    }

    var samples = new ISample[columnCount - VcfCommon.MinNumColumnsSampleGenotypes + 1];

    formatIndices.Set(vcfColumns[VcfCommon.FormatIndex]);

    // the legacy extractor is only created for legacy variant callers
    LegacySampleFieldExtractor legacySampleExtractor = null;
    if (IsLegacyVariantCaller(formatIndices))
    {
        legacySampleExtractor = new LegacySampleFieldExtractor(vcfColumns, formatIndices);
    }

    // walk the sample columns and the output slots in lock-step
    for (int column = VcfCommon.GenotypeIndex, slot = 0; column < vcfColumns.Length; column++, slot++)
    {
        samples[slot] = ExtractSample(vcfColumns[column], formatIndices, simplePosition, mitoHeteroplasmyProvider, legacySampleExtractor, enableDq);
    }

    return samples;
}
/// <summary>
/// Constructs a <see cref="VcfReader"/>, parses its header and attaches the recomposer.
/// </summary>
/// <param name="headerReader">Reader passed to the constructor as the header reader.</param>
/// <param name="vcfLineReader">Reader passed to the constructor as the VCF line reader.</param>
/// <param name="sequenceProvider">Sequence provider passed to the constructor.</param>
/// <param name="refMinorProvider">Reference-minor provider passed to the constructor.</param>
/// <param name="recomposer">Recomposer set on the reader after the header has been parsed.</param>
/// <param name="vcfFilter">VCF filter passed to the constructor.</param>
/// <param name="vidCreator">Variant ID creator passed to the constructor.</param>
/// <param name="mitoHeteroplasmyProvider">Heteroplasmy provider passed to the constructor.</param>
/// <param name="enableDq">Passed to the constructor.</param>
/// <returns>The fully initialized reader.</returns>
public static VcfReader Create(StreamReader headerReader, StreamReader vcfLineReader, ISequenceProvider sequenceProvider, IRefMinorProvider refMinorProvider, IRecomposer recomposer, IVcfFilter vcfFilter, IVariantIdCreator vidCreator, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, bool enableDq = false)
{
    // NOTE(review): eight arguments are passed to the private constructor here; confirm that a
    // constructor accepting enableDq as its last parameter exists in this class.
    var reader = new VcfReader(
        headerReader,
        vcfLineReader,
        sequenceProvider,
        refMinorProvider,
        vcfFilter,
        vidCreator,
        mitoHeteroplasmyProvider,
        enableDq);

    // the header is parsed before the recomposer is attached
    reader.ParseHeader();
    reader.SetRecomposer(recomposer);

    return reader;
}
/// <summary>
/// Expands a simple position into a full <see cref="IPosition"/> with parsed INFO data, samples and variants.
/// </summary>
/// <param name="simplePosition">The position to expand; null yields null.</param>
/// <param name="refMinorProvider">Optional provider of global major alleles; consulted only for reference positions.</param>
/// <param name="sequenceProvider">Used to load the chromosome of the position.</param>
/// <param name="mitoHeteroplasmyProvider">Heteroplasmy provider forwarded to the sample extraction.</param>
/// <param name="variantFactory">Supplies the format indices and creates the variants.</param>
/// <param name="enableDq">Forwarded to the sample extraction.</param>
/// <returns>
/// Null for a null input; the result of <c>GetReferencePosition</c> for a reference position without a
/// global major allele; otherwise a new <see cref="Position"/>.
/// </returns>
public static IPosition ToPosition(ISimplePosition simplePosition, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, VariantFactory variantFactory, bool enableDq = false)
{
    if (simplePosition == null)
    {
        return null;
    }

    sequenceProvider.LoadChromosome(simplePosition.Chromosome);

    string[] vcfFields  = simplePosition.VcfFields;
    string[] altAlleles = vcfFields[VcfCommon.AltIndex].OptimizedSplit(',');

    // a reference position has exactly one alt allele and it is one of the reference placeholders
    bool isReference = altAlleles.Length == 1 && VcfCommon.ReferenceAltAllele.Contains(altAlleles[0]);

    string globalMajorAllele = null;
    if (isReference)
    {
        globalMajorAllele = refMinorProvider?.GetGlobalMajorAllele(simplePosition.Chromosome, simplePosition.Start);

        // a reference position without a global major allele is not a ref-minor site
        if (globalMajorAllele == null)
        {
            return GetReferencePosition(simplePosition);
        }
    }

    var infoData = VcfInfoParser.Parse(vcfFields[VcfCommon.InfoIndex]);
    int end      = ExtractEnd(infoData, simplePosition.Start, simplePosition.RefAllele.Length);

    double? quality  = vcfFields[VcfCommon.QualIndex].GetNullableValue<double>(double.TryParse);
    string[] filters = vcfFields[VcfCommon.FilterIndex].OptimizedSplit(';');

    // samples are extracted before the variants are created
    ISample[] samples = vcfFields.ToSamples(variantFactory.FormatIndices, simplePosition, mitoHeteroplasmyProvider, enableDq);

    IVariant[] variants = variantFactory.CreateVariants(
        simplePosition.Chromosome,
        simplePosition.Start,
        end,
        simplePosition.RefAllele,
        altAlleles,
        infoData,
        simplePosition.IsDecomposed,
        simplePosition.IsRecomposed,
        simplePosition.LinkedVids,
        globalMajorAllele);

    return new Position(
        simplePosition.Chromosome,
        simplePosition.Start,
        end,
        simplePosition.RefAllele,
        altAlleles,
        quality,
        filters,
        variants,
        samples,
        infoData,
        vcfFields,
        simplePosition.IsDecomposed,
        simplePosition.IsRecomposed);
}
/// <summary>
/// Reads every position from the input VCF, annotates the ones that carry variants and writes the
/// resulting JSON, reporting metrics per chromosome.
/// </summary>
/// <param name="headerStream">Optional separate header stream passed to the VCF reader factory.</param>
/// <param name="inputVcfStream">Stream containing the VCF lines.</param>
/// <param name="outputJsonStream">Destination for the JSON output.</param>
/// <param name="outputJsonIndexStream">Destination for the JSON index.</param>
/// <param name="annotationResources">Supplies metrics, the annotator and the pre-loading logic.</param>
/// <param name="vcfFilter">Filter passed to the VCF reader.</param>
/// <param name="ignoreEmptyChromosome">When true, positions on empty chromosomes are skipped.</param>
/// <returns><see cref="ExitCodes.Success"/> when the whole input has been processed.</returns>
public static ExitCodes Annotate(Stream headerStream, Stream inputVcfStream, Stream outputJsonStream, Stream outputJsonIndexStream, AnnotationResources annotationResources, IVcfFilter vcfFilter, bool ignoreEmptyChromosome)
{
    var metrics = annotationResources.Metrics;
    PerformanceMetrics.ShowAnnotationHeader();

    // placeholder so that the first real chromosome always counts as a chromosome change
    IChromosome activeChromosome = new EmptyChromosome("dummy");
    int positionCount = 0;

    IMitoHeteroplasmyProvider mitoHeteroplasmyProvider = MitoHeteroplasmyReader.GetProvider();

    using (var vcfReader = GetVcfReader(headerStream, inputVcfStream, annotationResources, vcfFilter, mitoHeteroplasmyProvider))
    using (var jsonWriter = new JsonWriter(outputJsonStream, outputJsonIndexStream, annotationResources, Date.CurrentTimeStamp, vcfReader.GetSampleNames(), false))
    {
        try
        {
            CheckGenomeAssembly(annotationResources, vcfReader);
            SetMitochondrialAnnotationBehavior(annotationResources, vcfReader);

            for (IPosition position = vcfReader.GetNextPosition(); position != null; position = vcfReader.GetNextPosition())
            {
                IChromosome chromosome = position.Chromosome;

                if (ignoreEmptyChromosome && chromosome.IsEmpty())
                {
                    continue;
                }

                bool chromosomeChanged = chromosome.Index != activeChromosome.Index;
                if (chromosomeChanged)
                {
                    // report the chromosome we just finished (never the initial placeholder)
                    if (!activeChromosome.IsEmpty())
                    {
                        metrics.ShowAnnotationEntry(activeChromosome, positionCount);
                    }

                    positionCount = 0;

                    metrics.Preload.Start();
                    annotationResources.PreLoad(chromosome);
                    metrics.Preload.Stop();

                    metrics.Annotation.Start();
                    activeChromosome = chromosome;
                }

                // positions without variants are counted but neither annotated nor written
                var annotated = position.Variants == null
                    ? null
                    : annotationResources.Annotator.Annotate(position);

                string json = annotated?.GetJsonString();
                if (json != null)
                {
                    jsonWriter.WritePosition(annotated.Position, json);
                }

                positionCount++;
            }

            jsonWriter.WriteGenes(annotationResources.Annotator.GetGeneAnnotations());
        }
        catch (Exception e)
        {
            // attach the offending VCF line before letting the exception propagate
            e.Data[ExitCodeUtilities.VcfLine] = vcfReader.VcfLine;
            throw;
        }
    }

    // report the last chromosome that was processed
    if (!activeChromosome.IsEmpty())
    {
        metrics.ShowAnnotationEntry(activeChromosome, positionCount);
    }

    metrics.ShowSummaryTable();

    return ExitCodes.Success;
}
/// <summary>
/// Parses a single VCF line into a simple position and expands it into a full position.
/// </summary>
/// <param name="vcfLine">The raw VCF line.</param>
/// <param name="refMinorProvider">Reference-minor provider forwarded to <c>Position.ToPosition</c>.</param>
/// <param name="sequenceProvider">Supplies the reference-name lookup and is forwarded to <c>Position.ToPosition</c>.</param>
/// <param name="mitoHeteroplasmyProvider">Heteroplasmy provider forwarded to <c>Position.ToPosition</c>.</param>
/// <param name="variantFactory">Variant factory forwarded to <c>Position.ToPosition</c>.</param>
/// <returns>The position returned by <c>Position.ToPosition</c>.</returns>
internal static IPosition ParseVcfLine(string vcfLine, IRefMinorProvider refMinorProvider, ISequenceProvider sequenceProvider, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, VariantFactory variantFactory)
{
    return Position.ToPosition(
        GetSimplePosition(vcfLine, sequenceProvider.RefNameToChromosome),
        refMinorProvider,
        sequenceProvider,
        mitoHeteroplasmyProvider,
        variantFactory);
}
/// <summary>
/// Annotates a single VCF line using the cache and supplementary annotation paths provided.
/// </summary>
/// <param name="cacheFilePrefix">Cache prefix used to obtain the annotator and sequence provider.</param>
/// <param name="saPaths">Supplementary annotation paths; may be null.</param>
/// <param name="mitoHeteroplasmyProvider">Heteroplasmy provider forwarded to the VCF line parser.</param>
/// <param name="vcfLine">The raw VCF line to annotate.</param>
/// <returns>The annotated position produced by the annotator.</returns>
internal static IAnnotatedPosition GetAnnotatedPosition(string cacheFilePrefix, List<string> saPaths, IMitoHeteroplasmyProvider mitoHeteroplasmyProvider, string vcfLine)
{
    var annotationFiles = new AnnotationFiles();

    // register every supplementary annotation path, if any were given
    if (saPaths != null)
    {
        foreach (string saPath in saPaths)
        {
            annotationFiles.AddFiles(saPath);
        }
    }

    var refMinorProvider = ProviderUtilities.GetRefMinorProvider(annotationFiles);
    var (annotator, sequenceProvider) = GetAnnotatorAndSequenceProvider(cacheFilePrefix, saPaths);

    var variantFactory = new VariantFactory(sequenceProvider.Sequence, new VariantId());
    var parsedPosition = ParseVcfLine(vcfLine, refMinorProvider, sequenceProvider, mitoHeteroplasmyProvider, variantFactory);

    return annotator.Annotate(parsedPosition);
}