/// <summary>
/// Builds the confidence interval for <paramref name="position"/> from the two-valued
/// INFO entry named by <paramref name="ciInfoTag"/>.
/// </summary>
/// <param name="position">The position the interval is anchored on.</param>
/// <param name="variant">The variant whose INFO field is searched for the tag.</param>
/// <param name="ciInfoTag">The INFO key that holds the two confidence-interval values.</param>
/// <returns>
/// When the tag is missing, <c>BedInterval.Create(position - 1, position)</c>
/// (or <c>(0, 1)</c> when the position is 0); otherwise the interval produced by the
/// tuple-based overload from the absolute values of both entries.
/// </returns>
/// <remarks>
/// Throws the exception built by <c>VcfVariantFormatException.Create</c> when the tag does
/// not split into exactly two values. Unparsable values surface whatever
/// <see cref="int.Parse(string)"/> throws.
/// </remarks>
internal static IInterval<uint> ConvertPositionToCiInterval(
    this uint position, [NotNull] IVcfVariant variant, [NotNull] string ciInfoTag)
{
    if (variant.Info.TryGetValue(ciInfoTag, out var rawCi))
    {
        var pieces = rawCi.Split(WittyerConstants.InfoValueDel);
        if (pieces.Length != 2)
        {
            throw VcfVariantFormatException.Create(variant.ToString(),
                ImmutableHashSet.Create(VcfColumn.Info), $"Invalid {ciInfoTag} found: {rawCi}",
                variant.ToStrings().ToList().AsReadOnly());
        }

        // The sign of each entry is discarded; only the magnitudes are forwarded.
        var lowerMagnitude = ToMagnitude(pieces[0]);
        var upperMagnitude = ToMagnitude(pieces[1]);
        var (ciStart, ciStop) = ConvertPositionToCiInterval(position, (lowerMagnitude, upperMagnitude));
        return BedInterval.Create(ciStart, ciStop);
    }

    // No CI tag: fall back to the single base at the position, special-casing position 0
    // so that the unsigned subtraction cannot wrap.
    var fallbackStart = position > 0 ? position - 1 : 0;
    var fallbackStop = position == 0 ? 1 : position;
    return BedInterval.Create(fallbackStart, fallbackStop);

    uint ToMagnitude(string text)
    {
        var number = int.Parse(text);
        return (uint)(number < 0 ? -number : number);
    }
}
/// <summary>
/// Creates a wittyer variant from a VCF variant, deriving the base interval from the
/// variant's position and SV length and the POS/END border intervals from CIPOS/CIEND
/// together with the supplied distance settings.
/// </summary>
/// <param name="baseVariant">The underlying VCF variant.</param>
/// <param name="sample">Sample name forwarded to <c>WittyerSample.CreateOverall</c>.</param>
/// <param name="percentageDistance">Forwarded to <c>CalculateBorderInterval</c>.</param>
/// <param name="basepairDistance">Forwarded to <c>CalculateBorderInterval</c>.</param>
/// <param name="bins">Bins forwarded to <c>Winner.Create</c>.</param>
/// <param name="svType">The variant type; must not be <c>Invalid</c>.</param>
/// <exception cref="InvalidDataException">When <paramref name="svType"/> is <c>Invalid</c>.</exception>
internal static IWittyerVariant Create([NotNull] IVcfVariant baseVariant, string sample,
    double percentageDistance, uint basepairDistance, IReadOnlyList<uint> bins, WittyerVariantType svType)
{
    if (svType == WittyerVariantType.Invalid)
    {
        throw new InvalidDataException(
            $"Invalid {VcfConstants.SvTypeKey} in variant: \n{baseVariant}\nNot sure why you got here though. Check with a witty.er developer!");
    }

    var variantEnd = baseVariant.Position + baseVariant.GetSvLength();
    var variantSpan = BedInterval.Create(baseVariant.Position, variantEnd);

    var startBorder = baseVariant.Position.CalculateBorderInterval(variantSpan,
        baseVariant.ParseCi(WittyerConstants.Cipos), percentageDistance, basepairDistance);

    // Every wittyer variant gets an end border; here it is anchored on the last base (end - 1).
    var stopBorder = (variantEnd - 1).CalculateBorderInterval(variantSpan,
        baseVariant.ParseCi(WittyerConstants.Ciend), percentageDistance, basepairDistance);

    // Both borders are widened by one on the stop side when turned into contig intervals.
    var startRegion = ContigAndInterval.Create(baseVariant.Contig, startBorder.Start, startBorder.Stop + 1);
    var stopRegion = ContigAndInterval.Create(baseVariant.Contig, stopBorder.Start, stopBorder.Stop + 1);

    var winner = Winner.Create(svType, variantSpan, bins);
    var overlaps = new List<OverlapAnnotation>();
    var overallSample = WittyerSample.CreateOverall(baseVariant, sample,
        svType == WittyerVariantType.CopyNumberReference);

    return Create(baseVariant, variantSpan, svType, startRegion, winner, overlaps, overallSample, stopRegion);
}
/// <summary>
/// Determines the length of a variant: from its sequences when the alt is a simple
/// sequence, otherwise from the END INFO tag, otherwise via <c>TryGetSvLength</c>.
/// </summary>
/// <param name="variant">The variant to measure.</param>
/// <param name="throwException">
/// When true, invalid data raises an exception; when false, <c>null</c> is returned instead.
/// </param>
/// <param name="sharedFirstBase">Set by <c>IsSimpleSequence</c>.</param>
/// <param name="sharedLastBase">Set by <c>IsSimpleSequence</c>.</param>
/// <param name="endPos">The parsed END value when a valid END tag was used; otherwise null.</param>
/// <returns>The length, or null when it could not be determined and exceptions are suppressed.</returns>
/// <exception cref="InvalidDataException">
/// When <paramref name="throwException"/> is true and END is not a valid unsigned number
/// or lies before the variant's position.
/// </exception>
private static uint? GetSvLength([NotNull] this IVcfVariant variant, bool throwException,
    out bool sharedFirstBase, out bool sharedLastBase, out uint? endPos)
{
    endPos = null;
    if (IsSimpleSequence(variant, out var absoluteDiff, out sharedFirstBase, out sharedLastBase, true))
    {
        return absoluteDiff;
    }

    if (variant.Info.TryGetValue(VcfConstants.EndTagKey, out var endStr))
    {
        // END must parse AND must not precede POS: the subtraction below is unsigned, so an
        // END smaller than POS would silently wrap around to a huge bogus length.
        if (!uint.TryParse(endStr, out var end) || end < variant.Position)
        {
            if (throwException)
            {
                throw new InvalidDataException(
                    $"Invalid value for {VcfConstants.EndTagKey} for variant\n{variant}");
            }

            return null;
        }

        endPos = end;

        // when end and pos is the same, we do 1, even though based on strict vcf spec, it's 0 length :(
        var diff = end == variant.Position ? 1 : end - variant.Position;
        if (variant.Alts.Count == 0) // ref site has 0 alts in VariantUtils
        {
            diff++;
        }

        return diff;
    }

    var exception = TryGetSvLength(variant, out var ret);
    if (exception == null)
    {
        return ret;
    }

    if (throwException)
    {
        throw exception;
    }

    return null;
}
/// <summary>
/// Builds the confidence interval for <paramref name="position"/> from the two-valued
/// INFO entry named by <paramref name="ciInfoTag"/>.
/// </summary>
/// <param name="position">The position the interval is anchored on.</param>
/// <param name="variant">The variant whose INFO field is searched for the tag.</param>
/// <param name="ciInfoTag">The INFO key that holds the two confidence-interval values.</param>
/// <returns>
/// When the tag is missing, <c>BedInterval.Create(position - 1, position)</c>
/// (or <c>(0, 1)</c> when the position is 0); otherwise the interval produced by the
/// tuple-based overload from the absolute values of both entries.
/// </returns>
/// <exception cref="InvalidOperationException">When either entry is not a valid integer.</exception>
/// <remarks>
/// Throws the exception built by <c>VcfVariantFormatException.Create</c> when the tag does
/// not split into exactly two values.
/// </remarks>
internal static IInterval<uint> ConvertPositionToCiInterval(
    this uint position, [NotNull] IVcfVariant variant, [NotNull] string ciInfoTag)
{
    if (!variant.Info.TryGetValue(ciInfoTag, out var rawCi))
    {
        // No CI tag: use the single base at the position; position 0 is special-cased so the
        // unsigned subtraction cannot wrap.
        var fallbackStart = position > 0 ? position - 1 : 0;
        var fallbackStop = position == 0 ? 1 : position;
        return BedInterval.Create(fallbackStart, fallbackStop);
    }

    var pieces = rawCi.Split(WittyerConstants.InfoValueDel);
    if (pieces.Length != 2)
    {
        throw VcfVariantFormatException.Create(variant.ToString(),
            ImmutableHashSet.Create(VcfColumn.Info), $"Invalid {ciInfoTag} found: {rawCi}",
            variant.ToStrings().ToList().AsReadOnly());
    }

    // Validated in order: the first entry is parsed (and rejected) before the second is looked at.
    var lowerMagnitude = RequireMagnitude(pieces[0]);
    var upperMagnitude = RequireMagnitude(pieces[1]);

    var (ciStart, ciStop) = ConvertPositionToCiInterval(position, (lowerMagnitude, upperMagnitude));
    return BedInterval.Create(ciStart, ciStop);

    // Parses an integer and returns its absolute value, or fails with the tag's raw text.
    uint RequireMagnitude(string text)
    {
        if (!int.TryParse(text, out var number))
        {
            throw new InvalidOperationException($"Failed to parse {ciInfoTag}={rawCi}!");
        }

        return (uint)(number < 0 ? -number : number);
    }
}
/// <summary>
/// Decides whether the variant is a reference call for the given sample.
/// </summary>
/// <param name="variant">The variant to inspect.</param>
/// <param name="sampleName">
/// Name of the sample to inspect; when null the first sample is used.
/// </param>
/// <returns>
/// True for ref sites. False when the variant has no samples. Otherwise: with a GT field,
/// true when a parsable CN equals the number of GT alleles, or (no parsable CN) when every
/// allele is "0"; without a GT field, true only when CN is exactly "2".
/// </returns>
internal static bool IsRefCall([NotNull] this IVcfVariant variant, [CanBeNull] string sampleName)
{
    // A ref site is always a ref call.
    if (variant.IsRefSite())
    {
        return true;
    }

    // Not a ref site and nothing to genotype: cannot be a ref call.
    if (variant.Samples.Count == 0)
    {
        return false;
    }

    var chosenSample = sampleName == null ? variant.Samples[0] : variant.Samples[sampleName];
    var hasCn = chosenSample.SampleDictionary.TryGetValue(VcfConstants.CnSampleFieldKey, out var cnText);
    var hasGt = chosenSample.SampleDictionary.TryGetValue(VcfConstants.GenotypeKey, out var genotype);

    if (!hasGt)
    {
        return hasCn && cnText == "2";
    }

    // todo: refine ploidy handling. LOH is not dealt with here; CN equal to ploidy is assumed to be ref.
    var alleles = genotype.Split('/', '|');
    if (hasCn && int.TryParse(cnText, out var copyNumber))
    {
        return copyNumber == alleles.Length;
    }

    return alleles.All(allele => allele == "0");
}
/// <summary>
/// Factory that forwards every argument, unchanged and in the same order, to the private
/// <c>WittyerVariantInternal</c> constructor.
/// </summary>
internal static WittyerVariantInternal Create([NotNull] IVcfVariant baseVariant,
    IInterval<uint> baseInterval, WittyerVariantType svType, IContigAndInterval startInterval,
    Winner win, List<OverlapAnnotation> overlapInfo, IWittyerSample sample,
    IContigAndInterval endInterval)
{
    return new WittyerVariantInternal(
        baseVariant,
        baseInterval,
        svType,
        startInterval,
        win,
        overlapInfo,
        sample,
        endInterval);
}
/// <summary>
/// Stores the already-computed parts of a breakend; no validation happens here.
/// </summary>
/// <param name="variant">The wrapped VCF variant.</param>
/// <param name="interval">The interval assigned to this breakend.</param>
/// <param name="is3Prime">Value for <c>Is3Prime</c>.</param>
/// <param name="mate">Value for <c>Mate</c>.</param>
private GeneralBnd(IVcfVariant variant, IInterval<uint> interval, bool is3Prime, ISimpleBreakEnd mate)
    => (_baseVariant, _interval, Is3Prime, Mate) = (variant, interval, is3Prime, mate);
/// <summary>
/// Stores the supplied parts of a breakend pair. <c>Contig</c> is taken from
/// <paramref name="baseVariant"/>; everything else is assigned as given.
/// </summary>
private WittyerBndInternal([NotNull] IVcfVariant baseVariant, IInterval<uint> posInterval,
    Winner win, IContigAndInterval endInterval, List<OverlapAnnotation> overlapInfo,
    IWittyerSample sample, IVcfVariant endOriginalVariant, WittyerVariantType svType)
{
    // Location of both ends.
    (Contig, _posInterval, EndInterval) = (baseVariant.Contig, posInterval, endInterval);

    // Evaluation state attached to this entry.
    (OverlapInfo, Sample) = (overlapInfo, sample);

    // The two VCF records this breakend pair was built from.
    (OriginalVariant, EndOriginalVariant) = (baseVariant, endOriginalVariant);

    // Classification.
    (Win, VariantType) = (win, svType);
}
/// <summary>
/// Creates a wittyer breakend from a pair of VCF breakend records.
/// </summary>
/// <param name="variant">One record of the pair; also used for type and sample parsing.</param>
/// <param name="secondVariant">The other record of the pair.</param>
/// <param name="sampleName">Sample name forwarded to type parsing and sample creation.</param>
/// <param name="percentageDistance">Forwarded to <c>CalculateBndBorderInterval</c>.</param>
/// <param name="basepairDistance">Forwarded to <c>CalculateBndBorderInterval</c>.</param>
/// <param name="bins">Bins forwarded to <c>Winner.Create</c>.</param>
internal static IWittyerBnd Create([NotNull] IVcfVariant variant, IVcfVariant secondVariant,
    [CanBeNull] string sampleName, double percentageDistance, uint basepairDistance,
    IReadOnlyList<uint> bins)
{
    // The two records are put in the order chosen by FindBndEntriesOrder.
    var (first, second) = MiscUtils.FindBndEntriesOrder(variant, secondVariant);

    // Each record contributes its own CIPOS to its own border interval.
    var firstBorder = first.CalculateBndBorderInterval(second,
        first.ParseCi(WittyerConstants.Cipos), percentageDistance, basepairDistance);
    var secondBorder = second.CalculateBndBorderInterval(first,
        second.ParseCi(WittyerConstants.Cipos), percentageDistance, basepairDistance);

    var svType = variant.ParseWittyerVariantType(sampleName);

    return Create(first, firstBorder, PickWinner(), secondBorder, new List<OverlapAnnotation>(),
        WittyerSample.CreateOverall(variant, sampleName, false), second, svType);

    Winner PickWinner()
    {
        if (svType == WittyerVariantType.TranslocationBreakend)
        {
            return Winner.Create(svType);
        }

        if (svType == WittyerVariantType.Insertion)
        {
            // insertion: try the sequences first, then SVLEN; if neither works the length is unknown.
            var lengthKnown = variant.IsSimpleSequence(out var length)
                              || variant.TryGetSvLength(out length) == null;
            var insertedSpan = lengthKnown
                ? BedInterval.Create(variant.Position, variant.Position + length)
                : null;
            return Winner.Create(svType, insertedSpan, bins);
        }

        return Winner.Create(svType, BedInterval.Create(first.Position, second.Position + 1), bins);
    }
}
/// <summary>
/// Wraps a VCF sample in the wittyer sample type that matches the fields it carries.
/// </summary>
/// <param name="baseVariant">The variant owning the sample (used for reference samples and error text).</param>
/// <param name="sample">The VCF sample; may be null.</param>
/// <param name="isReference">When true, the result comes from <c>CreateReferenceSample</c>.</param>
/// <returns>
/// A plain sample, a genotyped sample (GT only), a copy-number sample (CN only), or a
/// genotyped copy-number sample (both), depending on which keys are present.
/// </returns>
/// <exception cref="InvalidDataException">
/// When the CN value is neither the missing-value string nor a valid unsigned number.
/// </exception>
public static IWittyerSample CreateFromVariant(IVcfVariant baseVariant, [CanBeNull] IVcfSample sample,
    bool isReference)
{
    if (isReference)
    {
        return CreateReferenceSample(baseVariant, sample);
    }

    if (sample == null)
    {
        return WittyerSampleInternal.Create(null);
    }

    var plainSample = WittyerSampleInternal.Create(sample);
    var hasGenotype = sample.SampleDictionary.ContainsKey(VcfConstants.GenotypeKey);
    var hasCopyNumber = sample.SampleDictionary.TryGetValue(VcfConstants.CnSampleFieldKey, out var cnText);

    if (!hasCopyNumber)
    {
        if (!hasGenotype)
        {
            return plainSample;
        }

        return WittyerGenotypedSample.Create(plainSample, GenotypeInfo.CreateFromSample(sample)) as IWittyerSample;
    }

    // A missing-value CN is kept as "no copy number"; anything else has to be a valid uint.
    uint? copyNumber = null;
    if (cnText != VcfConstants.MissingValueString)
    {
        if (!uint.TryParse(cnText, out var parsedCn))
        {
            throw new InvalidDataException($"{VcfConstants.CnSampleFieldKey} does not have a valid value in {baseVariant}");
        }

        copyNumber = parsedCn;
    }

    var copyNumberSample = WittyerCopyNumberSample.Create(plainSample, copyNumber);
    if (hasGenotype)
    {
        var genotypeInfo = GenotypeInfo.CreateFromSample(sample);
        return WittyerGenotypedCopyNumberSample.Create(copyNumberSample, genotypeInfo);
    }

    return copyNumberSample;
}
/// <summary>
/// Checks whether every alt of the variant consists only of characters from
/// <c>VcfConstants.ValidAltNucleotideChars</c> and, if so, reports the length difference
/// between the first alt and the ref.
/// </summary>
/// <param name="variant">The variant to inspect.</param>
/// <param name="absoluteDiff">
/// <c>GetAbsoluteDiff</c> of the first alt's length and the ref's length when the method
/// returns true; otherwise the default value.
/// </param>
/// <returns>
/// True when the variant has at least one alt and all alts are plain sequences; false
/// otherwise (including when there are no alts at all).
/// </returns>
internal static bool IsSimpleSequence([NotNull] this IVcfVariant variant, out uint absoluteDiff)
{
    absoluteDiff = default;

    // All() is vacuously true for an empty sequence, which previously led straight into
    // First() throwing on a variant without alts. No alts means there is no sequence to measure.
    if (!variant.Alts.Any())
    {
        return false;
    }

    var isSequence = variant.Alts.All(alt =>
        alt.All(nucleotide => VcfConstants.ValidAltNucleotideChars.Contains(nucleotide)));
    if (!isSequence)
    {
        return false;
    }

    // Only the first alt is measured against the ref.
    absoluteDiff = GetAbsoluteDiff((uint)variant.Alts.First().Length, (uint)variant.Ref.Length);
    return true;
}
/// <summary>
/// Stores the supplied parts of a wittyer variant. <c>Contig</c> is taken from
/// <paramref name="baseVariant"/>; everything else is assigned as given.
/// </summary>
private WittyerVariantInternal([NotNull] IVcfVariant baseVariant, IInterval<uint> baseInterval,
    WittyerVariantType svType, IContigAndInterval posInterval, Winner win,
    List<OverlapAnnotation> overlapInfo, IWittyerSample sample, IContigAndInterval endInterval)
{
    // Identity of the variant.
    (OriginalVariant, Contig) = (baseVariant, baseVariant.Contig);

    // Classification.
    (VariantType, Win) = (svType, win);

    // Evaluation state attached to this entry.
    (OverlapInfo, Sample) = (overlapInfo, sample);

    // Coordinates.
    (_baseInterval, PosInterval, EndInterval) = (baseInterval, posInterval, endInterval);
}
/// <summary>
/// Stores the supplied parts of a breakend pair, including the confidence intervals of both
/// ends. <c>Contig</c> is taken from <paramref name="baseVariant"/>; everything else is
/// assigned as given.
/// </summary>
private WittyerBndInternal([NotNull] WittyerType svType, [NotNull] IVcfVariant baseVariant,
    [NotNull] IInterval<uint> posInterval, [NotNull] IInterval<uint> ciPosInterval,
    [NotNull] IVcfVariant endOriginalVariant, [NotNull] IContigAndInterval endInterval,
    [NotNull] IInterval<uint> ciEndInterval, [NotNull] Winner win, [NotNull] IWittyerSample sample)
{
    // Location of both ends.
    (Contig, _posInterval, EndInterval) = (baseVariant.Contig, posInterval, endInterval);

    // Sample and the two VCF records this pair was built from.
    (Sample, OriginalVariant, EndOriginalVariant) = (sample, baseVariant, endOriginalVariant);

    // Classification.
    (Win, VariantType) = (win, svType);

    // Confidence intervals of POS and END.
    (CiPosInterval, CiEndInterval) = (ciPosInterval, ciEndInterval);
}
/// <summary>
/// Stores the supplied parts of a wittyer variant, including the confidence intervals of
/// POS and END. <c>Contig</c> is taken from <paramref name="baseVariant"/>; everything else
/// is assigned as given.
/// </summary>
private WittyerVariantInternal([NotNull] WittyerType svType, [NotNull] IVcfVariant baseVariant,
    [NotNull] IInterval<uint> baseInterval, [NotNull] Winner win,
    [NotNull] IContigAndInterval posInterval, [NotNull] IInterval<uint> ciPosInterval,
    [NotNull] IContigAndInterval endInterval, [NotNull] IInterval<uint> ciEndInterval,
    [NotNull] IWittyerSample sample)
{
    // Identity of the variant.
    (OriginalVariant, Contig) = (baseVariant, baseVariant.Contig);

    // Classification and sample.
    (VariantType, Win, Sample) = (svType, win, sample);

    // Coordinates.
    (_baseInterval, PosInterval, EndInterval) = (baseInterval, posInterval, endInterval);

    // Confidence intervals of POS and END.
    (CiPosInterval, CiEndInterval) = (ciPosInterval, ciEndInterval);
}
/// <summary>
/// Reads a two-valued confidence-interval INFO entry as an inclusive integer interval.
/// </summary>
/// <param name="variant">The variant whose INFO field is searched.</param>
/// <param name="tag">The INFO key to read.</param>
/// <returns>
/// <c>(0, 0)</c> when the tag is absent; otherwise the two parsed values, in file order.
/// </returns>
/// <remarks>
/// Throws the exception built by <c>VcfVariantFormatException.Create</c> when the entry does
/// not split into exactly two values. Unparsable values surface whatever
/// <see cref="int.Parse(string)"/> throws.
/// </remarks>
internal static InclusiveInterval<int> ParseCi([NotNull] this IVcfVariant variant, string tag)
{
    if (variant.Info.TryGetValue(tag, out var rawCi))
    {
        var pieces = rawCi.Split(WittyerConstants.InfoValueDel);
        if (pieces.Length == 2)
        {
            var lower = int.Parse(pieces[0]);
            var upper = int.Parse(pieces[1]);
            return new InclusiveInterval<int>(lower, upper);
        }

        throw VcfVariantFormatException.Create(variant.ToString(),
            ImmutableHashSet.Create(VcfColumn.Info), $"Invalid {tag} found: {rawCi}",
            variant.ToStrings().ToList().AsReadOnly());
    }

    // Tag not present: no uncertainty around the position.
    return new InclusiveInterval<int>(0, 0);
}
/// <summary>
/// Converts the variant to the contig naming style of the requested genome type.
/// </summary>
/// <param name="variant">The variant to convert.</param>
/// <param name="type">The target genome type.</param>
/// <returns>
/// The UCSC- or GRCh-style variant for <c>Ucsc</c>/<c>Grch</c>, or the variant itself for
/// <c>Unknown</c>.
/// </returns>
/// <exception cref="InvalidDataException">For any other genome type value.</exception>
internal static IVcfVariant ConvertGenomeType([NotNull] this IVcfVariant variant, GenomeType type)
{
    if (type == GenomeType.Ucsc)
    {
        return variant.ToUcscStyleVariant();
    }

    if (type == GenomeType.Grch)
    {
        return variant.ToGrchStyleVariant();
    }

    if (type == GenomeType.Unknown)
    {
        // Nothing to convert to.
        return variant;
    }

    throw new InvalidDataException(
        $"Not sure why there's a genometype {type.ToString()} in vcf which we are not supporting!");
}
/// <summary>
/// Tries to read the SVLEN INFO entry as an unsigned length, ignoring one leading minus sign.
/// </summary>
/// <param name="variant">The variant whose INFO field is searched.</param>
/// <param name="svLength">The parsed length on success; 0 otherwise.</param>
/// <returns>
/// Null on success; otherwise an <see cref="InvalidDataException"/> (returned, not thrown)
/// describing the missing or invalid SVLEN.
/// </returns>
internal static Exception TryGetSvLength([NotNull] this IVcfVariant variant, out uint svLength)
{
    svLength = default;

    if (!variant.Info.TryGetValue(VcfConstants.SvLenKey, out var rawLength))
    {
        return new InvalidDataException(
            $"Found a symbolic SV have no END or SVLEN key in info field, cannot process the variant \n{variant}");
    }

    // Only the magnitude matters, so a leading minus sign is dropped before parsing.
    var magnitudeText = rawLength.StartsWith(MinusSign) ? rawLength.Substring(1) : rawLength;

    if (uint.TryParse(magnitudeText, out svLength))
    {
        return null;
    }

    return new InvalidDataException($"Invalid value for {VcfConstants.SvLenKey} for variant\n{variant}");
}
/// <summary>
/// Determines the length of a variant: from its sequences when the alts are simple
/// sequences, otherwise from the END INFO tag, otherwise via <c>TryGetSvLength</c>.
/// </summary>
/// <param name="variant">The variant to measure.</param>
/// <returns>The length of the variant.</returns>
/// <exception cref="InvalidDataException">When END is present but is not a valid unsigned number.</exception>
/// <remarks>When END is absent, any exception returned by <c>TryGetSvLength</c> is thrown.</remarks>
internal static uint GetSvLength([NotNull] this IVcfVariant variant)
{
    // possible bug if more than one shared base or no shared bases! see https://jira.illumina.com/browse/WIT-86
    if (IsSimpleSequence(variant, out var sequenceDiff))
    {
        return sequenceDiff;
    }

    if (!variant.Info.TryGetValue(VcfConstants.EndTagKey, out var endText))
    {
        // No END tag: the length has to come from TryGetSvLength.
        var failure = TryGetSvLength(variant, out var svLength);
        if (failure != null)
        {
            throw failure;
        }

        return svLength;
    }

    if (!uint.TryParse(endText, out var endPosition))
    {
        throw new InvalidDataException(
            $"Invalid value for {VcfConstants.EndTagKey} for variant\n{variant}");
    }

    return GetAbsoluteDiff(endPosition, variant.Position);
}
/// <summary>
/// Creates a general breakend from a VCF breakend record.
/// </summary>
/// <param name="variant">The breakend record; its single alt is parsed for the mate.</param>
/// <returns>
/// A breakend whose interval is <c>(Position - 1, Position)</c>, or <c>(0, 1)</c> when the
/// position is 0.
/// </returns>
/// <exception cref="InvalidDataException">
/// When the alt neither starts (3' case) nor ends (otherwise) with the first ref base.
/// </exception>
public static IGeneralBnd CreateFromVariant([NotNull] IVcfVariant variant)
{
    var altBnd = variant.GetSingleAlt();
    var thisRef = variant.Ref[0];
    var mate = SimpleBreakEnd.Parse(altBnd, out var firstField, out var lastField);

    // The breakend counts as 3' when the first field produced by the parse is non-blank.
    var is3Prime = !string.IsNullOrWhiteSpace(firstField);
    if (is3Prime && !firstField.StartsWith(thisRef) || !is3Prime && !lastField.EndsWith(thisRef))
    {
        throw new InvalidDataException(
            $"Invalid breakend because neither the alt didn't start or end with ref's first base: {variant}");
    }

    // Position is unsigned, so "Position - 1" would wrap around for position 0. Use the
    // same zero-position convention as ConvertPositionToCiInterval: (0, 1).
    var position = variant.Position;
    var interval = BedInterval.Create(position > 0 ? position - 1 : 0, position == 0 ? 1 : position);
    return new GeneralBnd(variant, interval, is3Prime, mate);
}
/// <summary>
/// Converts a variant to a bed interval, normalizing simple-sequence variants (trimming
/// common bases) before measuring them.
/// </summary>
/// <param name="baseVariant">The variant to convert.</param>
/// <param name="throwException">Forwarded to <c>GetSvLength</c>.</param>
/// <param name="endVal">
/// The END value parsed by <c>GetSvLength</c> when there is one; otherwise the position
/// (plus the untrimmed length for simple-sequence variants).
/// </param>
/// <param name="sharedFirstBase">As reported by <c>GetSvLength</c> for the (possibly normalized) variant.</param>
/// <returns>The bed interval, or null when <c>GetSvLength</c> yields no length.</returns>
internal static IInterval<uint> ToBedInterval([NotNull] this IVcfVariant baseVariant,
    bool throwException, out uint endVal, out bool sharedFirstBase)
{
    endVal = baseVariant.Position;

    // The original end has to be captured before normalization changes the variant.
    var isSequence = IsSimpleSequence(baseVariant, out var untrimmedLength, out sharedFirstBase,
        out var sharedLastBase, false);
    if (isSequence)
    {
        endVal += untrimmedLength;
        if (baseVariant.Alts.Count > 0) // refsites don't have Alts in VariantUtils
        {
            baseVariant = baseVariant.TryNormalizeVariant(VariantNormalizer.TrimCommonBases, 0).GetOrThrow();
        }
    }

    // Both shared-base flags are re-evaluated here against the (possibly normalized) variant.
    var measuredLength = baseVariant.GetSvLength(throwException, out sharedFirstBase,
        out sharedLastBase, out var explicitEnd);
    if (!measuredLength.HasValue) // means insertion of unknown length.
    {
        return null;
    }

    if (explicitEnd.HasValue)
    {
        endVal = explicitEnd.Value;
    }

    // Shift back by one unless the first base is shared; position 0 cannot be shifted.
    uint bedStart;
    if (sharedFirstBase || baseVariant.Position == 0)
    {
        bedStart = baseVariant.Position;
    }
    else
    {
        bedStart = baseVariant.Position - 1;
    }

    var bedStop = bedStart + measuredLength.Value;
    if (sharedLastBase) // rare case
    {
        bedStop--;
    }

    return BedInterval.Create(bedStart, bedStop);
}
/// <summary>
/// Checks whether the first alt is a simple sequence and, if so, reports its length
/// relative to the ref and which flanking bases ref and alt share.
/// </summary>
/// <param name="variant">The variant to inspect.</param>
/// <param name="absoluteDiff">
/// On success, the longer of the ref/alt lengths, reduced by one when the first or last
/// base is shared; otherwise the default value.
/// </param>
/// <param name="sharedFirstBase">
/// On success, whether ref and alt start with the same base. True for symbolic alleles,
/// false when there are no alts.
/// </param>
/// <param name="sharedLastBase">
/// On success, whether ref and alt end with the same base (after the single-base corner
/// case has been resolved). False otherwise.
/// </param>
/// <param name="isTrimmed">Whether the variant is expected to have been trimmed already.</param>
/// <returns>True when the variant has alts and the first alt is a simple sequence.</returns>
/// <exception cref="InvalidDataException">
/// When <paramref name="isTrimmed"/> is true and ref and alt (both longer than one base)
/// still share their first and last base.
/// </exception>
internal static bool IsSimpleSequence([NotNull] this IVcfVariant variant, out uint absoluteDiff,
    out bool sharedFirstBase, out bool sharedLastBase, bool isTrimmed)
{
    absoluteDiff = default;
    sharedFirstBase = false;
    sharedLastBase = false;

    if (variant.Alts.Count == 0)
    {
        return false;
    }

    if (!variant.IsAltSimpleSequence(0))
    {
        // always assume the first base is shared in symbolic alleles;
        // the last base is never shared and the diff stays at its default.
        sharedFirstBase = true;
        return false;
    }

    var longest = (uint)Math.Max(variant.Ref.Length, variant.Alts[0].Length);
    var firstMatches = variant.Ref[0].Letter == variant.Alts[0][0];
    var lastMatches = variant.Ref.Last().Letter == variant.Alts[0].Last();

    if (isTrimmed && firstMatches && lastMatches)
    {
        if (variant.Ref.Length == 1 || variant.Alts[0].Length == 1)
        {
            // corner case like chr1 1 A AA would be true for both.
            lastMatches = false;
        }
        else
        {
            throw new InvalidDataException(
                "Somehow we got a variant that after trimming, shares first and last base: " + variant);
        }
    }

    // A shared flanking base does not count towards the length.
    absoluteDiff = firstMatches || lastMatches ? longest - 1 : longest;
    sharedFirstBase = firstMatches;
    sharedLastBase = lastMatches;
    return true;
}
/// <summary>
/// Rebuilds a variant with a single sample (named after the default truth or query sample)
/// that carries the original sample's fields plus the WIT ("not assessed") and WHY fields.
/// </summary>
/// <param name="baseVariant">The variant to rebuild.</param>
/// <param name="sample">The original sample whose fields are copied; may be null.</param>
/// <param name="why">The reason written to the WHY field.</param>
/// <param name="isTruth">Selects the default truth (true) or query (false) sample name.</param>
/// <exception cref="InvalidDataException">When the selected default sample name is null.</exception>
internal static IVcfVariant CreateUnsupportedVariant([NotNull] IVcfVariant baseVariant,
    [CanBeNull] IVcfSample sample, FailedReason why, bool isTruth)
{
    var selectedName = isTruth
        ? SamplePair.Default.TruthSampleName
        : SamplePair.Default.QuerySampleName;
    if (selectedName == null)
    {
        throw new InvalidDataException(
            $"Somehow, {nameof(SamplePair)}.{nameof(SamplePair.Default)} was null!!");
    }

    var realName = selectedName;
    var sampleBuilder = SampleDictionaries.CreateBuilder()
        .AddSample(realName)
        .MoveOnToDictionaries();

    // Existing fields first (none when there is no sample), then the two wittyer annotations.
    var originalFields =
        (sample?.SampleDictionary ?? ImmutableDictionary<string, string>.Empty.AsEnumerable())
        .Select(kvp => (kvp.Key, kvp.Value));
    var allFields = originalFields.FollowedWith(
        (WittyerConstants.WittyerMetaInfoLineKeys.Wit, NotAssessed),
        (WittyerConstants.WittyerMetaInfoLineKeys.Why, why.ToString()));

    foreach (var field in allFields)
    {
        sampleBuilder.SetSampleField(realName, field);
    }

    var rebuiltSamples = sampleBuilder.Build();
    return baseVariant.ToBuilder().SetSamples(rebuiltSamples).Build();
}
/// <summary>
/// Determines the wittyer variant type of a VCF variant for the given sample.
/// </summary>
/// <param name="variant">The variant to classify.</param>
/// <param name="sampleName">
/// Name of the sample to inspect; when null the first sample is used.
/// </param>
/// <returns>
/// <c>CopyNumberReference</c> for ref calls; <c>Cnv</c> when the inspected sample has a CN
/// field and the parsed type is one of <c>WittyerConstants.BaseLevelStatsTypes</c>; the
/// parsed type otherwise; for breakends, translocation or intra-chromosome breakend
/// depending on whether the mate is on a different contig.
/// </returns>
/// <exception cref="InvalidDataException">
/// When the SVTYPE INFO key is missing (for non-ref calls) or its value is not recognized.
/// </exception>
internal static WittyerVariantType ParseWittyerVariantType([NotNull] this IVcfVariant variant,
    [CanBeNull] string sampleName)
{
    if (variant.IsRefCall(sampleName))
    {
        return WittyerVariantType.CopyNumberReference;
    }

    // anything NOT a refcall requires SVTYPE INFO key
    if (!variant.Info.TryGetValue(VcfConstants.SvTypeKey, out var svTypeStr))
    {
        throw new InvalidDataException(
            $"Following variant does not have {VcfConstants.SvTypeKey} info key:\n{variant}");
    }

    if (TryParseEnumOrDescription(svTypeStr, out WittyerVariantType svType))
    {
        // Look at the sample that was actually requested (same selection as IsRefCall)
        // rather than always the first one, so multi-sample records are classified by the
        // sample under evaluation.
        if (variant.Samples.Count > 0
            && (sampleName == null ? variant.Samples[0] : variant.Samples[sampleName])
                .SampleDictionary.ContainsKey(VcfConstants.CnSampleFieldKey)
            && WittyerConstants.BaseLevelStatsTypes.Contains(svType))
        {
            return WittyerVariantType.Cnv;
        }

        return svType;
    }

    if (!TryParseEnumOrDescription(svTypeStr, out SvType type)
        || !type.Equals(SvType.TranslocationBreakend))
    {
        throw new InvalidDataException($"Cannot recognize {svTypeStr}");
    }

    // Breakends are split by whether the mate lies on the same contig.
    var bnd = variant as IGeneralBnd ?? GeneralBnd.Create(variant);
    return !bnd.Contig.Equals(bnd.Mate.Contig)
        ? WittyerVariantType.TranslocationBreakend
        : WittyerVariantType.IntraChromosomeBreakend;
}
/// <summary>
/// Creates a wittyer variant from a VCF variant and one of its samples.
/// </summary>
/// <param name="baseVariant">The underlying VCF variant.</param>
/// <param name="sample">The sample to wrap; may be null.</param>
/// <param name="svType">The wittyer type of the variant.</param>
/// <param name="bins">Bins forwarded to <c>Winner.Create</c>.</param>
/// <param name="percentageDistance">Forwarded to <c>WittyerUtils.GetPosAndEndInterval</c>; may be null.</param>
/// <param name="basepairDistance">Forwarded to <c>WittyerUtils.GetPosAndEndInterval</c>.</param>
/// <exception cref="InvalidOperationException">
/// When <c>ToBedInterval</c> returns null even though it was asked to throw on failure.
/// </exception>
internal static IWittyerVariant Create([NotNull] IVcfVariant baseVariant,
    [CanBeNull] IVcfSample sample, [NotNull] WittyerType svType, [NotNull] IReadOnlyList<uint> bins,
    [CanBeNull] double? percentageDistance, uint basepairDistance)
{
    // originalEnd is needed to anchor CIEND for PD/BPD; it is not used for reflen or binning.
    var trimmedInterval = baseVariant.ToBedInterval(true, out var originalEnd, out var sharedFirstBase)
        ?? throw new InvalidOperationException(
            $"Expected failure of {nameof(WittyerUtils.ToBedInterval)} to throw, but didn't...");

    // CI intervals are always based on the original POS/END.
    var ciAnchor = sharedFirstBase ? baseVariant.Position + 1 : baseVariant.Position;
    var ciPosInterval = ciAnchor.ConvertPositionToCiInterval(baseVariant, WittyerConstants.Cipos);
    var ciEndInterval = originalEnd.ConvertPositionToCiInterval(baseVariant, WittyerConstants.Ciend);

    // not sharing first base (ref site or complex types, etc) needs a one-base adjustment.
    var trimmedStart = trimmedInterval.Start;
    if (!sharedFirstBase)
    {
        trimmedStart++;
    }

    // the pd/bpd intervals are based on the trimmed variant's coordinates.
    var (posInterval, endInterval) = WittyerUtils.GetPosAndEndInterval(baseVariant.Contig,
        percentageDistance, basepairDistance, ciPosInterval, trimmedStart, ciEndInterval,
        trimmedInterval.Stop);

    var winner = Winner.Create(svType, trimmedInterval, bins);
    var wittyerSample = WittyerSample.CreateFromVariant(baseVariant, sample,
        svType == WittyerType.CopyNumberReference);

    return new WittyerVariantInternal(svType, baseVariant, trimmedInterval, winner, posInterval,
        ciPosInterval, endInterval, ciEndInterval, wittyerSample);
}
/// <summary>
/// Renders a variant as a single VCF line with INFO keys sorted and one or two sample columns.
/// </summary>
/// <param name="variant">The variant to render; must have at least one sample.</param>
/// <param name="isTruth">
/// Null: one sample column from the first sample (the missing-value string when every value
/// is empty or missing). True: first sample's values followed by a missing-value column.
/// False: a missing-value column followed by the second sample's values.
/// </param>
/// <returns>The columns joined with <c>ColumnDelimiter</c>.</returns>
internal static string ToString([NotNull] IVcfVariant variant, bool? isTruth)
{
    // Everything before the FORMAT column is taken from the variant as-is.
    var columns = variant.ToStrings().Take(FormatIndex).ToList();

    // order the info fields
    if (columns[InfoIndex] != MissingValueString)
    {
        columns[InfoIndex] = variant.Info
            .OrderBy(kvp => kvp.Key)
            .Select(kvp => kvp.Value.IsNullOrEmpty()
                ? kvp.Key
                : $"{kvp.Key}{InfoFieldKeyValueDelimiter}{kvp.Value}")
            .StringJoin(InfoFieldDelimiter);
    }

    // add format column (keys always come from the first sample)
    columns.Add(variant.Samples[0].SampleDictionary.Keys.StringJoin(SampleFieldDelimiter));

    switch (isTruth)
    {
        case null:
        {
            var onlyValues = variant.Samples[0].SampleDictionary.Values;
            var nothingToShow = onlyValues.All(it => string.IsNullOrEmpty(it) || it == MissingValueString);
            columns.Add(nothingToShow ? MissingValueString : onlyValues.StringJoin(SampleFieldDelimiter));
            break;
        }
        case true:
            columns.Add(variant.Samples[0].SampleDictionary.Values.StringJoin(SampleFieldDelimiter));
            columns.Add(MissingValueString);
            break;
        default:
            columns.Add(MissingValueString);
            columns.Add(variant.Samples[1].SampleDictionary.Values.StringJoin(SampleFieldDelimiter));
            break;
    }

    return columns.StringJoin(ColumnDelimiter);
}
/// <summary>
/// Creates a reference sample whose copy number equals its ploidy and whose genotype is the
/// reference genotype for that ploidy.
/// </summary>
/// <param name="baseVariant">Not read by this method.</param>
/// <param name="sample">
/// The VCF sample; may be null. When it has a GT field, ploidy is the number of alleles in
/// it and phasing is taken from whether it contains the phased delimiter; otherwise ploidy
/// is 2 and the genotype is unphased.
/// </param>
internal static IWittyerGenotypedCopyNumberSample CreateReferenceSample(
    [NotNull] IVcfVariant baseVariant, [CanBeNull] IVcfSample sample)
{
    var ploidy = 2; // used whenever no GT field says otherwise

    if (sample == null)
    {
        var emptyCopyNumber = WittyerCopyNumberSample.Create(WittyerSampleInternal.Create(null), (uint)ploidy);
        return WittyerGenotypedCopyNumberSample.Create(emptyCopyNumber, GenotypeInfo.CreateRef(ploidy, false));
    }

    var phased = false;
    if (sample.SampleDictionary.TryGetValue(VcfConstants.GenotypeKey, out var genotype))
    {
        phased = genotype.Contains(VcfConstants.GtPhasedValueDelimiter);
        var delimiter = phased
            ? VcfConstants.GtPhasedValueDelimiter
            : VcfConstants.GtUnphasedValueDelimiter;
        ploidy = genotype.Split(delimiter).Length;
    }

    var copyNumberSample = WittyerCopyNumberSample.Create(WittyerSampleInternal.Create(sample), (uint)ploidy);
    var referenceGenotype = GenotypeInfo.CreateRef(ploidy, phased);
    return WittyerGenotypedCopyNumberSample.Create(copyNumberSample, referenceGenotype);
}
/// <summary>
/// Tells whether the named sample passes its sample-level filter.
/// </summary>
/// <param name="variant">The variant owning the sample.</param>
/// <param name="name">The sample name; may be null.</param>
/// <returns>
/// True when no name is given or the variant has no samples; otherwise the result of
/// <c>IsSampleFtPassFilter</c> on the named sample.
/// </returns>
internal static bool IsSamplePassFilter([NotNull] this IVcfVariant variant, [CanBeNull] string name)
{
    // Nothing to check against: treat as passing.
    if (name == null || variant.Samples.Count == 0)
    {
        return true;
    }

    return variant.Samples[name].IsSampleFtPassFilter();
}
/// <summary>
/// Tells whether a variant's filters satisfy the include/exclude filter sets.
/// </summary>
/// <param name="variant">The variant whose filters are checked.</param>
/// <param name="includedFilters">Filters to include; an empty set means "include everything".</param>
/// <param name="excludedFilters">Filters that disqualify the variant.</param>
/// <returns>
/// False when any filter is excluded; otherwise true when the include set is empty or
/// contains at least one of the variant's filters.
/// </returns>
internal static bool IsFilterIncluded([NotNull] this IVcfVariant variant,
    IImmutableSet<string> includedFilters, IImmutableSet<string> excludedFilters)
{
    // Exclusion wins over inclusion.
    if (variant.Filters.Any(excludedFilters.Contains))
    {
        return false;
    }

    if (includedFilters.Count == 0)
    {
        return true;
    }

    return variant.Filters.Any(includedFilters.Contains);
}
/// <inheritdoc />
public void AddUnsupported(IVcfVariant variant)
{
    // Pure delegation to the wrapped result.
    _baseResult.AddUnsupported(variant);
}
/// <summary>
/// Adds a variant to the builder's not-supported collection.
/// </summary>
/// <param name="variant">The variant to add.</param>
/// <returns>This builder, so calls can be chained.</returns>
internal Builder AddNonSupported(IVcfVariant variant)
{
    // The result of Add is stored back into the field rather than relying on in-place mutation.
    var withVariant = _notSupported.Add(variant);
    _notSupported = withVariant;
    return this;
}