/// <summary>
/// Lazily produces one <see cref="OneKGenItem"/> per alternate allele found on a VCF line.
/// </summary>
/// <param name="vcfLine">A tab-delimited VCF data line.</param>
/// <returns>
/// The trimmed and left-aligned items. The sequence is empty when the line has fewer than
/// eight columns, names a chromosome that is not in the reference dictionary, or lists an
/// alternate allele of the form &lt;...&gt;.
/// </returns>
internal IEnumerable<OneKGenItem> ExtractItems(string vcfLine)
{
    // nothing to the right of the INFO column is read
    var columns = vcfLine.OptimizedSplit('\t');
    if (columns.Length < 8)
    {
        yield break;
    }

    // reset the per-line parsing state before INFO is parsed again
    Clear();

    if (!_refNameDictionary.TryGetValue(columns[VcfCommon.ChromIndex], out var chromosome))
    {
        yield break;
    }

    var position   = int.Parse(columns[VcfCommon.PosIndex]);
    var refAllele  = columns[VcfCommon.RefIndex];
    var altAlleles = columns[VcfCommon.AltIndex].OptimizedSplit(',');
    var infoFields = columns[VcfCommon.InfoIndex];

    // a single symbolic allele disqualifies the whole line
    var hasSymbolicAllele = altAlleles.Any(allele => allele.OptimizedStartsWith('<') && allele.OptimizedEndsWith('>'));
    if (hasSymbolicAllele)
    {
        yield break;
    }

    // fills the ancestral allele, allele counts and allele numbers used below
    // ReSharper disable once ConditionIsAlwaysTrueOrFalse
    ParseInfoField(infoFields, hasSymbolicAllele);

    for (var alleleIndex = 0; alleleIndex < altAlleles.Length; alleleIndex++)
    {
        var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(
            position, refAllele, altAlleles[alleleIndex], _sequenceProvider.Sequence);

        // allele counts are looked up per allele index, allele numbers are shared by the line
        yield return new OneKGenItem(
            chromosome,
            shiftedPos,
            shiftedRef,
            shiftedAlt,
            _ancestralAllele,
            GetAlleleCount(_allAlleleCounts, alleleIndex),
            GetAlleleCount(_afrAlleleCounts, alleleIndex),
            GetAlleleCount(_amrAlleleCounts, alleleIndex),
            GetAlleleCount(_eurAlleleCounts, alleleIndex),
            GetAlleleCount(_easAlleleCounts, alleleIndex),
            GetAlleleCount(_sasAlleleCounts, alleleIndex),
            _allAlleleNumber,
            _afrAlleleNumber,
            _amrAlleleNumber,
            _eurAlleleNumber,
            _easAlleleNumber,
            _sasAlleleNumber);
    }
}
/// <summary>
/// Builds the variant objects for one VCF position.
/// </summary>
/// <param name="chromosome">Chromosome the position lies on; loaded into the sequence provider before alignment.</param>
/// <param name="start">Start coordinate as given by the caller.</param>
/// <param name="end">End coordinate as given by the caller.</param>
/// <param name="refAllele">Reference allele.</param>
/// <param name="altAlleles">Alternate alleles; the first one determines the variant category for all of them.</param>
/// <param name="infoData">Parsed INFO data; its <c>SvType</c> feeds the category decision.</param>
/// <param name="isDecomposed">Per-allele decomposition flags, indexed like <paramref name="altAlleles"/>.</param>
/// <param name="isRecomposed">Recomposition flag passed through to every variant.</param>
/// <param name="linkedVids">Optional per-allele linked variant ids; may be null, and individual entries may be null.</param>
/// <param name="globalMajorAllele">When non-null the position is treated as a reference site.</param>
/// <returns>
/// A single-element array for a reference site; otherwise one variant per retained alternate
/// allele, or <c>null</c> when no allele was retained.
/// </returns>
public IVariant[] CreateVariants(IChromosome chromosome, int start, int end, string refAllele,
    string[] altAlleles, IInfoData infoData, bool[] isDecomposed, bool isRecomposed,
    List<string>[] linkedVids, string globalMajorAllele)
{
    string firstAltAllele = altAlleles[0];
    bool isReference      = globalMajorAllele != null;
    bool isSymbolicAllele = IsSymbolicAllele(firstAltAllele);

    var variantCategory = GetVariantCategory(firstAltAllele, isReference, isSymbolicAllele, infoData.SvType);

    // reference sites are reported as-is: no chromosome load, no trimming, no left alignment
    if (isReference)
    {
        return new[]
        {
            GetVariant(chromosome, start, end, refAllele, firstAltAllele, infoData, variantCategory,
                isDecomposed[0], isRecomposed, linkedVids?[0]?.ToArray(), globalMajorAllele)
        };
    }

    _sequenceProvider.LoadChromosome(chromosome);

    var variants = new List<IVariant>();

    for (var i = 0; i < altAlleles.Length; i++)
    {
#if (!NI_ALLELE)
        if (VcfCommon.IsNonInformativeAltAllele(altAlleles[i]))
        {
            continue;
        }
#endif
        bool isDecomposedVar = isDecomposed[i];

        (int shiftedStart, string shiftedRef, string shiftedAlt) =
            VariantUtils.TrimAndLeftAlign(start, refAllele, altAlleles[i], _sequenceProvider.Sequence);

        // the end coordinate is moved by the same offset the start was moved
        int shiftedEnd = end - (start - shiftedStart);

        variants.Add(GetVariant(chromosome, shiftedStart, shiftedEnd, shiftedRef, shiftedAlt, infoData,
            variantCategory, isDecomposedVar, isRecomposed, linkedVids?[i]?.ToArray(), null));
    }

    return variants.Count == 0 ? null : variants.ToArray();
}
/// <summary>
/// Extracts the dbSNP items described by the specified VCF line, one per alternate allele.
/// </summary>
/// <param name="vcfLine">A tab-delimited VCF data line; only the first five columns are used.</param>
/// <returns>
/// One trimmed and left-aligned <see cref="DbSnpItem"/> per alternate allele. The sequence is
/// empty when the line has fewer than five columns or names an unknown chromosome.
/// </returns>
public IEnumerable<DbSnpItem> ExtractItem(string vcfLine)
{
    // CHROM..ALT are split out; whatever follows stays unsplit in the last element
    var columns = vcfLine.Split('\t', 6);
    if (columns.Length < 5)
    {
        yield break;
    }

    if (!_sequenceProvider.RefNameToChromosome.TryGetValue(columns[VcfCommon.ChromIndex], out var chromosome))
    {
        yield break;
    }

    var position = int.Parse(columns[VcfCommon.PosIndex]);

    // the numeric id is whatever follows the first two characters of the ID column
    var dbSnpId = Convert.ToInt64(columns[VcfCommon.IdIndex].Substring(2));

    var refAllele = columns[VcfCommon.RefIndex];

    foreach (var altAllele in columns[VcfCommon.AltIndex].OptimizedSplit(','))
    {
        var (shiftedPos, shiftedRef, shiftedAlt) =
            VariantUtils.TrimAndLeftAlign(position, refAllele, altAllele, _sequenceProvider.Sequence);

        yield return new DbSnpItem(chromosome, shiftedPos, dbSnpId, shiftedRef, shiftedAlt);
    }
}
// Checks that a variant given with a padding base ("AT" -> "ATT") comes back with the
// shared prefix removed: the start moves from 501 to 502 and the alt allele becomes "T".
public void Left_align_multiple_padding_bases()
{
    // the test sequence is preceded by MaxUpstreamLength copies of 'A'
    string upstreamPadding = new string('A', VariantUtils.MaxUpstreamLength);
    var referenceSequence  = new SimpleSequence(upstreamPadding + "ATGTGTTGTTATTCTGTGTGCAT");

    var (actualStart, _, actualAlt) = VariantUtils.TrimAndLeftAlign(501, "AT", "ATT", referenceSequence);

    Assert.Equal(502, actualStart);
    Assert.Equal("T", actualAlt);
}
// Parameterised check: an insertion (empty reference allele) at `position` with allele
// `altAllele` must be reported at `rotatedPos` with allele `rotatedAlt`.
public void Left_align_insertion(int position, string altAllele, int rotatedPos, string rotatedAlt)
{
    // the test sequence is preceded by MaxUpstreamLength copies of 'A'
    string upstreamPadding = new string('A', VariantUtils.MaxUpstreamLength);
    var referenceSequence  = new SimpleSequence(upstreamPadding + "ATGTGTTGTTATTCTGTGTGCAT");

    var (actualStart, _, actualAlt) = VariantUtils.TrimAndLeftAlign(position, "", altAllele, referenceSequence);

    Assert.Equal(rotatedPos, actualStart);
    Assert.Equal(rotatedAlt, actualAlt);
}
/// <summary>
/// Converts one VCF line into allele-frequency items, one per alternate allele with a non-zero count.
/// </summary>
/// <param name="vcfLine">A tab-delimited VCF data line.</param>
/// <returns>
/// The trimmed and left-aligned items, or <c>null</c> when the line has fewer than eight
/// columns, names an unknown chromosome, carries no total allele number, or yields no item.
/// </returns>
private List<AlleleFrequencyItem> ExtractItems(string vcfLine)
{
    // columns after INFO stay unsplit in the ninth element and are never read
    var columns = vcfLine.Split(new[] { '\t' }, 9);
    if (columns.Length < 8)
    {
        return null;
    }

    Clear();

    if (!_refNameDictionary.TryGetValue(columns[VcfCommon.ChromIndex], out var chromosome))
    {
        return null;
    }

    var position   = int.Parse(columns[VcfCommon.PosIndex]);
    var refAllele  = columns[VcfCommon.RefIndex];
    var altAlleles = columns[VcfCommon.AltIndex].OptimizedSplit(',');

    // fills the allele counts and the total allele number used below
    ParseInfoField(columns[VcfCommon.InfoIndex]);

    // without a total allele number no frequency can be derived
    if (_allAlleleNumber == null)
    {
        return null;
    }

    var frequencyItems = new List<AlleleFrequencyItem>();

    for (var alleleIndex = 0; alleleIndex < altAlleles.Length; alleleIndex++)
    {
        var alleleCount = GetAlleleCount(_allAlleleCounts, alleleIndex);

        // alleles that were never observed are not reported
        if (!alleleCount.HasValue || alleleCount.Value == 0)
        {
            continue;
        }

        var frequency = 1.0 * alleleCount.Value / _allAlleleNumber.Value;

        var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(
            position, refAllele, altAlleles[alleleIndex], _sequenceProvider.Sequence);

        frequencyItems.Add(new AlleleFrequencyItem(chromosome, shiftedPos, shiftedRef, shiftedAlt, frequency));
    }

    if (frequencyItems.Count == 0)
    {
        return null;
    }

    return frequencyItems;
}
/// <summary>
/// Converts one VCF line into a single <see cref="TopMedItem"/>.
/// </summary>
/// <param name="vcfLine">A tab-delimited VCF data line; may be null.</param>
/// <returns>
/// The trimmed and left-aligned item, or <c>null</c> when the line is null, has fewer than
/// eight columns, names an unknown chromosome, or reports an allele number of zero.
/// </returns>
/// <exception cref="InvalidDataException">The ALT column lists more than one allele.</exception>
private TopMedItem ExtractItems(string vcfLine)
{
    if (vcfLine == null)
    {
        return null;
    }

    var columns = vcfLine.OptimizedSplit('\t');
    if (columns.Length < 8)
    {
        return null;
    }

    Clear();

    // example line:
    // chr1 10169 TOPMed_freeze_5?chr1:10,169 T C 255 SVM VRT=1;NS=62784;AN=125568;AC=20;AF=0.000159276;Het=20;Hom=0 NA:FRQ 125568:0.000159276
    if (!_refChromDict.TryGetValue(columns[VcfCommon.ChromIndex], out var chrom))
    {
        return null;
    }

    var position   = int.Parse(columns[VcfCommon.PosIndex]);
    var refAllele  = columns[VcfCommon.RefIndex];
    var altAllele  = columns[VcfCommon.AltIndex];
    var filters    = columns[VcfCommon.FilterIndex];
    var infoFields = columns[VcfCommon.InfoIndex];

    // exactly one alternate allele per line is supported; echo the offending line first
    if (altAllele.Contains(","))
    {
        Console.WriteLine(vcfLine);
        throw new InvalidDataException("het site found!!");
    }

    // anything other than "PASS" or "." counts as a failed filter
    var failedFilter = filters != "PASS" && filters != ".";

    // fills the allele number, allele count and homozygous count used below
    ParseInfoField(infoFields);

    if (_alleleNum == 0)
    {
        return null;
    }

    var (shiftedPos, shiftedRef, shiftedAlt) =
        VariantUtils.TrimAndLeftAlign(position, refAllele, altAllele, _sequenceProvider.Sequence);

    return new TopMedItem(chrom, shiftedPos, shiftedRef, shiftedAlt, _alleleNum, _alleleCount, _homCount, failedFilter);
}
/// <summary>
/// Filters and normalises ClinVar items: reconstructs missing alleles from the reference
/// sequence where the variant type allows it, trims and left-aligns, then sorts.
/// </summary>
/// <param name="clinVarItems">Items to validate.</param>
/// <returns>
/// A new sorted list of rebuilt items. Items that fail reference-allele validation, or that
/// still have neither a reference nor an alternate allele, are left out.
/// </returns>
private List<ClinVarItem> GetValidItems(List<ClinVarItem> clinVarItems)
{
    var validItems = new List<ClinVarItem>();

    foreach (var item in clinVarItems)
    {
        _sequenceProvider.LoadChromosome(item.Chromosome);

        if (!ValidateRefAllele(item))
        {
            continue;
        }

        string refAllele = item.RefAllele;
        string altAllele = item.AltAllele;

        bool isRefMissing = string.IsNullOrEmpty(item.RefAllele);
        bool isAltMissing = string.IsNullOrEmpty(item.AltAllele);

        // a missing reference allele is rebuilt for deletions, and for indels that do have an alt allele
        bool canRebuildRef = item.VariantType == "Deletion" || (item.VariantType == "Indel" && !isAltMissing);
        if (isRefMissing && canRebuildRef)
        {
            refAllele = GetReferenceAllele(item, _sequenceProvider.Sequence);
        }

        // a missing alternate allele is rebuilt for duplications only
        if (isAltMissing && item.VariantType == "Duplication")
        {
            altAllele = GetAltAllele(item, _sequenceProvider.Sequence);
        }

        // nothing usable remains when both alleles are still empty
        if (string.IsNullOrEmpty(refAllele) && string.IsNullOrEmpty(altAllele))
        {
            continue;
        }

        var (shiftedStart, shiftedRef, shiftedAlt) =
            VariantUtils.TrimAndLeftAlign(item.Position, refAllele, altAllele, _sequenceProvider.Sequence);

        // only start and alleles are replaced; every other field is carried over unchanged
        var shiftedItem = new ClinVarItem(
            item.Chromosome,
            shiftedStart,
            item.Stop,
            shiftedRef,
            shiftedAlt,
            item.JsonSchema,
            item.AlleleOrigins,
            item.VariantType,
            item.Id,
            item.VariationId,
            item.ReviewStatus,
            item.MedGenIds,
            item.OmimIds,
            item.OrphanetIds,
            item.Phenotypes,
            item.Significances,
            item.PubmedIds,
            item.LastUpdatedDate);

        validItems.Add(shiftedItem);
    }

    validItems.Sort();
    return validItems;
}
/// <summary>
/// Converts one VCF line into ancestral-allele items, one per alternate allele.
/// </summary>
/// <param name="vcfLine">A tab-delimited VCF data line.</param>
/// <returns>
/// The trimmed and left-aligned items, or <c>null</c> when the line has fewer than eight
/// columns, names an unknown chromosome, or lists an alternate allele of the form &lt;...&gt;.
/// </returns>
private List<AncestralAlleleItem> ExtractItems(string vcfLine)
{
    // columns after INFO stay unsplit in the ninth element and are never read
    var columns = vcfLine.Split(new[] { '\t' }, 9);
    if (columns.Length < 8)
    {
        return null;
    }

    Clear();

    if (!_refNameDictionary.TryGetValue(columns[VcfCommon.ChromIndex], out var chromosome))
    {
        return null;
    }

    var position   = int.Parse(columns[VcfCommon.PosIndex]);
    var refAllele  = columns[VcfCommon.RefIndex];
    var altAlleles = columns[VcfCommon.AltIndex].OptimizedSplit(',');
    var infoFields = columns[VcfCommon.InfoIndex];

    // a single symbolic allele disqualifies the whole line
    if (altAlleles.Any(allele => allele.OptimizedStartsWith('<') && allele.OptimizedEndsWith('>')))
    {
        return null;
    }

    // fills the ancestral allele attached to every item below
    ParseInfoField(infoFields);

    var items = new List<AncestralAlleleItem>();

    for (var alleleIndex = 0; alleleIndex < altAlleles.Length; alleleIndex++)
    {
        var (shiftedPos, shiftedRef, shiftedAlt) = VariantUtils.TrimAndLeftAlign(
            position, refAllele, altAlleles[alleleIndex], _sequenceProvider.Sequence);

        items.Add(new AncestralAlleleItem(chromosome, shiftedPos, shiftedRef, shiftedAlt, _ancestralAllele));
    }

    return items;
}
/// <summary>
/// Records the left-aligned position of every alternate allele of a variant under its chromosome.
/// </summary>
/// <param name="chromPositions">Per-chromosome position lists; a list is created on first use of a chromosome.</param>
/// <param name="chromosome">Chromosome the variant lies on.</param>
/// <param name="position">Position of the variant before trimming and left alignment.</param>
/// <param name="refAllele">Reference allele.</param>
/// <param name="altAllele">Comma-separated alternate alleles.</param>
/// <param name="refSequence">Reference sequence used for left alignment.</param>
/// <remarks>
/// Alleles that start with '&lt;' or contain '[' or ']' are skipped. The check is made per
/// allele, so one bracketed allele no longer suppresses the other alleles on the same line.
/// </remarks>
public static void UpdateChromToPositions(Dictionary<IChromosome, List<int>> chromPositions, IChromosome chromosome,
    int position, string refAllele, string altAllele, ISequence refSequence)
{
    // one lookup serves both the "create on first use" path and the appends below
    if (!chromPositions.TryGetValue(chromosome, out var positions))
    {
        positions = new List<int>(16 * 1024);
        chromPositions.Add(chromosome, positions);
    }

    foreach (string allele in altAllele.OptimizedSplit(','))
    {
        // fix: the ']' test previously inspected the whole ALT string instead of this allele
        if (allele.OptimizedStartsWith('<') || allele.Contains('[') || allele.Contains(']'))
        {
            continue;
        }

        var (shiftedPos, _, _) = VariantUtils.TrimAndLeftAlign(position, refAllele, allele, refSequence);
        positions.Add(shiftedPos);
    }
}
/// <summary>
/// Builds the variant objects for one VCF position.
/// </summary>
/// <param name="chromosome">Chromosome the position lies on.</param>
/// <param name="start">Start coordinate as given by the caller.</param>
/// <param name="end">End coordinate as given by the caller.</param>
/// <param name="refAllele">Reference allele.</param>
/// <param name="altAlleles">Alternate alleles; the first one determines the variant category for all of them.</param>
/// <param name="infoData">Parsed INFO data; its <c>SvType</c> feeds the category decision.</param>
/// <param name="isDecomposedByAllele">Per-allele decomposition flags, indexed like <paramref name="altAlleles"/>.</param>
/// <param name="isRecomposed">Recomposition flag passed through to every variant.</param>
/// <param name="linkedVids">Optional per-allele linked variant ids; may be null, and individual entries may be null.</param>
/// <param name="globalMajorAllele">When non-null the position is delegated to <c>ReferenceVariantCreator</c>.</param>
/// <returns>
/// The result of <c>ReferenceVariantCreator.Create</c> for a reference site; otherwise one variant
/// per retained alternate allele, or <c>null</c> when no allele was retained.
/// </returns>
/// <exception cref="InvalidDataException">An allele is flagged as both decomposed and recomposed.</exception>
public IVariant[] CreateVariants(IChromosome chromosome, int start, int end, string refAllele,
    string[] altAlleles, IInfoData infoData, bool[] isDecomposedByAllele, bool isRecomposed,
    List<string>[] linkedVids, string globalMajorAllele)
{
    if (globalMajorAllele != null)
    {
        return ReferenceVariantCreator.Create(_vidCreator, _sequence, chromosome, start, end, refAllele,
            altAlleles[0], globalMajorAllele);
    }

    var variantCategory = GetVariantCategory(altAlleles[0], infoData.SvType);

    // for these categories the end is derived from the aligned reference allele
    // instead of being taken from the caller
    bool endFollowsRefAllele = variantCategory == VariantCategory.SmallVariant ||
                               variantCategory == VariantCategory.Reference;

    var variants = new List<IVariant>(altAlleles.Length);

    for (var alleleIndex = 0; alleleIndex < altAlleles.Length; alleleIndex++)
    {
        string altAllele = altAlleles[alleleIndex];

#if (!NI_ALLELE)
        if (VcfCommon.IsNonInformativeAltAllele(altAllele))
        {
            continue;
        }
#endif

        bool isDecomposed = isDecomposedByAllele[alleleIndex];
        if (isDecomposed && isRecomposed)
        {
            throw new InvalidDataException("A variant can't be both decomposed and recomposed");
        }

        var (shiftedStart, shiftedRef, shiftedAlt) =
            VariantUtils.TrimAndLeftAlign(start, refAllele, altAllele, _sequence);

        int variantEnd = endFollowsRefAllele ? shiftedStart + shiftedRef.Length - 1 : end;

        variants.Add(GetVariant(chromosome, shiftedStart, variantEnd, shiftedRef, shiftedAlt, infoData,
            variantCategory, isDecomposed, isRecomposed, linkedVids?[alleleIndex]?.ToArray()));
    }

    if (variants.Count == 0)
    {
        return null;
    }

    return variants.ToArray();
}
/// <summary>
/// Converts one VCF line into COSMIC items, one per alternate allele.
/// </summary>
/// <param name="vcfLine">A tab-delimited VCF data line.</param>
/// <returns>
/// The trimmed and left-aligned items, or <c>null</c> when the line does not reach the INFO
/// column, when its REF or ALT column is longer than <c>MaxVariantLength</c>, or when it names
/// an unknown chromosome.
/// </returns>
internal List<CosmicItem> ExtractCosmicItems(string vcfLine)
{
    // Split off every column through INFO plus one extra bucket for whatever follows, so the
    // INFO string never has trailing columns glued onto it.
    var splitLine = vcfLine.Split(new[] { '\t' }, VcfCommon.InfoIndex + 2);

    // a truncated line would otherwise fail with an IndexOutOfRangeException below
    if (splitLine.Length <= VcfCommon.InfoIndex)
    {
        return null;
    }

    string refAllele = splitLine[VcfCommon.RefIndex];
    string altColumn = splitLine[VcfCommon.AltIndex];

    // skipping large variants (the ALT column is measured as a whole, before splitting on ',')
    if (refAllele.Length > MaxVariantLength || altColumn.Length > MaxVariantLength)
    {
        return null;
    }

    if (!_refChromDict.TryGetValue(splitLine[VcfCommon.ChromIndex], out var chromosome))
    {
        return null;
    }

    int position     = int.Parse(splitLine[VcfCommon.PosIndex]);
    string cosmicId  = splitLine[VcfCommon.IdIndex];
    var altAlleles   = altColumn.OptimizedSplit(',');
    string infoField = splitLine[VcfCommon.InfoIndex];

    Clear();

    // fills the gene name and sample count used below
    ParseInfoField(infoField);

    // the study lookup depends only on the id, so it is done once for all alleles;
    // an id without studies yields items with a null study set
    if (!_studies.TryGetValue(cosmicId, out var studies))
    {
        studies = null;
    }

    var cosmicItems = new List<CosmicItem>();

    foreach (string altAllele in altAlleles)
    {
        var (shiftedPos, shiftedRef, shiftedAlt) =
            VariantUtils.TrimAndLeftAlign(position, refAllele, altAllele, _sequenceProvider.Sequence);

        cosmicItems.Add(new CosmicItem(chromosome, shiftedPos, cosmicId, shiftedRef, shiftedAlt, _geneName,
            studies, _sampleCount));
    }

    return cosmicItems;
}
private string GetNextChromDestinations(string line) { //extracting current chrom info from first line provided var currentChromName = line.Split('\t', 2)[VcfCommon.ChromIndex]; Console.Write($"Getting destinations for chromosome:{currentChromName}..."); var currentChrom = ReferenceNameUtilities.GetChromosome(_desSequenceProvider.RefNameToChromosome, currentChromName); _desSequenceProvider.LoadChromosome(currentChrom); do { var splits = line.Split('\t', VcfCommon.InfoIndex); var chrom = splits[VcfCommon.ChromIndex]; if (chrom != currentChromName) { break; } var refAllele = splits[VcfCommon.RefIndex]; var altAlleles = splits[VcfCommon.AltIndex].Split(','); var position = int.Parse(splits[VcfCommon.PosIndex]); var rsIds = Utilities.GetRsids(splits[VcfCommon.IdIndex]); if (rsIds == null) { continue; } var processedVariants = altAlleles.Select(x => VariantUtils.TrimAndLeftAlign(position, refAllele, x, _desSequenceProvider.Sequence)).ToArray(); foreach (var(start, variantRef, variantAlt) in processedVariants) { foreach (var rsId in rsIds) { if (!_destinationVariants.TryGetValue((rsId, variantRef.Length, variantAlt), out var variants)) { variants = new List <int>(); _destinationVariants[(rsId, variantRef.Length, variantAlt)] = variants;
public int Map() { // write out the relocated locations of the leftover rsIds whenever possible //reading in the leftover ids var leftoverIds = new HashSet <(long, string)>(); Console.Write("Loading leftover ids..."); string line; while ((line = _leftoverReader.ReadLine()) != null) { var splits = line.Split('#', 3); var id = long.Parse(splits[0]); var alt = splits[1]; leftoverIds.Add((id, alt)); } Console.WriteLine($"{leftoverIds.Count} found."); // stream through the dest file to find locations var leftoversWithDest = new Dictionary <(long, string), List <GenomicLocation> >(); var currentChromName = ""; while ((line = _destReader.ReadLine()) != null) { if (line.OptimizedStartsWith('#')) { continue; } var splits = line.Split('\t', VcfCommon.InfoIndex); var chromName = splits[VcfCommon.ChromIndex]; if (chromName != currentChromName) { currentChromName = chromName; Console.WriteLine($"Getting destinations for chromosome:{currentChromName}..."); var currentChrom = ReferenceNameUtilities.GetChromosome(_desSequenceProvider.RefNameToChromosome, currentChromName); _desSequenceProvider.LoadChromosome(currentChrom); } var refAllele = splits[VcfCommon.RefIndex]; var altAlleles = splits[VcfCommon.AltIndex].Split(','); var position = int.Parse(splits[VcfCommon.PosIndex]); var rsIds = Utilities.GetRsids(splits[VcfCommon.IdIndex]); if (rsIds == null) { continue; } var processedVariants = altAlleles.Select(x => VariantUtils.TrimAndLeftAlign(position, refAllele, x, _desSequenceProvider.Sequence)).ToArray(); foreach (var(_, _, variantAlt) in processedVariants) { foreach (var rsId in rsIds) { if (!leftoverIds.Contains((rsId, variantAlt))) { continue; } var pos = int.Parse(splits[VcfCommon.PosIndex]); if (!leftoversWithDest.TryGetValue((rsId, variantAlt), out var locations)) { locations = new List <GenomicLocation>(); leftoversWithDest[(rsId, variantAlt)] = locations;
/// <summary>
/// Consumes all consecutive source lines that belong to the chromosome of <paramref name="line"/>,
/// writing a remapped entry for every variant found in the destination lookup and a leftover
/// record for every line that had no match at all.
/// </summary>
/// <param name="line">First line of the chromosome to process.</param>
/// <returns>
/// The first line that belongs to a different chromosome, or <c>null</c> once the source reader
/// is exhausted.
/// </returns>
private string ProcessNextChromSource(string line)
{
    // the line handed in decides which chromosome this call works on
    var currentChromName = line.Split('\t', 2)[VcfCommon.ChromIndex];
    var currentChrom     = ReferenceNameUtilities.GetChromosome(_srcSequenceProvider.RefNameToChromosome, currentChromName);
    _srcSequenceProvider.LoadChromosome(currentChrom);

    var unmatchedLineCount = 0;

    do
    {
        var fields = line.Split('\t', VcfCommon.InfoIndex);
        var chrom  = fields[VcfCommon.ChromIndex];

        // reaching another chromosome ends this call; the line is handed back to the caller
        if (chrom != currentChromName)
        {
            break;
        }

        var refAllele  = fields[VcfCommon.RefIndex];
        var altAlleles = fields[VcfCommon.AltIndex].Split(',');
        var position   = int.Parse(fields[VcfCommon.PosIndex]);
        var rsIds      = Utilities.GetRsids(fields[VcfCommon.IdIndex]);

        // lines without rsIds can be neither matched nor recorded as leftovers
        if (rsIds == null)
        {
            continue;
        }

        var alignedVariants = altAlleles
            .Select(alt => VariantUtils.TrimAndLeftAlign(position, refAllele, alt, _srcSequenceProvider.Sequence))
            .ToArray();

        var foundInDest = false;

        foreach (var (_, variantRef, variantAlt) in alignedVariants)
        {
            foreach (var rsId in rsIds)
            {
                // destinations are keyed by rsId, reference allele length and alt allele
                if (_destinationVariants.TryGetValue((rsId, variantRef.Length, variantAlt), out var targetPositions))
                {
                    foreach (var targetPosition in targetPositions)
                    {
                        WriteRemappedEntry(chrom, targetPosition, variantRef, variantAlt, line);
                    }

                    foundInDest = true;
                }
            }
        }

        // a line is a leftover only when none of its (rsId, allele) pairs matched
        if (!foundInDest)
        {
            foreach (var (_, _, variantAlt) in alignedVariants)
            {
                foreach (var rsId in rsIds)
                {
                    _leftoverWriter.WriteLine(string.Join('#', rsId.ToString(), variantAlt, line));
                }
            }

            unmatchedLineCount++;
        }
    } while ((line = _srcReader.ReadLine()) != null);

    Console.WriteLine($"Leftover count for {currentChromName}: {unmatchedLineCount}");
    _leftoverCount += unmatchedLineCount;

    return line;
}
/// <summary>
/// Extracts the gnomAD items described by the specified VCF line, one per alternate allele.
/// </summary>
/// <param name="vcfline">A tab-delimited VCF data line; may be null.</param>
/// <returns>
/// The trimmed and left-aligned items, or <c>null</c> when the line is null, has fewer than
/// eight columns, names an unknown chromosome, or reports a total allele number of zero.
/// </returns>
private List<GnomadItem> ExtractItems(string vcfline)
{
    if (vcfline == null)
    {
        return null;
    }

    var columns = vcfline.OptimizedSplit('\t');
    if (columns.Length < 8)
    {
        return null;
    }

    Clear();

    if (!_sequenceProvider.RefNameToChromosome.TryGetValue(columns[VcfCommon.ChromIndex], out var chrom))
    {
        return null;
    }

    var position   = int.Parse(columns[VcfCommon.PosIndex]);
    var refAllele  = columns[VcfCommon.RefIndex];
    var altAlleles = columns[VcfCommon.AltIndex].OptimizedSplit(',');
    var filters    = columns[VcfCommon.FilterIndex];
    var infoFields = columns[VcfCommon.InfoIndex];

    // anything other than "PASS" or "." counts as a failed filter
    var hasFailedFilters = filters != "PASS" && filters != ".";

    // fills the depth, allele numbers, allele counts and homozygous counts used below;
    // on failure the offending line and the exception are echoed before rethrowing
    try
    {
        ParseInfoField(infoFields);
    }
    catch (Exception e)
    {
        Console.WriteLine(vcfline);
        Console.WriteLine(e);
        throw;
    }

    if (_anAll == 0)
    {
        return null;
    }

    var gnomadItems = new List<GnomadItem>();

    for (var alleleIndex = 0; alleleIndex < altAlleles.Length; alleleIndex++)
    {
        var (alignedPos, alignedRef, alignedAlt) = VariantUtils.TrimAndLeftAlign(
            position, refAllele, altAlleles[alleleIndex], _sequenceProvider.Sequence);

        // allele numbers are shared by the line; allele and homozygous counts are per allele
        var item = new GnomadItem(
            chrom,
            alignedPos,
            alignedRef,
            alignedAlt,
            _totalDepth,
            _anAll, _anAfr, _anAmr, _anEas, _anFin, _anNfe, _anOth, _anAsj, _anSas,
            GetCount(_acAll, alleleIndex),
            GetCount(_acAfr, alleleIndex),
            GetCount(_acAmr, alleleIndex),
            GetCount(_acEas, alleleIndex),
            GetCount(_acFin, alleleIndex),
            GetCount(_acNfe, alleleIndex),
            GetCount(_acOth, alleleIndex),
            GetCount(_acAsj, alleleIndex),
            GetCount(_acSas, alleleIndex),
            GetCount(_hcAll, alleleIndex),
            GetCount(_hcAfr, alleleIndex),
            GetCount(_hcAmr, alleleIndex),
            GetCount(_hcEas, alleleIndex),
            GetCount(_hcFin, alleleIndex),
            GetCount(_hcNfe, alleleIndex),
            GetCount(_hcOth, alleleIndex),
            GetCount(_hcAsj, alleleIndex),
            GetCount(_hcSas, alleleIndex),
            hasFailedFilters);

        gnomadItems.Add(item);
    }

    return gnomadItems;
}