/// <summary>
/// Reads the conflicting-items gnomAD fixture, keeps the entries located at position
/// 16558315, runs them through <c>SuppDataUtilities.RemoveConflictingAlleles</c> and
/// expects three entries to be left.
/// </summary>
public void IdentifyConflictingItems()
{
    const int targetPosition = 16558315;

    // Reference bases: a run of 'A' padding followed by the bases starting at the target position.
    string referenceBases = new string('A', VariantUtils.MaxUpstreamLength)
                            + "TAAGCCAGCCAGCCAGCCAAGCTGGCCAAGCCAGACAGGCAGCCAAGCCAACCAAGACACCCAGGCAGCCAAGCCAGC";
    var sequence = new SimpleSequence(referenceBases, targetPosition - VariantUtils.MaxUpstreamLength);

    var refNameToChrom = new Dictionary<string, IChromosome>
    {
        { "22", Chrom22 }
    };
    var sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, sequence, refNameToChrom);
    var gnomadReader     = new GnomadReader(new StreamReader(GetConflictingItemsStream()), sequenceProvider);

    // Collect only the entries reported at the target position.
    var itemsAtTarget = new List<ISupplementaryDataItem>();
    foreach (GnomadItem gnomadItem in gnomadReader.GetItems())
    {
        if (gnomadItem.Position != targetPosition)
        {
            continue;
        }

        itemsAtTarget.Add(gnomadItem);
    }

    var remaining = SuppDataUtilities.RemoveConflictingAlleles(itemsAtTarget, false);

    // Two of the items are removed as conflicting items, leaving three.
    Assert.Equal(3, remaining.Count);
}
/// <summary>
/// Builds a <c>RefMinorItem</c> for the supplied items when the summed alternate allele
/// frequency of their SNV entries reaches <c>SaCommon.RefMinorThreshold</c>.
/// </summary>
/// <param name="saItems">Items to inspect; each one is cast to <c>AlleleFrequencyItem</c>.</param>
/// <returns>
/// A <c>RefMinorItem</c> carrying the chromosome and position of the first item and the allele
/// returned by <c>SuppDataUtilities.GetMostFrequentAllele</c>, or <c>null</c> when the threshold
/// is not reached.
/// </returns>
private static ISupplementaryDataItem GetRefMinorItem(IList<ISupplementaryDataItem> saItems)
{
    string referenceAllele = null;
    var altFrequencySum    = 0.0;
    var frequencyByAlt     = new Dictionary<string, double>();

    foreach (var saItem in saItems)
    {
        var frequencyItem = (AlleleFrequencyItem)saItem;

        // Only entries whose reference and alternate alleles both pass IsSnv contribute.
        bool isSnvPair = IsSnv(frequencyItem.RefAllele) && IsSnv(frequencyItem.AltAllele);
        if (!isSnvPair)
        {
            continue;
        }

        referenceAllele = frequencyItem.RefAllele;
        altFrequencySum += frequencyItem.AltFrequency;
        frequencyByAlt[frequencyItem.AltAllele] = frequencyItem.AltFrequency;
    }

    bool thresholdReached = altFrequencySum >= SaCommon.RefMinorThreshold;
    if (thresholdReached)
    {
        string globalMajor = SuppDataUtilities.GetMostFrequentAllele(frequencyByAlt, referenceAllele);
        return new RefMinorItem(saItems[0].Chromosome, saItems[0].Position, globalMajor);
    }

    return null;
}
/// <summary>
/// Drains the heap of every item whose position is strictly below <paramref name="position"/>,
/// handing the items that compare equal to the current minimum to <c>WritePosition</c> as one group.
/// </summary>
/// <param name="itemsHeap">Heap of buffered items.</param>
/// <param name="position">Exclusive upper bound; values below 1 make the call a no-op.</param>
private void WriteUptoPosition(MinHeap<ISupplementaryDataItem> itemsHeap, int position)
{
    if (position < 1 || itemsHeap.Count() == 0)
    {
        return;
    }

    for (var currentMin = itemsHeap.GetMin(); currentMin.Position < position; currentMin = itemsHeap.GetMin())
    {
        // Pull out everything that compares equal to the current minimum.
        var group = new List<ISupplementaryDataItem>();
        while (itemsHeap.Count() > 0)
        {
            if (SuppDataUtilities.CompareTo(currentMin, itemsHeap.GetMin()) != 0)
            {
                break;
            }

            group.Add(itemsHeap.ExtractMin());
        }

        if (group.Count > 0)
        {
            _count += group.Count;
            WritePosition(group);
        }

        // Nothing left to inspect; leave before the loop asks the heap for its minimum again.
        if (itemsHeap.Count() == 0)
        {
            break;
        }
    }
}
/// <summary>
/// Reads the global-minor fixture and checks the JSON produced by the positional
/// annotation built from its items.
/// </summary>
public void GetItems_test()
{
    const string expectedJson = "{\"globalMinorAllele\":\"G\",\"globalMinorAlleleFrequency\":0.3472}";

    var globalMinorReader = new GlobalMinorReader(GetStream(), _chromDict);
    var saItems = globalMinorReader
        .GetItems()
        .Cast<ISupplementaryDataItem>()
        .ToList();

    var positionalAnnotation = SuppDataUtilities.GetPositionalAnnotation(saItems);

    Assert.Equal(expectedJson, positionalAnnotation.GetJsonString());
}
/// <summary>
/// Feeds the duplicate-items TOPMed fixture through
/// <c>SuppDataUtilities.RemoveConflictingAlleles</c> and expects exactly one item in the result.
/// </summary>
public void RemoveConflictingAlleles_does_not_remove_duplicates()
{
    var sequenceProvider = ParserTestUtils.GetSequenceProvider(70220313, "TGCC", 'A', _chromDict);
    var reader           = new TopMedReader(new StreamReader(GetDupItemsStream()), sequenceProvider);

    // Materialise the reader output directly as the interface-typed list the utility expects.
    var candidates = new List<ISupplementaryDataItem>(reader.GetItems());

    var survivors = SuppDataUtilities.RemoveConflictingAlleles(candidates, false);

    Assert.Single(survivors);
}
/// <summary>
/// Serialises all items that share one position into the scratch stream and appends the
/// resulting bytes to the current block, flushing the block first when it has no room.
/// </summary>
/// <param name="items">
/// Items for a single position. Must contain at least one element; the first element supplies
/// the position and the chromosome index used for flushing.
/// </param>
private void WritePosition(List<ISupplementaryDataItem> items)
{
    // Hold on to the first item BEFORE any filtering. The filters below replace 'items' with a
    // new list that may contain fewer (possibly zero) entries, so indexing 'items[0]' afterwards
    // could throw ArgumentOutOfRangeException when the block needs to be flushed.
    ISupplementaryDataItem firstItem = items[0];
    int position = firstItem.Position;

    _memStream.Position = 0;

    if (_isPositional)
    {
        var positionalItem = SuppDataUtilities.GetPositionalAnnotation(items);
        if (positionalItem == null)
        {
            return;
        }

        _memWriter.Write(positionalItem.GetJsonString());
    }
    else
    {
        // any data source that is reported by allele and is not an array (e.g. allele frequencies) needs this filtering step
        if (_index.MatchByAllele && !_index.IsArray)
        {
            items = SuppDataUtilities.RemoveConflictingAlleles(items, _throwErrorOnConflicts);
        }

        if (_index.JsonKey == SaCommon.PrimateAiTag)
        {
            items = SuppDataUtilities.DeDuplicatePrimateAiItems(items);
        }

        // The count is written even when filtering left nothing, matching the existing layout.
        _memWriter.WriteOpt(items.Count);
        foreach (ISupplementaryDataItem saItem in items)
        {
            _memWriter.WriteOptAscii(saItem.RefAllele);
            _memWriter.WriteOptAscii(saItem.AltAllele);
            _memWriter.Write(saItem.GetJsonString());
        }
    }

    int numBytes = (int)_memStream.Position;
    if (!_block.HasSpace(numBytes))
    {
        // Use the item captured up front; the filtered list may be empty at this point.
        Flush(firstItem.Chromosome.Index);
    }

    _block.Add(_memBuffer, numBytes, position);
}
/// <summary>
/// Streams the RCV items, pairing each one with a <c>VcvSaItem</c> when its variation id
/// is found in the variation records.
/// </summary>
/// <returns>
/// For each RCV item: the item itself when it has no variation id; a copy constructed with a
/// <c>null</c> variation id when the id is not found; otherwise a <c>VcvSaItem</c> followed by
/// the RCV item. A summary of unknown ids is printed once enumeration completes.
/// </returns>
public IEnumerable<ISupplementaryDataItem> GetItems()
{
    var variationRecords = GetVariationRecords();
    Console.WriteLine($"Found {variationRecords.Count} VCV records");

    var missingVcvIds = new HashSet<int>();

    foreach (var rcvItem in GetRcvItems())
    {
        if (!rcvItem.VariationId.HasValue)
        {
            yield return rcvItem;
            continue;
        }

        var vcvId       = rcvItem.VariationId.Value;
        var recordIndex = SuppDataUtilities.BinarySearch(variationRecords, vcvId);

        if (recordIndex >= 0)
        {
            var vcvRecord = variationRecords[recordIndex];

            yield return new VcvSaItem(
                rcvItem.Chromosome,
                rcvItem.Position,
                rcvItem.RefAllele,
                rcvItem.AltAllele,
                vcvRecord.Accession,
                vcvRecord.Version,
                vcvRecord.LastUpdatedDate,
                vcvRecord.ReviewStatus,
                vcvRecord.Significances);
            yield return rcvItem;
            continue;
        }

        Console.WriteLine($"Unknown vcv id:{vcvId} found in {rcvItem.Id}");
        missingVcvIds.Add(vcvId);

        // Emit a copy of the item with the VariationId removed (passed as null).
        yield return new ClinVarItem(
            rcvItem.Chromosome,
            rcvItem.Position,
            rcvItem.Stop,
            rcvItem.RefAllele,
            rcvItem.AltAllele,
            rcvItem.JsonSchema,
            rcvItem.AlleleOrigins,
            rcvItem.VariantType,
            rcvItem.Id,
            null,
            rcvItem.ReviewStatus,
            rcvItem.MedGenIds,
            rcvItem.OmimIds,
            rcvItem.OrphanetIds,
            rcvItem.Phenotypes,
            rcvItem.Significances,
            rcvItem.PubmedIds,
            rcvItem.LastUpdatedDate);
    }

    Console.WriteLine($"{missingVcvIds.Count} unknown VCVs found in RCVs.");
    Console.WriteLine($"{string.Join(',', missingVcvIds)}");
}
/// <summary>
/// Extracts every item at the heap's current minimum position, splits them into genome and
/// exome groups, filters each group with <c>SuppDataUtilities.RemoveConflictingAlleles</c>
/// and indexes the survivors by (reference allele, alternate allele).
/// </summary>
/// <param name="minHeap">Heap of gnomAD items; the extracted items are removed from it.</param>
/// <returns>
/// The genome and exome dictionaries, or <c>(null, null)</c> when the heap is empty.
/// </returns>
private static (Dictionary<(string refAllele, string altAllele), GnomadItem> genomeItems, Dictionary<(string refAllele, string altAllele), GnomadItem> exomeItems) GetMinItems(MinHeap<GnomadItem> minHeap)
{
    // Indexes a filtered list by its allele pair; Add throws if an allele pair repeats.
    Dictionary<(string refAllele, string altAllele), GnomadItem> IndexByAllele(List<ISupplementaryDataItem> source)
    {
        var byAllele = new Dictionary<(string refAllele, string altAllele), GnomadItem>();
        foreach (var entry in source)
        {
            byAllele.Add((entry.RefAllele, entry.AltAllele), (GnomadItem)entry);
        }

        return byAllele;
    }

    if (minHeap.Count() == 0)
    {
        return (null, null);
    }

    var genomeBucket = new List<ISupplementaryDataItem>();
    var exomeBucket  = new List<ISupplementaryDataItem>();

    var minPosition = minHeap.GetMin().Position;
    while (minHeap.Count() > 0 && minHeap.GetMin().Position == minPosition)
    {
        var extracted = minHeap.ExtractMin();

        // Anything that is not tagged as genome data goes to the exome bucket.
        var bucket = extracted.DataType == GnomadDataType.Genome ? genomeBucket : exomeBucket;
        bucket.Add(extracted);
    }

    // Filter both groups first, then index them, in the same order as before.
    var filteredGenome = SuppDataUtilities.RemoveConflictingAlleles(genomeBucket, false);
    var filteredExome  = SuppDataUtilities.RemoveConflictingAlleles(exomeBucket, false);

    var genomeByAllele = IndexByAllele(filteredGenome);
    var exomeByAllele  = IndexByAllele(filteredExome);

    return (genomeByAllele, exomeByAllele);
}
/// <summary>
/// Parses the duplicate-item PrimateAI fixture and checks that
/// <c>SuppDataUtilities.DeDuplicatePrimateAiItems</c> leaves a single item with the expected JSON.
/// </summary>
public void ResolveDuplicates()
{
    const string expectedJson = "\"hgnc\":\"Gene1\",\"scorePercentile\":0.93";

    var entrezToHgnc  = new Dictionary<string, string> { ["255403"] = "Gene1" };
    var ensemblToHgnc = new Dictionary<string, string> { ["ENSG00000234810"] = "Gene2" };

    var parser = new PrimateAiParser(
        GetDuplicateItemStream(),
        GetSequenceProvider(),
        entrezToHgnc,
        ensemblToHgnc);

    var parsedItems = parser
        .GetItems()
        .Cast<ISupplementaryDataItem>()
        .ToList();

    var uniqueItems = SuppDataUtilities.DeDuplicatePrimateAiItems(parsedItems);

    Assert.Single(uniqueItems);
    Assert.Equal(expectedJson, uniqueItems[0].GetJsonString());
}
/// <summary>
/// Streams the RCV items, pairing each one with a <c>VcvSaItem</c> when its variation id
/// is found in the variation records stored in <c>_vcvItems</c>.
/// </summary>
/// <returns>
/// For each RCV item: the item unchanged when its variation id is null or empty; the item with
/// its variation id set to <c>null</c> when the id is not found; otherwise a <c>VcvSaItem</c>
/// followed by the item with its variation id rewritten to "accession.version". A summary of
/// unknown ids is printed once enumeration completes.
/// </returns>
public IEnumerable<ISupplementaryDataItem> GetItems()
{
    _vcvItems = GetVariationRecords();
    Console.WriteLine($"Found {_vcvItems.Count} VCV records");

    var missingVcvIds = new HashSet<int>();

    foreach (var rcvItem in GetRcvItems())
    {
        if (string.IsNullOrEmpty(rcvItem.VariationId))
        {
            yield return rcvItem;
            continue;
        }

        var vcvId       = int.Parse(rcvItem.VariationId);
        var recordIndex = SuppDataUtilities.BinarySearch(_vcvItems, vcvId);

        if (recordIndex >= 0)
        {
            var vcvRecord = _vcvItems[recordIndex];

            yield return new VcvSaItem(
                rcvItem.Chromosome,
                rcvItem.Position,
                rcvItem.RefAllele,
                rcvItem.AltAllele,
                vcvRecord.Accession,
                vcvRecord.Version,
                vcvRecord.LastUpdatedDate,
                vcvRecord.ReviewStatus,
                vcvRecord.Significances);

            // The id is rewritten only after the VcvSaItem has been handed to the consumer.
            rcvItem.VariationId = $"{vcvRecord.Accession}.{vcvRecord.Version}";
            yield return rcvItem;
            continue;
        }

        Console.WriteLine($"Unknown vcv id:{vcvId} found in {rcvItem.Id}");
        missingVcvIds.Add(vcvId);

        // Remove the VariationId before emitting the item.
        rcvItem.VariationId = null;
        yield return rcvItem;
    }

    Console.WriteLine($"{missingVcvIds.Count} unknown VCVs found in RCVs.");
    Console.WriteLine($"{string.Join(',', missingVcvIds)}");
}