/// <summary>
/// resolves a gene reference string to the gene of an already parsed transcript
/// </summary>
/// <param name="reference">reference string; ReferenceRegex must match it and its first group supplies the transcript index</param>
/// <param name="dataStore">data store whose transcript list is indexed</param>
/// <returns>the Gene property of the referenced transcript</returns>
public static DataStructures.VEP.Gene ParseReference(string reference, ImportDataStore dataStore)
{
    var match = ReferenceRegex.Match(reference);

    if (!match.Success)
    {
        throw new GeneralException(
            $"Unable to use the regular expression on the gene reference string: [{reference}]");
    }

    // the first capture group carries the transcript index as text
    string indexText = match.Groups[1].Value;
    int index;

    if (!int.TryParse(indexText, out index))
    {
        throw new GeneralException(
            $"Unable to convert the transcript index from a string to an integer: [{indexText}]");
    }

    // the index has to address an existing entry in the transcript list
    var transcripts = dataStore.Transcripts;
    bool isValidIndex = index >= 0 && index < transcripts.Count;

    if (!isValidIndex)
    {
        throw new GeneralException(
            $"Unable to link the gene reference: transcript index: [{index}], current # of transcripts: [{transcripts.Count}]");
    }

    return transcripts[index].Gene;
}
/// <summary>
/// resolves an exon reference string by trying each supported reference format in turn
/// </summary>
/// <param name="reference">the reference string to resolve</param>
/// <param name="dataStore">data store handed to the format-specific resolver</param>
/// <returns>the exon returned by the first resolver whose regular expression matches</returns>
public static DataStructures.VEP.Exon ParseReference(string reference, ImportDataStore dataStore)
{
    // format 1: trans exon array reference
    var match = TransExonArrayReferenceRegex.Match(reference);
    if (match.Success)
    {
        return ParseTransExonArrayReference(match, dataStore);
    }

    // format 2: sorted exons reference
    match = SortedExonsReferenceRegex.Match(reference);
    if (match.Success)
    {
        return ParseSortedExonsReference(match, dataStore);
    }

    // format 3: translation (start/end exon) reference
    match = TranslationReferenceRegex.Match(reference);
    if (match.Success)
    {
        return ParseTranslationReference(match, dataStore);
    }

    // none of the known formats matched
    throw new GeneralException($"Unable to use the regular expression on the exon translation reference string: [{reference}]");
}
/// <summary>
/// appends the merged regulatory features of the original data store to the merged data store
/// and reports the resulting and original feature counts to the statistics object
/// </summary>
public void Merge(ImportDataStore originalDataStore, ImportDataStore mergedDataStore, FeatureStatistics statistics)
{
    var mergedFeatures = GetMergedRegulatoryRegions(originalDataStore).Values.ToList();
    mergedDataStore.RegulatoryFeatures.AddRange(mergedFeatures);

    // counts are read after the append, so the first value includes the new entries
    var numMerged   = mergedDataStore.RegulatoryFeatures.Count;
    var numOriginal = originalDataStore.RegulatoryFeatures.Count;

    statistics.Increment(numMerged, numOriginal);
}
/// <summary>
/// appends the merged transcripts of the original data store to the merged data store
/// and reports the resulting and original transcript counts to the statistics object
/// </summary>
public void Merge(ImportDataStore originalDataStore, ImportDataStore mergedDataStore, FeatureStatistics statistics)
{
    var mergedTranscripts = GetMergedTranscripts(originalDataStore).Values.ToList();
    mergedDataStore.Transcripts.AddRange(mergedTranscripts);

    // counts are read after the append, so the first value includes the new entries
    var numMerged   = mergedDataStore.Transcripts.Count;
    var numOriginal = originalDataStore.Transcripts.Count;

    statistics.Increment(numMerged, numOriginal);
}
/// <summary>
/// constructor: records the transcript data source, creates the three working data stores
/// and one statistics collector per feature category
/// </summary>
/// <param name="ds">transcript data source, stored in the static ImportDataStore.TranscriptSource</param>
public VepCacheParser(TranscriptDataSource ds)
{
    // static setting; assigned before any data store instance is created
    ImportDataStore.TranscriptSource = ds;

    // data stores
    _uniqueDataStore    = new ImportDataStore();
    _nonUniquedataStore = new ImportDataStore();
    _tempDataStore      = new ImportDataStore();

    // per-category statistics, labelled with the category name
    _regulatoryStatistics = new FeatureStatistics("Regulatory");
    _transcriptStatistics = new FeatureStatistics("Transcripts");
    _geneStatistics       = new FeatureStatistics("Genes");
    _intronStatistics     = new FeatureStatistics("Introns");
    _exonStatistics       = new FeatureStatistics("Exons");
    _mirnaStatistics      = new FeatureStatistics("miRNAs");
    _siftStatistics       = new FeatureStatistics("SIFT matrices");
    _polyphenStatistics   = new FeatureStatistics("PolyPhen matrices");
    _cdnaStatistics       = new FeatureStatistics("cDNA seqs");
    _peptideStatistics    = new FeatureStatistics("Peptide seqs");
}
/// <summary>
/// returns a reference to a PolyPhen object given a reference string
/// </summary>
/// <param name="reference">reference string; PolyPhenReferenceRegex must match it and its first group supplies the transcript index</param>
/// <param name="dataStore">data store whose transcript list is indexed</param>
/// <returns>the PolyPhen object stored in the protein function predictions of the referenced transcript</returns>
/// <exception cref="GeneralException">
/// thrown when the reference string does not match, the index is not an integer,
/// or the index lies outside the transcript list
/// </exception>
public static DataStructures.VEP.PolyPhen ParseReference(string reference, ImportDataStore dataStore)
{
    var polyPhenReferenceMatch = PolyPhenReferenceRegex.Match(reference);

    // a failed match would otherwise surface as a confusing "unable to convert []" error,
    // because the groups of an unsuccessful match are empty strings
    if (!polyPhenReferenceMatch.Success)
    {
        throw new GeneralException(
            $"Unable to use the regular expression on the PolyPhen reference string: [{reference}]");
    }

    int transcriptIndex;
    if (!int.TryParse(polyPhenReferenceMatch.Groups[1].Value, out transcriptIndex))
    {
        throw new GeneralException(
            $"Unable to convert the transcript index from a string to an integer: [{polyPhenReferenceMatch.Groups[1].Value}]");
    }

    // sanity check: make sure we have at least that many transcripts in our list
    if (transcriptIndex < 0 || transcriptIndex >= dataStore.Transcripts.Count)
    {
        throw new GeneralException(
            $"Unable to link the PolyPhen reference: transcript index: [{transcriptIndex}], current # of transcripts: [{dataStore.Transcripts.Count}]");
    }

    return dataStore.Transcripts[transcriptIndex].VariantEffectCache.ProteinFunctionPredictions.PolyPhen;
}
/// <summary>
/// parses the relevant data from each pair genomic object
/// </summary>
/// <param name="objectValue">the pair genomic object whose key/value pairs are enumerated</param>
/// <param name="dataStore">data store passed through to MapperPair.ParseList</param>
/// <returns>a new PairGenomic; its Genomic member is null when the genomic entry is undefined</returns>
/// <exception cref="GeneralException">
/// thrown for a key missing from KnownKeys, a key without a handler,
/// or a genomic entry that is neither a list nor undefined
/// </exception>
public static DataStructures.VEP.PairGenomic Parse(ObjectValue objectValue, ImportDataStore dataStore)
{
    var pairGenomic = new DataStructures.VEP.PairGenomic();

    // loop over all of the key/value pairs in the pair genomic object
    foreach (AbstractData ad in objectValue)
    {
        // sanity check: make sure we know what each key is used for
        if (!KnownKeys.Contains(ad.Key))
        {
            throw new GeneralException($"Encountered an unknown key in the pair genomic object: {ad.Key}");
        }

        // handle each key
        switch (ad.Key)
        {
            case GenomicKey:
                var genomicNode = ad as ListObjectKeyValue;
                if (genomicNode != null)
                {
                    pairGenomic.Genomic = MapperPair.ParseList(genomicNode.Values, dataStore);
                }
                else if (DumperUtilities.IsUndefined(ad))
                {
                    pairGenomic.Genomic = null;
                }
                else
                {
                    // the cast above targets ListObjectKeyValue, so report that type
                    throw new GeneralException(
                        $"Could not transform the AbstractData object into a ListObjectKeyValue: [{ad.GetType()}]");
                }
                break;

            default:
                throw new GeneralException($"Unknown key found: {ad.Key}");
        }
    }

    return pairGenomic;
}
/// <summary>
/// fills the slots of the exon array that correspond to reference entries in the list
/// </summary>
/// <param name="abstractDataList">list of exon nodes; only reference nodes are processed</param>
/// <param name="exons">exon array that is updated in place at the index of each reference node</param>
/// <param name="dataStore">data store used to resolve each reference</param>
public static void ParseListReference(List<AbstractData> abstractDataList, DataStructures.VEP.Exon[] exons, ImportDataStore dataStore)
{
    for (int i = 0; i < abstractDataList.Count; i++)
    {
        var node = abstractDataList[i];

        // non-reference entries are not touched here
        if (!DumperUtilities.IsReference(node))
        {
            continue;
        }

        // reference nodes that are not ReferenceStringValue instances are left untouched as well
        var reference = node as ReferenceStringValue;
        if (reference == null)
        {
            continue;
        }

        exons[i] = ParseReference(reference.Value, dataStore);
    }
}
/// <summary>
/// converts a list of exon nodes into an exon array of the same length
/// </summary>
/// <param name="abstractDataList">list of exon nodes (ObjectValue or reference entries)</param>
/// <param name="dataStore">data store supplying the current reference index</param>
/// <returns>exon array; slots belonging to reference entries are left null</returns>
public static DataStructures.VEP.Exon[] ParseList(List<AbstractData> abstractDataList, ImportDataStore dataStore)
{
    int numExons = abstractDataList.Count;
    var exons    = new DataStructures.VEP.Exon[numExons];

    for (int i = 0; i < abstractDataList.Count; i++)
    {
        var node = abstractDataList[i];

        // references are not resolved by this method
        if (DumperUtilities.IsReference(node))
        {
            continue;
        }

        var objectValue = node as ObjectValue;
        if (objectValue == null)
        {
            throw new GeneralException(
                $"Could not transform the AbstractData object into an ObjectValue: [{abstractDataList[i].GetType()}]");
        }

        // note: every parsed exon is stored as-is; no lookup of previously seen exons is performed
        exons[i] = Parse(objectValue, dataStore.CurrentReferenceIndex);
    }

    return exons;
}
/// <summary>
/// resolves a translation reference match to the start or end exon of a transcript's translation
/// </summary>
/// <param name="referenceMatch">match whose group 1 is the transcript index and group 2 the exon key</param>
/// <param name="dataStore">data store whose transcript list is indexed</param>
/// <returns>the translation's EndExon or StartExon, depending on the exon key</returns>
private static DataStructures.VEP.Exon ParseTranslationReference(Match referenceMatch, ImportDataStore dataStore)
{
    string indexText = referenceMatch.Groups[1].Value;
    int index;

    if (!int.TryParse(indexText, out index))
    {
        throw new GeneralException(
            $"Unable to convert the transcript index from a string to an integer: [{indexText}]");
    }

    // the index has to address an existing entry in the transcript list
    bool isValidIndex = index >= 0 && index < dataStore.Transcripts.Count;
    if (!isValidIndex)
    {
        throw new GeneralException(
            $"Unable to link the exon reference: transcript index: [{index}], current # of transcripts: [{dataStore.Transcripts.Count}]");
    }

    // the second group selects which end of the translation is referenced
    string exonKey = referenceMatch.Groups[2].Value;

    if (exonKey == Translation.EndExonKey)
    {
        return dataStore.Transcripts[index].Translation.EndExon;
    }

    if (exonKey == Translation.StartExonKey)
    {
        return dataStore.Transcripts[index].Translation.StartExon;
    }

    throw new GeneralException($"Unable to determine the correct exon translation to use: {exonKey}");
}
/// <summary>
/// resolves a trans exon array reference match to an exon of an already parsed transcript
/// </summary>
/// <param name="referenceMatch">match whose group 1 is the transcript index and group 2 the exon index</param>
/// <param name="dataStore">data store whose transcript list is indexed</param>
/// <returns>the entry of the transcript's TransExons array at the referenced exon index</returns>
private static DataStructures.VEP.Exon ParseTransExonArrayReference(Match referenceMatch, ImportDataStore dataStore)
{
    // --- transcript index ---
    string transcriptText = referenceMatch.Groups[1].Value;
    int transcriptPos;

    if (!int.TryParse(transcriptText, out transcriptPos))
    {
        throw new GeneralException(
            $"Unable to convert the transcript index from a string to an integer: [{transcriptText}]");
    }

    bool transcriptInRange = transcriptPos >= 0 && transcriptPos < dataStore.Transcripts.Count;
    if (!transcriptInRange)
    {
        throw new GeneralException(
            $"Unable to link the exon reference: transcript index: [{transcriptPos}], current # of transcripts: [{dataStore.Transcripts.Count}]");
    }

    // --- exon index ---
    string exonText = referenceMatch.Groups[2].Value;
    int exonPos;

    if (!int.TryParse(exonText, out exonPos))
    {
        throw new GeneralException(
            $"Unable to convert the exon index from a string to an integer: [{exonText}]");
    }

    var transExons = dataStore.Transcripts[transcriptPos].TransExons;

    bool exonInRange = exonPos >= 0 && exonPos < transExons.Length;
    if (!exonInRange)
    {
        throw new GeneralException(
            $"Unable to link the exon reference: exon index: [{exonPos}], current # of exons: [{transExons.Length}]");
    }

    return transExons[exonPos];
}
/// <summary>
/// walks a mapper object and forwards its coding DNA and genomic pair nodes
/// to the matching reference parsers
/// </summary>
/// <param name="objectValue">the mapper object whose key/value pairs are enumerated</param>
/// <param name="mapper">already parsed mapper whose PairCodingDna / PairGenomic members are passed on</param>
/// <param name="dataStore">data store passed through to the reference parsers</param>
public static void ParseReference(ObjectValue objectValue, DataStructures.VEP.Mapper mapper, ImportDataStore dataStore)
{
    foreach (AbstractData ad in objectValue)
    {
        // both handled keys require an ObjectKeyValue; anything else is ignored
        var node = ad as ObjectKeyValue;
        if (node == null)
        {
            continue;
        }

        switch (ad.Key)
        {
            case PairCodingDnaKey:
                PairCodingDna.ParseReference(node.Value, mapper.PairCodingDna, dataStore);
                break;

            case PairGenomicKey:
                PairGenomic.ParseReference(node.Value, mapper.PairGenomic, dataStore);
                break;
        }
    }
}
/// <summary>
/// resolves the references found in a transcript object and stores the results
/// in the transcript at the given index of the data store
/// </summary>
/// <param name="objectValue">the transcript object whose key/value pairs are enumerated</param>
/// <param name="transcriptIndex">index of the target transcript in dataStore.Transcripts</param>
/// <param name="dataStore">data store holding the transcripts and used to resolve references</param>
public static void ParseReferences(ObjectValue objectValue, int transcriptIndex, ImportDataStore dataStore)
{
    var target = dataStore.Transcripts[transcriptIndex];

    // keys handled below:
    //   gene and slice    - direct references
    //   trans exon array  - list that may contain exon references
    //   translation       - nested object, handled by Translation.ParseReference
    //   feature cache     - nested object, handled by VariantEffectFeatureCache.ParseReference
    foreach (AbstractData ad in objectValue)
    {
        // undefined entries carry nothing to resolve
        if (DumperUtilities.IsUndefined(ad))
        {
            continue;
        }

        switch (ad.Key)
        {
            case GeneKey:
            {
                if (!DumperUtilities.IsReference(ad))
                {
                    break;
                }

                var geneReference = ad as ReferenceKeyValue;
                if (geneReference != null)
                {
                    target.Gene = Gene.ParseReference(geneReference.Value, dataStore);
                }
                break;
            }

            case SliceKey:
            {
                if (!DumperUtilities.IsReference(ad))
                {
                    break;
                }

                var sliceReference = ad as ReferenceKeyValue;
                if (sliceReference != null)
                {
                    target.Slice = Slice.ParseReference(sliceReference.Value, dataStore);
                }
                break;
            }

            case TransExonArrayKey:
            {
                var exonNodes = ad as ListObjectKeyValue;
                if (exonNodes != null)
                {
                    Exon.ParseListReference(exonNodes.Values, target.TransExons, dataStore);
                }
                break;
            }

            case TranslationKey:
            {
                var translationObject = ad as ObjectKeyValue;
                if (translationObject != null)
                {
                    Translation.ParseReference(translationObject.Value, target.Translation, dataStore);
                }
                break;
            }

            case VariationEffectFeatureCacheKey:
            {
                var cacheObject = ad as ObjectKeyValue;
                if (cacheObject != null)
                {
                    VariantEffectFeatureCache.ParseReference(cacheObject.Value, target.VariantEffectCache, dataStore);
                }
                break;
            }
        }
    }
}
/// <summary>
/// builds a variant effect feature cache from the key/value pairs of a cache object
/// </summary>
/// <param name="objectValue">the cache object whose key/value pairs are enumerated</param>
/// <param name="dataStore">data store passed through to the intron, mapper and exon parsers</param>
/// <returns>the populated cache</returns>
public static DataStructures.VEP.VariantEffectFeatureCache Parse(ObjectValue objectValue, ImportDataStore dataStore)
{
    var result = new DataStructures.VEP.VariantEffectFeatureCache();

    foreach (AbstractData ad in objectValue)
    {
        // every key has to be listed in KnownKeys
        if (!KnownKeys.Contains(ad.Key))
        {
            throw new GeneralException(
                $"Encountered an unknown key in the dumper variant effect feature cache object: {ad.Key}");
        }

        switch (ad.Key)
        {
            // recognised keys that are deliberately ignored
            case SelenocysteinesKey:
            case ThreePrimeUtrKey:
            case SeqEditsKey:
            case CodonTableKey:
            case ProteinFeaturesKey:
                break;

            case IntronsKey:
            {
                var intronNodes = ad as ListObjectKeyValue;
                if (intronNodes != null)
                {
                    result.Introns = Intron.ParseList(intronNodes.Values, dataStore);
                    break;
                }

                // an undefined entry is accepted and stored as null
                if (!DumperUtilities.IsUndefined(ad))
                {
                    throw new GeneralException(
                        $"Could not transform the AbstractData object into a ListObjectKeyValue: [{ad.GetType()}]");
                }

                result.Introns = null;
                break;
            }

            case MapperKey:
            {
                var mapperObject = ad as ObjectKeyValue;
                if (mapperObject == null)
                {
                    throw new GeneralException(
                        $"Could not transform the AbstractData object into an ObjectKeyValue: [{ad.GetType()}]");
                }

                result.Mapper = TranscriptMapper.Parse(mapperObject.Value, dataStore);
                break;
            }

            case PeptideKey:
                result.Peptide = DumperUtilities.GetString(ad);
                break;

            case ProteinFunctionPredictionsKey:
            {
                var predictionsObject = ad as ObjectKeyValue;
                if (predictionsObject == null)
                {
                    throw new GeneralException(
                        $"Could not transform the AbstractData object into an ObjectKeyValue: [{ad.GetType()}]");
                }

                result.ProteinFunctionPredictions = ProteinFunctionPredictions.Parse(predictionsObject.Value);
                break;
            }

            case SortedExonsKey:
            {
                var exonNodes = ad as ListObjectKeyValue;
                if (exonNodes == null)
                {
                    throw new GeneralException($"Could not transform the AbstractData object into a ListObjectKeyValue: [{ad.GetType()}]");
                }

                result.Exons = Exon.ParseList(exonNodes.Values, dataStore);
                break;
            }

            case TranslateableSeqKey:
                result.TranslateableSeq = DumperUtilities.GetString(ad);
                break;

            default:
                throw new GeneralException($"Unknown key found: {ad.Key}");
        }
    }

    return result;
}
/// <summary>
/// returns a new intron given an ObjectValue
/// </summary>
/// <param name="objectValue">the intron object whose key/value pairs are enumerated</param>
/// <param name="dataStore">data store supplying the current reference index for slice parsing</param>
/// <returns>a new intron with End, Start and (when present as an object) Slice populated</returns>
/// <exception cref="GeneralException">
/// thrown for a key missing from KnownKeys, a key without a handler,
/// or a slice entry that is neither an object nor a reference
/// </exception>
private static DataStructures.VEP.Intron Parse(ObjectValue objectValue, ImportDataStore dataStore)
{
    var intron = new DataStructures.VEP.Intron();

    // loop over all of the key/value pairs in the intron object
    foreach (AbstractData ad in objectValue)
    {
        // sanity check: make sure we know what each key is used for
        if (!KnownKeys.Contains(ad.Key))
        {
            throw new GeneralException($"Encountered an unknown key in the dumper intron object: {ad.Key}");
        }

        // handle each key
        switch (ad.Key)
        {
            case Transcript.EndKey:
                intron.End = DumperUtilities.GetInt32(ad);
                break;

            case Transcript.SliceKey:
                var sliceNode = ad as ObjectKeyValue;
                if (sliceNode != null)
                {
                    // every parsed slice is stored as-is; no lookup of previously seen slices is performed
                    intron.Slice = Slice.Parse(sliceNode.Value, dataStore.CurrentReferenceIndex);
                }
                else if (DumperUtilities.IsReference(ad))
                {
                    // skip references until the second pass
                }
                else
                {
                    throw new GeneralException(
                        $"Could not transform the AbstractData object into an ObjectKeyValue or ReferenceKeyValue: [{ad.GetType()}]");
                }
                break;

            case Transcript.StartKey:
                intron.Start = DumperUtilities.GetInt32(ad);
                break;

            case Transcript.StrandKey:
                // the strand is read but its value is not stored on the intron;
                // NOTE(review): presumably kept so that an invalid strand is still rejected - confirm
                TranscriptUtilities.GetStrand(ad);
                break;

            default:
                throw new GeneralException($"Unknown key found: {ad.Key}");
        }
    }

    return intron;
}
/// <summary>
/// converts a list of intron nodes into an intron array of the same length
/// </summary>
/// <param name="abstractDataList">list of intron nodes; every entry has to be an ObjectValue</param>
/// <param name="dataStore">data store passed through to the intron parser</param>
/// <returns>one parsed intron per list entry, in list order</returns>
public static DataStructures.VEP.Intron[] ParseList(List<AbstractData> abstractDataList, ImportDataStore dataStore)
{
    var result = new DataStructures.VEP.Intron[abstractDataList.Count];

    for (int i = 0; i < abstractDataList.Count; i++)
    {
        var node         = abstractDataList[i];
        var intronObject = node as ObjectValue;

        if (intronObject != null)
        {
            result[i] = Parse(intronObject, dataStore);
            continue;
        }

        throw new GeneralException(
            $"Could not transform the AbstractData object into an ObjectValue: [{node.GetType()}]");
    }

    return result;
}
/// <summary>
/// builds a transcript mapper from the key/value pairs of a transcript mapper object
/// </summary>
/// <param name="objectValue">the transcript mapper object whose key/value pairs are enumerated</param>
/// <param name="dataStore">data store passed through to the exon coordinate mapper parser</param>
/// <returns>the transcript mapper with its ExonCoordinateMapper populated when that key is present</returns>
public static DataStructures.VEP.TranscriptMapper Parse(ObjectValue objectValue, ImportDataStore dataStore)
{
    var result = new DataStructures.VEP.TranscriptMapper();

    foreach (AbstractData ad in objectValue)
    {
        // every key has to be listed in KnownKeys
        if (!KnownKeys.Contains(ad.Key))
        {
            throw new GeneralException(
                $"Encountered an unknown key in the dumper transcript mapper object: {ad.Key}");
        }

        switch (ad.Key)
        {
            // recognised keys whose values are not stored
            case CodingDnaCodingEndKey:
            case CodingDnaCodingStartKey:
            case StartPhaseKey:
                break;

            case ExonCoordinateMapperKey:
            {
                var mapperObject = ad as ObjectKeyValue;
                if (mapperObject == null)
                {
                    throw new GeneralException(
                        $"Could not transform the AbstractData object into an ObjectKeyValue: [{ad.GetType()}]");
                }

                result.ExonCoordinateMapper = Mapper.Parse(mapperObject.Value, dataStore);
                break;
            }

            default:
                throw new GeneralException($"Unknown key found: {ad.Key}");
        }
    }

    return result;
}
/// <summary>
/// collects the transcripts of a data store into a dictionary keyed by stable id, start and end;
/// transcripts that share a key are combined via MergeTranscript
/// </summary>
/// <param name="other">data store whose transcripts are enumerated</param>
/// <returns>dictionary holding one transcript per "id.start.end" key</returns>
private Dictionary<string, DataStructures.VEP.Transcript> GetMergedTranscripts(ImportDataStore other)
{
    var merged = new Dictionary<string, DataStructures.VEP.Transcript>();

    foreach (var transcript in other.Transcripts)
    {
        string id = transcript.StableId;

        if (string.IsNullOrEmpty(id))
        {
            throw new GeneralException("Found a transcript with no ID.");
        }

        // drop ids rejected by FoundPrefix as well as ids containing "dupl"
        if (!FoundPrefix(id) || id.Contains("dupl"))
        {
            continue;
        }

        var key = $"{transcript.StableId}.{transcript.Start}.{transcript.End}";

        DataStructures.VEP.Transcript existing;
        if (!merged.TryGetValue(key, out existing))
        {
            // first transcript seen for this key
            merged[key] = transcript;
            continue;
        }

        MergeTranscript(existing, transcript);
    }

    return merged;
}
/// <summary>
/// walks a transcript mapper object and forwards its exon coordinate mapper node
/// to Mapper.ParseReference
/// </summary>
/// <param name="objectValue">the transcript mapper object whose key/value pairs are enumerated</param>
/// <param name="transcriptMapper">already parsed transcript mapper whose ExonCoordinateMapper is passed on</param>
/// <param name="dataStore">data store passed through to the reference parser</param>
public static void ParseReference(ObjectValue objectValue, DataStructures.VEP.TranscriptMapper transcriptMapper, ImportDataStore dataStore)
{
    foreach (AbstractData ad in objectValue)
    {
        // only the exon coordinate mapper entry is of interest
        if (ad.Key == ExonCoordinateMapperKey)
        {
            var mapperObject = ad as ObjectKeyValue;

            if (mapperObject != null)
            {
                Mapper.ParseReference(mapperObject.Value, transcriptMapper.ExonCoordinateMapper, dataStore);
            }
        }
    }
}
/// <summary>
/// resolves a cache reference string to the slice of an intron in an already parsed transcript
/// </summary>
/// <param name="reference">reference string; CacheReferenceRegex must match it, group 1 is the transcript index and group 2 the intron index</param>
/// <param name="dataStore">data store whose transcript list is indexed</param>
/// <returns>the Slice of the referenced intron in the transcript's variant effect cache</returns>
private static DataStructures.VEP.Slice ParseCacheReference(string reference, ImportDataStore dataStore)
{
    var match = CacheReferenceRegex.Match(reference);
    if (!match.Success)
    {
        throw new GeneralException(
            $"Unable to use the regular expression on the slice reference string: [{reference}]");
    }

    // --- transcript index ---
    string transcriptText = match.Groups[1].Value;
    int transcriptPos;

    if (!int.TryParse(transcriptText, out transcriptPos))
    {
        throw new GeneralException(
            $"Unable to convert the transcript index from a string to an integer: [{transcriptText}]");
    }

    bool transcriptInRange = transcriptPos >= 0 && transcriptPos < dataStore.Transcripts.Count;
    if (!transcriptInRange)
    {
        throw new GeneralException(
            $"Unable to link the slice reference: transcript index: [{transcriptPos}], current # of transcripts: [{dataStore.Transcripts.Count}]");
    }

    // --- intron index ---
    string intronText = match.Groups[2].Value;
    int intronPos;

    if (!int.TryParse(intronText, out intronPos))
    {
        throw new GeneralException(
            $"Unable to convert the intron index from a string to an integer: [{intronText}]");
    }

    var introns = dataStore.Transcripts[transcriptPos].VariantEffectCache.Introns;

    bool intronInRange = intronPos >= 0 && intronPos < introns.Length;
    if (!intronInRange)
    {
        throw new GeneralException(
            $"Unable to link the intron reference: intron index: [{intronPos}], current # of introns: [{introns.Length}]");
    }

    return introns[intronPos].Slice;
}
/// <summary>
/// reads a regulatory element object and appends a new regulatory feature to the data store
/// </summary>
/// <param name="objectValue">the regulatory element object whose key/value pairs are enumerated</param>
/// <param name="regulatoryFeatureIndex">not read by this method</param>
/// <param name="dataStore">data store that receives the feature and supplies the current reference index</param>
public static void Parse(ObjectValue objectValue, int regulatoryFeatureIndex, ImportDataStore dataStore)
{
    // values used when the corresponding key is absent
    string featureType = null;
    string id          = null;
    int begin          = -1;
    int finish         = -1;

    foreach (AbstractData ad in objectValue)
    {
        // every key has to be listed in KnownKeys
        if (!KnownKeys.Contains(ad.Key))
        {
            throw new GeneralException(
                $"Encountered an unknown key in the dumper regulatory element object: {ad.Key}");
        }

        switch (ad.Key)
        {
            // recognised keys whose values are not stored
            case BoundLengthsKey:
            case CellTypeCountKey:
            case CellTypesKey:
            case DbIdKey:
            case DisplayLabelKey:
            case HasEvidenceKey:
            case ProjectedKey:
            case SetKey:
            case Transcript.StrandKey:
            case Transcript.SliceKey:
                break;

            case Transcript.StartKey:
                begin = DumperUtilities.GetInt32(ad);
                break;

            case Transcript.EndKey:
                finish = DumperUtilities.GetInt32(ad);
                break;

            case Transcript.StableIdKey:
                id = DumperUtilities.GetString(ad);
                break;

            case FeatureTypeKey:
                featureType = DumperUtilities.GetString(ad);
                break;

            default:
                throw new GeneralException($"Unknown key found: {ad.Key}");
        }
    }

    var feature = new DataStructures.VEP.RegulatoryFeature(dataStore.CurrentReferenceIndex, begin, finish, id, featureType);
    dataStore.RegulatoryFeatures.Add(feature);
}
/// <summary>
/// builds a mapper from the key/value pairs of an exon coordinate mapper object
/// </summary>
/// <param name="objectValue">the mapper object whose key/value pairs are enumerated</param>
/// <param name="dataStore">data store passed through to the pair parsers</param>
/// <returns>the populated mapper; PairCodingDna / PairGenomic are null when their entries are undefined</returns>
public static DataStructures.VEP.Mapper Parse(ObjectValue objectValue, ImportDataStore dataStore)
{
    var result = new DataStructures.VEP.Mapper();

    foreach (AbstractData ad in objectValue)
    {
        // every key has to be listed in KnownKeys
        if (!KnownKeys.Contains(ad.Key))
        {
            throw new GeneralException($"Encountered an unknown key in the dumper mapper object: {ad.Key}");
        }

        switch (ad.Key)
        {
            // the coordinate system entries are only accepted when undefined
            case FromCoordSystemKey:
                if (!DumperUtilities.IsUndefined(ad))
                {
                    throw new GeneralException("Found an unexpected value in FromCoordSystemKey");
                }
                break;

            case ToCoordSystemKey:
                if (!DumperUtilities.IsUndefined(ad))
                {
                    throw new GeneralException("Found an unexpected value in ToCoordSystemKey");
                }
                break;

            // scalar values
            case FromNameKey:
                result.FromType = DumperUtilities.GetString(ad);
                break;

            case ToNameKey:
                result.ToType = DumperUtilities.GetString(ad);
                break;

            case IsSortedKey:
                result.IsSorted = DumperUtilities.GetBool(ad);
                break;

            case PairCountKey:
                result.PairCount = DumperUtilities.GetInt32(ad);
                break;

            // nested pair objects: object -> parse, undefined -> null, anything else -> error
            case PairCodingDnaKey:
            {
                var codingDnaObject = ad as ObjectKeyValue;
                if (codingDnaObject != null)
                {
                    result.PairCodingDna = PairCodingDna.Parse(codingDnaObject.Value, dataStore);
                    break;
                }

                if (!DumperUtilities.IsUndefined(ad))
                {
                    throw new GeneralException(
                        $"Could not transform the AbstractData object into an ObjectKeyValue: [{ad.GetType()}]");
                }

                result.PairCodingDna = null;
                break;
            }

            case PairGenomicKey:
            {
                var genomicObject = ad as ObjectKeyValue;
                if (genomicObject != null)
                {
                    result.PairGenomic = PairGenomic.Parse(genomicObject.Value, dataStore);
                    break;
                }

                if (!DumperUtilities.IsUndefined(ad))
                {
                    throw new GeneralException(
                        $"Could not transform the AbstractData object into an ObjectKeyValue: [{ad.GetType()}]");
                }

                result.PairGenomic = null;
                break;
            }

            default:
                throw new GeneralException($"Unknown key found: {ad.Key}");
        }
    }

    return result;
}
/// <summary>
/// walks a variant effect feature cache object and forwards its intron, mapper and
/// protein function prediction nodes to the matching reference parsers
/// </summary>
/// <param name="objectValue">the cache object whose key/value pairs are enumerated</param>
/// <param name="cache">already parsed cache whose members are passed on to the reference parsers</param>
/// <param name="dataStore">data store passed through to the reference parsers</param>
public static void ParseReference(ObjectValue objectValue, DataStructures.VEP.VariantEffectFeatureCache cache, ImportDataStore dataStore)
{
    foreach (AbstractData ad in objectValue)
    {
        switch (ad.Key)
        {
            case IntronsKey:
            {
                var intronNodes = ad as ListObjectKeyValue;
                if (intronNodes == null)
                {
                    break;
                }

                Intron.ParseListReference(intronNodes.Values, cache.Introns, dataStore);
                break;
            }

            case MapperKey:
            {
                var mapperObject = ad as ObjectKeyValue;
                if (mapperObject == null)
                {
                    break;
                }

                TranscriptMapper.ParseReference(mapperObject.Value, cache.Mapper, dataStore);
                break;
            }

            case ProteinFunctionPredictionsKey:
            {
                var predictionsObject = ad as ObjectKeyValue;
                if (predictionsObject == null)
                {
                    break;
                }

                ProteinFunctionPredictions.ParseReference(predictionsObject.Value, cache.ProteinFunctionPredictions, dataStore);
                break;
            }
        }
    }
}
/// <summary>
/// replaces the entries of the mapper pair list that correspond to reference nodes
/// with the mapper pairs those references resolve to
/// </summary>
/// <param name="abstractDataList">list of mapper pair nodes; only reference nodes are processed</param>
/// <param name="mapperPairs">mapper pair list that is updated in place at the index of each reference node</param>
/// <param name="dataStore">data store used to resolve each reference</param>
public static void ParseListReference(List<AbstractData> abstractDataList, List<DataStructures.VEP.MapperPair> mapperPairs, ImportDataStore dataStore)
{
    for (int i = 0; i < abstractDataList.Count; i++)
    {
        var node = abstractDataList[i];

        // non-reference entries are not touched here
        if (!DumperUtilities.IsReference(node))
        {
            continue;
        }

        // reference nodes that are not ReferenceStringValue instances are left untouched as well
        var reference = node as ReferenceStringValue;
        if (reference == null)
        {
            continue;
        }

        mapperPairs[i] = ParseReference(reference.Value, dataStore);
    }
}
/// <summary>
/// walks a pair genomic object and forwards its genomic list to MapperPair.ParseListReference
/// </summary>
/// <param name="objectValue">the pair genomic object whose key/value pairs are enumerated</param>
/// <param name="pairGenomic">already parsed pair genomic object whose Genomic list is passed on</param>
/// <param name="dataStore">data store passed through to the reference parser</param>
public static void ParseReference(ObjectValue objectValue, DataStructures.VEP.PairGenomic pairGenomic, ImportDataStore dataStore)
{
    foreach (AbstractData ad in objectValue)
    {
        // the genomic key is the only key accepted here
        if (ad.Key != GenomicKey)
        {
            throw new GeneralException($"Unknown key found: {ad.Key}");
        }

        // entries that are not lists are ignored
        var genomicList = ad as ListObjectKeyValue;
        if (genomicList == null)
        {
            continue;
        }

        MapperPair.ParseListReference(genomicList.Values, pairGenomic.Genomic, dataStore);
    }
}
/// <summary>
/// parses the relevant data from each transcript and appends the resulting
/// transcript to the data store
/// </summary>
/// <param name="objectValue">the transcript object whose key/value pairs are enumerated</param>
/// <param name="transcriptIndex">not read by this method</param>
/// <param name="dataStore">data store that receives the transcript and supplies the current reference index</param>
/// <exception cref="GeneralException">
/// thrown for a key missing from KnownKeys, a key without a handler,
/// or a nested entry of an unexpected node type
/// </exception>
public static void Parse(ObjectValue objectValue, int transcriptIndex, ImportDataStore dataStore)
{
    // values used when the corresponding key is absent
    var bioType          = BioType.Unknown;
    var geneSymbolSource = GeneSymbolSource.Unknown;

    SimpleInterval[] microRnas                                   = null;
    DataStructures.VEP.Exon[] transExons                         = null;
    DataStructures.VEP.Gene gene                                 = null;
    DataStructures.VEP.Translation translation                   = null;
    DataStructures.VEP.VariantEffectFeatureCache variantEffectCache = null;
    DataStructures.VEP.Slice slice                               = null;

    bool onReverseStrand = false;
    bool isCanonical     = false;

    int compDnaCodingStart = -1;
    int compDnaCodingEnd   = -1;
    int start              = -1;
    int end                = -1;
    int hgncId             = -1;

    byte version = 1;

    string ccdsId       = null;
    string databaseId   = null;
    string proteinId    = null;
    string refSeqId     = null;
    string geneStableId = null;
    string stableId     = null;
    string geneSymbol   = null;

    // loop over all of the key/value pairs in the transcript object
    foreach (AbstractData ad in objectValue)
    {
        // sanity check: make sure we know what each key is used for
        if (!KnownKeys.Contains(ad.Key))
        {
            throw new GeneralException($"Encountered an unknown key in the dumper transcript object: {ad.Key}");
        }

        // handle each key
        switch (ad.Key)
        {
            case CodingRegionStartKey:
            case CodingRegionEndKey:
            case CreatedDateKey:
            case DescriptionKey:
            case DisplayXrefKey:
            case ExternalDbKey:
            case ExternalDisplayNameKey:
            case ExternalNameKey:
            case ExternalStatusKey:
            case GenePhenotypeKey:
            case ModifiedDateKey:
            case SourceKey:
            case SwissProtKey:
            case TremblKey:
            case UniParcKey:
                // not used
                break;

            case AttributesKey:
                var attributesList = ad as ListObjectKeyValue;
                if (attributesList != null)
                {
                    microRnas = Attribute.ParseList(attributesList.Values);
                }
                break;

            case BiotypeKey:
                bioType = TranscriptUtilities.GetBiotype(ad);
                break;

            case CcdsKey:
                ccdsId = GetStringOrNullIfPlaceholder(ad);
                break;

            case CdnaCodingEndKey:
                compDnaCodingEnd = DumperUtilities.GetInt32(ad);
                break;

            case CdnaCodingStartKey:
                compDnaCodingStart = DumperUtilities.GetInt32(ad);
                break;

            case DbIdKey:
                databaseId = GetStringOrNullIfPlaceholder(ad);
                break;

            case EndKey:
                end = DumperUtilities.GetInt32(ad);
                break;

            case GeneHgncIdKey:
                var hgnc = DumperUtilities.GetString(ad);

                // strip the textual prefix; ordinal comparison keeps the check independent of the current culture
                if (hgnc != null && hgnc.StartsWith("HGNC:", System.StringComparison.Ordinal))
                {
                    hgnc = hgnc.Substring(5);
                }

                if (hgnc == "-" || hgnc == "")
                {
                    hgnc = null;
                }

                if (hgnc != null)
                {
                    hgncId = int.Parse(hgnc);
                }
                break;

            case GeneSymbolKey:
            case GeneHgncKey: // older key
                geneSymbol = GetStringOrNullIfPlaceholder(ad);
                break;

            case GeneSymbolSourceKey:
                geneSymbolSource = TranscriptUtilities.GetGeneSymbolSource(ad);
                break;

            case GeneKey:
                // references (non-object nodes) are left for ParseReferences
                var geneNode = ad as ObjectKeyValue;
                if (geneNode != null)
                {
                    gene = Gene.Parse(geneNode.Value, dataStore.CurrentReferenceIndex);
                }
                break;

            case GeneStableIdKey:
                geneStableId = GetStringOrNullIfPlaceholder(ad);
                break;

            case IsCanonicalKey:
                isCanonical = DumperUtilities.GetBool(ad);
                break;

            case ProteinKey:
                proteinId = GetStringOrNullIfPlaceholder(ad);
                break;

            case RefseqKey:
                refSeqId = GetStringOrNullIfPlaceholder(ad);
                break;

            case SliceKey:
                // references (non-object nodes) are left for ParseReferences
                var sliceNode = ad as ObjectKeyValue;
                if (sliceNode != null)
                {
                    slice = Slice.Parse(sliceNode.Value, dataStore.CurrentReferenceIndex);
                }
                break;

            case StableIdKey:
                stableId = GetStringOrNullIfPlaceholder(ad);
                break;

            case StartKey:
                start = DumperUtilities.GetInt32(ad);
                break;

            case StrandKey:
                onReverseStrand = TranscriptUtilities.GetStrand(ad);
                break;

            case TransExonArrayKey:
                var exonsList = ad as ListObjectKeyValue;
                if (exonsList != null)
                {
                    transExons = Exon.ParseList(exonsList.Values, dataStore);
                }
                else
                {
                    throw new GeneralException($"Could not transform the AbstractData object into a ListObjectKeyValue: [{ad.GetType()}]");
                }
                break;

            case TranslationKey:
                var translationNode = ad as ObjectKeyValue;
                if (translationNode != null)
                {
                    translation = Translation.Parse(translationNode.Value, dataStore);
                }
                else if (DumperUtilities.IsUndefined(ad))
                {
                    translation = null;
                }
                else
                {
                    throw new GeneralException($"Could not transform the AbstractData object into an ObjectKeyValue: [{ad.GetType()}]");
                }
                break;

            case VariationEffectFeatureCacheKey:
                var cacheNode = ad as ObjectKeyValue;
                if (cacheNode == null)
                {
                    throw new GeneralException($"Could not transform the AbstractData object into an ObjectKeyValue: [{ad.GetType()}]");
                }

                variantEffectCache = VariantEffectFeatureCache.Parse(cacheNode.Value, dataStore);
                break;

            case VersionKey:
                // NOTE(review): the cast truncates values outside the byte range - confirm versions never exceed 255
                version = (byte)DumperUtilities.GetInt32(ad);
                break;

            default:
                throw new GeneralException($"Unknown key found: {ad.Key}");
        }
    }

    dataStore.Transcripts.Add(new DataStructures.VEP.Transcript(bioType, transExons, gene, translation,
        variantEffectCache, slice, onReverseStrand, isCanonical, compDnaCodingStart, compDnaCodingEnd,
        dataStore.CurrentReferenceIndex, start, end, ccdsId, databaseId, proteinId, refSeqId, geneStableId,
        stableId, geneSymbol, geneSymbolSource, hgncId, version, microRnas));
}

/// <summary>
/// reads a string value and maps the placeholders "-" and "" to null
/// </summary>
private static string GetStringOrNullIfPlaceholder(AbstractData ad)
{
    string value = DumperUtilities.GetString(ad);
    return value == "-" || value == "" ? null : value;
}
/// <summary>
/// Resolves the references of every intron node in the list against the
/// introns that have already been created.
/// </summary>
/// <param name="abstractDataList">intron nodes; each one must be an ObjectValue</param>
/// <param name="introns">previously created introns, addressed by the same position as the nodes</param>
/// <param name="dataStore">data store handed through to the per-intron reference parser</param>
/// <exception cref="GeneralException">thrown when a node is not an ObjectValue</exception>
public static void ParseListReference(List<AbstractData> abstractDataList, DataStructures.VEP.Intron[] introns, ImportDataStore dataStore)
{
    // the nodes and the introns are paired up by position
    int position = 0;

    foreach (var node in abstractDataList)
    {
        var intronObject = node as ObjectValue;

        if (intronObject == null)
        {
            throw new GeneralException(
                $"Could not transform the AbstractData object into an ObjectValue: [{node.GetType()}]");
        }

        ParseReference(intronObject, introns[position], dataStore);
        position++;
    }
}
/// <summary>
/// Resolves a reference string to a coding DNA mapper pair that has already been created.
/// </summary>
/// <param name="reference">
/// reference string; it must match ReferenceCodingDnaRegex, whose first capture group is
/// read as the transcript index and whose second capture group is read as the mapper pair index
/// </param>
/// <param name="dataStore">data store whose Transcripts list is addressed by the transcript index</param>
/// <returns>the coding DNA mapper pair at the referenced position of the referenced transcript</returns>
/// <exception cref="GeneralException">
/// thrown when the reference string does not match, when either index is not an integer,
/// or when either index lies outside the list it addresses
/// </exception>
private static DataStructures.VEP.MapperPair ParseCodingDnaReference(string reference, ImportDataStore dataStore)
{
    var mapperPairReferenceMatch = ReferenceCodingDnaRegex.Match(reference);

    if (!mapperPairReferenceMatch.Success)
    {
        throw new GeneralException(
            $"Unable to use the regular expression on the mapper pair reference string: [{reference}]");
    }

    int transcriptIndex;
    if (!int.TryParse(mapperPairReferenceMatch.Groups[1].Value, out transcriptIndex))
    {
        throw new GeneralException(
            $"Unable to convert the transcript index from a string to an integer: [{mapperPairReferenceMatch.Groups[1].Value}]");
    }

    // sanity check: make sure we have at least that many transcripts in our list
    if (transcriptIndex < 0 || transcriptIndex >= dataStore.Transcripts.Count)
    {
        throw new GeneralException(
            $"Unable to link the mapper pair reference: transcript index: [{transcriptIndex}], current # of transcripts: [{dataStore.Transcripts.Count}]");
    }

    int codingDnaMapperPairIndex;
    if (!int.TryParse(mapperPairReferenceMatch.Groups[2].Value, out codingDnaMapperPairIndex))
    {
        throw new GeneralException(
            $"Unable to convert the mapper pair index from a string to an integer: [{mapperPairReferenceMatch.Groups[2].Value}]");
    }

    // sanity check: validate the index against the coding DNA list, because that is the
    // list indexed below (the genomic list is a different collection and may differ in size)
    var codingDnaMapperPairs = dataStore.Transcripts[transcriptIndex].VariantEffectCache.Mapper.ExonCoordinateMapper.PairCodingDna.CodingDna;
    int numCodingDnaMapperPairs = codingDnaMapperPairs.Count;

    if (codingDnaMapperPairIndex < 0 || codingDnaMapperPairIndex >= numCodingDnaMapperPairs)
    {
        throw new GeneralException(
            $"Unable to link the mapper pair reference: mapper pair index: [{codingDnaMapperPairIndex}], current # of mapper pairs: [{numCodingDnaMapperPairs}]");
    }

    return codingDnaMapperPairs[codingDnaMapperPairIndex];
}
/// <summary>
/// Walks the entries of an intron object and applies every reference entry
/// to the supplied, already-created intron.
/// </summary>
/// <param name="objectValue">intron object whose entries are inspected</param>
/// <param name="intron">intron that receives the resolved slice</param>
/// <param name="dataStore">data store used to resolve the slice reference</param>
/// <exception cref="GeneralException">thrown when a reference entry has a key other than the slice key</exception>
private static void ParseReference(ObjectValue objectValue, DataStructures.VEP.Intron intron, ImportDataStore dataStore)
{
    foreach (AbstractData entry in objectValue)
    {
        // only reference entries are of interest here
        if (DumperUtilities.IsReference(entry))
        {
            // the slice is the only reference that is understood
            if (entry.Key != Transcript.SliceKey)
            {
                throw new GeneralException($"Found an unhandled reference in the intron object: {entry.Key}");
            }

            var sliceReference = entry as ReferenceKeyValue;

            // a slice entry that is not a ReferenceKeyValue is left untouched
            if (sliceReference == null)
            {
                continue;
            }

            intron.Slice = Slice.ParseReference(sliceReference.Value, dataStore);
        }
    }
}
/// <summary>
/// Builds a list of mapper pairs from the nodes of a mapper pairs object.
/// </summary>
/// <param name="abstractDataList">mapper pair nodes; reference nodes are skipped</param>
/// <param name="dataStore">data store that supplies the current reference index</param>
/// <returns>
/// a list with one slot per node; slots that belong to skipped reference nodes keep
/// whatever DumperUtilities.GetPopulatedList placed there
/// </returns>
/// <exception cref="GeneralException">
/// thrown when a non-reference node has an unexpected data type or is not an ObjectValue
/// </exception>
public static List<DataStructures.VEP.MapperPair> ParseList(List<AbstractData> abstractDataList, ImportDataStore dataStore)
{
    int numNodes = abstractDataList.Count;
    var parsedPairs = DumperUtilities.GetPopulatedList<DataStructures.VEP.MapperPair>(numNodes);

    for (int slot = 0; slot < numNodes; slot++)
    {
        var node = abstractDataList[slot];

        if (!DumperUtilities.IsReference(node))
        {
            // the data type is verified before the cast is attempted
            if (node.DataType != DataType)
            {
                throw new GeneralException(
                    $"Expected a mapper pair data type, but found the following data type: [{node.DataType}]");
            }

            var pairObject = node as ObjectValue;

            if (pairObject == null)
            {
                throw new GeneralException(
                    $"Could not transform the AbstractData object into an ObjectValue: [{node.GetType()}]");
            }

            // NOTE: a de-duplication lookup through dataStore.MapperPairs used to live here
            // and is disabled; each freshly parsed pair is stored as-is
            parsedPairs[slot] = Parse(pairObject, dataStore.CurrentReferenceIndex);
        }
    }

    return parsedPairs;
}