/// <summary>
/// Collects PubMed identifiers from the ID children of a citation element
/// and appends them to <c>_pubMedIds</c>.
/// </summary>
/// <param name="xmlElement">The citation element whose children are inspected.</param>
private void ParseCitation(LiteXmlElement xmlElement)
{
    if (xmlElement.Children == null)
    {
        return;
    }

    foreach (var child in xmlElement.Children)
    {
        if (child.Name != "ID")
        {
            continue;
        }

        // only IDs explicitly sourced from PubMed are of interest
        if (!child.Attributes.ContainsKey("Source") || child.Attributes["Source"] != "PubMed")
        {
            continue;
        }

        // an ID element may carry attributes but no text; indexing [0] would throw in that case
        var rawValue = child.StringValues.FirstOrDefault();
        if (rawValue == null)
        {
            continue;
        }

        var value = rawValue.TrimEnd('.').TrimStart('0');

        // Trimming can leave an empty string (e.g. "0" or "."), which Convert.ToInt64 rejects.
        // PubMed ids with more than 8 digits are treated as bad.
        // Only ASCII digits are accepted: char.IsDigit also matches other Unicode decimal digits,
        // which would make the conversion below throw.
        if (value.Length == 0 || value.Length > 8 || !value.All(c => c >= '0' && c <= '9'))
        {
            continue;
        }

        _pubMedIds.Add(Convert.ToInt64(value));
    }
}
/// <summary>
/// Reads the phenotype information carried by a trait's name element.
/// </summary>
/// <param name="xmlElement">The element whose ElementValue and XRef children are processed.</param>
private void ParsePnenotype(LiteXmlElement xmlElement)
{
    var children = xmlElement.Children;
    if (children == null)
    {
        return;
    }

    foreach (var node in children)
    {
        if (node.Name == "ElementValue")
        {
            // contains the phenotype, e.g.
            // <ElementValue Type="Preferred">Breast-ovarian cancer, familial 1</ElementValue>
            ParsePhenotypeElementValue(node);

            // XRefs of alternate names are not wanted, so stop at the first non-preferred value
            if (!IsPreferredPhenotype(node))
            {
                return;
            }
        }
        else if (node.Name == "XRef")
        {
            ParseXref(node);
        }
    }
}
/// <summary>
/// Records the identifier of a cross-reference element in the field that matches its
/// "DB" attribute (MedGen, Orphanet, OMIM or dbSNP). Other databases are ignored.
/// </summary>
/// <param name="xmlElement">The XRef element to read.</param>
private void ParseXref(LiteXmlElement xmlElement)
{
    var attributes = xmlElement.Attributes;
    if (!attributes.ContainsKey("DB"))
    {
        return;
    }

    string database = attributes["DB"];

    if (database == "MedGen")
    {
        _medGenIDs.Add(attributes["ID"]);
    }
    else if (database == "Orphanet")
    {
        _orphanetIDs.Add(attributes["ID"]);
    }
    else if (database == "OMIM")
    {
        // only OMIM entries explicitly typed as "MIM" are kept
        if (attributes.ContainsKey("Type") && attributes["Type"] == "MIM")
        {
            _omimIDs.Add(attributes["ID"]);
        }
    }
    else if (database == "dbSNP")
    {
        // multiple dbSNP ids are accumulated as a comma-separated string
        var id = attributes["ID"];
        if (string.IsNullOrEmpty(_dbSnp))
        {
            _dbSnp = id;
        }
        else
        {
            _dbSnp = _dbSnp + "," + id;
        }
    }
}
/// <summary>
/// Tells whether an element carries a "Type" attribute equal to "Preferred".
/// </summary>
/// <param name="xmlElement">The element to inspect.</param>
/// <returns>true when the "Type" attribute exists and equals "Preferred"; otherwise false.</returns>
private static bool IsPreferredPhenotype(LiteXmlElement xmlElement)
{
    var attributes = xmlElement.Attributes;
    return attributes.ContainsKey("Type") && attributes["Type"] == "Preferred";
}
/// <summary>
/// Extracts the variants (SequenceLocation children) and dbSNP cross references of a
/// Measure element. When no dbSNP id is found, <c>_variantList</c> is cleared.
/// </summary>
/// <param name="xmlElement">The Measure element to process.</param>
private void ParseMeasure(LiteXmlElement xmlElement)
{
    if (xmlElement.Children == null)
    {
        return;
    }

    _dbSnp = null;

    // the variant type is available in the attributes
    string measureType = null;
    foreach (var attribute in xmlElement.Attributes)
    {
        if (attribute.Key != "Type")
        {
            continue;
        }

        measureType = attribute.Value;
    }

    foreach (var node in xmlElement.Children)
    {
        if (node.Name == "SequenceLocation")
        {
            var parsed = GetClinvarVariant(node, _compressedSequence.GenomeAssembly);
            if (parsed == null)
            {
                continue;
            }

            parsed.VariantType = measureType;

            // a single-character alternate allele found in _iupacBases is handed to AddIupacVariants
            var isIupacCode = parsed.AltAllele != null
                              && parsed.AltAllele.Length == 1
                              && _iupacBases.ContainsKey(parsed.AltAllele[0]);

            if (isIupacCode)
            {
                AddIupacVariants(parsed);
            }
            else
            {
                _variantList.Add(parsed);
            }
        }
        else if (node.Name == "XRef")
        {
            ParseXref(node);
        }
    }

    // if we don't have a dbSNP for this variant, we will skip it
    if (_dbSnp == null)
    {
        _variantList.Clear();
        return;
    }

    foreach (var entry in _variantList)
    {
        entry.DbSnp = _dbSnp;
    }
}
/// <summary>
/// Recursively converts the element the reader is currently positioned on, together with
/// its attributes, text and child elements, into a <c>LiteXmlElement</c>.
/// </summary>
/// <param name="xmlReader">
/// A reader positioned on an element node. The parameter is typed as <see cref="XmlReader"/>
/// so that readers returned by <c>XmlReader.Create</c> can be passed as well as
/// <see cref="XmlTextReader"/> instances.
/// </param>
/// <returns>
/// The parsed element, or null when the element is empty (per <c>IsEmpty()</c>) or the input
/// ends before the matching end tag is seen.
/// </returns>
private static LiteXmlElement ParseXmlElement(XmlReader xmlReader)
{
    var xmlElement = new LiteXmlElement(xmlReader.Name);

    // captured while the reader is still positioned on the element itself,
    // before MoveToNextAttribute repositions it
    var isEmptyElement = xmlReader.IsEmptyElement;

    if (xmlReader.HasAttributes)
    {
        while (xmlReader.MoveToNextAttribute())
        {
            xmlElement.Attributes[xmlReader.Name] = xmlReader.Value;
        }
    }

    // a self-closing element has no content and no end tag to wait for
    if (isEmptyElement)
    {
        return xmlElement.IsEmpty() ? null : xmlElement;
    }

    // we will read till an end tag is observed
    while (xmlReader.Read())
    {
        switch (xmlReader.NodeType)
        {
            case XmlNodeType.Element:
                // nested element: parse it recursively and keep it unless it is empty
                var child = ParseXmlElement(xmlReader);
                if (child != null)
                {
                    xmlElement.Children.Add(child);
                }
                break;

            case XmlNodeType.Text:
                if (!string.IsNullOrEmpty(xmlReader.Value))
                {
                    xmlElement.StringValues.Add(xmlReader.Value);
                }
                break;

            case XmlNodeType.EndElement:
                if (xmlReader.Name == xmlElement.Name)
                {
                    return xmlElement.IsEmpty() ? null : xmlElement;
                }

                // a mismatching end tag is reported but parsing continues
                Console.WriteLine("WARNING!! encountered unexpected endElement tag:" + xmlReader.Name);
                break;
        }
    }

    return null;
}
/// <summary>
/// Reads the ClinVar XML file and lazily yields every ClinVar item extracted from it.
/// </summary>
/// <returns>A sequence of the ClinVar items found in the file.</returns>
private IEnumerable<ClinVarItem> GetItems()
{
    using (var reader = GZipUtilities.GetAppropriateStreamReader(_clinVarXmlFileInfo.FullName))
    using (var xmlReader = XmlTextReader.Create(reader, new XmlReaderSettings
    {
        DtdProcessing = DtdProcessing.Prohibit,
        IgnoreWhitespace = true
    }))
    {
        string currentElementName = null;

        // skipping the top level element to go down to its children
        xmlReader.ReadToDescendant("ClinVarSet");

        do
        {
            var nodeType = xmlReader.NodeType;
            var isStart = nodeType == XmlNodeType.Element;
            var isEnd = nodeType == XmlNodeType.EndElement;

            // only element starts and ends are of interest
            if (!isStart && !isEnd)
            {
                continue;
            }

            LiteXmlElement parsedElement = null;

            if (isStart)
            {
                currentElementName = xmlReader.Name;
                parsedElement = ParseXmlElement(xmlReader);
            }
            else
            {
                // ReleaseSet is the top level element we skipped, so its end tag is an expected mismatch
                if (xmlReader.Name != "ReleaseSet" && xmlReader.Name != currentElementName)
                {
                    throw new InvalidDataException("WARNING!! encountered unexpected endElement tag:" + xmlReader.Name);
                }
            }

            var extractedItems = ExtractClinVarItems(parsedElement);
            if (extractedItems != null)
            {
                foreach (var item in extractedItems)
                {
                    yield return item;
                }
            }
        } while (xmlReader.Read());
    }
}
/// <summary>
/// Stores the first text value of a phenotype element in the preferred or alternate
/// phenotype list, depending on its "Type" attribute. Other types are ignored.
/// </summary>
/// <param name="xmlElement">The ElementValue element to read.</param>
private void ParsePhenotypeElementValue(LiteXmlElement xmlElement)
{
    if (!xmlElement.Attributes.ContainsKey("Type"))
    {
        return;
    }

    switch (xmlElement.Attributes["Type"])
    {
        case "Preferred":
            _prefPhenotypes.Add(xmlElement.StringValues[0]);
            break;

        case "Alternate":
            _altPhenotypes.Add(xmlElement.StringValues[0]);
            break;
    }
}
/// <summary>
/// Builds a <c>ClinvarVariant</c> from the attributes of a SequenceLocation element.
/// </summary>
/// <param name="xmlElement">The SequenceLocation element.</param>
/// <param name="genomeAssembly">
/// The assembly to accept; when it is <c>GenomeAssembly.Unknown</c> any assembly is accepted.
/// </param>
/// <returns>
/// The variant after <c>AdjustVariant</c> has been applied, or null when the element has no
/// children collection or its "Assembly" attribute does not match the requested assembly.
/// </returns>
private static ClinvarVariant GetClinvarVariant(LiteXmlElement xmlElement, GenomeAssembly genomeAssembly)
{
    if (xmlElement.Children == null)
    {
        return null;
    }

    // example:
    // <SequenceLocation Assembly="GRCh38" Chr="17" Accession="NC_000017.11" start="43082402" stop="43082402" variantLength="1" referenceAllele="A" alternateAllele="C" />
    string chromosome = null;
    string referenceAllele = null;
    string altAllele = null;
    int start = 0;
    int stop = 0;

    foreach (var attribute in xmlElement.Attributes)
    {
        var key = attribute.Key;

        if (key == "Assembly")
        {
            if (attribute.Value != genomeAssembly.ToString() && genomeAssembly != GenomeAssembly.Unknown)
            {
                return null;
            }
        }
        else if (key == "Chr")
        {
            chromosome = attribute.Value;
        }
        else if (key == "display_start")
        {
            start = Convert.ToInt32(attribute.Value);
        }
        else if (key == "display_stop")
        {
            stop = Convert.ToInt32(attribute.Value);
        }
        else if (key == "referenceAllele")
        {
            referenceAllele = attribute.Value;
        }
        else if (key == "alternateAllele")
        {
            altAllele = attribute.Value;
        }
    }

    AdjustVariant(ref start, ref stop, ref referenceAllele, ref altAllele);

    return new ClinvarVariant(chromosome, start, stop, referenceAllele, altAllele);
}
/// <summary>
/// Passes every "Measure" child of a MeasureSet element to <c>ParseMeasure</c>.
/// </summary>
/// <param name="xmlElement">The MeasureSet element.</param>
private void ParseMeasureSet(LiteXmlElement xmlElement)
{
    var children = xmlElement.Children;
    if (children == null)
    {
        return;
    }

    foreach (var node in children)
    {
        if (node.Name != "Measure")
        {
            continue;
        }

        // this element contains the sequence location info
        ParseMeasure(node);
    }
}
/// <summary>
/// Passes every "Trait" child of a TraitSet element to <c>ParseTrait</c>.
/// </summary>
/// <param name="xmlElement">The TraitSet element.</param>
private void ParseTraitSet(LiteXmlElement xmlElement)
{
    if (xmlElement.Children == null)
    {
        return;
    }

    foreach (var node in xmlElement.Children)
    {
        // a Trait element contains xref and phenotype name
        if (node.Name == "Trait")
        {
            ParseTrait(node);
        }
    }
}
/// <summary>
/// Reads a ReferenceClinVarAssertion element: its last-updated date, record status,
/// accession id, clinical significance, measure set and trait set.
/// </summary>
/// <param name="xmlElement">The ReferenceClinVarAssertion element.</param>
private void ParseRefClinVarAssertion(LiteXmlElement xmlElement)
{
    if (xmlElement.Children == null)
    {
        return;
    }

    // example:
    // <ReferenceClinVarAssertion DateCreated="2013-10-28" DateLastUpdated="2016-04-20" ID="182406">
    foreach (var attribute in xmlElement.Attributes)
    {
        if (attribute.Key != "DateLastUpdated")
        {
            continue;
        }

        _lastUpdatedDate = ParseDate(attribute.Value);
    }

    foreach (var node in xmlElement.Children)
    {
        var nodeName = node.Name;

        if (nodeName == "RecordStatus")
        {
            _recordStatus = node.StringValues[0];
        }
        else if (nodeName == "ClinVarAccession")
        {
            // the id is "<accession>.<version>"
            var accession = node.Attributes["Acc"];
            var version = node.Attributes["Version"];
            _id = accession + "." + version;
        }
        else if (nodeName == "ClinicalSignificance")
        {
            GetClinicalSignificance(node);
        }
        else if (nodeName == "MeasureSet")
        {
            // variant info like position, ref and alt, etc.
            ParseMeasureSet(node);
        }
        else if (nodeName == "TraitSet")
        {
            // contains cross ref, phenotype
            ParseTraitSet(node);
        }
    }
}
/// <summary>
/// Reads the review status and the lower-cased significance description from a
/// ClinicalSignificance element.
/// </summary>
/// <param name="xmlElement">The ClinicalSignificance element.</param>
private void GetClinicalSignificance(LiteXmlElement xmlElement)
{
    if (xmlElement.Children == null)
    {
        return;
    }

    foreach (var child in xmlElement.Children)
    {
        switch (child.Name)
        {
            case "ReviewStatus":
                _reviewStatus = child.StringValues[0];
                break;

            case "Description":
                // Invariant casing keeps the stored value independent of the machine's current
                // culture (e.g. the Turkish dotless-i mapping applied by a culture-sensitive ToLower()).
                _significance = child.StringValues[0].ToLowerInvariant();
                break;
        }
    }
}
/// <summary>
/// Processes the cross references and the name of a Trait element.
/// </summary>
/// <param name="xmlElement">The Trait element.</param>
private void ParseTrait(LiteXmlElement xmlElement)
{
    var children = xmlElement.Children;
    if (children == null)
    {
        return;
    }

    foreach (var node in children)
    {
        if (node.Name == "XRef")
        {
            // this contains MedGen, Orphanet, Omim ids
            ParseXref(node);
        }
        else if (node.Name == "Name")
        {
            ParsePnenotype(node);
        }
    }
}
/// <summary>
/// Walks an SCV (ClinVarAssertion) subtree recursively and collects the two pieces of
/// information wanted from it: PubMed ids (from Citation elements) and allele origins.
/// </summary>
/// <param name="xmlElement">The element whose descendants are searched.</param>
private void ParseScv(LiteXmlElement xmlElement)
{
    if (xmlElement.Children == null)
    {
        return;
    }

    foreach (var node in xmlElement.Children)
    {
        switch (node.Name)
        {
            case "Citation":
                ParseCitation(node);
                break;

            case "Origin":
                _alleleOrigins.Add(node.StringValues[0]);
                break;
        }

        // keep going deeper, whatever the element was
        ParseScv(node);
    }
}
/// <summary>
/// Resets the per-record fields, parses one top-level element and turns the variants
/// collected from it into ClinVar items.
/// </summary>
/// <param name="xmlElement">The parsed top-level element; may be null.</param>
/// <returns>
/// The extracted items, or null when the element is null or empty, the record status is
/// not "current", or no variant produced an item.
/// </returns>
private List<ClinVarItem> ExtractClinVarItems(LiteXmlElement xmlElement)
{
    ClearClinvarFields();

    if (xmlElement == null || xmlElement.IsEmpty())
    {
        return null;
    }

    foreach (var section in xmlElement.Children)
    {
        if (section.Name == "ReferenceClinVarAssertion")
        {
            ParseRefClinVarAssertion(section);
        }
        else if (section.Name == "ClinVarAssertion")
        {
            ParseScv(section);
        }
    }

    if (_recordStatus != "current")
    {
        Console.WriteLine($"record status not current: {_recordStatus} for {_id}");
        return null;
    }

    var items = new List<ClinVarItem>();

    foreach (var variant in _variantList)
    {
        // in order to match the VCF, we leave out the ones that do not have a dbSNP id;
        // undesired chromosomes and microsatellites are skipped as well
        if (variant.DbSnp == null
            || !InputFileParserUtilities.IsDesiredChromosome(variant.Chromosome, _compressedSequence.Renamer)
            || variant.VariantType == "Microsatellite")
        {
            continue;
        }

        var referenceIndex = _compressedSequence.Renamer.GetReferenceIndex(variant.Chromosome);
        if (referenceIndex == ChromosomeRenamer.UnknownReferenceIndex)
        {
            throw new GeneralException($"Could not find the reference index for: {variant.Chromosome}");
        }

        _dataFileManager.LoadReference(referenceIndex, () => { });

        ClinvarVariant normalized = variant;

        // some entries do not have a ref allele in the xml file;
        // for those, we extract them from our ref sequence
        if (variant.ReferenceAllele == null && variant.VariantType == "Deletion")
        {
            normalized = GenerateRefAllele(variant, _compressedSequence);
        }

        if (variant.AltAllele == null && variant.VariantType == "Duplication")
        {
            normalized = GenerateAltAllele(variant, _compressedSequence);
        }

        // left align the variant
        normalized = LeftShift(normalized);

        // indels lacking a ref allele get it generated from the original (unshifted) variant
        if (variant.ReferenceAllele == null && variant.VariantType == "Indel" && variant.AltAllele != null)
        {
            normalized = GenerateRefAllele(variant, _compressedSequence);
        }

        _pubMedIds.Sort();

        var hasNoAlleles = string.IsNullOrEmpty(normalized.ReferenceAllele)
                           && string.IsNullOrEmpty(normalized.AltAllele);
        if (hasNoAlleles)
        {
            continue;
        }

        // alternate phenotype names are only used when no preferred name exists
        var phenotypes = _prefPhenotypes.Count > 0
            ? _prefPhenotypes.Distinct().ToList()
            : _altPhenotypes.Distinct().ToList();

        var item = new ClinVarItem(
            normalized.Chromosome,
            normalized.Start,
            _alleleOrigins.Distinct().ToList(),
            normalized.AltAllele ?? "",
            _id,
            _reviewStatus,
            _medGenIDs.Distinct().ToList(),
            _omimIDs.Distinct().ToList(),
            _orphanetIDs.Distinct().ToList(),
            phenotypes,
            normalized.ReferenceAllele ?? "",
            _significance,
            _pubMedIds.Distinct().ToList(),
            _lastUpdatedDate);

        items.Add(item);
    }

    return items.Count > 0 ? items : null;
}