Exemple #1
0
        /// <summary>
        /// Collects PubMed identifiers from the <c>ID</c> children of a citation element
        /// and appends them to <c>_pubMedIds</c>.
        /// </summary>
        /// <param name="xmlElement">The citation element whose children are scanned.</param>
        private void ParseCitation(LiteXmlElement xmlElement)
        {
            if (xmlElement.Children == null)
            {
                return;
            }

            foreach (var child in xmlElement.Children)
            {
                if (child.Name != "ID")
                {
                    continue;
                }

                // only IDs whose Source attribute is "PubMed" are of interest
                if (!child.Attributes.ContainsKey("Source") || child.Attributes["Source"] != "PubMed")
                {
                    continue;
                }

                // an ID element without any text has nothing to parse
                if (child.StringValues.Count == 0)
                {
                    continue;
                }

                // drop a trailing '.' and any leading zeros before validating
                var value = child.StringValues[0].TrimEnd('.').TrimStart('0');

                // An all-zero (or empty) id trims down to "", which would pass the digit
                // test vacuously and then make Convert.ToInt64 throw. Ids longer than
                // 8 digits are treated as bad and skipped.
                if (value.Length == 0 || value.Length > 8)
                {
                    continue;
                }

                // Accept ASCII digits only: char.IsDigit also returns true for other
                // Unicode decimal digits, which the integer conversion cannot parse.
                if (value.All(c => c >= '0' && c <= '9'))
                {
                    _pubMedIds.Add(Convert.ToInt64(value));
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Reads the phenotype name of a trait: every <c>ElementValue</c> child is handed to
        /// <c>ParsePhenotypeElementValue</c> and every <c>XRef</c> child to <c>ParseXref</c>.
        /// Processing stops at the first <c>ElementValue</c> that is not the preferred name.
        /// </summary>
        /// <param name="xmlElement">The element whose children are processed.</param>
        private void ParsePnenotype(LiteXmlElement xmlElement)
        {
            if (xmlElement.Children == null)
            {
                return;
            }

            foreach (var node in xmlElement.Children)
            {
                if (node.Name == "ElementValue")
                {
                    // e.g. <ElementValue Type="Preferred">Breast-ovarian cancer, familial 1</ElementValue>
                    ParsePhenotypeElementValue(node);

                    // cross references of alternate names are not wanted, so stop here
                    if (!IsPreferredPhenotype(node))
                    {
                        return;
                    }
                }
                else if (node.Name == "XRef")
                {
                    ParseXref(node);
                }
            }
        }
Exemple #3
0
        /// <summary>
        /// Records the identifier of a cross-reference element in the collection that
        /// matches its <c>DB</c> attribute (MedGen, Orphanet, OMIM or dbSNP).
        /// Elements of any other database are ignored.
        /// </summary>
        /// <param name="xmlElement">The <c>XRef</c> element to inspect.</param>
        private void ParseXref(LiteXmlElement xmlElement)
        {
            var attributes = xmlElement.Attributes;

            // Without a database name or an identifier there is nothing to record.
            // The ID check mirrors the existing DB/Type guards and avoids a
            // KeyNotFoundException on an incomplete element.
            if (!attributes.ContainsKey("DB") || !attributes.ContainsKey("ID"))
            {
                return;
            }

            var id = attributes["ID"];

            switch (attributes["DB"])
            {
                case "MedGen":
                    _medGenIDs.Add(id);
                    break;

                case "Orphanet":
                    _orphanetIDs.Add(id);
                    break;

                case "OMIM":
                    // only OMIM references explicitly typed as "MIM" are kept
                    if (attributes.ContainsKey("Type") && attributes["Type"] == "MIM")
                    {
                        _omimIDs.Add(id);
                    }
                    break;

                case "dbSNP":
                    // several dbSNP ids are accumulated as a comma separated string
                    _dbSnp = string.IsNullOrEmpty(_dbSnp) ? id : _dbSnp + "," + id;
                    break;
            }
        }
Exemple #4
0
        /// <summary>
        /// Tells whether the element carries a <c>Type</c> attribute equal to "Preferred".
        /// </summary>
        /// <param name="xmlElement">The element to check.</param>
        /// <returns>true when the Type attribute exists and is "Preferred"; false otherwise.</returns>
        private static bool IsPreferredPhenotype(LiteXmlElement xmlElement)
        {
            var attributes = xmlElement.Attributes;
            return attributes.ContainsKey("Type") && attributes["Type"] == "Preferred";
        }
Exemple #5
0
        /// <summary>
        /// Parses a <c>Measure</c> element: builds a variant from each <c>SequenceLocation</c>
        /// child, collects dbSNP ids from the <c>XRef</c> children and stamps those ids on the
        /// variants found in this element. Variants of a Measure without a dbSNP id are dropped.
        /// </summary>
        /// <param name="xmlElement">The <c>Measure</c> element.</param>
        private void ParseMeasure(LiteXmlElement xmlElement)
        {
            if (xmlElement.Children == null)
            {
                return;
            }

            _dbSnp = null;

            // Everything already in the list was produced by a previously parsed Measure.
            // Only entries from this index on belong to the current element, so the dbSNP
            // bookkeeping below must not touch the earlier ones.
            var firstNewVariant = _variantList.Count;

            // the variant type is available in the attributes
            string varType = null;
            foreach (var attribute in xmlElement.Attributes)
            {
                if (attribute.Key == "Type")
                {
                    varType = attribute.Value;
                }
            }

            foreach (var child in xmlElement.Children)
            {
                switch (child.Name)
                {
                    case "SequenceLocation":
                        var variant = GetClinvarVariant(child, _compressedSequence.GenomeAssembly);
                        if (variant != null)
                        {
                            variant.VariantType = varType;

                            // a single-base alt allele that is a key of _iupacBases is handed to
                            // AddIupacVariants (presumably it appends the expanded variants to
                            // _variantList - confirm) instead of being added directly
                            if (variant.AltAllele != null && variant.AltAllele.Length == 1 && _iupacBases.ContainsKey(variant.AltAllele[0]))
                            {
                                AddIupacVariants(variant);
                            }
                            else
                            {
                                _variantList.Add(variant);
                            }
                        }
                        break;

                    case "XRef":
                        ParseXref(child);
                        break;
                }
            }

            // if we don't have a dbSNP for this measure, we skip the variants it contributed
            if (_dbSnp == null)
            {
                _variantList.RemoveRange(firstNewVariant, _variantList.Count - firstNewVariant);
                return;
            }

            for (var i = firstNewVariant; i < _variantList.Count; i++)
            {
                _variantList[i].DbSnp = _dbSnp;
            }
        }
Exemple #6
0
        /// <summary>
        /// Recursively converts the element the reader is positioned on into a
        /// <c>LiteXmlElement</c>, copying its attributes, text nodes and child elements.
        /// </summary>
        /// <param name="xmlReader">
        /// A reader positioned on an element start tag. Any <see cref="XmlReader"/> is accepted
        /// (including <c>XmlTextReader</c>) because only base-class members are used.
        /// </param>
        /// <returns>
        /// The parsed element, or null when the element reports <c>IsEmpty()</c> or when the
        /// input ends before the matching end tag is seen.
        /// </returns>
        private static LiteXmlElement ParseXmlElement(XmlReader xmlReader)
        {
            var xmlElement = new LiteXmlElement(xmlReader.Name);

            // must be captured before the reader is moved onto the attributes
            var isEmptyElement = xmlReader.IsEmptyElement;

            if (xmlReader.HasAttributes)
            {
                while (xmlReader.MoveToNextAttribute())
                {
                    xmlElement.Attributes[xmlReader.Name] = xmlReader.Value;
                }
            }

            // a self-closing element has no content to read
            if (isEmptyElement)
            {
                return xmlElement.IsEmpty() ? null : xmlElement;
            }

            // read until the end tag of this element is observed
            while (xmlReader.Read())
            {
                switch (xmlReader.NodeType)
                {
                    case XmlNodeType.Element:
                        var child = ParseXmlElement(xmlReader);
                        if (child != null)
                        {
                            xmlElement.Children.Add(child);
                        }
                        break;

                    case XmlNodeType.Text:
                        if (!string.IsNullOrEmpty(xmlReader.Value))
                        {
                            xmlElement.StringValues.Add(xmlReader.Value);
                        }
                        break;

                    case XmlNodeType.EndElement:
                        if (xmlReader.Name == xmlElement.Name)
                        {
                            return xmlElement.IsEmpty() ? null : xmlElement;
                        }
                        // a mismatching end tag is only reported; reading continues
                        Console.WriteLine("WARNING!! encountered unexpected endElement tag:" + xmlReader.Name);
                        break;
                }
            }

            // the input ended without closing this element
            return null;
        }
Exemple #7
0
        /// <summary>
        /// Streams through the ClinVar XML file and lazily yields every ClinVar item that
        /// <c>ExtractClinVarItems</c> produces for the elements found from the first
        /// <c>ClinVarSet</c> onwards.
        /// </summary>
        /// <returns>A lazily evaluated sequence of the extracted ClinVar items.</returns>
        private IEnumerable<ClinVarItem> GetItems()
        {
            using (var reader = GZipUtilities.GetAppropriateStreamReader(_clinVarXmlFileInfo.FullName))
            {
                var readerSettings = new XmlReaderSettings
                {
                    DtdProcessing    = DtdProcessing.Prohibit,
                    IgnoreWhitespace = true
                };

                using (var xmlReader = XmlReader.Create(reader, readerSettings))
                {
                    string currentElementName = null;

                    // skip the top level element and position the reader on its first ClinVarSet
                    xmlReader.ReadToDescendant("ClinVarSet");

                    do
                    {
                        var nodeType = xmlReader.NodeType;

                        // only start and end tags are of interest at this level
                        if (nodeType != XmlNodeType.Element && nodeType != XmlNodeType.EndElement)
                        {
                            continue;
                        }

                        LiteXmlElement parsedElement = null;

                        if (nodeType == XmlNodeType.Element)
                        {
                            currentElementName = xmlReader.Name;
                            parsedElement      = ParseXmlElement(xmlReader);
                        }
                        else if (xmlReader.Name != "ReleaseSet" && xmlReader.Name != currentElementName)
                        {
                            // ReleaseSet is the skipped top level element, so its end tag is an expected mismatch
                            throw new InvalidDataException("WARNING!! encountered unexpected endElement tag:" + xmlReader.Name);
                        }

                        // called for end tags as well (with null), exactly once per start/end node
                        var extractedItems = ExtractClinVarItems(parsedElement);

                        if (extractedItems != null)
                        {
                            foreach (var item in extractedItems)
                            {
                                yield return item;
                            }
                        }
                    } while (xmlReader.Read());
                }
            }
        }
Exemple #8
0
 /// <summary>
 /// Stores the first text value of the element as a preferred or an alternate
 /// phenotype name, depending on its <c>Type</c> attribute.
 /// </summary>
 /// <param name="xmlElement">The <c>ElementValue</c> element.</param>
 private void ParsePhenotypeElementValue(LiteXmlElement xmlElement)
 {
     if (!xmlElement.Attributes.ContainsKey("Type"))
     {
         return;
     }

     // any other Type value is ignored
     switch (xmlElement.Attributes["Type"])
     {
         case "Preferred":
             _prefPhenotypes.Add(xmlElement.StringValues[0]);
             break;

         case "Alternate":
             _altPhenotypes.Add(xmlElement.StringValues[0]);
             break;
     }
 }
Exemple #9
0
        /// <summary>
        /// Builds a <c>ClinvarVariant</c> from the attributes of a <c>SequenceLocation</c> element.
        /// </summary>
        /// <param name="xmlElement">The <c>SequenceLocation</c> element.</param>
        /// <param name="genomeAssembly">
        /// The assembly to keep; locations of a different assembly are rejected unless this is
        /// <c>GenomeAssembly.Unknown</c>.
        /// </param>
        /// <returns>
        /// The variant after <c>AdjustVariant</c> has been applied to its coordinates and alleles,
        /// or null when the element has no children collection or belongs to another assembly.
        /// </returns>
        private static ClinvarVariant GetClinvarVariant(LiteXmlElement xmlElement, GenomeAssembly genomeAssembly)
        {
            if (xmlElement.Children == null)
            {
                return null;
            }

            // coordinates come from display_start / display_stop, the alleles from
            // referenceAllele / alternateAllele; attributes that are absent keep these defaults
            string chrName   = null;
            string refAllele = null;
            string altAllele = null;
            int    begin     = 0;
            int    end       = 0;

            foreach (var attribute in xmlElement.Attributes)
            {
                var key  = attribute.Key;
                var text = attribute.Value;

                if (key == "Assembly")
                {
                    if (text != genomeAssembly.ToString() &&
                        genomeAssembly != GenomeAssembly.Unknown)
                    {
                        return null;
                    }
                }
                else if (key == "Chr")
                {
                    chrName = text;
                }
                else if (key == "display_start")
                {
                    begin = Convert.ToInt32(text);
                }
                else if (key == "display_stop")
                {
                    end = Convert.ToInt32(text);
                }
                else if (key == "referenceAllele")
                {
                    refAllele = text;
                }
                else if (key == "alternateAllele")
                {
                    altAllele = text;
                }
            }

            AdjustVariant(ref begin, ref end, ref refAllele, ref altAllele);
            return new ClinvarVariant(chrName, begin, end, refAllele, altAllele);
        }
Exemple #10
0
        /// <summary>
        /// Passes every <c>Measure</c> child of the element to <c>ParseMeasure</c>.
        /// </summary>
        /// <param name="xmlElement">The <c>MeasureSet</c> element.</param>
        private void ParseMeasureSet(LiteXmlElement xmlElement)
        {
            var children = xmlElement.Children;
            if (children == null)
            {
                return;
            }

            foreach (var node in children)
            {
                // the Measure element contains the sequence location info
                if (node.Name == "Measure")
                {
                    ParseMeasure(node);
                }
            }
        }
Exemple #11
0
        /// <summary>
        /// Passes every <c>Trait</c> child of the element to <c>ParseTrait</c>.
        /// </summary>
        /// <param name="xmlElement">The <c>TraitSet</c> element.</param>
        private void ParseTraitSet(LiteXmlElement xmlElement)
        {
            var children = xmlElement.Children;
            if (children == null)
            {
                return;
            }

            foreach (var node in children)
            {
                // the Trait element contains xref and phenotype name
                if (node.Name == "Trait")
                {
                    ParseTrait(node);
                }
            }
        }
Exemple #12
0
        /// <summary>
        /// Extracts the record-level fields of a <c>ReferenceClinVarAssertion</c>: last update
        /// date, record status and accession id, and delegates clinical significance,
        /// measure set and trait set parsing to the matching helpers.
        /// </summary>
        /// <param name="xmlElement">The <c>ReferenceClinVarAssertion</c> element.</param>
        private void ParseRefClinVarAssertion(LiteXmlElement xmlElement)
        {
            if (xmlElement.Children == null)
            {
                return;
            }

            // e.g. <ReferenceClinVarAssertion DateCreated="2013-10-28" DateLastUpdated="2016-04-20" ID="182406">
            if (xmlElement.Attributes.ContainsKey("DateLastUpdated"))
            {
                _lastUpdatedDate = ParseDate(xmlElement.Attributes["DateLastUpdated"]);
            }

            foreach (var node in xmlElement.Children)
            {
                var nodeName = node.Name;

                if (nodeName == "RecordStatus")
                {
                    _recordStatus = node.StringValues[0];
                }
                else if (nodeName == "ClinVarAccession")
                {
                    // the id is "<accession>.<version>"
                    _id = node.Attributes["Acc"] + "." + node.Attributes["Version"];
                }
                else if (nodeName == "ClinicalSignificance")
                {
                    GetClinicalSignificance(node);
                }
                else if (nodeName == "MeasureSet")
                {
                    // variant info like position, ref and alt
                    ParseMeasureSet(node);
                }
                else if (nodeName == "TraitSet")
                {
                    // cross references and phenotype
                    ParseTraitSet(node);
                }
            }
        }
Exemple #13
0
        /// <summary>
        /// Reads the review status and the (lower-cased) significance description from a
        /// <c>ClinicalSignificance</c> element.
        /// </summary>
        /// <param name="xmlElement">The <c>ClinicalSignificance</c> element.</param>
        private void GetClinicalSignificance(LiteXmlElement xmlElement)
        {
            if (xmlElement.Children == null)
            {
                return;
            }

            foreach (var child in xmlElement.Children)
            {
                switch (child.Name)
                {
                    case "ReviewStatus":
                        _reviewStatus = child.StringValues[0];
                        break;

                    case "Description":
                        // Invariant casing: the value is machine-readable data, so the result
                        // must not depend on the culture of the running thread.
                        _significance = child.StringValues[0].ToLowerInvariant();
                        break;
                }
            }
        }
Exemple #14
0
        /// <summary>
        /// Processes a <c>Trait</c> element: <c>XRef</c> children go to <c>ParseXref</c>,
        /// <c>Name</c> children go to <c>ParsePnenotype</c>.
        /// </summary>
        /// <param name="xmlElement">The <c>Trait</c> element.</param>
        private void ParseTrait(LiteXmlElement xmlElement)
        {
            var children = xmlElement.Children;
            if (children == null)
            {
                return;
            }

            foreach (var node in children)
            {
                if (node.Name == "XRef")
                {
                    // this contains MedGen, Orphanet, Omim ids
                    ParseXref(node);
                }
                else if (node.Name == "Name")
                {
                    ParsePnenotype(node);
                }
            }
        }
Exemple #15
0
        /// <summary>
        /// Walks the whole subtree of a submitted assertion and gathers the two things wanted
        /// from it: PubMed ids (via <c>Citation</c> elements) and allele origins
        /// (the first text value of <c>Origin</c> elements).
        /// </summary>
        /// <param name="xmlElement">The element whose descendants are searched.</param>
        private void ParseScv(LiteXmlElement xmlElement)
        {
            if (xmlElement.Children == null)
            {
                return;
            }

            foreach (var node in xmlElement.Children)
            {
                switch (node.Name)
                {
                    case "Citation":
                        ParseCitation(node);
                        break;

                    case "Origin":
                        _alleleOrigins.Add(node.StringValues[0]);
                        break;
                }

                // every child is descended into, whatever its name
                ParseScv(node);
            }
        }
Exemple #16
0
        /// <summary>
        /// Turns one parsed top-level element into ClinVar items, one per retained variant.
        /// The per-record fields are reset first, then filled from the
        /// <c>ReferenceClinVarAssertion</c> and <c>ClinVarAssertion</c> children.
        /// </summary>
        /// <param name="xmlElement">The parsed element; may be null.</param>
        /// <returns>
        /// The extracted items, or null when the element is null or empty, the record status
        /// is not "current", or no variant survives the filters.
        /// </returns>
        /// <exception cref="GeneralException">
        /// Thrown when no reference index is known for a variant's chromosome.
        /// </exception>
        private List<ClinVarItem> ExtractClinVarItems(LiteXmlElement xmlElement)
        {
            // the fields are reset even when there is nothing to parse
            ClearClinvarFields();

            if (xmlElement == null || xmlElement.IsEmpty())
            {
                return null;
            }

            foreach (var child in xmlElement.Children)
            {
                switch (child.Name)
                {
                    case "ReferenceClinVarAssertion":
                        ParseRefClinVarAssertion(child);
                        break;

                    case "ClinVarAssertion":
                        ParseScv(child);
                        break;
                }
            }

            if (_recordStatus != "current")
            {
                Console.WriteLine($"record status not current: {_recordStatus} for {_id}");
                return null;
            }

            // Sorted once up front: nothing in the loop below adds PubMed ids, so sorting
            // again for every variant was redundant work.
            _pubMedIds.Sort();

            var clinvarList = new List<ClinVarItem>();

            foreach (var variant in _variantList)
            {
                // in order to match the VCF, we leave out the ones that do not have dbsnp id
                if (variant.DbSnp == null)
                {
                    continue;
                }
                if (!InputFileParserUtilities.IsDesiredChromosome(variant.Chromosome, _compressedSequence.Renamer))
                {
                    continue;
                }
                if (variant.VariantType == "Microsatellite")
                {
                    continue;
                }

                var refIndex = _compressedSequence.Renamer.GetReferenceIndex(variant.Chromosome);
                if (refIndex == ChromosomeRenamer.UnknownReferenceIndex)
                {
                    throw new GeneralException($"Could not find the reference index for: {variant.Chromosome}");
                }
                _dataFileManager.LoadReference(refIndex, () => {});

                ClinvarVariant shiftedVariant = variant;

                // some entries do not have ref allele in the xml file. For those, we extract them from our ref sequence
                if (variant.ReferenceAllele == null && variant.VariantType == "Deletion")
                {
                    shiftedVariant = GenerateRefAllele(variant, _compressedSequence);
                }
                if (variant.AltAllele == null && variant.VariantType == "Duplication")
                {
                    shiftedVariant = GenerateAltAllele(variant, _compressedSequence);
                }

                // left align the variant
                shiftedVariant = LeftShift(shiftedVariant);

                // NOTE(review): this rebuilds from the original 'variant', so the result of the
                // left shift above is discarded for such indels - confirm that this is intended.
                if (variant.ReferenceAllele == null && variant.VariantType == "Indel" && variant.AltAllele != null)
                {
                    shiftedVariant = GenerateRefAllele(variant, _compressedSequence);
                }

                // a variant with neither allele carries no information
                if (string.IsNullOrEmpty(shiftedVariant.ReferenceAllele) && string.IsNullOrEmpty(shiftedVariant.AltAllele))
                {
                    continue;
                }

                // every item gets its own de-duplicated copies of the record-level lists;
                // alternate phenotype names are used only when no preferred name was found
                clinvarList.Add(
                    new ClinVarItem(shiftedVariant.Chromosome,
                                    shiftedVariant.Start,
                                    _alleleOrigins.Distinct().ToList(),
                                    shiftedVariant.AltAllele ?? "",
                                    _id,
                                    _reviewStatus,
                                    _medGenIDs.Distinct().ToList(),
                                    _omimIDs.Distinct().ToList(),
                                    _orphanetIDs.Distinct().ToList(),
                                    _prefPhenotypes.Count > 0 ? _prefPhenotypes.Distinct().ToList() : _altPhenotypes.Distinct().ToList(),
                                    shiftedVariant.ReferenceAllele ?? "",
                                    _significance,
                                    _pubMedIds.Distinct().ToList(),
                                    _lastUpdatedDate));
            }

            return clinvarList.Count > 0 ? clinvarList : null;
        }