/// <summary>
/// Creates the parser and immediately parses the Swiss-Prot file and, when requested,
/// the TrEMBL file, reporting finished entries through <paramref name="handle"/>.
/// </summary>
/// <param name="swissprotFileName">Swiss-Prot file path; when null the current field value is kept.</param>
/// <param name="tremblFileName">TrEMBL file path; when null the current field value is kept.</param>
/// <param name="includeTrembl">Whether the TrEMBL file is parsed after the Swiss-Prot file.</param>
/// <param name="handle">Callback forwarded to Parse.</param>
public UniprotParser(string swissprotFileName, string tremblFileName, bool includeTrembl, HandleUniprotEntry handle) {
    // A null argument leaves the corresponding field untouched.
    this.swissprotFileName = swissprotFileName ?? this.swissprotFileName;
    this.tremblFileName = tremblFileName ?? this.tremblFileName;

    Parse(this.swissprotFileName, handle, false);
    if (!includeTrembl) {
        return;
    }
    Parse(this.tremblFileName, handle, true);
}
/// <summary>
/// Reads one XML file node by node and forwards element starts, element ends and text
/// nodes to StartElement, EndElement and Characters respectively.
/// </summary>
/// <param name="filename">Path of the file to read. A name ending in ".gz" (any casing) is decompressed while reading.</param>
/// <param name="handle">Callback passed through to EndElement.</param>
/// <param name="isTrembl">Flag passed through to EndElement.</param>
private void Parse(string filename, HandleUniprotEntry handle, bool isTrembl) {
    // Every stream/reader is wrapped in a using statement so the file handle is released
    // even when parsing throws. When the file is not gzipped, fileStream and stream are the
    // same object; disposing a stream twice is harmless.
    using (Stream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
    // A file extension is not linguistic text, so compare ordinally and ignore case.
    using (Stream stream = filename.EndsWith(".gz", StringComparison.OrdinalIgnoreCase)
        ? new GZipStream(fileStream, CompressionMode.Decompress)
        : fileStream)
    using (XmlTextReader reader = new XmlTextReader(new StreamReader(stream))) {
        while (reader.Read()) {
            switch (reader.NodeType) {
                case XmlNodeType.Element: {
                    // Capture the element name before moving onto its attributes,
                    // because reader.Name then refers to the attribute.
                    string name = reader.Name;
                    Dictionary<string, string> attributes = new Dictionary<string, string>();
                    for (int i = 0; i < reader.AttributeCount; i++) {
                        reader.MoveToAttribute(i);
                        attributes.Add(reader.Name, reader.Value);
                    }
                    StartElement(name, attributes);
                    // NOTE(review): XmlReader reports no EndElement node for empty elements
                    // such as <x/>, so level is incremented here without a matching decrement
                    // for those - confirm StartElement/EndElement account for this.
                    level++;
                    break;
                }
                case XmlNodeType.EndElement:
                    level--;
                    EndElement(reader.Name, handle, isTrembl);
                    break;
                case XmlNodeType.Text:
                    Characters(reader.Value, 0, reader.Value.Length);
                    break;
            }
        }
    }
}
/// <summary>
/// Reads one XML file node by node and forwards element starts, element ends and text
/// nodes to StartElement, EndElement and Characters respectively.
/// </summary>
/// <param name="filename">Path of the file to read. A name ending in ".gz" (any casing) is decompressed while reading.</param>
/// <param name="handle">Callback passed through to EndElement.</param>
/// <param name="isTrembl">Flag passed through to EndElement.</param>
private void Parse(string filename, HandleUniprotEntry handle, bool isTrembl) {
    // Every stream/reader is wrapped in a using statement so the file handle is released
    // even when parsing throws. When the file is not gzipped, fileStream and stream are the
    // same object; disposing a stream twice is harmless.
    using (Stream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
    // A file extension is not linguistic text, so compare ordinally and ignore case.
    using (Stream stream = filename.EndsWith(".gz", StringComparison.OrdinalIgnoreCase)
        ? new GZipStream(fileStream, CompressionMode.Decompress)
        : fileStream)
    using (XmlTextReader reader = new XmlTextReader(new StreamReader(stream))) {
        while (reader.Read()) {
            switch (reader.NodeType) {
                case XmlNodeType.Element: {
                    // Capture the element name before moving onto its attributes,
                    // because reader.Name then refers to the attribute.
                    string name = reader.Name;
                    Dictionary<string, string> attributes = new Dictionary<string, string>();
                    for (int i = 0; i < reader.AttributeCount; i++) {
                        reader.MoveToAttribute(i);
                        attributes.Add(reader.Name, reader.Value);
                    }
                    StartElement(name, attributes);
                    // NOTE(review): XmlReader reports no EndElement node for empty elements
                    // such as <x/>, so level is incremented here without a matching decrement
                    // for those - confirm StartElement/EndElement account for this.
                    level++;
                    break;
                }
                case XmlNodeType.EndElement:
                    level--;
                    EndElement(reader.Name, handle, isTrembl);
                    break;
                case XmlNodeType.Text:
                    Characters(reader.Value, 0, reader.Value.Length);
                    break;
            }
        }
    }
}
/// <summary>
/// Handles a closing XML element. Depending on the element name (and the current parser
/// state flags) it stores the text collected for that element, clears state flags, or, for
/// a closing "entry", finalizes the current entry and passes it to <paramref name="handle"/>.
/// </summary>
/// <param name="qName">Name of the element that just closed.</param>
/// <param name="handle">Callback invoked with the finished entry, or with each entry returned by ResolveIsoforms.</param>
/// <param name="isTrembl">Value assigned to IsTrembl of the finished entry.</param>
private void EndElement(IEquatable<string> qName, HandleUniprotEntry handle, bool isTrembl) {
    if (qName.Equals("sequence")) {
        entry.Sequence = StringUtils.RemoveWhitespace(sequence.ToString());
        sequence = null;
    } else if (qName.Equals("keyword")) {
        entry.AddKeyword(StringUtils.RemoveWhitespace(keyword.ToString()));
        keyword = null;
    } else if (qName.Equals("molecule") && "Ensembl".Equals(dbReferenceType)) {
        // The literal is the receiver so a null dbReferenceType simply fails the test
        // instead of throwing a NullReferenceException.
        string mol = molecule.ToString().Trim();
        entry.AddDbEntryProperty(dbReferenceType, dbReferenceId, "isoform ID", mol);
        // Single lookup: fetch the id list for this isoform, creating it on first use.
        List<string> referenceIds;
        if (!isoformToEnst.TryGetValue(mol, out referenceIds)) {
            referenceIds = new List<string>();
            isoformToEnst.Add(mol, referenceIds);
        }
        referenceIds.Add(dbReferenceId);
        molecule = null;
    } else if (qName.Equals("entry")) {
        entry.Accessions = accessions.ToArray();
        entry.ProteinFullNames = proteinFullNames.ToArray();
        entry.ProteinShortNames = proteinShortNames.ToArray();
        entry.ProteinEcNumbers = proteinEcNumbers.ToArray();
        entry.GeneNamesAndTypes = gnames.ToArray();
        entry.OrganismNames = onames.ToArray();
        entry.UniprotNames = unames.ToArray();
        entry.IsTrembl = isTrembl;
        // Isoform entries are reported only when resolution is enabled and more than one
        // isoform and more than one isoform mapping were seen; otherwise the entry itself.
        if (resolveIsoforms && numIsoforms > 1 && isoformToEnst.Count > 1) {
            foreach (UniprotEntry isoEntry in entry.ResolveIsoforms(isoformToEnst)) {
                handle(isoEntry);
            }
        } else {
            handle(entry);
        }
    } else if (qName.Equals("dbReference")) {
        inDbRef = false;
    } else if (qName.Equals("accession")) {
        accessions.Add(StringUtils.RemoveWhitespace(accession.ToString()));
        accession = null;
    } else if (qName.Equals("location")) {
        if (inFeature) {
            entry.AddFeatureLocation(featureBegin, featureEnd);
        }
    } else if (qName.Equals("variation")) {
        if (inFeature) {
            entry.AddFeatureVariation(StringUtils.RemoveWhitespace(variation.ToString()));
            variation = null;
        }
    } else if (qName.Equals("original")) {
        if (inFeature) {
            entry.AddFeatureOriginal(StringUtils.RemoveWhitespace(original.ToString()));
            original = null;
        }
    } else if (qName.Equals("feature")) {
        inFeature = false;
        // NOTE(review): this adds the entry's per-type counts on every closing feature,
        // not only the count of the feature that just ended - confirm this is intended.
        foreach (FeatureType type in entry.GetAllFeatureTypes()) {
            int c = entry.GetFeatureCount(type);
            if (!featureCounts.ContainsKey(type)) {
                featureCounts.Add(type, 0);
            }
            featureCounts[type] += c;
        }
    } else if (qName.Equals("fullName") && inProteinRecommendedName) {
        proteinFullNames.Add(proteinFullName.ToString().Trim());
        proteinFullName = null;
    } else if (qName.Equals("shortName") && inProteinRecommendedName) {
        proteinShortNames.Add(proteinShortName.ToString().Trim());
        proteinShortName = null;
    } else if (qName.Equals("ecNumber") && inProteinRecommendedName) {
        proteinEcNumbers.Add(proteinEcNumber.ToString().Trim());
        proteinEcNumber = null;
    } else if (qName.Equals("name") && inGene) {
        gnames.Add(new Tuple<string, string>(gname.ToString().Trim(), gnameType.Trim()));
        gname = null;
        gnameType = null;
    } else if (qName.Equals("name") && inOrganism) {
        // Only non-empty organism names are recorded; oname is reset only in that case.
        string organismName = oname?.ToString().Trim();
        if (!string.IsNullOrEmpty(organismName)) {
            onames.Add(organismName);
            oname = null;
        }
    } else if (qName.Equals("name") && level == 1) {
        unames.Add(uname.ToString().Trim());
        uname = null;
    } else if (qName.Equals("protein")) {
        inProtein = false;
    } else if (qName.Equals("recommendedName") && inProtein) {
        inProteinRecommendedName = false;
    } else if (qName.Equals("gene")) {
        inGene = false;
    } else if (qName.Equals("organism")) {
        inOrganism = false;
    } else if (qName.Equals("organismHost")) {
        inOrganismHost = false;
    }
}
/// <summary>
/// Reacts to a closing XML element: stores the text collected for it, clears parser state
/// flags, or, for a closing "entry", completes the current entry, passes it to
/// <paramref name="handle"/> and increments entryCount.
/// </summary>
/// <param name="qName">Name of the element that just closed.</param>
/// <param name="handle">Callback invoked with the finished entry.</param>
/// <param name="isTrembl">Value assigned to IsTrembl of the finished entry.</param>
private void EndElement(IEquatable<string> qName, HandleUniprotEntry handle, bool isTrembl) {
    // Each case returns as soon as it has been handled; the order of the checks matters
    // for the three "name" cases, which are told apart by parser state.
    if (qName.Equals("sequence")) {
        string rawSequence = sequence.ToString();
        entry.Sequence = StringUtils.RemoveWhitespace(rawSequence);
        sequence = null;
        return;
    }
    if (qName.Equals("keyword")) {
        string rawKeyword = keyword.ToString();
        entry.AddKeyword(StringUtils.RemoveWhitespace(rawKeyword));
        keyword = null;
        return;
    }
    if (qName.Equals("entry")) {
        entry.Accessions = accessions.ToArray();
        entry.ProteinFullNames = proteinFullNames.ToArray();
        entry.ProteinShortNames = proteinShortNames.ToArray();
        entry.ProteinEcNumbers = proteinEcNumbers.ToArray();
        entry.GeneNamesAndTypes = gnames.ToArray();
        entry.OrganismNames = onames.ToArray();
        entry.UniprotNames = unames.ToArray();
        entry.IsTrembl = isTrembl;
        handle(entry);
        entryCount++;
        return;
    }
    if (qName.Equals("dbReference")) {
        inDbRef = false;
        return;
    }
    if (qName.Equals("accession")) {
        string rawAccession = accession.ToString();
        accessions.Add(StringUtils.RemoveWhitespace(rawAccession));
        accession = null;
        return;
    }
    if (qName.Equals("location")) {
        if (inFeature) {
            inFeatureLocation = false;
            entry.AddFeatureLocation(featureBegin, featureEnd);
        }
        return;
    }
    if (qName.Equals("variation")) {
        if (inFeature) {
            string rawVariation = variation.ToString();
            entry.AddFeatureVariation(StringUtils.RemoveWhitespace(rawVariation));
            variation = null;
        }
        return;
    }
    if (qName.Equals("original")) {
        if (inFeature) {
            string rawOriginal = original.ToString();
            entry.AddFeatureOriginal(StringUtils.RemoveWhitespace(rawOriginal));
            original = null;
        }
        return;
    }
    if (qName.Equals("feature")) {
        inFeature = false;
        // Add the entry's count for every feature type to the running totals.
        foreach (FeatureType featureType in entry.GetAllFeatureTypes()) {
            int countForType = entry.GetFeatureCount(featureType);
            if (!featureCounts.ContainsKey(featureType)) {
                featureCounts.Add(featureType, 0);
            }
            featureCounts[featureType] += countForType;
        }
        return;
    }
    if (qName.Equals("fullName") && inProteinRecommendedName) {
        string fullName = proteinFullName.ToString().Trim();
        proteinFullNames.Add(fullName);
        proteinFullName = null;
        return;
    }
    if (qName.Equals("shortName") && inProteinRecommendedName) {
        string shortName = proteinShortName.ToString().Trim();
        proteinShortNames.Add(shortName);
        proteinShortName = null;
        return;
    }
    if (qName.Equals("ecNumber") && inProteinRecommendedName) {
        string ecNumber = proteinEcNumber.ToString().Trim();
        proteinEcNumbers.Add(ecNumber);
        proteinEcNumber = null;
        return;
    }
    if (qName.Equals("name") && inGene) {
        string geneName = gname.ToString().Trim();
        string geneNameType = gnameType.Trim();
        gnames.Add(new Tuple<string, string>(geneName, geneNameType));
        gname = null;
        gnameType = null;
        return;
    }
    if (qName.Equals("name") && inOrganism) {
        // Only a non-empty organism name is recorded; oname is reset only in that case.
        if (oname != null) {
            string organismName = oname.ToString().Trim();
            if (organismName.Length > 0) {
                onames.Add(organismName);
                oname = null;
            }
        }
        return;
    }
    if (qName.Equals("name") && level == 1) {
        string uniprotName = uname.ToString().Trim();
        unames.Add(uniprotName);
        uname = null;
        return;
    }
    if (qName.Equals("protein")) {
        inProtein = false;
        return;
    }
    if (qName.Equals("recommendedName") && inProtein) {
        inProteinRecommendedName = false;
        return;
    }
    if (qName.Equals("gene")) {
        inGene = false;
        return;
    }
    if (qName.Equals("organism")) {
        inOrganism = false;
        return;
    }
    if (qName.Equals("organismHost")) {
        inOrganismHost = false;
    }
}
/// <summary>
/// Creates the parser, records whether isoforms should be resolved, and immediately parses
/// the Swiss-Prot file and, when requested, the TrEMBL file, reporting finished entries
/// through <paramref name="handle"/>.
/// </summary>
/// <param name="swissprotFileName">Swiss-Prot file path; when null the current field value is kept.</param>
/// <param name="tremblFileName">TrEMBL file path; when null the current field value is kept.</param>
/// <param name="includeTrembl">Whether the TrEMBL file is parsed after the Swiss-Prot file.</param>
/// <param name="handle">Callback forwarded to Parse.</param>
/// <param name="resolveIsos">Value stored in resolveIsoforms before parsing starts.</param>
public UniprotParser(string swissprotFileName, string tremblFileName, bool includeTrembl, HandleUniprotEntry handle, bool resolveIsos) {
    resolveIsoforms = resolveIsos;

    // A null argument leaves the corresponding field untouched.
    this.swissprotFileName = swissprotFileName ?? this.swissprotFileName;
    this.tremblFileName = tremblFileName ?? this.tremblFileName;

    Parse(this.swissprotFileName, handle, false);
    if (!includeTrembl) {
        return;
    }
    Parse(this.tremblFileName, handle, true);
}
/// <summary>
/// Handles a closing XML element. Depending on the element name (and the current parser
/// state flags) it stores the text collected for that element, clears state flags, or, for
/// a closing "entry", finalizes the current entry and passes it to <paramref name="handle"/>.
/// </summary>
/// <param name="qName">Name of the element that just closed.</param>
/// <param name="handle">Callback invoked with the finished entry, or with each entry returned by ResolveIsoforms.</param>
/// <param name="isTrembl">Value assigned to IsTrembl of the finished entry.</param>
private void EndElement(IEquatable<string> qName, HandleUniprotEntry handle, bool isTrembl) {
    if (qName.Equals("sequence")) {
        entry.Sequence = StringUtils.RemoveWhitespace(sequence.ToString());
        sequence = null;
    } else if (qName.Equals("keyword")) {
        entry.AddKeyword(StringUtils.RemoveWhitespace(keyword.ToString()));
        keyword = null;
    } else if (qName.Equals("molecule") && "Ensembl".Equals(dbReferenceType)) {
        // The literal is the receiver so a null dbReferenceType simply fails the test
        // instead of throwing a NullReferenceException.
        string mol = molecule.ToString().Trim();
        entry.AddDbEntryProperty(dbReferenceType, dbReferenceId, "isoform ID", mol);
        // Single lookup: fetch the id list for this isoform, creating it on first use.
        List<string> referenceIds;
        if (!isoformToEnst.TryGetValue(mol, out referenceIds)) {
            referenceIds = new List<string>();
            isoformToEnst.Add(mol, referenceIds);
        }
        referenceIds.Add(dbReferenceId);
        molecule = null;
    } else if (qName.Equals("entry")) {
        entry.Accessions = accessions.ToArray();
        entry.ProteinFullNames = proteinFullNames.ToArray();
        entry.ProteinShortNames = proteinShortNames.ToArray();
        entry.ProteinEcNumbers = proteinEcNumbers.ToArray();
        entry.GeneNamesAndTypes = gnames.ToArray();
        entry.OrganismNames = onames.ToArray();
        entry.UniprotNames = unames.ToArray();
        entry.IsTrembl = isTrembl;
        // Isoform entries are reported only when resolution is enabled and more than one
        // isoform and more than one isoform mapping were seen; otherwise the entry itself.
        if (resolveIsoforms && numIsoforms > 1 && isoformToEnst.Count > 1) {
            foreach (UniprotEntry isoEntry in entry.ResolveIsoforms(isoformToEnst)) {
                handle(isoEntry);
            }
        } else {
            handle(entry);
        }
    } else if (qName.Equals("dbReference")) {
        inDbRef = false;
    } else if (qName.Equals("accession")) {
        accessions.Add(StringUtils.RemoveWhitespace(accession.ToString()));
        accession = null;
    } else if (qName.Equals("location")) {
        if (inFeature) {
            entry.AddFeatureLocation(featureBegin, featureEnd);
        }
    } else if (qName.Equals("variation")) {
        if (inFeature) {
            entry.AddFeatureVariation(StringUtils.RemoveWhitespace(variation.ToString()));
            variation = null;
        }
    } else if (qName.Equals("original")) {
        if (inFeature) {
            entry.AddFeatureOriginal(StringUtils.RemoveWhitespace(original.ToString()));
            original = null;
        }
    } else if (qName.Equals("feature")) {
        inFeature = false;
        // NOTE(review): this adds the entry's per-type counts on every closing feature,
        // not only the count of the feature that just ended - confirm this is intended.
        foreach (FeatureType type in entry.GetAllFeatureTypes()) {
            int c = entry.GetFeatureCount(type);
            if (!featureCounts.ContainsKey(type)) {
                featureCounts.Add(type, 0);
            }
            featureCounts[type] += c;
        }
    } else if (qName.Equals("fullName") && inProteinRecommendedName) {
        proteinFullNames.Add(proteinFullName.ToString().Trim());
        proteinFullName = null;
    } else if (qName.Equals("shortName") && inProteinRecommendedName) {
        proteinShortNames.Add(proteinShortName.ToString().Trim());
        proteinShortName = null;
    } else if (qName.Equals("ecNumber") && inProteinRecommendedName) {
        proteinEcNumbers.Add(proteinEcNumber.ToString().Trim());
        proteinEcNumber = null;
    } else if (qName.Equals("name") && inGene) {
        gnames.Add(new Tuple<string, string>(gname.ToString().Trim(), gnameType.Trim()));
        gname = null;
        gnameType = null;
    } else if (qName.Equals("name") && inOrganism) {
        // Only non-empty organism names are recorded; oname is reset only in that case.
        string organismName = oname?.ToString().Trim();
        if (!string.IsNullOrEmpty(organismName)) {
            onames.Add(organismName);
            oname = null;
        }
    } else if (qName.Equals("name") && level == 1) {
        unames.Add(uname.ToString().Trim());
        uname = null;
    } else if (qName.Equals("protein")) {
        inProtein = false;
    } else if (qName.Equals("recommendedName") && inProtein) {
        inProteinRecommendedName = false;
    } else if (qName.Equals("gene")) {
        inGene = false;
    } else if (qName.Equals("organism")) {
        inOrganism = false;
    } else if (qName.Equals("organismHost")) {
        inOrganismHost = false;
    }
}
/// <summary>
/// Reacts to a closing XML element: stores the text collected for it, clears parser state
/// flags, or, for a closing "entry", completes the current entry, passes it to
/// <paramref name="handle"/> and increments entryCount.
/// </summary>
/// <param name="qName">Name of the element that just closed.</param>
/// <param name="handle">Callback invoked with the finished entry.</param>
/// <param name="isTrembl">Value assigned to IsTrembl of the finished entry.</param>
private void EndElement(IEquatable<string> qName, HandleUniprotEntry handle, bool isTrembl) {
    // Each case returns as soon as it has been handled; the order of the checks matters
    // for the three "name" cases, which are told apart by parser state.
    if (qName.Equals("sequence")) {
        string rawSequence = sequence.ToString();
        entry.Sequence = StringUtils.RemoveWhitespace(rawSequence);
        sequence = null;
        return;
    }
    if (qName.Equals("keyword")) {
        string rawKeyword = keyword.ToString();
        entry.AddKeyword(StringUtils.RemoveWhitespace(rawKeyword));
        keyword = null;
        return;
    }
    if (qName.Equals("entry")) {
        entry.Accessions = accessions.ToArray();
        entry.ProteinFullNames = proteinFullNames.ToArray();
        entry.ProteinShortNames = proteinShortNames.ToArray();
        entry.ProteinEcNumbers = proteinEcNumbers.ToArray();
        entry.GeneNamesAndTypes = gnames.ToArray();
        entry.OrganismNames = onames.ToArray();
        entry.UniprotNames = unames.ToArray();
        entry.IsTrembl = isTrembl;
        handle(entry);
        entryCount++;
        return;
    }
    if (qName.Equals("dbReference")) {
        inDbRef = false;
        return;
    }
    if (qName.Equals("accession")) {
        string rawAccession = accession.ToString();
        accessions.Add(StringUtils.RemoveWhitespace(rawAccession));
        accession = null;
        return;
    }
    if (qName.Equals("location")) {
        if (inFeature) {
            inFeatureLocation = false;
            entry.AddFeatureLocation(featureBegin, featureEnd);
        }
        return;
    }
    if (qName.Equals("variation")) {
        if (inFeature) {
            string rawVariation = variation.ToString();
            entry.AddFeatureVariation(StringUtils.RemoveWhitespace(rawVariation));
            variation = null;
        }
        return;
    }
    if (qName.Equals("original")) {
        if (inFeature) {
            string rawOriginal = original.ToString();
            entry.AddFeatureOriginal(StringUtils.RemoveWhitespace(rawOriginal));
            original = null;
        }
        return;
    }
    if (qName.Equals("feature")) {
        inFeature = false;
        // Add the entry's count for every feature type to the running totals.
        foreach (FeatureType featureType in entry.GetAllFeatureTypes()) {
            int countForType = entry.GetFeatureCount(featureType);
            if (!featureCounts.ContainsKey(featureType)) {
                featureCounts.Add(featureType, 0);
            }
            featureCounts[featureType] += countForType;
        }
        return;
    }
    if (qName.Equals("fullName") && inProteinRecommendedName) {
        string fullName = proteinFullName.ToString().Trim();
        proteinFullNames.Add(fullName);
        proteinFullName = null;
        return;
    }
    if (qName.Equals("shortName") && inProteinRecommendedName) {
        string shortName = proteinShortName.ToString().Trim();
        proteinShortNames.Add(shortName);
        proteinShortName = null;
        return;
    }
    if (qName.Equals("ecNumber") && inProteinRecommendedName) {
        string ecNumber = proteinEcNumber.ToString().Trim();
        proteinEcNumbers.Add(ecNumber);
        proteinEcNumber = null;
        return;
    }
    if (qName.Equals("name") && inGene) {
        string geneName = gname.ToString().Trim();
        string geneNameType = gnameType.Trim();
        gnames.Add(new Tuple<string, string>(geneName, geneNameType));
        gname = null;
        gnameType = null;
        return;
    }
    if (qName.Equals("name") && inOrganism) {
        // Only a non-empty organism name is recorded; oname is reset only in that case.
        if (oname != null) {
            string organismName = oname.ToString().Trim();
            if (organismName.Length > 0) {
                onames.Add(organismName);
                oname = null;
            }
        }
        return;
    }
    if (qName.Equals("name") && level == 1) {
        string uniprotName = uname.ToString().Trim();
        unames.Add(uniprotName);
        uname = null;
        return;
    }
    if (qName.Equals("protein")) {
        inProtein = false;
        return;
    }
    if (qName.Equals("recommendedName") && inProtein) {
        inProteinRecommendedName = false;
        return;
    }
    if (qName.Equals("gene")) {
        inGene = false;
        return;
    }
    if (qName.Equals("organism")) {
        inOrganism = false;
        return;
    }
    if (qName.Equals("organismHost")) {
        inOrganismHost = false;
    }
}