Ejemplo n.º 1
0
 /// <summary>
 /// Creates the parser and immediately parses the Swiss-Prot file and, when requested,
 /// the TrEMBL file, passing <paramref name="handle"/> on to <c>Parse</c>.
 /// </summary>
 /// <param name="swissprotFileName">Path of the Swiss-Prot file; when null the member keeps its current value.</param>
 /// <param name="tremblFileName">Path of the TrEMBL file; when null the member keeps its current value.</param>
 /// <param name="includeTrembl">When true, the TrEMBL file is parsed after the Swiss-Prot file.</param>
 /// <param name="handle">Callback forwarded to <c>Parse</c> for both files.</param>
 public UniprotParser(string swissprotFileName, string tremblFileName, bool includeTrembl, HandleUniprotEntry handle)
 {
     // A null argument must not overwrite whatever the member already holds.
     this.swissprotFileName = swissprotFileName ?? this.swissprotFileName;
     this.tremblFileName = tremblFileName ?? this.tremblFileName;

     // Swiss-Prot is always parsed, with the last argument of Parse set to false.
     Parse(this.swissprotFileName, handle, false);
     if (!includeTrembl){
         return;
     }
     // TrEMBL is parsed second, with the last argument of Parse set to true.
     Parse(this.tremblFileName, handle, true);
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Reads the XML file at <paramref name="filename"/> node by node and forwards element
        /// starts, element ends and text nodes to <c>StartElement</c>, <c>EndElement</c> and
        /// <c>Characters</c>. A file whose name ends in ".gz" is decompressed on the fly.
        /// </summary>
        /// <param name="filename">Path of the (optionally gzip-compressed) XML file.</param>
        /// <param name="handle">Callback passed through to <c>EndElement</c>.</param>
        /// <param name="isTrembl">Flag passed through to <c>EndElement</c>.</param>
        private void Parse(string filename, HandleUniprotEntry handle, bool isTrembl)
        {
            // Every layer is wrapped in a using so the file handle is released even when the
            // XML is malformed or a callback throws. When the file is not compressed, stream
            // and fileStream are the same object; disposing a stream twice is harmless.
            // The suffix test is ordinal and case-insensitive instead of going through the
            // culture-sensitive ToLower().
            using (Stream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (Stream stream = filename.EndsWith(".gz", System.StringComparison.OrdinalIgnoreCase)
                                   ? new GZipStream(fileStream, CompressionMode.Decompress) : fileStream)
            using (StreamReader textReader = new StreamReader(stream))
            using (XmlTextReader reader = new XmlTextReader(textReader))
            {
                while (reader.Read())
                {
                    switch (reader.NodeType)
                    {
                    case XmlNodeType.Element:
                        // Capture the element name first: MoveToAttribute repositions the reader,
                        // after which reader.Name is the attribute name.
                        string name = reader.Name;
                        Dictionary <string, string> attributes = new Dictionary <string, string>();
                        if (reader.HasAttributes)
                        {
                            for (int i = 0; i < reader.AttributeCount; i++)
                            {
                                reader.MoveToAttribute(i);
                                attributes.Add(reader.Name, reader.Value);
                            }
                        }
                        StartElement(name, attributes);
                        // NOTE(review): XmlReader reports a self-closing element (<x/>) as an Element
                        // node without a matching EndElement node, so for such elements level is
                        // incremented here but never decremented and EndElement is not called.
                        // Confirm that StartElement / the level bookkeeping accounts for this.
                        level++;
                        break;

                    case XmlNodeType.EndElement:
                        // level is decremented before the callback, so EndElement sees the depth
                        // of the element's parent.
                        level--;
                        EndElement(reader.Name, handle, isTrembl);
                        break;

                    case XmlNodeType.Text:
                        Characters(reader.Value, 0, reader.Value.Length);
                        break;
                    }
                }
            }
        }
Ejemplo n.º 3
0
 /// <summary>
 /// Walks the XML file at <paramref name="filename"/> and dispatches each element start,
 /// element end and text node to <c>StartElement</c>, <c>EndElement</c> and <c>Characters</c>.
 /// Files whose name ends in ".gz" are read through a decompressing stream.
 /// </summary>
 /// <param name="filename">Path of the (optionally gzip-compressed) XML file.</param>
 /// <param name="handle">Callback passed through to <c>EndElement</c>.</param>
 /// <param name="isTrembl">Flag passed through to <c>EndElement</c>.</param>
 private void Parse(string filename, HandleUniprotEntry handle, bool isTrembl)
 {
     // All readers/streams are disposed deterministically so the file handle does not stay
     // open after parsing finishes or fails. For an uncompressed file stream and fileStream
     // are the same instance; the second Dispose is a no-op.
     // The ".gz" check uses an ordinal, case-insensitive comparison rather than ToLower().
     using (Stream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
     using (Stream stream = filename.EndsWith(".gz", System.StringComparison.OrdinalIgnoreCase)
         ? new GZipStream(fileStream, CompressionMode.Decompress) : fileStream)
     using (StreamReader textReader = new StreamReader(stream))
     using (XmlTextReader reader = new XmlTextReader(textReader)){
         while (reader.Read()){
             switch (reader.NodeType){
                 case XmlNodeType.Element:
                     // Read the element name before iterating attributes; MoveToAttribute
                     // changes what reader.Name refers to.
                     string name = reader.Name;
                     Dictionary<string, string> attributes = new Dictionary<string, string>();
                     if (reader.HasAttributes){
                         for (int i = 0; i < reader.AttributeCount; i++){
                             reader.MoveToAttribute(i);
                             attributes.Add(reader.Name, reader.Value);
                         }
                     }
                     StartElement(name, attributes);
                     // NOTE(review): a self-closing element (<x/>) produces an Element node but
                     // no EndElement node, so level is incremented here without a later
                     // decrement and EndElement is never called for it. Confirm this is
                     // handled by StartElement / the level bookkeeping.
                     level++;
                     break;
                 case XmlNodeType.EndElement:
                     // Decrement first so EndElement observes the parent's depth.
                     level--;
                     EndElement(reader.Name, handle, isTrembl);
                     break;
                 case XmlNodeType.Text:
                     Characters(reader.Value, 0, reader.Value.Length);
                     break;
             }
         }
     }
 }
Ejemplo n.º 4
0
 /// <summary>
 /// Handles the closing tag of an XML element: moves the text or state collected for that
 /// element into the current entry and, when an <c>entry</c> element closes, passes the
 /// finished entry (or its resolved isoform entries) to <paramref name="handle"/>.
 /// </summary>
 /// <param name="qName">Name of the element that was closed.</param>
 /// <param name="handle">Callback invoked for every finished entry.</param>
 /// <param name="isTrembl">Value stored in <c>IsTrembl</c> of the finished entry.</param>
 private void EndElement(IEquatable<string> qName, HandleUniprotEntry handle, bool isTrembl)
 {
     if (qName.Equals("sequence")){
         entry.Sequence = StringUtils.RemoveWhitespace(sequence.ToString());
         sequence = null;
     } else if (qName.Equals("keyword")){
         entry.AddKeyword(StringUtils.RemoveWhitespace(keyword.ToString()));
         keyword = null;
     } else if (qName.Equals("molecule") && dbReferenceType != null && dbReferenceType.Equals("Ensembl")){
         // The null test prevents a NullReferenceException when a molecule element closes
         // while no database reference type has been recorded; such an element is ignored.
         string mol = molecule.ToString().Trim();
         entry.AddDbEntryProperty(dbReferenceType, dbReferenceId, "isoform ID", mol);
         if (!isoformToEnst.ContainsKey(mol)){
             isoformToEnst.Add(mol, new List<string>());
         }
         isoformToEnst[mol].Add(dbReferenceId);
         molecule = null;
     } else if (qName.Equals("entry")){
         // Freeze the per-entry accumulators into arrays on the entry.
         entry.Accessions = accessions.ToArray();
         entry.ProteinFullNames = proteinFullNames.ToArray();
         entry.ProteinShortNames = proteinShortNames.ToArray();
         entry.ProteinEcNumbers = proteinEcNumbers.ToArray();
         entry.GeneNamesAndTypes = gnames.ToArray();
         entry.OrganismNames = onames.ToArray();
         entry.UniprotNames = unames.ToArray();
         entry.IsTrembl = isTrembl;
         // The entry is split only when resolving is switched on, numIsoforms exceeds one and
         // isoformToEnst holds more than one key; in every other case it is handed over as is.
         if (resolveIsoforms && numIsoforms > 1 && isoformToEnst.Count > 1){
             foreach (UniprotEntry isoEntry in entry.ResolveIsoforms(isoformToEnst)){
                 handle(isoEntry);
             }
         } else {
             handle(entry);
         }
     } else if (qName.Equals("dbReference")){
         inDbRef = false;
     } else if (qName.Equals("accession")){
         accessions.Add(StringUtils.RemoveWhitespace(accession.ToString()));
         accession = null;
     } else if (qName.Equals("location")){
         if (inFeature){
             entry.AddFeatureLocation(featureBegin, featureEnd);
         }
     } else if (qName.Equals("variation")){
         if (inFeature){
             entry.AddFeatureVariation(StringUtils.RemoveWhitespace(variation.ToString()));
             variation = null;
         }
     } else if (qName.Equals("original")){
         if (inFeature){
             entry.AddFeatureOriginal(StringUtils.RemoveWhitespace(original.ToString()));
             original = null;
         }
     } else if (qName.Equals("feature")){
         inFeature = false;
         // NOTE(review): this runs on every closing feature and adds the entry's count for
         // each of its feature types again; confirm the totals are not meant to be added
         // only once per entry.
         foreach (FeatureType type in entry.GetAllFeatureTypes()){
             int c = entry.GetFeatureCount(type);
             if (!featureCounts.ContainsKey(type)){
                 featureCounts.Add(type, 0);
             }
             featureCounts[type] += c;
         }
     } else if (qName.Equals("fullName") && inProteinRecommendedName){
         proteinFullNames.Add(proteinFullName.ToString().Trim());
         proteinFullName = null;
     } else if (qName.Equals("shortName") && inProteinRecommendedName){
         proteinShortNames.Add(proteinShortName.ToString().Trim());
         proteinShortName = null;
     } else if (qName.Equals("ecNumber") && inProteinRecommendedName){
         proteinEcNumbers.Add(proteinEcNumber.ToString().Trim());
         proteinEcNumber = null;
     } else if (qName.Equals("name") && inGene){
         // The three "name" branches are order-dependent: gene first, then organism, then
         // the name seen at level 1.
         gnames.Add(new Tuple<string, string>(gname.ToString().Trim(), gnameType.Trim()));
         gname = null;
         gnameType = null;
     } else if (qName.Equals("name") && inOrganism){
         // Only a non-empty trimmed name is recorded; oname is cleared only in that case.
         string organismName = oname?.ToString().Trim();
         if (!string.IsNullOrEmpty(organismName)){
             onames.Add(organismName);
             oname = null;
         }
     } else if (qName.Equals("name") && level == 1){
         unames.Add(uname.ToString().Trim());
         uname = null;
     } else if (qName.Equals("protein")){
         inProtein = false;
     } else if (qName.Equals("recommendedName") && inProtein){
         inProteinRecommendedName = false;
     } else if (qName.Equals("gene")){
         inGene = false;
     } else if (qName.Equals("organism")){
         inOrganism = false;
     } else if (qName.Equals("organismHost")){
         inOrganismHost = false;
     }
 }
Ejemplo n.º 5
0
 /// <summary>
 /// Processes the closing tag of an XML element. Depending on the element name and the
 /// current parser flags, the collected text is stored on the current entry or in one of
 /// the per-entry lists, or a state flag is cleared. When an <c>entry</c> element closes,
 /// the entry is completed, passed to <paramref name="handle"/> and counted.
 /// </summary>
 /// <param name="qName">Name of the element that was closed.</param>
 /// <param name="handle">Callback invoked with each finished entry.</param>
 /// <param name="isTrembl">Value stored in <c>IsTrembl</c> of the finished entry.</param>
 private void EndElement(IEquatable<string> qName, HandleUniprotEntry handle, bool isTrembl)
 {
     // Each case returns once handled, so at most one case runs per call and the order of
     // the checks below is significant (notably for the three "name" cases).
     if (qName.Equals("sequence")){
         entry.Sequence = StringUtils.RemoveWhitespace(sequence.ToString());
         sequence = null;
         return;
     }
     if (qName.Equals("keyword")){
         entry.AddKeyword(StringUtils.RemoveWhitespace(keyword.ToString()));
         keyword = null;
         return;
     }
     if (qName.Equals("entry")){
         // Copy the accumulated lists onto the entry, then hand it over and count it.
         entry.Accessions = accessions.ToArray();
         entry.ProteinFullNames = proteinFullNames.ToArray();
         entry.ProteinShortNames = proteinShortNames.ToArray();
         entry.ProteinEcNumbers = proteinEcNumbers.ToArray();
         entry.GeneNamesAndTypes = gnames.ToArray();
         entry.OrganismNames = onames.ToArray();
         entry.UniprotNames = unames.ToArray();
         entry.IsTrembl = isTrembl;
         handle(entry);
         entryCount++;
         return;
     }
     if (qName.Equals("dbReference")){
         inDbRef = false;
         return;
     }
     if (qName.Equals("accession")){
         accessions.Add(StringUtils.RemoveWhitespace(accession.ToString()));
         accession = null;
         return;
     }
     if (qName.Equals("location")){
         if (inFeature){
             inFeatureLocation = false;
             entry.AddFeatureLocation(featureBegin, featureEnd);
         }
         return;
     }
     if (qName.Equals("variation")){
         if (inFeature){
             entry.AddFeatureVariation(StringUtils.RemoveWhitespace(variation.ToString()));
             variation = null;
         }
         return;
     }
     if (qName.Equals("original")){
         if (inFeature){
             entry.AddFeatureOriginal(StringUtils.RemoveWhitespace(original.ToString()));
             original = null;
         }
         return;
     }
     if (qName.Equals("feature")){
         inFeature = false;
         // Add the entry's count for each of its feature types to the running totals.
         foreach (FeatureType countedType in entry.GetAllFeatureTypes()){
             int typeCount = entry.GetFeatureCount(countedType);
             if (!featureCounts.ContainsKey(countedType)){
                 featureCounts.Add(countedType, 0);
             }
             featureCounts[countedType] += typeCount;
         }
         return;
     }
     if (qName.Equals("fullName") && inProteinRecommendedName){
         proteinFullNames.Add(proteinFullName.ToString().Trim());
         proteinFullName = null;
         return;
     }
     if (qName.Equals("shortName") && inProteinRecommendedName){
         proteinShortNames.Add(proteinShortName.ToString().Trim());
         proteinShortName = null;
         return;
     }
     if (qName.Equals("ecNumber") && inProteinRecommendedName){
         proteinEcNumbers.Add(proteinEcNumber.ToString().Trim());
         proteinEcNumber = null;
         return;
     }
     if (qName.Equals("name") && inGene){
         gnames.Add(new Tuple<string, string>(gname.ToString().Trim(), gnameType.Trim()));
         gname = null;
         gnameType = null;
         return;
     }
     if (qName.Equals("name") && inOrganism){
         // Nothing is recorded when no organism name text was collected or it trims to empty;
         // oname is only cleared after a name has been added.
         if (oname == null){
             return;
         }
         string trimmedName = oname.ToString().Trim();
         if (trimmedName.Length > 0){
             onames.Add(trimmedName);
             oname = null;
         }
         return;
     }
     if (qName.Equals("name") && level == 1){
         unames.Add(uname.ToString().Trim());
         uname = null;
         return;
     }
     if (qName.Equals("protein")){
         inProtein = false;
         return;
     }
     if (qName.Equals("recommendedName") && inProtein){
         inProteinRecommendedName = false;
         return;
     }
     if (qName.Equals("gene")){
         inGene = false;
         return;
     }
     if (qName.Equals("organism")){
         inOrganism = false;
         return;
     }
     if (qName.Equals("organismHost")){
         inOrganismHost = false;
     }
 }
Ejemplo n.º 6
0
 /// <summary>
 /// Creates the parser, records whether isoforms should be resolved, and immediately parses
 /// the Swiss-Prot file and, when requested, the TrEMBL file.
 /// </summary>
 /// <param name="swissprotFileName">Path of the Swiss-Prot file; when null the member keeps its current value.</param>
 /// <param name="tremblFileName">Path of the TrEMBL file; when null the member keeps its current value.</param>
 /// <param name="includeTrembl">When true, the TrEMBL file is parsed after the Swiss-Prot file.</param>
 /// <param name="handle">Callback forwarded to <c>Parse</c> for both files.</param>
 /// <param name="resolveIsos">Value stored in <c>resolveIsoforms</c> before parsing starts.</param>
 public UniprotParser(string swissprotFileName, string tremblFileName, bool includeTrembl, HandleUniprotEntry handle, bool resolveIsos)
 {
     // Must be set before Parse runs, because parsing happens inside this constructor.
     resolveIsoforms = resolveIsos;

     // A null argument must not overwrite whatever the member already holds.
     this.swissprotFileName = swissprotFileName ?? this.swissprotFileName;
     this.tremblFileName = tremblFileName ?? this.tremblFileName;

     Parse(this.swissprotFileName, handle, false);
     if (!includeTrembl)
     {
         return;
     }
     Parse(this.tremblFileName, handle, true);
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Handles the closing tag of an XML element: moves the text or state collected for that
 /// element into the current entry and, when an <c>entry</c> element closes, passes the
 /// finished entry (or its resolved isoform entries) to <paramref name="handle"/>.
 /// </summary>
 /// <param name="qName">Name of the element that was closed.</param>
 /// <param name="handle">Callback invoked for every finished entry.</param>
 /// <param name="isTrembl">Value stored in <c>IsTrembl</c> of the finished entry.</param>
 private void EndElement(IEquatable <string> qName, HandleUniprotEntry handle, bool isTrembl)
 {
     if (qName.Equals("sequence"))
     {
         entry.Sequence = StringUtils.RemoveWhitespace(sequence.ToString());
         sequence       = null;
     }
     else if (qName.Equals("keyword"))
     {
         entry.AddKeyword(StringUtils.RemoveWhitespace(keyword.ToString()));
         keyword = null;
     }
     else if (qName.Equals("molecule") && dbReferenceType != null && dbReferenceType.Equals("Ensembl"))
     {
         // The null test prevents a NullReferenceException when a molecule element closes
         // while no database reference type has been recorded; such an element is ignored.
         string mol = molecule.ToString().Trim();
         entry.AddDbEntryProperty(dbReferenceType, dbReferenceId, "isoform ID", mol);
         if (!isoformToEnst.ContainsKey(mol))
         {
             isoformToEnst.Add(mol, new List <string>());
         }
         isoformToEnst[mol].Add(dbReferenceId);
         molecule = null;
     }
     else if (qName.Equals("entry"))
     {
         // Freeze the per-entry accumulators into arrays on the entry.
         entry.Accessions        = accessions.ToArray();
         entry.ProteinFullNames  = proteinFullNames.ToArray();
         entry.ProteinShortNames = proteinShortNames.ToArray();
         entry.ProteinEcNumbers  = proteinEcNumbers.ToArray();
         entry.GeneNamesAndTypes = gnames.ToArray();
         entry.OrganismNames     = onames.ToArray();
         entry.UniprotNames      = unames.ToArray();
         entry.IsTrembl          = isTrembl;
         // The entry is split only when resolving is switched on, numIsoforms exceeds one and
         // isoformToEnst holds more than one key; in every other case it is handed over as is.
         if (resolveIsoforms && numIsoforms > 1 && isoformToEnst.Count > 1)
         {
             foreach (UniprotEntry isoEntry in entry.ResolveIsoforms(isoformToEnst))
             {
                 handle(isoEntry);
             }
         }
         else
         {
             handle(entry);
         }
     }
     else if (qName.Equals("dbReference"))
     {
         inDbRef = false;
     }
     else if (qName.Equals("accession"))
     {
         accessions.Add(StringUtils.RemoveWhitespace(accession.ToString()));
         accession = null;
     }
     else if (qName.Equals("location"))
     {
         if (inFeature)
         {
             entry.AddFeatureLocation(featureBegin, featureEnd);
         }
     }
     else if (qName.Equals("variation"))
     {
         if (inFeature)
         {
             entry.AddFeatureVariation(StringUtils.RemoveWhitespace(variation.ToString()));
             variation = null;
         }
     }
     else if (qName.Equals("original"))
     {
         if (inFeature)
         {
             entry.AddFeatureOriginal(StringUtils.RemoveWhitespace(original.ToString()));
             original = null;
         }
     }
     else if (qName.Equals("feature"))
     {
         inFeature = false;
         // NOTE(review): this runs on every closing feature and adds the entry's count for
         // each of its feature types again; confirm the totals are not meant to be added
         // only once per entry.
         foreach (FeatureType type in entry.GetAllFeatureTypes())
         {
             int c = entry.GetFeatureCount(type);
             if (!featureCounts.ContainsKey(type))
             {
                 featureCounts.Add(type, 0);
             }
             featureCounts[type] += c;
         }
     }
     else if (qName.Equals("fullName") && inProteinRecommendedName)
     {
         proteinFullNames.Add(proteinFullName.ToString().Trim());
         proteinFullName = null;
     }
     else if (qName.Equals("shortName") && inProteinRecommendedName)
     {
         proteinShortNames.Add(proteinShortName.ToString().Trim());
         proteinShortName = null;
     }
     else if (qName.Equals("ecNumber") && inProteinRecommendedName)
     {
         proteinEcNumbers.Add(proteinEcNumber.ToString().Trim());
         proteinEcNumber = null;
     }
     else if (qName.Equals("name") && inGene)
     {
         // The three "name" branches are order-dependent: gene first, then organism, then
         // the name seen at level 1.
         gnames.Add(new Tuple <string, string>(gname.ToString().Trim(), gnameType.Trim()));
         gname     = null;
         gnameType = null;
     }
     else if (qName.Equals("name") && inOrganism)
     {
         // Only a non-empty trimmed name is recorded; oname is cleared only in that case.
         string organismName = oname?.ToString().Trim();
         if (!string.IsNullOrEmpty(organismName))
         {
             onames.Add(organismName);
             oname = null;
         }
     }
     else if (qName.Equals("name") && level == 1)
     {
         unames.Add(uname.ToString().Trim());
         uname = null;
     }
     else if (qName.Equals("protein"))
     {
         inProtein = false;
     }
     else if (qName.Equals("recommendedName") && inProtein)
     {
         inProteinRecommendedName = false;
     }
     else if (qName.Equals("gene"))
     {
         inGene = false;
     }
     else if (qName.Equals("organism"))
     {
         inOrganism = false;
     }
     else if (qName.Equals("organismHost"))
     {
         inOrganismHost = false;
     }
 }
Ejemplo n.º 8
0
 /// <summary>
 /// Processes the closing tag of an XML element. Depending on the element name and the
 /// current parser flags, the collected text is stored on the current entry or in one of
 /// the per-entry lists, or a state flag is cleared. When an <c>entry</c> element closes,
 /// the entry is completed, passed to <paramref name="handle"/> and counted.
 /// </summary>
 /// <param name="qName">Name of the element that was closed.</param>
 /// <param name="handle">Callback invoked with each finished entry.</param>
 /// <param name="isTrembl">Value stored in <c>IsTrembl</c> of the finished entry.</param>
 private void EndElement(IEquatable <string> qName, HandleUniprotEntry handle, bool isTrembl)
 {
     // Each case returns once handled, so at most one case runs per call and the order of
     // the checks below is significant (notably for the three "name" cases).
     if (qName.Equals("sequence"))
     {
         entry.Sequence = StringUtils.RemoveWhitespace(sequence.ToString());
         sequence = null;
         return;
     }

     if (qName.Equals("keyword"))
     {
         entry.AddKeyword(StringUtils.RemoveWhitespace(keyword.ToString()));
         keyword = null;
         return;
     }

     if (qName.Equals("entry"))
     {
         // Copy the accumulated lists onto the entry, then hand it over and count it.
         entry.Accessions = accessions.ToArray();
         entry.ProteinFullNames = proteinFullNames.ToArray();
         entry.ProteinShortNames = proteinShortNames.ToArray();
         entry.ProteinEcNumbers = proteinEcNumbers.ToArray();
         entry.GeneNamesAndTypes = gnames.ToArray();
         entry.OrganismNames = onames.ToArray();
         entry.UniprotNames = unames.ToArray();
         entry.IsTrembl = isTrembl;
         handle(entry);
         entryCount++;
         return;
     }

     if (qName.Equals("dbReference"))
     {
         inDbRef = false;
         return;
     }

     if (qName.Equals("accession"))
     {
         accessions.Add(StringUtils.RemoveWhitespace(accession.ToString()));
         accession = null;
         return;
     }

     if (qName.Equals("location"))
     {
         if (inFeature)
         {
             inFeatureLocation = false;
             entry.AddFeatureLocation(featureBegin, featureEnd);
         }
         return;
     }

     if (qName.Equals("variation"))
     {
         if (inFeature)
         {
             entry.AddFeatureVariation(StringUtils.RemoveWhitespace(variation.ToString()));
             variation = null;
         }
         return;
     }

     if (qName.Equals("original"))
     {
         if (inFeature)
         {
             entry.AddFeatureOriginal(StringUtils.RemoveWhitespace(original.ToString()));
             original = null;
         }
         return;
     }

     if (qName.Equals("feature"))
     {
         inFeature = false;
         // Add the entry's count for each of its feature types to the running totals.
         foreach (FeatureType countedType in entry.GetAllFeatureTypes())
         {
             int typeCount = entry.GetFeatureCount(countedType);
             if (!featureCounts.ContainsKey(countedType))
             {
                 featureCounts.Add(countedType, 0);
             }
             featureCounts[countedType] += typeCount;
         }
         return;
     }

     if (qName.Equals("fullName") && inProteinRecommendedName)
     {
         proteinFullNames.Add(proteinFullName.ToString().Trim());
         proteinFullName = null;
         return;
     }

     if (qName.Equals("shortName") && inProteinRecommendedName)
     {
         proteinShortNames.Add(proteinShortName.ToString().Trim());
         proteinShortName = null;
         return;
     }

     if (qName.Equals("ecNumber") && inProteinRecommendedName)
     {
         proteinEcNumbers.Add(proteinEcNumber.ToString().Trim());
         proteinEcNumber = null;
         return;
     }

     if (qName.Equals("name") && inGene)
     {
         gnames.Add(new Tuple <string, string>(gname.ToString().Trim(), gnameType.Trim()));
         gname = null;
         gnameType = null;
         return;
     }

     if (qName.Equals("name") && inOrganism)
     {
         // Nothing is recorded when no organism name text was collected or it trims to empty;
         // oname is only cleared after a name has been added.
         if (oname == null)
         {
             return;
         }
         string trimmedName = oname.ToString().Trim();
         if (trimmedName.Length > 0)
         {
             onames.Add(trimmedName);
             oname = null;
         }
         return;
     }

     if (qName.Equals("name") && level == 1)
     {
         unames.Add(uname.ToString().Trim());
         uname = null;
         return;
     }

     if (qName.Equals("protein"))
     {
         inProtein = false;
         return;
     }

     if (qName.Equals("recommendedName") && inProtein)
     {
         inProteinRecommendedName = false;
         return;
     }

     if (qName.Equals("gene"))
     {
         inGene = false;
         return;
     }

     if (qName.Equals("organism"))
     {
         inOrganism = false;
         return;
     }

     if (qName.Equals("organismHost"))
     {
         inOrganismHost = false;
     }
 }