Exemplo n.º 1
0
 /// <summary>
 /// Constructs a gene on the given chromosome and records its identifier and feature metadata.
 /// </summary>
 /// <param name="id">Identifier stored in <c>ID</c>.</param>
 /// <param name="chromosome">Chromosome stored in <c>Chromosome</c>; it and its <c>Sequence.ID</c> are passed to the base constructor, so it must not be null.</param>
 /// <param name="source">Source value forwarded unchanged to the base constructor.</param>
 /// <param name="strand">Strand value forwarded unchanged to the base constructor.</param>
 /// <param name="oneBasedStart">Start position forwarded to the base constructor (one-based, per the parameter name).</param>
 /// <param name="oneBasedEnd">End position forwarded to the base constructor (one-based, per the parameter name).</param>
 /// <param name="featureMetadata">Metadata item stored in <c>FeatureMetadata</c>.</param>
 public Gene(string id, Chromosome chromosome, string source, string strand, long oneBasedStart, long oneBasedEnd, MetadataListItem <List <string> > featureMetadata)
     // The base constructor's last argument is passed as null; the metadata is stored by this constructor instead.
     : base(chromosome, chromosome.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null)
 {
     ID              = id;
     Chromosome      = chromosome;
     FeatureMetadata = featureMetadata;
 }
Exemplo n.º 2
0
 /// <summary>
 /// Constructs a transcript belonging to the given gene, recording its identifier, protein identifier
 /// and feature metadata.
 /// </summary>
 /// <param name="id">Identifier stored in <c>ID</c>; also used as the protein ID when <paramref name="proteinID"/> is null.</param>
 /// <param name="gene">Parent gene stored in <c>Gene</c>; it and its <c>ChromosomeID</c> are passed to the base constructor, so it must not be null.</param>
 /// <param name="source">Source value forwarded unchanged to the base constructor.</param>
 /// <param name="strand">Strand value forwarded unchanged to the base constructor.</param>
 /// <param name="oneBasedStart">Start position forwarded to the base constructor (one-based, per the parameter name).</param>
 /// <param name="oneBasedEnd">End position forwarded to the base constructor (one-based, per the parameter name).</param>
 /// <param name="proteinID">Protein identifier stored in <c>ProteinID</c>; may be null.</param>
 /// <param name="featureMetadata">Metadata item stored in <c>FeatureMetadata</c>.</param>
 public Transcript(string id, Gene gene, string source, string strand, long oneBasedStart, long oneBasedEnd, string proteinID, MetadataListItem <List <string> > featureMetadata)
     : base(gene, gene.ChromosomeID, source, strand, oneBasedStart, oneBasedEnd)
 {
     ID              = id;
     // Fall back to the transcript ID when no protein ID was supplied.
     ProteinID       = proteinID ?? id;
     Gene            = gene;
     FeatureMetadata = featureMetadata;
 }
Exemplo n.º 3
0
        /// <summary>
        /// Returns the first value stored under <paramref name="subItemName"/> in the feature's sub-items,
        /// or "." when the sub-item is absent or holds no values.
        /// </summary>
        /// <param name="feature">Feature whose sub-items are searched.</param>
        /// <param name="subItemName">Key of the sub-item to look up.</param>
        /// <returns>The first stored value, or "." as the placeholder for a missing field.</returns>
        private string GetSubItemString(MetadataListItem <List <string> > feature, string subItemName)
        {
            List <string> values;

            // An empty (or null) list is treated like a missing sub-item instead of
            // throwing ArgumentOutOfRangeException / NullReferenceException on values[0].
            if (feature.SubItems.TryGetValue(subItemName, out values) && values != null && values.Count > 0)
            {
                return values[0];
            }

            return ".";
        }
Exemplo n.º 4
0
        /// <summary>
        /// Gets the first value stored under a particular key in a GFF metadata item.
        /// </summary>
        /// <param name="listItem">GFF metadata item whose sub-items are searched.</param>
        /// <param name="itemKey">Key (column header) of the sub-item to read.</param>
        /// <returns>
        /// The first value stored for the key, or <see cref="string.Empty"/> when the key is absent
        /// or its list holds no values.
        /// </returns>
        private static string GetGFFColumnValue(MetadataListItem <List <string> > listItem, string itemKey)
        {
            // No pre-allocation: TryGetValue always overwrites the out argument.
            List <string> values;

            if (listItem.SubItems.TryGetValue(itemKey, out values) && values != null && values.Count > 0)
            {
                return values[0];
            }

            return string.Empty;
        }
Exemplo n.º 5
0
        /// <summary>
        /// Builds a metadata item describing this feature for GTF output.
        /// The item is keyed by <c>FeatureType</c>, carries the text from <c>GetGtfAttributes()</c>,
        /// and gets "source", "start" and "end" sub-items; a "strand" sub-item is added only when
        /// the strand is not the "." placeholder.
        /// </summary>
        /// <returns>The newly created metadata item.</returns>
        public MetadataListItem <List <string> > GetGtfFeatureMetadata()
        {
            var gtfItem  = new MetadataListItem <List <string> >(FeatureType, GetGtfAttributes());
            var subItems = gtfItem.SubItems;

            subItems["source"] = new List <string> { Source.ToString() };
            subItems["start"]  = new List <string> { OneBasedStart.ToString() };
            subItems["end"]    = new List <string> { OneBasedEnd.ToString() };

            // Features without a strand are written without the sub-item
            // (might take in features without strand later on).
            bool strandKnown = Strand != ".";
            if (strandKnown)
            {
                subItems["strand"] = new List <string> { Strand.ToString() };
            }

            return gtfItem;
        }
Exemplo n.º 6
0
        /// <summary>
        /// Builds a metadata item for a coding-sequence feature. The free text is the exon's GTF
        /// attributes followed by a <c>protein_id</c> attribute taken from the CDS's parent transcript.
        /// "source", "start" and "end" sub-items come from the CDS; "strand" is added only when it is
        /// not the "." placeholder.
        /// </summary>
        /// <param name="cds">Coding sequence supplying the feature type, source, coordinates, strand and parent.</param>
        /// <param name="exon">Exon whose GTF attributes prefix the free text.</param>
        /// <returns>The newly created metadata item.</returns>
        private static MetadataListItem <List <string> > CDSFeatureMetadata(CDS cds, Exon exon)
        {
            var exonAttributes = exon.GetGtfAttributes();

            // NOTE(review): `as` yields null when cds.Parent is not a Transcript, in which case the
            // next line throws NullReferenceException — confirm callers always pass transcript-owned CDSs.
            Transcript parentTranscript = cds.Parent as Transcript;
            var        proteinId        = parentTranscript.ProteinID;

            var freeText = exonAttributes + " protein_id \"" + proteinId + "\";";
            var cdsItem  = new MetadataListItem <List <string> >(cds.FeatureType, freeText);
            var subItems = cdsItem.SubItems;

            subItems["source"] = new List <string> { cds.Source.ToString() };
            subItems["start"]  = new List <string> { cds.OneBasedStart.ToString() };
            subItems["end"]    = new List <string> { cds.OneBasedEnd.ToString() };

            // Features without a strand are written without the sub-item
            // (might take in features without strand later on).
            bool strandKnown = cds.Strand != ".";
            if (strandKnown)
            {
                subItems["strand"] = new List <string> { cds.Strand.ToString() };
            }

            return cdsItem;
        }
Exemplo n.º 7
0
 /// <summary>
 /// Reads gene model features into data structures contained within this library.
 /// Each parsed sequence is matched to a chromosome of <c>Genome</c> by friendly name; sequences
 /// with no matching chromosome, or with no feature list in their metadata, are skipped.
 /// Features whose free text contains '=' are processed as GFF3, all others as GTF.
 /// After all features are read, the last open transcript is finalized, intergenic regions are
 /// created and the genome forest is built.
 /// </summary>
 /// <param name="geneModelFile">Path of the gene model file, passed to <c>SimplerParse</c>.</param>
 public void ReadGeneFeatures(string geneModelFile)
 {
     foreach (ISequence chromFeatures in SimplerParse(geneModelFile))
     {
         Chromosome chrom = Genome.Chromosomes.FirstOrDefault(x => x.FriendlyName == chromFeatures.ID);
         if (chrom == null)
         {
             continue;
         }

         // A sequence without a "features" entry (or with one of an unexpected type) has nothing
         // to process; previously this fell through to a NullReferenceException.
         chromFeatures.Metadata.TryGetValue("features", out object f);
         List <MetadataListItem <List <string> > > features = f as List <MetadataListItem <List <string> > >;
         if (features == null)
         {
             continue;
         }

         foreach (MetadataListItem <List <string> > feature in features)
         {
             // A value that fails to parse leaves the coordinate at 0.
             long.TryParse(feature.SubItems["start"][0], out long start);
             long.TryParse(feature.SubItems["end"][0], out long end);
             var attributes = SplitAttributes(feature.FreeText);

             // GFF3 attributes use key=value pairs; GTF attributes do not contain '='.
             if (feature.FreeText.Contains('='))
             {
                 ProcessGff3Feature(feature, start, end, chrom, attributes);
             }
             else
             {
                 ProcessGtfFeature(feature, start, end, chrom, attributes);
             }
         }
     }

     // The per-feature processors only finalize a transcript when the next one starts,
     // so the last transcript read is finalized here.
     if (currentTranscript != null)
     {
         Transcript.SetRegions(currentTranscript);
         currentTranscript.FrameCorrection();
     }
     CreateIntergenicRegions();
     // possibly check transcript sanity here with Parallel.ForEach(Genes.SelectMany(g => g.Transcripts).ToList(), t => t.SanityCheck());
     GenomeForest.Build();
 }
Exemplo n.º 8
0
        /// <summary>
        /// Helper method to parse the features of GFF data held in a cell range.
        /// The row at <paramref name="rowIndex"/> is treated as the header row: its cells are matched
        /// (case-insensitively) against the GFF column names to locate each column. Every following
        /// row becomes one feature, and the resulting list is stored in the sequence metadata under
        /// the "features" key.
        /// </summary>
        /// <param name="sequence">Sequence object whose metadata receives the feature list.</param>
        /// <param name="cellRange">Range of cells; columns are scanned from index 1 upwards.</param>
        /// <param name="rowIndex">Index of the header row.</param>
        /// <returns>Index of the row following the last one consumed.</returns>
        /// <exception cref="FormatException">
        /// The range does not have 9 or 10 columns, or a required column header
        /// (name, source, type, start, end, score, strand, frame) is missing.
        /// </exception>
        /// <exception cref="InvalidDataException">A start, end, score, strand or frame cell holds an invalid value.</exception>
        private static int ParseGffFeatures(ISequence sequence, object[,] cellRange, int rowIndex)
        {
            Dictionary<string, object> metadata = sequence.Metadata;

            // The original test used '&&', which can never be true, so the width was never validated.
            // NOTE(review): assumes column index 0 is unused (columns are scanned from 1), making
            // 9 or 10 the valid widths — confirm against the caller that builds cellRange.
            int columnCount = cellRange.GetLength(1);
            if (columnCount < 9 || columnCount > 10)
            {
                throw new FormatException(Resources.UnrecognizedGffMetadataFormat);
            }

            int nameColIndex = -1;
            int sourceColIndex = -1;
            int typeColIndex = -1;
            int startColIndex = -1;
            int endColIndex = -1;
            int scoreColIndex = -1;
            int strandColIndex = -1;
            int frameColIndex = -1;
            int groupColIndex = -1;

            // Locate each column from the header row.
            for (int i = 1; i < columnCount; i++)
            {
                object headerCell = cellRange[rowIndex, i];
                if (headerCell == null)
                {
                    continue;
                }

                string header = headerCell.ToString().ToUpperInvariant();

                if (header.Equals(Properties.Resources.GffColumnName.ToUpperInvariant()))
                {
                    nameColIndex = i;
                }

                if (header.Equals(Properties.Resources.GffColumnSource.ToUpperInvariant()))
                {
                    sourceColIndex = i;
                }

                if (header.Equals(Properties.Resources.GffColumnType.ToUpperInvariant()))
                {
                    typeColIndex = i;
                }

                if (header.Equals(Properties.Resources.GffColumnStart.ToUpperInvariant()))
                {
                    startColIndex = i;
                }

                if (header.Equals(Properties.Resources.GffColumnEnd.ToUpperInvariant()))
                {
                    endColIndex = i;
                }

                if (header.Equals(Properties.Resources.GffColumnScore.ToUpperInvariant()))
                {
                    scoreColIndex = i;
                }

                if (header.Equals(Properties.Resources.GffColumnStrand.ToUpperInvariant()))
                {
                    strandColIndex = i;
                }

                if (header.Equals(Properties.Resources.GffColumnFrame.ToUpperInvariant()))
                {
                    frameColIndex = i;
                }

                if (header.Equals(Properties.Resources.GffColumnGroup.ToUpperInvariant()))
                {
                    groupColIndex = i;
                }
            }

            // Every column except the group (attributes) column is mandatory.
            if (nameColIndex == -1 ||
                sourceColIndex == -1 ||
                typeColIndex == -1 ||
                startColIndex == -1 ||
                endColIndex == -1 ||
                scoreColIndex == -1 ||
                strandColIndex == -1 ||
                frameColIndex == -1)
            {
                throw new FormatException(Resources.UnrecognizedGffMetadataFormat);
            }

            List<MetadataListItem<List<string>>> featureList = new List<MetadataListItem<List<string>>>();

            metadata["features"] = featureList;
            rowIndex++;

            while (rowIndex < cellRange.GetLength(0))
            {
                // The feature type is the item's key; the attributes (group) column is its free text.
                string type = ReadGffCellText(cellRange, rowIndex, typeColIndex);
                string attributes = groupColIndex != -1
                    ? ReadGffCellText(cellRange, rowIndex, groupColIndex)
                    : string.Empty;

                MetadataListItem<List<string>> feature = new MetadataListItem<List<string>>(type, attributes);

                feature.SubItems.Add("source", new List<string> { ReadGffCellText(cellRange, rowIndex, sourceColIndex) });

                int ignoredInt;

                // start is an int
                string start = ReadGffCellText(cellRange, rowIndex, startColIndex);
                if (!int.TryParse(start, out ignoredInt))
                {
                    throw CreateInvalidGffFieldException("start", start);
                }

                feature.SubItems.Add("start", new List<string> { start });

                // end is an int
                string end = ReadGffCellText(cellRange, rowIndex, endColIndex);
                if (!int.TryParse(end, out ignoredInt))
                {
                    throw CreateInvalidGffFieldException("end", end);
                }

                feature.SubItems.Add("end", new List<string> { end });

                // score is a double, or a dot (or blank cell) as a placeholder
                string score = ReadGffCellText(cellRange, rowIndex, scoreColIndex);
                if (!string.IsNullOrWhiteSpace(score) && score != ".")
                {
                    double ignoredDouble;
                    if (!double.TryParse(score, out ignoredDouble))
                    {
                        throw CreateInvalidGffFieldException("score", score);
                    }

                    feature.SubItems.Add("score", new List<string> { score });
                }

                // strand is + or -, or a dot (or blank cell) as a placeholder
                string strand = ReadGffCellText(cellRange, rowIndex, strandColIndex);
                if (!string.IsNullOrWhiteSpace(strand) && strand != ".")
                {
                    if (strand != "+" && strand != "-")
                    {
                        throw CreateInvalidGffFieldException("strand", strand);
                    }

                    feature.SubItems.Add("strand", new List<string> { strand });
                }

                // frame is an int, or a dot (or blank cell) as a placeholder
                string frame = ReadGffCellText(cellRange, rowIndex, frameColIndex);
                if (!string.IsNullOrWhiteSpace(frame) && frame != ".")
                {
                    if (!int.TryParse(frame, out ignoredInt))
                    {
                        throw CreateInvalidGffFieldException("frame", frame);
                    }

                    feature.SubItems.Add("frame", new List<string> { frame });
                }

                // done with that one
                featureList.Add(feature);
                rowIndex++;
            }

            return rowIndex;
        }

        /// <summary>
        /// Returns the text of a cell, or an empty string when the cell is null.
        /// </summary>
        private static string ReadGffCellText(object[,] cellRange, int row, int column)
        {
            object cell = cellRange[row, column];
            return cell != null ? cell.ToString() : string.Empty;
        }

        /// <summary>
        /// Creates the exception reported for a GFF field that holds an invalid value.
        /// </summary>
        private static InvalidDataException CreateInvalidGffFieldException(string fieldName, string value)
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Resources.GffInvalidField,
                fieldName,
                value);

            return new InvalidDataException(message);
        }
Exemplo n.º 9
0
        /// <summary>
        /// Processes a feature from a GTF gene model file.
        /// "transcript", "exon" and "CDS" features create the current gene and transcript as needed;
        /// exons and coding sequences of non-zero length are added to the current transcript.
        /// Features without a strand are ignored.
        /// </summary>
        /// <param name="feature">Feature metadata; its key is the feature type and its sub-items hold source, strand and frame.</param>
        /// <param name="oneBasedStart">One-based start position of the feature.</param>
        /// <param name="oneBasedEnd">One-based end position of the feature.</param>
        /// <param name="chrom">Chromosome the feature lies on; must not be null.</param>
        /// <param name="attributes">Parsed attributes; "gene_id", "transcript_id" and "protein_id" are read.</param>
        public void ProcessGtfFeature(MetadataListItem <List <string> > feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary <string, string> attributes)
        {
            bool hasGeneId       = attributes.TryGetValue("gene_id", out string geneId);
            bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
            bool hasProteinId    = attributes.TryGetValue("protein_id", out string proteinId);
            bool hasSource       = feature.SubItems.TryGetValue("source", out List <string> sourceish);
            bool hasStrand       = feature.SubItems.TryGetValue("strand", out List <string> strandish);
            bool hasFrame        = feature.SubItems.TryGetValue("frame", out List <string> framey);

            string source = hasSource ? sourceish[0] : "";

            // strand is required to do anything in this program
            if (!hasStrand)
            {
                return;
            }

            string strand = strandish[0];
            int    frame  = 0;

            if (hasFrame)
            {
                // An unparsable frame leaves the value at 0.
                int.TryParse(framey[0], out frame);
            }

            // Trim prefixes from the IDs. Ordinal comparison is used because these are
            // machine-readable identifiers, not linguistic text.
            const string genePrefix       = "gene:";
            const string transcriptPrefix = "transcript:";

            if (hasGeneId && geneId.StartsWith(genePrefix, System.StringComparison.Ordinal))
            {
                string newGeneId = geneId.Substring(genePrefix.Length);

                // string.Replace returns a new string; the result must be stored for the
                // trimmed ID to appear in the feature's free text.
                feature.FreeText = feature.FreeText.Replace(geneId, newGeneId);
                geneId = newGeneId;
            }
            if (hasTranscriptId && transcriptId.StartsWith(transcriptPrefix, System.StringComparison.Ordinal))
            {
                string newTranscriptId = transcriptId.Substring(transcriptPrefix.Length);
                feature.FreeText = feature.FreeText.Replace(transcriptId, newTranscriptId);
                transcriptId = newTranscriptId;
            }
            if (hasProteinId && proteinId.StartsWith(transcriptPrefix, System.StringComparison.Ordinal))
            {
                proteinId = proteinId.Substring(transcriptPrefix.Length); // transcript id is used for protein id sometimes
            }

            // Catch the transcript features before they go by if available, i.e. if the file doesn't just have exons
            if (feature.Key == "transcript" && (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID))
            {
                if (currentGene == null || hasGeneId && geneId != currentGene.ID)
                {
                    currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
                    Genes.Add(currentGene);
                    GenomeForest.Add(currentGene);
                }

                // NOTE(review): unlike the exon/CDS path below, the previous transcript is not
                // finalized (SetRegions / FrameCorrection) before being replaced here — confirm intended.
                currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
                currentGene.Transcripts.Add(currentTranscript);
                GenomeForest.Add(currentTranscript);
            }

            if (feature.Key == "exon" || feature.Key == "CDS")
            {
                if (currentGene == null || hasGeneId && geneId != currentGene.ID)
                {
                    currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
                    Genes.Add(currentGene);
                    GenomeForest.Add(currentGene);
                }

                if (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID)
                {
                    // Finalize the transcript being left before starting the next one.
                    if (currentTranscript != null)
                    {
                        Transcript.SetRegions(currentTranscript);
                        currentTranscript.FrameCorrection();
                    }
                    currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
                    currentGene.Transcripts.Add(currentTranscript);
                    GenomeForest.Add(currentTranscript);
                }

                if (feature.Key == "exon")
                {
                    // Convert the one-based inclusive range to a zero-based start plus length.
                    ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
                    Exon      exon     = new Exon(currentTranscript, currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
                                                  source, oneBasedStart, oneBasedEnd, chrom.Sequence.ID, strand, null, feature);
                    if (exon.Length() > 0)
                    {
                        currentTranscript.Exons.Add(exon);
                    }
                }
                else
                {
                    // feature.Key == "CDS"
                    CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
                    if (hasProteinId)
                    {
                        currentTranscript.ProteinID = proteinId;
                    }
                    if (cds.Length() > 0)
                    {
                        currentTranscript.CodingDomainSequences.Add(cds);
                    }
                }
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Processes a feature from a GFF3 gene model file.
        /// A feature carrying a new gene ID starts a new gene, and one carrying a new transcript ID
        /// starts a new transcript (finalizing the previous one). A feature with an exon ID adds an
        /// exon, otherwise one with a protein ID adds a coding sequence, to the current transcript.
        /// Features without a strand, and features that arrive before their parent gene or
        /// transcript has been seen, are ignored.
        /// </summary>
        /// <param name="feature">Feature metadata; its sub-items hold source, strand and frame.</param>
        /// <param name="oneBasedStart">One-based start position of the feature.</param>
        /// <param name="oneBasedEnd">One-based end position of the feature.</param>
        /// <param name="chrom">Chromosome the feature lies on; must not be null.</param>
        /// <param name="attributes">Parsed attributes; "gene_id", "transcript_id", "exon_id" and "protein_id" are read.</param>
        public void ProcessGff3Feature(MetadataListItem <List <string> > feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary <string, string> attributes)
        {
            bool hasGeneId       = attributes.TryGetValue("gene_id", out string geneId);
            bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
            bool hasExonId       = attributes.TryGetValue("exon_id", out string exonId);
            bool hasProteinId    = attributes.TryGetValue("protein_id", out string proteinId);
            bool hasSource       = feature.SubItems.TryGetValue("source", out List <string> sourceish); // false if empty ("." in GFF format)
            bool hasStrand       = feature.SubItems.TryGetValue("strand", out List <string> strandish); // false if empty ("." in GFF format)
            bool hasFrame        = feature.SubItems.TryGetValue("frame", out List <string> framey);     // false if empty ("." in GFF format)

            string source = hasSource ? sourceish[0] : "";

            // strand is required to do anything in this program
            if (!hasStrand)
            {
                return;
            }

            string strand = strandish[0];
            int    frame  = 0;

            if (hasFrame)
            {
                // An unparsable frame leaves the value at 0.
                int.TryParse(framey[0], out frame);
            }

            if (hasGeneId && (currentGene == null || geneId != currentGene.ID))
            {
                currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
                Genes.Add(currentGene);
                GenomeForest.Add(currentGene);
            }

            if (hasTranscriptId && (currentTranscript == null || transcriptId != currentTranscript.ID))
            {
                // A transcript cannot be attached without a parent gene; skip it rather than
                // fail with a NullReferenceException.
                if (currentGene == null)
                {
                    return;
                }

                // Finalize the transcript being left before starting the next one.
                if (currentTranscript != null)
                {
                    Transcript.SetRegions(currentTranscript);
                    currentTranscript.FrameCorrection();
                }
                currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
                currentGene.Transcripts.Add(currentTranscript);
                GenomeForest.Add(currentTranscript);
            }

            if (!hasExonId && !hasProteinId)
            {
                return; // nothing to do
            }

            // Exons and coding sequences need a transcript to belong to; skip orphans rather
            // than fail with a NullReferenceException.
            if (currentTranscript == null)
            {
                return;
            }

            if (hasExonId)
            {
                // Convert the one-based inclusive range to a zero-based start plus length.
                ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
                Exon      exon     = new Exon(currentTranscript, currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
                                              source, oneBasedStart, oneBasedEnd, chrom.ChromosomeID, strand, null, feature);
                if (exon.Length() > 0)
                {
                    currentTranscript.Exons.Add(exon);
                }
            }
            else
            {
                // hasProteinId
                CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
                if (cds.Length() > 0)
                {
                    currentTranscript.CodingDomainSequences.Add(cds);
                    currentTranscript.ProteinID = proteinId;
                }
            }
        }
Exemplo n.º 11
0
        /// <summary>
        /// Parses the consecutive feature lines for one sequence.
        /// Lines starting with the header mark are skipped. Parsing stops at the end of input or at
        /// the first feature line naming a different sequence; that line is returned unprocessed.
        /// Each feature is appended to the "features" list in the sequence's metadata.
        /// </summary>
        /// <param name="reader">Reader positioned after <paramref name="line"/>.</param>
        /// <param name="line">The first line to process.</param>
        /// <returns>The first line belonging to the next sequence, or null at the end of input.</returns>
        /// <exception cref="InvalidDataException">
        /// A line has the wrong number of tab-separated fields, or a start, end, score, strand or
        /// frame field holds an invalid value.
        /// </exception>
        private string ParseFeatures(TextReader reader, string line)
        {
            // The non-comment lines contain features, which are each stored as MetadataListItems.
            // The fields of each feature are referred to as sub-items.  For GFF, these have
            // unique keys, but for compatibility with our internal representation of features from
            // GenBank format, each sub-item is a list of strings, rather than a simple string.
            List <MetadataListItem <List <string> > > featureList = null;

            Tuple <ISequence, List <byte> > specificSeq = null;

            // Skip leading blank lines.
            while (line == "")
            {
                line = reader.ReadLine();
            }

            while (line != null)
            {
                if (line.StartsWith(HeaderMark, StringComparison.Ordinal))
                {
                    line = reader.ReadLine();
                    continue;
                }

                string[] featureFields = line.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);

                if (featureFields.Length < MinFieldsPerFeature || featureFields.Length > MaxFieldsPerFeature)
                {
                    string message = string.Format(
                        CultureInfo.CurrentCulture,
                        Resource.INVALID_INPUT_FILE,
                        this.Name);
                    throw new InvalidDataException(message);
                }

                // The featureFields array should now contain the following fields:
                //      featureFields[0]: sequence name
                //      featureFields[1]: source
                //      featureFields[2]: feature name
                //      featureFields[3]: start
                //      featureFields[4]: end
                //      featureFields[5]: score
                //      featureFields[6]: strand
                //      featureFields[7]: frame
                //      featureFields[8]: attributes (optional)

                // Process sequence name.
                if (specificSeq == null)
                {
                    specificSeq = this.GetSpecificSequence(featureFields[0], null);

                    // Retrieve the features list, or add an empty one to the metadata if this is the
                    // first feature. An entry of an unexpected type is replaced as well, so that
                    // featureList can never be null when features are appended below.
                    object existingFeatures;
                    specificSeq.Item1.Metadata.TryGetValue("features", out existingFeatures);
                    featureList = existingFeatures as List <MetadataListItem <List <string> > >;

                    if (featureList == null)
                    {
                        featureList = new List <MetadataListItem <List <string> > >();
                        specificSeq.Item1.Metadata["features"] = featureList;
                    }
                }
                else if (specificSeq.Item1.ID != featureFields[0])
                {
                    // don't go to next line; current line still needs to be processed
                    break;
                }

                // use feature name as key; attributes field is stored as free text
                string attributes = (featureFields.Length == 9 ? featureFields[8] : string.Empty);
                var    feature    = new MetadataListItem <List <string> >(featureFields[2], attributes);

                // source
                feature.SubItems.Add(SourceKey, new List <string> { featureFields[1] });

                // start is an int
                int ignoreMe;
                if (!int.TryParse(featureFields[3], out ignoreMe))
                {
                    throw CreateInvalidFeatureFieldException("start", featureFields[3]);
                }

                feature.SubItems.Add("start", new List <string> { featureFields[3] });

                // end is an int
                if (!int.TryParse(featureFields[4], out ignoreMe))
                {
                    throw CreateInvalidFeatureFieldException("end", featureFields[4]);
                }

                feature.SubItems.Add("end", new List <string> { featureFields[4] });

                // score is a double, or a dot as a placeholder
                if (featureFields[5] != ".")
                {
                    double ignoreMeToo;
                    if (!double.TryParse(featureFields[5], out ignoreMeToo))
                    {
                        throw CreateInvalidFeatureFieldException("score", featureFields[5]);
                    }

                    feature.SubItems.Add("score", new List <string> { featureFields[5] });
                }

                // strand is + or -, or a dot as a placeholder
                if (featureFields[6] != ".")
                {
                    if (featureFields[6] != "+" && featureFields[6] != "-")
                    {
                        throw CreateInvalidFeatureFieldException("strand", featureFields[6]);
                    }

                    feature.SubItems.Add("strand", new List <string> { featureFields[6] });
                }

                // frame is an int, or a dot as a placeholder
                if (featureFields[7] != ".")
                {
                    if (!int.TryParse(featureFields[7], out ignoreMe))
                    {
                        throw CreateInvalidFeatureFieldException("frame", featureFields[7]);
                    }

                    feature.SubItems.Add("frame", new List <string> { featureFields[7] });
                }

                // done with that one
                featureList.Add(feature);
                line = reader.ReadLine();
            }

            // if any seqs are left in _sequencesInHeader add it to _sequences
            if (this.sequencesInHeader.Count > 0)
            {
                this.sequences.AddRange(this.sequencesInHeader);

                this.sequencesInHeader.Clear();
            }

            return line;
        }

        /// <summary>
        /// Builds the exception for a feature field holding an invalid value, reporting the
        /// message to the trace log first.
        /// </summary>
        private static InvalidDataException CreateInvalidFeatureFieldException(string fieldName, string value)
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Resource.GffInvalidField,
                fieldName,
                value);
            Trace.Report(message);
            return new InvalidDataException(message);
        }
Exemplo n.º 12
0
        /// <summary>
        ///     Reads the leading comment/header section of a GFF stream and records what it
        ///     finds in the metadata of the common sequence and of the per-sequence entries.
        /// </summary>
        /// <remarks>
        ///     Empty lines are skipped. Comment lines that do not start with the header mark are
        ///     collected and stored under <c>CommentSectionKey</c> plus a running counter, so the
        ///     relative order of comments and headers is kept. Header names that match none of
        ///     the known keys are ignored, as is a header line that carries no name at all.
        ///     NOTE(review): apart from the gff-version value, header arguments are indexed
        ///     without a length check, so a truncated header still fails with
        ///     <see cref="IndexOutOfRangeException"/>.
        /// </remarks>
        /// <param name="reader">Reader positioned at the start of the header section.</param>
        /// <returns>
        ///     The first line that is not a comment or header, or <c>null</c> when the reader
        ///     is exhausted.
        /// </returns>
        /// <exception cref="NotSupportedException">
        ///     A gff-version header carries a version value other than "2".
        /// </exception>
        /// <exception cref="FormatException">
        ///     A date header cannot be parsed, a type header resolves to a null alphabet, or a
        ///     line inside a sequence-data section does not start with the header mark.
        /// </exception>
        private string ParseHeaders(TextReader reader)
        {
            string comments      = string.Empty;
            int    commentsCount = 1;
            string line          = reader.ReadLine();

            // Skip leading empty lines (a null line ends the loop because null != "").
            while (line == "")
            {
                line = reader.ReadLine();
            }

            while ((line != null) && line.TrimStart().StartsWith(CommentMark, StringComparison.Ordinal))
            {
                // process headers, but ignore other comments
                if (line.StartsWith(HeaderMark, StringComparison.Ordinal))
                {
                    // Drop the first two characters (presumably the header mark - confirm that
                    // HeaderMark is two characters long) and split the rest on spaces.
                    string[] fields = line.Substring(2).Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    // Flush any comments collected so far so they keep their position
                    // relative to this header.
                    if (!string.IsNullOrEmpty(comments))
                    {
                        this.commonSeq.Metadata[CommentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                        comments = string.Empty;
                        commentsCount++;
                    }

                    Tuple<ISequence, List<byte>> specificSeq = null;

                    // A header line with nothing after the mark yields no fields. Switching on
                    // null matches no case, so such a line is ignored like any unknown header
                    // instead of failing on fields[0].
                    string headerName = fields.Length > 0 ? fields[0].ToUpperInvariant() : null;

                    switch (headerName)
                    {
                    case GffVersionKey:
                        if (fields.Length > 1 && fields[1] != "2")
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Resource.GffUnsupportedVersion);
                            Trace.Report(message);
                            throw new NotSupportedException(message);
                        }

                        // Store "GFF-VERSION" to keep the order of comments/headers. The check
                        // above tolerates a missing version value, so fall back to an empty
                        // string here rather than indexing past the end of the array.
                        this.commonSeq.Metadata[GffVersionKey] = fields.Length > 1 ? fields[1] : string.Empty;

                        break;

                    case SourceVersionKey:

                        var sourceVersion = new MetadataListItem<string>(SourceVersionKey, string.Empty);
                        sourceVersion.SubItems.Add(SourceKey, fields[1]);
                        sourceVersion.SubItems.Add(VersionKey, fields[2]);
                        this.commonSeq.Metadata[SourceVersionKey] = sourceVersion;

                        break;

                    case DateKey:
                        // NOTE(review): parsing uses the current culture; confirm whether the
                        // invariant culture would be more appropriate for file data.
                        DateTime date;
                        if (!DateTime.TryParse(fields[1], out date))
                        {
                            string message = String.Format(CultureInfo.CurrentCulture, Resource.ParserInvalidDate);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        this.commonSeq.Metadata[DateLowerCaseKey] = date;
                        break;

                    case TypeKey:
                        if (fields.Length == 2)
                        {
                            // Type without a sequence name: applies to the common sequence.
                            this.commonSeq.Alphabet = GetAlphabetType(fields[1]);
                            if (this.commonSeq.Alphabet == null)
                            {
                                string message = String.Format(CultureInfo.CurrentCulture, Resource.InvalidType);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to keep the order of comments/headers.
                            this.commonSeq.Metadata[TypeKey] = fields[1];
                        }
                        else
                        {
                            // Type followed by a sequence name: applies to that sequence only.
                            specificSeq = this.GetSpecificSequence(fields[2], GetAlphabetType(fields[1]), false);

                            if (specificSeq.Item1.Alphabet == null)
                            {
                                string message = String.Format(CultureInfo.CurrentCulture, Resource.InvalidType);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to keep the order of comments/headers.
                            // Store seq id as value.
                            this.commonSeq.Metadata[MultiTypeKey + fields[2]] = fields[2];
                        }
                        break;

                    case "DNA":
                    case "RNA":
                    case "PROTEIN":
                        line = reader.ReadLine();

                        // Store seq id as value.
                        this.commonSeq.Metadata[MultiSeqDataKey + fields[1]] = fields[1];
                        specificSeq = this.GetSpecificSequence(fields[1], GetAlphabetType(fields[0]), false);

                        // Consume sequence-data lines until the end marker (SeqDataEnd followed
                        // by the header name as written in the file) or the end of input.
                        while ((line != null) && line != SeqDataEnd + fields[0])
                        {
                            if (!line.StartsWith(HeaderMark, StringComparison.Ordinal))
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Resource.GffInvalidSequence);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            byte[] tempSeqData = Encoding.UTF8.GetBytes(line.Substring(2).ToCharArray());
                            specificSeq.Item2.AddRange(tempSeqData);
                            line = reader.ReadLine();
                        }
                        break;

                    case SeqRegKey:

                        specificSeq = this.GetSpecificSequence(fields[1], null, false);
                        specificSeq.Item1.Metadata["start"] = fields[2];
                        specificSeq.Item1.Metadata["end"]   = fields[3];

                        // Store seq id as value.
                        this.commonSeq.Metadata[MultiSeqRegKey + fields[1]] = fields[1];
                        break;
                    }
                }
                else
                {
                    // Plain comment: accumulate, one comment per line.
                    comments = string.IsNullOrEmpty(comments) ? line : comments + Environment.NewLine + line;
                }

                // Advance to the next non-empty line.
                line = reader.ReadLine();
                while (line == "")
                {
                    line = reader.ReadLine();
                }
            }

            // Store comments that were not followed by another header.
            if (!string.IsNullOrEmpty(comments))
            {
                this.commonSeq.Metadata[CommentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] =
                    comments;
            }

            return line;
        }
Exemplo n.º 13
0
        // Parses the consecutive feature lines for one sequence.
        //
        // Reads lines from bioReader while it has lines. Lines starting with _headerMark are
        // skipped. Each remaining line is split on tabs and stored as one feature in the
        // "features" metadata list of the sequence named in its first field. Parsing stops,
        // without consuming the line, at the first feature that names a different sequence.
        //
        // Numeric fields are validated with the invariant culture so that a well-formed file
        // (e.g. a score of "0.5") is accepted regardless of the machine's regional settings.
        //
        // Throws InvalidDataException when a line has an unexpected number of fields or a
        // field fails validation, and InvalidOperationException when no feature was read.
        private void ParseFeatures(BioTextReader bioReader)
        {
            // The non-comment lines contain features, which are each stored as MetadataListItems.
            // The fields of each feature are referred to as sub-items.  For GFF, these have
            // unique keys, but for compatibility with our internal representation of features from
            // GenBank format, each sub-item is a list of strings, rather than a simple string.
            List<MetadataListItem<List<string>>> featureList = null;

            Sequence specificSeq = null;

            while (bioReader.HasLines)
            {
                if (bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                {
                    // ignore comments
                    bioReader.GoToNextLine();
                }
                else
                {
                    // fields are tab-delimited
                    string[] featureFields = bioReader.Line.Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);

                    if (featureFields.Length < _minFieldsPerFeature ||
                        featureFields.Length > _maxFieldsPerFeature)
                    {
                        string message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                        throw new InvalidDataException(message);
                    }

                    // The featureFields array should now contain the following fields:
                    //      featureFields[0]: sequence name
                    //      featureFields[1]: source
                    //      featureFields[2]: feature name
                    //      featureFields[3]: start
                    //      featureFields[4]: end
                    //      featureFields[5]: score
                    //      featureFields[6]: strand
                    //      featureFields[7]: frame
                    //      featureFields[8]: attributes (optional)

                    // Process sequence name.
                    if (specificSeq == null)
                    {
                        specificSeq = GetSpecificSequence(featureFields[0], MoleculeType.Invalid, bioReader);

                        // Retrieve features list, or add empty features list to metadata if this
                        // is the first feature.
                        if (specificSeq.Metadata.ContainsKey("features"))
                        {
                            featureList = specificSeq.Metadata["features"] as
                                          List<MetadataListItem<List<string>>>;
                        }
                        else
                        {
                            featureList = new List<MetadataListItem<List<string>>>();
                            specificSeq.Metadata["features"] = featureList;
                        }
                    }
                    else if (specificSeq.DisplayID != featureFields[0])
                    {
                        // don't go to next line; current line still needs to be processed
                        break;
                    }

                    // use feature name as key; attributes field is stored as free text
                    string attributes = (featureFields.Length == 9 ? featureFields[8] : string.Empty);
                    MetadataListItem<List<string>> feature = new MetadataListItem<List<string>>(featureFields[2], attributes);

                    // source
                    feature.SubItems.Add("source", new List<string> {
                        featureFields[1]
                    });

                    // start is an int; the parsed value is discarded, only the text is stored
                    int ignoreMe;
                    if (!int.TryParse(featureFields[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.GffInvalidField,
                            "start",
                            featureFields[3]);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                    feature.SubItems.Add("start", new List<string> {
                        featureFields[3]
                    });

                    // end is an int
                    if (!int.TryParse(featureFields[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.GffInvalidField,
                            "end",
                            featureFields[4]);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    feature.SubItems.Add("end", new List<string> {
                        featureFields[4]
                    });

                    // score is a double, or a dot as a space holder
                    if (featureFields[5] != ".")
                    {
                        // Same number styles as the culture-less overload, but with the
                        // invariant culture so "." is always the decimal separator.
                        double ignoreMeToo;
                        if (!double.TryParse(
                                featureFields[5],
                                NumberStyles.Float | NumberStyles.AllowThousands,
                                CultureInfo.InvariantCulture,
                                out ignoreMeToo))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffInvalidField,
                                "score",
                                featureFields[5]);
                            Trace.Report(message);
                            throw new InvalidDataException(message);
                        }
                        feature.SubItems.Add("score", new List<string> {
                            featureFields[5]
                        });
                    }

                    // strand is + or -, or a dot as a space holder
                    if (featureFields[6] != ".")
                    {
                        if (featureFields[6] != "+" && featureFields[6] != "-")
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffInvalidField,
                                "strand",
                                featureFields[6]);
                            Trace.Report(message);
                            throw new InvalidDataException(message);
                        }
                        feature.SubItems.Add("strand", new List<string> {
                            featureFields[6]
                        });
                    }

                    // frame is an int, or a dot as a space holder
                    if (featureFields[7] != ".")
                    {
                        if (!int.TryParse(featureFields[7], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffInvalidField,
                                "frame",
                                featureFields[7]);
                            Trace.Report(message);
                            throw new InvalidDataException(message);
                        }

                        feature.SubItems.Add("frame", new List<string> {
                            featureFields[7]
                        });
                    }

                    // done with that one
                    featureList.Add(feature);
                    bioReader.GoToNextLine();
                }
            }

            // A feature file with no features?  May it never be.
            if (featureList == null)
            {
                string message = Properties.Resource.GFFNoFeatures;
                Trace.Report(message);
                throw new InvalidOperationException(message);
            }
        }
Exemplo n.º 14
0
        /// <summary>
        /// Get the value of a particular key from GFF metadata structure
        /// </summary>
        /// <param name="listItem">GFF Metadata</param>
        /// <param name="itemKey">Header of the column in GFF metadata</param>
        /// <returns>
        /// First value stored for the given column, or an empty string when the column is
        /// absent or holds no values
        /// </returns>
        private static string GetGFFColumnValue(MetadataListItem<List<string>> listItem, string itemKey)
        {
            // TryGetValue assigns the out argument itself, so no list needs to be allocated
            // up front. A null list stored under the key is treated like a missing column.
            List<string> values;
            if (listItem.SubItems.TryGetValue(itemKey, out values) && values != null && values.Count > 0)
            {
                return values[0];
            }

            return string.Empty;
        }
Exemplo n.º 15
0
        // The headers for all sequences go at the top of the file before any features.
        //
        // First pass: walk sequenceList to find the source/version pair and the generic type
        // string shared by every sequence (null means "not shared"). The first sequence is also
        // used as the carrier of the common metadata; when the list is empty a new DNA sequence
        // stands in for it.
        //
        // Second pass: walk the metadata keys of that common sequence and emit one header (or
        // group of headers) per recognised key. For the multi-type, sequence-data and
        // sequence-region keys, sequences named in the metadata are written first; once the
        // last such key has been handled, the remaining sequences are written.
        private void WriteHeaders(ICollection<ISequence> sequenceList, TextWriter writer)
        {
            // look for file-scope data that is common to all sequences; null signifies no match
            MetadataListItem<string> sourceVersion = null;
            string       source               = null;
            string       version              = null;
            string       type                 = null;
            bool         firstSeq             = true;
            ISequence    commonSeq            = null;
            List<string> typeExceptionList    = new List<string>();
            List<string> seqDataExceptionList = new List<string>();
            List<string> seqRegExceptionList  = new List<string>();

            foreach (ISequence sequence in sequenceList)
            {
                if (firstSeq)
                {
                    // consider first seq for common metadata.
                    commonSeq = sequence;

                    object tmpobj;
                    // source and version go together; can't output one without the other
                    if (sequence.Metadata.TryGetValue(_sourceVersionKey, out tmpobj))
                    {
                        sourceVersion = tmpobj as MetadataListItem<string>;
                        if (sourceVersion != null && sourceVersion.SubItems.Count > 1)
                        {
                            source  = sourceVersion.SubItems[_sourceKey];
                            version = sourceVersion.SubItems[_versionKey];
                        }
                    }

                    // map to generic string; e.g. mRNA, tRNA -> RNA
                    type = GetGenericTypeString(sequence.MoleculeType);

                    firstSeq = false;
                }
                else
                {
                    // source and version go together; can't output one without the other
                    if (source != null)
                    {
                        bool sourceAndVersionMatchOthers = false;

                        object tmpobj;
                        // source and version go together; can't output one without the other
                        if (sequence.Metadata.TryGetValue(_sourceVersionKey, out tmpobj))
                        {
                            sourceVersion = tmpobj as MetadataListItem<string>;
                            if (sourceVersion != null && sourceVersion.SubItems.Count > 1)
                            {
                                sourceAndVersionMatchOthers = source == sourceVersion.SubItems[_sourceKey] &&
                                                              version == sourceVersion.SubItems[_versionKey];
                            }
                        }

                        // set both to null if this seq source and version don't match previous ones
                        if (!sourceAndVersionMatchOthers)
                        {
                            source  = null;
                            version = null;
                        }
                    }

                    // set type to null if this seq type doesn't match previous types
                    if (type != null && type != GetGenericTypeString(sequence.MoleculeType))
                    {
                        type = null;
                    }
                }
            }

            // No sequences at all: use a new DNA sequence as the common-metadata carrier.
            if (commonSeq == null)
            {
                commonSeq = new Sequence(Alphabets.DNA);
            }

            WriteCommonMetadata(commonSeq, sequenceList, writer, source, version, type, 1);

            // Number of per-sequence entries recorded for each multi-valued header kind.
            int totalTypeCount   = commonSeq.Metadata.Keys.Count(K => K.ToUpperInvariant().Contains(_multiTypeKey));
            int currentTypeCount = 0;
            int totalSeqData     = commonSeq.Metadata.Keys.Count(K => K.ToUpperInvariant().Contains(_multiSeqDataKey));
            int totalSeqRegs     = commonSeq.Metadata.Keys.Count(K => K.ToUpperInvariant().Contains(_multiSeqRegKey));

            ISequence seq = null;

            foreach (string key in commonSeq.Metadata.Keys)
            {
                string keyToCompare = key.ToUpperInvariant();
                string value        = string.Empty;

                // Keys that embed one of the prefixes below are collapsed to the bare prefix
                // so the switch can dispatch on it; the metadata value is read as a string.
                if (keyToCompare.Contains(_commentSectionKey))
                {
                    keyToCompare = _commentSectionKey;
                    value        = commonSeq.Metadata[key] as string;
                }

                if (keyToCompare.Contains(_multiTypeKey))
                {
                    keyToCompare = _multiTypeKey;
                    value        = commonSeq.Metadata[key] as string;
                }

                if (keyToCompare.Contains(_multiSeqDataKey))
                {
                    keyToCompare = _multiSeqDataKey;
                    value        = commonSeq.Metadata[key] as string;
                }

                if (keyToCompare.Contains(_multiSeqRegKey))
                {
                    keyToCompare = _multiSeqRegKey;
                    value        = commonSeq.Metadata[key] as string;
                }

                switch (keyToCompare)
                {
                case _commentSectionKey:
                    writer.WriteLine(value);
                    break;

                case _gffVersionKey:
                    // formatting using gff version 2
                    WriteHeaderLine(writer, _gffVersionLowercaseKey, "2");
                    WriteCommonMetadata(commonSeq, sequenceList, writer, source, version, type, 2);
                    break;

                case _sourceVersionKey:

                    // only output source if they all match
                    if (source != null)
                    {
                        WriteHeaderLine(writer, _sourceVersionLowercaseKey, source, version);
                    }

                    WriteCommonMetadata(commonSeq, sequenceList, writer, source, version, type, 3);
                    break;

                case _dateKey:
                    // Today's date. Formatted with the invariant culture so the header always
                    // carries a Gregorian yyyy-MM-dd date, whatever the machine's calendar.
                    WriteHeaderLine(
                        writer,
                        _dateLowercaseKey,
                        DateTime.Today.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture));
                    WriteCommonMetadata(commonSeq, sequenceList, writer, source, version, type, 4);
                    break;

                case _typeKey:
                    // type header
                    if (type != null)
                    {
                        // output that the types all match; don't need to output if DNA, as DNA is default
                        if (type != MoleculeType.DNA.ToString())
                        {
                            WriteHeaderLine(writer, _typeLowercaseKey, type);
                        }
                    }
                    else if (totalTypeCount == 0)
                    {
                        // NOTE(review): this loop overwrites the shared 'type' variable, so
                        // later cases see the last sequence's type instead of null - confirm
                        // this is intended.
                        foreach (ISequence sequence in sequenceList)
                        {
                            type = GetGenericTypeString(sequence.MoleculeType);

                            // only output seq-specific type header if this seq won't have its type
                            // output as part of a sequence data header; don't need to output if DNA,
                            // as DNA is default
                            if (type != MoleculeType.DNA.ToString() &&
                                (!ShouldWriteSequenceData || sequence.Count == 0))
                            {
                                WriteHeaderLine(writer, _typeLowercaseKey, type, sequence.DisplayID);
                            }
                        }
                    }
                    break;

                case _multiTypeKey:

                    if (totalTypeCount > 0)
                    {
                        if (type == null)
                        {
                            // Types differ: write the header for the sequence named by this key.
                            seq = sequenceList.FirstOrDefault(S => S.DisplayID.Equals(value));
                            if (seq != null)
                            {
                                WriteHeaderLine(writer, _typeLowercaseKey, seq.MoleculeType.ToString(), seq.DisplayID);
                                typeExceptionList.Add(seq.DisplayID);
                            }

                            currentTypeCount++;

                            // After the last multi-type key, cover the sequences not yet written.
                            if (currentTypeCount == totalTypeCount)
                            {
                                // NOTE(review): as in the type case, 'type' is overwritten here.
                                foreach (ISequence sequence in sequenceList)
                                {
                                    if (typeExceptionList.Contains(sequence.DisplayID))
                                    {
                                        continue;
                                    }

                                    type = GetGenericTypeString(sequence.MoleculeType);

                                    // only output seq-specific type header if this seq won't have its type
                                    // output as part of a sequence data header; don't need to output if DNA,
                                    // as DNA is default
                                    if (type != MoleculeType.DNA.ToString() &&
                                        (!ShouldWriteSequenceData || sequence.Count == 0))
                                    {
                                        WriteHeaderLine(writer, _typeLowercaseKey, type, sequence.DisplayID);
                                    }
                                }
                            }
                        }
                        else
                        {
                            // output that the types all match; don't need to output if DNA, as DNA is default
                            if (type != MoleculeType.DNA.ToString())
                            {
                                WriteHeaderLine(writer, _typeLowercaseKey, type);
                            }

                            // The common type has been handled; ignore the remaining multi-type keys.
                            totalTypeCount = 0;
                        }
                    }
                    break;

                case _multiSeqDataKey:
                    // sequence data
                    if (ShouldWriteSequenceData)
                    {
                        seq = sequenceList.FirstOrDefault(S => S.DisplayID.Equals(value));
                        if (seq != null)
                        {
                            WriteSeqData(seq, type, writer);
                            seqDataExceptionList.Add(seq.DisplayID);
                        }

                        totalSeqData--;

                        // After the last sequence-data key, write the sequences not yet covered.
                        if (totalSeqData == 0)
                        {
                            foreach (ISequence sequence in sequenceList)
                            {
                                if (seqDataExceptionList.Contains(sequence.DisplayID))
                                {
                                    continue;
                                }

                                WriteSeqData(sequence, type, writer);
                            }
                        }
                    }

                    break;

                case _multiSeqRegKey:
                    seq = sequenceList.FirstOrDefault(S => S.DisplayID.Equals(value));
                    if (seq != null)
                    {
                        if (seq.Metadata.ContainsKey(_startKey) && seq.Metadata.ContainsKey(_endKey))
                        {
                            WriteHeaderLine(writer, _seqRegKey, seq.DisplayID,
                                            seq.Metadata[_startKey] as string, seq.Metadata[_endKey] as string);
                        }

                        seqRegExceptionList.Add(value);
                    }

                    totalSeqRegs--;

                    // After the last sequence-region key, write the sequences not yet covered.
                    if (totalSeqRegs == 0)
                    {
                        // sequence-region header
                        foreach (ISequence sequence in sequenceList)
                        {
                            if (seqRegExceptionList.Contains(sequence.DisplayID))
                            {
                                continue;
                            }

                            if (sequence.Metadata.ContainsKey(_startKey) && sequence.Metadata.ContainsKey(_endKey))
                            {
                                WriteHeaderLine(writer, _seqRegKey, sequence.DisplayID,
                                                sequence.Metadata[_startKey] as string, sequence.Metadata[_endKey] as string);
                            }
                        }
                    }
                    break;
                }
            }
        }
Exemplo n.º 16
0
        // Processes headers, which are a type of comment.
        private void ParseHeaders(MBFTextReader mbfReader)
        {
            string comments      = string.Empty;
            int    commentsCount = 1;

            while (mbfReader.HasLines && mbfReader.Line.TrimStart().StartsWith(_commentMark, StringComparison.Ordinal))
            {
                Sequence specificSeq = null;

                // process headers, but ignore other comments
                if (mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                {
                    string[] fields = mbfReader.GetLineField(3).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    // Add if any comments.
                    if (!string.IsNullOrEmpty(comments))
                    {
                        _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                        comments = string.Empty;
                        commentsCount++;
                    }

                    switch (fields[0].ToUpperInvariant())
                    {
                    case _gffVersionKey:
                        if (fields.Length > 1 && fields[1] != "2")
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffUnsupportedVersion,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new NotSupportedException(message);
                        }

                        // Store "GFF-VERSION" to get keep the order of comments/headers.
                        _commonSeq.Metadata[_gffVersionKey] = fields[1];

                        break;

                    case _sourceVersionKey:

                        MetadataListItem <string> sourceVersion = new MetadataListItem <string>(_sourceVersionKey, string.Empty);
                        sourceVersion.SubItems.Add(_sourceKey, fields[1]);
                        sourceVersion.SubItems.Add(_versionKey, fields[2]);

                        _commonSeq.Metadata[_sourceVersionKey] = sourceVersion;

                        break;

                    case _dateKey:
                        DateTime date;
                        if (!DateTime.TryParse(fields[1], out date))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.ParserInvalidDate,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        _commonSeq.Metadata[_dateLowerCaseKey] = date;
                        break;

                    case _typeKey:
                        if (fields.Length == 2)
                        {
                            _commonSeq.MoleculeType = GetMoleculeType(fields[1]);
                            if (_commonSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to get keep the order of comments/headers.
                            _commonSeq.Metadata[_typeKey] = fields[1];
                        }
                        else
                        {
                            specificSeq = GetSpecificSequence(fields[2], GetMoleculeType(fields[1]), mbfReader, false);

                            if (specificSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to get keep the order of comments/headers.
                            // Store seq id as value.
                            _commonSeq.Metadata[_multiTypeKey + fields[2]] = fields[2];
                        }
                        break;

                    case "DNA":
                    case "RNA":
                    case "PROTEIN":
                        specificSeq = GetSpecificSequence(fields[1], GetMoleculeType(fields[0]), mbfReader, false);
                        mbfReader.GoToNextLine();

                        // Store seq id as value.
                        _commonSeq.Metadata[_multiSeqDataKey + fields[1]] = fields[1];

                        while (mbfReader.HasLines && mbfReader.Line != _seqDataEnd + fields[0])
                        {
                            if (!mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GffInvalidSequence,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            specificSeq.InsertRange(specificSeq.Count, mbfReader.GetLineField(3));

                            mbfReader.GoToNextLine();
                        }

                        break;

                    case _seqRegKey:

                        specificSeq = GetSpecificSequence(fields[1], MoleculeType.Invalid, mbfReader, false);
                        specificSeq.Metadata["start"] = fields[2];
                        specificSeq.Metadata["end"]   = fields[3];

                        // Store seq id as value.
                        _commonSeq.Metadata[_multiSeqRegKey + fields[1]] = fields[1];
                        break;
                    }
                }
                else
                {
                    comments = string.IsNullOrEmpty(comments) ? mbfReader.Line : comments + Environment.NewLine + mbfReader.Line;
                }

                mbfReader.GoToNextLine();
            }

            if (!string.IsNullOrEmpty(comments))
            {
                _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                comments = string.Empty;
            }
        }
Exemplo n.º 17
0
        /// <summary>
        /// Parses the consecutive feature lines that belong to one sequence and appends each
        /// parsed feature to that sequence's "features" metadata list.
        /// </summary>
        /// <param name="reader">Reader that supplies the lines following <paramref name="line"/>.</param>
        /// <param name="line">First line to examine. Blank lines and header lines are skipped.</param>
        /// <returns>
        /// The first feature line whose sequence name differs from the sequence being parsed
        /// (that line has not been processed), or null once the reader is exhausted.
        /// </returns>
        /// <exception cref="InvalidDataException">
        /// A feature line has an unexpected number of tab-separated fields, or its start, end,
        /// score, strand or frame field is malformed.
        /// </exception>
        private string ParseFeatures(TextReader reader, string line)
        {
            // The non-comment lines contain features, which are each stored as MetadataListItems.
            // The fields of each feature are referred to as sub-items.  For GFF, these have
            // unique keys, but for compatibility with our internal representation of features from
            // GenBank format, each sub-item is a list of strings, rather than a simple string.
            List<MetadataListItem<List<string>>> featureList = null;
            Tuple<ISequence, List<byte>> specificSeq = null;

            while (line != null)
            {
                // Blank lines (leading, interleaved or trailing) and header lines carry no
                // feature data, so they are skipped instead of failing the field-count check.
                if (line.Length == 0 || line.StartsWith(HeaderMark, StringComparison.Ordinal))
                {
                    line = reader.ReadLine();
                    continue;
                }

                string[] featureFields = line.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);

                if (featureFields.Length < MinFieldsPerFeature || featureFields.Length > MaxFieldsPerFeature)
                {
                    string message = string.Format(
                        CultureInfo.CurrentCulture,
                        Resource.INVALID_INPUT_FILE,
                        this.Name);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                // The featureFields array should now contain the following fields:
                //      featureFields[0]: sequence name
                //      featureFields[1]: source
                //      featureFields[2]: feature name
                //      featureFields[3]: start
                //      featureFields[4]: end
                //      featureFields[5]: score
                //      featureFields[6]: strand
                //      featureFields[7]: frame
                //      featureFields[8]: attributes (optional)

                // Process sequence name.
                if (specificSeq == null)
                {
                    specificSeq = this.GetSpecificSequence(featureFields[0], null);

                    // Retrieve features list, or add empty features list to metadata if this
                    // is the first feature.
                    if (specificSeq.Item1.Metadata.ContainsKey("features"))
                    {
                        featureList = specificSeq.Item1.Metadata["features"] as List<MetadataListItem<List<string>>>;
                    }
                    else
                    {
                        featureList = new List<MetadataListItem<List<string>>>();
                        specificSeq.Item1.Metadata["features"] = featureList;
                    }
                }
                else if (specificSeq.Item1.ID != featureFields[0])
                {
                    // don't go to next line; current line still needs to be processed
                    break;
                }

                // use feature name as key; attributes field is stored as free text
                string attributes = (featureFields.Length == 9 ? featureFields[8] : string.Empty);
                var feature = new MetadataListItem<List<string>>(featureFields[2], attributes);

                // source
                feature.SubItems.Add(SourceKey, new List<string> { featureFields[1] });

                // Numeric fields are file data, not user-facing text, so they are validated with
                // the invariant culture; otherwise a score such as "0.5" is rejected on machines
                // whose current culture uses a comma as the decimal separator.
                int ignoreMe;

                // start is an int
                if (!int.TryParse(featureFields[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
                {
                    throw CreateInvalidFieldException("start", featureFields[3]);
                }

                feature.SubItems.Add("start", new List<string> { featureFields[3] });

                // end is an int
                if (!int.TryParse(featureFields[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
                {
                    throw CreateInvalidFieldException("end", featureFields[4]);
                }

                feature.SubItems.Add("end", new List<string> { featureFields[4] });

                // score is a double, or a dot as a space holder
                if (featureFields[5] != ".")
                {
                    double ignoreMeToo;
                    if (!double.TryParse(
                            featureFields[5],
                            NumberStyles.Float | NumberStyles.AllowThousands,
                            CultureInfo.InvariantCulture,
                            out ignoreMeToo))
                    {
                        throw CreateInvalidFieldException("score", featureFields[5]);
                    }

                    feature.SubItems.Add("score", new List<string> { featureFields[5] });
                }

                // strand is + or -, or a dot as a space holder
                if (featureFields[6] != ".")
                {
                    if (featureFields[6] != "+" && featureFields[6] != "-")
                    {
                        throw CreateInvalidFieldException("strand", featureFields[6]);
                    }

                    feature.SubItems.Add("strand", new List<string> { featureFields[6] });
                }

                // frame is an int, or a dot as a space holder
                if (featureFields[7] != ".")
                {
                    if (!int.TryParse(featureFields[7], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
                    {
                        throw CreateInvalidFieldException("frame", featureFields[7]);
                    }

                    feature.SubItems.Add("frame", new List<string> { featureFields[7] });
                }

                // done with that one
                featureList.Add(feature);
                line = reader.ReadLine();
            }

            // if any seqs are left in _sequencesInHeader add it to _sequences
            if (this.sequencesInHeader.Count > 0)
            {
                this.sequences.AddRange(this.sequencesInHeader);

                this.sequencesInHeader.Clear();
            }

            return line;
        }

        /// <summary>
        /// Builds the exception for a malformed feature field and reports its message to the trace.
        /// </summary>
        /// <param name="fieldName">Name of the offending field, inserted into the message.</param>
        /// <param name="fieldValue">Text found in the field, inserted into the message.</param>
        /// <returns>The exception for the caller to throw.</returns>
        private static InvalidDataException CreateInvalidFieldException(string fieldName, string fieldValue)
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Resource.GffInvalidField,
                fieldName,
                fieldValue);
            Trace.Report(message);
            return new InvalidDataException(message);
        }
Exemplo n.º 18
0
        /// <summary>
        ///     Reads the leading run of comment lines, recording recognised header directives and
        ///     free-form comment text in the metadata of the common sequence.
        /// </summary>
        /// <param name="reader">Reader that supplies the input lines.</param>
        /// <returns>
        ///     The first non-blank line that does not start with the comment mark, or null when
        ///     the reader is exhausted.
        /// </returns>
        /// <exception cref="NotSupportedException">
        ///     The version header has no value or names a version other than "2".
        /// </exception>
        /// <exception cref="FormatException">
        ///     The date header is missing or unparsable, the type header is missing or not
        ///     recognised, or an embedded sequence section contains a non-header line.
        /// </exception>
        private string ParseHeaders(TextReader reader)
        {
            string comments = string.Empty;
            int commentsCount = 1;
            string line = reader.ReadLine();
            while (line == "")
            {
                line = reader.ReadLine();
            }

            while ((line != null) && line.TrimStart().StartsWith(CommentMark, StringComparison.Ordinal))
            {
                // process headers, but ignore other comments
                if (line.StartsWith(HeaderMark, StringComparison.Ordinal))
                {
                    // Drop the first two characters (presumably the header mark) and tokenise the rest.
                    string[] fields = line.Substring(2).Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    // Add if any comments.
                    if (!string.IsNullOrEmpty(comments))
                    {
                        this.commonSeq.Metadata[CommentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)
                            ] = comments;
                        comments = string.Empty;
                        commentsCount++;
                    }

                    // A bare header mark has no directive name; it falls through the switch like
                    // any unrecognised directive instead of failing on fields[0].
                    string directive = fields.Length > 0 ? fields[0].ToUpperInvariant() : string.Empty;

                    Tuple<ISequence, List<byte>> specificSeq = null;
                    switch (directive)
                    {
                        case GffVersionKey:
                            // A missing version value is rejected through the same path as an
                            // unsupported one rather than by an IndexOutOfRangeException below.
                            if (fields.Length < 2 || fields[1] != "2")
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Resource.GffUnsupportedVersion);
                                Trace.Report(message);
                                throw new NotSupportedException(message);
                            }

                            // Store "GFF-VERSION" to get keep the order of comments/headers.
                            this.commonSeq.Metadata[GffVersionKey] = fields[1];

                            break;

                        case SourceVersionKey:

                            // NOTE(review): a line with fewer than three tokens raises
                            // IndexOutOfRangeException here; no dedicated message is visible for it.
                            var sourceVersion = new MetadataListItem<string>(SourceVersionKey, string.Empty);
                            sourceVersion.SubItems.Add(SourceKey, fields[1]);
                            sourceVersion.SubItems.Add(VersionKey, fields[2]);
                            this.commonSeq.Metadata[SourceVersionKey] = sourceVersion;

                            break;

                        case DateKey:
                            // The date is file data, so it is parsed with the invariant culture
                            // instead of whatever culture the current thread happens to use.
                            DateTime date;
                            if (fields.Length < 2
                                || !DateTime.TryParse(fields[1], CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
                            {
                                string message = String.Format(CultureInfo.CurrentCulture, Resource.ParserInvalidDate);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            this.commonSeq.Metadata[DateLowerCaseKey] = date;
                            break;

                        case TypeKey:
                            // Two tokens: type for the common sequence.
                            // Three or more: type for the sequence named by the third token.
                            // Fewer than two: no type given, reported as an invalid type.
                            bool typeRecognised;
                            if (fields.Length < 2)
                            {
                                typeRecognised = false;
                            }
                            else if (fields.Length == 2)
                            {
                                this.commonSeq.Alphabet = GetAlphabetType(fields[1]);
                                typeRecognised = this.commonSeq.Alphabet != null;
                            }
                            else
                            {
                                specificSeq = this.GetSpecificSequence(fields[2], GetAlphabetType(fields[1]), false);
                                typeRecognised = specificSeq.Item1.Alphabet != null;
                            }

                            if (!typeRecognised)
                            {
                                string message = String.Format(CultureInfo.CurrentCulture, Resource.InvalidType);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to get keep the order of comments/headers.
                            if (fields.Length == 2)
                            {
                                this.commonSeq.Metadata[TypeKey] = fields[1];
                            }
                            else
                            {
                                // Store seq id as value.
                                this.commonSeq.Metadata[MultiTypeKey + fields[2]] = fields[2];
                            }

                            break;

                        case "DNA":
                        case "RNA":
                        case "PROTEIN":
                            line = reader.ReadLine();

                            // NOTE(review): a section header without a sequence id raises
                            // IndexOutOfRangeException here; no dedicated message is visible for it.
                            // Store seq id as value.
                            this.commonSeq.Metadata[MultiSeqDataKey + fields[1]] = fields[1];
                            specificSeq = this.GetSpecificSequence(fields[1], GetAlphabetType(fields[0]), false);

                            // The section ends at the end marker followed by the section name
                            // exactly as written in the opening header.
                            string endOfSection = SeqDataEnd + fields[0];
                            while ((line != null) && line != endOfSection)
                            {
                                if (!line.StartsWith(HeaderMark, StringComparison.Ordinal))
                                {
                                    string message = String.Format(
                                        CultureInfo.CurrentCulture,
                                        Resource.GffInvalidSequence);
                                    Trace.Report(message);
                                    throw new FormatException(message);
                                }

                                // Everything after the first two characters is sequence data.
                                specificSeq.Item2.AddRange(Encoding.UTF8.GetBytes(line.Substring(2)));
                                line = reader.ReadLine();
                            }

                            break;

                        case SeqRegKey:

                            // NOTE(review): a line with fewer than four tokens raises
                            // IndexOutOfRangeException here; no dedicated message is visible for it.
                            specificSeq = this.GetSpecificSequence(fields[1], null, false);
                            specificSeq.Item1.Metadata["start"] = fields[2];
                            specificSeq.Item1.Metadata["end"] = fields[3];

                            // Store seq id as value.
                            this.commonSeq.Metadata[MultiSeqRegKey + fields[1]] = fields[1];
                            break;
                    }
                }
                else
                {
                    // Consecutive plain comment lines are accumulated into one comment section.
                    comments = string.IsNullOrEmpty(comments) ? line : comments + Environment.NewLine + line;
                }

                line = reader.ReadLine();
                while (line == "")
                {
                    line = reader.ReadLine();
                }
            }

            // Flush a trailing comment section that was not followed by a header line.
            if (!string.IsNullOrEmpty(comments))
            {
                this.commonSeq.Metadata[CommentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] =
                    comments;
                comments = string.Empty;
            }

            return line;
        }
Exemplo n.º 19
0
 // Returns a tab followed by the first string of the named sub-item, or a tab followed by "."
 // when the sub-item is absent or holds no strings.
 private string GetSubItemString(MetadataListItem <List <string> > feature, string subItemName)
 {
     // Single lookup instead of ContainsKey + indexer; the null/empty guard keeps a sub-item
     // with no strings from throwing on [0].
     List <string> values;
     if (feature.SubItems.TryGetValue(subItemName, out values) && values != null && values.Count > 0)
     {
         return("\t" + values[0]);
     }

     return("\t.");
 }
Exemplo n.º 20
0
 /// <summary>
 /// Creates an exon by forwarding its location data to the base constructor and keeping
 /// the supplied feature metadata.
 /// </summary>
 /// <param name="parent">Transcript passed to the base constructor as the parent.</param>
 /// <param name="Sequence">Sequence passed to the base constructor.</param>
 /// <param name="source">Source string passed to the base constructor.</param>
 /// <param name="oneBasedStart">Start position passed to the base constructor.</param>
 /// <param name="oneBasedEnd">End position passed to the base constructor.</param>
 /// <param name="chromID">Chromosome identifier passed to the base constructor.</param>
 /// <param name="strand">Strand string passed to the base constructor.</param>
 /// <param name="featureMetadata">Metadata stored in <c>FeatureMetadata</c>.</param>
 public Exon(
     Transcript parent,
     ISequence Sequence,
     string source,
     long oneBasedStart,
     long oneBasedEnd,
     string chromID,
     string strand,
     MetadataListItem <List <string> > featureMetadata)
     : base(parent, chromID, source, strand, oneBasedStart, oneBasedEnd, Sequence)
 {
     this.FeatureMetadata = featureMetadata;
 }
Exemplo n.º 21
0
        /// <summary>
        ///     Looks up a sub-item of a feature and returns the first string stored under it.
        /// </summary>
        /// <param name="feature">Feature whose sub-items are searched.</param>
        /// <param name="subItemName">Key of the sub-item to look up.</param>
        /// <returns>
        ///     The first entry of the sub-item's list, or "." when the key is absent or the list
        ///     is empty. No tab is prepended.
        /// </returns>
        private string GetSubItemString(MetadataListItem<List<string>> feature, string subItemName)
        {
            List<string> values;
            bool hasText = feature.SubItems.TryGetValue(subItemName, out values) && values.Count >= 1;

            return hasText ? values[0] : ".";
        }