/// <summary>
/// Creates a gene that lies on the given chromosome.
/// </summary>
/// <param name="id">Identifier stored in <c>ID</c>.</param>
/// <param name="chromosome">Owning chromosome; it is passed to the base constructor together with its <c>Sequence.ID</c> and stored in <c>Chromosome</c>.</param>
/// <param name="source">Source value forwarded to the base constructor.</param>
/// <param name="strand">Strand value forwarded to the base constructor.</param>
/// <param name="oneBasedStart">One-based start coordinate forwarded to the base constructor.</param>
/// <param name="oneBasedEnd">One-based end coordinate forwarded to the base constructor.</param>
/// <param name="featureMetadata">Feature record stored in <c>FeatureMetadata</c>.</param>
public Gene(
    string id,
    Chromosome chromosome,
    string source,
    string strand,
    long oneBasedStart,
    long oneBasedEnd,
    MetadataListItem<List<string>> featureMetadata)
    : base(chromosome, chromosome.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null)
{
    // The last base-constructor argument is always null for genes.
    this.ID = id;
    this.Chromosome = chromosome;
    this.FeatureMetadata = featureMetadata;
}
/// <summary>
/// Creates a transcript belonging to the given gene.
/// </summary>
/// <param name="id">Identifier stored in <c>ID</c>; also used as the protein ID when <paramref name="proteinID"/> is null.</param>
/// <param name="gene">Parent gene; it is passed to the base constructor together with its <c>ChromosomeID</c> and stored in <c>Gene</c>.</param>
/// <param name="source">Source value forwarded to the base constructor.</param>
/// <param name="strand">Strand value forwarded to the base constructor.</param>
/// <param name="oneBasedStart">One-based start coordinate forwarded to the base constructor.</param>
/// <param name="oneBasedEnd">One-based end coordinate forwarded to the base constructor.</param>
/// <param name="proteinID">Protein identifier, or null to fall back to <paramref name="id"/>.</param>
/// <param name="featureMetadata">Feature record stored in <c>FeatureMetadata</c>.</param>
public Transcript(
    string id,
    Gene gene,
    string source,
    string strand,
    long oneBasedStart,
    long oneBasedEnd,
    string proteinID,
    MetadataListItem<List<string>> featureMetadata)
    : base(gene, gene.ChromosomeID, source, strand, oneBasedStart, oneBasedEnd)
{
    this.ID = id;

    // Without an explicit protein ID the transcript ID doubles as the protein ID.
    this.ProteinID = proteinID != null ? proteinID : id;

    this.Gene = gene;
    this.FeatureMetadata = featureMetadata;
}
// Returns the first value stored under the named sub-item of the feature, or "."
// when the sub-item is absent or holds no values. No tab is added here; any
// separator is left to the caller.
private string GetSubItemString(MetadataListItem<List<string>> feature, string subItemName)
{
    List<string> values;
    if (feature.SubItems.TryGetValue(subItemName, out values)
        && values != null
        && values.Count > 0)
    {
        return values[0];
    }

    // Missing, null or empty lists all collapse to the "." placeholder
    // instead of throwing from the indexer.
    return ".";
}
/// <summary>
/// Gets the first value stored under a particular key of a GFF metadata item.
/// </summary>
/// <param name="listItem">GFF metadata item whose sub-items are searched.</param>
/// <param name="itemKey">Key (column header) of the sub-item to read.</param>
/// <returns>
/// The first value of the sub-item, or <see cref="string.Empty"/> when the key is
/// absent or its list is null or empty.
/// </returns>
private static string GetGFFColumnValue(MetadataListItem<List<string>> listItem, string itemKey)
{
    // TryGetValue always overwrites its out argument, so no list is pre-allocated.
    List<string> values;
    if (listItem.SubItems.TryGetValue(itemKey, out values) && values != null && values.Count > 0)
    {
        return values[0];
    }

    return string.Empty;
}
/// <summary>
/// Builds a metadata item describing this object: the key is <c>FeatureType</c>,
/// the free text is the result of <c>GetGtfAttributes()</c>, and the sub-items
/// carry "source", "start", "end" and, unless the strand is ".", "strand".
/// </summary>
/// <returns>The newly created metadata item.</returns>
public MetadataListItem<List<string>> GetGtfFeatureMetadata()
{
    MetadataListItem<List<string>> gtfFeature =
        new MetadataListItem<List<string>>(FeatureType, GetGtfAttributes());

    gtfFeature.SubItems["source"] = new List<string> { Source.ToString() };
    gtfFeature.SubItems["start"] = new List<string> { OneBasedStart.ToString() };
    gtfFeature.SubItems["end"] = new List<string> { OneBasedEnd.ToString() };

    // A "." strand is left out of the sub-items entirely
    // (might take in features without strand later on).
    bool isStranded = Strand != ".";
    if (isStranded)
    {
        gtfFeature.SubItems["strand"] = new List<string> { Strand.ToString() };
    }

    return gtfFeature;
}
/// <summary>
/// Builds the metadata item for a CDS: the key is the CDS feature type, the free
/// text is the exon's GTF attributes followed by a <c>protein_id</c> attribute
/// taken from the CDS's parent transcript, and the sub-items carry "source",
/// "start", "end" and, unless the strand is ".", "strand".
/// </summary>
/// <param name="cds">Coding segment to describe; its <c>Parent</c> must be a <c>Transcript</c>.</param>
/// <param name="exon">Exon whose GTF attributes prefix the CDS attributes.</param>
/// <returns>The newly created metadata item.</returns>
/// <exception cref="ArgumentException">The parent of <paramref name="cds"/> is not a <c>Transcript</c>.</exception>
private static MetadataListItem<List<string>> CDSFeatureMetadata(CDS cds, Exon exon)
{
    // Check the cast explicitly rather than dereferencing the result of "as",
    // which would surface as an unexplained NullReferenceException.
    Transcript parentTranscript = cds.Parent as Transcript;
    if (parentTranscript == null)
    {
        throw new ArgumentException("The parent of the CDS must be a Transcript.", nameof(cds));
    }

    string cdsAttributes = exon.GetGtfAttributes() + " protein_id \"" + parentTranscript.ProteinID + "\";";
    var feature = new MetadataListItem<List<string>>(cds.FeatureType, cdsAttributes);

    feature.SubItems["source"] = new List<string> { cds.Source.ToString() };
    feature.SubItems["start"] = new List<string> { cds.OneBasedStart.ToString() };
    feature.SubItems["end"] = new List<string> { cds.OneBasedEnd.ToString() };

    // A "." strand is left out of the sub-items
    // (might take in features without strand later on).
    if (cds.Strand != ".")
    {
        feature.SubItems["strand"] = new List<string> { cds.Strand.ToString() };
    }

    return feature;
}
/// <summary>
/// Reads gene model features into data structures contained within this library.
/// Each sequence returned by <c>SimplerParse</c> is matched to a chromosome of
/// <c>Genome</c> by friendly name; its features are dispatched to
/// <c>ProcessGff3Feature</c> when the attribute text contains '=' and to
/// <c>ProcessGtfFeature</c> otherwise. Afterwards the last open transcript is
/// finalized, intergenic regions are created and <c>GenomeForest</c> is built.
/// </summary>
/// <param name="geneModelFile">Gene model file handed to <c>SimplerParse</c>.</param>
public void ReadGeneFeatures(string geneModelFile)
{
    foreach (ISequence chromFeatures in SimplerParse(geneModelFile))
    {
        Chromosome chrom = Genome.Chromosomes.FirstOrDefault(x => x.FriendlyName == chromFeatures.ID);
        if (chrom == null)
        {
            // Features on sequences that are not part of the genome are ignored.
            continue;
        }

        chromFeatures.Metadata.TryGetValue("features", out object f);
        var features = f as List<MetadataListItem<List<string>>>;
        if (features == null)
        {
            // No "features" entry, or one of an unexpected type: nothing to read for
            // this sequence (previously this dereferenced null).
            continue;
        }

        foreach (MetadataListItem<List<string>> feature in features)
        {
            // A failed parse leaves the coordinate at 0, exactly as before.
            long.TryParse(feature.SubItems["start"][0], out long start);
            long.TryParse(feature.SubItems["end"][0], out long end);

            var attributes = SplitAttributes(feature.FreeText);

            // Attribute text containing '=' is treated as GFF3; anything else as GTF.
            if (feature.FreeText.Contains('='))
            {
                ProcessGff3Feature(feature, start, end, chrom, attributes);
            }
            else
            {
                ProcessGtfFeature(feature, start, end, chrom, attributes);
            }
        }
    }

    // The per-feature processors finalize a transcript only when the next one starts,
    // so the last transcript still has to be finalized here.
    if (currentTranscript != null)
    {
        Transcript.SetRegions(currentTranscript);
        currentTranscript.FrameCorrection();
    }

    CreateIntergenicRegions();

    // possibly check transcript sanity here with Parallel.ForEach(Genes.SelectMany(g => g.Transcripts).ToList(), t => t.SanityCheck());
    GenomeForest.Build();
}
/// <summary>
/// Helper method to parse the features of GFF data held in a cell range.
/// The row at <paramref name="rowIndex"/> is read as the header row that names the
/// columns; every following row becomes one feature that is appended to the list
/// stored under the "features" key of the sequence metadata.
/// </summary>
/// <param name="sequence">Sequence object whose metadata receives the feature list.</param>
/// <param name="cellRange">Range of cells holding the header row and the feature rows.</param>
/// <param name="rowIndex">Index of the header row.</param>
/// <returns>Index of the row at which parsing stopped.</returns>
/// <exception cref="FormatException">
/// The range does not have 9 or 10 columns, or one of the name, source, type, start,
/// end, score, strand or frame columns is missing from the header row.
/// </exception>
/// <exception cref="InvalidDataException">A start, end, score, strand or frame cell holds an invalid value.</exception>
private static int ParseGffFeatures(ISequence sequence, object[,] cellRange, int rowIndex)
{
    Dictionary<string, object> metadata = sequence.Metadata;

    // The previous test combined the two bounds with "&&" and could never be true.
    int columnCount = cellRange.GetLength(1);
    if (columnCount < 9 || columnCount > 10)
    {
        throw new FormatException(Resources.UnrecognizedGffMetadataFormat);
    }

    int nameColIndex = -1;
    int sourceColIndex = -1;
    int typeColIndex = -1;
    int startColIndex = -1;
    int endColIndex = -1;
    int scoreColIndex = -1;
    int strandColIndex = -1;
    int frameColIndex = -1;
    int groupColIndex = -1;

    // NOTE(review): the scan starts at column 1 and stops before GetLength(1), as in the
    // original; whether cellRange is 0- or 1-based is not visible here - confirm the bounds.
    for (int i = 1; i < columnCount; i++)
    {
        object headerCell = cellRange[rowIndex, i];
        if (headerCell == null)
        {
            continue;
        }

        // Header names are compared case-insensitively via their upper-case invariant form.
        string header = headerCell.ToString().ToUpperInvariant();

        if (header.Equals(Properties.Resources.GffColumnName.ToUpperInvariant()))
        {
            nameColIndex = i;
        }

        if (header.Equals(Properties.Resources.GffColumnSource.ToUpperInvariant()))
        {
            sourceColIndex = i;
        }

        if (header.Equals(Properties.Resources.GffColumnType.ToUpperInvariant()))
        {
            typeColIndex = i;
        }

        if (header.Equals(Properties.Resources.GffColumnStart.ToUpperInvariant()))
        {
            startColIndex = i;
        }

        if (header.Equals(Properties.Resources.GffColumnEnd.ToUpperInvariant()))
        {
            endColIndex = i;
        }

        if (header.Equals(Properties.Resources.GffColumnScore.ToUpperInvariant()))
        {
            scoreColIndex = i;
        }

        if (header.Equals(Properties.Resources.GffColumnStrand.ToUpperInvariant()))
        {
            strandColIndex = i;
        }

        if (header.Equals(Properties.Resources.GffColumnFrame.ToUpperInvariant()))
        {
            frameColIndex = i;
        }

        if (header.Equals(Properties.Resources.GffColumnGroup.ToUpperInvariant()))
        {
            groupColIndex = i;
        }
    }

    // Every column except the group (attributes) column is mandatory.
    if (nameColIndex == -1 || sourceColIndex == -1 || typeColIndex == -1 || startColIndex == -1
        || endColIndex == -1 || scoreColIndex == -1 || strandColIndex == -1 || frameColIndex == -1)
    {
        throw new FormatException(Resources.UnrecognizedGffMetadataFormat);
    }

    List<MetadataListItem<List<string>>> featureList = new List<MetadataListItem<List<string>>>();
    metadata["features"] = featureList;

    rowIndex++;
    while (rowIndex < cellRange.GetLength(0))
    {
        // The feature type is the key; the group column, when present, is the free text.
        string featureType = GetGffCellText(cellRange, rowIndex, typeColIndex);
        string attributes = groupColIndex != -1
            ? GetGffCellText(cellRange, rowIndex, groupColIndex)
            : string.Empty;

        MetadataListItem<List<string>> feature = new MetadataListItem<List<string>>(featureType, attributes);

        string value = GetGffCellText(cellRange, rowIndex, sourceColIndex);
        feature.SubItems.Add("source", new List<string> { value });

        // start is an int
        int ignoreMe;
        value = GetGffCellText(cellRange, rowIndex, startColIndex);
        if (!int.TryParse(value, out ignoreMe))
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Resources.GffInvalidField,
                "start",
                value);
            throw new InvalidDataException(message);
        }

        feature.SubItems.Add("start", new List<string> { value });

        // end is an int
        value = GetGffCellText(cellRange, rowIndex, endColIndex);
        if (!int.TryParse(value, out ignoreMe))
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Resources.GffInvalidField,
                "end",
                value);
            throw new InvalidDataException(message);
        }

        feature.SubItems.Add("end", new List<string> { value });

        // score is a double, or a dot as a space holder; blank cells count as a dot
        value = GetGffCellText(cellRange, rowIndex, scoreColIndex);
        if (string.IsNullOrWhiteSpace(value))
        {
            value = ".";
        }

        if (value != ".")
        {
            double ignoreMeToo;
            if (!double.TryParse(value, out ignoreMeToo))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Resources.GffInvalidField,
                    "score",
                    value);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("score", new List<string> { value });
        }

        // strand is + or -, or a dot as a space holder; blank cells count as a dot
        value = GetGffCellText(cellRange, rowIndex, strandColIndex);
        if (string.IsNullOrWhiteSpace(value))
        {
            value = ".";
        }

        if (value != ".")
        {
            if (value != "+" && value != "-")
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Resources.GffInvalidField,
                    "strand",
                    value);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("strand", new List<string> { value });
        }

        // frame is an int, or a dot as a space holder; blank cells count as a dot
        value = GetGffCellText(cellRange, rowIndex, frameColIndex);
        if (string.IsNullOrWhiteSpace(value))
        {
            value = ".";
        }

        if (value != ".")
        {
            if (!int.TryParse(value, out ignoreMe))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Resources.GffInvalidField,
                    "frame",
                    value);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("frame", new List<string> { value });
        }

        // done with that one
        featureList.Add(feature);
        rowIndex++;
    }

    return rowIndex;
}

/// <summary>
/// Returns the text of one cell, or an empty string when the cell is null.
/// </summary>
private static string GetGffCellText(object[,] cellRange, int row, int column)
{
    object cell = cellRange[row, column];
    return cell != null ? cell.ToString() : string.Empty;
}
/// <summary>
/// Processes a feature from a GTF gene model file. Depending on the feature key
/// ("transcript", "exon" or "CDS") this starts a new gene and/or transcript when the
/// IDs differ from the current ones, and adds exons and coding segments to the
/// current transcript. Features without a strand are ignored.
/// </summary>
/// <param name="feature">Feature record; its key selects the handling and its sub-items supply source, strand and frame.</param>
/// <param name="oneBasedStart">One-based start coordinate of the feature.</param>
/// <param name="oneBasedEnd">One-based end coordinate of the feature.</param>
/// <param name="chrom">Chromosome the feature lies on; exon sequence is cut from its <c>Sequence</c>.</param>
/// <param name="attributes">Attribute dictionary; "gene_id", "transcript_id" and "protein_id" are read.</param>
public void ProcessGtfFeature(MetadataListItem<List<string>> feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary<string, string> attributes)
{
    bool hasGeneId = attributes.TryGetValue("gene_id", out string geneId);
    bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
    bool hasProteinId = attributes.TryGetValue("protein_id", out string proteinId);
    bool hasSource = feature.SubItems.TryGetValue("source", out List<string> sourceish);
    bool hasStrand = feature.SubItems.TryGetValue("strand", out List<string> strandish);
    bool hasFrame = feature.SubItems.TryGetValue("frame", out List<string> framey);

    string source = hasSource ? sourceish[0] : "";
    if (!hasStrand)
    {
        return; // strand is required to do anything in this program
    }

    string strand = strandish[0];
    int frame = 0;
    if (hasFrame)
    {
        int.TryParse(framey[0], out frame);
    }

    // Trim prefixes from the IDs. Strings are immutable, so the result of Replace must
    // be assigned back; previously it was discarded and the free text kept the prefixes.
    const string genePrefix = "gene:";
    const string transcriptPrefix = "transcript:";
    if (hasGeneId && geneId.StartsWith(genePrefix, StringComparison.Ordinal))
    {
        string newGeneId = geneId.Substring(genePrefix.Length);
        feature.FreeText = feature.FreeText.Replace(geneId, newGeneId);
        geneId = newGeneId;
    }

    if (hasTranscriptId && transcriptId.StartsWith(transcriptPrefix, StringComparison.Ordinal))
    {
        string newTranscriptId = transcriptId.Substring(transcriptPrefix.Length);
        feature.FreeText = feature.FreeText.Replace(transcriptId, newTranscriptId);
        transcriptId = newTranscriptId;
    }

    if (hasProteinId && proteinId.StartsWith(transcriptPrefix, StringComparison.Ordinal))
    {
        // transcript id is used for protein id sometimes
        proteinId = proteinId.Substring(transcriptPrefix.Length);
    }

    // Catch the transcript features before they go by if available,
    // i.e. if the file doesn't just have exons
    if (feature.Key == "transcript"
        && (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID))
    {
        if (currentGene == null || hasGeneId && geneId != currentGene.ID)
        {
            currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
            Genes.Add(currentGene);
            GenomeForest.Add(currentGene);
        }

        // NOTE(review): unlike the exon/CDS path below, the previous transcript is not
        // finalized (SetRegions/FrameCorrection) here - confirm this is intended.
        currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
        currentGene.Transcripts.Add(currentTranscript);
        GenomeForest.Add(currentTranscript);
    }

    if (feature.Key == "exon" || feature.Key == "CDS")
    {
        if (currentGene == null || hasGeneId && geneId != currentGene.ID)
        {
            currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
            Genes.Add(currentGene);
            GenomeForest.Add(currentGene);
        }

        if (currentTranscript == null || hasTranscriptId && transcriptId != currentTranscript.ID)
        {
            // Finalize the transcript that just ended before starting the next one.
            if (currentTranscript != null)
            {
                Transcript.SetRegions(currentTranscript);
                currentTranscript.FrameCorrection();
            }

            currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
            currentGene.Transcripts.Add(currentTranscript);
            GenomeForest.Add(currentTranscript);
        }

        if (feature.Key == "exon")
        {
            // Convert the one-based start to a zero-based offset for the subsequence.
            ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);
            Exon exon = new Exon(
                currentTranscript,
                currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
                source,
                oneBasedStart,
                oneBasedEnd,
                chrom.Sequence.ID,
                strand,
                null,
                feature);
            if (exon.Length() > 0)
            {
                currentTranscript.Exons.Add(exon);
            }
        }
        else
        {
            // The only other key that reaches this point is "CDS".
            CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
            if (hasProteinId)
            {
                currentTranscript.ProteinID = proteinId;
            }

            if (cds.Length() > 0)
            {
                currentTranscript.CodingDomainSequences.Add(cds);
            }
        }
    }
}
/// <summary>
/// Processes a feature from a GFF3 gene model file. A "gene_id" attribute that differs
/// from the current gene starts a new gene, a "transcript_id" that differs from the
/// current transcript finalizes that transcript and starts a new one, an "exon_id"
/// adds an exon and otherwise a "protein_id" adds a coding segment to the current
/// transcript. Features without a strand are ignored, as are transcripts that arrive
/// before any gene and exons/CDS that arrive before any transcript.
/// </summary>
/// <param name="feature">Feature record; its sub-items supply source, strand and frame.</param>
/// <param name="oneBasedStart">One-based start coordinate of the feature.</param>
/// <param name="oneBasedEnd">One-based end coordinate of the feature.</param>
/// <param name="chrom">Chromosome the feature lies on; exon sequence is cut from its <c>Sequence</c>.</param>
/// <param name="attributes">Attribute dictionary; "gene_id", "transcript_id", "exon_id" and "protein_id" are read.</param>
public void ProcessGff3Feature(MetadataListItem<List<string>> feature, long oneBasedStart, long oneBasedEnd, Chromosome chrom, Dictionary<string, string> attributes)
{
    bool hasGeneId = attributes.TryGetValue("gene_id", out string geneId);
    bool hasTranscriptId = attributes.TryGetValue("transcript_id", out string transcriptId);
    bool hasExonId = attributes.ContainsKey("exon_id");
    bool hasProteinId = attributes.TryGetValue("protein_id", out string proteinId);
    bool hasSource = feature.SubItems.TryGetValue("source", out List<string> sourceish); // false if empty ("." in GFF format)
    bool hasStrand = feature.SubItems.TryGetValue("strand", out List<string> strandish); // false if empty ("." in GFF format)
    bool hasFrame = feature.SubItems.TryGetValue("frame", out List<string> framey); // false if empty ("." in GFF format)

    string source = hasSource ? sourceish[0] : "";
    if (!hasStrand)
    {
        return; // strand is required to do anything in this program
    }

    string strand = strandish[0];
    int frame = 0;
    if (hasFrame)
    {
        int.TryParse(framey[0], out frame);
    }

    if (hasGeneId && (currentGene == null || geneId != currentGene.ID))
    {
        currentGene = new Gene(geneId, chrom, source, strand, oneBasedStart, oneBasedEnd, feature);
        Genes.Add(currentGene);
        GenomeForest.Add(currentGene);
    }

    if (hasTranscriptId && (currentTranscript == null || transcriptId != currentTranscript.ID))
    {
        if (currentGene == null)
        {
            // A transcript with no preceding gene cannot be attached anywhere; skip it
            // instead of failing with a NullReferenceException.
            return;
        }

        // Finalize the transcript that just ended before starting the next one.
        if (currentTranscript != null)
        {
            Transcript.SetRegions(currentTranscript);
            currentTranscript.FrameCorrection();
        }

        currentTranscript = new Transcript(transcriptId, currentGene, source, strand, oneBasedStart, oneBasedEnd, null, null, feature);
        currentGene.Transcripts.Add(currentTranscript);
        GenomeForest.Add(currentTranscript);
    }

    if (!hasExonId && !hasProteinId)
    {
        return; // nothing to do
    }

    if (currentTranscript == null)
    {
        // Exon or CDS with no transcript to own it; skip instead of dereferencing null.
        return;
    }

    if (hasExonId)
    {
        // Convert the one-based start to a zero-based offset for the subsequence.
        ISequence exon_dna = chrom.Sequence.GetSubSequence(oneBasedStart - 1, oneBasedEnd - oneBasedStart + 1);

        // chrom was dereferenced on the line above, so the former "chrom == null" fallback
        // for the chromosome ID could never apply.
        Exon exon = new Exon(
            currentTranscript,
            currentTranscript.IsStrandPlus() ? exon_dna : exon_dna.GetReverseComplementedSequence(),
            source,
            oneBasedStart,
            oneBasedEnd,
            chrom.ChromosomeID,
            strand,
            null,
            feature);
        if (exon.Length() > 0)
        {
            currentTranscript.Exons.Add(exon);
        }
    }
    else
    {
        // Only features with a protein_id reach this branch.
        CDS cds = new CDS(currentTranscript, chrom.Sequence.ID, source, strand, oneBasedStart, oneBasedEnd, null, frame);
        if (cds.Length() > 0)
        {
            currentTranscript.CodingDomainSequences.Add(cds);
            currentTranscript.ProteinID = proteinId;
        }
    }
}
/// <summary>
/// Parses the consecutive feature lines for one sequence. Header lines and empty lines
/// are skipped. Parsing stops at the end of input or at the first feature line that
/// names a different sequence; that line is returned unprocessed.
/// </summary>
/// <param name="reader">Reader positioned after <paramref name="line"/>.</param>
/// <param name="line">First line to examine; may be null when the input is exhausted.</param>
/// <returns>The first line that was not consumed, or null at the end of input.</returns>
/// <exception cref="InvalidDataException">
/// A feature line has too few or too many fields, or its start, end, score, strand or
/// frame field is invalid.
/// </exception>
private string ParseFeatures(TextReader reader, string line)
{
    // The non-comment lines contain features, which are each stored as MetadataListItems.
    // The fields of each feature are referred to as sub-items. For GFF, these have
    // unique keys, but for compatibility with our internal representation of features from
    // GenBank format, each sub-item is a list of strings, rather than a simple string.
    List<MetadataListItem<List<string>>> featureList = null;
    Tuple<ISequence, List<byte>> specificSeq = null;

    while (line != null)
    {
        // Skip header lines, and empty lines wherever they occur (previously only
        // leading empty lines were tolerated; later ones failed the field-count check).
        if (line.Length == 0 || line.StartsWith(HeaderMark, StringComparison.Ordinal))
        {
            line = reader.ReadLine();
            continue;
        }

        string[] featureFields = line.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
        if (featureFields.Length < MinFieldsPerFeature || featureFields.Length > MaxFieldsPerFeature)
        {
            string message = string.Format(
                CultureInfo.CurrentCulture,
                Resource.INVALID_INPUT_FILE,
                this.Name);
            throw new InvalidDataException(message);
        }

        // The featureFields array should now contain the following fields:
        // featureFields[0]: sequence name
        // featureFields[1]: source
        // featureFields[2]: feature name
        // featureFields[3]: start
        // featureFields[4]: end
        // featureFields[5]: score
        // featureFields[6]: strand
        // featureFields[7]: frame
        // featureFields[8]: attributes (optional)

        // Process sequence name.
        if (specificSeq == null)
        {
            specificSeq = this.GetSpecificSequence(featureFields[0], null);

            // Retrieve features list, or add empty features list to metadata if this
            // is the first feature.
            object existingFeatures;
            if (specificSeq.Item1.Metadata.TryGetValue("features", out existingFeatures))
            {
                featureList = existingFeatures as List<MetadataListItem<List<string>>>;
            }
            else
            {
                featureList = new List<MetadataListItem<List<string>>>();
                specificSeq.Item1.Metadata["features"] = featureList;
            }
        }
        else if (specificSeq.Item1.ID != featureFields[0])
        {
            // don't go to next line; current line still needs to be processed
            break;
        }

        // use feature name as key; attributes field is stored as free text
        string attributes = featureFields.Length == 9 ? featureFields[8] : string.Empty;
        var feature = new MetadataListItem<List<string>>(featureFields[2], attributes);

        // source
        feature.SubItems.Add(SourceKey, new List<string> { featureFields[1] });

        // Numeric fields are validated with the invariant culture: the file is
        // machine-readable, so the outcome must not depend on the regional settings of
        // the machine (e.g. a culture whose decimal separator is not '.').

        // start is an int
        int ignoreMe;
        if (!int.TryParse(featureFields[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Resource.GffInvalidField,
                "start",
                featureFields[3]);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        feature.SubItems.Add("start", new List<string> { featureFields[3] });

        // end is an int
        if (!int.TryParse(featureFields[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Resource.GffInvalidField,
                "end",
                featureFields[4]);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        feature.SubItems.Add("end", new List<string> { featureFields[4] });

        // score is a double, or a dot as a space holder
        if (featureFields[5] != ".")
        {
            double ignoreMeToo;
            if (!double.TryParse(
                    featureFields[5],
                    NumberStyles.Float | NumberStyles.AllowThousands,
                    CultureInfo.InvariantCulture,
                    out ignoreMeToo))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Resource.GffInvalidField,
                    "score",
                    featureFields[5]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("score", new List<string> { featureFields[5] });
        }

        // strand is + or -, or a dot as a space holder
        if (featureFields[6] != ".")
        {
            if (featureFields[6] != "+" && featureFields[6] != "-")
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Resource.GffInvalidField,
                    "strand",
                    featureFields[6]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("strand", new List<string> { featureFields[6] });
        }

        // frame is an int, or a dot as a space holder
        if (featureFields[7] != ".")
        {
            if (!int.TryParse(featureFields[7], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Resource.GffInvalidField,
                    "frame",
                    featureFields[7]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("frame", new List<string> { featureFields[7] });
        }

        // done with that one
        featureList.Add(feature);
        line = reader.ReadLine();
    }

    // if any seqs are left in sequencesInHeader add them to sequences
    if (this.sequencesInHeader.Count > 0)
    {
        this.sequences.AddRange(this.sequencesInHeader);
        this.sequencesInHeader.Clear();
    }

    return line;
}
/// <summary>
/// Processes the leading comment and header lines. Header lines (those starting with
/// <c>HeaderMark</c>) are interpreted as directives and recorded in the metadata of the
/// common sequence or of the sequence they name; other comment lines are collected and
/// stored as numbered comment sections. Unknown directives are ignored.
/// </summary>
/// <param name="reader">Reader positioned at the start of the input.</param>
/// <returns>The first line that is not a comment or header, or null at the end of input.</returns>
/// <exception cref="InvalidDataException">A recognized directive has fewer fields than it needs.</exception>
/// <exception cref="NotSupportedException">The declared GFF version is not "2".</exception>
/// <exception cref="FormatException">A date, type or embedded sequence block is invalid.</exception>
private string ParseHeaders(TextReader reader)
{
    string comments = string.Empty;
    int commentsCount = 1;
    string line = reader.ReadLine();

    // Skip leading empty lines.
    while (line == "")
    {
        line = reader.ReadLine();
    }

    while ((line != null) && line.TrimStart().StartsWith(CommentMark, StringComparison.Ordinal))
    {
        // process headers, but ignore other comments
        if (line.StartsWith(HeaderMark, StringComparison.Ordinal))
        {
            // The first two characters are dropped before splitting
            // (presumably the length of HeaderMark - TODO confirm).
            string[] fields = line.Substring(2).Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Add if any comments.
            if (!string.IsNullOrEmpty(comments))
            {
                this.commonSeq.Metadata[CommentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                comments = string.Empty;
                commentsCount++;
            }

            Tuple<ISequence, List<byte>> specificSeq = null;

            // A header mark with nothing after it has no directive; like any unknown
            // directive it matches no case below.
            string directive = fields.Length > 0 ? fields[0].ToUpperInvariant() : string.Empty;
            switch (directive)
            {
                case GffVersionKey:
                    this.EnsureHeaderFieldCount(fields, 2);
                    if (fields[1] != "2")
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Resource.GffUnsupportedVersion);
                        Trace.Report(message);
                        throw new NotSupportedException(message);
                    }

                    // Store "GFF-VERSION" to keep the order of comments/headers.
                    this.commonSeq.Metadata[GffVersionKey] = fields[1];
                    break;

                case SourceVersionKey:
                    this.EnsureHeaderFieldCount(fields, 3);
                    var sourceVersion = new MetadataListItem<string>(SourceVersionKey, string.Empty);
                    sourceVersion.SubItems.Add(SourceKey, fields[1]);
                    sourceVersion.SubItems.Add(VersionKey, fields[2]);
                    this.commonSeq.Metadata[SourceVersionKey] = sourceVersion;
                    break;

                case DateKey:
                    this.EnsureHeaderFieldCount(fields, 2);
                    DateTime date;
                    if (!DateTime.TryParse(fields[1], out date))
                    {
                        string message = String.Format(CultureInfo.CurrentCulture, Resource.ParserInvalidDate);
                        Trace.Report(message);
                        throw new FormatException(message);
                    }

                    this.commonSeq.Metadata[DateLowerCaseKey] = date;
                    break;

                case TypeKey:
                    if (fields.Length == 2)
                    {
                        // A type without a sequence name applies to the common sequence.
                        this.commonSeq.Alphabet = GetAlphabetType(fields[1]);
                        if (this.commonSeq.Alphabet == null)
                        {
                            string message = String.Format(CultureInfo.CurrentCulture, Resource.InvalidType);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        // Store "TYPE" to keep the order of comments/headers.
                        this.commonSeq.Metadata[TypeKey] = fields[1];
                    }
                    else
                    {
                        this.EnsureHeaderFieldCount(fields, 3);
                        specificSeq = this.GetSpecificSequence(fields[2], GetAlphabetType(fields[1]), false);
                        if (specificSeq.Item1.Alphabet == null)
                        {
                            string message = String.Format(CultureInfo.CurrentCulture, Resource.InvalidType);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        // Store "TYPE" to keep the order of comments/headers.
                        // Store seq id as value.
                        this.commonSeq.Metadata[MultiTypeKey + fields[2]] = fields[2];
                    }

                    break;

                case "DNA":
                case "RNA":
                case "PROTEIN":
                    this.EnsureHeaderFieldCount(fields, 2);
                    line = reader.ReadLine();

                    // Store seq id as value.
                    this.commonSeq.Metadata[MultiSeqDataKey + fields[1]] = fields[1];
                    specificSeq = this.GetSpecificSequence(fields[1], GetAlphabetType(fields[0]), false);

                    // Every line up to the end marker for this block must be a header line
                    // carrying sequence data after its first two characters.
                    while ((line != null) && line != SeqDataEnd + fields[0])
                    {
                        if (!line.StartsWith(HeaderMark, StringComparison.Ordinal))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Resource.GffInvalidSequence);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        byte[] tempSeqData = Encoding.UTF8.GetBytes(line.Substring(2));
                        specificSeq.Item2.AddRange(tempSeqData);
                        line = reader.ReadLine();
                    }

                    break;

                case SeqRegKey:
                    this.EnsureHeaderFieldCount(fields, 4);
                    specificSeq = this.GetSpecificSequence(fields[1], null, false);
                    specificSeq.Item1.Metadata["start"] = fields[2];
                    specificSeq.Item1.Metadata["end"] = fields[3];

                    // Store seq id as value.
                    this.commonSeq.Metadata[MultiSeqRegKey + fields[1]] = fields[1];
                    break;
            }
        }
        else
        {
            // Plain comment: accumulate until the next header line or the end of the headers.
            comments = string.IsNullOrEmpty(comments) ? line : comments + Environment.NewLine + line;
        }

        line = reader.ReadLine();

        // Skip empty lines between headers.
        while (line == "")
        {
            line = reader.ReadLine();
        }
    }

    // Flush comments that were not followed by another header line.
    if (!string.IsNullOrEmpty(comments))
    {
        this.commonSeq.Metadata[CommentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
    }

    return line;
}

/// <summary>
/// Throws an <see cref="InvalidDataException"/> when a header line has fewer fields than
/// its directive needs, instead of letting an array index fail later.
/// </summary>
private void EnsureHeaderFieldCount(string[] fields, int requiredCount)
{
    if (fields.Length < requiredCount)
    {
        string message = string.Format(
            CultureInfo.CurrentCulture,
            Resource.INVALID_INPUT_FILE,
            this.Name);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }
}
// Parses the consecutive feature lines for one sequence. Lines starting with the header
// mark are skipped. Parsing stops when the reader has no more lines or at the first
// feature line that names a different sequence, which is left as the reader's current
// line. Throws InvalidDataException for a malformed feature line and
// InvalidOperationException when no feature line was found at all.
private void ParseFeatures(BioTextReader bioReader)
{
    // The non-comment lines contain features, which are each stored as MetadataListItems.
    // The fields of each feature are referred to as sub-items. For GFF, these have
    // unique keys, but for compatibility with our internal representation of features from
    // GenBank format, each sub-item is a list of strings, rather than a simple string.
    List<MetadataListItem<List<string>>> featureList = null;
    Sequence specificSeq = null;

    while (bioReader.HasLines)
    {
        if (bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
        {
            // ignore comments
            bioReader.GoToNextLine();
            continue;
        }

        // fields are tab-delimited
        string[] featureFields = bioReader.Line.Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
        if (featureFields.Length < _minFieldsPerFeature || featureFields.Length > _maxFieldsPerFeature)
        {
            string message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
            throw new InvalidDataException(message);
        }

        // The featureFields array should now contain the following fields:
        // featureFields[0]: sequence name
        // featureFields[1]: source
        // featureFields[2]: feature name
        // featureFields[3]: start
        // featureFields[4]: end
        // featureFields[5]: score
        // featureFields[6]: strand
        // featureFields[7]: frame
        // featureFields[8]: attributes (optional)

        // Process sequence name.
        if (specificSeq == null)
        {
            specificSeq = GetSpecificSequence(featureFields[0], MoleculeType.Invalid, bioReader);

            // Retrieve features list, or add empty features list to metadata if this
            // is the first feature.
            if (specificSeq.Metadata.ContainsKey("features"))
            {
                featureList = specificSeq.Metadata["features"] as List<MetadataListItem<List<string>>>;
            }
            else
            {
                featureList = new List<MetadataListItem<List<string>>>();
                specificSeq.Metadata["features"] = featureList;
            }
        }
        else if (specificSeq.DisplayID != featureFields[0])
        {
            // don't go to next line; current line still needs to be processed
            break;
        }

        // use feature name as key; attributes field is stored as free text
        string attributes = featureFields.Length == 9 ? featureFields[8] : string.Empty;
        MetadataListItem<List<string>> feature = new MetadataListItem<List<string>>(featureFields[2], attributes);

        // source
        feature.SubItems.Add("source", new List<string> { featureFields[1] });

        // Numeric fields are validated with the invariant culture: the file is
        // machine-readable, so the outcome must not depend on the regional settings of
        // the machine (e.g. a culture whose decimal separator is not '.').

        // start is an int
        int ignoreMe;
        if (!int.TryParse(featureFields[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.GffInvalidField,
                "start",
                featureFields[3]);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        feature.SubItems.Add("start", new List<string> { featureFields[3] });

        // end is an int
        if (!int.TryParse(featureFields[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.GffInvalidField,
                "end",
                featureFields[4]);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        feature.SubItems.Add("end", new List<string> { featureFields[4] });

        // score is a double, or a dot as a space holder
        if (featureFields[5] != ".")
        {
            double ignoreMeToo;
            if (!double.TryParse(
                    featureFields[5],
                    NumberStyles.Float | NumberStyles.AllowThousands,
                    CultureInfo.InvariantCulture,
                    out ignoreMeToo))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.GffInvalidField,
                    "score",
                    featureFields[5]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("score", new List<string> { featureFields[5] });
        }

        // strand is + or -, or a dot as a space holder
        if (featureFields[6] != ".")
        {
            if (featureFields[6] != "+" && featureFields[6] != "-")
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.GffInvalidField,
                    "strand",
                    featureFields[6]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("strand", new List<string> { featureFields[6] });
        }

        // frame is an int, or a dot as a space holder
        if (featureFields[7] != ".")
        {
            if (!int.TryParse(featureFields[7], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoreMe))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.GffInvalidField,
                    "frame",
                    featureFields[7]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("frame", new List<string> { featureFields[7] });
        }

        // done with that one
        featureList.Add(feature);
        bioReader.GoToNextLine();
    }

    // A feature file with no features? May it never be.
    if (featureList == null)
    {
        string message = Properties.Resource.GFFNoFeatures;
        Trace.Report(message);
        throw new InvalidOperationException(message);
    }
}
/// <summary>
/// Gets the first value stored under a particular key of a GFF metadata item.
/// </summary>
/// <param name="listItem">GFF metadata item whose sub-items are searched.</param>
/// <param name="itemKey">Key (column header) of the sub-item to read.</param>
/// <returns>
/// The first value of the sub-item, or <see cref="string.Empty"/> when the key is
/// absent or its list is null or empty.
/// </returns>
private static string GetGFFColumnValue(MetadataListItem<List<string>> listItem, string itemKey)
{
    // TryGetValue always overwrites its out argument, so no list is pre-allocated.
    List<string> values;
    bool found = listItem.SubItems.TryGetValue(itemKey, out values);
    if (!found || values == null || values.Count == 0)
    {
        return string.Empty;
    }

    return values[0];
}
// The headers for all sequences go at the top of the file before any features.
//
// Works in two passes:
//   1. Walk sequenceList to find the source/version pair and the generic type string
//      shared by every sequence; each is reset to null as soon as one sequence disagrees.
//   2. Walk the metadata keys of the first sequence (or of an empty DNA Sequence when the
//      list is empty) and emit one header per recognized key, in key enumeration order.
//
// sequenceList: the sequences whose headers are written.
// writer:       destination for the header lines.
private void WriteHeaders(ICollection <ISequence> sequenceList, TextWriter writer)
{
    // look for file-scope data that is common to all sequences; null signifies no match
    MetadataListItem <string> sourceVersion = null;
    string source = null;
    string version = null;
    string type = null;
    bool firstSeq = true;
    ISequence commonSeq = null;
    // DisplayIDs that already had their own type / sequence-data / sequence-region header
    // written, so the catch-up loops below can skip them.
    List <string> typeExceptionList = new List <string>();
    List <string> seqDataExceptionList = new List <string>();
    List <string> seqRegExceptionList = new List <string>();
    foreach (ISequence sequence in sequenceList)
    {
        if (firstSeq)
        {
            // consider first seq for common metadata.
            commonSeq = sequence;
            object tmpobj;
            // source and version go together; can't output one without the other
            if (sequence.Metadata.TryGetValue(_sourceVersionKey, out tmpobj))
            {
                sourceVersion = tmpobj as MetadataListItem <string>;
                if (sourceVersion != null && sourceVersion.SubItems.Count > 1)
                {
                    source = sourceVersion.SubItems[_sourceKey];
                    version = sourceVersion.SubItems[_versionKey];
                }
            }
            // map to generic string; e.g. mRNA, tRNA -> RNA
            type = GetGenericTypeString(sequence.MoleculeType);
            firstSeq = false;
        }
        else
        {
            // source and version go together; can't output one without the other
            if (source != null)
            {
                bool sourceAndVersionMatchOthers = false;
                object tmpobj;
                // source and version go together; can't output one without the other
                if (sequence.Metadata.TryGetValue(_sourceVersionKey, out tmpobj))
                {
                    sourceVersion = tmpobj as MetadataListItem <string>;
                    if (sourceVersion != null && sourceVersion.SubItems.Count > 1)
                    {
                        sourceAndVersionMatchOthers = source == sourceVersion.SubItems[_sourceKey] && version == sourceVersion.SubItems[_versionKey];
                    }
                }
                // set both to null if this seq source and version don't match previous ones
                if (!sourceAndVersionMatchOthers)
                {
                    source = null;
                    version = null;
                }
            }
            // set type to null if this seq type doesn't match previous types
            if (type != null && type != GetGenericTypeString(sequence.MoleculeType))
            {
                type = null;
            }
        }
    }
    // Empty input: fall back to a fresh DNA sequence so the metadata walk below has a target.
    if (commonSeq == null)
    {
        commonSeq = new Sequence(Alphabets.DNA);
    }
    // NOTE(review): the trailing integer (1 here, 2-4 in the cases below) is interpreted by
    // WriteCommonMetadata, which is not visible in this block - confirm its meaning there.
    WriteCommonMetadata(commonSeq, sequenceList, writer, source, version, type, 1);
    // Count the per-sequence entries present, so each case below can tell when it has
    // handled the last one and should emit headers for the remaining sequences.
    int totalTypeCount = commonSeq.Metadata.Keys.Count(K => K.ToUpperInvariant().Contains(_multiTypeKey));
    int currentTypeCount = 0;
    int totalSeqData = commonSeq.Metadata.Keys.Count(K => K.ToUpperInvariant().Contains(_multiSeqDataKey));
    int totalSeqRegs = commonSeq.Metadata.Keys.Count(K => K.ToUpperInvariant().Contains(_multiSeqRegKey));
    ISequence seq = null;
    foreach (string key in commonSeq.Metadata.Keys)
    {
        string keyToCompare = key.ToUpperInvariant();
        string value = string.Empty;
        // Keys that merely contain one of the marker strings are normalized to that
        // marker so the switch can match them; the stored metadata value (read as a
        // string) becomes the case's payload.
        if (keyToCompare.Contains(_commentSectionKey))
        {
            keyToCompare = _commentSectionKey;
            value = commonSeq.Metadata[key] as string;
        }
        if (keyToCompare.Contains(_multiTypeKey))
        {
            keyToCompare = _multiTypeKey;
            value = commonSeq.Metadata[key] as string;
        }
        if (keyToCompare.Contains(_multiSeqDataKey))
        {
            keyToCompare = _multiSeqDataKey;
            value = commonSeq.Metadata[key] as string;
        }
        if (keyToCompare.Contains(_multiSeqRegKey))
        {
            keyToCompare = _multiSeqRegKey;
            value = commonSeq.Metadata[key] as string;
        }
        switch (keyToCompare)
        {
            case _commentSectionKey:
                // free-text comment block: written back verbatim
                writer.WriteLine(value);
                break;
            case _gffVersionKey:
                // formatting using gff version 2 (the stored value is not consulted)
                WriteHeaderLine(writer, _gffVersionLowercaseKey, "2");
                WriteCommonMetadata(commonSeq, sequenceList, writer, source, version, type, 2);
                break;
            case _sourceVersionKey:
                // only output source if they all match
                if (source != null)
                {
                    WriteHeaderLine(writer, _sourceVersionLowercaseKey, source, version);
                }
                WriteCommonMetadata(commonSeq, sequenceList, writer, source, version, type, 3);
                break;
            case _dateKey:
                // today's date (not the date stored in the metadata)
                WriteHeaderLine(writer, _dateLowercaseKey, DateTime.Today.ToString("yyyy-MM-dd"));
                WriteCommonMetadata(commonSeq, sequenceList, writer, source, version, type, 4);
                break;
            case _typeKey:
                // type header
                if (type != null)
                {
                    // output that the types all match; don't need to output if DNA, as DNA is default
                    if (type != MoleculeType.DNA.ToString())
                    {
                        WriteHeaderLine(writer, _typeLowercaseKey, type);
                    }
                }
                else if (totalTypeCount == 0)
                {
                    foreach (ISequence sequence in sequenceList)
                    {
                        // NOTE(review): this assigns the method-level 'type', so after the loop it
                        // holds the last sequence's type and is no longer null for later cases -
                        // confirm this is intended.
                        type = GetGenericTypeString(sequence.MoleculeType);
                        // only output seq-specific type header if this seq won't have its type
                        // output as part of a sequence data header; don't need to output if DNA,
                        // as DNA is default
                        if (type != MoleculeType.DNA.ToString() && (!ShouldWriteSequenceData || sequence.Count == 0))
                        {
                            WriteHeaderLine(writer, _typeLowercaseKey, type, sequence.DisplayID);
                        }
                    }
                }
                break;
            case _multiTypeKey:
                if (totalTypeCount > 0)
                {
                    if (type == null)
                    {
                        // types differ: write the type of the sequence this entry names
                        seq = sequenceList.FirstOrDefault(S => S.DisplayID.Equals(value));
                        if (seq != null)
                        {
                            WriteHeaderLine(writer, _typeLowercaseKey, seq.MoleculeType.ToString(), seq.DisplayID);
                            typeExceptionList.Add(seq.DisplayID);
                        }
                        currentTypeCount++;
                        // after the last per-sequence type entry, cover the sequences that had none
                        if (currentTypeCount == totalTypeCount)
                        {
                            foreach (ISequence sequence in sequenceList)
                            {
                                if (typeExceptionList.Contains(sequence.DisplayID))
                                {
                                    continue;
                                }
                                // NOTE(review): also overwrites the method-level 'type' - see the
                                // matching note in the _typeKey case.
                                type = GetGenericTypeString(sequence.MoleculeType);
                                // only output seq-specific type header if this seq won't have its type
                                // output as part of a sequence data header; don't need to output if DNA,
                                // as DNA is default
                                if (type != MoleculeType.DNA.ToString() && (!ShouldWriteSequenceData || sequence.Count == 0))
                                {
                                    WriteHeaderLine(writer, _typeLowercaseKey, type, sequence.DisplayID);
                                }
                            }
                        }
                    }
                    else
                    {
                        // output that the types all match; don't need to output if DNA, as DNA is default
                        if (type != MoleculeType.DNA.ToString())
                        {
                            WriteHeaderLine(writer, _typeLowercaseKey, type);
                        }
                        // zeroing the count makes the remaining per-sequence type entries no-ops
                        totalTypeCount = 0;
                    }
                }
                break;
            case _multiSeqDataKey:
                // sequence data
                if (ShouldWriteSequenceData)
                {
                    seq = sequenceList.FirstOrDefault(S => S.DisplayID.Equals(value));
                    if (seq != null)
                    {
                        WriteSeqData(seq, type, writer);
                        seqDataExceptionList.Add(seq.DisplayID);
                    }
                    totalSeqData--;
                    // after the last sequence-data entry, write data for every other sequence
                    if (totalSeqData == 0)
                    {
                        foreach (ISequence sequence in sequenceList)
                        {
                            if (seqDataExceptionList.Contains(sequence.DisplayID))
                            {
                                continue;
                            }
                            WriteSeqData(sequence, type, writer);
                        }
                    }
                }
                break;
            case _multiSeqRegKey:
                seq = sequenceList.FirstOrDefault(S => S.DisplayID.Equals(value));
                if (seq != null)
                {
                    // a region header needs both start and end metadata
                    if (seq.Metadata.ContainsKey(_startKey) && seq.Metadata.ContainsKey(_endKey))
                    {
                        WriteHeaderLine(writer, _seqRegKey, seq.DisplayID, seq.Metadata[_startKey] as string, seq.Metadata[_endKey] as string);
                    }
                    seqRegExceptionList.Add(value);
                }
                totalSeqRegs--;
                // after the last sequence-region entry, cover the sequences that had none
                if (totalSeqRegs == 0)
                {
                    // sequence-region header
                    foreach (ISequence sequence in sequenceList)
                    {
                        if (seqRegExceptionList.Contains(sequence.DisplayID))
                        {
                            continue;
                        }
                        if (sequence.Metadata.ContainsKey(_startKey) && sequence.Metadata.ContainsKey(_endKey))
                        {
                            WriteHeaderLine(writer, _seqRegKey, sequence.DisplayID, sequence.Metadata[_startKey] as string, sequence.Metadata[_endKey] as string);
                        }
                    }
                }
                break;
        }
    }
}
// Processes headers, which are a type of comment.
//
// Consumes every leading line that (after trimming leading whitespace) starts with the
// comment mark. Lines starting with the header mark are parsed as directives and recorded
// in _commonSeq.Metadata or on the sequence they name; every other comment line is
// accumulated and stored as a numbered comment section so the original order of
// comments and headers is retained.
//
// mbfReader: reader positioned on the first line to examine; on return it is positioned
//            on the first line that is not a comment (or past the end of the input).
//
// Throws NotSupportedException for a gff-version other than "2" and FormatException for
// an invalid or missing date or type, or for malformed embedded sequence data.
private void ParseHeaders(MBFTextReader mbfReader)
{
    string comments = string.Empty;
    int commentsCount = 1;

    while (mbfReader.HasLines && mbfReader.Line.TrimStart().StartsWith(_commentMark, StringComparison.Ordinal))
    {
        Sequence specificSeq = null;

        // process headers, but ignore other comments
        if (mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
        {
            string[] fields = mbfReader.GetLineField(3).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Flush any comments collected since the previous header.
            if (!string.IsNullOrEmpty(comments))
            {
                _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                comments = string.Empty;
                commentsCount++;
            }

            // A header mark with nothing after it yields no fields; a null directive
            // matches none of the cases below, so such a line is simply skipped
            // instead of failing on fields[0].
            string directive = fields.Length > 0 ? fields[0].ToUpperInvariant() : null;

            switch (directive)
            {
                case _gffVersionKey:
                    if (fields.Length > 1 && fields[1] != "2")
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.GffUnsupportedVersion,
                            mbfReader.LocationString);
                        Trace.Report(message);
                        throw new NotSupportedException(message);
                    }

                    // Store "GFF-VERSION" to keep the order of comments/headers.
                    // The check above tolerates a header without a version number,
                    // so the stored value must not index past the end of fields.
                    _commonSeq.Metadata[_gffVersionKey] = fields.Length > 1 ? fields[1] : string.Empty;
                    break;

                case _sourceVersionKey:
                    // NOTE(review): assumes both a source and a version token are present;
                    // a shorter header still fails with IndexOutOfRangeException.
                    MetadataListItem <string> sourceVersion = new MetadataListItem <string>(_sourceVersionKey, string.Empty);
                    sourceVersion.SubItems.Add(_sourceKey, fields[1]);
                    sourceVersion.SubItems.Add(_versionKey, fields[2]);
                    _commonSeq.Metadata[_sourceVersionKey] = sourceVersion;
                    break;

                case _dateKey:
                    DateTime date;

                    // A date header without a value is reported the same way as an unparsable one.
                    if (fields.Length < 2 || !DateTime.TryParse(fields[1], out date))
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserInvalidDate,
                            mbfReader.LocationString);
                        Trace.Report(message);
                        throw new FormatException(message);
                    }

                    _commonSeq.Metadata[_dateLowerCaseKey] = date;
                    break;

                case _typeKey:
                    // The type token is mandatory; report its absence as an invalid type.
                    if (fields.Length < 2)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.InvalidType,
                            mbfReader.LocationString);
                        Trace.Report(message);
                        throw new FormatException(message);
                    }

                    if (fields.Length == 2)
                    {
                        // Type without a sequence name applies to the common sequence.
                        _commonSeq.MoleculeType = GetMoleculeType(fields[1]);
                        if (_commonSeq.MoleculeType == MoleculeType.Invalid)
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.InvalidType,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        // Store "TYPE" to keep the order of comments/headers.
                        _commonSeq.Metadata[_typeKey] = fields[1];
                    }
                    else
                    {
                        // Type followed by a sequence name applies to that sequence only.
                        specificSeq = GetSpecificSequence(fields[2], GetMoleculeType(fields[1]), mbfReader, false);
                        if (specificSeq.MoleculeType == MoleculeType.Invalid)
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.InvalidType,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        // Store "TYPE" to keep the order of comments/headers.
                        // Store seq id as value.
                        _commonSeq.Metadata[_multiTypeKey + fields[2]] = fields[2];
                    }

                    break;

                case "DNA":
                case "RNA":
                case "PROTEIN":
                    // Embedded sequence data: header lines up to the matching end marker.
                    // NOTE(review): assumes the sequence name token (fields[1]) is present.
                    specificSeq = GetSpecificSequence(fields[1], GetMoleculeType(fields[0]), mbfReader, false);
                    mbfReader.GoToNextLine();

                    // Store seq id as value.
                    _commonSeq.Metadata[_multiSeqDataKey + fields[1]] = fields[1];

                    while (mbfReader.HasLines && mbfReader.Line != _seqDataEnd + fields[0])
                    {
                        // every data line must itself carry the header mark
                        if (!mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffInvalidSequence,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        specificSeq.InsertRange(specificSeq.Count, mbfReader.GetLineField(3));
                        mbfReader.GoToNextLine();
                    }

                    break;

                case _seqRegKey:
                    // NOTE(review): assumes name, start and end tokens are all present.
                    specificSeq = GetSpecificSequence(fields[1], MoleculeType.Invalid, mbfReader, false);
                    specificSeq.Metadata["start"] = fields[2];
                    specificSeq.Metadata["end"] = fields[3];

                    // Store seq id as value.
                    _commonSeq.Metadata[_multiSeqRegKey + fields[1]] = fields[1];
                    break;
            }
        }
        else
        {
            // plain comment: accumulate until the next header (or the end of the header section)
            comments = string.IsNullOrEmpty(comments) ? mbfReader.Line : comments + Environment.NewLine + mbfReader.Line;
        }

        mbfReader.GoToNextLine();
    }

    // Comments that trail the last header still need to be stored.
    if (!string.IsNullOrEmpty(comments))
    {
        _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
    }
}
/// <summary>
/// Parses the consecutive feature lines for one sequence.
/// Each feature is stored as a MetadataListItem in the "features" list of the
/// sequence named in the first column; the fields of a feature become its sub-items.
/// </summary>
/// <param name="reader">Reader supplying the lines that follow <paramref name="line"/>.</param>
/// <param name="line">The first line to process (may be blank or null).</param>
/// <returns>
/// The first line that was not consumed: a feature line belonging to a different
/// sequence, or null when the input is exhausted.
/// </returns>
/// <exception cref="InvalidDataException">
/// A feature line has the wrong number of fields, or its start, end, score,
/// strand or frame field is not valid.
/// </exception>
private string ParseFeatures(TextReader reader, string line)
{
    // The non-comment lines contain features, which are each stored as MetadataListItems.
    // The fields of each feature are referred to as sub-items. For GFF, these have
    // unique keys, but for compatibility with our internal representation of features from
    // GenBank format, each sub-item is a list of strings, rather than a simple string.
    List<MetadataListItem<List<string>>> featureList = null;
    Tuple<ISequence, List<byte>> specificSeq = null;

    while (line != null)
    {
        // Blank lines (leading, embedded or trailing) and header lines carry no
        // feature data; skip them rather than rejecting the file.
        if (line.Length == 0 || line.StartsWith(HeaderMark, StringComparison.Ordinal))
        {
            line = reader.ReadLine();
            continue;
        }

        string[] featureFields = line.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
        if (featureFields.Length < MinFieldsPerFeature || featureFields.Length > MaxFieldsPerFeature)
        {
            string message = string.Format(
                CultureInfo.CurrentCulture,
                Resource.INVALID_INPUT_FILE,
                this.Name);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        // The featureFields array should now contain the following fields:
        // featureFields[0]: sequence name
        // featureFields[1]: source
        // featureFields[2]: feature name
        // featureFields[3]: start
        // featureFields[4]: end
        // featureFields[5]: score
        // featureFields[6]: strand
        // featureFields[7]: frame
        // featureFields[8]: attributes (optional)

        // Process sequence name.
        if (specificSeq == null)
        {
            specificSeq = this.GetSpecificSequence(featureFields[0], null);

            // Retrieve features list, or add empty features list to metadata if this
            // is the first feature.
            if (specificSeq.Item1.Metadata.ContainsKey("features"))
            {
                featureList = specificSeq.Item1.Metadata["features"] as List<MetadataListItem<List<string>>>;
            }
            else
            {
                featureList = new List<MetadataListItem<List<string>>>();
                specificSeq.Item1.Metadata["features"] = featureList;
            }
        }
        else if (specificSeq.Item1.ID != featureFields[0])
        {
            // don't go to next line; current line still needs to be processed
            break;
        }

        // use feature name as key; attributes field is stored as free text
        string attributes = (featureFields.Length == 9 ? featureFields[8] : string.Empty);
        var feature = new MetadataListItem<List<string>>(featureFields[2], attributes);

        // source
        feature.SubItems.Add(SourceKey, new List<string> { featureFields[1] });

        // GFF is a machine-readable format, so numeric fields are validated with the
        // invariant culture; with the current culture a value such as "0.95" is
        // rejected on locales whose decimal separator is not '.'.
        // The number styles are the defaults of the culture-less TryParse overloads.
        int parsedInt;
        double parsedScore;

        // start is an int
        if (!int.TryParse(featureFields[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out parsedInt))
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Resource.GffInvalidField,
                "start",
                featureFields[3]);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        feature.SubItems.Add("start", new List<string> { featureFields[3] });

        // end is an int
        if (!int.TryParse(featureFields[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out parsedInt))
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Resource.GffInvalidField,
                "end",
                featureFields[4]);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        feature.SubItems.Add("end", new List<string> { featureFields[4] });

        // score is a double, or a dot as a space holder
        if (featureFields[5] != ".")
        {
            if (!double.TryParse(
                    featureFields[5],
                    NumberStyles.Float | NumberStyles.AllowThousands,
                    CultureInfo.InvariantCulture,
                    out parsedScore))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Resource.GffInvalidField,
                    "score",
                    featureFields[5]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("score", new List<string> { featureFields[5] });
        }

        // strand is + or -, or a dot as a space holder
        if (featureFields[6] != ".")
        {
            if (featureFields[6] != "+" && featureFields[6] != "-")
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Resource.GffInvalidField,
                    "strand",
                    featureFields[6]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("strand", new List<string> { featureFields[6] });
        }

        // frame is an int, or a dot as a space holder
        if (featureFields[7] != ".")
        {
            if (!int.TryParse(featureFields[7], NumberStyles.Integer, CultureInfo.InvariantCulture, out parsedInt))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Resource.GffInvalidField,
                    "frame",
                    featureFields[7]);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            feature.SubItems.Add("frame", new List<string> { featureFields[7] });
        }

        // done with that one
        featureList.Add(feature);
        line = reader.ReadLine();
    }

    // if any seqs are left in sequencesInHeader add them to sequences
    if (this.sequencesInHeader.Count > 0)
    {
        this.sequences.AddRange(this.sequencesInHeader);
        this.sequencesInHeader.Clear();
    }

    return line;
}
/// <summary>
/// Process the headers.
/// Reads lines while they (after trimming leading whitespace) start with the comment
/// mark. Lines starting with the header mark are parsed as directives and recorded in
/// the common sequence's metadata or on the sequence they name; every other comment
/// line is accumulated and stored as a numbered comment section so the original
/// order of comments and headers is retained. Blank lines are skipped.
/// </summary>
/// <param name="reader">Reader positioned at the start of the header section.</param>
/// <returns>The first non-blank line that is not a comment, or null at end of input.</returns>
/// <exception cref="NotSupportedException">The gff-version header is not "2".</exception>
/// <exception cref="FormatException">
/// A date or type header is invalid or lacks its value, or embedded sequence data
/// contains a line without the header mark.
/// </exception>
private string ParseHeaders(TextReader reader)
{
    string comments = string.Empty;
    int commentsCount = 1;
    string line = reader.ReadLine();

    // skip leading blank lines
    while (line == "")
    {
        line = reader.ReadLine();
    }

    while ((line != null) && line.TrimStart().StartsWith(CommentMark, StringComparison.Ordinal))
    {
        // process headers, but ignore other comments
        if (line.StartsWith(HeaderMark, StringComparison.Ordinal))
        {
            // drop the first two characters (the header marker) before tokenizing
            string[] fields = line.Substring(2).Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // Flush any comments collected since the previous header.
            if (!string.IsNullOrEmpty(comments))
            {
                this.commonSeq.Metadata[CommentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                comments = string.Empty;
                commentsCount++;
            }

            Tuple<ISequence, List<byte>> specificSeq = null;

            // A header mark with nothing after it yields no fields; a null directive
            // matches none of the cases below, so such a line is simply skipped
            // instead of failing on fields[0].
            string directive = fields.Length > 0 ? fields[0].ToUpperInvariant() : null;

            switch (directive)
            {
                case GffVersionKey:
                    if (fields.Length > 1 && fields[1] != "2")
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Resource.GffUnsupportedVersion);
                        Trace.Report(message);
                        throw new NotSupportedException(message);
                    }

                    // Store "GFF-VERSION" to keep the order of comments/headers.
                    // The check above tolerates a header without a version number,
                    // so the stored value must not index past the end of fields.
                    this.commonSeq.Metadata[GffVersionKey] = fields.Length > 1 ? fields[1] : string.Empty;
                    break;

                case SourceVersionKey:
                    // NOTE(review): assumes both a source and a version token are present;
                    // a shorter header still fails with IndexOutOfRangeException.
                    var sourceVersion = new MetadataListItem<string>(SourceVersionKey, string.Empty);
                    sourceVersion.SubItems.Add(SourceKey, fields[1]);
                    sourceVersion.SubItems.Add(VersionKey, fields[2]);
                    this.commonSeq.Metadata[SourceVersionKey] = sourceVersion;
                    break;

                case DateKey:
                    DateTime date;

                    // A date header without a value is reported the same way as an unparsable one.
                    if (fields.Length < 2 || !DateTime.TryParse(fields[1], out date))
                    {
                        string message = String.Format(CultureInfo.CurrentCulture, Resource.ParserInvalidDate);
                        Trace.Report(message);
                        throw new FormatException(message);
                    }

                    this.commonSeq.Metadata[DateLowerCaseKey] = date;
                    break;

                case TypeKey:
                    // The type token is mandatory; report its absence as an invalid type.
                    if (fields.Length < 2)
                    {
                        string message = String.Format(CultureInfo.CurrentCulture, Resource.InvalidType);
                        Trace.Report(message);
                        throw new FormatException(message);
                    }

                    if (fields.Length == 2)
                    {
                        // Type without a sequence name applies to the common sequence.
                        this.commonSeq.Alphabet = GetAlphabetType(fields[1]);
                        if (this.commonSeq.Alphabet == null)
                        {
                            string message = String.Format(CultureInfo.CurrentCulture, Resource.InvalidType);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        // Store "TYPE" to keep the order of comments/headers.
                        this.commonSeq.Metadata[TypeKey] = fields[1];
                    }
                    else
                    {
                        // Type followed by a sequence name applies to that sequence only.
                        specificSeq = this.GetSpecificSequence(fields[2], GetAlphabetType(fields[1]), false);
                        if (specificSeq.Item1.Alphabet == null)
                        {
                            string message = String.Format(CultureInfo.CurrentCulture, Resource.InvalidType);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        // Store "TYPE" to keep the order of comments/headers.
                        // Store seq id as value.
                        this.commonSeq.Metadata[MultiTypeKey + fields[2]] = fields[2];
                    }

                    break;

                case "DNA":
                case "RNA":
                case "PROTEIN":
                    // Embedded sequence data: header lines up to the matching end marker.
                    // NOTE(review): assumes the sequence name token (fields[1]) is present.
                    line = reader.ReadLine();

                    // Store seq id as value.
                    this.commonSeq.Metadata[MultiSeqDataKey + fields[1]] = fields[1];
                    specificSeq = this.GetSpecificSequence(fields[1], GetAlphabetType(fields[0]), false);

                    while ((line != null) && line != SeqDataEnd + fields[0])
                    {
                        // every data line must itself carry the header mark
                        if (!line.StartsWith(HeaderMark, StringComparison.Ordinal))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Resource.GffInvalidSequence);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        // append the symbols that follow the two-character marker
                        specificSeq.Item2.AddRange(Encoding.UTF8.GetBytes(line.Substring(2)));
                        line = reader.ReadLine();
                    }

                    break;

                case SeqRegKey:
                    // NOTE(review): assumes name, start and end tokens are all present.
                    specificSeq = this.GetSpecificSequence(fields[1], null, false);
                    specificSeq.Item1.Metadata["start"] = fields[2];
                    specificSeq.Item1.Metadata["end"] = fields[3];

                    // Store seq id as value.
                    this.commonSeq.Metadata[MultiSeqRegKey + fields[1]] = fields[1];
                    break;
            }
        }
        else
        {
            // plain comment: accumulate until the next header (or the end of the header section)
            comments = string.IsNullOrEmpty(comments) ? line : comments + Environment.NewLine + line;
        }

        // advance to the next non-blank line
        line = reader.ReadLine();
        while (line == "")
        {
            line = reader.ReadLine();
        }
    }

    // Comments that trail the last header still need to be stored.
    if (!string.IsNullOrEmpty(comments))
    {
        this.commonSeq.Metadata[CommentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
    }

    return line;
}
// Returns a tab plus the sub-item text or a "." if the sub-item is absent.
// Only the first entry of the sub-item's list is used. A sub-item whose list is
// null or empty is treated as absent, so the "." placeholder is written instead
// of failing on the [0] access.
//
// feature:     the feature whose sub-items are searched.
// subItemName: key of the sub-item to look up.
private string GetSubItemString(MetadataListItem <List <string> > feature, string subItemName)
{
    // single dictionary lookup instead of ContainsKey followed by the indexer
    List <string> values;
    bool hasValue = feature.SubItems.TryGetValue(subItemName, out values)
                    && values != null
                    && values.Count > 0;

    return "\t" + (hasValue ? values[0] : ".");
}
/// <summary>
/// Creates an exon. Everything except <paramref name="featureMetadata"/> is handed
/// to the base constructor; the metadata is kept on <c>FeatureMetadata</c>.
/// </summary>
/// <param name="parent">Transcript passed to the base constructor as the parent.</param>
/// <param name="Sequence">Sequence passed to the base constructor.</param>
/// <param name="source">Source value passed to the base constructor.</param>
/// <param name="oneBasedStart">Start position (one-based, going by the parameter name).</param>
/// <param name="oneBasedEnd">End position (one-based, going by the parameter name).</param>
/// <param name="chromID">Chromosome identifier passed to the base constructor.</param>
/// <param name="strand">Strand value passed to the base constructor.</param>
/// <param name="featureMetadata">Feature metadata stored on this exon.</param>
public Exon(
    Transcript parent,
    ISequence Sequence,
    string source,
    long oneBasedStart,
    long oneBasedEnd,
    string chromID,
    string strand,
    MetadataListItem <List <string> > featureMetadata)
    : base(parent, chromID, source, strand, oneBasedStart, oneBasedEnd, Sequence)
{
    this.FeatureMetadata = featureMetadata;
}
/// <summary>
/// Looks up a sub-item of a feature and returns the first entry of its list,
/// or "." when the sub-item is absent or its list has no entries.
/// No separator is added to the returned text.
/// </summary>
/// <param name="feature">The feature whose sub-items are searched.</param>
/// <param name="subItemName">Key of the sub-item to look up.</param>
/// <returns>The first entry of the sub-item's list, or ".".</returns>
private string GetSubItemString(MetadataListItem<List<string>> feature, string subItemName)
{
    List<string> entries;
    bool found = feature.SubItems.TryGetValue(subItemName, out entries);

    // placeholder for a missing key or an empty list
    if (!found || entries.Count < 1)
    {
        return ".";
    }

    return entries[0];
}