/// <summary>
/// Creates a Sequence over the given alphabet from an array of symbol bytes.
/// The bytes are copied into a newly allocated internal array.
/// </summary>
/// <param name="alphabet">Alphabet to which this instance should conform.</param>
/// <param name="values">An array of bytes representing the symbols.</param>
/// <param name="validate">When true, the bytes are checked against the alphabet before being stored;
/// when false, the check is skipped.</param>
/// <exception cref="ArgumentNullException">Thrown when alphabet or values is null.</exception>
public Sequence(IAlphabet alphabet, byte[] values, bool validate)
{
    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    if (values == null)
    {
        throw new ArgumentNullException("values");
    }

    // Only consult the alphabet when the caller asked for validation.
    if (validate && !alphabet.ValidateSequence(values, 0, values.GetLongLength()))
    {
        throw Helper.GenerateAlphabetCheckFailureException(alphabet, values);
    }

    // The instance keeps its own array rather than the caller's.
    byte[] storage = new byte[values.GetLongLength()];
    this._sequenceData = storage;
    this.ID = string.Empty;
    Helper.Copy(values, storage, values.GetLongLength());
    this.Alphabet = alphabet;
    this.Count = storage.GetLongLength();
}
/// <summary>
/// Creates a Sequence over the given alphabet from a string of symbols.
/// The string is converted to bytes with UTF-8 encoding.
/// </summary>
/// <param name="alphabet">Alphabet to which this instance should conform.</param>
/// <param name="sequence">The sequence in string form.</param>
/// <param name="validate">When true, the encoded bytes are checked against the alphabet before
/// being stored; when false, the check is skipped.</param>
/// <exception cref="ArgumentNullException">Thrown when sequence or alphabet is null.</exception>
public Sequence(IAlphabet alphabet, string sequence, bool validate)
{
    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    this.Alphabet = alphabet;
    this.ID = string.Empty;

    byte[] encoded = Encoding.UTF8.GetBytes(sequence);

    // Only consult the alphabet when the caller asked for validation.
    if (validate && !alphabet.ValidateSequence(encoded, 0, encoded.GetLongLength()))
    {
        throw Helper.GenerateAlphabetCheckFailureException(alphabet, encoded);
    }

    // The freshly encoded array is stored directly; no second copy is made.
    this._sequenceData = encoded;
    this.Count = encoded.GetLongLength();
}
/// <summary>
/// Creates a sparse sequence holding a single item at the given position.
///
/// The item must be accepted by the specified alphabet, otherwise an exception is thrown.
/// The index must be non-negative and smaller than int.MaxValue.
/// Count of the new instance is set to index + 1.
/// </summary>
/// <param name="alphabet">
/// The alphabet the sequence uses (e.g. Alphabets.DNA or Alphabets.RNA or Alphabets.Protein)</param>
/// <param name="index">Position of the specified sequence item.</param>
/// <param name="item">A sequence item which is known by the alphabet.</param>
/// <exception cref="ArgumentNullException">Thrown when alphabet is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when index is negative or equals int.MaxValue.</exception>
/// <exception cref="ArgumentException">Thrown when the alphabet rejects the item.</exception>
public SparseSequence(IAlphabet alphabet, int index, byte item)
    : this(alphabet)
{
    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    // int.MaxValue is rejected too: Count below is index + 1, which could not be represented.
    bool indexIsUsable = index >= 0 && index != int.MaxValue;
    if (!indexIsUsable)
    {
        throw new ArgumentOutOfRangeException(
            Properties.Resource.ParameterNameIndex,
            Properties.Resource.SparseSequenceConstructorIndexOutofRange);
    }

    byte[] singleItem = { item };
    if (!alphabet.ValidateSequence(singleItem, 0, 1))
    {
        string invalidSymbolMessage = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.InvalidSymbol,
            item);
        throw new ArgumentException(invalidSymbolMessage);
    }

    Statistics = new SequenceStatistics(alphabet);
    sparseSeqItems.Add(index, item);
    Statistics.Add((char)item);
    Count = index + 1;
}
/// <summary>
/// Initializes a new instance of the Sequence class with specified alphabet and bytes.
/// The bytes are copied into a newly allocated internal array.
/// </summary>
/// <param name="alphabet">Alphabet to which this instance should conform.</param>
/// <param name="values">An array of bytes representing the symbols.</param>
/// <param name="validate">If this flag is true then validation will be done to see whether the data is valid or not,
/// else validation will be skipped.</param>
/// <exception cref="ArgumentNullException">Thrown when alphabet or values is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when validate is true and the alphabet
/// rejects the supplied bytes.</exception>
public Sequence(IAlphabet alphabet, byte[] values, bool validate)
{
    // validate the inputs
    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    if (values == null)
    {
        throw new ArgumentNullException("values");
    }

    if (validate)
    {
        // Validate sequence data
        if (!alphabet.ValidateSequence(values, 0, values.LongLength()))
        {
            throw new ArgumentOutOfRangeException("values");
        }
    }

    this._sequenceData = new byte[values.LongLength()];
    this.ID = string.Empty;

    // Two copy calls are kept behind the SILVERLIGHT switch: one takes a 64-bit length,
    // the other a 32-bit length. Both must target the same backing field.
#if (SILVERLIGHT == false)
    Array.Copy(values, this._sequenceData, values.LongLength);
#else
    Array.Copy(values, this._sequenceData, values.Length);
#endif

    this.Alphabet = alphabet;
    this.Count = this._sequenceData.LongLength();
}
/// <summary>
/// Validate ValidateSequence method.
/// Input Data : Valid Dna/Rna/Protein Sequences.
/// Output Data : Validate Sequences for all Alphabet instances.
/// </summary>
/// <param name="option">Alphabet type whose instance and expected sequence are exercised.</param>
void ValidateSequenceTypes(AlphabetsTypes option)
{
    IAlphabet alphabetInstance = null;
    string sequence = "";

    // Pick the alphabet singleton and read the matching expected sequence from the test data.
    switch (option)
    {
        case AlphabetsTypes.Protein:
            alphabetInstance = ProteinAlphabet.Instance;
            sequence = utilityObj.xmlUtil.GetTextValue(Constants.ProteinDerivedSequenceNode,
                Constants.ExpectedDerivedSequence);
            break;
        case AlphabetsTypes.Rna:
            alphabetInstance = RnaAlphabet.Instance;
            sequence = utilityObj.xmlUtil.GetTextValue(Constants.RnaDerivedSequenceNode,
                Constants.ExpectedDerivedSequence);
            break;
        case AlphabetsTypes.Dna:
            alphabetInstance = DnaAlphabet.Instance;
            sequence = utilityObj.xmlUtil.GetTextValue(Constants.DnaDerivedSequenceNode,
                Constants.ExpectedDerivedSequence);
            break;
    }

    // An option not handled above would otherwise surface as a NullReferenceException below;
    // fail with an explicit message instead.
    Assert.IsNotNull(alphabetInstance,
        string.Concat("Alphabets BVT: No alphabet instance available for option ", option, "."));

    // Only the first four encoded bytes of the sequence are validated.
    Assert.IsTrue(alphabetInstance.ValidateSequence(encodingObj.GetBytes(sequence), 0, 4));
    ApplicationLog.WriteLine(string.Concat(@"Alphabets BVT: Validation of Validate Sequence method for ", option, " completed successfully."));
}
/// <summary>
/// Creates a Sequence over the given alphabet from a string of symbols.
/// The string is converted to bytes with ASCII encoding.
/// </summary>
/// <param name="alphabet">Alphabet to which this instance should conform.</param>
/// <param name="sequence">The sequence in string form.</param>
/// <param name="validate">When true, the encoded bytes are checked against the alphabet before
/// being stored; when false, the check is skipped.</param>
/// <exception cref="ArgumentNullException">Thrown when sequence or alphabet is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when validate is true and the alphabet
/// rejects the encoded bytes.</exception>
public Sequence(IAlphabet alphabet, string sequence, bool validate)
{
    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    this.Alphabet = alphabet;
    this.ID = string.Empty;

    byte[] encoded = Encoding.ASCII.GetBytes(sequence);

    // Only consult the alphabet when the caller asked for validation.
    if (validate && !alphabet.ValidateSequence(encoded, 0, encoded.LongLength))
    {
        throw new ArgumentOutOfRangeException("sequence");
    }

    // The freshly encoded array is stored directly; no second copy is made.
    this.sequenceData = encoded;
    this.Count = encoded.LongLength;
}
/// <summary>
/// Creates a sparse sequence based on the specified parameters.
/// The sequenceItems parameter must contain sequence items known by the specified alphabet,
/// else an exception will occur.
///
/// The index parameter value must be non-negative and smaller than int.MaxValue.
/// When at least one item is supplied, Count is set to index plus the number of items.
/// </summary>
/// <param name="alphabet">
/// The alphabet the sequence uses (e.g. Alphabets.DNA or Alphabets.RNA or Alphabets.Protein)</param>
/// <param name="index">A non-negative value which indicates the start position of the specified sequence items.</param>
/// <param name="sequenceItems">
/// A sequence which contains items known by the alphabet. It is enumerated exactly once.</param>
/// <exception cref="ArgumentNullException">Thrown when alphabet or sequenceItems is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when index is negative or equals int.MaxValue,
/// or when the alphabet rejects the supplied items.</exception>
public SparseSequence(IAlphabet alphabet, int index, IEnumerable<byte> sequenceItems)
    : this(alphabet)
{
    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    if (index < 0 || index == int.MaxValue)
    {
        throw new ArgumentOutOfRangeException(
            Properties.Resource.ParameterNameIndex,
            Properties.Resource.SparseSequenceConstructorIndexOutofRange);
    }

    if (sequenceItems == null)
    {
        throw new ArgumentNullException(Properties.Resource.ParameterNameSequenceItems);
    }

    // Materialize once and use the array everywhere below. Re-enumerating sequenceItems could
    // yield different items than the ones validated (lazy or non-repeatable sources) and made
    // Count disagree with what was actually stored.
    var sequenceArray = sequenceItems.ToArray();
    if (!alphabet.ValidateSequence(sequenceArray, 0, sequenceArray.LongLength))
    {
        throw new ArgumentOutOfRangeException("sequenceItems");
    }

    Statistics = new SequenceStatistics(alphabet);

    int position = index;
    foreach (byte sequenceItem in sequenceArray)
    {
        sparseSeqItems.Add(position, sequenceItem);
        Statistics.Add((char)sequenceItem);
        position++;
    }

    // Count is left untouched for an empty input.
    // NOTE(review): index + Length is plain int arithmetic; confirm that inputs large enough
    // to overflow it cannot reach this constructor.
    if (sequenceArray.Length > 0)
    {
        Count = index + sequenceArray.Length;
    }
}
/// <summary>
/// Reads one four-line FASTQ record (header, sequence, '+' separator, quality scores)
/// from the reader and turns it into a QualitativeSequence.
/// </summary>
/// <param name="reader">Reader to be parsed.</param>
/// <param name="formatType">FASTQ format type passed on to the created sequence.</param>
/// <returns>The parsed QualitativeSequence, or null when the reader is already at end of stream.</returns>
/// <exception cref="Exception">Thrown when a line of the record is missing or malformed, when the
/// sequence and quality lines differ in length, or when the alphabet cannot be identified or
/// rejects the sequence data.</exception>
private IQualitativeSequence ParseOne(StreamReader reader, FastQFormatType formatType)
{
    if (reader.EndOfStream)
    {
        return null;
    }

    // First line: header, must start with '@'.
    string currentLine = ReadNextLine(reader, true);
    bool hasHeader = currentLine != null && currentLine.StartsWith("@", StringComparison.Ordinal);
    if (!hasHeader)
    {
        throw new Exception(
            string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name));
    }

    string id = currentLine.Substring(1).Trim();

    // Second line: the sequence symbols.
    currentLine = ReadNextLine(reader, true);
    if (string.IsNullOrEmpty(currentLine))
    {
        string details = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidSequenceLine, id);
        throw new Exception(
            string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, this.Name, details));
    }

    byte[] sequenceData = Encoding.ASCII.GetBytes(currentLine);

    // Third line: separator, must start with '+'.
    currentLine = ReadNextLine(reader, true);
    bool hasSeparator = currentLine != null && currentLine.StartsWith("+", StringComparison.Ordinal);
    if (!hasSeparator)
    {
        string details = string.Format(
            CultureInfo.CurrentCulture,
            Resource.FastQ_InvalidQualityScoreHeaderLine,
            id);
        throw new Exception(
            string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, this.Name, details));
    }

    // The separator may repeat the id; when it does, it has to match the header.
    string qualScoreId = currentLine.Substring(1).Trim();
    if (qualScoreId.Length > 0 && !id.Equals(qualScoreId))
    {
        string details = string.Format(
            CultureInfo.CurrentCulture,
            Resource.FastQ_InvalidQualityScoreHeaderData,
            id);
        throw new Exception(
            string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, this.Name, details));
    }

    // Fourth line: the quality scores.
    currentLine = ReadNextLine(reader, true);
    if (string.IsNullOrEmpty(currentLine))
    {
        string details = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_EmptyQualityScoreLine, id);
        throw new Exception(
            string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, this.Name, details));
    }

    byte[] qualScores = Encoding.ASCII.GetBytes(currentLine);

    // One quality score is required per sequence symbol.
    if (sequenceData.GetLongLength() != qualScores.GetLongLength())
    {
        string details = string.Format(
            CultureInfo.CurrentCulture,
            Resource.FastQ_InvalidQualityScoresLength,
            id);
        throw new Exception(
            string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, this.Name, details));
    }

    // With a configured alphabet the data is validated against it; otherwise the alphabet is detected.
    IAlphabet alphabet = this.Alphabet;
    if (alphabet != null)
    {
        if (!alphabet.ValidateSequence(sequenceData, 0, sequenceData.GetLongLength()))
        {
            throw new Exception(Resource.InvalidAlphabetType);
        }
    }
    else
    {
        alphabet = Alphabets.AutoDetectAlphabet(sequenceData, 0, sequenceData.GetLongLength(), alphabet);
        if (alphabet == null)
        {
            throw new Exception(Resource.CouldNotIdentifyAlphabetType);
        }
    }

    // The data was checked above, so the constructor's own validation is switched off.
    QualitativeSequence parsed = new QualitativeSequence(alphabet, formatType, sequenceData, qualScores, false);
    parsed.ID = id;
    return parsed;
}
/// <summary>
/// Parses a single sequence record from the reader: a header line starting with '>' followed by
/// one or more lines of sequence symbols, up to the next header or the end of the input.
/// </summary>
/// <param name="reader">Reader to parse.</param>
/// <param name="buffer">Scratch buffer the symbols are accumulated in. It is grown locally when too
/// small; the parameter is passed by value, so the caller's array reference is not replaced.</param>
/// <returns>The parsed sequence, or null when the reader has no more data.</returns>
/// <exception cref="ArgumentNullException">Thrown when reader is null, or when buffer is null and
/// there is data to parse.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when the accumulated data reaches
/// PlatformManager.Services.MaxSequenceSize.</exception>
/// <exception cref="Exception">Thrown when the header or data is missing, when symbols are not valid
/// for the alphabet, or when the records use incompatible alphabets.</exception>
ISequence ParseOne(TextReader reader, byte[] buffer)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    if (reader.Peek() == -1)
    {
        return null;
    }

    if (buffer == null)
    {
        throw new ArgumentNullException("buffer");
    }

    string message;
    string line = reader.ReadLine();

    // Continue reading if blank line found.
    while (line != null && string.IsNullOrEmpty(line))
    {
        line = reader.ReadLine();
    }

    if (line == null || !line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        message = string.Format(
            CultureInfo.InvariantCulture,
            Properties.Resource.INVALID_INPUT_FILE,
            Properties.Resource.FASTA_NAME);
        throw new Exception(message);
    }

    // Everything after the '>' marker is the record name.
    string name = line.Substring(1);
    int bufferPosition = 0;

    // Read next line.
    line = reader.ReadLine();

    // Continue reading if blank line found.
    while (line != null && string.IsNullOrEmpty(line))
    {
        line = reader.ReadLine();
    }

    if (line == null)
    {
        message = string.Format(
            CultureInfo.InvariantCulture,
            Properties.Resource.InvalidSymbolInString,
            string.Empty);
        throw new Exception(message);
    }

    IAlphabet alphabet = Alphabet;
    bool tryAutoDetectAlphabet = alphabet == null;

    do
    {
        // Files > 2G are not supported in this release.
        if (((long)bufferPosition + line.Length) >= PlatformManager.Services.MaxSequenceSize)
        {
            throw new ArgumentOutOfRangeException(
                string.Format(
                    CultureInfo.CurrentUICulture,
                    Properties.Resource.SequenceDataGreaterthan2GB,
                    name));
        }

        // Compare against the real capacity of the buffer rather than the default buffer size,
        // so a caller-supplied buffer smaller than the default is grown instead of overrun.
        int neededSize = bufferPosition + line.Length;
        if (neededSize >= buffer.Length)
        {
            // Grow by the default buffer size, or to the needed size when that is larger.
            int suggestedSize = buffer.Length + PlatformManager.Services.DefaultBufferSize;
            int newSize = neededSize < suggestedSize ? suggestedSize : neededSize;
            Array.Resize(ref buffer, newSize);
        }

        // NOTE(review): capacity and position are tracked with line.Length (characters) while
        // symbols.Length (UTF-8 bytes) is copied; these differ for non-ASCII input - confirm.
        byte[] symbols = Encoding.UTF8.GetBytes(line);

        // Array.Copy -- for performance improvement.
        Array.Copy(symbols, 0, buffer, bufferPosition, symbols.Length);

        // Auto detect alphabet if alphabet is set to null, else validate with already set alphabet
        if (tryAutoDetectAlphabet)
        {
            // If we have a base alphabet we detected earlier, then try that first.
            if (this.baseAlphabet != null
                && this.baseAlphabet.ValidateSequence(buffer, bufferPosition, line.Length))
            {
                alphabet = this.baseAlphabet;
            }
            else
            {
                // Different alphabet - try to auto detect.
                // NOTE(review): ValidateSequence receives line.Length as its third argument, but
                // this call passes bufferPosition + line.Length - confirm which one
                // AutoDetectAlphabet expects.
                this.baseAlphabet = null;
                alphabet = Alphabets.AutoDetectAlphabet(
                    buffer,
                    bufferPosition,
                    bufferPosition + line.Length,
                    alphabet);
                if (alphabet == null)
                {
                    throw new Exception(
                        string.Format(
                            CultureInfo.InvariantCulture,
                            Properties.Resource.InvalidSymbolInString,
                            line));
                }
            }

            // Determine the base alphabet used.
            if (this.baseAlphabet == null)
            {
                this.baseAlphabet = alphabet;
            }
            else
            {
                // If they are not the same, then this might be an error.
                if (this.baseAlphabet != alphabet)
                {
                    // If the new alphabet includes all the base alphabet then use it instead.
                    // This happens when we hit an ambiguous form of the alphabet later in the file.
                    if (!this.baseAlphabet.HasAmbiguity
                        && Alphabets.GetAmbiguousAlphabet(this.baseAlphabet) == alphabet)
                    {
                        this.baseAlphabet = alphabet;
                    }
                    else if (alphabet.HasAmbiguity
                        || Alphabets.GetAmbiguousAlphabet(alphabet) != this.baseAlphabet)
                    {
                        throw new Exception(Properties.Resource.FastAContainsMorethanOnebaseAlphabet);
                    }
                }
            }
        }
        else
        {
            // Validate against supplied alphabet.
            if (!alphabet.ValidateSequence(buffer, bufferPosition, line.Length))
            {
                throw new Exception(
                    string.Format(
                        CultureInfo.InvariantCulture,
                        Properties.Resource.InvalidSymbolInString,
                        line));
            }
        }

        bufferPosition += line.Length;

        // The next record starts here; leave its header for the next call.
        if (reader.Peek() == (byte)'>')
        {
            break;
        }

        // Read next line.
        line = reader.ReadLine();

        // Continue reading if blank line found.
        while (line != null && string.IsNullOrEmpty(line) && reader.Peek() != (byte)'>')
        {
            line = reader.ReadLine();
        }
    }
    while (line != null);

    // Truncate buffer to remove trailing 0's
    byte[] tmpBuffer = new byte[bufferPosition];
    Array.Copy(buffer, tmpBuffer, bufferPosition);

    if (tryAutoDetectAlphabet)
    {
        alphabet = this.baseAlphabet;
    }

    // In memory sequence; the symbols were checked above, so constructor validation is off.
    return new Sequence(alphabet, tmpBuffer, false) { ID = name };
}
/// <summary>
/// Parses the records available from the reader and yields one sequence per record.
/// A record is a line starting with '=' (the id), optional metadata lines of the form
/// ":key:value", and one or more data lines starting with '|'.
/// The reader is disposed when enumeration completes or is abandoned.
/// </summary>
/// <param name="reader">Reader to parse.</param>
/// <returns>The parsed sequences, produced lazily. Nothing is yielded when the first line does
/// not start with '='.</returns>
/// <exception cref="FormatException">Thrown during enumeration when a record has no data line, or
/// when no alphabet accepts the symbols of a record.</exception>
public IEnumerable<ISequence> Parse(StreamReader reader)
{
    using (reader)
    {
        // Read the first non-blank line
        string line = ReadLine(reader);
        if (line == null || !line.StartsWith("=", StringComparison.Ordinal))
        {
            yield break;
        }

        do
        {
            // Get the name of the sequence.
            string id = line.Substring(1);

            // Look for Metadata: ":key:value" lines. The value may itself contain ':' characters,
            // so everything after the key is joined back together.
            var metadata = new Dictionary<string, string>();
            while ((line = ReadLine(reader)) != null)
            {
                if (!line.StartsWith(":", StringComparison.Ordinal))
                {
                    break;
                }

                string[] keyValue = line.Split(new[] { ':' });
                metadata.Add(keyValue[1], string.Join(":", keyValue.Skip(2)));
            }

            // Now read the data.
            if (line == null)
            {
                yield break;
            }

            if (!line.StartsWith("|", StringComparison.Ordinal))
            {
                throw new FormatException("Missing Sequence Data");
            }

            int count = 0;
            byte[] data = new byte[line.Length - 1];
            while (line != null && line.StartsWith("|", StringComparison.Ordinal))
            {
                int newDataSize = line.Length - 1;

                // Not enough space - increase our array size.
                if (newDataSize + count > data.Length)
                {
                    Array.Resize(ref data, newDataSize + count);
                }

                // Add the bytes - skip the first byte
                Array.Copy(Encoding.ASCII.GetBytes(line), 1, data, count, newDataSize);
                count += newDataSize;

                line = ReadLine(reader);
            }

            // If we have not established the alphabet for this file, do so now.
            if (Alphabet == null)
            {
                // Try DNA, RNA and then finally Protein. The property is assigned only after a
                // candidate accepted the data, so a failed detection does not leave a wrong
                // alphabet behind for later parses.
                IAlphabet detected = null;
                IAlphabet[] candidates =
                {
                    DnaAlphabet.Instance,
                    RnaAlphabet.Instance,
                    ProteinAlphabet.Instance
                };

                foreach (IAlphabet candidate in candidates)
                {
                    if (candidate.ValidateSequence(data, 0, count))
                    {
                        detected = candidate;
                        break;
                    }
                }

                if (detected == null)
                {
                    throw new FormatException("Failed to identify proper alphabet for symbols.");
                }

                Alphabet = detected;
            }

            // Create the sequence
            Sequence sequence = new Sequence(Alphabet, data, false) { ID = id };

            // Add the metadata to the sequence
            foreach (var kvp in metadata)
            {
                sequence.Metadata.Add(kvp.Key, kvp.Value);
            }

            // Return it as part of our enumerable.
            yield return sequence;
        }
        while (line != null && line.StartsWith("=", StringComparison.Ordinal));
    }
}
/// <summary>
/// Reads one four-line FASTQ record (header, sequence, '+' separator, quality scores) from the
/// stream and turns it into a QualitativeSequence. Empty lines before each of the four lines
/// are skipped.
/// </summary>
/// <param name="streamReader">Stream to be parsed.</param>
/// <returns>Returns a QualitativeSequence.</returns>
/// <exception cref="FileFormatException">Thrown when the stream is at its end, when a line of the
/// record is missing or malformed, when the sequence and quality lines differ in length, or when
/// the alphabet cannot be identified or rejects the sequence data.</exception>
private QualitativeSequence ParseOne(StreamReader streamReader)
{
    IAlphabet alphabet = this.Alphabet;
    bool autoDetectFastQFormat = this.AutoDetectFastQFormat;
    FastQFormatType formatType = this.FormatType;

    // Without a configured alphabet, it has to be detected from the data.
    bool tryAutoDetectAlphabet = alphabet == null;

    if (streamReader.EndOfStream)
    {
        throw new FileFormatException(
            string.Format(
                CultureInfo.InvariantCulture,
                Properties.Resource.INVALID_INPUT_FILE,
                Properties.Resource.FastQName));
    }

    // First line: header, must start with '@'.
    string line = streamReader.ReadLine();
    while (line != null && line.Length == 0)
    {
        line = streamReader.ReadLine();
    }

    bool hasHeader = line != null && line.StartsWith("@", StringComparison.Ordinal);
    if (!hasHeader)
    {
        throw new FileFormatException(
            string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name));
    }

    string id = line.Substring(1).Trim();

    // Second line: the sequence symbols.
    line = streamReader.ReadLine();
    while (line != null && line.Length == 0)
    {
        line = streamReader.ReadLine();
    }

    if (string.IsNullOrEmpty(line))
    {
        string details = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.FastQ_InvalidSequenceLine,
            id);
        throw new FileFormatException(
            string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, this.Name, details));
    }

    byte[] sequenceData = UTF8Encoding.UTF8.GetBytes(line);

    // Third line: separator, must start with '+'.
    line = streamReader.ReadLine();
    while (line != null && line.Length == 0)
    {
        line = streamReader.ReadLine();
    }

    bool hasSeparator = line != null && line.StartsWith("+", StringComparison.Ordinal);
    if (!hasSeparator)
    {
        string details = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.FastQ_InvalidQualityScoreHeaderLine,
            id);
        throw new FileFormatException(
            string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, this.Name, details));
    }

    // The separator may repeat the id; when it does, it has to match the header.
    string qualScoreId = line.Substring(1).Trim();
    if (qualScoreId.Length > 0 && !id.Equals(qualScoreId))
    {
        string details = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.FastQ_InvalidQualityScoreHeaderData,
            id);
        throw new FileFormatException(
            string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, this.Name, details));
    }

    // Fourth line: the quality scores.
    line = streamReader.ReadLine();
    while (line != null && line.Length == 0)
    {
        line = streamReader.ReadLine();
    }

    if (string.IsNullOrEmpty(line))
    {
        string details = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.FastQ_EmptyQualityScoreLine,
            id);
        throw new FileFormatException(
            string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, this.Name, details));
    }

    byte[] qualScores = UTF8Encoding.UTF8.GetBytes(line);

    // One quality score is required per sequence symbol.
    if (sequenceData.LongLength() != qualScores.LongLength())
    {
        string details = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.FastQ_InvalidQualityScoresLength,
            id);
        throw new FileFormatException(
            string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, this.Name, details));
    }

    // Detect the alphabet when none was configured, otherwise validate against the configured one.
    if (tryAutoDetectAlphabet)
    {
        alphabet = Alphabets.AutoDetectAlphabet(sequenceData, 0, sequenceData.LongLength(), alphabet);
        if (alphabet == null)
        {
            throw new FileFormatException(Properties.Resource.CouldNotIdentifyAlphabetType);
        }
    }
    else if (!alphabet.ValidateSequence(sequenceData, 0, sequenceData.LongLength()))
    {
        throw new FileFormatException(Properties.Resource.InvalidAlphabetType);
    }

    // Identify fastq format type if AutoDetectFastQFormat property is set to true.
    if (autoDetectFastQFormat)
    {
        formatType = IdentifyFastQFormatType(qualScores);
    }

    // The data was checked above, so the constructor's own validation is switched off.
    QualitativeSequence parsed = new QualitativeSequence(alphabet, formatType, sequenceData, qualScores, false)
    {
        ID = id
    };

    // Update the properties so that next parse will use this data.
    this.FormatType = formatType;

    return parsed;
}
/// <summary>
/// Creates a sparse sequence whose items start at the given position.
/// Every item in sequenceItems must be accepted by the specified alphabet,
/// otherwise an exception is thrown.
///
/// The index must be non-negative and smaller than int.MaxValue.
/// When at least one item is supplied, Count is set to index plus the number of items.
/// </summary>
/// <param name="alphabet">
/// The alphabet the sequence uses (e.g. Alphabets.DNA or Alphabets.RNA or Alphabets.Protein)</param>
/// <param name="index">A non-negative value which indicates the start position of the specified sequence items.</param>
/// <param name="sequenceItems">
/// A sequence which contains items known by the alphabet. It is enumerated once.</param>
/// <exception cref="ArgumentNullException">Thrown when alphabet or sequenceItems is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when index is negative or equals int.MaxValue,
/// or when the alphabet rejects the supplied items.</exception>
public SparseSequence(IAlphabet alphabet, int index, IEnumerable<byte> sequenceItems)
    : this(alphabet)
{
    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    bool indexIsUsable = index >= 0 && index != int.MaxValue;
    if (!indexIsUsable)
    {
        throw new ArgumentOutOfRangeException(
            Properties.Resource.ParameterNameIndex,
            Properties.Resource.SparseSequenceConstructorIndexOutofRange);
    }

    if (sequenceItems == null)
    {
        throw new ArgumentNullException(Properties.Resource.ParameterNameSequenceItems);
    }

    // Snapshot the items so validation and storage work on the same data.
    byte[] items = sequenceItems.ToArray();
    bool itemsAccepted = alphabet.ValidateSequence(items, 0, items.GetLongLength());
    if (!itemsAccepted)
    {
        throw new ArgumentOutOfRangeException("sequenceItems");
    }

    Statistics = new SequenceStatistics(alphabet);

    // Store the items at consecutive positions beginning at index.
    int position = index;
    foreach (byte current in items)
    {
        sparseSeqItems.Add(position, current);
        Statistics.Add((char)current);
        position++;
    }

    // Count is left untouched for an empty input.
    if (items.Length != 0)
    {
        Count = index + items.Length;
    }
}