/// <summary>
/// Converts one delimited line of the text file into a sequence.
/// The line must split on <c>Delimiter</c> into exactly two fields:
/// the sequence ID followed by the sequence symbols.
/// </summary>
/// <param name="line">The line of text to convert.</param>
/// <returns>A sequence built from the second field, with its ID set to the first field.</returns>
private ISequence ParseLine(string line)
{
    string[] fields = line.Split(this.Delimiter);
    if (fields.Length != 2)
    {
        throw new Exception(string.Format(CultureInfo.InvariantCulture, Resource.INVALID_INPUT_FILE, line));
    }

    string sequenceId = fields[0];
    string symbolText = fields[1];

    // Prefer the alphabet configured on the parser; only fall back to detection when none is set.
    IAlphabet sequenceAlphabet = this.Alphabet;
    if (sequenceAlphabet == null)
    {
        byte[] symbolBytes = Encoding.UTF8.GetBytes(symbolText);
        sequenceAlphabet = Alphabets.AutoDetectAlphabet(symbolBytes, 0, symbolBytes.Length, null);
        if (sequenceAlphabet == null)
        {
            throw new Exception(string.Format(CultureInfo.InvariantCulture, Resource.InvalidSymbolInString, symbolText));
        }
    }

    Sequence parsed = new Sequence(sequenceAlphabet, symbolText);
    parsed.ID = sequenceId;
    return parsed;
}
/// <summary>
/// Builds the consensus of a reference and a query sequence, position by position.
/// </summary>
/// <param name="referenceSequence">Reference sequence; its length determines the consensus length.</param>
/// <param name="querySequence">Query sequence, indexed at the same positions as the reference.</param>
/// <returns>A new sequence holding the consensus symbols.</returns>
protected ISequence MakeConsensus(
    ISequence referenceSequence,
    ISequence querySequence)
{
    if (referenceSequence == null)
    {
        throw new ArgumentNullException("referenceSequence");
    }

    if (querySequence == null)
    {
        throw new ArgumentNullException("querySequence");
    }

    byte[] merged = new byte[referenceSequence.Count];
    int position = 0;
    while (position < referenceSequence.Count)
    {
        // Resolve the pair of symbols found at this position into one consensus symbol.
        byte[] column = new byte[] { referenceSequence[position], querySequence[position] };
        merged[position] = ConsensusResolver.GetConsensus(column);
        position++;
    }

    // Detection starts from the reference alphabet, passed as the already-identified alphabet.
    IAlphabet detected = Alphabets.AutoDetectAlphabet(merged, 0, merged.LongLength, referenceSequence.Alphabet);
    return new Sequence(detected, merged, false);
}
/// <summary>
/// Parses the GenBank ORIGIN section: stores the optional data on the ORIGIN line in the
/// metadata, then collects the sequence symbols from the following space-prefixed lines and
/// stores the resulting sequence in <c>sequenceWithData</c>.
/// </summary>
/// <param name="line">
/// The current line (the ORIGIN line on entry). On exit it holds the first line that is not
/// part of the sequence data, or null when the input is exhausted.
/// </param>
/// <param name="metadata">The GenBank metadata that receives the optional origin text.</param>
/// <param name="stream">The stream reader supplying further lines.</param>
/// <exception cref="InvalidDataException">
/// No alphabet is configured and none could be detected for the sequence data.
/// </exception>
private void ParseOrigin(ref string line, GenBankMetadata metadata, StreamReader stream)
{
    // The origin line can contain optional data; don't put empty string into
    // metadata.
    string lineData = GetLineData(line, DataIndent);
    if (!String.IsNullOrEmpty(lineData))
    {
        metadata.Origin = lineData;
    }

    line = GoToNextLine(line, stream);

    var sequenceBuilder = new StringBuilder();

    // Sequence lines start with a space. The length test prevents an
    // IndexOutOfRangeException on an empty line; such a line ends the section.
    while (line != null && line.Length > 0 && line[0] == ' ')
    {
        // Using a regex is too slow.
        // Layout: a 10-character prefix, then groups of up to 10 symbols, each
        // followed by one separator character (hence the stride of 11).
        int len = line.Length;
        for (int k = 10; k < len; k += 11)
        {
            // Append the slice directly instead of allocating a substring per group.
            sequenceBuilder.Append(line, k, Math.Min(10, len - k));
        }

        line = GoToNextLine(line, stream);
    }

    string sequenceString = sequenceBuilder.ToString().Trim();
    if (string.IsNullOrEmpty(sequenceString))
    {
        return;
    }

    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        // Detection runs on an upper-cased copy; the sequence itself keeps the original casing.
        byte[] tempData = System.Text.Encoding.ASCII.GetBytes(sequenceString.ToUpper(CultureInfo.InvariantCulture));
        alphabet = Alphabets.AutoDetectAlphabet(tempData, 0, tempData.Length, null);
        if (alphabet == null)
        {
            // Report the offending sequence text. 'line' has already moved past the
            // sequence block (and may be null), so it is not the data that failed.
            var message = String.Format(CultureInfo.InvariantCulture, Properties.Resource.InvalidSymbolInString, sequenceString);
            Trace.Report(message);
            throw new InvalidDataException(message);
        }
    }

    sequenceWithData = new Sequence(alphabet, sequenceString);
}
/// <summary>
/// Computes a consensus for a pairwise alignment and stores it in the alignment's
/// <c>Consensus</c> property. Each consensus symbol is obtained from
/// <c>ConsensusResolver</c> for the pair of symbols at the same position of the two
/// aligned sequences.
/// </summary>
/// <param name="alignment">
/// Alignment holding the two aligned sequences; its consensus is overwritten.
/// </param>
private void AddSimpleConsensusToResult(PairwiseAlignedSequence alignment)
{
    ISequence first = alignment.FirstSequence;
    ISequence second = alignment.SecondSequence;

    // The first sequence determines how many positions are resolved.
    byte[] resolved = new byte[first.Count];
    int position = 0;
    while (position < first.Count)
    {
        byte[] column = new byte[] { first[position], second[position] };
        resolved[position] = ConsensusResolver.GetConsensus(column);
        position++;
    }

    IAlphabet detected = Alphabets.AutoDetectAlphabet(resolved, 0, resolved.GetLongLength(), first.Alphabet);
    Sequence consensusSequence = new Sequence(detected, resolved, false);
    alignment.Consensus = consensusSequence;
}
/// <summary>
/// Validates that Alphabets.AutoDetectAlphabet reports the expected alphabet name for the
/// DNA, RNA and protein sequences configured in the test XML.
/// </summary>
public void ValidateAutoDetectAlphabet()
{
    ValidateAutoDetectedAlphabetName(Constants.DnaDerivedSequenceNode, "Dna");
    ValidateAutoDetectedAlphabetName(Constants.RnaDerivedSequenceNode, "Rna");
    ValidateAutoDetectedAlphabetName(Constants.ProteinDerivedSequenceNode, "Protein");
}

/// <summary>
/// Reads the expected alphabet name and the sequence text from one XML node, runs alphabet
/// detection over the first four symbols and compares the detected alphabet's name.
/// </summary>
/// <param name="sequenceNode">XML node holding the alphabet name and the sequence text.</param>
/// <param name="alphabetLabel">Label used in the log line written on success.</param>
private void ValidateAutoDetectedAlphabetName(string sequenceNode, string alphabetLabel)
{
    string expectedName = utilityObj.xmlUtil.GetTextValue(
        sequenceNode, Constants.AlphabetNameNode);
    string sequenceText = utilityObj.xmlUtil.GetTextValue(
        sequenceNode, Constants.ExpectedDerivedSequence);
    byte[] symbols = encodingObj.GetBytes(sequenceText);

    // Only the first four symbols are examined, as in the original test.
    IAlphabet detected = Alphabets.AutoDetectAlphabet(symbols, 0, 4, null);

    // Fail with an assertion rather than a NullReferenceException when nothing is detected.
    Assert.IsNotNull(detected);

    // Expected value first so that failure messages label the two values correctly.
    Assert.AreEqual(expectedName, detected.Name);

    ApplicationLog.WriteLine(string.Concat(
        "Alphabets BVT: Validation of Auto Detect method for ",
        alphabetLabel,
        " completed successfully."));
}
/// <summary>
/// Converts one delimited line of the text file into a sequence.
/// The line must split on <c>Delimiter</c> into exactly two fields:
/// the sequence ID followed by the sequence symbols.
/// Failures are reported through <c>Trace.Report</c> before the exception is thrown.
/// </summary>
/// <param name="line">The line of text to convert.</param>
/// <returns>A sequence built from the second field, with its ID set to the first field.</returns>
private ISequence ParseLine(string line)
{
    string[] fields = line.Split(Delimiter);
    if (fields.Length != 2)
    {
        string invalidLineMessage = string.Format(CultureInfo.InvariantCulture, Resource.INVALID_INPUT_FILE, line);
        Trace.Report(invalidLineMessage);
        throw new FileFormatException(invalidLineMessage);
    }

    string sequenceId = fields[0];
    string symbolText = fields[1];

    // Prefer the alphabet configured on the parser; only fall back to detection when none is set.
    IAlphabet sequenceAlphabet = Alphabet;
    if (sequenceAlphabet == null)
    {
        byte[] symbolBytes = Encoding.UTF8.GetBytes(symbolText);
        sequenceAlphabet = Alphabets.AutoDetectAlphabet(symbolBytes, 0, symbolBytes.Length, null);
        if (sequenceAlphabet == null)
        {
            string invalidSymbolMessage = string.Format(CultureInfo.InvariantCulture, Resource.InvalidSymbolInString, symbolText);
            Trace.Report(invalidSymbolMessage);
            throw new FileFormatException(invalidSymbolMessage);
        }
    }

    Sequence parsed = new Sequence(sequenceAlphabet, symbolText);
    parsed.ID = sequenceId;
    return parsed;
}
/// <summary>
/// Parses a single read (read header plus read data section) from the file and returns it
/// as a qualitative sequence.
/// </summary>
/// <param name="header">Parsed file header; supplies the number of flows per read.</param>
/// <param name="reader">Binary reader positioned at the start of a read header.</param>
/// <returns>
/// A qualitative sequence holding the bases and their quality scores (offset by 33),
/// with its ID set to the read name.
/// </returns>
/// <exception cref="Exception">
/// The read header size is inconsistent, or the stream ends before a section is complete.
/// </exception>
private ISequence ParseOne(SffHeader header, BinaryReader reader)
{
    // Parse out the read header. The fixed part read here is 16 bytes long.
    ushort headerLength = C2BE(reader.ReadUInt16());
    ushort nameLength = C2BE(reader.ReadUInt16());
    uint numberOfBases = C2BE(reader.ReadUInt32());

    // TODO: use clipping data
    // The values are still read so that the stream advances past them.
    ushort clipQualityLeft = C2BE(reader.ReadUInt16());
    ushort clipQualityRight = C2BE(reader.ReadUInt16());
    ushort clipAdapterLeft = C2BE(reader.ReadUInt16());
    ushort clipAdapterRight = C2BE(reader.ReadUInt16());

    string name = new string(reader.ReadChars(nameLength));

    long paddingSize = headerLength - (16 + nameLength);
    if (paddingSize < 0 || paddingSize > 8)
    {
        throw new Exception("Invalid read header size found.");
    }

    // Padding is skipped as raw bytes. Reading it as chars would make the number of
    // bytes consumed depend on the reader's text encoding.
    if (paddingSize > 0 && reader.ReadBytes((int)paddingSize).Length != paddingSize)
    {
        throw new Exception("Could not parse read header (padding).");
    }

    // Parse out the read data section
    ushort[] flowgramValues = new ushort[header.NumberOfFlowsPerRead];
    for (int flowCount = 0; flowCount < header.NumberOfFlowsPerRead; flowCount++)
    {
        flowgramValues[flowCount] = C2BE(reader.ReadUInt16());
    }

    // ReadBytes keeps reading until the requested count or end of stream, so a
    // short result really means the data is truncated.
    byte[] flowIndexPerBase = reader.ReadBytes((int)numberOfBases);
    if (flowIndexPerBase.Length != numberOfBases)
    {
        throw new Exception("Unable to read flow indexes.");
    }

    byte[] bases = reader.ReadBytes((int)numberOfBases);
    if (bases.Length != numberOfBases)
    {
        throw new Exception("Unable to read base information.");
    }

    byte[] qscores = reader.ReadBytes((int)numberOfBases);
    if (qscores.Length != numberOfBases)
    {
        throw new Exception("Unable to read quality scores.");
    }

    for (int i = 0; i < qscores.Length; i++)
    {
        qscores[i] += 33; // adjust for Sanger
    }

    // Adjust for 8-byte padding at end of read segment.
    // Computed in 64-bit arithmetic so the byte count cannot wrap.
    long currentSize = ((long)header.NumberOfFlowsPerRead * 2) + (3L * numberOfBases);
    long remainder = currentSize & 7;
    if (remainder > 0)
    {
        paddingSize = 8 - remainder;
        if (reader.ReadBytes((int)paddingSize).Length != paddingSize)
        {
            throw new Exception("Could not parse read data (padding).");
        }
    }

    // Determine the alphabet: configured alphabet, else detected, else ambiguous DNA.
    var alphabet = Alphabet
        ?? Alphabets.AutoDetectAlphabet(bases, 0, bases.Length, null)
        ?? Alphabets.AmbiguousDNA;

    // Return our qSequence
    return new QualitativeSequence(alphabet, FastQFormatType.Sanger, bases, qscores, false) { ID = name };
}
/// <summary>
/// Lazily parses the BAM file named by <c>_fileName</c> and yields its aligned sequences.
/// When <c>ChromosomeToGet</c> is set, the non-null results of
/// <c>ParseRangeAsEnumerableSequences</c> for that name are yielded; otherwise the stream is
/// validated, the header is read and sequences are yielded until <c>IsEOF()</c> is true.
/// </summary>
/// <remarks>
/// This is an iterator method: nothing runs (including the argument check and the opening
/// of the file) until the returned sequence is enumerated, and the file stream is disposed
/// when enumeration completes or the enumerator is disposed.
/// </remarks>
/// <returns>The aligned sequences read from the file.</returns>
public IEnumerable <CompactSAMSequence> Parse()
{
    if (string.IsNullOrWhiteSpace(_fileName))
    {
        throw new ArgumentNullException("fileName");
    }

    // The stream is stored in the readStream member so that the helper methods called
    // below (ValidateReader, GetHeader, IsEOF, GetAlignedSequence) presumably read from it.
    using (readStream = new FileStream(_fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        Stream reader = readStream;
        if (reader == null || reader.Length == 0)
        {
            throw new FileFormatException(Properties.Resource.BAM_InvalidBAMFile);
        }

        if (!String.IsNullOrEmpty(ChromosomeToGet))
        {
            // Restricted read: delegate to the range parser and skip null entries.
            foreach (var s in ParseRangeAsEnumerableSequences(_fileName, ChromosomeToGet))
            {
                if (s != null)
                {
                    yield return(s);
                }

                ////TODO: Super inefficient right now, am parsing the sequence multiple times,
                ////fix this.
                //var s2 = s.ToArray ();
                //var alpha = Alphabets.AutoDetectAlphabet(s2, 0, s2.Length, null);
                //var strippedOfInfo = new Sequence(alpha, s2);
                //yield return strippedOfInfo;
            }
        }
        else
        {
            // Full read. This re-assigns the same stream that the using statement stored above.
            readStream = reader;
            ValidateReader();
            SAMAlignmentHeader header = GetHeader();

            // NOTE(review): sequenceAlignmentMap is constructed but never read in this method.
            SequenceAlignmentMap sequenceAlignmentMap = null;
            if (sequenceAlignmentMap == null)
            {
                sequenceAlignmentMap = new SequenceAlignmentMap(header);
            }

            while (!IsEOF())
            {
#if WANT_OLD_VERSION
                SAMAlignedSequence alignedSeq = GetAlignedSequence(0, int.MaxValue);
#else
                var alignedSeq = GetAlignedSequence();
#endif
                if (alignedSeq != null)
                {
#if WANT_OLD_VERSION
                    //make a new Sequence
                    ISequence strippedOfInfo = null;
                    try
                    {
                        var syms = alignedSeq.QuerySequence.ToArray();
                        var alpha = Alphabets.AutoDetectAlphabet(syms, 0, syms.Length, null);
                        strippedOfInfo = new Sequence(alpha, alignedSeq.QuerySequence.ToArray());
                        strippedOfInfo = alignedSeq;
                    }
                    catch (ArgumentOutOfRangeException exception)
                    {
                        Debug.Write("Could not convert sequence: " + exception.Message);
                    }

                    if (strippedOfInfo != null)
                    {
                        yield return(strippedOfInfo);
                    }
#else
                    yield return(alignedSeq);
#endif
                }

                // Clear the local reference before the next iteration.
                alignedSeq = null;
            }
        }
    }
}
/// <summary>
/// Parses a single CLUSTAL-format sequence alignment from a reader.
/// </summary>
/// <remarks>
/// The first line must start with "CLUSTAL". The remainder consists of blocks of data lines
/// ("id segment [number]"), separated by blank lines or consensus lines. Sequences are
/// registered while reading the first block; data lines in later blocks are appended to the
/// sequence with the same id.
/// </remarks>
/// <param name="reader">Reader supplying the alignment text.</param>
/// <returns>A sequence alignment containing one aligned sequence with all parsed sequences.</returns>
/// <exception cref="InvalidDataException">
/// The input is empty, does not start with "CLUSTAL", contains a data line without both an id
/// and a segment, contains symbols for which no alphabet can be detected, mixes alphabets, or
/// refers to an id that was not present in the first block.
/// </exception>
private ISequenceAlignment ParseOne(StreamReader reader)
{
    // no empty files allowed
    if (line == null)
    {
        ReadNextLine(reader);
    }

    if (line == null)
    {
        throw new InvalidDataException(Properties.Resource.IONoTextToParse);
    }

    if (!line.StartsWith("CLUSTAL", StringComparison.OrdinalIgnoreCase))
    {
        throw new InvalidDataException(
            string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name));
    }

    // Skip blank lines until we get to the first block.
    ReadNextLine(reader);

    // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
    skipBlankLines = false;

    var mapIdToSequence = new Dictionary<string, Tuple<ISequence, List<byte>>>();
    IAlphabet alignmentAlphabet = null;
    bool isFirstBlock = true;
    bool inBlock = false;
    var endOfBlockSymbols = new HashSet<char> { '*', ' ', '.', '+', ':' };

    while (reader.Peek() != -1)
    {
        // Blank line or consensus line signals end of block.
        if (String.IsNullOrEmpty(line) || line.All(endOfBlockSymbols.Contains))
        {
            if (inBlock)
            {
                inBlock = false;
                isFirstBlock = false;
            }
        }
        else
        {
            // It's a data line in a block.
            // Lines begin with sequence id, then the sequence segment, and optionally a number, which we will ignore.
            // (char[])null uses whitespace delimiters.
            string[] tokens = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length < 2)
            {
                // Without both an id and a segment the line is malformed; report a format
                // error instead of failing with an IndexOutOfRangeException.
                throw new InvalidDataException(
                    string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name));
            }

            string id = tokens[0];
            string data = tokens[1].ToUpperInvariant();
            byte[] byteData = Encoding.UTF8.GetBytes(data);
            Tuple<ISequence, List<byte>> sequenceTuple;
            inBlock = true;

            if (isFirstBlock)
            {
                IAlphabet alphabet = Alphabet;
                if (null == alphabet)
                {
                    alphabet = Alphabets.AutoDetectAlphabet(byteData, 0, byteData.Length, alphabet);
                    if (null == alphabet)
                    {
                        throw new InvalidDataException(string.Format(
                            CultureInfo.InvariantCulture,
                            Properties.Resource.InvalidSymbolInString,
                            data));
                    }

                    // Every detected alphabet must be the same instance as the first one detected.
                    if (null == alignmentAlphabet)
                    {
                        alignmentAlphabet = alphabet;
                    }
                    else if (alignmentAlphabet != alphabet)
                    {
                        throw new InvalidDataException(string.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.SequenceAlphabetMismatch));
                    }
                }

                // The empty sequence only carries the id and alphabet; symbols accumulate in the list.
                sequenceTuple = new Tuple<ISequence, List<byte>>(
                    new Sequence(alphabet, "") { ID = id },
                    new List<byte>());
                sequenceTuple.Item2.AddRange(byteData);
                mapIdToSequence.Add(id, sequenceTuple);
            }
            else
            {
                if (!mapIdToSequence.TryGetValue(id, out sequenceTuple))
                {
                    throw new InvalidDataException(string.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ClustalUnknownSequence,
                        id));
                }

                sequenceTuple.Item2.AddRange(byteData);
            }
        }

        ReadNextLine(reader);
    }

    var sequenceAlignment = new SequenceAlignment();
    var alignedSequence = new AlignedSequence();
    sequenceAlignment.AlignedSequences.Add(alignedSequence);

    foreach (var alignmentSequenceTuple in mapIdToSequence.Values)
    {
        alignedSequence.Sequences.Add(
            new Sequence(alignmentSequenceTuple.Item1.Alphabet, alignmentSequenceTuple.Item2.ToArray())
            {
                ID = alignmentSequenceTuple.Item1.ID
            });
    }

    return sequenceAlignment;
}
/// <summary>
/// Reads one four-line FASTQ record (header, symbols, quality header, quality scores)
/// and converts it into a qualitative sequence.
/// </summary>
/// <param name="reader">Reader positioned at the start of a record.</param>
/// <param name="formatType">FASTQ format type of the quality scores.</param>
/// <returns>The parsed qualitative sequence, or null when the reader is at end of stream.</returns>
private IQualitativeSequence ParseOne(StreamReader reader, FastQFormatType formatType)
{
    if (reader.EndOfStream)
    {
        return null;
    }

    // Line 1: '@' followed by the sequence id.
    string headerLine = ReadNextLine(reader, true);
    bool headerIsValid = headerLine != null && headerLine.StartsWith("@", StringComparison.Ordinal);
    if (!headerIsValid)
    {
        throw new Exception(string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name));
    }

    string id = headerLine.Substring(1).Trim();

    // Builds the parser-level message that wraps a record-specific detail text.
    Func<string, string> describeFailure = detailFormat =>
    {
        string details = string.Format(CultureInfo.CurrentCulture, detailFormat, id);
        return string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, this.Name, details);
    };

    // Line 2: the sequence symbols.
    string symbolLine = ReadNextLine(reader, true);
    if (string.IsNullOrEmpty(symbolLine))
    {
        throw new Exception(describeFailure(Resource.FastQ_InvalidSequenceLine));
    }

    byte[] sequenceData = Encoding.ASCII.GetBytes(symbolLine);

    // Line 3: '+' optionally followed by a repeat of the id.
    string qualityHeaderLine = ReadNextLine(reader, true);
    bool qualityHeaderIsValid =
        qualityHeaderLine != null && qualityHeaderLine.StartsWith("+", StringComparison.Ordinal);
    if (!qualityHeaderIsValid)
    {
        throw new Exception(describeFailure(Resource.FastQ_InvalidQualityScoreHeaderLine));
    }

    string repeatedId = qualityHeaderLine.Substring(1).Trim();
    if (!string.IsNullOrEmpty(repeatedId) && !id.Equals(repeatedId))
    {
        throw new Exception(describeFailure(Resource.FastQ_InvalidQualityScoreHeaderData));
    }

    // Line 4: the quality scores, one per symbol.
    string qualityLine = ReadNextLine(reader, true);
    if (string.IsNullOrEmpty(qualityLine))
    {
        throw new Exception(describeFailure(Resource.FastQ_EmptyQualityScoreLine));
    }

    byte[] qualScores = Encoding.ASCII.GetBytes(qualityLine);
    if (sequenceData.GetLongLength() != qualScores.GetLongLength())
    {
        throw new Exception(describeFailure(Resource.FastQ_InvalidQualityScoresLength));
    }

    // Validate against the configured alphabet when there is one; otherwise detect it.
    IAlphabet alphabet = this.Alphabet;
    if (alphabet != null)
    {
        if (!alphabet.ValidateSequence(sequenceData, 0, sequenceData.GetLongLength()))
        {
            throw new Exception(Resource.InvalidAlphabetType);
        }
    }
    else
    {
        alphabet = Alphabets.AutoDetectAlphabet(sequenceData, 0, sequenceData.GetLongLength(), alphabet);
        if (alphabet == null)
        {
            throw new Exception(Resource.CouldNotIdentifyAlphabetType);
        }
    }

    QualitativeSequence record = new QualitativeSequence(alphabet, formatType, sequenceData, qualScores, false);
    record.ID = id;
    return record;
}
/// <summary>
/// Reads the next alignment block from the uncompressed data and decodes it into a
/// <c>CompactSAMSequence</c>: reference name, 1-based position, flags, CIGAR string,
/// read name, the 4-bit packed sequence symbols and the quality values (offset by 33).
/// </summary>
/// <returns>The decoded aligned sequence.</returns>
/// <exception cref="FileFormatException">A CIGAR operation code greater than 8 is found.</exception>
protected CompactSAMSequence GetAlignedSequence()
{
    // The block is prefixed by its length as a 4-byte integer.
    byte[] array = new byte[4];
    ReadUnCompressedData(array, 0, 4);
    int blockLen = Helper.GetInt32(array, 0);
    byte[] alignmentBlock = new byte[blockLen];
    ReadUnCompressedData(alignmentBlock, 0, blockLen);
    int value;
    UInt32 UnsignedValue;

    // 0-4 bytes: reference sequence index; -1 maps to the name "*".
    int refSeqIndex = Helper.GetInt32(alignmentBlock, 0);
    string RName;
    if (refSeqIndex == -1)
    {
        RName = "*";
    }
    else
    {
        RName = refSeqNames[refSeqIndex];
    }

    // 4-8 bytes: stored position plus one.
    int Pos = Helper.GetInt32(alignmentBlock, 4) + 1;

    // 8 - 12 bytes "bin<<16|mapQual<<8|read_name_len"; only the name length (low byte) is used.
    UnsignedValue = Helper.GetUInt32(alignmentBlock, 8);
    int queryNameLen = (int)(UnsignedValue & 0x000000FF);

    // 12 - 16 bytes: flag value in the high 16 bits, CIGAR operation count in the low 16 bits.
    // NOTE(review): the cast to int is applied before the shift, so the shift is a signed
    // one; a flag value with its top bit set would come out negative.
    UnsignedValue = Helper.GetUInt32(alignmentBlock, 12);
    int flagValue = (int)(UnsignedValue & 0xFFFF0000) >> 16;
    int cigarLen = (int)(UnsignedValue & 0x0000FFFF);

    // 16-20 bytes: read length (number of sequence symbols).
    int readLen = Helper.GetInt32(alignmentBlock, 16);

    // The read name starts at byte 32 and occupies queryNameLen bytes; the last of those
    // bytes is left out of the string (presumably a terminating NUL).
    string name = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, 32, queryNameLen - 1);
    StringBuilder strbuilder = new StringBuilder();

    // The CIGAR operations follow the read name, 4 bytes each.
    int startIndex = 32 + queryNameLen;
    for (int i = startIndex; i < (startIndex + cigarLen * 4); i += 4)
    {
        // Get the CIGAR operation length stored in first 28 bits.
        UInt32 cigarValue = Helper.GetUInt32(alignmentBlock, i);
        strbuilder.Append(((cigarValue & 0xFFFFFFF0) >> 4).ToString(CultureInfo.InvariantCulture));

        // Get the CIGAR operation stored in last 4 bits.
        value = (int)cigarValue & 0x0000000F;

        // MIDNSHP=X => 012345678
        switch (value)
        {
            case 0:
                strbuilder.Append("M");
                break;
            case 1:
                strbuilder.Append("I");
                break;
            case 2:
                strbuilder.Append("D");
                break;
            case 3:
                strbuilder.Append("N");
                break;
            case 4:
                strbuilder.Append("S");
                break;
            case 5:
                strbuilder.Append("H");
                break;
            case 6:
                strbuilder.Append("P");
                break;
            case 7:
                strbuilder.Append("=");
                break;
            case 8:
                strbuilder.Append("X");
                break;
            default:
                throw new FileFormatException(Properties.Resource.BAM_InvalidCIGAR);
        }
    }

    // An empty CIGAR is reported as "*".
    string cigar = strbuilder.ToString();
    if (string.IsNullOrWhiteSpace(cigar))
    {
        cigar = "*";
    }

    // The packed sequence follows the CIGAR data: two symbols per byte, high nibble first,
    // (readLen + 1) / 2 bytes in total.
    startIndex += cigarLen * 4;
    byte[] seqData = new byte[readLen];
    int seqDataIndex = 0;
    int index = startIndex;

    // All bytes except the last one always carry two symbols.
    for (; index < (startIndex + (readLen + 1) / 2) - 1; index++)
    {
        // Get first 4 bit value
        value = (alignmentBlock[index] & 0xF0) >> 4;
        seqData[seqDataIndex++] = GetSeqCharAsByte(value);

        // Get last 4 bit value
        value = alignmentBlock[index] & 0x0F;
        seqData[seqDataIndex++] = GetSeqCharAsByte(value);
    }

    // The last byte carries a second symbol only when readLen is even.
    // NOTE(review): when readLen is 0 the loop above does not run and the store below
    // writes to index 0 of an empty array - confirm that zero-length reads cannot occur.
    value = (alignmentBlock[index] & 0xF0) >> 4;
    seqData[seqDataIndex++] = GetSeqCharAsByte(value);
    if (readLen % 2 == 0)
    {
        value = alignmentBlock[index] & 0x0F;
        seqData[seqDataIndex++] = GetSeqCharAsByte(value);
    }

    // Quality values follow the packed sequence, one byte per symbol. A first byte of 0xFF
    // skips the conversion, leaving qualValues zero-filled.
    startIndex = index + 1;
    byte[] qualValues = new byte[readLen];

    // NOTE(review): strQualValues is assigned but never read in this method.
    string strQualValues = "*";
    if (alignmentBlock[startIndex] != 0xFF)
    {
        for (int i = startIndex; i < (startIndex + readLen); i++)
        {
            // Shift the stored value by 33.
            qualValues[i - startIndex] = (byte)(alignmentBlock[i] + 33);
        }

        strQualValues = System.Text.ASCIIEncoding.ASCII.GetString(qualValues);
    }

    var alpha = Alphabets.AutoDetectAlphabet(seqData, 0, seqData.Length, null);

    //TODO: Possibly a bit unsafe here
    var toReturn = new CompactSAMSequence(alpha, FastQFormatType.GATK_Recalibrated, seqData, qualValues, false);
    toReturn.ID = name;
    toReturn.Pos = Pos;
    toReturn.CIGAR = cigar;
    toReturn.RName = RName;
    toReturn.SAMFlags = (SAMFlags)flagValue;
    return(toReturn);
}
/// <summary>
/// Parses a single Phylip-style sequence alignment from a reader.
/// </summary>
/// <remarks>
/// The first line holds two integers: the number of sequences and the length of each
/// sequence. Data follows in blocks of one line per sequence. In the first block the first
/// 10 characters of each line are the sequence ID and the rest is sequence data; in later
/// blocks the whole line is sequence data. Spaces inside the data are removed.
/// </remarks>
/// <param name="reader">A reader for a biological sequence alignment text.</param>
/// <returns>The parsed ISequenceAlignment object.</returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
/// <exception cref="InvalidDataException">
/// The input is empty, the header does not have two tokens, a block ends prematurely, no
/// alphabet can be detected, alphabets are mixed, or the declared sequence count or length
/// does not match the data.
/// </exception>
/// <exception cref="Exception">A first-block line is too short to hold an ID and data.</exception>
public ISequenceAlignment ParseOne(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    if (line == null)
    {
        ReadNextLine(reader);
    }

    // no empty files allowed
    if (line == null)
    {
        throw new InvalidDataException(Properties.Resource.IONoTextToParse);
    }

    // Parse first line
    IList<string> tokens = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
    if (2 != tokens.Count)
    {
        throw new InvalidDataException(
            string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name));
    }

    int sequenceCount = Int32.Parse(tokens[0], CultureInfo.InvariantCulture);
    int sequenceLength = Int32.Parse(tokens[1], CultureInfo.InvariantCulture);

    bool isFirstBlock = true;
    IList<Tuple<Sequence, List<byte>>> data = new List<Tuple<Sequence, List<byte>>>();
    IAlphabet alignmentAlphabet = null;

    // Skip blank lines until we get to the first block.
    ReadNextLine(reader);

    // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
    skipBlankLines = false;

    while (reader.Peek() != -1)
    {
        if (string.IsNullOrWhiteSpace(line))
        {
            ReadNextLine(reader);
            continue;
        }

        // Data is present although the header declares no sequences. Without this check
        // the loop below would not run, the reader would never advance and this method
        // would spin forever.
        if (sequenceCount <= 0)
        {
            throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
        }

        for (int index = 0; index < sequenceCount; index++)
        {
            // The input ended in the middle of a block; report a format error instead of
            // dereferencing the null line.
            if (line == null)
            {
                throw new InvalidDataException(
                    string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name));
            }

            if (isFirstBlock)
            {
                // First 10 characters are sequence ID, remaining is the first block of sequence
                // Note that both may contain whitespace, and there may be no whitespace between them.
                if (line.Length <= 10)
                {
                    // Exception type kept as in the original so existing callers are unaffected.
                    throw new Exception(
                        string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name));
                }

                string id = line.Substring(0, 10).Trim();
                string sequenceString = line.Substring(10).Replace(" ", "");
                byte[] sequenceBytes = System.Text.ASCIIEncoding.ASCII.GetBytes(sequenceString);

                IAlphabet alphabet = Alphabet;
                if (null == alphabet)
                {
                    alphabet = Alphabets.AutoDetectAlphabet(sequenceBytes, 0, sequenceBytes.Length, alphabet);
                    if (null == alphabet)
                    {
                        throw new InvalidDataException(string.Format(
                            CultureInfo.InvariantCulture,
                            Properties.Resource.InvalidSymbolInString,
                            sequenceString));
                    }

                    // Every detected alphabet must be the same instance as the first one detected.
                    if (null == alignmentAlphabet)
                    {
                        alignmentAlphabet = alphabet;
                    }
                    else if (alignmentAlphabet != alphabet)
                    {
                        throw new InvalidDataException(Properties.Resource.SequenceAlphabetMismatch);
                    }
                }

                // The empty sequence only carries the id and alphabet; symbols accumulate in the list.
                Tuple<Sequence, List<byte>> sequenceStore = new Tuple<Sequence, List<byte>>(
                    new Sequence(alphabet, string.Empty) { ID = id },
                    new List<byte>());
                sequenceStore.Item2.AddRange(sequenceBytes);
                data.Add(sequenceStore);
            }
            else
            {
                // Later blocks list the sequences in the same order as the first block.
                Tuple<Sequence, List<byte>> sequence = data[index];
                byte[] sequenceBytes = System.Text.ASCIIEncoding.ASCII.GetBytes(line.Replace(" ", ""));
                sequence.Item2.AddRange(sequenceBytes);
            }

            ReadNextLine(reader);
        }

        // Reset the first block flag
        isFirstBlock = false;
    }

    // Validate for the count of sequence
    if (sequenceCount != data.Count)
    {
        throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
    }

    SequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(new AlignedSequence());

    foreach (var dataSequence in data)
    {
        // Validate for the length of each sequence
        if (sequenceLength != dataSequence.Item2.Count)
        {
            throw new InvalidDataException(Properties.Resource.SequenceLengthMismatch);
        }

        sequenceAlignment.AlignedSequences[0].Sequences.Add(
            new Sequence(dataSequence.Item1.Alphabet, dataSequence.Item2.ToArray())
            {
                ID = dataSequence.Item1.ID
            });
    }

    return sequenceAlignment;
}
/// <summary>
/// Parses the next FASTA record from the reader: a header line starting with '>' followed
/// by one or more lines of sequence symbols, up to (not including) the next line that
/// starts with '>' or the end of the input.
/// </summary>
/// <remarks>
/// When no alphabet is configured, the alphabet is detected line by line and remembered in
/// <c>baseAlphabet</c>, which persists across calls on this instance.
/// </remarks>
/// <param name="reader">Reader to parse from.</param>
/// <param name="buffer">
/// Scratch buffer used to accumulate the symbols. If it has to grow, the larger array
/// replaces only the local reference; the caller's array is left as it was.
/// </param>
/// <returns>The parsed sequence, or null when the reader has no more data.</returns>
ISequence ParseOne(TextReader reader, byte[] buffer)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    if (reader.Peek() == -1)
    {
        return(null);
    }

    // NOTE(review): the tracked size starts at the default buffer size rather than
    // buffer.Length - this assumes the caller passes a buffer of at least that size.
    int currentBufferSize = PlatformManager.Services.DefaultBufferSize;
    string message;
    string line = reader.ReadLine();

    // Continue reading if blank line found.
    while (line != null && string.IsNullOrEmpty(line))
    {
        line = reader.ReadLine();
    }

    // The first non-blank line must be the header.
    if (line == null || !line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        message = string.Format(
            CultureInfo.InvariantCulture,
            Properties.Resource.INVALID_INPUT_FILE,
            Properties.Resource.FASTA_NAME);
        throw new Exception(message);
    }

    // Everything after the '>' becomes the sequence ID.
    string name = line.Substring(1);
    int bufferPosition = 0;

    // Read next line.
    line = reader.ReadLine();

    // Continue reading if blank line found.
    while (line != null && string.IsNullOrEmpty(line))
    {
        line = reader.ReadLine();
    }

    // A header without any sequence data is rejected.
    if (line == null)
    {
        message = string.Format(
            CultureInfo.InvariantCulture,
            Properties.Resource.InvalidSymbolInString,
            string.Empty);
        throw new Exception(message);
    }

    IAlphabet alphabet = Alphabet;
    bool tryAutoDetectAlphabet = alphabet == null;

    do
    {
        // Files > 2G are not supported in this release.
        if ((((long)bufferPosition + line.Length) >= PlatformManager.Services.MaxSequenceSize))
        {
            throw new ArgumentOutOfRangeException(
                string.Format(CultureInfo.CurrentUICulture, Properties.Resource.SequenceDataGreaterthan2GB, name));
        }

        int neededSize = bufferPosition + line.Length;
        if (neededSize >= currentBufferSize)
        {
            //Grow file dynamically, by buffer size, or if too small to fit the new sequence by the size of the sequence
            int suggestedSize = buffer.Length + PlatformManager.Services.DefaultBufferSize;
            int newSize = neededSize < suggestedSize ? suggestedSize : neededSize;
            Array.Resize(ref buffer, newSize);
            currentBufferSize = newSize;
        }

        // NOTE(review): the buffer bookkeeping uses line.Length (characters) while the copy
        // uses symbols.Length (UTF-8 bytes); the two differ for non-ASCII input.
        byte[] symbols = Encoding.UTF8.GetBytes(line);

        // Array.Copy -- for performance improvement.
        Array.Copy(symbols, 0, buffer, bufferPosition, symbols.Length);

        // Auto detect alphabet if alphabet is set to null, else validate with already set alphabet
        if (tryAutoDetectAlphabet)
        {
            // If we have a base alphabet we detected earlier,
            // then try that first.
            if (this.baseAlphabet != null && this.baseAlphabet.ValidateSequence(buffer, bufferPosition, line.Length))
            {
                alphabet = this.baseAlphabet;
            }
            // Otherwise attempt to identify alphabet
            else
            {
                // Different alphabet - try to auto detect.
                // NOTE(review): the third argument here is bufferPosition + line.Length, whereas
                // the ValidateSequence calls in this method pass line.Length for the same
                // position - confirm whether AutoDetectAlphabet expects a length or an end offset.
                this.baseAlphabet = null;
                alphabet = Alphabets.AutoDetectAlphabet(buffer, bufferPosition, bufferPosition + line.Length, alphabet);
                if (alphabet == null)
                {
                    throw new Exception(string.Format(CultureInfo.InvariantCulture,
                                        Properties.Resource.InvalidSymbolInString, line));
                }
            }

            // Determine the base alphabet used.
            if (this.baseAlphabet == null)
            {
                this.baseAlphabet = alphabet;
            }
            else
            {
                // If they are not the same, then this might be an error.
                if (this.baseAlphabet != alphabet)
                {
                    // If the new alphabet includes all the base alphabet then use it instead.
                    // This happens when we hit an ambiguous form of the alphabet later in the file.
                    if (!this.baseAlphabet.HasAmbiguity && Alphabets.GetAmbiguousAlphabet(this.baseAlphabet) == alphabet)
                    {
                        this.baseAlphabet = alphabet;
                    }
                    // Otherwise the new alphabet is only accepted when it is the unambiguous
                    // counterpart of the base alphabet.
                    else if (alphabet.HasAmbiguity || Alphabets.GetAmbiguousAlphabet(alphabet) != this.baseAlphabet)
                    {
                        throw new Exception(Properties.Resource.FastAContainsMorethanOnebaseAlphabet);
                    }
                }
            }
        }
        else
        {
            // Validate against supplied alphabet.
            if (!alphabet.ValidateSequence(buffer, bufferPosition, line.Length))
            {
                throw new Exception(string.Format(CultureInfo.InvariantCulture, Properties.Resource.InvalidSymbolInString, line));
            }
        }

        bufferPosition += line.Length;

        // The next record's header is left unread for the following call.
        if (reader.Peek() == (byte)'>')
        {
            break;
        }

        // Read next line.
        line = reader.ReadLine();

        // Continue reading if blank line found.
        while (line != null && string.IsNullOrEmpty(line) && reader.Peek() != (byte)'>')
        {
            line = reader.ReadLine();
        }
    }while (line != null);

    // Truncate buffer to remove trailing 0's
    byte[] tmpBuffer = new byte[bufferPosition];
    Array.Copy(buffer, tmpBuffer, bufferPosition);

    // With auto-detection the sequence gets the base alphabet settled on above.
    if (tryAutoDetectAlphabet)
    {
        alphabet = this.baseAlphabet;
    }

    // In memory sequence
    return(new Sequence(alphabet, tmpBuffer, false) { ID = name });
}
/// <summary>
/// Expands this delta alignment into a pair of aligned sequences by copying the aligned
/// ranges of the reference and query sequences and inserting a gap symbol for every delta.
/// </summary>
/// <returns>
/// A pairwise aligned sequence whose first sequence is the reference alignment and whose
/// second sequence is the query alignment. Its metadata carries the "StartOffsets",
/// "EndOffsets" and "Insertions" lists (reference at index 0, query at index 1).
/// </returns>
public PairwiseAlignedSequence ConvertDeltaToSequences()
{
    PairwiseAlignedSequence result = new PairwiseAlignedSequence();

    List<long> startOffsets = new List<long>(2) { FirstSequenceStart, SecondSequenceStart };
    List<long> endOffsets = new List<long>(2) { FirstSequenceEnd, SecondSequenceEnd };

    // Copy the aligned range (inclusive on both ends) of each sequence.
    List<byte> referenceSymbols = new List<byte>();
    long position = this.FirstSequenceStart;
    while (position <= this.FirstSequenceEnd)
    {
        referenceSymbols.Add(this.ReferenceSequence[position]);
        position++;
    }

    List<byte> querySymbols = new List<byte>();
    position = this.SecondSequenceStart;
    while (position <= this.SecondSequenceEnd)
    {
        querySymbols.Add(this.QuerySequence[position]);
        position++;
    }

    // Each delta advances the running position by its magnitude. A negative delta puts
    // the gap into the reference, any other delta puts it into the query.
    long referenceInsertions = 0;
    long queryInsertions = 0;
    int runningPosition = 0;
    foreach (int delta in Deltas)
    {
        runningPosition += Math.Abs(delta);
        int insertAt = runningPosition - 1;
        if (delta < 0)
        {
            referenceSymbols.Insert(insertAt, DnaAlphabet.Instance.Gap);
            referenceInsertions++;
        }
        else
        {
            querySymbols.Insert(insertAt, DnaAlphabet.Instance.Gap);
            queryInsertions++;
        }
    }

    List<long> insertions = new List<long>(2) { referenceInsertions, queryInsertions };

    byte[] referenceBytes = referenceSymbols.ToArray();
    IAlphabet referenceAlphabet = Alphabets.AutoDetectAlphabet(referenceBytes, 0, referenceBytes.LongLength, null);
    result.FirstSequence = new Sequence(referenceAlphabet, referenceBytes, false);

    byte[] queryBytes = querySymbols.ToArray();
    IAlphabet queryAlphabet = Alphabets.AutoDetectAlphabet(queryBytes, 0, queryBytes.LongLength, QuerySequence.Alphabet);
    result.SecondSequence = new Sequence(queryAlphabet, queryBytes, false);

    result.Metadata["StartOffsets"] = startOffsets;
    result.Metadata["EndOffsets"] = endOffsets;
    result.Metadata["Insertions"] = insertions;
    return result;
}
/// <summary>
/// Aligns every region that lies outside the MUMs (before the first, between
/// consecutive ones, after the last) and assembles the overall alignment.
/// </summary>
/// <param name="referenceSequence">Reference sequence.</param>
/// <param name="sequence">Query sequence.</param>
/// <param name="mums">List of MUMs; must contain at least one element.</param>
/// <returns>
/// Aligned sequences with accumulated score, consensus, and the "StartOffsets",
/// "EndOffsets" and "Insertions" metadata lists.
/// </returns>
private PairwiseAlignedSequence ProcessGaps(
    ISequence referenceSequence,
    ISequence sequence,
    IList<Match> mums)
{
    var alignedReference = new List<byte>();
    var alignedQuery = new List<byte>();
    var alignedConsensus = new List<byte>();
    var result = new PairwiseAlignedSequence();

    // Running totals of inserted gaps: [0] reference, [1] query.
    var insertions = new List<long>(2) { 0, 0 };
    List<long> gapInsertions;

    // Region in front of the first MUM; a zero-length match stands in for the
    // non-existent preceding MUM.
    Match previousMum = mums.First();
    result.Score += this.AlignGap(
        referenceSequence,
        sequence,
        alignedReference,
        alignedQuery,
        alignedConsensus,
        new Match() { Length = 0 },
        previousMum,
        out gapInsertions);
    insertions[0] += gapInsertions[0];
    insertions[1] += gapInsertions[1];

    // Regions between each pair of consecutive MUMs.
    for (int mumIndex = 1; mumIndex < mums.Count; mumIndex++)
    {
        Match currentMum = mums[mumIndex];
        result.Score += this.AlignGap(
            referenceSequence,
            sequence,
            alignedReference,
            alignedQuery,
            alignedConsensus,
            previousMum,
            currentMum,
            out gapInsertions);
        insertions[0] += gapInsertions[0];
        insertions[1] += gapInsertions[1];

        previousMum = currentMum;
    }

    // Region behind the last MUM; a zero-length match stands in for the
    // non-existent following MUM.
    result.Score += this.AlignGap(
        referenceSequence,
        sequence,
        alignedReference,
        alignedQuery,
        alignedConsensus,
        previousMum,
        new Match() { Length = 0 },
        out gapInsertions);
    insertions[0] += gapInsertions[0];
    insertions[1] += gapInsertions[1];

    // NOTE(review): the Metadata dictionary instances of the inputs are assigned
    // to the outputs as-is, so input and output sequences share them.
    byte[] referenceBytes = alignedReference.ToArray();
    IAlphabet detectedAlphabet = Alphabets.AutoDetectAlphabet(
        referenceBytes, 0, referenceBytes.LongLength, referenceSequence.Alphabet);
    result.FirstSequence = new Sequence(detectedAlphabet, referenceBytes)
    {
        ID = referenceSequence.ID,
        Metadata = referenceSequence.Metadata
    };

    byte[] queryBytes = alignedQuery.ToArray();
    detectedAlphabet = Alphabets.AutoDetectAlphabet(
        queryBytes, 0, queryBytes.LongLength, sequence.Alphabet);
    result.SecondSequence = new Sequence(detectedAlphabet, queryBytes)
    {
        ID = sequence.ID,
        Metadata = sequence.Metadata
    };

    byte[] consensusBytes = alignedConsensus.ToArray();
    detectedAlphabet = Alphabets.AutoDetectAlphabet(
        consensusBytes, 0, consensusBytes.LongLength, referenceSequence.Alphabet);
    result.Consensus = new Sequence(detectedAlphabet, consensusBytes);

    // Offsets are filled in only for Needleman-Wunsch; per the original note they
    // are not required for Smith-Waterman because its alignment is fragmented.
    // An offset is the position of the first non-gap symbol in the aligned
    // sequence minus that position in the corresponding input sequence.
    if (this.PairWiseAlgorithm is NeedlemanWunschAligner)
    {
        result.FirstOffset =
            result.FirstSequence.IndexOfNonGap() - referenceSequence.IndexOfNonGap();
        result.SecondOffset =
            result.SecondSequence.IndexOfNonGap() - sequence.IndexOfNonGap();
    }

    // The alignment always spans both inputs from their first to their last symbol.
    var startOffsets = new List<long>(2) { 0, 0 };
    var endOffsets = new List<long>(2) { referenceSequence.Count - 1, sequence.Count - 1 };

    result.Metadata["StartOffsets"] = startOffsets;
    result.Metadata["EndOffsets"] = endOffsets;
    result.Metadata["Insertions"] = insertions;

    return result;
}
/// <summary>
/// Parses a single biological sequence alignment text from a reader.
/// </summary>
/// <remarks>
/// After the header, the input is processed block by block. The second token of
/// each block line selects the handling: TAXA supplies the sequence ids,
/// CHARACTERS supplies the sequence data, END stops the parse, and any other
/// block is skipped up to its "end;" line.
/// </remarks>
/// <param name="reader">A reader for a biological sequence alignment text.</param>
/// <returns>
/// The parsed ISequenceAlignment object, containing exactly one aligned sequence
/// (which is empty when the input has no blocks after the header).
/// </returns>
/// <exception cref="Exception">The input contains no line at all.</exception>
/// <exception cref="InvalidDataException">
/// A sequence contains symbols of no known alphabet, or the auto-detected
/// alphabets of the sequences differ.
/// </exception>
ISequenceAlignment ParseOne(TextReader reader)
{
    ReadNextLine(reader);
    if (line == null)
    {
        throw new Exception(Properties.Resource.INVALID_INPUT_FILE);
    }

    this.ParseHeader(reader);

    var alignedSequence = new AlignedSequence();
    IList<string> ids = null;
    bool isInBlock = true;

    // The line is null when the header consumed the rest of the input; treat
    // that as "no blocks" instead of dereferencing null.
    if (this.line != null && this.line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
    {
        while (this.line != null && isInBlock)
        {
            if (string.IsNullOrEmpty(this.line.Trim()))
            {
                this.ReadNextLine(reader);
                continue;
            }

            // The second token on the line is taken as the block name.
            string blockName = GetTokens(this.line)[1];

            switch (blockName.ToUpperInvariant())
            {
                case "TAXA":
                case "TAXA;":
                    // This block contains the count of sequences and the title of each sequence.
                    ids = this.ParseTaxaBlock(reader);
                    break;

                case "CHARACTERS":
                case "CHARACTERS;":
                    // This block contains the sequence data, keyed by sequence id.
                    Dictionary<string, string> dataSet = this.ParseCharacterBlock(reader, ids);

                    // Alphabet of the first auto-detected sequence; all others must match it.
                    IAlphabet alignmentAlphabet = null;

                    foreach (string id in ids)
                    {
                        IAlphabet alphabet = this.Alphabet;
                        string data = dataSet[id];

                        // Auto-detect only when no alphabet was configured on the parser.
                        if (null == alphabet)
                        {
                            byte[] dataArray = data.ToByteArray();
                            alphabet = Alphabets.AutoDetectAlphabet(dataArray, 0, dataArray.Length, null);

                            if (null == alphabet)
                            {
                                throw new InvalidDataException(string.Format(
                                    CultureInfo.InvariantCulture,
                                    Properties.Resource.InvalidSymbolInString,
                                    data));
                            }

                            if (null == alignmentAlphabet)
                            {
                                alignmentAlphabet = alphabet;
                            }
                            else if (alignmentAlphabet != alphabet)
                            {
                                throw new InvalidDataException(string.Format(
                                    CultureInfo.InvariantCulture,
                                    Properties.Resource.SequenceAlphabetMismatch));
                            }
                        }

                        alignedSequence.Sequences.Add(new Sequence(alphabet, data) { ID = id });
                    }

                    break;

                case "END":
                case "END;":
                    // Have reached the end of block.
                    isInBlock = false;
                    break;

                default:
                    // Unknown block: skip lines until "end;" or the end of the input.
                    while (this.line != null)
                    {
                        this.ReadNextLine(reader);
                        if (0 == string.Compare(this.line, "end;", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                    }

                    break;
            }

            this.ReadNextLine(reader);
        }
    }

    ISequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(alignedSequence);
    return sequenceAlignment;
}
/// <summary>
/// Parses a single biological sequence alignment text from a reader.
/// </summary>
/// <remarks>
/// After the header, the input is processed block by block. The second token of
/// each block line selects the handling: TAXA supplies the sequence ids,
/// CHARACTERS supplies the sequence data, END stops the parse, and any other
/// block is skipped up to its "end;" line.
/// </remarks>
/// <param name="reader">A reader for a biological sequence alignment text.</param>
/// <returns>
/// The parsed ISequenceAlignment object, containing exactly one aligned sequence
/// (which is empty when the input has no blocks after the header).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
/// <exception cref="FileFormatException">The input contains no line at all.</exception>
/// <exception cref="InvalidDataException">
/// A sequence contains symbols of no known alphabet, or the auto-detected
/// alphabets of the sequences differ.
/// </exception>
public ISequenceAlignment ParseOne(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    ReadNextLine(reader);
    if (line == null)
    {
        string invalidFileMessage = Properties.Resource.INVALID_INPUT_FILE;
        Trace.Report(invalidFileMessage);
        throw new FileFormatException(invalidFileMessage);
    }

    ParseHeader(reader);

    ISequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(new AlignedSequence());

    // The line is null when the header consumed the rest of the input; treat
    // that like an input without blocks instead of dereferencing null.
    if (line == null || !line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
    {
        return sequenceAlignment;
    }

    IList<string> ids = null;
    bool isInBlock = true;

    while (line != null && isInBlock)
    {
        if (string.IsNullOrEmpty(line.Trim()))
        {
            ReadNextLine(reader);
            continue;
        }

        // The second token on the line is taken as the block name.
        string blockName = GetTokens(line)[1];

        switch (blockName.ToUpper(CultureInfo.InvariantCulture))
        {
            case "TAXA":
            case "TAXA;":
                // This block contains the count of sequences and the title of each sequence.
                ids = (IList<string>)ParseTaxaBlock(reader);
                break;

            case "CHARACTERS":
            case "CHARACTERS;":
                // This block contains the sequence data, keyed by sequence id.
                Dictionary<string, string> dataSet = ParseCharacterBlock(reader, ids);

                // Alphabet of the first auto-detected sequence; all others must match it.
                IAlphabet alignmentAlphabet = null;

                foreach (string id in ids)
                {
                    IAlphabet alphabet = Alphabet;
                    string data = dataSet[id];

                    // Auto-detect only when no alphabet was configured on the parser.
                    if (null == alphabet)
                    {
                        byte[] dataArray = data.Select(a => (byte)a).ToArray();
                        alphabet = Alphabets.AutoDetectAlphabet(dataArray, 0, dataArray.Length, null);

                        if (null == alphabet)
                        {
                            throw new InvalidDataException(string.Format(
                                CultureInfo.InvariantCulture,
                                Properties.Resource.InvalidSymbolInString,
                                data));
                        }

                        if (null == alignmentAlphabet)
                        {
                            alignmentAlphabet = alphabet;
                        }
                        else if (alignmentAlphabet != alphabet)
                        {
                            throw new InvalidDataException(string.Format(
                                CultureInfo.InvariantCulture,
                                Properties.Resource.SequenceAlphabetMismatch));
                        }
                    }

                    Sequence sequence = new Sequence(alphabet, data);
                    sequence.ID = id;
                    sequenceAlignment.AlignedSequences[0].Sequences.Add(sequence);
                }

                break;

            case "END":
            case "END;":
                // Have reached the end of block.
                isInBlock = false;
                break;

            default:
                // Unknown block: skip lines until "end;" or the end of the input.
                while (line != null)
                {
                    ReadNextLine(reader);
                    if (0 == string.Compare(line, "end;", StringComparison.OrdinalIgnoreCase))
                    {
                        break;
                    }
                }

                break;
        }

        ReadNextLine(reader);
    }

    return sequenceAlignment;
}
/// <summary>
/// Launches the alignment algorithm.
/// </summary>
/// <remarks>
/// The grid of blocks is processed diagonal by diagonal. The last (corner) block
/// is computed together with its trace table. Each optimum-score cell is then
/// traced back block by block, recomputing the trace table of a block whenever
/// the traceback enters a block whose trace is not the one currently held.
/// </remarks>
/// <returns>
/// An empty list when there is no optimum-score cell; otherwise a list with a
/// single pairwise alignment holding one aligned pair per traceback that ended
/// on a Stop cell.
/// </returns>
public virtual List<IPairwiseSequenceAlignment> Align()
{
    InitializeCache();

    // Grid: every block except the corner block, which lies alone on the last
    // diagonal and is handled below.
    for (int diagonal = 0; diagonal < gridCols + gridRows - 2; diagonal++)
    {
        for (int blockRow = 0; blockRow < gridRows; blockRow++)
        {
            int blockCol = diagonal - blockRow;

            if ((blockCol >= 0) && (blockCol < gridCols))
            {
                // Blocks in the last grid row/column only extend to the end of the matrix.
                int lastRow = (blockRow == gridRows - 1) ? (int)(colHeight - Math.BigMul(blockRow, gridStride) - 1) : gridStride;
                int lastCol = (blockCol == gridCols - 1) ? (int)(rowWidth - Math.BigMul(blockCol, gridStride) - 1) : gridStride;

                ComputeIntermediateBlock(blockRow, blockCol, lastRow, lastCol);
            }
        }
    }

    // Trace table for a single block, (gridStride + 1) x (gridStride + 1).
    sbyte[][] trace = new sbyte[gridStride + 1][];
    for (int i = 0; i <= gridStride; i++)
    {
        trace[i] = new sbyte[gridStride + 1];
    }

    // Last block - grid calculation and traceback combined.
    // The completeXxx variables describe the block (and extent) whose trace is
    // currently stored in the trace table.
    int completeTraceRow = gridRows - 1;
    int completeTraceCol = gridCols - 1;
    int completeLastRow = (int)(colHeight - Math.BigMul(completeTraceRow, gridStride) - 1);
    int completeLastCol = (int)(rowWidth - Math.BigMul(completeTraceCol, gridStride) - 1);

    ComputeCornerBlock(completeTraceRow, completeTraceCol, completeLastRow, completeLastCol, trace);

    // Traceback
    if (optScoreCells.Count == 0)
    {
        return new List<IPairwiseSequenceAlignment>();
    }

    PairwiseSequenceAlignment alignment = new PairwiseSequenceAlignment(sequenceI, sequenceJ);

    for (int alignmentCount = 0; alignmentCount < optScoreCells.Count; alignmentCount++)
    {
        PairwiseAlignedSequence result = new PairwiseAlignedSequence();
        result.Score = optScore;

        long alignmentRow = optScoreCells[alignmentCount].Item1;
        long alignmentCol = optScoreCells[alignmentCount].Item2;

        // Split the matrix coordinates into a block index and a position inside the block.
        int blockRow = (int)(alignmentRow / gridStride);
        int blockCol = (int)(alignmentCol / gridStride);

        int lastRow = (int)(alignmentRow - Math.BigMul(blockRow, gridStride));
        int lastCol = (int)(alignmentCol - Math.BigMul(blockCol, gridStride));

        result.Metadata["EndOffsets"] = new List<long> { alignmentRow - 1, alignmentCol - 1 };

        long alignmentLength = 0;

        // Scratch buffers sized colHeight + rowWidth; symbols are written in
        // traceback order (end of the alignment first).
        byte[] sequence1 = new byte[colHeight + rowWidth];
        byte[] sequence2 = new byte[colHeight + rowWidth];

        int colGaps = 0;
        int rowGaps = 0;

        while ((blockRow >= 0) && (blockCol >= 0))
        {
            // Recompute the trace table unless it already covers this block up to
            // the required row and column.
            if ((blockRow != completeTraceRow) || (blockCol != completeTraceCol) || (lastRow > completeLastRow) || (lastCol > completeLastCol))
            {
                ComputeTraceBlock(blockRow, blockCol, lastRow, lastCol, trace);

                completeTraceRow = blockRow;
                completeTraceCol = blockCol;
                completeLastRow = lastRow;
                completeLastCol = lastCol;
            }

            // Math.BigMul keeps the product in 64 bits; a plain int multiplication
            // would overflow before being widened to long.
            long startPositionI = Math.BigMul(blockRow, gridStride) - 1;
            long startPositionJ = Math.BigMul(blockCol, gridStride) - 1;

            while ((trace[lastRow][lastCol] != SourceDirection.Stop) && (trace[lastRow][lastCol] != SourceDirection.Block))
            {
                switch (trace[lastRow][lastCol])
                {
                    case SourceDirection.Diagonal:
                        // diagonal, no gap, use both sequence residues
                        sequence1[alignmentLength] = sequenceI[startPositionI + lastRow];
                        sequence2[alignmentLength] = sequenceJ[startPositionJ + lastCol];
                        alignmentLength++;
                        lastRow--;
                        lastCol--;
                        break;

                    case SourceDirection.Up:
                        // up, gap in J
                        sequence1[alignmentLength] = sequenceI[startPositionI + lastRow];
                        sequence2[alignmentLength] = this.gapCode;
                        alignmentLength++;
                        lastRow--;
                        colGaps++;
                        break;

                    case SourceDirection.Left:
                        // left, gap in I
                        sequence1[alignmentLength] = this.gapCode;
                        sequence2[alignmentLength] = sequenceJ[startPositionJ + lastCol];
                        alignmentLength++;
                        lastCol--;
                        rowGaps++;
                        break;
                }
            }

            if (trace[lastRow][lastCol] == SourceDirection.Stop)
            {
                // Be nice, turn aligned solutions around so that they match the input sequences
                byte[] alignedA = new byte[alignmentLength];
                byte[] alignedB = new byte[alignmentLength];
                for (long i = 0, j = alignmentLength - 1; i < alignmentLength; i++, j--)
                {
                    alignedA[i] = sequence1[j];
                    alignedB[i] = sequence2[j];
                }

                // If alphabet of inputA is DnaAlphabet then alphabet of alignedA may be Dna or AmbiguousDna.
                IAlphabet alphabet = Alphabets.AutoDetectAlphabet(alignedA, 0, alignedA.LongLength, sequenceI.Alphabet);
                Sequence seq = new Sequence(alphabet, alignedA, false);
                seq.ID = sequenceI.ID;
                result.FirstSequence = seq;

                alphabet = Alphabets.AutoDetectAlphabet(alignedB, 0, alignedB.LongLength, sequenceJ.Alphabet);
                seq = new Sequence(alphabet, alignedB, false);
                seq.ID = sequenceJ.ID;
                result.SecondSequence = seq;

                // Offset is start of alignment in input sequence with respect to other sequence.
                // NOTE(review): lastRow/lastCol are positions inside the current block;
                // confirm they are the intended values for the offsets and "StartOffsets"
                // when the traceback stops outside block (0, 0).
                if (lastCol >= lastRow)
                {
                    result.FirstOffset = lastCol - lastRow;
                    result.SecondOffset = 0;
                }
                else
                {
                    result.FirstOffset = 0;
                    result.SecondOffset = lastRow - lastCol;
                }

                result.Metadata["StartOffsets"] = new List<long> { lastRow, lastCol };
                result.Metadata["Insertions"] = new List<long> { rowGaps, colGaps };
                alignment.PairwiseAlignedSequences.Add(result);

                break;
            }
            else
            {
                // The traceback hit a block border: continue in the neighbouring block,
                // entering it at its last row and/or column.
                if (lastRow == 0 && lastCol == 0)
                {
                    blockRow--;
                    blockCol--;
                    lastRow = gridStride;
                    lastCol = gridStride;
                }
                else
                {
                    if (lastRow == 0)
                    {
                        blockRow--;
                        lastRow = gridStride;
                    }
                    else
                    {
                        blockCol--;
                        lastCol = gridStride;
                    }
                }
            }
        }
    }

    return new List<IPairwiseSequenceAlignment>() { alignment };
}
/// <summary>
/// Parses the delta alignments in the file being parsed.
/// </summary>
/// <remarks>
/// Each record is read as: a line starting with ">" (reference id), the query id,
/// the query sequence, a property line of seven space-separated numbers, and then
/// delta values up to a line starting with "*". A line starting with "--" closes
/// the current group of records. Records that follow the last "--" line are
/// returned as a final group.
/// </remarks>
/// <returns>One collection of DeltaAlignment per group, in file order.</returns>
/// <exception cref="FileFormatException">
/// The file is empty, a record does not start with ">", a record is cut off
/// before its property line, or the query sequence contains symbols of no
/// known alphabet.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// A query sequence reaches MaximumSequenceLength.
/// </exception>
public IList<IEnumerable<DeltaAlignment>> Parse()
{
    // The detected alphabet is carried over as the hint for the following records.
    IAlphabet alphabet = null;
    List<IEnumerable<DeltaAlignment>> result = new List<IEnumerable<DeltaAlignment>>();
    IList<DeltaAlignment> deltaAlignments = new List<DeltaAlignment>();

    string invalidFileMessage = string.Format(
        CultureInfo.InvariantCulture,
        Resources.INVALID_INPUT_FILE,
        Resources.Parser_Name);

    using (StreamReader streamReader = new StreamReader(this.Filename))
    {
        if (streamReader.EndOfStream)
        {
            throw new FileFormatException(invalidFileMessage);
        }

        ReadNextLine(streamReader);

        do
        {
            if (line == null || !line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                throw new FileFormatException(invalidFileMessage);
            }

            // First line - reference id
            string referenceId = line.Substring(1);

            // Second line - query sequence id
            ReadNextLine(streamReader);
            if (line == null)
            {
                throw new FileFormatException(invalidFileMessage);
            }

            string queryId = line;

            // Third line - query sequence
            ReadNextLine(streamReader);
            if (line == null)
            {
                throw new FileFormatException(invalidFileMessage);
            }

            if ((long)line.Length >= MaximumSequenceLength)
            {
                throw new ArgumentOutOfRangeException(
                    string.Format(CultureInfo.CurrentUICulture, Resources.SequenceDataGreaterthan2GB, queryId));
            }

            // The whole sequence sits on one line, so its bytes are used directly;
            // no intermediate buffer is required.
            byte[] symbols = ASCIIEncoding.ASCII.GetBytes(line);

            alphabet = Alphabets.AutoDetectAlphabet(symbols, 0, symbols.Length, alphabet);
            if (alphabet == null)
            {
                throw new FileFormatException(
                    string.Format(CultureInfo.InvariantCulture, Resources.InvalidSymbolInString, line));
            }

            Sequence sequence = new Sequence(alphabet, symbols, false);
            sequence.ID = queryId;

            // The file carries no reference data; a one-symbol placeholder holds the id.
            Sequence refEmpty = new Sequence(sequence.Alphabet, "A", false);
            refEmpty.ID = referenceId;

            DeltaAlignment deltaAlignment = new DeltaAlignment(refEmpty, sequence);

            // Fourth line - properties of the delta alignment
            ReadNextLine(streamReader);
            if (line == null)
            {
                throw new FileFormatException(invalidFileMessage);
            }

            string[] deltaAlignmentProperties = line.Split(' ');
            if (deltaAlignmentProperties.Length == 7)
            {
                // Values that cannot be parsed are stored as 0.
                long position;
                deltaAlignment.FirstSequenceStart = long.TryParse(deltaAlignmentProperties[0], out position) ? position : 0;
                deltaAlignment.FirstSequenceEnd = long.TryParse(deltaAlignmentProperties[1], out position) ? position : 0;
                deltaAlignment.SecondSequenceStart = long.TryParse(deltaAlignmentProperties[2], out position) ? position : 0;
                deltaAlignment.SecondSequenceEnd = long.TryParse(deltaAlignmentProperties[3], out position) ? position : 0;

                int error;
                deltaAlignment.Errors = int.TryParse(deltaAlignmentProperties[4], out error) ? error : 0;
                deltaAlignment.SimilarityErrors = int.TryParse(deltaAlignmentProperties[5], out error) ? error : 0;
                deltaAlignment.NonAlphas = int.TryParse(deltaAlignmentProperties[6], out error) ? error : 0;
            }

            // Following lines - deltas, up to the "*" line. The loop starts on the
            // property line itself; lines that are not a single number are ignored.
            while (line != null && !line.StartsWith("*", StringComparison.OrdinalIgnoreCase))
            {
                long delta;
                if (long.TryParse(line, out delta))
                {
                    deltaAlignment.Deltas.Add(delta);
                }

                // Read the next line, skipping blank ones.
                line = streamReader.ReadLine();
                while (line != null && line.Length == 0)
                {
                    line = streamReader.ReadLine();
                }
            }

            deltaAlignments.Add(deltaAlignment);

            // Read the line after the record; it is null at the end of the file.
            line = streamReader.ReadLine();

            if (line != null && line.StartsWith("--", StringComparison.OrdinalIgnoreCase))
            {
                result.Add(deltaAlignments);

                // Start a new group.
                deltaAlignments = new List<DeltaAlignment>();

                // Skip until the next valid delta is found.
                do
                {
                    line = streamReader.ReadLine();
                }
                while (line != null && line.StartsWith("--", StringComparison.OrdinalIgnoreCase));
            }
        }
        while (line != null);
    }

    // Records after the last "--" separator form a final group.
    if (deltaAlignments.Count > 0)
    {
        result.Add(deltaAlignments);
    }

    return result;
}
/// <summary>
/// Aligns every region that lies outside the MUMs (before the first, between
/// consecutive ones, after the last) and assembles the overall alignment.
/// </summary>
/// <param name="referenceSequence">Reference sequence.</param>
/// <param name="sequence">Query sequence.</param>
/// <param name="mums">List of MUMs; must contain at least one element.</param>
/// <returns>
/// Aligned sequences with accumulated score, consensus, offsets, and the
/// "StartOffsets", "EndOffsets" and "Insertions" metadata lists.
/// </returns>
private PairwiseAlignedSequence ProcessGaps(
    ISequence referenceSequence,
    ISequence sequence,
    IList<Match> mums)
{
    var alignedReference = new List<byte>();
    var alignedQuery = new List<byte>();
    var alignedConsensus = new List<byte>();
    var result = new PairwiseAlignedSequence();

    // Running totals of inserted gaps: [0] reference, [1] query.
    var insertions = new List<long>(2) { 0, 0 };
    List<long> gapInsertions;

    // Region in front of the first MUM; a zero-length match stands in for the
    // non-existent preceding MUM.
    Match previousMum = mums.First();
    result.Score += this.AlignGap(
        referenceSequence,
        sequence,
        alignedReference,
        alignedQuery,
        alignedConsensus,
        new Match() { Length = 0 },
        previousMum,
        out gapInsertions);
    insertions[0] += gapInsertions[0];
    insertions[1] += gapInsertions[1];

    // Regions between each pair of consecutive MUMs.
    for (int mumIndex = 1; mumIndex < mums.Count; mumIndex++)
    {
        Match currentMum = mums[mumIndex];
        result.Score += this.AlignGap(
            referenceSequence,
            sequence,
            alignedReference,
            alignedQuery,
            alignedConsensus,
            previousMum,
            currentMum,
            out gapInsertions);
        insertions[0] += gapInsertions[0];
        insertions[1] += gapInsertions[1];

        previousMum = currentMum;
    }

    // Region behind the last MUM; a zero-length match stands in for the
    // non-existent following MUM.
    result.Score += this.AlignGap(
        referenceSequence,
        sequence,
        alignedReference,
        alignedQuery,
        alignedConsensus,
        previousMum,
        new Match() { Length = 0 },
        out gapInsertions);
    insertions[0] += gapInsertions[0];
    insertions[1] += gapInsertions[1];

    // Only the ID is carried over to the aligned sequences. The Metadata
    // dictionary of the input is deliberately not assigned, so that input and
    // output do not share one dictionary instance.
    byte[] referenceBytes = alignedReference.ToArray();
    IAlphabet detectedAlphabet = Alphabets.AutoDetectAlphabet(
        referenceBytes, 0, referenceBytes.GetLongLength(), referenceSequence.Alphabet);
    result.FirstSequence = new Sequence(detectedAlphabet, referenceBytes)
    {
        ID = referenceSequence.ID
    };

    byte[] queryBytes = alignedQuery.ToArray();
    detectedAlphabet = Alphabets.AutoDetectAlphabet(
        queryBytes, 0, queryBytes.GetLongLength(), sequence.Alphabet);
    result.SecondSequence = new Sequence(detectedAlphabet, queryBytes)
    {
        ID = sequence.ID
    };

    byte[] consensusBytes = alignedConsensus.ToArray();
    detectedAlphabet = Alphabets.AutoDetectAlphabet(
        consensusBytes, 0, consensusBytes.GetLongLength(), referenceSequence.Alphabet);
    result.Consensus = new Sequence(detectedAlphabet, consensusBytes);

    // An offset is the position of the first non-gap symbol in the aligned
    // sequence minus that position in the corresponding input sequence.
    result.FirstOffset =
        result.FirstSequence.IndexOfNonGap() - referenceSequence.IndexOfNonGap();
    result.SecondOffset =
        result.SecondSequence.IndexOfNonGap() - sequence.IndexOfNonGap();

    // The alignment always spans both inputs from their first to their last symbol.
    var startOffsets = new List<long>(2) { 0, 0 };
    var endOffsets = new List<long>(2) { referenceSequence.Count - 1, sequence.Count - 1 };

    result.Metadata["StartOffsets"] = startOffsets;
    result.Metadata["EndOffsets"] = endOffsets;
    result.Metadata["Insertions"] = insertions;

    return result;
}
/// <summary>
/// Lazily returns the sequences in the file being parsed.
/// </summary>
/// <remarks>
/// Every record starts with a line beginning with ">" (the text after it becomes
/// the sequence ID) followed by one or more sequence lines; blank lines are
/// skipped. When no alphabet is configured on the parser, the alphabet is
/// auto-detected per record and all records must map to the same base alphabet.
/// Because this is an iterator, the file is opened and errors are raised only
/// while the result is being enumerated.
/// </remarks>
/// <returns>The parsed sequences, one per record, in file order.</returns>
/// <exception cref="FileFormatException">
/// The file is empty, a record does not start with ">", a record has no sequence
/// line, a symbol is not valid for the alphabet, or the records map to more than
/// one base alphabet.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// A sequence reaches MaximumSequenceLength.
/// </exception>
public IEnumerable<ISequence> Parse()
{
    IAlphabet alphabet = this.Alphabet;
    IAlphabet baseAlphabet = null;
    int currentBufferSize = BufferSize;
    byte[] buffer = new byte[currentBufferSize];

    // Auto-detection is used only when no alphabet was configured on the parser.
    bool tryAutoDetectAlphabet = alphabet == null;

    using (StreamReader streamReader = new StreamReader(this.Filename))
    {
        if (streamReader.EndOfStream)
        {
            throw new FileFormatException(string.Format(
                CultureInfo.InvariantCulture,
                Bio.Properties.Resource.INVALID_INPUT_FILE,
                Bio.Properties.Resource.FASTA_NAME));
        }

        // Read the first non-blank line.
        string line = streamReader.ReadLine();
        while (line != null && line.Length == 0)
        {
            line = streamReader.ReadLine();
        }

        do
        {
            if (line == null || !line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                throw new FileFormatException(string.Format(
                    CultureInfo.InvariantCulture,
                    Bio.Properties.Resource.INVALID_INPUT_FILE,
                    Bio.Properties.Resource.FASTA_NAME));
            }

            string name = line.Substring(1);
            int bufferPosition = 0;

            // Each record restarts detection from the base alphabet found so far.
            if (tryAutoDetectAlphabet)
            {
                alphabet = baseAlphabet;
            }

            // Read the first non-blank sequence line.
            line = streamReader.ReadLine();
            while (line != null && line.Length == 0)
            {
                line = streamReader.ReadLine();
            }

            if (line == null)
            {
                throw new FileFormatException(string.Format(
                    CultureInfo.InvariantCulture,
                    Properties.Resource.InvalidSymbolInString,
                    string.Empty));
            }

            do
            {
                long requiredLength = (long)bufferPosition + line.Length;

                if (requiredLength >= MaximumSequenceLength)
                {
                    throw new ArgumentOutOfRangeException(
                        string.Format(CultureInfo.CurrentUICulture, Properties.Resource.SequenceDataGreaterthan2GB, name));
                }

                if (requiredLength >= currentBufferSize)
                {
                    // Grow by one BufferSize step as before, but never by less than what
                    // the current line needs; a single step is too small for a line
                    // longer than BufferSize. The size is computed as long and clamped
                    // so that the addition cannot overflow int.
                    long grownSize = Math.Max((long)currentBufferSize + BufferSize, requiredLength + 1);
                    currentBufferSize = (int)Math.Min(grownSize, int.MaxValue);
                    Array.Resize<byte>(ref buffer, currentBufferSize);
                }

                byte[] symbols = ASCIIEncoding.ASCII.GetBytes(line);

                // Array.Copy -- for performance improvement.
                Array.Copy(symbols, 0, buffer, bufferPosition, symbols.Length);

                // Auto detect alphabet if alphabet is set to null, else validate with already set alphabet.
                // NOTE(review): the third argument is passed as an end position
                // (bufferPosition + line.Length); confirm that AutoDetectAlphabet and
                // ValidateSequence expect an end index rather than a length.
                if (tryAutoDetectAlphabet)
                {
                    alphabet = Alphabets.AutoDetectAlphabet(buffer, bufferPosition, bufferPosition + line.Length, alphabet);
                    if (alphabet == null)
                    {
                        throw new FileFormatException(
                            string.Format(CultureInfo.InvariantCulture, Properties.Resource.InvalidSymbolInString, line));
                    }
                }
                else if (this.Alphabet != null)
                {
                    if (!this.Alphabet.ValidateSequence(buffer, bufferPosition, bufferPosition + line.Length))
                    {
                        throw new FileFormatException(
                            string.Format(CultureInfo.InvariantCulture, Properties.Resource.InvalidSymbolInString, line));
                    }
                }

                bufferPosition += line.Length;

                // Read the next non-blank line.
                line = streamReader.ReadLine();
                while (line != null && line.Length == 0)
                {
                    line = streamReader.ReadLine();
                }
            }
            while (line != null && !line.StartsWith(">", StringComparison.OrdinalIgnoreCase));

            // Truncate buffer to remove trailing 0's
            byte[] tmpBuffer = new byte[bufferPosition];
            Array.Copy(buffer, tmpBuffer, bufferPosition);

            if (tryAutoDetectAlphabet)
            {
                // Follow the alphabet-to-base-alphabet map to its end; an alphabet
                // without an entry is its own base alphabet.
                IAlphabet tmpalphabet = alphabet;
                IAlphabet tmpbaseAlphabet = null;
                while (Alphabets.AlphabetToBaseAlphabetMap.TryGetValue(tmpalphabet, out tmpbaseAlphabet))
                {
                    tmpalphabet = tmpbaseAlphabet;
                }

                if (tmpbaseAlphabet == null)
                {
                    tmpbaseAlphabet = tmpalphabet;
                }

                // The first record fixes the base alphabet for the whole file.
                if (baseAlphabet == null)
                {
                    baseAlphabet = tmpbaseAlphabet;
                }

                if (tmpbaseAlphabet != baseAlphabet)
                {
                    throw new FileFormatException(Properties.Resource.FastAContainsMorethanOnebaseAlphabet);
                }
            }

            // In memory sequence
            Sequence sequence = new Sequence(alphabet, tmpBuffer, false);
            sequence.ID = name;
            yield return sequence;
        }
        while (line != null);
    }
}
/// <summary>
/// Reads the next record from the stream and returns it as a QualitativeSequence.
/// </summary>
/// <remarks>
/// A record consists of four non-blank lines: "@" followed by the id, the
/// sequence, "+" optionally followed by the same id, and the quality scores.
/// The FormatType property is updated with the format type used for the record.
/// </remarks>
/// <param name="streamReader">Stream to be parsed.</param>
/// <returns>Returns a QualitativeSequence.</returns>
/// <exception cref="FileFormatException">
/// The stream is at its end, any of the four lines is missing or malformed, the
/// sequence and quality score lengths differ, or the sequence does not match
/// any (or the configured) alphabet.
/// </exception>
private QualitativeSequence ParseOne(StreamReader streamReader)
{
    IAlphabet alphabet = this.Alphabet;
    bool autoDetectFastQFormat = this.AutoDetectFastQFormat;
    FastQFormatType formatType = this.FormatType;

    // Auto-detection is used only when no alphabet was configured on the parser.
    bool detectAlphabet = alphabet == null;

    if (streamReader.EndOfStream)
    {
        throw new FileFormatException(string.Format(
            CultureInfo.InvariantCulture,
            Properties.Resource.INVALID_INPUT_FILE,
            Properties.Resource.FastQName));
    }

    string currentLine;

    // First line: "@" followed by the sequence id.
    do
    {
        currentLine = streamReader.ReadLine();
    }
    while (currentLine != null && currentLine.Length == 0);

    if (currentLine == null || !currentLine.StartsWith("@", StringComparison.Ordinal))
    {
        throw new FileFormatException(string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.INVALID_INPUT_FILE,
            this.Name));
    }

    string id = currentLine.Substring(1).Trim();

    // Second line: the sequence symbols.
    do
    {
        currentLine = streamReader.ReadLine();
    }
    while (currentLine != null && currentLine.Length == 0);

    if (string.IsNullOrEmpty(currentLine))
    {
        string detail = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.FastQ_InvalidSequenceLine,
            id);
        throw new FileFormatException(string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.IOFormatErrorMessage,
            this.Name,
            detail));
    }

    byte[] sequenceData = UTF8Encoding.UTF8.GetBytes(currentLine);

    // Third line: "+", optionally followed by a repetition of the id.
    do
    {
        currentLine = streamReader.ReadLine();
    }
    while (currentLine != null && currentLine.Length == 0);

    if (currentLine == null || !currentLine.StartsWith("+", StringComparison.Ordinal))
    {
        string detail = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.FastQ_InvalidQualityScoreHeaderLine,
            id);
        throw new FileFormatException(string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.IOFormatErrorMessage,
            this.Name,
            detail));
    }

    // When an id is present on the "+" line it has to equal the record id.
    string qualScoreId = currentLine.Substring(1).Trim();
    if (!string.IsNullOrEmpty(qualScoreId) && !id.Equals(qualScoreId))
    {
        string detail = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.FastQ_InvalidQualityScoreHeaderData,
            id);
        throw new FileFormatException(string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.IOFormatErrorMessage,
            this.Name,
            detail));
    }

    // Fourth line: the quality scores.
    do
    {
        currentLine = streamReader.ReadLine();
    }
    while (currentLine != null && currentLine.Length == 0);

    if (string.IsNullOrEmpty(currentLine))
    {
        string detail = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.FastQ_EmptyQualityScoreLine,
            id);
        throw new FileFormatException(string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.IOFormatErrorMessage,
            this.Name,
            detail));
    }

    byte[] qualScores = UTF8Encoding.UTF8.GetBytes(currentLine);

    // There must be exactly one quality score per sequence symbol.
    if (sequenceData.LongLength() != qualScores.LongLength())
    {
        string detail = string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.FastQ_InvalidQualityScoresLength,
            id);
        throw new FileFormatException(string.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.IOFormatErrorMessage,
            this.Name,
            detail));
    }

    if (detectAlphabet)
    {
        // No alphabet configured: detect it from the sequence symbols.
        alphabet = Alphabets.AutoDetectAlphabet(sequenceData, 0, sequenceData.LongLength(), alphabet);
        if (alphabet == null)
        {
            throw new FileFormatException(Properties.Resource.CouldNotIdentifyAlphabetType);
        }
    }
    else
    {
        // Alphabet configured: the sequence symbols have to be valid for it.
        if (!alphabet.ValidateSequence(sequenceData, 0, sequenceData.LongLength()))
        {
            throw new FileFormatException(Properties.Resource.InvalidAlphabetType);
        }
    }

    // Identify the FastQ format type from the scores when auto-detection is on.
    if (autoDetectFastQFormat)
    {
        formatType = IdentifyFastQFormatType(qualScores);
    }

    QualitativeSequence parsedSequence =
        new QualitativeSequence(alphabet, formatType, sequenceData, qualScores, false);
    parsedSequence.ID = id;

    // Update the property so that the next parse will use this format type.
    this.FormatType = formatType;

    return parsedSequence;
}