//ASCIIEncoding ae = new ASCIIEncoding();

/* Top-level compression driver.
 * Alternates between sequence and quality encoding so that neither output
 * stream runs ahead of the other: sequences are encoded while they are not
 * ahead of the qualities, otherwise qualities are encoded. Sequences are
 * written through a WriteBitShepherd wrapped around sequenceWriter, which is
 * closed once nothing is left to encode. */
public void Compress(FastqReader reader, BinaryWriter sequenceWriter, BinaryWriter qualityWriter)
{
    long sequencePos = 0;
    long qualityPos = 0;
    byte[] qualityBlock = new byte[BUFFER];
    WriteBitShepherd bits = new WriteBitShepherd(sequenceWriter);

    bool working = true;
    while (working)
    {
        Spin("Encoding...");

        if (seqIsDue(sequencePos, qualityPos) && reader.HasSeqLeft(sequencePos, 1))
        {
            EncodeSeq(bits, ref sequencePos, reader);
        }
        else if (reader.HasQLeft(qualityPos, 1))
        {
            EncodeQual(qualityBlock, ref qualityPos, qualityWriter, reader);
        }
        else
        {
            //neither stream has anything left
            working = false;
        }
    }

    bits.Close();

    //sequences go first whenever they have not overtaken the qualities
    bool seqIsDue(long seqAt, long qualAt)
    {
        return seqAt <= qualAt;
    }
}
/* Top-level compression driver for the single-writer format.
 * Picks the id encoder first (ChooseIddc), then loops, on each pass encoding
 * exactly one of: the next id, the next block of sequences, or the next block
 * of qualities - whichever is due - until the reader has nothing left.
 * Ids and qualities are skipped when encodeIds / encodeQualities are off. */
public void Compress(FastqReader reader, BinaryWriter writer)
{
    long sequencePos = 0;
    long qualityPos = 0;
    int nextId = 0;
    byte[] block = new byte[BUFFER];

    ChooseIddc(reader, writer);

    bool working = true;
    while (working)
    {
        Spin("Encoding...");

        //an id is due once the sequences have reached the read it belongs to
        if (encodeIds && nextId <= sequencePos / length && reader.HasIDLeft(nextId, 1))
        {
            iddc.EncodeId(ref nextId);
        }
        //sequences go next unless they are already ahead of the qualities
        else if ((sequencePos <= qualityPos || !encodeQualities) && reader.HasSeqLeft(sequencePos, 1))
        {
            EncodeSeq(block, ref sequencePos, writer, reader);
        }
        else if (encodeQualities && reader.HasQLeft(qualityPos, 1))
        {
            EncodeQual(block, ref qualityPos, writer, reader);
        }
        else
        {
            working = false;
        }
    }

    Console.Error.WriteLine();
}
/* Selects the id encoder (iddc) by looking at the first id of the input and
 * writes the chosen type code to the output:
 *   0 - SRA style id, 2 - "encode" style id, 1 - anything else (plain).
 * When ids are not encoded a PlaceholderIdGenerator is used and nothing is
 * written. */
void ChooseIddc(FastqReader reader, BinaryWriter writer)
{
    if (!encodeIds)
    {
        iddc = new PlaceholderIdGenerator();
        return;
    }

    string firstId = reader.GetID(0);

    //@SRX000571_SRR002321.54856271 080226_CMLIVERKIDNEY_0007:8:330:23:135 length=36
    Regex sraPattern = new Regex(@"^(@[^.]+\.)\d+\s([\S]+)(?:\d+:){3}\d+.*$", RegexOptions.Singleline);
    Match sraHit = sraPattern.Match(firstId);
    if (sraHit.Success) //type 0
    {
        Regex lengthPattern = new Regex(@"^.+length=\d+$", RegexOptions.Singleline);
        bool hasLengthSuffix = lengthPattern.IsMatch(firstId);
        writer.Write(0);
        iddc = new SraIdDeCompresser(reader, writer, sraHit, hasLengthSuffix);
        return;
    }

    //@HWUSI-EAS627_1:3:1:0:370/1 (or /2)
    //@BILLIEHOLIDAY_3_FC30G08AAXX:1:1:0:1966
    Regex encodePattern = new Regex(@"^(@[\S]+)(?:\d+:){3}\d+(\/[12])*$", RegexOptions.Singleline);
    Match encodeHit = encodePattern.Match(firstId);
    if (encodeHit.Success) //type 2
    {
        writer.Write(2);
        iddc = new EncodeIdDeCompresser(reader, writer, encodeHit);
        return;
    }

    //type 1
    writer.Write(1);
    iddc = new PlainIdDeCompresser(reader, writer);
}
/* Encodes qualities starting at index i until the buffer is full or the
 * qualities end, then writes the result as one block to the given writer.
 * Updates i according to its progression.
 *
 * Block layout: a header byte (0 for a full block of BUFFER bytes, 32 for a
 * block followed by an explicit int length), then the payload. Inside the
 * payload a single quality is stored as its byte, a run of equal qualities as
 * (quality byte + 128) followed by a byte holding the run length.
 *
 * If a run marker lands in the last slot of the buffer its count is kept in
 * qualityRunLengthContinuation and becomes the first byte of the next block. */
void EncodeQual(byte[] buffer, ref long i, BinaryWriter writer, FastqReader reader)
{
    //the first byte starts with 0 if we are encoding a quality
    byte first = (byte)0;
    int b = 0;

    //the previous block ended on a run marker: its count opens this block
    if (qualityRunLengthContinuation != 0)
    {
        buffer[b++] = (byte)qualityRunLengthContinuation;
        qualityRunLengthContinuation = 0;
    }

    while (reader.HasQLeft(i, 1) && b < BUFFER)
    {
        //measure how many equal qualities follow position i (capped at MAX)
        long j = i + 1;
        int rl = 1;
        while (reader.HasQLeft(j, 1) && reader.GetQ(j - 1) == reader.GetQ(j) && rl < MAX)
        {
            j++;
            rl++;
        }

        if (rl > 1) //run length
        {
            ae.GetBytes(reader.GetQ(j - 1).ToString(), 0, 1, buffer, b);
            buffer[b] = (byte)(buffer[b] + 128);
            b++;
            if (b >= BUFFER)
            {
                //no room left for the count: carry it over to the next block
                qualityRunLengthContinuation = rl;
            }
            else
            {
                buffer[b++] = (byte)rl;
            }
            i = j;
        }
        else //single char
        {
            ae.GetBytes(reader.GetQ(i).ToString(), 0, 1, buffer, b);
            b++;
            i++;
        }
    }

    if (b == BUFFER)
    {
        writer.Write(first);
        writer.Write(buffer);
    }
    else
    {
        first += (byte)32; //we have to tell the decoder that we have a block with a length
        //different than BUFFER
        writer.Write(first);
        writer.Write(b);
        writer.Write(buffer, 0, b);
    }

    //A carried-over count is normally written by the next call, but the caller
    //only calls again while qualities are left. If the run just encoded was the
    //last one, flush the count now as a one-byte explicit-length block so that
    //it is not silently dropped.
    //NOTE(review): assumes the decoder accepts a block that holds only a
    //continuation count - confirm against the decoder.
    if (qualityRunLengthContinuation != 0 && !reader.HasQLeft(i, 1))
    {
        writer.Write((byte)32);
        writer.Write(1);
        writer.Write((byte)qualityRunLengthContinuation);
        qualityRunLengthContinuation = 0;
    }
}
/* Encodes qualities starting at index i until the buffer is full or the
 * qualities end, then appends the encoded bytes to the given writer (no block
 * header is written). Updates i according to its progression.
 *
 * A single quality is stored as its byte value, a run of equal qualities as
 * (quality byte + 128) followed by a byte holding the run length.
 *
 * If a run marker lands in the last slot of the buffer its count is kept in
 * qualityRunLengthContinuation and is written first by the next call - or at
 * the end of this call when no qualities are left, so the count of a final
 * run is never lost. */
void EncodeQual(byte[] buffer, ref long i, BinaryWriter writer, FastqReader reader)
{
    int b = 0;

    //the previous call ended on a run marker: its count comes first
    if (qualityRunLengthContinuation != 0)
    {
        buffer[b++] = (byte)qualityRunLengthContinuation;
        qualityRunLengthContinuation = 0;
    }

    while (reader.HasQLeft(i, 1) && b < BUFFER)
    {
        //measure how many equal qualities follow position i (capped at MAX)
        long j = i + 1;
        int rl = 1;
        while (reader.HasQLeft(j, 1) && reader.GetQ(j - 1) == reader.GetQ(j) && rl < MAX)
        {
            j++;
            rl++;
        }

        if (rl > 1) //run length
        {
            buffer[b] = Convert.ToByte(reader.GetQ(j - 1));
            buffer[b] = (byte)(buffer[b] + 128);
            b++;
            if (b >= BUFFER)
            {
                //no room left for the count: carry it over
                qualityRunLengthContinuation = rl;
            }
            else
            {
                buffer[b++] = (byte)rl;
            }
            i = j;
        }
        else //single char
        {
            buffer[b] = Convert.ToByte(reader.GetQ(i));
            b++;
            i++;
        }
    }

    if (b == BUFFER)
    {
        writer.Write(buffer);
    }
    else
    {
        writer.Write(buffer, 0, b);
    }

    //The caller only calls again while qualities are left, so a count carried
    //over from the very last run would never be written. Append it here.
    if (qualityRunLengthContinuation != 0 && !reader.HasQLeft(i, 1))
    {
        writer.Write((byte)qualityRunLengthContinuation);
        qualityRunLengthContinuation = 0;
    }
}
/* Builds the id (de)compresser for SRA style ids and writes its header:
 * capture groups 1 (id name) and 2 (sample name) of the given match, each as
 * an int length followed by the bytes produced by ae, then the length flag.
 * When the ids carry no length suffix, wantedSplit is set to 7. */
public SraIdDeCompresser(FastqReader reader, BinaryWriter writer, Match match, bool length)
{
    this.reader = reader;
    this.writer = writer;

    //group 1: id name, group 2: sample name
    string[] headerParts = { match.Groups[1].Value, match.Groups[2].Value };
    for (int part = 0; part < headerParts.Length; part++)
    {
        writer.Write(headerParts[part].Length);
        writer.Write(ae.GetBytes(headerParts[part]));
    }

    writer.Write(length);

    if (length)
    {
        return;
    }
    wantedSplit = 7;
}
/* Builds the id (de)compresser for "encode" style ids and writes its header:
 * capture group 1 (id name) as an int length followed by the bytes produced
 * by ae, then the paired-reads suffix (capture group 2) in the same form, or
 * a single int 0 when the id has no such suffix. */
public EncodeIdDeCompresser(FastqReader reader, BinaryWriter writer, Match match)
{
    this.reader = reader;
    this.writer = writer;

    string firstPart = match.Groups[1].Value; // id name
    writer.Write(firstPart.Length);
    writer.Write(ae.GetBytes(firstPart));

    //Groups.Count is the number of groups defined in the pattern, not the
    //number that took part in the match, so it cannot tell whether the
    //optional suffix is present; Group.Success can.
    Group pairedGroup = match.Groups[2];
    if (pairedGroup.Success)
    {
        string paired = pairedGroup.Value; // paired reads info
        writer.Write(paired.Length);
        writer.Write(ae.GetBytes(paired));
    }
    else
    {
        writer.Write(0);
    }
}
/* Top-level compression driver for the two-writer format.
 * Alternates between sequence and quality encoding so that neither output
 * stream runs ahead of the other: sequences are encoded (to sequenceWriter)
 * while they are not ahead of the qualities, otherwise qualities are encoded
 * (to qualityWriter). Stops when the reader has nothing left. */
//public void Compress(FastqReader reader, BinaryWriter writer)
public void Compress(FastqReader reader, BinaryWriter sequenceWriter, BinaryWriter qualityWriter)
{
    long sequencePos = 0;
    long qualityPos = 0;
    byte[] block = new byte[BUFFER];

    for (bool working = true; working; )
    {
        Spin("Encoding...");

        if (sequencePos <= qualityPos && reader.HasSeqLeft(sequencePos, 1))
        {
            EncodeSeq(block, ref sequencePos, sequenceWriter, reader);
        }
        else if (reader.HasQLeft(qualityPos, 1))
        {
            EncodeQual(block, ref qualityPos, qualityWriter, reader);
        }
        else
        {
            working = false;
        }
    }

    Console.Error.WriteLine();
}
/* Encodes sequences starting at the given index (i) until BIT_BUFFER bits
 * have been written or the sequences end, and writes the result to the given
 * WriteBitShepherd. Updates i according to its progression.
 *
 * Every code is 7 bits followed by one bit taken from GetRandomBit():
 *   - three bases:  Encode(b0, b1, b2)
 *   - short run:    0,   then 3 bits base index and 5 bits  (length - 4)
 *   - long run:     127, then 3 bits base index and 13 bits (length - 4)
 *   - end marker:   126, written once the last bases have been encoded
 * A run is four or more equal bases. Fewer than three trailing bases are
 * padded with 'N' before being encoded. */
void EncodeSeq(WriteBitShepherd bits, ref long i, FastqReader reader)
{
    int writtenBits = 0;

    while (reader.HasSeqLeft(i, 4) && writtenBits < BIT_BUFFER)
    {
        //Check for run-length run.
        if (reader.GetSeq(i) == reader.GetSeq(i + 3) &&
            reader.GetSeq(i) == reader.GetSeq(i + 2) &&
            reader.GetSeq(i) == reader.GetSeq(i + 1))
        {
            long j = i + 4;
            //The long form stores (length - 4) in 13 bits, so the longest run
            //that fits is 4 + 8191 bases. Scanning further (the limit used to
            //be i + 8199) produced values that do not fit in the field; a
            //longer run is now simply continued by the next code.
            long l = i + 4 + 8191;
            while (j < l && reader.HasSeqLeft(j, 1) && reader.GetSeq(j) == reader.GetSeq(j - 1))
            {
                j++;
            }

            int length = (int)(j - i);
            if (length > 35)
            {
                bits.Write(127, 7); //flag for long run length
                bits.Write(GetRandomBit(), 1);
                bits.Write(Array.IndexOf(Bases, reader.GetSeq(j - 1)), 3);
                bits.Write(length - 4, 13);
                writtenBits += 24;
            }
            else
            {
                bits.Write(0, 7); //flag for short run length
                bits.Write(GetRandomBit(), 1);
                bits.Write(Array.IndexOf(Bases, reader.GetSeq(j - 1)), 3);
                bits.Write(length - 4, 5);
                writtenBits += 16;
            }
            i = j;
        }
        else
        {
            bits.Write(Encode(reader.GetSeq(i), reader.GetSeq(i + 1), reader.GetSeq(i + 2)), 7);
            bits.Write(GetRandomBit(), 1);
            i += 3;
            writtenBits += 8;
        }
    }

    bool end = false;
    string last = "";
    if (!reader.HasSeqLeft(i, 4))
    {
        while (reader.HasSeqLeft(i, 1)) //could still have 1, 2 or 3 bases
        {
            last += reader.GetSeq(i++);
        }
        end = true;
    }

    if (last != "")
    {
        last = last.PadRight(3, 'N');
        bits.Write(Encode(last[0], last[1], last[2]), 7);
        bits.Write(GetRandomBit(), 1);
        writtenBits += 8;
    }

    if (end)
    {
        bits.Write(126, 7); // mark end of sequences blocks
        bits.Write(GetRandomBit(), 1);
        writtenBits += 8;
    }
}
/* Program entry point.
 * Parses the command line, then either encodes a fastq file, decodes an
 * encoded file, or - when only -s is given - produces quality statistics.
 * Returns 0 on success (or after showing help), -1 on a usage error and 1
 * when an InvalidOperationException or FileNotFoundException is caught.
 * Readers and writers are closed even when encoding or decoding fails. */
public static int Main(string[] args)
{
    bool showHelp = false;
    bool encodeIds = true;
    bool encodeQualities = true;
    string histogram = "";
    int length = 0;
    string filename = "";
    string compression = "gzip";
    string mode = "";
    int cutoff = -1;

    var p = new OptionSet() {
        { "m|mode=", "the mode: encode|decode", v => mode = v },
        { "l|length=", "the length of the reads", (int v) => length = v },
        { "z|compression=", "the compression method to use: none|gzip|lzma - default is gzip", v => compression = v },
        { "i|noId", "do not encode/decode Ids", v => encodeIds = v == null },
        { "c|cutoff=", "cutoff to be used when encoding losing qualities -\n" +
          "bases associated with a quality lower than the cutoff will be encoded as\n" +
          "N", (int v) => cutoff = v },
        { "q|noQuality", "do not encode/decode qualities - will use cutoff if given", v => encodeQualities = v == null },
        { "s|qualityStats=", "create a SVG with a graph of fastq qualities and a .txt with quality values\n" +
          "associated with counts, the given parameter is\n" +
          "the desired basename of files (Warning: if they exist they will be REWRITTEN)\n" +
          "will have effects alone or when in encode mode", (string v) => histogram = v },
        { "h|help", "show this message and exit", v => showHelp = v != null },
    };

    bool stop = false;
    List<string> extraArgs = null;
    string e = "";
    try
    {
        extraArgs = p.Parse(args);
    }
    catch (OptionException oe)
    {
        stop = true;
        e = oe.Message;
    }

    if ((length <= 0 || mode == "") && (histogram == ""))
    {
        Console.Error.WriteLine("Wrong (or no) length given or missing mode without the s option");
        stop = true;
    }

    //at most one positional argument (the file name) is accepted;
    //extraArgs is only looked at when parsing succeeded
    if (!stop && extraArgs.Count <= 1)
    {
        if (extraArgs.Count != 0)
        {
            filename = extraArgs[0];
        }
    }
    else
    {
        stop = true;
    }

    if (mode == "decode" && histogram != "")
    {
        Console.Error.WriteLine("Warning! The option -s has no effect when decoding a file!");
    }

    if (showHelp)
    {
        ShowHelp(p);
        return(0);
    }
    if (stop)
    {
        Console.WriteLine(e);
        ShowHelp(p);
        return(-1);
    }

    Stream output = Console.OpenStandardOutput();
    if (compression == "lzma")
    {
        if (BitConverter.IsLittleEndian == false)
        {
            throw new Exception("Lzma compression not implemented for big endian machines.");
        }
        if (filename == "")
        {
            if (mode == "decode")
            {
                throw new InvalidOperationException("When decoding lzma files stdin cannot be used as input! " +
                                                    "Use a straight file instead.");
            }
            else
            {
                throw new InvalidOperationException("When encoding with lzma stdout cannot be used as output! " +
                                                    "Use a straight file instead.");
            }
        }
        //with lzma the file name is the output file when encoding
        if (mode == "encode")
        {
            output = new FileStream(filename, FileMode.Create);
        }
    }

    IFastqDeCompresser fdc = new FastqDeCompresser(length, encodeIds, encodeQualities);
    try
    {
        if (mode == "encode")
        {
            //we need to store qualities to put N in sequences when -c has been used!
            if (cutoff != -1 && !encodeQualities)
            {
                encodeQualities = true;
            }

            BinaryWriter writer = null;
            Stream zipWriter = null;
            if (compression == "none")
            {
                writer = new BinaryWriter(output);
            }
            else if (compression == "gzip")
            {
                zipWriter = new GZipOutputStream(output, BUFFER);
                writer = new BinaryWriter(zipWriter);
            }
            else if (compression == "lzma")
            {
                zipWriter = new LzmaStream(output, false);
                writer = new BinaryWriter(zipWriter);
            }
            else
            {
                Console.Error.WriteLine("Wrong compression method given");
                ShowHelp(p);
                return(-1);
            }

            FastqReader reader = null;
            try
            {
                //with lzma the file name was used for the output, so the input is stdin
                if (filename != "" && compression != "lzma")
                {
                    if (cutoff == -1)
                    {
                        reader = new FastqReader(filename, length, encodeIds, encodeQualities, histogram);
                    }
                    else
                    {
                        reader = new FastqCutoffReader(filename, length, encodeIds, encodeQualities, cutoff, histogram);
                    }
                }
                else
                {
                    if (cutoff == -1)
                    {
                        reader = new FastqReader(Console.In, length, encodeIds, encodeQualities, histogram);
                    }
                    else
                    {
                        reader = new FastqCutoffReader(Console.In, length, encodeIds, encodeQualities, cutoff, histogram);
                    }
                }
                fdc.Compress(reader, writer);
            }
            finally
            {
                //same order as on success: reader first, then writer
                try
                {
                    if (reader != null)
                    {
                        reader.Close();
                    }
                }
                finally
                {
                    writer.Close();
                }
            }
        }
        else if (mode == "decode")
        {
            EncodedFastqReader reader = null;
            StreamWriter writer = new StreamWriter(Console.OpenStandardOutput());
            try
            {
                if (filename != "")
                {
                    reader = new EncodedFastqReader(File.OpenRead(filename), compression);
                }
                else
                {
                    reader = new EncodedFastqReader(Console.OpenStandardInput(), compression);
                }
                fdc.Decompress(reader, writer);
            }
            finally
            {
                try
                {
                    if (reader != null)
                    {
                        reader.Close();
                    }
                }
                finally
                {
                    writer.Close();
                }
            }
        }
        else
        {
            if (histogram == "")
            {
                Console.Error.WriteLine("Wrong or missing mode argument!");
                ShowHelp(p);
                return(-1);
            }
            else
            {
                //no mode but -s given: only compute the quality statistics
                FastqReader fq;
                if (filename != "")
                {
                    fq = new FastqReader(filename, histogram);
                }
                else
                {
                    fq = new FastqReader(Console.In, histogram);
                }
                fq.Run();
                fq.Close();
            }
        }
    }
    catch (InvalidOperationException ioe)
    {
        Console.Error.WriteLine(ioe.Message);
        return(1);
    }
    catch (FileNotFoundException fnfe)
    {
        Console.Error.WriteLine("File {0} not found {1}!", filename, fnfe.Message);
        return(1);
    }
    return(0);
}
/* Program entry point.
 * Parses the command line, then either encodes a fastq file into
 * <prefix>.qfq[.gz|.lzma] or decodes that file to standard output.
 * Returns 0 on success (or after showing help), -1 on a usage error and 1
 * when an InvalidOperationException or FileNotFoundException is caught. */
public static int Main(string[] args)
{
    bool showHelp = false;
    bool encodeIds = true;
    int length = 0;
    string filename = "";
    string compression = "gzip";
    string mode = "";
    string prefix = "";
    string suffix = ".qfq";
    int cutoff = -1;
    bool encodeQualities = true;

    var p = new OptionSet() {
        { "m|mode=", "the mode: encode|decode", v => mode = v },
        { "l|length=", "the length of the reads", (int v) => length = v },
        { "z|compression=", "the compression method to use: none|gzip|lzma - default is gzip", v => compression = v },
        { "p|prefix=", "the prefix for the output file", v => prefix = v },
        { "i|noId", "do not encode/decode Ids", v => encodeIds = v == null },
        { "c|cutoff=", "cutoff to be used when encoding losing qualities -\n" +
          "bases associated with a quality lower than the cutoff will be encoded as\n" +
          "N", (int v) => cutoff = v },
        { "q|noQuality", "do not encode/decode qualities - will use cutoff if given", v => encodeQualities = v == null },
        { "h|help", "show this message and exit", v => showHelp = v != null },
    };

    bool stop = false;
    List<string> extraArgs = null;
    string e = "";
    try
    {
        extraArgs = p.Parse(args);
    }
    catch (OptionException oe)
    {
        stop = true;
        e = oe.Message;
    }

    if (length <= 0 || mode == "" || (mode == "encode" && prefix == ""))
    {
        stop = true;
    }
    if (mode == "encode" && cutoff != -1 && encodeQualities)
    {
        stop = true;
        e = "In encode mode -c option can be used only with -q option!";
    }

    //extraArgs stays null when option parsing failed, so it must be checked
    //before use; at most one positional argument (the file name) is accepted
    if (extraArgs != null && extraArgs.Count <= 1)
    {
        if (extraArgs.Count != 0)
        {
            filename = extraArgs[0];
        }
    }
    else
    {
        stop = true;
    }

    if (showHelp)
    {
        ShowHelp(p);
        return(0);
    }
    if (stop)
    {
        Console.WriteLine(e);
        ShowHelp(p);
        return(-1);
    }

    string outputFile = prefix + suffix;
    if (compression == "gzip")
    {
        outputFile += ".gz";
    }
    else if (compression == "lzma")
    {
        outputFile += ".lzma";
    }

    IFastqDeCompresser fdc = new FastqDeCompresser(length, encodeIds, encodeQualities);
    try
    {
        if (mode == "encode")
        {
            FileStream outStream = new FileStream(outputFile, FileMode.Create);
            BinaryWriter writer = null;
            Stream zipWriter = null;
            FastqReader reader = null;
            try
            {
                if (compression == "none")
                {
                    writer = new BinaryWriter(outStream);
                }
                else if (compression == "gzip")
                {
                    zipWriter = new GZipOutputStream(outStream);
                    writer = new BinaryWriter(zipWriter);
                }
                else if (compression == "lzma")
                {
                    zipWriter = new LzmaEncodeStream(outStream);
                    writer = new BinaryWriter(zipWriter);
                }
                else
                {
                    Console.Error.WriteLine("Wrong compression method given");
                    ShowHelp(p);
                    return(-1);
                }

                if (filename != "")
                {
                    if (cutoff == -1)
                    {
                        reader = new FastqReader(filename, length);
                    }
                    else
                    {
                        reader = new FastqCutoffReader(filename, length, cutoff);
                    }
                }
                else
                {
                    if (cutoff == -1)
                    {
                        reader = new FastqReader(Console.In, length);
                    }
                    else
                    {
                        reader = new FastqCutoffReader(Console.In, length, cutoff);
                    }
                }
                fdc.Compress(reader, writer);
            }
            finally
            {
                try
                {
                    if (reader != null)
                    {
                        reader.Close();
                    }
                }
                finally
                {
                    //writer is still null when the compression method was
                    //rejected or a stream constructor threw
                    if (writer != null)
                    {
                        writer.Close();
                    }
                    //closed explicitly as well so the file is released even
                    //when no writer was ever wrapped around it
                    outStream.Close();
                }
            }
        }
        else if (mode == "decode")
        {
            EncodedFastqReader reader = new EncodedFastqReader(File.OpenRead(outputFile), compression);
            StreamWriter writer = new StreamWriter(Console.OpenStandardOutput());
            try
            {
                fdc.Decompress(reader, writer);
            }
            finally
            {
                reader.Close();
                writer.Close();
            }
        }
        else
        {
            ShowHelp(p);
            return(-1);
        }
    }
    catch (InvalidOperationException ioe)
    {
        Console.Error.WriteLine(ioe.Message);
        return(1);
    }
    catch (FileNotFoundException fnfe)
    {
        Console.Error.WriteLine("File {0} not found {1}!", filename, fnfe.Message);
        return(1);
    }
    return(0);
}
/* Encodes sequences starting at the given index (i) until the buffer is full
 * or the sequences end, and writes the result as one block to the given
 * BinaryWriter. Updates i according to its progression.
 *
 * Block layout: a header byte (128 for a full block of BUFFER bytes, 128 + 32
 * for a block followed by an explicit int length), then the payload. In the
 * payload three bases become one Encode(...) byte; a run of 4 to 19 equal
 * bases becomes 128 + ((run - 4) << 3) + index of the base in Bases. One to
 * three trailing bases are padded with 'N' and appended as a last byte. */
void EncodeSeq(byte[] buffer, ref long i, BinaryWriter writer, FastqReader reader)
{
    int used = 0;

    while (reader.HasSeqLeft(i, 4) && used < BUFFER)
    {
        //Check for run-length run.
        bool fourEqual = reader.GetSeq(i) == reader.GetSeq(i + 3)
                         && reader.GetSeq(i) == reader.GetSeq(i + 2)
                         && reader.GetSeq(i) == reader.GetSeq(i + 1);
        if (!fourEqual)
        {
            buffer[used++] = Encode(reader.GetSeq(i), reader.GetSeq(i + 1), reader.GetSeq(i + 2));
            i += 3;
            continue;
        }

        //extend the run, at most 19 bases in total
        long runEnd = i + 4;
        long runLimit = i + 19;
        while (runEnd < runLimit && reader.HasSeqLeft(runEnd, 1) && reader.GetSeq(runEnd) == reader.GetSeq(i))
        {
            runEnd++;
        }
        long extraBases = runEnd - i - 4;
        buffer[used++] = (byte)(128 + (extraBases << 3) + Array.IndexOf(Bases, reader.GetSeq(i)));
        i = runEnd;
    }

    //collect what is left when fewer than four bases remain
    string tail = "";
    if (!reader.HasSeqLeft(i, 4))
    {
        while (reader.HasSeqLeft(i, 1)) //could still have 1, 2 or 3 bases
        {
            tail += reader.GetSeq(i++);
        }
    }

    bool hasTail = tail != "";
    byte tailCode = 0;
    if (hasTail)
    {
        tail = tail.PadRight(3, 'N');
        tailCode = Encode(tail[0], tail[1], tail[2]);
    }

    //the first byte starts with 1 if we are encoding a seq
    byte header = (byte)128;
    if (used == BUFFER && !hasTail)
    {
        writer.Write(header);
        writer.Write(buffer);
        return;
    }

    //we have to tell the decoder that we have a block with a length
    //different than BUFFER
    header += (byte)32;
    writer.Write(header);
    writer.Write(used + (hasTail ? 1 : 0));
    writer.Write(buffer, 0, used);
    if (hasTail)
    {
        writer.Write(tailCode);
    }
}
/* Builds the id (de)compresser used for ids of no recognised format.
 * Only stores the reader and the writer; nothing is written here. */
public PlainIdDeCompresser(FastqReader reader, BinaryWriter writer)
{
    this.writer = writer;
    this.reader = reader;
}