/* Picks the ID decompresser used for the rest of the decoding and stores it
 * in the iddc field.
 *
 * When IDs are not encoded a PlaceholderIdGenerator is installed and nothing
 * is read from the stream. Otherwise one 32 bit integer is consumed from
 * reader.Reader and used as a type tag:
 *   0 - SraIdDeCompresser (also receives the read length)
 *   1 - PlainIdDeCompresser
 *   2 - EncodeIdDeCompresser
 * Any other tag raises an Exception carrying the offending value.
 */
void ChooseIddc(EncodedFastqReader reader)
{
    if (!encodeIds)
    {
        iddc = new PlaceholderIdGenerator();
        return;
    }

    int idKind = reader.Reader.ReadInt32();
    switch (idKind)
    {
        case 0:
            iddc = new SraIdDeCompresser(reader, length);
            break;

        case 1:
            iddc = new PlainIdDeCompresser(reader);
            break;

        case 2:
            iddc = new EncodeIdDeCompresser(reader);
            break;

        default:
            throw new Exception("Missing header info in compressed file! " + idKind);
    }
}
/* Builds the decompresser for SRA style IDs by reading the ID template from
 * the encoded stream.
 *
 * Consumed from encReader.Reader, in this order:
 *   - a 32 bit byte count followed by that many bytes: the first ID part
 *   - a 32 bit byte count followed by that many bytes: the second ID part
 *   - one boolean: true when the IDs end with " length=<read length>"
 * The resulting template, with placeholders {0}..{4}, is stored in idBuilder.
 *
 * Examples of the IDs this template models:
 *   @SRX000571_SRR002322.18437692 080317_CM-KID-LIV-2-REPEAT_0003:7:330:466:87 length=36
 *   @SRR029238.3 SOLEXAWS1_20FDNAAXX:1:1:737:1043
 */
public SraIdDeCompresser(EncodedFastqReader encReader, int length)
{
    this.encReader = encReader;

    // First part, e.g. "@SRX000571_SRR002322."
    int headSize = encReader.Reader.ReadInt32();
    string head = ae.GetString(encReader.Reader.ReadBytes(headSize));

    // Second part, e.g. "080317_CM-KID-LIV-2-REPEAT_0003:"
    // (a space separates it from the counter that follows the first part)
    int tailSize = encReader.Reader.ReadInt32();
    string tail = ae.GetString(encReader.Reader.ReadBytes(tailSize));

    bool hasLengthSuffix = encReader.Reader.ReadBoolean();

    string template = head + "{0} " + tail + "{1}:{2}:{3}:{4}";
    if (hasLengthSuffix)
    {
        template = template + " length=" + length;
    }
    idBuilder = template;
}
/* Builds the decompresser for "encode" style IDs by reading the ID template
 * from the encoded stream.
 *
 * Consumed from encReader.Reader, in this order:
 *   - a 32 bit byte count followed by that many bytes: the ID prefix,
 *     e.g. "@HWUSI-EAS627_1"
 *   - a 32 bit byte count; only when it is not zero, that many bytes holding
 *     the pair marker, e.g. "/1" or "/2"
 * The resulting template, with placeholders {0}..{3}, is stored in idBuilder.
 *
 * Examples of handled IDs:
 *   @HWUSI-EAS627_1:3:1:0:370/1 (or /2)
 *   @BILLIEHOLIDAY_3_FC30G08AAXX:1:1:0:1966
 * Shape of the handled IDs written as a regular expression:
 *   ^(@[^:]+)(?:\d+:){3}\d+(\/[12])*$
 * IDs that are ignored (not handled by this class):
 *   @080514_HWI-EAS229_0029_20768AAXX_5_1_120:242
 *   @TUPAC:1:1:5:710#0/1
 */
public EncodeIdDeCompresser(EncodedFastqReader encReader)
{
    this.encReader = encReader;

    int prefixSize = encReader.Reader.ReadInt32();
    string prefix = ae.GetString(encReader.Reader.ReadBytes(prefixSize));

    int pairSize = encReader.Reader.ReadInt32();
    string pairMarker = (pairSize != 0)
        ? ae.GetString(encReader.Reader.ReadBytes(pairSize))
        : "";

    idBuilder = prefix + "{0}:{1}:{2}:{3}" + pairMarker;
}
/* Command line entry point: parses the options, then encodes a fastq file,
 * decodes an encoded one, or (no valid mode but -s given) only computes the
 * quality statistics.
 *
 * args: raw command line arguments; at most one positional argument (the
 *       file name) is accepted.
 *
 * Returns 0 on success or when help was requested, -1 on wrong usage and 1
 * when an InvalidOperationException or a FileNotFoundException is reported.
 *
 * The lzma precondition checks run inside the try block so that their
 * InvalidOperationException is reported by the handler at the bottom instead
 * of escaping as an unhandled exception. Output writers are closed in finally
 * blocks so they are released even when (de)compression fails.
 */
public static int Main(string[] args)
{
    bool showHelp = false;
    bool encodeIds = true;
    bool encodeQualities = true;
    string histogram = "";
    int length = 0;
    string filename = "";
    string compression = "gzip";
    string mode = "";
    int cutoff = -1;

    var p = new OptionSet() {
        { "m|mode=", "the mode: encode|decode", v => mode = v },
        { "l|length=", "the length of the reads", (int v) => length = v },
        { "z|compression=", "the compression method to use: none|gzip|lzma - default is gzip", v => compression = v },
        { "i|noId", "do not encode/decode Ids", v => encodeIds = v == null },
        { "c|cutoff=", "cutoff to be used when encoding losing qualities -\n" + "bases associated with a quality lower than the cutoff will be encoded as\n" + "N", (int v) => cutoff = v },
        { "q|noQuality", "do not encode/decode qualities - will use cutoff if given", v => encodeQualities = v == null },
        { "s|qualityStats=", "create a SVG with a graph of fastq qualities and a .txt with quality values\n" + "associated with counts, the given parameter is\n" + "the desired basename of files (Warning: if they exist they will be REWRITTEN)\n" + "will have effects alone or when in encode mode", (string v) => histogram = v },
        { "h|help", "show this message and exit", v => showHelp = v != null },
    };

    Boolean stop = false;
    List<string> extraArgs = null;
    string e = "";
    try
    {
        extraArgs = p.Parse(args);
    }
    catch (OptionException oe)
    {
        stop = true;
        e = oe.Message;
    }

    if ((length <= 0 || mode == "") && (histogram == ""))
    {
        Console.Error.WriteLine("Wrong (or no) length given or missing mode without the s option");
        stop = true;
    }

    // extraArgs is null when Parse threw; the !stop test keeps it from being dereferenced.
    if (!stop && extraArgs.Count <= 1)
    {
        if (extraArgs.Count != 0)
        {
            filename = extraArgs[0];
        }
    }
    else
    {
        stop = true;
    }

    if (mode == "decode" && histogram != "")
    {
        Console.Error.WriteLine("Warning! The option -s has no effect when decoding a file!");
    }

    if (showHelp)
    {
        ShowHelp(p);
        return 0;
    }
    if (stop)
    {
        Console.WriteLine(e);
        ShowHelp(p);
        return -1;
    }

    Stream output = Console.OpenStandardOutput();

    try
    {
        if (compression == "lzma")
        {
            if (BitConverter.IsLittleEndian == false)
            {
                throw new Exception("Lzma compression not implemented for big endian machines.");
            }
            if (filename == "")
            {
                if (mode == "decode")
                {
                    throw new InvalidOperationException("When decoding lzma files stdin cannot be used as input! " + "Use a straight file instead.");
                }
                else
                {
                    throw new InvalidOperationException("When encoding with lzma stdout cannot be used as output! " + "Use a straight file instead.");
                }
            }
            if (mode == "encode")
            {
                // With lzma the positional argument names the OUTPUT file; input then comes from stdin.
                output = new FileStream(filename, FileMode.Create);
            }
        }

        IFastqDeCompresser fdc = new FastqDeCompresser(length, encodeIds, encodeQualities);

        if (mode == "encode")
        {
            //we need to store qualities to put N in sequences when -c has been used!
            // NOTE(review): fdc was built before this override, so only the readers see the
            // forced value - confirm this is intended.
            if (cutoff != -1 && !encodeQualities)
            {
                encodeQualities = true;
            }

            BinaryWriter writer = null;
            try
            {
                if (compression == "none")
                {
                    writer = new BinaryWriter(output);
                }
                else if (compression == "gzip")
                {
                    writer = new BinaryWriter(new GZipOutputStream(output, BUFFER));
                }
                else if (compression == "lzma")
                {
                    writer = new BinaryWriter(new LzmaStream(output, false));
                }
                else
                {
                    Console.Error.WriteLine("Wrong compression method given");
                    ShowHelp(p);
                    return -1;
                }

                FastqReader reader = null;
                if (filename != "" && compression != "lzma")
                {
                    if (cutoff == -1)
                    {
                        reader = new FastqReader(filename, length, encodeIds, encodeQualities, histogram);
                    }
                    else
                    {
                        reader = new FastqCutoffReader(filename, length, encodeIds, encodeQualities, cutoff, histogram);
                    }
                }
                else
                {
                    if (cutoff == -1)
                    {
                        reader = new FastqReader(Console.In, length, encodeIds, encodeQualities, histogram);
                    }
                    else
                    {
                        reader = new FastqCutoffReader(Console.In, length, encodeIds, encodeQualities, cutoff, histogram);
                    }
                }

                fdc.Compress(reader, writer);
                reader.Close();
            }
            finally
            {
                // writer is still null when the compression method was rejected above.
                if (writer != null)
                {
                    writer.Close();
                }
            }
        }
        else if (mode == "decode")
        {
            EncodedFastqReader reader = null;
            StreamWriter writer = new StreamWriter(Console.OpenStandardOutput());
            if (filename != "")
            {
                reader = new EncodedFastqReader(File.OpenRead(filename), compression);
            }
            else
            {
                reader = new EncodedFastqReader(Console.OpenStandardInput(), compression);
            }

            try
            {
                fdc.Decompress(reader, writer);
            }
            finally
            {
                reader.Close();
                writer.Close();
            }
        }
        else
        {
            if (histogram == "")
            {
                Console.Error.WriteLine("Wrong or missing mode argument!");
                ShowHelp(p);
                return -1;
            }
            else
            {
                // Statistics only run.
                FastqReader fq;
                if (filename != "")
                {
                    fq = new FastqReader(filename, histogram);
                }
                else
                {
                    fq = new FastqReader(Console.In, histogram);
                }
                fq.Run();
                fq.Close();
            }
        }
    }
    catch (InvalidOperationException ioe)
    {
        Console.Error.WriteLine(ioe.Message);
        return 1;
    }
    catch (FileNotFoundException fnfe)
    {
        Console.Error.WriteLine("File {0} not found {1}!", filename, fnfe.Message);
        return 1;
    }

    return 0;
}
/* Command line entry point: parses the options, then either encodes a fastq
 * file into "<prefix>.qfq[.gz|.lzma]" or decodes that file to standard output.
 *
 * args: raw command line arguments; at most one positional argument (the
 *       input file name used when encoding) is accepted.
 *
 * Returns 0 on success or when help was requested, -1 on wrong usage and 1
 * when an InvalidOperationException or a FileNotFoundException is reported.
 *
 * Two failure paths are guarded here:
 *   - extraArgs is still null when option parsing threw, so it is tested
 *     before being dereferenced;
 *   - writer is still null in the finally block when the compression method
 *     was rejected (or a stream constructor threw), so it is tested before
 *     being closed.
 */
public static int Main(string[] args)
{
    bool showHelp = false;
    bool encodeIds = true;
    int length = 0;
    string filename = "";
    string compression = "gzip";
    string mode = "";
    string prefix = "";
    string suffix = ".qfq";
    int cutoff = -1;
    bool encodeQualities = true;

    var p = new OptionSet() {
        { "m|mode=", "the mode: encode|decode", v => mode = v },
        { "l|length=", "the length of the reads", (int v) => length = v },
        { "z|compression=", "the compression method to use: none|gzip|lzma - default is gzip", v => compression = v },
        { "p|prefix=", "the prefix for the output file", v => prefix = v },
        { "i|noId", "do not encode/decode Ids", v => encodeIds = v == null },
        { "c|cutoff=", "cutoff to be used when encoding losing qualities -\n" + "bases associated with a quality lower than the cutoff will be encoded as\n" + "N", (int v) => cutoff = v },
        { "q|noQuality", "do not encode/decode qualities - will use cutoff if given", v => encodeQualities = v == null },
        { "h|help", "show this message and exit", v => showHelp = v != null },
    };

    bool stop = false;
    List<string> extraArgs = null;
    string e = "";
    try
    {
        extraArgs = p.Parse(args);
    }
    catch (OptionException oe)
    {
        stop = true;
        e = oe.Message;
    }

    if (length <= 0 || mode == "" || (mode == "encode" && prefix == ""))
    {
        stop = true;
    }
    if (mode == "encode" && cutoff != -1 && encodeQualities)
    {
        stop = true;
        e = "In encode mode -c option can be used only with -q option!";
    }

    // extraArgs stays null when Parse threw (stop is already set in that case).
    if (extraArgs == null || extraArgs.Count > 1)
    {
        stop = true;
    }
    else if (extraArgs.Count == 1)
    {
        filename = extraArgs[0];
    }

    if (showHelp)
    {
        ShowHelp(p);
        return 0;
    }
    if (stop)
    {
        Console.WriteLine(e);
        ShowHelp(p);
        return -1;
    }

    string outputFile = prefix + suffix;
    if (compression == "gzip")
    {
        outputFile += ".gz";
    }
    else if (compression == "lzma")
    {
        outputFile += ".lzma";
    }

    IFastqDeCompresser fdc = new FastqDeCompresser(length, encodeIds, encodeQualities);
    try
    {
        if (mode == "encode")
        {
            FileStream outStream = new FileStream(outputFile, FileMode.Create);
            BinaryWriter writer = null;
            Stream zipWriter = null;
            try
            {
                if (compression == "none")
                {
                    writer = new BinaryWriter(outStream);
                }
                else if (compression == "gzip")
                {
                    zipWriter = new GZipOutputStream(outStream);
                    writer = new BinaryWriter(zipWriter);
                }
                else if (compression == "lzma")
                {
                    zipWriter = new LzmaEncodeStream(outStream);
                    writer = new BinaryWriter(zipWriter);
                }
                else
                {
                    Console.Error.WriteLine("Wrong compression method given");
                    ShowHelp(p);
                    return -1;
                }

                FastqReader reader = null;
                if (filename != "")
                {
                    if (cutoff == -1)
                    {
                        reader = new FastqReader(filename, length);
                    }
                    else
                    {
                        reader = new FastqCutoffReader(filename, length, cutoff);
                    }
                }
                else
                {
                    if (cutoff == -1)
                    {
                        reader = new FastqReader(Console.In, length);
                    }
                    else
                    {
                        reader = new FastqCutoffReader(Console.In, length, cutoff);
                    }
                }

                fdc.Compress(reader, writer);
                reader.Close();
            }
            finally
            {
                // The finally block also runs for the "return -1" above, where writer is null.
                if (writer != null)
                {
                    writer.Close();
                }
                outStream.Close(); //XXX should close correct streams!
            }
        }
        else if (mode == "decode")
        {
            EncodedFastqReader reader = new EncodedFastqReader(File.OpenRead(outputFile), compression);
            StreamWriter writer = new StreamWriter(Console.OpenStandardOutput());
            try
            {
                fdc.Decompress(reader, writer);
            }
            finally
            {
                reader.Close();
                writer.Close();
            }
        }
        else
        {
            ShowHelp(p);
            return -1;
        }
    }
    catch (InvalidOperationException ioe)
    {
        Console.Error.WriteLine(ioe.Message);
        return 1;
    }
    catch (FileNotFoundException fnfe)
    {
        Console.Error.WriteLine("File {0} not found {1}!", filename, fnfe.Message);
        return 1;
    }

    return 0;
}
/* Main decompression method that decodes the compressed file and
 * directly writes the obtained fastq in the given StreamWriter.
 *
 * reader: source of the encoded sequence and quality bytes (and, through
 *         ChooseIddc, of the ID information).
 * writer: destination of the decoded fastq text.
 *
 * Every read is emitted as exactly `length` bases and, when encodeQualities
 * is set, a "+" line followed by exactly `length` quality characters.
 *
 * Sequence bytes:
 *   - high bit set: run length; the low 7 bits hold the count minus 4 in
 *     their upper part (bits >> 3) and an index into Bases in their lowest
 *     3 bits;
 *   - high bit clear: index into the `decoding` table, which yields three
 *     bases.
 * Quality bytes:
 *   - high bit clear: the byte is the quality character itself;
 *   - high bit set: the low 7 bits are the quality character and the next
 *     byte is the run length.
 * Runs and triplets may straddle two reads; the unwritten remainder is kept
 * in the "continue*" variables and emitted at the start of the next read.
 *
 * Throws InvalidOperationException when the encoded data ends in the middle
 * of a read (truncated file, or a length/quality setting that does not match
 * the one used when encoding). Without that check the inner loops would spin
 * forever because nothing could advance them.
 */
public void Decompress(EncodedFastqReader reader, StreamWriter writer)
{
    long IdByte = 0;
    long seqI = 0;   // index of the next sequence byte to fetch
    int s = 0;       // bases written so far for the current read
    long qualI = 0;  // index of the next quality byte to fetch
    int q = 0;       // qualities written so far for the current read
    int howmany = 0;
    char which = ' ';
    byte encoded;

    //continuations variables
    int continueSequenceRunLength = 0;
    char[] continueSequenceChar = new char[] { ' ', ' ' };
    //0 - the rl char or the second char of the triplet across reads
    //1 - the rl char or the third char of the triplet across reads
    int continueQualityRunLength = 0;
    char continueQualityChar = ' ';

    ChooseIddc(reader);
    writer.WriteLine(iddc.GetNextID(ref IdByte));

    while (reader.HasSeqLeft(seqI, 1) || continueSequenceRunLength != 0 ||
           continueSequenceChar[0] != ' ' ||
           (encodeQualities && reader.HasQLeft(qualI, 1)) || continueQualityRunLength != 0)
    {
        q = 0;
        s = 0;

        while (s < length)
        {
            if (continueSequenceRunLength != 0)
            {
                // Finish (or keep emitting) a run started in a previous read.
                while (continueSequenceRunLength > 0 && s < length)
                {
                    s++;
                    continueSequenceRunLength--;
                    writer.Write(continueSequenceChar[0]);
                }
                if (continueSequenceRunLength == 0)
                {
                    continueSequenceChar[0] = ' ';
                }
            }
            else if (continueSequenceChar[0] != ' ')
            {
                //we assume that 1 or 2 char(s) will always fit in the new read
                //ie reads will always be longer than 2
                writer.Write(continueSequenceChar[0]);
                s++;
                if (continueSequenceChar[1] != ' ')
                {
                    writer.Write(continueSequenceChar[1]);
                    s++;
                }
                continueSequenceChar[0] = ' ';
                continueSequenceChar[1] = ' ';
            }
            else if (reader.HasSeqLeft(seqI, 1))
            {
                encoded = reader.GetSeqByte(seqI);
                seqI++;
                if ((encoded & 128) == 128) //run length
                {
                    encoded = (byte)(127 & encoded);
                    howmany = (encoded >> 3) + 4;
                    which = Bases[(int)(encoded & 7)];
                    int i = 0;
                    while (i < howmany && s < length)
                    {
                        i++;
                        s++;
                        writer.Write(which);
                    }
                    if (i < howmany)
                    {
                        // The run crosses the read boundary: remember what is left.
                        continueSequenceChar[0] = which;
                        continueSequenceRunLength = howmany - i;
                    }
                }
                else //three bases
                {
                    string triplet = decoding[(int)encoded];
                    int k = 0;
                    while (s < length && k < 3)
                    {
                        writer.Write(triplet[k++]);
                        s++;
                    }
                    // Whatever did not fit is carried over to the next read.
                    int i = 0;
                    while (k < 3)
                    {
                        continueSequenceChar[i] = triplet[k];
                        k++;
                        i++;
                    }
                }
            }
            else
            {
                // Nothing pending and nothing left to fetch, yet the read is incomplete:
                // no branch could ever advance s, so report the problem instead of looping.
                throw new InvalidOperationException("Unexpected end of sequence data: the compressed file is truncated or the given read length is wrong.");
            }
        }

        if (encodeQualities)
        {
            writer.WriteLine("\n+");
        }
        while (encodeQualities && q < length)
        {
            if (continueQualityRunLength != 0)
            {
                // Finish (or keep emitting) a quality run started in a previous read.
                while (continueQualityRunLength > 0 && q < length)
                {
                    q++;
                    continueQualityRunLength--;
                    writer.Write(continueQualityChar);
                }
                if (continueQualityRunLength == 0)
                {
                    continueQualityChar = ' ';
                }
            }
            else if (reader.HasQLeft(qualI, 1))
            {
                encoded = reader.GetQualByte(qualI);
                qualI++;
                if ((encoded & 128) != 128) //single quality data
                {
                    which = Convert.ToChar(encoded);
                    writer.Write(which);
                    q++;
                }
                else //run length
                {
                    encoded = (byte)(127 & encoded);
                    which = Convert.ToChar(encoded);
                    howmany = (int)reader.GetQualByte(qualI);
                    qualI++;
                    int i = 0;
                    while (i < howmany && q < length)
                    {
                        i++;
                        q++;
                        writer.Write(which);
                    }
                    if (i < howmany)
                    {
                        continueQualityRunLength = howmany - i;
                        continueQualityChar = which;
                    }
                }
            }
            else
            {
                // Same situation as for the sequences: q can no longer advance.
                throw new InvalidOperationException("Unexpected end of quality data: the compressed file is truncated or was not encoded with qualities for this read length.");
            }
        }

        if (reader.HasSeqLeft(seqI, 1) || continueSequenceRunLength != 0)
        {
            //if we have got a sequence run length it cannot be padding and if we have
            //a continuation derived for a triplet we will have sequences left if it's
            //not a padding
            writer.WriteLine("\n" + iddc.GetNextID(ref IdByte));
            if (seqI % 10000 == 0)
            {
                Spin("Decoding...");
            }
        }
        else if (!reader.HasSeqLeft(seqI, 1) && continueSequenceChar[0] != ' ')
        {
            // Only leftover triplet characters remain and no more bytes follow:
            // they are treated as padding, so decoding ends here.
            break;
        }
    }

    writer.WriteLine();
    Console.Error.WriteLine();
}
/* Creates the plain ID decompresser.
 *
 * encReader: the encoded input; only the reference is stored here. Unlike
 *            the SraIdDeCompresser and EncodeIdDeCompresser constructors,
 *            nothing is read from the stream at construction time.
 */
public PlainIdDeCompresser(EncodedFastqReader encReader)
{
    this.encReader = encReader;
}