Пример #1
0
        //ASCIIEncoding ae = new ASCIIEncoding();

        /* Top-level encoding loop. Sequences go through a WriteBitShepherd wrapped
         * around sequenceWriter, qualities go to qualityWriter. A sequence chunk is
         * encoded whenever the sequence index has not run ahead of the quality
         * index; otherwise a quality chunk is encoded. The loop ends when neither
         * stream has anything left, then the bit writer is closed.
         */
        public void Compress(FastqReader reader, BinaryWriter sequenceWriter, BinaryWriter qualityWriter)
        {
            long nextSeq  = 0;
            long nextQual = 0;

            byte[]           qualityBlock = new byte[BUFFER];
            WriteBitShepherd seqBits      = new WriteBitShepherd(sequenceWriter);

            bool working = true;
            while (working)
            {
                Spin("Encoding...");

                if (nextSeq <= nextQual && reader.HasSeqLeft(nextSeq, 1))
                {
                    EncodeSeq(seqBits, ref nextSeq, reader);
                }
                else if (reader.HasQLeft(nextQual, 1))
                {
                    EncodeQual(qualityBlock, ref nextQual, qualityWriter, reader);
                }
                else
                {
                    //nothing left in either stream
                    working = false;
                }
            }
            seqBits.Close();
        }
Пример #2
0
        /* Top-level encoding loop for the single-writer format. After choosing
         * the id encoder, each iteration encodes exactly one chunk, in this
         * priority order: ids (when enabled and not ahead of the sequences),
         * sequences (when not ahead of the qualities, or when qualities are
         * disabled), then qualities. Stops when no branch applies and ends the
         * progress line on stderr.
         */
        public void Compress(FastqReader reader, BinaryWriter writer)
        {
            long nextSeq  = 0;
            long nextQual = 0;
            int  nextId   = 0;

            byte[] block = new byte[BUFFER];
            ChooseIddc(reader, writer);

            for (;;)
            {
                Spin("Encoding...");

                if (encodeIds && nextId <= nextSeq / length && reader.HasIDLeft(nextId, 1))
                {
                    iddc.EncodeId(ref nextId);
                }
                else if (!(encodeQualities && nextSeq > nextQual) && reader.HasSeqLeft(nextSeq, 1))
                {
                    EncodeSeq(block, ref nextSeq, writer, reader);
                }
                else if (encodeQualities && reader.HasQLeft(nextQual, 1))
                {
                    EncodeQual(block, ref nextQual, writer, reader);
                }
                else
                {
                    break;
                }
            }
            Console.Error.WriteLine();
        }
Пример #3
0
        /* Picks the id encoder (iddc) by looking at the first id of the input and
         * writes the chosen type as an int to the writer:
         *   0 - SRA style ids, 2 - "encode" style ids, 1 - anything else.
         * When ids are not encoded nothing is written and a placeholder
         * generator is installed instead.
         */
        void ChooseIddc(FastqReader reader, BinaryWriter writer)
        {
            if (encodeIds)
            {
                string firstId = reader.GetID(0);

                //@SRX000571_SRR002321.54856271 080226_CMLIVERKIDNEY_0007:8:330:23:135 length=36
                Regex sraPattern = new Regex(@"^(@[^.]+\.)\d+\s([\S]+)(?:\d+:){3}\d+.*$", RegexOptions.Singleline);
                Match sraHit     = sraPattern.Match(firstId);
                if (sraHit.Success)   //type 0
                {
                    Regex lengthPattern = new Regex(@"^.+length=\d+$", RegexOptions.Singleline);
                    bool  hasLength     = lengthPattern.IsMatch(firstId);
                    writer.Write(0);
                    iddc = new SraIdDeCompresser(reader, writer, sraHit, hasLength);
                    return;
                }

                //@HWUSI-EAS627_1:3:1:0:370/1 (or /2)
                //@BILLIEHOLIDAY_3_FC30G08AAXX:1:1:0:1966
                Regex encodePattern = new Regex(@"^(@[\S]+)(?:\d+:){3}\d+(\/[12])*$", RegexOptions.Singleline);
                Match encodeHit     = encodePattern.Match(firstId);
                if (encodeHit.Success)   //type 2
                {
                    writer.Write(2);
                    iddc = new EncodeIdDeCompresser(reader, writer, encodeHit);
                    return;
                }

                //type 1
                writer.Write(1);
                iddc = new PlainIdDeCompresser(reader, writer);
            }
            else
            {
                iddc = new PlaceholderIdGenerator();
            }
        }
Пример #4
0
        /* Run-length encodes qualities starting at index i into buffer and writes
         * them to writer as one block. Updates i according to its progression.
         *
         * A run of equal qualities (at most MAX long) is stored as the quality
         * byte plus 128, followed by one byte holding the run length; a lone
         * quality is stored as its plain byte.
         * A block of exactly BUFFER bytes is written as [first][data]; any other
         * block is written as [first + 32][int length][data].
         *
         * When the run marker lands in the last slot of the buffer, its length is
         * parked in qualityRunLengthContinuation and becomes the first byte of the
         * next quality block.
         */
        void EncodeQual(byte[] buffer, ref long i, BinaryWriter writer, FastqReader reader)
        {
            //the first byte starts with 0 if we are encoding a quality
            byte first = (byte)0;
            int  b     = 0;

            //length byte of a run whose marker closed the previous block
            if (qualityRunLengthContinuation != 0)
            {
                buffer[b++] = (byte)qualityRunLengthContinuation;
                qualityRunLengthContinuation = 0;
            }
            while (reader.HasQLeft(i, 1) && b < BUFFER)
            {
                long j  = i + 1;
                int  rl = 1;
                while (reader.HasQLeft(j, 1) && reader.GetQ(j - 1) == reader.GetQ(j) && rl < MAX)
                {
                    j++;
                    rl++;
                }
                if (rl > 1)   //run length
                {
                    ae.GetBytes(reader.GetQ(j - 1).ToString(), 0, 1, buffer, b);
                    buffer[b] = (byte)(buffer[b] + 128);
                    b++;
                    if (b >= BUFFER)
                    {
                        //no room left for the length: carry it over to the next block
                        qualityRunLengthContinuation = rl;
                    }
                    else
                    {
                        buffer[b++] = (byte)rl;
                    }
                    i = j;
                }
                else     //single char
                {
                    ae.GetBytes(reader.GetQ(i).ToString(), 0, 1, buffer, b);
                    b++;
                    i++;
                }
            }

            if (b == BUFFER)
            {
                writer.Write(first);
                writer.Write(buffer);
            }
            else
            {
                first += (byte)32;  //we have to tell the decoder that we have a block with a length
                                    //different than BUFFER
                writer.Write(first);
                writer.Write(b);
                writer.Write(buffer, 0, b);
            }

            //If the qualities ended exactly on a carried-over run length, callers
            //that only re-enter while qualities are left would never flush it and
            //the last run would lose its length. Emit it now as one more block,
            //exactly as the next call would have done.
            if (qualityRunLengthContinuation != 0 && !reader.HasQLeft(i, 1))
            {
                EncodeQual(buffer, ref i, writer, reader);
            }
        }
Пример #5
0
        /* Run-length encodes qualities starting at index i into buffer and appends
         * the encoded bytes to writer (no block header). Updates i according to
         * its progression.
         *
         * A run of equal qualities (at most MAX long) is stored as the quality
         * byte plus 128, followed by one byte holding the run length; a lone
         * quality is stored as its plain byte.
         *
         * When the run marker lands in the last slot of the buffer, its length is
         * parked in qualityRunLengthContinuation and becomes the first byte
         * written by the next call.
         */
        void EncodeQual(byte[] buffer, ref long i, BinaryWriter writer, FastqReader reader)
        {
            int b = 0;

            //length byte of a run whose marker closed the previous buffer
            if (qualityRunLengthContinuation != 0)
            {
                buffer[b++] = (byte)qualityRunLengthContinuation;
                qualityRunLengthContinuation = 0;
            }
            while (reader.HasQLeft(i, 1) && b < BUFFER)
            {
                long j  = i + 1;
                int  rl = 1;
                while (reader.HasQLeft(j, 1) && reader.GetQ(j - 1) == reader.GetQ(j) && rl < MAX)
                {
                    j++;
                    rl++;
                }
                if (rl > 1)   //run length
                {
                    buffer[b] = Convert.ToByte(reader.GetQ(j - 1));
                    buffer[b] = (byte)(buffer[b] + 128);
                    b++;
                    if (b >= BUFFER)
                    {
                        //no room left for the length: carry it over to the next call
                        qualityRunLengthContinuation = rl;
                    }
                    else
                    {
                        buffer[b++] = (byte)rl;
                    }
                    i = j;
                }
                else     //single char
                {
                    buffer[b] = Convert.ToByte(reader.GetQ(i));
                    b++;
                    i++;
                }
            }

            if (b == BUFFER)
            {
                writer.Write(buffer);
            }
            else
            {
                writer.Write(buffer, 0, b);
            }

            //If the qualities ended exactly on a carried-over run length, callers
            //that only re-enter while qualities are left would never flush it and
            //the stream would end on a run marker without its length.
            if (qualityRunLengthContinuation != 0 && !reader.HasQLeft(i, 1))
            {
                writer.Write((byte)qualityRunLengthContinuation);
                qualityRunLengthContinuation = 0;
            }
        }
Пример #6
0
        /* Stores reader and writer and emits the id header: capture groups 1
         * (id name) and 2 (sample name) of the match, each written as an int
         * character count followed by its bytes, then the length flag as a
         * bool. Without the length flag wantedSplit is set to 7.
         */
        public SraIdDeCompresser(FastqReader reader, BinaryWriter writer, Match match, bool length)
        {
            this.writer = writer;
            this.reader = reader;

            for (int group = 1; group <= 2; group++)
            {
                string part = match.Groups[group].Value;
                writer.Write(part.Length);
                writer.Write(ae.GetBytes(part));
            }

            writer.Write(length);
            if (length)
            {
                return;
            }
            wantedSplit = 7;
        }
Пример #7
0
        /* Stores reader and writer and emits the id header: capture group 1 of
         * the match (id name) as an int character count followed by its bytes,
         * then the paired-read suffix (capture group 2) in the same form, or a
         * single int 0 when the id carries no such suffix.
         */
        public EncodeIdDeCompresser(FastqReader reader, BinaryWriter writer, Match match)
        {
            this.reader = reader;
            this.writer = writer;
            string firstPart = match.Groups[1].Value; // id name

            writer.Write(firstPart.Length);
            writer.Write(ae.GetBytes(firstPart));

            // Groups.Count depends only on the pattern, not on what was actually
            // captured, so it cannot tell whether the optional suffix is present;
            // the group's own Success flag can.
            Group pairedGroup = match.Groups[2];
            if (pairedGroup.Success)
            {
                string paired = pairedGroup.Value; // paired reads info
                writer.Write(paired.Length);
                writer.Write(ae.GetBytes(paired));
            }
            else
            {
                writer.Write(0);
            }
        }
Пример #8
0
        /* Top-level encoding loop writing sequences and qualities to two separate
         * writers through one shared byte buffer. A sequence chunk is encoded
         * whenever the sequence index has not run ahead of the quality index;
         * otherwise a quality chunk is encoded. Stops when neither stream has
         * anything left and ends the progress line on stderr.
         */
        public void Compress(FastqReader reader, BinaryWriter sequenceWriter, BinaryWriter qualityWriter)
        {
            long   seqPos  = 0;
            long   qualPos = 0;
            byte[] scratch = new byte[BUFFER];

            bool done = false;
            do
            {
                Spin("Encoding...");

                bool seqTurn = seqPos <= qualPos && reader.HasSeqLeft(seqPos, 1);
                if (seqTurn)
                {
                    EncodeSeq(scratch, ref seqPos, sequenceWriter, reader);
                }
                else if (reader.HasQLeft(qualPos, 1))
                {
                    EncodeQual(scratch, ref qualPos, qualityWriter, reader);
                }
                else
                {
                    done = true;
                }
            } while (!done);

            Console.Error.WriteLine();
        }
Пример #9
0
        /* Encodes sequence bases starting at the given index (i) until at least
         * BIT_BUFFER bits have been written or the sequence ends, and writes the
         * result to the given WriteBitShepherd. Updates i according to its
         * progression.
         *
         * Every code word starts with 7 bits followed by one bit taken from
         * GetRandomBit():
         *   Encode(a, b, c)                        - three bases in one word
         *   0   + 3 bit base index + 5 bit  (n-4)  - run of n = 4..35 equal bases
         *   127 + 3 bit base index + 13 bit (n-4)  - run of n = 36..8195 equal bases
         *   126                                    - end of the sequence blocks
         * A trailing group of 1 or 2 bases is padded with 'N' to three bases.
         */
        void EncodeSeq(WriteBitShepherd bits, ref long i, FastqReader reader)
        {
            const int minRun      = 4;             //shortest run worth a run-length word
            const int maxShortRun = 31 + minRun;   //largest n with n - 4 fitting in 5 bits
            const int maxLongRun  = 8191 + minRun; //largest n with n - 4 fitting in 13 bits

            int writtenBits = 0;

            while (reader.HasSeqLeft(i, 4) && writtenBits < BIT_BUFFER)
            {
                //Check for run-length run.
                if (reader.GetSeq(i) == reader.GetSeq(i + 3) && reader.GetSeq(i) == reader.GetSeq(i + 2) &&
                    reader.GetSeq(i) == reader.GetSeq(i + 1))
                {
                    long j = i + minRun;
                    //Cap the run so that (length - 4) never overflows the 13 bit
                    //field; a longer run simply continues in the next code word.
                    long l = i + maxLongRun;
                    while (j < l && reader.HasSeqLeft(j, 1) && reader.GetSeq(j) == reader.GetSeq(j - 1))
                    {
                        j++;
                    }
                    int length = (int)(j - i);
                    if (length > maxShortRun)
                    {
                        bits.Write(127, 7); //flag for long run length
                        bits.Write(GetRandomBit(), 1);
                        bits.Write(Array.IndexOf(Bases, reader.GetSeq(j - 1)), 3);
                        bits.Write(length - minRun, 13);
                        writtenBits += 24;
                    }
                    else
                    {
                        bits.Write(0, 7); //flag for short run length
                        bits.Write(GetRandomBit(), 1);
                        bits.Write(Array.IndexOf(Bases, reader.GetSeq(j - 1)), 3);
                        bits.Write(length - minRun, 5);
                        writtenBits += 16;
                    }
                    i = j;
                }
                else
                {
                    bits.Write(Encode(reader.GetSeq(i), reader.GetSeq(i + 1), reader.GetSeq(i + 2)), 7);
                    bits.Write(GetRandomBit(), 1);
                    i           += 3;
                    writtenBits += 8;
                }
            }

            bool   end  = false;
            string last = "";

            if (!reader.HasSeqLeft(i, 4))
            {
                while (reader.HasSeqLeft(i, 1)) //could still have 1, 2 or 3 bases
                {
                    last += reader.GetSeq(i++);
                }
                end = true;
            }
            if (last != "")
            {
                last = last.PadRight(3, 'N');
                bits.Write(Encode(last[0], last[1], last[2]), 7);
                bits.Write(GetRandomBit(), 1);
            }
            if (end)
            {
                bits.Write(126, 7); // mark end of sequences blocks
                bits.Write(GetRandomBit(), 1);
            }
        }
Пример #10
0
        /* Command line entry point. Parses the options and then either encodes a
         * fastq input, decodes an encoded input, or (when no mode but -s is given)
         * only runs the reader to produce quality statistics.
         *
         * Returns 0 on success or after showing help, -1 on a usage error and 1
         * when an InvalidOperationException or FileNotFoundException is caught.
         */
        public static int Main(string[] args)
        {
            bool   showHelp        = false;
            bool   encodeIds       = true;
            bool   encodeQualities = true;
            string histogram       = "";
            int    length          = 0;
            string filename        = "";
            string compression     = "gzip";
            string mode            = "";
            int    cutoff          = -1;   // -1 means "no cutoff given"

            var p = new OptionSet()
            {
                { "m|mode=", "the mode: encode|decode",
                  v => mode = v },
                { "l|length=", "the length of the reads",
                  (int v) => length = v },
                { "z|compression=", "the compression method to use: none|gzip|lzma - default is gzip",
                  v => compression = v },
                { "i|noId", "do not encode/decode Ids",
                  v => encodeIds = v == null },
                { "c|cutoff=", "cutoff to be used when encoding losing qualities -\n" +
                  "bases associated with a quality lower than the cutoff will be encoded as\n" +
                  "N",
                  (int v) => cutoff = v },
                { "q|noQuality", "do not encode/decode qualities - will use cutoff if given",
                  v => encodeQualities = v == null },
                { "s|qualityStats=", "create a SVG with a graph of fastq qualities and a .txt with quality values\n" +
                  "associated with counts, the given parameter is\n" +
                  "the desired basename of files (Warning: if they exist they will be REWRITTEN)\n" +
                  "will have effects alone or when in encode mode",
                  (string v) => histogram = v },
                { "h|help", "show this message and exit",
                  v => showHelp = v != null },
            };

            Boolean       stop      = false;  // set when the run must end with a usage error
            List <string> extraArgs = null;   // stays null when option parsing throws
            string        e         = "";     // option parsing error message, if any

            try {
                extraArgs = p.Parse(args);
            }
            catch (OptionException oe) {
                stop = true;
                e    = oe.Message;
            }

            // length and mode are only optional when quality statistics (-s) were requested
            if ((length <= 0 || mode == "") && (histogram == ""))
            {
                Console.Error.WriteLine("Wrong (or no) length given or missing mode without the s option");
                stop = true;
            }

            // at most one positional argument (the file name) is accepted;
            // the !stop test also keeps a null extraArgs from being dereferenced
            if (!stop && extraArgs.Count <= 1)
            {
                if (extraArgs.Count != 0)
                {
                    filename = extraArgs[0];
                }
            }
            else
            {
                stop = true;
            }

            if (mode == "decode" && histogram != "")
            {
                Console.Error.WriteLine("Warning! The option -s has no effect when decoding a file!");
            }

            // help wins over any usage error collected above
            if (showHelp)
            {
                ShowHelp(p);
                return(0);
            }
            if (stop)
            {
                Console.WriteLine(e);
                ShowHelp(p);
                return(-1);
            }

            // encoded data goes to stdout unless lzma redirects it to a file below
            Stream output = Console.OpenStandardOutput();

            if (compression == "lzma")
            {
                if (BitConverter.IsLittleEndian == false)
                {
                    throw new Exception("Lzma compression not implemented for big endian machines.");
                }
                // lzma always needs the positional file name
                if (filename == "")
                {
                    if (mode == "decode")
                    {
                        throw new InvalidOperationException("When decoding lzma files stdin cannot be used as input! " +
                                                            "Use a straight file instead.");
                    }
                    else
                    {
                        throw new InvalidOperationException("When encoding with lzma stdout cannot be used as output! " +
                                                            "Use a straight file instead.");
                    }
                }
                // when encoding with lzma the positional file name is the OUTPUT file
                if (mode == "encode")
                {
                    output = new FileStream(filename, FileMode.Create);
                }
            }


            IFastqDeCompresser fdc = new FastqDeCompresser(length, encodeIds, encodeQualities);

            try {
                if (mode == "encode")
                {
                    // NOTE(review): fdc above was built with the value encodeQualities had
                    // before this adjustment; only the readers below see the forced value -
                    // confirm this is intended.
                    if (cutoff != -1 && !encodeQualities)
                    {
                        encodeQualities = true;
                    }
                    //we need to store qualities to put N in sequences when -c has been used!
                    BinaryWriter writer    = null;
                    Stream       zipWriter = null;
                    if (compression == "none")
                    {
                        writer = new BinaryWriter(output);
                    }
                    else if (compression == "gzip")
                    {
                        zipWriter = new GZipOutputStream(output, BUFFER);
                        writer    = new BinaryWriter(zipWriter);
                    }
                    else if (compression == "lzma")
                    {
                        zipWriter = new LzmaStream(output, false);
                        writer    = new BinaryWriter(zipWriter);
                    }
                    else
                    {
                        Console.Error.WriteLine("Wrong compression method given");
                        ShowHelp(p);
                        return(-1);
                    }
                    FastqReader reader = null;
                    // with lzma the file name was used for the output, so the input comes from Console.In
                    if (filename != "" && compression != "lzma")
                    {
                        if (cutoff == -1)
                        {
                            reader = new FastqReader(filename, length, encodeIds, encodeQualities, histogram);
                        }
                        else
                        {
                            reader = new FastqCutoffReader(filename, length, encodeIds, encodeQualities, cutoff, histogram);
                        }
                    }
                    else
                    {
                        if (cutoff == -1)
                        {
                            reader = new FastqReader(Console.In, length, encodeIds, encodeQualities, histogram);
                        }
                        else
                        {
                            reader = new FastqCutoffReader(Console.In, length, encodeIds, encodeQualities, cutoff, histogram);
                        }
                    }
                    fdc.Compress(reader, writer);
                    reader.Close();
                    writer.Close();
                }
                else if (mode == "decode")
                {
                    // decoded fastq always goes to stdout; input is the given file or stdin
                    EncodedFastqReader reader = null;
                    StreamWriter       writer = new StreamWriter(Console.OpenStandardOutput());
                    if (filename != "")
                    {
                        reader = new EncodedFastqReader(File.OpenRead(filename), compression);
                    }
                    else
                    {
                        reader = new EncodedFastqReader(Console.OpenStandardInput(), compression);
                    }
                    fdc.Decompress(reader, writer);
                    reader.Close();
                    writer.Close();
                }
                else
                {
                    // neither encode nor decode: only valid as a statistics-only run (-s)
                    if (histogram == "")
                    {
                        Console.Error.WriteLine("Wrong or missing mode argument!");
                        ShowHelp(p);
                        return(-1);
                    }
                    else
                    {
                        FastqReader fq;
                        if (filename != "")
                        {
                            fq = new FastqReader(filename, histogram);
                        }
                        else
                        {
                            fq = new FastqReader(Console.In, histogram);
                        }
                        fq.Run();
                        fq.Close();
                    }
                }
            } catch (InvalidOperationException ioe) {
                Console.Error.WriteLine(ioe.Message);
                return(1);
            } catch (FileNotFoundException fnfe) {
                Console.Error.WriteLine("File {0} not found {1}!", filename, fnfe.Message);
                return(1);
            }

            return(0);
        }
Пример #11
0
        /* Command line entry point. Parses the options and then either encodes a
         * fastq input (the given file or stdin) into "<prefix>.qfq[.gz|.lzma]", or
         * decodes that file to stdout.
         *
         * Returns 0 on success or after showing help, -1 on a usage error and 1
         * when an InvalidOperationException or FileNotFoundException is caught.
         */
        public static int Main(string[] args)
        {
            bool   showHelp        = false;
            bool   encodeIds       = true;
            int    length          = 0;
            string filename        = "";
            string compression     = "gzip";
            string mode            = "";
            string prefix          = "";
            string suffix          = ".qfq";
            int    cutoff          = -1;   // -1 means "no cutoff given"
            bool   encodeQualities = true;

            var p = new OptionSet()
            {
                { "m|mode=", "the mode: encode|decode",
                  v => mode = v },
                { "l|length=", "the length of the reads",
                  (int v) => length = v },
                { "z|compression=", "the compression method to use: none|gzip|lzma - default is gzip",
                  v => compression = v },
                { "p|prefix=", "the prefix for the output file",
                  v => prefix = v },
                { "i|noId", "do not encode/decode Ids",
                  v => encodeIds = v == null },
                { "c|cutoff=", "cutoff to be used when encoding losing qualities -\n" +
                  "bases associated with a quality lower than the cutoff will be encoded as\n" +
                  "N",
                  (int v) => cutoff = v },
                { "q|noQuality", "do not encode/decode qualities - will use cutoff if given",
                  v => encodeQualities = v == null },
                { "h|help", "show this message and exit",
                  v => showHelp = v != null },
            };

            bool          stop      = false;  // set when the run must end with a usage error
            List <string> extraArgs = null;   // stays null when option parsing throws
            string        e         = "";     // message printed before the help text

            try {
                extraArgs = p.Parse(args);
            }
            catch (OptionException oe) {
                stop = true;
                e    = oe.Message;
            }

            if (length <= 0 || mode == "" || (mode == "encode" && prefix == ""))
            {
                stop = true;
            }

            if (mode == "encode" && cutoff != -1 && encodeQualities)
            {
                stop = true;
                e    = "In encode mode -c option can be used only with -q option!";
            }

            // At most one positional argument (the input file) is accepted.
            // extraArgs is null when Parse threw, so it must not be touched once
            // stop is set.
            if (!stop && extraArgs.Count <= 1)
            {
                if (extraArgs.Count != 0)
                {
                    filename = extraArgs[0];
                }
            }
            else
            {
                stop = true;
            }

            // help wins over any usage error collected above
            if (showHelp)
            {
                ShowHelp(p);
                return(0);
            }
            if (stop)
            {
                Console.WriteLine(e);
                ShowHelp(p);
                return(-1);
            }

            string outputFile = prefix + suffix;

            if (compression == "gzip")
            {
                outputFile += ".gz";
            }
            else if (compression == "lzma")
            {
                outputFile += ".lzma";
            }


            IFastqDeCompresser fdc = new FastqDeCompresser(length, encodeIds, encodeQualities);

            try {
                if (mode == "encode")
                {
                    FileStream   outStream = new FileStream(outputFile, FileMode.Create);
                    BinaryWriter writer    = null;
                    Stream       zipWriter = null;
                    FastqReader  reader    = null;
                    try {
                        if (compression == "none")
                        {
                            writer = new BinaryWriter(outStream);
                        }
                        else if (compression == "gzip")
                        {
                            zipWriter = new GZipOutputStream(outStream);
                            writer    = new BinaryWriter(zipWriter);
                        }
                        else if (compression == "lzma")
                        {
                            zipWriter = new LzmaEncodeStream(outStream);
                            writer    = new BinaryWriter(zipWriter);
                        }
                        else
                        {
                            Console.Error.WriteLine("Wrong compression method given");
                            ShowHelp(p);
                            return(-1);
                        }
                        if (filename != "")
                        {
                            if (cutoff == -1)
                            {
                                reader = new FastqReader(filename, length);
                            }
                            else
                            {
                                reader = new FastqCutoffReader(filename, length, cutoff);
                            }
                        }
                        else
                        {
                            if (cutoff == -1)
                            {
                                reader = new FastqReader(Console.In, length);
                            }
                            else
                            {
                                reader = new FastqCutoffReader(Console.In, length, cutoff);
                            }
                        }
                        fdc.Compress(reader, writer);
                    } finally {
                        // reader and writer are still null when we bail out early
                        // (unknown compression method, reader construction failed)
                        if (reader != null)
                        {
                            reader.Close();
                        }
                        if (writer != null)
                        {
                            writer.Close();
                        }
                        outStream.Close();
                    }
                }
                else if (mode == "decode")
                {
                    EncodedFastqReader reader = new EncodedFastqReader(File.OpenRead(outputFile), compression);
                    StreamWriter       writer = new StreamWriter(Console.OpenStandardOutput());
                    try {
                        fdc.Decompress(reader, writer);
                    } finally {
                        reader.Close();
                        writer.Close();
                    }
                }
                else
                {
                    ShowHelp(p);
                    return(-1);
                }
            } catch (InvalidOperationException ioe) {
                Console.Error.WriteLine(ioe.Message);
                return(1);
            } catch (FileNotFoundException fnfe) {
                Console.Error.WriteLine("File {0} not found {1}!", filename, fnfe.Message);
                return(1);
            }

            return(0);
        }
Пример #12
0
        /* Packs sequence bases starting at index i into buffer, one byte per code
         * word, until BUFFER bytes are produced or fewer than four bases remain,
         * then writes one block to writer. Updates i according to its progression.
         *
         * A run of 4..19 equal bases becomes 128 + ((run - 4) << 3) + base index;
         * any other three bases become Encode(a, b, c). A trailing group of 1 to
         * 3 bases is padded with 'N' and appended as one extra byte.
         *
         * A block of exactly BUFFER bytes with no trailing byte is written as
         * [128][data]; any other block as [160][int length][data][trailing byte].
         */
        void EncodeSeq(byte[] buffer, ref long i, BinaryWriter writer, FastqReader reader)
        {
            int used = 0;

            while (reader.HasSeqLeft(i, 4) && used < BUFFER)
            {
                bool fourEqual = reader.GetSeq(i) == reader.GetSeq(i + 3) &&
                                 reader.GetSeq(i) == reader.GetSeq(i + 2) &&
                                 reader.GetSeq(i) == reader.GetSeq(i + 1);
                if (!fourEqual)
                {
                    //plain triplet
                    buffer[used] = Encode(reader.GetSeq(i), reader.GetSeq(i + 1), reader.GetSeq(i + 2));
                    used++;
                    i += 3;
                    continue;
                }

                //run-length: extend the run up to 19 bases
                long runEnd = i + 4;
                long limit  = i + 19;
                while (runEnd < limit && reader.HasSeqLeft(runEnd, 1) && reader.GetSeq(runEnd) == reader.GetSeq(i))
                {
                    runEnd++;
                }
                buffer[used] = (byte)(128 + ((runEnd - i - 4) << 3) + Array.IndexOf(Bases, reader.GetSeq(i)));
                used++;
                i = runEnd;
            }

            //collect the 1, 2 or 3 bases that may be left at the very end
            string tail = "";
            if (!reader.HasSeqLeft(i, 4))
            {
                while (reader.HasSeqLeft(i, 1))
                {
                    tail += reader.GetSeq(i++);
                }
            }

            bool hasTail  = tail != "";
            byte tailByte = 0;
            if (hasTail)
            {
                tail     = tail.PadRight(3, 'N');
                tailByte = Encode(tail[0], tail[1], tail[2]);
            }

            //the first byte starts with 1 if we are encoding a seq
            byte header = (byte)128;
            if (used == BUFFER && !hasTail)
            {
                writer.Write(header);
                writer.Write(buffer);
                return;
            }

            //tell the decoder that this block has a length different than BUFFER
            header = (byte)(header + 32);
            writer.Write(header);
            writer.Write(hasTail ? used + 1 : used);
            writer.Write(buffer, 0, used);
            if (hasTail)
            {
                writer.Write(tailByte);
            }
        }
Пример #13
0
 /* Keeps the reader and the writer for later use; nothing is written here. */
 public PlainIdDeCompresser(FastqReader reader, BinaryWriter writer)
 {
     this.writer = writer;
     this.reader = reader;
 }