/* Selects the ID (de)compressor stored in iddc by inspecting the first ID
 * returned by the reader, and writes an integer tag identifying the chosen
 * scheme to the writer (0 = SRA, 2 = ENCODE, 1 = plain).
 * When encodeIds is false a PlaceholderIdGenerator is installed instead and
 * nothing is written.
 */
void ChooseIddc(FastqReader reader, BinaryWriter writer)
{
    if (!encodeIds)
    {
        iddc = new PlaceholderIdGenerator();
        return;
    }

    // SRA-style header, e.g.
    //   @SRX000571_SRR002321.54856271 080226_CMLIVERKIDNEY_0007:8:330:23:135 length=36
    Regex sraPattern = new Regex(@"^(@[^.]+\.)\d+\s([\S]+)(?:\d+:){3}\d+.*$", RegexOptions.Singleline);
    // Detects the trailing "length=N" field of such a header.
    Regex lengthSuffixPattern = new Regex(@"^.+length=\d+$", RegexOptions.Singleline);
    // ENCODE-style header, e.g.
    //   @HWUSI-EAS627_1:3:1:0:370/1 (or /2)
    //   @BILLIEHOLIDAY_3_FC30G08AAXX:1:1:0:1966
    Regex encodePattern = new Regex(@"^(@[\S]+)(?:\d+:){3}\d+(\/[12])*$", RegexOptions.Singleline);

    String firstId = reader.GetID(0);
    Match sraResult = sraPattern.Match(firstId);
    Match encodeResult = encodePattern.Match(firstId);

    // type 0: SRA
    if (sraResult.Success)
    {
        bool hasLengthSuffix = lengthSuffixPattern.IsMatch(firstId);
        writer.Write(0);
        iddc = new SraIdDeCompresser(reader, writer, sraResult, hasLengthSuffix);
        return;
    }

    // type 2: ENCODE
    if (encodeResult.Success)
    {
        writer.Write(2);
        iddc = new EncodeIdDeCompresser(reader, writer, encodeResult);
        return;
    }

    // type 1: anything else is stored as plain text
    writer.Write(1);
    iddc = new PlainIdDeCompresser(reader, writer);
}
/* Encodes IDs starting at the given index (id) until the block buffer is
 * full or the fastq file ends, and writes the resulting block to the writer.
 * Updates id according to its advancements.
 *
 * Block layout written to the writer:
 *   - full block  (exactly ID_BUFFER chars): marker byte 64, then the bytes;
 *   - short block (fewer than ID_BUFFER chars): marker byte 64 + 32 = 96,
 *     then the int length, then the bytes.
 *
 * An ID that does not fit in the current block is split: the part that fits
 * is emitted now, the remainder is kept in idContinuation and emitted first
 * on the next call. A remainder longer than ID_BUFFER is emitted one full
 * block per call instead of being dropped.
 */
public void EncodeId(ref int id)
{
    // should check if "mode" is right (ie. reader && writer != null)
    // but we avoid doing so for efficiency

    // marker byte for an ID block; the short-block flag is added when the
    // block holds fewer than ID_BUFFER chars
    const byte idBlockMarker = 64;
    const byte shortBlockFlag = 32;

    int b = 0; // number of chars collected for this block
    StringBuilder ids = new StringBuilder();

    if (idContinuation.Length != 0)
    {
        if (idContinuation.Length > ID_BUFFER)
        {
            // The carry-over alone overflows a block: emit one full block
            // from it and keep the rest for the next call.
            ids.Append(idContinuation.ToString(0, ID_BUFFER));
            idContinuation.Remove(0, ID_BUFFER);
            b = ID_BUFFER;
        }
        else
        {
            ids.Append(idContinuation.ToString());
            b = idContinuation.Length;
            idContinuation = new StringBuilder();
        }
    }

    while (reader.HasIDLeft(id, 1) && b < ID_BUFFER)
    {
        string currentId = reader.GetID(id);
        int room = ID_BUFFER - b;

        if (currentId.Length > room)
        {
            // continuation: the head fills the block, the tail is carried over
            ids.Append(currentId, 0, room);
            idContinuation.Append(currentId, room, currentId.Length - room);
            b = ID_BUFFER;
        }
        else
        {
            ids.Append(currentId);
            b += currentId.Length;
        }
        id++;
        //here method to deal with known ID's structure
    }

    // we use ascii encoding, so 1 char = 1 byte
    byte[] payload = ae.GetBytes(ids.ToString());
    if (b == ID_BUFFER)
    {
        writer.Write(idBlockMarker);
        writer.Write(payload);
    }
    else
    {
        // b < ID_BUFFER here: tell the decoder that this block has a length
        // different than ID_BUFFER
        writer.Write((byte)(idBlockMarker + shortBlockFlag));
        writer.Write(b);
        writer.Write(payload);
    }
}
/* Encodes IDs starting at the given index (id) until the block buffer is
 * full or the fastq file ends, and writes the resulting block to the writer.
 * Updates id according to its advancements.
 *
 * Each ID is split on separators; fields 1..4 (field 0 is skipped) are
 * converted to UInt16 and written to the scratch stream encodedId,
 * ENCODED_ID_LENGTH bytes of which form one record.
 *
 * Block layout written to the writer:
 *   - full block  (exactly ID_BUFFER bytes): marker byte 64, then the bytes;
 *   - short block (fewer than ID_BUFFER bytes): marker byte 64 + 32 = 96,
 *     then the int length, then the bytes.
 *
 * A record that does not fit in the current block is split: the leading
 * bytes are emitted now, the trailing bytes are saved in idContinuation
 * (continuationLength of them) and emitted first on the next call.
 *
 * Throws FormatException when an ID has fewer than 5 fields.
 */
public void EncodeId(ref int id)
{
    idBuffer.Seek(0, SeekOrigin.Begin);
    encodedId.Seek(0, SeekOrigin.Begin);

    // should check if "mode" is right (ie. reader && writer != null)
    // but we avoid doing so for efficiency

    // marker byte for an ID block; 32 is added below for a short block
    byte first = (byte)64;
    int b = 0; // number of bytes placed in idBuffer for this block

    if (continuationLength != 0)
    {
        // Write the carried-over bytes straight into the block buffer, so
        // they are emitted even when no further ID is left to encode.
        idBuffer.Write(idContinuation, 0, continuationLength);
        b = continuationLength;
        continuationLength = 0;
    }
    // The carry-over no longer passes through encodedId, so every record
    // starts at offset 0 of the scratch buffer.
    writtenContinuation = 0;

    // we assume that a continuation will never be longer than ID_BUFFER
    while (reader.HasIDLeft(id, 1) && b < ID_BUFFER)
    {
        string[] currentId = reader.GetID(id).Split(separators);
        if (currentId.Length < 5)
        {
            throw new FormatException("invalid ID format");
        }

        // encode the four numeric fields; the first item is skipped
        for (int field = 1; field <= 4; field++)
        {
            encodedId.Write(Convert.ToUInt16(currentId[field]));
        }
        byte[] record = ((MemoryStream)encodedId.BaseStream).GetBuffer();

        int room = ID_BUFFER - b;
        if (ENCODED_ID_LENGTH > room)
        {
            // continuation: the first `room` bytes complete this block,
            // the remaining bytes are saved for the next one
            continuationLength = ENCODED_ID_LENGTH - room;
            Array.Copy(record, room, idContinuation, 0, continuationLength);
            idBuffer.Write(record, 0, room);
            b = ID_BUFFER;
        }
        else
        {
            idBuffer.Write(record, 0, ENCODED_ID_LENGTH);
            b += ENCODED_ID_LENGTH;
        }

        id++;
        // rewind the scratch stream for the next record
        encodedId.Seek(0, SeekOrigin.Begin);
    }

    byte[] block = ((MemoryStream)idBuffer.BaseStream).GetBuffer();
    if (b == ID_BUFFER)
    {
        writer.Write(first);
        writer.Write(block, 0, b);
    }
    else if (b < ID_BUFFER)
    {
        // mark smaller buffer: we have to tell the decoder that we have a
        // block with a length different than ID_BUFFER
        first += (byte)32;
        writer.Write(first);
        writer.Write(b);
        writer.Write(block, 0, b);
    }
}