libUTL++
|
BWT-based coder. More...
#include <BWTencoder.h>
Public Member Functions | |
BWTencoder (uint_t mode, Stream *stream, bool owner=true, uint_t blockSize=KB(256)) | |
Constructor. More... | |
virtual size_t | decode (byte_t *block, size_t num) |
Decode data from the associated stream. More... | |
virtual size_t | encode (const byte_t *block, size_t num) |
Encode data to the associated stream. More... | |
void | start (uint_t mode, Stream *stream, bool owner=true, uint_t blockSize=KB(256)) |
Initialize for encoding or decoding. More... | |
Public Member Functions inherited from utl::Encoder | |
Encoder (uint_t mode, Stream *stream, bool owner=true) | |
Constructor. More... | |
void | set (uint_t mode, Stream *stream, bool owner=true, size_t bufSize=KB(16)) |
Set a new configuration. More... | |
virtual void | close () |
Close the stream. More... | |
uint32_t | getCRC () const |
Get the CRC code. More... | |
uint32_t | getLastCRC () const |
Get the last CRC code. More... | |
bool | isCRC () const |
Get the crc flag. More... | |
void | setCRC (bool crc) |
Set the crc flag. More... | |
Public Member Functions inherited from utl::BufferedStream | |
BufferedStream (Stream *stream, bool streamOwner=true, size_t iBufSize=KB(16), size_t oBufSize=KB(16)) | |
Constructor. More... | |
virtual void | copy (const Object &rhs) |
Copy another instance. More... | |
const Stream * | getStream () const |
Get the stream. More... | |
Stream * | getStream () |
Get the stream. More... | |
void | setStream (Stream *stream, bool streamOwner=true, size_t iBufSize=KB(16), size_t oBufSize=KB(16)) |
Set the attached stream. More... | |
Stream * | takeStream () |
Steal the stream. More... | |
bool | eofBlocking () |
Determine whether an EOF condition exists. More... | |
void | setMode (uint_t mode, bool p_setBufs=true) |
Set the mode. More... | |
virtual bool | isInput () const |
Determine whether the stream is an input stream. More... | |
virtual void | setInput (bool input) |
Set the input mode flag. More... | |
virtual bool | isOutput () const |
Determine whether the stream is an output stream. More... | |
virtual void | setOutput (bool output) |
Set the output mode flag. More... | |
virtual bool | eof () const |
Determine whether an EOF condition exists. More... | |
virtual void | setEOF (bool eof) |
Set the EOF flag. More... | |
virtual bool | error () const |
Determine whether an error condition exists. More... | |
virtual void | setError (bool error) |
Set the error flag. More... | |
bool | hasInput () const |
Has unread input? | |
virtual BufferedStream & | flush (uint_t mode=io_wr) |
Flush the stream (if it is buffered). More... | |
bool | isLineBuffered () const |
Determine whether the stream is line-buffered. More... | |
void | setLineBuffered (bool lineBuffered) |
Set the line-buffered flag. More... | |
void | setBufs (size_t iBufSize=KB(16), size_t oBufSize=KB(16)) |
Set the buffers for input and/or output. More... | |
void | setInputBuf (size_t size=KB(16)) |
Set the input buffer size. More... | |
void | setOutputBuf (size_t size=KB(16)) |
Set the output buffer size. More... | |
BufferedStream & | operator<< (bsmanip0 manip) |
BufferedStream & | operator<< (void *ptr) |
BufferedStream & | operator<< (const char *str) |
BufferedStream & | operator<< (char c) |
BufferedStream & | operator>> (char &c) |
BufferedStream & | operator<< (byte_t b) |
BufferedStream & | operator>> (byte_t &b) |
BufferedStream & | operator<< (short n) |
BufferedStream & | operator>> (short &n) |
BufferedStream & | operator<< (ushort_t n) |
BufferedStream & | operator>> (ushort_t &n) |
BufferedStream & | operator<< (int n) |
BufferedStream & | operator>> (int &n) |
BufferedStream & | operator<< (uint_t n) |
BufferedStream & | operator>> (uint_t &n) |
BufferedStream & | operator<< (long n) |
BufferedStream & | operator>> (long &n) |
BufferedStream & | operator<< (ulong_t n) |
BufferedStream & | operator>> (ulong_t &n) |
BufferedStream & | operator<< (double n) |
BufferedStream & | operator>> (double &n) |
void | get (byte_t &b) |
void | get (char &c) |
byte_t | get () |
byte_t | peek () |
Peek at the next byte to be read from the stream. More... | |
virtual Stream & | readLine (String &str) |
Read a line from stream into the given String object. More... | |
virtual size_t | read (byte_t *array, size_t maxBytes, size_t minBytes=size_t_max) |
Read data into a given buffer. More... | |
void | unget (byte_t b) |
Un-get a byte. More... | |
void | unget (char c) |
Unget a char. More... | |
virtual BufferedStream & | newline () |
Write a newline. More... | |
BufferedStream & | newline (bool forceFlush) |
Write a newline. More... | |
BufferedStream & | put (byte_t b) |
Write the given byte. More... | |
BufferedStream & | put (char c) |
Write the given character. More... | |
BufferedStream & | put (int c) |
Write the given character. More... | |
virtual void | write (const byte_t *array, size_t num) |
Write a sequence of bytes. More... | |
Public Member Functions inherited from utl::Stream | |
bool | isOwner () const |
Get the owner flag. More... | |
void | setOwner (bool owner) |
Set the owner flag. More... | |
size_t | getInputCount () const |
Get the count of input bytes. More... | |
size_t | getOutputCount () const |
Get the count of output bytes. More... | |
uint_t | getMode () const |
Get the current mode. More... | |
const String & | getName () const |
Get the stream's name. More... | |
const String * | getNamePtr () const |
Get the stream's name. More... | |
void | setName (const String &name) |
Set the name. More... | |
void | setName (String *name) |
Set the name. More... | |
virtual bool | isBOL () const |
Get the begin-of-line flag. More... | |
virtual void | setBOL (bool p_bol) |
Set the BOL flag. More... | |
bool | isRDWR () const |
Determine whether the stream is an input/output stream. More... | |
void | setBase (uint_t base) |
Set the base, for writing integer values. More... | |
void | setMode (uint_t mode) |
Set the mode. More... | |
Stream & | put (byte_t b) |
Write the given character. More... | |
Stream & | put (char c) |
Write the given character. More... | |
Stream & | put (int c) |
Write the given character. More... | |
Stream & | put (const char *str) |
Write the given string. More... | |
Stream & | put (const char *str, size_t len) |
Write the given string. More... | |
Stream & | putBit (bool b) |
Write the given bit. More... | |
Stream & | putBits () |
Write all outstanding bits. More... | |
Stream & | putBits (uint32_t n, uint_t numBits) |
Write multiple bits. More... | |
Stream & | putLine (const char *str) |
Write the given string, followed by a newline. More... | |
void | writeSpaces (size_t num) |
Write spaces. More... | |
Stream & | operator<< (smanip0 manip) |
Execute a manipulator (e.g. More... | |
Stream & | operator<< (void *ptr) |
Write a void pointer. More... | |
Stream & | operator<< (const char *str) |
Write a string. More... | |
Stream & | operator<< (char c) |
Stream & | operator>> (char &c) |
Stream & | operator<< (byte_t b) |
Stream & | operator>> (byte_t &b) |
Stream & | operator<< (int16_t n) |
Stream & | operator>> (int16_t &n) |
Stream & | operator<< (uint16_t n) |
Stream & | operator>> (uint16_t &n) |
Stream & | operator<< (int32_t n) |
Stream & | operator>> (int32_t &n) |
Stream & | operator<< (uint32_t n) |
Stream & | operator>> (uint32_t &n) |
Stream & | operator<< (long n) |
Stream & | operator>> (long &n) |
Stream & | operator<< (ulong_t n) |
Stream & | operator>> (ulong_t &n) |
Stream & | operator<< (double n) |
Stream & | operator>> (double &n) |
uint_t | getIndent () const |
Get the indentation level. More... | |
void | setIndent (uint_t indent) |
Set the indentation level. More... | |
void | indent (uint_t num=4) |
Increase indentation by the given number of spaces. More... | |
void | unindent (uint_t num=4) |
Decrease indentation by the given number of spaces. More... | |
void | checkOK () |
If there is an error condition (error()), throw StreamErrorEx. More... | |
void | clearEOF () |
Clear the EOF condition. More... | |
void | clearError () |
Clear the error condition. More... | |
bool | ok () const |
Determine whether an error condition exists. More... | |
size_t | copyData (Stream &in, size_t numBytes=size_t_max, size_t bufSize=KB(4)) |
Copy data from another stream. More... | |
void | get (byte_t &b) |
Get a single byte. More... | |
void | get (char &c) |
Get a single char. More... | |
byte_t | get () |
Get a single byte. More... | |
bool | getBit () |
Get a single bit. More... | |
uint32_t | getBits (uint_t numBits) |
Get multiple bits. More... | |
Public Member Functions inherited from utl::Object | |
void | clear () |
Revert to initial state. More... | |
virtual int | compare (const Object &rhs) const |
Compare with another object. More... | |
virtual void | vclone (const Object &rhs) |
Make an exact copy of another instance. More... | |
virtual void | steal (Object &rhs) |
"Steal" the internal representation from another instance. More... | |
virtual void | dump (Stream &os, uint_t level=uint_t_max) const |
Dump a human-readable representation of self to the given output stream. More... | |
void | dumpWithClassName (Stream &os, uint_t indent=4, uint_t level=uint_t_max) const |
Front-end for dump() that prints the object's class name. More... | |
virtual const Object & | getKey () const |
Get the key for this object. More... | |
bool | hasKey () const |
Determine whether or not the object has a key. More... | |
virtual const Object & | getProxiedObject () const |
Get the proxied object (= self if none). More... | |
virtual Object & | getProxiedObject () |
Get the proxied object (= self if none). More... | |
virtual size_t | hash (size_t size) const |
Get the hash code for the object. More... | |
bool | _isA (const RunTimeClass *runTimeClass) const |
Determine whether self's class is a descendent of the given class. More... | |
virtual String | toString () const |
Return a string representation of self. More... | |
operator String () const | |
Conversion to String. More... | |
size_t | allocatedSize () const |
Get the total allocated size of this object. More... | |
virtual size_t | innerAllocatedSize () const |
Get the "inner" allocated size. More... | |
virtual void | addOwnedIt (const class FwdIt *it) const |
Notify self that it owns the given iterator. More... | |
virtual void | removeOwnedIt (const class FwdIt *it) const |
Notify self that the given owned iterator has been destroyed. More... | |
bool | operator< (const Object &rhs) const |
Less-than operator. More... | |
bool | operator<= (const Object &rhs) const |
Less-than-or-equal-to operator. More... | |
bool | operator> (const Object &rhs) const |
Greater-than operator. More... | |
bool | operator>= (const Object &rhs) const |
Greater-than-or-equal-to operator. More... | |
bool | operator== (const Object &rhs) const |
Equal-to operator. More... | |
bool | operator!= (const Object &rhs) const |
Unequal-to operator. More... | |
void | serializeIn (Stream &is, uint_t mode=ser_default) |
Serialize from an input stream. More... | |
void | serializeOut (Stream &os, uint_t mode=ser_default) const |
Serialize to an output stream. More... | |
virtual void | serialize (Stream &stream, uint_t io, uint_t mode=ser_default) |
Serialize to or from a stream. More... | |
void | serializeOutBoxed (Stream &os, uint_t mode=ser_default) const |
Serialize a boxed object to an output stream. More... | |
Protected Member Functions | |
virtual void | finishEncoding () |
Perform any necessary work when encoding is finished. More... | |
Protected Member Functions inherited from utl::Encoder | |
virtual void | overflow () |
Encode the write buffer to the stream (by calling encode()). More... | |
virtual void | underflow () |
Decode from the stream into the read buffer (by calling decode()). More... | |
virtual size_t | finishDecoding () |
Perform any necessary work when decoding is finished. More... | |
bool | isLastBlock () const |
Get the lastBlock flag. More... | |
void | setLastBlock (bool lastBlock) |
Set the lastBlock flag. More... | |
Protected Member Functions inherited from utl::BufferedStream | |
Protected Member Functions inherited from utl::Stream | |
void | indentIfBOL () |
If begin-of-line flag is true, clear the flag and indent. More... | |
void | _indentIfBOL () |
Write _indent spaces at beginning of line. More... | |
void | throwStreamEOFex () |
Throw a StreamEOFex exception. More... | |
void | throwStreamErrorEx () |
Throw a StreamErrorEx exception. More... | |
void | readToken (char *buf, size_t size) |
Read a token from the stream. More... | |
void | readUntilWS (char *buf, size_t size) |
Read until a whitespace character is encountered. More... | |
byte_t | skipWS () |
Skip past whitespace. More... | |
Protected Member Functions inherited from utl::FlagsMI | |
FlagsMI () | |
Constructor. More... | |
virtual | ~FlagsMI () |
Destructor. More... | |
void | copyFlags (const FlagsMI &rhs) |
Copy the given flags. More... | |
void | copyFlags (const FlagsMI &rhs, uint_t lsb, uint_t msb) |
Copy (some of) the given flags. More... | |
void | copyFlags (uint64_t flags, uint_t lsb, uint_t msb) |
Copy (some of) the given flags. More... | |
bool | getFlag (uint_t flagNum) const |
Get a user-defined flag. More... | |
void | setFlag (uint_t flagNum, bool val) |
Set a user-defined flag. More... | |
uint64_t | getFlagsNumber (uint64_t mask, uint64_t shift=0) |
Get a multi-bit value in the flags data (which is stored as one 64-bit integer). More... | |
void | setFlagsNumber (uint64_t mask, uint64_t shift, uint64_t num) |
Set a multi-bit value in the flags data (which is stored as one 64-bit integer). More... | |
uint64_t | getFlags () const |
Get the flags. More... | |
void | setFlags (uint64_t flags) |
Set the flags. More... | |
Additional Inherited Members | |
Static Public Member Functions inherited from utl::Object | |
static Object * | serializeInNullable (Stream &is, uint_t mode=ser_default) |
Serialize a nullptr-able object from an input stream. More... | |
static void | serializeOutNullable (const Object *object, Stream &os, uint_t mode=ser_default) |
Serialize a nullptr-able object to an output stream. More... | |
static void | serializeNullable (Object *&object, Stream &stream, uint_t io, uint_t mode=ser_default) |
Serialize a nullptr-able object to or from a stream. More... | |
static Object * | serializeInBoxed (Stream &is, uint_t mode=ser_default) |
Serialize a boxed object from an input stream. More... | |
static void | serializeBoxed (Object *&object, Stream &stream, uint_t io, uint_t mode=ser_default) |
Serialize a boxed object to or from a stream. More... | |
BWT-based coder.
BWTencoder is a block-sorting, lossless data compressor based on the Burrows-Wheeler Transform (BWT). bzip2 by Julian Seward is currently the most popular compressor based on the BWT. I have stolen some ideas (and even code!) from bzip2, and I've made it clear (with code comments) where I've done so. In fact, I have to admit that this implementation contains no original ideas or insights of my own. My goal here isn't research. I've tried to provide a solid implementation that is:
I think I've achieved these goals. I'll leave it to you to decide if this implementation is a good choice for your application. The best I can do is be honest about the strengths and weaknesses.
Attributes
Advantages
Disadvantages
Worked Example
I thought it would be useful to provide a worked example, with a bit of explanation. We'll transform a small piece of text, and then undo the transformation.
Suppose the message to be transformed is "BANANA". The first thing we'll do is "stripe" the original input to make 6 strings, as follows:
Next, we'll sort those strings:
To reverse the transformation & recover the original message, we only need the last column in the sort ("NNBAAA"), together with the starting point (2). How can we do this? We will do this by iterating over the characters in the last column of the sorted block, creating a map that tells us the index of the left-shifted version of each string, and then using that to traverse from the starting point ("B") to the 5 remaining characters in the message.
How do we make this traversal map? First, look at the strings in the sorted block that end in "N", and then find their right-shifted versions (their predecessors).
The key observation here is that the predecessors of the strings ending in "N" will be found in the exact same order as their successors, and the reason for this is the block sort that was performed. "ABANAN" is before "ANABAN" in the sorted block, and likewise "NABANA" is before "NANABA" in the sorted block. If it was not so, that would mean the block had not been sorted.
Armed with this understanding, it's a simple matter to reconstruct the original message. First, we create two arrays:
Initially, cum[] and count[] are zeroed, and we initialize cum[] as follows:
Then we proceed down the last column in the sorted block, identifying its predecessor & storing a mapping from the predecessor to it.
Putting all this information together, we have:
As you can see, undoing the transformation is very simple & efficient.
The BWT uses the characters following each symbol as context. We will tend to find the same characters preceding repeated sequences in the message, so the last column in the sorted block can be very efficiently encoded with a move-to-front (MTF) encoder.
Research Papers
Links
Definition at line 179 of file BWTencoder.h.
|
inline |
Constructor.
mode | io_rd to encode, io_wr to decode (see utl::io_t) |
stream | (optional) associated stream |
owner | (optional : true) owner flag for stream |
blockSize | (optional : 256 KB) block size |
Definition at line 191 of file BWTencoder.h.
References utl::compare(), utl::deInit(), utl::init(), utl::insertionSort(), utl::KB(), utl::multiKeyInsertionSort(), utl::multiKeyQuickSort(), utl::quickSort(), utl::swap(), and utl::uint_t_max.
|
virtual |
Decode data from the associated stream.
block | block to decode into |
num | size of block |
Implements utl::Encoder.
|
virtual |
Encode data to the associated stream.
block | block to encode |
num | size of block |
Implements utl::Encoder.
void utl::BWTencoder::start | ( | uint_t | mode, |
Stream * | stream, | ||
bool | owner = true , |
||
uint_t | blockSize = KB(256) |
||
) |
Initialize for encoding or decoding.
mode | io_rd to encode, io_wr to decode (see utl::io_t) |
stream | (optional) associated stream |
owner | (optional : true) owner flag for stream |
blockSize | (optional : 256 KB) block size |
|
protectedvirtual |
Perform any necessary work when encoding is finished.
Reimplemented from utl::Encoder.