libUTL++
utl::BWTencoder Class Reference

BWT-based coder. More...

#include <BWTencoder.h>

Inheritance diagram for utl::BWTencoder:

Public Member Functions

 BWTencoder (uint_t mode, Stream *stream, bool owner=true, uint_t blockSize=KB(256))
 Constructor. More...
 
virtual size_t decode (byte_t *block, size_t num)
 Decode data from the associated stream. More...
 
virtual size_t encode (const byte_t *block, size_t num)
 Encode data to the associated stream. More...
 
void start (uint_t mode, Stream *stream, bool owner=true, uint_t blockSize=KB(256))
 Initialize for encoding or decoding. More...
 
- Public Member Functions inherited from utl::Encoder
 Encoder (uint_t mode, Stream *stream, bool owner=true)
 Constructor. More...
 
void set (uint_t mode, Stream *stream, bool owner=true, size_t bufSize=KB(16))
 Set a new configuration. More...
 
virtual void close ()
 Close the stream. More...
 
uint32_t getCRC () const
 Get the CRC code. More...
 
uint32_t getLastCRC () const
 Get the last CRC code. More...
 
bool isCRC () const
 Get the crc flag. More...
 
void setCRC (bool crc)
 Set the crc flag. More...
 
- Public Member Functions inherited from utl::BufferedStream
 BufferedStream (Stream *stream, bool streamOwner=true, size_t iBufSize=KB(16), size_t oBufSize=KB(16))
 Constructor. More...
 
virtual void copy (const Object &rhs)
 Copy another instance. More...
 
const StreamgetStream () const
 Get the stream. More...
 
StreamgetStream ()
 Get the stream. More...
 
void setStream (Stream *stream, bool streamOwner=true, size_t iBufSize=KB(16), size_t oBufSize=KB(16))
 Set the attached stream. More...
 
StreamtakeStream ()
 Steal the stream. More...
 
bool eofBlocking ()
 Determine whether an EOF condition exists. More...
 
void setMode (uint_t mode, bool p_setBufs=true)
 Set the mode. More...
 
virtual bool isInput () const
 Determine whether the stream is an input stream. More...
 
virtual void setInput (bool input)
 Set the input mode flag. More...
 
virtual bool isOutput () const
 Determine whether the stream is an output stream. More...
 
virtual void setOutput (bool output)
 Set the output mode flag. More...
 
virtual bool eof () const
 Determine whether an EOF condition exists. More...
 
virtual void setEOF (bool eof)
 Set the EOF flag. More...
 
virtual bool error () const
 Determine whether an error condition exists. More...
 
virtual void setError (bool error)
 Set the error flag. More...
 
bool hasInput () const
 Has unread input?
 
virtual BufferedStreamflush (uint_t mode=io_wr)
 Flush the stream (if it is buffered). More...
 
bool isLineBuffered () const
 Determine whether the stream is line-buffered. More...
 
void setLineBuffered (bool lineBuffered)
 Set the line-buffered flag. More...
 
void setBufs (size_t iBufSize=KB(16), size_t oBufSize=KB(16))
 Set the buffers for input and/or output. More...
 
void setInputBuf (size_t size=KB(16))
 Set the input buffer size. More...
 
void setOutputBuf (size_t size=KB(16))
 Set the output buffer size. More...
 
BufferedStreamoperator<< (bsmanip0 manip)
 
BufferedStreamoperator<< (void *ptr)
 
BufferedStreamoperator<< (const char *str)
 
BufferedStreamoperator<< (char c)
 
BufferedStreamoperator>> (char &c)
 
BufferedStreamoperator<< (byte_t b)
 
BufferedStreamoperator>> (byte_t &b)
 
BufferedStreamoperator<< (short n)
 
BufferedStreamoperator>> (short &n)
 
BufferedStreamoperator<< (ushort_t n)
 
BufferedStreamoperator>> (ushort_t &n)
 
BufferedStreamoperator<< (int n)
 
BufferedStreamoperator>> (int &n)
 
BufferedStreamoperator<< (uint_t n)
 
BufferedStreamoperator>> (uint_t &n)
 
BufferedStreamoperator<< (long n)
 
BufferedStreamoperator>> (long &n)
 
BufferedStreamoperator<< (ulong_t n)
 
BufferedStreamoperator>> (ulong_t &n)
 
BufferedStreamoperator<< (double n)
 
BufferedStreamoperator>> (double &n)
 
void get (byte_t &b)
 
void get (char &c)
 
byte_t get ()
 
byte_t peek ()
 Peek at the next byte to be read from the stream. More...
 
virtual StreamreadLine (String &str)
 Read a line from stream into the given String object. More...
 
virtual size_t read (byte_t *array, size_t maxBytes, size_t minBytes=size_t_max)
 Read data into a given buffer. More...
 
void unget (byte_t b)
 Un-get a byte. More...
 
void unget (char c)
 Unget a char. More...
 
virtual BufferedStreamnewline ()
 Write a newline. More...
 
BufferedStreamnewline (bool forceFlush)
 Write a newline. More...
 
BufferedStreamput (byte_t b)
 Write the given byte. More...
 
BufferedStreamput (char c)
 Write the given character. More...
 
BufferedStreamput (int c)
 Write the given character. More...
 
virtual void write (const byte_t *array, size_t num)
 Write a sequence of bytes. More...
 
- Public Member Functions inherited from utl::Stream
bool isOwner () const
 Get the owner flag. More...
 
void setOwner (bool owner)
 Set the owner flag. More...
 
size_t getInputCount () const
 Get the count of input bytes. More...
 
size_t getOutputCount () const
 Get the count of output bytes. More...
 
uint_t getMode () const
 Get the current mode. More...
 
const StringgetName () const
 Get the stream's name. More...
 
const StringgetNamePtr () const
 Get the stream's name. More...
 
void setName (const String &name)
 Set the name. More...
 
void setName (String *name)
 Set the name. More...
 
virtual bool isBOL () const
 Get the begin-of-line flag. More...
 
virtual void setBOL (bool p_bol)
 Set the BOL flag. More...
 
bool isRDWR () const
 Determine whether the stream is an input/output stream. More...
 
void setBase (uint_t base)
 Set the base, for writing integer values. More...
 
void setMode (uint_t mode)
 Set the mode. More...
 
Streamput (byte_t b)
 Write the given character. More...
 
Streamput (char c)
 Write the given character. More...
 
Streamput (int c)
 Write the given character. More...
 
Streamput (const char *str)
 Write the given string. More...
 
Streamput (const char *str, size_t len)
 Write the given string. More...
 
StreamputBit (bool b)
 Write the given bit. More...
 
StreamputBits ()
 Write all outstanding bits. More...
 
StreamputBits (uint32_t n, uint_t numBits)
 Write multiple bits. More...
 
StreamputLine (const char *str)
 Write the given string, followed by a newline. More...
 
void writeSpaces (size_t num)
 Write spaces. More...
 
Streamoperator<< (smanip0 manip)
 Execute a manipulator (e.g. More...
 
Streamoperator<< (void *ptr)
 Write a void pointer. More...
 
Streamoperator<< (const char *str)
 Write a string. More...
 
Streamoperator<< (char c)
 
Streamoperator>> (char &c)
 
Streamoperator<< (byte_t b)
 
Streamoperator>> (byte_t &b)
 
Streamoperator<< (int16_t n)
 
Streamoperator>> (int16_t &n)
 
Streamoperator<< (uint16_t n)
 
Streamoperator>> (uint16_t &n)
 
Streamoperator<< (int32_t n)
 
Streamoperator>> (int32_t &n)
 
Streamoperator<< (uint32_t n)
 
Streamoperator>> (uint32_t &n)
 
Streamoperator<< (long n)
 
Streamoperator>> (long &n)
 
Streamoperator<< (ulong_t n)
 
Streamoperator>> (ulong_t &n)
 
Streamoperator<< (double n)
 
Streamoperator>> (double &n)
 
uint_t getIndent () const
 Get the indentation level. More...
 
void setIndent (uint_t indent)
 Set the indentation level. More...
 
void indent (uint_t num=4)
 Increase indentation by the given number of spaces. More...
 
void unindent (uint_t num=4)
 Decrease indentation by the given number of spaces. More...
 
void checkOK ()
 If there is an error condition (error()), throw StreamErrorEx. More...
 
void clearEOF ()
 Clear the EOF condition. More...
 
void clearError ()
 Clear the error condition. More...
 
bool ok () const
 Determine whether an error condition exists. More...
 
size_t copyData (Stream &in, size_t numBytes=size_t_max, size_t bufSize=KB(4))
 Copy data from another stream. More...
 
void get (byte_t &b)
 Get a single byte. More...
 
void get (char &c)
 Get a single char. More...
 
byte_t get ()
 Get a single byte. More...
 
bool getBit ()
 Get a single bit. More...
 
uint32_t getBits (uint_t numBits)
 Get multiple bits. More...
 
- Public Member Functions inherited from utl::Object
void clear ()
 Revert to initial state. More...
 
virtual int compare (const Object &rhs) const
 Compare with another object. More...
 
virtual void vclone (const Object &rhs)
 Make an exact copy of another instance. More...
 
virtual void steal (Object &rhs)
 "Steal" the internal representation from another instance. More...
 
virtual void dump (Stream &os, uint_t level=uint_t_max) const
 Dump a human-readable representation of self to the given output stream. More...
 
void dumpWithClassName (Stream &os, uint_t indent=4, uint_t level=uint_t_max) const
 Front-end for dump() that prints the object's class name. More...
 
virtual const ObjectgetKey () const
 Get the key for this object. More...
 
bool hasKey () const
 Determine whether or not the object has a key. More...
 
virtual const ObjectgetProxiedObject () const
 Get the proxied object (= self if none). More...
 
virtual ObjectgetProxiedObject ()
 Get the proxied object (= self if none). More...
 
virtual size_t hash (size_t size) const
 Get the hash code for the object. More...
 
bool _isA (const RunTimeClass *runTimeClass) const
 Determine whether self's class is a descendent of the given class. More...
 
virtual String toString () const
 Return a string representation of self. More...
 
 operator String () const
 Conversion to String. More...
 
size_t allocatedSize () const
 Get the total allocated size of this object. More...
 
virtual size_t innerAllocatedSize () const
 Get the "inner" allocated size. More...
 
virtual void addOwnedIt (const class FwdIt *it) const
 Notify self that it owns the given iterator. More...
 
virtual void removeOwnedIt (const class FwdIt *it) const
 Notify self that the given owned iterator has been destroyed. More...
 
bool operator< (const Object &rhs) const
 Less-than operator. More...
 
bool operator<= (const Object &rhs) const
 Less-than-or-equal-to operator. More...
 
bool operator> (const Object &rhs) const
 Greater-than operator. More...
 
bool operator>= (const Object &rhs) const
 Greater-than-or-equal-to operator. More...
 
bool operator== (const Object &rhs) const
 Equal-to operator. More...
 
bool operator!= (const Object &rhs) const
 Unequal-to operator. More...
 
void serializeIn (Stream &is, uint_t mode=ser_default)
 Serialize from an input stream. More...
 
void serializeOut (Stream &os, uint_t mode=ser_default) const
 Serialize to an output stream. More...
 
virtual void serialize (Stream &stream, uint_t io, uint_t mode=ser_default)
 Serialize to or from a stream. More...
 
void serializeOutBoxed (Stream &os, uint_t mode=ser_default) const
 Serialize a boxed object to an output stream. More...
 

Protected Member Functions

virtual void finishEncoding ()
 Perform any necessary work when encoding is finished. More...
 
- Protected Member Functions inherited from utl::Encoder
virtual void overflow ()
 Encode the write buffer to the stream (by calling encode()). More...
 
virtual void underflow ()
 Decode from the stream into the read buffer (by calling decode()). More...
 
virtual size_t finishDecoding ()
 Perform any necessary work when decoding is finished. More...
 
bool isLastBlock () const
 Get the lastBlock flag. More...
 
void setLastBlock (bool lastBlock)
 Set the lastBlock flag. More...
 
- Protected Member Functions inherited from utl::BufferedStream
- Protected Member Functions inherited from utl::Stream
void indentIfBOL ()
 If begin-of-line flag is true, clear the flag and indent. More...
 
void _indentIfBOL ()
 Write _indent spaces at beginning of line. More...
 
void throwStreamEOFex ()
 Throw a StreamEOFex exception. More...
 
void throwStreamErrorEx ()
 Throw a StreamErrorEx exception. More...
 
void readToken (char *buf, size_t size)
 Read a token from the stream. More...
 
void readUntilWS (char *buf, size_t size)
 Read until a whitespace character is encountered. More...
 
byte_t skipWS ()
 Skip past whitespace. More...
 
- Protected Member Functions inherited from utl::FlagsMI
 FlagsMI ()
 Constructor. More...
 
virtual ~FlagsMI ()
 Destructor. More...
 
void copyFlags (const FlagsMI &rhs)
 Copy the given flags. More...
 
void copyFlags (const FlagsMI &rhs, uint_t lsb, uint_t msb)
 Copy (some of) the given flags. More...
 
void copyFlags (uint64_t flags, uint_t lsb, uint_t msb)
 Copy (some of) the given flags. More...
 
bool getFlag (uint_t flagNum) const
 Get a user-defined flag. More...
 
void setFlag (uint_t flagNum, bool val)
 Set a user-defined flag. More...
 
uint64_t getFlagsNumber (uint64_t mask, uint64_t shift=0)
 Get a multi-bit value in the flags data (which is stored as one 64-bit integer). More...
 
void setFlagsNumber (uint64_t mask, uint64_t shift, uint64_t num)
 Set a multi-bit value in the flags data (which is stored as one 64-bit integer). More...
 
uint64_t getFlags () const
 Get the flags. More...
 
void setFlags (uint64_t flags)
 Set the flags. More...
 

Additional Inherited Members

- Static Public Member Functions inherited from utl::Object
static ObjectserializeInNullable (Stream &is, uint_t mode=ser_default)
 Serialize a nullptr-able object from an input stream. More...
 
static void serializeOutNullable (const Object *object, Stream &os, uint_t mode=ser_default)
 Serialize a nullptr-able object to an output stream. More...
 
static void serializeNullable (Object *&object, Stream &stream, uint_t io, uint_t mode=ser_default)
 Serialize a nullptr-able object to or from a stream. More...
 
static ObjectserializeInBoxed (Stream &is, uint_t mode=ser_default)
 Serialize a boxed object from an input stream. More...
 
static void serializeBoxed (Object *&object, Stream &stream, uint_t io, uint_t mode=ser_default)
 Serialize a boxed object to or from a stream. More...
 

Detailed Description

BWT-based coder.

BWTencoder is a block-sorting, lossless data compressor based on the Burrows-Wheeler Transform (BWT). bzip2 by Julian Seward is currently the most popular compressor based on the BWT. I have stolen some ideas (and even code!) from bzip2, and I've made it clear (with code comments) where I've done so. In fact, I have to admit that this implementation contains no original ideas or insights of my own. My goal here isn't research. I've tried to provide a solid implementation that is:

  • fast enough for many purposes
  • easy to read and maintain
  • easy to use within the UTL++ framework

I think I've achieved these goals. I'll leave it to you to decide if this implementation is a good choice for your application. The best I can do is be honest about the strengths and weaknesses.

Attributes

  • blockSize : Setting the block size is the sole method of compression tuning. Compression benefits from larger block size, but the law of diminishing returns applies here. 256 KB tends to be a nice balance between compression ratio and memory/CPU usage. I don't recommend a block size smaller than 64 KB or larger than 1 MB though. Space required is approx. (9 * blockSize) for compression and (5 * blockSize) for decompression.

Advantages

  • provides excellent compression
  • code is easy to read and maintain
  • fast enough for many purposes
  • easy to use within the UTL++ framework
  • you can study the code to learn about the BWT

Disadvantages

  • it's a pig:
  • – requires more space than bzip2
  • – requires more time than bzip2 (a lot more in the worst case)
  • possible patent issues due to the use of arith-coding
  • not thoroughly tested – use at your own risk!

Worked Example

I thought it would be useful to provide a worked example, with a bit of explanation. We'll transform a small piece of text, and then undo the transformation.

Suppose the message to be transformed is "BANANA". The first thing we'll do is "stripe" the original input to make 6 strings, as follows:

0. BANANA
1. ANANAB
2. NANABA
3. ANABAN
4. NABANA
5. ABANAN

Next, we'll sort those strings:

0. ABANAN
1. ANABAN
2. ANANAB
3. BANANA
4. NABANA
5. NANABA

To reverse the transformation & recover the original message, we only need the last column in the sort ("NNBAAA"), together with the starting point (2). How can we do this? We will do this by iterating over the characters in the last column of the sorted block, creating a map that tells us the index of the left-shifted version of each string, and then using that to traverse from the starting point ("B") to the 5 remaining characters in the message.

How do we make this traversal map? First, look at the strings in the sorted block that end in "N", and then find their right-shifted versions (their predecessors).

NABANA --precedes--> ABANAN
NANABA --precedes--> ANABAN

The key observation here is that the predecessors of the strings ending in "N" will be found in the exact same order as their successors, and the reason for this is the block sort that was performed. "ABANAN" is before "ANABAN" in the sorted block, and likewise "NABANA" is before "NANABA" in the sorted block. If it was not so, that would mean the block had not been sorted.

Armed with this understanding, it's a simple matter to reconstruct the original message. First, we create two arrays:

cum[] - for storing the index of the first string starting with a given symbol
count[] - for storing the number of times we've encountered each symbol
next[] - for storing the index of the successor to each index

Initially, cum[] and count[] are zeroed, and we initialize cum[] as follows:

cum['A'] = 0
cum['B'] = 3
cum['N'] = 4

Then we proceed down the last column in the sorted block, identifying its predecessor & storing a mapping from the predecessor to it.

The predecessor of string 0 is calculated as cum['N'] + count['N'] which is 4.
next[4] = 0 ; ++count['N']
The predecessor of string 1 is calculated as cum['N'] + count['N'] which is 5.
next[5] = 1 ; ++count['N']
The predecessor of string 2 is calculated as cum['B'] + count['B'] which is 3.
next[3] = 2 ; ++count['B']
The predecessor of string 3 is calculated as cum['A'] + count['A'] which is 0.
next[0] = 3 ; ++count['A']
The predecessor of string 4 is calculated as cum['A'] + count['A'] which is 1.
next[1] = 4 ; ++count['A']
The predecessor of string 5 is calculated as cum['A'] + count['A'] which is 2.
next[2] = 5 ; ++count['A']

Putting all this information together, we have:

next
0. ABANAN 3
1. ANABAN 4
2. ANANAB 5 <-- starting point
3. BANANA 2
4. NABANA 0
5. NANABA 1

As you can see, undoing the transformation is very simple & efficient.

The BWT uses the characters following each symbol as context. We will tend to find the same characters preceding repeated sequences in the message, so the last column in the sorted block can be very efficiently encoded with a move-to-front (MTF) encoder.

Research Papers

M. Burrows and D. J. Wheeler
"A block-sorting lossless data compression algorithm"
SRC Research Report 124

Links

Author
Adam McKee

Definition at line 179 of file BWTencoder.h.

Constructor & Destructor Documentation

◆ BWTencoder()

utl::BWTencoder::BWTencoder ( uint_t  mode,
Stream stream,
bool  owner = true,
uint_t  blockSize = KB(256) 
)
inline

Constructor.

Parameters
modeio_rd to encode, io_wr to decode (see utl::io_t)
stream(optional) associated stream
owner(optional : true) owner flag for stream
blockSize(optional : 256 KB) block size

Definition at line 191 of file BWTencoder.h.

References utl::compare(), utl::deInit(), utl::init(), utl::insertionSort(), utl::KB(), utl::multiKeyInsertionSort(), utl::multiKeyQuickSort(), utl::quickSort(), utl::swap(), and utl::uint_t_max.

Member Function Documentation

◆ decode()

virtual size_t utl::BWTencoder::decode ( byte_t block,
size_t  num 
)
virtual

Decode data from the associated stream.

Returns
number of bytes decoded
Parameters
blockblock to decode into
numsize of block

Implements utl::Encoder.

◆ encode()

virtual size_t utl::BWTencoder::encode ( const byte_t block,
size_t  num 
)
virtual

Encode data to the associated stream.

Returns
number of bytes encoded
Parameters
blockblock to encode
numsize of block

Implements utl::Encoder.

◆ start()

void utl::BWTencoder::start ( uint_t  mode,
Stream stream,
bool  owner = true,
uint_t  blockSize = KB(256) 
)

Initialize for encoding or decoding.

Parameters
modeio_rd to encode, io_wr to decode (see utl::io_t)
stream(optional) associated stream
owner(optional : true) owner flag for stream
blockSize(optional : 256 KB) block size

◆ finishEncoding()

virtual void utl::BWTencoder::finishEncoding ( )
protectedvirtual

Perform any necessary work when encoding is finished.

Reimplemented from utl::Encoder.


The documentation for this class was generated from the following file: