/******************************************************************************
Tecella Lossless Compression Format
Copyright (c) 2010 Tecella LLC
Licensed under the MIT License

*******************************************************************************
File Description:
Implements TLCv0, which has the following attributes:
	3% to 5% better compression than TLCv1.
	More complicated than TLCv1.
	Slower compression than TLCv1.
	Parallelizable only at a macro scale:
		Many frames with 1 thread per frame.
	Whereas TLCv1 is parallelizable at a micro scale:
		Many frames with many threads per frame.

See documentation for details about the encoding scheme.

*******************************************************************************
Contributors:
Brian Anderson (Tecella) - Initial implementation

******************************************************************************
TODO:  N/A
******************************************************************************/

#include "tlc_v0.h"

#include <assert.h>


/******************************************************************************
* tlc_bit_packer
*/
//Binds the packer to an externally owned byte vector.
//  buffer - destination the packed bytes are appended to by write_bits()
tlc_bit_packer::tlc_bit_packer(vector<unsigned char> *buffer)
{
	//remember where the packed bytes go
	this->buffer = buffer;

	//no partially filled byte exists yet;
	//the first write_bits() call appends one
	this->bits_left = 0;
}

#include <iostream>

//Appends the 'bits' least significant bits of 'data' to the buffer.
//Bits are packed most significant bit first: every byte is filled from its
//top bit downwards and a new zeroed byte is appended whenever the current
//one is full.
//  data - value to write; bits above 'bits' are ignored
//  bits - number of bits to write, 0..32.  Wider requests are refused
//         (nothing is written), mirroring tlc_bit_unpacker::read_bits().
void tlc_bit_packer::write_bits(unsigned int data, unsigned int bits)
{
	//an unsigned int cannot carry more than 32 payload bits
	if(bits>32) return;

	//mask out unused bits so we don't accidentally use them.
	//shifting a 32 bit value by 32 is undefined, so a full width write
	//simply keeps every bit.
	if(bits<32)
		data &= (1u<<bits)-1u;

	while(bits>0)
	{
		//current byte is full (or none exists yet): start a new one
		if(bits_left==0) {
			buffer->push_back(0);
			bits_left = 8;
		}

		if(bits>bits_left) {
			//more pending bits than free space:
			//fill the rest of this byte with the topmost pending bits
			unsigned int mask = (1u<<bits_left)-1u;
			bits -= bits_left;
			buffer->back() |= mask & (data >> bits);
			bits_left = 0;
		}
		else {
			//everything left fits: place it directly below the bits
			//already used in this byte
			unsigned int mask = (1u<<bits)-1u;
			bits_left -= bits;
			buffer->back() |= (data & mask) << bits_left;
			bits = 0;
		}
	}
}


/******************************************************************************
* tlc_bit_unpacker
*/
//Creates an unpacker that reads its bits from data_in.
//NOTE(review): the data_in member is presumably a reference, in which case
//the vector must outlive the unpacker -- confirm against the header.
tlc_bit_unpacker::tlc_bit_unpacker(vector<unsigned char> &data_in)
		: data_in(data_in)
{
	//nothing has been buffered yet
	this->data_valid_bits = 0;
	this->data_out = 0;

	//reading starts at the first byte of data_in
	this->i_data_in = 0;
}

//Reads the next 'bits' bits (most significant bit first) into *data.
//  data - receives the value, right aligned; left untouched on failure
//  bits - number of bits to read, 0..32 (0 yields the value 0)
//  pop  - true: consume the bits
//         false: only peek, a later read sees the same bits again
//returns true if bits were read,
//false if bits > 32 or data_in does not hold enough bits.
//Bytes already pulled from data_in stay buffered when a read fails.
bool tlc_bit_unpacker::read_bits(unsigned int *data, unsigned int bits, bool pop)
{
	if(bits>32) return false;

	//a zero width read trivially succeeds.  It is handled here because
	//the extraction below would otherwise shift a 64 bit value by 64,
	//which is undefined.
	if(bits==0) {
		*data = 0;
		return true;
	}

	//refill: append whole bytes below the bits that are still buffered.
	//buffered bits are kept left aligned in data_out.
	while( bits>data_valid_bits ) {
		if( i_data_in>=data_in.size() ) {
			return false;
		}
		unsigned long long new_data = data_in[i_data_in++];
		data_out |= (new_data << (56-data_valid_bits));
		data_valid_bits += 8;
	}

	//the requested bits are the topmost ones of data_out
	*data = static_cast<unsigned int>( data_out >> (64-bits) );
	if(pop) {
		data_out <<= bits;
		data_valid_bits -= bits;
	}
	return true;
}

//Reads 'bits' bits like read_bits() and sign extends them to an int:
//the most significant bit read is treated as the sign bit.
//  data - receives the sign extended value; set to 0 when the read fails
//  bits - field width, see read_bits()
//  pop  - see read_bits()
//returns the result of the underlying read_bits() call.
bool tlc_bit_unpacker::read_bits_se(int *data, unsigned int bits, bool pop)
{
	unsigned int temp = 0;
	bool ret = read_bits(&temp, bits, pop);

	//only 1..31 bit fields need extending: a 0 bit field has no sign bit
	//and a 32 bit field already fills the whole word.  Skipping those
	//widths also keeps every shift below within range.
	if(ret && bits>0 && bits<32) {
		unsigned int negative_bit_mask = 1u<<(bits-1);
		unsigned int sign_extend_mask = ~((1u<<bits)-1u);
		if(temp & negative_bit_mask)
			temp |= sign_extend_mask;
	}

	*data = static_cast<int>(temp);
	return ret;
}


/******************************************************************************
* tlc_v0_sub_frame_t1
*/
//Creates a sub frame that belongs to the frame 'parent'.
//  parent        - owning frame; encode() reads the file header through it
//  bits_per_diff - bits used per sample difference.  When it is <= 0,
//                  encode() works the width out from the data instead.
tlc_v0_sub_frame_t1::tlc_v0_sub_frame_t1(tlc_v0_frame *parent, int bits_per_diff)
{
	this->bits_per_diff = bits_per_diff;
	this->parent_frame = parent;
}

//Returns the numeric id of this sub frame encoding (always 1).
//NOTE(review): tlc_v0_frame::encode()/decode() in this file use 0 as the
//4 bit method tag for this sub frame type -- confirm which value is intended.
unsigned char tlc_v0_sub_frame_t1::get_method_id()
{
	const unsigned char method_id = 1;
	return method_id;
}

//Decodes one sub frame and appends the reconstructed samples to 'data'.
//Sub frame layout, as read below:
//   4 bits          bits per diff, stored minus one
//   2 bits          size code s of the sample count field
//   4*(s+1) bits    sample count
//   count * bpd     sign extended differences to the previous sample
//  bit_unpacker - positioned at the start of the sub frame
//  data         - must already hold at least one sample; the first diff is
//                 applied to data.back()
//Decoding stops silently as soon as the bit stream runs out, so no value is
//ever built from bits that were not actually read.  The caller can detect
//this through the resulting sample count.
void tlc_v0_sub_frame_t1::decode(tlc_bit_unpacker &bit_unpacker, vector<short> &data)
{
	//diffs are relative to the previous sample, so one has to exist
	if( data.empty() ) return;
	short last_sample = data.back();

	//get number of bits per diff
	unsigned int diff_bits = 0;
	if( !bit_unpacker.read_bits(&diff_bits, 4) ) return;
	diff_bits++;

	//get number of samples; the 2 bit size code selects a 4, 8, 12 or
	//16 bit wide count field
	unsigned int sample_count_size = 0;
	if( !bit_unpacker.read_bits(&sample_count_size, 2) ) return;
	unsigned int sample_count = 0;
	if( !bit_unpacker.read_bits(&sample_count, 4*(sample_count_size+1)) ) return;

	//process all the diffs
	for(unsigned int i=0; i<sample_count; i++)
	{
		int diff = 0;
		if( !bit_unpacker.read_bits_se(&diff, diff_bits) ) return;
		last_sample += short(diff);
		data.push_back(last_sample);
	}
}

//Encodes sample_count samples as one sub frame:
//   4 bits          bits per diff, stored minus one
//   2 bits          size code s of the sample count field
//   4*(s+1) bits    sample count
//   count * bpd     differences to the previous sample
//  data         - first sample to encode.  data[-1] MUST be valid: it is
//                 the previous sample the first difference is taken from.
//  sample_count - number of samples, 0..0xFFFF.  Counts outside that range
//                 cannot be represented and are refused before anything is
//                 written, so no partial header ends up in the stream.
//  bit_packer   - receives the encoded bits
//If bits_per_diff was not given (<= 0) it is set to the width needed by the
//largest difference in this run.
void tlc_v0_sub_frame_t1::encode(const short *data, int sample_count, tlc_bit_packer &bit_packer)
{
	//validate first: the count field is at most 16 bits wide
	if(sample_count<0 || sample_count>0xFFFF)
		return;

	unsigned char bits_per_sample = parent_frame->file_header->bits_per_sample;
	int sample_mask = ((1<<bits_per_sample)-1);
	int sign_extend_mask = ~sample_mask;
	int negative_bit_mask = 1<<(bits_per_sample-1);
TCD
	//determine how many bits we need to encode the worst diff
	if(bits_per_diff<=0) {
		for(int i=0; i<sample_count; i++)
		{
			// reduce the diff to the sample width, then sign extend it
			int diff = (data[i]-data[i-1]) & sample_mask;
			if(diff & negative_bit_mask)
				diff |= sign_extend_mask;
			unsigned char bpd = bits_needed(diff);
			if( bpd > bits_per_diff )
				bits_per_diff = bpd;
		}
	}
TCD	assert( bits_per_diff <= 16);

	//write sub frame header
TCD	bit_packer.write_bits( (bits_per_diff-1) , 4);

	//pick the smallest count field (4, 8, 12 or 16 bits) that fits
	unsigned int count_size_code = 3;
	if(sample_count<=0x000F)
		count_size_code = 0;
	else if(sample_count<=0x00FF)
		count_size_code = 1;
	else if(sample_count<=0x0FFF)
		count_size_code = 2;
	bit_packer.write_bits(count_size_code, 2);
	bit_packer.write_bits(sample_count, 4*(count_size_code+1));
TCD
	//write all the diffs
	for(int i=0; i<sample_count; i++)
	{
		//diff gets truncated by write_bits, so we don't have to worry.
		unsigned int diff = data[i]-data[i-1];
		bit_packer.write_bits(diff, bits_per_diff);
	}
}


/******************************************************************************
* tlc_v0_frame
*/
//Constructs a TLCv0 frame.  The header, mutex and condition are handed to
//the tlc_frame base class unchanged.
tlc_v0_frame::tlc_v0_frame(tlc_file_header *file_header, mutex *frames_mutex, condition *frames_condition)
	: tlc_frame(file_header,frames_mutex,frames_condition)
{
	//reserve room for the samples_per_frame value given in the header
TCD	this->samples.reserve(file_header->samples_per_frame);
}

//Releases every sub frame object still held by this frame.
tlc_v0_frame::~tlc_v0_frame()
{
TCD	this->reset_subframes();
}

void tlc_v0_frame::reset_subframes()
{
	
TCD	for(unsigned int i=0; i<subframes.size(); i++)
	{
		if(subframes[i])
		{
			delete subframes[i];
			subframes[i] = 0;
		}
	}
TCD	subframes.resize(0);
}

//Decodes one frame from encoded_data into 'samples'.
//Frame layout, as read below: the first sample (bits_per_sample bits, sign
//extended), followed by sub frames that each start with a 4 bit encoding
//method tag.
//  encoded_data - the packed frame
//  sample_count - number of samples the frame is expected to hold
//returns true if exactly sample_count samples were decoded.
//returns false if the stream ends early, an unknown encoding method is
//found, or the decoded sample count does not match.  On the early failure
//paths 'finished' is not set and listeners are not notified.
bool tlc_v0_frame::decode(vector<unsigned char> &encoded_data, unsigned int sample_count)
{
TCD	tlc_bit_unpacker bit_unpacker(encoded_data);

	//drop whatever a previous encode/decode left behind
	samples.resize(0);
	reset_subframes();

	//read first sample; without it there is nothing to apply diffs to
	unsigned char bits_per_sample = file_header->bits_per_sample;
	int temp = 0;
	if( !bit_unpacker.read_bits_se(&temp, bits_per_sample) ) {
		return false;
	}
	samples.push_back( short(temp) );

	//iterate through the subframes
	unsigned int encoding_method = 0;
	while( samples.size() < sample_count )
	{
		if( !bit_unpacker.read_bits(&encoding_method,4) ) {
			return false;
		}

		switch(encoding_method)
		{
		case 0:
			subframes.push_back( new tlc_v0_sub_frame_t1(this) );
			break;
		default:
			//unknown method tag: the stream is corrupt or uses an
			//encoding this decoder does not know.  Continuing would
			//decode with the previous sub frame, or dereference an
			//empty list when there is none.
			return false;
		}

		subframes.back()->decode(bit_unpacker,samples);
	}

	//notify listeners that we're done
	{
		unique_lock<mutex> frames_lock(*frames_mutex);
		finished = true;
	}
	frames_condition->notify_all();

TCD	return (samples.size()==sample_count);
}


//Encodes 'samples' into 'encoded_data'.
//samples must be set before calling.  An empty sample list produces an empty
//encoded_data (there is no first sample to write) and still signals
//completion.
//
//Output layout: the first sample (bits_per_sample bits) followed by sub
//frames, each introduced by a 4 bit method tag (always 0 here).
//
//Partitioning: bpd_run[b] counts the pending (not yet written) diffs that
//are currently assigned to a run of b+1 bits per diff.  A new diff absorbs
//all pending runs that are narrower than itself, so the pending runs are
//always ordered from widest (oldest) to narrowest (newest).  That is why the
//runs are written out from index 15 down to 0.
//
//When everything is written, 'finished' is set under frames_mutex and
//frames_condition is notified.
void tlc_v0_frame::encode()
{
TCD	encoded_data.resize(0);
	tlc_bit_packer bit_packer(&encoded_data);

	//nothing to encode: reading samples[0] below would be undefined
	if( samples.empty() ) {
		reset_subframes();
		{
			unique_lock<mutex> frames_lock(*frames_mutex);
			finished = true;
		}
		frames_condition->notify_all();
		return;
	}

	//write the first sample to the bit packer
	unsigned char bits_per_sample = file_header->bits_per_sample;
	bit_packer.write_bits( samples[0], bits_per_sample );
	//index of the first sample that has not been written yet
	unsigned int samples_offset = 1;	

	//some masks for correct sign extension
	int sample_mask = ((1<<bits_per_sample)-1);
	int sign_extend_mask = 0xFFFFFFFF ^ sample_mask;
	int negative_bit_mask = 1<<(bits_per_sample-1);

	//partition the frame into subframes
	reset_subframes();
	//threshold, in bits, used below as the overhead of starting a new subframe
	const unsigned int SUBFRAME_SWITCH_BIT_COST = 16;
	vector<unsigned int> bpd_run(16,0);
TCD	for(unsigned int i=samples_offset; i<samples.size(); i++)
	{
TCD		// get bits needed for current diff
		// be sure to sign extend the diff properly
		int diff = (samples[i]-samples[i-1]) & sample_mask;
		if(diff & negative_bit_mask)
			diff |= sign_extend_mask;
		unsigned int bpd = bits_needed(diff);
		assert(bpd>=1);
		assert(bpd<=16);

		//a quick check to see if it might be worth combining this diff
		// with other diffs.
		//a more thorough check will be done later on.
		//accum = number of pending diffs that are narrower than this one
		unsigned int accum = 0;
		for(int j=bpd-2; j>=0; j--)	{
			accum += bpd_run[j];
		}

		//considering the current diff,
		//decide if we should encode all enqueued bpd runs
TCD		if( accum > SUBFRAME_SWITCH_BIT_COST ) {

			//combine past runs that aren't worth the overhead cost of a new subframe:
			//a run is folded into the next wider pending run when widening
			//it costs fewer bits than SUBFRAME_SWITCH_BIT_COST.
			//NOTE(review): the inner scan stops at index 14, so nothing is
			//ever folded into the 16 bit run -- confirm this is intended.
			vector<unsigned int> bpd_run_combined(bpd_run);
			for(int j=0; j<15; j++) {
				if( bpd_run_combined[j]==0 ) continue;

				for(int k=j+1; k<15; k++) {
					if( bpd_run_combined[k] > 0 ) {
						if( (k-j)*bpd_run_combined[j] < SUBFRAME_SWITCH_BIT_COST ) {
							bpd_run_combined[k] += bpd_run_combined[j];
							bpd_run_combined[j] = 0;
						}
						break;
					}
				}
			}
TCD
			//calculate real cost of combining runs due to current diff
			//NOTE(review): combine_cost is unsigned, so a negative total
			//wraps around and still counts as > 0 below -- confirm.
			unsigned int combine_cost = 0;
			for(unsigned int j=0; j<bpd-1; j++) {
				if( bpd_run_combined[j]>0 ){
					combine_cost += (bpd_run_combined[j] * (bpd-j-1)) - SUBFRAME_SWITCH_BIT_COST;
				}
			}

TCD			//encode the enqueued bpd runs if it's not worth combining the current diff
			if( combine_cost>0 ) {
				//widest run first: that is the order the samples arrived in
				for( int k=15; k>=0; k--) {
					bpd_run[k] = 0;
					while( bpd_run_combined[k] > 0) {
						//a sub frame holds at most 0xFFFF samples; longer runs are split
						unsigned int run_length = bpd_run_combined[k] > 0xFFFF ? 0xFFFF : bpd_run_combined[k];
						//4 bit encoding method tag
						bit_packer.write_bits( 0, 4 );
						subframes.push_back( new tlc_v0_sub_frame_t1(this, k+1) );
						subframes.back()->encode( &samples[samples_offset], run_length, bit_packer );
						samples_offset += run_length;
						bpd_run_combined[k] -= run_length;
					}
				}
			}
TCD		}
		
		//update the bpd runs for to-be-encoded samples:
		//the current diff joins the run of its own width and takes every
		//narrower pending run with it
		accum = 1;
TCD		for(int j=bpd-2; j>=0; j--)
		{
			accum += bpd_run[j];
			bpd_run[j] = 0;
		}
		bpd_run[bpd-1] += accum;
TCD	}
	
	//BCA todo: combine with code above
	//combine past runs that aren't worth the overhead cost of a new subframe
	vector<unsigned int> bpd_run_combined(bpd_run);
	for(int j=0; j<15; j++) {
TCD		if( bpd_run_combined[j]==0 ) continue;

		for(int k=j+1; k<15; k++) {
			if( bpd_run_combined[k] > 0 ) {
				if( (k-j)*bpd_run_combined[j] < SUBFRAME_SWITCH_BIT_COST ) {
					bpd_run_combined[k] += bpd_run_combined[j];
					bpd_run_combined[j] = 0;
				}
				break;
			}
		}
TCD	}
	//encode the remaining runs
	for( int k=15; k>=0; k--) {
TCD		bpd_run[k] = 0;
		while( bpd_run_combined[k] > 0) {
			unsigned int run_length = bpd_run_combined[k] > 0xFFFF ? 0xFFFF : bpd_run_combined[k];
			bit_packer.write_bits( 0, 4 );
			subframes.push_back( new tlc_v0_sub_frame_t1(this, k+1) );
			subframes.back()->encode( &samples[samples_offset], run_length, bit_packer );
			samples_offset += run_length;
			bpd_run_combined[k] -= run_length;
		}
TCD	}

	//notify listeners that we're done
	{
		unique_lock<mutex> frames_lock(*frames_mutex);
		finished = true;
	}
	frames_condition->notify_all();
TCD
}


//Writes the completed frame (the bytes in encoded_data) to the file.
//  ofs - output stream to write to
//Nothing is written when encoded_data is empty.
void tlc_v0_frame::write(ofstream &ofs)
{
	//taking the address of element 0 of an empty vector is undefined
	if( encoded_data.empty() )
		return;

	ofs.write( reinterpret_cast<const char*>(&encoded_data[0]),
	           static_cast<std::streamsize>(encoded_data.size()*sizeof(encoded_data[0])) );
}
