/******************************************************************************
Tecella Lossless Compression Format
Copyright (c) 2010 Tecella LLC
Licensed under the MIT License

*******************************************************************************
File Description:
Implements TLCv1, which has the following attributes:
	Faster compression than TLCv0.
	3% to 5% worse compression than TLCv0.
	Simpler than TLCv0.
	Parallelizable at a micro scale:
		Many frames with many threads per frame.
		Easier to accelerate for SIMD architectures.
	Whereas TLCv0 is only parallelizable at a macro scale:
		Many frames with 1 thread per frame.
		Difficult to accelerate for SIMD architectures.

TLCv1 was designed with GPU acceleration in mind.

See documentation for details about the encoding scheme.

*******************************************************************************
Contributors:
Brian Anderson (Tecella) - Initial implementation

*******************************************************************************
TODO:
1) Add CUDA support.
2) Add OpenCL support.

******************************************************************************/

#include "tlc_v1.h"

#include <assert.h>
using namespace std;


//note (sample_count/block_size) < 256, so the max number of
// run lengths can be specified within a byte
const unsigned int TLC_V1_BLOCK_SIZE = 8;
const unsigned int TLC_V1_SUBFRAME_SAMPLE_COUNT = 1024;
//Stride, in bytes, between the header buffers of consecutive subframes.
// A subframe header is 3 fixed bytes (2 for the first sample, 1 for the
// run length count) followed by 2 bytes per run (run length, bits per diff).
// In the worst case every block starts a new run, so the fixed bytes have to
// be added on top of the 2 bytes per block; otherwise such a header spills
// into the next subframe's slot (or past the end of the buffer).
const unsigned int TLC_V1_ENCODED_HEADERS_PITCH = 3 + TLC_V1_SUBFRAME_SAMPLE_COUNT * 2 / TLC_V1_BLOCK_SIZE;
const unsigned int TLC_V1_ENCODED_DIFFS_PITCH = TLC_V1_SUBFRAME_SAMPLE_COUNT * 2;  //# samples * 2 bytes, because we're doing an in-place encoding


/******************************************************************************
* tlc_v1_sub_frame
*/
//Constructs a subframe that refers to the given file header.
// Only the pointer is stored; the header itself is not copied.
tlc_v1_sub_frame::tlc_v1_sub_frame(tlc_file_header *file_header)
{
	this->file_header = file_header;
}

//Nothing to release here: the destructor body is intentionally empty.
tlc_v1_sub_frame::~tlc_v1_sub_frame()
{
}


//unpackDiffs
//Expands one block of tightly packed diffs into samples.
// diffs:    packed differences, bpd (bits per diff) bits each, most
//           significant bits first, with no padding between diffs.
// bpd:      width of each packed diff; must not be 0.
// dst:      receives the reconstructed samples. Each output is the sign
//           extended diff added to the sample before it, so *(dst-1) must
//           already hold the previous sample.
// dst_size: number of samples still wanted; at most TLC_V1_BLOCK_SIZE
//           samples are produced per call.
//returns the number of samples written.
//
//Note this function tries to be pure C, so that it can be used in
// CUDA or OpenCL if needed.
unsigned int unpackDiffs(unsigned char *diffs, unsigned char bpd, short *dst, unsigned int dst_size)
{
	//masks used to sign extend a bpd-bit value
	const int low_bits  = (1 << bpd) - 1;
	const int high_bits = ~low_bits;
	const int sign_bit  = 1 << (bpd - 1);

	//shift that applies to a diff starting at the top of a fresh byte
	const int fresh_byte_shift = 8 - bpd;

	const unsigned int diff_count =
		(dst_size < TLC_V1_BLOCK_SIZE) ? dst_size : TLC_V1_BLOCK_SIZE;

	unsigned char *in = diffs;
	int shift = fresh_byte_shift;

	short *const end = dst + diff_count;
	for(short *out = dst; out != end; ++out)
	{
		//gather the bits of this diff, which may span several bytes
		*out = 0;
		for( ; shift < 0; shift += 8) {
			*out |= *in << (-shift);
			++in;
		}
		*out |= (*in >> shift);

		//position the shift for the next diff
		if(shift != 0) {
			shift -= bpd;
		} else {
			//the diff ended exactly on a byte boundary
			shift = fresh_byte_shift;
			++in;
		}

		//sign extend, discarding stray bits picked up from neighbouring diffs
		if(*out & sign_bit) {
			*out |= high_bits;
		} else {
			*out &= low_bits;
		}

		//turn the diff into a sample
		*out += *(out - 1);
	}

	return diff_count;
}

//Decodes one subframe.
// encoded_data:      the subframe's bytes: the first sample (2 bytes, high
//                    byte first), one byte holding (number of runs - 1), a
//                    (run length - 1, bits per diff) byte pair per run, then
//                    the packed diffs, bpd bytes per block.
// encoded_data_size: number of bytes available in encoded_data.
// samples:           receives sample_count decoded samples.
// sample_count:      number of samples this subframe holds.
//returns false if encoded_data is too short for what it describes,
// true otherwise. Nothing is read beyond encoded_data_size.
//NOTE(review): bpd itself is not range checked here; unpackDiffs documents
// that it must not be 0.
bool tlc_v1_sub_frame::decode( unsigned char *encoded_data, unsigned int encoded_data_size,
							   short *samples, unsigned int sample_count)
{
	//nothing requested; also keeps the decrement below from wrapping around
	if(sample_count == 0) {
		return true;
	}

	//read the first sample, which is stored verbatim
	const size_t FIRST_SAMPLE_SIZE = 2;
	if(encoded_data_size < FIRST_SAMPLE_SIZE) {
		return false;
	}
	samples[0] = short(encoded_data[0]) << 8 | encoded_data[1];
	if(--sample_count == 0) {
		return true;
	}

	//make sure we don't access out of bounds memory for the header
	// (should never happen if file was encoded properly.)
	const size_t MIN_HEADER_SIZE = 5;
	if(encoded_data_size < MIN_HEADER_SIZE) {
		return false;
	}

	//read the run_length_count
	// add 1 because the encoding is zero based
	unsigned int run_length_count = ((unsigned int)encoded_data[2]) + 1;

	size_t i_header = 3;
	size_t i_diffs = i_header + run_length_count*2;
	size_t i_sample = 1;

	//read the bits per delta table
	for(unsigned int i=0; i<run_length_count; ++i)
	{
		//each table entry is two bytes and both must be inside the buffer
		if( i_header + 2 > encoded_data_size ) {
			return false;
		}

		unsigned char run_length = encoded_data[i_header++];
		unsigned char bpd = encoded_data[i_header++];

		//the stored run length is zero based, hence <=
		for(unsigned int j=0; j<=run_length; ++j)
		{
			//every block of the run occupies bpd bytes, so check each one
			if( i_diffs+bpd > encoded_data_size ) {
				return false;
			}

			sample_count -=
				unpackDiffs( &encoded_data[i_diffs], bpd, &samples[i_sample], sample_count );
			i_diffs += bpd;
			i_sample += TLC_V1_BLOCK_SIZE;
		}
	}

	return true;
}


//packDiffs
//Packs diff_count values from diffs into dst, bpd bits each, most
// significant bits first and with no padding between values.
// Only the low bpd bits of each diff are stored.
//Every output byte is zeroed before bits are OR-ed into it. When a value ends
// exactly on a byte boundary the following byte is zeroed as well, so dst
// must have room for that extra byte.
//bpd must not be 0
void packDiffs(char *dst, unsigned char bpd, int *diffs, unsigned int diff_count)
{
	const int value_mask = (1 << bpd) - 1;

	//shift that applies to a value starting at the top of a fresh byte
	const int fresh_byte_shift = 8 - bpd;

	char *out = dst;
	*out = 0;
	int shift = fresh_byte_shift;

	for(unsigned int n = 0; n != diff_count; ++n)
	{
		const int value = diffs[n] & value_mask;

		//the value does not fit in what is left of this byte:
		// store its upper bits and move on to a freshly zeroed byte
		for( ; shift < 0; shift += 8) {
			*out |= value >> (-shift);
			*(++out) = 0;
		}

		//store the remaining (or only) part of the value
		*out |= value << shift;

		if(shift != 0) {
			shift -= bpd;
			continue;
		}

		//ended exactly on a byte boundary: start a new byte
		*(++out) = 0;
		shift = fresh_byte_shift;
	}
}

//Stub: this member does not encode anything.
// It ignores both arguments, leaves samples untouched and always returns 0.
// The encoding used by tlc_v1_frame is done by the free function
// tlc_v1_encode_subframe instead.
unsigned int tlc_v1_sub_frame::encode(short *samples, unsigned int sample_count)
{
	return 0;
}


//tlc_v1_encode_subframe
//Encodes one subframe of samples as a header plus a stream of packed diffs.
// bits_per_sample:     width used to wrap and sign extend each
//                      sample-to-sample difference.
// samples:             the samples to encode; samples[0] is always read.
// sample_count:        number of samples in the subframe.
// encoded_header:      receives the first sample (2 bytes, high byte first),
//                      one byte holding (number of runs - 1), then one
//                      (run length - 1, bits per diff) byte pair per run of
//                      consecutive blocks sharing the same bits per diff.
// encoded_header_size: receives the number of header bytes written.
// encoded_diffs:       receives the packed diffs; every block of up to
//                      TLC_V1_BLOCK_SIZE diffs advances the output by
//                      max_bpd bytes, max_bpd being the widest diff of
//                      that block.
// encoded_diffs_size:  receives the number of diff bytes produced.
// subframe_size:       receives header size + diffs size as 2 bytes,
//                      high byte first.
//Note: tlc_v1_frame::encode passes the sample buffer itself as encoded_diffs.
// The diffs of a block are therefore gathered into a local array before
// packDiffs writes anything; keep that order when changing this function.
void tlc_v1_encode_subframe( unsigned char bits_per_sample,
                             short *samples, unsigned int sample_count,
                             char *encoded_header, unsigned int *encoded_header_size,
                             char *encoded_diffs, unsigned int *encoded_diffs_size,
                             char *subframe_size )
{
	//properly round up group count  (-2 because 1st sample isn't part of a group).
	unsigned int group_count = (sample_count+TLC_V1_BLOCK_SIZE-2) / TLC_V1_BLOCK_SIZE;

	//some masks for correct sign extension
	int sample_mask = ((1<<bits_per_sample)-1);
	int sign_extend_mask = 0xFFFFFFFF ^ sample_mask;
	int negative_bit_mask = 1<<(bits_per_sample-1);

	int i_header = 0;
	int i_diffs = 0;

	//encode the first sample, high byte first
	encoded_header[i_header]   = ( (samples[0]&0xFF00) >> 8 );
	encoded_header[++i_header] = ( (samples[0]&0x00FF) );

	//special case if there's only one sample: encode the sample only
	if(sample_count==1) {
		*encoded_header_size = 2;
		*encoded_diffs_size = 0;
		subframe_size[0] = 0;
		subframe_size[1] = 2;
		return;
	}

	//reserve a byte to encode how many bpd run lengths there are
	// (encoded_header[2], filled in once the count is known)
	++i_header;

	//encode the header and compressed diffs simultaneously
	unsigned short prev_sample = samples[0];
	unsigned short run_length_count = 0;
	unsigned char last_max_bpd = 0;
	unsigned int  last_max_bpd_run_length = 0;
	for(unsigned int i=0; i<group_count; ++i)
	{
		//calculate the full diffs and
		//bits needed per diff for the current block
		unsigned char bpd = 0, max_bpd = 0;
		int diffs[TLC_V1_BLOCK_SIZE];
		unsigned int j = i*TLC_V1_BLOCK_SIZE + 1;  //+1 skips the first sample, which is stored in the header
		unsigned int diff_count = TLC_V1_BLOCK_SIZE;
		for(unsigned int k=0; k<TLC_V1_BLOCK_SIZE; ++k) {
			if(j >= sample_count) {
				//ran out of samples: the last block is only partially filled
				diffs[k] = 0;
				diff_count = k;
				break;
			} else {
				//wrap the difference to bits_per_sample bits, then sign extend it
				diffs[k] = (samples[j]-prev_sample) & sample_mask;
				if( diffs[k] & negative_bit_mask ) {
					diffs[k] |= sign_extend_mask;
				}
				prev_sample = samples[j];
				j++;
			}
			bpd = bits_needed_v1(diffs[k]);
	 		if( bpd > max_bpd ) { max_bpd = bpd; }
		}

		//pack the full diffs into compressed form
		// (a block always advances the output by max_bpd bytes, even when partially filled)
		packDiffs( &encoded_diffs[i_diffs], max_bpd, diffs, diff_count );
		i_diffs += max_bpd;
		
		//update the bpd run lengths in the header if necessary:
		// a run is flushed when the bpd changes or the run has reached 256 blocks
		// (the run length is stored minus one, so 256 is the most one byte can hold)
		if( (last_max_bpd!=max_bpd || last_max_bpd_run_length>=256) &&
		    last_max_bpd_run_length!=0 )
		{
			encoded_header[++i_header] = (last_max_bpd_run_length-1);
			encoded_header[++i_header] = last_max_bpd;
			last_max_bpd_run_length = 1;
			++run_length_count;
		}
		else {
			++last_max_bpd_run_length;
		}
		last_max_bpd = max_bpd;
	}
	//encode the leftover run length
	encoded_header[++i_header] = (last_max_bpd_run_length-1);
	encoded_header[++i_header] = last_max_bpd;
	//the actual run_length_count should be incremented here
	// but we want to encode one less than the run length count since
	// we know there is always at least one run length
	//++run_length_count;

	//now that we know the run length count, go back and set it
	encoded_header[2] = run_length_count;

	//report the header size and the size of the encoded diffs,
	// and store their sum as a 2 byte value, high byte first
	*encoded_header_size = i_header+1;
	*encoded_diffs_size = i_diffs;
	subframe_size[0] = ((*encoded_diffs_size + *encoded_header_size) >> 8) & 0xFF;
	subframe_size[1] = (*encoded_diffs_size + *encoded_header_size) & 0xFF;
}

//Writes this subframe to ofs: first the bytes of encoded_header, then
// encoded_diffs_size bytes starting at encoded_diffs.
//Always returns true; the state of the stream is not inspected.
bool tlc_v1_sub_frame::write(ofstream &ofs)
{
	const char *header_bytes = (const char*)&encoded_header[0];
	const char *diff_bytes   = (const char*)encoded_diffs;

	ofs.write( header_bytes, encoded_header.size() );
	ofs.write( diff_bytes, encoded_diffs_size );

	return true;
}



/******************************************************************************
* tlc_v1_frame
*/
//Constructs a TLCv1 frame.
// All three arguments are forwarded unchanged to the tlc_frame base class;
// nothing else is initialised here.
tlc_v1_frame::tlc_v1_frame(tlc_file_header *file_header, mutex *frames_mutex, condition *frames_condition)
	:  tlc_frame(file_header,frames_mutex,frames_condition)
{
}

//Nothing to release here: the destructor body is intentionally empty.
tlc_v1_frame::~tlc_v1_frame()
{
}

//Decodes the subframes with indices in [i1, i2) of one frame.
//tlc_v1_frame::decode calls this directly, or once per worker thread.
// encoded_data:             the encoded frame.
// samples / sample_count:   destination for the whole frame and its total
//                           number of samples; subframe i is written at
//                           samples[i*TLC_V1_SUBFRAME_SAMPLE_COUNT].
// subframe_boundaries:      subframe i occupies the bytes
//                           [subframe_boundaries[i], subframe_boundaries[i+1])
//                           of encoded_data.
// subframe_boundaries_size: number of entries in subframe_boundaries
//                           (number of subframes + 1).
//The result of each subframe's decode is not checked.
void decode_subframe_block( unsigned int i1, unsigned int i2,
                            unsigned char *encoded_data, short *samples, unsigned int sample_count,
                            unsigned int *subframe_boundaries, unsigned int subframe_boundaries_size,
                            tlc_file_header *file_header  )

{
	const unsigned int subframe_count = subframe_boundaries_size - 1;

	//only the last subframe of the frame may be shorter than the others
	unsigned int tail_sample_count = sample_count % TLC_V1_SUBFRAME_SAMPLE_COUNT;
	if(tail_sample_count == 0) {
		tail_sample_count = TLC_V1_SUBFRAME_SAMPLE_COUNT;
	}

	for(unsigned int i = i1; i < i2; ++i)
	{
		const bool is_last_subframe = (i + 1 == subframe_count);
		const unsigned int subframe_sample_count =
			is_last_subframe ? tail_sample_count : TLC_V1_SUBFRAME_SAMPLE_COUNT;

		const unsigned int begin = subframe_boundaries[i];
		const unsigned int end   = subframe_boundaries[i+1];

		tlc_v1_sub_frame sub_frame(file_header);
		sub_frame.decode( encoded_data + begin, end - begin,
		                  samples + i*TLC_V1_SUBFRAME_SAMPLE_COUNT,
		                  subframe_sample_count );
	}
}


//Decodes a whole frame of sample_count samples from encoded_data into samples.
// encoded_data starts with a table of 2 byte subframe sizes (high byte
// first, one entry per subframe), followed by the subframes back to back.
//Listeners on frames_condition are notified when decoding ends, whether or
// not it succeeded.
//returns false if encoded_data is too short for its size table or for the
// subframes that table describes, true otherwise.
// Failures inside individual subframes are not reported, because
// decode_subframe_block returns nothing.
bool tlc_v1_frame::decode(vector<unsigned char> &encoded_data, unsigned int sample_count)
{
	samples.resize(sample_count);

	//calculate the expected number of subframes
	unsigned int subframe_count =
			(sample_count + TLC_V1_SUBFRAME_SAMPLE_COUNT - 1) /
			TLC_V1_SUBFRAME_SAMPLE_COUNT;

	//read the size of each subframe and compute the
	// start/end boundaries of each subframe in encoded_data,
	// refusing anything that would lie outside of encoded_data
	vector<unsigned int> subframe_boundaries(subframe_count+1);
	subframe_boundaries[0] = subframe_count*2;
	bool ok = ( encoded_data.size() >= subframe_boundaries[0] );
	for(unsigned int i=0; ok && i<subframe_count; ++i) {
		unsigned int j = i*2;
		subframe_boundaries[i+1] = subframe_boundaries[i] +
			( ((unsigned short)encoded_data[j])<<8 | encoded_data[j+1] );
		if( subframe_boundaries[i+1] > encoded_data.size() ) {
			ok = false;
		}
	}

	//with no subframes the vectors below may be empty,
	// and taking &v[0] of an empty vector is undefined
	if(ok && subframe_count > 0)
	{
#ifdef TLC_USE_FRAME_BLOCK_READ_THREADS
		//split the subframes into blocks of subframes
		// decode each block in a separate thread
		thread_group frame_block_threads;
		unsigned int thread_count = thread::hardware_concurrency();
		if(thread_count == 0) {
			//0 means the number of hardware threads is unknown; still decode
			thread_count = 1;
		}
		unsigned int t=0;
		while( t<thread_count )
		{
			unsigned int tp1 = t+1;
			frame_block_threads.create_thread( bind(
				decode_subframe_block, t*subframe_count/thread_count,
				                       tp1*subframe_count/thread_count,
				                       &encoded_data[0], &samples[0], sample_count,
				                       &subframe_boundaries[0], subframe_boundaries.size(),
				                       file_header )
			);
			t = tp1;
		}
		frame_block_threads.join_all();
#else
		decode_subframe_block( 0, subframe_count,
		                       &encoded_data[0], &samples[0], sample_count,
		                       &subframe_boundaries[0], subframe_boundaries.size(),
		                       file_header );
#endif
	}


	//notify listeners that we're done (also on failure, so nobody waits forever)
	{
		unique_lock<mutex> frames_lock(*frames_mutex);
		finished = true;
	}
	frames_condition->notify_all();

	return ok;
}


//Encodes subframe i of a frame.
// The frame-wide buffers are passed in; this function selects the slice
// belonging to subframe i in each of them and hands it to
// tlc_v1_encode_subframe.
// i / subframe_count:   index of the subframe and number of subframes.
// samples/sample_count: all samples of the frame.
// encoded_headers:      header buffers, TLC_V1_ENCODED_HEADERS_PITCH bytes apart.
// encoded_diffs:        diff buffers, TLC_V1_ENCODED_DIFFS_PITCH bytes apart.
// encoded_headers_sizes, encoded_diffs_sizes: one entry per subframe.
// subframe_sizes:       two bytes per subframe.
void encode_subframe_block( unsigned int i, unsigned int subframe_count, unsigned char bits_per_sample,
                            short *samples, unsigned int sample_count,
                            char *encoded_headers, unsigned int *encoded_headers_sizes,
			    char *encoded_diffs, unsigned int *encoded_diffs_sizes,
			    char *subframe_sizes )
{
	//every subframe is full except possibly the last one
	unsigned int subframe_sample_count = TLC_V1_SUBFRAME_SAMPLE_COUNT;
	const bool is_last_subframe = (i + 1 == subframe_count);
	if(is_last_subframe) {
		const unsigned int leftover = sample_count % TLC_V1_SUBFRAME_SAMPLE_COUNT;
		if(leftover != 0) {
			subframe_sample_count = leftover;
		}
	}

	tlc_v1_encode_subframe( bits_per_sample,
	                        samples + i * TLC_V1_SUBFRAME_SAMPLE_COUNT, subframe_sample_count,
	                        encoded_headers + i * TLC_V1_ENCODED_HEADERS_PITCH, encoded_headers_sizes + i,
	                        encoded_diffs + i * TLC_V1_ENCODED_DIFFS_PITCH, encoded_diffs_sizes + i,
	                        subframe_sizes + i*2 );
}

//Encodes the subframes with indices in [i1, i2), in increasing order.
// All other arguments are passed through unchanged to encode_subframe_block.
void encode_subframe_blocks( unsigned int i1, unsigned int i2,
                             unsigned int subframe_count, unsigned char bits_per_sample,
                             short *samples, unsigned int sample_count,
                             char *encoded_headers, unsigned int *encoded_header_sizes,
                             char *encoded_diffs, unsigned int *encoded_diffs_sizes,
                             char *subframe_sizes )
{
	unsigned int subframe = i1;
	while(subframe < i2)
	{
		encode_subframe_block( subframe, subframe_count, bits_per_sample,
		                       samples, sample_count,
		                       encoded_headers, encoded_header_sizes,
		                       encoded_diffs, encoded_diffs_sizes,
		                       subframe_sizes );
		++subframe;
	}
}


#ifdef TLC_USE_FRAME_BLOCK_WRITE_THREADS
namespace {
//Callable that runs encode_subframe_blocks on the subframes [i1, i2).
// It only bundles the arguments, so that a worker thread can be started
// from a single object instead of a bind expression with eleven arguments.
struct encode_subframe_blocks_task
{
	unsigned int i1;
	unsigned int i2;
	unsigned int subframe_count;
	unsigned char bits_per_sample;
	short *samples;
	unsigned int sample_count;
	char *encoded_headers;
	unsigned int *encoded_headers_sizes;
	char *encoded_diffs;
	unsigned int *encoded_diffs_sizes;
	char *subframe_sizes;

	void operator()() const
	{
		encode_subframe_blocks( i1, i2, subframe_count, bits_per_sample,
		                        samples, sample_count,
		                        encoded_headers, encoded_headers_sizes,
		                        encoded_diffs, encoded_diffs_sizes,
		                        subframe_sizes );
	}
};
}
#endif

//Encodes the frame's samples, one subframe at a time.
// encode overwrites samples to perform an in-place encoding: the packed
// diffs of subframe i are written over the sample buffer, starting at byte
// i * TLC_V1_ENCODED_DIFFS_PITCH. Headers and sizes go to encoded_headers,
// encoded_headers_sizes, encoded_diffs_sizes and subframe_sizes.
//Listeners on frames_condition are notified once encoding is done.
void tlc_v1_frame::encode()
{
	//calculate the expected number of subframes
	unsigned int subframe_count =
			(samples.size() + TLC_V1_SUBFRAME_SAMPLE_COUNT - 1) /
			TLC_V1_SUBFRAME_SAMPLE_COUNT;

	encoded_headers.resize( subframe_count * TLC_V1_ENCODED_HEADERS_PITCH );
	encoded_headers_sizes.resize( subframe_count );
	encoded_diffs_sizes.resize( subframe_count );
	subframe_sizes.resize( subframe_count * 2 );

	//with no samples every vector above is empty,
	// and taking &v[0] of an empty vector is undefined
	if(subframe_count > 0)
	{
#ifdef TLC_USE_FRAME_BLOCK_WRITE_THREADS
		//split the subframes into blocks of subframes
		// encode each block in a separate thread
		thread_group frame_block_threads;
		unsigned int thread_count = thread::hardware_concurrency();
		if(thread_count == 0) {
			//0 means the number of hardware threads is unknown; still encode
			thread_count = 1;
		}

		encode_subframe_blocks_task task;
		task.subframe_count = subframe_count;
		task.bits_per_sample = file_header->bits_per_sample;
		task.samples = &samples[0];
		task.sample_count = (unsigned int)samples.size();
		task.encoded_headers = &encoded_headers[0];
		task.encoded_headers_sizes = &encoded_headers_sizes[0];
		task.encoded_diffs = (char*)&samples[0];
		task.encoded_diffs_sizes = &encoded_diffs_sizes[0];
		task.subframe_sizes = &subframe_sizes[0];

		for(unsigned int t=0; t<thread_count; ++t)
		{
			task.i1 = t*subframe_count/thread_count;
			task.i2 = (t+1)*subframe_count/thread_count;
			//NOTE(review): relies on create_thread copying its callable,
			// since task is modified again on the next iteration - confirm
			frame_block_threads.create_thread( task );
		}
		frame_block_threads.join_all();
#else
		encode_subframe_blocks( 0, subframe_count,  subframe_count, file_header->bits_per_sample,
		                        &samples[0], samples.size(),
		                        &encoded_headers[0], &encoded_headers_sizes[0],
		                        (char*)&samples[0], &encoded_diffs_sizes[0],
		                        &subframe_sizes[0] );
#endif
	}


	//notify listeners that we're done
	{
#ifdef TLC_USE_FRAME_WRITE_THREAD_POOL  //file write thread is a listener
		unique_lock<mutex> frames_lock(*frames_mutex);
#endif
		finished = true;
	}
	frames_condition->notify_all();
}


//Writes the encoded frame to ofs: the table of subframe sizes first, then
// for every subframe its header followed by its packed diffs.
// The packed diffs are read from the sample buffer, where the in-place
// encoding left them. The state of the stream is not checked.
void tlc_v1_frame::write(ofstream &ofs)
{
	//write the subframe sizes followed by the subframes
	// (an empty frame has no size table; taking &v[0] of an empty vector is undefined)
	if( !subframe_sizes.empty() ) {
		ofs.write( &subframe_sizes[0], subframe_sizes.size() );
	}
	for(unsigned int i=0; i<encoded_headers_sizes.size(); ++i) {
		ofs.write( (char*) &encoded_headers[i * TLC_V1_ENCODED_HEADERS_PITCH],
		           encoded_headers_sizes[i] );
		ofs.write( (char*) &samples[i * TLC_V1_ENCODED_DIFFS_PITCH / 2],  //div 2 because we're indexing by words not bytes
		           encoded_diffs_sizes[i] );
	}
}
