#include "solver.hpp"

static unsigned int lfsr, lfsr_x, lfsr_y;

static void lfsr_random_init(unsigned int seed, unsigned int seed_x, unsigned int seed_y) {
#pragma HLS INLINE
	lfsr = seed;
	lfsr_x = seed_x;
	lfsr_y = seed_y;
}

static unsigned int lfsr_random() {
#pragma HLS INLINE
	bool b_32 = false, b_22 = false, b_2 = false, b_1 = false;
	if((lfsr >>  0) % 2 == 1) b_32 = true;
	if((lfsr >> 10) % 2 == 1) b_22 = true;
	if((lfsr >> 30) % 2 == 1) b_2 = true;
	if((lfsr >> 31) % 2 == 1) b_1 = true;
	bool new_bit = b_32 ^ b_22 ^ b_2 ^ b_1;
	lfsr >>= 1;
	if(new_bit) lfsr |= 0x80000000;
	return lfsr;
}
static unsigned int lfsr_x_random() {
#pragma HLS INLINE
	bool b_32 = false, b_22 = false, b_2 = false, b_1 = false;
	if((lfsr_x >>  0) % 2 == 1) b_32 = true;
	if((lfsr_x >> 10) % 2 == 1) b_22 = true;
	if((lfsr_x >> 30) % 2 == 1) b_2 = true;
	if((lfsr_x >> 31) % 2 == 1) b_1 = true;
	bool new_bit = b_32 ^ b_22 ^ b_2 ^ b_1;
	lfsr_x >>= 1;
	if(new_bit) lfsr_x |= 0x80000000;
	return lfsr_x;
}
static unsigned int lfsr_y_random() {
#pragma HLS INLINE
	bool b_32 = false, b_22 = false, b_2 = false, b_1 = false;
	if((lfsr_y >>  0) % 2 == 1) b_32 = true;
	if((lfsr_y >> 10) % 2 == 1) b_22 = true;
	if((lfsr_y >> 30) % 2 == 1) b_2 = true;
	if((lfsr_y >> 31) % 2 == 1) b_1 = true;
	bool new_bit = b_32 ^ b_22 ^ b_2 ^ b_1;
	lfsr_y >>= 1;
	if(new_bit) lfsr_y |= 0x80000000;
	return lfsr_y;
}

bool solver(hls::stream<ap_uint<4> >& state, unsigned int seed[3], short int *pbnum, short int *plnum,
		short int block_info[MAX_BLOCKS+1][5][3], short int *pwi, short int *phi, short int block_place_global[MAX_BLOCKS+1][2], short int opt_result[MAX_CELLS]) {
#pragma HLS INTERFACE s_axilite port=return
#pragma HLS INTERFACE ap_hs register port=state
#pragma HLS INTERFACE m_axi depth=1 port=seed offset=slave bundle=io_data
#pragma HLS INTERFACE m_axi depth=1 port=pbnum offset=slave bundle=io_data
#pragma HLS INTERFACE m_axi depth=1 port=plnum offset=slave bundle=io_data
#pragma HLS INTERFACE m_axi depth=1 port=block_info offset=slave bundle=io_data
#pragma HLS INTERFACE m_axi depth=1 port=pwi offset=slave bundle=io_data
#pragma HLS INTERFACE m_axi depth=1 port=phi offset=slave bundle=io_data
#pragma HLS INTERFACE m_axi depth=1 port=block_place_global offset=slave bundle=io_data
#pragma HLS INTERFACE m_axi depth=1 port=opt_result offset=slave bundle=io_data

	state.write(0b0000);

	short int blocks = *pbnum;
	short int line_num = *plnum;
	short int wi, hi;
	
	lfsr_random_init(seed[0], seed[1], seed[2]);
	
	bool status;
	
	state.write(0b0001);

	status = local_placement_1(line_num, blocks, &wi, &hi, block_info, block_place_global, opt_result);
	if(!status) { // Cannot route
		state.write(0b0010);
		return false;
	}

	state.write(0b0011);

	status = local_placement_2(line_num, blocks, &wi, &hi, block_info, opt_result);
	if(!status) {
		state.write(0b0100);
		return false;
	}
	else {
		state.write(0b0111);
	}

	*pwi = wi;
	*phi = hi;

	return true;
}

bool local_placement_1(short int line_num, short int blocks, short int *pwi, short int *phi, short int block_info[MAX_BLOCKS+1][5][3], short int block_place_global[MAX_BLOCKS+1][2], short int opt_result[MAX_CELLS]) {
	
	short int wi, hi; // board size (w, h)
	
	// size of each slot (w, h)
	short int slot_w[SLOT_MAX_SIZE];
	short int slot_h[SLOT_MAX_SIZE];
	// basic position of each slot (x, y)
	short int slot_w_t[SLOT_MAX_SIZE];
	short int slot_h_t[SLOT_MAX_SIZE];
	
	short int block_place_slack[MAX_BLOCKS+1][2];
	short int block_place_basis[MAX_BLOCKS+1][2];
	short int block_place_delta[MAX_BLOCKS+1][2];
	short int block_place_opt[MAX_BLOCKS+1][2];
	
	short int board_str[MAX_CELLS];
#pragma HLS ARRAY_PARTITION variable=board_str cyclic factor=4 dim=1
	
	// load a placement result
	short int min_x = SLOT_MAX_SIZE, min_y = SLOT_MAX_SIZE, max_x = 0, max_y = 0;
	PLACE_1_LOAD_1:
	for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
		short int p = block_place_global[i][0];
		short int q = block_place_global[i][1];
		if(p < min_x) { min_x = p; }
		if(q < min_y) { min_y = q; }
		if(p > max_x) { max_x = p; }
		if(q > max_y) { max_y = q; }
	}
	PLACE_1_LOAD_2:
	for(short int p = min_x; p <= max_x; p++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=64
		slot_w[p] = 0;
	}
	PLACE_1_LOAD_3:
	for(short int q = min_y; q <= max_y; q++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=64
		slot_h[q] = 0;
	}
	PLACE_1_LOAD_4:
	for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
		short int sx = block_info[i][0][0];
		short int sy = block_info[i][0][1];
		short int p = block_place_global[i][0];
		short int q = block_place_global[i][1];
		if(sx > slot_w[p]) { slot_w[p] = sx; }
		if(sy > slot_h[q]) { slot_h[q] = sy; }
	}
	wi = INTER_BLOCK_MARGIN;
	PLACE_1_LOAD_5:
	for(short int p = min_x; p <= max_x; p++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=64
		slot_w_t[p] = wi;
		wi += slot_w[p] + INTER_BLOCK_MARGIN;
	}
	hi = INTER_BLOCK_MARGIN;
	PLACE_1_LOAD_6:
	for(short int q = min_y; q <= max_y; q++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=64
		slot_h_t[q] = hi;
		hi += slot_h[q] + INTER_BLOCK_MARGIN;
	}
	
#ifdef DEBUG_PRINT
	cout << "== Local placement with routing (1) ==" << endl;
	cout << "Board size = " << wi << "X" << hi << endl;
#endif
	
	// calculate slack for each block
	PLACE_1_CALC_SLACK:
	for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
		short int sx = block_info[i][0][0];
		short int sy = block_info[i][0][1];
		short int p = block_place_global[i][0];
		short int q = block_place_global[i][1];
		block_place_basis[i][0] = slot_w_t[p];
		block_place_basis[i][1] = slot_h_t[q];
		block_place_slack[i][0] = slot_w[p] - sx;
		block_place_slack[i][1] = slot_h[q] - sy;
	}
	
	bool find_route = false;
	int min_status = MAX_CELLS;
	PLACE_1_SEARCH:
	for(int counter = 0; counter < LOOP; counter++) {
#ifdef DEBUG_PRINT
		cout << "."; fflush(stdout);
#endif
		PLACE_1_SEARCH_INIT:
		for(int s = 0; s < MAX_CELLS; s++) {
#pragma HLS UNROLL factor=8
			board_str[s] = 0;
		}
		PLACE_1_SEARCH_SET:
		for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
			short int x = block_place_basis[i][0];
			short int y = block_place_basis[i][1];
			short int dx = lfsr_x_random() % (block_place_slack[i][0] + 1);
			short int dy = lfsr_y_random() % (block_place_slack[i][1] + 1);
			block_place_delta[i][0] = dx;
			block_place_delta[i][1] = dy;
			PLACE_1_SEARCH_SET_1:
			for(int it = 1; it < 5; it++) {
				short int tx = block_info[i][it][0];
				short int ty = block_info[i][it][1];
				short int tn = block_info[i][it][2];
				if(tn < 0) break;
				short int p = x + dx + tx;
				short int q = y + dy + ty;
				short int idx = q * MAX_SIZE + p;
				if(tn > 0) { board_str[idx] = tn; }
				else { board_str[idx] = -1; }
			}
		}
		
		int status = router(wi, hi, line_num, board_str); // routing
		
		if(status >= 0 && status < min_status) {
			min_status = status;
			find_route = true;
			PLACE_1_SEARCH_OUT_1:
			for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
				block_place_opt[i][0] = block_place_basis[i][0] + block_place_delta[i][0];
				block_place_opt[i][1] = block_place_basis[i][1] + block_place_delta[i][1];
			}
			PLACE_1_SEARCH_OUT_2_Y:
			for(short int y = 0; y < hi; y++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
				PLACE_1_SEARCH_OUT_2_X:
				for(short int x = 0; x < wi; x++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
					short int idx = y * MAX_SIZE + x;
					opt_result[idx] = board_str[idx];
				}
			}
		}
	}
#ifdef DEBUG_PRINT
	cout << endl;
#endif
	
	if(!find_route) return false;
	
	// Update
	PLACE_1_UPDATE_I:
	for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
		block_place_basis[i][0] = block_place_opt[i][0];
		block_place_basis[i][1] = block_place_opt[i][1];
	}
	
	/// print for debug ///
#ifdef DEBUG_PRINT
	cout << "Min-wire status: " << min_status << endl;
	show_result(line_num, blocks, wi, hi, opt_result);
#endif
	/// print for debug ///
	
	//cout << "== Local placement with routing (1+) ==" << endl;
	bool remove_flag;
	short int deleted_line_w = 0, deleted_line_h = 0;
	PLACE_1_UPDATE_Y:
	for(short int q = hi - 1; q >= 0; q--) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
		remove_flag = true;
		PLACE_1_UPDATE_Y_X:
		for(short int p = 0; p < wi; p++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
			short int idx = q * MAX_SIZE + p;
			if(opt_result[idx] != 0) {
				remove_flag = false;
				break;
			}
		}
		if(!remove_flag) continue;
		PLACE_1_UPDATE_Y_B:
		for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
			if(block_place_basis[i][1] > q) { block_place_basis[i][1] -= 1; }
		}
		deleted_line_h++;
	}
	PLACE_1_UPDATE_X:
	for(short int p = wi - 1; p >= 0; p--) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
		remove_flag = true;
		PLACE_1_UPDATE_X_Y:
		for(short int q = 0; q < hi; q++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
			short int idx = q * MAX_SIZE + p;
			if(opt_result[idx] != 0) {
				remove_flag = false;
				break;
			}
		}
		if(!remove_flag) continue;
		PLACE_1_UPDATE_X_B:
		for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
			if(block_place_basis[i][0] > p) { block_place_basis[i][0] -= 1; }
		}
		deleted_line_w++;
	}
	hi -= deleted_line_h;
	wi -= deleted_line_w;
	
	PLACE_1_REROUTE_INIT:
	for(int s = 0; s < MAX_CELLS; s++) {
#pragma HLS UNROLL factor=8
		board_str[s] = 0;
	}
	PLACE_1_REROUTE_SET:
	for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
		short int x = block_place_basis[i][0];
		short int y = block_place_basis[i][1];
		PLACE_1_REROUTE_SET_1:
		for(int it = 1; it < 5; it++) {
			short int tx = block_info[i][it][0];
			short int ty = block_info[i][it][1];
			short int tn = block_info[i][it][2];
			if(tn < 0) break;
			short int p = x + tx;
			short int q = y + ty;
			short int idx = q * MAX_SIZE + p;
			if(tn > 0) { board_str[idx] = tn; }
			else { board_str[idx] = -1; }
		}
	}
	
	int counter = 0;
	PLACE_1_REROUTE_LOOP:
	while(router(wi, hi, line_num, board_str) < 0) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=5
		if(counter++ >= LOOP) return false;
	}

	PLACE_1_O_1_Y:
	for(short int y = 0; y < hi; y++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
		PLACE_1_O_1_X:
		for(short int x = 0; x < wi; x++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
			short int idx = y * MAX_SIZE + x;
			opt_result[idx] = board_str[idx];
		}
	}
	PLACE_1_O_2:
	for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
		block_info[i][0][0] = block_place_basis[i][0];
		block_info[i][0][1] = block_place_basis[i][1];
	}
	
	*pwi = wi;
	*phi = hi;

	return true;
}

bool local_placement_2(short int line_num, short int blocks, short int *pwi, short int *phi, short int block_info[MAX_BLOCKS+1][5][3], short int opt_result[MAX_CELLS]) {
	
	short int wi = *pwi, hi = *phi;
	
	short int block_place_slack[MAX_BLOCKS+1][2];
	short int block_place_basis[MAX_BLOCKS+1][2];
	short int block_place_delta[MAX_BLOCKS+1][2];
	short int block_place_opt[MAX_BLOCKS+1][2];
	
	short int board_str[MAX_CELLS];
#pragma HLS ARRAY_PARTITION variable=board_str cyclic factor=4 dim=1
	
	PLACE_2_I:
	for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
		block_place_basis[i][0] = block_info[i][0][0];
		block_place_basis[i][1] = block_info[i][0][1];
	}
	
#ifdef DEBUG_PRINT
	cout << "== Local placement with routing (2) ==" << endl;
#endif
	int direction = 0; // 0: X, 1: Y
	int try_count = 1;
	int no_update = 0;
	
	if(hi > wi) direction = 1;
	PLACE_2_BODY:
	while(try_count <= TRY_LIMIT) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=50
#ifdef DEBUG_PRINT
		cout << "Phase " << try_count << ": direction: ";
		switch(direction) {
			case 0: cout << "X" << endl; break;
			case 1: cout << "Y" << endl; break;
		}
		cout << "Board size = " << wi << "X" << hi << endl;
#endif
		
		// calculate slack for each block
		PLACE_2_CALC_SLACK:
		for(int t = 1; t <= blocks; t++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
			block_place_slack[t][0] = 0;
			block_place_slack[t][1] = 0;
			PLACE_2_CALC_SLACK_I:
			for(int s = 0; s < MAX_CELLS; s++) {
#pragma HLS UNROLL factor=8
				board_str[s] = 0;
			}
			PLACE_2_CALC_SLACK_1:
			for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
				if(i == t) continue;
				short int x = block_place_basis[i][0];
				short int y = block_place_basis[i][1];
				PLACE_2_CALC_SLACK_1_1:
				for(int it = 1; it < 5; it++) {
					short int tx = block_info[i][it][0];
					short int ty = block_info[i][it][1];
					short int tn = block_info[i][it][2];
					if(tn < 0) break;
					short int p = x + tx;
					short int q = y + ty;
					short int idx = q * MAX_SIZE + p;
					if(tn > 0) { board_str[idx] = tn; }
					else { board_str[idx] = -1; }
				}
			}
			bool move_x = true, move_y = true;
			PLACE_2_CALC_SLACK_2: // X direction
			while(direction == 0) {
#pragma HLS LOOP_TRIPCOUNT min=0 max=10

				short int x = block_place_basis[t][0];
				short int y = block_place_basis[t][1];
				PLACE_2_CALC_SLACK_2_1:
				for(int it = 1; it < 5; it++) {
					short int tx = block_info[t][it][0];
					short int ty = block_info[t][it][1];
					short int tn = block_info[t][it][2];
					if(tn < 0) break;
					short int p = x + tx - (block_place_slack[t][0]+1);
					short int q = y + ty;
					short int idx = q * MAX_SIZE + p;
					if(p < 0 || board_str[idx] != 0) {
						move_x = false;
						break;
					}
				}
				if(move_x) {
					if(++block_place_slack[t][0] >= 9) break;
				}
				else {
					break;
				}
			}
			PLACE_2_CALC_SLACK_3: // Y direction
			while(direction == 1) {
#pragma HLS LOOP_TRIPCOUNT min=0 max=10
				short int x = block_place_basis[t][0];
				short int y = block_place_basis[t][1];
				PLACE_2_CALC_SLACK_3_1:
				for(int it = 1; it < 5; it++) {
					short int tx = block_info[t][it][0];
					short int ty = block_info[t][it][1];
					short int tn = block_info[t][it][2];
					if(tn < 0) break;
					short int p = x + tx;
					short int q = y + ty - (block_place_slack[t][1]+1);
					short int idx = q * MAX_SIZE + p;
					if(q < 0 || board_str[idx] != 0) {
						move_y = false;
						break;
					}
				}
				if(move_y) {
					if(++block_place_slack[t][1] >= 9) break;
				}
				else {
					break;
				}
			}
		}
		
		no_update++;
		bool find_route = false;
		int d_sum_max = 0;
		PLACE_2_SEARCH:
		for(int counter = 0; counter < LOOP; counter++) {
#ifdef DEBUG_PRINT
			cout << "."; fflush(stdout);
#endif
			PLACE_2_SEARCH_INIT:
			for(int s = 0; s < MAX_CELLS; s++) {
#pragma HLS UNROLL factor=8
				board_str[s] = 0;
			}
			int d_sum = 0;
			PLACE_2_SEARCH_SET:
			for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
				short int x = block_place_basis[i][0];
				short int y = block_place_basis[i][1];
				short int dx = 0;
				short int dy = 0;
				switch(direction) {
					case 0: dx = lfsr_x_random() % (block_place_slack[i][0] + 1); break;
					case 1: dy = lfsr_y_random() % (block_place_slack[i][1] + 1); break;
				}
				if(lfsr_random() % 16 >= 2) {
					dx = 0;
					dy = 0;
				}
				block_place_delta[i][0] = dx;
				block_place_delta[i][1] = dy;
				d_sum += dx + dy;
				PLACE_2_SEARCH_SET_1:
				for(int it = 1; it < 5; it++) {
					short int tx = block_info[i][it][0];
					short int ty = block_info[i][it][1];
					short int tn = block_info[i][it][2];
					if(tn < 0) break;
					short int p = x - dx + tx;
					short int q = y - dy + ty;
					short int idx = q * MAX_SIZE + p;
					if(tn > 0) { board_str[idx] = tn; }
					else{ board_str[idx] = -1; }
				}
			}
			
			int status = router(wi, hi, line_num, board_str);
			
			if(status >= 0 && d_sum > d_sum_max) {
				d_sum_max = d_sum;
				find_route = true;
				PLACE_2_SEARCH_OUT_1:
				for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
					block_place_opt[i][0] = block_place_basis[i][0] - block_place_delta[i][0];
					block_place_opt[i][1] = block_place_basis[i][1] - block_place_delta[i][1];
				}
				PLACE_2_SEARCH_OUT_2_Y:
				for(short int y = 0; y < hi; y++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
					PLACE_2_SEARCH_OUT_2_X:
					for(short int x = 0; x < wi; x++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
						short int idx = y * MAX_SIZE + x;
						opt_result[idx] = board_str[idx];
					}
				}
			}
		}
#ifdef DEBUG_PRINT
		cout << endl;
#endif
		
		if(find_route) {
			
			no_update = 0; // reset
			
			// Update
			PLACE_2_UPDATE_I:
			for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
				block_place_basis[i][0] = block_place_opt[i][0];
				block_place_basis[i][1] = block_place_opt[i][1];
			}
			
			/// print for debug ///
#ifdef DEBUG_PRINT
			cout << "Max dSum: " << d_sum_max << endl;
			show_result(line_num, blocks, wi, hi, opt_result);
#endif
			/// print for debug ///
			
			bool remove_flag;
			short int deleted_line_w = 0, deleted_line_h = 0;
			PLACE_2_UPDATE_Y:
			for(short int q = hi - 1; q >= 0; q--) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
				remove_flag = true;
				PLACE_2_UPDATE_Y_X:
				for(short int p = 0; p < wi; p++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
					short int idx = q * MAX_SIZE + p;
					if(opt_result[idx] != 0) {
						remove_flag = false;
						break;
					}
				}
				if(!remove_flag) continue;
				PLACE_2_UPDATE_Y_B:
				for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
					if(block_place_basis[i][1] > q) { block_place_basis[i][1] -= 1; }
				}
				deleted_line_h++;
			}
			PLACE_2_UPDATE_X:
			for(short int p = wi - 1; p >= 0; p--) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
				remove_flag = true;
				PLACE_2_UPDATE_X_Y:
				for(short int q = 0; q < hi; q++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
					short int idx = q * MAX_SIZE + p;
					if(opt_result[idx] != 0) {
						remove_flag = false;
						break;
					}
				}
				if(!remove_flag) continue;
				PLACE_2_UPDATE_X_B:
				for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
					if(block_place_basis[i][0] > p) { block_place_basis[i][0] -= 1; }
				}
				deleted_line_w++;
			}
			hi -= deleted_line_h;
			wi -= deleted_line_w;
			
			PLACE_2_REROUTE_INIT:
			for(int s = 0; s < MAX_CELLS; s++) {
#pragma HLS UNROLL factor=8
				board_str[s] = 0;
			}
			PLACE_2_REROUTE_SET:
			for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
				short int x = block_place_basis[i][0];
				short int y = block_place_basis[i][1];
				PLACE_2_REROUTE_SET_1:
				for(int it = 1; it < 5; it++) {
					short int tx = block_info[i][it][0];
					short int ty = block_info[i][it][1];
					short int tn = block_info[i][it][2];
					if(tn < 0) break;
					short int p = x + tx;
					short int q = y + ty;
					short int idx = q * MAX_SIZE + p;
					if(tn > 0) { board_str[idx] = tn; }
					else { board_str[idx] = -1; }
				}
			}
			
			int counter = 0;
			PLACE_2_REROUTE_LOOP:
			while(router(wi, hi, line_num, board_str) < 0) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=5
				if(counter++ >= LOOP) return false;
			}
			PLACE_2_REROUTE_OUT_Y:
			for(short int y = 0; y < hi; y++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
				PLACE_2_REROUTE_OUT_X:
				for(short int x = 0; x < wi; x++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
					short int idx = y * MAX_SIZE + x;
					opt_result[idx] = board_str[idx];
				}
			}
		} // End update
		
		if(no_update >= NO_MOVE) break;
		
		try_count++;
		switch(direction) {
			case 0: direction = 1; break;
			case 1: direction = 0; break;
		}
	}
#ifdef DEBUG_PRINT
	cout << "#Tries: " << try_count << endl;
#endif

	PLACE_2_O_2:
	for(int i = 1; i <= blocks; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
		block_info[i][0][0] = block_place_basis[i][0];
		block_info[i][0][1] = block_place_basis[i][1];
	}
	
	*pwi = wi;
	*phi = hi;

	return true;
}







unsigned short int new_weight(unsigned short int x) {
#pragma HLS INLINE
	unsigned short int y = (x & 0x00FF) + 1;
	return y;
}

unsigned short int my_abs(short int x) {
#pragma HLS INLINE
	if(x < 0) x *= -1;
	return x;
}

// DATA_BIT: bit length of cell idx
// PRIO_BIT: bit length of cost(priority)
int router(short int size_x, short int size_y, short int line_num, short int board_str[MAX_CELLS]) {
	
	if(line_num == 0) return 0;
	
	short int terminals[MAX_LINES][2];
	short int paths_size[MAX_LINES];
	unsigned short int paths[MAX_LINES][MAX_PATH];
	bool adjacents[MAX_LINES];
	
	unsigned short int weights[MAX_CELLS];
	
	SET_LOOP_1:
	for(int i = 0; i < line_num; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=256
		terminals[i][0] = -1;
		terminals[i][1] = -1;
		paths_size[i] = 0;
		adjacents[i] = false;
	}
	SET_LOOP_2_Y:
	for(short int y = 0; y < size_y; y++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
		SET_LOOP_2_X:
		for(short int x = 0; x < size_x; x++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
			short int j = y * MAX_SIZE + x;
			short int num = board_str[j];
			switch(num) {
			case 0:
				weights[j] = 1;
				break;
			case -1:
				weights[j] = PRIO_MAX;
				break;
			default:
				weights[j] = PRIO_MAX;
				if(terminals[num-1][0] >= 0) { terminals[num-1][1] = j; }
				else { terminals[num-1][0] = j; }
				break;
			}
		}
	}
	
#ifdef PRINT_BOARD
	cout << "== Print board ==" << endl;
	for(short int y = 0; y < size_y; y++) {
		for(short int x = 0; x < size_x; x++) {
			short int j = y * MAX_SIZE + x;
			short int num = board_str[j];
			if(num == 0) { cout << "--"; }
			else if(num == -1) { cout << "XX"; }
			else{ cout << setfill('0') << setw(2) << hex << num; }
			if(x != size_x-1) { cout << " "; }
		}
		cout << endl;
	}
	cout << setfill(' ');
	cout << dec;
#endif
	
	SET_LOOP_3:
	for(int i = 0; i < line_num; i++) {
#pragma HLS UNROLL factor=2
#pragma HLS LOOP_TRIPCOUNT min=1 max=256
		short int t1 = terminals[i][0];
		short int t2 = terminals[i][1];
		short int t1_y = t1 / MAX_SIZE;
		short int t1_x = t1 - MAX_SIZE * t1_y;
		short int t2_y = t2 / MAX_SIZE;
		short int t2_x = t2 - MAX_SIZE * t2_y;
		unsigned short int dist = my_abs(t1_x - t2_x) + my_abs(t1_y - t2_y);
		if(dist == 1) {
			adjacents[i] = true;
#ifdef PRINT_BOARD
			cout << "Line #" << i + 1 << " needs no routing" << endl;
#endif
		}
	}
	
	// Step 1 //
	ROUTING_1:
	for(int i = 0; i < line_num; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=256
		if(adjacents[i]) continue;
#ifdef PRINT_SEARCH
		cout << "Line #" << i + 1 << endl;
#endif
		short int t1 = terminals[i][0];
		short int t2 = terminals[i][1];
		weights[t1] = 1;
		if(search(size_x, size_y, &paths_size[i], paths[i], t1, t2, weights) < 0) { return -1; }
		weights[t1] = PRIO_MAX;
	}
	
	bool has_overlap;
	short int overlap_checks[MAX_CELLS];
	short int last_target = -1;
	
	// Step 2 //
	ROUTING_2:
	for(int round = 1; round <= ROUND_LIMIT; round++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=4096
		short int target = lfsr_random() % line_num;
		if(adjacents[target]) continue;
		if(target == last_target) continue;
		last_target = target;
		unsigned short int round_weight = new_weight(round);

#ifdef PRINT_SEARCH		
		cout << "Line #" << target + 1 << "(round: " << round << ", weight: " << round_weight << ")" << endl;
#endif
		
		short int t1 = terminals[target][0];
		short int t2 = terminals[target][1];
		
		WEIGHT_UPDATE_1:
		for(short int p = 0; p < paths_size[target]; p++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
			weights[paths[target][p]] = 1;
		}
		WEIGHT_UPDATE_2:
		for(int i = 0; i < line_num; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=256
			if(i == target) continue;
			WEIGHT_UPDATE_2_1:
			for(short int p = 0; p < paths_size[i]; p++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
				weights[paths[i][p]] = round_weight;
			}
		}
		weights[t1] = 1;
		search(size_x, size_y, &paths_size[target], paths[target], t1, t2, weights);
		weights[t1] = PRIO_MAX;
		
		has_overlap = false;
		OVERLAP_CHECK_1:
		for(short int y = 0; y < size_y; y++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
			OVERLAP_CHECK_1_1:
			for(short int x = 0; x < size_x; x++) {
#pragma HLS LOOP_TRIPCOUNT min=10 max=128
				short int j = y * MAX_SIZE + x;
				overlap_checks[j] = 0;
			}
		}
		OVERLAP_CHECK_2:
		for(int i = 0; i < line_num && !has_overlap; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=256
			OVERLAP_CHECK_2_1:
			for(short int p = 0; p < paths_size[i]; p++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
				short int cell_id = paths[i][p];
				if(overlap_checks[cell_id] == 1) {
					has_overlap = true;
					break;
				}
				overlap_checks[cell_id] = 1;
			}
		}
		if(!has_overlap) break;
	}
	
	if(has_overlap) { // Cannot solve
		return -1;
	}
	
	int total_wire_length = 0;
	OUTPUT_1:
	for(short int i = 0; i < line_num; i++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=256
		total_wire_length += paths_size[i];
		OUTPUT_1_1:
		for(short int p = 0; p < paths_size[i]; p++) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
			short int cell_id = paths[i][p];
			board_str[cell_id] = i + 1;
		}
	}
	
#ifdef PRINT_BOARD
	cout << "== Print answer ==" << endl;
	for(short int y = 0; y < size_y; y++) {
		for(short int x = 0; x < size_x; x++) {
			short int j = y * MAX_SIZE + x;
			short int num = board_str[j];
			if(num == 0) { cout << "--"; }
			else if(num == -1) { cout << "XX"; }
			else{ cout << setfill('0') << setw(2) << hex << num; }
			if(x != size_x-1) { cout << " "; }
		}
		cout << endl;
	}
	cout << setfill(' ');
	cout << dec;
#endif
	
	return total_wire_length;
}

int search(short int size_x, short int size_y, short int *path_size, unsigned short int path[MAX_PATH], unsigned short int start, unsigned short int goal, unsigned short int w[MAX_CELLS]) {
	
	unsigned short int dist[MAX_CELLS];
#pragma HLS ARRAY_PARTITION variable=dist cyclic factor=16 dim=1
	unsigned short int prev[MAX_CELLS];
#pragma HLS ARRAY_PARTITION variable=prev cyclic factor=16 dim=1

	SEARCH_INIT_DIST:
	for(int j = 0; j < MAX_CELLS; j++) {
#pragma HLS UNROLL factor=32
		dist[j] = PRIO_MAX;
	}
	
	ap_uint<PQ_BIT> pq_len = 0;
    bool is_empty = true;
    unsigned int pq_nodes[MAX_PQ];
	
	short int goal_y = goal / MAX_SIZE;
	short int goal_x = goal - MAX_SIZE * goal_y;
	
	dist[start] = 0;
	enqueue(pq_nodes, 0, start, &pq_len, &is_empty);
	
	bool find_path = false;
	SEARCH_BODY:
	while(!is_empty) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=1000
		
		unsigned short int prev_cost;
		unsigned short int s;
		dequeue(pq_nodes, &prev_cost, &s, &pq_len, &is_empty);
		
		if(s == goal) {
			find_path = true;
			break;
		}
		
		unsigned short int dist_s = dist[s];
		unsigned short int cost = w[s];
		unsigned short int dist_d = dist_s + cost;
		
		short int s_y = s / MAX_SIZE;
		short int s_x = s - MAX_SIZE * s_y;

		SEARCH_ADJACENTS:
		for(int a = 0; a < 4; a++) {
#pragma HLS UNROLL
			short int d_x = s_x;
			short int d_y = s_y;
			switch(a) {
			case 0: d_x -= 1; break;
			case 1: d_x += 1; break;
			case 2: d_y -= 1; break;
			case 3: d_y += 1; break;
			}
			if(d_x >= 0 && d_x < size_x && d_y >= 0 && d_y < size_y) {
				unsigned short int d = d_y * MAX_SIZE + d_x;
				if(w[d] == PRIO_MAX && d != goal) continue;
				if(dist_d < dist[d]) {
					dist[d] = dist_d;
					prev[d] = s;
					unsigned short int new_dist = dist_d + my_abs(d_x - goal_x) + my_abs(d_y - goal_y);
					enqueue(pq_nodes, new_dist, d, &pq_len, &is_empty);
				}
			}
		}
	}
	if(!find_path){ return -1; }
	
	unsigned short int t = prev[goal];
	short int p = 0;
#ifdef PRINT_SEARCH
	cout << "Path: ";
#endif
	SEARCH_BACKTRACK:
	while(t != start) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=128
#ifdef PRINT_SEARCH
		cout << "(" << t % MAX_SIZE << ", " << t / MAX_SIZE << ")";
#endif
		path[p] = t;
		p++;
		t = prev[t];
	}
#ifdef PRINT_SEARCH
	cout << endl;
#endif
	*path_size = p;
	
	return 0;
}

// Enqueue (Insert an element)
void enqueue(unsigned int pq_nodes[MAX_PQ], unsigned short int queue_priority, unsigned short int queue_data, ap_uint<PQ_BIT> *pq_len, bool *is_empty) {
#pragma HLS INLINE

	(*pq_len)++;
	if ((*pq_len) == 0) { (*pq_len)--; } // Queue is full -> Last element is automatically removed
	// Note that last element is not always the lowest priority one. //

	ap_uint<PQ_BIT> i = (*pq_len);
	ap_uint<PQ_BIT> p = (*pq_len) >> 1; // parent node
	
	ENQUEUE_LOOP:
	while (i > 1 && (unsigned short int)(pq_nodes[p] >> DATA_BIT) >= queue_priority) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=12
		pq_nodes[i] = pq_nodes[p];
		i = p;
		p = p >> 1; // parent node
	}
	pq_nodes[i] = ((unsigned int)queue_priority << DATA_BIT) | (unsigned int)queue_data;
	*is_empty = false;
}

// Dequeue (Extract and remove the top element)
void dequeue(unsigned int pq_nodes[MAX_PQ], unsigned short int *ret_priority, unsigned short int *ret_data, ap_uint<PQ_BIT> *pq_len, bool *is_empty) {
#pragma HLS INLINE

	*ret_priority = (unsigned short int)(pq_nodes[1] >> DATA_BIT);
	*ret_data     = (unsigned short int)(pq_nodes[1] & DATA_MASK);
	
	ap_uint<PQ_BIT> i = 1; // root node
	unsigned short int last_priority = (unsigned short int)(pq_nodes[*pq_len] >> DATA_BIT); // Priority of last element

	DEQUEUE_LOOP:
	while (!(i >> (PQ_BIT-1))) {
#pragma HLS LOOP_TRIPCOUNT min=1 max=12
		ap_uint<PQ_BIT> c1 = i << 1; // child node (L)
		ap_uint<PQ_BIT> c2 = c1 + 1; // child node (R)
		if (c1 < *pq_len && (unsigned short int)(pq_nodes[c1] >> DATA_BIT) <= last_priority) {
			if (c2 < *pq_len && (unsigned short int)(pq_nodes[c2] >> DATA_BIT) <= (unsigned short int)(pq_nodes[c1] >> DATA_BIT)) {
				pq_nodes[i] = pq_nodes[c2];
				i = c2;
			}
			else {
				pq_nodes[i] = pq_nodes[c1];
				i = c1;
			}
		}
		else {
			if (c2 < *pq_len && (unsigned short int)(pq_nodes[c2] >> DATA_BIT) <= last_priority) {
				pq_nodes[i] = pq_nodes[c2];
				i = c2;
			}
			else {
				break;
			}
		}
	}
	pq_nodes[i] = pq_nodes[*pq_len];
	(*pq_len)--;
	if ((*pq_len) == 0) { *is_empty = true; }
}
