初始化
// Ask the runtime to favor a larger shared-memory carve-out over L1 cache
// when launching reduc_rows. This is a preference, not a guarantee.
// NOTE(review): the returned cudaError_t is discarded here -- confirm whether
// errors are checked elsewhere (e.g. via cudaGetLastError()).
cudaFuncSetCacheConfig(reduc_rows, cudaFuncCachePreferShared);
// Request 8-byte-wide shared-memory banks for reduc_rows.
// NOTE(review): presumably chosen because reduc_rows works on 64-bit words in
// shared memory; reduc_rows is not visible here -- confirm. Newer CUDA toolkits
// appear to deprecate this API; verify against the toolkit version in use.
cudaFuncSetSharedMemConfig(reduc_rows, cudaSharedMemBankSizeEightByte);
/**
 * Fill the row-pointer table and the row-index table for a system stored
 * as consecutive rows of `slot_num` 64-bit words starting at `sys`.
 *
 * For every thread id `tid < eq_num` the kernel writes
 *     rows[tid]        = sys + tid * slot_num
 *     row_indices[tid] = tid
 * Threads with `tid >= eq_num` do nothing, so the launch may contain more
 * threads than rows. The kernel has no loop, so rows whose index is not
 * produced by any thread's global_tid() are left unwritten.
 *
 * Launch layout: depends on global_tid(), which is defined elsewhere.
 * NOTE(review): presumably it returns the flattened global thread index --
 * confirm against its definition.
 * Shared memory: none used.
 *
 * @param sys          base pointer of the row storage
 * @param rows         output, at least `eq_num` entries; receives the
 *                     start address of each row
 * @param row_indices  output, at least `eq_num` entries; receives the
 *                     identity mapping 0..eq_num-1
 * @param eq_num       number of rows to initialize
 * @param slot_num     number of 64-bit words per row
 */
static __global__ void
rmac_elim_init(uint64_t* const sys, uint64_t** rows, uint32_t* row_indices,
               const uint32_t eq_num, uint32_t slot_num) {
    const uint32_t tid = global_tid();
    if (tid >= eq_num) {
        return;  // surplus thread at the grid tail
    }

    // Widen before multiplying: a 32-bit product tid * slot_num would wrap
    // modulo 2^32 and make rows[tid] point at the wrong row for large systems.
    const size_t row_offset = static_cast<size_t>(tid) * slot_num;

    rows[tid] = sys + row_offset;
    row_indices[tid] = tid;
}