Computation-intensive tensor operators account for over 90% of the computation in Large Language Models (LLMs) and deep neural networks. Automatically and efficiently generating high-performance tensor operators with hardware primitives is crucial for diverse and ever-evolving hardware architectures such as RISC-V, ARM, and GPUs, since manually optimized implementations take months to develop and lack portability. LLMs excel at generating high-level language code, but they struggle to fully comprehend hardware characteristics and produce high-performance tensor operators.
We introduce QiMeng-TensorOp, a tensor-operator auto-generation framework driven by a one-line user prompt, which enables LLMs to automatically exploit hardware characteristics, generate tensor operators with hardware primitives, and tune parameters for optimal performance across diverse hardware. Experimental results on various hardware platforms, SOTA LLMs, and typical tensor operators demonstrate that QiMeng-TensorOp effectively unleashes the computing capability of various hardware platforms and automatically generates tensor operators with superior performance. Compared with vanilla LLMs, QiMeng-TensorOp achieves up to a 1291× performance improvement. Even compared with human experts, QiMeng-TensorOp reaches 251% of OpenBLAS performance on RISC-V CPUs and 124% of cuBLAS performance on NVIDIA GPUs. Additionally, QiMeng-TensorOp reduces development costs by 200× compared with human experts.
We conduct comprehensive evaluations of QiMeng-TensorOp across multiple hardware platforms, state-of-the-art LLMs, and diverse tensor operators to validate its performance and generality. Our experiments demonstrate significant improvements over baseline methods and manually optimized libraries.
Platform Evaluation: QiMeng-TensorOp is extensively tested on a wide range of hardware architectures including RISC-V CPUs (C906, C908, C910, K1), ARM CPUs (A76, A72), and NVIDIA GPUs (RTX 4060, A100 with Tensor Cores).
LLM Validation: The system's performance is validated using two leading LLMs, the proprietary GPT-4o and the open-source DeepSeek-V3, with additional testing on Claude 3.5 Sonnet and Llama-3.1-405B for ablation studies.
Benchmark Coverage: Experiments include representative tensor operators (GEMM and Conv) with dimensions drawn from popular models (Llama-7B, Llama3-70B) and classical CNNs (ResNet-50, VGG-16, U-Net).
(Figure: partial GEMM performance results.)
(Figure: convolution operator performance results.)
Below is GEMM code generated for the C910 RISC-V CPU, consisting of the C code sketch followed by the assembly kernel code:
#include "gemm.h"
#include <stdlib.h>
#include <stdio.h>

#define GEMM_R 1024 // N dimension block size
#define GEMM_Q 144  // K dimension block size
#define GEMM_P 48   // M dimension block size

// Helper function to determine optimal block size
static inline int choose_block_size(int remaining, int block_size) {
    return (remaining < block_size) ? remaining : block_size;
}

void my_cblas_sgemm(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
                    const CBLAS_TRANSPOSE TransB, const int M, const int N,
                    const int K, const float alpha, const float *A,
                    const int lda, const float *B, const int ldb,
                    const float beta, float *C, const int ldc) {
    // Handle layout and transpose parameters
    // (assuming CblasRowMajor and CblasNoTrans for this implementation)

    // Dynamically allocate memory for packed matrices
    float *sa = (float *)malloc(GEMM_P * GEMM_Q * sizeof(float));
    float *sb = (float *)malloc(GEMM_Q * GEMM_R * sizeof(float));

    // Check for memory allocation failure
    if (sa == NULL || sb == NULL) {
        fprintf(stderr, "Error: Memory allocation failed for packed matrices\n");
        if (sa) free(sa);
        if (sb) free(sb);
        return;
    }

    // Beta scaling: Scale the output matrix C by beta
    BETA_OPERATION(0, M, 0, N, beta, C, ldc);

    // Main computation loop following NKM cycle order for cache optimization
    for (int js = 0; js < N; js += GEMM_R) {
        int min_j = choose_block_size(N - js, GEMM_R);
        for (int ls = 0; ls < K; ls += GEMM_Q) {
            int min_l = choose_block_size(K - ls, GEMM_Q);
            // Pack submatrix of B into buffer sb for better cache locality
            OCOPY_OPERATION(min_l, min_j, B, ldb, ls, js, sb);
            for (int is = 0; is < M; is += GEMM_P) {
                int min_i = choose_block_size(M - is, GEMM_P);
                // Pack submatrix of A into buffer sa
                ICOPY_OPERATION(min_l, min_i, A, lda, ls, is, sa);
                // Perform matrix multiplication on packed submatrices
                // This calls the optimized RISC-V vector kernel
                KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, C, ldc, is, js);
            }
        }
    }

    // Clean up dynamically allocated memory
    free(sa);
    free(sb);
}
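For concreteness, a minimal usage sketch of the generated driver follows. It assumes the standard row-major, no-transpose CBLAS SGEMM convention noted in the comments above, and that the CBLAS enums and the my_cblas_sgemm prototype are declared in gemm.h; the matrix sizes, test values, and the main() harness are illustrative and not part of the generated code.

// Illustrative usage sketch (assumed harness, not part of the generated code):
// computes C = 1.0 * A * B + 0.0 * C for a 64x32 by 32x48 multiplication,
// assuming row-major storage and no transposition.
#include "gemm.h"
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    const int M = 64, N = 48, K = 32;
    float *A = (float *)malloc(sizeof(float) * M * K);
    float *B = (float *)malloc(sizeof(float) * K * N);
    float *C = (float *)calloc((size_t)M * N, sizeof(float));
    if (A == NULL || B == NULL || C == NULL) return 1;

    // Fill A and B with simple test values.
    for (int i = 0; i < M * K; ++i) A[i] = 1.0f;
    for (int i = 0; i < K * N; ++i) B[i] = 0.5f;

    // Row-major, no transpose; leading dimensions are the row widths.
    my_cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                   M, N, K, 1.0f, A, K, B, N, 0.0f, C, N);

    // Each output element should equal K * 1.0 * 0.5.
    printf("C[0][0] = %f (expected %f)\n", C[0], 0.5f * K);
    free(A); free(B); free(C);
    return 0;
}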
#include <riscv_vector.h>
#define L2_16_4 \
...
#define L2_8_4 \
".L8x4_Init: \n\t"\
"li t0, 4 \n\t"\
"vsetvli t0, t0, e32, m1 \n\t"\
"fmv.w.x ft11, zero \n\t"\
"vfmv.v.f v12, ft11 \n\t"\
"vfmv.v.f v13, ft11 \n\t"\
"vfmv.v.f v14, ft11 \n\t"\
"vfmv.v.f v15, ft11 \n\t"\
"vfmv.v.f v16, ft11 \n\t"\
"vfmv.v.f v17, ft11 \n\t"\
"vfmv.v.f v18, ft11 \n\t"\
"vfmv.v.f v19, ft11 \n\t"\
"vlw.v v0, (%[PA]) \n\t"\
"flw ft0, 0(%[PB]) \n\t"\
"addi %[PA], %[PA], 16 \n\t"\
"vfmv.v.f v4, ft0 \n\t"\
"flw ft2, 8(%[PB]) \n\t"\
"vlw.v v1, (%[PA]) \n\t"\
"flw ft1, 4(%[PB]) \n\t"\
"addi %[PA], %[PA], 16 \n\t"\
"vfmv.v.f v5, ft1 \n\t"\
"vfmv.v.f v6, ft2 \n\t"\
"flw ft3, 12(%[PB]) \n\t"\
"vfmv.v.f v7, ft3 \n\t"\
"addi %[PB], %[PB], 16 \n\t"\
" mv t0, %[BK] \n\t"\
" srli t1, t0, 1 \n\t"\
" blez t0, .L8x4_End \n\t"\
".L8x4_Main: \n\t"\
" blez t1, .L8x4_Maintail \n\t"\
".L8x4_Mainloop: \n\t"\
"vfmacc.vv v12, v0, v4 \n\t"\
"vfmacc.vv v13, v0, v5 \n\t"\
"vfmacc.vv v14, v0, v6 \n\t"\
"vfmacc.vv v15, v0, v7 \n\t"\
"vfmacc.vv v16, v1, v4 \n\t"\
"vfmacc.vv v17, v1, v5 \n\t"\
"vfmacc.vv v18, v1, v6 \n\t"\
"vfmacc.vv v19, v1, v7 \n\t"\
"vlw.v v2, (%[PA]) \n\t"\
"flw ft4, 0(%[PB]) \n\t"\
"addi %[PA], %[PA], 16 \n\t"\
"vfmv.v.f v8, ft4 \n\t"\
"vlw.v v3, (%[PA]) \n\t"\
"flw ft5, 4(%[PB]) \n\t"\
"addi %[PA], %[PA], 16 \n\t"\
...
// Inner GEMM kernel: walks the packed A (ba) and B (bb) panels and dispatches
// each tile of the bm x bn block to the hand-written RISC-V vector
// micro-kernels (the L2_<m>x<n> macros above), covering 16/8/4/2/1 rows of bm
// and 4/2/1 indices of bn per tile.
int gemm_kernel(long bm, long bn, long bk, float alpha, float *ba, float *bb,
                float *C, long ldc)
{
    long i, j, k;
    float *C0, *C1, *C2, *C3; // pointers to the C segments of the current strip, spaced ldc apart
    float *ptrba, *ptrbb;     // cursors into the packed A and B panels

    // Process the N dimension in strips of four.
    for (j = 0; j < bn / 4; j += 1) {
        C0 = C;
        C1 = C0 + ldc;
        C2 = C1 + ldc;
        C3 = C2 + ldc;
        ptrba = ba;
        // 16-row tiles along bm, then the 8/4/2/1-row remainders.
        for (i = 0; i < bm / 16; i += 1) {
            ptrbb = bb;
            asm volatile(L2_16_4);
        }
        if (bm & 8) {
            ptrbb = bb;
            asm volatile(L2_8_4);
        }
        if (bm & 4) {
            ptrbb = bb;
            asm volatile(L2_4_4);
        }
        if (bm & 2) {
            ptrbb = bb;
            asm volatile(L2_2_4);
        }
        if (bm & 1) {
            ptrbb = bb;
            asm volatile(L2_1_4);
        }
        // Advance the packed B panel and the C pointer past this strip.
        k = bk * 4;
        bb = bb + k;
        i = ldc * 4;
        C = C + i;
    }
    // Handle a remaining strip of two N indices, if any.
    if (bn & 2) {
        C0 = C;
        C1 = C0 + ldc;
        ptrba = ba;
        for (i = 0; i < bm / 16; i += 1) {
            ptrbb = bb;
            asm volatile(L2_16_2);
        }
        if (bm & 8) {
            ptrbb = bb;
            asm volatile(L2_8_2);
        }
        if (bm & 4) {
            ptrbb = bb;
            asm volatile(L2_4_2);
        }
        if (bm & 2) {
            ptrbb = bb;
            asm volatile(L2_2_2);
        }
        if (bm & 1) {
            ptrbb = bb;
            asm volatile(L2_1_2);
        }
        k = bk * 2;
        bb = bb + k;
        i = ldc * 2;
        C = C + i;
    }
    // Handle a final single N index, if any.
    if (bn & 1) {
        C0 = C;
        ptrba = ba;
        for (i = 0; i < bm / 16; i += 1) {
            ptrbb = bb;
            asm volatile(L2_16_1);
        }
        if (bm & 8) {
            ptrbb = bb;
            asm volatile(L2_8_1);
        }
        if (bm & 4) {
            ptrbb = bb;
            asm volatile(L2_4_1);
        }
        if (bm & 2) {
            ptrbb = bb;
            asm volatile(L2_2_1);
        }
        if (bm & 1) {
            ptrbb = bb;
            asm volatile(L2_1_1);
        }
        k = bk;
        bb = bb + k;
        C = C + ldc;
    }
    return 0;
}
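To make the truncated assembly macros easier to follow, below is a minimal portable-C sketch of the computation performed by the 8x4 micro-kernel (L2_8_4): an 8-element (along bm) by 4-index (along bn) tile of C is accumulated from the packed A and B panels, mirroring the eight vector accumulators v12-v19 in the assembly. The exact packing layout (8 consecutive A elements and 4 consecutive B elements per k step), the alpha handling, and the name kernel_8x4_ref are assumptions for illustration, not a verbatim part of the generated kernel.

// Portable-C sketch of the 8x4 micro-kernel (illustrative only; layout
// assumptions noted above).
// ptrba: packed A panel, 8 consecutive elements per k step.
// ptrbb: packed B panel, 4 consecutive elements per k step.
// C0..C3: the four output segments of the current strip, spaced ldc apart
// in the caller (gemm_kernel).
void kernel_8x4_ref(long bk, float alpha,
                    const float *ptrba, const float *ptrbb,
                    float *C0, float *C1, float *C2, float *C3) {
    // acc mirrors vector registers v12-v19: two 4-lane registers per
    // output segment in the assembly, i.e. an 8x4 accumulator tile.
    float acc[8][4] = {{0.0f}};
    for (long k = 0; k < bk; ++k) {
        for (int i = 0; i < 8; ++i) {
            float a = ptrba[k * 8 + i];
            acc[i][0] += a * ptrbb[k * 4 + 0];
            acc[i][1] += a * ptrbb[k * 4 + 1];
            acc[i][2] += a * ptrbb[k * 4 + 2];
            acc[i][3] += a * ptrbb[k * 4 + 3];
        }
    }
    // Write back with alpha scaling; beta scaling of C was already applied
    // by BETA_OPERATION in the driver.
    for (int i = 0; i < 8; ++i) {
        C0[i] += alpha * acc[i][0];
        C1[i] += alpha * acc[i][1];
        C2[i] += alpha * acc[i][2];
        C3[i] += alpha * acc[i][3];
    }
}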
@misc{zhang2025qimengtensoropautomaticallygeneratinghighperformance,
  title={QiMeng-TensorOp: Automatically Generating High-Performance Tensor Operators with Hardware Primitives},
  author={Xuzhi Zhang and Shaohui Peng and Qirui Zhou and Yuanbo Wen and Qi Guo and Ruizhi Chen and Xinguo Zhu and Weiqiang Xiong and Haixin Chen and Congying Ma and Ke Gao and Chen Zhao and Yanjun Wu and Yunji Chen and Ling Li},
  year={2025},
  eprint={2505.06302},
  archivePrefix={arXiv},
  primaryClass={cs.LG},
  url={https://arxiv.org/abs/2505.06302},
}