驱动部分 3 对比 Cuda实现矩阵加法
PCIe SerDes 全流程实战
驱动部分——对比CUDA实现矩阵加法
1 代码和结果讲解
- 驱动层的核心流程可以概括为“定位 BAR → 映射 MMIO → 批量搬运 → 校验与抽样输出”。程序首先通过 PCI 配置空间读取 BAR0 的物理地址并用内核提供的物理内存映射接口将 64 KiB 空间映射到用户态,从而获得对 PCIe 寄存器的直接访问权。
- 随后构造 1024 组随机 3 × 3 uint8 矩阵(A、B),在本地 CPU 得到期望结果后,按 16 B 对齐规则循环 1024 次:先把 A、B 两块各 9 B 数据写入 DATA_A_BASE 与 DATA_B_BASE,然后将启动位写入 CTRL_REG;硬件一个时钟周期后把 RESULT_BASE 的 16 B 读回,其中前 9 B 即为加法结果。
- 所有结果写入 fpgaR 缓冲区,与 CPU 参考值逐元素比较,若完全一致则打印 “ALL PASS”,否则立即报告首个失配元素位置并停止。为方便人工审阅,驱动每处理 256 张矩阵就抽样一张,通过“CPU | FPGA/GPGPU”并排打印三行 3 × 3 数字。
- 输出格式与 CUDA 版本保持一致,便于横向比对。整个过程不依赖内核线程或 DMA,靠纯粹的 MMIO 读写即可完成端到端链路验证,验证了 PCIe SerDes 通道的可靠性。
2 矩阵加法演示结果
以下是Redox OS + FPGA/GPGPU执行后的结果
root:~# rustpython vec_add_3x3_batch.py
[INFO] 16:52:00.503 BAR0 phys=0xFC810000 map ok
🎉 ALL PASS – FPGA 结果与 CPU 完全一致
===== Samples every 256 matrices =====
--- Matrix #0 (CPU | FPGA/GPGPU)---
13 77 28 | 13 77 28
100 106 52 | 100 106 52
53 237 157 | 53 237 157
--- Matrix #256 (CPU | FPGA/GPGPU)---
87 119 191 | 87 119 191
1 228 175 | 1 228 175
35 126 6 | 35 126 6
--- Matrix #512 (CPU | FPGA/GPGPU)---
9 62 221 | 9 62 221
12 105 231 | 12 105 231
85 80 33 | 85 80 33
--- Matrix #768 (CPU | FPGA/GPGPU)---
231 14 155 | 231 14 155
110 107 117 | 110 107 117
218 180 139 | 218 180 139
以下是RTX3060 CUDA执行后的结果
PS D:\project\cuda_test> nvcc -arch=sm_86 -o test3x3_addition.exe test3x3_addition.cu; .\test3x3_addition.exe
test3x3_addition.cu
tmpxft_00039a50_00000000-10_test3x3_addition.cudafe1.cpp
正在创建库 test3x3_addition.lib 和对象 test3x3_addition.exp
Success! All GPU results match CPU reference.
Matrix #0 (CPU | GPU)
76 66 77 | 76 66 77
132 226 58 | 132 226 58
172 212 89 | 172 212 89
Matrix #256 (CPU | GPU)
26 213 213 | 26 213 213
167 127 42 | 167 127 42
126 141 57 | 126 141 57
Matrix #512 (CPU | GPU)
71 199 188 | 71 199 188
42 126 122 | 42 126 122
174 165 120 | 174 165 120
Matrix #768 (CPU | GPU)
211 26 3 | 211 26 3
14 220 42 | 14 220 42
64 31 24 | 64 31 24
3 硬件设备
本文使用RTX3060作为CUDA计算设备
> nvidia-smi.exe
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94 Driver Version: 560.94 CUDA Version: 12.6 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 3060 WDDM | 00000000:0A:00.0 On | N/A |
| 0% 46C P8 15W / 170W | 4914MiB / 12288MiB | 25% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
4 Redox OS rustpython代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
vec_add_3x3_batch.py
--------------------
• 1 k × (3×3) uint8 Vec-Add,硬件一次 ≤16 B,循环 1024 次完成
• 支持 --simulate 仅用 CPU 模拟 FPGA 计算
• 每 256 张矩阵抽样打印 4 组,格式:CPU | FPGA
"""
import os
import sys
import mmap
import struct
import argparse
import random
from datetime import datetime
# ----------------------------------------------------------------------
# ★ 环境常量(按需修改)
# ----------------------------------------------------------------------
PCI_CH_PATH = "/scheme/pci/00-04--00.0/channel"
BAR_INDEX = 0
BAR_BYTES = 64 * 1024
PHY_MEM_PATH = "/scheme/memory/physical@uc"
CTRL_REG = 0x00
DATA_A_BASE = 0x10
DATA_B_BASE = 0x20
RESULT_BASE = 0x30
CTRL_START_MASK = 0x80
# ----------------------------------------------------------------------
# ★ 工具函数
# ----------------------------------------------------------------------
ts = lambda: datetime.now().strftime("%H:%M:%S.%f")[:-3]
def get_bar_phys(pci_ch_path: str, bar_index: int = 0) -> int:
with open(pci_ch_path, "r+b", buffering=0) as ch:
ch.write(struct.pack("<IH", 7, 0x10 + bar_index * 4))
size = struct.unpack("<Q", ch.read(8))[0]
_, bar_val = struct.unpack("<II", ch.read(size))
return bar_val & 0xFFFFFFF0
def map_bar(bar_phys: int, length: int) -> mmap.mmap:
fd = os.open(PHY_MEM_PATH, os.O_RDWR)
return mmap.mmap(fd, length, mmap.MAP_SHARED,
mmap.PROT_READ | mmap.PROT_WRITE,
offset=bar_phys)
def wr(mm: mmap.mmap, off: int, data: bytes): # 写任意长度
mm[off:off+len(data)] = data
def rd(mm: mmap.mmap, off: int, n: int) -> bytes:
return mm[off:off+n]
# ----------------------------------------------------------------------
# ★ 主流程
# ----------------------------------------------------------------------
def main(argv=None):
parser = argparse.ArgumentParser(description="1k×3×3 Vec-Add 批量测试")
parser.add_argument("-q", "--quiet", action="store_true",
help="仅打印 PASS / FAIL 摘要")
parser.add_argument("--simulate", action="store_true",
help="不用 FPGA,完全在 CPU 上模拟")
args = parser.parse_args(argv)
NUM_MATS = 1024 # 1 k
MAT_ELEMS = 9 # 3×3
CHUNK_BYTES = 16 # 单次 ≤16 字节
TOTAL_ELEMS = NUM_MATS * MAT_ELEMS
# ---------- 生成随机输入 ----------
vecA = [random.randint(0, 255) for _ in range(TOTAL_ELEMS)]
vecB = [random.randint(0, 255) for _ in range(TOTAL_ELEMS)]
# ---------- CPU 参考 ----------
expR = [(a + b) & 0xFF for a, b in zip(vecA, vecB)]
# ---------- 打开 / 映射 BAR ----------
if not args.simulate:
try:
bar_phys = get_bar_phys(PCI_CH_PATH, BAR_INDEX)
bar_mm = map_bar(bar_phys, BAR_BYTES)
if not args.quiet:
print(f"[INFO] {ts()} BAR0 phys=0x{bar_phys:08X} map ok")
except Exception as e:
print(f"[FATAL] mmap BAR 失败:{e}")
return 2
else:
bar_mm = None # 占位
# ---------- FPGA 计算 ----------
fpgaR = [0] * TOTAL_ELEMS
for idx in range(NUM_MATS): # 共 1024 次
base = idx * MAT_ELEMS
a_slice = bytes(vecA[base:base + MAT_ELEMS] + [0]*(CHUNK_BYTES - MAT_ELEMS))
b_slice = bytes(vecB[base:base + MAT_ELEMS] + [0]*(CHUNK_BYTES - MAT_ELEMS))
if not args.simulate:
# 写 A/B block
wr(bar_mm, DATA_A_BASE, a_slice)
wr(bar_mm, DATA_B_BASE, b_slice)
# 发 start
wr(bar_mm, CTRL_REG, struct.pack("<B", CTRL_START_MASK))
# 👉 若 RTL 有 BUSY 位,可轮询;此处假设 1 周期完成
# 读结果 block
res_slice = rd(bar_mm, RESULT_BASE, CHUNK_BYTES)
else:
# 纯 CPU 模拟
res_sim = [(a + b) & 0xFF for a, b in zip(a_slice, b_slice)]
res_slice = bytes(res_sim)
# 取前 9 个字节写回结果数组
fpgaR[base:base+MAT_ELEMS] = list(res_slice[:MAT_ELEMS])
# ---------- 校验 ----------
first_bad = -1
for i, (exp, got) in enumerate(zip(expR, fpgaR)):
if exp != got:
first_bad = i
break
if first_bad == -1:
print("🎉 ALL PASS – FPGA 结果与 CPU 完全一致")
else:
mat_id, elem_id = divmod(first_bad, MAT_ELEMS)
print(f"❌ Mismatch at matrix #{mat_id}, element {elem_id}: "
f"exp {expR[first_bad]}, got {fpgaR[first_bad]}")
# ---------- 抽样打印 ----------
if not args.quiet:
print("\n===== Samples every 256 matrices =====\n")
for sample in range(4):
midx = sample * 256
base = midx * MAT_ELEMS
print(f"--- Matrix #{midx} (CPU | FPGA/GPGPU)---")
for r in range(3):
cpu_row = " ".join(f"{expR[base + r*3 + c]:3d}" for c in range(3))
fpga_row = " ".join(f"{fpgaR[base + r*3 + c]:3d}" for c in range(3))
print(f"{cpu_row} | {fpga_row}")
print()
if __name__ == "__main__":
sys.exit(main())
5 用于对比的CUDA C代码
// test3x3_addition.cu
#include <cstdio>
#include <cstdint>
#include <cuda_runtime.h>
#define CUDA_CHECK(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while (0)
// ---------- 配置 ----------
constexpr int NUM_THREADS = 1024; // 1K 线程
constexpr int MAT_ELEMS = 9; // 3×3
using u8 = uint8_t;
// ---------------------------
// GPU kernel:每线程完成一组 3×3 无符号 8bit 加法
__global__ void add3x3_kernel(const u8* A, const u8* B, u8* C) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= NUM_THREADS) return;
int base = tid * MAT_ELEMS;
#pragma unroll
for (int i = 0; i < MAT_ELEMS; ++i) {
C[base + i] = A[base + i] + B[base + i]; // 溢出自动按模 256
}
}
// CPU 参考实现
void add3x3_cpu(const u8* A, const u8* B, u8* C) {
for (int tid = 0; tid < NUM_THREADS; ++tid) {
int base = tid * MAT_ELEMS;
for (int i = 0; i < MAT_ELEMS; ++i) {
C[base + i] = A[base + i] + B[base + i];
}
}
}
// ********* NEW: 打印 CPU/GPU 对照的 3×3 矩阵 *********
void print_matrix_pair(const u8* cpu, const u8* gpu, int idx) {
printf("Matrix #%d (CPU | GPU)\n", idx);
const u8* c_ptr = cpu + idx * MAT_ELEMS;
const u8* g_ptr = gpu + idx * MAT_ELEMS;
for (int r = 0; r < 3; ++r) {
for (int c = 0; c < 3; ++c)
printf("%3u ", c_ptr[r * 3 + c]);
printf(" | ");
for (int c = 0; c < 3; ++c)
printf("%3u ", g_ptr[r * 3 + c]);
printf("\n");
}
printf("\n");
}
// ********************************************************
int main() {
size_t bytes = NUM_THREADS * MAT_ELEMS * sizeof(u8);
// 主机端分配与初始化
u8 *hA = (u8*)malloc(bytes);
u8 *hB = (u8*)malloc(bytes);
u8 *hC_gpu = (u8*)malloc(bytes);
u8 *hC_cpu = (u8*)malloc(bytes);
for (size_t i = 0; i < NUM_THREADS * MAT_ELEMS; ++i) {
hA[i] = static_cast<u8>(rand() % 256);
hB[i] = static_cast<u8>(rand() % 256);
}
// 设备内存
u8 *dA, *dB, *dC;
CUDA_CHECK(cudaMalloc(&dA, bytes));
CUDA_CHECK(cudaMalloc(&dB, bytes));
CUDA_CHECK(cudaMalloc(&dC, bytes));
CUDA_CHECK(cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice));
// Launch kernel(1 block × 1024 threads)
add3x3_kernel<<<1, NUM_THREADS>>>(dA, dB, dC);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaMemcpy(hC_gpu, dC, bytes, cudaMemcpyDeviceToHost));
// CPU 参考
add3x3_cpu(hA, hB, hC_cpu);
// 逐元素比对
int mismatches = 0;
for (size_t i = 0; i < NUM_THREADS * MAT_ELEMS; ++i) {
if (hC_gpu[i] != hC_cpu[i]) {
++mismatches;
if (mismatches <= 5) {
printf("Mismatch at idx %zu: GPU %u CPU %u\n",
i, hC_gpu[i], hC_cpu[i]);
}
}
}
printf("%s\n", mismatches ? "Found mismatches." :
"Success! All GPU results match CPU reference.");
// ********* NEW: 抽取并打印 4 个示例矩阵 *********
const int sample_step = 256;
for (int s = 0; s < 4; ++s) {
int idx = s * sample_step;
if (idx < NUM_THREADS) {
print_matrix_pair(hC_cpu, hC_gpu, idx);
}
}
// ***************************************************
// 清理
CUDA_CHECK(cudaFree(dA));
CUDA_CHECK(cudaFree(dB));
CUDA_CHECK(cudaFree(dC));
free(hA); free(hB); free(hC_gpu); free(hC_cpu);
return 0;
}
6 Redox Rustpython源码修改
因为Redox OS有些系统级API未实现,需要修改部分代码跳过检验
diff --git a/stdlib/src/mmap.rs b/stdlib/src/mmap.rs
index 9319bab64..29ee120e9 100644
--- a/stdlib/src/mmap.rs
+++ b/stdlib/src/mmap.rs
@@ -350,29 +350,7 @@ mod mmap {
};
if fd != -1 {
- let metadata = fstat(fd)
- .map_err(|err| io::Error::from_raw_os_error(err as i32).to_pyexception(vm))?;
- let file_len = metadata.st_size;
- if map_size == 0 {
- if file_len == 0 {
- return Err(vm.new_value_error("cannot mmap an empty file".to_owned()));
- }
-
- if offset > file_len {
- return Err(
- vm.new_value_error("mmap offset is greater than file size".to_owned())
- );
- }
-
- map_size = (file_len - offset)
- .try_into()
- .map_err(|_| vm.new_value_error("mmap length is too large".to_owned()))?;
- } else if offset > file_len || file_len - offset < map_size as libc::off_t {
- return Err(
- vm.new_value_error("mmap length is greater than file size".to_owned())
- );
- }
}
let mut mmap_opt = MmapOptions::new();