A Complete Technical Walkthrough of a DeepSeek-R1 Reproduction Project
Open R1 is a fully open-source reproduction of DeepSeek-R1 developed by the Hugging Face team. Its goal is to build the missing pieces of the R1 reasoning pipeline so that anyone can reproduce and build similar reasoning models.
The R1 model adopts an explicit reasoning pattern, making the implicit reasoning process of conventional language models visible:
Conventional model: question → [black-box reasoning] → answer
R1 model: question → <think>reasoning process</think> → answer
<think>
Step 1: Analyze the problem
- Understand what is being asked
- Identify the key information
- Decide on a solution strategy
Step 2: Make a plan
- List the formulas/methods needed
- Lay out the solution steps
- Anticipate likely difficulties
Step 3: Carry out the solution
- Compute step by step
- Verify intermediate results
- Adjust the strategy (if needed)
Step 4: Verify the answer
- Check the calculation process
- Sanity-check the answer
- Confirm the final result
</think>
Based on the analysis above, the answer to this problem is ...
Level 1: Format learning - learn to use the <think> tags
Level 2: Structured reasoning - master step-by-step thinking
Level 3: Deep reasoning - decompose and solve complex problems
Level 4: Self-verification - self-correct the reasoning process
The SFT stage uses standard supervised learning: the model is trained on a large corpus of reasoning traces to learn the R1-style reasoning pattern.
sft.py
def main(script_args, training_args, model_args):
    # 1. Environment initialization
    set_seed(training_args.seed)

    # 2. Logging setup
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    # 3. Checkpoint resumption
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)

    # 4. Data loading and processing
    dataset = get_dataset(script_args)  # supports single datasets and mixtures
    tokenizer = get_tokenizer(model_args, training_args)
    model = get_model(model_args, training_args)

    # 5. Chat format setup
    if tokenizer.chat_template is None:
        logger.info("No chat template provided, defaulting to ChatML.")
        model, tokenizer = setup_chat_format(model, tokenizer, format="chatml")

    # 6. SFT trainer initialization
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset[script_args.dataset_train_split],
        eval_dataset=(dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None),
        processing_class=tokenizer,
        peft_config=get_peft_config(model_args),  # parameter-efficient fine-tuning (e.g. LoRA)
        callbacks=get_callbacks(training_args, model_args),
    )

    # 7. Training
    train_result = trainer.train(resume_from_checkpoint=last_checkpoint)

    # 8. Model saving
    trainer.model.generation_config.eos_token_id = tokenizer.eos_token_id
    trainer.save_model(training_args.output_dir)
data.py
def get_dataset(args: ScriptArguments) -> DatasetDict:
    """Supports loading a single dataset or a dataset mixture."""
    if args.dataset_name and not args.dataset_mixture:
        # Single-dataset loading
        logger.info(f"Loading dataset: {args.dataset_name}")
        return datasets.load_dataset(args.dataset_name, args.dataset_config)
    elif args.dataset_mixture:
        # Dataset-mixture handling
        logger.info(f"Creating dataset mixture with {len(args.dataset_mixture.datasets)} datasets")
        seed = args.dataset_mixture.seed
        datasets_list = []
        for dataset_config in args.dataset_mixture.datasets:
            # Load an individual dataset
            ds = datasets.load_dataset(
                dataset_config.id,
                dataset_config.config,
                split=dataset_config.split,
            )
            # Column selection
            if dataset_config.columns is not None:
                ds = ds.select_columns(dataset_config.columns)
            # Weighted subsampling
            if dataset_config.weight is not None:
                ds = ds.shuffle(seed=seed).select(range(int(len(ds) * dataset_config.weight)))
            datasets_list.append(ds)
        # Concatenate and shuffle
        combined_dataset = concatenate_datasets(datasets_list)
        combined_dataset = combined_dataset.shuffle(seed=seed)
        return DatasetDict({"train": combined_dataset})
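For reference, each mixture entry maps onto a small set of fields. A minimal sketch of how such configuration objects might be declared in Python (the dataclass names are illustrative, not necessarily the ones Open R1 uses; the field names mirror the YAML keys shown later in the mixture example):
from dataclasses import dataclass, field

@dataclass
class MixtureDatasetConfigSketch:
    # One entry of a dataset mixture (illustrative names).
    id: str                            # Hub dataset id, e.g. "open-r1/Mixture-of-Thoughts"
    config: str | None = None          # dataset configuration name
    split: str = "train"               # split to load
    columns: list[str] | None = None   # columns to keep
    weight: float | None = None        # fraction of the dataset to sample

@dataclass
class DatasetMixtureSketch:
    datasets: list[MixtureDatasetConfigSketch] = field(default_factory=list)
    seed: int = 42                         # seed used for subsampling and the final shuffle
    test_split_size: float | None = None   # optional held-out evaluation fraction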
model_utils.py
def get_model(model_args: ModelConfig, training_args: SFTConfig | GRPOConfig):
    """Model loading with optional quantization and memory optimizations."""
    torch_dtype = (
        model_args.torch_dtype if model_args.torch_dtype in ["auto", None]
        else getattr(torch, model_args.torch_dtype)
    )
    quantization_config = get_quantization_config(model_args)
    model_kwargs = dict(
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
        attn_implementation=model_args.attn_implementation,
        torch_dtype=torch_dtype,
        use_cache=False if training_args.gradient_checkpointing else True,  # memory optimization
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
    )
    return AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        **model_kwargs,
    )
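As a usage sketch, the same helper can drive a quantized LoRA setup through TRL's ModelConfig flags; treat the exact combination below as illustrative rather than an Open R1 recipe:
from trl import ModelConfig

# Hypothetical QLoRA-style configuration using standard TRL ModelConfig fields.
model_args = ModelConfig(
    model_name_or_path="Qwen/Qwen2.5-1.5B",
    torch_dtype="bfloat16",
    attn_implementation="flash_attention_2",
    load_in_4bit=True,   # 4-bit quantization via bitsandbytes
    use_peft=True,       # train LoRA adapters instead of full weights
    lora_r=16,
    lora_alpha=32,
)
model = get_model(model_args, training_args)  # training_args as constructed in sft.py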
A system prompt is used to force the model to follow the R1 format:
system_prompt = """
You are Open-R1, a language model trained by Hugging Face to help users.
Your role as an assistant involves thoroughly exploring questions through a
systematic thinking process before providing the final precise and accurate solutions.
Please structure your response into two main sections:
<think> Thought section </think> Solution section
In the Thought section, detail your reasoning process in steps:
- analysing questions, summarizing relevant findings
- brainstorming new ideas, verifying accuracy
- refining errors, revisiting previous steps
"""
GRPO (Group Relative Policy Optimization) is a policy-gradient reinforcement learning method that uses reward functions to steer the model toward higher-quality reasoning.
# Conceptual sketch: generate several candidate answers for each prompt
for prompt in dataset:
    candidates = model.generate(
        prompt,
        num_return_sequences=4,  # generate 4 candidates
        temperature=0.8,         # control diversity
        do_sample=True,          # enable sampling
    )
    # Compute multi-dimensional rewards for each candidate
    rewards = []
    for candidate in candidates:
        reward_scores = {}
        # Format reward: check the <think> tags
        reward_scores['format'] = format_reward([candidate])
        # Accuracy reward: verify the answer against the ground truth
        reward_scores['accuracy'] = accuracy_reward([candidate], [ground_truth])
        # Reasoning-steps reward: encourage structured reasoning
        reward_scores['reasoning'] = reasoning_steps_reward([candidate])
        # Combined reward
        total_reward = sum(reward_scores.values())
        rewards.append(total_reward)
    # Optimize the policy using the rewards
    advantages = compute_advantages(rewards)
    policy_loss = compute_policy_loss(candidates, advantages)
    model.backward(policy_loss)
    model.step()
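The "group relative" part is what separates GRPO from a vanilla policy gradient: instead of a learned value baseline, each candidate's advantage is its reward normalized against the other candidates sampled for the same prompt. compute_advantages above is pseudocode; one plausible implementation of that normalization is sketched here:
import torch

def compute_group_relative_advantages(rewards: list[float], eps: float = 1e-4) -> torch.Tensor:
    """Normalize rewards within one prompt's group of sampled candidates."""
    r = torch.tensor(rewards, dtype=torch.float32)
    # Advantage = reward minus the group mean, scaled by the group standard deviation.
    return (r - r.mean()) / (r.std() + eps)

# Example: four candidates for one prompt
print(compute_group_relative_advantages([2.0, 1.0, 0.0, 1.0]))  # ≈ tensor([ 1.22,  0.00, -1.22,  0.00])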
grpo.py
def main(script_args, training_args, model_args):
    # 1. Basic setup (same as SFT: seeding, logging, checkpoint detection)
    set_seed(training_args.seed)

    # 2. Data and model loading
    dataset = get_dataset(script_args)
    tokenizer = get_tokenizer(model_args, training_args)
    model = get_model(model_args, training_args)

    # 3. Key difference: fetch the reward functions
    reward_funcs = get_reward_funcs(script_args)

    # 4. Format the data as conversations
    def make_conversation(example, prompt_column: str = script_args.dataset_prompt_column):
        prompt = []
        if training_args.system_prompt is not None:
            prompt.append({"role": "system", "content": training_args.system_prompt})
        prompt.append({"role": "user", "content": example[prompt_column]})
        return {"prompt": prompt}

    dataset = dataset.map(make_conversation)

    # 5. GRPO trainer initialization
    trainer = GRPOTrainer(
        model=model,
        reward_funcs=reward_funcs,  # key argument: the reward functions
        args=training_args,
        train_dataset=dataset[script_args.dataset_train_split],
        eval_dataset=(dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None),
        peft_config=get_peft_config(model_args),
        callbacks=get_callbacks(training_args, model_args),
        processing_class=tokenizer,
    )

    # 6. Reinforcement-learning training (last_checkpoint is resolved as in sft.py)
    train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
rewards.py
def get_reward_funcs(script_args) -> list[Callable]:
    """Registry of the available reward functions."""
    REWARD_FUNCS_REGISTRY = {
        "accuracy": accuracy_reward,
        "format": format_reward,
        "reasoning_steps": reasoning_steps_reward,
        "cosine": get_cosine_scaled_reward(...),
        "repetition_penalty": get_repetition_penalty_reward(...),
        "length": len_reward,
        "code": code_reward,
        "binary_code": binary_code_reward,
        "ioi_code": ioi_code_reward,
        "cf_code": cf_code_reward,
        "code_format": get_code_format_reward(...),
        "tag_count": tag_count_reward,
        "soft_overlong_punishment": get_soft_overlong_punishment(...),
    }
    # Select reward functions according to the configuration
    reward_funcs = [REWARD_FUNCS_REGISTRY[func] for func in script_args.reward_funcs]
    return reward_funcs
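All registered reward functions share one calling convention: they receive the batch of completions (plus any dataset columns forwarded as keyword arguments) and return one float per completion. A custom reward that could be dropped into such a registry might look like this illustrative sketch (not part of Open R1):
def brevity_reward(completions, **kwargs):
    """Illustrative reward: prefer solutions that are not excessively long."""
    contents = [completion[0]["content"] for completion in completions]
    # Full score below 2000 characters, decaying linearly to 0 at 10000 characters.
    return [max(0.0, 1.0 - max(0, len(c) - 2000) / 8000) for c in contents]

# Hypothetical registration alongside the built-in rewards:
# REWARD_FUNCS_REGISTRY["brevity"] = brevity_reward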
Open R1 implements a modular reward-function system in which multiple reward types can be combined.
def format_reward(completions, **kwargs):
    """Check that the completion follows the full R1 format."""
    pattern = r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>$"
    completion_contents = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completion_contents]
    return [1.0 if match else 0.0 for match in matches]
Purpose: ensure the model's output conforms to the standard R1 reasoning format.
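For instance, calling it on a single well-formed completion should yield 1.0 (the completion text is made up for illustration):
sample = [[{"content": "<think>\nFirst, isolate x: 2x = 8, so x = 4.\n</think>\n<answer>\nx = 4\n</answer>"}]]
print(format_reward(sample))  # expected: [1.0]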
def accuracy_reward(completions: list[list[dict[str, str]]], solution: list[str], **kwargs):
    """Verify that the answer is correct."""
    contents = [completion[0]["content"] for completion in completions]
    rewards = []
    for content, sol in zip(contents, solution):
        # Parse the gold answer
        gold_parsed = parse(
            sol,
            extraction_mode="first_match",
            extraction_config=[LatexExtractionConfig()],
        )
        # Parse the model's answer
        answer_parsed = parse(
            content,
            extraction_config=[LatexExtractionConfig(...)],
            extraction_mode="first_match",
        )
        # Verify correctness
        is_correct = verify(answer_parsed, gold_parsed)
        rewards.append(1.0 if is_correct else 0.0)
    return rewards
Purpose: ensure answer correctness through mathematical verification.
def reasoning_steps_reward(completions, **kwargs):
    """Encourage a structured reasoning process."""
    pattern = r"(Step \d+:|^\d+\.|\n-|\n\*|First,|Second,|Next,|Finally,)"
    completion_contents = [completion[0]["content"] for completion in completions]
    matches = [len(re.findall(pattern, content)) for content in completion_contents]
    # Encourage at least 3 reasoning steps
    return [min(1.0, count / 3) for count in matches]
Purpose: encourage the model to reason in detailed, explicit steps.
def code_reward(completions, verification_info, **kwargs):
    """Verify programming problems by executing the code (simplified sketch of the sandboxed flow)."""
    codes = extract_code_from_completions(completions)
    execution_results = []
    for code in codes:
        # Execute the code in a secure sandbox
        result = execute_in_sandbox(
            code,
            verification_info["test_cases"],
            provider_type=kwargs.get("provider_type", "e2b"),
        )
        execution_results.append(result)
    # Reward is based on the test-case pass rate
    rewards = [result.pass_rate for result in execution_results]
    return rewards
Purpose: verify code correctness by actually executing it.
def get_repetition_penalty_reward(ngram_size: int, max_penalty: float, language: str = "en"):
    """N-gram repetition penalty."""
    def repetition_penalty_reward(completions, **kwargs):
        contents = [completion[0]["content"] for completion in completions]
        penalties = []
        for content in contents:
            # Measure n-gram repetition
            ngrams = extract_ngrams(content, ngram_size)
            repetition_score = calculate_repetition(ngrams)
            # Apply the penalty (non-positive; treats max_penalty as a positive magnitude here)
            penalty = min(0.0, -repetition_score * max_penalty)
            penalties.append(penalty)
        return penalties
    return repetition_penalty_reward
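extract_ngrams and calculate_repetition are left undefined in the snippet above; a minimal sketch of what they could look like (one reasonable definition, not necessarily Open R1's) is:
def extract_ngrams(text: str, n: int) -> list[tuple[str, ...]]:
    """Split on whitespace and return all consecutive n-grams."""
    words = text.lower().split()
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

def calculate_repetition(ngrams: list[tuple[str, ...]]) -> float:
    """Fraction of n-grams that are duplicates: 0.0 = none repeated, 1.0 = all repeated."""
    if not ngrams:
        return 0.0
    return 1.0 - len(set(ngrams)) / len(ngrams)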
# A typical combination of reward functions
reward_funcs = [
    "format",              # ensure correct formatting
    "accuracy",            # ensure correct answers
    "reasoning_steps",     # encourage detailed reasoning
    "repetition_penalty",  # avoid repetitive content
]
# Weighted combination
final_reward = (
    0.3 * format_score +
    0.4 * accuracy_score +
    0.2 * reasoning_score +
    0.1 * repetition_penalty
)
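In practice the weighting does not have to be done by hand: TRL's GRPOConfig exposes a reward_weights field that is applied positionally to the configured reward functions. A hedged sketch of that wiring:
from trl import GRPOConfig

# Weights line up positionally with
# reward_funcs = ["format", "accuracy", "reasoning_steps", "repetition_penalty"]
training_args = GRPOConfig(
    output_dir="data/grpo-demo",          # hypothetical output directory
    reward_weights=[0.3, 0.4, 0.2, 0.1],  # mirrors the manual weighted sum above
)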
# recipes/OpenR1-Distill-7B/sft/config_distill.yaml
# Model arguments
model_name_or_path: open-r1/Qwen2.5-Math-7B-RoPE-300k
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data / training arguments
chat_template: |
You are Open-R1, a language model trained by Hugging Face to help users.
Your role as an assistant involves thoroughly exploring questions through a
systematic thinking process before providing the final precise and accurate solutions.
Please structure your response into two main sections:
<think> Thought section </think> Solution section
dataset_name: open-r1/Mixture-of-Thoughts
dataset_config: all
dataset_num_proc: 12
eos_token: <|im_end|>
# SFT trainer configuration
bf16: true
do_eval: false
eval_strategy: 'no'
gradient_accumulation_steps: 8
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: OpenR1-Distill-7B
hub_strategy: every_save
learning_rate: 4.0e-05
log_level: info
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
  min_lr_rate: 0.1
packing: false
max_grad_norm: 0.2
max_length: 32768
max_steps: -1
num_train_epochs: 5
output_dir: data/OpenR1-Distill-7B
overwrite_output_dir: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 2
push_to_hub: true
report_to:
- wandb
save_strategy: epoch
save_total_limit: 1
seed: 42
use_liger_kernel: true
warmup_ratio: 0.03
# recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data / training arguments
dataset_name: open-r1/OpenR1-Math-220k
dataset_prompt_column: problem
# GRPO-specific configuration
reward_funcs: ["format", "accuracy", "reasoning_steps"]
vllm_mode: colocate
vllm_device_map: auto
# Reward function parameters
cosine_min_value_wrong: -1.0
cosine_max_value_wrong: -0.5
cosine_min_value_correct: 0.5
cosine_max_value_correct: 1.0
cosine_max_len: 1000
repetition_n_grams: 4
repetition_max_penalty: -0.5
# Training arguments
learning_rate: 5.0e-07
num_train_epochs: 1
per_device_train_batch_size: 1
gradient_accumulation_steps: 32
logging_steps: 1
save_strategy: steps
save_steps: 100
warmup_ratio: 0.1
bf16: true
remove_unused_columns: false
gradient_checkpointing: true
# Example dataset-mixture configuration
dataset_mixture:
  datasets:
    - id: open-r1/Mixture-of-Thoughts
      config: math
      split: train
      columns: [problem, solution]
      weight: 0.4
    - id: open-r1/OpenR1-Math-220k
      config: default
      split: train
      columns: [problem, solution]
      weight: 0.3
    - id: open-r1/codeforces-cots
      config: default
      split: train
      columns: [problem, solution]
      weight: 0.3
  seed: 42
  test_split_size: 0.1
# Check the CUDA version
nvcc --version  # must be 12.4
# Check GPU resources
nvidia-smi
# Create the Python environment
uv venv openr1 --python 3.11
source openr1/bin/activate
# Install core dependencies
uv pip install vllm==0.8.5.post1
uv pip install setuptools && uv pip install flash-attn --no-build-isolation
# Install Open R1
GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]"
# Log in to your accounts
huggingface-cli login
wandb login
# Check Git LFS
git-lfs --version
# Test the environment
python -c "import torch; print(torch.cuda.is_available())"
Small-model training (single GPU):
accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml \
src/open_r1/sft.py \
--model_name_or_path Qwen/Qwen2.5-1.5B \
--dataset_name open-r1/Mixture-of-Thoughts \
--dataset_config math \
--eos_token '<|im_end|>' \
--learning_rate 4.0e-5 \
--num_train_epochs 3 \
--max_seq_length 16384 \
--per_device_train_batch_size 4 \
--gradient_checkpointing \
--bf16 \
--use_liger_kernel \
--output_dir data/OpenR1-Distill-1.5B
7B model training (8×H100):
accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml \
src/open_r1/sft.py \
--config recipes/OpenR1-Distill-7B/sft/config_distill.yaml
Training on a custom dataset:
accelerate launch --config_file=recipes/accelerate_configs/zero3.yaml \
src/open_r1/sft.py \
--model_name_or_path your-base-model \
--dataset_name your-username/your-dataset \
--dataset_config your_config \
--chat_template "$(cat your_chat_template.jinja)" \
--eos_token '<|your_eos|>' \
--output_dir data/your-custom-model
Single-node training:
# Start the vLLM server
CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-1.5B-Instruct
# Launch GRPO training
CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 ACCELERATE_LOG_LEVEL=info \
accelerate launch --config_file recipes/accelerate_configs/zero2.yaml --num_processes=7 \
src/open_r1/grpo.py --config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo.yaml
Multi-node training:
sbatch --nodes=2 slurm/train.slurm \
--model Qwen2.5-1.5B-Instruct \
--task grpo \
--config demo \
--accelerator zero2 \
--dp 8 --tp 1
# Install code-execution dependencies
uv pip install -e '.[code]'
# Configure API keys
echo "E2B_API_KEY=your_e2b_key" > .env
echo "MORPH_API_KEY=your_morph_key" >> .env
# Train with the E2B sandbox
accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \
src/open_r1/grpo.py \
--config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml \
--code_provider e2b
# Train with the Morph sandbox
accelerate launch --config_file recipes/accelerate_configs/zero2.yaml \
src/open_r1/grpo.py \
--config recipes/Qwen2.5-1.5B-Instruct/grpo/config_demo_code.yaml \
--code_provider morph
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false  # use the non-reentrant checkpointing implementation
bf16: true  # use bfloat16
fp16: false  # avoid fp16 numerical instability
use_liger_kernel: true  # enable the optimized Liger CUDA kernels
packing: true  # pack multiple short sequences into one example
max_length: 32768  # adjust to fit GPU memory
# Multi-GPU data parallelism
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
accelerate launch --config_file recipes/accelerate_configs/ddp.yaml
# Tensor parallelism for large models
NUM_GPUS=8
MODEL_ARGS="tensor_parallel_size=$NUM_GPUS"
# DeepSpeed ZeRO configuration
zero_stage: 3
offload_optimizer:
  device: cpu
offload_param:
  device: cpu
# recipes/accelerate_configs/zero3.yaml
compute_environment: LOCAL_MACHINE
distributed_type: DEEPSPEED
deepspeed_config:
  zero_stage: 3
  offload_optimizer_device: cpu
  offload_param_device: cpu
  zero3_init_flag: true
  zero3_save_16bit_model: true
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
vllm_mode: colocate  # co-located (single-node) mode
vllm_device_map: auto  # automatic device mapping
vllm_max_model_len: 32768  # maximum sequence length
vllm_gpu_memory_utilization: 0.8  # GPU memory utilization
# AIME 2024 mathematical reasoning
MODEL=your-model-path
lighteval vllm "model_name=$MODEL,dtype=bfloat16" "lighteval|aime24|0|0" \
--use-chat-template \
--output-dir data/evals/$MODEL
# MATH-500 math problems
lighteval vllm "model_name=$MODEL,dtype=bfloat16" "lighteval|math_500|0|0" \
--use-chat-template \
--output-dir data/evals/$MODEL
# GPQA Diamond science reasoning
lighteval vllm "model_name=$MODEL,dtype=bfloat16" "lighteval|gpqa:diamond|0|0" \
--use-chat-template \
--output-dir data/evals/$MODEL
# LiveCodeBench code generation
lighteval vllm "model_name=$MODEL,dtype=bfloat16" "extended|lcb:codegeneration|0|0" \
--use-chat-template \
--output-dir data/evals/$MODEL
# Batch evaluation with the helper script
python scripts/run_benchmarks.py \
--model-id your-model-id \
--benchmarks aime24,math_500,gpqa,lcb
# Data-parallel evaluation
NUM_GPUS=8
MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,data_parallel_size=$NUM_GPUS"
# Tensor-parallel evaluation (large models)
MODEL_ARGS="model_name=$MODEL,dtype=bfloat16,tensor_parallel_size=$NUM_GPUS"
# Serving with vLLM
vllm serve your-model-path \
--host 0.0.0.0 \
--port 8000 \
--tensor-parallel-size 4 \
--gpu-memory-utilization 0.8
import openai

client = openai.OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)
completion = client.chat.completions.create(
    model="your-model-path",
    messages=[
        {"role": "user", "content": "Solve: 2x + 3 = 11"}
    ],
    temperature=0.6,
    max_tokens=2048,
)
print(completion.choices[0].message.content)
# Push to the Hub
trainer.push_to_hub(
    repo_id="your-username/your-model-name",
    tags=["open-r1", "reasoning"],
    model_description="R1-style reasoning model",
)
Phase 1: small-model validation (1.5B) → confirm the pipeline is correct
Phase 2: mid-size model training (7B) → establish baseline performance
Phase 3: large-model optimization (32B+) → reach the best results
- Quality over quantity
- Balance data across domains
- Decontaminate data regularly
- Validate data quality
# Learning-rate strategy
learning_rate_schedule = {
    "SFT": 4e-5,   # supervised fine-tuning
    "GRPO": 5e-7,  # reinforcement learning (much smaller)
}
# Batch-size strategy
batch_size_strategy = {
    "1.5B": {"batch_size": 8, "grad_accum": 4},
    "7B": {"batch_size": 2, "grad_accum": 8},
    "32B": {"batch_size": 1, "grad_accum": 16},
}
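These pairs trade per-device batch size against gradient accumulation so the effective (global) batch size stays in a similar range; a quick sanity check of the arithmetic (the GPU counts are examples only):
def effective_batch_size(per_device: int, grad_accum: int, num_gpus: int) -> int:
    # global batch = per-device batch x gradient-accumulation steps x data-parallel GPUs
    return per_device * grad_accum * num_gpus

print(effective_batch_size(8, 4, 1))   # 1.5B on one GPU  -> 32
print(effective_batch_size(2, 8, 8))   # 7B on 8 GPUs     -> 128
print(effective_batch_size(1, 16, 8))  # 32B on 8 GPUs    -> 128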
# Solution: reduce memory usage
gradient_checkpointing: true
per_device_train_batch_size: 1
gradient_accumulation_steps: 16
use_liger_kernel: true
# Solution: stabilize training
max_grad_norm: 0.2  # gradient clipping
warmup_ratio: 0.03  # increase warmup
lr_scheduler_type: cosine_with_min_lr
# Reward-function weight adjustment
reward_weights = {
    "format": 0.4,             # increase the format weight
    "accuracy": 0.3,
    "reasoning_steps": 0.2,
    "repetition_penalty": 0.1,
}
# Key metrics to monitor
metrics_to_monitor = [
    "train_loss",            # training loss
    "eval_loss",             # validation loss
    "learning_rate",         # learning rate
    "grad_norm",             # gradient norm
    "format_reward_mean",    # mean format reward
    "accuracy_reward_mean",  # mean accuracy reward
    "gpu_memory_usage",      # GPU memory usage
]
# Periodically spot-check generation quality
def monitor_generation_quality(model, test_prompts):
    for prompt in test_prompts:
        response = model.generate(prompt)
        # Check the format
        has_think_tags = "<think>" in response and "</think>" in response
        # Check reasoning depth
        reasoning_steps = count_reasoning_steps(response)
        # Log the metrics
        log_metrics({
            "has_think_tags": has_think_tags,
            "reasoning_steps": reasoning_steps,
            "response_length": len(response),
        })
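count_reasoning_steps and log_metrics are placeholders in the snippet above; minimal stand-ins might look like this (the logging backend is an assumption, routed to wandb when a run is active):
import re

def count_reasoning_steps(text: str) -> int:
    """Count step markers, mirroring the reasoning_steps_reward pattern."""
    pattern = r"(Step \d+:|^\d+\.|\n-|\n\*|First,|Second,|Next,|Finally,)"
    return len(re.findall(pattern, text, re.MULTILINE))

def log_metrics(metrics: dict) -> None:
    """Log to Weights & Biases if a run is active, otherwise print."""
    try:
        import wandb
        if wandb.run is not None:
            wandb.log(metrics)
            return
    except ImportError:
        pass
    print(metrics)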
| Model | AIME 2024 | MATH-500 | GPQA Diamond | LiveCodeBench |
|---|---|---|---|---|
| OpenR1-Distill-7B | 52.7 | 89.0 | 52.8 | 39.4 |
| DeepSeek-R1-7B | 51.3 | 93.5 | 52.4 | 37.4 |
| OpenR1-Distill-1.5B | 30.7 | 83.1 | 35.8 | 16.1 |
| DeepSeek-R1-1.5B | 28.9 | 83.9 | 33.8 | 16.9 |
| Model size | GPU setup | Training time | Memory requirement |
|---|---|---|---|
| 1.5B | 1×RTX 4090 | 4-8 hours | 24 GB |
| 7B | 8×H100 40GB | 12-24 hours | 320 GB |
| 32B | 8×H100 80GB | 2-3 days | 640 GB |