base_model:
- Qwen/Qwen2.5-Coder-7B-Instruct
library_name: transformers
license: mit
metrics:
- accuracy
pipeline_tag: text-generation
Z1: 基于代码的高效测试时扩展
训练大语言模型通过思维迁移进行推理
[📜 论文](https://arxiv.org/abs/2504.00810) •
[🤗 HF模型](https://huggingface.co/efficientscaling/Z1-7B) •
[🐱 GitHub](https://github.com/efficientscaling/Z1)
模型详情
要了解思维迁移模式的启动方法,请参考 https://github.com/efficientscaling/Z1。
评估
Gradio演示
import copy
from typing import List
from dataclasses import dataclass
import gradio as gr
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
# LaTeX box marker the model is asked to wrap its final answer in.
BOX=r"\boxed{}"
# Answer cues (in Chinese: "After careful thought, the final answer [\boxed{}] should be:")
# appended after a truncated thinking phase to force the model to conclude.
ANSWER_WITH_BOX=f"\n\n经过深入思考,最终答案{BOX}应为:\n\n"
ANSWER_WITHOUT_BOX=f"\n\n经过深入思考,最终答案应为:\n\n"
# Hugging Face model id served by the vLLM engine below.
model_name = "efficientscaling/Z1-7B"
class ThinkingLLM(LLM):
    """vLLM ``LLM`` subclass implementing two-phase "shifted thinking" generation.

    Phase 1 ("thinking") generates with a reduced token budget
    (``max_tokens_for_thinking``). Any trajectory cut off by that budget is
    re-prompted with an answer cue and completed in phase 2 using the caller's
    full ``max_tokens`` budget.

    Note: the original code decorated this class with ``@dataclass``; with no
    declared fields that generated an ``__eq__`` under which *all* instances
    compare equal, so the decorator (and the no-op ``__init__``) were removed.
    """

    def thinking_generate(self, prompts: List[str],
                          sampling_params: SamplingParams = None,
                          max_tokens_for_thinking: int = None):
        """Generate with a capped thinking phase, then finish truncated outputs.

        Args:
            prompts: batch of prompt strings.
            sampling_params: vLLM sampling parameters; ``max_tokens`` is the
                overall budget. Temporarily mutated for phase 1, then restored.
            max_tokens_for_thinking: token budget for the thinking phase.

        Returns:
            Tuple ``(new_prompts, answers)``: the re-prompted strings for
            trajectories that hit the thinking budget, and one output object
            per input prompt.

        Raises:
            ValueError: if ``sampling_params`` is None.
        """
        if sampling_params is None:
            raise ValueError("采样参数不能为空!")

        # Phase 1: run with the (smaller) thinking budget.
        all_max_tokens = sampling_params.max_tokens
        sampling_params.max_tokens = max_tokens_for_thinking
        print(f"总token数: {all_max_tokens}")
        print(f"思考token数: {max_tokens_for_thinking}")
        trajectories = self.generate(prompts, sampling_params)

        rethinking_str = ANSWER_WITHOUT_BOX
        # Restore the caller's full budget before the answer phase.
        sampling_params.max_tokens = all_max_tokens

        answers = copy.deepcopy(trajectories)
        unfinished_id = []
        thinking_token = 0
        new_prompts = []
        # `idx` instead of `id` — the original shadowed the builtin.
        for idx, traj in enumerate(trajectories):
            if traj.outputs[0].finish_reason == 'length':
                # Truncated mid-thought: append the answer cue and retry below.
                unfinished_id.append(idx)
                new_prompts.append(prompts[idx] + traj.outputs[0].text + rethinking_str)
            thinking_token += len(traj.outputs[0].token_ids)

        # Guard against division by zero on an empty batch; the original
        # computed this average and never used it — report it alongside the
        # other diagnostics instead.
        if prompts:
            avg_thinking_token = thinking_token / len(prompts)
            print(f"平均思考token数: {avg_thinking_token}")

        if new_prompts:
            print(new_prompts[0])
            # Phase 2: complete the truncated trajectories with the full budget.
            o = self.generate(
                new_prompts,
                sampling_params=sampling_params,
            )
            for i, uid in enumerate(unfinished_id):
                answers[uid] = o[i]
        return new_prompts, answers
def generate_text(prompt, max_tokens, max_tokens_for_thinking, temperature, top_p):
    """Run two-phase generation for a single prompt and return the final text.

    Args:
        prompt: the user prompt (a single string).
        max_tokens: overall generation budget.
        max_tokens_for_thinking: token budget for the thinking phase.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.

    Returns:
        The completed text; when the thinking phase was truncated, the
        re-prompted trajectory is prepended so the user sees the full chain.
    """
    sampling_params = SamplingParams(
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        skip_special_tokens=False,
    )
    # BUG FIX: thinking_generate expects a *batch* of prompts. Passing the
    # bare string made `prompts[idx]` index single characters inside it.
    trajectories, outputs = llm.thinking_generate(
        [prompt], sampling_params, max_tokens_for_thinking=max_tokens_for_thinking
    )
    final_text = outputs[0].outputs[0].text
    if trajectories:
        return trajectories[0] + '\n\n' + final_text
    return final_text
# Instantiate the vLLM engine once at module load; reused by every request.
# NOTE(review): gpu_memory_utilization=0.96 assumes a dedicated GPU — confirm
# this leaves enough headroom on shared machines.
llm = ThinkingLLM(
    model=model_name,
    tensor_parallel_size=1,
    gpu_memory_utilization=0.96,
)
# Gradio UI: prompt + sampling controls on the left, generated text on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 基于思维迁移的推理")
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="输入提示",
                placeholder="请输入内容",
                lines=5,
            )
            max_tokens_for_thinking_input = gr.Slider(
                label="思维迁移窗口大小",
                minimum=1,
                maximum=32786,  # NOTE(review): likely meant 32768 — confirm
                value=4000,
                step=1,
            )
            max_tokens_input = gr.Slider(
                label="最大token数",
                minimum=1,
                maximum=32786,  # NOTE(review): likely meant 32768 — confirm
                value=32786,
                step=1,
            )
            temperature_input = gr.Slider(
                label="温度参数",
                minimum=0.0,  # was the literal `00`; 0 means greedy decoding
                maximum=2.0,
                value=0,
                step=0.1,
            )
            top_p_input = gr.Slider(
                label="Top-p采样",
                minimum=0.0,
                maximum=1.0,
                value=1,
                step=0.01,
            )
            generate_button = gr.Button("生成")
        with gr.Column():
            output_text = gr.Textbox(
                label="思维迁移窗口",
                placeholder="生成文本将显示在此处...",
                lines=10,
            )
    # BUG FIX: inputs must match generate_text's parameter order
    # (prompt, max_tokens, max_tokens_for_thinking, temperature, top_p).
    # The original listed the thinking-budget slider before max_tokens,
    # silently swapping the two budgets on every click.
    generate_button.click(
        fn=generate_text,
        inputs=[
            prompt_input,
            max_tokens_input,
            max_tokens_for_thinking_input,
            temperature_input,
            top_p_input,
        ],
        outputs=output_text,
    )

if __name__ == "__main__":
    demo.launch()
引用
@misc{yu2025efficientscaling,
title={Z1: Efficient Test-time Scaling with Code},
author={Zhaojian Yu and Yinghao Wu and Yilun Zhao and Arman Cohan and Xiao-Ping Zhang},
year={2025},
eprint={2504.00810},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2504.00810},
}