pipeline_tag: image-text-to-text
library_name: transformers
license: apache-2.0
ChemVLM-8B:面向化学领域的多模态大语言模型
这是ChemVLM的80亿参数版本。ChemVLM是一个专为化学应用设计的多模态大语言模型。
论文
ChemVLM:探索多模态大语言模型在化学领域的能力
摘要
大语言模型(LLMs)已取得显著成功,并应用于包括化学在内的多个科学领域。然而,许多化学任务需要处理视觉信息,现有化学大语言模型无法有效应对。这促使化学领域对能整合多模态信息模型的需求日益增长。本文介绍ChemVLM,一个专为化学应用设计的开源多模态大语言模型。ChemVLM基于精心构建的双语多模态数据集训练,增强了其理解文本和视觉化学信息的能力,包括分子结构、化学反应和化学试题。我们开发了三个评估数据集,分别针对化学光学字符识别(OCR)、多模态化学推理(MMCR)和多模态分子理解任务。我们在多项任务中将ChemVLM与开源和商业多模态大语言模型进行对比测试。实验结果表明,ChemVLM在所有评估任务中均展现出竞争力。模型地址:https://huggingface.co/AI4Chem/ChemVLM-26B。
模型描述
ChemVLM架构基于InternVLM,整合了视觉与语言处理组件。该模型在包含分子结构、化学反应和化学试题的双语多模态化学数据集上训练。更多架构细节详见Github说明文档。

引用
@inproceedings{li2025chemvlm,
title={Chemvlm: 探索多模态大语言模型在化学领域的能力},
author={李俊贤 and 张迪 and 王勋智 and 郝泽颖 and 雷静迪 and 谭茜 and 周才 and 刘伟 and 杨耀天 and 熊昕睿 and others},
booktitle={AAAI人工智能会议论文集},
volume={39},
number={1},
pages={415--423},
year={2025}
}
代码库与数据集
代码库及数据集详见 https://github.com/AI4Chem/ChemVlm 。
80亿参数模型在多项任务中的表现
| 数据集 | 化学OCR | 分子理解 | 化学推理评测 | 反应类型 |
| --- | --- | --- | --- | --- |
| 评估指标 | 相似度指数 / tani@1.0 | GPT-4o评分(%) | GPT-4o评分(%) | 准确率(%) |
| ChemVLM-8b得分 | 81.75/57.69 | 52.7(最优) | 33.6 | 16.79 |
快速开始
import torch
import torchvision.transforms as T
import transformers
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModelForCausalLM, AutoTokenizer
# Per-channel mean and standard deviation handed to T.Normalize in build_transform.
# NOTE(review): names and values suggest the usual ImageNet RGB statistics — confirm.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline applied to every image tile.

    The pipeline converts the image to RGB when it is in another mode,
    resizes it to ``input_size`` x ``input_size`` with bicubic
    interpolation, converts it to a tensor, and normalizes it with
    ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Args:
        input_size: Side length, in pixels, of the square output.

    Returns:
        The composed torchvision transform.
    """
    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        if img.mode != 'RGB':
            return img.convert('RGB')
        return img

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tile grid whose aspect ratio is nearest to the image's.

    Candidates are scanned in the order given. A candidate that is strictly
    closer always wins. A candidate that ties with the current best replaces
    it only when the image area (``width * height``) exceeds half of the
    pixel area that candidate's grid would cover
    (``image_size * image_size * cols * rows``).

    Args:
        aspect_ratio: Width divided by height of the source image.
        target_ratios: Iterable of ``(cols, rows)`` candidates.
        width: Source image width in pixels.
        height: Source image height in pixels.
        image_size: Side length of one square tile in pixels.

    Returns:
        The selected ``(cols, rows)`` candidate, or ``(1, 1)`` when
        ``target_ratios`` is empty.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
        elif gap == smallest_gap and image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            # Tie: prefer the later grid only for images large enough to fill it.
            chosen = candidate
    return chosen
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    """Split an image into square tiles on the best-fitting grid.

    Every ``(cols, rows)`` grid whose tile count lies between ``min_num``
    and ``max_num`` is considered, and ``find_closest_aspect_ratio`` picks
    one. The image is resized to exactly cover that grid and cut into
    ``image_size`` x ``image_size`` tiles, row by row from the top left.

    Args:
        image: Source image exposing ``size``, ``resize`` and ``crop``.
        min_num: Minimum number of tiles in a candidate grid.
        max_num: Maximum number of tiles in a candidate grid.
        image_size: Side length of each square tile in pixels.
        use_thumbnail: When true and more than one tile was produced, a
            resized copy of the whole image is appended as a final tile.

    Returns:
        List of tile images, optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # Enumerate every grid with an allowed tile count, smallest grids first.
    grids = {
        (cols, rows)
        for n in range(min_num, max_num + 1)
        for cols in range(1, n + 1)
        for rows in range(1, n + 1)
        if min_num <= cols * rows <= max_num
    }
    grids = sorted(grids, key=lambda grid: grid[0] * grid[1])

    best_grid = find_closest_aspect_ratio(src_ratio, grids, src_w, src_h, image_size)
    canvas_w = image_size * best_grid[0]
    canvas_h = image_size * best_grid[1]
    tile_count = best_grid[0] * best_grid[1]

    canvas = image.resize((canvas_w, canvas_h))
    tiles_per_row = canvas_w // image_size
    tiles = []
    for index in range(tile_count):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))
    assert len(tiles) == tile_count

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
def load_image(image_file, input_size=448, max_num=6):
    """Load an image and convert it into a stacked batch of normalized tiles.

    The image is opened and converted to RGB, split into tiles by
    ``dynamic_preprocess`` (with the thumbnail enabled), and each tile is
    run through ``build_transform(input_size)`` before stacking.

    Args:
        image_file: Path or file object accepted by ``Image.open``.
        input_size: Side length of each square tile in pixels.
        max_num: Maximum number of grid tiles; the thumbnail is extra.

    Returns:
        The result of ``torch.stack`` over the transformed tiles, so its
        first dimension indexes the tiles.
    """
    # Open inside a context manager so the file handle is released as soon
    # as convert() has produced an independent in-memory copy.
    with Image.open(image_file) as source:
        image = source.convert('RGB')
    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([transform(tile) for tile in tiles])
# Load the tokenizer and model. trust_remote_code=True is passed to both calls.
tokenizer = AutoTokenizer.from_pretrained('AI4Chem/ChemVLM-8B', trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "AI4Chem/ChemVLM-8B",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
# Move the model to the GPU and switch it to evaluation mode.
model = model.cuda().eval()

# Prompt and image to ask about; replace image_path with a real file path.
query = "请描述图片中的分子结构"
image_path = "图片路径"

# Tile the image, then match the model's dtype and device.
pixel_values = load_image(image_path, max_num=6)
pixel_values = pixel_values.to(torch.bfloat16).cuda()

# Generation settings handed to model.chat.
gen_kwargs = dict(max_length=1000, do_sample=True, temperature=0.7, top_p=0.9)
response = model.chat(tokenizer, pixel_values, query, gen_kwargs)
print(response)
安装依赖库:`pip install "transformers>=4.37.0" sentencepiece einops timm "accelerate>=0.26.0"`。请确保已安装torch和torchvision。