LLaVE-7B Multimodal Embedding Model
Model Overview
LLaVE-7B is the 7-billion-parameter model in the LLaVE series of multimodal embedding models. It is built on LLaVA-OneVision-7B and has a context window of 4K tokens.
Training / Evaluation Data
- Training data: https://huggingface.co/datasets/TIGER-Lab/MMEB-train (a loading sketch follows this list)
- Evaluation data: https://huggingface.co/datasets/TIGER-Lab/MMEB-eval
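Both datasets are hosted on the Hugging Face Hub, so they can be pulled with the `datasets` library. Below is a minimal loading sketch; the subset name used here is an illustrative assumption, since MMEB is organized into per-task subsets whose actual names are listed on the dataset pages.

```python
# Minimal sketch: pulling MMEB data from the Hugging Face Hub with the
# `datasets` library. The subset name "ImageNet_1K" is illustrative --
# browse the dataset page for the actual configuration names.
from datasets import load_dataset

train_subset = load_dataset("TIGER-Lab/MMEB-train", "ImageNet_1K", split="train")
print(train_subset[0])  # inspect one training example
```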
Usage
Intended Use
The model can produce embedding representations of text, images, multi-image inputs, and videos.
MMEB Leaderboard
We reached first place on the MMEB leaderboard while using only a small amount of training data.

Model Performance
LLaVE-7B achieves state-of-the-art performance on MMEB with only 662K training pairs.

Although LLaVE is trained on image-text data, it generalizes to text-video retrieval tasks in a zero-shot manner with strong performance, demonstrating its remarkable potential to transfer to other embedding tasks.
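Text-video retrieval itself is not part of the Quick Start below, but one plausible way to try it is to treat a video as a handful of sampled frames and push them through the multi-image path. The sketch below does exactly that. Note that the one-`<image>`-token-per-frame prompt layout and the reuse of `encode_multimodal_embeddings` for frames are our assumptions for illustration, not the repository's documented video interface.

```python
# Hedged sketch: text-video retrieval by embedding sampled frames through the
# multi-image path. ASSUMPTIONS (not confirmed by the LLaVE repo): frames can
# be passed as a list of images, with one DEFAULT_IMAGE_TOKEN per frame.
import copy
import torch
from PIL import Image
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from llava.mm_utils import tokenizer_image_token, process_images
from llava.model.builder import load_pretrained_model

tokenizer, model, image_processor, _ = load_pretrained_model(
    "zhibinlan/LLaVE-7B", None, "llava_qwen", device_map="auto")
model.eval()
device = "cuda"

# Frames pre-extracted from the clip, e.g. with:
#   ffmpeg -i clip.mp4 -vf fps=1 frames/frame_%03d.jpg
frames = [Image.open(f"frames/frame_{i:03d}.jpg") for i in range(1, 4)]
frame_tensors = process_images(frames, image_processor, model.config)
frame_tensors = [t.to(dtype=torch.float16, device=device) for t in frame_tensors]

# One <image> placeholder per frame, then the embedding instruction.
prompt = DEFAULT_IMAGE_TOKEN * len(frames) + " Represent the given video."
conv = copy.deepcopy(conv_templates["qwen_1_5"])
conv.append_message(conv.roles[0], prompt)
conv.append_message(conv.roles[1], "\n")
input_ids = tokenizer_image_token(conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX,
                                  return_tensors="pt").unsqueeze(0).to(device)
video_embed = model.encode_multimodal_embeddings(
    input_ids, attention_mask=input_ids.ne(tokenizer.pad_token_id),
    images=frame_tensors, image_sizes=[f.size for f in frames])

# A text query embedded as in the Quick Start below can then be scored
# against video_embed with a dot product.
```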

Quick Start
First, clone our GitHub repository:
```bash
git clone https://github.com/DeepLearnXMU/LLaVE
cd LLaVE
pip install -e ".[train]"
```
We provide a simplified embedding pipeline below: it embeds an image-plus-text query, compares it against text candidates, and then runs the reverse text-to-image direction. For more details, please refer to the GitHub repository.
```python
import torch
import copy
from PIL import Image
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates
from llava.model.builder import load_pretrained_model
from llava.mm_utils import tokenizer_image_token, process_images

# Load the tokenizer, model, and image processor.
pretrained = "zhibinlan/LLaVE-7B"
model_name = "llava_qwen"
device = "cuda"
device_map = "auto"
tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, device_map=device_map)
model.eval()

# Image + Text -> Text: embed an (image, question) pair as the query.
image = Image.open("figures/example.jpg")
image_tensor = process_images([image], image_processor, model.config)
image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]
conv_template = "qwen_1_5"  # Make sure to use the chat template that matches the model.

question = DEFAULT_IMAGE_TOKEN + " Represent the given image with the following question: What is in the image"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], "\n")
prompt_question = conv.get_prompt()
input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
attention_mask = input_ids.ne(tokenizer.pad_token_id)
image_sizes = [image.size]
query_embed = model.encode_multimodal_embeddings(input_ids, attention_mask=attention_mask, images=image_tensor, image_sizes=image_sizes)

# Embed a matching text candidate and score it against the query.
target_string = "A cat and a dog"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], target_string)
conv.append_message(conv.roles[1], "\n")
target_string = conv.get_prompt()
target_input_ids = tokenizer(target_string, return_tensors="pt").input_ids.to(device)
attention_mask = target_input_ids.ne(tokenizer.pad_token_id)
target_embed = model.encode_multimodal_embeddings(target_input_ids, attention_mask=attention_mask)

print("A cat and a dog similarity score:", query_embed @ target_embed.T)

# Embed a non-matching text candidate; its score should be lower.
neg_string = "A cat and a tiger"
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], neg_string)
conv.append_message(conv.roles[1], "\n")
neg_string = conv.get_prompt()
neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
attention_mask = neg_input_ids.ne(tokenizer.pad_token_id)
neg_embed = model.encode_multimodal_embeddings(neg_input_ids, attention_mask=attention_mask)

print("A cat and a tiger similarity score:", query_embed @ neg_embed.T)

# Text -> Image: embed a text query and retrieve against an image target.
pos_string = "Find me an everyday image that matches the given caption: A cat and a dog."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], pos_string)
conv.append_message(conv.roles[1], "\n")
pos_string = conv.get_prompt()
pos_input_ids = tokenizer(pos_string, return_tensors="pt").input_ids.to(device)
attention_mask = pos_input_ids.ne(tokenizer.pad_token_id)
pos_query_embed = model.encode_multimodal_embeddings(pos_input_ids, attention_mask=attention_mask)

target = DEFAULT_IMAGE_TOKEN + " Represent the given image."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], target)
conv.append_message(conv.roles[1], "\n")
prompt_target = conv.get_prompt()
target_input_ids = tokenizer_image_token(prompt_target, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
attention_mask = target_input_ids.ne(tokenizer.pad_token_id)
target_image_sizes = [image.size]
target_embed = model.encode_multimodal_embeddings(target_input_ids, attention_mask=attention_mask, images=image_tensor, image_sizes=target_image_sizes)

print("A cat and a dog image similarity score:", pos_query_embed @ target_embed.T)

# A mismatched text query against the same image target scores lower.
neg_string = "Find me an everyday image that matches the given caption: A cat and a tiger."
conv = copy.deepcopy(conv_templates[conv_template])
conv.append_message(conv.roles[0], neg_string)
conv.append_message(conv.roles[1], "\n")
neg_string = conv.get_prompt()
neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
attention_mask = neg_input_ids.ne(tokenizer.pad_token_id)
neg_query_embed = model.encode_multimodal_embeddings(neg_input_ids, attention_mask=attention_mask)

print("A cat and a tiger image similarity score:", neg_query_embed @ target_embed.T)
```
Hardware & Software
Citation
```bibtex
@article{lan2025llave,
  title={LLaVE: Large Language and Vision Embedding Models with Hardness-Weighted Contrastive Learning},
  author={Lan, Zhibin and Niu, Liqiang and Meng, Fandong and Zhou, Jie and Su, Jinsong},
  journal={arXiv preprint arXiv:2503.04812},
  year={2025}
}
```