Library name: transformers
License: mit
Datasets:
- biglab/jitteredwebsites-merged-224-paraphrased
- biglab/jitteredwebsites-merged-224-paraphrased-paired
- biglab/uiclip_human_data_hf
Base models:
- openai/clip-vit-base-patch32
- biglab/uiclip_jitteredwebsites-2-224-paraphrased_webpairs
Model Card
UIClip is a model that quantifies the design quality and relevancy of a user interface (UI) screenshot given a textual description.
Model Description
UIClip is a model that quantifies the design quality and relevancy of a user interface (UI) screenshot given a textual description.
The model can also be used to generate natural language design suggestions (see the paper).
This model is described in the paper "UIClip: A Data-driven Model for Assessing User Interface Design," published at UIST 2024 (https://arxiv.org/abs/2404.12500).
User interface (UI) design is a difficult yet important task for ensuring the usability, accessibility, and aesthetic qualities of applications. In our paper, we develop a machine-learned model, UIClip, for assessing the design quality and visual relevance of a UI given its screenshot and a natural language description. To train UIClip, we combined automated crawling, synthetic augmentation, and human ratings to construct a large-scale dataset of UIs, collated by description and ranked by design quality. Through training on this dataset, UIClip implicitly learns properties of good and bad designs by i) assigning a numerical score that represents a UI design's relevance and quality and ii) providing design suggestions. In an evaluation comparing the outputs of UIClip and other baselines to UIs rated by 12 human designers, we found that UIClip achieved the highest agreement with the ground-truth rankings. Finally, we present three example applications that demonstrate how UIClip can facilitate downstream applications that rely on instantaneous assessment of UI design quality: i) UI code generation, ii) UI design tips generation, and iii) quality-aware UI example search.
- Developed by: BigLab
- Model type: CLIP-style multimodal dual-encoder Transformer
- Language (NLP): English
- License: MIT
Example Code
import torch
from transformers import CLIPProcessor, CLIPModel
IMG_SIZE = 224
DEVICE = "cpu"  # "cuda" or "mps" can be used instead if available
LOGIT_SCALE = 100  # logit scale used by CLIP-style models
NORMALIZE_SCORING = True
model_path = "uiclip_jitteredwebsites-2-224-paraphrased_webpairs_humanpairs"  # local checkpoint path; when loading from the Hub this likely needs the org prefix, e.g. "biglab/..."
processor_path = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_path)
model = model.eval()
model = model.to(DEVICE)
processor = CLIPProcessor.from_pretrained(processor_path)
def compute_quality_scores(input_list):
    # input_list is a list of (description, PIL image) pairs
    description_list = ["ui screenshot. well-designed. " + input_item[0] for input_item in input_list]
    img_list = [input_item[1] for input_item in input_list]
    text_embeddings_tensor = compute_description_embeddings(description_list)  # B x H
    img_embeddings_tensor = compute_image_embeddings(img_list)  # B x H
    # normalize embeddings
    text_embeddings_tensor /= text_embeddings_tensor.norm(dim=-1, keepdim=True)
    img_embeddings_tensor /= img_embeddings_tensor.norm(dim=-1, keepdim=True)
    if NORMALIZE_SCORING:
        # contrast the "well-designed" prompt with a "poor design" prompt
        text_embeddings_tensor_poor = compute_description_embeddings([d.replace("well-designed. ", "poor design. ") for d in description_list])
        text_embeddings_tensor_poor /= text_embeddings_tensor_poor.norm(dim=-1, keepdim=True)
        text_embeddings_tensor_all = torch.stack((text_embeddings_tensor, text_embeddings_tensor_poor), dim=1)  # B x 2 x H
    else:
        text_embeddings_tensor_all = text_embeddings_tensor.unsqueeze(1)
    img_embeddings_tensor = img_embeddings_tensor.unsqueeze(1)  # B x 1 x H
    scores = (LOGIT_SCALE * img_embeddings_tensor @ text_embeddings_tensor_all.permute(0, 2, 1)).squeeze(1)
    if NORMALIZE_SCORING:
        scores = scores.softmax(dim=-1)
    return scores[:, 0]
def compute_description_embeddings(descriptions):
    inputs = processor(text=descriptions, return_tensors="pt", padding=True)
    inputs['input_ids'] = inputs['input_ids'].to(DEVICE)
    inputs['attention_mask'] = inputs['attention_mask'].to(DEVICE)
    text_embedding = model.get_text_features(**inputs)
    return text_embedding
def compute_image_embeddings(image_list):
    # split each image into square sub-windows, embed all of them, then average per image
    windowed_batch = [slide_window_over_image(img, IMG_SIZE) for img in image_list]
    inds = []
    for imgi in range(len(windowed_batch)):
        inds.append([imgi for _ in windowed_batch[imgi]])
    processed_batch = [item for sublist in windowed_batch for item in sublist]
    inputs = processor(images=processed_batch, return_tensors="pt")
    # run every sub-window of every image through the model in a single batch
    inputs['pixel_values'] = inputs['pixel_values'].to(DEVICE)
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
    # the output contains features for all sub-windows; pool the ones belonging to each image
    processed_batch_inds = torch.tensor([item for sublist in inds for item in sublist]).long().to(image_features.device)
    embed_list = []
    for i in range(len(image_list)):
        mask = processed_batch_inds == i
        embed_list.append(image_features[mask].mean(dim=0))
    image_embedding = torch.stack(embed_list, dim=0)
    return image_embedding
def preresize_image(image, image_size):
    # resize so that the shorter side equals image_size, preserving the aspect ratio
    aspect_ratio = image.width / image.height
    if aspect_ratio > 1:
        image = image.resize((int(aspect_ratio * image_size), image_size))
    else:
        image = image.resize((image_size, int(image_size / aspect_ratio)))
    return image
def slide_window_over_image(input_image, img_size):
    # crop overlapping square windows along the longer dimension of the resized image
    input_image = preresize_image(input_image, img_size)
    width, height = input_image.size
    square_size = min(width, height)
    longer_dimension = max(width, height)
    num_steps = (longer_dimension + square_size - 1) // square_size
    if num_steps > 1:
        step_size = (longer_dimension - square_size) // (num_steps - 1)
    else:
        step_size = square_size
    cropped_images = []
    for y in range(0, height - square_size + 1, step_size if height > width else square_size):
        for x in range(0, width - square_size + 1, step_size if width > height else square_size):
            left = x
            upper = y
            right = x + square_size
            lower = y + square_size
            cropped_image = input_image.crop((left, upper, right, lower))
            cropped_images.append(cropped_image)
    return cropped_images
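# Note: "test_descriptions" and "test_images" are not defined above; they are expected
# to be a list of description strings and a matching list of PIL images. A minimal,
# purely illustrative way to build them (the file names below are hypothetical placeholders):
from PIL import Image
test_descriptions = [
    "a news website with a headline and a navigation bar",
    "a login screen for a banking app",
]
test_images = [Image.open("screenshot_1.png"), Image.open("screenshot_2.png")]
# compute quality scores for the (description, image) pairs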
prediction_scores = compute_quality_scores(list(zip(test_descriptions, test_images)))
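As the code above shows, with NORMALIZE_SCORING enabled each returned score is the softmax weight the model assigns to the "well-designed" prompt relative to the contrasting "poor design" prompt, so it lies between 0 and 1, with higher values indicating a better-designed UI for the given description; with normalization disabled, the scores are raw scaled similarities between the image embedding and the "well-designed" description embedding.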