标签:
中文维基百科标点恢复工具
更多详情请访问: https://github.com/p208p2002/ZH-Punctuation-Restore
支持恢复6种标点符号:逗号(,)、顿号(、)、句号(。)、问号(?)、感叹号(!)、分号(;)
安装
pip install zhpr
使用方法
from zhpr.predict import DocumentDataset,merge_stride,decode_pred
from transformers import AutoModelForTokenClassification,AutoTokenizer
from torch.utils.data import DataLoader
def predict_step(batch,model,tokenizer):
    """Predict a label for every non-padding token of each row in a batch.

    Args:
        batch: Batch of token-id rows (a tensor supporting ``.tolist()``);
            it is handed to the model as ``input_ids``.
        model: Token-classification model. It is called as
            ``model(input_ids=batch)`` and must expose ``output['logits']``
            and ``config.id2label``.
        tokenizer: Tokenizer providing ``pad_token_id`` and
            ``convert_ids_to_tokens``.

    Returns:
        A list with one entry per row of ``batch``; each entry is a list of
        ``(token, label)`` tuples, cut off at the first padding token of
        that row (the whole row is kept when it contains no padding).
    """
    # Function-scope import so the block does not depend on a file-level
    # ``import torch`` being present.
    import torch

    # Pure inference: no autograd graph is needed for argmax over logits.
    with torch.no_grad():
        output = model(input_ids=batch)

    # Convert once to plain Python lists instead of calling .item() per token.
    label_id_rows = output['logits'].argmax(-1).tolist()
    id2label = model.config.id2label
    pad_id = tokenizer.pad_token_id

    batch_out = []
    for label_ids, input_ids in zip(label_id_rows, batch.tolist()):
        try:
            pad_start = input_ids.index(pad_id)
        except ValueError:
            # list.index raises ValueError when the row has no pad token.
            pad_start = len(input_ids)

        tokens = tokenizer.convert_ids_to_tokens(input_ids[:pad_start])
        labels = [id2label[label_id] for label_id in label_ids[:pad_start]]
        batch_out.append(list(zip(tokens, labels)))
    return batch_out
if __name__ == "__main__":
    # Demo: run the published checkpoint over one unpunctuated passage and
    # print the text with the predicted punctuation inserted.
    text = "维基百科是维基媒体基金会运营的一个多语言的百科全书目前是全球网络上最大且最受大众欢迎的参考工具书名列全球二十大最受欢迎的网站特点是自由内容自由编辑与自由著作权"

    # Windowing parameters passed to DocumentDataset; `step` is also passed
    # to merge_stride below (presumably so overlapping windows can be
    # stitched back together -- confirm against zhpr.predict).
    window_size = 256
    step = 200
    windows = DocumentDataset(text, window_size=window_size, step=step)
    loader = DataLoader(dataset=windows, shuffle=False, batch_size=5)

    model_name = 'p208p2002/zh-wiki-punctuation-restore'
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Flatten the per-batch results into one list of per-window predictions,
    # keeping the loader's order.
    window_predictions = [
        window_out
        for batch in loader
        for window_out in predict_step(batch, model, tokenizer)
    ]

    merged_prediction = merge_stride(window_predictions, step)
    restored_text = ''.join(decode_pred(merged_prediction))
    print(restored_text)
维基百科是维基媒体基金会运营的一个多语言的百科全书,目前是全球网络上最大且最受大众欢迎的参考工具书,名列全球二十大最受欢迎的网站,特点是自由内容、自由编辑与自由著作权。