Benchmark Evaluation (MMLU)
- Let's run an evaluation with MMLU, the (arguably) best-known benchmark dataset.
- We evaluate open-weight models first, and then evaluate the model we built as well.
Model Download & Inference
- We run an inference test using the HuggingFace transformers library.
In [4]:
from huggingface_hub import login
login(token='')
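- (Sketch) Pasting a token directly into the notebook is easy to leak. A minimal alternative, assuming an HF_TOKEN environment variable was exported beforehand:
In [ ]:
import os
from huggingface_hub import login

# A sketch: read the Hugging Face token from the environment instead of hard-coding it.
# Assumes HF_TOKEN was exported before starting the notebook.
login(token=os.environ["HF_TOKEN"])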
In [ ]:
import transformers
import torch

model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Build a text-generation pipeline; bfloat16 weights with device_map="auto"
# place the model on the available GPU(s) automatically.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    token="",
)
In [ ]:
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "체첸 공화국에 대해서 설명해"},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)
# The pipeline returns the whole chat history; the last entry is the assistant's reply.
print(outputs[0]["generated_text"][-1])
KMMLU Data Preparation
- We download the Korean version of MMLU and test with it.
- Several variants of the dataset exist and you can use whichever you prefer; this example uses HAERAE-HUB's KMMLU data, specifically the Korean-History subset (a sketch for listing the other subsets follows this list).
- The MMLU dataset is a four-option multiple-choice QA task.
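- (Sketch) To see which other subject subsets exist, the datasets library can list the dataset's configurations. This assumes network access to the Hugging Face Hub:
In [ ]:
from datasets import get_dataset_config_names

# List all KMMLU subject configs ('Korean-History' is one of them).
print(get_dataset_config_names("HAERAE-HUB/KMMLU"))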
In [ ]:
#!pip install datasets
In [ ]:
from datasets import load_dataset
dataset = load_dataset("HAERAE-HUB/KMMLU", 'Korean-History')
In [ ]:
dataset['test'][10]
In [ ]:
item = dataset['test'][10]
context = f"Question: {item['question']}\nChoices: A) {item['A']}, B) {item['B']}, C) {item['C']}, D) {item['D']}\nAnswer:"
context
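- (Sketch) Instruct-tuned models sometimes answer more reliably when the question is wrapped in their chat template rather than sent as a raw completion prompt. The snippet below only illustrates this; it reuses the `pipeline` object from the inference cell above, and the system prompt is an assumption, not part of the original evaluation:
In [ ]:
# Render the same KMMLU question through the model's chat template.
# Assumes the `pipeline` from the earlier Llama-3.1 cell is still loaded.
chat = [
    {"role": "system", "content": "Answer with a single letter: A, B, C, or D."},
    {"role": "user", "content": context},
]
prompt = pipeline.tokenizer.apply_chat_template(
    chat, tokenize=False, add_generation_prompt=True
)
print(prompt)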
Scoring Logic
- Grading is zero-shot: we prompt the model and check which of the four choices it picked.
- Depending on the shape of the model's output, the answer can be extracted incorrectly, so inspect the outputs carefully.
- The code below is a heuristic grader; a log-likelihood based alternative is sketched right after it.
In [11]:
# Function to evaluate model on MMLU
def evaluate_mmlu(model, tokenizer, dataset, split="test"):
    correct = 0
    total = 0

    for item in dataset[split]:
        # Prepare the context and question
        context = f"Question: {item['question']}\nChoices: A) {item['A']}, B) {item['B']}, C) {item['C']}, D) {item['D']}\nAnswer:"

        # Encode input and run model inference
        inputs = tokenizer(context, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=10)

        # Decode and process the output to get the predicted choice
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(generated_text)
        try:
            prediction = generated_text.split("Answer:")[-1].strip()[0]  # first letter after "Answer:"
        except IndexError:
            prediction = 'E'  # nothing generated after "Answer:"; counts as wrong

        # Check if prediction matches the correct answer
        correct_answer = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}[item['answer']]
        print(f'Prediction: {prediction}, Answer: {correct_answer}')
        if prediction == correct_answer:
            correct += 1
        total += 1

    # Calculate accuracy
    accuracy = correct / total if total > 0 else 0
    return accuracy
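- (Sketch) A less brittle grader avoids parsing generated text altogether: score the model's next-token logit for each choice letter and pick the highest. This is not the notebook's original method, only a sketch; it assumes " A" through " D" each encode to a single token for the tokenizer in use:
In [ ]:
import torch

def evaluate_mmlu_loglikelihood(model, tokenizer, dataset, split="test"):
    letters = ['A', 'B', 'C', 'D']
    correct = 0
    total = 0
    for item in dataset[split]:
        context = f"Question: {item['question']}\nChoices: A) {item['A']}, B) {item['B']}, C) {item['C']}, D) {item['D']}\nAnswer:"
        inputs = tokenizer(context, return_tensors="pt").to(model.device)
        with torch.no_grad():
            logits = model(**inputs).logits[0, -1]  # logits for the next token
        # One token id per letter (assumes " A".." D" are single tokens).
        letter_ids = [tokenizer.encode(" " + l, add_special_tokens=False)[0] for l in letters]
        prediction = letters[int(logits[letter_ids].argmax())]
        correct_answer = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}[item['answer']]
        if prediction == correct_answer:
            correct += 1
        total += 1
    return correct / total if total > 0 else 0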
In [ ]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import time
# Load model and tokenizer
model_name = "meta-llama/Llama-3.1-8B-Instruct"
#model_name = "Qwen/Qwen2.5-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Run the evaluation and measure time
start_time = time.time()
accuracy = evaluate_mmlu(model, tokenizer, dataset)
end_time = time.time()
# Print accuracy and elapsed time
print(f"Model Accuracy on MMLU: {accuracy * 100:.2f}%")
print(f"Time to perform evaluate_mmlu: {end_time - start_time:.2f} seconds")
In [ ]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import time
# Load model and tokenizer
model_name = "jonhpark/llama3.1-8b-kowiki-instruct-16bit"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Run the evaluation and measure time
start_time = time.time()
accuracy = evaluate_mmlu(model, tokenizer, dataset)
end_time = time.time()
# Print accuracy and elapsed time
print(f"Model Accuracy on MMLU: {accuracy * 100:.2f}%")
print(f"Time to perform evaluate_mmlu: {end_time - start_time:.2f} seconds")