Benchmark Evaluation (MMLU)
- Let's run an evaluation with MMLU, the (arguably) best-known benchmark dataset.
- We evaluate open-weight models first, and then evaluate the model we built as well.
Model Download & Inference
- We run an inference test using the HuggingFace transformers library.
In [4]:
from huggingface_hub import login
login(token='')
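- (Sketch) Pasting a token directly into the notebook is easy to leak. A minimal alternative, assuming an HF_TOKEN environment variable was exported beforehand:
In [ ]:
import os
from huggingface_hub import login

# A sketch: read the Hugging Face token from the environment instead of hard-coding it.
# Assumes HF_TOKEN was exported before starting the notebook.
login(token=os.environ["HF_TOKEN"])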
In [ ]:
import transformers
import torch

model_id = "meta-llama/Llama-3.1-8B-Instruct"

# Build a text-generation pipeline; bfloat16 weights with device_map="auto"
# place the model on the available GPU(s) automatically.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    token="",
)
In [ ]:
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "체첸 공화국에 대해서 설명해"},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)
# The pipeline returns the whole chat history; the last entry is the assistant's reply.
print(outputs[0]["generated_text"][-1])
KMMLU Data Preparation
- We download the Korean version of MMLU and test with it.
- Several variants of the dataset exist and you can use whichever you prefer; this example uses HAERAE-HUB's KMMLU data, specifically the Korean-History subset (a sketch for listing the other subsets follows this list).
- The MMLU dataset is a four-option multiple-choice QA task.
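- (Sketch) To see which other subject subsets exist, the datasets library can list the dataset's configurations. This assumes network access to the Hugging Face Hub:
In [ ]:
from datasets import get_dataset_config_names

# List all KMMLU subject configs ('Korean-History' is one of them).
print(get_dataset_config_names("HAERAE-HUB/KMMLU"))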
In [ ]:
#!pip install datasets
In [ ]:
from datasets import load_dataset
dataset = load_dataset("HAERAE-HUB/KMMLU", 'Korean-History')
In [ ]:
dataset['test'][10]
In [ ]:
item = dataset['test'][10]
context = f"Question: {item['question']}\nChoices: A) {item['A']}, B) {item['B']}, C) {item['C']}, D) {item['D']}\nAnswer:"
context
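- (Sketch) Instruct-tuned models sometimes answer more reliably when the question is wrapped in their chat template rather than sent as a raw completion prompt. The snippet below only illustrates this; it reuses the `pipeline` object from the inference cell above, and the system prompt is an assumption, not part of the original evaluation:
In [ ]:
# Render the same KMMLU question through the model's chat template.
# Assumes the `pipeline` from the earlier Llama-3.1 cell is still loaded.
chat = [
    {"role": "system", "content": "Answer with a single letter: A, B, C, or D."},
    {"role": "user", "content": context},
]
prompt = pipeline.tokenizer.apply_chat_template(
    chat, tokenize=False, add_generation_prompt=True
)
print(prompt)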
Scoring Logic
- Grading is zero-shot: we prompt the model and check which of the four choices it picked.
- Depending on the shape of the model's output, the answer can be extracted incorrectly, so inspect the outputs carefully.
- The code below is a heuristic grader; a log-likelihood based alternative is sketched right after it.
In [11]:
# Function to evaluate model on MMLU
def evaluate_mmlu(model, tokenizer, dataset, split="test"):
    correct = 0
    total = 0

    for item in dataset[split]:
        # Prepare the context and question
        context = f"Question: {item['question']}\nChoices: A) {item['A']}, B) {item['B']}, C) {item['C']}, D) {item['D']}\nAnswer:"

        # Encode input and run model inference
        inputs = tokenizer(context, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=10)

        # Decode and process the output to get the predicted choice
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(generated_text)
        try:
            prediction = generated_text.split("Answer:")[-1].strip()[0]  # first letter after "Answer:"
        except IndexError:
            prediction = 'E'  # nothing generated after "Answer:"; counts as wrong

        # Check if prediction matches the correct answer
        correct_answer = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}[item['answer']]
        print(f'Prediction: {prediction}, Answer: {correct_answer}')
        if prediction == correct_answer:
            correct += 1
        total += 1

    # Calculate accuracy
    accuracy = correct / total if total > 0 else 0
    return accuracy
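- (Sketch) A less brittle grader avoids parsing generated text altogether: score the model's next-token logit for each choice letter and pick the highest. This is not the notebook's original method, only a sketch; it assumes " A" through " D" each encode to a single token for the tokenizer in use:
In [ ]:
import torch

def evaluate_mmlu_loglikelihood(model, tokenizer, dataset, split="test"):
    letters = ['A', 'B', 'C', 'D']
    correct = 0
    total = 0
    for item in dataset[split]:
        context = f"Question: {item['question']}\nChoices: A) {item['A']}, B) {item['B']}, C) {item['C']}, D) {item['D']}\nAnswer:"
        inputs = tokenizer(context, return_tensors="pt").to(model.device)
        with torch.no_grad():
            logits = model(**inputs).logits[0, -1]  # logits for the next token
        # One token id per letter (assumes " A".." D" are single tokens).
        letter_ids = [tokenizer.encode(" " + l, add_special_tokens=False)[0] for l in letters]
        prediction = letters[int(logits[letter_ids].argmax())]
        correct_answer = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}[item['answer']]
        if prediction == correct_answer:
            correct += 1
        total += 1
    return correct / total if total > 0 else 0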
In [ ]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import time
# Load model and tokenizer
model_name = "meta-llama/Llama-3.1-8B-Instruct"
#model_name = "Qwen/Qwen2.5-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Run the evaluation and measure time
start_time = time.time()
accuracy = evaluate_mmlu(model, tokenizer, dataset)
end_time = time.time()
# Print accuracy and elapsed time
print(f"Model Accuracy on MMLU: {accuracy * 100:.2f}%")
print(f"Time to perform evaluate_mmlu: {end_time - start_time:.2f} seconds")
In [ ]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import time
# Load model and tokenizer
model_name = "jonhpark/llama3.1-8b-kowiki-instruct-16bit"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Run the evaluation and measure time
start_time = time.time()
accuracy = evaluate_mmlu(model, tokenizer, dataset)
end_time = time.time()
# Print accuracy and elapsed time
print(f"Model Accuracy on MMLU: {accuracy * 100:.2f}%")
print(f"Time to perform evaluate_mmlu: {end_time - start_time:.2f} seconds")