In [ ]:
Copied!
# Download the Project Gutenberg plain text of 三國演義 (Romance of the
# Three Kingdoms, ebook #23950) into the working directory.
!wget https://www.gutenberg.org/cache/epub/23950/pg23950.txt
# NOTE(review): duplicated line — notebook-scrape artifact; running wget
# twice just re-downloads the same file.
!wget https://www.gutenberg.org/cache/epub/23950/pg23950.txt
In [ ]:
Copied!
# Read the downloaded Gutenberg text. The file is UTF-8, so pin the
# encoding instead of relying on the platform default (which would break
# on e.g. cp949/Windows).
with open('pg23950.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Strip the ideographic space U+3000 used for indentation in the source.
text = text.replace('\u3000', '')
# NOTE(review): duplicate of the cell above (notebook-scrape artifact).
# Re-reads the file with the platform-default encoding — presumably the
# file is UTF-8; confirm before running on Windows.
with open('pg23950.txt', 'r') as f:
    text = f.read()
# Remove the ideographic space U+3000.
text = text.replace('\u3000', '')
In [ ]:
Copied!
# Split the raw text into paragraph chunks on blank lines, then trim the
# Gutenberg header/footer metadata by position.
chunks = text.split('\n\n')
chunks = chunks[10:]  # drop header metadata — assumes first 10 chunks are boilerplate; TODO confirm
chunks = chunks[:3694]  # drop license footer — offset found by inspection; TODO confirm
chunks[3]  # notebook display: spot-check one chunk
# Optional newline removal inside each chunk (left disabled on purpose):
#for i in range(len(chunks)):
# chunks[i] = chunks[i].replace("\n", "")
# NOTE(review): duplicate of the statements above (notebook-scrape artifact).
chunks = text.split('\n\n')
chunks = chunks[10:]  # cut starts
chunks = chunks[:3694]  # cut ends
chunks[3]
#for i in range(len(chunks)):
# chunks[i] = chunks[i].replace("\n", "")
OPENAI API 를 사용해서 번역하기¶
- openai python library 설치
- api 키 발급 & 등록
- prompting 으로 chunk 단위 번역
In [ ]:
Copied!
# Install/upgrade the OpenAI Python SDK (duplicated cell — scrape artifact).
!pip install -U openai
!pip install -U openai
In [ ]:
Copied!
import os
# Placeholder for the OpenAI API key — fill in locally, never commit a real
# key. The openai client reads OPENAI_API_KEY from the environment.
os.environ["OPENAI_API_KEY"]=""
# NOTE(review): duplicate lines below (notebook-scrape artifact).
import os
os.environ["OPENAI_API_KEY"]=""
In [ ]:
Copied!
import openai

# Plain (untraced) client; requires OPENAI_API_KEY to be set already.
client = openai.Client()

def translate_pipeline(prompt: str, user_input: str):
    """Translate `user_input` (Chinese) into Korean via gpt-4o-mini.

    `prompt` is prepended to the user message as extra instruction text
    (pass "" for none). Returns the model's reply text; propagates the
    openai SDK's exceptions on API/network failure.
    """
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You will be provided with a sentence in Chinese, and your task is to translate it into Korean."
            },
            {
                "role": "user",
                "content": prompt + user_input
            }
        ],
        temperature=0.7,  # some sampling freedom for natural prose
        top_p=1
    )
    return result.choices[0].message.content

# NOTE(review): everything below duplicates the cell above (scrape artifact).
import openai
client = openai.Client()

def translate_pipeline(prompt: str, user_input: str):
    """Duplicate of the definition above (notebook-scrape artifact)."""
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You will be provided with a sentence in Chinese, and your task is to translate it into Korean."
            },
            {
                "role": "user",
                "content": prompt + user_input
            }
        ],
        temperature=0.7,
        top_p=1
    )
    return result.choices[0].message.content
In [ ]:
Copied!
# Smoke-test: translate one chunk (duplicated call — scrape artifact).
translate_pipeline("", chunks[2])
translate_pipeline("", chunks[2])
LangSmith Tracing 추가하기¶
- LangSmith 가입 & api_key 발급
- 위 번역 task 에 tracing을 추가
In [ ]:
Copied!
# Install/upgrade LangSmith for tracing (duplicated cell — scrape artifact).
!pip install -U langsmith
!pip install -U langsmith
In [ ]:
Copied!
# Enable LangSmith tracing; fill in a real API key locally (never commit it).
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_API_KEY"]=""
# NOTE(review): duplicate lines (notebook-scrape artifact).
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_API_KEY"]=""
In [ ]:
Copied!
import openai
from langsmith.wrappers import wrap_openai
from langsmith import traceable

# Wrap the client so every chat-completions call is traced in LangSmith.
client = wrap_openai(openai.Client())

@traceable  # auto-trace this function as a LangSmith run
def translate_pipeline(prompt: str, user_input: str):
    """Translate `user_input` (Chinese) into Korean via gpt-4o-mini.

    Same behavior as the earlier definition, but the call (and this
    function) now appear as traces in LangSmith.
    """
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You will be provided with a sentence in Chinese, and your task is to translate it into Korean."
            },
            {
                "role": "user",
                "content": prompt + user_input
            }
        ],
        temperature=0.7,
        top_p=1
    )
    return result.choices[0].message.content

# NOTE(review): everything below duplicates the cell above (scrape artifact).
import openai
from langsmith.wrappers import wrap_openai
from langsmith import traceable
client = wrap_openai(openai.Client())

@traceable
def translate_pipeline(prompt: str, user_input: str):
    """Duplicate of the definition above (notebook-scrape artifact)."""
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You will be provided with a sentence in Chinese, and your task is to translate it into Korean."
            },
            {
                "role": "user",
                "content": prompt + user_input
            }
        ],
        temperature=0.7,
        top_p=1
    )
    return result.choices[0].message.content
In [ ]:
Copied!
# Smoke-test the traced pipeline (duplicated call — scrape artifact).
translate_pipeline("", chunks[2])
translate_pipeline("", chunks[2])
LangSmith Tracing 에 추가 정보 달기¶
- project name, tag, metadata
In [ ]:
Copied!
# Re-definition with explicit trace metadata: the LangSmith run now carries
# a run name, a project, tags and a version so runs can be filtered later.
@traceable(name="Translate Pipeline",
           project_name="Translator",
           tags=["sentence"],
           metadata={"version": "1.0"})
def translate_pipeline(prompt: str, user_input: str):
    """Translate `user_input` (Chinese) into Korean via gpt-4o-mini.

    Identical behavior to the previous definition; only the tracing
    metadata differs. Returns the model reply as a string.
    """
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You will be provided with a sentence in Chinese, and your task is to translate it into Korean."
            },
            {
                "role": "user",
                "content": prompt + user_input
            }
        ],
        temperature=0.7,
        top_p=1
    )
    return result.choices[0].message.content

# NOTE(review): everything below duplicates the cell above (scrape artifact).
@traceable(name="Translate Pipeline",
           project_name="Translator",
           tags=["sentence"],
           metadata={"version": "1.0"})
def translate_pipeline(prompt: str, user_input: str):
    """Duplicate of the definition above (notebook-scrape artifact)."""
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You will be provided with a sentence in Chinese, and your task is to translate it into Korean."
            },
            {
                "role": "user",
                "content": prompt + user_input
            }
        ],
        temperature=0.7,
        top_p=1
    )
    return result.choices[0].message.content
In [ ]:
Copied!
# Smoke-test the tagged pipeline (duplicated call — scrape artifact).
translate_pipeline("", chunks[2])
translate_pipeline("", chunks[2])
기타: 예상 비용 체크¶
- tiktoken 으로 token 수 체크
In [ ]:
Copied!
# Install tiktoken to estimate token counts / API cost
# (duplicated cell — scrape artifact).
!pip install tiktoken
!pip install tiktoken
In [ ]:
Copied!
# Cost estimate: count the tokens of the whole book with tiktoken.
import tiktoken

# cl100k_base is the encoding family used by recent OpenAI chat models.
encoding = tiktoken.get_encoding("cl100k_base")
# Tokenize the full text and report its size.
tokens = encoding.encode(text)
token_size = len(tokens)
print(f"Number of tokens: {token_size}")
# NOTE(review): duplicate of the statements above (scrape artifact).
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
tokens = encoding.encode(text)
token_size = len(tokens)
print(f"Number of tokens: {token_size}")
In [ ]:
Copied!
# Quality check: translate the first 20 chunks and eyeball the output.
for chunk in chunks[:20]:
    print(translate_pipeline("", chunk))
# NOTE(review): duplicate loop (notebook-scrape artifact) — re-translates
# the same 20 chunks, doubling API cost.
for chunk in chunks[:20]:
    print(translate_pipeline("", chunk))
- 분명히 번역을 제대로 못한 문장들이 있을 것 입니다. LangSmith 에서 표시를 해둡시다.
- feedback 으로 점수를 매겨놔도 되고, annotation queue 에 넣어놔도 됩니다.
개선 작업¶
프롬프팅을 추가할 것인데, 위 데이터를 보고 마음에 안든 것이나, 예상되는 문제를 해결해보죠.
- 중국어가 섞여 나오는 경우가 있음.
- 프롬프팅을 강하게 주자.
- 재미가 없음.
- 직접 손으로 만들어준 데이터를 넣어주자.
- 원문 -> 번역본 예시를 N 개를 넣어주자 (* Few-shot Prompting)
- DATA link : 스프레드 시트 링크
- 앞/뒤 문장들이 영향을 주지 못 함.
- 원본의 1회 단위로 넣어주자
- 앞선 내용과 이어지도록 history 를 같이 넣어주자
In [ ]:
Copied!
# Build the chapter-heading strings "第一回" ... "第一二一回" used to locate
# chapter boundaries in the Gutenberg text.
def number_to_chinese(num):
    """Render an integer (0-199) as a Chinese chapter-heading numeral.

    Matches the Gutenberg heading style: positional digits above 99
    (e.g. 120 -> '一二○'), conventional tens below (e.g. 37 -> '三十七').
    """
    digits = ('○', '一', '二', '三', '四', '五',
              '六', '七', '八', '九', '十')
    if num <= 10:
        return digits[num]
    if num < 20:
        return digits[10] + digits[num - 10]
    if num < 100:
        tens, ones = divmod(num, 10)
        base = digits[tens] + digits[10]
        return base if ones == 0 else base + digits[ones]
    # 100+: plain digit-by-digit rendering, '○' standing in for zero.
    return digits[num // 100] + digits[(num % 100) // 10] + digits[num % 10]

rounds = [f"第{number_to_chinese(i)}回" for i in range(1, 122)]
# NOTE(review): duplicate of the cell above (notebook-scrape artifact).
def number_to_chinese(num):
    """Render an integer (0-199) as a Chinese chapter-heading numeral.

    Positional digits above 99 (120 -> '一二○'), conventional tens below.
    """
    numerals = {
        0: '○', 1: '一', 2: '二', 3: '三', 4: '四', 5: '五',
        6: '六', 7: '七', 8: '八', 9: '九', 10: '十'
    }
    if num <= 10:
        return numerals[num]
    elif num < 20:
        return numerals[10] + numerals[num - 10]
    elif num < 100:
        if num % 10 == 0:
            return numerals[num // 10] + numerals[10]
        else:
            return numerals[num // 10] + numerals[10] + numerals[num % 10]
    else:
        return numerals[num // 100] + numerals[(num%100) // 10] + numerals[num % 10]

# Chapter headings 第一回 .. 第一二一回 (the comment "1 ~ 220" above is stale).
rounds = [f"第{number_to_chinese(i)}回" for i in range(1, 122)]
In [ ]:
Copied!
round_texts = []
# Group paragraph chunks into one text per 回 (chapter): accumulate chunks
# until one contains the NEXT chapter's heading; that heading chunk opens
# the next group and everything gathered before it is flushed as the
# current chapter.
current_round = 1
current_round_texts = []
for chunk in chunks:
    current_round_texts.append(chunk)
    # Bounds guard: stop matching once the heading list is exhausted
    # (the original indexed rounds[current_round] unconditionally).
    if current_round < len(rounds) and rounds[current_round] in chunk:
        subject_chunk = current_round_texts[-1]
        round_texts.append("\n".join(current_round_texts[:-1]))
        current_round_texts = [subject_chunk]
        current_round += 1
# Final flush keeps ALL remaining chunks: the original appended
# current_round_texts[:-1] here (copy of the in-loop flush), silently
# dropping the last chunk of the final chapter.
round_texts.append("\n".join(current_round_texts))
# NOTE(review): duplicate of the cell above (notebook-scrape artifact).
round_texts = []
# Accumulate chunks until one contains the next chapter heading, then flush
# everything before that heading as the current chapter's text.
current_round = 1
current_round_texts = []
for chunk in chunks:
    current_round_texts.append(chunk)
    if rounds[current_round] in chunk:
        # the heading chunk itself belongs to the next chapter
        subject_chunk = current_round_texts[-1]
        round_texts.append( "\n".join(current_round_texts[:-1]) )
        current_round_texts = [subject_chunk]
        current_round += 1
# NOTE(review): this final flush also slices [:-1], which drops the LAST
# accumulated chunk of the final chapter — looks like a copy of the
# in-loop flush; confirm it is intended.
round_texts.append("\n".join(current_round_texts[:-1]))
In [ ]:
Copied!
# Spot-check one assembled chapter (duplicated call — scrape artifact).
print(round_texts[3])
print(round_texts[3])
In [ ]:
Copied!
import pandas as pd

# Load the hand-written translation examples (few-shot data) from a public
# Google Sheet exported as CSV. The first column becomes the row index —
# presumably episode labels such as "1화", "2화"; verify against the sheet.
df = pd.read_csv('https://docs.google.com/spreadsheets/d/' +
                 '1DFK_gqMfJ34ufaMyk03crEzK5lEYEHoatmTK05WqU_0' +
                 '/export?gid=0&format=csv',
                 # Set first column as rownames in data frame
                 index_col=0,
                 )
df.head(5)
# NOTE(review): duplicate of the statements above (scrape artifact).
import pandas as pd
df = pd.read_csv('https://docs.google.com/spreadsheets/d/' +
                 '1DFK_gqMfJ34ufaMyk03crEzK5lEYEHoatmTK05WqU_0' +
                 '/export?gid=0&format=csv',
                 index_col=0,
                 )
df.head(5)
In [ ]:
Copied!
# Inspect one hand-written example translation (duplicated call — scrape artifact).
print(df.loc["1화", "김환준 문체"])
print(df.loc["1화", "김환준 문체"])
In [ ]:
Copied!
# Prompt builder for the few-shot translation pipeline.
def prompting(examples, name, latest_sentences: str, chunk: str):
    """Build the few-shot Chinese→Korean translation prompt for one chunk.

    Args:
        examples: DataFrame of hand-made translations indexed by episode
            ("1화", "2화", ...), with the original-text column "원문" and one
            style column per translator named "<name> 문체".
        name: translator whose style column supplies the example outputs.
        latest_sentences: previously generated translation, included so the
            model keeps continuity between consecutive chunks.
        chunk: the Chinese source text to translate.

    Returns:
        The complete prompt string. "///" is used as a section delimiter
        and the model is told not to reproduce it.

    Fixes vs. the original: reads from the `examples` argument instead of
    silently depending on the module-level `df`; drops the pointless
    f-prefix on the constant "원문" key; fixes the "chinse" typo in the
    instruction text.
    """
    return f"""You are a professional translator and writer.
You will be provided with Chinese sentences which is the original text of 삼국지 (The Romance of Three Kingdoms).
Your task is write a translated version of the Chinese sentences into Korean.
The result will be published in Korean online novel platform.
You have to write sentences in the style of recent web novel to be interesting.
Here is the examples.
///
Chinese Input: {examples.loc["1화", "원문"]}
///
Korean Output: {examples.loc["1화", f"{name} 문체"]}
///
///
Chinese Input: {examples.loc["2화", "원문"]}
///
Korean Output: {examples.loc["2화", f"{name} 문체"]}
///
To make a consistency between sentences i will give you sentences that translated before.
Sentences written before:
///
{latest_sentences}
///
Now here is the sentences in Chinese to translate.
///
{chunk}
///
Write sentences in Korean.
/// 라는 글자는 구분자를 쓴 것이므로 절대 넣지 말고.
원문을 번역하는 내용 외에 다른 설명도 절대 넣지 마.
"""
@traceable(name="Translate Pipeline",
           project_name="Translator",
           tags=["two-shot"],
           metadata={"version": "2.0"})  # traced in LangSmith as v2.0
def translate_pipeline(df, name, latest_sentences: str, user_input: str):
    """Translate one chunk using the few-shot prompt from `prompting`.

    Rebinds the earlier `translate_pipeline` name with a new signature:
    (examples DataFrame, translator name, previous translation, source
    chunk). Returns the model's Korean translation as a string.
    """
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You will be provided with a sentence in Chinese, and your task is to translate it into Korean."
            },
            {
                "role": "user",
                # prompting() embeds the examples, history and source chunk
                "content": prompting(df, name, latest_sentences,user_input)
            }
        ],
        temperature=0.7,
        top_p=1
    )
    return result.choices[0].message.content
# NOTE(review): duplicate of the cell above (notebook-scrape artifact).
def prompting(examples, name, latest_sentences: str, chunk: str):
    """Build the few-shot translation prompt (duplicate definition).

    NOTE(review): the `examples` parameter is ignored — the body reads the
    module-level `df` instead. They happen to be the same object at the
    call site, but the dependency is hidden.
    """
    return f"""You are a professional translator and writer.
You will be provided with Chinese sentences which is the original text of 삼국지 (The Romance of Three Kingdoms).
Your task is write a translated version of the Chinese sentences into Korean.
The result will be published in Korean online novel platform.
You have to write sentences in the style of recent web novel to be interesting.
Here is the examples.
///
Chinese Input: {df.loc["1화", f"원문"]}
///
Korean Output: {df.loc["1화", f"{name} 문체"]}
///
///
Chinese Input: {df.loc["2화", f"원문"]}
///
Korean Output: {df.loc["2화", f"{name} 문체"]}
///
To make a consistency between sentences i will give you sentences that translated before.
Sentences written before:
///
{latest_sentences}
///
Now here is the sentences in chinse to translate.
///
{chunk}
///
Write sentences in Korean.
/// 라는 글자는 구분자를 쓴 것이므로 절대 넣지 말고.
원문을 번역하는 내용 외에 다른 설명도 절대 넣지 마.
"""
# NOTE(review): duplicate of the cell above (notebook-scrape artifact).
@traceable(name="Translate Pipeline",
           project_name="Translator",
           tags=["two-shot"],
           metadata={"version": "2.0"})
def translate_pipeline(df, name, latest_sentences: str, user_input: str):
    """Duplicate of the v2 pipeline definition above."""
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You will be provided with a sentence in Chinese, and your task is to translate it into Korean."
            },
            {
                "role": "user",
                "content": prompting(df, name, latest_sentences,user_input)
            }
        ],
        temperature=0.7,
        top_p=1
    )
    return result.choices[0].message.content
In [ ]:
Copied!
# Split one chapter's text into translation-sized chunks.
def split_rounds(round_text):
    """Cut a chapter into chunks of ~40 newline-separated lines.

    Lines are grouped 40 at a time and each group is concatenated
    (without separators) into one chunk. A trailing group shorter than
    20 lines is merged into the previous group so no chunk is too small
    to translate with context.

    Fix vs. the original: the merge now only happens when a previous
    group exists — the original indexed groups[-2] unconditionally and
    raised IndexError for chapters shorter than 40 lines. The unused
    `num_sentences` local is gone.
    """
    sentences = round_text.split('\n')
    groups = [sentences[i:i + 40] for i in range(0, len(sentences), 40)]
    # Fold an undersized tail into its predecessor (when one exists).
    if len(groups) > 1 and len(groups[-1]) < 20:
        groups[-2] += groups[-1]
        del groups[-1]
    return ["".join(group) for group in groups]
# NOTE(review): duplicate of the cell above (notebook-scrape artifact).
def split_rounds(round_text):
    """Cut a chapter into chunks of ~40 newline-separated lines.

    NOTE(review): if the chapter has fewer than 40 lines (and so only one
    group shorter than 20), sentences_to_join[-2] raises IndexError.
    `num_sentences` is unused.
    """
    sentences = round_text.split('\n')
    num_sentences = len(sentences)
    sentences_to_join = [sentences[i:i+40] for i in range(0, len(sentences), 40)]
    # Merge an undersized trailing group into the previous one.
    if len(sentences_to_join[-1]) < 20:
        sentences_to_join[-2] += sentences_to_join[-1]
        del sentences_to_join[-1]
    chunks = []
    for sentence_list in sentences_to_join:
        chunk = "".join(sentence_list)
        chunks.append(chunk)
    return chunks
In [ ]:
Copied!
# Sanity-check the splitter on chapter 3 (duplicated call — scrape artifact).
splitted = split_rounds(round_texts[2])
splitted[-1]
splitted = split_rounds(round_texts[2])
splitted[-1]
In [ ]:
Copied!
name = "김진호"
# Seed the rolling context with the two hand-written example chapters so the
# first generated chunk has something to stay consistent with.
histories = [df.loc["1화", f"{name} 문체"], df.loc["2화", f"{name} 문체"]]
# Test run on chapters 3 and 4 (round_texts is 0-indexed).
i = 3
for round_text in round_texts[2:4]:
    # One output file per chapter; utf-8 pinned so the Korean text survives
    # platform-default encodings (the original relied on the default).
    with open(f"{i}.txt", "w", encoding="utf-8") as f:
        # `piece` replaces the original's inner `chunk`, which shadowed the
        # outer loop variable of the same name.
        for piece in split_rounds(round_text):
            history = histories[-1]  # most recent translation = rolling context
            result = translate_pipeline(df, name, history, piece)
            histories.append(result)
            print(" ********* ")
            print(result)
            f.write(result)
    i += 1
# NOTE(review): duplicate of the cell above (notebook-scrape artifact).
name = "김진호"
# Seed the rolling context with the two hand-written example chapters.
histories = [df.loc["1화", f"{name} 문체"], df.loc["2화", f"{name} 문체"]]
# Test run on chapters 3 and 4.
i = 3
for chunk in round_texts[2:4]:
    # NOTE(review): the inner loop reuses the name `chunk` (shadows the outer
    # chapter text) and the file is opened with the platform-default
    # encoding — confirm utf-8 on non-UTF-8 platforms.
    with open(f"{i}.txt", "w") as f:
        for chunk in split_rounds(chunk):
            history = histories[-1]  # most recent translation as context
            result = translate_pipeline(df, name, history, chunk)
            histories.append(result)
            print(" ********* ")
            print(result)
            # persist each translated piece to the chapter file
            f.write(result)
    i += 1
In [ ]:
Copied!
# Package the generated .txt files into result.zip. Note this zips the WHOLE
# directory, including the downloaded source text, not just *.txt.
# (Duplicated cell — scrape artifact.)
!zip -r ./result.zip ./
!zip -r ./result.zip ./
In [ ]:
Copied!