FineTuning Process Using OpenAI
1
2.1 Loading dataset
2.1.1 AirlineTweets
[ ]: !wget -nc https://www.dropbox.com/s/lkd0eklmi64m9xm/AirlineTweets.csv?dl=0
[ ]: # wget keeps the URL's query string, so the file on disk is literally
# named "AirlineTweets.csv?dl=0" — the read below matches that name.
import pandas as pd
df = pd.read_csv('AirlineTweets.csv?dl=0')
# Quick sanity peek at the first rows (columns include 'airline', 'text',
# and 'airline_sentiment', used below).
df.head(2)
text tweet_coord \
0 @VirginAmerica What @dhepburn said. NaN
1 @VirginAmerica plus you've added commercials t… NaN
training_data = []
# System prompt shared by every training example. (String reconstructed from
# the PDF line-wrap artifacts; it matches the prompt printed later verbatim.)
system_message = "You are a helpful assistant. You are to extract the sentiment analysis from the provided airline tweets."
def create_user_message(row):
    """Format one tweet row as the user prompt the model is trained on.

    `row` must expose 'airline' and 'text' (a dataframe row or mapping).
    The prompt ends with "Airline Sentiment: " so the assistant completes
    with the label. (f-string reconstructed from PDF wrap artifacts; it
    matches the prompt printed in the inference section verbatim.)
    """
    return f"""Airline: {row['airline']}\n\nTweet: {row['text']}\n\nAirline Sentiment: """
def create_final_message(row):
    """Build one chat-format fine-tuning example from a dataframe row.

    Returns {"messages": [...]} with a system turn, a user turn, and the
    ground-truth label as the assistant turn — the JSONL record shape the
    OpenAI fine-tuning API expects (and the shape later cells index via
    ex["messages"]). The visible extracted code returned None and lacked
    the assistant turn; both are restored here.
    """
    messages = []
    messages.append({"role": "system", "content": system_message})
    user_message = create_user_message(row)
    messages.append({"role": "user", "content": user_message})
    # Label turn: supervised fine-tuning trains the model to emit this.
    messages.append({"role": "assistant", "content": row["airline_sentiment"]})
    return {"messages": messages}

create_final_message(df.iloc[0])

# .loc slicing is end-inclusive: rows 0..600 (601 training examples).
training_df = df.loc[0:600]
training_data = training_df.apply(create_final_message, axis=1).tolist()
[ ]: print(example['messages'][1]['content'])
3
Airline: Virgin America
Airline Sentiment:
# Validation slice. pandas .loc is end-inclusive and training used rows
# 0..600, so start at 601 — df.loc[600:800] would leak row 600 into both
# the training and validation sets.
validation_df = df.loc[601:800]
validation_data = validation_df.apply(create_final_message, axis=1).tolist()
[ ]: import json
import numpy as np
[ ]: training_file_name = "airline_tweets_training.jsonl"
# write_jsonl is defined elsewhere in the notebook; presumably it dumps one
# JSON object per line — the format the OpenAI Files API requires.
# NOTE(review): confirm write_jsonl json-serializes each list element.
write_jsonl(training_data, training_file_name)
validation_file_name = "airline_tweets_validation.jsonl"
write_jsonl(validation_data, validation_file_name)
[ ]: !head -n 2 airline_tweets_training.jsonl
4
print("First example:")
for message in dataset[500]["messages"]:
print(message)
Token counts
[ ]: import tiktoken
if key == "name":
num_tokens += tokens_per_name
num_tokens += 3
return num_tokens
def num_assistant_tokens_from_messages(messages):
    """Total encoded-token count across the assistant turns in `messages`.

    Uses the module-level tiktoken `encoding`; messages with other roles
    contribute nothing.
    """
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )
# Validate the prepared dataset: count malformed examples and collect
# per-example token lengths (mirrors OpenAI's data-prep cookbook checks).
num_missing_system = 0
num_missing_user = 0
num_messages = []
message_all_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        num_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        num_missing_user += 1
    num_messages.append(len(messages))
    message_all_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

# Count examples exceeding the fine-tuning context limit; the extracted
# code used num_sample_token_long without ever computing it.
num_sample_token_long = sum(length > 4096 for length in message_all_lens)
if num_sample_token_long > 0:
    print(f"\n{num_sample_token_long} messages may be over the 4096 token limit, they will be truncated during fine-tuning")
else:
    print("\nNo message will be truncated during fine-tuning")
Some statistics:
min=53, max=121
p5=61.0, p95=89.0
Examples of token counts from the first messages
[60, 64, 66, 77, 63]
# Epoch-count heuristic mirrored from OpenAI's fine-tuning data-prep
# cookbook: scale epochs so the total number of trained examples
# (examples x epochs) stays within the target band. (A stray page-number
# literal from the PDF extraction has been removed from this cell.)
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    # Too few examples: raise epochs (capped) so the model sees enough data.
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    # Too many examples: lower epochs (floored) to bound training cost.
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
Mounted at /content/drive
[ ]:
7
3 Fine Tuning process
[ ]: from google.colab import userdata
# Read the API key from Colab's secret store — never hardcode credentials
# in a notebook that will be shared.
openai_api_key = userdata.get('OPENAI_API_KEY')
[ ]: training_file_creation
[ ]: FileObject(id='file-513TXhzAGIypvaUt2RqKLW39', bytes=234597,
created_at=1700860036, filename='airline_tweets_training.jsonl', object='file',
purpose='fine-tune', status='processed', status_details=None)
[ ]: id_file_training = training_file_creation.id
[ ]: validation_file_creation = client.files.create(
file=open(validation_file_name, "rb"),
purpose='fine-tune'
)
# The upload returns a FileObject; its .id is passed to the fine-tuning job.
8
[ ]: validation_file_creation
[ ]: FileObject(id='file-aOQYL6VpOFbu8QdvOP5Q06ZI', bytes=77406,
created_at=1700860056, filename='airline_tweets_validation.jsonl',
object='file', purpose='fine-tune', status='processed', status_details=None)
[ ]: id_file_validation = validation_file_creation.id
[ ]: ## To delete files
# client.files.delete(validation_file_creation.id)
[ ]: fine_tuning_job
# Status is "validating_files" right after creation; it then moves through
# queued and running, and ends at succeeded, failed, or cancelled.
[ ]: FineTuningJob(id='ftjob-OWg7WZaUPIEnFFOJgBLTS5PO', created_at=1700860459,
error=None, fine_tuned_model=None, finished_at=None,
hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto',
learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613',
object='fine_tuning.job', organization_id='org-PKDk6D4mPARkEfXOj2JB21sK',
result_files=[], status='validating_files', trained_tokens=None,
training_file='file-513TXhzAGIypvaUt2RqKLW39', validation_file='file-
aOQYL6VpOFbu8QdvOP5Q06ZI')
[ ]: job_id = fine_tuning_job.id
job_id
[ ]: 'ftjob-OWg7WZaUPIEnFFOJgBLTS5PO'
[ ]: # Fine-tuning can take a while; you will receive an email once the job is finished.
9
[ ]: client.fine_tuning.jobs.list()#.data[0]
[ ]: SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-
OWg7WZaUPIEnFFOJgBLTS5PO', created_at=1700860459, error=None,
fine_tuned_model='ft:gpt-3.5-turbo-0613:personal:airline-sentiment:8OYMFXDs',
finished_at=1700862634, hyperparameters=Hyperparameters(n_epochs=3,
batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613',
object='fine_tuning.job', organization_id='org-PKDk6D4mPARkEfXOj2JB21sK',
result_files=['file-anvVhIrPGl9TE0LKWGOMFsIK'], status='succeeded',
trained_tokens=131259, training_file='file-513TXhzAGIypvaUt2RqKLW39',
validation_file='file-aOQYL6VpOFbu8QdvOP5Q06ZI'), FineTuningJob(id='ftjob-
sGIus9vadEOTVEgSRPuu0sWZ', created_at=1697907644, error=None,
fine_tuned_model='ft:gpt-3.5-turbo-0613:personal:ner-recipe:8C9ptbha',
finished_at=1697908436, hyperparameters=Hyperparameters(n_epochs=3,
batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613',
object='fine_tuning.job', organization_id='org-PKDk6D4mPARkEfXOj2JB21sK',
result_files=['file-pBZUwlvYMQ5mbpFftjKSy76k'], status='succeeded',
trained_tokens=40047, training_file='file-tBa8aMquvTNAqTHYJfzJBYMl',
validation_file='file-PeCozuyv5cIshHhBoqACv3s3')], object='list',
has_more=False)
# ==> "model" is the base model the job was fine-tuned from: 'gpt-3.5-turbo-0613'
[ ]: client.fine_tuning.jobs.list().data[0]
# job_id = 'ftjob-OWg7WZaUPIEnFFOJgBLTS5PO'
# job_id = client.fine_tuning.jobs.list().data[0].id
[ ]: FineTuningJob(id='ftjob-OWg7WZaUPIEnFFOJgBLTS5PO', created_at=1700860459,
error=None, fine_tuned_model='ft:gpt-3.5-turbo-0613:personal:airline-
sentiment:8OYMFXDs', finished_at=1700862634,
hyperparameters=Hyperparameters(n_epochs=3, batch_size=1,
learning_rate_multiplier=2), model='gpt-3.5-turbo-0613',
object='fine_tuning.job', organization_id='org-PKDk6D4mPARkEfXOj2JB21sK',
result_files=['file-anvVhIrPGl9TE0LKWGOMFsIK'], status='succeeded',
trained_tokens=131259, training_file='file-513TXhzAGIypvaUt2RqKLW39',
validation_file='file-aOQYL6VpOFbu8QdvOP5Q06ZI')
10
3.4 Cancel a fine-tuned job
[ ]: # client.fine_tuning.jobs.cancel(fine_tuning_job_id = job_id)
[ ]: FineTuningJob(id='ftjob-OWg7WZaUPIEnFFOJgBLTS5PO', created_at=1700860459,
error=None, fine_tuned_model='ft:gpt-3.5-turbo-0613:personal:airline-
sentiment:8OYMFXDs', finished_at=1700862634,
hyperparameters=Hyperparameters(n_epochs=3, batch_size=1,
learning_rate_multiplier=2), model='gpt-3.5-turbo-0613',
object='fine_tuning.job', organization_id='org-PKDk6D4mPARkEfXOj2JB21sK',
result_files=['file-anvVhIrPGl9TE0LKWGOMFsIK'], status='succeeded',
trained_tokens=131259, training_file='file-513TXhzAGIypvaUt2RqKLW39',
validation_file='file-aOQYL6VpOFbu8QdvOP5Q06ZI')
[ ]: retrieve_fine_tuned_job
[ ]: FineTuningJob(id='ftjob-OWg7WZaUPIEnFFOJgBLTS5PO', created_at=1700860459,
error=None, fine_tuned_model=None, finished_at=None,
hyperparameters=Hyperparameters(n_epochs=3, batch_size=1,
learning_rate_multiplier=2), model='gpt-3.5-turbo-0613',
object='fine_tuning.job', organization_id='org-PKDk6D4mPARkEfXOj2JB21sK',
result_files=[], status='running', trained_tokens=None,
training_file='file-513TXhzAGIypvaUt2RqKLW39', validation_file='file-
aOQYL6VpOFbu8QdvOP5Q06ZI')
events = events_list_job.data
events.reverse()
11
Step 101/1803: training loss=1.30, validation loss=5.98
Step 201/1803: training loss=0.00, validation loss=3.96
Step 301/1803: training loss=0.00, validation loss=3.28
Step 401/1803: training loss=0.00, validation loss=0.00
Step 501/1803: training loss=6.93, validation loss=0.00
Step 601/1803: training loss=0.00, validation loss=0.00
Step 701/1803: training loss=0.00, validation loss=0.00
Step 801/1803: training loss=0.00, validation loss=0.00
Step 901/1803: training loss=0.00, validation loss=0.00
Step 1001/1803: training loss=5.63, validation loss=0.00
Step 1101/1803: training loss=0.00, validation loss=0.00
Step 1201/1803: training loss=0.00, validation loss=0.56
Step 1301/1803: training loss=0.00, validation loss=0.00
Step 1401/1803: training loss=0.00, validation loss=0.00
Step 1501/1803: training loss=0.00, validation loss=6.16
Step 1601/1803: training loss=0.20, validation loss=0.00
Step 1701/1803: training loss=0.00, validation loss=0.00
Step 1801/1803: training loss=0.00, validation loss=0.00
New fine-tuned model created: ft:gpt-3.5-turbo-0613:personal:airline-
sentiment:8OYMFXDs
The job has successfully completed
4 Inference
4.1 Model name once fine-tuned is finished
retrieve_fine_tuned_job = client.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model_id = retrieve_fine_tuned_job.fine_tuned_model

# fine_tuned_model stays None until the job reaches status "succeeded",
# so fail fast with an actionable message instead of passing None around.
if fine_tuned_model_id is None:
    raise RuntimeError("Fine-tuned model ID not found. Your job has likely not been completed yet.")
[ ]: (14640, 15)
12
4.2.1 Using chat completion for a given tweet example
# Hold-out test slice. pandas .loc is end-inclusive and validation covered
# rows through 800, so start at 801 — df.loc[800:1000] would leak row 800
# into both the validation and test sets.
test_df = df.loc[801:1000]
test_row = test_df.iloc[10]

test_messages = []
test_messages.append({"role": "system", "content": system_message})
user_message = create_user_message(test_row)
# Reuse the value computed above; the original called create_user_message
# a second time and left `user_message` unused.
test_messages.append({"role": "user", "content": user_message})
print(test_messages)
[{'role': 'system', 'content': 'You are a helpful assistant. You are to extract
the sentiment analysis from the provided airline tweets.'}, {'role': 'user',
'content': 'Airline: United\n\nTweet: @united Your website deserves a new
design. #html5 FTW!\n\nAirline Sentiment: '}]
[ ]: response = client.chat.completions.create(
model=fine_tuned_model_id,
messages=test_messages
)
print(f"Tweet = {test_row['text']}")
print(f"\nrole: {response.choices[0].message.role}, content : {response.
↪choices[0].message.content}")
def create_final_message_test(row):
    """Build the inference-time message list for one row.

    Unlike create_final_message, this has no assistant turn — the
    fine-tuned model supplies the sentiment as its completion.
    """
    test_messages = []
    test_messages.append({"role": "system", "content": system_message})
    user_message = create_user_message(row)
    # Reuse the computed prompt; the original called create_user_message
    # twice and left `user_message` unused.
    test_messages.append({"role": "user", "content": user_message})
    return test_messages
[ ]: %%time
finetuned_model_resp =[]
for i in range(0, len(test_df)):
row = test_df.iloc[i]
messages_test = create_final_message_test(row)
response = client.chat.completions.create(
model=fine_tuned_model_id,
13
messages=messages_test
)
# print(f"\nTweet = {row['text']}")
# print(f"role: {response.choices[0].message.role}, content : {response.
↪choices[0].message.content}")
# print(f"Tweet = {row['airline_sentiment']}")
finetuned_model_resp.append(response.choices[0].message.content)
# Work on an explicit copy: test_df is a slice of df, so assigning a new
# column on it raised pandas' SettingWithCopyWarning (visible in the
# original output). Copying makes the ownership explicit and silences it.
test_df = test_df.copy()
test_df['result'] = finetuned_model_resp

y_true = test_df['airline_sentiment'].values
accuracy = accuracy_score(y_true, finetuned_model_resp)
# average=None reports one score per class (negative / neutral / positive).
precision = precision_score(y_true, finetuned_model_resp, average=None)
recall = recall_score(y_true, finetuned_model_resp, average=None)
f1 = f1_score(y_true, finetuned_model_resp, average=None)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
Accuracy: 0.8557213930348259
Precision: [0.93382353 0.64864865 0.75 ]
Recall: [0.91366906 0.66666667 0.80769231]
F1 Score: [0.92363636 0.65753425 0.77777778]
14