0% found this document useful (0 votes)
29 views

Code 1

This Python code defines functions for preprocessing datasets for machine learning including splitting data into training and test sets, one-hot encoding, min-max scaling, principal component analysis, and feature engineering.

Uploaded by

berniepinoy
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
29 views

Code 1

This Python code defines functions for preprocessing datasets for machine learning including splitting data into training and test sets, one-hot encoding, min-max scaling, principal component analysis, and feature engineering.

Uploaded by

berniepinoy
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 3

import numpy as np
import pandas as pd
import sklearn.decomposition
import sklearn.model_selection
import sklearn.preprocessing
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

def tts(dataset: pd.DataFrame,
        label_col: str,
        test_size: float,
        stratify: bool,
        random_state: int) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split *dataset* into train/test feature frames and label series.

    Args:
        dataset: Full dataset; must contain ``label_col``.
        label_col: Name of the column holding the target labels.
        test_size: Fraction (0-1) of rows assigned to the test split.
        stratify: If True, preserve label proportions across the splits.
        random_state: Seed forwarded to ``train_test_split`` for reproducibility.

    Returns:
        ``(train_features, test_features, train_labels, test_labels)``.
    """
    features = dataset.drop(columns=[label_col])
    labels = dataset[label_col]

    # stratify=None is sklearn's "no stratification" default, so the original
    # duplicated if/else branches collapse into a single call.
    train_features, test_features, train_labels, test_labels = train_test_split(
        features,
        labels,
        test_size=test_size,
        stratify=labels if stratify else None,
        random_state=random_state,
    )
    return train_features, test_features, train_labels, test_labels

class PreprocessDataset:
    """Apply a fixed preprocessing pipeline to train and test feature frames.

    Steps (each available separately or via ``preprocess_train``/``preprocess_test``):
    one-hot encoding of selected columns, min-max scaling of selected columns,
    PCA-derived columns appended to the frame, and user-supplied feature
    engineering functions.

    NOTE(review): each *_test method fits its own encoder/scaler/PCA on the
    test split rather than reusing the one fitted on train. That mirrors the
    original design, but it means train/test transformations are not guaranteed
    to agree (e.g. one-hot columns may differ if a category is absent from one
    split) — confirm this is intended by the assignment spec.
    """

    def __init__(self,
                 train_features: pd.DataFrame,
                 test_features: pd.DataFrame,
                 one_hot_encode_cols: list[str],
                 min_max_scale_cols: list[str],
                 n_components: int,
                 feature_engineering_functions: dict):
        # Feature frames mutated in place by the methods below.
        self.train_features = train_features
        self.test_features = test_features
        # Column names to one-hot encode / min-max scale.
        self.one_hot_encode_cols = one_hot_encode_cols
        self.min_max_scale_cols = min_max_scale_cols
        # Number of principal components to append.
        self.n_components = n_components
        # Mapping of new-column-name -> function(DataFrame) -> Series.
        self.feature_engineering_functions = feature_engineering_functions

    def one_hot_encode_columns_train(self) -> pd.DataFrame:
        """One-hot encode ``one_hot_encode_cols`` in the train frame, dropping the originals."""
        encoder = OneHotEncoder()
        encoded = encoder.fit_transform(self.train_features[self.one_hot_encode_cols])
        # index= keeps rows aligned: after train_test_split the frame's index is
        # shuffled/non-contiguous, and a default 0..n index would misalign concat.
        encoded_df = pd.DataFrame(
            encoded.toarray(),
            columns=encoder.get_feature_names_out(self.one_hot_encode_cols),
            index=self.train_features.index,
        )
        self.train_features = pd.concat(
            [self.train_features, encoded_df], axis=1
        ).drop(self.one_hot_encode_cols, axis=1)
        return self.train_features

    def one_hot_encode_columns_test(self) -> pd.DataFrame:
        """One-hot encode ``one_hot_encode_cols`` in the test frame, dropping the originals."""
        encoder = OneHotEncoder()
        encoded = encoder.fit_transform(self.test_features[self.one_hot_encode_cols])
        # index= keeps rows aligned with the (shuffled) test split index.
        encoded_df = pd.DataFrame(
            encoded.toarray(),
            columns=encoder.get_feature_names_out(self.one_hot_encode_cols),
            index=self.test_features.index,
        )
        self.test_features = pd.concat(
            [self.test_features, encoded_df], axis=1
        ).drop(self.one_hot_encode_cols, axis=1)
        return self.test_features

    def min_max_scaled_columns_train(self) -> pd.DataFrame:
        """Min-max scale ``min_max_scale_cols`` in the train frame, in place."""
        scaler = MinMaxScaler()
        self.train_features[self.min_max_scale_cols] = scaler.fit_transform(
            self.train_features[self.min_max_scale_cols])
        return self.train_features

    def min_max_scaled_columns_test(self) -> pd.DataFrame:
        """Min-max scale ``min_max_scale_cols`` in the test frame, in place.

        Fixes the original implementation, which referenced a nonexistent
        ``self.data`` attribute and hard-coded the columns ``['cost', 'height']``
        instead of using ``self.min_max_scale_cols`` like the train method.
        """
        scaler = MinMaxScaler()
        self.test_features[self.min_max_scale_cols] = scaler.fit_transform(
            self.test_features[self.min_max_scale_cols])
        return self.test_features

    def pca_train(self) -> pd.DataFrame:
        """Append ``n_components`` PCA columns (PCA_1..PCA_k) to the train frame."""
        pca = PCA(n_components=self.n_components)
        components = pca.fit_transform(self.train_features)
        # index= keeps PCA rows aligned with the split's shuffled index.
        pca_df = pd.DataFrame(
            components,
            columns=[f"PCA_{i + 1}" for i in range(self.n_components)],
            index=self.train_features.index,
        )
        self.train_features = pd.concat([self.train_features, pca_df], axis=1)
        return self.train_features

    def pca_test(self) -> pd.DataFrame:
        """Append ``n_components`` PCA columns (PCA_1..PCA_k) to the test frame."""
        pca = PCA(n_components=self.n_components)
        components = pca.fit_transform(self.test_features)
        pca_df = pd.DataFrame(
            components,
            columns=[f"PCA_{i + 1}" for i in range(self.n_components)],
            index=self.test_features.index,
        )
        self.test_features = pd.concat([self.test_features, pca_df], axis=1)
        return self.test_features

    def feature_engineering_train(self) -> pd.DataFrame:
        """Add one column per entry in ``feature_engineering_functions`` to the train frame."""
        for col_name, func in self.feature_engineering_functions.items():
            self.train_features[col_name] = func(self.train_features)
        return self.train_features

    def feature_engineering_test(self) -> pd.DataFrame:
        """Add one column per entry in ``feature_engineering_functions`` to the test frame."""
        for col_name, func in self.feature_engineering_functions.items():
            self.test_features[col_name] = func(self.test_features)
        return self.test_features

    def preprocess_train(self) -> pd.DataFrame:
        """Run the full pipeline (encode -> scale -> PCA -> engineer) on the train frame."""
        self.train_features = self.one_hot_encode_columns_train()
        self.train_features = self.min_max_scaled_columns_train()
        self.train_features = self.pca_train()
        self.train_features = self.feature_engineering_train()
        return self.train_features

    def preprocess_test(self) -> pd.DataFrame:
        """Run the full pipeline (encode -> scale -> PCA -> engineer) on the test frame."""
        self.test_features = self.one_hot_encode_columns_test()
        self.test_features = self.min_max_scaled_columns_test()
        self.test_features = self.pca_test()
        self.test_features = self.feature_engineering_test()
        return self.test_features

You might also like