# Code 1
# Code 1
import pandas as pd
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.model_selection
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): the enclosing `def` line and all indentation are missing from
# this excerpt. The statements below read like the body of a train/test split
# helper that receives `features`, `labels`, `test_size`, `stratify` and
# `random_state` -- confirm the signature against the original file.
if stratify:
# Stratified branch: `labels` is also passed as `stratify`, so scikit-learn
# splits in a stratified fashion using the labels as the class reference.
train_features, test_features, train_labels, test_labels =
train_test_split(features, labels,
test_size=test_size,
stratify=labels,
random_state=random_state)
else:
# Non-stratified branch: only `test_size` and `random_state` are forwarded.
train_features, test_features, train_labels, test_labels =
train_test_split(features, labels,
test_size=test_size,
random_state=random_state)
# Both feature partitions come first, then both label partitions, mirroring
# the unpacking order used above.
return train_features, test_features, train_labels, test_labels
class PreprocessDataset:
    """Bundle train/test feature tables with their preprocessing settings.

    The constructor only stores its arguments on the instance; it performs no
    transformation and does not copy the frames it is given.

    Attributes:
        train_features: Training feature table, stored as passed in.
        test_features: Test feature table, stored as passed in.
        one_hot_encode_cols: Names of the columns selected for one-hot
            encoding.
        min_max_scale_cols: Names of the columns selected for min-max
            scaling.
        n_components: Component count; presumably consumed by the PCA step
            this file imports -- TODO confirm against the methods that use it.
        feature_engineering_functions: Dictionary of feature-engineering
            entries. Its key/value schema is not visible in this part of the
            file -- TODO confirm.
    """

    def __init__(
        self,
        train_features: pd.DataFrame,
        test_features: pd.DataFrame,
        one_hot_encode_cols: list[str],
        min_max_scale_cols: list[str],
        n_components: int,
        feature_engineering_functions: dict,
    ) -> None:
        """Store the datasets and preprocessing configuration unchanged.

        Args:
            train_features: Training feature table.
            test_features: Test feature table.
            one_hot_encode_cols: Columns selected for one-hot encoding.
            min_max_scale_cols: Columns selected for min-max scaling.
            n_components: Component count kept for later use.
            feature_engineering_functions: Dictionary of feature-engineering
                entries kept for later use.
        """
        # Data: references are kept as-is (no defensive copy), matching the
        # original assignments.
        self.train_features = train_features
        self.test_features = test_features

        # Configuration consumed by the preprocessing steps.
        self.one_hot_encode_cols = one_hot_encode_cols
        self.min_max_scale_cols = min_max_scale_cols
        self.n_components = n_components
        self.feature_engineering_functions = feature_engineering_functions
axis=1)
return self.train_features
columns=encoder.get_feature_names_out(self.one_hot_encode_cols))
self.test_features = pd.concat([self.test_features, encoded_df],
axis=1).drop(self.one_hot_encode_cols, axis=1)
return self.test_features
# Initialize MinMaxScaler
scaler = MinMaxScaler()
return min_max_scaled_dataset