caso2lau
caso2lau
February 7, 2025
# Load the dataset (semicolon-delimited CSV, read with Latin-1 encoding)
df = pd.read_csv('Datos_complexivo.csv', encoding="latin1", delimiter=";")
index State Account length Area code International plan Voice mail plan \
0 0 LA 117 408 No No
1 1 IN 65 415 No No
2 2 NY 161 415 No No
3 3 SC 111 415 No No
4 4 HI 49 510 No No
1
1 NaN 129.1 137
2 NaN 500.0 67
3 NaN 110.4 103
4 NaN 119.3 117
[5 rows x 21 columns]
2
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (6.0.2)
Requirement already satisfied: jinja2<3.2,>=2.11.1 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (3.1.5)
Requirement already satisfied: visions<0.8.0,>=0.7.5 in
/usr/local/lib/python3.11/dist-packages (from
visions[type_image_path]<0.8.0,>=0.7.5->ydata-profiling) (0.7.6)
Requirement already satisfied: numpy<2.2,>=1.16.0 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (1.26.4)
Requirement already satisfied: htmlmin==0.1.12 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (0.1.12)
Requirement already satisfied: phik<0.13,>=0.11.1 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (0.12.4)
Requirement already satisfied: requests<3,>=2.24.0 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (2.32.3)
Requirement already satisfied: tqdm<5,>=4.48.2 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (4.67.1)
Requirement already satisfied: seaborn<0.14,>=0.10.1 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (0.13.2)
Requirement already satisfied: multimethod<2,>=1.4 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (1.12)
Requirement already satisfied: statsmodels<1,>=0.13.2 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (0.14.4)
Requirement already satisfied: typeguard<5,>=3 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (4.4.1)
Requirement already satisfied: imagehash==4.3.1 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (4.3.1)
Requirement already satisfied: wordcloud>=1.9.3 in
/usr/local/lib/python3.11/dist-packages (from ydata-profiling) (1.9.4)
Requirement already satisfied: dacite>=1.8 in /usr/local/lib/python3.11/dist-
packages (from ydata-profiling) (1.9.2)
Requirement already satisfied: PyWavelets in /usr/local/lib/python3.11/dist-
packages (from imagehash==4.3.1->ydata-profiling) (1.8.0)
Requirement already satisfied: pillow in /usr/local/lib/python3.11/dist-packages
(from imagehash==4.3.1->ydata-profiling) (11.1.0)
Requirement already satisfied: MarkupSafe>=2.0 in
/usr/local/lib/python3.11/dist-packages (from jinja2<3.2,>=2.11.1->ydata-
profiling) (3.0.2)
Requirement already satisfied: contourpy>=1.0.1 in
/usr/local/lib/python3.11/dist-packages (from matplotlib>=3.5->ydata-profiling)
(1.3.1)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-
packages (from matplotlib>=3.5->ydata-profiling) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in
/usr/local/lib/python3.11/dist-packages (from matplotlib>=3.5->ydata-profiling)
(4.55.7)
Requirement already satisfied: kiwisolver>=1.3.1 in
/usr/local/lib/python3.11/dist-packages (from matplotlib>=3.5->ydata-profiling)
(1.4.8)
3
Requirement already satisfied: packaging>=20.0 in
/usr/local/lib/python3.11/dist-packages (from matplotlib>=3.5->ydata-profiling)
(24.2)
Requirement already satisfied: pyparsing>=2.3.1 in
/usr/local/lib/python3.11/dist-packages (from matplotlib>=3.5->ydata-profiling)
(3.2.1)
Requirement already satisfied: python-dateutil>=2.7 in
/usr/local/lib/python3.11/dist-packages (from matplotlib>=3.5->ydata-profiling)
(2.8.2)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-
packages (from pandas!=1.4.0,<3,>1.1->ydata-profiling) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-
packages (from pandas!=1.4.0,<3,>1.1->ydata-profiling) (2025.1)
Requirement already satisfied: joblib>=0.14.1 in /usr/local/lib/python3.11/dist-
packages (from phik<0.13,>=0.11.1->ydata-profiling) (1.4.2)
Requirement already satisfied: annotated-types>=0.6.0 in
/usr/local/lib/python3.11/dist-packages (from pydantic>=2->ydata-profiling)
(0.7.0)
Requirement already satisfied: pydantic-core==2.27.2 in
/usr/local/lib/python3.11/dist-packages (from pydantic>=2->ydata-profiling)
(2.27.2)
Requirement already satisfied: typing-extensions>=4.12.2 in
/usr/local/lib/python3.11/dist-packages (from pydantic>=2->ydata-profiling)
(4.12.2)
Requirement already satisfied: charset-normalizer<4,>=2 in
/usr/local/lib/python3.11/dist-packages (from requests<3,>=2.24.0->ydata-
profiling) (3.4.1)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-
packages (from requests<3,>=2.24.0->ydata-profiling) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in
/usr/local/lib/python3.11/dist-packages (from requests<3,>=2.24.0->ydata-
profiling) (2.3.0)
Requirement already satisfied: certifi>=2017.4.17 in
/usr/local/lib/python3.11/dist-packages (from requests<3,>=2.24.0->ydata-
profiling) (2024.12.14)
Requirement already satisfied: patsy>=0.5.6 in /usr/local/lib/python3.11/dist-
packages (from statsmodels<1,>=0.13.2->ydata-profiling) (1.0.1)
Requirement already satisfied: attrs>=19.3.0 in /usr/local/lib/python3.11/dist-
packages (from
visions<0.8.0,>=0.7.5->visions[type_image_path]<0.8.0,>=0.7.5->ydata-profiling)
(25.1.0)
Requirement already satisfied: networkx>=2.4 in /usr/local/lib/python3.11/dist-
packages (from
visions<0.8.0,>=0.7.5->visions[type_image_path]<0.8.0,>=0.7.5->ydata-profiling)
(3.4.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-
packages (from python-dateutil>=2.7->matplotlib>=3.5->ydata-profiling) (1.17.0)
4
[6]: #from ydata_profiling import ProfileReport
# Mostrar el informe
#profile.to_notebook_iframe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 index 3333 non-null int64
1 State 3333 non-null object
2 Account length 3333 non-null int64
3 Area code 3333 non-null int64
4 International plan 3333 non-null object
5 Voice mail plan 3333 non-null object
6 Number vmail messages 922 non-null float64
7 Total day minutes 3333 non-null float64
8 Total day calls 3333 non-null int64
9 Total day charge 3333 non-null float64
10 Total eve minutes 3333 non-null float64
11 Total eve calls 3333 non-null int64
12 Total eve charge 3333 non-null float64
13 Total night minutes 3333 non-null float64
14 Total night calls 3333 non-null int64
15 Total night charge 3333 non-null float64
16 Total intl minutes 3333 non-null float64
17 Total intl calls 1998 non-null float64
18 Total intl charge 3333 non-null float64
19 Customer service calls 3333 non-null int64
20 Churn 3333 non-null bool
dtypes: bool(1), float64(10), int64(7), object(3)
memory usage: 524.2+ KB
[48]: index 0
State 0
Account length 0
Area code 0
5
International plan 0
Voice mail plan 0
Number vmail messages 2411
Total day minutes 0
Total day calls 0
Total day charge 0
Total eve minutes 0
Total eve calls 0
Total eve charge 0
Total night minutes 0
Total night calls 0
Total night charge 0
Total intl minutes 0
Total intl calls 1335
Total intl charge 0
Customer service calls 0
Churn 0
dtype: int64
[49]: df.duplicated().sum()
[49]: 0
6
count 3333.000000 3333.000000 3333.000000
mean 203.522382 100.114311 17.083540
std 62.610089 19.922625 4.310668
min 0.000000 0.000000 0.000000
25% 166.600000 87.000000 14.160000
50% 201.400000 100.000000 17.120000
75% 235.300000 114.000000 20.000000
max 600.000000 170.000000 30.910000
Análisis univariado
Distribución de las variables
[51]: print('The percentage of customers churning from the company is: %{}'.
↪format((df['Churn'].sum()) *100/df.shape[0]) )
7
[52]: import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12, 8))
for i, var in enumerate(variables_categoricas, 1):
plt.subplot(2, 2, i)
sns.countplot(x=df[var], palette="viridis")
plt.title(f"Distribución de {var}")
plt.ylabel("Frecuencia")
plt.tight_layout()
plt.show()
8
"Total eve calls","Total eve charge","Total night␣
↪minutes","Total night calls",
"Total night charge","Total intl minutes","Total intl␣
↪calls","Total intl charge",
9
Relación entre churn y llamadas al servicio al cliente
[55]: sns.barplot(x='Churn', y='Customer service calls',data=df, palette="viridis")
10
[56]: # Count rows for every (Churn, International plan) combination and plot the counts.
# NOTE(review): the variable name suggests voice mail, but the grouping uses
# 'International plan' — confirm which column was intended.
churn_voicem = df.groupby(['Churn','International plan']).size()
churn_voicem.plot()
plt.show()
11
Distribución de variables respecto del churn
[59]: variables_numericas = ["Account length","Number vmail messages",
"Total day minutes","Total day calls","Total day␣
↪charge","Total eve minutes",
filas = 4
columnas = 4
fig, axes = plt.subplots(filas, columnas, figsize=(15, 15))
colores = ["#1f77b4", "#ff7f0e"]
12
sns.histplot(df, x=col, hue="Churn", kde=True, bins=30, palette=colores,␣
alpha=0.8, ax=ax)
↪
Análisis multivariado
13
[60]: # Correlation matrix of the numeric variables
correlation_matrix = df[variables_numericas].corr()
14
[62]: filas = 4
columnas = 4
df_numeric = df.select_dtypes(include=[np.number])
fig, axes = plt.subplots(filas, columnas, figsize=(16, 12))
fig.suptitle("Diagrama de cajas y bigotes - Variables numéricas", fontsize=16,␣
↪fontweight="bold")
axes = axes.flatten()
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
15
3 Preparación de los Datos
[64]: # Impute null values in 'Number vmail messages' with the column median
# NOTE(review): most rows in this column are null; if the nulls belong to
# customers without a voice mail plan, 0 may be the correct fill value
# instead of the median — confirm against the data.
df.loc[:, 'Number vmail messages'] = df['Number vmail messages'].
↪fillna(df['Number vmail messages'].median())
16
# Apply target encoding to "State"
# NOTE(review): the encoder is fitted on the whole of df using the target; if
# the train/test split happens after this point, target information leaks
# into the test rows — consider fitting on the training split only.
target_encoder = TargetEncoder(cols=['State'])
df['State'] = target_encoder.fit_transform(df['State'], df['Churn'])
Manejo de outliers
[68]: def replace_outliers(series):
# Calcular los cuartiles y los límites para los outliers
Q1 = series.quantile(0.25)
Q3 = series.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
[70]: filas = 4
columnas = 4
df_numeric = df.select_dtypes(include=[np.number])
fig, axes = plt.subplots(filas, columnas, figsize=(16, 12))
fig.suptitle("Diagrama de cajas y bigotes - Variables numéricas", fontsize=16,␣
↪fontweight="bold")
axes = axes.flatten()
17
sns.boxplot(y=df_numeric[col], ax=axes[i], palette="cool")
axes[i].set_title(col, fontsize=12)
axes[i].grid(True, linestyle="--", alpha=0.7)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
4 Modelado
Suavizar el desbalance de la variable objetivo
[26]: !pip install --upgrade scikit-learn
18
Requirement already satisfied: threadpoolctl>=3.1.0 in
/usr/local/lib/python3.11/dist-packages (from scikit-learn) (3.5.0)
# Leer los datos, asumiendo que el archivo CSV ya está cargado en 'df'
# df = pd.read_csv("tu_archivo.csv")
modelo_bagging.fit(X_train_balanced, y_train_balanced)
19
# Plot the feature importances as a bar chart, ordered by `indices`
plt.figure(figsize=(12, 6))
plt.title("Importancia de las características")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
# Label each bar with the matching column name, rotated to avoid overlap
plt.xticks(range(X_train.shape[1]), X_train.columns[indices], rotation=90)
# Leave one unit of margin on each side of the bars
plt.xlim([-1, X_train.shape[1]])
plt.show()
20
std 49.358279 19.589178 4.254357
min 64.300000 48.000000 5.470000
25% 167.200000 87.000000 14.210000
50% 201.000000 100.000000 17.090000
75% 236.000000 114.000000 20.060000
max 319.300000 154.000000 28.650000
21
[[821 33]
[ 51 95]]
precision recall f1-score support
22
y = df['Churn']
modelo_bagging.fit(X_train, y_train)
plt.xlabel('Número de árboles')
plt.ylabel('Error OOB')
plt.title('Error OOB vs. Número de árboles en el Random Forest')
plt.show()
plt.figure(figsize=(12, 6))
plt.title('Importancia de las características')
plt.bar(range(X_train.shape[1]), importances[indices], align='center',␣
↪color='firebrick')
23
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
[[847 7]
[ 60 86]]
precision recall f1-score support
24
False 0.93 0.99 0.96 854
True 0.92 0.59 0.72 146
proba = modelo_bagging.predict_proba(X_test)[:, 1]
25
plt.ylabel('Lift')
plt.title('Curva Lift')
plt.show()
26
27