0% found this document useful (0 votes)
31 views

Week-5 - Jupyter Notebook

Uploaded by

pramidibalu2005
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
31 views

Week-5 - Jupyter Notebook

Uploaded by

pramidibalu2005
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 9

8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [1]: import sklearn


import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]: from sklearn.model_selection import train_test_split


from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn import metrics

In [5]: import warnings

In [6]: warnings.filterwarnings('ignore')

In [11]: # Load the dataset: read TSLA.csv (path relative to the working directory) into `df`

In [7]: df=pd.read_csv('TSLA.csv')

In [9]: df.head()

Out[9]:
Date Open High Low Close Adj Close Volume

0 2010-06-29 19.000000 25.00 17.540001 23.889999 23.889999 18766300

1 2010-06-30 25.790001 30.42 23.299999 23.830000 23.830000 17187100

2 2010-07-01 25.000000 25.92 20.270000 21.959999 21.959999 8218800

3 2010-07-02 23.000000 23.10 18.709999 19.200001 19.200001 5139800

4 2010-07-06 20.000000 20.00 15.830000 16.110001 16.110001 6866900

In [10]: df.tail()

Out[10]:
Date Open High Low Close Adj Close Volume

2411 2020-01-28 568.489990 576.809998 558.080017 566.900024 566.900024 11788500

2412 2020-01-29 575.690002 589.799988 567.429993 580.989990 580.989990 17801500

2413 2020-01-30 632.419983 650.880005 618.000000 640.809998 640.809998 29005700

2414 2020-01-31 640.000000 653.000000 632.520020 650.570007 650.570007 15719300

2415 2020-02-03 673.690002 786.140015 673.520020 780.000000 780.000000 47065000

In [12]: #EXPLORE dimensions


print('number of data columns:',df.shape[1],'\nnumber of data rows:',df.shape[0])

number of data columns: 7


number of data rows: 2416

localhost:8888/notebooks/Week-5.ipynb# 1/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [13]: df.describe()

Out[13]:
Open High Low Close Adj Close Volume

count 2416.000000 2416.000000 2416.000000 2416.000000 2416.000000 2.416000e+03

mean 186.271147 189.578224 182.916639 186.403651 186.403651 5.572722e+06

std 118.740163 120.892329 116.857591 119.136020 119.136020 4.987809e+06

min 16.139999 16.629999 14.980000 15.800000 15.800000 1.185000e+05

25% 34.342498 34.897501 33.587501 34.400002 34.400002 1.899275e+06

50% 213.035004 216.745002 208.870002 212.960007 212.960007 4.578400e+06

75% 266.450012 270.927513 262.102501 266.774994 266.774994 7.361150e+06

max 673.690002 786.140015 673.520020 780.000000 780.000000 4.706500e+07

In [14]: df.info() #Summary of dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2416 entries, 0 to 2415
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 2416 non-null object
1 Open 2416 non-null float64
2 High 2416 non-null float64
3 Low 2416 non-null float64
4 Close 2416 non-null float64
5 Adj Close 2416 non-null float64
6 Volume 2416 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 132.2+ KB

In [15]: df['date']=pd.to_datetime(df.Date)

In [16]: df.date.dtype

Out[16]: dtype('<M8[ns]')

EXPLORATORY DATA ANALYSIS

localhost:8888/notebooks/Week-5.ipynb# 2/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [17]: plt.figure(figsize=(15,5))
sns.lineplot(data=df,x='date',y='Close')
plt.title('Tesla Close Price',fontsize=15)
plt.ylabel('Price in dollars')
plt.show()

In [20]: # Check whether 'Close' and 'Adj Close' hold the same value on every row

In [19]: df[df['Close']==df['Adj Close']].shape  # rows where the two columns match; compare with df.shape[0]

Out[19]: (2416, 8)

In [21]: df.drop(['Adj Close','date'],axis=1,inplace=True)

In [23]: df.head()

Out[23]:
Date Open High Low Close Volume

0 2010-06-29 19.000000 25.00 17.540001 23.889999 18766300

1 2010-06-30 25.790001 30.42 23.299999 23.830000 17187100

2 2010-07-01 25.000000 25.92 20.270000 21.959999 8218800

3 2010-07-02 23.000000 23.10 18.709999 19.200001 5139800

4 2010-07-06 20.000000 20.00 15.830000 16.110001 6866900

In [24]: df.isnull().sum()

Out[24]: Date 0
Open 0
High 0
Low 0
Close 0
Volume 0
dtype: int64

localhost:8888/notebooks/Week-5.ipynb# 3/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [26]: features=['Open','High','Low','Close','Volume']
plt.subplots(figsize=(20,10))
for i,col in enumerate(features):
plt.subplot(2,3,i+1)
sns.distplot(df[col])
plt.show()

In [28]: #For outliers

In [27]: plt.subplots(figsize=(20,10))
for i,col in enumerate(features):
plt.subplot(2,3,i+1)
sns.boxplot(df[col])
plt.show()

FEATURE ENGINEERING

localhost:8888/notebooks/Week-5.ipynb# 4/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

Feature Construction

In [29]: splitted=df['Date'].str.split('-',expand=True)

In [30]: df['Day']=splitted[2].astype('int')
df['Month']=splitted[1].astype('int')
df['Year']=splitted[0].astype('int')

In [31]: df.drop('Date',axis=1,inplace=True)

In [32]: df.head()

Out[32]:
Open High Low Close Volume Day Month Year

0 19.000000 25.00 17.540001 23.889999 18766300 29 6 2010

1 25.790001 30.42 23.299999 23.830000 17187100 30 6 2010

2 25.000000 25.92 20.270000 21.959999 8218800 1 7 2010

3 23.000000 23.10 18.709999 19.200001 5139800 2 7 2010

4 20.000000 20.00 15.830000 16.110001 6866900 6 7 2010

Quarter-end flag: `is_quarter_end` is 1 when Month is 3, 6, 9 or 12, and 0 otherwise.

In [33]: df['is_quarter_end']=np.where(df['Month']%3==0,1,0)

In [34]: df.head()

Out[34]:
Open High Low Close Volume Day Month Year is_quarter_end

0 19.000000 25.00 17.540001 23.889999 18766300 29 6 2010 1

1 25.790001 30.42 23.299999 23.830000 17187100 30 6 2010 1

2 25.000000 25.92 20.270000 21.959999 8218800 1 7 2010 0

3 23.000000 23.10 18.709999 19.200001 5139800 2 7 2010 0

4 20.000000 20.00 15.830000 16.110001 6866900 6 7 2010 0

In [39]: df.grouped=df.groupby('Year').mean() #GROUPING BY YEAR

localhost:8888/notebooks/Week-5.ipynb# 5/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [40]: plt.subplots(figsize=(20,10))
for i,col in enumerate(['Open','High','Low','Close']):
plt.subplot(2,2,i+1)
data_grouped[col].plot.bar()
plt.show()

In [42]: df.groupby('is_quarter_end').mean()

Out[42]:
Open High Low Close Volume Day Month

is_quarter_end

0 185.875081 189.254226 182.449499 186.085081 5.767062e+06 15.710396 6.173886 2014.

1 187.071200 190.232700 183.860262 187.047163 5.180154e+06 15.825000 7.597500 2014.

In [43]: df['open-close']=df['Open']-df['Close']
df['high-low']=df['High']-df['Low']
df['target']=np.where(df['Close'].shift(-1) > df['Close'],1,0)

localhost:8888/notebooks/Week-5.ipynb# 6/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [44]: plt.figure(figsize=(10,10))
sns.heatmap(df.corr()>0.9,annot=True,cbar=False)
plt.show()

localhost:8888/notebooks/Week-5.ipynb# 7/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [45]: plt.pie(df['target'].value_counts().values,labels=[0,1],autopct='%1.1f%%')
plt.show()

In [46]: features=df[['open-close','high-low','is_quarter_end']]
target=df['target']

In [47]: scaler=StandardScaler()
features=scaler.fit_transform(features)

In [48]: # Split the dataset: test_size=0.1 holds out 10% of the rows as the test split


# NOTE(review): the next line is cut off at the right edge in this export ("random_st...");
# the remaining arguments are not visible here — recover them from the original notebook.
x_train,x_test,y_train,y_test=train_test_split(features,target,test_size=0.1,random_st
# Print the test shape first, then the train shape
print(x_test.shape,x_train.shape)

(242, 3) (2174, 3)

In [ ]: ​

localhost:8888/notebooks/Week-5.ipynb# 8/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook

In [50]: #MODEL DEVELOPMENT & EVALUATION


# NOTE(review): indentation was lost in this export, and the lines starting with
# `models=`, `training_accuracy=` and `validation_accuracy=` are cut off at the right
# edge — restore them from the original notebook before running.

# XGBClassifier subclass whose repr is the short string "XGBClassifier", so that
# print(model) below emits one short line for it.
class CustomXGBClassifier(XGBClassifier):
def __repr__(self):
return "XGBClassifier"
# Candidate models; SVC is built with probability=True so predict_proba is available.
models=[LogisticRegression(),SVC(kernel='poly',probability=True),CustomXGBClassifier(
# Fit each model on the training split, then score it on both splits.
for model in models:
model.fit(x_train,y_train)
# Despite the variable names and printed labels, both scores are ROC AUC values
# computed from the predicted probability of class 1 (column 1 of predict_proba).
training_accuracy=metrics.roc_auc_score(y_train,model.predict_proba(x_train)[:,1]
validation_accuracy=metrics.roc_auc_score(y_test,model.predict_proba(x_test)[:,1]
print(model)
print("Training Accuracy:",training_accuracy)
print("Validation Accuracy:",validation_accuracy)

LogisticRegression()
Training Accuracy: 0.5228802330060918
Validation Accuracy: 0.4923371647509579
SVC(kernel='poly', probability=True)
Training Accuracy: 0.4704775693536028
Validation Accuracy: 0.5374247400109469
XGBClassifier
Training Accuracy: 0.943461732220797
Validation Accuracy: 0.4487889983579639

In [ ]: ​

localhost:8888/notebooks/Week-5.ipynb# 9/9

You might also like