Week-5 - Jupyter Notebook
Week-5 - Jupyter Notebook
In [6]: warnings.filterwarnings('ignore')  # blanket 'ignore' filter: later warnings (deprecation notices included) are hidden
In [7]: df=pd.read_csv('TSLA.csv')  # relative path, resolved against the kernel's working directory
In [9]: df.head()  # preview the first rows of the loaded frame
Out[9]:
Date Open High Low Close Adj Close Volume
In [10]: df.tail()  # preview the last rows of the frame
Out[10]:
Date Open High Low Close Adj Close Volume
localhost:8888/notebooks/Week-5.ipynb# 1/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [13]: df.describe()  # summary statistics; the string 'Date' column is left out of the table
Out[13]:
Open High Low Close Adj Close Volume
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2416 entries, 0 to 2415
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 2416 non-null object
1 Open 2416 non-null float64
2 High 2416 non-null float64
3 Low 2416 non-null float64
4 Close 2416 non-null float64
5 Adj Close 2416 non-null float64
6 Volume 2416 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 132.2+ KB
In [15]: df['date']=pd.to_datetime(df['Date'])  # datetime copy of the string 'Date' column, used as the plot x-axis
In [16]: df['date'].dtype
Out[16]: dtype('<M8[ns]')
localhost:8888/notebooks/Week-5.ipynb# 2/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [17]: plt.figure(figsize=(15,5))
# sns.lineplot returns the Axes it drew on, so the title and y-label are set
# on that Axes object rather than through the pyplot state machine.
close_ax = sns.lineplot(x='date', y='Close', data=df)
close_ax.set_title('Tesla Close Price', fontsize=15)
close_ax.set_ylabel('Price in dollars')
plt.show()
Out[19]: (2416, 8)
In [23]: df.head()  # re-check the columns; the output below no longer lists 'Adj Close'
Out[23]:
Date Open High Low Close Volume
In [24]: df.isnull().sum()  # per-column count of missing values
Out[24]: Date 0
Open 0
High 0
Low 0
Close 0
Volume 0
dtype: int64
localhost:8888/notebooks/Week-5.ipynb# 3/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [26]: features=['Open','High','Low','Close','Volume']
# One distribution panel per price/volume column on a 2x3 grid.
# - plt.figure() replaces plt.subplots(): the latter also creates a full-size
#   Axes that ends up underneath the grid built in the loop.
# - sns.distplot is deprecated; histplot(kde=True, stat='density') is the
#   replacement seaborn recommends for the same histogram + KDE view.
dist_fig = plt.figure(figsize=(20,10))
for i,col in enumerate(features):
    ax = dist_fig.add_subplot(2,3,i+1)  # grid positions are 1-based
    sns.histplot(df[col], kde=True, stat='density', ax=ax)
plt.show()
In [27]: box_fig=plt.figure(figsize=(20,10))
# One boxplot per column listed in `features`, on a 2x3 grid.
# - plt.figure() replaces plt.subplots() so no extra full-size Axes is left
#   underneath the grid.
# - The series is passed as x= explicitly: how a positional Series is
#   interpreted has changed between seaborn releases, and naming the argument
#   pins horizontal boxes. TODO confirm horizontal is the intended orientation.
for i,col in enumerate(features):
    ax = box_fig.add_subplot(2,3,i+1)  # grid positions are 1-based
    sns.boxplot(x=df[col], ax=ax)
plt.show()
FEATURE ENGINEERING
localhost:8888/notebooks/Week-5.ipynb# 4/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
Feature Construction
In [29]: splitted=df['Date'].str.split(pat='-',expand=True)
# The split pieces are strings in columns 0, 1, 2; they are read below as
# year, month and day respectively and stored as integers.
In [30]: date_parts={'Day':splitted[2],'Month':splitted[1],'Year':splitted[0]}
for part_name, part_values in date_parts.items():
    df[part_name] = part_values.astype('int')
# The raw string column is no longer needed once its parts are stored.
In [31]: df.drop(columns=['Date'],inplace=True)
In [32]: df.head()
Out[32]:
Open High Low Close Volume Day Month Year
In [33]: df['is_quarter_end']=(df['Month']%3==0).astype(int)  # 1 for every row whose Month is a multiple of 3 (the whole month, not only its last day)
In [34]: df.head()
Out[34]:
Open High Low Close Volume Day Month Year is_quarter_end
localhost:8888/notebooks/Week-5.ipynb# 5/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [40]: bar_fig=plt.figure(figsize=(20,10))
# One bar chart per price column on a 2x2 grid.
# NOTE(review): `data_grouped` is not defined in any cell shown here; it is
# presumably a grouped aggregate of df built in an earlier cell. Confirm it
# exists before running this cell on a fresh kernel.
# plt.figure() replaces plt.subplots() so no extra full-size Axes is left
# underneath the grid, and each panel is titled with the column it shows.
for i,col in enumerate(['Open','High','Low','Close']):
    ax = bar_fig.add_subplot(2,2,i+1)  # grid positions are 1-based
    data_grouped[col].plot.bar(ax=ax)
    ax.set_title(col)
plt.show()
In [42]: df.groupby('is_quarter_end').mean()  # column means for quarter-end rows (1) versus all other rows (0)
Out[42]:
Open High Low Close Volume Day Month
is_quarter_end
In [43]: df['open-close']=df['Open'].sub(df['Close'])
df['high-low']=df['High'].sub(df['Low'])
# target is 1 when the following row's Close is higher than this row's Close.
# NOTE(review): the last row has no following row, so shift(-1) gives NaN, the
# comparison is False and that row is labelled 0 even though its outcome is
# unknown. Consider excluding it before training.
next_close = df['Close'].shift(-1)
df['target'] = next_close.gt(df['Close']).astype(int)
localhost:8888/notebooks/Week-5.ipynb# 6/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [44]: plt.figure(figsize=(10,10))
# Boolean matrix: True where a pair of columns has correlation above 0.9.
strong_corr = df.corr().gt(0.9)
sns.heatmap(strong_corr, annot=True, cbar=False)
plt.show()
localhost:8888/notebooks/Week-5.ipynb# 7/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
In [45]: target_counts=df['target'].value_counts().sort_index()
# value_counts() orders classes by frequency, not by label, so hard-coded
# labels [0, 1] are swapped whenever class 1 is the majority. Sorting by index
# and taking the labels from the same Series keeps each wedge paired with its
# own class.
plt.pie(target_counts.values, labels=target_counts.index, autopct='%1.1f%%')
plt.show()
In [46]: features=df.loc[:,['open-close','high-low','is_quarter_end']]
target=df['target']
# Standardise the three model inputs; `features` is rebound from a DataFrame
# to the scaled array returned by the scaler.
# NOTE(review): the scaler is fitted on every row. If the train/validation
# split happens after this cell (the split itself is not shown here),
# validation rows influence the scaling statistics. Confirm, and consider
# fitting on the training rows only.
In [47]: scaler=StandardScaler().fit(features)
features=scaler.transform(features)
(242, 3) (2174, 3)
In [ ]:
localhost:8888/notebooks/Week-5.ipynb# 8/9
8/5/24, 11:16 AM Week-5 - Jupyter Notebook
LogisticRegression()
Training Accuracy: 0.5228802330060918
Validation Accuracy: 0.4923371647509579
SVC(kernel='poly', probability=True)
Training Accuracy: 0.4704775693536028
Validation Accuracy: 0.5374247400109469
XGBClassifier
Training Accuracy: 0.943461732220797
Validation Accuracy: 0.4487889983579639
In [ ]:
localhost:8888/notebooks/Week-5.ipynb# 9/9