Startup 1668080110
Startup 1668080110
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)
plt.style.use('seaborn')
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 1 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
San
0 1005 CA 42.358880 -71.056820 92101 c:6669 NaN
Diego
Los
1 204 CA 37.238916 -121.973718 95032 c:16283 NaN
Gatos
Cupertino
3 738 CA 37.320309 -122.050040 95014 c:42668 Cupertino
CA 95014
San
San
4 1002 CA 37.779281 -122.419236 94105 c:65806 Francisco
Francisco
CA 94105
San
918 352 CA 37.740594 -122.376471 94107 c:21343 NaN
Francisco
Burlington
919 721 MA 42.504817 -71.195611 1803 c:41747 Burlington
MA 1803
San
921 589 CA 37.556732 -122.288378 94404 c:33198 NaN
Francisco
Santa
Santa
922 462 CA 37.386778 -121.966277 95054 c:26702 Clara CA
Clara
95054
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 2 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923 entries, 0 to 922
Data columns (total 49 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 923 non-null int64
1 state_code 923 non-null object
2 latitude 923 non-null float64
3 longitude 923 non-null float64
4 zip_code 923 non-null object
5 id 923 non-null object
6 city 923 non-null object
7 Unnamed: 6 430 non-null object
8 name 923 non-null object
9 labels 923 non-null int64
10 founded_at 923 non-null object
11 closed_at 335 non-null object
12 first_funding_at 923 non-null object
13 last_funding_at 923 non-null object
14 age_first_funding_year 923 non-null float64
15 age_last_funding_year 923 non-null float64
16 age_first_milestone_year 771 non-null float64
17 age_last_milestone_year 771 non-null float64
18 relationships 923 non-null int64
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 3 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
In [14]:
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 4 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
In [14]:
#check corr relation of the dataset
data.corr()
Out[14]:
Unnamed: latitude longitude labels age_first_funding_year
0
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 5 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
Out[15]: Unnamed: 0 0
state_code 0
latitude 0
longitude 0
zip_code 0
id 0
city 0
Unnamed: 6 493
name 0
labels 0
founded_at 0
closed_at 588
first_funding_at 0
last_funding_at 0
age_first_funding_year 0
age_last_funding_year 0
age_first_milestone_year 152
age_last_milestone_year 152
relationships 0
funding_rounds 0
funding_total_usd 0
milestones 0
state_code.1 1
is_CA 0
is_NY 0
is_MA 0
is_TX 0
is_otherstate 0
category_code 0
is_software 0
is_web 0
is_mobile 0
is_enterprise 0
is_advertising 0
is_gamesvideo 0
is_ecommerce 0
is_biotech 0
is_consulting 0
is_othercategory 0
object_id 0
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 6 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
has_VC 0
has_angel 0
has_roundA 0
has_roundB 0
has_roundC 0
has_roundD 0
avg_participants 0
is_top500 0
status 0
dtype: int64
0 4.6685 6.7041 3
1 7.0055 7.0055 1
2 1.4575 2.2055 2
3 6.0027 6.0027 1
4 0.0384 0.0384 1
515 CA nan
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 7 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
Graphic Approach
In [24]: #Correlation heatmap
data['age_first_milestone_year'] = data.age_first_milestone_year.astype
data['age_last_milestone_year'] = data.age_last_milestone_year.astype
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 8 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
plt.figure(figsize=(30,20))
ax = sns.heatmap(data = data[features].corr(),cmap='YlGnBu',annot=True
Scatter plot
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 9 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 10 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
Box plots
In [28]: featuresNum = ['age_first_funding_year','age_last_funding_year','age_fi
plt.figure(figsize=(15, 7))
for i in range(0, len(featuresNum)):
plt.subplot(1, len(featuresNum), i+1)
sns.boxplot(y=data[featuresNum[i]], color='green', orient='v')
plt.tight_layout()
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 11 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
fig, ax = plt.subplots()
_ = sns.barplot(x="year", y="No_of_startup", data=cdf,
palette=sns.color_palette(['#003f5c', '#ffa600'], n_col
_ = ax.set(xlabel="Year", ylabel="No. of startup")
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 12 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 13 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
In [35]: sns.countplot(data['status'])
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 14 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
_ = ax.set(xlabel="Category", ylabel="No. of startup")
plt.legend(bbox_to_anchor=(0.945, 0.90))
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 15 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
18 mobile 7263750881
30 software 2657598865
34 web 1729035436
3 biotech 1723699484
8 enterprise 1338882096
4 cleantech 1300284730
28 semiconductor 1105156970
0 advertising 918619012
11 games_video 844643530
12 hardware 773938873
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 16 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 17 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
_ = ax.set(xlabel="state_code", ylabel="No. of startup")
plt.legend(bbox_to_anchor=(0.945, 0.90))
most_trending_statea = trending_statea[trending_statea.groupby('state_c
most_trending_statea = most_trending_statea.sort_values('num_startup'
most_trending_statea
2 CA 488
23 NY 106
12 MA 83
32 WA 42
29 TX 42
3 CO 19
9 IL 18
26 PA 17
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 18 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
31 VA 13
7 GA 11
20 NJ 7
13 MD 7
25 OR 7
18 NC 7
6 FL 6
24 OH 6
16 MN 5
5 DC 4
4 CT 4
15 MI 3
28 TN 3
27 RI 3
30 UT 3
22 NV 2
19 NH 2
1 AZ 2
14 ME 2
11 KY 2
10 IN 2
17 MO 2
33 WI 1
0 AR 1
21 NM 1
8 ID 1
34 WV 1
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 19 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
most_trending_statea = trending_statea[trending_statea.groupby('state_c
most_trending_statea = most_trending_statea.sort_values('num_startup'
most_trending_statea.head(10)
most_trending_statec = trending_statec[trending_statec.groupby('state_c
most_trending_statec = most_trending_statec.sort_values('num_startup'
most_trending_statec
most_trending_categorya = trending_categorya[trending_categorya.groupby
most_trending_categorya = most_trending_categorya.sort_values('num_star
most_trending_categorya
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 20 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
most_trending_categoryc = trending_categoryc[trending_categoryc.groupby
most_trending_categoryc = most_trending_categoryc.sort_values('num_star
most_trending_categoryc
91 Kirkland 5718914576
13 Austin 706317317
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 21 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
In [48]: df_what_in_kirkland.head()
Kirkland
98033- WA
62 332 WA 47.675489 -122.191667 c:19861 Kirkland
6314 98033-
6314
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 22 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
_ = ax.set(xlabel="Has_VC", ylabel="No. of startup")
plt.legend(bbox_to_anchor=(0.945, 0.90))
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 23 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
_ = ax.set(xlabel="is_top500", ylabel="No. of startup")
plt.legend(bbox_to_anchor=(0.945, 0.90))
In [51]: #How many Startup have both 'acquired' status and is_top500?
len(data[(data["status"] == True) & (data["is_top500"] == True)].index
Out[51]: 0
In [52]: #How many Startup have both 'closed' status and is_top500?
len(data[(data["status"] == False) & (data["is_top500"] == False)].
Out[52]: 0
In [53]: df_acquired["is_top500"].value_counts(normalize=True)
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 24 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 25 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
Out[57]: True
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 26 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
In [59]: new_data.head()
San
0 1005 CA 42.358880 -71.056820 92101 c:6669 NaN
Diego
Los
1 204 CA 37.238916 -121.973718 95032 c:16283 NaN
Gatos
Cupertino
3 738 CA 37.320309 -122.050040 95014 c:42668 Cupertino
CA 95014
San
San
4 1002 CA 37.779281 -122.419236 94105 c:65806 Francisco
Francisco
CA 94105
In [60]: age=["age_first_funding_year","age_last_funding_year","age_first_milest
for a in range(len(age)):
print("Is there any negative value in '{}' column : {} ".format
In [61]: df=data.drop(data[data.age_first_funding_year<0].index)
df=data.drop(data[data.age_last_funding_year<0].index)
df=data.drop(df[data.age_first_milestone_year<0].index)
df=data.drop(data[data.age_last_milestone_year<0].index)
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 27 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
Outliers
In [63]: featuresNumfinal = ['age_first_funding_year','age_last_funding_year'
plt.figure(figsize=(15, 7))
for i in range(0, len(featuresNumfinal)):
plt.subplot(1, len(featuresNumfinal), i+1)
sns.boxplot(y=df[featuresNumfinal[i]], color='green', orient='v'
plt.tight_layout()
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 28 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
plt.figure(figsize=(15, 7))
for i in range(0, len(featuresNumfinal)):
plt.subplot(1, len(featuresNumfinal), i+1)
sns.boxplot(y=df[featuresNumfinal[i]], color='green', orient='v'
plt.tight_layout()
Feature Engineering
In [66]: #New Column has_RoundABCD
df['has_RoundABCD'] = np.where((df['has_roundA'] == 1) | (df['has_round
df.head()
San
0 1005 CA 42.358880 -71.056820 92101 c:6669 NaN
Diego
Los
1 204 CA 37.238916 -121.973718 95032 c:16283 NaN
Gatos
Cupertino
3 738 CA 37.320309 -122.050040 95014 c:42668 Cupertino
CA 95014
San
San
4 1002 CA 37.779281 -122.419236 94105 c:65806 Francisco
Francisco
CA 94105
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 29 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
San
0 1005 CA 42.358880 -71.056820 92101 c:6669 NaN
Diego
Los
1 204 CA 37.238916 -121.973718 95032 c:16283 NaN
Gatos
Cupertino
3 738 CA 37.320309 -122.050040 95014 c:42668 Cupertino
CA 95014
San
San
4 1002 CA 37.779281 -122.419236 94105 c:65806 Francisco
Francisco
CA 94105
Out[68]: 674
Out[69]: 911
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 30 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
San
0 1005 CA 42.358880 -71.056820 92101 c:6669 NaN
Diego
Los
1 204 CA 37.238916 -121.973718 95032 c:16283 NaN
Gatos
Cupertino
3 738 CA 37.320309 -122.050040 95014 c:42668 Cupertino
CA 95014
San
San
4 1002 CA 37.779281 -122.419236 94105 c:65806 Francisco
Francisco
CA 94105
In [71]: df['has_Seed'] == 1
Out[71]: 0 True
1 False
2 False
3 False
4 True
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 True
13 False
14 False
15 True
16 False
17 False
18 False
19 False
Model belding
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 31 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
Out[72]: Unnamed:
state_code zip_code id city name founded_at closed_at
6
San
0 CA 92101 c:6669 NaN Bandsintown 1/1/2007 31/12/2013
Diego
Los
1 CA 95032 c:16283 NaN TriCipher 1/1/2000 31/12/2013
Gatos
Cupertino Solidcore
3 CA 95014 c:42668 Cupertino 1/1/2002 31/12/2013
CA 95014 Systems
San
San
4 CA 94105 c:65806 Francisco Inhale Digital 8/1/2010 10/1/2012
Francisco
CA 94105
In [73]: df = data.drop(['state_code'],axis=1)
df = df.drop(['id'],axis=1)
df = df.drop(['Unnamed: 6'],axis=1)
df = df.drop(['category_code'],axis=1)
df = df.drop(['object_id'],axis=1)
df = df.drop(['zip_code'],axis=1)
df = df.drop(['founded_at'],axis=1)
df = df.drop(['closed_at'],axis=1)
df = df.drop(['first_funding_at'],axis=1)
df = df.drop(['last_funding_at'],axis=1)
df = df.drop(['city'],axis=1)
df = df.drop(['name'],axis=1)
df = df.drop(['Unnamed: 0'],axis=1)
df = df.drop(['latitude','longitude'],axis=1)
df = df.drop(['geometry'],axis=1)
#df = df.drop(['age_closed_startup'],axis=1)
df = df.drop(['relationships'],axis=1)
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 32 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
In [74]: df.columns
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 33 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
In [76]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923 entries, 0 to 922
Data columns (total 32 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 labels 923 non-null int64
1 age_first_funding_year 923 non-null float64
2 age_last_funding_year 923 non-null float64
3 age_first_milestone_year 923 non-null float64
4 age_last_milestone_year 923 non-null float64
5 funding_rounds 923 non-null int64
6 funding_total_usd 923 non-null int64
7 milestones 923 non-null int64
8 is_CA 923 non-null int64
9 is_NY 923 non-null int64
10 is_MA 923 non-null int64
11 is_TX 923 non-null int64
12 is_otherstate 923 non-null int64
13 is_software 923 non-null int64
14 is_web 923 non-null int64
15 is_mobile 923 non-null int64
16 is_enterprise 923 non-null int64
17 is_advertising 923 non-null int64
18 is_gamesvideo 923 non-null int64
19 is_ecommerce 923 non-null int64
20 is_biotech 923 non-null int64
21 is_consulting 923 non-null int64
22 is_othercategory 923 non-null int64
23 has_VC 923 non-null int64
24 has_angel 923 non-null int64
25 has_roundA 923 non-null int64
26 has_roundB 923 non-null int64
27 has_roundC 923 non-null int64
28 has_roundD 923 non-null int64
29 avg_participants 923 non-null float64
30 is_top500 923 non-null int64
31 status 923 non-null object
dtypes: float64(5), int64(26), object(1)
memory usage: 230.9+ KB
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 34 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
In [78]: print(X)
print(Y)
Out[85]: LogisticRegression()
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 35 of 36
Untitled2 - Jupyter Notebook 10/11/22, 3:14 AM
In [87]:
http://localhost:8888/notebooks/Downloads/Untitled2.ipynb Page 36 of 36