Open navigation menu
Close suggestions
Search
Search
en
Change Language
Upload
Sign in
Sign in
Download free for days
0 ratings
0% found this document useful (0 votes)
22 views
Final Draft
Uploaded by
cjchen2810
Copyright
© © All Rights Reserved
Available Formats
Download as PDF or read online on Scribd
Download now
Download
Save Final Draft For Later
Download
Save
Save Final Draft For Later
0%
0% found this document useful, undefined
0%
, undefined
Embed
Share
Print
Report
0 ratings
0% found this document useful (0 votes)
22 views
Final Draft
Uploaded by
cjchen2810
Copyright
© © All Rights Reserved
Available Formats
Download as PDF or read online on Scribd
Download now
Download
Save Final Draft For Later
Carousel Previous
Carousel Next
Save
Save Final Draft For Later
0%
0% found this document useful, undefined
0%
, undefined
Embed
Share
Print
Report
Download now
Download
You are on page 1
/ 36
Search
Fullscreen
s2nen017 Project Data Exploration Project Data Exploration Chris Chen November 26, 2017 FORMAT PLOTS CHANGE COLORS Exploratory Data Analysis #Install Packages ‘#install. packages (‘Lubridate’) winstallpackoges( “dplyr’) Hinstal. packages (‘ggptot2") #install.packages(‘atus") #install.packages(‘cowplot’) ‘#install.packages(‘ggmosaic’) Run Libraries Library (éplyr) i Warning: package ‘dplyr’ was built under R version 3.3.3 ae wi Attaching package: ‘dplyr* 4H The following objects are masked from "package:stats* a filter, lag 4 The following objects are masked from ‘package:base': a WH intersect, setdiff, setequal, union Library (1ubridate) 4H Warning: package ‘lubridate’ was built under R version 3.3.3 wt # Attaching package: ‘lubridate’ it The following object is masked from ‘package:base’: ae WH date fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1196s2nen017 Project Data Exploration Library (ggplot2) 4 Warning: package ‘ggplot2‘ was built under R version 3.3.3 Library(etus) 4H Warning: package ‘atus’ was built under R version 3.3.3 Library (cowplot) at a Attaching packag “conplot* 4H The following object is masked from ‘package:ggeplot2" wh WH gesave Library (ggnosaic) 4 Warning: package ‘ggnosaic’ was built under R version 3.3.3 4H Loading required package: productplots 4H Warning: package 'productplots' was built under R version 3.3.3 a2 ‘Attaching package: ‘ggmosaic’ The following objects are masked from “package:productplots': 222 decker, hspine, mosaic, prodcalc, spine, vspine #CoLum Names names (atuscps) WH [1] “tucaseid’ region" “state wt [5] “age” edu" "pace" wi [9] “country_born” “citizer "marital 4 [13] "Famincone” names(atusresp) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2186s2nen017 Project Data Exploration 4 [1] “tucaseid” “tuyear® #8 [3] “diary_no Hary_day sn [5] “holiday occup_code* 4 [7] “ind_code" abor_status” 4 [9] “student_status" pee #8 [11] 
"work_class" “ourly_wage" 8 [13] “weekly_earn "work_hr's_week’ #8 [15] “mult_jobs" “partner_hh #8 [17] “partner_works” “partner_ptft" #8 [19] “hh_size” chil ## [21] “hh_child_youngest_age" names (atusact) # [1] “tucaseid wStore the demographic and activity information in seperate datafranes demographic <- atuscps work <- atusresp activities <- atusact Data Description Data Treatment ‘#SLeeptine sleeptine <- activities %% filter(tiercode >= @10100 & tiercode < 010200) %>% group_by(tucaseid) %% sunmarize(sleep time = sum(dur)) 4H Warning: package ‘bindrepp’ was built under R version 3.3.3 fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl a6s2nen017 Project Data Exploration fHiorktime worktine <- activities %% filter(tiercode >= @50000 & tiercode < 060000) %>% group_by(tucaseid) %% sunmarize(work_time = sum(dur)) #PLaytine playtime <- activities %% filter(tiercode >= 120000 & tiercode < 130000) %>% group_by(tucaseid) %% sunmarize(play time = sum(dur)) ‘#VoLunteertime voltime <- activities %>% filter(tiercode >= 150000 & tiercode < 160000) %>% group_by(tucaseid) %% sunmarize(vol_time = sum(dur)) ‘#Sporttine sporttime <- activities %% filter(tiercode >= 130000 & tiercode < 14000) %>% group_by(tucaseid) %% sunmarize(sport_time = sum(dur)) fhealthtine healthtime <- activities %% filter(tiercode &@ tlercode < 080500) %>% group_by(tucaseid) %% sunmarize(health_time = sum(dur)) 010300 & tiercode < 010400 | tiercode >= e80400 #Herge datafranes into individual profiles data <- merge(denographic, work, by = “tucaseid”) data <- merge(data, sleeptine, by = “tucaseid", all.x = T) data <- merge(data, worktine, by = “tucaseid", all.x = 1) data <- merge(data, playtine, by = “tucaseid", all.x = 7) data <- merge(data, voltine, by = “tucaseid", all.x = T) data <- merge(data, sporttine, by = “tucaseid*, all.x = T) data <- merge(data, healthtine, by = "tucaseid", all.x = T) ‘#colnames (data)[35] <- “sleep_time” 
data$sleep_time[is.na(data$sleep_tine)] <- @ datagwork_tine[is.na(data$work_time)] <- 0 data$play_tine[is.na(datagplay_tine)] <- 0 data$vol_tine[is.na(data$vol_tine)] <- @ data$sport_time[is.na(dataSsport_time)] <- @ datathealth_time[is.na(data$health_time)] <- 0 ‘#hleekday or Weekend data <- data %% mutate(wday ynd(paste(tuyear, diary no, diary day, sep="="))) wday(wday, Label=TRUE)) data <- data %% mutate(wday fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 4136s2nen017 Project Data Exploration EXAMINING THE DATA SET'S OCCUPATION TYPE OCCURENCES select <- levels (work$occup_code) count <- rep(NA, length(select)) counter <- 1 for (i in select) ( print (i) count [counter] <- length(work$occup_code[work$occup_code print (count[counter]) counter <- counter + 1 } [1] “mgmt_biz_finance” [1] 87701 [1] "professional" [1] 96470 [1] "service" [2] 86166 [1] “sales' [1] 79548 [1] “office_admin” [1] 83157 [1] “farming forestry fishing" [1] 69155 [1] "construction" [a] 73364 (2) “instali_repair_naint” (1) 71898 [1] "production" [1] 74745 [1] "transport" [1] 74182 2222 FRPRPRETEPR RRR RE Specific attributes of variables: ‘Wage: USD Age: Years Health Related Care: Minutes/Surveyed Day Week/Weekend: Binary Variable, “Yes” or “No” Weekday: “Sun, Mon, Tue, Wed, Thu, Fri, Sat” Job Type/Work Class: “govt, private, self-employed, without_pay” Employment Status: “employed-at work, employed-absent, unemployed-layoff, unemployed-looking, not_in_labor_force” Student Status: "Yes", "No", NA Time spent working: Minutes/Surveyed Day Time spent playing: Minutes/Surveyed Day Time spent volunteering: Minutes/Surveyed Day Time spent sporting Minutes/Surveyed Day Education: “< hs diploma, hs diploma, some college, associate degree, bachelor's degree, master's degree, prof degree, doctoral degree” Marital Status: “married, divorced, seperated, widowed, never married” Parental Status: “Yes”, “No” Information about discrete variables ‘Number of 
Observations nrow(data) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 596s2nen017 Project Data Exploration we [2] 181335 sleep time sunmnary (slept ime[2]) ft sleep_tine a Min, 1.8 WH Ist Qu.: 450.0 aH Median : 515.0 aH Mean: 529.2 4H 3rd Qu.: 600.0 WH Max. :1436.0 sleep <- as.numeric(unlist(sleeptime[2])) sd(sleep) wi [2] 134.8153 var(sleep) sw [2] 18175.17 worktime summary (worktime[2]) WH worktime wHoMing 3: 1.8 HH Ast Qu.: 285.0 WH Median : 465.0 wH Mean: 419.2 WH 3rd Qu.: 540.0 WH Max, :1430.0 working <- as-numeric(unlist(worktime[2])) sd(working) # [1] 207.9235 var(working) wa [2] 43232.18 fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 66s2nen017 play _tine sunmary (playtime[2]) at play_time a8 Min. 1.0 WH Ist Qu.: 150.0 Wt Median : 270.0 WH Mean: 308.2 3rd Qu.: 430.0 wi Max, :1439.0 play <- as.nuneric(unlist(playtime[2])) sd(play) wi [1] 204.3468 var(play) aa [1] 41756.8 fvol_time sunmary(voltine[2]) ¥# ——-vol_tine we Ming: 18 HH Ast Que: 45.0 Hit Median : 95.0 ft Mean: 134.7 ft 3rd Qu.: 180.0 Ht Max, :1315.0 vol <- as.nuneric(unlist (voltime[2])) sd(vol) a [2] 132.5571 var(vol) fw [1] 17307.27 wsport_time sunmary (sporttime[2]) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl Project Data Exploration 7136s2nen017 Project Data Exploration HH sport_time we Mins: 18 aa Ast Qui: 45.0 WH Median: 60.0 WH Mean: 106.7 wH 3rd Qu.: 120.0 WH Max. 11260.0 sport <- as.numeric(unlist(sporttime[2])) sd(sport) w# [1] 104.0516 var(sport) we [1] 10826.73 #health_time sunmary (healthtime[2]) wt -health_time HB Min. 1,00 WH Ast Que: 5.00 Wt Median : 30.00 a Mean: 88.89 wH 3rd Qu.: 90.00 WH Max. :1430.00 health <- as.numeric(unlist(healthtine[2])) sd(health) a [1] 161.5228 var(health) # [1] 26089.63, wages per week sunmary (data$weekly_earn) fH Min, Ist Qu. Median Mean 3rd Qu. Max. 
NA's ## 0.0 396.0 686.8 853.5 1140.0 2885.0 81127 fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl aes2nen017 sd(data$weekly_earn, na.rm = T) ## [1] 647.4775 var(data$weekly_earn, na.rm = T) ## [1] 419227.1 #age summary(data$age) ## Min. 1st Qu. Median Mean 3rd Qu. ## 15.00 33.00 46.00 46.83 60.00 sd(data$age) ## [1] 17.76646 var(data$age) ## [1] 315.6471 Correcting for Outliers and Setting Bounds Trimming: (to make our analysis more effective) WAGE: WE ARE TAKING WAGES LESS THAN 3000 WORKTIME: WE ARE CONSIDERING ONLY THOSE WHO WORK (WORK TIME POSITIVE) VOLUNTEERTIME: WE ARE CONSIDERING ONLY THOSE WHO VOLUNTEER (VOL TIME POSITIVE) SPORTTIME: WE ARE CONSIDERING ONLY THOSE WHO PLAY SPORTS (SPORT TIME POSITIVE) HEALTHTIME: WE ARE CONSIDERING ONLY THOSE WHO SPENT TIME ON HEALTH (HEALTH TIME POSITIVE) All of the other values will be set to NA (not 0), so that they don't affect analysis. fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl Project Data Exploration Max. 
85.00 936s2nen017 Project Data Exploration HERE WE ARE TRIMMING OUTLIERS 4 OR MORE STANDARD DEVIATIONS OVER THE MEAN #Trim wages data$weekly_earn[data$weekly_earn 884.61] <- NA #7ine spent sleeping select <- mean(sleep) + 4 * sd(sleep) data <- data[datassleep_tine < select, } #Time spent working select <- mean(working) + 4 * sd(working) data <- data[datagwork_time < select, ] data$work_tine[data$work_time == @] <- NA #Time spent playing select <- mean(play) + 4 * sd(play) data <- data[data$play_tine < select, ] #Time spent volunteering select <- mean(vol) + 4 * sd(vol) data <- data[data$vol_tine < select, ] data$vol_time[datagvol_time == 0] <- NA #Time spent sporting select <- mean(sport) + 4 * sd(sport) data <- data[datassport_tine < select, ] data$sport_tine[data$sport_time == 0] <- NA #Tine spent caring for their health select <- mean(health) + 4 * sd(nealth) data <- data{datashealth_tine < select, ] dataghealth_tine[datashealth tine == @] <- NA Some important plots for continuous variables: NOTE NOTE WE ARE REMOVING ZERO TERMS AND VALUE CEILINGS FOR THE SAKE OF DATA ANALYSIS, KEEP THAT IN MIND WHEN MAKING THE WRITUP. “OUT OF PEOPLE WHO SPEND SOME TIME PLAYING, THEY DO __" and "FOR PEOPLE WITH LESS THAN THIS AMOUNT, THEY DO ALSO, CHANGE OUR MOTIVATIONS SECTION TO REFLECT WHAT PENG PENG SAYS, AND WHAT WE ARE TRYING TO PREDICT. Fequire(cowplot) #lage Distribution histwage <- geplot(data, aes(weekly_earn)) + geom_histogram(bins = 108) + coord_cartesian(xlim = €(-8, 3000)) + ggtitle("Histogram of Wage") + xlab("Wage(Per Week)") + ylab("Frequency") histwage ‘it Warning: Renoved 83094 rows containing non-finite values (stat_bin). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 10786sangiz017 Project Data Exploration Histogram of Wage 3000 3 2000 2 g S = 2 c 1000 0 0 1000 2000 3000 Wage(Per Week) age Distribution and Sleep vs. 
Age histage <- ggplot (data, aes(age)) + geom_histogram(bins = 3) + coord_cartesian(xLim = c(-8, 100 )) + getitle("Histogran of Age”) + xlab(“Aage(Yrs)") + ylab("Frequency") Lineage <- ggplot (data) + geom_smooth(aes(x = age, y = sleep_tine)) + ggtitle("Sleep Vs. Age") + xlab("Age(¥rs)") + ylab("SLeep(Min/Day)") plot_grid(histage, lineage) ‘ia ~geom_smooth()~ using method fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1136sangiz017 Project Data Exploration Histogram of Age Sleep Vs. Age 9000 575 z s 8 S 6000 gs iS = 550 & = uw 3 oO 3000 525 0 0 25 50 75 100 20 40 60 80 Age(Yrs) Age(Yrs) ‘#SLeeptime Distribution and Plot of Wage vs. Sleeptime histsleep <- ggplot(data, aes(sleep_tine)) + geomhistogram(bins = 58) + coord_cartesian(xlim = c(-@, 130@)) + ggtitle("Histogram of Sleep Time") + xlab("Sleep(Min/Day)") + ylab("Frequency") Linesleep <- ggplot(data) + geon_snooth(aes(x = sleep_tin s. Sleep") + xlab("Sleep(Min/Day)") + ylab("Wage(Per Week)") y = weekly earn)) + getitle("wage V plot_grid(histsleep, linesleep) ‘ia ~geom_smooth()~ using method 4 Warning: Removed 83094 rows containing non-finite values (stat_smooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 12138s2nen017 Project Data Exploration Histogram of Sleep Time Wage Vs. Sleep 15000 800 = 3 > 3 2 10000 = 700 3 5 a a 2 oOo ms gs 5000 = 600 500 al 0 500 1000 0 300 600 900 Sleep(Min/Day) Sleep(Min/Day) f#Hlorktime Distribution and Plot of Wage vs. Worktime histwork <- geplot(data, aes(work time)) + geom_histogram(bins = 8) + coord_cartesian(xlin = c( -®, 900) + ggtitle("Histogram of Work Time") + x1ab("Work(Min/Day)") + ylab("Frequency") Linework <- ggplot (data) + geom_snooth(aes(x = work time, y Work Time") + xlab("Work(Min/Day)") + ylab("Wage(Per Week) reekly_earn)) + ggtitle("Wage Vs. plot_grid(histwork, Linework) 4 Warning: Removed 11911 rows containing non-finite values (stat_bin). 
4 ~geon_smooth()” using method = ‘gam* a Warning: Renoved 123960 rows containing non-finite values (stat_smooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 19986sangiz017 Project Data Exploration Histogram of Work Time Wage Vs. Work Time 1200 6000 = 1000 o > 3 5 4000 = S 5 = a 2 oOo ot @ 800 8 = 2000 600 0 0 250 «500750 ° 400 800 1200 Work(Min/Day) Work(Min/Day) #Playtime Distribution and Plot of Wage vs. Playtime histplay <- ggplot(data, aes(play_time)) + geom_histogram(bins = $8) + coord_cartesian(xlin = c( -®, 150) + ggtitle( "Histogram of Play Time") + xlab("Play(Min/Day)") + ylab("Frequency") Lineplay <- geplot(data) + geom_smooth(aes(x = play_time, y = weekly earn)) + ggtitle("Wage Vs. Play Time") + xlab("Play(Min/Day)*) + ylab("Wage(Per Week)") plot_grid(histplay, lineplay) 4 ~geon_smooth()* using method = "gan* 4 Warning: Removed 83094 rows containing non-finite values (stat_smooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1438sangiz017 Project Data Exploration Histogram of Play Time Wage Vs. Play Time 800 = a 8 700 s go" = o a i & 600 = 500 0 500 1000 ©1500 0 300 600 900 Play(Min/Day) Play(Min/Day) #Wolunteertime Distribution and Plot of age vs. Volunteer Time histvol <- ggplot (data, aes(vol_time)) + geom_histogram(bins = 58) + coord_cartesian(xlim = c(-@ » 450) + ggtitle("Hist of Volunteer Time”) + x1ab("Volunteer(Min/Day)") + ylab("Frequency") Linevol <- ggplot (data) + geom_smooth(aes(x = vol_time, y = weekly_earn)) + ggtitle(*wage Vs. Vo 1 Tine") + xlab("Volunteering(Min/Day)") + ylab("Wage(Per Week)") plot_grid(histvol, linevol) i Warning: Renoved 167673 rows containing non-finite values (stat_bin). 4 ~geon_smooth()” using method = ‘gam* a Warnin + Removed 173672 rows containing non-finite values (stat_smooth). 
fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1598s2nen017 Project Data Exploration Hist of Volunteer Time Wage Vs. Vol Time 840 = > 3 2 = eS B 800 E é = 760 0 100 200 300 400 0 200 400 600 Volunteer(Min/Day) Volunteering(Min/Day) ‘#Sporttine Distribution and Plot of Wage vs. Sporttime histsport <- ggplot(data, aes(sport_tine)) + geom_histogram(bins = 58) + coord_cartesian(xlim = €(-®, 45@)) + ggtitle("Hist of Sport Time") + xlab("Sport(Min/Day)") + ylab("Frequency") Linesport <- ggplot(data) + geon_snooth(aes(x = sport_time, y s. Sport Time") + xlab("Sport(Min/Day)") + ylab("Wage(Per Week)’ feekly_earn)) + ggtitle(*Wage V plot_grid(histsport, linesport) i Warning: Renoved 146819 rows containing non-finite values (stat_bin). 4 ~geon_smooth()” using method = ‘gam* 4 Warning: Renoved 162830 rows containing non-finite values (stat_smooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1696s2nen017 Project Data Exploration jist of Sport Time Wage Vs. Sport Time 6000 900 4000 3 S © 800 2 = e é i 3 8 700 2000 = 600 0 0 100 200 300 400 0 100 200 300 400 500 Sport(Min/Day) Sport(Min/Day) #Healthtime Distribution and Plot of Wage vs. Healthtine histhealth <- ggplot(data, aes(log(health_time + 1))) + geon_histogran(bins = S@) + coord_cartes ian(xlim = c(-8, 6)) + ggtitle("Hist of Health Time") + xlab("Healthtime(Min/Day)") + ylab("Freq uency") Linehealth <- ggplot(data) + geom_smooth(aes(x = health_time, y Vs. Health Time") + xlab("Work(Min/Day)") + ylab("Wage(Per Week") jeekly_earn)) + ggtitle("Sleep plot_grid(histhealth, Linehealth) HH Warning: Removed 164263 rows containing non-finite values (stat_t 4H “geon_smooth()” using method = ‘gam* 4 Warning: Removed 175449 rows containing non-finite values (stat_snooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 7186s2nen017 Project Data Exploration Hist of Health Time Sleep Vs. 
Health Time 1500 900 1000 S 800 > 2 g 7 Ss 5 a 3 & 700 « 2 500 s 600 500 0 0 2 4 6 0 200 400 600 Healthtime(Min/Day) Work(Min/Day) Some important plots for catagorical variables: Fequire(ggnosaic) #osaicplot of Occupation Type and Work Class geplot(data) + geon_mosaic(aes(x=product (occup_code), fill = factor(work_class)), na.rm=T) + ggt itle("Occupation Type and Sector") + theme(axis.text.x = elenent_text(angle = 98, hjust = 1)) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1996sangiz017 Project Data Exploration Occupation Type and Sector factor(work_class) govt private self-employed without_pay 3 3 @ g £ SSESE 2 e 2 2 ¢€ §ss 5 S = 8 8 SES 3 E 3 8 & 2353 6 é 1 Fe 3 6 l 2 g Wea 8s ° S585 2 8 i) a 5 g88 E gg 2 33 e gs E £ product(occup_code) Some important plots of mixed continuous and catagorical variables #Boxplot of Occupation Type and Sleep selectoce <- data[!is.na(data$occup_code),] Beplot(selectocc) + geom_boxplot(aes(x = occup_code, y = sleep_time)) + ggtitle("Sleep Vs. Work Class") + xlab("Work Class") + ylab("Sleep(Hrs)") + theme(axis.text.x = elenent_text(angle = 90 » just = 2)) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 19688sangiz017 Project Data Exploration Sleep Vs. Work Class 8 zg = 600 See SeqbqqHo ® 300 cect 7 Wes S 3 = © 2 ¢@ § ® 3 &@ §$ & 8 #2 & 3 & 1 & a2 2 & ig g 2 8 § 8 & y ° 3 = 5 gS 5 a 5 8 8 B = s = a | a Ee 2 z : £ s Work Class ‘#BoxpLot of Occupation Type and Wage selectocc <- data[!is.na(datasoccup_code),] ggplot(selectoce) + geom_boxplot(aes(x = occup_code, y = weekly earn), na.rn = T) + ggtitle("Wag e Vs. Occupation Type") + xlab("Occupation Type") + ylab("wage(Per Weck)") + thene(axis.text.x = elenent_text(angle = 99, hjust = 1)) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 20138s2nen017 Project Data Exploration Wage Vs. Occupation Type i ! 
3000 y 8 3 8 Wage(Per Week) i in n L ° sales service transport professional office_admin construction production repait farming_forestry_fishing instal Occupation Type #oxplot of Highest Degree Attained and wage Bgplot(data) + geom_boxplot(aes(x = edu, y = weekly_earn)) + ggtitle("Wage Vs. Degree Attained”) + xlab("Highest Degree Attained") + ylab("Wage(Per Week)") + thene(axis.text.x = element_1 ngle = 98, hjust = 1)) ext(a 4 Warning: Removed 83094 rows containing non-finite values (stat_boxplot). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2196s2nen017 Project Data Exploration Wage Vs. Degree Attained 3000 y 8 3 8 3 s 8 Wage(Per Week) ° 8 % ° 2 8 2 8 9° E E 2 E = 2 2 2 2 ¢ % ff 8 & fF 2 = 8 3 3 3 3 3 a 2 2 2 2 2 Ss ma 2 £ & o 5 5 a S v a 3 o 3 3 8 2 3 8 E 3 2 s Highest Degree Attained Data Analysis Selecting an Optimal Model sleep2 <- (data$sleep_tine)*2 #Base Model with sleep Amt <- Im(Log(weekly_earn + 1) ~ sleep time + sleep2, data = data) sunmary (1m) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2196s2nen017 wi call a In(Formula WH Residuals: Min 19. Median -6.5102 -0.4274 0.1193 Project Data Exploration 3Q Max 0.5903 1.9707 = log(weekly_earn + 1) ~ sleep_time + sleep2, data = data) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl coefficient: Estimate 6.469¢+20 4,299e-04 -1, 1080-06 Std. Error t value Pr(>|t|) 2.982e-@2 216.937 < 2e-16 *** 1.110e-04 3.872 @.0ee108 *** 1,019e-@7 -10.796 < 2e-16 *** a a a wt a 4 (Intercept) i sleep_time 4H sleep2 a a a wt wt at ae Signif. 
codes: @ 0.001 '**' 9.01 '** 0.05 Residual standard error: 0.9012 on 96746 degrees of freedom (83094 observations deleted due to missingness) Multiple R-squared: 0.01205, Adjusted R-squared: 0.01203 F-statistic: 589.9 on 2 and 96746 DF, p-value: < 2.2e-16 wBase Model with sleep, demographic data In2 <- Im(Log(weekly_earn + 1) ~ sleep_tine + sleep2 + age, data = data) sunmary (1m2) call An(formula = log(weekly_earn + 1) ~ sleep_time + sleep2 + age, data = data) Residuals: Min 19. Median 3Q Max -6.8897 -0.4186 0.1226 0.5822 1.9094 Coefficient: Estimate Std. Error t value Pr(>|t|) (Intercept) 6.086e+00 3.080e-02 197.598 <2e-16 *** Sleep_tine 2.731e-04 1.100e-04 2.483 0.013 * sleep2 -8.638e-07 1.@11e-07 -8.547 <2e-16 *** age 9.518e-@3 2.179e-04 43.677 <2e-16 *** Signif. codes: @ '*#*" 2.001 '**' 0.01 '*' 0.05 '.' 2° 1 Residual standard error: 0.8924 on 96745 degrees of freedom (83094 observations deleted due to missingness) Multiple R-squared: 0.3115, Adjusted R-squared: 0.03112 F-statistic: 1037 on 3 and 96745 DF, p-value: < 2.2e-16 PRP PEPER RRR RTE PER 236s2nen017 Project Data Exploration #Base Model with sleep, demographic data An3 <- Im(Log(weekly_earn + 1) ~ sleep_time + sleep2 + age + sex, data = data) sunmary (1m3) call sex, data = data) Residuals: Min 1Q. Median 3Q Max -7.0895 -0.4090 0.1136 0.5649 1.9935 FRRRREE Coefficients: Estimate Std. Error t value Pr(>/t|) (intercept) 6.203e+08 3.022¢-02 205.226 < 2e-16 sleep_tine 4.719e-04 1.078e-04 4.378 1.2e-05 sleep? _—-1,001e-96 | 9.902e-08 -10.114 < 2e-16 age 9.817e-03 2.135e-04 45.981 < 2e-16 sexfemale -3.608e-01 5.640e-03 -63.972 < 2e-16 Signif. codes: @ '*#*" 2,001 '**' 0.01 '*" 0.05 (83094 observations deleted due to missingness) Multiple R-squared: 0.07047, Adjusted R-squared: SEPRRRE TPE RR RR RE #8ase Model with sleep, demographic data (OPTIMAL MODE! 
Ana <- Im(Log(weekly_earn + 1) ~ sleep_time + sleep2 + age + sex + edu + occup_code, data = data) sunmary (1m) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl An(formula = log(weekly_earn + 1) ~ sleep_time + sleep2 + age + Residual standard error: @.8741 on 96744 degrees of freedom L) 2.07044 F-statistic: 1834 on 4 and 96744 DF, p-value: < 2.2e-16 236s2nen017 call sex + edu + occup_code, data = dat: Residuals: Min 19 Median 3Q Ma -7.5721 -0.2985 0.1018 0.4458 2.268: Coefficients: (Intercept) 6 sleep_tine “4 sleep2 1 age 5 sexfenale 3 eduhs diploma 5 edusone college 5 eduassociate degree 7. 9 1 1 1 2 7 ZRRPRE TETRA R RRR RP ESER edubachelor's degree i edunaster's degree it eduprof degree it edudoctoral degree #4 occup_codeprofessional + 4 occup_codeservice - #8 occup_codesales 4 occup_codeoffice_admin i occup_codefarming_forestry fishing -5. i occup_codeconstruction “1. i occup_codeinstall_repair_maint — -1. 4H occup_codeproduction -2. 4 occup_codetransport “4, a 44 (Intercept) ad 4H sleep_time 4H sleep2 . it age - i sexfemale aad 4 eduhs diploma a 4H edusone college a HH eduassociate degree a 4 edubachelor's degree 4 edunaster's degree o ## eduprof degree # edudoctoral degree ad ‘i occup_codeprofessional HH occup_codeservice oad 4 occup_codesales a 4 occup_codeoffice_admin a #8 occup_codefarming forestry fishing *** #4 occup_codeconstruction oo ## occup_codeinstall_repair_maint —*** 4H occup_codeproduction fle: Usersejche!Documents/Schoo!:2017-181ECON%420131/ProjeclProject_Data_Exploration_VS.himl Project Data Exploration a) 2 3540-01 -148e-01 -0480+00 a3ie+ee .1870+08 .A65e-01 -A1de-01 -56e-01 -111e-01 986e-01, 632e-01 5ite-01 953e-01, 8420-01 2 -8Ade-02 418¢-05 6640-08 8950-04 3620-03 5520-03, 2100-02 -134¢-02 -047¢-02 -251e-02 3220-02 1500-02 1260-03, 5880-03, -051¢-02 -443¢-03 -087¢-02 -471¢-02 555¢-02 .233¢-02 2900-02 218. 2 31 -64. 54. 57. 64. 87. 83. 48 5s. -30. 71. -52. -43. -19. “1. 
9 -23 -37 In(formula = log(weekly_earn + 1) ~ sleep_time + sleep2 + age + Estimate Std. Error t value Pr(>|t|) -2090+00 .978e-04 9340-07 8840-03 4570-01 1930-01 8560-01 333 < 20-16 286 1.25¢-07 232 0.0256 051 < 2e-16 474 < 20-16 365 < 2e-16 994 < 20-16 B19 < 2e-16 268 < 2e-16 126 < 2e-16 688 < 2e-16 1ea < 20-16 338 < 2e-16 971 < 20-16 9e1 < 20-16 532 < 2e-16 986 < 2e-16 089 < 2-16 71d ¢ 20-16 954 < 2e-16 1538 < 2e-16 2596s2nen017 Project Data Exploration 4 occup_codetransport ” wh WH Signif. codes: @ '*#*" @.001 '**' 0.01 '*' 0.05 '.' 2° 1 a 4 Residual standard error: 0.7617 on 96728 degrees of freedom 4 (83094 observations deleted due to missingness) 4 Multiple R-squared: 0.2944, Adjusted R-squared: 9.2942 fi F-statistic: 2018 on 28 and 96728 DF, p-value: < 2.2e-16 HE WERE GOING TO PUT IN OTHER STUFF, BUT IT SUFFERS FROM COLLINEARITY Bgplot(data) + geom_smooth(aes(x = sleep_time, y = work_time)) + ggtitle("Work and Sleep") + x12 b("Sleep") + ylab("Work") 4 “ geom_smooth()” using method = ‘gam* 4H Warning: Removed 110911 rows containing non-finite values (stat_snooth). Work and Sleep 600 ¥ 400 6 Ss 200 0 300 600 900 Sleep #uith sleep, demographic data, work time, play time Inwp <- Im(log(weekly_earn + 1) ~ sleep_time + sleep2 + age + sex + edu + occup_code + work time + play_time, data = data) summary (1mwp) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2696s2nen017 call Residuals: Min 10. Median BQ Ma -7.575@ -0.3061 0.0748 0.4198 2.115: Coefficients: (Intercept) 5. sleep_tine 4 sleep2 2. age 6. sexfenale 3. eduhs diploma 4. ZRRPRE TETRA R RRR RP ESER edusone college 5 eduassociate degree 6 edubachelor's degree 8 it edunaster's degree 9 it eduprof degree 1 4 edudoctoral degree 1 #4 occup_codeprofessional 2 4 occup_codeservice “6 #4 occup_codesales 4 occup_codeoffice_admin 4 occup_codefarming_forestry_fishing i occup_codeconstruction a1 wi occup_codeinstall_repair_maint — -1. 4H occup_codeproduction 23. 
4 occup_codetransport “4. 4 work time 5. #4 play_time 2. wt 4s (Intercept) “ # sleep_time 4H sleep? : 4H age a 4H sexfenale a 4H eduhs diplona a 4 edusone college 4 eduassociate degree o 4 edubachelor's degree # edunaster's degree ad tt eduprof degree i edudoctoral degree a 4 occup_codeprofessional a iit occup_codeservice a #8 occup_codesales a #4 occup_codeoffice_admin oo ## occup_codefarming_forestry_fishing *** 4 occup_codeconstruction fle: Usersejche!Documents/Schoo!:2017-181ECON%420131/ProjeclProject_Data_Exploration_VS.himl Project Data Exploration 7 970e+00 240-05 267-07 1900-03 1882-21 3220-01 -184e-01 3960-01 -326e-01 -518e-01 416400 .0840+08 .153e-01 9960-01 299-01 .137e-01 268-01 998e-01, 819e-01 337e-@1 959e-01, 1780-24 5480-04 Estimate Std. .017e-02 4 2840-04 2880-07 4280-04 7680-03 -231¢-02 -298¢-02 4560-02 -337e-02 -S71e-02 7620-02, 5080-02 -083¢-02 -177¢-02 -273¢-02 -196¢-02 -698e-02 -883e-02 938e-02 +563¢-02 6220-02 9996-05 .713¢-05 od 1s. -2. -1 25. -47. 35 39. 43. 62. 68. 37 43. -21. -59. -41. -34, -16. -10. “9 -21. -30. 25 9. In(formula = log(weekly_earn + 1) ~ sleep_time + sleep2 + age + sex + edu + occup_code + worktime + play tine, data fata) Error t value -612 314 759 497 103 99 932 921 276 552 678 aaa 468 421 626 593, 929 610 384, 352 577 9@7 394 Pr(oitl) <2e-16 0.7539 0.0785 «20-16 «20-16 «20-16 <2e-16 <2e-16 <2e-16 <2e-16 «20-16 «20-16 «20-16 «20-16 «2e-16 <2e-16 <2e-16 <2e-16 <2e-16 «20-16 «20-16 «20-16 «2e-16 27736s2nen017 Project Data Exploration 4H occup_codeinstall_repair_maint — *** 4H occup_codeproduction a 4 occup_codetransport a 4 work time a #8 play_time oo ft - iH Signif. codes: @ '*#** 0.001 '**' 9.01 '*' 0.05." 
0.25 ae 8 Residual standard error: 0.7273 on 55860 degrees of freedom i (123960 observations deleted due to missingness) WH Multiple R-squared: 0.3094, Adjusted R-squared: 9.3092 WH F-statistic: 1138 on 22 and S586 DF, p-value: < 2.2e-16 #ulith sleep, demographic data, work time, play time, and health time Amal <- Im(log(weekly_earn + 1) ~ sleep_tine + sleep2 + ‘age + sex + edu + occup_code + worktime + play_tine + health_time + sport_time + vol_time, data = data) sunmary (1mal1) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2036s2nen017 a wi call 8 Im(Formula = log(weekly_earn + 1) ~ sleep_time + sleep2 + age + WH sex + edu + occup_code + worktime + play_time + health_time + WH sport_time + vol_time, data = data) a it Residuals: a Min 1Q Median 39 Max Wi -1,12580 -0.26159 0.00369 0.26248 0.96205, a WH Coefficients: a Estimate Std. Error t value Pr(>/tl) 4H (Intercept) 4.631100 2.839e+00 1.631 0.1288 4H sleep_time 5.a13e-@3 8.851e-03 0.612 @,5522 sit sleep +7.6970-06 1.1490-@5 -0.672 0.5158 a age 1,175e-@2 1.987e-02 0.591 @.5652 4 sexfenale -1.347e+09 5.873e-@1 -2.294 0.0407 4 edusone college -6.569e-02 9.421e-01 -0.078 0.9456 it eduassociate degree 4.635e-01 8.161e-01 0.568 0.5805 fit edubachelor’s degree -1.485e-02 7.423e-@1 -0.020 0.9844 i edunaster's degree 4.136e-01 7.995e-@1 0.517 0.6143 4 edudoctoral degree 1,935e1@@ 1.239409 1.562 @.1442 4 occup_codeprofessional 8.75Se-@1 6.163e-01 1.421 @.1809 #4 occup_codeservice 5.780e-@1 1.003e+00 8.576 @.5751 #8 occup_codesales 2.476e-@1 7.781e-01 0.318 @.7558 #4 occup_codeoffice_admin 3.110e-01 6.075e-01 0.512 @.6180, 4 occup_codeconstruction 8.105e-61 1.753e+00 0.462 @.6521 4 occup_codeproduction -1.219e-@1 1.189e+09 -.103 0.9200 8 work_time 1.457e-@3 1.649e-03 0.884 0.3942 i play_time -6.723e-04 1,977¢-@3 -0.348 0.7397 4 health_time 9.522e-@3 5.1@1e-03 1.867 2.0865 . 
## sport_time -1.719e-03 8.548e-03 -0.201 0.8440 ## vol_time -1.176e-03 3.751e-03 -0.313 0.7593 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 0.7829 on 12 degrees of freedom ## (179818 observations deleted due to missingness) ## Multiple R-squared: 0.6659, Adjusted R-squared: 0.109 ## F-statistic: 1.196 on 20 and 12 DF, p-value: 0.3841 One Way ANOVA for F-Statistic, ASSUMPTIONS FOR ANOVA FIRST TEST Project Data Exploration a1 <- aov(log(weekly_earn + 1) ~ occup_code, data=data) summary(a1) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2036s2nen017 Project Data Exploration ## Df Sum Sq Mean Sq F value Pr(>F) ## occup_code 9 13585 1509.4 2214 <2e-16 *** ## Residuals 96739 65941 0.7 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## 83094 observations deleted due to missingness a2 <- aov(sleep_time ~ occup_code, data=data) summary(a2) ## Df Sum Sq Mean Sq F value Pr(>F) ## occup_code 9 1.047e+07 1163397 74.25 <2e-16 *** ## Residuals 112330 1.768e+09 15670 ## --- ## Signif. 
codes: @ ‘'**** 8.001 '**' 0.01 '** 0.05 4H 67503 observations deleted due to missingness bizFinsleep <- data$sleep_time[datagoccup_code constructsleep <- datagsleep_tine[data$occup_code bizfinearn <- data$weekly_earn[data$occup_code constructearn <- data$weekly_earn{data$occup_code #sunmary (bizfinsLeep) #sunmary(constructsleep) t.test(bizFinsleep, constructsleep) Welch Two Sample t-test data: bizfinsleep and constructsleep t = -9.1121, df = 6956, p-value < 2.2e-16 alternative hypothesi: 95 percent confidence interval -22.39046 -14.46223 sample estinates: i mean of x mean of y 497.7677 16.1940 fPRRHRRRE te ‘ftsummary(bizfinearn) #tsummary(constructearn) t.test(bizFinearn, constructearn) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl ygmt_biz_finance"] construction") = "mgnt_biz_finance"] “construction” ] true difference in means is not equal to @ 30836s2nen017 Project Data Exploration 4 Welch Two Sample t-test wi data: bizFinearn and constructearn WH t = 42.317, df = 7816.5, p-value < 2.2¢-16 4 alternative hypothesis: true difference in means is not equal to @ 4 95 percent confidence interval wi 358.4114 393.2298 w## sample estimates: 4 mean of x mean of y # 1179.7642 803.9435 What is the nature of these differences? How can we express them and capture through linear model? flets run the previous Linear model, but seperating by the two occupations bizfin <- data[data$occup_code == "mgnt_biz_finance”, ] sleepbf <- (bizfingsleep_time)*2 construct <- data[datagoccup_code == "construction", ] sleepcon <- (construct $sleep_tine)*2 In6 <- Im(Log(weekly earn + 1) ~ sleep_tine + sleepbf + age + sex + edu, data = bizfin) summary (1n6) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 31136
You might also like
Verzani Answers
PDF
100% (8)
Verzani Answers
94 pages
R Basics
PDF
88% (8)
R Basics
8 pages
Thera Bank-Project
PDF
100% (12)
Thera Bank-Project
26 pages
Bank Rpubs
PDF
No ratings yet
Bank Rpubs
24 pages
PA Data Viz R Solution
PDF
No ratings yet
PA Data Viz R Solution
11 pages
Machine Learning Project
PDF
67% (3)
Machine Learning Project
30 pages
R Working Materials Prep
PDF
No ratings yet
R Working Materials Prep
43 pages
Logistic Regression Assignment
PDF
No ratings yet
Logistic Regression Assignment
20 pages
"Cps - TXT" "Education" "South" "SEX" "Experience" "Union" "WAGE" "AGE" "RACE" "Occupat Ion" "Sector" "MARR"
PDF
No ratings yet
"Cps - TXT" "Education" "South" "SEX" "Experience" "Union" "WAGE" "AGE" "RACE" "Occupat Ion" "Sector" "MARR"
9 pages
Project 5 PDF
PDF
100% (1)
Project 5 PDF
48 pages
Preprocessing - Preprocessing Your Data With R
PDF
No ratings yet
Preprocessing - Preprocessing Your Data With R
23 pages
R Practicals
PDF
No ratings yet
R Practicals
32 pages
Analysis Using Statistical: Introduction & Data Exploration
PDF
No ratings yet
Analysis Using Statistical: Introduction & Data Exploration
23 pages
R Working Manuals Students
PDF
No ratings yet
R Working Manuals Students
11 pages
SAS R::: Cheat Sheet
PDF
No ratings yet
SAS R::: Cheat Sheet
2 pages
R Commands
PDF
No ratings yet
R Commands
18 pages
Chapter3-Measures of Center
PDF
No ratings yet
Chapter3-Measures of Center
26 pages
DM Assignment - Thena Bank
PDF
No ratings yet
DM Assignment - Thena Bank
39 pages
Sas R
PDF
No ratings yet
Sas R
2 pages
Project3: Loading Library
PDF
No ratings yet
Project3: Loading Library
17 pages
Da (22C01156)
PDF
No ratings yet
Da (22C01156)
26 pages
Project On Data Mining-Raveendra Babu Gaddam
PDF
No ratings yet
Project On Data Mining-Raveendra Babu Gaddam
29 pages
BA Project - Section 1 Group 1
PDF
No ratings yet
BA Project - Section 1 Group 1
27 pages
Mini Project-Data Mining
PDF
No ratings yet
Mini Project-Data Mining
25 pages
s05 Solution
PDF
No ratings yet
s05 Solution
15 pages
Project 4 - Cars-Datasets PDF
PDF
100% (2)
Project 4 - Cars-Datasets PDF
44 pages
Rubel Assignment 2
PDF
No ratings yet
Rubel Assignment 2
7 pages
Exploratory Data Analysis in R
PDF
No ratings yet
Exploratory Data Analysis in R
33 pages
Predictive+Modelling+-+Logistic+Regression+-+Student+Version-New2.3.ipynb - Colaboratory
PDF
No ratings yet
Predictive+Modelling+-+Logistic+Regression+-+Student+Version-New2.3.ipynb - Colaboratory
12 pages
R Note
PDF
No ratings yet
R Note
56 pages
TITLE: Bank Marketing Classification: Submitted To: Dr. Supriya Kumar de Professor XLRI, Jamshedpur
PDF
No ratings yet
TITLE: Bank Marketing Classification: Submitted To: Dr. Supriya Kumar de Professor XLRI, Jamshedpur
18 pages
Produit Bancaire
PDF
No ratings yet
Produit Bancaire
15 pages
Salary Prediction
PDF
No ratings yet
Salary Prediction
32 pages
FIT3152 Data Analytics. Tutorial 01: Introduction To R. Review of Basic Statistics
PDF
No ratings yet
FIT3152 Data Analytics. Tutorial 01: Introduction To R. Review of Basic Statistics
4 pages
BDA MSC It
PDF
No ratings yet
BDA MSC It
35 pages
ANZ Virtual Internship Module Model Answer For Task 1
PDF
No ratings yet
ANZ Virtual Internship Module Model Answer For Task 1
7 pages
Praktikum Modul 3
PDF
No ratings yet
Praktikum Modul 3
5 pages
SOC 210 Lab Assignment #2
PDF
No ratings yet
SOC 210 Lab Assignment #2
7 pages
Bellabeat R Script Template
PDF
No ratings yet
Bellabeat R Script Template
4 pages
Project Employee Absenteeism
PDF
No ratings yet
Project Employee Absenteeism
33 pages
Modelling With R
PDF
No ratings yet
Modelling With R
3 pages
07 HR
PDF
No ratings yet
07 HR
15 pages
Home Credit Data
PDF
No ratings yet
Home Credit Data
6 pages
EDA Python Code Cheatsheets
PDF
No ratings yet
EDA Python Code Cheatsheets
52 pages
Awini Mustapha-Project1
PDF
No ratings yet
Awini Mustapha-Project1
8 pages
21BCS5999 - Ankit Kumar (Assignment 2)
PDF
No ratings yet
21BCS5999 - Ankit Kumar (Assignment 2)
16 pages
R Examples
PDF
No ratings yet
R Examples
56 pages
Thera Bank - Project
PDF
100% (4)
Thera Bank - Project
34 pages
Working With Data
PDF
No ratings yet
Working With Data
38 pages
Analysis Report
PDF
No ratings yet
Analysis Report
8 pages
Lab2
PDF
No ratings yet
Lab2
22 pages
013 Plotting Predictors
PDF
No ratings yet
013 Plotting Predictors
14 pages
Data Wrangling
PDF
No ratings yet
Data Wrangling
12 pages
IntroR 2
PDF
No ratings yet
IntroR 2
18 pages
Results
PDF
No ratings yet
Results
7 pages
Cart Project
PDF
75% (4)
Cart Project
17 pages