0% found this document useful (0 votes)
22 views

Final Draft

Uploaded by

cjchen2810
Copyright
© © All Rights Reserved
Available Formats
Download as PDF or read online on Scribd
0% found this document useful (0 votes)
22 views

Final Draft

Uploaded by

cjchen2810
Copyright
© © All Rights Reserved
Available Formats
Download as PDF or read online on Scribd
You are on page 1/ 36
s2nen017 Project Data Exploration Project Data Exploration Chris Chen November 26, 2017 FORMAT PLOTS CHANGE COLORS Exploratory Data Analysis #Install Packages ‘#install. packages (‘Lubridate’) winstallpackoges( “dplyr’) Hinstal. packages (‘ggptot2") #install.packages(‘atus") #install.packages(‘cowplot’) ‘#install.packages(‘ggmosaic’) Run Libraries Library (éplyr) i Warning: package ‘dplyr’ was built under R version 3.3.3 ae wi Attaching package: ‘dplyr* 4H The following objects are masked from "package:stats* a filter, lag 4 The following objects are masked from ‘package:base': a WH intersect, setdiff, setequal, union Library (1ubridate) 4H Warning: package ‘lubridate’ was built under R version 3.3.3 wt # Attaching package: ‘lubridate’ it The following object is masked from ‘package:base’: ae WH date fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1196 s2nen017 Project Data Exploration Library (ggplot2) 4 Warning: package ‘ggplot2‘ was built under R version 3.3.3 Library(etus) 4H Warning: package ‘atus’ was built under R version 3.3.3 Library (cowplot) at a Attaching packag “conplot* 4H The following object is masked from ‘package:ggeplot2" wh WH gesave Library (ggnosaic) 4 Warning: package ‘ggnosaic’ was built under R version 3.3.3 4H Loading required package: productplots 4H Warning: package 'productplots' was built under R version 3.3.3 a2 ‘Attaching package: ‘ggmosaic’ The following objects are masked from “package:productplots': 222 decker, hspine, mosaic, prodcalc, spine, vspine #CoLum Names names (atuscps) WH [1] “tucaseid’ region" “state wt [5] “age” edu" "pace" wi [9] “country_born” “citizer "marital 4 [13] "Famincone” names(atusresp) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2186 s2nen017 Project Data Exploration 4 [1] “tucaseid” “tuyear® #8 [3] “diary_no Hary_day sn [5] “holiday occup_code* 4 [7] “ind_code" abor_status” 4 [9] “student_status" pee #8 [11] "work_class" “ourly_wage" 8 [13] “weekly_earn "work_hr's_week’ #8 [15] “mult_jobs" “partner_hh #8 [17] “partner_works” “partner_ptft" #8 [19] “hh_size” chil ## [21] “hh_child_youngest_age" names (atusact) # [1] “tucaseid wStore the demographic and activity information in seperate datafranes demographic <- atuscps work <- atusresp activities <- atusact Data Description Data Treatment ‘#SLeeptine sleeptine <- activities %% filter(tiercode >= @10100 & tiercode < 010200) %>% group_by(tucaseid) %% sunmarize(sleep time = sum(dur)) 4H Warning: package ‘bindrepp’ was built under R version 3.3.3 fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl a6 s2nen017 Project Data Exploration fHiorktime worktine <- activities %% filter(tiercode >= @50000 & tiercode < 060000) %>% group_by(tucaseid) %% sunmarize(work_time = sum(dur)) #PLaytine playtime <- activities %% filter(tiercode >= 120000 & tiercode < 130000) %>% group_by(tucaseid) %% sunmarize(play time = sum(dur)) ‘#VoLunteertime voltime <- activities %>% filter(tiercode >= 150000 & tiercode < 160000) %>% group_by(tucaseid) %% sunmarize(vol_time = sum(dur)) ‘#Sporttine sporttime <- activities %% filter(tiercode >= 130000 & tiercode < 14000) %>% group_by(tucaseid) %% sunmarize(sport_time = sum(dur)) fhealthtine healthtime <- activities %% filter(tiercode &@ tlercode < 080500) %>% group_by(tucaseid) %% sunmarize(health_time = sum(dur)) 010300 & tiercode < 010400 | tiercode >= e80400 #Herge datafranes into individual profiles data <- merge(denographic, work, by = “tucaseid”) data <- merge(data, sleeptine, by = “tucaseid", all.x = T) data <- merge(data, worktine, by = “tucaseid", all.x = 1) data <- merge(data, playtine, by = “tucaseid", all.x = 7) data <- merge(data, voltine, by = “tucaseid", all.x = T) data <- merge(data, sporttine, by = “tucaseid*, all.x = T) data <- merge(data, healthtine, by = "tucaseid", all.x = T) ‘#colnames (data)[35] <- “sleep_time” data$sleep_time[is.na(data$sleep_tine)] <- @ datagwork_tine[is.na(data$work_time)] <- 0 data$play_tine[is.na(datagplay_tine)] <- 0 data$vol_tine[is.na(data$vol_tine)] <- @ data$sport_time[is.na(dataSsport_time)] <- @ datathealth_time[is.na(data$health_time)] <- 0 ‘#hleekday or Weekend data <- data %% mutate(wday ynd(paste(tuyear, diary no, diary day, sep="="))) wday(wday, Label=TRUE)) data <- data %% mutate(wday fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 4136 s2nen017 Project Data Exploration EXAMINING THE DATA SET'S OCCUPATION TYPE OCCURENCES select <- levels (work$occup_code) count <- rep(NA, length(select)) counter <- 1 for (i in select) ( print (i) count [counter] <- length(work$occup_code[work$occup_code print (count[counter]) counter <- counter + 1 } [1] “mgmt_biz_finance” [1] 87701 [1] "professional" [1] 96470 [1] "service" [2] 86166 [1] “sales' [1] 79548 [1] “office_admin” [1] 83157 [1] “farming forestry fishing" [1] 69155 [1] "construction" [a] 73364 (2) “instali_repair_naint” (1) 71898 [1] "production" [1] 74745 [1] "transport" [1] 74182 2222 FRPRPRETEPR RRR RE Specific attributes of variables: ‘Wage: USD Age: Years Health Related Care: Minutes/Surveyed Day Week/Weekend: Binary Variable, “Yes” or “No” Weekday: “Sun, Mon, Tue, Wed, Thu, Fri, Sat” Job Type/Work Class: “govt, private, self-employed, without_pay” Employment Status: “employed-at work, employed-absent, unemployed-layoff, unemployed-looking, not_in_labor_force” Student Status: "Yes", "No", NA Time spent working: Minutes/Surveyed Day Time spent playing: Minutes/Surveyed Day Time spent volunteering: Minutes/Surveyed Day Time spent sporting Minutes/Surveyed Day Education: “< hs diploma, hs diploma, some college, associate degree, bachelor's degree, master's degree, prof degree, doctoral degree” Marital Status: “married, divorced, seperated, widowed, never married” Parental Status: “Yes”, “No” Information about discrete variables ‘Number of Observations nrow(data) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 596 s2nen017 Project Data Exploration we [2] 181335 sleep time sunmnary (slept ime[2]) ft sleep_tine a Min, 1.8 WH Ist Qu.: 450.0 aH Median : 515.0 aH Mean: 529.2 4H 3rd Qu.: 600.0 WH Max. :1436.0 sleep <- as.numeric(unlist(sleeptime[2])) sd(sleep) wi [2] 134.8153 var(sleep) sw [2] 18175.17 worktime summary (worktime[2]) WH worktime wHoMing 3: 1.8 HH Ast Qu.: 285.0 WH Median : 465.0 wH Mean: 419.2 WH 3rd Qu.: 540.0 WH Max, :1430.0 working <- as-numeric(unlist(worktime[2])) sd(working) # [1] 207.9235 var(working) wa [2] 43232.18 fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 66 s2nen017 play _tine sunmary (playtime[2]) at play_time a8 Min. 1.0 WH Ist Qu.: 150.0 Wt Median : 270.0 WH Mean: 308.2 3rd Qu.: 430.0 wi Max, :1439.0 play <- as.nuneric(unlist(playtime[2])) sd(play) wi [1] 204.3468 var(play) aa [1] 41756.8 fvol_time sunmary(voltine[2]) ¥# ——-vol_tine we Ming: 18 HH Ast Que: 45.0 Hit Median : 95.0 ft Mean: 134.7 ft 3rd Qu.: 180.0 Ht Max, :1315.0 vol <- as.nuneric(unlist (voltime[2])) sd(vol) a [2] 132.5571 var(vol) fw [1] 17307.27 wsport_time sunmary (sporttime[2]) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl Project Data Exploration 7136 s2nen017 Project Data Exploration HH sport_time we Mins: 18 aa Ast Qui: 45.0 WH Median: 60.0 WH Mean: 106.7 wH 3rd Qu.: 120.0 WH Max. 11260.0 sport <- as.numeric(unlist(sporttime[2])) sd(sport) w# [1] 104.0516 var(sport) we [1] 10826.73 #health_time sunmary (healthtime[2]) wt -health_time HB Min. 1,00 WH Ast Que: 5.00 Wt Median : 30.00 a Mean: 88.89 wH 3rd Qu.: 90.00 WH Max. :1430.00 health <- as.numeric(unlist(healthtine[2])) sd(health) a [1] 161.5228 var(health) # [1] 26089.63, wages per week sunmary (data$weekly_earn) fH Min, Ist Qu. Median Mean 3rd Qu. Max. NA’ WH = 0.0 396.0 686.8 853.5 1140.@ 2885.@ 81127 fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl ae s2nen017 sd(dataweekly_earn, na.rm = T) sw [2] 647.4775 var(data$weekly_earn, na.rm = T) a [2] 419227.1 tage sunmary (data$age) a Min, Ist Qu. Median Mean 3rd Qu. WH 15.00 33.00 46.00 46.83 60.00 sd(datagage) 4H (1) 17. 76646 var(datasage) wi (1) 315.6471 Correcting for Outliers and Setting Bounds Trimming: (to make our analysis more effective) WAGE : WE ARE TAKING WAGES LESS THAN 3000 WORKTIME: WE ARE CONSIDERING ONLY THOSE WHO WORK (WORK TIME POSITIVE) VOLUNTEERTIME: WE ARE CONSIDERING ONLY THOSE WHO VOLUNTEER (VOL TIME POSITIVE) SPORTTIME: WE ARE CONSIDERING ONLY THOSE WHO PLAY SPORTS (SPORT TIME POSITIVE) HEALTHTIME: WE ARE CONSIDERING ONLY THOSE WHO SPENT TIME ON HEALTH (HEALTH TIME POSITIVE) Allof the other values will be set to NA (not 0), so that they don't affect analysis. fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl Project Data Exploration Max. 85.00 936 s2nen017 Project Data Exploration HERE WE ARE TRIMMING OUTLIERS 4 OR MORE STANDARD DEVIATIONS OVER THE MEAN #Trim wages data$weekly_earn[data$weekly_earn 884.61] <- NA #7ine spent sleeping select <- mean(sleep) + 4 * sd(sleep) data <- data[datassleep_tine < select, } #Time spent working select <- mean(working) + 4 * sd(working) data <- data[datagwork_time < select, ] data$work_tine[data$work_time == @] <- NA #Time spent playing select <- mean(play) + 4 * sd(play) data <- data[data$play_tine < select, ] #Time spent volunteering select <- mean(vol) + 4 * sd(vol) data <- data[data$vol_tine < select, ] data$vol_time[datagvol_time == 0] <- NA #Time spent sporting select <- mean(sport) + 4 * sd(sport) data <- data[datassport_tine < select, ] data$sport_tine[data$sport_time == 0] <- NA #Tine spent caring for their health select <- mean(health) + 4 * sd(nealth) data <- data{datashealth_tine < select, ] dataghealth_tine[datashealth tine == @] <- NA Some important plots for continuous variables: NOTE NOTE WE ARE REMOVING ZERO TERMS AND VALUE CEILINGS FOR THE SAKE OF DATA ANALYSIS, KEEP THAT IN MIND WHEN MAKING THE WRITUP. “OUT OF PEOPLE WHO SPEND SOME TIME PLAYING, THEY DO __" and "FOR PEOPLE WITH LESS THAN THIS AMOUNT, THEY DO ALSO, CHANGE OUR MOTIVATIONS SECTION TO REFLECT WHAT PENG PENG SAYS, AND WHAT WE ARE TRYING TO PREDICT. Fequire(cowplot) #lage Distribution histwage <- geplot(data, aes(weekly_earn)) + geom_histogram(bins = 108) + coord_cartesian(xlim = €(-8, 3000)) + ggtitle("Histogram of Wage") + xlab("Wage(Per Week)") + ylab("Frequency") histwage ‘it Warning: Renoved 83094 rows containing non-finite values (stat_bin). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 10786 sangiz017 Project Data Exploration Histogram of Wage 3000 3 2000 2 g S = 2 c 1000 0 0 1000 2000 3000 Wage(Per Week) age Distribution and Sleep vs. Age histage <- ggplot (data, aes(age)) + geom_histogram(bins = 3) + coord_cartesian(xLim = c(-8, 100 )) + getitle("Histogran of Age”) + xlab(“Aage(Yrs)") + ylab("Frequency") Lineage <- ggplot (data) + geom_smooth(aes(x = age, y = sleep_tine)) + ggtitle("Sleep Vs. Age") + xlab("Age(¥rs)") + ylab("SLeep(Min/Day)") plot_grid(histage, lineage) ‘ia ~geom_smooth()~ using method fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1136 sangiz017 Project Data Exploration Histogram of Age Sleep Vs. Age 9000 575 z s 8 S 6000 gs iS = 550 & = uw 3 oO 3000 525 0 0 25 50 75 100 20 40 60 80 Age(Yrs) Age(Yrs) ‘#SLeeptime Distribution and Plot of Wage vs. Sleeptime histsleep <- ggplot(data, aes(sleep_tine)) + geomhistogram(bins = 58) + coord_cartesian(xlim = c(-@, 130@)) + ggtitle("Histogram of Sleep Time") + xlab("Sleep(Min/Day)") + ylab("Frequency") Linesleep <- ggplot(data) + geon_snooth(aes(x = sleep_tin s. Sleep") + xlab("Sleep(Min/Day)") + ylab("Wage(Per Week)") y = weekly earn)) + getitle("wage V plot_grid(histsleep, linesleep) ‘ia ~geom_smooth()~ using method 4 Warning: Removed 83094 rows containing non-finite values (stat_smooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 12138 s2nen017 Project Data Exploration Histogram of Sleep Time Wage Vs. Sleep 15000 800 = 3 > 3 2 10000 = 700 3 5 a a 2 oOo ms gs 5000 = 600 500 al 0 500 1000 0 300 600 900 Sleep(Min/Day) Sleep(Min/Day) f#Hlorktime Distribution and Plot of Wage vs. Worktime histwork <- geplot(data, aes(work time)) + geom_histogram(bins = 8) + coord_cartesian(xlin = c( -®, 900) + ggtitle("Histogram of Work Time") + x1ab("Work(Min/Day)") + ylab("Frequency") Linework <- ggplot (data) + geom_snooth(aes(x = work time, y Work Time") + xlab("Work(Min/Day)") + ylab("Wage(Per Week) reekly_earn)) + ggtitle("Wage Vs. plot_grid(histwork, Linework) 4 Warning: Removed 11911 rows containing non-finite values (stat_bin). 4 ~geon_smooth()” using method = ‘gam* a Warning: Renoved 123960 rows containing non-finite values (stat_smooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 19986 sangiz017 Project Data Exploration Histogram of Work Time Wage Vs. Work Time 1200 6000 = 1000 o > 3 5 4000 = S 5 = a 2 oOo ot @ 800 8 = 2000 600 0 0 250 «500750 ° 400 800 1200 Work(Min/Day) Work(Min/Day) #Playtime Distribution and Plot of Wage vs. Playtime histplay <- ggplot(data, aes(play_time)) + geom_histogram(bins = $8) + coord_cartesian(xlin = c( -®, 150) + ggtitle( "Histogram of Play Time") + xlab("Play(Min/Day)") + ylab("Frequency") Lineplay <- geplot(data) + geom_smooth(aes(x = play_time, y = weekly earn)) + ggtitle("Wage Vs. Play Time") + xlab("Play(Min/Day)*) + ylab("Wage(Per Week)") plot_grid(histplay, lineplay) 4 ~geon_smooth()* using method = "gan* 4 Warning: Removed 83094 rows containing non-finite values (stat_smooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1438 sangiz017 Project Data Exploration Histogram of Play Time Wage Vs. Play Time 800 = a 8 700 s go" = o a i & 600 = 500 0 500 1000 ©1500 0 300 600 900 Play(Min/Day) Play(Min/Day) #Wolunteertime Distribution and Plot of age vs. Volunteer Time histvol <- ggplot (data, aes(vol_time)) + geom_histogram(bins = 58) + coord_cartesian(xlim = c(-@ » 450) + ggtitle("Hist of Volunteer Time”) + x1ab("Volunteer(Min/Day)") + ylab("Frequency") Linevol <- ggplot (data) + geom_smooth(aes(x = vol_time, y = weekly_earn)) + ggtitle(*wage Vs. Vo 1 Tine") + xlab("Volunteering(Min/Day)") + ylab("Wage(Per Week)") plot_grid(histvol, linevol) i Warning: Renoved 167673 rows containing non-finite values (stat_bin). 4 ~geon_smooth()” using method = ‘gam* a Warnin + Removed 173672 rows containing non-finite values (stat_smooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1598 s2nen017 Project Data Exploration Hist of Volunteer Time Wage Vs. Vol Time 840 = > 3 2 = eS B 800 E é = 760 0 100 200 300 400 0 200 400 600 Volunteer(Min/Day) Volunteering(Min/Day) ‘#Sporttine Distribution and Plot of Wage vs. Sporttime histsport <- ggplot(data, aes(sport_tine)) + geom_histogram(bins = 58) + coord_cartesian(xlim = €(-®, 45@)) + ggtitle("Hist of Sport Time") + xlab("Sport(Min/Day)") + ylab("Frequency") Linesport <- ggplot(data) + geon_snooth(aes(x = sport_time, y s. Sport Time") + xlab("Sport(Min/Day)") + ylab("Wage(Per Week)’ feekly_earn)) + ggtitle(*Wage V plot_grid(histsport, linesport) i Warning: Renoved 146819 rows containing non-finite values (stat_bin). 4 ~geon_smooth()” using method = ‘gam* 4 Warning: Renoved 162830 rows containing non-finite values (stat_smooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1696 s2nen017 Project Data Exploration jist of Sport Time Wage Vs. Sport Time 6000 900 4000 3 S © 800 2 = e é i 3 8 700 2000 = 600 0 0 100 200 300 400 0 100 200 300 400 500 Sport(Min/Day) Sport(Min/Day) #Healthtime Distribution and Plot of Wage vs. Healthtine histhealth <- ggplot(data, aes(log(health_time + 1))) + geon_histogran(bins = S@) + coord_cartes ian(xlim = c(-8, 6)) + ggtitle("Hist of Health Time") + xlab("Healthtime(Min/Day)") + ylab("Freq uency") Linehealth <- ggplot(data) + geom_smooth(aes(x = health_time, y Vs. Health Time") + xlab("Work(Min/Day)") + ylab("Wage(Per Week") jeekly_earn)) + ggtitle("Sleep plot_grid(histhealth, Linehealth) HH Warning: Removed 164263 rows containing non-finite values (stat_t 4H “geon_smooth()” using method = ‘gam* 4 Warning: Removed 175449 rows containing non-finite values (stat_snooth). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 7186 s2nen017 Project Data Exploration Hist of Health Time Sleep Vs. Health Time 1500 900 1000 S 800 > 2 g 7 Ss 5 a 3 & 700 « 2 500 s 600 500 0 0 2 4 6 0 200 400 600 Healthtime(Min/Day) Work(Min/Day) Some important plots for catagorical variables: Fequire(ggnosaic) #osaicplot of Occupation Type and Work Class geplot(data) + geon_mosaic(aes(x=product (occup_code), fill = factor(work_class)), na.rm=T) + ggt itle("Occupation Type and Sector") + theme(axis.text.x = elenent_text(angle = 98, hjust = 1)) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 1996 sangiz017 Project Data Exploration Occupation Type and Sector factor(work_class) govt private self-employed without_pay 3 3 @ g £ SSESE 2 e 2 2 ¢€ §ss 5 S = 8 8 SES 3 E 3 8 & 2353 6 é 1 Fe 3 6 l 2 g Wea 8s ° S585 2 8 i) a 5 g88 E gg 2 33 e gs E £ product(occup_code) Some important plots of mixed continuous and catagorical variables #Boxplot of Occupation Type and Sleep selectoce <- data[!is.na(data$occup_code),] Beplot(selectocc) + geom_boxplot(aes(x = occup_code, y = sleep_time)) + ggtitle("Sleep Vs. Work Class") + xlab("Work Class") + ylab("Sleep(Hrs)") + theme(axis.text.x = elenent_text(angle = 90 » just = 2)) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 19688 sangiz017 Project Data Exploration Sleep Vs. Work Class 8 zg = 600 See SeqbqqHo ® 300 cect 7 Wes S 3 = © 2 ¢@ § ® 3 &@ §$ & 8 #2 & 3 & 1 & a2 2 & ig g 2 8 § 8 & y ° 3 = 5 gS 5 a 5 8 8 B = s = a | a Ee 2 z : £ s Work Class ‘#BoxpLot of Occupation Type and Wage selectocc <- data[!is.na(datasoccup_code),] ggplot(selectoce) + geom_boxplot(aes(x = occup_code, y = weekly earn), na.rn = T) + ggtitle("Wag e Vs. Occupation Type") + xlab("Occupation Type") + ylab("wage(Per Weck)") + thene(axis.text.x = elenent_text(angle = 99, hjust = 1)) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 20138 s2nen017 Project Data Exploration Wage Vs. Occupation Type i ! 3000 y 8 3 8 Wage(Per Week) i in n L ° sales service transport professional office_admin construction production repait farming_forestry_fishing instal Occupation Type #oxplot of Highest Degree Attained and wage Bgplot(data) + geom_boxplot(aes(x = edu, y = weekly_earn)) + ggtitle("Wage Vs. Degree Attained”) + xlab("Highest Degree Attained") + ylab("Wage(Per Week)") + thene(axis.text.x = element_1 ngle = 98, hjust = 1)) ext(a 4 Warning: Removed 83094 rows containing non-finite values (stat_boxplot). fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2196 s2nen017 Project Data Exploration Wage Vs. Degree Attained 3000 y 8 3 8 3 s 8 Wage(Per Week) ° 8 % ° 2 8 2 8 9° E E 2 E = 2 2 2 2 ¢ % ff 8 & fF 2 = 8 3 3 3 3 3 a 2 2 2 2 2 Ss ma 2 £ & o 5 5 a S v a 3 o 3 3 8 2 3 8 E 3 2 s Highest Degree Attained Data Analysis Selecting an Optimal Model sleep2 <- (data$sleep_tine)*2 #Base Model with sleep Amt <- Im(Log(weekly_earn + 1) ~ sleep time + sleep2, data = data) sunmary (1m) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2196 s2nen017 wi call a In(Formula WH Residuals: Min 19. Median -6.5102 -0.4274 0.1193 Project Data Exploration 3Q Max 0.5903 1.9707 = log(weekly_earn + 1) ~ sleep_time + sleep2, data = data) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl coefficient: Estimate 6.469¢+20 4,299e-04 -1, 1080-06 Std. Error t value Pr(>|t|) 2.982e-@2 216.937 < 2e-16 *** 1.110e-04 3.872 @.0ee108 *** 1,019e-@7 -10.796 < 2e-16 *** a a a wt a 4 (Intercept) i sleep_time 4H sleep2 a a a wt wt at ae Signif. codes: @ 0.001 '**' 9.01 '** 0.05 Residual standard error: 0.9012 on 96746 degrees of freedom (83094 observations deleted due to missingness) Multiple R-squared: 0.01205, Adjusted R-squared: 0.01203 F-statistic: 589.9 on 2 and 96746 DF, p-value: < 2.2e-16 wBase Model with sleep, demographic data In2 <- Im(Log(weekly_earn + 1) ~ sleep_tine + sleep2 + age, data = data) sunmary (1m2) call An(formula = log(weekly_earn + 1) ~ sleep_time + sleep2 + age, data = data) Residuals: Min 19. Median 3Q Max -6.8897 -0.4186 0.1226 0.5822 1.9094 Coefficient: Estimate Std. Error t value Pr(>|t|) (Intercept) 6.086e+00 3.080e-02 197.598 <2e-16 *** Sleep_tine 2.731e-04 1.100e-04 2.483 0.013 * sleep2 -8.638e-07 1.@11e-07 -8.547 <2e-16 *** age 9.518e-@3 2.179e-04 43.677 <2e-16 *** Signif. codes: @ '*#*" 2.001 '**' 0.01 '*' 0.05 '.' 2° 1 Residual standard error: 0.8924 on 96745 degrees of freedom (83094 observations deleted due to missingness) Multiple R-squared: 0.3115, Adjusted R-squared: 0.03112 F-statistic: 1037 on 3 and 96745 DF, p-value: < 2.2e-16 PRP PEPER RRR RTE PER 236 s2nen017 Project Data Exploration #Base Model with sleep, demographic data An3 <- Im(Log(weekly_earn + 1) ~ sleep_time + sleep2 + age + sex, data = data) sunmary (1m3) call sex, data = data) Residuals: Min 1Q. Median 3Q Max -7.0895 -0.4090 0.1136 0.5649 1.9935 FRRRREE Coefficients: Estimate Std. Error t value Pr(>/t|) (intercept) 6.203e+08 3.022¢-02 205.226 < 2e-16 sleep_tine 4.719e-04 1.078e-04 4.378 1.2e-05 sleep? _—-1,001e-96 | 9.902e-08 -10.114 < 2e-16 age 9.817e-03 2.135e-04 45.981 < 2e-16 sexfemale -3.608e-01 5.640e-03 -63.972 < 2e-16 Signif. codes: @ '*#*" 2,001 '**' 0.01 '*" 0.05 (83094 observations deleted due to missingness) Multiple R-squared: 0.07047, Adjusted R-squared: SEPRRRE TPE RR RR RE #8ase Model with sleep, demographic data (OPTIMAL MODE! Ana <- Im(Log(weekly_earn + 1) ~ sleep_time + sleep2 + age + sex + edu + occup_code, data = data) sunmary (1m) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl An(formula = log(weekly_earn + 1) ~ sleep_time + sleep2 + age + Residual standard error: @.8741 on 96744 degrees of freedom L) 2.07044 F-statistic: 1834 on 4 and 96744 DF, p-value: < 2.2e-16 236 s2nen017 call sex + edu + occup_code, data = dat: Residuals: Min 19 Median 3Q Ma -7.5721 -0.2985 0.1018 0.4458 2.268: Coefficients: (Intercept) 6 sleep_tine “4 sleep2 1 age 5 sexfenale 3 eduhs diploma 5 edusone college 5 eduassociate degree 7. 9 1 1 1 2 7 ZRRPRE TETRA R RRR RP ESER edubachelor's degree i edunaster's degree it eduprof degree it edudoctoral degree #4 occup_codeprofessional + 4 occup_codeservice - #8 occup_codesales 4 occup_codeoffice_admin i occup_codefarming_forestry fishing -5. i occup_codeconstruction “1. i occup_codeinstall_repair_maint — -1. 4H occup_codeproduction -2. 4 occup_codetransport “4, a 44 (Intercept) ad 4H sleep_time 4H sleep2 . it age - i sexfemale aad 4 eduhs diploma a 4H edusone college a HH eduassociate degree a 4 edubachelor's degree 4 edunaster's degree o ## eduprof degree # edudoctoral degree ad ‘i occup_codeprofessional HH occup_codeservice oad 4 occup_codesales a 4 occup_codeoffice_admin a #8 occup_codefarming forestry fishing *** #4 occup_codeconstruction oo ## occup_codeinstall_repair_maint —*** 4H occup_codeproduction fle: Usersejche!Documents/Schoo!:2017-181ECON%420131/ProjeclProject_Data_Exploration_VS.himl Project Data Exploration a) 2 3540-01 -148e-01 -0480+00 a3ie+ee .1870+08 .A65e-01 -A1de-01 -56e-01 -111e-01 986e-01, 632e-01 5ite-01 953e-01, 8420-01 2 -8Ade-02 418¢-05 6640-08 8950-04 3620-03 5520-03, 2100-02 -134¢-02 -047¢-02 -251e-02 3220-02 1500-02 1260-03, 5880-03, -051¢-02 -443¢-03 -087¢-02 -471¢-02 555¢-02 .233¢-02 2900-02 218. 2 31 -64. 54. 57. 64. 87. 83. 48 5s. -30. 71. -52. -43. -19. “1. 9 -23 -37 In(formula = log(weekly_earn + 1) ~ sleep_time + sleep2 + age + Estimate Std. Error t value Pr(>|t|) -2090+00 .978e-04 9340-07 8840-03 4570-01 1930-01 8560-01 333 < 20-16 286 1.25¢-07 232 0.0256 051 < 2e-16 474 < 20-16 365 < 2e-16 994 < 20-16 B19 < 2e-16 268 < 2e-16 126 < 2e-16 688 < 2e-16 1ea < 20-16 338 < 2e-16 971 < 20-16 9e1 < 20-16 532 < 2e-16 986 < 2e-16 089 < 2-16 71d ¢ 20-16 954 < 2e-16 1538 < 2e-16 2596 s2nen017 Project Data Exploration 4 occup_codetransport ” wh WH Signif. codes: @ '*#*" @.001 '**' 0.01 '*' 0.05 '.' 2° 1 a 4 Residual standard error: 0.7617 on 96728 degrees of freedom 4 (83094 observations deleted due to missingness) 4 Multiple R-squared: 0.2944, Adjusted R-squared: 9.2942 fi F-statistic: 2018 on 28 and 96728 DF, p-value: < 2.2e-16 HE WERE GOING TO PUT IN OTHER STUFF, BUT IT SUFFERS FROM COLLINEARITY Bgplot(data) + geom_smooth(aes(x = sleep_time, y = work_time)) + ggtitle("Work and Sleep") + x12 b("Sleep") + ylab("Work") 4 “ geom_smooth()” using method = ‘gam* 4H Warning: Removed 110911 rows containing non-finite values (stat_snooth). Work and Sleep 600 ¥ 400 6 Ss 200 0 300 600 900 Sleep #uith sleep, demographic data, work time, play time Inwp <- Im(log(weekly_earn + 1) ~ sleep_time + sleep2 + age + sex + edu + occup_code + work time + play_time, data = data) summary (1mwp) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2696 s2nen017 call Residuals: Min 10. Median BQ Ma -7.575@ -0.3061 0.0748 0.4198 2.115: Coefficients: (Intercept) 5. sleep_tine 4 sleep2 2. age 6. sexfenale 3. eduhs diploma 4. ZRRPRE TETRA R RRR RP ESER edusone college 5 eduassociate degree 6 edubachelor's degree 8 it edunaster's degree 9 it eduprof degree 1 4 edudoctoral degree 1 #4 occup_codeprofessional 2 4 occup_codeservice “6 #4 occup_codesales 4 occup_codeoffice_admin 4 occup_codefarming_forestry_fishing i occup_codeconstruction a1 wi occup_codeinstall_repair_maint — -1. 4H occup_codeproduction 23. 4 occup_codetransport “4. 4 work time 5. #4 play_time 2. wt 4s (Intercept) “ # sleep_time 4H sleep? : 4H age a 4H sexfenale a 4H eduhs diplona a 4 edusone college 4 eduassociate degree o 4 edubachelor's degree # edunaster's degree ad tt eduprof degree i edudoctoral degree a 4 occup_codeprofessional a iit occup_codeservice a #8 occup_codesales a #4 occup_codeoffice_admin oo ## occup_codefarming_forestry_fishing *** 4 occup_codeconstruction fle: Usersejche!Documents/Schoo!:2017-181ECON%420131/ProjeclProject_Data_Exploration_VS.himl Project Data Exploration 7 970e+00 240-05 267-07 1900-03 1882-21 3220-01 -184e-01 3960-01 -326e-01 -518e-01 416400 .0840+08 .153e-01 9960-01 299-01 .137e-01 268-01 998e-01, 819e-01 337e-@1 959e-01, 1780-24 5480-04 Estimate Std. .017e-02 4 2840-04 2880-07 4280-04 7680-03 -231¢-02 -298¢-02 4560-02 -337e-02 -S71e-02 7620-02, 5080-02 -083¢-02 -177¢-02 -273¢-02 -196¢-02 -698e-02 -883e-02 938e-02 +563¢-02 6220-02 9996-05 .713¢-05 od 1s. -2. -1 25. -47. 35 39. 43. 62. 68. 37 43. -21. -59. -41. -34, -16. -10. “9 -21. -30. 25 9. In(formula = log(weekly_earn + 1) ~ sleep_time + sleep2 + age + sex + edu + occup_code + worktime + play tine, data fata) Error t value -612 314 759 497 103 99 932 921 276 552 678 aaa 468 421 626 593, 929 610 384, 352 577 9@7 394 Pr(oitl) <2e-16 0.7539 0.0785 «20-16 «20-16 «20-16 <2e-16 <2e-16 <2e-16 <2e-16 «20-16 «20-16 «20-16 «20-16 «2e-16 <2e-16 <2e-16 <2e-16 <2e-16 «20-16 «20-16 «20-16 «2e-16 27736 s2nen017 Project Data Exploration 4H occup_codeinstall_repair_maint — *** 4H occup_codeproduction a 4 occup_codetransport a 4 work time a #8 play_time oo ft - iH Signif. codes: @ '*#** 0.001 '**' 9.01 '*' 0.05." 0.25 ae 8 Residual standard error: 0.7273 on 55860 degrees of freedom i (123960 observations deleted due to missingness) WH Multiple R-squared: 0.3094, Adjusted R-squared: 9.3092 WH F-statistic: 1138 on 22 and S586 DF, p-value: < 2.2e-16 #ulith sleep, demographic data, work time, play time, and health time Amal <- Im(log(weekly_earn + 1) ~ sleep_tine + sleep2 + ‘age + sex + edu + occup_code + worktime + play_tine + health_time + sport_time + vol_time, data = data) sunmary (1mal1) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2036 s2nen017 a wi call 8 Im(Formula = log(weekly_earn + 1) ~ sleep_time + sleep2 + age + WH sex + edu + occup_code + worktime + play_time + health_time + WH sport_time + vol_time, data = data) a it Residuals: a Min 1Q Median 39 Max Wi -1,12580 -0.26159 0.00369 0.26248 0.96205, a WH Coefficients: a Estimate Std. Error t value Pr(>/tl) 4H (Intercept) 4.631100 2.839e+00 1.631 0.1288 4H sleep_time 5.a13e-@3 8.851e-03 0.612 @,5522 sit sleep +7.6970-06 1.1490-@5 -0.672 0.5158 a age 1,175e-@2 1.987e-02 0.591 @.5652 4 sexfenale -1.347e+09 5.873e-@1 -2.294 0.0407 4 edusone college -6.569e-02 9.421e-01 -0.078 0.9456 it eduassociate degree 4.635e-01 8.161e-01 0.568 0.5805 fit edubachelor’s degree -1.485e-02 7.423e-@1 -0.020 0.9844 i edunaster's degree 4.136e-01 7.995e-@1 0.517 0.6143 4 edudoctoral degree 1,935e1@@ 1.239409 1.562 @.1442 4 occup_codeprofessional 8.75Se-@1 6.163e-01 1.421 @.1809 #4 occup_codeservice 5.780e-@1 1.003e+00 8.576 @.5751 #8 occup_codesales 2.476e-@1 7.781e-01 0.318 @.7558 #4 occup_codeoffice_admin 3.110e-01 6.075e-01 0.512 @.6180, 4 occup_codeconstruction 8.105e-61 1.753e+00 0.462 @.6521 4 occup_codeproduction -1.219e-@1 1.189e+09 -.103 0.9200 8 work_time 1.457e-@3 1.649e-03 0.884 0.3942 i play_time -6.723e-04 1,977¢-@3 -0.348 0.7397 4 health_time 9.522e-@3 5.1@1e-03 1.867 2.0865 . 4 sport_time -1.719e-03 8,548e-@3 -0.281 0.8440 it vol_time -1.176¢-03 3.751e-@3 -0.313 0.7593 ft - WH Signif. codes: @ '*#** 0.001 '**' 9.01 '*° 0.05 '.' a1" * at # Residual standard error: 0.7829 on 12 degrees of freedom wi (179818 observations deleted due to missingness) WH Multiple R-squared: 0.6659, Adjusted R-squared: 0.109 # F-statistic: 1.196 on 20 and 12 DF, One Way ANOVA for F-Statistic, ASSUMPTIONS FOR ANOVA FIRST TEST Project Data Exploration p-value: 0.3841 al <- aov(1og(weekly_earn + 1) ~ occup_code, data=data) summary (21) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 2036 s2nen017 Project Data Exploration a DF Sum Sq Mean Sq F value Pr(>F) 4H occup_code 9 13585 1509.4 2214 <2e-16 *** fa Residuals 96739 65941 @.7 aH = WH Signif. codes: @ '*#*" @,001 '**' 0.01 '*' 0.05 '.' 2° 1 4 83094 observations deleted due to missingness a2 <- aov(sleep_time ~ occup_code, dataedata) sunmary (22) a DF Sum Sq Mean Sq F value Pr(>F) 4## occup_code 9 1.047e+07 1163397 74.25 <2e-16 *** WH Residuals 112330 1.768e+09 15670 ae at Signif. codes: @ ‘'**** 8.001 '**' 0.01 '** 0.05 4H 67503 observations deleted due to missingness bizFinsleep <- data$sleep_time[datagoccup_code constructsleep <- datagsleep_tine[data$occup_code bizfinearn <- data$weekly_earn[data$occup_code constructearn <- data$weekly_earn{data$occup_code #sunmary (bizfinsLeep) #sunmary(constructsleep) t.test(bizFinsleep, constructsleep) Welch Two Sample t-test data: bizfinsleep and constructsleep t = -9.1121, df = 6956, p-value < 2.2e-16 alternative hypothesi: 95 percent confidence interval -22.39046 -14.46223 sample estinates: i mean of x mean of y 497.7677 16.1940 fPRRHRRRE te ‘ftsummary(bizfinearn) #tsummary(constructearn) t.test(bizFinearn, constructearn) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl ygmt_biz_finance"] construction") = "mgnt_biz_finance"] “construction” ] true difference in means is not equal to @ 30836 s2nen017 Project Data Exploration 4 Welch Two Sample t-test wi data: bizFinearn and constructearn WH t = 42.317, df = 7816.5, p-value < 2.2¢-16 4 alternative hypothesis: true difference in means is not equal to @ 4 95 percent confidence interval wi 358.4114 393.2298 w## sample estimates: 4 mean of x mean of y # 1179.7642 803.9435 What is the nature of these differences? How can we express them and capture through linear model? flets run the previous Linear model, but seperating by the two occupations bizfin <- data[data$occup_code == "mgnt_biz_finance”, ] sleepbf <- (bizfingsleep_time)*2 construct <- data[datagoccup_code == "construction", ] sleepcon <- (construct $sleep_tine)*2 In6 <- Im(Log(weekly earn + 1) ~ sleep_tine + sleepbf + age + sex + edu, data = bizfin) summary (1n6) fle: Usersejche!Documents/Schoo!2017-181ECON%.20131/ProjeclProject_Data_Exploration_VS.himl 31136

You might also like