Base R Functions
2024-10-01
Basic Functions & Data Structures
• Type of Classes: numeric, character, logical, factor
#Assignment Operator
a <- 2.7
b <- "hello"
c <- TRUE
a
## [1] 2.7
b
## [1] "hello"
c
## [1] TRUE
#class() Function
class(a)
## [1] "numeric"
class(b)
## [1] "character"
is.logical(c)
## [1] TRUE
• Coercion Functions: to convert data types
as.numeric()
as.logical()
as.character()
as.factor()
1
Data Structure Features & Usage
Vector combine function: c( )
selecting elements: vector[index OR index range]
replacing elements with a new value: vector[index] <- new_value
Matrix (*elements only take in 1 datatype) matrix(values, nrow = , byrow = TRUE); use byrow = FALSE to fill by column 1st instead
select elements: matrix[row_no, col_no]
Dataframe columns are named & can be of different class types
List retrieving names of objects in the list: names(list)
selecting components in a list: list[[index]]
#Creating a vector of length 3
d <- c(1,2,3) #c() is a combine function that joins comma-separated data types into a vector
d
## [1] 1 2 3
#Creating a sequence of integers
x <- -2:2
x
## [1] -2 -1 0 1 2
y<- 2^x
y
## [1] 0.25 0.50 1.00 2.00 4.00
#Select elements within a vector *indexing in R is 1-based
#--> unlike Python, where indexing is 0-based; an R range such as 2:4 includes both endpoints
#1st element:
y[1]
## [1] 0.25
#Range:
y[2:4]
## [1] 0.5 1.0 2.0
#Replacing elements in a vector with a new value
y[1] <- -3
y
## [1] -3.0 0.5 1.0 2.0 4.0
#Class of Vector --> dependent on the property/nature of its elements
firstname <- c("adam", "brian", "cathy")# character, length = 3
avg <- c(1.2) # numeric, length = 1
pass <- c(TRUE, FALSE, TRUE) # logical, length = 3
class(pass)
2
## [1] "logical"
#--------------
#2) MATRIX
#--------------
#elements only take in one datatype
mat <- matrix(1:12, nrow = 4, byrow = TRUE) #byrow = FALSE (the default) fills column by column instead; the dimensions stay 4 x 3
mat
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
## [3,] 7 8 9
## [4,] 10 11 12
dim(mat)
## [1] 4 3
#Select elements within a matrix
#entry at 2nd row & 3rd column
mat[2, 3]
## [1] 6
#select elements in 1st & 2nd rows
mat[c(1,2), ]
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
#Data frame: differs from a matrix in that its columns are named & can be of different class types
#Creating Data Frames: data.frame() function
budget_cat <- c("Manpower", "Asset", "Other")
amount <- c(519.4, 38.0, 141.4)
op_budget <- data.frame(budget_cat, amount)
op_budget
## budget_cat amount
## 1 Manpower 519.4
## 2 Asset 38.0
## 3 Other 141.4
#Select elements in a dataframe
#Option 1: Conventional Method
op_budget[, "budget_cat"] #select the budget category
## [1] "Manpower" "Asset" "Other"
3
#Option 2: ACCESSOR OPERATOR $
op_budget$budget_cat
## [1] "Manpower" "Asset" "Other"
#[ EXAMPLE: CARS DATA ]
data(cars) #load data
str(cars) #structure of data frame
## ’data.frame’: 50 obs. of 2 variables:
## $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
## $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
dim(cars) #number of rows & columns
## [1] 50 2
class(cars) # checking types of object
## [1] "data.frame"
names(cars) # viewing column names
## [1] "speed" "dist"
head(cars) # default is 6, to specify indicate n = 10
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
#SPECIFY NUMBER OF ROWS to be examined using the n argument
head(cars, n = 10)
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
## 7 10 18
## 8 10 26
## 9 10 34
## 10 11 17
4
#List: a collection of objects which can be of different classes & lengths
mylist <- list(A = 1,
B = c(1, 2),
C = c(TRUE, FALSE, TRUE),
D = matrix(1:6, nrow = 3)) # a list of 4 objects
class(mylist)
## [1] "list"
names(mylist)
## [1] "A" "B" "C" "D"
#Select components of a list:
#Option 1: using DOUBLE SQUARE BRACKETS [[]] & take in an index value
mylist[[3]] #3rd component in list
## [1] TRUE FALSE TRUE
#Option 2: using ACCESSOR OPERATOR $ & specify the name of the component in the list
mylist$C
## [1] TRUE FALSE TRUE
table() function: contingency table of counts for a particular variable
unique() function: returns the unique values of a vector (or the unique rows of a dataframe), with duplicates removed
General stringr
*1-based indexing
stringr function how it works
str_length() Find the number of characters in a string
str_count(string, pattern) Count the number of occurrences of a pattern in a string
str_c(vector_of_strings, vector_of_string_to_concat, sep = "-") Concatenate strings (similar to paste() but more efficient)
*paste()
str_sub(string/vector_of_strings, start = index, end = index) Extract a substring by specifying start and end positions
str_extract(string/vector_of_strings, pattern) Extract the first occurrence of a pattern in a string
str_match(string, regular_expression) Extract matched groups from a regular expression
str_match_all() Extract all matched groups from a string
str_split(col_name, pattern = "common character", simplify = TRUE) Split a string into multiple parts based on a pattern
str_detect(string/vector_of_strings, pattern) Detect the presence of a pattern in a string.
5
stringr function how it works
str_ends(string, pattern) Detect strings ending with the pattern
str_replace(string, pattern, replacement_string) Replace the first occurrence of a pattern in a string
str_replace_all(string, pattern, replacement_string) Replace all occurrences of a pattern in a string
str_which(string, pattern) Find the index of strings that match a pattern
str_remove() Remove the first occurrence of a pattern in a string.
str_trim() Remove leading and trailing whitespace from a string
str_squish() Remove excess whitespace from a string (trims both ends and reduces multiple spaces to one)
str_pad() Pad a string to a specific width.
str_to_upper() Change a string to upper case
str_to_lower() Change a string to lower case
str_to_title() The first letter of each word is capitalized
library(tidyverse)
#str_extract()
df <- tibble(sentence = c("The price is $100", "It costs $200"))
# Extract the first dollar amount ("$" followed by one or more digits)
df %>% mutate(price = str_extract(sentence, "\\$\\d+"))
## # A tibble: 2 x 2
## sentence price
## <chr> <chr>
## 1 The price is $100 $100
## 2 It costs $200 $200
#str_replace()
df <- tibble(sentence = c("I have 2 apples", "You have 3 bananas"))
# Replace the first number with "many"
df %>% mutate(sentence_replaced = str_replace(sentence, "\\d+", "many"))
## # A tibble: 2 x 2
## sentence sentence_replaced
## <chr> <chr>
## 1 I have 2 apples I have many apples
## 2 You have 3 bananas You have many bananas
#str_match()
df <- tibble(sentence = c("I have 2 apples", "You have 3 bananas"))
# Extract the number and the word after it
df %>% mutate(matches = str_match(sentence, "(\\d+) (\\w+)"))
## # A tibble: 2 x 2
## sentence matches[,1] [,2] [,3]
## <chr> <chr> <chr> <chr>
## 1 I have 2 apples 2 apples 2 apples
## 2 You have 3 bananas 3 bananas 3 bananas
6
#str_match_all()
df <- tibble(sentence = c("I have 2 apples and 3 bananas", "You have 4 oranges and 2 pears"))
# Extract all numbers
df %>% mutate(matches = str_match_all(sentence, "\\d+"))
## # A tibble: 2 x 2
## sentence matches
## <chr> <list>
## 1 I have 2 apples and 3 bananas <chr [2 x 1]>
## 2 You have 4 oranges and 2 pears <chr [2 x 1]>
#str_pad()
df <- tibble(name = c("Joe", "Sam"))
# Pad names to 10 characters with dots
df %>% mutate(padded_name = str_pad(name, width = 10, side = "right", pad = "."))
## # A tibble: 2 x 2
## name padded_name
## <chr> <chr>
## 1 Joe Joe.......
## 2 Sam Sam.......
apply functions
function How it works
apply(X, margin, function) Apply a function to rows or columns of a matrix or an array
X: matrix/array, margin: 1 - rows, 2 - columns, function: function to apply
using anonymous functions: apply(X, 1, function(x) any(is.na(x)))
lapply(X, function) Apply a function to each element of a list or vector, returning a list
sapply(X, function) Simplified lapply(); returns a vector or matrix instead of a list where possible
tapply(V, index, function) Apply a function over subsets of a vector
V: vector to be split, index: factors to be split by
#---------------------------------------------------
# apply(): across rows/columns of a matrix/dataframe
#---------------------------------------------------
my_mat <- matrix(1:30, nrow = 10, byrow = FALSE) #generate a 10x3 matrix with values 1:30
my_mat
## [,1] [,2] [,3]
## [1,] 1 11 21
## [2,] 2 12 22
## [3,] 3 13 23
## [4,] 4 14 24
## [5,] 5 15 25
## [6,] 6 16 26
7
## [7,] 7 17 27
## [8,] 8 18 28
## [9,] 9 19 29
## [10,] 10 20 30
#COLUMN means
apply(my_mat, MARGIN = 2, mean)
## [1] 5.5 15.5 25.5
#ROW means
apply(my_mat, MARGIN = 1, mean)
## [1] 11 12 13 14 15 16 17 18 19 20
#using ANONYMOUS FUNC
#COLUMN SUM INCREMENTED BY 3
apply(my_mat, MARGIN = 2, function(x) sum(x) + 3)
## [1] 58 158 258
#x is a single column of the matrix my_mat
#anonymous function takes in 1 argument
Example: US Arrests data (USArrests)
data(USArrests)
head(USArrests, 3)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
#-----------------------------------------------------------------
# tapply(): on a subset of data frame broken down by factor levels
#-----------------------------------------------------------------
# Create synthetic data
set.seed(27)
df<-data.frame(price = rnorm(100, sd = 5, mean = 20),
city =sample(paste0("C", 1:4),size = 100,replace = T),
region =sample(paste0("R", 1:4),size = 100,replace = T))
head(df)
## price city region
## 1 29.53581 C2 R2
## 2 25.72438 C4 R2
## 3 16.17735 C1 R4
## 4 12.71284 C2 R2
## 5 14.53266 C4 R3
## 6 21.47621 C4 R2
8
#--------------------------------------
#LIST BEING USED for sapply() & lapply()
y <- list(A = 1:5,
B = seq(0, 10, length = 5),
C = c(TRUE, TRUE, FALSE))
y
## $A
## [1] 1 2 3 4 5
##
## $B
## [1] 0.0 2.5 5.0 7.5 10.0
##
## $C
## [1] TRUE TRUE FALSE
#---------------------------------------
#--------------------------------------------------------------
# sapply(): across elements of a list & return a VECTOR/MATRIX
#--------------------------------------------------------------
sapply(y, mean)
## A B C
## 3.0000000 5.0000000 0.6666667
#-----------------------------------------------------
# lapply(): across elements of a list & return a LIST
#-----------------------------------------------------
lapply(y, mean)
## $A
## [1] 3
##
## $B
## [1] 5
##
## $C
## [1] 0.6666667