Introduction To Spark With Sparklyr in R
//2
Selecting columns
# track_metadata_tbl is assumed to exist already; print it for inspection
track_metadata_tbl

# Keep just the artist, release, title, and year columns
track_metadata_tbl %>%
  select(artist_name, release, title, year)
Filtering rows
# track_metadata_tbl has been pre-defined (it is not created in this snippet).
# glimpse() gives a compact, column-wise overview of the table; no rows are
# filtered here.
glimpse(track_metadata_tbl)
Arranging rows
# track_metadata_tbl has been pre-defined (it is not created in this snippet).
# Evaluating the bare name at top level auto-prints the table; no rows are
# arranged here.
track_metadata_tbl
Mutating columns
# track_metadata_tbl has been pre-defined (it is not created in this snippet).
# Evaluating the bare name at top level auto-prints the table; no columns are
# mutated here.
track_metadata_tbl
Summarizing columns
# track_metadata_tbl has been pre-defined (it is not created in this snippet).
# Evaluating the bare name at top level auto-prints the table; no summary is
# computed here.
track_metadata_tbl
# Keep every column whose name begins with "artist"
track_metadata_tbl %>%
  select(starts_with("artist"))
# Keep every column whose name finishes with "id"
track_metadata_tbl %>%
  select(ends_with("id"))
# Keep every column whose name has "ti" anywhere in it
track_metadata_tbl %>%
  select(contains("ti"))
# Keep every column whose name matches the regular expression "ti.?t"
# ("ti", then at most one arbitrary character, then "t")
track_metadata_tbl %>%
  select(matches("ti.?t"))
# Reduce the table to one row per distinct artist_name value
track_metadata_tbl %>%
  distinct(artist_name)
Common people
# track_metadata_tbl is assumed to exist already; print it for inspection
track_metadata_tbl

# Find the 20 most frequent artist_name values.
# count() stores the tally in column `n`. Passing it to top_n() explicitly as
# `wt` avoids relying on `n` happening to be the last column and silences the
# "Selecting by n" message. top_n() keeps ties, so more than 20 rows can be
# returned.
track_metadata_tbl %>%
  count(artist_name, sort = TRUE) %>%
  top_n(20, wt = n)
# Order artists from shortest to longest mean duration
duration_by_artist %>%
  arrange(mean_duration)
# Order artists from longest to shortest mean duration
duration_by_artist %>%
  arrange(desc(mean_duration))
Groups of mutants
# track_metadata_tbl is assumed to exist already; print it for inspection
track_metadata_tbl

# Within each artist, compute how many years separate every track from that
# artist's earliest release year, then list the largest gaps first
track_metadata_tbl %>%
  group_by(artist_name) %>%
  mutate(time_since_first_release = year - min(year)) %>%
  arrange(desc(time_since_first_release))
Left joins
# track_metadata_tbl and artist_terms_tbl have been pre-defined (neither is
# created in this snippet). Evaluating each bare name at top level auto-prints
# it; no left join is performed here.
track_metadata_tbl
artist_terms_tbl
Anti joins
# track_metadata_tbl and artist_terms_tbl have been pre-defined (neither is
# created in this snippet). Evaluating each bare name at top level auto-prints
# it; no anti join is performed here.
track_metadata_tbl
artist_terms_tbl
Semi joins
# track_metadata_tbl and artist_terms_tbl have been pre-defined (neither is
# created in this snippet). Evaluating each bare name at top level auto-prints
# it; no semi join is performed here.
track_metadata_tbl
artist_terms_tbl
//4
# Show the five artists with the lowest positivity.
# top_n(5) keeps the rows with the *largest* values of the table's last column
# no matter how the rows were arranged beforehand, so it cannot return the low
# end of the ranking. Sort ascending and take the first five rows instead.
# NOTE(review): assumes the intent of this exercise is the five least positive
# artists (the counterpart of the descending query) -- confirm.
sentimental_artists %>%
  arrange(positivity) %>%
  head(5)
# Show the most positive artists: sort by descending positivity, then let
# top_n() keep the top five rows (with no weighting column given it ranks by
# the table's last column)
sentimental_artists %>%
  arrange(desc(positivity)) %>%
  top_n(5)
# Take only artist_mbid and tokenize it on the "-" pattern, writing the pieces
# to a new artist_mbid_chunks column
track_metadata_tbl %>%
  select(artist_mbid) %>%
  ft_regex_tokenizer("artist_mbid", "artist_mbid_chunks", pattern = "-")
# Draw a 0.01 fraction of the rows without replacement (fixed seed so the draw
# is repeatable) and compute() the result under the name
# "sample_track_metadata"
track_metadata_tbl %>%
  sdf_sample(fraction = 0.01, replacement = FALSE, seed = 20000229) %>%
  compute("sample_track_metadata")
Training/testing partitions
# track_metadata_tbl has been pre-defined (it is not created in this snippet).
# Evaluating the bare name at top level auto-prints the table; no
# training/testing partition is created here.
track_metadata_tbl
//4
Come together
# track_metadata_tbl and timbre_tbl are assumed to exist already; print both
# for inspection
track_metadata_tbl
timbre_tbl

# Keep only tracks present in both tables (matched on track_id) and store
# year as a numeric column
track_metadata_tbl %>%
  inner_join(timbre_tbl, by = "track_id") %>%
  mutate(year = as.numeric(year))
# Root mean square error of the predictions, reported per model:
# residual = predicted - actual, then sqrt(mean(residual^2)) within each model
both_responses %>%
  mutate(residual = predicted - actual) %>%
  group_by(model) %>%
  summarize(rmse = sqrt(mean(residual^2)))