This test now fails:
# Reproduction: compare the object returned by query() for a SQL statement
# against a hand-written dplyr pipeline on flights_dt that is intended to
# express the same query.
expect_equal(
# Actual: the SQL statement passed to query().
query(
"SELECT origin, dest,
COUNT(flight) AS num_flts,
round(AVG(distance)) AS dist,
round(AVG(arr_delay)) AS avg_delay
FROM flights_dt
WHERE distance BETWEEN 200 AND 300
AND air_time IS NOT NULL
GROUP BY origin, dest
HAVING num_flts > 3000
ORDER BY num_flts DESC, avg_delay DESC
LIMIT 100;"
),
# Expected: the pipeline written directly with dplyr verbs.
flights_dt %>%
# Counterpart of the SQL WHERE clause.
filter(between(distance,200,300) & !is.na(air_time)) %>%
group_by(origin, dest) %>%
# Grouped filter, counterpart of the SQL HAVING clause. Removing this step
# together with the HAVING clause makes the comparison pass.
filter(sum(!is.na(flight)) > 3000) %>%
summarise(
# Counts non-missing flight values, mirroring COUNT(flight).
num_flts = sum(!is.na(flight)),
dist = round(mean(distance, na.rm = TRUE)),
avg_delay = round(mean(arr_delay, na.rm = TRUE))
) %>%
ungroup() %>%
# Counterpart of ORDER BY ... DESC and LIMIT 100.
arrange(desc(num_flts), desc(avg_delay)) %>%
head(100L)
)
Unfortunately you don't get a particularly informative error (even with local_edition(3)) because the pipeline is rather deep. However, I think this is the key difference:
actual$parent$parent$parent$parent$parent$i vs expected$parent$parent$parent$parent$parent$i
- `\`_DT3\`[, .I[sum(!is.na(flight)) > 3000], by = .(origin, dest)]$V1`
+ `\`_DT4\`[, .I[sum(!is.na(flight)) > 3000], by = .(origin, dest)]$V1`
i.e. expected is generating one more intermediate data table name (_DT4) than actual (_DT3) — this is probably due to the new grouped filter behaviour. Indeed, if I remove filter(sum(!is.na(flight)) > 3000) and HAVING num_flts > 3000, the test passes.
This test now fails:
expect_equal( query( "SELECT origin, dest, COUNT(flight) AS num_flts, round(AVG(distance)) AS dist, round(AVG(arr_delay)) AS avg_delay FROM flights_dt WHERE distance BETWEEN 200 AND 300 AND air_time IS NOT NULL GROUP BY origin, dest HAVING num_flts > 3000 ORDER BY num_flts DESC, avg_delay DESC LIMIT 100;" ), flights_dt %>% filter(between(distance,200,300) & !is.na(air_time)) %>% group_by(origin, dest) %>% filter(sum(!is.na(flight)) > 3000) %>% summarise( num_flts = sum(!is.na(flight)), dist = round(mean(distance, na.rm = TRUE)), avg_delay = round(mean(arr_delay, na.rm = TRUE)) ) %>% ungroup() %>% arrange(desc(num_flts), desc(avg_delay)) %>% head(100L) ) Unfortunately you don't get a particularly informative error (even with
local_edition(3)) because the pipeline is rather deep. However, I think this is the key difference: i.e. expected is generating one more intermediate data table name than actual — this is probably due to the new grouped filter behaviour. Indeed, if I remove
filter(sum(!is.na(flight)) > 3000) and HAVING num_flts > 3000, the test passes.