-
Notifications
You must be signed in to change notification settings - Fork 18
Description
I discovered this while tightening up vroom's filepath handling. I made the reprex on Windows with R 4.1 and, anecdotally, have the same "no round trip" problem on ubuntu 18.04 with en_us locale (which is ISO-8859-1), which is included in vroom's test matrix. I think the cause / fix is likely different on the two platforms, though.
Based on recent experience in readxl and vroom, I'm going to hypothesize that cpp11 is now auto-converting a filepath to UTF-8 and then it's being re-encoded as UTF-8 (in error). I've also now realized that some of what I see interactively is not reflected in the reprex, maybe because of knitr's enforcement of UTF-8? 😭 I've tried to compensate for this.
R.version.string
#> [1] "R version 4.1.2 (2021-11-01)"
.Platform$OS.type
#> [1] "windows"
Sys.getlocale()
#> [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
l10n_info()
#> $MBCS
#> [1] FALSE
#>
#> $`UTF-8`
#> [1] FALSE
#>
#> $`Latin-1`
#> [1] TRUE
#>
#> $codepage
#> [1] 1252
#>
#> $system.codepage
#> [1] 1252

I’m going to try writing bz2, gz, xz, and zip because I see these specific
cases in vroom. First I pass UTF-8 encoded paths. I include brio for
comparison.
# Build a path inside the session temp directory whose file name is "dät"
# (written with the \u00E4 escape so the literal is UTF-8) followed by `ext`.
# `ext` is pasted on verbatim, so it should include its leading dot.
make_temp_path <- function(ext) {
  file_name <- paste0("d\u00E4t", ext)
  file.path(tempdir(), file_name)
}
(bz2file <- withr::local_file(make_temp_path(".tar.bz2")))
#> [1] "C:\\Users\\jenny\\AppData\\Local\\Temp\\RtmpUbfHcR/dät.tar.bz2"
(gzfile <- withr::local_file(make_temp_path(".tar.gz")))
#> [1] "C:\\Users\\jenny\\AppData\\Local\\Temp\\RtmpUbfHcR/dät.tar.gz"
(xzfile <- withr::local_file(make_temp_path(".tar.xz")))
#> [1] "C:\\Users\\jenny\\AppData\\Local\\Temp\\RtmpUbfHcR/dät.tar.xz"
(zipfile <- withr::local_file(make_temp_path(".zip")))
#> [1] "C:\\Users\\jenny\\AppData\\Local\\Temp\\RtmpUbfHcR/dät.zip"
(briofile <- withr::local_file(make_temp_path(".csv")))
#> [1] "C:\\Users\\jenny\\AppData\\Local\\Temp\\RtmpUbfHcR/dät.csv"
# Write a tiny one-row CSV through the connection returned by
# archive::archive_write(); `file` is the archive path and "d\u00E4t.csv" is
# passed as the second argument (presumably the entry name inside the archive).
write_archive_file <- function(file) {
  payload <- data.frame(a = "A", b = "B")
  out_con <- archive::archive_write(file, "d\u00E4t.csv")
  write.csv(payload, file = out_con)
}
# at first, 0 files
list.files(tempdir(), pattern = "^d")
#> character(0)
write_archive_file(gzfile)
write_archive_file(bz2file)
write_archive_file(xzfile)
write_archive_file(zipfile)
brio::write_lines("whatever", briofile)
# now there are 5 files, 4 mis-encoded ones from archive, 1 from brio
(x <- list.files(tempdir(), pattern = "^d"))
#> [1] "dät.tar.bz2" "dät.tar.gz" "dät.tar.xz" "dät.zip" "dät.csv"
Encoding(x)
#> [1] "UTF-8" "UTF-8" "UTF-8" "UTF-8" "UTF-8"
bz2file
#> [1] "C:\\Users\\jenny\\AppData\\Local\\Temp\\RtmpUbfHcR/dät.tar.bz2"
x[1]
#> [1] "dät.tar.bz2"
charToRaw(bz2file)
#> [1] 43 3a 5c 55 73 65 72 73 5c 6a 65 6e 6e 79 5c 41 70 70 44 61 74 61 5c 4c 6f 63 61 6c
#> [29] 5c 54 65 6d 70 5c 52 74 6d 70 41 76 59 6e 4f 33 2f 64 c3 a4 74 2e 74 61 72 2e 62 7a
#> [57] 32
charToRaw(x[1]) # bz2file
#> [1] 64 c3 83 c2 a4 74 2e 74 61 72 2e 62 7a 32

archive hasn’t written to the intended filepaths, brio has. What if I
explicitly pass paths in the native encoding?
write_archive_file(enc2native(gzfile))
write_archive_file(enc2native(bz2file))
write_archive_file(enc2native(xzfile))
write_archive_file(enc2native(zipfile))
brio::write_lines("whatever", enc2native(briofile))
(x <- list.files(tempdir(), pattern = "^d"))
#> [1] "dät.tar.bz2" "dät.tar.gz" "dät.tar.xz" "dät.zip" "dät.csv"

Passing natively encoded paths doesn’t help, i.e. we just overwrite the
previous file paths.
What’s the nature of the problem? It looks like the UTF-8 bytes are being
treated as Windows-1252 bytes and then getting re-encoded as UTF-8.
lapply(x, charToRaw)
#> [[1]]
#> [1] 64 c3 83 c2 a4 74 2e 74 61 72 2e 62 7a 32
#>
#> [[2]]
#> [1] 64 c3 83 c2 a4 74 2e 74 61 72 2e 67 7a
#>
#> [[3]]
#> [1] 64 c3 83 c2 a4 74 2e 74 61 72 2e 78 7a
#>
#> [[4]]
#> [1] 64 c3 83 c2 a4 74 2e 7a 69 70
#>
#> [[5]]
#> [1] 64 c3 a4 74 2e 63 73 76
a_umlaut <- "\u00E4"
charToRaw(a_umlaut)
#> [1] c3 a4
iconv(a_umlaut, from = "Windows-1252", to = "UTF-8")
#> [1] "ä"
charToRaw(iconv(a_umlaut, from = "Windows-1252", to = "UTF-8"))
#> [1] c3 83 c2 a4

Can I read from the filepaths I tried to write to? No.
archive::archive_read(bz2file)
#> Warning in file(archive, "rb"): cannot open file 'C:
#> \Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.tar.bz2': No such file or
#> directory
#> Error in file(archive, "rb"): cannot open the connection
archive::archive_read(gzfile)
#> Warning in file(archive, "rb"): cannot open file 'C:
#> \Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.tar.gz': No such file or
#> directory
#> Error in file(archive, "rb"): cannot open the connection
archive::archive_read(xzfile)
#> Warning in file(archive, "rb"): cannot open file 'C:
#> \Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.tar.xz': No such file or
#> directory
#> Error in file(archive, "rb"): cannot open the connection
archive::archive_read(zipfile)
#> Warning in file(archive, "rb"): cannot open file 'C:
#> \Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.zip': No such file or directory
#> Error in file(archive, "rb"): cannot open the connection

For the record, the files written by archive are just fine. And, for all but
.zip, even the name of the included file, which itself has the same
non-ascii character in it, is OK. For .zip, that file name is mis-encoded
and, incidentally, the date looks wrong (1980?).
# the files written are fine, just the path is broken
# List the files in the session temp directory whose names start with "d" and
# end with `ext`, print the full path(s) being read, and return them.
# `ext` is spliced into the regular expression as-is (not escaped).
find_file <- function(ext) {
  name_regex <- paste0("^d.*", ext, "$")
  out <- list.files(tempdir(), pattern = name_regex, full.names = TRUE)
  cat("Reading from:\n", out, "\n")
  out
}
read.csv(archive::archive_read(find_file(".tar.gz")), row.names = 1)
#> Reading from:
#> C:\Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.tar.gz
#> a b
#> 1 A B
read.csv(archive::archive_read(find_file(".tar.bz2")), row.names = 1)
#> Reading from:
#> C:\Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.tar.bz2
#> a b
#> 1 A B
read.csv(archive::archive_read(find_file(".tar.xz")), row.names = 1)
#> Reading from:
#> C:\Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.tar.xz
#> a b
#> 1 A B
read.csv(archive::archive_read(find_file(".zip")), row.names = 1)
#> Reading from:
#> C:\Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.zip
#> a b
#> 1 A B
archive::archive(find_file(".tar.gz"))
#> Reading from:
#> C:\Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.tar.gz
#> # A tibble: 1 x 3
#> path size date
#> <chr> <int> <dttm>
#> 1 dät.csv 23 2022-05-12 19:10:28
archive::archive(find_file(".tar.bz2"))
#> Reading from:
#> C:\Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.tar.bz2
#> # A tibble: 1 x 3
#> path size date
#> <chr> <int> <dttm>
#> 1 dät.csv 23 2022-05-12 19:10:28
archive::archive(find_file(".tar.xz"))
#> Reading from:
#> C:\Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.tar.xz
#> # A tibble: 1 x 3
#> path size date
#> <chr> <int> <dttm>
#> 1 dät.csv 23 2022-05-12 19:10:28
# uh-oh
archive::archive(find_file(".zip"))
#> Reading from:
#> C:\Users\jenny\AppData\Local\Temp\RtmpUbfHcR/dät.zip
#> # A tibble: 1 x 3
#> path size date
#> <chr> <int> <dttm>
#> 1 "dA\u000ft.csv" 23 1980-01-01 00:00:00
brio::read_lines(briofile)
#> [1] "whatever"

Created on 2022-05-12 by the reprex package (v2.0.1)