...
 
Commits (2)
......@@ -19,3 +19,4 @@
^\.lintr$
^\.appveyor\.yml$
^.*\.tar\.gz
^src/compact_enc_det/\.git$
......@@ -15,6 +15,8 @@
#'
#' @export
#'
#' @example man-roxygen/ex-detect.R
#'
ced_enc_detect <- function(x, enc_hint = NULL, lang_hint = NULL) {
# Thin wrapper: hand all three arguments, unchanged, to the native routine
# `_ced_ced_enc_detect` and return whatever it produces.
.Call(`_ced_ced_enc_detect`, x, enc_hint, lang_hint)
}
......
# Backend version: must be a numeric_version object and be at least 2.2
expect_true(is.numeric_version(ced_version()))
expect_true(ced_version() >= "2.2")
# Empty input: zero-length inputs give character(0); an empty string or NA
# gives NA_character_
expect_identical(ced_enc_detect(NULL), character(0))
expect_identical(ced_enc_detect(raw()), character(0))
expect_identical(ced_enc_detect(character()), character(0))
expect_identical(ced_enc_detect(character(1)), NA_character_)
expect_identical(ced_enc_detect(NA_character_), NA_character_)
# Pure ASCII input: one "US-ASCII" result per element
expect_identical(ced_enc_detect(letters), rep("US-ASCII", length(letters)))
# Names of the input vector are preserved on the result
expect_identical(ced_enc_detect(c(a = "test")), c(a = "US-ASCII"))
# Load the sample texts shipped with the package (DCF file, one field per language)
test_file <- system.file("test.txt", package = "ced")
test_txt <- read.dcf(test_file, all = TRUE)
# UTF-8 samples in various languages; the English and Italian samples are
# expected to be reported as US-ASCII
expect_identical(ced_enc_detect(test_txt[["English"]]), "US-ASCII")
expect_identical(ced_enc_detect(test_txt[["Italian"]]), "US-ASCII")
expect_identical(ced_enc_detect(test_txt[["Spanish"]]), "UTF-8")
expect_identical(ced_enc_detect(test_txt[["French"]]), "UTF-8")
expect_identical(ced_enc_detect(test_txt[["Czech"]]), "UTF-8")
expect_identical(ced_enc_detect(test_txt[["Russian"]]), "UTF-8")
expect_identical(ced_enc_detect(test_txt[["Ukrainian"]]), "UTF-8")
expect_identical(ced_enc_detect(test_txt[["Chinese"]]), "UTF-8")
expect_identical(ced_enc_detect(test_txt[["Japanese"]]), "UTF-8")
expect_identical(ced_enc_detect(test_txt[["Korean"]]), "UTF-8")
expect_identical(ced_enc_detect(test_txt[["Arabic(3)"]]), "UTF-8")
# Non-UTF-8 encodings: the Russian sample re-encoded with iconv()
expect_identical(ced_enc_detect(iconv(test_txt[["Russian"]], "UTF-8", "WINDOWS-1251")), "windows-1251")
expect_identical(ced_enc_detect(iconv(test_txt[["Russian"]], "UTF-8", "IBM866")), "IBM866")
# detect character vector with ASCII strings
ascii <- "I can eat glass and it doesn't hurt me."
ced_enc_detect(ascii)
# detect character vector with UTF-8 strings
utf8 <- "\u4e0b\u5348\u597d"
print(utf8)
ced_enc_detect(utf8)
# load the sample texts shipped with the package (one DCF field per language)
ex_path <- system.file("test.txt", package = "ced")
ex_txt <- read.dcf(ex_path, all = TRUE)
# russian text: show the sample, then detect it as-is and after re-encoding
print(ex_txt[["Russian"]])
ced_enc_detect(ex_txt[["Russian"]])
ced_enc_detect(iconv(ex_txt[["Russian"]], "utf8", "ibm866"))
ced_enc_detect(iconv(ex_txt[["Russian"]], "utf8", "windows-1251"))
ced_enc_detect(iconv(ex_txt[["Russian"]], "utf8", "koi8-r"))
# chinese text
print(ex_txt[["Chinese"]])
ced_enc_detect(ex_txt[["Chinese"]])
ced_enc_detect(iconv(ex_txt[["Chinese"]], "utf8", "gb18030"))
# korean text
print(ex_txt[["Korean"]])
ced_enc_detect(ex_txt[["Korean"]])
ced_enc_detect(iconv(ex_txt[["Korean"]], "utf8", "uhc"))
ced_enc_detect(iconv(ex_txt[["Korean"]], "utf8", "iso-2022-kr"))
# japanese text
print(ex_txt[["Japanese"]])
ced_enc_detect(ex_txt[["Japanese"]])
ced_enc_detect(iconv(ex_txt[["Japanese"]], "utf8", "shift_jis"))
ced_enc_detect(iconv(ex_txt[["Japanese"]], "utf8", "iso-2022-jp"))
......@@ -19,3 +19,42 @@ Character vector with suggested encodings.
\description{
Detect charset encoding of the character or raw vector.
}
\examples{
# detect character vector with ASCII strings
ascii <- "I can eat glass and it doesn't hurt me."
ced_enc_detect(ascii)
# detect character vector with UTF-8 strings
utf8 <- "\u4e0b\u5348\u597d"
print(utf8)
ced_enc_detect(utf8)
# load the sample texts shipped with the package (one DCF field per language)
ex_path <- system.file("test.txt", package = "ced")
ex_txt <- read.dcf(ex_path, all = TRUE)
# russian text: show the sample, then detect it as-is and after re-encoding
print(ex_txt[["Russian"]])
ced_enc_detect(ex_txt[["Russian"]])
ced_enc_detect(iconv(ex_txt[["Russian"]], "utf8", "ibm866"))
ced_enc_detect(iconv(ex_txt[["Russian"]], "utf8", "windows-1251"))
ced_enc_detect(iconv(ex_txt[["Russian"]], "utf8", "koi8-r"))
# chinese text
print(ex_txt[["Chinese"]])
ced_enc_detect(ex_txt[["Chinese"]])
ced_enc_detect(iconv(ex_txt[["Chinese"]], "utf8", "gb18030"))
# korean text
print(ex_txt[["Korean"]])
ced_enc_detect(ex_txt[["Korean"]])
ced_enc_detect(iconv(ex_txt[["Korean"]], "utf8", "uhc"))
ced_enc_detect(iconv(ex_txt[["Korean"]], "utf8", "iso-2022-kr"))
# japanese text
print(ex_txt[["Japanese"]])
ced_enc_detect(ex_txt[["Japanese"]])
ced_enc_detect(iconv(ex_txt[["Japanese"]], "utf8", "shift_jis"))
ced_enc_detect(iconv(ex_txt[["Japanese"]], "utf8", "iso-2022-jp"))
}
......@@ -18,7 +18,7 @@ BEGIN_RCPP
END_RCPP
}
// ced_version
Rcpp::StringVector ced_version();
Rcpp::List ced_version();
RcppExport SEXP _ced_ced_version() {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
......
......@@ -20,9 +20,9 @@ Rcpp::String detect(const char* x, size_t n, const Encoding& enc, const Language
&bytes_consumed, // how many bytes used
&is_reliable // encoding is at least 2^10 time more probable then the second-best encoding
);
if (!is_reliable) {
return NA_STRING;
}
// if (!is_reliable) {
// return NA_STRING;
// }
return Rcpp::wrap(res);
}
......@@ -40,6 +40,8 @@ Rcpp::String detect(const char* x, size_t n, const Encoding& enc, const Language
//'
//' @export
//'
//' @example man-roxygen/ex-detect.R
//'
// [[Rcpp::export(rng = false)]]
SEXP ced_enc_detect(SEXP x, SEXP enc_hint = R_NilValue, SEXP lang_hint = R_NilValue) {
size_t n = LENGTH(x);
......
......@@ -13,8 +13,15 @@
//' @export
//'
// [[Rcpp::export(rng = false)]]
Rcpp::StringVector ced_version() {
Rcpp::StringVector res = Version();
Rcpp::List ced_version() {
  // Feed the dotted version string reported by Version() into a stream so it
  // can be split on '.' one component at a time.
  std::stringstream version_stream(Version());
  std::vector<int> components;
  for (std::string piece; std::getline(version_stream, piece, '.');) {
    // Each dot-separated piece is parsed as an integer component.
    components.push_back(std::stoi(piece));
  }
  // Wrap the components as the single element of a list and tag that list
  // with class "numeric_version" before returning it to R.
  Rcpp::List version = Rcpp::List::create(components);
  version.attr("class") = "numeric_version";
  return version;
}