Commit fe61cc4a authored by Radford Neal

changes based on all CHARSXPs being cached now

parent a604317d
Changes based on all CHARSXPs now being cached (except NA_STRING),
so some simplifications/speedups are possible.
......@@ -47,7 +47,6 @@
IS_ACTIVE_BINDING
IS_ASCII
IS_BYTES
IS_CACHED
IS_LATIN1
IS_UTF8
InitDynload
......
......@@ -177,7 +177,7 @@ extern0 SEXP R_UnderscoreString; /* "_", as a CHARSXP */
#define LATIN1_MASK (1<<2)
#define UTF8_MASK (1<<3)
/* (1<<4) is taken by S4_OBJECT_MASK */
#define CACHED_MASK (1<<5)
#define CACHED_MASK (1<<5) /* no longer used */
#define ASCII_MASK (1<<6)
/* Symbol and string hash table declarations. */
......@@ -225,8 +225,7 @@ typedef union { int i; double r; } R_static_box_contents;
# define IS_UTF8(x) (UPTR_FROM_SEXP(x)->sxpinfo.gp & UTF8_MASK)
# define SET_UTF8(x) ((UPTR_FROM_SEXP(x)->sxpinfo.gp) |= UTF8_MASK)
# define ENC_KNOWN(x) (UPTR_FROM_SEXP(x)->sxpinfo.gp & (LATIN1_MASK|UTF8_MASK))
# define SET_CACHED(x) ((UPTR_FROM_SEXP(x)->sxpinfo.gp) |= CACHED_MASK)
# define IS_CACHED(x) ((UPTR_FROM_SEXP(x)->sxpinfo.gp) & CACHED_MASK)
# define IS_CACHED(x) 1 /* All strings are cached, except NA_STRING */
#else /* USE_RINTERNALS */
/* Needed only for write-barrier testing */
int IS_BYTES(SEXP x);
......@@ -238,8 +237,6 @@ void SET_ASCII(SEXP x);
int IS_UTF8(SEXP x);
void SET_UTF8(SEXP x);
int ENC_KNOWN(SEXP x);
int SET_CACHED(SEXP x);
int IS_CACHED(SEXP x);
#endif /* USE_RINTERNALS */
#include "Internal.h" /* do_FOO */
......@@ -1969,19 +1966,20 @@ static inline int ISNAN_NOT_NA (double x)
static inline int SEQL(SEXP a, SEXP b)
{
/* The only case where pointer comparisons do not suffice is where
we have two strings in different encodings (which must be
non-ASCII strings). Note that one of the strings could be marked
as unknown. */
if (a == b) return 1;
/* Leave this to compiler to optimize */
if (IS_CACHED(a) && IS_CACHED(b) && ENC_KNOWN(a) == ENC_KNOWN(b))
we have two strings in different encodings (which must be
non-ASCII strings). Note that one of the strings could be marked
as unknown. */
if (a == b)
return 1;
if (ENC_KNOWN(a) == ENC_KNOWN(b))
return 0;
else {
SEXP vmax = R_VStack;
int result = !strcmp(translateCharUTF8(a), translateCharUTF8(b));
R_VStack = vmax; /* discard any memory used by translateCharUTF8 */
return result;
}
SEXP vmax = R_VStack;
int result = !strcmp(translateCharUTF8(a), translateCharUTF8(b));
R_VStack = vmax; /* discard any memory used by translateCharUTF8 */
return result;
}
......
......@@ -632,9 +632,6 @@ void attribute_hidden (SET_LATIN1)(SEXP x) { SET_LATIN1(x); }
void attribute_hidden (SET_UTF8)(SEXP x) { SET_UTF8(x); }
void attribute_hidden (SET_ASCII)(SEXP x) { SET_ASCII(x); }
int attribute_hidden (ENC_KNOWN)(SEXP x) { return ENC_KNOWN(x); }
void attribute_hidden (SET_CACHED)(SEXP x) { SET_CACHED(x); }
int attribute_hidden (IS_CACHED)(SEXP x) { return IS_CACHED(x); }
/* ------------------------------------------------------------------------
......
......@@ -197,8 +197,7 @@ static void inspect_tree(int pre, SEXP v, int deep, int pvec, int prom) {
if (IS_LATIN1(v)) Rprintf(" [latin1]");
if (IS_UTF8(v)) Rprintf(" [UTF8]");
if (IS_ASCII(v)) Rprintf(" [ASCII]");
if (IS_CACHED(v)) Rprintf(" [cached]");
Rprintf("\"%s\"", CHAR(v));
Rprintf(" \"%s\"", CHAR(v));
}
if (TYPEOF(v) == SYMSXP) {
if (v == R_UnboundValue)
......
......@@ -2840,7 +2840,6 @@ SEXP mkCharLenCE(const char *name, int len, cetype_t enc)
}
if (is_ascii) SET_ASCII(val);
CHAR_HASH(val) = full_hash;
SET_CACHED(val); /* Mark it */
/* add the new value to the cache */
......@@ -2990,23 +2989,10 @@ R_FreeStringBufferL(R_StringBuffer *buf)
/* ======== These need direct access to gp field for efficiency ======== */
/* This has NA_STRING = NA_STRING. Inlined version is SEQL. */
/* This has NA_STRING = NA_STRING. Uses inlined version from Defn.h. */
int Seql(SEXP a, SEXP b)
{
/* The only case where pointer comparisons do not suffice is where
we have two strings in different encodings (which must be
non-ASCII strings). Note that one of the strings could be marked
as unknown. */
if (a == b) return 1;
/* Leave this to compiler to optimize */
if (IS_CACHED(a) && IS_CACHED(b) && ENC_KNOWN(a) == ENC_KNOWN(b))
return 0;
else {
SEXP vmax = R_VStack;
int result = !strcmp(translateCharUTF8(a), translateCharUTF8(b));
R_VStack = vmax; /* discard any memory used by translateCharUTF8 */
return result;
}
return SEQL(a,b);
}
......
......@@ -718,7 +718,6 @@ void InitNames()
/* NA_STRING */
NA_STRING = allocCharsxp(strlen("NA"));
strcpy(CHAR_RW(NA_STRING), "NA");
SET_CACHED(NA_STRING); /* Mark it */
R_print.na_string = NA_STRING;
/* Set up a set of globals so that a symbol table search can be
......
......@@ -48,7 +48,6 @@ struct _HashData {
int nomatch;
Rboolean useUTF8;
Rboolean useCache;
};
......@@ -154,8 +153,7 @@ static int shash(SEXP x, int indx, HashData *d)
unsigned int k;
const char *p;
const void *vmax = VMAXGET();
if(!d->useUTF8 && d->useCache) return cshash(x, indx, d);
/* Not having d->useCache really should not happen anymore. */
if(!d->useUTF8) return cshash(x, indx, d);
p = translateCharUTF8(STRING_ELT(x, indx));
k = 0;
while (*p++)
......@@ -212,7 +210,6 @@ static int sequal(SEXP x, int i, SEXP y, int j)
so avoid looking at the contents */
if (STRING_ELT(x, i) == STRING_ELT(y, j)) return 1;
/* Then if either is NA the other cannot be */
/* Once all CHARSXPs are cached, Seql will handle this */
if (STRING_ELT(x, i) == NA_STRING || STRING_ELT(y, j) == NA_STRING)
return 0;
return SEQL(STRING_ELT(x, i), STRING_ELT(y, j));
......@@ -319,7 +316,6 @@ static void MKsetup(int n, HashData *d)
static void HashTableSetup(SEXP x, HashData *d)
{
d->useUTF8 = FALSE;
d->useCache = TRUE;
switch (TYPEOF(x)) {
case LGLSXP:
d->hash = lhash;
......@@ -412,7 +408,7 @@ SEXP duplicated(SEXP x, Rboolean from_last)
HashTableSetup(x, &data); \
h = INTEGER(data.HashTable); \
if(TYPEOF(x) == STRSXP) { \
data.useUTF8 = FALSE; data.useCache = TRUE; \
data.useUTF8 = FALSE; \
for(i = 0; i < LENGTH(x); i++) { \
if(IS_BYTES(STRING_ELT(x, i))) { \
data.useUTF8 = FALSE; break; \
......@@ -420,9 +416,6 @@ SEXP duplicated(SEXP x, Rboolean from_last)
if(ENC_KNOWN(STRING_ELT(x, i))) { \
data.useUTF8 = TRUE; \
} \
if(!IS_CACHED(STRING_ELT(x, i))) { \
data.useCache = FALSE; break; \
} \
} \
}
......@@ -748,7 +741,6 @@ SEXP match5(SEXP itable, SEXP ix, int nmatch, SEXP incomp, SEXP env)
if(type == STRSXP) {
Rboolean useBytes = FALSE;
Rboolean useUTF8 = FALSE;
Rboolean useCache = TRUE;
int len_x = length(x);
for(i = 0; i < len_x; i++) {
SEXP s = STRING_ELT(x, i);
......@@ -760,31 +752,20 @@ SEXP match5(SEXP itable, SEXP ix, int nmatch, SEXP incomp, SEXP env)
if(ENC_KNOWN(s)) {
useUTF8 = TRUE;
}
if(!IS_CACHED(s)) {
useCache = FALSE;
break;
}
}
if(!useBytes || useCache) {
int len_table = length(table);
for(i = 0; i < len_table; i++) {
SEXP s = STRING_ELT(table, i);
if(IS_BYTES(s)) {
useBytes = TRUE;
useUTF8 = FALSE;
break;
}
if(ENC_KNOWN(s)) {
useUTF8 = TRUE;
}
if(!IS_CACHED(s)) {
useCache = FALSE;
break;
}
int len_table = length(table);
for(i = 0; i < len_table; i++) {
SEXP s = STRING_ELT(table, i);
if(IS_BYTES(s)) {
useBytes = TRUE;
useUTF8 = FALSE;
break;
}
if(ENC_KNOWN(s)) {
useUTF8 = TRUE;
}
}
data.useUTF8 = useUTF8;
data.useCache = useCache;
}
DoHashing(table, &data);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment