[Rd] "bug" and patch: quadratic running time for strsplit(..., fixed=TRUE) (PR#9902)

From: <jbrzusto_at_fastmail.fm>
Date: Fri, 07 Sep 2007 15:47:26 +0200 (CEST)


Full_Name: John Brzustowski
Version: R-devel-trunk, R-2.4.0
OS: linux, gcc 4.0.3
Submission from: (NULL) (206.248.157.184)

This isn't a bug, but an easily-remedied performance issue.

SYMPTOM > for (i in 1000 * (1:20)) {

   y <- paste(rep("asdf", times=i), collapse=" ")    t <- system.time(strsplit(y, " ", fixed=TRUE))    cat(sprintf("i=%5d time=%5d msec\n",i, round(1000*t[1]))) }

i= 1000 time=    2 msec
i= 2000 time=    9 msec
i= 3000 time=   20 msec
i= 4000 time=   34 msec
i= 5000 time=   57 msec
i= 6000 time=   77 msec
i= 7000 time=  107 msec
i= 8000 time=  136 msec
i= 9000 time=  177 msec
i=10000 time=  230 msec
i=11000 time=  275 msec
i=12000 time=  308 msec
i=13000 time=  371 msec
i=14000 time=  446 msec
i=15000 time=  544 msec
i=16000 time=  639 msec
i=17000 time=  726 msec
i=18000 time=  864 msec
i=19000 time=  944 msec
i=20000 time= 1106 msec

DIAGNOSIS strsplit() uses strlen() in the bounds check clause of a for(;;) statement, which forces a full scan of the source string for each character in the source string. Unlike R's LENGTH() macro, strlen for C strings is an expensive operation, and in this case (at least), gcc 4.0.3's -O2 level optimizer is not able to recognize the call as a loop invariant, despite the declaration "const char *buf".

REMEDIED BEHAVIOUR

i= 1000 time=    0 msec
i= 2000 time=    1 msec
i= 3000 time=    1 msec
i= 4000 time=    0 msec
i= 5000 time=    1 msec
i= 6000 time=    1 msec
i= 7000 time=    1 msec
i= 8000 time=    2 msec
i= 9000 time=    2 msec
i=10000 time=    2 msec
i=11000 time=    2 msec
i=12000 time=    2 msec
i=13000 time=    2 msec
i=14000 time=    2 msec
i=15000 time=    4 msec
i=16000 time=    3 msec
i=17000 time=    3 msec
i=18000 time=    4 msec
i=19000 time=    3 msec
i=20000 time=    4 msec

RELATED ISSUES A simple search turns up other instances of this usage in R's source. For completeness, I'm submitting patches for all of them, but have not tested whether they in fact cause a detectable performance problem. In the case of modules/X11/dataentry.c, the patch also fixes a presumably ineffectual "bug".

$ grep -nR "for *([^;]*;[^;]*strlen *(" *

main/rlocale.c:137:		for (i = 0; i < strlen(lc_str) && i < sizeof(lc_str); i++)
main/printutils.c:486:		for(j = 0; j < strlen(buf); j++) *q++ = buf[j];
main/sysutils.c:493:		for(j = 0; j < strlen(sub); j++) *outbuf++ = sub[j];
modules/X11/rotated.c:608:	for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:856:	for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:1399:	for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:1797:	for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:2045:	for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:2339:	for(i=0; i<strlen(text)-1; i++)
modules/X11/dataentry.c:1358: for(i = 0; i < strlen(text); i++) *bufp++ = text[i];

PATCHES (against trunk)
Only the first is required to fix strsplit().

Index: src/main/character.c


     static char *lc_cache = "";
     static int lc = 0;
 
     if (0 != strcmp(setlocale(LC_CTYPE, NULL), lc_cache)) {
 	strncpy(lc_str, setlocale(LC_CTYPE, NULL), sizeof(lc_str));
-	for (i = 0; i < strlen(lc_str) && i < sizeof(lc_str); i++)
+	for (i = 0, j = strlen(lc_str); i < j && i < sizeof(lc_str); i++)
 	    lc_str[i] = toupper(lc_str[i]);
 	for (i = 0; i < (sizeof(cjk_locale_name)/sizeof(cjk_locale_name_t)); 
 	     i++) {

Index: src/main/printutils.c
     /* count number of sections in string */
     if(align!=NONE)
-	for(i=0; i<strlen(text)-1; i++)
+	for(i=strlen(text)-2; i >= 0; i--)
 	    if(text[i]=='\n')
 		nl++;
 

@@ -853,7 +853,7 @@
/* count number of sections in string */ item->nl=1; if(align!=NONE) - for(i=0; i<strlen(text)-1; i++) + for(i=strlen(text)-2; i >= 0; i--) if(text[i]=='\n') item->nl++;
@@ -1396,7 +1396,7 @@
/* count number of sections in string */ nl=1; if(align!=NONE) - for(i=0; i<strlen(text)-1; i++) + for(i=strlen(text)-2; i >= 0; i--) if(text[i]=='\n') nl++;

@@ -1794,7 +1794,7 @@
 

     /* count number of sections in string */
     if(align!=NONE)
-	for(i=0; i<strlen(text)-1; i++)
+	for(i=strlen(text)-2; i >= 0; i--)
 	    if(text[i]=='\n')
 		nl++;
 

@@ -2042,7 +2042,7 @@
/* count number of sections in string */ item->nl=1; if(align!=NONE) - for(i=0; i<strlen(text)-1; i++) + for(i=strlen(text)-2; i >= 0; i--) if(text[i]=='\n') item->nl++;
@@ -2336,7 +2336,7 @@
/* count number of sections in string */ nl=1; if(align!=NONE) - for(i=0; i<strlen(text)-1; i++) + for(i=strlen(text)-2; i >= 0; i--) if(text[i]=='\n') nl++;

Index: src/modules/X11/dataentry.c


@@ -1355,9 +1355,13 @@

 	goto donehc;
     }
 
-    for(i = 0; i < strlen(text); i++) *bufp++ = text[i];
- *(bufp+1) = '\0';
- clength += strlen(text);
+    /* as originally written, this left an undefined byte at
+       the end of bufp, followed by a zero byte; luckily, the storage
+       pointed to by bufp had already been zeroed, so the undefined
+       byte was in fact zero.  */
+    strcpy(bufp, text);
+    bufp += (j = strlen(text));
+    clength += j;
     printstring(DE, buf, clength, DE->crow, DE->ccol, 1);
     return;

______________________________________________
R-devel_at_r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel Received on Fri 07 Sep 2007 - 18:37:04 GMT

Archive maintained by Robert King, hosted by the discipline of statistics at the University of Newcastle, Australia.
Archive generated by hypermail 2.2.0, at Sat 08 Sep 2007 - 14:40:34 GMT.

Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-devel. Please read the posting guide before posting to the list.