Re: [Rd] [R] HTTP User-Agent header

From: Seth Falcon <sfalcon_at_fhcrc.org>
Date: Fri 28 Jul 2006 - 16:44:08 GMT

[moved from R-help to R-devel]

Prof Brian Ripley <ripley@stats.ox.ac.uk> writes:

> Otherwise, see ?download.file and choose a different download method,
> or look at the source code (src/modules/internet/nanohttp.c) and submit a 
> patch.

I have a rough draft patch, see below, that adds a User-Agent header to HTTP requests made in R via download.file. If there is interest, I will polish it.

Why have R identify itself? Well, I think it is reasonable behavior for legitimate "browsers" to identify themselves. It will help a user whose institution has a rather harsh web proxy policy (however, silly it may be). It will also be of use in tracking use of R, versions, and OSes on CRAN mirrors.

Here is an example of what the user-agent string will be for an R running on OSX:

    R (2.4.0 powerpc-apple-darwin8.7.0 powerpc darwin8.7.0)

And here is the patch...

+ seth

Index: src/include/R_ext/R-ftp-http.h


-void *	RxmlNanoHTTPOpen(const char *URL, char **contentType, int cacheOK);
+void *	RxmlNanoHTTPOpen(const char *URL, char **contentType, const char *headers, int cacheOK);
 int	RxmlNanoHTTPRead(void *ctx, void *dest, int len);
 void	RxmlNanoHTTPClose(void *ctx);
 int 	RxmlNanoHTTPReturnCode(void *ctx);
Index: src/include/Rmodules/Rinternet.h
-typedef void * (*R_HTTPOpenRoutine)(const char *url, const int cacheOK);
+typedef void * (*R_HTTPOpenRoutine)(const char *url, const char *headers, const int cacheOK);
 typedef int    (*R_HTTPReadRoutine)(void *ctx, char *dest, int len);
 typedef void   (*R_HTTPCloseRoutine)(void *ctx);
 	      

Index: src/main/names.c


Index: src/main/memory.c


     if(method == "auto") {
         if(capabilities("http/ftp"))

@@ -22,7 +24,8 @@
stop("no download method found") } if(method == "internal") - status <- .Internal(download(url, destfile, quiet, mode, cacheOK)) + status <- .Internal(download(url, destfile, quiet, mode, headers, + cacheOK)) else if(method == "wget") { extra <- if(quiet) " --quiet" else "" if(!cacheOK) extra <- paste(extra, "--cache=off")
Index: src/library/utils/R/windows/download.file.R
     if(method == "auto") {
         if(capabilities("http/ftp"))

@@ -22,7 +24,8 @@
stop("no download method found") } if(method == "internal") - status <- .Internal(download(url, destfile, quiet, mode, cacheOK)) + status <- .Internal(download(url, destfile, quiet, mode, headers, + cacheOK)) else if(method == "wget") { extra <- if(quiet) " --quiet" else "" if(!cacheOK) extra <- paste(extra, "--cache=off")
Index: src/library/utils/R/readhttp.R

   \item{cacheOK}{logical. Is a server-side cached value acceptable?      Implemented for the \code{"internal"} and \code{"wget"} methods.}

+
+  \item{headers}{character. Headers to be used in the HTTP request.
+  This should be a character vector of length one formatted correctly
+  for use in the HTTP header.  The default value of \code{NULL}
+  results in a standard user agent header to be added to the HTTP
+  request that identified R as \code{User-Agent: R (X.Y.Z platform
+  arch os)}.  }

 }
 \details{
   The function \code{download.file} can be used to download a single Index: src/modules/internet/internet.c
-static void *in_R_HTTPOpen(const char *url, const int cacheOK);
+static void *in_R_HTTPOpen(const char *url, const char *headers, const int cacheOK);
 static int   in_R_HTTPRead(void *ctx, char *dest, int len);
 static void  in_R_HTTPClose(void *ctx);
 

@@ -70,7 +70,7 @@
 

     switch(type) {
     case HTTPsh:
-	ctxt = in_R_HTTPOpen(url, 0);
+	ctxt = in_R_HTTPOpen(url, NULL, 0);
 	if(ctxt == NULL) {
 	  /* if we call error() we get a connection leak*/
 	  /* so do_url has to raise the error*/

@@ -238,14 +238,14 @@

 }
 #endif  

-/* download(url, destfile, quiet, mode, cacheOK) */ +/* download(url, destfile, quiet, mode, headers, cacheOK) */  

 #define CPBUFSIZE 65536
 #define IBUFSIZE 4096
 static SEXP in_do_download(SEXP call, SEXP op, SEXP args, SEXP env)  {
- SEXP ans, scmd, sfile, smode;
- char *url, *file, *mode;
+ SEXP ans, scmd, sfile, smode, sheaders; + char *url, *file, *mode, *headers;

     int quiet, status = 0, cacheOK;  

     checkArity(op, args);
@@ -268,6 +268,14 @@

     if(!isString(smode) || length(smode) != 1)
 	error(_("invalid '%s' argument"), "mode");
     mode = CHAR(STRING_ELT(smode, 0));
+    sheaders = CAR(args); args = CDR(args);
+    if(TYPEOF(sheaders) == NILSXP)
+        headers = NULL;
+    else {
+        if(!isString(sheaders) || length(sheaders) != 1)
+            error(_("invalid '%s' argument"), "headers");
+        headers = CHAR(STRING_ELT(sheaders, 0));
+    }
     cacheOK = asLogical(CAR(args));
     if(cacheOK == NA_LOGICAL)
 	error(_("invalid '%s' argument"), "cacheOK");

@@ -319,7 +327,7 @@

 #ifdef Win32

         R_FlushConsole();
 #endif

-	ctxt = in_R_HTTPOpen(url, cacheOK);
+	ctxt = in_R_HTTPOpen(url, headers, cacheOK);
 	if(ctxt == NULL) status = 1;
 	else {
 	    if(!quiet) REprintf(_("opened URL\n"), url);

@@ -473,7 +481,7 @@
 

 #if defined(SUPPORT_LIBXML) && !defined(USE_WININET)  

-void *in_R_HTTPOpen(const char *url, int cacheOK) +void *in_R_HTTPOpen(const char *url, const char *headers, const int cacheOK)  {

     inetconn *con;
     void *ctxt;

@@ -484,15 +492,15 @@
if(timeout == NA_INTEGER || timeout <= 0) timeout = 60; RxmlNanoHTTPTimeout(timeout);

- ctxt = RxmlNanoHTTPOpen(url, NULL, cacheOK); + ctxt = RxmlNanoHTTPOpen(url, NULL, headers, cacheOK);
     if(ctxt != NULL) {
 	int rc = RxmlNanoHTTPReturnCode(ctxt);
 	if(rc != 200) {
 	    char *msg;
-	    RxmlNanoHTTPClose(ctxt);
 	    /* bug work-around: it will crash on OS X if passed directly */
 	    msg = _("cannot open: HTTP status was '%d %s'");
 	    warning(msg, rc, RxmlNanoHTTPStatusMsg(ctxt));
+	    RxmlNanoHTTPClose(ctxt);
 	    return NULL;
 	} else {
 	    type = RxmlNanoHTTPContentType(ctxt);
Index: src/modules/internet/nanohttp.c

 void*
-RxmlNanoHTTPOpen(const char *URL, char **contentType, int cacheOK) +RxmlNanoHTTPOpen(const char *URL, char **contentType, const char *headers,

+                 int cacheOK)
 {
     if (contentType != NULL) *contentType = NULL;
- return RxmlNanoHTTPMethod(URL, NULL, NULL, contentType, NULL, cacheOK); + return RxmlNanoHTTPMethod(URL, NULL, NULL, contentType, headers, cacheOK);  }  

 /**



R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel Received on Sat Jul 29 06:11:44 2006

Archive maintained by Robert King, hosted by the discipline of statistics at the University of Newcastle, Australia.
Archive generated by hypermail 2.1.8, at Fri 28 Jul 2006 - 22:27:45 GMT.

Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-help. Please read the posting guide before posting to the list.