Re: [Rd] [R] HTTP User-Agent header

From: Seth Falcon <sfalcon_at_fhcrc.org>
Date: Sun 30 Jul 2006 - 14:39:04 GMT

Robert Gentleman <rgentlem@fhcrc.org> writes:

> OK, that suggests setting at the options level would solve both of your 
> problems and that seems like the best approach. I don't really want to 
> pass this around as a parameter through the maze of functions that might 
> actually download something if we don't have to.

I have an updated patch that adds an HTTPUserAgent option. The default is a string like:

    R (2.4.0 x86_64-unknown-linux-gnu x86_64 linux-gnu)

If the HTTPUserAgent option is NULL, no user agent header is added to HTTP requests (this is the current behavior). This option allows R to use an arbitrary user agent header.

The patch adds two non-exported functions to utils:

  1. defaultUserAgent - returns a string like above
  2. makeUserAgent - formats content of HTTPUserAgent option for use as part of an HTTP request header.

I've tested on OSX and Linux, but not on Windows. When USE_WININET is defined, a user agent string of "R" was already being used. With this patch, the HTTPUserAgent options is used. I'm unsure if NULL is allowed.

Also, in src/main/internet.c there is a comment:   "Next 6 are for use by libxml, only"
and then a definition for R_HTTPOpen. Not sure how/when these get used. The user agent for these calls remains unspecified with this patch.

+ seth

Patch summary:

 src/include/R_ext/R-ftp-http.h   |    2 +-
 src/include/Rmodules/Rinternet.h |    2 +-
 src/library/base/man/options.Rd  |    5 +++++
 src/library/utils/R/readhttp.R   |   25 +++++++++++++++++++++++++
 src/library/utils/R/zzz.R        |    3 ++-
 src/main/internet.c              |    2 +-
 src/modules/internet/internet.c  |   37 +++++++++++++++++++++++++------------
 src/modules/internet/nanohttp.c  |    8 ++++++--
 8 files changed, 66 insertions(+), 18 deletions(-)

Index: src/include/R_ext/R-ftp-http.h


-void *	RxmlNanoHTTPOpen(const char *URL, char **contentType, int cacheOK);
+void *	RxmlNanoHTTPOpen(const char *URL, char **contentType, const char *headers, int cacheOK);
 int	RxmlNanoHTTPRead(void *ctx, void *dest, int len);
 void	RxmlNanoHTTPClose(void *ctx);
 int 	RxmlNanoHTTPReturnCode(void *ctx);
Index: src/include/Rmodules/Rinternet.h
-typedef void * (*R_HTTPOpenRoutine)(const char *url, const int cacheOK);
+typedef void * (*R_HTTPOpenRoutine)(const char *url, const char *headers, const int cacheOK);
 typedef int    (*R_HTTPReadRoutine)(void *ctx, char *dest, int len);
 typedef void   (*R_HTTPCloseRoutine)(void *ctx);
 	      

Index: src/main/internet.c


+    \item{\code{HTTPUserAgent}:}{string used as the user agent in HTTP
+      requests.  If \code{NULL}, HTTP requests will be made without a
+      user agent header.  The default is \code{R (<version> <platform>
+      <arch> <os>)}}
+
     \item{\code{internet.info}:}{The minimum level of information to be
       printed on URL downloads etc.  Default is 2, for failure causes.
       Set to 1 or 0 to get more information.}
Index: src/modules/internet/internet.c
-static void *in_R_HTTPOpen(const char *url, const int cacheOK);
+static void *in_R_HTTPOpen(const char *url, const char *headers, const int cacheOK);
 static int   in_R_HTTPRead(void *ctx, char *dest, int len);
 static void  in_R_HTTPClose(void *ctx);
 

@@ -70,7 +70,7 @@
 

     switch(type) {
     case HTTPsh:
-	ctxt = in_R_HTTPOpen(url, 0);
+	ctxt = in_R_HTTPOpen(url, NULL, 0);
 	if(ctxt == NULL) {
 	  /* if we call error() we get a connection leak*/
 	  /* so do_url has to raise the error*/

@@ -238,14 +238,14 @@
 }
 #endif  

-/* download(url, destfile, quiet, mode, cacheOK) */ +/* download(url, destfile, quiet, mode, headers, cacheOK) */  

 #define CPBUFSIZE 65536
 #define IBUFSIZE 4096
 static SEXP in_do_download(SEXP call, SEXP op, SEXP args, SEXP env)  {
- SEXP ans, scmd, sfile, smode;
- char *url, *file, *mode;
+ SEXP ans, scmd, sfile, smode, sheaders, agentFun; + char *url, *file, *mode, *headers;

     int quiet, status = 0, cacheOK;  

     checkArity(op, args);
@@ -271,6 +271,17 @@

     cacheOK = asLogical(CAR(args));
     if(cacheOK == NA_LOGICAL)
 	error(_("invalid '%s' argument"), "cacheOK");
+#ifdef USE_WININET
+    PROTECT(agentFun = lang2(install("makeUserAgent"), ScalarLogical(0)));
+#else
+    PROTECT(agentFun = lang1(install("makeUserAgent")));
+#endif
+    PROTECT(sheaders = eval(agentFun, R_FindNamespace(mkString("utils"))));
+    UNPROTECT(1);
+    if(TYPEOF(sheaders) == NILSXP)
+        headers = NULL;
+    else 
+        headers = CHAR(STRING_ELT(sheaders, 0));
 #ifdef Win32
     if (!pbar.wprog) {
 	pbar.wprog = newwindow(_("Download progress"), rect(0, 0, 540, 100),

@@ -319,7 +330,7 @@

 #ifdef Win32

         R_FlushConsole();
 #endif

-	ctxt = in_R_HTTPOpen(url, cacheOK);
+	ctxt = in_R_HTTPOpen(url, headers, cacheOK);
 	if(ctxt == NULL) status = 1;
 	else {
 	    if(!quiet) REprintf(_("opened URL\n"), url);

@@ -466,14 +477,14 @@
 
     PROTECT(ans = allocVector(INTSXP, 1));
     INTEGER(ans)[0] = status;

- UNPROTECT(1);
+ UNPROTECT(2);
     return ans;
 }    

 #if defined(SUPPORT_LIBXML) && !defined(USE_WININET)  

-void *in_R_HTTPOpen(const char *url, int cacheOK) +void *in_R_HTTPOpen(const char *url, const char *headers, const int cacheOK)  {

     inetconn *con;
     void *ctxt;

@@ -484,7 +495,7 @@
if(timeout == NA_INTEGER || timeout <= 0) timeout = 60; RxmlNanoHTTPTimeout(timeout);

- ctxt = RxmlNanoHTTPOpen(url, NULL, cacheOK); + ctxt = RxmlNanoHTTPOpen(url, NULL, headers, cacheOK);
     if(ctxt != NULL) {
 	int rc = RxmlNanoHTTPReturnCode(ctxt);
 	if(rc != 200) {

@@ -605,7 +616,8 @@

 }
 #endif /* USE_WININET_ASYNC */  

-static void *in_R_HTTPOpen(const char *url, const int cacheOK) +static void *in_R_HTTPOpen(const char *url, const char *headers,

+                           const int cacheOK)
 {
     WIctxt  wictxt;
     DWORD status, d1 = 4, d2 = 0, d3 = 100;

@@ -622,7 +634,7 @@
wictxt->length = -1; wictxt->type = NULL; wictxt->hand = - InternetOpen("R", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, + InternetOpen(headers, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, #ifdef USE_WININET_ASYNC INTERNET_FLAG_ASYNC

 #else
@@ -870,7 +882,8 @@

 #endif  

 #ifndef HAVE_INTERNET
-static void *in_R_HTTPOpen(const char *url, const int cacheOK) +static void *in_R_HTTPOpen(const char *url, const char *headers,

+                           const int cacheOK)
 {
     return NULL;

 }
Index: src/modules/internet/nanohttp.c

 void*
-RxmlNanoHTTPOpen(const char *URL, char **contentType, int cacheOK) +RxmlNanoHTTPOpen(const char *URL, char **contentType, const char *headers,

+                 int cacheOK)
 {
     if (contentType != NULL) *contentType = NULL;
- return RxmlNanoHTTPMethod(URL, NULL, NULL, contentType, NULL, cacheOK); + return RxmlNanoHTTPMethod(URL, NULL, NULL, contentType, headers, cacheOK);  }  

 /**



R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel Received on Mon Jul 31 10:34:41 2006

Archive maintained by Robert King, hosted by the discipline of statistics at the University of Newcastle, Australia.
Archive generated by hypermail 2.1.8, at Mon 31 Jul 2006 - 18:30:05 GMT.

Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-help. Please read the posting guide before posting to the list.