[Rd] chartr better

From: Ei-ji Nakama <nakama_at_ki.rim.or.jp>
Date: Thu, 13 Sep 2007 16:29:30 +0900


For example, the following changes are necessary when i convert a Japanese hiragana into katakana in chattr.

R code:
> chartr("\u3041-\u3093","\u30a1-\u30f3","\u3084\u3063\u305f\u30fc")

+typedef struct { wchar_t c_old, c_new; } xtable_t;
+static inline int xtable_comp(const xtable_t *a, const xtable_t *b)
+{
+ return a->c_old - b->c_old;
+}
+static inline int xtable_key_comp(const wchar_t *a, const xtable_t *b)
+{
+ return *a - b->c_old;
+}
+

 SEXP attribute_hidden do_chartr(SEXP call, SEXP op, SEXP args, SEXP env)  {

     SEXP old, _new, x, y;
@@ -2064,14 +2074,18 @@
 #ifdef SUPPORT_MBCS

     if(mbcslocale) {
         int j, nb, nc;
-        wchar_t xtable[65536 + 1], c_old, c_new, *wc;

+ xtable_t *xtable;
+ int xtable_cnt;
+ wchar_t c_old, c_new, *wc;
const char *xi, *s; struct wtr_spec *trs_old, **trs_old_ptr; struct wtr_spec *trs_new, **trs_new_ptr; - - for(i = 0; i <= UCHAR_MAX; i++) xtable[i] = i;
+ struct wtr_spec *trs_cnt, **trs_cnt_ptr;
/* Initialize the old and new wtr_spec lists. */
+ trs_cnt = Calloc(1, struct wtr_spec);
+ trs_cnt->type = WTR_INIT;
+ trs_cnt->next = NULL;
trs_old = Calloc(1, struct wtr_spec); trs_old->type = WTR_INIT; trs_old->next = NULL; @@ -2084,6 +2098,7 @@ if(nc < 0) error(_("invalid multibyte string 'old'")); wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff); mbstowcs(wc, s, nc + 1);
+ wtr_build_spec(wc, trs_cnt); /* use count only */
wtr_build_spec(wc, trs_old); s = translateChar(STRING_ELT(_new, 0)); @@ -2096,38 +2111,54 @@ /* Initialize the pointers for walking through the old and new wtr_spec lists and retrieving the next chars from the lists. */
+ trs_cnt_ptr = Calloc(1, struct wtr_spec *);
+ *trs_cnt_ptr = trs_cnt->next;
+ for( xtable_cnt = 0 ; wtr_get_next_char_from_spec(trs_cnt_ptr)
;xtable_cnt++ );
+ Free(trs_cnt_ptr);
+ xtable = (xtable_t *)R_alloc(xtable_cnt+1,sizeof(xtable_t));
+
trs_old_ptr = Calloc(1, struct wtr_spec *); *trs_old_ptr = trs_old->next; trs_new_ptr = Calloc(1, struct wtr_spec *); *trs_new_ptr = trs_new->next; - for(;;) {
+ for(i=0;;i++) {
c_old = wtr_get_next_char_from_spec(trs_old_ptr); c_new = wtr_get_next_char_from_spec(trs_new_ptr); if(c_old == '\0') break; else if(c_new == '\0') error(_("'old' is longer than 'new'")); - else - xtable[c_old] = c_new;
+ else{
+ xtable[i].c_old=c_old;
+ xtable[i].c_new=c_new;
+ }
}
+
/* Free the memory occupied by the wtr_spec lists. */ wtr_free_spec(trs_old); wtr_free_spec(trs_new); Free(trs_old_ptr); Free(trs_new_ptr);
+ qsort(xtable, xtable_cnt, sizeof(xtable_t),
+ (int(*)(const void *, const void *))xtable_comp);
+
n = LENGTH(x); PROTECT(y = allocVector(STRSXP, n)); for(i = 0; i < n; i++) { if (STRING_ELT(x,i) == NA_STRING) SET_STRING_ELT(y, i, NA_STRING); else {
+ xtable_t *tbl;
xi = translateChar(STRING_ELT(x, i)); nc = mbstowcs(NULL, xi, 0); if(nc < 0) error(_("invalid input multibyte string %d"), i+1); wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff); mbstowcs(wc, xi, nc + 1); - for(j = 0; j < nc; j++) wc[j] = xtable[wc[j]];
+ for(j = 0; j < nc; j++)
+ if (tbl = bsearch(&wc[j], xtable, xtable_cnt,
sizeof(xtable_t),
+ (int(*)(const void *, const
void *))xtable_key_comp))
+ wc[j]=tbl->c_new;
nb = wcstombs(NULL, wc, 0); cbuf = CallocCharBuf(nb); wcstombs(cbuf, wc, nb + 1);
-- 
EI-JI Nakama  <nakama_at_ki.rim.or.jp>
"\u4e2d\u9593\u6804\u6cbb"  <nakama_at_ki.rim.or.jp>

______________________________________________
R-devel_at_r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel
Received on Thu 13 Sep 2007 - 07:35:36 GMT

Archive maintained by Robert King, hosted by the discipline of statistics at the University of Newcastle, Australia.
Archive generated by hypermail 2.2.0, at Wed 19 Sep 2007 - 14:41:03 GMT.

Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-devel. Please read the posting guide before posting to the list.