Dogcows Code - chaz/tar/blob - lib/unicodeio.c

   1 /* Unicode character output to streams with locale dependent encoding.
   2
   3    Copyright (C) 2000, 2001 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software Foundation,
  17    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  18
  19 /* Written by Bruno Haible <haible@clisp.cons.org>.  */
  20
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #if HAVE_STDDEF_H
  26 # include <stddef.h>
  27 #endif
  28
  29 #include <stdio.h>
  30 #if HAVE_STRING_H
  31 # include <string.h>
  32 #else
  33 # include <strings.h>
  34 #endif
  35
  36 #include <errno.h>
  37 #ifndef errno
  38 extern int errno;
  39 #endif
  40
  41 #if HAVE_ICONV
  42 # include <iconv.h>
  43 #endif
  44
  45 /* Some systems, like SunOS 4, don't have EILSEQ.  On these systems,
  46    define EILSEQ to some value other than EINVAL, because our invokers
  47    may want to distinguish EINVAL from EILSEQ.  */
  48 #ifndef EILSEQ
  49 # define EILSEQ ENOENT
  50 #endif
  51 #ifndef ENOTSUP
  52 # define ENOTSUP EINVAL
  53 #endif
  54
  55 #if HAVE_LANGINFO_CODESET && ! USE_INCLUDED_LIBINTL
  56 # include <langinfo.h>
  57 #endif
  58
  59 #include "unicodeio.h"
  60
  61 /* When we pass a Unicode character to iconv(), we must pass it in a
  62    suitable encoding. The standardized Unicode encodings are
  63    UTF-8, UCS-2, UCS-4, UTF-16, UTF-16BE, UTF-16LE, UTF-7.
  64    UCS-2 supports only characters up to \U0000FFFF.
  65    UTF-16 and variants support only characters up to \U0010FFFF.
  66    UTF-7 is way too complex and not supported by glibc-2.1.
  67    UCS-4 specification leaves doubts about endianness and byte order
  68    mark. glibc currently interprets it as big endian without byte order
  69    mark, but this is not backed by an RFC.
  70    So we use UTF-8. It supports characters up to \U7FFFFFFF and is
  71    unambiguously defined.  */
  72
  73 /* Stores the UTF-8 representation of the Unicode character wc in r[0..5].
  74    Returns the number of bytes stored, or -1 if wc is out of range.  */
  75 static int
  76 utf8_wctomb (unsigned char *r, unsigned int wc)
  77 {
  78   int count;
  79
  80   if (wc < 0x80)
  81     count = 1;
  82   else if (wc < 0x800)
  83     count = 2;
  84   else if (wc < 0x10000)
  85     count = 3;
  86   else if (wc < 0x200000)
  87     count = 4;
  88   else if (wc < 0x4000000)
  89     count = 5;
  90   else if (wc <= 0x7fffffff)
  91     count = 6;
  92   else
  93     return -1;
  94
  95   switch (count)
  96     {
  97       /* Note: code falls through cases! */
  98       case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
  99       case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
 100       case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
 101       case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
 102       case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
 103       case 1: r[0] = wc;
 104     }
 105
 106   return count;
 107 }
 108
 109 /* Luckily, the encoding's name is platform independent.  */
 110 #define UTF8_NAME "UTF-8"
 111
 112 /* Converts the Unicode character CODE to its multibyte representation
 113    in the current locale and calls SUCCESS on the resulting byte
 114    sequence.  If an error occurs, invoke FAILURE instead,
 115    passing it CODE with errno set appropriately.
 116    Assumes that the locale doesn't change between two calls.
 117    Return whatever the SUCCESS or FAILURE returns.  */
 118 int
 119 unicode_to_mb (unsigned int code,
 120                int (*success) PARAMS((const char *buf, size_t buflen,
 121                                       void *callback_arg)),
 122                int (*failure) PARAMS((unsigned int code,
 123                                       void *callback_arg)),
 124                void *callback_arg)
 125 {
 126   static int initialized;
 127   static int is_utf8;
 128 #if HAVE_ICONV
 129   static iconv_t utf8_to_local;
 130 #endif
 131
 132   char inbuf[6];
 133   int count;
 134
 135   if (!initialized)
 136     {
 137       const char *charset;
 138
 139 #if USE_INCLUDED_LIBINTL
 140       extern const char *locale_charset PARAMS ((void));
 141       charset = locale_charset ();
 142 #else
 143 # if HAVE_LANGINFO_CODESET
 144       charset = nl_langinfo (CODESET);
 145 # else
 146       charset = "";
 147 # endif
 148 #endif
 149
 150       is_utf8 = !strcmp (charset, UTF8_NAME);
 151 #if HAVE_ICONV
 152       if (!is_utf8)
 153         {
 154           utf8_to_local = iconv_open (charset, UTF8_NAME);
 155           if (utf8_to_local == (iconv_t)(-1))
 156             {
 157               /* For an unknown encoding, assume ASCII.  */
 158               utf8_to_local = iconv_open ("ASCII", UTF8_NAME);
 159               if (utf8_to_local == (iconv_t)(-1))
 160                 return failure (code, callback_arg);
 161             }
 162         }
 163 #endif
 164       initialized = 1;
 165     }
 166
 167   /* Convert the character to UTF-8.  */
 168   count = utf8_wctomb ((unsigned char *) inbuf, code);
 169   if (count < 0)
 170     {
 171       errno = EILSEQ;
 172       return failure (code, callback_arg);
 173     }
 174
 175   if (is_utf8)
 176     {
 177       return success (inbuf, count, callback_arg);
 178     }
 179   else
 180     {
 181 #if HAVE_ICONV
 182       char outbuf[25];
 183       const char *inptr;
 184       size_t inbytesleft;
 185       char *outptr;
 186       size_t outbytesleft;
 187       size_t res;
 188
 189       inptr = inbuf;
 190       inbytesleft = count;
 191       outptr = outbuf;
 192       outbytesleft = sizeof (outbuf);
 193
 194       /* Convert the character from UTF-8 to the locale's charset.  */
 195       res = iconv (utf8_to_local,
 196                    (ICONV_CONST char **)&inptr, &inbytesleft,
 197                    &outptr, &outbytesleft);
 198       if (inbytesleft > 0 || res == (size_t)(-1)
 199           /* Irix iconv() inserts a NUL byte if it cannot convert. */
 200 # if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
 201           || (res > 0 && code != 0 && outptr - outbuf == 1 && *outbuf == '\0')
 202 # endif
 203          )
 204         {
 205           if (res != (size_t)(-1))
 206             errno = EILSEQ;
 207           return failure (code, callback_arg);
 208         }
 209
 210       /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 211 # if defined _LIBICONV_VERSION \
 212     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
 213
 214       /* Get back to the initial shift state.  */
 215       res = iconv (utf8_to_local, NULL, NULL, &outptr, &outbytesleft);
 216       if (res == (size_t)(-1))
 217         return failure (code, callback_arg);
 218 # endif
 219
 220       return success (outbuf, outptr - outbuf, callback_arg);
 221 #else
 222       errno = ENOTSUP;
 223       return failure (code, callback_arg);
 224 #endif
 225     }
 226 }
 227
 228 /* Simple success callback that outputs the converted string.
 229    The STREAM is passed as callback_arg.  */
 230 int
 231 print_unicode_success (const char *buf, size_t buflen, void *callback_arg)
 232 {
 233   FILE *stream = (FILE *) callback_arg;
 234
 235   return fwrite (buf, 1, buflen, stream) == 0 ? -1 : 0;
 236 }
 237
 238 /* Simple failure callback that prints an ASCII representation, using
 239    the same notation as C99 strings.  */
 240 int
 241 print_unicode_failure (unsigned int code, void *callback_arg)
 242 {
 243   int e = errno;
 244   FILE *stream = callback_arg;
 245
 246   fprintf (stream, code < 0x10000 ? "\\u%04X" : "\\U%08X", code);
 247   errno = e;
 248   return -1;
 249 }
 250
 251 /* Outputs the Unicode character CODE to the output stream STREAM.
 252    Returns zero if successful, -1 (setting errno) otherwise.
 253    Assumes that the locale doesn't change between two calls.  */
 254 int
 255 print_unicode_char (FILE *stream, unsigned int code)
 256 {
 257   return unicode_to_mb (code, print_unicode_success, print_unicode_failure,
 258                         stream);
 259 }