Dogcows Code - chaz/tar/blob - lib/unicodeio.c

   1 /* Unicode character output to streams with locale dependent encoding.
   2
   3    Copyright (C) 2000, 2001 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software Foundation,
  17    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  18
  19 /* Written by Bruno Haible <haible@clisp.cons.org>.  */
  20
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #if HAVE_STDDEF_H
  26 # include <stddef.h>
  27 #endif
  28
  29 #include <stdio.h>
  30 #if HAVE_STRING_H
  31 # include <string.h>
  32 #else
  33 # include <strings.h>
  34 #endif
  35
  36 #include <errno.h>
  37 #ifndef errno
  38 extern int errno;
  39 #endif
  40 #ifndef EILSEQ
  41 # define EILSEQ EINVAL
  42 #endif
  43 #ifndef ENOTSUP
  44 # define ENOTSUP EINVAL
  45 #endif
  46
  47 #if HAVE_ICONV
  48 # include <iconv.h>
  49 #endif
  50
  51 #include "unicodeio.h"
  52
  53 /* When we pass a Unicode character to iconv(), we must pass it in a
  54    suitable encoding. The standardized Unicode encodings are
  55    UTF-8, UCS-2, UCS-4, UTF-16, UTF-16BE, UTF-16LE, UTF-7.
  56    UCS-2 supports only characters up to \U0000FFFF.
  57    UTF-16 and variants support only characters up to \U0010FFFF.
  58    UTF-7 is way too complex and not supported by glibc-2.1.
  59    UCS-4 specification leaves doubts about endianness and byte order
  60    mark. glibc currently interprets it as big endian without byte order
  61    mark, but this is not backed by an RFC.
  62    So we use UTF-8. It supports characters up to \U7FFFFFFF and is
  63    unambiguously defined.  */
  64
  65 /* Stores the UTF-8 representation of the Unicode character wc in r[0..5].
  66    Returns the number of bytes stored, or -1 if wc is out of range.  */
  67 static int
  68 utf8_wctomb (unsigned char *r, unsigned int wc)
  69 {
  70   int count;
  71
  72   if (wc < 0x80)
  73     count = 1;
  74   else if (wc < 0x800)
  75     count = 2;
  76   else if (wc < 0x10000)
  77     count = 3;
  78   else if (wc < 0x200000)
  79     count = 4;
  80   else if (wc < 0x4000000)
  81     count = 5;
  82   else if (wc <= 0x7fffffff)
  83     count = 6;
  84   else
  85     return -1;
  86
  87   switch (count)
  88     {
  89       /* Note: code falls through cases! */
  90       case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
  91       case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
  92       case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
  93       case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
  94       case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
  95       case 1: r[0] = wc;
  96     }
  97
  98   return count;
  99 }
 100
 101 /* Luckily, the encoding's name is platform independent.  */
 102 #define UTF8_NAME "UTF-8"
 103
 104 /* Converts the Unicode character CODE to its multibyte representation
 105    in the current locale and calls SUCCESS on the resulting byte
 106    sequence.  If an error occurs, invoke FAILURE instead,
 107    passing it CODE with errno set appropriately.
 108    Assumes that the locale doesn't change between two calls.
 109    Return whatever the SUCCESS or FAILURE returns.  */
 110 int
 111 unicode_to_mb (unsigned int code,
 112                int (*success) PARAMS((const char *buf, size_t buflen,
 113                                       void *callback_arg)),
 114                int (*failure) PARAMS((unsigned int code,
 115                                       void *callback_arg)),
 116                void *callback_arg)
 117 {
 118   static int initialized;
 119   static int is_utf8;
 120 #if HAVE_ICONV
 121   static iconv_t utf8_to_local;
 122 #endif
 123
 124   char inbuf[6];
 125   int count;
 126
 127   if (!initialized)
 128     {
 129       extern const char *locale_charset PARAMS ((void));
 130       const char *charset = locale_charset ();
 131
 132       is_utf8 = !strcmp (charset, UTF8_NAME);
 133 #if HAVE_ICONV
 134       if (!is_utf8)
 135         {
 136           utf8_to_local = iconv_open (charset, UTF8_NAME);
 137           if (utf8_to_local == (iconv_t)(-1))
 138             {
 139               /* For an unknown encoding, assume ASCII.  */
 140               utf8_to_local = iconv_open ("ASCII", UTF8_NAME);
 141               if (utf8_to_local == (iconv_t)(-1))
 142                 return failure (code, callback_arg);
 143             }
 144         }
 145 #endif
 146       initialized = 1;
 147     }
 148
 149   /* Convert the character to UTF-8.  */
 150   count = utf8_wctomb ((unsigned char *) inbuf, code);
 151   if (count < 0)
 152     {
 153       errno = EILSEQ;
 154       return failure (code, callback_arg);
 155     }
 156
 157   if (is_utf8)
 158     {
 159       return success (inbuf, count, callback_arg);
 160     }
 161   else
 162     {
 163 #if HAVE_ICONV
 164       char outbuf[25];
 165       const char *inptr;
 166       size_t inbytesleft;
 167       char *outptr;
 168       size_t outbytesleft;
 169       size_t res;
 170
 171       inptr = inbuf;
 172       inbytesleft = count;
 173       outptr = outbuf;
 174       outbytesleft = sizeof (outbuf);
 175
 176       /* Convert the character from UTF-8 to the locale's charset.  */
 177       res = iconv (utf8_to_local,
 178                    (ICONV_CONST char **)&inptr, &inbytesleft,
 179                    &outptr, &outbytesleft);
 180       if (inbytesleft > 0 || res == (size_t)(-1)
 181           /* Irix iconv() inserts a NUL byte if it cannot convert. */
 182 # if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
 183           || (res > 0 && code != 0 && outptr - outbuf == 1 && *outbuf == '\0')
 184 # endif
 185          )
 186         {
 187           if (res != (size_t)(-1))
 188             errno = EILSEQ;
 189           return failure (code, callback_arg);
 190         }
 191
 192       /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 193 # if defined _LIBICONV_VERSION \
 194     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
 195
 196       /* Get back to the initial shift state.  */
 197       res = iconv (utf8_to_local, NULL, NULL, &outptr, &outbytesleft);
 198       if (res == (size_t)(-1))
 199         return failure (code, callback_arg);
 200 # endif
 201
 202       return success (outbuf, outptr - outbuf, callback_arg);
 203 #else
 204       errno = ENOTSUP;
 205       return failure (code, callback_arg);
 206 #endif
 207     }
 208 }
 209
 210 /* Simple success callback that outputs the converted string.
 211    The STREAM is passed as callback_arg.  */
 212 int
 213 print_unicode_success (const char *buf, size_t buflen, void *callback_arg)
 214 {
 215   FILE *stream = (FILE *) callback_arg;
 216
 217   return fwrite (buf, 1, buflen, stream) == 0 ? -1 : 0;
 218 }
 219
 220 /* Simple failure callback that prints an ASCII representation, using
 221    the same notation as C99 strings.  */
 222 int
 223 print_unicode_failure (unsigned int code, void *callback_arg)
 224 {
 225   int e = errno;
 226   FILE *stream = callback_arg;
 227
 228   fprintf (stream, code < 0x10000 ? "\\u%04X" : "\\U%08X", code);
 229   errno = e;
 230   return -1;
 231 }
 232
 233 /* Outputs the Unicode character CODE to the output stream STREAM.
 234    Returns zero if successful, -1 (setting errno) otherwise.
 235    Assumes that the locale doesn't change between two calls.  */
 236 int
 237 print_unicode_char (FILE *stream, unsigned int code)
 238 {
 239   return unicode_to_mb (code, print_unicode_success, print_unicode_failure,
 240                         stream);
 241 }