Dogcows Code - chaz/tar/blob - lib/unicodeio.c

   1 /* Unicode character output to streams with locale dependent encoding.
   2
   3    Copyright (C) 2000, 2001 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation; either version 2, or (at your option)
   8    any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program; if not, write to the Free Software Foundation,
  17    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  18
  19 /* Written by Bruno Haible <haible@clisp.cons.org>.  */
  20
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #if HAVE_STDDEF_H
  26 # include <stddef.h>
  27 #endif
  28
  29 #include <stdio.h>
  30 #if HAVE_STRING_H
  31 # include <string.h>
  32 #else
  33 # include <strings.h>
  34 #endif
  35
  36 #include <errno.h>
  37 #ifndef errno
  38 extern int errno;
  39 #endif
  40 #ifndef EILSEQ
  41 # define EILSEQ EINVAL
  42 #endif
  43 #ifndef ENOTSUP
  44 # define ENOTSUP EINVAL
  45 #endif
  46
  47 #if HAVE_ICONV
  48 # include <iconv.h>
  49 #endif
  50
  51 #if HAVE_LANGINFO_CODESET && ! USE_INCLUDED_LIBINTL
  52 # include <langinfo.h>
  53 #endif
  54
  55 #include "unicodeio.h"
  56
  57 /* When we pass a Unicode character to iconv(), we must pass it in a
  58    suitable encoding. The standardized Unicode encodings are
  59    UTF-8, UCS-2, UCS-4, UTF-16, UTF-16BE, UTF-16LE, UTF-7.
  60    UCS-2 supports only characters up to \U0000FFFF.
  61    UTF-16 and variants support only characters up to \U0010FFFF.
  62    UTF-7 is way too complex and not supported by glibc-2.1.
  63    UCS-4 specification leaves doubts about endianness and byte order
  64    mark. glibc currently interprets it as big endian without byte order
  65    mark, but this is not backed by an RFC.
  66    So we use UTF-8. It supports characters up to \U7FFFFFFF and is
  67    unambiguously defined.  */
  68
  69 /* Stores the UTF-8 representation of the Unicode character wc in r[0..5].
  70    Returns the number of bytes stored, or -1 if wc is out of range.  */
  71 static int
  72 utf8_wctomb (unsigned char *r, unsigned int wc)
  73 {
  74   int count;
  75
  76   if (wc < 0x80)
  77     count = 1;
  78   else if (wc < 0x800)
  79     count = 2;
  80   else if (wc < 0x10000)
  81     count = 3;
  82   else if (wc < 0x200000)
  83     count = 4;
  84   else if (wc < 0x4000000)
  85     count = 5;
  86   else if (wc <= 0x7fffffff)
  87     count = 6;
  88   else
  89     return -1;
  90
  91   switch (count)
  92     {
  93       /* Note: code falls through cases! */
  94       case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
  95       case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
  96       case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
  97       case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
  98       case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
  99       case 1: r[0] = wc;
 100     }
 101
 102   return count;
 103 }
 104
 105 /* Luckily, the encoding's name is platform independent.  */
 106 #define UTF8_NAME "UTF-8"
 107
 108 /* Converts the Unicode character CODE to its multibyte representation
 109    in the current locale and calls SUCCESS on the resulting byte
 110    sequence.  If an error occurs, invoke FAILURE instead,
 111    passing it CODE with errno set appropriately.
 112    Assumes that the locale doesn't change between two calls.
 113    Return whatever the SUCCESS or FAILURE returns.  */
 114 int
 115 unicode_to_mb (unsigned int code,
 116                int (*success) PARAMS((const char *buf, size_t buflen,
 117                                       void *callback_arg)),
 118                int (*failure) PARAMS((unsigned int code,
 119                                       void *callback_arg)),
 120                void *callback_arg)
 121 {
 122   static int initialized;
 123   static int is_utf8;
 124 #if HAVE_ICONV
 125   static iconv_t utf8_to_local;
 126 #endif
 127
 128   char inbuf[6];
 129   int count;
 130
 131   if (!initialized)
 132     {
 133       const char *charset;
 134
 135 #if USE_INCLUDED_LIBINTL
 136       extern const char *locale_charset PARAMS ((void));
 137       charset = locale_charset ();
 138 #else
 139 # if HAVE_LANGINFO_CODESET
 140       charset = nl_langinfo (CODESET);
 141 # else
 142       charset = "";
 143 # endif
 144 #endif
 145
 146       is_utf8 = !strcmp (charset, UTF8_NAME);
 147 #if HAVE_ICONV
 148       if (!is_utf8)
 149         {
 150           utf8_to_local = iconv_open (charset, UTF8_NAME);
 151           if (utf8_to_local == (iconv_t)(-1))
 152             {
 153               /* For an unknown encoding, assume ASCII.  */
 154               utf8_to_local = iconv_open ("ASCII", UTF8_NAME);
 155               if (utf8_to_local == (iconv_t)(-1))
 156                 return failure (code, callback_arg);
 157             }
 158         }
 159 #endif
 160       initialized = 1;
 161     }
 162
 163   /* Convert the character to UTF-8.  */
 164   count = utf8_wctomb ((unsigned char *) inbuf, code);
 165   if (count < 0)
 166     {
 167       errno = EILSEQ;
 168       return failure (code, callback_arg);
 169     }
 170
 171   if (is_utf8)
 172     {
 173       return success (inbuf, count, callback_arg);
 174     }
 175   else
 176     {
 177 #if HAVE_ICONV
 178       char outbuf[25];
 179       const char *inptr;
 180       size_t inbytesleft;
 181       char *outptr;
 182       size_t outbytesleft;
 183       size_t res;
 184
 185       inptr = inbuf;
 186       inbytesleft = count;
 187       outptr = outbuf;
 188       outbytesleft = sizeof (outbuf);
 189
 190       /* Convert the character from UTF-8 to the locale's charset.  */
 191       res = iconv (utf8_to_local,
 192                    (ICONV_CONST char **)&inptr, &inbytesleft,
 193                    &outptr, &outbytesleft);
 194       if (inbytesleft > 0 || res == (size_t)(-1)
 195           /* Irix iconv() inserts a NUL byte if it cannot convert. */
 196 # if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
 197           || (res > 0 && code != 0 && outptr - outbuf == 1 && *outbuf == '\0')
 198 # endif
 199          )
 200         {
 201           if (res != (size_t)(-1))
 202             errno = EILSEQ;
 203           return failure (code, callback_arg);
 204         }
 205
 206       /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
 207 # if defined _LIBICONV_VERSION \
 208     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
 209
 210       /* Get back to the initial shift state.  */
 211       res = iconv (utf8_to_local, NULL, NULL, &outptr, &outbytesleft);
 212       if (res == (size_t)(-1))
 213         return failure (code, callback_arg);
 214 # endif
 215
 216       return success (outbuf, outptr - outbuf, callback_arg);
 217 #else
 218       errno = ENOTSUP;
 219       return failure (code, callback_arg);
 220 #endif
 221     }
 222 }
 223
 224 /* Simple success callback that outputs the converted string.
 225    The STREAM is passed as callback_arg.  */
 226 int
 227 print_unicode_success (const char *buf, size_t buflen, void *callback_arg)
 228 {
 229   FILE *stream = (FILE *) callback_arg;
 230
 231   return fwrite (buf, 1, buflen, stream) == 0 ? -1 : 0;
 232 }
 233
 234 /* Simple failure callback that prints an ASCII representation, using
 235    the same notation as C99 strings.  */
 236 int
 237 print_unicode_failure (unsigned int code, void *callback_arg)
 238 {
 239   int e = errno;
 240   FILE *stream = callback_arg;
 241
 242   fprintf (stream, code < 0x10000 ? "\\u%04X" : "\\U%08X", code);
 243   errno = e;
 244   return -1;
 245 }
 246
 247 /* Outputs the Unicode character CODE to the output stream STREAM.
 248    Returns zero if successful, -1 (setting errno) otherwise.
 249    Assumes that the locale doesn't change between two calls.  */
 250 int
 251 print_unicode_char (FILE *stream, unsigned int code)
 252 {
 253   return unicode_to_mb (code, print_unicode_success, print_unicode_failure,
 254                         stream);
 255 }