Dogcows Code - chaz/tar/blob - src/utf8.c

   1 /* Charset handling for GNU tar.
   2
   3    Copyright (C) 2004 Free Software Foundation, Inc.
   4
   5    This program is free software; you can redistribute it and/or modify it
   6    under the terms of the GNU General Public License as published by the
   7    Free Software Foundation; either version 2, or (at your option) any later
   8    version.
   9
  10    This program is distributed in the hope that it will be useful, but
  11    WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
  13    Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License along
  16    with this program; if not, write to the Free Software Foundation, Inc.,
  17    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  18
  19 #include <system.h>
  20 #include <quotearg.h>
  21 #include "common.h"
  22 #ifdef HAVE_ICONV_H
  23 # include <iconv.h>
  24 #endif
  25
  26 #ifndef ICONV_CONST
  27 # define ICONV_CONST
  28 #endif
  29
  30 #ifdef HAVE_LIBICONV
  31
  32 struct langtab
  33 {
  34   char const *lang;        /* Language code */
  35   char const *terr;        /* Territory code */
  36   char const *charset;     /* Corresponding charset */
  37 };
  38
  39 /* The list of language codes defined in ISO 639 with the corresponding
  40    default character sets.
  41
  42    NOTES:
  43
  44    1) The list must be ordered by:
  45         a) lang field in ascending order
  46         b) terr field in descending order.
  47       NULL fields are considered less than non-null ones.
  48    2) Many entries have NULL charset fields. Please help fill them!
  49    3) The "default" character set for a given language is a matter
  50       of preference. Possibly the table should contain a *list* of
  51       possible character sets.
  52    4) LC_ALL "modifier" field is not taken into account */
  53
  54 static struct langtab langtab[] = {
  55   { "C",     NULL, "ASCII"},
  56   { "POSIX", NULL, "ASCII" },
  57   { "aa",    NULL, NULL},              /* Afar */
  58   { "ab",    NULL, NULL},              /* Abkhazian */
  59   { "ae",    NULL, NULL},              /* Avestan */
  60   { "af",    NULL, "iso-8859-1"},      /* Afrikaans */
  61   { "am",    NULL, "UTF-8"},           /* Amharic */
  62   { "ar",    NULL, "iso-8859-6"},      /* Arabic */
  63   { "as",    NULL, NULL},              /* Assamese */
  64   { "ay",    NULL, "iso-8859-1"},      /* Aymara */
  65   { "az",    NULL, NULL},              /* Azerbaijani */
  66   { "ba",    NULL, NULL},              /* Bashkir */
  67   { "be",    NULL, "UTF-8"},           /* Byelorussian; Belarusian */
  68   { "bg",    NULL, "iso-8859-5"},      /* Bulgarian */
  69   { "bh",    NULL, NULL},              /* Bihari */
  70   { "bi",    NULL, NULL},              /* Bislama */
  71   { "bn",    NULL, NULL},              /* Bengali; Bangla */
  72   { "bo",    NULL, NULL},              /* Tibetan */
  73   { "br",    NULL, "iso-8859-1"},      /* Breton: 1,5,8,9 */
  74   { "bs",    NULL, NULL},              /* Bosnian */
  75   { "ca",    NULL, "iso-8859-1"},      /* Catalan: 1,5,8,9 */
  76   { "ce",    NULL, NULL},              /* Chechen */
  77   { "ch",    NULL, NULL},              /* Chamorro */
  78   { "co",    NULL, "iso-8859-1"},      /* Corsican */
  79   { "cs",    NULL, "iso-8859-2"},      /* Czech */
  80   { "cu",    NULL, NULL },             /* Church Slavic */
  81   { "cv",    NULL, NULL},              /* Chuvash */
  82   { "cy",    NULL, "iso-8859-1"},      /* Welsh */
  83   { "da",    NULL, "iso-8859-1"},      /* Danish: 4-9 */
  84   { "de",    NULL, "iso-8859-1"},      /* German */
  85   { "dz",    NULL, NULL },             /* Dzongkha; Bhutani */
  86   { "el",    NULL, "iso-8859-7"},      /* Greek */
  87   { "en",    NULL, "iso-8859-1"},      /* English */
  88   { "eo",    NULL, "iso-8859-3"},      /* Esperanto */
  89   { "es",    NULL, "iso-8859-1"},      /* Spanish */
  90   { "et",    NULL, "iso-8859-15"},     /* Estonian: 6,7,9 */
  91   { "eu",    NULL, "iso-8859-1"},      /* Basque: 5,8,9 */
  92   { "fa",    NULL, "UTF-8"},           /* Persian */
  93   { "fi",    NULL, "iso-8859-15"},     /* Finnish */
  94   { "fj",    NULL, NULL },             /* Fijian; Fiji */
  95   { "fo",    NULL, "iso-8859-1"},      /* Faroese: 6,9 */
  96   { "fr",    NULL, "iso-8859-1"},      /* French */
  97   { "fy",    NULL, "iso-8859-1"},      /* Frisian */
  98   { "ga",    NULL, "iso-8859-14"},     /* Irish */
  99   { "gd",    NULL, "iso-8859-14" },    /* Scots; Gaelic */
 100   { "gl",    NULL, NULL },             /* Gallegan; Galician */
 101   { "gn",    NULL, NULL},              /* Guarani */
 102   { "gu",    NULL, NULL},              /* Gujarati */
 103   { "gv",    NULL, "iso-8859-14"},     /* Manx */
 104   { "ha",    NULL, NULL },             /* Hausa (?) */
 105   { "he",    NULL, "iso-8859-8" },     /* Hebrew */
 106   { "hi",    NULL, NULL},              /* Hindi */
 107   { "ho",    NULL, NULL},              /* Hiri Motu */
 108   { "hr",    NULL, "iso-8859-2"},      /* Croatian: 10 */
 109   { "hu",    NULL, "iso-8859-2"},      /* Hungarian */
 110   { "hy",    NULL, NULL},              /* Armenian */
 111   { "hz",    NULL, NULL},              /* Herero */
 112   { "id",    NULL, "iso-8859-1"},      /* Indonesian (formerly in) */
 113   { "ia",    NULL, NULL},              /* Interlingua */
 114   { "ie",    NULL, NULL},              /* Interlingue */
 115   { "ik",    NULL, NULL},              /* Inupiak */
 116   { "io",    NULL, NULL},              /* Ido */
 117   { "is",    NULL, "iso-8859-1"},      /* Icelandic */
 118   { "it",    NULL, "iso-8859-1"},      /* Italian */
 119   { "iu",    NULL, NULL},              /* Inuktitut */
 120   { "ja",    NULL, "EUC-JP"},          /* Japanese */
 121   { "jv",    NULL, NULL},              /* Javanese */
 122   { "ka",    NULL, NULL},              /* Georgian */
 123   { "ki",    NULL, NULL},              /* Kikuyu */
 124   { "kj",    NULL, NULL},              /* Kuanyama */
 125   { "kk",    NULL, NULL},              /* Kazakh */
 126   { "kl",    NULL, "iso-8859-1"},      /* Kalaallisut; Greenlandic */
 127   { "km",    NULL, NULL},              /* Khmer; Cambodian */
 128   { "kn",    NULL, NULL},              /* Kannada */
 129   { "ko",    NULL, "EUC-KR"},          /* Korean */
 130   { "ks",    NULL, NULL},              /* Kashmiri */
 131   { "ku",    NULL, NULL},              /* Kurdish */
 132   { "kv",    NULL, NULL},              /* Komi */
 133   { "kw",    NULL, "iso-8859-14"},     /* Cornish: 1,5,8 */
 134   { "ky",    NULL, NULL},              /* Kirghiz */
 135   { "la",    NULL, "iso-8859-1"},      /* Latin */
 136   { "lb",    NULL, "iso-8859-1"},      /* Letzeburgesch */
 137   { "ln",    NULL, NULL},              /* Lingala */
 138   { "lo",    NULL, NULL},              /* Lao; Laotian */
 139   { "lt",    NULL, "iso-8859-4"},      /* Lithuanian */
 140   { "lv",    NULL, "iso-8859-4"},      /* Latvian; Lettish */
 141   { "mg",    NULL, NULL},              /* Malagasy */
 142   { "mh",    NULL, NULL},              /* Marshall */
 143   { "mi",    NULL, NULL},              /* Maori */
 144   { "mk",    NULL, NULL},              /* Macedonian */
 145   { "ml",    NULL, NULL},              /* Malayalam */
 146   { "mn",    NULL, NULL},              /* Mongolian */
 147   { "mo",    NULL, "iso-8859-2"},      /* Moldavian */
 148   { "mr",    NULL, NULL},              /* Marathi */
 149   { "ms",    NULL, NULL},              /* Malay */
 150   { "mt",    NULL, "iso-8859-3"},      /* Maltese */
 151   { "my",    NULL, NULL},              /* Burmese */
 152   { "na",    NULL, NULL},              /* Nauru */
 153   { "nb",    NULL, "iso-8859-1"},      /* Norwegian Bokmål; Bokm@aa{}l  */
 154   { "nd",    NULL, NULL},              /* Ndebele, North */
 155   { "ne",    NULL, NULL},              /* Nepali */
 156   { "ng",    NULL, NULL},              /* Ndonga */
 157   { "nl",    NULL, "iso-8859-1"},      /* Dutch: 5,9 */
 158   { "nn",    NULL, "iso-8859-1"},      /* Norwegian Nynorsk */
 159   { "no",    NULL, "iso-8859-1"},      /* Norwegian */
 160   { "nr",    NULL, NULL},              /* Ndebele, South */
 161   { "nv",    NULL, NULL},              /* Navajo */
 162   { "ny",    NULL, NULL},              /* Chichewa; Nyanja */
 163   { "oc",    NULL, NULL},              /* Occitan; Provençal; Proven@,{c}al */
 164   { "om",    NULL, NULL},              /* (Afan) Oromo */
 165   { "or",    NULL, NULL},              /* Oriya */
 166   { "os",    NULL, NULL},              /* Ossetian; Ossetic */
 167   { "pa",    NULL, NULL},              /* Panjabi; Punjabi */
 168   { "pi",    NULL, NULL},              /* Pali */
 169   { "pl",    NULL, "iso-8859-2"},      /* Polish */
 170   { "ps",    NULL, NULL},              /* Pashto, Pushto */
 171   { "pt",    NULL, "iso-8859-1"},      /* Portuguese */
 172   { "qu",    NULL, "iso-8859-1"},      /* Quechua */
 173   { "rm",    NULL, "iso-8859-1"},      /* Rhaeto-Romance */
 174   { "rn",    NULL, NULL },             /* Rundi; Kirundi */
 175   { "ro",    NULL, "iso-8859-2"},      /* Romanian */
 176   { "ru",    NULL, "koi8-r"},          /* Russian */
 177   { "rw",    NULL, NULL},              /* Kinyarwanda */
 178   { "sa",    NULL, NULL},              /* Sanskrit */
 179   { "sc",    NULL, "iso-8859-1"},      /* Sardinian */
 180   { "sd",    NULL, NULL},              /* Sindhi */
 181   { "se",    NULL, "iso-8859-10"},     /* Northern Sami */
 182   { "sg",    NULL, NULL},              /* Sango; Sangro */
 183   { "si",    NULL, NULL},              /* Sinhalese */
 184   { "sk",    NULL, "iso-8859-2"},      /* Slovak */
 185   { "sl",    NULL, "iso-8859-1"},      /* Slovenian */
 186   { "sm",    NULL, NULL},              /* Samoan */
 187   { "sn",    NULL, NULL},              /* Shona */
 188   { "so",    NULL, NULL},              /* Somali */
 189   { "sq",    NULL, "iso-8859-1"},      /* Albanian: 2,5,8,9,10 */
 190   { "sr",    NULL, "iso-8859-2"},      /* Serbian */
 191   { "ss",    NULL, NULL},              /* Swati; Siswati */
 192   { "st",    NULL, NULL},              /* Sesotho; Sotho, Southern */
 193   { "su",    NULL, NULL},              /* Sundanese */
 194   { "sv",    NULL, "iso-8859-1"},      /* Swedish */
 195   { "sw",    NULL, NULL},              /* Swahili */
 196   { "ta",    NULL, NULL},              /* Tamil */
 197   { "te",    NULL, NULL},              /* Telugu */
 198   { "tg",    NULL, NULL},              /* Tajik */
 199   { "th",    NULL, "iso-8859-11"},     /* Thai */
 200   { "ti",    NULL, NULL},              /* Tigrinya */
 201   { "tk",    NULL, NULL},              /* Turkmen */
 202   { "tl",    NULL, "iso-8859-1"},      /* Tagalog */
 203   { "tn",    NULL, NULL},              /* Tswana; Setswana */
 204   { "to",    NULL, NULL},              /* Tonga (?) */
 205   { "tr",    NULL, "iso-8859-9"},      /* Turkish */
 206   { "ts",    NULL, NULL},              /* Tsonga */
 207   { "tt",    NULL, NULL},              /* Tatar */
 208   { "tw",    NULL, NULL},              /* Twi */
 209   { "ty",    NULL, NULL},              /* Tahitian */
 210   { "ug",    NULL, NULL},              /* Uighur */
 211   { "uk",    NULL, "koi8-u"},          /* Ukrainian */
 212   { "ur",    NULL, NULL},              /* Urdu */
 213   { "uz",    NULL, NULL},              /* Uzbek */
 214   { "vi",    NULL, NULL},              /* Vietnamese */
 215   { "vo",    NULL, NULL},              /* Volapük; Volap@"{u}k; Volapuk */
 216   { "wa",    NULL, "iso-8859-1"},      /* Walloon */
 217   { "wo",    NULL, NULL},              /* Wolof */
 218   { "xh",    NULL, NULL},              /* Xhosa */
 219   { "yi",    NULL, "iso-8859-8"},      /* Yiddish (formerly ji) */
 220   { "yo",    NULL, NULL},              /* Yoruba */
 221   { "za",    NULL, NULL},              /* Zhuang */
 222   { "zh",    "TW", "big5"},            /* Chinese */
 223   { "zh",    NULL, "gb2312"},          /* Chinese */
 224   { "zu",    NULL, NULL},              /* Zulu */
 225   { NULL,    NULL, NULL}
 226 };
 227
 228 /* Given the language and (optionally) territory code, return the
 229    default character set for that language. See notes above. */
 230
 231 static char const *
 232 charset_lookup (char const *lang, char const *terr)
 233 {
 234   struct langtab const *p;
 235
 236   if (!lang)
 237     return NULL;
 238   for (p = langtab; p->lang; p++)
 239     if (strcasecmp (p->lang, lang) == 0
 240         && (terr == NULL
 241             || p->terr == NULL
 242             || !strcasecmp (p->terr, terr) == 0))
 243       return p->charset;
 244   return NULL;
 245 }
 246
 247 static const char *
 248 get_input_charset (void)
 249 {
 250   const char *charset = NULL;
 251   char *tmp;
 252
 253   /* Try to deduce the charset from LC_ALL or LANG variables */
 254
 255   tmp = getenv ("LC_ALL");
 256   if (!tmp)
 257     tmp = getenv ("LANG");
 258
 259   if (tmp)
 260     {
 261       char *lang;
 262       char *terr;
 263
 264       lang = strtok (tmp, "_");
 265       terr = strtok (NULL, ".");
 266       charset = strtok (NULL, "@");
 267
 268       if (!charset)
 269         charset = charset_lookup (lang, terr);
 270     }
 271
 272   if (!charset)
 273     charset = "iso-8859-1";
 274   return charset;
 275 }
 276
 277 #else /* !defined HAVE_LIBICONV */
 278
 279 # undef iconv_open
 280 # define iconv_open(tocode, fromcode) ((iconv_t) -1)
 281
 282 # undef iconv
 283 # define iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft) ((size_t) 0)
 284
 285 # undef iconv_close
 286 # define iconv_close(cd) 0
 287
 288 #endif /* !defined HAVE_LIBICONV */
 289
 290
 291 \f
 292
 293 static iconv_t conv_desc[2] = { (iconv_t) -1, (iconv_t) -1 };
 294
 295 static iconv_t
 296 utf8_init (bool to_utf)
 297 {
 298   if (conv_desc[(int) to_utf] == (iconv_t) -1)
 299     {
 300       if (to_utf)
 301         conv_desc[(int) to_utf] = iconv_open ("UTF-8", get_input_charset ());
 302       else
 303         conv_desc[(int) to_utf] = iconv_open (get_input_charset (), "UTF-8");
 304     }
 305   return conv_desc[(int) to_utf];
 306 }
 307
 308 bool
 309 utf8_convert (bool to_utf, char const *input, char **output)
 310 {
 311   char ICONV_CONST *ib;
 312   char *ob;
 313   size_t inlen;
 314   size_t outlen;
 315   size_t rc;
 316   iconv_t cd = utf8_init (to_utf);
 317
 318   if (cd == 0)
 319     {
 320       *output = xstrdup (input);
 321       return true;
 322     }
 323   else if (cd == (iconv_t)-1)
 324     return false;
 325
 326   inlen = strlen (input) + 1;
 327   outlen = inlen * MB_LEN_MAX + 1;
 328   ob = *output = xmalloc (outlen);
 329   ib = (char ICONV_CONST *) input;
 330   rc = iconv (cd, &ib, &inlen, &ob, &outlen);
 331   *ob = 0;
 332   return rc != -1;
 333 }
 334 \f
 335
 336 bool
 337 string_ascii_p (const char *str)
 338 {
 339   const unsigned char *p = (const unsigned char *)str;
 340   for (; *p; p++)
 341     if (*p > 127)
 342       return false;
 343   return true;
 344 }