]> Dogcows Code - chaz/tar/blob - src/utf8.c
Merge recent gnulib changes, and remove some lint.
[chaz/tar] / src / utf8.c
1 /* Charset handling for GNU tar.
2
3 Copyright (C) 2004 Free Software Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify it
6 under the terms of the GNU General Public License as published by the
7 Free Software Foundation; either version 2, or (at your option) any later
8 version.
9
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
13 Public License for more details.
14
15 You should have received a copy of the GNU General Public License along
16 with this program; if not, write to the Free Software Foundation, Inc.,
17 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18
19 #include "system.h"
20 #include <quotearg.h>
21 #include "common.h"
22 #ifdef HAVE_ICONV_H
23 # include <iconv.h>
24 #endif
25
26 #ifdef HAVE_LIBICONV
27
/* One row of the language -> default character set table.  */
struct langtab
{
  char const *lang;		/* Language code */
  char const *terr;		/* Territory code, or NULL for "any territory" */
  char const *charset;		/* Corresponding charset, or NULL if unknown */
};

/* The list of language codes defined in ISO 639 with the corresponding
   default character sets.  The list is terminated by an entry whose
   lang field is NULL.

   NOTES:

   1) The list must be ordered by:
        a) lang field in ascending order
        b) terr field in descending order.
      NULL fields are considered less than non-null ones, so the
      territory-independent entry of a language comes last among the
      entries for that language.
   2) Many entries have NULL charset fields.  Please help fill them!
   3) The "default" character set for a given language is a matter
      of preference.  Possibly the table should contain a *list* of
      possible character sets.
   4) LC_ALL "modifier" field is not taken into account.  */

static struct langtab const langtab[] = {
  { "C", NULL, "ASCII" },
  { "POSIX", NULL, "ASCII" },
  { "aa", NULL, NULL },		/* Afar */
  { "ab", NULL, NULL },		/* Abkhazian */
  { "ae", NULL, NULL },		/* Avestan */
  { "af", NULL, "iso-8859-1" },	/* Afrikaans */
  { "am", NULL, "UTF-8" },	/* Amharic */
  { "ar", NULL, "iso-8859-6" },	/* Arabic */
  { "as", NULL, NULL },		/* Assamese */
  { "ay", NULL, "iso-8859-1" },	/* Aymara */
  { "az", NULL, NULL },		/* Azerbaijani */
  { "ba", NULL, NULL },		/* Bashkir */
  { "be", NULL, "UTF-8" },	/* Byelorussian; Belarusian */
  { "bg", NULL, "iso-8859-5" },	/* Bulgarian */
  { "bh", NULL, NULL },		/* Bihari */
  { "bi", NULL, NULL },		/* Bislama */
  { "bn", NULL, NULL },		/* Bengali; Bangla */
  { "bo", NULL, NULL },		/* Tibetan */
  { "br", NULL, "iso-8859-1" },	/* Breton: 1,5,8,9 */
  { "bs", NULL, NULL },		/* Bosnian */
  { "ca", NULL, "iso-8859-1" },	/* Catalan: 1,5,8,9 */
  { "ce", NULL, NULL },		/* Chechen */
  { "ch", NULL, NULL },		/* Chamorro */
  { "co", NULL, "iso-8859-1" },	/* Corsican */
  { "cs", NULL, "iso-8859-2" },	/* Czech */
  { "cu", NULL, NULL },		/* Church Slavic */
  { "cv", NULL, NULL },		/* Chuvash */
  { "cy", NULL, "iso-8859-1" },	/* Welsh */
  { "da", NULL, "iso-8859-1" },	/* Danish: 4-9 */
  { "de", NULL, "iso-8859-1" },	/* German */
  { "dz", NULL, NULL },		/* Dzongkha; Bhutani */
  { "el", NULL, "iso-8859-7" },	/* Greek */
  { "en", NULL, "iso-8859-1" },	/* English */
  { "eo", NULL, "iso-8859-3" },	/* Esperanto */
  { "es", NULL, "iso-8859-1" },	/* Spanish */
  { "et", NULL, "iso-8859-15" },	/* Estonian: 6,7,9 */
  { "eu", NULL, "iso-8859-1" },	/* Basque: 5,8,9 */
  { "fa", NULL, "UTF-8" },	/* Persian */
  { "fi", NULL, "iso-8859-15" },	/* Finnish */
  { "fj", NULL, NULL },		/* Fijian; Fiji */
  { "fo", NULL, "iso-8859-1" },	/* Faroese: 6,9 */
  { "fr", NULL, "iso-8859-1" },	/* French */
  { "fy", NULL, "iso-8859-1" },	/* Frisian */
  { "ga", NULL, "iso-8859-14" },	/* Irish */
  { "gd", NULL, "iso-8859-14" },	/* Scots; Gaelic */
  { "gl", NULL, NULL },		/* Gallegan; Galician */
  { "gn", NULL, NULL },		/* Guarani */
  { "gu", NULL, NULL },		/* Gujarati */
  { "gv", NULL, "iso-8859-14" },	/* Manx */
  { "ha", NULL, NULL },		/* Hausa (?) */
  { "he", NULL, "iso-8859-8" },	/* Hebrew */
  { "hi", NULL, NULL },		/* Hindi */
  { "ho", NULL, NULL },		/* Hiri Motu */
  { "hr", NULL, "iso-8859-2" },	/* Croatian: 10 */
  { "hu", NULL, "iso-8859-2" },	/* Hungarian */
  { "hy", NULL, NULL },		/* Armenian */
  { "hz", NULL, NULL },		/* Herero */
  { "ia", NULL, NULL },		/* Interlingua */
  { "id", NULL, "iso-8859-1" },	/* Indonesian (formerly in) */
  { "ie", NULL, NULL },		/* Interlingue */
  { "ik", NULL, NULL },		/* Inupiak */
  { "io", NULL, NULL },		/* Ido */
  { "is", NULL, "iso-8859-1" },	/* Icelandic */
  { "it", NULL, "iso-8859-1" },	/* Italian */
  { "iu", NULL, NULL },		/* Inuktitut */
  { "ja", NULL, "EUC-JP" },	/* Japanese */
  { "jv", NULL, NULL },		/* Javanese */
  { "ka", NULL, NULL },		/* Georgian */
  { "ki", NULL, NULL },		/* Kikuyu */
  { "kj", NULL, NULL },		/* Kuanyama */
  { "kk", NULL, NULL },		/* Kazakh */
  { "kl", NULL, "iso-8859-1" },	/* Kalaallisut; Greenlandic */
  { "km", NULL, NULL },		/* Khmer; Cambodian */
  { "kn", NULL, NULL },		/* Kannada */
  { "ko", NULL, "EUC-KR" },	/* Korean */
  { "ks", NULL, NULL },		/* Kashmiri */
  { "ku", NULL, NULL },		/* Kurdish */
  { "kv", NULL, NULL },		/* Komi */
  { "kw", NULL, "iso-8859-14" },	/* Cornish: 1,5,8 */
  { "ky", NULL, NULL },		/* Kirghiz */
  { "la", NULL, "iso-8859-1" },	/* Latin */
  { "lb", NULL, "iso-8859-1" },	/* Letzeburgesch */
  { "ln", NULL, NULL },		/* Lingala */
  { "lo", NULL, NULL },		/* Lao; Laotian */
  { "lt", NULL, "iso-8859-4" },	/* Lithuanian */
  { "lv", NULL, "iso-8859-4" },	/* Latvian; Lettish */
  { "mg", NULL, NULL },		/* Malagasy */
  { "mh", NULL, NULL },		/* Marshall */
  { "mi", NULL, NULL },		/* Maori */
  { "mk", NULL, NULL },		/* Macedonian */
  { "ml", NULL, NULL },		/* Malayalam */
  { "mn", NULL, NULL },		/* Mongolian */
  { "mo", NULL, "iso-8859-2" },	/* Moldavian */
  { "mr", NULL, NULL },		/* Marathi */
  { "ms", NULL, NULL },		/* Malay */
  { "mt", NULL, "iso-8859-3" },	/* Maltese */
  { "my", NULL, NULL },		/* Burmese */
  { "na", NULL, NULL },		/* Nauru */
  { "nb", NULL, "iso-8859-1" },	/* Norwegian Bokmål */
  { "nd", NULL, NULL },		/* Ndebele, North */
  { "ne", NULL, NULL },		/* Nepali */
  { "ng", NULL, NULL },		/* Ndonga */
  { "nl", NULL, "iso-8859-1" },	/* Dutch: 5,9 */
  { "nn", NULL, "iso-8859-1" },	/* Norwegian Nynorsk */
  { "no", NULL, "iso-8859-1" },	/* Norwegian */
  { "nr", NULL, NULL },		/* Ndebele, South */
  { "nv", NULL, NULL },		/* Navajo */
  { "ny", NULL, NULL },		/* Chichewa; Nyanja */
  { "oc", NULL, NULL },		/* Occitan; Provençal */
  { "om", NULL, NULL },		/* (Afan) Oromo */
  { "or", NULL, NULL },		/* Oriya */
  { "os", NULL, NULL },		/* Ossetian; Ossetic */
  { "pa", NULL, NULL },		/* Panjabi; Punjabi */
  { "pi", NULL, NULL },		/* Pali */
  { "pl", NULL, "iso-8859-2" },	/* Polish */
  { "ps", NULL, NULL },		/* Pashto, Pushto */
  { "pt", NULL, "iso-8859-1" },	/* Portuguese */
  { "qu", NULL, "iso-8859-1" },	/* Quechua */
  { "rm", NULL, "iso-8859-1" },	/* Rhaeto-Romance */
  { "rn", NULL, NULL },		/* Rundi; Kirundi */
  { "ro", NULL, "iso-8859-2" },	/* Romanian */
  { "ru", NULL, "koi8-r" },	/* Russian */
  { "rw", NULL, NULL },		/* Kinyarwanda */
  { "sa", NULL, NULL },		/* Sanskrit */
  { "sc", NULL, "iso-8859-1" },	/* Sardinian */
  { "sd", NULL, NULL },		/* Sindhi */
  { "se", NULL, "iso-8859-10" },	/* Northern Sami */
  { "sg", NULL, NULL },		/* Sango; Sangro */
  { "si", NULL, NULL },		/* Sinhalese */
  { "sk", NULL, "iso-8859-2" },	/* Slovak */
  { "sl", NULL, "iso-8859-1" },	/* Slovenian */
  { "sm", NULL, NULL },		/* Samoan */
  { "sn", NULL, NULL },		/* Shona */
  { "so", NULL, NULL },		/* Somali */
  { "sq", NULL, "iso-8859-1" },	/* Albanian: 2,5,8,9,10 */
  { "sr", NULL, "iso-8859-2" },	/* Serbian */
  { "ss", NULL, NULL },		/* Swati; Siswati */
  { "st", NULL, NULL },		/* Sesotho; Sotho, Southern */
  { "su", NULL, NULL },		/* Sundanese */
  { "sv", NULL, "iso-8859-1" },	/* Swedish */
  { "sw", NULL, NULL },		/* Swahili */
  { "ta", NULL, NULL },		/* Tamil */
  { "te", NULL, NULL },		/* Telugu */
  { "tg", NULL, NULL },		/* Tajik */
  { "th", NULL, "iso-8859-11" },	/* Thai */
  { "ti", NULL, NULL },		/* Tigrinya */
  { "tk", NULL, NULL },		/* Turkmen */
  { "tl", NULL, "iso-8859-1" },	/* Tagalog */
  { "tn", NULL, NULL },		/* Tswana; Setswana */
  { "to", NULL, NULL },		/* Tonga (?) */
  { "tr", NULL, "iso-8859-9" },	/* Turkish */
  { "ts", NULL, NULL },		/* Tsonga */
  { "tt", NULL, NULL },		/* Tatar */
  { "tw", NULL, NULL },		/* Twi */
  { "ty", NULL, NULL },		/* Tahitian */
  { "ug", NULL, NULL },		/* Uighur */
  { "uk", NULL, "koi8-u" },	/* Ukrainian */
  { "ur", NULL, NULL },		/* Urdu */
  { "uz", NULL, NULL },		/* Uzbek */
  { "vi", NULL, NULL },		/* Vietnamese */
  { "vo", NULL, NULL },		/* Volapük; Volapuk */
  { "wa", NULL, "iso-8859-1" },	/* Walloon */
  { "wo", NULL, NULL },		/* Wolof */
  { "xh", NULL, NULL },		/* Xhosa */
  { "yi", NULL, "iso-8859-8" },	/* Yiddish (formerly ji) */
  { "yo", NULL, NULL },		/* Yoruba */
  { "za", NULL, NULL },		/* Zhuang */
  { "zh", "TW", "big5" },	/* Chinese */
  { "zh", NULL, "gb2312" },	/* Chinese */
  { "zu", NULL, NULL },		/* Zulu */
  { NULL, NULL, NULL }
};
223
224 /* Given the language and (optionally) territory code, return the
225 default character set for that language. See notes above. */
226
227 static char const *
228 charset_lookup (char const *lang, char const *terr)
229 {
230 struct langtab const *p;
231
232 if (!lang)
233 return NULL;
234 for (p = langtab; p->lang; p++)
235 if (strcasecmp (p->lang, lang) == 0
236 && (terr == NULL
237 || p->terr == NULL
238 || !strcasecmp (p->terr, terr) == 0))
239 return p->charset;
240 return NULL;
241 }
242
/* Return the name of the character set used by the current locale, as
   deduced from the LC_ALL or, failing that, LANG environment variable.

   The value is expected to look like

       language[_territory][.codeset][@modifier]

   If a codeset is given it is returned as is; otherwise the default
   charset for the language/territory pair is looked up with
   charset_lookup.  The modifier is ignored.  If nothing can be
   deduced, "iso-8859-1" is returned.

   The environment string itself is never modified: it is parsed in a
   private copy.  The result is computed once and cached, so every call
   returns the same pointer, which the caller must not free.  */

static const char *
get_input_charset (void)
{
  static const char *cached;	/* answer computed by the first call */
  const char *charset = NULL;
  const char *locale;

  if (cached)
    return cached;

  /* An empty LC_ALL counts as unset.  */
  locale = getenv ("LC_ALL");
  if (!locale || !*locale)
    locale = getenv ("LANG");

  if (locale && *locale)
    {
      size_t size = strlen (locale) + 1;
      char *copy = malloc (size);

      if (copy)
	{
	  char *terr = NULL;
	  char *codeset = NULL;
	  char *sep;

	  memcpy (copy, locale, size);

	  /* Cut the string from the right, so that each separator is
	     only honoured in the part where it is meaningful.  */
	  sep = strchr (copy, '@');
	  if (sep)
	    *sep = '\0';

	  sep = strchr (copy, '.');
	  if (sep)
	    {
	      *sep = '\0';
	      if (sep[1])
		codeset = sep + 1;
	    }

	  sep = strchr (copy, '_');
	  if (sep)
	    {
	      *sep = '\0';
	      if (sep[1])
		terr = sep + 1;
	    }

	  if (codeset)
	    /* COPY stays allocated: the cached result points into it.  */
	    charset = codeset;
	  else
	    {
	      /* charset_lookup returns a pointer into the static table,
		 so the copy is no longer needed.  */
	      charset = charset_lookup (copy, terr);
	      free (copy);
	    }
	}
    }

  if (!charset)
    charset = "iso-8859-1";

  cached = charset;
  return charset;
}
272
273 #else /* !defined HAVE_LIBICONV */
274
/* Fallback used when HAVE_LIBICONV is not defined: replace the iconv
   entry points with stubs that ignore their arguments.  */

/* Always yields the (iconv_t) -1 value, so no descriptor is ever
   obtained; utf8_convert returns false when it sees that value.  */
# undef iconv_open
# define iconv_open(tocode, fromcode) ((iconv_t) -1)

/* Converts nothing and evaluates to 0.  */
# undef iconv
# define iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft) ((size_t) 0)

/* Nothing to release; evaluates to 0.  */
# undef iconv_close
# define iconv_close(cd) 0
283
284 #endif /* !defined HAVE_LIBICONV */
285
286
287 \f
288
/* Conversion descriptors, indexed by direction: slot 0 converts from
   UTF-8 to the input charset, slot 1 from the input charset to UTF-8.
   (iconv_t) -1 marks a slot that holds no open descriptor.  */
static iconv_t conv_desc[2] = { (iconv_t) -1, (iconv_t) -1 };

/* Return the conversion descriptor for the direction selected by
   TO_UTF, opening it first if its slot still holds (iconv_t) -1.
   The value stored by iconv_open is returned unchanged, so a slot that
   could not be opened is tried again on the next call.  */
static iconv_t
utf8_init (bool to_utf)
{
  iconv_t *slot = &conv_desc[(int) to_utf];

  if (*slot != (iconv_t) -1)
    return *slot;

  *slot = (to_utf
	   ? iconv_open ("UTF-8", get_input_charset ())
	   : iconv_open (get_input_charset (), "UTF-8"));
  return *slot;
}
303
304 bool
305 utf8_convert (bool to_utf, char const *input, char **output)
306 {
307 char ICONV_CONST *ib;
308 char *ob;
309 size_t inlen;
310 size_t outlen;
311 size_t rc;
312 iconv_t cd = utf8_init (to_utf);
313
314 if (cd == 0)
315 {
316 *output = xstrdup (input);
317 return true;
318 }
319 else if (cd == (iconv_t)-1)
320 return false;
321
322 inlen = strlen (input) + 1;
323 outlen = inlen * MB_LEN_MAX + 1;
324 ob = *output = xmalloc (outlen);
325 ib = (char ICONV_CONST *) input;
326 rc = iconv (cd, &ib, &inlen, &ob, &outlen);
327 *ob = 0;
328 return rc != -1;
329 }
330 \f
331
/* Return true if every byte of the NUL-terminated string STR has a
   value of 127 or less, false as soon as a larger byte is found.
   The empty string yields true.  */
bool
string_ascii_p (const char *str)
{
  const unsigned char *cursor = (const unsigned char *) str;

  while (*cursor != '\0')
    {
      if (*cursor >= 128)
	return false;
      cursor++;
    }

  return true;
}
This page took 0.049271 seconds and 5 git commands to generate.