add gitignore
[chaz/homebank] / src / hb-encoding.c
1 /* HomeBank -- Free, easy, personal accounting for everyone.
2 * Copyright (C) 1995-2014 Maxime DOYEN
3 *
4 * This file is part of HomeBank.
5 *
6 * HomeBank is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * HomeBank is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "homebank.h"
21 #include "hb-encoding.h"
22
/* Set MYDEBUG to 1 to compile in the g_print() traces wrapped by DB(). */
#define MYDEBUG 0

/*
 * DB(x) evaluates x in debug builds and expands to a no-op otherwise.
 *
 * Neither expansion carries its own semicolon: every call site already writes
 * `DB( ... );`, and a built-in semicolon would add an empty statement that
 * breaks an unbraced if/else around the call.
 */
#if MYDEBUG
#define DB(x) (x)
#else
#define DB(x) ((void)0)
#endif
30
/* our global data */
32 extern struct HomeBank *GLOBALS;
33
34 /* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = */
35
36
37
38 /*
39 * The original versions of the following tables are taken from profterm
40 *
41 * Copyright (C) 2002 Red Hat, Inc.
42 */
43
44
45
46 static const GeditEncoding utf8_encoding = {
47 GEDIT_ENCODING_UTF_8,
48 "UTF-8",
49 "Unicode"
50 };
51
52 static const GeditEncoding encodings [] = {
53
54 { GEDIT_ENCODING_ISO_8859_1,
55 "ISO-8859-1", "Western" },
56 { GEDIT_ENCODING_ISO_8859_2,
57 "ISO-8859-2", "Central European" },
58 { GEDIT_ENCODING_ISO_8859_3,
59 "ISO-8859-3", "South European" },
60 { GEDIT_ENCODING_ISO_8859_4,
61 "ISO-8859-4", "Baltic" },
62 { GEDIT_ENCODING_ISO_8859_5,
63 "ISO-8859-5", "Cyrillic" },
64 { GEDIT_ENCODING_ISO_8859_6,
65 "ISO-8859-6", "Arabic" },
66 { GEDIT_ENCODING_ISO_8859_7,
67 "ISO-8859-7", "Greek" },
68 { GEDIT_ENCODING_ISO_8859_8,
69 "ISO-8859-8", "Hebrew Visual" },
70 { GEDIT_ENCODING_ISO_8859_8_I,
71 "ISO-8859-8-I", "Hebrew" },
72 { GEDIT_ENCODING_ISO_8859_9,
73 "ISO-8859-9", "Turkish" },
74 { GEDIT_ENCODING_ISO_8859_10,
75 "ISO-8859-10", "Nordic" },
76 { GEDIT_ENCODING_ISO_8859_13,
77 "ISO-8859-13", "Baltic" },
78 { GEDIT_ENCODING_ISO_8859_14,
79 "ISO-8859-14", "Celtic" },
80 { GEDIT_ENCODING_ISO_8859_15,
81 "ISO-8859-15", "Western" },
82 { GEDIT_ENCODING_ISO_8859_16,
83 "ISO-8859-16", "Romanian" },
84
85 { GEDIT_ENCODING_UTF_7,
86 "UTF-7", "Unicode" },
87 { GEDIT_ENCODING_UTF_16,
88 "UTF-16", "Unicode" },
89 { GEDIT_ENCODING_UTF_16_BE,
90 "UTF-16BE", "Unicode" },
91 { GEDIT_ENCODING_UTF_16_LE,
92 "UTF-16LE", "Unicode" },
93 { GEDIT_ENCODING_UTF_32,
94 "UTF-32", "Unicode" },
95 { GEDIT_ENCODING_UCS_2,
96 "UCS-2", "Unicode" },
97 { GEDIT_ENCODING_UCS_4,
98 "UCS-4", "Unicode" },
99
100 { GEDIT_ENCODING_ARMSCII_8,
101 "ARMSCII-8", "Armenian" },
102 { GEDIT_ENCODING_BIG5,
103 "BIG5", "Chinese Traditional" },
104 { GEDIT_ENCODING_BIG5_HKSCS,
105 "BIG5-HKSCS", "Chinese Traditional" },
106 { GEDIT_ENCODING_CP_866,
107 "CP866", "Cyrillic/Russian" },
108
109 { GEDIT_ENCODING_EUC_JP,
110 "EUC-JP", "Japanese" },
111 { GEDIT_ENCODING_EUC_JP_MS,
112 "EUC-JP-MS", "Japanese" },
113 { GEDIT_ENCODING_CP932,
114 "CP932", "Japanese" },
115
116 { GEDIT_ENCODING_EUC_KR,
117 "EUC-KR", "Korean" },
118 { GEDIT_ENCODING_EUC_TW,
119 "EUC-TW", "Chinese Traditional" },
120
121 { GEDIT_ENCODING_GB18030,
122 "GB18030", "Chinese Simplified" },
123 { GEDIT_ENCODING_GB2312,
124 "GB2312", "Chinese Simplified" },
125 { GEDIT_ENCODING_GBK,
126 "GBK", "Chinese Simplified" },
127 { GEDIT_ENCODING_GEOSTD8,
128 "GEORGIAN-ACADEMY", "Georgian" }, /* FIXME GEOSTD8 ? */
129 { GEDIT_ENCODING_HZ,
130 "HZ", "Chinese Simplified" },
131
132 { GEDIT_ENCODING_IBM_850,
133 "IBM850", "Western" },
134 { GEDIT_ENCODING_IBM_852,
135 "IBM852", "Central European" },
136 { GEDIT_ENCODING_IBM_855,
137 "IBM855", "Cyrillic" },
138 { GEDIT_ENCODING_IBM_857,
139 "IBM857", "Turkish" },
140 { GEDIT_ENCODING_IBM_862,
141 "IBM862", "Hebrew" },
142 { GEDIT_ENCODING_IBM_864,
143 "IBM864", "Arabic" },
144
145 { GEDIT_ENCODING_ISO_2022_JP,
146 "ISO-2022-JP", "Japanese" },
147 { GEDIT_ENCODING_ISO_2022_KR,
148 "ISO-2022-KR", "Korean" },
149 { GEDIT_ENCODING_ISO_IR_111,
150 "ISO-IR-111", "Cyrillic" },
151 { GEDIT_ENCODING_JOHAB,
152 "JOHAB", "Korean" },
153 { GEDIT_ENCODING_KOI8_R,
154 "KOI8R", "Cyrillic" },
155 { GEDIT_ENCODING_KOI8__R,
156 "KOI8-R", "Cyrillic" },
157 { GEDIT_ENCODING_KOI8_U,
158 "KOI8U", "Cyrillic/Ukrainian" },
159
160 { GEDIT_ENCODING_SHIFT_JIS,
161 "SHIFT_JIS", "Japanese" },
162 { GEDIT_ENCODING_TCVN,
163 "TCVN", "Vietnamese" },
164 { GEDIT_ENCODING_TIS_620,
165 "TIS-620", "Thai" },
166 { GEDIT_ENCODING_UHC,
167 "UHC", "Korean" },
168 { GEDIT_ENCODING_VISCII,
169 "VISCII", "Vietnamese" },
170
171 { GEDIT_ENCODING_WINDOWS_1250,
172 "WINDOWS-1250", "Central European" },
173 { GEDIT_ENCODING_WINDOWS_1251,
174 "WINDOWS-1251", "Cyrillic" },
175 { GEDIT_ENCODING_WINDOWS_1252,
176 "WINDOWS-1252", "Western" },
177 { GEDIT_ENCODING_WINDOWS_1253,
178 "WINDOWS-1253", "Greek" },
179 { GEDIT_ENCODING_WINDOWS_1254,
180 "WINDOWS-1254", "Turkish" },
181 { GEDIT_ENCODING_WINDOWS_1255,
182 "WINDOWS-1255", "Hebrew" },
183 { GEDIT_ENCODING_WINDOWS_1256,
184 "WINDOWS-1256", "Arabic" },
185 { GEDIT_ENCODING_WINDOWS_1257,
186 "WINDOWS-1257", "Baltic" },
187 { GEDIT_ENCODING_WINDOWS_1258,
188 "WINDOWS-1258", "Vietnamese" }
189 };
190
191 const GeditEncoding *
192 gedit_encoding_get_from_index (gint index)
193 {
194 //g_return_val_if_fail (index >= 0, NULL);
195
196 if (index >= GEDIT_ENCODING_LAST)
197 return NULL;
198
199 //gedit_encoding_lazy_init ();
200
201 return &encodings [index];
202 }
203
204 const GeditEncoding *
205 gedit_encoding_get_utf8 (void)
206 {
207 //gedit_encoding_lazy_init ();
208
209 return &utf8_encoding;
210 }
211
212
213 static gchar *homebank_utf8_convert(gchar *buffer, const gchar **charset)
214 {
215 GError *conv_error;
216 gchar* conv_buffer = NULL;
217 gsize new_len;
218 guint i;
219 gboolean valid;
220 const struct _GeditEncoding *enc;
221
222 DB( g_print("(homebank) homebank_utf8_convert\n") );
223
224 for (i=0 ; i<GEDIT_ENCODING_LAST ; i++)
225 {
226 conv_error = NULL;
227
228 enc = gedit_encoding_get_from_index(i);
229 DB( g_print("-> should try %s\n", enc->charset) );
230
231 conv_buffer = g_convert(buffer, -1, "UTF-8", enc->charset, NULL, &new_len, &conv_error);
232 valid = g_utf8_validate (conv_buffer, -1, NULL);
233 if ((conv_error != NULL) || !valid )
234 {
235 DB( g_print (" -> Couldn't convert from %s to UTF-8.\n", enc->charset) );
236 }
237 else
238 {
239 DB( g_print (" -> file compatible with '%s'\n", enc->charset) );
240 if(charset != NULL)
241 *charset = enc->charset;
242 return conv_buffer;
243 }
244 }
245
246 if(charset != NULL)
247 *charset = NULL;
248 return NULL;
249 }
250
251
252 /*
253 * Ensure a buffer to be utf-8, and convert if necessary
254 *
255 */
256 gchar *homebank_utf8_ensure(gchar *buffer)
257 {
258 gboolean isvalid;
259 gchar *converted;
260
261 DB( g_print("(homebank) homebank_utf8_ensure\n") );
262
263 if(buffer == NULL)
264 return NULL;
265
266 isvalid = g_utf8_validate(buffer, -1, NULL);
267 DB( g_print(" -> is valid utf8: %d\n", isvalid) );
268
269 if(!isvalid)
270 {
271 converted = homebank_utf8_convert(buffer, NULL);
272 if(converted != NULL)
273 {
274 //g_warn here ?
275 g_free(buffer);
276 return converted;
277 }
278 //g_warn here ?
279 }
280 return buffer;
281 }
282
283
284 const gchar *homebank_file_getencoding(gchar *filename)
285 {
286 const gchar *charset = NULL;
287 gchar *buffer;
288 gsize length;
289 GError *error = NULL;
290 gboolean isutf8;
291 const gchar *locale_charset;
292 const struct _GeditEncoding *enc;
293
294 DB( g_print("(homebank) test encoding\n") );
295
296 if (g_get_charset (&locale_charset) == FALSE)
297 {
298 //unknown_encoding.charset = g_strdup (locale_charset);
299
300 }
301
302 DB( g_print(" -> locale charset is '%s'\n", locale_charset) );
303
304 if (g_file_get_contents (filename, &buffer, &length, &error))
305 {
306
307 isutf8 = g_utf8_validate(buffer, -1, NULL);
308 DB( g_print(" -> is valid utf8: %d\n", isutf8) );
309
310 if( isutf8 == FALSE )
311 {
312 gchar *converted;
313
314 converted = homebank_utf8_convert(buffer, &charset);
315
316 DB( g_print(" -> converted charset match: '%s'\n", charset) );
317 DB( g_print(" -> converted: '%p' %s\n", converted, converted) );
318
319 if(converted != NULL)
320 g_free(converted);
321 }
322 else
323 {
324 enc = gedit_encoding_get_utf8();
325 charset = enc->charset;
326 }
327
328
329 g_free(buffer);
330 }
331
332 DB( g_print (" -> charset is '%s'\n", charset) );
333
334 return charset;
335 }
336
337
This page took 0.055721 seconds and 4 git commands to generate.