Parent Directory
|
Revision Log
committing r3113 initial commit again...
1 | ///////////////////////////////////////////////////////////////////////////// |
2 | // Name: src/common/strconv.cpp |
3 | // Purpose: Unicode conversion classes |
4 | // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik, |
5 | // Ryan Norton, Fredrik Roubert (UTF7) |
6 | // Modified by: |
7 | // Created: 29/01/98 |
8 | // RCS-ID: $Id: strconv.cpp 56394 2008-10-17 11:31:22Z VZ $ |
9 | // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik |
10 | // (c) 2000-2003 Vadim Zeitlin |
11 | // (c) 2004 Ryan Norton, Fredrik Roubert |
12 | // Licence: wxWindows licence |
13 | ///////////////////////////////////////////////////////////////////////////// |
14 | |
15 | // For compilers that support precompilation, includes "wx.h". |
16 | #include "wx/wxprec.h" |
17 | |
18 | #ifndef WX_PRECOMP |
19 | #ifdef __WXMSW__ |
20 | #include "wx/msw/missing.h" |
21 | #endif |
22 | #include "wx/intl.h" |
23 | #include "wx/log.h" |
24 | #include "wx/utils.h" |
25 | #include "wx/hashmap.h" |
26 | #endif |
27 | |
28 | #include "wx/strconv.h" |
29 | |
30 | #if wxUSE_WCHAR_T |
31 | |
32 | #ifdef __WINDOWS__ |
33 | #include "wx/msw/private.h" |
34 | #endif |
35 | |
36 | #ifndef __WXWINCE__ |
37 | #include <errno.h> |
38 | #endif |
39 | |
40 | #include <ctype.h> |
41 | #include <string.h> |
42 | #include <stdlib.h> |
43 | |
44 | #if defined(__WIN32__) && !defined(__WXMICROWIN__) |
45 | #define wxHAVE_WIN32_MB2WC |
46 | #endif |
47 | |
48 | #ifdef __SALFORDC__ |
49 | #include <clib.h> |
50 | #endif |
51 | |
52 | #ifdef HAVE_ICONV |
53 | #include <iconv.h> |
54 | #include "wx/thread.h" |
55 | #endif |
56 | |
57 | #include "wx/encconv.h" |
58 | #include "wx/fontmap.h" |
59 | |
60 | #ifdef __WXMAC__ |
61 | #ifndef __DARWIN__ |
62 | #include <ATSUnicode.h> |
63 | #include <TextCommon.h> |
64 | #include <TextEncodingConverter.h> |
65 | #endif |
66 | |
67 | // includes Mac headers |
68 | #include "wx/mac/private.h" |
69 | #include "wx/thread.h" |
70 | |
71 | #endif |
72 | |
73 | |
74 | #define TRACE_STRCONV _T("strconv") |
75 | |
76 | // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to |
77 | // be 4 bytes |
78 | #if SIZEOF_WCHAR_T == 2 |
79 | #define WC_UTF16 |
80 | #endif |
81 | |
82 | |
83 | // ============================================================================ |
84 | // implementation |
85 | // ============================================================================ |
86 | |
// helper function of cMB2WC() and ToWChar(): returns true if at least one of
// the n bytes starting at p is not NUL, i.e. returns false only if all of them
// are NUL (or if n is 0)
static bool NotAllNULs(const char *p, size_t n)
{
    for ( size_t i = 0; i < n; i++ )
    {
        if ( p[i] != '\0' )
            return true;
    }

    return false;
}
95 | |
96 | // ---------------------------------------------------------------------------- |
97 | // UTF-16 en/decoding to/from UCS-4 with surrogates handling |
98 | // ---------------------------------------------------------------------------- |
99 | |
100 | static size_t encode_utf16(wxUint32 input, wxUint16 *output) |
101 | { |
102 | if (input <= 0xffff) |
103 | { |
104 | if (output) |
105 | *output = (wxUint16) input; |
106 | |
107 | return 1; |
108 | } |
109 | else if (input >= 0x110000) |
110 | { |
111 | return wxCONV_FAILED; |
112 | } |
113 | else |
114 | { |
115 | if (output) |
116 | { |
117 | *output++ = (wxUint16) ((input >> 10) + 0xd7c0); |
118 | *output = (wxUint16) ((input & 0x3ff) + 0xdc00); |
119 | } |
120 | |
121 | return 2; |
122 | } |
123 | } |
124 | |
125 | static size_t decode_utf16(const wxUint16* input, wxUint32& output) |
126 | { |
127 | if ((*input < 0xd800) || (*input > 0xdfff)) |
128 | { |
129 | output = *input; |
130 | return 1; |
131 | } |
132 | else if ((input[1] < 0xdc00) || (input[1] > 0xdfff)) |
133 | { |
134 | output = *input; |
135 | return wxCONV_FAILED; |
136 | } |
137 | else |
138 | { |
139 | output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00); |
140 | return 2; |
141 | } |
142 | } |
143 | |
144 | #ifdef WC_UTF16 |
145 | typedef wchar_t wxDecodeSurrogate_t; |
146 | #else // !WC_UTF16 |
147 | typedef wxUint16 wxDecodeSurrogate_t; |
148 | #endif // WC_UTF16/!WC_UTF16 |
149 | |
150 | // returns the next UTF-32 character from the wchar_t buffer and advances the |
151 | // pointer to the character after this one |
152 | // |
153 | // if an invalid character is found, *pSrc is set to NULL, the caller must |
154 | // check for this |
155 | static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc) |
156 | { |
157 | wxUint32 out; |
158 | const size_t |
159 | n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out); |
160 | if ( n == wxCONV_FAILED ) |
161 | *pSrc = NULL; |
162 | else |
163 | *pSrc += n; |
164 | |
165 | return out; |
166 | } |
167 | |
168 | // ---------------------------------------------------------------------------- |
169 | // wxMBConv |
170 | // ---------------------------------------------------------------------------- |
171 | |
// Default implementation of ToWChar() written in terms of MB2WC().
//
// Converts srcLen bytes of src (or the NUL-terminated string src if srcLen is
// wxNO_LEN) and stores the result in dst, which has room for dstLen wide
// characters. If dst is NULL nothing is stored and only the length is
// computed.
//
// Returns the number of wide characters written (or which would be written),
// counting the terminating L'\0' of each converted chunk, or wxCONV_FAILED if
// the conversion failed or dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            // from now on src refers to the terminated copy
            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // the input may contain embedded NULs: convert it chunk by chunk, each
    // chunk being a NUL-terminated string
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // notice that the L'\0' of this empty chunk is counted in
            // dstWritten but nothing is stored in dst for it and dstLen is
            // not checked in this case
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
278 | |
// Default implementation of FromWChar() written in terms of WC2MB().
//
// Converts srcLen wide characters of src (or the NUL-terminated string src,
// including its terminator, if srcLen is wxNO_LEN) and stores the result in
// dst, which has room for dstLen bytes. If dst is NULL nothing is stored and
// only the length is computed.
//
// Returns the number of bytes written (or which would be written), counting
// GetMBNulLen() terminating bytes for each converted chunk, or wxCONV_FAILED
// if the conversion failed or dst is too small.
size_t
wxMBConv::FromWChar(char *dst, size_t dstLen,
                    const wchar_t *src, size_t srcLen) const
{
    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // make a copy of the input string unless it is already properly
    // NUL-terminated
    //
    // if we don't know its length we have no choice but to assume that it is,
    // indeed, properly terminated
    wxWCharBuffer bufTmp;
    if ( srcLen == wxNO_LEN )
    {
        srcLen = wxWcslen(src) + 1;
    }
    else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
    {
        // make a copy in order to properly NUL-terminate the string
        //
        // presumably wxWCharBuffer(srcLen) allocates srcLen + 1 characters and
        // sets the last one to NUL, which is why only the data is copied here
        bufTmp = wxWCharBuffer(srcLen);
        memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
        src = bufTmp;
    }

    // NOTE(review): unlike in ToWChar(), the result of GetMBNulLen() is not
    // compared with wxCONV_FAILED here
    const size_t lenNul = GetMBNulLen();

    // the input may contain embedded NULs: convert it chunk by chunk, each
    // chunk being a NUL-terminated string
    for ( const wchar_t * const srcEnd = src + srcLen;
          src < srcEnd;
          src += wxWcslen(src) + 1 /* skip L'\0' too */ )
    {
        // try to convert the current chunk
        size_t lenChunk = WC2MB(NULL, src, 0);

        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        // WC2MB() result doesn't include the terminator, account for it
        lenChunk += lenNul;
        dstWritten += lenChunk;

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }
    }

    return dstWritten;
}
332 | |
333 | size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const |
334 | { |
335 | size_t rc = ToWChar(outBuff, outLen, inBuff); |
336 | if ( rc != wxCONV_FAILED ) |
337 | { |
338 | // ToWChar() returns the buffer length, i.e. including the trailing |
339 | // NUL, while this method doesn't take it into account |
340 | rc--; |
341 | } |
342 | |
343 | return rc; |
344 | } |
345 | |
346 | size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const |
347 | { |
348 | size_t rc = FromWChar(outBuff, outLen, inBuff); |
349 | if ( rc != wxCONV_FAILED ) |
350 | { |
351 | rc -= GetMBNulLen(); |
352 | } |
353 | |
354 | return rc; |
355 | } |
356 | |
// Out of line destructor: it is intentionally empty.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
361 | |
362 | const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const |
363 | { |
364 | if ( psz ) |
365 | { |
366 | // calculate the length of the buffer needed first |
367 | const size_t nLen = MB2WC(NULL, psz, 0); |
368 | if ( nLen != wxCONV_FAILED ) |
369 | { |
370 | // now do the actual conversion |
371 | wxWCharBuffer buf(nLen /* +1 added implicitly */); |
372 | |
373 | // +1 for the trailing NULL |
374 | if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED ) |
375 | return buf; |
376 | } |
377 | } |
378 | |
379 | return wxWCharBuffer(); |
380 | } |
381 | |
382 | const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const |
383 | { |
384 | if ( pwz ) |
385 | { |
386 | const size_t nLen = WC2MB(NULL, pwz, 0); |
387 | if ( nLen != wxCONV_FAILED ) |
388 | { |
389 | // extra space for trailing NUL(s) |
390 | static const size_t extraLen = GetMaxMBNulLen(); |
391 | |
392 | wxCharBuffer buf(nLen + extraLen - 1); |
393 | if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED ) |
394 | return buf; |
395 | } |
396 | } |
397 | |
398 | return wxCharBuffer(); |
399 | } |
400 | |
401 | const wxWCharBuffer |
402 | wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const |
403 | { |
404 | const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen); |
405 | if ( dstLen != wxCONV_FAILED ) |
406 | { |
407 | wxWCharBuffer wbuf(dstLen - 1); |
408 | if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) |
409 | { |
410 | if ( outLen ) |
411 | { |
412 | *outLen = dstLen; |
413 | if ( wbuf[dstLen - 1] == L'\0' ) |
414 | (*outLen)--; |
415 | } |
416 | |
417 | return wbuf; |
418 | } |
419 | } |
420 | |
421 | if ( outLen ) |
422 | *outLen = 0; |
423 | |
424 | return wxWCharBuffer(); |
425 | } |
426 | |
427 | const wxCharBuffer |
428 | wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const |
429 | { |
430 | size_t dstLen = FromWChar(NULL, 0, inBuff, inLen); |
431 | if ( dstLen != wxCONV_FAILED ) |
432 | { |
433 | // special case of empty input: can't allocate 0 size buffer below as |
434 | // wxCharBuffer insists on NUL-terminating it |
435 | wxCharBuffer buf(dstLen ? dstLen - 1 : 1); |
436 | if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED ) |
437 | { |
438 | if ( outLen ) |
439 | { |
440 | *outLen = dstLen; |
441 | |
442 | const size_t nulLen = GetMBNulLen(); |
443 | if ( dstLen >= nulLen && |
444 | !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) ) |
445 | { |
446 | // in this case the output is NUL-terminated and we're not |
447 | // supposed to count NUL |
448 | *outLen -= nulLen; |
449 | } |
450 | } |
451 | |
452 | return buf; |
453 | } |
454 | } |
455 | |
456 | if ( outLen ) |
457 | *outLen = 0; |
458 | |
459 | return wxCharBuffer(); |
460 | } |
461 | |
462 | // ---------------------------------------------------------------------------- |
463 | // wxMBConvLibc |
464 | // ---------------------------------------------------------------------------- |
465 | |
// Multibyte to wide character conversion: simply forwards all its arguments
// to wxMB2WC() and returns its result.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
470 | |
// Wide character to multibyte conversion: simply forwards all its arguments
// to wxWC2MB() and returns its result.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
475 | |
476 | // ---------------------------------------------------------------------------- |
477 | // wxConvBrokenFileNames |
478 | // ---------------------------------------------------------------------------- |
479 | |
480 | #ifdef __UNIX__ |
481 | |
482 | wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset) |
483 | { |
484 | if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0 |
485 | || wxStricmp(charset, _T("UTF8")) == 0 ) |
486 | m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA); |
487 | else |
488 | m_conv = new wxCSConv(charset); |
489 | } |
490 | |
491 | #endif // __UNIX__ |
492 | |
493 | // ---------------------------------------------------------------------------- |
494 | // UTF-7 |
495 | // ---------------------------------------------------------------------------- |
496 | |
497 | // Implementation (C) 2004 Fredrik Roubert |
498 | |
499 | // |
500 | // BASE64 decoding table |
501 | // |
502 | static const unsigned char utf7unb64[] = |
503 | { |
504 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
505 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
506 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
507 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
508 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
509 | 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, |
510 | 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, |
511 | 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
512 | 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, |
513 | 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, |
514 | 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, |
515 | 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, |
516 | 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, |
517 | 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, |
518 | 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, |
519 | 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, |
520 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
521 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
522 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
523 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
524 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
525 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
526 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
527 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
528 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
529 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
530 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
531 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
532 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
533 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
534 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, |
535 | 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |
536 | }; |
537 | |
// Decodes the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL the decoded characters are stored in it and the
// conversion stops when n characters have been produced, otherwise the
// characters are only counted. A terminating NUL is stored only if there is
// room left for it.
//
// Returns the number of wide characters produced (not counting the NUL) or
// wxCONV_FAILED if a '+' is followed neither by '-' nor by BASE64 data.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits, l is the number of bits in d
            // not yet consumed, lsb tells whether the next byte extracted is
            // the low (true) or the high (false) half of a 16 bit unit and
            // ok is set as soon as at least one byte was extracted
            //
            // the loop stops at the first byte which is not a BASE64 digit,
            // i.e. for which the decoding table contains 0xff
            //
            // NOTE(review): unlike the outer loop, this one doesn't compare
            // len with n before storing to buf
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // each BASE64 digit contributes 6 bits
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the next complete byte
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: the character is complete now
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte: store it, the low one will be added
                        // to it later
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return wxCONV_FAILED;
            }

            // the '-' terminating the BASE64 sequence is optional and is
            // not part of the output
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
605 | |
606 | // |
607 | // BASE64 encoding table |
608 | // |
609 | static const unsigned char utf7enb64[] = |
610 | { |
611 | 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', |
612 | 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', |
613 | 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
614 | 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', |
615 | 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', |
616 | 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', |
617 | 'w', 'x', 'y', 'z', '0', '1', '2', '3', |
618 | '4', '5', '6', '7', '8', '9', '+', '/' |
619 | }; |
620 | |
621 | // |
622 | // UTF-7 encoding table |
623 | // |
624 | // 0 - Set D (directly encoded characters) |
625 | // 1 - Set O (optional direct characters) |
626 | // 2 - whitespace characters (optional) |
627 | // 3 - special characters |
628 | // |
629 | static const unsigned char utf7encode[128] = |
630 | { |
631 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, |
632 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
633 | 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3, |
634 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, |
635 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
636 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1, |
637 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
638 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3 |
639 | }; |
640 | |
641 | size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const |
642 | { |
643 | size_t len = 0; |
644 | |
645 | while (*psz && ((!buf) || (len < n))) |
646 | { |
647 | wchar_t cc = *psz++; |
648 | if (cc < 0x80 && utf7encode[cc] < 1) |
649 | { |
650 | // plain ASCII char |
651 | if (buf) |
652 | *buf++ = (char)cc; |
653 | |
654 | len++; |
655 | } |
656 | #ifndef WC_UTF16 |
657 | else if (((wxUint32)cc) > 0xffff) |
658 | { |
659 | // no surrogate pair generation (yet?) |
660 | return wxCONV_FAILED; |
661 | } |
662 | #endif |
663 | else |
664 | { |
665 | if (buf) |
666 | *buf++ = '+'; |
667 | |
668 | len++; |
669 | if (cc != '+') |
670 | { |
671 | // BASE64 encode string |
672 | unsigned int lsb, d, l; |
673 | for (d = 0, l = 0; /*nothing*/; psz++) |
674 | { |
675 | for (lsb = 0; lsb < 2; lsb ++) |
676 | { |
677 | d <<= 8; |
678 | d += lsb ? cc & 0xff : (cc & 0xff00) >> 8; |
679 | |
680 | for (l += 8; l >= 6; ) |
681 | { |
682 | l -= 6; |
683 | if (buf) |
684 | *buf++ = utf7enb64[(d >> l) % 64]; |
685 | len++; |
686 | } |
687 | } |
688 | |
689 | cc = *psz; |
690 | if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1)) |
691 | break; |
692 | } |
693 | |
694 | if (l != 0) |
695 | { |
696 | if (buf) |
697 | *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64]; |
698 | |
699 | len++; |
700 | } |
701 | } |
702 | |
703 | if (buf) |
704 | *buf++ = '-'; |
705 | len++; |
706 | } |
707 | } |
708 | |
709 | if (buf && (len < n)) |
710 | *buf = 0; |
711 | |
712 | return len; |
713 | } |
714 | |
715 | // ---------------------------------------------------------------------------- |
716 | // UTF-8 |
717 | // ---------------------------------------------------------------------------- |
718 | |
719 | static wxUint32 utf8_max[]= |
720 | { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff }; |
721 | |
722 | // boundaries of the private use area we use to (temporarily) remap invalid |
723 | // characters invalid in a UTF-8 encoded string |
724 | const wxUint32 wxUnicodePUA = 0x100000; |
725 | const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256; |
726 | |
// Decodes the NUL-terminated UTF-8 string psz.
//
// If buf is not NULL the decoded characters are stored in it and the
// conversion stops when n characters have been produced, otherwise the
// characters are only counted. A terminating NUL is stored only if there is
// room left for it.
//
// Invalid sequences are handled according to m_options: their bytes are
// either mapped to the private use area starting at wxUnicodePUA
// (MAP_INVALID_UTF8_TO_PUA), or written as backslash followed by 3 octal
// digits (MAP_INVALID_UTF8_TO_OCTAL), or make the function fail.
//
// Returns the number of wide characters produced (not counting the NUL) or
// wxCONV_FAILED.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to output its bytes
        // one by one if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the number of leading 1 bits in the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of bytes expected to follow the first one
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                // ocnt indexes utf8_max entry for the sequences one byte
                // shorter than this one
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                // a value which could be encoded using a shorter sequence
                // is not accepted
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    //
                    // NOTE(review): this may store 2 units while the loop
                    // condition only checked that len < n
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    // output each byte consumed so far as a PUA character
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    // output each byte consumed so far as "\ooo"
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is stored only if it fits entirely but
                        // it is counted in len in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
870 | |
// returns true if wch is one of the octal digits L'0'..L'7'
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
875 | |
// Encodes the NUL-terminated wide string psz in UTF-8.
//
// If buf is not NULL the result is stored in it and the conversion stops when
// n bytes have been produced, otherwise the bytes are only counted. A
// terminating NUL is stored only if there is room left for it.
//
// Depending on m_options, the mappings which MB2WC() applies to the invalid
// input are undone: the characters in the range starting at wxUnicodePUA
// (MAP_INVALID_UTF8_TO_PUA) and the octal escapes (MAP_INVALID_UTF8_TO_OCTAL)
// are converted back to single bytes.
//
// Returns the number of bytes produced (not counting the NUL).
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);

        // if decoding failed just skip one unit, cc is still set by
        // decode_utf16() in this case
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output just one of them
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte it represents
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for this value
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                // NOTE(review): cnt + 1 bytes are stored here while the loop
                // condition only checked that len < n
                len += cnt + 1;
                if (buf)
                {
                    // first byte: cnt + 1 leading 1 bits (this relies on
                    // the right shift of a negative value being arithmetic)
                    // followed by the highest bits of the character
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // continuation bytes, each containing 6 bits
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
953 | |
954 | // ============================================================================ |
955 | // UTF-16 |
956 | // ============================================================================ |
957 | |
958 | #ifdef WORDS_BIGENDIAN |
959 | #define wxMBConvUTF16straight wxMBConvUTF16BE |
960 | #define wxMBConvUTF16swap wxMBConvUTF16LE |
961 | #else |
962 | #define wxMBConvUTF16swap wxMBConvUTF16BE |
963 | #define wxMBConvUTF16straight wxMBConvUTF16LE |
964 | #endif |
965 | |
966 | /* static */ |
967 | size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen) |
968 | { |
969 | if ( srcLen == wxNO_LEN ) |
970 | { |
971 | // count the number of bytes in input, including the trailing NULs |
972 | const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); |
973 | for ( srcLen = 1; *inBuff++; srcLen++ ) |
974 | ; |
975 | |
976 | srcLen *= BYTES_PER_CHAR; |
977 | } |
978 | else // we already have the length |
979 | { |
980 | // we can only convert an entire number of UTF-16 characters |
981 | if ( srcLen % BYTES_PER_CHAR ) |
982 | return wxCONV_FAILED; |
983 | } |
984 | |
985 | return srcLen; |
986 | } |
987 | |
988 | // case when in-memory representation is UTF-16 too |
989 | #ifdef WC_UTF16 |
990 | |
991 | // ---------------------------------------------------------------------------- |
992 | // conversions without endianness change |
993 | // ---------------------------------------------------------------------------- |
994 | |
995 | size_t |
996 | wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen, |
997 | const char *src, size_t srcLen) const |
998 | { |
999 | // set up the scene for using memcpy() (which is presumably more efficient |
1000 | // than copying the bytes one by one) |
1001 | srcLen = GetLength(src, srcLen); |
1002 | if ( srcLen == wxNO_LEN ) |
1003 | return wxCONV_FAILED; |
1004 | |
1005 | const size_t inLen = srcLen / BYTES_PER_CHAR; |
1006 | if ( dst ) |
1007 | { |
1008 | if ( dstLen < inLen ) |
1009 | return wxCONV_FAILED; |
1010 | |
1011 | memcpy(dst, src, srcLen); |
1012 | } |
1013 | |
1014 | return inLen; |
1015 | } |
1016 | |
1017 | size_t |
1018 | wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen, |
1019 | const wchar_t *src, size_t srcLen) const |
1020 | { |
1021 | if ( srcLen == wxNO_LEN ) |
1022 | srcLen = wxWcslen(src) + 1; |
1023 | |
1024 | srcLen *= BYTES_PER_CHAR; |
1025 | |
1026 | if ( dst ) |
1027 | { |
1028 | if ( dstLen < srcLen ) |
1029 | return wxCONV_FAILED; |
1030 | |
1031 | memcpy(dst, src, srcLen); |
1032 | } |
1033 | |
1034 | return srcLen; |
1035 | } |
1036 | |
1037 | // ---------------------------------------------------------------------------- |
1038 | // endian-reversing conversions |
1039 | // ---------------------------------------------------------------------------- |
1040 | |
1041 | size_t |
1042 | wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, |
1043 | const char *src, size_t srcLen) const |
1044 | { |
1045 | srcLen = GetLength(src, srcLen); |
1046 | if ( srcLen == wxNO_LEN ) |
1047 | return wxCONV_FAILED; |
1048 | |
1049 | srcLen /= BYTES_PER_CHAR; |
1050 | |
1051 | if ( dst ) |
1052 | { |
1053 | if ( dstLen < srcLen ) |
1054 | return wxCONV_FAILED; |
1055 | |
1056 | const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); |
1057 | for ( size_t n = 0; n < srcLen; n++, inBuff++ ) |
1058 | { |
1059 | *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff); |
1060 | } |
1061 | } |
1062 | |
1063 | return srcLen; |
1064 | } |
1065 | |
1066 | size_t |
1067 | wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen, |
1068 | const wchar_t *src, size_t srcLen) const |
1069 | { |
1070 | if ( srcLen == wxNO_LEN ) |
1071 | srcLen = wxWcslen(src) + 1; |
1072 | |
1073 | srcLen *= BYTES_PER_CHAR; |
1074 | |
1075 | if ( dst ) |
1076 | { |
1077 | if ( dstLen < srcLen ) |
1078 | return wxCONV_FAILED; |
1079 | |
1080 | wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst); |
1081 | for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ ) |
1082 | { |
1083 | *outBuff++ = wxUINT16_SWAP_ALWAYS(*src); |
1084 | } |
1085 | } |
1086 | |
1087 | return srcLen; |
1088 | } |
1089 | |
1090 | #else // !WC_UTF16: wchar_t is UTF-32 |
1091 | |
1092 | // ---------------------------------------------------------------------------- |
1093 | // conversions without endianness change |
1094 | // ---------------------------------------------------------------------------- |
1095 | |
1096 | size_t |
1097 | wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen, |
1098 | const char *src, size_t srcLen) const |
1099 | { |
1100 | srcLen = GetLength(src, srcLen); |
1101 | if ( srcLen == wxNO_LEN ) |
1102 | return wxCONV_FAILED; |
1103 | |
1104 | const size_t inLen = srcLen / BYTES_PER_CHAR; |
1105 | if ( !dst ) |
1106 | { |
1107 | // optimization: return maximal space which could be needed for this |
1108 | // string even if the real size could be smaller if the buffer contains |
1109 | // any surrogates |
1110 | return inLen; |
1111 | } |
1112 | |
1113 | size_t outLen = 0; |
1114 | const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); |
1115 | for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) |
1116 | { |
1117 | const wxUint32 ch = wxDecodeSurrogate(&inBuff); |
1118 | if ( !inBuff ) |
1119 | return wxCONV_FAILED; |
1120 | |
1121 | if ( ++outLen > dstLen ) |
1122 | return wxCONV_FAILED; |
1123 | |
1124 | *dst++ = ch; |
1125 | } |
1126 | |
1127 | |
1128 | return outLen; |
1129 | } |
1130 | |
1131 | size_t |
1132 | wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen, |
1133 | const wchar_t *src, size_t srcLen) const |
1134 | { |
1135 | if ( srcLen == wxNO_LEN ) |
1136 | srcLen = wxWcslen(src) + 1; |
1137 | |
1138 | size_t outLen = 0; |
1139 | wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst); |
1140 | for ( size_t n = 0; n < srcLen; n++ ) |
1141 | { |
1142 | wxUint16 cc[2]; |
1143 | const size_t numChars = encode_utf16(*src++, cc); |
1144 | if ( numChars == wxCONV_FAILED ) |
1145 | return wxCONV_FAILED; |
1146 | |
1147 | outLen += numChars * BYTES_PER_CHAR; |
1148 | if ( outBuff ) |
1149 | { |
1150 | if ( outLen > dstLen ) |
1151 | return wxCONV_FAILED; |
1152 | |
1153 | *outBuff++ = cc[0]; |
1154 | if ( numChars == 2 ) |
1155 | { |
1156 | // second character of a surrogate |
1157 | *outBuff++ = cc[1]; |
1158 | } |
1159 | } |
1160 | } |
1161 | |
1162 | return outLen; |
1163 | } |
1164 | |
1165 | // ---------------------------------------------------------------------------- |
1166 | // endian-reversing conversions |
1167 | // ---------------------------------------------------------------------------- |
1168 | |
1169 | size_t |
1170 | wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen, |
1171 | const char *src, size_t srcLen) const |
1172 | { |
1173 | srcLen = GetLength(src, srcLen); |
1174 | if ( srcLen == wxNO_LEN ) |
1175 | return wxCONV_FAILED; |
1176 | |
1177 | const size_t inLen = srcLen / BYTES_PER_CHAR; |
1178 | if ( !dst ) |
1179 | { |
1180 | // optimization: return maximal space which could be needed for this |
1181 | // string even if the real size could be smaller if the buffer contains |
1182 | // any surrogates |
1183 | return inLen; |
1184 | } |
1185 | |
1186 | size_t outLen = 0; |
1187 | const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src); |
1188 | for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; ) |
1189 | { |
1190 | wxUint32 ch; |
1191 | wxUint16 tmp[2]; |
1192 | |
1193 | tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff); |
1194 | inBuff++; |
1195 | tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff); |
1196 | |
1197 | const size_t numChars = decode_utf16(tmp, ch); |
1198 | if ( numChars == wxCONV_FAILED ) |
1199 | return wxCONV_FAILED; |
1200 | |
1201 | if ( numChars == 2 ) |
1202 | inBuff++; |
1203 | |
1204 | if ( ++outLen > dstLen ) |
1205 | return wxCONV_FAILED; |
1206 | |
1207 | *dst++ = ch; |
1208 | } |
1209 | |
1210 | |
1211 | return outLen; |
1212 | } |
1213 | |
1214 | size_t |
1215 | wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen, |
1216 | const wchar_t *src, size_t srcLen) const |
1217 | { |
1218 | if ( srcLen == wxNO_LEN ) |
1219 | srcLen = wxWcslen(src) + 1; |
1220 | |
1221 | size_t outLen = 0; |
1222 | wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst); |
1223 | for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ ) |
1224 | { |
1225 | wxUint16 cc[2]; |
1226 | const size_t numChars = encode_utf16(*src, cc); |
1227 | if ( numChars == wxCONV_FAILED ) |
1228 | return wxCONV_FAILED; |
1229 | |
1230 | outLen += numChars * BYTES_PER_CHAR; |
1231 | if ( outBuff ) |
1232 | { |
1233 | if ( outLen > dstLen ) |
1234 | return wxCONV_FAILED; |
1235 | |
1236 | *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]); |
1237 | if ( numChars == 2 ) |
1238 | { |
1239 | // second character of a surrogate |
1240 | *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]); |
1241 | } |
1242 | } |
1243 | } |
1244 | |
1245 | return outLen; |
1246 | } |
1247 | |
1248 | #endif // WC_UTF16/!WC_UTF16 |
1249 | |
1250 | |
1251 | // ============================================================================ |
1252 | // UTF-32 |
1253 | // ============================================================================ |
1254 | |
// Alias the endianness-relative converter names used below to the concrete
// classes: when WORDS_BIGENDIAN is defined "straight" is wxMBConvUTF32BE and
// "swap" is wxMBConvUTF32LE, otherwise it is the other way round.
1255 | #ifdef WORDS_BIGENDIAN |
1256 | #define wxMBConvUTF32straight wxMBConvUTF32BE |
1257 | #define wxMBConvUTF32swap wxMBConvUTF32LE |
1258 | #else |
1259 | #define wxMBConvUTF32swap wxMBConvUTF32BE |
1260 | #define wxMBConvUTF32straight wxMBConvUTF32LE |
1261 | #endif |
1262 | |
1263 | |
// Definitions of the global UTF-32LE and UTF-32BE converter objects.
1264 | WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE; |
1265 | WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE; |
1266 | |
1267 | /* static */ |
1268 | size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen) |
1269 | { |
1270 | if ( srcLen == wxNO_LEN ) |
1271 | { |
1272 | // count the number of bytes in input, including the trailing NULs |
1273 | const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); |
1274 | for ( srcLen = 1; *inBuff++; srcLen++ ) |
1275 | ; |
1276 | |
1277 | srcLen *= BYTES_PER_CHAR; |
1278 | } |
1279 | else // we already have the length |
1280 | { |
1281 | // we can only convert an entire number of UTF-32 characters |
1282 | if ( srcLen % BYTES_PER_CHAR ) |
1283 | return wxCONV_FAILED; |
1284 | } |
1285 | |
1286 | return srcLen; |
1287 | } |
1288 | |
1289 | // case when in-memory representation is UTF-16 |
1290 | #ifdef WC_UTF16 |
1291 | |
1292 | // ---------------------------------------------------------------------------- |
1293 | // conversions without endianness change |
1294 | // ---------------------------------------------------------------------------- |
1295 | |
1296 | size_t |
1297 | wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen, |
1298 | const char *src, size_t srcLen) const |
1299 | { |
1300 | srcLen = GetLength(src, srcLen); |
1301 | if ( srcLen == wxNO_LEN ) |
1302 | return wxCONV_FAILED; |
1303 | |
1304 | const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); |
1305 | const size_t inLen = srcLen / BYTES_PER_CHAR; |
1306 | size_t outLen = 0; |
1307 | for ( size_t n = 0; n < inLen; n++ ) |
1308 | { |
1309 | wxUint16 cc[2]; |
1310 | const size_t numChars = encode_utf16(*inBuff++, cc); |
1311 | if ( numChars == wxCONV_FAILED ) |
1312 | return wxCONV_FAILED; |
1313 | |
1314 | outLen += numChars; |
1315 | if ( dst ) |
1316 | { |
1317 | if ( outLen > dstLen ) |
1318 | return wxCONV_FAILED; |
1319 | |
1320 | *dst++ = cc[0]; |
1321 | if ( numChars == 2 ) |
1322 | { |
1323 | // second character of a surrogate |
1324 | *dst++ = cc[1]; |
1325 | } |
1326 | } |
1327 | } |
1328 | |
1329 | return outLen; |
1330 | } |
1331 | |
1332 | size_t |
1333 | wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen, |
1334 | const wchar_t *src, size_t srcLen) const |
1335 | { |
1336 | if ( srcLen == wxNO_LEN ) |
1337 | srcLen = wxWcslen(src) + 1; |
1338 | |
1339 | if ( !dst ) |
1340 | { |
1341 | // optimization: return maximal space which could be needed for this |
1342 | // string instead of the exact amount which could be less if there are |
1343 | // any surrogates in the input |
1344 | // |
1345 | // we consider that surrogates are rare enough to make it worthwhile to |
1346 | // avoid running the loop below at the cost of slightly extra memory |
1347 | // consumption |
1348 | return srcLen * BYTES_PER_CHAR; |
1349 | } |
1350 | |
1351 | wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst); |
1352 | size_t outLen = 0; |
1353 | for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; ) |
1354 | { |
1355 | const wxUint32 ch = wxDecodeSurrogate(&src); |
1356 | if ( !src ) |
1357 | return wxCONV_FAILED; |
1358 | |
1359 | outLen += BYTES_PER_CHAR; |
1360 | |
1361 | if ( outLen > dstLen ) |
1362 | return wxCONV_FAILED; |
1363 | |
1364 | *outBuff++ = ch; |
1365 | } |
1366 | |
1367 | return outLen; |
1368 | } |
1369 | |
1370 | // ---------------------------------------------------------------------------- |
1371 | // endian-reversing conversions |
1372 | // ---------------------------------------------------------------------------- |
1373 | |
1374 | size_t |
1375 | wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen, |
1376 | const char *src, size_t srcLen) const |
1377 | { |
1378 | srcLen = GetLength(src, srcLen); |
1379 | if ( srcLen == wxNO_LEN ) |
1380 | return wxCONV_FAILED; |
1381 | |
1382 | const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); |
1383 | const size_t inLen = srcLen / BYTES_PER_CHAR; |
1384 | size_t outLen = 0; |
1385 | for ( size_t n = 0; n < inLen; n++, inBuff++ ) |
1386 | { |
1387 | wxUint16 cc[2]; |
1388 | const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc); |
1389 | if ( numChars == wxCONV_FAILED ) |
1390 | return wxCONV_FAILED; |
1391 | |
1392 | outLen += numChars; |
1393 | if ( dst ) |
1394 | { |
1395 | if ( outLen > dstLen ) |
1396 | return wxCONV_FAILED; |
1397 | |
1398 | *dst++ = cc[0]; |
1399 | if ( numChars == 2 ) |
1400 | { |
1401 | // second character of a surrogate |
1402 | *dst++ = cc[1]; |
1403 | } |
1404 | } |
1405 | } |
1406 | |
1407 | return outLen; |
1408 | } |
1409 | |
1410 | size_t |
1411 | wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen, |
1412 | const wchar_t *src, size_t srcLen) const |
1413 | { |
1414 | if ( srcLen == wxNO_LEN ) |
1415 | srcLen = wxWcslen(src) + 1; |
1416 | |
1417 | if ( !dst ) |
1418 | { |
1419 | // optimization: return maximal space which could be needed for this |
1420 | // string instead of the exact amount which could be less if there are |
1421 | // any surrogates in the input |
1422 | // |
1423 | // we consider that surrogates are rare enough to make it worthwhile to |
1424 | // avoid running the loop below at the cost of slightly extra memory |
1425 | // consumption |
1426 | return srcLen*BYTES_PER_CHAR; |
1427 | } |
1428 | |
1429 | wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst); |
1430 | size_t outLen = 0; |
1431 | for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; ) |
1432 | { |
1433 | const wxUint32 ch = wxDecodeSurrogate(&src); |
1434 | if ( !src ) |
1435 | return wxCONV_FAILED; |
1436 | |
1437 | outLen += BYTES_PER_CHAR; |
1438 | |
1439 | if ( outLen > dstLen ) |
1440 | return wxCONV_FAILED; |
1441 | |
1442 | *outBuff++ = wxUINT32_SWAP_ALWAYS(ch); |
1443 | } |
1444 | |
1445 | return outLen; |
1446 | } |
1447 | |
1448 | #else // !WC_UTF16: wchar_t is UTF-32 |
1449 | |
1450 | // ---------------------------------------------------------------------------- |
1451 | // conversions without endianness change |
1452 | // ---------------------------------------------------------------------------- |
1453 | |
1454 | size_t |
1455 | wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen, |
1456 | const char *src, size_t srcLen) const |
1457 | { |
1458 | // use memcpy() as it should be much faster than hand-written loop |
1459 | srcLen = GetLength(src, srcLen); |
1460 | if ( srcLen == wxNO_LEN ) |
1461 | return wxCONV_FAILED; |
1462 | |
1463 | const size_t inLen = srcLen/BYTES_PER_CHAR; |
1464 | if ( dst ) |
1465 | { |
1466 | if ( dstLen < inLen ) |
1467 | return wxCONV_FAILED; |
1468 | |
1469 | memcpy(dst, src, srcLen); |
1470 | } |
1471 | |
1472 | return inLen; |
1473 | } |
1474 | |
1475 | size_t |
1476 | wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen, |
1477 | const wchar_t *src, size_t srcLen) const |
1478 | { |
1479 | if ( srcLen == wxNO_LEN ) |
1480 | srcLen = wxWcslen(src) + 1; |
1481 | |
1482 | srcLen *= BYTES_PER_CHAR; |
1483 | |
1484 | if ( dst ) |
1485 | { |
1486 | if ( dstLen < srcLen ) |
1487 | return wxCONV_FAILED; |
1488 | |
1489 | memcpy(dst, src, srcLen); |
1490 | } |
1491 | |
1492 | return srcLen; |
1493 | } |
1494 | |
1495 | // ---------------------------------------------------------------------------- |
1496 | // endian-reversing conversions |
1497 | // ---------------------------------------------------------------------------- |
1498 | |
1499 | size_t |
1500 | wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen, |
1501 | const char *src, size_t srcLen) const |
1502 | { |
1503 | srcLen = GetLength(src, srcLen); |
1504 | if ( srcLen == wxNO_LEN ) |
1505 | return wxCONV_FAILED; |
1506 | |
1507 | srcLen /= BYTES_PER_CHAR; |
1508 | |
1509 | if ( dst ) |
1510 | { |
1511 | if ( dstLen < srcLen ) |
1512 | return wxCONV_FAILED; |
1513 | |
1514 | const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src); |
1515 | for ( size_t n = 0; n < srcLen; n++, inBuff++ ) |
1516 | { |
1517 | *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff); |
1518 | } |
1519 | } |
1520 | |
1521 | return srcLen; |
1522 | } |
1523 | |
1524 | size_t |
1525 | wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen, |
1526 | const wchar_t *src, size_t srcLen) const |
1527 | { |
1528 | if ( srcLen == wxNO_LEN ) |
1529 | srcLen = wxWcslen(src) + 1; |
1530 | |
1531 | srcLen *= BYTES_PER_CHAR; |
1532 | |
1533 | if ( dst ) |
1534 | { |
1535 | if ( dstLen < srcLen ) |
1536 | return wxCONV_FAILED; |
1537 | |
1538 | wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst); |
1539 | for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ ) |
1540 | { |
1541 | *outBuff++ = wxUINT32_SWAP_ALWAYS(*src); |
1542 | } |
1543 | } |
1544 | |
1545 | return srcLen; |
1546 | } |
1547 | |
1548 | #endif // WC_UTF16/!WC_UTF16 |
1549 | |
1550 | |
1551 | // ============================================================================ |
1552 | // The classes doing conversion using the iconv_xxx() functions |
1553 | // ============================================================================ |
1554 | |
1555 | #ifdef HAVE_ICONV |
1556 | |
1557 | // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with |
1558 | // E2BIG if output buffer is _exactly_ as big as needed. Such case is |
1559 | // (unless there's yet another bug in glibc) the only case when iconv() |
1560 | // returns with (size_t)-1 (which means error) and says there are 0 bytes |
1561 | // left in the input buffer -- when _real_ error occurs, |
1562 | // bytes-left-in-input buffer is non-zero. Hence, this alternative test for |
1563 | // iconv() failure. |
1564 | // [This bug does not appear in glibc 2.2.] |
// ICONV_FAILED(cres, bufLeft): test whether an iconv() call returning cres
// with bufLeft input bytes remaining has failed. For glibc 2.0 and 2.1 an
// E2BIG error with no input left is not counted as a failure; elsewhere any
// (size_t)-1 result is.
1565 | #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1 |
1566 | #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \ |
1567 | (errno != E2BIG || bufLeft != 0)) |
1568 | #else |
1569 | #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1) |
1570 | #endif |
1571 | |
// cast a pointer to the input buffer pointer to the type used for the second
// argument of iconv() in the calls below
1572 | #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x)) |
1573 | |
// value which the iconv_open() results are compared with to detect failure
1574 | #define ICONV_T_INVALID ((iconv_t)-1) |
1575 | |
// select the byte swapping macro and the wx font encoding corresponding to
// the size of wchar_t
1576 | #if SIZEOF_WCHAR_T == 4 |
1577 | #define WC_BSWAP wxUINT32_SWAP_ALWAYS |
1578 | #define WC_ENC wxFONTENCODING_UTF32 |
1579 | #elif SIZEOF_WCHAR_T == 2 |
1580 | #define WC_BSWAP wxUINT16_SWAP_ALWAYS |
1581 | #define WC_ENC wxFONTENCODING_UTF16 |
1582 | #else // sizeof(wchar_t) != 2 nor 4 |
1583 | // does this ever happen? |
1584 | #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org" |
1585 | #endif |
1586 | |
1587 | // ---------------------------------------------------------------------------- |
1588 | // wxMBConv_iconv: encapsulates an iconv character set |
1589 | // ---------------------------------------------------------------------------- |
1590 | |
1591 | class wxMBConv_iconv : public wxMBConv |
1592 | { |
1593 | public: |
1594 | wxMBConv_iconv(const wxChar *name); |
1595 | virtual ~wxMBConv_iconv(); |
1596 | |
1597 | virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const; |
1598 | virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const; |
1599 | |
1600 | // classify this encoding as explained in wxMBConv::GetMBNulLen() comment |
1601 | virtual size_t GetMBNulLen() const; |
1602 | |
1603 | virtual wxMBConv *Clone() const |
1604 | { |
1605 | wxMBConv_iconv *p = new wxMBConv_iconv(m_name); |
1606 | p->m_minMBCharWidth = m_minMBCharWidth; |
1607 | return p; |
1608 | } |
1609 | |
1610 | bool IsOk() const |
1611 | { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); } |
1612 | |
1613 | protected: |
1614 | // the iconv handlers used to translate from multibyte |
1615 | // to wide char and in the other direction |
1616 | iconv_t m2w, |
1617 | w2m; |
1618 | |
1619 | #if wxUSE_THREADS |
1620 | // guards access to m2w and w2m objects |
1621 | wxMutex m_iconvMutex; |
1622 | #endif |
1623 | |
1624 | private: |
1625 | // the name (for iconv_open()) of a wide char charset -- if none is |
1626 | // available on this machine, it will remain NULL |
1627 | static wxString ms_wcCharsetName; |
1628 | |
1629 | // true if the wide char encoding we use (i.e. ms_wcCharsetName) has |
1630 | // different endian-ness than the native one |
1631 | static bool ms_wcNeedsSwap; |
1632 | |
1633 | |
1634 | // name of the encoding handled by this conversion |
1635 | wxString m_name; |
1636 | |
1637 | // cached result of GetMBNulLen(); set to 0 meaning "unknown" |
1638 | // initially |
1639 | size_t m_minMBCharWidth; |
1640 | }; |
1641 | |
1642 | // make the constructor available for unit testing |
1643 | WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name ) |
1644 | { |
1645 | wxMBConv_iconv* result = new wxMBConv_iconv( name ); |
1646 | if ( !result->IsOk() ) |
1647 | { |
1648 | delete result; |
1649 | return 0; |
1650 | } |
1651 | |
1652 | return result; |
1653 | } |
1654 | |
// Definitions of the static members of wxMBConv_iconv: the wide char charset
// name is default-constructed and ms_wcNeedsSwap starts out as false.
1655 | wxString wxMBConv_iconv::ms_wcCharsetName; |
1656 | bool wxMBConv_iconv::ms_wcNeedsSwap = false; |
1657 | |
1658 | wxMBConv_iconv::wxMBConv_iconv(const wxChar *name) |
1659 | : m_name(name) |
1660 | { |
1661 | m_minMBCharWidth = 0; |
1662 | |
1663 | // iconv operates with chars, not wxChars, but luckily it uses only ASCII |
1664 | // names for the charsets |
1665 | const wxCharBuffer cname(wxString(name).ToAscii()); |
1666 | |
1667 | // check for charset that represents wchar_t: |
1668 | if ( ms_wcCharsetName.empty() ) |
1669 | { |
1670 | wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:")); |
1671 | |
1672 | #if wxUSE_FONTMAP |
1673 | const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC); |
1674 | #else // !wxUSE_FONTMAP |
1675 | static const wxChar *names_static[] = |
1676 | { |
1677 | #if SIZEOF_WCHAR_T == 4 |
1678 | _T("UCS-4"), |
1679 | #elif SIZEOF_WCHAR_T = 2 |
1680 | _T("UCS-2"), |
1681 | #endif |
1682 | NULL |
1683 | }; |
1684 | const wxChar **names = names_static; |
1685 | #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP |
1686 | |
1687 | for ( ; *names && ms_wcCharsetName.empty(); ++names ) |
1688 | { |
1689 | const wxString nameCS(*names); |
1690 | |
1691 | // first try charset with explicit bytesex info (e.g. "UCS-4LE"): |
1692 | wxString nameXE(nameCS); |
1693 | |
1694 | #ifdef WORDS_BIGENDIAN |
1695 | nameXE += _T("BE"); |
1696 | #else // little endian |
1697 | nameXE += _T("LE"); |
1698 | #endif |
1699 | |
1700 | wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""), |
1701 | nameXE.c_str()); |
1702 | |
1703 | m2w = iconv_open(nameXE.ToAscii(), cname); |
1704 | if ( m2w == ICONV_T_INVALID ) |
1705 | { |
1706 | // try charset w/o bytesex info (e.g. "UCS4") |
1707 | wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""), |
1708 | nameCS.c_str()); |
1709 | m2w = iconv_open(nameCS.ToAscii(), cname); |
1710 | |
1711 | // and check for bytesex ourselves: |
1712 | if ( m2w != ICONV_T_INVALID ) |
1713 | { |
1714 | char buf[2], *bufPtr; |
1715 | wchar_t wbuf[2], *wbufPtr; |
1716 | size_t insz, outsz; |
1717 | size_t res; |
1718 | |
1719 | buf[0] = 'A'; |
1720 | buf[1] = 0; |
1721 | wbuf[0] = 0; |
1722 | insz = 2; |
1723 | outsz = SIZEOF_WCHAR_T * 2; |
1724 | wbufPtr = wbuf; |
1725 | bufPtr = buf; |
1726 | |
1727 | res = iconv( |
1728 | m2w, ICONV_CHAR_CAST(&bufPtr), &insz, |
1729 | (char**)&wbufPtr, &outsz); |
1730 | |
1731 | if (ICONV_FAILED(res, insz)) |
1732 | { |
1733 | wxLogLastError(wxT("iconv")); |
1734 | wxLogError(_("Conversion to charset '%s' doesn't work."), |
1735 | nameCS.c_str()); |
1736 | } |
1737 | else // ok, can convert to this encoding, remember it |
1738 | { |
1739 | ms_wcCharsetName = nameCS; |
1740 | ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0]; |
1741 | } |
1742 | } |
1743 | } |
1744 | else // use charset not requiring byte swapping |
1745 | { |
1746 | ms_wcCharsetName = nameXE; |
1747 | } |
1748 | } |
1749 | |
1750 | wxLogTrace(TRACE_STRCONV, |
1751 | wxT("iconv wchar_t charset is \"%s\"%s"), |
1752 | ms_wcCharsetName.empty() ? _T("<none>") |
1753 | : ms_wcCharsetName.c_str(), |
1754 | ms_wcNeedsSwap ? _T(" (needs swap)") |
1755 | : _T("")); |
1756 | } |
1757 | else // we already have ms_wcCharsetName |
1758 | { |
1759 | m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname); |
1760 | } |
1761 | |
1762 | if ( ms_wcCharsetName.empty() ) |
1763 | { |
1764 | w2m = ICONV_T_INVALID; |
1765 | } |
1766 | else |
1767 | { |
1768 | w2m = iconv_open(cname, ms_wcCharsetName.ToAscii()); |
1769 | if ( w2m == ICONV_T_INVALID ) |
1770 | { |
1771 | wxLogTrace(TRACE_STRCONV, |
1772 | wxT("\"%s\" -> \"%s\" works but not the converse!?"), |
1773 | ms_wcCharsetName.c_str(), cname.data()); |
1774 | } |
1775 | } |
1776 | } |
1777 | |
1778 | wxMBConv_iconv::~wxMBConv_iconv() |
1779 | { |
1780 | if ( m2w != ICONV_T_INVALID ) |
1781 | iconv_close(m2w); |
1782 | if ( w2m != ICONV_T_INVALID ) |
1783 | iconv_close(w2m); |
1784 | } |
1785 | |
1786 | size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const |
1787 | { |
1788 | // find the string length: notice that must be done differently for |
1789 | // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs |
1790 | size_t inbuf; |
1791 | const size_t nulLen = GetMBNulLen(); |
1792 | switch ( nulLen ) |
1793 | { |
1794 | default: |
1795 | return wxCONV_FAILED; |
1796 | |
1797 | case 1: |
1798 | inbuf = strlen(psz); // arguably more optimized than our version |
1799 | break; |
1800 | |
1801 | case 2: |
1802 | case 4: |
1803 | // for UTF-16/32 not only we need to have 2/4 consecutive NULs but |
1804 | // they also have to start at character boundary and not span two |
1805 | // adjacent characters |
1806 | const char *p; |
1807 | for ( p = psz; NotAllNULs(p, nulLen); p += nulLen ) |
1808 | ; |
1809 | inbuf = p - psz; |
1810 | break; |
1811 | } |
1812 | |
1813 | #if wxUSE_THREADS |
1814 | // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle. |
1815 | // Unfortunately there are a couple of global wxCSConv objects such as |
1816 | // wxConvLocal that are used all over wx code, so we have to make sure |
1817 | // the handle is used by at most one thread at the time. Otherwise |
1818 | // only a few wx classes would be safe to use from non-main threads |
1819 | // as MB<->WC conversion would fail "randomly". |
1820 | wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); |
1821 | #endif // wxUSE_THREADS |
1822 | |
1823 | size_t outbuf = n * SIZEOF_WCHAR_T; |
1824 | size_t res, cres; |
1825 | // VS: Use these instead of psz, buf because iconv() modifies its arguments: |
1826 | wchar_t *bufPtr = buf; |
1827 | const char *pszPtr = psz; |
1828 | |
1829 | if (buf) |
1830 | { |
1831 | // have destination buffer, convert there |
1832 | cres = iconv(m2w, |
1833 | ICONV_CHAR_CAST(&pszPtr), &inbuf, |
1834 | (char**)&bufPtr, &outbuf); |
1835 | res = n - (outbuf / SIZEOF_WCHAR_T); |
1836 | |
1837 | if (ms_wcNeedsSwap) |
1838 | { |
1839 | // convert to native endianness |
1840 | for ( unsigned i = 0; i < res; i++ ) |
1841 | buf[n] = WC_BSWAP(buf[i]); |
1842 | } |
1843 | |
1844 | // NUL-terminate the string if there is any space left |
1845 | if (res < n) |
1846 | buf[res] = 0; |
1847 | } |
1848 | else |
1849 | { |
1850 | // no destination buffer... convert using temp buffer |
1851 | // to calculate destination buffer requirement |
1852 | wchar_t tbuf[8]; |
1853 | res = 0; |
1854 | |
1855 | do |
1856 | { |
1857 | bufPtr = tbuf; |
1858 | outbuf = 8 * SIZEOF_WCHAR_T; |
1859 | |
1860 | cres = iconv(m2w, |
1861 | ICONV_CHAR_CAST(&pszPtr), &inbuf, |
1862 | (char**)&bufPtr, &outbuf ); |
1863 | |
1864 | res += 8 - (outbuf / SIZEOF_WCHAR_T); |
1865 | } |
1866 | while ((cres == (size_t)-1) && (errno == E2BIG)); |
1867 | } |
1868 | |
1869 | if (ICONV_FAILED(cres, inbuf)) |
1870 | { |
1871 | //VS: it is ok if iconv fails, hence trace only |
1872 | wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); |
1873 | return wxCONV_FAILED; |
1874 | } |
1875 | |
1876 | return res; |
1877 | } |
1878 | |
1879 | size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const |
1880 | { |
1881 | #if wxUSE_THREADS |
1882 | // NB: explained in MB2WC |
1883 | wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex); |
1884 | #endif |
1885 | |
1886 | size_t inlen = wxWcslen(psz); |
1887 | size_t inbuf = inlen * SIZEOF_WCHAR_T; |
1888 | size_t outbuf = n; |
1889 | size_t res, cres; |
1890 | |
1891 | wchar_t *tmpbuf = 0; |
1892 | |
1893 | if (ms_wcNeedsSwap) |
1894 | { |
1895 | // need to copy to temp buffer to switch endianness |
1896 | // (doing WC_BSWAP twice on the original buffer won't help, as it |
1897 | // could be in read-only memory, or be accessed in some other thread) |
1898 | tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T); |
1899 | for ( size_t i = 0; i < inlen; i++ ) |
1900 | tmpbuf[n] = WC_BSWAP(psz[i]); |
1901 | |
1902 | tmpbuf[inlen] = L'\0'; |
1903 | psz = tmpbuf; |
1904 | } |
1905 | |
1906 | if (buf) |
1907 | { |
1908 | // have destination buffer, convert there |
1909 | cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf ); |
1910 | |
1911 | res = n - outbuf; |
1912 | |
1913 | // NB: iconv was given only wcslen(psz) characters on input, and so |
1914 | // it couldn't convert the trailing zero. Let's do it ourselves |
1915 | // if there's some room left for it in the output buffer. |
1916 | if (res < n) |
1917 | buf[0] = 0; |
1918 | } |
1919 | else |
1920 | { |
1921 | // no destination buffer: convert using temp buffer |
1922 | // to calculate destination buffer requirement |
1923 | char tbuf[16]; |
1924 | res = 0; |
1925 | do |
1926 | { |
1927 | buf = tbuf; |
1928 | outbuf = 16; |
1929 | |
1930 | cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf ); |
1931 | |
1932 | res += 16 - outbuf; |
1933 | } |
1934 | while ((cres == (size_t)-1) && (errno == E2BIG)); |
1935 | } |
1936 | |
1937 | if (ms_wcNeedsSwap) |
1938 | { |
1939 | free(tmpbuf); |
1940 | } |
1941 | |
1942 | if (ICONV_FAILED(cres, inbuf)) |
1943 | { |
1944 | wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode())); |
1945 | return wxCONV_FAILED; |
1946 | } |
1947 | |
1948 | return res; |
1949 | } |
1950 | |
1951 | size_t wxMBConv_iconv::GetMBNulLen() const |
1952 | { |
1953 | if ( m_minMBCharWidth == 0 ) |
1954 | { |
1955 | wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv); |
1956 | |
1957 | #if wxUSE_THREADS |
1958 | // NB: explained in MB2WC |
1959 | wxMutexLocker lock(self->m_iconvMutex); |
1960 | #endif |
1961 | |
1962 | const wchar_t *wnul = L""; |
1963 | char buf[8]; // should be enough for NUL in any encoding |
1964 | size_t inLen = sizeof(wchar_t), |
1965 | outLen = WXSIZEOF(buf); |
1966 | char *inBuff = (char *)wnul; |
1967 | char *outBuff = buf; |
1968 | if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 ) |
1969 | { |
1970 | self->m_minMBCharWidth = (size_t)-1; |
1971 | } |
1972 | else // ok |
1973 | { |
1974 | self->m_minMBCharWidth = outBuff - buf; |
1975 | } |
1976 | } |
1977 | |
1978 | return m_minMBCharWidth; |
1979 | } |
1980 | |
1981 | #endif // HAVE_ICONV |
1982 | |
1983 | |
1984 | // ============================================================================ |
1985 | // Win32 conversion classes |
1986 | // ============================================================================ |
1987 | |
1988 | #ifdef wxHAVE_WIN32_MB2WC |
1989 | |
1990 | // from utils.cpp |
1991 | #if wxUSE_FONTMAP |
1992 | extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset); |
1993 | extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding); |
1994 | #endif |
1995 | |
1996 | class wxMBConv_win32 : public wxMBConv |
1997 | { |
1998 | public: |
1999 | wxMBConv_win32() |
2000 | { |
2001 | m_CodePage = CP_ACP; |
2002 | m_minMBCharWidth = 0; |
2003 | } |
2004 | |
2005 | wxMBConv_win32(const wxMBConv_win32& conv) |
2006 | : wxMBConv() |
2007 | { |
2008 | m_CodePage = conv.m_CodePage; |
2009 | m_minMBCharWidth = conv.m_minMBCharWidth; |
2010 | } |
2011 | |
2012 | #if wxUSE_FONTMAP |
2013 | wxMBConv_win32(const wxChar* name) |
2014 | { |
2015 | m_CodePage = wxCharsetToCodepage(name); |
2016 | m_minMBCharWidth = 0; |
2017 | } |
2018 | |
2019 | wxMBConv_win32(wxFontEncoding encoding) |
2020 | { |
2021 | m_CodePage = wxEncodingToCodepage(encoding); |
2022 | m_minMBCharWidth = 0; |
2023 | } |
2024 | #endif // wxUSE_FONTMAP |
2025 | |
2026 | virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const |
2027 | { |
2028 | // note that we have to use MB_ERR_INVALID_CHARS flag as it without it |
2029 | // the behaviour is not compatible with the Unix version (using iconv) |
2030 | // and break the library itself, e.g. wxTextInputStream::NextChar() |
2031 | // wouldn't work if reading an incomplete MB char didn't result in an |
2032 | // error |
2033 | // |
2034 | // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or |
2035 | // Win XP or newer and it is not supported for UTF-[78] so we always |
2036 | // use our own conversions in this case. See |
2037 | // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx |
2038 | // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp |
2039 | if ( m_CodePage == CP_UTF8 ) |
2040 | { |
2041 | return wxConvUTF8.MB2WC(buf, psz, n); |
2042 | } |
2043 | |
2044 | if ( m_CodePage == CP_UTF7 ) |
2045 | { |
2046 | return wxConvUTF7.MB2WC(buf, psz, n); |
2047 | } |
2048 | |
2049 | int flags = 0; |
2050 | if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) && |
2051 | IsAtLeastWin2kSP4() ) |
2052 | { |
2053 | flags = MB_ERR_INVALID_CHARS; |
2054 | } |
2055 | |
2056 | const size_t len = ::MultiByteToWideChar |
2057 | ( |
2058 | m_CodePage, // code page |
2059 | flags, // flags: fall on error |
2060 | psz, // input string |
2061 | -1, // its length (NUL-terminated) |
2062 | buf, // output string |
2063 | buf ? n : 0 // size of output buffer |
2064 | ); |
2065 | if ( !len ) |
2066 | { |
2067 | // function totally failed |
2068 | return wxCONV_FAILED; |
2069 | } |
2070 | |
2071 | // if we were really converting and didn't use MB_ERR_INVALID_CHARS, |
2072 | // check if we succeeded, by doing a double trip: |
2073 | if ( !flags && buf ) |
2074 | { |
2075 | const size_t mbLen = strlen(psz); |
2076 | wxCharBuffer mbBuf(mbLen); |
2077 | if ( ::WideCharToMultiByte |
2078 | ( |
2079 | m_CodePage, |
2080 | 0, |
2081 | buf, |
2082 | -1, |
2083 | mbBuf.data(), |
2084 | mbLen + 1, // size in bytes, not length |
2085 | NULL, |
2086 | NULL |
2087 | ) == 0 || |
2088 | strcmp(mbBuf, psz) != 0 ) |
2089 | { |
2090 | // we didn't obtain the same thing we started from, hence |
2091 | // the conversion was lossy and we consider that it failed |
2092 | return wxCONV_FAILED; |
2093 | } |
2094 | } |
2095 | |
2096 | // note that it returns count of written chars for buf != NULL and size |
2097 | // of the needed buffer for buf == NULL so in either case the length of |
2098 | // the string (which never includes the terminating NUL) is one less |
2099 | return len - 1; |
2100 | } |
2101 | |
2102 | virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const |
2103 | { |
2104 | /* |
2105 | we have a problem here: by default, WideCharToMultiByte() may |
2106 | replace characters unrepresentable in the target code page with bad |
2107 | quality approximations such as turning "1/2" symbol (U+00BD) into |
2108 | "1" for the code pages which don't have it and we, obviously, want |
2109 | to avoid this at any price |
2110 | |
2111 | the trouble is that this function does it _silently_, i.e. it won't |
2112 | even tell us whether it did or not... Win98/2000 and higher provide |
2113 | WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and |
2114 | we have to resort to a round trip, i.e. check that converting back |
2115 | results in the same string -- this is, of course, expensive but |
2116 | otherwise we simply can't be sure to not garble the data. |
2117 | */ |
2118 | |
2119 | // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN |
2120 | // it doesn't work with CJK encodings (which we test for rather roughly |
2121 | // here...) nor with UTF-7/8 nor, of course, with Windows versions not |
2122 | // supporting it |
2123 | BOOL usedDef wxDUMMY_INITIALIZE(false); |
2124 | BOOL *pUsedDef; |
2125 | int flags; |
2126 | if ( CanUseNoBestFit() && m_CodePage < 50000 ) |
2127 | { |
2128 | // it's our lucky day |
2129 | flags = WC_NO_BEST_FIT_CHARS; |
2130 | pUsedDef = &usedDef; |
2131 | } |
2132 | else // old system or unsupported encoding |
2133 | { |
2134 | flags = 0; |
2135 | pUsedDef = NULL; |
2136 | } |
2137 | |
2138 | const size_t len = ::WideCharToMultiByte |
2139 | ( |
2140 | m_CodePage, // code page |
2141 | flags, // either none or no best fit |
2142 | pwz, // input string |
2143 | -1, // it is (wide) NUL-terminated |
2144 | buf, // output buffer |
2145 | buf ? n : 0, // and its size |
2146 | NULL, // default "replacement" char |
2147 | pUsedDef // [out] was it used? |
2148 | ); |
2149 | |
2150 | if ( !len ) |
2151 | { |
2152 | // function totally failed |
2153 | return wxCONV_FAILED; |
2154 | } |
2155 | |
2156 | // if we were really converting, check if we succeeded |
2157 | if ( buf ) |
2158 | { |
2159 | if ( flags ) |
2160 | { |
2161 | // check if the conversion failed, i.e. if any replacements |
2162 | // were done |
2163 | if ( usedDef ) |
2164 | return wxCONV_FAILED; |
2165 | } |
2166 | else // we must resort to double tripping... |
2167 | { |
2168 | wxWCharBuffer wcBuf(n); |
2169 | if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || |
2170 | wcscmp(wcBuf, pwz) != 0 ) |
2171 | { |
2172 | // we didn't obtain the same thing we started from, hence |
2173 | // the conversion was lossy and we consider that it failed |
2174 | return wxCONV_FAILED; |
2175 | } |
2176 | } |
2177 | } |
2178 | |
2179 | // see the comment above for the reason of "len - 1" |
2180 | return len - 1; |
2181 | } |
2182 | |
2183 | virtual size_t GetMBNulLen() const |
2184 | { |
2185 | if ( m_minMBCharWidth == 0 ) |
2186 | { |
2187 | int len = ::WideCharToMultiByte |
2188 | ( |
2189 | m_CodePage, // code page |
2190 | 0, // no flags |
2191 | L"", // input string |
2192 | 1, // translate just the NUL |
2193 | NULL, // output buffer |
2194 | 0, // and its size |
2195 | NULL, // no replacement char |
2196 | NULL // [out] don't care if it was used |
2197 | ); |
2198 | |
2199 | wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32); |
2200 | switch ( len ) |
2201 | { |
2202 | default: |
2203 | wxLogDebug(_T("Unexpected NUL length %d"), len); |
2204 | self->m_minMBCharWidth = (size_t)-1; |
2205 | break; |
2206 | |
2207 | case 0: |
2208 | self->m_minMBCharWidth = (size_t)-1; |
2209 | break; |
2210 | |
2211 | case 1: |
2212 | case 2: |
2213 | case 4: |
2214 | self->m_minMBCharWidth = len; |
2215 | break; |
2216 | } |
2217 | } |
2218 | |
2219 | return m_minMBCharWidth; |
2220 | } |
2221 | |
2222 | virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); } |
2223 | |
2224 | bool IsOk() const { return m_CodePage != -1; } |
2225 | |
2226 | private: |
2227 | static bool CanUseNoBestFit() |
2228 | { |
2229 | static int s_isWin98Or2k = -1; |
2230 | |
2231 | if ( s_isWin98Or2k == -1 ) |
2232 | { |
2233 | int verMaj, verMin; |
2234 | switch ( wxGetOsVersion(&verMaj, &verMin) ) |
2235 | { |
2236 | case wxOS_WINDOWS_9X: |
2237 | s_isWin98Or2k = verMaj >= 4 && verMin >= 10; |
2238 | break; |
2239 | |
2240 | case wxOS_WINDOWS_NT: |
2241 | s_isWin98Or2k = verMaj >= 5; |
2242 | break; |
2243 | |
2244 | default: |
2245 | // unknown: be conservative by default |
2246 | s_isWin98Or2k = 0; |
2247 | break; |
2248 | } |
2249 | |
2250 | wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") ); |
2251 | } |
2252 | |
2253 | return s_isWin98Or2k == 1; |
2254 | } |
2255 | |
2256 | static bool IsAtLeastWin2kSP4() |
2257 | { |
2258 | #ifdef __WXWINCE__ |
2259 | return false; |
2260 | #else |
2261 | static int s_isAtLeastWin2kSP4 = -1; |
2262 | |
2263 | if ( s_isAtLeastWin2kSP4 == -1 ) |
2264 | { |
2265 | OSVERSIONINFOEX ver; |
2266 | |
2267 | memset(&ver, 0, sizeof(ver)); |
2268 | ver.dwOSVersionInfoSize = sizeof(ver); |
2269 | GetVersionEx((OSVERSIONINFO*)&ver); |
2270 | |
2271 | s_isAtLeastWin2kSP4 = |
2272 | ((ver.dwMajorVersion > 5) || // Vista+ |
2273 | (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003 |
2274 | (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 && |
2275 | ver.wServicePackMajor >= 4)) // 2000 SP4+ |
2276 | ? 1 : 0; |
2277 | } |
2278 | |
2279 | return s_isAtLeastWin2kSP4 == 1; |
2280 | #endif |
2281 | } |
2282 | |
2283 | |
2284 | // the code page we're working with |
2285 | long m_CodePage; |
2286 | |
2287 | // cached result of GetMBNulLen(), set to 0 initially meaning |
2288 | // "unknown" |
2289 | size_t m_minMBCharWidth; |
2290 | }; |
2291 | |
2292 | #endif // wxHAVE_WIN32_MB2WC |
2293 | |
2294 | // ============================================================================ |
2295 | // Cocoa conversion classes |
2296 | // ============================================================================ |
2297 | |
2298 | #if defined(__WXCOCOA__) |
2299 | |
2300 | // RN: There is no UTF-32 support in either Core Foundation or Cocoa. |
2301 | // Strangely enough, internally Core Foundation uses |
2302 | // UTF-32 internally quite a bit - its just not public (yet). |
2303 | |
2304 | #include <CoreFoundation/CFString.h> |
2305 | #include <CoreFoundation/CFStringEncodingExt.h> |
2306 | |
2307 | CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding) |
2308 | { |
2309 | CFStringEncoding enc = kCFStringEncodingInvalidId ; |
2310 | |
2311 | switch (encoding) |
2312 | { |
2313 | case wxFONTENCODING_DEFAULT : |
2314 | enc = CFStringGetSystemEncoding(); |
2315 | break ; |
2316 | |
2317 | case wxFONTENCODING_ISO8859_1 : |
2318 | enc = kCFStringEncodingISOLatin1 ; |
2319 | break ; |
2320 | case wxFONTENCODING_ISO8859_2 : |
2321 | enc = kCFStringEncodingISOLatin2; |
2322 | break ; |
2323 | case wxFONTENCODING_ISO8859_3 : |
2324 | enc = kCFStringEncodingISOLatin3 ; |
2325 | break ; |
2326 | case wxFONTENCODING_ISO8859_4 : |
2327 | enc = kCFStringEncodingISOLatin4; |
2328 | break ; |
2329 | case wxFONTENCODING_ISO8859_5 : |
2330 | enc = kCFStringEncodingISOLatinCyrillic; |
2331 | break ; |
2332 | case wxFONTENCODING_ISO8859_6 : |
2333 | enc = kCFStringEncodingISOLatinArabic; |
2334 | break ; |
2335 | case wxFONTENCODING_ISO8859_7 : |
2336 | enc = kCFStringEncodingISOLatinGreek; |
2337 | break ; |
2338 | case wxFONTENCODING_ISO8859_8 : |
2339 | enc = kCFStringEncodingISOLatinHebrew; |
2340 | break ; |
2341 | case wxFONTENCODING_ISO8859_9 : |
2342 | enc = kCFStringEncodingISOLatin5; |
2343 | break ; |
2344 | case wxFONTENCODING_ISO8859_10 : |
2345 | enc = kCFStringEncodingISOLatin6; |
2346 | break ; |
2347 | case wxFONTENCODING_ISO8859_11 : |
2348 | enc = kCFStringEncodingISOLatinThai; |
2349 | break ; |
2350 | case wxFONTENCODING_ISO8859_13 : |
2351 | enc = kCFStringEncodingISOLatin7; |
2352 | break ; |
2353 | case wxFONTENCODING_ISO8859_14 : |
2354 | enc = kCFStringEncodingISOLatin8; |
2355 | break ; |
2356 | case wxFONTENCODING_ISO8859_15 : |
2357 | enc = kCFStringEncodingISOLatin9; |
2358 | break ; |
2359 | |
2360 | case wxFONTENCODING_KOI8 : |
2361 | enc = kCFStringEncodingKOI8_R; |
2362 | break ; |
2363 | case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866 |
2364 | enc = kCFStringEncodingDOSRussian; |
2365 | break ; |
2366 | |
2367 | // case wxFONTENCODING_BULGARIAN : |
2368 | // enc = ; |
2369 | // break ; |
2370 | |
2371 | case wxFONTENCODING_CP437 : |
2372 | enc = kCFStringEncodingDOSLatinUS ; |
2373 | break ; |
2374 | case wxFONTENCODING_CP850 : |
2375 | enc = kCFStringEncodingDOSLatin1; |
2376 | break ; |
2377 | case wxFONTENCODING_CP852 : |
2378 | enc = kCFStringEncodingDOSLatin2; |
2379 | break ; |
2380 | case wxFONTENCODING_CP855 : |
2381 | enc = kCFStringEncodingDOSCyrillic; |
2382 | break ; |
2383 | case wxFONTENCODING_CP866 : |
2384 | enc = kCFStringEncodingDOSRussian ; |
2385 | break ; |
2386 | case wxFONTENCODING_CP874 : |
2387 | enc = kCFStringEncodingDOSThai; |
2388 | break ; |
2389 | case wxFONTENCODING_CP932 : |
2390 | enc = kCFStringEncodingDOSJapanese; |
2391 | break ; |
2392 | case wxFONTENCODING_CP936 : |
2393 | enc = kCFStringEncodingDOSChineseSimplif ; |
2394 | break ; |
2395 | case wxFONTENCODING_CP949 : |
2396 | enc = kCFStringEncodingDOSKorean; |
2397 | break ; |
2398 | case wxFONTENCODING_CP950 : |
2399 | enc = kCFStringEncodingDOSChineseTrad; |
2400 | break ; |
2401 | case wxFONTENCODING_CP1250 : |
2402 | enc = kCFStringEncodingWindowsLatin2; |
2403 | break ; |
2404 | case wxFONTENCODING_CP1251 : |
2405 | enc = kCFStringEncodingWindowsCyrillic ; |
2406 | break ; |
2407 | case wxFONTENCODING_CP1252 : |
2408 | enc = kCFStringEncodingWindowsLatin1 ; |
2409 | break ; |
2410 | case wxFONTENCODING_CP1253 : |
2411 | enc = kCFStringEncodingWindowsGreek; |
2412 | break ; |
2413 | case wxFONTENCODING_CP1254 : |
2414 | enc = kCFStringEncodingWindowsLatin5; |
2415 | break ; |
2416 | case wxFONTENCODING_CP1255 : |
2417 | enc = kCFStringEncodingWindowsHebrew ; |
2418 | break ; |
2419 | case wxFONTENCODING_CP1256 : |
2420 | enc = kCFStringEncodingWindowsArabic ; |
2421 | break ; |
2422 | case wxFONTENCODING_CP1257 : |
2423 | enc = kCFStringEncodingWindowsBalticRim; |
2424 | break ; |
2425 | // This only really encodes to UTF7 (if that) evidently |
2426 | // case wxFONTENCODING_UTF7 : |
2427 | // enc = kCFStringEncodingNonLossyASCII ; |
2428 | // break ; |
2429 | case wxFONTENCODING_UTF8 : |
2430 | enc = kCFStringEncodingUTF8 ; |
2431 | break ; |
2432 | case wxFONTENCODING_EUC_JP : |
2433 | enc = kCFStringEncodingEUC_JP; |
2434 | break ; |
2435 | case wxFONTENCODING_UTF16 : |
2436 | enc = kCFStringEncodingUnicode ; |
2437 | break ; |
2438 | case wxFONTENCODING_MACROMAN : |
2439 | enc = kCFStringEncodingMacRoman ; |
2440 | break ; |
2441 | case wxFONTENCODING_MACJAPANESE : |
2442 | enc = kCFStringEncodingMacJapanese ; |
2443 | break ; |
2444 | case wxFONTENCODING_MACCHINESETRAD : |
2445 | enc = kCFStringEncodingMacChineseTrad ; |
2446 | break ; |
2447 | case wxFONTENCODING_MACKOREAN : |
2448 | enc = kCFStringEncodingMacKorean ; |
2449 | break ; |
2450 | case wxFONTENCODING_MACARABIC : |
2451 | enc = kCFStringEncodingMacArabic ; |
2452 | break ; |
2453 | case wxFONTENCODING_MACHEBREW : |
2454 | enc = kCFStringEncodingMacHebrew ; |
2455 | break ; |
2456 | case wxFONTENCODING_MACGREEK : |
2457 | enc = kCFStringEncodingMacGreek ; |
2458 | break ; |
2459 | case wxFONTENCODING_MACCYRILLIC : |
2460 | enc = kCFStringEncodingMacCyrillic ; |
2461 | break ; |
2462 | case wxFONTENCODING_MACDEVANAGARI : |
2463 | enc = kCFStringEncodingMacDevanagari ; |
2464 | break ; |
2465 | case wxFONTENCODING_MACGURMUKHI : |
2466 | enc = kCFStringEncodingMacGurmukhi ; |
2467 | break ; |
2468 | case wxFONTENCODING_MACGUJARATI : |
2469 | enc = kCFStringEncodingMacGujarati ; |
2470 | break ; |
2471 | case wxFONTENCODING_MACORIYA : |
2472 | enc = kCFStringEncodingMacOriya ; |
2473 | break ; |
2474 | case wxFONTENCODING_MACBENGALI : |
2475 | enc = kCFStringEncodingMacBengali ; |
2476 | break ; |
2477 | case wxFONTENCODING_MACTAMIL : |
2478 | enc = kCFStringEncodingMacTamil ; |
2479 | break ; |
2480 | case wxFONTENCODING_MACTELUGU : |
2481 | enc = kCFStringEncodingMacTelugu ; |
2482 | break ; |
2483 | case wxFONTENCODING_MACKANNADA : |
2484 | enc = kCFStringEncodingMacKannada ; |
2485 | break ; |
2486 | case wxFONTENCODING_MACMALAJALAM : |
2487 | enc = kCFStringEncodingMacMalayalam ; |
2488 | break ; |
2489 | case wxFONTENCODING_MACSINHALESE : |
2490 | enc = kCFStringEncodingMacSinhalese ; |
2491 | break ; |
2492 | case wxFONTENCODING_MACBURMESE : |
2493 | enc = kCFStringEncodingMacBurmese ; |
2494 | break ; |
2495 | case wxFONTENCODING_MACKHMER : |
2496 | enc = kCFStringEncodingMacKhmer ; |
2497 | break ; |
2498 | case wxFONTENCODING_MACTHAI : |
2499 | enc = kCFStringEncodingMacThai ; |
2500 | break ; |
2501 | case wxFONTENCODING_MACLAOTIAN : |
2502 | enc = kCFStringEncodingMacLaotian ; |
2503 | break ; |
2504 | case wxFONTENCODING_MACGEORGIAN : |
2505 | enc = kCFStringEncodingMacGeorgian ; |
2506 | break ; |
2507 | case wxFONTENCODING_MACARMENIAN : |
2508 | enc = kCFStringEncodingMacArmenian ; |
2509 | break ; |
2510 | case wxFONTENCODING_MACCHINESESIMP : |
2511 | enc = kCFStringEncodingMacChineseSimp ; |
2512 | break ; |
2513 | case wxFONTENCODING_MACTIBETAN : |
2514 | enc = kCFStringEncodingMacTibetan ; |
2515 | break ; |
2516 | case wxFONTENCODING_MACMONGOLIAN : |
2517 | enc = kCFStringEncodingMacMongolian ; |
2518 | break ; |
2519 | case wxFONTENCODING_MACETHIOPIC : |
2520 | enc = kCFStringEncodingMacEthiopic ; |
2521 | break ; |
2522 | case wxFONTENCODING_MACCENTRALEUR : |
2523 | enc = kCFStringEncodingMacCentralEurRoman ; |
2524 | break ; |
2525 | case wxFONTENCODING_MACVIATNAMESE : |
2526 | enc = kCFStringEncodingMacVietnamese ; |
2527 | break ; |
2528 | case wxFONTENCODING_MACARABICEXT : |
2529 | enc = kCFStringEncodingMacExtArabic ; |
2530 | break ; |
2531 | case wxFONTENCODING_MACSYMBOL : |
2532 | enc = kCFStringEncodingMacSymbol ; |
2533 | break ; |
2534 | case wxFONTENCODING_MACDINGBATS : |
2535 | enc = kCFStringEncodingMacDingbats ; |
2536 | break ; |
2537 | case wxFONTENCODING_MACTURKISH : |
2538 | enc = kCFStringEncodingMacTurkish ; |
2539 | break ; |
2540 | case wxFONTENCODING_MACCROATIAN : |
2541 | enc = kCFStringEncodingMacCroatian ; |
2542 | break ; |
2543 | case wxFONTENCODING_MACICELANDIC : |
2544 | enc = kCFStringEncodingMacIcelandic ; |
2545 | break ; |
2546 | case wxFONTENCODING_MACROMANIAN : |
2547 | enc = kCFStringEncodingMacRomanian ; |
2548 | break ; |
2549 | case wxFONTENCODING_MACCELTIC : |
2550 | enc = kCFStringEncodingMacCeltic ; |
2551 | break ; |
2552 | case wxFONTENCODING_MACGAELIC : |
2553 | enc = kCFStringEncodingMacGaelic ; |
2554 | break ; |
2555 | // case wxFONTENCODING_MACKEYBOARD : |
2556 | // enc = kCFStringEncodingMacKeyboardGlyphs ; |
2557 | // break ; |
2558 | |
2559 | default : |
2560 | // because gcc is picky |
2561 | break ; |
2562 | } |
2563 | |
2564 | return enc ; |
2565 | } |
2566 | |
2567 | class wxMBConv_cocoa : public wxMBConv |
2568 | { |
2569 | public: |
2570 | wxMBConv_cocoa() |
2571 | { |
2572 | Init(CFStringGetSystemEncoding()) ; |
2573 | } |
2574 | |
2575 | wxMBConv_cocoa(const wxMBConv_cocoa& conv) |
2576 | { |
2577 | m_encoding = conv.m_encoding; |
2578 | } |
2579 | |
2580 | #if wxUSE_FONTMAP |
2581 | wxMBConv_cocoa(const wxChar* name) |
2582 | { |
2583 | Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ; |
2584 | } |
2585 | #endif |
2586 | |
2587 | wxMBConv_cocoa(wxFontEncoding encoding) |
2588 | { |
2589 | Init( wxCFStringEncFromFontEnc(encoding) ); |
2590 | } |
2591 | |
2592 | virtual ~wxMBConv_cocoa() |
2593 | { |
2594 | } |
2595 | |
2596 | void Init( CFStringEncoding encoding) |
2597 | { |
2598 | m_encoding = encoding ; |
2599 | } |
2600 | |
2601 | size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const |
2602 | { |
2603 | wxASSERT(szUnConv); |
2604 | |
2605 | CFStringRef theString = CFStringCreateWithBytes ( |
2606 | NULL, //the allocator |
2607 | (const UInt8*)szUnConv, |
2608 | strlen(szUnConv), |
2609 | m_encoding, |
2610 | false //no BOM/external representation |
2611 | ); |
2612 | |
2613 | wxASSERT(theString); |
2614 | |
2615 | size_t nOutLength = CFStringGetLength(theString); |
2616 | |
2617 | if (szOut == NULL) |
2618 | { |
2619 | CFRelease(theString); |
2620 | return nOutLength; |
2621 | } |
2622 | |
2623 | CFRange theRange = { 0, nOutSize }; |
2624 | |
2625 | #if SIZEOF_WCHAR_T == 4 |
2626 | UniChar* szUniCharBuffer = new UniChar[nOutSize]; |
2627 | #endif |
2628 | |
2629 | CFStringGetCharacters(theString, theRange, szUniCharBuffer); |
2630 | |
2631 | CFRelease(theString); |
2632 | |
2633 | szUniCharBuffer[nOutLength] = '\0'; |
2634 | |
2635 | #if SIZEOF_WCHAR_T == 4 |
2636 | wxMBConvUTF16 converter; |
2637 | converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize ); |
2638 | delete [] szUniCharBuffer; |
2639 | #endif |
2640 | |
2641 | return nOutLength; |
2642 | } |
2643 | |
2644 | size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const |
2645 | { |
2646 | wxASSERT(szUnConv); |
2647 | |
2648 | size_t nRealOutSize; |
2649 | size_t nBufSize = wxWcslen(szUnConv); |
2650 | UniChar* szUniBuffer = (UniChar*) szUnConv; |
2651 | |
2652 | #if SIZEOF_WCHAR_T == 4 |
2653 | wxMBConvUTF16 converter ; |
2654 | nBufSize = converter.WC2MB( NULL, szUnConv, 0 ); |
2655 | szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1]; |
2656 | converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar)); |
2657 | nBufSize /= sizeof(UniChar); |
2658 | #endif |
2659 | |
2660 | CFStringRef theString = CFStringCreateWithCharactersNoCopy( |
2661 | NULL, //allocator |
2662 | szUniBuffer, |
2663 | nBufSize, |
2664 | kCFAllocatorNull //deallocator - we want to deallocate it ourselves |
2665 | ); |
2666 | |
2667 | wxASSERT(theString); |
2668 | |
2669 | //Note that CER puts a BOM when converting to unicode |
2670 | //so we check and use getchars instead in that case |
2671 | if (m_encoding == kCFStringEncodingUnicode) |
2672 | { |
2673 | if (szOut != NULL) |
2674 | CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut); |
2675 | |
2676 | nRealOutSize = CFStringGetLength(theString) + 1; |
2677 | } |
2678 | else |
2679 | { |
2680 | CFStringGetBytes( |
2681 | theString, |
2682 | CFRangeMake(0, CFStringGetLength(theString)), |
2683 | m_encoding, |
2684 | 0, //what to put in characters that can't be converted - |
2685 | //0 tells CFString to return NULL if it meets such a character |
2686 | false, //not an external representation |
2687 | (UInt8*) szOut, |
2688 | nOutSize, |
2689 | (CFIndex*) &nRealOutSize |
2690 | ); |
2691 | } |
2692 | |
2693 | CFRelease(theString); |
2694 | |
2695 | #if SIZEOF_WCHAR_T == 4 |
2696 | delete[] szUniBuffer; |
2697 | #endif |
2698 | |
2699 | return nRealOutSize - 1; |
2700 | } |
2701 | |
2702 | virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); } |
2703 | |
2704 | bool IsOk() const |
2705 | { |
2706 | return m_encoding != kCFStringEncodingInvalidId && |
2707 | CFStringIsEncodingAvailable(m_encoding); |
2708 | } |
2709 | |
2710 | private: |
2711 | CFStringEncoding m_encoding ; |
2712 | }; |
2713 | |
2714 | #endif // defined(__WXCOCOA__) |
2715 | |
2716 | // ============================================================================ |
2717 | // Mac conversion classes |
2718 | // ============================================================================ |
2719 | |
2720 | #if defined(__WXMAC__) && defined(TARGET_CARBON) |
2721 | |
2722 | class wxMBConv_mac : public wxMBConv |
2723 | { |
2724 | public: |
2725 | wxMBConv_mac() |
2726 | { |
2727 | Init(CFStringGetSystemEncoding()) ; |
2728 | } |
2729 | |
2730 | wxMBConv_mac(const wxMBConv_mac& conv) |
2731 | { |
2732 | Init(conv.m_char_encoding); |
2733 | } |
2734 | |
2735 | #if wxUSE_FONTMAP |
2736 | wxMBConv_mac(const wxChar* name) |
2737 | { |
2738 | wxFontEncoding enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false); |
2739 | Init( (enc != wxFONTENCODING_SYSTEM) ? wxMacGetSystemEncFromFontEnc( enc ) : kTextEncodingUnknown); |
2740 | } |
2741 | #endif |
2742 | |
2743 | wxMBConv_mac(wxFontEncoding encoding) |
2744 | { |
2745 | Init( wxMacGetSystemEncFromFontEnc(encoding) ); |
2746 | } |
2747 | |
2748 | virtual ~wxMBConv_mac() |
2749 | { |
2750 | OSStatus status = noErr ; |
2751 | if (m_MB2WC_converter) |
2752 | status = TECDisposeConverter(m_MB2WC_converter); |
2753 | if (m_WC2MB_converter) |
2754 | status = TECDisposeConverter(m_WC2MB_converter); |
2755 | } |
2756 | |
2757 | void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant , |
2758 | TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat) |
2759 | { |
2760 | m_MB2WC_converter = NULL ; |
2761 | m_WC2MB_converter = NULL ; |
2762 | if ( encoding != kTextEncodingUnknown ) |
2763 | { |
2764 | m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ; |
2765 | m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ; |
2766 | } |
2767 | else |
2768 | { |
2769 | m_char_encoding = kTextEncodingUnknown; |
2770 | m_unicode_encoding = kTextEncodingUnknown; |
2771 | } |
2772 | } |
2773 | |
2774 | virtual void CreateIfNeeded() const |
2775 | { |
2776 | if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL && |
2777 | m_char_encoding != kTextEncodingUnknown && m_unicode_encoding != kTextEncodingUnknown ) |
2778 | { |
2779 | OSStatus status = noErr ; |
2780 | status = TECCreateConverter(&m_MB2WC_converter, |
2781 | m_char_encoding, |
2782 | m_unicode_encoding); |
2783 | wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ; |
2784 | status = TECCreateConverter(&m_WC2MB_converter, |
2785 | m_unicode_encoding, |
2786 | m_char_encoding); |
2787 | wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ; |
2788 | } |
2789 | } |
2790 | |
2791 | size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const |
2792 | { |
2793 | CreateIfNeeded() ; |
2794 | OSStatus status = noErr ; |
2795 | ByteCount byteOutLen ; |
2796 | ByteCount byteInLen = strlen(psz) + 1; |
2797 | wchar_t *tbuf = NULL ; |
2798 | UniChar* ubuf = NULL ; |
2799 | size_t res = 0 ; |
2800 | |
2801 | if (buf == NULL) |
2802 | { |
2803 | // Apple specs say at least 32 |
2804 | n = wxMax( 32, byteInLen ) ; |
2805 | tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ; |
2806 | } |
2807 | |
2808 | ByteCount byteBufferLen = n * sizeof( UniChar ) ; |
2809 | |
2810 | #if SIZEOF_WCHAR_T == 4 |
2811 | ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ; |
2812 | #else |
2813 | ubuf = (UniChar*) (buf ? buf : tbuf) ; |
2814 | #endif |
2815 | { |
2816 | #if wxUSE_THREADS |
2817 | wxMutexLocker lock( m_MB2WC_guard ); |
2818 | #endif |
2819 | status = TECConvertText( |
2820 | m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen, |
2821 | (TextPtr) ubuf, byteBufferLen, &byteOutLen); |
2822 | } |
2823 | |
2824 | #if SIZEOF_WCHAR_T == 4 |
2825 | // we have to terminate here, because n might be larger for the trailing zero, and if UniChar |
2826 | // is not properly terminated we get random characters at the end |
2827 | ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; |
2828 | wxMBConvUTF16 converter ; |
2829 | res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ; |
2830 | free( ubuf ) ; |
2831 | #else |
2832 | res = byteOutLen / sizeof( UniChar ) ; |
2833 | #endif |
2834 | |
2835 | if ( buf == NULL ) |
2836 | free(tbuf) ; |
2837 | |
2838 | if ( buf && res < n) |
2839 | buf[res] = 0; |
2840 | |
2841 | return res ; |
2842 | } |
2843 | |
2844 | size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const |
2845 | { |
2846 | CreateIfNeeded() ; |
2847 | OSStatus status = noErr ; |
2848 | ByteCount byteOutLen ; |
2849 | ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ; |
2850 | |
2851 | char *tbuf = NULL ; |
2852 | |
2853 | if (buf == NULL) |
2854 | { |
2855 | // Apple specs say at least 32 |
2856 | n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); |
2857 | tbuf = (char*) malloc( n ) ; |
2858 | } |
2859 | |
2860 | ByteCount byteBufferLen = n ; |
2861 | UniChar* ubuf = NULL ; |
2862 | |
2863 | #if SIZEOF_WCHAR_T == 4 |
2864 | wxMBConvUTF16 converter ; |
2865 | size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ; |
2866 | byteInLen = unicharlen ; |
2867 | ubuf = (UniChar*) malloc( byteInLen + 2 ) ; |
2868 | converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ; |
2869 | #else |
2870 | ubuf = (UniChar*) psz ; |
2871 | #endif |
2872 | |
2873 | { |
2874 | #if wxUSE_THREADS |
2875 | wxMutexLocker lock( m_WC2MB_guard ); |
2876 | #endif |
2877 | status = TECConvertText( |
2878 | m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen, |
2879 | (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen); |
2880 | } |
2881 | |
2882 | #if SIZEOF_WCHAR_T == 4 |
2883 | free( ubuf ) ; |
2884 | #endif |
2885 | |
2886 | if ( buf == NULL ) |
2887 | free(tbuf) ; |
2888 | |
2889 | size_t res = byteOutLen ; |
2890 | if ( buf && res < n) |
2891 | { |
2892 | buf[res] = 0; |
2893 | |
2894 | //we need to double-trip to verify it didn't insert any ? in place |
2895 | //of bogus characters |
2896 | wxWCharBuffer wcBuf(n); |
2897 | size_t pszlen = wxWcslen(psz); |
2898 | if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED || |
2899 | wxWcslen(wcBuf) != pszlen || |
2900 | memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 ) |
2901 | { |
2902 | // we didn't obtain the same thing we started from, hence |
2903 | // the conversion was lossy and we consider that it failed |
2904 | return wxCONV_FAILED; |
2905 | } |
2906 | } |
2907 | |
2908 | return res ; |
2909 | } |
2910 | |
2911 | virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); } |
2912 | |
2913 | bool IsOk() const |
2914 | { |
2915 | CreateIfNeeded() ; |
2916 | return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; |
2917 | } |
2918 | |
2919 | protected : |
2920 | mutable TECObjectRef m_MB2WC_converter; |
2921 | mutable TECObjectRef m_WC2MB_converter; |
2922 | #if wxUSE_THREADS |
2923 | mutable wxMutex m_MB2WC_guard; |
2924 | mutable wxMutex m_WC2MB_guard; |
2925 | #endif |
2926 | |
2927 | TextEncodingBase m_char_encoding; |
2928 | TextEncodingBase m_unicode_encoding; |
2929 | }; |
2930 | |
2931 | // MB is decomposed (D) normalized UTF8 |
2932 | |
2933 | class wxMBConv_macUTF8D : public wxMBConv_mac |
2934 | { |
2935 | public : |
2936 | wxMBConv_macUTF8D() |
2937 | { |
2938 | Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ; |
2939 | m_uni = NULL; |
2940 | m_uniBack = NULL ; |
2941 | } |
2942 | |
2943 | virtual ~wxMBConv_macUTF8D() |
2944 | { |
2945 | if (m_uni!=NULL) |
2946 | DisposeUnicodeToTextInfo(&m_uni); |
2947 | if (m_uniBack!=NULL) |
2948 | DisposeUnicodeToTextInfo(&m_uniBack); |
2949 | } |
2950 | |
2951 | size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const |
2952 | { |
2953 | CreateIfNeeded() ; |
2954 | OSStatus status = noErr ; |
2955 | ByteCount byteOutLen ; |
2956 | ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ; |
2957 | |
2958 | char *tbuf = NULL ; |
2959 | |
2960 | if (buf == NULL) |
2961 | { |
2962 | // Apple specs say at least 32 |
2963 | n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T ); |
2964 | tbuf = (char*) malloc( n ) ; |
2965 | } |
2966 | |
2967 | ByteCount byteBufferLen = n ; |
2968 | UniChar* ubuf = NULL ; |
2969 | |
2970 | #if SIZEOF_WCHAR_T == 4 |
2971 | wxMBConvUTF16 converter ; |
2972 | size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ; |
2973 | byteInLen = unicharlen ; |
2974 | ubuf = (UniChar*) malloc( byteInLen + 2 ) ; |
2975 | converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ; |
2976 | #else |
2977 | ubuf = (UniChar*) psz ; |
2978 | #endif |
2979 | |
2980 | // ubuf is a non-decomposed UniChar buffer |
2981 | |
2982 | ByteCount dcubuflen = byteInLen * 2 + 2 ; |
2983 | ByteCount dcubufread , dcubufwritten ; |
2984 | UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ; |
2985 | |
2986 | { |
2987 | #if wxUSE_THREADS |
2988 | wxMutexLocker lock( m_WC2MB_guard ); |
2989 | #endif |
2990 | ConvertFromUnicodeToText( m_uni , byteInLen , ubuf , |
2991 | kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , dcubuf ) ; |
2992 | |
2993 | // we now convert that decomposed buffer into UTF8 |
2994 | |
2995 | status = TECConvertText( |
2996 | m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread, |
2997 | (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen); |
2998 | } |
2999 | |
3000 | free( dcubuf ); |
3001 | |
3002 | #if SIZEOF_WCHAR_T == 4 |
3003 | free( ubuf ) ; |
3004 | #endif |
3005 | |
3006 | if ( buf == NULL ) |
3007 | free(tbuf) ; |
3008 | |
3009 | size_t res = byteOutLen ; |
3010 | if ( buf && res < n) |
3011 | { |
3012 | buf[res] = 0; |
3013 | // don't test for round-trip fidelity yet, we cannot guarantee it yet |
3014 | } |
3015 | |
3016 | return res ; |
3017 | } |
3018 | |
3019 | size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const |
3020 | { |
3021 | CreateIfNeeded() ; |
3022 | OSStatus status = noErr ; |
3023 | ByteCount byteOutLen ; |
3024 | ByteCount byteInLen = strlen(psz) + 1; |
3025 | wchar_t *tbuf = NULL ; |
3026 | UniChar* ubuf = NULL ; |
3027 | size_t res = 0 ; |
3028 | |
3029 | if (buf == NULL) |
3030 | { |
3031 | // Apple specs say at least 32 |
3032 | n = wxMax( 32, byteInLen ) ; |
3033 | tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ; |
3034 | } |
3035 | |
3036 | ByteCount byteBufferLen = n * sizeof( UniChar ) ; |
3037 | |
3038 | #if SIZEOF_WCHAR_T == 4 |
3039 | ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ; |
3040 | #else |
3041 | ubuf = (UniChar*) (buf ? buf : tbuf) ; |
3042 | #endif |
3043 | |
3044 | ByteCount dcubuflen = byteBufferLen * 2 + 2 ; |
3045 | ByteCount dcubufread , dcubufwritten ; |
3046 | UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ; |
3047 | |
3048 | { |
3049 | #if wxUSE_THREADS |
3050 | wxMutexLocker lock( m_MB2WC_guard ); |
3051 | #endif |
3052 | status = TECConvertText( |
3053 | m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen, |
3054 | (TextPtr) dcubuf, dcubuflen, &byteOutLen); |
3055 | // we have to terminate here, because n might be larger for the trailing zero, and if UniChar |
3056 | // is not properly terminated we get random characters at the end |
3057 | dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; |
3058 | |
3059 | // now from the decomposed UniChar to properly composed uniChar |
3060 | ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf , |
3061 | kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen , &dcubufread , &dcubufwritten , ubuf ) ; |
3062 | } |
3063 | |
3064 | free( dcubuf ); |
3065 | byteOutLen = dcubufwritten ; |
3066 | ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ; |
3067 | |
3068 | |
3069 | #if SIZEOF_WCHAR_T == 4 |
3070 | wxMBConvUTF16 converter ; |
3071 | res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ; |
3072 | free( ubuf ) ; |
3073 | #else |
3074 | res = byteOutLen / sizeof( UniChar ) ; |
3075 | #endif |
3076 | |
3077 | if ( buf == NULL ) |
3078 | free(tbuf) ; |
3079 | |
3080 | if ( buf && res < n) |
3081 | buf[res] = 0; |
3082 | |
3083 | return res ; |
3084 | } |
3085 | |
3086 | virtual void CreateIfNeeded() const |
3087 | { |
3088 | wxMBConv_mac::CreateIfNeeded() ; |
3089 | if ( m_uni == NULL ) |
3090 | { |
3091 | m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, |
3092 | kUnicodeNoSubset, kTextEncodingDefaultFormat); |
3093 | m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, |
3094 | kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat); |
3095 | m_map.mappingVersion = kUnicodeUseLatestMapping; |
3096 | |
3097 | OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni); |
3098 | wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ; |
3099 | |
3100 | m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, |
3101 | kUnicodeNoSubset, kTextEncodingDefaultFormat); |
3102 | m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault, |
3103 | kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat); |
3104 | m_map.mappingVersion = kUnicodeUseLatestMapping; |
3105 | err = CreateUnicodeToTextInfo(&m_map, &m_uniBack); |
3106 | wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ; |
3107 | } |
3108 | } |
3109 | protected : |
3110 | mutable UnicodeToTextInfo m_uni; |
3111 | mutable UnicodeToTextInfo m_uniBack; |
3112 | mutable UnicodeMapping m_map; |
3113 | }; |
3114 | #endif // defined(__WXMAC__) && defined(TARGET_CARBON) |
3115 | |
3116 | // ============================================================================ |
3117 | // wxEncodingConverter based conversion classes |
3118 | // ============================================================================ |
3119 | |
3120 | #if wxUSE_FONTMAP |
3121 | |
3122 | class wxMBConv_wxwin : public wxMBConv |
3123 | { |
3124 | private: |
3125 | void Init() |
3126 | { |
3127 | m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) && |
3128 | w2m.Init(wxFONTENCODING_UNICODE, m_enc); |
3129 | } |
3130 | |
3131 | public: |
3132 | // temporarily just use wxEncodingConverter stuff, |
3133 | // so that it works while a better implementation is built |
3134 | wxMBConv_wxwin(const wxChar* name) |
3135 | { |
3136 | if (name) |
3137 | m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false); |
3138 | else |
3139 | m_enc = wxFONTENCODING_SYSTEM; |
3140 | |
3141 | Init(); |
3142 | } |
3143 | |
3144 | wxMBConv_wxwin(wxFontEncoding enc) |
3145 | { |
3146 | m_enc = enc; |
3147 | |
3148 | Init(); |
3149 | } |
3150 | |
3151 | size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const |
3152 | { |
3153 | size_t inbuf = strlen(psz); |
3154 | if (buf) |
3155 | { |
3156 | if (!m2w.Convert(psz, buf)) |
3157 | return wxCONV_FAILED; |
3158 | } |
3159 | return inbuf; |
3160 | } |
3161 | |
3162 | size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const |
3163 | { |
3164 | const size_t inbuf = wxWcslen(psz); |
3165 | if (buf) |
3166 | { |
3167 | if (!w2m.Convert(psz, buf)) |
3168 | return wxCONV_FAILED; |
3169 | } |
3170 | |
3171 | return inbuf; |
3172 | } |
3173 | |
3174 | virtual size_t GetMBNulLen() const |
3175 | { |
3176 | switch ( m_enc ) |
3177 | { |
3178 | case wxFONTENCODING_UTF16BE: |
3179 | case wxFONTENCODING_UTF16LE: |
3180 | return 2; |
3181 | |
3182 | case wxFONTENCODING_UTF32BE: |
3183 | case wxFONTENCODING_UTF32LE: |
3184 | return 4; |
3185 | |
3186 | default: |
3187 | return 1; |
3188 | } |
3189 | } |
3190 | |
3191 | virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); } |
3192 | |
3193 | bool IsOk() const { return m_ok; } |
3194 | |
3195 | public: |
3196 | wxFontEncoding m_enc; |
3197 | wxEncodingConverter m2w, w2m; |
3198 | |
3199 | private: |
3200 | // were we initialized successfully? |
3201 | bool m_ok; |
3202 | |
3203 | DECLARE_NO_COPY_CLASS(wxMBConv_wxwin) |
3204 | }; |
3205 | |
3206 | // make the constructors available for unit testing |
3207 | WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name ) |
3208 | { |
3209 | wxMBConv_wxwin* result = new wxMBConv_wxwin( name ); |
3210 | if ( !result->IsOk() ) |
3211 | { |
3212 | delete result; |
3213 | return 0; |
3214 | } |
3215 | |
3216 | return result; |
3217 | } |
3218 | |
3219 | #endif // wxUSE_FONTMAP |
3220 | |
3221 | // ============================================================================ |
3222 | // wxCSConv implementation |
3223 | // ============================================================================ |
3224 | |
3225 | void wxCSConv::Init() |
3226 | { |
3227 | m_name = NULL; |
3228 | m_convReal = NULL; |
3229 | m_deferred = true; |
3230 | } |
3231 | |
3232 | wxCSConv::wxCSConv(const wxChar *charset) |
3233 | { |
3234 | Init(); |
3235 | |
3236 | if ( charset ) |
3237 | { |
3238 | SetName(charset); |
3239 | } |
3240 | |
3241 | #if wxUSE_FONTMAP |
3242 | m_encoding = wxFontMapperBase::GetEncodingFromName(charset); |
3243 | if ( m_encoding == wxFONTENCODING_MAX ) |
3244 | { |
3245 | // set to unknown/invalid value |
3246 | m_encoding = wxFONTENCODING_SYSTEM; |
3247 | } |
3248 | else if ( m_encoding == wxFONTENCODING_DEFAULT ) |
3249 | { |
3250 | // wxFONTENCODING_DEFAULT is same as US-ASCII in this context |
3251 | m_encoding = wxFONTENCODING_ISO8859_1; |
3252 | } |
3253 | #else |
3254 | m_encoding = wxFONTENCODING_SYSTEM; |
3255 | #endif |
3256 | } |
3257 | |
3258 | wxCSConv::wxCSConv(wxFontEncoding encoding) |
3259 | { |
3260 | if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT ) |
3261 | { |
3262 | wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") ); |
3263 | |
3264 | encoding = wxFONTENCODING_SYSTEM; |
3265 | } |
3266 | |
3267 | Init(); |
3268 | |
3269 | m_encoding = encoding; |
3270 | } |
3271 | |
3272 | wxCSConv::~wxCSConv() |
3273 | { |
3274 | Clear(); |
3275 | } |
3276 | |
3277 | wxCSConv::wxCSConv(const wxCSConv& conv) |
3278 | : wxMBConv() |
3279 | { |
3280 | Init(); |
3281 | |
3282 | SetName(conv.m_name); |
3283 | m_encoding = conv.m_encoding; |
3284 | } |
3285 | |
3286 | wxCSConv& wxCSConv::operator=(const wxCSConv& conv) |
3287 | { |
3288 | Clear(); |
3289 | |
3290 | SetName(conv.m_name); |
3291 | m_encoding = conv.m_encoding; |
3292 | |
3293 | return *this; |
3294 | } |
3295 | |
3296 | void wxCSConv::Clear() |
3297 | { |
3298 | free(m_name); |
3299 | delete m_convReal; |
3300 | |
3301 | m_name = NULL; |
3302 | m_convReal = NULL; |
3303 | } |
3304 | |
3305 | void wxCSConv::SetName(const wxChar *charset) |
3306 | { |
3307 | if (charset) |
3308 | { |
3309 | m_name = wxStrdup(charset); |
3310 | m_deferred = true; |
3311 | } |
3312 | } |
3313 | |
3314 | #if wxUSE_FONTMAP |
3315 | |
// cache used by wxCSConv::DoCreate(): maps a font encoding to the charset
// name which could be used to create an iconv converter for it, or to an
// empty string if none of the known names worked
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
3320 | #endif |
3321 | |
// Create the real conversion object for the charset name (m_name) and/or
// the encoding (m_encoding) of this object.
//
// Returns a new heap-allocated converter (CreateConvIfNeeded() stores it in
// m_convReal and Clear() deletes it) or NULL. NULL is returned both for
// ISO8859-1/default encoding, which needs no converter object, and when no
// converter could be created at all; in the latter case an error is logged.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
    // without the font mapper only the charset name can be used with iconv
#if !wxUSE_FONTMAP
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
        wxString name(m_name);
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        // try the name exactly as it was given to us first
        if ( !name.empty() )
        {
            wxMBConv_iconv *conv = new wxMBConv_iconv(name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

#if wxUSE_FONTMAP
            // iconv doesn't know this name: map it to an encoding so that
            // the other names of this encoding can be tried below
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // check if we already know which name works for this encoding
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                // an empty string means that all names failed previously
                //
                // NOTE(review): returning here skips the remaining
                // conversion methods below, although they were tried when
                // the failure was first cached -- confirm this is intended
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (eg for MacRoman) encoding
            // got a 'failure' entry in the cache all the same, although it just has to
            // be created using a different method, so only store failed iconv creation
            // attempts (or perhaps we shouldn't do this at all ?)
            if ( names[0] != NULL )
            {
                // try all names in turn and remember the first working one
                for ( ; *names; ++names )
                {
                    wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
                    if ( conv->IsOk() )
                    {
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
#if wxUSE_FONTMAP
        // prefer the name to the encoding if we have both
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        // without the font mapper no conversion is attempted at all here
        // and none of the methods below is reached
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

#if defined(__WXMAC__)
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
                                        : new wxMBConv_mac(m_encoding);
#else
            wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
#endif
            if ( conv->IsOk() )
                return conv;

            delete conv;
        }
    }
#endif

#if defined(__WXCOCOA__)
    {
        if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
                                          : new wxMBConv_cocoa(m_encoding);
#else
            wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
#endif

            if ( conv->IsOk() )
                return conv;

            delete conv;
        }
    }
#endif
    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    // if we only have the name, find the encoding corresponding to it
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    // use our own converter classes for the UTF encodings
    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
            return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
            return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
            return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
            return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
            return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
            return new wxMBConvUTF32LE;

        default:
            // nothing to do but put here to suppress gcc warnings
            break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        // note that m_encoding and not enc computed above is used here
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // everything failed: report the error to the user

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
#else // !wxUSE_FONTMAP
                         wxString::Format(_("encoding %i"), m_encoding).c_str()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    return NULL;
}
3542 | |
3543 | void wxCSConv::CreateConvIfNeeded() const |
3544 | { |
3545 | if ( m_deferred ) |
3546 | { |
3547 | wxCSConv *self = (wxCSConv *)this; // const_cast |
3548 | |
3549 | // if we don't have neither the name nor the encoding, use the default |
3550 | // encoding for this system |
3551 | if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM ) |
3552 | { |
3553 | #if wxUSE_INTL |
3554 | self->m_encoding = wxLocale::GetSystemEncoding(); |
3555 | #else |
3556 | // fallback to some reasonable default: |
3557 | self->m_encoding = wxFONTENCODING_ISO8859_1; |
3558 | #endif // wxUSE_INTL |
3559 | } |
3560 | |
3561 | self->m_convReal = DoCreate(); |
3562 | self->m_deferred = false; |
3563 | } |
3564 | } |
3565 | |
3566 | bool wxCSConv::IsOk() const |
3567 | { |
3568 | CreateConvIfNeeded(); |
3569 | |
3570 | // special case: no convReal created for wxFONTENCODING_ISO8859_1 |
3571 | if ( m_encoding == wxFONTENCODING_ISO8859_1 ) |
3572 | return true; // always ok as we do it ourselves |
3573 | |
3574 | // m_convReal->IsOk() is called at its own creation, so we know it must |
3575 | // be ok if m_convReal is non-NULL |
3576 | return m_convReal != NULL; |
3577 | } |
3578 | |
3579 | size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen, |
3580 | const char *src, size_t srcLen) const |
3581 | { |
3582 | CreateConvIfNeeded(); |
3583 | |
3584 | if (m_convReal) |
3585 | return m_convReal->ToWChar(dst, dstLen, src, srcLen); |
3586 | |
3587 | // latin-1 (direct) |
3588 | if ( srcLen == wxNO_LEN ) |
3589 | srcLen = strlen(src) + 1; // take trailing NUL too |
3590 | |
3591 | if ( dst ) |
3592 | { |
3593 | if ( dstLen < srcLen ) |
3594 | return wxCONV_FAILED; |
3595 | |
3596 | for ( size_t n = 0; n < srcLen; n++ ) |
3597 | dst[n] = (unsigned char)(src[n]); |
3598 | } |
3599 | |
3600 | return srcLen; |
3601 | } |
3602 | |
3603 | size_t wxCSConv::FromWChar(char *dst, size_t dstLen, |
3604 | const wchar_t *src, size_t srcLen) const |
3605 | { |
3606 | CreateConvIfNeeded(); |
3607 | |
3608 | if (m_convReal) |
3609 | return m_convReal->FromWChar(dst, dstLen, src, srcLen); |
3610 | |
3611 | // latin-1 (direct) |
3612 | if ( srcLen == wxNO_LEN ) |
3613 | srcLen = wxWcslen(src) + 1; |
3614 | |
3615 | if ( dst ) |
3616 | { |
3617 | if ( dstLen < srcLen ) |
3618 | return wxCONV_FAILED; |
3619 | |
3620 | for ( size_t n = 0; n < srcLen; n++ ) |
3621 | { |
3622 | if ( src[n] > 0xFF ) |
3623 | return wxCONV_FAILED; |
3624 | |
3625 | dst[n] = (char)src[n]; |
3626 | } |
3627 | |
3628 | } |
3629 | else // still need to check the input validity |
3630 | { |
3631 | for ( size_t n = 0; n < srcLen; n++ ) |
3632 | { |
3633 | if ( src[ |