/[pcsx2_0.9.7]/trunk/3rdparty/wxWidgets/src/common/strconv.cpp
ViewVC logotype

Annotation of /trunk/3rdparty/wxWidgets/src/common/strconv.cpp

Parent Directory Parent Directory | Revision Log Revision Log


Revision 31 - (hide annotations) (download)
Tue Sep 7 03:24:11 2010 UTC (10 years, 4 months ago) by william
File size: 109064 byte(s)
committing r3113 initial commit again...
1 william 31 /////////////////////////////////////////////////////////////////////////////
2     // Name: src/common/strconv.cpp
3     // Purpose: Unicode conversion classes
4     // Author: Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
5     // Ryan Norton, Fredrik Roubert (UTF7)
6     // Modified by:
7     // Created: 29/01/98
8     // RCS-ID: $Id: strconv.cpp 56394 2008-10-17 11:31:22Z VZ $
9     // Copyright: (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
10     // (c) 2000-2003 Vadim Zeitlin
11     // (c) 2004 Ryan Norton, Fredrik Roubert
12     // Licence: wxWindows licence
13     /////////////////////////////////////////////////////////////////////////////
14    
15     // For compilers that support precompilation, includes "wx.h".
16     #include "wx/wxprec.h"
17    
18     #ifndef WX_PRECOMP
19     #ifdef __WXMSW__
20     #include "wx/msw/missing.h"
21     #endif
22     #include "wx/intl.h"
23     #include "wx/log.h"
24     #include "wx/utils.h"
25     #include "wx/hashmap.h"
26     #endif
27    
28     #include "wx/strconv.h"
29    
30     #if wxUSE_WCHAR_T
31    
32     #ifdef __WINDOWS__
33     #include "wx/msw/private.h"
34     #endif
35    
36     #ifndef __WXWINCE__
37     #include <errno.h>
38     #endif
39    
40     #include <ctype.h>
41     #include <string.h>
42     #include <stdlib.h>
43    
44     #if defined(__WIN32__) && !defined(__WXMICROWIN__)
45     #define wxHAVE_WIN32_MB2WC
46     #endif
47    
48     #ifdef __SALFORDC__
49     #include <clib.h>
50     #endif
51    
52     #ifdef HAVE_ICONV
53     #include <iconv.h>
54     #include "wx/thread.h"
55     #endif
56    
57     #include "wx/encconv.h"
58     #include "wx/fontmap.h"
59    
60     #ifdef __WXMAC__
61     #ifndef __DARWIN__
62     #include <ATSUnicode.h>
63     #include <TextCommon.h>
64     #include <TextEncodingConverter.h>
65     #endif
66    
67     // includes Mac headers
68     #include "wx/mac/private.h"
69     #include "wx/thread.h"
70    
71     #endif
72    
73    
74     #define TRACE_STRCONV _T("strconv")
75    
76     // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
77     // be 4 bytes
78     #if SIZEOF_WCHAR_T == 2
79     #define WC_UTF16
80     #endif
81    
82    
83     // ============================================================================
84     // implementation
85     // ============================================================================
86    
// Helper used when looking for string terminators: returns true if at least
// one of the n bytes starting at p is not NUL, i.e. false only if all of them
// are NUL (in particular, false for n == 0).
static bool NotAllNULs(const char *p, size_t n)
{
    for ( const char * const end = p + n; p != end; ++p )
    {
        if ( *p != '\0' )
            return true;
    }

    return false;
}
95    
96     // ----------------------------------------------------------------------------
97     // UTF-16 en/decoding to/from UCS-4 with surrogates handling
98     // ----------------------------------------------------------------------------
99    
100     static size_t encode_utf16(wxUint32 input, wxUint16 *output)
101     {
102     if (input <= 0xffff)
103     {
104     if (output)
105     *output = (wxUint16) input;
106    
107     return 1;
108     }
109     else if (input >= 0x110000)
110     {
111     return wxCONV_FAILED;
112     }
113     else
114     {
115     if (output)
116     {
117     *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
118     *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
119     }
120    
121     return 2;
122     }
123     }
124    
125     static size_t decode_utf16(const wxUint16* input, wxUint32& output)
126     {
127     if ((*input < 0xd800) || (*input > 0xdfff))
128     {
129     output = *input;
130     return 1;
131     }
132     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
133     {
134     output = *input;
135     return wxCONV_FAILED;
136     }
137     else
138     {
139     output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
140     return 2;
141     }
142     }
143    
144     #ifdef WC_UTF16
145     typedef wchar_t wxDecodeSurrogate_t;
146     #else // !WC_UTF16
147     typedef wxUint16 wxDecodeSurrogate_t;
148     #endif // WC_UTF16/!WC_UTF16
149    
150     // returns the next UTF-32 character from the wchar_t buffer and advances the
151     // pointer to the character after this one
152     //
153     // if an invalid character is found, *pSrc is set to NULL, the caller must
154     // check for this
155     static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
156     {
157     wxUint32 out;
158     const size_t
159     n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
160     if ( n == wxCONV_FAILED )
161     *pSrc = NULL;
162     else
163     *pSrc += n;
164    
165     return out;
166     }
167    
168     // ----------------------------------------------------------------------------
169     // wxMBConv
170     // ----------------------------------------------------------------------------
171    
// Default implementation of the multibyte to wide char conversion, written in
// terms of the old MB2WC() function which is called once for each
// NUL-terminated chunk of the input.
//
// dst:    output buffer or NULL to only compute the required size
// dstLen: size of dst in wide characters (not used if dst is NULL)
// src:    the input; assumed to be properly terminated if srcLen is wxNO_LEN
// srcLen: length of the input in bytes or wxNO_LEN
//
// Returns the number of wide characters written to dst (or which would be
// written if it were not NULL), including one L'\0' for each converted chunk,
// or wxCONV_FAILED if the conversion failed or dst is too small.
172     size_t
173     wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
174     const char *src, size_t srcLen) const
175     {
176     // although new conversion classes are supposed to implement this function
177     // directly, the existing ones only implement the old MB2WC() and so, to
178     // avoid having to rewrite all conversion classes at once, we provide a
179     // default (but not efficient) implementation of this one in terms of the
180     // old function by copying the input to ensure that it's NUL-terminated and
181     // then using MB2WC() to convert it
182    
183     // the number of chars [which would be] written to dst [if it were not NULL]
184     size_t dstWritten = 0;
185    
186     // the number of NULs terminating this string
187     size_t nulLen = 0; // not really needed, but just to avoid warnings
188    
189     // if we were not given the input size we just have to assume that the
190     // string is properly terminated as we have no way of knowing how long it
191     // is anyhow, but if we do have the size check whether there are enough
192     // NULs at the end
193     wxCharBuffer bufTmp;
194     const char *srcEnd;
195     if ( srcLen != wxNO_LEN )
196     {
197     // we need to know how to find the end of this string
198     nulLen = GetMBNulLen();
199     if ( nulLen == wxCONV_FAILED )
200     return wxCONV_FAILED;
201    
202     // if there are enough NULs we can avoid the copy
203     if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
204     {
205     // make a copy in order to properly NUL-terminate the string
206     bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
207     char * const p = bufTmp.data();
208     memcpy(p, src, srcLen);
// append the nulLen terminating NUL bytes after the copied input
209     for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
210     *s = '\0';
211    
// from now on work on the terminated copy (owned by bufTmp)
212     src = bufTmp;
213     }
214    
215     srcEnd = src + srcLen;
216     }
217     else // quit after the first loop iteration
218     {
219     srcEnd = NULL;
220     }
221    
222     for ( ;; )
223     {
224     // try to convert the current chunk
225     size_t lenChunk = MB2WC(NULL, src, 0);
226     if ( lenChunk == wxCONV_FAILED )
227     return wxCONV_FAILED;
228    
229     lenChunk++; // for the L'\0' at the end of this chunk
230    
231     dstWritten += lenChunk;
232    
// lenChunk == 1 means that MB2WC() returned 0, i.e. the chunk is empty
233     if ( lenChunk == 1 )
234     {
235     // nothing left in the input string, conversion succeeded
236     break;
237     }
238    
239     if ( dst )
240     {
// dstWritten already includes this chunk, so this checks that it fits
241     if ( dstWritten > dstLen )
242     return wxCONV_FAILED;
243    
244     if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
245     return wxCONV_FAILED;
246    
247     dst += lenChunk;
248     }
249    
250     if ( !srcEnd )
251     {
252     // we convert just one chunk in this case as this is the entire
253     // string anyhow
254     break;
255     }
256    
257     // advance the input pointer past the end of this chunk
258     while ( NotAllNULs(src, nulLen) )
259     {
260     // notice that we must skip over multiple bytes here as we suppose
261     // that if NUL takes 2 or 4 bytes, then all the other characters do
262     // too and so if advanced by a single byte we might erroneously
263     // detect sequences of NUL bytes in the middle of the input
264     src += nulLen;
265     }
266    
267     src += nulLen; // skipping over its terminator as well
268    
269     // note that ">=" (and not just "==") is needed here as the terminator
270     // we skipped just above could be inside or just after the buffer
271     // delimited by srcEnd
272     if ( src >= srcEnd )
273     break;
274     }
275    
276     return dstWritten;
277     }
278    
279     size_t
280     wxMBConv::FromWChar(char *dst, size_t dstLen,
281     const wchar_t *src, size_t srcLen) const
282     {
283     // the number of chars [which would be] written to dst [if it were not NULL]
284     size_t dstWritten = 0;
285    
286     // make a copy of the input string unless it is already properly
287     // NUL-terminated
288     //
289     // if we don't know its length we have no choice but to assume that it is,
290     // indeed, properly terminated
291     wxWCharBuffer bufTmp;
292     if ( srcLen == wxNO_LEN )
293     {
294     srcLen = wxWcslen(src) + 1;
295     }
296     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
297     {
298     // make a copy in order to properly NUL-terminate the string
299     bufTmp = wxWCharBuffer(srcLen);
300     memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
301     src = bufTmp;
302     }
303    
304     const size_t lenNul = GetMBNulLen();
305     for ( const wchar_t * const srcEnd = src + srcLen;
306     src < srcEnd;
307     src += wxWcslen(src) + 1 /* skip L'\0' too */ )
308     {
309     // try to convert the current chunk
310     size_t lenChunk = WC2MB(NULL, src, 0);
311    
312     if ( lenChunk == wxCONV_FAILED )
313     return wxCONV_FAILED;
314    
315     lenChunk += lenNul;
316     dstWritten += lenChunk;
317    
318     if ( dst )
319     {
320     if ( dstWritten > dstLen )
321     return wxCONV_FAILED;
322    
323     if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
324     return wxCONV_FAILED;
325    
326     dst += lenChunk;
327     }
328     }
329    
330     return dstWritten;
331     }
332    
333     size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
334     {
335     size_t rc = ToWChar(outBuff, outLen, inBuff);
336     if ( rc != wxCONV_FAILED )
337     {
338     // ToWChar() returns the buffer length, i.e. including the trailing
339     // NUL, while this method doesn't take it into account
340     rc--;
341     }
342    
343     return rc;
344     }
345    
346     size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
347     {
348     size_t rc = FromWChar(outBuff, outLen, inBuff);
349     if ( rc != wxCONV_FAILED )
350     {
351     rc -= GetMBNulLen();
352     }
353    
354     return rc;
355     }
356    
357     wxMBConv::~wxMBConv()
358     {
359     // nothing to do here (necessary for Darwin linking probably)
360     }
361    
362     const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
363     {
364     if ( psz )
365     {
366     // calculate the length of the buffer needed first
367     const size_t nLen = MB2WC(NULL, psz, 0);
368     if ( nLen != wxCONV_FAILED )
369     {
370     // now do the actual conversion
371     wxWCharBuffer buf(nLen /* +1 added implicitly */);
372    
373     // +1 for the trailing NULL
374     if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
375     return buf;
376     }
377     }
378    
379     return wxWCharBuffer();
380     }
381    
382     const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
383     {
384     if ( pwz )
385     {
386     const size_t nLen = WC2MB(NULL, pwz, 0);
387     if ( nLen != wxCONV_FAILED )
388     {
389     // extra space for trailing NUL(s)
390     static const size_t extraLen = GetMaxMBNulLen();
391    
392     wxCharBuffer buf(nLen + extraLen - 1);
393     if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
394     return buf;
395     }
396     }
397    
398     return wxCharBuffer();
399     }
400    
401     const wxWCharBuffer
402     wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
403     {
404     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
405     if ( dstLen != wxCONV_FAILED )
406     {
407     wxWCharBuffer wbuf(dstLen - 1);
408     if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
409     {
410     if ( outLen )
411     {
412     *outLen = dstLen;
413     if ( wbuf[dstLen - 1] == L'\0' )
414     (*outLen)--;
415     }
416    
417     return wbuf;
418     }
419     }
420    
421     if ( outLen )
422     *outLen = 0;
423    
424     return wxWCharBuffer();
425     }
426    
427     const wxCharBuffer
428     wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
429     {
430     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
431     if ( dstLen != wxCONV_FAILED )
432     {
433     // special case of empty input: can't allocate 0 size buffer below as
434     // wxCharBuffer insists on NUL-terminating it
435     wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
436     if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
437     {
438     if ( outLen )
439     {
440     *outLen = dstLen;
441    
442     const size_t nulLen = GetMBNulLen();
443     if ( dstLen >= nulLen &&
444     !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
445     {
446     // in this case the output is NUL-terminated and we're not
447     // supposed to count NUL
448     *outLen -= nulLen;
449     }
450     }
451    
452     return buf;
453     }
454     }
455    
456     if ( outLen )
457     *outLen = 0;
458    
459     return wxCharBuffer();
460     }
461    
462     // ----------------------------------------------------------------------------
463     // wxMBConvLibc
464     // ----------------------------------------------------------------------------
465    
466     size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
467     {
468     return wxMB2WC(buf, psz, n);
469     }
470    
471     size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
472     {
473     return wxWC2MB(buf, psz, n);
474     }
475    
476     // ----------------------------------------------------------------------------
477     // wxConvBrokenFileNames
478     // ----------------------------------------------------------------------------
479    
#ifdef __UNIX__

// Creates the real conversion object stored in m_conv: a wxMBConvUTF8 using
// MAP_INVALID_UTF8_TO_PUA if charset is NULL or is "UTF-8" or "UTF8" (compared
// case-insensitively) and a wxCSConv for the given charset otherwise.
wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
{
    const bool useUTF8 = !charset ||
                         wxStricmp(charset, _T("UTF-8")) == 0 ||
                         wxStricmp(charset, _T("UTF8")) == 0;

    if ( useUTF8 )
        m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
    else
        m_conv = new wxCSConv(charset);
}

#endif // __UNIX__
492    
493     // ----------------------------------------------------------------------------
494     // UTF-7
495     // ----------------------------------------------------------------------------
496    
497     // Implementation (C) 2004 Fredrik Roubert
498    
//
// BASE64 decoding table
//
// Indexed by the byte value: contains the 6 bit value of the characters of
// the BASE64 alphabet ('A'-'Z', 'a'-'z', '0'-'9', '+' and '/') and 0xff for
// all the other bytes.
//
static const unsigned char utf7unb64[] =
{
    // 0x00 - 0x0f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x10 - 0x1f
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x20 - 0x2f: '+' and '/'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    // 0x30 - 0x3f: '0' - '9'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x40 - 0x4f: 'A' - 'O'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    // 0x50 - 0x5f: 'P' - 'Z'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x60 - 0x6f: 'a' - 'o'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    // 0x70 - 0x7f: 'p' - 'z'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // 0x80 - 0xff: never valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
537    
// Decodes the NUL-terminated UTF-7 string psz.
//
// buf: output buffer or NULL to only compute the length of the result
// psz: the UTF-7 encoded input
// n:   size of buf in wide characters (not used if buf is NULL)
//
// Returns the number of wide characters produced, not counting the
// terminating NUL (which is only written if there is still room for it), or
// wxCONV_FAILED if a '+' starting a BASE64 sequence is not followed by enough
// BASE64 characters to decode at least one byte.
//
// NOTE(review): the size of the output buffer is only checked in the
// condition of the outer loop, the loop decoding a BASE64 sequence writes to
// buf without comparing len with n.
538     size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
539     {
540     size_t len = 0;
541    
542     while ( *psz && (!buf || (len < n)) )
543     {
544     unsigned char cc = *psz++;
545     if (cc != '+')
546     {
547     // plain ASCII char
// NOTE(review): bytes >= 0x80 also take this branch and are copied as is
548     if (buf)
549     *buf++ = cc;
550     len++;
551     }
552     else if (*psz == '-')
553     {
554     // encoded plus sign
// i.e. "+-" in the input yields a single '+' (cc) in the output
555     if (buf)
556     *buf++ = cc;
557     len++;
558     psz++;
559     }
560     else // start of BASE64 encoded string
561     {
// d accumulates the decoded bits, l is the number of bits in d which
// were not consumed yet, lsb is true if the next byte to extract is the
// low byte of a 16 bit value and ok becomes true as soon as at least
// one byte was extracted
562     bool lsb, ok;
563     unsigned int d, l;
// the loop stops at the first byte not in the BASE64 alphabet (including
// the NUL terminating the input)
564     for ( ok = lsb = false, d = 0, l = 0;
565     (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
566     psz++ )
567     {
568     d <<= 6;
569     d += cc;
// extract all the complete bytes available, toggling lsb after each
570     for (l += 6; l >= 8; lsb = !lsb)
571     {
572     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
573     if (lsb)
574     {
// the low byte completes the character started by the high byte below
575     if (buf)
576     *buf++ |= c;
577     len ++;
578     }
579     else
580     {
// the high byte comes first: store it without advancing buf
581     if (buf)
582     *buf = (wchar_t)(c << 8);
583     }
584    
585     ok = true;
586     }
587     }
588    
589     if ( !ok )
590     {
591     // in valid UTF7 we should have valid characters after '+'
592     return wxCONV_FAILED;
593     }
594    
// skip the (optional) '-' terminating the BASE64 sequence
595     if (*psz == '-')
596     psz++;
597     }
598     }
599    
600     if ( buf && (len < n) )
601     *buf = '\0';
602    
603     return len;
604     }
605    
//
// BASE64 encoding table
//
// Maps a 6 bit value (0..63) to the corresponding character of the BASE64
// alphabet.
//
static const unsigned char utf7enb64[] =
{
    //  0 - 15
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    // 16 - 31
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    // 32 - 47
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    // 48 - 63
    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
620    
//
// UTF-7 encoding table
//
// Gives the class of each of the 128 ASCII characters:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x00 - 0x07
    3, 2, 2, 3, 3, 2, 3, 3,     // 0x08 - 0x0f
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x10 - 0x17
    3, 3, 3, 3, 3, 3, 3, 3,     // 0x18 - 0x1f
    2, 1, 1, 1, 1, 1, 1, 0,     // 0x20 - 0x27
    0, 0, 1, 3, 0, 0, 0, 3,     // 0x28 - 0x2f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x30 - 0x37
    0, 0, 0, 1, 1, 1, 1, 0,     // 0x38 - 0x3f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x40 - 0x47
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x48 - 0x4f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x50 - 0x57
    0, 0, 0, 1, 3, 1, 1, 1,     // 0x58 - 0x5f
    1, 0, 0, 0, 0, 0, 0, 0,     // 0x60 - 0x67
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x68 - 0x6f
    0, 0, 0, 0, 0, 0, 0, 0,     // 0x70 - 0x77
    0, 0, 0, 1, 1, 1, 3, 3      // 0x78 - 0x7f
};
640    
// Encodes the NUL-terminated wide string psz as UTF-7.
//
// buf: output buffer or NULL to only compute the length of the result
// psz: the wide character input
// n:   size of buf in bytes (not used if buf is NULL)
//
// Characters below 0x80 whose class in utf7encode is 0 are copied directly,
// '+' is encoded as "+-" and all the other characters as "+<BASE64>-".
//
// Returns the number of bytes produced, not counting the terminating NUL
// (which is only written if there is still room for it), or wxCONV_FAILED if,
// when WC_UTF16 is not defined, a character greater than 0xffff starts a
// sequence to encode.
//
// NOTE(review): the size of the output buffer is only checked in the
// condition of the outer loop, the BASE64 encoding loop writes to buf without
// comparing len with n.
// NOTE(review): if wchar_t is a signed type, a negative cc passes the
// "cc < 0x80" tests below and is then used as an index into utf7encode --
// confirm whether this can happen on the supported platforms.
641     size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
642     {
643     size_t len = 0;
644    
645     while (*psz && ((!buf) || (len < n)))
646     {
647     wchar_t cc = *psz++;
648     if (cc < 0x80 && utf7encode[cc] < 1)
649     {
650     // plain ASCII char
651     if (buf)
652     *buf++ = (char)cc;
653    
654     len++;
655     }
656     #ifndef WC_UTF16
657     else if (((wxUint32)cc) > 0xffff)
658     {
659     // no surrogate pair generation (yet?)
660     return wxCONV_FAILED;
661     }
662     #endif
663     else
664     {
// all the other characters start with a '+' and end with a '-'
665     if (buf)
666     *buf++ = '+';
667    
668     len++;
669     if (cc != '+')
670     {
671     // BASE64 encode string
// d accumulates the bits to encode and l is the number of bits in d
// which were not output yet
672     unsigned int lsb, d, l;
673     for (d = 0, l = 0; /*nothing*/; psz++)
674     {
// each character contributes 2 bytes: the high one first, then the low
675     for (lsb = 0; lsb < 2; lsb ++)
676     {
677     d <<= 8;
678     d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
679    
// output all the complete groups of 6 bits available
680     for (l += 8; l >= 6; )
681     {
682     l -= 6;
683     if (buf)
684     *buf++ = utf7enb64[(d >> l) % 64];
685     len++;
686     }
687     }
688    
// the sequence continues until the end of the string or the next
// character which can be encoded directly
// NOTE(review): unlike for the first character of the sequence, values
// greater than 0xffff are not rejected here
689     cc = *psz;
690     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
691     break;
692     }
693    
// output the remaining l (2 or 4) bits, padded with zero bits
694     if (l != 0)
695     {
696     if (buf)
697     *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
698    
699     len++;
700     }
701     }
702    
703     if (buf)
704     *buf++ = '-';
705     len++;
706     }
707     }
708    
709     if (buf && (len < n))
710     *buf = 0;
711    
712     return len;
713     }
714    
715     // ----------------------------------------------------------------------------
716     // UTF-8
717     // ----------------------------------------------------------------------------
718    
719     static wxUint32 utf8_max[]=
720     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
721    
722     // boundaries of the private use area we use to (temporarily) remap invalid
723     // characters invalid in a UTF-8 encoded string
724     const wxUint32 wxUnicodePUA = 0x100000;
725     const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
726    
// Decodes the NUL-terminated UTF-8 string psz.
//
// buf: output buffer or NULL to only compute the length of the result
// psz: the UTF-8 encoded input
// n:   size of buf in wide characters (not used if buf is NULL)
//
// The handling of the invalid sequences depends on m_options: with
// MAP_INVALID_UTF8_TO_PUA each invalid byte is mapped to the character
// wxUnicodePUA + byte, with MAP_INVALID_UTF8_TO_OCTAL it is replaced with a
// backslash followed by its 3 digit octal value (and the backslashes of the
// input are doubled) and otherwise the conversion fails.
//
// Returns the number of wide characters produced, not counting the
// terminating NUL (which is only written if there is still room for it), or
// wxCONV_FAILED if an invalid sequence was found and is not remapped.
//
// NOTE(review): if WC_UTF16 is defined, encode_utf16() may store 2 code units
// in buf while the loop conditions only guarantee that there is room for one.
727     size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
728     {
729     size_t len = 0;
730    
731     while (*psz && ((!buf) || (len < n)))
732     {
// opsz remembers the start of the sequence to be able to remap its bytes
// if it turns out to be invalid
733     const char *opsz = psz;
734     bool invalid = false;
735     unsigned char cc = *psz++, fc = cc;
// count the number of leading 1 bits in the first byte
736     unsigned cnt;
737     for (cnt = 0; fc & 0x80; cnt++)
738     fc <<= 1;
739    
740     if (!cnt)
741     {
742     // plain ASCII char
743     if (buf)
744     *buf++ = cc;
745     len++;
746    
747     // escape the escape character for octal escapes
748     if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
749     && cc == '\\' && (!buf || len < n))
750     {
751     if (buf)
752     *buf++ = cc;
753     len++;
754     }
755     }
756     else
757     {
// cnt is now the number of the continuation bytes expected
758     cnt--;
759     if (!cnt)
760     {
761     // invalid UTF-8 sequence
// (a byte of the form 10xxxxxx can't start a sequence)
762     invalid = true;
763     }
764     else
765     {
// ocnt is the index in utf8_max of the greatest value which could have
// been encoded using one byte less than this sequence
766     unsigned ocnt = cnt - 1;
767     wxUint32 res = cc & (0x3f >> cnt);
768     while (cnt--)
769     {
770     cc = *psz;
771     if ((cc & 0xC0) != 0x80)
772     {
773     // invalid UTF-8 sequence
// psz is not advanced here: this byte (which may be the terminating
// NUL) will be examined again by the outer loop
774     invalid = true;
775     break;
776     }
777    
778     psz++;
779     res = (res << 6) | (cc & 0x3f);
780     }
781    
782     if (invalid || res <= utf8_max[ocnt])
783     {
784     // illegal UTF-8 encoding
// (either a wrong continuation byte or an overlong form, i.e. a value
// which should have been encoded using fewer bytes)
785     invalid = true;
786     }
787     else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
788     res >= wxUnicodePUA && res < wxUnicodePUAEnd)
789     {
790     // if one of our PUA characters turns up externally
791     // it must also be treated as an illegal sequence
792     // (a bit like you have to escape an escape character)
793     invalid = true;
794     }
795     else
796     {
797     #ifdef WC_UTF16
798     // cast is ok because wchar_t == wxUint16 if WC_UTF16
799     size_t pa = encode_utf16(res, (wxUint16 *)buf);
800     if (pa == wxCONV_FAILED)
801     {
802     invalid = true;
803     }
804     else
805     {
806     if (buf)
807     buf += pa;
808     len += pa;
809     }
810     #else // !WC_UTF16
811     if (buf)
812     *buf++ = (wchar_t)res;
813     len++;
814     #endif // WC_UTF16/!WC_UTF16
815     }
816     }
817    
818     if (invalid)
819     {
820     if (m_options & MAP_INVALID_UTF8_TO_PUA)
821     {
// map each byte of the invalid sequence to its own PUA character
822     while (opsz < psz && (!buf || len < n))
823     {
824     #ifdef WC_UTF16
825     // cast is ok because wchar_t == wxUint16 if WC_UTF16
826     size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
827     wxASSERT(pa != wxCONV_FAILED);
828     if (buf)
829     buf += pa;
830     opsz++;
831     len += pa;
832     #else
833     if (buf)
834     *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
835     opsz++;
836     len++;
837     #endif
838     }
839     }
840     else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
841     {
// replace each byte of the invalid sequence with "\ooo"
842     while (opsz < psz && (!buf || len < n))
843     {
// the escape is only written if all its 4 characters fit in the buffer
// but len is increased by 4 in any case
844     if ( buf && len + 3 < n )
845     {
846     unsigned char on = *opsz;
847     *buf++ = L'\\';
848     *buf++ = (wchar_t)( L'0' + on / 0100 );
849     *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
850     *buf++ = (wchar_t)( L'0' + on % 010 );
851     }
852    
853     opsz++;
854     len += 4;
855     }
856     }
857     else // MAP_INVALID_UTF8_NOT
858     {
859     return wxCONV_FAILED;
860     }
861     }
862     }
863     }
864    
865     if (buf && (len < n))
866     *buf = 0;
867    
868     return len;
869     }
870    
// returns true if wch is an octal digit, i.e. one of L'0' to L'7'
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
875    
// Encodes the NUL-terminated wide string psz as UTF-8.
//
// buf: output buffer or NULL to only compute the length of the result
// psz: the wide character input
// n:   size of buf in bytes (not used if buf is NULL)
//
// This function undoes the remapping of the invalid bytes done by MB2WC()
// according to m_options: with MAP_INVALID_UTF8_TO_PUA the characters in the
// range [wxUnicodePUA, wxUnicodePUAEnd) are output as single bytes and with
// MAP_INVALID_UTF8_TO_OCTAL a double backslash is output as a single one and
// a backslash followed by 3 octal digits as the byte with this value.
//
// Returns the number of bytes produced, not counting the terminating NUL
// which is only written if there is still room for it.
//
// NOTE(review): the size of the output buffer is only checked in the
// condition of the loop, so a multibyte sequence may be written past the n
// bytes of buf.
876     size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
877     {
878     size_t len = 0;
879    
880     while (*psz && ((!buf) || (len < n)))
881     {
882     wxUint32 cc;
883    
884     #ifdef WC_UTF16
885     // cast is ok for WC_UTF16
// if the surrogate can't be decoded, cc is set to the value of the
// first code unit and only this unit is skipped
886     size_t pa = decode_utf16((const wxUint16 *)psz, cc);
887     psz += (pa == wxCONV_FAILED) ? 1 : pa;
888     #else
889     cc = (*psz++) & 0x7fffffff;
890     #endif
891    
892     if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
893     && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
894     {
// restore the original byte which was mapped to this PUA character
895     if (buf)
896     *buf++ = (char)(cc - wxUnicodePUA);
897     len++;
898     }
899     else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
900     && cc == L'\\' && psz[0] == L'\\' )
901     {
// an escaped backslash: output just one of them
902     if (buf)
903     *buf++ = (char)cc;
904     psz++;
905     len++;
906     }
907     else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
908     cc == L'\\' &&
909     isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
910     {
// an octal escape: output the byte with this value
911     if (buf)
912     {
913     *buf++ = (char) ((psz[0] - L'0') * 0100 +
914     (psz[1] - L'0') * 010 +
915     (psz[2] - L'0'));
916     }
917    
918     psz += 3;
919     len++;
920     }
921     else
922     {
// cnt is the number of the continuation bytes needed to encode cc
923     unsigned cnt;
924     for (cnt = 0; cc > utf8_max[cnt]; cnt++)
925     {
926     }
927    
928     if (!cnt)
929     {
930     // plain ASCII char
931     if (buf)
932     *buf++ = (char) cc;
933     len++;
934     }
935     else
936     {
937     len += cnt + 1;
938     if (buf)
939     {
// the first byte: the length marker bits (-128 >> cnt) followed by the
// most significant bits of cc
940     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
// then the continuation bytes, each containing 6 bits of cc
941     while (cnt--)
942     *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
943     }
944     }
945     }
946     }
947    
948     if (buf && (len < n))
949     *buf = 0;
950    
951     return len;
952     }
953    
954     // ============================================================================
955     // UTF-16
956     // ============================================================================
957    
// wxMBConvUTF16straight is the name under which the converter not changing
// the byte order is implemented below and wxMBConvUTF16swap the name of the
// one swapping the bytes: which of them is the BE and which is the LE one
// depends on whether WORDS_BIGENDIAN is defined
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
965    
966     /* static */
967     size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
968     {
969     if ( srcLen == wxNO_LEN )
970     {
971     // count the number of bytes in input, including the trailing NULs
972     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
973     for ( srcLen = 1; *inBuff++; srcLen++ )
974     ;
975    
976     srcLen *= BYTES_PER_CHAR;
977     }
978     else // we already have the length
979     {
980     // we can only convert an entire number of UTF-16 characters
981     if ( srcLen % BYTES_PER_CHAR )
982     return wxCONV_FAILED;
983     }
984    
985     return srcLen;
986     }
987    
988     // case when in-memory representation is UTF-16 too
989     #ifdef WC_UTF16
990    
991     // ----------------------------------------------------------------------------
992     // conversions without endianness change
993     // ----------------------------------------------------------------------------
994    
995     size_t
996     wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
997     const char *src, size_t srcLen) const
998     {
999     // set up the scene for using memcpy() (which is presumably more efficient
1000     // than copying the bytes one by one)
1001     srcLen = GetLength(src, srcLen);
1002     if ( srcLen == wxNO_LEN )
1003     return wxCONV_FAILED;
1004    
1005     const size_t inLen = srcLen / BYTES_PER_CHAR;
1006     if ( dst )
1007     {
1008     if ( dstLen < inLen )
1009     return wxCONV_FAILED;
1010    
1011     memcpy(dst, src, srcLen);
1012     }
1013    
1014     return inLen;
1015     }
1016    
1017     size_t
1018     wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1019     const wchar_t *src, size_t srcLen) const
1020     {
1021     if ( srcLen == wxNO_LEN )
1022     srcLen = wxWcslen(src) + 1;
1023    
1024     srcLen *= BYTES_PER_CHAR;
1025    
1026     if ( dst )
1027     {
1028     if ( dstLen < srcLen )
1029     return wxCONV_FAILED;
1030    
1031     memcpy(dst, src, srcLen);
1032     }
1033    
1034     return srcLen;
1035     }
1036    
1037     // ----------------------------------------------------------------------------
1038     // endian-reversing conversions
1039     // ----------------------------------------------------------------------------
1040    
1041     size_t
1042     wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1043     const char *src, size_t srcLen) const
1044     {
1045     srcLen = GetLength(src, srcLen);
1046     if ( srcLen == wxNO_LEN )
1047     return wxCONV_FAILED;
1048    
1049     srcLen /= BYTES_PER_CHAR;
1050    
1051     if ( dst )
1052     {
1053     if ( dstLen < srcLen )
1054     return wxCONV_FAILED;
1055    
1056     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1057     for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1058     {
1059     *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1060     }
1061     }
1062    
1063     return srcLen;
1064     }
1065    
1066     size_t
1067     wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1068     const wchar_t *src, size_t srcLen) const
1069     {
1070     if ( srcLen == wxNO_LEN )
1071     srcLen = wxWcslen(src) + 1;
1072    
1073     srcLen *= BYTES_PER_CHAR;
1074    
1075     if ( dst )
1076     {
1077     if ( dstLen < srcLen )
1078     return wxCONV_FAILED;
1079    
1080     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1081     for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1082     {
1083     *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1084     }
1085     }
1086    
1087     return srcLen;
1088     }
1089    
1090     #else // !WC_UTF16: wchar_t is UTF-32
1091    
1092     // ----------------------------------------------------------------------------
1093     // conversions without endianness change
1094     // ----------------------------------------------------------------------------
1095    
1096     size_t
1097     wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1098     const char *src, size_t srcLen) const
1099     {
1100     srcLen = GetLength(src, srcLen);
1101     if ( srcLen == wxNO_LEN )
1102     return wxCONV_FAILED;
1103    
1104     const size_t inLen = srcLen / BYTES_PER_CHAR;
1105     if ( !dst )
1106     {
1107     // optimization: return maximal space which could be needed for this
1108     // string even if the real size could be smaller if the buffer contains
1109     // any surrogates
1110     return inLen;
1111     }
1112    
1113     size_t outLen = 0;
1114     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1115     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1116     {
1117     const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1118     if ( !inBuff )
1119     return wxCONV_FAILED;
1120    
1121     if ( ++outLen > dstLen )
1122     return wxCONV_FAILED;
1123    
1124     *dst++ = ch;
1125     }
1126    
1127    
1128     return outLen;
1129     }
1130    
1131     size_t
1132     wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1133     const wchar_t *src, size_t srcLen) const
1134     {
1135     if ( srcLen == wxNO_LEN )
1136     srcLen = wxWcslen(src) + 1;
1137    
1138     size_t outLen = 0;
1139     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1140     for ( size_t n = 0; n < srcLen; n++ )
1141     {
1142     wxUint16 cc[2];
1143     const size_t numChars = encode_utf16(*src++, cc);
1144     if ( numChars == wxCONV_FAILED )
1145     return wxCONV_FAILED;
1146    
1147     outLen += numChars * BYTES_PER_CHAR;
1148     if ( outBuff )
1149     {
1150     if ( outLen > dstLen )
1151     return wxCONV_FAILED;
1152    
1153     *outBuff++ = cc[0];
1154     if ( numChars == 2 )
1155     {
1156     // second character of a surrogate
1157     *outBuff++ = cc[1];
1158     }
1159     }
1160     }
1161    
1162     return outLen;
1163     }
1164    
1165     // ----------------------------------------------------------------------------
1166     // endian-reversing conversions
1167     // ----------------------------------------------------------------------------
1168    
1169     size_t
1170     wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1171     const char *src, size_t srcLen) const
1172     {
1173     srcLen = GetLength(src, srcLen);
1174     if ( srcLen == wxNO_LEN )
1175     return wxCONV_FAILED;
1176    
1177     const size_t inLen = srcLen / BYTES_PER_CHAR;
1178     if ( !dst )
1179     {
1180     // optimization: return maximal space which could be needed for this
1181     // string even if the real size could be smaller if the buffer contains
1182     // any surrogates
1183     return inLen;
1184     }
1185    
1186     size_t outLen = 0;
1187     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1188     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1189     {
1190     wxUint32 ch;
1191     wxUint16 tmp[2];
1192    
1193     tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1194     inBuff++;
1195     tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1196    
1197     const size_t numChars = decode_utf16(tmp, ch);
1198     if ( numChars == wxCONV_FAILED )
1199     return wxCONV_FAILED;
1200    
1201     if ( numChars == 2 )
1202     inBuff++;
1203    
1204     if ( ++outLen > dstLen )
1205     return wxCONV_FAILED;
1206    
1207     *dst++ = ch;
1208     }
1209    
1210    
1211     return outLen;
1212     }
1213    
1214     size_t
1215     wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1216     const wchar_t *src, size_t srcLen) const
1217     {
1218     if ( srcLen == wxNO_LEN )
1219     srcLen = wxWcslen(src) + 1;
1220    
1221     size_t outLen = 0;
1222     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1223     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1224     {
1225     wxUint16 cc[2];
1226     const size_t numChars = encode_utf16(*src, cc);
1227     if ( numChars == wxCONV_FAILED )
1228     return wxCONV_FAILED;
1229    
1230     outLen += numChars * BYTES_PER_CHAR;
1231     if ( outBuff )
1232     {
1233     if ( outLen > dstLen )
1234     return wxCONV_FAILED;
1235    
1236     *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1237     if ( numChars == 2 )
1238     {
1239     // second character of a surrogate
1240     *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1241     }
1242     }
1243     }
1244    
1245     return outLen;
1246     }
1247    
1248     #endif // WC_UTF16/!WC_UTF16
1249    
1250    
1251     // ============================================================================
1252     // UTF-32
1253     // ============================================================================
1254    
1255     #ifdef WORDS_BIGENDIAN
1256     #define wxMBConvUTF32straight wxMBConvUTF32BE
1257     #define wxMBConvUTF32swap wxMBConvUTF32LE
1258     #else
1259     #define wxMBConvUTF32swap wxMBConvUTF32BE
1260     #define wxMBConvUTF32straight wxMBConvUTF32LE
1261     #endif
1262    
1263    
1264     WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1265     WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1266    
1267     /* static */
1268     size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1269     {
1270     if ( srcLen == wxNO_LEN )
1271     {
1272     // count the number of bytes in input, including the trailing NULs
1273     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1274     for ( srcLen = 1; *inBuff++; srcLen++ )
1275     ;
1276    
1277     srcLen *= BYTES_PER_CHAR;
1278     }
1279     else // we already have the length
1280     {
1281     // we can only convert an entire number of UTF-32 characters
1282     if ( srcLen % BYTES_PER_CHAR )
1283     return wxCONV_FAILED;
1284     }
1285    
1286     return srcLen;
1287     }
1288    
1289     // case when in-memory representation is UTF-16
1290     #ifdef WC_UTF16
1291    
1292     // ----------------------------------------------------------------------------
1293     // conversions without endianness change
1294     // ----------------------------------------------------------------------------
1295    
1296     size_t
1297     wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1298     const char *src, size_t srcLen) const
1299     {
1300     srcLen = GetLength(src, srcLen);
1301     if ( srcLen == wxNO_LEN )
1302     return wxCONV_FAILED;
1303    
1304     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1305     const size_t inLen = srcLen / BYTES_PER_CHAR;
1306     size_t outLen = 0;
1307     for ( size_t n = 0; n < inLen; n++ )
1308     {
1309     wxUint16 cc[2];
1310     const size_t numChars = encode_utf16(*inBuff++, cc);
1311     if ( numChars == wxCONV_FAILED )
1312     return wxCONV_FAILED;
1313    
1314     outLen += numChars;
1315     if ( dst )
1316     {
1317     if ( outLen > dstLen )
1318     return wxCONV_FAILED;
1319    
1320     *dst++ = cc[0];
1321     if ( numChars == 2 )
1322     {
1323     // second character of a surrogate
1324     *dst++ = cc[1];
1325     }
1326     }
1327     }
1328    
1329     return outLen;
1330     }
1331    
1332     size_t
1333     wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1334     const wchar_t *src, size_t srcLen) const
1335     {
1336     if ( srcLen == wxNO_LEN )
1337     srcLen = wxWcslen(src) + 1;
1338    
1339     if ( !dst )
1340     {
1341     // optimization: return maximal space which could be needed for this
1342     // string instead of the exact amount which could be less if there are
1343     // any surrogates in the input
1344     //
1345     // we consider that surrogates are rare enough to make it worthwhile to
1346     // avoid running the loop below at the cost of slightly extra memory
1347     // consumption
1348     return srcLen * BYTES_PER_CHAR;
1349     }
1350    
1351     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1352     size_t outLen = 0;
1353     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1354     {
1355     const wxUint32 ch = wxDecodeSurrogate(&src);
1356     if ( !src )
1357     return wxCONV_FAILED;
1358    
1359     outLen += BYTES_PER_CHAR;
1360    
1361     if ( outLen > dstLen )
1362     return wxCONV_FAILED;
1363    
1364     *outBuff++ = ch;
1365     }
1366    
1367     return outLen;
1368     }
1369    
1370     // ----------------------------------------------------------------------------
1371     // endian-reversing conversions
1372     // ----------------------------------------------------------------------------
1373    
1374     size_t
1375     wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1376     const char *src, size_t srcLen) const
1377     {
1378     srcLen = GetLength(src, srcLen);
1379     if ( srcLen == wxNO_LEN )
1380     return wxCONV_FAILED;
1381    
1382     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1383     const size_t inLen = srcLen / BYTES_PER_CHAR;
1384     size_t outLen = 0;
1385     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1386     {
1387     wxUint16 cc[2];
1388     const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1389     if ( numChars == wxCONV_FAILED )
1390     return wxCONV_FAILED;
1391    
1392     outLen += numChars;
1393     if ( dst )
1394     {
1395     if ( outLen > dstLen )
1396     return wxCONV_FAILED;
1397    
1398     *dst++ = cc[0];
1399     if ( numChars == 2 )
1400     {
1401     // second character of a surrogate
1402     *dst++ = cc[1];
1403     }
1404     }
1405     }
1406    
1407     return outLen;
1408     }
1409    
1410     size_t
1411     wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1412     const wchar_t *src, size_t srcLen) const
1413     {
1414     if ( srcLen == wxNO_LEN )
1415     srcLen = wxWcslen(src) + 1;
1416    
1417     if ( !dst )
1418     {
1419     // optimization: return maximal space which could be needed for this
1420     // string instead of the exact amount which could be less if there are
1421     // any surrogates in the input
1422     //
1423     // we consider that surrogates are rare enough to make it worthwhile to
1424     // avoid running the loop below at the cost of slightly extra memory
1425     // consumption
1426     return srcLen*BYTES_PER_CHAR;
1427     }
1428    
1429     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1430     size_t outLen = 0;
1431     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1432     {
1433     const wxUint32 ch = wxDecodeSurrogate(&src);
1434     if ( !src )
1435     return wxCONV_FAILED;
1436    
1437     outLen += BYTES_PER_CHAR;
1438    
1439     if ( outLen > dstLen )
1440     return wxCONV_FAILED;
1441    
1442     *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1443     }
1444    
1445     return outLen;
1446     }
1447    
1448     #else // !WC_UTF16: wchar_t is UTF-32
1449    
1450     // ----------------------------------------------------------------------------
1451     // conversions without endianness change
1452     // ----------------------------------------------------------------------------
1453    
1454     size_t
1455     wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1456     const char *src, size_t srcLen) const
1457     {
1458     // use memcpy() as it should be much faster than hand-written loop
1459     srcLen = GetLength(src, srcLen);
1460     if ( srcLen == wxNO_LEN )
1461     return wxCONV_FAILED;
1462    
1463     const size_t inLen = srcLen/BYTES_PER_CHAR;
1464     if ( dst )
1465     {
1466     if ( dstLen < inLen )
1467     return wxCONV_FAILED;
1468    
1469     memcpy(dst, src, srcLen);
1470     }
1471    
1472     return inLen;
1473     }
1474    
1475     size_t
1476     wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1477     const wchar_t *src, size_t srcLen) const
1478     {
1479     if ( srcLen == wxNO_LEN )
1480     srcLen = wxWcslen(src) + 1;
1481    
1482     srcLen *= BYTES_PER_CHAR;
1483    
1484     if ( dst )
1485     {
1486     if ( dstLen < srcLen )
1487     return wxCONV_FAILED;
1488    
1489     memcpy(dst, src, srcLen);
1490     }
1491    
1492     return srcLen;
1493     }
1494    
1495     // ----------------------------------------------------------------------------
1496     // endian-reversing conversions
1497     // ----------------------------------------------------------------------------
1498    
1499     size_t
1500     wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1501     const char *src, size_t srcLen) const
1502     {
1503     srcLen = GetLength(src, srcLen);
1504     if ( srcLen == wxNO_LEN )
1505     return wxCONV_FAILED;
1506    
1507     srcLen /= BYTES_PER_CHAR;
1508    
1509     if ( dst )
1510     {
1511     if ( dstLen < srcLen )
1512     return wxCONV_FAILED;
1513    
1514     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1515     for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1516     {
1517     *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1518     }
1519     }
1520    
1521     return srcLen;
1522     }
1523    
1524     size_t
1525     wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1526     const wchar_t *src, size_t srcLen) const
1527     {
1528     if ( srcLen == wxNO_LEN )
1529     srcLen = wxWcslen(src) + 1;
1530    
1531     srcLen *= BYTES_PER_CHAR;
1532    
1533     if ( dst )
1534     {
1535     if ( dstLen < srcLen )
1536     return wxCONV_FAILED;
1537    
1538     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1539     for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1540     {
1541     *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1542     }
1543     }
1544    
1545     return srcLen;
1546     }
1547    
1548     #endif // WC_UTF16/!WC_UTF16
1549    
1550    
1551     // ============================================================================
1552     // The classes doing conversion using the iconv_xxx() functions
1553     // ============================================================================
1554    
1555     #ifdef HAVE_ICONV
1556    
1557     // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1558     // E2BIG if output buffer is _exactly_ as big as needed. Such case is
1559     // (unless there's yet another bug in glibc) the only case when iconv()
1560     // returns with (size_t)-1 (which means error) and says there are 0 bytes
1561     // left in the input buffer -- when _real_ error occurs,
1562     // bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1563     // iconv() failure.
1564     // [This bug does not appear in glibc 2.2.]
1565     #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1566     #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1567     (errno != E2BIG || bufLeft != 0))
1568     #else
1569     #define ICONV_FAILED(cres, bufLeft) (cres == (size_t)-1)
1570     #endif
1571    
1572     #define ICONV_CHAR_CAST(x) ((ICONV_CONST char **)(x))
1573    
1574     #define ICONV_T_INVALID ((iconv_t)-1)
1575    
1576     #if SIZEOF_WCHAR_T == 4
1577     #define WC_BSWAP wxUINT32_SWAP_ALWAYS
1578     #define WC_ENC wxFONTENCODING_UTF32
1579     #elif SIZEOF_WCHAR_T == 2
1580     #define WC_BSWAP wxUINT16_SWAP_ALWAYS
1581     #define WC_ENC wxFONTENCODING_UTF16
1582     #else // sizeof(wchar_t) != 2 nor 4
1583     // does this ever happen?
1584     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1585     #endif
1586    
1587     // ----------------------------------------------------------------------------
1588     // wxMBConv_iconv: encapsulates an iconv character set
1589     // ----------------------------------------------------------------------------
1590    
1591     class wxMBConv_iconv : public wxMBConv
1592     {
1593     public:
1594     wxMBConv_iconv(const wxChar *name);
1595     virtual ~wxMBConv_iconv();
1596    
1597     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1598     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1599    
1600     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1601     virtual size_t GetMBNulLen() const;
1602    
1603     virtual wxMBConv *Clone() const
1604     {
1605     wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1606     p->m_minMBCharWidth = m_minMBCharWidth;
1607     return p;
1608     }
1609    
1610     bool IsOk() const
1611     { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1612    
1613     protected:
1614     // the iconv handlers used to translate from multibyte
1615     // to wide char and in the other direction
1616     iconv_t m2w,
1617     w2m;
1618    
1619     #if wxUSE_THREADS
1620     // guards access to m2w and w2m objects
1621     wxMutex m_iconvMutex;
1622     #endif
1623    
1624     private:
1625     // the name (for iconv_open()) of a wide char charset -- if none is
1626     // available on this machine, it will remain NULL
1627     static wxString ms_wcCharsetName;
1628    
1629     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1630     // different endian-ness than the native one
1631     static bool ms_wcNeedsSwap;
1632    
1633    
1634     // name of the encoding handled by this conversion
1635     wxString m_name;
1636    
1637     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1638     // initially
1639     size_t m_minMBCharWidth;
1640     };
1641    
1642     // make the constructor available for unit testing
1643     WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1644     {
1645     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1646     if ( !result->IsOk() )
1647     {
1648     delete result;
1649     return 0;
1650     }
1651    
1652     return result;
1653     }
1654    
1655     wxString wxMBConv_iconv::ms_wcCharsetName;
1656     bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1657    
1658     wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1659     : m_name(name)
1660     {
1661     m_minMBCharWidth = 0;
1662    
1663     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1664     // names for the charsets
1665     const wxCharBuffer cname(wxString(name).ToAscii());
1666    
1667     // check for charset that represents wchar_t:
1668     if ( ms_wcCharsetName.empty() )
1669     {
1670     wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1671    
1672     #if wxUSE_FONTMAP
1673     const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1674     #else // !wxUSE_FONTMAP
1675     static const wxChar *names_static[] =
1676     {
1677     #if SIZEOF_WCHAR_T == 4
1678     _T("UCS-4"),
1679     #elif SIZEOF_WCHAR_T = 2
1680     _T("UCS-2"),
1681     #endif
1682     NULL
1683     };
1684     const wxChar **names = names_static;
1685     #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1686    
1687     for ( ; *names && ms_wcCharsetName.empty(); ++names )
1688     {
1689     const wxString nameCS(*names);
1690    
1691     // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1692     wxString nameXE(nameCS);
1693    
1694     #ifdef WORDS_BIGENDIAN
1695     nameXE += _T("BE");
1696     #else // little endian
1697     nameXE += _T("LE");
1698     #endif
1699    
1700     wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1701     nameXE.c_str());
1702    
1703     m2w = iconv_open(nameXE.ToAscii(), cname);
1704     if ( m2w == ICONV_T_INVALID )
1705     {
1706     // try charset w/o bytesex info (e.g. "UCS4")
1707     wxLogTrace(TRACE_STRCONV, _T(" trying charset \"%s\""),
1708     nameCS.c_str());
1709     m2w = iconv_open(nameCS.ToAscii(), cname);
1710    
1711     // and check for bytesex ourselves:
1712     if ( m2w != ICONV_T_INVALID )
1713     {
1714     char buf[2], *bufPtr;
1715     wchar_t wbuf[2], *wbufPtr;
1716     size_t insz, outsz;
1717     size_t res;
1718    
1719     buf[0] = 'A';
1720     buf[1] = 0;
1721     wbuf[0] = 0;
1722     insz = 2;
1723     outsz = SIZEOF_WCHAR_T * 2;
1724     wbufPtr = wbuf;
1725     bufPtr = buf;
1726    
1727     res = iconv(
1728     m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1729     (char**)&wbufPtr, &outsz);
1730    
1731     if (ICONV_FAILED(res, insz))
1732     {
1733     wxLogLastError(wxT("iconv"));
1734     wxLogError(_("Conversion to charset '%s' doesn't work."),
1735     nameCS.c_str());
1736     }
1737     else // ok, can convert to this encoding, remember it
1738     {
1739     ms_wcCharsetName = nameCS;
1740     ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1741     }
1742     }
1743     }
1744     else // use charset not requiring byte swapping
1745     {
1746     ms_wcCharsetName = nameXE;
1747     }
1748     }
1749    
1750     wxLogTrace(TRACE_STRCONV,
1751     wxT("iconv wchar_t charset is \"%s\"%s"),
1752     ms_wcCharsetName.empty() ? _T("<none>")
1753     : ms_wcCharsetName.c_str(),
1754     ms_wcNeedsSwap ? _T(" (needs swap)")
1755     : _T(""));
1756     }
1757     else // we already have ms_wcCharsetName
1758     {
1759     m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1760     }
1761    
1762     if ( ms_wcCharsetName.empty() )
1763     {
1764     w2m = ICONV_T_INVALID;
1765     }
1766     else
1767     {
1768     w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1769     if ( w2m == ICONV_T_INVALID )
1770     {
1771     wxLogTrace(TRACE_STRCONV,
1772     wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1773     ms_wcCharsetName.c_str(), cname.data());
1774     }
1775     }
1776     }
1777    
1778     wxMBConv_iconv::~wxMBConv_iconv()
1779     {
1780     if ( m2w != ICONV_T_INVALID )
1781     iconv_close(m2w);
1782     if ( w2m != ICONV_T_INVALID )
1783     iconv_close(w2m);
1784     }
1785    
1786     size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1787     {
1788     // find the string length: notice that must be done differently for
1789     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1790     size_t inbuf;
1791     const size_t nulLen = GetMBNulLen();
1792     switch ( nulLen )
1793     {
1794     default:
1795     return wxCONV_FAILED;
1796    
1797     case 1:
1798     inbuf = strlen(psz); // arguably more optimized than our version
1799     break;
1800    
1801     case 2:
1802     case 4:
1803     // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1804     // they also have to start at character boundary and not span two
1805     // adjacent characters
1806     const char *p;
1807     for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1808     ;
1809     inbuf = p - psz;
1810     break;
1811     }
1812    
1813     #if wxUSE_THREADS
1814     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1815     // Unfortunately there are a couple of global wxCSConv objects such as
1816     // wxConvLocal that are used all over wx code, so we have to make sure
1817     // the handle is used by at most one thread at the time. Otherwise
1818     // only a few wx classes would be safe to use from non-main threads
1819     // as MB<->WC conversion would fail "randomly".
1820     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1821     #endif // wxUSE_THREADS
1822    
1823     size_t outbuf = n * SIZEOF_WCHAR_T;
1824     size_t res, cres;
1825     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1826     wchar_t *bufPtr = buf;
1827     const char *pszPtr = psz;
1828    
1829     if (buf)
1830     {
1831     // have destination buffer, convert there
1832     cres = iconv(m2w,
1833     ICONV_CHAR_CAST(&pszPtr), &inbuf,
1834     (char**)&bufPtr, &outbuf);
1835     res = n - (outbuf / SIZEOF_WCHAR_T);
1836    
1837     if (ms_wcNeedsSwap)
1838     {
1839     // convert to native endianness
1840     for ( unsigned i = 0; i < res; i++ )
1841     buf[n] = WC_BSWAP(buf[i]);
1842     }
1843    
1844     // NUL-terminate the string if there is any space left
1845     if (res < n)
1846     buf[res] = 0;
1847     }
1848     else
1849     {
1850     // no destination buffer... convert using temp buffer
1851     // to calculate destination buffer requirement
1852     wchar_t tbuf[8];
1853     res = 0;
1854    
1855     do
1856     {
1857     bufPtr = tbuf;
1858     outbuf = 8 * SIZEOF_WCHAR_T;
1859    
1860     cres = iconv(m2w,
1861     ICONV_CHAR_CAST(&pszPtr), &inbuf,
1862     (char**)&bufPtr, &outbuf );
1863    
1864     res += 8 - (outbuf / SIZEOF_WCHAR_T);
1865     }
1866     while ((cres == (size_t)-1) && (errno == E2BIG));
1867     }
1868    
1869     if (ICONV_FAILED(cres, inbuf))
1870     {
1871     //VS: it is ok if iconv fails, hence trace only
1872     wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1873     return wxCONV_FAILED;
1874     }
1875    
1876     return res;
1877     }
1878    
1879     size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1880     {
1881     #if wxUSE_THREADS
1882     // NB: explained in MB2WC
1883     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1884     #endif
1885    
1886     size_t inlen = wxWcslen(psz);
1887     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1888     size_t outbuf = n;
1889     size_t res, cres;
1890    
1891     wchar_t *tmpbuf = 0;
1892    
1893     if (ms_wcNeedsSwap)
1894     {
1895     // need to copy to temp buffer to switch endianness
1896     // (doing WC_BSWAP twice on the original buffer won't help, as it
1897     // could be in read-only memory, or be accessed in some other thread)
1898     tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1899     for ( size_t i = 0; i < inlen; i++ )
1900     tmpbuf[n] = WC_BSWAP(psz[i]);
1901    
1902     tmpbuf[inlen] = L'\0';
1903     psz = tmpbuf;
1904     }
1905    
1906     if (buf)
1907     {
1908     // have destination buffer, convert there
1909     cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1910    
1911     res = n - outbuf;
1912    
1913     // NB: iconv was given only wcslen(psz) characters on input, and so
1914     // it couldn't convert the trailing zero. Let's do it ourselves
1915     // if there's some room left for it in the output buffer.
1916     if (res < n)
1917     buf[0] = 0;
1918     }
1919     else
1920     {
1921     // no destination buffer: convert using temp buffer
1922     // to calculate destination buffer requirement
1923     char tbuf[16];
1924     res = 0;
1925     do
1926     {
1927     buf = tbuf;
1928     outbuf = 16;
1929    
1930     cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1931    
1932     res += 16 - outbuf;
1933     }
1934     while ((cres == (size_t)-1) && (errno == E2BIG));
1935     }
1936    
1937     if (ms_wcNeedsSwap)
1938     {
1939     free(tmpbuf);
1940     }
1941    
1942     if (ICONV_FAILED(cres, inbuf))
1943     {
1944     wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1945     return wxCONV_FAILED;
1946     }
1947    
1948     return res;
1949     }
1950    
1951     size_t wxMBConv_iconv::GetMBNulLen() const
1952     {
1953     if ( m_minMBCharWidth == 0 )
1954     {
1955     wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1956    
1957     #if wxUSE_THREADS
1958     // NB: explained in MB2WC
1959     wxMutexLocker lock(self->m_iconvMutex);
1960     #endif
1961    
1962     const wchar_t *wnul = L"";
1963     char buf[8]; // should be enough for NUL in any encoding
1964     size_t inLen = sizeof(wchar_t),
1965     outLen = WXSIZEOF(buf);
1966     char *inBuff = (char *)wnul;
1967     char *outBuff = buf;
1968     if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1969     {
1970     self->m_minMBCharWidth = (size_t)-1;
1971     }
1972     else // ok
1973     {
1974     self->m_minMBCharWidth = outBuff - buf;
1975     }
1976     }
1977    
1978     return m_minMBCharWidth;
1979     }
1980    
1981     #endif // HAVE_ICONV
1982    
1983    
1984     // ============================================================================
1985     // Win32 conversion classes
1986     // ============================================================================
1987    
1988     #ifdef wxHAVE_WIN32_MB2WC
1989    
1990     // from utils.cpp
1991     #if wxUSE_FONTMAP
1992     extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1993     extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1994     #endif
1995    
1996     class wxMBConv_win32 : public wxMBConv
1997     {
1998     public:
1999     wxMBConv_win32()
2000     {
2001     m_CodePage = CP_ACP;
2002     m_minMBCharWidth = 0;
2003     }
2004    
2005     wxMBConv_win32(const wxMBConv_win32& conv)
2006     : wxMBConv()
2007     {
2008     m_CodePage = conv.m_CodePage;
2009     m_minMBCharWidth = conv.m_minMBCharWidth;
2010     }
2011    
2012     #if wxUSE_FONTMAP
2013     wxMBConv_win32(const wxChar* name)
2014     {
2015     m_CodePage = wxCharsetToCodepage(name);
2016     m_minMBCharWidth = 0;
2017     }
2018    
2019     wxMBConv_win32(wxFontEncoding encoding)
2020     {
2021     m_CodePage = wxEncodingToCodepage(encoding);
2022     m_minMBCharWidth = 0;
2023     }
2024     #endif // wxUSE_FONTMAP
2025    
2026     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2027     {
2028     // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2029     // the behaviour is not compatible with the Unix version (using iconv)
2030     // and break the library itself, e.g. wxTextInputStream::NextChar()
2031     // wouldn't work if reading an incomplete MB char didn't result in an
2032     // error
2033     //
2034     // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2035     // Win XP or newer and it is not supported for UTF-[78] so we always
2036     // use our own conversions in this case. See
2037     // http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2038     // http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2039     if ( m_CodePage == CP_UTF8 )
2040     {
2041     return wxConvUTF8.MB2WC(buf, psz, n);
2042     }
2043    
2044     if ( m_CodePage == CP_UTF7 )
2045     {
2046     return wxConvUTF7.MB2WC(buf, psz, n);
2047     }
2048    
2049     int flags = 0;
2050     if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2051     IsAtLeastWin2kSP4() )
2052     {
2053     flags = MB_ERR_INVALID_CHARS;
2054     }
2055    
2056     const size_t len = ::MultiByteToWideChar
2057     (
2058     m_CodePage, // code page
2059     flags, // flags: fall on error
2060     psz, // input string
2061     -1, // its length (NUL-terminated)
2062     buf, // output string
2063     buf ? n : 0 // size of output buffer
2064     );
2065     if ( !len )
2066     {
2067     // function totally failed
2068     return wxCONV_FAILED;
2069     }
2070    
2071     // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2072     // check if we succeeded, by doing a double trip:
2073     if ( !flags && buf )
2074     {
2075     const size_t mbLen = strlen(psz);
2076     wxCharBuffer mbBuf(mbLen);
2077     if ( ::WideCharToMultiByte
2078     (
2079     m_CodePage,
2080     0,
2081     buf,
2082     -1,
2083     mbBuf.data(),
2084     mbLen + 1, // size in bytes, not length
2085     NULL,
2086     NULL
2087     ) == 0 ||
2088     strcmp(mbBuf, psz) != 0 )
2089     {
2090     // we didn't obtain the same thing we started from, hence
2091     // the conversion was lossy and we consider that it failed
2092     return wxCONV_FAILED;
2093     }
2094     }
2095    
2096     // note that it returns count of written chars for buf != NULL and size
2097     // of the needed buffer for buf == NULL so in either case the length of
2098     // the string (which never includes the terminating NUL) is one less
2099     return len - 1;
2100     }
2101    
2102     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2103     {
2104     /*
2105     we have a problem here: by default, WideCharToMultiByte() may
2106     replace characters unrepresentable in the target code page with bad
2107     quality approximations such as turning "1/2" symbol (U+00BD) into
2108     "1" for the code pages which don't have it and we, obviously, want
2109     to avoid this at any price
2110    
2111     the trouble is that this function does it _silently_, i.e. it won't
2112     even tell us whether it did or not... Win98/2000 and higher provide
2113     WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2114     we have to resort to a round trip, i.e. check that converting back
2115     results in the same string -- this is, of course, expensive but
2116     otherwise we simply can't be sure to not garble the data.
2117     */
2118    
2119     // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2120     // it doesn't work with CJK encodings (which we test for rather roughly
2121     // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2122     // supporting it
2123     BOOL usedDef wxDUMMY_INITIALIZE(false);
2124     BOOL *pUsedDef;
2125     int flags;
2126     if ( CanUseNoBestFit() && m_CodePage < 50000 )
2127     {
2128     // it's our lucky day
2129     flags = WC_NO_BEST_FIT_CHARS;
2130     pUsedDef = &usedDef;
2131     }
2132     else // old system or unsupported encoding
2133     {
2134     flags = 0;
2135     pUsedDef = NULL;
2136     }
2137    
2138     const size_t len = ::WideCharToMultiByte
2139     (
2140     m_CodePage, // code page
2141     flags, // either none or no best fit
2142     pwz, // input string
2143     -1, // it is (wide) NUL-terminated
2144     buf, // output buffer
2145     buf ? n : 0, // and its size
2146     NULL, // default "replacement" char
2147     pUsedDef // [out] was it used?
2148     );
2149    
2150     if ( !len )
2151     {
2152     // function totally failed
2153     return wxCONV_FAILED;
2154     }
2155    
2156     // if we were really converting, check if we succeeded
2157     if ( buf )
2158     {
2159     if ( flags )
2160     {
2161     // check if the conversion failed, i.e. if any replacements
2162     // were done
2163     if ( usedDef )
2164     return wxCONV_FAILED;
2165     }
2166     else // we must resort to double tripping...
2167     {
2168     wxWCharBuffer wcBuf(n);
2169     if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2170     wcscmp(wcBuf, pwz) != 0 )
2171     {
2172     // we didn't obtain the same thing we started from, hence
2173     // the conversion was lossy and we consider that it failed
2174     return wxCONV_FAILED;
2175     }
2176     }
2177     }
2178    
2179     // see the comment above for the reason of "len - 1"
2180     return len - 1;
2181     }
2182    
2183     virtual size_t GetMBNulLen() const
2184     {
2185     if ( m_minMBCharWidth == 0 )
2186     {
2187     int len = ::WideCharToMultiByte
2188     (
2189     m_CodePage, // code page
2190     0, // no flags
2191     L"", // input string
2192     1, // translate just the NUL
2193     NULL, // output buffer
2194     0, // and its size
2195     NULL, // no replacement char
2196     NULL // [out] don't care if it was used
2197     );
2198    
2199     wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2200     switch ( len )
2201     {
2202     default:
2203     wxLogDebug(_T("Unexpected NUL length %d"), len);
2204     self->m_minMBCharWidth = (size_t)-1;
2205     break;
2206    
2207     case 0:
2208     self->m_minMBCharWidth = (size_t)-1;
2209     break;
2210    
2211     case 1:
2212     case 2:
2213     case 4:
2214     self->m_minMBCharWidth = len;
2215     break;
2216     }
2217     }
2218    
2219     return m_minMBCharWidth;
2220     }
2221    
2222     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2223    
2224     bool IsOk() const { return m_CodePage != -1; }
2225    
2226     private:
2227     static bool CanUseNoBestFit()
2228     {
2229     static int s_isWin98Or2k = -1;
2230    
2231     if ( s_isWin98Or2k == -1 )
2232     {
2233     int verMaj, verMin;
2234     switch ( wxGetOsVersion(&verMaj, &verMin) )
2235     {
2236     case wxOS_WINDOWS_9X:
2237     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2238     break;
2239    
2240     case wxOS_WINDOWS_NT:
2241     s_isWin98Or2k = verMaj >= 5;
2242     break;
2243    
2244     default:
2245     // unknown: be conservative by default
2246     s_isWin98Or2k = 0;
2247     break;
2248     }
2249    
2250     wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2251     }
2252    
2253     return s_isWin98Or2k == 1;
2254     }
2255    
2256     static bool IsAtLeastWin2kSP4()
2257     {
2258     #ifdef __WXWINCE__
2259     return false;
2260     #else
2261     static int s_isAtLeastWin2kSP4 = -1;
2262    
2263     if ( s_isAtLeastWin2kSP4 == -1 )
2264     {
2265     OSVERSIONINFOEX ver;
2266    
2267     memset(&ver, 0, sizeof(ver));
2268     ver.dwOSVersionInfoSize = sizeof(ver);
2269     GetVersionEx((OSVERSIONINFO*)&ver);
2270    
2271     s_isAtLeastWin2kSP4 =
2272     ((ver.dwMajorVersion > 5) || // Vista+
2273     (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2274     (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2275     ver.wServicePackMajor >= 4)) // 2000 SP4+
2276     ? 1 : 0;
2277     }
2278    
2279     return s_isAtLeastWin2kSP4 == 1;
2280     #endif
2281     }
2282    
2283    
2284     // the code page we're working with
2285     long m_CodePage;
2286    
2287     // cached result of GetMBNulLen(), set to 0 initially meaning
2288     // "unknown"
2289     size_t m_minMBCharWidth;
2290     };
2291    
2292     #endif // wxHAVE_WIN32_MB2WC
2293    
2294     // ============================================================================
2295     // Cocoa conversion classes
2296     // ============================================================================
2297    
2298     #if defined(__WXCOCOA__)
2299    
2300     // RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2301     // Strangely enough, Core Foundation uses UTF-32 internally
2302     // quite a bit - it's just not public (yet).
2303    
2304     #include <CoreFoundation/CFString.h>
2305     #include <CoreFoundation/CFStringEncodingExt.h>
2306    
2307     CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2308     {
2309     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2310    
2311     switch (encoding)
2312     {
2313     case wxFONTENCODING_DEFAULT :
2314     enc = CFStringGetSystemEncoding();
2315     break ;
2316    
2317     case wxFONTENCODING_ISO8859_1 :
2318     enc = kCFStringEncodingISOLatin1 ;
2319     break ;
2320     case wxFONTENCODING_ISO8859_2 :
2321     enc = kCFStringEncodingISOLatin2;
2322     break ;
2323     case wxFONTENCODING_ISO8859_3 :
2324     enc = kCFStringEncodingISOLatin3 ;
2325     break ;
2326     case wxFONTENCODING_ISO8859_4 :
2327     enc = kCFStringEncodingISOLatin4;
2328     break ;
2329     case wxFONTENCODING_ISO8859_5 :
2330     enc = kCFStringEncodingISOLatinCyrillic;
2331     break ;
2332     case wxFONTENCODING_ISO8859_6 :
2333     enc = kCFStringEncodingISOLatinArabic;
2334     break ;
2335     case wxFONTENCODING_ISO8859_7 :
2336     enc = kCFStringEncodingISOLatinGreek;
2337     break ;
2338     case wxFONTENCODING_ISO8859_8 :
2339     enc = kCFStringEncodingISOLatinHebrew;
2340     break ;
2341     case wxFONTENCODING_ISO8859_9 :
2342     enc = kCFStringEncodingISOLatin5;
2343     break ;
2344     case wxFONTENCODING_ISO8859_10 :
2345     enc = kCFStringEncodingISOLatin6;
2346     break ;
2347     case wxFONTENCODING_ISO8859_11 :
2348     enc = kCFStringEncodingISOLatinThai;
2349     break ;
2350     case wxFONTENCODING_ISO8859_13 :
2351     enc = kCFStringEncodingISOLatin7;
2352     break ;
2353     case wxFONTENCODING_ISO8859_14 :
2354     enc = kCFStringEncodingISOLatin8;
2355     break ;
2356     case wxFONTENCODING_ISO8859_15 :
2357     enc = kCFStringEncodingISOLatin9;
2358     break ;
2359    
2360     case wxFONTENCODING_KOI8 :
2361     enc = kCFStringEncodingKOI8_R;
2362     break ;
2363     case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2364     enc = kCFStringEncodingDOSRussian;
2365     break ;
2366    
2367     // case wxFONTENCODING_BULGARIAN :
2368     // enc = ;
2369     // break ;
2370    
2371     case wxFONTENCODING_CP437 :
2372     enc = kCFStringEncodingDOSLatinUS ;
2373     break ;
2374     case wxFONTENCODING_CP850 :
2375     enc = kCFStringEncodingDOSLatin1;
2376     break ;
2377     case wxFONTENCODING_CP852 :
2378     enc = kCFStringEncodingDOSLatin2;
2379     break ;
2380     case wxFONTENCODING_CP855 :
2381     enc = kCFStringEncodingDOSCyrillic;
2382     break ;
2383     case wxFONTENCODING_CP866 :
2384     enc = kCFStringEncodingDOSRussian ;
2385     break ;
2386     case wxFONTENCODING_CP874 :
2387     enc = kCFStringEncodingDOSThai;
2388     break ;
2389     case wxFONTENCODING_CP932 :
2390     enc = kCFStringEncodingDOSJapanese;
2391     break ;
2392     case wxFONTENCODING_CP936 :
2393     enc = kCFStringEncodingDOSChineseSimplif ;
2394     break ;
2395     case wxFONTENCODING_CP949 :
2396     enc = kCFStringEncodingDOSKorean;
2397     break ;
2398     case wxFONTENCODING_CP950 :
2399     enc = kCFStringEncodingDOSChineseTrad;
2400     break ;
2401     case wxFONTENCODING_CP1250 :
2402     enc = kCFStringEncodingWindowsLatin2;
2403     break ;
2404     case wxFONTENCODING_CP1251 :
2405     enc = kCFStringEncodingWindowsCyrillic ;
2406     break ;
2407     case wxFONTENCODING_CP1252 :
2408     enc = kCFStringEncodingWindowsLatin1 ;
2409     break ;
2410     case wxFONTENCODING_CP1253 :
2411     enc = kCFStringEncodingWindowsGreek;
2412     break ;
2413     case wxFONTENCODING_CP1254 :
2414     enc = kCFStringEncodingWindowsLatin5;
2415     break ;
2416     case wxFONTENCODING_CP1255 :
2417     enc = kCFStringEncodingWindowsHebrew ;
2418     break ;
2419     case wxFONTENCODING_CP1256 :
2420     enc = kCFStringEncodingWindowsArabic ;
2421     break ;
2422     case wxFONTENCODING_CP1257 :
2423     enc = kCFStringEncodingWindowsBalticRim;
2424     break ;
2425     // This only really encodes to UTF7 (if that) evidently
2426     // case wxFONTENCODING_UTF7 :
2427     // enc = kCFStringEncodingNonLossyASCII ;
2428     // break ;
2429     case wxFONTENCODING_UTF8 :
2430     enc = kCFStringEncodingUTF8 ;
2431     break ;
2432     case wxFONTENCODING_EUC_JP :
2433     enc = kCFStringEncodingEUC_JP;
2434     break ;
2435     case wxFONTENCODING_UTF16 :
2436     enc = kCFStringEncodingUnicode ;
2437     break ;
2438     case wxFONTENCODING_MACROMAN :
2439     enc = kCFStringEncodingMacRoman ;
2440     break ;
2441     case wxFONTENCODING_MACJAPANESE :
2442     enc = kCFStringEncodingMacJapanese ;
2443     break ;
2444     case wxFONTENCODING_MACCHINESETRAD :
2445     enc = kCFStringEncodingMacChineseTrad ;
2446     break ;
2447     case wxFONTENCODING_MACKOREAN :
2448     enc = kCFStringEncodingMacKorean ;
2449     break ;
2450     case wxFONTENCODING_MACARABIC :
2451     enc = kCFStringEncodingMacArabic ;
2452     break ;
2453     case wxFONTENCODING_MACHEBREW :
2454     enc = kCFStringEncodingMacHebrew ;
2455     break ;
2456     case wxFONTENCODING_MACGREEK :
2457     enc = kCFStringEncodingMacGreek ;
2458     break ;
2459     case wxFONTENCODING_MACCYRILLIC :
2460     enc = kCFStringEncodingMacCyrillic ;
2461     break ;
2462     case wxFONTENCODING_MACDEVANAGARI :
2463     enc = kCFStringEncodingMacDevanagari ;
2464     break ;
2465     case wxFONTENCODING_MACGURMUKHI :
2466     enc = kCFStringEncodingMacGurmukhi ;
2467     break ;
2468     case wxFONTENCODING_MACGUJARATI :
2469     enc = kCFStringEncodingMacGujarati ;
2470     break ;
2471     case wxFONTENCODING_MACORIYA :
2472     enc = kCFStringEncodingMacOriya ;
2473     break ;
2474     case wxFONTENCODING_MACBENGALI :
2475     enc = kCFStringEncodingMacBengali ;
2476     break ;
2477     case wxFONTENCODING_MACTAMIL :
2478     enc = kCFStringEncodingMacTamil ;
2479     break ;
2480     case wxFONTENCODING_MACTELUGU :
2481     enc = kCFStringEncodingMacTelugu ;
2482     break ;
2483     case wxFONTENCODING_MACKANNADA :
2484     enc = kCFStringEncodingMacKannada ;
2485     break ;
2486     case wxFONTENCODING_MACMALAJALAM :
2487     enc = kCFStringEncodingMacMalayalam ;
2488     break ;
2489     case wxFONTENCODING_MACSINHALESE :
2490     enc = kCFStringEncodingMacSinhalese ;
2491     break ;
2492     case wxFONTENCODING_MACBURMESE :
2493     enc = kCFStringEncodingMacBurmese ;
2494     break ;
2495     case wxFONTENCODING_MACKHMER :
2496     enc = kCFStringEncodingMacKhmer ;
2497     break ;
2498     case wxFONTENCODING_MACTHAI :
2499     enc = kCFStringEncodingMacThai ;
2500     break ;
2501     case wxFONTENCODING_MACLAOTIAN :
2502     enc = kCFStringEncodingMacLaotian ;
2503     break ;
2504     case wxFONTENCODING_MACGEORGIAN :
2505     enc = kCFStringEncodingMacGeorgian ;
2506     break ;
2507     case wxFONTENCODING_MACARMENIAN :
2508     enc = kCFStringEncodingMacArmenian ;
2509     break ;
2510     case wxFONTENCODING_MACCHINESESIMP :
2511     enc = kCFStringEncodingMacChineseSimp ;
2512     break ;
2513     case wxFONTENCODING_MACTIBETAN :
2514     enc = kCFStringEncodingMacTibetan ;
2515     break ;
2516     case wxFONTENCODING_MACMONGOLIAN :
2517     enc = kCFStringEncodingMacMongolian ;
2518     break ;
2519     case wxFONTENCODING_MACETHIOPIC :
2520     enc = kCFStringEncodingMacEthiopic ;
2521     break ;
2522     case wxFONTENCODING_MACCENTRALEUR :
2523     enc = kCFStringEncodingMacCentralEurRoman ;
2524     break ;
2525     case wxFONTENCODING_MACVIATNAMESE :
2526     enc = kCFStringEncodingMacVietnamese ;
2527     break ;
2528     case wxFONTENCODING_MACARABICEXT :
2529     enc = kCFStringEncodingMacExtArabic ;
2530     break ;
2531     case wxFONTENCODING_MACSYMBOL :
2532     enc = kCFStringEncodingMacSymbol ;
2533     break ;
2534     case wxFONTENCODING_MACDINGBATS :
2535     enc = kCFStringEncodingMacDingbats ;
2536     break ;
2537     case wxFONTENCODING_MACTURKISH :
2538     enc = kCFStringEncodingMacTurkish ;
2539     break ;
2540     case wxFONTENCODING_MACCROATIAN :
2541     enc = kCFStringEncodingMacCroatian ;
2542     break ;
2543     case wxFONTENCODING_MACICELANDIC :
2544     enc = kCFStringEncodingMacIcelandic ;
2545     break ;
2546     case wxFONTENCODING_MACROMANIAN :
2547     enc = kCFStringEncodingMacRomanian ;
2548     break ;
2549     case wxFONTENCODING_MACCELTIC :
2550     enc = kCFStringEncodingMacCeltic ;
2551     break ;
2552     case wxFONTENCODING_MACGAELIC :
2553     enc = kCFStringEncodingMacGaelic ;
2554     break ;
2555     // case wxFONTENCODING_MACKEYBOARD :
2556     // enc = kCFStringEncodingMacKeyboardGlyphs ;
2557     // break ;
2558    
2559     default :
2560     // because gcc is picky
2561     break ;
2562     }
2563    
2564     return enc ;
2565     }
2566    
2567     class wxMBConv_cocoa : public wxMBConv
2568     {
2569     public:
2570     wxMBConv_cocoa()
2571     {
2572     Init(CFStringGetSystemEncoding()) ;
2573     }
2574    
2575     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2576     {
2577     m_encoding = conv.m_encoding;
2578     }
2579    
2580     #if wxUSE_FONTMAP
2581     wxMBConv_cocoa(const wxChar* name)
2582     {
2583     Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2584     }
2585     #endif
2586    
2587     wxMBConv_cocoa(wxFontEncoding encoding)
2588     {
2589     Init( wxCFStringEncFromFontEnc(encoding) );
2590     }
2591    
2592     virtual ~wxMBConv_cocoa()
2593     {
2594     }
2595    
2596     void Init( CFStringEncoding encoding)
2597     {
2598     m_encoding = encoding ;
2599     }
2600    
2601     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2602     {
2603     wxASSERT(szUnConv);
2604    
2605     CFStringRef theString = CFStringCreateWithBytes (
2606     NULL, //the allocator
2607     (const UInt8*)szUnConv,
2608     strlen(szUnConv),
2609     m_encoding,
2610     false //no BOM/external representation
2611     );
2612    
2613     wxASSERT(theString);
2614    
2615     size_t nOutLength = CFStringGetLength(theString);
2616    
2617     if (szOut == NULL)
2618     {
2619     CFRelease(theString);
2620     return nOutLength;
2621     }
2622    
2623     CFRange theRange = { 0, nOutSize };
2624    
2625     #if SIZEOF_WCHAR_T == 4
2626     UniChar* szUniCharBuffer = new UniChar[nOutSize];
2627     #endif
2628    
2629     CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2630    
2631     CFRelease(theString);
2632    
2633     szUniCharBuffer[nOutLength] = '\0';
2634    
2635     #if SIZEOF_WCHAR_T == 4
2636     wxMBConvUTF16 converter;
2637     converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2638     delete [] szUniCharBuffer;
2639     #endif
2640    
2641     return nOutLength;
2642     }
2643    
2644     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2645     {
2646     wxASSERT(szUnConv);
2647    
2648     size_t nRealOutSize;
2649     size_t nBufSize = wxWcslen(szUnConv);
2650     UniChar* szUniBuffer = (UniChar*) szUnConv;
2651    
2652     #if SIZEOF_WCHAR_T == 4
2653     wxMBConvUTF16 converter ;
2654     nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2655     szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2656     converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2657     nBufSize /= sizeof(UniChar);
2658     #endif
2659    
2660     CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2661     NULL, //allocator
2662     szUniBuffer,
2663     nBufSize,
2664     kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2665     );
2666    
2667     wxASSERT(theString);
2668    
2669     //Note that CER puts a BOM when converting to unicode
2670     //so we check and use getchars instead in that case
2671     if (m_encoding == kCFStringEncodingUnicode)
2672     {
2673     if (szOut != NULL)
2674     CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2675    
2676     nRealOutSize = CFStringGetLength(theString) + 1;
2677     }
2678     else
2679     {
2680     CFStringGetBytes(
2681     theString,
2682     CFRangeMake(0, CFStringGetLength(theString)),
2683     m_encoding,
2684     0, //what to put in characters that can't be converted -
2685     //0 tells CFString to return NULL if it meets such a character
2686     false, //not an external representation
2687     (UInt8*) szOut,
2688     nOutSize,
2689     (CFIndex*) &nRealOutSize
2690     );
2691     }
2692    
2693     CFRelease(theString);
2694    
2695     #if SIZEOF_WCHAR_T == 4
2696     delete[] szUniBuffer;
2697     #endif
2698    
2699     return nRealOutSize - 1;
2700     }
2701    
2702     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2703    
2704     bool IsOk() const
2705     {
2706     return m_encoding != kCFStringEncodingInvalidId &&
2707     CFStringIsEncodingAvailable(m_encoding);
2708     }
2709    
2710     private:
2711     CFStringEncoding m_encoding ;
2712     };
2713    
2714     #endif // defined(__WXCOCOA__)
2715    
2716     // ============================================================================
2717     // Mac conversion classes
2718     // ============================================================================
2719    
2720     #if defined(__WXMAC__) && defined(TARGET_CARBON)
2721    
2722     class wxMBConv_mac : public wxMBConv
2723     {
2724     public:
2725     wxMBConv_mac()
2726     {
2727     Init(CFStringGetSystemEncoding()) ;
2728     }
2729    
2730     wxMBConv_mac(const wxMBConv_mac& conv)
2731     {
2732     Init(conv.m_char_encoding);
2733     }
2734    
2735     #if wxUSE_FONTMAP
2736     wxMBConv_mac(const wxChar* name)
2737     {
2738     wxFontEncoding enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2739     Init( (enc != wxFONTENCODING_SYSTEM) ? wxMacGetSystemEncFromFontEnc( enc ) : kTextEncodingUnknown);
2740     }
2741     #endif
2742    
2743     wxMBConv_mac(wxFontEncoding encoding)
2744     {
2745     Init( wxMacGetSystemEncFromFontEnc(encoding) );
2746     }
2747    
2748     virtual ~wxMBConv_mac()
2749     {
2750     OSStatus status = noErr ;
2751     if (m_MB2WC_converter)
2752     status = TECDisposeConverter(m_MB2WC_converter);
2753     if (m_WC2MB_converter)
2754     status = TECDisposeConverter(m_WC2MB_converter);
2755     }
2756    
2757     void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2758     TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2759     {
2760     m_MB2WC_converter = NULL ;
2761     m_WC2MB_converter = NULL ;
2762     if ( encoding != kTextEncodingUnknown )
2763     {
2764