libstdc++
codecvt_specializations.h
Go to the documentation of this file.
1 // Locale support (codecvt) -*- C++ -*-
2 
3 // Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009
4 // Free Software Foundation, Inc.
5 //
6 // This file is part of the GNU ISO C++ Library. This library is free
7 // software; you can redistribute it and/or modify it under the
8 // terms of the GNU General Public License as published by the
9 // Free Software Foundation; either version 3, or (at your option)
10 // any later version.
11 
12 // This library is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 
17 // Under Section 7 of GPL version 3, you are granted additional
18 // permissions described in the GCC Runtime Library Exception, version
19 // 3.1, as published by the Free Software Foundation.
20 
21 // You should have received a copy of the GNU General Public License and
22 // a copy of the GCC Runtime Library Exception along with this program;
23 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
24 // <http://www.gnu.org/licenses/>.
25 
26 //
27 // ISO C++ 14882: 22.2.1.5 Template class codecvt
28 //
29 
30 // Written by Benjamin Kosnik <bkoz@redhat.com>
31 
32 /** @file ext/codecvt_specializations.h
33  * This file is a GNU extension to the Standard C++ Library.
34  */
35 
36 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
37 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
38 
39 #include <bits/c++config.h>
40 #include <locale>
41 #include <iconv.h>
42 
43 _GLIBCXX_BEGIN_NAMESPACE(__gnu_cxx)
44 
/// Extension to use iconv for dealing with character encodings.
// This includes conversions and comparisons between various character
// sets.  This object encapsulates data that may need to be shared between
// char_traits, codecvt and ctype.
//
// An encoding_state owns two iconv conversion descriptors (external to
// internal, and internal to external).  They are opened by init() from
// the stored encoding names and closed by destroy() / the destructor.
class encoding_state
{
public:
  // Types:
  // NB: A conversion descriptor subsumes and enhances the
  // functionality of a simple state type such as mbstate_t.
  typedef iconv_t descriptor_type;

protected:
  // Name of internal character set encoding.
  std::string _M_int_enc;

  // Name of external character set encoding.
  std::string _M_ext_enc;

  // Conversion descriptor between external encoding to internal encoding.
  descriptor_type _M_in_desc;

  // Conversion descriptor between internal encoding to external encoding.
  descriptor_type _M_out_desc;

  // The byte-order marker for the external encoding, if necessary.
  int _M_ext_bom;

  // The byte-order marker for the internal encoding, if necessary.
  int _M_int_bom;

  // Number of external bytes needed to construct one complete
  // character in the internal encoding.
  // NB: -1 indicates variable, or stateful, encodings.
  int _M_bytes;

public:
  // Creates a state with no encodings and no open descriptors;
  // good() is false for such an object.
  explicit
  encoding_state()
  : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
  { }

  // Creates a state for the given internal/external encoding names and
  // opens both descriptors.  A failing iconv_open is reported through
  // std::__throw_runtime_error (see init()).
  explicit
  encoding_state(const char* __int, const char* __ext,
                 int __ibom = 0, int __ebom = 0, int __bytes = 1)
  : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0),
    _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
  { init(); }

  // 21.1.2 traits typedefs
  // p4
  // typedef STATE_T state_type
  // requires: state_type shall meet the requirements of
  // CopyConstructible types (20.1.3)
  // NB: This does not preserve the actual state of the conversion
  // descriptor member, but it does duplicate the encoding
  // information.
  encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
  { construct(__obj); }

  // Need assignment operator as well.
  // NB: Like the copy constructor, this reopens fresh descriptors
  // rather than copying the conversion state of __obj.
  encoding_state&
  operator=(const encoding_state& __obj)
  {
    construct(__obj);
    return *this;
  }

  ~encoding_state()
  { destroy(); }

  // True only when both descriptors are open, i.e. neither is null
  // nor the iconv_open failure value (iconv_t)(-1).
  bool
  good() const throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    bool __test = _M_in_desc && _M_in_desc != __err;
    __test &= _M_out_desc && _M_out_desc != __err;
    return __test;
  }

  int
  character_ratio() const
  { return _M_bytes; }

  const std::string
  internal_encoding() const
  { return _M_int_enc; }

  int
  internal_bom() const
  { return _M_int_bom; }

  const std::string
  external_encoding() const
  { return _M_ext_enc; }

  int
  external_bom() const
  { return _M_ext_bom; }

  const descriptor_type&
  in_descriptor() const
  { return _M_in_desc; }

  const descriptor_type&
  out_descriptor() const
  { return _M_out_desc; }

protected:
  // Opens whichever descriptor is still null, provided both encoding
  // names are non-empty.  Throws via std::__throw_runtime_error when
  // iconv_open fails.
  void
  init()
  {
    const descriptor_type __err = (iconv_t)(-1);
    const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
    if (!_M_in_desc && __have_encodings)
      {
        _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
        if (_M_in_desc == __err)
          std::__throw_runtime_error(__N("encoding_state::_M_init "
                                "creating iconv input descriptor failed"));
      }
    if (!_M_out_desc && __have_encodings)
      {
        _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
        if (_M_out_desc == __err)
          {
            // When called from a constructor the destructor will never
            // run for this object, so release the input descriptor that
            // was opened above before reporting the failure.
            destroy();
            std::__throw_runtime_error(__N("encoding_state::_M_init "
                                "creating iconv output descriptor failed"));
          }
      }
  }

  // Replaces this object's encoding information with that of __obj
  // and opens new descriptors for it.
  void
  construct(const encoding_state& __obj)
  {
    destroy();
    _M_int_enc = __obj._M_int_enc;
    _M_ext_enc = __obj._M_ext_enc;
    _M_ext_bom = __obj._M_ext_bom;
    _M_int_bom = __obj._M_int_bom;
    _M_bytes = __obj._M_bytes;
    init();
  }

  // Closes every descriptor that is actually open and resets it to
  // null.  A descriptor holding the failure value is left untouched.
  void
  destroy() throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    if (_M_in_desc && _M_in_desc != __err)
      {
        iconv_close(_M_in_desc);
        _M_in_desc = 0;
      }
    if (_M_out_desc && _M_out_desc != __err)
      {
        iconv_close(_M_out_desc);
        _M_out_desc = 0;
      }
  }
};
203 
204  /// encoding_char_traits
205  // Custom traits type with encoding_state for the state type, and the
206  // associated fpos<encoding_state> for the position type, all other
207  // bits equivalent to the required char_traits instantiations.
208  template<typename _CharT>
209  struct encoding_char_traits : public std::char_traits<_CharT>
210  {
211  typedef encoding_state state_type;
212  typedef typename std::fpos<state_type> pos_type;
213  };
214 
215 _GLIBCXX_END_NAMESPACE
216 
217 
218 _GLIBCXX_BEGIN_NAMESPACE(std)
219 
220  using __gnu_cxx::encoding_state;
221 
222  /// codecvt<InternT, _ExternT, encoding_state> specialization.
223  // This partial specialization takes advantage of iconv to provide
224  // code conversions between a large number of character encodings.
225  template<typename _InternT, typename _ExternT>
226  class codecvt<_InternT, _ExternT, encoding_state>
227  : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
228  {
229  public:
230  // Types:
231  typedef codecvt_base::result result;
232  typedef _InternT intern_type;
233  typedef _ExternT extern_type;
235  typedef state_type::descriptor_type descriptor_type;
236 
237  // Data Members:
238  static locale::id id;
239 
240  explicit
241  codecvt(size_t __refs = 0)
243  { }
244 
245  explicit
246  codecvt(state_type& __enc, size_t __refs = 0)
248  { }
249 
250  protected:
251  virtual
252  ~codecvt() { }
253 
254  virtual result
255  do_out(state_type& __state, const intern_type* __from,
256  const intern_type* __from_end, const intern_type*& __from_next,
257  extern_type* __to, extern_type* __to_end,
258  extern_type*& __to_next) const;
259 
260  virtual result
261  do_unshift(state_type& __state, extern_type* __to,
262  extern_type* __to_end, extern_type*& __to_next) const;
263 
264  virtual result
265  do_in(state_type& __state, const extern_type* __from,
266  const extern_type* __from_end, const extern_type*& __from_next,
267  intern_type* __to, intern_type* __to_end,
268  intern_type*& __to_next) const;
269 
270  virtual int
271  do_encoding() const throw();
272 
273  virtual bool
274  do_always_noconv() const throw();
275 
276  virtual int
277  do_length(state_type&, const extern_type* __from,
278  const extern_type* __end, size_t __max) const;
279 
280  virtual int
281  do_max_length() const throw();
282  };
283 
284  template<typename _InternT, typename _ExternT>
285  locale::id
287 
// Bridges the two competing prototypes of iconv(): some systems (SUSv2
// and others) declare the second parameter as 'const char**', while
// glibc 2.2 and POSIX 1003.1-2001 declare it as 'char**'.  _Tp is
// deduced from whichever function is passed in, and the input buffer
// pointer is cast to that type, so callers can always hand over a
// plain 'char**'.
template<typename _Tp>
  inline size_t
  __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
                  iconv_t __cd, char** __inbuf, size_t* __inbytes,
                  char** __outbuf, size_t* __outbytes)
  {
    // C-style cast on purpose: it has to work as a no-op for 'char**'
    // and as a qualification change for 'const char**'.
    _Tp __adapted_in = (_Tp)__inbuf;
    return __func(__cd, __adapted_in, __inbytes, __outbuf, __outbytes);
  }
298 
299  template<typename _InternT, typename _ExternT>
300  codecvt_base::result
301  codecvt<_InternT, _ExternT, encoding_state>::
302  do_out(state_type& __state, const intern_type* __from,
303  const intern_type* __from_end, const intern_type*& __from_next,
304  extern_type* __to, extern_type* __to_end,
305  extern_type*& __to_next) const
306  {
307  result __ret = codecvt_base::error;
308  if (__state.good())
309  {
310  const descriptor_type& __desc = __state.out_descriptor();
311  const size_t __fmultiple = sizeof(intern_type);
312  size_t __fbytes = __fmultiple * (__from_end - __from);
313  const size_t __tmultiple = sizeof(extern_type);
314  size_t __tbytes = __tmultiple * (__to_end - __to);
315 
316  // Argument list for iconv specifies a byte sequence. Thus,
317  // all to/from arrays must be brutally casted to char*.
318  char* __cto = reinterpret_cast<char*>(__to);
319  char* __cfrom;
320  size_t __conv;
321 
322  // Some encodings need a byte order marker as the first item
323  // in the byte stream, to designate endian-ness. The default
324  // value for the byte order marker is NULL, so if this is
325  // the case, it's not necessary and we can just go on our
326  // merry way.
327  int __int_bom = __state.internal_bom();
328  if (__int_bom)
329  {
330  size_t __size = __from_end - __from;
331  intern_type* __cfixed = static_cast<intern_type*>
332  (__builtin_alloca(sizeof(intern_type) * (__size + 1)));
333  __cfixed[0] = static_cast<intern_type>(__int_bom);
334  char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
335  __cfrom = reinterpret_cast<char*>(__cfixed);
336  __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
337  &__fbytes, &__cto, &__tbytes);
338  }
339  else
340  {
341  intern_type* __cfixed = const_cast<intern_type*>(__from);
342  __cfrom = reinterpret_cast<char*>(__cfixed);
343  __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes,
344  &__cto, &__tbytes);
345  }
346 
347  if (__conv != size_t(-1))
348  {
349  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
350  __to_next = reinterpret_cast<extern_type*>(__cto);
351  __ret = codecvt_base::ok;
352  }
353  else
354  {
355  if (__fbytes < __fmultiple * (__from_end - __from))
356  {
357  __from_next = reinterpret_cast<const intern_type*>(__cfrom);
358  __to_next = reinterpret_cast<extern_type*>(__cto);
359  __ret = codecvt_base::partial;
360  }
361  else
362  __ret = codecvt_base::error;
363  }
364  }
365  return __ret;
366  }
367 
368  template<typename _InternT, typename _ExternT>
369  codecvt_base::result
371  do_unshift(state_type& __state, extern_type* __to,
372  extern_type* __to_end, extern_type*& __to_next) const
373  {
374  result __ret = codecvt_base::error;
375  if (__state.good())
376  {
377  const descriptor_type& __desc = __state.in_descriptor();
378  const size_t __tmultiple = sizeof(intern_type);
379  size_t __tlen = __tmultiple * (__to_end - __to);
380 
381  // Argument list for iconv specifies a byte sequence. Thus,
382  // all to/from arrays must be brutally casted to char*.
383  char* __cto = reinterpret_cast<char*>(__to);
384  size_t __conv = __iconv_adaptor(iconv,__desc, NULL, NULL,
385  &__cto, &__tlen);
386 
387  if (__conv != size_t(-1))
388  {
389  __to_next = reinterpret_cast<extern_type*>(__cto);
390  if (__tlen == __tmultiple * (__to_end - __to))
391  __ret = codecvt_base::noconv;
392  else if (__tlen == 0)
393  __ret = codecvt_base::ok;
394  else
395  __ret = codecvt_base::partial;
396  }
397  else
398  __ret = codecvt_base::error;
399  }
400  return __ret;
401  }
402 
403  template<typename _InternT, typename _ExternT>
404  codecvt_base::result
405  codecvt<_InternT, _ExternT, encoding_state>::
406  do_in(state_type& __state, const extern_type* __from,
407  const extern_type* __from_end, const extern_type*& __from_next,
408  intern_type* __to, intern_type* __to_end,
409  intern_type*& __to_next) const
410  {
411  result __ret = codecvt_base::error;
412  if (__state.good())
413  {
414  const descriptor_type& __desc = __state.in_descriptor();
415  const size_t __fmultiple = sizeof(extern_type);
416  size_t __flen = __fmultiple * (__from_end - __from);
417  const size_t __tmultiple = sizeof(intern_type);
418  size_t __tlen = __tmultiple * (__to_end - __to);
419 
420  // Argument list for iconv specifies a byte sequence. Thus,
421  // all to/from arrays must be brutally casted to char*.
422  char* __cto = reinterpret_cast<char*>(__to);
423  char* __cfrom;
424  size_t __conv;
425 
426  // Some encodings need a byte order marker as the first item
427  // in the byte stream, to designate endian-ness. The default
428  // value for the byte order marker is NULL, so if this is
429  // the case, it's not necessary and we can just go on our
430  // merry way.
431  int __ext_bom = __state.external_bom();
432  if (__ext_bom)
433  {
434  size_t __size = __from_end - __from;
435  extern_type* __cfixed = static_cast<extern_type*>
436  (__builtin_alloca(sizeof(extern_type) * (__size + 1)));
437  __cfixed[0] = static_cast<extern_type>(__ext_bom);
438  char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
439  __cfrom = reinterpret_cast<char*>(__cfixed);
440  __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
441  &__flen, &__cto, &__tlen);
442  }
443  else
444  {
445  extern_type* __cfixed = const_cast<extern_type*>(__from);
446  __cfrom = reinterpret_cast<char*>(__cfixed);
447  __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
448  &__flen, &__cto, &__tlen);
449  }
450 
451 
452  if (__conv != size_t(-1))
453  {
454  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
455  __to_next = reinterpret_cast<intern_type*>(__cto);
456  __ret = codecvt_base::ok;
457  }
458  else
459  {
460  if (__flen < static_cast<size_t>(__from_end - __from))
461  {
462  __from_next = reinterpret_cast<const extern_type*>(__cfrom);
463  __to_next = reinterpret_cast<intern_type*>(__cto);
464  __ret = codecvt_base::partial;
465  }
466  else
467  __ret = codecvt_base::error;
468  }
469  }
470  return __ret;
471  }
472 
473  template<typename _InternT, typename _ExternT>
474  int
475  codecvt<_InternT, _ExternT, encoding_state>::
476  do_encoding() const throw()
477  {
478  int __ret = 0;
479  if (sizeof(_ExternT) <= sizeof(_InternT))
480  __ret = sizeof(_InternT) / sizeof(_ExternT);
481  return __ret;
482  }
483 
484  template<typename _InternT, typename _ExternT>
485  bool
486  codecvt<_InternT, _ExternT, encoding_state>::
487  do_always_noconv() const throw()
488  { return false; }
489 
490  template<typename _InternT, typename _ExternT>
491  int
492  codecvt<_InternT, _ExternT, encoding_state>::
493  do_length(state_type&, const extern_type* __from,
494  const extern_type* __end, size_t __max) const
495  { return std::min(__max, static_cast<size_t>(__end - __from)); }
496 
497  // _GLIBCXX_RESOLVE_LIB_DEFECTS
498  // 74. Garbled text for codecvt::do_max_length
499  template<typename _InternT, typename _ExternT>
500  int
501  codecvt<_InternT, _ExternT, encoding_state>::
502  do_max_length() const throw()
503  { return 1; }
504 
505 _GLIBCXX_END_NAMESPACE
506 
507 #endif