I found another option using the
codecvt
facet. This
code is a "bit" more complex but
will also work with
MBCSs such as
codepage 932 (Japanese). I have
tested it with some Central European
and Japanese characters on VS2008:
Collapse |
Copy Code
#include <cstddef>
#include <cwchar>
#include <locale>
#include <sstream>
#include <stdexcept>
#include <string>
/**
 * Converts text between the narrow (char) encoding of a std::locale and
 * wide characters (wchar_t) by driving that locale's
 * std::codecvt<wchar_t, char, std::mbstate_t> facet.
 *
 * @tparam buf_size Number of output characters converted per facet call.
 *                  Must be greater than zero and large enough to hold the
 *                  longest single converted character, otherwise the
 *                  conversion cannot make progress and an exception is thrown.
 *                  NOTE: VC++ 2008 has been reported to write past the end of
 *                  the buffer for odd sizes with double-byte code pages;
 *                  prefer an even value on that toolchain.
 */
template<std::size_t buf_size = 100>
class cp_converter {
    const std::locale loc;
public:
    /** Stores a copy of the locale whose codecvt facet performs the conversions. */
    cp_converter(const std::locale& loc) :
        loc(loc)
    {
    }

    /**
     * Converts a narrow string in the locale's encoding to a wide string.
     * @throws std::runtime_error if the facet reports an error or cannot
     *         make progress (e.g. truncated multibyte input).
     */
    std::wstring widen(const std::string& in) const {
        return convert<char, wchar_t>(in);
    }

    /**
     * Converts a wide string to a narrow string in the locale's encoding.
     * @throws std::runtime_error if the facet reports an error or cannot
     *         make progress (e.g. buf_size too small for one character).
     */
    std::string narrow(const std::wstring& in) const {
        return convert<wchar_t, char>(in);
    }

private:
    typedef std::codecvt<wchar_t, char, std::mbstate_t> codecvt_facet;

    /** Narrow -> wide direction: forwards to codecvt::in. */
    inline codecvt_facet::result cv(
        const codecvt_facet& facet,
        std::mbstate_t& s,
        const char* f1, const char* l1, const char*& n1,
        wchar_t* f2, wchar_t* l2, wchar_t*& n2) const
    {
        return facet.in(s, f1, l1, n1, f2, l2, n2);
    }

    /** Wide -> narrow direction: forwards to codecvt::out. */
    inline codecvt_facet::result cv(
        const codecvt_facet& facet,
        std::mbstate_t& s,
        const wchar_t* f1, const wchar_t* l1, const wchar_t*& n1,
        char* f2, char* l2, char*& n2) const
    {
        return facet.out(s, f1, l1, n1, f2, l2, n2);
    }

    /**
     * Converts `in` chunk by chunk through a fixed-size buffer; the matching
     * cv() overload is selected from the ct_in / ct_out pair.
     *
     * @throws std::runtime_error on a facet error, on a call that neither
     *         consumes input nor produces output, or if the final facet
     *         result is not `ok`.
     */
    template<class ct_in, class ct_out>
    std::basic_string<ct_out> convert(const std::basic_string<ct_in>& in) const
    {
        const codecvt_facet& facet = std::use_facet<codecvt_facet>(loc);
        std::basic_string<ct_out> out;
        ct_out buf[buf_size];
        std::mbstate_t state = std::mbstate_t();
        codecvt_facet::result result = codecvt_facet::ok;

        // data() is valid for empty strings, unlike &in[0] in C++03.
        const ct_in* const end = in.data() + in.size();
        const ct_in* next = in.data();

        while (next != end) {
            // Keep the chunk start separate from the facet's "next" out-parameter
            // so the two never alias and progress can be measured afterwards.
            const ct_in* const chunk_begin = next;
            ct_out* out_next = buf;

            result = cv(facet, state,
                        chunk_begin, end, next,
                        buf, buf + buf_size, out_next);

            if (result == codecvt_facet::error)
                throw std::runtime_error("result is not ok!");

            // A call that neither consumed input nor produced output would
            // repeat forever (truncated multibyte sequence, buffer too small
            // for one character, or an unexpected noconv).
            if (next == chunk_begin && out_next == buf)
                throw std::runtime_error(
                    "conversion made no progress (incomplete input or buf_size too small)");

            out.append(buf, static_cast<std::size_t>(out_next - buf));
        }

        if (result != codecvt_facet::ok)
            throw std::runtime_error("result is not ok!");
        return out;
    }
};
In order to use the class template,
create an object from it with the
locale you want to use. The
widen(...)
member will then
convert text from the selected
locale/code page to
std::wstring
. The
narrow(...)
will convert wide
characters to a
std::string
in the selected locale/code page.
Collapse |
Copy Code
// Usage example. assert needs <cassert>.
// NOTE(review): "Polish" and "English" look like Windows (VC++) locale names —
// confirm they are accepted by std::locale on the target platform.
// Byte 0xF0 read in the Polish locale is expected to widen to U+0111.
cp_converter<> conv_polish(std::locale("Polish"));
assert(conv_polish.widen("\xF0") == L"\x0111");
// U+0111 narrowed in the English locale is expected to yield byte 0x64 ('d').
cp_converter<> conv_english(std::locale("English"));
assert(conv_english.narrow(L"\x0111") == "\x64");
Important: Setting
buf_size
to odd values (e.g.
99) may result in buffer overflows
in
buf
. It seems that
VC++ 2008 disregards the value of
l2
in
facet.out(...)
if a
wchar_t
is expanded to more
than one
char
and this
happens to be at the last byte of
the buffer. As MBCS Windows code
pages are limited (that is just a
guess; correct me if I'm wrong) to 1
or 2 bytes for each character, it
should be safe to set
buf_size
to any even number.
Important II: The C++
standard does not state what the
native encoding should be - neither
for
char
nor for
wchar_t
. It just happens to
be ANSI and UTF-16 in VC++ on
Windows. Other C++ compilers may be
standard-compliant but use other
encodings. Then this tip would still
convert encodings but the result
won't be as expected.