Copyright © 2011 Anders Dalvander
Distributed under the Boost Software License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt).
Table of Contents
Recent discussions on the Boost Developers mailing list have focused a lot on Unicode and UTF-encoded strings. Some think that std::string
is sufficient while others think that there is a need for a new replacement string class. Some of these think this new replacement string class should have the same interface as std::string
while others think it should use a new interface with a completely new container as the back-end storage. There have been proposals which try too much and there have been proposals which try too little.
Boost.Text will probably fall in the latter category. Boost.Text is yet another Unicode string class/library with the following set of features and requirements, though none of them are set in stone:
It is designed around the codepoint concept.
It uses (currently forward-) iterators for encoding and decoding.
It has a minimal interface, mostly constructors and iterator access.
Most other functions can (hopefully) be free functions.
It uses basic_string as back-end.
It has fast access to underlying basic_string.
It is (currently) immutable and shares data, and is thus fast to copy.
My hope is that Boost.Text will give birth to some ideas or inspire fellow developers.
The following code demonstrates the syntax for using Boost.Text:
#include <boost/text.hpp>
#include <iostream>
#include <iomanip>

// Builds texts in all three UTF encodings from the same code points and
// reports whether a UTF-32 text compares equal to the UTF-8 text it was
// constructed from.
int main()
{
    const boost::codepoint samples[] = { 0x41u, 0x42u, 0x80u, 0x800u, 0x10000u, 0x10fffdu };

    // UTF-8 text built from a range of code points.
    boost::u8text utf8_text(boost::begin(samples), boost::end(samples));

    // UTF-8 text built from an encoded container; for now every element
    // of the container is treated as one code point.
    boost::u8text utf8_literal("test");

    // Copy of an existing text.
    boost::u8text utf8_copy = utf8_text;

    // UTF-16 text built from the same range of code points.
    boost::u16text utf16_text(boost::begin(samples), boost::end(samples));

    // UTF-16 and UTF-32 texts built from the UTF-8 text; the range is transcoded.
    boost::u16text utf16_from_utf8 = utf8_text;
    boost::u32text utf32_from_utf8 = utf8_text;

    // Texts with different encodings can be compared directly.
    std::cout << (utf32_from_utf8 == utf8_text ? "equal\n" : "not equal\n");
}
The following code demonstrates the syntax for using encodings of Boost.Text directly:
#include <boost/text.hpp>
#include <iostream>
#include <iomanip>

// Runs the same six code points through the UTF-8, UTF-16 and UTF-32
// encode iterators, and the matching pre-encoded arrays through the decode
// iterators, printing every resulting code unit / code point in hexadecimal.
int main()
{
    // Code points used as input for the three encoders.
    const boost::codepoint cps[] = { 0x41u, 0x42u, 0x80u, 0x800u, 0x10000u, 0x10fffdu };

    // UTF-8 code units, one row per code point above.
    // The element type is signed, so each value is cast explicitly: putting a
    // constant such as 0xc2u straight into the braces is a narrowing
    // conversion, which C++11 and later compilers reject.
    const boost::int8_t u8s[] = {
        static_cast<boost::int8_t>(0x41u),
        static_cast<boost::int8_t>(0x42u),
        static_cast<boost::int8_t>(0xc2u), static_cast<boost::int8_t>(0x80u),
        static_cast<boost::int8_t>(0xe0u), static_cast<boost::int8_t>(0xa0u),
        static_cast<boost::int8_t>(0x80u),
        static_cast<boost::int8_t>(0xf0u), static_cast<boost::int8_t>(0x90u),
        static_cast<boost::int8_t>(0x80u), static_cast<boost::int8_t>(0x80u),
        static_cast<boost::int8_t>(0xf4u), static_cast<boost::int8_t>(0x8fu),
        static_cast<boost::int8_t>(0xbfu), static_cast<boost::int8_t>(0xbdu)
    };

    // UTF-16 code units; the last two code points are surrogate pairs.
    const boost::uint16_t u16s[] = {
        0x0041u, 0x0042u, 0x0080u, 0x0800u,
        0xd800u, 0xdc00u,
        0xdbffu, 0xdffdu
    };

    // UTF-32 code units.
    const boost::uint32_t u32s[] = {
        0x00000041u, 0x00000042u, 0x00000080u,
        0x00000800u, 0x00010000u, 0x0010fffdu
    };

    std::cout << std::hex;

    // Each iterator is constructed from (current, end); the end-of-sequence
    // iterator is therefore constructed from (end, end).

    std::cout << "UTF-8 code units\n";
    boost::utf8::encoding::encode_iterator<const boost::codepoint*>
        u8_it(boost::begin(cps), boost::end(cps)),
        u8_it_end(boost::end(cps), boost::end(cps));
    while (u8_it != u8_it_end)
    {
        // Converted to uint8_t and then widened so the unit is streamed as an
        // unsigned number (presumably the code unit is a character type that
        // would otherwise print as a character -- confirm against the library).
        std::cout << "0x" << std::setfill('0') << std::setw(2)
                  << static_cast<boost::uint32_t>(static_cast<boost::uint8_t>(*u8_it)) << "\n";
        ++u8_it;
    }
    std::cout << "\n";

    std::cout << "UTF-8 code points\n";
    boost::utf8::encoding::decode_iterator<const boost::int8_t*>
        cp_u8_it(boost::begin(u8s), boost::end(u8s)),
        cp_u8_it_end(boost::end(u8s), boost::end(u8s));
    while (cp_u8_it != cp_u8_it_end)
    {
        std::cout << "U+" << std::setfill('0') << std::setw(4) << *cp_u8_it << "\n";
        ++cp_u8_it;
    }
    std::cout << "\n";

    std::cout << "UTF-16 code units\n";
    boost::utf16::encoding::encode_iterator<const boost::codepoint*>
        u16_it(boost::begin(cps), boost::end(cps)),
        u16_it_end(boost::end(cps), boost::end(cps));
    while (u16_it != u16_it_end)
    {
        std::cout << "0x" << std::setfill('0') << std::setw(4)
                  << static_cast<boost::uint32_t>(static_cast<boost::uint16_t>(*u16_it)) << "\n";
        ++u16_it;
    }
    std::cout << "\n";

    std::cout << "UTF-16 code points\n";
    boost::utf16::encoding::decode_iterator<const boost::uint16_t*>
        cp_u16_it(boost::begin(u16s), boost::end(u16s)),
        cp_u16_it_end(boost::end(u16s), boost::end(u16s));
    while (cp_u16_it != cp_u16_it_end)
    {
        std::cout << "U+" << std::setfill('0') << std::setw(4) << *cp_u16_it << "\n";
        ++cp_u16_it;
    }
    std::cout << "\n";

    std::cout << "UTF-32 code units\n";
    boost::utf32::encoding::encode_iterator<const boost::codepoint*>
        u32_it(boost::begin(cps), boost::end(cps)),
        u32_it_end(boost::end(cps), boost::end(cps));
    while (u32_it != u32_it_end)
    {
        std::cout << "0x" << std::setfill('0') << std::setw(8)
                  << static_cast<boost::uint32_t>(*u32_it) << "\n";
        ++u32_it;
    }
    std::cout << "\n";

    std::cout << "UTF-32 code points\n";
    boost::utf32::encoding::decode_iterator<const boost::uint32_t*>
        cp_u32_it(boost::begin(u32s), boost::end(u32s)),
        cp_u32_it_end(boost::end(u32s), boost::end(u32s));
    while (cp_u32_it != cp_u32_it_end)
    {
        std::cout << "U+" << std::setfill('0') << std::setw(4) << *cp_u32_it << "\n";
        ++cp_u32_it;
    }
    std::cout << "\n";
}