diff --git a/CHANGES b/CHANGES index db0b53a6..a4dec018 100644 --- a/CHANGES +++ b/CHANGES @@ -40,6 +40,9 @@ Backend-specific changes: - Add "nocreate" option to avoid creating a new database file (#1021). - Improve column names to SOCI database types mapping (#1120). +- ODBC + - Add support for using wide strings (#1179). + History of the changes in the previous versions: diff --git a/docs/api/backend.md b/docs/api/backend.md index 691b0fc2..521768fd 100644 --- a/docs/api/backend.md +++ b/docs/api/backend.md @@ -1,6 +1,6 @@ # Backends reference -This part of the documentation is provided for those who want towrite (and contribute!) their +This part of the documentation is provided for those who want to write (and contribute!) their own backends. It is anyway recommendedthat authors of new backend see the code of some existing backend forhints on how things are really done. @@ -28,6 +28,7 @@ enum data_type enum db_type { db_string, + db_wstring, db_int8, db_uint8, db_int16, @@ -50,6 +51,7 @@ enum exchange_type { x_char, x_stdstring, + x_stdwstring, x_int8, x_uint8, x_int16, diff --git a/docs/api/client.md b/docs/api/client.md index 085786e4..b3d78483 100644 --- a/docs/api/client.md +++ b/docs/api/client.md @@ -13,7 +13,7 @@ The following types are commonly used in the rest of the interface: ```cpp // data types, as seen by the user -enum db_type { db_string, db_date, db_double, db_int8, db_uint8, db_int16, db_uint16, db_int32, db_uint32, db_int64, db_uint64 }; +enum db_type { db_string, db_wstring, db_date, db_double, db_int8, db_uint8, db_int16, db_uint16, db_int32, db_uint32, db_int64, db_uint64 }; // deprecated data types enum which may be still used but is less precise than db_type enum data_type { dt_string, dt_date, dt_double, dt_integer, dt_long_long, dt_unsigned_long_long }; diff --git a/docs/backends/odbc.md b/docs/backends/odbc.md index 9365c028..97435499 100644 --- a/docs/backends/odbc.md +++ b/docs/backends/odbc.md @@ -75,6 +75,7 @@ For the ODBC 
backend, this type mapping is: | SQL_INTEGER | db_int32 | int32_t | | SQL_BIGINT | db_int64 | int64_t | | SQL_CHAR, SQL_VARCHAR | db_string | std::string | +| SQL_WCHAR, SQL_WVARCHAR, SQL_WLONGVARCHAR | db_wstring | std::wstring | | SQL_TYPE_DATE, SQL_TYPE_TIME, SQL_TYPE_TIMESTAMP | db_date | std::tm | Not all ODBC drivers support all datatypes. diff --git a/include/private/soci-exchange-cast.h b/include/private/soci-exchange-cast.h index 755af0ae..8c18ec3c 100644 --- a/include/private/soci-exchange-cast.h +++ b/include/private/soci-exchange-cast.h @@ -36,6 +36,12 @@ struct exchange_type_traits typedef std::string value_type; }; +template <> +struct exchange_type_traits +{ + typedef std::wstring value_type; +}; + template <> struct exchange_type_traits { diff --git a/include/private/soci-vector-helpers.h b/include/private/soci-vector-helpers.h index d4eff144..1ae0c939 100644 --- a/include/private/soci-vector-helpers.h +++ b/include/private/soci-vector-helpers.h @@ -33,6 +33,8 @@ inline std::size_t get_vector_size(exchange_type e, void *data) return exchange_vector_type_cast(data).size(); case x_stdstring: return exchange_vector_type_cast(data).size(); + case x_stdwstring: + return exchange_vector_type_cast(data).size(); case x_int8: return exchange_vector_type_cast(data).size(); case x_uint8: @@ -76,6 +78,9 @@ inline void resize_vector(exchange_type e, void *data, std::size_t newSize) case x_stdstring: exchange_vector_type_cast(data).resize(newSize); return; + case x_stdwstring: + exchange_vector_type_cast(data).resize(newSize); + return; case x_int8: exchange_vector_type_cast(data).resize(newSize); return; @@ -131,6 +136,7 @@ inline std::string& vector_string_value(exchange_type e, void *data, std::size_t return exchange_vector_type_cast(data).at(ind).value; case x_longstring: return exchange_vector_type_cast(data).at(ind).value; + case x_stdwstring: case x_char: case x_int8: case x_uint8: diff --git a/include/soci/exchange-traits.h 
b/include/soci/exchange-traits.h index 53478a58..9279faf9 100644 --- a/include/soci/exchange-traits.h +++ b/include/soci/exchange-traits.h @@ -155,6 +155,13 @@ struct exchange_traits enum { x_type = x_stdstring }; }; +template <> +struct exchange_traits +{ + typedef basic_type_tag type_family; + enum { x_type = x_stdwstring }; +}; + template <> struct exchange_traits { diff --git a/include/soci/odbc/soci-odbc.h b/include/soci/odbc/soci-odbc.h index 0bedd575..e499711f 100644 --- a/include/soci/odbc/soci-odbc.h +++ b/include/soci/odbc/soci-odbc.h @@ -43,6 +43,16 @@ namespace details { return reinterpret_cast(const_cast(s.c_str())); } + + inline SQLWCHAR* sqlchar_cast(std::wstring const& s) + { + return reinterpret_cast(const_cast(s.c_str())); + } + + inline SQLWCHAR* sqlchar_cast(std::u16string const& s) + { + return reinterpret_cast(const_cast(s.c_str())); + } } // Option allowing to specify the "driver completion" parameter of @@ -192,6 +202,10 @@ private: SQLSMALLINT& sqlType, SQLSMALLINT& cType); + void copy_from_string(const std::wstring& s, + SQLLEN& size, + SQLSMALLINT& sqlType, + SQLSMALLINT& cType); }; struct odbc_vector_use_type_backend : details::vector_use_type_backend, diff --git a/include/soci/ref-counted-statement.h b/include/soci/ref-counted-statement.h index 47068016..ea5854a6 100644 --- a/include/soci/ref-counted-statement.h +++ b/include/soci/ref-counted-statement.h @@ -11,6 +11,7 @@ #include "soci/statement.h" #include "soci/into-type.h" #include "soci/use-type.h" +#include "soci-unicode.h" // std #include diff --git a/include/soci/soci-backend.h b/include/soci/soci-backend.h index bed27aa4..5fe3d229 100644 --- a/include/soci/soci-backend.h +++ b/include/soci/soci-backend.h @@ -26,6 +26,7 @@ namespace soci enum db_type { db_string, + db_wstring, db_int8, db_uint8, db_int16, @@ -61,6 +62,7 @@ enum exchange_type { x_char, x_stdstring, + x_stdwstring, x_int8, x_uint8, x_int16, @@ -271,6 +273,9 @@ public: case db_uint64: return dt_unsigned_long_long; 
case db_blob: return dt_blob; case db_xml: return dt_xml; + + case db_wstring: + throw soci_error("unable to convert value to data_type"); } // unreachable diff --git a/include/soci/soci-unicode.h b/include/soci/soci-unicode.h new file mode 100644 index 00000000..995741fd --- /dev/null +++ b/include/soci/soci-unicode.h @@ -0,0 +1,426 @@ +#ifndef SOCI_UNICODE_H_INCLUDED +#define SOCI_UNICODE_H_INCLUDED + +#include "soci/error.h" + +#include + +#include + +// Define SOCI_WCHAR_T_IS_UTF32 if wchar_t is wider than 16 bits (e.g., on Unix/Linux) +#if WCHAR_MAX > 0xFFFFu +#define SOCI_WCHAR_T_IS_UTF32 +#endif + +namespace soci +{ + +namespace details +{ + +#if defined(SOCI_WCHAR_T_IS_UTF32) +static_assert(sizeof(wchar_t) == sizeof(char32_t), "wchar_t must be 32 bits"); + +inline char32_t const* wide_to_char_type(std::wstring const& ws) +{ + return reinterpret_cast(ws.data()); +} +#else +static_assert(sizeof(wchar_t) == sizeof(char16_t), "wchar_t must be 16 bits"); + +inline char16_t const* wide_to_char_type(std::wstring const& ws) +{ + return reinterpret_cast(ws.data()); +} +#endif + +inline void throw_if_too_small(std::size_t required, std::size_t available) +{ + if (required > available) + throw soci_error("Output buffer is too small"); +} + +/** + Check if the given string is a valid UTF-8 encoded string. + + Throws soci_error if the string is not a valid UTF-8 string. + + @param utf8 The string of length @a len. + @param len The length of the string. + */ +void SOCI_DECL ensure_valid_utf8(char const* utf8, std::size_t len); + +/// @overload +inline void ensure_valid_utf8(std::string const& utf8) +{ + ensure_valid_utf8(utf8.data(), utf8.size()); +} + +/** + Check if a given string is a valid UTF-16 encoded string. + + Throws soci_error if the string is not a valid UTF-16 string. + + @param s The UTF-16 string to check. + @param len The length of the string in characters. 
+ */ +void SOCI_DECL ensure_valid_utf16(char16_t const* s, std::size_t len); + +/// @overload +inline void ensure_valid_utf16(std::u16string const& utf16) +{ + ensure_valid_utf16(utf16.data(), utf16.size()); +} + +/** + Check if a given string is a valid UTF-32 encoded string. + + Throws soci_error if the string is not a valid UTF-32 string. + + @param s The UTF-32 string to check. + @param len The length of the string in characters. + */ +void SOCI_DECL ensure_valid_utf32(char32_t const* s, std::size_t len); + +/// @overload +inline void ensure_valid_utf32(std::u32string const& utf32) +{ + ensure_valid_utf32(utf32.data(), utf32.size()); +} + +/** + Convert a UTF-8 encoded string to a UTF-16 encoded string. + + The input string must be a valid UTF-8 encoded string of the given length + (not necessarily NUL-terminated). The output buffer must either contain + enough space to store @a len16 characters or be @c nullptr to just compute + the length required for conversion (in which case @a len16 is ignored). + + @param utf8 The input UTF-8 encoded string. + @param len8 The length of the input string. + @param out16 The output buffer or nullptr to just compute the required + length. + @param len16 The length of the output buffer if it is non-null. + @return The length of the UTF-16 output. + @throws soci_error if the input string contains invalid UTF-8 encoding or + if the required length is greater than @a len16 when @a out16 is not @c + nullptr. 
+ */ +std::size_t SOCI_DECL +utf8_to_utf16(char const* utf8, std::size_t len8, + char16_t* out16, std::size_t len16); + +/// @overload +inline std::u16string utf8_to_utf16(char const* s, std::size_t len) +{ + auto const len16 = utf8_to_utf16(s, len, nullptr, 0); + std::u16string utf16(len16, u'\0'); + utf8_to_utf16(s, len, const_cast(utf16.data()), len16); + return utf16; +} + +/// @overload +inline std::u16string utf8_to_utf16(std::string const& utf8) +{ + return utf8_to_utf16(utf8.data(), utf8.size()); +} + +/// @overload +inline std::u16string utf8_to_utf16(char const* s) +{ + return utf8_to_utf16(s, std::char_traits::length(s)); +} + +/** + Convert a UTF-16 encoded string to a UTF-8 encoded string. + + Semantics of this function are the same as for utf8_to_utf16(), see its + documentation for more details. + */ +std::size_t SOCI_DECL +utf16_to_utf8(char16_t const* utf16, std::size_t len16, + char* out8, std::size_t len8); + +/// @overload +inline std::string utf16_to_utf8(char16_t const* s, std::size_t len) +{ + auto const len8 = utf16_to_utf8(s, len, nullptr, 0); + std::string utf8(len8, '\0'); + utf16_to_utf8(s, len, const_cast(utf8.data()), len8); + return utf8; +} + +/// @overload +inline std::string utf16_to_utf8(std::u16string const& utf16) +{ + return utf16_to_utf8(utf16.data(), utf16.size()); +} + +/// @overload +inline std::string utf16_to_utf8(char16_t const* s) +{ + return utf16_to_utf8(s, std::char_traits::length(s)); +} + +/** + Convert a UTF-16 encoded string to a UTF-32 encoded string. + + Semantics of this function are the same as for utf8_to_utf16(), see its + documentation for more details. 
+ */ +std::size_t SOCI_DECL +utf16_to_utf32(char16_t const* utf16, std::size_t len16, + char32_t* out32, std::size_t len32); + +/// @overload +inline std::u32string utf16_to_utf32(char16_t const* s, std::size_t len) +{ + auto const len32 = utf16_to_utf32(s, len, nullptr, 0); + std::u32string utf32(len32, U'\0'); + utf16_to_utf32(s, len, const_cast(utf32.data()), len32); + return utf32; +} + +/// @overload +inline std::u32string utf16_to_utf32(std::u16string const& utf16) +{ + return utf16_to_utf32(utf16.data(), utf16.size()); +} + +/// @overload +inline std::u32string utf16_to_utf32(char16_t const* s) +{ + return utf16_to_utf32(s, std::char_traits::length(s)); +} + + +/** + Convert a UTF-32 encoded string to a UTF-16 encoded string. + + Semantics of this function are the same as for utf8_to_utf16(), see its + documentation for more details. + */ +std::size_t SOCI_DECL +utf32_to_utf16(char32_t const* utf32, std::size_t len32, + char16_t* out16, std::size_t len16); + +/// @overload +inline std::u16string utf32_to_utf16(char32_t const* utf32, std::size_t len) +{ + auto const len16 = utf32_to_utf16(utf32, len, nullptr, 0); + std::u16string utf16(len16, u'\0'); + utf32_to_utf16(utf32, len, const_cast(utf16.data()), len16); + return utf16; +} + +/// @overload +inline std::u16string utf32_to_utf16(std::u32string const& utf32) +{ + return utf32_to_utf16(utf32.data(), utf32.size()); +} + +/// @overload +inline std::u16string utf32_to_utf16(char32_t const* s) +{ + return utf32_to_utf16(s, std::char_traits::length(s)); +} + +/** + Convert a UTF-8 encoded string to a UTF-32 encoded string. + + Semantics of this function are the same as for utf8_to_utf16(), see its + documentation for more details. 
+ */ +std::size_t SOCI_DECL +utf8_to_utf32(char const* utf8, std::size_t len8, + char32_t* out32, std::size_t len32); + +/// @overload +inline std::u32string utf8_to_utf32(char const* utf8, std::size_t len) +{ + auto const len32 = utf8_to_utf32(utf8, len, nullptr, 0); + std::u32string utf32(len32, U'\0'); + utf8_to_utf32(utf8, len, const_cast(utf32.data()), len32); + return utf32; +} + +/// @overload +inline std::u32string utf8_to_utf32(std::string const& utf8) +{ + return utf8_to_utf32(utf8.data(), utf8.size()); +} + +/// @overload +inline std::u32string utf8_to_utf32(char const* s) +{ + return utf8_to_utf32(s, std::char_traits::length(s)); +} + +/** + Convert a UTF-32 encoded string to a UTF-8 encoded string. + + Semantics of this function are the same as for utf8_to_utf16(), see its + documentation for more details. + */ +std::size_t SOCI_DECL +utf32_to_utf8(char32_t const* utf32, std::size_t len32, + char* out8, std::size_t len8); + +/// @overload +inline std::string utf32_to_utf8(char32_t const* s, std::size_t len) +{ + auto const len8 = utf32_to_utf8(s, len, nullptr, 0); + std::string utf8(len8, '\0'); + utf32_to_utf8(s, len, const_cast(utf8.data()), len8); + return utf8; +} + +/// @overload +inline std::string utf32_to_utf8(std::u32string const& utf32) +{ + return utf32_to_utf8(utf32.data(), utf32.size()); +} + +/// @overload +inline std::string utf32_to_utf8(char32_t const* s) +{ + return utf32_to_utf8(s, std::char_traits::length(s)); +} + +/** + Convert a UTF-8 encoded string to a wide string (wstring). + + This is equivalent to either utf8_to_utf32() or utf8_to_utf16() depending + on the platform. + + @param s The input UTF-8 encoded string of length @a len. + @return The wide string. 
+ */ +inline std::wstring utf8_to_wide(char const* s, std::size_t len) +{ +#if defined(SOCI_WCHAR_T_IS_UTF32) + auto const wlen = utf8_to_utf32(s, len, nullptr, 0); + std::wstring ws(wlen, u'\0'); + utf8_to_utf32(s, len, const_cast(wide_to_char_type(ws)), wlen); + return ws; +#else // !SOCI_WCHAR_T_IS_UTF32 + auto const wlen = utf8_to_utf16(s, len, nullptr, 0); + std::wstring ws(wlen, u'\0'); + utf8_to_utf16(s, len, const_cast(wide_to_char_type(ws)), wlen); + return ws; +#endif // SOCI_WCHAR_T_IS_UTF32 +} + +/// @overload +inline std::wstring utf8_to_wide(std::string const& utf8) +{ + return utf8_to_wide(utf8.data(), utf8.size()); +} + +/** + Convert a wide string (wstring) to a UTF-8 encoded string. + + This is equivalent to either utf32_to_utf8() or utf16_to_utf8() depending + on the platform. + + @param ws The wide string. + @return std::string The UTF-8 encoded string. + */ +inline std::string wide_to_utf8(std::wstring const& ws) +{ +#if defined(SOCI_WCHAR_T_IS_UTF32) + return utf32_to_utf8(wide_to_char_type(ws), ws.size()); +#else // !SOCI_WCHAR_T_IS_UTF32 + return utf16_to_utf8(wide_to_char_type(ws), ws.size()); +#endif // SOCI_WCHAR_T_IS_UTF32 +} + +/** + Convert a UTF-16 encoded string to a wide string (wstring). + + This is equivalent to either utf16_to_utf32() or direct copy depending on + the platform. + + @param s The UTF-16 encoded string. + @param len The length of the input string. + @return The wide string. + @throws soci_error if the input string contains invalid UTF-16 encoding. + */ +inline std::wstring utf16_to_wide(char16_t const* s, std::size_t len) +{ +#if defined(SOCI_WCHAR_T_IS_UTF32) + // Convert UTF-16 to UTF-32 which is used by wstring. 
+ auto const wlen = utf16_to_utf32(s, len, nullptr, 0); + std::wstring ws(wlen, L'\0'); + utf16_to_utf32(s, len, + const_cast(wide_to_char_type(ws)), wlen); + return ws; +#else // !SOCI_WCHAR_T_IS_UTF32 + // Perform validation even though it's already UTF-16 + ensure_valid_utf16(s, len); + wchar_t const* ws = reinterpret_cast(s); + return std::wstring(ws, ws + len); +#endif // SOCI_WCHAR_T_IS_UTF32 +} + +/// @overload +inline std::wstring utf16_to_wide(char16_t const* s) +{ + return utf16_to_wide(s, std::char_traits::length(s)); +} + +/// @overload +inline std::wstring utf16_to_wide(std::u16string const& utf16) +{ + return utf16_to_wide(utf16.data(), utf16.size()); +} + +/** + Convert a wide string (wstring) to a UTF-16 encoded string. + + This is equivalent to either utf32_to_utf16() or direct copy depending on + the platform. + + @param ws The wide string. + @param out The output buffer or nullptr to just compute the required length. + @param len The output buffer length in characters (ignored if @a out is @c + nullptr). + @return The length of the UTF-16 output. + @throws soci_error if the input string contains invalid wide characters or + if the output buffer is too small when @a out is not @c nullptr. + */ +inline +std::size_t wide_to_utf16(std::wstring const& ws, char16_t* out, std::size_t len) +{ +#if defined(SOCI_WCHAR_T_IS_UTF32) + // Convert UTF-32 string to UTF-16. + return utf32_to_utf16(wide_to_char_type(ws), ws.length(), out, len); +#else // !SOCI_WCHAR_T_IS_UTF32 + // It's already in UTF-16, just copy, but check that it's valid and that we + // have enough space -- or just return the length if not asked to copy. 
+ auto const wlen = ws.length(); + if (out) + { + throw_if_too_small(wlen, len); + + ensure_valid_utf16(wide_to_char_type(ws), wlen); + std::memcpy(out, ws.data(), wlen * sizeof(wchar_t)); + } + return wlen; +#endif // SOCI_WCHAR_T_IS_UTF32 +} + +/// @overload +inline std::u16string wide_to_utf16(std::wstring const& ws) +{ + auto const wlen = wide_to_utf16(ws, nullptr, 0); + std::u16string utf16(wlen, u'\0'); + wide_to_utf16(ws, const_cast(utf16.data()), wlen); + return utf16; +} + +} // namespace details + +} // namespace soci + +#endif // SOCI_UNICODE_H_INCLUDED diff --git a/include/soci/type-holder.h b/include/soci/type-holder.h index 375a5fd3..c621caeb 100644 --- a/include/soci/type-holder.h +++ b/include/soci/type-holder.h @@ -131,6 +131,7 @@ struct soci_cast< union type_holder { std::string* s; + std::wstring* ws; int8_t* i8; int16_t* i16; int32_t* i32; @@ -153,6 +154,12 @@ struct type_holder_trait static const db_type type = db_string; }; +template <> +struct type_holder_trait +{ + static const db_type type = db_wstring; +}; + template <> struct type_holder_trait { @@ -305,6 +312,9 @@ public: case db_string: delete val_.s; break; + case db_wstring: + delete val_.ws; + break; } } @@ -345,6 +355,8 @@ public: case db_xml: case db_string: return soci_cast::cast(*val_.s); + case db_wstring: + return soci_cast::cast(*val_.ws); } throw std::bad_cast(); @@ -380,6 +392,8 @@ public: case db_xml: case db_string: return soci_return_same::value(*val_.s); + case db_wstring: + return soci_return_same::value(*val_.ws); } throw std::bad_cast(); @@ -430,6 +444,9 @@ private: case db_string: val_.s = static_cast(val); return; + case db_wstring: + val_.ws = static_cast(val); + return; } // This should be unreachable diff --git a/src/backends/odbc/standard-into-type.cpp b/src/backends/odbc/standard-into-type.cpp index 37c028a3..1f252e4c 100644 --- a/src/backends/odbc/standard-into-type.cpp +++ b/src/backends/odbc/standard-into-type.cpp @@ -7,6 +7,7 @@ #define SOCI_ODBC_SOURCE 
#include "soci/soci-platform.h" +#include "soci/soci-unicode.h" #include "soci/odbc/soci-odbc.h" #include "soci-compiler.h" #include "soci-cstrtoi.h" @@ -50,6 +51,15 @@ void odbc_standard_into_type_backend::define_by_pos( buf_ = new char[size]; data = buf_; break; + case x_stdwstring: + odbcType_ = SQL_C_WCHAR; + // Do exactly the same thing here as for x_stdstring above. + size = static_cast(statement_.column_size(position_)); + size = (size >= ODBC_MAX_COL_SIZE || size == 0) ? odbc_max_buffer_length : size; + size += sizeof(SQLWCHAR); + buf_ = new char[size]; + data = buf_; + break; case x_int8: odbcType_ = SQL_C_STINYINT; size = sizeof(int8_t); @@ -183,6 +193,15 @@ void odbc_standard_into_type_backend::post_fetch( throw soci_error("Buffer size overflow; maybe got too large string"); } } + else if (type_ == x_stdwstring) + { + std::wstring& s = exchange_type_cast(data_); + s = utf16_to_wide(reinterpret_cast(buf_)); + if (s.size() * sizeof(wchar_t) >= (odbc_max_buffer_length - 1)) + { + throw soci_error("Buffer size overflow; maybe got too large string"); + } + } else if (type_ == x_longstring) { exchange_type_cast(data_).value = buf_; diff --git a/src/backends/odbc/standard-use-type.cpp b/src/backends/odbc/standard-use-type.cpp index 12c573cd..eb26add9 100644 --- a/src/backends/odbc/standard-use-type.cpp +++ b/src/backends/odbc/standard-use-type.cpp @@ -6,6 +6,7 @@ #define SOCI_ODBC_SOURCE #include "soci/soci-platform.h" #include "soci/odbc/soci-odbc.h" +#include "soci/soci-unicode.h" #include "soci-compiler.h" #include "soci-exchange-cast.h" #include @@ -112,6 +113,13 @@ void* odbc_standard_use_type_backend::prepare_for_bind( copy_from_string(s, size, sqlType, cType); } break; + case x_stdwstring: + { + std::wstring const& s = exchange_type_cast(data_); + + copy_from_string(s, size, sqlType, cType); + } + break; case x_stdtm: { std::tm const& t = exchange_type_cast(data_); @@ -175,6 +183,27 @@ void odbc_standard_use_type_backend::copy_from_string( indHolder_ = 
SQL_NTS; } +void odbc_standard_use_type_backend::copy_from_string( + const std::wstring& s, + SQLLEN& size, + SQLSMALLINT& sqlType, + SQLSMALLINT& cType + ) +{ + auto const len = wide_to_utf16(s, nullptr, 0); + + size = static_cast((len + 1) * sizeof(SQLWCHAR)); + sqlType = size >= ODBC_MAX_COL_SIZE ? SQL_WLONGVARCHAR : SQL_WVARCHAR; + cType = SQL_C_WCHAR; + buf_ = new char[size]; + + char16_t* const wbuf = reinterpret_cast(buf_); + wide_to_utf16(s, wbuf, len); + wbuf[len] = u'\0'; + + indHolder_ = SQL_NTS; +} + void odbc_standard_use_type_backend::bind_by_pos( int &position, void *data, exchange_type type, bool /* readOnly */) { diff --git a/src/backends/odbc/statement.cpp b/src/backends/odbc/statement.cpp index 2f61df2e..e527ffef 100644 --- a/src/backends/odbc/statement.cpp +++ b/src/backends/odbc/statement.cpp @@ -7,6 +7,7 @@ #define SOCI_ODBC_SOURCE #include "soci/odbc/soci-odbc.h" +#include "soci/soci-unicode.h" #include #include #include @@ -390,6 +391,11 @@ void odbc_statement_backend::describe_column(int colNum, case SQL_BIGINT: dbtype = is_unsigned == SQL_TRUE ? db_uint64 : db_int64; break; + case SQL_WCHAR: + case SQL_WVARCHAR: + case SQL_WLONGVARCHAR: + dbtype = db_wstring; + break; case SQL_CHAR: case SQL_VARCHAR: case SQL_LONGVARCHAR: diff --git a/src/backends/odbc/vector-into-type.cpp b/src/backends/odbc/vector-into-type.cpp index 3e864d9b..3dc54181 100644 --- a/src/backends/odbc/vector-into-type.cpp +++ b/src/backends/odbc/vector-into-type.cpp @@ -8,6 +8,7 @@ #define SOCI_ODBC_SOURCE #include "soci/soci-platform.h" #include "soci/odbc/soci-odbc.h" +#include "soci/soci-unicode.h" #include "soci/type-wrappers.h" #include "soci-compiler.h" #include "soci-cstrtoi.h" @@ -126,6 +127,27 @@ void odbc_vector_into_type_backend::define_by_pos( buf_ = new char[colSize_ * elementsCount]; } break; + case x_stdwstring: + // Do exactly the same thing as above, but for wide characters. 
+ { + odbcType_ = SQL_C_WCHAR; + + colSize_ = static_cast(get_sqllen_from_value(statement_.column_size(position))); + if (colSize_ >= ODBC_MAX_COL_SIZE || colSize_ == 0) + { + colSize_ = odbc_max_buffer_length; + + statement_.fetchVectorByRows_ = true; + } + + colSize_ += sizeof(SQLWCHAR); + + const std::size_t elementsCount + = statement_.fetchVectorByRows_ ? 1 : vectorSize; + + buf_ = new char[colSize_ * elementsCount * sizeof(SQLWCHAR)]; + } + break; case x_stdtm: odbcType_ = SQL_C_TYPE_TIMESTAMP; @@ -196,6 +218,7 @@ void odbc_vector_into_type_backend::rebind_row(std::size_t rowInd) case x_char: case x_stdstring: + case x_stdwstring: case x_xmltype: case x_longstring: case x_stdtm: @@ -287,6 +310,34 @@ void odbc_vector_into_type_backend::do_post_fetch_rows( value.assign(pos, end - pos); } } + else if (type_ == x_stdwstring) + { + // Do exactly the same thing as above, but for wide characters. + SQLWCHAR* pos = reinterpret_cast(buf_); + for (std::size_t i = beginRow; i != endRow; ++i, pos += colSize_ / sizeof(SQLWCHAR)) + { + SQLLEN len = get_sqllen_from_vector_at(i); + + std::wstring& value = exchange_vector_type_cast(data_).at(i); + if (len == -1) + { + value.clear(); + continue; + } + + SQLWCHAR* end = pos + len / sizeof(SQLWCHAR); + while (end != pos) + { + if (*--end != L' ') + { + ++end; + break; + } + } + + value = utf16_to_wide(reinterpret_cast(pos), end - pos); + } + } else if (type_ == x_stdtm) { std::vector *vp diff --git a/src/backends/odbc/vector-use-type.cpp b/src/backends/odbc/vector-use-type.cpp index b6dfa625..bb6a0c68 100644 --- a/src/backends/odbc/vector-use-type.cpp +++ b/src/backends/odbc/vector-use-type.cpp @@ -7,6 +7,7 @@ #define SOCI_ODBC_SOURCE #include "soci/soci-platform.h" +#include "soci/soci-unicode.h" #include "soci/odbc/soci-odbc.h" #include "soci-compiler.h" #include "soci-vector-helpers.h" @@ -231,6 +232,42 @@ void* odbc_vector_use_type_backend::prepare_for_bind(SQLUINTEGER &size, cType = SQL_C_CHAR; } break; + case x_stdwstring: 
+ { + std::size_t maxSize = 0; + std::size_t const vecSize = get_vector_size(type_, data_); + prepare_indicators(vecSize); + for (std::size_t i = 0; i != vecSize; ++i) + { + std::wstring& value = exchange_vector_type_cast(data_).at(i); + std::size_t const sz = wide_to_utf16(value, nullptr, 0); + set_sqllen_from_vector_at(i, static_cast(sz * sizeof(SQLWCHAR))); + maxSize = sz > maxSize ? sz : maxSize; + } + + maxSize++; // For terminating nul. + + buf_ = new char[maxSize * vecSize * sizeof(SQLWCHAR)]; + memset(buf_, 0, maxSize * vecSize * sizeof(SQLWCHAR)); + + static_assert(sizeof(SQLWCHAR) == sizeof(char16_t), "unexpected SQLWCHAR size"); + char16_t* pos = reinterpret_cast(buf_); + + for (std::size_t i = 0; i != vecSize; ++i) + { + std::wstring& value = exchange_vector_type_cast(data_).at(i); + wide_to_utf16(value, pos, maxSize); + pos += maxSize; + } + + data = buf_; + size = static_cast(maxSize * sizeof(SQLWCHAR)); + + sqlType = size >= ODBC_MAX_COL_SIZE ? SQL_WLONGVARCHAR : SQL_WVARCHAR; + cType = SQL_C_WCHAR; + } + break; + case x_stdtm: { std::vector *vp @@ -339,6 +376,7 @@ void odbc_vector_use_type_backend::pre_use(indicator const *ind) case x_char: case x_stdstring: + case x_stdwstring: case x_xmltype: case x_longstring: non_null_indicator = SQL_NTS; @@ -438,7 +476,7 @@ void odbc_vector_use_type_backend::pre_use(indicator const *ind) else { // for strings we have already set the values - if (type_ != x_stdstring && type_ != x_xmltype && type_ != x_longstring) + if (type_ != x_stdstring && type_ != x_xmltype && type_ != x_longstring && type_ != x_stdwstring) { set_sqllen_from_vector_at(i, non_null_indicator); } @@ -451,7 +489,7 @@ void odbc_vector_use_type_backend::pre_use(indicator const *ind) for (std::size_t i = 0; i != indHolderVec_.size(); ++i) { // for strings we have already set the values - if (type_ != x_stdstring && type_ != x_xmltype && type_ != x_longstring) + if (type_ != x_stdstring && type_ != x_xmltype && type_ != x_longstring && type_ != 
x_stdwstring) { set_sqllen_from_vector_at(i, non_null_indicator); } diff --git a/src/backends/oracle/standard-into-type.cpp b/src/backends/oracle/standard-into-type.cpp index 5c0e78f1..17e32ec5 100644 --- a/src/backends/oracle/standard-into-type.cpp +++ b/src/backends/oracle/standard-into-type.cpp @@ -199,7 +199,10 @@ void oracle_standard_into_type_backend::define_by_pos( ociData_ = lobp; } break; + default: + throw soci_error("Into element used with non-supported type."); } + sword res = OCIDefineByPos(statement_.stmtp_, &defnp_, statement_.session_.errhp_, diff --git a/src/backends/oracle/standard-use-type.cpp b/src/backends/oracle/standard-use-type.cpp index 8a2b64b0..f18bf533 100644 --- a/src/backends/oracle/standard-use-type.cpp +++ b/src/backends/oracle/standard-use-type.cpp @@ -200,6 +200,9 @@ void oracle_standard_use_type_backend::prepare_for_bind( ociData_ = lobp; } break; + + case x_stdwstring: + throw soci_error("Wide string use elements are not supported by Oracle backend."); } } @@ -471,6 +474,8 @@ void oracle_standard_use_type_backend::pre_use(indicator const *ind) case x_blob: // nothing to do break; + case x_stdwstring: + throw soci_error("Wide string use elements are not supported by Oracle backend."); } // then handle indicators @@ -685,6 +690,8 @@ void oracle_standard_use_type_backend::post_use(bool gotData, indicator *ind) case x_longstring: // nothing to do here break; + case x_stdwstring: + throw soci_error("Wide string use elements are not supported by Oracle backend."); } } diff --git a/src/backends/oracle/vector-into-type.cpp b/src/backends/oracle/vector-into-type.cpp index 6ec30181..29ff8852 100644 --- a/src/backends/oracle/vector-into-type.cpp +++ b/src/backends/oracle/vector-into-type.cpp @@ -218,6 +218,7 @@ void oracle_vector_into_type_backend::define_by_pos_bulk( case x_statement: case x_rowid: case x_blob: + case x_stdwstring: throw soci_error("Unsupported type for vector into parameter"); } diff --git 
a/src/backends/oracle/vector-use-type.cpp b/src/backends/oracle/vector-use-type.cpp index 93a9c3ee..69ca958d 100644 --- a/src/backends/oracle/vector-use-type.cpp +++ b/src/backends/oracle/vector-use-type.cpp @@ -214,6 +214,7 @@ void oracle_vector_use_type_backend::prepare_for_bind( case x_statement: case x_rowid: case x_blob: + case x_stdwstring: throw soci_error("Unsupported type for vector use parameter"); } } diff --git a/src/backends/sqlite3/statement.cpp b/src/backends/sqlite3/statement.cpp index 24b23b66..bd6a2f70 100644 --- a/src/backends/sqlite3/statement.cpp +++ b/src/backends/sqlite3/statement.cpp @@ -208,6 +208,8 @@ sqlite3_statement_backend::load_rowset(int totalRows) case db_xml: throw soci_error("XML data type is not supported"); + case db_wstring: + throw soci_error("Wide string data type is not supported"); } } } @@ -332,6 +334,8 @@ sqlite3_statement_backend::bind_and_execute(int number) case db_xml: throw soci_error("XML data type is not supported"); + case db_wstring: + throw soci_error("Wide string data type is not supported"); } } diff --git a/src/backends/sqlite3/vector-into-type.cpp b/src/backends/sqlite3/vector-into-type.cpp index fb67a252..38d38cb6 100644 --- a/src/backends/sqlite3/vector-into-type.cpp +++ b/src/backends/sqlite3/vector-into-type.cpp @@ -130,6 +130,8 @@ void set_number_in_vector(void *p, int idx, const sqlite3_column &col) case db_xml: throw soci_error("XML data type is not supported"); + case db_wstring: + throw soci_error("Wide string data type is not supported"); }; } @@ -243,6 +245,8 @@ void sqlite3_vector_into_type_backend::post_fetch(bool gotData, indicator * ind) case db_xml: throw soci_error("XML data type is not supported"); + case db_wstring: + throw soci_error("Wide string data type is not supported"); }; break; } // x_char @@ -325,6 +329,9 @@ void sqlite3_vector_into_type_backend::post_fetch(bool gotData, indicator * ind) set_in_vector(data_, i, xml); break; } + + case db_wstring: + throw soci_error("Wide string 
data type is not supported"); }; break; } // x_stdstring @@ -409,6 +416,7 @@ void sqlite3_vector_into_type_backend::post_fetch(bool gotData, indicator * ind) case db_uint32: case db_int64: case db_uint64: + case db_wstring: throw soci_error("Into element used with non-convertible type."); case db_xml: @@ -444,6 +452,8 @@ void sqlite3_vector_into_type_backend::post_fetch(bool gotData, indicator * ind) case db_xml: throw soci_error("XML data type is not supported"); + case db_wstring: + throw soci_error("Wide string data type is not supported"); } } } diff --git a/src/core/soci-simple.cpp b/src/core/soci-simple.cpp index 566a4659..30a9279f 100644 --- a/src/core/soci-simple.cpp +++ b/src/core/soci-simple.cpp @@ -356,6 +356,7 @@ struct statement_wrapper std::vector into_types; // for both single and bulk std::vector into_indicators; std::map into_strings; + std::map into_wstrings; std::map into_int8; std::map into_uint8; std::map into_int16; @@ -370,6 +371,7 @@ struct statement_wrapper std::vector > into_indicators_v; std::map > into_strings_v; + std::map > into_wstrings_v; std::map > into_int8_v; std::map > into_uint8_v; std::map > into_int16_v; @@ -384,6 +386,7 @@ struct statement_wrapper // use elements std::map use_indicators; std::map use_strings; + std::map use_wstrings; std::map use_int8; std::map use_uint8; std::map use_int16; @@ -398,6 +401,7 @@ struct statement_wrapper std::map > use_indicators_v; std::map > use_strings_v; + std::map > use_wstrings_v; std::map > use_int8_v; std::map > use_uint8_v; std::map > use_int16_v; @@ -618,6 +622,17 @@ bool name_exists_check_failed(statement_wrapper & wrapper, name_exists = (it != wrapper.use_strings.end()); } break; + case db_wstring: + { + typedef std::map + < + std::string, + std::wstring + >::const_iterator iterator; + iterator const it = wrapper.use_wstrings.find(name); + name_exists = (it != wrapper.use_wstrings.end()); + } + break; case db_int8: { typedef std::map::const_iterator iterator; @@ -718,6 +733,17 @@ 
bool name_exists_check_failed(statement_wrapper & wrapper, name_exists = (it != wrapper.use_strings_v.end()); } break; + case db_wstring: + { + typedef std::map + < + std::string, + std::vector + >::const_iterator iterator; + iterator const it = wrapper.use_wstrings_v.find(name); + name_exists = (it != wrapper.use_wstrings_v.end()); + } + break; case db_int8: { typedef std::map @@ -1595,6 +1621,9 @@ SOCI_DECL void soci_into_resize_v(statement_handle st, int new_size) case db_string: wrapper->into_strings_v[i].resize(new_size); break; + case db_wstring: + wrapper->into_wstrings_v[i].resize(new_size); + break; case db_int8: wrapper->into_int8_v[i].resize(new_size); break; @@ -3042,6 +3071,10 @@ SOCI_DECL void soci_prepare(statement_handle st, char const * query) wrapper->st.exchange( into(wrapper->into_strings[i], wrapper->into_indicators[i])); break; + case db_wstring: + wrapper->st.exchange( + into(wrapper->into_wstrings[i], wrapper->into_indicators[i])); + break; case db_int8: wrapper->st.exchange( into(wrapper->into_int8[i], wrapper->into_indicators[i])); @@ -3103,6 +3136,10 @@ SOCI_DECL void soci_prepare(statement_handle st, char const * query) wrapper->st.exchange( into(wrapper->into_strings_v[i], wrapper->into_indicators_v[i])); break; + case db_wstring: + wrapper->st.exchange( + into(wrapper->into_wstrings_v[i], wrapper->into_indicators_v[i])); + break; case db_int8: wrapper->st.exchange( into(wrapper->into_int8_v[i], wrapper->into_indicators_v[i])); diff --git a/src/core/statement.cpp b/src/core/statement.cpp index d187aaf2..3619b1eb 100644 --- a/src/core/statement.cpp +++ b/src/core/statement.cpp @@ -673,6 +673,12 @@ void statement_impl::bind_into() into_row(); } +template<> +void statement_impl::bind_into() +{ + into_row(); +} + template<> void statement_impl::bind_into() { @@ -762,6 +768,9 @@ void statement_impl::describe() case db_xml: bind_into(); break; + case db_wstring: + bind_into(); + break; case db_blob: bind_into(); break; diff --git 
// Checks whether the `length` bytes starting at `bytes` form exactly one
// well-formed UTF-8 encoded code point.
//
// bytes:  pointer to at least `length` readable bytes.
// length: number of bytes in the candidate sequence; only 1 to 4 can be valid.
//
// Returns true only if the lead byte announces exactly `length` bytes, all
// the following bytes are continuation bytes (10xxxxxx), the encoding is the
// shortest possible one (no overlong forms) and the decoded value is a
// Unicode scalar value, i.e. not above U+10FFFF and not a UTF-16 surrogate.
bool is_valid_utf8_sequence(unsigned char const* bytes, int length)
{
    if (length < 1 || length > 4)
    {
        return false;
    }

    // Everything after the lead byte must be a continuation byte.
    for (int i = 1; i < length; ++i)
    {
        if ((bytes[i] & 0xC0U) != 0x80U)
        {
            return false;
        }
    }

    switch (length)
    {
        case 1:
            // Plain ASCII: high bit must be clear.
            return (bytes[0] & 0x80U) == 0;

        case 2:
        {
            if ((bytes[0] & 0xE0U) != 0xC0U)
            {
                return false;
            }
            uint32_t const code_point =
                ((bytes[0] & 0x1FU) << 6U) | (bytes[1] & 0x3FU);

            // Values below U+0080 must use the one-byte form.
            return code_point >= 0x80U;
        }

        case 3:
        {
            if ((bytes[0] & 0xF0U) != 0xE0U)
            {
                return false;
            }
            uint32_t const code_point =
                ((bytes[0] & 0x0FU) << 12U) |
                ((bytes[1] & 0x3FU) << 6U) |
                (bytes[2] & 0x3FU);

            // Values below U+0800 must use a shorter form.
            if (code_point < 0x800U)
            {
                return false;
            }

            // U+D800..U+DFFF are reserved for UTF-16 surrogates and must
            // never appear in UTF-8; accepting them would later produce
            // lone surrogates in the UTF-16 output.
            return code_point < 0xD800U || code_point > 0xDFFFU;
        }

        default: // length == 4
        {
            if ((bytes[0] & 0xF8U) != 0xF0U)
            {
                return false;
            }
            uint32_t const code_point =
                ((bytes[0] & 0x07U) << 18U) |
                ((bytes[1] & 0x3FU) << 12U) |
                ((bytes[2] & 0x3FU) << 6U) |
                (bytes[3] & 0x3FU);

            // Reject overlong forms and values beyond the Unicode range.
            return code_point >= 0x10000U && code_point <= 0x10FFFFU;
        }
    }
}
byte + if (i + 1 >= len || !is_valid_utf8_sequence(bytes + i, 2)) + { + throw soci_error("Invalid UTF-8 sequence: Truncated or invalid two-byte sequence"); + } + i += 2; + } + else if ((bytes[i] & 0xF0U) == 0xE0U) + { + // Three-byte character, check if the next two bytes are valid continuation bytes + if (i + 2 >= len || !is_valid_utf8_sequence(bytes + i, 3)) + { + throw soci_error("Invalid UTF-8 sequence: Truncated or invalid three-byte sequence"); + } + i += 3; + } + else if ((bytes[i] & 0xF8U) == 0xF0U) + { + // Four-byte character, check if the next three bytes are valid continuation bytes + if (i + 3 >= len || !is_valid_utf8_sequence(bytes + i, 4)) + { + throw soci_error("Invalid UTF-8 sequence: Truncated or invalid four-byte sequence"); + } + i += 4; + } + else + { + // Invalid start byte + throw soci_error("Invalid UTF-8 sequence: Invalid start byte"); + } + } +} + +void ensure_valid_utf16(char16_t const* s, std::size_t len) +{ + for (std::size_t i = 0; i < len; ++i) + { + const char16_t chr = s[i]; + if (chr >= 0xD800 && chr <= 0xDBFF) + { // High surrogate + if (i + 1 >= len) + { + throw soci_error("Invalid UTF-16 sequence (truncated surrogate pair)"); + } + const char16_t next = s[i + 1]; + if (next < 0xDC00 || next > 0xDFFF) + { + throw soci_error("Invalid UTF-16 sequence (invalid surrogate pair)"); + } + ++i; // Skip the next character as it's part of the pair + } + else if (chr >= 0xDC00 && chr <= 0xDFFF) + { // Lone low surrogate + throw soci_error("Invalid UTF-16 sequence (lone low surrogate)"); + } + } +} + +void ensure_valid_utf32(char32_t const* s, std::size_t len) +{ + for (std::size_t i = 0; i < len; ++i) + { + const char32_t chr = s[i]; + + // Check if the code point is within the Unicode range + if (chr > 0x10FFFF) + { + throw soci_error("Invalid UTF-32 sequence: Code point out of range"); + } + + // Surrogate pairs are not valid in UTF-32 + if (chr >= 0xD800 && chr <= 0xDFFF) + { + throw soci_error("Invalid UTF-32 sequence: Surrogate pair 
found"); + } + + // Check for non-characters U+FFFE and U+FFFF + if (chr == 0xFFFE || chr == 0xFFFF) + { + throw soci_error("Invalid UTF-32 sequence: Non-character found"); + } + } +} + +std::size_t +utf8_to_utf16(char const* utf8, std::size_t len8, + char16_t* out16, std::size_t len16) +{ + // Skip the check if we're just computing the length for efficiency, we'll + // detect any errors when performing the actual conversion anyhow. + if (out16) + ensure_valid_utf8(utf8, len8); + + auto const* const bytes = reinterpret_cast(utf8); + + std::size_t len = 0; + + // Check for UTF-8 BOM + size_t start_index = 0; + if (len8 >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) + { + ++len; + + if (out16) + { + throw_if_too_small(len, len16); + *out16++ = 0xFEFF; // Add UTF-16 BOM + } + + start_index = 3; // Start conversion after the BOM + } + + for (size_t i = start_index; i < len8;) + { + uint32_t codepoint; + if ((bytes[i] & 0x80) == 0) + { + // ASCII character + codepoint = bytes[i++]; + } + else if ((bytes[i] & 0xE0) == 0xC0) + { + // 2-byte sequence + codepoint = ((bytes[i] & 0x1F) << 6) | (bytes[i + 1] & 0x3F); + i += 2; + } + else if ((bytes[i] & 0xF0) == 0xE0) + { + // 3-byte sequence + codepoint = ((bytes[i] & 0x0F) << 12) | ((bytes[i + 1] & 0x3F) << 6) | (bytes[i + 2] & 0x3F); + i += 3; + } + else if ((bytes[i] & 0xF8) == 0xF0) + { + // 4-byte sequence + codepoint = ((bytes[i] & 0x07) << 18) | ((bytes[i + 1] & 0x3F) << 12) | ((bytes[i + 2] & 0x3F) << 6) | (bytes[i + 3] & 0x3F); + i += 4; + } + else + { + throw soci_error("Invalid UTF-8 sequence"); + } + + if (codepoint <= 0xFFFF) + { + ++len; + + if (out16) + { + throw_if_too_small(len, len16); + *out16++ = static_cast(codepoint); + } + } + else + { + // Encode as surrogate pair + len += 2; + + if (out16) + { + throw_if_too_small(len, len16); + codepoint -= 0x10000; + *out16++ = static_cast((codepoint >> 10) + 0xD800); + *out16++ = static_cast((codepoint & 0x3FF) + 0xDC00); + } + } + } + + return 
len; +} + +std::size_t +utf16_to_utf8(char16_t const* utf16, std::size_t len16, + char* out8, std::size_t len8) +{ + // Skip the check if we're just computing the length for efficiency, we'll + // detect any errors when performing the actual conversion anyhow. + if (out8) + ensure_valid_utf16(utf16, len16); + + std::size_t len = 0; + + // Check for UTF-16 BOM + size_t start_index = 0; + if (len16 && utf16[0] == 0xFEFF) + { + len += 3; + if (out8) + { + throw_if_too_small(len, len8); + + // Add UTF-8 BOM + *out8++ = '\xEF'; + *out8++ = '\xBB'; + *out8++ = '\xBF'; + } + + start_index = 1; // Start conversion after the BOM + } + + for (std::size_t i = start_index; i < len16; ++i) + { + char16_t const chr = utf16[i]; + + if (chr < 0x80) + { + // 1-byte sequence (ASCII) + ++len; + if (out8) + { + throw_if_too_small(len, len8); + *out8++ = static_cast(chr); + } + } + else if (chr < 0x800) + { + // 2-byte sequence + len += 2; + if (out8) + { + throw_if_too_small(len, len8); + *out8++ = static_cast(0xC0U | ((chr >> 6) & 0x1FU)); + *out8++ = static_cast(0x80U | (chr & 0x3FU)); + } + } + else if ((chr >= 0xD800U) && (chr <= 0xDBFFU)) + { + // Handle UTF-16 surrogate pairs + if (i + 1 >= len16) + { + throw soci_error("Invalid UTF-16 surrogate pair (truncated)"); + } + char16_t const chr2 = utf16[i + 1]; + if (chr2 < 0xDC00U || chr2 > 0xDFFFU) + { + throw soci_error("Invalid UTF-16 surrogate pair"); + } + auto const codepoint = static_cast(((chr & 0x3FFU) << 10U) | (chr2 & 0x3FFU)) + 0x10000U; + + len += 4; + if (out8) + { + throw_if_too_small(len, len8); + *out8++ = static_cast(0xF0U | ((codepoint >> 18U) & 0x07U)); + *out8++ = static_cast(0x80U | ((codepoint >> 12U) & 0x3FU)); + *out8++ = static_cast(0x80U | ((codepoint >> 6U) & 0x3FU)); + *out8++ = static_cast(0x80U | (codepoint & 0x3FU)); + } + + ++i; // Skip the next character as it is part of the surrogate pair + } + else + { + // 3-byte sequence + len += 3; + if (out8) + { + throw_if_too_small(len, len8); + *out8++ = 
static_cast(0xE0U | ((chr >> 12) & 0x0FU)); + *out8++ = static_cast(0x80U | ((chr >> 6) & 0x3FU)); + *out8++ = static_cast(0x80U | (chr & 0x3FU)); + } + } + } + + return len; +} + +std::size_t +utf16_to_utf32(char16_t const* utf16, std::size_t len16, + char32_t* out32, std::size_t len32) +{ + // Skip the check if we're just computing the length for efficiency, we'll + // detect any errors when performing the actual conversion anyhow. + if (out32) + ensure_valid_utf16(utf16, len16); + + std::size_t len = 0; + for (std::size_t i = 0; i < len16; ++i) + { + char16_t const chr = *utf16++; + + ++len; + if (out32) + throw_if_too_small(len, len32); + + if (chr >= 0xD800U && chr <= 0xDBFFU) + { + // High surrogate, must be followed by a low surrogate + char16_t const chr2 = *utf16++; + ++i; + + if (out32) + { + const auto codepoint = static_cast(((static_cast(chr) & 0x3FFU) << 10U) | (static_cast(chr2) & 0x3FFU)) + 0x10000U; + *out32++ = codepoint; + } + } + else + { + // Valid BMP character or a low surrogate that is part of a valid + // pair (already checked by ensure_valid_utf16) + if (out32) + *out32++ = static_cast(chr); + } + } + + return len; +} + +std::size_t +utf32_to_utf16(char32_t const* utf32, std::size_t len32, + char16_t* out16, std::size_t len16) +{ + // Skip the check if we're just computing the length for efficiency, we'll + // detect any errors when performing the actual conversion anyhow. + if (out16) + ensure_valid_utf32(utf32, len32); + + std::size_t len = 0; + for (std::size_t i = 0; i < len32; ++i) + { + char32_t codepoint = *utf32++; + + if (codepoint <= 0xFFFFU) + { + ++len; + + // BMP character + if (out16) + { + throw_if_too_small(len, len16); + *out16++ = static_cast(codepoint); + } + } + else + { + len += 2; + + // Encode as a surrogate pair + if (out16) + { + throw_if_too_small(len, len16); + + // Note that we know that the code point is valid here because + // we called ensure_valid_utf32() above. 
+ codepoint -= 0x10000; + *out16++ = static_cast((codepoint >> 10U) + 0xD800U); + *out16++ = static_cast((codepoint & 0x3FFU) + 0xDC00U); + } + } + } + + return len; +} + +std::size_t +utf8_to_utf32(char const* utf8, std::size_t len8, + char32_t* out32, std::size_t len32) +{ + // Skip the check if we're just computing the length for efficiency, we'll + // detect any errors when performing the actual conversion anyhow. + if (out32) + ensure_valid_utf8(utf8, len8); + + auto const* const bytes = reinterpret_cast(utf8); + + std::size_t len = 0; + for (std::size_t i = 0; i < len8;) + { + unsigned char chr1 = bytes[i]; + + ++len; + if (out32) + throw_if_too_small(len, len32); + + // 1-byte sequence (ASCII) + if ((chr1 & 0x80U) == 0) + { + if (out32) + *out32++ = static_cast(chr1); + ++i; + } + // 2-byte sequence + else if ((chr1 & 0xE0U) == 0xC0U) + { + if (out32) + *out32++ = static_cast(((chr1 & 0x1FU) << 6U) | (bytes[i + 1] & 0x3FU)); + i += 2; + } + // 3-byte sequence + else if ((chr1 & 0xF0U) == 0xE0U) + { + if (out32) + *out32++ = static_cast(((chr1 & 0x0FU) << 12U) | ((bytes[i + 1] & 0x3FU) << 6U) | (bytes[i + 2] & 0x3FU)); + i += 3; + } + // 4-byte sequence + else if ((chr1 & 0xF8U) == 0xF0U) + { + if (out32) + *out32++ = static_cast(((chr1 & 0x07U) << 18U) | ((bytes[i + 1] & 0x3FU) << 12U) | ((bytes[i + 2] & 0x3FU) << 6U) | (bytes[i + 3] & 0x3FU)); + i += 4; + } + } + + return len; +} + +std::size_t +utf32_to_utf8(char32_t const* utf32, std::size_t len32, + char* out8, std::size_t len8) +{ + // Skip the check if we're just computing the length for efficiency, we'll + // detect any errors when performing the actual conversion anyhow. 
+ if (out8) + ensure_valid_utf32(utf32, len32); + + std::size_t len = 0; + + for (std::size_t i = 0; i < len32; ++i) + { + auto const codepoint = utf32[i]; + + if (codepoint < 0x80) + { + // 1-byte sequence (ASCII) + ++len; + if (out8) + { + throw_if_too_small(len, len8); + *out8++ = static_cast(codepoint); + } + } + else if (codepoint < 0x800) + { + // 2-byte sequence + len += 2; + + if (out8) + { + throw_if_too_small(len, len8); + *out8++ = static_cast(0xC0U | ((codepoint >> 6U) & 0x1FU)); + *out8++ = static_cast(0x80U | (codepoint & 0x3FU)); + } + } + else if (codepoint < 0x10000) + { + // 3-byte sequence + len += 3; + + if (out8) + { + throw_if_too_small(len, len8); + *out8++ = static_cast(0xE0U | ((codepoint >> 12U) & 0x0FU)); + *out8++ = static_cast(0x80U | ((codepoint >> 6U) & 0x3FU)); + *out8++ = static_cast(0x80U | (codepoint & 0x3FU)); + } + } + else // This must be the only remaining case for valid UTF-32 string. + { + // 4-byte sequence + len += 4; + + if (out8) + { + throw_if_too_small(len, len8); + *out8++ = static_cast(0xF0U | ((codepoint >> 18U) & 0x07U)); + *out8++ = static_cast(0x80U | ((codepoint >> 12U) & 0x3FU)); + *out8++ = static_cast(0x80U | ((codepoint >> 6U) & 0x3FU)); + *out8++ = static_cast(0x80U | (codepoint & 0x3FU)); + } + } + } + + return len; +} + +} // namespace details + +} // namespace soci diff --git a/src/core/use-type.cpp b/src/core/use-type.cpp index f17fcd91..6c26eaf2 100644 --- a/src/core/use-type.cpp +++ b/src/core/use-type.cpp @@ -9,6 +9,7 @@ #include "soci/soci-platform.h" #include "soci/use-type.h" #include "soci/statement.h" +#include "soci/soci-unicode.h" #include "soci-exchange-cast.h" #include "soci-mktime.h" @@ -58,6 +59,10 @@ void standard_use_type::dump_value(std::ostream& os) const os << "\"" << exchange_type_cast(data_) << "\""; return; + case x_stdwstring: + os << "\"" << wide_to_utf8(exchange_type_cast(data_)) << "\""; + return; + case x_int8: os << exchange_type_cast(data_); return; diff --git 
a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 69766d8e..aec5e4cb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -15,7 +15,7 @@ colormsg(_HIBLUE_ "Configuring SOCI tests:") add_definitions(-DCATCH_CONFIG_CPP11_NO_SHUFFLE) if(MSVC) - add_compile_options(/bigobj) + add_compile_options(/bigobj /utf-8) endif() include_directories( diff --git a/tests/common/test-unicode.cpp b/tests/common/test-unicode.cpp new file mode 100644 index 00000000..5b2340b2 --- /dev/null +++ b/tests/common/test-unicode.cpp @@ -0,0 +1,355 @@ +// +// Copyright (C) 2024 Benjamin Oldenburg +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// + +#include "soci/soci.h" + +#include + +using namespace soci; +using namespace soci::details; + +TEST_CASE("UTF-8 validation tests", "[unicode]") +{ + // Valid UTF-8 strings - Should not throw exceptions + CHECK_NOTHROW(ensure_valid_utf8("Hello, world!")); // valid ASCII + CHECK_NOTHROW(ensure_valid_utf8("")); // Empty string + CHECK_NOTHROW(ensure_valid_utf8(u8"Здравствуй, мир!")); // valid UTF-8 + CHECK_NOTHROW(ensure_valid_utf8(u8"こんにちは世界")); // valid UTF-8 + CHECK_NOTHROW(ensure_valid_utf8(u8"😀😁😂🤣😃😄😅😆")); // valid UTF-8 with emojis + + // Invalid UTF-8 strings - Should throw soci_error exceptions + CHECK_THROWS_AS(ensure_valid_utf8("\x80"), soci_error); // Invalid single byte + CHECK_THROWS_AS(ensure_valid_utf8("\xC3\x28"), soci_error); // Invalid two-byte character + CHECK_THROWS_AS(ensure_valid_utf8("\xE2\x82"), soci_error); // Truncated three-byte character + CHECK_THROWS_AS(ensure_valid_utf8("\xF0\x90\x28"), soci_error); // Truncated four-byte character + CHECK_THROWS_AS(ensure_valid_utf8("\xF0\x90\x8D\x80\x80"), soci_error); // Extra byte in four-byte character +} + +TEST_CASE("UTF-16 validation tests", "[unicode]") +{ + // Valid UTF-16 strings + CHECK_NOTHROW(ensure_valid_utf16(u"Hello, world!")); // valid ASCII + 
// Checks that ensure_valid_utf32() accepts valid code points and throws
// soci_error for values outside of the Unicode range.
TEST_CASE("UTF-32 validation tests", "[unicode]")
{
    // Strings consisting only of valid code points.
    char32_t const* const accepted[] = {
        U"Hello, world!",       // ASCII
        U"Здравствуй, мир!",    // Cyrillic
        U"こんにちは世界",      // Japanese
        U"😀😁😂🤣😃😄😅😆"     // emojis
    };
    for (char32_t const* text : accepted)
    {
        REQUIRE_NOTHROW(ensure_valid_utf32(text));
    }

    // Strings containing a value greater than U+10FFFF.
    char32_t const* const rejected[] = {
        U"\x110000",
        U"\x1FFFFF",
        U"\xFFFFFFFF"
    };
    for (char32_t const* text : rejected)
    {
        REQUIRE_THROWS_AS(ensure_valid_utf32(text), soci_error);
    }
}
utf16.push_back(char16_t(0xDE00)); // low surrogate + REQUIRE(utf16_to_utf32(utf16) == U"\U0001F600"); // 😀 + + // Invalid conversion (should throw an exception) + std::u16string invalid_utf16; + invalid_utf16.push_back(0xD800); // lone high surrogate + REQUIRE_THROWS_AS(utf16_to_utf32(invalid_utf16), soci_error); +} + +TEST_CASE("UTF-32 to UTF-16 conversion tests", "[unicode]") +{ + // Valid conversion tests + REQUIRE(utf32_to_utf16(U"Hello, world!") == u"Hello, world!"); + REQUIRE(utf32_to_utf16(U"こんにちは世界") == u"こんにちは世界"); + REQUIRE(utf32_to_utf16(U"😀😁😂🤣😃😄😅😆") == u"😀😁😂🤣😃😄😅😆"); + + // Edge cases + std::u32string utf32 = U"\U0001F600"; // 😀 + std::u16string expected_utf16; + expected_utf16.push_back(0xD83D); // high surrogate + expected_utf16.push_back(0xDE00); // low surrogate + REQUIRE(utf32_to_utf16(utf32) == expected_utf16); + + // Invalid conversion (should throw an exception) + std::u32string invalid_utf32 = U"\x110000"; // Invalid code point + REQUIRE_THROWS_AS(utf32_to_utf16(invalid_utf32), soci_error); +} + +TEST_CASE("UTF-8 to UTF-16 conversion tests", "[unicode]") +{ + // Valid conversion tests + REQUIRE(utf8_to_utf16(u8"Hello, world!") == u"Hello, world!"); + REQUIRE(utf8_to_utf16(u8"こんにちは世界") == u"こんにちは世界"); + REQUIRE(utf8_to_utf16(u8"😀😁😂🤣😃😄😅😆") == u"😀😁😂🤣😃😄😅😆"); + + // Edge cases + std::string utf8 = "\xF0\x9F\x98\x80"; // 😀 + std::u16string expected_utf16 = u"\xD83D\xDE00"; + REQUIRE(utf8_to_utf16(utf8) == expected_utf16); + + // Invalid conversion (should throw an exception) + std::string invalid_utf8 = "\xF0\x28\x8C\xBC"; // Invalid UTF-8 sequence + REQUIRE_THROWS_AS(utf8_to_utf16(invalid_utf8), soci_error); +} + +TEST_CASE("UTF-16 to UTF-8 conversion tests", "[unicode]") +{ + // Valid conversion tests + REQUIRE(utf16_to_utf8(u"Hello, world!") == u8"Hello, world!"); + REQUIRE(utf16_to_utf8(u"こんにちは世界") == u8"こんにちは世界"); + REQUIRE(utf16_to_utf8(u"😀😁😂🤣😃😄😅😆") == u8"😀😁😂🤣😃😄😅😆"); + + // Edge cases + std::u16string utf16; + utf16.push_back(0xD83D); // high 
surrogate + utf16.push_back(0xDE00); // low surrogate + REQUIRE(utf16_to_utf8(utf16) == "\xF0\x9F\x98\x80"); // 😀 + + // Invalid conversion (should throw an exception) + std::u16string invalid_utf16; + invalid_utf16.push_back(0xD800); // lone high surrogate + REQUIRE_THROWS_AS(utf16_to_utf8(invalid_utf16), soci_error); +} + +TEST_CASE("UTF-8 to UTF-32 conversion tests", "[unicode]") +{ + // Valid conversion tests + REQUIRE(utf8_to_utf32(u8"Hello, world!") == U"Hello, world!"); + REQUIRE(utf8_to_utf32(u8"こんにちは世界") == U"こんにちは世界"); + REQUIRE(utf8_to_utf32(u8"😀😁😂🤣😃😄😅😆") == U"😀😁😂🤣😃😄😅😆"); + + // Edge cases + std::string utf8 = "\xF0\x9F\x98\x80"; // 😀 + REQUIRE(utf8_to_utf32(utf8) == U"\U0001F600"); + + // Invalid conversion (should throw an exception) + std::string invalid_utf8 = "\xF0\x28\x8C\xBC"; // Invalid UTF-8 sequence + REQUIRE_THROWS_AS(utf8_to_utf32(invalid_utf8), soci_error); +} + +TEST_CASE("UTF-32 to UTF-8 conversion tests", "[unicode]") +{ + // Valid conversion tests + REQUIRE(utf32_to_utf8(U"Hello, world!") == u8"Hello, world!"); + REQUIRE(utf32_to_utf8(U"こんにちは世界") == u8"こんにちは世界"); + REQUIRE(utf32_to_utf8(U"😀😁😂🤣😃😄😅😆") == u8"😀😁😂🤣😃😄😅😆"); + + // Edge cases + std::u32string utf32 = U"\U0001F600"; // 😀 + REQUIRE(utf32_to_utf8(utf32) == "\xF0\x9F\x98\x80"); + + // Invalid conversion (should throw an exception) + std::u32string invalid_utf32 = U"\x110000"; // Invalid code point + REQUIRE_THROWS_AS(utf32_to_utf8(invalid_utf32), soci_error); + + // Invalid conversion (should throw an exception) + std::u32string invalid_wide; + invalid_wide.push_back(0xD800); // lone high surrogate + REQUIRE_THROWS_AS(utf32_to_utf8(invalid_wide), soci_error); +} + +TEST_CASE("Empty string tests", "[unicode]") +{ + REQUIRE(utf16_to_utf8(u"") == u8""); + REQUIRE(utf32_to_utf8(U"") == u8""); + REQUIRE(utf8_to_utf16(u8"") == u""); + REQUIRE(utf8_to_utf32(u8"") == U""); +} + +TEST_CASE("Strings with Byte Order Marks (BOMs)", "[unicode]") +{ + // UTF-8 BOM + const std::string utf8_bom = 
"\xEF\xBB\xBF"; + // UTF-16 BOM (Little Endian) + const std::u16string utf16_bom = u"\xFEFF"; + // UTF-32 BOM (Little Endian) + const std::u32string utf32_bom = U"\x0000FEFF"; + + const std::string content = "Hello, world!"; + const std::u16string content16 = u"Hello, world!"; + const std::u32string content32 = U"Hello, world!"; + + SECTION("UTF-8 to UTF-16") + { + std::u16string result = utf8_to_utf16(utf8_bom + content); + REQUIRE(result == utf16_bom + content16); + } + + SECTION("UTF-8 to UTF-32") + { + std::u32string result = utf8_to_utf32(utf8_bom + content); + REQUIRE(result == utf32_bom + content32); + } + + SECTION("UTF-16 to UTF-8") + { + std::string result = utf16_to_utf8(utf16_bom + content16); + REQUIRE(result == utf8_bom + content); + } + + SECTION("UTF-16 to UTF-32") + { + std::u32string result = utf16_to_utf32(utf16_bom + content16); + REQUIRE(result == utf32_bom + content32); + } + + SECTION("UTF-32 to UTF-8") + { + std::string result = utf32_to_utf8(utf32_bom + content32); + REQUIRE(result == utf8_bom + content); + } + + SECTION("UTF-32 to UTF-16") + { + std::u16string result = utf32_to_utf16(utf32_bom + content32); + REQUIRE(result == utf16_bom + content16); + } + + SECTION("Roundtrip conversions") + { + // UTF-8 -> UTF-16 -> UTF-8 + REQUIRE(utf16_to_utf8(utf8_to_utf16(utf8_bom + content)) == utf8_bom + content); + + // UTF-8 -> UTF-32 -> UTF-8 + REQUIRE(utf32_to_utf8(utf8_to_utf32(utf8_bom + content)) == utf8_bom + content); + + // UTF-16 -> UTF-8 -> UTF-16 + REQUIRE(utf8_to_utf16(utf16_to_utf8(utf16_bom + content16)) == utf16_bom + content16); + + // UTF-16 -> UTF-32 -> UTF-16 + REQUIRE(utf32_to_utf16(utf16_to_utf32(utf16_bom + content16)) == utf16_bom + content16); + + // UTF-32 -> UTF-8 -> UTF-32 + REQUIRE(utf8_to_utf32(utf32_to_utf8(utf32_bom + content32)) == utf32_bom + content32); + + // UTF-32 -> UTF-16 -> UTF-32 + REQUIRE(utf16_to_utf32(utf32_to_utf16(utf32_bom + content32)) == utf32_bom + content32); + } +} + +TEST_CASE("Strings with 
invalid code unit sequences", "[unicode]") +{ + REQUIRE_THROWS_AS(ensure_valid_utf16(u"\xD800\xD800"), soci_error); + REQUIRE_THROWS_AS(ensure_valid_utf32(U"\xD800"), soci_error); +} + +TEST_CASE("Strings with overlong encodings", "[unicode]") +{ + REQUIRE_THROWS_AS(ensure_valid_utf8("\xC0\xAF"), soci_error); +} + +TEST_CASE("Strings with non-characters", "[unicode]") +{ + REQUIRE_THROWS_AS(ensure_valid_utf32(U"\xFFFE"), soci_error); +} + +TEST_CASE("Strings with right-to-left characters", "[unicode]") +{ + REQUIRE_NOTHROW(ensure_valid_utf8(u8"مرحبا بالعالم")); +} + +TEST_CASE("UTF-8 to wide string conversion tests", "[unicode]") +{ + // Valid conversion tests + REQUIRE(utf8_to_wide(u8"Hello, world!") == L"Hello, world!"); + REQUIRE(utf8_to_wide(u8"こんにちは世界") == L"こんにちは世界"); + REQUIRE(utf8_to_wide(u8"😀😁😂🤣😃😄😅😆") == L"😀😁😂🤣😃😄😅😆"); + + // Edge cases + std::string utf8 = "\xF0\x9F\x98\x80"; // 😀 + std::wstring expected_wide = L"\U0001F600"; + REQUIRE(utf8_to_wide(utf8) == expected_wide); + + // Invalid conversion (should throw an exception) + std::string invalid_utf8 = "\xF0\x28\x8C\xBC"; // Invalid UTF-8 sequence + REQUIRE_THROWS_AS(utf8_to_wide(invalid_utf8), soci_error); +} + +TEST_CASE("Wide string to UTF-8 conversion tests", "[unicode]") +{ + // Valid conversion tests + REQUIRE(wide_to_utf8(L"Hello, world!") == u8"Hello, world!"); + REQUIRE(wide_to_utf8(L"こんにちは世界") == u8"こんにちは世界"); + REQUIRE(wide_to_utf8(L"😀😁😂🤣😃😄😅😆") == u8"😀😁😂🤣😃😄😅😆"); + + // Edge cases + std::wstring wide = L"\U0001F600"; // 😀 + REQUIRE(wide_to_utf8(wide) == "\xF0\x9F\x98\x80"); + + // Invalid conversion (should throw an exception) + std::wstring invalid_wide; + invalid_wide.push_back(0xD800); // lone high surrogate + REQUIRE_THROWS_AS(wide_to_utf8(invalid_wide), soci_error); +} + +TEST_CASE("UTF-16 to wide string conversion tests", "[unicode]") +{ + // Valid conversion tests + REQUIRE(utf16_to_wide(u"Hello, world!") == L"Hello, world!"); + REQUIRE(utf16_to_wide(u"こんにちは世界") == L"こんにちは世界"); + 
// Checks wide_to_utf16() with valid text and a lone surrogate.
TEST_CASE("Wide string to UTF-16 conversion tests", "[unicode]")
{
    // Same text spelled as wide input and as the UTF-16 result expected.
    struct sample
    {
        wchar_t const* wide;
        char16_t const* utf16;
    };
    sample const samples[] = {
        { L"Hello, world!", u"Hello, world!" },
        { L"こんにちは世界", u"こんにちは世界" },
        { L"😀😁😂🤣😃😄😅😆", u"😀😁😂🤣😃😄😅😆" }
    };
    for (sample const& s : samples)
    {
        REQUIRE(wide_to_utf16(s.wide) == s.utf16);
    }

    // A character outside of the BMP must become a surrogate pair.
    std::wstring const grinning_face = L"\U0001F600"; // 😀
    REQUIRE(wide_to_utf16(grinning_face) == u"\xD83D\xDE00");

    // A high surrogate without its low surrogate can't be converted.
    std::wstring const lone_high_surrogate(1, static_cast<wchar_t>(0xD800));
    REQUIRE_THROWS_AS(wide_to_utf16(lone_high_surrogate), soci_error);
}
+ ../common/test-unicode.cpp CONNSTR "dummy") diff --git a/tests/odbc/test-odbc-mssql.cpp b/tests/odbc/test-odbc-mssql.cpp index db7de057..72736f54 100644 --- a/tests/odbc/test-odbc-mssql.cpp +++ b/tests/odbc/test-odbc-mssql.cpp @@ -77,6 +77,58 @@ TEST_CASE("MS SQL long string", "[odbc][mssql][long]") ); } +struct wide_text_table_creator : public table_creator_base +{ + explicit wide_text_table_creator(soci::session &sql) + : table_creator_base(sql) + { + sql << "create table soci_test (" + "wide_text nvarchar(40) null" + ")"; + } +}; + +TEST_CASE("MS SQL wide string", "[odbc][mssql][wstring]") +{ + soci::session sql(backEnd, connectString); + + wide_text_table_creator create_wide_text_table(sql); + + std::wstring const str_in = L"Привет, SOCI!"; + + sql << "insert into soci_test(wide_text) values(:str)", use(str_in); + + std::wstring str_out; + sql << "select wide_text from soci_test", into(str_out); + + CHECK(str_out == str_in); +} + +TEST_CASE("MS SQL wide string vector", "[odbc][mssql][vector][wstring]") +{ + soci::session sql(backEnd, connectString); + + wide_text_table_creator create_wide_text_table(sql); + + std::vector const str_in = { + L"Привет, SOCI!", + L"Привет, World!", + L"Привет, Universe!", + L"Привет, Galaxy!"}; + + sql << "insert into soci_test(wide_text) values(:str)", use(str_in); + + std::vector str_out(4); + + sql << "select wide_text from soci_test", into(str_out); + + CHECK(str_out.size() == str_in.size()); + for (std::size_t i = 0; i != str_in.size(); ++i) + { + CHECK(str_out[i] == str_in[i]); + } +} + // DDL Creation objects for common tests struct table_creator_one : public table_creator_base {