Merge branch 'wstring'

Add std::wstring support for ODBC backend.

See #1179.
This commit is contained in:
Vadim Zeitlin
2025-01-24 16:37:17 +01:00
31 changed files with 1672 additions and 5 deletions

View File

@@ -40,6 +40,9 @@ Backend-specific changes:
- Add "nocreate" option to avoid creating a new database file (#1021).
- Improve column names to SOCI database types mapping (#1120).
- ODBC
- Add support for using wide strings (#1179).
History of the changes in the previous versions:

View File

@@ -1,6 +1,6 @@
# Backends reference
This part of the documentation is provided for those who want towrite (and contribute!) their
This part of the documentation is provided for those who want to write (and contribute!) their
own backends. It is anyway recommended that authors of new backends see the code of some existing
backend for hints on how things are really done.
@@ -28,6 +28,7 @@ enum data_type
enum db_type
{
db_string,
db_wstring,
db_int8,
db_uint8,
db_int16,
@@ -50,6 +51,7 @@ enum exchange_type
{
x_char,
x_stdstring,
x_stdwstring,
x_int8,
x_uint8,
x_int16,

View File

@@ -13,7 +13,7 @@ The following types are commonly used in the rest of the interface:
```cpp
// data types, as seen by the user
enum db_type { db_string, db_date, db_double, db_int8, db_uint8, db_int16, db_uint16, db_int32, db_uint32, db_int64, db_uint64 };
enum db_type { db_string, db_wstring, db_date, db_double, db_int8, db_uint8, db_int16, db_uint16, db_int32, db_uint32, db_int64, db_uint64 };
// deprecated data types enum which may be still used but is less precise than db_type
enum data_type { dt_string, dt_date, dt_double, dt_integer, dt_long_long, dt_unsigned_long_long };

View File

@@ -75,6 +75,7 @@ For the ODBC backend, this type mapping is:
| SQL_INTEGER | db_int32 | int32_t |
| SQL_BIGINT | db_int64 | int64_t |
| SQL_CHAR, SQL_VARCHAR | db_string | std::string |
| SQL_WCHAR, SQL_WVARCHAR, SQL_WLONGVARCHAR | db_wstring | std::wstring |
| SQL_TYPE_DATE, SQL_TYPE_TIME, SQL_TYPE_TIMESTAMP | db_date | std::tm |
Not all ODBC drivers support all datatypes.

View File

@@ -36,6 +36,12 @@ struct exchange_type_traits<x_stdstring>
typedef std::string value_type;
};
// Maps the x_stdwstring exchange type to the C++ type std::wstring.
template <>
struct exchange_type_traits<x_stdwstring>
{
    using value_type = std::wstring;
};
template <>
struct exchange_type_traits<x_int8>
{

View File

@@ -33,6 +33,8 @@ inline std::size_t get_vector_size(exchange_type e, void *data)
return exchange_vector_type_cast<x_char>(data).size();
case x_stdstring:
return exchange_vector_type_cast<x_stdstring>(data).size();
case x_stdwstring:
return exchange_vector_type_cast<x_stdwstring>(data).size();
case x_int8:
return exchange_vector_type_cast<x_int8>(data).size();
case x_uint8:
@@ -76,6 +78,9 @@ inline void resize_vector(exchange_type e, void *data, std::size_t newSize)
case x_stdstring:
exchange_vector_type_cast<x_stdstring>(data).resize(newSize);
return;
case x_stdwstring:
exchange_vector_type_cast<x_stdwstring>(data).resize(newSize);
return;
case x_int8:
exchange_vector_type_cast<x_int8>(data).resize(newSize);
return;
@@ -131,6 +136,7 @@ inline std::string& vector_string_value(exchange_type e, void *data, std::size_t
return exchange_vector_type_cast<x_xmltype>(data).at(ind).value;
case x_longstring:
return exchange_vector_type_cast<x_longstring>(data).at(ind).value;
case x_stdwstring:
case x_char:
case x_int8:
case x_uint8:

View File

@@ -155,6 +155,13 @@ struct exchange_traits<std::string>
enum { x_type = x_stdstring };
};
// std::wstring is exchanged as a basic type, identified by x_stdwstring.
template <>
struct exchange_traits<std::wstring>
{
    using type_family = basic_type_tag;
    enum { x_type = x_stdwstring };
};
template <>
struct exchange_traits<std::tm>
{

View File

@@ -43,6 +43,16 @@ namespace details
{
return reinterpret_cast<SQLCHAR*>(const_cast<char*>(s.c_str()));
}
// Reinterpret the NUL-terminated buffer of a wide string as a non-const
// SQLWCHAR pointer. The characters are not copied, only the constness and the
// pointer type are changed.
inline SQLWCHAR* sqlchar_cast(std::wstring const& s)
{
    wchar_t* const chars = const_cast<wchar_t*>(s.c_str());
    return reinterpret_cast<SQLWCHAR*>(chars);
}

// Same as above, but for a UTF-16 string.
inline SQLWCHAR* sqlchar_cast(std::u16string const& s)
{
    char16_t* const chars = const_cast<char16_t*>(s.c_str());
    return reinterpret_cast<SQLWCHAR*>(chars);
}
}
// Option allowing to specify the "driver completion" parameter of
@@ -192,6 +202,10 @@ private:
SQLSMALLINT& sqlType,
SQLSMALLINT& cType);
void copy_from_string(const std::wstring& s,
SQLLEN& size,
SQLSMALLINT& sqlType,
SQLSMALLINT& cType);
};
struct odbc_vector_use_type_backend : details::vector_use_type_backend,

View File

@@ -11,6 +11,7 @@
#include "soci/statement.h"
#include "soci/into-type.h"
#include "soci/use-type.h"
#include "soci-unicode.h"
// std
#include <sstream>

View File

@@ -26,6 +26,7 @@ namespace soci
enum db_type
{
db_string,
db_wstring,
db_int8,
db_uint8,
db_int16,
@@ -61,6 +62,7 @@ enum exchange_type
{
x_char,
x_stdstring,
x_stdwstring,
x_int8,
x_uint8,
x_int16,
@@ -271,6 +273,9 @@ public:
case db_uint64: return dt_unsigned_long_long;
case db_blob: return dt_blob;
case db_xml: return dt_xml;
case db_wstring:
throw soci_error("unable to convert value to data_type");
}
// unreachable

426
include/soci/soci-unicode.h Normal file
View File

@@ -0,0 +1,426 @@
#ifndef SOCI_UNICODE_H_INCLUDED
#define SOCI_UNICODE_H_INCLUDED
#include "soci/error.h"
#include <string>
#include <wchar.h>
// Define SOCI_WCHAR_T_IS_UTF32 if wchar_t is wider than 16 bits (e.g., on Unix/Linux)
#if WCHAR_MAX > 0xFFFFu
#define SOCI_WCHAR_T_IS_UTF32
#endif
namespace soci
{
namespace details
{
#if defined(SOCI_WCHAR_T_IS_UTF32)

static_assert(sizeof(wchar_t) == sizeof(char32_t), "wchar_t must be 32 bits");

// View the characters of a wide string as char32_t code units, without
// copying them.
inline char32_t const* wide_to_char_type(std::wstring const& ws)
{
    wchar_t const* const chars = ws.data();
    return reinterpret_cast<char32_t const*>(chars);
}

#else // !SOCI_WCHAR_T_IS_UTF32

static_assert(sizeof(wchar_t) == sizeof(char16_t), "wchar_t must be 16 bits");

// View the characters of a wide string as char16_t code units, without
// copying them.
inline char16_t const* wide_to_char_type(std::wstring const& ws)
{
    wchar_t const* const chars = ws.data();
    return reinterpret_cast<char16_t const*>(chars);
}

#endif // SOCI_WCHAR_T_IS_UTF32
// Throw soci_error unless @a available is at least as big as @a required.
inline void throw_if_too_small(std::size_t required, std::size_t available)
{
    if (available >= required)
        return;

    throw soci_error("Output buffer is too small");
}
/**
    Verify that the given buffer holds valid UTF-8.

    A soci_error is thrown if the string is not a valid UTF-8 string.

    @param utf8 The string of length @a len.
    @param len The length of the string.
 */
void SOCI_DECL ensure_valid_utf8(char const* utf8, std::size_t len);

/// @overload
inline void ensure_valid_utf8(std::string const& utf8)
{
    char const* const first = utf8.data();
    ensure_valid_utf8(first, utf8.size());
}
/**
    Verify that the given buffer holds valid UTF-16.

    A soci_error is thrown if the string is not a valid UTF-16 string.

    @param s The UTF-16 string to check.
    @param len The length of the string in characters.
 */
void SOCI_DECL ensure_valid_utf16(char16_t const* s, std::size_t len);

/// @overload
inline void ensure_valid_utf16(std::u16string const& utf16)
{
    char16_t const* const first = utf16.data();
    ensure_valid_utf16(first, utf16.size());
}
/**
Check if a given string is a valid UTF-32 encoded string.
Throws soci_error if the string is not a valid UTF-32 string.
This function doesn't return anything, an invalid string is only reported by
the exception.
@param s The UTF-32 string to check.
@param len The length of the string in characters.
*/
void SOCI_DECL ensure_valid_utf32(char32_t const* s, std::size_t len);
/// @overload
/// Checks the entire contents of the given string object.
inline void ensure_valid_utf32(std::u32string const& utf32)
{
ensure_valid_utf32(utf32.data(), utf32.size());
}
/**
    Convert a UTF-8 encoded string to a UTF-16 encoded string.

    The input string must be a valid UTF-8 encoded string of the given length
    (not necessarily NUL-terminated). The output buffer must either contain
    enough space to store @a len16 characters or be @c nullptr to just compute
    the length required for conversion (in which case @a len16 is ignored).

    @param utf8 The input UTF-8 encoded string.
    @param len8 The length of the input string.
    @param out16 The output buffer or nullptr to just compute the required
        length.
    @param len16 The length of the output buffer if it is non-null.
    @return The length of the UTF-16 output.
    @throws soci_error if the input string contains invalid UTF-8 encoding or
        if the required length is greater than @a len16 when @a out16 is not @c
        nullptr.
 */
std::size_t SOCI_DECL
utf8_to_utf16(char const* utf8, std::size_t len8,
              char16_t* out16, std::size_t len16);

/// @overload
/// Converts @a len bytes starting at @a s and returns the result as a new
/// string.
inline std::u16string utf8_to_utf16(char const* s, std::size_t len)
{
    // The first call only computes the required length.
    auto const len16 = utf8_to_utf16(s, len, nullptr, 0);

    std::u16string utf16(len16, u'\0');

    // Write through operator[] rather than const_cast'ing data(): before
    // C++17 the array returned by data() must not be modified.
    utf8_to_utf16(s, len, &utf16[0], len16);

    return utf16;
}

/// @overload
inline std::u16string utf8_to_utf16(std::string const& utf8)
{
    return utf8_to_utf16(utf8.data(), utf8.size());
}

/// @overload
/// The input must be NUL-terminated, its length is determined automatically.
inline std::u16string utf8_to_utf16(char const* s)
{
    return utf8_to_utf16(s, std::char_traits<char>::length(s));
}
/**
    Convert a UTF-16 encoded string to a UTF-8 encoded string.

    Semantics of this function are the same as for utf8_to_utf16(), see its
    documentation for more details.
 */
std::size_t SOCI_DECL
utf16_to_utf8(char16_t const* utf16, std::size_t len16,
              char* out8, std::size_t len8);

/// @overload
/// Converts @a len characters starting at @a s and returns the result as a
/// new string.
inline std::string utf16_to_utf8(char16_t const* s, std::size_t len)
{
    // The first call only computes the required length.
    auto const len8 = utf16_to_utf8(s, len, nullptr, 0);

    std::string utf8(len8, '\0');

    // Write through operator[] rather than const_cast'ing data(): before
    // C++17 the array returned by data() must not be modified.
    utf16_to_utf8(s, len, &utf8[0], len8);

    return utf8;
}

/// @overload
inline std::string utf16_to_utf8(std::u16string const& utf16)
{
    return utf16_to_utf8(utf16.data(), utf16.size());
}

/// @overload
/// The input must be NUL-terminated, its length is determined automatically.
inline std::string utf16_to_utf8(char16_t const* s)
{
    return utf16_to_utf8(s, std::char_traits<char16_t>::length(s));
}
/**
    Convert a UTF-16 encoded string to a UTF-32 encoded string.

    Semantics of this function are the same as for utf8_to_utf16(), see its
    documentation for more details.
 */
std::size_t SOCI_DECL
utf16_to_utf32(char16_t const* utf16, std::size_t len16,
               char32_t* out32, std::size_t len32);

/// @overload
/// Converts @a len characters starting at @a s and returns the result as a
/// new string.
inline std::u32string utf16_to_utf32(char16_t const* s, std::size_t len)
{
    // The first call only computes the required length.
    auto const len32 = utf16_to_utf32(s, len, nullptr, 0);

    std::u32string utf32(len32, U'\0');

    // Write through operator[] rather than const_cast'ing data(): before
    // C++17 the array returned by data() must not be modified.
    utf16_to_utf32(s, len, &utf32[0], len32);

    return utf32;
}

/// @overload
inline std::u32string utf16_to_utf32(std::u16string const& utf16)
{
    return utf16_to_utf32(utf16.data(), utf16.size());
}

/// @overload
/// The input must be NUL-terminated, its length is determined automatically.
inline std::u32string utf16_to_utf32(char16_t const* s)
{
    return utf16_to_utf32(s, std::char_traits<char16_t>::length(s));
}
/**
    Convert a UTF-32 encoded string to a UTF-16 encoded string.

    Semantics of this function are the same as for utf8_to_utf16(), see its
    documentation for more details.
 */
std::size_t SOCI_DECL
utf32_to_utf16(char32_t const* utf32, std::size_t len32,
               char16_t* out16, std::size_t len16);

/// @overload
/// Converts @a len characters starting at @a utf32 and returns the result as
/// a new string.
inline std::u16string utf32_to_utf16(char32_t const* utf32, std::size_t len)
{
    // The first call only computes the required length.
    auto const len16 = utf32_to_utf16(utf32, len, nullptr, 0);

    std::u16string utf16(len16, u'\0');

    // Write through operator[] rather than const_cast'ing data(): before
    // C++17 the array returned by data() must not be modified.
    utf32_to_utf16(utf32, len, &utf16[0], len16);

    return utf16;
}

/// @overload
inline std::u16string utf32_to_utf16(std::u32string const& utf32)
{
    return utf32_to_utf16(utf32.data(), utf32.size());
}

/// @overload
/// The input must be NUL-terminated, its length is determined automatically.
inline std::u16string utf32_to_utf16(char32_t const* s)
{
    return utf32_to_utf16(s, std::char_traits<char32_t>::length(s));
}
/**
    Convert a UTF-8 encoded string to a UTF-32 encoded string.

    Semantics of this function are the same as for utf8_to_utf16(), see its
    documentation for more details.
 */
std::size_t SOCI_DECL
utf8_to_utf32(char const* utf8, std::size_t len8,
              char32_t* out32, std::size_t len32);

/// @overload
/// Converts @a len bytes starting at @a utf8 and returns the result as a new
/// string.
inline std::u32string utf8_to_utf32(char const* utf8, std::size_t len)
{
    // The first call only computes the required length.
    auto const len32 = utf8_to_utf32(utf8, len, nullptr, 0);

    std::u32string utf32(len32, U'\0');

    // Write through operator[] rather than const_cast'ing data(): before
    // C++17 the array returned by data() must not be modified.
    utf8_to_utf32(utf8, len, &utf32[0], len32);

    return utf32;
}

/// @overload
inline std::u32string utf8_to_utf32(std::string const& utf8)
{
    return utf8_to_utf32(utf8.data(), utf8.size());
}

/// @overload
/// The input must be NUL-terminated, its length is determined automatically.
inline std::u32string utf8_to_utf32(char const* s)
{
    return utf8_to_utf32(s, std::char_traits<char>::length(s));
}
/**
    Convert a UTF-32 encoded string to a UTF-8 encoded string.

    Semantics of this function are the same as for utf8_to_utf16(), see its
    documentation for more details.
 */
std::size_t SOCI_DECL
utf32_to_utf8(char32_t const* utf32, std::size_t len32,
              char* out8, std::size_t len8);

/// @overload
/// Converts @a len characters starting at @a s and returns the result as a
/// new string.
inline std::string utf32_to_utf8(char32_t const* s, std::size_t len)
{
    // The first call only computes the required length.
    auto const len8 = utf32_to_utf8(s, len, nullptr, 0);

    std::string utf8(len8, '\0');

    // Write through operator[] rather than const_cast'ing data(): before
    // C++17 the array returned by data() must not be modified.
    utf32_to_utf8(s, len, &utf8[0], len8);

    return utf8;
}

/// @overload
inline std::string utf32_to_utf8(std::u32string const& utf32)
{
    return utf32_to_utf8(utf32.data(), utf32.size());
}

/// @overload
/// The input must be NUL-terminated, its length is determined automatically.
inline std::string utf32_to_utf8(char32_t const* s)
{
    return utf32_to_utf8(s, std::char_traits<char32_t>::length(s));
}
/**
    Convert a UTF-8 encoded string to a wide string (wstring).

    This is equivalent to either utf8_to_utf32() or utf8_to_utf16() depending
    on the platform.

    @param s The input UTF-8 encoded string.
    @param len The length of the input string.
    @return The wide string.
 */
inline std::wstring utf8_to_wide(char const* s, std::size_t len)
{
    // In both branches the first call only computes the required length and
    // the second one fills the string. The output is written through
    // operator[] because, before C++17, the array returned by data() must not
    // be modified.
#if defined(SOCI_WCHAR_T_IS_UTF32)
    auto const wlen = utf8_to_utf32(s, len, nullptr, 0);
    std::wstring ws(wlen, L'\0');
    utf8_to_utf32(s, len, reinterpret_cast<char32_t*>(&ws[0]), wlen);
    return ws;
#else // !SOCI_WCHAR_T_IS_UTF32
    auto const wlen = utf8_to_utf16(s, len, nullptr, 0);
    std::wstring ws(wlen, L'\0');
    utf8_to_utf16(s, len, reinterpret_cast<char16_t*>(&ws[0]), wlen);
    return ws;
#endif // SOCI_WCHAR_T_IS_UTF32
}

/// @overload
inline std::wstring utf8_to_wide(std::string const& utf8)
{
    return utf8_to_wide(utf8.data(), utf8.size());
}
/**
    Convert a wide string (wstring) to a UTF-8 encoded string.

    Depending on the platform, this forwards to either utf32_to_utf8() or
    utf16_to_utf8().

    @param ws The wide string.
    @return The UTF-8 encoded string.
 */
inline std::string wide_to_utf8(std::wstring const& ws)
{
    // The type of the code units depends on the platform width of wchar_t.
    auto const* const units = wide_to_char_type(ws);
    std::size_t const count = ws.size();

#if defined(SOCI_WCHAR_T_IS_UTF32)
    return utf32_to_utf8(units, count);
#else // !SOCI_WCHAR_T_IS_UTF32
    return utf16_to_utf8(units, count);
#endif // SOCI_WCHAR_T_IS_UTF32
}
/**
    Convert a UTF-16 encoded string to a wide string (wstring).

    This is equivalent to either utf16_to_utf32() or direct copy depending on
    the platform.

    @param s The UTF-16 encoded string.
    @param len The length of the input string.
    @return The wide string.
    @throws soci_error if the input string contains invalid UTF-16 encoding.
 */
inline std::wstring utf16_to_wide(char16_t const* s, std::size_t len)
{
#if defined(SOCI_WCHAR_T_IS_UTF32)
    // Convert UTF-16 to UTF-32 which is used by wstring: the first call only
    // computes the required length.
    auto const wlen = utf16_to_utf32(s, len, nullptr, 0);
    std::wstring ws(wlen, L'\0');

    // Write through operator[] rather than const_cast'ing data(): before
    // C++17 the array returned by data() must not be modified.
    utf16_to_utf32(s, len, reinterpret_cast<char32_t*>(&ws[0]), wlen);
    return ws;
#else // !SOCI_WCHAR_T_IS_UTF32
    // Perform validation even though it's already UTF-16
    ensure_valid_utf16(s, len);
    wchar_t const* ws = reinterpret_cast<wchar_t const*>(s);
    return std::wstring(ws, ws + len);
#endif // SOCI_WCHAR_T_IS_UTF32
}

/// @overload
/// The input must be NUL-terminated, its length is determined automatically.
inline std::wstring utf16_to_wide(char16_t const* s)
{
    return utf16_to_wide(s, std::char_traits<char16_t>::length(s));
}

/// @overload
inline std::wstring utf16_to_wide(std::u16string const& utf16)
{
    return utf16_to_wide(utf16.data(), utf16.size());
}
/**
    Convert a wide string (wstring) to a UTF-16 encoded string.

    This is equivalent to either utf32_to_utf16() or direct copy depending on
    the platform.

    @param ws The wide string.
    @param out The output buffer or nullptr to just compute the required length.
    @param len The output buffer length in characters (ignored if @a out is @c
        nullptr).
    @return The length of the UTF-16 output.
    @throws soci_error if the input string contains invalid wide characters or
        if the output buffer is too small when @a out is not @c nullptr.
 */
inline
std::size_t wide_to_utf16(std::wstring const& ws, char16_t* out, std::size_t len)
{
#if defined(SOCI_WCHAR_T_IS_UTF32)
    // Convert UTF-32 string to UTF-16.
    return utf32_to_utf16(wide_to_char_type(ws), ws.length(), out, len);
#else // !SOCI_WCHAR_T_IS_UTF32
    // It's already in UTF-16, just copy, but check that it's valid and that we
    // have enough space -- or just return the length if not asked to copy.
    auto const wlen = ws.length();
    if (out)
    {
        throw_if_too_small(wlen, len);

        char16_t const* const src = wide_to_char_type(ws);
        ensure_valid_utf16(src, wlen);

        // Use char_traits from <string>, which this header includes, instead
        // of std::memcpy() for which <cstring> is not included here.
        std::char_traits<char16_t>::copy(out, src, wlen);
    }

    return wlen;
#endif // SOCI_WCHAR_T_IS_UTF32
}

/// @overload
/// Returns the converted text as a new string.
inline std::u16string wide_to_utf16(std::wstring const& ws)
{
    // The first call only computes the required length.
    auto const wlen = wide_to_utf16(ws, nullptr, 0);

    std::u16string utf16(wlen, u'\0');

    // Write through operator[] rather than const_cast'ing data(): before
    // C++17 the array returned by data() must not be modified.
    wide_to_utf16(ws, &utf16[0], wlen);

    return utf16;
}
} // namespace details
} // namespace soci
#endif // SOCI_UNICODE_H_INCLUDED

View File

@@ -131,6 +131,7 @@ struct soci_cast<
union type_holder
{
std::string* s;
std::wstring* ws;
int8_t* i8;
int16_t* i16;
int32_t* i32;
@@ -153,6 +154,12 @@ struct type_holder_trait<std::string>
static const db_type type = db_string;
};
// Associates std::wstring with the db_wstring database type.
template <>
struct type_holder_trait<std::wstring>
{
static const db_type type = db_wstring;
};
template <>
struct type_holder_trait<int8_t>
{
@@ -305,6 +312,9 @@ public:
case db_string:
delete val_.s;
break;
case db_wstring:
delete val_.ws;
break;
}
}
@@ -345,6 +355,8 @@ public:
case db_xml:
case db_string:
return soci_cast<T, std::string>::cast(*val_.s);
case db_wstring:
return soci_cast<T, std::wstring>::cast(*val_.ws);
}
throw std::bad_cast();
@@ -380,6 +392,8 @@ public:
case db_xml:
case db_string:
return soci_return_same<T, std::string>::value(*val_.s);
case db_wstring:
return soci_return_same<T, std::wstring>::value(*val_.ws);
}
throw std::bad_cast();
@@ -430,6 +444,9 @@ private:
case db_string:
val_.s = static_cast<std::string*>(val);
return;
case db_wstring:
val_.ws = static_cast<std::wstring*>(val);
return;
}
// This should be unreachable

View File

@@ -7,6 +7,7 @@
#define SOCI_ODBC_SOURCE
#include "soci/soci-platform.h"
#include "soci/soci-unicode.h"
#include "soci/odbc/soci-odbc.h"
#include "soci-compiler.h"
#include "soci-cstrtoi.h"
@@ -50,6 +51,15 @@ void odbc_standard_into_type_backend::define_by_pos(
buf_ = new char[size];
data = buf_;
break;
case x_stdwstring:
odbcType_ = SQL_C_WCHAR;
// Do exactly the same thing here as for x_stdstring above.
size = static_cast<SQLUINTEGER>(statement_.column_size(position_));
size = (size >= ODBC_MAX_COL_SIZE || size == 0) ? odbc_max_buffer_length : size;
size += sizeof(SQLWCHAR);
buf_ = new char[size];
data = buf_;
break;
case x_int8:
odbcType_ = SQL_C_STINYINT;
size = sizeof(int8_t);
@@ -183,6 +193,15 @@ void odbc_standard_into_type_backend::post_fetch(
throw soci_error("Buffer size overflow; maybe got too large string");
}
}
else if (type_ == x_stdwstring)
{
std::wstring& s = exchange_type_cast<x_stdwstring>(data_);
s = utf16_to_wide(reinterpret_cast<char16_t*>(buf_));
if (s.size() * sizeof(wchar_t) >= (odbc_max_buffer_length - 1))
{
throw soci_error("Buffer size overflow; maybe got too large string");
}
}
else if (type_ == x_longstring)
{
exchange_type_cast<x_longstring>(data_).value = buf_;

View File

@@ -6,6 +6,7 @@
#define SOCI_ODBC_SOURCE
#include "soci/soci-platform.h"
#include "soci/odbc/soci-odbc.h"
#include "soci/soci-unicode.h"
#include "soci-compiler.h"
#include "soci-exchange-cast.h"
#include <cctype>
@@ -112,6 +113,13 @@ void* odbc_standard_use_type_backend::prepare_for_bind(
copy_from_string(s, size, sqlType, cType);
}
break;
case x_stdwstring:
{
std::wstring const& s = exchange_type_cast<x_stdwstring>(data_);
copy_from_string(s, size, sqlType, cType);
}
break;
case x_stdtm:
{
std::tm const& t = exchange_type_cast<x_stdtm>(data_);
@@ -175,6 +183,27 @@ void odbc_standard_use_type_backend::copy_from_string(
indHolder_ = SQL_NTS;
}
// Store a NUL-terminated UTF-16 copy of the wide string in a newly allocated
// buf_ and fill in the parameters describing it: size is the buffer size in
// bytes, including the terminating NUL, sqlType is the wide (long) varchar SQL
// type chosen depending on this size and cType is always SQL_C_WCHAR.
void odbc_standard_use_type_backend::copy_from_string(
    const std::wstring& s,
    SQLLEN& size,
    SQLSMALLINT& sqlType,
    SQLSMALLINT& cType
    )
{
    // Number of UTF-16 code units needed, not counting the terminating NUL.
    std::size_t const utf16Len = wide_to_utf16(s, nullptr, 0);

    cType = SQL_C_WCHAR;
    size = static_cast<SQLLEN>((utf16Len + 1) * sizeof(SQLWCHAR));
    if (size >= ODBC_MAX_COL_SIZE)
    {
        sqlType = SQL_WLONGVARCHAR;
    }
    else
    {
        sqlType = SQL_WVARCHAR;
    }

    buf_ = new char[size];

    char16_t* const dst = reinterpret_cast<char16_t*>(buf_);
    wide_to_utf16(s, dst, utf16Len);
    dst[utf16Len] = u'\0';

    indHolder_ = SQL_NTS;
}
void odbc_standard_use_type_backend::bind_by_pos(
int &position, void *data, exchange_type type, bool /* readOnly */)
{

View File

@@ -7,6 +7,7 @@
#define SOCI_ODBC_SOURCE
#include "soci/odbc/soci-odbc.h"
#include "soci/soci-unicode.h"
#include <cctype>
#include <sstream>
#include <cstring>
@@ -390,6 +391,11 @@ void odbc_statement_backend::describe_column(int colNum,
case SQL_BIGINT:
dbtype = is_unsigned == SQL_TRUE ? db_uint64 : db_int64;
break;
case SQL_WCHAR:
case SQL_WVARCHAR:
case SQL_WLONGVARCHAR:
dbtype = db_wstring;
break;
case SQL_CHAR:
case SQL_VARCHAR:
case SQL_LONGVARCHAR:

View File

@@ -8,6 +8,7 @@
#define SOCI_ODBC_SOURCE
#include "soci/soci-platform.h"
#include "soci/odbc/soci-odbc.h"
#include "soci/soci-unicode.h"
#include "soci/type-wrappers.h"
#include "soci-compiler.h"
#include "soci-cstrtoi.h"
@@ -126,6 +127,27 @@ void odbc_vector_into_type_backend::define_by_pos(
buf_ = new char[colSize_ * elementsCount];
}
break;
case x_stdwstring:
// Do exactly the same thing as above, but for wide characters.
{
odbcType_ = SQL_C_WCHAR;
colSize_ = static_cast<size_t>(get_sqllen_from_value(statement_.column_size(position)));
if (colSize_ >= ODBC_MAX_COL_SIZE || colSize_ == 0)
{
colSize_ = odbc_max_buffer_length;
statement_.fetchVectorByRows_ = true;
}
colSize_ += sizeof(SQLWCHAR);
const std::size_t elementsCount
= statement_.fetchVectorByRows_ ? 1 : vectorSize;
buf_ = new char[colSize_ * elementsCount * sizeof(SQLWCHAR)];
}
break;
case x_stdtm:
odbcType_ = SQL_C_TYPE_TIMESTAMP;
@@ -196,6 +218,7 @@ void odbc_vector_into_type_backend::rebind_row(std::size_t rowInd)
case x_char:
case x_stdstring:
case x_stdwstring:
case x_xmltype:
case x_longstring:
case x_stdtm:
@@ -287,6 +310,34 @@ void odbc_vector_into_type_backend::do_post_fetch_rows(
value.assign(pos, end - pos);
}
}
else if (type_ == x_stdwstring)
{
// Do exactly the same thing as above, but for wide characters.
SQLWCHAR* pos = reinterpret_cast<SQLWCHAR*>(buf_);
for (std::size_t i = beginRow; i != endRow; ++i, pos += colSize_ / sizeof(SQLWCHAR))
{
SQLLEN len = get_sqllen_from_vector_at(i);
std::wstring& value = exchange_vector_type_cast<x_stdwstring>(data_).at(i);
if (len == -1)
{
value.clear();
continue;
}
SQLWCHAR* end = pos + len / sizeof(SQLWCHAR);
while (end != pos)
{
if (*--end != L' ')
{
++end;
break;
}
}
value = utf16_to_wide(reinterpret_cast<char16_t*>(pos), end - pos);
}
}
else if (type_ == x_stdtm)
{
std::vector<std::tm> *vp

View File

@@ -7,6 +7,7 @@
#define SOCI_ODBC_SOURCE
#include "soci/soci-platform.h"
#include "soci/soci-unicode.h"
#include "soci/odbc/soci-odbc.h"
#include "soci-compiler.h"
#include "soci-vector-helpers.h"
@@ -231,6 +232,42 @@ void* odbc_vector_use_type_backend::prepare_for_bind(SQLUINTEGER &size,
cType = SQL_C_CHAR;
}
break;
case x_stdwstring:
{
std::size_t maxSize = 0;
std::size_t const vecSize = get_vector_size(type_, data_);
prepare_indicators(vecSize);
for (std::size_t i = 0; i != vecSize; ++i)
{
std::wstring& value = exchange_vector_type_cast<x_stdwstring>(data_).at(i);
std::size_t const sz = wide_to_utf16(value, nullptr, 0);
set_sqllen_from_vector_at(i, static_cast<long>(sz * sizeof(SQLWCHAR)));
maxSize = sz > maxSize ? sz : maxSize;
}
maxSize++; // For terminating nul.
buf_ = new char[maxSize * vecSize * sizeof(SQLWCHAR)];
memset(buf_, 0, maxSize * vecSize * sizeof(SQLWCHAR));
static_assert(sizeof(SQLWCHAR) == sizeof(char16_t), "unexpected SQLWCHAR size");
char16_t* pos = reinterpret_cast<char16_t*>(buf_);
for (std::size_t i = 0; i != vecSize; ++i)
{
std::wstring& value = exchange_vector_type_cast<x_stdwstring>(data_).at(i);
wide_to_utf16(value, pos, maxSize);
pos += maxSize;
}
data = buf_;
size = static_cast<SQLINTEGER>(maxSize * sizeof(SQLWCHAR));
sqlType = size >= ODBC_MAX_COL_SIZE ? SQL_WLONGVARCHAR : SQL_WVARCHAR;
cType = SQL_C_WCHAR;
}
break;
case x_stdtm:
{
std::vector<std::tm> *vp
@@ -339,6 +376,7 @@ void odbc_vector_use_type_backend::pre_use(indicator const *ind)
case x_char:
case x_stdstring:
case x_stdwstring:
case x_xmltype:
case x_longstring:
non_null_indicator = SQL_NTS;
@@ -438,7 +476,7 @@ void odbc_vector_use_type_backend::pre_use(indicator const *ind)
else
{
// for strings we have already set the values
if (type_ != x_stdstring && type_ != x_xmltype && type_ != x_longstring)
if (type_ != x_stdstring && type_ != x_xmltype && type_ != x_longstring && type_ != x_stdwstring)
{
set_sqllen_from_vector_at(i, non_null_indicator);
}
@@ -451,7 +489,7 @@ void odbc_vector_use_type_backend::pre_use(indicator const *ind)
for (std::size_t i = 0; i != indHolderVec_.size(); ++i)
{
// for strings we have already set the values
if (type_ != x_stdstring && type_ != x_xmltype && type_ != x_longstring)
if (type_ != x_stdstring && type_ != x_xmltype && type_ != x_longstring && type_ != x_stdwstring)
{
set_sqllen_from_vector_at(i, non_null_indicator);
}

View File

@@ -199,7 +199,10 @@ void oracle_standard_into_type_backend::define_by_pos(
ociData_ = lobp;
}
break;
default:
throw soci_error("Into element used with non-supported type.");
}
sword res = OCIDefineByPos(statement_.stmtp_, &defnp_,
statement_.session_.errhp_,

View File

@@ -200,6 +200,9 @@ void oracle_standard_use_type_backend::prepare_for_bind(
ociData_ = lobp;
}
break;
case x_stdwstring:
throw soci_error("Wide string use elements are not supported by Oracle backend.");
}
}
@@ -471,6 +474,8 @@ void oracle_standard_use_type_backend::pre_use(indicator const *ind)
case x_blob:
// nothing to do
break;
case x_stdwstring:
throw soci_error("Wide string use elements are not supported by Oracle backend.");
}
// then handle indicators
@@ -685,6 +690,8 @@ void oracle_standard_use_type_backend::post_use(bool gotData, indicator *ind)
case x_longstring:
// nothing to do here
break;
case x_stdwstring:
throw soci_error("Wide string use elements are not supported by Oracle backend.");
}
}

View File

@@ -218,6 +218,7 @@ void oracle_vector_into_type_backend::define_by_pos_bulk(
case x_statement:
case x_rowid:
case x_blob:
case x_stdwstring:
throw soci_error("Unsupported type for vector into parameter");
}

View File

@@ -214,6 +214,7 @@ void oracle_vector_use_type_backend::prepare_for_bind(
case x_statement:
case x_rowid:
case x_blob:
case x_stdwstring:
throw soci_error("Unsupported type for vector use parameter");
}
}

View File

@@ -208,6 +208,8 @@ sqlite3_statement_backend::load_rowset(int totalRows)
case db_xml:
throw soci_error("XML data type is not supported");
case db_wstring:
throw soci_error("Wide string data type is not supported");
}
}
}
@@ -332,6 +334,8 @@ sqlite3_statement_backend::bind_and_execute(int number)
case db_xml:
throw soci_error("XML data type is not supported");
case db_wstring:
throw soci_error("Wide string data type is not supported");
}
}

View File

@@ -130,6 +130,8 @@ void set_number_in_vector(void *p, int idx, const sqlite3_column &col)
case db_xml:
throw soci_error("XML data type is not supported");
case db_wstring:
throw soci_error("Wide string data type is not supported");
};
}
@@ -243,6 +245,8 @@ void sqlite3_vector_into_type_backend::post_fetch(bool gotData, indicator * ind)
case db_xml:
throw soci_error("XML data type is not supported");
case db_wstring:
throw soci_error("Wide string data type is not supported");
};
break;
} // x_char
@@ -325,6 +329,9 @@ void sqlite3_vector_into_type_backend::post_fetch(bool gotData, indicator * ind)
set_in_vector(data_, i, xml);
break;
}
case db_wstring:
throw soci_error("Wide string data type is not supported");
};
break;
} // x_stdstring
@@ -409,6 +416,7 @@ void sqlite3_vector_into_type_backend::post_fetch(bool gotData, indicator * ind)
case db_uint32:
case db_int64:
case db_uint64:
case db_wstring:
throw soci_error("Into element used with non-convertible type.");
case db_xml:
@@ -444,6 +452,8 @@ void sqlite3_vector_into_type_backend::post_fetch(bool gotData, indicator * ind)
case db_xml:
throw soci_error("XML data type is not supported");
case db_wstring:
throw soci_error("Wide string data type is not supported");
}
}
}

View File

@@ -356,6 +356,7 @@ struct statement_wrapper
std::vector<db_type> into_types; // for both single and bulk
std::vector<indicator> into_indicators;
std::map<int, std::string> into_strings;
std::map<int, std::wstring> into_wstrings;
std::map<int, int8_t> into_int8;
std::map<int, uint8_t> into_uint8;
std::map<int, int16_t> into_int16;
@@ -370,6 +371,7 @@ struct statement_wrapper
std::vector<std::vector<indicator> > into_indicators_v;
std::map<int, std::vector<std::string> > into_strings_v;
std::map<int, std::vector<std::wstring> > into_wstrings_v;
std::map<int, std::vector<int8_t> > into_int8_v;
std::map<int, std::vector<uint8_t> > into_uint8_v;
std::map<int, std::vector<int16_t> > into_int16_v;
@@ -384,6 +386,7 @@ struct statement_wrapper
// use elements
std::map<std::string, indicator> use_indicators;
std::map<std::string, std::string> use_strings;
std::map<std::string, std::wstring> use_wstrings;
std::map<std::string, int8_t> use_int8;
std::map<std::string, uint8_t> use_uint8;
std::map<std::string, int16_t> use_int16;
@@ -398,6 +401,7 @@ struct statement_wrapper
std::map<std::string, std::vector<indicator> > use_indicators_v;
std::map<std::string, std::vector<std::string> > use_strings_v;
std::map<std::string, std::vector<std::wstring> > use_wstrings_v;
std::map<std::string, std::vector<int8_t> > use_int8_v;
std::map<std::string, std::vector<uint8_t> > use_uint8_v;
std::map<std::string, std::vector<int16_t> > use_int16_v;
@@ -618,6 +622,17 @@ bool name_exists_check_failed(statement_wrapper & wrapper,
name_exists = (it != wrapper.use_strings.end());
}
break;
case db_wstring:
{
typedef std::map
<
std::string,
std::wstring
>::const_iterator iterator;
iterator const it = wrapper.use_wstrings.find(name);
name_exists = (it != wrapper.use_wstrings.end());
}
break;
case db_int8:
{
typedef std::map<std::string, int8_t>::const_iterator iterator;
@@ -718,6 +733,17 @@ bool name_exists_check_failed(statement_wrapper & wrapper,
name_exists = (it != wrapper.use_strings_v.end());
}
break;
case db_wstring:
{
typedef std::map
<
std::string,
std::vector<std::wstring>
>::const_iterator iterator;
iterator const it = wrapper.use_wstrings_v.find(name);
name_exists = (it != wrapper.use_wstrings_v.end());
}
break;
case db_int8:
{
typedef std::map
@@ -1595,6 +1621,9 @@ SOCI_DECL void soci_into_resize_v(statement_handle st, int new_size)
case db_string:
wrapper->into_strings_v[i].resize(new_size);
break;
case db_wstring:
wrapper->into_wstrings_v[i].resize(new_size);
break;
case db_int8:
wrapper->into_int8_v[i].resize(new_size);
break;
@@ -3042,6 +3071,10 @@ SOCI_DECL void soci_prepare(statement_handle st, char const * query)
wrapper->st.exchange(
into(wrapper->into_strings[i], wrapper->into_indicators[i]));
break;
case db_wstring:
wrapper->st.exchange(
into(wrapper->into_wstrings[i], wrapper->into_indicators[i]));
break;
case db_int8:
wrapper->st.exchange(
into(wrapper->into_int8[i], wrapper->into_indicators[i]));
@@ -3103,6 +3136,10 @@ SOCI_DECL void soci_prepare(statement_handle st, char const * query)
wrapper->st.exchange(
into(wrapper->into_strings_v[i], wrapper->into_indicators_v[i]));
break;
case db_wstring:
wrapper->st.exchange(
into(wrapper->into_wstrings_v[i], wrapper->into_indicators_v[i]));
break;
case db_int8:
wrapper->st.exchange(
into(wrapper->into_int8_v[i], wrapper->into_indicators_v[i]));

View File

@@ -673,6 +673,12 @@ void statement_impl::bind_into<db_string>()
into_row<std::string>();
}
// Specialization for db_wstring: forwards to into_row<std::wstring>().
template<>
void statement_impl::bind_into<db_wstring>()
{
into_row<std::wstring>();
}
template<>
void statement_impl::bind_into<db_double>()
{
@@ -762,6 +768,9 @@ void statement_impl::describe()
case db_xml:
bind_into<db_string>();
break;
case db_wstring:
bind_into<db_wstring>();
break;
case db_blob:
bind_into<db_blob>();
break;

549
src/core/unicode.cpp Normal file
View File

@@ -0,0 +1,549 @@
//
// Copyright (C) 2024 Benjamin Oldenburg
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
#define SOCI_SOURCE
#include "soci/soci-unicode.h"
namespace soci
{
namespace details
{
// Check whether the first `length` bytes at `bytes` form exactly one
// well-formed UTF-8 sequence.
//
// Returns false if the lead or continuation bytes don't have the expected bit
// patterns, if the encoding is overlong, if it encodes a UTF-16 surrogate
// (U+D800..U+DFFF), if the code point is above U+10FFFF or if `length` is not
// in the 1..4 range.
//
// This function reads bytes[0]..bytes[length - 1], so the caller must ensure
// that this many bytes are available.
bool is_valid_utf8_sequence(unsigned char const* bytes, int length)
{
    switch (length)
    {
        case 1:
            // Single byte sequences are exactly the ASCII characters.
            return (bytes[0] & 0x80U) == 0;

        case 2:
        {
            if ((bytes[0] & 0xE0U) != 0xC0U || (bytes[1] & 0xC0U) != 0x80U)
                return false;

            const uint32_t code_point = ((bytes[0] & 0x1FU) << 6U) | (bytes[1] & 0x3FU);

            // Anything below U+0080 fits in one byte: overlong encoding.
            return code_point >= 0x80;
        }

        case 3:
        {
            if ((bytes[0] & 0xF0U) != 0xE0U ||
                (bytes[1] & 0xC0U) != 0x80U ||
                (bytes[2] & 0xC0U) != 0x80U)
                return false;

            const uint32_t code_point = ((bytes[0] & 0x0FU) << 12U) | ((bytes[1] & 0x3FU) << 6U) | (bytes[2] & 0x3FU);

            // Anything below U+0800 fits in two bytes: overlong encoding.
            if (code_point < 0x800)
                return false;

            // Surrogates are reserved for UTF-16 and must never appear in
            // UTF-8: converting them would produce unpaired surrogates.
            return code_point < 0xD800 || code_point > 0xDFFF;
        }

        case 4:
        {
            if ((bytes[0] & 0xF8U) != 0xF0U ||
                (bytes[1] & 0xC0U) != 0x80U ||
                (bytes[2] & 0xC0U) != 0x80U ||
                (bytes[3] & 0xC0U) != 0x80U)
                return false;

            const uint32_t code_point = ((bytes[0] & 0x07U) << 18U) | ((bytes[1] & 0x3FU) << 12U) | ((bytes[2] & 0x3FU) << 6U) | (bytes[3] & 0x3FU);

            // Reject overlong encodings and values beyond the last valid
            // Unicode code point.
            return code_point >= 0x10000 && code_point <= 0x10FFFF;
        }

        default:
            return false;
    }
}
void ensure_valid_utf8(char const* utf8, std::size_t len)
{
auto const* const bytes = reinterpret_cast<unsigned char const*>(utf8);
for (std::size_t i = 0; i < len;)
{
if ((bytes[i] & 0x80U) == 0)
{
// ASCII character, one byte
i += 1;
}
else if ((bytes[i] & 0xE0U) == 0xC0)
{
// Two-byte character, check if the next byte is a valid continuation byte
if (i + 1 >= len || !is_valid_utf8_sequence(bytes + i, 2))
{
throw soci_error("Invalid UTF-8 sequence: Truncated or invalid two-byte sequence");
}
i += 2;
}
else if ((bytes[i] & 0xF0U) == 0xE0U)
{
// Three-byte character, check if the next two bytes are valid continuation bytes
if (i + 2 >= len || !is_valid_utf8_sequence(bytes + i, 3))
{
throw soci_error("Invalid UTF-8 sequence: Truncated or invalid three-byte sequence");
}
i += 3;
}
else if ((bytes[i] & 0xF8U) == 0xF0U)
{
// Four-byte character, check if the next three bytes are valid continuation bytes
if (i + 3 >= len || !is_valid_utf8_sequence(bytes + i, 4))
{
throw soci_error("Invalid UTF-8 sequence: Truncated or invalid four-byte sequence");
}
i += 4;
}
else
{
// Invalid start byte
throw soci_error("Invalid UTF-8 sequence: Invalid start byte");
}
}
}
// Verify that the `len` code units starting at `s` are valid UTF-16.
//
// Every high surrogate must be immediately followed by a low surrogate and a
// low surrogate may only appear as the second half of such a pair.
//
// Throws soci_error on the first violation found.
void ensure_valid_utf16(char16_t const* s, std::size_t len)
{
    std::size_t pos = 0;
    while (pos < len)
    {
        char16_t const unit = s[pos];

        if (unit >= 0xDC00 && unit <= 0xDFFF)
        {
            // A low surrogate here was not preceded by a high one, as
            // otherwise it would have been consumed together with it below.
            throw soci_error("Invalid UTF-16 sequence (lone low surrogate)");
        }

        if (unit < 0xD800 || unit > 0xDBFF)
        {
            // Ordinary BMP character.
            ++pos;
            continue;
        }

        // High surrogate: the next unit must exist and be a low surrogate.
        if (pos + 1 >= len)
        {
            throw soci_error("Invalid UTF-16 sequence (truncated surrogate pair)");
        }

        char16_t const trail = s[pos + 1];
        if (trail < 0xDC00 || trail > 0xDFFF)
        {
            throw soci_error("Invalid UTF-16 sequence (invalid surrogate pair)");
        }

        // Consume both halves of the pair.
        pos += 2;
    }
}
// Verify that the `len` code points starting at `s` are valid UTF-32.
//
// Throws soci_error for values above U+10FFFF, for surrogates
// (U+D800..U+DFFF) and for the non-characters U+FFFE and U+FFFF.
void ensure_valid_utf32(char32_t const* s, std::size_t len)
{
    char32_t const* const end = s + len;
    for (char32_t const* p = s; p != end; ++p)
    {
        char32_t const cp = *p;

        // Nothing exists beyond the last Unicode code point.
        if (cp > 0x10FFFF)
        {
            throw soci_error("Invalid UTF-32 sequence: Code point out of range");
        }

        // Surrogates only make sense in UTF-16.
        bool const is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
        if (is_surrogate)
        {
            throw soci_error("Invalid UTF-32 sequence: Surrogate pair found");
        }

        // U+FFFE and U+FFFF are rejected as non-characters.
        bool const is_non_character = cp == 0xFFFE || cp == 0xFFFF;
        if (is_non_character)
        {
            throw soci_error("Invalid UTF-32 sequence: Non-character found");
        }
    }
}
// Convert `len8` bytes of UTF-8 at `utf8` to UTF-16.
//
// If `out16` is null, only the number of UTF-16 code units needed is computed
// and returned. Otherwise the input is fully validated first and the result
// is written to `out16`, which has room for `len16` code units;
// throw_if_too_small() is called before each write with the number of units
// required so far.
//
// A leading UTF-8 BOM is converted to the UTF-16 BOM (U+FEFF).
//
// Returns the number of UTF-16 code units in the result.
//
// Throws soci_error if the input contains an invalid lead byte or a truncated
// sequence (in both modes) or is otherwise invalid UTF-8 (when converting).
std::size_t
utf8_to_utf16(char const* utf8, std::size_t len8,
              char16_t* out16, std::size_t len16)
{
    // Skip the full check if we're just computing the length for efficiency,
    // we'll detect any errors when performing the actual conversion anyhow.
    if (out16)
        ensure_valid_utf8(utf8, len8);

    auto const* const bytes = reinterpret_cast<unsigned char const*>(utf8);

    std::size_t len = 0;

    // Check for UTF-8 BOM
    std::size_t start_index = 0;
    if (len8 >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF)
    {
        ++len;

        if (out16)
        {
            throw_if_too_small(len, len16);

            *out16++ = 0xFEFF; // Add UTF-16 BOM
        }

        start_index = 3; // Start conversion after the BOM
    }

    for (std::size_t i = start_index; i < len8;)
    {
        unsigned char const lead = bytes[i];

        // Determine the sequence length from the lead byte.
        std::size_t seq_len;
        if ((lead & 0x80) == 0)
            seq_len = 1;
        else if ((lead & 0xE0) == 0xC0)
            seq_len = 2;
        else if ((lead & 0xF0) == 0xE0)
            seq_len = 3;
        else if ((lead & 0xF8) == 0xF0)
            seq_len = 4;
        else
            throw soci_error("Invalid UTF-8 sequence");

        // The input is not validated when only computing the length, so we
        // must check for truncation ourselves to avoid reading beyond the
        // end of the buffer below.
        if (len8 - i < seq_len)
            throw soci_error("Invalid UTF-8 sequence");

        uint32_t codepoint;
        switch (seq_len)
        {
            case 1:
                // ASCII character
                codepoint = lead;
                break;

            case 2:
                codepoint = ((lead & 0x1F) << 6) | (bytes[i + 1] & 0x3F);
                break;

            case 3:
                codepoint = ((lead & 0x0F) << 12) | ((bytes[i + 1] & 0x3F) << 6) | (bytes[i + 2] & 0x3F);
                break;

            default:
                codepoint = ((lead & 0x07) << 18) | ((bytes[i + 1] & 0x3F) << 12) | ((bytes[i + 2] & 0x3F) << 6) | (bytes[i + 3] & 0x3F);
                break;
        }

        i += seq_len;

        if (codepoint <= 0xFFFF)
        {
            ++len;

            if (out16)
            {
                throw_if_too_small(len, len16);

                *out16++ = static_cast<char16_t>(codepoint);
            }
        }
        else
        {
            // Encode as surrogate pair
            len += 2;

            if (out16)
            {
                throw_if_too_small(len, len16);

                codepoint -= 0x10000;
                *out16++ = static_cast<char16_t>((codepoint >> 10) + 0xD800);
                *out16++ = static_cast<char16_t>((codepoint & 0x3FF) + 0xDC00);
            }
        }
    }

    return len;
}
// Convert `len16` UTF-16 code units at `utf16` to UTF-8.
//
// If `out8` is null, only the number of bytes needed is computed and
// returned. Otherwise the input is validated first and the result is written
// to `out8`, which has room for `len8` bytes; throw_if_too_small() is called
// before each write with the number of bytes required so far.
//
// A leading UTF-16 BOM (U+FEFF) is converted to the UTF-8 BOM.
//
// Returns the number of bytes in the result.
//
// Throws soci_error for a truncated or invalid surrogate pair.
std::size_t
utf16_to_utf8(char16_t const* utf16, std::size_t len16,
              char* out8, std::size_t len8)
{
    // Full validation is only done when actually converting: any problem in
    // the input will be found then even if the length computation missed it.
    if (out8)
        ensure_valid_utf16(utf16, len16);

    std::size_t written = 0;
    std::size_t pos = 0;

    // Translate the BOM, if any, separately.
    if (len16 && utf16[0] == 0xFEFF)
    {
        written += 3;

        if (out8)
        {
            throw_if_too_small(written, len8);

            *out8++ = '\xEF';
            *out8++ = '\xBB';
            *out8++ = '\xBF';
        }

        pos = 1;
    }

    while (pos < len16)
    {
        char16_t const unit = utf16[pos];

        // Encode the current character into a local buffer first and copy
        // it to the output, if we have any, afterwards.
        char encoded[4];
        std::size_t count;

        if (unit < 0x80)
        {
            // ASCII: one byte.
            encoded[0] = static_cast<char>(unit);
            count = 1;
        }
        else if (unit < 0x800)
        {
            // Two bytes.
            encoded[0] = static_cast<char>(0xC0U | ((unit >> 6) & 0x1FU));
            encoded[1] = static_cast<char>(0x80U | (unit & 0x3FU));
            count = 2;
        }
        else if ((unit >= 0xD800U) && (unit <= 0xDBFFU))
        {
            // High surrogate: combine it with the following low surrogate
            // into a supplementary plane code point taking four bytes.
            if (pos + 1 >= len16)
            {
                throw soci_error("Invalid UTF-16 surrogate pair (truncated)");
            }

            char16_t const trail = utf16[pos + 1];
            if (trail < 0xDC00U || trail > 0xDFFFU)
            {
                throw soci_error("Invalid UTF-16 surrogate pair");
            }

            auto const codepoint = static_cast<uint32_t>(((unit & 0x3FFU) << 10U) | (trail & 0x3FFU)) + 0x10000U;

            encoded[0] = static_cast<char>(0xF0U | ((codepoint >> 18U) & 0x07U));
            encoded[1] = static_cast<char>(0x80U | ((codepoint >> 12U) & 0x3FU));
            encoded[2] = static_cast<char>(0x80U | ((codepoint >> 6U) & 0x3FU));
            encoded[3] = static_cast<char>(0x80U | (codepoint & 0x3FU));
            count = 4;

            // The low surrogate has been consumed too.
            ++pos;
        }
        else
        {
            // Any other BMP character: three bytes.
            encoded[0] = static_cast<char>(0xE0U | ((unit >> 12) & 0x0FU));
            encoded[1] = static_cast<char>(0x80U | ((unit >> 6) & 0x3FU));
            encoded[2] = static_cast<char>(0x80U | (unit & 0x3FU));
            count = 3;
        }

        ++pos;

        written += count;

        if (out8)
        {
            throw_if_too_small(written, len8);

            for (std::size_t k = 0; k < count; ++k)
                *out8++ = encoded[k];
        }
    }

    return written;
}
std::size_t
utf16_to_utf32(char16_t const* utf16, std::size_t len16,
char32_t* out32, std::size_t len32)
{
// Skip the check if we're just computing the length for efficiency, we'll
// detect any errors when performing the actual conversion anyhow.
if (out32)
ensure_valid_utf16(utf16, len16);
std::size_t len = 0;
for (std::size_t i = 0; i < len16; ++i)
{
char16_t const chr = *utf16++;
++len;
if (out32)
throw_if_too_small(len, len32);
if (chr >= 0xD800U && chr <= 0xDBFFU)
{
// High surrogate, must be followed by a low surrogate
char16_t const chr2 = *utf16++;
++i;
if (out32)
{
const auto codepoint = static_cast<uint32_t>(((static_cast<unsigned int>(chr) & 0x3FFU) << 10U) | (static_cast<unsigned int>(chr2) & 0x3FFU)) + 0x10000U;
*out32++ = codepoint;
}
}
else
{
// Valid BMP character or a low surrogate that is part of a valid
// pair (already checked by ensure_valid_utf16)
if (out32)
*out32++ = static_cast<char32_t>(chr);
}
}
return len;
}
// Convert `len32` UTF-32 code points at `utf32` to UTF-16.
//
// If `out16` is null, only the number of UTF-16 code units needed is computed
// and returned. Otherwise the input is validated first and the result is
// written to `out16`, which has room for `len16` code units;
// throw_if_too_small() is called before each write with the number of units
// required so far.
//
// Returns the number of UTF-16 code units in the result.
std::size_t
utf32_to_utf16(char32_t const* utf32, std::size_t len32,
               char16_t* out16, std::size_t len16)
{
    // Validation is only done when actually converting: invalid input will
    // be detected then even if the length computation didn't notice it.
    if (out16)
        ensure_valid_utf32(utf32, len32);

    std::size_t units = 0;

    char32_t const* const end = utf32 + len32;
    for (char32_t const* p = utf32; p != end; ++p)
    {
        char32_t const cp = *p;

        // Characters outside of the BMP need a surrogate pair.
        bool const needs_pair = cp > 0xFFFFU;

        units += needs_pair ? 2 : 1;

        if (!out16)
            continue;

        throw_if_too_small(units, len16);

        if (!needs_pair)
        {
            *out16++ = static_cast<char16_t>(cp);
            continue;
        }

        // The code point is known to be valid here as ensure_valid_utf32()
        // was called above, so the offset fits into 20 bits.
        char32_t const offset = cp - 0x10000;
        *out16++ = static_cast<char16_t>((offset >> 10U) + 0xD800U);
        *out16++ = static_cast<char16_t>((offset & 0x3FFU) + 0xDC00U);
    }

    return units;
}
std::size_t
utf8_to_utf32(char const* utf8, std::size_t len8,
char32_t* out32, std::size_t len32)
{
// Skip the check if we're just computing the length for efficiency, we'll
// detect any errors when performing the actual conversion anyhow.
if (out32)
ensure_valid_utf8(utf8, len8);
auto const* const bytes = reinterpret_cast<unsigned char const*>(utf8);
std::size_t len = 0;
for (std::size_t i = 0; i < len8;)
{
unsigned char chr1 = bytes[i];
++len;
if (out32)
throw_if_too_small(len, len32);
// 1-byte sequence (ASCII)
if ((chr1 & 0x80U) == 0)
{
if (out32)
*out32++ = static_cast<char32_t>(chr1);
++i;
}
// 2-byte sequence
else if ((chr1 & 0xE0U) == 0xC0U)
{
if (out32)
*out32++ = static_cast<char32_t>(((chr1 & 0x1FU) << 6U) | (bytes[i + 1] & 0x3FU));
i += 2;
}
// 3-byte sequence
else if ((chr1 & 0xF0U) == 0xE0U)
{
if (out32)
*out32++ = static_cast<char32_t>(((chr1 & 0x0FU) << 12U) | ((bytes[i + 1] & 0x3FU) << 6U) | (bytes[i + 2] & 0x3FU));
i += 3;
}
// 4-byte sequence
else if ((chr1 & 0xF8U) == 0xF0U)
{
if (out32)
*out32++ = static_cast<char32_t>(((chr1 & 0x07U) << 18U) | ((bytes[i + 1] & 0x3FU) << 12U) | ((bytes[i + 2] & 0x3FU) << 6U) | (bytes[i + 3] & 0x3FU));
i += 4;
}
}
return len;
}
// Convert `len32` UTF-32 code points at `utf32` to UTF-8.
//
// If `out8` is null, only the number of bytes needed is computed and
// returned. Otherwise the input is validated first and the result is written
// to `out8`, which has room for `len8` bytes; throw_if_too_small() is called
// before each write with the number of bytes required so far.
//
// Returns the number of bytes in the result.
std::size_t
utf32_to_utf8(char32_t const* utf32, std::size_t len32,
              char* out8, std::size_t len8)
{
    // Validation is only done when actually converting: invalid input will
    // be detected then even if the length computation didn't notice it.
    if (out8)
        ensure_valid_utf32(utf32, len32);

    std::size_t written = 0;

    char32_t const* const end = utf32 + len32;
    for (char32_t const* p = utf32; p != end; ++p)
    {
        char32_t const cp = *p;

        // Encode the current character into a local buffer first and copy
        // it to the output, if we have any, afterwards.
        char encoded[4];
        std::size_t count;

        if (cp < 0x80)
        {
            // ASCII: one byte.
            encoded[0] = static_cast<char>(cp);
            count = 1;
        }
        else if (cp < 0x800)
        {
            // Two bytes.
            encoded[0] = static_cast<char>(0xC0U | ((cp >> 6U) & 0x1FU));
            encoded[1] = static_cast<char>(0x80U | (cp & 0x3FU));
            count = 2;
        }
        else if (cp < 0x10000)
        {
            // Three bytes.
            encoded[0] = static_cast<char>(0xE0U | ((cp >> 12U) & 0x0FU));
            encoded[1] = static_cast<char>(0x80U | ((cp >> 6U) & 0x3FU));
            encoded[2] = static_cast<char>(0x80U | (cp & 0x3FU));
            count = 3;
        }
        else
        {
            // Four bytes: nothing else is possible for valid UTF-32 input.
            encoded[0] = static_cast<char>(0xF0U | ((cp >> 18U) & 0x07U));
            encoded[1] = static_cast<char>(0x80U | ((cp >> 12U) & 0x3FU));
            encoded[2] = static_cast<char>(0x80U | ((cp >> 6U) & 0x3FU));
            encoded[3] = static_cast<char>(0x80U | (cp & 0x3FU));
            count = 4;
        }

        written += count;

        if (out8)
        {
            throw_if_too_small(written, len8);

            for (std::size_t k = 0; k < count; ++k)
                *out8++ = encoded[k];
        }
    }

    return written;
}
} // namespace details
} // namespace soci

View File

@@ -9,6 +9,7 @@
#include "soci/soci-platform.h"
#include "soci/use-type.h"
#include "soci/statement.h"
#include "soci/soci-unicode.h"
#include "soci-exchange-cast.h"
#include "soci-mktime.h"
@@ -58,6 +59,10 @@ void standard_use_type::dump_value(std::ostream& os) const
os << "\"" << exchange_type_cast<x_stdstring>(data_) << "\"";
return;
case x_stdwstring:
os << "\"" << wide_to_utf8(exchange_type_cast<x_stdwstring>(data_)) << "\"";
return;
case x_int8:
os << exchange_type_cast<x_int8>(data_);
return;

View File

@@ -15,7 +15,7 @@ colormsg(_HIBLUE_ "Configuring SOCI tests:")
add_definitions(-DCATCH_CONFIG_CPP11_NO_SHUFFLE)
if(MSVC)
add_compile_options(/bigobj)
add_compile_options(/bigobj /utf-8)
endif()
include_directories(

View File

@@ -0,0 +1,355 @@
//
// Copyright (C) 2024 Benjamin Oldenburg
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
#include "soci/soci.h"
#include <catch.hpp>
using namespace soci;
using namespace soci::details;
// Check that ensure_valid_utf8() accepts well-formed strings in several
// scripts and throws soci_error for malformed byte sequences.
TEST_CASE("UTF-8 validation tests", "[unicode]")
{
// Valid UTF-8 strings - Should not throw exceptions
CHECK_NOTHROW(ensure_valid_utf8("Hello, world!")); // valid ASCII
CHECK_NOTHROW(ensure_valid_utf8("")); // Empty string
CHECK_NOTHROW(ensure_valid_utf8(u8"Здравствуй, мир!")); // valid UTF-8
CHECK_NOTHROW(ensure_valid_utf8(u8"こんにちは世界")); // valid UTF-8
CHECK_NOTHROW(ensure_valid_utf8(u8"😀😁😂🤣😃😄😅😆")); // valid UTF-8 with emojis
// Invalid UTF-8 strings - Should throw soci_error exceptions
CHECK_THROWS_AS(ensure_valid_utf8("\x80"), soci_error); // Continuation byte used as start byte
CHECK_THROWS_AS(ensure_valid_utf8("\xC3\x28"), soci_error); // Two-byte lead followed by a non-continuation byte
CHECK_THROWS_AS(ensure_valid_utf8("\xE2\x82"), soci_error); // Truncated three-byte character
CHECK_THROWS_AS(ensure_valid_utf8("\xF0\x90\x28"), soci_error); // Truncated four-byte character
CHECK_THROWS_AS(ensure_valid_utf8("\xF0\x90\x8D\x80\x80"), soci_error); // Valid four-byte character followed by a stray continuation byte
}
// Check that ensure_valid_utf16() accepts well-formed strings and throws
// soci_error for every kind of unpaired surrogate.
TEST_CASE("UTF-16 validation tests", "[unicode]")
{
// Valid UTF-16 strings
CHECK_NOTHROW(ensure_valid_utf16(u"Hello, world!")); // valid ASCII
CHECK_NOTHROW(ensure_valid_utf16(u"Здравствуй, мир!")); // valid Cyrillic
CHECK_NOTHROW(ensure_valid_utf16(u"こんにちは世界")); // valid Japanese
CHECK_NOTHROW(ensure_valid_utf16(u"😀😁😂🤣😃😄😅😆")); // valid emojis
// Invalid UTF-16 strings - these should throw exceptions
// (the string is reset before each case so that the cases are independent)
std::u16string invalid_utf16;
invalid_utf16 = u"";
invalid_utf16 += 0xD800; // lone high surrogate
CHECK_THROWS_AS(ensure_valid_utf16(invalid_utf16), soci_error);
invalid_utf16 = u"";
invalid_utf16 += 0xDC00; // lone low surrogate
CHECK_THROWS_AS(ensure_valid_utf16(invalid_utf16), soci_error);
invalid_utf16 = u"";
invalid_utf16 += 0xD800;
invalid_utf16 += 0xD800; // two high surrogates in a row
CHECK_THROWS_AS(ensure_valid_utf16(invalid_utf16), soci_error);
invalid_utf16 = u"";
invalid_utf16 += 0xDC00;
invalid_utf16 += 0xDC00; // two low surrogates in a row
CHECK_THROWS_AS(ensure_valid_utf16(invalid_utf16), soci_error);
}
// Check that ensure_valid_utf32() accepts well-formed strings and throws
// soci_error for values above the last Unicode code point U+10FFFF.
TEST_CASE("UTF-32 validation tests", "[unicode]")
{
// Valid UTF-32 strings
REQUIRE_NOTHROW(ensure_valid_utf32(U"Hello, world!")); // valid ASCII
REQUIRE_NOTHROW(ensure_valid_utf32(U"Здравствуй, мир!")); // valid Cyrillic
REQUIRE_NOTHROW(ensure_valid_utf32(U"こんにちは世界")); // valid Japanese
REQUIRE_NOTHROW(ensure_valid_utf32(U"😀😁😂🤣😃😄😅😆")); // valid emojis
// Invalid UTF-32 strings
REQUIRE_THROWS_AS(ensure_valid_utf32(U"\x110000"), soci_error); // First value beyond U+10FFFF
REQUIRE_THROWS_AS(ensure_valid_utf32(U"\x1FFFFF"), soci_error); // Invalid range
REQUIRE_THROWS_AS(ensure_valid_utf32(U"\xFFFFFFFF"), soci_error); // Invalid range
}
// Check utf16_to_utf32() for BMP text, for surrogate pairs and for the error
// raised on an unpaired high surrogate.
TEST_CASE("UTF-16 to UTF-32 conversion tests", "[unicode]")
{
// Valid conversion tests
REQUIRE(utf16_to_utf32(u"Hello, world!") == U"Hello, world!");
REQUIRE(utf16_to_utf32(u"こんにちは世界") == U"こんにちは世界");
REQUIRE(utf16_to_utf32(u"😀😁😂🤣😃😄😅😆") == U"😀😁😂🤣😃😄😅😆");
// Edge cases: a surrogate pair built explicitly from its two code units
std::u16string utf16;
utf16.push_back(char16_t(0xD83D)); // high surrogate
utf16.push_back(char16_t(0xDE00)); // low surrogate
REQUIRE(utf16_to_utf32(utf16) == U"\U0001F600"); // 😀
// Invalid conversion (should throw an exception)
std::u16string invalid_utf16;
invalid_utf16.push_back(0xD800); // lone high surrogate
REQUIRE_THROWS_AS(utf16_to_utf32(invalid_utf16), soci_error);
}
// Check utf32_to_utf16() for BMP text, for code points requiring a surrogate
// pair and for the error raised on an out of range code point.
TEST_CASE("UTF-32 to UTF-16 conversion tests", "[unicode]")
{
// Valid conversion tests
REQUIRE(utf32_to_utf16(U"Hello, world!") == u"Hello, world!");
REQUIRE(utf32_to_utf16(U"こんにちは世界") == u"こんにちは世界");
REQUIRE(utf32_to_utf16(U"😀😁😂🤣😃😄😅😆") == u"😀😁😂🤣😃😄😅😆");
// Edge cases: the expected surrogate pair is spelled out unit by unit
std::u32string utf32 = U"\U0001F600"; // 😀
std::u16string expected_utf16;
expected_utf16.push_back(0xD83D); // high surrogate
expected_utf16.push_back(0xDE00); // low surrogate
REQUIRE(utf32_to_utf16(utf32) == expected_utf16);
// Invalid conversion (should throw an exception)
std::u32string invalid_utf32 = U"\x110000"; // Invalid code point
REQUIRE_THROWS_AS(utf32_to_utf16(invalid_utf32), soci_error);
}
// Check utf8_to_utf16() for 1 to 4 byte sequences and for the error raised on
// malformed input.
TEST_CASE("UTF-8 to UTF-16 conversion tests", "[unicode]")
{
// Valid conversion tests
REQUIRE(utf8_to_utf16(u8"Hello, world!") == u"Hello, world!");
REQUIRE(utf8_to_utf16(u8"こんにちは世界") == u"こんにちは世界");
REQUIRE(utf8_to_utf16(u8"😀😁😂🤣😃😄😅😆") == u"😀😁😂🤣😃😄😅😆");
// Edge cases: a four-byte sequence must become a surrogate pair
std::string utf8 = "\xF0\x9F\x98\x80"; // 😀
std::u16string expected_utf16 = u"\xD83D\xDE00";
REQUIRE(utf8_to_utf16(utf8) == expected_utf16);
// Invalid conversion (should throw an exception)
std::string invalid_utf8 = "\xF0\x28\x8C\xBC"; // Four-byte lead followed by a non-continuation byte
REQUIRE_THROWS_AS(utf8_to_utf16(invalid_utf8), soci_error);
}
// Check utf16_to_utf8() for BMP text, for surrogate pairs and for the error
// raised on an unpaired high surrogate.
TEST_CASE("UTF-16 to UTF-8 conversion tests", "[unicode]")
{
// Valid conversion tests
REQUIRE(utf16_to_utf8(u"Hello, world!") == u8"Hello, world!");
REQUIRE(utf16_to_utf8(u"こんにちは世界") == u8"こんにちは世界");
REQUIRE(utf16_to_utf8(u"😀😁😂🤣😃😄😅😆") == u8"😀😁😂🤣😃😄😅😆");
// Edge cases: a surrogate pair must become a single four-byte sequence
std::u16string utf16;
utf16.push_back(0xD83D); // high surrogate
utf16.push_back(0xDE00); // low surrogate
REQUIRE(utf16_to_utf8(utf16) == "\xF0\x9F\x98\x80"); // 😀
// Invalid conversion (should throw an exception)
std::u16string invalid_utf16;
invalid_utf16.push_back(0xD800); // lone high surrogate
REQUIRE_THROWS_AS(utf16_to_utf8(invalid_utf16), soci_error);
}
// Check utf8_to_utf32() for 1 to 4 byte sequences and for the error raised on
// malformed input.
TEST_CASE("UTF-8 to UTF-32 conversion tests", "[unicode]")
{
// Valid conversion tests
REQUIRE(utf8_to_utf32(u8"Hello, world!") == U"Hello, world!");
REQUIRE(utf8_to_utf32(u8"こんにちは世界") == U"こんにちは世界");
REQUIRE(utf8_to_utf32(u8"😀😁😂🤣😃😄😅😆") == U"😀😁😂🤣😃😄😅😆");
// Edge cases: a four-byte sequence must become a single code point
std::string utf8 = "\xF0\x9F\x98\x80"; // 😀
REQUIRE(utf8_to_utf32(utf8) == U"\U0001F600");
// Invalid conversion (should throw an exception)
std::string invalid_utf8 = "\xF0\x28\x8C\xBC"; // Four-byte lead followed by a non-continuation byte
REQUIRE_THROWS_AS(utf8_to_utf32(invalid_utf8), soci_error);
}
// Check utf32_to_utf8() for text in several scripts and for the errors raised
// on an out of range code point and on a surrogate value.
TEST_CASE("UTF-32 to UTF-8 conversion tests", "[unicode]")
{
// Valid conversion tests
REQUIRE(utf32_to_utf8(U"Hello, world!") == u8"Hello, world!");
REQUIRE(utf32_to_utf8(U"こんにちは世界") == u8"こんにちは世界");
REQUIRE(utf32_to_utf8(U"😀😁😂🤣😃😄😅😆") == u8"😀😁😂🤣😃😄😅😆");
// Edge cases: a supplementary plane code point must take four bytes
std::u32string utf32 = U"\U0001F600"; // 😀
REQUIRE(utf32_to_utf8(utf32) == "\xF0\x9F\x98\x80");
// Invalid conversion (should throw an exception)
std::u32string invalid_utf32 = U"\x110000"; // Invalid code point
REQUIRE_THROWS_AS(utf32_to_utf8(invalid_utf32), soci_error);
// Invalid conversion (should throw an exception): surrogates are not
// allowed in UTF-32
std::u32string invalid_wide;
invalid_wide.push_back(0xD800); // lone high surrogate
REQUIRE_THROWS_AS(utf32_to_utf8(invalid_wide), soci_error);
}
// Check that converting an empty string yields an empty string.
TEST_CASE("Empty string tests", "[unicode]")
{
REQUIRE(utf16_to_utf8(u"") == u8"");
REQUIRE(utf32_to_utf8(U"") == u8"");
REQUIRE(utf8_to_utf16(u8"") == u"");
REQUIRE(utf8_to_utf32(u8"") == U"");
}
// Check that a leading byte order mark survives every conversion, i.e. that
// it is translated to the BOM of the target encoding rather than dropped.
TEST_CASE("Strings with Byte Order Marks (BOMs)", "[unicode]")
{
// UTF-8 BOM: the three bytes encoding U+FEFF
const std::string utf8_bom = "\xEF\xBB\xBF";
// UTF-16 BOM: the single code unit U+FEFF (a char16_t value, so no
// particular byte order is implied here)
const std::u16string utf16_bom = u"\xFEFF";
// UTF-32 BOM: the single code point U+FEFF (a char32_t value, so no
// particular byte order is implied here)
const std::u32string utf32_bom = U"\x0000FEFF";
// The same text in all three encodings, appended after the BOM below
const std::string content = "Hello, world!";
const std::u16string content16 = u"Hello, world!";
const std::u32string content32 = U"Hello, world!";
SECTION("UTF-8 to UTF-16")
{
std::u16string result = utf8_to_utf16(utf8_bom + content);
REQUIRE(result == utf16_bom + content16);
}
SECTION("UTF-8 to UTF-32")
{
std::u32string result = utf8_to_utf32(utf8_bom + content);
REQUIRE(result == utf32_bom + content32);
}
SECTION("UTF-16 to UTF-8")
{
std::string result = utf16_to_utf8(utf16_bom + content16);
REQUIRE(result == utf8_bom + content);
}
SECTION("UTF-16 to UTF-32")
{
std::u32string result = utf16_to_utf32(utf16_bom + content16);
REQUIRE(result == utf32_bom + content32);
}
SECTION("UTF-32 to UTF-8")
{
std::string result = utf32_to_utf8(utf32_bom + content32);
REQUIRE(result == utf8_bom + content);
}
SECTION("UTF-32 to UTF-16")
{
std::u16string result = utf32_to_utf16(utf32_bom + content32);
REQUIRE(result == utf16_bom + content16);
}
// Converting to another encoding and back must return the original string,
// BOM included
SECTION("Roundtrip conversions")
{
// UTF-8 -> UTF-16 -> UTF-8
REQUIRE(utf16_to_utf8(utf8_to_utf16(utf8_bom + content)) == utf8_bom + content);
// UTF-8 -> UTF-32 -> UTF-8
REQUIRE(utf32_to_utf8(utf8_to_utf32(utf8_bom + content)) == utf8_bom + content);
// UTF-16 -> UTF-8 -> UTF-16
REQUIRE(utf8_to_utf16(utf16_to_utf8(utf16_bom + content16)) == utf16_bom + content16);
// UTF-16 -> UTF-32 -> UTF-16
REQUIRE(utf32_to_utf16(utf16_to_utf32(utf16_bom + content16)) == utf16_bom + content16);
// UTF-32 -> UTF-8 -> UTF-32
REQUIRE(utf8_to_utf32(utf32_to_utf8(utf32_bom + content32)) == utf32_bom + content32);
// UTF-32 -> UTF-16 -> UTF-32
REQUIRE(utf16_to_utf32(utf32_to_utf16(utf32_bom + content32)) == utf32_bom + content32);
}
}
// Check that surrogates used incorrectly are rejected: two high surrogates in
// a row in UTF-16 and any surrogate at all in UTF-32.
TEST_CASE("Strings with invalid code unit sequences", "[unicode]")
{
REQUIRE_THROWS_AS(ensure_valid_utf16(u"\xD800\xD800"), soci_error);
REQUIRE_THROWS_AS(ensure_valid_utf32(U"\xD800"), soci_error);
}
// Check that an overlong encoding is rejected: 0xC0 0xAF is a two-byte form
// of '/' (U+002F), which must be encoded using a single byte.
TEST_CASE("Strings with overlong encodings", "[unicode]")
{
REQUIRE_THROWS_AS(ensure_valid_utf8("\xC0\xAF"), soci_error);
}
// Check that the non-character U+FFFE is rejected by UTF-32 validation.
TEST_CASE("Strings with non-characters", "[unicode]")
{
REQUIRE_THROWS_AS(ensure_valid_utf32(U"\xFFFE"), soci_error);
}
// Check that text in a right-to-left script (Arabic here) is accepted by
// UTF-8 validation.
TEST_CASE("Strings with right-to-left characters", "[unicode]")
{
REQUIRE_NOTHROW(ensure_valid_utf8(u8"مرحبا بالعالم"));
}
// Check utf8_to_wide() against wide string literals and for the error raised
// on malformed input.
TEST_CASE("UTF-8 to wide string conversion tests", "[unicode]")
{
// Valid conversion tests
REQUIRE(utf8_to_wide(u8"Hello, world!") == L"Hello, world!");
REQUIRE(utf8_to_wide(u8"こんにちは世界") == L"こんにちは世界");
REQUIRE(utf8_to_wide(u8"😀😁😂🤣😃😄😅😆") == L"😀😁😂🤣😃😄😅😆");
// Edge cases: a character outside of the BMP
std::string utf8 = "\xF0\x9F\x98\x80"; // 😀
std::wstring expected_wide = L"\U0001F600";
REQUIRE(utf8_to_wide(utf8) == expected_wide);
// Invalid conversion (should throw an exception)
std::string invalid_utf8 = "\xF0\x28\x8C\xBC"; // Four-byte lead followed by a non-continuation byte
REQUIRE_THROWS_AS(utf8_to_wide(invalid_utf8), soci_error);
}
// Check wide_to_utf8() against UTF-8 literals and for the error raised on an
// unpaired surrogate.
TEST_CASE("Wide string to UTF-8 conversion tests", "[unicode]")
{
// Valid conversion tests
REQUIRE(wide_to_utf8(L"Hello, world!") == u8"Hello, world!");
REQUIRE(wide_to_utf8(L"こんにちは世界") == u8"こんにちは世界");
REQUIRE(wide_to_utf8(L"😀😁😂🤣😃😄😅😆") == u8"😀😁😂🤣😃😄😅😆");
// Edge cases: a character outside of the BMP
std::wstring wide = L"\U0001F600"; // 😀
REQUIRE(wide_to_utf8(wide) == "\xF0\x9F\x98\x80");
// Invalid conversion (should throw an exception)
std::wstring invalid_wide;
invalid_wide.push_back(0xD800); // lone high surrogate
REQUIRE_THROWS_AS(wide_to_utf8(invalid_wide), soci_error);
}
// Check utf16_to_wide() against wide string literals and for the error raised
// on an unpaired high surrogate.
TEST_CASE("UTF-16 to wide string conversion tests", "[unicode]")
{
// Valid conversion tests
REQUIRE(utf16_to_wide(u"Hello, world!") == L"Hello, world!");
REQUIRE(utf16_to_wide(u"こんにちは世界") == L"こんにちは世界");
REQUIRE(utf16_to_wide(u"😀😁😂🤣😃😄😅😆") == L"😀😁😂🤣😃😄😅😆");
// Edge cases: a surrogate pair
std::u16string utf16 = u"\xD83D\xDE00"; // 😀
std::wstring expected_wide = L"\U0001F600";
REQUIRE(utf16_to_wide(utf16) == expected_wide);
// Invalid conversion (should throw an exception)
std::u16string invalid_utf16;
invalid_utf16.push_back(0xD800); // lone high surrogate
REQUIRE_THROWS_AS(utf16_to_wide(invalid_utf16), soci_error);
}
// Check wide_to_utf16() against UTF-16 literals and for the error raised on
// an unpaired surrogate.
TEST_CASE("Wide string to UTF-16 conversion tests", "[unicode]")
{
// Valid conversion tests
REQUIRE(wide_to_utf16(L"Hello, world!") == u"Hello, world!");
REQUIRE(wide_to_utf16(L"こんにちは世界") == u"こんにちは世界");
REQUIRE(wide_to_utf16(L"😀😁😂🤣😃😄😅😆") == u"😀😁😂🤣😃😄😅😆");
// Edge cases: a character outside of the BMP must become a surrogate pair
std::wstring wide = L"\U0001F600"; // 😀
REQUIRE(wide_to_utf16(wide) == u"\xD83D\xDE00");
// Invalid conversion (should throw an exception)
std::wstring invalid_wide;
invalid_wide.push_back(0xD800); // lone high surrogate
REQUIRE_THROWS_AS(wide_to_utf16(invalid_wide), soci_error);
}

View File

@@ -12,4 +12,7 @@
soci_backend_test(
BACKEND Empty
SOURCE test-empty.cpp
# We only run these tests from the empty backend test, as they don't use
# database at all.
../common/test-unicode.cpp
CONNSTR "dummy")

View File

@@ -77,6 +77,58 @@ TEST_CASE("MS SQL long string", "[odbc][mssql][long]")
);
}
// Table creator used by the wide string tests: its constructor creates the
// soci_test table with a single nullable nvarchar(40) column called
// wide_text.
struct wide_text_table_creator : public table_creator_base
{
explicit wide_text_table_creator(soci::session &sql)
: table_creator_base(sql)
{
sql << "create table soci_test ("
"wide_text nvarchar(40) null"
")";
}
};
// Check that a std::wstring containing non-ASCII characters can be inserted
// into an nvarchar column and read back unchanged.
TEST_CASE("MS SQL wide string", "[odbc][mssql][wstring]")
{
soci::session sql(backEnd, connectString);
wide_text_table_creator create_wide_text_table(sql);
std::wstring const str_in = L"Привет, SOCI!";
sql << "insert into soci_test(wide_text) values(:str)", use(str_in);
std::wstring str_out;
sql << "select wide_text from soci_test", into(str_out);
CHECK(str_out == str_in);
}
// Check that a vector of std::wstring can be bulk-inserted into an nvarchar
// column and read back, element by element, into another vector.
TEST_CASE("MS SQL wide string vector", "[odbc][mssql][vector][wstring]")
{
soci::session sql(backEnd, connectString);
wide_text_table_creator create_wide_text_table(sql);
std::vector<std::wstring> const str_in = {
L"Привет, SOCI!",
L"Привет, World!",
L"Привет, Universe!",
L"Привет, Galaxy!"};
sql << "insert into soci_test(wide_text) values(:str)", use(str_in);
// The output vector is pre-sized to the number of rows inserted above.
std::vector<std::wstring> str_out(4);
// NOTE(review): the select has no ORDER BY, so the element-wise comparison
// below relies on the rows being returned in insertion order -- confirm
// this is acceptable for this test.
sql << "select wide_text from soci_test", into(str_out);
CHECK(str_out.size() == str_in.size());
for (std::size_t i = 0; i != str_in.size(); ++i)
{
CHECK(str_out[i] == str_in[i]);
}
}
// DDL Creation objects for common tests
struct table_creator_one : public table_creator_base
{