SMSTools3 Search

SMS Server Tools 3

Menu

Basic information:

Additional information:

Support:

Get SMS Server Tools 3:

Additional Options

Sponsored links

Search

Custom Search

Visitor locations

SMS Server Tools 3 Community

Welcome, Guest. The forum is currently read-only, but will open soon.

Sat Jan 03, 2026 07:44

• SMSTools3 Community » Search

Page: 1

Keywords: Mode: All keywords (AND)
Fri Nov 29, 2019 17:26 [PATCH] 3.1.21 truncates messages with certain Unicode code points (Bug reports)
marko: Hi Keke, Thanks for fixing it further. Can you please also update the comments to indicate that U+10FFFF is the maximum supported code point, not U+FFFFF?
Mon Nov 25, 2019 06:18 [PATCH] 3.1.21 truncates messages with certain Unicode code points (Bug reports)
marko: Thank you for revising my fix. I should really have tested with a broader set of code points. It looks like the maximum code point that fits in 4 bytes of UTF-8 is U+10FFFF (a single UTF-16 surrogate pair, i.e. 2 code units: U+DBFF U+DFFF). GNOME will not let me type anything larger than that.
Tue Jul 11, 2017 17:48 [PATCH] 3.1.21 truncates messages with certain Unicode code points (Bug reports)
marko: Thank you for the prompt response and the decision to prepare an extra release with this fix. My friend is planning to deploy the system in about a month. It is always easier when custom patches are not needed.
Sun Jul 09, 2017 20:17 [PATCH] 3.1.21 truncates messages with certain Unicode code points (Bug reports)
marko: First of all, thank you for this great piece of software. As part of setting up this system for a friend, I wanted to test how Unicode data is handled. It turns out there are 2 bugs when receiving messages, and 1 when sending. Both will cause data to be truncated at the first encountered invalid character. For receiving, any code point at U+8000 or above will cause truncation, if the char data type is signed and sizeof(unsigned int) > 2, which I believe should hold on most systems. The reason is that sign extension will take place here: int ucs2_to_utf8_char(char *ucs2, char *utf8) { int result; unsigned int c = (ucs2[0] << 8) | (unsigned char)ucs2[1]; Because of the sign extension, we will have c>0xffff if sizeof(c) > 2, and this function will return 0, causing the string to be truncated. Another problem for both receiving and sending is that UTF-16 data is not supported. UTF-16 is basically an extension of UCS2, allowing the code points U+10000 to U+10FFFF to be represented with two UCS2 code units, called a surrogate pair. The first code unit of the pair is in the range 0xd800..0xdbff, and the second one is in the range 0xdc00..0xdfff. I fixed these problems. I registered on this forum only to report this bug and to submit my patch. I would have preferred to use email for this, but I did not find any address. So, I am posting my patch below (see more discussion after it): diff -pur smstools3.1.21/src/charset.c smstools3-utf16/src/charset.c --- smstools3.1.21/src/charset.c 2017-03-31 16:22:07.000000000 +0300 +++ smstools3-utf16/src/charset.c 2017-07-09 22:51:35.203781163 +0300 @@ -521,6 +521,38 @@ int iso_utf8_2gsm(char* source, int size return dest_count; } +// Returns the number of utf8 bytes. 
+static int ucs2_to_utf8_char(const char ucs2, char utf8) +{ + int result; + unsigned c = (((unsigned char)ucs2[0]) << 8) \| (unsigned char)ucs2[1]; + + if (c <= 0x7F) + { + utf8[0] = (unsigned char)c; + result = 1; + } + else if (c <= 0x7FF) + { + utf8[1] = (unsigned char)(0x80 \| (c & 0x3F)); + c = (c >> 6); + utf8[0] = (unsigned char)(0xC0 \| c); + result = 2; + } + else + { + utf8[2] = (unsigned char)(0x80 \| (c & 0x3F)); + c = (c >> 6); + utf8[1] = (unsigned char)(0x80 \| (c & 0x3F)); + c = (c >> 6); + utf8[0] = (unsigned char)(0xE0 \| c); + result = 3; + } + + utf8[result] = '\0'; + return result; +} + // Outputs to the file. Return value: 0 = ok, -1 = error. int iso2utf8_file(FILE fp, char ascii, int userdatalength) { @@ -927,7 +959,7 @@ int decode_7bit_packed( return i; } -int utf8bytes0(char s, int allow_iso) +static int utf8bytes0(const char s, int allow_iso) { int result = 1; int i; @@ -960,20 +992,20 @@ int utf8bytes0(char s, int allow_iso) return result; } -int utf8bytes(char s) +int utf8bytes(const char s) { return utf8bytes0(s, 0); } -int iso_utf8bytes(char s) +static int iso_utf8bytes(const char s) { return utf8bytes0(s, 1); } -int iso_utf8chars(char s) +int iso_utf8chars(const char s) { int result = 0; - char p = s; + const char p = s; int i; while (p) @@ -988,155 +1020,195 @@ int iso_utf8chars(char s) return result; } -int iso_utf8_to_ucs2_char(char utf8, int len, char ucs2) +/** Transcode a Unicode code point from UTF-8 (or ISO 8859-15) into UTF-16 +@param s8 NUL-terminated UTF-8 or ISO 8859-15 data +@param d16 UTF-16 data (will be advanced by one character) +@param end end the UTF-16 output buffer +@return whether the conversion succeeded / +static int iso_utf8_to_ucs2_char(const char s8, char d16, const char end) { - unsigned int c = 0; - int i; + unsigned int c; - i = iso_utf8bytes(utf8); - if (len) - len = i; + if (d16 + 2 > end) + return 0; - switch (i) + switch (iso_utf8bytes(s8)) { - case 1: - c = (unsigned char)utf8[0]; - break; + case 1: 
+ c = (unsigned char)(s8)++; + break; + + case 2: + c = (s8)++ & 0x3F; + c <<= 6; + c \|= (s8)++ & 0x3F; + break; + + case 3: + c = (s8)++ & 0x0F; + c <<= 6; + c \|= (s8)++ & 0x3F; + c <<= 6; + c \|= (s8)++ & 0x3F; + break; + + case 4: + c = (s8)++ & 0x0F; + c <<= 6; + c \|= (s8)++ & 0x3F; + c <<= 6; + c \|= (s8)++ & 0x3F; + c <<= 6; + c \|= (s8)++ & 0x3F; + break; - case 2: - c = (utf8[0] & 0x1F) << 6 \| (utf8[1] & 0x3F); - break; - - case 3: - c = (utf8[0] & 0x0F) << 12 \| (utf8[1] & 0x3F) << 6 \| (utf8[2] & 0x3F); - break; + default: + return 0; + } - default: + if (c >= 0x10000) { + / Use UTF-16 encoding for code points that reside outside the + Basic Multilingual Plane. / + if (d16 + 4 > end) return 0; + + // Fix: Subtract 0x10000 from the code point: + //c &= 0xFFFF; + c -= 0x10000; + + unsigned int c0 = 0xD800 \| (c >> 10); + (d16)++ = (char)(c0 >> 8); + (d16)++ = (char)(c0 & 0xFF); + c &= 0x3FF; + c \|= 0xDC00; } - ucs2[0] = (unsigned char)((c & 0xFF00) >> 8); - ucs2[1] = (unsigned char)(c & 0xFF); + (d16)++ = (char)(c >> 8); + (d16)++ = (char)(c & 0xFF); return 1; } -// Note: Returns the number of UCS2 characters, not bytes. 
-int iso_utf8_to_ucs2_buffer(char utf8, char ucs2, size_t ucs2_size) +/** Transcode a Unicode code point from UTF-16 into UTF-8 +@param s16 UTF-16 data (will be advanced by one character) +@param srcend end of the UTF-16 string +@param d8 UTF-8 data (will be advanced by one character) +@param dstend end the UTF-8 output buffer +@return whether the conversion succeeded / +static int utf16_to_utf8_char(const char s16, const char srcend, + char *d8, const char dstend) { - char p = utf8; - char end = utf8 +strlen(utf8); - int bytes; - size_t dest = 0; - int result = 0; - - while (p < end) - { - if (dest >= ucs2_size -1) - break; + if (s16 + 2 > srcend) + return 0; - if (!iso_utf8_to_ucs2_char(p, &bytes, &ucs2[dest])) - break; + unsigned c = (((unsigned char)(s16)[0]) << 8) \| (unsigned char)(s16)[1]; + s16 += 2; - p += bytes; - dest += 2; - result++; + /* Attempt to decode two consecutive surrogate pairs as UTF-16. + If that fails, fall back to encode those code points in 3 UTF-8 bytes. / + if (c >= 0xD800 && c <= 0xDBFF) + { + unsigned c2 = s16 + 2 > srcend + ? 0 + : (((unsigned char)(s16)[0]) << 8) \| (unsigned char)(s16)[1]; + if (c2 >= 0xDC00 && c2 <= 0xDFFF) { + /* Decode two UTF-16 surrogate pairs into one Unicode code point + (U+10000 to U+10FFFF (not U+FFFFF), 1..4 bytes in UTF-8) / + + // Fix2: + //c = 0x10000 \| (c & 0x3FF) << 10 \| (c2 & 0x3FF); + c = 0x10000 + ((c & 0x3FF) << 10) \| (c2 & 0x3FF); + + s16 += 2; + + if (d8 + 4 > dstend) + return 0; + (d8)[3] = (char)(0x80 \| (c & 0x3F)); + c >>= 6; + (d8)[2] = (char)(0x80 \| (c & 0x3F)); + c >>= 6; + (d8)[1] = (char)(0x80 \| (c & 0x3F)); + c >>= 6; + (d8)[0] = (char)(0xF0 \| c); + d8 += 4; + return 1; + } } - return result; -} - -// Returns the number of utf8 bytes. 
-int ucs2_to_utf8_char(char ucs2, char utf8) -{ - int result; - unsigned int c = (ucs2[0] << 8) \| (unsigned char)ucs2[1]; - if (c <= 0x7F) { - utf8[0] = (unsigned char)c; - result = 1; + if (d8 + 1 > dstend) + return 0; + (d8)++ = (char) c; } else if (c <= 0x7FF) { - utf8[1] = (unsigned char)(0x80 \| (c & 0x3F)); - c = (c >> 6); - utf8[0] = (unsigned char)(0xC0 \| c); - result = 2; - } - else if (c <= 0xFFFF) - { - utf8[2] = (unsigned char)(0x80 \| (c & 0x3F)); - c = (c >> 6); - utf8[1] = (unsigned char)(0x80 \| (c & 0x3F)); - c = (c >> 6); - utf8[0] = (unsigned char)(0xE0 \| c); - result = 3; + if (d8 + 2 > dstend) + return 0; + (d8)[1] = (char) (0x80 \| (c & 0x3F)); + c >>= 6; + (d8)[0] = (char) (0xC0 \| c); + d8 += 2; } else - result = 0; - - utf8[result] = '\0'; - return result; -} - -// Returns number of utf8 characters, not bytes. -int ucs2_to_utf8_buffer(char ucs2, size_t ucs2_buffer_len, char utf8, size_t utf8_size) -{ - int result = 0; - char p = ucs2; - char end = ucs2 + ucs2_buffer_len; - char utf8char[7]; - size_t len = 0; - int i; - - while (p < end) { - if (!(i = ucs2_to_utf8_char(p, utf8char))) - break; - - if (len + i >= utf8_size) - break; - - strcpy(&utf8[len], utf8char); - len += i; - p += 2; - result++; + if (d8 + 3 > dstend) + return 0; + (d8)[2] = (char)(0x80 \| (c & 0x3F)); + c >>= 6; + (d8)[1] = (char)(0x80 \| (c & 0x3F)); + c >>= 6; + (d8)[0] = (char)(0xE0 \| c); + d8 += 3; } - return result; + return 1; } -// Returns number of bytes. 
+/** Transcode a Unicode string from UTF-16 into UTF-8 +@param buf in: UTF-16 string; out: UTF-8 string +@param len length of the UTF-16 buffer, in bytes +@param maxlen maximum length of the UTF-8 output, in bytes +@return length of the converted UTF-8 string, in bytes / size_t ucs2utf(char buf, size_t len, size_t maxlen) { - char ucs2 = (char )malloc(len); + char s = malloc(len); - if (!ucs2) + if (!s) return 0; + else + { + const char ucs2 = s; + const char ends = ucs2 + len; + char utf8 = buf; + char endd = utf8 + maxlen; - memcpy(ucs2, buf, len); - ucs2_to_utf8_buffer(ucs2, len, buf, maxlen +1); + memcpy(s, buf, len); - free(ucs2); + while (utf16_to_utf8_char(&ucs2, ends, &utf8, endd)); - return strlen(buf); + if (utf8 < endd) + utf8 = '\0'; + + free(s); + + return utf8 - buf; + } } // Returns number of bytes. size_t iso_utf8_2ucs(char buf, size_t maxlen) { - size_t ucs2_size = iso_utf8chars(buf) 2; // Not NULL terminated. - char ucs2; + size_t ucs2_size = iso_utf8chars(buf) 4; // Not NULL terminated. 
+ char ucs2, d16; + const char s8; size_t bytes; - if (ucs2_size > maxlen + 1) - ucs2_size = maxlen + 1; - - if (!(ucs2 = (char )malloc(ucs2_size))) + if (!(ucs2 = malloc(ucs2_size))) return 0; - bytes = 2 * iso_utf8_to_ucs2_buffer(buf, ucs2, ucs2_size); + d16 = ucs2; + s8 = buf; + + while (s8 && iso_utf8_to_ucs2_char(&s8, &d16, ucs2 + ucs2_size)); + + bytes = d16 - ucs2; + + if (bytes > maxlen) + bytes = maxlen; + memcpy(buf, ucs2, bytes); free(ucs2); diff -pur smstools3.1.21/src/charset.h smstools3-utf16/src/charset.h --- smstools3.1.21/src/charset.h 2017-03-29 23:33:14.000000000 +0300 +++ smstools3-utf16/src/charset.h 2017-07-09 22:51:19.467070697 +0300 @@ -31,13 +31,8 @@ int iso2utf8_file(FILE fp, char ascii, int decode_7bit_packed(char text, char dest, size_t size_dest); int encode_7bit_packed(char text, char dest, size_t size_dest); -int utf8bytes(char s); -int iso_utf8bytes(char s); -int iso_utf8chars(char s); -int iso_utf8_to_ucs2_char(char utf8, int len, char ucs2); -int iso_utf8_to_ucs2_buffer(char utf8, char ucs2, size_t ucs2_size); -int ucs2_to_utf8_char(char ucs2, char utf8); -int ucs2_to_utf8_buffer(char ucs2, size_t ucs2_buffer_len, char utf8, size_t utf8_size); +int utf8bytes(const char s); +int iso_utf8chars(const char s); size_t ucs2utf(char buf, size_t len, size_t maxlen); size_t iso_utf8_2ucs(char buf, size_t maxlen); int utf8_to_iso_char(char utf8, unsigned char iso); The above patch includes some cleanup as well, mainly the addition of static linkage specifiers and const qualifiers. 
For testing the patch, I used the following patch: diff -pur smstools3.1.21/src/Makefile smstools3-utf16/src/Makefile --- smstools3.1.21/src/Makefile 2017-05-04 00:05:45.000000000 +0300 +++ smstools3-utf16/src/Makefile 2017-07-09 19:53:23.723804454 +0300 @@ -49,10 +49,12 @@ CFLAGS += -D_FILE_OFFSET_BITS=64 # Use the following only on GNU/Linux and only if you need ps listing like "smsd: MAINPROCESS" and "smsd: GSM1" # CFLAGS += -D USE_LINUX_PS_TRICK -all: smsd +all: smsd smsd-test smsd: smsd.c extras.o locking.o cfgfile.o logging.o alarm.o smsd_cfg.o charset.o stats.o blacklist.o whitelist.o modeminit.o pdu.o charshift.o +smsd-test: test_utf16.o charset.o pdu.o charshift.o + ifneq (,$(findstring SOLARIS,$(CFLAGS))) ifeq (,$(findstring DISABLE_INET_SOCKET,$(CFLAGS))) override LFLAGS += -lsocket -lnsl and the following test program that I wrote first so that I could reproduce the problem without having any hardware attached: #include <stdarg.h> #include <stdio.h> #include <string.h> #include <stdlib.h> #include "smsd_cfg.h" #include "pdu.h" #include "charset.h" static char text[2048]; char tb_sprintf(char* format, ...) { va_list argp; if (format) { va_start(argp, format); vsnprintf(text, sizeof text, format, argp); va_end(argp); } return text; } void logch(char* format, ...) { tb_sprintf(format); } void writelogfile(int severity, int trouble, char* format, ...) { if (format) { va_list argp; fprintf(stderr, "%d:%d:", severity, trouble); va_start(argp, format); vfprintf(stderr, format, argp); va_end(argp); } } void writelogfile0(int severity, int trouble, char text) { writelogfile(severity, trouble, "%s", text); } int make_datetime_string(char dest, size_t dest_size, char a_date, char a_time, char a_format) { dest = a_date && a_time && a_format ? 
0 * dest_size : 0; return 0; } char prch(char ch) { if ((unsigned char)ch >= ' ') return ch; return '.'; } void strcat_realloc(char *buffer, char str, char delimiter) { int delimiter_length = 0; if (delimiter) delimiter_length = strlen(delimiter); if (buffer == 0) { if ((buffer = malloc(strlen(str) + delimiter_length + 1))) buffer = 0; } else buffer = realloc(buffer, strlen(buffer) + strlen(str) + delimiter_length + 1); if (buffer) sprintf(strchr(buffer, 0), "%s%s", str, delimiter ? delimiter : ""); } char strcpyo(char dest, const char src) { size_t i; for (i = 0; src != '\0'; i++) dest = src; dest = '\0'; return dest; } int main (int argc, char* argv) { if (argc != 2) { fprintf(stderr, "Usage: %s PDU_string_in_hexadecimal\n", *argv); return 1; } int alphabet, with_udh, is_statusreport, is_unsupported_pdu, report, replace; int flash; char sendr[100], date[9], time[9], ascii[MAXTEXT], smsc[31], from_toa[51]; char udh_data[SIZE_UDH_DATA], udh_type[SIZE_UDH_TYPE]; char warning_headers[SIZE_WARNING_HEADERS]; size_t udlen = splitpdu(argv[1], "new", &alphabet, sendr, date, time, ascii, smsc, &with_udh, udh_data, udh_type, &is_statusreport, &is_unsupported_pdu, from_toa, &report, &replace, warning_headers, &flash, 0); size_t len = ucs2utf(ascii, udlen, sizeof ascii); fprintf(stderr, "%d:%zu:%zu:%s\n", alphabet, udlen, len, ascii); size_t ucslen = iso_utf8_2ucs(ascii, sizeof ascii); len = ucs2utf(ascii, ucslen, sizeof ascii); fprintf(stderr, "%zu:%zu:%s\n", ucslen, len, ascii); return 0; } The test string that I used was from the "PDU" header of a received message. With that message, the output of the program would be as follows: 2:40:26:Koe ja toinen 😀😁😂 40:26:Koe ja toinen 😀😁😂 Without the patch applied, the string would be truncated at the first emoji (U+1F600), leaving the string "Koe ja toinen " (with a trailing space). I would appreciate it if you could include this patch in the next smstools release.

Page: 1

• SMSTools3 Community » Search

Top

Time in this board is UTC.

Privacy Policy SMS Server Tools 3 Copyright © Keijo Kasvi.