SMS Server Tools 3
 Menu
Basic information:
Additional information:
Support:
Get SMS Server Tools 3:
Additional Options

 Sponsored links

 Search
Custom Search

 Visitor locations
 
 SMS Server Tools 3 Community
Welcome, Guest. The forum is currently read-only, but will open soon. Fri Mar 14, 2025 10:46
SMSTools3 Community » Search Bottom

Page:  1

Keywords:
Mode: All keywords (AND)
marko: Hi Keke, Thanks for fixing it further. Can you please also update the comments to indicate that U+10FFFF is the maximum supported code point, not U+FFFFF?
marko: Thank you for revising my fix. I should really have tested with a broader set of code points. It looks like the maximum Unicode code point, which takes 4 bytes in UTF-8, is U+10FFFF (one UTF-16 surrogate pair, i.e. two 16-bit code units: U+DBFF U+DFFF). GNOME will not let me type anything larger than that.
marko: Thank you for the prompt response and the decision to prepare an extra release with this fix. My friend is planning to deploy the system in about a month. It is always easier when custom patches are not needed.
marko: First of all, thank you for this great piece of software. As part of setting up this system for a friend, I wanted to test how Unicode data is handled. It turns out there are 2 bugs that affect receiving messages, one of which also affects sending. Both will cause data to be truncated at the first encountered invalid character. For receiving, any code point at U+8000 or above will cause truncation, if the char data type is signed and sizeof(unsigned int) > 2, which I believe should hold on most systems. The reason is that sign extension will take place here: int ucs2_to_utf8_char(char *ucs2, char *utf8) { int result; unsigned int c = (ucs2[0] << 8) | (unsigned char)ucs2[1]; Because of the sign extension, we will have c>0xffff if sizeof(c) > 2, and this function will return 0, causing the string to be truncated. Another problem for both receiving and sending is that UTF-16 data is not supported. UTF-16 is basically an extension of UCS2, allowing the code points U+10000 to U+10FFFF to be represented with two 16-bit code units, called a surrogate pair. The first unit of the pair (the high surrogate) is in the range 0xd800..0xdbff, and the second one (the low surrogate) is in the range 0xdc00..0xdfff. I fixed these problems. I registered on this forum only to report this bug and to submit my patch. I would have preferred to use email for this, but I did not find any address. So, I am posting my patch below (see more discussion after it): diff -pur smstools3.1.21/src/charset.c smstools3-utf16/src/charset.c --- smstools3.1.21/src/charset.c 2017-03-31 16:22:07.000000000 +0300 +++ smstools3-utf16/src/charset.c 2017-07-09 22:51:35.203781163 +0300 @@ -521,6 +521,38 @@ int iso_utf8_2gsm(char* source, int size return dest_count; } +// Returns the number of utf8 bytes. 
+static int ucs2_to_utf8_char(const char *ucs2, char *utf8) +{ + int result; + unsigned c = (((unsigned char)ucs2[0]) << 8) | (unsigned char)ucs2[1]; + + if (c <= 0x7F) + { + utf8[0] = (unsigned char)c; + result = 1; + } + else if (c <= 0x7FF) + { + utf8[1] = (unsigned char)(0x80 | (c & 0x3F)); + c = (c >> 6); + utf8[0] = (unsigned char)(0xC0 | c); + result = 2; + } + else + { + utf8[2] = (unsigned char)(0x80 | (c & 0x3F)); + c = (c >> 6); + utf8[1] = (unsigned char)(0x80 | (c & 0x3F)); + c = (c >> 6); + utf8[0] = (unsigned char)(0xE0 | c); + result = 3; + } + + utf8[result] = '\0'; + return result; +} + // Outputs to the file. Return value: 0 = ok, -1 = error. int iso2utf8_file(FILE *fp, char *ascii, int userdatalength) { @@ -927,7 +959,7 @@ int decode_7bit_packed( return i; } -int utf8bytes0(char *s, int allow_iso) +static int utf8bytes0(const char *s, int allow_iso) { int result = 1; int i; @@ -960,20 +992,20 @@ int utf8bytes0(char *s, int allow_iso) return result; } -int utf8bytes(char *s) +int utf8bytes(const char *s) { return utf8bytes0(s, 0); } -int iso_utf8bytes(char *s) +static int iso_utf8bytes(const char *s) { return utf8bytes0(s, 1); } -int iso_utf8chars(char *s) +int iso_utf8chars(const char *s) { int result = 0; - char *p = s; + const char *p = s; int i; while (*p) @@ -988,155 +1020,195 @@ int iso_utf8chars(char *s) return result; } -int iso_utf8_to_ucs2_char(char *utf8, int *len, char *ucs2) +/** Transcode a Unicode code point from UTF-8 (or ISO 8859-15) into UTF-16 +@param s8 NUL-terminated UTF-8 or ISO 8859-15 data +@param d16 UTF-16 data (will be advanced by one character) +@param end end the UTF-16 output buffer +@return whether the conversion succeeded */ +static int iso_utf8_to_ucs2_char(const char **s8, char **d16, const char* end) { - unsigned int c = 0; - int i; + unsigned int c; - i = iso_utf8bytes(utf8); - if (len) - *len = i; + if (*d16 + 2 > end) + return 0; - switch (i) + switch (iso_utf8bytes(*s8)) { - case 1: - c = (unsigned 
char)utf8[0]; - break; + case 1: + c = (unsigned char)*(*s8)++; + break; + + case 2: + c = *(*s8)++ & 0x3F; + c <<= 6; + c |= *(*s8)++ & 0x3F; + break; + + case 3: + c = *(*s8)++ & 0x0F; + c <<= 6; + c |= *(*s8)++ & 0x3F; + c <<= 6; + c |= *(*s8)++ & 0x3F; + break; + + case 4: + c = *(*s8)++ & 0x0F; + c <<= 6; + c |= *(*s8)++ & 0x3F; + c <<= 6; + c |= *(*s8)++ & 0x3F; + c <<= 6; + c |= *(*s8)++ & 0x3F; + break; - case 2: - c = (utf8[0] & 0x1F) << 6 | (utf8[1] & 0x3F); - break; - - case 3: - c = (utf8[0] & 0x0F) << 12 | (utf8[1] & 0x3F) << 6 | (utf8[2] & 0x3F); - break; + default: + return 0; + } - default: + if (c >= 0x10000) { + /* Use UTF-16 encoding for code points that reside outside the + Basic Multilingual Plane. */ + if (*d16 + 4 > end) return 0; + + // Fix: Subtract 0x10000 from the code point: + //c &= 0xFFFF; + c -= 0x10000; + + unsigned int c0 = 0xD800 | (c >> 10); + *(*d16)++ = (char)(c0 >> 8); + *(*d16)++ = (char)(c0 & 0xFF); + c &= 0x3FF; + c |= 0xDC00; } - ucs2[0] = (unsigned char)((c & 0xFF00) >> 8); - ucs2[1] = (unsigned char)(c & 0xFF); + *(*d16)++ = (char)(c >> 8); + *(*d16)++ = (char)(c & 0xFF); return 1; } -// Note: Returns the number of UCS2 characters, not bytes. 
-int iso_utf8_to_ucs2_buffer(char *utf8, char *ucs2, size_t ucs2_size) +/** Transcode a Unicode code point from UTF-16 into UTF-8 +@param s16 UTF-16 data (will be advanced by one character) +@param srcend end of the UTF-16 string +@param d8 UTF-8 data (will be advanced by one character) +@param dstend end the UTF-8 output buffer +@return whether the conversion succeeded */ +static int utf16_to_utf8_char(const char **s16, const char *srcend, + char **d8, const char *dstend) { - char *p = utf8; - char *end = utf8 +strlen(utf8); - int bytes; - size_t dest = 0; - int result = 0; - - while (p < end) - { - if (dest >= ucs2_size -1) - break; + if (*s16 + 2 > srcend) + return 0; - if (!iso_utf8_to_ucs2_char(p, &bytes, &ucs2[dest])) - break; + unsigned c = (((unsigned char)(*s16)[0]) << 8) | (unsigned char)(*s16)[1]; + *s16 += 2; - p += bytes; - dest += 2; - result++; + /* Attempt to decode two consecutive surrogate pairs as UTF-16. + If that fails, fall back to encode those code points in 3 UTF-8 bytes. */ + if (c >= 0xD800 && c <= 0xDBFF) + { + unsigned c2 = *s16 + 2 > srcend + ? 0 + : (((unsigned char)(*s16)[0]) << 8) | (unsigned char)(*s16)[1]; + if (c2 >= 0xDC00 && c2 <= 0xDFFF) { + /* Decode two UTF-16 surrogate pairs into one Unicode code point + (U+10000 to U+10FFFF (not U+FFFFF), 1..4 bytes in UTF-8) */ + + // Fix2: + //c = 0x10000 | (c & 0x3FF) << 10 | (c2 & 0x3FF); + c = 0x10000 + ((c & 0x3FF) << 10) | (c2 & 0x3FF); + + *s16 += 2; + + if (*d8 + 4 > dstend) + return 0; + (*d8)[3] = (char)(0x80 | (c & 0x3F)); + c >>= 6; + (*d8)[2] = (char)(0x80 | (c & 0x3F)); + c >>= 6; + (*d8)[1] = (char)(0x80 | (c & 0x3F)); + c >>= 6; + (*d8)[0] = (char)(0xF0 | c); + *d8 += 4; + return 1; + } } - return result; -} - -// Returns the number of utf8 bytes. 
-int ucs2_to_utf8_char(char *ucs2, char *utf8) -{ - int result; - unsigned int c = (ucs2[0] << 8) | (unsigned char)ucs2[1]; - if (c <= 0x7F) { - utf8[0] = (unsigned char)c; - result = 1; + if (*d8 + 1 > dstend) + return 0; + *(*d8)++ = (char) c; } else if (c <= 0x7FF) { - utf8[1] = (unsigned char)(0x80 | (c & 0x3F)); - c = (c >> 6); - utf8[0] = (unsigned char)(0xC0 | c); - result = 2; - } - else if (c <= 0xFFFF) - { - utf8[2] = (unsigned char)(0x80 | (c & 0x3F)); - c = (c >> 6); - utf8[1] = (unsigned char)(0x80 | (c & 0x3F)); - c = (c >> 6); - utf8[0] = (unsigned char)(0xE0 | c); - result = 3; + if (*d8 + 2 > dstend) + return 0; + (*d8)[1] = (char) (0x80 | (c & 0x3F)); + c >>= 6; + (*d8)[0] = (char) (0xC0 | c); + *d8 += 2; } else - result = 0; - - utf8[result] = '\0'; - return result; -} - -// Returns number of utf8 characters, not bytes. -int ucs2_to_utf8_buffer(char *ucs2, size_t ucs2_buffer_len, char *utf8, size_t utf8_size) -{ - int result = 0; - char *p = ucs2; - char *end = ucs2 + ucs2_buffer_len; - char utf8char[7]; - size_t len = 0; - int i; - - while (p < end) { - if (!(i = ucs2_to_utf8_char(p, utf8char))) - break; - - if (len + i >= utf8_size) - break; - - strcpy(&utf8[len], utf8char); - len += i; - p += 2; - result++; + if (*d8 + 3 > dstend) + return 0; + (*d8)[2] = (char)(0x80 | (c & 0x3F)); + c >>= 6; + (*d8)[1] = (char)(0x80 | (c & 0x3F)); + c >>= 6; + (*d8)[0] = (char)(0xE0 | c); + *d8 += 3; } - return result; + return 1; } -// Returns number of bytes. 
+/** Transcode a Unicode string from UTF-16 into UTF-8 +@param buf in: UTF-16 string; out: UTF-8 string +@param len length of the UTF-16 buffer, in bytes +@param maxlen maximum length of the UTF-8 output, in bytes +@return length of the converted UTF-8 string, in bytes */ size_t ucs2utf(char *buf, size_t len, size_t maxlen) { - char *ucs2 = (char *)malloc(len); + char *s = malloc(len); - if (!ucs2) + if (!s) return 0; + else + { + const char *ucs2 = s; + const char *ends = ucs2 + len; + char *utf8 = buf; + char *endd = utf8 + maxlen; - memcpy(ucs2, buf, len); - ucs2_to_utf8_buffer(ucs2, len, buf, maxlen +1); + memcpy(s, buf, len); - free(ucs2); + while (utf16_to_utf8_char(&ucs2, ends, &utf8, endd)); - return strlen(buf); + if (utf8 < endd) + *utf8 = '\0'; + + free(s); + + return utf8 - buf; + } } // Returns number of bytes. size_t iso_utf8_2ucs(char *buf, size_t maxlen) { - size_t ucs2_size = iso_utf8chars(buf) * 2; // Not NULL terminated. - char *ucs2; + size_t ucs2_size = iso_utf8chars(buf) * 4; // Not NULL terminated. 
+ char *ucs2, *d16; + const char *s8; size_t bytes; - if (ucs2_size > maxlen + 1) - ucs2_size = maxlen + 1; - - if (!(ucs2 = (char *)malloc(ucs2_size))) + if (!(ucs2 = malloc(ucs2_size))) return 0; - bytes = 2 * iso_utf8_to_ucs2_buffer(buf, ucs2, ucs2_size); + d16 = ucs2; + s8 = buf; + + while (*s8 && iso_utf8_to_ucs2_char(&s8, &d16, ucs2 + ucs2_size)); + + bytes = d16 - ucs2; + + if (bytes > maxlen) + bytes = maxlen; + memcpy(buf, ucs2, bytes); free(ucs2); diff -pur smstools3.1.21/src/charset.h smstools3-utf16/src/charset.h --- smstools3.1.21/src/charset.h 2017-03-29 23:33:14.000000000 +0300 +++ smstools3-utf16/src/charset.h 2017-07-09 22:51:19.467070697 +0300 @@ -31,13 +31,8 @@ int iso2utf8_file(FILE *fp, char *ascii, int decode_7bit_packed(char *text, char *dest, size_t size_dest); int encode_7bit_packed(char *text, char *dest, size_t size_dest); -int utf8bytes(char *s); -int iso_utf8bytes(char *s); -int iso_utf8chars(char *s); -int iso_utf8_to_ucs2_char(char *utf8, int *len, char *ucs2); -int iso_utf8_to_ucs2_buffer(char *utf8, char *ucs2, size_t ucs2_size); -int ucs2_to_utf8_char(char *ucs2, char *utf8); -int ucs2_to_utf8_buffer(char *ucs2, size_t ucs2_buffer_len, char *utf8, size_t utf8_size); +int utf8bytes(const char *s); +int iso_utf8chars(const char *s); size_t ucs2utf(char *buf, size_t len, size_t maxlen); size_t iso_utf8_2ucs(char *buf, size_t maxlen); int utf8_to_iso_char(char *utf8, unsigned char *iso); The above patch includes some cleanup as well, mainly the addition of static linkage specifiers and const qualifiers. 
For testing the patch, I used the following patch: diff -pur smstools3.1.21/src/Makefile smstools3-utf16/src/Makefile --- smstools3.1.21/src/Makefile 2017-05-04 00:05:45.000000000 +0300 +++ smstools3-utf16/src/Makefile 2017-07-09 19:53:23.723804454 +0300 @@ -49,10 +49,12 @@ CFLAGS += -D_FILE_OFFSET_BITS=64 # Use the following only on GNU/Linux and only if you need ps listing like "smsd: MAINPROCESS" and "smsd: GSM1" # CFLAGS += -D USE_LINUX_PS_TRICK -all: smsd +all: smsd smsd-test smsd: smsd.c extras.o locking.o cfgfile.o logging.o alarm.o smsd_cfg.o charset.o stats.o blacklist.o whitelist.o modeminit.o pdu.o charshift.o +smsd-test: test_utf16.o charset.o pdu.o charshift.o + ifneq (,$(findstring SOLARIS,$(CFLAGS))) ifeq (,$(findstring DISABLE_INET_SOCKET,$(CFLAGS))) override LFLAGS += -lsocket -lnsl and the following test program that I wrote first so that I could reproduce the problem without having any hardware attached: #include <stdarg.h> #include <stdio.h> #include <string.h> #include <stdlib.h> #include "smsd_cfg.h" #include "pdu.h" #include "charset.h" static char text[2048]; char* tb_sprintf(char* format, ...) { va_list argp; if (format) { va_start(argp, format); vsnprintf(text, sizeof text, format, argp); va_end(argp); } return text; } void logch(char* format, ...) { tb_sprintf(format); } void writelogfile(int severity, int trouble, char* format, ...) { if (format) { va_list argp; fprintf(stderr, "%d:%d:", severity, trouble); va_start(argp, format); vfprintf(stderr, format, argp); va_end(argp); } } void writelogfile0(int severity, int trouble, char *text) { writelogfile(severity, trouble, "%s", text); } int make_datetime_string(char *dest, size_t dest_size, char *a_date, char *a_time, char *a_format) { *dest = a_date && a_time && a_format ? 
0 * dest_size : 0; return 0; } char prch(char ch) { if ((unsigned char)ch >= ' ') return ch; return '.'; } void strcat_realloc(char **buffer, char *str, char *delimiter) { int delimiter_length = 0; if (delimiter) delimiter_length = strlen(delimiter); if (*buffer == 0) { if ((*buffer = malloc(strlen(str) + delimiter_length + 1))) **buffer = 0; } else *buffer = realloc(*buffer, strlen(*buffer) + strlen(str) + delimiter_length + 1); if (*buffer) sprintf(strchr(*buffer, 0), "%s%s", str, delimiter ? delimiter : ""); } char *strcpyo(char *dest, const char *src) { size_t i; for (i = 0; src[i] != '\0'; i++) dest[i] = src[i]; dest[i] = '\0'; return dest; } int main (int argc, char** argv) { if (argc != 2) { fprintf(stderr, "Usage: %s PDU_string_in_hexadecimal\n", *argv); return 1; } int alphabet, with_udh, is_statusreport, is_unsupported_pdu, report, replace; int flash; char sendr[100], date[9], time[9], ascii[MAXTEXT], smsc[31], from_toa[51]; char udh_data[SIZE_UDH_DATA], udh_type[SIZE_UDH_TYPE]; char warning_headers[SIZE_WARNING_HEADERS]; size_t udlen = splitpdu(argv[1], "new", &alphabet, sendr, date, time, ascii, smsc, &with_udh, udh_data, udh_type, &is_statusreport, &is_unsupported_pdu, from_toa, &report, &replace, warning_headers, &flash, 0); size_t len = ucs2utf(ascii, udlen, sizeof ascii); fprintf(stderr, "%d:%zu:%zu:%s\n", alphabet, udlen, len, ascii); size_t ucslen = iso_utf8_2ucs(ascii, sizeof ascii); len = ucs2utf(ascii, ucslen, sizeof ascii); fprintf(stderr, "%zu:%zu:%s\n", ucslen, len, ascii); return 0; } The test string that I used was from the "PDU" header of a received message. With that message, the output of the program would be as follows: 2:40:26:Koe ja toinen 😀😁😂 40:26:Koe ja toinen 😀😁😂 Without the patch applied, the string would be truncated at the first emoji (U+1F600), leaving the string "Koe ja toinen " (with a terminating space). I would appreciate it if you could include this patch in the next smstools release.

Page:  1

SMSTools3 Community » Search Top

 
Time in this board is UTC.  

Privacy Policy   SMS Server Tools 3 Copyright © Keijo Kasvi.