As part of setting up this system for a friend, I wanted to test how Unicode data is handled.
It turns out there are two bugs that affect receiving messages, one of which also affects sending. Both cause the data to be truncated at the first character that cannot be converted.
For receiving, any code point at U+8000 or above will cause truncation, if the char data type is signed and sizeof(unsigned int) > 2, which I believe should hold on most systems. The reason is that sign extension will take place here:
int ucs2_to_utf8_char(char *ucs2, char *utf8)
{
int result;
unsigned int c = (ucs2[0] << 8) | (unsigned char)ucs2[1];
{
int result;
unsigned int c = (ucs2[0] << 8) | (unsigned char)ucs2[1];
'c' Syntax Highlight powered by GeSHi
Because of the sign extension, we will have c>0xffff if sizeof(c) > 2, and this function will return 0, causing the string to be truncated.
Another problem, for both receiving and sending, is that UTF-16 data is not supported. UTF-16 is basically an extension of UCS2 that allows the code points U+10000 to U+10FFFF to be represented with two 16-bit code units, called a surrogate pair. The first unit (the high surrogate) is in the range 0xD800..0xDBFF, and the second one (the low surrogate) is in the range 0xDC00..0xDFFF.
I fixed these problems. I registered on this forum only to report this bug and to submit my patch. I would have preferred to use email for this, but I did not find any address. So, I am posting my patch below (see more discussion after it):
diff -pur smstools3.1.21/src/charset.c smstools3-utf16/src/charset.c
--- smstools3.1.21/src/charset.c 2017-03-31 16:22:07.000000000 +0300
+++ smstools3-utf16/src/charset.c 2017-07-09 22:51:35.203781163 +0300
@@ -521,6 +521,38 @@ int iso_utf8_2gsm(char* source, int size
return dest_count;
}
+// Returns the number of utf8 bytes.
+static int ucs2_to_utf8_char(const char *ucs2, char *utf8)
+{
+ int result;
+ unsigned c = (((unsigned char)ucs2[0]) << 8) | (unsigned char)ucs2[1];
+
+ if (c <= 0x7F)
+ {
+ utf8[0] = (unsigned char)c;
+ result = 1;
+ }
+ else if (c <= 0x7FF)
+ {
+ utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
+ c = (c >> 6);
+ utf8[0] = (unsigned char)(0xC0 | c);
+ result = 2;
+ }
+ else
+ {
+ utf8[2] = (unsigned char)(0x80 | (c & 0x3F));
+ c = (c >> 6);
+ utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
+ c = (c >> 6);
+ utf8[0] = (unsigned char)(0xE0 | c);
+ result = 3;
+ }
+
+ utf8[result] = '\0';
+ return result;
+}
+
// Outputs to the file. Return value: 0 = ok, -1 = error.
int iso2utf8_file(FILE *fp, char *ascii, int userdatalength)
{
@@ -927,7 +959,7 @@ int decode_7bit_packed(
return i;
}
-int utf8bytes0(char *s, int allow_iso)
+static int utf8bytes0(const char *s, int allow_iso)
{
int result = 1;
int i;
@@ -960,20 +992,20 @@ int utf8bytes0(char *s, int allow_iso)
return result;
}
-int utf8bytes(char *s)
+int utf8bytes(const char *s)
{
return utf8bytes0(s, 0);
}
-int iso_utf8bytes(char *s)
+static int iso_utf8bytes(const char *s)
{
return utf8bytes0(s, 1);
}
-int iso_utf8chars(char *s)
+int iso_utf8chars(const char *s)
{
int result = 0;
- char *p = s;
+ const char *p = s;
int i;
while (*p)
@@ -988,155 +1020,195 @@ int iso_utf8chars(char *s)
return result;
}
-int iso_utf8_to_ucs2_char(char *utf8, int *len, char *ucs2)
+/** Transcode a Unicode code point from UTF-8 (or ISO 8859-15) into UTF-16
+@param s8 NUL-terminated UTF-8 or ISO 8859-15 data
+@param d16 UTF-16 data (will be advanced by one character)
+@param end end the UTF-16 output buffer
+@return whether the conversion succeeded */
+static int iso_utf8_to_ucs2_char(const char **s8, char **d16, const char* end)
{
- unsigned int c = 0;
- int i;
+ unsigned int c;
- i = iso_utf8bytes(utf8);
- if (len)
- *len = i;
+ if (*d16 + 2 > end)
+ return 0;
- switch (i)
+ switch (iso_utf8bytes(*s8))
{
- case 1:
- c = (unsigned char)utf8[0];
- break;
+ case 1:
+ c = (unsigned char)*(*s8)++;
+ break;
+
+ case 2:
+ c = *(*s8)++ & 0x3F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ break;
+
+ case 3:
+ c = *(*s8)++ & 0x0F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ break;
+
+ case 4:
+ c = *(*s8)++ & 0x0F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ break;
- case 2:
- c = (utf8[0] & 0x1F) << 6 | (utf8[1] & 0x3F);
- break;
-
- case 3:
- c = (utf8[0] & 0x0F) << 12 | (utf8[1] & 0x3F) << 6 | (utf8[2] & 0x3F);
- break;
+ default:
+ return 0;
+ }
- default:
+ if (c >= 0x10000) {
+ /* Use UTF-16 encoding for code points that reside outside the
+ Basic Multilingual Plane. */
+ if (*d16 + 4 > end)
return 0;
+
+ // Fix: Subtract 0x10000 from the code point:
+ //c &= 0xFFFF;
+ c -= 0x10000;
+
+ unsigned int c0 = 0xD800 | (c >> 10);
+ *(*d16)++ = (char)(c0 >> 8);
+ *(*d16)++ = (char)(c0 & 0xFF);
+ c &= 0x3FF;
+ c |= 0xDC00;
}
- ucs2[0] = (unsigned char)((c & 0xFF00) >> 8);
- ucs2[1] = (unsigned char)(c & 0xFF);
+ *(*d16)++ = (char)(c >> 8);
+ *(*d16)++ = (char)(c & 0xFF);
return 1;
}
-// Note: Returns the number of UCS2 characters, not bytes.
-int iso_utf8_to_ucs2_buffer(char *utf8, char *ucs2, size_t ucs2_size)
+/** Transcode a Unicode code point from UTF-16 into UTF-8
+@param s16 UTF-16 data (will be advanced by one character)
+@param srcend end of the UTF-16 string
+@param d8 UTF-8 data (will be advanced by one character)
+@param dstend end the UTF-8 output buffer
+@return whether the conversion succeeded */
+static int utf16_to_utf8_char(const char **s16, const char *srcend,
+ char **d8, const char *dstend)
{
- char *p = utf8;
- char *end = utf8 +strlen(utf8);
- int bytes;
- size_t dest = 0;
- int result = 0;
-
- while (p < end)
- {
- if (dest >= ucs2_size -1)
- break;
+ if (*s16 + 2 > srcend)
+ return 0;
- if (!iso_utf8_to_ucs2_char(p, &bytes, &ucs2[dest]))
- break;
+ unsigned c = (((unsigned char)(*s16)[0]) << 8) | (unsigned char)(*s16)[1];
+ *s16 += 2;
- p += bytes;
- dest += 2;
- result++;
+ /* Attempt to decode two consecutive surrogate pairs as UTF-16.
+ If that fails, fall back to encode those code points in 3 UTF-8 bytes. */
+ if (c >= 0xD800 && c <= 0xDBFF)
+ {
+ unsigned c2 = *s16 + 2 > srcend
+ ? 0
+ : (((unsigned char)(*s16)[0]) << 8) | (unsigned char)(*s16)[1];
+ if (c2 >= 0xDC00 && c2 <= 0xDFFF) {
+ /* Decode two UTF-16 surrogate pairs into one Unicode code point
+ (U+10000 to U+10FFFF (not U+FFFFF), 1..4 bytes in UTF-8) */
+
+ // Fix2:
+ //c = 0x10000 | (c & 0x3FF) << 10 | (c2 & 0x3FF);
+ c = 0x10000 + ((c & 0x3FF) << 10) | (c2 & 0x3FF);
+
+ *s16 += 2;
+
+ if (*d8 + 4 > dstend)
+ return 0;
+ (*d8)[3] = (char)(0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[2] = (char)(0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[1] = (char)(0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[0] = (char)(0xF0 | c);
+ *d8 += 4;
+ return 1;
+ }
}
- return result;
-}
-
-// Returns the number of utf8 bytes.
-int ucs2_to_utf8_char(char *ucs2, char *utf8)
-{
- int result;
- unsigned int c = (ucs2[0] << 8) | (unsigned char)ucs2[1];
-
if (c <= 0x7F)
{
- utf8[0] = (unsigned char)c;
- result = 1;
+ if (*d8 + 1 > dstend)
+ return 0;
+ *(*d8)++ = (char) c;
}
else if (c <= 0x7FF)
{
- utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
- c = (c >> 6);
- utf8[0] = (unsigned char)(0xC0 | c);
- result = 2;
- }
- else if (c <= 0xFFFF)
- {
- utf8[2] = (unsigned char)(0x80 | (c & 0x3F));
- c = (c >> 6);
- utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
- c = (c >> 6);
- utf8[0] = (unsigned char)(0xE0 | c);
- result = 3;
+ if (*d8 + 2 > dstend)
+ return 0;
+ (*d8)[1] = (char) (0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[0] = (char) (0xC0 | c);
+ *d8 += 2;
}
else
- result = 0;
-
- utf8[result] = '\0';
- return result;
-}
-
-// Returns number of utf8 characters, not bytes.
-int ucs2_to_utf8_buffer(char *ucs2, size_t ucs2_buffer_len, char *utf8, size_t utf8_size)
-{
- int result = 0;
- char *p = ucs2;
- char *end = ucs2 + ucs2_buffer_len;
- char utf8char[7];
- size_t len = 0;
- int i;
-
- while (p < end)
{
- if (!(i = ucs2_to_utf8_char(p, utf8char)))
- break;
-
- if (len + i >= utf8_size)
- break;
-
- strcpy(&utf8[len], utf8char);
- len += i;
- p += 2;
- result++;
+ if (*d8 + 3 > dstend)
+ return 0;
+ (*d8)[2] = (char)(0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[1] = (char)(0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[0] = (char)(0xE0 | c);
+ *d8 += 3;
}
- return result;
+ return 1;
}
-// Returns number of bytes.
+/** Transcode a Unicode string from UTF-16 into UTF-8
+@param buf in: UTF-16 string; out: UTF-8 string
+@param len length of the UTF-16 buffer, in bytes
+@param maxlen maximum length of the UTF-8 output, in bytes
+@return length of the converted UTF-8 string, in bytes */
size_t ucs2utf(char *buf, size_t len, size_t maxlen)
{
- char *ucs2 = (char *)malloc(len);
+ char *s = malloc(len);
- if (!ucs2)
+ if (!s)
return 0;
+ else
+ {
+ const char *ucs2 = s;
+ const char *ends = ucs2 + len;
+ char *utf8 = buf;
+ char *endd = utf8 + maxlen;
- memcpy(ucs2, buf, len);
- ucs2_to_utf8_buffer(ucs2, len, buf, maxlen +1);
+ memcpy(s, buf, len);
- free(ucs2);
+ while (utf16_to_utf8_char(&ucs2, ends, &utf8, endd));
- return strlen(buf);
+ if (utf8 < endd)
+ *utf8 = '\0';
+
+ free(s);
+
+ return utf8 - buf;
+ }
}
// Returns number of bytes.
size_t iso_utf8_2ucs(char *buf, size_t maxlen)
{
- size_t ucs2_size = iso_utf8chars(buf) * 2; // Not NULL terminated.
- char *ucs2;
+ size_t ucs2_size = iso_utf8chars(buf) * 4; // Not NULL terminated.
+ char *ucs2, *d16;
+ const char *s8;
size_t bytes;
- if (ucs2_size > maxlen + 1)
- ucs2_size = maxlen + 1;
-
- if (!(ucs2 = (char *)malloc(ucs2_size)))
+ if (!(ucs2 = malloc(ucs2_size)))
return 0;
- bytes = 2 * iso_utf8_to_ucs2_buffer(buf, ucs2, ucs2_size);
+ d16 = ucs2;
+ s8 = buf;
+
+ while (*s8 && iso_utf8_to_ucs2_char(&s8, &d16, ucs2 + ucs2_size));
+
+ bytes = d16 - ucs2;
+
+ if (bytes > maxlen)
+ bytes = maxlen;
+
memcpy(buf, ucs2, bytes);
free(ucs2);
diff -pur smstools3.1.21/src/charset.h smstools3-utf16/src/charset.h
--- smstools3.1.21/src/charset.h 2017-03-29 23:33:14.000000000 +0300
+++ smstools3-utf16/src/charset.h 2017-07-09 22:51:19.467070697 +0300
@@ -31,13 +31,8 @@ int iso2utf8_file(FILE *fp, char *ascii,
int decode_7bit_packed(char *text, char *dest, size_t size_dest);
int encode_7bit_packed(char *text, char *dest, size_t size_dest);
-int utf8bytes(char *s);
-int iso_utf8bytes(char *s);
-int iso_utf8chars(char *s);
-int iso_utf8_to_ucs2_char(char *utf8, int *len, char *ucs2);
-int iso_utf8_to_ucs2_buffer(char *utf8, char *ucs2, size_t ucs2_size);
-int ucs2_to_utf8_char(char *ucs2, char *utf8);
-int ucs2_to_utf8_buffer(char *ucs2, size_t ucs2_buffer_len, char *utf8, size_t utf8_size);
+int utf8bytes(const char *s);
+int iso_utf8chars(const char *s);
size_t ucs2utf(char *buf, size_t len, size_t maxlen);
size_t iso_utf8_2ucs(char *buf, size_t maxlen);
int utf8_to_iso_char(char *utf8, unsigned char *iso);
--- smstools3.1.21/src/charset.c 2017-03-31 16:22:07.000000000 +0300
+++ smstools3-utf16/src/charset.c 2017-07-09 22:51:35.203781163 +0300
@@ -521,6 +521,38 @@ int iso_utf8_2gsm(char* source, int size
return dest_count;
}
+// Returns the number of utf8 bytes.
+static int ucs2_to_utf8_char(const char *ucs2, char *utf8)
+{
+ int result;
+ unsigned c = (((unsigned char)ucs2[0]) << 8) | (unsigned char)ucs2[1];
+
+ if (c <= 0x7F)
+ {
+ utf8[0] = (unsigned char)c;
+ result = 1;
+ }
+ else if (c <= 0x7FF)
+ {
+ utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
+ c = (c >> 6);
+ utf8[0] = (unsigned char)(0xC0 | c);
+ result = 2;
+ }
+ else
+ {
+ utf8[2] = (unsigned char)(0x80 | (c & 0x3F));
+ c = (c >> 6);
+ utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
+ c = (c >> 6);
+ utf8[0] = (unsigned char)(0xE0 | c);
+ result = 3;
+ }
+
+ utf8[result] = '\0';
+ return result;
+}
+
// Outputs to the file. Return value: 0 = ok, -1 = error.
int iso2utf8_file(FILE *fp, char *ascii, int userdatalength)
{
@@ -927,7 +959,7 @@ int decode_7bit_packed(
return i;
}
-int utf8bytes0(char *s, int allow_iso)
+static int utf8bytes0(const char *s, int allow_iso)
{
int result = 1;
int i;
@@ -960,20 +992,20 @@ int utf8bytes0(char *s, int allow_iso)
return result;
}
-int utf8bytes(char *s)
+int utf8bytes(const char *s)
{
return utf8bytes0(s, 0);
}
-int iso_utf8bytes(char *s)
+static int iso_utf8bytes(const char *s)
{
return utf8bytes0(s, 1);
}
-int iso_utf8chars(char *s)
+int iso_utf8chars(const char *s)
{
int result = 0;
- char *p = s;
+ const char *p = s;
int i;
while (*p)
@@ -988,155 +1020,195 @@ int iso_utf8chars(char *s)
return result;
}
-int iso_utf8_to_ucs2_char(char *utf8, int *len, char *ucs2)
+/** Transcode a Unicode code point from UTF-8 (or ISO 8859-15) into UTF-16
+@param s8 NUL-terminated UTF-8 or ISO 8859-15 data
+@param d16 UTF-16 data (will be advanced by one character)
+@param end end the UTF-16 output buffer
+@return whether the conversion succeeded */
+static int iso_utf8_to_ucs2_char(const char **s8, char **d16, const char* end)
{
- unsigned int c = 0;
- int i;
+ unsigned int c;
- i = iso_utf8bytes(utf8);
- if (len)
- *len = i;
+ if (*d16 + 2 > end)
+ return 0;
- switch (i)
+ switch (iso_utf8bytes(*s8))
{
- case 1:
- c = (unsigned char)utf8[0];
- break;
+ case 1:
+ c = (unsigned char)*(*s8)++;
+ break;
+
+ case 2:
+ c = *(*s8)++ & 0x3F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ break;
+
+ case 3:
+ c = *(*s8)++ & 0x0F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ break;
+
+ case 4:
+ c = *(*s8)++ & 0x0F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ c <<= 6;
+ c |= *(*s8)++ & 0x3F;
+ break;
- case 2:
- c = (utf8[0] & 0x1F) << 6 | (utf8[1] & 0x3F);
- break;
-
- case 3:
- c = (utf8[0] & 0x0F) << 12 | (utf8[1] & 0x3F) << 6 | (utf8[2] & 0x3F);
- break;
+ default:
+ return 0;
+ }
- default:
+ if (c >= 0x10000) {
+ /* Use UTF-16 encoding for code points that reside outside the
+ Basic Multilingual Plane. */
+ if (*d16 + 4 > end)
return 0;
+
+ // Fix: Subtract 0x10000 from the code point:
+ //c &= 0xFFFF;
+ c -= 0x10000;
+
+ unsigned int c0 = 0xD800 | (c >> 10);
+ *(*d16)++ = (char)(c0 >> 8);
+ *(*d16)++ = (char)(c0 & 0xFF);
+ c &= 0x3FF;
+ c |= 0xDC00;
}
- ucs2[0] = (unsigned char)((c & 0xFF00) >> 8);
- ucs2[1] = (unsigned char)(c & 0xFF);
+ *(*d16)++ = (char)(c >> 8);
+ *(*d16)++ = (char)(c & 0xFF);
return 1;
}
-// Note: Returns the number of UCS2 characters, not bytes.
-int iso_utf8_to_ucs2_buffer(char *utf8, char *ucs2, size_t ucs2_size)
+/** Transcode a Unicode code point from UTF-16 into UTF-8
+@param s16 UTF-16 data (will be advanced by one character)
+@param srcend end of the UTF-16 string
+@param d8 UTF-8 data (will be advanced by one character)
+@param dstend end the UTF-8 output buffer
+@return whether the conversion succeeded */
+static int utf16_to_utf8_char(const char **s16, const char *srcend,
+ char **d8, const char *dstend)
{
- char *p = utf8;
- char *end = utf8 +strlen(utf8);
- int bytes;
- size_t dest = 0;
- int result = 0;
-
- while (p < end)
- {
- if (dest >= ucs2_size -1)
- break;
+ if (*s16 + 2 > srcend)
+ return 0;
- if (!iso_utf8_to_ucs2_char(p, &bytes, &ucs2[dest]))
- break;
+ unsigned c = (((unsigned char)(*s16)[0]) << 8) | (unsigned char)(*s16)[1];
+ *s16 += 2;
- p += bytes;
- dest += 2;
- result++;
+ /* Attempt to decode two consecutive surrogate pairs as UTF-16.
+ If that fails, fall back to encode those code points in 3 UTF-8 bytes. */
+ if (c >= 0xD800 && c <= 0xDBFF)
+ {
+ unsigned c2 = *s16 + 2 > srcend
+ ? 0
+ : (((unsigned char)(*s16)[0]) << 8) | (unsigned char)(*s16)[1];
+ if (c2 >= 0xDC00 && c2 <= 0xDFFF) {
+ /* Decode two UTF-16 surrogate pairs into one Unicode code point
+ (U+10000 to U+10FFFF (not U+FFFFF), 1..4 bytes in UTF-8) */
+
+ // Fix2:
+ //c = 0x10000 | (c & 0x3FF) << 10 | (c2 & 0x3FF);
+ c = 0x10000 + ((c & 0x3FF) << 10) | (c2 & 0x3FF);
+
+ *s16 += 2;
+
+ if (*d8 + 4 > dstend)
+ return 0;
+ (*d8)[3] = (char)(0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[2] = (char)(0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[1] = (char)(0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[0] = (char)(0xF0 | c);
+ *d8 += 4;
+ return 1;
+ }
}
- return result;
-}
-
-// Returns the number of utf8 bytes.
-int ucs2_to_utf8_char(char *ucs2, char *utf8)
-{
- int result;
- unsigned int c = (ucs2[0] << 8) | (unsigned char)ucs2[1];
-
if (c <= 0x7F)
{
- utf8[0] = (unsigned char)c;
- result = 1;
+ if (*d8 + 1 > dstend)
+ return 0;
+ *(*d8)++ = (char) c;
}
else if (c <= 0x7FF)
{
- utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
- c = (c >> 6);
- utf8[0] = (unsigned char)(0xC0 | c);
- result = 2;
- }
- else if (c <= 0xFFFF)
- {
- utf8[2] = (unsigned char)(0x80 | (c & 0x3F));
- c = (c >> 6);
- utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
- c = (c >> 6);
- utf8[0] = (unsigned char)(0xE0 | c);
- result = 3;
+ if (*d8 + 2 > dstend)
+ return 0;
+ (*d8)[1] = (char) (0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[0] = (char) (0xC0 | c);
+ *d8 += 2;
}
else
- result = 0;
-
- utf8[result] = '\0';
- return result;
-}
-
-// Returns number of utf8 characters, not bytes.
-int ucs2_to_utf8_buffer(char *ucs2, size_t ucs2_buffer_len, char *utf8, size_t utf8_size)
-{
- int result = 0;
- char *p = ucs2;
- char *end = ucs2 + ucs2_buffer_len;
- char utf8char[7];
- size_t len = 0;
- int i;
-
- while (p < end)
{
- if (!(i = ucs2_to_utf8_char(p, utf8char)))
- break;
-
- if (len + i >= utf8_size)
- break;
-
- strcpy(&utf8[len], utf8char);
- len += i;
- p += 2;
- result++;
+ if (*d8 + 3 > dstend)
+ return 0;
+ (*d8)[2] = (char)(0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[1] = (char)(0x80 | (c & 0x3F));
+ c >>= 6;
+ (*d8)[0] = (char)(0xE0 | c);
+ *d8 += 3;
}
- return result;
+ return 1;
}
-// Returns number of bytes.
+/** Transcode a Unicode string from UTF-16 into UTF-8
+@param buf in: UTF-16 string; out: UTF-8 string
+@param len length of the UTF-16 buffer, in bytes
+@param maxlen maximum length of the UTF-8 output, in bytes
+@return length of the converted UTF-8 string, in bytes */
size_t ucs2utf(char *buf, size_t len, size_t maxlen)
{
- char *ucs2 = (char *)malloc(len);
+ char *s = malloc(len);
- if (!ucs2)
+ if (!s)
return 0;
+ else
+ {
+ const char *ucs2 = s;
+ const char *ends = ucs2 + len;
+ char *utf8 = buf;
+ char *endd = utf8 + maxlen;
- memcpy(ucs2, buf, len);
- ucs2_to_utf8_buffer(ucs2, len, buf, maxlen +1);
+ memcpy(s, buf, len);
- free(ucs2);
+ while (utf16_to_utf8_char(&ucs2, ends, &utf8, endd));
- return strlen(buf);
+ if (utf8 < endd)
+ *utf8 = '\0';
+
+ free(s);
+
+ return utf8 - buf;
+ }
}
// Returns number of bytes.
size_t iso_utf8_2ucs(char *buf, size_t maxlen)
{
- size_t ucs2_size = iso_utf8chars(buf) * 2; // Not NULL terminated.
- char *ucs2;
+ size_t ucs2_size = iso_utf8chars(buf) * 4; // Not NULL terminated.
+ char *ucs2, *d16;
+ const char *s8;
size_t bytes;
- if (ucs2_size > maxlen + 1)
- ucs2_size = maxlen + 1;
-
- if (!(ucs2 = (char *)malloc(ucs2_size)))
+ if (!(ucs2 = malloc(ucs2_size)))
return 0;
- bytes = 2 * iso_utf8_to_ucs2_buffer(buf, ucs2, ucs2_size);
+ d16 = ucs2;
+ s8 = buf;
+
+ while (*s8 && iso_utf8_to_ucs2_char(&s8, &d16, ucs2 + ucs2_size));
+
+ bytes = d16 - ucs2;
+
+ if (bytes > maxlen)
+ bytes = maxlen;
+
memcpy(buf, ucs2, bytes);
free(ucs2);
diff -pur smstools3.1.21/src/charset.h smstools3-utf16/src/charset.h
--- smstools3.1.21/src/charset.h 2017-03-29 23:33:14.000000000 +0300
+++ smstools3-utf16/src/charset.h 2017-07-09 22:51:19.467070697 +0300
@@ -31,13 +31,8 @@ int iso2utf8_file(FILE *fp, char *ascii,
int decode_7bit_packed(char *text, char *dest, size_t size_dest);
int encode_7bit_packed(char *text, char *dest, size_t size_dest);
-int utf8bytes(char *s);
-int iso_utf8bytes(char *s);
-int iso_utf8chars(char *s);
-int iso_utf8_to_ucs2_char(char *utf8, int *len, char *ucs2);
-int iso_utf8_to_ucs2_buffer(char *utf8, char *ucs2, size_t ucs2_size);
-int ucs2_to_utf8_char(char *ucs2, char *utf8);
-int ucs2_to_utf8_buffer(char *ucs2, size_t ucs2_buffer_len, char *utf8, size_t utf8_size);
+int utf8bytes(const char *s);
+int iso_utf8chars(const char *s);
size_t ucs2utf(char *buf, size_t len, size_t maxlen);
size_t iso_utf8_2ucs(char *buf, size_t maxlen);
int utf8_to_iso_char(char *utf8, unsigned char *iso);
The above patch includes some cleanup as well, mainly the addition of static linkage specifiers and const qualifiers.
For testing the patch, I used the following patch:
diff -pur smstools3.1.21/src/Makefile smstools3-utf16/src/Makefile
--- smstools3.1.21/src/Makefile 2017-05-04 00:05:45.000000000 +0300
+++ smstools3-utf16/src/Makefile 2017-07-09 19:53:23.723804454 +0300
@@ -49,10 +49,12 @@ CFLAGS += -D_FILE_OFFSET_BITS=64
# Use the following only on GNU/Linux and only if you need ps listing like "smsd: MAINPROCESS" and "smsd: GSM1"
# CFLAGS += -D USE_LINUX_PS_TRICK
-all: smsd
+all: smsd smsd-test
smsd: smsd.c extras.o locking.o cfgfile.o logging.o alarm.o smsd_cfg.o charset.o stats.o blacklist.o whitelist.o modeminit.o pdu.o charshift.o
+smsd-test: test_utf16.o charset.o pdu.o charshift.o
+
ifneq (,$(findstring SOLARIS,$(CFLAGS)))
ifeq (,$(findstring DISABLE_INET_SOCKET,$(CFLAGS)))
override LFLAGS += -lsocket -lnsl
--- smstools3.1.21/src/Makefile 2017-05-04 00:05:45.000000000 +0300
+++ smstools3-utf16/src/Makefile 2017-07-09 19:53:23.723804454 +0300
@@ -49,10 +49,12 @@ CFLAGS += -D_FILE_OFFSET_BITS=64
# Use the following only on GNU/Linux and only if you need ps listing like "smsd: MAINPROCESS" and "smsd: GSM1"
# CFLAGS += -D USE_LINUX_PS_TRICK
-all: smsd
+all: smsd smsd-test
smsd: smsd.c extras.o locking.o cfgfile.o logging.o alarm.o smsd_cfg.o charset.o stats.o blacklist.o whitelist.o modeminit.o pdu.o charshift.o
+smsd-test: test_utf16.o charset.o pdu.o charshift.o
+
ifneq (,$(findstring SOLARIS,$(CFLAGS)))
ifeq (,$(findstring DISABLE_INET_SOCKET,$(CFLAGS)))
override LFLAGS += -lsocket -lnsl
'diff' Syntax Highlight powered by GeSHi
and the following test program that I wrote first so that I could reproduce the problem without having any hardware attached:
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "smsd_cfg.h"
#include "pdu.h"
#include "charset.h"
static char text[2048];
/*
 * Formats the arguments into the file-scope buffer `text` and returns it.
 * (Presumably a stand-in for the smsd function of the same name, so that
 * the objects linked into the test program resolve -- confirm.)
 *
 * A NULL format leaves the buffer as it was.  Output that does not fit is
 * truncated by vsnprintf().  The same static storage is returned on every
 * call, so each call overwrites the previous result.
 */
char* tb_sprintf(char* format, ...)
{
    if (format != NULL)
    {
        va_list args;

        va_start(args, format);
        vsnprintf(text, sizeof text, format, args);
        va_end(args);
    }

    return text;
}
/*
 * Records a log message in the buffer managed by tb_sprintf().
 *
 * The variadic arguments cannot be forwarded to another variadic function,
 * so they are ignored and the format text is stored verbatim.  It is passed
 * through "%s" rather than as the format itself: handing a string with
 * conversion specifiers to tb_sprintf() without matching arguments would
 * make vsnprintf() read arguments that do not exist (undefined behaviour).
 *
 * A NULL format is a no-op.
 */
void logch(char* format, ...)
{
    if (format != NULL)
        tb_sprintf("%s", format);
}
/*
 * Writes a log line to stderr: the severity and trouble values as a
 * "severity:trouble:" prefix, followed by the printf-style formatted message.
 *
 * Nothing at all is written when format is NULL.  No newline is appended.
 */
void writelogfile(int severity, int trouble, char* format, ...)
{
    va_list args;

    if (format == NULL)
        return;

    fprintf(stderr, "%d:%d:", severity, trouble);

    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
}
/*
 * Logs a ready-made message through writelogfile().
 *
 * The message is passed as an argument to "%s", so any '%' it contains is
 * printed literally instead of being interpreted as a conversion.
 * Note: the parameter `text` shadows the file-scope buffer of the same name.
 */
void writelogfile0(int severity, int trouble, char *text)
{
    writelogfile(severity, trouble, "%s", text);
}
/*
 * Stub date/time formatter: produces an empty string and reports success.
 *
 * dest      - output buffer; set to "" when it can hold at least one byte
 * dest_size - size of dest in bytes
 * a_date, a_time, a_format - accepted for interface compatibility, unused
 *
 * Returns 0 in every case.  A NULL dest or a zero dest_size is tolerated and
 * leaves memory untouched.
 */
int make_datetime_string(char *dest, size_t dest_size, char *a_date, char *a_time, char *a_format)
{
    /* Explicitly mark the inputs as unused instead of folding them into a
       dummy expression. */
    (void)a_date;
    (void)a_time;
    (void)a_format;

    if (dest != NULL && dest_size > 0)
        *dest = '\0';

    return 0;
}
/*
 * Maps a character to something printable: bytes below the space character
 * (compared as unsigned, so bytes with the high bit set pass through) are
 * replaced with '.', everything else is returned unchanged.
 */
char prch(char ch)
{
    const unsigned char code = (unsigned char)ch;

    return (code < ' ') ? '.' : ch;
}
/*
 * Appends str followed by delimiter to the heap string *buffer, growing the
 * allocation as needed.
 *
 * buffer    - address of a malloc'ed, NUL-terminated string, or of a NULL
 *             pointer to start a new string; updated to the (possibly moved)
 *             allocation.  The caller owns and must free() the result.
 * str       - text to append; NULL is treated as an empty string
 * delimiter - text appended after str; NULL means no delimiter
 *
 * On allocation failure *buffer is NULL afterwards, as callers may test for.
 * Any previous allocation is released rather than leaked in that case.
 */
void strcat_realloc(char **buffer, char *str, char *delimiter)
{
    const char *text_part = str ? str : "";
    const char *delim_part = delimiter ? delimiter : "";
    size_t text_len;
    size_t delim_len;
    size_t old_len;
    char *grown;

    if (buffer == NULL)
        return;

    text_len = strlen(text_part);
    delim_len = strlen(delim_part);
    old_len = *buffer ? strlen(*buffer) : 0;

    /* realloc(NULL, n) behaves like malloc(n), so one call covers both the
       "new string" and the "grow existing string" cases.  The result goes
       into a temporary so the old block is not lost when realloc fails. */
    grown = realloc(*buffer, old_len + text_len + delim_len + 1);
    if (grown == NULL)
    {
        free(*buffer);
        *buffer = NULL;
        return;
    }

    memcpy(grown + old_len, text_part, text_len);
    /* + 1 copies the delimiter's terminating NUL as well. */
    memcpy(grown + old_len + text_len, delim_part, delim_len + 1);
    *buffer = grown;
}
/*
 * Copies the NUL-terminated string src to dest, one byte at a time from the
 * front, and returns dest.
 *
 * Because the bytes are moved strictly in ascending order, the copy also
 * works when src points further into the same buffer as dest (shifting a
 * string towards the start of its buffer).
 */
char *strcpyo(char *dest, const char *src)
{
    char *out = dest;

    while (*src != '\0')
        *out++ = *src++;
    *out = '\0';

    return dest;
}
/*
 * Test driver: decodes the PDU given as the only command-line argument,
 * converts its user data from UCS2/UTF-16 to UTF-8, then converts the result
 * back to UCS2/UTF-16 and to UTF-8 again, printing the lengths and the text
 * of each step to stderr.
 *
 * Returns 1 after printing a usage message when the argument count is wrong,
 * otherwise 0.
 */
int main (int argc, char** argv)
{
    if (argc != 2) {
        fprintf(stderr, "Usage: %s PDU_string_in_hexadecimal\n", *argv);
        return 1;
    }

    /* Initialised so that the values printed below are defined even if
       splitpdu() leaves some of them untouched. */
    int alphabet = 0, with_udh = 0, is_statusreport = 0, is_unsupported_pdu = 0;
    int report = 0, replace = 0;
    int flash = 0;
    char sendr[100], date[9], time[9], ascii[MAXTEXT], smsc[31], from_toa[51];
    char udh_data[SIZE_UDH_DATA], udh_type[SIZE_UDH_TYPE];
    char warning_headers[SIZE_WARNING_HEADERS];

    ascii[0] = '\0';

    /* NOTE(review): the result is stored in a size_t; confirm that splitpdu()
       cannot return a negative value here. */
    size_t udlen = splitpdu(argv[1], "new", &alphabet, sendr, date, time, ascii, smsc,
                            &with_udh, udh_data, udh_type,
                            &is_statusreport, &is_unsupported_pdu, from_toa,
                            &report, &replace, warning_headers, &flash, 0);

    /* The patched ucs2utf() writes a terminator only when the output stops
       short of maxlen, so one byte is held back and the string is terminated
       here before it is printed with %s. */
    size_t len = ucs2utf(ascii, udlen, sizeof ascii - 1);
    ascii[len] = '\0';
    fprintf(stderr, "%d:%zu:%zu:%s\n", alphabet, udlen, len, ascii);

    /* Round trip: UTF-8 -> UTF-16 -> UTF-8 must reproduce the same text. */
    size_t ucslen = iso_utf8_2ucs(ascii, sizeof ascii);
    len = ucs2utf(ascii, ucslen, sizeof ascii - 1);
    ascii[len] = '\0';
    fprintf(stderr, "%zu:%zu:%s\n", ucslen, len, ascii);

    return 0;
}
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "smsd_cfg.h"
#include "pdu.h"
#include "charset.h"
static char text[2048];
/*
 * Formats the arguments into the file-scope buffer `text` and returns it.
 * (Presumably a stand-in for the smsd function of the same name, so that
 * the objects linked into the test program resolve -- confirm.)
 *
 * A NULL format leaves the buffer as it was.  Output that does not fit is
 * truncated by vsnprintf().  The same static storage is returned on every
 * call, so each call overwrites the previous result.
 */
char* tb_sprintf(char* format, ...)
{
    if (format != NULL)
    {
        va_list args;

        va_start(args, format);
        vsnprintf(text, sizeof text, format, args);
        va_end(args);
    }

    return text;
}
/*
 * Records a log message in the buffer managed by tb_sprintf().
 *
 * The variadic arguments cannot be forwarded to another variadic function,
 * so they are ignored and the format text is stored verbatim.  It is passed
 * through "%s" rather than as the format itself: handing a string with
 * conversion specifiers to tb_sprintf() without matching arguments would
 * make vsnprintf() read arguments that do not exist (undefined behaviour).
 *
 * A NULL format is a no-op.
 */
void logch(char* format, ...)
{
    if (format != NULL)
        tb_sprintf("%s", format);
}
/*
 * Writes a log line to stderr: the severity and trouble values as a
 * "severity:trouble:" prefix, followed by the printf-style formatted message.
 *
 * Nothing at all is written when format is NULL.  No newline is appended.
 */
void writelogfile(int severity, int trouble, char* format, ...)
{
    va_list args;

    if (format == NULL)
        return;

    fprintf(stderr, "%d:%d:", severity, trouble);

    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
}
/*
 * Logs a ready-made message through writelogfile().
 *
 * The message is passed as an argument to "%s", so any '%' it contains is
 * printed literally instead of being interpreted as a conversion.
 * Note: the parameter `text` shadows the file-scope buffer of the same name.
 */
void writelogfile0(int severity, int trouble, char *text)
{
    writelogfile(severity, trouble, "%s", text);
}
/*
 * Stub date/time formatter: produces an empty string and reports success.
 *
 * dest      - output buffer; set to "" when it can hold at least one byte
 * dest_size - size of dest in bytes
 * a_date, a_time, a_format - accepted for interface compatibility, unused
 *
 * Returns 0 in every case.  A NULL dest or a zero dest_size is tolerated and
 * leaves memory untouched.
 */
int make_datetime_string(char *dest, size_t dest_size, char *a_date, char *a_time, char *a_format)
{
    /* Explicitly mark the inputs as unused instead of folding them into a
       dummy expression. */
    (void)a_date;
    (void)a_time;
    (void)a_format;

    if (dest != NULL && dest_size > 0)
        *dest = '\0';

    return 0;
}
/*
 * Maps a character to something printable: bytes below the space character
 * (compared as unsigned, so bytes with the high bit set pass through) are
 * replaced with '.', everything else is returned unchanged.
 */
char prch(char ch)
{
    const unsigned char code = (unsigned char)ch;

    return (code < ' ') ? '.' : ch;
}
/*
 * Appends str followed by delimiter to the heap string *buffer, growing the
 * allocation as needed.
 *
 * buffer    - address of a malloc'ed, NUL-terminated string, or of a NULL
 *             pointer to start a new string; updated to the (possibly moved)
 *             allocation.  The caller owns and must free() the result.
 * str       - text to append; NULL is treated as an empty string
 * delimiter - text appended after str; NULL means no delimiter
 *
 * On allocation failure *buffer is NULL afterwards, as callers may test for.
 * Any previous allocation is released rather than leaked in that case.
 */
void strcat_realloc(char **buffer, char *str, char *delimiter)
{
    const char *text_part = str ? str : "";
    const char *delim_part = delimiter ? delimiter : "";
    size_t text_len;
    size_t delim_len;
    size_t old_len;
    char *grown;

    if (buffer == NULL)
        return;

    text_len = strlen(text_part);
    delim_len = strlen(delim_part);
    old_len = *buffer ? strlen(*buffer) : 0;

    /* realloc(NULL, n) behaves like malloc(n), so one call covers both the
       "new string" and the "grow existing string" cases.  The result goes
       into a temporary so the old block is not lost when realloc fails. */
    grown = realloc(*buffer, old_len + text_len + delim_len + 1);
    if (grown == NULL)
    {
        free(*buffer);
        *buffer = NULL;
        return;
    }

    memcpy(grown + old_len, text_part, text_len);
    /* + 1 copies the delimiter's terminating NUL as well. */
    memcpy(grown + old_len + text_len, delim_part, delim_len + 1);
    *buffer = grown;
}
/*
 * Copies the NUL-terminated string src to dest, one byte at a time from the
 * front, and returns dest.
 *
 * Because the bytes are moved strictly in ascending order, the copy also
 * works when src points further into the same buffer as dest (shifting a
 * string towards the start of its buffer).
 */
char *strcpyo(char *dest, const char *src)
{
    char *out = dest;

    while (*src != '\0')
        *out++ = *src++;
    *out = '\0';

    return dest;
}
/*
 * Test driver: decodes the PDU given as the only command-line argument,
 * converts its user data from UCS2/UTF-16 to UTF-8, then converts the result
 * back to UCS2/UTF-16 and to UTF-8 again, printing the lengths and the text
 * of each step to stderr.
 *
 * Returns 1 after printing a usage message when the argument count is wrong,
 * otherwise 0.
 */
int main (int argc, char** argv)
{
    if (argc != 2) {
        fprintf(stderr, "Usage: %s PDU_string_in_hexadecimal\n", *argv);
        return 1;
    }

    /* Initialised so that the values printed below are defined even if
       splitpdu() leaves some of them untouched. */
    int alphabet = 0, with_udh = 0, is_statusreport = 0, is_unsupported_pdu = 0;
    int report = 0, replace = 0;
    int flash = 0;
    char sendr[100], date[9], time[9], ascii[MAXTEXT], smsc[31], from_toa[51];
    char udh_data[SIZE_UDH_DATA], udh_type[SIZE_UDH_TYPE];
    char warning_headers[SIZE_WARNING_HEADERS];

    ascii[0] = '\0';

    /* NOTE(review): the result is stored in a size_t; confirm that splitpdu()
       cannot return a negative value here. */
    size_t udlen = splitpdu(argv[1], "new", &alphabet, sendr, date, time, ascii, smsc,
                            &with_udh, udh_data, udh_type,
                            &is_statusreport, &is_unsupported_pdu, from_toa,
                            &report, &replace, warning_headers, &flash, 0);

    /* The patched ucs2utf() writes a terminator only when the output stops
       short of maxlen, so one byte is held back and the string is terminated
       here before it is printed with %s. */
    size_t len = ucs2utf(ascii, udlen, sizeof ascii - 1);
    ascii[len] = '\0';
    fprintf(stderr, "%d:%zu:%zu:%s\n", alphabet, udlen, len, ascii);

    /* Round trip: UTF-8 -> UTF-16 -> UTF-8 must reproduce the same text. */
    size_t ucslen = iso_utf8_2ucs(ascii, sizeof ascii);
    len = ucs2utf(ascii, ucslen, sizeof ascii - 1);
    ascii[len] = '\0';
    fprintf(stderr, "%zu:%zu:%s\n", ucslen, len, ascii);

    return 0;
}
The test string that I used was from the "PDU" header of a received message. With that message, the output of the program would be as follows:
Without the patch applied, the string would be truncated at the first emoji (U+1F600), leaving the string "Koe ja toinen " (with a trailing space).
I would appreciate it if you could include this patch in the next smstools release.
« Last edit by keke on Sun Dec 01, 2019 21:07, 60 months ago. »