SMSTools3 Bug reports

#1 Sun Jul 09, 2017 20:17, 109 months ago.

Member

Registered:
Jul 2017

Location: Vantaa, Finland

First of all, thank you for this great piece of software.
As part of setting up this system for a friend, I wanted to test how Unicode data is handled.

It turns out there are 2 bugs when receiving messages, and 1 when sending. Both will cause data to be truncated at the first encountered invalid character.

For receiving, any code point at U+8000 or above will cause truncation, if the char data type is signed and sizeof(unsigned int) > 2, which I believe should hold on most systems. The reason is that sign extension will take place here:

int ucs2_to_utf8_char(char *ucs2, char *utf8)
{
  int result;
  unsigned int c = (ucs2[0] << 8) | (unsigned char)ucs2[1];
 

[ Select ]

'c' Syntax Highlight powered by GeSHi

Because of the sign extension, we will have c>0xffff if sizeof(c) > 2, and this function will return 0, causing the string to be truncated.

Another problem for both receiving and sending is that UTF-16 data is not supported. UTF-16 is basically an extension of UCS2, allowing the code points U+10000 to U+10FFFF to be represented with two 16-bit code units, called a surrogate pair. The first (high) surrogate is in the range 0xd800..0xdbff, and the second (low) one is in the range 0xdc00..0xdfff.

I fixed these problems. I registered on this forum only to report this bug and to submit my patch. I would have preferred to use email for this, but I did not find any address. So, I am posting my patch below (see more discussion after it):

diff -pur smstools3.1.21/src/charset.c smstools3-utf16/src/charset.c
--- smstools3.1.21/src/charset.c        2017-03-31 16:22:07.000000000 +0300
+++ smstools3-utf16/src/charset.c       2017-07-09 22:51:35.203781163 +0300
@@ -521,6 +521,38 @@ int iso_utf8_2gsm(char* source, int size
   return dest_count;
 }
 
+// Returns the number of utf8 bytes.
+static int ucs2_to_utf8_char(const char *ucs2, char *utf8)
+{
+  int result;
+  unsigned c = (((unsigned char)ucs2[0]) << 8) | (unsigned char)ucs2[1];
+
+  if (c <= 0x7F)
+  {
+    utf8[0] = (unsigned char)c;
+    result = 1;
+  }
+  else if (c <= 0x7FF)
+  {
+    utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
+    c = (c >> 6);
+    utf8[0] = (unsigned char)(0xC0 | c);
+    result = 2;
+  }
+  else
+  {
+    utf8[2] = (unsigned char)(0x80 | (c & 0x3F));
+    c = (c >> 6);
+    utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
+    c = (c >> 6);
+    utf8[0] = (unsigned char)(0xE0 | c);
+    result = 3;
+  }
+
+  utf8[result] = '\0';
+  return result;
+}
+
 // Outputs to the file. Return value: 0 = ok, -1 = error.
 int iso2utf8_file(FILE *fp, char *ascii, int userdatalength)
 {
@@ -927,7 +959,7 @@ int decode_7bit_packed(
        return i;
 }
 
-int utf8bytes0(char *s, int allow_iso)
+static int utf8bytes0(const char *s, int allow_iso)
 {
   int result = 1;
   int i;
@@ -960,20 +992,20 @@ int utf8bytes0(char *s, int allow_iso)
   return result;
 }
 
-int utf8bytes(char *s)
+int utf8bytes(const char *s)
 {
   return utf8bytes0(s, 0);
 }
 
-int iso_utf8bytes(char *s)
+static int iso_utf8bytes(const char *s)
 {
   return utf8bytes0(s, 1);
 }
 
-int iso_utf8chars(char *s)
+int iso_utf8chars(const char *s)
 {
   int result = 0;
-  char *p = s;
+  const char *p = s;
   int i;
 
   while (*p)
@@ -988,155 +1020,195 @@ int iso_utf8chars(char *s)
   return result;
 }
 
-int iso_utf8_to_ucs2_char(char *utf8, int *len, char *ucs2)
+/** Transcode a Unicode code point from UTF-8 (or ISO 8859-15) into UTF-16
+@param s8      NUL-terminated UTF-8 or ISO 8859-15 data
+@param d16     UTF-16 data (will be advanced by one character)
+@param end     end the UTF-16 output buffer
+@return whether the conversion succeeded */
+static int iso_utf8_to_ucs2_char(const char **s8, char **d16, const char* end)
 {
-  unsigned int c = 0;
-  int i;
+  unsigned int c;
 
-  i = iso_utf8bytes(utf8);
-  if (len)
-    *len = i;
+  if (*d16 + 2 > end)
+    return 0;
 
-  switch (i)
+  switch (iso_utf8bytes(*s8))
   {
-    case 1:
-      c = (unsigned char)utf8[0];
-      break;
+  case 1:
+    c = (unsigned char)*(*s8)++;
+    break;
+
+  case 2:
+    c = *(*s8)++ & 0x3F;
+    c <<= 6;
+    c |= *(*s8)++ & 0x3F;
+    break;
+
+  case 3:
+    c = *(*s8)++ & 0x0F;
+    c <<= 6;
+    c |= *(*s8)++ & 0x3F;
+    c <<= 6;
+    c |= *(*s8)++ & 0x3F;
+    break;
+
+  case 4:
+    c = *(*s8)++ & 0x0F;
+    c <<= 6;
+    c |= *(*s8)++ & 0x3F;
+    c <<= 6;
+    c |= *(*s8)++ & 0x3F;
+    c <<= 6;
+    c |= *(*s8)++ & 0x3F;
+    break;
 
-    case 2:
-      c = (utf8[0] & 0x1F) << 6 | (utf8[1] & 0x3F);
-      break;
-
-    case 3:
-      c = (utf8[0] & 0x0F) << 12 | (utf8[1] & 0x3F) << 6 | (utf8[2] & 0x3F);
-      break;
+  default:
+    return 0;
+  }
 
-    default:
+  if (c >= 0x10000) {
+    /* Use UTF-16 encoding for code points that reside outside the
+    Basic Multilingual Plane. */
+    if (*d16 + 4 > end)
       return 0;
+
+    // Fix: Subtract 0x10000 from the code point:
+    //c &= 0xFFFF;
+    c -= 0x10000;
+
+    unsigned int c0 = 0xD800 | (c >> 10);
+    *(*d16)++ = (char)(c0 >> 8);
+    *(*d16)++ = (char)(c0 & 0xFF);
+    c &= 0x3FF;
+    c |= 0xDC00;
   }
 
-  ucs2[0] = (unsigned char)((c & 0xFF00) >> 8);
-  ucs2[1] = (unsigned char)(c & 0xFF);
+  *(*d16)++ = (char)(c >> 8);
+  *(*d16)++ = (char)(c & 0xFF);
 
   return 1;
 }
 
-// Note: Returns the number of UCS2 characters, not bytes.
-int iso_utf8_to_ucs2_buffer(char *utf8, char *ucs2, size_t ucs2_size)
+/** Transcode a Unicode code point from UTF-16 into UTF-8
+@param s16     UTF-16 data (will be advanced by one character)
+@param srcend  end of the UTF-16 string
+@param d8      UTF-8 data (will be advanced by one character)
+@param dstend  end the UTF-8 output buffer
+@return whether the conversion succeeded */
+static int utf16_to_utf8_char(const char **s16, const char *srcend,
+                             char **d8, const char *dstend)
 {
-  char *p = utf8;
-  char *end = utf8 +strlen(utf8);
-  int bytes;
-  size_t dest = 0;
-  int result = 0;
-
-  while (p < end)
-  {
-    if (dest >= ucs2_size -1)
-      break;
+  if (*s16 + 2 > srcend)
+    return 0;
 
-    if (!iso_utf8_to_ucs2_char(p, &bytes, &ucs2[dest]))
-      break;
+  unsigned c = (((unsigned char)(*s16)[0]) << 8) | (unsigned char)(*s16)[1];
+  *s16 += 2;
 
-    p += bytes;
-    dest += 2;
-    result++;
+  /* Attempt to decode two consecutive surrogate pairs as UTF-16.
+  If that fails, fall back to encode those code points in 3 UTF-8 bytes. */
+  if (c >= 0xD800 && c <= 0xDBFF)
+  {
+    unsigned c2 = *s16 + 2 > srcend
+      ? 0
+      : (((unsigned char)(*s16)[0]) << 8) | (unsigned char)(*s16)[1];
+    if (c2 >= 0xDC00 && c2 <= 0xDFFF) {
+      /* Decode two UTF-16 surrogate pairs into one Unicode code point
+      (U+10000 to U+10FFFF (not U+FFFFF), 1..4 bytes in UTF-8) */
+
+      // Fix2:
+      //c = 0x10000 | (c & 0x3FF) << 10 | (c2 & 0x3FF);
+      c = 0x10000 + ((c & 0x3FF) << 10) | (c2 & 0x3FF);
+
+      *s16 += 2;
+
+      if (*d8 + 4 > dstend)
+        return 0;
+      (*d8)[3] = (char)(0x80 | (c & 0x3F));
+      c >>= 6;
+      (*d8)[2] = (char)(0x80 | (c & 0x3F));
+      c >>= 6;
+      (*d8)[1] = (char)(0x80 | (c & 0x3F));
+      c >>= 6;
+      (*d8)[0] = (char)(0xF0 | c);
+      *d8 += 4;
+      return 1;
+    }
   }
 
-  return result;
-}
-
-// Returns the number of utf8 bytes.
-int ucs2_to_utf8_char(char *ucs2, char *utf8)
-{
-  int result;
-  unsigned int c = (ucs2[0] << 8) | (unsigned char)ucs2[1];
-
   if (c <= 0x7F)
   {
-    utf8[0] = (unsigned char)c;
-    result = 1;
+    if (*d8 + 1 > dstend)
+      return 0;
+    *(*d8)++ = (char) c;
   }
   else if (c <= 0x7FF)
   {
-    utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
-    c = (c >> 6);
-    utf8[0] = (unsigned char)(0xC0 | c);
-    result = 2;
-  }
-  else if (c <= 0xFFFF)
-  {
-    utf8[2] = (unsigned char)(0x80 | (c & 0x3F));
-    c = (c >> 6);
-    utf8[1] = (unsigned char)(0x80 | (c & 0x3F));
-    c = (c >> 6);
-    utf8[0] = (unsigned char)(0xE0 | c);
-    result = 3;
+    if (*d8 + 2 > dstend)
+      return 0;
+    (*d8)[1] = (char) (0x80 | (c & 0x3F));
+    c >>= 6;
+    (*d8)[0] = (char) (0xC0 | c);
+    *d8 += 2;
   }
   else
-    result = 0;
-
-  utf8[result] = '\0';
-  return result;
-}
-
-// Returns number of utf8 characters, not bytes.
-int ucs2_to_utf8_buffer(char *ucs2, size_t ucs2_buffer_len, char *utf8, size_t utf8_size)
-{
-  int result = 0;
-  char *p = ucs2;
-  char *end = ucs2 + ucs2_buffer_len;
-  char utf8char[7];
-  size_t len = 0;
-  int i;
-
-  while (p < end)
   {
-    if (!(i = ucs2_to_utf8_char(p, utf8char)))
-      break;
-
-    if (len + i >= utf8_size)
-      break;
-
-    strcpy(&utf8[len], utf8char);
-    len += i;
-    p += 2;
-    result++;
+    if (*d8 + 3 > dstend)
+      return 0;
+    (*d8)[2] = (char)(0x80 | (c & 0x3F));
+    c >>= 6;
+    (*d8)[1] = (char)(0x80 | (c & 0x3F));
+    c >>= 6;
+    (*d8)[0] = (char)(0xE0 | c);
+    *d8 += 3;
   }
 
-  return result;
+  return 1;
 }
 
-// Returns number of bytes.
+/** Transcode a Unicode string from UTF-16 into UTF-8
+@param buf     in: UTF-16 string; out: UTF-8 string
+@param len     length of the UTF-16 buffer, in bytes
+@param maxlen  maximum length of the UTF-8 output, in bytes
+@return length of the converted UTF-8 string, in bytes */
 size_t ucs2utf(char *buf, size_t len, size_t maxlen)
 {
-  char *ucs2 = (char *)malloc(len);
+  char *s = malloc(len);
 
-  if (!ucs2)
+  if (!s)
     return 0;
+  else
+  {
+    const char *ucs2 = s;
+    const char *ends = ucs2 + len;
+    char *utf8 = buf;
+    char *endd = utf8 + maxlen;
 
-  memcpy(ucs2, buf, len);
-  ucs2_to_utf8_buffer(ucs2, len, buf, maxlen +1);
+    memcpy(s, buf, len);
 
-  free(ucs2);
+    while (utf16_to_utf8_char(&ucs2, ends, &utf8, endd));
 
-  return strlen(buf);
+    if (utf8 < endd)
+      *utf8 = '\0';
+
+    free(s);
+
+    return utf8 - buf;
+  }
 }
 
 // Returns number of bytes.
 size_t iso_utf8_2ucs(char *buf, size_t maxlen)
 {
-  size_t ucs2_size = iso_utf8chars(buf) * 2; // Not NULL terminated.
-  char *ucs2;
+  size_t ucs2_size = iso_utf8chars(buf) * 4; // Not NULL terminated.
+  char *ucs2, *d16;
+  const char *s8;
   size_t bytes;
 
-  if (ucs2_size > maxlen + 1)
-    ucs2_size = maxlen + 1;
-
-  if (!(ucs2 = (char *)malloc(ucs2_size)))
+  if (!(ucs2 = malloc(ucs2_size)))
     return 0;
 
-  bytes = 2 * iso_utf8_to_ucs2_buffer(buf, ucs2, ucs2_size);
+  d16 = ucs2;
+  s8 = buf;
+
+  while (*s8 && iso_utf8_to_ucs2_char(&s8, &d16, ucs2 + ucs2_size));
+
+  bytes = d16 - ucs2;
+
+  if (bytes > maxlen)
+    bytes = maxlen;
+
   memcpy(buf, ucs2, bytes);
 
   free(ucs2);
diff -pur smstools3.1.21/src/charset.h smstools3-utf16/src/charset.h
--- smstools3.1.21/src/charset.h        2017-03-29 23:33:14.000000000 +0300
+++ smstools3-utf16/src/charset.h       2017-07-09 22:51:19.467070697 +0300
@@ -31,13 +31,8 @@ int iso2utf8_file(FILE *fp, char *ascii,
 int decode_7bit_packed(char *text, char *dest, size_t size_dest);
 int encode_7bit_packed(char *text, char *dest, size_t size_dest);
 
-int utf8bytes(char *s);
-int iso_utf8bytes(char *s);
-int iso_utf8chars(char *s);
-int iso_utf8_to_ucs2_char(char *utf8, int *len, char *ucs2);
-int iso_utf8_to_ucs2_buffer(char *utf8, char *ucs2, size_t ucs2_size);
-int ucs2_to_utf8_char(char *ucs2, char *utf8);
-int ucs2_to_utf8_buffer(char *ucs2, size_t ucs2_buffer_len, char *utf8, size_t utf8_size);
+int utf8bytes(const char *s);
+int iso_utf8chars(const char *s);
 size_t ucs2utf(char *buf, size_t len, size_t maxlen);
 size_t iso_utf8_2ucs(char *buf, size_t maxlen);
 int utf8_to_iso_char(char *utf8, unsigned char *iso);
 

[ Select ]

[ Expand ]

'diff' Syntax Highlight powered by GeSHi

The above patch includes some cleanup as well, mainly the addition of static linkage specifiers and const qualifiers.
For testing the patch, I used the following patch:

diff -pur smstools3.1.21/src/Makefile smstools3-utf16/src/Makefile
--- smstools3.1.21/src/Makefile 2017-05-04 00:05:45.000000000 +0300
+++ smstools3-utf16/src/Makefile        2017-07-09 19:53:23.723804454 +0300
@@ -49,10 +49,12 @@ CFLAGS += -D_FILE_OFFSET_BITS=64
 # Use the following only on GNU/Linux and only if you need ps listing like "smsd: MAINPROCESS" and "smsd: GSM1"
 # CFLAGS += -D USE_LINUX_PS_TRICK
 
-all: smsd 
+all: smsd smsd-test
 
 smsd: smsd.c extras.o locking.o cfgfile.o logging.o alarm.o smsd_cfg.o charset.o stats.o blacklist.o whitelist.o modeminit.o pdu.o charshift.o
 
+smsd-test: test_utf16.o charset.o pdu.o charshift.o
+
 ifneq (,$(findstring SOLARIS,$(CFLAGS)))
 ifeq (,$(findstring DISABLE_INET_SOCKET,$(CFLAGS)))
        override LFLAGS += -lsocket -lnsl
 

[ Select ]

'diff' Syntax Highlight powered by GeSHi

and the following test program that I wrote first so that I could reproduce the problem without having any hardware attached:

#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "smsd_cfg.h"
#include "pdu.h"
#include "charset.h"

/* Shared buffer holding the most recently formatted message. */
static char text[2048];

/*
 * Formats the arguments printf-style into the shared buffer.
 *
 * format: printf-style format string; when NULL the buffer is left
 *         untouched.
 * Returns the shared buffer in every case; its content is truncated to
 * fit and is overwritten by the next call with a non-NULL format.
 */
char* tb_sprintf(char* format, ...)
{
  if (!format)
    return text;

  va_list args;

  va_start(args, format);
  vsnprintf(text, sizeof(text), format, args);
  va_end(args);

  return text;
}

/*
 * Test-program stub for logch(): the message is deliberately discarded.
 *
 * The arguments are intentionally not formatted. Handing only `format`
 * (without the variadic arguments) to a printf-style formatter would make
 * every conversion specifier fetch an argument that was never passed,
 * which is undefined behaviour, and would also overwrite the buffer that
 * tb_sprintf() hands out to its callers.
 */
void logch(char* format, ...)
{
  (void)format;
}

/*
 * Writes a log message to stderr as "<severity>:<trouble>:" followed by
 * the printf-style formatted text. No newline is appended.
 * Nothing at all is written when format is NULL.
 */
void writelogfile(int severity, int trouble, char* format, ...)
{
  va_list args;

  if (!format)
    return;

  fprintf(stderr, "%d:%d:", severity, trouble);

  va_start(args, format);
  vfprintf(stderr, format, args);
  va_end(args);
}

/*
 * Writes an already formatted message to stderr as
 * "<severity>:<trouble>:<text>". The text is printed verbatim through
 * "%s", so any '%' characters it contains are not interpreted.
 */
void writelogfile0(int severity, int trouble, char *text)
{
  fprintf(stderr, "%d:%d:%s", severity, trouble, text);
}

/*
 * Test-program stub: produces an empty string in dest and returns 0.
 * dest must point to at least one writable byte; the remaining
 * parameters are ignored.
 */
int make_datetime_string(char *dest, size_t dest_size, char *a_date, char *a_time, char *a_format)
{
  /* Reference the ignored inputs so they do not trigger
     unused-parameter warnings. */
  (void)dest_size;
  (void)a_date;
  (void)a_time;
  (void)a_format;

  *dest = '\0';
  return 0;
}

/*
 * Maps a byte to something printable: bytes below the space character
 * (compared as unsigned char) become '.', everything else is returned
 * unchanged.
 */
char prch(char ch)
{
  return ((unsigned char)ch < ' ') ? '.' : ch;
}

/*
 * Appends str, followed by the optional delimiter, to the heap string
 * *buffer, growing the allocation as needed.
 *
 * buffer:    address of the string pointer; *buffer may be NULL, in which
 *            case a new string is allocated.
 * str:       NUL-terminated text to append (must not be NULL).
 * delimiter: NUL-terminated text appended after str, or NULL for none.
 *
 * On allocation failure *buffer is NULL afterwards. Any previous
 * allocation is released first, so the old block is not leaked when the
 * pointer is overwritten.
 */
void strcat_realloc(char **buffer, char *str, char *delimiter)
{
  const char *tail = delimiter ? delimiter : "";
  size_t str_len = strlen(str);
  size_t tail_len = strlen(tail);
  size_t old_len = *buffer ? strlen(*buffer) : 0;

  /* realloc(NULL, n) behaves like malloc(n), so one call covers both the
     first allocation and every later growth. The result goes to a
     temporary so that the old block stays reachable on failure. */
  char *grown = realloc(*buffer, old_len + str_len + tail_len + 1);

  if (!grown)
  {
    free(*buffer);
    *buffer = NULL;
    return;
  }

  memcpy(grown + old_len, str, str_len);
  /* tail_len + 1 copies the terminating NUL as well. */
  memcpy(grown + old_len + str_len, tail, tail_len + 1);
  *buffer = grown;
}

/*
 * Copies the NUL-terminated string src to dest one byte at a time, from
 * the first byte to the last, and returns dest. Because the copy runs
 * strictly front to back, it also gives the expected result when dest
 * points into the same buffer at a lower address than src.
 * (The "o" suffix presumably stands for "overlapping".)
 */
char *strcpyo(char *dest, const char *src)
{
  char *out = dest;

  while (*src != '\0')
    *out++ = *src++;

  *out = '\0';

  return dest;
}

/*
 * Test driver: decodes the PDU given as a hexadecimal string in argv[1],
 * converts its user data from UCS2/UTF-16 to UTF-8, and then round-trips
 * that text through iso_utf8_2ucs() and ucs2utf() again. Both results are
 * printed to stderr together with the lengths involved.
 *
 * Returns 1 when the argument count is wrong, 0 otherwise.
 */
int main (int argc, char** argv)
{
  if (argc != 2) {
    fprintf(stderr, "Usage: %s PDU_string_in_hexadecimal\n", *argv);
    return 1;
  }
  int alphabet, with_udh, is_statusreport, is_unsupported_pdu, report, replace;
  int flash;
  char sendr[100], date[9], time[9], ascii[MAXTEXT], smsc[31], from_toa[51];
  char udh_data[SIZE_UDH_DATA], udh_type[SIZE_UDH_TYPE];
  char warning_headers[SIZE_WARNING_HEADERS];
  /* NOTE(review): splitpdu()'s return type and error convention are not
     visible here; confirm that a failure cannot end up as a huge udlen. */
  size_t udlen = splitpdu(argv[1], "new", &alphabet, sendr, date, time, ascii, smsc,
           &with_udh, udh_data, udh_type,
           &is_statusreport, &is_unsupported_pdu, from_toa,
           &report, &replace, warning_headers, &flash, 0);
  /* ucs2utf() writes at most maxlen bytes and does not add a terminating
     NUL when the output fills the whole limit. Keep one byte in reserve
     and terminate explicitly, because the text is used as a C string by
     "%s" and by iso_utf8_2ucs() below. */
  size_t len = ucs2utf(ascii, udlen, sizeof ascii - 1);
  ascii[len] = '\0';
  fprintf(stderr, "%d:%zu:%zu:%s\n", alphabet, udlen, len, ascii);
  size_t ucslen = iso_utf8_2ucs(ascii, sizeof ascii);
  len = ucs2utf(ascii, ucslen, sizeof ascii - 1);
  ascii[len] = '\0';
  fprintf(stderr, "%zu:%zu:%s\n", ucslen, len, ascii);

  return 0;
}
 

[ Select ]

[ Expand ]

'c' Syntax Highlight powered by GeSHi

The test string that I used was from the "PDU" header of a received message. With that message, the output of the program would be as follows:

2:40:26:Koe ja toinen 😀😁😂
40:26:Koe ja toinen 😀😁😂

[ Select ]

Without the patch applied, the string would be truncated at the first emoji (U+1F600), leaving the string "Koe ja toinen " (with a terminating space).

I would appreciate it if you could include this patch in the next smstools release.

« Last edit by keke on Sun Dec 01, 2019 21:07, 80 months ago. »

marko

Top

#3 Tue Jul 11, 2017 17:48, 109 months ago.

Member

Registered:
Jul 2017

Posts: 4

Location: Vantaa, Finland

Topic owner

Thank you for the prompt response and the decision to prepare an extra release with this fix.
My friend is planning to deploy the system in about a month. It is always easier when custom patches are not needed.

keke

Top

#5 Tue Nov 19, 2019 22:17, 80 months ago.

Administrator

Registered:
May 2009

Posts: 2070

Location: Jyväskylä, Finland

Hi Tiburon, thanks for using this software and applying the patch. I have had a very long delay in publishing the next version, because the original implementation in the software is designed to handle 16-bit UCS2 characters only. In your case the character (𠜎) is \ud841\udf0e in UTF-16, and therefore gets truncated.

I _am_ going to release the next version, some day, but right now I still cannot give any estimation about the date of release.

If you are just sending those 32bit characters, you could use a header Alphabet: UCS2, and create a message body using iconv -t UTF-16BE.

If you are also receiving those characters, the workaround is more complicated. You need to disable internal decoding of Unicode (decode_unicode_text = no), and in the eventhandler you should handle the message body yourself and pass it to iconv -f UTF-16BE.

EDIT: The patch is now fixed and no workarounds are needed anymore.

« Last edit by keke on Wed Nov 27, 2019 15:13, 80 months ago. »

marko

Top

#7 Mon Nov 25, 2019 06:18, 80 months ago.

Member

Registered:
Jul 2017

Posts: 4

Location: Vantaa, Finland

Topic owner

Thank you for revising my fix. I should really have tested with a broader set of code points.

It looks like the maximum UTF-8 code point that fits in 4 bytes is U+10FFFF (2 UTF-16 surrogate pairs: U+DBFF U+DFFF). GNOME will not let me type anything larger than that.

keke

Top

#9 Tue Nov 26, 2019 17:18, 80 months ago.

Administrator

Registered:
May 2009

Posts: 2070

Location: Jyväskylä, Finland

Sorry. Another fix is required in charset.c:

  /* Attempt to decode two consecutive surrogate pairs as UTF-16.
  If that fails, fall back to encode those code points in 3 UTF-8 bytes. */
  if (c >= 0xD800 && c <= 0xDBFF)
  {
    unsigned c2 = *s16 + 2 > srcend
      ? 0
      : (((unsigned char)(*s16)[0]) << 8) | (unsigned char)(*s16)[1];
    if (c2 >= 0xDC00 && c2 <= 0xDFFF) {
      /* Decode two UTF-16 surrogate pairs into one Unicode code point
      (U+10000 to U+10FFFF (not U+FFFFF), 1..4 bytes in UTF-8) */

      // Fix2:
      //c = 0x10000 | (c & 0x3FF) << 10 | (c2 & 0x3FF);
      c = 0x10000 + ((c & 0x3FF) << 10) | (c2 & 0x3FF);

      *s16 += 2;

      if (*d8 + 4 > dstend)
        return 0;
      (*d8)[3] = (char)(0x80 | (c & 0x3F));
 

[ Select ]

'c' Syntax Highlight powered by GeSHi

« Last edit by keke on Sun Dec 01, 2019 21:04, 80 months ago. »

marko

Top

#11 Fri Nov 29, 2019 17:26, 80 months ago.

Member

Registered:
Jul 2017

Posts: 4

Location: Vantaa, Finland

Topic owner

Hi Keke,

Thanks for fixing it further. Can you please also update the comments to indicate that U+10FFFF is the maximum supported code point, not U+FFFFF?

[PATCH] 3.1.21 truncates messages with certain Unicode code points