Convert text value in SQL Server from UTF8 to ISO 8859-1

I have a column in SQL Server with utf8 SQL_Latin1_General_CP1_CI_AS encoding. How can I convert and save the text in ISO 8859-1 encoding? I would like to do thing in a query on SQL Server. Any tips?

Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito


Solution 1:

I have written a function to repair UTF-8 text that is stored in a varchar field.

To check the fixed values you can use it like this:

CREATE TABLE #Table1 (Column1 varchar(max))

INSERT #Table1
VALUES ('Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito')

SELECT *, NewColumn1 = dbo.DecodeUTF8String(Column1)
FROM Table1
WHERE Column1 <> dbo.DecodeUTF8String(Column1)

Output:

Column1
-------------------------------
Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito

NewColumn1
-------------------------------
Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito

The code:

CREATE FUNCTION dbo.DecodeUTF8String (@value varchar(max))
RETURNS nvarchar(max)
AS
BEGIN
    -- Transforms a UTF-8 encoded varchar string into Unicode
    -- By Anthony Faull 2014-07-31
    DECLARE @result nvarchar(max);

    -- If ASCII or null there's no work to do
    IF (@value IS NULL
        OR @value NOT LIKE '%[^ -~]%' COLLATE Latin1_General_BIN
    )
        RETURN @value;

    -- Generate all integers from 1 to the length of string
    WITH e0(n) AS (SELECT TOP(POWER(2,POWER(2,0))) NULL FROM (VALUES (NULL),(NULL)) e(n))
        , e1(n) AS (SELECT TOP(POWER(2,POWER(2,1))) NULL FROM e0 CROSS JOIN e0 e)
        , e2(n) AS (SELECT TOP(POWER(2,POWER(2,2))) NULL FROM e1 CROSS JOIN e1 e)
        , e3(n) AS (SELECT TOP(POWER(2,POWER(2,3))) NULL FROM e2 CROSS JOIN e2 e)
        , e4(n) AS (SELECT TOP(POWER(2,POWER(2,4))) NULL FROM e3 CROSS JOIN e3 e)
        , e5(n) AS (SELECT TOP(POWER(2.,POWER(2,5)-1)-1) NULL FROM e4 CROSS JOIN e4 e)
    , numbers(position) AS
    (
        SELECT TOP(DATALENGTH(@value)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
        FROM e5
    )
    -- UTF-8 Algorithm (http://en.wikipedia.org/wiki/UTF-8)
    -- For each octet, count the high-order one bits, and extract the data bits.
    , octets AS
    (
        SELECT position, highorderones, partialcodepoint
        FROM numbers a
        -- Split UTF8 string into rows of one octet each.
        CROSS APPLY (SELECT octet = ASCII(SUBSTRING(@value, position, 1))) b
        -- Count the number of leading one bits
        CROSS APPLY (SELECT highorderones = 8 - FLOOR(LOG( ~CONVERT(tinyint, octet) * 2 + 1)/LOG(2))) c
        CROSS APPLY (SELECT databits = 7 - highorderones) d
        CROSS APPLY (SELECT partialcodepoint = octet % POWER(2, databits)) e
    )
    -- Compute the Unicode codepoint for each sequence of 1 to 4 bytes
    , codepoints AS
    (
        SELECT position, codepoint
        FROM
        (
            -- Get the starting octect for each sequence (i.e. exclude the continuation bytes)
            SELECT position, highorderones, partialcodepoint
            FROM octets
            WHERE highorderones <> 1
        ) lead
        CROSS APPLY (SELECT sequencelength = CASE WHEN highorderones in (1,2,3,4) THEN highorderones ELSE 1 END) b
        CROSS APPLY (SELECT endposition = position + sequencelength - 1) c
        CROSS APPLY
        (
            -- Compute the codepoint of a single UTF-8 sequence
            SELECT codepoint = SUM(POWER(2, shiftleft) * partialcodepoint)
            FROM octets
            CROSS APPLY (SELECT shiftleft = 6 * (endposition - position)) b
            WHERE position BETWEEN lead.position AND endposition
        ) d
    )
    -- Concatenate the codepoints into a Unicode string
    SELECT @result = CONVERT(xml,
        (
            SELECT NCHAR(codepoint)
            FROM codepoints
            ORDER BY position
            FOR XML PATH('')
        )).value('.', 'nvarchar(max)');

    RETURN @result;
END
GO

Solution 2:

Jason Penny has also written an SQL function to convert UTF-8 to Unicode (MIT licence) which worked on a simple example for me:

CREATE FUNCTION dbo.UTF8_TO_NVARCHAR(@in VarChar(MAX))
   RETURNS NVarChar(MAX)
AS
BEGIN
   DECLARE @out NVarChar(MAX), @i int, @c int, @c2 int, @c3 int, @nc int

   SELECT @i = 1, @out = ''

   WHILE (@i <= Len(@in))
   BEGIN
      SET @c = Ascii(SubString(@in, @i, 1))

      IF (@c < 128)
      BEGIN
         SET @nc = @c
         SET @i = @i + 1
      END
      ELSE IF (@c > 191 AND @c < 224)
      BEGIN
         SET @c2 = Ascii(SubString(@in, @i + 1, 1))

         SET @nc = (((@c & 31) * 64 /* << 6 */) | (@c2 & 63))
         SET @i = @i + 2
      END
      ELSE
      BEGIN
         SET @c2 = Ascii(SubString(@in, @i + 1, 1))
         SET @c3 = Ascii(SubString(@in, @i + 2, 1))

         SET @nc = (((@c & 15) * 4096 /* << 12 */) | ((@c2 & 63) * 64 /* << 6 */) | (@c3 & 63))
         SET @i = @i + 3
      END

      SET @out = @out + NChar(@nc)
   END
   RETURN @out
END
GO

The ticked answer by Anthony "looks" better to me, but maybe run both if doing conversion and investigate any discrepencies?!

Also we used the very ugly code below to detect BMP page unicode characters that were encoded as UTF-8 and then converted from varchar to nvarchar fields, that can be converted to UCS-16.

LIKE (N'%[' + CONVERT(NVARCHAR,(CHAR(192))) + CONVERT(NVARCHAR,(CHAR(193))) + CONVERT(NVARCHAR,(CHAR(194))) + CONVERT(NVARCHAR,(CHAR(195))) + CONVERT(NVARCHAR,(CHAR(196))) + CONVERT(NVARCHAR,(CHAR(197))) + CONVERT(NVARCHAR,(CHAR(198))) + CONVERT(NVARCHAR,(CHAR(199))) + CONVERT(NVARCHAR,(CHAR(200))) + CONVERT(NVARCHAR,(CHAR(201))) + CONVERT(NVARCHAR,(CHAR(202))) + CONVERT(NVARCHAR,(CHAR(203))) + CONVERT(NVARCHAR,(CHAR(204))) + CONVERT(NVARCHAR,(CHAR(205))) + CONVERT(NVARCHAR,(CHAR(206))) + CONVERT(NVARCHAR,(CHAR(207))) + CONVERT(NVARCHAR,(CHAR(208))) + CONVERT(NVARCHAR,(CHAR(209))) + CONVERT(NVARCHAR,(CHAR(210))) + CONVERT(NVARCHAR,(CHAR(211))) + CONVERT(NVARCHAR,(CHAR(212))) + CONVERT(NVARCHAR,(CHAR(213))) + CONVERT(NVARCHAR,(CHAR(214))) + CONVERT(NVARCHAR,(CHAR(215))) + CONVERT(NVARCHAR,(CHAR(216))) + CONVERT(NVARCHAR,(CHAR(217))) + CONVERT(NVARCHAR,(CHAR(218))) + CONVERT(NVARCHAR,(CHAR(219))) + CONVERT(NVARCHAR,(CHAR(220))) + CONVERT(NVARCHAR,(CHAR(221))) + CONVERT(NVARCHAR,(CHAR(222))) + CONVERT(NVARCHAR,(CHAR(223))) + CONVERT(NVARCHAR,(CHAR(224))) + CONVERT(NVARCHAR,(CHAR(225))) + CONVERT(NVARCHAR,(CHAR(226))) + CONVERT(NVARCHAR,(CHAR(227))) + CONVERT(NVARCHAR,(CHAR(228))) + CONVERT(NVARCHAR,(CHAR(229))) + CONVERT(NVARCHAR,(CHAR(230))) + CONVERT(NVARCHAR,(CHAR(231))) + CONVERT(NVARCHAR,(CHAR(232))) + CONVERT(NVARCHAR,(CHAR(233))) + CONVERT(NVARCHAR,(CHAR(234))) + CONVERT(NVARCHAR,(CHAR(235))) + CONVERT(NVARCHAR,(CHAR(236))) + CONVERT(NVARCHAR,(CHAR(237))) + CONVERT(NVARCHAR,(CHAR(238))) + CONVERT(NVARCHAR,(CHAR(239)))
    + N'][' + CONVERT(NVARCHAR,(CHAR(128))) + CONVERT(NVARCHAR,(CHAR(129))) + CONVERT(NVARCHAR,(CHAR(130))) + CONVERT(NVARCHAR,(CHAR(131))) + CONVERT(NVARCHAR,(CHAR(132))) + CONVERT(NVARCHAR,(CHAR(133))) + CONVERT(NVARCHAR,(CHAR(134))) + CONVERT(NVARCHAR,(CHAR(135))) + CONVERT(NVARCHAR,(CHAR(136))) + CONVERT(NVARCHAR,(CHAR(137))) + CONVERT(NVARCHAR,(CHAR(138))) + CONVERT(NVARCHAR,(CHAR(139))) + CONVERT(NVARCHAR,(CHAR(140))) + CONVERT(NVARCHAR,(CHAR(141))) + CONVERT(NVARCHAR,(CHAR(142))) + CONVERT(NVARCHAR,(CHAR(143))) + CONVERT(NVARCHAR,(CHAR(144))) + CONVERT(NVARCHAR,(CHAR(145))) + CONVERT(NVARCHAR,(CHAR(146))) + CONVERT(NVARCHAR,(CHAR(147))) + CONVERT(NVARCHAR,(CHAR(148))) + CONVERT(NVARCHAR,(CHAR(149))) + CONVERT(NVARCHAR,(CHAR(150))) + CONVERT(NVARCHAR,(CHAR(151))) + CONVERT(NVARCHAR,(CHAR(152))) + CONVERT(NVARCHAR,(CHAR(153))) + CONVERT(NVARCHAR,(CHAR(154))) + CONVERT(NVARCHAR,(CHAR(155))) + CONVERT(NVARCHAR,(CHAR(156))) + CONVERT(NVARCHAR,(CHAR(157))) + CONVERT(NVARCHAR,(CHAR(158))) + CONVERT(NVARCHAR,(CHAR(159))) + CONVERT(NVARCHAR,(CHAR(160))) + CONVERT(NVARCHAR,(CHAR(161))) + CONVERT(NVARCHAR,(CHAR(162))) + CONVERT(NVARCHAR,(CHAR(163))) + CONVERT(NVARCHAR,(CHAR(164))) + CONVERT(NVARCHAR,(CHAR(165))) + CONVERT(NVARCHAR,(CHAR(166))) + CONVERT(NVARCHAR,(CHAR(167))) + CONVERT(NVARCHAR,(CHAR(168))) + CONVERT(NVARCHAR,(CHAR(169))) + CONVERT(NVARCHAR,(CHAR(170))) + CONVERT(NVARCHAR,(CHAR(171))) + CONVERT(NVARCHAR,(CHAR(172))) + CONVERT(NVARCHAR,(CHAR(173))) + CONVERT(NVARCHAR,(CHAR(174))) + CONVERT(NVARCHAR,(CHAR(175))) + CONVERT(NVARCHAR,(CHAR(176))) + CONVERT(NVARCHAR,(CHAR(177))) + CONVERT(NVARCHAR,(CHAR(178))) + CONVERT(NVARCHAR,(CHAR(179))) + CONVERT(NVARCHAR,(CHAR(180))) + CONVERT(NVARCHAR,(CHAR(181))) + CONVERT(NVARCHAR,(CHAR(182))) + CONVERT(NVARCHAR,(CHAR(183))) + CONVERT(NVARCHAR,(CHAR(184))) + CONVERT(NVARCHAR,(CHAR(185))) + CONVERT(NVARCHAR,(CHAR(186))) + CONVERT(NVARCHAR,(CHAR(187))) + CONVERT(NVARCHAR,(CHAR(188))) + CONVERT(NVARCHAR,(CHAR(189))) + CONVERT(NVARCHAR,(CHAR(190))) + CONVERT(NVARCHAR,(CHAR(191)))
    + N']%') COLLATE Latin1_General_BIN

The above:

  • detects multi-byte sequences encoding U+0080 to U+FFFF (U+0080 to U+07FF is encoded as 110xxxxx 10xxxxxx, U+0800 to U+FFFF is encoded as 1110xxxx 10xxxxxx 10xxxxxx)
  • i.e. it detects hex byte 0xC0 to 0xEF followed by hex byte 0x80 to 0xBF
  • ignores ASCII control characters U+0000 to U+001F
  • ignores characters that are already correctly encoded to unicode >= U+0100 (i.e. not UTF-8)
  • ignores unicode characters U+0080 to U+00FF if they don't appear to be part of a UTF-8 sequence e.g. "coöperatief".
  • doesn't use LIKE "%[X-Y]" for X=0x80 to Y=0xBF because of potential collation issues
  • uses CONVERT(VARCHAR,CHAR(X)) instead of NCHAR because we had problems with NCHAR getting converted to the wrong value (for some values).
  • ignores UTF characters greater than U+FFFF (4 to 6 byte sequences which have a first byte of hex 0xF0 to 0xFD)

Solution 3:

i add a little modification to use new string aggregation function string_agg, from sql server 2017 and 2019

SELECT @result=STRING_AGG(NCHAR([codepoint]),'') WITHIN GROUP (ORDER BY position ASC) 
FROM codepoints

change de @result parts to this one. The XML still work in old fashion way. in 2019, string_agg works extreme faster than xml version (obvious... string_agg now is native, and is not fair compare)

Solution 4:

I made a solution that also handles 4 byte sequences (like emojis) by combining the answer from @robocat, some more cases with the logic taken from https://github.com/benkasminbullock/unicode-c/blob/master/unicode.c, and a solution for the problem of encoding extended unicode characters from https://dba.stackexchange.com/questions/139551/how-do-i-set-a-sql-server-unicode-nvarchar-string-to-an-emoji-or-supplementary. It's not fast or pretty, but it's working for me anyway. This particular solution includes Unicode replacement characters wherever it finds unknown bytes. It may be better just to throw an exception in these cases, or leave the bytes as they were, as future encoding could be off, but I preferred this for my use case.

-- Started with https://stackoverflow.com/questions/28168055/convert-text-value-in-sql-server-from-utf8-to-iso-8859-1
-- Modified following source in https://github.com/benkasminbullock/unicode-c/blob/master/unicode.c
-- Made characters > 65535 work using https://dba.stackexchange.com/questions/139551/how-do-i-set-a-sql-server-unicode-nvarchar-string-to-an-emoji-or-supplementary
CREATE FUNCTION dbo.UTF8_TO_NVARCHAR(@in VarChar(MAX)) RETURNS NVarChar(MAX) AS
BEGIN
    DECLARE @out NVarChar(MAX), @thisOut NVARCHAR(MAX), @i int, @c int, @c2 int, @c3 int, @c4 int

    SELECT @i = 1, @out = ''

    WHILE (@i <= Len(@in)) BEGIN
        SET @c = Ascii(SubString(@in, @i, 1))

        IF @c <= 0x7F BEGIN
            SET @thisOut = NCHAR(@c)
            SET @i = @i + 1
        END
        ELSE IF @c BETWEEN 0xC2 AND 0xDF BEGIN
            SET @c2 = Ascii(SubString(@in, @i + 1, 1))
            IF @c2 < 0x80 OR @c2 > 0xBF BEGIN
                SET @thisOut = NCHAR(0xFFFD)
                SET @i = @i + 1
            END
            ELSE BEGIN
                SET @thisOut = NCHAR(((@c & 31) * 64 /* << 6 */) | (@c2 & 63))
                SET @i = @i + 2
            END
        END
        ELSE IF @c BETWEEN 0xE0 AND 0xEF BEGIN
            SET @c2 = Ascii(SubString(@in, @i + 1, 1))
            SET @c3 = Ascii(SubString(@in, @i + 2, 1))
            IF @c2 < 0x80 OR @c2 > 0xBF OR @c3 < 0x80 OR (@c = 0xE0 AND @c2 < 0xA0) BEGIN
                SET @thisOut = NCHAR(0xFFFD)
                SET @i = @i + 1
            END
            ELSE BEGIN
                SET @thisOut = NCHAR(((@c & 15) * 4096 /* << 12 */) | ((@c2 & 63) * 64 /* << 6 */) | (@c3 & 63))
                SET @i = @i + 3
            END
        END
        ELSE IF @c BETWEEN 0xF0 AND 0xF4 BEGIN
            SET @c2 = Ascii(SubString(@in, @i + 1, 1))
            SET @c3 = Ascii(SubString(@in, @i + 2, 1))
            SET @c4 = Ascii(SubString(@in, @i + 3, 1))
            IF @c2 < 0x80 OR @c2 >= 0xC0 OR @c3 < 0x80 OR @c3 >= 0xC0 OR @c4 < 0x80 OR @c4 >= 0xC0 OR (@c = 0xF0 AND @c2 < 0x90) BEGIN
                SET @thisOut = NCHAR(0xFFFD)
                SET @i = @i + 1
            END
            ELSE BEGIN
                DECLARE @nc INT = (((@c & 0x07) * 262144 /* << 18 */) | ((@c2 & 0x3F) * 4096 /* << 12 */) | ((@c3 & 0x3F) * 64) | (@c4 & 0x3F))
                DECLARE @HighSurrogateInt INT = 55232 + (@nc / 1024), @LowSurrogateInt INT = 56320 + (@nc % 1024)
                SET @thisOut = NCHAR(@HighSurrogateInt) + NCHAR(@LowSurrogateInt)
                SET @i = @i + 4
            END
        END
        ELSE BEGIN
            SET @thisOut = NCHAR(0xFFFD)
            SET @i = @i + 1
        END

        SET @out = @out + @thisOut
    END
    RETURN @out
END
GO