我在SQL Server中有一列使用 SQL_Latin1_General_CP1_CI_AS 编码的文本。如何将文本转换并保存为 ISO 8859-1 编码?我想在SQL Server上的查询中完成这个操作。有什么提示吗?
你好。我喜欢这个游戏。当我下载它时,我甚至以为我不会很喜欢。
我在SQL Server中有一列使用 SQL_Latin1_General_CP1_CI_AS 编码的文本。如何将文本转换并保存为 ISO 8859-1 编码?我想在SQL Server上的查询中完成这个操作。有什么提示吗?
你好。我喜欢这个游戏。当我下载它时,我甚至以为我不会很喜欢。
我已经编写了一个函数来修复存储在varchar
字段中的UTF-8文本。
您可以按照以下方式使用它来检查修复后的值:
CREATE TABLE #Table1 (Column1 varchar(max))
INSERT #Table1
VALUES ('Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito')
SELECT *, NewColumn1 = dbo.DecodeUTF8String(Column1)
FROM Table1
WHERE Column1 <> dbo.DecodeUTF8String(Column1)
输出:
Column1
-------------------------------
Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito
NewColumn1
-------------------------------
Olá. Gostei do jogo. Quando "baixei" até achei que não iria curtir muito
代码:
CREATE FUNCTION dbo.DecodeUTF8String (@value varchar(max))
RETURNS nvarchar(max)
AS
BEGIN
-- Transforms a UTF-8 encoded varchar string into Unicode
-- By Anthony Faull 2014-07-31
DECLARE @result nvarchar(max);
-- If ASCII or null there's no work to do
IF (@value IS NULL
OR @value NOT LIKE '%[^ -~]%' COLLATE Latin1_General_BIN
)
RETURN @value;
-- Generate all integers from 1 to the length of string
WITH e0(n) AS (SELECT TOP(POWER(2,POWER(2,0))) NULL FROM (VALUES (NULL),(NULL)) e(n))
, e1(n) AS (SELECT TOP(POWER(2,POWER(2,1))) NULL FROM e0 CROSS JOIN e0 e)
, e2(n) AS (SELECT TOP(POWER(2,POWER(2,2))) NULL FROM e1 CROSS JOIN e1 e)
, e3(n) AS (SELECT TOP(POWER(2,POWER(2,3))) NULL FROM e2 CROSS JOIN e2 e)
, e4(n) AS (SELECT TOP(POWER(2,POWER(2,4))) NULL FROM e3 CROSS JOIN e3 e)
, e5(n) AS (SELECT TOP(POWER(2.,POWER(2,5)-1)-1) NULL FROM e4 CROSS JOIN e4 e)
, numbers(position) AS
(
SELECT TOP(DATALENGTH(@value)) ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
FROM e5
)
-- UTF-8 Algorithm (http://en.wikipedia.org/wiki/UTF-8)
-- For each octet, count the high-order one bits, and extract the data bits.
, octets AS
(
SELECT position, highorderones, partialcodepoint
FROM numbers a
-- Split UTF8 string into rows of one octet each.
CROSS APPLY (SELECT octet = ASCII(SUBSTRING(@value, position, 1))) b
-- Count the number of leading one bits
CROSS APPLY (SELECT highorderones = 8 - FLOOR(LOG( ~CONVERT(tinyint, octet) * 2 + 1)/LOG(2))) c
CROSS APPLY (SELECT databits = 7 - highorderones) d
CROSS APPLY (SELECT partialcodepoint = octet % POWER(2, databits)) e
)
-- Compute the Unicode codepoint for each sequence of 1 to 4 bytes
, codepoints AS
(
SELECT position, codepoint
FROM
(
-- Get the starting octect for each sequence (i.e. exclude the continuation bytes)
SELECT position, highorderones, partialcodepoint
FROM octets
WHERE highorderones <> 1
) lead
CROSS APPLY (SELECT sequencelength = CASE WHEN highorderones in (1,2,3,4) THEN highorderones ELSE 1 END) b
CROSS APPLY (SELECT endposition = position + sequencelength - 1) c
CROSS APPLY
(
-- Compute the codepoint of a single UTF-8 sequence
SELECT codepoint = SUM(POWER(2, shiftleft) * partialcodepoint)
FROM octets
CROSS APPLY (SELECT shiftleft = 6 * (endposition - position)) b
WHERE position BETWEEN lead.position AND endposition
) d
)
-- Concatenate the codepoints into a Unicode string
SELECT @result = CONVERT(xml,
(
SELECT NCHAR(codepoint)
FROM codepoints
ORDER BY position
FOR XML PATH('')
)).value('.', 'nvarchar(max)');
RETURN @result;
END
GO
SELECT dbo.DecodeUTF8String(N'abcÄ ƒÃ¢Ã®È™È›ðŸ˜€âœ"?')
,应该产生 abcăâîșț✔?
,但它只输出 abcăâîșț✔?
。 - Razvan SocolJason Penny也编写了一个将UTF-8转换为Unicode的SQL函数(MIT许可证),在我进行简单示例测试后,该函数有效:
CREATE FUNCTION dbo.UTF8_TO_NVARCHAR(@in VarChar(MAX))
RETURNS NVarChar(MAX)
AS
BEGIN
DECLARE @out NVarChar(MAX), @i int, @c int, @c2 int, @c3 int, @nc int
SELECT @i = 1, @out = ''
WHILE (@i <= Len(@in))
BEGIN
SET @c = Ascii(SubString(@in, @i, 1))
IF (@c < 128)
BEGIN
SET @nc = @c
SET @i = @i + 1
END
ELSE IF (@c > 191 AND @c < 224)
BEGIN
SET @c2 = Ascii(SubString(@in, @i + 1, 1))
SET @nc = (((@c & 31) * 64 /* << 6 */) | (@c2 & 63))
SET @i = @i + 2
END
ELSE
BEGIN
SET @c2 = Ascii(SubString(@in, @i + 1, 1))
SET @c3 = Ascii(SubString(@in, @i + 2, 1))
SET @nc = (((@c & 15) * 4096 /* << 12 */) | ((@c2 & 63) * 64 /* << 6 */) | (@c3 & 63))
SET @i = @i + 3
END
SET @out = @out + NChar(@nc)
END
RETURN @out
END
GO
LIKE (N'%[' + CONVERT(NVARCHAR,(CHAR(192))) + CONVERT(NVARCHAR,(CHAR(193))) + CONVERT(NVARCHAR,(CHAR(194))) + CONVERT(NVARCHAR,(CHAR(195))) + CONVERT(NVARCHAR,(CHAR(196))) + CONVERT(NVARCHAR,(CHAR(197))) + CONVERT(NVARCHAR,(CHAR(198))) + CONVERT(NVARCHAR,(CHAR(199))) + CONVERT(NVARCHAR,(CHAR(200))) + CONVERT(NVARCHAR,(CHAR(201))) + CONVERT(NVARCHAR,(CHAR(202))) + CONVERT(NVARCHAR,(CHAR(203))) + CONVERT(NVARCHAR,(CHAR(204))) + CONVERT(NVARCHAR,(CHAR(205))) + CONVERT(NVARCHAR,(CHAR(206))) + CONVERT(NVARCHAR,(CHAR(207))) + CONVERT(NVARCHAR,(CHAR(208))) + CONVERT(NVARCHAR,(CHAR(209))) + CONVERT(NVARCHAR,(CHAR(210))) + CONVERT(NVARCHAR,(CHAR(211))) + CONVERT(NVARCHAR,(CHAR(212))) + CONVERT(NVARCHAR,(CHAR(213))) + CONVERT(NVARCHAR,(CHAR(214))) + CONVERT(NVARCHAR,(CHAR(215))) + CONVERT(NVARCHAR,(CHAR(216))) + CONVERT(NVARCHAR,(CHAR(217))) + CONVERT(NVARCHAR,(CHAR(218))) + CONVERT(NVARCHAR,(CHAR(219))) + CONVERT(NVARCHAR,(CHAR(220))) + CONVERT(NVARCHAR,(CHAR(221))) + CONVERT(NVARCHAR,(CHAR(222))) + CONVERT(NVARCHAR,(CHAR(223))) + CONVERT(NVARCHAR,(CHAR(224))) + CONVERT(NVARCHAR,(CHAR(225))) + CONVERT(NVARCHAR,(CHAR(226))) + CONVERT(NVARCHAR,(CHAR(227))) + CONVERT(NVARCHAR,(CHAR(228))) + CONVERT(NVARCHAR,(CHAR(229))) + CONVERT(NVARCHAR,(CHAR(230))) + CONVERT(NVARCHAR,(CHAR(231))) + CONVERT(NVARCHAR,(CHAR(232))) + CONVERT(NVARCHAR,(CHAR(233))) + CONVERT(NVARCHAR,(CHAR(234))) + CONVERT(NVARCHAR,(CHAR(235))) + CONVERT(NVARCHAR,(CHAR(236))) + CONVERT(NVARCHAR,(CHAR(237))) + CONVERT(NVARCHAR,(CHAR(238))) + CONVERT(NVARCHAR,(CHAR(239)))
+ N'][' + CONVERT(NVARCHAR,(CHAR(128))) + CONVERT(NVARCHAR,(CHAR(129))) + CONVERT(NVARCHAR,(CHAR(130))) + CONVERT(NVARCHAR,(CHAR(131))) + CONVERT(NVARCHAR,(CHAR(132))) + CONVERT(NVARCHAR,(CHAR(133))) + CONVERT(NVARCHAR,(CHAR(134))) + CONVERT(NVARCHAR,(CHAR(135))) + CONVERT(NVARCHAR,(CHAR(136))) + CONVERT(NVARCHAR,(CHAR(137))) + CONVERT(NVARCHAR,(CHAR(138))) + CONVERT(NVARCHAR,(CHAR(139))) + CONVERT(NVARCHAR,(CHAR(140))) + CONVERT(NVARCHAR,(CHAR(141))) + CONVERT(NVARCHAR,(CHAR(142))) + CONVERT(NVARCHAR,(CHAR(143))) + CONVERT(NVARCHAR,(CHAR(144))) + CONVERT(NVARCHAR,(CHAR(145))) + CONVERT(NVARCHAR,(CHAR(146))) + CONVERT(NVARCHAR,(CHAR(147))) + CONVERT(NVARCHAR,(CHAR(148))) + CONVERT(NVARCHAR,(CHAR(149))) + CONVERT(NVARCHAR,(CHAR(150))) + CONVERT(NVARCHAR,(CHAR(151))) + CONVERT(NVARCHAR,(CHAR(152))) + CONVERT(NVARCHAR,(CHAR(153))) + CONVERT(NVARCHAR,(CHAR(154))) + CONVERT(NVARCHAR,(CHAR(155))) + CONVERT(NVARCHAR,(CHAR(156))) + CONVERT(NVARCHAR,(CHAR(157))) + CONVERT(NVARCHAR,(CHAR(158))) + CONVERT(NVARCHAR,(CHAR(159))) + CONVERT(NVARCHAR,(CHAR(160))) + CONVERT(NVARCHAR,(CHAR(161))) + CONVERT(NVARCHAR,(CHAR(162))) + CONVERT(NVARCHAR,(CHAR(163))) + CONVERT(NVARCHAR,(CHAR(164))) + CONVERT(NVARCHAR,(CHAR(165))) + CONVERT(NVARCHAR,(CHAR(166))) + CONVERT(NVARCHAR,(CHAR(167))) + CONVERT(NVARCHAR,(CHAR(168))) + CONVERT(NVARCHAR,(CHAR(169))) + CONVERT(NVARCHAR,(CHAR(170))) + CONVERT(NVARCHAR,(CHAR(171))) + CONVERT(NVARCHAR,(CHAR(172))) + CONVERT(NVARCHAR,(CHAR(173))) + CONVERT(NVARCHAR,(CHAR(174))) + CONVERT(NVARCHAR,(CHAR(175))) + CONVERT(NVARCHAR,(CHAR(176))) + CONVERT(NVARCHAR,(CHAR(177))) + CONVERT(NVARCHAR,(CHAR(178))) + CONVERT(NVARCHAR,(CHAR(179))) + CONVERT(NVARCHAR,(CHAR(180))) + CONVERT(NVARCHAR,(CHAR(181))) + CONVERT(NVARCHAR,(CHAR(182))) + CONVERT(NVARCHAR,(CHAR(183))) + CONVERT(NVARCHAR,(CHAR(184))) + CONVERT(NVARCHAR,(CHAR(185))) + CONVERT(NVARCHAR,(CHAR(186))) + CONVERT(NVARCHAR,(CHAR(187))) + CONVERT(NVARCHAR,(CHAR(188))) + CONVERT(NVARCHAR,(CHAR(189))) + CONVERT(NVARCHAR,(CHAR(190))) + CONVERT(NVARCHAR,(CHAR(191)))
+ N']%') COLLATE Latin1_General_BIN
-- Started with https://dev59.com/cV4c5IYBdhLWcg3wOoEH
-- Modified following source in https://github.com/benkasminbullock/unicode-c/blob/master/unicode.c
-- Made characters > 65535 work using https://dba.stackexchange.com/questions/139551/how-do-i-set-a-sql-server-unicode-nvarchar-string-to-an-emoji-or-supplementary
CREATE FUNCTION dbo.UTF8_TO_NVARCHAR(@in VarChar(MAX)) RETURNS NVarChar(MAX) AS
BEGIN
DECLARE @out NVarChar(MAX), @thisOut NVARCHAR(MAX), @i int, @c int, @c2 int, @c3 int, @c4 int
SELECT @i = 1, @out = ''
WHILE (@i <= Len(@in)) BEGIN
SET @c = Ascii(SubString(@in, @i, 1))
IF @c <= 0x7F BEGIN
SET @thisOut = NCHAR(@c)
SET @i = @i + 1
END
ELSE IF @c BETWEEN 0xC2 AND 0xDF BEGIN
SET @c2 = Ascii(SubString(@in, @i + 1, 1))
IF @c2 < 0x80 OR @c2 > 0xBF BEGIN
SET @thisOut = NCHAR(0xFFFD)
SET @i = @i + 1
END
ELSE BEGIN
SET @thisOut = NCHAR(((@c & 31) * 64 /* << 6 */) | (@c2 & 63))
SET @i = @i + 2
END
END
ELSE IF @c BETWEEN 0xE0 AND 0xEF BEGIN
SET @c2 = Ascii(SubString(@in, @i + 1, 1))
SET @c3 = Ascii(SubString(@in, @i + 2, 1))
IF @c2 < 0x80 OR @c2 > 0xBF OR @c3 < 0x80 OR (@c = 0xE0 AND @c2 < 0xA0) BEGIN
SET @thisOut = NCHAR(0xFFFD)
SET @i = @i + 1
END
ELSE BEGIN
SET @thisOut = NCHAR(((@c & 15) * 4096 /* << 12 */) | ((@c2 & 63) * 64 /* << 6 */) | (@c3 & 63))
SET @i = @i + 3
END
END
ELSE IF @c BETWEEN 0xF0 AND 0xF4 BEGIN
SET @c2 = Ascii(SubString(@in, @i + 1, 1))
SET @c3 = Ascii(SubString(@in, @i + 2, 1))
SET @c4 = Ascii(SubString(@in, @i + 3, 1))
IF @c2 < 0x80 OR @c2 >= 0xC0 OR @c3 < 0x80 OR @c3 >= 0xC0 OR @c4 < 0x80 OR @c4 >= 0xC0 OR (@c = 0xF0 AND @c2 < 0x90) BEGIN
SET @thisOut = NCHAR(0xFFFD)
SET @i = @i + 1
END
ELSE BEGIN
DECLARE @nc INT = (((@c & 0x07) * 262144 /* << 18 */) | ((@c2 & 0x3F) * 4096 /* << 12 */) | ((@c3 & 0x3F) * 64) | (@c4 & 0x3F))
DECLARE @HighSurrogateInt INT = 55232 + (@nc / 1024), @LowSurrogateInt INT = 56320 + (@nc % 1024)
SET @thisOut = NCHAR(@HighSurrogateInt) + NCHAR(@LowSurrogateInt)
SET @i = @i + 4
END
END
ELSE BEGIN
SET @thisOut = NCHAR(0xFFFD)
SET @i = @i + 1
END
SET @out = @out + @thisOut
END
RETURN @out
END
GO
CREATE OR ALTER FUNCTION [dbo].[fnUTF8Decode](@UTF8 VARCHAR(4001)) RETURNS TABLE AS RETURN
/* Converts a UTF-8 encoded VARCHAR to NVARCHAR (UCS-2). Based on UTF-8 documentation on Wikipedia and the
code/discussion at https://dev59.com/cV4c5IYBdhLWcg3wOoEH#31064459.
One can quickly detect strings that need conversion using the following expression:
<FIELD> LIKE CONCAT('%[', CHAR(192), '-', CHAR(255), ']%') COLLATE Latin1_General_BIN.
Be aware, however, that this may return true for strings that this function has already converted to UCS-2.
See robocat's answer on the above referenced Stack Overflow thread for a slower but more robust expression.
Notes/Limitations
1) Written as a inline table-valued function for optimized performance.
2) Only tested on a database with SQL_Latin1_General_CP1_CI_AS collation. More specifically, this was
not designed to output Supplementary Characters and converts all such UTF-8 sequences to �.
3) Empty input strings, '', and strings with nothing but invalid UTF-8 chars are returned as NULL.
4) Assumes input is UTF-8 compliant. For example, extended ASCII characters such as en dash CHAR(150)
are not allowed unless part of a multi-byte sequence and will be skipped otherwise. In other words:
SELECT * FROM dbo.fnUTF8Decode(CHAR(150)) -> NULL
5) Input should be limited to 4000 characters to ensure that output will fit in NVARCHAR(4000), which is
what STRING_AGG outputs when fed a sequence of NVARCHAR(1) characters generated by NCHAR. However,
T-SQL silently truncates overlong parameters so we've declared our input as VARCHAR(4001) to allow
STRING_AGG to generate an error on overlong input. If we didn't do this, callers would never be
notified about truncation.
6) If we need to process more than 4000 chars in the future, we'll need to change input to VARCHAR(MAX) and
CAST the CASE WHEN expression to NVARCHAR(MAX) to force STRING_AGG to output NVARCHAR(MAX). Note that
this change will significantly degrade performance, which is why we didn't do it in the first place.
7) Due to use of STRING_AGG, this is only compatible with SQL 2017. It will probably work fine on 2019
but that version has native UTF-8 support so you're probably better off using that. For earlier versions,
replace STRING_AGG with a CLR equivalent (ms-sql-server-group-concat-sqlclr) or FOR XML PATH(''), TYPE...
*/
SELECT STRING_AGG (
CASE
WHEN A1 & 0xF0 = 0xF0 THEN --Four byte sequences (like emoji) can't be represented in UCS-2
NCHAR(0xFFFD) --Output U+FFFD (Replacement Character) instead
WHEN A1 & 0xE0 = 0xE0 THEN --Three byte sequence; get/combine relevant bits from A1-A3
NCHAR((A1 & 0x0F) * 4096 | (A2 & 0x3F) * 64 | (A3 & 0x3F))
WHEN A1 & 0xC0 = 0xC0 THEN --Two byte sequence; get/combine relevant bits from A1-A2
NCHAR((A1 & 0x3F) * 64 | (A2 & 0x3F))
ELSE NCHAR(A1) --Regular ASCII character; output as is
END
, '') UCS2
FROM dbo.fnNumbers(ISNULL(DATALENGTH(@UTF8), 0))
CROSS APPLY (SELECT ASCII(SUBSTRING(@UTF8, I, 1)) A1, ASCII(SUBSTRING(@UTF8, I + 1, 1)) A2, ASCII(SUBSTRING(@UTF8, I + 2, 1)) A3) A
WHERE A1 <= 127 OR A1 >= 192 --Output only ASCII chars and one char for each multi-byte sequence
GO
CREATE OR ALTER FUNCTION [dbo].[fnNumbers](@MaxNumber BIGINT) RETURNS TABLE AS RETURN
/* Generates a table of numbers up to the specified @MaxNumber, limited to 4,294,967,296. Useful for special case
situations and algorithms. Copied from https://www.itprotoday.com/sql-server/virtual-auxiliary-table-numbers
with minor formatting and name changes.
*/
WITH L0 AS (
SELECT 1 I UNION ALL SELECT 1 --Generates 2 rows
), L1 AS (
SELECT 1 I FROM L0 CROSS JOIN L0 L -- 4 rows
), L2 AS (
SELECT 1 I FROM L1 CROSS JOIN L1 L -- 16 rows
), L3 AS (
SELECT 1 I FROM L2 CROSS JOIN L2 L -- 256 rows
), L4 AS (
SELECT 1 I FROM L3 CROSS JOIN L3 L -- 65,536 rows
), L5 AS (
SELECT 1 I FROM L4 CROSS JOIN L4 L -- 4,294,967,296 rows
), Numbers AS (
SELECT ROW_NUMBER() OVER(ORDER BY (SELECT NULL)) I FROM L5
)
SELECT TOP (@MaxNumber) I FROM Numbers ORDER BY I
GO
SELECT @result=STRING_AGG(NCHAR([codepoint]),'') WITHIN GROUP (ORDER BY position ASC)
FROM codepoints
我刚刚通过创建一个新的字段为varchar(255),并将新字段设置为旧字段(nvarchar(255))来成功地实现了这一点。这产生了国际地名的“美式”版本。
更新WorldCities 设置admin_correct = admin_name
varchar(255) nvarchar(255)
我找到了需要执行的查询,只是还没有找到编码方式。
ALTER TABLE dbo.MyTable ALTER COLUMN CharCol
varchar(10)COLLATE Latin1_General_CI_AS NOT NULL;