
11.08.2011, 23:52
|
 |
.
|
|
Регистрация: 18.05.2011
Адрес: Омск
Сообщения: 3,970
Версия Delphi: 3,5,7,10,12,XE2
Репутация: выкл
|
|
Я пользовался этим:
Код:
unit convert;
interface
const
UTF16BE = $FFFE;
UTF16LE = $FEFF;
function DetectUTF16LEBOM(const P: PChar; const Size: Integer): Boolean;
function DetectUTF16BEBOM(const P: PChar; const Size: Integer): Boolean;
function UTF16ToUTF8(const S : WideString; Count : Integer; var utf : Boolean; Swap : Boolean = False) : AnsiString;
implementation
//----------------------------------------------------------
type
UCS2 = Word;
UCS4 = Cardinal;
const
MaximumUCS4 : UCS4 = $7FFFFFFF;
ReplacementCharacter : UCS4 = $0000FFFD;
function DetectUTF16LEBOM(const P: PChar; const Size: Integer): Boolean;
begin
Result := Assigned(P) and (Size >= Sizeof(WideChar)) and (PWideChar(P)^ = WideChar(UTF16LE));
end;
function DetectUTF16BEBOM(const P: PChar; const Size: Integer): Boolean;
begin
Result := Assigned(P) and (Size >= Sizeof(WideChar)) and (PWideChar(P)^ = WideChar(UTF16BE));
end;
function SwapUTF16Endian(const P : Cardinal) : Cardinal;
begin
Result := ((Ord(P) and $FF) shl 8) or (Ord(P) shr 8);
end;
function ConvertSurrogate(S1, S2 : UCS2) : UCS4;
// Converts a pair of high and low surrogate into the corresponding UCS4 character.
const
SurrogateOffset = ($D800 shl 10) + $DC00 - $10000;
begin
Result := Word(S1) shl 10 + Word(S2) - SurrogateOffset;
end;
function UTF16ToUTF8(const S : WideString; Count : Integer; var utf : Boolean; Swap : Boolean = False) : AnsiString;
// Converts the given Unicode text (which may contain surrogates) into
// the UTF-8 encoding used for the HTML clipboard format.
const
FirstByteMark: array[0..6] of Byte = ($00, $00, $C0, $E0, $F0, $F8, $FC);
var
Ch : UCS4;
I, J, T : Integer;
BytesToWrite : Cardinal;
begin
Result := '';
utf := True;
if (Count = 0) then
Exit;
begin
// Make room for the result. Assume worst case, there are only short texts to convert.
SetLength(Result, 6 * Count);
T := 1;
I := 1;
BytesToWrite := 0;
while I <= Count do
begin
Ch := UCS4(S[i]);
if Swap then // for UTF16BE
Ch := SwapUTF16Endian(Ch);
if (Hi(Ch) = 0) and (Lo(Ch) >= $C0) then // for wild (mad) record with codepage cp1251
begin
BytesToWrite := 1;
Result[T] := AnsiChar(Lo(Ch));
inc(T, BytesToWrite);
inc(I);
utf := False;
Continue;
end
else
begin
if (Ch and $FFFFF800) = $D800 then // Is the character a surrogate?
begin
inc(I);
// Check the following char whether it forms a valid surrogate pair with the first character.
if (I <= Count) and ((UCS4(S[i]) and $FFFFFC00) = $DC00) then
Ch := ConvertSurrogate(UCS2(Ch), UCS2(S[i]))
else // Skip invalid surrogate value.
Continue;
end;
if Ch < $80 then
BytesToWrite := 1
else if Ch < $800 then
BytesToWrite := 2
else if Ch < $10000 then
BytesToWrite := 3
else if Ch < $200000 then
BytesToWrite := 4
else if Ch < $4000000 then
BytesToWrite := 5
else if Ch <= MaximumUCS4 then
BytesToWrite := 6
else
begin
BytesToWrite := 2;
Ch := ReplacementCharacter;
end;
end;
for J := BytesToWrite downto 2 do
begin
Result[T + J - 1] := AnsiChar((Ch or $80) and $BF);
Ch := Ch shr 6;
end;
Result[T] := AnsiChar(Ch or FirstByteMark[BytesToWrite]);
inc(T, BytesToWrite);
inc(I);
end;
SetLength(Result, T - 1); // set to actual length
end;
end;
end.
__________________
Je venus de nulle part
55.026263 с.ш., 73.397636 в.д.
|