Показать сообщение отдельно
  #5  
Старый 11.08.2011, 23:52
Аватар для angvelem
angvelem angvelem вне форума
.
 
Регистрация: 18.05.2011
Адрес: Омск
Сообщения: 3,970
Версия Delphi: 3,5,7,10,12,XE2
Репутация: выкл
По умолчанию

Я пользовался этим:
Код:
unit convert;

interface

const
  UTF16BE = $FFFE;
  UTF16LE = $FEFF;

function DetectUTF16LEBOM(const P: PChar; const Size: Integer): Boolean;
function DetectUTF16BEBOM(const P: PChar; const Size: Integer): Boolean;
function UTF16ToUTF8(const S : WideString; Count : Integer; var utf : Boolean; Swap : Boolean = False) : AnsiString;

implementation

//----------------------------------------------------------
  
type
  UCS2 = Word;
  UCS4 = Cardinal;

const
  MaximumUCS4		: UCS4 = $7FFFFFFF;
  ReplacementCharacter	: UCS4 = $0000FFFD;

function DetectUTF16LEBOM(const P: PChar; const Size: Integer): Boolean;
begin
  Result := Assigned(P) and (Size >= Sizeof(WideChar)) and (PWideChar(P)^ = WideChar(UTF16LE));
end;

function DetectUTF16BEBOM(const P: PChar; const Size: Integer): Boolean;
begin
  Result := Assigned(P) and (Size >= Sizeof(WideChar)) and (PWideChar(P)^ = WideChar(UTF16BE));
end;

function SwapUTF16Endian(const P : Cardinal) : Cardinal;
begin
  Result := ((Ord(P) and $FF) shl 8) or (Ord(P) shr 8);
end;

function ConvertSurrogate(S1, S2 : UCS2) : UCS4;
// Converts a pair of high and low surrogate into the corresponding UCS4 character.
const
  SurrogateOffset = ($D800 shl 10) + $DC00 - $10000;
begin
  Result := Word(S1) shl 10 + Word(S2) - SurrogateOffset;
end;

function UTF16ToUTF8(const S : WideString; Count : Integer; var utf : Boolean; Swap : Boolean = False) : AnsiString;
// Converts the given Unicode text (which may contain surrogates) into
// the UTF-8 encoding used for the HTML clipboard format.
const
  FirstByteMark: array[0..6] of Byte = ($00, $00, $C0, $E0, $F0, $F8, $FC);
var
  Ch           : UCS4;
  I, J, T      : Integer;
  BytesToWrite : Cardinal;
begin
  Result := '';
  utf := True;
  if (Count = 0) then
    Exit;
  begin
    // Make room for the result. Assume worst case, there are only short texts to convert.
    SetLength(Result, 6 * Count);
    T := 1;
    I := 1;
    BytesToWrite := 0;
    while I <= Count do
    begin
      Ch := UCS4(S[i]);
      if Swap then				// for UTF16BE
	Ch := SwapUTF16Endian(Ch);

      if (Hi(Ch) = 0) and (Lo(Ch) >= $C0) then	// for wild (mad) record with codepage cp1251
      begin
	BytesToWrite := 1;
	Result[T] := AnsiChar(Lo(Ch));
	inc(T, BytesToWrite);
	inc(I);
	utf := False;
	Continue;
      end
      else
      begin
	if (Ch and $FFFFF800) = $D800 then	// Is the character a surrogate?
	begin
	  inc(I);
	  // Check the following char whether it forms a valid surrogate pair with the first character.
	  if (I <= Count) and ((UCS4(S[i]) and $FFFFFC00) = $DC00) then
	    Ch := ConvertSurrogate(UCS2(Ch), UCS2(S[i]))
	  else // Skip invalid surrogate value.
	    Continue;
	end;

	if Ch < $80 then
	  BytesToWrite := 1
	else if Ch < $800 then
	  BytesToWrite := 2
	else if Ch < $10000 then
	  BytesToWrite := 3
	else if Ch < $200000 then
	  BytesToWrite := 4
	else if Ch < $4000000 then
	  BytesToWrite := 5
	else if Ch <= MaximumUCS4 then
	  BytesToWrite := 6
	else
	begin
	  BytesToWrite := 2;
	  Ch := ReplacementCharacter;
	end;
      end;
      for J := BytesToWrite downto 2 do
      begin
	Result[T + J - 1] := AnsiChar((Ch or $80) and $BF);
	Ch := Ch shr 6;
      end;
      Result[T] := AnsiChar(Ch or FirstByteMark[BytesToWrite]);
      inc(T, BytesToWrite);

      inc(I);
    end;
    SetLength(Result, T - 1); // set to actual length
  end;
end;

end.
__________________
Je venus de nulle part
55.026263 с.ш., 73.397636 в.д.
Ответить с цитированием