 {
 ****************************************************************************
 CRW32 project
 Copyright (C) 2020 Alexey Kuryakin kouriakine@mail.ru
 UTF8 - UTF16 - Ansi encoding/decoding routines.
 Sources:
  Delphi 7 RTL - AnsiToUtf8/Utf8ToAnsi,UTF8Encode/UTF8Decode
  http://www.cyberforum.ru/delphi-beginners/thread946045.html
  https://docs.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
  https://docs.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-multibytetowidechar
 Modifications:
  20200116 - initial version by AK
 ****************************************************************************
 }
unit _utf8;

{$I _sysdef}

interface

uses
 {$IFDEF USES_SHAREMEM} ShareMem, {$ENDIF} windows;

type
 TUnicodeInt            = 0..$10FFFF;       // Range of UCS according to rfc3629

const                                       // UTF8 magic constants, see rfc3629
 UTF8_BOM               = #$EF#$BB#$BF;     // "BYTE ORDER MARK" (abbreviated "BOM")    Utf8Decode(UTF8_BOM)=UCS_ZWNBSP
 UCS_ZWNBSP             = WideChar($FEFF);  // UCS U+FEFF "ZERO WIDTH NO-BREAK SPACE"   Utf8Encode(UCS_ZWNBSP)=UTF8_BOM
 AnsiToUtf8MaxFactor    = 4;                // Up to 4 bytes per UTF8 character allowed

const                                       // WinAPI magic constants:
 CP_ACP                 = 0;                // The system default Windows ANSI code page.
 CP_OEMCP               = 1;                // The current system OEM code page.
 CP_MACCP               = 2;                // The current system Macintosh code page.
 CP_THREAD_ACP          = 3;                // Windows 2000: The Windows ANSI code page for the current thread.
 CP_SYMBOL              = 42;               // Windows 2000: Symbol code page (42).
 CP_UTF7                = 65000;            // UTF-7. Use this value only when forced by a 7-bit transport mechanism.
 CP_UTF8                = 65001;            // UTF-8.

const                                       // MultiByteToWideChar flags:
 MB_PRECOMPOSED       = 1;
 MB_COMPOSITE         = 2;
 MB_ERR_INVALID_CHARS = 8;
 MB_USEGLYPHCHARS     = 4;

type
  Utf8String = type string;
  ByteString = type string;
  PUtf8String = ^Utf8String;
  PByteString = ^ByteString;

// Ansi <-> UTF8 conversion

function AnsiToUtf8(const S: AnsiString): Utf8String;
function Utf8ToAnsi(const S: UTF8String): AnsiString;

// WideString <-> UTF8 conversion

function UTF8Encode(const S: WideString): Utf8String;
function UTF8Decode(const S: UTF8String): WideString;

// PChar/PWideChar Unicode <-> UTF8 conversion (without WinApi call).

// UnicodeToUTF8(3), UTF8ToUnicode(3):
// Scans the source data to find the null terminator, up to MaxBytes
// Dest must have MaxBytes available in Dest.
// MaxDestBytes includes the null terminator (last char in the buffer will be set to null)
// Function result includes the null terminator.

function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: Integer): Integer; overload;
function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: Integer): Integer; overload;

// UnicodeToUtf8(4), UTF8ToUnicode(4):
// MaxDestBytes includes the null terminator (last char in the buffer will be set to null)
// Function result includes the null terminator.
// Nulls in the source data are not considered terminators - SourceChars must be accurate

function UnicodeToUtf8(Dest: PChar; MaxDestBytes: Cardinal; Source: PWideChar; SourceChars: Cardinal): Cardinal; overload;
function Utf8ToUnicode(Dest: PWideChar; MaxDestChars: Cardinal; Source: PChar; SourceBytes: Cardinal): Cardinal; overload;


// WideString <-> UTF8 conversion with WinApi functions.

function WideToUtf8String(const Source:WideString):Utf8String;
function Utf8ToWideString(const Source:Utf8String):WideString;

// UTF8 <--> WideString, UTF8 <--> AnsiString functions.

function utf8_valid(const S:AnsiString):Boolean;
function utf8_length(const S:Utf8String; check:Boolean=false):Integer;
function utf8_encode_wide(const S:WideString):Utf8String;
function utf8_decode_wide(const S:Utf8String):WideString;
function utf8_encode_ansi(const S:AnsiString):Utf8String;
function utf8_decode_ansi(const S:Utf8String):AnsiString;
function utf8_copy(const S:Utf8String; i,n:Integer):Utf8String;
function utf8_ord(const S:Utf8String; i:Integer):Integer;
function utf8_chr(c:Integer):Utf8String;
function utf8_uppercase(S:Utf8String):Utf8String;
function utf8_lowercase(S:Utf8String):Utf8String;

// Get status of last UTF8 function call.

function  GetLastErrorUtf8: DWORD;
procedure SetLastErrorUtf8(Code:DWORD);

implementation

threadvar Utf8LastError : DWORD;

function GetLastErrorUtf8: DWORD;
begin
 Result:=Utf8LastError;
end;

procedure SetLastErrorUtf8(Code:DWORD);
begin
 Utf8LastError:=Code;
end;

function AnsiToUtf8(const S: AnsiString): Utf8String;
begin
 Result := Utf8Encode(S);
end;

function Utf8ToAnsi(const S: Utf8String): AnsiString;
begin
 Result := Utf8Decode(S);
end;

function Utf8Encode(const S: WideString): Utf8String;
var  L: Integer; Temp: Utf8String;
begin
 Result := '';
 if S = '' then Exit;
 // SetLength includes space for null terminator
 SetLength(Temp, Length(S) * AnsiToUtf8MaxFactor);
 L := UnicodeToUtf8(PChar(Temp), Length(Temp)+1, PWideChar(S), Length(S));
 if L > 0 then SetLength(Temp, L-1) else begin Temp := ''; Utf8LastError:=ERROR_NO_UNICODE_TRANSLATION; end;
 Result := Temp;
end;

function Utf8Decode(const S: Utf8String): WideString;
var L: Integer; Temp: WideString;
begin
 Result := '';
 if S = '' then Exit;
 SetLength(Temp, Length(S)); // SetLength includes space for null terminator
 L := Utf8ToUnicode(PWideChar(Temp), Length(Temp)+1, PChar(S), Length(S));
 if L > 0 then SetLength(Temp, L-1) else begin Temp := ''; Utf8LastError:=ERROR_NO_UNICODE_TRANSLATION; end;
 Result := Temp;
end;

function UnicodeToUtf8(Dest: PChar; Source: PWideChar; MaxBytes: Integer): Integer;
var len: Cardinal;
begin
 len := 0;
 if Source <> nil then while Source[len] <> #0 do Inc(len);
 Result := UnicodeToUtf8(Dest, MaxBytes, Source, len);
end;

function Utf8ToUnicode(Dest: PWideChar; Source: PChar; MaxChars: Integer): Integer;
var len: Cardinal;
begin
 len := 0;
 if Source <> nil then while Source[len] <> #0 do Inc(len);
 Result := Utf8ToUnicode(Dest, MaxChars, Source, len);
end;

function UnicodeToUtf8(Dest: PChar; MaxDestBytes: Cardinal; Source: PWideChar; SourceChars: Cardinal): Cardinal;
var i, count: Cardinal; c: Cardinal;
begin
 Result := 0;
 if Source = nil then Exit;
 count := 0;
 i := 0;
 if Dest <> nil then begin
  while (i < SourceChars) and (count < MaxDestBytes) do begin
   c := Cardinal(Source[i]);
   Inc(i);
   if c <= $7F then begin
    Dest[count] := Char(c);
    Inc(count);
   end else
   if c > $7FF then begin
    if count + 3 > MaxDestBytes then break;
    Dest[count] := Char($E0 or (c shr 12));
    Dest[count+1] := Char($80 or ((c shr 6) and $3F));
    Dest[count+2] := Char($80 or (c and $3F));
    Inc(count,3);
   end else
   begin //  $7F < Source[i] <= $7FF
    if count + 2 > MaxDestBytes then break;
    Dest[count] := Char($C0 or (c shr 6));
    Dest[count+1] := Char($80 or (c and $3F));
    Inc(count,2);
   end;
  end;
  if count >= MaxDestBytes then count := MaxDestBytes-1;
  Dest[count] := #0;
 end else begin
  while i < SourceChars do begin
   c := Integer(Source[i]);
   Inc(i);
   if c > $7F then begin
    if c > $7FF then Inc(count);
    Inc(count);
   end;
   Inc(count);
  end;
 end;
 Result := count+1;  // convert zero based index to byte count
end;

function Utf8ToUnicode(Dest: PWideChar; MaxDestChars: Cardinal; Source: PChar; SourceBytes: Cardinal): Cardinal;
var i, count: Cardinal; c: Byte; wc: Cardinal;
begin
 if Source = nil then begin
  Result := 0;
  Exit;
 end;
 Result := Cardinal(-1);
 count := 0;
 i := 0;
 if Dest <> nil then begin
  while (i < SourceBytes) and (count < MaxDestChars) do begin
   wc := Cardinal(Source[i]);
   Inc(i);
   if (wc and $80) <> 0 then begin
    if i >= SourceBytes then Exit;          // incomplete multibyte char
    wc := wc and $3F;
    if (wc and $20) <> 0 then begin
     c := Byte(Source[i]);
     Inc(i);
     if (c and $C0) <> $80 then Exit;      // malformed trail byte or out of range char
     if i >= SourceBytes then Exit;        // incomplete multibyte char
     wc := (wc shl 6) or (c and $3F);
    end;
    c := Byte(Source[i]);
    Inc(i);
    if (c and $C0) <> $80 then Exit;       // malformed trail byte
    Dest[count] := WideChar((wc shl 6) or (c and $3F));
   end else Dest[count] := WideChar(wc);
   Inc(count);
  end;
  if count >= MaxDestChars then count := MaxDestChars-1;
  Dest[count] := #0;
 end else begin
  while (i < SourceBytes) do begin
   c := Byte(Source[i]);
   Inc(i);
   if (c and $80) <> 0 then begin
    if i >= SourceBytes then Exit;          // incomplete multibyte char
    c := c and $3F;
    if (c and $20) <> 0 then begin
     c := Byte(Source[i]);
     Inc(i);
     if (c and $C0) <> $80 then Exit;      // malformed trail byte or out of range char
     if i >= SourceBytes then Exit;        // incomplete multibyte char
    end;
    c := Byte(Source[i]);
    Inc(i);
    if (c and $C0) <> $80 then Exit;       // malformed trail byte
   end;
   Inc(count);
  end;
 end;
 Result := count+1;
end;

function CheckCount(Count:Integer; var Dest:Utf8String):Boolean; overload;
begin
 Result:=(Count>0);
 if not Result then begin
  Utf8LastError:=GetLastError;
  Dest:='';
 end;
end;

function CheckCount(Count:Integer; var Dest:WideString):Boolean; overload;
begin
 Result:=(Count>0);
 if not Result then begin
  Utf8LastError:=GetLastError;
  Dest:='';
 end;
end;

const BufferMax = 511;

function WideToUtf8String(const Source:WideString):Utf8String;
var Count,SourceLen:Integer; Buffer:packed array[0..BufferMax] of Char; Temp:Utf8String;
begin
 Result:='';
 SourceLen:=Length(Source);
 if SourceLen>0 then begin
  Count:=WideCharToMultiByte(CP_UTF8,0,PWideChar(Source),SourceLen,nil,0,nil,nil);
  if not CheckCount(Count,Result) then Exit;
  // Use Buffer for small strings to avoid memory allocation
  if Count<(SizeOf(Buffer) div SizeOf(Buffer[0])) then begin
   Count:=WideCharToMultiByte(CP_UTF8,0,PWideChar(Source),SourceLen,Buffer,Count,nil,nil);
   if not CheckCount(Count,Result) then Exit;
   SetString(Result,Buffer,Count);
  end else begin
   SetString(Temp,nil,Count);
   Count:=WideCharToMultiByte(CP_UTF8,0,PWideChar(Source),SourceLen,PChar(Temp),Count,nil,nil);
   if not CheckCount(Count,Result) then Exit;
   SetString(Result,PChar(Temp),Count);
  end;
 end;
end;

function Utf8ToWideString(const Source:Utf8String):WideString;
var Count,SourceLen:Integer; Buffer:packed array[0..BufferMax] of WideChar; Temp:WideString;
begin
 Result:='';
 SourceLen:=Length(Source);
 if SourceLen>0 then begin
  Count:=MultiByteToWideChar(CP_UTF8,0,PChar(Source),SourceLen,nil,0);
  if not CheckCount(Count,Result) then Exit;
  // Use Buffer for small strings to avoid memory allocation
  if Count<(SizeOf(Buffer) div SizeOf(Buffer[0])) then begin
   Count:=MultiByteToWideChar(CP_UTF8,0,PChar(Source),SourceLen,Buffer,Count);
   if not CheckCount(Count,Result) then Exit;
   SetString(Result,Buffer,Count);
  end else begin
   SetString(Temp,nil,Count);
   Count:=MultiByteToWideChar(CP_UTF8,0,PChar(Source),SourceLen,PWideChar(Temp),Count);
   if not CheckCount(Count,Result) then Exit;
   SetString(Result,PWideChar(Temp),Count);
  end;
 end;
end;

function utf8_valid(const S:AnsiString):Boolean;
begin
 if (S<>'') and (utf8_length(S,true)=0)
 then Result:=false
 else Result:=true;
end;

function utf8_length(const S:Utf8String; check:Boolean=false):Integer;
var dwFlags:DWORD;
begin
 Result:=0;
 if Length(S)=0 then Exit;
 dwFlags:=0; if check then dwFlags:=dwFlags or MB_ERR_INVALID_CHARS;
 Result:=MultiByteToWideChar(CP_UTF8,dwFlags,PChar(S),Length(S),nil,0);
 if Result=0 then Utf8LastError:=GetLastError;
end;

function utf8_encode_wide(const S:WideString):Utf8String;
begin
 Result:=WideToUtf8String(S);
end;

function utf8_decode_wide(const S:Utf8String):WideString;
begin
 Result:=Utf8ToWideString(S);
end;

function utf8_encode_ansi(const S:AnsiString):Utf8String;
begin
 Result:=WideToUtf8String(S);
end;

function utf8_decode_ansi(const S:Utf8String):AnsiString;
begin
 Result:=Utf8ToWideString(S);
end;

function utf8_copy(const S:Utf8String; i,n:Integer):Utf8String;
begin
 Result:='';
 if i<=0 then Exit; if n<=0 then Exit; if S='' then Exit;
 Result:=WideToUtf8String(Copy(Utf8ToWideString(S),i,n));
end;

function utf8_ord(const S:Utf8String; i:Integer):Integer;
var Temp:WideString;
begin
 Result:=-1;
 if i<=0 then Exit;
 Temp:=Utf8ToWideString(S);
 if i>Length(Temp) then Exit;
 Result:=Ord(Temp[i]);
end;

function utf8_chr(c:Integer):Utf8String;
begin
 if (c>=Low(TUnicodeInt)) and (c<=High(TUnicodeInt))
 then Result:=WideToUtf8String(WideChar(c))
 else Result:='';
end;

function utf8_uppercase(S:Utf8String):Utf8String;
var Temp:WideString;
begin
 Result:='';
 if Length(S)=0 then Exit;
 Temp:=Utf8ToWideString(S);
 if Length(Temp)=0 then Exit;
 CharUpperBuffW(PWideChar(Temp),Length(Temp));
 Result:=WideToUtf8String(Temp);
end;

function utf8_lowercase(S:Utf8String):Utf8String;
var Temp:WideString;
begin
 Result:='';
 if Length(S)=0 then Exit;
 Temp:=Utf8ToWideString(S);
 if Length(Temp)=0 then Exit;
 CharLowerBuffW(PWideChar(Temp),Length(Temp));
 Result:=WideToUtf8String(Temp);
end;

end.
