////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2001-2023 Alexey Kuryakin daqgroup@mail.ru under MIT license //
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// This file is part of the CRW-DAQ project by DaqGroup - component CRWLIB.   //
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// Purpose:                                                                   //
// UTF8 - UTF16 - Ansi encoding/decoding routines.                            //
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// History:                                                                   //
// 20200116 - initial version by AK                                           //
// 20230510 - Modified for FPC (A.K.)                                         //
////////////////////////////////////////////////////////////////////////////////

unit _crw_utf8;

{$I _crw_sysdef.inc}

{$I _crw_sysmode.inc}

interface

uses
 //////////////////////////////////////////////////////
 {$I _crw_uses_first.inc} // NB: MUST BE FIRST USES !!!
 //////////////////////////////////////////////////////
 sysutils, classes, math, lazutf8,
 _crw_alloc;

 // Default ANSI codepage, synonym of CP_ACP
const
 CP_ANSI = CP_ACP;

// UTF8 chars max bytes according to rfc3629
const
 AnsiToUtf8MaxFactor    = 4;

// UTF8 avail UCS range according to rfc3629
type
 TUnicodeInt            = 0..$10FFFF;

// UTF8 magic constants, see rfc3629
// "BYTE ORDER MARK" (abbreviated "BOM")    Utf8Decode(UTF8_BOM)=UCS_ZWNBSP
// UCS U+FEFF "ZERO WIDTH NO-BREAK SPACE"   Utf8Encode(UCS_ZWNBSP)=UTF8_BOM
const
 UTF8_BOM               = #$EF#$BB#$BF;
 UCS_ZWNBSP             = WideChar($FEFF);

const // Uses for utf8_fix_cp
 TheFavoriteAnsiCodePage : Word = 0; // 1251 for Russian/Windows

// Ansi <-> UTF8 conversion

function AnsiToUtf8(const S:LongString):LongString;
function Utf8ToAnsi(const S:LongString):LongString;

// UnicodeString <-> UTF8 conversion

function UTF8Encode(const S:UnicodeString):LongString;
function UTF8Decode(const S:LongString):UnicodeString;

// UnicodeString <-> UTF8 conversion

function UnicodeToUtf8String(const Source:UnicodeString):LongString;
function Utf8ToUnicodeString(const Source:LongString):UnicodeString;

// Change CP_UTF8 to CP_ACP if DefaultSystemCodePage is CP_UTF8
function DefaultCP(S:LongString):LongString; inline;

// Convert string S from codepage cpFrom to cpTo, then set codepage cpSet
// For example: ConvertCP(S,CP_ACP,1251,CP_NONE) converting string S from
// default CP to Windows-1251, return string with CP_NONE (RawByteString)
// Flag (silent) switch off bug report on exceptions.
function ConvertCP(const s:LongString; cpFrom,cpTo,cpSet:Word;
                   silent:Boolean=false):LongString;

// Simplified interface for CRW-DAQ and DaqPascal.
// UTF8 <--> UnicodeString, UTF8 <--> LongString functions.
// utf8_valid(S)        - S is valid UTF8?
// utf8_errpos(S)       - 0=OK or position of UTF8 error char
// utf8_length(S)       - UTF8 length or 0 on UTF8 error
// utf8_encode_wide(S)  - convert UTF8 to UTF16
// utf8_decode_wide(S)  - convert UTF16 to UTF8
// utf8_encode_ansi(S)  - convert UTF8 to Ansi
// utf8_decode_ansi(S)  - convert Ansi to UTF8
// utf8_copy(S,i,n)     - copy n UTF8 chars of S strar from UTF8 pos i
// utf8_chrpos(S,i)     - char position of UTF8 char number i of string S
// utf8_ord(S,i)        - UCS code of UTF8 char with UTF8 pos i of string S
// utf8_chr(c)          - UTF8 string of Unicode symbol with UCS code c
// utf8_uppercase(S)    - uppercase of UTF8 string S
// utf8_lowercase(S)    - lowercase of UTF8 string S
// utf8_fixbroken(S)    - fix broken UTF8 string (replace to spaces)
// utf8_sametext(S1,S2) - true if S1 and S2 is same UTF8 text (case insensitive)
// utf8_fix_cp(S,cp)    - fix (validate) S codepage (if not UTF8, convert from cp)
// Note: in UTF8 string, char pos/length may differ with UTF8 pos/length

function utf8_valid(const S:LongString):Boolean; overload;
function utf8_valid(S:PChar; Len:SizeInt):Boolean; overload;
function utf8_errpos(const S:LongString):SizeInt;
function utf8_length(const S:LongString):SizeInt;
function utf8_encode_wide(const S:UnicodeString):LongString;
function utf8_decode_wide(const S:LongString):UnicodeString;
function utf8_encode_ansi(const S:LongString):LongString;
function utf8_decode_ansi(const S:LongString):LongString;
function utf8_copy(const S:LongString; i,n:SizeInt):LongString;
function utf8_chrpos(const S:LongString; i:SizeInt):SizeInt;
function utf8_ord(const S:LongString; i:SizeInt=1):SizeInt;
function utf8_chr(c:SizeInt):LongString;
function utf8_uppercase(const S:LongString):LongString;
function utf8_lowercase(const S:LongString):LongString;
function utf8_fixbroken(const S:LongString):LongString;
function utf8_sametext(const S1,S2:LongString):Boolean;
function utf8_fix_cp(const S:LongString; cp:Word=0):LongString;

function Test_crw_utf8:LongString;

implementation

function DefaultCP(S:LongString):LongString; inline;
begin
 Result:=S;
 if (Result<>'') then
 if (DefaultSystemCodePage=CP_UTF8) then
 if (StringCodePage(Result)=CP_UTF8) then
 SetCodePage(Result,CP_ACP,false);
end;

function ConvertCP(const s:LongString; cpFrom,cpTo,cpSet:Word;
                   silent:Boolean=false):LongString;
var tmp:RawByteString;
begin
 Result:='';
 try
  tmp:=s;
  if (cpFrom=CP_ACP) then cpFrom:=DefaultSystemCodePage;
  if (cpTo=CP_ACP) then cpTo:=DefaultSystemCodePage;
  if (cpFrom<>cpTo) then begin
   SetCodePage(tmp,cpFrom,false);
   SetCodePage(tmp,cpTo,true);
   SetCodePage(tmp,cpSet,false);
  end;
  Result:=tmp;
 except
  on E:Exception do if not silent then BugReport(E,nil,'ConvertCodePage');
 end;
end;

function AnsiToUtf8(const S:LongString):LongString;
begin
 Result:=DefaultCP(SysToUtf8(S));
end;

function Utf8ToAnsi(const S:LongString):LongString;
begin
 Result:=DefaultCP(Utf8ToSys(S));
end;

function Utf8Encode(const S:UnicodeString):LongString;
begin
 Result:=DefaultCP(UTF16ToUTF8(S));
end;

function Utf8Decode(const S:LongString):UnicodeString;
begin
 Result:=UTF8ToUTF16(S);
end;

function UnicodeToUtf8String(const Source:UnicodeString):LongString;
begin
 Result:=DefaultCP(UTF16ToUTF8(Source));
end;

function Utf8ToUnicodeString(const Source:LongString):UnicodeString;
begin
 Result:=UTF8ToUTF16(Source);
end;

function utf8_valid(const S:LongString):Boolean; overload;
begin
 if (S='')
 then Result:=true // For valid S, FindInvalidUTF8Codepoint(S)=-1
 else Result:=(FindInvalidUTF8Codepoint(PChar(S),Length(S),true)<0);
end;

function utf8_valid(S:PChar; Len:SizeInt):Boolean; overload;
begin
 if (S=nil) then Result:=false else
 if (Len<0) then Result:=false else
 if (Len=0) then Result:=true  else
 Result:=(FindInvalidUTF8Codepoint(S,Len,true)<0);
end;

function utf8_errpos(const S:LongString):SizeInt;
begin
 if (S='')
 then Result:=0    // For valid S, FindInvalidUTF8Codepoint(S)=-1
 else Result:=Max(0,FindInvalidUTF8Codepoint(PChar(S),Length(S),true)+1);
end;

function utf8_length(const S:LongString):SizeInt;
begin
 if (S<>'') and utf8_valid(S)
 then Result:=UTF8LengthFast(S)
 else Result:=0;
end;

function utf8_encode_wide(const S:UnicodeString):LongString;
begin
 Result:=DefaultCP(UTF16ToUTF8(S));
end;

function utf8_decode_wide(const S:LongString):UnicodeString;
begin
 Result:=UTF8ToUTF16(S);
end;

function utf8_encode_ansi(const S:LongString):LongString;
begin
 Result:=DefaultCP(SysToUtf8(S));
end;

function utf8_decode_ansi(const S:LongString):LongString;
begin
 Result:=DefaultCP(Utf8ToSys(S));
end;

function utf8_copy(const S:LongString; i,n:SizeInt):LongString;
begin
 Result:=DefaultCP(Utf8Copy(S,i,n));
end;

function utf8_chrpos(const S:LongString; i:SizeInt):SizeInt;
var P:PChar;
begin
 Result:=0; if (S='') then Exit;
 if (i<1) or (i>utf8_length(S)) then Exit;
 P:=UTF8CodepointStart(Pointer(S),Length(S),i-1);
 if Assigned(P) then Result:=SubtractPointersAsPtrInt(P,Pointer(S))+1;
end;

function utf8_ord(const S:LongString; i:SizeInt=1):SizeInt;
var P:PChar; L:Integer;
begin
 Result:=-1;
 if (i<1) or (i>utf8_length(S)) then Exit;
 P:=UTF8CodepointStart(PChar(S),Length(S),i-1);
 if Assigned(P) then Result:=UTF8CodepointToUnicode(P,L);
end;

function utf8_chr(c:SizeInt):LongString;
begin
 if (c>=Low(TUnicodeInt)) and (c<=High(TUnicodeInt))
 then Result:=DefaultCP(UnicodeToUTF8(c))
 else Result:='';
end;

function utf8_uppercase(const S:LongString):LongString;
begin
 Result:=DefaultCP(UTF8UpperCase(S));
end;

function utf8_lowercase(const S:LongString):LongString;
begin
 Result:=DefaultCP(UTF8LowerCase(S));
end;

function utf8_fixbroken(const S:LongString):LongString;
var Tmp:String;
begin
 if (S='') then Result:='' else
 if utf8_valid(S) then Result:=S else begin
  Tmp:=S; UTF8FixBroken(Tmp);
  Result:=Tmp;
 end;
end;

function utf8_sametext(const S1,S2:LongString):Boolean;
var code:Integer;
begin
 if utf8_valid(S1) and utf8_valid(S2)
 then code:=UTF8CompareText(S1,S2)
 else code:=CompareText(S1,S2);
 Result:=(code=0);
end;

function utf8_fix_cp(const S:LongString; cp:Word=0):LongString;
begin
 Result:=S;
 if (S='') then Exit;
 if (DefaultSystemCodePage<>CP_UTF8) then Exit;
 if (cp=0) then cp:=TheFavoriteAnsiCodePage;
 if (cp=0) then Exit;
 if (cp=CP_UTF8) then Exit;
 if (cp=CP_NONE) then Exit;
 if utf8_valid(S) then Exit;
 Result:=ConvertCP(S,cp,CP_UTF8,CP_ACP,true);
 if (Result='') then Result:=S;
end;

function Test_crw_utf8:LongString;
var List:TStringList; sa,sb:LongString;
 procedure Test1(What,Test:LongString);
 begin
  sb:=Test;
  List.Add(Format('Test %s:',[What]));
  List.Add(Format(' Ansi: %-36s   Utf8: %-36s',[sa,sb]));
  List.Add(Format(' Stat: %d %-5u %-5u %-14u  Stat: %d %-5u %-5u %-14u',
           [Ord(utf8_valid(sa)),Length(sa),utf8_length(sa),StringCodePage(sa),
            Ord(utf8_valid(sb)),Length(sb),utf8_length(sb),StringCodePage(sb)]));
  if utf8_errpos(sb)>0 then List.Add(Format('Error in position %d',[utf8_errpos(sb)]));
 end;
begin
 Result:='';
 try
  List:=TStringList.Create;
  try
   List.Add('Test_crw_utf8:');
   sa:='Hello, World. Привет, Мир.';
   //Test1('Utf8Encode',Utf8Encode(sa));
   //Test1('Utf8Decode',Utf8Decode(sa));
   Test1('AnsiToUtf8',AnsiToUtf8(sa));
   Test1('uppercase',uppercase(sa));
   Test1('lowercase',lowercase(sa));
   Test1('utf8_uppercase',utf8_uppercase(sa));
   Test1('utf8_lowercase',utf8_lowercase(sa));
   Test1('utf8_copy(15,6)',utf8_copy(sa,15,6));
   Test1('utf8_ord(s[15])',IntToStr(utf8_ord(sa,15)));
   Test1('utf8_chr(s[15])',utf8_chr(utf8_ord(sa,15)));
   Test1('utf8_ord(s[23])',IntToStr(utf8_ord(sa,23)));
   Test1('utf8_chr(s[23])',utf8_chr(utf8_ord(sa,23)));
   Test1('Broken UTF8',copy(sa,1,length(sa)-2));
   List.Add(Format('utf8_chrpos(s,15) = %d',[utf8_chrpos(sa,15)]));
   List.Add(Format('utf8_chrpos(s,23) = %d',[utf8_chrpos(sa,23)]));
   Result:=List.Text;
  finally
   Kill(List);
  end;
 except
  on E:Exception do BugReport(E,nil,'Test_crw_utf8');
 end;
end;

///////////////////////////////////////
// Unit initialization and finalization
///////////////////////////////////////

procedure Init_crw_utf8;
begin
end;

procedure Free_crw_utf8;
begin
end;

initialization

 Init_crw_utf8;

finalization

 Free_crw_utf8;

end.

//////////////
// END OF FILE
//////////////

