#!/bin/bash

##########################################################
# Copyright (c) 2001-2025 Alexey Kuryakin daqgroup@mail.ru
##########################################################

##########################################################
# Utility to check text files for UTF8 validity.
##########################################################

###########################################################
source $(crwkit which crwlib_base.sh); # Use base library #
source $(crwkit which crwlib_file.sh); # Use file library #
###########################################################

# File counters updated while checking (all integers).
declare -i numfiles=0 txtfiles=0 binfiles=0; # all files seen, text files, non-text files
declare -i sucfiles=0 badfiles=0;            # text files that passed / failed the check

# Options set from the command line.
declare -i opt_verb=0; # verbosity bit flags, tested with has_bit
declare -i maxdepth=9; # find -maxdepth value, 0 disables the limit

# Bit numbers inside opt_verb.
readonly bit_valid=0 bit_binary=1 bit_broken=2;

# Report the short usage line through fatal with code 1.
function do_usage(){
 local synopsis="(-h|--help|--version|-v|--verb) (file|dir)(s)";
 fatal 1 "Usage: $scriptname $synopsis";
};

# Print the script name followed by its version to stdout.
function do_version(){
 printf '%s\n' "$scriptname version 1.0";
};

# Print the version line followed by the help screen to stdout.
# The here-document is unquoted so that $scriptname and
# $maxdepth are expanded at the time the help is printed.
function do_print_help(){
 do_version;
 cat <<HELP_SCREEN
$scriptname Copyright (c) 2025 Alexey Kuryakin daqgroup@mail.ru
Utility to check UTF8 validity of text files.
Usage:
 $scriptname [-options] [file|dir …]
Options:
 --version   - print version
 -h,--help   - print help screen
 -v,--verb F - set verbosity flags (F)
 -vF,--verbF - set verbosity flags (F)
   F=0       - less verbosity, print only broken files (default)
   F=1=bit0  - more verbosity, print also valid  files
   F=2=bit1  - more verbosity, print also binary files
   F=4=bit2  - more verbosity, print also broken lines
 -m,--max M  - set max.depth (M=0..9) to find files
 -mM,--maxM  - set max.depth (M=0..9) to find files
               0 mean unlimited depth, default is $maxdepth
Parameters:
 file|dir …  - file or directory list to search in
Verbosity flags:
Examples:
 $scriptname .              # check text file(s) in current directory with default options
 $scriptname -m1 -v7 /etc   # check text file(s) in /etc dir with max.depth 1 and max.verbosity
 $scriptname --help         # print help screen
 $scriptname --version      # print version
HELP_SCREEN
};

# Print readable regular files found under the given paths,
# one per line, following symlinks (find -L).
# Arguments: "$@" - start points passed to find
# Globals:   maxdepth - adds -maxdepth when greater than 0
# Error messages of find are discarded.
function list_files(){
 local -a limit=();
 if (( maxdepth > 0 )); then
  limit=(-maxdepth "$maxdepth");
 fi;
 find -L "$@" "${limit[@]}" -readable -type f 2>/dev/null;
};

# Test a single bit of an integer.
# Arguments: $1 - integer value, $2 - bit number (0 = lowest)
# Returns:   0 if the bit is set, 1 otherwise
function has_bit(){
 # (( )) succeeds only when the expression is non-zero.
 if (( ($1 >> $2) & 1 )); then
  return 0;
 else
  return 1;
 fi;
};

# Tell whether file(1) describes a file as text.
# Arguments: $1 - file name
# Returns:   0 if the description contains the word "text"
#            (case-insensitive), 1 otherwise or when $1 is
#            empty or not readable
function file_is_text(){
 [[ -n $1 ]] || return 1;
 [[ -r $1 ]] || return 1;
 # sed drops everything up to the last colon, so the file name
 # that file(1) prints in front cannot produce a match itself.
 local -i hits="$(file "$1" | sed 's/.*:\s*//' | grep -ci '\btext\b')";
 if [[ $hits -gt 0 ]]; then return 0; fi;
 return 1;
};

# Check one text file and report the result.
# Arguments: $1 - file name
# Globals:   opt_verb, bit_valid, bit_broken (read)
# Outputs:   broken report to stderr (plus the broken lines with
#            their numbers when bit_broken is set in opt_verb);
#            valid report to stdout only when bit_valid is set
# Returns:   0 if no broken line was found, 1 otherwise or when
#            $1 is empty or not readable
# NOTE(review): a line counts as broken when grep cannot match it
# as a whole with '.*'; that presumably depends on the current
# locale being a UTF-8 one - confirm.
function check_utf8(){
 if [[ -z $1 ]] || [[ ! -r $1 ]]; then
  return 1;
 fi;
 local total="$(wc -l < "$1")";
 local broken="$(grep -c -vax '.*' < "$1")";
 if [[ $broken -le 0 ]]; then
  if has_bit $opt_verb $bit_valid; then
   colorize_none print_to_stdout "$1 : ";
   colorize_bold print_to_stdout "UTF8 VALID";
   colorize_norm echo_to_stdout  " $total text line(s)";
  fi;
  return 0;
 fi;
 colorize_none print_to_stderr "$1 : ";
 colorize_bold print_to_stderr "UTF8 BROKEN";
 colorize_norm echo_to_stderr  " $broken of $total text line(s) ";
 if has_bit $opt_verb $bit_broken; then
  grep -n -vax '.*' < "$1" 1>&2;
 fi;
 return 1;
};

# Check every file listed by list_files for the given
# files/directories and print a one-line summary.
# Arguments: "$@" - file or directory list, passed to list_files
# Globals:   numfiles, txtfiles, binfiles, sucfiles, badfiles
#            (updated); opt_verb, bit_binary (read)
# Outputs:   summary to stdout (colored through `unix ansi` when
#            stdout is a terminal); "SKIP" notes to stderr when
#            bit_binary is set in opt_verb
# Returns:   0 if badfiles is 0 afterwards, 1 otherwise
function do_check_utf8(){
 local fn="";
 # Read the list line by line instead of word-splitting it, so
 # names with spaces or glob characters stay intact. A separate
 # descriptor (3) keeps stdin untouched for the loop body.
 while IFS= read -r -u 3 fn; do
  (( numfiles += 1 ));
  if file_is_text "$fn"; then
   (( txtfiles += 1 ));
   if check_utf8 "$fn"; then
    (( sucfiles += 1 ));
   else
    (( badfiles += 1 ));
   fi;
  else
   (( binfiles += 1 ));
   if has_bit $opt_verb $bit_binary; then
    colorize_none print_to_stderr "$fn : ";
    colorize_warn print_to_stderr "SKIP";
    colorize_head echo_to_stderr  " non-text file";
   fi;
  fi;
 done 3< <(list_files "$@");
 if [[ -t 1 ]]; then
  # Option lists are arrays so each option is passed as its own word.
  local -a optall=(--bg-black) optnum=(--bold --white-intense);
  local -a opttxt=(--normal --white) optbin=(--normal --white);
  local -a optsuc=(--normal --white) optbad=(--normal --green-intense);
  if [[ $badfiles -gt 0 ]]; then optbad=(--bold --red-intense); fi;
  if [[ $txtfiles -gt 0 ]]; then opttxt=(--bold --cyan-intense); fi;
  if [[ $sucfiles -gt 0 ]]; then optsuc=(--bold --green-intense); fi;
  if [[ $binfiles -gt 0 ]]; then optbin=(--bold --yellow-intense); fi;
  unix ansi -n "${optall[@]}" "${optnum[@]}" "$numfiles file(s) processed, ";
  unix ansi -n "${optall[@]}" "${opttxt[@]}" "$txtfiles text file(s), ";
  unix ansi -n "${optall[@]}" "${optbin[@]}" "$binfiles binary file(s), ";
  unix ansi -n "${optall[@]}" "${optsuc[@]}" "$sucfiles valid/utf8, ";
  unix ansi    "${optall[@]}" "${optbad[@]}" "$badfiles broken/utf8";
 else
  echo "$numfiles file(s) processed, $txtfiles text file(s), $binfiles binary file(s), $sucfiles valid/utf8, $badfiles broken/utf8";
 fi;
 if [[ $badfiles -eq 0 ]]; then return 0; else return 1; fi;
};

######
# MAIN
######

# Parse command-line options, then check the remaining arguments.
# Arguments: "$@" - options followed by files/directories
# Globals:   opt_verb, maxdepth (set from options)
# Returns:   0 after --version/--help (and after do_usage when no
#            argument is given); otherwise the status of
#            do_check_utf8
function main(){
 if [[ $# -eq 0 ]]; then
  do_usage;
  return 0;
 fi;
 # Parsing stops at the first empty or non-option argument.
 while [[ -n $1 ]]; do
  case $1 in
   --version)                      do_version; return 0; ;;
   -h|-help|--help)                do_print_help; return 0; ;;
   -v[0-7]|-verb[0-7]|--verb[0-7]) let opt_verb=${1: -1:1}; ;;
   -v|-verb|--verb)                if is_number "$2"; then let opt_verb=$2; shift; else let opt_verb=1; fi; ;;
   -m[0-9]|-max[0-9]|--max[0-9])   let maxdepth=${1: -1:1}; ;;
   -m|-max|--max)
    # The depth is mandatory and must be plain digits: otherwise a
    # following file/dir name would be consumed and evaluated
    # arithmetically (usually giving 0 = unlimited depth).
    if [[ $2 =~ ^[0-9]+$ ]]; then
     maxdepth=$((10#$2)); # force base 10 so that 08 is not bad octal
     shift;
    else
     fatal 1 "Error: option $1 requires a numeric argument";
     return 1;
    fi;
    ;;
   -*)                             fatal 1 "Error: bad option $1"; ;;
   *)                              break; ;;
  esac;
  shift;
 done;
 do_check_utf8 "$@";
};

# Script entry point: pass all command-line arguments to main.
main "$@";

##############
## END OF FILE
##############
