#!/bin/bash

###########################################################
## Copyright (c) 2002-2023 Alexey Kuryakin daqgroup@mail.ru
###########################################################

###########################################################
## Batch (mass) conversion of the *.htm* files found in
## directory $1 from windows-1251 to utf-8.
## File names and href/src references are lowercased first.
## The meta tag string, for example
## 'content="text/html; charset=windows-1251"',
## is used to filter the HTML file(s) to convert.
###########################################################

###########################################################
source $(crwkit which crwlib_base.sh); # Use base library #
source $(crwkit which crwlib_file.sh); # Use file library #
###########################################################

readonly good_enca="1251 koi8-r ascii";
readonly list_ansi="cp1251 win-1251 windows-1251 windows-1252 iso_8859-1 iso-8859-1 us-ascii ascii";
readonly list_utf8="utf8 utf-8";

function cat_all(){
  for item in $*; do
   if [ -e $item ]; then cat $item; fi;
  done;
};

function tr_upper_lower(){
 tr '[:upper:]' '[:lower:]';
};

function grep_charset(){
 grep -i '<meta .* content=["'']text/html; *charset=[a-zA-Z0-9-]*["'']';
};

function extract_charset(){
 grep_charset | sed -e 's|.* content=["'']text/html; *charset=||i' -e 's/["''].*//i';
};

function href_to_lower(){
 #sed -i --follow-symlinks 's|\( href=["'']*\)\([a-zA-Z0-9#_\\/:-]*\)\(["'']*\)|\1\L\2\3|ig' "$1";
 sed -i --follow-symlinks 's|\( href=["'']*\)\([^"''<> #]*\)\(["''<> #]*\)|\L\1\L\2\3|ig' "$1";
};

function src_to_lower(){
 sed -i --follow-symlinks 's|\( src=["'']*\)\([^"''<> #]*\)\(["''<> #]*\)|\L\1\L\2\3|ig' "$1";
};

function has_href_src_data(){
 local s="$(cat $1 | grep -i 'src=["'']*data:.*;base64,')";  if [ -n "$s" ]; then return 0; fi;
 local s="$(cat $1 | grep -i 'href=["'']*data:.*;base64,')"; if [ -n "$s" ]; then return 0; fi;
 false;
};

function main(){
 ###################
 # declare variables
 ###################
 local nerrs=0;  local nseds=0;  local nlows=0;
 local nproc=0;  local flist=(); local files="";
 local nsucc=0;  local nfail=0;  local nrefs=0;
 local nsrcs=0;
 ####################################
 # find/process files in directory $1
 ####################################
 if [ -d "$1" ]; then
  #####################################################
  # find and lowercase all *.htm* files in directory $1
  # skip file names contains spaces
  #####################################################
  files="$(find $1 -type f -name '*.htm*' | grep -v ' ' | xargs)";
  echo "Found $(echo $files | wc -w) HTML file(s) in $1";
  for item in $files; do
   if [ -e $item ]; then
    if unix filecase -l -q $item; then let nlows++; else let nerrs++; fi;
    if has_href_src_data $item; then
     echo "skip data: $item";
    else
     if href_to_lower $item; then let nrefs++; else let nerrs++; fi;
     if src_to_lower  $item; then let nsrcs++; else let nerrs++; fi;
    fi;
   fi;
   let nproc++;
  done;
  echo "Processed $nproc, lower $nlows, href $nrefs, src $nsrcs, errors $nerrs";
  ##################
  # list of encoding
  ##################
  #enca $files 2>/dev/null | grep '^\S' | sed 's/^.*: //' | sort | uniq;
  #return;
  #####################
  # list of all charset
  #####################
  #cat_all $files | extract_charset | tr_upper_lower | sort | uniq | xargs;
  #return;
  ###############################################
  # find list of all *.htm* files in directory $1
  # with filter string and detected encoding 1251
  ###############################################
  files="$(find $1 -type f -name '*.htm*' | xargs)";
  echo "Found $(echo $files | wc -w) HTML file(s) in $1";
  for item in $files; do
   if [ -e $item ]; then
    local cs="$(cat $item | extract_charset)";
    if [ -n "$cs" ] && word_is_in_list $cs $list_ansi; then
     local enc="$(enca $item | head -n 1 | tr_upper_lower)";
     local ncp=0;
     for wd in $enc; do
      if word_is_in_list $wd $good_enca; then let ncp++; fi;
     done;
     if [ $ncp -eq 0 ]; then
      echo "$cs $enc - $item";
      continue;
     fi;
     flist+=($item);
    fi;
   fi;
  done;
  echo "Seleted ${#flist[@]} files to convert";
  # return;
  ################################################
  # convert all found *.htm* files in directory $1
  ################################################
  let nproc=0; # let nerrs=0;
  files="${flist[@]}"; # echo "$files";
  echo "Found $(echo $files | wc -w) ANSI HTML file(s) to process";
  for item in $files; do
   if enca -c $item; then
    if sed -i --follow-symlinks 's|content=["'']text/html; charset=[a-zA-Z0-9-]*["'']|content="text/html; charset=utf-8"|i' $item; then let nseds++; else let nerrs++; fi;
    let nsucc++;
   else
    let nfail++;
    let nerrs++;
   fi;
   let nproc++;
  done;
  echo "Processed files: $nproc, Succeeded: $nsucc, Failed: $nfail  Errors: $nerrs";
 else
  fatal 1 "$scriptname: invalid directory \"$1\"";
 fi;
};

# Script entry point: hand all command line arguments over to main.
main "$@";

##############
## END OF FILE
##############
