#  DATA CHECKING FUNCTIONS
# description: functions to check data that can come from Excel files: 
# checking of column type, format, value, etc.
# author : Juliette Fabre / OSU OREME
# creation : 11/12
# last update : 03/21

# Truncate list of items depending on max_item value
# Arguments:
# - item: vector of items (indices of lines or values)
# - max_item: maximum number of item to return
# 
# Values:
# - item: truncated vector of items (indices of lines or values)
truncate_item <- function(item, max_item)
{
  if(!is.null(max_item) && length(item) > max_item) item <- c(item[1:max_item], paste0("... (", length(item) - max_item + 1, " donnees supplementaires)"))
  return(item)
}

# Format list of items depending on max_item value
# Arguments:
# - item: vector of items (indices of lines or values)
# - max_item: maximum number of item to return
# 
# Values:
# - item: formated vector of items (indices of lines or values)
format_item <- function(item, max_item)
{
  item <- truncate_item(item, max_item)
  item <- paste0(item, collapse = " ; ")
  return(item)
}

# Global informations for further functions:
#
# Arguments:
# - col: name or index of the column to check
# - data: dataframe
# - sheet: optional, name of the Excel sheet. Is used to print the name of the concerned sheet in case of Excel data file
# When it makes sense, a supplementary argument can be provided:
# - msg_type: optional, default 'wrong_values', type of message to return. If 'wrong_lines', a string containing the numbers of the problematic lines will be returned. If 'wrong_values', it will be a list with unique wrong values
# - shift: optional, default 0. Number of lines to be added to the line indices if msg_type='wrong_lines'
# - max_item: optional, default NULL. Maximum number of items (values or lines) that error message should return
#
# Values:
# - res: result of the test: 0 if it failed, 1 otherwise
# - msg: an empty string if the checking succeeded, and an error message if it failed

# Check that there isn't any missing value in a given dataframe column
check_no_missing_value <- function(col, data, sheet = "", shift = 0, max_item = NULL)
{
  msg <- ""
  res <- 1
  if(any(is.na(data[, col]) | as.character(data[, col]) == "")) 
  {
    res <- 0
    if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
    sub_data <- data[is.na(data[, col]) | as.character(data[, col]) == "",]
    item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
    msg <- paste(msg, "La colonne '", col, "' contient des donnees manquantes, ligne(s) :\n ", item, ".\n\n")
  }
  return(list(res = res, msg = msg))
}

# Check that all values are unique in a given dataframe column
#
# Supplementary arguments:
# - check_consistency: optional, default false. If true, this test actually consists in checking the consistency between the attributes of each individual of the column, and it only changes the returned message (this should be used in case of repetitions of individuals in a column, with other columns containing features of the individuals).
# - cased: optional, default true. If false, the case won't be taken into account 
check_unicity <- function(col, data, check_consistency = F, msg_type = "wrong_values", sheet = "", cased = T, shift = 0, max_item = NULL)
{
  msg <- ""
  res <- 1
  
  if(!cased) data[, col] <- tolower(data[, col])
  
  if(any(table(data[, col]) > 1))
  {
    res <- 0
    if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
    sub_data <- data[!is.na(data[, col]) & data[, col] %in% names(table(data[, col])[table(data[, col]) > 1]), ]
    
    if(msg_type == "wrong_lines") 
    {
      item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item) 
      if(check_consistency) {
        msg <- paste0(msg, "La colonne '", col, "' contient des elements comportant des attributs differents, ligne(s) :\n", item, ".\n\n") 
      } else {
        msg <- paste0(msg, "La colonne '", col, "' contient plusieurs donnees du même nom, ligne(s) :\n", item, ".\n\n")
      }
    } else 
    {
      item <- format_item(names(table(data[, col])[table(data[, col]) > 1]), max_item)
      if(check_consistency) {
        msg <- paste0(msg, "La colonne '", col, "' contient des elements comportant des attributs differents :\n", item, ".\n\n") 
      } else {
        msg <- paste0(msg, "La colonne '", col, "' contient plusieurs donnees du même nom :\n", item, ".\n\n") 
      }
    }
  }
  
  return(list(res = res, msg = msg))
}

# Check that all values of a given dataframe column belong to a given set
#
# Supplementary arguments:
# - value_set: vector of accepted values
# - value_description: textual description of the value set
# - cased: optional, default true. If false, the case won't be taken into account 
check_belong <- function(col, data, value_set, value_description = "les valeurs autorisees", msg_type = "wrong_values", sheet = "", cased = T, shift = 0, max_item = NULL)
{
  msg <- ""
  res <- 1
  
  if(!cased)
  {
    data[, col] <- tolower(data[, col])
    value_set <- tolower(value_set)
  }
  
  if(any(!is.na(data[, col]) & !data[, col] %in% value_set))
  {
    res <- 0
    if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
    sub_data <- data[!is.na(data[, col]) & !data[, col] %in% value_set, ]
    if(msg_type == "wrong_lines")
    {
      item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des donnees qui n'existent pas dans ", value_description, ", ligne(s) : \n", item, ".\n\n") 
    } else 
    {
      item <- format_item(unique(sub_data[, col]), max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des donnees qui n'existent pas dans ", value_description, " : \n", item, ".\n\n") 
    }
  }
  return(list(res = res, msg = msg))
}

# Check that a given dataframe column is numeric
check_numeric <- function(col, data, msg_type = "wrong_values", sheet = "", shift = 0, max_item = NULL)
{
  msg <- ""
  res <- 1
  
  if(is.factor(data[, col])) data[, col] <- as.character(data[, col])
  
  if(any(!is.na(data[, col]) & data[, col] != '' & is.na(as.numeric(data[, col])))) 
  {
    data_try <- data
    data_try[, col] <- gsub(",", ".",  data_try[, col])
    if(any(!is.na(data_try[, col]) & data_try[, col] != '' & is.na(as.numeric(data_try[, col])))) 
    {
      res <- 0
      if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
      sub_data <- data_try[!is.na(data_try[, col]) & is.na(as.numeric(data_try[, col])), ]
      if(msg_type == "wrong_lines")
      {
        item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
        msg <- paste0(msg, "La colonne '", col, "' contient des donnees non numeriques, ligne(s) :\n ", item, ".\n\n")
      } else 
      {
        item <- format_item(unique(sub_data[, col]), max_item)
        msg <- paste0(msg, "La colonne '", col, "' contient des donnees non numeriques :\n ", item, ".\n\n")
      }
    } 
  }
  return(list(res = res, msg = msg))
}

# Check that all values of a given dataframe column are within a given range (for numeric, date, datetime or time vectors)
#
# Supplementary arguments:
# - min: minimum value of the range. Should be of the same type and format than 'col' (numeric, string, POSIXct, Date)
# - max: maximum value of the range. Should be of the same type and format than 'col' (numeric, string, POSIXct, Date)
# - datatype: type of data : numeric, datetime, date or time (default numeric)
check_within_range <- function(col, data, min, max, datatype = "numeric", msg_type = "wrong_values", sheet = "", shift = 0, max_item = NULL)
{
  if(is.factor(data[, col])) data[, col] <- as.character(data[, col])
  
  # Check vector data type
  if(datatype == "datetime")
  {
    res_check <- check_datetime(col, data, msg_type = msg_type, sheet = sheet, shift = shift, max_item = max_item)
    res <- res_check$res
    msg <- res_check$msg
  } else if(datatype == "date")
  {
    res_check <- check_date(col, data, msg_type = msg_type, sheet = sheet, shift = shift, max_item = max_item)
    res <- res_check$res
    msg <- res_check$msg
  } else if(datatype == "time")
  {
    res_check <- check_time(col, data, msg_type = msg_type, sheet = sheet, shift = shift, max_item = max_item)
    res <- res_check$res
    msg <- res_check$msg
  } else
  {
    res_check <- check_numeric(col, data, msg_type = msg_type, sheet = sheet, shift = shift, max_item = max_item)
    res <- res_check$res
    msg <- res_check$msg
    data[, col] <- as.numeric(data[,col])
  }
  
  # If datatype is OK, check range
  if(res)
  {
    if(any(!is.na(data[, col]) & (data[, col] > max | data[, col] < min)))
    {
      res <- 0
      if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
      sub_data <- data[!is.na(data[, col]) & (data[, col] > max | data[, col] < min), ]
      if(msg_type == "wrong_lines")
      {
        item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
        msg <- paste0(msg, "La colonne '", col, "' contient des donnees qui ne sont pas dans l'intervalle [", min, " - ", max, "], ligne(s) : \n", item, ".\n\n") 
      } else 
      {
        item <- format_item(unique(sub_data[, col]), max_item)
        msg <- paste0(msg, "La colonne '", col, "' contient des donnees qui ne sont pas dans l'intervalle [", min, " - ", max, "] : \n", item, ".\n\n") 
      }
    }
  }
  
  return(list(res = res, msg = msg))
}

# Check that a given dataframe column is integer
check_integer <- function(col, data, msg_type = "wrong_values", sheet = "", shift = 0, max_item = NULL)
{
  msg <- ""
  res <- 1
  
  if(is.factor(data[, col])) data[, col] <- as.character(data[, col])
  
  if(any((!is.na(data[, col]) & data[, col] != '' & is.na(as.numeric(data[, col]))) | (!is.na(data[, col]) & data[, col] != '' & !is.na(as.numeric(data[, col])) & round(as.numeric(data[, col])) != as.numeric(data[, col])))) 
  {
    res <- 0
    if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
    sub_data <- data[(!is.na(data[, col]) & is.na(as.numeric(data[, col]))) | (!is.na(data[, col]) & !is.na(as.numeric(data[, col])) & round(as.numeric(data[, col])) != as.numeric(data[, col])), ]
    if(msg_type == "wrong_lines")
    {
      item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des donnees non entieres, ligne(s): \n ", item, ".\n\n")
    } else 
    {
      item <- format_item(unique(sub_data[, col]), max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des donnees non entieres : \n ", item, ".\n\n") 
    }
  }
  
  return(list(res = res, msg = msg))
}

# Check that all values in a given dataframe column respect a maximum number of characters
#
# Supplementary arguments:
# - nbchar: maximum number of characters
# - test: optional, default 'sup'. If 'equal', the functions tests if the length of each elements of the column is equal to nbchar, otherwise it tests if the length of each elements of the column is superior to nbchar
check_nb_character <- function(col, data, nbchar, test = "sup", msg_type = "wrong_values", sheet = "", shift = 0, max_item = NULL)
{
  msg <- ""
  res <- 1
  
  if(is.factor(data[, col])) data[, col] <- as.character(data[, col])
  
  if(test == "equal" & any(nchar(data[, col]) != nbchar & !is.na(data[, col])))
  {
    res <- 0
    if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
    sub_data <- data[nchar(data[, col]) != nbchar & !is.na(data[, col]), ]
    if(msg_type == "wrong_lines")
    {
      item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des donnees qui ne font pas ", nbchar, " caracteres, ligne(s) : \n", item, ".\n\n")
    } else
    {
      item <- format_item(unique(sub_data[, col]), max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des donnees qui ne font pas ", nbchar, " caracteres : \n", item, ".\n\n") 
    }
  }	else if(any(nchar(data[, col]) > nbchar & !is.na(data[, col]))) 
  {
    res <- 0
    if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
    sub_data <- data[nchar(data[, col]) > nbchar & !is.na(data[, col]), ]
    if(msg_type == "wrong_lines")
    {
      item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des donnees de plus de ", nbchar, " caracteres, ligne(s) : ", item, ".\n\n") 
    } else 
    {
      item <- format_item(unique(sub_data[, col]), max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des donnees de plus de ", nbchar, " caracteres : \n", item, ".\n\n") 
    }
  }	
  
  return(list(res = res, msg = msg))
}

# Check that a given dataframe column is date-formatted (yyyy-mm-dd)
#
# Supplementary argument:
# - year_accepted: optional, default false. Should years be accepted as dates?
# - date_format: optional, default "%Y-%m-%d"
check_date <- function(col, data, year_accepted = F, msg_type = "wrong_values", date_format="%Y-%m-%d", sheet = "", shift = 0, max_item = NULL)
{
  msg <- ""
  res <- 1
  
  if(!year_accepted)
  {
    if(any(!is.na(data[, col]) & is.na(strptime(data[, col], date_format)))) 
    {
      res <- 0
      if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
      sub_data <- data[!is.na(data[, col]) & is.na(strptime(data[, col], "%Y-%m-%d")), ]
      if(msg_type == "wrong_lines")
      {
        item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
        msg <- paste0(msg, "La colonne '", col, "' contient des dates qui ne sont pas au format date 'jj-mm-aaaa', ligne(s) :\n ", item, ".\n\n") 
      } else
      {
        item <- format_item(unique(sub_data[, col]), max_item)
        msg <- paste0(msg, "La colonne '", col, "' contient des dates qui ne sont pas au format date 'jj-mm-aaaa' :\n ", item, ".\n\n") 
      }
    }
  } else if(any(!is.na(data[, col]) & is.na(strptime(data[, col], date_format)) & !is_year(data[, col]))) 
  {
    res <- 0
    if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
    sub_data <- data[!is.na(data[, col]) & is.na(strptime(data[, col], "%Y-%m-%d")) & !is_year(data[, col]), ]
    if(msg_type == "wrong_lines")
    {
      item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des dates qui ne sont pas au format date 'aaaa-mm-jj', ligne(s) :\n ", item, ".\n\n") 
    } else 
    {
      item <- format_item(unique(sub_data[, col]), max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des dates qui ne sont pas au format date 'aaaa-mm-jj' :\n ", item, ".\n\n") 
    }
  }
  
  return(list(res = res, msg = msg))
}

# Check that a given dataframe column is datetime-formatted (yyyy-mm-dd HH:MM:SS)
check_datetime <- function(col, data, msg_type = "wrong_values", sheet = "", shift = 0, max_item = NULL)
{
  msg <- ""
  res <- 1
  
  if(any(!is.na(data[, col]) & is.na(strptime(data[, col], "%Y-%m-%d %H:%M:%S")))) 
  {
    res <- 0 
    if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
    sub_data <- data[!is.na(data[, col]) & is.na(strptime(data[, col], "%Y-%m-%d %H:%M:%S")), ]
    if(msg_type == "wrong_lines") 
    {
      item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des dates qui ne sont pas au format datetime 'aaaa-mm-jj hh:mm:ss', ligne(s) :\n ", item, ".\n\n") 
    } else
    {
      item <- format_item(unique(sub_data[, col]), max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des dates qui ne sont pas au format datetime 'aaaa-mm-jj hh:mm:ss', ligne(s) :\n ", item, ".\n\n")
    }
  }
  
  return(list(res = res, msg = msg))
}

# Check that a given dataframe column is time-formatted (HH:MM:SS)
check_time <- function(col, data, msg_type = "wrong_values", sheet = "", shift = 0, max_item = NULL)
{
  msg <- ""
  res <- 1
  
  if(any(!is.na(data[, col]) & is.na(strptime(data[, col], "%H:%M:%S")))) 
  {
    res <- 0 
    if(nchar(sheet)) msg <- paste0("'", toupper(sheet), "' : ")
    sub_data <- data[!is.na(data[, col]) & is.na(strptime(data[, col], "%H:%M:%S")), ]
    if(msg_type == "wrong_lines") 
    {
      item <- format_item(as.integer(row.names(sub_data)) + 1 + shift, max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des heures qui ne sont pas au format hh:mm:ss, ligne(s) :\n ", item, ".\n") 
    } else 
    {
      item <- format_item(unique(sub_data[, col]), max_item)
      msg <- paste0(msg, "La colonne '", col, "' contient des heures qui ne sont pas au format hh:mm:ss :\n ", item, ".\n") 
    }
  } 
  
  return(list(res = res, msg = msg))
}