"""Read a CSV file and hash the email addresses.
This script reads a CSV file, hashes the email addresses with MD5, SHA-1, or SHA-256 (selected by the hash-type argument), and then writes the hashes to another file.
The script will output some execution information (line count, time taken and output file
name) by default unless the --silent flag is specified.
"""

import argparse
import hashlib
import os
import re
import sys
import time
import zipfile

# Hash type names accepted for the ``hash-type`` argument. The user-supplied
# value is upper-cased before being compared against this list.
ALLOWED_HASH_TYPES = [
    "HEMMD5",
    "HEMSHA1",
    "HEMSHA2",
]


def is_valid_email_address(email_address):
    """Loosely check that *email_address* looks like ``local@domain.tld``.

    The check only requires at least one character before an ``@``, at least
    one character after it, a dot, and at least one trailing character.

    Returns:
        The ``re`` match object (truthy) when the address matches, otherwise
        ``None``. Group 1 of the match is the text after the first dot that
        follows the ``@`` part.
    """
    pattern = r'^(?:.+)@(?:.+?)\.(.+)$'
    return re.compile(pattern).match(email_address)


def match_hash_type(input, email):
    """Return the hexadecimal digest of *email* for the requested hash type.

    Args:
        input: Hash type name, matched case-insensitively: ``HEMMD5`` (MD5),
            ``HEMSHA1`` (SHA-1) or ``HEMSHA2`` (SHA-256).
        email: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The lowercase hex digest string, or an empty string when the hash
        type is not recognised.
    """
    digest_factories = {
        'HEMMD5': hashlib.md5,
        'HEMSHA1': hashlib.sha1,
        'HEMSHA2': hashlib.sha256,
    }
    make_digest = digest_factories.get(input.upper())
    if make_digest is None:
        return ""
    return make_digest(email.encode('utf-8')).hexdigest()


def process_line(line, email_index, id_type_index, hash_type, out):
    """Hash the email found in one CSV data line and write the result row.

    The line is split on commas (no quoting support). When the email column
    holds a valid address, a row of the form
    ``hash_type,email_hash,<remaining columns>`` is written to *out*; the
    original email column and the optional "ID Type" column are left out.
    Lines without a valid email address produce no output.

    Args:
        line: Raw text of one data line from the input file.
        email_index: Zero-based position of the email column.
        id_type_index: Zero-based position of the "ID Type" column to drop,
            or ``None`` when the input has no such column.
        hash_type: Hash type name passed on to ``match_hash_type`` and
            written as the first output column.
        out: Writable text file object that receives the output row.
    """
    columns = line.strip().split(',')

    # Blank or truncated lines (for example a trailing empty line) have no
    # email column at all; skip them instead of raising IndexError.
    if email_index >= len(columns):
        return

    # Emails are normalised (trimmed, lower-cased) before validation/hashing.
    email = columns[email_index].strip().lower()
    if not is_valid_email_address(email):
        return

    email_hash = match_hash_type(hash_type, email)

    # Keep every column except the email and the (optional) ID Type column.
    excluded = (email_index, id_type_index)
    remaining_columns = [col.strip() for i, col in enumerate(columns)
                         if i not in excluded]

    output_row = [hash_type, email_hash] + remaining_columns
    out.write(",".join(output_row) + "\n")


def hash_email(args):
    """Hash the email column of the input CSV and write the result file.

    The output CSV is named ``<input name>_hashed_<hash type><input ext>``
    and is written relative to the current working directory. With
    ``args.compress`` the CSV is additionally packed into a ZIP archive named
    ``<input name>_hashed_<hash type>.zip``.

    Args:
        args: Parsed command-line arguments providing ``file`` (input CSV
            path), ``type`` (hash type name), ``compress`` and ``silent``.

    The process exits via ``sys.exit`` when the hash type is not allowed,
    when the header has no ``email`` column, when the input file is missing
    or a file cannot be read/written, or when the user declines to overwrite
    an existing output file.
    """
    in_file = args.file

    hash_type = args.type.upper()
    if hash_type not in ALLOWED_HASH_TYPES:
        sys.exit("Error: unable to find the provided hash type")

    # out_file is the hashed CSV, out_file_zip the optional archive, and
    # output_file whichever of the two is reported to the user.
    base_name, extension = os.path.splitext(os.path.basename(in_file))
    out_file = "{0}_hashed_{1}{2}".format(base_name, hash_type.lower(), extension)
    out_file_zip = "{0}_hashed_{1}{2}".format(base_name, hash_type.lower(), ".zip")
    output_file = out_file_zip if args.compress else out_file

    # Ask before overwriting an existing output file, unless --silent was
    # passed, in which case the file is overwritten without prompting.
    if not args.silent and os.path.isfile(output_file):
        answer = input("The output file {0} exists and will be overwritten\n Proceed? (type yes or no): ".format(output_file))
        # Accept the answer regardless of case or surrounding whitespace.
        if answer.strip().lower() not in ("yes", "y"):
            sys.exit()

    if not args.silent:
        print("Please wait, hashing email addresses. This may take a while...")

    # We note the execution start time; this is helpful for profiling.
    start_time = time.time()

    # Number of data lines read. Initialised here so the summary below still
    # works when the input contains only a header line.
    line_count = 0

    try:
        with open(in_file, "r") as f, open(out_file, "w") as out:
            # Read the first line, which should contain the headers.
            first_line = f.readline().strip().split(",")
            first_line_lowercased = [col.lower() for col in first_line]
            try:
                email_index = first_line_lowercased.index('email')
            except ValueError:
                sys.exit("Error: Unable to find column 'email' in input file {0}".format(os.path.abspath(in_file)))
            id_type_index = (first_line_lowercased.index('id type')
                             if 'id type' in first_line_lowercased else None)

            # Replace 'email' and 'ID Type' with our own "ID Type" and "ID"
            # columns and keep every other header, so the header always has
            # the same number of columns as the data rows.
            header_without_email_and_id_type = [col for i, col in enumerate(first_line)
                                                if i != email_index and i != id_type_index]
            out.write(",".join(["ID Type", "ID"] + header_without_email_and_id_type) + "\n")

            # Process the rest of the lines.
            for line_count, line in enumerate(f, 1):
                process_line(line, email_index, id_type_index, hash_type, out)

    except FileNotFoundError:
        sys.exit("File {0} not found. "
                 "Please check the file path.".format(in_file))
    except OSError as err:
        # Permission problems, directories, full disks, etc. are not
        # "file not found"; report the underlying error instead.
        sys.exit("Error: unable to process {0}: {1}".format(in_file, err))

    # If --compress was passed, compress the output file using ZIP.
    if args.compress:
        with zipfile.ZipFile(out_file_zip, 'w', zipfile.ZIP_DEFLATED) as fzip:
            fzip.write(out_file)

    # End of execution.
    end_time = time.time()

    if not args.silent:
        # NOTE: the reported count is the number of data lines read, which
        # includes lines skipped for lacking a valid email address.
        print("Hashed {0} email addresses in {1:0.2f} seconds using {2} "
              "to {3}".format(line_count, end_time - start_time,
                              hash_type, output_file))


def parse_args(args):
    """Parse the command-line arguments of the script.

    Args:
        args: List of argument strings, without the program name.

    Returns:
        An ``argparse.Namespace`` with ``file`` and ``type`` (positional
        strings) plus the boolean flags ``compress`` and ``silent``.
    """
    parser = argparse.ArgumentParser(
        description="Read a CSV file and hash the email addresses."
    )

    # Required positionals: (destination name, metavar shown in help, help).
    positionals = (
        ("file", "csv-file", "input csv file with email addresses"),
        ("type", "hash-type",
         "provide the type of hash to be generated using this script"),
    )
    for dest, metavar, help_text in positionals:
        parser.add_argument(dest, metavar=metavar, help=help_text)

    # Optional boolean switches, all False unless given.
    switches = (
        ("--compress", "compress the output file (create a ZIP archive)"),
        ("--silent", "run in silent mode"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    return parser.parse_args(args)


if __name__ == "__main__":
    # Script entry point: parse the command line (without the program name)
    # and run the hashing job with the resulting options.
    hash_email(parse_args(sys.argv[1:]))