"""Read a CSV file and hash the email addresses.
This script reads a CSV file, hashes each email address with the selected algorithm (plain MD5, SHA-1 or SHA-256 digests; no HMAC key is used), and then writes the hashes to another file.
The script will output some execution information (line count, time taken and output file
name) by default unless the --silent flag is specified.
"""

import os
import sys
import time
import hmac
import random
import getpass
import zipfile
import hashlib
import argparse
import re

# Hash-type identifiers accepted for the `type` command-line argument.
# The argument is upper-cased before it is compared against this list.
ALLOWED_HASH_TYPES = ['HEMMD5', 'HEMSHA1', 'HEMSHA2']

def is_valid_email_address(email_address):
    """Loosely check that a string looks like ``local@domain.tld``.

    Returns the regex match object when the text has something before an
    ``@``, something after it, and at least one dot in that trailing part;
    returns ``None`` otherwise. Callers only rely on truthiness.
    """
    pattern = re.compile(r'^(?:.+)@(?:.+?)\.(.+)$')
    return pattern.match(email_address)

def matchHashType(hashType, email):
    """Return the hex digest of ``email`` for the given hash-type identifier.

    ``HEMMD5`` maps to MD5, ``HEMSHA1`` to SHA-1 and ``HEMSHA2`` to SHA-256.
    The email is encoded as UTF-8 before hashing. Any other identifier
    yields an empty string.
    """
    digest_factories = {
        'HEMMD5': hashlib.md5,
        'HEMSHA1': hashlib.sha1,
        'HEMSHA2': hashlib.sha256,
    }
    factory = digest_factories.get(hashType)
    if factory is None:
        return ""
    return factory(email.encode('utf-8')).hexdigest()
def _locate_columns(header_line, in_file):
    """Return the (email, add, remove) column positions from the CSV header.

    Header cells are compared case-insensitively after stripping surrounding
    whitespace, so the three columns may appear in any order and the line
    ending of the header does not matter.

    Args:
        header_line: the first line of the input file, exactly as read.
        in_file: path of the input file; used only in the error message.

    Returns:
        A tuple ``(email_index, add_index, remove_index)``.

    Raises:
        SystemExit: if any of the three required columns is missing. The
            message names the column that could not be found.
    """
    header = [cell.strip().lower() for cell in header_line.split(",")]
    positions = []
    for column in ("email", "add", "remove"):
        if column not in header:
            sys.exit("Error: unable to find column '{0}' in "
                     "input file {1}".format(column, os.path.abspath(in_file)))
        positions.append(header.index(column))
    return tuple(positions)


def hash_email(args):
    """Hash the email column of a CSV file and write the result to a new file.

    The input must start with a header line containing ``email``, ``add`` and
    ``remove`` columns. For every data row with a valid-looking email address
    a line ``<hash type>,<hash>,<add>,<remove>`` is written to
    ``<input name>_hashed_<hash type><input extension>`` in the current
    directory. Rows that are blank, too short, or whose email does not pass
    ``is_valid_email_address`` are skipped.

    Args:
        args: parsed command-line namespace with the attributes ``file``
            (input path), ``type`` (hash-type identifier, any case),
            ``compress`` (also write a ZIP archive of the output) and
            ``silent`` (suppress the overwrite prompt and progress output).
            ``args.type`` is upper-cased in place.

    Raises:
        SystemExit: on an unknown hash type, a missing required column, a
            file that cannot be opened, or when the user declines to
            overwrite an existing output file.
    """
    in_file = args.file
    args.type = args.type.upper()

    if args.type not in ALLOWED_HASH_TYPES:
        sys.exit("Error: unable to find the provided hash type")
    hash_type = args.type

    # out_file is the CSV output name (input name with _hashed_<type> added),
    # out_file_zip is the ZIP archive name, and output_file is whichever of
    # the two is the final product of this run.
    base_name, extension = os.path.splitext(os.path.basename(in_file))
    out_file = "{0}_hashed_{1}{2}".format(base_name, hash_type.lower(), extension)
    out_file_zip = "{0}_hashed_{1}.zip".format(base_name, hash_type.lower())
    output_file = out_file_zip if args.compress else out_file

    # Ask before overwriting an existing output file. In --silent mode the
    # file is overwritten without prompting.
    if not args.silent:
        if os.path.isfile(output_file):
            answer = input("The output file {0} exists and will be overwritten\n Proceed? (type yes or no): ".format(output_file))
            # Accept "Yes", " y ", etc. rather than only the exact lowercase forms.
            if answer.strip().lower() not in ("yes", "y"):
                sys.exit()
        print("Please wait, hashing email addresses. This may take a while...")

    # Note the execution start time; this is helpful for profiling.
    start_time = time.time()
    hashed_count = 0

    try:
        with open(in_file, "r") as f:
            # Validate the header before opening the output file, so that a
            # bad input does not truncate a previously generated output.
            email_index, add_index, remove_index = _locate_columns(
                f.readline(), in_file)
            min_fields = max(email_index, add_index, remove_index) + 1

            with open(out_file, "w") as out:
                # The csv module is deliberately not used here (the original
                # author measured it as slower); as a consequence quoted
                # fields that contain commas are not supported.
                out.write("ID Type,ID,Add,Remove\n")
                for line in f:
                    fields = line.split(',')
                    if len(fields) < min_fields:
                        # Blank or truncated row: nothing usable to hash.
                        continue
                    email = fields[email_index].strip().lower()
                    if not is_valid_email_address(email):
                        continue
                    out.write("{0},{1},{2},{3}\n".format(
                        hash_type,
                        matchHashType(hash_type, email),
                        fields[add_index].strip(),
                        fields[remove_index].strip()))
                    hashed_count += 1
    except IOError:
        sys.exit("File {0} not found. "
                 "Please check the file path.".format(in_file))

    # If --compress was passed, compress the output file using ZIP.
    if args.compress:
        with zipfile.ZipFile(out_file_zip, 'w', zipfile.ZIP_DEFLATED) as fzip:
            fzip.write(out_file)

    # End of execution.
    end_time = time.time()

    if not args.silent:
        # Report the number of addresses actually hashed (not lines read);
        # this is also well-defined when the input has no data rows.
        print("Hashed {0} email addresses in {1:0.2f} seconds using {2} "
              "to {3}".format(hashed_count, end_time - start_time,
                              hash_type, output_file))


def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Two positional arguments are required: the input CSV path (``file``) and
    the hash-type identifier (``type``). The optional ``--compress`` and
    ``--silent`` switches are booleans that default to ``False``.

    Returns:
        The ``argparse.Namespace`` produced by parsing the command line.
    """
    cli = argparse.ArgumentParser(
        description="Read a CSV file and hash the email addresses."
    )

    # (destination, metavar shown in usage, help text)
    positionals = (
        ("file", "csv-file", "input csv file with email addresses"),
        ("type", "hash-type",
         "provide the type of hash to be generated using this script"),
    )
    for dest, placeholder, text in positionals:
        cli.add_argument(dest, metavar=placeholder, help=text)

    # (option string, help text) for the boolean switches
    switches = (
        ("--compress", "compress the output file (create a ZIP archive)"),
        ("--silent", "run in silent mode"),
    )
    for option, text in switches:
        cli.add_argument(option, action="store_true", help=text)

    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: parse the command line, then run the hashing job.
    hash_email(parse_args())