# -*- coding: utf-8 -*-

import sys
import csv
from math import log
from MeCabWrapper import execute_nlp
from logging import getLogger, StreamHandler, DEBUG
# Module-level logger: emits DEBUG-and-above records to stderr via a StreamHandler.
logger = getLogger(__name__)
handler = StreamHandler()
handler.setLevel(DEBUG)
logger.setLevel(DEBUG)
logger.addHandler(handler)

"""
The program calculate idf(inverse document frequency) of each word from original text data and write file.
  
  input: text file (<Original Text>)
  output: text file( <IDF list>)
  
  Process:
  1.Read input file
  2.Calculate each word importance
     2-1.give Morphological Analysis to Original Data
     2-2.Calculate IDF of each word

"""
max_idf_key = "@@MAX_IDF@@"
split_doc = False

def main():
    """Entry point: read <originalTextFile>/<outputIdfFile> from the CLI and run the IDF calculation."""
    # Guard clause: exactly two positional arguments are required.
    if len(sys.argv) != 3:
        logger.error('Usage: WordImportanceGenerator.py <originalTextFile> <outputIdfFile>')
        sys.exit()

    input_f, output_f = sys.argv[1], sys.argv[2]
    calculate_df(input_f, output_f)

    logger.info("WordImportanceCalculator is complete.")


def calculate_df(input_f, output_f):
    """Compute per-word IDF values from a text corpus and write them as CSV.

    Each line of *input_f* is one document (optionally split into sentences
    on '。' when the module flag ``split_doc`` is True). For every distinct
    word in a unit the document frequency is incremented once, and
    ``idf = log(total_units / df)`` is written as a "word,idf" row to
    *output_f*.  A final sentinel row (``max_idf_key``) carries the maximum
    observed IDF.

    :param input_f: path to the original text file (one document per line)
    :param output_f: path of the CSV file to write (word, idf)
    """
    words_map = {}  # word -> number of units the word appears in (document frequency)

    count = 0     # number of input lines read
    data_cnt = 0  # number of non-empty analyzed units (sentences or lines)

    # 'with' guarantees the handle is closed even if analysis raises.
    with open(input_f, 'r') as reader:
        for row in reader:
            # Optionally split each line into sentences on the Japanese full stop.
            if split_doc:
                split_col = row.split('。')
            else:
                split_col = [row]

            for col in split_col:
                if not col or col == '\n':
                    continue

                # Morphological analysis; count each distinct word once per unit.
                words = execute_nlp(col)
                for word in generate_non_duplicated_list(words):
                    words_map[word] = words_map.get(word, 0) + 1
                data_cnt += 1

            count += 1

    r_len = data_cnt

    if count == 1:
        logger.warning("The number of data is one. we should prepare more than 1.")

    logger.info("Read original data file.The number of data is %s" % count)
    logger.info("Total sentence number is %s" % r_len)

    max_idf = -1

    with open(output_f, 'w') as out_file:
        writer = csv.writer(out_file, lineterminator='\n')

        # NOTE: .items() replaces the Python-2-only .iterkeys(); works on 2 and 3.
        for key, df in words_map.items():
            if r_len != 1:
                # float() avoids Python 2 integer division truncating the
                # ratio before the logarithm (the original bug).
                idf = log(float(r_len) / df)
            else:
                # Degenerate single-unit corpus: IDF is undefined, use 1.0.
                idf = 1.0

            if idf > max_idf:
                max_idf = idf

            writer.writerow([key, idf])

        # Sentinel row with the maximum IDF seen, for downstream normalization.
        writer.writerow([max_idf_key, max_idf])


def generate_non_duplicated_list(pos):
    """Return the unique words of *pos*, preserving first-occurrence order.

    Uses an auxiliary set for O(1) membership tests, replacing the original
    O(n^2) list-membership scan, while keeping the same ordered-list result.

    :param pos: iterable of words (morphemes) from the analyzer
    :return: list of distinct words in order of first appearance
    """
    seen = set()
    single_word_list = []
    for word in pos:
        if word not in seen:
            seen.add(word)
            single_word_list.append(word)

    return single_word_list

if __name__ == '__main__':
    main()
