# -*- coding: utf-8 -*-

from math import sqrt

from SentenceImportanceCalculator import calculate_importance_score
from WordImportanceGenerator import max_idf_key
from logging import getLogger, StreamHandler, DEBUG
logger = getLogger(__name__)
handler = StreamHandler()
handler.setLevel(DEBUG)
logger.setLevel(DEBUG)
logger.addHandler(handler)

"""
The program delete duplicate sentence based on similarity.

"""
similarity_limit = 0.5
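
# Worked example with the demo data in __main__ below (illustrative numbers only):
# with idf(今日)=0.7, idf(天気)=0.5, idf(晴れ)=0.6, the two demo sentences map to
# the TF-IDF vectors a = (0.7, 0.5, 0.0) and b = (0.7, 0.0, 0.6), so
#     cos(a, b) = 0.49 / (sqrt(0.74) * sqrt(0.85)) ~= 0.62 >= similarity_limit,
# and the less important of the two sentences is flagged as a duplicate.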


def delete_duplicate_sentence(sentences, word_list_list, idf_map):
    # type: (list, list[list], dict) -> list
    """Return the indices of the sentences that survive de-duplication."""
    delete_flagged_ids = []
    sentences_length = len(sentences)

    for i in range(sentences_length):
        for j in range(i + 1, sentences_length):
            if i not in delete_flagged_ids and j not in delete_flagged_ids:
                similarity = calculate_cosine_similarity(word_list_list[i], word_list_list[j], idf_map)
                logger.debug('Similarity between %s and %s is %s (limit %s)',
                             sentences[i], sentences[j], similarity, similarity_limit)
                if similarity >= similarity_limit:
                    # The pair is a duplicate: keep the more important sentence.
                    importance_i = calculate_importance_score(word_list_list[i], idf_map)
                    importance_j = calculate_importance_score(word_list_list[j], idf_map)
                    logger.debug('Importance[%d] = %s, Importance[%d] = %s',
                                 i, importance_i, j, importance_j)

                    if importance_i >= importance_j:
                        delete_flagged_ids.append(j)
                    else:
                        delete_flagged_ids.append(i)

    ans_sentence_index = []
    for i in range(len(sentences)):
        if i not in delete_flagged_ids:
            ans_sentence_index.append(i)

    return ans_sentence_index


def calculate_cosine_similarity(words1, words2, idf_map):
    # type: (list, list, dict) -> float
    """Compute the TF-IDF weighted cosine similarity of two tokenised sentences."""
    all_words = list(set(words1 + words2))
    word_a_tf_map = generate_tf(words1)
    word_b_tf_map = generate_tf(words2)

    dot_product = 0.0
    A = 0.0  # squared norm of the first TF-IDF vector
    B = 0.0  # squared norm of the second TF-IDF vector

    for word in all_words:
        # Words missing from the IDF map fall back to the maximum known IDF.
        if word in idf_map:
            idf = float(idf_map[word])
        else:
            idf = float(idf_map[max_idf_key])

        if word in words1 and word in words2:
            A += pow(idf * word_a_tf_map[word], 2)
            B += pow(idf * word_b_tf_map[word], 2)
            dot_product += (idf * word_a_tf_map[word]) * (idf * word_b_tf_map[word])
        elif word in words1:
            A += pow(idf * word_a_tf_map[word], 2)
        else:
            B += pow(idf * word_b_tf_map[word], 2)

    # cos(a, b) = a . b / (|a| * |b|); guard against empty vectors.
    if A == 0.0 or B == 0.0:
        return 0.0
    return dot_product / (sqrt(A) * sqrt(B))
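
# Sanity check (hypothetical call, not part of the pipeline): identical token
# lists yield a similarity of 1.0, e.g.
#     calculate_cosine_similarity(['a', 'b'], ['a', 'b'], {'a': 1.0, 'b': 1.0})
# returns 2.0 / (sqrt(2.0) * sqrt(2.0)) == 1.0.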


def generate_tf(words):
    # type: (list) -> dict
    """Count the raw term frequency of each word in the token list."""
    tf_map = {}

    for word in words:
        if word in tf_map:
            tf_map[word] += 1
        else:
            tf_map[word] = 1

    return tf_map
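
# For example, generate_tf(['今日', '天気', '今日']) returns {'今日': 2, '天気': 1}.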

if __name__ == '__main__':
    sentences = ["今日の天気がいいですね", "今日の天気は晴れですね"]
    word_a = ["今日", "天気"]
    word_b = ["今日", "晴れ"]
    word_list = [word_a, word_b]
    idf_map = {
        '今日': 0.7,
        '天気': 0.5,
        '晴れ': 0.6
    }

    surviving_indices = delete_duplicate_sentence(sentences, word_list, idf_map)

    # delete_duplicate_sentence returns indices, so look them up to print sentences.
    for index in surviving_indices:
        print(sentences[index])
