from __future__ import division
import sys
from math import log
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np  # numpy provides useful maths and vector operations
from numpy.random import random_sample


# This function isn't actually used in the lab, but included to show
# a simpler example of a bar chart than plot_distributions below.
def plot_histogram(values, counts):
    '''plot_histogram is a very general function that takes a list of
    values and a list of counts (one for each value; counts need not be
    integers), and makes a bar plot showing the count for each value.
    '''
    if len(values) != len(counts):
        print('ERROR: plot_histogram requires two list arguments of the same length')
        sys.exit(1)
    plt.clf()
    # arange() is like range() but returns a numpy array instead of a list
    x_pos = np.arange(len(counts))
    # first arg (x_pos) is the position of the left hand side of each bar,
    # second arg (counts) is the height of each bar
    plt.bar(x_pos, counts)
    # default bar width is .8, so put labels at left side + .4 (middle of bar)
    plt.xticks(x_pos + .4, values)
    plt.ylim([0, sum(counts)])
    plt.show()


def plot_distributions(true_distr, est_distr):
    '''plot_distributions takes two distributions, represented as
    dictionaries, with key-value pairs being each outcome and its
    probability.  The first argument is assumed to be the true
    distribution and the second one an estimated distribution.  It plots
    these next to each other in a bar plot.
    '''
    # When we access the items in a dictionary, they are in no
    # particular order, so to make sure the true and estimated
    # probabilities correspond to the same outcome, we need to sort the
    # items in each dictionary, and we also check to make sure that
    # the keys in the dictionaries match (i.e., they are distributions
    # over the same set of outcomes).
    plt.clf()
    sorted_true = sorted(true_distr.items())  # get sorted list of (key, val) pairs
    # the next lines use list comprehensions, a concise way to replace for loops
    true_labels = [item[0] for item in sorted_true]  # list of keys, still sorted
    true_probs = [item[1] for item in sorted_true]   # list of values, still sorted
    sorted_est = sorted(est_distr.items())
    est_labels = [item[0] for item in sorted_est]
    est_probs = [item[1] for item in sorted_est]
    if true_labels != est_labels:
        print('ERROR: plot_distributions requires two distributions over the same set of outcomes')
        sys.exit(1)
    # the x_pos array will be the position of the left side of each bar;
    # arange() is like range() except it creates an array instead of a list
    x_pos = np.arange(len(true_labels))
    bar_width = .4
    # bar() requires the position and height of each bar;
    # we also add the width and color
    bars1 = plt.bar(x_pos, true_probs, bar_width, color='r')
    # second distribution has bars shifted to the right by .4
    bars2 = plt.bar(x_pos + .4, est_probs, bar_width, color='b')
    plt.xticks(x_pos + .4, true_labels)
    plt.legend((bars1[0], bars2[0]), ('True', 'Est'))
    plt.ylim([0, sum(true_probs)])   # set max value of y axis
    plt.xlim([0, len(true_labels)])  # set max value of x axis
    plt.show()


def generate_random_sequence(distribution, N):
    '''generate_random_sequence takes a distribution (represented as a
    dictionary of outcome-probability pairs) and a number of samples N
    and returns a list of N samples from the distribution.
    This is a modified version of a sequence generator by fraxel on StackOverflow:
    http://stackoverflow.com/questions/11373192/generating-discrete-random-variables-with-specified-weights-using-scipy-or-numpy
    '''
    # As noted elsewhere, the ordering of keys and values accessed from
    # a dictionary is arbitrary.  However, we are guaranteed that keys()
    # and values() will use the *same* ordering, as long as we have not
    # modified the dictionary in between calling them.
    # The list() wrapper is for Py3.4 compatibility.
    outcomes = np.array(list(distribution.keys()))
    probs = np.array(list(distribution.values()))
    # make an array with the cumulative sum of probabilities at each
    # index (i.e., the cumulative distribution function)
    bins = np.cumsum(probs)
    # create N random numbers between 0 and 1;
    # digitize tells us which bin each one falls into.
    # return the sequence of outcomes associated with that sequence of bins
    # (we convert it from an array back to a list first)
    return list(outcomes[np.digitize(random_sample(N), bins)])


def normalize_counts(counts):
    '''normalize_counts takes a dictionary of counts as an argument and
    returns a corresponding dictionary of probabilities by normalizing
    the counts to sum to 1.
    '''
    ## students need to fill in correct function
    probs = counts
    return probs


def compute_likelihood(data, model):
    '''compute_likelihood takes a model (i.e., a distribution, represented
    as a dictionary of outcome-probability pairs) and a list of outcomes
    (the data) and computes the likelihood P(data | model).
    '''
    ## students need to fill in correct function
    return 0


def compute_log_likelihood(data, model):
    '''compute_log_likelihood takes a model (i.e., a distribution,
    represented as a dictionary of outcome-probability pairs) and a list
    of outcomes (the data) and computes the log (base 10) of the
    likelihood P(data | model).
    '''
    ## students need to fill in correct function
    return 0


## Main body of code ##

# Create a dictionary that stores a probability distribution
distribution = dict([('a', 0.2), ('b', 0.5), ('c', 0.17), ('d', 0.02),
                     ('e', 0.02), ('f', 0.01), ('g', 0.01), ('h', 0.01),
                     ('i', 0.01), ('j', 0.01), ('k', 0.01), ('l', 0.01),
                     ('m', 0.01), ('n', 0.01)])
# use a small tolerance rather than an exact equality test, since
# floating-point addition may not sum to exactly 1
if abs(sum(distribution.values()) - 1) > 1e-9:
    print('ERROR: Probability distribution does not sum to 1')
    sys.exit(1)

# Generate a sequence of 50 samples from the distribution.
str_list = generate_random_sequence(distribution, 50)
#str_list = generate_random_sequence(distribution, 500)  # or do 500
print(str_list)

# count how many times each outcome occurred in the sequence and store
# that in a dictionary.
# Uses a list comprehension, but it could be done with a for loop
str_counts = dict([(s, str_list.count(s)) for s in distribution.keys()])
print('counts:')
print(sorted(str_counts.items()))

# normalize to get an estimate of the original distribution
str_probs = normalize_counts(str_counts)
print('est probs:')
print(sorted(str_probs.items()))

# compare the true and estimated distributions visually
plot_distributions(distribution, str_probs)

# for later parts of lab - these functions currently just return 0
L1 = compute_likelihood(str_list, distribution)
L2 = compute_likelihood(str_list, str_probs)
LL1 = compute_log_likelihood(str_list, distribution)
LL2 = compute_log_likelihood(str_list, str_probs)
print('likelihoods: ', L1, L2)
print('log likelihoods: ', LL1, LL2)
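
# ---------------------------------------------------------------------------
# Illustrative sketch (an editorial addition, not part of the lab solution):
# the docstrings above describe the likelihood P(data | model) and its
# base-10 log.  For independently drawn outcomes the likelihood is the
# product of the model probability of each observed outcome, and the log
# likelihood is the matching sum of logs, so 10 ** LL should recover L.
# The toy model and data below are made up purely to show that relationship;
# they do not use the student-completed functions above.
toy_model = {'x': 0.75, 'y': 0.25}
toy_data = ['x', 'x', 'y']
toy_L = 0.75 * 0.75 * 0.25                               # product of per-outcome probabilities
toy_LL = log(0.75, 10) + log(0.75, 10) + log(0.25, 10)   # base-10 logs via math.log(x, 10)
assert abs(10 ** toy_LL - toy_L) < 1e-12                 # exponentiating the log likelihood recovers it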