import cntk as C
import numpy as np

# Synthetic dataset: 2-D points in the unit square, one-hot labels for three
# linear regions (the last rule overwrites the first two where it applies).
dataset_size = 200000
X = np.random.rand(dataset_size, 2)
labels = np.zeros((dataset_size, 3))
labels[X[:, 0] > X[:, 1]] = [0, 0, 1]
labels[X[:, 0] <= X[:, 1]] = [1, 0, 0]
labels[X[:, 1] + X[:, 0] > 1] = [0, 1, 0]

# Inputs carry an explicit free batch dimension (-1) instead of CNTK's dynamic batch axis.
x = C.input_variable(shape=(-1, 2), needs_gradient=False)
t = C.input_variable(shape=(-1, 3), needs_gradient=False)

# One hidden layer of 12 ReLU units; parameters drawn from a normal(0.01) initializer.
init = C.initializer.normal(0.01)
theta1 = C.Parameter(shape=(2, 12), init=init)
bias1 = C.Parameter(shape=(1, 12), init=init)
theta2 = C.Parameter(shape=(12, 3), init=init)
bias2 = C.Parameter(shape=(1, 3), init=init)

def forward(x):
    y = C.times(x, theta1) + bias1
    y = C.element_max(y, 0.)  # ReLU
    return C.times(y, theta2) + bias2

def softmax(x):
    e = C.exp(x)
    s = C.reduce_sum(e, axis=1)
    return e / s

def crossentropy(y, t):
    # Probability assigned to the correct class, then mean negative log-likelihood.
    prob = C.squeeze(C.reduce_sum(y * t, axis=1), 1)
    return -C.reduce_mean(C.log(prob))

# Equivalent built-in op:
# y = C.reduce_mean(C.cross_entropy_with_softmax(forward(x), t, axis=1))
y = crossentropy(softmax(forward(x)), t)

# Plain minibatch SGD with a step-decayed learning rate; parameters are updated
# by hand from the gradients returned by y.grad().
batch_size = 20
for i in range(min(dataset_size, 100000) // batch_size):
    lr = 0.5 * (.1 ** (max(i - 100, 0) // 1000))
    sample = X[batch_size * i:batch_size * (i + 1)]
    target = labels[batch_size * i:batch_size * (i + 1)]
    g = y.grad({x: sample, t: target}, wrt=[theta1, bias1, theta2, bias2])
    for param, grad in g.items():
        param.value = param.value - grad * lr
    loss = y.eval({x: sample, t: target})
    print("cost {} - learning rate {}".format(loss[0], lr))

# Evaluation: compare the predicted class index against the one-hot target.
y = C.squeeze(C.argmax(forward(x), 1), 1)
accuracy = 0
for i in range(1000):
    sample = X[batch_size * i:batch_size * (i + 1)]
    target = labels[batch_size * i:batch_size * (i + 1)]
    tt = y.eval({x: sample})[0]
    accuracy += np.sum(tt == np.argmax(target, axis=1))
print("Accuracy", accuracy / 1000. / batch_size)
# accuracy 99.36
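
# Optional sanity check (not part of the original script): the handwritten
# softmax + crossentropy above should give the same value as CNTK's fused
# C.cross_entropy_with_softmax, which the commented-out line refers to.
# The names below (loss_manual, loss_fused, check_x, check_t) are illustrative.
loss_manual = crossentropy(softmax(forward(x)), t)
loss_fused = C.reduce_mean(C.cross_entropy_with_softmax(forward(x), t, axis=1))
check_x, check_t = X[:batch_size], labels[:batch_size]
print("manual:", loss_manual.eval({x: check_x, t: check_t}),
      "fused:", loss_fused.eval({x: check_x, t: check_t}))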