


def do_adam():
	"""Fit ``w`` and ``b`` with Adam and return the final values.

	The function takes no arguments; it reads these module-level names:

	* ``init_w``, ``init_b`` -- starting values of the two parameters.
	* ``max_epochs`` -- number of parameter updates to perform.
	* ``X``, ``Y`` -- training inputs and targets, iterated in lockstep.
	* ``grad_w``, ``grad_b`` -- per-point gradient functions, called as
	  ``grad_w(w, b, x, y)`` and ``grad_b(w, b, x, y)``.

	Every epoch sums the gradients over all ``(x, y)`` pairs and then applies
	exactly one Adam step, so each update uses the whole data set.

	Returns:
		The pair ``(w, b)`` after ``max_epochs`` updates. When ``max_epochs``
		is 0 this is simply ``(init_w, init_b)``.
	"""
	w, b = init_w, init_b
	eta = 0.1                    # learning rate
	beta1, beta2 = 0.9, 0.999    # decay rates of the first / second moment averages
	eps = 1e-8                   # keeps the denominator away from zero

	# Running (biased) averages of the gradient and of the squared gradient.
	m_w, m_b = 0, 0
	v_w, v_b = 0, 0

	for i in range(max_epochs):
		# Accumulate the gradient over the full data set at the current (w, b).
		dw, db = 0, 0
		for x, y in zip(X, Y):
			dw += grad_w(w, b, x, y)
			db += grad_b(w, b, x, y)

		m_w = beta1 * m_w + (1 - beta1) * dw
		m_b = beta1 * m_b + (1 - beta1) * db

		v_w = beta2 * v_w + (1 - beta2) * dw**2
		v_b = beta2 * v_b + (1 - beta2) * db**2

		# The averages start at zero and are therefore biased towards zero in
		# early steps; dividing by (1 - beta^t), with t = i + 1, corrects that.
		m_correction = 1 - math.pow(beta1, i + 1)
		v_correction = 1 - math.pow(beta2, i + 1)

		m_w_hat = m_w / m_correction
		m_b_hat = m_b / m_correction

		v_w_hat = v_w / v_correction
		v_b_hat = v_b / v_correction

		# NOTE: eps is added inside the square root here, whereas the original
		# Adam paper uses sqrt(v_hat) + eps. Kept as-is so results are unchanged.
		w = w - (eta / np.sqrt(v_w_hat + eps)) * m_w_hat
		b = b - (eta / np.sqrt(v_b_hat + eps)) * m_b_hat

	return w, b


