I have a problem with my tabular Q-learning implementation for tic-tac-toe on a 3×3 board.
The problem is that only the last move of each game (the winning, losing, or tying move) and its board state end up in the Q-table with a value other than 0.0. All other state-action pairs that lead up to that last move still have a value of 0.0. Below I've added the Q-table after the first episode: the last move has a value of 0.2, but all of the previous moves are still 0.0. Increasing the number of episodes does not change anything; only the final moves ever get a Q-value other than 0.0.
import enum
import random

import numpy as np

BOARD_SIZE = 9  # 3x3 board, flattened to 9 cells
Q_TABLE_DICT = {}  # maps (board_key, action) -> Q-value

# Helpers such as game_over, possible_moves, make_move_to, get_reward and
# get_random_move are defined elsewhere and omitted here for brevity.


class Mark(enum.StrEnum):
    CROSS = "X"
    NAUGHT = "O"
    EMPTY = "_"


# Mixed in float instead of IntEnum, since the tie/step rewards are not integers
class Reward(float, enum.Enum):
    WIN = 1
    LOSE = -1
    TIE = -0.065
    NON_TERMINAL = -0.01


# Constants for Q-learning
EPSILON = 0.1  # Exploration factor
ALPHA = 0.2  # Learning rate
GAMMA = 0.95  # Discount factor
TOTAL_EPISODES = 1  # Total number of games the agent will play

BOARD = np.array([Mark.EMPTY] * BOARD_SIZE)
def update_q_table(board, action, reward, new_board):
    board_key = "".join(board)
    new_board_key = "".join(new_board)
    old_value = Q_TABLE_DICT.get((board_key, action), 0)
    if game_over(new_board):
        # If it's a terminal state, there are no future rewards to consider
        next_max = 0
    else:
        # Estimate the optimal future value
        next_max = max(
            Q_TABLE_DICT.get((new_board_key, a), 0) for a in possible_moves(new_board)
        )
    # Update the Q-value for the current state-action pair using the Bellman equation
    q_value = old_value + ALPHA * (reward + GAMMA * next_max - old_value)
    Q_TABLE_DICT[(board_key, action)] = q_value
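# Sanity check I did by hand: for the final (winning) move the update above
# gives exactly the 0.2 that shows up in the Q-table dump below, since the
# new board is terminal and next_max is therefore 0:
#     q_value = 0 + ALPHA * (Reward.WIN + GAMMA * 0 - 0)
#             = 0 + 0.2 * (1 + 0.95 * 0 - 0)
#             = 0.2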
def train_q_learning_agent():
    for episode in range(TOTAL_EPISODES):
        board = np.array([Mark.EMPTY] * BOARD_SIZE)  # Reset the board
        current_mark = Mark.CROSS
        while not game_over(board):
            # Q-learning agent (X) makes a move
            if current_mark == Mark.CROSS:
                action = choose_action_q_learning(board, training=True)
                new_board = make_move_to(board, action, current_mark)
                reward = get_reward(new_board, current_mark)
                print(new_board)
                update_q_table(board, action, reward, new_board)
            # Random player (O) makes a move
            else:
                action = get_random_move(board)
                new_board = make_move_to(board, action, current_mark)
            board = new_board
            current_mark = Mark.NAUGHT if current_mark == Mark.CROSS else Mark.CROSS
def choose_action_q_learning(board, training=True) -> int:
    if training and random.uniform(0, 1) < EPSILON:
        # Exploration: choose a random action
        return np.random.choice(possible_moves(board))
    else:
        # Exploitation: choose the best action based on the current Q-table
        board_key = "".join(board)
        q_values = {
            action: Q_TABLE_DICT.get((board_key, action), 0)
            for action in possible_moves(board)
        }
        return max(q_values, key=q_values.get)
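In case it matters, this is roughly how I serialize the Q-table for inspection (the helper name is just for illustration; the tuple keys are turned into strings so json accepts them, which is why the keys below look like stringified tuples):

import json

def dump_q_table():
    # Stringify the (board_key, action) tuple keys so json.dumps accepts them
    printable = {str(key): value for key, value in Q_TABLE_DICT.items()}
    print(json.dumps(printable, indent=4))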
Q-table dictionary, dumped as JSON after the first episode:
{
"('_________', 0)": 0.0,
"('XO_______', 2)": 0.0,
"('XOX____O_', 3)": 0.0,
"('XOXX___OO', 4)": 0.0,
"('XOXXXO_OO', 6)": 0.2
}
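What I expected to see instead: once the winning entry holds 0.2, a later visit to the preceding state-action pair should pull part of that value backwards through the discount. A hand calculation with the constants above (treating the shown follow-up position as the next state) would give something like:

expected = 0 + ALPHA * (Reward.NON_TERMINAL + GAMMA * 0.2 - 0)
# = 0.2 * (-0.01 + 0.95 * 0.2) = 0.2 * 0.18 = 0.036

but those earlier entries stay at exactly 0.0 no matter how long I train.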