
Python – Tabular Q-Learning for TicTacToe – Only the last state/action pair is stored in the Q-Table Dictionary with a value other than 0

I have a problem with my tabular q-learning implementation for tictactoe 3×3 board.

The problem is that only the last move (winning,lose,tie) and its respective board state are stored in the q-table with a q-value other than “0.0”. All other state and actions pairs that lead to the last move still have a value of “0.0”. I added in the following the q-table, where it shows that the last move has a value of “0.2” but all the previous moves have a value of “0.0” and that is just for the first episode. Even after increasing the episodes it does not change anything. Only the last actions have a q-value other than “0.0”

class Mark(enum.StrEnum):
    CROSS = "X"
    NAUGHT = "O"
    EMPTY = "_"

class Reward(enum.IntEnum):
    WIN = 1
    LOSE = -1
    TIE = -0.065
    NON_TERMINAL = -0.01

# Constants for Q-Learning
EPSILON = 0.1  # Exploration factor
ALPHA = 0.2  # Learning rate
GAMMA = 0.95  # Discount factor

TOTAL_EPISODES = 1  # Total number of games the agent will play

BOARD = np.array([Mark.EMPTY] * BOARD_SIZE)
def update_q_table(board, action, reward, new_board):
    board_key = "".join(board)
    new_board_key = "".join(new_board)

    old_value = Q_TABLE_DICT.get((board_key, action), 0)

    if game_over(new_board):
        # If it's a terminal state, there are no future rewards to consider
        next_max = 0
        # Estimate the optimal future value
        next_max = max(
            Q_TABLE_DICT.get((new_board_key, a), 0) for a in possible_moves(new_board)

    # Update the Q-value for the current state-action pair using the bellman equation
    q_value = old_value + ALPHA * (reward + GAMMA * next_max - old_value)
    Q_TABLE_DICT[(board_key, action)] = q_value
def train_q_learning_agent():
    for episode in range(TOTAL_EPISODES):
        board = np.array([Mark.EMPTY] * BOARD_SIZE)  # Reset the board
        current_mark = Mark.CROSS

        while not game_over(board):
            # Q-learning agent (X) makes a move
            if current_mark == Mark.CROSS:
                action = choose_action_q_learning(board, training=True)
                new_board = make_move_to(board, action, current_mark)
                reward = get_reward(new_board, current_mark)
                update_q_table(board, action, reward, new_board)

            # Random player (O) makes a move
                action = get_random_move(board)
                new_board = make_move_to(board, action, current_mark)

            board = new_board
            current_mark = Mark.NAUGHT if current_mark == Mark.CROSS else Mark.CROSS
def choose_action_q_learning(board, training=True) -> int:
    if training and random.uniform(0, 1) < EPSILON:
        # Exploration: choose a random action
        return np.random.choice(possible_moves(board))
        # Exploitation: choose the best action based on current Q-table
        board_key = "".join(board)
        q_values = {
            action: Q_TABLE_DICT.get((board_key, action), 0)
            for action in possible_moves(board)
        return max(q_values, key=q_values.get)

Q-Table Dictionary as json for the first episode:

    "('_________', 0)": 0.0,
    "('XO_______', 2)": 0.0,
    "('XOX____O_', 3)": 0.0,
    "('XOXX___OO', 4)": 0.0,
    "('XOXXXO_OO', 6)": 0.2

