[ALL] Clean up code + change multithreading so MCTS_PARALLEL no longer has to be a multiple of BATCH_SIZE_EVAL

dylandjian · dylandjian · commit f882e0ac2df8 · 2018-05-11T08:31:05.000+02:00
diff --git a/const.py b/const.py
@@ -12,7 +12,7 @@
 ## Number of evaluation parallel games 
 PARALLEL_EVAL = 3
 ## MCTS parallel
-MCTS_PARALLEL = 12
+MCTS_PARALLEL = 4
 
 
 ##### GLOBAL
@@ -28,7 +28,7 @@
 ## Learning rate
 LR = 0.01
 ## Number of MCTS simulation
-MCTS_SIM = 128
+MCTS_SIM = 64
 ## Exploration constant
 C_PUCT = 0.2
 ## L2 Regularization
diff --git a/lib/game.py b/lib/game.py
@@ -106,8 +106,8 @@ def __call__(self):
         moves = 0
         comp = False
         
-        # if self.id % 10 == 0:
-        print("Starting game number %d" % self.id)
+        if self.id % 10 == 0:
+            print("Starting game number %d" % self.id)
 
         while not done:
 
diff --git a/lib/go.py b/lib/go.py
@@ -110,8 +110,11 @@ def _act(self, action, history):
 
 
     def test_move(self, action):
-        """ Test if a specific valid action should be played,
-            depending on the current score """
+        """
+        Test if a specific valid action should be played,
+        depending on the current score. This is used to stop
+        the agent from passing if passing would make it lose
+        """
 
         board_clone = self.board.clone()
         current_score = board_clone.fast_score  + self.komi
@@ -167,7 +170,6 @@ def step(self, action):
 
         # Reward: if nonterminal, then the reward is -1
         if not self.board.is_terminal:
-            self.done = False
             return _format_state(self.history, self.player_color, self.board_size), \
                     -1, False
 
diff --git a/models/mcts.py b/models/mcts.py
@@ -11,8 +11,8 @@
 
 
 @jit
-def _opt_select(nodes, c_puct):
-    """ Optimized version of the selection """
+def _opt_select(nodes, c_puct=C_PUCT):
+    """ Optimized version of the selection based on the PUCT formula """
 
     total_count = 0
     for i in range(nodes.shape[0]):
@@ -38,16 +38,6 @@ def dirichlet_noise(probas):
     return new_probas
 
 
-def _select(nodes, c_puct=C_PUCT):
-    """
-    Select the move that maximises the mean value of the next state +
-    the result of the PUCT function
-    """
-
-    return nodes[_opt_select(np.array([[node.q, node.n, node.p] \
-                    for node in nodes]), c_puct)]
-
-
 class Node:
 
     def __init__(self, parent=None, proba=None, move=None):
@@ -105,19 +95,23 @@ def run(self):
 
             ## Wait for the eval_queue to be filled by new positions to evaluate
             self.condition_search.acquire()
-            while len(self.eval_queue) < BATCH_SIZE_EVAL:
+            while len(self.eval_queue) < BATCH_SIZE_EVAL and \
+                  len(self.eval_queue) != MCTS_PARALLEL - BATCH_SIZE_EVAL or \
+                  len(self.eval_queue) == 0:
                 self.condition_search.wait()
             self.condition_search.release()
 
             self.condition_eval.acquire()
+            keys = list(self.eval_queue.keys())
+
             ## Predict the feature_maps, policy and value
-            states = torch.tensor(np.array(list(self.eval_queue.values()))[0:BATCH_SIZE_EVAL],
-                                 dtype=torch.float, device=DEVICE)
+            states = torch.tensor(np.array(list(self.eval_queue.values()))[0:len(keys)],
+                        dtype=torch.float, device=DEVICE)
             v, probas = self.player.predict(states)
 
             ## Replace the state with the result in the eval_queue
             ## and notify all the threads that the result are available
-            for idx, i in zip(list(self.eval_queue.keys()), range(BATCH_SIZE_EVAL)):
+            for idx, i in zip(keys, range(len(keys))):
                 del self.eval_queue[idx]
                 self.result_queue[idx] = (probas[i].cpu().data.numpy(), v[i])
 
@@ -150,7 +144,10 @@ def run(self):
 
         ## Traverse the tree until leaf
         while not current_node.is_leaf() and not done:
-            current_node = _select(current_node.childrens)
+            ## Select the child node that maximizes the PUCT formula
+            current_node = current_node.childrens[_opt_select( \
+                    np.array([[node.q, node.n, node.p] \
+                    for node in current_node.childrens]))]
 
             ## Virtual loss since multithreading
             self.lock.acquire()
@@ -161,7 +158,8 @@ def run(self):
 
         if not done:
 
-            ## Add current leaf state to the evaluation queue
+            ## Add current leaf state with random dihedral transformation
+            ## to the evaluation queue
             self.condition_search.acquire()
             self.eval_queue[self.thread_id] = sample_rotation(state, num=1)
             self.condition_search.notify()
@@ -243,7 +241,6 @@ def search(self, current_game, player, competitive=False):
         Search the best moves through the game tree with
         the policy and value network to update node statistics
         """
-        threads = []
 
         ## Locking for thread synchronization
         condition_eval = threading.Condition()
@@ -256,6 +253,7 @@ def search(self, current_game, player, competitive=False):
         evaluator = EvaluatorThread(player, eval_queue, result_queue, condition_search, condition_eval)
         evaluator.start()
 
+        threads = []
         ## Do exactly the required number of simulation per thread
         for sim in range(MCTS_SIM // MCTS_PARALLEL):
             for idx in range(MCTS_PARALLEL):
diff --git a/stats.py b/stats.py
@@ -67,17 +67,20 @@ def stats_report():
     old_values = do_sims(player, old_values, mcts_parallel=6, mcts_sim=64, batch_size_eval=2)
     old_values = do_sims(player, old_values, mcts_parallel=8, mcts_sim=64, batch_size_eval=2)
     old_values = do_sims(player, old_values, mcts_parallel=12, mcts_sim=64, batch_size_eval=2)
+    old_values = do_sims(player, old_values, mcts_parallel=4, mcts_sim=64, batch_size_eval=4)
+    old_values = do_sims(player, old_values, mcts_parallel=6, mcts_sim=64, batch_size_eval=2)
     old_values = do_sims(player, old_values, mcts_parallel=8, mcts_sim=64, batch_size_eval=4)
     old_values = do_sims(player, old_values, mcts_parallel=12, mcts_sim=64, batch_size_eval=4)
     old_values = do_sims(player, old_values, mcts_parallel=12, mcts_sim=64, batch_size_eval=6)
     
 
     ## 128 simulations
-    old_values = do_sims(player, old_values, mcts_parallel=4, mcts_sim=128, batch_size_eval=2)
+    old_values = do_sims(player, old_values, mcts_parallel=2, mcts_sim=128, batch_size_eval=2)
     old_values = do_sims(player, old_values, mcts_parallel=4, mcts_sim=128, batch_size_eval=2)
     old_values = do_sims(player, old_values, mcts_parallel=6, mcts_sim=128, batch_size_eval=2)
     old_values = do_sims(player, old_values, mcts_parallel=8, mcts_sim=128, batch_size_eval=4)
     old_values = do_sims(player, old_values, mcts_parallel=12, mcts_sim=128, batch_size_eval=2)
+    old_values = do_sims(player, old_values, mcts_parallel=4, mcts_sim=128, batch_size_eval=4)
     old_values = do_sims(player, old_values, mcts_parallel=8, mcts_sim=128, batch_size_eval=4)
     old_values = do_sims(player, old_values, mcts_parallel=12, mcts_sim=128, batch_size_eval=4)
     old_values = do_sims(player, old_values, mcts_parallel=12, mcts_sim=128, batch_size_eval=6)