burlap.behavior.singleagent.learning.tdmethods.QLearning Java Examples

The following examples show how to use burlap.behavior.singleagent.learning.tdmethods.QLearning. You can go to the original project or source file by following the links above each example.
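Before diving into the examples, here is a minimal, self-contained sketch of the usage pattern they all share: build a domain, construct a QLearning agent with a discount factor, a HashableStateFactory, an initial Q-value, and a learning rate, then run learning episodes against a SimulatedEnvironment. The four-rooms grid world, the GridWorldTerminalFunction goal at (10, 10), and the 1000-step episode cap are illustrative choices for this sketch, not details taken from the examples below.

import burlap.behavior.singleagent.Episode;
import burlap.behavior.singleagent.learning.tdmethods.QLearning;
import burlap.domain.singleagent.gridworld.GridWorldDomain;
import burlap.domain.singleagent.gridworld.GridWorldTerminalFunction;
import burlap.domain.singleagent.gridworld.state.GridAgent;
import burlap.domain.singleagent.gridworld.state.GridWorldState;
import burlap.mdp.core.state.State;
import burlap.mdp.singleagent.environment.SimulatedEnvironment;
import burlap.mdp.singleagent.oo.OOSADomain;
import burlap.statehashing.simple.SimpleHashableStateFactory;

public class QLearningSketch {

	public static void main(String[] args) {
		//11x11 four-rooms grid world that terminates when the agent reaches (10, 10)
		GridWorldDomain gw = new GridWorldDomain(11, 11);
		gw.setMapToFourRooms();
		gw.setTf(new GridWorldTerminalFunction(10, 10));
		OOSADomain domain = gw.generateDomain();

		//environment that always starts the agent at (0, 0)
		State initialState = new GridWorldState(new GridAgent(0, 0));
		SimulatedEnvironment env = new SimulatedEnvironment(domain, initialState);

		//QLearning(domain, discount factor, hashing factory, initial Q-value, learning rate)
		QLearning agent = new QLearning(domain, 0.99, new SimpleHashableStateFactory(), 0., 1.);

		//run 50 learning episodes, capping each one at 1000 steps
		for (int i = 0; i < 50; i++) {
			Episode e = agent.runLearningEpisode(env, 1000);
			System.out.println(i + ": " + e.maxTimeStep());
			env.resetEnvironment();
		}
	}
}

Example #8 below sets up the same four-rooms layout, and Example #6 shows the same learning loop against a domain and environment created elsewhere in its class.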
Example #1
Source File: Main.java    From cs7641-assignment4 with MIT License
/**
 * Runs a learning experiment and shows some cool charts. Apparently, this is only useful for
 * Q-Learning, so I only call this method when Q-Learning is selected and the appropriate flag
 * is enabled.
 */
private static void learningExperimenter(Problem problem, LearningAgent agent, SimulatedEnvironment simulatedEnvironment) {
	LearningAlgorithmExperimenter experimenter = new LearningAlgorithmExperimenter(simulatedEnvironment, 10, problem.getNumberOfIterations(Algorithm.QLearning), new LearningAgentFactory() {

		public String getAgentName() {
			return Algorithm.QLearning.getTitle();
		}

		public LearningAgent generateAgent() {
			return agent;
		}
	});

	/*
	 * Try different PerformanceMetric values below to display different charts.
	 */
	experimenter.setUpPlottingConfiguration(500, 250, 2, 1000, TrialMode.MOST_RECENT_AND_AVERAGE, PerformanceMetric.CUMULATIVE_STEPS_PER_EPISODE, PerformanceMetric.AVERAGE_EPISODE_REWARD);
	experimenter.startExperiment();
}
 
Example #2
Source File: Main.java    From cs7641-assignment4 with MIT License
/**
 * Here is where the magic happens. This is where I loop through the specified number of
 * episodes (iterations) and run the selected algorithm. To keep things nice and clean, I use
 * this method to run all three algorithms. The algorithm-specific details are supplied through
 * the PlannerFactory interface.
 * 
 * This method collects all the information from the algorithm and packs it in an Analysis
 * instance that later gets dumped on the console.
 */
private static void runAlgorithm(Analysis analysis, Problem problem, SADomain domain, HashableStateFactory hashingFactory, State initialState, PlannerFactory plannerFactory, Algorithm algorithm) {
	ConstantStateGenerator constantStateGenerator = new ConstantStateGenerator(initialState);
	SimulatedEnvironment simulatedEnvironment = new SimulatedEnvironment(domain, constantStateGenerator);
	Planner planner = null;
	Policy policy = null;
	for (int episodeIndex = 1; episodeIndex <= problem.getNumberOfIterations(algorithm); episodeIndex++) {
		long startTime = System.nanoTime();
		planner = plannerFactory.createPlanner(episodeIndex, domain, hashingFactory, simulatedEnvironment);
		policy = planner.planFromState(initialState);

		/*
		 * If we haven't converged, following the policy may leave the agent wandering around,
		 * and it might never reach the goal. To avoid this, we need to set a maximum number
		 * of steps to take before terminating the policy rollout. I decided to set this maximum
		 * at the number of grid locations in our map (width * width). This should give the
		 * agent plenty of room to wander around.
		 * 
		 * The smaller this number is, the faster the algorithm will run.
		 */
		int maxNumberOfSteps = problem.getWidth() * problem.getWidth();

		Episode episode = PolicyUtils.rollout(policy, initialState, domain.getModel(), maxNumberOfSteps);
		analysis.add(episodeIndex, episode.rewardSequence, episode.numTimeSteps(), (long) (System.nanoTime() - startTime) / 1000000);
	}

	if (algorithm == Algorithm.QLearning && USE_LEARNING_EXPERIMENTER) {
		learningExperimenter(problem, (LearningAgent) planner, simulatedEnvironment);
	}

	if (SHOW_VISUALIZATION && planner != null && policy != null) {
		visualize(problem, (ValueFunction) planner, policy, initialState, domain, hashingFactory, algorithm.getTitle());
	}
}
 
Example #3
Source File: Main.java    From cs7641-assignment4 with MIT License
private static Problem createProblem2() {
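	/*
	 * Map legend (see createProblem1 in Example #4 for details): X is the starting point,
	 * 0 is a safe cell, 1 is a wall, G is the goal, and S, M, and L are small, medium, and
	 * large hazards.
	 */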
	String[] map = new String[] {
			"111111111111111111111",
			"X00010001000100000101",
			"101110101L1010S110101",
			"100010101000100010101",
			"11101010101111S110101",
			"100010100000100000001",
			"1011101S1010101110101",
			"100010101010001000101",
			"101010101011111010111",
			"101000001000100010001",
			"1110101M111010M110101",
			"100010100010100000101",
			"101110101010101111S01",
			"100010001010001010001",
			"111011101010111010111",
			"101010001010001000101",
			"10101011101L001011101",
			"1000001S0000101010001",
			"101011110110101010101",
			"10100000001000100010G",
			"111111111111111111111",
	};

	// Number of iterations (episodes) to run for each algorithm.
	HashMap<Algorithm, Integer> numIterationsHashMap = new HashMap<Algorithm, Integer>();
	numIterationsHashMap.put(Algorithm.ValueIteration, 100);
	numIterationsHashMap.put(Algorithm.PolicyIteration, 20);
	numIterationsHashMap.put(Algorithm.QLearning, 1000);
	
	// Rewards (penalties) for each hazard type.
	HashMap<HazardType, Double> hazardRewardsHashMap = new HashMap<HazardType, Double>();
	hazardRewardsHashMap.put(HazardType.SMALL, -1.0);
	hazardRewardsHashMap.put(HazardType.MEDIUM, -2.0);
	hazardRewardsHashMap.put(HazardType.LARGE, -3.0);

	return new Problem(map, numIterationsHashMap, -0.1, 10, hazardRewardsHashMap);
}
 
Example #4
Source File: Main.java    From cs7641-assignment4 with MIT License
private static Problem createProblem1() {
	/*
	 * The surface can be described as follows:
	 * 
	 * X — The starting point of the agent.
	 * 0 — Represents a safe cell where the agent can move.
	 * 1 — Represents a wall. The agent can't move to this cell.
	 * G — Represents the goal that the agent wants to achieve.
	 * S — Represents a small hazard. The agent will be penalized.
	 * M — Represents a medium hazard. The agent will be penalized.
	 * L — Represents a large hazard. The agent will be penalized.
	 */
	String[] map = new String[] {
			"X0011110",
			"01000S10",
			"010M110S",
			"0M0000M1",
			"01111010",
			"00L010S0",
			"0S001000",
			"000000SG",
	};

	/*
	 * Make sure to specify the specific number of iterations for each algorithm. If you don't
	 * do this, I'm still nice and use 100 as the default value, but that wouldn't make sense
	 * all the time.
	 */
	HashMap<Algorithm, Integer> numIterationsHashMap = new HashMap<Algorithm, Integer>();
	numIterationsHashMap.put(Algorithm.ValueIteration, 50);
	numIterationsHashMap.put(Algorithm.PolicyIteration, 10);
	numIterationsHashMap.put(Algorithm.QLearning, 500);

	/*
	 * These are the specific rewards for each one of the hazards. Here you can be creative and
	 * play with different values as you see fit.
	 */
	HashMap<HazardType, Double> hazardRewardsHashMap = new HashMap<HazardType, Double>();
	hazardRewardsHashMap.put(HazardType.SMALL, -1.0);
	hazardRewardsHashMap.put(HazardType.MEDIUM, -2.0);
	hazardRewardsHashMap.put(HazardType.LARGE, -3.0);

	/*
	 * Notice how I specify below the default reward for cells with nothing on them (we
	 * want regular cells to have a small penalty that encourages our agent to find the goal),
	 * and the reward for the cell representing the goal (something nice and large so the agent
	 * is happy).
	 */
	return new Problem(map, numIterationsHashMap, -0.1, 10, hazardRewardsHashMap);
}
 
Example #5
Source File: GridGameExample.java    From burlap_examples with MIT License
public static void saInterface(){

		GridGame gridGame = new GridGame();
		final OOSGDomain domain = gridGame.generateDomain();

		final HashableStateFactory hashingFactory = new SimpleHashableStateFactory();

		final State s = GridGame.getSimpleGameInitialState();
		JointRewardFunction rf = new GridGame.GGJointRewardFunction(domain, -1, 100, false);
		TerminalFunction tf = new GridGame.GGTerminalFunction(domain);
		SGAgentType at = GridGame.getStandardGridGameAgentType(domain);

		World w = new World(domain, rf, tf, s);

		//single agent Q-learning algorithms which will operate in our stochastic game
		//don't need to specify the domain, because the single agent interface will provide it
		QLearning ql1 = new QLearning(null, 0.99, new SimpleHashableStateFactory(), 0, 0.1);
		QLearning ql2 = new QLearning(null, 0.99, new SimpleHashableStateFactory(), 0, 0.1);

		//create a single-agent interface for each of our learning algorithm instances
		LearningAgentToSGAgentInterface a1 = new LearningAgentToSGAgentInterface(domain, ql1, "agent0", at);
		LearningAgentToSGAgentInterface a2 = new LearningAgentToSGAgentInterface(domain, ql2, "agent1", at);

		w.join(a1);
		w.join(a2);

		//don't have the world print out debug info (comment out if you want to see it!)
		DPrint.toggleCode(w.getDebugId(), false);

		System.out.println("Starting training");
		int ngames = 1000;
		List<GameEpisode> gas = new ArrayList<GameEpisode>(ngames);
		for(int i = 0; i < ngames; i++){
			GameEpisode ga = w.runGame();
			gas.add(ga);
			if(i % 10 == 0){
				System.out.println("Game: " + i + ": " + ga.maxTimeStep());
			}
		}

		System.out.println("Finished training");


		Visualizer v = GGVisualizer.getVisualizer(9, 9);
		new GameSequenceVisualizer(v, domain, gas);


	}
 
Example #6
Source File: BasicBehavior.java    From burlap_examples with MIT License
public void qLearningExample(String outputPath){

		//create a Q-learning agent: discount factor 0.99, initial Q-values of 0, learning rate of 1.0
		LearningAgent agent = new QLearning(domain, 0.99, hashingFactory, 0., 1.);

		//run learning for 50 episodes
		for(int i = 0; i < 50; i++){
			Episode e = agent.runLearningEpisode(env);

			e.write(outputPath + "ql_" + i);
			System.out.println(i + ": " + e.maxTimeStep());

			//reset environment for next learning episode
			env.resetEnvironment();
		}

		simpleValueFunctionVis((ValueFunction)agent, new GreedyQPolicy((QProvider) agent));

	}
 
Example #7
Source File: BasicBehavior.java    From burlap_examples with MIT License
public void experimentAndPlotter(){

		//different reward function for more structured performance plots
		((FactoredModel)domain.getModel()).setRf(new GoalBasedRF(this.goalCondition, 5.0, -0.1));

		/**
		 * Create factories for Q-learning agent and SARSA agent to compare
		 */
		LearningAgentFactory qLearningFactory = new LearningAgentFactory() {

			public String getAgentName() {
				return "Q-Learning";
			}


			public LearningAgent generateAgent() {
				return new QLearning(domain, 0.99, hashingFactory, 0.3, 0.1);
			}
		};

		LearningAgentFactory sarsaLearningFactory = new LearningAgentFactory() {

			public String getAgentName() {
				return "SARSA";
			}


			public LearningAgent generateAgent() {
				return new SarsaLam(domain, 0.99, hashingFactory, 0.0, 0.1, 1.);
			}
		};

		LearningAlgorithmExperimenter exp = new LearningAlgorithmExperimenter(env, 10, 100, qLearningFactory, sarsaLearningFactory);
		exp.setUpPlottingConfiguration(500, 250, 2, 1000,
				TrialMode.MOST_RECENT_AND_AVERAGE,
				PerformanceMetric.CUMULATIVE_STEPS_PER_EPISODE,
				PerformanceMetric.AVERAGE_EPISODE_REWARD);

		exp.startExperiment();
		exp.writeStepAndEpisodeDataToCSV("expData");

	}
 
Example #8
Source File: PlotTest.java    From burlap_examples with MIT License
public static void main(String [] args){

		GridWorldDomain gw = new GridWorldDomain(11,11); //11x11 grid world
		gw.setMapToFourRooms(); //four rooms layout
		gw.setProbSucceedTransitionDynamics(0.8); //stochastic transitions with 0.8 success rate

		//ends when the agent reaches a location
		final TerminalFunction tf = new SinglePFTF(
				PropositionalFunction.findPF(gw.generatePfs(), GridWorldDomain.PF_AT_LOCATION));

		//reward function definition
		final RewardFunction rf = new GoalBasedRF(new TFGoalCondition(tf), 5., -0.1);

		gw.setTf(tf);
		gw.setRf(rf);


		final OOSADomain domain = gw.generateDomain(); //generate the grid world domain

		//setup initial state
		GridWorldState s = new GridWorldState(new GridAgent(0, 0), new GridLocation(10, 10, "loc0"));



		//initial state generator
		final ConstantStateGenerator sg = new ConstantStateGenerator(s);


		//set up the state hashing system for looking up states
		final SimpleHashableStateFactory hashingFactory = new SimpleHashableStateFactory();


		/**
		 * Create factory for Q-learning agent
		 */
		LearningAgentFactory qLearningFactory = new LearningAgentFactory() {

			public String getAgentName() {
				return "Q-learning";
			}

			public LearningAgent generateAgent() {
				return new QLearning(domain, 0.99, hashingFactory, 0.3, 0.1);
			}
		};

		//define learning environment
		SimulatedEnvironment env = new SimulatedEnvironment(domain, sg);

		//define experiment
		LearningAlgorithmExperimenter exp = new LearningAlgorithmExperimenter(env,
				10, 100, qLearningFactory);

		exp.setUpPlottingConfiguration(500, 250, 2, 1000, TrialMode.MOST_RECENT_AND_AVERAGE,
				PerformanceMetric.CUMULATIVE_STEPS_PER_EPISODE, PerformanceMetric.AVERAGE_EPISODE_REWARD);


		//start experiment
		exp.startExperiment();


	}