burlap.behavior.policy.PolicyUtils Java Examples

The following examples show how to use burlap.behavior.policy.PolicyUtils. They are drawn from the BURLAP library itself and from several projects built on it; the source file and license for each example are noted in its header.
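Before the individual examples, here is a minimal self-contained sketch of the most common call, PolicyUtils.rollout, which runs a policy from a start state against a domain's model and records the resulting Episode. It mirrors the grid-world setup of Example #12 below; the class name is invented for illustration, and the import paths assume the BURLAP 3 package layout used throughout these examples.

import burlap.behavior.policy.Policy;
import burlap.behavior.policy.PolicyUtils;
import burlap.behavior.policy.RandomPolicy;
import burlap.behavior.singleagent.Episode;
import burlap.domain.singleagent.gridworld.GridWorldDomain;
import burlap.domain.singleagent.gridworld.state.GridAgent;
import burlap.domain.singleagent.gridworld.state.GridWorldState;
import burlap.mdp.core.state.State;
import burlap.mdp.singleagent.SADomain;

public class PolicyUtilsRolloutSketch {

	public static void main(String[] args) {
		//build a small grid world and a random policy over its actions
		GridWorldDomain gwd = new GridWorldDomain(11, 11);
		SADomain domain = gwd.generateDomain();
		State s = new GridWorldState(new GridAgent(0, 0));
		Policy p = new RandomPolicy(domain);

		//roll the policy out against the domain model for at most 50 steps;
		//the overloads used in the examples below also accept an Environment in place of a state and model
		Episode e = PolicyUtils.rollout(p, s, domain.getModel(), 50);
		System.out.println("Steps taken: " + (e.numTimeSteps() - 1));
	}
}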
Example #1
Source File: LSPI.java    From burlap with Apache License 2.0
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

	Episode ea = maxSteps != -1 ? PolicyUtils.rollout(this.learningPolicy, env, maxSteps) : PolicyUtils.rollout(this.learningPolicy, env);

	this.updateDatasetWithLearningEpisode(ea);

	if(this.shouldRereunPolicyIteration(ea)){
		this.runPolicyIteration(this.maxNumPlanningIterations, this.maxChange);
		this.numStepsSinceLastLearningPI = 0;
	}
	else{
		this.numStepsSinceLastLearningPI += ea.numTimeSteps()-1;
	}

	if(episodeHistory.size() >= numEpisodesToStore){
		episodeHistory.poll();
	}
	episodeHistory.offer(ea);

	return ea;
}
 
Example #2
Source File: ContinuousDomainTutorial.java    From burlap_examples with MIT License
public static void IPSS(){

		InvertedPendulum ip = new InvertedPendulum();
		ip.physParams.actionNoise = 0.;
		RewardFunction rf = new InvertedPendulum.InvertedPendulumRewardFunction(Math.PI/8.);
		TerminalFunction tf = new InvertedPendulum.InvertedPendulumTerminalFunction(Math.PI/8.);
		ip.setRf(rf);
		ip.setTf(tf);
		SADomain domain = ip.generateDomain();

		State initialState = new InvertedPendulumState();

		SparseSampling ss = new SparseSampling(domain, 1, new SimpleHashableStateFactory(), 10, 1);
		ss.setForgetPreviousPlanResults(true);
		ss.toggleDebugPrinting(false);
		Policy p = new GreedyQPolicy(ss);

		Episode e = PolicyUtils.rollout(p, initialState, domain.getModel(), 500);
		System.out.println("Num steps: " + e.maxTimeStep());
		Visualizer v = CartPoleVisualizer.getCartPoleVisualizer();
		new EpisodeSequenceVisualizer(v, domain, Arrays.asList(e));

	}
 
Example #3
Source File: VITutorial.java    From burlap_examples with MIT License
public static void main(String [] args){

		GridWorldDomain gwd = new GridWorldDomain(11, 11);
		gwd.setTf(new GridWorldTerminalFunction(10, 10));
		gwd.setMapToFourRooms();

		//only go in the intended direction 80% of the time
		gwd.setProbSucceedTransitionDynamics(0.8);

		SADomain domain = gwd.generateDomain();

		//get initial state with agent in 0,0
		State s = new GridWorldState(new GridAgent(0, 0));

		//set up VI with a 0.99 discount factor, a value
		//function initialization that initializes all states to value 0, and which will
		//run for 30 iterations over the state space
		VITutorial vi = new VITutorial(domain, 0.99, new SimpleHashableStateFactory(),
				new ConstantValueFunction(0.0), 30);

		//run planning from our initial state
		Policy p = vi.planFromState(s);

		//evaluate the policy with one rollout and visualize the trajectory
		Episode ea = PolicyUtils.rollout(p, s, domain.getModel());

		Visualizer v = GridWorldVisualizer.getVisualizer(gwd.getMap());
		new EpisodeSequenceVisualizer(v, domain, Arrays.asList(ea));

	}
 
Example #4
Source File: MCVideo.java    From burlap_examples with MIT License
public static void main(String[] args) {

		MountainCar mcGen = new MountainCar();
		SADomain domain = mcGen.generateDomain();

		StateGenerator rStateGen = new MCRandomStateGenerator(mcGen.physParams);
		SARSCollector collector = new SARSCollector.UniformRandomSARSCollector(domain);
		SARSData dataset = collector.collectNInstances(rStateGen, domain.getModel(), 5000, 20, null);

		NormalizedVariableFeatures features = new NormalizedVariableFeatures()
				.variableDomain("x", new VariableDomain(mcGen.physParams.xmin, mcGen.physParams.xmax))
				.variableDomain("v", new VariableDomain(mcGen.physParams.vmin, mcGen.physParams.vmax));
		FourierBasis fb = new FourierBasis(features, 4);

		LSPI lspi = new LSPI(domain, 0.99, new DenseCrossProductFeatures(fb, 3), dataset);
		Policy p = lspi.runPolicyIteration(30, 1e-6);

		Visualizer v = MountainCarVisualizer.getVisualizer(mcGen);
		VisualActionObserver vob = new VisualActionObserver(v);
		vob.initGUI();

		SimulatedEnvironment env = new SimulatedEnvironment(domain,
				new MCState(mcGen.physParams.valleyPos(), 0));
		EnvironmentServer envServ = new EnvironmentServer(env, vob);

		for(int i = 0; i < 100; i++){
			PolicyUtils.rollout(p, envServ);
			envServ.resetEnvironment();
		}

		System.out.println("Finished");

	}
 
Example #5
Source File: TestPlanning.java    From burlap with Apache License 2.0
@Test
public void testAStar() {
	GridWorldState initialState = new GridWorldState(new GridAgent(0, 0), new GridLocation(10, 10, 0, "loc0"));
	
	Heuristic mdistHeuristic = new Heuristic() {
		
		@Override
		public double h(State s) {

			GridAgent agent = ((GridWorldState)s).agent;
			GridLocation location = ((GridWorldState)s).locations.get(0);

			//get agent position
			int ax = agent.x;
			int ay = agent.y;
			
			//get location position
			int lx = location.x;
			int ly = location.y;
			
			//compute Manhattan distance
			double mdist = Math.abs(ax-lx) + Math.abs(ay-ly);
			
			return -mdist;
		}
	};
	
	//provide A* with the goal condition and the Manhattan-distance heuristic; step costs
	//come from the domain's model, which A* uses to keep track of the actual cost
	DeterministicPlanner planner = new AStar(domain, goalCondition,
		hashingFactory, mdistHeuristic);
	planner.planFromState(initialState);
	Policy p = new SDPlannerPolicy(planner);
	
	Episode analysis = PolicyUtils.rollout(p, initialState, domain.getModel());
	this.evaluateEpisode(analysis, true);
}
 
Example #6
Source File: RTDP.java    From burlap with Apache License 2.0
/**
 * Performs Bellman updates only after a rollout is complete and in reverse order
 * @param initialState the initial state from which to plan
 */
protected void batchRTDP(State initialState){
	
	int totalStates = 0;
	
	int consecutiveSmallDeltas = 0;
	for(int i = 0; i < numRollouts; i++){
		
		Episode ea = PolicyUtils.rollout(rollOutPolicy, initialState, model, maxDepth);
		LinkedList <HashableState> orderedStates = new LinkedList<HashableState>();
		for(State s : ea.stateSequence){
			orderedStates.addFirst(this.stateHash(s));
		}
		
		double delta = this.performOrderedBellmanUpdates(orderedStates);
		totalStates += orderedStates.size();
		DPrint.cl(debugCode, "Pass: " + i + "; Num states: " + orderedStates.size() + " (total: " + totalStates + ")");
		
		if(delta < this.maxDelta){
			consecutiveSmallDeltas++;
			if(consecutiveSmallDeltas >= this.minNumRolloutsWithSmallValueChange){
				break;
			}
		}
		else{
			consecutiveSmallDeltas = 0;
		}
	}
	
	
}
 
Example #7
Source File: Main.java    From cs7641-assignment4 with MIT License
/**
 * Here is where the magic happens. In this method I loop through the specified number
 * of episodes (iterations) and run the chosen algorithm. To keep things nice and clean, I use
 * this method to run all three algorithms; the algorithm-specific details are provided through the
 * PlannerFactory interface.
 * 
 * This method collects all the information from the algorithm and packs it into an Analysis
 * instance that later gets dumped to the console.
 */
private static void runAlgorithm(Analysis analysis, Problem problem, SADomain domain, HashableStateFactory hashingFactory, State initialState, PlannerFactory plannerFactory, Algorithm algorithm) {
	ConstantStateGenerator constantStateGenerator = new ConstantStateGenerator(initialState);
	SimulatedEnvironment simulatedEnvironment = new SimulatedEnvironment(domain, constantStateGenerator);
	Planner planner = null;
	Policy policy = null;
	for (int episodeIndex = 1; episodeIndex <= problem.getNumberOfIterations(algorithm); episodeIndex++) {
		long startTime = System.nanoTime();
		planner = plannerFactory.createPlanner(episodeIndex, domain, hashingFactory, simulatedEnvironment);
		policy = planner.planFromState(initialState);

		/*
		 * If we haven't converged, following the policy may leave the agent wandering around
		 * and it might never reach the goal. To avoid this, we need to set the maximum number
		 * of steps to take before terminating the policy rollout. I decided to set this maximum
		 * at the number of grid locations in our map (width * width). This should give the
		 * agent plenty of room to wander around.
		 * 
		 * The smaller this number is, the faster the algorithm will run.
		 */
		int maxNumberOfSteps = problem.getWidth() * problem.getWidth();

		Episode episode = PolicyUtils.rollout(policy, initialState, domain.getModel(), maxNumberOfSteps);
		analysis.add(episodeIndex, episode.rewardSequence, episode.numTimeSteps(), (long) (System.nanoTime() - startTime) / 1000000);
	}

	if (algorithm == Algorithm.QLearning && USE_LEARNING_EXPERIMENTER) {
		learningExperimenter(problem, (LearningAgent) planner, simulatedEnvironment);
	}

	if (SHOW_VISUALIZATION && planner != null && policy != null) {
		visualize(problem, (ValueFunction) planner, policy, initialState, domain, hashingFactory, algorithm.getTitle());
	}
}
 
Example #8
Source File: ECorrelatedQJointPolicy.java    From burlap with Apache License 2.0
@Override
public double actionProb(State s, Action a) {
	return PolicyUtils.actionProbFromEnum(this, s, a);
}
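This delegation pattern recurs throughout the listing: policies that can enumerate their action distribution implement actionProb with PolicyUtils.actionProbFromEnum (see also Examples #10, #13, #14, #15, #17, and #19) and action with PolicyUtils.sampleFromActionDistribution (Examples #11, #16, and #18). Below is a hypothetical minimal EnumerablePolicy showing both delegations together; the class name and the two-action distribution are invented for illustration, and the import paths assume the BURLAP 3 package layout.

import java.util.Arrays;
import java.util.List;

import burlap.behavior.policy.EnumerablePolicy;
import burlap.behavior.policy.PolicyUtils;
import burlap.behavior.policy.support.ActionProb;
import burlap.mdp.core.action.Action;
import burlap.mdp.core.action.SimpleAction;
import burlap.mdp.core.state.State;

//a fair coin flip between two named actions: once policyDistribution is defined,
//both action selection and action probabilities can be delegated to PolicyUtils
public class CoinFlipPolicy implements EnumerablePolicy {

	@Override
	public List<ActionProb> policyDistribution(State s) {
		return Arrays.asList(
				new ActionProb(new SimpleAction("left"), 0.5),
				new ActionProb(new SimpleAction("right"), 0.5));
	}

	@Override
	public Action action(State s) {
		//draw an action according to policyDistribution
		return PolicyUtils.sampleFromActionDistribution(this, s);
	}

	@Override
	public double actionProb(State s, Action a) {
		//look up the probability of a in policyDistribution
		return PolicyUtils.actionProbFromEnum(this, s, a);
	}

	@Override
	public boolean definedFor(State s) {
		return true;
	}
}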
 
Example #9
Source File: TestBlockDude.java    From burlap with Apache License 2.0
public void testDude(State s) {
	TerminalFunction tf = new BlockDudeTF();
	StateConditionTest sc = new TFGoalCondition(tf);

	AStar astar = new AStar(domain, sc, new SimpleHashableStateFactory(), new NullHeuristic());
	astar.toggleDebugPrinting(false);
	astar.planFromState(s);

	Policy p = new SDPlannerPolicy(astar);
	Episode ea = PolicyUtils.rollout(p, s, domain.getModel(), 100);

	State lastState = ea.stateSequence.get(ea.stateSequence.size() - 1);
	Assert.assertEquals(true, tf.isTerminal(lastState));
	Assert.assertEquals(true, sc.satisfies(lastState));
	Assert.assertEquals(-94.0, ea.discountedReturn(1.0), 0.001);

	/*
	BlockDude constructor = new BlockDude();
	Domain d = constructor.generateDomain();

	List<Integer> px = new ArrayList<Integer>();
	List <Integer> ph = new ArrayList<Integer>();

	ph.add(15);
	ph.add(3);
	ph.add(3);
	ph.add(3);
	ph.add(0);
	ph.add(0);
	ph.add(0);
	ph.add(1);
	ph.add(2);
	ph.add(0);
	ph.add(2);
	ph.add(3);
	ph.add(2);
	ph.add(2);
	ph.add(3);
	ph.add(3);
	ph.add(15);
	
	State o = BlockDude.getCleanState(d, px, ph, 6);
	o = BlockDude.setAgent(o, 9, 3, 1, 0);
	o = BlockDude.setExit(o, 1, 0);
	
	o = BlockDude.setBlock(o, 0, 5, 1);
	o = BlockDude.setBlock(o, 1, 6, 1);
	o = BlockDude.setBlock(o, 2, 14, 3);
	o = BlockDude.setBlock(o, 3, 16, 4);
	o = BlockDude.setBlock(o, 4, 17, 4);
	o = BlockDude.setBlock(o, 5, 17, 5);
	
	TerminalFunction tf = new SinglePFTF(d.getPropFunction(BlockDude.PFATEXIT));
	StateConditionTest sc = new SinglePFSCT(d.getPropFunction(BlockDude.PFATEXIT));

	RewardFunction rf = new UniformCostRF();

	AStar astar = new AStar(d, rf, sc, new DiscreteStateHashFactory(), new NullHeuristic());
	astar.toggleDebugPrinting(false);
	astar.planFromState(o);

	Policy p = new SDPlannerPolicy(astar);
	EpisodeAnalysis ea = p.evaluateBehavior(o, rf, tf, 100);

	State lastState = ea.stateSequence.get(ea.stateSequence.size() - 1);
	Assert.assertEquals(true, tf.isTerminal(lastState));
	Assert.assertEquals(true, sc.satisfies(lastState));
	Assert.assertEquals(-94.0, ea.getDiscountedReturn(1.0), 0.001);
	*/
}
 
Example #10
Source File: BoltzmannActor.java    From burlap with Apache License 2.0
@Override
public double actionProb(State s, Action a) {
	return PolicyUtils.actionProbFromEnum(this, s, a);
}
 
Example #11
Source File: BoltzmannActor.java    From burlap with Apache License 2.0
@Override
public Action action(State s) {
	return PolicyUtils.sampleFromActionDistribution(this, s);
}
 
Example #12
Source File: Episode.java    From burlap with Apache License 2.0
public static void main(String[] args) {
	GridWorldDomain gwd = new GridWorldDomain(11, 11);
	SADomain domain = gwd.generateDomain();
	State s = new GridWorldState(new GridAgent(1, 3));

	Policy p = new RandomPolicy(domain);
	Episode ea = PolicyUtils.rollout(p, s, domain.getModel(), 30);

	String yamlOut = ea.serialize();

	System.out.println(yamlOut);

	System.out.println("\n\n");

	Episode read = Episode.parseEpisode(yamlOut);

	System.out.println(read.actionString());
	System.out.println(read.state(0).toString());
	System.out.println(read.actionSequence.size());
	System.out.println(read.stateSequence.size());

}
 
Example #13
Source File: ApprenticeshipLearning.java    From burlap with Apache License 2.0
@Override
public double actionProb(State s, Action a) {
	return PolicyUtils.actionProbFromEnum(this, s, a);
}
 
Example #14
Source File: PolicyFromJointPolicy.java    From burlap with Apache License 2.0
@Override
public double actionProb(State s, Action a) {
	return PolicyUtils.actionProbFromEnum(this, s, a);
}
 
Example #15
Source File: EMinMaxPolicy.java    From burlap with Apache License 2.0
@Override
public double actionProb(State s, Action a) {
	return PolicyUtils.actionProbFromEnum(this, s, a);
}
 
Example #16
Source File: EMinMaxPolicy.java    From burlap with Apache License 2.0
@Override
public Action action(State s) {
	return PolicyUtils.sampleFromActionDistribution(this, s);
}
 
Example #17
Source File: EGreedyMaxWellfare.java    From burlap with Apache License 2.0
@Override
public double actionProb(State s, Action a) {
	return PolicyUtils.actionProbFromEnum(this, s, a);
}
 
Example #18
Source File: ECorrelatedQJointPolicy.java    From burlap with Apache License 2.0
@Override
public Action action(State s) {
	return PolicyUtils.sampleFromActionDistribution(this, s);
}
 
Example #19
Source File: EGreedyJointPolicy.java    From burlap with Apache License 2.0
@Override
public double actionProb(State s, Action a) {
	return PolicyUtils.actionProbFromEnum(this, s, a);
}
 
Example #20
Source File: ContinuousDomainTutorial.java    From burlap_examples with MIT License
public static void MCLSPIRBF(){

		MountainCar mcGen = new MountainCar();
		SADomain domain = mcGen.generateDomain();
		MCState s = new MCState(mcGen.physParams.valleyPos(), 0.);

		NormalizedVariableFeatures inputFeatures = new NormalizedVariableFeatures()
				.variableDomain("x", new VariableDomain(mcGen.physParams.xmin, mcGen.physParams.xmax))
				.variableDomain("v", new VariableDomain(mcGen.physParams.vmin, mcGen.physParams.vmax));

		StateGenerator rStateGen = new MCRandomStateGenerator(mcGen.physParams);
		SARSCollector collector = new SARSCollector.UniformRandomSARSCollector(domain);
		SARSData dataset = collector.collectNInstances(rStateGen, domain.getModel(), 5000, 20, null);

		RBFFeatures rbf = new RBFFeatures(inputFeatures, true);
		FlatStateGridder gridder = new FlatStateGridder()
				.gridDimension("x", mcGen.physParams.xmin, mcGen.physParams.xmax, 5)
				.gridDimension("v", mcGen.physParams.vmin, mcGen.physParams.vmax, 5);

		List<State> griddedStates = gridder.gridState(s);
		DistanceMetric metric = new EuclideanDistance();
		for(State g : griddedStates){
			rbf.addRBF(new GaussianRBF(inputFeatures.features(g), metric, 0.2));
		}

		LSPI lspi = new LSPI(domain, 0.99, new DenseCrossProductFeatures(rbf, 3), dataset);
		Policy p = lspi.runPolicyIteration(30, 1e-6);

		Visualizer v = MountainCarVisualizer.getVisualizer(mcGen);
		VisualActionObserver vob = new VisualActionObserver(v);
		vob.initGUI();


		SimulatedEnvironment env = new SimulatedEnvironment(domain, s);
		env.addObservers(vob);

		for(int i = 0; i < 5; i++){
			PolicyUtils.rollout(p, env);
			env.resetEnvironment();
		}

		System.out.println("Finished");


	}
 
Example #21
Source File: BasicBehavior.java    From burlap_examples with MIT License
public void AStarExample(String outputPath){

		Heuristic mdistHeuristic = new Heuristic() {

			public double h(State s) {
				GridAgent a = ((GridWorldState)s).agent;
				double mdist = Math.abs(a.x-10) + Math.abs(a.y-10);

				return -mdist;
			}
		};

		DeterministicPlanner planner = new AStar(domain, goalCondition, hashingFactory, mdistHeuristic);
		Policy p = planner.planFromState(initialState);

		PolicyUtils.rollout(p, initialState, domain.getModel()).write(outputPath + "astar");

	}
 
Example #22
Source File: SimpleTester.java    From burlap_caffe with Apache License 2.0
@Override
public Episode runTestEpisode(Environment env, int maxSteps) {
    return PolicyUtils.rollout(policy, env, maxSteps);
}
 
Example #23
Source File: MinecraftSolver.java    From burlapcraft with GNU Lesser General Public License v3.0
public static void stocasticPlan(double gamma){

		MinecraftDomainGenerator simdg = new MinecraftDomainGenerator();
		
		SADomain domain = simdg.generateDomain();

		State initialState = MinecraftStateGeneratorHelper.getCurrentState(BurlapCraft.currentDungeon);
		
		Planner planner = new ValueIteration(domain, gamma, new SimpleHashableStateFactory(false), 0.001, 1000);
		
		Policy p = planner.planFromState(initialState);
		
		MinecraftEnvironment me = new MinecraftEnvironment();
		PolicyUtils.rollout(p, me);
	}
 
Example #24
Source File: ContinuousDomainTutorial.java    From burlap_examples with MIT License
public static void MCLSPIFB(){

		MountainCar mcGen = new MountainCar();
		SADomain domain = mcGen.generateDomain();

		StateGenerator rStateGen = new MCRandomStateGenerator(mcGen.physParams);
		SARSCollector collector = new SARSCollector.UniformRandomSARSCollector(domain);
		SARSData dataset = collector.collectNInstances(rStateGen, domain.getModel(), 5000, 20, null);

		NormalizedVariableFeatures inputFeatures = new NormalizedVariableFeatures()
				.variableDomain("x", new VariableDomain(mcGen.physParams.xmin, mcGen.physParams.xmax))
				.variableDomain("v", new VariableDomain(mcGen.physParams.vmin, mcGen.physParams.vmax));

		FourierBasis fb = new FourierBasis(inputFeatures, 4);

		LSPI lspi = new LSPI(domain, 0.99, new DenseCrossProductFeatures(fb, 3), dataset);
		Policy p = lspi.runPolicyIteration(30, 1e-6);

		Visualizer v = MountainCarVisualizer.getVisualizer(mcGen);
		VisualActionObserver vob = new VisualActionObserver(v);
		vob.initGUI();

		SimulatedEnvironment env = new SimulatedEnvironment(domain, new MCState(mcGen.physParams.valleyPos(), 0.));
		env.addObservers(vob);

		for(int i = 0; i < 5; i++){
			PolicyUtils.rollout(p, env);
			env.resetEnvironment();
		}

		System.out.println("Finished");


	}
 
Example #25
Source File: BasicBehavior.java    From burlap_examples with MIT License
public void valueIterationExample(String outputPath){

		Planner planner = new ValueIteration(domain, 0.99, hashingFactory, 0.001, 100);
		Policy p = planner.planFromState(initialState);

		PolicyUtils.rollout(p, initialState, domain.getModel()).write(outputPath + "vi");

		simpleValueFunctionVis((ValueFunction)planner, p);
		//manualValueFunctionVis((ValueFunction)planner, p);

	}
 
Example #26
Source File: BasicBehavior.java    From burlap_examples with MIT License
public void DFSExample(String outputPath){

		DeterministicPlanner planner = new DFS(domain, goalCondition, hashingFactory);
		Policy p = planner.planFromState(initialState);
		PolicyUtils.rollout(p, initialState, domain.getModel()).write(outputPath + "dfs");

	}
 
Example #27
Source File: DynamicProgramming.java    From burlap with Apache License 2.0
/**
 * Performs a fixed-policy Bellman value function update (i.e., policy evaluation) on the provided state. Results are stored in the value function map as well as returned.
 * @param sh the hashed state on which to perform the Bellman update.
 * @param p the policy that is being evaluated
 * @return the new value of the state
 */
protected double performFixedPolicyBellmanUpdateOn(HashableState sh, EnumerablePolicy p){
	
	
	if(this.model.terminal(sh.s())){
		//terminal states always have a state value of 0
		valueFunction.put(sh, 0.);
		return 0.;
	}
	
	double weightedQ = 0.;
	List<ActionProb> policyDistribution = p.policyDistribution(sh.s());
	

	//List <GroundedAction> gas = sh.s.getAllGroundedActionsFor(this.actions);
	List<Action> gas = this.applicableActions(sh.s());
	for(Action ga : gas){

		double policyProb = PolicyUtils.actionProbGivenDistribution(ga, policyDistribution);
		if(policyProb == 0.){
			continue; //doesn't contribute
		}

		double q = this.computeQ(sh.s(), ga);
		weightedQ += policyProb*q;
	}

	valueFunction.put(sh, weightedQ);
	
	return weightedQ;
	
}
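The weighted sum computed above is standard policy evaluation, V(s) = sum over actions a of pi(a|s) * Q(s,a); PolicyUtils.actionProbGivenDistribution only performs the pi(a|s) lookup in the enumerated distribution. A hypothetical standalone sketch of that lookup follows; the class name, action names, and probabilities are invented, and the import paths assume the BURLAP 3 package layout.

import java.util.Arrays;
import java.util.List;

import burlap.behavior.policy.PolicyUtils;
import burlap.behavior.policy.support.ActionProb;
import burlap.mdp.core.action.SimpleAction;

public class ActionProbLookupSketch {

	public static void main(String[] args) {
		//a made-up two-action distribution; actionProbGivenDistribution simply returns
		//the probability stored for the matching action
		List<ActionProb> dist = Arrays.asList(
				new ActionProb(new SimpleAction("north"), 0.8),
				new ActionProb(new SimpleAction("south"), 0.2));

		double pNorth = PolicyUtils.actionProbGivenDistribution(new SimpleAction("north"), dist);
		System.out.println(pNorth); //prints 0.8
	}
}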
 
Example #28
Source File: BasicBehavior.java    From burlap_examples with MIT License
public void BFSExample(String outputPath){

		DeterministicPlanner planner = new BFS(domain, goalCondition, hashingFactory);
		Policy p = planner.planFromState(initialState);
		PolicyUtils.rollout(p, initialState, domain.getModel()).write(outputPath + "bfs");

	}