Java Code Examples for burlap.mdp.singleagent.environment.Environment#isInTerminalState()

The following examples show how to use burlap.mdp.singleagent.environment.Environment#isInTerminalState(). Each example is taken from the BURLAP library or a project built on it; the source file, originating project, and license are noted above each example.
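Across all of the examples below, isInTerminalState() serves the same purpose: it guards the act-observe loop so the agent stops interacting with the Environment once a terminal state is reached. The following is a minimal sketch of that shared pattern, assuming you already have a Policy and an Environment configured; it is an illustration, not code from the BURLAP project itself.

import burlap.behavior.policy.Policy;
import burlap.behavior.singleagent.Episode;
import burlap.mdp.core.action.Action;
import burlap.mdp.singleagent.environment.Environment;
import burlap.mdp.singleagent.environment.EnvironmentOutcome;

public class TerminalLoopSketch {

	//steps the environment with the given policy until isInTerminalState()
	//reports true, recording each transition in an Episode
	public static Episode actUntilTerminal(Policy p, Environment env){
		Episode e = new Episode(env.currentObservation());
		while(!env.isInTerminalState()){
			Action a = p.action(env.currentObservation());
			EnvironmentOutcome eo = env.executeAction(a);
			e.transition(eo);
		}
		return e;
	}
}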
Example 1
Source File: SARSCollector.java    From burlap with Apache License 2.0
/**
 * Collects nSamples of SARS tuples from an {@link burlap.mdp.singleagent.environment.Environment} and returns them in a {@link burlap.behavior.singleagent.learning.lspi.SARSData} object.
 * Each sequence of samples is no longer than maxEpisodeSteps and samples are collected using this object's {@link #collectDataFrom(burlap.mdp.singleagent.environment.Environment, int, SARSData)}
 * method. After each call to {@link #collectDataFrom(burlap.mdp.singleagent.environment.Environment, int, SARSData)}, the provided {@link burlap.mdp.singleagent.environment.Environment}
 * is sent the {@link burlap.mdp.singleagent.environment.Environment#resetEnvironment()} message.
 * @param env The {@link burlap.mdp.singleagent.environment.Environment} from which samples should be collected.
 * @param nSamples The number of samples to generate.
 * @param maxEpisodeSteps the maximum number of steps to take from any initial state of the {@link burlap.mdp.singleagent.environment.Environment}.
 * @param intoDataset the dataset into which the results will be collected. If null, a new dataset is created.
 * @return the intoDataset object, which is created if it is input as null.
 */
public SARSData collectNInstances(Environment env, int nSamples, int maxEpisodeSteps, SARSData intoDataset){

	if(intoDataset == null){
		intoDataset = new SARSData(nSamples);
	}

	while(nSamples > 0 && !env.isInTerminalState()){
		int maxSteps = Math.min(nSamples, maxEpisodeSteps);
		int oldSize = intoDataset.size();
		this.collectDataFrom(env, maxSteps, intoDataset);
		int delta = intoDataset.size() - oldSize;
		nSamples -= delta;
		env.resetEnvironment();
	}

	return intoDataset;

}
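
As a hedged usage sketch, the call below collects roughly 5000 samples in episodes of at most 20 steps, passing null so that collectNInstances allocates a new SARSData; the collector and environment instances are assumed to be set up elsewhere, and the sample counts are illustrative only.

import burlap.behavior.singleagent.learning.lspi.SARSCollector;
import burlap.behavior.singleagent.learning.lspi.SARSData;
import burlap.mdp.singleagent.environment.Environment;

public class CollectNInstancesSketch {

	//gathers about 5000 SARS tuples in episodes of at most 20 steps;
	//passing null lets collectNInstances create the dataset object
	public static SARSData gather(SARSCollector collector, Environment env){
		return collector.collectNInstances(env, 5000, 20, null);
	}
}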
 
Example 2
Source File: SARSCollector.java    From burlap with Apache License 2.0
@Override
public SARSData collectDataFrom(Environment env, int maxSteps, SARSData intoDataset) {

	if(intoDataset == null){
		intoDataset = new SARSData();
	}

	int nsteps = 0;
	while(!env.isInTerminalState() && nsteps < maxSteps){
		List<Action> gas = ActionUtils.allApplicableActionsForTypes(this.actionTypes, env.currentObservation());
		Action ga = gas.get(RandomFactory.getMapped(0).nextInt(gas.size()));
		EnvironmentOutcome eo = env.executeAction(ga);
		intoDataset.add(eo.o, eo.a, eo.r, eo.op);

		nsteps++;
	}

	return intoDataset;
}
 
Example 3
Source File: QLTutorial.java    From burlap_examples with MIT License
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {
	//initialize our episode object with the initial state of the environment
	Episode e = new Episode(env.currentObservation());

	//behave until a terminal state or max steps is reached
	State curState = env.currentObservation();
	int steps = 0;
	while(!env.isInTerminalState() && (steps < maxSteps || maxSteps == -1)){

		//select an action
		Action a = this.learningPolicy.action(curState);

		//take the action and observe outcome
		EnvironmentOutcome eo = env.executeAction(a);

		//record result
		e.transition(eo);

		//get the max Q value of the resulting state if it's not terminal, 0 otherwise
		double maxQ = eo.terminated ? 0. : this.value(eo.op);

		//update the old Q-value
		QValue oldQ = this.storedQ(curState, a);
		oldQ.q = oldQ.q + this.learningRate * (eo.r + this.gamma * maxQ - oldQ.q);


		//update state pointer to next environment state observed
		curState = eo.op;
		steps++;

	}

	return e;
}
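
In practice this method is invoked repeatedly, resetting the environment after each episode terminates. A minimal sketch of that outer loop, assuming a LearningAgent implementation such as the QLTutorial agent above and an already configured Environment:

import burlap.behavior.singleagent.Episode;
import burlap.behavior.singleagent.learning.LearningAgent;
import burlap.mdp.singleagent.environment.Environment;

public class LearningLoopSketch {

	//runs several learning episodes back to back; passing -1 removes the step
	//cap, and resetEnvironment() returns the environment to an initial state
	//once isInTerminalState() has ended an episode
	public static void run(LearningAgent agent, Environment env, int numEpisodes){
		for(int i = 0; i < numEpisodes; i++){
			Episode e = agent.runLearningEpisode(env, -1);
			System.out.println("episode " + i + ": " + e.numTimeSteps() + " time steps");
			env.resetEnvironment();
		}
	}
}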
 
Example 4
Source File: PolicyUtils.java    From burlap with Apache License 2.0
/**
 * Follows the policy in the given {@link burlap.mdp.singleagent.environment.Environment}. The policy will stop being followed once a terminal state
 * in the environment is reached.
 * @param p the {@link Policy}
 * @param env The {@link burlap.mdp.singleagent.environment.Environment} in which this policy is to be evaluated.
 * @return An {@link Episode} object specifying the interaction with the environment.
 */
public static Episode rollout(Policy p, Environment env){

	Episode ea = new Episode(env.currentObservation());

	do{
		followAndRecordPolicy(p, env, ea);
	}while(!env.isInTerminalState());

	return ea;
}
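
A hedged usage sketch: because rollout only returns once isInTerminalState() is true, it is typically paired with resetEnvironment() when a policy is evaluated more than once. The policy and environment here are assumed to come from your own setup.

import burlap.behavior.policy.Policy;
import burlap.behavior.policy.PolicyUtils;
import burlap.behavior.singleagent.Episode;
import burlap.mdp.singleagent.environment.Environment;

public class RolloutSketch {

	//rolls the policy out to a terminal state, then resets the environment
	//so a subsequent rollout starts from a fresh initial state
	public static Episode evaluateOnce(Policy p, Environment env){
		Episode e = PolicyUtils.rollout(p, env);
		env.resetEnvironment();
		return e;
	}
}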
 
Example 5
Source File: PolicyUtils.java    From burlap with Apache License 2.0
/**
 * Follows the policy in the given {@link burlap.mdp.singleagent.environment.Environment}. The policy will stop being followed once a terminal state
 * in the environment is reached or when the provided number of steps has been taken.
 * @param p the {@link Policy}
 * @param env The {@link burlap.mdp.singleagent.environment.Environment} in which this policy is to be evaluated.
 * @param numSteps the maximum number of steps to take in the environment.
 * @return An {@link Episode} object specifying the interaction with the environment.
 */
public static Episode rollout(Policy p, Environment env, int numSteps){

	Episode ea = new Episode(env.currentObservation());

	int nSteps;
	do{
		followAndRecordPolicy(p, env, ea);
		nSteps = ea.numTimeSteps();
	}while(!env.isInTerminalState() && nSteps < numSteps);

	return ea;
}
 
Example 6
Source File: Option.java    From burlap with Apache License 2.0
public static EnvironmentOptionOutcome control(Option o, Environment env, double discount){
	Random rand = RandomFactory.getMapped(0);
	State initial = env.currentObservation();
	State cur = initial;

	Episode episode = new Episode(cur);
	Episode history = new Episode(cur);
	double roll;
	double pT;
	int nsteps = 0;
	double r = 0.;
	double cd = 1.;
	do{
		Action a = o.policy(cur, history);
		EnvironmentOutcome eo = env.executeAction(a);
		nsteps++;
		r += cd*eo.r;
		cur = eo.op;
		cd *= discount;


		history.transition(a, eo.op, eo.r);

		AnnotatedAction annotatedAction = new AnnotatedAction(a, o.toString() + "(" + nsteps + ")");
		episode.transition(annotatedAction, eo.op, r);


		pT = o.probabilityOfTermination(eo.op, history);
		roll = rand.nextDouble();

	}while(roll > pT && !env.isInTerminalState());

	EnvironmentOptionOutcome eoo = new EnvironmentOptionOutcome(initial, o, cur, r, env.isInTerminalState(), discount, episode);

	return eoo;

}
 
Example 7
Source File: ApproximateQLearning.java    From burlap with Apache License 2.0
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

	State initialState = env.currentObservation();
	Episode e = new Episode(initialState);


	int eStepCounter = 0;
	while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

		//check state
		State curState = stateMapping.mapState(env.currentObservation());

		//select action
		Action a = this.learningPolicy.action(curState);

		//take action
		EnvironmentOutcome eo = env.executeAction(a);

		//save outcome in memory
		this.memory.addExperience(eo);

		//record transition and manage option case
		int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
		eStepCounter += stepInc;
		this.totalSteps += stepInc;
		e.transition(a, eo.op, eo.r);

		//perform learning on sampled experiences
		List<EnvironmentOutcome> samples = this.memory.sampleExperiences(this.numReplay);
		this.updateQFunction(samples);

		//update stale function
		this.stepsSinceStale++;
		if(this.stepsSinceStale >= this.staleDuration){
			this.updateStaleFunction();
		}

	}

	this.totalEpisodes++;
	return e;
}
 
Example 8
Source File: DeepQTester.java    From burlap_caffe with Apache License 2.0
@Override
public Episode runTestEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();
    Episode e = new Episode(initialState);


    int eStepCounter = 0;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        //check state
        State curState = stateMapping.mapState(env.currentObservation());

        //select action
        Action a = this.policy.action(curState);

        //take action
        EnvironmentOutcome eo = env.executeAction(a);

        //save outcome in memory
        this.memory.addExperience(eo);

        //record transition and manage option case
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;
        e.transition(a, eo.op, eo.r);

    }

    return e;
}
 
Example 9
Source File: ActorCritic.java    From burlap with Apache License 2.0
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {


	State initialState = env.currentObservation();
	Episode ea = new Episode(initialState);
	State curState = initialState;

	this.critic.startEpisode(curState);
	this.actor.startEpisode(curState);

	int timeSteps = 0;
	while(!env.isInTerminalState() && (timeSteps < maxSteps || maxSteps == -1)){

		Action ga = this.actor.action(curState);
		EnvironmentOutcome eo = env.executeAction(ga);

		ea.transition(eo);

		double critique = this.critic.critique(eo);
		this.actor.update(eo, critique);

		curState = env.currentObservation();
		timeSteps++;

	}

	this.critic.endEpisode();
	this.actor.endEpisode();

	if(episodeHistory.size() >= numEpisodesToStore){
		episodeHistory.poll();
	}
	episodeHistory.offer(ea);

	return ea;

}
 
Example 10
Source File: ARTDP.java    From burlap with Apache License 2.0
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

	State initialState = env.currentObservation();

	Episode ea = new Episode(initialState);

	State curState = initialState;
	int steps = 0;
	while(!env.isInTerminalState() && (steps < maxSteps || maxSteps == -1)){
		Action ga = policy.action(curState);
		EnvironmentOutcome eo = env.executeAction(ga);


		ea.transition(ga, eo.op, eo.r);

		this.model.updateModel(eo);

		this.modelPlanner.performBellmanUpdateOn(eo.o);

		curState = env.currentObservation();
		steps++;

	}

	return ea;
}
 
Example 11
Source File: QLearning.java    From burlap with Apache License 2.0
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

	State initialState = env.currentObservation();

	Episode ea = new Episode(initialState);
	HashableState curState = this.stateHash(initialState);
	eStepCounter = 0;

	maxQChangeInLastEpisode = 0.;
	while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

		Action action = learningPolicy.action(curState.s());
		QValue curQ = this.getQ(curState, action);



		EnvironmentOutcome eo;
		if(!(action instanceof Option)){
			eo = env.executeAction(action);
		}
		else{
			eo = ((Option)action).control(env, this.gamma);
		}



		HashableState nextState = this.stateHash(eo.op);
		double maxQ = 0.;

		if(!eo.terminated){
			maxQ = this.getMaxQ(nextState);
		}

		//manage option specifics
		double r = eo.r;
		double discount = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).discount : this.gamma;
		int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
		eStepCounter += stepInc;

		if(!(action instanceof Option) || !this.shouldDecomposeOptions){
			ea.transition(action, nextState.s(), r);
		}
		else{
			ea.appendAndMergeEpisodeAnalysis(((EnvironmentOptionOutcome)eo).episode);
		}



		double oldQ = curQ.q;

		//update Q-value
		curQ.q = curQ.q + this.learningRate.pollLearningRate(this.totalNumberOfSteps, curState.s(), action) * (r + (discount * maxQ) - curQ.q);

		double deltaQ = Math.abs(oldQ - curQ.q);
		if(deltaQ > maxQChangeInLastEpisode){
			maxQChangeInLastEpisode = deltaQ;
		}

		//move on, polling the environment for its current state in case it changed during processing
		curState = this.stateHash(env.currentObservation());
		this.totalNumberOfSteps++;


	}


	return ea;

}
 
Example 12
Source File: PotentialShapedRMax.java    From burlap with Apache License 2.0
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

	State initialState = env.currentObservation();

	this.modelPlanner.initializePlannerIn(initialState);

	Episode ea = new Episode(initialState);

	Policy policy = this.createUnmodeledFavoredPolicy();

	State curState = initialState;
	int steps = 0;
	while(!env.isInTerminalState() && (steps < maxSteps || maxSteps == -1)){

		Action ga = policy.action(curState);
		EnvironmentOutcome eo = env.executeAction(ga);
		ea.transition(ga, eo.op, eo.r);

		boolean modeledTerminal = this.model.terminal(eo.op);

		if(!this.model.transitionIsModeled(curState, ga)
				|| (!KWIKModel.Helper.stateTransitionsModeled(model, this.getActionTypes(), eo.op) && !modeledTerminal)){
			this.model.updateModel(eo);
			if(this.model.transitionIsModeled(curState, ga) || (eo.terminated != modeledTerminal && modeledTerminal != this.model.terminal(eo.op))){
				this.modelPlanner.modelChanged(curState);
				policy = this.createUnmodeledFavoredPolicy();
			}
		}


		curState = env.currentObservation();

		steps++;
	}

	if(episodeHistory.size() >= numEpisodesToStore){
		episodeHistory.poll();
	}
	episodeHistory.offer(ea);


	return ea;

}