burlap.behavior.singleagent.options.EnvironmentOptionOutcome Java Examples

The following examples show how to use burlap.behavior.singleagent.options.EnvironmentOptionOutcome. Each example notes the source file and the project and license it comes from.
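Before the examples, the short sketch below (not taken from any of the projects; the inspectOptionOutcome method name is hypothetical, and the domain, option, and start state are assumed to be constructed elsewhere) shows how an EnvironmentOptionOutcome is typically obtained and which of its members the examples on this page read.

public static void inspectOptionOutcome(SADomain domain, Option option, State s){
	//run the option to termination in a simulated environment, as in Example #3 below
	SimulatedEnvironment env = new SimulatedEnvironment(domain, s);
	EnvironmentOptionOutcome eo = option.control(env, 0.99);

	//members inherited from EnvironmentOutcome
	State nextState = eo.op;            //state observed after the option terminated
	double cumulativeReward = eo.r;     //cumulative discounted reward over the option's primitive steps
	boolean terminated = eo.terminated; //whether the environment reached a terminal state

	//members specific to EnvironmentOptionOutcome
	int steps = eo.numSteps();          //number of primitive steps the option executed
	double discount = eo.discount;      //compound discount to apply to values of the resulting state
	Episode decomposed = eo.episode;    //the option's execution decomposed into primitive transitions
}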
Example #1
Source File: DynamicWeightedAStar.java    From burlap with Apache License 2.0
public double computeF(PrioritizedSearchNode parentNode, Action generatingAction, HashableState successorState, EnvironmentOutcome eo) {
	double cumR = 0.;
	int d = 0;
	if(parentNode != null){
		double pCumR = cumulatedRewardMap.get(parentNode.s);
		cumR = pCumR + eo.r;
		
		int pD = depthMap.get(parentNode.s);
		if(!(generatingAction instanceof Option)){
			d = pD + 1;
		}
		else{
			d = pD + ((EnvironmentOptionOutcome)eo).numSteps();
		}
	}
	
	double H  = heuristic.h(successorState.s());
	lastComputedCumR = cumR;
	lastComputedDepth = d;
	double weightedE = this.epsilon * this.epsilonWeight(d);
	double F = cumR + ((1. + weightedE)*H);
	
	return F;
}
 
Example #2
Source File: SparseSampling.java    From burlap with Apache License 2.0
/**
 * Estimates the Q-value using sampling from the transition dynamics. This is the standard Sparse Sampling procedure.
 * @param ga the action for which the Q-value estimate is to be returned
 * @return the Q-value estimate
 */
protected double sampledQEstimate(Action ga){
	
	double sum = 0.;
	
	//generate C samples
	int c = SparseSampling.this.getCAtHeight(this.height);
	for(int i = 0; i < c; i++){
		
		//execute
		EnvironmentOutcome eo = model.sample(sh.s(), ga);
		State ns = eo.op;
		
		//manage option stepsize modifications
		int k = 1;
		if(ga instanceof Option){
		k = ((EnvironmentOptionOutcome)eo).numSteps();
		}
		
	//get reward; our rf will automatically do cumulative discounting if it's an option
		double r = eo.r;
		
		StateNode nsn = SparseSampling.this.getStateNode(ns, this.height-k);
		
		sum += r + Math.pow(SparseSampling.this.gamma, k)*nsn.estimateV();
	}
	sum /= (double)c;
	
	return sum;
}
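As a hedged variant of the backup in Example #2: an EnvironmentOptionOutcome also exposes the compound discount directly through its discount field (compare Example #8 below), so, assuming the model generates option outcomes with the same gamma that SparseSampling uses, the Math.pow(gamma, k) factor inside the loop could equivalently be replaced like this:

//inside the sampling loop of Example #2 (a sketch, not the library's own code)
double discount = eo instanceof EnvironmentOptionOutcome ?
		((EnvironmentOptionOutcome)eo).discount : SparseSampling.this.gamma;
sum += r + discount*nsn.estimateV();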
 
Example #3
Source File: OptionsExample.java    From burlap_examples with MIT License
public static Episode optionExecuteResult(SADomain domain, Option o, State s){
	SimulatedEnvironment env = new SimulatedEnvironment(domain, s);
	EnvironmentOptionOutcome eo = o.control(env, 0.99);
	return eo.episode;
}
 
Example #4
Source File: PolicyUtils.java    From burlap with Apache License 2.0
/**
 * Follows this policy for one time step in the provided {@link burlap.mdp.singleagent.environment.Environment} and
 * records the interaction in the provided {@link Episode} object. If the policy
 * selects an {@link burlap.behavior.singleagent.options.Option}, then how the option's interaction in the environment
 * is recorded depends on the {@link #rolloutsDecomposeOptions} flag.
 * If {@link #rolloutsDecomposeOptions} is false, then the option is recorded as a single action. If it is true, then
 * the individual primitive actions selected by the option are recorded.
 * @param p the {@link Policy}
 * @param env The {@link burlap.mdp.singleagent.environment.Environment} in which this policy should be followed.
 * @param ea The {@link Episode} object in which the action selection and resulting transition will be recorded.
 */
protected static void followAndRecordPolicy(Policy p, Environment env, Episode ea){


	//follow policy
	Action a = p.action(env.currentObservation());
	if(a == null){
		throw new PolicyUndefinedException();
	}


	EnvironmentOutcome eo = env.executeAction(a);


	if(a instanceof Option && rolloutsDecomposeOptions){
		ea.appendAndMergeEpisodeAnalysis(((EnvironmentOptionOutcome)eo).episode);
	}
	else{
		ea.transition(a, eo.op, eo.r);
	}

}
 
Example #5
Source File: ApproximateQLearning.java    From burlap with Apache License 2.0
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

	State initialState = env.currentObservation();
	Episode e = new Episode(initialState);


	int eStepCounter = 0;
	while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

		//check state
		State curState = stateMapping.mapState(env.currentObservation());

		//select action
		Action a = this.learningPolicy.action(curState);

		//take action
		EnvironmentOutcome eo = env.executeAction(a);

		//save outcome in memory
		this.memory.addExperience(eo);

		//record transition and manage option case
		int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
		eStepCounter += stepInc;
		this.totalSteps += stepInc;
		e.transition(a, eo.op, eo.r);

		//perform a learning update with replayed experiences
		List<EnvironmentOutcome> samples = this.memory.sampleExperiences(this.numReplay);
		this.updateQFunction(samples);

		//update stale function
		this.stepsSinceStale++;
		if(this.stepsSinceStale >= this.staleDuration){
			this.updateStaleFunction();
		}

	}

	this.totalEpisodes++;
	return e;
}
 
Example #6
Source File: UCT.java    From burlap with Apache License 2.0
/**
 * Performs a rollout in the UCT tree from the given node, keeping track of how many new nodes can be added to the tree.
 * @param node the node from which to rollout
 * @param depth the depth of the node
 * @param childrenLeftToAdd the number of new subsequent nodes that can be connected to the tree
 * @return the sample return from rolling out from this node
 */
public double treeRollOut(UCTStateNode node, int depth, int childrenLeftToAdd){
	
	numVisits++;
	
	if(depth == maxHorizon){
		return 0.;
	}
	
	if(model.terminal(node.state.s())){
		if(goalCondition != null && goalCondition.satisfies(node.state.s())){
			foundGoal = true;
			foundGoalOnRollout = true;
		}
		DPrint.cl(debugCode, numRollOutsFromRoot + " Hit terminal at depth: " + depth);
		return 0.;
	}
	
	
	
	UCTActionNode anode = this.selectActionNode(node);
	
	if(anode == null){
		//no actions can be performed in this state
		return 0.;
	}
	
	
	
	//sample the action
	EnvironmentOutcome eo = model.sample(node.state.s(), anode.action);
	HashableState shprime = this.stateHash(eo.op);
	double r = eo.r;
	int depthChange = 1;
	if(anode.action instanceof Option){
		depthChange = ((EnvironmentOptionOutcome)eo).numSteps();
	}
	
	UCTStateNode snprime = this.queryTreeIndex(shprime, depth+depthChange);
	
	double sampledReturn;
	
	boolean shouldConnectNode = false;
	double futureReturn;
	if(snprime != null){
		
		//then this state already exists in the tree
		
		if(!anode.referencesSuccessor(snprime)){ 
			//then this successor has not been generated by this state-action pair before and should be indexed
			anode.addSuccessor(snprime);
		}
		
		futureReturn = this.treeRollOut(snprime, depth + depthChange, childrenLeftToAdd);
		sampledReturn = r + Math.pow(gamma, depthChange) * futureReturn;
		
	}
	else{
		
		//this state is not in the tree at this depth so create it
		snprime = stateNodeConstructor.generate(shprime, depth+1, actionTypes, actionNodeConstructor);
		
		//store it in the tree depending on how many new nodes have already been stored in this roll out
		if(childrenLeftToAdd > 0){
			shouldConnectNode = true;
		}
		
		//and do an exploratory sample from it
		futureReturn = this.treeRollOut(snprime, depth + depthChange, childrenLeftToAdd-1);
		sampledReturn = r + gamma * futureReturn;
		
		
	}
	
	node.n++;
	anode.update(sampledReturn);
	
	if(shouldConnectNode || foundGoalOnRollout){
		this.addNodeToIndexTree(snprime);
		anode.addSuccessor(snprime);
		uniqueStatesInTree.add(snprime.state);
	}
	
	
	return sampledReturn;
}
 
Example #7
Source File: DeepQTester.java    From burlap_caffe with Apache License 2.0
@Override
public Episode runTestEpisode(Environment env, int maxSteps) {

    State initialState = env.currentObservation();
    Episode e = new Episode(initialState);


    int eStepCounter = 0;
    while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

        //check state
        State curState = stateMapping.mapState(env.currentObservation());

        //select action
        Action a = this.policy.action(curState);

        //take action
        EnvironmentOutcome eo = env.executeAction(a);

        //save outcome in memory
        this.memory.addExperience(eo);

        //record transition and manage option case
        int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
        eStepCounter += stepInc;
        e.transition(a, eo.op, eo.r);

    }

    return e;
}
 
Example #8
Source File: QLearning.java    From burlap with Apache License 2.0
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {

	State initialState = env.currentObservation();

	Episode ea = new Episode(initialState);
	HashableState curState = this.stateHash(initialState);
	eStepCounter = 0;

	maxQChangeInLastEpisode = 0.;
	while(!env.isInTerminalState() && (eStepCounter < maxSteps || maxSteps == -1)){

		Action action = learningPolicy.action(curState.s());
		QValue curQ = this.getQ(curState, action);



		EnvironmentOutcome eo;
		if(!(action instanceof Option)){
			eo = env.executeAction(action);
		}
		else{
			eo = ((Option)action).control(env, this.gamma);
		}



		HashableState nextState = this.stateHash(eo.op);
		double maxQ = 0.;

		if(!eo.terminated){
			maxQ = this.getMaxQ(nextState);
		}

		//manage option specifics
		double r = eo.r;
		double discount = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).discount : this.gamma;
		int stepInc = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).numSteps() : 1;
		eStepCounter += stepInc;

		if(!(action instanceof Option) || !this.shouldDecomposeOptions){
			ea.transition(action, nextState.s(), r);
		}
		else{
			ea.appendAndMergeEpisodeAnalysis(((EnvironmentOptionOutcome)eo).episode);
		}



		double oldQ = curQ.q;

		//update Q-value
		curQ.q = curQ.q + this.learningRate.pollLearningRate(this.totalNumberOfSteps, curState.s(), action) * (r + (discount * maxQ) - curQ.q);

		double deltaQ = Math.abs(oldQ - curQ.q);
		if(deltaQ > maxQChangeInLastEpisode){
			maxQChangeInLastEpisode = deltaQ;
		}

		//move on, polling the environment for its current state in case it changed during processing
		curState = this.stateHash(env.currentObservation());
		this.totalNumberOfSteps++;


	}


	return ea;

}
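The option-aware pieces of the update in Example #8 can be summarized in a small helper. This is only a sketch, not part of BURLAP: it reproduces the Q target computed above, in which an EnvironmentOptionOutcome supplies the cumulative discounted reward and the compound discount in place of a single-step reward and gamma.

static double qTarget(EnvironmentOutcome eo, double gamma, double maxQNext){
	//for an option outcome, eo.r is the cumulative discounted reward of its primitive
	//steps and eo.discount is typically gamma^numSteps(); otherwise fall back to gamma
	double discount = eo instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome)eo).discount : gamma;
	return eo.r + (eo.terminated ? 0. : discount*maxQNext);
}

With it, Example #8's update would read curQ.q = curQ.q + learningRate * (qTarget(eo, this.gamma, this.getMaxQ(nextState)) - curQ.q), matching the expression used there.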