burlap.behavior.valuefunction.QValue Java Examples

The following examples show how to use burlap.behavior.valuefunction.QValue. Each example is drawn from the project and source file noted in its header.
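Before the examples, note the pattern they all share: a QValue is constructed from a state, an action, and a numeric value, and its public fields a and q (and the state field s) are read or updated directly. A minimal sketch of that usage pattern, with myState and myAction as placeholders rather than BURLAP API:

// Minimal sketch of the QValue usage pattern that recurs in the examples below.
// Only the constructor and public fields that the examples themselves touch are
// assumed; myState and myAction are placeholders for a concrete State and Action.
QValue entry = new QValue(myState, myAction, 0.);  // state, action, initial value
Action chosen = entry.a;     // the action this entry scores
double estimate = entry.q;   // the current Q-value estimate
entry.q = estimate + 0.1;    // examples such as #30 update the field in place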
Example #1
Source File: BoundedRTDP.java    From burlap with Apache License 2.0
/**
 * Returns the maximum Q-value entry for the given state with ties broken randomly. 
 * @param s the query state for the Q-value
 * @return the maximum Q-value entry for the given state with ties broken randomly. 
 */
protected QValue maxQ(State s){
	
	List<QValue> qs = this.qValues(s);
	double max = Double.NEGATIVE_INFINITY;
	List<QValue> maxQs = new ArrayList<QValue>(qs.size());
	
	for(QValue q : qs){
		if(q.q == max){
			maxQs.add(q);
		}
		else if(q.q > max){
			max = q.q;
			maxQs.clear();
			maxQs.add(q);
		}
	}
	
	//return random max
	int rint = RandomFactory.getMapped(0).nextInt(maxQs.size());
	
	return maxQs.get(rint);
}
 
Example #2
Source File: ARTDP.java    From burlap with Apache License 2.0
@Override
public List<QValue> qValues(State s) {
	List<QValue> qs = this.modelPlanner.qValues(s);

	if(this.model instanceof KWIKModel){
		for(QValue q : qs){
			//if Q for unknown action, use value initialization of current state
			if(!((KWIKModel)this.model).transitionIsModeled(s, q.a)){
				q.q = this.modelPlanner.getValueFunctionInitialization().value(s);
			}
		}
	}


	return qs;
}
 
Example #3
Source File: UCT.java    From burlap with Apache License 2.0
@Override
public List<QValue> qValues(State s) {

	//if we haven't done any planning, then do so now
	if(this.root == null){
		this.planFromState(s);
	}

	//if the root node isn't the query state, then replan
	HashableState sh = this.hashingFactory.hashState(s);
	if(!sh.equals(this.root.state)){
		this.resetSolver();
		this.planFromState(s);
	}

	//compute the Q-values
	List <QValue> qs = new ArrayList<QValue>(this.root.actionNodes.size());
	for(UCTActionNode act : this.root.actionNodes){
		qs.add(new QValue(s, act.action, act.averageReturn()));
	}

	return qs;
}
 
Example #4
Source File: QLTutorial.java    From burlap_examples with MIT License
@Override
public List<QValue> qValues(State s) {
	//first get hashed state
	HashableState sh = this.hashingFactory.hashState(s);

	//check if we already have stored values
	List<QValue> qs = this.qValues.get(sh);

	//create and add initialized Q-values if we don't have them stored for this state
	if(qs == null){
		List<Action> actions = this.applicableActions(s);
		qs = new ArrayList<QValue>(actions.size());
		//create a Q-value for each action
		for(Action a : actions){
			//add q with initialized value
			qs.add(new QValue(s, a, this.qinit.qValue(s, a)));
		}
		//store this for later
		this.qValues.put(sh, qs);
	}

	return qs;
}
 
Example #5
Source File: SparseSampling.java    From burlap with Apache License 2.0
/**
 * Returns the estimated value of this node if it is closed, or estimates it and closes it otherwise.
 * @return the estimated value for this node.
 */
public double estimateV(){
	if(this.closed){
		return this.v;
	}
	
	if(SparseSampling.this.model.terminal(sh.s())){
		this.v = 0.;
		this.closed = true;
		return this.v;
	}
	
	
	List<QValue> Qs = this.estimateQs();
	double [] qs = new double[Qs.size()];
	for(int i = 0; i < Qs.size(); i++){
		qs[i] = Qs.get(i).q;
	}
	SparseSampling.this.numUpdates++;
	this.v = operator.apply(qs);
	this.closed = true;
	return this.v;
}
 
Example #6
Source File: SparseSampling.java    From burlap with Apache License 2.0
@Override
public double qValue(State s, Action a) {
	
	HashableState sh = this.hashingFactory.hashState(s);
	List<QValue> qs = this.rootLevelQValues.get(sh);
	if(qs == null){
		this.planFromState(s);
		qs = this.rootLevelQValues.get(sh);
	}
	
	for(QValue qv : qs){
		if(qv.a.equals(a)){
			return qv.q;
		}
	}
	
	throw new RuntimeException("Q-value for action " + a.toString() +" does not exist.");
}
 
Example #7
Source File: QMDP.java    From burlap with Apache License 2.0
@Override
public List<QValue> qValues(State s) {

	if(!(s instanceof BeliefState) || !(s instanceof EnumerableBeliefState)){
		throw new RuntimeException("QMDP cannot return the Q-values for the given state, because the given state is not a EnumerableBeliefState instance. It is a " + s.getClass().getName());
	}

	BeliefState bs = (BeliefState)s;

	//get actions for any underlying MDP state
	List<Action> gas = this.applicableActions(bs.sample());
	List<QValue> result = new ArrayList<QValue>(gas.size());

	List<EnumerableBeliefState.StateBelief> beliefs = ((EnumerableBeliefState)bs).nonZeroBeliefs();

	for(Action ga : gas){
		double q = this.qForBeliefList(beliefs, ga);
		QValue Q = new QValue(s, ga, q);
		result.add(Q);
	}
	
	return result;
}
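The helper qForBeliefList is not shown in this snippet, but the QMDP approximation it stands for is the belief-weighted sum of the underlying MDP Q-values, Q(b,a) = sum_s b(s) * Q_MDP(s,a). A hypothetical sketch of that computation, assuming StateBelief exposes the state and its probability as fields s and belief (those names, and the mdpQSource parameter, are illustrative assumptions):

// Hypothetical illustration of the QMDP belief-weighted Q computation; not the
// actual qForBeliefList implementation. StateBelief field names are assumed.
static double beliefWeightedQ(List<EnumerableBeliefState.StateBelief> beliefs,
		Action a, QProvider mdpQSource){
	double q = 0.;
	for(EnumerableBeliefState.StateBelief sb : beliefs){
		q += sb.belief * mdpQSource.qValue(sb.s, a);  // weight the MDP Q-value by the belief
	}
	return q;
}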
 
Example #8
Source File: DifferentiableDP.java    From burlap with Apache License 2.0
/**
 * Performs the Boltzmann value function gradient backup for the given {@link burlap.statehashing.HashableState}.
 * Results are stored in this valueFunction's internal map.
 * @param sh the hashed state on which to perform the Boltzmann gradient update.
 * @return the gradient.
 */
protected FunctionGradient performDPValueGradientUpdateOn(HashableState sh){


	//get q objects
	List<QValue> Qs = this.qValues(sh.s());
	double [] qs = new double[Qs.size()];
	for(int i = 0; i < Qs.size(); i++){
		qs[i] = Qs.get(i).q;
	}

	FunctionGradient [] qGradients = new FunctionGradient[qs.length];
	for(int i = 0; i < qs.length; i++){
		qGradients[i] = this.qGradient(sh.s(), Qs.get(i).a);
	}

	FunctionGradient vGradient = ((DifferentiableDPOperator)operator).gradient(qs, qGradients);
	this.valueGradient.put(sh, vGradient);

	return vGradient;
}
 
Example #9
Source File: RewardValueProjection.java    From burlap with Apache License 2.0
@Override
public List<QValue> qValues(State s) {

	if(this.domain != null){
		List<Action> actions = ActionUtils.allApplicableActionsForTypes(this.domain.getActionTypes(), s);
		List<QValue> qs = new ArrayList<QValue>(actions.size());
		for(Action ga : actions){
			qs.add(new QValue(s, ga, this.qValue(s, ga)));
		}
		return qs;
	}

	if(this.projectionType == RewardProjectionType.DESTINATIONSTATE){
		return Arrays.asList(new QValue(s, null, this.rf.reward(null, null, s)));
	}
	else if(this.projectionType == RewardProjectionType.SOURCESTATE){
		return Arrays.asList(new QValue(s, null, this.rf.reward(null, null, s)));
	}
	else if(this.projectionType == RewardProjectionType.STATEACTION){
		throw new RuntimeException("RewardValueProjection cannot generate all state-action Q-values because it was not" +
				"provided the Domain to enumerate the actions. Use the RewardValueProjection(RewardFunction, RewardProjectionType, Domain) " +
				"constructor to specify it.");
	}

	throw new RuntimeException("Unknown RewardProjectionType... this shouldn't happen.");
}
 
Example #10
Source File: SparseSampling.java    From burlap with Apache License 2.0
/**
 * Estimates and returns the Q-values for this node. Q-values and used state samples are forgotten after this call completes.
 * @return a {@link List} of the estimated Q-values for each action.
 */
public List<QValue> estimateQs(){
	List<Action> gas = SparseSampling.this.applicableActions(this.sh.s());
	List<QValue> qs = new ArrayList<QValue>(gas.size());
	for(Action ga : gas){
		if(this.height <= 0){
			qs.add(new QValue(this.sh.s(), ga, SparseSampling.this.vinit.value(this.sh.s())));
		}
		else{
			double q;
			if(!SparseSampling.this.computeExactValueFunction){
				q = this.sampledQEstimate(ga);
			}
			else{
				q = this.exactQValue(ga);
			}
			
			qs.add(new QValue(this.sh.s(), ga, q));
		}
	}
	
	return qs;
}
 
Example #11
Source File: ApproximateQLearning.java    From burlap with Apache License 2.0
@Override
public double value(State s) {
	s = this.stateMapping.mapState(s);
	List<QValue> qs = this.qValues(s);
	double max = Double.NEGATIVE_INFINITY;
	for(QValue q : qs){
		max = Math.max(max, q.q);
	}
	return max;
}
 
Example #12
Source File: QLearning.java    From burlap with Apache License 2.0
/**
 * Returns the maximum Q-value in the hashed state.
 * @param s the hashed state for which to get the maximum Q-value
 * @return the maximum Q-value in the hashed state.
 */
protected double getMaxQ(HashableState s){
	List <QValue> qs = this.getQs(s);
	double max = Double.NEGATIVE_INFINITY;
	for(QValue q : qs){
		if(q.q > max){
			max = q.q;
		}
	}
	return max;
}
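getMaxQ supplies the bootstrap term of the standard one-step Q-learning backup, Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)). A small illustrative helper showing where it fits; everything except getMaxQ itself is a placeholder name (compare the concrete update loop in Example #30):

// Illustrative only: forming the Q-learning target with getMaxQ. The method
// name and parameters are hypothetical; getMaxQ is the helper defined above.
protected double qLearningTarget(double r, boolean terminated, HashableState nextSh, double gamma){
	double bootstrap = terminated ? 0. : this.getMaxQ(nextSh);  // zero bootstrap at terminal states
	return r + gamma * bootstrap;
}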
 
Example #13
Source File: SparseSampling.java    From burlap with Apache License 2.0
@Override
public double value(State s) {
	if(model.terminal(s)){
		return 0.;
	}
	List<QValue> Qs = this.qValues(s);
	double [] qs = new double[Qs.size()];
	for(int i = 0; i < Qs.size(); i++){
		qs[i] = Qs.get(i).q;
	}
	double v = this.operator.apply(qs);
	return v;
}
 
Example #14
Source File: GradientDescentSarsaLam.java    From burlap with Apache License 2.0
@Override
public List<QValue> qValues(State s) {
	List<Action> gas = this.applicableActions(s);
	List <QValue> qs = new ArrayList<QValue>(gas.size());

	for(Action ga : gas){
		qs.add(new QValue(s, ga, this.vfa.evaluate(s, ga)));
	}
	
	return qs;
}
 
Example #15
Source File: ApproximateQLearning.java    From burlap with Apache License 2.0
@Override
public List<QValue> qValues(State s) {
	s = this.stateMapping.mapState(s);
	List<Action> actions = this.applicableActions(s);
	List<QValue> qs = new ArrayList<QValue>(actions.size());
	for(Action a : actions){
		QValue q = new QValue(s, a, this.qValue(s, a));
		qs.add(q);
	}
	return qs;
}
 
Example #16
Source File: BoltzmannPolicyGradient.java    From burlap with Apache License 2.0
/**
 * Computes the gradient of a Boltzmann policy using the given differentiable valueFunction.
 * @param s the input state of the policy gradient
 * @param a the action whose policy probability gradient is being queried
 * @param planner the differentiable {@link DifferentiableQFunction} valueFunction
 * @param beta the Boltzmann beta parameter. This parameter is the inverse of the Boltzmann temperature. As beta becomes larger, the policy becomes more deterministic. Should lie in [0, +infinity).
 * @return the gradient of the policy.
 */
public static FunctionGradient computeBoltzmannPolicyGradient(State s, Action a, DifferentiableQFunction planner, double beta){


	//get q objects
	List<QValue> Qs = ((QProvider)planner).qValues(s);
	double [] qs = new double[Qs.size()];
	for(int i = 0; i < Qs.size(); i++){
		qs[i] = Qs.get(i).q;
	}

	//find matching action index
	int aind = -1;
	for(int i = 0; i < Qs.size(); i++){
		if(Qs.get(i).a.equals(a)){
			aind = i;
			break;
		}
	}

	if(aind == -1){
		throw new RuntimeException("Error in computing BoltzmannPolicyGradient: Could not find query action in Q-value list.");
	}

	FunctionGradient [] qGradients = new FunctionGradient[qs.length];
	for(int i = 0; i < qs.length; i++){
		qGradients[i] = planner.qGradient(s, Qs.get(i).a);
	}


	FunctionGradient policyGradient = computePolicyGradient(qs, qGradients, aind, beta);

	return policyGradient;

}
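For reference, the policy whose gradient is computed above is the Boltzmann (softmax) distribution pi(a|s) = exp(beta * Q(s,a)) / sum_a' exp(beta * Q(s,a')). A small standalone helper (not a BURLAP method) that turns a Q-value array and beta into those probabilities:

// Illustrative softmax helper, not part of BURLAP: converts an array of Q-values
// and the inverse temperature beta into Boltzmann action probabilities. Shifting
// by the maximum Q-value is a standard guard against overflow in Math.exp.
static double[] boltzmannProbabilities(double[] qs, double beta){
	double maxQ = Double.NEGATIVE_INFINITY;
	for(double q : qs){
		maxQ = Math.max(maxQ, q);
	}
	double[] probs = new double[qs.length];
	double sum = 0.;
	for(int i = 0; i < qs.length; i++){
		probs[i] = Math.exp(beta * (qs[i] - maxQ));
		sum += probs[i];
	}
	for(int i = 0; i < qs.length; i++){
		probs[i] /= sum;
	}
	return probs;
}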
 
Example #17
Source File: ApproximateQLearning.java    From burlap with Apache License 2.0
/**
 * Returns all Q-value estimates for the given state from the stale (target) Q-function
 * @param s the state for which the Q-values are to be returned
 * @return all Q-value estimates from the stale Q-function; a {@link List} of {@link QValue} objects.
 */
public List<QValue> getStaleQs(State s) {
	s = this.stateMapping.mapState(s);
	List<Action> actions = this.applicableActions(s);
	List<QValue> qs = new ArrayList<QValue>(actions.size());
	for(Action a : actions){
		QValue q = this.getStaleQ(s, a);
		qs.add(q);
	}
	return qs;
}
 
Example #18
Source File: ApproximateQLearning.java    From burlap with Apache License 2.0
/**
 * Returns the stale state value function estimate (the maximum stale Q-value) for the given state.
 * @param s the state for which the value should be returned
 * @return the stale state value function estimate (max stale Q-value)
 */
public double staleValue(State s) {
	s = this.stateMapping.mapState(s);
	List<QValue> qs = this.getStaleQs(s);
	double max = Double.NEGATIVE_INFINITY;
	for(QValue q : qs){
		max = Math.max(max, q.q);
	}
	return max;
}
 
Example #19
Source File: SarsaLam.java    From burlap with Apache License 2.0
/**
 * Creates a new eligibility trace to track for an episode.
 * @param sh the state of the trace
 * @param q the q-value (containing the action reference) of the trace
 * @param eligibility the eligibility value
 */
public EligibilityTrace(HashableState sh, QValue q, double eligibility){
	this.sh = sh;
	this.q = q;
	this.eligibility = eligibility;
	this.initialQ = q.q;
}
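The stored eligibility is what scales the SARSA(lambda) credit assignment: after each step, every traced Q-value is moved toward the TD error in proportion to its eligibility, and the eligibility is then decayed by gamma * lambda. A schematic sketch of that sweep, relying only on the q and eligibility fields set in the constructor above (the method name and the remaining parameters are illustrative, not SarsaLam's internal code):

// Schematic SARSA(lambda) trace sweep for illustration; delta is the TD error,
// alpha the learning rate, and gamma * lambda the per-step trace decay factor.
static void applyTraceUpdates(List<EligibilityTrace> traces, double delta,
		double alpha, double gamma, double lambda){
	for(EligibilityTrace et : traces){
		et.q.q = et.q.q + alpha * delta * et.eligibility;  // credit in proportion to eligibility
		et.eligibility = et.eligibility * gamma * lambda;  // decay the trace
	}
}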
 
Example #20
Source File: SparseSampling.java    From burlap with Apache License 2.0
@Override
public List<QValue> qValues(State s) {
	
	HashableState sh = this.hashingFactory.hashState(s);
	List<QValue> qs = this.rootLevelQValues.get(sh);
	if(qs == null){
		this.planFromState(s);
		qs = this.rootLevelQValues.get(sh);
	}
	
	return qs;
}
 
Example #21
Source File: SparseSampling.java    From burlap with Apache License 2.0
/**
 * Initializes. Note that you can have h and c set to values that ensure epsilon optimality by using the {@link #setHAndCByMDPError(double, double, int)} method, but in
 * general this will result in very large values that will be intractable. If you set c = -1, then the full transition dynamics will be used. You should
 * only use the full transition dynamics if the number of possible transitions from each state is small and if the model implements {@link burlap.mdp.singleagent.model.FullModel}
 * @param domain the planning domain
 * @param gamma the discount factor
 * @param hashingFactory the state hashing factory for matching generated states with their state nodes.
 * @param h the height of the tree
 * @param c the number of transition dynamics samples used. If set to -1, then the full transition dynamics are used.
 */
public SparseSampling(SADomain domain, double gamma, HashableStateFactory hashingFactory, int h, int c){
	this.solverInit(domain, gamma, hashingFactory);
	this.h = h;
	this.c = c;
	this.nodesByHeight = new HashMap<SparseSampling.HashedHeightState, SparseSampling.StateNode>();
	this.rootLevelQValues = new HashMap<HashableState, List<QValue>>();
	if(this.c < 0){
		this.computeExactValueFunction = true;
	}

	this.debugCode = 7369430;
}
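A hedged sketch of how this constructor is typically combined with the planFromState and qValues methods shown in the other SparseSampling examples; the domain and initial state are placeholders built elsewhere, and SimpleHashableStateFactory is just one possible HashableStateFactory choice:

// Illustrative usage of the constructor above; domain and initialState are
// placeholders for a concrete SADomain and State constructed elsewhere.
SparseSampling ss = new SparseSampling(domain, 0.99, new SimpleHashableStateFactory(), 4, 10);
ss.planFromState(initialState);              // builds the sampled lookahead tree
List<QValue> qs = ss.qValues(initialState);  // root-level Q estimates (see Example #20)
for(QValue q : qs){
	System.out.println(q.a + " -> " + q.q);
}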
 
Example #22
Source File: QLearning.java    From burlap with Apache License 2.0
/**
 * Returns the Q-value for a given hashed state and action.
 * @param s the hashed state
 * @param a the action
 * @return the Q-value for a given hashed state and action; null is returned if there is no Q-value currently stored.
 */
protected QValue getQ(HashableState s, Action a) {
	QLearningStateNode node = this.getStateNode(s);

	for(QValue qv : node.qEntry){
		if(qv.a.equals(a)){
			return qv;
		}
	}
	
	return null; //no action for this state indexed
}
 
Example #23
Source File: DynamicProgramming.java    From burlap with Apache License 2.0
@Override
public List <QValue> qValues(State s){
	
	List<Action> gas = this.applicableActions(s);
	List<QValue> qs = new ArrayList<QValue>(gas.size());
	for(Action ga : gas){
		QValue q = new QValue(s, ga, this.qValue(s, ga));
		qs.add(q);
	}

	return qs;
	
}
 
Example #24
Source File: SGNaiveQLAgent.java    From burlap with Apache License 2.0
/**
 * Returns the maximum numeric Q-value for a given state.
 * @param s the state for which the max Q-value should be returned
 * @return the maximum numeric Q-value for the given state
 */
protected double getMaxQValue(State s){
	List<QValue> qs = this.qValues(s);
	double maxQ = Double.NEGATIVE_INFINITY;
	for(QValue q : qs){
		maxQ = Math.max(maxQ, q.q);
	}
	return maxQ;
}
 
Example #25
Source File: SGNaiveQLAgent.java    From burlap with Apache License 2.0
/**
 * Initializes with a default 0.1 epsilon greedy policy/strategy
 * @param d the domain in which the agent will act
 * @param discount the discount factor
 * @param learningRate the learning rate
 * @param qInitializer the Q-value initialization method
 * @param hashFactory the state hashing factory
 */
public SGNaiveQLAgent(SGDomain d, double discount, double learningRate, QFunction qInitializer, HashableStateFactory hashFactory) {
	this.init(d);
	this.discount = discount;
	this.learningRate = new ConstantLR(learningRate);
	this.hashFactory = hashFactory;
	this.qInit = qInitializer;
	
	this.qMap = new HashMap<HashableState, List<QValue>>();
	stateRepresentations = new HashMap<HashableState, State>();
	this.policy = new EpsilonGreedy(this, 0.1);
	
	this.storedMapAbstraction = new ShallowIdentityStateMapping();
}
 
Example #26
Source File: SGNaiveQLAgent.java    From burlap with Apache License 2.0
/**
 * Initializes with a default 0.1 epsilon greedy policy/strategy
 * @param d the domain in which the agent will act
 * @param discount the discount factor
 * @param learningRate the learning rate
 * @param defaultQ the default to which all Q-values will be initialized
 * @param hashFactory the state hashing factory
 */
public SGNaiveQLAgent(SGDomain d, double discount, double learningRate, double defaultQ, HashableStateFactory hashFactory) {
	this.init(d);
	this.discount = discount;
	this.learningRate = new ConstantLR(learningRate);
	this.hashFactory = hashFactory;
	this.qInit = new ConstantValueFunction(defaultQ);
	
	this.qMap = new HashMap<HashableState, List<QValue>>();
	stateRepresentations = new HashMap<HashableState, State>();
	this.policy = new EpsilonGreedy(this, 0.1);
	
	this.storedMapAbstraction = new ShallowIdentityStateMapping();
}
 
Example #27
Source File: SGNaiveQLAgent.java    From burlap with Apache License 2.0
/**
 * Initializes with a default Q-value of 0 and a 0.1 epsilon greedy policy/strategy
 * @param d the domain in which the agent will act
 * @param discount the discount factor
 * @param learningRate the learning rate
 * @param hashFactory the state hashing factory
 */
public SGNaiveQLAgent(SGDomain d, double discount, double learningRate, HashableStateFactory hashFactory) {
	this.init(d);
	this.discount = discount;
	this.learningRate = new ConstantLR(learningRate);
	this.hashFactory = hashFactory;
	this.qInit = new ConstantValueFunction(0.);
	
	this.qMap = new HashMap<HashableState, List<QValue>>();
	stateRepresentations = new HashMap<HashableState, State>();
	this.policy = new EpsilonGreedy(this, 0.1);
	
	this.storedMapAbstraction = new ShallowIdentityStateMapping();
}
 
Example #28
Source File: GreedyDeterministicQPolicy.java    From burlap with Apache License 2.0
@Override
public Action action(State s) {
	
	List<QValue> qValues = this.qplanner.qValues(s);
	double maxQV = Double.NEGATIVE_INFINITY;
	QValue maxQ = null;
	for(QValue q : qValues){
		if(q.q > maxQV){
			maxQV = q.q;
			maxQ = q;
		}
	}
	
	return maxQ.a;
}
 
Example #29
Source File: QLTutorial.java    From burlap_examples with MIT License
protected QValue storedQ(State s, Action a){
	//first get all Q-values
	List<QValue> qs = this.qValues(s);

	//iterate through stored Q-values to find a match for the input action
	for(QValue q : qs){
		if(q.a.equals(a)){
			return q;
		}
	}

	throw new RuntimeException("Could not find matching Q-value.");
}
 
Example #30
Source File: QLTutorial.java    From burlap_examples with MIT License
@Override
public Episode runLearningEpisode(Environment env, int maxSteps) {
	//initialize our episode object with the initial state of the environment
	Episode e = new Episode(env.currentObservation());

	//behave until a terminal state or max steps is reached
	State curState = env.currentObservation();
	int steps = 0;
	while(!env.isInTerminalState() && (steps < maxSteps || maxSteps == -1)){

		//select an action
		Action a = this.learningPolicy.action(curState);

		//take the action and observe outcome
		EnvironmentOutcome eo = env.executeAction(a);

		//record result
		e.transition(eo);

		//get the max Q value of the resulting state if it's not terminal, 0 otherwise
		double maxQ = eo.terminated ? 0. : this.value(eo.op);

		//update the old Q-value
		QValue oldQ = this.storedQ(curState, a);
		oldQ.q = oldQ.q + this.learningRate * (eo.r + this.gamma * maxQ - oldQ.q);


		//update state pointer to next environment state observed
		curState = eo.op;
		steps++;

	}

	return e;
}
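A short sketch of how runLearningEpisode might be driven for several episodes; the agent and env objects, the episode count, and the use of resetEnvironment between episodes are assumptions about the surrounding setup rather than part of this example:

// Illustrative training loop around runLearningEpisode; agent and env are
// placeholders constructed elsewhere. Passing -1 disables the step limit,
// matching the maxSteps check in the loop above.
List<Episode> episodes = new ArrayList<Episode>();
for(int i = 0; i < 100; i++){
	Episode ep = agent.runLearningEpisode(env, -1);
	episodes.add(ep);
	env.resetEnvironment();  // return the environment to its initial state for the next episode
}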