Python tensorflow.python.framework.errors.AbortedError() Examples
The following are 17 code examples of tensorflow.python.framework.errors.AbortedError().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module tensorflow.python.framework.errors, or try the search function.
Example #1
Source File: monitored_session.py From auto-alt-text-lambda-api with MIT License | 6 votes |
def __init__(self, session_creator, hooks, should_recover):
    """Prepares the hooks and builds the underlying session.

    Every hook has `begin()` called on it before the session is created.

    Args:
      session_creator: A factory object used to create the session, typically
        a `ChiefSessionCreator` or a `WorkerSessionCreator`. A falsy value
        selects a freshly constructed `ChiefSessionCreator()`.
      hooks: An iterable of `SessionRunHook` objects. A falsy value is
        replaced by an empty list.
      should_recover: A bool. When true the session is wrapped in a
        `_RecoverableSession` so that it can recover from `AbortedError`;
        otherwise the coordinated creator's session is used directly.
    """
    # Remember whether the default graph had already been finalized.
    default_graph = ops.get_default_graph()
    self._graph_was_finalized = default_graph.finalized

    if not hooks:
        hooks = []
    self._hooks = hooks
    for hook in hooks:
        hook.begin()

    # The coordinated creator receives the very same hook collection.
    coordinated = self._CoordinatedSessionCreator(
        session_creator=session_creator or ChiefSessionCreator(),
        hooks=hooks)
    self._coordinated_creator = coordinated

    self._sess = (_RecoverableSession(coordinated)
                  if should_recover
                  else coordinated.create_session())
Example #2
Source File: monitored_session.py From keras-lambda with MIT License | 6 votes |
def run(self, fetches, feed_dict=None, options=None, run_metadata=None):
    """Runs `fetches`, rebuilding the session whenever it is aborted.

    A session is created through `self._create_session()` if none is
    currently held. When `errors.AbortedError` is raised, the current session
    is closed and dropped, and the call is retried with a new session. Any
    other exception propagates to the caller.

    Args:
      fetches: Forwarded unchanged to the wrapped session's `run()`.
      feed_dict: Forwarded as the `feed_dict` keyword argument.
      options: Forwarded as the `options` keyword argument.
      run_metadata: Forwarded as the `run_metadata` keyword argument.

    Returns:
      Whatever the wrapped session's `run()` returns.
    """
    while True:
        try:
            active = self._sess
            if not active:
                # Nothing usable is held (first call, or after a recovery).
                active = self._create_session()
                self._sess = active
            return active.run(fetches,
                              feed_dict=feed_dict,
                              options=options,
                              run_metadata=run_metadata)
        except errors.AbortedError:
            logging.info('An AbortedError was raised. Closing the current session. '
                         'It\'s most likely due to a preemption in a connected '
                         'worker/ps. '
                         'A new session will be created on the next session.run().')
            self.close()
            # Forces the next loop iteration to build a new session.
            self._sess = None
Example #3
Source File: monitored_session.py From auto-alt-text-lambda-api with MIT License | 6 votes |
def run(self, fetches, feed_dict=None, options=None, run_metadata=None):
    """Executes `fetches` on the held session, retrying after `AbortedError`.

    If no session is held, one is obtained from `self._create_session()`
    first. An `errors.AbortedError` causes the session to be closed and
    discarded, after which the loop starts over; other exceptions are not
    caught here.

    Args:
      fetches: Passed through to the underlying session's `run()`.
      feed_dict: Passed through as `feed_dict`.
      options: Passed through as `options`.
      run_metadata: Passed through as `run_metadata`.

    Returns:
      The result of the underlying session's `run()` call.
    """
    # The keyword arguments do not change between retries; bundle them once.
    forwarded = {
        'feed_dict': feed_dict,
        'options': options,
        'run_metadata': run_metadata,
    }
    while True:
        try:
            if not self._sess:
                self._sess = self._create_session()
            return self._sess.run(fetches, **forwarded)
        except errors.AbortedError:
            logging.info('An AbortedError was raised. Closing the current session. '
                         'It\'s most likely due to a preemption in a connected '
                         'worker/ps. '
                         'A new session will be created on the next session.run().')
            self.close()
            # Drop the reference so the next iteration creates a new session.
            self._sess = None
Example #4
Source File: monitored_session.py From keras-lambda with MIT License | 6 votes |
def __init__(self, session_creator, hooks, should_recover):
    """Initializes hooks and creates the session this object wraps.

    Args:
      session_creator: A factory object that creates the session. Typically a
        `ChiefSessionCreator` or a `WorkerSessionCreator`; when falsy, a new
        `ChiefSessionCreator()` is used.
      hooks: An iterable of `SessionRunHook` objects; when falsy, an empty
        list is stored instead. `begin()` is called on each hook before the
        session is created.
      should_recover: A bool. Indicates whether to recover from
        `AbortedError` (by wrapping the session in `_RecoverableSession`) or
        not.
    """
    self._graph_was_finalized = ops.get_default_graph().finalized

    active_hooks = hooks or []
    self._hooks = active_hooks
    for current_hook in active_hooks:
        current_hook.begin()

    # Build the creator first; the session flavour depends on should_recover.
    self._coordinated_creator = self._CoordinatedSessionCreator(
        session_creator=session_creator or ChiefSessionCreator(),
        hooks=active_hooks)
    if not should_recover:
        self._sess = self._coordinated_creator.create_session()
    else:
        self._sess = _RecoverableSession(self._coordinated_creator)
Example #5
Source File: monitored_session.py From deep_image_model with Apache License 2.0 | 5 votes |
def run(self, fetches, feed_dict=None, options=None, run_metadata=None):
    """Runs `fetches`, replacing the session each time it is aborted.

    When no session is held, one is requested from
    `self._sess_creator.create_session()`. If `errors.AbortedError` is raised
    the session is closed and discarded without logging, and the call is
    attempted again. Other exceptions propagate.

    Args:
      fetches: Forwarded unchanged to the wrapped session's `run()`.
      feed_dict: Forwarded as the `feed_dict` keyword argument.
      options: Forwarded as the `options` keyword argument.
      run_metadata: Forwarded as the `run_metadata` keyword argument.

    Returns:
      Whatever the wrapped session's `run()` returns.
    """
    while True:
        try:
            current = self._sess
            if not current:
                current = self._sess_creator.create_session()
                self._sess = current
            return current.run(fetches,
                               feed_dict=feed_dict,
                               options=options,
                               run_metadata=run_metadata)
        except errors.AbortedError:
            self.close()
            # Clearing the reference makes the next iteration build a new one.
            self._sess = None
Example #6
Source File: failure_tolerator.py From keras-lambda with MIT License | 5 votes |
def __init__(self, limit=5, init_delay=5.0, backoff_factor=2.0,
             forgive_after_seconds=6000, handled_exceptions=None):
    """Creates a FailureTolerator.

    The result will pause for `init_delay * (backoff_factor^(failure_count-1))`
    when re-entering `forgive()` after a failure.

    Args:
      limit: The maximum number of suppressed, unforgiven, failures.
      init_delay: How long to pause once the first failure is encountered.
        Defaults to five seconds.
      backoff_factor: Each subsequent failure grows the pause by this factor.
      forgive_after_seconds: Failures older than this are forgiven.
      handled_exceptions: The exceptions to forgive; any iterable is accepted
        and stored as a tuple. Defaults to `(errors.AbortedError,)`.
    """
    # Configuration copied from the arguments.
    self.limit = limit
    self.backoff = backoff_factor
    self.delay = init_delay
    self.forgive_after = forgive_after_seconds

    # Bookkeeping that starts out empty.
    self.exceptions = []
    self.time_in_delay = 0.0

    self.handled = ((errors.AbortedError,)
                    if handled_exceptions is None
                    else tuple(handled_exceptions))
Example #7
Source File: basic_loops.py From keras-lambda with MIT License | 5 votes |
def basic_train_loop(supervisor, train_step_fn, args=None, kwargs=None,
                     master=""):
    """Repeatedly invokes `train_step_fn` inside a supervisor-managed session.

    The step function is called as:

    ```python
    train_step_fn(session, *args, **kwargs)
    ```

    The loop keeps stepping until `supervisor.should_stop()` returns true. If
    `errors.AbortedError` is raised, a new managed session is opened and
    training resumes.

    Args:
      supervisor: Object providing `managed_session(master)` and
        `should_stop()`; the original documentation names `tf.Supervisor`.
      train_step_fn: Callable to execute one training step. Called repeatedly
        as `train_step_fn(session, *args, **kwargs)`.
      args: Optional positional arguments passed to `train_step_fn`.
      kwargs: Optional keyword arguments passed to `train_step_fn`.
      master: Master to use to create the training session. Defaults to `""`
        which causes the session to be created in the local process.
    """
    step_args = [] if args is None else args
    step_kwargs = {} if kwargs is None else kwargs

    while True:
        try:
            with supervisor.managed_session(master) as sess:
                while not supervisor.should_stop():
                    train_step_fn(sess, *step_args, **step_kwargs)
        except errors.AbortedError:
            # An AbortedError indicates a restart of one of the distributed
            # tensorflow servers, so always start over with a new session.
            continue
        break
Example #8
Source File: monitored_session.py From keras-lambda with MIT License | 5 votes |
def _create_session(self):
    """Asks the session creator for a session until one is returned.

    Each `errors.AbortedError` raised by `create_session()` is logged and the
    request is repeated immediately. Other exceptions propagate.

    Returns:
      The object returned by `self._sess_creator.create_session()`.
    """
    new_session = None
    created = False
    while not created:
        try:
            new_session = self._sess_creator.create_session()
            created = True
        except errors.AbortedError:
            logging.info('An AbortedError was raised during initialization. '
                         'It\'s most likely due to a preemption in a connected '
                         'worker/ps. A new session will be created.')
    return new_session
Example #9
Source File: basic_loops.py From Serverless-Deep-Learning-with-TensorFlow-and-AWS-Lambda with MIT License | 5 votes |
def basic_train_loop(supervisor, train_step_fn, args=None, kwargs=None,
                     master=""):
    """Drives a training step function until the supervisor asks to stop.

    `train_step_fn` is invoked in a loop as:

    ```python
    train_step_fn(session, *args, **kwargs)
    ```

    The session comes from `supervisor.managed_session(master)`. Whenever
    `errors.AbortedError` is raised, the managed session is re-entered and
    the loop continues.

    Args:
      supervisor: `tf.train.Supervisor` to run the training services.
      train_step_fn: Callable to execute one training step. Called repeatedly
        as `train_step_fn(session, *args, **kwargs)`.
      args: Optional positional arguments passed to `train_step_fn`.
      kwargs: Optional keyword arguments passed to `train_step_fn`.
      master: Master to use to create the training session. Defaults to `""`
        which causes the session to be created in the local process.
    """
    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}

    while True:
        try:
            with supervisor.managed_session(master) as session:
                while not supervisor.should_stop():
                    train_step_fn(session, *args, **kwargs)
        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one
            # of the distributed tensorflow servers.
            pass
        else:
            # The managed session ended without an abort: training is done.
            return
Example #10
Source File: monitored_session.py From Serverless-Deep-Learning-with-TensorFlow-and-AWS-Lambda with MIT License | 5 votes |
def __init__(self, session_creator, hooks, should_recover,
             stop_grace_period_secs=120):
    """Prepares the hooks and builds the underlying session.

    Every hook has `begin()` called on it before the session is created.

    Args:
      session_creator: A factory object used to create the session, typically
        a `ChiefSessionCreator` or a `WorkerSessionCreator`. A falsy value
        selects a freshly constructed `ChiefSessionCreator()`.
      hooks: An iterable of `SessionRunHook` objects. A falsy value is
        replaced by an empty list.
      should_recover: A bool. Indicates whether to recover from
        `AbortedError` and `UnavailableError` or not; when true the session
        is wrapped in a `_RecoverableSession`.
      stop_grace_period_secs: Number of seconds given to threads to stop
        after `close()` has been called. Forwarded to the coordinated
        session creator.
    """
    # Remember whether the default graph had already been finalized.
    default_graph = ops.get_default_graph()
    self._graph_was_finalized = default_graph.finalized

    if not hooks:
        hooks = []
    self._hooks = hooks
    for hook in hooks:
        hook.begin()

    # The coordinated creator receives the very same hook collection.
    coordinated = self._CoordinatedSessionCreator(
        session_creator=session_creator or ChiefSessionCreator(),
        hooks=hooks,
        stop_grace_period_secs=stop_grace_period_secs)
    self._coordinated_creator = coordinated

    self._sess = (_RecoverableSession(coordinated)
                  if should_recover
                  else coordinated.create_session())
Example #11
Source File: basic_loops.py From deep_image_model with Apache License 2.0 | 5 votes |
def basic_train_loop(supervisor, train_step_fn, args=None, kwargs=None,
                     master=""):
    """Basic training loop built on a supervisor-managed session.

    Opens `supervisor.managed_session(master)` and, until
    `supervisor.should_stop()` is true, calls:

    ```python
    train_step_fn(session, *args, **kwargs)
    ```

    An `errors.AbortedError` restarts the whole procedure with a new managed
    session.

    Args:
      supervisor: Object providing `managed_session(master)` and
        `should_stop()`; the original documentation names `tf.Supervisor`.
      train_step_fn: Callable to execute one training step. Called repeatedly
        as `train_step_fn(session, *args, **kwargs)`.
      args: Optional positional arguments passed to `train_step_fn`.
      kwargs: Optional keyword arguments passed to `train_step_fn`.
      master: Master to use to create the training session. Defaults to `""`
        which causes the session to be created in the local process.
    """
    positional = [] if args is None else args
    keywords = {} if kwargs is None else kwargs

    finished = False
    while not finished:
        try:
            with supervisor.managed_session(master) as sess:
                while not supervisor.should_stop():
                    train_step_fn(sess, *positional, **keywords)
            finished = True
        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one
            # of the distributed tensorflow servers; `finished` stays False.
            pass
Example #12
Source File: monitored_session.py From lambda-packs with MIT License | 5 votes |
def __init__(self, session_creator, hooks, should_recover,
             stop_grace_period_secs=120):
    """Initializes hooks and creates the session this object wraps.

    Args:
      session_creator: A factory object that creates the session. Typically a
        `ChiefSessionCreator` or a `WorkerSessionCreator`; when falsy, a new
        `ChiefSessionCreator()` is used.
      hooks: An iterable of `SessionRunHook` objects; when falsy, an empty
        list is stored instead. `begin()` is called on each hook before the
        session is created.
      should_recover: A bool. Indicates whether to recover from
        `AbortedError` and `UnavailableError` (by wrapping the session in
        `_RecoverableSession`) or not.
      stop_grace_period_secs: Number of seconds given to threads to stop
        after `close()` has been called. Passed on to the coordinated session
        creator.
    """
    self._graph_was_finalized = ops.get_default_graph().finalized

    active_hooks = hooks or []
    self._hooks = active_hooks
    for current_hook in active_hooks:
        current_hook.begin()

    # Build the creator first; the session flavour depends on should_recover.
    self._coordinated_creator = self._CoordinatedSessionCreator(
        session_creator=session_creator or ChiefSessionCreator(),
        hooks=active_hooks,
        stop_grace_period_secs=stop_grace_period_secs)
    if not should_recover:
        self._sess = self._coordinated_creator.create_session()
    else:
        self._sess = _RecoverableSession(self._coordinated_creator)
Example #13
Source File: failure_tolerator.py From auto-alt-text-lambda-api with MIT License | 5 votes |
def __init__(self, limit=5, init_delay=5.0, backoff_factor=2.0,
             forgive_after_seconds=6000, handled_exceptions=None):
    """Creates a FailureTolerator.

    The result will pause for `init_delay * (backoff_factor^(failure_count-1))`
    when re-entering `forgive()` after a failure.

    Args:
      limit: The maximum number of suppressed, unforgiven, failures.
      init_delay: How long to pause once the first failure is encountered.
        Defaults to five seconds.
      backoff_factor: Each subsequent failure grows the pause by this factor.
      forgive_after_seconds: Failures older than this are forgiven.
      handled_exceptions: The exceptions to forgive, converted to a tuple.
        Defaults to `(errors.AbortedError,)`.
    """
    # Resolve the default first so the stored value is always a tuple.
    if handled_exceptions is None:
        handled_exceptions = (errors.AbortedError,)

    self.limit = limit
    self.backoff = backoff_factor
    self.delay = init_delay
    self.forgive_after = forgive_after_seconds
    self.exceptions = []
    self.time_in_delay = 0.0
    self.handled = tuple(handled_exceptions)
Example #14
Source File: basic_loops.py From auto-alt-text-lambda-api with MIT License | 5 votes |
def basic_train_loop(supervisor, train_step_fn, args=None, kwargs=None,
                     master=""):
    """Repeatedly invokes `train_step_fn` inside a supervisor-managed session.

    The step function is called as:

    ```python
    train_step_fn(session, *args, **kwargs)
    ```

    The loop keeps stepping until `supervisor.should_stop()` returns true. If
    `errors.AbortedError` is raised, a new managed session is opened and
    training resumes.

    Args:
      supervisor: Object providing `managed_session(master)` and
        `should_stop()`; the original documentation names `tf.Supervisor`.
      train_step_fn: Callable to execute one training step. Called repeatedly
        as `train_step_fn(session, *args, **kwargs)`.
      args: Optional positional arguments passed to `train_step_fn`.
      kwargs: Optional keyword arguments passed to `train_step_fn`.
      master: Master to use to create the training session. Defaults to `""`
        which causes the session to be created in the local process.
    """
    extra_args = [] if args is None else args
    extra_kwargs = {} if kwargs is None else kwargs

    while True:
        try:
            with supervisor.managed_session(master) as managed:
                while not supervisor.should_stop():
                    train_step_fn(managed, *extra_args, **extra_kwargs)
        except errors.AbortedError:
            # An AbortedError indicates a restart of one of the distributed
            # tensorflow servers, so always go around again.
            continue
        else:
            break
Example #15
Source File: monitored_session.py From auto-alt-text-lambda-api with MIT License | 5 votes |
def _create_session(self):
    """Returns a session from the creator, retrying on `AbortedError`.

    `self._sess_creator.create_session()` is called in a loop. An
    `errors.AbortedError` is logged and followed by another attempt; any
    other exception is not caught here.

    Returns:
      The first session that `create_session()` returns.
    """
    while True:
        try:
            fresh_session = self._sess_creator.create_session()
        except errors.AbortedError:
            logging.info('An AbortedError was raised during initialization. '
                         'It\'s most likely due to a preemption in a connected '
                         'worker/ps. A new session will be created.')
        else:
            return fresh_session
Example #16
Source File: failure_tolerator.py From lambda-packs with MIT License | 5 votes |
def __init__(self, limit=5, init_delay=5.0, backoff_factor=2.0,
             forgive_after_seconds=6000, handled_exceptions=None):
    """Creates a FailureTolerator.

    The result will pause for `init_delay * (backoff_factor^(failure_count-1))`
    when re-entering `forgive()` after a failure.

    Args:
      limit: The maximum number of suppressed, unforgiven, failures.
      init_delay: How long to pause once the first failure is encountered.
        Defaults to five seconds.
      backoff_factor: Each subsequent failure grows the pause by this factor.
      forgive_after_seconds: Failures older than this are forgiven.
      handled_exceptions: The exceptions to forgive; stored as a tuple.
        Defaults to `(errors.AbortedError,)`.
    """
    # Runtime state: nothing recorded and no time spent waiting yet.
    self.exceptions = []
    self.time_in_delay = 0.0

    # Tunables taken directly from the arguments.
    self.limit = limit
    self.delay = init_delay
    self.backoff = backoff_factor
    self.forgive_after = forgive_after_seconds

    use_default = handled_exceptions is None
    self.handled = ((errors.AbortedError,) if use_default
                    else tuple(handled_exceptions))
Example #17
Source File: basic_loops.py From lambda-packs with MIT License | 5 votes |
def basic_train_loop(supervisor, train_step_fn, args=None, kwargs=None,
                     master=""):
    """Drives a training step function until the supervisor asks to stop.

    `train_step_fn` is invoked in a loop as:

    ```python
    train_step_fn(session, *args, **kwargs)
    ```

    The session comes from `supervisor.managed_session(master)`. Whenever
    `errors.AbortedError` is raised, the managed session is re-entered and
    the loop continues.

    Args:
      supervisor: `tf.train.Supervisor` to run the training services.
      train_step_fn: Callable to execute one training step. Called repeatedly
        as `train_step_fn(session, *args, **kwargs)`.
      args: Optional positional arguments passed to `train_step_fn`.
      kwargs: Optional keyword arguments passed to `train_step_fn`.
      master: Master to use to create the training session. Defaults to `""`
        which causes the session to be created in the local process.
    """
    call_args = [] if args is None else args
    call_kwargs = {} if kwargs is None else kwargs

    while True:
        try:
            with supervisor.managed_session(master) as session:
                while not supervisor.should_stop():
                    train_step_fn(session, *call_args, **call_kwargs)
        except errors.AbortedError:
            # Always re-run on AbortedError as it indicates a restart of one
            # of the distributed tensorflow servers.
            pass
        else:
            # No abort occurred, so there is nothing left to retry.
            return