Python tensorflow.python.framework.errors.AbortedError() Examples

The following are 17 code examples of tensorflow.python.framework.errors.AbortedError(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensorflow.python.framework.errors, or try the search function.
Example #1
Source File: monitored_session.py    From auto-alt-text-lambda-api with MIT License 6 votes vote down vote up
def __init__(self, session_creator, hooks, should_recover):
    """Sets up a Monitored or Hooked Session.

    Calls `begin()` on every hook, wraps the session creator in a
    `_CoordinatedSessionCreator`, and then either wraps that creator in a
    `_RecoverableSession` or creates a session from it right away.

    Args:
      session_creator: A factory object to create session. Typically a
        `ChiefSessionCreator` or a `WorkerSessionCreator`. A falsy value
        selects a default-constructed `ChiefSessionCreator`.
      hooks: An iterable of `SessionRunHook` objects, or a falsy value for no
        hooks. One-shot iterables such as generators are accepted.
      should_recover: A bool. Indicates whether to recover from `AbortedError`
        or not.
    """
    # Snapshot the default graph's `finalized` flag before any hook runs.
    self._graph_was_finalized = ops.get_default_graph().finalized
    # Materialize the hooks: they are iterated here and handed to the
    # coordinated creator afterwards, so a generator would otherwise arrive
    # there already exhausted.
    self._hooks = list(hooks) if hooks else []
    for h in self._hooks:
      h.begin()
    # Create the session.
    self._coordinated_creator = self._CoordinatedSessionCreator(
        session_creator=session_creator or ChiefSessionCreator(),
        hooks=self._hooks)
    if should_recover:
      self._sess = _RecoverableSession(self._coordinated_creator)
    else:
      self._sess = self._coordinated_creator.create_session()
Example #2
Source File: monitored_session.py    From keras-lambda with MIT License 6 votes vote down vote up
def run(self, fetches, feed_dict=None, options=None, run_metadata=None):
    """Runs `fetches` in the wrapped session, retrying on `AbortedError`.

    A session is created through `self._create_session()` whenever
    `self._sess` is unset.  If the run raises `errors.AbortedError`, the
    event is logged, `self.close()` is called, `self._sess` is cleared and
    the run is attempted again with a newly created session.

    Args:
      fetches: Forwarded unchanged to the wrapped session's `run()`.
      feed_dict: Forwarded as the `feed_dict` keyword argument.
      options: Forwarded as the `options` keyword argument.
      run_metadata: Forwarded as the `run_metadata` keyword argument.

    Returns:
      Whatever the wrapped session's `run()` returns.
    """
    while True:
      try:
        if not self._sess:
          self._sess = self._create_session()
        outcome = self._sess.run(
            fetches, feed_dict=feed_dict, options=options,
            run_metadata=run_metadata)
      except errors.AbortedError:
        logging.info("An AbortedError was raised. Closing the current session. "
                     "It's most likely due to a preemption in a connected "
                     "worker/ps. "
                     "A new session will be created on the next session.run().")
        # Drop the aborted session so the next pass builds a fresh one.
        self.close()
        self._sess = None
      else:
        return outcome
Example #3
Source File: monitored_session.py    From auto-alt-text-lambda-api with MIT License 6 votes vote down vote up
def run(self, fetches, feed_dict=None, options=None, run_metadata=None):
    """Runs `fetches` in the wrapped session, retrying on `AbortedError`.

    A session is created through `self._create_session()` whenever
    `self._sess` is unset.  If the run raises `errors.AbortedError`, the
    event is logged, `self.close()` is called, `self._sess` is cleared and
    the run is attempted again with a newly created session.

    Args:
      fetches: Forwarded unchanged to the wrapped session's `run()`.
      feed_dict: Forwarded as the `feed_dict` keyword argument.
      options: Forwarded as the `options` keyword argument.
      run_metadata: Forwarded as the `run_metadata` keyword argument.

    Returns:
      Whatever the wrapped session's `run()` returns.
    """
    while True:
      try:
        if not self._sess:
          self._sess = self._create_session()
        outcome = self._sess.run(
            fetches, feed_dict=feed_dict, options=options,
            run_metadata=run_metadata)
      except errors.AbortedError:
        logging.info("An AbortedError was raised. Closing the current session. "
                     "It's most likely due to a preemption in a connected "
                     "worker/ps. "
                     "A new session will be created on the next session.run().")
        # Drop the aborted session so the next pass builds a fresh one.
        self.close()
        self._sess = None
      else:
        return outcome
Example #4
Source File: monitored_session.py    From keras-lambda with MIT License 6 votes vote down vote up
def __init__(self, session_creator, hooks, should_recover):
    """Sets up a Monitored or Hooked Session.

    Calls `begin()` on every hook, wraps the session creator in a
    `_CoordinatedSessionCreator`, and then either wraps that creator in a
    `_RecoverableSession` or creates a session from it right away.

    Args:
      session_creator: A factory object to create session. Typically a
        `ChiefSessionCreator` or a `WorkerSessionCreator`. A falsy value
        selects a default-constructed `ChiefSessionCreator`.
      hooks: An iterable of `SessionRunHook` objects, or a falsy value for no
        hooks. One-shot iterables such as generators are accepted.
      should_recover: A bool. Indicates whether to recover from `AbortedError`
        or not.
    """
    # Snapshot the default graph's `finalized` flag before any hook runs.
    self._graph_was_finalized = ops.get_default_graph().finalized
    # Materialize the hooks: they are iterated here and handed to the
    # coordinated creator afterwards, so a generator would otherwise arrive
    # there already exhausted.
    self._hooks = list(hooks) if hooks else []
    for h in self._hooks:
      h.begin()
    # Create the session.
    self._coordinated_creator = self._CoordinatedSessionCreator(
        session_creator=session_creator or ChiefSessionCreator(),
        hooks=self._hooks)
    if should_recover:
      self._sess = _RecoverableSession(self._coordinated_creator)
    else:
      self._sess = self._coordinated_creator.create_session()
Example #5
Source File: monitored_session.py    From deep_image_model with Apache License 2.0 5 votes vote down vote up
def run(self, fetches, feed_dict=None, options=None, run_metadata=None):
    """Runs `fetches` in the wrapped session, retrying on `AbortedError`.

    A session is obtained from `self._sess_creator.create_session()` whenever
    `self._sess` is unset.  If the run raises `errors.AbortedError`,
    `self.close()` is called, `self._sess` is cleared and the run is
    attempted again with a newly created session.

    Args:
      fetches: Forwarded unchanged to the wrapped session's `run()`.
      feed_dict: Forwarded as the `feed_dict` keyword argument.
      options: Forwarded as the `options` keyword argument.
      run_metadata: Forwarded as the `run_metadata` keyword argument.

    Returns:
      Whatever the wrapped session's `run()` returns.
    """
    while True:
      try:
        if not self._sess:
          self._sess = self._sess_creator.create_session()
        outcome = self._sess.run(
            fetches, feed_dict=feed_dict, options=options,
            run_metadata=run_metadata)
      except errors.AbortedError:
        # Drop the aborted session so the next pass builds a fresh one.
        self.close()
        self._sess = None
      else:
        return outcome
Example #6
Source File: failure_tolerator.py    From keras-lambda with MIT License 5 votes vote down vote up
def __init__(self, limit=5, init_delay=5.0, backoff_factor=2.0,
               forgive_after_seconds=6000, handled_exceptions=None):
    """Creates a FailureTolerator.

    After a failure, re-entering `forgive()` pauses for
    `init_delay * (backoff_factor^(failure_count-1))`.

    Args:
      limit: The maximum number of suppressed, unforgiven, failures.
      init_delay: How long to pause once the first failure is
        encountered. Defaults to five seconds.
      backoff_factor: Each subsequent failure grows the pause by this factor.
      forgive_after_seconds: Failures older than this are forgiven.
      handled_exceptions: The exceptions to forgive. Defaults to
          `(errors.AbortedError,)`.

    """
    # Configuration supplied by the caller.
    self.limit, self.backoff = limit, backoff_factor
    self.delay, self.forgive_after = init_delay, forgive_after_seconds
    # Bookkeeping that starts out empty.
    self.exceptions = []
    self.time_in_delay = 0.0
    # Stored as a tuple regardless of the iterable type passed in.
    self.handled = ((errors.AbortedError,) if handled_exceptions is None
                    else tuple(handled_exceptions))
Example #7
Source File: basic_loops.py    From keras-lambda with MIT License 5 votes vote down vote up
def basic_train_loop(supervisor, train_step_fn, args=None,
                     kwargs=None, master=""):
  """Runs `train_step_fn` repeatedly until the supervisor asks to stop.

  Inside a session obtained from `supervisor.managed_session(master)`, the
  step function is invoked as:

  ```python
  train_step_fn(session, *args, **kwargs)
  ```

  for as long as `supervisor.should_stop()` is false.  The step function
  typically runs one training step in the session it is given.  If an
  `AbortedError` is raised, a new managed session is opened and the loop
  starts over.

  Args:
    supervisor: `tf.train.Supervisor` to run the training services.
    train_step_fn: Callable to execute one training step.  Called
      repeatedly as `train_step_fn(session, *args, **kwargs)`.
    args: Optional positional arguments passed to `train_step_fn`.
    kwargs: Optional keyword arguments passed to `train_step_fn`.
    master: Master to use to create the training session.  Defaults to
      `""` which causes the session to be created in the local process.
  """
  step_args = [] if args is None else args
  step_kwargs = {} if kwargs is None else kwargs
  while True:
    try:
      with supervisor.managed_session(master) as sess:
        while not supervisor.should_stop():
          train_step_fn(sess, *step_args, **step_kwargs)
    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      continue
    # The managed session ended without an AbortedError: training is done.
    break
Example #8
Source File: monitored_session.py    From keras-lambda with MIT License 5 votes vote down vote up
def _create_session(self):
    """Creates a session, retrying for as long as `AbortedError` is raised.

    Returns:
      The object returned by `self._sess_creator.create_session()`.
    """
    while True:
      try:
        new_session = self._sess_creator.create_session()
      except errors.AbortedError:
        logging.info("An AbortedError was raised during initialization. "
                     "It's most likely due to a preemption in a connected "
                     "worker/ps. A new session will be created.")
        continue
      return new_session
Example #9
Source File: basic_loops.py    From Serverless-Deep-Learning-with-TensorFlow-and-AWS-Lambda with MIT License 5 votes vote down vote up
def basic_train_loop(supervisor, train_step_fn, args=None,
                     kwargs=None, master=""):
  """Runs `train_step_fn` repeatedly until the supervisor asks to stop.

  Inside a session obtained from `supervisor.managed_session(master)`, the
  step function is invoked as:

  ```python
  train_step_fn(session, *args, **kwargs)
  ```

  for as long as `supervisor.should_stop()` is false.  The step function
  typically runs one training step in the session it is given.  If an
  `AbortedError` is raised, a new managed session is opened and the loop
  starts over.

  Args:
    supervisor: `tf.train.Supervisor` to run the training services.
    train_step_fn: Callable to execute one training step.  Called
      repeatedly as `train_step_fn(session, *args, **kwargs)`.
    args: Optional positional arguments passed to `train_step_fn`.
    kwargs: Optional keyword arguments passed to `train_step_fn`.
    master: Master to use to create the training session.  Defaults to
      `""` which causes the session to be created in the local process.
  """
  step_args = [] if args is None else args
  step_kwargs = {} if kwargs is None else kwargs
  while True:
    try:
      with supervisor.managed_session(master) as sess:
        while not supervisor.should_stop():
          train_step_fn(sess, *step_args, **step_kwargs)
    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      continue
    # The managed session ended without an AbortedError: training is done.
    break
Example #10
Source File: monitored_session.py    From Serverless-Deep-Learning-with-TensorFlow-and-AWS-Lambda with MIT License 5 votes vote down vote up
def __init__(self, session_creator, hooks, should_recover,
               stop_grace_period_secs=120):
    """Sets up a Monitored or Hooked Session.

    Calls `begin()` on every hook, wraps the session creator in a
    `_CoordinatedSessionCreator`, and then either wraps that creator in a
    `_RecoverableSession` or creates a session from it right away.

    Args:
      session_creator: A factory object to create session. Typically a
        `ChiefSessionCreator` or a `WorkerSessionCreator`. A falsy value
        selects a default-constructed `ChiefSessionCreator`.
      hooks: An iterable of `SessionRunHook` objects, or a falsy value for no
        hooks. One-shot iterables such as generators are accepted.
      should_recover: A bool. Indicates whether to recover from `AbortedError`
        and `UnavailableError` or not.
      stop_grace_period_secs: Number of seconds given to threads to stop after
        `close()` has been called.
    """
    # Snapshot the default graph's `finalized` flag before any hook runs.
    self._graph_was_finalized = ops.get_default_graph().finalized
    # Materialize the hooks: they are iterated here and handed to the
    # coordinated creator afterwards, so a generator would otherwise arrive
    # there already exhausted.
    self._hooks = list(hooks) if hooks else []
    for h in self._hooks:
      h.begin()
    # Create the session.
    self._coordinated_creator = self._CoordinatedSessionCreator(
        session_creator=session_creator or ChiefSessionCreator(),
        hooks=self._hooks,
        stop_grace_period_secs=stop_grace_period_secs)
    if should_recover:
      self._sess = _RecoverableSession(self._coordinated_creator)
    else:
      self._sess = self._coordinated_creator.create_session()
Example #11
Source File: basic_loops.py    From deep_image_model with Apache License 2.0 5 votes vote down vote up
def basic_train_loop(supervisor, train_step_fn, args=None,
                     kwargs=None, master=""):
  """Runs `train_step_fn` repeatedly until the supervisor asks to stop.

  Inside a session obtained from `supervisor.managed_session(master)`, the
  step function is invoked as:

  ```python
  train_step_fn(session, *args, **kwargs)
  ```

  for as long as `supervisor.should_stop()` is false.  The step function
  typically runs one training step in the session it is given.  If an
  `AbortedError` is raised, a new managed session is opened and the loop
  starts over.

  Args:
    supervisor: `tf.train.Supervisor` to run the training services.
    train_step_fn: Callable to execute one training step.  Called
      repeatedly as `train_step_fn(session, *args, **kwargs)`.
    args: Optional positional arguments passed to `train_step_fn`.
    kwargs: Optional keyword arguments passed to `train_step_fn`.
    master: Master to use to create the training session.  Defaults to
      `""` which causes the session to be created in the local process.
  """
  step_args = [] if args is None else args
  step_kwargs = {} if kwargs is None else kwargs
  while True:
    try:
      with supervisor.managed_session(master) as sess:
        while not supervisor.should_stop():
          train_step_fn(sess, *step_args, **step_kwargs)
    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      continue
    # The managed session ended without an AbortedError: training is done.
    break
Example #12
Source File: monitored_session.py    From lambda-packs with MIT License 5 votes vote down vote up
def __init__(self, session_creator, hooks, should_recover,
               stop_grace_period_secs=120):
    """Sets up a Monitored or Hooked Session.

    Calls `begin()` on every hook, wraps the session creator in a
    `_CoordinatedSessionCreator`, and then either wraps that creator in a
    `_RecoverableSession` or creates a session from it right away.

    Args:
      session_creator: A factory object to create session. Typically a
        `ChiefSessionCreator` or a `WorkerSessionCreator`. A falsy value
        selects a default-constructed `ChiefSessionCreator`.
      hooks: An iterable of `SessionRunHook` objects, or a falsy value for no
        hooks. One-shot iterables such as generators are accepted.
      should_recover: A bool. Indicates whether to recover from `AbortedError`
        and `UnavailableError` or not.
      stop_grace_period_secs: Number of seconds given to threads to stop after
        `close()` has been called.
    """
    # Snapshot the default graph's `finalized` flag before any hook runs.
    self._graph_was_finalized = ops.get_default_graph().finalized
    # Materialize the hooks: they are iterated here and handed to the
    # coordinated creator afterwards, so a generator would otherwise arrive
    # there already exhausted.
    self._hooks = list(hooks) if hooks else []
    for h in self._hooks:
      h.begin()
    # Create the session.
    self._coordinated_creator = self._CoordinatedSessionCreator(
        session_creator=session_creator or ChiefSessionCreator(),
        hooks=self._hooks,
        stop_grace_period_secs=stop_grace_period_secs)
    if should_recover:
      self._sess = _RecoverableSession(self._coordinated_creator)
    else:
      self._sess = self._coordinated_creator.create_session()
Example #13
Source File: failure_tolerator.py    From auto-alt-text-lambda-api with MIT License 5 votes vote down vote up
def __init__(self, limit=5, init_delay=5.0, backoff_factor=2.0,
               forgive_after_seconds=6000, handled_exceptions=None):
    """Creates a FailureTolerator.

    After a failure, re-entering `forgive()` pauses for
    `init_delay * (backoff_factor^(failure_count-1))`.

    Args:
      limit: The maximum number of suppressed, unforgiven, failures.
      init_delay: How long to pause once the first failure is
        encountered. Defaults to five seconds.
      backoff_factor: Each subsequent failure grows the pause by this factor.
      forgive_after_seconds: Failures older than this are forgiven.
      handled_exceptions: The exceptions to forgive. Defaults to
          `(errors.AbortedError,)`.

    """
    # Configuration supplied by the caller.
    self.limit, self.backoff = limit, backoff_factor
    self.delay, self.forgive_after = init_delay, forgive_after_seconds
    # Bookkeeping that starts out empty.
    self.exceptions = []
    self.time_in_delay = 0.0
    # Stored as a tuple regardless of the iterable type passed in.
    self.handled = ((errors.AbortedError,) if handled_exceptions is None
                    else tuple(handled_exceptions))
Example #14
Source File: basic_loops.py    From auto-alt-text-lambda-api with MIT License 5 votes vote down vote up
def basic_train_loop(supervisor, train_step_fn, args=None,
                     kwargs=None, master=""):
  """Runs `train_step_fn` repeatedly until the supervisor asks to stop.

  Inside a session obtained from `supervisor.managed_session(master)`, the
  step function is invoked as:

  ```python
  train_step_fn(session, *args, **kwargs)
  ```

  for as long as `supervisor.should_stop()` is false.  The step function
  typically runs one training step in the session it is given.  If an
  `AbortedError` is raised, a new managed session is opened and the loop
  starts over.

  Args:
    supervisor: `tf.train.Supervisor` to run the training services.
    train_step_fn: Callable to execute one training step.  Called
      repeatedly as `train_step_fn(session, *args, **kwargs)`.
    args: Optional positional arguments passed to `train_step_fn`.
    kwargs: Optional keyword arguments passed to `train_step_fn`.
    master: Master to use to create the training session.  Defaults to
      `""` which causes the session to be created in the local process.
  """
  step_args = [] if args is None else args
  step_kwargs = {} if kwargs is None else kwargs
  while True:
    try:
      with supervisor.managed_session(master) as sess:
        while not supervisor.should_stop():
          train_step_fn(sess, *step_args, **step_kwargs)
    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      continue
    # The managed session ended without an AbortedError: training is done.
    break
Example #15
Source File: monitored_session.py    From auto-alt-text-lambda-api with MIT License 5 votes vote down vote up
def _create_session(self):
    """Creates a session, retrying for as long as `AbortedError` is raised.

    Returns:
      The object returned by `self._sess_creator.create_session()`.
    """
    while True:
      try:
        new_session = self._sess_creator.create_session()
      except errors.AbortedError:
        logging.info("An AbortedError was raised during initialization. "
                     "It's most likely due to a preemption in a connected "
                     "worker/ps. A new session will be created.")
        continue
      return new_session
Example #16
Source File: failure_tolerator.py    From lambda-packs with MIT License 5 votes vote down vote up
def __init__(self, limit=5, init_delay=5.0, backoff_factor=2.0,
               forgive_after_seconds=6000, handled_exceptions=None):
    """Creates a FailureTolerator.

    After a failure, re-entering `forgive()` pauses for
    `init_delay * (backoff_factor^(failure_count-1))`.

    Args:
      limit: The maximum number of suppressed, unforgiven, failures.
      init_delay: How long to pause once the first failure is
        encountered. Defaults to five seconds.
      backoff_factor: Each subsequent failure grows the pause by this factor.
      forgive_after_seconds: Failures older than this are forgiven.
      handled_exceptions: The exceptions to forgive. Defaults to
          `(errors.AbortedError,)`.

    """
    # Configuration supplied by the caller.
    self.limit, self.backoff = limit, backoff_factor
    self.delay, self.forgive_after = init_delay, forgive_after_seconds
    # Bookkeeping that starts out empty.
    self.exceptions = []
    self.time_in_delay = 0.0
    # Stored as a tuple regardless of the iterable type passed in.
    self.handled = ((errors.AbortedError,) if handled_exceptions is None
                    else tuple(handled_exceptions))
Example #17
Source File: basic_loops.py    From lambda-packs with MIT License 5 votes vote down vote up
def basic_train_loop(supervisor, train_step_fn, args=None,
                     kwargs=None, master=""):
  """Runs `train_step_fn` repeatedly until the supervisor asks to stop.

  Inside a session obtained from `supervisor.managed_session(master)`, the
  step function is invoked as:

  ```python
  train_step_fn(session, *args, **kwargs)
  ```

  for as long as `supervisor.should_stop()` is false.  The step function
  typically runs one training step in the session it is given.  If an
  `AbortedError` is raised, a new managed session is opened and the loop
  starts over.

  Args:
    supervisor: `tf.train.Supervisor` to run the training services.
    train_step_fn: Callable to execute one training step.  Called
      repeatedly as `train_step_fn(session, *args, **kwargs)`.
    args: Optional positional arguments passed to `train_step_fn`.
    kwargs: Optional keyword arguments passed to `train_step_fn`.
    master: Master to use to create the training session.  Defaults to
      `""` which causes the session to be created in the local process.
  """
  step_args = [] if args is None else args
  step_kwargs = {} if kwargs is None else kwargs
  while True:
    try:
      with supervisor.managed_session(master) as sess:
        while not supervisor.should_stop():
          train_step_fn(sess, *step_args, **step_kwargs)
    except errors.AbortedError:
      # Always re-run on AbortedError as it indicates a restart of one of the
      # distributed tensorflow servers.
      continue
    # The managed session ended without an AbortedError: training is done.
    break