from builtins import super from collections import namedtuple from concurrent.futures import CancelledError from io import StringIO import json import os import pickle from tempfile import TemporaryDirectory from unittest import mock import joblib try: import pandas as pd HAS_PANDAS = True except ImportError: HAS_PANDAS = False try: import feather # NOQA HAS_FEATHER = True except ImportError: HAS_FEATHER = False try: from sklearn.linear_model import LogisticRegression HAS_SKLEARN = True except ImportError: HAS_SKLEARN = False try: import numpy as np HAS_NUMPY = True except ImportError: HAS_NUMPY = False from civis import APIClient from civis._utils import camel_to_snake from civis.base import CivisAPIError, CivisJobFailure from civis.response import Response from civis.tests import create_client_mock import pytest from civis.ml import _model TRAIN_ID_PROD, PREDICT_ID_PROD, REGISTRATION_ID_PROD = 654, 321, 789 TRAIN_ID_OLD, PREDICT_ID_OLD = 123, 456 TEST_TEMPLATE_ID_ALIAS_OBJECTS = [ # Version-less aliases for the latest production code. Response(dict(alias='civis-civisml-training', object_id=TRAIN_ID_PROD)), Response(dict(alias='civis-civisml-prediction', object_id=PREDICT_ID_PROD)), # noqa Response(dict(alias='civis-civisml-registration', object_id=789)), # Versioned aliases. Pretend that v2.3 is the latest production version. Response(dict(alias='civis-civisml-training-v2-3', object_id=TRAIN_ID_PROD)), # noqa Response(dict(alias='civis-civisml-prediction-v2-3', object_id=PREDICT_ID_PROD)), # noqa Response(dict(alias='civis-civisml-registration-v2-3', object_id=789)), # Versioned aliases. Versions older than v2.2 don't have registration ID. Response(dict(alias='civis-civisml-training-v1-4', object_id=TRAIN_ID_OLD)), # noqa Response(dict(alias='civis-civisml-prediction-v1-4', object_id=PREDICT_ID_OLD)), # noqa # Other special versions, e.g., "dev" Response(dict(alias='civis-civisml-training-dev', object_id=345)), Response(dict(alias='civis-civisml-prediction-dev', object_id=678)), Response(dict(alias='civis-civisml-registration-dev', object_id=901)), ] TEST_TEMPLATE_IDS = { # Must match TEST_TEMPLATE_ID_ALIAS_OBJECTS None: {'training': TRAIN_ID_PROD, 'prediction': PREDICT_ID_PROD, 'registration': REGISTRATION_ID_PROD}, # noqa 'v2.3': {'training': TRAIN_ID_PROD, 'prediction': PREDICT_ID_PROD, 'registration': REGISTRATION_ID_PROD}, # noqa 'v1.4': {'training': TRAIN_ID_OLD, 'prediction': PREDICT_ID_OLD, 'registration': None}, # noqa 'dev': {'training': 345, 'prediction': 678, 'registration': 901}, } def setup_client_mock(script_id=-10, run_id=100, state='succeeded', run_outputs=None): """Return a Mock set up for use in testing container scripts Parameters ---------- script_id: int Mock-create containers with this ID when calling `post_containers` or `post_containers_runs`. run_id: int Mock-create runs with this ID when calling `post_containers_runs`. state: str, optional The reported state of the container run run_outputs: list, optional List of Response objects returned as run outputs Returns ------- `unittest.mock.Mock` With scripts endpoints `post_containers`, `post_containers_runs`, `post_cancel`, and `get_containers_runs` set up. """ c = mock.Mock() c.__class__ = APIClient mock_container = Response({'id': script_id}) c.scripts.post_containers.return_value = mock_container mock_container_run_start = Response({'id': run_id, 'container_id': script_id, 'state': 'queued'}) mock_container_run = Response({'id': run_id, 'container_id': script_id, 'state': state}) c.scripts.post_containers_runs.return_value = mock_container_run_start c.scripts.get_containers_runs.return_value = mock_container_run c.scripts.list_containers_runs_outputs.return_value = (run_outputs or []) c.scripts.list_containers_runs_logs.return_value = [] def change_state_to_cancelled(script_id): mock_container_run.state = "cancelled" return mock_container_run c.scripts.post_cancel.side_effect = change_state_to_cancelled # Avoid channels endpoint while testing here del c.channels return c def test_check_is_fit_exception(): mock_pipe = mock.MagicMock() mock_pipe.train_result_ = None @_model._check_fit_initiated def foo(arg): return 7 with pytest.raises(ValueError): foo(mock_pipe) def test_check_is_fit(): mock_pipe = mock.MagicMock() mock_pipe.train_result_ = True @_model._check_fit_initiated def foo(arg): return 7 assert foo(mock_pipe) == 7 @mock.patch.object(_model, "futures") def test_block_and_handle_missing_exception(mock_fut): @_model._block_and_handle_missing def foo(arg): raise FileNotFoundError mock_super = mock.Mock() mock_super.exception.return_value = Exception with pytest.raises(Exception): foo(mock_super) mock_super.exception.return_value = None with pytest.raises(FileNotFoundError): foo(mock_super) @mock.patch.object(_model, "futures") def test_block_and_handle_missing(mock_fut): @_model._block_and_handle_missing def foo(arg): return 7 assert foo('bar') == 7 @mock.patch.object(_model.cio, 'file_to_civis', return_value=-11) def test_stash_local_data_from_file(mock_file): with TemporaryDirectory() as temp_dir: fname = os.path.join(temp_dir, 'tempfile') with open(fname, 'wt') as _fout: _fout.write("a,b,c\n1,2,3\n") assert _model._stash_local_file(fname) == -11 mock_file.assert_called_once_with(mock.ANY, name='modelpipeline_data.csv', client=mock.ANY) @pytest.mark.skipif(not HAS_PANDAS, reason="pandas not installed") @pytest.mark.skipif(not HAS_NUMPY, reason="numpy not installed") def test_stash_local_dataframe_multiindex_err(): arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] df = pd.DataFrame(np.random.randn(8, 4), index=arrays) with pytest.raises(TypeError): _model._stash_local_dataframe(df, 10000) @pytest.mark.skipif(not HAS_PANDAS, reason="pandas not installed") @mock.patch.object(_model.cio, 'file_to_civis', return_value=-11) def test_stash_local_data_from_dataframe_csv(mock_file): df = pd.DataFrame({'a': [1], 'b': [2]}) assert _model._stash_dataframe_as_csv(df, mock.Mock()) == -11 mock_file.assert_called_once_with(mock.ANY, name='modelpipeline_data.csv', client=mock.ANY) assert isinstance(mock_file.call_args[0][0], StringIO) @pytest.mark.skipif(not HAS_PANDAS, reason="pandas not installed") @pytest.mark.skipif(not HAS_FEATHER, reason="feather not installed") @mock.patch.object(_model.cio, 'file_to_civis', return_value=-11) def test_stash_local_data_from_dataframe_feather(mock_file): df = pd.DataFrame({'a': [1], 'b': [2]}) assert _model._stash_local_dataframe(df, 10000) == -11 mock_file.assert_called_once_with( mock.ANY, name='modelpipeline_data.feather', client=mock.ANY) @mock.patch.object(_model, '_stash_dataframe_as_feather', autospec=True) @mock.patch.object(_model, '_stash_dataframe_as_csv', autospec=True) def test_stash_local_dataframe_format_select_csv(mock_csv, mock_feather): # Always store data as a CSV for CivisML versions <= 2.0. _model._stash_local_dataframe('df', 9969) assert mock_feather.call_count == 0 assert mock_csv.call_count == 1 @mock.patch.object(_model, '_stash_dataframe_as_feather', autospec=True) @mock.patch.object(_model, '_stash_dataframe_as_csv', autospec=True) def test_stash_local_dataframe_format_select_feather(mock_csv, mock_feather): # Try to store data as Feather for CivisML versions > 2.0. _model._stash_local_dataframe('df', 10050) assert mock_feather.call_count == 1 assert mock_csv.call_count == 0 @mock.patch.object(_model, '_retrieve_file', autospec=True) def test_load_estimator(mock_retrieve): obj = {'spam': 'eggs'} def _retrieve_json(fname, job_id, run_id, local_dir, client=None): full_name = os.path.join(local_dir, fname) joblib.dump(obj, full_name) return full_name mock_retrieve.side_effect = _retrieve_json out = _model._load_estimator(13, 17, 'fname') assert out == obj @mock.patch.object(_model.cio, 'file_to_dataframe', autospec=True) @mock.patch.object(_model.cio, 'file_id_from_run_output', autospec=True) def test_load_table_from_outputs(mock_fid, mock_f2df): # Test that _load_table_from_outputs is calling functions # correctly. Let `autospec` catch errors in arguments being passed. mock_client = setup_client_mock() _model._load_table_from_outputs(1, 2, 'fname', client=mock_client) def test_show_civisml_warnings(): warn_list = ["/path:13: UserWarning: A message\n", "/module:42: RuntimeWarning: Profundity\n"] with pytest.warns(UserWarning) as warns: _model._show_civisml_warnings(warn_list) assert len(warns.list) == 2 assert str(warns.list[0].message) == "A message" assert str(warns.list[1].message) == "Profundity" def test_show_civisml_warnings_error(): # If the warnings-parser fails, we should still get a sensible warning. warn_list = ["/path UserWarning: A message\n"] # Malformed warning message with pytest.warns(RuntimeWarning) as warns: _model._show_civisml_warnings(warn_list) assert len(warns.list) == 1 assert warn_list[0] in str(warns.list[0].message) assert "Remote warning from CivisML" in str(warns.list[0].message) ################################### # Tests of ModelFuture below here # ################################### @mock.patch.object(_model.ModelFuture, "_set_model_exception", autospec=True) @mock.patch.object(_model.ModelFuture, "add_done_callback", autospec=True) def test_modelfuture_constructor(mock_adc, mock_spe): c = setup_client_mock(7, 17) mf = _model.ModelFuture(job_id=7, run_id=17, client=c) assert mf.is_training is True assert mf.train_run_id == 17 assert mf.train_job_id == 7 mf = _model.ModelFuture(job_id=7, run_id=17, train_job_id=23, train_run_id=29, client=c) assert mf.is_training is False assert mf.train_run_id == 29 assert mf.train_job_id == 23 @mock.patch.object(_model.cio, "file_id_from_run_output", mock.Mock(return_value=11, spec_set=True)) @mock.patch.object(_model.cio, "file_to_json", mock.Mock(return_value={'run': {'status': 'succeeded'}})) @mock.patch.object(_model, 'APIClient', return_value=setup_client_mock()) def test_modelfuture_pickle_smoke(mock_client): mf = _model.ModelFuture(job_id=7, run_id=13, client=setup_client_mock()) mf.result() mf_pickle = pickle.dumps(mf) pickle.loads(mf_pickle) def test_set_model_exception_metadata_exception(): """Tests cases where accessing metadata throws exceptions """ # State "running" prevents termination when the object is created. mock_client = setup_client_mock(1, 2, state='running') class ModelFutureRaiseExc(_model.ModelFuture): def __init__(self, exc, *args, **kwargs): self.__exc = exc super().__init__(*args, **kwargs) @property def metadata(self): raise self.__exc('What a spectacular failure, you say!') # exception types get caught! for exc in [FileNotFoundError, CivisJobFailure, CancelledError]: fut = ModelFutureRaiseExc(exc, 1, 2, client=mock_client) _model.ModelFuture._set_model_exception(fut) with pytest.warns(UserWarning): # The KeyError is caught, but sends a warning fut = ModelFutureRaiseExc(KeyError, 1, 2, client=mock_client) _model.ModelFuture._set_model_exception(fut) fut = ModelFutureRaiseExc(RuntimeError, 1, 2, client=mock_client) with pytest.raises(RuntimeError): _model.ModelFuture._set_model_exception(fut) def test_set_model_exception_memory_error(): mock_client = setup_client_mock(1, 2, state='failed') err_msg = ('Process ran out of its allowed 3000 MiB of ' 'memory and was killed.') logs = [{'created_at': '2017-05-10T12:00:00.000Z', 'id': 10005, 'level': 'error', 'message': 'Failed'}, {'created_at': '2017-05-10T12:00:00.000Z', 'id': 10003, 'level': 'error', 'message': 'Error on job: Process ended with an ' 'error, exiting: 137.'}, {'created_at': '2017-05-10T12:00:00.000Z', 'id': 10000, 'level': 'error', 'message': err_msg}] mock_client.scripts.list_containers_runs_logs.return_value = logs fut = _model.ModelFuture(1, 2, client=mock_client) with pytest.raises(MemoryError) as err: fut.result() assert str(err.value) == err_msg def test_set_model_exception_unknown_error(): # If we really don't recognize the error, at least give the # user a few lines of logs so they can maybe figure it out themselves. mock_client = setup_client_mock(1, 2, state='failed') logs = [{'created_at': '2017-05-10T12:00:00.000Z', 'id': 10005, 'level': 'error', 'message': 'Failed'}, {'created_at': '2017-05-10T12:00:00.000Z', 'id': 10003, 'level': 'error', 'message': 'Error on job: Process ended with an ' 'error, exiting: 137.'}, {'created_at': '2017-05-10T12:00:00.000Z', 'id': 10000, 'level': 'error', 'message': 'Oops'}] err_msg = '\n'.join([x['message'] for x in logs]) mock_client.scripts.list_containers_runs_logs.return_value = logs fut = _model.ModelFuture(1, 2, client=mock_client) with pytest.raises(CivisJobFailure) as err: fut.result() assert str(err.value).startswith(err_msg) @mock.patch.object(_model.cio, "file_to_json", autospec=True, return_value={'run': {'status': 'succeeded', 'stack_trace': None}}) def test_set_model_exception_no_exception(mock_f2j): # If nothing went wrong, we shouldn't set an exception ro = [{'name': 'model_info.json', 'object_id': 137, 'object_type': 'File'}, {'name': 'metrics.json', 'object_id': 139, 'object_type': 'File'}] ro = [Response(o) for o in ro] mock_client = setup_client_mock(1, 2, state='succeeded', run_outputs=ro) fut = _model.ModelFuture(1, 2, client=mock_client) assert fut.exception() is None class ModelFutureStub: def __init__(self, exc, trn, val): self._exception = exc self.metadata = trn self.validation_metadata = val self.is_training = True self._exception_handled = False def set_exception(self, exc): self._exception = exc def _test_set_model_exc(trn, val, exc): fut = ModelFutureStub(exc, trn, val) assert isinstance(fut._exception, type(exc)) _model.ModelFuture._set_model_exception(fut) assert isinstance(fut._exception, _model.ModelError) assert 'this is a trace' in str(fut._exception) # don't change attribute if it's already a ModelError fut._exception = _model.ModelError('undecipherable message') _model.ModelFuture._set_model_exception(fut) assert isinstance(fut._exception, _model.ModelError) assert 'undecipherable message' in str(fut._exception) def test_set_model_exception_training_metadata(): trn = {'run': {'status': 'exception', 'stack_trace': 'this is a trace'}} val = None exc = FileNotFoundError('Look, no mocks!') _test_set_model_exc(trn, val, exc) def test_set_model_exception_validation_metadata(): trn = {'run': {'status': 'succeeded'}} val = {'run': {'status': 'exception', 'stack_trace': 'this is a trace'}} exc = FileNotFoundError('Hahaha, zero mocks here!') _test_set_model_exc(trn, val, exc) @mock.patch.object(_model.cio, "file_to_json", mock.Mock(return_value='bar', spec_set=True)) @mock.patch.object(_model.ModelFuture, "_set_model_exception", lambda *args: None) def test_getstate(): c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) ret = mf.__getstate__() assert ret['_done_callbacks'] == [] assert not ret['_self_polling_executor'] assert 'client' not in ret assert 'poller' not in ret assert '_condition' not in ret @mock.patch.object(_model.cio, "file_id_from_run_output", mock.Mock(return_value=11, spec_set=True)) @mock.patch.object(_model.cio, "file_to_json", mock.Mock(spec_set=True, return_value={'run': {'status': 'foo'}})) def test_state(): c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) ret = mf.state assert ret == 'foo' c.scripts.get_containers_runs.return_value = Response({'id': 7, 'container_id': 3, 'state': 'failed'}) mf = _model.ModelFuture(3, 7, client=c) assert mf.state == 'failed' @mock.patch.object(_model.ModelFuture, "metadata", return_value={'run': {'configuration': {'data': {'primary_key': 'foo'}}}}) @mock.patch.object(_model, "_load_table_from_outputs", return_value='bar') @mock.patch.object(_model.ModelFuture, "result") @mock.patch.object(_model.ModelFuture, "_set_model_exception", mock.Mock()) def test_table(mock_res, mock_lt, mock_meta): c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) assert mf.table == 'bar' mock_lt.assert_called_once_with(3, 7, 'predictions.csv', index_col=0, client=c) @mock.patch.object(_model.ModelFuture, "metadata", return_value={'run': {'configuration': {'data': {'primary_key': None}}}}) @mock.patch.object(_model, "_load_table_from_outputs", return_value='bar') @mock.patch.object(_model.ModelFuture, "result") @mock.patch.object(_model.ModelFuture, "_set_model_exception", mock.Mock()) def test_table_no_pkey(mock_res, mock_lt, mock_meta): c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) assert mf.table == 'bar' mock_lt.assert_called_once_with(3, 7, 'predictions.csv', index_col=False, client=c) @mock.patch.object(_model.ModelFuture, "metadata", return_value={'run': {'configuration': {'data': {'primary_key': 'foo'}}}}) @mock.patch.object(_model, "_load_table_from_outputs") @mock.patch.object(_model.ModelFuture, "result") @mock.patch.object(_model.ModelFuture, "_set_model_exception", mock.Mock()) def test_table_None(mock_res, mock_lt, mock_meta): mock_lt.side_effect = FileNotFoundError() c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) assert mf.table is None mock_lt.assert_called_once_with(3, 7, 'predictions.csv', index_col=0, client=c) @mock.patch.object(_model.cio, "file_id_from_run_output", mock.Mock(return_value=11, spec_set=True)) @mock.patch.object(_model.cio, "file_to_json", return_value={'foo': 'bar'}) @mock.patch.object(_model.ModelFuture, "_set_model_exception") def test_metadata(mock_spec, mock_f2j): c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) assert mf.metadata == {'foo': 'bar'} mock_f2j.assert_called_once_with(11, client=c) def test_train_data_fname(): # swap out the poller with a simple function that accepts *args, **kwargs # and returns a simple successful Response object def poller(*args, **kwargs): return Response({'state': 'succeeded'}) mock_client = mock.MagicMock() mock_client.scripts.get_containers_runs = poller mf = _model.ModelFuture(job_id=1, run_id=2, client=mock_client) path = '/green/eggs/and/ham' training_meta = {'run': {'configuration': {'data': {'location': path}}}} mf._train_metadata = training_meta assert mf.train_data_fname == 'ham' @mock.patch.object(_model, 'cio', autospec=True) def test_train_data_(mock_cio): def poller(*args, **kwargs): return Response({'state': 'succeeded'}) mock_client = mock.MagicMock() mock_client.scripts.get_containers_runs = poller path = '/green/eggs/and/ham' training_meta = {'run': {'configuration': {'data': {'location': path}}, 'status': 'succeeded'}} mock_cio.file_to_json.return_value = training_meta mf = _model.ModelFuture(job_id=1, run_id=2, train_job_id=11, train_run_id=13, client=mock_client) assert mf.training_metadata == training_meta mock_cio.file_id_from_run_output.assert_called_with( 'model_info.json', 11, 13, client=mock_client) @mock.patch.object(_model, "_load_table_from_outputs", autospec=True) def test_train_data_exc_handling(mock_load_table): def poller(*args, **kwargs): return Response({'state': 'succeeded'}) mock_client = mock.MagicMock() mock_client.scripts.get_containers_runs = poller mf = _model.ModelFuture(job_id=1, run_id=2, client=mock_client) mf._train_data_fname = 'placeholder.csv' # check we catch 404 error and raise some intelligible r = Response({'content': None, 'status_code': 404, 'reason': None}) mock_load_table.side_effect = [CivisAPIError(r)] with pytest.raises(ValueError): mf.train_data @mock.patch.object(_model, "_load_estimator", return_value='spam') @mock.patch.object(_model.ModelFuture, "_set_model_exception", mock.Mock()) def test_estimator(mock_le): c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) assert mock_le.call_count == 0, "Estimator retrieval is lazy." assert mf.estimator == 'spam' assert mock_le.call_count == 1 assert mf.estimator == 'spam' assert mock_le.call_count == 1,\ "The Estimator is only downloaded once and cached." @mock.patch.object(_model.cio, "file_id_from_run_output", autospec=True) @mock.patch.object(_model.cio, "file_to_json", return_value='foo', autospec=True) @mock.patch.object(_model.ModelFuture, "_set_model_exception") def test_validation_metadata_training(mock_spe, mock_f2f, mock_file_id_from_run_output): mock_file_id_from_run_output.return_value = 11 c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) assert mf.validation_metadata == 'foo' mock_f2f.assert_called_once_with(11, client=c) mock_file_id_from_run_output.assert_called_with('metrics.json', 3, 7, client=mock.ANY) @mock.patch.object(_model.cio, "file_id_from_run_output", autospec=True) @mock.patch.object(_model.cio, "file_to_json", autospec=True) @mock.patch.object(_model.ModelFuture, "_set_model_exception") def test_validation_metadata_missing(mock_spe, mock_f2f, mock_file_id_from_run_output): # Make sure that missing validation metadata doesn't cause an error mock_file_id_from_run_output.side_effect = FileNotFoundError c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) assert mf.validation_metadata is None assert mf.metrics is None assert mock_f2f.call_count == 0 assert mock_file_id_from_run_output.call_count == 1 @mock.patch.object(_model.cio, "file_id_from_run_output", autospec=True) @mock.patch.object(_model.cio, "file_to_json", return_value='foo', autospec=True) @mock.patch.object(_model.ModelFuture, "_set_model_exception") def test_validation_metadata_prediction(mock_spe, mock_f2f, mock_file_id_from_run_output): mock_file_id_from_run_output.return_value = 11 c = setup_client_mock(3, 7) mf = _model.ModelFuture(1, 2, 3, 7, client=c) assert mf.validation_metadata == 'foo' mock_f2f.assert_called_once_with(11, client=c) mock_file_id_from_run_output.assert_called_with('metrics.json', 3, 7, client=mock.ANY) @mock.patch.object(_model.cio, "file_id_from_run_output", autospec=True) @mock.patch.object(_model.cio, "file_to_json", mock.MagicMock( return_value={'metrics': 'foo', 'run': {'status': 'succeeded'}})) def test_metrics_training(mock_file_id_from_run_output): mock_file_id_from_run_output.return_value = 11 c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) assert mf.metrics == 'foo' mock_file_id_from_run_output.assert_called_with('metrics.json', 3, 7, client=mock.ANY) @mock.patch.object(_model.cio, "file_id_from_run_output", autospec=True) @mock.patch.object(_model.cio, "file_to_json") def test_metrics_training_None(mock_file_to_json, mock_file_id_from_run_output): mock_file_to_json.return_value = mock.MagicMock( return_value={'metrics': 'foo', 'run': {'status': 'succeeded'}}) mock_file_id_from_run_output.return_value = 11 c = setup_client_mock(3, 7) mf = _model.ModelFuture(3, 7, client=c) # override validation metadata to be None, as though we ran # a train job without validation mf._val_metadata = None mock_file_to_json.return_value = None assert mf.metrics is None mock_file_id_from_run_output.assert_called_with('metrics.json', 3, 7, client=mock.ANY) @mock.patch.object(_model.cio, "file_id_from_run_output", autospec=True) @mock.patch.object(_model.cio, "file_to_json", mock.MagicMock( return_value={'metrics': 'foo', 'run': {'status': 'succeeded'}})) def test_metrics_prediction(mock_file_id_from_run_output): mock_file_id_from_run_output.return_value = 11 c = setup_client_mock(3, 7) mf = _model.ModelFuture(1, 2, 3, 7, client=c) assert mf.metrics == 'foo' mock_file_id_from_run_output.assert_called_with('metrics.json', 3, 7, client=mock.ANY) ############################################### # Tests of utilities for CivisML template IDs # ############################################### @pytest.mark.parametrize( 'alias, expected_job_type, expected_version', [ ('civis-civisml-training', 'training', None), ('civis-civisml-training-v2-3', 'training', 'v2.3'), ('civis-civisml-training-dev', 'training', 'dev'), ('civis-civisml-training-foo-bar', 'training', 'foo-bar'), ], ) def test__get_job_type_version(alias, expected_job_type, expected_version): actual_job_type, actual_version = _model._get_job_type_version(alias) assert actual_job_type == expected_job_type assert actual_version == expected_version @pytest.mark.parametrize( 'invalid_alias', ['foobar', 'civis-civisml', 'civis-civisml-', 'civis-civisml-training-', 'civis-civisml-training-foobar-'] ) def test__get_job_type_version_invalid_alias(invalid_alias): with pytest.raises(ValueError): _model._get_job_type_version(invalid_alias) def test__get_template_ids_all_versions(): m_client = create_client_mock() m_client.aliases.list.return_value = TEST_TEMPLATE_ID_ALIAS_OBJECTS actual_template_ids = _model._get_template_ids_all_versions(m_client) expected_template_ids = TEST_TEMPLATE_IDS assert actual_template_ids == expected_template_ids @mock.patch.object(_model, '_get_template_ids_all_versions', mock.Mock(return_value=TEST_TEMPLATE_IDS)) @pytest.mark.parametrize( 'version, train_id, predict_id, register_id', [(version, ids['training'], ids['prediction'], ids['registration']) for version, ids in TEST_TEMPLATE_IDS.items()] ) def test__get_template_ids(version, train_id, predict_id, register_id): actual_train_id, actual_predict_id, actual_register_id = ( _model._get_template_ids(version, mock.ANY) ) assert actual_train_id == train_id assert actual_predict_id == predict_id assert actual_register_id == register_id @mock.patch.object(_model, '_get_template_ids_all_versions', mock.Mock(return_value=TEST_TEMPLATE_IDS)) def test__get_template_ids_invalid_version(): with pytest.raises(ValueError): _model._get_template_ids('not_a_version', mock.ANY) ##################################### # Tests of ModelPipeline below here # ##################################### @pytest.fixture def mp_setup(): mock_api = setup_client_mock() mock_api.aliases.list.return_value = TEST_TEMPLATE_ID_ALIAS_OBJECTS mp = _model.ModelPipeline('wf', 'dv', client=mock_api) return mp @pytest.mark.skipif(not HAS_SKLEARN, reason="scikit-learn not installed") @mock.patch.object(_model, '_get_template_ids_all_versions', mock.Mock(return_value=TEST_TEMPLATE_IDS)) def test_modelpipeline_etl_init_err(): # If users don't have access to >= v2.0 temlates, then passing # `etl` to a new ModelPipeline should produce a NotImplementedError. mock_client = mock.MagicMock() with pytest.raises(NotImplementedError): _model.ModelPipeline(LogisticRegression(), 'test', etl=LogisticRegression(), client=mock_client) @mock.patch.object(_model, '_get_template_ids_all_versions', mock.Mock(return_value=TEST_TEMPLATE_IDS)) @mock.patch.object(_model, 'ModelFuture') def test_modelpipeline_classmethod_constructor_errors(mock_future): # catch 404 error if model isn't found and throw ValueError mock_client = mock.Mock() response = namedtuple('Reponse', ['content', 'response', 'reason', 'status_code'])(False, None, None, 404) mock_future.side_effect = CivisAPIError(response) with pytest.raises(ValueError): _model.ModelPipeline.from_existing(1, 1, client=mock_client) def _container_response_stub(from_template_id=8387): arguments = { 'MODEL': 'sparse_logistic', 'TARGET_COLUMN': 'brushes_teeth_much', 'PRIMARY_KEY': 'voterbase_id', 'CALIBRATION': 'sigmoid', 'EXCLUDE_COLS': 'dog cat lizard frog', 'CVPARAMS': '{}', 'PARAMS': '{}', 'REQUIRED_CPU': 1000, 'REQUIRED_MEMORY': 9999, 'REQUIRED_DISK_SPACE': -20, 'DEPENDENCIES': 'A B C D', 'GIT_CRED': 9876 } notifications = { 'urls': [], 'failureEmailAddresses': [], 'failureOn': True, 'stallWarningMinutes': None, 'successEmailAddresses': [], 'successEmailBody': None, 'successEmailSubject': None, 'successOn': True } return Response(dict(arguments=arguments, notifications=notifications, required_resources={}, docker_image_tag=None, docker_command=None, repo_http_uri=None, repo_ref=None, name='Civis Model Train', from_template_id=from_template_id, )) @mock.patch.object(_model, '_get_template_ids_all_versions', mock.Mock(return_value=TEST_TEMPLATE_IDS)) @mock.patch.object(_model, 'ModelFuture') def test_modelpipeline_classmethod_constructor(mock_future): mock_client = mock.Mock() mock_client.scripts.get_containers.return_value = \ container = _container_response_stub(TRAIN_ID_PROD) mock_client.credentials.get.return_value = Response({'name': 'Token'}) resources = {'REQUIRED_CPU': 1000, 'REQUIRED_MEMORY': 9999, 'REQUIRED_DISK_SPACE': -20} # test everything is working fine mp = _model.ModelPipeline.from_existing(1, 1, client=mock_client) assert isinstance(mp, _model.ModelPipeline) assert mp.dependent_variable == [container.arguments['TARGET_COLUMN']] assert mp.primary_key == container.arguments['PRIMARY_KEY'] excluded = container.arguments.get('EXCLUDE_COLS', None) assert mp.excluded_columns == excluded.split() if excluded else None assert mp.model == container.arguments['MODEL'] assert mp.calibration == container.arguments['CALIBRATION'] assert mp.cv_params == json.loads(container.arguments['CVPARAMS']) assert mp.parameters == json.loads(container.arguments['PARAMS']) assert mp.job_resources == resources assert mp.model_name == container.name[:-6] assert mp.notifications == {camel_to_snake(key): val for key, val in container.notifications.items()} deps = container.arguments.get('DEPENDENCIES', None) assert mp.dependencies == deps.split() if deps else None assert mp.git_token_name == 'Token' @mock.patch.object(_model, '_get_template_ids_all_versions', mock.Mock(return_value=TEST_TEMPLATE_IDS)) @mock.patch.object(_model, 'ModelFuture') def test_modelpipeline_classmethod_constructor_defaults(mock_future): # checks that it works with a registration template and train template for template_id in [TRAIN_ID_PROD, REGISTRATION_ID_PROD]: container_response_stub = _container_response_stub(template_id) del container_response_stub.arguments['PARAMS'] del container_response_stub.arguments['CVPARAMS'] mock_client = mock.Mock() mock_client.scripts.get_containers.return_value = container_response_stub # noqa mock_client.credentials.get.return_value = Response({'name': 'Token'}) # test everything is working fine mp = _model.ModelPipeline.from_existing(1, 1, client=mock_client) assert mp.cv_params == {} assert mp.parameters == {} @pytest.mark.skipif(not HAS_NUMPY, reason="numpy not installed") @mock.patch.object(_model, '_get_template_ids_all_versions', mock.Mock(return_value=TEST_TEMPLATE_IDS)) def test_modelpipeline_classmethod_constructor_nonint_id(): # Verify that we can still JSON-serialize job and run IDs even # if they're entered in a non-JSON-able format. # We need to turn them into JSON to set them as script arguments. mock_client = setup_client_mock(1, 2) container_response_stub = _container_response_stub(TRAIN_ID_PROD) mock_client.scripts.get_containers.return_value = container_response_stub mp = _model.ModelPipeline.from_existing(np.int64(1), np.int64(2), client=mock_client) out = json.dumps({'job': mp.train_result_.job_id, 'run': mp.train_result_.run_id}) assert out == '{"job": 1, "run": 2}' or out == '{"run": 2, "job": 1}' @mock.patch.object(_model, '_get_template_ids_all_versions', mock.Mock(return_value=TEST_TEMPLATE_IDS)) @pytest.mark.parametrize( 'train_id, predict_id', [(TRAIN_ID_PROD, PREDICT_ID_PROD), (TRAIN_ID_OLD, PREDICT_ID_OLD)], ) @mock.patch.object(_model, 'ModelFuture', autospec=True) def test_modelpipeline_classmethod_constructor_old_version( mock_future, train_id, predict_id): # Test that we select the correct prediction template for different # versions of a training job. mock_client = setup_client_mock() mock_client.scripts.get_containers.return_value = \ _container_response_stub(from_template_id=train_id) mp = _model.ModelPipeline.from_existing(1, 1, client=mock_client) assert mp.predict_template_id == predict_id @mock.patch.object(_model.ModelPipeline, "_create_custom_run") def test_modelpipeline_train(mock_ccr, mp_setup): mp = mp_setup mock1, mock2 = mock.Mock(), mock.Mock() mock_ccr.return_value = 'res', mock1, mock2 assert 'res' == mp.train(file_id=7) assert mp.train_result_ == 'res' @pytest.mark.skipif(not HAS_SKLEARN, reason="scikit-learn not installed") @mock.patch.object(_model, '_get_template_ids_all_versions', mock.Mock(return_value=TEST_TEMPLATE_IDS)) @mock.patch.object(_model, "APIClient", mock.Mock()) @mock.patch.object(_model.cio, "file_to_civis") @mock.patch.object(_model.ModelPipeline, "_create_custom_run") def test_modelpipeline_train_from_estimator(mock_ccr, mock_f2c): # Provide a model as a pre-made model and make sure we can train. mock_f2c.return_value = -21 est = LogisticRegression() mp = _model.ModelPipeline(est, "dv") mock1, mock2 = mock.Mock(), mock.Mock() mock_ccr.return_value = 'res', mock1, mock2 assert 'res' == mp.train(file_id=7) assert mp.train_result_ == 'res' assert mock_f2c.call_count == 1 # Called once to store input Estimator @pytest.mark.skipif(not HAS_SKLEARN, reason="scikit-learn not installed") @mock.patch.object(_model, "_get_template_ids") @mock.patch.object(_model.cio, "file_to_civis") @mock.patch.object(_model.ModelPipeline, "_create_custom_run") def test_modelpipeline_train_custom_etl(mock_ccr, mock_f2c, mock_template_ids): # Provide a custom ETL estimator and make sure we can train. mock_api = setup_client_mock() # training template ID 11111 >= 9968 for the etl arg to work mock_template_ids.return_value = 11111, 22222, 33333 etl = LogisticRegression() mp = _model.ModelPipeline('wf', 'dv', client=mock_api, etl=etl) mock_f2c.return_value = -21 mock1, mock2 = mock.Mock(), mock.Mock() mock_ccr.return_value = 'res', mock1, mock2 assert 'res' == mp.train(file_id=7) assert mp.train_result_ == 'res' assert mock_f2c.call_count == 1 # Called once to store input Estimator @pytest.mark.skipif(not HAS_PANDAS, reason="pandas not installed") @mock.patch.object(_model, "_stash_local_dataframe", return_value=-11, autospec=True) @mock.patch.object(_model.ModelPipeline, "_create_custom_run") def test_modelpipeline_train_df(mock_ccr, mock_stash, mp_setup): mp = mp_setup mock1, mock2 = mock.Mock(), mock.Mock() mock_ccr.return_value = 'res', mock1, mock2 train_data = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) assert 'res' == mp.train(train_data) mock_stash.assert_called_once_with( train_data, TRAIN_ID_PROD, client=mock.ANY) assert mp.train_result_ == 'res' @mock.patch.object(_model, "_stash_local_file", return_value=-11) @mock.patch.object(_model.ModelPipeline, "_create_custom_run") def test_modelpipeline_train_file_name(mock_ccr, mock_stash, mp_setup): mp = mp_setup mock1, mock2 = mock.Mock(), mock.Mock() mock_ccr.return_value = 'res', mock1, mock2 assert 'res' == mp.train(csv_path='meaning_of_life.csv') mock_stash.assert_called_once_with('meaning_of_life.csv', client=mock.ANY) assert mp.train_result_ == 'res' def test_modelpipeline_train_value_no_input_error(mp_setup): mp = mp_setup with pytest.raises(ValueError) as exc: mp.train() assert str(exc.value) == "Provide a source of data." with pytest.raises(ValueError) as exc: mp.train(table_name='tab') assert str(exc.value) == "Provide a source of data." with pytest.raises(ValueError) as exc: mp.train(database_name='db') assert str(exc.value) == "Provide a source of data." def test_modelpipeline_train_value_too_much_input_error(mp_setup): with pytest.raises(ValueError) as exc: mp_setup.train([[1, 2, 3]], table_name='tab', database_name='db') assert str(exc.value) == "Provide a single source of data." with pytest.raises(ValueError) as exc: mp_setup.train([[1, 2, 3]], file_id=12) assert str(exc.value) == "Provide a single source of data." with pytest.raises(ValueError) as exc: mp_setup.train(file_id=7, table_name='tab', database_name='db') assert str(exc.value) == "Provide a single source of data." def test_modelpipeline_state(mp_setup): mp = mp_setup with pytest.raises(ValueError, match="This model hasn't been trained yet."): mp.state mp.train_result_ = mock.Mock() mp.train_result_.state = 'foo' assert mp.state == 'foo' def test_modelpipeline_estimator(mp_setup): mp = mp_setup with pytest.raises(ValueError, match="This model hasn't been trained yet."): mp.estimator mp.train_result_ = mock.Mock() mp.train_result_.estimator = 'foo' assert mp.estimator == 'foo' def test_modelpipeline_predict_value_error(mp_setup): mp = mp_setup with pytest.raises(ValueError, match="This model hasn't been trained yet."): mp.predict() mp.train_result_ = mock.Mock() mp.train_result_.running.return_value = False with pytest.raises(ValueError) as exc: mp.predict() assert str(exc.value) == "Provide a source of data." with pytest.raises(ValueError) as exc: mp.predict(table_name='tab') assert str(exc.value) == "Provide a source of data." with pytest.raises(ValueError) as exc: mp.predict(database_name='db') assert str(exc.value) == "Provide a source of data." def test_modelpipeline_predict_value_too_much_input_error(mp_setup): result = mock.Mock(spec_set=_model.ModelFuture) result.running.return_value = False mp_setup.train_result_ = result # Make this look trained. with pytest.raises(ValueError) as exc: mp_setup.predict([[1, 2, 3]], table_name='tab', database_name='db') assert str(exc.value) == "Provide a single source of data." with pytest.raises(ValueError) as exc: mp_setup.predict([[1, 2, 3]], file_id=12) assert str(exc.value) == "Provide a single source of data." with pytest.raises(ValueError) as exc: mp_setup.predict(file_id=7, table_name='tab', database_name='db') assert str(exc.value) == "Provide a single source of data." with pytest.raises(ValueError) as exc: mp_setup.predict(file_id=7, manifest=123) assert str(exc.value) == "Provide a single source of data." @mock.patch.object(_model, '_get_template_ids_all_versions', mock.Mock(return_value=TEST_TEMPLATE_IDS)) @mock.patch.object(_model, "APIClient", mock.Mock()) @pytest.mark.parametrize( 'version, train_id, predict_id', [('v2.3', TRAIN_ID_PROD, PREDICT_ID_PROD), ('v1.4', TRAIN_ID_OLD, PREDICT_ID_OLD)], ) def test_modelpipeline_pickling_preserves_template_ids( version, train_id, predict_id): # Test that pickling a ModelPipeline object preserves the template IDs # that have already been set during object instantiation. with TemporaryDirectory() as temp_dir: mp = _model.ModelPipeline('wf', 'dv', civisml_version=version) # Before pickling, make sure the template IDs are set as expected assert mp.train_template_id == train_id assert mp.predict_template_id == predict_id pickle_path = os.path.join(temp_dir, 'model.pkl') with open(pickle_path, 'wb') as f: pickle.dump(mp, f) with open(pickle_path, 'rb') as f: mp_unpickled = pickle.load(f) # After unpickling, the template IDs should remain. assert mp_unpickled.train_template_id == train_id assert mp_unpickled.predict_template_id == predict_id