""" .. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com> """ import abc import os.path import re from typing import Mapping # noqa from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Set, Tuple from urllib.parse import urlparse import msgfy import nbformat import retryrequests from simplesqlite import SimpleSQLite from .._common import ResultLogger try: import ujson as json except ImportError: import json # type: ignore if TYPE_CHECKING: from ._base import SourceInfo # noqa KEY_VALUE_TABLE = "kv" def is_ipynb_file_path(file_path: str) -> bool: return urlparse(file_path).scheme == "" and os.path.splitext(file_path)[1] == ".ipynb" def is_ipynb_url(url: str) -> bool: result = urlparse(url) return result.scheme != "" and is_ipynb_file_path(result.path) def _schema_not_found_error_handler(e: Exception) -> None: if re.search("No such file or directory: .+schema.json", str(e)): raise RuntimeError( "ipynb file format conversion not supported for the binary version. " "please try to install sqlitebiter via pip." ) def load_ipynb_file(file_path: str, encoding: str): with open(file_path, encoding=encoding) as f: try: return nbformat.read(f, as_version=4) except AttributeError as e: raise nbformat.reader.NotJSONError(msgfy.to_error_message(e)) except OSError as e: _schema_not_found_error_handler(e) raise def load_ipynb_text(text: str): try: return nbformat.reads(text, as_version=4) except AttributeError as e: raise nbformat.reader.NotJSONError(msgfy.to_error_message(e)) except OSError as e: _schema_not_found_error_handler(e) raise def load_ipynb_url(url: str, proxies: Optional[Dict]) -> Tuple: response = retryrequests.get(url, proxies=proxies) response.raise_for_status() try: return (nbformat.reads(response.text, as_version=4), len(response.content)) except OSError as e: _schema_not_found_error_handler(e) raise class NbAttr: CELL_ID = "cell_id" KEY = "key" LINE_NUMBER = "line_no" SOURECE_ID = "source_id" VALUE = "value" class NbAttrDesc: CELL_ID = "{:s} INTEGER NOT NULL".format(NbAttr.CELL_ID) KEY = "{:s} TEXT NOT NULL".format(NbAttr.KEY) LINE_NUMBER = "{:s} INTEGER NOT NULL".format(NbAttr.LINE_NUMBER) SOURECE_ID = "{:s} INTEGER NOT NULL".format(NbAttr.SOURECE_ID) VALUE = "{:s} TEXT".format(NbAttr.VALUE) class JupyterNotebookConverterInterface(metaclass=abc.ABCMeta): @abc.abstractmethod def convert(self): # pragma: no cover pass class JupyterNotebookConverterBase(JupyterNotebookConverterInterface): @abc.abstractproperty def _base_table_name(self) -> str: # pragma: no cover pass @property def source_id(self): return self._source_info.source_id def __init__( self, logger, source_info: "SourceInfo", con: SimpleSQLite, result_logger: ResultLogger ): self._logger = logger self._source_info = source_info self._con = con self._result_logger = result_logger self._changed_table_name_set = set() # type: Set[str] def _get_log_header(self, info_name: str) -> str: return "{:s}: {:s}({:s})".format( self._source_info.get_name(self._result_logger.verbosity_level), self._base_table_name, info_name, ) def _need_create_table(self, table_name: str) -> bool: return not self._con.has_table(table_name) def _make_table_name(self, names: List[str]) -> Tuple[str, bool]: table_name = "_".join([self._base_table_name] + names) return (table_name, self._need_create_table(table_name)) class MetaDataConverter(JupyterNotebookConverterBase): @property def _base_table_name(self): return "metadata" def __init__( self, logger, source_info: "SourceInfo", con: SimpleSQLite, result_logger: ResultLogger, metadata, ): super().__init__(logger, source_info, con, result_logger) self.__metadata = metadata def convert(self) -> Set[str]: if not self.__metadata: self._logger.debug("metadata not found") return set() self.__convert_kernelspec() self.__convert_language_info() self.__convert_kv() if self.__metadata: self._logger.debug( "cannot convert: {}".format( json.dumps(self.__metadata, indent=4, ensure_ascii=False) ) ) return self._changed_table_name_set def __convert_kernelspec(self) -> None: target = "kernelspec" table_name, need_create_table = self._make_table_name([target]) records = [ [self.source_id, key, value] for key, value in self.__metadata.get(target).items() ] if len(records) > 0: self._con.create_table( table_name, [NbAttrDesc.SOURECE_ID, NbAttrDesc.KEY, "{:s} TEXT NOT NULL".format(NbAttr.VALUE)], ) self._con.insert_many(table_name, records) self._result_logger.logging_success( self._get_log_header(target), table_name, need_create_table ) self._changed_table_name_set.add(table_name) del self.__metadata[target] def __convert_language_info(self) -> None: target = "language_info" language_info = self.__metadata.get(target) record_list = [] codemirror_mode = language_info.get("codemirror_mode") if isinstance(codemirror_mode, dict): for key, value in codemirror_mode.items(): record_list.append((self.source_id, "codemirror_mode_{:s}".format(key), value)) del language_info["codemirror_mode"] for key, value in language_info.items(): record_list.append((self.source_id, key, value)) table_name, need_create_table = self._make_table_name([target]) if len(record_list) > 0: self._con.create_table( table_name, [NbAttrDesc.SOURECE_ID, NbAttrDesc.KEY, "{:s} TEXT NOT NULL".format(NbAttr.VALUE)], ) self._con.insert_many(table_name, record_list) self._result_logger.logging_success( self._get_log_header(target), table_name, need_create_table ) self._changed_table_name_set.add(table_name) del self.__metadata[target] def __convert_kv(self) -> None: target = "anaconda-cloud" if target in self.__metadata: table_name, need_create_table = self._make_table_name([KEY_VALUE_TABLE]) records = [ [self.source_id, key, value] for key, value in self.__metadata.get(target).items() ] if len(records) > 0: self._con.create_table( table_name, [ NbAttrDesc.SOURECE_ID, NbAttrDesc.KEY, "{:s} TEXT NOT NULL".format(NbAttr.VALUE), ], ) self._con.insert_many(table_name, records) self._result_logger.logging_success( self._get_log_header(target), table_name, need_create_table ) self._changed_table_name_set.add(table_name) del self.__metadata[target] class CellConverter(JupyterNotebookConverterBase): @property def _base_table_name(self) -> str: return "cells" def __init__( self, logger, source_info: "SourceInfo", con: SimpleSQLite, result_logger: ResultLogger, cells: Sequence, ): super().__init__(logger, source_info, con, result_logger) self.__cells = cells self._cell_id = None # type: Optional[int] def convert(self) -> Set[str]: for cell_id, cell_data in enumerate(self.__cells): self._cell_id = cell_id self.__convert_cell(cell_data) return self._changed_table_name_set def _get_log_header(self, info_name: str) -> str: return "{:s}: {:s}#{}({:s})".format( self._source_info.base_name, self._base_table_name, self._cell_id, info_name ) def __convert_source(self, cell_data: Dict[str, str]) -> None: target = "source" table_name, need_create_table = self._make_table_name([target]) records = [ [self.source_id, self._cell_id, line_no, source_line.rstrip()] for line_no, source_line in enumerate(cell_data[target].splitlines()) ] del cell_data[target] if len(records) > 0: self._con.create_table( table_name, [ NbAttrDesc.SOURECE_ID, NbAttrDesc.CELL_ID, NbAttrDesc.LINE_NUMBER, "{:s} TEXT".format("text"), ], ) self._con.insert_many(table_name, records) self._result_logger.logging_success( self._get_log_header(target), table_name, need_create_table ) self._changed_table_name_set.add(table_name) def __to_kv_records(self, data_map: Mapping) -> List[Tuple]: record_list = [] # type: List[Tuple] for key, value in data_map.items(): if key == "metadata": if not value: record = (self.source_id, self._cell_id, key, None) else: record = (self.source_id, self._cell_id, key, str(dict(value))) # type: ignore record_list.append(record) continue record_list.append((self.source_id, self._cell_id, key, value)) return record_list def __convert_cell(self, cell_data) -> None: self.__convert_source(cell_data) category = "outputs" if category in cell_data: outputs_table_name, need_create_output_table = self._make_table_name([category]) self._con.create_table( outputs_table_name, [ NbAttrDesc.SOURECE_ID, NbAttrDesc.CELL_ID, "type TEXT NOT NULL", NbAttrDesc.LINE_NUMBER, "{:s} BLOB".format("data"), ], ) outputs_kv_table_name, need_create_output_kv_table = self._make_table_name( [category, KEY_VALUE_TABLE] ) self._con.create_table( outputs_kv_table_name, [NbAttrDesc.SOURECE_ID, NbAttrDesc.CELL_ID, NbAttrDesc.KEY, NbAttrDesc.VALUE], ) for output_data in cell_data.outputs: if self.__convert_output_text(output_data, need_create_output_table): need_create_output_table = False if self.__convert_output_data(output_data, need_create_output_table): need_create_output_table = False self._con.insert_many(outputs_kv_table_name, self.__to_kv_records(output_data)) self._result_logger.logging_success( self._get_log_header("{} {}".format(category, KEY_VALUE_TABLE)), outputs_kv_table_name, need_create_output_kv_table, ) self._changed_table_name_set.add(outputs_kv_table_name) need_create_output_kv_table = False del cell_data[category] if not cell_data: return kv_records = self.__to_kv_records(cell_data) if len(kv_records) == 0: return kv_table_name, need_create_kv_table = self._make_table_name([KEY_VALUE_TABLE]) self._con.create_table( kv_table_name, [NbAttrDesc.SOURECE_ID, NbAttrDesc.CELL_ID, NbAttrDesc.KEY, NbAttrDesc.VALUE], ) self._con.insert_many(kv_table_name, kv_records) self._result_logger.logging_success( self._get_log_header(KEY_VALUE_TABLE), kv_table_name, need_create_kv_table ) self._changed_table_name_set.add(kv_table_name) def __convert_output_text(self, output_data, need_create_table: bool) -> bool: data_type = "text" if data_type not in output_data: return False table_name, _ = self._make_table_name(["outputs"]) num_record = self._con.insert_many( table_name, [ [self.source_id, self._cell_id, data_type, line_no, line] for line_no, line in enumerate(output_data.get(data_type).splitlines()) ], ) del output_data[data_type] if num_record == 0: return False self._result_logger.logging_success( self._get_log_header("outputs {}".format(data_type)), table_name, need_create_table ) self._changed_table_name_set.add(table_name) return True def __convert_output_data( self, output_data: Dict[str, Mapping], need_create_table: bool ) -> bool: output_key = "data" if output_key not in output_data: return False table_name, _ = self._make_table_name(["outputs"]) image_regexp = re.compile("^image/.+") num_record = 0 for data_type, data in output_data[output_key].items(): self._logger.debug( "table={} id={} data_type={} {}".format( table_name, self._cell_id, data_type, type(data) ) ) if image_regexp.search(data_type): self._con.insert(table_name, [self.source_id, self._cell_id, data_type, 0, data]) num_record += 1 continue if isinstance(data, dict): data = json.dumps(data, indent=4, ensure_ascii=False) num_record += self._con.insert_many( table_name, [ [self.source_id, self._cell_id, data_type, data_no, line] for data_no, line in enumerate(data.splitlines()) ], ) del output_data[output_key] if num_record == 0: return False self._result_logger.logging_success( self._get_log_header("outputs {}".format(data_type)), table_name, need_create_table ) self._changed_table_name_set.add(table_name) return True def convert_nb( logger, source_info: "SourceInfo", con: SimpleSQLite, result_logger: ResultLogger, nb ) -> Set[str]: changed_table_name_set = set() # type: Set[str] changed_table_name_set |= CellConverter( logger, source_info, con, result_logger, nb.cells ).convert() changed_table_name_set |= MetaDataConverter( logger, source_info, con, result_logger, nb.metadata ).convert() table_name = KEY_VALUE_TABLE need_create_table = not con.has_table(table_name) kv_records = [ [source_info.source_id, key, nb.get(key)] for key in ("nbformat", "nbformat_minor") ] if len(kv_records) > 0: con.create_table(table_name, [NbAttrDesc.SOURECE_ID, NbAttrDesc.KEY, NbAttrDesc.VALUE]) con.insert_many(table_name, kv_records) result_logger.logging_success( "{}: {}".format(source_info.base_name, table_name), table_name, need_create_table ) changed_table_name_set.add(table_name) con.commit() return changed_table_name_set