""" Generate DL1 (a or b) output files in HDF5 format from {R0,R1,DL0} inputs. """ import pathlib import sys import numpy as np import tables import tables.filters from astropy import units as u from tqdm.autonotebook import tqdm from collections import defaultdict from ctapipe.io import metadata as meta from ..calib.camera import CameraCalibrator, GainSelector from ..containers import ( ImageParametersContainer, TelEventIndexContainer, SimulatedShowerDistribution, IntensityStatisticsContainer, PeakTimeStatisticsContainer, MCDL1CameraContainer, TimingParametersContainer, ) from ..core import Provenance from ..core import QualityQuery, Container, Field, Tool, ToolConfigurationError from ..core.traits import ( Bool, CaselessStrEnum, Int, List, Path, create_class_enum_trait, classes_with_traits, ) from ..image import ImageCleaner from ..image import ( hillas_parameters, descriptive_statistics, concentration as concentration_parameters, timing_parameters, leakage as leakage_parameters, morphology_parameters, ) from ..image.extractor import ImageExtractor from ..io import EventSource, HDF5TableWriter, SimTelEventSource tables.parameters.NODE_CACHE_SLOTS = 3000 # fixes problem with too many datasets PROV = Provenance() # define the version of the DL1 data model written here. This should be updated # when necessary: # - increase the major number if there is a breaking change to the model # (meaning readers need to update scripts) # - increase the minor number if new columns or datasets are added # - increase the patch number if there is a small bugfix to the model. DL1_DATA_MODEL_VERSION = "v1.0.0" def write_reference_metadata_headers(obs_id, subarray, writer): """ Attaches Core Provenence headers to an output HDF5 file. Right now this is hard-coded for use with the ctapipe-stage1-process tool Parameters ---------- output_path: pathlib.Path output HDF5 file obs_id: int observation ID subarray: SubarrayDescription to get metadata from writer: HDF5TableWriter output """ activity = PROV.current_activity.provenance reference = meta.Reference( contact=meta.Contact(name="", email="", organization="CTA Consortium"), product=meta.Product( description="DL1 Data Product", data_category="S", data_level="DL1", data_association="Subarray", data_model_name="ASWG DL1", data_model_version=DL1_DATA_MODEL_VERSION, data_model_url="", format="hdf5", ), process=meta.Process(type_="Simulation", subtype="", id_=int(obs_id)), activity=meta.Activity.from_provenance(activity), instrument=meta.Instrument( site="Other", # need a way to detect site... class_="Subarray", type_="unknown", version="unknown", id_=subarray.name, ), ) # convert all values to strings, since hdf5 can't handle Times, etc.: # TODO: add activity_stop_time? headers = {k: str(v) for k, v in reference.to_dict().items()} meta.write_to_hdf5(headers, writer._h5file) class ImageQualityQuery(QualityQuery): """ for configuring image-wise data checks """ quality_criteria = List( default_value=[ ("size_greater_0", "lambda image_selected: image_selected.sum() > 0") ], help=QualityQuery.quality_criteria.help, ).tag(config=True) class Stage1ProcessorTool(Tool): name = "ctapipe-stage1-process" description = __doc__ + f" This currently writes {DL1_DATA_MODEL_VERSION} DL1 data" examples = """ To process data with all default values: > ctapipe-stage1-process --input events.simtel.gz --output events.dl1.h5 --progress Or use an external configuration file, where you can specify all options: > ctapipe-stage1-process --config stage1_config.json --progress The config file should be in JSON or python format (see traitlets docs). For an example, see ctapipe/examples/stage1_config.json in the main code repo. """ output_path = Path( help="DL1 output filename", default_value=pathlib.Path("events.dl1.h5") ).tag(config=True) write_images = Bool( help="Store DL1/Event/Image data in output", default_value=False ).tag(config=True) write_parameters = Bool( help="Compute and store image parameters", default_value=True ).tag(config=True) compression_level = Int( help="compression level, 0=None, 9=maximum", default_value=5, min=0, max=9 ).tag(config=True) split_datasets_by = CaselessStrEnum( values=["tel_id", "tel_type"], default_value="tel_id", help="Splitting level for the parameters and images datasets", ).tag(config=True) compression_type = CaselessStrEnum( values=["blosc:zstd", "zlib"], help="compressor algorithm to use. ", default_value="blosc:zstd", ).tag(config=True) image_extractor_type = create_class_enum_trait( base_class=ImageExtractor, default_value="NeighborPeakWindowSum", help="Method to use to turn a waveform into a single charge value", ).tag(config=True) gain_selector_type = create_class_enum_trait( base_class=GainSelector, default_value="ThresholdGainSelector" ).tag(config=True) image_cleaner_type = create_class_enum_trait( base_class=ImageCleaner, default_value="TailcutsImageCleaner" ) write_index_tables = Bool( help=( "Generate PyTables index datasets for all tables that contain an " "event_id or tel_id. These speed up in-kernal pytables operations," "but add some overhead to the file. They can also be generated " "and attached after the file is written " ), default_value=False, ).tag(config=True) overwrite = Bool(help="overwrite output file if it exists").tag(config=True) progress_bar = Bool(help="show progress bar during processing").tag(config=True) aliases = { "input": "EventSource.input_url", "output": "Stage1ProcessorTool.output_path", "allowed-tels": "EventSource.allowed_tels", "max-events": "EventSource.max_events", "image-extractor-type": "Stage1ProcessorTool.image_extractor_type", "gain-selector-type": "Stage1ProcessorTool.gain_selector_type", "image-cleaner-type": "Stage1ProcessorTool.image_cleaner_type", } flags = { "write-images": ( {"Stage1ProcessorTool": {"write_images": True}}, "store DL1/Event/Telescope images in output", ), "write-parameters": ( {"Stage1ProcessorTool": {"write_parameters": True}}, "store DL1/Event/Telescope parameters in output", ), "write-index-tables": ( {"Stage1ProcessorTool": {"write_index_tables": True}}, "generate PyTables index tables for the parameter and image datasets", ), "overwrite": ( {"Stage1ProcessorTool": {"overwrite": True}}, "Overwrite output file if it exists", ), "progress": ( {"Stage1ProcessorTool": {"progress_bar": True}}, "show a progress bar during event processing", ), } classes = List( [CameraCalibrator, ImageQualityQuery, EventSource] + classes_with_traits(ImageCleaner) + classes_with_traits(ImageExtractor) + classes_with_traits(GainSelector) ) def setup(self): # prepare output path: self.output_path = self.output_path.expanduser() if self.output_path.exists(): if self.overwrite: self.log.warning(f"Overwriting {self.output_path}") self.output_path.unlink() else: self.log.critical( f"Output file {self.output_path} exists" ", use `--overwrite` to overwrite " ) sys.exit(1) PROV.add_output_file(str(self.output_path), role="DL1/Event") # check that options make sense: if self.write_parameters is False and self.write_images is False: raise ToolConfigurationError( "The options 'write_parameters' and 'write_images' are " "both set to False. No output will be generated in that case. " "Please enable one or both of these options." ) # setup components: self.gain_selector = self.add_component( GainSelector.from_name(self.gain_selector_type, parent=self) ) self.event_source = self.add_component( EventSource.from_config(parent=self, gain_selector=self.gain_selector) ) self.image_extractor = self.add_component( ImageExtractor.from_name( self.image_extractor_type, parent=self, subarray=self.event_source.subarray, ) ) self.calibrate = self.add_component( CameraCalibrator( parent=self, subarray=self.event_source.subarray, image_extractor=self.image_extractor, ) ) self.clean = self.add_component( ImageCleaner.from_name( self.image_cleaner_type, parent=self, subarray=self.event_source.subarray, ) ) self.check_image = self.add_component(ImageQualityQuery(parent=self)) # check component setup if self.event_source.max_events and self.event_source.max_events > 0: self.log.warning( "No Simulated shower distributions will be written because " "EventSource.max_events is set to a non-zero number (and therefore " "shower distributions read from the input MC file are invalid)." ) # setup HDF5 compression: self._hdf5_filters = tables.Filters( complevel=self.compression_level, complib=self.compression_type, fletcher32=True, # attach a checksum to each chunk for error correction ) # store last pointing to only write unique poitings self._last_pointing_tel = defaultdict(lambda: (np.nan * u.deg, np.nan * u.deg)) def _write_simulation_configuration(self, writer): """ Write the simulation headers to a single row of a table. Later if this file is merged with others, that table will grow. Note that this function should be run first """ self.log.debug("Writing simulation configuration") class ExtraMCInfo(Container): container_prefix = "" obs_id = Field(0, "MC Run Identifier") extramc = ExtraMCInfo() extramc.obs_id = self.event_source.obs_id self.event_source.mc_header.prefix = "" writer.write( "configuration/simulation/run", [extramc, self.event_source.mc_header] ) def _write_simulation_histograms(self, writer: HDF5TableWriter): """ Write the distribution of thrown showers Notes ----- - this only runs if this is a simulation file. The current implementation is a bit of a hack and implies we should improve SimTelEventSource to read this info. - Currently the histograms are at the end of the simtel file, so if max_events is set to non-zero, the end of the file may not be read, and this no histograms will be found. """ self.log.debug("Writing simulation histograms") if not isinstance(self.event_source, SimTelEventSource): return def fill_from_simtel( obs_id, eventio_hist, container: SimulatedShowerDistribution ): """ fill from a SimTel Histogram entry""" container.obs_id = obs_id container.hist_id = eventio_hist["id"] container.num_entries = eventio_hist["entries"] xbins = np.linspace( eventio_hist["lower_x"], eventio_hist["upper_x"], eventio_hist["n_bins_x"] + 1, ) ybins = np.linspace( eventio_hist["lower_y"], eventio_hist["upper_y"], eventio_hist["n_bins_y"] + 1, ) container.bins_core_dist = xbins * u.m container.bins_energy = 10 ** ybins * u.TeV container.histogram = eventio_hist["data"] container.meta["hist_title"] = eventio_hist["title"] container.meta["x_label"] = "Log10 E (TeV)" container.meta["y_label"] = "3D Core Distance (m)" hists = self.event_source.file_.histograms if hists is not None: hist_container = SimulatedShowerDistribution() hist_container.prefix = "" for hist in hists: if hist["id"] == 6: fill_from_simtel(self.event_source.obs_id, hist, hist_container) writer.write( table_name="simulation/service/shower_distribution", containers=hist_container, ) def _write_instrument_configuration(self, subarray): """write the SubarrayDescription Parameters ---------- subarray : ctapipe.instrument.SubarrayDescription subarray description """ self.log.debug("Writing instrument configuration") serialize_meta = True subarray.to_table().write( self.output_path, path="/configuration/instrument/subarray/layout", serialize_meta=serialize_meta, append=True, ) subarray.to_table(kind="optics").write( self.output_path, path="/configuration/instrument/telescope/optics", append=True, serialize_meta=serialize_meta, ) for telescope_type in subarray.telescope_types: ids = set(subarray.get_tel_ids_for_type(telescope_type)) if len(ids) > 0: # only write if there is a telescope with this camera tel_id = list(ids)[0] camera = subarray.tel[tel_id].camera camera.geometry.to_table().write( self.output_path, path=f"/configuration/instrument/telescope/camera/geometry_{camera}", append=True, serialize_meta=serialize_meta, ) camera.readout.to_table().write( self.output_path, path=f"/configuration/instrument/telescope/camera/readout_{camera}", append=True, serialize_meta=serialize_meta, ) def _write_processing_statistics(self): """ write out the event selection stats, etc. """ image_stats = self.check_image.to_table(functions=True) image_stats.write( self.output_path, path="/dl1/service/image_statistics", append=True, serialize_meta=True, ) def _parameterize_image(self, tel_id, image, signal_pixels, peak_time=None): """Apply image cleaning and calculate image features Parameters ---------- subarray : SubarrayDescription subarray description data : DL1CameraContainer calibrated camera data tel_id: int which telescope is being cleaned Returns ------- np.ndarray, ImageParametersContainer: cleaning mask, parameters """ tel = self.event_source.subarray.tel[tel_id] geometry = tel.camera.geometry image_selected = image[signal_pixels] # check if image can be parameterized: image_criteria = self.check_image(image_selected) self.log.debug( "image_criteria: %s", list(zip(self.check_image.criteria_names[1:], image_criteria)), ) # parameterize the event if all criteria pass: if all(image_criteria): geom_selected = geometry[signal_pixels] hillas = hillas_parameters(geom=geom_selected, image=image_selected) leakage = leakage_parameters( geom=geometry, image=image, cleaning_mask=signal_pixels ) concentration = concentration_parameters( geom=geom_selected, image=image_selected, hillas_parameters=hillas ) morphology = morphology_parameters(geom=geometry, image_mask=signal_pixels) intensity_statistics = descriptive_statistics( image_selected, container_class=IntensityStatisticsContainer ) if peak_time is not None: timing = timing_parameters( geom=geom_selected, image=image_selected, peak_time=peak_time[signal_pixels], hillas_parameters=hillas, ) peak_time_statistics = descriptive_statistics( peak_time[signal_pixels], container_class=PeakTimeStatisticsContainer, ) else: timing = TimingParametersContainer() peak_time_statistics = PeakTimeStatisticsContainer() return ImageParametersContainer( hillas=hillas, timing=timing, leakage=leakage, morphology=morphology, concentration=concentration, intensity_statistics=intensity_statistics, peak_time_statistics=peak_time_statistics, ) # return the default container (containing nan values) for no # parameterization return ImageParametersContainer() def _process_events(self, writer): self.log.debug("Writing DL1/Event data") self.event_source.subarray.info(printer=self.log.debug) # initial value for last known pointing position last_pointing = (np.nan * u.deg, np.nan * u.deg) for event in tqdm( self.event_source, desc=self.event_source.__class__.__name__, total=self.event_source.max_events, unit="ev", disable=not self.progress_bar, ): self.log.log(9, "Writing event_id=%s", event.index.event_id) self.calibrate(event) event.trigger.prefix = "" p = event.pointing current_pointing = (p.array_azimuth, p.array_altitude) if current_pointing != last_pointing: p.prefix = "" writer.write("dl1/monitoring/subarray/pointing", [event.trigger, p]) last_pointing = current_pointing # write the subarray tables writer.write( table_name="simulation/event/subarray/shower", containers=[event.index, event.mc], ) writer.write( table_name="dl1/event/subarray/trigger", containers=[event.index, event.trigger], ) # write the telescope tables self._write_telescope_event(writer, event) def _write_telescope_event(self, writer, event): """ add entries to the event/telescope tables for each telescope in a single event """ # write the telescope tables for tel_id, dl1_camera in event.dl1.tel.items(): dl1_camera.prefix = "" # don't want a prefix for this container telescope = self.event_source.subarray.tel[tel_id] tel_type = str(telescope) tel_index = TelEventIndexContainer( obs_id=event.index.obs_id, event_id=event.index.event_id, tel_id=np.int16(tel_id), ) p = event.pointing.tel[tel_id] current_pointing = (p.azimuth, p.altitude) if current_pointing != self._last_pointing_tel[tel_id]: p.prefix = "" writer.write( f"dl1/monitoring/telescope/pointing/tel_{tel_id:03d}", [event.trigger.tel[tel_id], p], ) self._last_pointing_tel[tel_id] = current_pointing table_name = ( f"tel_{tel_id:03d}" if self.split_datasets_by == "tel_id" else tel_type ) writer.write( "dl1/event/telescope/trigger", [tel_index, event.trigger.tel[tel_id]] ) if self.event_source.is_simulation: true_image = event.mc.tel[tel_id].true_image has_true_image = ( true_image is not None and np.count_nonzero(true_image) > 0 ) if has_true_image: mcdl1 = MCDL1CameraContainer( true_image=true_image, true_parameters=None ) mcdl1.prefix = "" else: has_true_image = False if self.write_parameters: # apply cleaning dl1_camera.image_mask = self.clean( tel_id=tel_id, image=dl1_camera.image, arrival_times=dl1_camera.peak_time, ) params = self._parameterize_image( tel_id=tel_id, image=dl1_camera.image, signal_pixels=dl1_camera.image_mask, peak_time=dl1_camera.peak_time, ) self.log.debug("params: %s", params.as_dict(recursive=True)) writer.write( table_name=f"dl1/event/telescope/parameters/{table_name}", containers=[tel_index, *params.values()], ) if self.event_source.is_simulation and has_true_image: mcdl1.true_parameters = self._parameterize_image( tel_id, image=true_image, signal_pixels=true_image > 0, peak_time=None, # true image from mc has no peak time ) writer.write( f"simulation/event/telescope/parameters/{table_name}", [tel_index, *mcdl1.true_parameters.values()], ) if self.write_images: # note that we always write the image, even if the image quality # criteria are not met (those are only to determine if the parameters # can be computed). writer.write( table_name=f"dl1/event/telescope/images/{table_name}", containers=[tel_index, dl1_camera], ) if has_true_image: writer.write( f"simulation/event/telescope/images/{table_name}", [tel_index, mcdl1], ) def _generate_table_indices(self, h5file, start_node): for node in h5file.iter_nodes(start_node): if not isinstance(node, tables.group.Group): self.log.debug(f"gen indices for: {node}") if "event_id" in node.colnames: node.cols.event_id.create_index() self.log.debug("generated event_id index") if "tel_id" in node.colnames: node.cols.tel_id.create_index() self.log.debug("generated tel_id index") if "obs_id" in node.colnames: self.log.debug("generated obs_id index") node.cols.obs_id.create_index(kind="ultralight") else: # recurse self._generate_table_indices(h5file, node) def _generate_indices(self, writer): if self.write_images: self._generate_table_indices(writer._h5file, "/dl1/event/telescope/images") self._generate_table_indices(writer._h5file, "/dl1/event/subarray") def _setup_writer(self, writer: HDF5TableWriter): writer.add_column_transform( table_name="dl1/event/subarray/trigger", col_name="tels_with_trigger", transform=self.event_source.subarray.tel_ids_to_mask, ) # exclude some columns that are not writable writer.exclude("dl1/event/subarray/trigger", "tel") writer.exclude("dl1/monitoring/subarray/pointing", "tel") writer.exclude("dl1/monitoring/subarray/pointing", "event_type") for tel_id, telescope in self.event_source.subarray.tel.items(): tel_type = str(telescope) if self.split_datasets_by == "tel_id": table_name = f"tel_{tel_id:03d}" else: table_name = tel_type if self.write_parameters is False: writer.exclude( f"/dl1/event/telescope/images/{table_name}", "image_mask" ) writer.exclude(f"/dl1/event/telescope/images/{table_name}", "parameters") writer.exclude( f"/dl1/monitoring/event/pointing/tel_{tel_id:03d}", "event_type" ) if self.event_source.is_simulation: writer.exclude( f"/simulation/event/telescope/images/{table_name}", "true_parameters", ) # no timing information yet for true images writer.exclude( f"/simulation/event/telescope/parameters/{table_name}", r"peak_time_.*", ) writer.exclude( f"/simulation/event/telescope/parameters/{table_name}", r"timing_.*" ) writer.exclude(f"/simulation/event/subarray/shower", "true_tel") def start(self): # FIXME: this uses astropy tables hdf5 io, internally using h5py, # and must thus be done before the table writer opens the file or it might lead # to "Resource temporary unavailable" if h5py and tables are not linked # against the same libhdf (happens when using the pre-build pip wheels) # should be replaced by writing the table using tables self._write_instrument_configuration(self.event_source.subarray) with HDF5TableWriter( self.output_path, parent=self, mode="a", add_prefix=True, filters=self._hdf5_filters, ) as writer: if self.event_source.is_simulation: self._write_simulation_configuration(writer) self._setup_writer(writer) self._process_events(writer) if self.event_source.is_simulation: self._write_simulation_histograms(writer) if self.write_index_tables: self._generate_indices(writer) write_reference_metadata_headers( subarray=self.event_source.subarray, obs_id=self.event_source.obs_id, writer=writer, ) self._write_processing_statistics() def finish(self): pass def main(): tool = Stage1ProcessorTool() tool.run() if __name__ == "__main__": main()