Source code for km3pipe.io.aanet

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim:set ts=4 sts=4 sw=4 et:
"""
Pump for the Aanet data format.

This is undoubtedly the ugliest module in the entire framework.
If you have a way to read aanet files via the Jpp interface,
your pull request is more than welcome!
"""
from __future__ import absolute_import, print_function, division

from collections import defaultdict
import itertools
import subprocess
import os.path

import numpy as np

from km3pipe.core import Pump, Blob
from km3pipe.io.hdf5 import HDF5Header
from km3pipe.dataclasses import Table
from km3pipe.logger import get_logger

[docs]log = get_logger(__name__)    # pylint: disable=C0103

__author__ = "Moritz Lotze and Tamas Gal"
__copyright__ = "Copyright 2016, Tamas Gal and the KM3NeT collaboration."
__credits__ = "Thomas Heid, Liam Quinn, Javier Barrios Martí"
__license__ = "MIT"
__maintainer__ = "Moritz Lotze and Tamas Gal"
__email__ = "tgal@km3net.de"
__status__ = "Development"

[docs]FITINF2NUM = {
    'JGANDALF_BETA0_RAD': 0,
    'JGANDALF_BETA1_RAD': 1,
    'JGANDALF_CHI2': 2,
    'JGANDALF_NUMBER_OF_HITS': 3,
    'JENERGY_ENERGY': 4,
    'JENERGY_CHI2': 5,
    'JGANDALF_LAMBDA': 6,
    'JGANDALF_NUMBER_OF_ITERATIONS': 7,
    'JSTART_NPE_MIP': 8,
    'JSTART_NPE_MIP_TOTAL': 9,
    'JSTART_LENGTH_METRES': 10,
    'JVETO_NPE': 11,
    'JVETO_NUMBER_OF_HITS': 12,
    'JENERGY_MUON_RANGE_METRES': 13,
    'JENERGY_NOISE_LIKELIHOOD': 14,
    'JENERGY_NDF': 15,
    'JENERGY_NUMBER_OF_HITS': 16,
    'JCOPY_Z_M': 17,
}

[docs]FITINFDUSJ2NUM = {
    'DifferencesFirstAndSecondVertexFit_deltaTime': 0,
    'DifferencesFirstAndSecondVertexFit_distance': 1,
    'FinalShowerHits_0dist60_L1cc_SingleHits_emisAng40_trackShowerTres_N': 2,
    'FinalShowerHits_0dist60_L1cc_SingleHits_emisAng40_trackShowerTres_difference': 3,
    'FinalShowerHits_0dist60_L1cc_SingleHits_emisAng40_trackShowerTres_meanDifference': 4,
    'Fork_muonSuppression_decision': 5,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_correlationCoefficient': 6,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_scalarProduct': 7,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_sum_Xcharge2': 8,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_sum_Xcharge_times_charge': 9,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_sum_XhitProb2': 10,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_sum_XhitProb_times_charge': 11,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_sum_XhitProb_times_hit': 12,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_sum_charge2': 13,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_sum_charge2__forXhitProb': 14,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Charge_sum_hit2': 15,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_HitProbCharge_scalarProduct': 16,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_HitProb_scalarProduct': 17,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_NXcharge': 18,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Ncharge': 19,
    'HitPatternCharge_finalShowerHits_10degAroundCherAngle_Nhits': 20,
    'HitPatternTres_finalShowerHits_10degAroundCherAngle_AbsTres_Mean': 21,
    'HitPatternTres_finalShowerHits_10degAroundCherAngle_AbsTres_weightedMean': 22,
    'HitPatternTres_finalShowerHits_10degAroundCherAngle_Nhits': 23,
    'HitPatternTres_finalShowerHits_10degAroundCherAngle_Tres_Mean': 24,
    'HitPatternTres_finalShowerHits_10degAroundCherAngle_Tres_sum_XhitProb': 25,
    'HitPatternTres_finalShowerHits_10degAroundCherAngle_Tres_weightedMean': 26,
    'L0AroundL1HitSelection__weight_withoutSelfSquared': 27,
    'L0AroundL1HitSelection_weight': 28,
    'L0AroundL1HitSelection_weight_withoutSelfSquared': 29,
    'MuonSuppression_decision': 30,
    'MuonSuppression_deltaTresQ20Q80': 31,
    'MuonSuppression_enoughHits': 32,
    'Trigger_3L1Dmax52_FinalShowerHits_0dist80': 33,
    'Trigger_3L1Dmax52_FinalShowerHits_0dist80m25tres75': 34,
    'Trigger_MX8hitsDmax46_FinalShowerHits_0dist80': 35,
    'Trigger_MX8hitsDmax46_FinalShowerHits_0dist80m25tres75': 36,
    'best_DusjOrcaUsingProbabilitiesFinalFit_BjorkenY': 37,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_Nom': 38,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_Npmt': 39,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_Npmt_maxDeltaT10ns': 40,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_azimuth': 41,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_energy': 42,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_energyErrorDown_bestLLH': 43,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_energyErrorUp_bestLLH': 44,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_energy_bestLLH': 45,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_llhBestSinglePMTperDOM__sum_forNoSignal': 46,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_llhBestSinglePMTperDOM_sum': 47,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_llhSinglePMT_sum': 48,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_llhSinglePMT_sum_forNoSignal': 49,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_llh_sum': 50,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_multiplicity': 51,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_sumExpFromPoissonOMhits': 52,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_sumExpOMhits': 53,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_sumMeasuredOMhits': 54,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_sumMeasuredPMThits': 55,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_FinalLLHValues_zenith': 56,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_azimuth': 57,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_energy': 58,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_llh_diff_forE0p8': 59,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_llh_diff_forE1p2': 60,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_llh_overAllNorm': 61,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_llh_sum': 62,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_llh_sum_forNoSignal': 63,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_llh_total': 64,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_meanNomOverAllNorm': 65,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_multiplicity': 66,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_premiumEventFraction': 67,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_pull': 68,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_relativeWeightForOverAllNorm': 69,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_sigmaNomOverAllNorm': 70,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_sumExpFromPoissonOMhits': 71,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_sumExpOMhits': 72,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_sumMeasuredOMhits': 73,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_sumMeasuredPMThits': 74,
    'best_DusjOrcaUsingProbabilitiesFinalFit_FitResult_aroundCherAngle_FinalLLHValues_zenith': 75,
    'best_DusjOrcaUsingProbabilitiesFinalFit_OUTVicinityNumber': 76,
    'best_FirstDusjOrcaVertexFit_OUTVicinityNumber': 77,
    'best_FirstDusjOrcaVertexFit_OUTVicinityWithTimeResidualToSeedNumber': 78,
    'best_SecondDusjOrcaVertexFit_OUTFiducalNumber': 79,
    'best_SecondDusjOrcaVertexFit_OUTVicinityNumber': 80,
    'deltaTres_Q20_Q80_ClusteredL2ORV1L1HitSelection_SingleHits_N': 81,
    'deltaTres_Q20_Q80_ClusteredL2ORV1L1HitSelection_SingleHits_difference': 82,
    'deltaTres_Q20_Q80_FinalShowerHits_0dist60_L1cc_SingleHits_N': 83,
    'deltaTres_Q20_Q80_FinalShowerHits_0dist60_L1cc_SingleHits_difference': 84,
    'geoCoverage_R130h160_angle20_lmin30_best_DusjOrcaUsingProbabilitiesFinalFit_FitResult': 85,
    'geoCoverage_R130h160_angle45_lmin30_best_DusjOrcaUsingProbabilitiesFinalFit_FitResult': 86,
    'geoCoverage_R130h160_angle60_lmin30_best_DusjOrcaUsingProbabilitiesFinalFit_FitResult': 87,
    'geoCoverage_R130h160_angle75_lmin30_best_DusjOrcaUsingProbabilitiesFinalFit_FitResult': 88
}

# jpp > 10.1 (trunk @10276)
[docs]AANET_RECTYPE_PLACEHOLDER = 4000

[docs]RECO2NUM = {
    'JMUONBEGIN': 0,
    'JMUONPREFIT': 1,
    'JMUONSIMPLEX': 2,
    'JMUONGANDALF': 3,
    'JMUONENERGY': 4,
    'JMUONSTART': 5,
    # JMUONEND @ 10.1, JLINEFIT @ trunk
    'JLINEFIT': 6,
    # 10.1 artifact, REMOVE IN FUTURE
    'LineFit': 7,
    'JMUONEND': 99,
    'JSHOWERBEGIN': 100,
    'JSHOWERPREFIT': 101,
    'JSHOWERPOSITIONFIT': 102,
    'JSHOWERCOMPLETEFIT': 103,
    'JSHOWEREND': 199,
    'JDUSJBEGIN': 200,
    'JDUSJPREFIT': 201,
    'JDUSJPOSITIONFIT': 202,
    'JDUSJCOMPLETEFIT': 203,
    'JDUSJEND': 299,
    'JPP_REC_TYPE': AANET_RECTYPE_PLACEHOLDER,
    'JUSERBEGIN': 1000,
    'JMUONVETO': 1001,
    'JPRESIM': 1002,
    'JMUONPATH': 1003,
    'JMCEVT': 1004,
    'JUSEREND': 1099,
    'KM3DeltaPos': 10000,
}

[docs]JHIST_CHAINS = {
    'JMUON': [
        'JMUONGANDALF', 'JMUONENERGY', 'JMUONPREFIT', 'JMUONSIMPLEX',
        'JMUONSTART'
    ],
    'JSHOWER': ['JSHOWERPREFIT', 'JSHOWERPOSITIONFIT', 'JSHOWERCOMPLETEFIT'],
    'JDUSJ': ['JDUSJPREFIT', 'JDUSJPOSITIONFIT', 'JDUSJCOMPLETEFIT'],
}

[docs]FITINF2NAME = {v: k for k, v in FITINF2NUM.items()}
[docs]FITINFDUSJ2NAME = {v: k for k, v in FITINFDUSJ2NUM.items()}
[docs]RECO2NAME = {v: k for k, v in RECO2NUM.items()}

[docs]IS_CC = {
    3: 0,    # False,
    2: 1,    # True,
    1: 0,    # False,
    0: 1,    # True,
}


[docs]class AanetPump(Pump):
    """A pump for binary Aanet files.

    Parameters
    ----------
    filename: str, optional
        Name of the file to open. If this parameter is not given, ``filenames``
        needs to be specified instead.
    ignore_hits: bool, optional [default=False]
        If true, don't read our the hits/mchits.
    bare: bool, optional [default=False]
        Do not create KM3Pipe specific data, just wrap the bare aanet API.
        This will only give you ``blob['evt']``.
    """

[docs]    def configure(self):
        self.filename = self.require('filename')
        self.ignore_hits = bool(self.get('ignore_hits'))
        self.bare = self.get('bare', default=False)
        self.raw_header = None
        self.header = None
        self.blobs = self.blob_generator()
        self.group_id = 0
        self._generic_dtypes_avail = {}

[docs]    def get_blob(self, index):
        NotImplementedError("Aanet currently does not support indexing.")

[docs]    def blob_generator(self):
        """Create a blob generator."""
        # pylint: disable:F0401,W0612
        import aa    # pylint: disablF0401        # noqa
        from ROOT import EventFile    # pylint: disable F0401

        filename = self.filename
        log.info("Reading from file: {0}".format(filename))
        if not os.path.exists(filename):
            log.warning(filename + " not available: continue without it")

        try:
            event_file = EventFile(filename)
        except Exception:
            raise SystemExit("Could not open file")

        log.info("Generating blobs through new aanet API...")

        self.print("Reading metadata using 'JPrintMeta'")
        meta_parser = MetaParser(filename=filename)
        meta = meta_parser.get_table()
        if meta is None:
            self.log.warning(
                "No metadata found, this means no data provenance!"
            )

        if self.bare:
            log.info("Skipping data conversion, only passing bare aanet data")
            for event in event_file:
                yield Blob({'evt': event, 'event_file': event_file})
        else:
            log.info("Unpacking aanet header into dictionary...")
            hdr = self._parse_header(event_file.header)
            if not hdr:
                log.info("Empty header dict found, skipping...")
                self.raw_header = None
            else:
                log.info("Converting Header dict to Table...")
                self.raw_header = self._convert_header_dict_to_table(hdr)
                log.info("Creating HDF5Header")
                self.header = HDF5Header.from_table(self.raw_header)
            for event in event_file:
                log.debug('Reading event...')
                blob = self._read_event(event, filename)
                log.debug('Reading header...')
                blob["RawHeader"] = self.raw_header
                blob["Header"] = self.header

                if meta is not None:
                    blob['Meta'] = meta

                self.group_id += 1
                yield blob
        del event_file

    def _parse_eventinfo(self, event):
        event_id = event.frame_index
        mc_id = event.frame_index - 1
        # run_id = self._get_run_id()
        wgt1, wgt2, wgt3, wgt4 = self._parse_wgts(event.w)
        tab_data = {
            'event_id': event_id,
            'mc_id': mc_id,
            'run_id': event.run_id,    # TODO: this may segfault in aanet
            'weight_w1': wgt1,
            'weight_w2': wgt2,
            'weight_w3': wgt3,
            'weight_w4': wgt4,
            'group_id': self.group_id,
        }
        tab_data['timestamp'] = event.t.GetSec()
        tab_data['nanoseconds'] = event.t.GetNanoSec()
        tab_data['mc_time'] = event.mc_t
        info = Table(tab_data, h5loc='/event_info', name='EventInfo')
        return info

    @staticmethod
    def _parse_wgts(wgt):
        if len(wgt) == 3:
            wgt1, wgt2, wgt3 = wgt
            wgt4 = np.nan
        elif len(wgt) == 4:
            # what the hell is w4?
            wgt1, wgt2, wgt3, wgt4 = wgt
        else:
            wgt1 = wgt2 = wgt3 = wgt4 = np.nan
        return wgt1, wgt2, wgt3, wgt4

    def _parse_tracks(self, tracks):
        log.info("Reading Tracks...")
        track_dict = defaultdict(list)
        # iterating empty ROOT vector causes segfaults!
        if len(tracks) == 0:
            self.log.debug("Found empty tracks, skipping...")
            return {}

        for i, trk in enumerate(tracks):
            self.log.debug('Reading Track #{}...'.format(i))
            trk_dict = self._read_track(trk)
            # set name + h5loc later, if the name is not available, we need
            # the dtype to make a new name

            trk_type = trk.rec_type
            try:
                trk_name = RECO2NAME[trk_type]
            except KeyError:
                trk_type = AANET_RECTYPE_PLACEHOLDER
            if trk_type == AANET_RECTYPE_PLACEHOLDER:
                # if we have a history available but no name (because JEvt.cc),
                # then use the concatenated history as the name.
                # If that is not available, enumerate the tracks by their
                # dtypes (since they have no other tagging)
                if len(trk.rec_stages) == 0:
                    self.log.error("Unknown reco type & no history!")
                    trk_name = "UnknownTrack{}".format(i)
                else:
                    self.log.info("Unknown recoo type! Using history...")
                    stages_num = [s for s in trk.rec_stages]
                    stages = [RECO2NAME[s] for s in stages_num]

                    is_chain = False
                    for chain, default_stages in JHIST_CHAINS.items():
                        # chain is something like JMUON
                        if (RECO2NUM[chain + 'BEGIN'] < min(stages_num)) and (
                                RECO2NUM[chain + 'END'] > max(stages_num)):
                            if chain == "JDUSJ":
                                self.log.info("Adding missing Dusj parameters")
                                for dusj_param in FITINFDUSJ2NUM:
                                    if dusj_param not in trk_dict:
                                        trk_dict[dusj_param] = np.nan
                            self.log.info(
                                "Found {}, adding stage flags".format(chain)
                            )
                            trk_name = chain
                            for stage in default_stages:
                                if stage in stages:
                                    trk_dict[stage] = True
                                else:
                                    trk_dict[stage] = False
                            is_chain = True
                            break

                    if not is_chain:
                        self.log.info("Unknown chain, using stages as name")
                        trk_name = '__'.join([s for s in stages[::-1]])
                        trk_name = 'JHIST__' + trk_name

            # tab.h5loc = '/reco/{}'.format(trk_name.lower())
            track_dict[trk_name].append(trk_dict)

        return self._merge_tracks(track_dict)

    def _merge_tracks(self, track_dict):
        log.info("Merging tracks into table...")
        out = {}
        for track_name, tracks in track_dict.items():
            self.log.debug("Merging '{}'...".format(track_name))
            cols = set(itertools.chain(*[t.keys() for t in tracks]))
            track_data = defaultdict(list)
            for track in tracks:
                for col in cols:
                    if col in track:
                        track_data[col].append(track[col])
                    else:
                        track_data[col].append(np.nan)

            out[track_name] = Table(
                track_data,
                h5loc='/reco/{}'.format(track_name.lower()),
                name=track_name
            )
        return out

    def _read_track(self, trk):
        out = {}
        out['pos_x'] = trk.pos.x
        out['pos_y'] = trk.pos.y
        out['pos_z'] = trk.pos.z
        out['dir_x'] = trk.dir.x
        out['dir_y'] = trk.dir.y
        out['dir_z'] = trk.dir.z
        out['id'] = trk.id
        out['energy'] = trk.E
        out['time'] = trk.t
        out['length'] = trk.len
        out['likelihood'] = trk.lik
        out['rec_type'] = trk.rec_type
        out['group_id'] = self.group_id
        # TODO: hit_ids,
        # TODO: rec_stages,
        self.log.debug('Reading fitinf...')

        isDusj = False
        if len(trk.rec_stages) > 0:
            if (min(trk.rec_stages) >= RECO2NUM['JDUSJBEGIN']) and (max(
                    trk.rec_stages) <= RECO2NUM['JDUSJEND']):
                isDusj = True
        if isDusj:
            fitinf = self._parse_fitinf_dusj(trk.fitinf)
        else:
            fitinf = self._parse_fitinf(trk.fitinf)

        out.update(fitinf)
        return out

    def _parse_fitinf(self, fitinf):
        # iterating empty ROOT vector causes segfaults!
        if len(fitinf) == 0:
            self.log.debug("Found empty fitinf, skipping...")
            return {}

        out = {}
        for i, elem in enumerate(fitinf):
            name = FITINF2NAME[i]
            self.log.debug("Reading fitinf #{} ('{}')...".format(i, name))
            out[name] = elem
        return out

    def _parse_fitinf_dusj(self, fitinf):
        # iterating empty ROOT vector causes segfaults!
        if len(fitinf) == 0:
            self.log.debug("Found empty fitinf, skipping...")
            return {}

        out = {}
        for i, elem in enumerate(fitinf):
            name = FITINFDUSJ2NAME[i]
            self.log.debug("Reading fitinf #{} ('{}')...".format(i, name))
            out[name] = elem
        return out

    def _parse_mctracks(self, mctracks):
        out = defaultdict(list)
        # iterating empty ROOT vector causes segfaults!
        if len(mctracks) == 0:
            self.log.debug("Found empty mctracks, skipping...")
            return out
        for trk in mctracks:
            out['dir_x'].append(trk.dir.x)
            out['dir_y'].append(trk.dir.y)
            out['dir_z'].append(trk.dir.z)
            out['pos_x'].append(trk.pos.x)
            out['pos_y'].append(trk.pos.y)
            out['pos_z'].append(trk.pos.z)
            out['energy'].append(trk.E)
            out['time'].append(trk.t)
            out['type'].append(trk.type)
            out['id'].append(trk.id)
            out['length'].append(trk.len)
            out['bjorkeny'].append(trk.getusr('by'))
            out['interaction_channel'].append(trk.getusr('ichan'))
            try:
                is_cc = IS_CC[trk.getusr('cc')]
            except KeyError:
                # see git.km3net.de/km3py/km3pipe/issues/112
                # and http://trac.km3net.de/ticket/222
                self.log.error(
                    "Invalid value ({}) for the 'cc' usr-parameter in the "
                    "MC track. 'is_cc' is now set to 0 (False).".format(
                        trk.getusr('cc')
                    )
                )
                is_cc = 0
            finally:
                out['is_cc'].append(is_cc)
        out['group_id'] = self.group_id
        return Table(out, name='McTracks', h5loc='/mc_tracks')

    def _parse_mchits(self, mchits):
        out = defaultdict(list)
        # iterating empty ROOT vector causes segfaults!
        if len(mchits) == 0:
            self.log.debug("Found empty mchits, skipping...")
            return out
        for hit in mchits:
            out['a'].append(hit.a)
            out['origin'].append(hit.origin)
            out['pmt_id'].append(hit.pmt_id)
            out['time'].append(hit.t)
        out['group_id'] = self.group_id
        return Table(out, name='McHits', h5loc='/mc_hits', split_h5=True)

    def _parse_hits(self, hits):
        out = defaultdict(list)
        # iterating empty ROOT vector causes segfaults!
        if len(hits) == 0:
            self.log.debug("Found empty hits, skipping...")
            return out
        for hit in hits:
            out['channel_id'].append(hit.channel_id)
            out['dom_id'].append(hit.dom_id)
            out['time'].append(hit.t)
            out['tot'].append(hit.tot)
            out['triggered'].append(hit.trig)
        out['group_id'] = self.group_id
        return Table(out, name='Hits', h5loc='/hits', split_h5=True)

    @staticmethod
    def _parse_header(header):
        tags = {}
        for key, taglist in header._hdr_dict():
            tags[key] = [k for k in taglist]
        out = {}
        if len(header) == 0:
            return out
        for i, (key, entries) in enumerate(header):
            out[key] = {}
            for j, elem in enumerate(entries.split()):
                if key in tags:
                    try:
                        elem_name = tags[key][j]
                    except IndexError:
                        elem_name = '{}_{}'.format(key, j)
                        log.info(
                            "Can't infer field name, "
                            "setting to '{}'...".format(elem_name)
                        )
                else:
                    elem_name = '{}_{}'.format(key, j)
                    log.info(
                        "Can't infer field name, "
                        "setting to '{}'...".format(elem_name)
                    )
                out[key][elem_name] = elem
        return out

    # TODO: delete this method and use the function in io/hdf5.py
    @staticmethod
    def _convert_header_dict_to_table(header_dict):
        if not header_dict:
            log.warn("Can't convert empty header dict to table, skipping...")
            return
        tab_dict = defaultdict(list)
        log.debug("Param:   field_names    field_values    dtype")
        for parameter, data in header_dict.items():
            fields = []
            values = []
            types = []
            for field_name, field_value in data.items():
                fields.append(field_name)
                values.append(field_value)
                try:
                    _ = float(field_value)    # noqa
                    types.append('f4')
                except ValueError:
                    types.append('a{}'.format(len(field_value)))
            tab_dict['parameter'].append(parameter)
            tab_dict['field_names'].append(' '.join(fields))
            tab_dict['field_values'].append(' '.join(values))
            tab_dict['dtype'].append(' '.join(types))
            log.debug(
                "{}: {} {} {}".format(
                    tab_dict['parameter'][-1],
                    tab_dict['field_names'][-1],
                    tab_dict['field_values'][-1],
                    tab_dict['dtype'][-1],
                )
            )
        return Table(
            tab_dict, h5loc='/raw_header', name='RawHeader', h5singleton=True
        )

    def _read_event(self, event, filename):
        blob = Blob()
        if self.ignore_hits:
            self.log.debug('Skipping Hits...')
        else:
            self.log.debug('Reading Hits...')
            blob['Hits'] = self._parse_hits(event.hits)
            self.log.debug('Reading McHits...')
            blob['McHits'] = self._parse_mchits(event.mc_hits)
        self.log.debug('Reading McTracks...')
        blob['McTracks'] = self._parse_mctracks(event.mc_trks)
        self.log.debug('Reading EventInfo...')
        blob['EventInfo'] = self._parse_eventinfo(event)
        self.log.debug('Reading Tracks...')
        blob.update(self._parse_tracks(event.trks))
        return blob

[docs]    def process(self, blob=None):
        return next(self.blobs)

    def __iter__(self):
        return self

    def __next__(self):
        return next(self.blobs)


[docs]class MetaParser(object):
    """A class which parses the JPrintMeta output for a given filenam"""

    def __init__(self, filename=None, string=None):
        self.log = get_logger(__name__ + '.' + self.__class__.__name__)
        self.meta = []
        if filename is not None:
            string = subprocess.check_output(['JPrintMeta', '-f', filename])
            try:
                self.parse_string(string)
            except IndexError:
                self.log.error("The Jpp metadata could not be parsed.")

[docs]    def parse_string(self, string):
        """Parse ASCII output of JPrintMeta"""
        self.log.info("Parsing ASCII data")

        if not string:
            self.log.warning("Empty metadata")
            return

        lines = string.splitlines()
        application_data = []

        application = lines[0].split()[0]
        self.log.debug("Reading meta information for '%s'" % application)

        for line in lines:
            if application is None:
                self.log.debug(
                    "Reading meta information for '%s'" % application
                )
                application = line.split()[0]
            application_data.append(line)
            if line.startswith(application + b' Linux'):
                self._record_app_data(application_data)
                application_data = []
                application = None

    def _record_app_data(self, data):
        """Parse raw metadata output for a single application
        
        The usual output is:
        ApplicationName RevisionNumber
        ApplicationName ROOT_Version
        ApplicationName KM3NET
        ApplicationName ./command/line --arguments --which --can
        contain
        also
        multiple lines
        and --addtional flags
        etc.
        ApplicationName Linux ... (just the `uname -a` output)
        """
        name, revision = data[0].split()
        root_version = data[1].split()[1]
        command = b'\n'.join(data[3:]).split(b'\n' + name + b' Linux')[0]
        self.meta.append({
            'application_name': np.string_(name),
            'revision': np.string_(revision),
            'root_version': np.string_(root_version),
            'command': np.string_(command)
        })

[docs]    def get_table(self, name='Meta', h5loc='/meta'):
        """Convert metadata to a KM3Pipe Table.

        Returns `None` if there is no data.
        
        Each column's dtype will be set to a fixed size string (numpy.string_)
        with the length of the longest entry, since writing variable length
        strings does not fit the current scheme.
        """
        if not self.meta:
            return None

        data = defaultdict(list)
        for entry in self.meta:
            for key, value in entry.items():
                data[key].append(value)
        dtypes = []
        for key, values in data.items():
            max_len = max(map(len, values))
            dtype = 'S{}'.format(max_len)
            dtypes.append((key, dtype))
        tab = Table(
            data, dtype=dtypes, h5loc=h5loc, name='Meta', h5singleton=True
        )
        return tab