from __future__ import annotations

__all__ = ["ApdbCassandraSchema"]

import enum
import logging
from typing import List, Mapping, Optional, TYPE_CHECKING, Tuple, Union

from .apdbSchema import ApdbSchema, ApdbTables, ColumnDef, IndexDef, IndexType, TableDef

if TYPE_CHECKING:
    import cassandra.cluster

_LOG = logging.getLogger(__name__)
@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by Cassandra implementation."""

    # Maps diaSourceId to its partition values (pixel and time).
    DiaSourceToPartition = "DiaSourceToPartition"

    def table_name(self, prefix: str = "") -> str:
        """Return full table name.

        Parameters
        ----------
        prefix : `str`, optional
            Prefix to prepend to the table name.

        Returns
        -------
        name : `str`
            Full name of the Cassandra table for this enum member.
        """
        return prefix + self.value
class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object.
    keyspace : `str`
        Name of the Cassandra keyspace holding APDB tables.
    schema_file : `str`
        Name of the YAML schema file.
    schema_name : `str`, optional
        Name of the schema in YAML files.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If True then schema will have a separate table for each time
        partition.
    """

    # Map YAML column types to Cassandra CQL types.
    # NOTE(review): only the "double" and "timestamp" entries survive in the
    # recovered source; the remaining mappings are reconstructed and should be
    # confirmed against the original file.
    _type_map = dict(
        double="DOUBLE",
        int="INT",
        long="BIGINT",
        string="TEXT",
        unicode="TEXT",
        char="TEXT",
        boolean="BOOLEAN",
        unsigned_long="BIGINT",
        unsigned_int="INT",
        timestamp="TIMESTAMP",
    )
    """Map YAML column types to Cassandra"""

    # Tables whose data can be split into per-time-partition tables.
    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]

    # Tables partitioned on the spatial (pixelization) column only.
    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]
92 session: cassandra.cluster.Session,
95 schema_name: str =
"ApdbSchema",
97 time_partition_tables: bool =
False
100 super().
__init__(schema_file, schema_name)
114 columns = [
"apdb_part"]
116 if time_partition_tables:
117 columns = [
"apdb_part"]
119 columns = [
"apdb_part",
"apdb_time_part"]
120 elif table
is ApdbTables.SSObject:
125 columns = [
"ssObjectId"]
127 index
for index
in tableDef.indices
if index.type
is not IndexType.PRIMARY
138 ColumnDef(name=name, type=
"long", nullable=
False)
for name
in columns
140 tableDef.columns = columnDefs + tableDef.columns
144 index =
IndexDef(name=f
"Part_{tableDef.name}", type=IndexType.PARTITION, columns=columns)
145 tableDef.indices.append(index)
149 def _extraTableSchema(self) -> Mapping[ExtraTables, TableDef]:
150 """Generate schema for extra tables."""
152 ExtraTables.DiaSourceToPartition:
TableDef(
153 name=ExtraTables.DiaSourceToPartition.value,
155 ColumnDef(name=
"diaSourceId", type=
"long", nullable=
False),
156 ColumnDef(name=
"apdb_part", type=
"long", nullable=
False),
157 ColumnDef(name=
"apdb_time_part", type=
"int", nullable=
False),
161 name=f
"Part_{ExtraTables.DiaSourceToPartition.value}",
162 type=IndexType.PARTITION,
163 columns=[
"diaSourceId"],
169 def tableName(self, table_name: Union[ApdbTables, ExtraTables]) -> str:
170 """Return Cassandra table name for APDB table.
172 return table_name.table_name(self.
_prefix_prefix)
174 def getColumnMap(self, table_name: Union[ApdbTables, ExtraTables]) -> Mapping[str, ColumnDef]:
175 """Returns mapping of column names to Column definitions.
179 table_name : `ApdbTables`
180 One of known APDB table names.
185 Mapping of column names to `ColumnDef` instances.
187 if isinstance(table_name, ApdbTables):
188 table_schema = self.
tableSchemastableSchemas[table_name]
191 cmap = {column.name: column
for column
in table_schema.columns}
195 """Return a list of columns used for table partitioning.
199 table_name : `ApdbTables`
200 Table name in APDB schema
204 columns : `list` of `str`
205 Names of columns
for used
for partitioning.
207 if isinstance(table_name, ApdbTables):
208 table_schema = self.
tableSchemastableSchemas[table_name]
211 for index
in table_schema.indices:
212 if index.type
is IndexType.PARTITION:
218 """Return a list of columns used for clustering.
222 table_name : `ApdbTables`
223 Table name in APDB schema
227 columns : `list` of `str`
228 Names of columns
for used
for clustering.
230 if isinstance(table_name, ApdbTables):
231 table_schema = self.
tableSchemastableSchemas[table_name]
234 for index
in table_schema.indices:
235 if index.type
is IndexType.PRIMARY:
239 def makeSchema(self, drop: bool =
False, part_range: Optional[Tuple[int, int]] =
None) ->
None:
240 """Create or re-create all tables.
245 If True then drop tables before creating new ones.
246 part_range : `tuple` [ `int` ]
or `
None`
247 Start
and end partition number
for time partitions, end
is not
248 inclusive. Used to create per-partition DiaObject, DiaSource,
and
249 DiaForcedSource tables. If `
None` then per-partition tables are
257 def _makeTableSchema(
259 table: Union[ApdbTables, ExtraTables],
261 part_range: Optional[Tuple[int, int]] =
None
264 _LOG.debug(
"Skipping schema for table %s", table)
266 _LOG.debug(
"Making table %s", table)
268 fullTable = table.table_name(self.
_prefix_prefix)
270 table_list = [fullTable]
271 if part_range
is not None:
273 partitions = range(*part_range)
274 table_list = [f
"{fullTable}_{part}" for part
in partitions]
278 f
'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name
in table_list
280 futures = [self.
_session_session.execute_async(query, timeout=
None)
for query
in queries]
281 for future
in futures:
282 _LOG.debug(
"wait for query: %s", future.query)
284 _LOG.debug(
"query finished: %s", future.query)
287 for table_name
in table_list:
288 if_not_exists =
"" if drop
else "IF NOT EXISTS"
290 query = f
'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
291 _LOG.debug(
"query: %s", query)
292 queries.append(query)
293 futures = [self.
_session_session.execute_async(query, timeout=
None)
for query
in queries]
294 for future
in futures:
295 _LOG.debug(
"wait for query: %s", future.query)
297 _LOG.debug(
"query finished: %s", future.query)
299 def _tableColumns(self, table_name: Union[ApdbTables, ExtraTables]) -> List[str]:
300 """Return set of columns in a table
304 table_name : `ApdbTables`
310 List of strings in the format
"column_name type".
312 if isinstance(table_name, ApdbTables):
313 table_schema = self.
tableSchemastableSchemas[table_name]
320 index_columns =
set()
321 for index
in table_schema.indices:
322 if index.type
is IndexType.PARTITION:
323 part_columns = index.columns
324 elif index.type
is IndexType.PRIMARY:
325 clust_columns = index.columns
326 index_columns.update(index.columns)
327 _LOG.debug(
"part_columns: %s", part_columns)
328 _LOG.debug(
"clust_columns: %s", clust_columns)
330 raise ValueError(f
"Table {table_name} configuration is missing partition index")
334 for column
in table_schema.columns:
335 ctype = self.
_type_map_type_map[column.type]
336 column_defs.append(f
'"{column.name}" {ctype}')
339 part_columns = [f
'"{col}"' for col
in part_columns]
340 clust_columns = [f
'"{col}"' for col
in clust_columns]
341 if len(part_columns) > 1:
342 columns =
", ".join(part_columns)
343 part_columns = [f
"({columns})"]
344 pkey =
", ".join(part_columns + clust_columns)
345 _LOG.debug(
"pkey: %s", pkey)
346 column_defs.append(f
"PRIMARY KEY ({pkey})")
std::vector< SchemaItem< Flag > > * items
list _time_partitioned_tables
List[str] clusteringColumns(self, Union[ApdbTables, ExtraTables] table_name)
Mapping[str, ColumnDef] getColumnMap(self, Union[ApdbTables, ExtraTables] table_name)
None makeSchema(self, bool drop=False, Optional[Tuple[int, int]] part_range=None)
Mapping[ExtraTables, TableDef] _extraTableSchema(self)
List[str] partitionColumns(self, Union[ApdbTables, ExtraTables] table_name)
str tableName(self, Union[ApdbTables, ExtraTables] table_name)
def __init__(self, cassandra.cluster.Session session, str keyspace, str schema_file, str schema_name="ApdbSchema", str prefix="", bool time_partition_tables=False)
List[str] _tableColumns(self, Union[ApdbTables, ExtraTables] table_name)
None _makeTableSchema(self, Union[ApdbTables, ExtraTables] table, bool drop=False, Optional[Tuple[int, int]] part_range=None)
list _spatially_partitioned_tables
str table_name(self, str prefix="")
daf::base::PropertySet * set
std::shared_ptr< FrameSet > append(FrameSet const &first, FrameSet const &second)
Construct a FrameSet that performs two transformations in series.