apdbCassandraSchema.py
# This file is part of dax_apdb.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ["ApdbCassandraSchema"]

import enum
import logging
from collections.abc import Mapping
from typing import TYPE_CHECKING

import felis.datamodel

from .. import schema_model
from ..apdbSchema import ApdbSchema, ApdbTables

if TYPE_CHECKING:
    import cassandra.cluster


_LOG = logging.getLogger(__name__)


class InconsistentSchemaError(RuntimeError):
    """Exception raised when schema state is inconsistent."""


@enum.unique
class ExtraTables(enum.Enum):
    """Names of the extra tables used by the Cassandra implementation."""

    ApdbReplicaChunks = "ApdbReplicaChunks"
    """Name of the table for replica chunk records."""

    DiaObjectChunks = "DiaObjectChunks"
    """Name of the table for DIAObject chunk data."""

    DiaSourceChunks = "DiaSourceChunks"
    """Name of the table for DIASource chunk data."""

    DiaForcedSourceChunks = "DiaForcedSourceChunks"
    """Name of the table for DIAForcedSource chunk data."""

    DiaSourceToPartition = "DiaSourceToPartition"
    """Maps diaSourceId to its partition values (pixel and time)."""

    def table_name(self, prefix: str = "") -> str:
        """Return full table name."""
        return prefix + self.value

    @classmethod
    def replica_chunk_tables(cls) -> Mapping[ExtraTables, ApdbTables]:
        """Return mapping of tables used for replica chunks storage to their
        corresponding regular tables.
        """
        return {
            cls.DiaObjectChunks: ApdbTables.DiaObject,
            cls.DiaSourceChunks: ApdbTables.DiaSource,
            cls.DiaForcedSourceChunks: ApdbTables.DiaForcedSource,
        }

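# A small illustration (the prefix value is hypothetical) of how the enum
# members above are used; table_name() simply prepends the configured prefix:
#
#   ExtraTables.DiaObjectChunks.table_name("pre_")  # -> "pre_DiaObjectChunks"
#   ExtraTables.replica_chunk_tables()[ExtraTables.DiaSourceChunks]
#   # -> ApdbTables.DiaSource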

class ApdbCassandraSchema(ApdbSchema):
    """Class for management of APDB schema.

    Parameters
    ----------
    session : `cassandra.cluster.Session`
        Cassandra session object.
    keyspace : `str`
        Keyspace name for all tables.
    schema_file : `str`
        Name of the YAML schema file.
    schema_name : `str`, optional
        Name of the schema in YAML files.
    prefix : `str`, optional
        Prefix to add to all schema elements.
    time_partition_tables : `bool`
        If `True` then the schema will have a separate table for each time
        partition.
    enable_replica : `bool`, optional
        If `True` then use additional tables for replica chunks.
    """
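
    # A minimal usage sketch (illustration only; the host, keyspace, and
    # schema file names below are hypothetical placeholders):
    #
    #     from cassandra.cluster import Cluster
    #
    #     session = Cluster(["cassandra-host"]).connect()
    #     schema = ApdbCassandraSchema(
    #         session, keyspace="apdb", schema_file="apdb.yaml", enable_replica=True
    #     )
    #     schema.makeSchema(replication_factor=3)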

    _type_map = {
        felis.datamodel.DataType.double: "DOUBLE",
        felis.datamodel.DataType.float: "FLOAT",
        felis.datamodel.DataType.timestamp: "TIMESTAMP",
        felis.datamodel.DataType.long: "BIGINT",
        felis.datamodel.DataType.int: "INT",
        felis.datamodel.DataType.short: "INT",
        felis.datamodel.DataType.byte: "TINYINT",
        felis.datamodel.DataType.binary: "BLOB",
        felis.datamodel.DataType.char: "TEXT",
        felis.datamodel.DataType.string: "TEXT",
        felis.datamodel.DataType.unicode: "TEXT",
        felis.datamodel.DataType.text: "TEXT",
        felis.datamodel.DataType.boolean: "BOOLEAN",
        schema_model.ExtraDataTypes.UUID: "UUID",
    }
    """Map of YAML column types to Cassandra column types."""

    _time_partitioned_tables = [
        ApdbTables.DiaObject,
        ApdbTables.DiaSource,
        ApdbTables.DiaForcedSource,
    ]
    _spatially_partitioned_tables = [ApdbTables.DiaObjectLast]

    def __init__(
        self,
        session: cassandra.cluster.Session,
        keyspace: str,
        schema_file: str,
        schema_name: str = "ApdbSchema",
        prefix: str = "",
        time_partition_tables: bool = False,
        enable_replica: bool = False,
    ):
        super().__init__(schema_file, schema_name)

        self._session = session
        self._keyspace = keyspace
        self._prefix = prefix
        self._time_partition_tables = time_partition_tables
        self._enable_replica = enable_replica
        self._has_replica_chunks: bool | None = None

        self._apdb_tables = self._apdb_tables_schema(time_partition_tables)
        self._extra_tables = self._extra_tables_schema()

    def _apdb_tables_schema(self, time_partition_tables: bool) -> Mapping[ApdbTables, schema_model.Table]:
        """Generate schema for regular APDB tables."""
        apdb_tables: dict[ApdbTables, schema_model.Table] = {}

        # Add columns and index for partitioning.
        for table, apdb_table_def in self.tableSchemas.items():
            part_columns = []
            add_columns = []
            primary_key = apdb_table_def.primary_key[:]
            if table in self._spatially_partitioned_tables:
                # DiaObjectLast does not need temporal partitioning.
                part_columns = ["apdb_part"]
                add_columns = part_columns
            elif table in self._time_partitioned_tables:
                if time_partition_tables:
                    part_columns = ["apdb_part"]
                else:
                    part_columns = ["apdb_part", "apdb_time_part"]
                add_columns = part_columns
            elif table is ApdbTables.SSObject:
                # SSObject has no natural partition key, but it has to be
                # partitioned because there are too many records. Partition
                # on its primary key (and drop the separate primary key
                # index).
                part_columns = ["ssObjectId"]
                primary_key = []
            elif table is ApdbTables.metadata:
                # Metadata is in one partition because we want to read all of
                # it in one query; add an extra column for partitioning.
                part_columns = ["meta_part"]
                add_columns = part_columns
            else:
                # TODO: Do not know what to do with the other tables.
                continue

            column_defs = []
            if add_columns:
                column_defs = [
                    schema_model.Column(
                        id=f"#{name}", name=name, datatype=felis.datamodel.DataType.long, nullable=False
                    )
                    for name in add_columns
                ]

            annotations = dict(apdb_table_def.annotations)
            annotations["cassandra:apdb_column_names"] = [column.name for column in apdb_table_def.columns]
            if part_columns:
                annotations["cassandra:partitioning_columns"] = part_columns

            apdb_tables[table] = schema_model.Table(
                id=apdb_table_def.id,
                name=apdb_table_def.name,
                columns=column_defs + apdb_table_def.columns,
                primary_key=primary_key,
                indexes=[],
                constraints=[],
                annotations=annotations,
            )

        return apdb_tables

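    # For illustration: with time_partition_tables=False, the DiaObject entry
    # built above gets two extra BIGINT columns, "apdb_part" and
    # "apdb_time_part", prepended to its column list, plus annotations like
    #   {"cassandra:partitioning_columns": ["apdb_part", "apdb_time_part"],
    #    "cassandra:apdb_column_names": [...names from the felis schema...]}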
    def _extra_tables_schema(self) -> Mapping[ExtraTables, schema_model.Table]:
        """Generate schema for extra tables."""
        extra_tables: dict[ExtraTables, schema_model.Table] = {}

        # This table maps DiaSource ID to its partitions in the DiaSource
        # table and the DiaSourceChunks tables.
        extra_tables[ExtraTables.DiaSourceToPartition] = schema_model.Table(
            id="#" + ExtraTables.DiaSourceToPartition.value,
            name=ExtraTables.DiaSourceToPartition.table_name(self._prefix),
            columns=[
                schema_model.Column(
                    id="#diaSourceId",
                    name="diaSourceId",
                    datatype=felis.datamodel.DataType.long,
                    nullable=False,
                ),
                schema_model.Column(
                    id="#apdb_part", name="apdb_part", datatype=felis.datamodel.DataType.long, nullable=False
                ),
                schema_model.Column(
                    id="#apdb_time_part",
                    name="apdb_time_part",
                    datatype=felis.datamodel.DataType.int,
                    nullable=False,
                ),
                schema_model.Column(
                    id="#apdb_replica_chunk",
                    name="apdb_replica_chunk",
                    datatype=felis.datamodel.DataType.long,
                    nullable=True,
                ),
            ],
            primary_key=[],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["diaSourceId"]},
        )

        replica_chunk_column = schema_model.Column(
            id="#apdb_replica_chunk",
            name="apdb_replica_chunk",
            datatype=felis.datamodel.DataType.long,
            nullable=False,
        )

        if not self._enable_replica:
            return extra_tables

        # Table containing insert IDs; this one is not partitioned, but a
        # partition key must be defined.
        extra_tables[ExtraTables.ApdbReplicaChunks] = schema_model.Table(
            id="#" + ExtraTables.ApdbReplicaChunks.value,
            name=ExtraTables.ApdbReplicaChunks.table_name(self._prefix),
            columns=[
                schema_model.Column(
                    id="#partition", name="partition", datatype=felis.datamodel.DataType.int, nullable=False
                ),
                replica_chunk_column,
                schema_model.Column(
                    id="#last_update_time",
                    name="last_update_time",
                    datatype=felis.datamodel.DataType.timestamp,
                    nullable=False,
                ),
                schema_model.Column(
                    id="#unique_id",
                    name="unique_id",
                    datatype=schema_model.ExtraDataTypes.UUID,
                    nullable=False,
                ),
            ],
            primary_key=[replica_chunk_column],
            indexes=[],
            constraints=[],
            annotations={"cassandra:partitioning_columns": ["partition"]},
        )

        for chunk_table_enum, apdb_table_enum in ExtraTables.replica_chunk_tables().items():
            apdb_table_def = self.tableSchemas[apdb_table_enum]

            extra_tables[chunk_table_enum] = schema_model.Table(
                id="#" + chunk_table_enum.value,
                name=chunk_table_enum.table_name(self._prefix),
                columns=[replica_chunk_column] + apdb_table_def.columns,
                primary_key=apdb_table_def.primary_key[:],
                indexes=[],
                constraints=[],
                annotations={
                    "cassandra:partitioning_columns": ["apdb_replica_chunk"],
                    "cassandra:apdb_column_names": [column.name for column in apdb_table_def.columns],
                },
            )

        return extra_tables

    @property
    def has_replica_chunks(self) -> bool:
        """Whether insert ID tables are to be used (`bool`)."""
        if self._has_replica_chunks is None:
            self._has_replica_chunks = self._check_replica_chunks()
        return self._has_replica_chunks

    def _check_replica_chunks(self) -> bool:
        """Check whether the database has tables for tracking insert IDs."""
        table_name = ExtraTables.ApdbReplicaChunks.table_name(self._prefix)
        query = "SELECT count(*) FROM system_schema.tables WHERE keyspace_name = %s and table_name = %s"
        result = self._session.execute(query, (self._keyspace, table_name))
        row = result.one()
        return bool(row[0])

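    # A roughly equivalent manual check in cqlsh (sketch; keyspace "apdb" and
    # an empty table prefix are assumed):
    #
    #   SELECT count(*) FROM system_schema.tables
    #       WHERE keyspace_name = 'apdb' AND table_name = 'ApdbReplicaChunks';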
    def empty(self) -> bool:
        """Return True if the database schema is empty.

        Returns
        -------
        empty : `bool`
            `True` if none of the required APDB tables exist in the database,
            `False` if all required tables exist.

        Raises
        ------
        InconsistentSchemaError
            Raised when some of the required tables exist but not all.
        """
        query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name = %s"
        result = self._session.execute(query, (self._keyspace,))
        table_names = set(row[0] for row in result.all())

        existing_tables = []
        missing_tables = []
        for table_enum in self._apdb_tables:
            table_name = table_enum.table_name(self._prefix)
            if self._time_partition_tables and table_enum in self._time_partitioned_tables:
                # Check prefix for time-partitioned tables.
                exists = any(table.startswith(f"{table_name}_") for table in table_names)
            else:
                exists = table_name in table_names
            if exists:
                existing_tables.append(table_name)
            else:
                missing_tables.append(table_name)

        if not missing_tables:
            return False
        elif not existing_tables:
            return True
        else:
            raise InconsistentSchemaError(
                f"Only some required APDB tables exist: {existing_tables}, missing tables: {missing_tables}"
            )

    def tableName(self, table_name: ApdbTables | ExtraTables) -> str:
        """Return Cassandra table name for APDB table."""
        return table_name.table_name(self._prefix)

    def keyspace(self) -> str:
        """Return Cassandra keyspace for APDB tables."""
        return self._keyspace

    def getColumnMap(self, table_name: ApdbTables | ExtraTables) -> Mapping[str, schema_model.Column]:
        """Return mapping of column names to Column definitions.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            One of the known APDB table names.

        Returns
        -------
        column_map : `dict`
            Mapping of column names to `schema_model.Column` instances.
        """
        table_schema = self._table_schema(table_name)
        cmap = {column.name: column for column in table_schema.columns}
        return cmap

    def apdbColumnNames(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of column names for a table as defined in the APDB
        schema.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Enum for a table in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of regular columns in the table.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations["cassandra:apdb_column_names"]

    def partitionColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for table partitioning.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for partitioning.
        """
        table_schema = self._table_schema(table_name)
        return table_schema.annotations.get("cassandra:partitioning_columns", [])

    def clusteringColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return a list of columns used for clustering.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Table name in APDB schema.

        Returns
        -------
        columns : `list` of `str`
            Names of columns used for clustering.
        """
        table_schema = self._table_schema(table_name)
        return [column.name for column in table_schema.primary_key]

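    # For illustration (sketch; actual names come from the felis schema file):
    # with time_partition_tables=False, partitionColumns(ApdbTables.DiaSource)
    # would return ["apdb_part", "apdb_time_part"], while clusteringColumns()
    # returns the table's regular primary key column names.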
    def makeSchema(
        self,
        *,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
        replication_factor: int | None = None,
    ) -> None:
        """Create or re-create all tables.

        Parameters
        ----------
        drop : `bool`
            If `True` then drop tables before creating new ones. Note that
            only tables are dropped, not the whole keyspace.
        part_range : `tuple` [ `int` ] or `None`
            Start and end partition number for time partitions; the end is
            not inclusive. Used to create per-partition DiaObject, DiaSource,
            and DiaForcedSource tables. If `None` then per-partition tables
            are not created.
        replication_factor : `int`, optional
            Replication factor used when creating a new keyspace; if the
            keyspace already exists its replication factor is not changed.
        """
        # Try to create keyspace if it does not exist
        if replication_factor is None:
            replication_factor = 1
        query = (
            f'CREATE KEYSPACE IF NOT EXISTS "{self._keyspace}"'
            " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': "
            f"{replication_factor}"
            "}"
        )
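        # With the default replication_factor of 1 and a hypothetical
        # keyspace "apdb", the statement built above reads:
        #   CREATE KEYSPACE IF NOT EXISTS "apdb" WITH replication =
        #       {'class': 'SimpleStrategy', 'replication_factor': 1}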
        self._session.execute(query)

        for table in self._apdb_tables:
            self._makeTableSchema(table, drop, part_range)
        for extra_table in self._extra_tables:
            self._makeTableSchema(extra_table, drop, part_range)
        # Reset cached information.
        self._has_replica_chunks = None

    def _makeTableSchema(
        self,
        table: ApdbTables | ExtraTables,
        drop: bool = False,
        part_range: tuple[int, int] | None = None,
    ) -> None:
        _LOG.debug("Making table %s", table)

        fullTable = table.table_name(self._prefix)

        table_list = [fullTable]
        if part_range is not None:
            if table in self._time_partitioned_tables:
                partitions = range(*part_range)
                table_list = [f"{fullTable}_{part}" for part in partitions]

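        # Illustration (hypothetical values): part_range=(10, 13) with an
        # unprefixed DiaObject table yields
        # table_list == ["DiaObject_10", "DiaObject_11", "DiaObject_12"].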
        if drop:
            queries = [f'DROP TABLE IF EXISTS "{self._keyspace}"."{table_name}"' for table_name in table_list]
            futures = [self._session.execute_async(query, timeout=None) for query in queries]
            for future in futures:
                _LOG.debug("wait for query: %s", future.query)
                future.result()
                _LOG.debug("query finished: %s", future.query)

        queries = []
        for table_name in table_list:
            if_not_exists = "" if drop else "IF NOT EXISTS"
            columns = ", ".join(self._tableColumns(table))
            query = f'CREATE TABLE {if_not_exists} "{self._keyspace}"."{table_name}" ({columns})'
            _LOG.debug("query: %s", query)
            queries.append(query)
        futures = [self._session.execute_async(query, timeout=None) for query in queries]
        for future in futures:
            _LOG.debug("wait for query: %s", future.query)
            future.result()
            _LOG.debug("query finished: %s", future.query)

    def _tableColumns(self, table_name: ApdbTables | ExtraTables) -> list[str]:
        """Return the list of columns in a table.

        Parameters
        ----------
        table_name : `ApdbTables` or `ExtraTables`
            Name of the table.

        Returns
        -------
        column_defs : `list`
            List of strings in the format "column_name type".
        """
        table_schema = self._table_schema(table_name)

        # Must have partition columns and clustering columns.
        part_columns = table_schema.annotations.get("cassandra:partitioning_columns", [])
        clust_columns = [column.name for column in table_schema.primary_key]
        _LOG.debug("part_columns: %s", part_columns)
        _LOG.debug("clust_columns: %s", clust_columns)
        if not part_columns:
            raise ValueError(f"Table {table_name} configuration is missing partition index")

        # All columns.
        column_defs = []
        for column in table_schema.columns:
            ctype = self._type_map[column.datatype]
            column_defs.append(f'"{column.name}" {ctype}')

        # Primary key definition.
        part_columns = [f'"{col}"' for col in part_columns]
        clust_columns = [f'"{col}"' for col in clust_columns]
        if len(part_columns) > 1:
            columns = ", ".join(part_columns)
            part_columns = [f"({columns})"]
        pkey = ", ".join(part_columns + clust_columns)
        _LOG.debug("pkey: %s", pkey)
        column_defs.append(f"PRIMARY KEY ({pkey})")

        return column_defs

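    # Illustrative result (sketch; column names are hypothetical): for a
    # table partitioned on ("apdb_part", "apdb_time_part") and clustered on
    # "diaObjectId", the returned list ends with
    #   'PRIMARY KEY (("apdb_part", "apdb_time_part"), "diaObjectId")'
    # preceded by one '"name" TYPE' entry per column.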
    def _table_schema(self, table: ApdbTables | ExtraTables) -> schema_model.Table:
        """Return schema definition for a table."""
        if isinstance(table, ApdbTables):
            table_schema = self._apdb_tables[table]
        else:
            table_schema = self._extra_tables[table]
        return table_schema