LSST Applications g0b6bd0c080+a72a5dd7e6,g1182afd7b4+2a019aa3bb,g17e5ecfddb+2b8207f7de,g1d67935e3f+06cf436103,g38293774b4+ac198e9f13,g396055baef+6a2097e274,g3b44f30a73+6611e0205b,g480783c3b1+98f8679e14,g48ccf36440+89c08d0516,g4b93dc025c+98f8679e14,g5c4744a4d9+a302e8c7f0,g613e996a0d+e1c447f2e0,g6c8d09e9e7+25247a063c,g7271f0639c+98f8679e14,g7a9cd813b8+124095ede6,g9d27549199+a302e8c7f0,ga1cf026fa3+ac198e9f13,ga32aa97882+7403ac30ac,ga786bb30fb+7a139211af,gaa63f70f4e+9994eb9896,gabf319e997+ade567573c,gba47b54d5d+94dc90c3ea,gbec6a3398f+06cf436103,gc6308e37c7+07dd123edb,gc655b1545f+ade567573c,gcc9029db3c+ab229f5caf,gd01420fc67+06cf436103,gd877ba84e5+06cf436103,gdb4cecd868+6f279b5b48,ge2d134c3d5+cc4dbb2e3f,ge448b5faa6+86d1ceac1d,gecc7e12556+98f8679e14,gf3ee170dca+25247a063c,gf4ac96e456+ade567573c,gf9f5ea5b4d+ac198e9f13,gff490e6085+8c2580be5c,w.2022.27
LSST Data Management Base Package
registries.py
Go to the documentation of this file.
#
# LSST Data Management System
# Copyright 2008, 2009, 2010 LSST Corporation.
#
# This product includes software developed by the
# LSST Project (http://www.lsst.org/).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the LSST License Statement and
# the GNU General Public License along with this program.  If not,
# see <http://www.lsstcorp.org/LegalNotices/>.
#

"""This module provides registry classes for maintaining dataset metadata
for use by the Data Butler.  Currently only a SQLite3-based registry is
implemented, but registries based on a text file, a policy file, a MySQL
(or other) relational database, and data gathered from scanning a filesystem
are all anticipated.

Currently this module assumes posix access (for both PosixRegistry AND
SqliteRegistry). It is possible that it can be factored so that at least the
SqliteRegistry can be remote/not on the local filesystem. For now this module
is only used by CameraMapper and by PosixStorage, both of which work on the
local filesystem only, so this works for the time being.
"""
import copy
import os
import re

import astropy.io.fits
import yaml

from . import fsScanner, sequencify

# SQLite support: prefer the standard-library module and fall back to the
# external pysqlite package.  ``haveSqlite3`` records whether either import
# succeeded.
try:
    import sqlite3
    haveSqlite3 = True
except ImportError:
    try:
        # try external pysqlite package; deprecated
        import sqlite as sqlite3
        haveSqlite3 = True
    except ImportError:
        haveSqlite3 = False

# PostgreSQL support: ``havePgsql`` records whether psycopg2 is importable.
try:
    import psycopg2 as pgsql
    havePgsql = True
except ImportError:
    havePgsql = False


class Registry:
    """The registry base class."""

    def __init__(self):
        pass

    def __del__(self):
        pass

    @staticmethod
    def create(location):
        """Create a registry object of an appropriate type.

        The type is chosen from `location`, tried in this order: a name
        ending in ".pgsql" gives a PgsqlRegistry, a name matching
        ".sqlite3" gives a SqliteRegistry, and an existing directory gives a
        PosixRegistry.

        @param location (string) Path or URL for registry, or None if
                        unavailable
        @return the registry object, or None if location is None or the
                SqliteRegistry that was built has no connection
        @throws RuntimeError if an sqlite3 registry is requested but the
                sqlite3 module could not be imported, or if no registry type
                can be selected for location
        """
        if location is None:
            return None

        if location.endswith(".pgsql"):
            return PgsqlRegistry(location)

        # look for an sqlite3 registry
        if re.match(r'.*\.sqlite3', location):
            if not haveSqlite3:
                raise RuntimeError("sqlite3 registry specified (%s), but unable to import sqlite3 module" %
                                   (location,))
            registry = SqliteRegistry(location)
            if registry.conn is None:
                return None
            return registry

        # next try to create a PosixRegistry
        if os.path.isdir(location):
            return PosixRegistry(root=location)

        raise RuntimeError("Unable to create registry using location: " + location)


class PosixRegistry(Registry):
    """A glob-based filesystem registry"""

    def __init__(self, root):
        Registry.__init__(self)
        self.root = root

    @staticmethod
    def getHduNumber(template, dataId):
        """Looks up the HDU number for a given template+dataId.

        :param template: template with HDU specifier (ends with brackets and an
        identifier that can be populated by a key-value pair in dataId.
        e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"
        :param dataId: dictionary that hopefully has a key-value pair whose key
        matches (has the same name) as the key specifier in the template.
        :return: the HDU specified by the template+dataId pair, or None if the
        HDU can not be determined.
        """
        # sanity check that the template at least ends with a bracket.
        if not template.endswith(']'):
            return None

        # get the key (with formatting) out of the brackets
        hduKey = template[template.rfind('[') + 1:template.rfind(']')]
        # extract the key name from the formatting
        hduKey = hduKey[hduKey.rfind('(') + 1:hduKey.rfind(')')]

        if hduKey in dataId:
            return dataId[hduKey]
        return None

    class LookupData:
        """Working state for one lookup: what was asked for, what has been
        found so far, and a cached verdict on whether the two agree.
        """

        def __init__(self, lookupProperties, dataId):
            self.dataId = copy.copy(dataId)
            lookupProperties = sequencify(lookupProperties)
            self.lookupProperties = copy.copy(lookupProperties)
            self.foundItems = {}
            self.cachedStatus = None
            # every key that must end up in foundItems: the requested
            # properties plus the dataId keys used for refinement
            self.neededKeys = set(lookupProperties).union(dataId.keys())

        def __repr__(self):
            return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % \
                (self.lookupProperties, self.dataId, self.foundItems, self.cachedStatus)

        def status(self):
            """Query the lookup status

            The result is cached until the found items are next changed.

            :return: 'match' if every key in lookupProperties has been found and no found value
                     conflicts with dataId
                     'incomplete' if no found value conflicts with dataId but some key in
                     lookupProperties has not been found
                     'notMatch' if a found value differs from the value for the same key in dataId
            """
            if self.cachedStatus is not None:
                return self.cachedStatus

            # Presence is tested with ``in`` rather than by comparing against
            # None, because None might be a valid found value.
            result = 'match'
            if any(key not in self.foundItems for key in self.lookupProperties):
                result = 'incomplete'
            for dataIdKey, dataIdValue in self.dataId.items():
                if dataIdKey in self.foundItems and self.foundItems[dataIdKey] != dataIdValue:
                    result = 'notMatch'
                    break
            self.cachedStatus = result
            return result

        def setFoundItems(self, items):
            """Replace the found items with `items` and drop the cached status."""
            self.cachedStatus = None
            self.foundItems = items

        def addFoundItems(self, items):
            """Merge `items` into the found items and drop the cached status."""
            self.cachedStatus = None
            self.foundItems.update(items)

        def getMissingKeys(self):
            """Return the set of needed keys that are not yet in the found items."""
            return self.neededKeys - set(self.foundItems.keys())

    def lookup(self, lookupProperties, reference, dataId, **kwargs):
        """Perform a lookup in the registry.

        Return values are refined by the values in dataId.
        Returns a list of values that match keys in lookupProperties.
        e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
        dataId={'visit':1}, and lookupProperties is ['filter'], and the
        filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
        then the return value will be [('g',)]

        :param lookupProperties: keys whose values will be returned.
        :param reference: not used by this registry type.
        :param dataId: mapping of key to value. A file is only reported if every dataId key
                       that was found for it has a value equal to the one given here.
        :param **kwargs: keys required for the posix registry to search for items. If required keys are not
                         provided will return an empty list.
            'template': required. template parameter (typically from a policy) that can be used to look for files
            'storage': optional. Needed to look for metadata in files. Currently supported values: 'FitsStorage'.
        :return: a list of tuples, one per matching file, holding the values for the keys in
                 lookupProperties (in that order).
        """
        # required kwargs:
        if 'template' not in kwargs:
            return []
        template = kwargs['template']
        # optional kwargs:
        storage = kwargs.get('storage')

        lookupData = PosixRegistry.LookupData(lookupProperties, dataId)
        scanner = fsScanner.FsScanner(template)
        allPaths = scanner.processPath(self.root)
        retItems = []  # one item for each found file that matches
        for path, foundProperties in allPaths.items():
            # Start from the properties recovered from the path itself; if
            # requested properties are still missing, look for them in the
            # file's metadata before deciding whether the file matches.
            lookupData.setFoundItems(foundProperties)
            if lookupData.status() == 'incomplete':
                PosixRegistry.lookupMetadata(os.path.join(self.root, path), template, lookupData, storage)
            if lookupData.status() == 'match':
                retItems.append(tuple(lookupData.foundItems[key] for key in lookupData.lookupProperties))
        return retItems

    @staticmethod
    def lookupMetadata(filepath, template, lookupData, storage):
        """Dispatcher for looking up metadata in a file of a given storage type

        Only 'FitsStorage' is handled; any other storage value is a no-op.
        """
        if storage == 'FitsStorage':
            # The last argument is the dataId used to resolve the HDU named
            # in the template, so pass the lookup's dataId (not the storage
            # name).
            PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, lookupData.dataId)

    @staticmethod
    def lookupFitsMetadata(filepath, template, lookupData, dataId):
        """Look up metadata in a fits file.

        Will try to discover the correct HDU to look in by testing if the
        template has a value in brackets at the end.
        If the HDU is specified but the metadata key is not discovered in
        that HDU, will look in the primary HDU before giving up.

        :param filepath: path to the file
        :param template: template that was used to discover the file. This can
        be used to look up the correct HDU as needed.
        :param lookupData: an instance of LookupData that contains the
        lookupProperties, the dataId, and the data that has been found so far.
        Will be updated with new information as discovered; a key that is in
        neither HDU is recorded with the value None.
        :param dataId: container searched for the HDU key named by the template.
        :return: None. If the file can not be opened nothing is changed.
        """
        try:
            hdulist = astropy.io.fits.open(filepath, memmap=True)
        except IOError:
            return

        # ``with`` guarantees the file is closed once the headers are read.
        with hdulist:
            hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
            # Only an integer can be compared with the HDU count and used as
            # an index; anything else is treated as "HDU unknown".
            if isinstance(hduNumber, int) and hduNumber < len(hdulist):
                hdu = hdulist[hduNumber]
            else:
                hdu = None
            primaryHdu = hdulist[0] if len(hdulist) > 0 else None

            for key in lookupData.getMissingKeys():
                value = None
                if hdu is not None and key in hdu.header:
                    value = hdu.header[key]
                # if the value is not in the indicated HDU, try the primary HDU:
                elif primaryHdu is not None and key in primaryHdu.header:
                    value = primaryHdu.header[key]
                lookupData.addFoundItems({key: value})


class SqlRegistry(Registry):
    """A base class for SQL-based registries

    Subclasses should define the class variable `placeHolder` (the particular
    placeholder to use for parameter substitution) appropriately. The
    database's python module should define `paramstyle` (see PEP 249), which
    would indicate what to use for a placeholder:
    * paramstyle = "qmark" --> placeHolder = "?"
    * paramstyle = "format" --> placeHolder = "%s"
    Other `paramstyle` values are not currently supported.

    Constructor parameters
    ----------------------
    conn : DBAPI connection object
        Connection object
    """
    placeHolder = "?"  # Placeholder for parameter substitution

    def __init__(self, conn):
        """Constructor.

        Parameters
        ----------
        conn : DBAPI connection object
            Connection object
        """
        Registry.__init__(self)
        self.conn = conn

    def __del__(self):
        # ``conn`` may be missing if a subclass constructor failed before
        # calling SqlRegistry.__init__, hence the hasattr guard.
        if hasattr(self, "conn") and self.conn:
            self.conn.close()
        super().__del__()

    def _lookup(self, lookupProperties, dataId, reference, checkColumns=False):
        """Perform a lookup in the registry.

        This is the worker code for cls.lookup with the added option of checking
        that all the columns being looked up are in the database.  The classic
        example here is adding a template with an hdu, where the hdu in the dataId
        prevents us looking up e.g. dateObs.  checkColumns results in a performance
        penalty, so is only invoked when a problem in the dataId keys has been seen

        Return values are refined by the values in dataId.
        Returns a list of values that match keys in lookupProperties.
        e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
        dataId={'visit':1}, and lookupProperties is ['filter'], and the
        filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
        then the return value will be [('g',)]

        :param lookupProperties: sequence of column names to select.
        :param dataId: must be a key/value iterable. A string key adds the condition
                       ``key = value``. A non-string iterable key must hold exactly two column
                       names and adds the condition ``value BETWEEN key[0] AND key[1]``.
        :param reference: sequence of table names, combined with NATURAL JOIN.
        :param checkColumns: if True, check that keys are actually in the registry and ignore them if not
        :return: a list of values that match keys in lookupProperties.
        :raises RuntimeError: if a non-string iterable key does not hold exactly two items.
        """
        # Column and table names are interpolated into the statement because
        # identifiers can not be bound as parameters; only the dataId values
        # are passed as bound parameters.
        tables = " NATURAL JOIN ".join(reference)
        cmd = "SELECT DISTINCT " + ", ".join(lookupProperties) + " FROM " + tables
        whereList = []
        valueList = []
        if dataId is not None and len(dataId) > 0:
            for k, v in dataId.items():
                isRange = hasattr(k, '__iter__') and not isinstance(k, str)
                if isRange and len(k) != 2:
                    raise RuntimeError("Wrong number of keys for range:%s" % (k,))

                if checkColumns:  # check if k is in registry
                    # A range key names two columns; probe both of them.
                    columns = ", ".join(k) if isRange else k
                    try:
                        self.conn.cursor().execute(f'SELECT {columns} FROM {tables} LIMIT 1')
                    except sqlite3.OperationalError:
                        continue

                if isRange:
                    whereList.append("(%s BETWEEN %s AND %s)" % (self.placeHolder, k[0], k[1]))
                else:
                    whereList.append("%s = %s" % (k, self.placeHolder))
                valueList.append(v)
        # checkColumns may have discarded every key; a bare "WHERE" would be
        # a syntax error, so only add the clause when there are conditions.
        if whereList:
            cmd += " WHERE " + " AND ".join(whereList)
        cursor = self.conn.cursor()
        cursor.execute(cmd, valueList)
        return list(cursor.fetchall())

    def lookup(self, lookupProperties, reference, dataId, **kwargs):
        """Perform a lookup in the registry.

        Return values are refined by the values in dataId.
        Returns a list of values that match keys in lookupProperties.
        e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
        dataId={'visit':1}, and lookupProperties is ['filter'], and the
        filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
        then the return value will be [('g',)]

        :param lookupProperties: column name, or sequence of column names, to select.
        :param dataId: must be a key/value iterable. A string key selects rows where that column
                       equals the value. A key that is a 2-item iterable of column names selects
                       rows where the value lies between those two columns.
        :param reference: table name, or sequence of table names, to search.
        :param **kwargs: nothing needed for sqlite lookup
        :return: a list of values that match keys in lookupProperties, or None if there is no
                 database connection.
        """
        if not self.conn:
            return None

        # input variable sanitization:
        reference = sequencify(reference)
        lookupProperties = sequencify(lookupProperties)

        # NOTE(review): only sqlite3's OperationalError triggers the retry;
        # confirm whether other database drivers need the same fallback.
        try:
            return self._lookup(lookupProperties, dataId, reference)
        except sqlite3.OperationalError:  # try again, with extra checking of the dataId keys
            return self._lookup(lookupProperties, dataId, reference, checkColumns=True)

    def executeQuery(self, returnFields, joinClause, whereFields, range, values):
        """Extract metadata from the registry.

        @param returnFields (list of strings) Metadata fields to be extracted.
        @param joinClause   (list of strings) Tables in which metadata fields
                            are located.
        @param whereFields  (list of tuples) First tuple element is metadata
                            field to query; second is the value that field
                            must have (often '?').
        @param range        (tuple) Value, lower limit, and upper limit for a
                            range condition on the metadata.  Any of these can
                            be metadata fields.
        @param values       (tuple) Tuple of values to be substituted for '?'
                            characters in the whereFields values or the range
                            values.
        @return (list of tuples) All sets of field values that meet the
                criteria, or None if there is no database connection"""
        if not self.conn:
            return None
        cmd = "SELECT DISTINCT " + ", ".join(returnFields)
        cmd += " FROM " + " NATURAL JOIN ".join(joinClause)
        whereList = []
        if whereFields:
            whereList.extend("(%s = %s)" % (k, v) for k, v in whereFields)
        if range is not None:
            whereList.append("(%s BETWEEN %s AND %s)" % range)
        if whereList:
            cmd += " WHERE " + " AND ".join(whereList)
        cursor = self.conn.cursor()
        cursor.execute(cmd, values)
        return list(cursor.fetchall())


class SqliteRegistry(SqlRegistry):
    """A SQLite-based registry"""
    placeHolder = "?"  # Placeholder for parameter substitution

    def __init__(self, location):
        """Constructor

        If `location` does not exist no connection is opened and the
        registry's ``conn`` is None.

        Parameters
        ----------
        location : `str`
            Path to SQLite3 file
        """
        if os.path.exists(location):
            conn = sqlite3.connect(location)
            conn.text_factory = str
            self.root = location
        else:
            conn = None
        SqlRegistry.__init__(self, conn)


class PgsqlRegistry(SqlRegistry):
    """A PostgreSQL-based registry"""
    placeHolder = "%s"

    def __init__(self, location):
        """Constructor

        Parameters
        ----------
        location : `str`
            Path to PostgreSQL configuration file.

        Raises
        ------
        RuntimeError
            If psycopg2 could not be imported, or the configuration file
            does not have the expected entries.
        """
        if not havePgsql:
            raise RuntimeError("Cannot use PgsqlRegistry: could not import psycopg2")
        config = self.readYaml(location)
        self._config = config
        conn = pgsql.connect(host=config["host"], port=config["port"], database=config["database"],
                             user=config["user"], password=config["password"])
        self.root = location
        SqlRegistry.__init__(self, conn)

    @staticmethod
    def readYaml(location):
        """Read YAML configuration file

        The YAML configuration file should contain:
        * host : host name for database connection
        * port : port for database connection
        * user : user name for database connection
        * database : database name

        It may also contain:
        * password : password for database connection

        The optional entries are set to `None` in the output configuration.

        Parameters
        ----------
        location : `str`
            Path to PostgreSQL YAML config file.

        Returns
        -------
        config : `dict`
            Configuration

        Raises
        ------
        RuntimeError
            If the file does not hold a mapping, or its keys are not exactly
            the required ones plus, optionally, the optional ones.
        """
        # NOTE(review): UnsafeLoader/Loader can construct arbitrary Python
        # objects from tagged YAML.  The expected content is a flat mapping,
        # so consider yaml.safe_load if the file may not be trusted.
        try:
            # PyYAML >=5.1 prefers a different loader
            loader = yaml.UnsafeLoader
        except AttributeError:
            loader = yaml.Loader
        with open(location) as ff:
            data = yaml.load(ff, Loader=loader)

        # An empty file loads as None and a scalar or list has no keys;
        # report either as a configuration error.
        if not isinstance(data, dict):
            raise RuntimeError(
                "PostgreSQL YAML configuration (%s) should be a mapping, but this contains: %r" %
                (location, data)
            )

        requireKeys = {"host", "port", "database", "user"}
        optionalKeys = {"password"}
        haveKeys = set(data.keys())
        if haveKeys - optionalKeys != requireKeys:
            raise RuntimeError(
                "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', "
                "but this contains: %s" %
                (location, ",".join("'%s'" % key for key in sorted(requireKeys)),
                 ",".join("'%s'" % key for key in data.keys()))
            )
        for key in optionalKeys:
            if key not in data:
                data[key] = None

        return data

    def lookup(self, *args, **kwargs):
        """Perform a lookup as in `SqlRegistry.lookup`.

        If the lookup raises, the connection is rolled back before the
        exception is re-raised.
        """
        try:
            return SqlRegistry.lookup(self, *args, **kwargs)
        except Exception:
            self.conn.rollback()
            raise
std::vector< SchemaItem< Flag > > * items
def __init__(self, lookupProperties, dataId)
Definition: registries.py:139
def lookupFitsMetadata(filepath, template, lookupData, dataId)
Definition: registries.py:247
def lookup(self, lookupProperties, reference, dataId, **kwargs)
Definition: registries.py:192
def lookupMetadata(filepath, template, lookupData, storage)
Definition: registries.py:240
def lookup(self, lookupProperties, reference, dataId, **kwargs)
Definition: registries.py:370
def _lookup(self, lookupProperties, dataId, reference, checkColumns=False)
Definition: registries.py:320
def executeQuery(self, returnFields, joinClause, whereFields, range, values)
Definition: registries.py:401
daf::base::PropertySet * set
Definition: fits.cc:912