"""This module provides registry classes for maintaining dataset metadata
for use by the Data Butler. Currently only a SQLite3-based registry is
implemented, but registries based on a text file, a policy file, a MySQL
(or other) relational database, and data gathered from scanning a filesystem
are all anticipated.

Currently this module assumes posix access (for both PosixRegistry AND
SqliteRegistry). It is possible that it can be factored so that at least the
SqliteRegistry can be remote/not on the local filesystem. For now this module
is only used by CameraMapper and by PosixStorage, both of which work on the
local filesystem only, so this works for the time being.
36from . import fsScanner, sequencify
48 import sqlite
as sqlite3
55 import psycopg2
as pgsql
62 """The registry base class."""
72 """Create a registry object of an appropriate type.
73 @param location (string) Path
or URL
for registry,
or None if
82 if location.endswith(
".pgsql"):
86 if re.match(
r'.*\.sqlite3', location):
88 raise RuntimeError(
"sqlite3 registry specified (%s), but unable to import sqlite3 module" %
91 if registry.conn
is None:
100 if os.path.isdir(location):
103 raise RuntimeError(
"Unable to create registry using location: " + location)
107 """A glob-based filesystem registry"""
110 Registry.__init__(self)
"""Looks up the HDU number for a given template+dataId.
:param template: template with HDU specifier (ends with brackets and an
    identifier that can be populated by a key-value pair in dataId),
    e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"
:param dataId: dictionary that hopefully has a key-value pair whose key
    matches (has the same name as) the key specifier in the template.
:return: the HDU specified by the template+dataId pair, or None if the
    HDU cannot be determined.
125 if not template.endswith(
']'):
129 hduKey = template[template.rfind(
'[') + 1:template.rfind(
']')]
131 hduKey = hduKey[hduKey.rfind(
'(') + 1:hduKey.rfind(
')')]
134 return dataId[hduKey]
141 lookupProperties =
sequencify(lookupProperties)
148 return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % \
"""Query the lookup status

:return: 'match' if the key+value pairs in dataId have been satisfied and keys in
    lookupProperties have been found and their key+value added to resolvedId
    'incomplete' if the found data matches but not all keys in lookupProperties have been matched
    'not match' if data in foundId does not match data in dataId
"""Placeholder class for item not found.

(None might be a valid value so we don't want to use that)
170 val = self.
foundItemsfoundItems.get(key, NotFound)
174 for dataIdKey, dataIdValue
in self.
dataIddataId.
items():
175 foundValue = self.
foundItemsfoundItems.get(dataIdKey, NotFound)
176 if foundValue
is not NotFound
and foundValue != dataIdValue:
192 def lookup(self, lookupProperties, reference, dataId, **kwargs):
"""Perform a lookup in the registry.

Return values are refined by the values in dataId.
Returns a list of values that match keys in lookupProperties.
e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
dataId={'visit':1}, and lookupProperties is ['filter'], and the
filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
then the return value will be [('g',)]

:param lookupProperties: keys whose values will be returned.
:param reference: other data types that may be used to search for values.
:param dataId: must be an iterable. Keys must be string.
    If value is a string then will look for elements in the repository that match value for key.
    If value is a 2-item iterable then will look for elements in the repository that are between (inclusive)
    the first and second items in the value.
:param **kwargs: keys required for the posix registry to search for items. If required keys are not
    provided, will return an empty list.
    'template': required. template parameter (typically from a policy) that can be used to look for files
    'storage': optional. Needed to look for metadata in files. Currently supported values: 'FitsStorage'.
:return: a list of values that match keys in lookupProperties.
215 if 'template' in kwargs:
216 template = kwargs[
'template']
220 storage = kwargs[
'storage']
if 'storage' in kwargs
else None
224 allPaths = scanner.processPath(self.
rootroot)
226 for path, foundProperties
in allPaths.items():
231 lookupData.setFoundItems(foundProperties)
232 if 'incomplete' == lookupData.status():
233 PosixRegistry.lookupMetadata(os.path.join(self.
rootroot, path), template, lookupData, storage)
234 if 'match' == lookupData.status():
235 ll = tuple(lookupData.foundItems[key]
for key
in lookupData.lookupProperties)
241 """Dispatcher for looking up metadata in a file of a given storage type
243 if storage ==
'FitsStorage':
244 PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, storage)
"""Look up metadata in a fits file.
Will try to discover the correct HDU to look in by testing if the
template has a value in brackets at the end.
If the HDU is specified but the metadata key is not discovered in
that HDU, will look in the primary HDU before giving up.
:param filepath: path to the file
:param template: template that was used to discover the file. This can
    be used to look up the correct HDU as needed.
:param lookupData: an instance of LookupData that contains the
    lookupProperties, the dataId, and the data that has been found so far.
    Will be updated with new information as discovered.
263 hdulist = astropy.io.fits.open(filepath, memmap=
True)
266 hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
267 if hduNumber
is not None and hduNumber < len(hdulist):
268 hdu = hdulist[hduNumber]
272 primaryHdu = hdulist[0]
276 for property
in lookupData.getMissingKeys():
278 if hdu
is not None and property
in hdu.header:
279 propertyValue = hdu.header[property]
281 elif primaryHdu
is not None and property
in primaryHdu.header:
282 propertyValue = primaryHdu.header[property]
283 lookupData.addFoundItems({property: propertyValue})
"""A base class for SQL-based registries

Subclasses should define the class variable `placeHolder` (the particular
placeholder to use for parameter substitution) appropriately. The
database's python module should define `paramstyle` (see PEP 249), which
would indicate what to use for a placeholder:
* paramstyle = "qmark" --> placeHolder = "?"
* paramstyle = "format" --> placeHolder = "%s"
Other `paramstyle` values are not currently supported.
297 Constructor parameters
298 ----------------------
299 conn : DBAPI connection object
309 conn : DBAPI connection object
312 Registry.__init__(self)
316 if hasattr(self,
"conn")
and self.
connconn:
317 self.
connconn.close()
320 def _lookup(self, lookupProperties, dataId, reference, checkColumns=False):
"""Perform a lookup in the registry.

This is the worker code for cls.lookup with the added option of checking
that all the columns being looked up are in the database. The classic
example here is adding a template with an hdu, where the hdu in the dataId
prevents us looking up e.g. dateObs. checkColumns results in a performance
penalty, so is only invoked when a problem in the dataId keys has been seen.

Return values are refined by the values in dataId.
Returns a list of values that match keys in lookupProperties.
e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
dataId={'visit':1}, and lookupProperties is ['filter'], and the
filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
then the return value will be [('g',)]

:param lookupProperties:
:param dataId: must be a key/value iterable. Keys must be string.
    See `SqlRegistry.lookup` for further details
:param reference: other data types that may be used to search for values.
:param checkColumns: if True, check that keys are actually in the registry and ignore them if not
:return: a list of values that match keys in lookupProperties.
343 cmd = "SELECT DISTINCT "
344 cmd +=
", ".join(lookupProperties)
345 cmd +=
" FROM " +
" NATURAL JOIN ".join(reference)
347 if dataId
is not None and len(dataId) > 0:
349 for k, v
in dataId.items():
352 self.
connconn.cursor().execute(
353 f
'SELECT {k} FROM {" NATURAL JOIN ".join(reference)} LIMIT 1')
354 except sqlite3.OperationalError:
357 if hasattr(k,
'__iter__')
and not isinstance(k, str):
359 raise RuntimeError(
"Wrong number of keys for range:%s" % (k,))
360 whereList.append(
"(%s BETWEEN %s AND %s)" % (self.
placeHolderplaceHolder, k[0], k[1]))
363 whereList.append(
"%s = %s" % (k, self.
placeHolderplaceHolder))
365 cmd +=
" WHERE " +
" AND ".join(whereList)
366 cursor = self.
connconn.cursor()
367 cursor.execute(cmd, valueList)
368 return [row
for row
in cursor.fetchall()]
370 def lookup(self, lookupProperties, reference, dataId, **kwargs):
"""Perform a lookup in the registry.

Return values are refined by the values in dataId.
Returns a list of values that match keys in lookupProperties.
e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
dataId={'visit':1}, and lookupProperties is ['filter'], and the
filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
then the return value will be [('g',)]

:param lookupProperties:
:param dataId: must be a key/value iterable. Keys must be string.
    If value is a string then will look for elements in the repository that match value for key.
    If value is a 2-item iterable then will look for elements in the repository where the value is between
    the values of value[0] and value[1].
:param reference: other data types that may be used to search for values.
:param **kwargs: nothing needed for sqlite lookup
:return: a list of values that match keys in lookupProperties.
389 if not self.
connconn:
394 lookupProperties =
sequencify(lookupProperties)
397 return self.
_lookup_lookup(lookupProperties, dataId, reference)
398 except sqlite3.OperationalError:
399 return self.
_lookup_lookup(lookupProperties, dataId, reference, checkColumns=
True)
401 def executeQuery(self, returnFields, joinClause, whereFields, range, values):
402 """Extract metadata from the registry.
403 @param returnFields (list of strings) Metadata fields to be extracted.
404 @param joinClause (list of strings) Tables
in which metadata fields
406 @param whereFields (list of tuples) First tuple element
is metadata
407 field to query; second
is the value that field
408 must have (often
'?').
409 @param range (tuple) Value, lower limit,
and upper limit
for a
410 range condition on the metadata. Any of these can
412 @param values (tuple) Tuple of values to be substituted
for '?'
413 characters
in the whereFields values
or the range
415 @return (list of tuples) All sets of field values that meet the
417 if not self.
connconn:
419 cmd =
"SELECT DISTINCT "
420 cmd +=
", ".join(returnFields)
421 cmd +=
" FROM " +
" NATURAL JOIN ".join(joinClause)
424 for k, v
in whereFields:
425 whereList.append(
"(%s = %s)" % (k, v))
426 if range
is not None:
427 whereList.append(
"(%s BETWEEN %s AND %s)" % range)
428 if len(whereList) > 0:
429 cmd +=
" WHERE " +
" AND ".join(whereList)
430 cursor = self.
connconn.cursor()
431 cursor.execute(cmd, values)
432 return [row
for row
in cursor.fetchall()]
436 """A SQLite-based registry"""
447 if os.path.exists(location):
448 conn = sqlite3.connect(location)
449 conn.text_factory = str
453 SqlRegistry.__init__(self, conn)
457 """A PostgreSQL-based registry"""
466 Path to PostgreSQL configuration file.
469 raise RuntimeError(
"Cannot use PgsqlRegistry: could not import psycopg2")
470 config = self.
readYamlreadYaml(location)
472 conn = pgsql.connect(host=config[
"host"], port=config[
"port"], database=config[
"database"],
473 user=config[
"user"], password=config[
"password"])
475 SqlRegistry.__init__(self, conn)
479 """Read YAML configuration file
481 The YAML configuration file should contain:
482 * host : host name for database connection
483 * port : port
for database connection
484 * user : user name
for database connection
485 * database : database name
488 * password : password
for database connection
490 The optional entries are set to `
None`
in the output configuration.
495 Path to PostgreSQL YAML config file.
504 loader = yaml.UnsafeLoader
505 except AttributeError:
507 with open(location)
as ff:
508 data = yaml.load(ff, Loader=loader)
509 requireKeys =
set([
"host",
"port",
"database",
"user"])
510 optionalKeys =
set([
"password"])
511 haveKeys =
set(data.keys())
512 if haveKeys - optionalKeys != requireKeys:
514 "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', "
515 "but this contains: %s" %
516 (location,
",".join(
"'%s'" % key
for key
in requireKeys),
517 ",".join(
"'%s'" % key
for key
in data.keys()))
519 for key
in optionalKeys:
527 return SqlRegistry.lookup(self, *args, **kwargs)
529 self.
connconn.rollback()
std::vector< SchemaItem< Flag > > * items
def __init__(self, location)
def lookup(self, *args, **kwargs)
def addFoundItems(self, items)
def __init__(self, lookupProperties, dataId)
def setFoundItems(self, items)
def lookupFitsMetadata(filepath, template, lookupData, dataId)
def lookup(self, lookupProperties, reference, dataId, **kwargs)
def lookupMetadata(filepath, template, lookupData, storage)
def getHduNumber(template, dataId)
def lookup(self, lookupProperties, reference, dataId, **kwargs)
def _lookup(self, lookupProperties, dataId, reference, checkColumns=False)
def executeQuery(self, returnFields, joinClause, whereFields, range, values)
def __init__(self, location)
daf::base::PropertySet * set