23 """This module provides registry classes for maintaining dataset metadata
24 for use by the Data Butler. Currently only a SQLite3-based registry is
25 implemented, but registries based on a text file, a policy file, a MySQL
26 (or other) relational database, and data gathered from scanning a filesystem
29 Currently this module assumes posix access (for both PosixRegistry AND
30 SqliteRegistry). It is possible that it can be factored so that at least the
31 SqliteRegistry can be remote/not on the local filesystem. For now this module
32 is only used by CameraMapper and by PosixStorage, both of which work on the
33 local filesystem only, so this works for the time being.
36 from .
import fsScanner, sequencify
38 import astropy.io.fits
48 import sqlite
as sqlite3
55 import psycopg2
as pgsql
62 """The registry base class."""
72 """Create a registry object of an appropriate type.
73 @param location (string) Path or URL for registry, or None if
82 if location.endswith(
".pgsql"):
86 if re.match(
r'.*\.sqlite3', location):
88 raise RuntimeError(
"sqlite3 registry specified (%s), but unable to import sqlite3 module" %
91 if registry.conn
is None:
100 if os.path.isdir(location):
103 raise RuntimeError(
"Unable to create registry using location: " + location)
107 """A glob-based filesystem registry"""
110 Registry.__init__(self)
115 """Looks up the HDU number for a given template+dataId.
116 :param template: template with HDU specifier (ends with brackets and an
117 identifier that can be populated by a key-value pair in dataId),
118 e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"
119 :param dataId: dictionary that hopefully has a key-value pair whose key
120 matches (has the same name as) the key specifier in the template.
121 :return: the HDU specified by the template+dataId pair, or None if the
122 HDU cannot be determined.
125 if not template.endswith(
']'):
129 hduKey = template[template.rfind(
'[') + 1:template.rfind(
']')]
131 hduKey = hduKey[hduKey.rfind(
'(') + 1:hduKey.rfind(
')')]
134 return dataId[hduKey]
141 lookupProperties =
sequencify(lookupProperties)
148 return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % \
152 """Query the lookup status
154 :return: 'match' if the key+value pairs in dataId have been satisfied and all keys in
155 lookupProperties have been found and their key+value pairs added to foundItems
156 'incomplete' if the found data matches but not all keys in lookupProperties have been matched
157 'notMatch' if data in foundItems does not match data in dataId
160 """Placeholder class for item not found.
162 (None might be a valid value so we don't want to use that)
166 if self.cachedStatus
is not None:
167 return self.cachedStatus
168 self.cachedStatus =
'match'
169 for key
in self.lookupProperties:
170 val = self.foundItems.get(key, NotFound)
172 self.cachedStatus =
'incomplete'
174 for dataIdKey, dataIdValue
in self.dataId.
items():
175 foundValue = self.foundItems.get(dataIdKey, NotFound)
176 if foundValue
is not NotFound
and foundValue != dataIdValue:
177 self.cachedStatus =
'notMatch'
179 return self.cachedStatus
192 def lookup(self, lookupProperties, reference, dataId, **kwargs):
193 """Perform a lookup in the registry.
195 Return values are refined by the values in dataId.
196 Returns a list of values that match keys in lookupProperties.
197 e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
198 dataId={'visit':1}, and lookupProperties is ['filter'], and the
199 filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
200 then the return value will be [('g',)]
202 :param lookupProperties: keys whose values will be returned.
203 :param reference: other data types that may be used to search for values.
204 :param dataId: must be an iterable. Keys must be string.
205 If value is a string then will look for elements in the repository that match value for key.
206 If value is a 2-item iterable then will look for elements in the repository that are between (inclusive)
207 the first and second items in the value.
208 :param **kwargs: keys required for the posix registry to search for items. If required keys are not
209 provided, an empty list is returned.
210 'template': required. template parameter (typically from a policy) that can be used to look for files
211 'storage': optional. Needed to look for metadata in files. Currently supported values: 'FitsStorage'.
212 :return: a list of values that match keys in lookupProperties.
215 if 'template' in kwargs:
216 template = kwargs[
'template']
220 storage = kwargs[
'storage']
if 'storage' in kwargs
else None
224 allPaths = scanner.processPath(self.
root)
226 for path, foundProperties
in allPaths.items():
231 lookupData.setFoundItems(foundProperties)
232 if 'incomplete' == lookupData.status():
233 PosixRegistry.lookupMetadata(os.path.join(self.
root, path), template, lookupData, storage)
234 if 'match' == lookupData.status():
235 ll = tuple(lookupData.foundItems[key]
for key
in lookupData.lookupProperties)
241 """Dispatcher for looking up metadata in a file of a given storage type
243 if storage ==
'FitsStorage':
244 PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, storage)
248 """Look up metadata in a fits file.
249 Will try to discover the correct HDU to look in by testing if the
250 template has a value in brackets at the end.
251 If the HDU is specified but the metadata key is not discovered in
252 that HDU, will look in the primary HDU before giving up.
253 :param filepath: path to the file
254 :param template: template that was used to discover the file. This can
255 be used to look up the correct HDU as needed.
256 :param lookupData: an instance of LookupData that contains the
257 lookupProperties, the dataId, and the data that has been found so far.
258 Will be updated with new information as discovered.
263 hdulist = astropy.io.fits.open(filepath, memmap=
True)
266 hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
267 if hduNumber
is not None and hduNumber < len(hdulist):
268 hdu = hdulist[hduNumber]
272 primaryHdu = hdulist[0]
276 for property
in lookupData.getMissingKeys():
278 if hdu
is not None and property
in hdu.header:
279 propertyValue = hdu.header[property]
281 elif primaryHdu
is not None and property
in primaryHdu.header:
282 propertyValue = primaryHdu.header[property]
283 lookupData.addFoundItems({property: propertyValue})
287 """A base class for SQL-based registries
289 Subclasses should define the class variable `placeHolder` (the particular
290 placeholder to use for parameter substitution) appropriately. The
291 database's python module should define `paramstyle` (see PEP 249), which
292 would indicate what to use for a placeholder:
293 * paramstyle = "qmark" --> placeHolder = "?"
294 * paramstyle = "format" --> placeHolder = "%s"
295 Other `paramstyle` values are not currently supported.
297 Constructor parameters
298 ----------------------
299 conn : DBAPI connection object
309 conn : DBAPI connection object
312 Registry.__init__(self)
316 if hasattr(self,
"conn")
and self.
conn:
def lookup(self, lookupProperties, reference, dataId, **kwargs):
    """Perform a lookup in the registry.

    Builds and executes a single ``SELECT DISTINCT`` statement.  The
    selected columns are ``lookupProperties``, the tables are the entries
    of ``reference`` combined with ``NATURAL JOIN``, and the rows are
    refined by the entries of ``dataId``.

    e.g. with lookupProperties=['filter'], reference=['raw'] and
    dataId={'visit': 1} the statement is
    ``SELECT DISTINCT filter FROM raw WHERE visit = <placeHolder>`` with
    ``1`` bound to the placeholder.

    :param lookupProperties: column name, or sequence of column names,
        whose values will be returned.
    :param reference: table name, or sequence of table names, to search.
    :param dataId: a dict (or None).  Each entry adds one condition:
        If the key is a string, rows must satisfy ``key = value``.
        If the key is a 2-item iterable, rows must satisfy
        ``value BETWEEN key[0] AND key[1]``, i.e. the value must lie
        between the columns named by key[0] and key[1].
        Column names are interpolated into the SQL text; only the values
        are passed as bound parameters.
    :param **kwargs: nothing needed for SQL lookup; accepted and ignored.
    :return: a list of row tuples, one element per entry in
        lookupProperties, or None if the registry has no connection.
    :raises RuntimeError: if an iterable key in dataId does not have
        exactly two elements.
    """
    if not self.conn:
        return None

    # Input sanitization: both arguments are passed to str.join below, so a
    # bare string must first become a one-element sequence.
    # NOTE(review): assumes sequencify wraps a scalar in a list and leaves
    # sequences untouched -- confirm against its definition.
    reference = sequencify(reference)
    lookupProperties = sequencify(lookupProperties)

    cmd = "SELECT DISTINCT " + ", ".join(lookupProperties)
    cmd += " FROM " + " NATURAL JOIN ".join(reference)

    valueList = []  # bound parameters, in the same order as the placeholders
    if dataId:
        whereList = []
        for k, v in dataId.items():
            if hasattr(k, '__iter__') and not isinstance(k, str):
                # Range condition: the key names the lower/upper columns.
                if len(k) != 2:
                    raise RuntimeError("Wrong number of keys for range:%s" % (k,))
                whereList.append("(%s BETWEEN %s AND %s)" % (self.placeHolder, k[0], k[1]))
            else:
                whereList.append("%s = %s" % (k, self.placeHolder))
            valueList.append(v)
        cmd += " WHERE " + " AND ".join(whereList)

    cursor = self.conn.cursor()
    cursor.execute(cmd, valueList)
    return list(cursor.fetchall())
def executeQuery(self, returnFields, joinClause, whereFields, range, values):
    """Extract metadata from the registry.

    Builds a ``SELECT DISTINCT`` statement from the arguments and executes
    it on ``self.conn``.  Field names, table names and the contents of
    ``whereFields``/``range`` are interpolated directly into the SQL text;
    only ``values`` is passed to the driver as bound parameters.

    @param returnFields (list of strings) Metadata fields to be extracted.
    @param joinClause   (list of strings) Tables in which metadata fields
                        are located; combined with NATURAL JOIN.
    @param whereFields  (list of tuples) First tuple element is metadata
                        field to query; second is the value that field
                        must have (often '?').  May be None or empty, in
                        which case no equality conditions are added.
    @param range        (sequence of 3) Value, lower limit, and upper limit
                        for a BETWEEN condition on the metadata, or None
                        for no range condition.
    @param values       (tuple) Tuple of values to be substituted for '?'
                        characters in the whereFields values or the range
                        values.
    @return (list of tuples) All sets of field values that meet the
            criteria, or None if the registry has no connection.
    """
    if not self.conn:
        return None

    cmd = "SELECT DISTINCT " + ", ".join(returnFields)
    cmd += " FROM " + " NATURAL JOIN ".join(joinClause)

    # `whereFields or ()` lets callers pass None when there is nothing to match.
    whereList = ["(%s = %s)" % (k, v) for k, v in whereFields or ()]
    if range is not None:
        # tuple() so that a list (or other 3-item sequence) formats correctly;
        # %-formatting treats a non-tuple operand as a single argument.
        whereList.append("(%s BETWEEN %s AND %s)" % tuple(range))
    if whereList:
        cmd += " WHERE " + " AND ".join(whereList)

    cursor = self.conn.cursor()
    cursor.execute(cmd, values)
    return list(cursor.fetchall())
401 """A SQLite-based registry"""
412 if os.path.exists(location):
413 conn = sqlite3.connect(location)
414 conn.text_factory = str
418 SqlRegistry.__init__(self, conn)
422 """A PostgreSQL-based registry"""
431 Path to PostgreSQL configuration file.
434 raise RuntimeError(
"Cannot use PgsqlRegistry: could not import psycopg2")
437 conn = pgsql.connect(host=config[
"host"], port=config[
"port"], database=config[
"database"],
438 user=config[
"user"], password=config[
"password"])
440 SqlRegistry.__init__(self, conn)
444 """Read YAML configuration file
446 The YAML configuration file should contain:
447 * host : host name for database connection
448 * port : port for database connection
449 * user : user name for database connection
450 * database : database name
453 * password : password for database connection
455 The optional entries are set to `None` in the output configuration.
460 Path to PostgreSQL YAML config file.
469 loader = yaml.FullLoader
470 except AttributeError:
472 with open(location)
as ff:
473 data = yaml.load(ff, Loader=loader)
474 requireKeys =
set([
"host",
"port",
"database",
"user"])
475 optionalKeys =
set([
"password"])
476 haveKeys =
set(data.keys())
477 if haveKeys - optionalKeys != requireKeys:
479 "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', "
480 "but this contains: %s" %
481 (location,
",".join(
"'%s'" % key
for key
in requireKeys),
482 ",".join(
"'%s'" % key
for key
in data.keys()))
484 for key
in optionalKeys:
492 return SqlRegistry.lookup(self, *args, **kwargs)