LSSTApplications  20.0.0
LSSTDataManagementBasePackage
registries.py
Go to the documentation of this file.
1 #
2 # LSST Data Management System
3 # Copyright 2008, 2009, 2010 LSST Corporation.
4 #
5 # This product includes software developed by the
6 # LSST Project (http://www.lsst.org/).
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the LSST License Statement and
19 # the GNU General Public License along with this program. If not,
20 # see <http://www.lsstcorp.org/LegalNotices/>.
21 #
22 
23 """This module provides registry classes for maintaining dataset metadata
24 for use by the Data Butler. Currently only a SQLite3-based registry is
25 implemented, but registries based on a text file, a policy file, a MySQL
26 (or other) relational database, and data gathered from scanning a filesystem
27 are all anticipated.
28 
29 Currently this module assumes posix access (for both PosixRegistry AND
30 SqliteRegistry). It is possible that it can be factored so that at least the
31 SqliteRegistry can be remote/not on the local filesystem. For now this module
32 is only used by CameraMapper and by PosixStorage, both of which work on the
33 local filesystem only, so this works for the time being.
34 """
35 import copy
36 from . import fsScanner, sequencify
37 import os
38 import astropy.io.fits
39 import re
40 import yaml
41 
42 try:
43  import sqlite3
44  haveSqlite3 = True
45 except ImportError:
46  try:
47  # try external pysqlite package; deprecated
48  import sqlite as sqlite3
49  haveSqlite3 = True
50  except ImportError:
51  haveSqlite3 = False
52 
53 # PostgreSQL support
54 try:
55  import psycopg2 as pgsql
56  havePgsql = True
57 except ImportError:
58  havePgsql = False
59 
60 
class Registry:
    """Common base class for the dataset-metadata registries."""

    def __init__(self):
        pass

    def __del__(self):
        # Kept as an explicit no-op so subclasses can chain to it.
        pass

    @staticmethod
    def create(location):
        """Build the registry implementation that fits ``location``.

        @param location (string) Path or URL for registry, or None if
                                 unavailable
        @return a PgsqlRegistry for locations ending in ".pgsql", a
                SqliteRegistry for locations matching ".sqlite3", a
                PosixRegistry for existing directories, or None when
                location is None or the SQLite registry has no connection.
        @throws RuntimeError if sqlite3 is required but not importable, or if
                no registry type fits the location.
        """
        if location is None:
            return

        # PostgreSQL registries are identified by their ".pgsql" config file.
        if location.endswith(".pgsql"):
            return PgsqlRegistry(location)

        # SQLite registries; the pattern is not anchored at the end, so
        # ".sqlite3" followed by further characters is accepted too.
        looksLikeSqlite = re.match(r'.*\.sqlite3', location) is not None
        if looksLikeSqlite:
            if not haveSqlite3:
                raise RuntimeError("sqlite3 registry specified (%s), but unable to import sqlite3 module" %
                                   (location,))
            sqliteRegistry = SqliteRegistry(location)
            # No connection means there is nothing usable to hand back.
            return None if sqliteRegistry.conn is None else sqliteRegistry

        # Last resort: a directory on the local filesystem.
        if not os.path.isdir(location):
            raise RuntimeError("Unable to create registry using location: " + location)
        return PosixRegistry(root=location)
106 
107 
109  """A glob-based filesystem registry"""
110 
111  def __init__(self, root):
112  Registry.__init__(self)
113  self.root = root
114 
115  @staticmethod
116  def getHduNumber(template, dataId):
117  """Looks up the HDU number for a given template+dataId.
118  :param template: template with HDU specifier (ends with brackets and an
119  identifier that can be populated by a key-value pair in dataId.
120  e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"
121  :param dataId: dictionary that hopefully has a key-value pair whose key
122  matches (has the same name) as the key specifier in the template.
123  :return: the HDU specified by the template+dataId pair, or None if the
124  HDU can not be determined.
125  """
126  # sanity check that the template at least ends with a brace.
127  if not template.endswith(']'):
128  return None
129 
130  # get the key (with formatting) out of the brances
131  hduKey = template[template.rfind('[') + 1:template.rfind(']')]
132  # extract the key name from the formatting
133  hduKey = hduKey[hduKey.rfind('(') + 1:hduKey.rfind(')')]
134 
135  if hduKey in dataId:
136  return dataId[hduKey]
137  return None
138 
139  class LookupData:
140 
141  def __init__(self, lookupProperties, dataId):
142  self.dataId = copy.copy(dataId)
143  lookupProperties = sequencify(lookupProperties)
144  self.lookupProperties = copy.copy(lookupProperties)
145  self.foundItems = {}
146  self.cachedStatus = None
147  self.neededKeys = set(lookupProperties).union(dataId.keys())
148 
149  def __repr__(self):
150  return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % \
151  (self.lookupProperties, self.dataId, self.foundItems, self.cachedStatus)
152 
153  def status(self):
154  """Query the lookup status
155 
156  :return: 'match' if the key+value pairs in dataId have been satisifed and keys in
157  lookupProperties have found and their key+value added to resolvedId
158  'incomplete' if the found data matches but not all keys in lookupProperties have been matched
159  'not match' if data in foundId does not match data in dataId
160  """
161  class NotFound:
162  """Placeholder class for item not found.
163 
164  (None might be a valid value so we don't want to use that)
165  """
166  pass
167 
168  if self.cachedStatus is not None:
169  return self.cachedStatus
170  self.cachedStatus = 'match'
171  for key in self.lookupProperties:
172  val = self.foundItems.get(key, NotFound)
173  if val is NotFound:
174  self.cachedStatus = 'incomplete'
175  break
176  for dataIdKey, dataIdValue in self.dataId.items():
177  foundValue = self.foundItems.get(dataIdKey, NotFound)
178  if foundValue is not NotFound and foundValue != dataIdValue:
179  self.cachedStatus = 'notMatch'
180  break
181  return self.cachedStatus
182 
183  def setFoundItems(self, items):
184  self.cachedStatus = None
185  self.foundItems = items
186 
187  def addFoundItems(self, items):
188  self.cachedStatus = None
189  self.foundItems.update(items)
190 
191  def getMissingKeys(self):
192  return self.neededKeys - set(self.foundItems.keys())
193 
194  def lookup(self, lookupProperties, reference, dataId, **kwargs):
195  """Perform a lookup in the registry.
196 
197  Return values are refined by the values in dataId.
198  Returns a list of values that match keys in lookupProperties.
199  e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
200  dataId={'visit':1}, and lookupProperties is ['filter'], and the
201  filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
202  then the return value will be [('g',)]
203 
204  :param lookupProperties: keys whose values will be returned.
205  :param reference: other data types that may be used to search for values.
206  :param dataId: must be an iterable. Keys must be string.
207  If value is a string then will look for elements in the repository that match value for key.
208  If value is a 2-item iterable then will look for elements in the repository are between (inclusive)
209  the first and second items in the value.
210  :param **kwargs: keys required for the posix registry to search for items. If required keys are not
211  provide will return an empty list.
212  'template': required. template parameter (typically from a policy) that can be used to look for files
213  'storage': optional. Needed to look for metadata in files. Currently supported values: 'FitsStorage'.
214  :return: a list of values that match keys in lookupProperties.
215  """
216  # required kwargs:
217  if 'template' in kwargs:
218  template = kwargs['template']
219  else:
220  return []
221  # optional kwargs:
222  storage = kwargs['storage'] if 'storage' in kwargs else None
223 
224  lookupData = PosixRegistry.LookupData(lookupProperties, dataId)
225  scanner = fsScanner.FsScanner(template)
226  allPaths = scanner.processPath(self.root)
227  retItems = [] # one item for each found file that matches
228  for path, foundProperties in allPaths.items():
229  # check for dataId keys that are not present in found properties
230  # search for those keys in metadata of file at path
231  # if present, check for matching values
232  # if not present, file can not match, do not use it.
233  lookupData.setFoundItems(foundProperties)
234  if 'incomplete' == lookupData.status():
235  PosixRegistry.lookupMetadata(os.path.join(self.root, path), template, lookupData, storage)
236  if 'match' == lookupData.status():
237  ll = tuple(lookupData.foundItems[key] for key in lookupData.lookupProperties)
238  retItems.append(ll)
239  return retItems
240 
241  @staticmethod
242  def lookupMetadata(filepath, template, lookupData, storage):
243  """Dispatcher for looking up metadata in a file of a given storage type
244  """
245  if storage == 'FitsStorage':
246  PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, storage)
247 
248  @staticmethod
249  def lookupFitsMetadata(filepath, template, lookupData, dataId):
250  """Look up metadata in a fits file.
251  Will try to discover the correct HDU to look in by testing if the
252  template has a value in brackets at the end.
253  If the HDU is specified but the metadata key is not discovered in
254  that HDU, will look in the primary HDU before giving up.
255  :param filepath: path to the file
256  :param template: template that was used to discover the file. This can
257  be used to look up the correct HDU as needed.
258  :param lookupData: an instance if LookupData that contains the
259  lookupProperties, the dataId, and the data that has been found so far.
260  Will be updated with new information as discovered.
261  :param dataId:
262  :return:
263  """
264  try:
265  hdulist = astropy.io.fits.open(filepath, memmap=True)
266  except IOError:
267  return
268  hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
269  if hduNumber is not None and hduNumber < len(hdulist):
270  hdu = hdulist[hduNumber]
271  else:
272  hdu = None
273  if len(hdulist) > 0:
274  primaryHdu = hdulist[0]
275  else:
276  primaryHdu = None
277 
278  for property in lookupData.getMissingKeys():
279  propertyValue = None
280  if hdu is not None and property in hdu.header:
281  propertyValue = hdu.header[property]
282  # if the value is not in the indicated HDU, try the primary HDU:
283  elif primaryHdu is not None and property in primaryHdu.header:
284  propertyValue = primaryHdu.header[property]
285  lookupData.addFoundItems({property: propertyValue})
286 
287 
289  """A base class for SQL-based registries
290 
291  Subclasses should define the class variable `placeHolder` (the particular
292  placeholder to use for parameter substitution) appropriately. The
293  database's python module should define `paramstyle` (see PEP 249), which
294  would indicate what to use for a placeholder:
295  * paramstyle = "qmark" --> placeHolder = "?"
296  * paramstyle = "format" --> placeHolder = "%s"
297  Other `paramstyle` values are not currently supported.
298 
299  Constructor parameters
300  ----------------------
301  conn : DBAPI connection object
302  Connection object
303  """
304  placeHolder = "?" # Placeholder for parameter substitution
305 
306  def __init__(self, conn):
307  """Constructor.
308 
309  Parameters
310  ----------
311  conn : DBAPI connection object
312  Connection object
313  """
314  Registry.__init__(self)
315  self.conn = conn
316 
317  def __del__(self):
318  if hasattr(self, "conn") and self.conn:
319  self.conn.close()
320  super().__del__()
321 
322  def lookup(self, lookupProperties, reference, dataId, **kwargs):
323  """Perform a lookup in the registry.
324 
325  Return values are refined by the values in dataId.
326  Returns a list of values that match keys in lookupProperties.
327  e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
328  dataId={'visit':1}, and lookupProperties is ['filter'], and the
329  filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
330  then the return value will be [('g',)]
331 
332  :param lookupProperties:
333  :param dataId: must be an iterable. Keys must be string.
334  If key is a string then will look for elements in the repository that match value for key.
335  If key is a 2-item iterable then will look for elements in the repository where the value is between
336  the values of key[0] and key[1].
337  :param reference: other data types that may be used to search for values.
338  :param **kwargs: nothing needed for sqlite lookup
339  :return: a list of values that match keys in lookupProperties.
340  """
341  if not self.conn:
342  return None
343 
344  # input variable sanitization:
345  reference = sequencify(reference)
346  lookupProperties = sequencify(lookupProperties)
347 
348  cmd = "SELECT DISTINCT "
349  cmd += ", ".join(lookupProperties)
350  cmd += " FROM " + " NATURAL JOIN ".join(reference)
351  valueList = []
352  if dataId is not None and len(dataId) > 0:
353  whereList = []
354  for k, v in dataId.items():
355  if hasattr(k, '__iter__') and not isinstance(k, str):
356  if len(k) != 2:
357  raise RuntimeError("Wrong number of keys for range:%s" % (k,))
358  whereList.append("(%s BETWEEN %s AND %s)" % (self.placeHolder, k[0], k[1]))
359  valueList.append(v)
360  else:
361  whereList.append("%s = %s" % (k, self.placeHolder))
362  valueList.append(v)
363  cmd += " WHERE " + " AND ".join(whereList)
364  cursor = self.conn.cursor()
365  cursor.execute(cmd, valueList)
366  return [row for row in cursor.fetchall()]
367 
368  def executeQuery(self, returnFields, joinClause, whereFields, range, values):
369  """Extract metadata from the registry.
370  @param returnFields (list of strings) Metadata fields to be extracted.
371  @param joinClause (list of strings) Tables in which metadata fields
372  are located.
373  @param whereFields (list of tuples) First tuple element is metadata
374  field to query; second is the value that field
375  must have (often '?').
376  @param range (tuple) Value, lower limit, and upper limit for a
377  range condition on the metadata. Any of these can
378  be metadata fields.
379  @param values (tuple) Tuple of values to be substituted for '?'
380  characters in the whereFields values or the range
381  values.
382  @return (list of tuples) All sets of field values that meet the
383  criteria"""
384  if not self.conn:
385  return None
386  cmd = "SELECT DISTINCT "
387  cmd += ", ".join(returnFields)
388  cmd += " FROM " + " NATURAL JOIN ".join(joinClause)
389  whereList = []
390  if whereFields:
391  for k, v in whereFields:
392  whereList.append("(%s = %s)" % (k, v))
393  if range is not None:
394  whereList.append("(%s BETWEEN %s AND %s)" % range)
395  if len(whereList) > 0:
396  cmd += " WHERE " + " AND ".join(whereList)
397  cursor = self.conn.cursor()
398  cursor.execute(cmd, values)
399  return [row for row in cursor.fetchall()]
400 
401 
403  """A SQLite-based registry"""
404  placeHolder = "?" # Placeholder for parameter substitution
405 
406  def __init__(self, location):
407  """Constructor
408 
409  Parameters
410  ----------
411  location : `str`
412  Path to SQLite3 file
413  """
414  if os.path.exists(location):
415  conn = sqlite3.connect(location)
416  conn.text_factory = str
417  self.root = location
418  else:
419  conn = None
420  SqlRegistry.__init__(self, conn)
421 
422 
424  """A PostgreSQL-based registry"""
425  placeHolder = "%s"
426 
427  def __init__(self, location):
428  """Constructor
429 
430  Parameters
431  ----------
432  location : `str`
433  Path to PostgreSQL configuration file.
434  """
435  if not havePgsql:
436  raise RuntimeError("Cannot use PgsqlRegistry: could not import psycopg2")
437  config = self.readYaml(location)
438  self._config = config
439  conn = pgsql.connect(host=config["host"], port=config["port"], database=config["database"],
440  user=config["user"], password=config["password"])
441  self.root = location
442  SqlRegistry.__init__(self, conn)
443 
444  @staticmethod
445  def readYaml(location):
446  """Read YAML configuration file
447 
448  The YAML configuration file should contain:
449  * host : host name for database connection
450  * port : port for database connection
451  * user : user name for database connection
452  * database : database name
453 
454  It may also contain:
455  * password : password for database connection
456 
457  The optional entries are set to `None` in the output configuration.
458 
459  Parameters
460  ----------
461  location : `str`
462  Path to PostgreSQL YAML config file.
463 
464  Returns
465  -------
466  config : `dict`
467  Configuration
468  """
469  try:
470  # PyYAML >=5.1 prefers a different loader
471  loader = yaml.FullLoader
472  except AttributeError:
473  loader = yaml.Loader
474  with open(location) as ff:
475  data = yaml.load(ff, Loader=loader)
476  requireKeys = set(["host", "port", "database", "user"])
477  optionalKeys = set(["password"])
478  haveKeys = set(data.keys())
479  if haveKeys - optionalKeys != requireKeys:
480  raise RuntimeError(
481  "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', "
482  "but this contains: %s" %
483  (location, ",".join("'%s'" % key for key in requireKeys),
484  ",".join("'%s'" % key for key in data.keys()))
485  )
486  for key in optionalKeys:
487  if key not in data:
488  data[key] = None
489 
490  return data
491 
492  def lookup(self, *args, **kwargs):
493  try:
494  return SqlRegistry.lookup(self, *args, **kwargs)
495  except Exception:
496  self.conn.rollback()
497  raise
lsst::daf::persistence.registries.SqlRegistry.lookup
def lookup(self, lookupProperties, reference, dataId, **kwargs)
Definition: registries.py:322
lsst::daf::persistence.registries.SqliteRegistry
Definition: registries.py:402
lsst::daf::persistence.registries.PgsqlRegistry
Definition: registries.py:423
lsst::daf::persistence.registries.PosixRegistry.root
root
Definition: registries.py:113
lsst::daf::persistence.registries.PosixRegistry.__init__
def __init__(self, root)
Definition: registries.py:111
lsst::daf::persistence.registries.PgsqlRegistry.root
root
Definition: registries.py:441
lsst::daf::persistence.registries.PosixRegistry.LookupData.__repr__
def __repr__(self)
Definition: registries.py:149
lsst::daf::persistence.registries.PosixRegistry.LookupData.lookupProperties
lookupProperties
Definition: registries.py:144
lsst::daf::persistence.registries.PosixRegistry.LookupData.foundItems
foundItems
Definition: registries.py:145
lsst::daf::persistence.registries.PosixRegistry.LookupData.getMissingKeys
def getMissingKeys(self)
Definition: registries.py:191
lsst::daf::persistence.registries.PosixRegistry.lookup
def lookup(self, lookupProperties, reference, dataId, **kwargs)
Definition: registries.py:194
lsst::daf::persistence.registries.PosixRegistry.LookupData.__init__
def __init__(self, lookupProperties, dataId)
Definition: registries.py:141
astshim.keyMap.keyMapContinued.keys
def keys(self)
Definition: keyMapContinued.py:6
lsst::daf::persistence.registries.Registry.__init__
def __init__(self)
Definition: registries.py:64
lsst::daf::persistence.registries.SqlRegistry.placeHolder
string placeHolder
Definition: registries.py:304
lsst::daf::persistence.registries.PosixRegistry.lookupFitsMetadata
def lookupFitsMetadata(filepath, template, lookupData, dataId)
Definition: registries.py:249
lsst::daf::persistence.registries.PosixRegistry.getHduNumber
def getHduNumber(template, dataId)
Definition: registries.py:116
lsst::daf::persistence.registries.SqliteRegistry.root
root
Definition: registries.py:417
lsst::daf::persistence.registries.PgsqlRegistry.readYaml
def readYaml(location)
Definition: registries.py:445
lsst::daf::persistence.registries.PgsqlRegistry._config
_config
Definition: registries.py:438
lsst::daf::persistence.registries.SqlRegistry
Definition: registries.py:288
lsst::daf::persistence.utils.sequencify
def sequencify(x)
Definition: utils.py:67
lsst::daf::persistence.registries.PosixRegistry.LookupData.neededKeys
neededKeys
Definition: registries.py:147
lsst::daf::persistence.registries.PgsqlRegistry.__init__
def __init__(self, location)
Definition: registries.py:427
lsst::daf::persistence.registries.Registry
Definition: registries.py:61
lsst::daf::persistence.registries.SqliteRegistry.__init__
def __init__(self, location)
Definition: registries.py:406
lsst::daf::persistence.registries.SqlRegistry.executeQuery
def executeQuery(self, returnFields, joinClause, whereFields, range, values)
Definition: registries.py:368
lsst::daf::persistence.registries.PgsqlRegistry.lookup
def lookup(self, *args, **kwargs)
Definition: registries.py:492
items
std::vector< SchemaItem< Flag > > * items
Definition: BaseColumnView.cc:142
lsst::daf::persistence.registries.PosixRegistry
Definition: registries.py:108
lsst::daf::persistence.registries.PosixRegistry.LookupData.cachedStatus
cachedStatus
Definition: registries.py:146
lsst::daf::persistence.registries.PosixRegistry.LookupData.status
def status(self)
Definition: registries.py:153
lsst::daf::persistence.registries.SqlRegistry.__del__
def __del__(self)
Definition: registries.py:317
lsst::daf::persistence.registries.SqlRegistry.conn
conn
Definition: registries.py:315
lsst::daf::persistence.fsScanner.FsScanner
Definition: fsScanner.py:32
lsst::daf::persistence.registries.Registry.create
def create(location)
Definition: registries.py:71
lsst::daf::persistence.registries.PosixRegistry.LookupData.dataId
dataId
Definition: registries.py:142
lsst::daf::persistence.registries.PosixRegistry.LookupData.setFoundItems
def setFoundItems(self, items)
Definition: registries.py:183
lsst::daf::persistence.registries.PosixRegistry.lookupMetadata
def lookupMetadata(filepath, template, lookupData, storage)
Definition: registries.py:242
lsst::daf::persistence.registries.PosixRegistry.LookupData
Definition: registries.py:139
lsst::daf::persistence.registries.Registry.__del__
def __del__(self)
Definition: registries.py:67
set
daf::base::PropertySet * set
Definition: fits.cc:912
lsst::daf::persistence.registries.SqlRegistry.__init__
def __init__(self, conn)
Definition: registries.py:306
lsst::daf::persistence.registries.PosixRegistry.LookupData.addFoundItems
def addFoundItems(self, items)
Definition: registries.py:187