LSSTApplications  18.1.0
LSSTDataManagementBasePackage
registries.py
Go to the documentation of this file.
1 #
2 # LSST Data Management System
3 # Copyright 2008, 2009, 2010 LSST Corporation.
4 #
5 # This product includes software developed by the
6 # LSST Project (http://www.lsst.org/).
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the LSST License Statement and
19 # the GNU General Public License along with this program. If not,
20 # see <http://www.lsstcorp.org/LegalNotices/>.
21 #
22 
23 """This module provides registry classes for maintaining dataset metadata
24 for use by the Data Butler. Currently only a SQLite3-based registry is
25 implemented, but registries based on a text file, a policy file, a MySQL
26 (or other) relational database, and data gathered from scanning a filesystem
27 are all anticipated.
28 
29 Currently this module assumes posix access (for both PosixRegistry AND
30 SqliteRegistry). It is possible that it can be factored so that at least the
31 SqliteRegistry can be remote/not on the local filesystem. For now this module
32 is only used by CameraMapper and by PosixStorage, both of which work on the
33 local filesystem only, so this works for the time being.
34 """
35 from __future__ import absolute_import
36 from past.builtins import basestring
37 from builtins import super
38 
39 import copy
40 from . import fsScanner, sequencify
41 import os
42 import astropy.io.fits
43 import re
44 import yaml
45 
46 try:
47  import sqlite3
48  haveSqlite3 = True
49 except ImportError:
50  try:
51  # try external pysqlite package; deprecated
52  import sqlite as sqlite3
53  haveSqlite3 = True
54  except ImportError:
55  haveSqlite3 = False
56 
57 # PostgreSQL support
58 try:
59  import psycopg2 as pgsql
60  havePgsql = True
61 except ImportError:
62  havePgsql = False
63 
64 
66  """The registry base class."""
67 
def __init__(self):
    """Construct the registry base object; it has no state to set up."""
70 
def __del__(self):
    """Finalizer for the registry base object; it does nothing."""
73 
74  @staticmethod
75  def create(location):
76  """Create a registry object of an appropriate type.
77  @param location (string) Path or URL for registry, or None if
78  unavailable"""
79 
80  if location is None:
81  return
82 
83  # if re.match(r'.*\.registry', location):
84  # return FileRegistry(location)
85  # if re.match(r'.*\.paf', location):
86  # return CalibRegistry(location)
87 
88  if location.endswith(".pgsql"):
89  return PgsqlRegistry(location)
90 
91  # look for an sqlite3 registry
92  if re.match(r'.*\.sqlite3', location):
93  if not haveSqlite3:
94  raise RuntimeError("sqlite3 registry specified (%s), but unable to import sqlite3 module" %
95  (location,))
96  registry = SqliteRegistry(location)
97  if registry.conn is None:
98  return None
99  return registry
100 
101  # if re.match(r'mysql:', location):
102  # return DbRegistry(location)
103  # return FsRegistry(location)
104 
105  # next try to create a PosixRegistry
106  if os.path.isdir(location):
107  return PosixRegistry(root=location)
108 
109  raise RuntimeError("Unable to create registry using location: " + location)
110 
111 
113  """A glob-based filesystem registry"""
114 
def __init__(self, root):
    """Construct a filesystem registry.

    :param root: location stored as ``self.root``; lookups scan beneath it.
    """
    self.root = root
    Registry.__init__(self)
118 
119  @staticmethod
120  def getHduNumber(template, dataId):
121  """Looks up the HDU number for a given template+dataId.
122  :param template: template with HDU specifier (ends with brackets and an
123  identifier that can be populated by a key-value pair in dataId.
124  e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"
125  :param dataId: dictionary that hopefully has a key-value pair whose key
126  matches (has the same name) as the key specifier in the template.
127  :return: the HDU specified by the template+dataId pair, or None if the
128  HDU can not be determined.
129  """
130  # sanity check that the template at least ends with a brace.
131  if not template.endswith(']'):
132  return None
133 
134  # get the key (with formatting) out of the brances
135  hduKey = template[template.rfind('[') + 1:template.rfind(']')]
136  # extract the key name from the formatting
137  hduKey = hduKey[hduKey.rfind('(') + 1:hduKey.rfind(')')]
138 
139  if hduKey in dataId:
140  return dataId[hduKey]
141  return None
142 
144 
def __init__(self, lookupProperties, dataId):
    """Set up the bookkeeping for one lookup.

    :param lookupProperties: key or keys whose values are wanted; passed
    through sequencify before use.
    :param dataId: mapping of known key-value pairs that found items are
    compared against.
    """
    properties = sequencify(lookupProperties)
    # Shallow copies, so rebinding these attributes leaves the caller's objects alone.
    self.lookupProperties = copy.copy(properties)
    self.dataId = copy.copy(dataId)
    # Every key this lookup cares about: the wanted ones plus the dataId ones.
    self.neededKeys = set(properties).union(dataId.keys())
    self.foundItems = {}
    # None means the status has to be (re)computed on the next status() call.
    self.cachedStatus = None
152 
def __repr__(self):
    """Return a one-line summary of the lookup state."""
    fields = (self.lookupProperties, self.dataId, self.foundItems, self.cachedStatus)
    return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % fields
156 
def status(self):
    """Query the lookup status.

    The result is cached in ``self.cachedStatus`` and returned as-is on
    later calls until the found items are changed.

    :return: 'match' if every key in lookupProperties is present in
    foundItems and no found value contradicts dataId.
    'incomplete' if nothing contradicts dataId but at least one key in
    lookupProperties has not been found.
    'notMatch' if a found value differs from the dataId value for the
    same key; this takes precedence over 'incomplete'.
    """
    if self.cachedStatus is not None:
        return self.cachedStatus

    found = self.foundItems
    # Membership is tested rather than the value, because None can be a valid found value.
    if all(key in found for key in self.lookupProperties):
        result = 'match'
    else:
        result = 'incomplete'

    # dataId keys that have not been found yet cannot contradict anything.
    for key, wanted in self.dataId.items():
        if key in found and found[key] != wanted:
            result = 'notMatch'
            break

    self.cachedStatus = result
    return result
186 
def setFoundItems(self, items):
    """Replace the found items with ``items`` and drop the cached status.

    ``items`` is stored by reference, not copied.
    """
    self.foundItems = items
    self.cachedStatus = None
190 
def addFoundItems(self, items):
    """Merge ``items`` into the found items and drop the cached status.

    Existing keys are overwritten by the values in ``items``.
    """
    self.foundItems.update(items)
    self.cachedStatus = None
194 
def getMissingKeys(self):
    """Return the set of needed keys that are not yet in the found items."""
    return self.neededKeys.difference(self.foundItems)
197 
def lookup(self, lookupProperties, reference, dataId, **kwargs):
    """Perform a lookup in the registry.

    Files under self.root are discovered with the template, and each
    file's properties are checked against dataId. For every file that
    matches, one tuple of the lookupProperties values is returned.
    e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
    dataId={'visit':1}, and lookupProperties is ['filter'], and the
    filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
    then the return value will be [('g',)]

    :param lookupProperties: keys whose values will be returned.
    :param reference: other data types that may be used to search for values.
                      Not used by this method.
    :param dataId: mapping of key to required value; a file whose found value
                   for a key differs from the dataId value is rejected.
    :param **kwargs: keys required for the posix registry to search for items.
        'template': required. template parameter (typically from a policy) that can be used to look for
                    files. If it is not provided an empty list is returned.
        'storage': optional. Needed to look for metadata in files. Currently supported values:
                   'FitsStorage'.
    :return: a list of tuples, one per matching file, holding the values for the keys in lookupProperties.
    """
    # required kwargs:
    if 'template' not in kwargs:
        return []
    template = kwargs['template']
    # optional kwargs:
    storage = kwargs.get('storage')

    lookupData = PosixRegistry.LookupData(lookupProperties, dataId)
    foundPaths = fsScanner.FsScanner(template).processPath(self.root)
    matches = []  # one entry for each found file that matches
    for path, pathProperties in foundPaths.items():
        # Start from what the path itself revealed. If wanted keys are still
        # missing, try the file's metadata. Only a file that ends up as a full
        # match contributes to the result.
        lookupData.setFoundItems(pathProperties)
        if lookupData.status() == 'incomplete':
            PosixRegistry.lookupMetadata(os.path.join(self.root, path), template, lookupData, storage)
        if lookupData.status() == 'match':
            matches.append(tuple(lookupData.foundItems[key] for key in lookupData.lookupProperties))
    return matches
244 
@staticmethod
def lookupMetadata(filepath, template, lookupData, storage):
    """Dispatcher for looking up metadata in a file of a given storage type.

    Only 'FitsStorage' is handled; any other storage value is ignored and
    lookupData is left unchanged.

    :param filepath: path to the file to inspect.
    :param template: template that was used to discover the file.
    :param lookupData: LookupData instance, updated in place with whatever
    metadata is found.
    :param storage: name of the storage type of the file.
    """
    if storage == 'FitsStorage':
        # The FITS reader resolves the HDU from the template and the data id,
        # so it is given the data id held by lookupData, not the storage name.
        PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, lookupData.dataId)
251 
@staticmethod
def lookupFitsMetadata(filepath, template, lookupData, dataId):
    """Look up metadata in a fits file.

    Will try to discover the correct HDU to look in by testing if the
    template has a value in brackets at the end.
    If the HDU is specified but the metadata key is not discovered in
    that HDU, will look in the primary HDU before giving up.

    :param filepath: path to the file. If it cannot be opened (IOError)
    the method returns without changing lookupData.
    :param template: template that was used to discover the file. This can
    be used to look up the correct HDU as needed.
    :param lookupData: an instance of LookupData that contains the
    lookupProperties, the dataId, and the data that has been found so far.
    Will be updated with new information as discovered; a key found in
    neither HDU is recorded with the value None.
    :param dataId: key-value pairs used together with the template to
    determine the HDU number.
    :return: None
    """
    try:
        hdulist = astropy.io.fits.open(filepath, memmap=True)
    except IOError:
        return

    try:
        hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
        try:
            useHdu = hduNumber is not None and hduNumber < len(hdulist)
        except TypeError:
            # A value that cannot be compared with an int does not identify
            # an HDU by number; fall back to the primary HDU only.
            useHdu = False
        hdu = hdulist[hduNumber] if useHdu else None
        primaryHdu = hdulist[0] if len(hdulist) > 0 else None

        for key in lookupData.getMissingKeys():
            value = None
            if hdu is not None and key in hdu.header:
                value = hdu.header[key]
            # if the value is not in the indicated HDU, try the primary HDU:
            elif primaryHdu is not None and key in primaryHdu.header:
                value = primaryHdu.header[key]
            lookupData.addFoundItems({key: value})
    finally:
        # Release the file handle even if a header lookup raises.
        hdulist.close()
290 
291 
293  """A base class for SQL-based registries
294 
295  Subclasses should define the class variable `placeHolder` (the particular
296  placeholder to use for parameter substitution) appropriately. The
297  database's python module should define `paramstyle` (see PEP 249), which
298  would indicate what to use for a placeholder:
299  * paramstyle = "qmark" --> placeHolder = "?"
300  * paramstyle = "format" --> placeHolder = "%s"
301  Other `paramstyle` values are not currently supported.
302 
303  Constructor parameters
304  ----------------------
305  conn : DBAPI connection object
306  Connection object
307  """
308  placeHolder = "?" # Placeholder for parameter substitution
309 
def __init__(self, conn):
    """Constructor.

    Parameters
    ----------
    conn : DBAPI connection object
        Connection object; stored as ``self.conn``.
    """
    self.conn = conn
    Registry.__init__(self)
320 
def __del__(self):
    """Close the database connection, if there is one, then run the parent finalizer."""
    # The attribute is looked up defensively, so a missing or empty
    # ``conn`` is simply skipped.
    conn = getattr(self, "conn", None)
    if conn:
        conn.close()
    super().__del__()
325 
def lookup(self, lookupProperties, reference, dataId, **kwargs):
    """Perform a lookup in the registry.

    Builds and runs a ``SELECT DISTINCT`` of the lookupProperties columns
    over the NATURAL JOIN of the reference tables, restricted by dataId.

    :param lookupProperties: column name or names whose values are returned.
    :param reference: table name or names to query; several tables are
    combined with NATURAL JOIN.
    :param dataId: mapping that restricts the result, or None/empty for no
    restriction.
    If a key is a string, rows are kept where that column equals the value.
    If a key is a 2-item iterable, rows are kept where the value lies
    between the columns named key[0] and key[1].
    :param **kwargs: not used by this method.
    :return: a list of rows with the values for the keys in lookupProperties,
    or None if there is no database connection.
    :raises RuntimeError: if an iterable key does not have exactly 2 items.
    """
    if not self.conn:
        return None

    # Accept a single name as well as a sequence of names.
    tables = sequencify(reference)
    columns = sequencify(lookupProperties)

    # Column and table names are interpolated into the statement text;
    # only the dataId values are bound as parameters.
    query = "SELECT DISTINCT " + ", ".join(columns) + " FROM " + " NATURAL JOIN ".join(tables)
    params = []
    if dataId is not None and len(dataId) > 0:
        conditions = []
        for key, value in dataId.items():
            isRange = hasattr(key, '__iter__') and not isinstance(key, basestring)
            if isRange:
                if len(key) != 2:
                    raise RuntimeError("Wrong number of keys for range:%s" % (key,))
                conditions.append("(%s BETWEEN %s AND %s)" % (self.placeHolder, key[0], key[1]))
            else:
                conditions.append("%s = %s" % (key, self.placeHolder))
            params.append(value)
        query += " WHERE " + " AND ".join(conditions)

    cursor = self.conn.cursor()
    cursor.execute(query, params)
    return list(cursor.fetchall())
371 
def executeQuery(self, returnFields, joinClause, whereFields, range, values):
    """Extract metadata from the registry.

    @param returnFields (list of strings) Metadata fields to be extracted.
    @param joinClause   (list of strings) Tables in which metadata fields
                        are located; combined with NATURAL JOIN.
    @param whereFields  (list of tuples) First tuple element is metadata
                        field to query; second is the value that field
                        must have (often '?'). May be empty or None.
    @param range        (tuple) Value, lower limit, and upper limit for a
                        range condition on the metadata. Any of these can
                        be metadata fields. None for no range condition.
    @param values       (tuple) Tuple of values to be substituted for '?'
                        characters in the whereFields values or the range
                        values.
    @return (list of tuples) All sets of field values that meet the
            criteria, or None if there is no database connection."""
    if not self.conn:
        return None

    # Field, table and condition text is interpolated into the statement;
    # only `values` is bound as parameters.
    query = "SELECT DISTINCT %s FROM %s" % (", ".join(returnFields), " NATURAL JOIN ".join(joinClause))

    conditions = []
    if whereFields:
        conditions.extend("(%s = %s)" % (field, value) for field, value in whereFields)
    if range is not None:
        conditions.append("(%s BETWEEN %s AND %s)" % range)
    if conditions:
        query += " WHERE " + " AND ".join(conditions)

    cursor = self.conn.cursor()
    cursor.execute(query, values)
    return list(cursor.fetchall())
404 
405 
407  """A SQLite-based registry"""
408  placeHolder = "?" # Placeholder for parameter substitution
409 
def __init__(self, location):
    """Constructor

    Opens the SQLite3 file if it exists. If it does not, the registry is
    built without a connection (``self.conn`` is None) and ``self.root``
    is not set.

    Parameters
    ----------
    location : `str`
        Path to SQLite3 file
    """
    conn = None
    if os.path.exists(location):
        conn = sqlite3.connect(location)
        # Return TEXT columns as `str`.
        conn.text_factory = str
        self.root = location
    SqlRegistry.__init__(self, conn)
425 
426 
428  """A PostgreSQL-based registry"""
429  placeHolder = "%s"
430 
def __init__(self, location):
    """Constructor

    Reads the connection settings from the YAML file and connects to the
    PostgreSQL database they describe.

    Parameters
    ----------
    location : `str`
        Path to PostgreSQL configuration file.

    Raises
    ------
    RuntimeError
        If psycopg2 could not be imported.
    """
    if not havePgsql:
        raise RuntimeError("Cannot use PgsqlRegistry: could not import psycopg2")
    self._config = self.readYaml(location)
    # Hand exactly the connection settings to the driver, by keyword.
    connectArgs = {name: self._config[name] for name in ("host", "port", "database", "user", "password")}
    conn = pgsql.connect(**connectArgs)
    self.root = location
    SqlRegistry.__init__(self, conn)
447 
@staticmethod
def readYaml(location):
    """Read YAML configuration file

    The YAML configuration file should contain:
    * host : host name for database connection
    * port : port for database connection
    * user : user name for database connection
    * database : database name

    It may also contain:
    * password : password for database connection

    The optional entries are set to `None` in the output configuration.

    Parameters
    ----------
    location : `str`
        Path to PostgreSQL YAML config file.

    Returns
    -------
    config : `dict`
        Configuration

    Raises
    ------
    RuntimeError
        If the file does not hold a mapping (for example, it is empty),
        if a required key is missing, or if an unknown key is present.
    """
    # PyYAML >=5.1 prefers a different loader
    # NOTE(review): the yaml.Loader fallback can construct arbitrary Python
    # objects, so the configuration file must come from a trusted source.
    loader = getattr(yaml, "FullLoader", yaml.Loader)
    with open(location) as ff:
        data = yaml.load(ff, Loader=loader)

    # An empty file loads as None and a bare scalar or list has no keys;
    # report that clearly instead of failing on `data.keys()`.
    if not isinstance(data, dict):
        raise RuntimeError(
            "PostgreSQL YAML configuration (%s) should contain a mapping, but this contains a %s" %
            (location, type(data).__name__)
        )

    requireKeys = set(["host", "port", "database", "user"])
    optionalKeys = set(["password"])
    haveKeys = set(data.keys())
    # Apart from the optional keys, the file must hold exactly the required keys.
    if haveKeys - optionalKeys != requireKeys:
        raise RuntimeError(
            "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', "
            "but this contains: %s" %
            (location, ",".join("'%s'" % key for key in requireKeys),
             ",".join("'%s'" % key for key in data.keys()))
        )
    for key in optionalKeys:
        data.setdefault(key, None)

    return data
495 
def lookup(self, *args, **kwargs):
    """Perform a lookup via SqlRegistry.lookup, rolling back on failure.

    All arguments are forwarded unchanged. If the lookup raises, the
    connection is rolled back and the exception is re-raised.
    """
    try:
        rows = SqlRegistry.lookup(self, *args, **kwargs)
    except Exception:
        # presumably so a failed statement does not leave the connection
        # unusable for later queries; confirm against psycopg2 behaviour
        self.conn.rollback()
        raise
    return rows
def lookup(self, lookupProperties, reference, dataId, kwargs)
Definition: registries.py:326
def lookupFitsMetadata(filepath, template, lookupData, dataId)
Definition: registries.py:253
def __init__(self, lookupProperties, dataId)
Definition: registries.py:145
daf::base::PropertySet * set
Definition: fits.cc:884
def lookup(self, lookupProperties, reference, dataId, kwargs)
Definition: registries.py:198
def executeQuery(self, returnFields, joinClause, whereFields, range, values)
Definition: registries.py:372
def lookupMetadata(filepath, template, lookupData, storage)
Definition: registries.py:246
std::vector< SchemaItem< Flag > > * items