23 """This module provides registry classes for maintaining dataset metadata 
   24 for use by the Data Butler.  Currently only a SQLite3-based registry is 
   25 implemented, but registries based on a text file, a policy file, a MySQL 
   26 (or other) relational database, and data gathered from scanning a filesystem 
   29 Currently this module assumes posix access (for both PosixRegistry AND 
   30 SqliteRegistry). It is possible that it can be factored so that at least the 
   31 SqliteRegistry can be remote/not on the local filesystem. For now this module 
   32 is only used by CameraMapper and by PosixStorage, both of which work on the 
   33 local filesystem only, so this works for the time being. 
   36 from . 
import fsScanner, sequencify
 
   38 import astropy.io.fits
 
   48         import sqlite 
as sqlite3
 
   55     import psycopg2 
as pgsql
 
   62     """The registry base class.""" 
   72         """Create a registry object of an appropriate type. 
   73         @param location (string) Path or URL for registry, or None if 
   82         if location.endswith(
".pgsql"):
 
   86         if re.match(
r'.*\.sqlite3', location):
 
   88                 raise RuntimeError(
"sqlite3 registry specified (%s), but unable to import sqlite3 module" %
 
   91             if registry.conn 
is None:
 
  100         if os.path.isdir(location):
 
  103         raise RuntimeError(
"Unable to create registry using location: " + location)
 
  107     """A glob-based filesystem registry""" 
  110         Registry.__init__(self)
 
  115         """Looks up the HDU number for a given template+dataId. 
  116         :param template: template with HDU specifier (ends with brackets and an 
  117         identifier that can be populated by a key-value pair in dataId), 
  118         e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]" 
  119         :param dataId: dictionary that hopefully has a key-value pair whose key 
  120         matches (has the same name as) the key specifier in the template. 
  121         :return: the HDU specified by the template+dataId pair, or None if the 
  122         HDU can not be determined. 
  125         if not template.endswith(
']'):
 
  129         hduKey = template[template.rfind(
'[') + 1:template.rfind(
']')]
 
  131         hduKey = hduKey[hduKey.rfind(
'(') + 1:hduKey.rfind(
')')]
 
  134             return dataId[hduKey]
 
  141             lookupProperties = 
sequencify(lookupProperties)
 
  148             return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % \
 
  152             """Query the lookup status 
  154             :return: 'match' if the key+value pairs in dataId have been satisfied and keys in 
  155             lookupProperties have been found and their key+value added to resolvedId 
  156             'incomplete' if the found data matches but not all keys in lookupProperties have been matched 
  157             'notMatch' if data in foundItems does not match data in dataId 
  160                 """Placeholder class for item not found. 
  162                 (None might be a valid value so we don't want to use that) 
  166             if self.cachedStatus 
is not None:
 
  167                 return self.cachedStatus
 
  168             self.cachedStatus = 
'match' 
  169             for key 
in self.lookupProperties:
 
  170                 val = self.foundItems.get(key, NotFound)
 
  172                     self.cachedStatus = 
'incomplete' 
  174             for dataIdKey, dataIdValue 
in self.dataId.
items():
 
  175                 foundValue = self.foundItems.get(dataIdKey, NotFound)
 
  176                 if foundValue 
is not NotFound 
and foundValue != dataIdValue:
 
  177                     self.cachedStatus = 
'notMatch' 
  179             return self.cachedStatus
 
  192     def lookup(self, lookupProperties, reference, dataId, **kwargs):
 
  193         """Perform a lookup in the registry. 
  195         Return values are refined by the values in dataId. 
  196         Returns a list of values that match keys in lookupProperties. 
  197         e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and 
  198         dataId={'visit':1}, and lookupProperties is ['filter'], and the 
  199         filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz' 
  200         then the return value will be [('g',)] 
  202         :param lookupProperties: keys whose values will be returned. 
  203         :param reference: other data types that may be used to search for values. 
  204         :param dataId: must be an iterable. Keys must be string. 
  205         If value is a string then will look for elements in the repository that match value for key. 
  206         If value is a 2-item iterable then will look for elements in the repository that are between (inclusive) 
  207         the first and second items in the value. 
  208         :param **kwargs: keys required for the posix registry to search for items. If required keys are not 
  209         provided, will return an empty list. 
  210         'template': required. template parameter (typically from a policy) that can be used to look for files 
  211         'storage': optional. Needed to look for metadata in files. Currently supported values: 'FitsStorage'. 
  212         :return: a list of values that match keys in lookupProperties. 
  215         if 'template' in kwargs:
 
  216             template = kwargs[
'template']
 
  220         storage = kwargs[
'storage'] 
if 'storage' in kwargs 
else None 
  224         allPaths = scanner.processPath(self.
rootroot)
 
  226         for path, foundProperties 
in allPaths.items():
 
  231             lookupData.setFoundItems(foundProperties)
 
  232             if 'incomplete' == lookupData.status():
 
  233                 PosixRegistry.lookupMetadata(os.path.join(self.
rootroot, path), template, lookupData, storage)
 
  234             if 'match' == lookupData.status():
 
  235                 ll = tuple(lookupData.foundItems[key] 
for key 
in lookupData.lookupProperties)
 
  241         """Dispatcher for looking up metadata in a file of a given storage type 
  243         if storage == 
'FitsStorage':
 
  244             PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, storage)
 
  248         """Look up metadata in a fits file. 
  249         Will try to discover the correct HDU to look in by testing if the 
  250         template has a value in brackets at the end. 
  251         If the HDU is specified but the metadata key is not discovered in 
  252         that HDU, will look in the primary HDU before giving up. 
  253         :param filepath: path to the file 
  254         :param template: template that was used to discover the file. This can 
  255         be used to look up the correct HDU as needed. 
  256         :param lookupData: an instance of LookupData that contains the 
  257         lookupProperties, the dataId, and the data that has been found so far. 
  258         Will be updated with new information as discovered. 
  263             hdulist = astropy.io.fits.open(filepath, memmap=
True)
 
  266         hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
 
  267         if hduNumber 
is not None and hduNumber < len(hdulist):
 
  268             hdu = hdulist[hduNumber]
 
  272             primaryHdu = hdulist[0]
 
  276         for property 
in lookupData.getMissingKeys():
 
  278             if hdu 
is not None and property 
in hdu.header:
 
  279                 propertyValue = hdu.header[property]
 
  281             elif primaryHdu 
is not None and property 
in primaryHdu.header:
 
  282                 propertyValue = primaryHdu.header[property]
 
  283             lookupData.addFoundItems({property: propertyValue})
 
  287     """A base class for SQL-based registries 
  289     Subclasses should define the class variable `placeHolder` (the particular 
  290     placeholder to use for parameter substitution) appropriately. The 
  291     database's python module should define `paramstyle` (see PEP 249), which 
  292     would indicate what to use for a placeholder: 
  293     * paramstyle = "qmark" --> placeHolder = "?" 
  294     * paramstyle = "format" --> placeHolder = "%s" 
  295     Other `paramstyle` values are not currently supported. 
  297     Constructor parameters 
  298     ---------------------- 
  299     conn : DBAPI connection object 
  309         conn : DBAPI connection object 
  312         Registry.__init__(self)
 
  316         if hasattr(self, 
"conn") 
and self.
connconn:
 
  317             self.
connconn.close()
 
  320     def _lookup(self, lookupProperties, dataId, reference, checkColumns=False):
 
  321         """Perform a lookup in the registry. 
  323         This is the worker code for cls.lookup with the added option of checking 
  324         that all the columns being looked up are in the database.  The classic 
  325         example here is adding a template with an hdu, where the hdu in the dataId 
  326         prevents us looking up e.g. dateObs.  checkColumns results in a performance 
  327         penalty, so is only invoked when a problem in the dataId keys has been seen 
  329         Return values are refined by the values in dataId. 
  330         Returns a list of values that match keys in lookupProperties. 
  331         e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and 
  332         dataId={'visit':1}, and lookupProperties is ['filter'], and the 
  333         filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz' 
  334         then the return value will be [('g',)] 
  336         :param lookupProperties: 
  337         :param dataId: must be a key/value iterable. Keys must be string. 
  338         See `SqlRegistry.lookup` for further details 
  339         :param reference: other data types that may be used to search for values. 
  340         :param checkColumns: if True, check that keys are actually in the registry and ignore them if not 
  341         :return: a list of values that match keys in lookupProperties. 
  343         cmd = 
"SELECT DISTINCT " 
  344         cmd += 
", ".join(lookupProperties)
 
  345         cmd += 
" FROM " + 
" NATURAL JOIN ".join(reference)
 
  347         if dataId 
is not None and len(dataId) > 0:
 
  349             for k, v 
in dataId.items():
 
  352                         self.
connconn.cursor().execute(
 
  353                             f
'SELECT {k} FROM {" NATURAL JOIN ".join(reference)} LIMIT 1')
 
  354                     except sqlite3.OperationalError:
 
  357                 if hasattr(k, 
'__iter__') 
and not isinstance(k, str):
 
  359                         raise RuntimeError(
"Wrong number of keys for range:%s" % (k,))
 
  360                     whereList.append(
"(%s BETWEEN %s AND %s)" % (self.
placeHolderplaceHolder, k[0], k[1]))
 
  363                     whereList.append(
"%s = %s" % (k, self.
placeHolderplaceHolder))
 
  365             cmd += 
" WHERE " + 
" AND ".join(whereList)
 
  366         cursor = self.
connconn.cursor()
 
  367         cursor.execute(cmd, valueList)
 
  368         return [row 
for row 
in cursor.fetchall()]
 
  370     def lookup(self, lookupProperties, reference, dataId, **kwargs):
 
  371         """Perform a lookup in the registry. 
  373         Return values are refined by the values in dataId. 
  374         Returns a list of values that match keys in lookupProperties. 
  375         e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and 
  376         dataId={'visit':1}, and lookupProperties is ['filter'], and the 
  377         filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz' 
  378         then the return value will be [('g',)] 
  380         :param lookupProperties: 
  381         :param dataId: must be a key/value iterable. Keys must be string. 
  382         If value is a string then will look for elements in the repository that match value for key. 
  383         If value is a 2-item iterable then will look for elements in the repository where the value is between 
  384         the values of value[0] and value[1]. 
  385         :param reference: other data types that may be used to search for values. 
  386         :param **kwargs: nothing needed for sqlite lookup 
  387         :return: a list of values that match keys in lookupProperties. 
  389         if not self.
connconn:
 
  394         lookupProperties = 
sequencify(lookupProperties)
 
  397             return self.
_lookup_lookup(lookupProperties, dataId, reference)
 
  398         except sqlite3.OperationalError:  
 
  399             return self.
_lookup_lookup(lookupProperties, dataId, reference, checkColumns=
True)
 
  401     def executeQuery(self, returnFields, joinClause, whereFields, range, values):
 
  402         """Extract metadata from the registry. 
  403         @param returnFields (list of strings) Metadata fields to be extracted. 
  404         @param joinClause   (list of strings) Tables in which metadata fields 
  406         @param whereFields  (list of tuples) First tuple element is metadata 
  407                             field to query; second is the value that field 
  408                             must have (often '?'). 
  409         @param range        (tuple) Value, lower limit, and upper limit for a 
  410                             range condition on the metadata.  Any of these can 
  412         @param values       (tuple) Tuple of values to be substituted for '?' 
  413                             characters in the whereFields values or the range 
  415         @return (list of tuples) All sets of field values that meet the 
  417         if not self.
connconn:
 
  419         cmd = 
"SELECT DISTINCT " 
  420         cmd += 
", ".join(returnFields)
 
  421         cmd += 
" FROM " + 
" NATURAL JOIN ".join(joinClause)
 
  424             for k, v 
in whereFields:
 
  425                 whereList.append(
"(%s = %s)" % (k, v))
 
  426         if range 
is not None:
 
  427             whereList.append(
"(%s BETWEEN %s AND %s)" % range)
 
  428         if len(whereList) > 0:
 
  429             cmd += 
" WHERE " + 
" AND ".join(whereList)
 
  430         cursor = self.
connconn.cursor()
 
  431         cursor.execute(cmd, values)
 
  432         return [row 
for row 
in cursor.fetchall()]
 
  436     """A SQLite-based registry""" 
  447         if os.path.exists(location):
 
  448             conn = sqlite3.connect(location)
 
  449             conn.text_factory = str
 
  453         SqlRegistry.__init__(self, conn)
 
  457     """A PostgreSQL-based registry""" 
  466             Path to PostgreSQL configuration file. 
  469             raise RuntimeError(
"Cannot use PgsqlRegistry: could not import psycopg2")
 
  470         config = self.
readYamlreadYaml(location)
 
  472         conn = pgsql.connect(host=config[
"host"], port=config[
"port"], database=config[
"database"],
 
  473                              user=config[
"user"], password=config[
"password"])
 
  475         SqlRegistry.__init__(self, conn)
 
  479         """Read YAML configuration file 
  481         The YAML configuration file should contain: 
  482         * host : host name for database connection 
  483         * port : port for database connection 
  484         * user : user name for database connection 
  485         * database : database name 
  488         * password : password for database connection 
  490         The optional entries are set to `None` in the output configuration. 
  495             Path to PostgreSQL YAML config file. 
  504             loader = yaml.UnsafeLoader
 
  505         except AttributeError:
 
  507         with open(location) 
as ff:
 
  508             data = yaml.load(ff, Loader=loader)
 
  509         requireKeys = 
set([
"host", 
"port", 
"database", 
"user"])
 
  510         optionalKeys = 
set([
"password"])
 
  511         haveKeys = 
set(data.keys())
 
  512         if haveKeys - optionalKeys != requireKeys:
 
  514                 "PostgreSQL YAML configuration (%s) should contain only %s, and may contain 'password', " 
  515                 "but this contains: %s" %
 
  516                 (location, 
",".join(
"'%s'" % key 
for key 
in requireKeys),
 
  517                  ",".join(
"'%s'" % key 
for key 
in data.keys()))
 
  519         for key 
in optionalKeys:
 
  527             return SqlRegistry.lookup(self, *args, **kwargs)
 
  529             self.
connconn.rollback()
 
std::vector< SchemaItem< Flag > > * items
def __init__(self, location)
def lookup(self, *args, **kwargs)
def addFoundItems(self, items)
def __init__(self, lookupProperties, dataId)
def setFoundItems(self, items)
def lookupFitsMetadata(filepath, template, lookupData, dataId)
def lookup(self, lookupProperties, reference, dataId, **kwargs)
def lookupMetadata(filepath, template, lookupData, storage)
def getHduNumber(template, dataId)
def lookup(self, lookupProperties, reference, dataId, **kwargs)
def _lookup(self, lookupProperties, dataId, reference, checkColumns=False)
def executeQuery(self, returnFields, joinClause, whereFields, range, values)
def __init__(self, location)
daf::base::PropertySet * set