LSSTApplications  11.0-13-gbb96280,12.1.rc1,12.1.rc1+1,12.1.rc1+2,12.1.rc1+5,12.1.rc1+8,12.1.rc1-1-g06d7636+1,12.1.rc1-1-g253890b+5,12.1.rc1-1-g3d31b68+7,12.1.rc1-1-g3db6b75+1,12.1.rc1-1-g5c1385a+3,12.1.rc1-1-g83b2247,12.1.rc1-1-g90cb4cf+6,12.1.rc1-1-g91da24b+3,12.1.rc1-2-g3521f8a,12.1.rc1-2-g39433dd+4,12.1.rc1-2-g486411b+2,12.1.rc1-2-g4c2be76,12.1.rc1-2-gc9c0491,12.1.rc1-2-gda2cd4f+6,12.1.rc1-3-g3391c73+2,12.1.rc1-3-g8c1bd6c+1,12.1.rc1-3-gcf4b6cb+2,12.1.rc1-4-g057223e+1,12.1.rc1-4-g19ed13b+2,12.1.rc1-4-g30492a7
LSSTDataManagementBasePackage
registries.py
Go to the documentation of this file.
1 #
2 # LSST Data Management System
3 # Copyright 2008, 2009, 2010 LSST Corporation.
4 #
5 # This product includes software developed by the
6 # LSST Project (http://www.lsst.org/).
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the LSST License Statement and
19 # the GNU General Public License along with this program. If not,
20 # see <http://www.lsstcorp.org/LegalNotices/>.
21 #
22 
23 """This module provides registry classes for maintaining dataset metadata
24 for use by the Data Butler. Currently only a SQLite3-based registry is
25 implemented, but registries based on a text file, a policy file, a MySQL
26 (or other) relational database, and data gathered from scanning a filesystem
27 are all anticipated.
28 
29 Currently this module assumes posix access (for both PosixRegistry AND
30 SqliteRegistry). It is possible that it can be factored so that at least the
31 SqliteRegistry can be remote/not on the local filesystem. For now this module
32 is only used by CameraMapper and by PosixStorage, both of which work on the
33 local filesystem only, so this works for the time being.
34 """
35 from __future__ import absolute_import
36 from builtins import object
37 from past.builtins import basestring
38 
39 import copy
40 from . import fsScanner, sequencify
41 import os
42 import astropy.io.fits
43 import re
44 try:
45  import sqlite3
46  haveSqlite3 = True
47 except ImportError:
48  try:
49  # try external pysqlite package; deprecated
50  import sqlite as sqlite3
51  haveSqlite3 = True
52  except ImportError:
53  haveSqlite3 = False
54 
55 
class Registry(object):
    """Common base class of all registry implementations.

    Instances are normally obtained through the `create` factory rather
    than by instantiating a concrete registry class directly.
    """

    def __init__(self):
        pass

    @staticmethod
    def create(location):
        """Build the registry implementation that suits ``location``.

        @param location (string) Path or URL for registry, or None if
                                 unavailable
        @return None if location is None, or if an sqlite3 registry was
                selected but it could not open a connection; otherwise a
                SqliteRegistry (location matches '.sqlite3' and an sqlite3
                module is importable) or a PosixRegistry rooted at an
                existing path.
        @throws RuntimeError if no registry can be created for location.
        """
        if location is None:
            return None

        # NOTE: file-, paf- and mysql-based registries are anticipated (see
        # the module docstring) but none of them is implemented yet.

        # First choice: an sqlite3 registry.
        if haveSqlite3 and re.match(r'.*\.sqlite3', location):
            sqliteRegistry = SqliteRegistry(location)
            # conn is None when the sqlite3 file does not exist.
            return None if sqliteRegistry.conn is None else sqliteRegistry

        # Second choice: a filesystem registry rooted at an existing path.
        if not os.path.exists(location):
            raise RuntimeError("Unable to create registry using location: " + location)
        return PosixRegistry(root=location)
92 
93 
class PosixRegistry(Registry):
    """A glob-based filesystem registry"""

    def __init__(self, root):
        """Constructor.

        :param root: (string) directory under which templates are searched.
        """
        Registry.__init__(self)
        self.root = root

    @staticmethod
    def getHduNumber(template, dataId):
        """Looks up the HDU number for a given template+dataId.

        :param template: template with HDU specifier (ends with brackets and an
        identifier that can be populated by a key-value pair in dataId.
        e.g. "%(visit)07d/instcal%(visit)07d.fits.fz[%(ccdnum)d]"
        :param dataId: dictionary that hopefully has a key-value pair whose key
        matches (has the same name) as the key specifier in the template.
        :return: the HDU specified by the template+dataId pair, or None if the
        HDU can not be determined (no trailing "[...]" specifier, no
        "%(key)" inside it, or the key is absent from dataId).
        """
        # sanity check that the template at least ends with a bracket.
        if not template.endswith(']'):
            return None
        openIdx = template.rfind('[')
        if openIdx < 0:
            # A closing bracket without an opening one is not an HDU specifier.
            return None

        # get the key (with formatting) out of the brackets
        hduSpec = template[openIdx + 1:-1]
        # extract the key name from the formatting, e.g. "%(ccdnum)d" -> "ccdnum"
        keyStart = hduSpec.rfind('(')
        keyEnd = hduSpec.rfind(')')
        if keyStart < 0 or keyEnd <= keyStart:
            # e.g. a literal "[1]": there is no key to resolve through dataId.
            return None
        hduKey = hduSpec[keyStart + 1:keyEnd]

        if dataId is not None and hduKey in dataId:
            return dataId[hduKey]
        return None

    class LookupData(object):
        """Book-keeping for matching one candidate file against a lookup.

        Holds the requested lookupProperties, the constraining dataId and the
        key+value pairs found so far for the file currently being examined.
        """

        def __init__(self, lookupProperties, dataId):
            """
            :param lookupProperties: key or sequence of keys whose values are wanted.
            :param dataId: mapping of key -> required value; None is treated
            as an empty mapping (no constraints).
            """
            if dataId is None:
                dataId = {}
            self.dataId = copy.copy(dataId)
            lookupProperties = sequencify(lookupProperties)
            self.lookupProperties = copy.copy(lookupProperties)
            self.foundItems = {}
            # The status is cached until the found items change.
            self.cachedStatus = None
            self.neededKeys = set(lookupProperties).union(dataId.keys())

        def __repr__(self):
            return "LookupData lookupProperties:%s dataId:%s foundItems:%s cachedStatus:%s" % \
                (self.lookupProperties, self.dataId, self.foundItems, self.cachedStatus)

        def status(self):
            """Query the lookup status

            :return: 'match' if the key+value pairs in dataId have been satisfied and all keys in
            lookupProperties have been found;
            'incomplete' if the found data matches so far but keys of dataId or lookupProperties are
            still missing from the found items;
            'not match' if a found value differs from the value required by dataId.
            """
            if self.cachedStatus is not None:
                return self.cachedStatus
            status = 'match'
            for key in self.dataId:
                if key not in self.foundItems:
                    status = 'incomplete'
                elif self.dataId[key] != self.foundItems[key]:
                    # A mismatch is final: finding more keys can never turn it
                    # into a match, so there is no point in looking further.
                    status = 'not match'
                    break
            if status == 'match':
                for key in self.lookupProperties:
                    if key not in self.foundItems:
                        status = 'incomplete'
                        break
            self.cachedStatus = status
            return status

        def setFoundItems(self, items):
            """Replace the found key+value pairs (resets the cached status)."""
            self.cachedStatus = None
            self.foundItems = items

        def addFoundItems(self, items):
            """Merge more found key+value pairs (resets the cached status)."""
            self.cachedStatus = None
            self.foundItems.update(items)

        def getMissingKeys(self):
            """Return the set of needed keys that have not been found yet."""
            return self.neededKeys - set(self.foundItems.keys())

    def lookup(self, lookupProperties, reference, dataId, **kwargs):
        """Perform a lookup in the registry.

        Return values are refined by the values in dataId.
        Returns a list of values that match keys in lookupProperties.
        e.g. if the template is 'raw/raw_v%(visit)d_f%(filter)s.fits.gz', and
        dataId={'visit':1}, and lookupProperties is ['filter'], and the
        filesystem under self.root has exactly one file 'raw/raw_v1_fg.fits.gz'
        then the return value will be [('g',)]

        :param lookupProperties: keys whose values will be returned.
        :param reference: other data types that may be used to search for values
        (not used by this registry).
        :param dataId: mapping of key -> value that a file must match to be
        returned; may be None or empty for no constraint.
        NOTE(review): range values (2-item iterables) are not given any special
        treatment here; values are compared for equality only.
        :param **kwargs: keys required for the posix registry to search for items. If required keys are not
        provided will return an empty list.
        'template': required. template parameter (typically from a policy) that can be used to look for files
        'storage': optional. Needed to look for metadata in files. Currently supported values: 'FitsStorage'.
        :return: a list of tuples, one per matching file, holding the values of lookupProperties in order.
        """
        # required kwargs:
        template = kwargs.get('template')
        if template is None:
            return []
        # optional kwargs:
        storage = kwargs.get('storage')

        lookupData = PosixRegistry.LookupData(lookupProperties, dataId)
        scanner = fsScanner.FsScanner(template)
        allPaths = scanner.processPath(self.root)
        retItems = []  # one item for each found file that matches
        for path, foundProperties in allPaths.items():
            # Start from the properties the scanner found for this file. If
            # keys are still missing, try the metadata of the file itself;
            # a file only counts when every needed key is present and the
            # dataId values agree.
            lookupData.setFoundItems(foundProperties)
            if lookupData.status() == 'incomplete':
                PosixRegistry.lookupMetadata(os.path.join(self.root, path), template, lookupData, storage)
            if lookupData.status() == 'match':
                retItems.append(tuple(lookupData.foundItems[key] for key in lookupData.lookupProperties))
        return retItems

    @staticmethod
    def lookupMetadata(filepath, template, lookupData, storage):
        """Dispatcher for looking up metadata in a file of a given storage type

        :param filepath: path to the file
        :param template: template that was used to discover the file.
        :param lookupData: LookupData instance to be updated with what is found.
        :param storage: storage type name; only 'FitsStorage' is handled, any
        other value leaves lookupData unchanged.
        """
        if storage == 'FitsStorage':
            # Hand over the dataId (not the storage name) so that the HDU named
            # by the template can actually be resolved.
            PosixRegistry.lookupFitsMetadata(filepath, template, lookupData, lookupData.dataId)

    @staticmethod
    def lookupFitsMetadata(filepath, template, lookupData, dataId):
        """Look up metadata in a fits file.

        Will try to discover the correct HDU to look in by testing if the
        template has a value in brackets at the end.
        If the HDU is specified but the metadata key is not discovered in
        that HDU, will look in the primary HDU before giving up.
        Keys found in neither HDU are recorded in lookupData with value None.

        :param filepath: path to the file
        :param template: template that was used to discover the file. This can
        be used to look up the correct HDU as needed.
        :param lookupData: an instance of LookupData that contains the
        lookupProperties, the dataId, and the data that has been found so far.
        Will be updated with new information as discovered.
        :param dataId: mapping used to resolve the HDU key of the template.
        :return: None. Returns without touching lookupData if the file can
        not be opened.
        """
        try:
            hdulist = astropy.io.fits.open(filepath, memmap=True)
        except IOError:
            return
        try:
            hduNumber = PosixRegistry.getHduNumber(template=template, dataId=dataId)
            try:
                hduInRange = hduNumber is not None and hduNumber < len(hdulist)
            except TypeError:
                # dataId held a value that is not comparable with an HDU
                # count; fall back to the primary HDU only.
                hduInRange = False
            hdu = hdulist[hduNumber] if hduInRange else None
            primaryHdu = hdulist[0] if len(hdulist) > 0 else None

            for key in lookupData.getMissingKeys():
                value = None
                if hdu is not None and key in hdu.header:
                    value = hdu.header[key]
                # if the value is not in the indicated HDU, try the primary HDU:
                elif primaryHdu is not None and key in primaryHdu.header:
                    value = primaryHdu.header[key]
                lookupData.addFoundItems({key: value})
        finally:
            # Always release the file handle, even if a header lookup fails.
            hdulist.close()
266 
267 
class SqliteRegistry(Registry):
    """A SQLite3-based registry."""

    def __init__(self, location):
        """Constructor.
        @param location (string) Path to SQLite3 file"""

        Registry.__init__(self)
        if os.path.exists(location):
            self.conn = sqlite3.connect(location)
            self.conn.text_factory = str
        else:
            # No file, no connection; lookup() and executeQuery() then
            # return None.
            self.conn = None

    def lookup(self, lookupProperties, reference, dataId, **kwargs):
        """Perform a lookup in the registry.

        Return values are refined by the values in dataId.
        Returns a list of values that match keys in lookupProperties.
        e.g. if lookupProperties is ['filter'], reference is 'raw' and
        dataId={'visit':1}, the executed query is
        "SELECT DISTINCT filter FROM raw WHERE visit = ?" with 1 bound to
        the placeholder, and the return value could be [('g',)].

        :param lookupProperties: column name or sequence of column names to select.
        :param reference: table name or sequence of table names; several
        tables are combined with NATURAL JOIN.
        :param dataId: mapping, or None/empty for no constraint. Keys must be
        a string or a 2-item iterable of strings.
        If key is a string then will look for rows where that column equals the value.
        If key is a 2-item iterable then will look for rows where the value is between
        the columns named by key[0] and key[1].
        :param **kwargs: nothing needed for sqlite lookup
        :return: a list of tuples of values that match keys in lookupProperties,
        or None if there is no database connection.
        :raises RuntimeError: if an iterable key does not have exactly 2 items.
        """
        if not self.conn:
            return None

        # input variable sanitization:
        reference = sequencify(reference)
        lookupProperties = sequencify(lookupProperties)

        # NOTE: column and table names are interpolated into the statement
        # as-is (identifiers can not be bound as parameters), so they must
        # come from a trusted source. Values are always bound through '?'.
        cmd = "SELECT DISTINCT " + ", ".join(lookupProperties)
        cmd += " FROM " + " NATURAL JOIN ".join(reference)
        valueList = []
        if dataId:
            whereList = []
            for k, v in dataId.items():
                if hasattr(k, '__iter__') and not isinstance(k, basestring):
                    if len(k) != 2:
                        raise RuntimeError("Wrong number of keys for range:%s" % (k,))
                    whereList.append("(? BETWEEN %s AND %s)" % (k[0], k[1]))
                else:
                    whereList.append("%s = ?" % k)
                valueList.append(v)
            cmd += " WHERE " + " AND ".join(whereList)
        return self.conn.execute(cmd, valueList).fetchall()

    def executeQuery(self, returnFields, joinClause, whereFields, range, values):
        """Extract metadata from the registry.
        @param returnFields (list of strings) Metadata fields to be extracted.
        @param joinClause   (list of strings) Tables in which metadata fields
                            are located.
        @param whereFields  (list of tuples) First tuple element is metadata
                            field to query; second is the value that field
                            must have (often '?').
        @param range        (tuple) Value, lower limit, and upper limit for a
                            range condition on the metadata. Any of these can
                            be metadata fields. May be None for no range
                            condition.
        @param values       (tuple) Tuple of values to be substituted for '?'
                            characters in the whereFields values or the range
                            values. None is treated as an empty tuple.
        @return (list of tuples) All sets of field values that meet the
                criteria, or None if there is no database connection."""
        if not self.conn:
            return None
        # NOTE: field names, table names and the whereFields/range entries
        # are interpolated into the statement as-is; only '?' placeholders
        # are bound from values, so everything else must be trusted.
        cmd = "SELECT DISTINCT " + ", ".join(returnFields)
        cmd += " FROM " + " NATURAL JOIN ".join(joinClause)
        whereList = []
        if whereFields:
            whereList.extend("(%s = %s)" % (k, v) for k, v in whereFields)
        if range is not None:
            # tuple() lets any 3-item sequence (e.g. a list) be used here.
            whereList.append("(%s BETWEEN %s AND %s)" % tuple(range))
        if whereList:
            cmd += " WHERE " + " AND ".join(whereList)
        if values is None:
            values = ()
        return self.conn.execute(cmd, values).fetchall()