LSSTApplications  17.0+124,17.0+14,17.0+73,18.0.0+37,18.0.0+80,18.0.0-4-g68ffd23+4,18.1.0-1-g0001055+12,18.1.0-1-g03d53ef+5,18.1.0-1-g1349e88+55,18.1.0-1-g2505f39+44,18.1.0-1-g5315e5e+4,18.1.0-1-g5e4b7ea+14,18.1.0-1-g7e8fceb+4,18.1.0-1-g85f8cd4+48,18.1.0-1-g8ff0b9f+4,18.1.0-1-ga2c679d+1,18.1.0-1-gd55f500+35,18.1.0-10-gb58edde+2,18.1.0-11-g0997b02+4,18.1.0-13-gfe4edf0b+12,18.1.0-14-g259bd21+21,18.1.0-19-gdb69f3f+2,18.1.0-2-g5f9922c+24,18.1.0-2-gd3b74e5+11,18.1.0-2-gfbf3545+32,18.1.0-26-g728bddb4+5,18.1.0-27-g6ff7ca9+2,18.1.0-3-g52aa583+25,18.1.0-3-g8ea57af+9,18.1.0-3-gb69f684+42,18.1.0-3-gfcaddf3+6,18.1.0-32-gd8786685a,18.1.0-4-gf3f9b77+6,18.1.0-5-g1dd662b+2,18.1.0-5-g6dbcb01+41,18.1.0-6-gae77429+3,18.1.0-7-g9d75d83+9,18.1.0-7-gae09a6d+30,18.1.0-9-gc381ef5+4,w.2019.45
LSSTDataManagementBasePackage
convertRepo.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 from __future__ import annotations
22 
23 __all__ = ["ConvertRepoConfig", "ConvertRepoTask", "ConvertRepoSkyMapConfig"]
24 
25 import os
26 import fnmatch
27 from dataclasses import dataclass
28 from typing import Iterable, Optional, List, Dict
29 
30 from lsst.utils import doImport
31 from lsst.daf.butler import (
32  Butler as Butler3,
33  SkyPixDimension
34 )
35 from lsst.pex.config import Config, ConfigurableField, ConfigDictField, DictField, ListField, Field
36 from lsst.pipe.base import Task
37 from lsst.skymap import skyMapRegistry, BaseSkyMap
38 
39 from ..ingest import RawIngestTask
40 from .repoConverter import ConversionSubset
41 from .rootRepoConverter import RootRepoConverter
42 from .calibRepoConverter import CalibRepoConverter
43 from .standardRepoConverter import StandardRepoConverter
44 
45 
46 @dataclass
48  """Struct containing information about a skymap that may appear in a Gen2
49  repository.
50  """
51 
52  name: str
53  """Name of the skymap used in Gen3 data IDs.
54  """
55 
56  sha1: bytes
57  """Hash computed by `BaseSkyMap.getSha1`.
58  """
59 
60  instance: BaseSkyMap
61  """The `BaseSkyMap` instance itself, as constructed from configuration.
62  """
63 
64  used: bool = False
65  """Whether this skymap has been found in at least one repository being
66  converted.
67  """
68 
69 
71  """Sub-config used to hold the parameters of a SkyMap.
72 
73  Notes
74  -----
75  This config only needs to exist because we can't put a
76  `~lsst.pex.config.RegistryField` directly inside a
77  `~lsst.pex.config.ConfigDictField`.
78 
79  It needs to have its only field named "skyMap" for compatibility with the
80  configuration of `lsst.pipe.tasks.MakeSkyMapTask`, which we want so we can
81  use one config file in an obs package to configure both.
82 
83  This name leads to unfortunate repetition with the field named
84  "skymap" that holds it - "skyMap[name].skyMap" - but that seems
85  unavoidable.
86  """
87  skyMap = skyMapRegistry.makeField(
88  doc="Type and parameters for the SkyMap itself.",
89  default="dodeca",
90  )
91 
92 
93 class ConvertRepoConfig(Config):
94  raws = ConfigurableField(
95  "Configuration for subtask responsible for ingesting raws and adding "
96  "visit and exposure dimension entries.",
97  target=RawIngestTask,
98  )
99  skyMaps = ConfigDictField(
100  "Mapping from Gen3 skymap name to the parameters used to construct a "
101  "BaseSkyMap instance. This will be used to associate names with "
102  "existing skymaps found in the Gen2 repo.",
103  keytype=str,
104  itemtype=ConvertRepoSkyMapConfig,
105  default={}
106  )
107  collections = DictField(
108  "Special collections (values) for certain dataset types (keys). "
109  "These are used in addition to rerun collections for datasets in "
110  "reruns. The 'raw' dataset must have an entry here if it is to be "
111  "converted.",
112  keytype=str,
113  itemtype=str,
114  default={
115  "deepCoadd_skyMap": "skymaps",
116  "brightObjectMask": "masks",
117  }
118  )
119  storageClasses = DictField(
120  "Mapping from dataset type name or Gen2 policy entry (e.g. 'python' "
121  "or 'persistable') to the Gen3 StorageClass name.",
122  keytype=str,
123  itemtype=str,
124  default={
125  "BaseSkyMap": "SkyMap",
126  "BaseCatalog": "Catalog",
127  "BackgroundList": "Background",
128  "raw": "Exposure",
129  }
130  )
131  doRegisterInstrument = Field(
132  "If True (default), add dimension records for the Instrument and its "
133  "filters and detectors to the registry instead of assuming they are "
134  "already present.",
135  dtype=bool,
136  default=True,
137  )
138  doWriteCuratedCalibrations = Field(
139  "If True (default), ingest human-curated calibrations directly via "
140  "the Instrument interface. Note that these calibrations are never "
141  "converted from Gen2 repositories.",
142  dtype=bool,
143  default=True,
144  )
145  refCats = ListField(
146  "The names of reference catalogs (subdirectories under ref_cats) to "
147  "be converted",
148  dtype=str,
149  default=[]
150  )
151  fileIgnorePatterns = ListField(
152  "Filename globs that should be ignored instead of being treated as "
153  "datasets.",
154  dtype=str,
155  default=["README.txt", "*~?", "butler.yaml", "gen3.sqlite3"]
156  )
157  datasetIncludePatterns = ListField(
158  "Glob-style patterns for dataset type names that should be converted.",
159  dtype=str,
160  default=["*"]
161  )
162  datasetIgnorePatterns = ListField(
163  "Glob-style patterns for dataset type names that should not be "
164  "converted despite matching a pattern in datasetIncludePatterns.",
165  dtype=str,
166  default=[]
167  )
168  ccdKey = Field(
169  "Key used for the Gen2 equivalent of 'detector' in data IDs.",
170  dtype=str,
171  default="ccd",
172  )
173  relatedOnly = Field(
174  "If True, only convert datasets that are related to the "
175  "ingested visits. Ignored unless a list of visits is passed to "
176  "run().",
177  dtype=bool,
178  default=False,
179  )
180 
181  @property
182  def transfer(self):
183  return self.raws.transfer
184 
185  @transfer.setter
186  def transfer(self, value):
187  self.raws.transfer = value
188 
189  @property
190  def instrument(self):
191  return self.raws.instrument
192 
193  @instrument.setter
194  def instrument(self, value):
195  self.raws.instrument = value
196 
197  def setDefaults(self):
198  self.transfer = None
199 
200  # TODO: check that there are no collection overrides for curated
201  # calibrations, since we don't have a good way to utilize them.
202 
203 
205  """A task that converts one or more related Gen2 data repositories to a
206  single Gen3 data repository (with multiple collections).
207 
208  Parameters
209  ----------
210  config: `ConvertRepoConfig`
211  Configuration for this task.
212  butler3: `lsst.daf.butler.Butler`
213  Gen3 Butler instance that represents the data repository datasets will
214  be ingested into. The collection and/or run associated with this
215  Butler will be ignored in favor of collections/runs passed via config
216  or to `run`.
217  kwds
218  Other keyword arguments are forwarded to the `Task` constructor.
219 
220  Notes
221  -----
222  Most of the work of converting repositories is delegated to instances of
223  the `RepoConverter` hierarchy. The `ConvertRepoTask` instance itself holds
224  only state that is relevant for all Gen2 repositories being ingested, while
225  each `RepoConverter` instance holds only state relevant for the conversion
226  of a single Gen2 repository. Both the task and the `RepoConverter`
227  instances are single use; `ConvertRepoTask.run` and most `RepoConverter`
228  methods may only be called once on a particular instance.
229  """
230 
231  ConfigClass = ConvertRepoConfig
232 
233  _DefaultName = "convertRepo"
234 
235  def __init__(self, config=None, *, butler3: Butler3, **kwds):
236  super().__init__(config, **kwds)
237  self.butler3 = butler3
238  self.registry = self.butler3.registry
239  self.universe = self.registry.dimensions
240  if self.isDatasetTypeIncluded("raw"):
241  self.makeSubtask("raws", butler=butler3)
242  self.instrument = self.raws.instrument
243  else:
244  self.raws = None
245  self.instrument = doImport(self.config.instrument)()
246  self._configuredSkyMapsBySha1 = {}
247  self._configuredSkyMapsByName = {}
248  for name, config in self.config.skyMaps.items():
249  instance = config.skyMap.apply()
250  struct = ConfiguredSkyMap(name=name, sha1=instance.getSha1(), instance=instance)
251  self._configuredSkyMapsBySha1[struct.sha1] = struct
252  self._configuredSkyMapsByName[struct.name] = struct
253  self._usedSkyPix = set()
254 
255  def isDatasetTypeIncluded(self, datasetTypeName: str):
256  """Return `True` if configuration indicates that the given dataset type
257  should be converted.
258 
259  This method is intended to be called primarily by the
260  `RepoConverter` instances used internally by the task.
261 
262  Parameters
263  ----------
264  datasetTypeName: str
265  Name of the dataset type.
266 
267  Returns
268  -------
269  included : `bool`
270  Whether the dataset should be included in the conversion.
271  """
272  return (
273  any(fnmatch.fnmatchcase(datasetTypeName, pattern)
274  for pattern in self.config.datasetIncludePatterns) and
275  not any(fnmatch.fnmatchcase(datasetTypeName, pattern)
276  for pattern in self.config.datasetIgnorePatterns)
277  )
278 
279  def useSkyMap(self, skyMap: BaseSkyMap) -> str:
280  """Indicate that a repository uses the given SkyMap.
281 
282  This method is intended to be called primarily by the
283  `RepoConverter` instances used internally by the task.
284 
285  Parameters
286  ----------
287  skyMap : `lsst.skymap.BaseSkyMap`
288  SkyMap instance being used, typically retrieved from a Gen2
289  data repository.
290 
291  Returns
292  -------
293  name : `str`
294  The name of the skymap in Gen3 data IDs.
295  """
296  sha1 = skyMap.getSha1()
297  try:
298  struct = self._configuredSkyMapsBySha1[sha1]
299  except KeyError as err:
300  raise LookupError(f"SkyMap with sha1={sha1} not included in configuration.") from err
301  struct.used = True
302  return struct.name
303 
304  def registerUsedSkyMaps(self, subset: Optional[ConversionSubset]):
305  """Register all skymaps that have been marked as used.
306 
307  This method is intended to be called primarily by the
308  `RepoConverter` instances used internally by the task.
309 
310  Parameters
311  ----------
312  subset : `ConversionSubset`, optional
313  Object that will be used to filter converted datasets by data ID.
314  If given, it will be updated with the tracts of this skymap that
315  overlap the visits in the subset.
316  """
317  for struct in self._configuredSkyMapsBySha1.values():
318  if struct.used:
319  struct.instance.register(struct.name, self.registry)
320  if subset is not None and self.config.relatedOnly:
321  subset.addSkyMap(self.registry, struct.name)
322 
323  def useSkyPix(self, dimension: SkyPixDimension):
324  """Indicate that a repository uses the given SkyPix dimension.
325 
326  This method is intended to be called primarily by the
327  `RepoConverter` instances used internally by the task.
328 
329  Parameters
330  ----------
331  dimension : `lsst.daf.butler.SkyPixDimension`
332  Dimension representing a pixelization of the sky.
333  """
334  self._usedSkyPix.add(dimension)
335 
336  def registerUsedSkyPix(self, subset: Optional[ConversionSubset]):
337  """Register all SkyPix dimensions that have been marked as used.
338 
339  This method is intended to be called primarily by the
340  `RepoConverter` instances used internally by the task.
341 
342  Parameters
343  ----------
344  subset : `ConversionSubset`, optional
345  Object that will be used to filter converted datasets by data ID.
346  If given, it will be updated with the pixelization IDs that
347  overlap the visits in the subset.
348  """
349  if subset is not None and self.config.relatedOnly:
350  for dimension in self._usedSkyPix:
351  subset.addSkyPix(self.registry, dimension)
352 
353  def run(self, root: str, collections: List[str], *,
354  calibs: Dict[str, List[str]] = None,
355  reruns: Dict[str, List[str]] = None,
356  visits: Optional[Iterable[int]] = None):
357  """Convert a group of related data repositories.
358 
359  Parameters
360  ----------
361  root : `str`
362  Complete path to the root Gen2 data repository. This should be
363  a data repository that includes a Gen2 registry and any raw files
364  and/or reference catalogs.
365  collections : `list` of `str`
366  Gen3 collections that datasets from the root repository should be
367  associated with. This should include any rerun collection that
368  these datasets should also be considered to be part of; because of
369  structural differences between Gen2 parent/child relationships and
370  Gen3 collections, these cannot be reliably inferred.
371  calibs : `dict`
372  Dictionary mapping calibration repository path to the collections
373  that the repository's datasets should be associated with. The path
374  may be relative to ``root`` or absolute. Collections should
375  include child repository collections as appropriate (see
376  documentation for ``collections``).
377  reruns : `dict`
378  Dictionary mapping rerun repository path to the collections that
379  the repository's datasets should be associated with. The path may
380  be relative to ``root`` or absolute. Collections should include
381  child repository collections as appropriate (see documentation for
382  ``collections``).
383  visits : iterable of `int`, optional
384  The integer IDs of visits to convert. If not provided, all visits
385  in the Gen2 root repository will be converted.
386  """
387 
388  if calibs is None:
389  calibs = {}
390  if reruns is None:
391  reruns = {}
392  if visits is not None:
393  subset = ConversionSubset(instrument=self.instrument.getName(), visits=frozenset(visits))
394  else:
395  if self.config.relatedOnly:
396  self.log.warn("config.relatedOnly is True but all visits are being ingested; "
397  "no filtering will be done.")
398  subset = None
399 
400  if self.config.doRegisterInstrument:
401  self.instrument.register(self.registry)
402 
403  # Make and prep converters for all Gen2 repos. This should not modify
404  # the Registry database or filesystem at all, though it may query it.
405  converters = []
406  rootConverter = RootRepoConverter(task=self, root=root, collections=collections, subset=subset)
407  rootConverter.prep()
408  converters.append(rootConverter)
409 
410  for root, collections in calibs.items():
411  if not os.path.isabs(root):
412  root = os.path.join(rootConverter.root, root)
413  converter = CalibRepoConverter(task=self, root=root, collections=collections,
414  mapper=rootConverter.mapper,
415  subset=rootConverter.subset)
416  converter.prep()
417  converters.append(converter)
418 
419  for root, collections in reruns.items():
420  if not os.path.isabs(root):
421  root = os.path.join(rootConverter.root, root)
422  converter = StandardRepoConverter(task=self, root=root, collections=collections,
423  subset=rootConverter.subset)
424  converter.prep()
425  converters.append(converter)
426 
427  # Actual database writes start here. We can't wrap these sanely in
428  # transactions (yet) because we keep initializing new Butler instances
429  # just so we can write into new runs/collections, and transactions
430  # are managed at the Butler level (DM-21246 should let us fix this).
431 
432  # Insert dimensions needed by any converters. These are only the
433  # dimensions that a converter expects to be uniquely derived from the
434  # Gen2 repository it is responsible for - e.g. visits, exposures, and
435  # calibration_labels.
436  #
437  # Note that we do not try to filter dimensions down to just those
438  # related to the given visits, even if config.relatedOnly is True; we
439  # need them in the Gen3 repo in order to be able to know which datasets
440  # to convert, because Gen2 alone doesn't know enough about the
441  # relationships between data IDs.
442  for converter in converters:
443  converter.insertDimensionData()
444 
445  # Insert dimensions that are potentially shared by all Gen2
446  # repositories (and are hence managed directly by the Task, rather
447  # than a converter instance).
448  # This also finishes setting up the (shared) converter.subset object
449  # that is used to filter data IDs for config.relatedOnly.
450  self.registerUsedSkyMaps(rootConverter.subset)
451  self.registerUsedSkyPix(rootConverter.subset)
452 
453  # Actually ingest datasets.
454  for converter in converters:
455  converter.ingest()
def makeSubtask(self, name, keyArgs)
Definition: task.py:275
daf::base::PropertySet * set
Definition: fits.cc:902
bool any(CoordinateExpr< N > const &expr) noexcept
Return true if any elements are true.
def getName(self)
Definition: task.py:250
def doImport(pythonType)
Definition: utils.py:106