from __future__ import annotations

__all__ = ["RepoConverter"]

from dataclasses import dataclass
from collections import defaultdict
from abc import ABC, abstractmethod
import fnmatch
import itertools
import os.path
import re
from typing import (
    Dict,
    Iterator,
    List,
    Mapping,
    Optional,
    Set,
    Tuple,
    Union,
    TYPE_CHECKING,
)

from lsst.daf.butler import DataCoordinate, FileDataset, DatasetType
from lsst.sphgeom import RangeSet, Region
from lsst.utils import doImport
from .repoWalker import RepoWalker

if TYPE_CHECKING:
    from ..mapping import Mapping as CameraMapperMapping  # disambiguate from typing.Mapping
    from .convertRepo import ConvertRepoTask
    from .scanner import PathElementHandler
    from lsst.daf.butler import StorageClass, Registry, SkyPixDimension, FormatterParameter
    from .._instrument import Instrument
58 """A helper class for `ConvertRepoTask` and `RepoConverter` that maintains
59 lists of related data ID values that should be included in the conversion.
64 Instrument name used in Gen3 data IDs.
65 visits : `set` of `int`
66 Visit IDs that define the filter.
69 def __init__(self, instrument: str, visits: Set[int]):
77 """Populate the included tract IDs for the given skymap from those that
78 overlap the visits the `ConversionSubset` was initialized with.
82 registry : `lsst.daf.butler.Registry`
83 Registry that can be queried for visit/tract overlaps.
85 SkyMap name used in Gen3 data IDs.
88 self.
tractstracts[name] = tracts
89 for visit
in self.
visitsvisits:
90 for dataId
in registry.queryDataIds([
"tract"],
91 dataId={
"skymap": name,
94 tracts.add(dataId[
"tract"])

    def addSkyPix(self, registry: Registry, dimension: SkyPixDimension):
        """Populate the included skypix IDs for the given dimension from
        those that overlap the visits the `ConversionSubset` was initialized
        with.

        Parameters
        ----------
        registry : `lsst.daf.butler.Registry`
            Registry that can be queried for visit regions.
        dimension : `lsst.daf.butler.SkyPixDimension`
            SkyPix dimension whose overlapping pixel ranges are to be added.
        """
        if self.regions is None:
            self.regions = []
            for visit in self.visits:
                dataId = registry.expandDataId(instrument=self.instrument, visit=visit)
                self.regions.append(dataId.region)
        ranges = RangeSet()
        for region in self.regions:
            ranges = ranges.union(dimension.pixelization.envelope(region))
        self.skypix[dimension] = ranges
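
    # Example (illustrative, not executed): restrict the subset to HTM
    # level-7 pixels overlapping its visits, assuming ``registry`` is a
    # `~lsst.daf.butler.Registry` whose dimension universe defines "htm7":
    #
    #     subset.addSkyPix(registry, registry.dimensions["htm7"])
    #     subset.skypix  # -> {htm7: RangeSet of overlapping pixel indices}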
118 """Test whether the given data ID is related to this subset and hence
119 should be included in a repository conversion.
123 dataId : `lsst.daf.butler.DataCoordinate`
129 `True` if this data ID should be included in a repository
134 More formally, this tests that the given data ID is not unrelated;
135 if a data ID does not involve tracts, visits, or skypix dimensions,
136 we always include it.
138 if self.
visitsvisits
is None:
141 if "visit" in dataId.graph
and dataId[
"visit"]
not in self.
visitsvisits:
143 if "tract" in dataId.graph
and dataId[
"tract"]
not in self.
tractstracts[dataId[
"skymap"]]:
145 for dimension, ranges
in self.
skypixskypix.
items():
146 if dimension
in dataId.graph
and not ranges.intersects(dataId[dimension]):
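
    # ``isRelated`` is designed to be handed around as a predicate;
    # `RepoConverter.findDatasets` below passes it to `RepoWalker.walk`.  A
    # minimal sketch (``dataIds`` is a hypothetical iterable of data IDs):
    #
    #     related = [dataId for dataId in dataIds if subset.isRelated(dataId)]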
154 """The name of the instrument, as used in Gen3 data IDs (`str`).
158 """The set of visit IDs that should be included in the conversion (`set`
162 regions: Optional[List[Region]]
163 """Regions for all visits (`list` of `lsst.sphgeom.Region`).
165 Set to `None` before it has been initialized. Any code that attempts to
166 use it when it is `None` has a logic bug.
169 tracts: Dict[str, Set[int]]
170 """Tracts that should be included in the conversion, grouped by skymap
171 name (`dict` mapping `str` to `set` of `int`).
174 skypix: Dict[SkyPixDimension, RangeSet]
175 """SkyPix ranges that should be included in the conversion, grouped by
176 dimension (`dict` mapping `SkyPixDimension` to `lsst.sphgeom.RangeSet`).
181 """An abstract base class for objects that help `ConvertRepoTask` convert
182 datasets from a single Gen2 repository.
186 task : `ConvertRepoTask`
187 Task instance that is using this helper object.
189 Root of the Gen2 repo being converted. Will be converted to an
190 absolute path, resolving symbolic links and ``~``, if necessary.
191 instrument : `Instrument`
192 Gen3 instrument class to use for this conversion.
193 collections : `list` of `str`
194 Gen3 collections with which all converted datasets should be
196 subset : `ConversionSubset, optional
197 Helper object that implements a filter that restricts the data IDs that
202 `RepoConverter` defines the only public API users of its subclasses should
203 use (`prep` `ingest`, and `finish`). These delegate to several abstract
204 methods that subclasses must implement. In some cases, subclasses may
205 reimplement the public methods as well, but are expected to delegate to
206 ``super()`` either at the beginning or end of their own implementation.
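
    # Typical lifecycle, as driven by `ConvertRepoTask` (illustrative
    # sketch; ``MyConverter`` is a hypothetical concrete subclass, and the
    # ordering follows the guarantees documented on each method below):
    #
    #     converter = MyConverter(task=task, root="/path/to/gen2",
    #                             instrument=instrument, run="imported/run")
    #     converter.prep()           # no Gen3 writes
    #     converter.findDatasets()   # walk the Gen2 directory tree
    #     converter.expandDataIds()  # registry queries, still no writes
    #     converter.ingest()         # writes to the Gen3 repository
    #     converter.finish()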

    def __init__(self, *, task: ConvertRepoTask, root: str, instrument: Instrument, run: Optional[str],
                 subset: Optional[ConversionSubset] = None):
        self.task = task
        self.root = os.path.realpath(os.path.expanduser(root))
        self.instrument = instrument
        self.subset = subset
        self._run = run
        self._repoWalker = None  # created in prep()
        self._fileDatasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]] \
            = defaultdict(lambda: defaultdict(list))
222 """Test whether the given dataset is handled specially by this
223 converter and hence should be ignored by generic base-class logic that
224 searches for dataset types to convert.
228 datasetTypeName : `str`
229 Name of the dataset type to test.
234 `True` if the dataset type is special.
236 raise NotImplementedError()
240 """Iterate over all `CameraMapper` `Mapping` objects that should be
241 considered for conversion by this repository.
243 This this should include any datasets that may appear in the
244 repository, including those that are special (see
245 `isDatasetTypeSpecial`) and those that are being ignored (see
246 `ConvertRepoTask.isDatasetTypeIncluded`); this allows the converter
247 to identify and hence skip these datasets quietly instead of warning
248 about them as unrecognized.
252 datasetTypeName: `str`
253 Name of the dataset type.
254 mapping : `lsst.obs.base.mapping.Mapping`
255 Mapping object used by the Gen2 `CameraMapper` to describe the
258 raise NotImplementedError()
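
    # A concrete subclass backed by a Gen2 ``CameraMapper`` might implement
    # this as simply as (sketch; ``self.mapper`` is a hypothetical attribute
    # holding the Gen2 mapper instance):
    #
    #     def iterMappings(self):
    #         yield from self.mapper.mappings.items()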

    @abstractmethod
    def makeRepoWalkerTarget(self, datasetTypeName: str, template: str, keys: Dict[str, type],
                             storageClass: StorageClass,
                             formatter: FormatterParameter = None,
                             targetHandler: Optional[PathElementHandler] = None,
                             ) -> RepoWalker.Target:
        """Make a struct that identifies a dataset type to be extracted by
        walking the repo directory structure.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type (the same in both Gen2 and Gen3).
        template : `str`
            The full Gen2 filename template.
        keys : `dict` [`str`, `type`]
            A dictionary mapping Gen2 data ID key to the type of its value.
        storageClass : `lsst.daf.butler.StorageClass`
            Gen3 storage class for this dataset type.
        formatter : `lsst.daf.butler.Formatter` or `str`, optional
            A Gen 3 formatter class or fully-qualified name.
        targetHandler : `PathElementHandler`, optional
            Specialist target handler to use for this dataset type.

        Returns
        -------
        target : `RepoWalker.Target`
            A struct containing information about the target dataset (much
            of it simply forwarded from the arguments).
        """
        raise NotImplementedError()
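
    # A subclass implementation typically just forwards its arguments into
    # the struct (sketch; real implementations also pass converter-specific
    # context such as the dimension universe, omitted here):
    #
    #     def makeRepoWalkerTarget(self, datasetTypeName, template, keys,
    #                              storageClass, formatter=None,
    #                              targetHandler=None):
    #         return RepoWalker.Target(
    #             datasetTypeName=datasetTypeName, template=template,
    #             keys=keys, storageClass=storageClass, formatter=formatter,
    #             targetHandler=targetHandler,
    #         )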
293 """Return a list of directory paths that should not be searched for
296 These may be directories that simply do not contain datasets (or
297 contain datasets in another repository), or directories whose datasets
298 are handled specially by a subclass.
302 directories : `list` [`str`]
303 The full paths of directories to skip, relative to the repository
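
    # For example, a subclass whose calibrations live in a separate
    # repository might skip that subdirectory (sketch; the "CALIB" layout is
    # an assumption about the particular Gen2 repo):
    #
    #     def getSpecialDirectories(self):
    #         return [os.path.join(self.root, "CALIB")]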
309 """Perform preparatory work associated with the dataset types to be
310 converted from this repository (but not the datasets themselves).
314 This should be a relatively fast operation that should not depend on
315 the size of the repository.
317 Subclasses may override this method, but must delegate to the base
318 class implementation at some point in their own logic.
319 More often, subclasses will specialize the behavior of `prep` by
320 overriding other methods to which the base class implementation
321 delegates. These include:
323 - `isDatasetTypeSpecial`
324 - `getSpecialDirectories`
325 - `makeRepoWalkerTarget`
327 This should not perform any write operations to the Gen3 repository.
328 It is guaranteed to be called before `ingest`.
330 self.
tasktask.log.info(f
"Preparing other dataset types from root {self.root}.")
331 walkerInputs: List[Union[RepoWalker.Target, RepoWalker.Skip]] = []
332 for datasetTypeName, mapping
in self.
iterMappingsiterMappings():
334 template = mapping.template
343 if (
not self.
tasktask.isDatasetTypeIncluded(datasetTypeName)
350 if storageClass
is None:
355 message = f
"no storage class found for {datasetTypeName}"
359 if template.endswith(
".fits"):
360 extensions.extend((
".gz",
".fz"))
361 for extension
in extensions:
363 walkerInput = RepoWalker.Skip(
364 template=template+extension,
368 self.
tasktask.log.debug(
"Skipping template in walker: %s", template)
370 assert message
is None
371 targetHandler = self.
tasktask.config.targetHandlerClasses.get(datasetTypeName)
372 if targetHandler
is not None:
373 targetHandler =
doImport(targetHandler)
375 datasetTypeName=datasetTypeName,
376 template=template+extension,
378 storageClass=storageClass,
379 formatter=self.
tasktask.config.formatterClasses.get(datasetTypeName),
380 targetHandler=targetHandler,
382 self.
tasktask.log.debug(
"Adding template to walker: %s + %s, for %s", template, extension,
383 walkerInput.datasetType)
384 walkerInputs.append(walkerInput)
395 fileIgnoreRegExTerms = []
396 for pattern
in self.
tasktask.config.fileIgnorePatterns:
397 fileIgnoreRegExTerms.append(fnmatch.translate(pattern))
398 if fileIgnoreRegExTerms:
399 fileIgnoreRegEx = re.compile(
"|".join(fileIgnoreRegExTerms))
401 fileIgnoreRegEx =
None
403 log=self.
tasktask.log.getChild(
"repoWalker"))
406 """Iterate over datasets in the repository that should be ingested into
409 The base class implementation yields nothing; the datasets handled by
410 the `RepoConverter` base class itself are read directly in
413 Subclasses should override this method if they support additional
414 datasets that are handled some other way.
418 dataset : `FileDataset`
419 Structures representing datasets to be ingested. Paths should be
425 assert self.
_repoWalker_repoWalker,
"prep() must be called before findDatasets."
426 self.
tasktask.log.info(
"Adding special datasets in repo %s.", self.
rootroot)
428 assert len(dataset.refs) == 1
431 self._fileDatasets[dataset.refs[0].datasetType][
None].
append(dataset)
432 self.
tasktask.log.info(
"Finding datasets from files in repo %s.", self.
rootroot)
433 datasetsByTypeAndCalibDate = self.
_repoWalker_repoWalker.walk(
435 predicate=(self.
subsetsubset.isRelated
if self.
subsetsubset
is not None else None)
437 for datasetType, datasetsByCalibDate
in datasetsByTypeAndCalibDate.items():
438 for calibDate, datasets
in datasetsByCalibDate.items():
439 self._fileDatasets[datasetType][calibDate].extend(datasets)
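
    # After `findDatasets`, ``self._fileDatasets`` is a nested mapping of
    # the form (illustrative values; ``calibDate`` keys are only non-`None`
    # for calibration datasets):
    #
    #     {<DatasetType "calexp">: {None: [FileDataset, ...]},
    #      <DatasetType "flat">: {"2014-06-01": [FileDataset, ...]}}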
442 """Expand the data IDs for all datasets to be inserted.
444 Subclasses may override this method, but must delegate to the base
445 class implementation if they do.
447 This involves queries to the registry, but not writes. It is
448 guaranteed to be called between `findDatasets` and `ingest`.
451 for datasetType, datasetsByCalibDate
in self._fileDatasets.
items():
452 for calibDate, datasetsForCalibDate
in datasetsByCalibDate.items():
453 nDatasets = len(datasetsForCalibDate)
454 suffix =
"" if nDatasets == 1
else "s"
455 if calibDate
is not None:
456 self.
tasktask.log.info(
"Expanding data IDs for %s %s dataset%s at calibDate %s.",
462 self.
tasktask.log.info(
"Expanding data IDs for %s %s non-calibration dataset%s.",
467 for dataset
in datasetsForCalibDate:
468 for i, ref
in enumerate(dataset.refs):
469 self.
tasktask.log.debug(
"Expanding data ID %s.", ref.dataId)
471 dataId = self.
tasktask.registry.expandDataId(ref.dataId)
472 dataset.refs[i] = ref.expanded(dataId)
473 except LookupError
as err:
474 self.
tasktask.log.warn(
"Skipping ingestion for '%s': %s", dataset.path, err)
477 dataset.refs[i] =
None
478 dataset.refs[:] = itertools.filterfalse(
lambda x: x
is None, dataset.refs)
480 expanded.append(dataset)
481 datasetsForCalibDate[:] = expanded
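
    # ``itertools.filterfalse`` above removes the refs that were set to
    # `None` when expansion failed; a minimal runnable equivalent:
    #
    #     import itertools
    #     refs = [1, None, 2]
    #     refs[:] = itertools.filterfalse(lambda x: x is None, refs)
    #     assert refs == [1, 2]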
484 """Insert converted datasets into the Gen3 repository.
486 Subclasses may override this method, but must delegate to the base
487 class implementation at some point in their own logic.
489 This method is guaranteed to be called after `expandDataIds`.
491 for datasetType, datasetsByCalibDate
in self._fileDatasets.
items():
492 self.
tasktask.registry.registerDatasetType(datasetType)
493 for calibDate, datasetsForCalibDate
in datasetsByCalibDate.items():
495 run = self.
getRungetRun(datasetType.name, calibDate)
497 self.
tasktask.log.warn(f
"No run configured for dataset type {datasetType.name}.")
499 nDatasets = len(datasetsForCalibDate)
500 self.
tasktask.log.info(
"Ingesting %s %s dataset%s into run %s.", nDatasets,
501 datasetType.name,
"" if nDatasets == 1
else "s", run)
503 self.
tasktask.registry.registerRun(run)
504 self.
tasktask.butler3.ingest(*datasetsForCalibDate, transfer=self.
tasktask.config.transfer,
506 except LookupError
as err:
508 f
"Error expanding data ID for dataset type {datasetType.name}."
512 """Finish conversion of a repository.
514 This is run after ``ingest``, and delegates to `_finish`, which should
515 be overridden by derived classes instead of this method.
517 self.
_finish_finish(self._fileDatasets)

    def _finish(self, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]]) -> None:
        """Subclass implementation hook for `finish`.

        The default implementation does nothing.  This is generally the best
        place to define and populate non-``RUN`` collections that may contain
        some of the datasets that have just been ingested.

        Parameters
        ----------
        datasets : `Mapping`
            Nested mapping containing all converted datasets.  The outer
            mapping keys are `DatasetType` instances.  Values are mappings
            from ``calibDate`` or `None` to a `list` of `FileDataset`
            instances.
        """
        pass

    def getRun(self, datasetTypeName: str, calibDate: Optional[str] = None) -> str:
        """Return the name of the run to insert instances of the given
        dataset type into in this collection.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        calibDate : `str`, optional
            If not `None`, the "CALIBDATE" associated with this (calibration)
            dataset in the Gen2 data repository.

        Returns
        -------
        run : `str`
            Name of the `~lsst.daf.butler.CollectionType.RUN` collection.
        """
        assert self._run is not None, "Method must be overridden if self._run is allowed to be None"
        assert calibDate is None, "Method must be overridden if calibDate is allowed to be not None"
        return self._run

    def _guessStorageClass(self, datasetTypeName: str, mapping: CameraMapperMapping
                           ) -> Optional[StorageClass]:
        """Infer the Gen3 `StorageClass` for a dataset from a combination of
        configuration and Gen2 dataset type information.

        Parameters
        ----------
        datasetTypeName : `str`
            Name of the dataset type.
        mapping : `lsst.obs.base.mapping.Mapping`
            Mapping object used by the Gen2 `CameraMapper` to describe the
            dataset type.

        Returns
        -------
        storageClass : `lsst.daf.butler.StorageClass` or `None`
            The inferred storage class, or `None` if none could be found.
        """
        storageClassName = self.task.config.storageClasses.get(datasetTypeName)
        if storageClassName is None and mapping.python is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.python, None)
        if storageClassName is None and mapping.persistable is not None:
            storageClassName = self.task.config.storageClasses.get(mapping.persistable, None)
        if storageClassName is None and mapping.python is not None:
            unqualified = mapping.python.split(".")[-1]
            storageClassName = self.task.config.storageClasses.get(unqualified, None)
        if storageClassName is not None:
            storageClass = self.task.butler3.storageClasses.getStorageClass(storageClassName)
        else:
            try:
                storageClass = self.task.butler3.storageClasses.getStorageClass(mapping.persistable)
            except KeyError:
                storageClass = None
            if storageClass is None and mapping.python is not None:
                unqualified = mapping.python.split(".")[-1]
                try:
                    storageClass = self.task.butler3.storageClasses.getStorageClass(unqualified)
                except KeyError:
                    pass
            if storageClass is None:
                self.task.log.debug("No StorageClass found for %s; skipping.", datasetTypeName)
            else:
                self.task.log.debug("Using StorageClass %s for %s.", storageClass.name, datasetTypeName)
        return storageClass

    # Class attributes that will be shadowed by the instance attributes set
    # in __init__; annotated here for documentation purposes.

    task: ConvertRepoTask
    """The parent task that constructed and uses this converter
    (`ConvertRepoTask`).
    """

    root: str
    """Root path to the Gen2 repository this converter manages (`str`).

    This is a complete path, not relative to some other repository root.
    """

    subset: Optional[ConversionSubset]
    """An object that represents a filter to be applied to the datasets that
    are converted (`ConversionSubset` or `None`).
    """