LSSTApplications  20.0.0
LSSTDataManagementBasePackage
builders.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 """Classes used in `RepoWalker` construction.
22 
23 The objects here form a temporary tree that is pruned and then transformed
24 into a similar tree of `PathElementHandler` instances. See `BuilderNode`
25 method documentation for more information.
26 """
27 from __future__ import annotations
28 
29 __all__ = ["BuilderSkipInput", "BuilderTargetInput", "BuilderTree"]
30 
31 from abc import ABC, abstractmethod
32 import os
33 import re
34 from typing import (
35  Any,
36  Dict,
37  List,
38  Optional,
39  Tuple,
40 )
41 
42 from lsst.daf.butler import DatasetType, DimensionUniverse, StorageClass, FormatterParameter
43 from ..translators import TranslatorFactory
44 from .parser import PathElementParser
45 from .scanner import PathElementHandler, DirectoryScanner
46 from .handlers import (IgnoreHandler, SubdirectoryHandler, SkipHandler,
47  TargetFileHandler)
48 
49 
class BuilderNode(ABC):
    """Abstract interface for nodes in the temporary tree that is used to
    construct a `RepoWalker`.

    A node is first asked to `prune` itself and is afterwards converted into
    a `PathElementHandler` by `build`.
    """

    @abstractmethod
    def prune(self) -> Tuple[BuilderNode, List[str], bool]:
        """Attempt to prune this node and its children from the tree.

        Returns
        -------
        replacement : `BuilderNode`
            The node that results from recursively pruning child nodes;
            often just ``self``.
        messages : `list` [`str`]
            Warning messages that a parent node should log when a matching
            path element is encountered, if this node is pruned.
        prune : `bool`
            If `True`, this node may be pruned from the tree (but will not
            necessarily be - it may correspond to a path element that
            should be skipped while some of its siblings should not be).
        """
        raise NotImplementedError()

    @abstractmethod
    def build(
        self,
        parser: PathElementParser,
        allKeys: Dict[str, type],
        cumulativeKeys: Dict[str, type],
        *,
        fileIgnoreRegEx: Optional[re.Pattern],
        dirIgnoreRegEx: Optional[re.Pattern]
    ) -> PathElementHandler:
        """Transform this node in the build tree into a corresponding
        `PathElementHandler`, recursing to any children.

        Must be called after `prune`.

        Parameters
        ----------
        parser : `PathElementParser`
            An object that matches the path element the new handler is
            responsible for and extracts a (partial) Gen2 data ID from it.
        allKeys : `dict` [`str`, `type`]
            A mapping from Gen2 data ID key to the type of its value.  Will
            contain all keys that may be extracted by the given parser, and
            possibly others.
        cumulativeKeys : `dict` [`str`, `type`]
            A dictionary containing key strings and types for Gen2 data ID
            keys that have been extracted from previous path elements for
            this template, including those extracted by ``parser``.
        fileIgnoreRegEx : `re.Pattern` or `None`
            A regular expression pattern that identifies non-dataset files
            that can be ignored.
        dirIgnoreRegEx : `re.Pattern` or `None`
            A regular expression pattern that identifies non-dataset
            subdirectories that can be ignored.

        Returns
        -------
        handler : `PathElementHandler`
            A new handler object.
        """
        raise NotImplementedError()
102 
103 
105  """An intermediate base for `BuilderNode` classes that are provided as
106  direct inputs to a `RepoWalker`, and generally correspond to exactly one
107  Gen2 dataset type.
108 
109  Parameters
110  ----------
111  template : `str`
112  The complete Gen2 template to be matched (not just the template for
113  one path element).
114  keys : `dict` [`str`, `type`]
115  A mapping from Gen2 data ID key to the type of its value.
116  """
def __init__(self, template: str, keys: Dict[str, type]):
    # Store the inputs as given, then break the template into one entry per
    # path level using the platform's path separator.
    self.template = template
    self.keys = keys
    self.elements = template.split(os.sep)
121 
122  template: str
123  """The complete Gen2 template to be matched (`str`).
124  """
125 
126  keys: Dict[str, type]
127  """A mapping from Gen2 data ID key to the type of its value
128  (`dict` [`str`, `type`]).
129  """
130 
131  elements: List[str]
132  """The path elements (file or directory levels) of `template`
133  (`list` of `str`).
134  """
135 
136 
138  """An input to a `RepoWalker` that indicates that matched files should be
139  skipped, possibly with a warning message.
140 
141  BuilderSkipInputs can be pruned. When they are not pruned, they build
142  `SkipHandler` instances.
143 
144  Parameters
145  ----------
146  template : `str`
147  The complete Gen2 template to be matched (not just the template for
148  one path element).
149  keys : `dict` [`str`, `type`]
150  A mapping from Gen2 data ID key to the type of its value.
151  message : `str`, optional
152  If not `None`, a warning message that should be printed either when a
153 matching file is encountered or a directory that may contain such files
154  is skipped.
155  isForFiles : `bool`, optional
156  If `True` (default), this handler should be run on files. Otherwise it
157  should be run on directories.
158  """
def __init__(self, template: str, keys: Dict[str, type], message: Optional[str] = None, *,
             isForFiles: bool = True):
    # The base class records the template, the keys and the split path
    # elements.
    super().__init__(template=template, keys=keys)
    # Remembered for later use by build() and prune().
    self._isForFiles = isForFiles
    self._message = message
164 
def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
          fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
          ) -> PathElementHandler:
    # Docstring inherited from BuilderNode.
    # Only the parser and this input's own settings are needed; the key
    # mappings and ignore patterns are not used here.
    handlerArgs = dict(parser=parser, isForFiles=self._isForFiles, message=self._message)
    return SkipHandler(**handlerArgs)
170 
def prune(self) -> Tuple[BuilderNode, List[str], bool]:
    # Docstring inherited from BuilderNode.
    # A skip input can always be pruned; its warning (if it has one) is
    # passed up so that a parent can report it.
    if self._message is None:
        messages = []
    else:
        messages = [self._message]
    return self, messages, True
174 
175 
177  """An input to a `RepoWalker` that matches files that correspond to
178  datasets that we want to extract.
179 
180  BuilderTargetInputs can never be pruned, and always build
181  `TargetFileHandler` instances.
182 
183  Parameters
184  ----------
185  datasetTypeName : `str`
186  Name of the dataset type.
187  template : `str`
188  Full Gen2 filename template.
189  keys : `dict` [`str`, `type`]
190  Dictionary that maps Gen2 data ID key to the type of its value.
191  storageClass : `StorageClass`
192  `StorageClass` for the Gen3 dataset type.
193  universe : `DimensionUniverse`
194  All candidate dimensions for the Gen3 dataset type.
195  formatter : `lsst.daf.butler.Formatter` or `str`, optional
196  A Gen 3 formatter class or fully-qualified name.
197  translatorFactory : `TranslatorFactory`
198  Object that can be used to construct data ID translators.
199  targetHandler : `PathElementHandler`, optional
200  Override target handler for this dataset type.
201  **kwargs:
202  Additional keyword arguments are passed to `Translator.makeMatching`,
203 along with ``datasetTypeName`` and ``keys``.
204  """
def __init__(self, *, datasetTypeName: str, template: str, keys: Dict[str, type],
             storageClass: StorageClass, universe: DimensionUniverse,
             formatter: FormatterParameter, translatorFactory: TranslatorFactory,
             targetHandler: Optional[PathElementHandler] = None,
             **kwargs: Any):
    # Strip off [%HDU] identifiers from e.g. DECAM Community Pipeline
    # products: only the text before the first "[%(" is kept.
    template, _, _ = template.partition('[%(')
    super().__init__(template=template, keys=keys)
    translator = translatorFactory.makeMatching(datasetTypeName, keys, **kwargs)
    self._translator = translator
    # The Gen3 dataset type takes its dimensions from the translator.
    self.datasetType = DatasetType(datasetTypeName, dimensions=translator.dimensionNames,
                                   storageClass=storageClass, universe=universe)
    self._formatter = formatter
    # Fall back to the standard file handler when no override was given.
    self._handler = TargetFileHandler if targetHandler is None else targetHandler
220 
def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
          fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
          ) -> PathElementHandler:
    # Docstring inherited from BuilderNode.
    # The handler stored by __init__ is invoked as a factory; the key
    # mappings and ignore patterns are not forwarded to it.
    makeHandler = self._handler
    return makeHandler(
        parser=parser,
        translator=self._translator,
        datasetType=self.datasetType,
        formatter=self._formatter,
    )
227 
def prune(self) -> Tuple[BuilderNode, List[str], bool]:
    # Docstring inherited from BuilderNode.
    # Targets are never pruned and carry no warning messages.
    noMessages = []
    mayPrune = False
    return self, noMessages, mayPrune
231 
232  datasetType: DatasetType
233  """The Gen3 dataset type extracted by the handler this object builds
234  (`lsst.daf.butler.DatasetType`).
235  """
236 
237 
239  """A `BuilderNode` that represents a subdirectory to be skipped,
240 created by pruning a `BuilderTree` that contained only `BuilderSkipInput`
241  instances.
242 
243  BuilderPrunedTrees can be pruned. When they are not pruned, they
244  build `SkipHandler` instances.
245 
246  Parameters
247  ----------
248  messages : `list` [`str`]
249  A list of warning messages to be printed when the handler produced by
250  this builder matches a subdirectory.
251  """
252 
def __init__(self, messages: List[str]):
    # The list is stored as given (not copied); build() joins it into a
    # single warning and prune() returns it to the parent.
    self._messages: List[str] = messages
255 
def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
          fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
          ) -> PathElementHandler:
    # Docstring inherited from BuilderNode.
    # Collapse all collected warnings into one string; with no warnings
    # the handler is given no message at all.
    if self._messages:
        message = "; ".join(self._messages)
    else:
        message = None
    # A pruned tree stands for a directory, hence isForFiles=False.
    return SkipHandler(parser=parser, isForFiles=False, message=message)
262 
def prune(self) -> Tuple[BuilderNode, List[str], bool]:
    # Docstring inherited from BuilderNode.
    # Already-pruned trees stay prunable and keep reporting their messages.
    messages = self._messages
    return self, messages, True
266 
267 
269  """A `BuilderNode` that represents a collection of `BuilderInput` instances
270  that all have the same template.
271  """
def __init__(self, old: BuilderInput, new: BuilderInput):
    # When ``old`` is already a collection of duplicates, flatten it so
    # that duplicates never nest; otherwise start a new collection with it.
    if isinstance(old, BuilderDuplicateInputs):
        siblings = list(old._children)
    else:
        siblings = [old]
    siblings.append(new)
    self._children = siblings
    self._messages = []  # populated in prune()
280 
def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
          fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
          ) -> PathElementHandler:
    # Docstring inherited from BuilderNode.
    # Join the messages gathered by prune() into one warning; with no
    # messages the handler is given no message at all.
    if self._messages:
        message = "; ".join(self._messages)
    else:
        message = None
    return SkipHandler(parser=parser, isForFiles=False, message=message)
287 
def prune(self) -> Tuple[BuilderNode, List[str], bool]:
    """Prune the duplicate inputs that share this node's template.

    Every child is pruned in turn and replaced by whatever its own
    ``prune`` returned.  The outcome depends on how many children refuse
    to be pruned:

    - none: this node is prunable and reports the children's messages;
    - exactly one, with no messages from the others: that child replaces
      this node;
    - otherwise: the match is ambiguous, and this node is reported as
      prunable with a single combined message.

    Returns
    -------
    replacement : `BuilderNode`
        Either ``self`` or the single unprunable child.
    messages : `list` [`str`]
        Warning messages to be logged by a parent node.
    prune : `bool`
        Whether the returned node may be pruned.
    """
    unprunable = []
    newChildren = []
    # Collected afresh on every call so that repeated calls do not pile up
    # duplicate messages on ``self._messages``.
    messages = []
    for child in self._children:
        newChild, childMessages, toPruneChild = child.prune()
        if toPruneChild:
            messages.extend(childMessages)
        else:
            unprunable.append(newChild)
        # Keep the pruned replacement of this child.  (This used to append
        # the list to itself, leaving ``_children`` self-referential.)
        newChildren.append(newChild)
    self._children = newChildren
    self._messages = messages
    if not unprunable:
        # All children are just skips, so we can prune this node if we
        # remember their messages.
        return self, self._messages, True
    if len(unprunable) == 1 and not self._messages:
        # Exactly one child is a target, and the others were ignored with
        # no warning messages.  Tell parent node to just use that child,
        # so if we see any matching files, we just assume they're for that
        # target.
        return unprunable[0], [], False
    # Multiple targets or skips with messages, which means we won't know
    # how to handle any matching files.  Replace any messages we have with
    # a single message that combines them all as well as any target
    # dataset types that they are ambiguous with.
    nested = [f"{c.datasetType.name} (target)" for c in unprunable]
    nested.extend(self._messages)
    self._messages = [f"ambiguous match: [{', '.join(nested)}]"]
    return self, self._messages, True
319 
320 
322  """A `BuilderNode` that represents a directory.
323 
324  This is the only `BuilderNode` class that is not a leaf node. If all
325  of its children can be pruned, it is replaced by a `BuilderPrunedTree`
326  (which can then be pruned itself). It builds `SubdirectoryHandler`
327  instances when not pruned.
328  """
def __init__(self):
    # Maps template path element to BuilderNode; filled in by insert() and
    # replaced wholesale by prune().
    self._children = dict()
331 
def insert(self, level: int, leaf: BuilderInput):
    """Insert an input leaf node into the tree, recursively constructing
    intermediate parents in order to put it at the right level.

    Parameters
    ----------
    level : `int`
        The level ``self`` is at in the larger tree, with zero the
        repository root.  The right level for the leaf is given by the
        length of ``leaf.elements``.
    leaf : `BuilderInput`
        The leaf node to insert.
    """
    element = leaf.elements[level]
    nextLevel = level + 1
    if nextLevel != len(leaf.elements):
        # Not the last path element yet: descend into the subtree for
        # this element, creating it if it does not exist.
        subtree = self._children.setdefault(element, BuilderTree())
        subtree.insert(nextLevel, leaf)
        return
    existing = self._children.get(element)
    if existing is not None:
        # Sadly, the Gen2 butler has some actual dataset types that
        # use the exact same template.
        leaf = BuilderDuplicateInputs(existing, leaf)
    self._children[element] = leaf
357 
def fill(self, scanner: DirectoryScanner, allKeys: Dict[str, type], previousKeys: Dict[str, type], *,
         fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]):
    """Fill a `DirectoryScanner` instance by recursively building all
    child nodes.

    Parameters
    ----------
    scanner : `DirectoryScanner`
        Object to populate.
    allKeys : `dict` [`str`, `type`]
        Mapping from Gen2 data ID key to its value type, covering all keys
        that could be used in any child template.
    previousKeys : `dict` [`str`, `type`]
        A dictionary containing key strings and types for Gen2 data ID
        keys that have been extracted from previous path elements of the
        same template.
    fileIgnoreRegEx : `re.Pattern` or `None`
        A regular expression pattern that identifies non-dataset files
        that can be ignored, to be applied at all levels of the directory
        tree.
    dirIgnoreRegEx : `re.Pattern` or `None`
        A regular expression pattern that identifies non-dataset
        subdirectories that can be ignored, to be applied at all levels of
        the directory tree.
    """
    # Ignore handlers are added first (files, then directories), before
    # any handler built from a template.
    for pattern, forFiles in ((fileIgnoreRegEx, True), (dirIgnoreRegEx, False)):
        if pattern is not None:
            scanner.add(IgnoreHandler(pattern, isForFiles=forFiles))
    for element, node in self._children.items():
        parser = PathElementParser(element, allKeys, previousKeys=previousKeys)
        # Each child gets its own copy, so keys parsed from one sibling's
        # path element never leak into another's.
        cumulativeKeys = previousKeys.copy()
        cumulativeKeys.update(parser.keys)
        handler = node.build(parser, allKeys, cumulativeKeys, fileIgnoreRegEx=fileIgnoreRegEx,
                             dirIgnoreRegEx=dirIgnoreRegEx)
        scanner.add(handler)
392 
def prune(self) -> Tuple[BuilderNode, List[str], bool]:
    # Docstring inherited from BuilderNode.
    prunedChildren = {}
    collected = []
    mustKeep = False
    # Recursively prune children, replacing each one with whatever its own
    # prune() returned.
    for template, child in list(self._children.items()):
        replacement, childMessages, childPrunable = child.prune()
        prunedChildren[template] = replacement
        collected.extend(childMessages)
        mustKeep = mustKeep or not childPrunable
    self._children = prunedChildren
    if mustKeep:
        # At least one child has to stay, so this directory stays too; the
        # collected messages are not reported in that case.
        return self, [], False
    return BuilderPrunedTree(collected), collected, True
410 
def build(self, parser: PathElementParser, allKeys: Dict[str, type], cumulativeKeys: Dict[str, type], *,
          fileIgnoreRegEx: Optional[re.Pattern], dirIgnoreRegEx: Optional[re.Pattern]
          ) -> PathElementHandler:
    # Docstring inherited from BuilderNode.
    handler = SubdirectoryHandler(parser)
    # Populate the new handler's scanner from this node's children; the
    # keys accumulated so far become the children's "previous" keys.
    self.fill(
        handler.scanner,
        allKeys,
        cumulativeKeys,
        fileIgnoreRegEx=fileIgnoreRegEx,
        dirIgnoreRegEx=dirIgnoreRegEx,
    )
    return handler
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:221
lsst.obs.base.gen2to3.repoWalker.handlers.SkipHandler
Definition: handlers.py:158
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput._message
_message
Definition: builders.py:161
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput.__init__
def __init__(self, str template, Dict[str, type] keys, Optional[str] message=None, *bool isForFiles=True)
Definition: builders.py:159
lsst.obs.base.gen2to3.repoWalker.handlers.SubdirectoryHandler
Definition: handlers.py:196
lsst.obs.base.gen2to3.repoWalker.builders.BuilderInput.__init__
def __init__(self, str template, Dict[str, type] keys)
Definition: builders.py:117
lsst.obs.base.gen2to3.repoWalker.builders.BuilderInput.template
template
Definition: builders.py:118
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput._handler
_handler
Definition: builders.py:215
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree
Definition: builders.py:321
lsst.obs.base.gen2to3.repoWalker.builders.BuilderNode.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:56
lsst.obs.base.gen2to3.repoWalker.builders.BuilderInput.keys
keys
Definition: builders.py:119
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:411
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree.__init__
def __init__(self)
Definition: builders.py:329
ast::append
std::shared_ptr< FrameSet > append(FrameSet const &first, FrameSet const &second)
Construct a FrameSet that performs two transformations in series.
Definition: functional.cc:33
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:228
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:171
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput._formatter
_formatter
Definition: builders.py:212
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput.__init__
def __init__(self, *str datasetTypeName, str template, Dict[str, type] keys, StorageClass storageClass, DimensionUniverse universe, FormatterParameter formatter, TranslatorFactory translatorFactory, Optional[PathElementHandler] targetHandler=None, **Any kwargs)
Definition: builders.py:205
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs._messages
_messages
Definition: builders.py:279
lsst.obs.base.gen2to3.repoWalker.builders.BuilderPrunedTree.__init__
def __init__(self, List[str] messages)
Definition: builders.py:253
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree._children
_children
Definition: builders.py:330
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs
Definition: builders.py:268
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:281
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput
Definition: builders.py:176
lsst.obs.base.gen2to3.repoWalker.builders.BuilderPrunedTree
Definition: builders.py:238
lsst.obs.base.gen2to3.repoWalker.handlers.IgnoreHandler
Definition: handlers.py:55
lsst.obs.base.gen2to3.repoWalker.builders.BuilderInput.elements
elements
Definition: builders.py:120
lsst.obs.base.gen2to3.repoWalker.builders.BuilderNode
Definition: builders.py:50
lsst.obs.base.gen2to3.repoWalker.builders.BuilderInput
Definition: builders.py:104
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput._translator
_translator
Definition: builders.py:209
items
std::vector< SchemaItem< Flag > > * items
Definition: BaseColumnView.cc:142
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:393
list
daf::base::PropertyList * list
Definition: fits.cc:913
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput
Definition: builders.py:137
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs._children
_children
Definition: builders.py:273
lsst.obs.base.gen2to3.repoWalker.builders.BuilderNode.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:74
lsst.obs.base.gen2to3.repoWalker.builders.BuilderPrunedTree.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:263
lsst.obs.base.gen2to3.repoWalker.builders.BuilderPrunedTree.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:256
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree.insert
def insert(self, int level, BuilderInput leaf)
Definition: builders.py:332
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs.__init__
def __init__(self, BuilderInput old, BuilderInput new)
Definition: builders.py:272
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput.build
PathElementHandler build(self, PathElementParser parser, Dict[str, type] allKeys, Dict[str, type] cumulativeKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:165
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTree.fill
def fill(self, DirectoryScanner scanner, Dict[str, type] allKeys, Dict[str, type] previousKeys, *Optional[re.Pattern] fileIgnoreRegEx, Optional[re.Pattern] dirIgnoreRegEx)
Definition: builders.py:358
lsst.obs.base.gen2to3.repoWalker.builders.BuilderDuplicateInputs.prune
Tuple[BuilderNode, List[str], bool] prune(self)
Definition: builders.py:288
lsst.obs.base.gen2to3.repoWalker.builders.BuilderSkipInput._isForFiles
_isForFiles
Definition: builders.py:162
lsst.obs.base.gen2to3.repoWalker.builders.BuilderTargetInput.datasetType
datasetType
Definition: builders.py:210
lsst.obs.base.gen2to3.repoWalker.builders.BuilderPrunedTree._messages
_messages
Definition: builders.py:254
lsst.obs.base.gen2to3.repoWalker.parser.PathElementParser
Definition: parser.py:116