LSSTApplications  19.0.0-14-gb0260a2+72efe9b372,20.0.0+7927753e06,20.0.0+8829bf0056,20.0.0+995114c5d2,20.0.0+b6f4b2abd1,20.0.0+bddc4f4cbe,20.0.0-1-g253301a+8829bf0056,20.0.0-1-g2b7511a+0d71a2d77f,20.0.0-1-g5b95a8c+7461dd0434,20.0.0-12-g321c96ea+23efe4bbff,20.0.0-16-gfab17e72e+fdf35455f6,20.0.0-2-g0070d88+ba3ffc8f0b,20.0.0-2-g4dae9ad+ee58a624b3,20.0.0-2-g61b8584+5d3db074ba,20.0.0-2-gb780d76+d529cf1a41,20.0.0-2-ged6426c+226a441f5f,20.0.0-2-gf072044+8829bf0056,20.0.0-2-gf1f7952+ee58a624b3,20.0.0-20-geae50cf+e37fec0aee,20.0.0-25-g3dcad98+544a109665,20.0.0-25-g5eafb0f+ee58a624b3,20.0.0-27-g64178ef+f1f297b00a,20.0.0-3-g4cc78c6+e0676b0dc8,20.0.0-3-g8f21e14+4fd2c12c9a,20.0.0-3-gbd60e8c+187b78b4b8,20.0.0-3-gbecbe05+48431fa087,20.0.0-38-ge4adf513+a12e1f8e37,20.0.0-4-g97dc21a+544a109665,20.0.0-4-gb4befbc+087873070b,20.0.0-4-gf910f65+5d3db074ba,20.0.0-5-gdfe0fee+199202a608,20.0.0-5-gfbfe500+d529cf1a41,20.0.0-6-g64f541c+d529cf1a41,20.0.0-6-g9a5b7a1+a1cd37312e,20.0.0-68-ga3f3dda+5fca18c6a4,20.0.0-9-g4aef684+e18322736b,w.2020.45
LSSTDataManagementBasePackage
scanner.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 """Interfaces and common code for recursively scanning directories for Gen2
22 dataset files.
23 
24 The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for
25 dependency reasons: `DirectoryScanner` uses the ABC, while its concrete
26 implementations use `DirectoryScanner`.
27 """
28 from __future__ import annotations
29 
30 __all__ = ["PathElementHandler", "DirectoryScanner"]
31 
32 from abc import ABC, abstractmethod
33 import bisect
34 import os
35 from typing import (
36  Callable,
37  Iterator,
38  List,
39  Mapping,
40  Optional,
41  Tuple,
42 )
43 
44 from lsst.log import Log
45 from lsst.daf.butler import (
46  DataCoordinate,
47  DatasetType,
48  FileDataset,
49 )
50 
51 
class PathElementHandler(ABC):
    """An interface for objects that handle a single path element (directory or
    file) in a Gen2 data repository.

    Handlers are added to a `DirectoryScanner` instance, which then calls them
    until one succeeds when it processes each element in a directory.
    """
    def __init__(self):
        self.lastDataId2 = {}

    __slots__ = ("lastDataId2", "log")

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report what kind of path element this object handles.

        Returns
        -------
        isForFiles : `bool`
            `True` if this handler is for file entries, or `False` if it
            is for directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, path: str, name: str,
                 datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], *,
                 predicate: Callable[[DataCoordinate], bool]) -> bool:
        """Apply the handler to a file or directory path.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `dict` ]
            Dictionary that found datasets should be added to.  Nested dicts
            are keyed by either `None` (for most datasets) or a `str`
            "CALIBDATE" for calibration datasets.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no other
            handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """Return a rough indication of how flexible this handler is in terms
        of the path element names it can match.

        Handlers that match a constant path element should always return zero.
        """
        raise NotImplementedError()

    def translate(self, dataId2: dict, *, partial: bool = False
                  ) -> Tuple[Optional[DataCoordinate], Optional[str]]:
        """Translate the given data ID from Gen2 to Gen3.

        The default implementation returns ``(None, None)``.  Subclasses that
        are able to translate data IDs should override this method.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (`False` is default) this is a partial data ID for some
            dataset, and missing keys are expected.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        calibDate : `str` or `None`
            A Gen2 calibration "CALIBDATE" value, or `None` if there was no
            such value in the template.
        """
        return None, None

    def __lt__(self, other: PathElementHandler):
        """Compare two handlers by `rank`.

        Handlers are sorted by rank to reduce the possibility that more
        flexible handlers will have a chance to match something they shouldn't.

        Returns
        -------
        result : `bool` or `NotImplemented`
            `True` if this handler's rank is strictly lower than ``other``'s;
            `NotImplemented` if ``other`` is not a `PathElementHandler`.
        """
        # Follow the rich-comparison protocol: let Python try the reflected
        # operation (and ultimately raise TypeError) for foreign operands
        # rather than failing with an AttributeError on ``other.rank``.
        if not isinstance(other, PathElementHandler):
            return NotImplemented
        return self.rank < other.rank

    lastDataId2: dict
    """The Gen2 data ID obtained by processing parent levels in the directory
    tree.

    This attribute should be reset by calling code whenever a new parent
    directory is entered, before invoking `__call__`.
    """

    log: Log
    """A logger to use for all diagnostic messages (`lsst.log.Log`).

    This attribute is set on a handler in `DirectoryScanner.add`; this avoids
    needing to forward one through all subclass constructors.
    """
160 
161 
class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Parameters
    ----------
    log : `Log`, optional
        Log to use to report warnings and debug information.
    """
    def __init__(self, log: Optional[Log] = None):
        # Handlers are kept in two lists, one per kind of path element; both
        # are maintained in sorted order by `add`.
        self._files = []
        self._subdirectories = []
        if log is None:
            log = Log.getLogger("obs.base.gen2to3.walker")
        self.log = log

    __slots__ = ("_files", "_subdirectories", "log")

    def add(self, handler: PathElementHandler):
        """Add a new handler to the scanner.

        The scanner's log is assigned to ``handler.log``, and the handler is
        inserted in sorted position into the list for files or for
        subdirectories, according to ``handler.isForFiles()``.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.
        """
        handler.log = self.log
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers, file handlers first and then
        subdirectory handlers.
        """
        yield from self._files
        yield from self._subdirectories

    def scan(self, path: str, datasets: Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]], *,
             predicate: Callable[[DataCoordinate], bool]):
        """Process a directory.

        Each file or subdirectory entry directly inside ``path`` is offered
        to the handlers of the matching kind, in their stored order, until one
        returns `True`.  Entries that are neither files nor directories are
        skipped silently; entries that no handler accepts are reported in a
        single warning.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `dict` ]
            Dictionary that found datasets should be added to; it is passed
            through to each handler.  Nested dicts are keyed by either `None`
            or a `str` "CALIBDATE" value (for calibration datasets only).
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        unrecognized = []
        # Use the scandir iterator as a context manager so the directory
        # handle is released promptly even if a handler raises.
        with os.scandir(path) as entries:
            for entry in entries:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    continue
                for handler in handlers:
                    if handler(entry.path, entry.name, datasets, predicate=predicate):
                        break
                else:
                    # No handler claimed this entry.
                    unrecognized.append(entry.name)
        if unrecognized:
            self.log.warn("Skipped unrecognized entries in %s: %s", path, unrecognized)
lsst::log.log.logContinued.warn
def warn(fmt, *args)
Definition: logContinued.py:205
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner
Definition: scanner.py:162
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler
Definition: scanner.py:52
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.lastDataId2
lastDataId2
Definition: scanner.py:60
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.__init__
def __init__(self)
Definition: scanner.py:59
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.__lt__
def __lt__(self, PathElementHandler other)
Definition: scanner.py:140
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner._files
_files
Definition: scanner.py:172
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.__call__
bool __call__(self, str path, str name, Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]] datasets, *Callable[[DataCoordinate], bool] predicate)
Definition: scanner.py:76
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.__iter__
Iterator[PathElementHandler] __iter__(self)
Definition: scanner.py:194
lsst::log
Definition: Log.h:706
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.rank
int rank(self)
Definition: scanner.py:106
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.isForFiles
bool isForFiles(self)
Definition: scanner.py:65
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.__init__
def __init__(self, Optional[Log] log=None)
Definition: scanner.py:171
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.scan
def scan(self, str path, Mapping[DatasetType, Mapping[Optional[str], List[FileDataset]]] datasets, *Callable[[DataCoordinate], bool] predicate)
Definition: scanner.py:200
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.log
log
Definition: scanner.py:176
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.add
def add(self, PathElementHandler handler)
Definition: scanner.py:180
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner._subdirectories
_subdirectories
Definition: scanner.py:173
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.translate
Tuple[Optional[DataCoordinate], Optional[str]] translate(self, dict dataId2, *bool partial=False)
Definition: scanner.py:114