LSSTApplications  20.0.0
LSSTDataManagementBasePackage
scanner.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 """Interfaces and common code for recursively scanning directories for Gen2
22 dataset files.
23 
24 The `PathElementHandler` ABC is defined here instead of ``handlers.py`` for
25 dependency reasons: `DirectoryScanner` uses the ABC, while its concrete
26 implementations use `DirectoryScanner`.
27 """
28 from __future__ import annotations
29 
30 __all__ = ["PathElementHandler", "DirectoryScanner"]
31 
32 from abc import ABC, abstractmethod
33 import bisect
34 import os
35 from typing import (
36  Callable,
37  Iterator,
38  List,
39  Mapping,
40  Optional,
41 )
42 
43 from lsst.log import Log
44 from lsst.daf.butler import (
45  DataCoordinate,
46  DatasetType,
47  FileDataset,
48 )
49 
50 
class PathElementHandler(ABC):
    """An interface for objects that handle a single path element (directory
    or file) in a Gen2 data repository.

    Handlers are added to a `DirectoryScanner` instance, which then calls them
    until one succeeds when it processes each element in a directory.
    """
    def __init__(self):
        self.lastDataId2 = {}

    # Only these two attributes exist on the base class; ``log`` is assigned
    # from outside after construction rather than in ``__init__``.
    __slots__ = ("lastDataId2", "log")

    @abstractmethod
    def isForFiles(self) -> bool:
        """Report what kind of path element this object handles.

        Returns
        -------
        isForFiles : `bool`
            `True` if this handler is for file entries, or `False` if it
            is for directories.
        """
        raise NotImplementedError()

    @abstractmethod
    def __call__(self, path: str, name: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
                 predicate: Callable[[DataCoordinate], bool]) -> bool:
        """Apply the handler to a file or directory path.

        Parameters
        ----------
        path : `str`
            Full path of the file or directory.
        name : `str`
            Local name of the file or directory within its parent directory.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.

        Returns
        -------
        matched : `bool`
            `True` if this handler was a match for the given path and no other
            handlers need to be tried on it, `False` otherwise.
        """
        raise NotImplementedError()

    @property
    @abstractmethod
    def rank(self) -> int:
        """Return a rough indication of how flexible this handler is in terms
        of the path element names it can match.

        Handlers that match a constant path element should always return zero.
        """
        raise NotImplementedError()

    def translate(self, dataId2: dict, *, partial: bool = False) -> Optional[DataCoordinate]:
        """Translate the given data ID from Gen2 to Gen3.

        The default implementation returns `None`.  Subclasses that are able
        to translate data IDs should override this method.

        Parameters
        ----------
        dataId2 : `dict`
            Gen2 data ID.
        partial : `bool`, optional
            If `True` (`False` is default) this is a partial data ID for some
            dataset, and missing keys are expected.

        Returns
        -------
        dataId3 : `lsst.daf.butler.DataCoordinate` or `None`
            A Gen3 data ID, or `None` if this handler cannot translate data
            IDs.
        """
        return None

    def __lt__(self, other: PathElementHandler):
        """Order handlers by `rank`.

        Handlers are sorted by rank to reduce the possibility that more
        flexible handlers will have a chance to match something they
        shouldn't.

        Parameters
        ----------
        other : `PathElementHandler`
            The handler to compare against; only its ``rank`` is read.

        Returns
        -------
        less : `bool`
            `True` if this handler's rank is strictly lower than ``other``'s.
        """
        return self.rank < other.rank

    lastDataId2: dict
    """The Gen2 data ID obtained by processing parent levels in the directory
    tree.

    This attribute should be reset by calling code whenever a new parent
    directory is entered, before invoking `__call__`.
    """

    log: Log
    """A logger to use for all diagnostic messages (`lsst.log.Log`).

    This attribute is set on a handler in `DirectoryScanner.add`; this avoids
    needing to forward one through all subclass constructors.
    """
152 
153 
class DirectoryScanner:
    """An object that uses `PathElementHandler` instances to process the files
    and subdirectories in a directory tree.

    Parameters
    ----------
    log : `Log`, optional
        Log to use to report warnings and debug information.  If not
        provided, the ``obs.base.gen2to3.walker`` logger is used.
    """
    def __init__(self, log: Optional[Log] = None):
        # Handlers are kept in two separate lists, one per kind of directory
        # entry, each maintained in sorted order by `add`.
        self._files = []
        self._subdirectories = []
        if log is None:
            log = Log.getLogger("obs.base.gen2to3.walker")
        self.log = log

    __slots__ = ("_files", "_subdirectories", "log")

    def add(self, handler: PathElementHandler):
        """Add a new handler to the scanner.

        The handler's ``log`` attribute is overwritten with this scanner's
        log, and the handler is inserted into the file or subdirectory list
        (as reported by ``handler.isForFiles()``) at its sorted position.

        Parameters
        ----------
        handler : `PathElementHandler`
            The handler to be added.
        """
        handler.log = self.log
        # insort relies on the handlers' ``<`` ordering and places a new
        # handler after any existing ones that compare equal to it.
        if handler.isForFiles():
            bisect.insort(self._files, handler)
        else:
            bisect.insort(self._subdirectories, handler)

    def __iter__(self) -> Iterator[PathElementHandler]:
        """Iterate over all handlers: file handlers first, then subdirectory
        handlers, each group in its sorted order.
        """
        yield from self._files
        yield from self._subdirectories

    def scan(self, path: str, datasets: Mapping[DatasetType, List[FileDataset]], *,
             predicate: Callable[[DataCoordinate], bool]):
        """Process a directory.

        Each entry directly inside ``path`` is offered to the handlers of the
        matching kind, in sorted order, until one returns a true value.  The
        names of entries that no handler accepts are collected and reported
        in a single warning at the end.

        Parameters
        ----------
        path : `str`
            Full path to the directory to be processed.
        datasets : `dict` [`DatasetType`, `list` [`FileDataset`] ]
            Dictionary that found datasets should be added to.
        predicate : `~collections.abc.Callable`
            A callable taking a single `DataCoordinate` argument and returning
            `bool`, indicating whether that (Gen3) data ID represents one
            that should be included in the scan.
        """
        unrecognized = []
        # The context manager closes the underlying directory handle as soon
        # as the loop finishes, including when a handler raises part-way
        # through, instead of leaving that to garbage collection.
        with os.scandir(path) as entries:
            for entry in entries:
                if entry.is_file():
                    handlers = self._files
                elif entry.is_dir():
                    handlers = self._subdirectories
                else:
                    # Neither a file nor a directory (e.g. a broken symlink):
                    # skipped without being reported as unrecognized.
                    continue
                for handler in handlers:
                    if handler(entry.path, entry.name, datasets, predicate=predicate):
                        break
                else:
                    # No handler claimed this entry.
                    unrecognized.append(entry.name)
        if unrecognized:
            self.log.warn("Skipped unrecognized entries in %s: %s", path, unrecognized)
lsst::log.log.logContinued.warn
def warn(fmt, *args)
Definition: logContinued.py:202
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.translate
Optional[DataCoordinate] translate(self, dict dataId2, *bool partial=False)
Definition: scanner.py:110
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner
Definition: scanner.py:154
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler
Definition: scanner.py:51
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.lastDataId2
lastDataId2
Definition: scanner.py:59
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.__init__
def __init__(self)
Definition: scanner.py:58
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.__lt__
def __lt__(self, PathElementHandler other)
Definition: scanner.py:132
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner._files
_files
Definition: scanner.py:164
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.__call__
bool __call__(self, str path, str name, Mapping[DatasetType, List[FileDataset]] datasets, *Callable[[DataCoordinate], bool] predicate)
Definition: scanner.py:75
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.__iter__
Iterator[PathElementHandler] __iter__(self)
Definition: scanner.py:186
lsst::log
Definition: Log.h:706
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.scan
def scan(self, str path, Mapping[DatasetType, List[FileDataset]] datasets, *Callable[[DataCoordinate], bool] predicate)
Definition: scanner.py:192
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.rank
int rank(self)
Definition: scanner.py:102
lsst.obs.base.gen2to3.repoWalker.scanner.PathElementHandler.isForFiles
bool isForFiles(self)
Definition: scanner.py:64
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.__init__
def __init__(self, Optional[Log] log=None)
Definition: scanner.py:163
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.log
log
Definition: scanner.py:168
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner.add
def add(self, PathElementHandler handler)
Definition: scanner.py:172
lsst.obs.base.gen2to3.repoWalker.scanner.DirectoryScanner._subdirectories
_subdirectories
Definition: scanner.py:165