LSSTApplications  20.0.0
LSSTDataManagementBasePackage
parser.py
Go to the documentation of this file.
1 # This file is part of obs_base.
2 #
3 # Developed for the LSST Data Management System.
4 # This product includes software developed by the LSST Project
5 # (http://www.lsst.org).
6 # See the COPYRIGHT file at the top-level directory of this distribution
7 # for details of code ownership.
8 #
9 # This program is free software: you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 """Classes that transform (part of) a Gen2 filename template into a regular
22 expression that we can use to extract Gen2 data IDs from files.
23 """
24 from __future__ import annotations
25 
26 __all__ = ["PathElementParser"]
27 
28 
29 from abc import ABC, abstractmethod
30 import re
31 from typing import ClassVar, Dict, Optional
32 
33 from lsst.log import Log
34 
35 
class FormattableRegEx(ABC):
    """An interface that generates a regular expression from a template and
    a data ID.

    This is used by `PathElementParser` to abstract over whether a path
    element's regex needs to include values from a data ID extracted from
    parent path elements or not.
    """

    # Declared empty so that ``__slots__`` on concrete implementations
    # actually suppresses the per-instance ``__dict__``; without this the
    # base class would silently give every instance a ``__dict__`` anyway.
    __slots__ = ()

    @abstractmethod
    def format(self, dataId: dict) -> re.Pattern:
        """Substitute values from the given data ID and return a regular
        expression.

        Parameters
        ----------
        dataId : `dict`
            A dictionary whose entries may be used to format the regular
            expression.  May include unused entries.

        Returns
        -------
        regex : `re.Pattern`
            The compiled regular expression.
        """
        raise NotImplementedError()
57 
58 
class FixedRegEx(FormattableRegEx):
    """A trivial implementation of `FormattableRegEx` that does no formatting.

    Parameters
    ----------
    regex : `re.Pattern`
        The fixed regular expression to return.
    """
    def __init__(self, regex: re.Pattern):
        self.regex = regex

    __slots__ = ("regex",)

    def format(self, dataId: dict) -> re.Pattern:
        # Docstring inherited from FormattableRegEx.
        # The data ID is deliberately ignored: the pattern is fixed.
        return self.regex

    def __str__(self):
        return f"{type(self).__name__}({self.regex})"
78 
79 
class SubstitutableRegEx(FormattableRegEx):
    """An implementation of `FormattableRegEx` formed from a concatenation of
    actual regular terms and %-style format strings.
    """
    def __init__(self):
        # List of (text, isSubstitution) pairs, in concatenation order.
        self._terms = []

    __slots__ = ("_terms",)

    def addRegexTerm(self, regex: str):
        """Add a regular expression term.

        Parameters
        ----------
        regex : `str`
            Regular expression source text, included verbatim in the
            final pattern.
        """
        self._terms.append((regex, False))

    def addSubstitutionTerm(self, template: str):
        """Add a %-style format template term.

        Parameters
        ----------
        template : `str`
            A %-style format string; it is formatted with the data ID passed
            to `format` and the result is escaped before being included in
            the final pattern.
        """
        self._terms.append((template, True))

    def format(self, dataId: dict) -> re.Pattern:
        # Docstring inherited from FormattableRegEx.
        # Substitution terms are formatted with the data ID and then escaped
        # so the resulting value is matched literally; regex terms are used
        # as-is.
        return re.compile("".join(re.escape(s % dataId) if isSub else s
                                  for s, isSub in self._terms))

    def simplify(self) -> FormattableRegEx:
        """Return a possibly-simplified version of this object.

        If `addSubstitutionTerm` was never called, this returns a simple
        `FixedRegEx`.

        Returns
        -------
        simplified : `FormattableRegEx`
            A `FixedRegEx` holding the pre-compiled pattern when there are no
            substitution terms, or ``self`` otherwise.
        """
        if not any(isSub for _, isSub in self._terms):
            return FixedRegEx(re.compile("".join(s for s, _ in self._terms)))
        else:
            return self
114 
115 
class PathElementParser:
    """An object that matches Gen2 file names and extracts Gen2 data IDs.

    Parameters
    ----------
    template : `str`
        Either a full Gen2 path template or the part of one that corresponds
        to a single path element (a subdirectory or file name).
    allKeys : `dict` [`str`, `type`]
        A dictionary that provides types for all Gen2 data ID keys that are
        substituted into the given template.  Additional key-value pairs may
        be present and will be ignored.
    previousKeys : `dict` [`str`, `type`], optional
        A dictionary containing key strings and types for Gen2 data ID keys
        that have been extracted from previous path elements of the same
        template.  Values for these keys must be provided via the
        ``lastDataId`` argument when calling `parse`.
    """
    def __init__(self, template: str, allKeys: Dict[str, type], *,
                 previousKeys: Optional[Dict[str, type]] = None):
        self.template = template
        self.keys = {}
        # For each template path element, we iterate over each %-tagged
        # substitution string.
        last = 0
        self.regex = SubstitutableRegEx()
        for match in self.TEMPLATE_RE.finditer(self.template):
            # Copy the literal text between the last substitution and this
            # one, escaped so that it is matched verbatim.
            self.regex.addRegexTerm(re.escape(self.template[last:match.start()]))
            # Pull out the data ID key from the name used in the
            # substitution string.  Use that and the substitution
            # type to come up with the pattern to use in the regex.
            name = match.group("name")
            if name == "patch":
                pattern = r"\d+,\d+"
            elif match.group("type") in "id":  # integers
                pattern = r"0*\d+"
            else:
                pattern = ".+"
            # Create a new named group for the first occurrence of a key
            # within an element.
            if name not in self.keys:
                if previousKeys and name in previousKeys:
                    # Key is new to this part of the template, but it appeared
                    # in some previous part of the template.  We'll format the
                    # original template with the data ID from that previous
                    # step later.
                    start, stop = match.span()
                    self.regex.addSubstitutionTerm(self.template[start:stop])
                else:
                    # Key is new; expect to extract a data ID value from it.
                    self.regex.addRegexTerm(r"(?P<%s>%s)" % (name, pattern))
                    self.keys[name] = allKeys[name]
            else:
                # Require a match with the earlier group for a second
                # occurrence.  The backreference syntax is ``(?P=name)``;
                # angle brackets are only used when *defining* a group.
                self.regex.addRegexTerm(r"(?P=%s)" % name)
            # Remember the end of this match
            last = match.end()
        # Append anything remaining after the last substitution string.
        self.regex.addRegexTerm(re.escape(self.template[last:]))
        # If there are no substitutions, join and compile into a single regex
        # now.
        self.regex = self.regex.simplify()

    __slots__ = ("keys", "template", "regex")

    TEMPLATE_RE: ClassVar[re.Pattern] = re.compile(r"\%\((?P<name>\w+)\)[^\%]*?(?P<type>[idrs])")
    """Regular expression that matches a single substitution in
    Gen2 CameraMapper template, such as "%(tract)04d".
    """

    def __str__(self):
        return f"{type(self).__name__}({self.regex})"

    def parse(self, name: str, lastDataId: dict, *, log: Optional[Log] = None) -> Optional[dict]:
        """Parse the path element.

        Parameters
        ----------
        name : `str`
            The path name to parse.
        lastDataId : `dict`
            The cumulative Gen2 data ID obtained by calling `parse` on parsers
            for parent directories of the same path.
        log : `Log`, optional
            Log to use to report warnings and debug information.

        Returns
        -------
        dataId : `dict` or `None`
            Gen2 data ID that combines key-value pairs obtained from this path
            with those from ``lastDataId``.  `None` if ``name`` is not matched
            by this parser.  If the keys extracted are inconsistent with those
            in ``lastDataId``, a warning is sent to ``log`` (when one is
            given) and `None` is returned.
        """
        m = self.regex.format(lastDataId).fullmatch(name)
        if m is None:
            return None
        # Convert each captured string to the type registered for its key.
        newDataId = {k: v(m.group(k)) for k, v in self.keys.items()}
        for commonKey in lastDataId.keys() & newDataId.keys():
            if newDataId[commonKey] != lastDataId[commonKey]:
                if log is not None:
                    log.warn("Inconsistent value %s=%r when parsing %r with %r.",
                             commonKey, newDataId[commonKey], name, lastDataId)
                return None
        # Any shared keys were just verified to agree, so this only adds the
        # entries from parent path elements.
        newDataId.update(lastDataId)
        return newDataId

    keys: Dict[str, type]
    """Dictionary mapping Gen2 data ID key to the type of its associated
    value, covering only those keys that can be extracted from this path
    element.
    """

    template: str
    """The portion of the original Gen2 filename template that this parser was
    constructed with.
    """

    regex: re.Pattern
    """A regular expression that can be used to match the path element and
    populate the Gen2 data ID items whose keys are in ``keys``.
    """
lsst.obs.base.gen2to3.repoWalker.parser.FormattableRegEx
Definition: parser.py:36
lsst.obs.base.gen2to3.repoWalker.parser.PathElementParser.parse
Optional[dict] parse(self, str name, dict lastDataId, *Optional[Log] log=None)
Definition: parser.py:192
lsst.obs.base.gen2to3.repoWalker.parser.FixedRegEx
Definition: parser.py:59
lsst.obs.base.gen2to3.repoWalker.parser.FixedRegEx.format
re.Pattern format(self, dict dataId)
Definition: parser.py:72
lsst.obs.base.gen2to3.repoWalker.parser.PathElementParser.keys
keys
Definition: parser.py:136
lsst.obs.base.gen2to3.repoWalker.parser.PathElementParser.__init__
def __init__(self, str template, Dict[str, type] allKeys, *Optional[Dict[str, type]] previousKeys=None)
Definition: parser.py:134
pex.config.history.format
def format(config, name=None, writeSourceLine=True, prefix="", verbose=False)
Definition: history.py:174
lsst.obs.base.gen2to3.repoWalker.parser.FixedRegEx.regex
regex
Definition: parser.py:68
ast::append
std::shared_ptr< FrameSet > append(FrameSet const &first, FrameSet const &second)
Construct a FrameSet that performs two transformations in series.
Definition: functional.cc:33
lsst.obs.base.gen2to3.repoWalker.parser.SubstitutableRegEx.simplify
FormattableRegEx simplify(self)
Definition: parser.py:104
lsst.obs.base.gen2to3.repoWalker.parser.SubstitutableRegEx.addRegexTerm
def addRegexTerm(self, str regex)
Definition: parser.py:89
lsst.obs.base.gen2to3.repoWalker.parser.FixedRegEx.__init__
def __init__(self, re.Pattern regex)
Definition: parser.py:67
lsst.obs.base.gen2to3.repoWalker.parser.SubstitutableRegEx
Definition: parser.py:80
lsst.obs.base.gen2to3.repoWalker.parser.SubstitutableRegEx._terms
_terms
Definition: parser.py:85
lsst.obs.base.gen2to3.repoWalker.parser.SubstitutableRegEx.format
re.Pattern format(self, dict dataId)
Definition: parser.py:99
lsst.obs.base.gen2to3.repoWalker.parser.PathElementParser.template
template
Definition: parser.py:135
lsst::geom::any
bool any(CoordinateExpr< N > const &expr) noexcept
Return true if any elements are true.
Definition: CoordinateExpr.h:89
lsst::log
Definition: Log.h:706
lsst.obs.base.gen2to3.repoWalker.parser.SubstitutableRegEx.addSubstitutionTerm
def addSubstitutionTerm(self, str template)
Definition: parser.py:94
lsst.obs.base.gen2to3.repoWalker.parser.FixedRegEx.__str__
def __str__(self)
Definition: parser.py:76
lsst.obs.base.gen2to3.repoWalker.parser.PathElementParser.regex
regex
Definition: parser.py:140
items
std::vector< SchemaItem< Flag > > * items
Definition: BaseColumnView.cc:142
lsst.obs.base.gen2to3.repoWalker.parser.PathElementParser.__str__
def __str__(self)
Definition: parser.py:189
lsst.obs.base.gen2to3.repoWalker.parser.SubstitutableRegEx.__init__
def __init__(self)
Definition: parser.py:84
lsst.obs.base.gen2to3.repoWalker.parser.FormattableRegEx.format
re.Pattern format(self, dict dataId)
Definition: parser.py:46
lsst.obs.base.gen2to3.repoWalker.parser.PathElementParser
Definition: parser.py:116