22__all__ = [
'ConvertCatalogCoordinatesConfig',
'MatchProbabilisticConfig',
'MatcherProbabilistic']
26from dataclasses
import dataclass
30from scipy.spatial
import cKDTree
32from typing
import Callable, Set
34logger_default = logging.getLogger(__name__)
37def _mul_column(column: np.array, value: float):
38 if value
is not None and value != 1:
43def _radec_to_xyz(ra, dec):
44 """Convert input ra/dec coordinates to spherical unit vectors.
48 ra, dec: `numpy.ndarray`
49 Arrays of right ascension/declination in degrees.
53 vectors : `numpy.ndarray`, (N, 3)
56 if ra.size != dec.size:
57 raise ValueError(
'ra and dec must be same size')
59 decs = np.radians(dec)
60 vectors = np.empty((ras.size, 3))
62 sin_dec = np.sin(np.pi / 2 - decs)
63 vectors[:, 0] = sin_dec * np.cos(ras)
64 vectors[:, 1] = sin_dec * np.sin(ras)
65 vectors[:, 2] = np.cos(np.pi / 2 - decs)
72 """Store frequently-reference (meta)data relevant for matching a catalog.
76 catalog : `pandas.DataFrame`
77 A pandas catalog to store extra information for.
78 select : `numpy.array`
79 A numpy boolean array of the same length
as catalog to be used
for
86 coordinate_factor: float = None
88 def __init__(self, catalog: pd.DataFrame, select: np.array =
None, coordinate_factor: float =
None):
89 self.
nn = len(catalog)
90 self.
selectselect = np.ones(self.
nn, dtype=bool)
if select
is None else select
91 self.
indicesindices = np.flatnonzero(select)
if select
is not None else np.arange(self.
nn)
95@dataclass(frozen=True)
97 """A catalog with sources with coordinate columns in some standard format/units.
99 catalog : `pandas.DataFrame`
100 A catalog with comparable coordinate columns.
101 column_coord1 : `str`
102 The first spatial coordinate column name.
103 column_coord2 : `str`
104 The second spatial coordinate column name.
105 coord1 : `numpy.array`
106 The first spatial coordinate values.
107 coord2 : `numpy.array`
108 The second spatial coordinate values.
109 extras : `CatalogExtras`
110 Extra cached (meta)data
for the `catalog`.
112 catalog: pd.DataFrame
117 extras: CatalogExtras
121 """Configuration for the MatchProbabilistic matcher.
123 column_ref_coord1 = pexConfig.Field(
126 doc=
'The reference table column for the first spatial coordinate (usually x or ra).',
128 column_ref_coord2 = pexConfig.Field(
131 doc=
'The reference table column for the second spatial coordinate (usually y or dec).'
132 'Units must match column_ref_coord1.',
134 column_target_coord1 = pexConfig.Field(
137 doc=
'The target table column for the first spatial coordinate (usually x or ra).'
138 'Units must match column_ref_coord1.',
140 column_target_coord2 = pexConfig.Field(
143 doc=
'The target table column for the second spatial coordinate (usually y or dec).'
144 'Units must match column_ref_coord2.',
146 coords_spherical = pexConfig.Field(
149 doc=
'Whether column_*_coord[12] are spherical coordinates (ra/dec) or not (pixel x/y)',
151 coords_ref_factor = pexConfig.Field(
154 doc=
'Multiplicative factor for reference catalog coordinates.'
155 'If coords_spherical is true, this must be the number of degrees per unit increment of '
156 'column_ref_coord[12]. Otherwise, it must convert the coordinate to the same units'
157 ' as the target coordinates.',
159 coords_target_factor = pexConfig.Field(
162 doc=
'Multiplicative factor for target catalog coordinates.'
163 'If coords_spherical is true, this must be the number of degrees per unit increment of '
164 'column_target_coord[12]. Otherwise, it must convert the coordinate to the same units'
165 ' as the reference coordinates.',
167 coords_ref_to_convert = pexConfig.DictField(
171 dictCheck=
lambda x: len(x) == 2,
172 doc=
'Dict mapping sky coordinate columns to be converted to pixel columns',
174 mag_zeropoint_ref = pexConfig.Field(
177 doc=
'Magnitude zeropoint for reference catalog.',
182 catalog_ref: pd.DataFrame,
183 catalog_target: pd.DataFrame,
184 select_ref: np.array =
None,
185 select_target: np.array =
None,
186 radec_to_xy_func: Callable =
None,
187 return_converted_columns: bool =
False,
190 """Format matched catalogs that may require coordinate conversions.
194 catalog_ref : `pandas.DataFrame`
195 A reference catalog for comparison to `catalog_target`.
196 catalog_target : `pandas.DataFrame`
197 A target catalog
with measurements
for comparison to `catalog_ref`.
198 select_ref : `numpy.ndarray`, (Nref,)
199 A boolean array of len `catalog_ref`,
True for valid match candidates.
200 select_target : `numpy.ndarray`, (Ntarget,)
201 A boolean array of len `catalog_target`,
True for valid match candidates.
202 radec_to_xy_func : `typing.Callable`
203 Function taking equal-length ra, dec arrays
and returning an ndarray of
204 - ``x``: current parameter (`float`).
205 - ``extra_args``: additional arguments (`dict`).
206 return_converted_columns : `bool`
207 Whether to
return converted columns
in the `coord1`
and `coord2`
208 attributes, rather than keep the original values.
213 compcat_ref, compcat_target : `ComparableCatalog`
214 Comparable catalogs corresponding to the input reference
and target.
218 if convert_ref
and not callable(radec_to_xy_func):
219 raise TypeError(
'radec_to_xy_func must be callable if converting ref coords')
222 extras_ref, extras_target = (
223 CatalogExtras(catalog, select=select, coordinate_factor=coord_factor)
224 for catalog, select, coord_factor
in zip(
225 (catalog_ref, catalog_target),
226 (select_ref, select_target),
234 for catalog, extras, (column1, column2), convert
in (
239 _mul_column(catalog[column], extras.coordinate_factor)
240 for column
in (column1, column2)
243 xy_ref = radec_to_xy_func(coord1, coord2, self.
coords_ref_factorcoords_ref_factor, **kwargs)
245 coord = np.array([xy[idx_coord]
for xy
in xy_ref])
246 catalog[column_out] = coord
247 if convert_ref
and return_converted_columns:
249 coord1, coord2 = catalog[column1], catalog[column2]
250 if isinstance(coord1, pd.Series):
251 coord1 = coord1.values
252 if isinstance(coord2, pd.Series):
253 coord2 = coord2.values
256 catalog=catalog, column_coord1=column1, column_coord2=column2,
257 coord1=coord1, coord2=coord2, extras=extras,
260 return tuple(compcats)
264 """Configuration for the MatchProbabilistic matcher.
266 column_ref_order = pexConfig.Field(
270 doc=
"Name of column in reference catalog specifying order for matching."
271 " Derived from columns_ref_flux if not set.",
285 columns_all.extend(columns)
287 return set(columns_all)
302 columns_all.extend(columns)
303 return set(columns_all)
305 columns_ref_copy = pexConfig.ListField(
308 listCheck=
lambda x: len(
set(x)) == len(x),
310 doc=
'Reference table columns to copy unchanged into both match tables',
312 columns_ref_flux = pexConfig.ListField(
315 listCheck=
lambda x: len(
set(x)) == len(x),
317 doc=
"List of reference flux columns to nansum total magnitudes from if column_order is None",
319 columns_ref_meas = pexConfig.ListField(
321 doc=
'The reference table columns to compute match likelihoods from '
322 '(usually centroids and fluxes/magnitudes)',
324 columns_target_copy = pexConfig.ListField(
327 listCheck=
lambda x: len(
set(x)) == len(x),
329 doc=
'Target table columns to copy unchanged into both match tables',
331 columns_target_meas = pexConfig.ListField(
333 doc=
'Target table columns with measurements corresponding to columns_ref_meas',
335 columns_target_err = pexConfig.ListField(
337 doc=
'Target table columns with standard errors (sigma) corresponding to columns_ref_meas',
339 columns_target_select_true = pexConfig.ListField(
341 default=(
'detect_isPrimary',),
342 doc=
'Target table columns to require to be True for selecting sources',
344 columns_target_select_false = pexConfig.ListField(
346 default=(
'merge_peak_sky',),
347 doc=
'Target table columns to require to be False for selecting sources',
349 coord_format = pexConfig.ConfigField(
350 dtype=ConvertCatalogCoordinatesConfig,
351 doc=
"Configuration for coordinate conversion",
353 mag_brightest_ref = pexConfig.Field(
356 doc=
'Bright magnitude cutoff for selecting reference sources to match.'
357 ' Ignored if column_ref_order is None.'
359 mag_faintest_ref = pexConfig.Field(
362 doc=
'Faint magnitude cutoff for selecting reference sources to match.'
363 ' Ignored if column_ref_order is None.'
365 match_dist_max = pexConfig.Field(
368 doc=
'Maximum match distance. Units must be arcseconds if coords_spherical, '
369 'or else match those of column_*_coord[12] multiplied by coords_*_factor.',
371 match_n_max = pexConfig.Field(
375 doc=
'Maximum number of spatial matches to consider (in ascending distance order).',
377 match_n_finite_min = pexConfig.Field(
381 doc=
'Minimum number of columns with a finite value to measure match likelihood',
383 order_ascending = pexConfig.Field(
387 doc=
'Whether to order reference match candidates in ascending order of column_ref_order '
388 '(should be False if the column is a flux and True if it is a magnitude.',
395 elif dtype == np.signedinteger:
397 elif dtype == np.unsignedinteger:
403 """A probabilistic, greedy catalog matcher.
407 config: `MatchProbabilisticConfig`
408 A configuration instance.
410 config: MatchProbabilisticConfig
414 config: MatchProbabilisticConfig,
420 catalog_ref: pd.DataFrame,
421 catalog_target: pd.DataFrame,
422 select_ref: np.array =
None,
423 select_target: np.array =
None,
424 logger: logging.Logger =
None,
425 logging_n_rows: int =
None,
432 catalog_ref : `pandas.DataFrame`
433 A reference catalog to match in order of a given column (i.e. greedily).
434 catalog_target : `pandas.DataFrame`
435 A target catalog
for matching sources
from `catalog_ref`. Must contain measurements
with errors.
436 select_ref : `numpy.array`
437 A boolean array of the same length
as `catalog_ref` selecting the sources that can be matched.
438 select_target : `numpy.array`
439 A boolean array of the same length
as `catalog_target` selecting the sources that can be matched.
440 logger : `logging.Logger`
441 A Logger
for logging.
442 logging_n_rows : `int`
443 The number of sources to match before printing a log message.
445 Additional keyword arguments to
pass to `format_catalogs`.
449 catalog_out_ref : `pandas.DataFrame`
450 A catalog of identical length to `catalog_ref`, containing match information
for rows selected by
451 `select_ref` (including the matching row index
in `catalog_target`).
452 catalog_out_target : `pandas.DataFrame`
453 A catalog of identical length to `catalog_target`, containing the indices of matching rows
in
455 exceptions : `dict` [`int`, `Exception`]
456 A dictionary keyed by `catalog_target` row number of the first exception caught when matching.
459 logger = logger_default
461 config = self.
configconfig
471 ref, target = config.coord_format.format_catalogs(
472 catalog_ref=catalog_ref, catalog_target=catalog_target,
473 select_ref=select_ref, select_target=select_target,
483 catalog_ref.loc[ref.extras.select, config.column_ref_order]
484 if config.column_ref_order
is not None else
485 np.nansum(catalog_ref.loc[ref.extras.select, config.columns_ref_flux], axis=1)
487 order = np.argsort(column_order
if config.order_ascending
else -column_order)
489 n_ref_select = len(ref.extras.indices)
491 match_dist_max = config.match_dist_max
492 coords_spherical = config.coord_format.coords_spherical
494 match_dist_max = np.radians(match_dist_max / 3600.)
497 func_convert = _radec_to_xyz
if coords_spherical
else np.vstack
498 vec_ref, vec_target = (
499 func_convert(cat.coord1[cat.extras.select], cat.coord2[cat.extras.select])
500 for cat
in (ref, target)
504 logger.info(
'Generating cKDTree with match_n_max=%d', config.match_n_max)
505 tree_obj = cKDTree(vec_target)
507 scores, idxs_target_select = tree_obj.query(
509 distance_upper_bound=match_dist_max,
510 k=config.match_n_max,
513 n_target_select = len(target.extras.indices)
514 n_matches = np.sum(idxs_target_select != n_target_select, axis=1)
515 n_matched_max = np.sum(n_matches == config.match_n_max)
516 if n_matched_max > 0:
518 '%d/%d (%.2f%%) selected true objects have n_matches=n_match_max(%d)',
519 n_matched_max, n_ref_select, 100.*n_matched_max/n_ref_select, config.match_n_max
523 target_row_match = np.full(target.extras.n, np.nan, dtype=np.int64)
524 ref_candidate_match = np.zeros(ref.extras.n, dtype=bool)
525 ref_row_match = np.full(ref.extras.n, np.nan, dtype=np.int64)
526 ref_match_count = np.zeros(ref.extras.n, dtype=np.int32)
527 ref_match_meas_finite = np.zeros(ref.extras.n, dtype=np.int32)
528 ref_chisq = np.full(ref.extras.n, np.nan, dtype=float)
531 idx_orig_ref, idx_orig_target = (np.argwhere(cat.extras.select)
for cat
in (ref, target))
534 columns_convert = config.coord_format.coords_ref_to_convert
535 if columns_convert
is None:
537 data_ref = ref.catalog[
538 [columns_convert.get(column, column)
for column
in config.columns_ref_meas]
539 ].iloc[ref.extras.indices[order]]
540 data_target = target.catalog[config.columns_target_meas][target.extras.select]
541 errors_target = target.catalog[config.columns_target_err][target.extras.select]
545 matched_target = {n_target_select, }
547 t_begin = time.process_time()
549 logger.info(
'Matching n_indices=%d/%d', len(order), len(ref.catalog))
550 for index_n, index_row_select
in enumerate(order):
551 index_row = idx_orig_ref[index_row_select]
552 ref_candidate_match[index_row] =
True
553 found = idxs_target_select[index_row_select, :]
558 found = [x
for x
in found
if x
not in matched_target]
563 (data_target.iloc[found].values - data_ref.iloc[index_n].values)
564 / errors_target.iloc[found].values
566 finite = np.isfinite(chi)
567 n_finite = np.sum(finite, axis=1)
569 chisq_good = n_finite >= config.match_n_finite_min
570 if np.any(chisq_good):
572 chisq_sum = np.zeros(n_found, dtype=float)
573 chisq_sum[chisq_good] = np.nansum(chi[chisq_good, :] ** 2, axis=1)
574 idx_chisq_min = np.nanargmin(chisq_sum / n_finite)
575 ref_match_meas_finite[index_row] = n_finite[idx_chisq_min]
576 ref_match_count[index_row] = len(chisq_good)
577 ref_chisq[index_row] = chisq_sum[idx_chisq_min]
578 idx_match_select = found[idx_chisq_min]
579 row_target = target.extras.indices[idx_match_select]
580 ref_row_match[index_row] = row_target
582 target_row_match[row_target] = index_row
583 matched_target.add(idx_match_select)
584 except Exception
as error:
587 exceptions[index_row] = error
589 if logging_n_rows
and ((index_n + 1) % logging_n_rows == 0):
590 t_elapsed = time.process_time() - t_begin
592 'Processed %d/%d in %.2fs at sort value=%.3f',
593 index_n + 1, n_ref_select, t_elapsed, column_order[order[index_n]],
597 'match_candidate': ref_candidate_match,
598 'match_row': ref_row_match,
599 'match_count': ref_match_count,
600 'match_chisq': ref_chisq,
601 'match_n_chisq_finite': ref_match_meas_finite,
604 'match_candidate': target.extras.select
if target.extras.select
is not None else (
605 np.ones(target.extras.n, dtype=bool)),
606 'match_row': target_row_match,
609 for (columns, out_original, out_matched, in_original, in_matched, matches)
in (
611 self.
configconfig.columns_ref_copy,
619 self.
configconfig.columns_target_copy,
627 matched = matches >= 0
628 idx_matched = matches[matched]
630 for column
in columns:
631 values = in_original.catalog[column]
632 out_original[column] = values
633 dtype = in_original.catalog[column].dtype
637 raise RuntimeError(f
'Column {column} dtype={dtype} has multiple types={types}')
640 column_match = np.full(in_matched.extras.n,
default_value(dtype), dtype=dtype)
641 column_match[matched] = in_original.catalog[column][idx_matched]
642 out_matched[f
'match_{column}'] = column_match
644 catalog_out_ref = pd.DataFrame(data_ref)
645 catalog_out_target = pd.DataFrame(data_target)
647 return catalog_out_ref, catalog_out_target, exceptions
def format_catalogs(self, pd.DataFrame catalog_ref, pd.DataFrame catalog_target, np.array select_ref=None, np.array select_target=None, Callable radec_to_xy_func=None, bool return_converted_columns=False, **kwargs)
Set[str] columns_in_target(self)
columns_target_select_false
columns_target_select_true
Set[str] columns_in_ref(self)
def match(self, pd.DataFrame catalog_ref, pd.DataFrame catalog_target, np.array select_ref=None, np.array select_target=None, logging.Logger logger=None, int logging_n_rows=None, **kwargs)
def __init__(self, MatchProbabilisticConfig config)
daf::base::PropertyList * list
daf::base::PropertySet * set