LSST Applications 24.1.6,g063fba187b+e7121a6b04,g0f08755f38+4e0faf0f7f,g12f32b3c4e+7915c4de30,g1653933729+a8ce1bb630,g168dd56ebc+a8ce1bb630,g28da252d5a+94d9f37a33,g2bbee38e9b+ae03bbfc84,g2bc492864f+ae03bbfc84,g3156d2b45e+6e55a43351,g347aa1857d+ae03bbfc84,g35bb328faa+a8ce1bb630,g3a166c0a6a+ae03bbfc84,g3e281a1b8c+c5dd892a6c,g414038480c+6b9177ef31,g41af890bb2+9e154f3e8d,g6b1c1869cb+adc49b6f1a,g781aacb6e4+a8ce1bb630,g7af13505b9+3363a39af3,g7f202ee025+406ba613a5,g80478fca09+8fbba356e2,g82479be7b0+0d223595df,g858d7b2824+4e0faf0f7f,g89c8672015+f4add4ffd5,g9125e01d80+a8ce1bb630,g9726552aa6+414189b318,ga5288a1d22+32d6120315,gacef1a1666+7f85da65db,gb58c049af0+d64f4d3760,gbcfae0f0a0+a8c62e8bb6,gc28159a63d+ae03bbfc84,gcf0d15dbbd+412a8a6f35,gda6a2b7d83+412a8a6f35,gdaeeff99f8+1711a396fd,ge79ae78c31+ae03bbfc84,gf0baf85859+c1f95f4921,gfa517265be+4e0faf0f7f,gfa999e8aa5+17cd334064,gfb92a5be7c+4e0faf0f7f
LSST Data Management Base Package
Loading...
Searching...
No Matches
matcher_probabilistic.py
Go to the documentation of this file.
1# This file is part of meas_astrom.
2#
3# Developed for the LSST Data Management System.
4# This product includes software developed by the LSST Project
5# (https://www.lsst.org).
6# See the COPYRIGHT file at the top-level directory of this distribution
7# for details of code ownership.
8#
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13#
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18#
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
21
22__all__ = [
23 "CatalogExtras", "ComparableCatalog", "ConvertCatalogCoordinatesConfig", "MatchProbabilisticConfig",
24 "MatcherProbabilistic",
25]
26
27from dataclasses import dataclass
28import logging
29import time
30from typing import Callable, Set
31import warnings
32
33import astropy.table
34import lsst.pex.config as pexConfig
35import numpy as np
36import pandas as pd
37from scipy.spatial import cKDTree
38from smatch.matcher import Matcher
39
40logger_default = logging.getLogger(__name__)
41
42
43def _mul_column(column: np.array, value: float):
44 if value is not None and value != 1:
45 column *= value
46 return column
47
48
49def _radec_to_xyz(ra, dec):
50 """Convert input ra/dec coordinates to spherical unit vectors.
51
52 Parameters
53 ----------
54 ra, dec: `numpy.ndarray`
55 Arrays of right ascension/declination in degrees.
56
57 Returns
58 -------
59 vectors : `numpy.ndarray`, (N, 3)
60 Output unit vectors.
61 """
62 if ra.size != dec.size:
63 raise ValueError('ra and dec must be same size')
64 ras = np.radians(ra)
65 decs = np.radians(dec)
66 vectors = np.empty((ras.size, 3))
67
68 sin_dec = np.sin(np.pi / 2 - decs)
69 vectors[:, 0] = sin_dec * np.cos(ras)
70 vectors[:, 1] = sin_dec * np.sin(ras)
71 vectors[:, 2] = np.cos(np.pi / 2 - decs)
72
73 return vectors
74
75
76@dataclass
78 """Store frequently-reference (meta)data relevant for matching a catalog.
79
80 Parameters
81 ----------
82 catalog : `pandas.DataFrame`
83 A pandas catalog to store extra information for.
84 select : `numpy.array`
85 A numpy boolean array of the same length as catalog to be used for
86 target selection.
87 """
88
89 n: int
90 indices: np.array
91 select: np.array
92
93 coordinate_factor: float = None
94
96 self,
97 catalog: astropy.table.Table | pd.DataFrame,
98 select: np.array = None,
99 coordinate_factor: float = None,
100 ):
101 self.nn = len(catalog)
102 self.selectselect = np.ones(self.nn, dtype=bool) if select is None else select
103 self.indicesindices = np.flatnonzero(select) if select is not None else np.arange(self.nn)
104 self.coordinate_factorcoordinate_factor = coordinate_factor
105
106
107@dataclass(frozen=True)
109 """A catalog with sources with coordinate columns in some standard format/units.
110
111 catalog : `pandas.DataFrame`
112 A catalog with comparable coordinate columns.
113 column_coord1 : `str`
114 The first spatial coordinate column name.
115 column_coord2 : `str`
116 The second spatial coordinate column name.
117 coord1 : `numpy.array`
118 The first spatial coordinate values.
119 coord2 : `numpy.array`
120 The second spatial coordinate values.
121 extras : `CatalogExtras`
122 Extra cached (meta)data for the `catalog`.
123 """
124
125 catalog: astropy.table.Table | pd.DataFrame
126 column_coord1: str
127 column_coord2: str
128 coord1: np.array
129 coord2: np.array
130 extras: CatalogExtras
131
132
133class ConvertCatalogCoordinatesConfig(pexConfig.Config):
134 """Configuration for the MatchProbabilistic matcher."""
135
136 column_ref_coord1 = pexConfig.Field[str](
137 default='ra',
138 doc='The reference table column for the first spatial coordinate (usually x or ra).',
139 )
140 column_ref_coord2 = pexConfig.Field[str](
141 default='dec',
142 doc='The reference table column for the second spatial coordinate (usually y or dec).'
143 'Units must match column_ref_coord1.',
144 )
145 column_target_coord1 = pexConfig.Field[str](
146 default='coord_ra',
147 doc='The target table column for the first spatial coordinate (usually x or ra).'
148 'Units must match column_ref_coord1.',
149 )
150 column_target_coord2 = pexConfig.Field[str](
151 default='coord_dec',
152 doc='The target table column for the second spatial coordinate (usually y or dec).'
153 'Units must match column_ref_coord2.',
154 )
155 coords_spherical = pexConfig.Field[bool](
156 default=True,
157 doc='Whether column_*_coord[12] are spherical coordinates (ra/dec) or not (pixel x/y).',
158 )
159 coords_ref_factor = pexConfig.Field[float](
160 default=1.0,
161 doc='Multiplicative factor for reference catalog coordinates.'
162 'If coords_spherical is true, this must be the number of degrees per unit increment of '
163 'column_ref_coord[12]. Otherwise, it must convert the coordinate to the same units'
164 ' as the target coordinates.',
165 )
166 coords_target_factor = pexConfig.Field[float](
167 default=1.0,
168 doc='Multiplicative factor for target catalog coordinates.'
169 'If coords_spherical is true, this must be the number of degrees per unit increment of '
170 'column_target_coord[12]. Otherwise, it must convert the coordinate to the same units'
171 ' as the reference coordinates.',
172 )
173 coords_ref_to_convert = pexConfig.DictField[str, str](
174 default=None,
175 optional=True,
176 dictCheck=lambda x: len(x) == 2,
177 doc='Dict mapping sky coordinate columns to be converted to pixel columns.',
178 )
179 mag_zeropoint_ref = pexConfig.Field[float](
180 default=31.4,
181 doc='Magnitude zeropoint for reference catalog.',
182 )
183 return_converted_coords = pexConfig.Field[float](
184 default=True,
185 doc='Whether to return converted coordinates for matching or only write them.',
186 )
187
189 self,
190 catalog_ref: astropy.table.Table | pd.DataFrame,
191 catalog_target: astropy.table.Table | pd.DataFrame,
192 select_ref: np.array = None,
193 select_target: np.array = None,
194 radec_to_xy_func: Callable = None,
195 **kwargs,
196 ):
197 """Format matched catalogs that may require coordinate conversions.
198
199 Parameters
200 ----------
201 catalog_ref : `pandas.DataFrame`
202 A reference catalog for comparison to `catalog_target`.
203 catalog_target : `pandas.DataFrame`
204 A target catalog with measurements for comparison to `catalog_ref`.
205 select_ref : `numpy.ndarray`, (Nref,)
206 A boolean array of len `catalog_ref`, True for valid match candidates.
207 select_target : `numpy.ndarray`, (Ntarget,)
208 A boolean array of len `catalog_target`, True for valid match candidates.
209 radec_to_xy_func : `typing.Callable`
210 Function taking equal-length ra, dec arrays and returning an ndarray of
211 - ``x``: current parameter (`float`).
212 - ``extra_args``: additional arguments (`dict`).
213 kwargs
214 Additional keyword arguments to pass to radec_to_xy_func.
215
216 Returns
217 -------
218 compcat_ref, compcat_target : `ComparableCatalog`
219 Comparable catalogs corresponding to the input reference and target.
220 """
221 # TODO: Remove pandas support in DM-46523
222 is_ref_pd = isinstance(catalog_ref, pd.DataFrame)
223 is_target_pd = isinstance(catalog_target, pd.DataFrame)
224 if is_ref_pd:
225 catalog_ref = astropy.table.Table.from_pandas(catalog_ref)
226 if is_target_pd:
227 catalog_target = astropy.table.Table.from_pandas(catalog_target)
228 if is_ref_pd or is_target_pd:
229 warnings.warn("pandas usage in MatchProbabilisticTask is deprecated; it will be removed "
230 " in favour of astropy.table after release 28.0.0", category=FutureWarning)
231
232 convert_ref = self.coords_ref_to_convert
233 if convert_ref and not callable(radec_to_xy_func):
234 raise TypeError('radec_to_xy_func must be callable if converting ref coords')
235
236 # Set up objects with frequently-used attributes like selection bool array
237 extras_ref, extras_target = (
238 CatalogExtras(catalog, select=select, coordinate_factor=coord_factor)
239 for catalog, select, coord_factor in zip(
240 (catalog_ref, catalog_target),
241 (select_ref, select_target),
243 )
244 )
245
246 compcats = []
247
248 # Retrieve coordinates and multiply them by scaling factors
249 for catalog, extras, (column1, column2), convert in (
250 (catalog_ref, extras_ref, (self.column_ref_coord1, self.column_ref_coord2), convert_ref),
251 (catalog_target, extras_target, (self.column_target_coord1, self.column_target_coord2), False),
252 ):
253 coord1, coord2 = (
254 _mul_column(catalog[column], extras.coordinate_factor)
255 for column in (column1, column2)
256 )
257 if convert:
258 xy_ref = radec_to_xy_func(coord1, coord2, self.coords_ref_factor, **kwargs)
259 for idx_coord, column_out in enumerate(self.coords_ref_to_convert.values()):
260 coord = np.array([xy[idx_coord] for xy in xy_ref])
261 catalog[column_out] = coord
262 if convert_ref:
263 column1, column2 = self.coords_ref_to_convert.values()
265 coord1, coord2 = catalog[column1], catalog[column2]
266 if isinstance(coord1, pd.Series):
267 coord1 = coord1.values
268 if isinstance(coord2, pd.Series):
269 coord2 = coord2.values
270
271 compcats.append(ComparableCatalog(
272 catalog=catalog, column_coord1=column1, column_coord2=column2,
273 coord1=coord1, coord2=coord2, extras=extras,
274 ))
275
276 return compcats[0], compcats[1]
277
278
279class MatchProbabilisticConfig(pexConfig.Config):
280 """Configuration for the MatchProbabilistic matcher."""
281
282 column_ref_order = pexConfig.Field(
283 dtype=str,
284 default=None,
285 optional=True,
286 doc='Name of column in reference catalog specifying order for matching'
287 ' Derived from columns_ref_flux if not set.',
288 )
289
290 @property
291 def columns_in_ref(self) -> Set[str]:
292 columns_all = [
293 self.coord_format.column_ref_coord1,
294 self.coord_format.column_ref_coord2,
295 ]
296 for columns in (
297 self.columns_ref_flux,
298 self.columns_ref_meas,
301 self.columns_ref_copy,
302 ):
303 columns_all.extend(columns)
304 if self.column_ref_order:
305 columns_all.append(self.column_ref_order)
306
307 return set(columns_all)
308
309 @property
310 def columns_in_target(self) -> Set[str]:
311 columns_all = [
312 self.coord_format.column_target_coord1,
313 self.coord_format.column_target_coord2,
314 ]
315 for columns in (
321 ):
322 columns_all.extend(columns)
323 return set(columns_all)
324
325 columns_ref_copy = pexConfig.ListField(
326 dtype=str,
327 default=[],
328 listCheck=lambda x: len(set(x)) == len(x),
329 optional=True,
330 doc='Reference table columns to copy unchanged into both match tables',
331 )
332 columns_ref_flux = pexConfig.ListField(
333 dtype=str,
334 default=[],
335 optional=True,
336 doc="List of reference flux columns to nansum total magnitudes from if column_order is None",
337 )
338 columns_ref_meas = pexConfig.ListField(
339 dtype=str,
340 doc='The reference table columns to compute match likelihoods from '
341 '(usually centroids and fluxes/magnitudes)',
342 )
343 columns_ref_select_true = pexConfig.ListField(
344 dtype=str,
345 default=tuple(),
346 doc='Reference table columns to require to be True for selecting sources',
347 )
348 columns_ref_select_false = pexConfig.ListField(
349 dtype=str,
350 default=tuple(),
351 doc='Reference table columns to require to be False for selecting sources',
352 )
353 columns_target_copy = pexConfig.ListField(
354 dtype=str,
355 default=[],
356 listCheck=lambda x: len(set(x)) == len(x),
357 optional=True,
358 doc='Target table columns to copy unchanged into both match tables',
359 )
360 columns_target_meas = pexConfig.ListField(
361 dtype=str,
362 doc='Target table columns with measurements corresponding to columns_ref_meas',
363 )
364 columns_target_err = pexConfig.ListField(
365 dtype=str,
366 doc='Target table columns with standard errors (sigma) corresponding to columns_ref_meas',
367 )
368 columns_target_select_true = pexConfig.ListField(
369 dtype=str,
370 default=('detect_isPrimary',),
371 doc='Target table columns to require to be True for selecting sources',
372 )
373 columns_target_select_false = pexConfig.ListField(
374 dtype=str,
375 default=('merge_peak_sky',),
376 doc='Target table columns to require to be False for selecting sources',
377 )
378 coord_format = pexConfig.ConfigField(
379 dtype=ConvertCatalogCoordinatesConfig,
380 doc="Configuration for coordinate conversion",
381 )
382 mag_brightest_ref = pexConfig.Field(
383 dtype=float,
384 default=-np.inf,
385 doc='Bright magnitude cutoff for selecting reference sources to match.'
386 ' Ignored if column_ref_order is None.'
387 )
388 mag_faintest_ref = pexConfig.Field(
389 dtype=float,
390 default=np.inf,
391 doc='Faint magnitude cutoff for selecting reference sources to match.'
392 ' Ignored if column_ref_order is None.'
393 )
394 match_dist_max = pexConfig.Field(
395 dtype=float,
396 default=0.5,
397 doc='Maximum match distance. Units must be arcseconds if coords_spherical, '
398 'or else match those of column_*_coord[12] multiplied by coords_*_factor.',
399 )
400 match_n_max = pexConfig.Field(
401 dtype=int,
402 default=10,
403 optional=True,
404 doc='Maximum number of spatial matches to consider (in ascending distance order).',
405 check=lambda x: x >= 1,
406 )
407 match_n_finite_min = pexConfig.Field(
408 dtype=int,
409 default=2,
410 optional=True,
411 doc='Minimum number of columns with a finite value to measure match likelihood',
412 )
413 order_ascending = pexConfig.Field(
414 dtype=bool,
415 default=False,
416 optional=True,
417 doc='Whether to order reference match candidates in ascending order of column_ref_order '
418 '(should be False if the column is a flux and True if it is a magnitude.',
419 )
420
421 def validate(self):
422 super().validate()
423 n_ref_meas = len(self.columns_ref_meas)
424 n_target_meas = len(self.columns_target_meas)
425 n_target_err = len(self.columns_target_err)
426 match_n_finite_min = self.match_n_finite_min
427 errors = []
428 if n_target_meas != n_ref_meas:
429 errors.append(f"{len(self.columns_target_meas)=} != {len(self.columns_ref_meas)=}")
430 if n_target_err != n_ref_meas:
431 errors.append(f"{len(self.columns_target_err)=} != {len(self.columns_ref_meas)=}")
432 if not (n_ref_meas >= match_n_finite_min):
433 errors.append(
434 f"{len(self.columns_ref_meas)=} !>= {self.match_n_finite_min=}, no matches possible"
435 )
436 if errors:
437 raise ValueError("\n".join(errors))
438
439
440def default_value(dtype):
441 if dtype is str:
442 return ''
443 elif np.issubdtype(dtype, np.signedinteger):
444 return np.inf
445 elif np.issubdtype(dtype, np.unsignedinteger):
446 return -np.inf
447 return None
448
449
451 """A probabilistic, greedy catalog matcher.
452
453 Parameters
454 ----------
455 config: `MatchProbabilisticConfig`
456 A configuration instance.
457 """
458
459 config: MatchProbabilisticConfig
460
462 self,
463 config: MatchProbabilisticConfig,
464 ):
465 self.configconfig = config
466
467 def match(
468 self,
469 catalog_ref: astropy.table.Table | pd.DataFrame,
470 catalog_target: astropy.table.Table | pd.DataFrame,
471 select_ref: np.array = None,
472 select_target: np.array = None,
473 logger: logging.Logger = None,
474 logging_n_rows: int = None,
475 **kwargs
476 ):
477 """Match catalogs.
478
479 Parameters
480 ----------
481 catalog_ref : `pandas.DataFrame` | `astropy.table.Table`
482 A reference catalog to match in order of a given column (i.e. greedily).
483 catalog_target : `pandas.DataFrame` | `astropy.table.Table`
484 A target catalog for matching sources from `catalog_ref`. Must contain measurements with errors.
485 select_ref : `numpy.array`
486 A boolean array of the same length as `catalog_ref` selecting the sources that can be matched.
487 select_target : `numpy.array`
488 A boolean array of the same length as `catalog_target` selecting the sources that can be matched.
489 logger : `logging.Logger`
490 A Logger for logging.
491 logging_n_rows : `int`
492 The number of sources to match before printing a log message.
493 kwargs
494 Additional keyword arguments to pass to `format_catalogs`.
495
496 Returns
497 -------
498 catalog_out_ref : `pandas.DataFrame`
499 A catalog of identical length to `catalog_ref`, containing match information for rows selected by
500 `select_ref` (including the matching row index in `catalog_target`).
501 catalog_out_target : `pandas.DataFrame`
502 A catalog of identical length to `catalog_target`, containing the indices of matching rows in
503 `catalog_ref`.
504 exceptions : `dict` [`int`, `Exception`]
505 A dictionary keyed by `catalog_target` row number of the first exception caught when matching.
506 """
507 if logger is None:
508 logger = logger_default
509
510 # TODO: Remove pandas support in DM-46523
511 is_ref_pd = isinstance(catalog_ref, pd.DataFrame)
512 is_target_pd = isinstance(catalog_target, pd.DataFrame)
513 if is_ref_pd:
514 catalog_ref = astropy.table.Table.from_pandas(catalog_ref)
515 if is_target_pd:
516 catalog_target = astropy.table.Table.from_pandas(catalog_target)
517 if is_ref_pd or is_target_pd:
518 warnings.warn("pandas usage in MatchProbabilisticTask is deprecated; it will be removed "
519 " in favour of astropy.table after release 28.0.0", category=FutureWarning)
520
521 t_init = time.process_time()
522 config = self.configconfig
523
524 # Transform any coordinates, if required
525 # Note: The returned objects contain the original catalogs, as well as
526 # transformed coordinates, and the selection of sources for matching.
527 # These might be identical to the arrays passed as kwargs, but that
528 # depends on config settings.
529 # For the rest of this function, the selection arrays will be used,
530 # but the indices of the original, unfiltered catalog will also be
531 # output, so some further indexing steps are needed.
532 with warnings.catch_warnings():
533 # We already issued a deprecation warning; no need to repeat it.
534 warnings.filterwarnings(action="ignore", category=FutureWarning)
535 ref, target = config.coord_format.format_catalogs(
536 catalog_ref=catalog_ref, catalog_target=catalog_target,
537 select_ref=select_ref, select_target=select_target,
538 **kwargs
539 )
540
541 # If no order is specified, take nansum of all flux columns for a 'total flux'
542 # Note: it won't actually be a total flux if bands overlap significantly
543 # (or it might define a filter with >100% efficiency
544 column_order = (
545 catalog_ref[config.column_ref_order][ref.extras.select]
546 if config.column_ref_order is not None else
547 np.nansum([catalog_ref[col][ref.extras.select] for col in config.columns_ref_flux], axis=0)
548 )
549 order = np.argsort(column_order if config.order_ascending else -column_order)
550
551 n_ref_select = len(ref.extras.indices)
552
553 coords_spherical = config.coord_format.coords_spherical
554 coords_ref, coords_target = (
555 (cat.coord1[cat.extras.select], cat.coord2[cat.extras.select])
556 for cat in (ref, target)
557 )
558
559 # Generate K-d tree to compute distances
560 logger.info('Generating cKDTree with match_n_max=%d', config.match_n_max)
561
562 if coords_spherical:
563 match_dist_max = config.match_dist_max/3600.
564 with Matcher(coords_target[0], coords_target[1]) as matcher:
565 idxs_target_select = matcher.query_knn(
566 coords_ref[0], coords_ref[1],
567 distance_upper_bound=match_dist_max,
568 k=config.match_n_max,
569 )
570 # Call scipy for non-spherical case
571 # The spherical case won't trigger, but the implementation is left for comparison, if needed
572 else:
573 match_dist_max = np.radians(config.match_dist_max/3600.)
574 # Convert ra/dec sky coordinates to spherical vectors for accurate distances
575 func_convert = _radec_to_xyz if coords_spherical else np.vstack
576 vec_ref, vec_target = (
577 func_convert(coords[0], coords[1])
578 for coords in (coords_ref, coords_target)
579 )
580 tree_obj = cKDTree(vec_target)
581 _, idxs_target_select = tree_obj.query(
582 vec_ref,
583 distance_upper_bound=match_dist_max,
584 k=config.match_n_max,
585 )
586
587 n_target_select = len(target.extras.indices)
588 n_matches = np.sum(idxs_target_select != n_target_select, axis=1)
589 n_matched_max = np.sum(n_matches == config.match_n_max)
590 if n_matched_max > 0:
591 logger.warning(
592 '%d/%d (%.2f%%) selected true objects have n_matches=n_match_max(%d)',
593 n_matched_max, n_ref_select, 100.*n_matched_max/n_ref_select, config.match_n_max
594 )
595
596 # Pre-allocate outputs
597 target_row_match = np.full(target.extras.n, np.iinfo(np.int64).min, dtype=np.int64)
598 ref_candidate_match = np.zeros(ref.extras.n, dtype=bool)
599 ref_row_match = np.full(ref.extras.n, np.iinfo(np.int64).min, dtype=np.int64)
600 ref_match_count = np.zeros(ref.extras.n, dtype=np.int32)
601 ref_match_meas_finite = np.zeros(ref.extras.n, dtype=np.int32)
602 ref_chisq = np.full(ref.extras.n, np.nan, dtype=float)
603
604 # Need the original reference row indices for output
605 idx_orig_ref, idx_orig_target = (np.argwhere(cat.extras.select)[:, 0] for cat in (ref, target))
606
607 # Retrieve required columns, including any converted ones (default to original column name)
608 columns_convert = config.coord_format.coords_ref_to_convert
609 if columns_convert is None:
610 columns_convert = {}
611 data_ref = np.array([
612 ref.catalog[columns_convert.get(column, column)][ref.extras.indices[order]]
613 for column in config.columns_ref_meas
614 ])
615 data_target = np.array([
616 target.catalog[col][target.extras.select] for col in config.columns_target_meas
617 ])
618 errors_target = np.array([
619 target.catalog[col][target.extras.select] for col in config.columns_target_err
620 ])
621
622 exceptions = {}
623 # The kdTree uses len(inputs) as a sentinel value for no match
624 matched_target = {n_target_select, }
625 index_ref = idx_orig_ref[order]
626 # Fill in the candidate column
627 ref_candidate_match[index_ref] = True
628
629 # Count this as the time when disambiguation begins
630 t_begin = time.process_time()
631
632 # Exclude unmatched sources
633 matched_ref = idxs_target_select[order, 0] != n_target_select
634 order = order[matched_ref]
635 idx_first = idxs_target_select[order, 0]
636 chi_0 = (data_target[:, idx_first] - data_ref[:, matched_ref])/errors_target[:, idx_first]
637 chi_finite_0 = np.isfinite(chi_0)
638 n_finite_0 = np.sum(chi_finite_0, axis=0)
639 chi_0[~chi_finite_0] = 0
640 chisq_sum_0 = np.sum(chi_0*chi_0, axis=0)
641 n_meas = len(config.columns_ref_meas)
642
643 logger.info('Disambiguating %d/%d matches/targets', len(order), len(ref.catalog))
644 for index_n, index_row_select in enumerate(order):
645 index_row = idx_orig_ref[index_row_select]
646 found = idxs_target_select[index_row_select, :]
647 # Unambiguous match, short-circuit some evaluations
648 if (found[1] == n_target_select) and (found[0] not in matched_target):
649 n_finite = n_finite_0[index_n]
650 if not (n_finite >= config.match_n_finite_min):
651 continue
652 idx_chisq_min = 0
653 n_matched = 1
654 chisq_sum = chisq_sum_0[index_n]
655 else:
656 # Select match candidates from nearby sources not already matched
657 # Note: set lookup is apparently fast enough that this is a few percent faster than:
658 # found = [x for x in found[found != n_target_select] if x not in matched_target]
659 # ... at least for ~1M sources
660 found = [x for x in found if x not in matched_target]
661 n_found = len(found)
662 if n_found == 0:
663 continue
664 # This is an ndarray of n_found rows x len(data_ref/target) columns
665 chi = (
666 data_target[:, found] - data_ref[:, index_n].reshape((n_meas, 1))
667 )/errors_target[:, found]
668 finite = np.isfinite(chi)
669 n_finite = np.sum(finite, axis=0)
670 # Require some number of finite chi_sq to match
671 chisq_good = n_finite >= config.match_n_finite_min
672 if not any(chisq_good):
673 continue
674 try:
675 chisq_sum = np.zeros(n_found, dtype=float)
676 chisq_sum[chisq_good] = np.nansum(chi[:, chisq_good] ** 2, axis=0)
677 idx_chisq_min = np.nanargmin(chisq_sum / n_finite)
678 n_finite = n_finite[idx_chisq_min]
679 n_matched = len(chisq_good)
680 chisq_sum = chisq_sum[idx_chisq_min]
681 except Exception as error:
682 # Can't foresee any exceptions, but they shouldn't prevent
683 # matching subsequent sources
684 exceptions[index_row] = error
685 ref_match_meas_finite[index_row] = n_finite
686 ref_match_count[index_row] = n_matched
687 ref_chisq[index_row] = chisq_sum
688 idx_match_select = found[idx_chisq_min]
689 row_target = target.extras.indices[idx_match_select]
690 ref_row_match[index_row] = row_target
691
692 target_row_match[row_target] = index_row
693 matched_target.add(idx_match_select)
694
695 if logging_n_rows and ((index_n + 1) % logging_n_rows == 0):
696 t_elapsed = time.process_time() - t_begin
697 logger.info(
698 'Processed %d/%d in %.2fs at sort value=%.3f',
699 index_n + 1, n_ref_select, t_elapsed, column_order[order[index_n]],
700 )
701
702 data_ref = {
703 'match_candidate': ref_candidate_match,
704 'match_row': ref_row_match,
705 'match_count': ref_match_count,
706 'match_chisq': ref_chisq,
707 'match_n_chisq_finite': ref_match_meas_finite,
708 }
709 data_target = {
710 'match_candidate': target.extras.select if target.extras.select is not None else (
711 np.ones(target.extras.n, dtype=bool)),
712 'match_row': target_row_match,
713 }
714
715 for (columns, out_original, out_matched, in_original, in_matched, matches, name_cat) in (
716 (
717 self.configconfig.columns_ref_copy,
718 data_ref,
719 data_target,
720 ref,
721 target,
722 target_row_match,
723 'target',
724 ),
725 (
726 self.configconfig.columns_target_copy,
727 data_target,
728 data_ref,
729 target,
730 ref,
731 ref_row_match,
732 'reference',
733 ),
734 ):
735 matched = matches >= 0
736 idx_matched = matches[matched]
737 logger.info('Matched %d/%d %s sources', np.sum(matched), len(matched), name_cat)
738
739 for column in columns:
740 values = in_original.catalog[column]
741 out_original[column] = values
742 dtype = in_original.catalog[column].dtype
743
744 # Pandas object columns can have mixed types - check for that
745 if dtype is object:
746 types = list(set((type(x) for x in values)))
747 if len(types) != 1:
748 raise RuntimeError(f'Column {column} dtype={dtype} has multiple types={types}')
749 dtype = types[0]
750
751 value_fill = default_value(dtype)
752
753 # Without this, the dtype would be '<U1' for an empty Unicode string
754 if dtype is str:
755 dtype = f'<U{max(len(x) for x in values)}'
756
757 column_match = np.full(in_matched.extras.n, value_fill, dtype=dtype)
758 column_match[matched] = in_original.catalog[column][idx_matched]
759 out_matched[f'match_{column}'] = column_match
760
761 logger.info(
762 'Completed match disambiguating in %.2fs (total %.2fs)',
763 time.process_time() - t_begin,
764 time.process_time() - t_init,
765 )
766
767 catalog_out_ref = (pd.DataFrame if is_ref_pd else astropy.table.Table)(data_ref)
768 if not is_ref_pd:
769 for column, description in {
770 'match_candidate': 'Whether the object was selected as a candidate for matching',
771 'match_row': 'The index of the best matched row in the target table, if any',
772 'match_count': 'The number of candidate matching target objects, i.e. those within the match'
773 ' distance but excluding objects already matched to a ref object',
774 'match_chisq': 'The sum of all finite reduced chi-squared values over all match columns',
775 'match_n_chisq_finite': 'The number of match columns with finite chisq',
776 }.items():
777 catalog_out_ref[column].description = description
778 catalog_out_target = (pd.DataFrame if is_target_pd else astropy.table.Table)(data_target)
779
780 return catalog_out_ref, catalog_out_target, exceptions
std::vector< SchemaItem< Flag > > * items
__init__(self, astropy.table.Table|pd.DataFrame catalog, np.array select=None, float coordinate_factor=None)
format_catalogs(self, astropy.table.Table|pd.DataFrame catalog_ref, astropy.table.Table|pd.DataFrame catalog_target, np.array select_ref=None, np.array select_target=None, Callable radec_to_xy_func=None, **kwargs)
match(self, astropy.table.Table|pd.DataFrame catalog_ref, astropy.table.Table|pd.DataFrame catalog_target, np.array select_ref=None, np.array select_target=None, logging.Logger logger=None, int logging_n_rows=None, **kwargs)