469 catalog_ref: astropy.table.Table | pd.DataFrame,
470 catalog_target: astropy.table.Table | pd.DataFrame,
471 select_ref: np.array =
None,
472 select_target: np.array =
None,
473 logger: logging.Logger =
None,
474 logging_n_rows: int =
None,
481 catalog_ref : `pandas.DataFrame` | `astropy.table.Table`
482 A reference catalog to match in order of a given column (i.e. greedily).
483 catalog_target : `pandas.DataFrame` | `astropy.table.Table`
484 A target catalog for matching sources from `catalog_ref`. Must contain measurements with errors.
485 select_ref : `numpy.array`
486 A boolean array of the same length as `catalog_ref` selecting the sources that can be matched.
487 select_target : `numpy.array`
488 A boolean array of the same length as `catalog_target` selecting the sources that can be matched.
489 logger : `logging.Logger`
490 A Logger for logging.
491 logging_n_rows : `int`
492 The number of sources to match before printing a log message.
494 Additional keyword arguments to pass to `format_catalogs`.
498 catalog_out_ref : `pandas.DataFrame`
499 A catalog of identical length to `catalog_ref`, containing match information for rows selected by
500 `select_ref` (including the matching row index in `catalog_target`).
501 catalog_out_target : `pandas.DataFrame`
502 A catalog of identical length to `catalog_target`, containing the indices of matching rows in
504 exceptions : `dict` [`int`, `Exception`]
505 A dictionary keyed by `catalog_target` row number of the first exception caught when matching.
508 logger = logger_default
511 is_ref_pd = isinstance(catalog_ref, pd.DataFrame)
512 is_target_pd = isinstance(catalog_target, pd.DataFrame)
514 catalog_ref = astropy.table.Table.from_pandas(catalog_ref)
516 catalog_target = astropy.table.Table.from_pandas(catalog_target)
517 if is_ref_pd
or is_target_pd:
518 warnings.warn(
"pandas usage in MatchProbabilisticTask is deprecated; it will be removed "
519 " in favour of astropy.table after release 28.0.0", category=FutureWarning)
521 t_init = time.process_time()
532 with warnings.catch_warnings():
534 warnings.filterwarnings(action=
"ignore", category=FutureWarning)
535 ref, target = config.coord_format.format_catalogs(
536 catalog_ref=catalog_ref, catalog_target=catalog_target,
537 select_ref=select_ref, select_target=select_target,
545 catalog_ref[config.column_ref_order][ref.extras.select]
546 if config.column_ref_order
is not None else
547 np.nansum([catalog_ref[col][ref.extras.select]
for col
in config.columns_ref_flux], axis=0)
549 order = np.argsort(column_order
if config.order_ascending
else -column_order)
551 n_ref_select = len(ref.extras.indices)
553 coords_spherical = config.coord_format.coords_spherical
554 coords_ref, coords_target = (
555 (cat.coord1[cat.extras.select], cat.coord2[cat.extras.select])
556 for cat
in (ref, target)
560 logger.info(
'Generating cKDTree with match_n_max=%d', config.match_n_max)
563 match_dist_max = config.match_dist_max/3600.
564 with Matcher(coords_target[0], coords_target[1])
as matcher:
565 idxs_target_select = matcher.query_knn(
566 coords_ref[0], coords_ref[1],
567 distance_upper_bound=match_dist_max,
568 k=config.match_n_max,
573 match_dist_max = np.radians(config.match_dist_max/3600.)
575 func_convert = _radec_to_xyz
if coords_spherical
else np.vstack
576 vec_ref, vec_target = (
577 func_convert(coords[0], coords[1])
578 for coords
in (coords_ref, coords_target)
580 tree_obj = cKDTree(vec_target)
581 _, idxs_target_select = tree_obj.query(
583 distance_upper_bound=match_dist_max,
584 k=config.match_n_max,
587 n_target_select = len(target.extras.indices)
588 n_matches = np.sum(idxs_target_select != n_target_select, axis=1)
589 n_matched_max = np.sum(n_matches == config.match_n_max)
590 if n_matched_max > 0:
592 '%d/%d (%.2f%%) selected true objects have n_matches=n_match_max(%d)',
593 n_matched_max, n_ref_select, 100.*n_matched_max/n_ref_select, config.match_n_max
597 target_row_match = np.full(target.extras.n, np.iinfo(np.int64).min, dtype=np.int64)
598 ref_candidate_match = np.zeros(ref.extras.n, dtype=bool)
599 ref_row_match = np.full(ref.extras.n, np.iinfo(np.int64).min, dtype=np.int64)
600 ref_match_count = np.zeros(ref.extras.n, dtype=np.int32)
601 ref_match_meas_finite = np.zeros(ref.extras.n, dtype=np.int32)
602 ref_chisq = np.full(ref.extras.n, np.nan, dtype=float)
605 idx_orig_ref, idx_orig_target = (np.argwhere(cat.extras.select)[:, 0]
for cat
in (ref, target))
608 columns_convert = config.coord_format.coords_ref_to_convert
609 if columns_convert
is None:
611 data_ref = np.array([
612 ref.catalog[columns_convert.get(column, column)][ref.extras.indices[order]]
613 for column
in config.columns_ref_meas
615 data_target = np.array([
616 target.catalog[col][target.extras.select]
for col
in config.columns_target_meas
618 errors_target = np.array([
619 target.catalog[col][target.extras.select]
for col
in config.columns_target_err
624 matched_target = {n_target_select, }
625 index_ref = idx_orig_ref[order]
627 ref_candidate_match[index_ref] =
True
630 t_begin = time.process_time()
633 matched_ref = idxs_target_select[order, 0] != n_target_select
634 order = order[matched_ref]
635 idx_first = idxs_target_select[order, 0]
636 chi_0 = (data_target[:, idx_first] - data_ref[:, matched_ref])/errors_target[:, idx_first]
637 chi_finite_0 = np.isfinite(chi_0)
638 n_finite_0 = np.sum(chi_finite_0, axis=0)
639 chi_0[~chi_finite_0] = 0
640 chisq_sum_0 = np.sum(chi_0*chi_0, axis=0)
641 n_meas = len(config.columns_ref_meas)
643 logger.info(
'Disambiguating %d/%d matches/targets', len(order), len(ref.catalog))
644 for index_n, index_row_select
in enumerate(order):
645 index_row = idx_orig_ref[index_row_select]
646 found = idxs_target_select[index_row_select, :]
648 if (found[1] == n_target_select)
and (found[0]
not in matched_target):
649 n_finite = n_finite_0[index_n]
650 if not (n_finite >= config.match_n_finite_min):
654 chisq_sum = chisq_sum_0[index_n]
660 found = [x
for x
in found
if x
not in matched_target]
666 data_target[:, found] - data_ref[:, index_n].reshape((n_meas, 1))
667 )/errors_target[:, found]
668 finite = np.isfinite(chi)
669 n_finite = np.sum(finite, axis=0)
671 chisq_good = n_finite >= config.match_n_finite_min
672 if not any(chisq_good):
675 chisq_sum = np.zeros(n_found, dtype=float)
676 chisq_sum[chisq_good] = np.nansum(chi[:, chisq_good] ** 2, axis=0)
677 idx_chisq_min = np.nanargmin(chisq_sum / n_finite)
678 n_finite = n_finite[idx_chisq_min]
679 n_matched = len(chisq_good)
680 chisq_sum = chisq_sum[idx_chisq_min]
681 except Exception
as error:
684 exceptions[index_row] = error
685 ref_match_meas_finite[index_row] = n_finite
686 ref_match_count[index_row] = n_matched
687 ref_chisq[index_row] = chisq_sum
688 idx_match_select = found[idx_chisq_min]
689 row_target = target.extras.indices[idx_match_select]
690 ref_row_match[index_row] = row_target
692 target_row_match[row_target] = index_row
693 matched_target.add(idx_match_select)
695 if logging_n_rows
and ((index_n + 1) % logging_n_rows == 0):
696 t_elapsed = time.process_time() - t_begin
698 'Processed %d/%d in %.2fs at sort value=%.3f',
699 index_n + 1, n_ref_select, t_elapsed, column_order[order[index_n]],
703 'match_candidate': ref_candidate_match,
704 'match_row': ref_row_match,
705 'match_count': ref_match_count,
706 'match_chisq': ref_chisq,
707 'match_n_chisq_finite': ref_match_meas_finite,
710 'match_candidate': target.extras.select
if target.extras.select
is not None else (
711 np.ones(target.extras.n, dtype=bool)),
712 'match_row': target_row_match,
715 for (columns, out_original, out_matched, in_original, in_matched, matches, name_cat)
in (
735 matched = matches >= 0
736 idx_matched = matches[matched]
737 logger.info(
'Matched %d/%d %s sources', np.sum(matched), len(matched), name_cat)
739 for column
in columns:
740 values = in_original.catalog[column]
741 out_original[column] = values
742 dtype = in_original.catalog[column].dtype
746 types = list(set((type(x)
for x
in values)))
748 raise RuntimeError(f
'Column {column} dtype={dtype} has multiple types={types}')
755 dtype = f
'<U{max(len(x) for x in values)}'
757 column_match = np.full(in_matched.extras.n, value_fill, dtype=dtype)
758 column_match[matched] = in_original.catalog[column][idx_matched]
759 out_matched[f
'match_{column}'] = column_match
762 'Completed match disambiguating in %.2fs (total %.2fs)',
763 time.process_time() - t_begin,
764 time.process_time() - t_init,
767 catalog_out_ref = (pd.DataFrame
if is_ref_pd
else astropy.table.Table)(data_ref)
769 for column, description
in {
770 'match_candidate':
'Whether the object was selected as a candidate for matching',
771 'match_row':
'The index of the best matched row in the target table, if any',
772 'match_count':
'The number of candidate matching target objects, i.e. those within the match'
773 ' distance but excluding objects already matched to a ref object',
774 'match_chisq':
'The sum of all finite reduced chi-squared values over all match columns',
775 'match_n_chisq_finite':
'The number of match columns with finite chisq',
777 catalog_out_ref[column].description = description
778 catalog_out_target = (pd.DataFrame
if is_target_pd
else astropy.table.Table)(data_target)
780 return catalog_out_ref, catalog_out_target, exceptions