multiMatch.py
import numpy
import collections
import lsst.afw.geom
from .tableLib import SchemaMapper, CoordKey, SourceRecord

class MultiMatch(object):

    def __init__(self, schema, dataIdFormat, coordField="coord", idField="id", radius=None,
                 RecordClass=SourceRecord):
        """Initialize a multi-catalog match.

        Arguments:
          schema -------- schema shared by all catalogs to be included in the match.
          dataIdFormat -- dict of name: type for all data ID keys (e.g. {"visit": int, "ccd": int}).
          coordField ---- prefix for _ra and _dec fields that contain the coordinates to use for the match.
          idField ------- name of the field in schema that contains unique object IDs.
          radius -------- lsst.afw.geom.Angle; maximum separation for a match.  Defaults to 0.5 arcseconds.
          RecordClass --- type of record (a subclass of lsst.afw.table.BaseRecord) to expect in catalogs
                          to be matched.
        """
        if radius is None:
            radius = 0.5*lsst.afw.geom.arcseconds
        elif not isinstance(radius, lsst.afw.geom.Angle):
            raise ValueError("'radius' argument must be an Angle")
        self.radius = radius
        self.mapper = SchemaMapper(schema)
        self.mapper.addMinimalSchema(schema, True)
        self.coordKey = CoordKey(schema[coordField])
        self.idKey = schema.find(idField).key
        self.dataIdKeys = {}
        outSchema = self.mapper.editOutputSchema()
        self.objectKey = outSchema.addField("object", type=numpy.int64, doc="Unique ID for joined sources")
        for name, dataType in dataIdFormat.iteritems():
            self.dataIdKeys[name] = outSchema.addField(name, type=dataType, doc="'%s' data ID component" % name)
        # self.result will be a catalog containing the union of all matched records, with an 'object' ID
        # column that can be used to group matches.  Sources that have ambiguous matches may appear
        # multiple times.
        self.result = None
        # self.reference will be a subset of self.result, with exactly one record for each group of matches
        # (we'll use the one from the first catalog matched into this group).
        # We'll use this to match against each subsequent catalog.
        self.reference = None
        # A set of ambiguous objects that we may want to ultimately remove from the final merged catalog.
        self.ambiguous = set()
        # Table used to allocate new records for the output catalog.
        self.table = RecordClass.Table.make(self.mapper.getOutputSchema())
        # Counter used to assign the next object ID.
        self.nextObjId = 1

    def makeRecord(self, inputRecord, dataId, objId):
        """Create a new result record from the given input record, using the given data ID and object ID
        to fill in additional columns."""
        outputRecord = self.table.copyRecord(inputRecord, self.mapper)
        for name, key in self.dataIdKeys.iteritems():
            outputRecord.set(key, dataId[name])
        outputRecord.set(self.objectKey, objId)
        return outputRecord

    def add(self, catalog, dataId):
        """Add a new catalog to the match, corresponding to the given data ID.
        """
        if self.result is None:
            self.result = self.table.Catalog(self.table)
            for record in catalog:
                self.result.append(self.makeRecord(record, dataId, objId=self.nextObjId))
                self.nextObjId += 1
            self.reference = self.result.copy(deep=False)
            return
        # Temporary dict mapping object ID to reference record.
        # Entries are removed from this dict as objects are matched.
        objById = {record.get(self.objectKey): record for record in self.reference}
        # Temporary dict mapping source ID to new catalog record.
        # Entries are removed from this dict as sources are matched.
        newById = {record.get(self.idKey): record for record in catalog}
        # Temporary dict mapping new source ID to a set of associated objects.
        newToObj = {}
        matches = lsst.afw.table.matchRaDec(self.reference, catalog, self.radius)
        for refRecord, newRecord, distance in matches:
            objId = refRecord.get(self.objectKey)
            if objById.pop(objId, None) is None:
                # We've already matched this object against another new source;
                # mark it as ambiguous.
                self.ambiguous.add(objId)
            if newById.pop(newRecord.get(self.idKey), None) is None:
                # We've already matched this new source to one or more other objects;
                # mark all involved objects as ambiguous.
                self.ambiguous.add(objId)
                self.ambiguous |= newToObj.get(newRecord.get(self.idKey), set())
            # Populate the newToObj dict (the setdefault trick is an idiom for appending to a dict-of-sets).
            newToObj.setdefault(newRecord.get(self.idKey), set()).add(objId)
            # Add a new result record for this match.
            self.result.append(self.makeRecord(newRecord, dataId, objId))
        # Add any unmatched sources from the new catalog as new objects to both the joined result catalog
        # and the reference catalog.
        for newRecord in newById.itervalues():
            resultRecord = self.makeRecord(newRecord, dataId, self.nextObjId)
            self.nextObjId += 1
            self.result.append(resultRecord)
            self.reference.append(resultRecord)

    def finish(self, removeAmbiguous=True):
        """Return the final match catalog, after sorting it by object, copying it to ensure contiguousness,
        and optionally removing ambiguous matches.

        After calling finish(), the in-progress state of the matcher is reset to what it was just after
        construction, with the exception of the object ID counter (which is not reset).
        """
        if removeAmbiguous:
            result = self.table.Catalog(self.table)
            for record in self.result:
                if record.get(self.objectKey) not in self.ambiguous:
                    result.append(record)
        else:
            result = self.result
        result.sort(self.objectKey)
        result = result.copy(deep=True)
        self.result = None
        self.reference = None
        self.ambiguous = set()
        return result
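
# Usage sketch (illustrative, not part of the original module): MultiMatch is
# typically driven by constructing it with a shared schema, feeding it one
# catalog per data ID, and collecting the merged result.  The catalogs `cat1`
# and `cat2`, the schema, and the data ID values below are hypothetical:
#
#   matcher = MultiMatch(schema, dataIdFormat={"visit": int, "ccd": int},
#                        radius=0.5*lsst.afw.geom.arcseconds)
#   matcher.add(cat1, {"visit": 100, "ccd": 22})
#   matcher.add(cat2, {"visit": 101, "ccd": 22})
#   matched = matcher.finish()  # sorted by 'object'; ambiguous matches removed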


class GroupView(collections.Mapping):
    """A mapping (i.e. dict-like object) that provides convenient operations on the concatenated
    catalogs returned by a MultiMatch object.

    A GroupView provides access to a catalog of grouped objects, in which the grouping is indicated by
    a field for which all records in a group have the same value.  Once constructed, it allows operations
    similar to those supported by SQL "GROUP BY", such as filtering and aggregate calculation.
    """

    @classmethod
    def build(cls, catalog, groupField="object"):
        """!Construct a GroupView from a concatenated catalog.

        @param[in]  catalog     Input catalog, containing records grouped by a field in which all records
                                in the same group have the same value.  Must be sorted by the group field.
        @param[in]  groupField  Name or Key for the field that indicates groups.
        """
        groupKey = catalog.schema.find(groupField).key
        ids, indices = numpy.unique(catalog.get(groupKey), return_index=True)
        groups = numpy.zeros(len(ids), dtype=object)
        ends = list(indices[1:]) + [len(catalog)]
        for n, (i1, i2) in enumerate(zip(indices, ends)):
            groups[n] = catalog[i1:i2]
            assert (groups[n].get(groupKey) == ids[n]).all()
        return cls(catalog.schema, ids, groups)

    def __init__(self, schema, ids, groups):
        """Direct constructor; most users should call build() instead.

        This constructor takes the constituent arrays of the object directly, to allow multiple
        methods for construction.
        """
        self.schema = schema
        self.ids = ids
        self.groups = groups
        self.count = sum(len(cat) for cat in self.groups)

    def __len__(self):
        """Return the number of groups."""
        return len(self.ids)

    def __iter__(self):
        """Iterate over group field values."""
        return iter(self.ids)

    def __getitem__(self, key):
        """Return the catalog subset that corresponds to a group field value."""
        index = numpy.searchsorted(self.ids, key)
        if index >= len(self.ids) or self.ids[index] != key:
            raise KeyError("Group with ID {0} not found".format(key))
        return self.groups[index]

    def where(self, predicate):
        """Return a new GroupView that contains only groups for which the given predicate function
        returns True.

        The predicate function is called once for each group, and passed a single argument: the subset
        catalog for that group.
        """
        mask = numpy.zeros(len(self), dtype=bool)
        for i in xrange(len(self)):
            mask[i] = predicate(self.groups[i])
        return type(self)(self.schema, self.ids[mask], self.groups[mask])
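
    # Example (illustrative only; `groupView` is a hypothetical instance): keep
    # only objects detected in at least three of the matched catalogs:
    #
    #   deep = groupView.where(lambda cat: len(cat) >= 3)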

    def aggregate(self, function, field=None, dtype=float):
        """!Run an aggregate function on each group, returning an array with one element for each group.

        @param[in]  function  Callable object that computes the aggregate value.  If field is None,
                              called with the entire subset catalog as an argument.  If field is not
                              None, called with an array view into that field.
        @param[in]  field     A string name or Key object that indicates a single field the aggregate
                              is computed over.
        @param[in]  dtype     Data type for the output array.
        """
        result = numpy.zeros(len(self), dtype=dtype)
        if field is not None:
            key = self.schema.find(field).key
            f = lambda cat: function(cat.get(key))
        else:
            f = function
        for i in xrange(len(self)):
            result[i] = f(self.groups[i])
        return result
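
    # Example (illustrative only; `groupView` and the field name are hypothetical):
    # per-object mean of an instrumental flux column, using numpy.mean as the aggregate:
    #
    #   meanFlux = groupView.aggregate(numpy.mean, field="base_PsfFlux_flux")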

    def apply(self, function, field=None, dtype=float):
        """!Run a non-aggregate function on each group, returning an array with one element for each record.

        @param[in]  function  Callable object that computes the per-record values.  If field is None,
                              called with the entire subset catalog as an argument.  If field is not
                              None, called with an array view into that field.
        @param[in]  field     A string name or Key object that indicates a single field the function
                              is computed over.
        @param[in]  dtype     Data type for the output array.
        """
        result = numpy.zeros(self.count, dtype=dtype)
        if field is not None:
            key = self.schema.find(field).key
            f = lambda cat: function(cat.get(key))
        else:
            f = function
        last = 0
        for i in xrange(len(self)):
            next = last + len(self.groups[i])
            result[last:next] = f(self.groups[i])
            last = next
        return result
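
# Usage sketch (illustrative, not part of the original module): building a
# GroupView from the catalog returned by MultiMatch.finish() and computing a
# per-record quantity; `matched` and the flux field name are hypothetical:
#
#   groupView = GroupView.build(matched)
#   nObjects = len(groupView)
#   # Offset of each measurement from its object's mean flux (one value per record):
#   offsets = groupView.apply(lambda flux: flux - flux.mean(), field="base_PsfFlux_flux")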