doxygen/x_masterDoxyDoc/repository_iterator_8py_source.html

 #

 # LSST Data Management System

 # Copyright 2008, 2009, 2010, 2011, 2012 LSST Corporation.

 #

 # This product includes software developed by the

 # LSST Project (http://www.lsst.org/).

 #

 # This program is free software: you can redistribute it and/or modify

 # it under the terms of the GNU General Public License as published by

 # the Free Software Foundation, either version 3 of the License, or

 # (at your option) any later version.

 #

 # This program is distributed in the hope that it will be useful,

 # but WITHOUT ANY WARRANTY; without even the implied warranty of

 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    See the

 # GNU General Public License for more details.

 #

 # You should have received a copy of the LSST License Statement and

 # the GNU General Public License along with this program.  If not,

 # see <http://www.lsstcorp.org/LegalNotices/>.

 #

 """Tools to help you iterate over a set of repositories.


 Helpful while creating them or harvesting data from them.

 """

 import itertools


 import numpy


 # Used by _getDTypeList: the number of characters added to the length of the first string value
 # seen for a key when estimating the number of characters needed to store values for that key.
 STR_PADDING = 5


 def _getDTypeList(keyTuple, valTuple):
     """Build a numpy dtype description for a data ID or repository ID

     @param[in] keyTuple: ID key names, in order
     @param[in] valTuple: one tuple of ID values, in the same order as keyTuple
     @return the dtype as a list of numpy field specifications, one per key

     @warning: the width of string fields is a guess (length of the string in valTuple + STR_PADDING);
     longer strings will be truncated when inserted into numpy structured arrays
     """
     def fieldSpec(key, value):
         # Non-string values take whatever dtype numpy infers for them;
         # strings get a fixed width padded beyond the sample value.
         if not isinstance(value, str):
             return (key, numpy.array([value]).dtype)
         return (key, str, len(value) + STR_PADDING)

     return [fieldSpec(key, value) for key, value in zip(keyTuple, valTuple)]


 class SourceData:
     """Accumulate a set of measurements from a set of source tables

     To use:
     - specify the desired source measurements when constructing this object
     - call addSourceMetrics for each repository you harvest data from
     - call finalize to produce the final data

     Data available after calling finalize:
     - self.sourceArr: a numpy structured array of shape (num repositories, num sources)
         containing named columns for:
         - source ID (column "sourceId")
         - each data ID key
         - each item of data extracted from the source table
     - self.sourceIdDict: a dict of (source ID: index of axis 1 of self.sourceArr)
     - self.repoArr: a numpy structured array of shape (num repositories,)
         containing a named column for each repository key (see RepositoryIterator)

     @note: a source that is absent from a repository is stored in that repository's row of
     self.sourceArr as a zero-filled record in which only the "sourceId" column is set.

     @note: NOTE(review) earlier documentation stated that sources with non-finite data (e.g. NaN)
     for every value extracted are silently omitted; no such filtering is performed by this class --
     presumably it happens upstream; confirm.
     """

     def __init__(self, datasetType, sourceKeyTuple):
         """
         @param[in] datasetType: dataset type for source
         @param[in] sourceKeyTuple: list of keys of data items to extract from the source tables

         @raise RuntimeError if sourceKeyTuple is empty
         """
         # Convert first so that any iterable (not only sized sequences) can be checked for emptiness.
         sourceKeyTuple = tuple(sourceKeyTuple)
         if len(sourceKeyTuple) < 1:
             raise RuntimeError("Must specify at least one key in sourceKeyTuple")
         self.datasetType = datasetType
         self._sourceKeyTuple = sourceKeyTuple

         # tuple of data ID keys, in order; set by first call to _getSourceMetrics
         self._idKeyTuple = None
         # numpy dtype for data ID tuple, as a list of (key, type);
         # set by the first call to _getSourceMetrics that receives at least one data ID
         self._idKeyDTypeList = None
         # numpy dtype for source data, as a list of (key, type);
         # set by the first call to _getSourceMetrics that sees a non-empty source table
         self._sourceDTypeList = None
         # tuple of repo ID keys, in order; set by first call to addSourceMetrics
         self._repoKeyTuple = None
         # numpy dtype for repoArr, as a list of (key, type); set by first call to addSourceMetrics
         self._repoDTypeList = None

         # list (one entry per repository) of dict of
         # source ID: tuple of data ID data concatenated with source metric data, where:
         # - data ID data is in order self._idKeyTuple
         # - source metric data is in order self._sourceKeyTuple
         # set to None by finalize
         self._tempDataList = []
         # list of repoInfo, one entry per call to addSourceMetrics
         self.repoInfoList = []

     def _getSourceMetrics(self, idKeyTuple, idValList, sourceTableList):
         """Obtain the desired source measurements from a list of source tables

         Extracts a set of source measurements (specified by sourceKeyTuple) from a list of source tables
         (one per data ID) and saves them as a dict of source ID: list of data

         @param[in] idKeyTuple: a tuple of data ID keys; must be the same for each call
         @param[in] idValList: a list of data ID value tuples;
             each tuple contains values in the order in idKeyTuple
         @param[in] sourceTableList: a list of source tables, one per entry in idValList

         @return a dict of source id: data id tuple + source data tuple
             where source data tuple order matches sourceKeyTuple
             and data id tuple matches self._idKeyTuple (which is set from the first idKeyTuple);
             the dict is empty if idValList is empty or every source table is empty

         @raise RuntimeError if idKeyTuple is different than it was for the first call.

         GetRepositoryDataTask.run returns idKeyTuple and idValList; you can easily make
         a subclass of GetRepositoryDataTask that also returns sourceTableList.

         Updates instance variables:
         - self._idKeyTuple if not already set.
         - self._idKeyDTypeList if not already set and idValList is not empty.
         - self._sourceDTypeList if not already set and a non-empty source table is seen.
         """
         idKeyTuple = tuple(idKeyTuple)
         if self._idKeyTuple is None:
             self._idKeyTuple = idKeyTuple
         elif self._idKeyTuple != idKeyTuple:
             raise RuntimeError("idKeyTuple = %s != %s = first idKeyTuple; must be the same each time" %
                                (idKeyTuple, self._idKeyTuple))

         # The data ID dtype is inferred from a sample value tuple, so defer it until one is available
         # rather than failing with IndexError on an empty idValList.
         if self._idKeyDTypeList is None and len(idValList) > 0:
             self._idKeyDTypeList = _getDTypeList(keyTuple=self._idKeyTuple, valTuple=idValList[0])

         dataDict = {}
         for idTuple, sourceTable in zip(idValList, sourceTableList):
             if len(sourceTable) == 0:
                 continue

             idList = sourceTable.get("id")
             dataList = [sourceTable.get(key) for key in self._sourceKeyTuple]

             if self._sourceDTypeList is None:
                 self._sourceDTypeList = [(key, arr.dtype)
                                          for key, arr in zip(self._sourceKeyTuple, dataList)]

             # one column per key -> one row per source
             transposedDataList = list(zip(*dataList))
             del dataList

             # tuple(idTuple) lets callers pass the data ID values as any sequence
             idTuple = tuple(idTuple)
             dataDict.update((srcId, idTuple + tuple(data))
                             for srcId, data in zip(idList, transposedDataList))
         return dataDict

     def addSourceMetrics(self, repoInfo, idKeyTuple, idValList, sourceTableList):
         """Accumulate source measurements from a list of source tables.

         Once you have accumulated all source measurements, call finalize to process the data.

         @param[in] repoInfo: a RepositoryInfo instance
         @param[in] idKeyTuple: a tuple of data ID keys; must be the same for each call
         @param[in] idValList: a list of data ID value tuples;
             each tuple contains values in the order in idKeyTuple
         @param[in] sourceTableList: a list of source tables, one per entry in idValList

         @raise RuntimeError if idKeyTuple is different than it was for the first call.

         Accumulates the data in temporary cache self._tempDataList.

         @return number of sources
         """
         if self._repoKeyTuple is None:
             self._repoKeyTuple = repoInfo.keyTuple
             self._repoDTypeList = repoInfo.dtype

         dataDict = self._getSourceMetrics(idKeyTuple, idValList, sourceTableList)

         self._tempDataList.append(dataDict)
         self.repoInfoList.append(repoInfo)
         return len(dataDict)

     def finalize(self):
         """Process the accumulated source measurements to create the final data products.

         Only call this after you have added all source metrics using addSourceMetrics.

         Reads temporary cache self._tempDataList and then deletes it.

         Sets self.sourceArr, self.sourceIdDict and self.repoArr; sources are ordered along
         axis 1 of self.sourceArr by increasing source ID.

         @raise RuntimeError if no source data has been accumulated (addSourceMetrics was never called,
             every source table was empty, or finalize has already been called)
         """
         # self._tempDataList is None once finalize has run, and empty if nothing was ever added
         if not self._tempDataList:
             raise RuntimeError("No data found")
         # The dtypes are only known once at least one data ID and one non-empty table have been seen.
         if self._idKeyDTypeList is None or self._sourceDTypeList is None:
             raise RuntimeError("No data found")

         fullSrcIdSet = set()
         for dataIdDict in self._tempDataList:
             fullSrcIdSet.update(dataIdDict)
         # Fix the source order once so that sourceArr columns and sourceIdDict are guaranteed to agree.
         srcIdList = sorted(fullSrcIdSet)

         # source data
         recordDType = self._idKeyDTypeList + self._sourceDTypeList
         sourceArrDType = [("sourceId", int)] + recordDType
         # data for missing sources (only for the data in the source data dict, so excludes srcId)
         nullSourceTuple = tuple(numpy.zeros(1, dtype=recordDType)[0])

         sourceData = [[(srcId,) + srcDataDict.get(srcId, nullSourceTuple) for srcId in srcIdList]
                       for srcDataDict in self._tempDataList]

         self.sourceArr = numpy.array(sourceData, dtype=sourceArrDType)
         del sourceData

         self.sourceIdDict = dict((srcId, i) for i, srcId in enumerate(srcIdList))

         # repository data
         repoData = [repoInfo.valTuple for repoInfo in self.repoInfoList]
         self.repoArr = numpy.array(repoData, dtype=self._repoDTypeList)

         self._tempDataList = None


 class RepositoryInfo:
     """Information about one data repository

     Constructed by RepositoryIterator and used by SourceData.

     Attributes:
     - keyTuple: tuple of repository key names
     - valTuple: tuple of values, one per entry in keyTuple
     - dtype: the dtype argument, stored unchanged
     - name: the name argument, stored unchanged
     """

     def __init__(self, keyTuple, valTuple, dtype, name):
         """
         @param[in] keyTuple: repository key names, in order
         @param[in] valTuple: repository key values, in the same order as keyTuple
         @param[in] dtype: numpy dtype for a structured array of repository keys
         @param[in] name: name of the repository

         @raise RuntimeError if keyTuple and valTuple have different lengths
         """
         if len(keyTuple) != len(valTuple):
             raise RuntimeError("lengths of keyTuple=%s and valTuple=%s do not match" % (keyTuple, valTuple))
         # These attribute names are read by SourceData (repoInfo.keyTuple, .valTuple, .dtype).
         self.keyTuple = tuple(keyTuple)
         self.valTuple = tuple(valTuple)
         self.dtype = dtype
         self.name = name


 class RepositoryIterator:
     """Iterate over a set of data repositories that use a naming convention based on parameter values

     Iterating yields one RepositoryInfo for every combination of the supplied values
     (the Cartesian product of the value lists, with keys taken in sorted order).
     """

     def __init__(self, formatStr, **dataDict):
         """Construct a repository iterator from a dict of name: valueList

         @param[in] formatStr: format string using dictionary notation, e.g.: "%(foo)s_%(bar)d"
         @param[in] **dataDict: name=valueList pairs
         """
         self._formatStr = formatStr
         # Keys are sorted so that the order of values in each value tuple is reproducible.
         self._keyTuple = tuple(sorted(dataDict))
         # One numpy array of values per key, in the order of self._keyTuple;
         # numpy picks a common dtype for each list, which is reused for self._dtype.
         self._valListOfLists = [numpy.array(dataDict[key]) for key in self._keyTuple]
         self._dtype = [(key, valArr.dtype)
                        for key, valArr in zip(self._keyTuple, self._valListOfLists)]

     def __iter__(self):
         """Retrieve next RepositoryInfo object
         """
         for valTuple in itertools.product(*self._valListOfLists):
             valDict = dict(zip(self._keyTuple, valTuple))
             name = self.format(valDict)
             yield RepositoryInfo(keyTuple=self._keyTuple, valTuple=valTuple, dtype=self._dtype, name=name)

     def __len__(self):
         """Return the number of items in the iterator"""
         # The product of the value-list lengths; 1 when no keys were given,
         # which matches itertools.product yielding a single empty tuple.
         n = 1
         for valList in self._valListOfLists:
             n *= len(valList)
         return n

     def format(self, valDict):
         """Return formatted string for a specified value dictionary

         @param[in] valDict: a dict of key: value pairs that identify a repository
         """
         return self._formatStr % valDict

     def getKeyTuple(self):
         """Return a tuple of keys in the same order as items in value tuples
         """
         return self._keyTuple

     def _getDTypeList(self):
         """Get a dtype for a structured array of repository keys
         """
         return self._dtype

lsst.pipe.tasks.repositoryIterator.RepositoryInfo
Definition: repositoryIterator.py:214

lsst.pipe.tasks.repositoryIterator.RepositoryInfo.dtype
dtype
Definition: repositoryIterator.py:225

lsst.pipe.tasks.repositoryIterator.RepositoryInfo.keyTuple
keyTuple
Definition: repositoryIterator.py:223

lsst.pipe.tasks.repositoryIterator.RepositoryInfo.__init__
def __init__(self, keyTuple, valTuple, dtype, name)
Definition: repositoryIterator.py:220

lsst.pipe.tasks.repositoryIterator.RepositoryInfo.valTuple
valTuple
Definition: repositoryIterator.py:224

lsst.pipe.tasks.repositoryIterator.RepositoryInfo.name
name
Definition: repositoryIterator.py:226

lsst.pipe.tasks.repositoryIterator.RepositoryIterator
Definition: repositoryIterator.py:229

lsst.pipe.tasks.repositoryIterator.RepositoryIterator._valListOfLists
_valListOfLists
Definition: repositoryIterator.py:241

lsst.pipe.tasks.repositoryIterator.RepositoryIterator.__init__
def __init__(self, formatStr, **dataDict)
Definition: repositoryIterator.py:233

lsst.pipe.tasks.repositoryIterator.RepositoryIterator.__len__
def __len__(self)
Definition: repositoryIterator.py:253

lsst.pipe.tasks.repositoryIterator.RepositoryIterator.getKeyTuple
def getKeyTuple(self)
Definition: repositoryIterator.py:267

lsst.pipe.tasks.repositoryIterator.RepositoryIterator._keyTuple
_keyTuple
Definition: repositoryIterator.py:240

lsst.pipe.tasks.repositoryIterator.RepositoryIterator._formatStr
_formatStr
Definition: repositoryIterator.py:239

lsst.pipe.tasks.repositoryIterator.RepositoryIterator.__iter__
def __iter__(self)
Definition: repositoryIterator.py:245

lsst.pipe.tasks.repositoryIterator.RepositoryIterator.format
def format(self, valDict)
Definition: repositoryIterator.py:260

lsst.pipe.tasks.repositoryIterator.RepositoryIterator._dtype
_dtype
Definition: repositoryIterator.py:242

lsst.pipe.tasks.repositoryIterator.SourceData
Definition: repositoryIterator.py:54

lsst.pipe.tasks.repositoryIterator.SourceData._getSourceMetrics
def _getSourceMetrics(self, idKeyTuple, idValList, sourceTableList)
Definition: repositoryIterator.py:102

lsst.pipe.tasks.repositoryIterator.SourceData.repoInfoList
repoInfoList
Definition: repositoryIterator.py:100

lsst.pipe.tasks.repositoryIterator.SourceData.sourceIdDict
sourceIdDict
Definition: repositoryIterator.py:205

lsst.pipe.tasks.repositoryIterator.SourceData._tempDataList
_tempDataList
Definition: repositoryIterator.py:96

lsst.pipe.tasks.repositoryIterator.SourceData.__init__
def __init__(self, datasetType, sourceKeyTuple)
Definition: repositoryIterator.py:75

lsst.pipe.tasks.repositoryIterator.SourceData._repoDTypeList
_repoDTypeList
Definition: repositoryIterator.py:93

lsst.pipe.tasks.repositoryIterator.SourceData._repoKeyTuple
_repoKeyTuple
Definition: repositoryIterator.py:92

lsst.pipe.tasks.repositoryIterator.SourceData._sourceKeyTuple
_sourceKeyTuple
Definition: repositoryIterator.py:85

lsst.pipe.tasks.repositoryIterator.SourceData.repoArr
repoArr
Definition: repositoryIterator.py:209

lsst.pipe.tasks.repositoryIterator.SourceData.finalize
def finalize(self)
Definition: repositoryIterator.py:180

lsst.pipe.tasks.repositoryIterator.SourceData.datasetType
datasetType
Definition: repositoryIterator.py:84

lsst.pipe.tasks.repositoryIterator.SourceData._sourceDTypeList
_sourceDTypeList
Definition: repositoryIterator.py:90

lsst.pipe.tasks.repositoryIterator.SourceData.sourceArr
sourceArr
Definition: repositoryIterator.py:202

lsst.pipe.tasks.repositoryIterator.SourceData._idKeyDTypeList
_idKeyDTypeList
Definition: repositoryIterator.py:88

lsst.pipe.tasks.repositoryIterator.SourceData._idKeyTuple
_idKeyTuple
Definition: repositoryIterator.py:87

lsst.pipe.tasks.repositoryIterator.SourceData.addSourceMetrics
def addSourceMetrics(self, repoInfo, idKeyTuple, idValList, sourceTableList)
Definition: repositoryIterator.py:153

list
daf::base::PropertyList * list
Definition: fits.cc:913

set
daf::base::PropertySet * set
Definition: fits.cc:912

ast::append
std::shared_ptr< FrameSet > append(FrameSet const &first, FrameSet const &second)
Construct a FrameSet that performs two transformations in series.
Definition: functional.cc:33

astshim.fitsChanContinued.iter
def iter(self)
Definition: fitsChanContinued.py:88