23 """Tools to help you iterate over a set of repositories.
25 Helpful while creating them or harvesting data from them.
35 """Construct a numpy dtype for a data ID or repository ID
37 @param[in] keyTuple: ID key names, in order
38 @param[in] valTuple: a value tuple
39 @return numpy dtype as a list
41 @warning: this guesses at string length (STR_PADDING + length of string in valTuple);
42 longer strings will be truncated when inserted into numpy structured arrays
45 for name, val
in itertools.izip(keyTuple, valTuple):
46 if isinstance(val, str):
47 predLen = len(val) + STR_PADDING
48 typeList.append((name, str, predLen))
50 typeList.append((name, numpy.array([val]).dtype))
54 """Accumulate a set of measurements from a set of source tables
57 - specify the desired source measurements when constructing this object
58 - call addSourceMetrics for each repository you harvest data from
59 - call finalize to produce the final data
61 Data available after calling finalize:
62 - self.sourceArr: a numpy structured array of shape (num repositories, num sources)
63 containing named columns for:
66 - each item of data extracted from the source table
67 - self.sourceIdDict: a dict of (source ID: index of axis 1 of self.sourceArr)
68 - self.repoArr: a numpy structured array of shape (num repositories,)
69 containing a named column for each repository key (see RepositoryIterator)
71 @note: sources that had non-finite data (e.g. NaN) for every value extracted are silently omitted
73 def __init__(self, datasetType, sourceKeyTuple):
75 @param[in] datasetType: dataset type for source
76 @param[in] sourceKeyTuple: list of keys of data items to extract from the source tables
78 @raise RuntimeError if sourceKeyTuple is empty
80 if len(sourceKeyTuple) < 1:
81 raise RuntimeError(
"Must specify at least one key in sourceKeyTuple")
101 """Obtain the desired source measurements from a list of source tables
103 Extracts a set of source measurements (specified by sourceKeyTuple) from a list of source tables
104 (one per data ID) and saves them as a dict of source ID: list of data
106 @param[in] idKeyTuple: a tuple of data ID keys; must be the same for each call
107 @param[in] idValList: a list of data ID value tuples;
108 each tuple contains values in the order in idKeyTuple
109 @param[in] sourceTableList: a list of source tables, one per entry in idValList
111 @return a dict of source id: data id tuple + source data tuple
112 where source data tuple order matches sourceKeyTuple
113 and data id tuple matches self._idKeyTuple (which is set from the first idKeyTuple)
115 @raise RuntimeError if idKeyTuple is different than it was for the first call.
117 GetRepositoryDataTask.run returns idKeyTuple and idValList; you can easily make
118 a subclass of GetRepositoryDataTask that also returns sourceTableList.
120 Updates instance variables:
121 - self._idKeyTuple if not already set.
126 valTuple = idValList[0])
129 raise RuntimeError(
"idKeyTuple = %s != %s = first idKeyTuple; must be the same each time" % \
133 for idTuple, sourceTable
in itertools.izip(idValList, sourceTableList):
134 if len(sourceTable) == 0:
137 idList = sourceTable.get(
"id")
144 transposedDataList = zip(*dataList)
147 dataDict.update((srcId, idTuple + tuple(data))
148 for srcId, data
in itertools.izip(idList, transposedDataList))
152 """Accumulate source measurements from a list of source tables.
154 Once you have accumulated all source measurements, call finalize to process the data.
156 @param[in] repoInfo: a RepositoryInfo instance
157 @param[in] idKeyTuple: a tuple of data ID keys; must be the same for each call
158 @param[in] idValList: a list of data ID value tuples;
159 each tuple contains values in the order in idKeyTuple
160 @param[in] sourceTableList: a list of source tables, one per entry in idValList
162 @raise RuntimeError if idKeyTuple is different than it was for the first call.
164 Accumulates the data in temporary cache self._tempDataList.
166 @return number of sources
174 self._tempDataList.append(dataDict)
175 self.repoInfoList.append(repoInfo)
179 """Process the accumulated source measurements to create the final data products.
181 Only call this after you have added all source metrics using addSourceMetrics.
183 Reads temporary cache self._tempDataList and then deletes it.
186 raise RuntimeError(
"No data found")
190 fullSrcIdSet.update(dataIdDict.iterkeys())
197 sourceData = [[(srcId,) + srcDataDict.get(srcId, nullSourceTuple)
for srcId
in fullSrcIdSet]
200 self.
sourceArr = numpy.array(sourceData, dtype=sourceArrDType)
203 self.
sourceIdDict = dict((srcId, i)
for i, srcId
in enumerate(fullSrcIdSet))
206 repoData = [repoInfo.valTuple
for repoInfo
in self.
repoInfoList]
213 """Information about one data repository
215 Constructed by RepositoryIterator and used by SourceData.
217 def __init__(self, keyTuple, valTuple, dtype, name):
218 if len(keyTuple) != len(valTuple):
219 raise RuntimeError(
"lengths of keyTuple=%s and valTuple=%s do not match" % (keyTuple, valTuple))
227 """Iterate over a set of data repositories that use a naming convention based on parameter values
230 """Construct a repository iterator from a dict of name: valueList
232 @param[in] formatStr: format string using dictionary notation, e.g.: "%(foo)s_%(bar)d"
233 @param[in] **dataDict: name=valueList pairs
242 """Retrieve next RepositoryInfo object
245 valDict = dict(zip(self.
_keyTuple, valTuple))
250 """Return the number of items in the iterator"""
257 """Return formatted string for a specified value dictionary
259 @param[in] valDict: a dict of key: value pairs that identify a repository
264 """Return a tuple of keys in the same order as items in value tuples
269 """Get a dtype for a structured array of repository keys