ingestSourcesTask.py
import MySQLdb
import math
import re
import sys
import traceback

import lsst.afw.table as afwTable
import lsst.daf.base as dafBase
import lsst.daf.persistence as dafPersist
import lsst.pex.config as pexConfig
import lsst.pipe.base as pipeBase

class ColumnFormatter(object):
    """A class to format a column in an afw.SourceCatalog.

    A little tricky because a column's values may be composite entities
    (coordinates, matrices, etc.).

    This class is basically a container for a SQL type, a function returning
    SQL column names, and a function returning the formatted value of a
    column."""

    def __init__(self, sqlType, columnNameCallable, formatValueCallable):
        """Store the column formatting information."""
        self.sqlType = sqlType
        self.columnNameCallable = columnNameCallable
        self.formatValueCallable = formatValueCallable

    def getSqlType(self):
        """Return the SQL type (e.g. BIGINT, DOUBLE) for the column's basic
        values."""
        return self.sqlType

    def getColumnNames(self, baseName):
        """Return an iterable of the names that should be used for columns
        in SQL, given a SQL-compatible base name derived from the catalog's
        column name."""
        return self.columnNameCallable(baseName)

    def formatValue(self, value):
        """Return a string suitable for inclusion in an INSERT/REPLACE
        statement (not a CSV file) resulting from formatting the column's
        value. One value is produced for each of the column names returned
        by getColumnNames(), separated by commas. This method also converts
        None values to SQL NULLs."""
        if value is None:
            return "NULL"
        return self.formatValueCallable(value)


def _formatNumber(fmt, number):
    """Auxiliary function for formatting a number, handling conversion of
    NaN and infinities to NULL."""
    if math.isnan(number) or math.isinf(number):
        return "NULL"
    return fmt % (number,)


def _formatList(fmt, values):
    """Auxiliary function for formatting a sequence of numbers using a
    common format, joining the results with commas."""
    return ", ".join([_formatNumber(fmt, x) for x in values])

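# A quick illustration of the helpers above (not part of the original
# module): non-finite values become SQL NULL, finite values use the given
# printf-style format.
#
#   _formatNumber("%.9g", float("nan"))  ->  "NULL"
#   _formatNumber("%.9g", 1.25)          ->  "1.25"
#   _formatList("%d", (3, 4, 5))         ->  "3, 4, 5"
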
63 """Describe how to handle each of the column types. Array and Cov (plain)
64 types are not yet processed."""
65 columnFormatters = dict(
66  Flag = ColumnFormatter("BIT", lambda x: (x,),
67  lambda v: "1" if v else "0"),
68  I = ColumnFormatter("INT", lambda x: (x,),
69  lambda v: str(v)),
70  L = ColumnFormatter("BIGINT", lambda x: (x,),
71  lambda v: str(v)),
72  F = ColumnFormatter("FLOAT", lambda x: (x,),
73  lambda v: _formatNumber("%.9g", v)),
74  D = ColumnFormatter("DOUBLE", lambda x: (x,),
75  lambda v: _formatNumber("%.17g", v)),
76  Angle = ColumnFormatter("DOUBLE", lambda x: (x,),
77  lambda v: _formatNumber("%.17g", v.asDegrees())),
78  Coord = ColumnFormatter("DOUBLE", lambda x: (x + "_ra", x + "_dec"),
79  lambda v: _formatList("%.17g",
80  (v.getRa().asDegrees(), v.getDec().asDegrees()))),
81  PointI = ColumnFormatter("INT", lambda x: (x + "_x", x + "_y"),
82  lambda v: _formatList("%d", (v[0], v[1]))),
83  PointF = ColumnFormatter("FLOAT", lambda x: (x + "_x", x + "_y"),
84  lambda v: _formatList("%.9g", (v[0], v[1]))),
85  PointD = ColumnFormatter("DOUBLE", lambda x: (x + "_x", x + "_y"),
86  lambda v: _formatList("%.17g", (v[0], v[1]))),
87  MomentsF = ColumnFormatter("FLOAT",
88  lambda x: (x + "_xx", x + "_xy", x + "_yy"),
89  lambda v: _formatList("%.9g",
90  (v.getIxx(), v.getIxy(), v.getIyy()))),
91  MomentsD = ColumnFormatter("DOUBLE",
92  lambda x: (x + "_xx", x + "_xy", x + "_yy"),
93  lambda v: _formatList("%.17g",
94  (v.getIxx(), v.getIxy(), v.getIyy()))),
95  CovPointF = ColumnFormatter("FLOAT",
96  lambda x: (x + "_xx", x + "_xy", x + "_yy"),
97  lambda v: _formatList("%.9g", (v[0, 0], v[0, 1], v[1, 1]))),
98  CovPointD = ColumnFormatter("DOUBLE",
99  lambda x: (x + "_xx", x + "_xy", x + "_yy"),
100  lambda v: _formatList("%.17g", (v[0, 0], v[0, 1], v[1, 1]))),
101  CovMomentsF = ColumnFormatter("FLOAT",
102  lambda x: (x + "_xx_xx", x + "_xx_xy", x + "_xx_yy",
103  x + "_xy_xy", x + "_xy_yy", x + "_yy_yy"),
104  lambda v: _formatList("%.9g",
105  (v[0, 0], v[0, 1], v[0, 2], v[1, 1], v[1, 2], v[2, 2]))),
106  CovMomentsD = ColumnFormatter("DOUBLE",
107  lambda x: (x + "_xx_xx", x + "_xx_xy", x + "_xx_yy",
108  x + "_xy_xy", x + "_xy_yy", x + "_yy_yy"),
109  lambda v: _formatList("%.17g",
110  (v[0, 0], v[0, 1], v[0, 2], v[1, 1], v[1, 2], v[2, 2])))
111  )
112 
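# Sketch of how a new column type would be registered, as described in the
# IngestSourcesTask docstring below. The "ArrayF" key and the fixed
# three-element layout are assumptions for illustration, not the actual
# afw::table API:
#
#   columnFormatters["ArrayF"] = ColumnFormatter(
#       "FLOAT",
#       lambda x: tuple("%s_%d" % (x, i) for i in range(3)),
#       lambda v: _formatList("%.9g", (v[0], v[1], v[2])))
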
class IngestSourcesConfig(pexConfig.Config):
    """Configuration for the IngestSourcesTask."""
    allowReplace = pexConfig.Field(
        "Allow replacement of existing rows with the same source IDs",
        bool, default=False)
    maxQueryLen = pexConfig.Field(
        "Maximum length of a query string."
        " None means use a non-standard, database-specific way to get"
        " the maximum.",
        int, optional=True, default=None)
    idColumnName = pexConfig.Field(
        "Name of unique identifier column",
        str, default="id")
    remap = pexConfig.DictField(
        "Column name remapping. "
        "key = normal SQL column name, value = desired SQL column name",
        keytype=str, itemtype=str,
        optional=True,
        default={"coord_ra": "ra", "coord_dec": "decl"})
    extraColumns = pexConfig.Field(
        "Extra column definitions, comma-separated, to put into the"
        " CREATE TABLE statement if the table is being created",
        str, optional=True, default="")

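# A config override file for this task might set these fields as follows
# (a hedged sketch; the values are illustrative only):
#
#   config.allowReplace = True
#   config.remap = {"coord_ra": "ra", "coord_dec": "decl"}
#   config.extraColumns = "htmId20 BIGINT, chunkId INT"
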
class IngestSourcesTaskRunner(pipeBase.TaskRunner):
    @staticmethod
    def getTargetList(parsedCmd):
        """Override the target list to add additional run() method
        parameters."""
        return pipeBase.TaskRunner.getTargetList(parsedCmd,
                                                 dstype=parsedCmd.dstype,
                                                 tableName=parsedCmd.tableName,
                                                 host=parsedCmd.host,
                                                 db=parsedCmd.db,
                                                 port=parsedCmd.port,
                                                 user=parsedCmd.user)

    def precall(self, parsedCmd):
        """Override precall so that schemas are not written, config
        persistence is not required, and the task's name is set
        appropriately."""
        self.TaskClass._DefaultName += "_" + parsedCmd.dstype
        task = self.TaskClass(config=self.config, log=self.log)
        try:
            task.writeConfig(parsedCmd.butler, clobber=self.clobberConfig)
        except Exception as e:
            # There is often no mapping for the config dataset; in any case,
            # just skip persisting it.
            task.log.warn("Could not persist config: %s" % (e,))
        return True


class IngestSourcesTask(pipeBase.CmdLineTask):
    """Task to ingest a SourceCatalog of arbitrary schema into a database
    table.

    This task connects to a database using connection information given
    through command line arguments or run() parameters. It attempts to use
    a .my.cnf file if present (by not specifying a password) and falls back
    to credentials obtained via the DbAuth interface if that fails.

    If run from the command line, it will ingest each catalog of Sources
    specified by a data id and dataset type. A sample command line might
    look like:

        $DATAREL_DIR/bin/ingest/ingestSources.py
            {repository path}
            --host lsst-db.ncsa.illinois.edu
            --database {user}_S12_sdss_u_s2012prod_{runid}
            --table DiaSources
            --dstype goodSeeingDiff_src
            --id run=... camcol=... filter=... field=...

    As usual for tasks, multiple --id options may be specified, and ranges
    and lists of values may be given for data id keys.

    There are also two methods (ingest() and runFile()) that can be called
    manually to ingest catalogs, either by passing the catalog explicitly or
    by passing the name of a FITS file containing the catalog. Both, like
    run(), require database connection information.

    The ingestion process creates the destination table in the database if
    it doesn't exist, translating the schema from the source catalog's
    schema. The database table must contain a unique identifier column,
    named by the idColumnName configuration parameter. The only index
    provided is a unique one on this id field. (Additional ones can be
    created later, of course.) Columns can be renamed using the remap
    configuration parameter. The names to be remapped have already been
    canonicalized (for now, by changing any non-word characters to
    underscores) and may have additional subfield tags appended (such as
    "_ra" or "_y"). Extra columns (e.g. ones to be filled in later by
    spatial indexing code) may be added to the table via the extraColumns
    configuration parameter.

    Note that "nullable integer" columns are not provided. There is no way
    to represent these explicitly in the source catalog, and translating 0
    to NULL seems to have little value and might be error-prone. (An option
    could be provided to do this if it turns out to be necessary.)

    Also note that covariances and moments are assumed to be in pixel space
    (or something else) rather than angular space, and so do not need
    radians-to-degrees conversion.

    If the table does exist, one row of the input (the first) is checked to
    see if it already exists in the destination table. If it does, the
    ingestion fails unless the allowReplace configuration parameter is set
    to True.

    Rows are inserted into the database via INSERT statements. As many rows
    as possible are packed into each INSERT to maximize throughput. The
    limit on INSERT statement length is either set by configuration or
    determined by querying the database (in a MySQL-specific way). This may
    not be as efficient in its use of the database as converting to CSV and
    doing a bulk load, but it avoids the use of (often shared) disk
    resources. The use of INSERTs (committed once at the end) may not be
    fully parallelizable (particularly due to the unique id index), but
    tests seem to indicate that executing many such INSERTs in parallel is
    at least not much slower than executing them all sequentially. This
    remains an area for future optimization.

    The columnFormatters dictionary is used to determine how to format each
    type of column in the source catalog. If new column types are added to
    afw::table and are used in Source catalogs, they should also be added
    there. While lambdas are used for the formatting functions for
    compactness, they can be any callable (and so can handle more complex
    logic than can be embedded in a lambda -- e.g. checking a column's units
    to see if it needs to be converted from radians to degrees).
    """

    ConfigClass = IngestSourcesConfig
    _DefaultName = "ingestSources"
    RunnerClass = IngestSourcesTaskRunner

    @classmethod
    def _makeArgumentParser(cls):
        """Extend the default argument parser with database-specific
        arguments and the dataset type for the Sources to be read."""
        parser = pipeBase.ArgumentParser(name=cls._DefaultName)
        parser.add_argument("-H", "--host", dest="host", required=True,
                            help="Database hostname")
        parser.add_argument("-D", "--database", dest="db", required=True,
                            help="Database name")
        parser.add_argument("-U", "--user", dest="user",
                            help="Database username (optional)", default=None)
        parser.add_argument("-P", "--port", dest="port",
                            help="Database port number (optional)",
                            default=3306)
        parser.add_argument("-t", "--table", dest="tableName", required=True,
                            help="Table to ingest into")
        # Use DatasetArgument to require that the dataset type be specified
        # on the command line.
        parser.add_id_argument("--id", pipeBase.DatasetArgument("dstype"),
                               help="Source dataset data id to ingest")
        return parser

    def runFile(self, fileName, tableName, host, db, port=3306, user=None):
        """Ingest a SourceCatalog specified by a filename."""
        cat = afwTable.SourceCatalog.readFits(fileName)
        self.ingest(cat, tableName, host, db, port, user)

    def run(self, dataRef, dstype, tableName, host, db, port=3306, user=None):
        """Ingest a SourceCatalog specified by a dataref and dataset type."""
        self.ingest(dataRef.get(dstype), tableName, host, db, port, user)

    @pipeBase.timeMethod
    def ingest(self, cat, tableName, host, db, port=3306, user=None):
        """Ingest a SourceCatalog passed as an object.

        @param cat       (SourceCatalog) Catalog to ingest.
        @param tableName (str) Name of the database table to create.
        @param host      (str) Name of the database host machine.
        @param db        (str) Name of the database to ingest into.
        @param port      (int) Port number on the database host.
        @param user      (str) User name to use for the database."""

        try:
            # See if we can connect without a password (e.g. via .my.cnf).
            self.db = MySQLdb.connect(host=host, port=port, user=user, db=db)
        except Exception:
            # Fall back to credentials obtained via DbAuth.
            user = dafPersist.DbAuth.username(host, str(port))
            passwd = dafPersist.DbAuth.password(host, str(port))
            self.db = MySQLdb.connect(host=host, port=port,
                                      user=user, passwd=passwd, db=db)
        self.tableName = tableName

        # Determine the maximum query length (MySQL-specific) if not
        # configured.
        if self.config.maxQueryLen is None:
            self.maxQueryLen = int(self._getSqlScalar("""
                SELECT variable_value
                FROM information_schema.session_variables
                WHERE variable_name = 'max_allowed_packet';"""))
        else:
            self.maxQueryLen = self.config.maxQueryLen

        # Ingest the catalog by converting it to one or more (large) INSERT
        # or REPLACE statements, executing those statements, and committing
        # the result.
        tableName = self.db.escape_string(self.tableName)
        self._checkTable(tableName, cat)
        pos = 0
        while pos < len(cat):
            if self.config.allowReplace:
                sql = "REPLACE"
            else:
                sql = "INSERT"
            sql += " INTO `%s` (" % (tableName,)
            keys = []
            firstCol = True
            for col in cat.schema:
                if col.field.getTypeString() not in columnFormatters:
                    self.log.warn(
                        "Skipping complex column: {name} ({type})".format(
                            name=col.field.getName(),
                            type=col.field.getTypeString()))
                    continue
                formatter = columnFormatters[col.field.getTypeString()]
                keys.append((col.key, formatter))
                if firstCol:
                    firstCol = False
                else:
                    sql += ", "
                sql += self._columnDef(col, includeTypes=False)
            sql += ") VALUES "
            initialPos = pos
            maxValueLen = self.maxQueryLen - len(sql)
            while pos < len(cat):
                source = cat[pos]
                value = "("
                value += ", ".join([formatter.formatValue(source.get(key))
                                    for (key, formatter) in keys])
                value += "), "
                maxValueLen -= len(value)
                if maxValueLen < 0:
                    break
                else:
                    sql += value
                    pos += 1
            if pos == initialPos:
                # We have made no progress, so a single row must be too
                # large to fit in one statement.
                raise RuntimeError("Single row too large to insert")
            self._executeSql(sql[:-2] + ";")
        self.db.commit()

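    # For a catalog whose only ingestible columns are an "id" (type L) and a
    # "coord" (type Coord, remapped per the default config), each generated
    # statement has roughly this shape (values are illustrative, not
    # captured from a real run):
    #
    #   INSERT INTO `Sources` (id, ra, decl) VALUES
    #   (1234, 214.871, 52.812), (1235, 214.882, 52.807), ... ;
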
    def _executeSql(self, sql):
        """Execute a SQL query with no expectation of a result."""
        self.log.logdebug("executeSql: " + sql)
        self.db.query(sql)

    def _getSqlScalar(self, sql):
        """Execute a SQL query and return a single scalar result."""
        cur = self.db.cursor()
        self.log.logdebug("getSqlScalar: " + sql)
        rows = cur.execute(sql)
        if rows != 1:
            raise RuntimeError(
                "Wrong number of rows (%d) for scalar query: %s" %
                (rows, sql))
        row = cur.fetchone()
        self.log.logdebug("Result: " + str(row))
        return row[0]

    def _checkTable(self, tableName, cat):
        """Check that a table exists by selecting a row from it. If the row
        contains the unique id of the first item in the input SourceCatalog,
        assume that the rest are present as well. If the table does not
        exist, create it."""

        sampleId = cat[0][self.config.idColumnName]
        count = 0
        try:
            count = self._getSqlScalar(
                "SELECT COUNT(*) FROM `%s` WHERE %s = %d;" % (
                    tableName, self.config.idColumnName, sampleId))
        except RuntimeError:
            raise
        except Exception:
            # The table most likely does not exist yet; create it below.
            pass
        if count == 0:
            self._createTable(tableName, cat.schema)
        elif self.config.allowReplace:
            self.log.warn("Overwriting existing rows")
        else:
            raise RuntimeError("Row exists: {name}={id}".format(
                name=self.config.idColumnName, id=sampleId))

    def _createTable(self, tableName, schema):
        """Create a table using column definitions based on the provided
        table schema, adding in any extra columns specified in the config.
        The unique id column is given a key."""
        sql = "CREATE TABLE IF NOT EXISTS `%s` (" % (tableName,)
        sql += ", ".join([self._columnDef(col) for col in schema if
                          col.field.getTypeString() in columnFormatters])
        if self.config.extraColumns is not None and self.config.extraColumns != "":
            sql += ", " + self.config.extraColumns
        sql += ", UNIQUE(%s)" % (self.config.idColumnName,)
        sql += ");"
        self._executeSql(sql)

    def _columnDef(self, col, includeTypes=True):
        """Return the column definition for a given schema column, which may
        be composed of multiple database columns (separated by commas). If
        includeTypes is True (the default), include the SQL type for each
        column, as for a CREATE TABLE statement."""
        formatter = columnFormatters[col.field.getTypeString()]
        baseName = self._canonicalizeName(col.field.getName())
        columnType = " " + formatter.getSqlType() if includeTypes else ""
        return ", ".join(["%s%s" % (self._remapColumn(columnName), columnType)
                          for columnName in formatter.getColumnNames(baseName)])

    def _remapColumn(self, colName):
        """Remap a column name according to the remap dictionary in the
        config."""
        if colName in self.config.remap:
            return self.config.remap[colName]
        return colName

    def _canonicalizeName(self, colName):
        """Return a SQL-compatible version of the schema column name."""
        return re.sub(r'[^\w]', '_', colName)
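

# Programmatic use (a hedged sketch; the host, database, and file names are
# placeholders, not real services): ingest a catalog previously written to
# a FITS file.
#
#   task = IngestSourcesTask()
#   task.runFile("src.fits", tableName="Sources",
#                host="lsst-db.example.edu", db="mydb", user="alice")
#
# If the initial connection without a password fails, ingest() falls back
# to credentials supplied by the DbAuth interface.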