doxygen/xlink_master_2021_04_01_08.31.19/pipe_tools_8py_source.html

 # This file is part of pipe_base.

 #

 # Developed for the LSST Data Management System.

 # This product includes software developed by the LSST Project

 # (http://www.lsst.org).

 # See the COPYRIGHT file at the top-level directory of this distribution

 # for details of code ownership.

 #

 # This program is free software: you can redistribute it and/or modify

 # it under the terms of the GNU General Public License as published by

 # the Free Software Foundation, either version 3 of the License, or

 # (at your option) any later version.

 #

 # This program is distributed in the hope that it will be useful,

 # but WITHOUT ANY WARRANTY; without even the implied warranty of

 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

 # GNU General Public License for more details.

 #

 # You should have received a copy of the GNU General Public License

 # along with this program.  If not, see <http://www.gnu.org/licenses/>.


 """Module defining few methods to manipulate or query pipelines.

 """


 # No one should do import * from this module

 __all__ = ["isPipelineOrdered", "orderPipeline"]


 # -------------------------------

 #  Imports of standard modules --

 # -------------------------------

 import itertools


 # -----------------------------

 #  Imports for other modules --

 # -----------------------------

 from .connections import iterConnections


 # ----------------------------------

 #  Local non-exported definitions --

 # ----------------------------------


 def _loadTaskClass(taskDef, taskFactory):

     """Import task class if necessary.


     Raises

     ------

     `ImportError` is raised when task class cannot be imported.

     `MissingTaskFactoryError` is raised when TaskFactory is needed but not

     provided.

     """

     taskClass = taskDef.taskClass

     if not taskClass:

         if not taskFactory:

             raise MissingTaskFactoryError("Task class is not defined but task "

                                           "factory instance is not provided")

         taskClass = taskFactory.loadTaskClass(taskDef.taskName)

     return taskClass


 # ------------------------

 #  Exported definitions --

 # ------------------------


 class MissingTaskFactoryError(Exception):

     """Exception raised when client fails to provide TaskFactory instance.

     """

     pass


 class DuplicateOutputError(Exception):

     """Exception raised when Pipeline has more than one task for the same

     output.

     """

     pass


 class PipelineDataCycleError(Exception):

     """Exception raised when Pipeline has data dependency cycle.

     """

     pass


 def isPipelineOrdered(pipeline, taskFactory=None):

     """Checks whether tasks in pipeline are correctly ordered.


     Pipeline is correctly ordered if for any DatasetType produced by a task

     in a pipeline all its consumer tasks are located after producer.


     Parameters

     ----------

     pipeline : `pipe.base.Pipeline`

         Pipeline description.

     taskFactory: `pipe.base.TaskFactory`, optional

         Instance of an object which knows how to import task classes. It is

         only used if pipeline task definitions do not define task classes.


     Returns

     -------

     True for correctly ordered pipeline, False otherwise.


     Raises

     ------

     `ImportError` is raised when task class cannot be imported.

     `DuplicateOutputError` is raised when there is more than one producer for a

     dataset type.

     `MissingTaskFactoryError` is raised when TaskFactory is needed but not

     provided.

     """

     # Build a map of DatasetType name to producer's index in a pipeline

     producerIndex = {}

     for idx, taskDef in enumerate(pipeline):


         for attr in iterConnections(taskDef.connections, 'outputs'):

             if attr.name in producerIndex:

                 raise DuplicateOutputError("DatasetType `{}' appears more than "

                                            "once as output".format(attr.name))

             producerIndex[attr.name] = idx


     # check all inputs that are also someone's outputs

     for idx, taskDef in enumerate(pipeline):


         # get task input DatasetTypes, this can only be done via class method

         inputs = {name: getattr(taskDef.connections, name) for name in taskDef.connections.inputs}

         for dsTypeDescr in inputs.values():

             # all pre-existing datasets have effective index -1

             prodIdx = producerIndex.get(dsTypeDescr.name, -1)

             if prodIdx >= idx:

                 # not good, producer is downstream

                 return False


     return True


 def orderPipeline(pipeline):

     """Re-order tasks in pipeline to satisfy data dependencies.


     When possible new ordering keeps original relative order of the tasks.


     Parameters

     ----------

     pipeline : `list` of `pipe.base.TaskDef`

         Pipeline description.


     Returns

     -------

     Correctly ordered pipeline (`list` of `pipe.base.TaskDef` objects).


     Raises

     ------

     `DuplicateOutputError` is raised when there is more than one producer for a

     dataset type.

     `PipelineDataCycleError` is also raised when pipeline has dependency

     cycles.  `MissingTaskFactoryError` is raised when TaskFactory is needed but

     not provided.

     """


     # This is a modified version of Kahn's algorithm that preserves order


     # build mapping of the tasks to their inputs and outputs

     inputs = {}   # maps task index to its input DatasetType names

     outputs = {}  # maps task index to its output DatasetType names

     allInputs = set()   # all inputs of all tasks

     allOutputs = set()  # all outputs of all tasks

     for idx, taskDef in enumerate(pipeline):

         # task outputs

         dsMap = {name: getattr(taskDef.connections, name) for name in taskDef.connections.outputs}

         for dsTypeDescr in dsMap.values():

             if dsTypeDescr.name in allOutputs:

                 raise DuplicateOutputError("DatasetType `{}' appears more than "

                                            "once as output".format(dsTypeDescr.name))

         outputs[idx] = set(dsTypeDescr.name for dsTypeDescr in dsMap.values())

         allOutputs.update(outputs[idx])


         # task inputs

         connectionInputs = itertools.chain(taskDef.connections.inputs, taskDef.connections.prerequisiteInputs)

         dsMap = [getattr(taskDef.connections, name).name for name in connectionInputs]

         inputs[idx] = set(dsMap)

         allInputs.update(inputs[idx])


     # for simplicity add pseudo-node which is a producer for all pre-existing

     # inputs, its index is -1

     preExisting = allInputs - allOutputs

     outputs[-1] = preExisting


     # Set of nodes with no incoming edges, initially set to pseudo-node

     queue = [-1]

     result = []

     while queue:


         # move to final list, drop -1

         idx = queue.pop(0)

         if idx >= 0:

             result.append(idx)


         # remove task outputs from other tasks inputs

         thisTaskOutputs = outputs.get(idx, set())

         for taskInputs in inputs.values():

             taskInputs -= thisTaskOutputs


         # find all nodes with no incoming edges and move them to the queue

         topNodes = [key for key, value in inputs.items() if not value]

         queue += topNodes

         for key in topNodes:

             del inputs[key]


         # keep queue ordered

         queue.sort()


     # if there is something left it means cycles

     if inputs:

         # format it in usable way

         loops = []

         for idx, inputNames in inputs.items():

             taskName = pipeline[idx].label

             outputNames = outputs[idx]

             edge = "   {} -> {} -> {}".format(inputNames, taskName, outputNames)

             loops.append(edge)

         raise PipelineDataCycleError("Pipeline has data cycles:\n" + "\n".join(loops))


     return [pipeline[idx] for idx in result]

lsst.pipe.base.pipeTools.DuplicateOutputError
Definition: pipeTools.py:71

lsst.pipe.base.pipeTools.MissingTaskFactoryError
Definition: pipeTools.py:65

lsst.pipe.base.pipeTools.PipelineDataCycleError
Definition: pipeTools.py:78

set
daf::base::PropertySet * set
Definition: fits.cc:912

lsst.pex.config.history.format
def format(config, name=None, writeSourceLine=True, prefix="", verbose=False)
Definition: history.py:174

lsst.pipe.base.connections.iterConnections
typing.Generator[BaseConnection, None, None] iterConnections(PipelineTaskConnections connections, Union[str, Iterable[str]] connectionType)
Definition: connections.py:503

lsst.pipe.base.pipeTools.isPipelineOrdered
def isPipelineOrdered(pipeline, taskFactory=None)
Definition: pipeTools.py:84

lsst.pipe.base.pipeTools.orderPipeline
def orderPipeline(pipeline)
Definition: pipeTools.py:135