Skip to content

base


DataCollection

An Abstract base class with common interface for data collection classes.

It represents a collection of data assets on a filesystem. In future it may make sense to also support other backends, such as object storage.

This also handles automatic egress/ingress and validation of the configurations for these collections.

Parameters:

Name Type Description Default
identifier

The identifier of the data collection.

required
base_path

The base path of the data collection.

required

Raises:

Type Description
DataCollectionError

Raised if no identifier is supplied, or if path_components is not a list.

Source code in download_toolbox/base.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
class DataCollection(metaclass=ABCMeta):
    """An abstract base class with a common interface for data collection classes.

    It represents a collection of data assets on a filesystem, though in the future
    it would make sense that we also allow use for object storage etc.

    This also handles automatic egress/ingress and validation of the configurations
    for these collections.

    The collection lives at ``<base_path>/<identifier>/<path_components...>``:
    ``root_path`` is ``<base_path>/<identifier>`` and ``path`` is the root path
    extended by the path components.

    :param identifier: The identifier of the data collection.
    :param base_path: Directory under which the collection's root path is created.
        Note the default is evaluated once, when the class is defined.
    :param config_path: Location handed to the ``Configuration`` object; when None
        the collection's root path is used instead.
    :param config_type: Configuration type handed to the ``Configuration`` object.
    :param path_components: List of sub-directory names appended to the root path.
    :param dummy: If True, no directories are created on the filesystem.
    :param kwargs: Accepted for signature compatibility; not used by this class.
    :raises DataCollectionError: Raised if identifier is not specified, or if
        path_components is not a list.
    """

    @abstractmethod
    def __init__(self,
                 *,
                 identifier: str,
                 base_path: str = os.path.abspath(os.path.join(".", "data")),
                 config_path: os.PathLike = None,
                 config_type: str = "data_collection",
                 path_components: list = None,
                 dummy: bool = False,
                 **kwargs) -> None:
        self._identifier = identifier

        path_components = list() if path_components is None else path_components
        if not isinstance(path_components, list):
            # The check requires a concrete list, so say so in the message.
            raise DataCollectionError("path_components should be a list")

        # TODO: seriously: root_path, path and base_path!? Rationalise this, too smelly for words
        self._base_path = base_path
        self._dummy = dummy
        self._path_components = path_components
        self._root_path = None
        self._path = None
        self._config = None
        self._config_path = config_path
        self._config_type = config_type

        self.init()

    def copy_to(self,
                new_identifier: str,
                base_path: os.PathLike = None,
                skip_copy: bool = False):
        """Re-point this collection at a new identifier, optionally copying its data.

        This mutates the current object rather than returning a new one: the
        identifier (and base path, if given) are replaced, which recomputes the
        root path and path via :meth:`init`.

        Args:
            new_identifier: Identifier the collection should take on.
            base_path: New base path for the collection; unchanged when None.
            skip_copy: If True, the existing data is not copied to the new path.
        """
        old_path = self.path

        if base_path is not None:
            logging.info("Setting base path for copy to {}".format(base_path))
            self._base_path = base_path

        # The identifier setter calls init(), so self.path now refers to the new location
        self.identifier = new_identifier

        if not skip_copy:
            logging.info("Copying {} to {}".format(old_path, self.path))
            shutil.copytree(old_path, self.path, dirs_exist_ok=True)
        else:
            logging.warning("Skipping naive copy of {} to {}".format(old_path, self.path))

    def get_config(self,
                   config_funcs: dict = None,
                   strip_keys: list = None) -> dict:
        """get_config returns the implementation configuration for re-instantiation

        get_config returns a configuration dictionary that provides not just a reference
        but also a portability layer for recreating classes.

        For things that aren't serialisable natively, use config_funcs to serialise or represent
        values that allow recreation (it's on you to recreate those appropriately). An example
        is available at ...download_toolbox.interface.get_dataset_config_implementation

        If you supply any arguments in a derived implementation, use strip_keys to prevent
        them being exported into configurations that would then result in duplicate arguments
        when the class is recreated from config

        TODO: documenting get_config in derived implementations
        TODO: schema and validation for this library and others, helping to control implementations
         to aid portability of pipelines

        Args:
            config_funcs: Mapping of attribute name to a callable applied to that
                attribute's value before it is exported.
            strip_keys: Attribute names to leave out of the result, in addition to
                the derived path/config attributes that are always omitted.

        Returns:
            A dictionary of the instance attributes (keyed by attribute name, e.g.
            ``_identifier``) that were not excluded.
        """
        # Build the exclusion list once rather than per attribute; unpacking accepts any iterable
        excluded = ["_path", "_config", "_config_path", "_root_path",
                    *([] if strip_keys is None else strip_keys)]
        return {k: config_funcs[k](v) if config_funcs is not None and k in config_funcs else v
                for k, v in self.__dict__.items() if k not in excluded}

    def init(self):
        """(Re)compute the root path and path, creating the directory unless dummy.

        Any cached configuration object is discarded so that it is rebuilt against
        the current identifier and paths on next access.

        Raises:
            DataCollectionError: If no identifier has been supplied.
        """
        self._config = None

        # Validate before joining: os.path.join(…, None) would raise a TypeError
        # and mask the intended error.
        if self._identifier is None:
            raise DataCollectionError("No identifier supplied")

        self._root_path = os.path.join(self._base_path, self._identifier)
        self._path = os.path.join(self._root_path, *self._path_components)

        if not self._dummy:
            if os.path.exists(self._path):
                logging.debug("{} already exists".format(self._path))
            else:
                # exists() is False for a dangling symlink, so check islink() before creating
                if not os.path.islink(self._path):
                    logging.info("Creating path: {}".format(self._path))
                    os.makedirs(self._path, exist_ok=True)
                else:
                    logging.info("Skipping creation for symlink: {}".format(self._path))
        else:
            logging.debug("Avoiding creation of path {} as collection is marked dummy".format(self._path))

    def save_config(self):
        """Render this collection through its configuration object and return the result."""
        saved_config = self.config.render(self)
        logging.info("Saved dataset config {}".format(saved_config))
        return saved_config

    @property
    def base_path(self):
        """The directory under which the collection's root path lives."""
        return self._base_path

    @property
    def config(self):
        """The configuration object for this collection, created on first access."""
        if self._config is None:
            self._config = Configuration(
                config_path=self._config_path if self._config_path is not None else self._root_path,
                config_type=self._config_type,
                identifier=self.identifier)
        return self._config

    @property
    def config_path(self):
        """The output path reported by the configuration object."""
        return self.config.output_path

    @config_path.setter
    def config_path(self, config_path: os.PathLike) -> None:
        self._config_path = config_path
        # The configuration is created lazily, so there may be nothing to update yet;
        # in that case the next access to `config` is built from the stored path.
        if self._config is not None:
            self._config.output_path = config_path

    @property
    def config_type(self):
        """The configuration type passed to the configuration object."""
        return self._config_type

    @property
    def identifier(self) -> str:
        """The identifier (label) for this data collection."""
        return self._identifier

    @identifier.setter
    def identifier(self, identifier: str) -> None:
        self._identifier = identifier
        # Paths are derived from the identifier, so recompute them
        self.init()

    @property
    def path(self) -> str:
        """The full path of the data collection (root path plus path components)."""
        return self._path

    @path.setter
    def path(self, path: str) -> None:
        # NOTE: this override is replaced the next time init() runs (e.g. on identifier change)
        self._path = path

    @property
    def path_components(self):
        """The list of sub-directory names appended to the root path."""
        return self._path_components

    @property
    def root_path(self):
        """The base path joined with the identifier."""
        return self._root_path

identifier property writable

The identifier (label) for this data collection.

path property writable

The full path of the data collection: the root path (the base path joined with the identifier) extended by any path components.

copy_to(new_identifier, base_path=None, skip_copy=False)

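Re-points this collection at a new identifier and, unless skip_copy is set, copies the existing data to the new path.

Args: new_identifier: the identifier the collection should take on. base_path: an optional new base path for the collection (unchanged when None). skip_copy: if True, the data at the old path is not copied to the new path.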

Source code in download_toolbox/base.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def copy_to(self,
            new_identifier: object,
            base_path: os.PathLike = None,
            skip_copy: bool = False):
    """Re-point the collection at a new identifier, optionally copying its data.

    The object is modified in place: its base path (when given) and identifier
    are replaced, and the directory tree found at the previous path is copied
    to whatever ``self.path`` reports afterwards.

    Args:
        new_identifier: Identifier assigned to ``self.identifier``.
        base_path: Replacement base path; left untouched when None.
        skip_copy: When True, only a warning is logged and nothing is copied.
    """
    source_dir = self.path

    if base_path is not None:
        logging.info("Setting base path for copy to {}".format(base_path))
        self._base_path = base_path

    # Assigning the identifier is what moves self.path to the destination
    self.identifier = new_identifier
    target_dir = self.path

    if skip_copy:
        logging.warning("Skipping naive copy of {} to {}".format(source_dir, target_dir))
        return

    logging.info("Copying {} to {}".format(source_dir, target_dir))
    shutil.copytree(source_dir, target_dir, dirs_exist_ok=True)

get_config(config_funcs=None, strip_keys=None)

get_config returns the implementation configuration for re-instantiation

get_config returns a configuration dictionary that provides not just a reference but also a portability layer for recreating classes.

For things that aren't serialisable natively, use config_funcs to serialise or represent values that allow recreation (it's on you to recreate those appropriately). An example is available at ...download_toolbox.interface.get_dataset_config_implementation

If you supply any arguments in a derived implementation, use strip_keys to prevent them being exported into configurations that would then result in duplicate arguments when the class is recreated from config

TODO: document get_config in derived implementations.

TODO: add schema and validation for this library and others, helping to control implementations and aid portability of pipelines.

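Args: config_funcs: an optional mapping of attribute name to a callable applied to that attribute's value before export. strip_keys: optional attribute names to omit from the result, in addition to _path, _config, _config_path and _root_path, which are always omitted.

Returns: a dictionary of the remaining instance attributes, keyed by attribute name.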

Source code in download_toolbox/base.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def get_config(self,
               config_funcs: dict = None,
               strip_keys: list = None) -> dict:
    """get_config returns the implementation configuration for re-instantiation

    get_config returns a configuration dictionary that provides not just a reference
    but also a portability layer for recreating classes.

    For things that aren't serialisable natively, use config_funcs to serialise or represent
    values that allow recreation (it's on you to recreate those appropriately). An example
    is available at ...download_toolbox.interface.get_dataset_config_implementation

    If you supply any arguments in a derived implementation, use strip_keys to prevent
    them being exported into configurations that would then result in duplicate arguments
    when the class is recreated from config

    TODO: documenting get_config in derived implementations
    TODO: schema and validation for this library and others, helping to control implementations
     to aid portability of pipelines

    Args:
        config_funcs: Mapping of attribute name to a callable applied to that
            attribute's value before it is exported.
        strip_keys: Attribute names to leave out of the result, in addition to
            the derived path/config attributes that are always omitted. Any
            iterable of names is accepted.

    Returns:
        A dictionary of the instance attributes (keyed by attribute name) that
        were not excluded, in attribute definition order.
    """
    # Build the exclusion list once instead of once per attribute; unpacking
    # (rather than list + list) also lets callers pass a tuple or set.
    excluded = ["_path", "_config", "_config_path", "_root_path",
                *([] if strip_keys is None else strip_keys)]
    return {k: config_funcs[k](v) if config_funcs is not None and k in config_funcs else v
            for k, v in self.__dict__.items() if k not in excluded}