Skip to content

Tabular

siapy.datasets.tabular

MetaDataEntity

Bases: BaseModel

image_idx instance-attribute

image_idx: int

image_filepath instance-attribute

image_filepath: Path

camera_id instance-attribute

camera_id: str

shape_idx instance-attribute

shape_idx: int

shape_type instance-attribute

shape_type: str

shape_label instance-attribute

shape_label: str | None

geometry_idx instance-attribute

geometry_idx: int

TabularDataEntity

Bases: MetaDataEntity

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

signatures instance-attribute

signatures: Signatures

image_idx instance-attribute

image_idx: int

image_filepath instance-attribute

image_filepath: Path

camera_id instance-attribute

camera_id: str

shape_idx instance-attribute

shape_idx: int

shape_type instance-attribute

shape_type: str

shape_label instance-attribute

shape_label: str | None

geometry_idx instance-attribute

geometry_idx: int

TabularDataset dataclass

TabularDataset(container: ImageContainerType)

Creates a tabular dataset that can extract and organize spectral signatures from geometric shapes within spectral images for analysis and modeling.

PARAMETER DESCRIPTION
container

Either a single SpectralImage or a SpectralImageSet containing multiple spectral images to process.

TYPE: ImageContainerType

Example
from siapy.entities import SpectralImage, SpectralImageSet
from siapy.datasets import TabularDataset

# With a single image
image = SpectralImage.open_rasterio("path/to/image.tif")
dataset = TabularDataset(image)

# With multiple images
image_set = SpectralImageSet([image1, image2])
dataset = TabularDataset(image_set)
Source code in siapy/datasets/tabular.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def __init__(self, container: ImageContainerType):
    """Build a TabularDataset around one or more spectral images.

    A lone SpectralImage is wrapped in a one-element SpectralImageSet so that
    the dataset always holds an image set; any other container is stored as
    given. The list of data entities starts out empty.

    Args:
        container: A single SpectralImage, or a SpectralImageSet holding
            several spectral images.

    Example:
        ```python
        from siapy.entities import SpectralImage, SpectralImageSet
        from siapy.datasets import TabularDataset

        # With a single image
        image = SpectralImage.open_rasterio("path/to/image.tif")
        dataset = TabularDataset(image)

        # With multiple images
        image_set = SpectralImageSet([image1, image2])
        dataset = TabularDataset(image_set)
        ```
    """
    if isinstance(container, SpectralImage):
        # Normalize the single-image case to a one-element set.
        image_set = SpectralImageSet([container])
    else:
        image_set = container
    self._image_set = image_set
    self._data_entities: list[TabularDataEntity] = []

image_set property

image_set: SpectralImageSet

Get the spectral image set being processed.

RETURNS DESCRIPTION
SpectralImageSet

The SpectralImageSet containing all spectral images in this dataset.

Note

This is the original image set provided during initialization, possibly converted from a single SpectralImage.

data_entities property

data_entities: list[TabularDataEntity]

Get all processed data entities.

RETURNS DESCRIPTION
list[TabularDataEntity]

A list of TabularDataEntity objects, each containing spectral signatures and metadata for a geometric shape instance within the image set.

Note

This list will be empty until process_image_data() is called. Each entity represents signatures extracted from one geometric shape in one image.

process_image_data

process_image_data() -> None

Extract spectral signatures from geometric shapes in all images.

Processes each image in the image set, extracting spectral signatures from within the convex hull of each geometric shape. Creates TabularDataEntity objects containing the signatures along with associated metadata.

Side Effects
  • Clears any existing data entities
  • Populates the data_entities list with new TabularDataEntity objects
  • Each geometric shape may produce multiple entities if signatures are organized into multiple groups
Note

This method must be called before accessing data entities through iteration, indexing, or generate_dataset_data().

Example
dataset = TabularDataset(image_set)
dataset.process_image_data()
print(f"Processed {len(dataset)} data entities")
Source code in siapy/datasets/tabular.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def process_image_data(self) -> None:
    """Rebuild the data entities from every shape of every image.

    Walks the image set, and for each geometric shape of each image calls
    `get_signatures_within_convex_hull(image, shape)`. Every item yielded by
    that call becomes one TabularDataEntity carrying the signatures together
    with the image, shape and geometry indices and the image/shape metadata.

    Side Effects:
        - Empties the `data_entities` list before processing
        - Appends one TabularDataEntity per (image, shape, geometry) triple,
          so a single shape can contribute several entities

    Note:
        `data_entities` stays empty until this method has been called, so run
        it before iterating, indexing, or calling `generate_dataset_data()`.

    Example:
        ```python
        dataset = TabularDataset(image_set)
        dataset.process_image_data()
        print(f"Processed {len(dataset)} data entities")
        ```
    """
    self.data_entities.clear()
    for img_no, img in enumerate(self.image_set):
        for shp_no, shp in enumerate(img.geometric_shapes.shapes):
            hull_groups = get_signatures_within_convex_hull(img, shp)
            for geom_no, group in enumerate(hull_groups):
                # Positional indices first, then image- and shape-level metadata.
                fields = dict(
                    image_idx=img_no,
                    shape_idx=shp_no,
                    geometry_idx=geom_no,
                    image_filepath=img.filepath,
                    camera_id=img.camera_id,
                    shape_type=shp.shape_type,
                    shape_label=shp.label,
                    signatures=group,
                )
                record = TabularDataEntity(**fields)
                self.data_entities.append(record)

generate_dataset_data

generate_dataset_data(
    mean_signatures: bool = True,
) -> TabularDatasetData

Generate structured dataset data for analysis or export.

Combines all spectral signatures and metadata from processed data entities into a unified TabularDatasetData structure suitable for machine learning or statistical analysis.

PARAMETER DESCRIPTION
mean_signatures

If True, compute the mean of all signatures within each data entity. If False, include all individual signature measurements. Defaults to True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
TabularDatasetData

A TabularDatasetData object containing:
- signatures: Combined Signatures object with spectral data
- metadata: DataFrame with image and shape metadata for each signature
- (optional) Target values — not populated by this method, which passes only signatures and metadata to TabularDatasetData.

RAISES DESCRIPTION
InvalidInputError

If no data entities exist (image data hasn't been processed yet).

Note

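The metadata DataFrame columns correspond to MetaDataEntity fields: image_idx, image_filepath, camera_id, shape_idx, shape_type, shape_label, geometry_idx. The index fields (image_idx, shape_idx, geometry_idx) and image_filepath are stored as strings in the DataFrame.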

Example
dataset.process_image_data()

# Get one averaged signature per data entity
data = dataset.generate_dataset_data(mean_signatures=True)

# Get all individual signature measurements
data_detailed = dataset.generate_dataset_data(mean_signatures=False)

print(f"Signatures shape: {data.signatures.to_numpy().shape}")
print(f"Metadata shape: {data.metadata.shape}")
Source code in siapy/datasets/tabular.py
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
def generate_dataset_data(self, mean_signatures: bool = True) -> TabularDatasetData:
    """Assemble all processed entities into one TabularDatasetData.

    For every data entity the signatures are converted to a DataFrame and rows
    containing NaN are dropped. A metadata DataFrame with one row per remaining
    signature row is built alongside it. The per-entity frames are then
    concatenated (with a fresh index) into a single signatures object and a
    single metadata table.

    Args:
        mean_signatures: If True, each entity is reduced to a single row
            holding the column-wise mean of its signatures. If False, every
            individual signature row is kept. Defaults to True.

    Returns:
        A TabularDatasetData built from: <br>
            - signatures: Signatures created from the concatenated signature rows <br>
            - metadata: DataFrame with one metadata row per signature row.

    Raises:
        InvalidInputError: If no data entities exist (image data hasn't been
            processed yet).

    Note:
        The metadata columns follow the MetaDataEntity field order:
        image_idx, image_filepath, camera_id, shape_idx, shape_type,
        shape_label, geometry_idx. The three index fields and the file path
        are stored as strings.

    Example:
        ```python
        dataset.process_image_data()

        # One averaged signature per data entity
        data = dataset.generate_dataset_data(mean_signatures=True)

        # All individual signature measurements
        data_detailed = dataset.generate_dataset_data(mean_signatures=False)

        print(f"Signatures shape: {data.signatures.to_numpy().shape}")
        print(f"Metadata shape: {data.metadata.shape}")
        ```
    """
    self._check_data_entities()

    signature_frames = []
    metadata_frames = []
    for item in self.data_entities:
        frame = item.signatures.to_dataframe().dropna()
        if mean_signatures:
            # Collapse the entity to one row of column-wise means.
            frame = frame.mean().to_frame().T

        n_rows = len(frame)
        # One value per metadata column; repeated below for each signature row.
        row_values = {
            "image_idx": str(item.image_idx),
            "image_filepath": str(item.image_filepath),
            "camera_id": item.camera_id,
            "shape_idx": str(item.shape_idx),
            "shape_type": item.shape_type,
            "shape_label": item.shape_label,
            "geometry_idx": str(item.geometry_idx),
        }
        meta = pd.DataFrame({column: [value] * n_rows for column, value in row_values.items()})

        # Guard against the column list drifting away from the MetaDataEntity schema.
        assert list(meta.columns) == list(MetaDataEntity.model_fields.keys()), (
            "Sanity check failed! The columns in metadata_df do not match MetaDataEntity fields."
        )

        signature_frames.append(frame)
        metadata_frames.append(meta)

    all_signatures = pd.concat(signature_frames, ignore_index=True)
    all_metadata = pd.concat(metadata_frames, ignore_index=True)
    return TabularDatasetData(
        signatures=Signatures.from_dataframe(all_signatures),
        metadata=all_metadata,
    )