# Source code for torch_geometric.datasets.teeth3ds

import json
import os
import os.path as osp
import warnings
from glob import glob
from typing import Callable, Dict, List, Optional

import numpy as np
import torch
from tqdm import tqdm

from torch_geometric.data import (
    Data,
    InMemoryDataset,
    download_url,
    extract_zip,
)


class Teeth3DS(InMemoryDataset):
    r"""The Teeth3DS+ dataset from the `"An Extended Benchmark for Intra-oral
    3D Scans Analysis" <https://crns-smartvision.github.io/teeth3ds/>`_ paper.

    This dataset is the first comprehensive public benchmark designed to
    advance the field of intra-oral 3D scan analysis developed as part of the
    3DTeethSeg 2022 and 3DTeethLand 2024 MICCAI challenges, aiming to drive
    research in teeth identification, segmentation, labeling, 3D modeling,
    and dental landmark identification.
    The dataset includes at least 1,800 intra-oral scans (containing 23,999
    annotated teeth) collected from 900 patients, covering both upper and lower
    jaws separately.

    Every scan is stored as its own processed file and loaded on access. Each
    example is a :class:`torch_geometric.data.Data` object holding

    * :obj:`pos`: the sampled vertex positions,
    * :obj:`x`: the vertex normals of the sampled vertices,
    * :obj:`y` and :obj:`instances`: the per-vertex :obj:`"labels"` and
      :obj:`"instances"` entries of the segmentation annotation (empty
      tensors if the scan has no annotation file),
    * :obj:`jaw`: the jaw identifier parsed from the scan file name,
    * :obj:`mesial`, :obj:`distal`, :obj:`cusp`, :obj:`inner_point`,
      :obj:`outer_point`, :obj:`facial_point`: landmark coordinates of shape
      :obj:`[num_landmarks, 3]` (empty if the scan has no landmark file).

    Args:
        root (str): Root directory where the dataset should be saved.
        split (str): The split name (one of :obj:`"Teeth3DS"`,
            :obj:`"3DTeethSeg22_challenge"` or :obj:`"3DTeethLand_challenge"`).
            Additionally, :obj:`"sample"` only downloads the small sample
            archive instead of the full dataset.
            (default: :obj:`"Teeth3DS"`)
        train (bool, optional): If :obj:`True`, loads the training dataset,
            otherwise the test dataset. (default: :obj:`True`)
        num_samples (int, optional): Number of points to sample from each mesh.
            (default: :obj:`30000`)
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
        force_reload (bool, optional): Whether to re-process the dataset.
            (default: :obj:`False`)
    """
    # Archives of the full dataset, keyed by the temporary file name that is
    # used while downloading:
    urls = {
        'data_part_1.zip':
        'https://osf.io/download/qhprs/',
        'data_part_2.zip':
        'https://osf.io/download/4pwnr/',
        'data_part_3.zip':
        'https://osf.io/download/frwdp/',
        'data_part_4.zip':
        'https://osf.io/download/2arn4/',
        'data_part_5.zip':
        'https://osf.io/download/xrz5f/',
        'data_part_6.zip':
        'https://osf.io/download/23hgq/',
        'data_part_7.zip':
        'https://osf.io/download/u83ad/',
        'train_test_split':
        'https://files.de-1.osf.io/v1/'
        'resources/xctdy/providers/osfstorage/?zip='
    }

    # Archive that is downloaded instead of `urls` when `split == 'sample'`:
    sample_url = {
        'teeth3ds_sample': 'https://osf.io/download/vr38s/',
    }

    # Landmark annotation archives, downloaded together with `urls`:
    landmarks_urls = {
        '3DTeethLand_landmarks_train.zip': 'https://osf.io/download/k5hbj/',
        '3DTeethLand_landmarks_test.zip': 'https://osf.io/download/sqw5e/',
    }

    # Landmark classes that are looked up in the `*__kpt.json` annotations.
    # Each class becomes one `[num_landmarks, 3]` tensor on the data object:
    _keypoint_classes = (
        'Mesial',
        'Distal',
        'Cusp',
        'InnerPoint',
        'OuterPoint',
        'FacialPoint',
    )

    def __init__(
        self,
        root: str,
        split: str = 'Teeth3DS',
        train: bool = True,
        num_samples: int = 30000,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
        force_reload: bool = False,
    ) -> None:
        # These attributes are read by `processed_dir` and
        # `processed_file_names`, so they need to be set before calling the
        # parent constructor:
        self.mode = 'training' if train else 'testing'
        self.split = split
        self.num_samples = num_samples

        super().__init__(root, transform, pre_transform,
                         force_reload=force_reload)

    @property
    def processed_dir(self) -> str:
        """Directory of the processed files, one per split/mode pair."""
        return osp.join(self.root, f'processed_{self.split}_{self.mode}')

    @property
    def raw_file_names(self) -> List[str]:
        """Raw files whose presence marks the download as complete."""
        return ['license.txt']

    @property
    def processed_file_names(self) -> List[str]:
        """Names of the processed files, one :obj:`.pt` file per scan.

        The scan names are read from all split files of the raw directory
        whose file name starts with the current mode (:obj:`"training"` or
        :obj:`"testing"`). The result is empty as long as no split file
        exists.
        """
        # Directory containing train/test split files:
        split_subdir = 'teeth3ds_sample' if self.split == 'sample' else ''
        split_dir = osp.join(
            self.raw_dir,
            split_subdir,
            f'{self.split}_train_test_split',
        )

        # `glob` returns its matches in arbitrary order. Sort them such that
        # the dataset order is reproducible across machines:
        split_files = sorted(glob(osp.join(split_dir, f'{self.mode}*.txt')))

        # Collect all scan names from the split files. Blank lines and
        # surrounding whitespace would otherwise create bogus entries:
        scan_names: List[str] = []
        for split_file in split_files:
            with open(split_file) as f:
                lines = f.read().splitlines()
            scan_names.extend(line.strip() for line in lines if line.strip())

        return [f'{scan_name}.pt' for scan_name in scan_names]

    def _download_and_extract(self, urls: Dict[str, str]) -> None:
        """Downloads each archive, unpacks it into the raw directory and
        deletes the downloaded archive afterwards.
        """
        for filename, url in urls.items():
            path = download_url(url, self.root, filename=filename)
            extract_zip(path, self.raw_dir)
            os.unlink(path)

    def download(self) -> None:
        """Downloads either the sample archive or the full dataset including
        its landmark annotations, depending on the split.
        """
        if self.split == 'sample':
            self._download_and_extract(self.sample_url)
        else:
            self._download_and_extract(self.urls)
            self._download_and_extract(self.landmarks_urls)

    def process_file(self, file_path: str) -> Optional[Data]:
        """Processes the input file path to load mesh data, annotations,
        and prepare the input features for a graph-based deep learning model.

        Args:
            file_path (str): Path to the mesh file of a single scan. The
                annotations are expected next to it, using the same file name
                with the extension replaced by :obj:`.json` (segmentation) and
                :obj:`__kpt.json` (landmarks). Both are optional.

        Returns:
            The data object of the scan, after applying :obj:`pre_transform`.

        Raises:
            RuntimeError: If the number of sampled points does not match
                :obj:`num_samples`.
        """
        import trimesh
        from fpsample import bucket_fps_kdline_sampling

        mesh = trimesh.load_mesh(file_path)

        if isinstance(mesh, list):
            # Handle the case where a list of Geometry objects is returned
            mesh = mesh[0]

        vertices = mesh.vertices
        vertex_normals = mesh.vertex_normals

        # Perform sampling on mesh vertices. Meshes with too few vertices are
        # randomly up-sampled with replacement, all others are down-sampled
        # via farthest point sampling:
        if len(vertices) < self.num_samples:
            sampled_indices = np.random.choice(
                len(vertices),
                self.num_samples,
                replace=True,
            )
        else:
            sampled_indices = bucket_fps_kdline_sampling(
                vertices,
                self.num_samples,
                h=5,
                start_idx=0,
            )

        if len(sampled_indices) != self.num_samples:
            raise RuntimeError(f"Sampled points mismatch, expected "
                               f"{self.num_samples} points, but got "
                               f"{len(sampled_indices)} for '{file_path}'")

        # Extract features and annotations for the sampled points:
        pos = torch.tensor(vertices[sampled_indices], dtype=torch.float)
        x = torch.tensor(vertex_normals[sampled_indices], dtype=torch.float)

        # Only swap the file extension. A plain `str.replace('.obj', ...)`
        # would also rewrite directory names that contain '.obj':
        stem = osp.splitext(file_path)[0]

        # Load segmentation annotations:
        seg_annotation_path = stem + '.json'
        if osp.exists(seg_annotation_path):
            with open(seg_annotation_path) as f:
                seg_annotations = json.load(f)
            y = torch.tensor(
                np.asarray(seg_annotations['labels'])[sampled_indices],
                dtype=torch.float)
            instances = torch.tensor(
                np.asarray(seg_annotations['instances'])[sampled_indices],
                dtype=torch.float)
        else:
            y = torch.empty(0, 3)
            instances = torch.empty(0, 3)

        # Load landmarks annotations:
        landmarks_annotation_path = stem + '__kpt.json'

        # Parse keypoint annotations into structured tensors. Scans without a
        # landmark file get an empty tensor for every class:
        keypoint_tensors: Dict[str, torch.Tensor] = {
            key: torch.empty(0, 3)
            for key in self._keypoint_classes
        }
        if osp.exists(landmarks_annotation_path):
            with open(landmarks_annotation_path) as f:
                landmarks_annotations = json.load(f)

            # Coordinates are collected as one flat list per class and only
            # reshaped to `[num_landmarks, 3]` at the end:
            keypoints_dict: Dict[str, List] = {
                key: []
                for key in self._keypoint_classes
            }
            for keypoint in landmarks_annotations['objects']:
                keypoints_dict[keypoint['class']].extend(keypoint['coord'])

            keypoint_tensors = {
                k: torch.tensor(np.asarray(v),
                                dtype=torch.float).reshape(-1, 3)
                for k, v in keypoints_dict.items()
            }

        # The jaw is the second '_'-separated token of the file name. Parse
        # the base name only, since underscores in directory names (e.g.,
        # 'teeth3ds_sample') would otherwise shift the tokens:
        jaw = osp.basename(stem).split('_')[1]

        data = Data(
            pos=pos,
            x=x,
            y=y,
            instances=instances,
            jaw=jaw,
            mesial=keypoint_tensors['Mesial'],
            distal=keypoint_tensors['Distal'],
            cusp=keypoint_tensors['Cusp'],
            inner_point=keypoint_tensors['InnerPoint'],
            outer_point=keypoint_tensors['OuterPoint'],
            facial_point=keypoint_tensors['FacialPoint'],
        )

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        return data

    def process(self) -> None:
        """Converts every scan of the current split into its own processed
        file.

        Scans for which not exactly one mesh file is found are skipped and
        reported in a single warning at the end.
        """
        skipped: List[str] = []
        for file in tqdm(self.processed_file_names):
            # Only strip the '.pt' suffix, the scan name may contain dots:
            name = osp.splitext(file)[0]
            # NOTE: `glob` is called without `recursive=True`, so '**' behaves
            # like a plain '*', i.e. the mesh is expected exactly two
            # directory levels below the raw directory.
            paths = glob(osp.join(self.raw_dir, '**', '*', name + '.obj'))
            if len(paths) != 1:
                skipped.append(name)
                continue

            data = self.process_file(paths[0])
            torch.save(data, osp.join(self.processed_dir, file))

        if skipped:
            # The processed files of skipped scans do not exist, so accessing
            # them later fails. Report them rather than staying silent:
            preview = ', '.join(skipped[:5])
            if len(skipped) > 5:
                preview += ', ...'
            warnings.warn(f"Skipped {len(skipped)} scan(s) of split "
                          f"'{self.split}' ({self.mode}) since no unique "
                          f"'.obj' file was found in '{self.raw_dir}': "
                          f"{preview}")

    def len(self) -> int:
        """Number of scans listed in the split files of the current mode."""
        return len(self.processed_file_names)

    def get(self, idx: int) -> Data:
        """Loads the processed data object of the scan at position
        :obj:`idx`.
        """
        # NOTE: `weights_only=False` makes `torch.load` unpickle arbitrary
        # objects, so the processed directory should only hold files written
        # by `process`.
        return torch.load(
            osp.join(self.processed_dir, self.processed_file_names[idx]),
            weights_only=False,
        )

    def __repr__(self) -> str:
        return (f'{self.__class__.__name__}({len(self)}, '
                f'mode={self.mode}, split={self.split})')