def predictions_to_glb(
predictions,
conf_thres=50.0,
filter_by_frames="all",
mask_black_bg=False,
mask_white_bg=False,
show_cam=True,
mask_sky=False,
target_dir=None,
prediction_mode="Predicted Pointmap",
) -> trimesh.Scene:
"""
Converts VGGT predictions to a 3D scene represented as a GLB file.
Args:
predictions (dict): Dictionary containing model predictions with keys:
- world_points: 3D point coordinates (S, H, W, 3)
- world_points_conf: Confidence scores (S, H, W)
- images: Input images (S, H, W, 3)
- extrinsic: Camera extrinsic matrices (S, 3, 4)
conf_thres (float): Percentage of low-confidence points to filter out (default: 50.0)
filter_by_frames (str): Frame filter specification (default: "all")
mask_black_bg (bool): Mask out black background pixels (default: False)
mask_white_bg (bool): Mask out white background pixels (default: False)
show_cam (bool): Include camera visualization (default: True)
mask_sky (bool): Apply sky segmentation mask (default: False)
target_dir (str): Output directory for intermediate files (default: None)
prediction_mode (str): Prediction mode selector (default: "Predicted Pointmap")
Returns:
trimesh.Scene: Processed 3D scene containing point cloud and cameras
Raises:
ValueError: If input predictions structure is invalid
"""
if not isinstance(predictions, dict):
raise ValueError("predictions must be a dictionary")
if conf_thres is None:
conf_thres = 10.0
print("Building GLB scene")
selected_frame_idx = None
if filter_by_frames != "all" and filter_by_frames != "All":
try:
# Extract the index part before the colon
selected_frame_idx = int(filter_by_frames.split(":")[0])
except (ValueError, IndexError):
pass
if "Pointmap" in prediction_mode:
print("Using Pointmap Branch")
if "world_points" in predictions:
pred_world_points = predictions["world_points"] # No batch dimension to remove
pred_world_points_conf = predictions.get("world_points_conf", np.ones_like(pred_world_points[..., 0]))
else:
print("Warning: world_points not found in predictions, falling back to depth-based points")
pred_world_points = predictions["world_points_from_depth"]
pred_world_points_conf = predictions.get("depth_conf", np.ones_like(pred_world_points[..., 0]))
else:
print("Using Depthmap and Camera Branch")
pred_world_points = predictions["world_points_from_depth"]
pred_world_points_conf = predictions.get("depth_conf", np.ones_like(pred_world_points[..., 0]))
# Get images from predictions
images = predictions["images"]
# Use extrinsic matrices instead of pred_extrinsic_list
camera_matrices = predictions["extrinsic"]
if mask_sky:
if target_dir is not None:
import onnxruntime
skyseg_session = None
target_dir_images = target_dir + "/images"
image_list = sorted(os.listdir(target_dir_images))
sky_mask_list = []
# Get the shape of pred_world_points_conf to match
S, H, W = (
pred_world_points_conf.shape
if hasattr(pred_world_points_conf, "shape")
else (len(images), images.shape[1], images.shape[2])
)
# Download skyseg.onnx if it doesn't exist
if not os.path.exists("skyseg.onnx"):
print("Downloading skyseg.onnx...")
download_file_from_url(
"https://huggingface.co/JianyuanWang/skyseg/resolve/main/skyseg.onnx", "skyseg.onnx"
)
for i, image_name in enumerate(image_list):
image_filepath = os.path.join(target_dir_images, image_name)
mask_filepath = os.path.join(target_dir, "sky_masks", image_name)
# Check if mask already exists
if os.path.exists(mask_filepath):
# Load existing mask
sky_mask = cv2.imread(mask_filepath, cv2.IMREAD_GRAYSCALE)
else:
# Generate new mask
if skyseg_session is None:
skyseg_session = onnxruntime.InferenceSession("skyseg.onnx")
sky_mask = segment_sky(image_filepath, skyseg_session, mask_filepath)
# Resize mask to match H×W if needed
if sky_mask.shape[0] != H or sky_mask.shape[1] != W:
sky_mask = cv2.resize(sky_mask, (W, H))
sky_mask_list.append(sky_mask)
# Convert list to numpy array with shape S×H×W
sky_mask_array = np.array(sky_mask_list)
# Apply sky mask to confidence scores
sky_mask_binary = (sky_mask_array > 0.1).astype(np.float32)
pred_world_points_conf = pred_world_points_conf * sky_mask_binary
if selected_frame_idx is not None:
pred_world_points = pred_world_points[selected_frame_idx][None]
pred_world_points_conf = pred_world_points_conf[selected_frame_idx][None]
images = images[selected_frame_idx][None]
camera_matrices = camera_matrices[selected_frame_idx][None]
vertices_3d = pred_world_points.reshape(-1, 3)
# Handle different image formats - check if images need transposing
if images.ndim == 4 and images.shape[1] == 3: # NCHW format
colors_rgb = np.transpose(images, (0, 2, 3, 1))
else: # Assume already in NHWC format
colors_rgb = images
colors_rgb = (colors_rgb.reshape(-1, 3) * 255).astype(np.uint8)
conf = pred_world_points_conf.reshape(-1)
# Convert percentage threshold to actual confidence value
if conf_thres == 0.0:
conf_threshold = 0.0
else:
conf_threshold = np.percentile(conf, conf_thres)
conf_mask = (conf >= conf_threshold) & (conf > 1e-5)
if mask_black_bg:
black_bg_mask = colors_rgb.sum(axis=1) >= 16
conf_mask = conf_mask & black_bg_mask
if mask_white_bg:
# Filter out white background pixels (RGB values close to white)
# Consider pixels white if all RGB values are above 240
white_bg_mask = ~((colors_rgb[:, 0] > 240) & (colors_rgb[:, 1] > 240) & (colors_rgb[:, 2] > 240))
conf_mask = conf_mask & white_bg_mask
vertices_3d = vertices_3d[conf_mask]
colors_rgb = colors_rgb[conf_mask]
if vertices_3d is None or np.asarray(vertices_3d).size == 0:
vertices_3d = np.array([[1, 0, 0]])
colors_rgb = np.array([[255, 255, 255]])
scene_scale = 1
else:
# Calculate the 5th and 95th percentiles along each axis
lower_percentile = np.percentile(vertices_3d, 5, axis=0)
upper_percentile = np.percentile(vertices_3d, 95, axis=0)
# Calculate the diagonal length of the percentile bounding box
scene_scale = np.linalg.norm(upper_percentile - lower_percentile)
colormap = matplotlib.colormaps.get_cmap("gist_rainbow")
# Initialize a 3D scene
scene_3d = trimesh.Scene()
# Add point cloud data to the scene
point_cloud_data = trimesh.PointCloud(vertices=vertices_3d, colors=colors_rgb)
scene_3d.add_geometry(point_cloud_data)
# Prepare 4x4 matrices for camera extrinsics
num_cameras = len(camera_matrices)
extrinsics_matrices = np.zeros((num_cameras, 4, 4))
extrinsics_matrices[:, :3, :4] = camera_matrices
extrinsics_matrices[:, 3, 3] = 1
if show_cam:
# Add camera models to the scene
for i in range(num_cameras):
world_to_camera = extrinsics_matrices[i]
camera_to_world = np.linalg.inv(world_to_camera)
rgba_color = colormap(i / num_cameras)
current_color = tuple(int(255 * x) for x in rgba_color[:3])
integrate_camera_into_scene(scene_3d, camera_to_world, current_color, scene_scale)
# Align scene to the observation of the first camera
scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)
print("GLB Scene built")
return scene_3d
点云后处理
最新推荐文章于 2025-08-08 11:09:55 发布