nvmlDeviceGetProcessUtilization returns wrong results

My CUDA version is 11.4 and my driver version is 450.191.x.
I call nvmlDeviceGetProcessUtilization to get per-process GPU utilization, and the result is wrong.
Here is my code:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>

#include <nvml.h>

int main() {
    nvmlReturn_t result;
    nvmlDevice_t device;
    unsigned int deviceCount;
    unsigned int utilizationCount = 100;
    nvmlProcessUtilizationSample_t *utilization = NULL;
    unsigned long long lastSeenTimeStamp = 0;

    result = nvmlInit();
    if (result != NVML_SUCCESS) {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        return 1;
    }

    result = nvmlDeviceGetCount(&deviceCount);
    if (result != NVML_SUCCESS) {
        printf("Failed to get device count: %s\n", nvmlErrorString(result));
        nvmlShutdown();
        return 1;
    }

    for (unsigned int i = 0; i < deviceCount; i++) {
        result = nvmlDeviceGetHandleByIndex(i, &device);
        if (result != NVML_SUCCESS) {
            printf("Failed to get handle for device %d: %s\n", i, nvmlErrorString(result));
            continue;
        }

        printf("%d\n", utilizationCount);   // ------> first get process count
        result = nvmlDeviceGetProcessUtilization(device, utilization, &utilizationCount, lastSeenTimeStamp);
        printf("%d\n", utilizationCount);   // ------> second get process count
        if (result == NVML_ERROR_INSUFFICIENT_SIZE) {
            utilization = (nvmlProcessUtilizationSample_t *)malloc(utilizationCount * sizeof(nvmlProcessUtilizationSample_t));
            if (utilization == NULL) {
                printf("Failed to allocate memory for process utilization samples.\n");
                nvmlShutdown();
                return 1;
            }
        }

        time_t nowt = time(NULL);
        lastSeenTimeStamp = ((int)nowt - 1) * pow(10, 6);
        printf("%d\n", utilizationCount);  // ------> first get process count
        result = nvmlDeviceGetProcessUtilization(device, utilization, &utilizationCount, lastSeenTimeStamp);
        printf("%d %s\n", utilizationCount, nvmlErrorString(result));  // ------> second get process count
        if (result == NVML_SUCCESS) {
            for (unsigned int j = 0; j < utilizationCount; j++) {
                printf("  PID: %u\n", utilization[j].pid);
                printf("  GPU Utilization: %u%%\n", utilization[j].smUtil);
                printf("  Memory Utilization: %u%%\n", utilization[j].memUtil);
                printf("--------------------------\n");
            }
        } else if (result == NVML_ERROR_INSUFFICIENT_SIZE) {
            printf("Buffer too small. Increase utilizationCount.\n");
            
        } else {
            printf("Failed to get process utilization for device %d: %s\n", i, nvmlErrorString(result));
        }

        if (utilization != NULL) {
            free(utilization);
            utilization = NULL;
        }

    }
    nvmlShutdown();

    return 0;
}

The output is as follows:

Every 0.5s: ./n1                                                                                                                                                                                                                        n147-167-074: Fri Mar 21 18:20:51 2025

device: 0
4
100
100
1 Success
  PID: 650092
  GPU Utilization: 98%
  Memory Utilization: 83%
--------------------------
device: 1
4
100
100
100 Success
  PID: 3593859416
  GPU Utilization: 3482903712%
  Memory Utilization: 21961%
--------------------------
  PID: 0
  GPU Utilization: 0%
  Memory Utilization: 0%
--------------------------
  PID: 0
  GPU Utilization: 0%
  Memory Utilization: 0%
--------------------------
  PID: 0
  GPU Utilization: 4294967295%
  Memory Utilization: 0%
--------------------------
  PID: 4294901760
  GPU Utilization: 4294967295%
  Memory Utilization: 4294967295%
--------------------------
  PID: 3482903968
  GPU Utilization: 0%
  Memory Utilization: 0%

I am running one process on device 0 and no process on device 1. Why is the second printed process count 100 with a return value of SUCCESS for device 1, which results in wrong PIDs and wrong utilization values? Is it related to a mismatch between the CUDA version and the driver version?